def run(self):
    """Process every new log file found on S3.

    For each unprocessed key this (a) writes a gzipped, one-JSON-record-
    per-line copy of the extracted records to config.PROCESSED_DIR and
    (b) bulk-inserts the flattened records into the statistic database.

    Fix over the original: the S3 connection, the in-memory gzip buffer
    and the database connection are now released via try/finally even
    when extraction or the insert raises, instead of leaking.
    """
    service = s3_service.S3Service()
    try:
        for key in service.get_new_logs():  # process the new keys
            logging.info('Processing %s', key.name)
            out = StringIO()
            processed_gzip = gzip.GzipFile(fileobj=out, mode="w")
            try:
                new_records = []
                # process every line from downloaded content
                for record in self.extract(service.download(key)):
                    # Keep a verbatim JSON copy for the processed archive.
                    processed_gzip.write(json.dumps(record) + '\n')
                    new_records.append((
                        int(record['timestamp']),
                        record['user_id'],
                        record['url'],
                        record['user_agent']['browser_family'],
                        record['user_agent']['os_family'],
                        record['user_agent']['mobile']))
                # TODO: implement proper rollback if this fails.
                logging.info(
                    'Inserting %d records to statistic database',
                    len(new_records))
                # update new user info into database
                statistic = statistic_db.StatisticDB()
                try:
                    statistic.insert_user_info(new_records)
                finally:
                    statistic.close()
                logging.info('Finish inserting records to statistic database')
                # upload the processed file to s3
                new_key = key.name.replace(
                    config.LOG_DIR, config.PROCESSED_DIR)
                processed_gzip.close()
                service.upload(new_key, out.getvalue())
            finally:
                out.close()
            logging.info('Finish processing %s', key.name)
    finally:
        service.close()
def test_init_if_db_exists(self):
    """Opening a pre-existing database file must keep its schema intact."""
    statistic_db.StatisticDB.DATABASE = (
        './testdata/initial_statistic.sqlite3.db')
    database = statistic_db.StatisticDB()
    cursor = database.get_connection().execute(
        """
        SELECT name FROM sqlite_master
        WHERE type='table' AND name='statistic'
        """)
    # The 'statistic' table created earlier must still be there.
    self.assertTrue(cursor.fetchall())
def test_get_user_hourly_statistic_with_timestamp(self):
    """Two records in the same hour aggregate into a single hourly row."""
    rows = [
        (1384729205, u'user1', u'url1', u'Chrome', u'Windows Vista', False),
        (1384729285, u'user2', u'url1', u'Firefox', u'Windows XP', True),
    ]
    expected = [(1384729200, 2, 2, 2, 1, 1)]
    # Reserve a unique temp path, then let StatisticDB create the file.
    tmp = tempfile.NamedTemporaryFile()
    statistic_db.StatisticDB.DATABASE = tmp.name
    tmp.close()
    database = statistic_db.StatisticDB()
    database.conn.row_factory = None
    database.insert_user_info(rows)
    cursor = database.get_user_hourly_statistic(1384729200)
    self.assertEqual(expected, cursor.fetchall())
    os.unlink(statistic_db.StatisticDB.DATABASE)
def test_get_unreported_hours(self):
    """Hours with data are reported even after a partial update_reported."""
    rows = [
        (1384729205, u'user1', u'url1', u'Chrome', u'Windows Vista', False),
        (1384729205, u'user1', u'url3', u'Chrome', u'Windows Vista', False),
        (1380000010, u'user2', u'url1', u'Firefox', u'Windows XP', True),
    ]
    # Reserve a unique temp path, then let StatisticDB create the file.
    tmp = tempfile.NamedTemporaryFile()
    statistic_db.StatisticDB.DATABASE = tmp.name
    tmp.close()
    database = statistic_db.StatisticDB()
    database.conn.row_factory = None
    database.insert_user_info(rows)
    # Report fewer users (1) than exist for that hour (2): the hour
    # should still count as unreported.
    database.update_reported(1384729200, 1)
    self.assertEqual(
        [1379998800, 1384729200], database.get_unreported_hours())
    os.unlink(statistic_db.StatisticDB.DATABASE)
def test_insert_user_info(self):
    """Inserted tuples round-trip with auto-increment ids prepended."""
    rows = [
        (1384729205, u'user1', u'url1', u'Chrome', u'Windows Vista', False),
        (1384729285, u'user2', u'url2', u'Firefox', u'Windows XP', True),
    ]
    expected = [
        (1, 1384729205, 'user1', 'url1', 'Chrome', 'Windows Vista', 0),
        (2, 1384729285, 'user2', 'url2', 'Firefox', 'Windows XP', 1),
    ]
    # Reserve a unique temp path, then let StatisticDB create the file.
    tmp = tempfile.NamedTemporaryFile()
    statistic_db.StatisticDB.DATABASE = tmp.name
    tmp.close()
    database = statistic_db.StatisticDB()
    database.insert_user_info(rows)
    connection = database.get_connection()
    connection.row_factory = None
    cursor = connection.execute("""SELECT * FROM statistic""")
    self.assertEqual(expected, cursor.fetchall())
    os.unlink(statistic_db.StatisticDB.DATABASE)
def test_update_reported(self):
    """update_reported inserts new hours and overwrites existing ones."""
    rows = [
        (1384729205, u'user1', u'url1', u'Chrome', u'Windows Vista', False),
        (1385729285, u'user2', u'url1', u'Firefox', u'Windows XP', True),
    ]
    # Reserve a unique temp path, then let StatisticDB create the file.
    tmp = tempfile.NamedTemporaryFile()
    statistic_db.StatisticDB.DATABASE = tmp.name
    tmp.close()
    database = statistic_db.StatisticDB()
    database.conn.row_factory = None
    database.insert_user_info(rows)

    database.update_reported(1384729200, 800)
    reported = database.get_connection().execute(
        'SELECT * FROM reported').fetchall()
    self.assertEqual([(1384729200, 800), ], reported)

    # A second call for the same hour replaces the previous count;
    # a new hour gets its own row.
    database.update_reported(1384729201, 799)
    database.update_reported(1384729200, 801)
    reported = database.get_connection().execute(
        'SELECT * FROM reported').fetchall()
    self.assertEqual([(1384729200, 801), (1384729201, 799)], reported)
    os.unlink(statistic_db.StatisticDB.DATABASE)
def test_init_if_db_not_exists(self):
    """A fresh database gets all expected tables and views created."""
    # Reserve a unique temp path, then let StatisticDB create the file.
    tmp = tempfile.NamedTemporaryFile()
    statistic_db.StatisticDB.DATABASE = tmp.name
    tmp.close()
    database = statistic_db.StatisticDB()
    connection = database.get_connection()
    connection.row_factory = None

    cursor = connection.execute(
        """
        SELECT name FROM sqlite_master
        WHERE type='table' and name <> 'sqlite_sequence'
        """)
    expected_tables = ['statistic', 'reported']
    self.assertEqual(
        sorted(expected_tables),
        sorted(name for (name,) in cursor.fetchall()))

    cursor = connection.execute(
        """
        SELECT name FROM sqlite_master WHERE type='view'
        """)
    expected_views = [
        ('os_family_hourly',),
        ('user_general_hourly',),
        ('browser_family_hourly',),
    ]
    self.assertEqual(sorted(expected_views), sorted(cursor.fetchall()))
    os.unlink(statistic_db.StatisticDB.DATABASE)
def run(self):
    """Generate and upload one gzipped TSV report per day that still has
    unreported hours in the statistic database.

    Each report row covers one hour: user/url counts, OS and browser
    share sorted by popularity, mobile count and the desktop/mobile
    ratio. Every emitted hour is marked reported afterwards.

    Fix over the original: the desktop/mobile ratio raised
    ZeroDivisionError for any hour with zero mobile users, aborting the
    entire report run; it now reports 'inf' for such hours.
    """
    statistic = statistic_db.StatisticDB()
    dates = set()
    # Collect the distinct calendar days that contain unreported hours.
    for timestamp in statistic.get_unreported_hours():
        date = datetime.fromtimestamp(timestamp)
        dates.add((date.year, date.month, date.day))
    if not dates:
        return
    service = s3_service.S3Service()
    for date in dates:
        key = os.path.join(
            config.REPORTING_DIR, '%d/%d/%d' % date, 'report.gz')
        logging.info('Generating report to %s', key)
        out = StringIO()
        report_gzip = gzip.GzipFile(fileobj=out, mode="w")
        report_gzip.write('\t'.join(self.header) + '\n')
        # Start one hour before the day's first hour; the loop below
        # pre-increments, so iteration i covers hour i of the day.
        # NOTE(review): timegm() treats the naive datetime as UTC while
        # fromtimestamp() below renders local time — confirm intended.
        timestamp = calendar.timegm(datetime(*date).utctimetuple()) - 3600
        num_of_records = 0
        for i in xrange(24):
            timestamp += 3600
            result = statistic.get_user_hourly_statistic(
                timestamp).fetchone()
            if not result:
                continue  # no traffic recorded for this hour
            num_of_records += 1
            # OS share of total users, most popular first.
            os_stat = []
            for os_family in statistic.get_os_hourly_statistic(
                    result['hour']).fetchall():
                os_stat.append(
                    (os_family['os_family'],
                     os_family['quantity'] * 1.0 / result['total_users']))
            os_stat.sort(key=lambda x: x[1], reverse=True)
            # Browser share of total users, most popular first.
            browser_stat = []
            cursor = statistic.get_browser_hourly_statistic(result['hour'])
            for browser in cursor.fetchall():
                browser_stat.append(
                    (browser['browser_family'],
                     browser['quantity'] * 1.0 / result['total_users']))
            browser_stat.sort(key=lambda x: x[1], reverse=True)
            desktop = result['total_users'] - result['num_mobile']
            # Guard against hours with no mobile users (division by zero).
            if result['num_mobile']:
                ratio = 1.0 * desktop / result['num_mobile']
            else:
                ratio = float('inf')
            report_gzip.write('\t'.join([
                datetime.fromtimestamp(timestamp).isoformat(' '),
                str(result['distinct_users']),
                str(result['total_users']),
                str(result['distinct_urls']),
                str(result['total_urls']),
                ';'.join('%s,%f' % (n, f) for n, f in os_stat),
                ';'.join('%s,%f' % (n, f) for n, f in browser_stat),
                str(result['num_mobile']),
                str(ratio)
            ]) + '\n')
            # TODO: shall update this in one go for a whole day.
            statistic.update_reported(timestamp, result['total_users'])
        logging.info('Finished generating report %s with %d records',
                     key, num_of_records)
        report_gzip.close()
        service.upload(key, out.getvalue())
        out.close()
    statistic.close()
    service.close()