def run(self):
   service = s3_service.S3Service()
   for key in service.get_new_logs():
     # process each new log key
     out = StringIO()
     processed_gzip = gzip.GzipFile(fileobj=out, mode="w")
     new_records = []
     logging.info('Processing %s', key.name)
     for record in self.extract(service.download(key)):
       # re-serialize each record and collect a row for the statistic database
       processed_gzip.write(json.dumps(record) + '\n')
       new_records.append(
           (int(record['timestamp']), record['user_id'], record['url'],
            record['user_agent']['browser_family'],
            record['user_agent']['os_family'],
            record['user_agent']['mobile']))
     # TODO: implement proper rollback if this fails.
     logging.info(
         'Inserting %d records into the statistic database', len(new_records))
     # write the new user rows to the statistic database
     statistic = statistic_db.StatisticDB()
     statistic.insert_user_info(new_records)
     statistic.close()
     logging.info('Finished inserting records into the statistic database')
     # upload the processed, re-compressed log back to S3
     new_key = key.name.replace(config.LOG_DIR, config.PROCESSED_DIR)
     processed_gzip.close()
     service.upload(new_key, out.getvalue())
     out.close()
     logging.info('Finished processing %s', key.name)
   service.close()
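
The extract() helper called in run() is not shown in this example. A minimal sketch of what it might look like, assuming the downloaded S3 objects are gzip-compressed, newline-delimited JSON (both the compression and the record layout are assumptions, reusing the gzip/json/StringIO imports the example already depends on):

def extract(self, content):
  # Hypothetical helper: decompress the downloaded blob and yield one
  # parsed record per non-empty line.
  data = gzip.GzipFile(fileobj=StringIO(content), mode='r').read()
  for line in data.splitlines():
    if line.strip():
      yield json.loads(line)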
Example #2
 def test_init_if_db_exists(self):
   statistic_db.StatisticDB.DATABASE = (
       './testdata/initial_statistic.sqlite3.db')
   db = statistic_db.StatisticDB()
   conn = db.get_connection()
   c = conn.execute(
       """
       SELECT name FROM sqlite_master WHERE type='table' AND name='statistic'
       """)
   self.assertTrue(c.fetchall())
Example #3
 def test_get_user_hourly_statistic_with_timestamp(self):
   values = [
       (1384729205, u'user1', u'url1', u'Chrome', u'Windows Vista', False),
       (1384729285, u'user2', u'url1', u'Firefox', u'Windows XP', True)]
   golden = [(1384729200, 2, 2, 2, 1, 1)]
   tmp = tempfile.NamedTemporaryFile()
   statistic_db.StatisticDB.DATABASE = tmp.name
   tmp.close()  # closing deletes the placeholder file; only the unique path remains
   db = statistic_db.StatisticDB()
   db.conn.row_factory = None
   db.insert_user_info(values)
   c = db.get_user_hourly_statistic(1384729200)
   self.assertEqual(golden, c.fetchall())
   os.unlink(statistic_db.StatisticDB.DATABASE)
Example #4
 def test_get_unreported_hours(self):
   values = [
       (1384729205, u'user1', u'url1', u'Chrome', u'Windows Vista', False),
       (1384729205, u'user1', u'url3', u'Chrome', u'Windows Vista', False),
       (1380000010, u'user2', u'url1', u'Firefox', u'Windows XP', True)]
   tmp = tempfile.NamedTemporaryFile()
   statistic_db.StatisticDB.DATABASE = tmp.name
   tmp.close()
   db = statistic_db.StatisticDB()
   db.conn.row_factory = None
   db.insert_user_info(values)
   db.update_reported(1384729200, 1)
   result = db.get_unreported_hours()
   self.assertEqual([1379998800, 1384729200], result)
   os.unlink(statistic_db.StatisticDB.DATABASE)
Example #5
 def test_insert_user_info(self):
   values = [
       (1384729205, u'user1', u'url1', u'Chrome', u'Windows Vista', False),
       (1384729285, u'user2', u'url2', u'Firefox', u'Windows XP', True)]
   golden = [
       (1, 1384729205, 'user1', 'url1', 'Chrome', 'Windows Vista', 0),
       (2, 1384729285, 'user2', 'url2', 'Firefox', 'Windows XP', 1)]
   tmp = tempfile.NamedTemporaryFile()
   statistic_db.StatisticDB.DATABASE = tmp.name
   tmp.close()
   db = statistic_db.StatisticDB()
   db.insert_user_info(values)
   conn = db.get_connection()
   conn.row_factory = None
   c = conn.execute("""SELECT * FROM statistic""")
   self.assertEqual(golden, c.fetchall())
   os.unlink(statistic_db.StatisticDB.DATABASE)
Example #6
 def test_update_reported(self):
   values = [
       (1384729205, u'user1', u'url1', u'Chrome', u'Windows Vista', False),
       (1385729285, u'user2', u'url1', u'Firefox', u'Windows XP', True)]
   tmp = tempfile.NamedTemporaryFile()
   statistic_db.StatisticDB.DATABASE = tmp.name
   tmp.close()
   db = statistic_db.StatisticDB()
   db.conn.row_factory = None
   db.insert_user_info(values)
   db.update_reported(1384729200, 800)
   result = db.get_connection().execute('SELECT * FROM reported').fetchall()
   self.assertEqual([(1384729200, 800), ], result)
   db.update_reported(1384729201, 799)
   db.update_reported(1384729200, 801)
   result = db.get_connection().execute('SELECT * FROM reported').fetchall()
   self.assertEqual([(1384729200, 801), (1384729201, 799)], result)
   os.unlink(statistic_db.StatisticDB.DATABASE)
Example #7
 def test_init_if_db_not_exists(self):
   tmp = tempfile.NamedTemporaryFile()
   statistic_db.StatisticDB.DATABASE = tmp.name
   tmp.close()
   db = statistic_db.StatisticDB()
   conn = db.get_connection()
   conn.row_factory = None
   c = conn.execute(
       """
       SELECT name
       FROM sqlite_master WHERE type='table' and name <> 'sqlite_sequence'
       """)
   golden = ['statistic', 'reported']
   self.assertEqual(sorted(golden), sorted(x[0] for x in c.fetchall()))
   c = conn.execute(
       """
       SELECT name FROM sqlite_master WHERE type='view'
       """)
   golden = [
       ('os_family_hourly',),
       ('user_general_hourly',),
       ('browser_family_hourly',)]
   self.assertEqual(sorted(golden), sorted(c.fetchall()))
   os.unlink(statistic_db.StatisticDB.DATABASE)
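Taken together, the tests above pin down most of the StatisticDB surface: a class-level DATABASE path, a conn attribute, get_connection(), insert_user_info(), update_reported(), and get_unreported_hours(). A minimal sketch of such a class, inferred from the tests alone (the schema, SQL, and upsert strategy are assumptions, and the three hourly views checked in test_init_if_db_not_exists are omitted):

import sqlite3

class StatisticDB(object):
  DATABASE = 'statistic.sqlite3.db'

  def __init__(self):
    self.conn = sqlite3.connect(self.DATABASE)
    self.conn.row_factory = sqlite3.Row
    self.conn.executescript("""
        CREATE TABLE IF NOT EXISTS statistic (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp INTEGER, user_id TEXT, url TEXT,
            browser_family TEXT, os_family TEXT, mobile INTEGER);
        CREATE TABLE IF NOT EXISTS reported (
            hour INTEGER PRIMARY KEY, quantity INTEGER);""")

  def get_connection(self):
    return self.conn

  def insert_user_info(self, records):
    # records are (timestamp, user_id, url, browser_family, os_family,
    # mobile) tuples, matching the values used in the tests
    with self.conn:
      self.conn.executemany(
          'INSERT INTO statistic (timestamp, user_id, url, browser_family,'
          ' os_family, mobile) VALUES (?, ?, ?, ?, ?, ?)', records)

  def update_reported(self, hour, quantity):
    # upsert keyed on hour, matching the replace-on-rerun behavior
    # asserted in test_update_reported
    with self.conn:
      self.conn.execute(
          'INSERT OR REPLACE INTO reported (hour, quantity) VALUES (?, ?)',
          (hour, quantity))

  def get_unreported_hours(self):
    # an hour is unreported when its row count in statistic differs from
    # the quantity recorded in reported, or it was never reported at all
    c = self.conn.execute("""
        SELECT s.hour FROM
          (SELECT timestamp / 3600 * 3600 AS hour, COUNT(*) AS quantity
           FROM statistic GROUP BY hour) AS s
        LEFT JOIN reported AS r ON r.hour = s.hour
        WHERE r.quantity IS NULL OR r.quantity <> s.quantity
        ORDER BY s.hour""")
    return [row[0] for row in c.fetchall()]

  def close(self):
    self.conn.close()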
Example #8
 def run(self):
     statistic = statistic_db.StatisticDB()
     dates = set()
     for timestamp in statistic.get_unreported_hours():
         # use UTC here to stay consistent with calendar.timegm() below
         date = datetime.utcfromtimestamp(timestamp)
         dates.add((date.year, date.month, date.day))
     if not dates:
         return
     service = s3_service.S3Service()
     for date in dates:
         key = os.path.join(config.REPORTING_DIR, '%d/%d/%d' % date,
                            'report.gz')
         logging.info('Generating report to %s', key)
         out = StringIO()
         report_gzip = gzip.GzipFile(fileobj=out, mode="w")
         report_gzip.write('\t'.join(self.header) + '\n')
         # midnight UTC of the report day, minus one hour so the first
         # iteration of the loop below lands exactly on hour zero
         timestamp = calendar.timegm(datetime(*date).utctimetuple()) - 3600
         num_of_records = 0
         for i in xrange(24):
             timestamp += 3600
             result = statistic.get_user_hourly_statistic(
                 timestamp).fetchone()
             if not result:
                 continue
             num_of_records += 1
             os_stat = []
             for os_family in statistic.get_os_hourly_statistic(
                     result['hour']).fetchall():
                 os_stat.append(
                     (os_family['os_family'],
                      os_family['quantity'] * 1.0 / result['total_users']))
             os_stat.sort(key=lambda x: x[1], reverse=True)
             browser_stat = []
             cursor = statistic.get_browser_hourly_statistic(result['hour'])
             for browser in cursor.fetchall():
                 browser_stat.append(
                     (browser['browser_family'],
                      browser['quantity'] * 1.0 / result['total_users']))
             browser_stat.sort(key=lambda x: x[1], reverse=True)
             desktop = result['total_users'] - result['num_mobile']
             # guard: an hour with no mobile users would otherwise raise
             # ZeroDivisionError
             ratio = (1.0 * desktop / result['num_mobile']
                      if result['num_mobile'] else float('inf'))
             report_gzip.write('\t'.join([
                 datetime.utcfromtimestamp(timestamp).isoformat(' '),
                 str(result['distinct_users']),
                 str(result['total_users']),
                 str(result['distinct_urls']),
                 str(result['total_urls']),
                 ';'.join('%s,%f' % (n, f) for n, f in os_stat),
                 ';'.join('%s,%f' % (n, f) for n, f in browser_stat),
                 str(result['num_mobile']),
                 str(ratio),
             ]) + '\n')
             # TODO: shall update this in one go for a whole day.
             statistic.update_reported(timestamp, result['total_users'])
         logging.info('Finished generating report %s with %d records', key,
                      num_of_records)
         report_gzip.close()
         service.upload(key, out.getvalue())
         out.close()
     statistic.close()
     service.close()
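
Neither run() method shows its owning class or an entry point. A minimal driver sketch, assuming hypothetical class names LogProcessor and ReportGenerator for the two jobs above:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # ingest new logs first so the report job sees fresh unreported hours
    LogProcessor().run()     # the log-processing run() above (name assumed)
    ReportGenerator().run()  # the reporting run() above (name assumed)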