def test_index_latest(self):
    """index_latest_stats should reindex from the most recently indexed
    date through today."""
    newest = datetime.date.today() - datetime.timedelta(days=5)
    DownloadCount.index({'date': newest})
    self.refresh('stats_download_counts')

    expected_range = '%s:%s' % (
        newest.strftime('%Y-%m-%d'),
        datetime.date.today().strftime('%Y-%m-%d'))
    with mock.patch('olympia.stats.cron.call_command') as call:
        cron.index_latest_stats()
        call.assert_called_with('index_stats', addons=None,
                                date=expected_range)
def test_stats_from_model_download_count():
    """serialize_stats should render a DownloadCount as the expected JSON."""
    download_count = DownloadCount(
        addon_id=321, date='2016-01-18', count=123,
        sources={u'search': 1, u'collection': 1})
    expected = {
        'date': '2016-01-18',
        'addon': 321,
        'count': 123,
        'sources': {'search': 1, 'collection': 1}}
    assert json.loads(serialize_stats(download_count)) == expected
def handle(self, *args, **options):
    """Import download counts from a hive-generated file into DownloadCount.

    Reads ``<TMP_PATH>/<folder>/<day>/download_counts.hive`` (the folder
    comes from the first positional arg, defaulting to 'hive_results'),
    aggregates per-add-on counts and sources, bulk-creates DownloadCount
    rows, writes per-object stats files, then deletes the hive file.

    Raises CommandError when the file's date does not match the requested
    day.
    """
    start = datetime.now()  # Measure the time it takes to run the script.

    day = options['date']
    if not day:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

    folder = args[0] if args else 'hive_results'
    folder = path.join(settings.TMP_PATH, folder, day)
    sep = options['separator']
    filepath = path.join(folder, 'download_counts.hive')

    # Make sure we're not trying to update with mismatched data.
    if get_date_from_file(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)

    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}

    # Perf: preload all the files once and for all.
    # This builds a dict where each key (the file_id we get from the hive
    # query) has the addon_id as value.
    files_to_addon = dict(
        File.objects.values_list('id', 'version__addon_id'))

    # Only accept valid sources, which are listed in the DownloadSource
    # model. The source must either be exactly one of the "full" valid
    # sources, or prefixed by one of the "prefix" valid sources.
    fulls = set(
        DownloadSource.objects.filter(type='full').values_list('name',
                                                               flat=True))
    prefixes = DownloadSource.objects.filter(type='prefix').values_list(
        'name', flat=True)

    # Fix: `index` is read after the loop (`index + 1`); an empty file
    # would otherwise leave it undefined and raise NameError.
    index = -1
    with codecs.open(filepath, encoding='utf8') as count_file:
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            # Drop the trailing newline before splitting on the separator.
            splitted = line[:-1].split(sep)

            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue

            day, counter, file_id, src = splitted
            try:
                file_id, counter = int(file_id), int(counter)
            except ValueError:
                # Badly formatted? Drop.
                continue

            if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                continue

            # Does this file exist?
            if file_id in files_to_addon:
                addon_id = files_to_addon[file_id]
            else:
                continue

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                download_counts[addon_id] = dc

            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)

    for download_count in download_counts.values():
        save_stats_to_file(download_count)

    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    # Clean up file.
    log.debug('Deleting {path}'.format(path=filepath))
    unlink(filepath)
def handle(self, *args, **options):
    """Import download counts (from S3 or a local hive file) into
    DownloadCount.

    Rows may reference a file id, an add-on id, or an add-on slug.
    Aggregates per-add-on counts and sources, bulk-creates DownloadCount
    rows, then deletes the local file when reading from disk.

    Raises CommandError when the file's date does not match the requested
    day.
    """
    start = datetime.now()  # Measure the time it takes to run the script.

    day = options['date']
    if not day:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

    sep = options['separator']

    if options['stats_source'] == 's3':
        filepath = 's3://' + '/'.join([settings.AWS_STATS_S3_BUCKET,
                                       settings.AWS_STATS_S3_PREFIX,
                                       'download_counts',
                                       day, '000000_0'])
    elif options['stats_source'] == 'file':
        folder = options['folder_name']
        folder = path.join(settings.TMP_PATH, folder, day)
        filepath = path.join(folder, 'download_counts.hive')

    # Make sure we're not trying to update with mismatched data.
    if get_date(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)

    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}

    # Perf: preload all the files and slugs once and for all.
    # This builds two dicts:
    # - One where each key (the file_id we get from the hive query) has
    #   the addon_id as value.
    # - One where each key (the add-on slug) has the add-on_id as value.
    files_to_addon = dict(
        File.objects.values_list('id', 'version__addon_id'))
    slugs_to_addon = dict(
        Addon.unfiltered.exclude(status=amo.STATUS_NULL)
             .values_list('slug', 'id'))

    # Perf fix: membership tests against dict.values() are O(n) per row;
    # build the set of known add-on ids once so each lookup in the hot
    # loop below is O(1).
    addon_ids = set(files_to_addon.values())

    # Only accept valid sources, which are constants. The source must
    # either be exactly one of the "full" valid sources, or prefixed by one
    # of the "prefix" valid sources.
    fulls = amo.DOWNLOAD_SOURCES_FULL
    prefixes = amo.DOWNLOAD_SOURCES_PREFIX

    count_file = get_stats_data(filepath)
    # Fix: `index` is read after the loop (`index + 1`); an empty file
    # would otherwise leave it undefined and raise NameError.
    index = -1
    for index, line in enumerate(count_file):
        if index and (index % 1000000) == 0:
            log.info('Processed %s lines' % index)

        # Drop the trailing newline before splitting on the separator.
        splitted = line[:-1].split(sep)

        if len(splitted) != 4:
            log.debug('Badly formatted row: %s' % line)
            continue

        day, counter, id_or_slug, src = splitted
        try:
            # Clean up data.
            id_or_slug = id_or_slug.strip()
            counter = int(counter)
        except ValueError:
            # Ignore completely invalid data.
            continue

        # Note: already stripped above; no second .strip() needed.
        if id_or_slug.isdigit():
            # If it's a digit, then it should be a file id.
            try:
                id_or_slug = int(id_or_slug)
            except ValueError:
                # isdigit() accepts some non-ASCII digits that int()
                # rejects, so this is not dead code.
                continue

            # Does this file exist?
            if id_or_slug in files_to_addon:
                addon_id = files_to_addon[id_or_slug]
            # Maybe it's an add-on ?
            elif id_or_slug in addon_ids:
                addon_id = id_or_slug
            else:
                # It's an integer we don't recognize, ignore the row.
                continue
        else:
            # It's probably a slug.
            if id_or_slug in slugs_to_addon:
                addon_id = slugs_to_addon[id_or_slug]
            else:
                # We've exhausted all possibilities, ignore this row.
                continue

        if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
            continue

        # Memoize the DownloadCount.
        if addon_id in download_counts:
            dc = download_counts[addon_id]
        else:
            dc = DownloadCount(date=day, addon_id=addon_id, count=0)
            download_counts[addon_id] = dc

        # We can now fill the DownloadCount object.
        dc.count += counter
        dc.sources = update_inc(dc.sources, src, counter)

    # Close all old connections in this thread before we start creating the
    # `DownloadCount` values.
    # https://github.com/mozilla/addons-server/issues/6886
    # If the calculation above takes too long it might happen that we run
    # into `wait_timeout` problems and django doesn't reconnect properly
    # (potentially because of misconfiguration).
    # Django will re-connect properly after it notices that all
    # connections are closed.
    close_old_connections()

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)

    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    if options['stats_source'] == 'file':
        # Clean up file.
        log.debug('Deleting {path}'.format(path=filepath))
        unlink(filepath)
def handle(self, *args, **options):
    """Import download counts from a hive-generated file into DownloadCount.

    Rows may reference a file id, an add-on id, or an add-on slug.
    Aggregates per-add-on counts and sources, bulk-creates DownloadCount
    rows, writes per-object stats files, then deletes the hive file.

    Raises CommandError when the file's date does not match the requested
    day.
    """
    start = datetime.now()  # Measure the time it takes to run the script.

    day = options['date']
    if not day:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

    folder = options['folder_name']
    folder = path.join(settings.TMP_PATH, folder, day)
    sep = options['separator']
    filepath = path.join(folder, 'download_counts.hive')

    # Make sure we're not trying to update with mismatched data.
    if get_date_from_file(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)

    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}

    # Perf: preload all the files and slugs once and for all.
    # This builds two dicts:
    # - One where each key (the file_id we get from the hive query) has
    #   the addon_id as value.
    # - One where each key (the add-on slug) has the add-on_id as value.
    files_to_addon = dict(
        File.objects.values_list('id', 'version__addon_id'))
    slugs_to_addon = dict(Addon.objects.public().values_list('slug', 'id'))

    # Perf fix: membership tests against dict.values() are O(n) per row;
    # build the set of known add-on ids once so each lookup in the hot
    # loop below is O(1).
    addon_ids = set(files_to_addon.values())

    # Only accept valid sources, which are listed in the DownloadSource
    # model. The source must either be exactly one of the "full" valid
    # sources, or prefixed by one of the "prefix" valid sources.
    fulls = set(
        DownloadSource.objects.filter(type='full').values_list('name',
                                                               flat=True))
    prefixes = DownloadSource.objects.filter(type='prefix').values_list(
        'name', flat=True)

    # Fix: `index` is read after the loop (`index + 1`); an empty file
    # would otherwise leave it undefined and raise NameError.
    index = -1
    with codecs.open(filepath, encoding='utf8') as count_file:
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            # Drop the trailing newline before splitting on the separator.
            splitted = line[:-1].split(sep)

            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue

            day, counter, id_or_slug, src = splitted
            try:
                # Clean up data.
                id_or_slug = id_or_slug.strip()
                counter = int(counter)
            except ValueError:
                # Ignore completely invalid data.
                continue

            # Note: already stripped above; no second .strip() needed.
            if id_or_slug.isdigit():
                # If it's a digit, then it should be a file id.
                try:
                    id_or_slug = int(id_or_slug)
                except ValueError:
                    # isdigit() accepts some non-ASCII digits that int()
                    # rejects, so this is not dead code.
                    continue

                # Does this file exist?
                if id_or_slug in files_to_addon:
                    addon_id = files_to_addon[id_or_slug]
                # Maybe it's an add-on ?
                elif id_or_slug in addon_ids:
                    addon_id = id_or_slug
                else:
                    # It's an integer we don't recognize, ignore the row.
                    continue
            else:
                # It's probably a slug.
                if id_or_slug in slugs_to_addon:
                    addon_id = slugs_to_addon[id_or_slug]
                else:
                    # We've exhausted all possibilities, ignore this row.
                    continue

            if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                continue

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                download_counts[addon_id] = dc

            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

    # Close all old connections in this thread before we start creating the
    # `DownloadCount` values.
    # https://github.com/mozilla/addons-server/issues/6886
    # If the calculation above takes too long it might happen that we run
    # into `wait_timeout` problems and django doesn't reconnect properly
    # (potentially because of misconfiguration).
    # Django will re-connect properly after it notices that all
    # connections are closed.
    close_old_connections()

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)

    for download_count in download_counts.values():
        save_stats_to_file(download_count)

    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    # Clean up file.
    log.debug('Deleting {path}'.format(path=filepath))
    unlink(filepath)
def handle(self, *args, **options):
    """Import download counts (from S3 or a local hive file) into
    DownloadCount.

    Rows may reference a file id, an add-on id, or an add-on slug.
    Aggregates per-add-on counts and sources, bulk-creates DownloadCount
    rows, then deletes the local file when reading from disk.

    Raises CommandError when the file's date does not match the requested
    day.
    """
    start = datetime.now()  # Measure the time it takes to run the script.

    day = options['date']
    if not day:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

    sep = options['separator']

    if options['stats_source'] == 's3':
        filepath = 's3://' + '/'.join([
            settings.AWS_STATS_S3_BUCKET,
            settings.AWS_STATS_S3_PREFIX,
            'download_counts',
            day,
            '000000_0'
        ])
    elif options['stats_source'] == 'file':
        folder = options['folder_name']
        folder = path.join(settings.TMP_PATH, folder, day)
        filepath = path.join(folder, 'download_counts.hive')

    # Make sure we're not trying to update with mismatched data.
    if get_date(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)

    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}

    # Perf: preload all the files and slugs once and for all.
    # This builds two dicts:
    # - One where each key (the file_id we get from the hive query) has
    #   the addon_id as value.
    # - One where each key (the add-on slug) has the add-on_id as value.
    files_to_addon = dict(
        File.objects.values_list('id', 'version__addon_id'))
    slugs_to_addon = dict(
        Addon.unfiltered.exclude(status=amo.STATUS_NULL).values_list(
            'slug', 'id'))

    # Perf fix: membership tests against dict.values() are O(n) per row;
    # build the set of known add-on ids once so each lookup in the hot
    # loop below is O(1).
    addon_ids = set(files_to_addon.values())

    # Only accept valid sources, which are constants. The source must
    # either be exactly one of the "full" valid sources, or prefixed by one
    # of the "prefix" valid sources.
    fulls = amo.DOWNLOAD_SOURCES_FULL
    prefixes = amo.DOWNLOAD_SOURCES_PREFIX

    count_file = get_stats_data(filepath)
    # Fix: `index` is read after the loop (`index + 1`); an empty file
    # would otherwise leave it undefined and raise NameError.
    index = -1
    for index, line in enumerate(count_file):
        if index and (index % 1000000) == 0:
            log.info('Processed %s lines' % index)

        # Drop the trailing newline before splitting on the separator.
        splitted = line[:-1].split(sep)

        if len(splitted) != 4:
            log.info('Badly formatted row: %s' % line)
            continue

        day, counter, id_or_slug, src = splitted
        try:
            # Clean up data.
            id_or_slug = id_or_slug.strip()
            counter = int(counter)
        except ValueError:
            # Ignore completely invalid data.
            continue

        if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
            continue

        if id_or_slug.isdigit():
            # If it's a digit, then it should be a file id.
            try:
                id_or_slug = int(id_or_slug)
            except ValueError:
                # isdigit() accepts some non-ASCII digits that int()
                # rejects, so this is not dead code.
                continue
            # NOTE(review): a falsy add-on id (0) would fall through the
            # `or` below — assumed impossible with auto-increment pks.
            addon_id = (
                # Does this file exist?
                files_to_addon.get(id_or_slug) or
                # Maybe it's an add-on ?
                (
                    id_or_slug if id_or_slug in addon_ids
                    # otherwise it doesn't exist
                    else None))
        else:
            # If it's not numeric it's probably a slug.
            addon_id = slugs_to_addon.get(id_or_slug)
        if not addon_id:
            # We've exhausted all possibilities, ignore this row.
            continue

        # Memoize the DownloadCount.
        if addon_id in download_counts:
            dc = download_counts[addon_id]
            # update the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)
        else:
            dc = DownloadCount(date=day, addon_id=addon_id, count=counter,
                               sources={src: counter})
            download_counts[addon_id] = dc

    # Close all old connections in this thread before we start creating the
    # `DownloadCount` values.
    # https://github.com/mozilla/addons-server/issues/6886
    # If the calculation above takes too long it might happen that we run
    # into `wait_timeout` problems and django doesn't reconnect properly
    # (potentially because of misconfiguration).
    # Django will re-connect properly after it notices that all
    # connections are closed.
    close_old_connections()

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)

    log.info('Processed a total of %s lines' % (index + 1))
    log.info('Total processing time: %s' % (datetime.now() - start))

    if options['stats_source'] == 'file':
        # Clean up file.
        log.info('Deleting {path}'.format(path=filepath))
        unlink(filepath)
def handle(self, *args, **options):
    """Import download counts from a hive-generated file into DownloadCount.

    Reads ``<TMP_PATH>/<folder>/<day>/download_counts.hive`` (the folder
    comes from the first positional arg, defaulting to 'hive_results'),
    aggregates per-add-on counts and sources, bulk-creates DownloadCount
    rows, writes per-object stats files, then deletes the hive file.

    Raises CommandError when the file's date does not match the requested
    day.
    """
    start = datetime.now()  # Measure the time it takes to run the script.

    day = options['date']
    if not day:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

    folder = args[0] if args else 'hive_results'
    folder = path.join(settings.TMP_PATH, folder, day)
    sep = options['separator']
    filepath = path.join(folder, 'download_counts.hive')

    # Make sure we're not trying to update with mismatched data.
    if get_date_from_file(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)

    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}

    # Perf: preload all the files once and for all.
    # This builds a dict where each key (the file_id we get from the hive
    # query) has the addon_id as value.
    files_to_addon = dict(File.objects.values_list('id',
                                                   'version__addon_id'))

    # Only accept valid sources, which are listed in the DownloadSource
    # model. The source must either be exactly one of the "full" valid
    # sources, or prefixed by one of the "prefix" valid sources.
    fulls = set(DownloadSource.objects.filter(type='full').values_list(
        'name', flat=True))
    prefixes = DownloadSource.objects.filter(type='prefix').values_list(
        'name', flat=True)

    # Fix: `index` is read after the loop (`index + 1`); an empty file
    # would otherwise leave it undefined and raise NameError.
    index = -1
    with codecs.open(filepath, encoding='utf8') as count_file:
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            # Drop the trailing newline before splitting on the separator.
            splitted = line[:-1].split(sep)

            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue

            day, counter, file_id, src = splitted
            try:
                file_id, counter = int(file_id), int(counter)
            except ValueError:
                # Badly formatted? Drop.
                continue

            if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                continue

            # Does this file exist?
            if file_id in files_to_addon:
                addon_id = files_to_addon[file_id]
            else:
                continue

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                download_counts[addon_id] = dc

            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)

    for download_count in download_counts.values():
        save_stats_to_file(download_count)

    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    # Clean up file.
    log.debug('Deleting {path}'.format(path=filepath))
    unlink(filepath)