def update_locale(self, update_count, locale, count):
    """Update the locales on the update_count with the given locale."""
    # The incoming data uses underscores; AMO locales use dashes.
    normalized = locale.replace('_', '-')
    # Only update if the locale "could be" valid. We can't simply restrict
    # on locales that AMO know, because Firefox has many more, and custom
    # packaged versions could have even more. Thus, we only restrict on the
    # allowed characters, some kind of format, and the total length, and
    # hope to not miss out on too many locales.
    if not re.match(LOCALE_REGEX, normalized):
        return
    update_count.locales = update_inc(update_count.locales, normalized,
                                      count)
def update_app(self, update_count, app_id, app_ver, count):
    """Update the applications on the update_count with the given data."""
    # Only update if app_id is a valid application guid, and if app_ver
    # "could be" a valid version.
    is_known_app = app_id in VALID_APP_GUIDS
    looks_like_version = re.match(APPVERSION_REGEX, app_ver)
    if not (is_known_app and looks_like_version):
        return
    # Applications is a dict of dicts, eg:
    # {"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}":
    #      {"10.0": 2, "21.0": 1, ....},
    #  "some other application guid": ...
    # }
    if update_count.applications is None:
        update_count.applications = {}
    versions = update_count.applications.get(app_id, {})
    # Overwrite this application's entry with incremented counts for its
    # versions.
    update_count.applications[app_id] = update_inc(versions, app_ver, count)
def handle(self, *args, **options):
    """Import one day of download counts from a hive-generated file.

    Reads ``<TMP_PATH>/<folder>/<day>/download_counts.hive`` (folder is
    the first positional argument, defaulting to 'hive_results'),
    aggregates rows into per-add-on DownloadCount objects, bulk-creates
    them, dumps each to a stats file, then deletes the source file.
    """
    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        # Default to yesterday's data.
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    folder = args[0] if args else 'hive_results'
    folder = path.join(settings.TMP_PATH, folder, day)
    sep = options['separator']
    filepath = path.join(folder, 'download_counts.hive')
    # Make sure we're not trying to update with mismatched data.
    if get_date_from_file(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)
    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}
    # Perf: preload all the files once and for all.
    # This builds a dict where each key (the file_id we get from the hive
    # query) has the addon_id as value.
    files_to_addon = dict(
        File.objects.values_list('id', 'version__addon_id'))
    # Only accept valid sources, which are listed in the DownloadSource
    # model. The source must either be exactly one of the "full" valid
    # sources, or prefixed by one of the "prefix" valid sources.
    fulls = set(
        DownloadSource.objects.filter(type='full').values_list('name',
                                                               flat=True))
    prefixes = DownloadSource.objects.filter(type='prefix').values_list(
        'name', flat=True)

    with codecs.open(filepath, encoding='utf8') as count_file:
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)
            # line[:-1] drops the trailing newline before splitting.
            splitted = line[:-1].split(sep)
            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue
            # NOTE(review): this rebinds `day` to the raw string taken
            # from the row for the remainder of the run.
            day, counter, file_id, src = splitted
            try:
                file_id, counter = int(file_id), int(counter)
            except ValueError:
                # Badly formatted? Drop.
                continue

            if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                continue

            # Does this file exist?
            if file_id in files_to_addon:
                addon_id = files_to_addon[file_id]
            else:
                continue

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                download_counts[addon_id] = dc
            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)
    for download_count in download_counts.values():
        save_stats_to_file(download_count)
    # NOTE(review): `index` is never bound if the file is empty — this
    # line would then raise NameError; confirm empty files cannot occur.
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    # Clean up file.
    log.debug('Deleting {path}'.format(path=filepath))
    unlink(filepath)
def handle(self, *args, **options):
    """Import one day of download counts from hive output (s3 or file).

    Resolves each row's id-or-slug to an add-on, aggregates per-add-on
    DownloadCount objects, then bulk-creates them. When reading from a
    local file, the file is deleted afterwards.
    """
    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        # Default to yesterday's data.
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    sep = options['separator']
    # NOTE(review): if stats_source is neither 's3' nor 'file',
    # `filepath` stays unbound and the get_date() call below raises
    # NameError — confirm the option is constrained by the arg parser.
    if options['stats_source'] == 's3':
        filepath = 's3://' + '/'.join([settings.AWS_STATS_S3_BUCKET,
                                       settings.AWS_STATS_S3_PREFIX,
                                       'download_counts', day, '000000_0'])
    elif options['stats_source'] == 'file':
        folder = options['folder_name']
        folder = path.join(settings.TMP_PATH, folder, day)
        filepath = path.join(folder, 'download_counts.hive')

    # Make sure we're not trying to update with mismatched data.
    if get_date(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)

    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}
    # Perf: preload all the files and slugs once and for all.
    # This builds two dicts:
    # - One where each key (the file_id we get from the hive query) has
    #   the addon_id as value.
    # - One where each key (the add-on slug) has the add-on_id as value.
    files_to_addon = dict(File.objects.values_list('id',
                                                   'version__addon_id'))
    slugs_to_addon = dict(
        Addon.unfiltered.exclude(status=amo.STATUS_NULL)
        .values_list('slug', 'id'))

    # Only accept valid sources, which are constants. The source must
    # either be exactly one of the "full" valid sources, or prefixed by one
    # of the "prefix" valid sources.
    fulls = amo.DOWNLOAD_SOURCES_FULL
    prefixes = amo.DOWNLOAD_SOURCES_PREFIX

    count_file = get_stats_data(filepath)
    for index, line in enumerate(count_file):
        if index and (index % 1000000) == 0:
            log.info('Processed %s lines' % index)
        # line[:-1] drops the trailing newline before splitting.
        splitted = line[:-1].split(sep)
        if len(splitted) != 4:
            log.debug('Badly formatted row: %s' % line)
            continue
        # NOTE(review): this rebinds `day` to the raw string taken from
        # the row for the remainder of the run.
        day, counter, id_or_slug, src = splitted
        try:
            # Clean up data.
            id_or_slug = id_or_slug.strip()
            counter = int(counter)
        except ValueError:
            # Ignore completely invalid data.
            continue

        if id_or_slug.strip().isdigit():
            # If it's a digit, then it should be a file id.
            try:
                id_or_slug = int(id_or_slug)
            except ValueError:
                continue
            # Does this file exist?
            if id_or_slug in files_to_addon:
                addon_id = files_to_addon[id_or_slug]
            # Maybe it's an add-on ?
            # NOTE(review): dict.values() membership is an O(n) scan per
            # row — potential hot spot on large inputs.
            elif id_or_slug in files_to_addon.values():
                addon_id = id_or_slug
            else:
                # It's an integer we don't recognize, ignore the row.
                continue
        else:
            # It's probably a slug.
            if id_or_slug in slugs_to_addon:
                addon_id = slugs_to_addon[id_or_slug]
            else:
                # We've exhausted all possibilities, ignore this row.
                continue

        if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
            continue

        # Memoize the DownloadCount.
        if addon_id in download_counts:
            dc = download_counts[addon_id]
        else:
            dc = DownloadCount(date=day, addon_id=addon_id, count=0)
            download_counts[addon_id] = dc
        # We can now fill the DownloadCount object.
        dc.count += counter
        dc.sources = update_inc(dc.sources, src, counter)

    # Close all old connections in this thread before we start creating the
    # `DownloadCount` values.
    # https://github.com/mozilla/addons-server/issues/6886
    # If the calculation above takes too long it might happen that we run
    # into `wait_timeout` problems and django doesn't reconnect properly
    # (potentially because of misconfiguration).
    # Django will re-connect properly after it notices that all
    # connections are closed.
    close_old_connections()

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)

    # NOTE(review): `index` is never bound if the input is empty — this
    # line would then raise NameError; confirm empty inputs cannot occur.
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    if options['stats_source'] == 'file':
        # Clean up file.
        log.debug('Deleting {path}'.format(path=filepath))
        unlink(filepath)
def update_os(self, update_count, os, count):
    """Update the OSes on the update_count with the given OS."""
    # NOTE(review): validation uses the lowercased name, but the raw,
    # original-cased value is what gets stored as the key — confirm this
    # asymmetry is intended.
    if os.lower() not in amo.PLATFORM_DICT:
        return
    update_count.oses = update_inc(update_count.oses, os, count)
def update_status(self, update_count, status, count):
    """Update the statuses on the update_count with the given status."""
    # Only update if the given status is valid.
    if status not in VALID_STATUSES:
        return
    update_count.statuses = update_inc(update_count.statuses, status,
                                       count)
def update_version(self, update_count, version, count):
    """Update the versions on the update_count with the given version."""
    # Limit the version to a (random) length.
    truncated = version[:32]
    update_count.versions = update_inc(update_count.versions, truncated,
                                       count)
def handle(self, *args, **options):
    """Import one day of download counts from a hive-generated file.

    Reads ``<TMP_PATH>/<folder_name>/<day>/download_counts.hive``,
    resolves each row's id-or-slug to an add-on, aggregates per-add-on
    DownloadCount objects, bulk-creates them, dumps each to a stats file,
    then deletes the source file.
    """
    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        # Default to yesterday's data.
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    folder = options['folder_name']
    folder = path.join(settings.TMP_PATH, folder, day)
    sep = options['separator']
    filepath = path.join(folder, 'download_counts.hive')
    # Make sure we're not trying to update with mismatched data.
    if get_date_from_file(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)
    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}
    # Perf: preload all the files and slugs once and for all.
    # This builds two dicts:
    # - One where each key (the file_id we get from the hive query) has
    #   the addon_id as value.
    # - One where each key (the add-on slug) has the add-on_id as value.
    files_to_addon = dict(
        File.objects.values_list('id', 'version__addon_id'))
    slugs_to_addon = dict(Addon.objects.public().values_list('slug', 'id'))
    # Only accept valid sources, which are listed in the DownloadSource
    # model. The source must either be exactly one of the "full" valid
    # sources, or prefixed by one of the "prefix" valid sources.
    fulls = set(
        DownloadSource.objects.filter(type='full').values_list('name',
                                                               flat=True))
    prefixes = DownloadSource.objects.filter(type='prefix').values_list(
        'name', flat=True)

    with codecs.open(filepath, encoding='utf8') as count_file:
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)
            # line[:-1] drops the trailing newline before splitting.
            splitted = line[:-1].split(sep)
            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue
            # NOTE(review): this rebinds `day` to the raw string taken
            # from the row for the remainder of the run.
            day, counter, id_or_slug, src = splitted
            try:
                # Clean up data.
                id_or_slug = id_or_slug.strip()
                counter = int(counter)
            except ValueError:
                # Ignore completely invalid data.
                continue

            if id_or_slug.strip().isdigit():
                # If it's a digit, then it should be a file id.
                try:
                    id_or_slug = int(id_or_slug)
                except ValueError:
                    continue
                # Does this file exist?
                if id_or_slug in files_to_addon:
                    addon_id = files_to_addon[id_or_slug]
                # Maybe it's an add-on ?
                # NOTE(review): dict.values() membership is an O(n) scan
                # per row — potential hot spot on large inputs.
                elif id_or_slug in files_to_addon.values():
                    addon_id = id_or_slug
                else:
                    # It's an integer we don't recognize, ignore the row.
                    continue
            else:
                # It's probably a slug.
                if id_or_slug in slugs_to_addon:
                    addon_id = slugs_to_addon[id_or_slug]
                else:
                    # We've exhausted all possibilities, ignore this row.
                    continue

            if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                continue

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                download_counts[addon_id] = dc
            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

    # Close all old connections in this thread before we start creating the
    # `DownloadCount` values.
    # https://github.com/mozilla/addons-server/issues/6886
    # If the calculation above takes too long it might happen that we run
    # into `wait_timeout` problems and django doesn't reconnect properly
    # (potentially because of misconfiguration).
    # Django will re-connect properly after it notices that all
    # connections are closed.
    close_old_connections()

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)
    for download_count in download_counts.values():
        save_stats_to_file(download_count)
    # NOTE(review): `index` is never bound if the file is empty — this
    # line would then raise NameError; confirm empty files cannot occur.
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    # Clean up file.
    log.debug('Deleting {path}'.format(path=filepath))
    unlink(filepath)
def handle(self, *args, **options):
    """Import one day of download counts from hive output (s3 or file).

    Resolves each row's id-or-slug to an add-on, aggregates per-add-on
    DownloadCount objects, then bulk-creates them. When reading from a
    local file, the file is deleted afterwards.
    """
    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        # Default to yesterday's data.
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    sep = options['separator']
    # NOTE(review): if stats_source is neither 's3' nor 'file',
    # `filepath` stays unbound and the get_date() call below raises
    # NameError — confirm the option is constrained by the arg parser.
    if options['stats_source'] == 's3':
        filepath = 's3://' + '/'.join([
            settings.AWS_STATS_S3_BUCKET,
            settings.AWS_STATS_S3_PREFIX,
            'download_counts',
            day,
            '000000_0'
        ])
    elif options['stats_source'] == 'file':
        folder = options['folder_name']
        folder = path.join(settings.TMP_PATH, folder, day)
        filepath = path.join(folder, 'download_counts.hive')

    # Make sure we're not trying to update with mismatched data.
    if get_date(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)

    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}
    # Perf: preload all the files and slugs once and for all.
    # This builds two dicts:
    # - One where each key (the file_id we get from the hive query) has
    #   the addon_id as value.
    # - One where each key (the add-on slug) has the add-on_id as value.
    files_to_addon = dict(
        File.objects.values_list('id', 'version__addon_id'))
    slugs_to_addon = dict(
        Addon.unfiltered.exclude(status=amo.STATUS_NULL).values_list(
            'slug', 'id'))

    # Only accept valid sources, which are constants. The source must
    # either be exactly one of the "full" valid sources, or prefixed by one
    # of the "prefix" valid sources.
    fulls = amo.DOWNLOAD_SOURCES_FULL
    prefixes = amo.DOWNLOAD_SOURCES_PREFIX

    count_file = get_stats_data(filepath)
    for index, line in enumerate(count_file):
        if index and (index % 1000000) == 0:
            log.info('Processed %s lines' % index)
        # line[:-1] drops the trailing newline before splitting.
        splitted = line[:-1].split(sep)
        if len(splitted) != 4:
            log.info('Badly formatted row: %s' % line)
            continue
        # NOTE(review): this rebinds `day` to the raw string taken from
        # the row for the remainder of the run.
        day, counter, id_or_slug, src = splitted
        try:
            # Clean up data.
            id_or_slug = id_or_slug.strip()
            counter = int(counter)
        except ValueError:
            # Ignore completely invalid data.
            continue

        if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
            continue

        if id_or_slug.isdigit():
            # If it's a digit, then it should be a file id.
            try:
                id_or_slug = int(id_or_slug)
            except ValueError:
                continue

            # NOTE(review): dict.values() membership is an O(n) scan per
            # row — potential hot spot on large inputs.
            addon_id = (
                # Does this file exist?
                files_to_addon.get(id_or_slug) or
                # Maybe it's an add-on ?
                (
                    id_or_slug if id_or_slug in files_to_addon.values()
                    # otherwise it doesn't exist
                    else None))
        else:
            # If it's not numeric it's probably a slug.
            addon_id = slugs_to_addon.get(id_or_slug)

        # NOTE(review): a falsy addon_id (e.g. 0) is treated as "not
        # found" — presumably ids are always positive; confirm.
        if not addon_id:
            # We've exhausted all possibilities, ignore this row.
            continue

        # Memoize the DownloadCount.
        if addon_id in download_counts:
            dc = download_counts[addon_id]
            # update the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)
        else:
            dc = DownloadCount(date=day, addon_id=addon_id, count=counter,
                               sources={src: counter})
            download_counts[addon_id] = dc

    # Close all old connections in this thread before we start creating the
    # `DownloadCount` values.
    # https://github.com/mozilla/addons-server/issues/6886
    # If the calculation above takes too long it might happen that we run
    # into `wait_timeout` problems and django doesn't reconnect properly
    # (potentially because of misconfiguration).
    # Django will re-connect properly after it notices that all
    # connections are closed.
    close_old_connections()

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)

    # NOTE(review): `index` is never bound if the input is empty — this
    # line would then raise NameError; confirm empty inputs cannot occur.
    log.info('Processed a total of %s lines' % (index + 1))
    log.info('Total processing time: %s' % (datetime.now() - start))

    if options['stats_source'] == 'file':
        # Clean up file.
        log.info('Deleting {path}'.format(path=filepath))
        unlink(filepath)
def handle(self, *args, **options):
    """Import one day of download counts from a hive-generated file.

    Reads ``<TMP_PATH>/<folder>/<day>/download_counts.hive`` (folder is
    the first positional argument, defaulting to 'hive_results'),
    aggregates rows into per-add-on DownloadCount objects, bulk-creates
    them, dumps each to a stats file, then deletes the source file.
    """
    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        # Default to yesterday's data.
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    folder = args[0] if args else 'hive_results'
    folder = path.join(settings.TMP_PATH, folder, day)
    sep = options['separator']
    filepath = path.join(folder, 'download_counts.hive')
    # Make sure we're not trying to update with mismatched data.
    if get_date_from_file(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)
    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}
    # Perf: preload all the files once and for all.
    # This builds a dict where each key (the file_id we get from the hive
    # query) has the addon_id as value.
    files_to_addon = dict(File.objects.values_list('id',
                                                   'version__addon_id'))
    # Only accept valid sources, which are listed in the DownloadSource
    # model. The source must either be exactly one of the "full" valid
    # sources, or prefixed by one of the "prefix" valid sources.
    fulls = set(DownloadSource.objects.filter(type='full').values_list(
        'name', flat=True))
    prefixes = DownloadSource.objects.filter(type='prefix').values_list(
        'name', flat=True)

    with codecs.open(filepath, encoding='utf8') as count_file:
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)
            # line[:-1] drops the trailing newline before splitting.
            splitted = line[:-1].split(sep)
            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue
            # NOTE(review): this rebinds `day` to the raw string taken
            # from the row for the remainder of the run.
            day, counter, file_id, src = splitted
            try:
                file_id, counter = int(file_id), int(counter)
            except ValueError:
                # Badly formatted? Drop.
                continue

            if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                continue

            # Does this file exist?
            if file_id in files_to_addon:
                addon_id = files_to_addon[file_id]
            else:
                continue

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                download_counts[addon_id] = dc
            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)
    for download_count in download_counts.values():
        save_stats_to_file(download_count)
    # NOTE(review): `index` is never bound if the file is empty — this
    # line would then raise NameError; confirm empty files cannot occur.
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    # Clean up file.
    log.debug('Deleting {path}'.format(path=filepath))
    unlink(filepath)