def update_locale(self, update_count, locale, count):
    """Increment the per-locale counters on ``update_count``.

    The locale is normalized (underscores become dashes) and only
    counted when it matches LOCALE_REGEX: we can't restrict to locales
    AMO knows about, because Firefox and custom builds ship many more,
    so we only sanity-check the characters, format and length.
    """
    normalized = locale.replace('_', '-')
    if re.match(LOCALE_REGEX, normalized) is None:
        return
    update_count.locales = update_inc(
        update_count.locales, normalized, count)
 def update_locale(self, update_count, locale, count):
     """Record `count` hits for `locale` on the given update_count."""
     # Normalize: incoming data uses underscores, we store dashes.
     key = locale.replace('_', '-')
     # We can't whitelist only locales AMO knows (Firefox and custom
     # packaged builds have many more), so just check that the format
     # and length look plausible before counting.
     if re.match(LOCALE_REGEX, key):
         update_count.locales = update_inc(update_count.locales, key, count)
 def update_app(self, update_count, app_id, app_ver, count):
     """Increment the per-application/version counters on update_count.

     Rows with an unknown application guid or an implausible version
     string are silently dropped.
     """
     valid = (app_id in VALID_APP_GUIDS and
              re.match(APPVERSION_REGEX, app_ver))
     if not valid:
         return
     # `applications` maps app guid -> {version: count, ...}, eg:
     # {"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}": {"10.0": 2, "21.0": 1},
     #  "some other application guid": ...}
     if update_count.applications is None:
         update_count.applications = {}
     versions = update_count.applications.get(app_id, {})
     # Overwrite this application's dict with incremented version counts.
     update_count.applications[app_id] = update_inc(versions, app_ver, count)
 def update_app(self, update_count, app_id, app_ver, count):
     """Update the applications dict on update_count with the given data."""
     # Drop the row unless the guid is known and the version looks sane.
     if app_id not in VALID_APP_GUIDS:
         return
     if not re.match(APPVERSION_REGEX, app_ver):
         return
     # applications is a dict of dicts keyed by application guid, eg:
     # {"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}": {"10.0": 2, "21.0": 1},
     #  "some other application guid": ...}
     apps = update_count.applications
     if apps is None:
         apps = update_count.applications = {}
     current = apps.get(app_id, {})
     # Replace this application's entry with incremented version counts.
     apps.update({app_id: update_inc(current, app_ver, count)})
    def handle(self, *args, **options):
        """Process a hive download_counts file into DownloadCount rows.

        Reads the per-file download counts for a single day from
        ``<TMP_PATH>/<folder>/<day>/download_counts.hive``, aggregates
        them per add-on, bulk-creates the DownloadCount objects, writes
        per-object stats files and finally deletes the source file.
        Existing counts for that day are deleted first so the command
        can be re-run safely.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            # Default to yesterday's data.
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        folder = args[0] if args else 'hive_results'
        folder = path.join(settings.TMP_PATH, folder, day)
        sep = options['separator']
        filepath = path.join(folder, 'download_counts.hive')
        # Make sure we're not trying to update with mismatched data.
        if get_date_from_file(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)
        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        DownloadCount.objects.filter(date=day).delete()

        # Memoize the files to addon relations and the DownloadCounts.
        download_counts = {}
        # Perf: preload all the files once and for all.
        # This builds a dict where each key (the file_id we get from the hive
        # query) has the addon_id as value.
        files_to_addon = dict(
            File.objects.values_list('id', 'version__addon_id'))

        # Only accept valid sources, which are listed in the DownloadSource
        # model. The source must either be exactly one of the "full" valid
        # sources, or prefixed by one of the "prefix" valid sources.
        fulls = set(
            DownloadSource.objects.filter(type='full').values_list('name',
                                                                   flat=True))
        prefixes = DownloadSource.objects.filter(type='prefix').values_list(
            'name', flat=True)

        with codecs.open(filepath, encoding='utf8') as count_file:
            for index, line in enumerate(count_file):
                if index and (index % 1000000) == 0:
                    log.info('Processed %s lines' % index)

                # line[:-1] strips the trailing newline before splitting.
                splitted = line[:-1].split(sep)

                if len(splitted) != 4:
                    log.debug('Badly formatted row: %s' % line)
                    continue

                # NOTE(review): this rebinds `day` from the row itself,
                # shadowing the value computed above — rows are trusted to
                # carry the same day as the file.
                day, counter, file_id, src = splitted
                try:
                    file_id, counter = int(file_id), int(counter)
                except ValueError:  # Badly formatted? Drop.
                    continue

                if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                    continue

                # Does this file exist?
                if file_id in files_to_addon:
                    addon_id = files_to_addon[file_id]
                else:
                    continue

                # Memoize the DownloadCount.
                if addon_id in download_counts:
                    dc = download_counts[addon_id]
                else:
                    dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                    download_counts[addon_id] = dc

                # We can now fill the DownloadCount object.
                dc.count += counter
                dc.sources = update_inc(dc.sources, src, counter)

        # Create in bulk: this is much faster.
        DownloadCount.objects.bulk_create(download_counts.values(), 100)
        for download_count in download_counts.values():
            save_stats_to_file(download_count)
        # NOTE(review): `index` is only bound if the file had at least one
        # line; an empty file would raise NameError here.
        log.info('Processed a total of %s lines' % (index + 1))
        log.debug('Total processing time: %s' % (datetime.now() - start))

        # Clean up file.
        log.debug('Deleting {path}'.format(path=filepath))
        unlink(filepath)
    def handle(self, *args, **options):
        """Aggregate a day's download_counts data into DownloadCount rows.

        The data is read either from S3 or from a local hive file,
        depending on ``options['stats_source']``. Each row may reference
        a file id, an add-on id or an add-on slug; counts are summed per
        add-on and bulk-created. Existing counts for the day are removed
        first, so the command can safely be re-run. Local source files
        are deleted at the end.

        Raises CommandError for an unknown stats_source or when the file
        contains data for a different day.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            # Default to yesterday's data.
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

        sep = options['separator']

        if options['stats_source'] == 's3':
            filepath = 's3://' + '/'.join([settings.AWS_STATS_S3_BUCKET,
                                           settings.AWS_STATS_S3_PREFIX,
                                           'download_counts',
                                           day, '000000_0'])
        elif options['stats_source'] == 'file':
            folder = options['folder_name']
            folder = path.join(settings.TMP_PATH, folder, day)
            filepath = path.join(folder, 'download_counts.hive')
        else:
            # Bug fix: previously an unexpected stats_source fell through
            # and crashed below with a NameError on `filepath`.
            raise CommandError(
                'Unknown stats_source: %s' % options['stats_source'])

        # Make sure we're not trying to update with mismatched data.
        if get_date(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)

        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        DownloadCount.objects.filter(date=day).delete()

        # Memoize the files to addon relations and the DownloadCounts.
        download_counts = {}

        # Perf: preload all the files and slugs once and for all.
        # This builds two dicts:
        # - One where each key (the file_id we get from the hive query) has
        #   the addon_id as value.
        # - One where each key (the add-on slug) has the add-on_id as value.
        files_to_addon = dict(File.objects.values_list('id',
                                                       'version__addon_id'))
        slugs_to_addon = dict(
            Addon.unfiltered.exclude(status=amo.STATUS_NULL)
            .values_list('slug', 'id'))
        # Perf: known addon ids as a set for O(1) membership tests in the
        # loop below (`dict.values()` membership was O(n) per row).
        addon_ids = set(files_to_addon.values())

        # Only accept valid sources, which are constants. The source must
        # either be exactly one of the "full" valid sources, or prefixed by one
        # of the "prefix" valid sources.
        fulls = amo.DOWNLOAD_SOURCES_FULL
        prefixes = amo.DOWNLOAD_SOURCES_PREFIX

        count_file = get_stats_data(filepath)
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            # line[:-1] strips the trailing newline before splitting.
            splitted = line[:-1].split(sep)

            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue

            # NOTE: `day` is rebound from the row, shadowing the value
            # computed above; rows are trusted to be for the same day.
            day, counter, id_or_slug, src = splitted
            try:
                # Clean up data.
                id_or_slug = id_or_slug.strip()
                counter = int(counter)
            except ValueError:
                # Ignore completely invalid data.
                continue

            # `id_or_slug` is already stripped above, no need to re-strip.
            if id_or_slug.isdigit():
                # If it's a digit, then it should be a file id.
                try:
                    id_or_slug = int(id_or_slug)
                except ValueError:
                    continue

                # Does this file exist?
                if id_or_slug in files_to_addon:
                    addon_id = files_to_addon[id_or_slug]
                # Maybe it's an add-on ?
                elif id_or_slug in addon_ids:
                    addon_id = id_or_slug
                else:
                    # It's an integer we don't recognize, ignore the row.
                    continue
            else:
                # It's probably a slug.
                if id_or_slug in slugs_to_addon:
                    addon_id = slugs_to_addon[id_or_slug]
                else:
                    # We've exhausted all possibilities, ignore this row.
                    continue

            if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                continue

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                download_counts[addon_id] = dc

            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

        # Close all old connections in this thread before we start creating the
        # `DownloadCount` values.
        # https://github.com/mozilla/addons-server/issues/6886
        # If the calculation above takes too long it might happen that we run
        # into `wait_timeout` problems and django doesn't reconnect properly
        # (potentially because of misconfiguration).
        # Django will re-connect properly after it notices that all
        # connections are closed.
        close_old_connections()

        # Create in bulk: this is much faster.
        DownloadCount.objects.bulk_create(download_counts.values(), 100)

        # NOTE(review): assumes the input had at least one line, otherwise
        # `index` is unbound here — TODO confirm upstream guarantees this.
        log.info('Processed a total of %s lines' % (index + 1))
        log.debug('Total processing time: %s' % (datetime.now() - start))

        if options['stats_source'] == 'file':
            # Clean up file.
            log.debug('Deleting {path}'.format(path=filepath))
            unlink(filepath)
 def update_os(self, update_count, os, count):
     """Increment the per-OS counters on update_count.

     The platform check is case-insensitive against amo.PLATFORM_DICT,
     but the stored key keeps the original casing of ``os``.
     """
     if os.lower() not in amo.PLATFORM_DICT:
         return
     update_count.oses = update_inc(update_count.oses, os, count)
 def update_status(self, update_count, status, count):
     """Increment the per-status counters on update_count."""
     # Anything outside the known status values is dropped.
     if status not in VALID_STATUSES:
         return
     update_count.statuses = update_inc(
         update_count.statuses, status, count)
 def update_version(self, update_count, version, count):
     """Increment the per-version counters on update_count."""
     # Cap the key at 32 characters (arbitrary limit) so garbage
     # version strings can't grow the stored dict unboundedly.
     truncated = version[:32]
     update_count.versions = update_inc(
         update_count.versions, truncated, count)
Example #10
0
    def handle(self, *args, **options):
        """Aggregate a day's hive download_counts file into DownloadCount
        rows.

        Rows may reference a file id, an add-on id or an add-on slug;
        counts are summed per add-on and bulk-created, then written to
        per-object stats files. Existing counts for the day are removed
        first so re-running is safe, and the source file is deleted at
        the end.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            # Default to yesterday's data.
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        folder = options['folder_name']
        folder = path.join(settings.TMP_PATH, folder, day)
        sep = options['separator']
        filepath = path.join(folder, 'download_counts.hive')
        # Make sure we're not trying to update with mismatched data.
        if get_date_from_file(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)
        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        DownloadCount.objects.filter(date=day).delete()

        # Memoize the files to addon relations and the DownloadCounts.
        download_counts = {}
        # Perf: preload all the files and slugs once and for all.
        # This builds two dicts:
        # - One where each key (the file_id we get from the hive query) has
        #   the addon_id as value.
        # - One where each key (the add-on slug) has the add-on_id as value.
        files_to_addon = dict(
            File.objects.values_list('id', 'version__addon_id'))
        slugs_to_addon = dict(Addon.objects.public().values_list('slug', 'id'))

        # Only accept valid sources, which are listed in the DownloadSource
        # model. The source must either be exactly one of the "full" valid
        # sources, or prefixed by one of the "prefix" valid sources.
        fulls = set(
            DownloadSource.objects.filter(type='full').values_list('name',
                                                                   flat=True))
        prefixes = DownloadSource.objects.filter(type='prefix').values_list(
            'name', flat=True)

        with codecs.open(filepath, encoding='utf8') as count_file:
            for index, line in enumerate(count_file):
                if index and (index % 1000000) == 0:
                    log.info('Processed %s lines' % index)

                # line[:-1] strips the trailing newline before splitting.
                splitted = line[:-1].split(sep)

                if len(splitted) != 4:
                    log.debug('Badly formatted row: %s' % line)
                    continue

                # NOTE(review): this rebinds `day` from the row itself,
                # shadowing the value computed above.
                day, counter, id_or_slug, src = splitted
                try:
                    # Clean up data.
                    id_or_slug = id_or_slug.strip()
                    counter = int(counter)
                except ValueError:
                    # Ignore completely invalid data.
                    continue

                if id_or_slug.strip().isdigit():
                    # If it's a digit, then it should be a file id.
                    try:
                        id_or_slug = int(id_or_slug)
                    except ValueError:
                        continue

                    # Does this file exist?
                    if id_or_slug in files_to_addon:
                        addon_id = files_to_addon[id_or_slug]
                    # Maybe it's an add-on ?
                    # (NOTE(review): .values() membership is O(n) per row.)
                    elif id_or_slug in files_to_addon.values():
                        addon_id = id_or_slug
                    else:
                        # It's an integer we don't recognize, ignore the row.
                        continue
                else:
                    # It's probably a slug.
                    if id_or_slug in slugs_to_addon:
                        addon_id = slugs_to_addon[id_or_slug]
                    else:
                        # We've exhausted all possibilities, ignore this row.
                        continue

                if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                    continue

                # Memoize the DownloadCount.
                if addon_id in download_counts:
                    dc = download_counts[addon_id]
                else:
                    dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                    download_counts[addon_id] = dc

                # We can now fill the DownloadCount object.
                dc.count += counter
                dc.sources = update_inc(dc.sources, src, counter)

        # Close all old connections in this thread before we start creating the
        # `DownloadCount` values.
        # https://github.com/mozilla/addons-server/issues/6886
        # If the calculation above takes too long it might happen that we run
        # into `wait_timeout` problems and django doesn't reconnect properly
        # (potentially because of misconfiguration).
        # Django will re-connect properly after it notices that all
        # connections are closed.
        close_old_connections()

        # Create in bulk: this is much faster.
        DownloadCount.objects.bulk_create(download_counts.values(), 100)

        for download_count in download_counts.values():
            save_stats_to_file(download_count)

        # NOTE(review): assumes the file had at least one line, otherwise
        # `index` is unbound here.
        log.info('Processed a total of %s lines' % (index + 1))
        log.debug('Total processing time: %s' % (datetime.now() - start))

        # Clean up file.
        log.debug('Deleting {path}'.format(path=filepath))
        unlink(filepath)
 def update_os(self, update_count, os, count):
     """Update the OS counters on update_count for the given OS."""
     # The membership test is case-insensitive against the known
     # platforms; the key stored via update_inc keeps original casing.
     known_platform = os.lower() in amo.PLATFORM_DICT
     if known_platform:
         update_count.oses = update_inc(update_count.oses, os, count)
 def update_status(self, update_count, status, count):
     """Record `count` occurrences of `status` on update_count."""
     # Unknown statuses are dropped rather than stored.
     if status not in VALID_STATUSES:
         return
     update_count.statuses = update_inc(update_count.statuses, status, count)
 def update_version(self, update_count, version, count):
     """Record `count` occurrences of `version` on update_count."""
     # Arbitrary limit: keep version keys to at most 32 characters.
     update_count.versions = update_inc(update_count.versions,
                                        version[:32], count)
    def handle(self, *args, **options):
        """Aggregate a day's download_counts data into DownloadCount rows.

        The data is read either from S3 or from a local hive file,
        depending on ``options['stats_source']``. Each row may reference
        a file id, an add-on id or an add-on slug; counts are summed per
        add-on and bulk-created. Existing counts for the day are removed
        first, so the command can safely be re-run. Local source files
        are deleted at the end.

        Raises CommandError for an unknown stats_source or when the file
        contains data for a different day.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            # Default to yesterday's data.
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

        sep = options['separator']

        if options['stats_source'] == 's3':
            filepath = 's3://' + '/'.join([
                settings.AWS_STATS_S3_BUCKET, settings.AWS_STATS_S3_PREFIX,
                'download_counts', day, '000000_0'
            ])
        elif options['stats_source'] == 'file':
            folder = options['folder_name']
            folder = path.join(settings.TMP_PATH, folder, day)
            filepath = path.join(folder, 'download_counts.hive')
        else:
            # Bug fix: previously an unexpected stats_source fell through
            # and crashed below with a NameError on `filepath`.
            raise CommandError(
                'Unknown stats_source: %s' % options['stats_source'])

        # Make sure we're not trying to update with mismatched data.
        if get_date(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)

        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        DownloadCount.objects.filter(date=day).delete()

        # Memoize the files to addon relations and the DownloadCounts.
        download_counts = {}

        # Perf: preload all the files and slugs once and for all.
        # This builds two dicts:
        # - One where each key (the file_id we get from the hive query) has
        #   the addon_id as value.
        # - One where each key (the add-on slug) has the add-on_id as value.
        files_to_addon = dict(
            File.objects.values_list('id', 'version__addon_id'))
        slugs_to_addon = dict(
            Addon.unfiltered.exclude(status=amo.STATUS_NULL).values_list(
                'slug', 'id'))
        # Perf: known addon ids as a set for O(1) membership tests in the
        # loop below (`dict.values()` membership was O(n) per row).
        addon_ids = set(files_to_addon.values())

        # Only accept valid sources, which are constants. The source must
        # either be exactly one of the "full" valid sources, or prefixed by one
        # of the "prefix" valid sources.
        fulls = amo.DOWNLOAD_SOURCES_FULL
        prefixes = amo.DOWNLOAD_SOURCES_PREFIX

        count_file = get_stats_data(filepath)
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            # line[:-1] strips the trailing newline before splitting.
            splitted = line[:-1].split(sep)

            if len(splitted) != 4:
                log.info('Badly formatted row: %s' % line)
                continue

            # NOTE: `day` is rebound from the row, shadowing the value
            # computed above; rows are trusted to be for the same day.
            day, counter, id_or_slug, src = splitted
            try:
                # Clean up data.
                id_or_slug = id_or_slug.strip()
                counter = int(counter)
            except ValueError:
                # Ignore completely invalid data.
                continue

            if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                continue

            if id_or_slug.isdigit():
                # If it's a digit, then it should be a file id.
                try:
                    id_or_slug = int(id_or_slug)
                except ValueError:
                    continue
                addon_id = (
                    # Does this file exist?
                    files_to_addon.get(id_or_slug) or
                    # Maybe it's an add-on ?
                    (id_or_slug if id_or_slug in addon_ids
                     # otherwise it doesn't exist
                     else None))
            else:
                # If it's not numeric it's probably a slug.
                addon_id = slugs_to_addon.get(id_or_slug)
            if not addon_id:
                # We've exhausted all possibilities, ignore this row.
                continue

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
                # update the DownloadCount object.
                dc.count += counter
                dc.sources = update_inc(dc.sources, src, counter)
            else:
                dc = DownloadCount(date=day,
                                   addon_id=addon_id,
                                   count=counter,
                                   sources={src: counter})
                download_counts[addon_id] = dc

        # Close all old connections in this thread before we start creating the
        # `DownloadCount` values.
        # https://github.com/mozilla/addons-server/issues/6886
        # If the calculation above takes too long it might happen that we run
        # into `wait_timeout` problems and django doesn't reconnect properly
        # (potentially because of misconfiguration).
        # Django will re-connect properly after it notices that all
        # connections are closed.
        close_old_connections()

        # Create in bulk: this is much faster.
        DownloadCount.objects.bulk_create(download_counts.values(), 100)

        # NOTE(review): assumes the input had at least one line, otherwise
        # `index` is unbound here — TODO confirm upstream guarantees this.
        log.info('Processed a total of %s lines' % (index + 1))
        log.info('Total processing time: %s' % (datetime.now() - start))

        if options['stats_source'] == 'file':
            # Clean up file.
            log.info('Deleting {path}'.format(path=filepath))
            unlink(filepath)
    def handle(self, *args, **options):
        """Process a hive download_counts file into DownloadCount rows.

        Reads the per-file download counts for a single day from
        ``<TMP_PATH>/<folder>/<day>/download_counts.hive`` (folder taken
        from the first positional arg, defaulting to 'hive_results'),
        aggregates them per add-on, bulk-creates the DownloadCount
        objects, writes per-object stats files and deletes the source
        file. Existing counts for that day are deleted first so the
        command can be re-run safely.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            # Default to yesterday's data.
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        folder = args[0] if args else 'hive_results'
        folder = path.join(settings.TMP_PATH, folder, day)
        sep = options['separator']
        filepath = path.join(folder, 'download_counts.hive')
        # Make sure we're not trying to update with mismatched data.
        if get_date_from_file(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)
        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        DownloadCount.objects.filter(date=day).delete()

        # Memoize the files to addon relations and the DownloadCounts.
        download_counts = {}
        # Perf: preload all the files once and for all.
        # This builds a dict where each key (the file_id we get from the hive
        # query) has the addon_id as value.
        files_to_addon = dict(File.objects.values_list('id',
                                                       'version__addon_id'))

        # Only accept valid sources, which are listed in the DownloadSource
        # model. The source must either be exactly one of the "full" valid
        # sources, or prefixed by one of the "prefix" valid sources.
        fulls = set(DownloadSource.objects.filter(type='full').values_list(
            'name', flat=True))
        prefixes = DownloadSource.objects.filter(type='prefix').values_list(
            'name', flat=True)

        with codecs.open(filepath, encoding='utf8') as count_file:
            for index, line in enumerate(count_file):
                if index and (index % 1000000) == 0:
                    log.info('Processed %s lines' % index)

                # line[:-1] strips the trailing newline before splitting.
                splitted = line[:-1].split(sep)

                if len(splitted) != 4:
                    log.debug('Badly formatted row: %s' % line)
                    continue

                # NOTE(review): this rebinds `day` from the row itself,
                # shadowing the value computed above.
                day, counter, file_id, src = splitted
                try:
                    file_id, counter = int(file_id), int(counter)
                except ValueError:  # Badly formatted? Drop.
                    continue

                if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                    continue

                # Does this file exist?
                if file_id in files_to_addon:
                    addon_id = files_to_addon[file_id]
                else:
                    continue

                # Memoize the DownloadCount.
                if addon_id in download_counts:
                    dc = download_counts[addon_id]
                else:
                    dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                    download_counts[addon_id] = dc

                # We can now fill the DownloadCount object.
                dc.count += counter
                dc.sources = update_inc(dc.sources, src, counter)

        # Create in bulk: this is much faster.
        DownloadCount.objects.bulk_create(download_counts.values(), 100)
        for download_count in download_counts.values():
            save_stats_to_file(download_count)
        # NOTE(review): `index` is only bound if the file had at least one
        # line; an empty file would raise NameError here.
        log.info('Processed a total of %s lines' % (index + 1))
        log.debug('Total processing time: %s' % (datetime.now() - start))

        # Clean up file.
        log.debug('Deleting {path}'.format(path=filepath))
        unlink(filepath)