Example #1
0
    def test_index_latest(self):
        """Indexing latest stats spans from the newest indexed date to today."""
        newest = datetime.date.today() - datetime.timedelta(days=5)
        DownloadCount.index({'date': newest})
        self.refresh('stats_download_counts')

        expected_range = '%s:%s' % (
            newest.strftime('%Y-%m-%d'),
            datetime.date.today().strftime('%Y-%m-%d'))
        with mock.patch('olympia.stats.cron.call_command') as call_command:
            cron.index_latest_stats()
            call_command.assert_called_with('index_stats',
                                            addons=None,
                                            date=expected_range)
def test_stats_from_model_download_count():
    """serialize_stats() flattens a DownloadCount into the expected JSON."""
    download_count = DownloadCount(
        addon_id=321, date='2016-01-18', count=123,
        sources={u'search': 1, u'collection': 1})
    expected = {
        'date': '2016-01-18',
        'addon': 321,
        'count': 123,
        'sources': {'search': 1, 'collection': 1},
    }
    assert json.loads(serialize_stats(download_count)) == expected
    def handle(self, *args, **options):
        """Import download counts for one day from a local hive result file.

        Each line is ``<day><sep><count><sep><file_id><sep><src>``. Counts
        are aggregated per add-on, bulk-created as DownloadCount rows, saved
        to stats files, and the source file is deleted afterwards.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        folder = args[0] if args else 'hive_results'
        folder = path.join(settings.TMP_PATH, folder, day)
        sep = options['separator']
        filepath = path.join(folder, 'download_counts.hive')
        # Make sure we're not trying to update with mismatched data.
        if get_date_from_file(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)
        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        DownloadCount.objects.filter(date=day).delete()

        # Memoize the files to addon relations and the DownloadCounts.
        download_counts = {}
        # Perf: preload all the files once and for all.
        # This builds a dict where each key (the file_id we get from the hive
        # query) has the addon_id as value.
        files_to_addon = dict(
            File.objects.values_list('id', 'version__addon_id'))

        # Only accept valid sources, which are listed in the DownloadSource
        # model. The source must either be exactly one of the "full" valid
        # sources, or prefixed by one of the "prefix" valid sources.
        fulls = set(
            DownloadSource.objects.filter(type='full').values_list('name',
                                                                   flat=True))
        prefixes = DownloadSource.objects.filter(type='prefix').values_list(
            'name', flat=True)

        # Start at -1 so an empty file logs "0 lines" at the end instead of
        # raising a NameError on `index` (which enumerate never binds then).
        index = -1
        with codecs.open(filepath, encoding='utf8') as count_file:
            for index, line in enumerate(count_file):
                if index and (index % 1000000) == 0:
                    log.info('Processed %s lines' % index)

                # Strip the trailing newline before splitting on `sep`.
                splitted = line[:-1].split(sep)

                if len(splitted) != 4:
                    log.debug('Badly formatted row: %s' % line)
                    continue

                day, counter, file_id, src = splitted
                try:
                    file_id, counter = int(file_id), int(counter)
                except ValueError:  # Badly formatted? Drop.
                    continue

                if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                    continue

                # Does this file exist?
                if file_id in files_to_addon:
                    addon_id = files_to_addon[file_id]
                else:
                    continue

                # Memoize the DownloadCount.
                if addon_id in download_counts:
                    dc = download_counts[addon_id]
                else:
                    dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                    download_counts[addon_id] = dc

                # We can now fill the DownloadCount object.
                dc.count += counter
                dc.sources = update_inc(dc.sources, src, counter)

        # Create in bulk: this is much faster.
        DownloadCount.objects.bulk_create(download_counts.values(), 100)
        for download_count in download_counts.values():
            save_stats_to_file(download_count)
        log.info('Processed a total of %s lines' % (index + 1))
        log.debug('Total processing time: %s' % (datetime.now() - start))

        # Clean up file.
        log.debug('Deleting {path}'.format(path=filepath))
        unlink(filepath)
    def handle(self, *args, **options):
        """Import download counts for one day from S3 or a local hive file.

        Each line is ``<day><sep><count><sep><id_or_slug><sep><src>`` where
        the third field is either a file id or an add-on id/slug. Counts are
        aggregated per add-on and bulk-created as DownloadCount rows.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

        sep = options['separator']

        if options['stats_source'] == 's3':
            filepath = 's3://' + '/'.join([settings.AWS_STATS_S3_BUCKET,
                                           settings.AWS_STATS_S3_PREFIX,
                                           'download_counts',
                                           day, '000000_0'])

        elif options['stats_source'] == 'file':
            folder = options['folder_name']
            folder = path.join(settings.TMP_PATH, folder, day)
            filepath = path.join(folder, 'download_counts.hive')

        else:
            # Fail explicitly instead of hitting a NameError on `filepath`
            # below if an unexpected source slips past argument parsing.
            raise CommandError(
                'Unknown stats_source: %s' % options['stats_source'])

        # Make sure we're not trying to update with mismatched data.
        if get_date(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)

        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        DownloadCount.objects.filter(date=day).delete()

        # Memoize the files to addon relations and the DownloadCounts.
        download_counts = {}

        # Perf: preload all the files and slugs once and for all.
        # This builds two dicts:
        # - One where each key (the file_id we get from the hive query) has
        #   the addon_id as value.
        # - One where each key (the add-on slug) has the add-on_id as value.
        files_to_addon = dict(File.objects.values_list('id',
                                                       'version__addon_id'))
        slugs_to_addon = dict(
            Addon.unfiltered.exclude(status=amo.STATUS_NULL)
            .values_list('slug', 'id'))
        # Perf: membership tests against dict.values() are O(n); build the
        # set of known add-on ids once instead of once per row.
        addon_ids = set(files_to_addon.values())

        # Only accept valid sources, which are constants. The source must
        # either be exactly one of the "full" valid sources, or prefixed by one
        # of the "prefix" valid sources.
        fulls = amo.DOWNLOAD_SOURCES_FULL
        prefixes = amo.DOWNLOAD_SOURCES_PREFIX

        count_file = get_stats_data(filepath)
        # Start at -1 so an empty file logs "0 lines" at the end instead of
        # raising a NameError on `index`.
        index = -1
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            # Strip the trailing newline before splitting on `sep`.
            splitted = line[:-1].split(sep)

            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue

            day, counter, id_or_slug, src = splitted
            try:
                # Clean up data.
                id_or_slug = id_or_slug.strip()
                counter = int(counter)
            except ValueError:
                # Ignore completely invalid data.
                continue

            if id_or_slug.isdigit():
                # If it's a digit, then it should be a file id.
                try:
                    id_or_slug = int(id_or_slug)
                except ValueError:
                    continue

                # Does this file exist?
                if id_or_slug in files_to_addon:
                    addon_id = files_to_addon[id_or_slug]
                # Maybe it's an add-on ?
                elif id_or_slug in addon_ids:
                    addon_id = id_or_slug
                else:
                    # It's an integer we don't recognize, ignore the row.
                    continue
            else:
                # It's probably a slug.
                if id_or_slug in slugs_to_addon:
                    addon_id = slugs_to_addon[id_or_slug]
                else:
                    # We've exhausted all possibilities, ignore this row.
                    continue

            if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                continue

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                download_counts[addon_id] = dc

            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

        # Close all old connections in this thread before we start creating the
        # `DownloadCount` values.
        # https://github.com/mozilla/addons-server/issues/6886
        # If the calculation above takes too long it might happen that we run
        # into `wait_timeout` problems and django doesn't reconnect properly
        # (potentially because of misconfiguration).
        # Django will re-connect properly after it notices that all
        # connections are closed.
        close_old_connections()

        # Create in bulk: this is much faster.
        DownloadCount.objects.bulk_create(download_counts.values(), 100)

        log.info('Processed a total of %s lines' % (index + 1))
        log.debug('Total processing time: %s' % (datetime.now() - start))

        if options['stats_source'] == 'file':
            # Clean up file.
            log.debug('Deleting {path}'.format(path=filepath))
            unlink(filepath)
Example #5
0
    def handle(self, *args, **options):
        """Import download counts for one day from a local hive result file.

        Each line is ``<day><sep><count><sep><id_or_slug><sep><src>`` where
        the third field is either a file id or an add-on id/slug. Counts are
        aggregated per add-on, bulk-created as DownloadCount rows, saved to
        stats files, and the source file is deleted afterwards.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        folder = options['folder_name']
        folder = path.join(settings.TMP_PATH, folder, day)
        sep = options['separator']
        filepath = path.join(folder, 'download_counts.hive')
        # Make sure we're not trying to update with mismatched data.
        if get_date_from_file(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)
        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        DownloadCount.objects.filter(date=day).delete()

        # Memoize the files to addon relations and the DownloadCounts.
        download_counts = {}
        # Perf: preload all the files and slugs once and for all.
        # This builds two dicts:
        # - One where each key (the file_id we get from the hive query) has
        #   the addon_id as value.
        # - One where each key (the add-on slug) has the add-on_id as value.
        files_to_addon = dict(
            File.objects.values_list('id', 'version__addon_id'))
        slugs_to_addon = dict(Addon.objects.public().values_list('slug', 'id'))
        # Perf: membership tests against dict.values() are O(n); build the
        # set of known add-on ids once instead of once per row.
        addon_ids = set(files_to_addon.values())

        # Only accept valid sources, which are listed in the DownloadSource
        # model. The source must either be exactly one of the "full" valid
        # sources, or prefixed by one of the "prefix" valid sources.
        fulls = set(
            DownloadSource.objects.filter(type='full').values_list('name',
                                                                   flat=True))
        prefixes = DownloadSource.objects.filter(type='prefix').values_list(
            'name', flat=True)

        # Start at -1 so an empty file logs "0 lines" at the end instead of
        # raising a NameError on `index`.
        index = -1
        with codecs.open(filepath, encoding='utf8') as count_file:
            for index, line in enumerate(count_file):
                if index and (index % 1000000) == 0:
                    log.info('Processed %s lines' % index)

                # Strip the trailing newline before splitting on `sep`.
                splitted = line[:-1].split(sep)

                if len(splitted) != 4:
                    log.debug('Badly formatted row: %s' % line)
                    continue

                day, counter, id_or_slug, src = splitted
                try:
                    # Clean up data.
                    id_or_slug = id_or_slug.strip()
                    counter = int(counter)
                except ValueError:
                    # Ignore completely invalid data.
                    continue

                if id_or_slug.isdigit():
                    # If it's a digit, then it should be a file id.
                    try:
                        id_or_slug = int(id_or_slug)
                    except ValueError:
                        continue

                    # Does this file exist?
                    if id_or_slug in files_to_addon:
                        addon_id = files_to_addon[id_or_slug]
                    # Maybe it's an add-on ?
                    elif id_or_slug in addon_ids:
                        addon_id = id_or_slug
                    else:
                        # It's an integer we don't recognize, ignore the row.
                        continue
                else:
                    # It's probably a slug.
                    if id_or_slug in slugs_to_addon:
                        addon_id = slugs_to_addon[id_or_slug]
                    else:
                        # We've exhausted all possibilities, ignore this row.
                        continue

                if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                    continue

                # Memoize the DownloadCount.
                if addon_id in download_counts:
                    dc = download_counts[addon_id]
                else:
                    dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                    download_counts[addon_id] = dc

                # We can now fill the DownloadCount object.
                dc.count += counter
                dc.sources = update_inc(dc.sources, src, counter)

        # Close all old connections in this thread before we start creating the
        # `DownloadCount` values.
        # https://github.com/mozilla/addons-server/issues/6886
        # If the calculation above takes too long it might happen that we run
        # into `wait_timeout` problems and django doesn't reconnect properly
        # (potentially because of misconfiguration).
        # Django will re-connect properly after it notices that all
        # connections are closed.
        close_old_connections()

        # Create in bulk: this is much faster.
        DownloadCount.objects.bulk_create(download_counts.values(), 100)

        for download_count in download_counts.values():
            save_stats_to_file(download_count)

        log.info('Processed a total of %s lines' % (index + 1))
        log.debug('Total processing time: %s' % (datetime.now() - start))

        # Clean up file.
        log.debug('Deleting {path}'.format(path=filepath))
        unlink(filepath)
    def handle(self, *args, **options):
        """Import download counts for one day from S3 or a local hive file.

        Each line is ``<day><sep><count><sep><id_or_slug><sep><src>`` where
        the third field is either a file id or an add-on id/slug. Counts are
        aggregated per add-on and bulk-created as DownloadCount rows.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

        sep = options['separator']

        if options['stats_source'] == 's3':
            filepath = 's3://' + '/'.join([
                settings.AWS_STATS_S3_BUCKET, settings.AWS_STATS_S3_PREFIX,
                'download_counts', day, '000000_0'
            ])

        elif options['stats_source'] == 'file':
            folder = options['folder_name']
            folder = path.join(settings.TMP_PATH, folder, day)
            filepath = path.join(folder, 'download_counts.hive')

        else:
            # Fail explicitly instead of hitting a NameError on `filepath`
            # below if an unexpected source slips past argument parsing.
            raise CommandError(
                'Unknown stats_source: %s' % options['stats_source'])

        # Make sure we're not trying to update with mismatched data.
        if get_date(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)

        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        DownloadCount.objects.filter(date=day).delete()

        # Memoize the files to addon relations and the DownloadCounts.
        download_counts = {}

        # Perf: preload all the files and slugs once and for all.
        # This builds two dicts:
        # - One where each key (the file_id we get from the hive query) has
        #   the addon_id as value.
        # - One where each key (the add-on slug) has the add-on_id as value.
        files_to_addon = dict(
            File.objects.values_list('id', 'version__addon_id'))
        slugs_to_addon = dict(
            Addon.unfiltered.exclude(status=amo.STATUS_NULL).values_list(
                'slug', 'id'))
        # Perf: membership tests against dict.values() are O(n); build the
        # set of known add-on ids once instead of once per row.
        addon_ids = set(files_to_addon.values())

        # Only accept valid sources, which are constants. The source must
        # either be exactly one of the "full" valid sources, or prefixed by one
        # of the "prefix" valid sources.
        fulls = amo.DOWNLOAD_SOURCES_FULL
        prefixes = amo.DOWNLOAD_SOURCES_PREFIX

        count_file = get_stats_data(filepath)
        # Start at -1 so an empty file logs "0 lines" at the end instead of
        # raising a NameError on `index`.
        index = -1
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            # Strip the trailing newline before splitting on `sep`.
            splitted = line[:-1].split(sep)

            if len(splitted) != 4:
                log.info('Badly formatted row: %s' % line)
                continue

            day, counter, id_or_slug, src = splitted
            try:
                # Clean up data.
                id_or_slug = id_or_slug.strip()
                counter = int(counter)
            except ValueError:
                # Ignore completely invalid data.
                continue

            if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                continue

            if id_or_slug.isdigit():
                # If it's a digit, then it should be a file id.
                try:
                    id_or_slug = int(id_or_slug)
                except ValueError:
                    continue
                addon_id = (
                    # Does this file exist?
                    files_to_addon.get(id_or_slug) or
                    # Maybe it's an add-on ?
                    (id_or_slug if id_or_slug in addon_ids
                     # otherwise it doesn't exist
                     else None))
            else:
                # If it's not numeric it's probably a slug.
                addon_id = slugs_to_addon.get(id_or_slug)
            if not addon_id:
                # We've exhausted all possibilities, ignore this row.
                continue

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
                # update the DownloadCount object.
                dc.count += counter
                dc.sources = update_inc(dc.sources, src, counter)
            else:
                dc = DownloadCount(date=day,
                                   addon_id=addon_id,
                                   count=counter,
                                   sources={src: counter})
                download_counts[addon_id] = dc

        # Close all old connections in this thread before we start creating the
        # `DownloadCount` values.
        # https://github.com/mozilla/addons-server/issues/6886
        # If the calculation above takes too long it might happen that we run
        # into `wait_timeout` problems and django doesn't reconnect properly
        # (potentially because of misconfiguration).
        # Django will re-connect properly after it notices that all
        # connections are closed.
        close_old_connections()

        # Create in bulk: this is much faster.
        DownloadCount.objects.bulk_create(download_counts.values(), 100)

        log.info('Processed a total of %s lines' % (index + 1))
        log.info('Total processing time: %s' % (datetime.now() - start))

        if options['stats_source'] == 'file':
            # Clean up file.
            log.info('Deleting {path}'.format(path=filepath))
            unlink(filepath)
    def handle(self, *args, **options):
        """Import download counts for one day from a local hive result file.

        Each line is ``<day><sep><count><sep><file_id><sep><src>``. Counts
        are aggregated per add-on, bulk-created as DownloadCount rows, saved
        to stats files, and the source file is deleted afterwards.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        folder = args[0] if args else 'hive_results'
        folder = path.join(settings.TMP_PATH, folder, day)
        sep = options['separator']
        filepath = path.join(folder, 'download_counts.hive')
        # Make sure we're not trying to update with mismatched data.
        if get_date_from_file(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)
        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        DownloadCount.objects.filter(date=day).delete()

        # Memoize the files to addon relations and the DownloadCounts.
        download_counts = {}
        # Perf: preload all the files once and for all.
        # This builds a dict where each key (the file_id we get from the hive
        # query) has the addon_id as value.
        files_to_addon = dict(File.objects.values_list('id',
                                                       'version__addon_id'))

        # Only accept valid sources, which are listed in the DownloadSource
        # model. The source must either be exactly one of the "full" valid
        # sources, or prefixed by one of the "prefix" valid sources.
        fulls = set(DownloadSource.objects.filter(type='full').values_list(
            'name', flat=True))
        prefixes = DownloadSource.objects.filter(type='prefix').values_list(
            'name', flat=True)

        # Start at -1 so an empty file logs "0 lines" at the end instead of
        # raising a NameError on `index` (enumerate never binds it then).
        index = -1
        with codecs.open(filepath, encoding='utf8') as count_file:
            for index, line in enumerate(count_file):
                if index and (index % 1000000) == 0:
                    log.info('Processed %s lines' % index)

                # Strip the trailing newline before splitting on `sep`.
                splitted = line[:-1].split(sep)

                if len(splitted) != 4:
                    log.debug('Badly formatted row: %s' % line)
                    continue

                day, counter, file_id, src = splitted
                try:
                    file_id, counter = int(file_id), int(counter)
                except ValueError:  # Badly formatted? Drop.
                    continue

                if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                    continue

                # Does this file exist?
                if file_id in files_to_addon:
                    addon_id = files_to_addon[file_id]
                else:
                    continue

                # Memoize the DownloadCount.
                if addon_id in download_counts:
                    dc = download_counts[addon_id]
                else:
                    dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                    download_counts[addon_id] = dc

                # We can now fill the DownloadCount object.
                dc.count += counter
                dc.sources = update_inc(dc.sources, src, counter)

        # Create in bulk: this is much faster.
        DownloadCount.objects.bulk_create(download_counts.values(), 100)
        for download_count in download_counts.values():
            save_stats_to_file(download_count)
        log.info('Processed a total of %s lines' % (index + 1))
        log.debug('Total processing time: %s' % (datetime.now() - start))

        # Clean up file.
        log.debug('Deleting {path}'.format(path=filepath))
        unlink(filepath)