def test_index_latest(self):
    """Check the date range the cron job hands to ``index_stats``.

    A stats document dated five days ago is indexed first. The cron task is
    then expected to call the ``index_stats`` command with a
    ``<five days ago>:<today>`` range and ``addons=None``.
    """
    self.create_switch("local-statistics-processing")
    five_days_ago = datetime.date.today() - datetime.timedelta(days=5)
    UpdateCount.index({"date": five_days_ago})
    self.refresh("stats")

    expected_range = "%s:%s" % (
        five_days_ago.strftime("%Y-%m-%d"),
        datetime.date.today().strftime("%Y-%m-%d"),
    )
    # Patch the command runner so only the arguments are inspected.
    with mock.patch("olympia.stats.cron.call_command") as call_command:
        cron.index_latest_stats()
        call_command.assert_called_with(
            "index_stats", addons=None, date=expected_range)
Exemple #2
0
    def test_index_latest(self):
        """The cron task must reindex from the last indexed day up to today.

        With a stats document dated five days ago already indexed,
        ``cron.index_latest_stats`` should invoke ``index_stats`` for the
        ``<that day>:<today>`` range without restricting the add-ons.
        """
        self.create_switch('local-statistics-processing')
        last_indexed = datetime.date.today() - datetime.timedelta(days=5)
        UpdateCount.index({'date': last_indexed})
        self.refresh('stats')

        range_start = last_indexed.strftime('%Y-%m-%d')
        range_end = datetime.date.today().strftime('%Y-%m-%d')
        expected_kwargs = {
            'addons': None,
            'date': '%s:%s' % (range_start, range_end),
        }
        with mock.patch('olympia.stats.cron.call_command') as fake_call:
            cron.index_latest_stats()
            fake_call.assert_called_with('index_stats', **expected_kwargs)
    def test_trim_field(self):
        """Exercise ``trim_field`` on empty, small and oversized dicts.

        The expectations are: small dicts are untouched, the least used
        entries are dropped first when the dict is too big, and a dict whose
        JSON form is 2**16 - 1 characters long survives a database round
        trip unchanged.
        """
        update_count = UpdateCount(addon_id=3615, count=1, date='2015-01-11')

        # Nothing to trim on an empty field.
        self.command.trim_field(update_count.versions)
        assert not update_count.versions

        # Small enough to fit in the db: must come back unchanged.
        update_count.versions = {'3.6': 123, '3.7': 321}
        self.command.trim_field(update_count.versions)
        assert update_count.versions == {'3.6': 123, '3.7': 321}

        # Too big, and the oversized key is the least used: only it goes.
        oversized_key = 'x' * 65536
        update_count.versions[oversized_key] = 1
        self.command.trim_field(update_count.versions)
        assert update_count.versions == {'3.6': 123, '3.7': 321}

        # Too big, and the oversized key is the most used: the least used
        # keys are removed first, the dict is still too big, so every key
        # ends up removed.
        update_count.versions[oversized_key] = 1000
        self.command.trim_field(update_count.versions)
        assert update_count.versions == {}

        # Make sure a very large field can be stored: this key makes the
        # serialized dict barely fit in the db.
        max_serialized_length = 65535
        update_count.versions['x' * 65528] = 1
        assert len(json.dumps(update_count.versions)) == max_serialized_length
        update_count.save()
        reloaded = UpdateCount.objects.get(pk=update_count.pk)
        # It fits in the database, so nothing was truncated.
        assert len(json.dumps(reloaded.versions)) == max_serialized_length
Exemple #4
0
    def setUp(self):
        """Create four add-ons and give each the same 21 days of counts.

        The add-ons are a persona, a default ``addon_factory()`` add-on, a
        static theme and an add-on with ``STATUS_NOMINATED``. Every one of
        them gets one ``UpdateCount`` row per day for the 21 days preceding
        today.
        """
        self.persona = addon_factory(type=amo.ADDON_PERSONA)
        self.extension = addon_factory()
        self.static_theme = addon_factory(type=amo.ADDON_STATICTHEME)
        self.awaiting_review = addon_factory(status=amo.STATUS_NOMINATED)

        today = datetime.date.today()

        # Daily counts, most recent first: the first value is for yesterday,
        # the last one for 21 days ago.
        daily_counts = (
            827080, 787930, 995860, 1044260, 105431, 106065, 980930,
            817100, 78843, 993830, 104431, 105943, 105039, 100183,
            82265, 100183, 82265, 100183, 82265, 100183, 82265,
        )
        dated_counts = [
            (today - datetime.timedelta(days=age), count)
            for age, count in enumerate(daily_counts, start=1)
        ]

        addons = (self.persona, self.extension, self.static_theme,
                  self.awaiting_review)
        for addon in addons:
            rows = [
                UpdateCount(addon=addon, date=day, count=count)
                for day, count in dated_counts
            ]
            UpdateCount.objects.bulk_create(rows)
Exemple #5
0
    def test_13_day_window(self):
        """Check the average daily users computed from recent update counts.

        Fifteen consecutive days of counts are stored for add-on 3615; after
        the cron task runs, ``average_daily_users`` is expected to equal the
        mean of the twelve most recent days.
        """
        addon = Addon.objects.get(pk=3615)

        # A fixed date can't be used here since we rely on mysql to provide
        # `CURDATE()`.
        today = datetime.date.today()

        # Data is coming from the `tab groups` add-on, from jun 11 till
        # may 29th 2017. First value is for yesterday, last for 15 days ago.
        daily_counts = (
            82708, 78793, 99586, 104426, 105431, 106065, 98093, 81710,
            78843, 99383, 104431, 105943, 105039, 100183, 82265,
        )
        rows = [
            UpdateCount(
                addon=addon,
                date=today - datetime.timedelta(days=age),
                count=count)
            for age, count in enumerate(daily_counts, start=1)
        ]
        UpdateCount.objects.bulk_create(rows)

        addon.update(average_daily_users=0)
        cron.update_addon_average_daily_users()
        addon.refresh_from_db()

        # Sanity check of the expected value: mean of the 12 latest days.
        assert sum(daily_counts[:12]) / 12 == 95451

        assert addon.average_daily_users == 95451
    def test_trim_field(self):
        """Verify how ``trim_field`` shrinks the ``versions`` dict.

        Covers an empty field, a dict that already fits, a dict made too big
        by a rarely used key, a dict made too big by its most used key, and
        finally a database round trip of the largest dict that still fits.
        """
        kept_versions = {'3.6': 123, '3.7': 321}
        huge_key = 'x' * (2 ** 16)
        db_limit = (2 ** 16) - 1

        record = UpdateCount(addon_id=3615, count=1, date='2015-01-11')

        # Empty field: nothing happens.
        self.command.trim_field(record.versions)
        assert not record.versions

        # Small enough to fit in the db: left unchanged.
        record.versions = {'3.6': 123, '3.7': 321}
        self.command.trim_field(record.versions)
        assert record.versions == kept_versions

        # Too big because of the least used key: the most used are kept.
        record.versions[huge_key] = 1
        self.command.trim_field(record.versions)
        assert record.versions == kept_versions

        # Too big because of the most used key: the least used keys are
        # removed, the dict is still too big, so all the keys are removed.
        record.versions[huge_key] = 1000
        self.command.trim_field(record.versions)
        assert record.versions == {}

        # A key of this length makes the dict barely fit in the db.
        barely_fitting_key = 'x' * 65528
        record.versions[barely_fitting_key] = 1
        assert len(json.dumps(record.versions)) == db_limit
        record.save()

        # Reload: the value fits in the database, so no truncation happened.
        record = UpdateCount.objects.get(pk=record.pk)
        assert len(json.dumps(record.versions)) == db_limit
 def test_update_version(self):
     """Check that ``update_version`` records counts and trims long versions.

     A known version is stored as-is; a 33-character version string is
     expected to be stored under its first 32 characters.
     """
     # Initialize the known addons and their versions.
     self.command.addons_versions = {3615: ['3.5', '3.6']}
     update_count = UpdateCount(addon_id=3615)

     self.command.update_version(update_count, '3.6', 123)
     assert update_count.versions == {'3.6': 123}

     # Very long version: expected to be trimmed to 32 characters.
     overlong_version = '1' * 33
     self.command.update_version(update_count, overlong_version, 1)
     expected_versions = {'3.6': 123, overlong_version[:32]: 1}
     assert update_count.versions == expected_versions
Exemple #8
0
def migrate_theme_update_count(lwt, static_theme, **kw):
    """Create UpdateCount instances from ThemeUpdateCount instances.

    By default every ThemeUpdateCount of the given lwt (lightweight theme)
    is copied, with the same date and count, to an UpdateCount attached to
    ``static_theme``. Any additional ``**kw`` are passed to the queryset
    filter, for example to limit the copy to a certain day or day range.
    """
    source_rows = ThemeUpdateCount.objects.filter(
        addon_id=lwt.id, **kw).iterator()

    copies = []
    for source in source_rows:
        copies.append(
            UpdateCount(addon_id=static_theme.id,
                        date=source.date,
                        count=source.count))

    # The second argument is the batch size used for the inserts.
    UpdateCount.objects.bulk_create(copies, 100)
Exemple #9
0
def test_stats_from_model_update_count():
    """Check the JSON produced by ``serialize_stats`` for an UpdateCount.

    The serialized document is expected to expose the add-on id under
    ``addon`` and to carry the date, count and every breakdown dict
    unchanged (including locale keys differing only by case).
    """
    update_count = UpdateCount(
        addon_id=321,
        date='2016-01-18',
        count=123,
        versions={u'3.8': 2, u'3.7': 3},
        statuses={u'userEnabled': 5},
        applications={
            u'{ec8030f7-c20a-464f-9b0e-13a3a9e97384}': {u'3.6': 18},
        },
        oses={u'WINNT': 5},
        locales={u'en-us': 1, u'en-US': 4},
    )
    serialized = serialize_stats(update_count)

    expected = {
        'date': '2016-01-18',
        'addon': 321,
        'count': 123,
        'versions': {'3.7': 3, '3.8': 2},
        'oses': {'WINNT': 5},
        'applications': {
            '{ec8030f7-c20a-464f-9b0e-13a3a9e97384}': {'3.6': 18},
        },
        'locales': {'en-US': 4, 'en-us': 1},
        'statuses': {'userEnabled': 5},
    }
    assert json.loads(serialized) == expected
 def test_update_app(self):
     """Check which application/version pairs ``update_app`` accepts.

     Unknown applications and malformed versions must leave
     ``applications`` empty; well formed versions are recorded per
     application guid.
     """
     firefox_guid = '{ec8030f7-c20a-464f-9b0e-13a3a9e97384}'
     update_count = UpdateCount(addon_id=3615)

     # Non-existent app.
     self.command.update_app(update_count, 'foobar', '1.0', 123)
     assert not update_count.applications

     # Malformed versions.
     for bad_version in ('3.0.1.2', '3.0123', '3.0c2', 'a.b.c'):
         self.command.update_app(
             update_count, firefox_guid, bad_version, 123)
     assert not update_count.applications

     # Well formed versions.
     accepted = (
         ('1.0', 123),
         ('1.0.1', 124),
         ('1.0a1', 125),
         ('1.0b2', 126),
     )
     for version, count in accepted:
         self.command.update_app(update_count, firefox_guid, version, count)
     assert update_count.applications == {firefox_guid: dict(accepted)}
 def test_update_locale(self):
     """Check that ``update_locale`` ignores unknown locales only.

     ``'foobar'`` must not be recorded, while every locale of the list
     below is expected to produce its own entry in ``locales``.
     """
     # Taken from the language pack index.
     known_locales = (
         'ach', 'af', 'ak', 'an', 'ar', 'as', 'ast', 'ast-ES', 'az',
         'bb-BK', 'be', 'bg', 'bn-BD', 'bn-IN', 'br', 'bs', 'ca',
         'ca-valencia', 'cs', 'csb', 'cy', 'cy-GB', 'da', 'de', 'dsb', 'el',
         'en-GB', 'en-ZA', 'eo', 'es-AR', 'es-CL', 'es-ES', 'es-MX', 'et',
         'eu', 'fa', 'ff', 'fi', 'fj-FJ', 'fr', 'fur-IT', 'fy-NL', 'ga-IE',
         'gd', 'gl', 'gu-IN', 'he', 'hi', 'hi-IN', 'hr', 'hsb', 'hu',
         'hy-AM', 'id', 'is', 'it', 'ja', 'kk', 'km', 'kn', 'ko', 'ku',
         'lg', 'lij', 'lt', 'lv', 'mai', 'mg', 'mk', 'ml', 'mr', 'ms',
         'nb-NO', 'nl', 'nn-NO', 'nr', 'nso', 'or', 'pa-IN', 'pl', 'pt-BR',
         'pt-PT', 'rm', 'ro', 'ru', 'si', 'sk', 'sl', 'son', 'sq', 'sr',
         'ss', 'st', 'sv-SE', 'sw', 'sw-TZ', 'ta', 'ta-IN', 'ta-LK', 'te',
         'th', 'tn', 'tr', 'ts', 'uk', 'ur', 've', 'vi', 'wa', 'wo-SN',
         'xh', 'zap-MX-diiste', 'zh-CN', 'zh-TW', 'zu',
     )
     update_count = UpdateCount(addon_id=3615)

     # Non-existent locale.
     self.command.update_locale(update_count, 'foobar', 123)
     assert not update_count.locales

     for locale_code in known_locales:
         self.command.update_locale(update_count, locale_code, 1)
     assert len(update_count.locales) == len(known_locales)
 def test_update_status(self):
     """Check that ``update_status`` records only known statuses."""
     update_count = UpdateCount(addon_id=3615)

     # Non-existent status: nothing must be stored.
     self.command.update_status(update_count, 'foobar', 123)
     assert not update_count.statuses

     self.command.update_status(update_count, 'userEnabled', 123)
     expected_statuses = {'userEnabled': 123}
     assert update_count.statuses == expected_statuses
 def test_update_os(self):
     """Check that ``update_os`` records only known operating systems."""
     update_count = UpdateCount(addon_id=3615)

     # Non-existent OS: nothing must be stored.
     self.command.update_os(update_count, 'foobar', 123)
     assert not update_count.oses

     self.command.update_os(update_count, 'WINNT', 123)
     expected_oses = {'WINNT': 123}
     assert update_count.oses == expected_oses
Exemple #14
0
    def handle(self, *args, **options):
        """Load one day of theme update counts from a hive export.

        Reads ``separator``, ``date``, ``stats_source`` and, for the ``file``
        source, ``folder_name`` from ``options``. When no date is given,
        yesterday is used. Existing ``ThemeUpdateCount`` rows for that day
        are deleted, then the export is aggregated per add-on and inserted in
        bulk. Counts for personas listed in ``MigratedLWT`` are additionally
        added to the ``UpdateCount`` of the matching static theme. With the
        ``file`` source, the export file is deleted once processed.

        Raises:
            CommandError: if ``stats_source`` is neither ``s3`` nor ``file``,
                or if the export contains data for another day.
        """
        sep = options['separator']
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

        if options['stats_source'] == 's3':
            filepath = 's3://' + '/'.join([settings.AWS_STATS_S3_BUCKET,
                                           'amo_stats', 'theme_update_counts',
                                           day, '000000_0'])

        elif options['stats_source'] == 'file':
            folder = options['folder_name']
            folder = path.join(settings.TMP_PATH, folder, day)
            filepath = path.join(folder, 'theme_update_counts.hive')

        else:
            # Fail explicitly instead of hitting an unbound `filepath` below.
            raise CommandError('Unknown stats source: %s' %
                               options['stats_source'])

        # Make sure we're not trying to update with mismatched data.
        if get_date(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)

        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        ThemeUpdateCount.objects.filter(date=day).delete()

        theme_update_counts = {}
        new_stheme_update_counts = {}

        # Preload a set containing the ids of all the persona Add-on objects
        # that we care about. When looping, if we find an id that is not in
        # that set, we'll reject it.
        addons = set(Addon.objects.filter(type=amo.ADDON_PERSONA,
                                          status=amo.STATUS_PUBLIC,
                                          persona__isnull=False)
                                  .values_list('id', flat=True))
        # Preload a dict of persona to static theme ids that are migrated.
        migrated_personas = dict(
            MigratedLWT.objects.values_list(
                'lightweight_theme_id', 'static_theme_id')
        )
        existing_stheme_update_counts = {
            uc.addon_id: uc for uc in UpdateCount.objects.filter(
                addon_id__in=migrated_personas.values())}
        # Preload all the Personas once and for all. This builds a dict where
        # each key (the persona_id we get from the hive query) has the addon_id
        # as value.
        persona_to_addon = dict(Persona.objects.values_list('persona_id',
                                                            'addon_id'))

        count_file = get_stats_data(filepath)
        # Stays at -1 when the export is empty, so that the final log line
        # reports 0 lines instead of failing on an unbound name.
        index = -1
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            # Drop the trailing newline before splitting the columns.
            splitted = line[:-1].split(sep)

            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue

            # NOTE: this rebinds `day` to the date column of the row; the
            # objects created below use that value.
            day, id_, src, count = splitted
            try:
                id_, count = int(id_), int(count)
            except ValueError:  # Badly formatted? Drop.
                continue

            if src:
                src = src.strip()

            # If src is 'gp', it's an old request for the persona id.
            if id_ not in persona_to_addon and src == 'gp':
                continue  # No such persona.
            addon_id = persona_to_addon[id_] if src == 'gp' else id_

            # Is the persona already migrated to static theme?
            if addon_id in migrated_personas:
                mig_addon_id = migrated_personas[addon_id]
                if mig_addon_id in existing_stheme_update_counts:
                    # Already in the database: update and save it right away.
                    existing_stheme_update_counts[mig_addon_id].count += count
                    existing_stheme_update_counts[mig_addon_id].save()
                elif mig_addon_id in new_stheme_update_counts:
                    new_stheme_update_counts[mig_addon_id].count += count
                else:
                    new_stheme_update_counts[mig_addon_id] = UpdateCount(
                        addon_id=mig_addon_id, date=day, count=count)

            # Does this addon exist?
            if addon_id not in addons:
                continue

            # Memoize the ThemeUpdateCount.
            if addon_id in theme_update_counts:
                tuc = theme_update_counts[addon_id]
            else:
                tuc = ThemeUpdateCount(addon_id=addon_id, date=day,
                                       count=0)
                theme_update_counts[addon_id] = tuc

            # We can now fill the ThemeUpdateCount object.
            tuc.count += count

        # Create in bulk: this is much faster.
        ThemeUpdateCount.objects.bulk_create(theme_update_counts.values(), 100)
        UpdateCount.objects.bulk_create(new_stheme_update_counts.values(), 100)

        log.info('Processed a total of %s lines' % (index + 1))
        log.debug('Total processing time: %s' % (datetime.now() - start))

        # Clean up file.
        if options['stats_source'] == 'file':
            log.debug('Deleting {path}'.format(path=filepath))
            unlink(filepath)
    def handle(self, *args, **options):
        """Load one day of add-on update counts from the hive exports.

        Reads ``separator``, ``date``, ``stats_source`` and, for the ``file``
        source, ``folder_name`` from ``options``. When no date is given,
        yesterday is used. One export per group (app, locale, os, status,
        version) is read; existing ``UpdateCount`` rows for the day are
        deleted and one ``UpdateCount`` per known add-on guid is built from
        the exports and inserted in bulk. With the ``file`` source, the
        export files are deleted once processed.

        Raises:
            CommandError: if ``stats_source`` is neither ``s3`` nor ``file``,
                or if one of the exports contains data for another day.
        """
        sep = options['separator']

        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

        groups = ('app', 'locale', 'os', 'status', 'version')
        group_filepaths = []
        # Make sure we're not trying to update with mismatched data.
        for group in groups:
            if options['stats_source'] == 's3':
                filepath = 's3://' + '/'.join([
                    settings.AWS_STATS_S3_BUCKET, settings.AWS_STATS_S3_PREFIX,
                    'update_counts_by_%s' % group, day, '000000_0'
                ])

            elif options['stats_source'] == 'file':
                folder = options['folder_name']
                folder = path.join(settings.TMP_PATH, folder, day)
                filepath = path.join(folder,
                                     'update_counts_by_%s.hive' % group)

            else:
                # Fail explicitly instead of hitting an unbound `filepath`.
                raise CommandError('Unknown stats source: %s' %
                                   options['stats_source'])

            if get_date(filepath, sep) != day:
                raise CommandError('%s file contains data for another day' %
                                   filepath)
            group_filepaths.append((group, filepath))

        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        UpdateCount.objects.filter(date=day).delete()

        # Memoize the addons and the UpdateCounts.
        update_counts = {}
        # Perf: preload all the addons once and for all.
        # This builds a dict where each key (the addon guid we get from the
        # hive query) has the addon_id as value.
        guids_to_addon = (
            dict(
                Addon.unfiltered.exclude(status=amo.STATUS_NULL).exclude(
                    guid__isnull=True)
                # Shouldn't be necessary to exclude _ADDON_PERSONA now but we've
                # still got a huge number of deleted LWT in the database.
                .exclude(type=9).values_list('guid', 'id')))

        # Number of lines read across every export, for the final log line.
        total_lines = 0

        for group, filepath in group_filepaths:
            count_file = get_stats_data(filepath)
            for index, line in enumerate(count_file):
                total_lines += 1
                if index and (index % 1000000) == 0:
                    log.info('Processed %s lines' % index)

                # Drop the trailing newline before splitting the columns.
                splitted = line[:-1].split(sep)

                # The `app` export has one more column than the others.
                if ((group == 'app' and len(splitted) != 6)
                        or (group != 'app' and len(splitted) != 5)):
                    log.debug('Badly formatted row: %s' % line)
                    continue

                # NOTE: this rebinds `day` to the date column of the row; the
                # UpdateCount objects created below use that value.
                if group == 'app':
                    day, addon_guid, app_id, app_ver, count, \
                        update_type = splitted
                else:
                    day, addon_guid, data, count, update_type = splitted

                addon_guid = addon_guid.strip()
                if update_type:
                    # str.strip() returns a new string: keep the result so
                    # that padded values match the "empty-like" list below.
                    update_type = update_type.strip()

                # Old versions of Firefox don't provide the update type.
                # All the following are "empty-like" values.
                if update_type in [
                        '0', 'NULL', 'None', '', '\\N', '%UPDATE_TYPE%'
                ]:
                    update_type = None

                try:
                    count = int(count)
                    if update_type:
                        update_type = int(update_type)
                except ValueError:  # Badly formatted? Drop.
                    continue

                # The following is magic that I don't understand. I've just
                # been told that this is the way we can make sure a request
                # is valid:
                # > the lower bits for updateType (eg 112) should add to
                # > 16, if not, ignore the request.
                # > udpateType & 31 == 16 == valid request.
                if update_type and update_type & 31 != 16:
                    log.debug("Update type doesn't add to 16: %s" %
                              update_type)
                    continue

                # Does this addon exist?
                if addon_guid and addon_guid in guids_to_addon:
                    addon_id = guids_to_addon[addon_guid]
                else:
                    log.debug(u"Addon {guid} doesn't exist.".format(
                        guid=addon_guid))
                    continue

                # Memoize the UpdateCount.
                if addon_guid in update_counts:
                    uc = update_counts[addon_guid]
                else:
                    uc = UpdateCount(date=day, addon_id=addon_id, count=0)
                    update_counts[addon_guid] = uc

                # We can now fill the UpdateCount object.
                if group == 'version':
                    self.update_version(uc, data, count)
                elif group == 'status':
                    self.update_status(uc, data, count)
                    if data == UPDATE_COUNT_TRIGGER:
                        # Use this count to compute the global number
                        # of daily users for this addon.
                        uc.count += count
                elif group == 'app':
                    self.update_app(uc, app_id, app_ver, count)
                elif group == 'os':
                    self.update_os(uc, data, count)
                elif group == 'locale':
                    self.update_locale(uc, data, count)

        # Make sure the locales and versions fields aren't too big to fit in
        # the database. Those two fields are the only ones that are not fully
        # validated, so we could end up with just anything in there (spam,
        # buffer overflow attempts and the like).
        # We don't care that they will increase the numbers, but we do not want
        # those to break the process because of a "Data too long for column
        # 'version'" error.
        # The database field (TEXT), can hold up to 2^16 = 64k characters.
        # If the field is longer than that, we drop the least used items
        # (with the lower count) until the field fits.
        for update_count in update_counts.values():
            self.trim_field(update_count.locales)
            self.trim_field(update_count.versions)

        # Create in bulk: this is much faster.
        UpdateCount.objects.bulk_create(update_counts.values(), 100)

        log.info('Processed a total of %s lines' % total_lines)
        log.debug('Total processing time: %s' % (datetime.now() - start))

        # Clean up files.
        if options['stats_source'] == 'file':
            for _, filepath in group_filepaths:
                log.debug('Deleting {path}'.format(path=filepath))
                unlink(filepath)
Exemple #16
0
    def setUp(self):
        """Create seven add-ons with 21 days of update counts each.

        Three series of daily counts are used: a high-volume one (shared by
        the extension, the static theme and the add-on awaiting review), a
        low-volume one (the two "unpopular" add-ons) and an intermediate one
        (the "barely popular" theme and the add-on sharing its stats).
        """
        self.extension = addon_factory()
        self.static_theme = addon_factory(type=amo.ADDON_STATICTHEME)
        self.unpopular_extension = addon_factory()
        self.unpopular_theme = addon_factory(type=amo.ADDON_STATICTHEME)
        self.barely_popular_theme = addon_factory(type=amo.ADDON_STATICTHEME)
        self.same_stats_as_barely_popular_theme = addon_factory()
        self.awaiting_review = addon_factory(status=amo.STATUS_NOMINATED)

        today = datetime.date.today()

        def dated(daily_counts):
            # Pair each count with its day: the first value is for
            # yesterday, the last one for 21 days ago.
            return [
                (today - datetime.timedelta(days=age), count)
                for age, count in enumerate(daily_counts, start=1)
            ]

        stats = dated((
            827080, 787930, 995860, 1044260, 105431, 106065, 980930,
            817100, 78843, 993830, 104431, 105943, 105039, 100183,
            82265, 100183, 82265, 100183, 82265, 100183, 82265,
        ))
        unpopular_stats = dated((
            99, 76, 25, 32, 289, 34, 45,
            25, 78, 36, 25, 100, 156, 24,
            9, 267, 176, 16, 156, 187, 149,
        ))
        barely_popular_stats = dated((
            399, 276, 215, 312, 289, 234, 345,
            205, 178, 336, 325, 400, 456, 324,
            290, 267, 276, 216, 256, 287, 249,
        ))

        # Each group of add-ons receives its own copy of one series.
        fixtures = (
            ((self.extension, self.static_theme, self.awaiting_review),
             stats),
            ((self.unpopular_extension, self.unpopular_theme),
             unpopular_stats),
            ((self.barely_popular_theme,
              self.same_stats_as_barely_popular_theme),
             barely_popular_stats),
        )
        for addons, dated_counts in fixtures:
            for addon in addons:
                rows = [
                    UpdateCount(addon=addon, date=day, count=count)
                    for day, count in dated_counts
                ]
                UpdateCount.objects.bulk_create(rows)