Example #1
0
    def test_index_latest(self):
        """Check the date range index_latest_stats passes to index_stats.

        An UpdateCount dated five days ago is indexed first; the cron job is
        then expected to call ``index_stats`` with ``<that day>:<today>``.
        """
        day_format = "%Y-%m-%d"
        five_days_ago = date.today() - timedelta(days=5)
        UpdateCount.index({"date": five_days_ago})
        self.refresh("update_counts")

        expected_range = "%s:%s" % (
            five_days_ago.strftime(day_format),
            date.today().strftime(day_format),
        )
        # Patch the command runner so that nothing is actually re-indexed.
        with mock.patch("stats.cron.call_command") as patched_command:
            cron.index_latest_stats()
            patched_command.assert_called_with(
                "index_stats", addons=None, date=expected_range)
Example #2
0
    def test_index_latest(self):
        """index_latest_stats must request stats from the indexed day to today.

        The only indexed UpdateCount is dated five days back, so the expected
        ``date`` argument of ``index_stats`` is ``<five days back>:<today>``.
        """
        day_format = '%Y-%m-%d'
        latest = datetime.date.today() - datetime.timedelta(days=5)
        UpdateCount.index({'date': latest})
        self.refresh('update_counts')

        start = latest.strftime(day_format)
        finish = datetime.date.today().strftime(day_format)
        expected_kwargs = {'addons': None, 'date': '%s:%s' % (start, finish)}
        with mock.patch('stats.cron.call_command') as call:
            cron.index_latest_stats()
            call.assert_called_with('index_stats', **expected_kwargs)
Example #3
0
    def test_index_latest(self):
        # Index a single UpdateCount dated five days back and make it
        # visible to searches.
        five_days_back = datetime.date.today() - datetime.timedelta(days=5)
        UpdateCount.index({'date': five_days_back})
        self.refresh('update_counts')

        # The cron job is expected to ask for "<indexed day>:<today>".
        date_range = ':'.join([
            five_days_back.strftime('%Y-%m-%d'),
            datetime.date.today().strftime('%Y-%m-%d')])
        with mock.patch('stats.cron.call_command') as call_command:
            cron.index_latest_stats()
            call_command.assert_called_with(
                'index_stats', addons=None, date=date_range)
Example #4
0
    def test_trim_field(self):
        """Exercise trim_field on empty, small and oversized version dicts.

        Also checks that a dict serializing to (2**16) - 1 characters keeps
        that length after a save/reload round trip.
        """
        trim_field = self.command.trim_field
        max_json_length = (2**16) - 1
        update_count = UpdateCount(addon_id=3615, count=1, date='2015-01-11')

        # Empty field: nothing to trim.
        trim_field(update_count.versions)
        assert not update_count.versions

        # Small enough to fit in the db: must be left unchanged.
        update_count.versions = {'3.6': 123, '3.7': 321}
        trim_field(update_count.versions)
        assert update_count.versions == {'3.6': 123, '3.7': 321}

        # Too big because of a rarely used huge key: the most used are kept.
        huge_key = 'x' * (2**16)
        update_count.versions[huge_key] = 1
        trim_field(update_count.versions)
        assert update_count.versions == {'3.6': 123, '3.7': 321}

        # Too big with the huge key being the most used one: the least used
        # keys are removed first, the field is still too big, so every key
        # ends up being removed.
        update_count.versions[huge_key] = 1000
        trim_field(update_count.versions)
        assert update_count.versions == {}

        # Make sure we can store a very large field in the database: this
        # key makes the dict barely fit.
        barely_fitting_key = 'x' * 65528
        update_count.versions[barely_fitting_key] = 1
        assert len(json.dumps(update_count.versions)) == max_json_length
        update_count.save()
        reloaded = UpdateCount.objects.get(pk=update_count.pk)
        # Fits in the database, so no truncation.
        assert len(json.dumps(reloaded.versions)) == max_json_length
Example #5
0
    def test_trim_field(self):
        """trim_field must drop the least used keys of an oversized field."""
        field_limit = 2 ** 16
        regular_versions = {'3.6': 123, '3.7': 321}
        uc = UpdateCount(addon_id=3615, count=1, date='2015-01-11')

        # Case 1: empty field.
        self.command.trim_field(uc.versions)
        assert not uc.versions

        # Case 2: small enough to fit in the db, so unchanged.
        uc.versions = dict(regular_versions)
        self.command.trim_field(uc.versions)
        assert uc.versions == regular_versions

        # Case 3: too big, the oversized key is the least used one, so only
        # the most used keys are kept.
        oversized_key = 'x' * field_limit
        uc.versions[oversized_key] = 1
        self.command.trim_field(uc.versions)
        assert uc.versions == regular_versions

        # Case 4: too big, the oversized key is the most used one. Removing
        # the least used keys is not enough, so nothing is left.
        uc.versions[oversized_key] = 1000
        self.command.trim_field(uc.versions)
        assert uc.versions == {}

        # Case 5: a very large field that barely fits in the database must
        # be stored without truncation.
        uc.versions['x' * 65528] = 1
        assert len(json.dumps(uc.versions)) == field_limit - 1
        uc.save()
        uc = UpdateCount.objects.get(pk=uc.pk)  # Reload.
        assert len(json.dumps(uc.versions)) == field_limit - 1
Example #6
0
 def test_update_version(self):
     """update_version stores the count and trims overlong version names."""
     # Initialize the known addons and their versions.
     self.command.addons_versions = {3615: ['3.5', '3.6']}
     update_count = UpdateCount(addon_id=3615)

     self.command.update_version(update_count, '3.6', 123)
     assert update_count.versions == {'3.6': 123}

     # A 33 characters long version is expected to be stored under its
     # first 32 characters.
     overlong_version = '1' * 33
     self.command.update_version(update_count, overlong_version, 1)
     expected = {'3.6': 123, overlong_version[:32]: 1}
     assert update_count.versions == expected
Example #7
0
 def test_update_app(self):
     """update_app must only record known application/version pairs."""
     app_guid = '{app-guid}'
     # Initialize the known applications and their versions.
     self.command.valid_appversions = {app_guid: ['1.0', '2.0']}
     update_count = UpdateCount(addon_id=3615)

     # Non-existent app.
     self.command.update_app(update_count, 'foobar', '1.0', 123)
     assert not update_count.applications

     # Known app, non-existent version.
     self.command.update_app(update_count, app_guid, '3.0', 123)
     assert not update_count.applications

     # Known app and known version.
     self.command.update_app(update_count, app_guid, '1.0', 123)
     assert update_count.applications == {app_guid: {'1.0': 123}}
Example #8
0
 def test_update_app(self):
     """update_app must reject unknown apps and malformed versions."""
     firefox_guid = '{ec8030f7-c20a-464f-9b0e-13a3a9e97384}'
     update_count = UpdateCount(addon_id=3615)

     # Non-existent app.
     self.command.update_app(update_count, 'foobar', '1.0', 123)
     assert not update_count.applications

     # Malformed versions.
     for malformed in ('3.0.1.2', '3.0123', '3.0c2', 'a.b.c'):
         self.command.update_app(update_count, firefox_guid, malformed, 123)
     assert not update_count.applications

     # Well formed versions.
     well_formed = [
         ('1.0', 123),
         ('1.0.1', 124),
         ('1.0a1', 125),
         ('1.0b2', 126)]
     for version, count in well_formed:
         self.command.update_app(update_count, firefox_guid, version, count)
     assert update_count.applications == {firefox_guid: dict(well_formed)}
Example #9
0
 def test_update_locale(self):
     """update_locale must skip an unknown locale and keep the listed ones."""
     known_locales = [  # Taken from the language pack index.
         'ach', 'af', 'ak', 'an', 'ar', 'as', 'ast', 'ast-ES', 'az',
         'bb-BK', 'be', 'bg', 'bn-BD', 'bn-IN', 'br', 'bs', 'ca',
         'ca-valencia', 'cs', 'csb', 'cy', 'cy-GB', 'da', 'de', 'dsb', 'el',
         'en-GB', 'en-ZA', 'eo', 'es-AR', 'es-CL', 'es-ES', 'es-MX', 'et',
         'eu', 'fa', 'ff', 'fi', 'fj-FJ', 'fr', 'fur-IT', 'fy-NL', 'ga-IE',
         'gd', 'gl', 'gu-IN', 'he', 'hi', 'hi-IN', 'hr', 'hsb', 'hu',
         'hy-AM', 'id', 'is', 'it', 'ja', 'kk', 'km', 'kn', 'ko', 'ku',
         'lg', 'lij', 'lt', 'lv', 'mai', 'mg', 'mk', 'ml', 'mr', 'ms',
         'nb-NO', 'nl', 'nn-NO', 'nr', 'nso', 'or', 'pa-IN', 'pl', 'pt-BR',
         'pt-PT', 'rm', 'ro', 'ru', 'si', 'sk', 'sl', 'son', 'sq', 'sr',
         'ss', 'st', 'sv-SE', 'sw', 'sw-TZ', 'ta', 'ta-IN', 'ta-LK', 'te',
         'th', 'tn', 'tr', 'ts', 'uk', 've', 'vi', 'wa', 'wo-SN', 'xh',
         'zap-MX-diiste', 'zh-CN', 'zh-TW', 'zu']
     update_count = UpdateCount(addon_id=3615)

     # Non-existent locale: nothing must be recorded.
     self.command.update_locale(update_count, 'foobar', 123)
     assert not update_count.locales

     # Every listed locale must end up as its own entry.
     for locale_code in known_locales:
         self.command.update_locale(update_count, locale_code, 1)
     assert len(update_count.locales) == len(known_locales)
    def handle(self, *args, **options):
        """Load one day of hive update counts into UpdateCount rows.

        Reads the ``update_counts_by_<group>.hive`` files (one per group:
        version, status, app, os, locale) from
        ``<settings.NETAPP_STORAGE>/tmp/<folder>/<day>``, aggregates their
        rows per add-on guid and bulk-creates the resulting UpdateCount
        objects. Existing UpdateCount rows for that day are deleted first.

        Args:
            *args: optional; ``args[0]`` is the name of the results folder
                (defaults to ``'hive_results'``).
            **options: must contain ``date`` (compared against a
                ``%Y-%m-%d`` string; a falsy value means yesterday) and
                ``separator`` (the column separator of the hive files).

        Raises:
            CommandError: if one of the files holds data for another day.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        folder = args[0] if args else 'hive_results'
        folder = path.join(settings.NETAPP_STORAGE, 'tmp', folder, day)
        sep = options['separator']
        groups = ('version', 'status', 'app', 'os', 'locale')
        # Make sure we're not trying to update with mismatched data.
        for group in groups:
            filepath = path.join(folder, 'update_counts_by_%s.hive' % group)
            if get_date_from_file(filepath, sep) != day:
                raise CommandError('%s file contains data for another day' %
                                   filepath)
        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        UpdateCount.objects.filter(date=day).delete()

        # Memoize the addons and the UpdateCounts.
        update_counts = {}
        # Perf: preload all the addons once and for all.
        # This builds a dict where each key (the addon guid we get from the
        # hive query) has the addon_id as value.
        guids_to_addon = (dict(
            Addon.objects.exclude(guid__isnull=True).filter(
                type=amo.ADDON_EXTENSION).values_list('guid', 'id')))

        index = -1
        for group in groups:
            filepath = path.join(folder, 'update_counts_by_%s.hive' % group)
            # The 'app' files carry one more column (application id and
            # version instead of a single data column).
            expected_columns = 6 if group == 'app' else 5
            with open(filepath) as results_file:
                for line in results_file:
                    index += 1
                    if index and (index % 1000000) == 0:
                        log.info('Processed %s lines', index)

                    splitted = line[:-1].split(sep)

                    if len(splitted) != expected_columns:
                        # Logging arguments are passed lazily so the message
                        # is only built when debug logging is enabled.
                        log.debug('Badly formatted row: %s', line)
                        continue

                    # NOTE(review): this rebinds `day` with the value found
                    # in the row, so the UpdateCount created below uses the
                    # row's day - confirm that this is intended.
                    if group == 'app':
                        (day, addon_guid, app_id, app_ver, count,
                         update_type) = splitted
                    else:
                        day, addon_guid, data, count, update_type = splitted

                    # Normalize the guid once, so that the emptiness check,
                    # the addon lookup and the memoization key all agree.
                    addon_guid = addon_guid.strip()

                    try:
                        count, update_type = int(count), int(update_type)
                    except ValueError:  # Badly formatted? Drop.
                        continue

                    # The following is magic that I don't understand. I've just
                    # been told that this is the way we can make sure a request
                    # is valid:
                    # > the lower bits for updateType (eg 112) should add to
                    # > 16, if not, ignore the request.
                    # > updateType & 31 == 16 == valid request.
                    if update_type & 31 != 16:
                        log.debug("Update type doesn't add to 16: %s",
                                  update_type)
                        continue

                    # Does this addon exist?
                    if not addon_guid or addon_guid not in guids_to_addon:
                        continue
                    addon_id = guids_to_addon[addon_guid]

                    # Memoize the UpdateCount.
                    if addon_guid in update_counts:
                        uc = update_counts[addon_guid]
                    else:
                        uc = UpdateCount(date=day, addon_id=addon_id, count=0)
                        update_counts[addon_guid] = uc

                    # We can now fill the UpdateCount object.
                    if group == 'version':
                        # Take this count as the global number of daily users.
                        uc.count += count
                        uc.versions = update_inc(uc.versions, data, count)
                    elif group == 'status':
                        uc.statuses = update_inc(uc.statuses, data, count)
                    elif group == 'app':
                        # Applications is a dict of dicts, eg:
                        # {"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}":
                        #       {"10.0": 2, "21.0": 1, ....},
                        #  "some other application guid": ...
                        # }
                        if uc.applications is None:
                            uc.applications = {}
                        app = uc.applications.get(app_id, {})
                        # Now overwrite this application's dict with
                        # incremented counts for its versions.
                        uc.applications.update(
                            {app_id: update_inc(app, app_ver, count)})
                    elif group == 'os':
                        uc.oses = update_inc(uc.oses, data, count)
                    elif group == 'locale':
                        # Drop incorrect locales sizes.
                        if len(data) > 10:
                            continue
                        # Collapse locales to `xx_yy` if possible.
                        data = data.strip().lower().replace('-', '_')
                        uc.locales = update_inc(uc.locales, data, count)

        # Create in bulk: this is much faster.
        UpdateCount.objects.bulk_create(update_counts.values(), 100)
        log.info('Processed a total of %s lines', index + 1)
        log.debug('Total processing time: %s', datetime.now() - start)
Example #11
0
    def handle(self, *args, **options):
        """Load one day of hive update counts into UpdateCount rows.

        Reads the ``update_counts_by_<group>.hive`` files (one per group:
        version, status, app, os, locale) from
        ``<settings.TMP_PATH>/<folder>/<day>``, aggregates their rows per
        add-on guid through the ``update_*`` methods, trims the ``locales``
        and ``versions`` fields, bulk-creates the UpdateCount objects and
        finally deletes the processed files. Existing UpdateCount rows for
        that day are deleted first.

        Args:
            *args: optional; ``args[0]`` is the name of the results folder
                (defaults to ``'hive_results'``).
            **options: must contain ``date`` (compared against a
                ``%Y-%m-%d`` string; a falsy value means yesterday) and
                ``separator`` (the column separator of the hive files).

        Raises:
            CommandError: if one of the files holds data for another day.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        folder = args[0] if args else 'hive_results'
        folder = path.join(settings.TMP_PATH, folder, day)
        sep = options['separator']
        groups = ('version', 'status', 'app', 'os', 'locale')
        group_filepaths = []
        # Make sure we're not trying to update with mismatched data.
        for group in groups:
            filepath = path.join(folder, 'update_counts_by_%s.hive' % group)
            if get_date_from_file(filepath, sep) != day:
                raise CommandError('%s file contains data for another day' %
                                   filepath)
            group_filepaths.append((group, filepath))
        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        UpdateCount.objects.filter(date=day).delete()

        # Memoize the addons and the UpdateCounts.
        update_counts = {}
        # Perf: preload all the addons once and for all.
        # This builds a dict where each key (the addon guid we get from the
        # hive query) has the addon_id as value.
        guids_to_addon = (dict(
            Addon.objects.exclude(guid__isnull=True).exclude(
                type=amo.ADDON_PERSONA).values_list('guid', 'id')))

        # This builds a dict where each key (the application guid) has a list
        # of all its versions as a value. The queryset yields
        # (application ID, version) pairs, but we want the application GUID,
        # not the application ID.
        self.valid_appversions = {}
        appversions = AppVersion.objects.values_list('application', 'version')
        for application_id, app_version in appversions:
            app_guid = amo.APPS_ALL[application_id].guid
            self.valid_appversions.setdefault(app_guid, []).append(app_version)

        # Old versions of Firefox don't provide the update type.
        # All the following are "empty-like" values. `'\\N'` is a literal
        # backslash followed by `N`; the escaped spelling is also valid on
        # Python 3, where a bare '\N' is a malformed escape sequence.
        empty_update_types = (
            '0', 'NULL', 'None', '', '\\N', '%UPDATE_TYPE%')

        index = -1
        for group, filepath in group_filepaths:
            # The 'app' files carry one more column (application id and
            # version instead of a single data column).
            expected_columns = 6 if group == 'app' else 5
            with codecs.open(filepath, encoding='utf8') as results_file:
                for line in results_file:
                    index += 1
                    if index and (index % 1000000) == 0:
                        log.info('Processed %s lines', index)

                    splitted = line[:-1].split(sep)

                    if len(splitted) != expected_columns:
                        # Logging arguments are passed lazily so the message
                        # is only built when debug logging is enabled.
                        log.debug('Badly formatted row: %s', line)
                        continue

                    # NOTE(review): this rebinds `day` with the value found
                    # in the row, so the UpdateCount created below uses the
                    # row's day - confirm that this is intended.
                    if group == 'app':
                        (day, addon_guid, app_id, app_ver, count,
                         update_type) = splitted
                    else:
                        day, addon_guid, data, count, update_type = splitted

                    addon_guid = addon_guid.strip()
                    # str.strip() returns a new string: the result has to be
                    # kept, otherwise padded "empty-like" values are never
                    # recognized below.
                    update_type = update_type.strip()

                    if update_type in empty_update_types:
                        update_type = None

                    try:
                        count = int(count)
                        if update_type:
                            update_type = int(update_type)
                    except ValueError:  # Badly formatted? Drop.
                        continue

                    # The following is magic that I don't understand. I've just
                    # been told that this is the way we can make sure a request
                    # is valid:
                    # > the lower bits for updateType (eg 112) should add to
                    # > 16, if not, ignore the request.
                    # > updateType & 31 == 16 == valid request.
                    if update_type and update_type & 31 != 16:
                        log.debug("Update type doesn't add to 16: %s",
                                  update_type)
                        continue

                    # Does this addon exist?
                    if not addon_guid or addon_guid not in guids_to_addon:
                        log.debug(u"Addon %s doesn't exist.", addon_guid)
                        continue
                    addon_id = guids_to_addon[addon_guid]

                    # Memoize the UpdateCount.
                    if addon_guid in update_counts:
                        uc = update_counts[addon_guid]
                    else:
                        uc = UpdateCount(date=day, addon_id=addon_id, count=0)
                        update_counts[addon_guid] = uc

                    # We can now fill the UpdateCount object.
                    if group == 'version':
                        self.update_version(uc, data, count)
                        # Use this count to compute the global number of daily
                        # users for this addon.
                        uc.count += count
                    elif group == 'status':
                        self.update_status(uc, data, count)
                    elif group == 'app':
                        self.update_app(uc, app_id, app_ver, count)
                    elif group == 'os':
                        self.update_os(uc, data, count)
                    elif group == 'locale':
                        self.update_locale(uc, data, count)

        # Make sure the locales and versions fields aren't too big to fit in
        # the database. Those two fields are the only ones that are not fully
        # validated, so we could end up with just anything in there (spam,
        # buffer overflow attempts and the like).
        # We don't care that they will increase the numbers, but we do not want
        # those to break the process because of a "Data too long for column
        # 'version'" error.
        # The database field (TEXT), can hold up to 2^16 = 64k characters.
        # If the field is longer than that, we drop the least used items
        # (with the lower count) until the field fits.
        for update_count in update_counts.values():
            self.trim_field(update_count.locales)
            self.trim_field(update_count.versions)

        # Create in bulk: this is much faster.
        UpdateCount.objects.bulk_create(update_counts.values(), 100)
        log.info('Processed a total of %s lines', index + 1)
        log.debug('Total processing time: %s', datetime.now() - start)

        # Clean up files.
        for _, filepath in group_filepaths:
            log.debug('Deleting {path}'.format(path=filepath))
            unlink(filepath)
    def handle(self, *args, **options):
        """Load one day of hive update counts into UpdateCount rows.

        Reads the ``update_counts_by_<group>.hive`` files (one per group:
        version, status, app, os, locale) from
        ``<settings.TMP_PATH>/<folder>/<day>``, aggregates their rows per
        add-on guid, bulk-creates the UpdateCount objects and finally
        deletes the processed files. Existing UpdateCount rows for that day
        are deleted first.

        Args:
            *args: optional; ``args[0]`` is the name of the results folder
                (defaults to ``'hive_results'``).
            **options: must contain ``date`` (compared against a
                ``%Y-%m-%d`` string; a falsy value means yesterday) and
                ``separator`` (the column separator of the hive files).

        Raises:
            CommandError: if one of the files holds data for another day.
        """
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        folder = args[0] if args else 'hive_results'
        folder = path.join(settings.TMP_PATH, folder, day)
        sep = options['separator']
        groups = ('version', 'status', 'app', 'os', 'locale')
        group_filepaths = []
        # Make sure we're not trying to update with mismatched data.
        for group in groups:
            filepath = path.join(folder, 'update_counts_by_%s.hive' % group)
            if get_date_from_file(filepath, sep) != day:
                raise CommandError('%s file contains data for another day' %
                                   filepath)
            group_filepaths.append((group, filepath))
        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        UpdateCount.objects.filter(date=day).delete()

        # Memoize the addons and the UpdateCounts.
        update_counts = {}
        # Perf: preload all the addons once and for all.
        # This builds a dict where each key (the addon guid we get from the
        # hive query) has the addon_id as value.
        guids_to_addon = (dict(Addon.objects.exclude(guid__isnull=True)
                                            .exclude(type=amo.ADDON_PERSONA)
                                            .values_list('guid', 'id')))

        # Old versions of Firefox don't provide the update type.
        # All the following are "empty-like" values. `'\\N'` is a literal
        # backslash followed by `N`; the escaped spelling is also valid on
        # Python 3, where a bare '\N' is a malformed escape sequence.
        empty_update_types = (
            '0', 'NULL', 'None', '', '\\N', '%UPDATE_TYPE%')

        index = -1
        for group, filepath in group_filepaths:
            # The 'app' files carry one more column (application id and
            # version instead of a single data column).
            expected_columns = 6 if group == 'app' else 5
            with codecs.open(filepath, encoding='utf8') as results_file:
                for line in results_file:
                    index += 1
                    if index and (index % 1000000) == 0:
                        log.info('Processed %s lines', index)

                    splitted = line[:-1].split(sep)

                    if len(splitted) != expected_columns:
                        # Logging arguments are passed lazily so the message
                        # is only built when debug logging is enabled.
                        log.debug('Badly formatted row: %s', line)
                        continue

                    # NOTE(review): this rebinds `day` with the value found
                    # in the row, so the UpdateCount created below uses the
                    # row's day - confirm that this is intended.
                    if group == 'app':
                        (day, addon_guid, app_id, app_ver, count,
                         update_type) = splitted
                    else:
                        day, addon_guid, data, count, update_type = splitted

                    addon_guid = addon_guid.strip()
                    # str.strip() returns a new string: the result has to be
                    # kept, otherwise padded "empty-like" values are never
                    # recognized below.
                    update_type = update_type.strip()

                    if update_type in empty_update_types:
                        update_type = None

                    try:
                        count = int(count)
                        if update_type:
                            update_type = int(update_type)
                    except ValueError:  # Badly formatted? Drop.
                        continue

                    # The following is magic that I don't understand. I've just
                    # been told that this is the way we can make sure a request
                    # is valid:
                    # > the lower bits for updateType (eg 112) should add to
                    # > 16, if not, ignore the request.
                    # > updateType & 31 == 16 == valid request.
                    if update_type and update_type & 31 != 16:
                        log.debug("Update type doesn't add to 16: %s",
                                  update_type)
                        continue

                    # Does this addon exist?
                    if not addon_guid or addon_guid not in guids_to_addon:
                        # The guid was decoded from utf8: a unicode message
                        # avoids an implicit ASCII encoding of non-ASCII
                        # guids on Python 2.
                        log.debug(u"Addon %s doesn't exist.", addon_guid)
                        continue
                    addon_id = guids_to_addon[addon_guid]

                    # Memoize the UpdateCount.
                    if addon_guid in update_counts:
                        uc = update_counts[addon_guid]
                    else:
                        uc = UpdateCount(date=day, addon_id=addon_id, count=0)
                        update_counts[addon_guid] = uc

                    # We can now fill the UpdateCount object.
                    if group == 'version':
                        # Take this count as the global number of daily users.
                        uc.count += count
                        uc.versions = update_inc(uc.versions, data, count)
                    elif group == 'status':
                        uc.statuses = update_inc(uc.statuses, data, count)
                    elif group == 'app':
                        # Applications is a dict of dicts, eg:
                        # {"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}":
                        #       {"10.0": 2, "21.0": 1, ....},
                        #  "some other application guid": ...
                        # }
                        if uc.applications is None:
                            uc.applications = {}
                        app = uc.applications.get(app_id, {})
                        # Now overwrite this application's dict with
                        # incremented counts for its versions.
                        uc.applications.update(
                            {app_id: update_inc(app, app_ver, count)})
                    elif group == 'os':
                        uc.oses = update_inc(uc.oses, data, count)
                    elif group == 'locale':
                        # Drop incorrect locales sizes.
                        if len(data) > 10:
                            continue
                        # Collapse locales to `xx_yy` if possible.
                        data = data.strip().lower().replace('-', '_')
                        uc.locales = update_inc(uc.locales, data, count)

        # Create in bulk: this is much faster.
        UpdateCount.objects.bulk_create(update_counts.values(), 100)
        log.info('Processed a total of %s lines', index + 1)
        log.debug('Total processing time: %s', datetime.now() - start)

        # Clean up files.
        for _, filepath in group_filepaths:
            log.debug('Deleting {path}'.format(path=filepath))
            unlink(filepath)
Example #13
0
 def test_update_os(self):
     """update_os must ignore an unknown OS and count a known one."""
     update_count = UpdateCount(addon_id=3615)

     # Non-existent OS: nothing must be recorded.
     self.command.update_os(update_count, 'foobar', 123)
     assert not update_count.oses

     self.command.update_os(update_count, 'WINNT', 123)
     expected_oses = {'WINNT': 123}
     assert update_count.oses == expected_oses
Example #14
0
 def test_update_status(self):
     """update_status must ignore an unknown status and count a known one."""
     update_count = UpdateCount(addon_id=3615)

     # Non-existent status: nothing must be recorded.
     self.command.update_status(update_count, 'foobar', 123)
     assert not update_count.statuses

     self.command.update_status(update_count, 'userEnabled', 123)
     expected_statuses = {'userEnabled': 123}
     assert update_count.statuses == expected_statuses