Example #1
def update_addon_average_daily_users(chunk_size=250):
    """Update add-ons ADU totals."""
    if not waffle.switch_is_active('local-statistics-processing'):
        return False

    counts = dict(
        # In order to reset the `average_daily_users` values of add-ons that
        # don't exist in BigQuery, we prepare `(guid, 0)` pairs for most
        # add-ons.
        Addon.objects.filter(type__in=amo.ADDON_TYPES_WITH_STATS)
        .exclude(guid__isnull=True)
        .exclude(guid__exact='')
        .exclude(average_daily_users=0)
        .annotate(count=Value(0, IntegerField()))
        .values_list('guid', 'count')
        # Just to make order predictable in tests, we order by id. This
        # matches the GROUP BY being generated so it should be safe.
        .order_by('id')
    )
    # Update the `counts` with values from BigQuery.
    counts.update(get_addons_and_average_daily_users_from_bigquery())
    counts = list(counts.items())

    log.info('Preparing update of `average_daily_users` for %s add-ons.',
             len(counts))

    create_chunked_tasks_signatures(_update_addon_average_daily_users, counts,
                                    chunk_size).apply_async()
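Every example on this page feeds work through `create_chunked_tasks_signatures`. Judging only from these call sites (and from Example #5, which notes that the helper returns a group exposing `.tasks`), a minimal sketch might look like the following; the real olympia helper may differ in its details:

from celery import group


def create_chunked_tasks_signatures(task, items, chunk_size,
                                    task_args=None, task_kwargs=None):
    # Sketch only: slice `items` into chunks of `chunk_size` and wrap each
    # chunk in an immutable signature of `task`, collected into one group.
    task_args = task_args or ()
    task_kwargs = task_kwargs or {}
    chunks = [items[i:i + chunk_size]
              for i in range(0, len(items), chunk_size)]
    return group(
        task.si(chunk, *task_args, **task_kwargs) for chunk in chunks
    )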
Example #2
def gather_index_stats_tasks(index, addons=None, dates=None):
    """
    Return the list of task groups to execute to index statistics for the given
    index/dates/addons.
    """
    queries = [
        (UpdateCount.objects, index_update_counts,
            {'date': 'date'}),
        (DownloadCount.objects, index_download_counts,
            {'date': 'date'}),
    ]

    jobs = []

    for qs, task, fields in queries:
        date_field = fields['date']

        if dates or addons:
            qs = qs.order_by('-%s' % date_field)

        qs = qs.values_list('id', flat=True)

        if addons:
            pks = [int(a.strip()) for a in addons.split(',')]
            qs = qs.filter(addon__in=pks)

        if dates:
            if ':' in dates:
                qs = qs.filter(**{'%s__range' % date_field:
                                  dates.split(':')})
            else:
                qs = qs.filter(**{date_field: dates})

        if not (dates or addons):
            # We're loading the whole world. Do it in stages so we get most
            # recent stats first and don't do huge queries.
            limits = (qs.model.objects.filter(**{'%s__isnull' %
                                                 date_field: False})
                      .extra(where=['%s <> "0000-00-00"' % date_field])
                      .aggregate(min=Min(date_field), max=Max(date_field)))
            # If there isn't any data at all, skip over.
            if not (limits['max'] or limits['min']):
                continue

            num_days = (limits['max'] - limits['min']).days
            for start in range(0, num_days, STEP):
                stop = start + STEP - 1
                date_range = (limits['max'] - timedelta(days=stop),
                              limits['max'] - timedelta(days=start))
                data = list(qs.filter(**{
                    '%s__range' % date_field: date_range
                }))
                if data:
                    jobs.append(create_chunked_tasks_signatures(
                        task, data, CHUNK_SIZE, task_args=(index,)))
        else:
            jobs.append(create_chunked_tasks_signatures(
                task, list(qs), CHUNK_SIZE, task_args=(index,)))
    return jobs
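The "whole world" branch above walks backwards from the most recent date in STEP-day slices, so the newest statistics get indexed first. A self-contained illustration of the windowing (the STEP value and dates are made up for this sketch):

from datetime import date, timedelta

STEP = 30  # assumed value for illustration; the real constant lives elsewhere
max_date, min_date = date(2020, 3, 1), date(2020, 1, 1)
num_days = (max_date - min_date).days  # 60
for start in range(0, num_days, STEP):
    stop = start + STEP - 1
    window = (max_date - timedelta(days=stop),
              max_date - timedelta(days=start))
    print(window)
# (2020-02-01, 2020-03-01), then (2020-01-02, 2020-01-31): newest window first.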
Example #3
def run_yara_query_rule(query_rule_pk):
    """
    Run a specific ScannerQueryRule on multiple Versions.
    """
    # We're not forcing this task to happen on primary db to let the replicas
    # handle the Version query below, but we want to fetch the rule using the
    # primary db in all cases.
    rule = ScannerQueryRule.objects.using('default').get(pk=query_rule_pk)
    # Build a huge list of all pks we're going to run the tasks on.
    pks = (
        Version.unfiltered
        .filter(
            addon__type=amo.ADDON_EXTENSION,
            files__is_webextension=True,
        )
        .exclude(addon__status=amo.STATUS_DISABLED)
        .filter(
            Q(channel=amo.RELEASE_CHANNEL_UNLISTED)
            | Q(channel=amo.RELEASE_CHANNEL_LISTED,
                pk=F('addon___current_version'))
        )
        .values_list('id', flat=True)
        .order_by('pk')
    )
    rule.update(state=RUNNING)
    # Build the workflow using a group of tasks dealing with 250 files at a
    # time, chained to a task that marks the query as completed.
    chunk_size = 250
    workflow = (
        create_chunked_tasks_signatures(
            run_yara_query_rule_on_versions_chunk,
            list(pks),
            chunk_size,
            task_args=(query_rule_pk,),
        )
        | mark_yara_query_rule_as_completed.si(query_rule_pk)
    )
    # Fire it up.
    workflow.apply_async()
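The `group | task.si(...)` chaining above is the standard celery pattern for running a callback once every task in the group has finished. In miniature (hypothetical tasks; `app` is an assumed, already-configured Celery instance):

from celery import group

@app.task  # `app` is assumed to be a configured Celery app
def scan_chunk(version_ids, rule_pk):
    ...

@app.task
def mark_completed(rule_pk):
    ...

chunks = [[1, 2], [3, 4]]
workflow = group(
    scan_chunk.si(chunk, 42) for chunk in chunks
) | mark_completed.si(42)
workflow.apply_async()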
Example #4
def handle(self, *args, **options):
    ids = AddonGUID.objects.filter(hashed_guid=None).values_list('id',
                                                                 flat=True)
    chunked_tasks = create_chunked_tasks_signatures(backfill_hashed_guids,
                                                    items=list(ids),
                                                    chunk_size=100)
    chunked_tasks.apply_async()
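This `handle` is presumably the body of a Django management command; a hypothetical skeleton around it would look like this (only the `handle()` body above comes from the source):

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    # Hypothetical wrapper and help text for the handle() shown above.
    help = 'Backfill hashed GUIDs for AddonGUID rows in chunks.'

    def handle(self, *args, **options):
        ...  # body as in the example above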
Example #5
def get_indexing_tasks_for_qs(qs):
    index_data_tasks = create_chunked_tasks_signatures(
        cls.get_indexing_task(), qs, cls.CHUNK_SIZE,
        task_args=(index_name,))
    # Unwrap the tasks from the group create_chunked_tasks_signatures()
    # returned; we'll create our own flat group with all the tasks,
    # with no need for unnecessary nesting.
    return index_data_tasks.tasks
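Unwrapping `.tasks` only makes sense because the caller rebuilds one flat group itself. A hypothetical call site, assuming the caller holds a list of querysets to index:

from celery import group

# Hypothetical surrounding code: collect the per-queryset tasks into a
# single flat group instead of a group of groups.
all_tasks = []
for qs in querysets:  # `querysets` is assumed to be defined by the caller
    all_tasks.extend(get_indexing_tasks_for_qs(qs))
group(all_tasks).apply_async()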
Example #6
def reindex_tasks_group(index_name):
    """
    Return the group of tasks to execute for a full reindex of addons on the
    index called `index_name` (which is not an alias but the real index name).
    """
    from olympia.addons.models import Addon
    from olympia.addons.tasks import index_addons

    ids = Addon.unfiltered.values_list('id', flat=True).order_by('id')
    chunk_size = 150
    return create_chunked_tasks_signatures(index_addons, list(ids), chunk_size)
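Since the helper returns a celery group, the caller can fire it directly; a hypothetical call site (the index name is illustrative):

reindex_tasks_group('addons-20200101').apply_async()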
Example #8
def run_yara_query_rule(query_rule_pk):
    """
    Run a specific ScannerQueryRule on multiple Versions.

    Needs the rule to be in the SCHEDULED state, otherwise does nothing.
    """
    # We're not forcing this task to happen on primary db to let the replicas
    # handle the Version query below, but we want to fetch the rule using the
    # primary db in all cases.
    rule = ScannerQueryRule.objects.using('default').get(pk=query_rule_pk)
    try:
        rule.change_state_to(RUNNING)
    except ImproperScannerQueryRuleStateError:
        log.error(
            'Not proceeding with run_yara_query_rule on rule %s because '
            'its state is %s',
            rule.pk,
            rule.get_state_display(),
        )
        return
    log.info('Fetching versions for run_yara_query_rule on rule %s', rule.pk)
    # Build a huge list of all pks we're going to run the tasks on.
    qs = Version.unfiltered.filter(
        addon__type=amo.ADDON_EXTENSION,
        files__is_webextension=True,
    )
    if not rule.run_on_disabled_addons:
        qs = qs.exclude(addon__status=amo.STATUS_DISABLED)
    qs = qs.values_list('id', flat=True).order_by('pk')
    # Build the workflow using a group of tasks dealing with 250 files at a
    # time, chained to a task that marks the query as completed.
    chunk_size = 250
    chunked_tasks = create_chunked_tasks_signatures(
        run_yara_query_rule_on_versions_chunk,
        list(qs),
        chunk_size,
        task_args=(query_rule_pk, ),
    )
    # Force the group id to be generated for those tasks, and store it in the
    # result backend.
    group_result = chunked_tasks.freeze()
    group_result.save()
    rule.update(task_count=len(chunked_tasks),
                celery_group_result_id=uuid.UUID(group_result.id))
    workflow = chunked_tasks | mark_yara_query_rule_as_completed_or_aborted.si(
        query_rule_pk)
    log.info(
        'Running workflow of %s tasks for run_yara_query_rule on rule %s',
        len(chunked_tasks),
        rule.pk,
    )
    # Fire it up.
    workflow.apply_async()
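Because `freeze()` generated a group id and `save()` stored it in the result backend, the run's progress can later be inspected. A hypothetical probe, assuming a configured celery result backend and the `rule` object from above:

from celery.result import GroupResult

# Hypothetical progress check using the group id stored on the rule.
result = GroupResult.restore(str(rule.celery_group_result_id))
if result is not None:
    done = result.completed_count()
    total = len(result.results)
    log.info('Query rule progress: %s/%s chunks done', done, total)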
Example #9
def handle(self, *args, **options):
    log = olympia.core.logger.getLogger('z.files')
    files = File.objects.filter(
        is_webextension=True,
        version__addon__type=amo.ADDON_EXTENSION).order_by('pk')
    pks = files.values_list('pk', flat=True)

    log.info('Using %s file pks to extract permissions' % pks.count())
    if pks:
        chunked_tasks = create_chunked_tasks_signatures(
            extract_optional_permissions, list(pks), chunk_size=100)
        chunked_tasks.apply_async()
Example #10
def deliver_hotness(chunk_size=300):
    """
    Calculate hotness of all add-ons.

    a = avg(users this week)
    b = avg(users three weeks before this week)
    threshold = 250 if addon type is theme, else 1000
    hotness = (a-b) / b if a > threshold and b > 1 else 0
    """
    frozen_guids = list(set(fa.addon.guid for fa in FrozenAddon.objects.all()))
    averages = get_averages_by_addon_from_bigquery(today=date.today(),
                                                   exclude=frozen_guids)
    create_chunked_tasks_signatures(update_addon_hotness, averages.items(),
                                    chunk_size).apply_async()

    # Reset add-ons that won't be returned by BigQuery.
    addon_ids = (
        Addon.objects.filter(status__in=amo.REVIEWED_STATUSES)
        .filter(hotness__gt=0)
        .exclude(guid__in=averages.keys())
        .values_list('id', flat=True)
    )
    create_chunked_tasks_signatures(reset_addon_hotness, addon_ids,
                                    chunk_size).apply_async()
Example #11
def update_addon_hotness(chunk_size=300):
    """
    Calculate hotness of all add-ons.

    a = avg(users this week)
    b = avg(users three weeks before this week)
    threshold = 250 if addon type is theme, else 1000
    hotness = (a-b) / b if a > threshold and b > 1 else 0
    """
    frozen_guids = list(
        {fa.addon.guid for fa in FrozenAddon.objects.all() if fa.addon.guid}
    )
    log.info('Found %s frozen add-on GUIDs.', len(frozen_guids))

    amo_guids = (
        Addon.objects.exclude(guid__in=frozen_guids)
        .exclude(guid__isnull=True)
        .exclude(guid__exact='')
        .exclude(hotness=0)
        .values_list('guid', flat=True)
    )
    averages = {
        guid: {'avg_this_week': 1, 'avg_three_weeks_before': 1} for guid in amo_guids
    }
    log.info('Found %s add-on GUIDs in AMO DB.', len(averages))

    bq_averages = get_averages_by_addon_from_bigquery(
        today=date.today(), exclude=frozen_guids
    )
    log.info('Found %s add-on GUIDs with averages in BigQuery.', len(bq_averages))

    averages.update(bq_averages)
    log.info('Preparing update of `hotness` for %s add-ons.', len(averages))

    create_chunked_tasks_signatures(
        _update_addon_hotness, averages.items(), chunk_size
    ).apply_async()
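To make the docstring formula concrete, a quick worked example for a non-theme add-on:

# Worked example of the hotness formula from the docstring above.
a = 1500.0   # avg users this week
b = 1000.0   # avg users three weeks before this week
threshold = 1000  # non-theme add-on
hotness = (a - b) / b if a > threshold and b > 1 else 0
assert hotness == 0.5  # a 50% week-over-week increase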
Example #12
def update_addon_weekly_downloads(chunk_size=250):
    """
    Update 7-day add-on download counts.
    """
    counts = dict(
        # In order to reset the `weekly_downloads` values of add-ons that
        # don't exist in BigQuery, we prepare `(hashed_guid, 0)` pairs
        # for most add-ons.
        Addon.objects.filter(type__in=amo.ADDON_TYPES_WITH_STATS)
        .exclude(guid__isnull=True)
        .exclude(guid__exact='')
        .exclude(weekly_downloads=0)
        .annotate(count=Value(0, IntegerField()))
        .values_list('addonguid__hashed_guid', 'count')
    )
    # Update the `counts` with values from BigQuery.
    counts.update(get_addons_and_weekly_downloads_from_bigquery())
    counts = list(counts.items())

    log.info('Preparing update of `weekly_downloads` for %s add-ons.',
             len(counts))

    create_chunked_tasks_signatures(_update_addon_weekly_downloads, counts,
                                    chunk_size).apply_async()
Example #13
def update_addon_weekly_downloads(chunk_size=250):
    """
    Update 7-day add-on download counts.
    """
    if waffle.switch_is_active('use-bigquery-for-download-stats-cron'):
        counts = dict(
            # In order to reset the `weekly_downloads` values of add-ons that
            # don't exist in BigQuery, we prepare `(guid, 0)` pairs for most
            # add-ons.
            Addon.objects.filter(type__in=amo.ADDON_TYPES_WITH_STATS)
            .exclude(guid__isnull=True)
            .exclude(guid__exact='')
            .exclude(weekly_downloads=0)
            .annotate(count=Value(0, IntegerField()))
            .values_list('guid', 'count')
        )
        # Update the `counts` with values from BigQuery.
        counts.update(get_addons_and_weekly_downloads_from_bigquery())
        counts = list(counts.items())

        log.info('Preparing update of `weekly_downloads` for %s add-ons.',
                 len(counts))

        create_chunked_tasks_signatures(_update_addon_weekly_downloads, counts,
                                        chunk_size).apply_async()
    else:
        raise_if_reindex_in_progress('amo')

        with connection.cursor() as cursor:
            cursor.execute("""
                SELECT addon_id, SUM(count) AS weekly_count
                FROM download_counts
                WHERE `date` >= DATE_SUB(CURDATE(), INTERVAL 7 DAY)
                GROUP BY addon_id
                ORDER BY addon_id""")
            counts = cursor.fetchall()

        addon_ids = [r[0] for r in counts]

        if not addon_ids:
            return

        with connection.cursor() as cursor:
            cursor.execute(
                """
                SELECT id, 0
                FROM addons
                WHERE id NOT IN %s""", (addon_ids, ))
            counts += cursor.fetchall()

            cursor.execute("""
                CREATE TEMPORARY TABLE tmp_wd
                (addon_id INT PRIMARY KEY, count INT)""")
            cursor.execute(
                'INSERT INTO tmp_wd VALUES %s' %
                ','.join(['(%s,%s)'] * len(counts)),
                list(itertools.chain(*counts)))

            cursor.execute("""
                UPDATE addons INNER JOIN tmp_wd
                    ON addons.id = tmp_wd.addon_id
                SET weeklydownloads = tmp_wd.count""")
            cursor.execute("DROP TABLE IF EXISTS tmp_wd")
Example #14
def gather_index_stats_tasks(index, addons=None, dates=None):
    """
    Return the list of task groups to execute to index statistics for the given
    index/dates/addons.
    """
    queries = [
        (UpdateCount.objects, index_update_counts,
            {'date': 'date'}),
        (DownloadCount.objects, index_download_counts,
            {'date': 'date'}),
        (ThemeUserCount.objects, index_theme_user_counts,
            {'date': 'date'})
    ]

    jobs = []

    for qs, task, fields in queries:
        date_field = fields['date']

        if dates or addons:
            qs = qs.order_by('-%s' % date_field)

        qs = qs.values_list('id', flat=True)

        if addons:
            pks = [int(a.strip()) for a in addons.split(',')]
            qs = qs.filter(addon__in=pks)

        if dates:
            if ':' in dates:
                qs = qs.filter(**{'%s__range' % date_field:
                                  dates.split(':')})
            else:
                qs = qs.filter(**{date_field: dates})

        if not (dates or addons):
            # We're loading the whole world. Do it in stages so we get most
            # recent stats first and don't do huge queries.
            limits = (qs.model.objects.filter(**{'%s__isnull' %
                                                 date_field: False})
                      .extra(where=['%s <> "0000-00-00"' % date_field])
                      .aggregate(min=Min(date_field), max=Max(date_field)))
            # If there isn't any data at all, skip over.
            if not (limits['max'] or limits['min']):
                continue

            num_days = (limits['max'] - limits['min']).days
            for start in range(0, num_days, STEP):
                stop = start + STEP - 1
                date_range = (limits['max'] - timedelta(days=stop),
                              limits['max'] - timedelta(days=start))
                data = list(qs.filter(**{
                    '%s__range' % date_field: date_range
                }))
                if data:
                    jobs.append(create_chunked_tasks_signatures(
                        task, data, CHUNK_SIZE, task_args=(index,)))
        else:
            jobs.append(create_chunked_tasks_signatures(
                task, list(qs), CHUNK_SIZE, task_args=(index,)))
    return jobs