Example #1
0
class RevertRate(Metric):
    """
    Revert-rate metric.  An instance of the class is callable and, for each
    user in a passed-in list, reports the number of that user's edits which
    were later reverted.

    This sql query was used as a starting point for the sqlalchemy query:

     select r.rev_user, r.count(*)
       from revision r
      where r.rev_timestamp between [start] and [end]
        and r.rev_user in ([cohort's user list or maybe a join to a temp])
        and exists (
             select *
               from revision r1
                        inner join
                    revision r2     on r2.rev_sha1 = r1.rev_sha1
                                    and r1.rev_sha1 <> r.rev_sha1
              where r1.rev_page = r.rev_page
                and r2.rev_page = r.rev_page
                and r1.rev_timestamp between [start - 2 weeks] and r.rev_timestamp
                and r2.rev_timestamp between r.rev_timestamp and [end + 2 weeks]
            )
      group by rev_user
    """

    show_in_ui = False
    id = 'revert-rate'
    label = 'Revert Rate'
    description = 'Compute the number of reverted edits in a mediawiki project'
    default_result = {}

    start_date = BetterDateTimeField(default=thirty_days_ago)
    end_date = BetterDateTimeField(default=today)
    namespaces = CommaSeparatedIntegerListField(
        None,
        [Required()],
        default='0',
        description='0, 2, 4, etc.',
    )

    def __call__(self, user_ids, session):
        """
        Parameters:
            user_ids    : list of mediawiki user ids to find edit reverts for
            session     : sqlalchemy session open on a mediawiki database

        Returns:
            dictionary from user ids to the number of edit reverts found.
        """
        # NOTE(review): the query above is not implemented yet; every
        # requested user currently maps to None.
        return dict.fromkeys(user_ids)
Example #2
0
class ProgramMetricsForm(CohortUpload):
    """
    Defines the fields necessary to upload inputs to calculate
    the ProgramMetrics
    """
    # Override cohort name and default project
    # The user for the ProgramMetrics API doesn't have to define these
    # FIX: the default must be a callable.  The previous code concatenated
    # str(uuid.uuid1()) at class-definition (import) time, so every form
    # instance shared the same "unique" cohort name.  WTForms evaluates a
    # callable default once per form instantiation, giving each upload a
    # fresh uuid.
    name = HiddenField(default=lambda: 'ProgramGlobalMetricsCohort_' +
                       str(uuid.uuid1()))
    project = HiddenField(default='enwiki')
    validate_as_user_ids = HiddenField(default='False')
    start_date = BetterDateTimeField(default=thirty_days_ago,
                                     validators=[NotGreater('end_date')])
    end_date = BetterDateTimeField(default=today)
Example #3
0
class PagesEdited(Metric):
    """
    This class counts the pages edited by editors over a period of time.
    This sql query was used as a starting point for the sqlalchemy query:

    SELECT user_id, COUNT(*)
    FROM (
        SELECT DISTINCT user_id, page_id
        FROM (
            SELECT r.rev_user AS user_id, r.rev_page AS page_id
            FROM revision r
                INNER JOIN page p ON p.page_id = r.rev_page
            WHERE r.rev_timestamp BETWEEN [start] AND [end]
            AND r.rev_user in ([parameterized])
            AND p.page_namespace in ([parameterized])

            UNION ALL

            SELECT a.ar_user AS user_id, a.ar_page_id AS page_id
            FROM archive a
            WHERE a.ar_timestamp BETWEEN [start] AND [end]
            AND a.ar_user in ([parameterized])
            AND a.ar_namespace in ([parameterized])
        ) AS subq1
    ) AS subq2 GROUP BY user_id;
    """

    # NOTE: this is not enabled in the UI yet, but could be easily
    # The deduplicate parameter's a bit confusing, maybe consider
    # re-wording that when making this metric available
    show_in_ui  = False
    id          = 'pages_edited'
    label       = 'Pages Edited'
    category    = 'Content'
    # FIX: the two implicitly-concatenated fragments were missing a separating
    # space and rendered as "...by thecohort's users..."
    description = (
        'Compute the number of unique pages edited by the '
        'cohort\'s users within a time interval'
    )
    default_result = {'pages_edited': 0}

    start_date  = BetterDateTimeField(default=thirty_days_ago)
    end_date    = BetterDateTimeField(default=today)

    include_deleted = BetterBooleanField(
        default=True,
        description='Count pages that have been deleted',
    )
    namespaces = CommaSeparatedIntegerListField(
        None,
        description='0, 2, 4, etc. (leave blank for *all*)',
    )
    deduplicate_across_users = BetterBooleanField(
        default=False,
        description='Count unique pages edited by the entire cohort,'
                    ' rolled up to one number.',
    )

    def __call__(self, user_ids, session):
        """
        Parameters:
            user_ids    : list of mediawiki user ids to find pages for
            session     : sqlalchemy session open on a mediawiki database

        Returns:
            dictionary from user ids to the number of pages edited found
        """
        start_date = self.start_date.data
        end_date = self.end_date.data
        deduplicate = self.deduplicate_across_users.data

        # edits to pages that still exist
        revisions = (
            session.query(
                label('user_id', Revision.rev_user),
                label('page_id', Revision.rev_page),
                label('timestamp', Revision.rev_timestamp)
            )
            .filter(Revision.rev_timestamp > start_date)
            .filter(Revision.rev_timestamp <= end_date))

        # edits to pages that have since been deleted
        archives = (
            session.query(
                label('user_id', Archive.ar_user),
                label('page_id', Archive.ar_page_id),
                label('timestamp', Archive.ar_timestamp)
            )
            .filter(Archive.ar_timestamp > start_date)
            .filter(Archive.ar_timestamp <= end_date))

        # an empty namespaces list means "all namespaces"
        if self.namespaces.data and len(self.namespaces.data) > 0:
            revisions = (
                revisions
                .join(Page)
                .filter(Page.page_namespace.in_(self.namespaces.data))
            )
            archives = (
                archives
                .filter(Archive.ar_namespace.in_(self.namespaces.data))
            )

        # restrict both queries to the cohort's users, if any were passed in
        revisions = self.filter(revisions, user_ids, column=Revision.rev_user)
        archives = self.filter(archives, user_ids, column=Archive.ar_user)

        both = revisions
        if self.include_deleted.data:
            both = both.union_all(archives)
        both = both.subquery()

        if deduplicate:
            # Use a constant user id here to deduplicate only by page
            # A single result will be returned and assigned to user_id = ROLLUP_USER_ID
            both_grouped = (
                session.query(
                    label('user_id', literal_column(str(ROLLUP_USER_ID))), both.c.page_id
                )
                .distinct().subquery()
            )
        else:
            # Select distinct user_id-page_id pairs
            # to count edits by the same user on the same page as one
            both_grouped = (
                session.query(both.c.user_id, both.c.page_id)
                .distinct().subquery()
            )

        query = (
            session.query(both_grouped.c.user_id, func.count())
            .group_by(both_grouped.c.user_id)
        )

        # Format the output; users absent from the results get default_result
        metric_results = {r[0]: {PagesEdited.id : r[1]} for r in query.all()}
        if user_ids is None:
            return metric_results
        elif deduplicate:
            ret = {}
            ret[ROLLUP_USER_ID] = metric_results.get(
                ROLLUP_USER_ID, self.default_result
            )
            return ret
        else:
            return {
                uid: metric_results.get(uid, self.default_result)
                for uid in user_ids
            }
Example #4
0
class NewlyRegistered(Metric):
    """
    A newly registered user is a previously unregistered user creating a username
    for the first time on a Wikimedia project.

    The SQL query that inspired this metric was:

 SELECT log_user AS user_id
   FROM enwiki.logging
        /* exclude proxy registrations */
  WHERE log_type = 'newusers'
        /* only include self-created users, exclude attached and proxy-registered users */
    AND log_action = 'create'
    AND log_timestamp BETWEEN @start_date AND @end_date;
    """

    show_in_ui = True
    id = 'newly_registered'
    label = 'Newly Registered'
    category = 'New users'
    # FIX: the old literal used a backslash line continuation inside the
    # string, which embedded the next line's leading whitespace into the
    # user-visible description; implicit concatenation keeps it to one space
    description = (
        'A newly registered user is a previously unregistered user creating a '
        'username for the first time on a Wikimedia project.')
    default_result = {
        'newly_registered': 0,
    }

    start_date = BetterDateTimeField(default=thirty_days_ago)
    end_date = BetterDateTimeField(default=today)

    def __call__(self, user_ids, session):
        """
        Parameters:
            user_ids    : list of mediawiki user ids to investigate
            session     : sqlalchemy session open on a mediawiki database

        Returns:
            dictionary from user ids to a dictionary of the form:
            {
                'newly_registered': 1 for True, 0 for False
            }
            If no user_ids are specified, only {'newly_registered': 1} will be returned
            for those user ids which satisfy the metric
        """

        start_date = self.start_date.data
        end_date = self.end_date.data

        # 'create' log entries only: per the SQL in the class docstring this
        # keeps self-created accounts and excludes attached / proxy
        # registrations
        query = session.query(Logging.log_user) \
            .filter(Logging.log_type == 'newusers') \
            .filter(Logging.log_action == 'create') \
            .filter(Logging.log_timestamp > start_date)\
            .filter(Logging.log_timestamp <= end_date)

        # restrict to the cohort's users, if any were passed in
        metric = self.filter(query, user_ids, column=Logging.log_user)
        data = metric.all()

        metric_results = {r[0]: {NewlyRegistered.id: 1} for r in data}

        if user_ids is None:
            return metric_results
        else:
            return {
                uid: metric_results.get(uid, self.default_result)
                for uid in user_ids
            }
Example #5
0
class TimeseriesMetric(Metric):
    """
    This class is the parent of Metric implementations which can return timeseries
    results.  It provides a single WTForm field to allow configuration of timeseries
    output.
    """

    start_date  = BetterDateTimeField(
        default=thirty_days_ago, validators=[NotGreater('end_date')])
    end_date    = BetterDateTimeField(default=today)
    timeseries  = SelectField(
        'Time Series by',
        default=TimeseriesChoices.NONE,
        choices=[
            (TimeseriesChoices.NONE, TimeseriesChoices.NONE),
            (TimeseriesChoices.HOUR, TimeseriesChoices.HOUR),
            (TimeseriesChoices.DAY, TimeseriesChoices.DAY),
            (TimeseriesChoices.MONTH, TimeseriesChoices.MONTH),
            (TimeseriesChoices.YEAR, TimeseriesChoices.YEAR),
        ],
        description='Report results by year, month, day, or hour',
    )

    def apply_timeseries(self, query, column=Revision.rev_timestamp):
        """
        Take a query and slice it up into equal time intervals

        Parameters
            query   : a sql alchemy query
            column  : defaults to Revision.rev_timestamp, specifies the timestamp
                      column to use for the timeseries

        Returns
            The query parameter passed in, with a grouping by the desired time slice
        """
        choice = self.timeseries.data

        if choice == TimeseriesChoices.NONE:
            return query

        # each finer granularity adds one more date-part column and grouping,
        # e.g. a DAY series ends up grouped by year, month, and day
        query = query.add_column(func.year(column))
        query = query.group_by(func.year(column))

        if choice == TimeseriesChoices.YEAR:
            return query

        query = query.add_column(func.month(column))
        query = query.group_by(func.month(column))

        if choice == TimeseriesChoices.MONTH:
            return query

        query = query.add_column(func.day(column))
        query = query.group_by(func.day(column))

        if choice == TimeseriesChoices.DAY:
            return query

        query = query.add_column(func.hour(column))
        query = query.group_by(func.hour(column))

        if choice == TimeseriesChoices.HOUR:
            return query

    def results_by_user(self, user_ids, query, submetrics, date_index=None):
        """
        Get results by user for a timeseries-enabled metric

        Parameters
            user_ids            : list of integer ids to return results for
            query               : sqlalchemy query to fetch results
            submetrics          : list of tuples of the form (label, index, default)
            date_index          : index of the year date part in the result row,
                                  in case this is a timeseries query

        Returns
            A dictionary of user_ids to results, shaped depending on timeseries:
            user_id: {
                'submetric 1': {
                    'date slice 1': submetric_1_value,
                    'date slice 2': submetric_1_value,
                    ...
                },
                'submetric 2': ...

                OR

                'submetric 1': submetric_1_value,
                'submetric 2': submetric_2_value,
                ...
            }
        """
        # get a dictionary of user_ids to their metric results
        results = self.submetrics_by_user(query, submetrics, date_index)

        # default user_ids to all the user ids in the results, if not present
        if user_ids and len(user_ids):
            if len(results) > len(user_ids):
                raise Exception('Filtering did not work properly')
        else:
            user_ids = results.keys()

        def default_submetrics():
            # Build a FRESH default result dict (and fresh inner dicts) on
            # every call.  The previous code built one shared dict and handed
            # the same instance to every user missing from the results, so
            # mutating one such user's results silently changed the others.
            defaults = dict()
            for label, index, default in submetrics:
                if self.timeseries.data == TimeseriesChoices.NONE:
                    defaults[label] = default
                else:
                    defaults[label] = dict()
            return defaults

        # populate users not found by the query with per-user defaults
        results = {
            user_id: results.get(user_id, default_submetrics())
            for user_id in user_ids
        }

        # in timeseries results, fill in missing date-times
        results = self.normalize_datetime_slices(results, submetrics)
        return results

    def submetrics_by_user(self, query, submetrics, date_index=None):
        """
        Same as results_by_user, except doesn't return results for users not found in
        the query_results list.
        """
        query_results = query.all()
        results = OrderedDict()

        # handle simple cases (no results or no timeseries)
        if not query_results:
            return results

        # get results by user and by date
        for row in query_results:
            user_id = row[0]
            if user_id not in results:
                results[user_id] = OrderedDict()

            date_slice = None
            if self.timeseries.data != TimeseriesChoices.NONE:
                date_slice = self.get_date_from_tuple(row, date_index, len(row))

            for label, index, default in submetrics:
                if date_slice:
                    if label not in results[user_id]:
                        results[user_id][label] = dict()
                    results[user_id][label][date_slice] = row[index]
                else:
                    results[user_id][label] = row[index]

        return results

    def normalize_datetime_slices(self, results_by_user, submetrics):
        """
        Starting from a sparse set of timeseries results, fill in default values
        for the specified list of sub-metrics.  Also make sure the chronological
        first timeseries slice is >= self.start_date.
        If self.timeseries is NONE, this is a simple identity function.

        Parameters
            results_by_user : dictionary of submetrics dictionaries by user
            submetrics      : list of tuples of the form (label, index, default)

        Returns
            the results, filled in with default values
        """
        if self.timeseries.data == TimeseriesChoices.NONE:
            return results_by_user

        # build the complete set of expected date slices, each mapped to a
        # None placeholder that marks it as "no data yet"
        slice_delta = self.get_delta_from_choice()
        timeseries_slices = OrderedDict()
        start_slice_key = format_pretty_date(self.start_date.data)
        timeseries_slices[start_slice_key] = None

        first_slice = self.get_first_slice()
        first_slice_key = format_pretty_date(first_slice)
        slice_to_default = first_slice
        while slice_to_default < self.end_date.data:
            date_key = format_pretty_date(slice_to_default)
            timeseries_slices[date_key] = None
            slice_to_default += slice_delta

        # .items() instead of the Python-2-only .iteritems(), so this also
        # runs on Python 3 (behavior is identical on Python 2)
        for user_id, user_submetrics in results_by_user.items():
            for label, i, default in submetrics:
                if not label or not user_submetrics or label not in user_submetrics:
                    continue
                defaults = timeseries_slices.copy()
                defaults.update(user_submetrics[label])
                for k, v in defaults.items():
                    # only the None placeholders are missing slices; a genuine
                    # falsy result (e.g. a 0 count) must not be overwritten
                    # (the old `if not v` check clobbered those too)
                    if v is None:
                        defaults[k] = default

                if start_slice_key != first_slice_key:
                    # coerce the first datetime slice to be self.start_date
                    defaults[start_slice_key] = defaults.pop(first_slice_key)

                user_submetrics[label] = defaults

        return results_by_user

    def get_first_slice(self):
        """
        Given a user's choice of timeseries grouping, and the value of the start_date,
        return the first interval in the timeseries
        """
        if self.timeseries.data == TimeseriesChoices.NONE:
            return None

        # truncate start_date down to the boundary of the chosen granularity
        d = self.start_date.data
        if self.timeseries.data == TimeseriesChoices.HOUR:
            return datetime(d.year, d.month, d.day, d.hour)
        if self.timeseries.data == TimeseriesChoices.DAY:
            return datetime(d.year, d.month, d.day, 0)
        if self.timeseries.data == TimeseriesChoices.MONTH:
            return datetime(d.year, d.month, 1, 0)
        if self.timeseries.data == TimeseriesChoices.YEAR:
            return datetime(d.year, 1, 1, 0)

    def get_delta_from_choice(self):
        """
        Given a user's choice of timeseries grouping,
        return a delta that would be one "slice" wide
        """
        if self.timeseries.data == TimeseriesChoices.NONE:
            return relativedelta(hours=0)
        if self.timeseries.data == TimeseriesChoices.HOUR:
            return relativedelta(hours=1)
        if self.timeseries.data == TimeseriesChoices.DAY:
            return relativedelta(days=1)
        if self.timeseries.data == TimeseriesChoices.MONTH:
            return relativedelta(months=1)
        if self.timeseries.data == TimeseriesChoices.YEAR:
            return relativedelta(years=1)

    def get_date_from_tuple(self, row_tuple, start_index, stop_index):
        """
        Suppose you have a tuple like this:
        ([data], [data], ... , year, month, day, [data], [data])
        Then this function will parse out the year, month, day, and hour
        into a date string.  Anything beyond year, month, day is optional.
        """
        date_pieces = row_tuple[start_index:stop_index]
        # epoch-style fallbacks for any date part the row doesn't carry
        year, month, day, hour = 1970, 1, 1, 0

        if len(date_pieces) > 0:
            year = date_pieces[0]
        if len(date_pieces) > 1:
            month = date_pieces[1]
        if len(date_pieces) > 2:
            day = date_pieces[2]
        if len(date_pieces) > 3:
            hour = date_pieces[3]

        return format_pretty_date(datetime(year, month, day, hour))
Example #6
0
class RollingActiveEditor(Metric):
    """
    This sql query was used as a starting point for the sqlalchemy query:

    SET @n = <<edits threshold, default 5>>;
    SET @u = <<activity unit in days, default 30>>;
    SET @T = <<end date passed in, or each date between start and end if timeseries>>;

    /* Results in a set of "new editors" */
     SELECT user_id
       FROM (/* Get revisions to content pages that are still visible */
             SELECT user_id,
                    SUM(rev_id IS NOT NULL) AS revisions
               FROM User
                        LEFT JOIN
                    Revision    ON rev_user = user_id
              WHERE rev_timestamp BETWEEN [@T - @u days] AND @T
              GROUP BY user_id, user_name, user_registration

              UNION ALL

             /* Get revisions to content pages that have been archived */
             SELECT user_id,
                    /* Note that ar_rev_id is sometimes set to NULL :( */
                    SUM(ar_id IS NOT NULL) AS revisions
               FROM User
                        LEFT JOIN
                    Archive     ON ar_user = user_id
              WHERE ar_timestamp BETWEEN [@T - @u days] AND @T
              GROUP BY user_id
            ) AS user_content_revision_count
      GROUP BY user_id
     HAVING SUM(revisions) >= @n;

    NOTE: updated to exclude bots as identified by:

     SELECT ug_user
       FROM user_groups
      WHERE ug_group = 'bot'
    """

    show_in_ui = True
    id = 'rolling_active_editor'
    label = 'Rolling Active Editor'
    category = 'Community'
    description = (
        'Compute the number of registered users who complete <<n>> edits to pages'
        ' in any namespace of a Wikimedia project between <<end date>> minus <<u>> days'
        ' and <<end date>>')
    default_result = {
        'rolling_active_editor': 0,
    }

    number_of_edits = IntegerField(default=5)
    rolling_days = IntegerField(default=30)
    end_date = BetterDateTimeField(
        label='As Of Date',
        default=today,
        description=
        'Editors making Number Of Edits within Rolling Days of this date')

    def __call__(self, user_ids, session):
        """
        Parameters:
            user_ids    : list of mediawiki user ids to restrict computation to
            session     : sqlalchemy session open on a mediawiki database

        Returns:
            dictionary from user ids to: 1 if they're a rolling active editor, 0 if not
        """
        edit_threshold = int(self.number_of_edits.data)
        window_days = int(self.rolling_days.data)
        window_end = self.end_date.data
        window_start = window_end - timedelta(days=window_days)

        live_user = label('user_id', Revision.rev_user)
        deleted_user = label('user_id', Archive.ar_user)
        edit_count = label('count', func.count())

        # per-user count of edits to pages that still exist, within the window
        live_edits = (
            session.query(live_user, edit_count)
            .filter(between(Revision.rev_timestamp, window_start, window_end))
            .group_by(Revision.rev_user))
        live_edits = self.filter(live_edits, user_ids, column=Revision.rev_user)

        # per-user count of edits to pages that have since been deleted
        deleted_edits = (
            session.query(deleted_user, edit_count)
            .filter(between(Archive.ar_timestamp, window_start, window_end))
            .group_by(Archive.ar_user))
        deleted_edits = self.filter(
            deleted_edits, user_ids, column=Archive.ar_user)

        # members of the 'bot' user group are excluded from the metric
        bots = (
            session.query(MediawikiUserGroups.ug_user)
            .filter(MediawikiUserGroups.ug_group == 'bot')
            .subquery())

        # combine both sources and keep users whose total meets the threshold
        all_edits = live_edits.union_all(deleted_edits).subquery()
        qualifying = (
            session.query(all_edits.c.user_id)
            .filter(all_edits.c.user_id.notin_(bots))
            .group_by(all_edits.c.user_id)
            .having(func.SUM(all_edits.c.count) >= edit_threshold))

        metric_results = {row[0]: {self.id: 1} for row in qualifying.all()}

        if user_ids is None:
            return metric_results
        return {
            uid: metric_results.get(uid, self.default_result)
            for uid in user_ids
        }
class RollingSurvivingNewActiveEditor(Metric):
    """
    A "surviving" new active editor registered recently and then cleared an
    edit threshold in each of two consecutive time windows.

    This sql query was used as a starting point for the sqlalchemy query:

SET @n = <<edits threshold, default 5>>;
SET @u = <<activity unit in days, default 30>>;
SET @T = <<end date passed in, or each date between start and end if timeseries>>;

 SELECT user_id
   FROM (/* Get revisions to content pages that are still visible */
         SELECT user_id,
                SUM(rev_id IS NOT NULL and rev_timestamp <= [@T - @u days]) AS revisions1
                SUM(rev_id IS NOT NULL and rev_timestamp > [@T - @u days]) AS revisions2
           FROM User
                    INNER JOIN
                Logging     ON log_user = user_id
                    LEFT JOIN
                Revision    ON rev_user = user_id
          WHERE log_type = 'newusers'
            AND log_action = 'create'
            AND user_registration BETWEEN [@T - 2*@u days] AND [@T - @u days]
            AND rev_timestamp BETWEEN [@T - 2*@u days] AND @T
          GROUP BY user_id

          UNION ALL

         /* Get revisions to content pages that have been archived */
         SELECT user_id,
                /* Note that ar_rev_id is sometimes set to NULL :( */
                SUM(ar_id IS NOT NULL and ar_timestamp <= [@T - @u days]) AS revisions1
                SUM(ar_id IS NOT NULL and ar_timestamp > [@T - @u days]) AS revisions2
           FROM User
                    INNER JOIN
                Logging     ON log_user = user_id
                    LEFT JOIN
                Archive     ON ar_user = user_id
          WHERE log_type = 'newusers'
            AND log_action = 'create'
            AND user_registration BETWEEN [@T - 2*@u days] AND [@T - @u days]
            AND ar_timestamp BETWEEN [@T - 2*@u days] AND @T
          GROUP BY user_id
        ) AS user_content_revision_count
  GROUP BY user_id
 HAVING SUM(revisions1) >= @n
    AND SUM(revisions2) >= @n

    NOTE: updated to exclude bots as identified by:

 SELECT ug_user
   FROM user_groups
  WHERE ug_group = 'bot'
    """

    show_in_ui = True
    id = 'rolling_surviving_new_active_editor'
    label = 'Rolling Surviving New Active Editor'
    category = 'Community'
    description = (
        'Compute the number of users newly registered within <<end date>> minus <<u>> * 2'
        ' days who complete <<n>> edits to pages in any namespace of a Wikimedia project'
        ' two times.  Once between <<end date>> minus <<u>> days and <<end date>>.  Then'
        ' between <<end date>> minus <<u*2>> days and <<end date>> minus <<u>> days.'
    )
    default_result = {
        'rolling_surviving_new_active_editor': 0,
    }

    # <<n>> in the SQL above: edits required in EACH of the two windows
    number_of_edits = IntegerField(default=5)
    # <<u>> in the SQL above: width, in days, of each window
    rolling_days = IntegerField(default=30)
    end_date = BetterDateTimeField(
        label='As Of Date',
        default=today,
        description='Newly Registered users making Number Of Edits within '
        'Rolling Days * 2 of this date, and again another Number '
        'Of Edits within Rolling Days of this date')

    def __call__(self, user_ids, session):
        """
        Parameters:
            user_ids    : list of mediawiki user ids to restrict computation to
            session     : sqlalchemy session open on a mediawiki database

        Returns:
            {
                user id: 1 if they're a rolling surviving new active editor, 0 otherwise
                for all cohort users, or all users that have edits in the time period
            }
        """
        number_of_edits = int(self.number_of_edits.data)
        rolling_days = int(self.rolling_days.data)
        end_date = self.end_date.data
        # the overall span [start_date, end_date] is split at mid_date into two
        # equal windows; a user must reach number_of_edits in BOTH to qualify
        mid_date = end_date - timedelta(days=rolling_days)
        start_date = end_date - timedelta(days=rolling_days * 2)

        # accounts self-created ('newusers'/'create') in the first window
        newly_registered = session.query(Logging.log_user) \
            .filter(Logging.log_type == 'newusers') \
            .filter(Logging.log_action == 'create') \
            .filter(between(Logging.log_timestamp, start_date, mid_date))

        # subquery to select only the users registered between start and mid date
        filtered_new = self.filter(newly_registered,
                                   user_ids,
                                   column=Logging.log_user).subquery()

        rev_user = label('user_id', Revision.rev_user)
        ar_user = label('user_id', Archive.ar_user)
        # count edits between start and mid date, for Revision
        rev_count_one = _get_count('count_one',
                                   Revision.rev_timestamp <= mid_date)
        # count edits between start and mid date, for Archive
        ar_count_one = _get_count('count_one',
                                  Archive.ar_timestamp <= mid_date)
        # count edits between mid and end date, for Revision
        rev_count_two = _get_count('count_two',
                                   Revision.rev_timestamp > mid_date)
        # count edits between mid and end date, for Archive
        ar_count_two = _get_count('count_two', Archive.ar_timestamp > mid_date)

        # get both counts by user for Revision
        revisions = session.query(rev_user, rev_count_one, rev_count_two)\
            .filter(between(Revision.rev_timestamp, start_date, end_date))\
            .filter(Revision.rev_user.in_(filtered_new))\
            .group_by(Revision.rev_user)

        # get both counts by user for Archive (edits to since-deleted pages)
        archived = session.query(ar_user, ar_count_one, ar_count_two)\
            .filter(between(Archive.ar_timestamp, start_date, end_date))\
            .filter(Archive.ar_user.in_(filtered_new))\
            .group_by(Archive.ar_user)

        # members of the 'bot' user group are excluded from the metric
        bot_user_ids = session.query(MediawikiUserGroups.ug_user)\
            .filter(MediawikiUserGroups.ug_group == 'bot')\
            .subquery()

        # For each user, with both counts from both tables,
        #   sum the count_one values together, check it's >= number_of_edits
        #   sum the count_two values together, check it's >= number_of_edits
        new_edits = revisions.union_all(archived).subquery()
        new_edits_by_user = session.query(new_edits.c.user_id)\
            .filter(new_edits.c.user_id.notin_(bot_user_ids))\
            .group_by(new_edits.c.user_id)\
            .having(and_(
                func.SUM(new_edits.c.count_one) >= number_of_edits,
                func.SUM(new_edits.c.count_two) >= number_of_edits,
            ))

        # users matched by the query satisfy the metric (1); everyone else in
        # the cohort falls back to default_result (0) below
        metric_results = {r[0]: {self.id: 1} for r in new_edits_by_user.all()}

        if user_ids is None:
            return metric_results
        else:
            return {
                uid: metric_results.get(uid, self.default_result)
                for uid in user_ids
            }