class RevertRate(Metric): """ This class implements revert rate logic. An instance of the class is callable and will compute the number of reverted edits for each user in a passed-in list. This sql query was used as a starting point for the sqlalchemy query: select r.rev_user, r.count(*) from revision r where r.rev_timestamp between [start] and [end] and r.rev_user in ([cohort's user list or maybe a join to a temp]) and exists ( select * from revision r1 inner join revision r2 on r2.rev_sha1 = r1.rev_sha1 and r1.rev_sha1 <> r.rev_sha1 where r1.rev_page = r.rev_page and r2.rev_page = r.rev_page and r1.rev_timestamp between [start - 2 weeks] and r.rev_timestamp and r2.rev_timestamp between r.rev_timestamp and [end + 2 weeks] ) group by rev_user """ show_in_ui = False id = 'revert-rate' label = 'Revert Rate' description = 'Compute the number of reverted edits in a mediawiki project' default_result = {} start_date = BetterDateTimeField(default=thirty_days_ago) end_date = BetterDateTimeField(default=today) namespaces = CommaSeparatedIntegerListField( None, [Required()], default='0', description='0, 2, 4, etc.', ) def __call__(self, user_ids, session): """ Parameters: user_ids : list of mediawiki user ids to find edit reverts for session : sqlalchemy session open on a mediawiki database Returns: dictionary from user ids to the number of edit reverts found. """ return {user: None for user in user_ids}
class ProgramMetricsForm(CohortUpload): """ Defines the fields necessary to upload inputs to calculate the ProgramMetrics """ # Override cohort name and default project # The user for the ProgramMetrics API doesn't have to define these name = HiddenField(default='ProgramGlobalMetricsCohort_' + str(uuid.uuid1())) project = HiddenField(default='enwiki') validate_as_user_ids = HiddenField(default='False') start_date = BetterDateTimeField(default=thirty_days_ago, validators=[NotGreater('end_date')]) end_date = BetterDateTimeField(default=today)
class PagesEdited(Metric): """ This class counts the pages edited by editors over a period of time. This sql query was used as a starting point for the sqlalchemy query: SELECT user_id, COUNT(*) FROM ( SELECT DISTINCT user_id, page_id FROM ( SELECT r.rev_user AS user_id, r.rev_page AS page_id FROM revision r INNER JOIN page p ON p.page_id = r.rev_page WHERE r.rev_timestamp BETWEEN [start] AND [end] AND r.rev_user in ([parameterized]) AND p.page_namespace in ([parameterized]) UNION ALL SELECT a.ar_user AS user_id, a.ar_page_id AS page_id FROM archive a WHERE a.ar_timestamp BETWEEN [start] AND [end] AND a.ar_user in ([parameterized]) AND a.ar_namespace in ([parameterized]) ) AS subq1 ) AS subq2 GROUP BY user_id; """ # NOTE: this is not enabled in the UI yet, but could be easily # The deduplicate parameter's a bit confusing, maybe consider # re-wording that when making this metric available show_in_ui = False id = 'pages_edited' label = 'Pages Edited' category = 'Content' description = ( 'Compute the number of unique pages edited by the' 'cohort\'s users within a time interval' ) default_result = {'pages_edited': 0} start_date = BetterDateTimeField(default=thirty_days_ago) end_date = BetterDateTimeField(default=today) include_deleted = BetterBooleanField( default=True, description='Count pages that have been deleted', ) namespaces = CommaSeparatedIntegerListField( None, description='0, 2, 4, etc. (leave blank for *all*)', ) deduplicate_across_users = BetterBooleanField( default=False, description='Count unique pages edited by the entire cohort,' ' rolled up to one number.', ) def __call__(self, user_ids, session): """ Parameters: user_ids : list of mediawiki user ids to find pages for session : sqlalchemy session open on a mediawiki database Returns: dictionary from user ids to the number of pages edited found """ start_date = self.start_date.data end_date = self.end_date.data deduplicate = self.deduplicate_across_users.data revisions = ( session.query( label('user_id', Revision.rev_user), label('page_id', Revision.rev_page), label('timestamp', Revision.rev_timestamp) ) .filter(Revision.rev_timestamp > start_date) .filter(Revision.rev_timestamp <= end_date)) archives = ( session.query( label('user_id', Archive.ar_user), label('page_id', Archive.ar_page_id), label('timestamp', Archive.ar_timestamp) ) .filter(Archive.ar_timestamp > start_date) .filter(Archive.ar_timestamp <= end_date)) if self.namespaces.data and len(self.namespaces.data) > 0: revisions = ( revisions .join(Page) .filter(Page.page_namespace.in_(self.namespaces.data)) ) archives = ( archives .filter(Archive.ar_namespace.in_(self.namespaces.data)) ) revisions = self.filter(revisions, user_ids, column=Revision.rev_user) archives = self.filter(archives, user_ids, column=Archive.ar_user) both = revisions if self.include_deleted.data: both = both.union_all(archives) both = both.subquery() if deduplicate: # Use a constant user id here to deduplicate only by page # A single result will be returned and assigned to user_id = ROLLUP_USER_ID both_grouped = ( session.query( label('user_id', literal_column(str(ROLLUP_USER_ID))), both.c.page_id ) .distinct().subquery() ) else: # Select distinct user_id-page_id pairs # to count edits by the same user on the same page as one both_grouped = ( session.query(both.c.user_id, both.c.page_id) .distinct().subquery() ) query = ( session.query(both_grouped.c.user_id, func.count()) .group_by(both_grouped.c.user_id) ) # Format the output metric_results = {r[0]: {PagesEdited.id : r[1]} for r in query.all()} if user_ids is None: return metric_results elif deduplicate: ret = {} ret[ROLLUP_USER_ID] = metric_results.get( ROLLUP_USER_ID, self.default_result ) return ret else: return { uid: metric_results.get(uid, self.default_result) for uid in user_ids }
class NewlyRegistered(Metric): """ A newly registered user is a previously unregistered user creating a username for the first time on a Wikimedia project. The SQL query that inspired this metric was: SELECT log_user AS user_id FROM enwiki.logging /* exclude proxy registrations */ WHERE log_type = 'newusers' /* only include self-created users, exclude attached and proxy-registered users */ AND log_action = 'create' AND log_timestamp BETWEEN @start_date AND @end_date; """ show_in_ui = True id = 'newly_registered' label = 'Newly Registered' category = 'New users' description = ( 'A newly registered user is a previously unregistered user creating a username \ for the first time on a Wikimedia project.') default_result = { 'newly_registered': 0, } start_date = BetterDateTimeField(default=thirty_days_ago) end_date = BetterDateTimeField(default=today) def __call__(self, user_ids, session): """ Parameters: user_ids : list of mediawiki user ids to investigate session : sqlalchemy session open on a mediawiki database Returns: dictionary from user ids to a dictionary of the form: { 'newly_registered': 1 for True, 0 for False } If no user_ids are specified, only {'newly_registered': 1} will be returned for those user ids which satisfy the metric """ start_date = self.start_date.data end_date = self.end_date.data query = session.query(Logging.log_user) \ .filter(Logging.log_type == 'newusers') \ .filter(Logging.log_action == 'create') \ .filter(Logging.log_timestamp > start_date)\ .filter(Logging.log_timestamp <= end_date) metric = self.filter(query, user_ids, column=Logging.log_user) data = metric.all() metric_results = {r[0]: {NewlyRegistered.id: 1} for r in data} if user_ids is None: return metric_results else: return { uid: metric_results.get(uid, self.default_result) for uid in user_ids }
class TimeseriesMetric(Metric): """ This class is the parent of Metric implementations which can return timeseries results. It provides a single WTForm field to allow configuration of timeseries output. """ start_date = BetterDateTimeField( default=thirty_days_ago, validators=[NotGreater('end_date')]) end_date = BetterDateTimeField(default=today) timeseries = SelectField( 'Time Series by', default=TimeseriesChoices.NONE, choices=[ (TimeseriesChoices.NONE, TimeseriesChoices.NONE), (TimeseriesChoices.HOUR, TimeseriesChoices.HOUR), (TimeseriesChoices.DAY, TimeseriesChoices.DAY), (TimeseriesChoices.MONTH, TimeseriesChoices.MONTH), (TimeseriesChoices.YEAR, TimeseriesChoices.YEAR), ], description='Report results by year, month, day, or hour', ) def apply_timeseries(self, query, column=Revision.rev_timestamp): """ Take a query and slice it up into equal time intervals Parameters query : a sql alchemy query column : defaults to Revision.rev_timestamp, specifies the timestamp column to use for the timeseries Returns The query parameter passed in, with a grouping by the desired time slice """ choice = self.timeseries.data if choice == TimeseriesChoices.NONE: return query query = query.add_column(func.year(column)) query = query.group_by(func.year(column)) if choice == TimeseriesChoices.YEAR: return query query = query.add_column(func.month(column)) query = query.group_by(func.month(column)) if choice == TimeseriesChoices.MONTH: return query query = query.add_column(func.day(column)) query = query.group_by(func.day(column)) if choice == TimeseriesChoices.DAY: return query query = query.add_column(func.hour(column)) query = query.group_by(func.hour(column)) if choice == TimeseriesChoices.HOUR: return query def results_by_user(self, user_ids, query, submetrics, date_index=None): """ Get results by user for a timeseries-enabled metric Parameters user_ids : list of integer ids to return results for query : sqlalchemy query to fetch results submetrics : list of tuples of the form (label, index, default) date_index : index of the year date part in the result row, in case this is a timeseries query Returns A dictionary of user_ids to results, shaped depending on timeseries: user_id: { 'submetric 1': { 'date slice 1': submetric_1_value, 'date slice 2': submetric_1_value, ... }, 'submetric 2': ... OR 'submetric 1': submetric_1_value, 'submetric 2': submetric_2_value, ... } """ # get a dictionary of user_ids to their metric results results = self.submetrics_by_user(query, submetrics, date_index) # default user_ids to all the user ids in the results, if not present if user_ids and len(user_ids): if len(results.keys()) > len(user_ids): raise Exception('Filtering did not work properly') else: user_ids = results.keys() # make a default return dictionary for users not found by the query submetric_defaults = dict() for label, index, default in submetrics: if self.timeseries.data == TimeseriesChoices.NONE: submetric_defaults[label] = default else: submetric_defaults[label] = dict() # populate users not found by the query with the default created above results = { user_id: results.get(user_id, submetric_defaults) for user_id in user_ids } # in timeseries results, fill in missing date-times results = self.normalize_datetime_slices(results, submetrics) return results def submetrics_by_user(self, query, submetrics, date_index=None): """ Same as results_by_user, except doesn't return results for users not found in the query_results list. """ query_results = query.all() results = OrderedDict() # handle simple cases (no results or no timeseries) if not query_results: return results # get results by user and by date for row in query_results: user_id = row[0] if user_id not in results: results[user_id] = OrderedDict() date_slice = None if self.timeseries.data != TimeseriesChoices.NONE: date_slice = self.get_date_from_tuple(row, date_index, len(row)) for label, index, default in submetrics: if date_slice: if label not in results[user_id]: results[user_id][label] = dict() results[user_id][label][date_slice] = row[index] else: results[user_id][label] = row[index] return results def normalize_datetime_slices(self, results_by_user, submetrics): """ Starting from a sparse set of timeseries results, fill in default values for the specified list of sub-metrics. Also make sure the chronological first timeseries slice is >= self.start_date. If self.timeseries is NONE, this is a simple identity function. Parameters results_by_user : dictionary of submetrics dictionaries by user submetrics : list of tuples of the form (label, index, default) Returns the results, filled in with default values """ if self.timeseries.data == TimeseriesChoices.NONE: return results_by_user slice_delta = self.get_delta_from_choice() timeseries_slices = OrderedDict() start_slice_key = format_pretty_date(self.start_date.data) timeseries_slices[start_slice_key] = None first_slice = self.get_first_slice() first_slice_key = format_pretty_date(first_slice) slice_to_default = first_slice while slice_to_default < self.end_date.data: date_key = format_pretty_date(slice_to_default) timeseries_slices[date_key] = None slice_to_default += slice_delta for user_id, user_submetrics in results_by_user.iteritems(): for label, i, default in submetrics: if not label or not user_submetrics or label not in user_submetrics: continue defaults = timeseries_slices.copy() defaults.update(user_submetrics[label]) for k, v in defaults.iteritems(): if not v: defaults[k] = default if start_slice_key != first_slice_key: # coerce the first datetime slice to be self.start_date defaults[start_slice_key] = defaults.pop(first_slice_key) user_submetrics[label] = defaults return results_by_user def get_first_slice(self): """ Given a user's choice of timeseries grouping, and the value of the start_date, return the first interval in the timeseries """ if self.timeseries.data == TimeseriesChoices.NONE: return None d = self.start_date.data if self.timeseries.data == TimeseriesChoices.HOUR: return datetime(d.year, d.month, d.day, d.hour) if self.timeseries.data == TimeseriesChoices.DAY: return datetime(d.year, d.month, d.day, 0) if self.timeseries.data == TimeseriesChoices.MONTH: return datetime(d.year, d.month, 1, 0) if self.timeseries.data == TimeseriesChoices.YEAR: return datetime(d.year, 1, 1, 0) def get_delta_from_choice(self): """ Given a user's choice of timeseries grouping, return a delta that would be one "slice" wide """ if self.timeseries.data == TimeseriesChoices.NONE: return relativedelta(hours=0) if self.timeseries.data == TimeseriesChoices.HOUR: return relativedelta(hours=1) if self.timeseries.data == TimeseriesChoices.DAY: return relativedelta(days=1) if self.timeseries.data == TimeseriesChoices.MONTH: return relativedelta(months=1) if self.timeseries.data == TimeseriesChoices.YEAR: return relativedelta(years=1) def get_date_from_tuple(self, row_tuple, start_index, stop_index): """ Suppose you have a tuple like this: ([data], [data], ... , year, month, day, [data], [data]) Then this function will parse out the year, month, day, and hour into a date string. Anything beyond year, month, day is optional. """ date_pieces = row_tuple[start_index:stop_index] year, month, day, hour = 1970, 1, 1, 0 if len(date_pieces) > 0: year = date_pieces[0] if len(date_pieces) > 1: month = date_pieces[1] if len(date_pieces) > 2: day = date_pieces[2] if len(date_pieces) > 3: hour = date_pieces[3] return format_pretty_date(datetime(year, month, day, hour))
class RollingActiveEditor(Metric): """ This sql query was used as a starting point for the sqlalchemy query: SET @n = <<edits threshold, default 5>>; SET @u = <<activity unit in days, default 30>>; SET @T = <<end date passed in, or each date between start and end if timeseries>>; /* Results in a set of "new editors" */ SELECT user_id FROM (/* Get revisions to content pages that are still visible */ SELECT user_id, SUM(rev_id IS NOT NULL) AS revisions FROM User LEFT JOIN Revision ON rev_user = user_id WHERE rev_timestamp BETWEEN [@T - @u days] AND @T GROUP BY user_id, user_name, user_registration UNION ALL /* Get revisions to content pages that have been archived */ SELECT user_id, /* Note that ar_rev_id is sometimes set to NULL :( */ SUM(ar_id IS NOT NULL) AS revisions FROM User LEFT JOIN Archive ON ar_user = user_id WHERE ar_timestamp BETWEEN [@T - @u days] AND @T GROUP BY user_id ) AS user_content_revision_count GROUP BY user_id HAVING SUM(revisions) >= @n; NOTE: updated to exclude bots as identified by: SELECT ug_user FROM user_groups WHERE ug_group = 'bot' """ show_in_ui = True id = 'rolling_active_editor' label = 'Rolling Active Editor' category = 'Community' description = ( 'Compute the number of registered users who complete <<n>> edits to pages' ' in any namespace of a Wikimedia project between <<end date>> minus <<u>> days' ' and <<end date>>') default_result = { 'rolling_active_editor': 0, } number_of_edits = IntegerField(default=5) rolling_days = IntegerField(default=30) end_date = BetterDateTimeField( label='As Of Date', default=today, description= 'Editors making Number Of Edits within Rolling Days of this date') def __call__(self, user_ids, session): """ Parameters: user_ids : list of mediawiki user ids to restrict computation to session : sqlalchemy session open on a mediawiki database Returns: dictionary from user ids to: 1 if they're a rolling active editor, 0 if not """ number_of_edits = int(self.number_of_edits.data) rolling_days = int(self.rolling_days.data) end_date = self.end_date.data start_date = end_date - timedelta(days=rolling_days) rev_user = label('user_id', Revision.rev_user) ar_user = label('user_id', Archive.ar_user) count = label('count', func.count()) revisions = session.query(rev_user, count)\ .filter(between(Revision.rev_timestamp, start_date, end_date))\ .group_by(Revision.rev_user) revisions = self.filter(revisions, user_ids, column=Revision.rev_user) archived = session.query(ar_user, count)\ .filter(between(Archive.ar_timestamp, start_date, end_date))\ .group_by(Archive.ar_user) archived = self.filter(archived, user_ids, column=Archive.ar_user) bot_user_ids = session.query(MediawikiUserGroups.ug_user)\ .filter(MediawikiUserGroups.ug_group == 'bot')\ .subquery() edits = revisions.union_all(archived).subquery() edits_by_user = session.query(edits.c.user_id)\ .filter(edits.c.user_id.notin_(bot_user_ids))\ .group_by(edits.c.user_id)\ .having(func.SUM(edits.c.count) >= number_of_edits) metric_results = {r[0]: {self.id: 1} for r in edits_by_user.all()} if user_ids is None: return metric_results else: return { uid: metric_results.get(uid, self.default_result) for uid in user_ids }
class RollingSurvivingNewActiveEditor(Metric): """ This sql query was used as a starting point for the sqlalchemy query: SET @n = <<edits threshold, default 5>>; SET @u = <<activity unit in days, default 30>>; SET @T = <<end date passed in, or each date between start and end if timeseries>>; SELECT user_id FROM (/* Get revisions to content pages that are still visible */ SELECT user_id, SUM(rev_id IS NOT NULL and rev_timestamp <= [@T - @u days]) AS revisions1 SUM(rev_id IS NOT NULL and rev_timestamp > [@T - @u days]) AS revisions2 FROM User INNER JOIN Logging ON log_user = user_id LEFT JOIN Revision ON rev_user = user_id WHERE log_type = 'newusers' AND log_action = 'create' AND user_registration BETWEEN [@T - 2*@u days] AND [@T - @u days] AND rev_timestamp BETWEEN [@T - 2*@u days] AND @T GROUP BY user_id UNION ALL /* Get revisions to content pages that have been archived */ SELECT user_id, /* Note that ar_rev_id is sometimes set to NULL :( */ SUM(ar_id IS NOT NULL and ar_timestamp <= [@T - @u days]) AS revisions1 SUM(ar_id IS NOT NULL and ar_timestamp > [@T - @u days]) AS revisions2 FROM User INNER JOIN Logging ON log_user = user_id LEFT JOIN Archive ON ar_user = user_id WHERE log_type = 'newusers' AND log_action = 'create' AND user_registration BETWEEN [@T - 2*@u days] AND [@T - @u days] AND ar_timestamp BETWEEN [@T - 2*@u days] AND @T GROUP BY user_id ) AS user_content_revision_count GROUP BY user_id HAVING SUM(revisions1) >= @n AND SUM(revisions2) >= @n NOTE: updated to exclude bots as identified by: SELECT ug_user FROM user_groups WHERE ug_group = 'bot' """ show_in_ui = True id = 'rolling_surviving_new_active_editor' label = 'Rolling Surviving New Active Editor' category = 'Community' description = ( 'Compute the number of users newly registered within <<end date>> minus <<u>> * 2' ' days who complete <<n>> edits to pages in any namespace of a Wikimedia project' ' two times. Once between <<end date>> minus <<u>> days and <<end date>>. Then' ' between <<end date>> minus <<u*2>> days and <<end date>> minus <<u>> days.' ) default_result = { 'rolling_surviving_new_active_editor': 0, } number_of_edits = IntegerField(default=5) rolling_days = IntegerField(default=30) end_date = BetterDateTimeField( label='As Of Date', default=today, description='Newly Registered users making Number Of Edits within ' 'Rolling Days * 2 of this date, and again another Number ' 'Of Edits within Rolling Days of this date') def __call__(self, user_ids, session): """ Parameters: user_ids : list of mediawiki user ids to restrict computation to session : sqlalchemy session open on a mediawiki database Returns: { user id: 1 if they're a rolling surviving new active editor, 0 otherwise for all cohort users, or all users that have edits in the time period } """ number_of_edits = int(self.number_of_edits.data) rolling_days = int(self.rolling_days.data) end_date = self.end_date.data mid_date = end_date - timedelta(days=rolling_days) start_date = end_date - timedelta(days=rolling_days * 2) newly_registered = session.query(Logging.log_user) \ .filter(Logging.log_type == 'newusers') \ .filter(Logging.log_action == 'create') \ .filter(between(Logging.log_timestamp, start_date, mid_date)) # subquery to select only the users registered between start and mid date filtered_new = self.filter(newly_registered, user_ids, column=Logging.log_user).subquery() rev_user = label('user_id', Revision.rev_user) ar_user = label('user_id', Archive.ar_user) # count edits between start and mid date, for Revision rev_count_one = _get_count('count_one', Revision.rev_timestamp <= mid_date) # count edits between start and mid date, for Archive ar_count_one = _get_count('count_one', Archive.ar_timestamp <= mid_date) # count edits between mid and end date, for Revision rev_count_two = _get_count('count_two', Revision.rev_timestamp > mid_date) # count edits between mid and end date, for Archive ar_count_two = _get_count('count_two', Archive.ar_timestamp > mid_date) # get both counts by user for Revision revisions = session.query(rev_user, rev_count_one, rev_count_two)\ .filter(between(Revision.rev_timestamp, start_date, end_date))\ .filter(Revision.rev_user.in_(filtered_new))\ .group_by(Revision.rev_user) # get both counts by user for Archive archived = session.query(ar_user, ar_count_one, ar_count_two)\ .filter(between(Archive.ar_timestamp, start_date, end_date))\ .filter(Archive.ar_user.in_(filtered_new))\ .group_by(Archive.ar_user) bot_user_ids = session.query(MediawikiUserGroups.ug_user)\ .filter(MediawikiUserGroups.ug_group == 'bot')\ .subquery() # For each user, with both counts from both tables, # sum the count_one values together, check it's >= number_of_edits # sum the count_two values together, check it's >= number_of_edits new_edits = revisions.union_all(archived).subquery() new_edits_by_user = session.query(new_edits.c.user_id)\ .filter(new_edits.c.user_id.notin_(bot_user_ids))\ .group_by(new_edits.c.user_id)\ .having(and_( func.SUM(new_edits.c.count_one) >= number_of_edits, func.SUM(new_edits.c.count_two) >= number_of_edits, )) metric_results = {r[0]: {self.id: 1} for r in new_edits_by_user.all()} if user_ids is None: return metric_results else: return { uid: metric_results.get(uid, self.default_result) for uid in user_ids }