def get_revisions_and_flagged_data(rev_ids, treatment_date, con):
    """Fetch revision rows together with their flagged-revision data.

    The query runs against `dewiki_p`. For each id in `rev_ids` it selects
    rev_id, rev_page, page_namespace, rev_timestamp, the fr_timestamp of the
    flaggedrevs row for that exact revision (left join, so it can be missing)
    and `max_fr_ts`, the largest fr_timestamp on the same page that is
    strictly below `treatment_date`.

    :param rev_ids: sized collection of revision ids; each is rendered with
        `str()` into the `in (...)` list. When it is empty the list is
        rendered as `null`.
    :param treatment_date: cut-off for `max_fr_ts`; converted with
        `to_wmftimestamp` before being bound to the query.
    :param con: connection on which `use dewiki_p;` and the query are run.
    :return: DataFrame whose fr_timestamp, max_fr_ts and rev_timestamp
        columns have been passed through `from_wmftimestamp`.
    """
    query_template = (
        " select rev_id, rev_page, page_namespace, rev_timestamp, fr_timestamp, "
        "(select max(fr_timestamp) from flaggedrevs where fr_page_id=rev_page "
        "and fr_timestamp < :treatment_date) max_fr_ts "
        "from ( select rev_id, rev_page, rev_timestamp, page_namespace "
        "from revision_userindex join page on page_id = rev_page "
        "where rev_id in ({rev_ids_str}) ) auser "
        "left join flaggedrevs on fr_page_id = rev_page and fr_rev_id = rev_id; "
    )

    # len() rather than truthiness: rev_ids may be a container (e.g. a pandas
    # Series) whose truth value is not defined.
    if len(rev_ids) > 0:
        id_list = ','.join(str(rev_id) for rev_id in rev_ids)
    else:
        id_list = "null"
    query = query_template.format(rev_ids_str=id_list)

    bound = {'treatment_date': to_wmftimestamp(treatment_date)}

    con.execute('use dewiki_p;')
    result = pd.read_sql(sqlalchemy.text(query), con, params=bound)

    # Convert every timestamp column from the wiki format.
    for column in ('fr_timestamp', 'max_fr_ts', 'rev_timestamp'):
        result[column] = result[column].apply(from_wmftimestamp)
    return result
def get_active_users(lang, start_date, end_date, min_rev_id, wmf_con):
    """Return account details of users with a revision in the given window.

    A user is selected when the `revision` table of `lang`wiki holds at least
    one of their revisions with `start_date <= rev_timestamp <= end_date`
    and `rev_id > min_rev_id`.

    :param lang: wiki language code; `use {lang}wiki_p;` is executed first.
    :param start_date: inclusive lower bound on rev_timestamp.
    :param end_date: inclusive upper bound on rev_timestamp.
    :param min_rev_id: exclusive lower bound on rev_id.
    :param wmf_con: connection used for both statements.
    :return: DataFrame with columns user_id, user_name, user_registration
        and live_edit_count (the user's `user_editcount`), exactly as the
        database returns them.
    """
    wmf_con.execute(f'use {lang}wiki_p;')

    query = sqlalchemy.text(
        "select user_id, user_name, user_registration, "
        "user_editcount as live_edit_count "
        "from (select distinct(rev_user) from revision "
        "where rev_timestamp >= :start_date and rev_timestamp <= :end_date "
        "and rev_id > :min_rev_id) active_users "
        "join user on active_users.rev_user=user.user_id;"
    )

    # Timestamps are bound as integers.
    lower = int(to_wmftimestamp(start_date))
    upper = int(to_wmftimestamp(end_date))
    bound = {
        "start_date": lower,
        "end_date": upper,
        "min_rev_id": min_rev_id,
    }
    return pd.read_sql(query, con=wmf_con, params=bound)
def get_timestamps_within_range(lang, user_id, con, start_date, end_date):
    """Return the timestamps of a user's edits inside a date range.

    Selects rev_timestamp from `revision_userindex` of `lang`wiki for
    `user_id` where `start_date <= rev_timestamp < end_date`, ordered by
    rev_timestamp.

    :param lang: wiki language code; `use {lang}wiki_p;` is executed first.
    :param user_id: user whose edits are looked up; coerced with `int()`.
    :param con: connection used for both statements.
    :param start_date: inclusive lower bound.
    :param end_date: exclusive upper bound.
    :return: DataFrame with a single `rev_timestamp` column converted with
        `from_wmftimestamp`.
    """
    con.execute(f'use {lang}wiki_p;')

    bound = {
        'user_id': int(user_id),
        'start_date': to_wmftimestamp(start_date),
        'end_date': to_wmftimestamp(end_date),
    }
    query = sqlalchemy.text(
        "select rev_timestamp from revision_userindex "
        "where rev_user = :user_id "
        "and rev_timestamp >= :start_date and rev_timestamp < :end_date "
        "order by rev_timestamp "
    )

    edits = pd.read_sql(query, con=con, params=bound)
    converted = edits['rev_timestamp'].apply(from_wmftimestamp)
    edits['rev_timestamp'] = converted
    return edits
def get_thanks_thanking_user(lang, user_name, start_date, end_date, wmf_con):
    """Return 'thank' log entries for `user_name` within a date window.

    Rows are read from `logging_logindex` of `lang`wiki where `log_action` is
    'thank', `log_title` equals `user_name` (spaces replaced by underscores)
    and `start_date <= log_timestamp <= end_date`. Sender and receiver names
    are then left-joined against `user` to obtain their ids.

    :param lang: wiki language code; `use {lang}wiki_p;` is executed first.
    :param user_name: user name matched against `log_title`.
    :param start_date: inclusive lower bound on log_timestamp.
    :param end_date: inclusive upper bound on log_timestamp.
    :param wmf_con: connection used for both statements.
    :return: DataFrame with columns thank_timestamp (converted with
        `from_wmftimestamp`), sender and receiver (passed through
        `decode_or_nan`), receiver_id and sender_id.
    """
    wmf_con.execute(f"use {lang}wiki_p;")

    # Both bounds are spelled out on purpose. SQL evaluates a chained
    # `a <= x <= b` as `(a <= x) <= b`, comparing a 0/1 result with `b`,
    # which made the previous date filter match every row.
    user_thank_sql = """
        select thank_timestamp, sender, receiver,
               ru.user_id as receiver_id, su.user_id as sender_id
        from (select log_timestamp as thank_timestamp,
                     replace(log_title, '_', ' ') as receiver,
                     log_user_text as sender
              from logging_logindex
              where log_title = :user_name
                and log_action = 'thank'
                and log_timestamp >= :start_date
                and log_timestamp <= :end_date) t
        left join user ru on ru.user_name = t.receiver
        left join user su on su.user_name = t.sender
    """
    sql_params = {
        # log_title stores names with underscores instead of spaces.
        'user_name': user_name.replace(' ', '_'),
        'start_date': to_wmftimestamp(start_date),
        'end_date': to_wmftimestamp(end_date),
    }
    df = pd.read_sql(sqlalchemy.text(user_thank_sql), con=wmf_con,
                     params=sql_params)

    df['thank_timestamp'] = df['thank_timestamp'].apply(from_wmftimestamp)
    df['sender'] = df['sender'].apply(decode_or_nan)
    df['receiver'] = df['receiver'].apply(decode_or_nan)
    return df
def get_users_edit_spans(lang, start_date, end_date, wmf_con):
    """Return first and last edits, within a window, of users registered in it.

    Users are selected from `user` of `lang`wiki when their registration
    timestamp (20010101000000 when it is null) lies between `start_date` and
    `end_date`, both inclusive. For each of them `first_edit` / `last_edit`
    are the minimum / maximum rev_timestamp in `revision_userindex` that
    also lies inside the same window.

    :param lang: wiki language code; selects the database and is returned
        in the `lang` column.
    :param start_date: inclusive lower bound.
    :param end_date: inclusive upper bound.
    :param wmf_con: connection used for both statements.
    :return: DataFrame with columns lang, user_id, user_name,
        user_registration, live_edit_count, first_edit and last_edit.
        The three timestamp columns are converted with `from_wmftimestamp`
        and user_name is passed through `decode_or_nan`.
    """
    wmf_con.execute(f'use {lang}wiki_p;')

    # The window bounds are written as two explicit comparisons. SQL
    # evaluates a chained `a <= x <= b` as `(a <= x) <= b`, which is true for
    # any timestamp, so the sub-selects previously ignored the window.
    # Values are bound rather than formatted into the statement.
    reg_sql = sqlalchemy.text('''
        select :lang as lang, user_id, user_name, user_registration,
               user_editcount as live_edit_count,
               (select min(rev_timestamp) from revision_userindex
                 where rev_user = user_id
                   and rev_timestamp >= :start_date
                   and rev_timestamp <= :end_date) as first_edit,
               (select max(rev_timestamp) from revision_userindex
                 where rev_user = user_id
                   and rev_timestamp >= :start_date
                   and rev_timestamp <= :end_date) as last_edit
        from user
        where coalesce(user_registration, 20010101000000) <= :end_date
          and coalesce(user_registration, 20010101000000) >= :start_date;
    ''')
    sql_params = {
        'lang': lang,
        # Bound as integers so the comparisons stay numeric, as they were
        # when the digits were formatted directly into the statement.
        'start_date': int(to_wmftimestamp(start_date)),
        'end_date': int(to_wmftimestamp(end_date)),
    }
    span_df = pd.read_sql(reg_sql, wmf_con, params=sql_params)

    for column in ('user_registration', 'first_edit', 'last_edit'):
        span_df[column] = span_df[column].apply(from_wmftimestamp)
    span_df['user_name'] = span_df['user_name'].apply(decode_or_nan)
    return span_df
def get_recent_edits(lang, user_id, con, prior_days=None, max_revs=None,
                     end_date=None):
    """Return a user's most recent revisions up to `end_date`.

    The user's last edit at or before `end_date` is located first; the
    result holds the revisions made less than `prior_days` days before that
    last edit, newest first, capped at `max_revs` rows, joined with `page`.

    :param lang: wiki language code; `use {lang}wiki_p;` is executed first.
    :param user_id: user whose revisions are looked up; coerced with `int()`.
    :param con: connection used for both statements.
    :param prior_days: size of the look-back window in days; a falsy value
        selects the default of 84.
    :param max_revs: maximum number of revisions returned; a falsy value
        selects the default of 50.
    :param end_date: only revisions with rev_timestamp <= end_date are
        considered; a falsy value selects the current UTC time.
    :return: DataFrame with columns user_id, rev_timestamp, rev_id, page_id
        and page_namespace.
    """
    if not end_date:
        end_date = datetime.datetime.utcnow()
    if not prior_days:
        prior_days = 84
    if not max_revs:
        max_revs = 50

    con.execute('use {lang}wiki_p;'.format(lang=lang))

    # `order by ts desc` makes the limit keep the newest revisions; without
    # an ordering the database may return any `max_revs` rows of the window.
    revsql = sqlalchemy.text('''
        select user_id, rev_timestamp, rev_id, page_id, page_namespace
        from (select user_id, ts as rev_timestamp, rev_id, rev_page
              from (select a.rev_user as user_id,
                           timestamp(a.rev_timestamp) as ts,
                           a.rev_id as rev_id,
                           timestamp(b.mts) as mts,
                           rev_page
                    from (select rev_user, rev_timestamp, rev_id, rev_page
                          from revision_userindex
                          where rev_user = :user_id
                            and rev_timestamp <= :end_date) a
                    join (select rev_user, max(rev_timestamp) as mts
                          from revision_userindex
                          where rev_user = :user_id
                            and rev_timestamp <= :end_date) b
                      on a.rev_user = b.rev_user
                   ) uhist
              where ts > date_sub(mts, interval :prior_days day)
              order by ts desc
              limit :max_revs) revs
        join page on rev_page = page_id;
    ''')
    sql_params = {
        'user_id': int(user_id),
        # Integer so the comparison stays numeric, as it was when the digits
        # were formatted directly into the statement.
        'end_date': int(to_wmftimestamp(end_date)),
        'prior_days': prior_days,
        # LIMIT needs a bare integer, never a quoted value.
        'max_revs': int(max_revs),
    }
    udf = pd.read_sql(revsql, con, params=sql_params)
    return udf