def get_month_pageviews_post_hive():
    """
    Get monthly pageview counts per (project, country, access method) from Hive.
    """
    query = """
    SET mapred.job.queue.name=priority;
    SELECT
        SUM(view_count) AS n,
        year,
        month,
        country_code AS country_iso,
        project,
        access_method
    FROM wmf.projectview_hourly
    WHERE agent_type = 'user'
        AND project RLIKE 'wikipedia'
        AND year >= 2015
    GROUP BY year, month, country_code, project, access_method;
    """
    df = query_hive_ssh(query, 'forecasting_refresh', priority=True, delete=True)
    # build a YYYY-MM-01 timestamp from the year and month columns
    df['month'] = df['month'].astype(str)
    df['year'] = df['year'].astype(str)
    df['month'] = df['month'].apply(lambda x: x if len(x) == 2 else '0' + x)
    df['timestamp'] = df['year'] + '-' + df['month'] + '-01'
    del df['month']
    del df['year']
    df.index = df['timestamp']
    return df

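# Usage sketch (illustrative only; assumes SSH access to the analytics
# cluster via query_hive_ssh, and that country_iso holds ISO-2 codes):
#
#   monthly = get_month_pageviews_post_hive()
#   de = monthly[monthly.country_iso == 'DE']
#   de.groupby(['timestamp', 'access_method']).n.sum()
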
def get_pageviews(start, stop, country, project):
    query = """
    SELECT
        year, month, day, hour,
        SUM(view_count) AS pageviews,
        access_method
    FROM wmf.projectview_hourly
    WHERE agent_type = 'user'
        AND %(time)s
        AND project = '%(project)s'
        AND country_code = '%(country)s'
    GROUP BY year, month, day, hour, access_method
    """
    params = {
        'country': country,
        'project': project,
        'time': get_hive_timespan(start, stop),
    }
    d = query_hive_ssh(query % params, 'pvquery' + country + project,
                       priority=True, delete=True)
    # build a datetime index from the year/month/day/hour columns
    dt = (d['year'].map(str) + '-' + d['month'].map(str) + '-' +
          d['day'].map(str) + ' ' + d['hour'].map(str) + ':00')
    d.index = pd.to_datetime(dt)
    del d['year']
    del d['month']
    del d['day']
    del d['hour']
    return d

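# Example call (arguments are hypothetical; get_hive_timespan is expected to
# expand the start/stop strings into the year/month/day/hour predicate):
#
#   pv = get_pageviews('2016-01-01', '2016-01-31', 'FR', 'fr.wikipedia')
#   pv[pv.access_method == 'desktop'].pageviews.resample('D').sum()
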
def create_hive_view_proportion_deltas(transition_date, n_pre, n_post):
    query = """
    DROP TABLE IF EXISTS censorship.deltas;
    CREATE TABLE censorship.deltas AS
    SELECT
        (post.proportion - pre.proportion) / pre.proportion AS delta,
        post.country,
        post.project,
        post.page_title
    FROM
        (SELECT
            SUM(proportion) / %(n_pre)d AS proportion,
            country, project, page_title
        FROM censorship.daily_ts2
        WHERE day < '%(date)s'
        GROUP BY country, project, page_title) pre
    JOIN
        (SELECT
            SUM(proportion) / %(n_post)d AS proportion,
            country, project, page_title
        FROM censorship.daily_ts2
        WHERE day >= '%(date)s'
        GROUP BY country, project, page_title) post
    ON (pre.country = post.country
        AND pre.project = post.project
        AND pre.page_title = post.page_title)
    """
    query %= {'date': transition_date, 'n_pre': n_pre, 'n_post': n_post}
    query_hive_ssh(query, 'ts', priority=True)

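# Example rebuild of censorship.deltas (transition date is hypothetical;
# n_pre / n_post are the day counts of the two windows, used to turn the
# summed daily proportions into per-day averages before taking the delta):
#
#   create_hive_view_proportion_deltas('2015-06-17', n_pre=47, n_post=44)
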
def query_country_deltas(country, n):
    # This is broken!
    query = """
    SELECT country, project, page_title, delta
    FROM censorship.deltas
    WHERE country = '%s'
    ORDER BY delta DESC
    LIMIT %d
    """
    return query_hive_ssh(query % (country, n), 'deltas.tsv')

def get_all_features(cp_dict, span_comparison, ts_table_name,
                     limit=100000000, min_views=100):
    params = copy.deepcopy(span_comparison.params)
    params['cp_conditions'] = get_country_project_condition(cp_dict)
    params['min_views'] = min_views
    params['ts_table_name'] = ts_table_name
    params['limit'] = limit

    query = """
    SELECT
        cmp_table.*,
        proportion_ts,
        ts
    FROM
        (SELECT
            country, project, page_title,
            CONCAT_WS(' ', COLLECT_SET(day_proportion)) AS proportion_ts,
            CONCAT_WS(' ', COLLECT_SET(day_n)) AS ts
        FROM
            (SELECT
                country, project, page_title,
                CONCAT(day, '|', proportion) AS day_proportion,
                CONCAT(day, '|', n) AS day_n,
                n
            FROM %(db)s.%(ts_table_name)s
            WHERE %(cp_conditions)s) a
        GROUP BY country, project, page_title
        HAVING SUM(n) > %(min_views)d
        LIMIT %(limit)d) ts_table
    JOIN %(db)s.%(span_table)s cmp_table
    ON (cmp_table.c = ts_table.country
        AND cmp_table.p = ts_table.project
        AND cmp_table.t = ts_table.page_title)
    """
    query %= params
    return query_hive_ssh(query, 'ts.tsv', priority=True)

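# Example invocation (all values hypothetical; span_comparison is assumed to
# carry 'db' and 'span_table' in its .params, as the query template requires,
# and cp_dict is whatever get_country_project_condition expects):
#
#   features = get_all_features(cp_dict, span_cmp, 'daily_ts2', min_views=500)
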
def query_deltas(cmp):
    """
    Pull a dataframe with popular articles where pageviews at least doubled.
    """
    query = """
    SELECT
        *,
        (post_n_tpc / 3 - pre_n_tpc) / pre_n_tpc AS clean_tpc_view_delta
    FROM censorship.20150515_20150528__20150617_20150730
    WHERE post_n_tpc > 300
        AND pre_n_wd > 1000
        AND (post_n_tpc / 3 - pre_n_tpc) / pre_n_tpc > 2.0
    """
    df = query_hive_ssh(query, 'get_PVSpanComparison_df', priority=True)
    # strip Hive table-name prefixes from column names
    df.columns = [c.split('.')[1] if len(c.split('.')) == 2 else c for c in df.columns]
    df.sort_values('normalized_tpc_view_proportion_delta', inplace=True, ascending=False)
    return df

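# The division by 3 presumably rescales the longer post window to the length
# of the pre window; the same delta recomputed in pandas as a sanity check
# on the returned frame:
#
#   df = query_deltas(None)   # cmp is unused by this query
#   delta = (df.post_n_tpc / 3 - df.pre_n_tpc) / df.pre_n_tpc
#   assert (delta > 2.0).all()
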
def get_local_ts(ids, en_titles=True):
    params = {
        'id_condition': get_id_conditions(ids, en_title=en_titles),
    }
    query = """
    SELECT *
    FROM censorship.daily_ts2
    WHERE %(id_condition)s
    """
    df = query_hive_ssh(query % params, 'ts', priority=True)
    # strip Hive table-name prefixes and index by day
    df.columns = [c.split('.')[1] for c in df.columns]
    df.index = pd.to_datetime(df.day)
    return df

def get_impressions_by_banner_count(start, stop):
    """
    Gets all impression data within the time range start:stop.
    Groups data by banner, campaign and number of impressions seen.
    """
    query = """
    SELECT
        impressions_seen,
        CONCAT_WS(' ', banner, campaign, day) AS name,
        n
    FROM ellery.banner_count
    WHERE day BETWEEN '%(start)s' AND '%(stop)s';
    """
    query = query % {'start': start, 'stop': stop}
    d = query_hive_ssh(query, 'impressions_by_count.tsv')
    d.index = d.impressions_seen
    d.drop('impressions_seen', axis=1, inplace=True)
    return d

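# Example pull (dates hypothetical; ellery.banner_count is assumed to hold a
# count n per banner / campaign / day / impressions_seen combination):
#
#   imp = get_impressions_by_banner_count('2015-12-01', '2015-12-31')
#   imp.groupby(imp.index).n.sum()   # distribution over impressions seen
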
def query_span_comparison(span_cmp, country,
                          min_post_article_view=100,
                          min_wikidata_item_view=500):
    """
    Pull a dataframe with popular articles where pageviews at least doubled.
    """
    params = copy.deepcopy(span_cmp.params)
    params['country'] = country
    params['min_post_article_view'] = min_post_article_view
    params['min_wikidata_item_view'] = min_wikidata_item_view

    query = """
    SELECT *
    FROM %(db)s.%(span_table)s
    WHERE post_n_tpc > %(min_post_article_view)d
        AND (pre_n_wd + post_n_wd) > %(min_wikidata_item_view)d
        AND (post_n_tpc / pre_n_tpc) > 2.0
        AND c RLIKE '%(country)s'
    """
    df = query_hive_ssh(query % params, 'get_PVSpanComparison_df', priority=True)
    # strip Hive table-name prefixes from column names
    df.columns = [c.split('.')[1] if len(c.split('.')) == 2 else c for c in df.columns]
    return df

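# Example call (the country argument is used as an RLIKE pattern, so a bare
# ISO code works; thresholds shown are just the defaults made explicit):
#
#   spikes = query_span_comparison(span_cmp, 'IR',
#                                  min_post_article_view=100,
#                                  min_wikidata_item_view=500)
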
def check_normalization(pvsc, article):
    params = copy.copy(pvsc.params)
    params['article'] = article
    query = """
    SELECT *
    FROM %(db)s.%(span_table)s
    WHERE en_page_title = '%(article)s'
    """
    df = query_hive_ssh(query % params, 'get_PVSpanComparison_df')
    # strip Hive table-name prefixes from column names
    df.columns = [c.split('.')[1] for c in df.columns]
    df.sort_values('normalized_wdc_view_proportion_delta', inplace=True, ascending=False)
    return df

def create_hive_ts(cp_dict, start, stop):
    query = """
    DROP TABLE IF EXISTS censorship.daily_ts2;
    CREATE TABLE censorship.daily_ts2 AS
    SELECT
        CONCAT(ts.year, '-', LPAD(ts.month, 2, '0'), '-', LPAD(ts.day, 2, '0')) AS day,
        ts.country,
        ts.project,
        ts.page_title,
        ts.n,
        ts.n / agg.n_agg AS proportion,
        wd.en_page_title
    FROM
        (SELECT
            year, month, day, country, project, page_title,
            SUM(view_count) AS n
        FROM wmf.pageview_hourly
        WHERE agent_type = 'user'
            AND page_title NOT RLIKE ':'
            AND %(cp_conditions)s
            AND %(time_conditions)s
        GROUP BY year, month, day, country, project, page_title) ts
    LEFT JOIN
        (SELECT
            year, month, day, project, page_title,
            SUM(view_count) AS n_agg
        FROM wmf.pageview_hourly
        WHERE agent_type = 'user'
            AND page_title NOT RLIKE ':'
            AND %(time_conditions)s
        GROUP BY year, month, day, project, page_title) agg
    ON (ts.year = agg.year
        AND ts.month = agg.month
        AND ts.day = agg.day
        AND ts.project = agg.project
        AND ts.page_title = agg.page_title)
    LEFT JOIN censorship.wikidata wd
    ON (ts.page_title = wd.page_title AND ts.project = wd.project);
    """
    params = {
        'cp_conditions': get_country_project_condition(cp_dict),
        'time_conditions': get_hive_timespan(start, stop),
    }
    query %= params
    query_hive_ssh(query, 'ts', priority=True)

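# Minimal end-to-end sketch (dates and the cp_dict shape are illustrative;
# get_country_project_condition defines the real cp_dict format):
#
#   cp = {'CN': ['zh.wikipedia']}
#   create_hive_ts(cp, '2015-05-01', '2015-07-30')
#   ts = get_local_ts(ids)   # then pull per-article daily series (see above)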