def get_month_pageviews_post_hive():
    """
    Get monthly pageview counts per (project, country, device)
    from hive.

    Returns a DataFrame with columns n, country_iso, project,
    access_method and timestamp, indexed by a 'YYYY-MM-01' timestamp
    string built from the year/month columns.
    """

    query = """
    SET mapred.job.queue.name=priority;
    SELECT
    sum(view_count) as n,
    year,
    month,
    country_code as country_iso, 
    project, 
    access_method
    FROM wmf.projectview_hourly
    WHERE agent_type = 'user'
    AND project RLIKE 'wikipedia'
    AND YEAR >= 2015
    group by year, month, country_code, project, access_method;
    """

    df = query_hive_ssh(query, 'forecasting_refresh', priority=True, delete=True)
    # Build a 'YYYY-MM-01' timestamp string; zfill pads months to two
    # digits (e.g. '3' -> '03').  The original also called
    # df['month'].value_counts() here and discarded the result — removed.
    df['month'] = df['month'].astype(str).str.zfill(2)
    df['year'] = df['year'].astype(str)
    df['timestamp'] = df['year'] + '-' + df['month'] + '-01'
    del df['month']
    del df['year']
    df.index = df['timestamp']

    return df
# --- Example #2 ---
def get_pageviews(start, stop, country, project):
    """Fetch hourly pageview totals (split by access method) for one
    country/project pair between start and stop, indexed by timestamp."""

    query = """
    SELECT year, month, day, hour, SUM(view_count) as pageviews, access_method FROM wmf.projectview_hourly
    WHERE agent_type = 'user'
    AND %(time)s
    AND project = '%(project)s'
    AND country_code = '%(country)s'
    GROUP BY year, month, day, hour, access_method
    """

    params = {
        'time': get_hive_timespan(start, stop),
        'country': country,
        'project': project,
    }
    frame = query_hive_ssh(query % params,
                           'pvquery' + country + project,
                           priority=True,
                           delete=True)

    # Assemble an hourly datetime index from the year/month/day/hour
    # columns, then drop those columns.
    stamp = (frame["year"].map(str) + '-' + frame["month"].map(str) + '-' +
             frame["day"].map(str) + ' ' + frame["hour"].map(str) + ':00')
    frame.index = pd.to_datetime(stamp)

    for column in ('year', 'month', 'day', 'hour'):
        del frame[column]
    return frame
# --- Example #3 ---
def create_hive_view_proportion_deltas(transition_date, n_pre, n_post):
    """
    Drop and rebuild the hive table censorship.deltas.

    For every (country, project, page_title) present both before and
    after `transition_date`, the table stores the relative change in
    mean daily view proportion: (post - pre) / pre, where pre and post
    are the summed proportions from censorship.daily_ts2 divided by
    `n_pre` / `n_post` respectively (presumably the number of days in
    each window — TODO confirm against the caller).

    Side effect only: executes the DDL via query_hive_ssh; returns None.
    Note the inner join drops titles seen in only one window.
    """
    query = """
    DROP TABLE IF EXISTS censorship.deltas;
    CREATE TABLE censorship.deltas
     AS SELECT
        (post.proportion - pre.proportion) / pre.proportion AS delta,
        post.country, 
        post.project, 
        post.page_title
    FROM
    (SELECT 
        SUM(proportion) / %(n_pre)d AS proportion,
        country, 
        project, 
        page_title
    FROM censorship.daily_ts2
        WHERE day < '%(date)s'
    GROUP BY 
        country, 
        project, 
        page_title) pre  
    JOIN 
    (SELECT 
        SUM(proportion)/%(n_post)d AS proportion,
        country, 
        project, 
        page_title
    FROM censorship.daily_ts2
        WHERE day >= '%(date)s'
    GROUP BY 
        country, 
        project, 
        page_title
        ) post
    ON (
        pre.country = post.country
        AND pre.project = post.project
        AND pre.page_title = post.page_title
    )
    """

    # %d for the counts and %s for the date: n_pre/n_post must be ints.
    query %= {'date': transition_date, 'n_pre': n_pre, 'n_post': n_post}
    query_hive_ssh(query, 'ts', priority = True)
# --- Example #4 ---
def query_country_deltas(country, n):  # NOTE(review): original was flagged
    # "This is broken!" — the breakage is not visible from this file;
    # verify against query_hive_ssh before relying on the result.
    """Return the top-n rows of censorship.deltas for one country,
    ordered by delta descending."""
    query = """
    SELECT 
        country,
        project,
        page_title,
        delta
    FROM censorship.deltas
    WHERE country = '%s'
    ORDER BY delta DESC
    LIMIT %d
    """
    filled = query % (country, n)
    return query_hive_ssh(filled, 'deltas.tsv')
def get_all_features(cp_dict, span_comparison, ts_table_name, limit = 100000000, min_views = 100):
    """
    Join per-article daily time series onto the span-comparison table.

    The inner query serializes each article's series as space-separated
    'day|proportion' and 'day|n' strings (COLLECT_SET / CONCAT_WS), keeps
    articles with more than `min_views` total views, and joins the result
    to %(db)s.%(span_table)s from span_comparison.params.

    cp_dict: country/project spec for get_country_project_condition.
    span_comparison: object whose .params supplies the db/span_table keys.
    ts_table_name: hive table holding the daily time series.
    limit: cap on the number of time-series rows pulled.
    min_views: minimum SUM(n) for an article to be included.
    """
    params = copy.deepcopy(span_comparison.params)
    params['cp_conditions'] = get_country_project_condition(cp_dict)
    params['min_views'] = min_views
    params['ts_table_name'] = ts_table_name
    params['limit'] = limit

    query = """
    SELECT 
        cmp_table.*,
        proportion_ts,
        ts
    FROM
        (SELECT
            country, 
            project,
            page_title, 
            CONCAT_WS(' ', COLLECT_SET(day_proportion)) as proportion_ts,
            CONCAT_WS(' ', COLLECT_SET(day_n)) as ts
        FROM (
            SELECT
                country, 
                project, 
                page_title,
                CONCAT(day, '|', proportion) as day_proportion,
                CONCAT(day, '|', n) as day_n,
                n
            FROM %(db)s.%(ts_table_name)s
            WHERE %(cp_conditions)s
            ) a
        GROUP BY
            country, 
            project, 
            page_title
        HAVING 
            SUM(n) > %(min_views)d
        LIMIT %(limit)d) ts_table
    JOIN %(db)s.%(span_table)s cmp_table
    ON (
            cmp_table.c = ts_table.country
        AND cmp_table.p = ts_table.project
        AND cmp_table.t = ts_table.page_title
        )
    """
    # Removed leftover commented-out debug code (print(query) / return).
    return query_hive_ssh(query % params, 'ts.tsv', priority=True)
# --- Example #6 ---
def query_deltas(cmp):
    """
    Pull a dataframe with popular articles where pageviews at least doubles
    (clean TPC view delta, post window normalized by 3), sorted by
    normalized_tpc_view_proportion_delta descending.
    """
    query = """
    SELECT *,
    (post_n_tpc / 3 - pre_n_tpc) / pre_n_tpc as clean_tpc_view_delta
    FROM censorship.20150515_20150528__20150617_20150730
    WHERE post_n_tpc > 300
    AND pre_n_wd > 1000
    AND (post_n_tpc / 3 - pre_n_tpc) / pre_n_tpc > 2.0
    """
    df = query_hive_ssh(query, 'get_PVSpanComparison_df', priority=True)
    # Strip the 'table.' prefix hive puts on column names.
    df.columns = [c.split('.')[1] if len(c.split('.')) == 2 else c for c in df.columns]
    # DataFrame.sort was removed in pandas 0.20 — use sort_values, and
    # spell the descending flag as False rather than 0.
    df.sort_values('normalized_tpc_view_proportion_delta', inplace=True, ascending=False)
    return df
# --- Example #7 ---
def get_local_ts(ids, en_titles = True):
    """
    Fetch all daily time-series rows from censorship.daily_ts2 matching
    the given ids, indexed by day.

    ids / en_titles are forwarded to get_id_conditions, which builds the
    WHERE clause.
    """
    params = {
        'id_condition': get_id_conditions(ids, en_title=en_titles),
    }

    query = """
    SELECT *
    FROM censorship.daily_ts2
    WHERE %(id_condition)s
    """

    df = query_hive_ssh(query % params, 'ts', priority=True)
    # Strip the 'table.' prefix hive puts on column names.  Guard names
    # without a '.' (the original raised IndexError on them); this matches
    # the guarded rename used elsewhere in this file.
    df.columns = [c.split('.')[1] if len(c.split('.')) == 2 else c for c in df.columns]
    df.index = pd.to_datetime(df.day)
    return df
# --- Example #8 ---
def get_impressions_by_banner_count(start, stop):
    """
    Gets all impression data within the time range start:stop
    Groups data by banner, campaign and number of impressions seen

    Returns a DataFrame indexed by impressions_seen with columns
    name ('banner campaign day') and n.
    """
    # Removed two unused locals from the original: data_dict = {} and
    # params = get_time_limits(start, stop) — the query is formatted
    # directly with start/stop below.
    query = """
    SELECT impressions_seen, CONCAT_WS(' ', banner, campaign, day) as name, n
    FROM ellery.banner_count
    WHERE day BETWEEN '%(start)s' AND '%(stop)s';
    """
    query = query % {'start': start, 'stop': stop}
    d = query_hive_ssh(query, 'impressions_by_count.tsv')

    # Promote impressions_seen to the index and drop it as a column.
    d.index = d.impressions_seen
    d.drop('impressions_seen', axis=1, inplace=True)

    return d
def query_span_comparison(span_cmp, country, min_post_article_view = 100, min_wikidata_item_view = 500 ):
    """
    Pull a dataframe with popular articles where pageviews at least doubles
    """
    # Start from a deep copy of the span-comparison's own params
    # (db, span_table, ...) and layer this query's thresholds on top.
    params = copy.deepcopy(span_cmp.params)
    params.update({
        'country': country,
        'min_post_article_view': min_post_article_view,
        'min_wikidata_item_view': min_wikidata_item_view,
    })

    query = """
    SELECT *
    FROM %(db)s.%(span_table)s
    WHERE post_n_tpc > %(min_post_article_view)d
    AND (pre_n_wd + post_n_wd) > %(min_wikidata_item_view)d
    AND (post_n_tpc / pre_n_tpc) > 2.0
    AND c RLIKE '%(country)s'
    """
    df = query_hive_ssh(query % params, 'get_PVSpanComparison_df', priority=True)

    # Strip the 'table.' prefix hive adds to column names, leaving
    # prefix-free names untouched.
    renamed = []
    for col in df.columns:
        parts = col.split('.')
        renamed.append(parts[1] if len(parts) == 2 else col)
    df.columns = renamed
    return df
# --- Example #10 ---
def get_pageviews(start, stop, country, project):
    """Hourly pageview counts (per access method) for one country and
    project between start and stop, indexed by timestamp.

    NOTE(review): this redefines get_pageviews from earlier in the file;
    being later, this definition is the one in effect after import.
    """
    query = """
    SELECT year, month, day, hour, SUM(view_count) as pageviews, access_method FROM wmf.projectview_hourly
    WHERE agent_type = 'user'
    AND %(time)s
    AND project = '%(project)s'
    AND country_code = '%(country)s'
    GROUP BY year, month, day, hour, access_method
    """

    filled = query % {
        'country': country,
        'project': project,
        'time': get_hive_timespan(start, stop),
    }
    result = query_hive_ssh(filled,
                            'pvquery' + country + project,
                            priority=True,
                            delete=True)

    # Combine the y/m/d/h columns into one datetime index, then drop them.
    stamps = (result["year"].map(str) + '-' + result["month"].map(str) +
              '-' + result["day"].map(str) + ' ' + result["hour"].map(str) +
              ':00')
    result.index = pd.to_datetime(stamps)

    for column in ['year', 'month', 'day', 'hour']:
        del result[column]
    return result
# --- Example #11 ---
def get_impressions_by_banner_count(start, stop):
    """
    Gets all impression data within the time range start:stop
    Groups data by banner, campaign and number of impressions seen

    Returns a DataFrame indexed by impressions_seen with columns
    name ('banner campaign day') and n.  Duplicate of the earlier
    definition in this file; later one wins at import time.
    """
    # Removed two unused locals from the original: data_dict = {} and
    # params = get_time_limits(start, stop) — the query is formatted
    # directly with start/stop below.
    query = """
    SELECT impressions_seen, CONCAT_WS(' ', banner, campaign, day) as name, n
    FROM ellery.banner_count
    WHERE day BETWEEN '%(start)s' AND '%(stop)s';
    """
    query = query % {'start': start, 'stop': stop}
    d = query_hive_ssh(query, 'impressions_by_count.tsv')

    # Promote impressions_seen to the index and drop it as a column.
    d.index = d.impressions_seen
    d.drop('impressions_seen', axis=1, inplace=True)

    return d
# --- Example #12 ---
def check_normalization(pvsc, article):
    """
    Fetch span-comparison rows for a single article (matched on its
    English title), sorted by normalized_wdc_view_proportion_delta
    descending.  pvsc.params must supply the db/span_table keys.
    """
    params = copy.copy(pvsc.params)
    params['article'] = article
    query = """
    SELECT *
    FROM %(db)s.%(span_table)s
    WHERE en_page_title = '%(article)s'
    """
    df = query_hive_ssh(query % params, 'get_PVSpanComparison_df')
    # Strip the 'table.' prefix hive puts on column names.
    df.columns = [c.split('.')[1] for c in df.columns]
    # DataFrame.sort was removed in pandas 0.20 — use sort_values, and
    # spell the descending flag as False rather than 0.
    df.sort_values('normalized_wdc_view_proportion_delta', inplace=True, ascending=False)
    return df









        
# --- Example #13 ---
def create_hive_ts(d, start, stop):
    """
    Drop and rebuild censorship.daily_ts2: one row per
    (day, country, project, page_title) with the raw view count n, its
    proportion of that (day, project, page_title)'s total views across
    all countries, and the English title from censorship.wikidata.

    d: country/project spec passed to get_country_project_condition.
       (Bug fix: the original passed an undefined global `cp_dict` here
       while leaving this parameter unused, raising NameError unless a
       module-level cp_dict happened to exist — verify callers.)
    start, stop: passed to get_hive_timespan to bound the scan.

    Side effect only: executes the DDL via query_hive_ssh; returns None.
    """
    query = """
        DROP TABLE IF EXISTS censorship.daily_ts2;
        CREATE TABLE censorship.daily_ts2
        AS SELECT 
            CONCAT(ts.year,'-',LPAD(ts.month,2,'0'),'-',LPAD(ts.day,2,'0')) as day,
            ts.country, 
            ts.project, 
            ts.page_title,
            ts.n,
            ts.n / agg.n_agg as proportion,
            wd.en_page_title
        FROM 
            (SELECT
                year, 
                month, 
                day, 
                country, 
                project, 
                page_title,
                SUM(view_count) as n
            FROM wmf.pageview_hourly
                WHERE agent_type = 'user'
                AND page_title not RLIKE ':'
                AND %(cp_conditions)s
                AND %(time_conditions)s
            GROUP BY
                year,
                month,
                day,
                country,
                project,
                page_title
            ) ts
        LEFT JOIN
            (SELECT
                year, 
                month, 
                day, 
                project, 
                page_title,
                SUM(view_count) as n_agg
            FROM wmf.pageview_hourly
                WHERE agent_type = 'user'
                AND page_title not RLIKE ':'
                AND %(time_conditions)s
            GROUP BY
                year,
                month,
                day,
                project,
                page_title
            ) agg
            ON (    ts.year = agg.year
                AND ts.month = agg.month
                AND ts.day = agg.day
                AND ts.project = agg.project
                AND ts.page_title = agg.page_title)
        LEFT JOIN censorship.wikidata wd
            ON (ts.page_title = wd.page_title AND ts.project = wd.project);
    """
    params = {'cp_conditions': get_country_project_condition(d),
              'time_conditions': get_hive_timespan(start, stop),
              }
    query %= params
    query_hive_ssh(query, 'ts', priority=True)