Example #1
# get_hive_timespan and query_hive_ssh are helpers from the surrounding repository.
import pandas as pd  # needed for pd.to_datetime below

def get_pageviews(start, stop, country, project):

    query = """
    SELECT year, month, day, hour, SUM(view_count) as pageviews, access_method FROM wmf.projectview_hourly
    WHERE agent_type = 'user'
    AND %(time)s
    AND project = '%(project)s'
    AND country_code = '%(country)s'
    GROUP BY year, month, day, hour, access_method
    """

    params = {
        'country': country,
        'project': project,
        'time': get_hive_timespan(start, stop)
    }
    d = query_hive_ssh(query % params,
                       'pvquery' + country + project,
                       priority=True,
                       delete=True)
    dt = d["year"].map(str) + '-' + d["month"].map(str) + '-' + d["day"].map(
        str) + ' ' + d["hour"].map(str) + ':00'
    d.index = pd.to_datetime(dt)

    del d['year']
    del d['month']
    del d['day']
    del d['hour']
    return d
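A minimal usage sketch for get_pageviews, assuming get_hive_timespan and query_hive_ssh from the surrounding repository are importable; the country code, project string, and date format are guesses at the expected inputs.

# Hypothetical call: one week of hourly pageviews, rolled up to daily totals.
pv = get_pageviews('2016-01-01', '2016-01-07', 'DE', 'de.wikipedia')
daily = pv['pageviews'].resample('D').sum()  # works because the index is a DatetimeIndex
print(daily)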
Example #2
def get_pageviews(start, stop, country, project):
    
    query = """
    SELECT year, month, day, hour, SUM(view_count) as pageviews, access_method FROM wmf.projectview_hourly
    WHERE agent_type = 'user'
    AND %(time)s
    AND project = '%(project)s'
    AND country_code = '%(country)s'
    GROUP BY year, month, day, hour, access_method
    """
    
    params = {'country': country, 'project': project, 'time': get_hive_timespan(start, stop) }
    d = query_hive_ssh(query % params, 'pvquery' + country + project, priority = True, delete = True)
    dt = d["year"].map(str) + '-' + d["month"].map(str) + '-' + d["day"].map(str) + ' ' + d["hour"].map(str) + ':00'
    d.index = pd.to_datetime(dt)

    del d['year']
    del d['month']
    del d['day']
    del d['hour']
    return d
Example #3
def __init__(self, start, stop, db, dry=False):

    # Derive per-run Hive table names from the date range; unless dry,
    # build the intermediate tables and join them.
    basename = start.replace('-', '') + '_' + stop.replace('-', '')

    self.params = {
        'basename': basename,
        'start': start,
        'stop': stop,
        'tpc_table': basename + '_tpc',
        'wdc_table': basename + '_wdc',
        'tp_table': basename + '_tp',
        'wd_table': basename + '_wd',
        'c_table': basename + '_c',
        'db': db,
        'time_conditon': get_hive_timespan(start, stop)
    }

    if not dry:
        self.create_tpc_table()
        self.create_wdc_table()
        self.create_tp_table()
        self.create_wd_table()
        self.create_c_table()
        self.join_and_clean()
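For context, a sketch of how a class built around this __init__ might be driven; the class name TraceTableBuilder is hypothetical, and the create_* and join_and_clean methods are referenced but not shown in the snippet.

# Hypothetical class name -- only __init__ appears in the example above.
builder = TraceTableBuilder('2016-03-01', '2016-03-07', db='traces', dry=True)

# With dry=True no Hive tables are created; the derived names can be inspected.
print(builder.params['basename'])   # '20160301_20160307'
print(builder.params['tpc_table'])  # '20160301_20160307_tpc'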
Example #4
def get_clickstream(table, lang, start, stop, priority = False, min_count = 10):

    params = {  'time_conditions': get_hive_timespan(start, stop, hour = False),
                'table': table,
                'lang': lang,
                'min_count': min_count,
                }



    query = """

    -- ############################################
    -- create helper tables


    -- create copy of page table and insert rows for our special prev pages
    -- this will let us work with ids instead of titles later, which is much less error prone

    DROP TABLE IF EXISTS clickstream.%(table)s_page_helper;
    CREATE TABLE clickstream.%(table)s_page_helper AS
    SELECT
        *
    FROM
        clickstream.%(lang)s_page
    ;

    INSERT INTO TABLE clickstream.%(table)s_page_helper 
    SELECT
        -1 AS page_id,
        0 AS page_namespace,
        false AS page_is_redirect,
        'other-empty' AS page_title 
    FROM   clickstream.%(table)s_page_helper 
    LIMIT   1;


    INSERT INTO TABLE clickstream.%(table)s_page_helper 
    SELECT
        -2 AS page_id,
        0 AS page_namespace,
        false AS page_is_redirect,
        'other-internal' AS page_title 
    FROM   clickstream.%(table)s_page_helper 
    LIMIT   1;

    INSERT INTO TABLE clickstream.%(table)s_page_helper 
    SELECT
        -3 AS page_id,
        0 AS page_namespace,
        false AS page_is_redirect,
        'other-external' AS page_title 
    FROM   clickstream.%(table)s_page_helper 
    LIMIT   1;

    INSERT INTO TABLE clickstream.%(table)s_page_helper 
    SELECT
        -4 AS page_id,
        0 AS page_namespace,
        false AS page_is_redirect,
        'other-search' AS page_title 
    FROM   clickstream.%(table)s_page_helper 
    LIMIT   1;

    INSERT INTO TABLE clickstream.%(table)s_page_helper 
    SELECT
        -5 AS page_id,
        0 AS page_namespace,
        false AS page_is_redirect,
        'other-other' AS page_title 
    FROM   clickstream.%(table)s_page_helper 
    LIMIT   1;


    -- create pagelinks table that resolves links that end in a redirect
    -- this means that if A links to B, and B redirects to C, we replace the link (A,B) with (A,C)
    -- this lets us properly annotate link types after resolving redirects in the clickstream, since
    -- a user will experience following A as if it linked to C
    -- the group by ensures that each link only occurs once

    DROP TABLE IF EXISTS clickstream.%(table)s_pagelinks_helper;
    CREATE TABLE clickstream.%(table)s_pagelinks_helper AS
    SELECT
        pl_from_page_id,
        pl_to_page_id
    FROM
        (SELECT
            pl_from_page_id,
            CASE
                WHEN r.rd_to_page_id IS NULL THEN pl_to_page_id
                ELSE rd_to_page_id
            END AS pl_to_page_id
        FROM
            clickstream.%(lang)s_pagelinks l
        LEFT JOIN
            clickstream.%(lang)s_redirect r ON (r.rd_from_page_id = l.pl_to_page_id)            
        ) a
    GROUP BY
        pl_from_page_id,
        pl_to_page_id
    ;

    -- ############################################




    -- extract raw prev, curr pairs

    DROP VIEW IF EXISTS clickstream.%(table)s_temp1;
    CREATE VIEW clickstream.%(table)s_temp1 AS
    SELECT 
        CASE
            -- empty or malformed referer
            WHEN referer IS NULL THEN 'other-empty'
            WHEN referer == '' THEN 'other-empty'
            WHEN referer == '-' THEN 'other-empty'
            WHEN parse_url(referer,'HOST') is NULL THEN 'other-empty'
            -- internal referer from the same wikipedia
            WHEN 
                parse_url(referer,'HOST') in ('%(lang)s.wikipedia.org', '%(lang)s.m.wikipedia.org')
                AND LENGTH(REGEXP_EXTRACT(parse_url(referer,'PATH'), '/wiki/(.*)', 1)) > 1
            THEN REGEXP_EXTRACT(parse_url(referer,'PATH'), '/wiki/(.*)', 1)
            -- other referers 
            WHEN referer_class = 'internal' THEN 'other-internal'
            WHEN referer_class = 'external' THEN 'other-external'
            WHEN referer_class = 'external (search engine)' THEN 'other-search'
            ELSE 'other-other'
        END as prev,
        pageview_info['page_title'] as curr
    FROM
        wmf.webrequest
    WHERE 
        %(time_conditions)s
        AND webrequest_source = 'text'
        AND normalized_host.project_class = 'wikipedia'
        AND normalized_host.project = '%(lang)s'
        AND is_pageview 
        AND agent_type = 'user'
    ;



    -- count raw prev, curr pairs, this speeds up later queries

    DROP TABLE IF EXISTS clickstream.%(table)s_temp2;
    CREATE TABLE clickstream.%(table)s_temp2 AS
    SELECT
        prev, curr, COUNT(*) as n
    FROM
        clickstream.%(table)s_temp1
    GROUP BY 
        prev, curr
    ;


    -- we enforce that curr and prev are main namespace pages
    -- the joins accomplish this because, in the logs, the non main namespace pages have the namespace prepended
    -- at this point curr and prev are ids

    DROP TABLE IF EXISTS clickstream.%(table)s_temp3;
    CREATE TABLE clickstream.%(table)s_temp3 AS
    SELECT 
        pp.page_id as prev,
        pc.page_id as curr,
        n
    FROM
        clickstream.%(table)s_temp2
    JOIN
        clickstream.%(table)s_page_helper pp ON (prev = pp.page_title)
    JOIN
        clickstream.%(table)s_page_helper pc ON (curr = pc.page_title)
    WHERE
        pp.page_namespace = 0
        AND pc.page_namespace = 0
    ;



    -- resolve curr redirects, one step
    -- note that prev should not be a redirect, so we do not bother resolving it
    -- and prev redirects will be filtered out at the end

    DROP TABLE IF EXISTS clickstream.%(table)s_temp4;
    CREATE TABLE clickstream.%(table)s_temp4 AS
    SELECT 
        prev,
        CASE
            WHEN rd_to_page_id IS NULL THEN curr
            ELSE rd_to_page_id
        END AS curr,
        n
    FROM
        clickstream.%(table)s_temp3
    LEFT JOIN
        clickstream.%(lang)s_redirect ON (curr = rd_from_page_id)
    ;

    -- re-aggregate after resolving redirects and filter out pairs that occur infrequently

    DROP TABLE IF EXISTS clickstream.%(table)s_temp5;
    CREATE TABLE clickstream.%(table)s_temp5 AS
    SELECT
        prev, curr, SUM(n) as n
    FROM
        clickstream.%(table)s_temp4
    GROUP BY
        prev, curr
    HAVING
        SUM(n) > %(min_count)s
    ;



    -- annotate link types

    DROP TABLE IF EXISTS clickstream.%(table)s_temp6;
    CREATE TABLE clickstream.%(table)s_temp6 AS
    SELECT
        prev,
        curr,
        CASE
            WHEN prev < 0 THEN 'external'
            WHEN (pl_from_page_id IS NOT NULL AND pl_to_page_id IS NOT NULL) THEN 'link'
            ELSE 'other'
        END AS type,
        n
    FROM
        clickstream.%(table)s_temp5
    LEFT JOIN
        clickstream.%(table)s_pagelinks_helper ON (prev = pl_from_page_id AND curr = pl_to_page_id)
    ;



    -- create final table
    -- remove self loops
    -- restrict prev and curr to main namespace, no redirects
    -- get page titles

    DROP TABLE IF EXISTS clickstream.%(table)s;
    CREATE TABLE clickstream.%(table)s
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE AS
    SELECT
        pp.page_title as prev,
        pc.page_title as curr,
        a.type,
        a.n
    FROM
        clickstream.%(table)s_temp6 a
    JOIN
        clickstream.%(table)s_page_helper pp ON (prev = pp.page_id)
    JOIN
        clickstream.%(table)s_page_helper pc ON (curr = pc.page_id)
    WHERE
        pp.page_is_redirect = false
        AND pp.page_namespace = 0
        AND pc.page_is_redirect = false
        AND pc.page_namespace = 0
        AND a.curr != a.prev
    ;



    DROP VIEW clickstream.%(table)s_temp1;
    DROP TABLE clickstream.%(table)s_temp2;
    DROP TABLE clickstream.%(table)s_temp3;
    DROP TABLE clickstream.%(table)s_temp4;
    DROP TABLE clickstream.%(table)s_temp5;
    DROP TABLE clickstream.%(table)s_temp6;
    DROP TABLE clickstream.%(table)s_page_helper;
    DROP TABLE clickstream.%(table)s_pagelinks_helper;
    """

    exec_hive_stat2(query % params, priority = priority)
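A hedged usage sketch for get_clickstream: it assumes the clickstream.<lang>_page, _pagelinks, and _redirect helper tables referenced in the query already exist for the chosen language, and that exec_hive_stat2 and get_hive_timespan are importable from the repository.

# Hypothetical invocation: February 2016 English Wikipedia clickstream,
# keeping (prev, curr) pairs seen more than 10 times.
get_clickstream(
    table='enwiki_2016_02',   # final output lands in clickstream.enwiki_2016_02
    lang='en',                # expects clickstream.en_page / en_pagelinks / en_redirect
    start='2016-02-01',
    stop='2016-03-01',
    priority=True,
    min_count=10,
)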
Example #5
def get_requests(start, stop, table,  trace_db = 'a2v', prod_db = 'prod', priority = True, min_count=50):

    query = """
    SET mapreduce.input.fileinputformat.split.maxsize=200000000;
    SET hive.mapred.mode=nonstrict;


    -- get pageviews, resolve redirects, add wikidata ids

    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_pageviews;
    CREATE TABLE %(trace_db)s.%(trace_table)s_pageviews AS
    SELECT
        year,month,day,
        client_ip,
        user_agent,
        x_forwarded_for,
        ts,
        pv2.lang,
        pv2.title, 
        id
    FROM
        (SELECT
            year,month,day,
            client_ip,
            user_agent,
            x_forwarded_for,
            ts, 
            pv1.lang,
            CASE
                WHEN rd_to_page_title IS NULL THEN raw_title
                ELSE rd_to_page_title
            END AS title
        FROM
            (SELECT
                year,month,day,
                client_ip,
                user_agent,
                x_forwarded_for,
                ts, 
                normalized_host.project AS lang,
                REGEXP_EXTRACT(reflect('java.net.URLDecoder', 'decode', uri_path), '/wiki/(.*)', 1) as raw_title
            FROM
                wmf.webrequest
            WHERE 
                is_pageview
                AND webrequest_source = 'text'
                AND normalized_host.project_class = 'wikipedia'
                AND agent_type = 'user'
                AND %(time_conditions)s
                AND LENGTH(REGEXP_EXTRACT(reflect('java.net.URLDecoder', 'decode', uri_path), '/wiki/(.*)', 1)) > 0
            ) pv1
        LEFT JOIN
            (SELECT
                *
            FROM
                prod.redirect 
            WHERE
                rd_from_page_namespace = 0
                AND rd_to_page_namespace = 0
                AND lang RLIKE '.*'
            ) r
        ON
            pv1.raw_title = r.rd_from_page_title AND pv1.lang = r.lang
        ) pv2
    INNER JOIN
        %(prod_db)s.wikidata_will w ON pv2.title = w.title AND pv2.lang = w.lang;


    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_editors;
    CREATE TABLE %(trace_db)s.%(trace_table)s_editors AS
    SELECT
        client_ip,
        user_agent,
        x_forwarded_for
    FROM
        wmf.webrequest
    WHERE  
        uri_query RLIKE 'action=edit' 
        AND %(time_conditions)s
    GROUP BY
        client_ip,
        user_agent,
        x_forwarded_for;


    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_reader_pageviews;
    CREATE TABLE %(trace_db)s.%(trace_table)s_reader_pageviews AS
    SELECT
        p.*
    FROM
        %(trace_db)s.%(trace_table)s_pageviews p
    LEFT JOIN
        %(trace_db)s.%(trace_table)s_editors e
    ON (
        p.client_ip = e.client_ip
        AND p.user_agent = e.user_agent
        AND p.x_forwarded_for = e.x_forwarded_for
        )
    WHERE
        e.client_ip is NULL
        AND e.user_agent is NULL
        AND e.x_forwarded_for is NULL;


    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_clients_per_item;
    CREATE TABLE %(trace_db)s.%(trace_table)s_clients_per_item AS
    SELECT
        id, 
        COUNT(*) as n
    FROM
        (SELECT
            client_ip,
            user_agent,
            x_forwarded_for,
            id
        FROM
            %(trace_db)s.%(trace_table)s_reader_pageviews
        GROUP BY
            client_ip,
            user_agent,
            x_forwarded_for,
            id
        ) a
    GROUP BY
        id;

    -- remove disambiguation pages and pages with colon in title
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_eligible_reader_pageviews;
    CREATE TABLE %(trace_db)s.%(trace_table)s_eligible_reader_pageviews AS
    SELECT pv.*
    FROM
        (SELECT
            p.*
        FROM
            %(trace_db)s.%(trace_table)s_reader_pageviews p
        JOIN
            %(trace_db)s.%(trace_table)s_clients_per_item c
        ON (p.id = c.id)
        WHERE
            c.n >= %(min_count)s
        ) pv
    LEFT JOIN
        (SELECT
            lang,
            page_title
        FROM
            %(prod_db)s.page_props
        WHERE
            propname = 'disambiguation'
            AND lang RLIKE '.*'
            AND page_namespace = 0
        GROUP BY
            lang,
            page_title
        ) d 
    ON
        (pv.lang = d.lang and pv.title = d.page_title)
    WHERE
        d.page_title IS NULL
        AND pv.title NOT RLIKE 'disambig'
        AND pv.title NOT RLIKE ':';
            


    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_requests;
    CREATE TABLE %(trace_db)s.%(trace_table)s_requests AS
    SELECT
        CONCAT_WS('||', COLLECT_LIST(request)) AS requests
    FROM
        (SELECT
            client_ip,
            user_agent,
            x_forwarded_for,
            CONCAT('ts|', ts, '|id|', id, '|title|', title, '|lang|', lang ) AS request
        FROM %(trace_db)s.%(trace_table)s_eligible_reader_pageviews      
        ) a
    GROUP BY
        client_ip,
        user_agent,
        x_forwarded_for
    HAVING 
        COUNT(*) <= 1000
        AND COUNT(*) > 1;

    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_eligible_reader_pageviews;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_clients_per_item;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_reader_pageviews;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_editors;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_pageviews;
    """

    params = {  'time_conditions': get_hive_timespan(start, stop, hour = False),
                'trace_db': trace_db,
                'prod_db': prod_db,
                'trace_table': table,
                'min_count': min_count
                }
    exec_hive_stat2(query % params, priority = priority)
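A hedged usage sketch for get_requests: the default trace_db='a2v' and prod_db='prod' databases, and the prod.wikidata_will and prod.page_props tables the query joins against, are assumed to exist as in the original environment.

# Hypothetical run over one week of webrequest data.
get_requests(
    start='2016-03-01',
    stop='2016-03-08',
    table='reader_sessions',   # intermediate tables become a2v.reader_sessions_*
    min_count=50,              # drop items viewed by fewer than 50 distinct clients
)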
Example #6
def create_hive_ts(d, start, stop):
    
    query = """
        DROP TABLE IF EXISTS censorship.daily_ts2;
        CREATE TABLE censorship.daily_ts2
        AS SELECT 
            CONCAT(ts.year,'-',LPAD(ts.month,2,'0'),'-',LPAD(ts.day,2,'0')) as day,
            ts.country, 
            ts.project, 
            ts.page_title,
            ts.n,
            ts.n / agg.n_agg as proportion,
            wd.en_page_title
        FROM 
            (SELECT
                year, 
                month, 
                day, 
                country, 
                project, 
                page_title,
                SUM(view_count) as n
            FROM wmf.pageview_hourly
                WHERE agent_type = 'user'
                AND page_title not RLIKE ':'
                AND %(cp_conditions)s
                AND %(time_conditions)s
            GROUP BY
                year,
                month,
                day,
                country,
                project,
                page_title
            ) ts
        LEFT JOIN
            (SELECT
                year, 
                month, 
                day, 
                project, 
                page_title,
                SUM(view_count) as n_agg
            FROM wmf.pageview_hourly
                WHERE agent_type = 'user'
                AND page_title not RLIKE ':'
                AND %(time_conditions)s
            GROUP BY
                year,
                month,
                day,
                project,
                page_title
            ) agg
            ON (    ts.year = agg.year
                AND ts.month = agg.month
                AND ts.day = agg.day
                AND ts.project = agg.project
                AND ts.page_title = agg.page_title)
        LEFT JOIN censorship.wikidata wd
            ON (ts.page_title = wd.page_title AND ts.project = wd.project);
    """
    params = {'cp_conditions': get_country_project_condition(d),  # d is the country/project dict argument
              'time_conditions': get_hive_timespan(start, stop),
              }
    query %= params
    query_hive_ssh(query, 'ts', priority = True)
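A minimal sketch of calling create_hive_ts; the structure of the country/project dict is an assumption, since the snippet only shows that it is handed to get_country_project_condition.

# Hypothetical country -> project filter; the exact expected structure is not shown above.
cp = {'TR': ['tr.wikipedia'], 'RU': ['ru.wikipedia']}
create_hive_ts(cp, '2016-01-01', '2016-01-31')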
Example #7
import dateutil.parser  # needed to split the day into year/month/day partitions

def add_day_to_hive_trace_table(db_name, table_name, day, priority=True):

    query = """
    INSERT OVERWRITE TABLE %(db_name)s.%(table_name)s_by_day
    PARTITION(year=%(year)d, month=%(month)d, day =%(day)d, host)
    SELECT
        client_ip,
        user_agent,
        geocoded_data,
        user_agent_map,
        CONCAT_WS('REQUEST_DELIM', COLLECT_LIST(request)) AS requests,
        uri_host AS host
    FROM
        (SELECT
            client_ip,
            user_agent,
            geocoded_data,
            user_agent_map,
            CONCAT( 'ts|', ts,
                    '|referer|', referer,
                    '|title|', title,
                    '|uri_path|', reflect('java.net.URLDecoder', 'decode', uri_path),
                    '|uri_query|', reflect('java.net.URLDecoder', 'decode', uri_query),
                    '|is_pageview|', is_pageview,
                    '|access_method|', access_method,
                    '|referer_class|', referer_class,
                    '|project|', normalized_host.project_class,
                    '|lang|', normalized_host.project
                ) AS request,
            uri_host
        FROM
            (SELECT
                c.*,
                CASE
                    WHEN NOT is_pageview THEN NULL
                    WHEN rd_to IS NULL THEN raw_title
                    ELSE rd_to
                END AS title
            FROM
                (SELECT
                    w.*,
                    CASE
                        WHEN is_pageview THEN pageview_info['page_title']
                        ELSE round(RAND(), 5) 
                    END AS raw_title
                FROM
                    wmf.webrequest w
                WHERE 
                    webrequest_source = 'text'
                    AND agent_type = 'user'
                    AND %(time_conditions)s
                    AND hour = 1
                    AND access_method != 'mobile app'
                    AND uri_host in ('en.wikipedia.org', 'en.m.wikipedia.org')
                ) c
            LEFT JOIN
                traces.en_redirect r
            ON c.raw_title = r.rd_from
            ) b
        ) a
    GROUP BY
        client_ip,
        user_agent,
        geocoded_data,
        user_agent_map,
        uri_host
    HAVING 
        COUNT(*) < 500;
    """

    day_dt = dateutil.parser.parse(day)

    params = {  'time_conditions': get_hive_timespan(day, day, hour = False),
                'db_name': db_name,
                'table_name': table_name,
                'year' : day_dt.year,
                'month': day_dt.month,
                'day': day_dt.day
                }

    
    exec_hive_stat2(query % params, priority = priority)
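A sketch of backfilling several days with add_day_to_hive_trace_table; the database and table names are hypothetical, the partitioned *_by_day table is assumed to already exist, and note that the query above only keeps hour = 1 and the two English Wikipedia hosts.

import datetime

# Hypothetical backfill of one week into traces_db.sample_by_day.
start = datetime.date(2016, 3, 1)
for offset in range(7):
    day = (start + datetime.timedelta(days=offset)).isoformat()
    add_day_to_hive_trace_table('traces_db', 'sample', day)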
Example #8
def get_requests(start,
                 stop,
                 table,
                 trace_db='a2v',
                 prod_db='prod',
                 priority=True,
                 min_count=50):

    query = """
    SET mapreduce.input.fileinputformat.split.maxsize=200000000;
    SET hive.mapred.mode=nonstrict;


    -- get pageviews, resolve redirects, add wikidata ids

    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_pageviews;
    CREATE TABLE %(trace_db)s.%(trace_table)s_pageviews AS
    SELECT
        year,month,day,
        client_ip,
        user_agent,
        x_forwarded_for,
        ts,
        pv2.lang,
        pv2.title, 
        id
    FROM
        (SELECT
            year,month,day,
            client_ip,
            user_agent,
            x_forwarded_for,
            ts, 
            pv1.lang,
            CASE
                WHEN rd_to_page_title IS NULL THEN raw_title
                ELSE rd_to_page_title
            END AS title
        FROM
            (SELECT
                year,month,day,
                client_ip,
                user_agent,
                x_forwarded_for,
                ts, 
                normalized_host.project AS lang,
                REGEXP_EXTRACT(reflect('java.net.URLDecoder', 'decode', uri_path), '/wiki/(.*)', 1) as raw_title
            FROM
                wmf.webrequest
            WHERE 
                is_pageview
                AND webrequest_source = 'text'
                AND normalized_host.project_class = 'wikipedia'
                AND agent_type = 'user'
                AND %(time_conditions)s
                AND LENGTH(REGEXP_EXTRACT(reflect('java.net.URLDecoder', 'decode', uri_path), '/wiki/(.*)', 1)) > 0
            ) pv1
        LEFT JOIN
            (SELECT
                *
            FROM
                prod.redirect 
            WHERE
                rd_from_page_namespace = 0
                AND rd_to_page_namespace = 0
                AND lang RLIKE '.*'
            ) r
        ON
            pv1.raw_title = r.rd_from_page_title AND pv1.lang = r.lang
        ) pv2
    INNER JOIN
        %(prod_db)s.wikidata_will w ON pv2.title = w.title AND pv2.lang = w.lang;


    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_editors;
    CREATE TABLE %(trace_db)s.%(trace_table)s_editors AS
    SELECT
        client_ip,
        user_agent,
        x_forwarded_for
    FROM
        wmf.webrequest
    WHERE  
        uri_query RLIKE 'action=edit' 
        AND %(time_conditions)s
    GROUP BY
        client_ip,
        user_agent,
        x_forwarded_for;


    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_reader_pageviews;
    CREATE TABLE %(trace_db)s.%(trace_table)s_reader_pageviews AS
    SELECT
        p.*
    FROM
        %(trace_db)s.%(trace_table)s_pageviews p
    LEFT JOIN
        %(trace_db)s.%(trace_table)s_editors e
    ON (
        p.client_ip = e.client_ip
        AND p.user_agent = e.user_agent
        AND p.x_forwarded_for = e.x_forwarded_for
        )
    WHERE
        e.client_ip is NULL
        AND e.user_agent is NULL
        AND e.x_forwarded_for is NULL;


    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_clients_per_item;
    CREATE TABLE %(trace_db)s.%(trace_table)s_clients_per_item AS
    SELECT
        id, 
        COUNT(*) as n
    FROM
        (SELECT
            client_ip,
            user_agent,
            x_forwarded_for,
            id
        FROM
            %(trace_db)s.%(trace_table)s_reader_pageviews
        GROUP BY
            client_ip,
            user_agent,
            x_forwarded_for,
            id
        ) a
    GROUP BY
        id;

    -- remove disambiguation pages and pages with colon in title
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_eligible_reader_pageviews;
    CREATE TABLE %(trace_db)s.%(trace_table)s_eligible_reader_pageviews AS
    SELECT pv.*
    FROM
        (SELECT
            p.*
        FROM
            %(trace_db)s.%(trace_table)s_reader_pageviews p
        JOIN
            %(trace_db)s.%(trace_table)s_clients_per_item c
        ON (p.id = c.id)
        WHERE
            c.n >= %(min_count)s
        ) pv
    LEFT JOIN
        (SELECT
            lang,
            page_title
        FROM
            %(prod_db)s.page_props
        WHERE
            propname = 'disambiguation'
            AND lang RLIKE '.*'
            AND page_namespace = 0
        GROUP BY
            lang,
            page_title
        ) d 
    ON
        (pv.lang = d.lang and pv.title = d.page_title)
    WHERE
        d.page_title IS NULL
        AND pv.title NOT RLIKE 'disambig'
        AND pv.title NOT RLIKE ':';
            


    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_requests;
    CREATE TABLE %(trace_db)s.%(trace_table)s_requests AS
    SELECT
        CONCAT_WS('||', COLLECT_LIST(request)) AS requests
    FROM
        (SELECT
            client_ip,
            user_agent,
            x_forwarded_for,
            CONCAT('ts|', ts, '|id|', id, '|title|', title, '|lang|', lang ) AS request
        FROM %(trace_db)s.%(trace_table)s_eligible_reader_pageviews      
        ) a
    GROUP BY
        client_ip,
        user_agent,
        x_forwarded_for
    HAVING 
        COUNT(*) <= 1000
        AND COUNT(*) > 1;

    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_eligible_reader_pageviews;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_clients_per_item;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_reader_pageviews;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_editors;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_pageviews;
    """

    params = {
        'time_conditions': get_hive_timespan(start, stop, hour=False),
        'trace_db': trace_db,
        'prod_db': prod_db,
        'trace_table': table,
        'min_count': min_count
    }
    exec_hive_stat2(query % params, priority=priority)
Example #9
def get_clickstream(table, lang, start, stop, priority = False, min_count = 10):

    params = {  'time_conditions': get_hive_timespan(start, stop, hour = False),
                'table': table,
                'lang': lang,
                'min_count': min_count,
                }

    query = """

    -- extract raw prev, curr pairs
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp1;
    CREATE VIEW west1.clickstream_%(table)s_temp1 AS
    SELECT 
        CASE
            -- empty or malformed referer
            WHEN referer IS NULL THEN 'other-empty'
            WHEN referer == '' THEN 'other-empty'
            WHEN referer == '-' THEN 'other-empty'
            WHEN parse_url(referer,'HOST') is NULL THEN 'other-empty'
            -- internal referer from the same wikipedia
            WHEN 
                parse_url(referer,'HOST') in ('%(lang)s.wikipedia.org', '%(lang)s.m.wikipedia.org')
                AND LENGTH(REGEXP_EXTRACT(parse_url(referer,'PATH'), '/wiki/(.*)', 1)) > 1
            THEN REGEXP_EXTRACT(parse_url(referer,'PATH'), '/wiki/(.*)', 1)
            -- other referers 
            WHEN referer_class = 'internal' THEN 'other-internal'
            WHEN referer_class = 'external' THEN 'other-external'
            WHEN referer_class = 'external (search engine)' THEN 'other-search'
            ELSE 'other-other'
        END as prev,
        pageview_info['page_title'] as curr
    FROM
        wmf.webrequest
    WHERE 
        %(time_conditions)s
        AND webrequest_source = 'text'
        AND normalized_host.project_class = 'wikipedia'
        AND normalized_host.project = '%(lang)s'
        AND is_pageview 
        AND agent_type = 'user';


    -- count raw prev, curr pairs
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp2;
    CREATE VIEW west1.clickstream_%(table)s_temp2 AS
    SELECT
        curr, prev, COUNT(*) as n
    FROM
        west1.clickstream_%(table)s_temp1
    GROUP BY 
        curr, prev;


    -- resolve redirects
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp3;
    CREATE VIEW west1.clickstream_%(table)s_temp3 AS
    SELECT 
        CASE
            WHEN prev  in ('other-empty', 'other-internal', 'other-external', 'other-search', 'other-other') THEN prev
            WHEN pr.rd_to IS NULL THEN prev
            ELSE pr.rd_to
        END AS prev,
        CASE
            WHEN cr.rd_to IS NULL THEN curr
            ELSE cr.rd_to
        END AS curr,
        curr AS curr_unresolved,
        n
    FROM
        west1.clickstream_%(table)s_temp2
    LEFT JOIN
        west1.%(lang)s_redirect pr ON (prev = pr.rd_from)
    LEFT JOIN
        west1.%(lang)s_redirect cr ON (curr = cr.rd_from);

    -- re-aggregate after resolving redirects and filter out pairs that occur infrequently
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp4;
    CREATE VIEW west1.clickstream_%(table)s_temp4 AS
    SELECT
        curr, curr_unresolved, prev, SUM(n) as n
    FROM
        west1.clickstream_%(table)s_temp3
    GROUP BY
        curr, curr_unresolved, prev
    HAVING
        SUM(n) > %(min_count)s;

    -- only include main namespace articles
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp5;
    CREATE VIEW west1.clickstream_%(table)s_temp5 AS
    SELECT 
        curr, curr_unresolved, prev, n
    FROM
        west1.clickstream_%(table)s_temp4
    LEFT JOIN
        west1.%(lang)s_page_raw pp ON (prev = pp.page_title)
    LEFT JOIN
        west1.%(lang)s_page_raw cp ON (curr = cp.page_title)
    WHERE
        cp.page_title is not NULL
        AND ( pp.page_title is NOT NULL
              OR prev  in ('other-empty', 'other-internal', 'other-external', 'other-search', 'other-other')
            );

    -- annotate link types
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp6;
    CREATE VIEW west1.clickstream_%(table)s_temp6 AS
    SELECT
        prev,
        curr,
        curr_unresolved,
        CASE
            WHEN prev  in ('other-empty', 'other-internal', 'other-external', 'other-search', 'other-other') THEN 'external'
            WHEN l.pl_from IS NOT NULL AND l.pl_to IS NOT NULL THEN 'link'
            ELSE 'other'
        END AS type,
        n
    FROM
        west1.clickstream_%(table)s_temp5
    LEFT JOIN
        west1.%(lang)s_pagelinks l ON (prev = l.pl_from AND curr = l.pl_to);


    -- create table
    DROP TABLE IF EXISTS west1.clickstream_%(table)s;
    CREATE TABLE west1.clickstream_%(table)s
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE AS
    SELECT
        *
    FROM
        west1.clickstream_%(table)s_temp6
    WHERE 
        curr != prev;

    DROP VIEW west1.clickstream_%(table)s_temp1;
    DROP VIEW west1.clickstream_%(table)s_temp2;
    DROP VIEW west1.clickstream_%(table)s_temp3;
    DROP VIEW west1.clickstream_%(table)s_temp4;
    DROP VIEW west1.clickstream_%(table)s_temp5;
    DROP VIEW west1.clickstream_%(table)s_temp6;
    """

    print(query % params)
    exec_hive_stat2(query % params, priority = priority)
Example #10
def get_clickstream(table, lang, start, stop, priority=False, min_count=10):

    params = {
        'time_conditions': get_hive_timespan(start, stop, hour=False),
        'table': table,
        'lang': lang,
        'min_count': min_count,
    }

    query = """

    -- ############################################
    -- create helper tables


    -- create copy of page table and insert rows for our special prev pages
    -- this will let us work with ids instead of titles later, which is much less error prone

    DROP TABLE IF EXISTS clickstream.%(table)s_page_helper;
    CREATE TABLE clickstream.%(table)s_page_helper AS
    SELECT
        *
    FROM
        clickstream.%(lang)s_page
    ;

    INSERT INTO TABLE clickstream.%(table)s_page_helper 
    SELECT
        -1 AS page_id,
        0 AS page_namespace,
        false AS page_is_redirect,
        'other-empty' AS page_title 
    FROM   clickstream.%(table)s_page_helper 
    LIMIT   1;


    INSERT INTO TABLE clickstream.%(table)s_page_helper 
    SELECT
        -2 AS page_id,
        0 AS page_namespace,
        false AS page_is_redirect,
        'other-internal' AS page_title 
    FROM   clickstream.%(table)s_page_helper 
    LIMIT   1;

    INSERT INTO TABLE clickstream.%(table)s_page_helper 
    SELECT
        -3 AS page_id,
        0 AS page_namespace,
        false AS page_is_redirect,
        'other-external' AS page_title 
    FROM   clickstream.%(table)s_page_helper 
    LIMIT   1;

    INSERT INTO TABLE clickstream.%(table)s_page_helper 
    SELECT
        -4 AS page_id,
        0 AS page_namespace,
        false AS page_is_redirect,
        'other-search' AS page_title 
    FROM   clickstream.%(table)s_page_helper 
    LIMIT   1;

    INSERT INTO TABLE clickstream.%(table)s_page_helper 
    SELECT
        -5 AS page_id,
        0 AS page_namespace,
        false AS page_is_redirect,
        'other-other' AS page_title 
    FROM   clickstream.%(table)s_page_helper 
    LIMIT   1;


    -- create pagelinks table that resolves links that end in a redirect
    -- this means that if A links to B, and B redirects to C, we replace the link (A,B) with (A,C)
    -- this lets us properly annotate link types after resolving redirects in the clickstream, since
    -- a user will experience following A as if it linked to C
    -- the group by ensures that each link only occurs once

    DROP TABLE IF EXISTS clickstream.%(table)s_pagelinks_helper;
    CREATE TABLE clickstream.%(table)s_pagelinks_helper AS
    SELECT
        pl_from_page_id,
        pl_to_page_id
    FROM
        (SELECT
            pl_from_page_id,
            CASE
                WHEN r.rd_to_page_id IS NULL THEN pl_to_page_id
                ELSE rd_to_page_id
            END AS pl_to_page_id
        FROM
            clickstream.%(lang)s_pagelinks l
        LEFT JOIN
            clickstream.%(lang)s_redirect r ON (r.rd_from_page_id = l.pl_to_page_id)            
        ) a
    GROUP BY
        pl_from_page_id,
        pl_to_page_id
    ;

    -- ############################################




    -- extract raw prev, curr pairs

    DROP VIEW IF EXISTS clickstream.%(table)s_temp1;
    CREATE VIEW clickstream.%(table)s_temp1 AS
    SELECT 
        CASE
            -- empty or malformed referer
            WHEN referer IS NULL THEN 'other-empty'
            WHEN referer == '' THEN 'other-empty'
            WHEN referer == '-' THEN 'other-empty'
            WHEN parse_url(referer,'HOST') is NULL THEN 'other-empty'
            -- internal referer from the same wikipedia
            WHEN 
                parse_url(referer,'HOST') in ('%(lang)s.wikipedia.org', '%(lang)s.m.wikipedia.org')
                AND LENGTH(REGEXP_EXTRACT(parse_url(referer,'PATH'), '/wiki/(.*)', 1)) > 1
            THEN REGEXP_EXTRACT(parse_url(referer,'PATH'), '/wiki/(.*)', 1)
            -- other referers 
            WHEN referer_class = 'internal' THEN 'other-internal'
            WHEN referer_class = 'external' THEN 'other-external'
            WHEN referer_class = 'external (search engine)' THEN 'other-search'
            ELSE 'other-other'
        END as prev,
        pageview_info['page_title'] as curr
    FROM
        wmf.webrequest
    WHERE 
        %(time_conditions)s
        AND webrequest_source = 'text'
        AND normalized_host.project_class = 'wikipedia'
        AND normalized_host.project = '%(lang)s'
        AND is_pageview 
        AND agent_type = 'user'
    ;



    -- count raw prev, curr pairs, this speeds up later queries

    DROP TABLE IF EXISTS clickstream.%(table)s_temp2;
    CREATE TABLE clickstream.%(table)s_temp2 AS
    SELECT
        prev, curr, COUNT(*) as n
    FROM
        clickstream.%(table)s_temp1
    GROUP BY 
        prev, curr
    ;


    -- we enforce that curr and prev are main namespace pages
    -- the joins accomplish this because, in the logs, the non main namespace pages have the namespace prepended
    -- at this point curr and prev are ids

    DROP TABLE IF EXISTS clickstream.%(table)s_temp3;
    CREATE TABLE clickstream.%(table)s_temp3 AS
    SELECT 
        pp.page_id as prev,
        pc.page_id as curr,
        n
    FROM
        clickstream.%(table)s_temp2
    JOIN
        clickstream.%(table)s_page_helper pp ON (prev = pp.page_title)
    JOIN
        clickstream.%(table)s_page_helper pc ON (curr = pc.page_title)
    WHERE
        pp.page_namespace = 0
        AND pc.page_namespace = 0
    ;



    -- resolve curr redirects, one step
    -- note that prev should not be a redirect, so we do not bother resolving it
    -- and prev redirects will be filtered out at the end

    DROP TABLE IF EXISTS clickstream.%(table)s_temp4;
    CREATE TABLE clickstream.%(table)s_temp4 AS
    SELECT 
        prev,
        CASE
            WHEN rd_to_page_id IS NULL THEN curr
            ELSE rd_to_page_id
        END AS curr,
        n
    FROM
        clickstream.%(table)s_temp3
    LEFT JOIN
        clickstream.%(lang)s_redirect ON (curr = rd_from_page_id)
    ;

    -- re-aggregate after resolving redirects and filter out pairs that occur infrequently

    DROP TABLE IF EXISTS clickstream.%(table)s_temp5;
    CREATE TABLE clickstream.%(table)s_temp5 AS
    SELECT
        prev, curr, SUM(n) as n
    FROM
        clickstream.%(table)s_temp4
    GROUP BY
        prev, curr
    HAVING
        SUM(n) > %(min_count)s
    ;



    -- annotate link types

    DROP TABLE IF EXISTS clickstream.%(table)s_temp6;
    CREATE TABLE clickstream.%(table)s_temp6 AS
    SELECT
        prev,
        curr,
        CASE
            WHEN prev < 0 THEN 'external'
            WHEN (pl_from_page_id IS NOT NULL AND pl_to_page_id IS NOT NULL) THEN 'link'
            ELSE 'other'
        END AS type,
        n
    FROM
        clickstream.%(table)s_temp5
    LEFT JOIN
        clickstream.%(table)s_pagelinks_helper ON (prev = pl_from_page_id AND curr = pl_to_page_id)
    ;



    -- create final table
    -- remove self loops
    -- restrict prev and curr to main namespace, no redirects
    -- get page titles

    DROP TABLE IF EXISTS clickstream.%(table)s;
    CREATE TABLE clickstream.%(table)s
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE AS
    SELECT
        pp.page_title as prev,
        pc.page_title as curr,
        a.type,
        a.n
    FROM
        clickstream.%(table)s_temp6 a
    JOIN
        clickstream.%(table)s_page_helper pp ON (prev = pp.page_id)
    JOIN
        clickstream.%(table)s_page_helper pc ON (curr = pc.page_id)
    WHERE
        pp.page_is_redirect = false
        AND pp.page_namespace = 0
        AND pc.page_is_redirect = false
        AND pc.page_namespace = 0
        AND a.curr != a.prev
    ;



    DROP VIEW clickstream.%(table)s_temp1;
    DROP TABLE clickstream.%(table)s_temp2;
    DROP TABLE clickstream.%(table)s_temp3;
    DROP TABLE clickstream.%(table)s_temp4;
    DROP TABLE clickstream.%(table)s_temp5;
    DROP TABLE clickstream.%(table)s_temp6;
    DROP TABLE clickstream.%(table)s_page_helper;
    DROP TABLE clickstream.%(table)s_pagelinks_helper;
    """

    exec_hive_stat2(query % params, priority=priority)