Exemple #1
0
    def create_tpc_table(self):
        """Rebuild the title/country/project pageview table in Hive.

        Drops and recreates ``<db>.<tpc_table>``. The inner query sums
        ``view_count`` from ``wmf.pageview_hourly`` per (title, country,
        project), keeping only rows where ``agent_type = 'user'``, the
        project matches 'wikipedia', the title contains no ':', and the
        caller-supplied time condition holds. Spaces in titles are replaced
        with underscores. The result is LEFT JOINed to ``<db>.wikidata`` on
        title and project, so rows without a match keep NULL ``id`` and
        ``en_page_title``.

        Reads ``db``, ``tpc_table`` and ``time_conditon`` from
        ``self.params``. The last key really is spelled that way in the
        template, so it must be spelled the same in the params mapping.
        """
        hql_template = """
        DROP TABLE IF EXISTS %(db)s.%(tpc_table)s;
        CREATE TABLE %(db)s.%(tpc_table)s AS
        SELECT
        tbl.*,
        wd.id, 
        wd.en_page_title
        FROM
        (
            SELECT
            SUM(view_count) as n_tpc,
            regexp_replace(page_title, ' ', '_') as t,
            country as c,
            project as p
            FROM wmf.pageview_hourly
            WHERE agent_type = 'user'
            AND project RLIKE 'wikipedia'
            AND page_title not RLIKE ':'
            AND %(time_conditon)s
            GROUP BY regexp_replace(page_title, ' ', '_'), country, project
        ) tbl
        LEFT JOIN
        %(db)s.wikidata wd
        ON (tbl.t = wd.page_title AND tbl.p = wd.project);
        """

        # Fill in the named placeholders, then hand the script to Hive.
        statement = hql_template % self.params
        execute_hive_expression(statement)
Exemple #2
0
    def get_wd_sufficient_statistics(self):
        """Add a per-id normalized ``wdc_view_proportion_delta`` column.

        The span table is renamed to ``<span_table>_temp`` and recreated
        with every original column plus ``normalized_<metric>``, computed
        as ``(metric - mu) / sigma``. ``mu`` and ``sigma`` are the mean and
        population standard deviation of the metric within each ``id``
        group. The temp table is dropped at the end.

        Side effect: ``self.params`` is modified in place. The keys
        ``metric`` and ``ss_table`` are written to it, and they remain set
        after this call.
        """
        metric_name = 'wdc_view_proportion_delta'

        # Deliberately the same object as self.params (not a copy).
        hive_params = self.params
        hive_params['metric'] = metric_name
        hive_params['ss_table'] = 'ss_' + metric_name

        hql_template = """
        DROP TABLE IF EXISTS %(db)s.%(span_table)s_temp;
        ALTER TABLE %(db)s.%(span_table)s RENAME TO %(db)s.%(span_table)s_temp;
        DROP TABLE IF EXISTS %(db)s.%(span_table)s;
        CREATE TABLE %(db)s.%(span_table)s AS SELECT
        tbl.*,
        (tbl.%(metric)s - ss.mu) / ss.sigma as normalized_%(metric)s
        FROM
        %(db)s.%(span_table)s_temp tbl
        LEFT JOIN
        (SELECT
        id,
        SUM(%(metric)s) / COUNT(*) as mu,
        SQRT( SUM(POW(%(metric)s, 2)) / COUNT(*) - POW(SUM(%(metric)s) / COUNT(*), 2) ) as sigma
        FROM %(db)s.%(span_table)s_temp
        GROUP BY id) ss
        ON (tbl.id = ss.id);
        DROP TABLE %(db)s.%(span_table)s_temp;
        """

        statement = hql_template % hive_params
        execute_hive_expression(statement)
Exemple #3
0
 def merge_spans(self):
     """Rebuild the span table by comparing the post table to the pre table.

     Drops and recreates ``<db>.<span_table>``. Each row of ``<db>.<post>``
     is LEFT JOINed to ``<db>.<pre>`` on (t, c, p). The output holds:

     * ``wdc_view_delta`` / ``tpc_view_delta``: relative change of
       ``n_wdc`` / ``n_tpc`` from pre to post.
     * ``*_view_proportion_delta``: the same relative change, taken on
       counts expressed per 10000 of ``n_c``.
     * the raw pre/post counts (``n_tpc``, ``n_wdc``, ``n_tp``, ``n_wd``,
       ``n_c``) and the post row's ``p``, ``t``, ``c``, ``id``,
       ``en_page_title``.

     Reads ``db``, ``span_table``, ``post`` and ``pre`` from
     ``self.params``.
     """
     hql_template = """
     DROP TABLE IF EXISTS %(db)s.%(span_table)s;
     CREATE TABLE %(db)s.%(span_table)s AS
     SELECT
     (post.n_wdc - pre.n_wdc) / pre.n_wdc as wdc_view_delta,
     (((10000 * post.n_wdc) / post.n_c) - ((10000 * pre.n_wdc) / pre.n_c)) / ((10000 * pre.n_wdc) / pre.n_c) as wdc_view_proportion_delta,
     (post.n_tpc - pre.n_tpc) / pre.n_tpc as tpc_view_delta,
     (((10000 * post.n_tpc) / post.n_c) - ((10000 * pre.n_tpc) / pre.n_c)) / ((10000 * pre.n_tpc) / pre.n_c) as tpc_view_proportion_delta,
     pre.n_tpc as pre_n_tpc,
     post.n_tpc as post_n_tpc,
     pre.n_wdc as pre_n_wdc,
     post.n_wdc as post_n_wdc,
     pre.n_tp as pre_n_tp,
     post.n_tp as post_n_tp,
     pre.n_wd as pre_n_wd,
     post.n_wd as post_n_wd,
     pre.n_c as pre_n_c,
     post.n_c as post_n_c,
     post.p,
     post.t,
     post.c,
     post.id,
     post.en_page_title
     FROM
     %(db)s.%(post)s post
     LEFT JOIN
     %(db)s.%(pre)s pre
     ON (pre.t = post.t AND pre.c = post.c AND pre.p = post.p)
     """
     statement = hql_template % self.params
     execute_hive_expression(statement)
Exemple #4
0
 def join_and_clean(self):
     """Join the five intermediate tables into one, then drop them.

     Drops and recreates ``<db>.<basename>`` with every column of the tpc
     table plus ``n_wdc``, ``n_tp``, ``n_wd`` and ``n_c``. The tables are
     combined with a comma join filtered by equality predicates, so only
     rows matching in all five tables survive:

     * tpc to c:   same ``c``
     * tpc to tp:  same ``t`` and ``p``
     * tpc to wdc: same ``c`` and ``id``
     * tpc to wd:  same ``id``

     After that, the tpc, wdc, tp, wd and c tables are dropped.

     Reads ``db``, ``basename``, ``tpc_table``, ``wdc_table``,
     ``tp_table``, ``wd_table`` and ``c_table`` from ``self.params``.
     """
     hql_template = """
     DROP TABLE IF EXISTS %(db)s.%(basename)s;
     CREATE TABLE %(db)s.%(basename)s AS
     SELECT 
     tpc.*,
     wdc.n_wdc,
     tp.n_tp as n_tp,
     wd.n_wd as n_wd,
     c_.n_c as n_c
     FROM 
     %(db)s.%(tpc_table)s tpc,
     %(db)s.%(wdc_table)s wdc,
     %(db)s.%(tp_table)s tp,
     %(db)s.%(wd_table)s wd,
     %(db)s.%(c_table)s c_
     WHERE tpc.c = c_.c
     AND tpc.t = tp.t
     AND tpc.p = tp.p
     AND tpc.c = wdc.c
     AND tpc.id = wdc.id
     AND tpc.id = wd.id;
     
     DROP TABLE IF EXISTS %(db)s.%(tpc_table)s;
     DROP TABLE IF EXISTS %(db)s.%(wdc_table)s;
     DROP TABLE IF EXISTS %(db)s.%(tp_table)s;
     DROP TABLE IF EXISTS %(db)s.%(wd_table)s;
     DROP TABLE IF EXISTS %(db)s.%(c_table)s;
     """
     statement = hql_template % self.params
     execute_hive_expression(statement)
Exemple #5
0
  def create_wdc_table(self):
      """Rebuild the per-(country, id) view-count table in Hive.

      Drops and recreates ``<db>.<wdc_table>`` by summing ``n_tpc`` from
      ``<db>.<tpc_table>`` for each (``c``, ``id``) pair. The sum is stored
      as ``n_wdc``.

      Reads ``db``, ``wdc_table`` and ``tpc_table`` from ``self.params``.
      """
      hql_template = """
      DROP TABLE IF EXISTS %(db)s.%(wdc_table)s;
      CREATE TABLE %(db)s.%(wdc_table)s AS
      SELECT
      SUM(n_tpc) as n_wdc, c, id
      FROM %(db)s.%(tpc_table)s
      GROUP BY c, id
      """
      statement = hql_template % self.params
      execute_hive_expression(statement)
Exemple #6
0
def create_wikidata_table(db):
    """Rebuild ``<db>.wikidata`` from the ``ellery.wikidata`` source table.

    Each output row has the item ``id``, a ``project`` built as
    ``<lang>.wikipedia``, the ``page_title`` with spaces replaced by
    underscores, and ``en_page_title``. The last is the underscore-normalized
    title of the row with the same ``id`` and ``lang == 'en'``, attached via
    LEFT JOIN, so it is NULL when no such row exists.

    Args:
        db: Name of the Hive database in which to create the ``wikidata``
            table. The source database ``ellery`` is fixed in the query.
    """
    hql_template = """
    DROP TABLE IF EXISTS %(db)s.wikidata;
    CREATE TABLE %(db)s.wikidata AS
    SELECT wikidata.id,
    CONCAT(wikidata.lang, '.wikipedia') as project,
    regexp_replace(wikidata.page_title, ' ', '_') as page_title,
    regexp_replace(en_wikidata.en_page_title, ' ', '_') as en_page_title
    FROM ellery.wikidata wikidata
    LEFT JOIN
    (SELECT id, regexp_replace(page_title, ' ', '_') as en_page_title
    FROM ellery.wikidata
    WHERE lang == 'en') en_wikidata
    ON (wikidata.id == en_wikidata.id);
    """
    # The target database is the only placeholder in the template.
    statement = hql_template % {'db': db}
    execute_hive_expression(statement)