def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_tsv",
                        default=config.edit_el_tsv,
                        help="TSV filename for output EventLogging data")
    parser.add_argument("--hive_requests_table",
                        default=config.hive_el_requests_table,
                        help="Hive table with all potential QuickSurvey webrequests and surveySessionTokens")
    args = parser.parse_args()

    # NOTE: the hard-coded dates in the WHERE clause below must match the survey dates in config
    query = (
        "SELECT event.session_token AS session_token, "
        "event.action AS action, "
        "event.init_mechanism AS init_mechanism, "
        "event.editor_interface AS editor_interface, "
        "event.page_title AS edit_page_title, "
        "event.user_editcount AS user_edit, "
        "event.user_id = 0 AS anon, "
        "REFLECT('org.apache.commons.codec.digest.DigestUtils', 'sha512Hex', CONCAT(s.client_ip, s.user_agent, '{0}')) AS userhash "
        "FROM event.editattemptstep e "
        "INNER JOIN {1} s "
        "ON (e.event.session_token = SUBSTR(s.survey_session_token, 0, 20)) "
        "WHERE e.year = 2019 AND e.month = 3 AND (e.day = 4 OR e.day = 5)".
        format(config.hash_key, args.hive_requests_table))
    exec_hive_stat2(query, args.output_tsv)
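
# Every snippet in this file calls exec_hive_stat2(), which is defined elsewhere in the
# project. A minimal sketch of what such a helper might look like, assuming it shells out
# to the Hive CLI and writes the query's tab-separated output to a file when one is given;
# the priority/nice keywords are accepted only for compatibility with the calls below, and
# how they map onto cluster or OS scheduling is left out of this sketch.
import subprocess

def exec_hive_stat2(query, filename=None, priority=False, nice=False):
    """Run a HiveQL statement, optionally redirecting its TSV output to filename (sketch)."""
    cmd = ["hive", "-e", query]
    if nice:
        # Hypothetical: lower OS scheduling priority for long-running extraction jobs.
        cmd = ["nice", "-n", "19"] + cmd
    if filename:
        with open(filename, "w") as fout:
            subprocess.run(cmd, stdout=fout, check=True)
    else:
        subprocess.run(cmd, check=True)
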
def ungroup(db_name,
            table_name,
            lang,
            priority,
            nice,
            year=config.survey_start_date.year):
    query = """
    CREATE TABLE {0}.{1}_{2}
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    STORED AS PARQUET AS
    SELECT
        userhash,
        geocoded_data,
        MAX(logged_in) as has_account,
        MAX(attempted_edit) as attempted_edit,
        CONCAT_WS('REQUEST_DELIM', COLLECT_LIST(requests)) AS requests,
        SUM(r_count) as request_count,
        RAND() AS rand_sample
    FROM
        {0}.{1}_{2}_by_day
    WHERE
        year = {3}
    GROUP BY
        userhash,
        geocoded_data
    """.format(db_name, table_name, lang, year)

    exec_hive_stat2(query, priority=priority, nice=nice)
def add_day_to_hive_trace_table(req_table,
                                db_name,
                                table_name,
                                day,
                                lang,
                                priority,
                                nice,
                                sampling_rate=1.0):
    year = day.year
    month = day.month
    day = day.day

    query = """
    INSERT OVERWRITE TABLE {0}.{1}_{2}_by_day
    PARTITION(year={3}, month={4}, day ={5}, host='{2}')
    SELECT
        userhash,
        geocoded_data,
        MAX(logged_in) as logged_in,
        MAX(edit_attempt) as attempted_edit,
        CONCAT_WS('REQUEST_DELIM', COLLECT_LIST(request)) AS requests,
        COUNT(*) as r_count
    FROM
        (SELECT
            userhash,
            geocoded_data,
            logged_in,
            CAST(page_title = '{6}' as int) as edit_attempt,
            CAST(normalized_host.project = '{2}' as int) as correct_wiki,
            CONCAT( 'ts|', ts,
                    '|referer|', referer,
                    '|page_id|', page_id,
                    '|title|', page_title,
                    '|uri_path|', reflect('java.net.URLDecoder', 'decode', uri_path),
                    '|uri_query|', reflect('java.net.URLDecoder', 'decode', uri_query),
                    '|access_method|', access_method,
                    '|referer_class|', referer_class,
                    '|project|', normalized_host.project_class,
                    '|lang|', normalized_host.project,
                    '|uri_host|', uri_host
                ) AS request
        FROM
            {7}
        WHERE 
            day = {5}
            AND CONV(SUBSTR(userhash, 113), 16, 10) / 18446744073709551615 < {8}
        ) a
    GROUP BY
        userhash,
        geocoded_data
    HAVING
        COUNT(*) < 500 AND SUM(correct_wiki) > 0;
    """.format(db_name, table_name, lang, year, month, day,
               config.edit_attempt_str, req_table, sampling_rate)

    exec_hive_stat2(query, priority=priority, nice=nice)
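
# The sampling predicate above, CONV(SUBSTR(userhash, 113), 16, 10) / 18446744073709551615 < rate,
# keeps a deterministic fraction of users: userhash is a 128-character SHA-512 hex digest, and its
# last 16 hex characters are read as a 64-bit integer and scaled into [0, 1] by dividing by 2**64 - 1.
# A hedged Python equivalent (hypothetical helper, not part of the original pipeline), e.g. for
# checking which users a given sampling_rate would retain:
def in_sample(userhash, sampling_rate):
    """Return True if this userhash falls inside the deterministic hash-based sample."""
    tail = userhash[112:]  # Hive SUBSTR is 1-indexed, so position 113 onward = last 16 hex chars
    return int(tail, 16) / 0xFFFFFFFFFFFFFFFF < sampling_rate
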
def get_pageview_data(lang, output_dir):
    query = (
        "SELECT page_id, sum(view_count) AS weekly_pageviews FROM wmf.pageview_hourly "
        "WHERE project = '{0}.wikipedia' "
        "AND agent_type = 'user' "
        "AND {1} "
        "AND namespace_id = 0 "
        "GROUP BY page_id;".format(lang, config.hive_days_clause))
    filename = os.path.join(output_dir, "{0}_pageviews.csv".format(lang))

    exec_hive_stat2(query, filename)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--all_ids_csv",
                        default=config.all_ids_csv,
                        help="CSV with userIDs from all languages")
    parser.add_argument("--ids_table_name",
                        default=config.hive_ids_table,
                        help="Hive table with hashed userIDs.")
    parser.add_argument("--srvy_req_table",
                        default=config.hive_survey_requests_table,
                        help="Hive table w/ all survey requests")
    parser.add_argument("--all_req_table",
                        default=config.hive_all_requests_table,
                        help="Hive table w/ all webrequests.")
    args = parser.parse_args()

    exec_hive_stat2("DROP TABLE IF EXISTS {0};".format(args.ids_table_name))
    exec_hive_stat2(
        "CREATE TABLE {0} (userhash string) "
        "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES "
        "('separatorChar' = ',', 'quoteChar' = '\\\"');".format(
            args.ids_table_name))
    exec_hive_stat2(
        "LOAD DATA LOCAL INPATH '{0}' OVERWRITE INTO TABLE {1};".format(
            args.all_ids_csv, args.ids_table_name))

    query = ("CREATE TABLE {0} STORED AS PARQUET AS "
             "SELECT * FROM {1} "
             "WHERE {2}.userhash in (SELECT {3}.userhash from {4});".format(
                 args.srvy_req_table, args.all_req_table,
                 args.all_req_table.split(".")[1],
                 args.ids_table_name.split(".")[1], args.ids_table_name))
    exec_hive_stat2(query)
def traces_to_csv(db, table, lang, smpl_req_folder, max_num=200000):
    full_tablename = db + "." + table + "_" + lang
    query = (
        "SET mapreduce.map.memory.mb=9000; "
        "SET mapreduce.map.java.opts=-Xmx7200m; "
        "SET mapreduce.reduce.memory.mb=9000; "
        "SET mapreduce.reduce.java.opts=-Xmx7200m; "
        "SELECT userhash, geocoded_data, has_account, attempted_edit, requests "
        "FROM ("
        "SELECT * "
        "FROM {0} "
        "WHERE request_count < 500 "
        "ORDER BY rand_sample "
        "LIMIT {1}) w;".format(full_tablename, max_num))

    exec_hive_stat2(
        query, os.path.join(smpl_req_folder, "sample_{0}.csv".format(lang)))
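
# The requests column written out above is one string per user: individual requests joined with
# the literal delimiter 'REQUEST_DELIM', each request being a '|'-separated run of key, value
# pairs (ts, referer, page_id, title, uri_path, ...) as assembled in add_day_to_hive_trace_table.
# A hedged parsing sketch (hypothetical helper), assuming no field value itself contains '|',
# which the queries above do not guarantee:
def parse_requests(requests_str):
    """Split a joined trace string back into a list of per-request dicts."""
    traces = []
    for raw in requests_str.split("REQUEST_DELIM"):
        parts = raw.split("|")
        traces.append(dict(zip(parts[0::2], parts[1::2])))
    return traces
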
def add_day_to_hive_trace_table(req_table, db_name, table_name, day, lang, priority, nice):
    year = day.year
    month = day.month
    day = day.day

    query = """
    INSERT OVERWRITE TABLE {0}.{1}_{2}_by_day
    PARTITION(year={3}, month={4}, day={5}, host='{2}')
    SELECT
        userhash,
        geocoded_data,
        MAX(logged_in) as logged_in,
        MAX(edit_attempt) as attempted_edit,
        CONCAT_WS('REQUEST_DELIM', COLLECT_LIST(request)) AS requests,
        COUNT(*) as r_count
    FROM
        (SELECT
            userhash,
            geocoded_data,
            logged_in, 
            CAST(page_title = '{6}' as int) as edit_attempt,
            CONCAT( 'ts|', ts,
                    '|referer|', referer,
                    '|page_id|', page_id,
                    '|title|', page_title,
                    '|uri_path|', reflect('java.net.URLDecoder', 'decode', uri_path),
                    '|uri_query|', reflect('java.net.URLDecoder', 'decode', uri_query),
                    '|access_method|', access_method,
                    '|referer_class|', referer_class,
                    '|project|', normalized_host.project_class,
                    '|lang|', normalized_host.project,
                    '|uri_host|', uri_host
                ) AS request
        FROM
            {7} w
        WHERE 
            day = {5}
        ) a
    GROUP BY
        userhash,
        geocoded_data;""".format(db_name, table_name, lang, year, month, day, req_table)
#    HAVING
#        COUNT(*) < 500;"""

    exec_hive_stat2(query, priority=priority, nice=nice)
def create_hive_trace_table(db_name, table_name, lang, priority, nice):
    """
    Create a table partitioned by day and host.
    """

    query = """
    CREATE TABLE IF NOT EXISTS {0}.{1}_{2}_by_day (
        userhash STRING,
        geocoded_data MAP<STRING,STRING>,
        logged_in INT,
        attempted_edit INT,
        requests STRING,
        r_count INT
    )
    PARTITIONED BY (year INT, month INT, day INT, host STRING)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    STORED AS PARQUET
    """.format(db_name, table_name, lang)

    exec_hive_stat2(query, priority=priority, nice=nice)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_csv",
                        default=config.quicksurvey_requests_tsv,
                        help="CSV filename for output survey-related webrequests")
    parser.add_argument("--quicksurvey_requests_table",
                        default=config.hive_el_requests_table,
                        help="Hive table with all potential QuickSurvey webrequests and surveySessionTokens")
    args = parser.parse_args()

    # All webrequests hitting the EventLogging beacon with a QuickSurvey event for this survey (i.e., the survey may have been shown), on the days the survey was live
    get_qs_query = ("CREATE TABLE {0} AS "
                    "SELECT *, reflect('java.net.URLDecoder', 'decode', substr(uri_query, 2)) AS json_event "
                    "FROM wmf.webrequest "
                    "WHERE uri_path LIKE '%beacon/event' AND uri_query LIKE '%QuickSurvey%' AND uri_query LIKE '%{1}%' "
                    "AND {2}".format(args.quicksurvey_requests_table, config.survey_name_start, config.hive_days_clause))
    #exec_hive_stat2(get_qs_query)

    # NOTE: empirically, the client_ip and user_agent checks have filtered out zero webrequests
    anonymized_to_csv_query = ("SELECT dt as dt_QSinitialization, "
                               "reflect('org.apache.commons.codec.digest.DigestUtils', 'sha512Hex', concat(client_ip, user_agent, '{0}')) as userhash, "
                               "get_json_object(json_event, '$.event.surveySessionToken') AS survey_session_token, "
                               "get_json_object(json_event, '$.event.pageviewToken') as pageview_token, "
                               "get_json_object(json_event, '$.event.surveyResponseValue') as response_type, "
                               "get_json_object(json_event, '$.event.pageTitle') as page_title, "
                               "get_json_object(json_event, '$.event.pageId') as page_id, "
                               "get_json_object(json_event, '$.event.isLoggedIn') as logged_in, "
                               "geocoded_data['country'] as country, "
                               "geocoded_data['country_code'] as country_code, "
                               "geocoded_data['timezone'] as timezone, "
                               "geocoded_data['city'] as city, "
                               "geocoded_data['subdivision'] as subdivision, "
                               "geocoded_data['latitude'] as lat, "
                               "geocoded_data['longitude'] as lon "
                               "FROM {1} "
                               "WHERE client_ip <> '-' AND "
                               "user_agent <> '-'".format(hash_key, args.quicksurvey_requests_table))

    exec_hive_stat2(anonymized_to_csv_query, args.output_csv)
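
# The userhash written above is DigestUtils.sha512Hex(client_ip + user_agent + hash_key), i.e. a
# salted SHA-512 hex digest of the client IP and user agent. A hedged Python equivalent
# (hypothetical helper using hashlib), useful for reproducing the same fingerprint offline;
# it assumes the inputs are the raw strings stored in the webrequest table:
import hashlib

def compute_userhash(client_ip, user_agent, hash_key):
    """Reproduce the sha512Hex(client_ip + user_agent + hash_key) user fingerprint."""
    return hashlib.sha512((client_ip + user_agent + hash_key).encode("utf-8")).hexdigest()
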
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--hash_key",
                        default=config.hash_key,
                        help="Hash key for salting user-agent + client-IP")
    parser.add_argument("--all_req_table",
                        default=config.hive_all_requests_table,
                        help="Hive table w/ all webrequests.")
    args = parser.parse_args()

    query = (
        "CREATE TABLE {0} STORED AS PARQUET AS "
        "SELECT reflect('org.apache.commons.codec.digest.DigestUtils', 'sha512Hex', concat(client_ip, user_agent, '{1}')) as userhash,"
        "map('country', geocoded_data['country'], 'timezone', geocoded_data['timezone']) as geocoded_data, "
        "ts, "
        "referer, "
        "uri_path, "
        "uri_host, "
        "uri_query, "
        "access_method, "
        "referer_class, "
        "normalized_host, "
        "COALESCE(pageview_info['page_title'], '{2}') as page_title, "
        "COALESCE(x_analytics_map['loggedIn'], 0) as logged_in, "
        "page_id, "
        "day, "
        "hour "
        "FROM wmf.webrequest "
        "WHERE {3} "
        "AND webrequest_source = 'text' AND access_method != 'mobile app' AND agent_type = 'user' "
        "AND normalized_host.project_class = 'wikipedia' "
        "AND ((namespace_id = 0 AND is_pageview = TRUE) OR ({4}));".format(
            args.all_req_table, args.hash_key, config.edit_attempt_str,
            config.hive_days_clause, config.hive_edit_clause))

    exec_hive_stat2(query)
def traces_to_csv(db, table, lang, srv_dir):
    full_tablename = db + "." + table + "_" + lang
    query = "SELECT userhash, geocoded_data, has_account, attempted_edit, requests from {0};".format(full_tablename)
    exec_hive_stat2(query, os.path.join(srv_dir, "sample_{0}.csv".format(lang)))