Example #1
0
def update_custom_agg(dt_str):
    """Overwrite one partition of the destination table with click rows.

    Selects the rows of ``dmf.agg_dw_clicks_pb`` whose ``dh`` equals
    ``dt_str`` and writes them, via INSERT OVERWRITE, into the
    ``(dy, dm, dd, dh)`` partition of ``DESTINATION_AGG_TABLE``.  The
    partition values are the first 4, 7 and 10 characters of ``dt_str``
    followed by ``dt_str`` itself.

    The whole script (jar registration, merge and compression settings,
    insert header, select) is sent to beeline on ``CLUSTER_NAME`` with
    debugging off.  Nothing is returned.

    Args:
        dt_str: Value of the ``dh`` partition to rebuild.
            NOTE(review): presumably formatted 'YYYY-MM-DD HH' so the
            prefixes are year / month / day -- confirm with the caller.
    """
    # --- static session setup -------------------------------------------
    add_jar_stmt = """
    add jar file:///usr/local/adnxs/agg/current/hadoop-jars/data-schemas.jar;
    """

    merge_settings = """
    SET hive.merge.mapredfiles=true;
    SET hive.merge.mapfiles=true;
    """

    compression_settings = """
    SET hive.exec.compress.output=true;
    SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
    SET mapred.output.compression.type=BLOCK;
    """

    # --- partition values: successively longer prefixes of dt_str -------
    year_part, month_part, day_part = dt_str[:4], dt_str[:7], dt_str[:10]

    # --- statements that depend on dt_str -------------------------------
    select_clicks = """
    SELECT
    date_time,
    auction_id_64,
    user_id_64,
    tag_id,
    venue_id,
    inventory_source_id,
    session_frequency
    FROM dmf.agg_dw_clicks_pb
    WHERE dh = '%s';
    """ % dt_str

    insert_header = """
    INSERT OVERWRITE TABLE
        %s
    PARTITION
       (dy = '%s', dm = '%s', dd = '%s', dh = '%s')
    """ % (DESTINATION_AGG_TABLE, year_part, month_part, day_part, dt_str)

    # The insert header must be immediately followed by the select so the
    # two form a single INSERT ... SELECT statement.
    statements = (
        add_jar_stmt,
        merge_settings,
        compression_settings,
        insert_header,
        select_clicks,
    )
    script = "\n".join(statements)

    beeline.beeline_execute(script, cluster_name=CLUSTER_NAME, debug=False)
Example #2
0
def delete_old_data(dt_str):
    """Drop destination-table partitions whose ``dh`` is at or before a cutoff.

    Sends ``ALTER TABLE <DESTINATION_AGG_TABLE> DROP IF EXISTS
    PARTITION(dh<= '<dt_str>')`` to beeline on ``CLUSTER_NAME`` with
    debugging off.  Nothing is returned.

    Args:
        dt_str: Cutoff compared against the ``dh`` partition column.
            NOTE(review): presumably formatted like '2015-07-10 00'
            ('YYYY-MM-DD HH') -- confirm with the caller.

    Raises:
        TypeError: If ``dt_str`` is not a string.
    """
    # The cutoff is interpolated as text, so a non-string (e.g. None) would
    # be rendered into the predicate as 'None' and could match far more
    # partitions than intended.  Reject it before building a destructive
    # statement.  ``type(u"")`` keeps unicode input accepted should this
    # run under Python 2.
    if not isinstance(dt_str, (str, type(u""))):
        raise TypeError(
            "dt_str must be a string, got %s" % type(dt_str).__name__)

    drop_partitions_sql = """
    ALTER TABLE
        %s
    DROP IF EXISTS PARTITION(dh<= '%s')
    """ % (DESTINATION_AGG_TABLE, dt_str)

    beeline.beeline_execute(drop_partitions_sql,
                            cluster_name=CLUSTER_NAME,
                            debug=False)
Example #3
0
import beeline_utils as beeline


# Cluster to run against: "hadoop-c" or "hadoop-f".
CLUSTER_NAME = "hadoop-c"

# Sample query shared by both calls below.
_SAMPLE_QUERY = """
    select *
    from dmf.view_agg_dw_clicks
    where dh = '2015-07-10 00' limit 10
    """

# Execute the query through beeline without collecting its output.
beeline.beeline_execute(_SAMPLE_QUERY, cluster_name=CLUSTER_NAME, debug=True)

# Run the same query again, this time pulling the result into a dataframe.
df = beeline.pull_beeline(_SAMPLE_QUERY, cluster_name=CLUSTER_NAME, debug=True)

print(df.columns)