def update_custom_agg(dt_str):
    """Load one hour of click aggregates into DESTINATION_AGG_TABLE.

    Reads the hour ``dt_str`` from ``dmf.agg_dw_clicks_pb`` and writes it
    (INSERT OVERWRITE) into the matching (dy, dm, dd, dh) partition of
    DESTINATION_AGG_TABLE through beeline.

    :param dt_str: hour key shaped 'YYYY-MM-DD HH' (e.g. '2015-07-10 00');
        its prefixes supply the year/month/day partition values.
    """
    # Partition values are simple prefixes of the hour string.
    year = dt_str[0:4]     # 'YYYY'
    month = dt_str[0:7]    # 'YYYY-MM'
    day = dt_str[0:10]     # 'YYYY-MM-DD'
    hour = dt_str          # 'YYYY-MM-DD HH'

    # NOTE(review): dt_str is spliced into HiveQL via %-formatting with no
    # escaping — callers must pass trusted, well-formed hour strings.
    select_clicks = """
    SELECT
        date_time,
        auction_id_64,
        user_id_64,
        tag_id,
        venue_id,
        inventory_source_id,
        session_frequency
    FROM dmf.agg_dw_clicks_pb
    WHERE dh = '%s';
    """ % (hour)

    overwrite_partition = """
    INSERT OVERWRITE TABLE %s
    PARTITION (dy = '%s', dm = '%s', dd = '%s', dh = '%s')
    """ % (DESTINATION_AGG_TABLE, year, month, day, hour)

    # Session-level knobs: Snappy block compression on the output files,
    # plus small-file merging for both map-only and map-reduce outputs.
    snappy_output = """
    SET hive.exec.compress.output=true;
    SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
    SET mapred.output.compression.type=BLOCK;
    """
    merge_small_files = """
    SET hive.merge.mapredfiles=true;
    SET hive.merge.mapfiles=true;
    """
    register_schema_jar = """
    add jar file:///usr/local/adnxs/agg/current/hadoop-jars/data-schemas.jar;
    """

    # Order matters: the jar registration and SET statements must come
    # before the INSERT ... SELECT they configure.
    statements = [
        register_schema_jar,
        merge_small_files,
        snappy_output,
        overwrite_partition,
        select_clicks,
    ]
    beeline.beeline_execute('\n'.join(statements),
                            cluster_name=CLUSTER_NAME, debug=False)
def delete_old_data(dt_str):
    """Drop all partitions of DESTINATION_AGG_TABLE at or before ``dt_str``.

    Issues ``ALTER TABLE ... DROP IF EXISTS PARTITION(dh <= dt_str)`` via
    beeline, pruning every hourly partition up to and including the given
    hour. IF EXISTS makes the call a no-op when nothing matches.

    :param dt_str: inclusive upper bound, shaped 'YYYY-MM-DD HH'.
    """
    # Fix: removed the dy/dm/dd/dh prefix locals the original computed but
    # never used, and renamed the query variable, which shadowed the
    # function's own name (confusing and blocks any recursive/self reference).
    drop_partitions = """
    ALTER TABLE %s DROP IF EXISTS PARTITION(dh<= '%s')
    """ % (DESTINATION_AGG_TABLE, dt_str)
    beeline.beeline_execute(drop_partitions,
                            cluster_name=CLUSTER_NAME, debug=False)
import beeline_utils as beeline #Setting the cluster name: hadoop-c or hadoop-f CLUSTER_NAME = "hadoop-c" # Beeline execute query without collecting the output beeline.beeline_execute( """ select * from dmf.view_agg_dw_clicks where dh = '2015-07-10 00' limit 10 """, cluster_name=CLUSTER_NAME, debug=True) # Pull Beeline into a dataframe df = beeline.pull_beeline( """ select * from dmf.view_agg_dw_clicks where dh = '2015-07-10 00' limit 10 """, cluster_name=CLUSTER_NAME, debug=True) print(df.columns)