Example 1
def get_pdt_partitions2process_show_partitions(source_table,
                                               target_table,
                                               env='dev'):
    '''
    Use Athena or LGK to derive the to-be-processed partitions.
    The source PDT table is partitioned by year/month/day/hour (Hive-style);
    the target table is partitioned in YYYYMMDD format.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (
        env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    df_src = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (source_table)))
    # 'year=YYYY/month=MM/day=DD/hour=HH' -> 'YYYYMMDD' (the hour component is dropped)
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: ''.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    # Keep only dates for which all 24 hourly partitions have landed
    s = df_src.groupby('date_partition').size()
    src_list = [date for count, date in zip(s, s.index) if count == 24]
    #----
    # If no target, return the source list
    try:
        df_tgt = util.get_pandas_frame(
            util.execute_query(''' show partitions %s;''' % (target_table)))
        df_tgt['date_partition'] = df_tgt['partition'].apply(
            lambda x: x.split('=')[1].replace('-', ''))
        return sorted(list(set(src_list) - set(df_tgt['date_partition'])),
                      reverse=True)
    except Exception:
        # No target table (or no partitions) yet: process every complete source partition
        return sorted(set(src_list), reverse=True)
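A minimal usage sketch for the function above. The sample partition string and the table names are hypothetical placeholders; they only illustrate how the Hive-style partition string is collapsed into a YYYYMMDD key and what the returned list looks like.

# Hypothetical partition string, mirroring the year/month/day/hour layout described above
part_str = 'year=2023/month=01/day=15/hour=07'
date_key = ''.join([x.split('=')[1] for x in part_str.split('/')][:-1])
assert date_key == '20230115'  # the hour component is dropped

# Hypothetical call with placeholder table names
pending = get_pdt_partitions2process_show_partitions('src_db.pdt_table',
                                                     'tgt_db.daily_table',
                                                     env='dev')
# pending would be a descending list such as ['20230115', '20230114', ...]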
Example 2
def get_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena or LGK to derive the to-be-processed partitions.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    athena_ctas_query = '''show partitions %s;''' % (source_table)
    print('athena_ctas_query=', athena_ctas_query)
    result = util.execute_query(athena_ctas_query)
    df_src = util.get_pandas_frame(result)
    # 'year=YYYY/month=MM/day=DD/hour=HH' -> 'YYYY-MM-DD' (the hour component is dropped)
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: '-'.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    #----
    athena_ctas_query = ''' show partitions %s;''' % (target_table)
    print('athena_ctas_query=', athena_ctas_query)
    result = util.execute_query(athena_ctas_query)
    df_tgt = util.get_pandas_frame(result)
    df_tgt['date_partition'] = df_tgt['partition'].apply(
        lambda x: x.split('=')[1])
    return sorted(
        list(set(df_src['date_partition']) - set(df_tgt['date_partition'])),
        reverse=True)
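A small sketch of the key format this variant produces; unlike Example 1, the date key keeps its dashes. The sample partition string is hypothetical.

# Hypothetical partition string in the same Hive layout
part_str = 'year=2023/month=01/day=15/hour=07'
date_key = '-'.join([x.split('=')[1] for x in part_str.split('/')][:-1])
assert date_key == '2023-01-15'  # dashes kept, hour dropped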
def get_pdt_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena or LGK to derive the to-be-processed partitions.
    The source PDT table is partitioned by year/month/day/hour (Hive-style);
    the target table is partitioned in YYYYMMDD format.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (
        env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    data = {'source_table': source_table, 'target_table': target_table}
    query_str_a = '''
       WITH src AS (SELECT concat(year, month, day) AS event_date,
                           cardinality(array_agg(DISTINCT hour)) AS hours_list
                    FROM {source_table}
                    WHERE year = split(cast(current_date AS varchar), '-')[1]
                      AND (month = split(cast(current_date AS varchar), '-')[2]
                           OR month = split(cast(date_add('day', -30, current_date) AS varchar), '-')[2])
                    GROUP BY 1
                    HAVING cardinality(array_agg(DISTINCT hour)) = 24),
            tgt AS (SELECT DISTINCT event_date
                    FROM {target_table})
       SELECT src.event_date
       FROM src LEFT OUTER JOIN tgt ON (src.event_date = tgt.event_date)
       WHERE tgt.event_date IS NULL
       ORDER BY src.event_date DESC '''.format(**data)

    query_str_b = '''
       SELECT concat(year, month, day) AS event_date,
              cardinality(array_agg(DISTINCT hour)) AS hours_list
       FROM %s
       WHERE year = split(cast(current_date AS varchar), '-')[1]
         AND (month = split(cast(current_date AS varchar), '-')[2]
              OR month = split(cast(date_add('day', -30, current_date) AS varchar), '-')[2])
       GROUP BY 1
       HAVING cardinality(array_agg(DISTINCT hour)) = 24
       ORDER BY event_date DESC ''' % (source_table)

    # If no target, return the source list
    try:
        print('query_str_a=', query_str_a)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_a))
        return sorted(list(df_delta['event_date']), reverse=True)
    except Exception:
        print('Inc Query failed! Falling back to query_str_b=', query_str_b)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_b))
        return sorted(list(df_delta['event_date']), reverse=True)
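Compared with the SHOW PARTITIONS variants, this version pushes both the completeness check (24 distinct hours per day) and the source-minus-target delta into a single Athena query. A hypothetical call, with placeholder table names:

# Placeholder table names; both tables are assumed to exist in the Glue catalog
pending_days = get_pdt_partitions2process('src_db.pdt_hit_data',
                                           'tgt_db.hit_data_daily',
                                           env='dev')
print(pending_days)  # e.g. ['20230115', '20230114', ...] for complete days missing from the target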
Example 4
def get_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena or LGK to derive the to-be-processed partitions.
    The source PDT table is partitioned by year/month/day/hour (Hive-style);
    the target table is partitioned in YYYYMMDD format.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (
        env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    data = {'source_table': source_table, 'target_table': target_table}
    query_str_a = '''
      WITH src AS (SELECT DISTINCT event_date
                   FROM {source_table}),
           tgt AS (SELECT DISTINCT event_date
                   FROM {target_table})
      SELECT src.event_date
      FROM src LEFT OUTER JOIN tgt
        ON (src.event_date = tgt.event_date)
      WHERE tgt.event_date IS NULL
      ORDER BY src.event_date DESC '''.format(**data)
   

    query_str_b = '''
       SELECT DISTINCT event_date
       FROM %s
       ORDER BY event_date DESC ''' % (source_table)

    # If no target, return the source list
    try:
        print('query_str_a=', query_str_a)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_a))
        return sorted(list(df_delta['event_date'][1:]), reverse=True)
    except Exception:
        print('Inc Query failed! Falling back to query_str_b=', query_str_b)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_b))
        return sorted(list(df_delta['event_date'][1:]), reverse=True)
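The LEFT OUTER JOIN ... WHERE tgt.event_date IS NULL pattern above is an anti-join, equivalent to the set difference computed by the SHOW PARTITIONS variants. A minimal sketch of that equivalence with made-up dates:

# Made-up event_date values, purely illustrative
src_dates = {'20230113', '20230114', '20230115'}
tgt_dates = {'20230113', '20230114'}
# The anti-join keeps the source dates that have no match in the target
assert sorted(src_dates - tgt_dates, reverse=True) == ['20230115']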
Example 5
def get_pdt_partitions2process(source_table, target_table):
    '''
    Use Athena or LGK to derive the to-be-processed partitions.
    The source PDT table is partitioned by year/month/day/hour (Hive-style);
    the target table is partitioned in YYYYMMDD format.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-dev/apillai/ctas-test'
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    df_src = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (source_table)))
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: ''.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    #----
    df_tgt = util.get_pandas_frame(
        util.execute_query(''' show partitions %s;''' % (target_table)))
    df_tgt['date_partition'] = df_tgt['partition'].apply(
        lambda x: x.split('=')[1].replace('-', ''))
    return sorted(
        list(set(df_src['date_partition']) - set(df_tgt['date_partition'])),
        reverse=True)
def get_record_count(table_name, env):
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/athena_ctas/tmp/' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    sql_query = """select count(1) as ct from %s""" % (table_name)
    df_delta = util.get_pandas_frame(util.execute_query(sql_query))
    return ''.join(list(df_delta['ct']))
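A hypothetical call to get_record_count; the table name is a placeholder. Note that the count comes back as a string (the values of the 'ct' column joined together), not an integer.

# Placeholder table name; the return value is a string such as '12345'
row_count = get_record_count('tgt_db.hit_data_daily', 'dev')
print(int(row_count))  # cast if an integer count is needed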
Example 7
    ON ( d.year = m.year
        AND d.month=m.month
        AND d.day=m.day
        AND   d.col_num = m.col_num )
WHERE m.year = '{year}'
        AND m.month = '{month}'
        AND m.day = '{day}' 
  )
select source_filename, count(1) as rd_ct
from dataset
group by 1
order by 1 
""".format(**data)
print(rd_audit_query)

df_rd = util.get_pandas_frame(util.execute_query(rd_audit_query))
data = {'input_date': input_date}
pdt_audit_query = """ 
select etl_source_filename, count(1) as pdt_ct 
from cnpd_omtr_pdt.hit_data_forqa
where cast (concat(year, '-', month, '-', day) as date) 
between  date_add('day', -2, cast( '{input_date}' as date)) 
and date_add('day', 2, cast( '{input_date}' as date))
group by 1
order by 1           
""".format(**data)
print(pdt_audit_query)

df_pdt = util.get_pandas_frame(util.execute_query(pdt_audit_query))

df_rd_clean = df_rd[1:]
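The snippet ends after trimming the first row of df_rd; the evident intent is an audit comparing the per-file counts returned by the two queries. A hedged sketch of one way that comparison could continue (the column names come from the queries above; the merge itself is an assumption):

# Assumption: compare per-file counts by joining on the filename columns
audit = df_rd_clean.merge(df_pdt,
                          left_on='source_filename',
                          right_on='etl_source_filename',
                          how='outer')
# Rows where rd_ct != pdt_ct (or where either side is missing) would flag a mismatch
mismatches = audit[audit['rd_ct'] != audit['pdt_ct']]
print(mismatches)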
Example 8
import pandas as pd
from move_dl_common_api.athena_util import AthenaUtil

aws_region_name = 'us-west-2'
#s3_bucket = 'aws-athena-query-results-057425096214-us-west-2'
#s3_key = 'Unsaved/Abtest_data'
temp_location = 's3://move-dataeng-temp-dev/sql_refractor/'
result = pd.DataFrame()

# The input file should hold the queries to be executed on Athena, separated by ";"
with open('Input_SQL_Redshift.txt', 'r') as f:
    s = f.read()
    d = s.split(';')

athena_df = pd.DataFrame()
util = AthenaUtil(s3_staging_folder=temp_location)

for query in d:
    try:
        result = util.execute_query(sql_query=query)
        temp = util.get_pandas_frame(result)
        print(temp)
        athena_df = pd.concat([athena_df, temp], ignore_index=True)

    except Exception as e:
        print("Exception")
        print("Not Executed --------- ", query)
        print(e)

#athena_df.columns = ['metric_id', 'start_date', 'end_date', 'fy_monthdimkey', 'metric_value']
athena_df.to_csv('ATHENA_SQL_OUTPUT.csv', index=False)
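For context, a hypothetical Input_SQL_Redshift.txt could be produced as below; the queries and table names are placeholders, and each statement is separated by ';' to match the split performed above. Note that the trailing ';' leaves an empty chunk after the split, which would land in the loop's except branch if Athena rejects it.

# Hypothetical content for Input_SQL_Redshift.txt; table names are placeholders
sample_queries = """
select count(1) as ct from src_db.events;
select event_date, count(1) as ct from src_db.events group by 1 order by 1;
"""
with open('Input_SQL_Redshift.txt', 'w') as f:
    f.write(sample_queries)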