Esempio n. 1
0
    def __init__(self, s3_staging_folder_csv, s3_target_folder_parquet,
                 glue_role, glue_script_location):
        """Constructor: bind an Athena staging folder and create the Glue job.

        Parameters:
        s3_staging_folder_csv -- s3 folder with write permissions where Athena
            stores CSV query results
        s3_target_folder_parquet -- s3 folder receiving the Parquet output;
            also used to derive the Glue job name
        glue_role -- IAM role the Glue job runs under
        glue_script_location -- s3 path of the Glue ETL script
        """
        # print() is valid on both Python 2 and 3; the original print
        # statements were Python-2-only syntax.
        print('s3_staging_folder_csv= %s' % s3_staging_folder_csv)
        print('s3_target_folder_parquet = %s' % s3_target_folder_parquet)
        self.s3_staging_folder_csv = s3_staging_folder_csv
        self.s3_target_folder_parquet = s3_target_folder_parquet
        # Athena initialization: all queries stage their CSV output here.
        self.util = AthenaUtil(s3_staging_folder=self.s3_staging_folder_csv)
        # Derive the Glue job name from the parquet target path, e.g.
        # s3://bucket/a/b -> "A.B" (alphanumerics only, title-cased parts).
        job_name = '.'.join(
            re.sub('[^0-9a-zA-Z]+', '', part).title()
            for part in s3_target_folder_parquet.replace('s3://', '').split('/')[1:])
        self.glue_job_name = job_name
        # Bug fix: keep the create_job response on the instance (the local
        # variable was previously assigned and then discarded).
        self.glue_job = self.client.create_job(
            Name=self.glue_job_name,
            Role=glue_role,
            Command={'Name': 'glueetl',
                     'ScriptLocation': glue_script_location})

        self.glue_script_location = glue_script_location
Esempio n. 2
0
def drop_table(table_name, env):
    """Drop an Athena table, staging query output in the env temp bucket.

    Parameters:
    table_name -- fully qualified table to drop
    env -- environment suffix used in the temp bucket name (e.g. 'dev')
    """
    s3_location_target = 's3://move-dataeng-temp-%s/athena_ctas/tmp/' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    sql_query = """drop table %s""" % (table_name)
    # print() is valid on both Python 2 and 3 (other blocks in this file
    # already use the function form).
    print(sql_query)
    QueryExecutionId = util.start_query_execution(sql_query=sql_query)
    results = util.get_results(QueryExecutionId)
    print(results)
Esempio n. 3
0
 def athena_query_execute_save_s3(self):
     """Execute self.sql_query via Athena and save the CSV result to
     self.s3_staging_folder_csv.

     Returns True unconditionally; errors surface as AthenaUtil exceptions.
     """
     # print() is valid on both Python 2 and 3; also dropped the pointless
     # `text` temporaries the original assigned before each print.
     print("AthenaUtil Initialization ..")
     util = AthenaUtil(s3_staging_folder=self.s3_staging_folder_csv)
     print("Started athena_query_execute_save_s3 ..")
     print('athena_ctas_query= %s' % self.sql_query)
     util.execute_save_s3(self.sql_query, self.s3_staging_folder_csv)
     return True
Esempio n. 4
0
def extract_athena_data(weekly_query):
    """Run each weekly KPI query against Athena and append the results to
    WeeklyKpiReport.xls, one row per metric, plus a ready-made INSERT
    statement per row in column 11.

    Parameters:
    weekly_query -- dict mapping metric name -> Athena SQL string
        (assumes each query returns exactly one row -- TODO confirm)

    NOTE(review): depends on module-level names not visible here:
    `weekstartdate` (sheet name), `copy` (presumably xlutils.copy), and the
    xlrd/xlwt/os imports -- confirm they exist at file scope.
    All errors are caught and printed; the function never raises.
    """
    try:
        print ("Extract the data from Athena...!!!")
        #conn = connect_athena()
        #curr = conn.cursor()
        # Append to an existing workbook when present, else start fresh.
        if os.path.exists('WeeklyKpiReport.xls'):
            wbk = xlrd.open_workbook('WeeklyKpiReport.xls', formatting_info=True)
            workbook = copy(wbk)
        else:
            workbook = xlwt.Workbook()

        worksheet = workbook.add_sheet(weekstartdate)
        '''bold = workbook.add_format({'bold': True})
        date_format = workbook.add_format({'num_format': 'yyyy-mm-dd'})
        text_format = workbook.add_format()

        # Add a number format for cells with money.
        money = workbook.add_format({'num_format': '$#,##0'})'''
        # xlwt style applied to date-like columns below.
        date_format = xlwt.XFStyle()
        date_format.num_format_str = 'yyyy-mm-dd'
        # Write some data headers.
        worksheet.write(0, 0, 'metric_name')
        worksheet.write(0, 1, 'metric_id')
        worksheet.write(0, 2, 'Week Start Date')
        worksheet.write(0, 3, 'Week End Date')
        worksheet.write(0, 4, 'time_dim_key')
        worksheet.write(0, 5, 'metric_source_id')
        worksheet.write(0, 6, 'metric_value')
        worksheet.write(0, 7, 'created_by')
        worksheet.write(0, 8, 'created_date')
        worksheet.write(0, 9, 'updated_by')
        worksheet.write(0, 10, 'updated_date')
        row = 1
        col = 0

        util = AthenaUtil(s3_staging_folder='s3://move-dataeng-temp-prod/athena-results/')
        for k, v in weekly_query.items():
            print (v)
            #curr.execute(v)
            #queryoutput = curr.fetchone()
            # First result row only; cells come back as Athena Datum dicts.
            snap_result = util.execute_query(sql_query=v, use_cache=False)
            queryoutput = snap_result["ResultSet"]["Rows"][0]['Data']
            print (queryoutput[0])
            worksheet.write(row, col, k)
            for i in range(len(queryoutput)):
                # Columns 2,3,8,10 of the sheet (i == 1,2,7,9) get the date style.
                if i == 1 or i == 2 or i == 7 or i == 9:
                    worksheet.write(row, col + 1 + i, queryoutput[i]['VarCharValue'], date_format)
                else:
                    worksheet.write(row, col + 1 + i, queryoutput[i]['VarCharValue'])

            # Build (but do not execute) the fact-table INSERT for this row;
            # it is written to the sheet for later use.
            query = ("""INSERT INTO dm_rdckpi.metric_actual_fact(metric_id, timedimkey, metric_source_id, metric_value, created_by, created_date, updated_by, updated_date) VALUES(%s,'%s',%s,%s,'%s','%s','%s','%s');""" %(queryoutput[0]['VarCharValue'],queryoutput[3]['VarCharValue'],queryoutput[4]['VarCharValue'],queryoutput[5]['VarCharValue'],queryoutput[6]['VarCharValue'],queryoutput[7]['VarCharValue'],queryoutput[8]['VarCharValue'], queryoutput[9]['VarCharValue']))
            worksheet.write(row, 11, query)
            row += 1
        workbook.save('WeeklyKpiReport.xls')

    except Exception as err:
        print ("Here is the error...('%s')" %(err))
Esempio n. 5
0
def refresh_partitions(target_table, env='dev'):
    '''
    Refresh Hive partition metadata for *target_table* by running Athena's
    "msck repair table".

    Parameters:
    target_table -- fully qualified table name
    env -- environment suffix for the temp staging bucket (default 'dev')

    Returns the AthenaUtil execute_query result.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    athena_ctas_query = '''msck repair table   %s;''' % (target_table)
    # print() is valid on both Python 2 and 3; also dropped the unused
    # json/sys imports the original carried.
    print('athena_ctas_query= %s' % athena_ctas_query)
    result = util.execute_query(athena_ctas_query)
    return result
Esempio n. 6
0
    def start_query_execution_and_wait_for_completion(self, sql):
        """Execute *sql* on Athena and block until it completes, retrying up
        to 3 times with quadratic backoff on errors or failed runs.

        A per-thread AthenaUtil is cached on `thread_data` so concurrent
        callers do not share a client.

        Parameters:
        sql -- query to execute

        Returns:
        query_status_result dictionary with following structure:
                    {"SUCCESS" : False | True,
                    "STATUS" :  status
                    , "QUERY_TYPE" : "FIRST WORD OF QUERY e.g. SELECT or INSERT or ALTER"
                    , "QUERY" : "ACTUAL QUERY"
                    , "StateChangeReason" : None | "Error string if any"}
        or None when all attempts fail.
        """
        for attempt in range(3):
            try:
                # Lazily create one AthenaUtil per thread.
                if not hasattr(thread_data, 'util'):
                    thread_data.util = AthenaUtil(self.s3_staging_folder)

                util = thread_data.util
                result = util.start_query_execution(sql_query=sql)
                # NOTE(review): this double-underscore attribute is
                # name-mangled against the *enclosing* class; it only
                # resolves if that class is AthenaUtil itself -- confirm.
                query_status_result = util.__wait_for_query_to_complete(result)
                if query_status_result["SUCCESS"]:
                    return query_status_result
                # Unsuccessful run: fall through and retry.
            except Exception as e:
                print("attempt", attempt, str(e), sql)
                # Backoff: 1s, 4s, 9s.
                time.sleep((attempt + 1)**2)
        return None
Esempio n. 7
0
def get_pdt_partitions2process_show_partitions(source_table,
                                               target_table,
                                               env='dev'):
    '''
    Use Athena "show partitions" to derive to-be-processed partitions.
    Source PDT table partition in year/month/day/hour format (Hive);
    target table partition in YYYY-MM-DD format, normalized to YYYYMMDD.

    Returns source dates (only days with all 24 hourly partitions) missing
    from the target, newest first.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (
        env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    df_src = util.get_pandas_frame(
        util.execute_query('''show partitions %s;''' % (source_table)))
    # year=Y/month=M/day=D/hour=H -> "YMD" (drop the hour component)
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: ''.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    s = df_src.groupby('date_partition').size()
    # Keep only days with a complete set of 24 hourly partitions.
    src_list = [y for x, y in zip(s, s.index) if x == 24]
    # If the target is missing/unreadable, return the full source list.
    try:
        df_tgt = util.get_pandas_frame(
            util.execute_query(''' show partitions %s;''' % (target_table)))
        df_tgt['date_partition'] = df_tgt['partition'].apply(
            lambda x: x.split('=')[1].replace('-', ''))
        return sorted(list(set(src_list) - set(df_tgt['date_partition'])),
                      reverse=True)
    except Exception:
        # Bug fix: was a bare "except:", which also swallowed
        # SystemExit/KeyboardInterrupt.
        return sorted(list(set(src_list)), reverse=True)
Esempio n. 8
0
def get_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena "show partitions" to derive to-be-processed partitions.

    Source partitions are Hive-style multi-level (values joined with '-',
    last level dropped); target partitions are single-valued dates.
    Returns source dates missing from the target, newest first.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/apillai/ctas-test' % (env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    athena_ctas_query = '''show partitions %s;''' % (source_table)
    # print() is valid on both Python 2 and 3.
    print('athena_ctas_query= %s' % athena_ctas_query)
    df_src = util.get_pandas_frame(util.execute_query(athena_ctas_query))
    # year=Y/month=M/day=D/hour=H -> "Y-M-D" (drop the last component)
    df_src['date_partition'] = df_src['partition'].apply(
        lambda part_str: '-'.join(
            [x.split('=')[1] for x in part_str.split('/')][:-1]))
    #----
    athena_ctas_query = ''' show partitions %s;''' % (target_table)
    print('athena_ctas_query= %s' % athena_ctas_query)
    df_tgt = util.get_pandas_frame(util.execute_query(athena_ctas_query))
    df_tgt['date_partition'] = df_tgt['partition'].apply(
        lambda x: x.split('=')[1])
    return sorted(
        list(set(df_src['date_partition']) - set(df_tgt['date_partition'])),
        reverse=True)
def get_pdt_partitions2process(source_table, target_table, env='dev'):
    '''
    Use Athena to derive to-be-processed partitions for the last ~30 days.
    Source PDT table partition in year/month/day/hour format (Hive);
    target table partition in YYYYMMDD format.

    Tries an incremental anti-join query first (source days with all 24
    hours that are absent from the target); if that fails -- e.g. the
    target table does not exist yet -- falls back to returning every
    complete source day. Result is newest first.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' % (
        env)
    util = AthenaUtil(s3_staging_folder=s3_location_target)
    data = {'source_table': source_table, 'target_table': target_table}
    query_str_a = '''
       WITH src as(select concat(year, month, day) as event_date, cardinality(array_agg(distinct  hour  ) )as hours_list 
                    from {source_table}  
                    where  year  =  split(cast(current_date as varchar), '-')[1]
                    and ( month  =  split(cast(current_date as varchar), '-')[2]  
                       OR month  =  split(cast(date_add('day', -30, current_date) as varchar), '-')[2] 
                        )
                    group by 1 
                    having cardinality(array_agg(distinct  hour  ) ) = 24 ),
            tgt as ( select distinct event_date
                       from {target_table} )
       select src.event_date 
       from src left outer join tgt on (src.event_date = tgt.event_date )
       where tgt.event_date IS NULL
       order by src.event_date desc '''.format(**data)

    query_str_b = '''
       select concat(year, month, day) as event_date, cardinality(array_agg(distinct  hour  ) )as hours_list 
                    from %s
                    where  year  =  split(cast(current_date as varchar), '-')[1]
                    and ( month  =  split(cast(current_date as varchar), '-')[2]  
                       OR month  =  split(cast(date_add('day', -30, current_date) as varchar), '-')[2] 
                        )
                    group by 1 
                    having cardinality(array_agg(distinct  hour  ) ) = 24  
                    order by event_date desc ''' % (source_table)

    # If no target, return the source list.
    try:
        # print() is valid on both Python 2 and 3.
        print('query_str_a= %s' % query_str_a)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_a))
        return sorted(list(df_delta['event_date']), reverse=True)
    except Exception:
        # Bug fix: was a bare "except:", which also swallowed
        # SystemExit/KeyboardInterrupt.
        print('Inc Query failed! Falling back to query_str_b= %s' % query_str_b)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_b))
        return sorted(list(df_delta['event_date']), reverse=True)
Esempio n. 10
0
def get_partitions2process(source_table, target_table, env='dev' ):
    '''
    Use Athena to derive to-be-processed partitions.
    Source PDT table partition in year/month/day/hour format (Hive);
    target table partition in YYYYMMDD format.

    Tries an anti-join of distinct source vs. target event_dates first;
    if that fails (e.g. no target table yet) falls back to all distinct
    source dates. Result is newest first.
    '''
    from move_dl_common_api.athena_util import AthenaUtil
    s3_location_target = 's3://move-dataeng-temp-%s/glue-ctas/athena-results' %(env)
    util = AthenaUtil(s3_staging_folder = s3_location_target)
    data = { 'source_table': source_table,'target_table':target_table}
    query_str_a = '''
      WITH src as(SELECT DISTINCT event_date  
                  FROM  {source_table}  ),
          tgt as( SELECT DISTINCT event_date  
                  FROM {target_table} )
    select   src.event_date
    from src left outer join  tgt
    ON (src.event_date = tgt.event_date )
    WHERE tgt.event_date IS NULL
    ORDER BY  src.event_date DESC '''.format(**data)

    query_str_b = '''
       select distinct  event_date 
                     from %s
                     order by event_date desc ''' %(source_table)

    # If no target, return the source list.
    try:
        # print() is valid on both Python 2 and 3.
        print('query_str_a= %s' % query_str_a)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_a) )
        # NOTE(review): the [1:] slice drops the first row -- presumably a
        # header artifact from the result frame; confirm before removing.
        return sorted(list(df_delta['event_date'][1:] ), reverse=True)
    except Exception:
        # Bug fix: was a bare "except:", which also swallowed
        # SystemExit/KeyboardInterrupt.
        print('Inc Query failed! Falling back to query_str_b= %s' % query_str_b)
        df_delta = util.get_pandas_frame(util.execute_query(query_str_b) )
        return sorted(list(df_delta['event_date'][1:] ), reverse=True)
Esempio n. 11
0
def get_pdt_partitions2process(source_table, target_table):
    """Return source partition dates missing from the target, newest first.

    Source PDT table partitions are Hive year/month/day/hour style and are
    collapsed to YYYYMMDD (hour dropped); target partitions are single
    YYYY-MM-DD values, normalized to YYYYMMDD for the comparison.
    """
    from move_dl_common_api.athena_util import AthenaUtil
    import json
    import sys
    athena = AthenaUtil(
        s3_staging_folder='s3://move-dataeng-temp-dev/apillai/ctas-test')

    # Source: year=Y/month=M/day=D/hour=H -> "YMD" (drop the hour part).
    src_frame = athena.get_pandas_frame(
        athena.execute_query('''show partitions %s;''' % (source_table)))
    collapse = lambda part_str: ''.join(
        [x.split('=')[1] for x in part_str.split('/')][:-1])
    src_dates = set(src_frame['partition'].apply(collapse))

    # Target: event_date=YYYY-MM-DD -> "YYYYMMDD".
    tgt_frame = athena.get_pandas_frame(
        athena.execute_query(''' show partitions %s;''' % (target_table)))
    tgt_dates = set(
        tgt_frame['partition'].apply(lambda x: x.split('=')[1].replace('-', '')))

    return sorted(src_dates - tgt_dates, reverse=True)
def get_record_count(table_name, env):
    """Return the row count of *table_name* (as a string) via Athena."""
    staging = 's3://move-dataeng-temp-%s/athena_ctas/tmp/' % (env)
    athena = AthenaUtil(s3_staging_folder=staging)
    count_frame = athena.get_pandas_frame(
        athena.execute_query("""select count(1) as ct from %s""" % (table_name)))
    return ''.join(list(count_frame['ct']))
Esempio n. 13
0
from move_dl_common_api.athena_util import AthenaUtil

import boto3, sys, json, time, uuid, datetime, botocore, re, uuid
import pandas as pd
import json


def get_hour_from_filename(in_filename):
    """Extract 'YYYYMMDDHH' from a path like '.../prefix_YYYYMMDD-HHMMSS.ext'.

    The basename is split on '.', its first '_'-delimited field after the
    prefix is split on '-' into date and time; only the first two time
    digits (the hour) are kept.
    """
    # Renamed locals: the original's `time` shadowed the stdlib time module
    # imported at file scope.
    date_part, time_part = (
        in_filename.split('/')[-1].split('.')[0].split('_')[1].split('-'))
    return date_part + time_part[:2]


# --- Script setup: audit one day's raw-data (RD/PDT) partition -----------
env = 'dev'
s3_location_target = 's3://move-dataeng-temp-%s/apillai/audit-rd-pdt' % (env)
# Athena staging for any queries run by this script.
util = AthenaUtil(s3_staging_folder=s3_location_target)

##--
# Command-line interface: requires --input_date in yyyy-mm-dd format.
from argparse import ArgumentParser
parser = ArgumentParser(
    description="Audit  program to be called with manadatory Input Arguments")
parser.add_argument('--input_date', help='RD Input date in yyyy-mm-dd format')
#input_date = '2017-12-01'
ar = parser.parse_args()
input_date = ar.input_date
s3_base_location = 's3://move-dataeng-omniture-prod/homerealtor/raw-data-uncompressed/hit_data'
y, m, d = input_date.split('-')
table_name = 'move_dl.hit_data_raw'
# DDL registering the day's partition against the raw-data s3 location.
# NOTE(review): sql_text is built but not executed in this fragment --
# presumably run further down the file; confirm.
sql_text = """
ALTER TABLE %s ADD IF NOT EXISTS PARTITION  (year='%s',month='%s',day='%s') location '%s/year=%s/month=%s/day=%s';
""" % (table_name, y, m, d, s3_base_location, y, m, d)
Esempio n. 14
0
class AthenaCTAS(object):
    """CTAS-style pipeline: run an Athena query that lands CSV in a staging
    folder, then run a Glue ETL job that converts the CSV to Parquet."""

    region_name = 'us-west-2'
    # NOTE(review): the client is created at class-definition time, so merely
    # importing this module requires boto3 and valid AWS credentials.
    client = boto3.client(service_name='glue',
                          region_name='us-west-2',
                          endpoint_url='https://glue.us-west-2.amazonaws.com')

    s3_staging_folder_csv = None      # Athena CSV staging folder
    s3_target_folder_parquet = None   # final Parquet destination
    glue_script_location = None       # s3 path of the Glue ETL script
    glue_job_name = None              # derived from the Parquet target path
    glue_role = None
    util = None                       # AthenaUtil bound to the staging folder
    glue_job = None                   # create_job response

    def __init__(self, s3_staging_folder_csv, s3_target_folder_parquet,
                 glue_role, glue_script_location):
        """Constructor: bind the staging folder and create the Glue job.

        Parameters:
        s3_staging_folder_csv -- s3 folder with write permissions for results
        s3_target_folder_parquet -- s3 folder for Parquet output; also used
            to derive the Glue job name
        glue_role -- IAM role the Glue job runs under
        glue_script_location -- s3 path of the Glue ETL script
        """
        # print() is valid on both Python 2 and 3; the original print
        # statements were Python-2-only syntax.
        print('s3_staging_folder_csv= %s' % s3_staging_folder_csv)
        print('s3_target_folder_parquet = %s' % s3_target_folder_parquet)
        self.s3_staging_folder_csv = s3_staging_folder_csv
        self.s3_target_folder_parquet = s3_target_folder_parquet
        # Athena initialization
        self.util = AthenaUtil(s3_staging_folder=self.s3_staging_folder_csv)
        # Derive the Glue job name from the Parquet target path, e.g.
        # s3://bucket/a/b -> "A.B" (alphanumerics only, title-cased parts).
        job_name = '.'.join(
            re.sub('[^0-9a-zA-Z]+', '', part).title()
            for part in s3_target_folder_parquet.replace('s3://', '').split('/')[1:])
        self.glue_job_name = job_name
        # Bug fix: store the create_job response (the glue_job class
        # attribute was declared above but never assigned).
        self.glue_job = self.client.create_job(
            Name=self.glue_job_name,
            Role=glue_role,
            Command={'Name': 'glueetl',
                     'ScriptLocation': glue_script_location})

        self.glue_script_location = glue_script_location

    def athena_query_execute_save_s3(self, sql_query):
        """Run *sql_query* via Athena and save results to the CSV staging
        folder. Returns True; errors surface as AthenaUtil exceptions."""
        print("Started athena_query_execute_save_s3 ..")
        print('athena_ctas_query= %s' % sql_query)
        self.util.execute_save_s3(sql_query, self.s3_staging_folder_csv)
        return True

    def wait_for_job_to_complete(self, JobRunId):
        """Poll the Glue job run until it leaves QUEUED/RUNNING/STARTING.

        Returns the last get_job_run response, or
        {"SUCCESS": False, "STATUS": status} when no response was obtained.
        """
        print('Waiting for JobName = %s and  JobId=%s  to Complete processing ...'
              % (self.glue_job_name, JobRunId))
        error_count = 0
        response = self.client.get_job_run(JobName=self.glue_job_name,
                                           RunId=JobRunId)
        status = response["JobRun"]["JobRunState"]
        # Bug fix: the original tested membership against the single garbled
        # string "QUEUED','RUNNING, STARTING", so the polling loop never ran.
        # JobRunState: STARTING|RUNNING|STOPPING|STOPPED|SUCCEEDED|FAILED
        while status in ("QUEUED", "RUNNING", "STARTING"):
            try:
                response = self.client.get_job_run(JobName=self.glue_job_name,
                                                   RunId=JobRunId)
                status = response["JobRun"]["JobRunState"]
                time.sleep(0.5)
            except botocore.exceptions.ClientError as ce:
                error_count = error_count + 1
                if (error_count > 3):
                    status = "FAILED"
                    print(str(ce))
                    break  # out of the loop
                if "ExpiredTokenException" in str(ce):
                    # Rebuild the client when credentials expire mid-poll.
                    self.client = boto3.session.Session(
                        region_name=self.region_name).client('glue')

        if response is None:
            return {"SUCCESS": False, "STATUS": status}
        else:
            return response

    def glue_etl_execute_csv2parquet(self, target_table_name):
        """Start the Glue job converting the staged CSV to Parquet.

        Returns the start_job_run response immediately; pair with
        wait_for_job_to_complete() to block on completion.
        """
        print('Starting glue_etl_execute_csv2parquet process  for %s ....'
              % (self.glue_job_name))
        print('s3_staging_folder_csv= %s' % self.s3_staging_folder_csv)
        print('s3_target_folder_parquet = %s' % self.s3_target_folder_parquet)
        print('')
        response = self.client.start_job_run(
            JobName=self.glue_job_name,
            Arguments={
                '--s3_location_csv_file': self.s3_staging_folder_csv,
                '--s3_location_parquet_file': self.s3_target_folder_parquet,
                '--table_name': target_table_name,
            })
        return response

    def job_cleanup(self):
        """Delete the Glue job created in __init__."""
        self.banner("Job Cleanup")
        return self.client.delete_job(JobName=self.glue_job_name)

    @staticmethod
    def banner(text, ch='=', length=78):
        """Return *text* centered in a banner of *ch*, *length* chars wide.

        NOTE(review): the banner is returned, never printed, and callers
        ignore the return value -- confirm whether it should be printed.
        """
        spaced_text = ' %s ' % text
        banner = spaced_text.center(length, ch)
        return banner
Esempio n. 15
0
# Batch-run the SQL statements from Input_SQL_Redshift.txt against Athena,
# concatenate every result set into one DataFrame, and write it to
# ATHENA_SQL_OUTPUT.csv. Failed statements are reported and skipped.
import pandas as pd

aws_region_name = 'us-west-2'
#s3_bucket = 'aws-athena-query-results-057425096214-us-west-2'
#s3_key = 'Unsaved/Abtest_data'
temp_location = 's3://move-dataeng-temp-dev/sql_refractor/'
result = pd.DataFrame()

# Input_SQL_Redshift.txt holds the queries to be executed on Athena,
# separated by ";".
with open('Input_SQL_Redshift.txt', 'r') as f:
    queries = f.read().split(';')

athena_df = pd.DataFrame()
util = AthenaUtil(s3_staging_folder=temp_location)

# Renamed the loop variable from `_` -- it carries a real value (the SQL
# text), so `_` wrongly signalled "unused".
for query in queries:
    try:
        result = util.execute_query(sql_query=query)
        frame = util.get_pandas_frame(result)
        print(frame)
        athena_df = pd.concat([athena_df, frame], ignore_index=True)

    except Exception as e:
        print("Exception")
        print("Not Executed --------- ", query)
        print(e)

#athena_df.columns = ['metric_id', 'start_date', 'end_date', 'fy_monthdimkey', 'metric_value']
athena_df.to_csv('ATHENA_SQL_OUTPUT.csv', index=False)