def update_athena_partition(*args, **kwargs):
    """Register the Athena partition for this DAG run's execution month.

    Reads locations/names from Airflow Variables, builds the S3 location
    for ``year=YYYY/month=M`` under the bikeshare prefix, and issues an
    ``ALTER TABLE ... ADD PARTITION`` statement through Athena.

    Expects ``kwargs['ds']`` — the Airflow execution date as 'YYYY-MM-DD'.
    """
    execution_date = datetime.datetime.strptime(kwargs['ds'], '%Y-%m-%d')
    execution_year = execution_date.year
    execution_month = execution_date.month

    s3_prefix = Variable.get('bikeshare_s3_prefix')
    logs_location = Variable.get('bikeshare_athena_logs')
    athena_db = Variable.get('bikeshare_athena_db')
    athena_table_name = Variable.get('bikeshare_athena_table')

    # NOTE(review): the bucket is hard-coded here even though a
    # 'bikeshare_bucket_name' Variable exists in this file — confirm
    # whether the Variable should be used instead of the literal.
    file_location = ('s3://bikeshare-data-copy/' + s3_prefix
                     + f'year={execution_year}/month={execution_month}/')

    # Hive/Athena DDL string literals use single quotes; the original
    # mixed double and single quotes, which Athena can reject.
    partition_update_query = """
        ALTER TABLE {} ADD PARTITION (year='{}', month='{}')
        LOCATION '{}';
    """.format(athena_table_name, execution_year, execution_month,
               file_location)

    athena_hook = AWSAthenaHook(aws_conn_id='aws_credentials')
    # client_request_token makes the Athena call idempotent on retries.
    athena_hook.run_query(
        partition_update_query,
        result_configuration={"OutputLocation": logs_location},
        query_context={"Database": athena_db},
        client_request_token=str(uuid4()))
def check_data_in_redshift(*args, **kwargs):
    """Verify Redshift holds the same row count as S3 for this run's month.

    Counts ``trips`` rows for the execution year/month both through Athena
    (over the S3 partition) and directly in Redshift, and raises
    ``Partial_Data_Missing`` when the counts disagree.

    Expects ``kwargs['ds']`` — the Airflow execution date as 'YYYY-MM-DD'.
    """
    execution_date = datetime.datetime.strptime(kwargs['ds'], '%Y-%m-%d')
    execution_year = execution_date.year
    execution_month = execution_date.month

    logs_location = Variable.get("bikeshare_athena_logs")
    athena_db = Variable.get("bikeshare_athena_db")

    num_records_athena_query = f"""
        SELECT COUNT(*) FROM trips
        WHERE year = {execution_year} AND month = {execution_month}
    """
    num_records_redshift_query = f"""
        SELECT COUNT(*) FROM trips
        WHERE date_part(year, trips.start_time) = {execution_year}
        AND date_part(month, trips.start_time) = {execution_month}
    """

    athena_hook = AWSAthenaHook(aws_conn_id='aws_credentials')
    query_id = athena_hook.run_query(
        query=num_records_athena_query,
        query_context={"Database": athena_db},
        result_configuration={"OutputLocation": logs_location},
        client_request_token=str(uuid4()))
    # Wait for the query to reach a terminal state instead of the original
    # fixed time.sleep(20), which raced slower Athena executions.
    athena_hook.poll_query_status(query_id)

    athena_query_results = athena_hook.get_query_results(
        query_execution_id=query_id)
    # Row 0 of the Athena result set is the header; row 1, column 0 holds
    # the COUNT(*) value as a string.
    num_records_in_s3 = int(
        athena_query_results['ResultSet']['Rows'][1]['Data'][0]['VarCharValue'])

    redshift_hook = PostgresHook('redshift_connection')
    redshift_query_results = redshift_hook.get_records(
        sql=num_records_redshift_query)
    num_records_in_redshift = int(redshift_query_results[0][0])

    logging.info(f"Number of records in S3 - {num_records_in_s3}")
    logging.info(f"Number of records in Redshift - {num_records_in_redshift}")

    # Guard clause: fail fast on a count mismatch.
    if num_records_in_s3 != num_records_in_redshift:
        raise Partial_Data_Missing
    logging.info(
        f"Successfully Copied data for the year - {execution_year}, month - {execution_month}"
    )
def get_hook(self):
    """Build and return the AWSAthenaHook this operator runs queries with.

    Uses the operator's configured connection id and polling interval.
    """
    hook = AWSAthenaHook(self.aws_conn_id, self.sleep_time)
    return hook