def insert_master():
    """Create the lead-country-to-geo-code mapping table and populate the
    master seller tables in Redshift by executing the bundled SQL scripts.

    Side effects: runs DDL/DML against Redshift (autocommit); reads the
    scripts from the local ``sql/`` directory.
    """
    # Scripts run in this exact order: the geo-code mapping table is
    # created first, before the master-table inserts.
    script_paths = (
        'sql/create_leadcountry_to_geo_code.sql',
        'sql/insert_mst_sellerleads.sql',
        'sql/insert_mst_selleropportunities.sql',
        'sql/insert_mst_sellercampaigns.sql',
    )
    connection = get_redshift_connection()
    try:
        for path in script_paths:
            # `with` closes each file handle; the original leaked them.
            with open(path, 'r') as script:
                connection.execution_options(autocommit=True).execute(script.read())
    finally:
        # Release the connection even if a script fails mid-sequence.
        connection.close()
def insert_csv_data_tab3():
    """Bulk-load the tab-3 seller-campaigns CSV from S3 into the
    us_gtmsales.stg_sellercampaigns_v1 staging table via a Redshift COPY.

    The source file lives under <bucket>/<year>/<month>/Successful Files/;
    S3 credentials come from get_s3_config().
    """
    s3_conf = get_s3_config()
    source_path = "s3://{}/{}/{}/Successful Files/{}".format(
        get_s3_bucket(),
        get_year_from_sys_args(),
        get_month_from_sys_args(),
        get_tab3_filename(),
    )
    copy_stmt = (
        "COPY us_gtmsales.stg_sellercampaigns_v1(selleruid, sellercompanyname, "
        "gtmcampaignsource, campaignname, "
        "crmsystemcampaignid, campaigncreatedate, investment, boxmonth, boxyear, "
        "insertiondate) "
        "FROM '{}' "
        "CREDENTIALS 'aws_access_key_id={};aws_secret_access_key={}' "
        "CSV timeformat 'auto' dateformat 'auto';"
    ).format(source_path, s3_conf['access_key_id'], s3_conf['secret_access_key'])
    conn = get_redshift_connection()
    conn.execution_options(autocommit=True).execute(copy_stmt)
    conn.close()
def validate_lead_country_column(sheet, numrows, col, Column_missing_tab1, one_based_column_position, log_file):
    """Validate every Lead Country cell in column ``col`` against the
    country names/codes stored in Redshift (us_gtmsales.countries).

    :param sheet: spreadsheet sheet exposing cell_value(row, col)
                  (presumably an xlrd sheet — TODO confirm)
    :param numrows: total row count including the header row (row 0 is
                    skipped, hence the ``- 1`` in the PASSED totals)
    :param col: zero-based column index of the Lead Country column
    :param Column_missing_tab1: collection of one-based positions of
                                columns missing from tab 1
    :param one_based_column_position: this column's one-based position
    :param log_file: open file object that receives the validation report
    :return: number of rows that failed validation
    """
    err_cnt_lead_country = 0
    # query for obtaining all the possible names of a lead country
    sql_possible_countries = '''SELECT country_name FROM us_gtmsales.countries UNION SELECT country_code FROM us_gtmsales.countries UNION SELECT country_code_iso_3a FROM us_gtmsales.countries;'''
    # open one Redshift connection to use for the lead country value in each row
    redshift_connection = get_redshift_connection()
    try:
        # query Redshift and get the results in a dataframe
        df_possible_countries = pd.read_sql(sql_possible_countries, con=redshift_connection)
        # store the possible countries in a set for faster lookups
        set_possible_countries = set(df_possible_countries['country_name'])
        for row in range(1, numrows):
            lead_country = sheet.cell_value(row, col)
            try:
                is_valid = lead_country.upper() in set_possible_countries
            except AttributeError:
                # Non-string cell (e.g. a numeric or empty value has no
                # .upper()) counts as invalid. The original bare except
                # also swallowed KeyboardInterrupt/SystemExit.
                is_valid = False
            if not is_valid:
                err_cnt_lead_country += 1
                print('Lead Country column error on Row: {} Column: {} Value: {}'.format(row + 1, col + 1, lead_country), file=log_file)
        passed = numrows - err_cnt_lead_country
        # PASS requires zero bad cells AND the column being present in tab 1.
        if err_cnt_lead_country == 0 and one_based_column_position not in Column_missing_tab1:
            print('Validation on Lead Country : PASS (Satisfied Conditions :: found lead country and corresponding lead region)', file=log_file)
        else:
            print('Validation on Lead Country : FAIL', file=log_file)
        # passed - 1 excludes the header row from the data-row totals.
        print('Total number of rows PASSED : {}'.format(passed - 1), file=log_file)
        print('Total number of rows FAILED : {}'.format(err_cnt_lead_country), file=log_file)
    finally:
        # close the Redshift connection even if the query or loop fails
        redshift_connection.close()
    return err_cnt_lead_country
def insert_csv_data_tab2():
    """Bulk-load the tab-2 seller-opportunities CSV from S3 into the
    us_gtmsales.stg_selleropportunities_v1 staging table via Redshift COPY.

    The source file lives under <bucket>/<year>/<month>/Successful Files/;
    S3 credentials come from get_s3_config().
    """
    connection = get_redshift_connection()
    # NOTE(fix): the original concatenation produced "insertiondate )FROM"
    # (no space before FROM); normalized to "insertiondate) FROM" to match
    # the other COPY helpers in this file.
    q3 = "COPY us_gtmsales.stg_selleropportunities_v1(" \
         "selleruid, sellercompanyname, gtmcampaignsource, campaignname, crmsystemcampaignid, " \
         "campaigncreatedate, opportunityid, convertdate, opportunitycountry, " \
         "opportunitystatus, awsmarketopportunity, pipelinerevenue, accountname, accountid, windate, billedrevenue, " \
         "boxmonth, boxyear, insertiondate) " \
         "FROM 's3://{}/{}/{}/Successful Files/{}' " \
         "CREDENTIALS 'aws_access_key_id={};aws_secret_access_key={}' " \
         "CSV timeformat 'auto' dateformat 'auto';".format(
             get_s3_bucket(), get_year_from_sys_args(), get_month_from_sys_args(),
             get_tab2_filename(), get_s3_config()['access_key_id'],
             get_s3_config()['secret_access_key'])
    connection.execution_options(autocommit=True).execute(q3)
    connection.close()
def setUp(self):
    """Initialize AWS service clients, the Redshift connection, the Glue
    job matrix this suite exercises, and a JSON-formatted logger.

    Side effects: opens a Redshift connection (self.db_conn); presumably
    closed by tearDown — TODO confirm.
    """
    # access glue service (explicit us-east-1 endpoint)
    self.glue = boto3.client(
        service_name='glue',
        region_name='us-east-1',
        endpoint_url='https://glue.us-east-1.amazonaws.com'
    )
    # Create CloudWatch client
    self.cloudwatch = boto3.client('cloudwatch')
    # configure redshift access
    self.db_conn = get_redshift_connection()
    # access s3 storage
    self.s3 = boto3.resource('s3')
    # get json file for this test suite
    self.json_results = {}
    # define the jobs list, including initial params: per job, the launch
    # args plus the S3 artifacts ('bucket', 'initial_folders',
    # 'tables'/'files', 'file_extension') or DB outputs
    # ('output_db'/'output_table') the tests verify.
    # NOTE(fix): removed a stray quote after the 'output_table' value of
    # Migrate_EDUDirect_to_Redshift that left an unterminated string
    # literal (syntax error) in the original.
    self.job_list = {
        'EDUDirect_to_parquet_last_N_months': {
            'args': {
                '--MONTHS': '3',
                '--ALL_TABLES': 'False'
            },
            'bucket': 'highereducation-dw-transformed-data',
            'date_partition': True,
            'initial_folders': ['EDUDirectDB'],
            'tables': [
                'cddirect_production_lead',
                'cddirect_production_visitor'
            ],
            'file_extension': 'parquet',
            'job_type': ['file_creation']
        },
        'EDUDirect_to_parquet_replace': {
            'bucket': 'highereducation-dw-transformed-data',
            'initial_folders': ['EDUDirectDB'],
            'tables': [
                'cddirect_production_lead_cap',
                'cddirect_production_migration_versions',
                'cddirect_production_school_campus_program',
                'cddirect_production_school_criteria',
                'cddirect_production_school_criteria_affiliate',
                'cddirect_production_school_multilead_segment',
                'cddirect_production_visitor_tag',
                'cddirect_production_zip_state'
            ],
            'file_extension': 'parquet',
            'job_type': ['file_creation']
        },
        'EDUDirect_to_parquet_new_snapshot': {
            'bucket': 'highereducation-dw-transformed-data',
            'initial_folders': ['EDUDirectDB'],
            'tables': [
                'cddirect_production_affiliate',
                'cddirect_production_country',
                'cddirect_production_education_level',
                'cddirect_production_publisher',
                'cddirect_production_school',
                'cddirect_production_school_alias',
                'cddirect_production_school_campus',
                'cddirect_production_school_eligible_country',
                'cddirect_production_school_eligible_state',
                'cddirect_production_school_program',
                'cddirect_production_school_program_ineligible_state',
                'cddirect_production_school_provider',
                'cddirect_production_school_provider_campus',
                'cddirect_production_school_provider_cap',
                'cddirect_production_school_provider_cap_program',
                'cddirect_production_school_provider_cap_publisher',
                'cddirect_production_school_provider_cap_state',
                'cddirect_production_school_provider_category',
                'cddirect_production_school_provider_education_level',
                'cddirect_production_school_provider_leadid_flag',
                'cddirect_production_school_provider_program',
                'cddirect_production_school_publisher',
                'cddirect_production_school_targus_score',
                'cddirect_production_state',
                'cddirect_production_tag',
                'cddirect_production_targus_score',
                'cddirect_production_user',
                'cddirect_production_widget_category',
                'cddirect_production_widget_degree',
                'cddirect_production_widget_degree_recommendation',
                'cddirect_production_widget_subject',
                'cddirect_production_widget_subject_alias',
                'cddirect_production_widget_subject_recommendation',
                'form_position_csv'
            ],
            'date_partition': True,
            'file_extension': 'parquet',
            'job_type': ['file_creation']
        },
        'EDUDirect_to_parquet_current_dimensions': {
            'bucket': 'highereducation-dw-transformed-data',
            'initial_folders': ['EDUDirectDB-current'],
            'files': [
                'cddirect_production_affiliate',
                'cddirect_production_country',
                'cddirect_production_lead_cap',
                'cddirect_production_publisher',
                'cddirect_production_school',
                'cddirect_production_school_program',
                'cddirect_production_school_provider',
                'cddirect_production_school_provider_cap',
                'cddirect_production_school_provider_category',
                'cddirect_production_school_provider_program',
                'cddirect_production_school_provider_education_level',
                'cddirect_production_state',
                'cddirect_production_user',
                'cddirect_production_widget_category',
                'cddirect_production_widget_degree',
                'cddirect_production_widget_subject',
                'cddirect_production_widget_degree_recommendation',
                'cddirect_production_widget_subject_recommendation',
                'form_position_csv',
            ],
            'file_extension': 'parquet',
            'job_type': ['file_creation']
        },
        'EDUDirect_user_agent': {
            'args': {
                '--TYPE': 'historical',
            },
            'bucket': 'highereducation-dw-transformed-data',
            'tables': ['user_agent'],
            'file_extension': 'parquet',
            'job_type': ['file_creation']
        },
        'EDUDirect_to_staging': {
            'args': {
                '--TYPE': 'historical',
                '--ENVIRONMENT': 'dev',
                '--START_DATE': '000',
                '--END_DATE': '000',
            },
            'bucket': 'highereducation-dw-staging-data',
            'initial_folders': ['EDUDirectDB', 'tmp'],
            'date_partition': True,
            'file_extension': 'parquet',
            'job_type': ['file_creation']
        },
        'EDUDirect_related_subject': {
            'args': {
                '--TYPE': 'historical',
                '--ENVIRONMENT': 'dev',
                '--DATABASE': 'highereducation-dw-edudirectdb-parquet-current',
                '--START_DATE': '000',
                '--END_DATE': '000',
            },
            'bucket': 'highereducation-dw-staging-data',
            'initial_folders': ['EDUDirectDB', 'env'],
            'date_partition': True,
            'file_extension': 'parquet',
            'job_type': ['file_creation']
        },
        # migration to redshift
        'UpdateCrawler': {
            'args': {
                '--CRAWLER': 'highereducation-dw-edudirectdb-staging'
            },
            'job_type': ['update_db']
        },
        'Migrate_EDUDirect_to_Redshift': {
            'args': {
                '--ORIGIN': 'stag-platformevents-db-staging.normalized_events_fact_table_stag_current',  # database and table to query
                '--DESTINATION': 'stag_platform_events_staging_internal.normalized_events_fact_table_stag_current',
            },
            'job_type': ['update_db'],
            'output_db': 'stag_platform_events_staging_internal',
            'output_table': 'normalized_events_fact_table_stag_current'
        },
    }
    # initialize logger with JSON output for the whole suite
    self.logger = logging.getLogger()
    handler = logging.StreamHandler()
    handler.setFormatter(logmatic.JsonFormatter())
    self.logger.addHandler(handler)
    self.logger.setLevel(logging.INFO)