Exemple #1
0
def insert_master():
    connection = get_redshift_connection()
    connection.execution_options(autocommit=True).execute(
        open('sql/create_leadcountry_to_geo_code.sql', 'r').read())
    connection.execution_options(autocommit=True).execute(
        open('sql/insert_mst_sellerleads.sql', 'r').read())
    connection.execution_options(autocommit=True).execute(
        open('sql/insert_mst_selleropportunities.sql', 'r').read())
    connection.execution_options(autocommit=True).execute(
        open('sql/insert_mst_sellercampaigns.sql', 'r').read())
    connection.close()
Exemple #2
0
def insert_csv_data_tab3():
    connection = get_redshift_connection()
    q3="COPY us_gtmsales.stg_sellercampaigns_v1(selleruid, sellercompanyname, gtmcampaignsource, campaignname, " \
       "crmsystemcampaignid, campaigncreatedate, investment, boxmonth, boxyear, insertiondate) " \
       "FROM 's3://{}/{}/{}/Successful Files/{}' " \
       "CREDENTIALS 'aws_access_key_id={};aws_secret_access_key={}' " \
       "CSV timeformat 'auto' dateformat 'auto';".format(get_s3_bucket(),
                                                        get_year_from_sys_args(),
                                                        get_month_from_sys_args(),
                                                        get_tab3_filename(),
                                                        get_s3_config()['access_key_id'],
                                                        get_s3_config()['secret_access_key'])

    connection.execution_options(autocommit=True).execute(q3)
    connection.close()
Exemple #3
0
def validate_lead_country_column(sheet, numrows, col, Column_missing_tab1, one_based_column_position, log_file):
    err_cnt_lead_country = 0

    # query for obtaining all the possible names of a lead country
    sql_possible_countries = '''SELECT country_name FROM us_gtmsales.countries
                                UNION
                                SELECT country_code FROM us_gtmsales.countries
                                UNION
                                SELECT country_code_iso_3a FROM us_gtmsales.countries;'''

    # open one Redshift connection to use for the lead country value in each row
    redshift_connection = get_redshift_connection()

    # query Redshift and get the results in a dataframe
    df_possible_countries = pd.read_sql(sql_possible_countries,
                                        con=redshift_connection)

    # store the possible countries in a set for faster lookups
    set_possible_countries = set(df_possible_countries['country_name'])

    for row in range(1, numrows):
        lead_country = sheet.cell_value(row, col)
        err_msg_lead_country = 'Lead Country column error on Row: {} Column: {} Value: {}'.format(row + 1,
                                                                                                  col + 1,
                                                                                                  lead_country)
        try:
            if lead_country.upper() not in set_possible_countries:
                err_cnt_lead_country += 1
                print(err_msg_lead_country, file=log_file)
        except:
            err_cnt_lead_country += 1
            print(err_msg_lead_country, file=log_file)

    passed = numrows - err_cnt_lead_country
    if err_cnt_lead_country == 0 and one_based_column_position not in Column_missing_tab1:
        print('Validation on Lead Country 		   : PASS (Satisfied Conditions :: found lead country and corresponding lead region)', file=log_file)
        print('Total number of rows PASSED                : {}'.format(passed - 1), file=log_file)
        print('Total number of rows FAILED                : {}'.format(err_cnt_lead_country), file=log_file)
    else:
        print('Validation on Lead Country 		   : FAIL', file=log_file)
        print('Total number of rows PASSED                : {}'.format(passed - 1), file=log_file)
        print('Total number of rows FAILED                : {}'.format(err_cnt_lead_country), file=log_file)

    # close the Redshift connection
    redshift_connection.close()

    return err_cnt_lead_country
Exemple #4
0
def insert_csv_data_tab2():
    connection = get_redshift_connection()
    q3 = "COPY us_gtmsales.stg_selleropportunities_v1(" \
         "selleruid, sellercompanyname,  gtmcampaignsource, campaignname, crmsystemcampaignid, " \
         "campaigncreatedate, opportunityid, convertdate, opportunitycountry, " \
         "opportunitystatus, awsmarketopportunity, pipelinerevenue, accountname, accountid, windate, billedrevenue, " \
         "boxmonth, boxyear, insertiondate )" \
         "FROM 's3://{}/{}/{}/Successful Files/{}' " \
         "CREDENTIALS 'aws_access_key_id={};aws_secret_access_key={}' " \
         "CSV timeformat 'auto' dateformat 'auto';".format(get_s3_bucket(),
                                                           get_year_from_sys_args(),
                                                           get_month_from_sys_args(),
                                                           get_tab2_filename(),
                                                           get_s3_config()['access_key_id'],
                                                           get_s3_config()['secret_access_key'])

    connection.execution_options(autocommit=True).execute(q3)
    connection.close()
Exemple #5
0
    def setUp(self):

        # access glue service
        self.glue = boto3.client(
            service_name='glue',
            region_name='us-east-1',
            endpoint_url='https://glue.us-east-1.amazonaws.com'
            )

        # Create CloudWatch client
        self.cloudwatch = boto3.client('cloudwatch')

        # configure redshift access

        self.db_conn = get_redshift_connection()

        # access s3 storage
        self.s3 = boto3.resource('s3')

        # get json file for this test suite 
        self.json_results = {}

        # define the jobs list, including initial params
        self.job_list = {
            'EDUDirect_to_parquet_last_N_months': {
                'args': {
                        '--MONTHS': '3',
                        '--ALL_TABLES': 'False'
                },
                'bucket': 'highereducation-dw-transformed-data',
                'date_partition': True,
                'initial_folders': [
                    'EDUDirectDB'
                ],
                'tables': [
                    'cddirect_production_lead',
                    'cddirect_production_visitor'
                ],
                'file_extension': 'parquet',
                'job_type': ['file_creation']
            },
            'EDUDirect_to_parquet_replace': {
                'bucket': 'highereducation-dw-transformed-data',
                'initial_folders': [
                    'EDUDirectDB'
                ],
                'tables': [
                    'cddirect_production_lead_cap',
                    'cddirect_production_migration_versions',
                    'cddirect_production_school_campus_program',
                    'cddirect_production_school_criteria',
                    'cddirect_production_school_criteria_affiliate',
                    'cddirect_production_school_multilead_segment',
                    'cddirect_production_visitor_tag',
                    'cddirect_production_zip_state'
                ],
                'file_extension': 'parquet',
                'job_type': ['file_creation']
            },
            'EDUDirect_to_parquet_new_snapshot': {
                'bucket': 'highereducation-dw-transformed-data',
                'initial_folders': [
                    'EDUDirectDB'
                ],
                'tables': [
                    'cddirect_production_affiliate',
                    'cddirect_production_country',
                    'cddirect_production_education_level',
                    'cddirect_production_publisher',
                    'cddirect_production_school',
                    'cddirect_production_school_alias',
                    'cddirect_production_school_campus',
                    'cddirect_production_school_eligible_country',
                    'cddirect_production_school_eligible_state',
                    'cddirect_production_school_program',
                    'cddirect_production_school_program_ineligible_state',
                    'cddirect_production_school_provider',
                    'cddirect_production_school_provider_campus',
                    'cddirect_production_school_provider_cap',
                    'cddirect_production_school_provider_cap_program',
                    'cddirect_production_school_provider_cap_publisher',
                    'cddirect_production_school_provider_cap_state',
                    'cddirect_production_school_provider_category',
                    'cddirect_production_school_provider_education_level',
                    'cddirect_production_school_provider_leadid_flag',
                    'cddirect_production_school_provider_program',
                    'cddirect_production_school_publisher',
                    'cddirect_production_school_targus_score',
                    'cddirect_production_state',
                    'cddirect_production_tag',
                    'cddirect_production_targus_score',
                    'cddirect_production_user',
                    'cddirect_production_widget_category',
                    'cddirect_production_widget_degree',
                    'cddirect_production_widget_degree_recommendation',
                    'cddirect_production_widget_subject',
                    'cddirect_production_widget_subject_alias',
                    'cddirect_production_widget_subject_recommendation',
                    'form_position_csv'
                ],
                'date_partition': True,
                'file_extension': 'parquet',
                'job_type': ['file_creation']
            },
            'EDUDirect_to_parquet_current_dimensions': {
                'bucket': 'highereducation-dw-transformed-data',
                'initial_folders': ['EDUDirectDB-current'],
                'files': [
                    'cddirect_production_affiliate',
                    'cddirect_production_country',
                    'cddirect_production_lead_cap',
                    'cddirect_production_publisher',
                    'cddirect_production_school',
                    'cddirect_production_school_program',
                    'cddirect_production_school_provider',
                    'cddirect_production_school_provider_cap',
                    'cddirect_production_school_provider_category',
                    'cddirect_production_school_provider_program',
                    'cddirect_production_school_provider_education_level',
                    'cddirect_production_state',
                    'cddirect_production_user',
                    'cddirect_production_widget_category',
                    'cddirect_production_widget_degree',
                    'cddirect_production_widget_subject',
                    'cddirect_production_widget_degree_recommendation',
                    'cddirect_production_widget_subject_recommendation',
                    'form_position_csv',
                ],
                'file_extension': 'parquet',
                'job_type': ['file_creation']
            },
            'EDUDirect_user_agent': {
                'args': {
                    '--TYPE': 'historical',
                },
                'bucket': 'highereducation-dw-transformed-data',
                'tables': ['user_agent'],
                'file_extension': 'parquet',
                'job_type': ['file_creation']
            },
            'EDUDirect_to_staging': {
                'args': {
                    '--TYPE': 'historical',
                    '--ENVIRONMENT': 'dev',
                    '--START_DATE': '000',
                    '--END_DATE': '000',
                },
                'bucket': 'highereducation-dw-staging-data',
                'initial_folders': ['EDUDirectDB', 'tmp'],
                'date_partition': True,
                'file_extension': 'parquet',
                'job_type': ['file_creation']
            },
            'EDUDirect_related_subject': {
                'args': {
                    '--TYPE': 'historical',
                    '--ENVIRONMENT': 'dev',
                    '--DATABASE': 'highereducation-dw-edudirectdb-parquet-current',
                    '--START_DATE': '000',
                    '--END_DATE': '000',
                },
                'bucket': 'highereducation-dw-staging-data',
                'initial_folders': ['EDUDirectDB', 'env'],
                'date_partition': True,
                'file_extension': 'parquet',
                'job_type': ['file_creation']
            },
            # migration to redshift
            'UpdateCrawler': {
                'args': {
                    '--CRAWLER': 'highereducation-dw-edudirectdb-staging'
                },
                'job_type': ['update_db']
            },
            'Migrate_EDUDirect_to_Redshift': {
                'args': {
                    '--ORIGIN': 'stag-platformevents-db-staging.normalized_events_fact_table_stag_current', #database and table to query
                    '--DESTINATION': 'stag_platform_events_staging_internal.normalized_events_fact_table_stag_current',
                },
                'job_type': ['update_db'],
                'output_db':  'stag_platform_events_staging_internal',
                'output_table': 'normalized_events_fact_table_stag_current'
                '
            },
            
        }

        # initialize logger
        self.logger = logging.getLogger()

        handler = logging.StreamHandler()
        handler.setFormatter(logmatic.JsonFormatter())

        self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)