Example 1
def main():

    #PENDING Try Sending output to screen

    # Fetch the current date and the script name from sys.argv[0] to build the log file path.
    current_date = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    file_name = sys.argv[0].split('/')[-1].split('.')[0]

    log_file_path = "{}/{}_{}.log".format(config.log_file_directory, file_name,
                                          current_date)
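    # Illustration (hypothetical values): with sys.argv[0] == '/apps/etl/read01_script.py'
    # and current_date == '2018-06-01_12:00:00', log_file_path becomes
    # '<config.log_file_directory>/read01_script_2018-06-01_12:00:00.log'.
    # os.path.splitext(os.path.basename(sys.argv[0]))[0] would be a more robust way to
    # strip the directory and extension, but the split() chain works for POSIX-style
    # paths whose file name contains no extra dots.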

    # Initialize logging (level raised from the original INFO to ERROR).
    logging.basicConfig(filename=log_file_path,
                        filemode='w',
                        level=logging.ERROR)

    logging.info(
        '\n945##################  Mapping Logic Started at %s ##################',
        datetime.datetime.now())

    if len(sys.argv) > 1:
        co_nbrs = sys.argv[1].split(',')
        co_nbr_list = ', '.join("'{0}'".format(co_nbr.zfill(3))
                                for co_nbr in co_nbrs)
        logging.info('Company Number - %s', co_nbr_list)

    else:
        co_nbr_list = "'000'"
        logging.info(
            "Company number was not passed as an argument; the script will process data for all OpCos"
        )
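    # Worked illustration (hypothetical argument): sys.argv[1] == "7,56" gives
    # co_nbrs == ['7', '56'] and co_nbr_list == "'007', '056'", a quoted list that
    # can be dropped directly into a SQL IN (...) clause.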

    # calling initializeSparkHiveContext() function from common_func.py to initialize spark session, register spark and hive context.
    #pending replace later hive_context = common_func.initializeSparkHiveContext('VendorAgreements')

    #---------------------------------------------------------------------------------------------------
    from pyspark.sql import HiveContext
    from pyspark.sql import SparkSession
    from pyspark.sql import SQLContext
    from pyspark.sql.functions import next_day  # needed for week_ending_sale below

    spark = SparkSession.builder.master(
        "yarn").appName("Purchase Order").config(
            "spark.serializer",
            "org.apache.spark.serializer.KryoSerializer").config(
                "spark.kryoserializer.buffer.max",
                "126mb").enableHiveSupport().getOrCreate()
    sc = spark.sparkContext
    hive_context = HiveContext(sc)

    # Control the log output sent to stdout (console).
    # Other options for Level include: all, debug, error, fatal, info, off, trace, trace_int, warn
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    # ---------------------------------------------------------------------------------------------------

    logging.info(
        '\n##################  Mapping Logic Started at %s ##################',
        datetime.datetime.now())

    print(
        "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Register Temporary Tables for Sources  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*"
    )

    #
    #   _____                _         _                              _   _        _     _
    #  / ____|              | |       | |                            | | | |      | |   | |
    # | |     _ __ ___  __ _| |_ ___  | |_ _ __ ___  _ __   ___  __ _| | | |_ __ _| |__ | | ___  ___
    # | |    | '__/ _ \/ _` | __/ _ \ | __| '_ ` _ \| '_ \ / __|/ _` | | | __/ _` | '_ \| |/ _ \/ __|
    # | |____| | |  __/ (_| | ||  __/ | |_| | | | | | |_) |\__ \ (_| | | | || (_| | |_) | |  __/\__ \
    #  \_____|_|  \___|\__,_|\__\___|  \__|_| |_| |_| .__/ |___/\__, |_|  \__\__,_|_.__/|_|\___||___/
    #                                               | |______      | |
    #                                               |_|______|     |_|
    #

    filter_type = 1

    # Pull the unfiltered source queries first so every var_sql_* name is defined
    # even when no test filter is applied.
    var_sql_VendorAgreement = sqlfile.sql_src_VendorAgreement
    var_sql_oblig_dtl = sqlfile.sql_src_oblig_dtl
    var_sql_oblig_head = sqlfile.sql_src_oblig_head
    var_sql_itm_ = sqlfile.sql_src_itm
    var_sql_itm_co_itm = sqlfile.sql_src_itm_co_itm
    var_sql_calendar = sqlfile.sql_src_calendar
    var_sql_cal_day_dim = sqlfile.sql_src_cal_day_dim
    var_sql_cust_ship_to = sqlfile.sql_src_cust_ship_to

    if filter_type == 1:
        # Test filters: two OpCos, a 2018 date window and a short item list.
        type1_filter_VendorAgreement = " WHERE co_skey in (7,56) and incm_ern_dt>='2018-01-01' and vndragr.itm_skey in  (394169,425281,377710,368931,874129,404300,373607,904799,76346) "
        type1_filter_oblig_dtl = " WHERE  oblig_dt>='01/01/2018' "
        type1_filter_oblig_head = " WHERE  oblig_dt>='01/01/2018' "
        type1_filter_itm_ = " WHERE itm_skey in  (394169,425281,377710,368931,874129,404300,373607,904799,76346) "
        type1_filter_itm_co_itm = " WHERE itm_skey in  (394169,425281,377710,368931,874129,404300,373607,904799,76346) "
        type1_filter_calendar = " WHERE day_dt>'01/01/2018' "
        type1_filter_cal_day_dim = " WHERE day_dt>'01/01/2018' "
        type1_filter_cust_ship_to = " WHERE co_skey in (7,56) "

        # Each source query carries a "where 1=1" placeholder that the filter replaces.
        var_sql_VendorAgreement = var_sql_VendorAgreement.replace(
            "where 1=1", type1_filter_VendorAgreement)
        var_sql_oblig_dtl = var_sql_oblig_dtl.replace("where 1=1",
                                                      type1_filter_oblig_dtl)
        var_sql_oblig_head = var_sql_oblig_head.replace(
            "where 1=1", type1_filter_oblig_head)
        var_sql_itm_ = var_sql_itm_.replace("where 1=1", type1_filter_itm_)
        var_sql_itm_co_itm = var_sql_itm_co_itm.replace(
            "where 1=1", type1_filter_itm_co_itm)
        var_sql_calendar = var_sql_calendar.replace("where 1=1",
                                                    type1_filter_calendar)
        var_sql_cal_day_dim = var_sql_cal_day_dim.replace(
            "where 1=1", type1_filter_cal_day_dim)
        var_sql_cust_ship_to = var_sql_cust_ship_to.replace(
            "where 1=1", type1_filter_cust_ship_to)
    else:
        print("No filters applied - the whole data set will be processed")

    dfsrc1 = common_func.registerRedshiftQuery(
        hive_context, sqlfile.sql_src_oblig_dtl + "  ",
        "DUMMY_TMP_TABLE_MISSING_WEEK_ENDING_SALE")
    dfsrc1 = dfsrc1.withColumn('week_ending_sale',
                               next_day(dfsrc1.oblig_dt, 'Sun'))
    dfsrc1.createOrReplaceTempView("rs_TMP_SQL_src_sale_oblig_dtl_fact_mstr")
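    # next_day(oblig_dt, 'Sun') above returns the first Sunday strictly after oblig_dt
    # (e.g. 2018-01-03, a Wednesday, maps to 2018-01-07); an obligation dated on a
    # Sunday therefore rolls forward to the following week's Sunday.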

    dfsrc2 = common_func.registerRedshiftQuery(
        hive_context, var_sql_oblig_head, "TMP_SQL_src_sale_oblig_head_fact")
    dfsrc3 = common_func.registerRedshiftQuery(hive_context, var_sql_itm_,
                                               "TMP_SQL_src_itm_dim")
    dfsrc4 = common_func.registerRedshiftQuery(hive_context,
                                               var_sql_itm_co_itm,
                                               "TMP_SQL_src_itm_co_itm_rel")
    dfsrc5 = common_func.registerRedshiftQuery(
        hive_context, var_sql_VendorAgreement,
        "TMP_SQL_src_agr_vndr_agr_trans_fact")

    dfsrc6 = common_func.registerRedshiftQuery(hive_context,
                                               var_sql_cust_ship_to,
                                               "TMP_SQL_src_cust_ship_to_dim")
    dfsrc7 = common_func.registerRedshiftQuery(hive_context, var_sql_calendar,
                                               "TMP_SQL_src_calendar")
    dfsrc8 = common_func.registerRedshiftQuery(hive_context,
                                               var_sql_cal_day_dim,
                                               "TMP_SQL_src_cal_day_dim")

    dfsrc9 = common_func.registerRedshiftQuery(
        hive_context, 'select * from intp.ei_sap_go_live_dates',
        "TMP_SQL_src_ei_sap_go_live_dates")

    print(
        "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%READ ETL STAGE INTERMEDIATE  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*"
    )

    dfstg1 = common_func.registerRedshiftQuery(
        hive_context, sqlfile.sql_stg_agr_sum,
        "TMP_SQL_stg_ei_agr_vndr_agr_trans_reseg")
    dfstg2 = common_func.registerRedshiftQuery(hive_context,
                                               sqlfile.sql_stg_sus_weekly,
                                               "TMP_SQL_stg_ei_sus_weekly")

    print("***************************CACHE TABLES *************************")

    spark.catalog.cacheTable("rs_TMP_SQL_src_sale_oblig_dtl_fact_mstr")
    spark.catalog.cacheTable("rs_TMP_SQL_src_sale_oblig_head_fact_mstr")
    spark.catalog.cacheTable("rs_TMP_SQL_src_itm_dim_mstr")
    spark.catalog.cacheTable("rs_TMP_SQL_src_itm_co_itm_rel_mstr")
    spark.catalog.cacheTable("rs_TMP_SQL_src_agr_vndr_agr_trans_fact_mstr")
    spark.catalog.cacheTable("rs_TMP_SQL_src_cust_ship_to_dim_mstr")
    spark.catalog.cacheTable("rs_TMP_SQL_src_cal_day_dim_mstr")
    spark.catalog.cacheTable("rs_TMP_SQL_src_ei_sap_go_live_dates_mstr")

    # Two staging tables
    spark.catalog.cacheTable("rs_TMP_SQL_stg_ei_agr_vndr_agr_trans_reseg_mstr")
    spark.catalog.cacheTable("rs_TMP_SQL_stg_ei_sus_weekly_mstr")

    print(
        "***************************ORIGINAL QUERY direct reference using x.var *************************"
    )
    sqlmain = sqlfile.sql_final_main_full

    print(
        "***************************REPLACING EDWP TABLES by LOADED HADOOP TABLES *************************"
    )

    sqlmain = sqlmain.replace("edwp.sale_oblig_dtl_fact",
                              "rs_TMP_SQL_src_sale_oblig_dtl_fact_mstr")
    sqlmain = sqlmain.replace("edwp.sale_oblig_head_fact",
                              "rs_TMP_SQL_src_sale_oblig_head_fact_mstr")
    sqlmain = sqlmain.replace("edwp.itm_dim", "rs_TMP_SQL_src_itm_dim_mstr")
    sqlmain = sqlmain.replace("edwp.itm_co_itm_rel",
                              "rs_TMP_SQL_src_itm_co_itm_rel_mstr")

    sqlmain = sqlmain.replace("edwp.agr_vndr_agr_trans_fact vndragr",
                              "rs_TMP_SQL_src_agr_vndr_agr_trans_fact_mstr")
    sqlmain = sqlmain.replace("edwp.cust_ship_to_dim",
                              "rs_TMP_SQL_src_cust_ship_to_dim_mstr")
    sqlmain = sqlmain.replace("edwp.cal_day_dim day_dim",
                              "rs_TMP_SQL_src_cal_day_dim_mstr")
    sqlmain = sqlmain.replace("intp.ei_sap_go_live_dates",
                              "rs_TMP_SQL_src_ei_sap_go_live_dates_mstr")

    #STAGING
    sqlmain = sqlmain.replace(
        "intp.ei_agr_vndr_agr_trans_reseg",
        "rs_TMP_SQL_stg_ei_agr_vndr_agr_trans_reseg_mstr")
    sqlmain = sqlmain.replace("intp.ei_sus_weekly",
                              "rs_TMP_SQL_stg_ei_sus_weekly_mstr")

    print(
        "************************** REPLACE_QUERY  **************************")
    print(sqlmain)

    #spark.stop()
    # print(dfsrc1.count())
    # print(dfsrc2.count())
    # print(dfsrc3.count())
    # print(dfsrc4.count())
    # print(dfsrc5.count())
    # print(dfsrc6.count())
    #
    # print(dfstg1.count())
    # print(dfstg2.count())

    # The collect() calls below prove that the data is read correctly from the S3 buckets in the source AWS account.
    # dfsrc1.collect()
    # dfsrc2.collect()
    # dfsrc3.collect()
    # dfsrc4.collect()
    # dfsrc5.collect()
    # dfsrc6.collect()

    # dflogic1.createOrReplaceTempView("rs_TMP_SQL_sql_ei_main_part_a_mstr")
    # dflogic2.createOrReplaceTempView("rs_TMP_SQL_ei_purchase_order_item_level_mstr")
    # dflogic3.createOrReplaceTempView("rs_TMP_SQL_ei_purchase_order_item_level_mstr")

    # PROCESS BUSINESS LOGIC

    print('%%*logic3 - run the main query')
    dflogic1 = spark.sql(sqlmain)

    print("%%*Insert Statements %%*")
    common_func.loadDataIntoRedshift(logging,
                                     'INSERT',
                                     'intp',
                                     'ei_sus_ei',
                                     dflogic1,
                                     opco_list=co_nbr_list)

    print("%%*Program finished%%*")

    print(
        "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%THE END  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*"
    )

    logging.info('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
    logging.info('Script read01_afr_vendor_enterprise completed %s',
                 datetime.datetime.now())
Example 2
def main():

    #PENDING Try Sending output to screen

    # Fetch the current date and the script name from sys.argv[0] to build the log file path.
    current_date = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    file_name = sys.argv[0].split('/')[-1].split('.')[0]

    log_file_path = "{}/{}_{}.log".format(config.log_file_directory, file_name,
                                          current_date)

    # Initialize logging at INFO level.
    logging.basicConfig(filename=log_file_path,
                        filemode='w',
                        level=logging.INFO)

    logging.info(
        '\n945##################  Mapping Logic Started at %s ##################',
        datetime.datetime.now())

    if len(sys.argv) > 1:
        co_nbrs = sys.argv[1].split(',')
        co_nbr_list = ', '.join("'{0}'".format(co_nbr.zfill(3))
                                for co_nbr in co_nbrs)
        logging.info('Company Number - %s', co_nbr_list)

    else:
        co_nbr_list = "'000'"
        logging.info(
            "Company number was not passed as an argument; the script will process data for all OpCos"
        )

    # calling initializeSparkHiveContext() function from common_func.py to initialize spark session, register spark and hive context.
    #pending replace later hive_context = common_func.initializeSparkHiveContext('VendorAgreements')

    #---------------------------------------------------------------------------------------------------
    from pyspark.sql import HiveContext
    from pyspark.sql import SparkSession
    from pyspark.sql import SQLContext

    spark = SparkSession.builder.master(
        "yarn").appName("Purchase Order").config(
            "spark.serializer",
            "org.apache.spark.serializer.KryoSerializer").config(
                "spark.kryoserializer.buffer.max",
                "126mb").enableHiveSupport().getOrCreate()
    sc = spark.sparkContext
    hive_context = HiveContext(sc)

    # Control the log output sent to stdout (console).
    # Other options for Level include: all, debug, error, fatal, info, off, trace, trace_int, warn
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    # ---------------------------------------------------------------------------------------------------

    logging.info(
        '\n##################  Mapping Logic Started at %s ##################',
        datetime.datetime.now())

    logging.info('Assigning values Started at  %s', datetime.datetime.now())

    # sqlstatement1_po_detail
    # sqlstatement2_po_header
    # sqlstatement3_join

    print(
        "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%READ SOURCES PURCHASE ORDER HEADER AND DETAIL %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*"
    )

    print("%%*sqlfile. read source po_dtl_fact")
    print(sqlfile.sql_src_po_dtl_fact)
    df1 = common_func.registerRedshiftQuery(hive_context,
                                            sqlfile.sql_src_po_dtl_fact,
                                            "TMP_SQL_src_po_dtl_fact")
    #this converts into rs_TMP_SQL_src_po_dtl_fact_mstr

    print("%%*sqlfile. read source po_head_fact")
    print(sqlfile.PurchaseOrderHeader)
    df2 = common_func.registerRedshiftQuery(hive_context,
                                            sqlfile.PurchaseOrderHeader,
                                            "TMP_SQL_src_PurchaseOrderHeader")
    #this converts into rs_TMP_SQL_src_PurchaseOrderHeader_mstr

    print("%%*sqlfile.TransactionsOrder1")
    print(sqlfile.TransactionsOrder1)
    df3 = common_func.registerRedshiftQuery(hive_context,
                                            sqlfile.TransactionsOrder1,
                                            "TMP_SQL_src_TransactionsOrder1")
    # this converts into rs_TMP_SQL_src_TransactionsOrder1_mstr
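    # Naming convention (inferred from the comments above; common_func.py is not shown
    # here): registerRedshiftQuery appears to register each result as a temp view named
    # "rs_" + <given name> + "_mstr", which is why the count queries below select from
    # rs_TMP_SQL_src_..._mstr.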

    #Apply filters to dataframes for testing
    #     sql_list_po = """ SELECT co_po_nbr where co_po_typ_cd = 'DRP' and co_po_nbr > '00476040' and co_po_nbr < '00790830' """

    #Header and Detail tables have the same column names thus the same filter can be applied
    print(
        "%%%%%%%%%%%%%%%%%%%%%%%%%%%aADDING FILTERS TO QUERIES USING METHOD %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*"
    )

    #df1 = df1.filter("co_po_typ_cd = 'DRP' and co_po_nbr > '00476040' and co_po_nbr < '00790830' ")
    #df2 = df2.filter("co_po_typ_cd = 'DRP' and co_po_nbr > '00476040' and co_po_nbr < '00790830' ")
    #For transactions the field is ordr_po_nbr= '00758850'
    #df3 = df3.filter("ordr_po_nbr  > '00476040' and ordr_po_nbr < '00790830'")

    print(
        "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%READ SOURCES TRANSACTION ORDER TABLES  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*"
    )

    # The returned/registered names follow the rs_<name>_mstr pattern (see the count queries below).

    #template    df1_count_all = hive_context.sql("SELECT count(*) COUNT_records_po_detail_mstr FROM rs_TMP_SQL_po_detail_mstr")

    df1_count_all = hive_context.sql(
        "SELECT count(*) COUNT_TMP_SQL_src_PurchaseOrderHeader_mstr FROM rs_TMP_SQL_src_PurchaseOrderHeader_mstr"
    )
    # df1_count_all.printSchema()
    df1_count_all.show()

    df2_count_all = hive_context.sql(
        "SELECT count(*) COUNT_TMP_SQL_src_po_dtl_fact_mstr FROM rs_TMP_SQL_src_po_dtl_fact_mstr"
    )
    #df2_count_all.printSchema()
    df2_count_all.show()

    df3_count_all = hive_context.sql(
        "SELECT count(*) COUNT_TMP_SQL_src_TransactionsOrder1_mstr FROM rs_TMP_SQL_src_TransactionsOrder1_mstr"
    )
    df3_count_all.printSchema()
    df3_count_all.show()
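    # Equivalent DataFrame-API sketch for the same sanity checks:
    #   print(df1.count(), df2.count(), df3.count())
    # df.count() triggers the same full evaluation as the SELECT count(*) queries above.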

    print(
        "%%%%%%%%%%%%%%%%%%%%%%%%%% Defining PURCHASE ORDER data frames DEPENDING ON DETAIL TABLE%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*"
    )

    # pending: review conversion to decimal(19,2)
    # These data frames all use the same source, so the data is read once and the
    # dataframes only hold the definitions of the different queries.
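    # (Spark evaluates lazily: dfPO1/dfPO2/dfPO3 below are just query plans over the
    # registered temp views until an action - show(), count() or the final write - runs.)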

    print("%%PurchaseOrder1_rdc")
    print(sqlfile.PurchaseOrder1_rdc)
    dfPO1 = spark.sql(sqlfile.PurchaseOrder1_rdc)
    dfPO1.createOrReplaceTempView("TMP_SQL_PurchaseOrder1_rdc")
    #print("Count Records:")
    #print(dfTransformation1.count())

    print("%%PurchaseOrder2_non_rdc")
    print(sqlfile.PurchaseOrder2_non_rdc)
    dfPO2 = spark.sql(sqlfile.PurchaseOrder2_non_rdc)
    dfPO2.createOrReplaceTempView("TMP_SQL_PurchaseOrder2_non_rdc")
    #print("Count Records:")
    #print(dfTransformation1.count())

    print("%%PurchaseOrder4")
    print(sqlfile.PurchaseOrder4)
    dfPO3 = spark.sql(sqlfile.PurchaseOrder4)
    dfPO3.createOrReplaceTempView("TMP_SQL_PurchaseOrder4")
    # print("Count Records:")
    # print(dfTransformation1.count())

    print(
        "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%JOIN STATEMENT MAIN QUERY%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*"
    )

    #pending review conversion to decimal(19,2)

    print("%%Transformation1 - TMP_SQL_Join4Pieces")
    print(sqlfile.Join4Pieces)
    dfTransformation1 = spark.sql(sqlfile.Join4Pieces)
    dfTransformation1.createOrReplaceTempView("TMP_SQL_Join4Pieces")
    dfTransformation1.printSchema()

    #print("Count Records:")
    #print(dfTransformation1.count())

    print("%%*MAIN_QUERY")

    varAllCols = 1

    if varAllCols:
        print(
            "Printing all columns used in the program for debugging purposes")
        dfmain = spark.sql(sqlfile.LogisticEarnedIncomeMainQryAllCols)
        print(sqlfile.LogisticEarnedIncomeMainQryAllCols)
        common_func.loadDataIntoRedshift(logging,
                                         'INSERT',
                                         'intp',
                                         'ei_sus_lei_all_cols2',
                                         dfmain,
                                         opco_list=co_nbr_list)
    else:
        print("0 - Storing only columns originally defined in 1010")
        dfmain = spark.sql(sqlfile.LogisticEarnedIncomeMainQry1)
        print(sqlfile.LogisticEarnedIncomeMainQry1)
        common_func.loadDataIntoRedshift(logging,
                                         'INSERT',
                                         'intp',
                                         'ei_sus_lei',
                                         dfmain,
                                         opco_list=co_nbr_list)

    print(
        "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%THE END  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*"
    )

    logging.info('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
    logging.info('Script read01_afr_vendor_enterprise completed %s',
                 datetime.datetime.now())
Example 3
def main():

    #PENDING Try Sending output to screen

    # Fetch the current date and the script name from sys.argv[0] to build the log file path.
    current_date = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    file_name = sys.argv[0].split('/')[-1].split('.')[0]

    log_file_path = "{}/{}_{}.log".format(config.log_file_directory, file_name, current_date)


    # Initialize logging (level raised from the original INFO to ERROR).
    logging.basicConfig(filename=log_file_path, filemode='w', level=logging.ERROR)

    logging.info('\n##################  Mapping Logic Started at %s ##################', datetime.datetime.now())

    if len(sys.argv) > 1:
        co_nbrs = sys.argv[1].split(',')
        co_nbr_list = ', '.join("'{0}'".format(co_nbr.zfill(3)) for co_nbr in co_nbrs)
        logging.info('Company Number - %s', co_nbr_list)

    else:
        co_nbr_list = "'000'"
        logging.info("Company Number is not passed as argument. Script will process data for all OpCo's")

    # calling initializeSparkHiveContext() function from common_func.py to initialize spark session, register spark and hive context.
    #pending replace later hive_context = common_func.initializeSparkHiveContext('VendorAgreements')

    #---------------------------------------------------------------------------------------------------
    from pyspark.sql import HiveContext
    from pyspark.sql import SparkSession
    from pyspark.sql import SQLContext

    spark = (SparkSession.builder.master("yarn")
             .appName("VendorAgreements")
             .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
             .config("spark.kryoserializer.buffer.max", "126mb")
             .enableHiveSupport()
             .getOrCreate())
    sc = spark.sparkContext
    hive_context = HiveContext(sc)

    # Control the log output sent to stdout (console).
    # Other options for Level include: all, debug, error, fatal, info, off, trace, trace_int, warn
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    # ---------------------------------------------------------------------------------------------------

    logging.info('\n##################  Mapping Logic Started at %s ##################', datetime.datetime.now())

    logging.info('Assigning values Started at  %s', datetime.datetime.now())

    df_vendor_agr = common_func.registerRedshiftQuery(hive_context, sqlfile.SqlVendorAgreement, "TMP_SQL_agr_vndr_agr_trans_fact")

    # RETURNED/REGISTERED NAME IS rs_TMP_SQL_agr_vndr_agr_trans_fact_mstr
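    # Sketch of what registerRedshiftQuery presumably does (an assumption - common_func.py
    # is not shown here, and the url option value below is hypothetical):
    #   df = (hive_context.read.format("com.databricks.spark.redshift")
    #         .option("url", config.redshift_jdbc_url)
    #         .option("query", sql_text)
    #         .option("tempdir", "s3://sysco-nonprod-seed-spark-redshift-temp/")
    #         .load())
    #   df.createOrReplaceTempView("rs_" + table_name + "_mstr")
    #   return df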

    df1_count_all = hive_context.sql("SELECT count(*) COUNT__agr_vndr_agr_trans_fact FROM rs_TMP_SQL_agr_vndr_agr_trans_fact_mstr")

    print("********************************SHOW*******************************************")
    df1_count_all.show()

    print("********************************SCHEMA*******************************************")
    df_vendor_agr.printSchema()
    # Count using Select statement
    # TEMPLATE EXAMPLE countDistinctDF_sql = sqlContext.sql("SELECT firstName, lastName, count(distinct firstName) as distinct_first_names FROM databricks_df_example GROUP BY firstName, lastName")


    ## The tempdir value is tempdir="s3://sysco-nonprod-seed-spark-redshift-temp/
    print("***step 1 before writing***")
    # need to call function insertDataFrameToS3(dataframe_name, path)
    # sample call common_func.loadDataIntoRedshift(logging, 'CUSTOM', config.dataMartSchema, 'PO_UNIQUE', PO_UNIQUE_INSERT_DATA_FRAME,    co_nbr_list, preaction_query=preaction_query)

    #PARAMETERS ARE:
    # param1=logging
    # param2='INSERT','UPSERT'
    # param3=schema (intp value for stageSchema)
    # param4=table_name (final destination)
    # param5=dataframe
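    # Mapping those parameters onto the call below (co_nbr_list is passed through the
    # opco_list keyword): 'INSERT', config.stageSchema ('intp'),
    # 'EI_AGR_VNDR_AGR_TRANS_RESEG', df_vendor_agr.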

    print("***Prepare Company List***")
    co_nbr_list = "'000'"
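    # (Note: the hard-coded "'000'" above overrides any company list built from the
    # command-line argument earlier in main().)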

    print("***Insert Statements ***")
    common_func.loadDataIntoRedshift(logging, 'INSERT', config.stageSchema, 'EI_AGR_VNDR_AGR_TRANS_RESEG', df_vendor_agr , opco_list=co_nbr_list)

    print("********************************THE END  *******************************************")

    logging.info('**********************************************')
    logging.info('Script read01_afr_vendor_enterprise completed %s', datetime.datetime.now())
Example 4
def main():

    #PENDING Try Sending output to screen

    # Fetch the current date and the script name from sys.argv[0] to build the log file path.
    current_date = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    file_name = sys.argv[0].split('/')[-1].split('.')[0]

    log_file_path = "{}/{}_{}.log".format(config.log_file_directory, file_name, current_date)


    # Initialize logging (level raised from the original INFO to ERROR).
    logging.basicConfig(filename=log_file_path, filemode='w', level=logging.ERROR)

    logging.info('\n945##################  Mapping Logic Started at %s ##################', datetime.datetime.now())

    if len(sys.argv) > 1:
        co_nbrs = sys.argv[1].split(',')
        co_nbr_list = ', '.join("'{0}'".format(co_nbr.zfill(3)) for co_nbr in co_nbrs)
        logging.info('Company Number - %s', co_nbr_list)

    else:
        co_nbr_list = "'000'"
        logging.info("Company Number is not passed as argument. Script will process data for all OpCo's")

    # calling initializeSparkHiveContext() function from common_func.py to initialize spark session, register spark and hive context.
    #pending replace later hive_context = common_func.initializeSparkHiveContext('VendorAgreements')

    #---------------------------------------------------------------------------------------------------
    from pyspark.sql import HiveContext
    from pyspark.sql import SparkSession
    from pyspark.sql import SQLContext

    spark = (SparkSession.builder.master("yarn")
             .appName("Purchase Order")
             .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
             .config("spark.kryoserializer.buffer.max", "126mb")
             .enableHiveSupport()
             .getOrCreate())
    sc = spark.sparkContext
    hive_context = HiveContext(sc)

    # Control the log output sent to stdout (console).
    # Other options for Level include: all, debug, error, fatal, info, off, trace, trace_int, warn
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    # ---------------------------------------------------------------------------------------------------

    logging.info('\n##################  Mapping Logic Started at %s ##################', datetime.datetime.now())

    logging.info('Assigning values Started at  %s', datetime.datetime.now())

    # sqlstatement1_po_detail
    # sqlstatement2_po_header
    # sqlstatement3_join

    print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%READ SOURCES PURCHASE ORDER HEADER AND DETAIL %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*")


    print( "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%DEBUG registerRedshiftQuery  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*")

   # df1 = common_func.registerRedshiftQuery(hive_context, "SELECT * from edwp.po_dtl_fact  F    where    co_skey in (7, 56) and co_bus_ordr_dt > '06/01/2017' ", "TMP_SQL_PurchaseOrder1_rdc")


    print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Register Temporary Tables for Sources  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*")

    #
    #   _____                _         _                              _   _        _     _
    #  / ____|              | |       | |                            | | | |      | |   | |
    # | |     _ __ ___  __ _| |_ ___  | |_ _ __ ___  _ __   ___  __ _| | | |_ __ _| |__ | | ___  ___
    # | |    | '__/ _ \/ _` | __/ _ \ | __| '_ ` _ \| '_ \ / __|/ _` | | | __/ _` | '_ \| |/ _ \/ __|
    # | |____| | |  __/ (_| | ||  __/ | |_| | | | | | |_) |\__ \ (_| | | | || (_| | |_) | |  __/\__ \
    #  \_____|_|  \___|\__,_|\__\___|  \__|_| |_| |_| .__/ |___/\__, |_|  \__\__,_|_.__/|_|\___||___/
    #                                               | |______      | |
    #                                               |_|______|     |_|
    #

    # SOURCE TABLE  intp.ei_src_vendor_agreements
    #  INTERMEDIATE TABLE  revman_stg.ei_logistic_earned_income
    # This avoids having to edit the query and hardcode the Hadoop table names used in the ETL.

    # sqlApplyCatchWeightIndicator = sqlfile.sqlApplyCatchWeightIndicator
    # sqlApplyCatchWeightIndicator = sqlApplyCatchWeightIndicator.replace("intp.ei_src_vendor_agreements", "rs_TMP_SQL_ei_src_vendor_agreements_mstr")
    # sqlApplyCatchWeightIndicator = sqlApplyCatchWeightIndicator.replace("revman_stg.ei_logistic_earned_income", "rs_TMP_SQL_ei_logistic_earned_income_mstr")

    # print(sqlApplyCatchWeightIndicator)

    #LOAD DATA
    # This code currently loads from staged tables (saved in intp); for a normal run this
    # is not necessary - the data can be loaded directly.

    print("*** Read table intp.ei_src_vendor_agreement and create a temporary SQL table")
    df1 = common_func.registerRedshiftQuery(hive_context, 'SELECT * FROM intp.ei_agr_vndr_agr_trans_reseg', "TMP_SQL_ei_src_vendor_agreements")


    print("*** Read table revman_stg.ei_logistic_earned_income and create a temporary SQL table")
    df2 = common_func.registerRedshiftQuery(hive_context, 'SELECT * FROM intp.ei_sus_lei', "TMP_SQL_ei_logistic_earned_income")


    print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  MAIN QUERY%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*")

    print(sqlfile.sqlApplyCatchWeightIndicator)
    df3 = spark.sql(sqlfile.sqlApplyCatchWeightIndicator)

    print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%SCHEMA%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*")
    # Count using Select statement
    # TEMPLATE EXAMPLE countDistinctDF_sql = sqlContext.sql("SELECT firstName, lastName, count(distinct firstName) as distinct_first_names FROM databricks_df_example GROUP BY firstName, lastName")


    ## The tempdir value is tempdir="s3://sysco-nonprod-seed-spark-redshift-temp/
    print("%%*step 1 before writing%%*")
    # need to call function insertDataFrameToS3(dataframe_name, path)
    # sample call common_func.loadDataIntoRedshift(logging, 'CUSTOM', config.dataMartSchema, 'PO_UNIQUE', PO_UNIQUE_INSERT_DATA_FRAME,    co_nbr_list, preaction_query=preaction_query)

    #
    # param1=logging
    # param2='INSERT','UPSERT'
    # param3=schema (intp value for stageSchema)
    # param4=table_name (final destination)
    # param5=dataframe

    print("%%*Prepare Company List%%*")
    #PENDING: start using the co_nbr list functionality instead of the hard-coded value below
    co_nbr_list = "'000'"

    print("%%*Number of Records calculated:")
    print(df3.count())
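    # Note: count() fully evaluates df3; unless the frame is cached first
    # (e.g. df3.cache()), the loadDataIntoRedshift call below recomputes the same plan.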

    print("%%*Insert Statements %%*")
    common_func.loadDataIntoRedshift(logging, 'INSERT', 'intp', 'ei_sus_po', df3, opco_list=co_nbr_list)

    print("%%*Program finished%%*")

    print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%THE END  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*")

    logging.info('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
    logging.info('Script read01_afr_vendor_enterprise completed %s', datetime.datetime.now())