コード例 #1
0
ファイル: SyncDimData.py プロジェクト: kenshinsee/common
class SyncDimData:
    """
    Get product and store dimension data from hub. DIM_PRODUCT and DIM_STORE save pivoted data,
    tall table PRODUCT and STORE save attribute that config in AP_CONFIG_META_ATTRIBUTE
    """
    def __init__(self, context):
        self.context = {**context, **context["meta"]}.copy()
        self._hub_ids = context["hub_ids"]
        self._logger = context["logger"]
        self._db = MSOperation(meta=context["meta"])
        self._dw = DWOperation(meta=context["meta"])
        self._common_schema = context["meta"]["db_conn_vertica_common_schema"]
        # self._dw_conn_vertica = self.context["dw_conn_vertica"]
        self._all_dim = self.context["all_dim"]
        self._dim_calendar = "DIM_CALENDAR"
        self._dim_product = "DIM_PRODUCT"
        self._dim_store = "DIM_STORE"
        self._product_cis_src_tbl = "PS_ITEM_DIM_EXT"
        self._store_cis_src_tbl = "PS_STORE_DIM_EXT"
        self._dct_vendor_retailer_hub = {}
        self._dct_vendor_hub = {}
        self._dct_retailer_hub = {}
        self._all_vendor_retailer_hub = self._get_avaliable_vr()

    def _get_column(self):
        """Get attribute from config table
        :return:
        """
        self._dct_table = {"DIM_PRODUCT": "", "DIM_STORE": ""}
        self._dct_key = {"DIM_PRODUCT": "ITEM_KEY", "DIM_STORE": "STORE_KEY"}
        self._dct_table_column = {"PRODUCT": [], "STORE": []}
        sql = "SELECT DISTINCT KEY_COLUMN FROM AP_CONFIG_META_ATTRIBUTE WHERE DIMENSION_TYPE = 1"
        self._logger.debug(sql)
        for row in self._db.query(sql):
            self._dct_table_column["PRODUCT"].append(row[0])
        sql = "SELECT DISTINCT KEY_COLUMN FROM AP_CONFIG_META_ATTRIBUTE WHERE DIMENSION_TYPE = 2"
        self._logger.debug(sql)
        for row in self._db.query(sql):
            self._dct_table_column["STORE"].append(row[0])
        for k, v in self._dct_table_column.items():
            self._dct_table_column[k] = """'{}'""".format("', '".join(v))

    def _get_source_config(self, hub_id):
        """Get source config
        :return:
        """
        self._source_dw = SiloOperation(hub_id=hub_id,
                                        meta=self.context["meta"])
        if self.context["meta"]["db_conn_vertica_silo_username"]:
            self._source_dw.config.json_data["configs"][
                "dw.etluser.id"] = self._source_dw.username
            self._source_dw.config.json_data["configs"][
                "dw.etluser.password"] = Crypto().encrypt(
                    self._source_dw.password)
        silo_type = self._source_dw.config.get_property(
            "etl.silo.type", "None").upper()
        # TODO For hub, silo_type is MD, not find para for this, default host
        if silo_type in ("WMSSC", "SASSC", "WMCAT", "WMINTL"):
            self._active_status = ""
        else:
            self._active_status = " AND D.ACTIVE = 'T'"

    def _get_avaliable_vr(self):
        """
        getting all avaliable vendor and retailer from cycle mapping table.
        and get related hub_id via config service.
        :return: dict like {'vendor,retailer': 'hub_id', ...}
        """

        sql = "SELECT DISTINCT VENDOR_KEY, RETAILER_KEY FROM AP_ALERT_CYCLE_MAPPING " \
              "UNION " \
              "SELECT DISTINCT VENDOR_KEY, RETAILER_KEY FROM AP_ALERT_CYCLE_RC_MAPPING"
        self._logger.debug(sql)
        dct_vendor_retailer_hub = {}
        # dct_vendor_retailer_hub = dict(self._db.query(sql))
        for v_r in self._db.query(sql):
            try:
                config = Config(meta=self.context["meta"],
                                vendor_key=v_r.VENDOR_KEY,
                                retailer_key=v_r.RETAILER_KEY)
                hub_id = config.get_hub_id()
                _key = str(v_r.VENDOR_KEY) + ',' + str(v_r.RETAILER_KEY)
                dct_vendor_retailer_hub[_key] = hub_id
            # in case there is no config returned for given vendor & retailer, then skip this vendor & retailer.
            except Exception as e:
                # self._logger.warning(str(e))
                self._logger.warning(
                    "Seems there is no silo configed for vendor: %s and retailer: %s"
                    % (str(v_r.VENDOR_KEY), str(v_r.RETAILER_KEY)))
                continue

        return dct_vendor_retailer_hub

    def _get_vendor_retailer_from_hub(self, hub_id):
        """Get vendor retailer from table AP_ALERT_CYCLE_MAPPING, only keep the first one that appear in multiple hubs
        :return:
        """
        self._str_vendor_retailer_filter = ""
        self._str_vendor_filter = ""
        self._str_retailer_filter = ""
        lst_vendor_retailer_filter = []
        lst_vendor_filter = []
        lst_retailer_filter = []
        dct_vendor_retailer_hub = self._all_vendor_retailer_hub

        sql = (
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ "
            "    DISTINCT VENDOR_KEY, RETAILER_KEY FROM {self._meta_schema}.CONFIG_CUSTOMERS"
            " WHERE VENDOR_NAME != 'Alert' AND HUB_ID = '{hub_id}'"
            " ORDER BY VENDOR_KEY, RETAILER_KEY".format(hub_id=hub_id,
                                                        self=self))
        self._logger.debug(sql)
        vendor_retailers = self._source_dw.query(sql)
        for row in vendor_retailers:
            _vr = "{row.VENDOR_KEY},{row.RETAILER_KEY}".format(row=row)
            # same vendor & retailer could have multi hubs.
            # So only getting the one which matches with the hub from Config service.
            if dct_vendor_retailer_hub.get(_vr) == hub_id or self._all_dim:
                if _vr not in self._dct_vendor_retailer_hub:
                    self._dct_vendor_retailer_hub[_vr] = hub_id
                    lst_vendor_retailer_filter.append(
                        "({row.VENDOR_KEY},{row.RETAILER_KEY})".format(
                            row=row))
                if str(row.VENDOR_KEY) not in self._dct_vendor_hub:
                    self._dct_vendor_hub["{}".format(row.VENDOR_KEY)] = hub_id
                    lst_vendor_filter.append("{}".format(row.VENDOR_KEY))
                if str(row.RETAILER_KEY) not in self._dct_retailer_hub:
                    self._dct_retailer_hub["{}".format(
                        row.RETAILER_KEY)] = hub_id
                    lst_retailer_filter.append("{}".format(row.RETAILER_KEY))
        if lst_vendor_retailer_filter:
            self._str_vendor_retailer_filter = (
                " AND (VENDOR_KEY, RETAILER_KEY) IN ({})"
                "".format(",".join(lst_vendor_retailer_filter)))
            self._logger.debug(self._str_vendor_retailer_filter)
        if lst_vendor_filter:
            self._str_vendor_filter = (" AND VENDOR_KEY IN ({})".format(
                ",".join(list(set(lst_vendor_filter)))))
            self._logger.debug(self._str_vendor_filter)
        if lst_retailer_filter:
            self._str_retailer_filter = (" AND RETAILER_KEY IN ({})".format(
                ",".join(list(set(lst_retailer_filter)))))
            self._logger.debug(self._str_retailer_filter)

    def _recreate_stage_table(self):
        """
        Creating related stage tables on IRIS DW.
        :return:
        """
        lst_sql = []
        for table in (list(self._dct_table.keys()) + ["PRODUCT", "STORE"]):
            lst_sql.append("""
                DROP TABLE IF EXISTS {self._common_schema}.STAGE_{table};
                CREATE TABLE {self._common_schema}.STAGE_{table} LIKE {self._common_schema}.{table} INCLUDING PROJECTIONS;
            """.format(self=self, table=table))
        sql = ''.join(lst_sql)
        self._logger.info(sql)
        self._dw.execute(sql)

        # creating staging table for loading CIS source data.
        _sql = """
         DROP TABLE IF EXISTS {cmnSchema}.STAGE_{itemTable}_CIS;
         CREATE TABLE {cmnSchema}.STAGE_{itemTable}_CIS LIKE {cmnSchema}.{itemTable} INCLUDING PROJECTIONS;
         
         DROP TABLE IF EXISTS {cmnSchema}.STAGE_{storeTable}_CIS;
         CREATE TABLE {cmnSchema}.STAGE_{storeTable}_CIS LIKE {cmnSchema}.{storeTable} INCLUDING PROJECTIONS;
         """.format(cmnSchema=self._common_schema,
                    itemTable=self._dim_product,
                    storeTable=self._dim_store)
        self._logger.info(_sql)
        self._dw.execute(_sql)

    def _truncate_stage_table(self):
        """Truncate stage table
        :return:
        """
        lst_sql = []
        for k in (list(self._dct_table.keys()) + ["PRODUCT", "STORE"]):
            lst_sql.append(
                "TRUNCATE TABLE {self._common_schema}.STAGE_{k}".format(
                    self=self, k=k))
        sql = ';\n'.join(lst_sql)
        self._logger.debug(sql)
        self._dw.execute(sql)

    def _analyze_table(self):
        """Analyze table
        :return:
        """
        lst_sql = []
        for k in self._dct_table.keys():
            lst_sql.append(
                "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ ANALYZE_STATISTICS('{self._common_schema}.{k}')"
                "".format(self=self, k=k))
        sql = ';\n'.join(lst_sql)
        self._logger.debug(sql)
        self._dw.execute(sql)

    def _export_calendar_data(self, hub_id):
        """ Export calendar data to common schema from source schema if no data in DIM_CALENDAR table
        :param hub_id:
        :return:
        """
        cnt = self._dw.query_scalar(
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ COUNT(*) "
            "FROM {self._common_schema}.{self._dim_calendar}".format(
                self=self))
        if cnt != 0:
            self._logger.info(
                "Calendar data exists, skip hub_id {}.".format(hub_id))
        else:
            col = (
                'PERIOD_KEY, CALENDAR_KEY, CALENDARNAME, YEAR, YEARNAME, QUARTER, QUARTERNAME, MONTH, MONTHNAME, '
                'PERIOD, PERIODNAME, WEEKENDED, WEEKENDEDNAME, WEEKBEGIN, WEEKBEGINNAME, YEARWEEKNAME, YEARWEEK, '
                'YEARMONTHWEEKNAME, LY_PERIOD_KEY, NY_PERIOD_KEY, DATE_NAME, DATE_VALUE, CAL_PERIOD_KEY, '
                '"2YA_PERIOD_KEY", "3YA_PERIOD_KEY", "4YA_PERIOD_KEY"')
            sql = """
            SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ CAL.PERIOD_KEY AS PERIOD_KEY,
                   META.CALENDAR_KEY AS CALENDAR_KEY,
                   MAX(META.CALENDAR_NAME) AS CALENDARNAME,
                   MAX(CAL."YEAR") AS "YEAR",
                   MAX(CAL.YEARNAME) AS YEARNAME,
                   MAX(CAL.YEARQUARTER) AS QUARTER,
                   MAX(CAL.YEARQUARTERNAME) AS QUARTERNAME,
                   MAX(CAL.YEARPERIOD) AS "MONTH",
                   MAX(CAL.YEARPERIODNAME) AS MONTHNAME,
                   MAX(CAL.YEARMONTH) AS PERIOD,
                   MAX(CAL.YEARMONTHNAME) AS PERIODNAME,
                   MAX(CAL.WEEKENDED) AS WEEKENDED,
                   MAX(CAL.WEEKENDEDNAME) AS WEEKENDEDNAME,
                   MAX(CAL.WEEKBEGIN) AS WEEKBEGIN,
                   MAX(CAL.WEEKBEGINNAME) AS WEEKBEGINNAME,
                   (((MAX(CAL.YEAR))::VARCHAR|| ' - Wk '::VARCHAR(6)) || SUBSTR(('00'::VARCHAR(2) 
                   ||(MAX(CAL.WEEK))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(MAX(CAL.WEEK))::VARCHAR)) - 2) + 1),2)
                   ) AS YEARWEEKNAME,
                   MAX(CAL.YEARWEEK) AS YEARWEEK,
                   (((((MAX(CAL.YEAR))::VARCHAR|| ' - '::VARCHAR(3)) || SUBSTR (('00'::VARCHAR(2) 
                   ||(MAX(CAL.MONTH))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(MAX(CAL.MONTH))::VARCHAR)) - 2) + 1),2)) 
                   || ' Wk '::VARCHAR(4)) || SUBSTR(('00'::VARCHAR(2) 
                   ||(CEILING(((RANK() OVER (PARTITION BY META.CALENDAR_KEY,MAX(CAL.YEAR),MAX(META.MONTH) 
                   ORDER BY CAL.PERIOD_KEY)*1.0) / 7::NUMERIC(18,0))))::VARCHAR),((LENGTH(('00'::VARCHAR(2) 
                   ||(CEILING(((RANK() OVER (PARTITION BY META.CALENDAR_KEY,MAX(CAL.YEAR),MAX(CAL.MONTH) ORDER BY 
                   CAL.PERIOD_KEY)*1.0) / 7::NUMERIC(18,0))))::VARCHAR)) - 2) + 1),2)) AS YEARMONTHWEEKNAME,
                   MAX(CAL.LY_PERIOD_KEY) LY_PERIOD_KEY,
                   MAX(CAL.NY_PERIOD_KEY) NY_PERIOD_KEY,
                   TO_CHAR(TO_DATE(CAL.PERIOD_KEY::VARCHAR, 'YYYYMMDD'), 'YYYY-MM-DD') DATE_NAME,
                   TO_DATE(CAL.PERIOD_KEY::VARCHAR, 'YYYYMMDD') DATE_VALUE,
                   CAL.PERIOD_KEY CAL_PERIOD_KEY,
                   MAX("2YA_PERIOD_KEY") "2YA_PERIOD_KEY",  MAX("3YA_PERIOD_KEY") "3YA_PERIOD_KEY", 
                   MAX("4YA_PERIOD_KEY") "4YA_PERIOD_KEY"
            FROM (
                (SELECT 2 AS CALENDAR_KEY, 'STD' AS PREFIX, 1 AS "MONTH", '445 (Standard)' AS CALENDAR_NAME) META
                  JOIN (SELECT C.PERIOD_KEY AS PERIOD_KEY,
                        C.CALENDAR_KEY AS CALENDAR_KEY,
                        C.YEAR,
                        C.WEEK,
                        C.MONTH,
                        (C.YEAR)::VARCHAR(4) AS YEARNAME,
                        QE.YEARQUARTER,
                        QE.YEARQUARTERNAME,
                        ME.YEARPERIOD AS YEARMONTH,
                        ME.YEARPERIODNAME AS YEARMONTHNAME,
                        ME.YEARPERIOD,
                        ME.YEARPERIODNAME,
                        WE.YEARWEEK AS YEARWEEK,
                        WE.WEEKBEGIN,
                        TO_CHAR(TO_DATE ((WE.WEEKBEGIN)::VARCHAR,'YYYYMMDD'::VARCHAR(8)),'MM/DD/YYYY'::VARCHAR(10)
                        ) AS WEEKBEGINNAME,
                        WE.WEEKENDED,
                        TO_CHAR(TO_DATE ((WE.WEEKENDED)::VARCHAR,'YYYYMMDD'::VARCHAR(8)),'MM/DD/YYYY'::VARCHAR(10)
                        ) AS WEEKENDEDNAME,
                        C.LY_PERIOD_KEY,
                        C.NY_PERIOD_KEY,
                        YA2.LY_PERIOD_KEY "2YA_PERIOD_KEY",  YA3.LY_PERIOD_KEY "3YA_PERIOD_KEY", 
                        YA4.LY_PERIOD_KEY "4YA_PERIOD_KEY"
                    FROM ((({self._dim_schema}.CALENDAR_PERIOD C
                      JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY,
                            MIN(CALENDAR_PERIOD.PERIOD_KEY) AS WEEKBEGIN,
                            MAX(CALENDAR_PERIOD.PERIOD_KEY) AS WEEKENDED,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.WEEK AS WEEK,
                            CASE
                            WHEN (CALENDAR_PERIOD.WEEK > 2000) THEN CALENDAR_PERIOD.WEEK
                            ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.WEEK)
                            END AS YEARWEEK
                        FROM {self._dim_schema}.CALENDAR_PERIOD
                        WHERE ((CALENDAR_PERIOD.WEEK_CODE = 'B'::CHAR(1)) OR (CALENDAR_PERIOD.WEEK_CODE = 'E'::CHAR(1)))
                        GROUP BY CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.WEEK) WE
                      ON ( ( (C.CALENDAR_KEY = WE.CALENDAR_KEY)
                      AND (C.YEAR = WE.YEAR)
                      AND (C.WEEK = WE.WEEK))))
                      JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.MONTH AS PERIOD,
                            CASE
                            WHEN (CALENDAR_PERIOD.MONTH > 2000) THEN (CALENDAR_PERIOD.MONTH)::VARCHAR(6)
                            ELSE CONCAT (CONCAT (CONCAT ('Period '::VARCHAR(7),(CALENDAR_PERIOD.MONTH)::VARCHAR(2)), 
                                 ', '::VARCHAR(2)),(CALENDAR_PERIOD.YEAR)::VARCHAR(4))
                            END AS YEARPERIODNAME,
                            CASE
                            WHEN (CALENDAR_PERIOD.MONTH > 2000) THEN CALENDAR_PERIOD.MONTH
                            ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.MONTH)
                            END AS YEARPERIOD
                        FROM {self._dim_schema}.CALENDAR_PERIOD
                        WHERE ((CALENDAR_PERIOD.MONTH_CODE = 'B'::CHAR (1)) 
                                OR (CALENDAR_PERIOD.MONTH_CODE = 'E'::CHAR (1)))
                        GROUP BY CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.MONTH) ME
                      ON ( ( (C.CALENDAR_KEY = ME.CALENDAR_KEY)
                      AND (C.YEAR = ME.YEAR)
                      AND (C.MONTH = ME.PERIOD))))
                      JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.QUARTER AS QUARTER,
                            CASE WHEN CALENDAR_PERIOD.QUARTER > 2000 THEN CONCAT(CONCAT (CONCAT ('Q'::VARCHAR(1), 
                                (SUBSTR((CALENDAR_PERIOD.QUARTER)::VARCHAR(6), 6))::VARCHAR(2)), 
                                ', '::VARCHAR(2)),(CALENDAR_PERIOD.QUARTER)::VARCHAR(4)) 
                            ELSE CONCAT(CONCAT (CONCAT ('Q'::VARCHAR(1),(CALENDAR_PERIOD.QUARTER)::VARCHAR(2)), 
                                ', '::VARCHAR(2)),(CALENDAR_PERIOD.YEAR)::VARCHAR(4)) 
                            END AS  YEARQUARTERNAME,
                            CASE
                            WHEN (CALENDAR_PERIOD.QUARTER > 2000) THEN CALENDAR_PERIOD.QUARTER
                            ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.QUARTER)
                            END AS YEARQUARTER
                        FROM {self._dim_schema}.CALENDAR_PERIOD
                        WHERE ((CALENDAR_PERIOD.QUARTER_CODE = 'B'::CHAR (1)) 
                                OR (CALENDAR_PERIOD.QUARTER_CODE = 'E'::CHAR (1)))
                        GROUP BY CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.QUARTER) QE
                      ON (((C.CALENDAR_KEY = QE.CALENDAR_KEY) AND (C.YEAR = QE.YEAR) AND (C.QUARTER = QE.QUARTER))))
                      LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA2 
                      ON C.CALENDAR_KEY = YA2.CALENDAR_KEY AND C.LY_PERIOD_KEY = YA2.PERIOD_KEY
                      LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA3 
                      ON C.CALENDAR_KEY = YA3.CALENDAR_KEY AND YA2.LY_PERIOD_KEY = YA3.PERIOD_KEY
                      LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA4 
                      ON C.CALENDAR_KEY = YA4.CALENDAR_KEY AND YA3.LY_PERIOD_KEY = YA4.PERIOD_KEY        
                    WHERE (C.YEAR >= ((DATE_PART ('year'::VARCHAR (4),(NOW ())::TIMESTAMP))::INT- 5))) CAL 
                  ON ( (CAL.CALENDAR_KEY = META.CALENDAR_KEY))
            )
            GROUP BY CAL.PERIOD_KEY,META.CALENDAR_KEY;
            """.format(self=self)
            dct_sync_data = self.context.copy()
            dct_sync_data["source_dw"] = self._source_dw
            dct_sync_data["source_config"] = self._source_dw.config.json_data[
                "configs"]
            dct_sync_data["target_dw_schema"] = self._common_schema
            dct_sync_data["target_dw_table"] = self._dim_calendar
            dct_sync_data["target_column"] = col
            dct_sync_data["source_sql"] = sql
            self._logger.debug(sql)
            sync_data(dct_sync_data)
            self._logger.info(
                "Sync hub_id {} dim calendar done".format(hub_id))

    def _export_product_data(self, hub_id, cis_source=False):
        """
        Export product data to common schema from source schema
        :param hub_id:
        :return:
        """

        _target_stage_table = "STAGE_{}".format(self._dim_product)
        _src_item_table = "PRODUCT"
        if cis_source:
            _target_stage_table = "STAGE_{}_CIS".format(self._dim_product)
            _src_item_table = "STAGE_{}".format(self._product_cis_src_tbl)

            # Creating related staging table for CIS source in HUB dw.
            # Since CIS source tables are external tables, can NOT be used to join other tables directly.
            sql = """
            DROP TABLE IF EXISTS {dimSchema}.{stgItemTable};
            CREATE TABLE {dimSchema}.{stgItemTable} AS 
            SELECT RETAILER_KEY, VENDOR_KEY, ITEM_KEY, UPC_ID AS UPC, 
                   ATTRIBUTE_NAME as ATTRIBNAME, ATTRIBUTE_VALUE as ATTRIBVALUE, 'T' AS ACTIVE 
            FROM {dimSchema}.{srcItemTable}
            WHERE 0=0 {filters};
            """.format(dimSchema=self._dim_schema,
                       stgItemTable=_src_item_table,
                       srcItemTable=self._product_cis_src_tbl,
                       filters=self._str_vendor_retailer_filter)
            self._logger.info(sql)
            self._source_dw.execute(sql)

        col = (
            " RETAILER_KEY, VENDOR_KEY, ITEM_KEY, UPC, ITEM_DESCRIPTION, ITEM_GROUP, OSM_ITEM_NBR, OSM_ITEM_STATUS,"
            "OSM_ITEM_TYPE, OSM_BRAND, OSM_UNIT_PRICE, OSM_CATEGORY, OSM_MAJOR_CATEGORY_NO, OSM_MAJOR_CATEGORY, "
            "OSM_SUB_CATEGORY_NO, OSM_SUB_CATEGORY, OSM_VENDOR_STK_NBR, OSM_VENDOR_PACK_COST, VENDOR_PACK_QTY, "
            "OSM_WHSE_PACK_COST, OSM_WHSE_PACK_QTY, DSD_IND, VENDOR_NAME ")
        sql = """
        SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ 
            CC.RETAILER_KEY, CC.VENDOR_KEY, PV.ITEM_KEY, PV.UPC, AGPG.ITEM_DESCRIPTION, 
            CASE WHEN AGPG.ITEM_GROUP IS NULL OR AGPG.ITEM_GROUP = '000000' THEN PV.UPC ELSE AGPG.ITEM_GROUP END AS 
            ITEM_GROUP, AGPG.OSM_ITEM_NBR, AGPG.OSM_ITEM_STATUS, AGPG.OSM_ITEM_TYPE, AGPG.OSM_BRAND,AGPG.OSM_UNIT_PRICE, 
            AGPG.OSM_CATEGORY, AGPG.OSM_MAJOR_CATEGORY_NO, AGPG.OSM_MAJOR_CATEGORY, AGPG.OSM_SUB_CATEGORY_NO, 
            AGPG.OSM_SUB_CATEGORY, AGPG.OSM_VENDOR_STK_NBR, AGPG.OSM_VENDOR_PACK_COST, 
            CASE WHEN length(AGPG.VENDOR_PACK_QTY) < 15
                THEN CASE WHEN REGEXP_LIKE(SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15), '^[0-9]*(\.[0-9]+)?$') 
                    THEN CASE WHEN SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15)::NUMERIC <= 1 THEN 1
                        ELSE SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15)::NUMERIC END 
                    ELSE 1 END 
                ELSE 1 END VENDOR_PACK_QTY, AGPG.OSM_WHSE_PACK_COST, AGPG.OSM_WHSE_PACK_QTY, 
            CASE WHEN AGPG.OSM_ITEM_TYPE IN ('07','37') THEN '1' ELSE '0' END AS DSD_IND, CC.VENDOR_NAME
        FROM (SELECT DISTINCT RETAILER_KEY, VENDOR_KEY, VENDOR_NAME FROM {self._meta_schema}.CONFIG_CUSTOMERS
              WHERE VENDOR_NAME != 'Alert' {self._str_vendor_retailer_filter}) CC
        JOIN (
            SELECT D.VENDOR_KEY, D.ITEM_KEY, MAX(D.UPC) UPC FROM {self._dim_schema}.{itemTable} D
            JOIN {self._dim_schema}.VENDOR V on V.VENDOR_KEY = D.VENDOR_KEY
            WHERE D.ITEM_KEY NOT IN (SELECT ITEM_KEY FROM {self._dim_schema}.PRODUCT_GLOBAL_FILTER) 
            {self._active_status}
            GROUP BY D.VENDOR_KEY, D.ITEM_KEY
        ) PV ON PV.VENDOR_KEY = CC.VENDOR_KEY
        LEFT JOIN (
            SELECT AGP.RETAILER_KEY, AGP.VENDOR_KEY, AGP.ITEM_KEY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'ITEM_GROUP_X' THEN ATTRIBVALUE END) ITEM_GROUP
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'VENDOR_PACK_QTY_X' THEN ATTRIBVALUE END) VENDOR_PACK_QTY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'ITEM_DESCRIPTION_X' THEN ATTRIBVALUE END) ITEM_DESCRIPTION
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_NBR_X' THEN ATTRIBVALUE END)     OSM_ITEM_NBR
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_STATUS_X' THEN ATTRIBVALUE END)     OSM_ITEM_STATUS
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_TYPE_X' THEN ATTRIBVALUE END)     OSM_ITEM_TYPE
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_WHSE_PACK_QTY_X' THEN ATTRIBVALUE END)     OSM_WHSE_PACK_QTY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_MAJOR_CATEGORY_NO_X' THEN ATTRIBVALUE END)     OSM_MAJOR_CATEGORY_NO
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_MAJOR_CATEGORY_X' THEN ATTRIBVALUE END)     OSM_MAJOR_CATEGORY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_CATEGORY_X' THEN ATTRIBVALUE END)     OSM_CATEGORY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_SUB_CATEGORY_NO_X' THEN ATTRIBVALUE END)     OSM_SUB_CATEGORY_NO
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_SUB_CATEGORY_X' THEN ATTRIBVALUE END)     OSM_SUB_CATEGORY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_BRAND_X' THEN ATTRIBVALUE END)     OSM_BRAND
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_UNIT_PRICE_X' THEN ATTRIBVALUE END)     OSM_UNIT_PRICE
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_VENDOR_STK_NBR_X' THEN ATTRIBVALUE END)     OSM_VENDOR_STK_NBR
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_VENDOR_PACK_COST_X' THEN ATTRIBVALUE END)     OSM_VENDOR_PACK_COST
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_WHSE_PACK_COST_X' THEN ATTRIBVALUE END)     OSM_WHSE_PACK_COST
            FROM (
                SELECT AG.RETAILER_KEY, AG.VENDOR_KEY, CONCAT(ATTR_GROUP, '_X') ATTRIBNAME, D.ITEM_KEY, D.ATTRIBVALUE
                      FROM {self._meta_schema}.ATTRIBUTEGROUP AG
                      JOIN {self._dim_schema}.{itemTable} D
                      ON D.VENDOR_KEY = AG.VENDOR_KEY AND D.ATTRIBNAME = AG.ATTR_NAME {self._active_status}
                      WHERE AG.ATTR_GROUP IN ('ITEM_GROUP','VENDOR_PACK_QTY','ITEM_DESCRIPTION','OSM_ITEM_NBR'
                          ,'OSM_ITEM_STATUS','OSM_ITEM_TYPE','OSM_WHSE_PACK_QTY','OSM_MAJOR_CATEGORY_NO'
                          ,'OSM_MAJOR_CATEGORY','OSM_CATEGORY','OSM_SUB_CATEGORY_NO','OSM_SUB_CATEGORY','OSM_BRAND',
                          'OSM_UNIT_PRICE','OSM_VENDOR_STK_NBR','OSM_VENDOR_PACK_COST','OSM_WHSE_PACK_COST')
            ) AGP
            GROUP BY AGP.RETAILER_KEY, AGP.VENDOR_KEY, AGP.ITEM_KEY
        ) AGPG ON AGPG.RETAILER_KEY = CC.RETAILER_KEY AND AGPG.VENDOR_KEY = PV.VENDOR_KEY AND AGPG.ITEM_KEY =PV.ITEM_KEY
        WHERE (CC.RETAILER_KEY, CC.VENDOR_KEY, PV.ITEM_KEY) NOT IN
                (SELECT RETAILER_KEY, VENDOR_KEY, ITEM_KEY FROM {self._dim_schema}.PRODUCT_DEFAULT_FILTER)
        """.format(self=self, itemTable=_src_item_table)
        dct_sync_data = self.context.copy()
        dct_sync_data["source_dw"] = self._source_dw
        dct_sync_data["source_config"] = self._source_dw.config.json_data[
            "configs"]
        dct_sync_data["target_dw_schema"] = self._common_schema
        dct_sync_data["target_dw_table"] = _target_stage_table
        dct_sync_data["target_column"] = col
        dct_sync_data["source_sql"] = sql
        self._logger.debug(sql)
        sync_data(dct_sync_data)

        # sync tall table(not for CIS).
        if self._str_vendor_filter and not cis_source:
            self._product_columns = self._dct_table_column["PRODUCT"]
            col = " VENDOR_KEY, ITEM_KEY, UPC, ATTRIBNAME, ATTRIBVALUE, ACTIVE "
            sql = (
                "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {col} FROM {self._dim_schema}.PRODUCT WHERE ATTRIBNAME "
                "IN ({self._product_columns}) {self._str_vendor_filter}".
                format(self=self, col=col))
            dct_sync_data["target_dw_table"] = "STAGE_PRODUCT"
            dct_sync_data["target_column"] = col
            dct_sync_data["source_sql"] = sql
            self._logger.debug(sql)
            sync_data(dct_sync_data)

    def _export_store_data(self, hub_id, cis_source=False):
        """
        Export store data to common schema table stage_<table_name> from source schema <table_name>
        :param hub_id:
        :return:
        """

        _target_store_table = "STAGE_{}".format(self._dim_store)
        _src_store_table = "STORE"
        if cis_source:
            _target_store_table = "STAGE_{}_CIS".format(self._dim_store)
            _src_store_table = "STAGE_{}".format(self._store_cis_src_tbl)

            # Creating related staging table for CIS source in HUB dw.
            # Since CIS source tables are external tables, can NOT be used to join other tables directly.
            sql = """
            DROP TABLE IF EXISTS {dimSchema}.{stgStoreTable};
            CREATE TABLE {dimSchema}.{stgStoreTable} AS 
            SELECT RETAILER_KEY, VENDOR_KEY, STORE_KEY, STORE_ID AS STOREID, 
                   ATTRIBUTE_NAME as ATTRIBNAME, ATTRIBUTE_VALUE as ATTRIBVALUE, 'T' as ACTIVE 
            FROM {dimSchema}.{srcStoreTable}
            WHERE 0=0 {filters};
            """.format(dimSchema=self._dim_schema,
                       stgStoreTable=_src_store_table,
                       srcStoreTable=self._store_cis_src_tbl,
                       filters=self._str_vendor_retailer_filter)
            self._logger.info(sql)
            self._source_dw.execute(sql)

        col = (
            " RETAILER_KEY, VENDOR_KEY, STORE_KEY, STORE_ID, MARKET_CLUSTER, STATE, CITY, ZIP, OSM_REGION, "
            "OSM_DISTRICT, RSI_ALERT_TYPE, RETAILER_NAME, RSI_BANNER, PRIME_DC, STORE_NAME"
        )
        sql = """
        SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ CC.RETAILER_KEY, CC.VENDOR_KEY, SR.STORE_KEY, SR.STOREID, 
            AGSG.MARKET_CLUSTER, SR.STATE,
            CASE WHEN SR.CITY IS NULL AND STATE IS NULL THEN 'N/A'
                WHEN SR.CITY IS NOT NULL AND STATE IS NULL THEN CONCAT(SR.CITY, ', N/A')
                WHEN SR.CITY IS NULL AND STATE IS NOT NULL THEN CONCAT('N/A, ' , SR.STATE)
                WHEN SR.CITY IS NOT NULL AND STATE IS NOT NULL THEN CONCAT(CONCAT(SR.CITY, ', '), SR.STATE) END CITY,
            SR.ZIP, AGSG.OSM_REGION, AGSG.OSM_DISTRICT, COALESCE(SR.RSI_ALERT_TYPE, '0') RSI_ALERT_TYPE,
            CC.RETAILER_NAME, coalesce(AGSG.RSI_BANNER, 'unmapped') AS RSI_BANNER, AGSG.PRIME_DC, AGSG.STORE_NAME
        FROM (SELECT DISTINCT RETAILER_KEY, VENDOR_KEY, RETAILER_NAME FROM {self._meta_schema}.CONFIG_CUSTOMERS
              WHERE VENDOR_NAME != 'Alert' {self._str_vendor_retailer_filter}) CC
        JOIN (
            SELECT D.RETAILER_KEY, D.STORE_KEY, D.STOREID,
                MAX(CASE WHEN D.ATTRIBNAME = 'STATE' THEN D.ATTRIBVALUE END) STATE,
                MAX(CASE WHEN D.ATTRIBNAME = 'CITY' THEN D.ATTRIBVALUE END) CITY,
                MAX(CASE WHEN D.ATTRIBNAME = 'ZIP' THEN D.ATTRIBVALUE END) ZIP,
                MAX(CASE WHEN D.ATTRIBNAME = 'RSI_ALERT_TYPE' THEN D.ATTRIBVALUE END) RSI_ALERT_TYPE
            FROM {self._dim_schema}.{storeTable} D
            JOIN {self._dim_schema}.RETAILER R ON R.RETAILER_KEY = D.RETAILER_KEY
            WHERE STORE_KEY NOT IN (SELECT STORE_KEY FROM {self._dim_schema}.STORES_GLOBAL_FILTER) {self._active_status}
            GROUP BY D.RETAILER_KEY, D.STORE_KEY, D.STOREID
        ) SR ON SR.RETAILER_KEY = CC.RETAILER_KEY
        LEFT JOIN (
            SELECT AGS.RETAILER_KEY, AGS.VENDOR_KEY, AGS.STORE_KEY,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'MARKET_CLUSTER_X' THEN ATTRIBVALUE END) MARKET_CLUSTER,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'OSM_REGION_X' THEN ATTRIBVALUE END) OSM_REGION,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'OSM_DISTRICT_X' THEN ATTRIBVALUE END) OSM_DISTRICT,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'RSI_BANNER_X' THEN ATTRIBVALUE END) RSI_BANNER,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'PRIME_DC_X' THEN ATTRIBVALUE END) PRIME_DC,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'SAR_STORE_NAME_X' THEN ATTRIBVALUE END) STORE_NAME
                FROM (
                    SELECT AG.RETAILER_KEY, AG.VENDOR_KEY, CONCAT(AG.ATTR_GROUP, '_X') ATTRIBNAME, D.STORE_KEY, 
                        D.ATTRIBVALUE
                    FROM {self._meta_schema}.ATTRIBUTEGROUP AG
                    JOIN {self._dim_schema}.{storeTable} D
                    ON D.RETAILER_KEY = AG.RETAILER_KEY AND D.ATTRIBNAME = AG.ATTR_NAME {self._active_status}
                    WHERE AG.ATTR_GROUP IN ('RSI_BANNER','PRIME_DC', 'MARKET_CLUSTER', 'OSM_REGION', 'OSM_DISTRICT', 'SAR_STORE_NAME')
                ) AGS
                GROUP BY AGS.RETAILER_KEY, AGS.VENDOR_KEY, AGS.STORE_KEY
        ) AGSG 
        ON AGSG.RETAILER_KEY = SR.RETAILER_KEY AND AGSG.VENDOR_KEY = CC.VENDOR_KEY
        AND AGSG.STORE_KEY = SR.STORE_KEY
        WHERE (CC.RETAILER_KEY, CC.VENDOR_KEY, SR.STORE_KEY) NOT IN
                (SELECT RETAILER_KEY, VENDOR_KEY, STORE_KEY FROM {self._dim_schema}.STORES_DEFAULT_FILTER);
        """.format(self=self, storeTable=_src_store_table)
        dct_sync_data = self.context.copy()
        dct_sync_data["source_dw"] = self._source_dw
        dct_sync_data["source_config"] = self._source_dw.config.json_data[
            "configs"]
        dct_sync_data["target_dw_schema"] = self._common_schema
        dct_sync_data["target_dw_table"] = _target_store_table
        dct_sync_data["target_column"] = col
        dct_sync_data["source_sql"] = sql
        self._logger.debug(sql)
        sync_data(dct_sync_data)

        # sync tall table(not for CIS)
        if self._str_retailer_filter and not cis_source:
            self._store_columns = self._dct_table_column["STORE"]
            col = " RETAILER_KEY, STORE_KEY, STOREID, ATTRIBNAME, ATTRIBVALUE, ACTIVE "
            sql = (
                "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {col} FROM {self._dim_schema}.STORE WHERE ATTRIBNAME "
                "IN ({self._store_columns}) {self._str_retailer_filter}".
                format(self=self, col=col))
            dct_sync_data["target_dw_table"] = "STAGE_STORE"
            dct_sync_data["target_column"] = col
            dct_sync_data["source_sql"] = sql
            self._logger.debug(sql)
            sync_data(dct_sync_data)

    def _load_product_data_cis(self, hub_id):
        """
        Sync data from CIS source.
        if the same item_key exists in both CIS and NXG HUBs, the CIS one should override the one in NXG side.
        :param hub_id:
        :return:
        """

        # dump CIS data into cis staging table: stage_dim_product_cis
        self._export_product_data(hub_id=hub_id, cis_source=True)

        # Below part is moved from method self._export_product_data. Put it here when all product data are ready.
        # Override nxg items with CIS if there are duplicated items on both NXG and CIS
        sql = """
         DELETE FROM {cmnSchema}.STAGE_{itemTable} 
         WHERE (vendor_key, retailer_key, item_key) IN
         (SELECT vendor_key, retailer_key, item_key
          FROM {cmnSchema}.STAGE_{itemTable}_CIS);
         
         INSERT INTO {cmnSchema}.STAGE_{itemTable} 
         SELECT * FROM {cmnSchema}.STAGE_{itemTable}_CIS;
        """.format(itemTable=self._dim_product, cmnSchema=self._common_schema)
        self._logger.info(sql)
        self._dw.execute(sql)

        # updating column OSM_MAJOR_CATEGORY when NXG and CIS sources are all done.
        sql = """
        DROP TABLE IF EXISTS TEMP_OSM_SUB_CATEGORY_CONSISTENCY;
        CREATE LOCAL TEMP TABLE IF NOT EXISTS TEMP_OSM_SUB_CATEGORY_CONSISTENCY ON COMMIT PRESERVE ROWS 
        AS /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ 
        SELECT distinct OSM_SUB_CATEGORY_NO,MAX(OSM_SUB_CATEGORY) as OSM_SUB_CATEGORY 
        FROM {self._common_schema}.{self._dim_product} GROUP BY OSM_SUB_CATEGORY_NO;

        DROP TABLE IF EXISTS TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY;
        CREATE LOCAL TEMP TABLE IF NOT EXISTS TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY ON COMMIT PRESERVE ROWS 
        AS /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ 
        SELECT distinct OSM_MAJOR_CATEGORY_NO,MAX(OSM_MAJOR_CATEGORY) as OSM_MAJOR_CATEGORY 
        FROM {self._common_schema}.{self._dim_product} GROUP BY OSM_MAJOR_CATEGORY_NO;
        """.format(self=self)
        self._logger.info(sql)
        self._dw.execute(sql)

        sql = ("""
        SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ ANALYZE_STATISTICS('{self._common_schema}.STAGE_{self._dim_product}');

        INSERT /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ INTO TEMP_OSM_SUB_CATEGORY_CONSISTENCY
        SELECT a.OSM_SUB_CATEGORY_NO, MAX(a.OSM_SUB_CATEGORY) AS OSM_SUB_CATEGORY
        FROM {self._common_schema}.STAGE_{self._dim_product} a
        LEFT JOIN TEMP_OSM_SUB_CATEGORY_CONSISTENCY b on a.OSM_SUB_CATEGORY_NO = b.OSM_SUB_CATEGORY_NO
        WHERE a.OSM_SUB_CATEGORY_NO IS NOT NULL AND a.OSM_SUB_CATEGORY_NO <> '' AND b.OSM_SUB_CATEGORY_NO IS NULL
        GROUP BY a.OSM_SUB_CATEGORY_NO;

        UPDATE /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {self._common_schema}.STAGE_{self._dim_product}
        SET OSM_SUB_CATEGORY = b.OSM_SUB_CATEGORY
        FROM 
        (SELECT * FROM TEMP_OSM_SUB_CATEGORY_CONSISTENCY) b
        WHERE {self._common_schema}.STAGE_{self._dim_product}.OSM_SUB_CATEGORY_NO = b.OSM_SUB_CATEGORY_NO;
        /*UPDATE {self._common_schema}.STAGE_{self._dim_product}
        SET ITEM_GROUP = B.ITEM_GROUP
        FROM $schemaName.OLAP_ITEM B
        WHERE {self._common_schema}.STAGE_{self._dim_product}.ITEM_KEY = B.ITEM_KEY;*/

        INSERT /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ INTO TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY
        SELECT a.OSM_MAJOR_CATEGORY_NO, MAX(a.OSM_MAJOR_CATEGORY) AS OSM_MAJOR_CATEGORY
        FROM {self._common_schema}.STAGE_{self._dim_product} a
        LEFT JOIN TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY b on a.OSM_MAJOR_CATEGORY_NO = b.OSM_MAJOR_CATEGORY_NO
        WHERE a.OSM_MAJOR_CATEGORY_NO IS NOT NULL AND a.OSM_MAJOR_CATEGORY_NO <> '' AND b.OSM_MAJOR_CATEGORY_NO IS NULL
        GROUP BY a.OSM_MAJOR_CATEGORY_NO;

        UPDATE /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {self._common_schema}.STAGE_{self._dim_product}
        SET OSM_MAJOR_CATEGORY = b.OSM_MAJOR_CATEGORY
        FROM 
        (SELECT * FROM TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY) b
        WHERE {self._common_schema}.STAGE_{self._dim_product}.OSM_MAJOR_CATEGORY_NO = b.OSM_MAJOR_CATEGORY_NO;
        """.format(self=self))
        self._logger.info(sql)
        self._dw.execute(sql)

    def _load_store_data_cis(self, hub_id):
        """
        Sync data from CIS source.
        # if the same store_key exists in CIS and NXG HUBs, the CIS one should override the one in NXG side.
        :param hub_id:
        :return:
        """

        # dump CIS data into cis staging table: stage_dim_store_cis
        self._export_store_data(hub_id=hub_id, cis_source=True)

        # # Override nxg stores with CIS if there are duplicated items on both NXG and CIS
        sql = """
        DELETE FROM {cmnSchema}.STAGE_{storeTable} 
        WHERE (vendor_key, retailer_key, store_key) 
        IN (SELECT vendor_key, retailer_key, store_key 
            FROM {cmnSchema}.STAGE_{storeTable}_CIS );
        
        INSERT INTO {cmnSchema}.STAGE_{storeTable}  
        SELECT * FROM {cmnSchema}.STAGE_{storeTable}_CIS;
        """.format(storeTable=self._dim_store, cmnSchema=self._common_schema)
        self._logger.info(sql)
        self._dw.execute(sql)

    def _swap_data(self):
        """
        Switch partition from stage table to final table.
        :return:
        """

        sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT VENDOR_KEY "
               "FROM {self._common_schema}.STAGE_{self._dim_product}".format(
                   self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(
                schema_name=self._common_schema,
                table_name=self._dim_product,
                partition_name=row[0],
                stage_schema_name=self._common_schema,
                stage_table_name='STAGE_{self._dim_product}'.format(self=self))

        sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT VENDOR_KEY "
               "FROM {self._common_schema}.STAGE_PRODUCT".format(self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(schema_name=self._common_schema,
                                      table_name='PRODUCT',
                                      partition_name=row[0],
                                      stage_schema_name=self._common_schema,
                                      stage_table_name='STAGE_PRODUCT')

        sql = (
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT RETAILER_KEY "
            "FROM {self._common_schema}.STAGE_{self._dim_store}".format(
                self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(
                schema_name=self._common_schema,
                table_name=self._dim_store,
                partition_name=row[0],
                stage_schema_name=self._common_schema,
                stage_table_name='STAGE_{self._dim_store}'.format(self=self))

        sql = (
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT RETAILER_KEY "
            "FROM {self._common_schema}.STAGE_STORE".format(self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(schema_name=self._common_schema,
                                      table_name='STORE',
                                      partition_name=row[0],
                                      stage_schema_name=self._common_schema,
                                      stage_table_name='STAGE_STORE')

    def _process_sync_dim_data(self):
        """
        Process to sync dim data
        :return:
        """
        try:
            self._get_column()
            self._recreate_stage_table()
            for hub_id in self._hub_ids:
                try:
                    self._dim_schema = "DIM_{hub_id}".format(hub_id=hub_id)
                    self._meta_schema = "METADATA_{hub_id}".format(
                        hub_id=hub_id)
                    self._logger.info(
                        "Sync dim data start hub_id {hub_id}".format(
                            hub_id=hub_id))
                    self._get_source_config(hub_id)
                    self._export_calendar_data(hub_id)
                    self._get_vendor_retailer_from_hub(hub_id)

                    if not self._str_vendor_retailer_filter:
                        self._logger.warning(
                            "hub_id {} does not have any matched product and store data"
                            .format(hub_id))
                        continue

                    self._export_product_data(hub_id)
                    self._export_store_data(hub_id)

                    # handle CIS source.
                    self._load_product_data_cis(hub_id)
                    self._load_store_data_cis(hub_id)

                    self._logger.info(
                        "Sync hub_id {hub_id} dim data to stage table.".format(
                            hub_id=hub_id))
                except Exception as msg:
                    self._logger.warning(msg)
                    continue

            self._swap_data()
            # self._truncate_stage_table()
            self._logger.info("Sync dim data done from hubs {hub_ids}".format(
                hub_ids=self._hub_ids))
            self._analyze_table()
        except Exception as msg:
            self._logger.error(msg)
        finally:
            pass

    def process(self):
        try:
            self._process_sync_dim_data()

        except Exception as e:
            self._logger.error(e)

        finally:
            if hasattr(self, "_dw"):
                self._dw.close_connection()
            if hasattr(self, "_source_dw"):
                self._source_dw.close_connection()
            if hasattr(self, "_db"):
                self._db.close_connection()
コード例 #2
0
class Feedback(object):
    def __init__(self, meta, params=None, init_flag=False, logger=None):
        """
        # sync feedback data from RDP side to IRIS(OSA) side by incremental via event_key.
        # 1, only sync those vendor&retailer which applied OSA Service.
        # 2, for Those new vendor&retailer, copy all historical data when initialization.

        :param meta: [mandatory] config data from config.properties file
        :param params: 2 cases here. depends on whether sync rdp feedback for whole RDP or new customer. see below.
            1, rdp_id: if rdp_id was given, then sync all data for this given RDP. otherwise, sync data from all related RDPs.
               Noted: rdp_id will be passed when calling this service via REST API.
            2, vendor_key:   mandatory only when init_flag is True.
               retailer_key: mandatory only when init_flag is True
               Noted: These 2 parameters will not be passed from REST API but called directly by deploy scripts.
        :param init_flag:
            if init_flag is True: then only sync feedback data for given vendor & retailer. This is used when introducing new customer.
            if init_flat is False: sync all customers' data from RDP periodically(e.g. sync daily).
        :param logger:
        """
        self.meta = meta
        self._params = {} if params is None else params
        self._rdp_id = self._params.get("rdpId", None)
        self._fact_type = 'fdbk'
        self._init_flag = init_flag
        self._vendor_key = self._params.get("vendor_key", None)
        self._retailer_key = self._params.get("retailer_key", None)
        self._debug = self._params.get('debug', 'N')
        self._default_rdp = "RDP_AUX"

        self._log_file = './log/sync_fdbk_%s_%s.log' % (
            self._rdp_id, datetime.datetime.now().strftime('%Y%m%d'))
        self.logger = logger if logger else Logger(log_level="debug",
                                                   target="console|file",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   log_file=self._log_file,
                                                   sql_conn=None)

        self.osa_app_conn = MSOperation(meta=self.meta, logger=self.logger)
        self.osa_dw_conn = DWOperation(meta=self.meta, logger=self.logger)
        self.max_event_key = None

        # we already know feedback table name of RDP
        self.source_table_rdp = "DS_FACT_FEEDBACK"  # source table in RDP side.
        self.staging_import_table_osa = "STAGE_FACT_FEEDBACK_RDP"  # used to store sync data from RDP table (same structure as table DS_FACT_FEEDBACK)
        self.target_table_osa = "FACT_FEEDBACK"  # final table in OSA side
        self.capacity = Capacity(meta=meta)

        self.dct_sync_data = copy.deepcopy(
            self.meta)  # required for calling sync_data module
        self.dct_sync_data[
            "meta"] = self.meta  # required for calling sync_data module
        self.dct_sync_data["target_osa_conn"] = self.osa_dw_conn
        self.dct_sync_data["target_dw_schema"] = self.meta[
            'db_conn_vertica_common_schema']
        self.dct_sync_data["target_dw_table"] = self.staging_import_table_osa
        self.dct_sync_data["logger"] = self.logger

        # [True|False(default)] True: direct connection between Vertica clusters. False: using vsql.
        self.dct_sync_data["dw_conn_vertica"] = False
        # self.dct_sync_data["dw_conn_vertica"] = True

        self.transfer = TransferData(dct_sync_data=self.dct_sync_data)

    def _populate_source_config(self, source_config):
        self.logger.debug("The source config is: %s" % source_config)

        _src_config = {}
        if os.name == 'nt':
            _src_config["temp_file_path"] = "d:"
        elif os.name == 'posix':
            _src_config["temp_file_path"] = "/tmp"

        # Getting user account from config.properties file first.
        if self.meta.get("db_conn_vertica_rdp_username"):
            _src_config["dw.etluser.id"] = self.meta.get(
                "db_conn_vertica_rdp_username")
            if self.meta.get("db_conn_vertica_rdp_password"):
                _src_config["dw.etluser.password"] = self.meta.get(
                    "db_conn_vertica_rdp_password")
            else:
                _pmp_pwd = get_password(
                    username=self.meta.get("db_conn_vertica_rdp_username"),
                    meta=self.meta)
                # The pwd should be encrypted in order to: 1, align with else part, 2, pass it to db.sync_data module
                _src_config["dw.etluser.password"] = Crypto().encrypt(_pmp_pwd)
        # if not configed then get them directly from RDP config.
        else:
            _src_config["dw.etluser.id"] = source_config.get("dw.etluser.id")
            # the pwd is encrypted
            _src_config["dw.etluser.password"] = source_config.get(
                "dw.etluser.password")

        # required info for calling sync_data module.
        _src_config["dw.server.name"] = source_config.get("dw.server.name")
        _src_config["dw.db.name"] = source_config.get("dw.db.name")
        _src_config["dw.db.portno"] = source_config.get("dw.db.portno", 5433)
        _src_config["dw.schema.name"] = source_config.get("dw.schema.name")

        self.logger.debug("srouce config is: %s" % _src_config)
        self.dct_sync_data["source_config"] = _src_config

        # Create the connection to RDP Vertica Cluster. which is the source Vertica cluster
        rdp_meta = copy.deepcopy(self.meta)
        tmp_rdp_meta = {
            'db_conn_vertica_servername': _src_config["dw.server.name"],
            'db_conn_vertica_port': _src_config["dw.db.portno"],
            'db_conn_vertica_dbname': _src_config["dw.db.name"],
            'db_conn_vertica_username': _src_config["dw.etluser.id"],
            'db_conn_vertica_password': _src_config["dw.etluser.password"],
            'db_conn_vertica_password_encrypted': "true"
        }
        rdp_meta.update(tmp_rdp_meta)
        self.logger.debug("rdp config is: %s" % rdp_meta)
        rdp_connection = DWOperation(meta=rdp_meta)
        self.dct_sync_data["source_dw"] = rdp_connection

    def main_process(self):
        try:
            # if not introducing new customer and _rdp_id was given,
            # then we will sync all feedback data from given RDP for registered users.
            if self._init_flag is False and self._rdp_id:
                try:
                    rdp_config = Config(meta=self.meta,
                                        hub_id=self._rdp_id).json_data
                    if not rdp_config['configs']:
                        raise Warning(
                            "There is no configs returned for RDP: %s."
                            "Please check if this RDP registered in CP with below URL."
                            "%s/properties/rdps?factType=fdbk" %
                            (self._rdp_id, self.meta["api_config_str"]))
                        # exit(StepStatus.SUCCESS.value)

                    _rdp_schema = rdp_config['configs'].get('dw.schema.name')
                    self.logger.info("Started to sync data from rdp: %s" %
                                     _rdp_schema)

                    # self.dct_sync_data["source_config"] = rdp_config['configs']
                    self._populate_source_config(rdp_config['configs'])

                    self.initialize()

                    _flag = self.load_data()
                    if _flag:
                        # if no data, then no need to process & update variables table.
                        self.process_data()

                        sql = """
                        IF NOT EXISTS(SELECT * FROM VARIABLES WHERE VARIABLE_NAME = '{eventType}')
                            INSERT INTO VARIABLES (VARIABLE_NAME, VARIABLE_VALUE, PREVIOUS_VALUE, INSERT_TIME, UPDATE_TIME)
                            VALUES ('{eventType}', '{value}', '', getdate(), getdate())
                        ELSE
                            UPDATE VARIABLES
                            SET PREVIOUS_VALUE = VARIABLE_VALUE, VARIABLE_VALUE = '{value}',UPDATE_TIME = getdate()
                            WHERE VARIABLE_NAME = '{eventType}'
                        """.format(eventType=_rdp_schema,
                                   value=self.max_event_key)
                        self.logger.info(sql)
                        self.osa_app_conn.execute(sql)

                    self.logger.info("Data sync done for RDP: %s" %
                                     _rdp_schema)

                except Exception as e:
                    self.logger.warning(e)
                    raise
                    # exit(StepStatus.SUCCESS.value)  # exit(0) otherwise Docker container will fail.

            # Else we will get all RDPs from REST API: http://10.172.36.75/config/properties/rdps?factType=fdbk
            # There could be multi RDPs(e.g. for SVR & WM). if so, loop all RDPs
            elif self._init_flag is False and self._rdp_id is None:
                try:
                    rdp_configs = Config(
                        meta=self.meta,
                        rdp_info=True,
                        rdp_fact_type=self._fact_type).json_data
                    if not rdp_configs:
                        raise Warning(
                            "No feedback related RDP found."
                            "Please check if any data returned from below URL."
                            "%s/properties/rdps?factType=fdbk" %
                            (self.meta["api_config_str"]))
                        # exit(StepStatus.SUCCESS.value)

                    for rdp_config in rdp_configs:
                        _rdp_schema = rdp_config['configs'].get(
                            'dw.schema.name')
                        self.logger.info("Started to sync data from rdp: %s" %
                                         _rdp_schema)

                        # self.dct_sync_data["source_config"] = rdp_config['configs']
                        self._populate_source_config(rdp_config['configs'])

                        self.initialize()

                        _flag = self.load_data()
                        if _flag:
                            # if no data, then no need to process & update variables table.
                            self.process_data()

                            sql = """
                            IF NOT EXISTS(SELECT * FROM VARIABLES WHERE VARIABLE_NAME = '{eventType}')
                                INSERT INTO VARIABLES (VARIABLE_NAME, VARIABLE_VALUE, PREVIOUS_VALUE, INSERT_TIME, UPDATE_TIME)
                                VALUES ('{eventType}', '{value}', '', getdate(), getdate())
                            ELSE
                                UPDATE VARIABLES
                                SET PREVIOUS_VALUE = VARIABLE_VALUE, VARIABLE_VALUE = '{value}',UPDATE_TIME = getdate()
                                WHERE VARIABLE_NAME = '{eventType}'
                            """.format(eventType=_rdp_schema,
                                       value=self.max_event_key)
                            self.logger.info(sql)
                            self.osa_app_conn.execute(sql)

                        self.logger.info("Data sync done for RDP: %s" %
                                         _rdp_schema)

                except Exception as e:
                    self.logger.warning(e)
                    raise

            elif self._init_flag is True:
                if self._vendor_key is None or self._retailer_key is None:
                    self.logger.warning(
                        "vendor_key and retailer_key are required when initilize feedback for new customer"
                    )
                    raise ValueError

                # getting fdbk related rdps.
                try:
                    rdp_configs = Config(
                        meta=self.meta,
                        rdp_info=True,
                        rdp_fact_type=self._fact_type).json_data
                    if not rdp_configs:
                        self.logger.warning(
                            "No feedback related RDP found."
                            "Please check if any data returned from below URL."
                            "%s/properties/rdps?factType=fdbk" %
                            (self.meta["api_config_str"]))
                        exit(StepStatus.SUCCESS.value)

                    fdbk_rdps = [
                        str(rdp_config["rdpId"]).upper()
                        for rdp_config in rdp_configs
                    ]

                    # change table name in case conflict with normal sync process
                    self.dct_sync_data[
                        "target_dw_table"] = "{0}_{1}_{2}".format(
                            self.staging_import_table_osa, self._vendor_key,
                            self._retailer_key)

                    _silo_config = Config(
                        meta=self.meta,
                        vendor_key=self._vendor_key,
                        retailer_key=self._retailer_key).json_data
                    _silo_type = _silo_config['configs'].get(
                        'etl.silo.type', 'SVR')
                    _rdp_id = _silo_config['configs'].get('rdp.db.name')

                    # RDP_AUX is default rdp id for feedback etl on PRODUCTION.
                    # 1, if there is no RDP for given silo. then exit.
                    if not _rdp_id or str(_rdp_id).strip() == '':
                        self.logger.warning(
                            "There is no RDP silo configed for the given vendor:%s "
                            "and retailer:%s. So no need to sync feedback." %
                            (self._vendor_key, self._retailer_key))
                        exit(StepStatus.SUCCESS.value)

                    # 2, Getting configed RDP list, and check if there are feedback related RDPs.
                    _tmp_rdp_lst = str(_rdp_id).upper().split(sep=",")
                    _rdp_lst = [_tmp.strip() for _tmp in _tmp_rdp_lst]
                    # common_rdps is RDP silo configed for syncing feedback data for given silo(vendor&retailer)
                    common_rdps = list(set(_rdp_lst).intersection(fdbk_rdps))
                    if common_rdps is None:
                        self.logger.warning(
                            "There is no RDP silo configed for the given vendor:%s "
                            "and retailer:%s. So no need to sync feedback." %
                            (self._vendor_key, self._retailer_key))
                        exit(StepStatus.SUCCESS.value)

                    # If there is 1 or more than 1 feedback related rdps configed, then loop them to sync feedback data,
                    # Normally, there should be only 1. or no feedback rdp configed.
                    for common_rdp in common_rdps:
                        _rdp_id = common_rdp
                        self.logger.info(
                            "Started to sync data from rdp: %s for given vendor:%s and retailer:%s. "
                            % (_rdp_id, self._vendor_key, self._retailer_key))

                        # if RDP is not RDP_AUX, Won't exit but log a warning.
                        if _rdp_id != self._default_rdp:
                            self.logger.warning(
                                "Please be noted: The RDP is:%s. It is not RDP_AUX."
                                % _rdp_id)

                        # WM silos are also following above logic.
                        # all hosted silo are ultilizing RDP_AUX to transfer feedback data. not sure about Walmart.
                        # if str(_silo_type).upper() in ["WMSSC", "WMCAT", "SASSC", "WMINTL"]:
                        #     _rdp_id = self._default_rdp  # WM rdp is RDP_AUX as well?

                        rdp_config = Config(meta=self.meta,
                                            hub_id=_rdp_id).json_data
                        if not rdp_config['configs']:
                            self.logger.warning(
                                "There is no configs for RDP: %s. Please check following URL:"
                                "%s/properties/%s/%s" %
                                (_rdp_id, self.meta["api_config_str"], _rdp_id,
                                 _rdp_id))
                            exit(StepStatus.SUCCESS.value)

                        _rdp_schema = rdp_config['configs'].get(
                            'dw.schema.name')
                        self.logger.info(
                            "Started to init feedback data from rdp: %s for "
                            "given vendor:%s and retailer:%s " %
                            (_rdp_schema, self._vendor_key,
                             self._retailer_key))

                        # self.dct_sync_data["source_config"] = rdp_config['configs']
                        self._populate_source_config(rdp_config['configs'])

                        self.initialize()

                        _flag = self.load_data()
                        if _flag:
                            # if no data, then no need to process.
                            self.process_data()

                        self.logger.info("Data sync done for RDP: %s" %
                                         _rdp_id)

                except Exception as e:
                    self.logger.warning(e)
                    self.logger.warning(
                        "Please check if any warning or error messages when doing the initialization!"
                    )

        finally:
            if self.osa_app_conn:
                self.osa_app_conn.close_connection()
            if self.osa_dw_conn:
                self.osa_dw_conn.close_connection()

    def initialize(self):
        """
        Create local temp tables , and DDLs required to process this fact type
        :return:
        """
        self.logger.info("Initialize...")

        # recreate this table for every RDP. no need to truncate any longer.
        # sql = "TRUNCATE TABLE {cmnSchema}.{targetTable}"\
        #     .format(cmnSchema=self.dct_sync_data["target_dw_schema"], targetTable=self.staging_import_table_osa)
        # self.logger.info(sql)
        # self.osa_dw.execute(sql)

        sql = """
        --Store data from RDP table.
        DROP TABLE IF EXISTS {cmnSchema}.{importTable};
        CREATE TABLE {cmnSchema}.{importTable}
        (
            EVENT_KEY int NOT NULL,
            RETAILER_KEY int,
            VENDOR_KEY int,
            STORE_VISIT_DATE date,
            PERIOD_KEY int NOT NULL,
            TYPE varchar(1),
            TYPE_DATE varchar(10),
            ALERT_ID int,
            ALERT_TYPE varchar(64),
            MERCHANDISER_STORE_NUMBER varchar(512),
            STORE_ID varchar(512),
            MERCHANDISER_UPC varchar(512),
            INNER_UPC varchar(512),
            MERCHANDISER varchar(100),
            STORE_REP varchar(1000),
            SOURCE varchar(1000),
            BEGIN_STATUS varchar(255),
            ACTION varchar(255),
            FEEDBACK_DESCRIPTION varchar(255),
            FEEDBACK_HOTLINEREPORTDATE date,
            FEEDBACK_ISININVENTORY varchar(5),
            ZIP_CODE varchar(64),
            ARTS_CHAIN_NAME varchar(255),
            UPC_STATUS varchar(255),
            MSI varchar(255)
        )
        UNSEGMENTED ALL NODES;
        """.format(cmnSchema=self.dct_sync_data["target_dw_schema"],
                   importTable=self.dct_sync_data["target_dw_table"])
        self.logger.info(sql)
        self.osa_dw_conn.execute(sql)

    def load_data(self):
        """
        # Load data from RDP table ds_fact_feedback to local temp tables.
        There is an column event_key which is incremental for all customers in ds_fact_feedback table.
        we can save the snapshot of this column to variable table, and do the incremental every time based on this column.
        There are few cases here:
        1, Routinely, There will be a scheduled job to sync the whole feedback data for valid customers from related RDP silo.
           And save the snapshot of the event_key from previous loading for next incremental loading.
        2, if on-boarding a new vendor & retailer customer. Getting rdp_event_key from variable for related RDP silo. (rdp_event_key is from previous loading)
           and then sync feedback data from related RDP silo only for this given customer when event_key < rdp_event_key.
           Then case1 will take care the rest of feedback data.
        :return:
        """
        rdp_schema = self.dct_sync_data["source_config"].get('dw.schema.name')

        # rdp_aux.ds_fact_feedback
        source_table = "{rdpSchema}.{rdptableName}"\
            .format(rdpSchema=rdp_schema,
                    rdptableName=self.source_table_rdp)
        # common.stage_fact_feedback_rdp
        target_table = "{dwSchema}.{importTable}"\
            .format(dwSchema=self.dct_sync_data["target_dw_schema"],
                    importTable=self.dct_sync_data["target_dw_table"])

        self.logger.info(
            "Ready to load Data from {srouceTable} to {targetTable}".format(
                targetTable=target_table, srouceTable=source_table))

        insert_columns = " EVENT_KEY, RETAILER_KEY, VENDOR_KEY, STORE_VISIT_DATE, PERIOD_KEY, TYPE, TYPE_DATE," \
                         " ALERT_ID, ALERT_TYPE, MERCHANDISER_STORE_NUMBER, STORE_ID, MERCHANDISER_UPC, INNER_UPC," \
                         " MERCHANDISER, STORE_REP, SOURCE, BEGIN_STATUS, ACTION, FEEDBACK_DESCRIPTION," \
                         " FEEDBACK_HOTLINEREPORTDATE, FEEDBACK_ISININVENTORY, ZIP_CODE, ARTS_CHAIN_NAME, UPC_STATUS, MSI "

        try:
            self.logger.info(
                "Getting the previous Event_key from last run for incremental load."
            )
            _event_sql = "SELECT VARIABLE_VALUE FROM variables " \
                         "WHERE VARIABLE_NAME = '{rdpName}'".format(rdpName=rdp_schema)
            self.logger.info(_event_sql)
            event_key = self.osa_app_conn.query_scalar(_event_sql)

            self.logger.info(
                "Getting customer info which only applied OSA services as filter"
            )
            sql = "SELECT DISTINCT retailer_key, vendor_key FROM AP_ALERT_CYCLE_MAPPING " \
                  "UNION " \
                  "SELECT DISTINCT retailer_key, vendor_key FROM AP_ALERT_CYCLE_RC_MAPPING"
            self.logger.info(sql)
            results = self.osa_app_conn.query(sql)
            if not results:
                raise Warning(
                    "There is no data in table AP_ALERT_CYCLE_MAPPING. Please check sql: %s"
                    % sql)
                # exit(StepStatus.SUCCESS.value)

            user_filters = [
                'SELECT ' + str(result.retailer_key) + ',' +
                str(result.vendor_key) for result in results
            ]
            user_filter_str = ' UNION ALL '.join(user_filters)
            self.logger.info("Customer filters are: %s" % user_filter_str)

            # incremental filter from RDP table
            where_sql = "EVENT_KEY > {eventKey} AND SOURCE != 'ARIA' " \
                        "AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter})"\
                .format(eventKey=event_key,
                        userFilter=user_filter_str)

            # TODO2DONE: how to set default value? use -1
            # copy all if there is no value in variables table.
            if not event_key:
                self.logger.warning(
                    "There is no value set in variables table for RDP:{name}, "
                    "So copy the whole table".format(name=rdp_schema))
                where_sql = " SOURCE != 'ARIA' AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter})"\
                    .format(eventKey=event_key,
                            userFilter=user_filter_str)
                event_key = -1  # check if this is the first run.

            if self._init_flag is True:
                if event_key == -1:  # event_key is None
                    self.logger.warning(
                        "There is no event_key logged in variables table for the given RDP: %s."
                        "So Let's wait for the routine job to sync the whole rdp feedback data"
                        % rdp_schema)
                    return False

                self.logger.info("Generating init feedback filters")
                where_sql = "EVENT_KEY <= {eventKey} AND SOURCE != 'ARIA' " \
                            "AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter}) " \
                            "AND RETAILER_KEY={retailerKey} AND VENDOR_KEY={vendorKey} "\
                    .format(eventKey=event_key,
                            userFilter=user_filter_str,
                            retailerKey=self._retailer_key,
                            vendorKey=self._vendor_key)

            self.logger.debug("The filters are: %s" % where_sql)

            # form the fetch query from RDP and then Insert into the target table
            fetch_query = """
            SELECT /*+ label(GX_IRIS_SYNCFEEDBACK)*/ {insertQuery} FROM {sourceTable}
            WHERE {whereSql}
            """.format(insertQuery=insert_columns,
                       sourceTable=source_table,
                       whereSql=where_sql)
            self.logger.info("fetch_query is : %s" % fetch_query)

            self.logger.info(
                ">>Loading {factType} Data from event_key:{eventKey} start at: {timestamp}<<"
                .format(factType=self._fact_type,
                        eventKey=event_key,
                        timestamp=datetime.datetime.now()))

            self.dct_sync_data["target_column"] = insert_columns
            self.dct_sync_data["source_sql"] = fetch_query

            row_count = self.transfer.transfer_data(
                dct_sync_data=self.dct_sync_data)

            self.logger.info(
                ">>Done loaded {cnt} rows from event_key:{eventKey} completed at: {timestamp}<<"
                .format(cnt=row_count,
                        factType=self._fact_type,
                        eventKey=event_key,
                        timestamp=datetime.datetime.now()))

            # if no data transfered, then update variables with previous value.
            sql = "SELECT /*+ label(GX_IRIS_SYNCFEEDBACK)*/ nvl(max(event_key), {oldEventKey}) " \
                  "FROM {schemaName}.{importTable} "\
                .format(schemaName=self.dct_sync_data["target_dw_schema"],
                        importTable=self.dct_sync_data["target_dw_table"],
                        oldEventKey=event_key)
            self.logger.info(sql)
            self.max_event_key = self.osa_dw_conn.query_scalar(sql)

            # max_event_key = -1   # testing purpose
            if self.max_event_key == -1:
                self.logger.warning(
                    "There is no feedback data in RDP table: {0}".format(
                        source_table))
                return False

            return True

        except Exception as e:
            self.logger.warning(e)
            raise

        finally:
            pass

    def process_data(self):
        """
        after load_data part completes. sync data from temp table to related schemas.
        :return:
        """
        try:
            self.logger.info("Processing feedback start...")

            # loop retailer to insert feedback data
            sql = "SELECT DISTINCT retailer_key " \
                  "FROM {cmnSchema}.{importTable}"\
                .format(cmnSchema=self.dct_sync_data["target_dw_schema"],
                        importTable=self.dct_sync_data["target_dw_table"])
            self.logger.info(sql)
            retailers = self.osa_dw_conn.query(sql)

            if retailers.rowcount == 0:
                self.logger.warning(
                    "There is no data in table {cmnSchema}.{importTable}."
                    "It could be no incremental data. Please check fetch_query against RDP database"
                    .format(cmnSchema=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"]))

            for retailer in retailers:
                retailer_key = retailer.retailer_key
                osa_schema = self.capacity.get_retailer_schema_name(
                    retailer_key)

                # Finally, run the sql
                msg = "Processing fdbk data within retailer {retailerKey}:{retailerName}"\
                    .format(retailerKey=retailer_key, retailerName=osa_schema)
                self.logger.info(msg)

                # Normally, There should NOT be duplicated alert_id transfered by incremental.
                # But should consider this case here. Delete existing alertid from target table
                # TODO: delete could have performance issue. consider using switch partition
                delete_sql = "DELETE FROM {osaSchema}.{targetTable} " \
                             "WHERE alert_id IN (SELECT alert_id FROM {cmnSchema}.{importTable} )"\
                    .format(targetTable=self.target_table_osa,
                            osaSchema=osa_schema,
                            cmnSchema=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"])
                self.logger.info(delete_sql)
                self.osa_dw_conn.execute(delete_sql)

                # inserting feedback data into final table fact_feedback from processed table.
                sql = """
                INSERT INTO {osaSchema}.{targetTable}
                (EVENT_KEY, RETAILER_KEY, VENDOR_KEY, STORE_VISITED_PERIOD_KEY, PERIOD_KEY,
                 ALERT_ID, STORE_KEY, MERCHANDISER_STORE_NUMBER,
                 STORE_ID, ITEM_KEY, MERCHANDISER_UPC, UPC, MERCHANDISER, STORE_REP, SOURCE,
                 BEGIN_STATUS, ACTION, FEEDBACK_DESCRIPTION,
                 ON_HAND_PHYSICAL_COUNT, ON_HAND_CAO_COUNT
                )
                SELECT stage.EVENT_KEY, stage.RETAILER_KEY, stage.VENDOR_KEY,
                       TO_CHAR(stage.STORE_VISIT_DATE, 'YYYYMMDD')::int AS STORE_VISITED_PERIOD_KEY,
                       stage.PERIOD_KEY,
                       stage.ALERT_ID,
                       store.STORE_KEY AS STORE_KEY,
                       stage.MERCHANDISER_STORE_NUMBER,
                       COALESCE(store.STORE_ID , alert.STOREID, stage.STORE_ID) AS STORE_ID,
                       item.ITEM_KEY AS ITEM_KEY,
                       stage.MERCHANDISER_UPC,
                       COALESCE(item.UPC, alert.UPC, stage.INNER_UPC, stage.MERCHANDISER_UPC) AS UPC,
                       stage.MERCHANDISER, stage.STORE_REP, stage.SOURCE,
                       stage.BEGIN_STATUS, stage.ACTION, stage.FEEDBACK_DESCRIPTION,
                       0 AS ON_HAND_PHYSICAL_COUNT,
                       0 AS ON_HAND_CAO_COUNT
                FROM {cmnSchema}.{importTable} stage
                LEFT JOIN {osaSchema}.FACT_PROCESSED_ALERT alert
                ON stage.alert_id = alert.alert_id AND alert.issuanceid = 0
                AND alert.retailer_key = {retailerKey} AND stage.vendor_key = alert.vendor_key
                INNER JOIN {cmnSchema}.DIM_PRODUCT item
                ON item.retailer_key = {retailerKey} AND alert.vendor_key = item.vendor_key
                AND item.item_key = alert.item_key
                INNER JOIN {cmnSchema}.DIM_STORE store
                ON store.retailer_key = {retailerKey} AND alert.vendor_key = store.vendor_key
                AND store.store_key = alert.store_key
                WHERE stage.retailer_key = {retailerKey}
                """.format(osaSchema=osa_schema,
                           targetTable=self.target_table_osa,
                           cmnSchema=self.dct_sync_data["target_dw_schema"],
                           importTable=self.dct_sync_data["target_dw_table"],
                           retailerKey=retailer_key)
                self.logger.info(
                    "SQL used to load data to related schema. %s" % sql)
                self.osa_dw_conn.execute(sql)

            self.logger.info("Processing feedback ended...")

        except Exception as e:
            self.logger.warning("Process data for RDP {0} failed: {1}".format(
                self._rdp_id, e))
            raise

        finally:
            if self._debug.upper() == 'N':
                _drop_sql = "DROP TABLE IF EXISTS {schemaName}.{importTable};" \
                    .format(schemaName=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"])
                self.logger.info(_drop_sql)
                self.osa_dw_conn.execute(_drop_sql)
コード例 #3
0
    _common_schema = "common"
    _dim_product = "DIM_PRODUCT"
    _dim_store = "DIM_STORE"
    _dim_calendar = "DIM_CALENDAR"
    _meta_intvn = "ALT_META_INTERVENTION"
    _dim_base_dir = meta.get("azure_storage_dim_root")
    _prefix_name = "IRIS_"

    _dw_conn = DWOperation(meta=meta)
    # Getting retailer list from dim table
    retailer_list_in_table = []
    _retailer_sql = "SELECT DISTINCT retailer_key FROM {schema}.{table} where retailer_key in (6, 5240)"\
        .format(schema=_common_schema,
                table=_dim_store)
    res = _dw_conn.query(_retailer_sql)
    for ele in res:
        retailer_list_in_table.append(ele.retailer_key)
    print(retailer_list_in_table)

    capacity = Capacity(meta=meta)
    azure_uploader = UploadToBlobStorage(meta=meta)
    # sql_for_item = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_product)
    # sql_for_store = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_store)
    # sql_for_cal = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_calendar)
    # sql_for_alert = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_meta_intvn)

    # retailer_list = [6, 5240]
    retailer_list = retailer_list_in_table

    for retailer_key in retailer_list:
コード例 #4
0
class Scd(object):
    def __init__(self,
                 sql,
                 actioner,
                 log_detail=True,
                 batch_size=100,
                 meta={},
                 db_type=None,
                 table_schema=None,
                 logger=None):
        self.meta = meta
        self.actioner = actioner
        self.log_detail = log_detail
        self.batch_size = batch_size
        self.logger = logger if logger else Logger(log_level="info",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   module_name="scd")
        self.sql = sql
        self.raw_sql_splitted = self.sql.split(' ')
        self.raw_sql_splitted_trimed = [
            str for str in self.raw_sql_splitted if str != ''
        ]
        self.db_type = db_type
        self.table_schema = table_schema
        self.logger.info('[SQL] %s' % self.sql)
        self.logger.info('[SQL splitted] %s' % self.raw_sql_splitted)
        self.mssql = MSOperation(meta=self.meta)
        self.vtc = None
        self.saved_records_count = 0

    def set_insert_delete_update(self):
        cmd = self.raw_sql_splitted_trimed[0].upper()
        if cmd in ('DELETE', 'UPDATE', 'INSERT'):
            self.cmd = cmd
        else:
            raise RuntimeError('%s is not supported.' % cmd)

    def locate_keyword(self, keyword, sql_splitted=[]):
        # find the location of the keyword firstly appears in the sql
        # e.g. locate WHERE
        # UPDATE TEST SET COL = ' WHERE ' WHERE ID=1, the first ' WHERE ' is not a keyword, so return the index of the 2nd WHERE
        keyword = keyword.upper()
        base_seq_splitted = sql_splitted if sql_splitted else self.raw_sql_splitted  # if sql_splitted is specified, locate keyword in sql_splitted, otherwise locate keyword in raw_sql_splitted
        index_keyword = [
            k for k, v in enumerate(base_seq_splitted) if keyword in v.upper()
        ]
        for ind in index_keyword:
            count_of_single_quote_ahead_of_where = sum([
                base_seq_splitted[i].count("'") for i in range(0, ind)
                if "'" in base_seq_splitted[i]
            ])
            i = base_seq_splitted[ind].upper().index(keyword)
            count_of_single_quote_ahead_of_where += base_seq_splitted[ind][
                0:i].count("'")
            if count_of_single_quote_ahead_of_where % 2 == 0:  # the KEYWORD we want to capture is the one has n*2 "'" ahead of it
                return ind
        return -1  # KEYWORD not found

    def locate_keyword_combination(
            self,
            keywords):  #--mainly for parsing merge, may need to be updated
        # find the location of the first keyword, and to check if 2nd, 3rd keywords are just following the 1st keyword in the SQL
        # if match all the keywords, return the index of the first keyword
        # else return -1
        keywords_list = [
            w.upper().strip() for w in keywords.split(' ') if w.strip() != ''
        ]
        sql_splitted = [*self.raw_sql_splitted]
        base_ind = 0
        self.logger.info('Locating %s' % keywords_list)
        while True:
            self.logger.info('Locating 1st keyword "%s"' % keywords_list[0])
            ind = self.locate_keyword(keywords_list[0], sql_splitted)
            if ind == -1:
                self.logger.warning('Keyword not found.')
                return -1
            following = [
                v.upper().strip() for v in sql_splitted[ind + 1:]
                if v.strip() != ''
            ]
            self.logger.info('Getting following %s', following)
            if following[0:len(keywords_list) - 1] == keywords_list[1:]:
                self.logger.info('Keywords matched!')
                return base_ind + ind
            else:
                self.logger.warning('Keywords not matched.')
                sql_splitted = sql_splitted[ind + 1:]
                base_ind += ind

    def get_merge_where(
        self
    ):  #--TODO, not done yet, the sql gramma "update from" for vertica and sqlserver are different, need more time to work on this
        index_using = self.locate_keyword('USING')
        index_on = self.locate_keyword('ON')
        index_when_match = self.locate_keyword_combination('WHEN MATCHED THEN')
        index_when_not_match = self.locate_keyword_combination(
            'WHEN NOT MATCHED THEN')

    def get_where_condition(self):
        index_where = self.locate_keyword('WHERE')
        return '' if index_where == -1 else ' '.join(
            self.raw_sql_splitted[index_where:])

    def get_table_name(self):
        self.set_insert_delete_update()
        return self.raw_sql_splitted_trimed[1].upper(
        ) if self.cmd == 'UPDATE' else self.raw_sql_splitted_trimed[2].upper()

    def get_raw_table_name(self, table_name):
        return table_name.split('.')[-1]

    def get_table_def(self, raw_table_name):
        sql = "SELECT ID, TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA FROM META_SCD_TABLE_DEF WHERE TABLE_NAME='%s'" % raw_table_name
        if self.db_type:
            sql += " AND DB_TYPE='%s'" % self.db_type
        if self.table_schema:
            sql += " AND TABLE_SCHEMA='%s'" % self.table_schema
        rs = self.mssql.query(sql)
        if len(rs) == 0:
            raise ValueError(
                '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] The table is not found.'
                % (raw_table_name, db_type, table_schema))
        elif len(rs) > 1:
            self.logger.info('|ID|TABLE_NAME|DB_TYPE|TABLE_SCHEMA|')
            for r in rs:
                self.logger.info(
                    '|%s|%s|%s|%s|' %
                    (r.ID, r.TABLE_NAME, r.DB_TYPE, r.TABLE_SCHEMA))
            raise ValueError(
                '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] More than one records returned for the table.'
                % (raw_table_name, db_type, table_schema))
        else:
            if rs[0].DB_TYPE == 'MSSQL':
                self.target_conn = self.mssql
            else:
                self.vtc = DWOperation(meta=self.meta)
                self.target_conn = self.vtc
            return rs[0]

    def get_mssql_table_ddl_def(self, db_name, table_name, table_schema):
        ddl_sql = '''SELECT COLUMN_NAME, DATA_TYPE
            FROM %s.INFORMATION_SCHEMA.COLUMNS
            WHERE TABLE_NAME = N'%s'
            AND TABLE_SCHEMA = N'%s'
            ORDER BY ORDINAL_POSITION
        ''' % (db_name, table_name, table_schema)
        rs = self.mssql.query(ddl_sql)
        #return dict([('"%s"'%r.COLUMN_NAME if ' ' in r.COLUMN_NAME else r.COLUMN_NAME, r.DATA_TYPE) for r in rs])
        return dict([(r.COLUMN_NAME, r.DATA_TYPE) for r in rs])

    def get_vertica_table_ddl_def(self, table_name, table_schema):
        ddl_sql = '''SELECT COLUMN_NAME, DATA_TYPE
            FROM COLUMNS
            WHERE TABLE_NAME = '%s'
            AND TABLE_SCHEMA = '%s'
            ORDER BY ORDINAL_POSITION
        ''' % (table_name, table_schema)
        rs = self.vtc.query(ddl_sql)
        #return dict([('"%s"'%r.COLUMN_NAME if ' ' in r.COLUMN_NAME else r.COLUMN_NAME, r.DATA_TYPE) for r in rs])
        return dict([(r.COLUMN_NAME, r.DATA_TYPE) for r in rs])

    def save_to_db(self, table_def, recs):
        pk = json.loads(table_def.PK)
        batch_sqls = []
        columns = 'TABLE_DEF_ID, PK_VALUE, ACTION, RECORD, UPDATE_TIME, ACTIONER' if self.log_detail else 'TABLE_DEF_ID, PK_VALUE, ACTION, UPDATE_TIME, ACTIONER'
        counter = 0
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        for rec in recs:
            pk_dict = collections.OrderedDict()
            for k in pk:
                pk_dict[k] = rec[k]
            pk_value = json.dumps(pk_dict).replace("'", "''")
            if self.log_detail:
                batch_sqls.append(
                    '''SELECT %s, '%s', '%s', '%s', '%s', '%s' ''' %
                    (table_def.ID, pk_value, self.cmd, json.dumps(rec).replace(
                        "'", "''"), now, self.actioner))
            else:
                batch_sqls.append(
                    '''SELECT %s, '%s', '%s', '%s', '%s' ''' %
                    (table_def.ID, pk_value, self.cmd, now, self.actioner))
            counter += 1
            if len(batch_sqls) % self.batch_size == 0:
                self.mssql.execute(
                    '''INSERT INTO META_SCD_CHANGE_LOG(%s) %s''' %
                    (columns, ' UNION ALL '.join(batch_sqls)))
                batch_sqls = []
                self.logger.info('%s has been logged.' % counter)
        if len(batch_sqls) > 0:
            self.mssql.execute('''INSERT INTO META_SCD_CHANGE_LOG(%s) %s''' %
                               (columns, ' UNION ALL '.join(batch_sqls)))
            self.logger.info('%s has been logged.' % counter)

    @Timer()
    def save_data_to_be_changed(self, table_def, where_condition):
        if table_def.DB_TYPE == 'MSSQL':
            table_schema = 'dbo' if table_def.TABLE_SCHEMA == '(COMMON)' else table_def.TABLE_SCHEMA
            table_ddl_def = self.get_mssql_table_ddl_def(
                self.meta['db_conn_mssql_dbname'], table_def.TABLE_NAME,
                table_schema)
        else:  # vertica
            table_schema = self.meta[
                'db_conn_vertica_common_schema'] if table_def.TABLE_SCHEMA == '(COMMON)' else table_def.TABLE_SCHEMA
            table_ddl_def = self.get_vertica_table_ddl_def(
                table_def.TABLE_NAME, table_schema)
        if not self.log_detail:  # only dump primary key columns
            pk = json.loads(table_def.PK)
            table_ddl_def = dict([(c, table_ddl_def[c]) for c in table_ddl_def
                                  if c in pk])
        self.logger.info(table_ddl_def)
        query_columns = ','.join([
            '"%s"' % c if ' ' in c else c for c in list(table_ddl_def.keys())
        ])
        pull_data_sql = 'SELECT %s FROM %s.%s %s' % (
            query_columns, table_schema, table_def.TABLE_NAME, where_condition)
        self.logger.info(pull_data_sql)
        rs = self.target_conn.query(pull_data_sql)
        recs = []
        for r in rs:
            rec = {}
            for c in list(table_ddl_def.keys()):
                #rec[c] = str(eval("r."+c)) if 'TIME' not in table_ddl_def[c].upper() and 'DATE' not in table_ddl_def[c].upper() else eval("r."+c).strftime("%Y-%m-%d %H:%M:%S")
                rec[c] = str(eval('r.__getattribute__("%s")' % c))
            recs.append(rec.copy())
        self.saved_records_count = len(recs)
        self.logger.info('%s records will be saved.' %
                         self.saved_records_count)
        self.save_to_db(table_def, recs)

    @Timer()
    def execute_sql(self):
        self.target_conn.execute(self.sql)

    def main(self):
        try:
            try:
                table_name = self.get_table_name()
                raw_table_name = self.get_raw_table_name(table_name)
                table_def = self.get_table_def(raw_table_name)
                saved = False
                if self.cmd != 'INSERT':
                    where_condition = self.get_where_condition()
                    self.save_data_to_be_changed(table_def, where_condition)
                    saved = True
            except Exception as e:
                self.logger.warning(traceback.format_exc())
                self.logger.warning(
                    'Failed to save the to-be-changed data, start to executing sql.'
                )
            self.execute_sql()
            return '(%(db_type)s)%(table_schema)s.%(table_name)s - impacted %(row_count)s rows, saved %(status)s' % {
                'db_type':
                table_def.DB_TYPE,
                'table_schema':
                table_def.TABLE_SCHEMA,
                'table_name':
                table_def.TABLE_NAME,
                'row_count':
                self.saved_records_count,
                'status':
                'N/A' if self.cmd == 'INSERT' else
                'successfully' if saved else 'unsuccessfully'
            }
        finally:
            if self.mssql:
                self.mssql.close_connection()
            if self.vtc:
                self.vtc.close_connection()