Exemple #1
0
 def __init__(self,
              sql,
              actioner,
              log_detail=True,
              batch_size=100,
              meta={},
              db_type=None,
              table_schema=None,
              logger=None):
     self.meta = meta
     self.actioner = actioner
     self.log_detail = log_detail
     self.batch_size = batch_size
     self.logger = logger if logger else Logger(log_level="info",
                                                vendor_key=-1,
                                                retailer_key=-1,
                                                module_name="scd")
     self.sql = sql
     self.raw_sql_splitted = self.sql.split(' ')
     self.raw_sql_splitted_trimed = [
         str for str in self.raw_sql_splitted if str != ''
     ]
     self.db_type = db_type
     self.table_schema = table_schema
     self.logger.info('[SQL] %s' % self.sql)
     self.logger.info('[SQL splitted] %s' % self.raw_sql_splitted)
     self.mssql = MSOperation(meta=self.meta)
     self.vtc = None
     self.saved_records_count = 0
Exemple #2
0
    def __init__(self, meta, params, logger=None):
        self.meta = meta
        self.params = params
        self.file_ext = self.params.get("fileExt") if self.params.get(
            "fileExt") else "txt"
        self.zip_flag = True if str(self.params.get(
            "zipFlag", "N")).upper() in ["Y", "YES", "T", "TRUE"] else False

        self._debug = "Y" if str(self.params.get(
            'debug', 'N')).upper() in ["Y", "YES", "T", "TRUE"] else 'N'
        __log_level = 'DEBUG' if self._debug == "Y" else 'INFO'
        self.logger = logger if logger else Logger(log_level="info",
                                                   target="console",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   sql_conn=None)
        self.logger.set_level(log_level=__log_level)
        self.logger.set_keys(log_id="{}".format(self.params["jobId"]))

        self.provision_url = self.meta[
            "api_provision_str"] + "/filesDelivery/layout/attributes?cycle_key="
        self.capacity = Capacity(meta=self.meta)
        self.rsi_username = self.meta.get("fileshare_username")
        self.rsi_folder = self.meta.get("fileshare_folder")
        self.cmn_schema = self.meta.get("db_conn_vertica_common_schema",
                                        "common")

        self.app_conn = MSOperation(meta=self.meta)
        self.vertica_conn = DWOperation(meta=self.meta)

        self.dumper_context = self.get_context()
        self._layout_payload = {}
 def __init__(self, context):
     self.context = context
     self.meta = self.context["meta"]
     self._logger = self.context["logger"]
     self._db = MSOperation(meta=self.meta)
     self._dw = DWOperation(meta=self.meta)
     self.bundle_base_url = self.meta["api_osa_bundle_str"]
     self.alert_base_url = self.meta["api_alerts_str"]
     self._common_schema = self.meta["db_conn_vertica_common_schema"]
     self._dim_calendar = "DIM_CALENDAR"
     self._dim_product = "DIM_PRODUCT"
     self._dim_store = "DIM_STORE"
     self._meta_intvn = "ALT_META_INTERVENTION"
     self._dim_base_dir = self.meta["azure_storage_dim_root"]
     self._azure_storage_account_name = self.meta[
         "azure_storage_account_name"]
     self._azure_storage_blob_container = self.meta[
         "azure_storage_blob_container"]
     self._prefix_name = "IRIS_"
     self.azure_uploader = UploadToBlobStorage(meta=self.meta,
                                               logger=self._logger)
     self.body = {
         "job_id": self.context["jobId"],
         "step_id": self.context["stepId"],
         "status": StepStatus.RUNNING.name
     }
     self.capacity = Capacity(meta=self.meta)
     self.cycle_key = self._get_cycle_key()
Exemple #4
0
 def __init__(self, context):
     self.context = {**context, **context["meta"]}.copy()
     self._hub_ids = context["hub_ids"]
     self._logger = context["logger"]
     self._db = MSOperation(meta=context["meta"])
     self._dw = DWOperation(meta=context["meta"])
     self._common_schema = context["meta"]["db_conn_vertica_common_schema"]
     # self._dw_conn_vertica = self.context["dw_conn_vertica"]
     self._all_dim = self.context["all_dim"]
     self._dim_calendar = "DIM_CALENDAR"
     self._dim_product = "DIM_PRODUCT"
     self._dim_store = "DIM_STORE"
     self._product_cis_src_tbl = "PS_ITEM_DIM_EXT"
     self._store_cis_src_tbl = "PS_STORE_DIM_EXT"
     self._dct_vendor_retailer_hub = {}
     self._dct_vendor_hub = {}
     self._dct_retailer_hub = {}
     self._all_vendor_retailer_hub = self._get_avaliable_vr()
Exemple #5
0
 def __init__(self, meta, params, logger=None, filename="afm_config.json"):
     """
     :param meta:      common config
     :param params:    used by calling REST API
     :param filename:  only used for local testing
     """
     self.meta = meta
     self.params = params
     self.filename = filename
     self._cap = Capacity(meta=self.meta)
     self.app_conn = MSOperation(meta=self.meta)
     self._logger = logger if logger else Logger(log_level="debug",
                                                 target="console",
                                                 vendor_key=-1,
                                                 retailer_key=-1,
                                                 sql_conn=None)
     self.scd_url = "{0}/scd/process".format(
         self.meta["api_osa_bundle_str"])
Exemple #6
0
    def test_gen_vtc_test_data():
        mssql = MSOperation(meta=meta)
        vtc = DWOperation(meta=meta)
        try:
            vtc.execute('drop table common.test_columns')
        except:
            print('table doesn\'t exists')
        vtc.execute(
            'create table common.test_columns as select * from columns')

        rs = mssql.query(
            'select id from META_SCD_TABLE_DEF where table_name=\'test_columns\''
        )
        if not rs:
            sql = '''
            INSERT INTO META_SCD_TABLE_DEF(TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA, CREATE_TIME)
            SELECT 'test_columns', '["table_schema", "table_name", "column_name"]', 'VERTICA', '(COMMON)', GETDATE()
            '''
            mssql.execute(sql)
        '''
def insert_default_params():
    sql_handler = MSOperation(meta=meta)
    sql = "UPDATE META_DIM_DTL SET ATTR_TEXT = '%s' WHERE DEF_ID=(SELECT ID FROM META_DIM_DEF WHERE NAME='defaultParamCategory')" % json.dumps(
        param_category)
    try:
        sql_handler.execute(sql)
    finally:
        sql_handler.close_connection()
Exemple #8
0
    def test_gen_mssql_test_data():
        mssql = MSOperation(meta=meta)
        try:
            mssql.execute('drop table test_backupfille')
        except:
            print('table doesn\'t exists')
        mssql.execute('select * into test_backupfille from msdb.dbo.backupfile'
                      )  #--backup_set_id, file_number

        rs = mssql.query(
            'select id from META_SCD_TABLE_DEF where table_name=\'test_backupfille\''
        )
        if not rs:
            sql = '''
            INSERT INTO META_SCD_TABLE_DEF(TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA, CREATE_TIME)
            SELECT 'test_backupfille', '["backup_set_id", "file_number"]', 'MSSQL', '(COMMON)', GETDATE()
            '''
            mssql.execute(sql)
        '''
 def __init__(self, context):
     self.context = context
     self.meta = self.context["meta"]
     self._logger = self.context["logger"]
     self.body = {
         "job_id": self.context["jobId"],
         "step_id": self.context["stepId"],
         "status": StepStatus.RUNNING.name
     }
     self._db = MSOperation(meta=self.meta)
     self._dw = DWOperation(meta=self.meta)
     self._common_schema = self.meta["db_conn_vertica_common_schema"]
     self._check_spd_url = "{}/availablecycle/rc/action".format(
         self.meta["api_alerts_str"])
     self.pacific_job_def_name = "OSAPacificNotification"
Exemple #10
0
class DumpMain(object):
    def __init__(self, meta, params, logger=None):
        self.meta = meta
        self.params = params
        self.file_ext = self.params.get("fileExt") if self.params.get(
            "fileExt") else "txt"
        self.zip_flag = True if str(self.params.get(
            "zipFlag", "N")).upper() in ["Y", "YES", "T", "TRUE"] else False

        self._debug = "Y" if str(self.params.get(
            'debug', 'N')).upper() in ["Y", "YES", "T", "TRUE"] else 'N'
        __log_level = 'DEBUG' if self._debug == "Y" else 'INFO'
        self.logger = logger if logger else Logger(log_level="info",
                                                   target="console",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   sql_conn=None)
        self.logger.set_level(log_level=__log_level)
        self.logger.set_keys(log_id="{}".format(self.params["jobId"]))

        self.provision_url = self.meta[
            "api_provision_str"] + "/filesDelivery/layout/attributes?cycle_key="
        self.capacity = Capacity(meta=self.meta)
        self.rsi_username = self.meta.get("fileshare_username")
        self.rsi_folder = self.meta.get("fileshare_folder")
        self.cmn_schema = self.meta.get("db_conn_vertica_common_schema",
                                        "common")

        self.app_conn = MSOperation(meta=self.meta)
        self.vertica_conn = DWOperation(meta=self.meta)

        self.dumper_context = self.get_context()
        self._layout_payload = {}

    def main_process(self):
        """
        Getting AFM related parameters. 2 cases here.
        1, called by AFM directly:
           Alerts Delivery is coupled with AFM. So it is better to call by AFM. Then we can get parameters directly from AFM via self.params
        2, called via REST API.
           in case failure when called by AFM, we also support the REST API while we can manually trigger this. And getting params from REST API Body.
        :return:
        """
        try:
            # Checking the cancel status from previous step.
            _cancel_flag = self.params.get("cancel_flag", False)
            if _cancel_flag:
                return self.params.get("message")

            self.logger.debug("The context is: %s" % str(self.dumper_context))
            init = Initialize(self.dumper_context)
            afm_params = init.init_process()
            _cycle_key = afm_params.get("cycleKey")

            _retailer_name = self.capacity.get_retailer_schema_name(
                retailer_key=afm_params["retailerKey"])
            self.dumper_context["retailer_name"] = _retailer_name
            self.dumper_context["schema"] = _retailer_name
            self.dumper_context["cycle_key"] = _cycle_key

            self._layout_payload = self._get_layout()
            self._process(afm_params)

        except Warning as e:
            return str(e)

        except Exception as e:
            self.logger.warning(str(e))
            raise

    def get_context(self):
        context = {
            "meta": self.meta,
            "params": self.params,
            "dw_conn": self.vertica_conn,
            "app_conn": self.app_conn,
            "logger": self.logger
        }

        return context

    def _get_layout(self):
        # Getting layout fields for alert delivery.
        _cycle_key = self.dumper_context["cycle_key"]
        _delivery_layout_api = "{0}{1}".format(self.provision_url, _cycle_key)

        self.logger.info(
            "Alert delivery related columns(payload param) is not given, "
            "So just calling provision API: %s to retrieve them." %
            _delivery_layout_api)
        # _header = {"tokenid": "eyJhbGciOiJIUzI1NiJ9.eyJjb29raWVOYW1lIjoicnNpU3NvTmV4dEdlbiIsImNvb2tpZVZhbHVlIjoiQVFJQzV3TTJMWTRTZmN6ZmRuLVVQazA2b2NnRzVWaTlZRFc1cHZZQzF6b3djbXMuKkFBSlRTUUFDTURFQUFsTkxBQkkyTVRBNU1EUTBPRFUwTXpZeU1UWTBORFlBQWxNeEFBQS4qIiwic3RhdHVzIjoic3VjY2VzcyIsInVzZXJJZCI6ImJlbi53dUByc2ljb3JwLmxvY2FsIiwiaWF0IjoxNTQ0Njg5OTY3fQ.AzCgFFHXHo3J1M4fk-17T8fBLwReDQDb4p-DXUcBm_M"}
        _header = {}
        resp = requests.get(_delivery_layout_api, _header, verify=False)
        if resp.text == "invalid token":
            self.logger.error(
                "ERROR: invalid tokenid, Please update the tokenid manually. "
                "Then rerun this script again!!!")

        if resp.status_code != requests.codes.ok or str(
                resp.json().get("status")).lower() != "success":
            self.logger.warning("The response result is: %s" % resp.json())
            self.logger.error("Calling API failed. Refer to API: %s" %
                              _delivery_layout_api)

        # _payload = resp.json()["data"]["payload"]   # e.g. list({"key1":"value1"}, {}, {})
        _payload = resp.json()["data"]

        self.logger.debug("The layout payload is: %s" % _payload)
        return _payload

    def _gen_query(self, delivery_key):
        # getting the payload for given delivery key.
        _payload = self._layout_payload.get(str(delivery_key))["payload"]
        self.logger.debug("The payload is: %s" % str(_payload))

        # only getting data where field preselect is True.
        _payload = list(filter(lambda x: x["preselect"] is True, _payload))
        self.logger.debug("The filtered payload is: %s" % str(_payload))
        if not _payload:
            self.logger.warning(
                "There is no layout configed for this delivery.")
            return None, None

        # It is required to order by seq, so need to convert list(dict) to list(tuple) to sort the data.
        # Otherwise, the following code will not be working correctly.
        # _payload_list should be like: e.g. [(1, 'Alert', 'ALERT_ID', 'ALERT ID', None), (), ...]
        # Noted: Please don't change below fields order for generating _payload_list.
        _payload_list = [(ele["sequence"], ele["dimension_type"], ele["Field"],
                          ele["display_name"], ele["sort_by"])
                         for ele in _payload]
        self.logger.debug("Converted payload is: %s" % str(_payload_list))

        # Fields should be listed based on sequence. In case seq is None, then putting those columns to the last.
        # if many elements with None value, then they will be sorted randomly.
        _payload_list.sort(
            key=lambda x: x[0]
            if x[0] is not None else 999)  # Assuming less than 999 columns.
        self.logger.debug("Sorted payload is: %s" % str(_payload_list))

        display_fields_lst = [
        ]  # getting display name like: ["alert type", ...] for displaying on delivery file.
        columns_with_alias = [
        ]  # getting fields with format like: [alert."alert_type" as "alert type", ...]
        table_list = set(
        )  # getting required table list. ("Alert", "Store", ...)
        for columns in _payload_list:
            self.logger.debug(
                "Current display name is: \"%s\" for column: %s" %
                (columns[3], columns[2]))
            table_list.add(str(columns[1]).lower())
            if columns[3] is None:  # display name could be None
                display_fields_lst.append(columns[2])
                columns_with_alias.append(columns[1] + '."' + columns[2] +
                                          '" AS "' + columns[2] + '"')
            else:
                display_fields_lst.append(columns[3])
                columns_with_alias.append(columns[1] + '."' + columns[2] +
                                          '" AS "' + columns[3] + '"')

        # if there is no "Alert Date" enabled, then manually added this field to the beginning.
        if "Alert Date".lower() not in list(
                map(lambda x: str(x).lower(), display_fields_lst)):
            display_fields_lst.insert(0, "Alert Date")
            columns_with_alias.insert(0, 'ALERT.period_key AS "Alert Date"')

        _display_fields_str = ",".join([
            '"' + column + '"' for column in display_fields_lst
        ])  # combine them with comma(,)
        _required_columns_prefix_tmp = ",".join(
            [column for column in columns_with_alias])  # combine them
        # replace one mandatory fields alert_type to intervention_name. Since it is not the column name in fact table.
        _required_columns_prefix = re.sub(
            '(?i)' + re.escape('ALERT."ALERT_TYPE"'), 'type.intervention_name',
            _required_columns_prefix_tmp)

        # Getting columns which sort_by is True(which is enabled sorting). And sort it.
        sort_by_list = sorted(list(
            filter(lambda x: x[4] is True, _payload_list)),
                              key=lambda x: x[4])
        if sort_by_list:
            _sort_columns = ",".join([
                ele[1] + '."' + ele[2] + '" DESC NULLS LAST'
                for ele in sort_by_list
            ])
        else:
            _sort_columns = '1'
        self.logger.debug(_sort_columns)
        _sort_columns = re.sub('(?i)' + re.escape('ALERT."ALERT_TYPE"'),
                               'type.intervention_name', _sort_columns)

        _fdbk_required = True if "feedback" in table_list else False
        _item_required = True if "product" in table_list else False
        _store_required = True  # mandatory

        # Generating the initial query.
        _init_sql = """
        SELECT {columns}, ROW_NUMBER() OVER(ORDER BY store.STORE_ID, {sortedFields} ) AS rn
        FROM {schema}.FACT_PROCESSED_ALERT alert 
        INNER JOIN {cmnSchema}.alt_meta_intervention type 
        ON alert.InterventionKey = type.Intervention_Key """\
            .format(columns=_required_columns_prefix,
                    sortedFields=_sort_columns,
                    schema=self.dumper_context["schema"],
                    cmnSchema=self.cmn_schema)

        if _fdbk_required:
            _init_sql += """
            LEFT JOIN {schema}.FACT_FEEDBACK Feedback 
            ON alert.alert_id = Feedback.alert_id"""\
                .format(schema=self.dumper_context["schema"])

        if _item_required:
            _init_sql += """
            INNER JOIN {cmnSchema}.dim_product Product 
            ON alert.item_key = Product.item_key
            AND alert.vendor_key = Product.vendor_key 
            AND alert.retailer_key = Product.retailer_key"""\
                .format(cmnSchema=self.cmn_schema)

        if _store_required:
            _init_sql += """
            INNER JOIN {cmnSchema}.dim_store Store 
            ON alert.store_key = store.store_key
            AND alert.vendor_key = store.vendor_key 
            AND alert.retailer_key = store.retailer_key"""\
                .format(cmnSchema=self.cmn_schema)

        # always apply the period_key filter for given vendor/retailer
        _init_sql += """
        WHERE 1=1 AND alert.IssuanceId = 0
        AND (alert.vendor_key, alert.retailer_key, alert.period_key) IN 
            ( SELECT vendor_key, retailer_key, alert_day FROM TMP_RAW_ALERTS_INFO )
            """

        return _display_fields_str, _init_sql

    def _process(self, afm_params):
        try:
            _cycle_key = afm_params.get("cycleKey")

            # Reading configuration from meta table under IRIS MSSQL.
            # Getting all owners(includes both SVR & RETAILER rule) according to given cycle_key.
            # the delivery file will be dumped by owner.
            sql = """
            SELECT d.ID AS DELIVERY_KEY, d.CYCLE_KEY, d.RETAILER_KEY, d.DELIVERY_NAME, d.FILTERS, 
                   d.DELIMITER, d.OWNER, ep.SERVER, ep.EXTRACTION_FOLDER, ep.USERNAME, ep.PASSWORD,
                   ep.MAIL_SUBJECT, ep.MAIL_BODY, ep.MAIL_RECPSCC, ep.MAIL_RECPSTO, ep.DELIVERY_TYPE 
            FROM AP_META_DELIVERIES d 
            INNER JOIN AP_META_ENDPOINTS ep
            ON d.ENDPOINT_ID = ep.ID 
            WHERE d.cycle_key = {0}
            AND d.ENABLED = 'T' AND ep.ENABLED = 'T' 
            """.format(_cycle_key)
            self.logger.info(sql)
            meta_rows = self.app_conn.query(sql)
            self.logger.debug("The meta data is: %s" % str(meta_rows))
            if not meta_rows:
                raise Warning(
                    "There is no endpoint or delivery configed. Please check meta table!"
                )

            # There could be multi owners for the given cycle but with different filters.
            # This is required by PM. And we need to generate separate files for every single row.
            for meta_data in meta_rows:
                # 1, Getting the initial source query
                _delivery_key = meta_data.DELIVERY_KEY
                required_columns, _init_src_query = self._gen_query(
                    delivery_key=_delivery_key)
                if required_columns is None and _init_src_query is None:
                    self.logger.warning(
                        "Seems no layout configed for delivery key: %s" %
                        _delivery_key)
                    continue
                self.logger.info("The initial source query is: %s" %
                                 _init_src_query)

                delivery_type = meta_data.DELIVERY_TYPE
                if str.lower(delivery_type) == 'customer':
                    meta_data = meta_data._replace(
                        EXTRACTION_FOLDER=self.rsi_folder,
                        USERNAME=self.rsi_username)

                if meta_data.USERNAME is None:
                    self.logger.warning(
                        "There is no username configed for delivery key: %s" %
                        _delivery_key)
                    continue

                _pmp_pwd = get_password(username=meta_data.USERNAME,
                                        meta=self.meta)
                if _pmp_pwd:
                    meta_data = meta_data._replace(PASSWORD=_pmp_pwd)

                self.logger.info("Start to dump & delivery for meta: %s" %
                                 str(meta_data))
                _src_query = _init_src_query

                # 2, checking if any filters applied. (e.g alert_type, category etc.)
                # User might wants to dump only given alert types of data. This should be configurable.
                # So far, we support 2 types of filters: alert_type & category
                # TODO: confirm the filter format with UI team. Currently filters are configed with json format.
                # e.g. {"alert_type": "d-void,phantom", "category":"cat1,cat2"}
                _filters_raw = meta_data.FILTERS
                if not _filters_raw or _filters_raw == "":
                    self.logger.info("No filters applied.")
                else:
                    self.logger.info("The filters are: %s" % _filters_raw)
                    _filters = json.loads(str(_filters_raw).lower().strip())
                    alert_type_str = _filters.get(
                        "alert_type", None)  # e.g. phantom,d-void,shelf oos
                    if alert_type_str is not None and str(
                            alert_type_str).strip() != '':
                        alert_type = ','.join(
                            "'" + str(ele).strip() + "'"
                            for ele in str(alert_type_str).split(','))
                        _src_query += " AND type.intervention_name IN ({type})".format(
                            type=alert_type)

                    category_str = _filters.get("category", None)
                    if category_str is not None and str(
                            category_str).strip() != '':
                        category_type = ','.join(
                            "'" + str(ele).strip() + "'"
                            for ele in str(category_str).split(','))
                        _src_query += " AND Product.OSM_CATEGORY IN ({cat_type})".format(
                            cat_type=category_type)

                # The owner format should be like: owner1 or owner1,owner2,...
                _owners = str(meta_data.OWNER)
                if not _owners:
                    # owner is the mandatory filter for every delivery.
                    raise ValueError(
                        "There is no owner configed in delivery meta table")

                _owner_in_str = ",".join("'" + ele.strip() + "'"
                                         for ele in _owners.split(","))
                _src_query += " AND alert.owner IN ({owner}) ".format(
                    owner=_owner_in_str)

                _final_src_query = """
                SELECT {columns} FROM ({query}) x ORDER BY rn
                """.format(columns=required_columns, query=_src_query)

                self.logger.info("The final source sql is: %s" %
                                 _final_src_query)

                # delivery file name should be: <delivery_name>_<YYYYMMDD>.<fileExt>. e.g. <delivery_name>_20180101.txt
                curr_folder = os.path.dirname(os.path.realpath(__file__))
                target_filename = meta_data.DELIVERY_NAME + "_" + datetime.datetime.now(
                ).strftime('%Y%m%d')

                # delivery file will be dumped to "<curr_dir>/data" folder temporarily.
                abs_target_filename = curr_folder + os.sep + "data" + os.sep + target_filename + '.' + self.file_ext
                zip_filename = curr_folder + os.sep + "data" + os.sep + target_filename + '.zip'

                # Getting data delimiter. e.g. ','
                delimiter = str(meta_data.DELIMITER).strip()
                if len(delimiter) != 1:
                    raise ValueError("delimiter should be 1 char")

                # start to dump data
                self.dumper = dd.DumpData(context=self.dumper_context)

                # dump data from source db
                self.logger.info("Dumping data into file: %s" %
                                 abs_target_filename)
                _dump_flag = self.dumper.dump_data(
                    src_sql=_final_src_query,
                    output_file=abs_target_filename,
                    delimiter=delimiter)
                self.logger.debug("The dump flag is: %s" % _dump_flag)

                # dump alerts succeeded.
                if _dump_flag is True:
                    self.logger.info("Dumping data is done!")

                    # check the zip flag
                    if self.zip_flag:
                        _flat_file_size = round(
                            os.path.getsize(abs_target_filename) / 1024 / 1024)
                        self.logger.debug("The flat file size is: %s" %
                                          _flat_file_size)
                        self.logger.info("zipping file: %s" %
                                         abs_target_filename)
                        with zipfile.ZipFile(zip_filename, 'w') as z:
                            z.write(abs_target_filename,
                                    os.path.basename(abs_target_filename))

                        abs_target_filename = zip_filename
                        self.logger.info("The zip file name is: %s" %
                                         abs_target_filename)

                    # start to send data file
                    self.logger.info(
                        "Starting uploading delivery file to dest folder!")
                    self.sender = sd.SendData(context=self.dumper_context)
                    self.sender.delivery_file(meta_data=meta_data,
                                              src_file=abs_target_filename)

                else:
                    self.logger.warning(
                        "There is no data returned or dump data failed. "
                        "Please refer to previous log to get the related source query."
                    )

            self.logger.info("Alert delivery process is done")

        except Warning as e:
            raise

        except Exception:
            raise

        finally:
            if self.vertica_conn:
                self.vertica_conn.close_connection()
            if self.app_conn:
                self.app_conn.close_connection()
Exemple #11
0
class SyncDimData:
    """
    Get product and store dimension data from hub. DIM_PRODUCT and DIM_STORE save pivoted data,
    tall table PRODUCT and STORE save attribute that config in AP_CONFIG_META_ATTRIBUTE
    """
    def __init__(self, context):
        self.context = {**context, **context["meta"]}.copy()
        self._hub_ids = context["hub_ids"]
        self._logger = context["logger"]
        self._db = MSOperation(meta=context["meta"])
        self._dw = DWOperation(meta=context["meta"])
        self._common_schema = context["meta"]["db_conn_vertica_common_schema"]
        # self._dw_conn_vertica = self.context["dw_conn_vertica"]
        self._all_dim = self.context["all_dim"]
        self._dim_calendar = "DIM_CALENDAR"
        self._dim_product = "DIM_PRODUCT"
        self._dim_store = "DIM_STORE"
        self._product_cis_src_tbl = "PS_ITEM_DIM_EXT"
        self._store_cis_src_tbl = "PS_STORE_DIM_EXT"
        self._dct_vendor_retailer_hub = {}
        self._dct_vendor_hub = {}
        self._dct_retailer_hub = {}
        self._all_vendor_retailer_hub = self._get_avaliable_vr()

    def _get_column(self):
        """Get attribute from config table
        :return:
        """
        self._dct_table = {"DIM_PRODUCT": "", "DIM_STORE": ""}
        self._dct_key = {"DIM_PRODUCT": "ITEM_KEY", "DIM_STORE": "STORE_KEY"}
        self._dct_table_column = {"PRODUCT": [], "STORE": []}
        sql = "SELECT DISTINCT KEY_COLUMN FROM AP_CONFIG_META_ATTRIBUTE WHERE DIMENSION_TYPE = 1"
        self._logger.debug(sql)
        for row in self._db.query(sql):
            self._dct_table_column["PRODUCT"].append(row[0])
        sql = "SELECT DISTINCT KEY_COLUMN FROM AP_CONFIG_META_ATTRIBUTE WHERE DIMENSION_TYPE = 2"
        self._logger.debug(sql)
        for row in self._db.query(sql):
            self._dct_table_column["STORE"].append(row[0])
        for k, v in self._dct_table_column.items():
            self._dct_table_column[k] = """'{}'""".format("', '".join(v))

    def _get_source_config(self, hub_id):
        """Get source config
        :return:
        """
        self._source_dw = SiloOperation(hub_id=hub_id,
                                        meta=self.context["meta"])
        if self.context["meta"]["db_conn_vertica_silo_username"]:
            self._source_dw.config.json_data["configs"][
                "dw.etluser.id"] = self._source_dw.username
            self._source_dw.config.json_data["configs"][
                "dw.etluser.password"] = Crypto().encrypt(
                    self._source_dw.password)
        silo_type = self._source_dw.config.get_property(
            "etl.silo.type", "None").upper()
        # TODO For hub, silo_type is MD, not find para for this, default host
        if silo_type in ("WMSSC", "SASSC", "WMCAT", "WMINTL"):
            self._active_status = ""
        else:
            self._active_status = " AND D.ACTIVE = 'T'"

    def _get_avaliable_vr(self):
        """
        getting all avaliable vendor and retailer from cycle mapping table.
        and get related hub_id via config service.
        :return: dict like {'vendor,retailer': 'hub_id', ...}
        """

        sql = "SELECT DISTINCT VENDOR_KEY, RETAILER_KEY FROM AP_ALERT_CYCLE_MAPPING " \
              "UNION " \
              "SELECT DISTINCT VENDOR_KEY, RETAILER_KEY FROM AP_ALERT_CYCLE_RC_MAPPING"
        self._logger.debug(sql)
        dct_vendor_retailer_hub = {}
        # dct_vendor_retailer_hub = dict(self._db.query(sql))
        for v_r in self._db.query(sql):
            try:
                config = Config(meta=self.context["meta"],
                                vendor_key=v_r.VENDOR_KEY,
                                retailer_key=v_r.RETAILER_KEY)
                hub_id = config.get_hub_id()
                _key = str(v_r.VENDOR_KEY) + ',' + str(v_r.RETAILER_KEY)
                dct_vendor_retailer_hub[_key] = hub_id
            # in case there is no config returned for given vendor & retailer, then skip this vendor & retailer.
            except Exception as e:
                # self._logger.warning(str(e))
                self._logger.warning(
                    "Seems there is no silo configed for vendor: %s and retailer: %s"
                    % (str(v_r.VENDOR_KEY), str(v_r.RETAILER_KEY)))
                continue

        return dct_vendor_retailer_hub

    def _get_vendor_retailer_from_hub(self, hub_id):
        """Get vendor retailer from table AP_ALERT_CYCLE_MAPPING, only keep the first one that appear in multiple hubs
        :return:
        """
        self._str_vendor_retailer_filter = ""
        self._str_vendor_filter = ""
        self._str_retailer_filter = ""
        lst_vendor_retailer_filter = []
        lst_vendor_filter = []
        lst_retailer_filter = []
        dct_vendor_retailer_hub = self._all_vendor_retailer_hub

        sql = (
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ "
            "    DISTINCT VENDOR_KEY, RETAILER_KEY FROM {self._meta_schema}.CONFIG_CUSTOMERS"
            " WHERE VENDOR_NAME != 'Alert' AND HUB_ID = '{hub_id}'"
            " ORDER BY VENDOR_KEY, RETAILER_KEY".format(hub_id=hub_id,
                                                        self=self))
        self._logger.debug(sql)
        vendor_retailers = self._source_dw.query(sql)
        for row in vendor_retailers:
            _vr = "{row.VENDOR_KEY},{row.RETAILER_KEY}".format(row=row)
            # same vendor & retailer could have multi hubs.
            # So only getting the one which matches with the hub from Config service.
            if dct_vendor_retailer_hub.get(_vr) == hub_id or self._all_dim:
                if _vr not in self._dct_vendor_retailer_hub:
                    self._dct_vendor_retailer_hub[_vr] = hub_id
                    lst_vendor_retailer_filter.append(
                        "({row.VENDOR_KEY},{row.RETAILER_KEY})".format(
                            row=row))
                if str(row.VENDOR_KEY) not in self._dct_vendor_hub:
                    self._dct_vendor_hub["{}".format(row.VENDOR_KEY)] = hub_id
                    lst_vendor_filter.append("{}".format(row.VENDOR_KEY))
                if str(row.RETAILER_KEY) not in self._dct_retailer_hub:
                    self._dct_retailer_hub["{}".format(
                        row.RETAILER_KEY)] = hub_id
                    lst_retailer_filter.append("{}".format(row.RETAILER_KEY))
        if lst_vendor_retailer_filter:
            self._str_vendor_retailer_filter = (
                " AND (VENDOR_KEY, RETAILER_KEY) IN ({})"
                "".format(",".join(lst_vendor_retailer_filter)))
            self._logger.debug(self._str_vendor_retailer_filter)
        if lst_vendor_filter:
            self._str_vendor_filter = (" AND VENDOR_KEY IN ({})".format(
                ",".join(list(set(lst_vendor_filter)))))
            self._logger.debug(self._str_vendor_filter)
        if lst_retailer_filter:
            self._str_retailer_filter = (" AND RETAILER_KEY IN ({})".format(
                ",".join(list(set(lst_retailer_filter)))))
            self._logger.debug(self._str_retailer_filter)

    def _recreate_stage_table(self):
        """
        Creating related stage tables on IRIS DW.
        :return:
        """
        lst_sql = []
        for table in (list(self._dct_table.keys()) + ["PRODUCT", "STORE"]):
            lst_sql.append("""
                DROP TABLE IF EXISTS {self._common_schema}.STAGE_{table};
                CREATE TABLE {self._common_schema}.STAGE_{table} LIKE {self._common_schema}.{table} INCLUDING PROJECTIONS;
            """.format(self=self, table=table))
        sql = ''.join(lst_sql)
        self._logger.info(sql)
        self._dw.execute(sql)

        # creating staging table for loading CIS source data.
        _sql = """
         DROP TABLE IF EXISTS {cmnSchema}.STAGE_{itemTable}_CIS;
         CREATE TABLE {cmnSchema}.STAGE_{itemTable}_CIS LIKE {cmnSchema}.{itemTable} INCLUDING PROJECTIONS;
         
         DROP TABLE IF EXISTS {cmnSchema}.STAGE_{storeTable}_CIS;
         CREATE TABLE {cmnSchema}.STAGE_{storeTable}_CIS LIKE {cmnSchema}.{storeTable} INCLUDING PROJECTIONS;
         """.format(cmnSchema=self._common_schema,
                    itemTable=self._dim_product,
                    storeTable=self._dim_store)
        self._logger.info(_sql)
        self._dw.execute(_sql)

    def _truncate_stage_table(self):
        """Truncate stage table
        :return:
        """
        lst_sql = []
        for k in (list(self._dct_table.keys()) + ["PRODUCT", "STORE"]):
            lst_sql.append(
                "TRUNCATE TABLE {self._common_schema}.STAGE_{k}".format(
                    self=self, k=k))
        sql = ';\n'.join(lst_sql)
        self._logger.debug(sql)
        self._dw.execute(sql)

    def _analyze_table(self):
        """Analyze table
        :return:
        """
        lst_sql = []
        for k in self._dct_table.keys():
            lst_sql.append(
                "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ ANALYZE_STATISTICS('{self._common_schema}.{k}')"
                "".format(self=self, k=k))
        sql = ';\n'.join(lst_sql)
        self._logger.debug(sql)
        self._dw.execute(sql)

    def _export_calendar_data(self, hub_id):
        """ Export calendar data to common schema from source schema if no data in DIM_CALENDAR table
        :param hub_id:
        :return:
        """
        cnt = self._dw.query_scalar(
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ COUNT(*) "
            "FROM {self._common_schema}.{self._dim_calendar}".format(
                self=self))
        if cnt != 0:
            self._logger.info(
                "Calendar data exists, skip hub_id {}.".format(hub_id))
        else:
            col = (
                'PERIOD_KEY, CALENDAR_KEY, CALENDARNAME, YEAR, YEARNAME, QUARTER, QUARTERNAME, MONTH, MONTHNAME, '
                'PERIOD, PERIODNAME, WEEKENDED, WEEKENDEDNAME, WEEKBEGIN, WEEKBEGINNAME, YEARWEEKNAME, YEARWEEK, '
                'YEARMONTHWEEKNAME, LY_PERIOD_KEY, NY_PERIOD_KEY, DATE_NAME, DATE_VALUE, CAL_PERIOD_KEY, '
                '"2YA_PERIOD_KEY", "3YA_PERIOD_KEY", "4YA_PERIOD_KEY"')
            sql = """
            SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ CAL.PERIOD_KEY AS PERIOD_KEY,
                   META.CALENDAR_KEY AS CALENDAR_KEY,
                   MAX(META.CALENDAR_NAME) AS CALENDARNAME,
                   MAX(CAL."YEAR") AS "YEAR",
                   MAX(CAL.YEARNAME) AS YEARNAME,
                   MAX(CAL.YEARQUARTER) AS QUARTER,
                   MAX(CAL.YEARQUARTERNAME) AS QUARTERNAME,
                   MAX(CAL.YEARPERIOD) AS "MONTH",
                   MAX(CAL.YEARPERIODNAME) AS MONTHNAME,
                   MAX(CAL.YEARMONTH) AS PERIOD,
                   MAX(CAL.YEARMONTHNAME) AS PERIODNAME,
                   MAX(CAL.WEEKENDED) AS WEEKENDED,
                   MAX(CAL.WEEKENDEDNAME) AS WEEKENDEDNAME,
                   MAX(CAL.WEEKBEGIN) AS WEEKBEGIN,
                   MAX(CAL.WEEKBEGINNAME) AS WEEKBEGINNAME,
                   (((MAX(CAL.YEAR))::VARCHAR|| ' - Wk '::VARCHAR(6)) || SUBSTR(('00'::VARCHAR(2) 
                   ||(MAX(CAL.WEEK))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(MAX(CAL.WEEK))::VARCHAR)) - 2) + 1),2)
                   ) AS YEARWEEKNAME,
                   MAX(CAL.YEARWEEK) AS YEARWEEK,
                   (((((MAX(CAL.YEAR))::VARCHAR|| ' - '::VARCHAR(3)) || SUBSTR (('00'::VARCHAR(2) 
                   ||(MAX(CAL.MONTH))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(MAX(CAL.MONTH))::VARCHAR)) - 2) + 1),2)) 
                   || ' Wk '::VARCHAR(4)) || SUBSTR(('00'::VARCHAR(2) 
                   ||(CEILING(((RANK() OVER (PARTITION BY META.CALENDAR_KEY,MAX(CAL.YEAR),MAX(META.MONTH) 
                   ORDER BY CAL.PERIOD_KEY)*1.0) / 7::NUMERIC(18,0))))::VARCHAR),((LENGTH(('00'::VARCHAR(2) 
                   ||(CEILING(((RANK() OVER (PARTITION BY META.CALENDAR_KEY,MAX(CAL.YEAR),MAX(CAL.MONTH) ORDER BY 
                   CAL.PERIOD_KEY)*1.0) / 7::NUMERIC(18,0))))::VARCHAR)) - 2) + 1),2)) AS YEARMONTHWEEKNAME,
                   MAX(CAL.LY_PERIOD_KEY) LY_PERIOD_KEY,
                   MAX(CAL.NY_PERIOD_KEY) NY_PERIOD_KEY,
                   TO_CHAR(TO_DATE(CAL.PERIOD_KEY::VARCHAR, 'YYYYMMDD'), 'YYYY-MM-DD') DATE_NAME,
                   TO_DATE(CAL.PERIOD_KEY::VARCHAR, 'YYYYMMDD') DATE_VALUE,
                   CAL.PERIOD_KEY CAL_PERIOD_KEY,
                   MAX("2YA_PERIOD_KEY") "2YA_PERIOD_KEY",  MAX("3YA_PERIOD_KEY") "3YA_PERIOD_KEY", 
                   MAX("4YA_PERIOD_KEY") "4YA_PERIOD_KEY"
            FROM (
                (SELECT 2 AS CALENDAR_KEY, 'STD' AS PREFIX, 1 AS "MONTH", '445 (Standard)' AS CALENDAR_NAME) META
                  JOIN (SELECT C.PERIOD_KEY AS PERIOD_KEY,
                        C.CALENDAR_KEY AS CALENDAR_KEY,
                        C.YEAR,
                        C.WEEK,
                        C.MONTH,
                        (C.YEAR)::VARCHAR(4) AS YEARNAME,
                        QE.YEARQUARTER,
                        QE.YEARQUARTERNAME,
                        ME.YEARPERIOD AS YEARMONTH,
                        ME.YEARPERIODNAME AS YEARMONTHNAME,
                        ME.YEARPERIOD,
                        ME.YEARPERIODNAME,
                        WE.YEARWEEK AS YEARWEEK,
                        WE.WEEKBEGIN,
                        TO_CHAR(TO_DATE ((WE.WEEKBEGIN)::VARCHAR,'YYYYMMDD'::VARCHAR(8)),'MM/DD/YYYY'::VARCHAR(10)
                        ) AS WEEKBEGINNAME,
                        WE.WEEKENDED,
                        TO_CHAR(TO_DATE ((WE.WEEKENDED)::VARCHAR,'YYYYMMDD'::VARCHAR(8)),'MM/DD/YYYY'::VARCHAR(10)
                        ) AS WEEKENDEDNAME,
                        C.LY_PERIOD_KEY,
                        C.NY_PERIOD_KEY,
                        YA2.LY_PERIOD_KEY "2YA_PERIOD_KEY",  YA3.LY_PERIOD_KEY "3YA_PERIOD_KEY", 
                        YA4.LY_PERIOD_KEY "4YA_PERIOD_KEY"
                    FROM ((({self._dim_schema}.CALENDAR_PERIOD C
                      JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY,
                            MIN(CALENDAR_PERIOD.PERIOD_KEY) AS WEEKBEGIN,
                            MAX(CALENDAR_PERIOD.PERIOD_KEY) AS WEEKENDED,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.WEEK AS WEEK,
                            CASE
                            WHEN (CALENDAR_PERIOD.WEEK > 2000) THEN CALENDAR_PERIOD.WEEK
                            ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.WEEK)
                            END AS YEARWEEK
                        FROM {self._dim_schema}.CALENDAR_PERIOD
                        WHERE ((CALENDAR_PERIOD.WEEK_CODE = 'B'::CHAR(1)) OR (CALENDAR_PERIOD.WEEK_CODE = 'E'::CHAR(1)))
                        GROUP BY CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.WEEK) WE
                      ON ( ( (C.CALENDAR_KEY = WE.CALENDAR_KEY)
                      AND (C.YEAR = WE.YEAR)
                      AND (C.WEEK = WE.WEEK))))
                      JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.MONTH AS PERIOD,
                            CASE
                            WHEN (CALENDAR_PERIOD.MONTH > 2000) THEN (CALENDAR_PERIOD.MONTH)::VARCHAR(6)
                            ELSE CONCAT (CONCAT (CONCAT ('Period '::VARCHAR(7),(CALENDAR_PERIOD.MONTH)::VARCHAR(2)), 
                                 ', '::VARCHAR(2)),(CALENDAR_PERIOD.YEAR)::VARCHAR(4))
                            END AS YEARPERIODNAME,
                            CASE
                            WHEN (CALENDAR_PERIOD.MONTH > 2000) THEN CALENDAR_PERIOD.MONTH
                            ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.MONTH)
                            END AS YEARPERIOD
                        FROM {self._dim_schema}.CALENDAR_PERIOD
                        WHERE ((CALENDAR_PERIOD.MONTH_CODE = 'B'::CHAR (1)) 
                                OR (CALENDAR_PERIOD.MONTH_CODE = 'E'::CHAR (1)))
                        GROUP BY CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.MONTH) ME
                      ON ( ( (C.CALENDAR_KEY = ME.CALENDAR_KEY)
                      AND (C.YEAR = ME.YEAR)
                      AND (C.MONTH = ME.PERIOD))))
                      JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.QUARTER AS QUARTER,
                            CASE WHEN CALENDAR_PERIOD.QUARTER > 2000 THEN CONCAT(CONCAT (CONCAT ('Q'::VARCHAR(1), 
                                (SUBSTR((CALENDAR_PERIOD.QUARTER)::VARCHAR(6), 6))::VARCHAR(2)), 
                                ', '::VARCHAR(2)),(CALENDAR_PERIOD.QUARTER)::VARCHAR(4)) 
                            ELSE CONCAT(CONCAT (CONCAT ('Q'::VARCHAR(1),(CALENDAR_PERIOD.QUARTER)::VARCHAR(2)), 
                                ', '::VARCHAR(2)),(CALENDAR_PERIOD.YEAR)::VARCHAR(4)) 
                            END AS  YEARQUARTERNAME,
                            CASE
                            WHEN (CALENDAR_PERIOD.QUARTER > 2000) THEN CALENDAR_PERIOD.QUARTER
                            ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.QUARTER)
                            END AS YEARQUARTER
                        FROM {self._dim_schema}.CALENDAR_PERIOD
                        WHERE ((CALENDAR_PERIOD.QUARTER_CODE = 'B'::CHAR (1)) 
                                OR (CALENDAR_PERIOD.QUARTER_CODE = 'E'::CHAR (1)))
                        GROUP BY CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.QUARTER) QE
                      ON (((C.CALENDAR_KEY = QE.CALENDAR_KEY) AND (C.YEAR = QE.YEAR) AND (C.QUARTER = QE.QUARTER))))
                      LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA2 
                      ON C.CALENDAR_KEY = YA2.CALENDAR_KEY AND C.LY_PERIOD_KEY = YA2.PERIOD_KEY
                      LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA3 
                      ON C.CALENDAR_KEY = YA3.CALENDAR_KEY AND YA2.LY_PERIOD_KEY = YA3.PERIOD_KEY
                      LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA4 
                      ON C.CALENDAR_KEY = YA4.CALENDAR_KEY AND YA3.LY_PERIOD_KEY = YA4.PERIOD_KEY        
                    WHERE (C.YEAR >= ((DATE_PART ('year'::VARCHAR (4),(NOW ())::TIMESTAMP))::INT- 5))) CAL 
                  ON ( (CAL.CALENDAR_KEY = META.CALENDAR_KEY))
            )
            GROUP BY CAL.PERIOD_KEY,META.CALENDAR_KEY;
            """.format(self=self)
            dct_sync_data = self.context.copy()
            dct_sync_data["source_dw"] = self._source_dw
            dct_sync_data["source_config"] = self._source_dw.config.json_data[
                "configs"]
            dct_sync_data["target_dw_schema"] = self._common_schema
            dct_sync_data["target_dw_table"] = self._dim_calendar
            dct_sync_data["target_column"] = col
            dct_sync_data["source_sql"] = sql
            self._logger.debug(sql)
            sync_data(dct_sync_data)
            self._logger.info(
                "Sync hub_id {} dim calendar done".format(hub_id))

    def _export_product_data(self, hub_id, cis_source=False):
        """
        Export product data to common schema from source schema
        :param hub_id:
        :return:
        """

        _target_stage_table = "STAGE_{}".format(self._dim_product)
        _src_item_table = "PRODUCT"
        if cis_source:
            _target_stage_table = "STAGE_{}_CIS".format(self._dim_product)
            _src_item_table = "STAGE_{}".format(self._product_cis_src_tbl)

            # Creating related staging table for CIS source in HUB dw.
            # Since CIS source tables are external tables, can NOT be used to join other tables directly.
            sql = """
            DROP TABLE IF EXISTS {dimSchema}.{stgItemTable};
            CREATE TABLE {dimSchema}.{stgItemTable} AS 
            SELECT RETAILER_KEY, VENDOR_KEY, ITEM_KEY, UPC_ID AS UPC, 
                   ATTRIBUTE_NAME as ATTRIBNAME, ATTRIBUTE_VALUE as ATTRIBVALUE, 'T' AS ACTIVE 
            FROM {dimSchema}.{srcItemTable}
            WHERE 0=0 {filters};
            """.format(dimSchema=self._dim_schema,
                       stgItemTable=_src_item_table,
                       srcItemTable=self._product_cis_src_tbl,
                       filters=self._str_vendor_retailer_filter)
            self._logger.info(sql)
            self._source_dw.execute(sql)

        col = (
            " RETAILER_KEY, VENDOR_KEY, ITEM_KEY, UPC, ITEM_DESCRIPTION, ITEM_GROUP, OSM_ITEM_NBR, OSM_ITEM_STATUS,"
            "OSM_ITEM_TYPE, OSM_BRAND, OSM_UNIT_PRICE, OSM_CATEGORY, OSM_MAJOR_CATEGORY_NO, OSM_MAJOR_CATEGORY, "
            "OSM_SUB_CATEGORY_NO, OSM_SUB_CATEGORY, OSM_VENDOR_STK_NBR, OSM_VENDOR_PACK_COST, VENDOR_PACK_QTY, "
            "OSM_WHSE_PACK_COST, OSM_WHSE_PACK_QTY, DSD_IND, VENDOR_NAME ")
        sql = """
        SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ 
            CC.RETAILER_KEY, CC.VENDOR_KEY, PV.ITEM_KEY, PV.UPC, AGPG.ITEM_DESCRIPTION, 
            CASE WHEN AGPG.ITEM_GROUP IS NULL OR AGPG.ITEM_GROUP = '000000' THEN PV.UPC ELSE AGPG.ITEM_GROUP END AS 
            ITEM_GROUP, AGPG.OSM_ITEM_NBR, AGPG.OSM_ITEM_STATUS, AGPG.OSM_ITEM_TYPE, AGPG.OSM_BRAND,AGPG.OSM_UNIT_PRICE, 
            AGPG.OSM_CATEGORY, AGPG.OSM_MAJOR_CATEGORY_NO, AGPG.OSM_MAJOR_CATEGORY, AGPG.OSM_SUB_CATEGORY_NO, 
            AGPG.OSM_SUB_CATEGORY, AGPG.OSM_VENDOR_STK_NBR, AGPG.OSM_VENDOR_PACK_COST, 
            CASE WHEN length(AGPG.VENDOR_PACK_QTY) < 15
                THEN CASE WHEN REGEXP_LIKE(SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15), '^[0-9]*(\.[0-9]+)?$') 
                    THEN CASE WHEN SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15)::NUMERIC <= 1 THEN 1
                        ELSE SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15)::NUMERIC END 
                    ELSE 1 END 
                ELSE 1 END VENDOR_PACK_QTY, AGPG.OSM_WHSE_PACK_COST, AGPG.OSM_WHSE_PACK_QTY, 
            CASE WHEN AGPG.OSM_ITEM_TYPE IN ('07','37') THEN '1' ELSE '0' END AS DSD_IND, CC.VENDOR_NAME
        FROM (SELECT DISTINCT RETAILER_KEY, VENDOR_KEY, VENDOR_NAME FROM {self._meta_schema}.CONFIG_CUSTOMERS
              WHERE VENDOR_NAME != 'Alert' {self._str_vendor_retailer_filter}) CC
        JOIN (
            SELECT D.VENDOR_KEY, D.ITEM_KEY, MAX(D.UPC) UPC FROM {self._dim_schema}.{itemTable} D
            JOIN {self._dim_schema}.VENDOR V on V.VENDOR_KEY = D.VENDOR_KEY
            WHERE D.ITEM_KEY NOT IN (SELECT ITEM_KEY FROM {self._dim_schema}.PRODUCT_GLOBAL_FILTER) 
            {self._active_status}
            GROUP BY D.VENDOR_KEY, D.ITEM_KEY
        ) PV ON PV.VENDOR_KEY = CC.VENDOR_KEY
        LEFT JOIN (
            SELECT AGP.RETAILER_KEY, AGP.VENDOR_KEY, AGP.ITEM_KEY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'ITEM_GROUP_X' THEN ATTRIBVALUE END) ITEM_GROUP
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'VENDOR_PACK_QTY_X' THEN ATTRIBVALUE END) VENDOR_PACK_QTY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'ITEM_DESCRIPTION_X' THEN ATTRIBVALUE END) ITEM_DESCRIPTION
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_NBR_X' THEN ATTRIBVALUE END)     OSM_ITEM_NBR
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_STATUS_X' THEN ATTRIBVALUE END)     OSM_ITEM_STATUS
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_TYPE_X' THEN ATTRIBVALUE END)     OSM_ITEM_TYPE
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_WHSE_PACK_QTY_X' THEN ATTRIBVALUE END)     OSM_WHSE_PACK_QTY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_MAJOR_CATEGORY_NO_X' THEN ATTRIBVALUE END)     OSM_MAJOR_CATEGORY_NO
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_MAJOR_CATEGORY_X' THEN ATTRIBVALUE END)     OSM_MAJOR_CATEGORY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_CATEGORY_X' THEN ATTRIBVALUE END)     OSM_CATEGORY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_SUB_CATEGORY_NO_X' THEN ATTRIBVALUE END)     OSM_SUB_CATEGORY_NO
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_SUB_CATEGORY_X' THEN ATTRIBVALUE END)     OSM_SUB_CATEGORY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_BRAND_X' THEN ATTRIBVALUE END)     OSM_BRAND
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_UNIT_PRICE_X' THEN ATTRIBVALUE END)     OSM_UNIT_PRICE
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_VENDOR_STK_NBR_X' THEN ATTRIBVALUE END)     OSM_VENDOR_STK_NBR
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_VENDOR_PACK_COST_X' THEN ATTRIBVALUE END)     OSM_VENDOR_PACK_COST
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_WHSE_PACK_COST_X' THEN ATTRIBVALUE END)     OSM_WHSE_PACK_COST
            FROM (
                SELECT AG.RETAILER_KEY, AG.VENDOR_KEY, CONCAT(ATTR_GROUP, '_X') ATTRIBNAME, D.ITEM_KEY, D.ATTRIBVALUE
                      FROM {self._meta_schema}.ATTRIBUTEGROUP AG
                      JOIN {self._dim_schema}.{itemTable} D
                      ON D.VENDOR_KEY = AG.VENDOR_KEY AND D.ATTRIBNAME = AG.ATTR_NAME {self._active_status}
                      WHERE AG.ATTR_GROUP IN ('ITEM_GROUP','VENDOR_PACK_QTY','ITEM_DESCRIPTION','OSM_ITEM_NBR'
                          ,'OSM_ITEM_STATUS','OSM_ITEM_TYPE','OSM_WHSE_PACK_QTY','OSM_MAJOR_CATEGORY_NO'
                          ,'OSM_MAJOR_CATEGORY','OSM_CATEGORY','OSM_SUB_CATEGORY_NO','OSM_SUB_CATEGORY','OSM_BRAND',
                          'OSM_UNIT_PRICE','OSM_VENDOR_STK_NBR','OSM_VENDOR_PACK_COST','OSM_WHSE_PACK_COST')
            ) AGP
            GROUP BY AGP.RETAILER_KEY, AGP.VENDOR_KEY, AGP.ITEM_KEY
        ) AGPG ON AGPG.RETAILER_KEY = CC.RETAILER_KEY AND AGPG.VENDOR_KEY = PV.VENDOR_KEY AND AGPG.ITEM_KEY =PV.ITEM_KEY
        WHERE (CC.RETAILER_KEY, CC.VENDOR_KEY, PV.ITEM_KEY) NOT IN
                (SELECT RETAILER_KEY, VENDOR_KEY, ITEM_KEY FROM {self._dim_schema}.PRODUCT_DEFAULT_FILTER)
        """.format(self=self, itemTable=_src_item_table)
        dct_sync_data = self.context.copy()
        dct_sync_data["source_dw"] = self._source_dw
        dct_sync_data["source_config"] = self._source_dw.config.json_data[
            "configs"]
        dct_sync_data["target_dw_schema"] = self._common_schema
        dct_sync_data["target_dw_table"] = _target_stage_table
        dct_sync_data["target_column"] = col
        dct_sync_data["source_sql"] = sql
        self._logger.debug(sql)
        sync_data(dct_sync_data)

        # sync tall table(not for CIS).
        if self._str_vendor_filter and not cis_source:
            self._product_columns = self._dct_table_column["PRODUCT"]
            col = " VENDOR_KEY, ITEM_KEY, UPC, ATTRIBNAME, ATTRIBVALUE, ACTIVE "
            sql = (
                "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {col} FROM {self._dim_schema}.PRODUCT WHERE ATTRIBNAME "
                "IN ({self._product_columns}) {self._str_vendor_filter}".
                format(self=self, col=col))
            dct_sync_data["target_dw_table"] = "STAGE_PRODUCT"
            dct_sync_data["target_column"] = col
            dct_sync_data["source_sql"] = sql
            self._logger.debug(sql)
            sync_data(dct_sync_data)

    def _export_store_data(self, hub_id, cis_source=False):
        """
        Export store data to common schema table stage_<table_name> from source schema <table_name>
        :param hub_id:
        :return:
        """

        _target_store_table = "STAGE_{}".format(self._dim_store)
        _src_store_table = "STORE"
        if cis_source:
            _target_store_table = "STAGE_{}_CIS".format(self._dim_store)
            _src_store_table = "STAGE_{}".format(self._store_cis_src_tbl)

            # Creating related staging table for CIS source in HUB dw.
            # Since CIS source tables are external tables, can NOT be used to join other tables directly.
            sql = """
            DROP TABLE IF EXISTS {dimSchema}.{stgStoreTable};
            CREATE TABLE {dimSchema}.{stgStoreTable} AS 
            SELECT RETAILER_KEY, VENDOR_KEY, STORE_KEY, STORE_ID AS STOREID, 
                   ATTRIBUTE_NAME as ATTRIBNAME, ATTRIBUTE_VALUE as ATTRIBVALUE, 'T' as ACTIVE 
            FROM {dimSchema}.{srcStoreTable}
            WHERE 0=0 {filters};
            """.format(dimSchema=self._dim_schema,
                       stgStoreTable=_src_store_table,
                       srcStoreTable=self._store_cis_src_tbl,
                       filters=self._str_vendor_retailer_filter)
            self._logger.info(sql)
            self._source_dw.execute(sql)

        col = (
            " RETAILER_KEY, VENDOR_KEY, STORE_KEY, STORE_ID, MARKET_CLUSTER, STATE, CITY, ZIP, OSM_REGION, "
            "OSM_DISTRICT, RSI_ALERT_TYPE, RETAILER_NAME, RSI_BANNER, PRIME_DC, STORE_NAME"
        )
        sql = """
        SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ CC.RETAILER_KEY, CC.VENDOR_KEY, SR.STORE_KEY, SR.STOREID, 
            AGSG.MARKET_CLUSTER, SR.STATE,
            CASE WHEN SR.CITY IS NULL AND STATE IS NULL THEN 'N/A'
                WHEN SR.CITY IS NOT NULL AND STATE IS NULL THEN CONCAT(SR.CITY, ', N/A')
                WHEN SR.CITY IS NULL AND STATE IS NOT NULL THEN CONCAT('N/A, ' , SR.STATE)
                WHEN SR.CITY IS NOT NULL AND STATE IS NOT NULL THEN CONCAT(CONCAT(SR.CITY, ', '), SR.STATE) END CITY,
            SR.ZIP, AGSG.OSM_REGION, AGSG.OSM_DISTRICT, COALESCE(SR.RSI_ALERT_TYPE, '0') RSI_ALERT_TYPE,
            CC.RETAILER_NAME, coalesce(AGSG.RSI_BANNER, 'unmapped') AS RSI_BANNER, AGSG.PRIME_DC, AGSG.STORE_NAME
        FROM (SELECT DISTINCT RETAILER_KEY, VENDOR_KEY, RETAILER_NAME FROM {self._meta_schema}.CONFIG_CUSTOMERS
              WHERE VENDOR_NAME != 'Alert' {self._str_vendor_retailer_filter}) CC
        JOIN (
            SELECT D.RETAILER_KEY, D.STORE_KEY, D.STOREID,
                MAX(CASE WHEN D.ATTRIBNAME = 'STATE' THEN D.ATTRIBVALUE END) STATE,
                MAX(CASE WHEN D.ATTRIBNAME = 'CITY' THEN D.ATTRIBVALUE END) CITY,
                MAX(CASE WHEN D.ATTRIBNAME = 'ZIP' THEN D.ATTRIBVALUE END) ZIP,
                MAX(CASE WHEN D.ATTRIBNAME = 'RSI_ALERT_TYPE' THEN D.ATTRIBVALUE END) RSI_ALERT_TYPE
            FROM {self._dim_schema}.{storeTable} D
            JOIN {self._dim_schema}.RETAILER R ON R.RETAILER_KEY = D.RETAILER_KEY
            WHERE STORE_KEY NOT IN (SELECT STORE_KEY FROM {self._dim_schema}.STORES_GLOBAL_FILTER) {self._active_status}
            GROUP BY D.RETAILER_KEY, D.STORE_KEY, D.STOREID
        ) SR ON SR.RETAILER_KEY = CC.RETAILER_KEY
        LEFT JOIN (
            SELECT AGS.RETAILER_KEY, AGS.VENDOR_KEY, AGS.STORE_KEY,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'MARKET_CLUSTER_X' THEN ATTRIBVALUE END) MARKET_CLUSTER,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'OSM_REGION_X' THEN ATTRIBVALUE END) OSM_REGION,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'OSM_DISTRICT_X' THEN ATTRIBVALUE END) OSM_DISTRICT,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'RSI_BANNER_X' THEN ATTRIBVALUE END) RSI_BANNER,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'PRIME_DC_X' THEN ATTRIBVALUE END) PRIME_DC,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'SAR_STORE_NAME_X' THEN ATTRIBVALUE END) STORE_NAME
                FROM (
                    SELECT AG.RETAILER_KEY, AG.VENDOR_KEY, CONCAT(AG.ATTR_GROUP, '_X') ATTRIBNAME, D.STORE_KEY, 
                        D.ATTRIBVALUE
                    FROM {self._meta_schema}.ATTRIBUTEGROUP AG
                    JOIN {self._dim_schema}.{storeTable} D
                    ON D.RETAILER_KEY = AG.RETAILER_KEY AND D.ATTRIBNAME = AG.ATTR_NAME {self._active_status}
                    WHERE AG.ATTR_GROUP IN ('RSI_BANNER','PRIME_DC', 'MARKET_CLUSTER', 'OSM_REGION', 'OSM_DISTRICT', 'SAR_STORE_NAME')
                ) AGS
                GROUP BY AGS.RETAILER_KEY, AGS.VENDOR_KEY, AGS.STORE_KEY
        ) AGSG 
        ON AGSG.RETAILER_KEY = SR.RETAILER_KEY AND AGSG.VENDOR_KEY = CC.VENDOR_KEY
        AND AGSG.STORE_KEY = SR.STORE_KEY
        WHERE (CC.RETAILER_KEY, CC.VENDOR_KEY, SR.STORE_KEY) NOT IN
                (SELECT RETAILER_KEY, VENDOR_KEY, STORE_KEY FROM {self._dim_schema}.STORES_DEFAULT_FILTER);
        """.format(self=self, storeTable=_src_store_table)
        dct_sync_data = self.context.copy()
        dct_sync_data["source_dw"] = self._source_dw
        dct_sync_data["source_config"] = self._source_dw.config.json_data[
            "configs"]
        dct_sync_data["target_dw_schema"] = self._common_schema
        dct_sync_data["target_dw_table"] = _target_store_table
        dct_sync_data["target_column"] = col
        dct_sync_data["source_sql"] = sql
        self._logger.debug(sql)
        sync_data(dct_sync_data)

        # sync tall table(not for CIS)
        if self._str_retailer_filter and not cis_source:
            self._store_columns = self._dct_table_column["STORE"]
            col = " RETAILER_KEY, STORE_KEY, STOREID, ATTRIBNAME, ATTRIBVALUE, ACTIVE "
            sql = (
                "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {col} FROM {self._dim_schema}.STORE WHERE ATTRIBNAME "
                "IN ({self._store_columns}) {self._str_retailer_filter}".
                format(self=self, col=col))
            dct_sync_data["target_dw_table"] = "STAGE_STORE"
            dct_sync_data["target_column"] = col
            dct_sync_data["source_sql"] = sql
            self._logger.debug(sql)
            sync_data(dct_sync_data)

    def _load_product_data_cis(self, hub_id):
        """
        Sync data from CIS source.
        if the same item_key exists in both CIS and NXG HUBs, the CIS one should override the one in NXG side.
        :param hub_id:
        :return:
        """

        # dump CIS data into cis staging table: stage_dim_product_cis
        self._export_product_data(hub_id=hub_id, cis_source=True)

        # Below part is moved from method self._export_product_data. Put it here when all product data are ready.
        # Override nxg items with CIS if there are duplicated items on both NXG and CIS
        sql = """
         DELETE FROM {cmnSchema}.STAGE_{itemTable} 
         WHERE (vendor_key, retailer_key, item_key) IN
         (SELECT vendor_key, retailer_key, item_key
          FROM {cmnSchema}.STAGE_{itemTable}_CIS);
         
         INSERT INTO {cmnSchema}.STAGE_{itemTable} 
         SELECT * FROM {cmnSchema}.STAGE_{itemTable}_CIS;
        """.format(itemTable=self._dim_product, cmnSchema=self._common_schema)
        self._logger.info(sql)
        self._dw.execute(sql)

        # updating column OSM_MAJOR_CATEGORY when NXG and CIS sources are all done.
        sql = """
        DROP TABLE IF EXISTS TEMP_OSM_SUB_CATEGORY_CONSISTENCY;
        CREATE LOCAL TEMP TABLE IF NOT EXISTS TEMP_OSM_SUB_CATEGORY_CONSISTENCY ON COMMIT PRESERVE ROWS 
        AS /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ 
        SELECT distinct OSM_SUB_CATEGORY_NO,MAX(OSM_SUB_CATEGORY) as OSM_SUB_CATEGORY 
        FROM {self._common_schema}.{self._dim_product} GROUP BY OSM_SUB_CATEGORY_NO;

        DROP TABLE IF EXISTS TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY;
        CREATE LOCAL TEMP TABLE IF NOT EXISTS TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY ON COMMIT PRESERVE ROWS 
        AS /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ 
        SELECT distinct OSM_MAJOR_CATEGORY_NO,MAX(OSM_MAJOR_CATEGORY) as OSM_MAJOR_CATEGORY 
        FROM {self._common_schema}.{self._dim_product} GROUP BY OSM_MAJOR_CATEGORY_NO;
        """.format(self=self)
        self._logger.info(sql)
        self._dw.execute(sql)

        sql = ("""
        SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ ANALYZE_STATISTICS('{self._common_schema}.STAGE_{self._dim_product}');

        INSERT /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ INTO TEMP_OSM_SUB_CATEGORY_CONSISTENCY
        SELECT a.OSM_SUB_CATEGORY_NO, MAX(a.OSM_SUB_CATEGORY) AS OSM_SUB_CATEGORY
        FROM {self._common_schema}.STAGE_{self._dim_product} a
        LEFT JOIN TEMP_OSM_SUB_CATEGORY_CONSISTENCY b on a.OSM_SUB_CATEGORY_NO = b.OSM_SUB_CATEGORY_NO
        WHERE a.OSM_SUB_CATEGORY_NO IS NOT NULL AND a.OSM_SUB_CATEGORY_NO <> '' AND b.OSM_SUB_CATEGORY_NO IS NULL
        GROUP BY a.OSM_SUB_CATEGORY_NO;

        UPDATE /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {self._common_schema}.STAGE_{self._dim_product}
        SET OSM_SUB_CATEGORY = b.OSM_SUB_CATEGORY
        FROM 
        (SELECT * FROM TEMP_OSM_SUB_CATEGORY_CONSISTENCY) b
        WHERE {self._common_schema}.STAGE_{self._dim_product}.OSM_SUB_CATEGORY_NO = b.OSM_SUB_CATEGORY_NO;
        /*UPDATE {self._common_schema}.STAGE_{self._dim_product}
        SET ITEM_GROUP = B.ITEM_GROUP
        FROM $schemaName.OLAP_ITEM B
        WHERE {self._common_schema}.STAGE_{self._dim_product}.ITEM_KEY = B.ITEM_KEY;*/

        INSERT /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ INTO TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY
        SELECT a.OSM_MAJOR_CATEGORY_NO, MAX(a.OSM_MAJOR_CATEGORY) AS OSM_MAJOR_CATEGORY
        FROM {self._common_schema}.STAGE_{self._dim_product} a
        LEFT JOIN TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY b on a.OSM_MAJOR_CATEGORY_NO = b.OSM_MAJOR_CATEGORY_NO
        WHERE a.OSM_MAJOR_CATEGORY_NO IS NOT NULL AND a.OSM_MAJOR_CATEGORY_NO <> '' AND b.OSM_MAJOR_CATEGORY_NO IS NULL
        GROUP BY a.OSM_MAJOR_CATEGORY_NO;

        UPDATE /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {self._common_schema}.STAGE_{self._dim_product}
        SET OSM_MAJOR_CATEGORY = b.OSM_MAJOR_CATEGORY
        FROM 
        (SELECT * FROM TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY) b
        WHERE {self._common_schema}.STAGE_{self._dim_product}.OSM_MAJOR_CATEGORY_NO = b.OSM_MAJOR_CATEGORY_NO;
        """.format(self=self))
        self._logger.info(sql)
        self._dw.execute(sql)

    def _load_store_data_cis(self, hub_id):
        """
        Sync data from CIS source.
        # if the same store_key exists in CIS and NXG HUBs, the CIS one should override the one in NXG side.
        :param hub_id:
        :return:
        """

        # dump CIS data into cis staging table: stage_dim_store_cis
        self._export_store_data(hub_id=hub_id, cis_source=True)

        # # Override nxg stores with CIS if there are duplicated items on both NXG and CIS
        sql = """
        DELETE FROM {cmnSchema}.STAGE_{storeTable} 
        WHERE (vendor_key, retailer_key, store_key) 
        IN (SELECT vendor_key, retailer_key, store_key 
            FROM {cmnSchema}.STAGE_{storeTable}_CIS );
        
        INSERT INTO {cmnSchema}.STAGE_{storeTable}  
        SELECT * FROM {cmnSchema}.STAGE_{storeTable}_CIS;
        """.format(storeTable=self._dim_store, cmnSchema=self._common_schema)
        self._logger.info(sql)
        self._dw.execute(sql)

    def _swap_data(self):
        """
        Switch partition from stage table to final table.
        :return:
        """

        sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT VENDOR_KEY "
               "FROM {self._common_schema}.STAGE_{self._dim_product}".format(
                   self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(
                schema_name=self._common_schema,
                table_name=self._dim_product,
                partition_name=row[0],
                stage_schema_name=self._common_schema,
                stage_table_name='STAGE_{self._dim_product}'.format(self=self))

        sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT VENDOR_KEY "
               "FROM {self._common_schema}.STAGE_PRODUCT".format(self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(schema_name=self._common_schema,
                                      table_name='PRODUCT',
                                      partition_name=row[0],
                                      stage_schema_name=self._common_schema,
                                      stage_table_name='STAGE_PRODUCT')

        sql = (
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT RETAILER_KEY "
            "FROM {self._common_schema}.STAGE_{self._dim_store}".format(
                self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(
                schema_name=self._common_schema,
                table_name=self._dim_store,
                partition_name=row[0],
                stage_schema_name=self._common_schema,
                stage_table_name='STAGE_{self._dim_store}'.format(self=self))

        sql = (
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT RETAILER_KEY "
            "FROM {self._common_schema}.STAGE_STORE".format(self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(schema_name=self._common_schema,
                                      table_name='STORE',
                                      partition_name=row[0],
                                      stage_schema_name=self._common_schema,
                                      stage_table_name='STAGE_STORE')

    def _process_sync_dim_data(self):
        """
        Process to sync dim data
        :return:
        """
        try:
            self._get_column()
            self._recreate_stage_table()
            for hub_id in self._hub_ids:
                try:
                    self._dim_schema = "DIM_{hub_id}".format(hub_id=hub_id)
                    self._meta_schema = "METADATA_{hub_id}".format(
                        hub_id=hub_id)
                    self._logger.info(
                        "Sync dim data start hub_id {hub_id}".format(
                            hub_id=hub_id))
                    self._get_source_config(hub_id)
                    self._export_calendar_data(hub_id)
                    self._get_vendor_retailer_from_hub(hub_id)

                    if not self._str_vendor_retailer_filter:
                        self._logger.warning(
                            "hub_id {} does not have any matched product and store data"
                            .format(hub_id))
                        continue

                    self._export_product_data(hub_id)
                    self._export_store_data(hub_id)

                    # handle CIS source.
                    self._load_product_data_cis(hub_id)
                    self._load_store_data_cis(hub_id)

                    self._logger.info(
                        "Sync hub_id {hub_id} dim data to stage table.".format(
                            hub_id=hub_id))
                except Exception as msg:
                    self._logger.warning(msg)
                    continue

            self._swap_data()
            # self._truncate_stage_table()
            self._logger.info("Sync dim data done from hubs {hub_ids}".format(
                hub_ids=self._hub_ids))
            self._analyze_table()
        except Exception as msg:
            self._logger.error(msg)
        finally:
            pass

    def process(self):
        try:
            self._process_sync_dim_data()

        except Exception as e:
            self._logger.error(e)

        finally:
            if hasattr(self, "_dw"):
                self._dw.close_connection()
            if hasattr(self, "_source_dw"):
                self._source_dw.close_connection()
            if hasattr(self, "_db"):
                self._db.close_connection()
Exemple #12
0
class AFMConfigToTable(object):
    """
    :input: afm_config.json under same folder. Note: this config file name is hardcoded.
    :output: below 4 tables.
    AFM_RULE_SET
    AFM_RULES
    AFM_RETAILER_RULE
    AFM_SVR_RULE
    :usage: python AFMConfigToTable.py

    :description:
    This is a temp solution for loading AFM configuration data from json format into AFM rule tables. Since the frontend UI is not yet working.
    Refer to above json string for the format template.
    When the frontend finished, this module will be also retired.
    There are only 4 tables involved here. See output tables list.
    """
    def __init__(self, meta, params, logger=None, filename="afm_config.json"):
        """
        :param meta:      common config
        :param params:    used by calling REST API
        :param filename:  only used for local testing
        """
        self.meta = meta
        self.params = params
        self.filename = filename
        self._cap = Capacity(meta=self.meta)
        self.app_conn = MSOperation(meta=self.meta)
        self._logger = logger if logger else Logger(log_level="debug",
                                                    target="console",
                                                    vendor_key=-1,
                                                    retailer_key=-1,
                                                    sql_conn=None)
        self.scd_url = "{0}/scd/process".format(
            self.meta["api_osa_bundle_str"])

    def get_data(self):
        # json_data = json.loads(afm_config_data)
        with open(self.filename, 'r') as fp:
            json_data = json.load(fp=fp)
        return json_data

    def insert_data(self):
        # _json_config_data = self.get_data()  # for local testing
        _json_config_data = self.params  # getting data from params directly instead of reading from config file.
        _rule_type = _json_config_data["ruleType"].upper()
        _cycle_key = _json_config_data["cycleKey"]
        _vendor_key = _json_config_data["vendorKey"]
        _retailer_key = _json_config_data["retailerKey"]
        _owner = _json_config_data["owner"]
        _rule_set_name = None

        try:
            if _rule_type not in ("RETAILER", "SVR"):
                self._logger.warning(
                    "ruleType should be only either RETAILER or SVR! "
                    "Please check config file: afm_config.json.")
                exit(1)

            if _rule_type == "RETAILER":
                _rule_set_name = _owner + '_' + self._cap.get_retailer_name_by_key(
                    retailer_key=_retailer_key)
            elif _rule_type == "SVR":
                _rule_set_name = _owner + '_' + self._cap.get_retailer_name_by_key(
                    retailer_key=_retailer_key
                ) + '_' + self._cap.get_vendor_name_by_key(
                    vendor_key=_vendor_key)

            rule_set_data = {
                "RULE_SET_NAME": _rule_set_name,
                "ENGINE_PROVIDER_NAME": "AFM",
                "DATA_PROVIDER_NAME": "AHOLD",
                "OWNER": _owner,
                "ITEM_SCOPE": _json_config_data["itemScope"],
                "STORE_SCOPE": _json_config_data["storeScope"],
                "TYPES_LIST": _json_config_data["typeList"],
                "ENABLED": "T",
                "CREATED_BY": _json_config_data["createdBy"],
                "UPDATED_BY": _json_config_data["createdBy"]
            }
            self._logger.info(rule_set_data)

            _rule_set_sql = """
            INSERT INTO AFM_RULE_SET
            (
             [RULE_SET_NAME], [ENGINE_PROVIDER_NAME], [DATA_PROVIDER_NAME],
             [OWNER], [ITEM_SCOPE], [STORE_SCOPE], [TYPES_LIST], [ENABLED],
             [CREATED_BY], [CREATED_DATE], [UPDATED_BY], [UPDATED_DATE]
            ) OUTPUT inserted.RULE_SET_ID
            VALUES(
             '{RULE_SET_NAME}', '{ENGINE_PROVIDER_NAME}', '{DATA_PROVIDER_NAME}',
             '{OWNER}', '{ITEM_SCOPE}', '{STORE_SCOPE}', '{TYPES_LIST}', '{ENABLED}',
             '{CREATED_BY}', GETDATE(), '{UPDATED_BY}', GETDATE()
            )
            """.format(**rule_set_data)
            self._logger.info("SQL for AFM_RULE_SET table is: %s" %
                              _rule_set_sql)
            _rule_set_id = self.app_conn.query_scalar(_rule_set_sql)

            self._logger.info("Generated rule set id is: %s" % _rule_set_id)

            rules_data = _json_config_data["rules"]
            for rule in rules_data:
                rule_data = {
                    "RULE_ID": rule["ruleId"],
                    "RULE_SET_ID": _rule_set_id,
                    "SUB_LEVEL_METRICS": rule["subLevelMetrics"],
                    "PARAMETER1": rule["parameter1"],
                    "PARAMETER2": rule["parameter2"],
                    "PARAMETER3": rule["parameter3"],
                    "ENABLED": "T",
                    "CREATED_BY": _json_config_data["createdBy"],
                    "CREATED_DATE": "",
                    "UPDATED_BY": _json_config_data["createdBy"],
                    "UPDATED_DATE": ""
                }
                self._logger.debug(rule_data)

                _rules_sql = """
                INSERT INTO AFM_RULES
                ( [RULE_ID], [RULE_SET_ID], [SUB_LEVEL_METRICS], [PARAMETER1], [PARAMETER2], [PARAMETER3],
                  [ENABLED], [CREATED_BY], [CREATED_DATE], [UPDATED_BY], [UPDATED_DATE]
                )
                VALUES (
                  {RULE_ID}, {RULE_SET_ID}, '{SUB_LEVEL_METRICS}', '{PARAMETER1}', '{PARAMETER2}', '{PARAMETER3}',
                  '{ENABLED}', '{CREATED_BY}', getdate(), '{UPDATED_BY}', getdate()
                )""".format(**rule_data)
                self._logger.info("SQL for AFM_RULES table is: %s" %
                                  _rules_sql)
                self.app_conn.execute(_rules_sql)

            if _rule_type == "RETAILER":
                _sql = """SELECT COUNT(*) FROM AFM_RETAILER_RULE 
                WHERE cycle_key = {cycleKey}""".format(cycleKey=_cycle_key)
                _exist = self.app_conn.query_scalar(_sql)
                if _exist != 0:
                    _update_sql = "UPDATE AFM_RETAILER_RULE SET owner='{owner}', rule_set_id = {rule_set_id} " \
                                  "WHERE cycle_key = {cycleKey}; "\
                        .format(cycleKey=_cycle_key,
                                owner=_owner,
                                rule_set_id=_rule_set_id)
                    self._logger.info(
                        "Sql for updating table AFM_RETAILER_RULE is: %s" %
                        _update_sql)

                    # calling scd api to execute update/delete statements
                    _body = {
                        "sql": _update_sql,
                        "actioner": "ben.wu",
                        "log_detail": True,
                        "batch_size": 100,
                        "db_type": "MSSQL",
                        "table_schema": "(COMMON)"
                    }
                    # resp = requests.post(self.scd_url, data=json.dumps(_body))
                    resp = requests.post(self.scd_url, json=_body)
                    if resp.status_code != requests.codes.ok:
                        self._logger.warning("The response result is: %s" %
                                             resp.json())
                        self._logger.error(
                            "Calling API failed with body: %s. Refer to API: %s"
                            % (_body, self.scd_url))

                else:
                    _insert_sql = """
                    INSERT INTO AFM_RETAILER_RULE(cycle_key, owner, rule_set_id)
                    VALUES({0}, '{1}', {2});""".format(_cycle_key, _owner,
                                                       _rule_set_id)
                    self._logger.info(
                        "SQL for inserting table AFM_RETAILER_RULE is: %s" %
                        _insert_sql)
                    self.app_conn.execute(_insert_sql)

            if _rule_type == "SVR":
                _sql = """
                SELECT COUNT(*) FROM AFM_SVR_RULE
                WHERE vendor_key = {vendorKey} AND retailer_key = {retailerKey}
                """.format(vendorKey=_vendor_key, retailerKey=_retailer_key)
                _exist = self.app_conn.query_scalar(_sql)
                if _exist != 0:
                    _update_sql = """
                    UPDATE AFM_SVR_RULE SET owner = '{owner}', rule_set_id = {rule_set_id} 
                    WHERE vendor_key = {vendorKey} AND retailer_key = {retailerKey}
                    AND cycle_key = {cycleKey}
                    """.format(owner=_owner,
                               rule_set_id=_rule_set_id,
                               vendorKey=_vendor_key,
                               retailerKey=_retailer_key,
                               cycleKey=_cycle_key)
                    self._logger.info(
                        "SQL for updating table AFM_SVR_RULE is: %s" %
                        _update_sql)

                    # calling scd api to execute update/delete statements
                    _body = {
                        "sql": _update_sql,
                        "actioner": "ben.wu",
                        "log_detail": True,
                        "batch_size": 100,
                        "db_type": "MSSQL",
                        "table_schema": "(COMMON)"
                    }
                    # resp = requests.post(self.scd_url, data=json.dumps(_body))
                    resp = requests.post(self.scd_url, json=_body)
                    if resp.status_code != requests.codes.ok:
                        self._logger.warning("The response result is: %s" %
                                             resp.text)
                        self._logger.error(
                            "Calling API failed with body: %s. Refer to API: %s"
                            % (_body, self.scd_url))

                else:
                    _insert_sql = """
                    INSERT INTO AFM_SVR_RULE(vendor_key, retailer_key, owner, rule_set_id, cycle_key)
                    VALUES({0}, {1}, '{2}', {3}, {4});
                    """.format(_vendor_key, _retailer_key, _owner,
                               _rule_set_id, _cycle_key)
                    self._logger.info(
                        "SQL for inserting table AFM_SVR_RULE is: %s" %
                        _insert_sql)
                    self.app_conn.execute(_insert_sql)

            # insert related schedule.
            self.gen_schedule()

            self._logger.info(
                "Insert AFM config data completed. Please check rule_set_id: %s in related tables."
                % _rule_set_id)

        except Exception as e:
            self._logger.warning(
                "WARNING: Error found. Please fix it and re-run this script.")
            self._logger.warning(e)
            raise

        finally:
            if self.app_conn:
                self.app_conn.close_connection()

    def get_job_def_id(self, job_name):
        _job_name = job_name
        _job_url = self.meta[
            "api_schedule_str"]  # http://engv3dstr2.eng.rsicorp.local/common
        _headers = {  #"tokenid": "eyJhbGciOiJIUzI1NiJ9.eyJjb29raWVWYWx1ZSI6IkFRSUM1d00yTFk0U2Zjd1ZidkJILXZOWFhEYS1HQm1ETlVpd240dWtMSzBsNEJjLipBQUpUU1FBQ01ERUFBbE5MQUJNek16azRPVEkyTXpFNU5EZzJNemMwTnpZeUFBSlRNUUFBKiIsInVzZXJJZCI6ImJlbi53dUByc2ljb3JwLmxvY2FsIiwiY29va2llTmFtZSI6InJzaVNzb05leHRHZW4iLCJzdGF0dXMiOiJzdWNjZXNzIiwiaWF0IjoxNTM0ODE1NTAxfQ.Hbv_wcsEqmUBFTy64BTf15nWC94fsFTfmt3LZMq24Ag",
            "content-type": "application/json"
        }
        _job_def_url = _job_url + '/schedule/jobdefinitions'

        self._logger.info("URL to find the job definition is: %s " %
                          _job_def_url)
        res = requests.get(url=_job_def_url, headers=_headers, verify=False)
        if res.text == "invalid token":
            self._logger.warning(
                "WARNING: Please update the tokenid manually. "
                "Then rerun this script again!!!")
            exit(1)

        x = [dct["id"] for dct in res.json() if dct["jobDefName"] == _job_name]
        if not x:
            self._logger.info(
                "There is no job id found for job: %s. You can refer to API: %s"
                % (_job_name, _job_def_url))
            exit(1)
        # returning job definition id
        return x[0]

    def gen_schedule(self):
        self._logger.info(
            "Inserting related schedule schedule for OSARetailerAFM.")
        _rule_type = self.params["ruleType"].upper()

        # only creating schedule for RETAILER rule.
        if _rule_type == "RETAILER":
            _job_name = "OSARetailerAFM"
            _job_schedule_id = self.get_job_def_id(_job_name)
            _cycle_key = self.params["cycleKey"]
            _group_name = "{0}:{1}".format(_job_schedule_id, _cycle_key)
            _schedule_name = "{0}:{1}".format(_group_name, _job_name)
            sch_params = dict(
                creater="*****@*****.**",
                groupName=_group_name,
                jobDefinitionId=_job_schedule_id,
                # parametersContext="",
                # priority=1,
                # scheduleExpression="",
                scheduleName=_schedule_name,
                scheduleType="EVENT")
            loader = scheduler.ScheduleParamsToTable(self.meta, sch_params)
            loader.load_data()

        # OSMAlerting schedule will take care of SVR rule.
        if _rule_type == "SVR":
            pass
    def __init__(self, meta, params=None, init_flag=False, logger=None):
        """
        # sync feedback data from RDP side to IRIS(OSA) side by incremental via event_key.
        # 1, only sync those vendor&retailer which applied OSA Service.
        # 2, for Those new vendor&retailer, copy all historical data when initialization.

        :param meta: [mandatory] config data from config.properties file
        :param params: 2 cases here. depends on whether sync rdp feedback for whole RDP or new customer. see below.
            1, rdp_id: if rdp_id was given, then sync all data for this given RDP. otherwise, sync data from all related RDPs.
               Noted: rdp_id will be passed when calling this service via REST API.
            2, vendor_key:   mandatory only when init_flag is True.
               retailer_key: mandatory only when init_flag is True
               Noted: These 2 parameters will not be passed from REST API but called directly by deploy scripts.
        :param init_flag:
            if init_flag is True: then only sync feedback data for given vendor & retailer. This is used when introducing new customer.
            if init_flat is False: sync all customers' data from RDP periodically(e.g. sync daily).
        :param logger:
        """
        self.meta = meta
        self._params = {} if params is None else params
        self._rdp_id = self._params.get("rdpId", None)
        self._fact_type = 'fdbk'
        self._init_flag = init_flag
        self._vendor_key = self._params.get("vendor_key", None)
        self._retailer_key = self._params.get("retailer_key", None)
        self._debug = self._params.get('debug', 'N')
        self._default_rdp = "RDP_AUX"

        self._log_file = './log/sync_fdbk_%s_%s.log' % (
            self._rdp_id, datetime.datetime.now().strftime('%Y%m%d'))
        self.logger = logger if logger else Logger(log_level="debug",
                                                   target="console|file",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   log_file=self._log_file,
                                                   sql_conn=None)

        self.osa_app_conn = MSOperation(meta=self.meta, logger=self.logger)
        self.osa_dw_conn = DWOperation(meta=self.meta, logger=self.logger)
        self.max_event_key = None

        # we already know feedback table name of RDP
        self.source_table_rdp = "DS_FACT_FEEDBACK"  # source table in RDP side.
        self.staging_import_table_osa = "STAGE_FACT_FEEDBACK_RDP"  # used to store sync data from RDP table (same structure as table DS_FACT_FEEDBACK)
        self.target_table_osa = "FACT_FEEDBACK"  # final table in OSA side
        self.capacity = Capacity(meta=meta)

        self.dct_sync_data = copy.deepcopy(
            self.meta)  # required for calling sync_data module
        self.dct_sync_data[
            "meta"] = self.meta  # required for calling sync_data module
        self.dct_sync_data["target_osa_conn"] = self.osa_dw_conn
        self.dct_sync_data["target_dw_schema"] = self.meta[
            'db_conn_vertica_common_schema']
        self.dct_sync_data["target_dw_table"] = self.staging_import_table_osa
        self.dct_sync_data["logger"] = self.logger

        # [True|False(default)] True: direct connection between Vertica clusters. False: using vsql.
        self.dct_sync_data["dw_conn_vertica"] = False
        # self.dct_sync_data["dw_conn_vertica"] = True

        self.transfer = TransferData(dct_sync_data=self.dct_sync_data)
class Feedback(object):
    def __init__(self, meta, params=None, init_flag=False, logger=None):
        """
        # sync feedback data from RDP side to IRIS(OSA) side by incremental via event_key.
        # 1, only sync those vendor&retailer which applied OSA Service.
        # 2, for Those new vendor&retailer, copy all historical data when initialization.

        :param meta: [mandatory] config data from config.properties file
        :param params: 2 cases here. depends on whether sync rdp feedback for whole RDP or new customer. see below.
            1, rdp_id: if rdp_id was given, then sync all data for this given RDP. otherwise, sync data from all related RDPs.
               Noted: rdp_id will be passed when calling this service via REST API.
            2, vendor_key:   mandatory only when init_flag is True.
               retailer_key: mandatory only when init_flag is True
               Noted: These 2 parameters will not be passed from REST API but called directly by deploy scripts.
        :param init_flag:
            if init_flag is True: then only sync feedback data for given vendor & retailer. This is used when introducing new customer.
            if init_flat is False: sync all customers' data from RDP periodically(e.g. sync daily).
        :param logger:
        """
        self.meta = meta
        self._params = {} if params is None else params
        self._rdp_id = self._params.get("rdpId", None)
        self._fact_type = 'fdbk'
        self._init_flag = init_flag
        self._vendor_key = self._params.get("vendor_key", None)
        self._retailer_key = self._params.get("retailer_key", None)
        self._debug = self._params.get('debug', 'N')
        self._default_rdp = "RDP_AUX"

        self._log_file = './log/sync_fdbk_%s_%s.log' % (
            self._rdp_id, datetime.datetime.now().strftime('%Y%m%d'))
        self.logger = logger if logger else Logger(log_level="debug",
                                                   target="console|file",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   log_file=self._log_file,
                                                   sql_conn=None)

        self.osa_app_conn = MSOperation(meta=self.meta, logger=self.logger)
        self.osa_dw_conn = DWOperation(meta=self.meta, logger=self.logger)
        self.max_event_key = None

        # we already know feedback table name of RDP
        self.source_table_rdp = "DS_FACT_FEEDBACK"  # source table in RDP side.
        self.staging_import_table_osa = "STAGE_FACT_FEEDBACK_RDP"  # used to store sync data from RDP table (same structure as table DS_FACT_FEEDBACK)
        self.target_table_osa = "FACT_FEEDBACK"  # final table in OSA side
        self.capacity = Capacity(meta=meta)

        self.dct_sync_data = copy.deepcopy(
            self.meta)  # required for calling sync_data module
        self.dct_sync_data[
            "meta"] = self.meta  # required for calling sync_data module
        self.dct_sync_data["target_osa_conn"] = self.osa_dw_conn
        self.dct_sync_data["target_dw_schema"] = self.meta[
            'db_conn_vertica_common_schema']
        self.dct_sync_data["target_dw_table"] = self.staging_import_table_osa
        self.dct_sync_data["logger"] = self.logger

        # [True|False(default)] True: direct connection between Vertica clusters. False: using vsql.
        self.dct_sync_data["dw_conn_vertica"] = False
        # self.dct_sync_data["dw_conn_vertica"] = True

        self.transfer = TransferData(dct_sync_data=self.dct_sync_data)

    def _populate_source_config(self, source_config):
        self.logger.debug("The source config is: %s" % source_config)

        _src_config = {}
        if os.name == 'nt':
            _src_config["temp_file_path"] = "d:"
        elif os.name == 'posix':
            _src_config["temp_file_path"] = "/tmp"

        # Getting user account from config.properties file first.
        if self.meta.get("db_conn_vertica_rdp_username"):
            _src_config["dw.etluser.id"] = self.meta.get(
                "db_conn_vertica_rdp_username")
            if self.meta.get("db_conn_vertica_rdp_password"):
                _src_config["dw.etluser.password"] = self.meta.get(
                    "db_conn_vertica_rdp_password")
            else:
                _pmp_pwd = get_password(
                    username=self.meta.get("db_conn_vertica_rdp_username"),
                    meta=self.meta)
                # The pwd should be encrypted in order to: 1, align with else part, 2, pass it to db.sync_data module
                _src_config["dw.etluser.password"] = Crypto().encrypt(_pmp_pwd)
        # if not configed then get them directly from RDP config.
        else:
            _src_config["dw.etluser.id"] = source_config.get("dw.etluser.id")
            # the pwd is encrypted
            _src_config["dw.etluser.password"] = source_config.get(
                "dw.etluser.password")

        # required info for calling sync_data module.
        _src_config["dw.server.name"] = source_config.get("dw.server.name")
        _src_config["dw.db.name"] = source_config.get("dw.db.name")
        _src_config["dw.db.portno"] = source_config.get("dw.db.portno", 5433)
        _src_config["dw.schema.name"] = source_config.get("dw.schema.name")

        self.logger.debug("srouce config is: %s" % _src_config)
        self.dct_sync_data["source_config"] = _src_config

        # Create the connection to RDP Vertica Cluster. which is the source Vertica cluster
        rdp_meta = copy.deepcopy(self.meta)
        tmp_rdp_meta = {
            'db_conn_vertica_servername': _src_config["dw.server.name"],
            'db_conn_vertica_port': _src_config["dw.db.portno"],
            'db_conn_vertica_dbname': _src_config["dw.db.name"],
            'db_conn_vertica_username': _src_config["dw.etluser.id"],
            'db_conn_vertica_password': _src_config["dw.etluser.password"],
            'db_conn_vertica_password_encrypted': "true"
        }
        rdp_meta.update(tmp_rdp_meta)
        self.logger.debug("rdp config is: %s" % rdp_meta)
        rdp_connection = DWOperation(meta=rdp_meta)
        self.dct_sync_data["source_dw"] = rdp_connection

    def main_process(self):
        try:
            # if not introducing new customer and _rdp_id was given,
            # then we will sync all feedback data from given RDP for registered users.
            if self._init_flag is False and self._rdp_id:
                try:
                    rdp_config = Config(meta=self.meta,
                                        hub_id=self._rdp_id).json_data
                    if not rdp_config['configs']:
                        raise Warning(
                            "There is no configs returned for RDP: %s."
                            "Please check if this RDP registered in CP with below URL."
                            "%s/properties/rdps?factType=fdbk" %
                            (self._rdp_id, self.meta["api_config_str"]))
                        # exit(StepStatus.SUCCESS.value)

                    _rdp_schema = rdp_config['configs'].get('dw.schema.name')
                    self.logger.info("Started to sync data from rdp: %s" %
                                     _rdp_schema)

                    # self.dct_sync_data["source_config"] = rdp_config['configs']
                    self._populate_source_config(rdp_config['configs'])

                    self.initialize()

                    _flag = self.load_data()
                    if _flag:
                        # if no data, then no need to process & update variables table.
                        self.process_data()

                        sql = """
                        IF NOT EXISTS(SELECT * FROM VARIABLES WHERE VARIABLE_NAME = '{eventType}')
                            INSERT INTO VARIABLES (VARIABLE_NAME, VARIABLE_VALUE, PREVIOUS_VALUE, INSERT_TIME, UPDATE_TIME)
                            VALUES ('{eventType}', '{value}', '', getdate(), getdate())
                        ELSE
                            UPDATE VARIABLES
                            SET PREVIOUS_VALUE = VARIABLE_VALUE, VARIABLE_VALUE = '{value}',UPDATE_TIME = getdate()
                            WHERE VARIABLE_NAME = '{eventType}'
                        """.format(eventType=_rdp_schema,
                                   value=self.max_event_key)
                        self.logger.info(sql)
                        self.osa_app_conn.execute(sql)

                    self.logger.info("Data sync done for RDP: %s" %
                                     _rdp_schema)

                except Exception as e:
                    self.logger.warning(e)
                    raise
                    # exit(StepStatus.SUCCESS.value)  # exit(0) otherwise Docker container will fail.

            # Else we will get all RDPs from REST API: http://10.172.36.75/config/properties/rdps?factType=fdbk
            # There could be multi RDPs(e.g. for SVR & WM). if so, loop all RDPs
            elif self._init_flag is False and self._rdp_id is None:
                try:
                    rdp_configs = Config(
                        meta=self.meta,
                        rdp_info=True,
                        rdp_fact_type=self._fact_type).json_data
                    if not rdp_configs:
                        raise Warning(
                            "No feedback related RDP found."
                            "Please check if any data returned from below URL."
                            "%s/properties/rdps?factType=fdbk" %
                            (self.meta["api_config_str"]))
                        # exit(StepStatus.SUCCESS.value)

                    for rdp_config in rdp_configs:
                        _rdp_schema = rdp_config['configs'].get(
                            'dw.schema.name')
                        self.logger.info("Started to sync data from rdp: %s" %
                                         _rdp_schema)

                        # self.dct_sync_data["source_config"] = rdp_config['configs']
                        self._populate_source_config(rdp_config['configs'])

                        self.initialize()

                        _flag = self.load_data()
                        if _flag:
                            # if no data, then no need to process & update variables table.
                            self.process_data()

                            sql = """
                            IF NOT EXISTS(SELECT * FROM VARIABLES WHERE VARIABLE_NAME = '{eventType}')
                                INSERT INTO VARIABLES (VARIABLE_NAME, VARIABLE_VALUE, PREVIOUS_VALUE, INSERT_TIME, UPDATE_TIME)
                                VALUES ('{eventType}', '{value}', '', getdate(), getdate())
                            ELSE
                                UPDATE VARIABLES
                                SET PREVIOUS_VALUE = VARIABLE_VALUE, VARIABLE_VALUE = '{value}',UPDATE_TIME = getdate()
                                WHERE VARIABLE_NAME = '{eventType}'
                            """.format(eventType=_rdp_schema,
                                       value=self.max_event_key)
                            self.logger.info(sql)
                            self.osa_app_conn.execute(sql)

                        self.logger.info("Data sync done for RDP: %s" %
                                         _rdp_schema)

                except Exception as e:
                    self.logger.warning(e)
                    raise

            elif self._init_flag is True:
                if self._vendor_key is None or self._retailer_key is None:
                    self.logger.warning(
                        "vendor_key and retailer_key are required when initilize feedback for new customer"
                    )
                    raise ValueError

                # getting fdbk related rdps.
                try:
                    rdp_configs = Config(
                        meta=self.meta,
                        rdp_info=True,
                        rdp_fact_type=self._fact_type).json_data
                    if not rdp_configs:
                        self.logger.warning(
                            "No feedback related RDP found."
                            "Please check if any data returned from below URL."
                            "%s/properties/rdps?factType=fdbk" %
                            (self.meta["api_config_str"]))
                        exit(StepStatus.SUCCESS.value)

                    fdbk_rdps = [
                        str(rdp_config["rdpId"]).upper()
                        for rdp_config in rdp_configs
                    ]

                    # change table name in case conflict with normal sync process
                    self.dct_sync_data[
                        "target_dw_table"] = "{0}_{1}_{2}".format(
                            self.staging_import_table_osa, self._vendor_key,
                            self._retailer_key)

                    _silo_config = Config(
                        meta=self.meta,
                        vendor_key=self._vendor_key,
                        retailer_key=self._retailer_key).json_data
                    _silo_type = _silo_config['configs'].get(
                        'etl.silo.type', 'SVR')
                    _rdp_id = _silo_config['configs'].get('rdp.db.name')

                    # RDP_AUX is default rdp id for feedback etl on PRODUCTION.
                    # 1, if there is no RDP for given silo. then exit.
                    if not _rdp_id or str(_rdp_id).strip() == '':
                        self.logger.warning(
                            "There is no RDP silo configed for the given vendor:%s "
                            "and retailer:%s. So no need to sync feedback." %
                            (self._vendor_key, self._retailer_key))
                        exit(StepStatus.SUCCESS.value)

                    # 2, Getting configed RDP list, and check if there are feedback related RDPs.
                    _tmp_rdp_lst = str(_rdp_id).upper().split(sep=",")
                    _rdp_lst = [_tmp.strip() for _tmp in _tmp_rdp_lst]
                    # common_rdps is RDP silo configed for syncing feedback data for given silo(vendor&retailer)
                    common_rdps = list(set(_rdp_lst).intersection(fdbk_rdps))
                    if common_rdps is None:
                        self.logger.warning(
                            "There is no RDP silo configed for the given vendor:%s "
                            "and retailer:%s. So no need to sync feedback." %
                            (self._vendor_key, self._retailer_key))
                        exit(StepStatus.SUCCESS.value)

                    # If there is 1 or more than 1 feedback related rdps configed, then loop them to sync feedback data,
                    # Normally, there should be only 1. or no feedback rdp configed.
                    for common_rdp in common_rdps:
                        _rdp_id = common_rdp
                        self.logger.info(
                            "Started to sync data from rdp: %s for given vendor:%s and retailer:%s. "
                            % (_rdp_id, self._vendor_key, self._retailer_key))

                        # if RDP is not RDP_AUX, Won't exit but log a warning.
                        if _rdp_id != self._default_rdp:
                            self.logger.warning(
                                "Please be noted: The RDP is:%s. It is not RDP_AUX."
                                % _rdp_id)

                        # WM silos are also following above logic.
                        # all hosted silo are ultilizing RDP_AUX to transfer feedback data. not sure about Walmart.
                        # if str(_silo_type).upper() in ["WMSSC", "WMCAT", "SASSC", "WMINTL"]:
                        #     _rdp_id = self._default_rdp  # WM rdp is RDP_AUX as well?

                        rdp_config = Config(meta=self.meta,
                                            hub_id=_rdp_id).json_data
                        if not rdp_config['configs']:
                            self.logger.warning(
                                "There is no configs for RDP: %s. Please check following URL:"
                                "%s/properties/%s/%s" %
                                (_rdp_id, self.meta["api_config_str"], _rdp_id,
                                 _rdp_id))
                            exit(StepStatus.SUCCESS.value)

                        _rdp_schema = rdp_config['configs'].get(
                            'dw.schema.name')
                        self.logger.info(
                            "Started to init feedback data from rdp: %s for "
                            "given vendor:%s and retailer:%s " %
                            (_rdp_schema, self._vendor_key,
                             self._retailer_key))

                        # self.dct_sync_data["source_config"] = rdp_config['configs']
                        self._populate_source_config(rdp_config['configs'])

                        self.initialize()

                        _flag = self.load_data()
                        if _flag:
                            # if no data, then no need to process.
                            self.process_data()

                        self.logger.info("Data sync done for RDP: %s" %
                                         _rdp_id)

                except Exception as e:
                    self.logger.warning(e)
                    self.logger.warning(
                        "Please check if any warning or error messages when doing the initialization!"
                    )

        finally:
            if self.osa_app_conn:
                self.osa_app_conn.close_connection()
            if self.osa_dw_conn:
                self.osa_dw_conn.close_connection()

    def initialize(self):
        """
        Create local temp tables , and DDLs required to process this fact type
        :return:
        """
        self.logger.info("Initialize...")

        # recreate this table for every RDP. no need to truncate any longer.
        # sql = "TRUNCATE TABLE {cmnSchema}.{targetTable}"\
        #     .format(cmnSchema=self.dct_sync_data["target_dw_schema"], targetTable=self.staging_import_table_osa)
        # self.logger.info(sql)
        # self.osa_dw.execute(sql)

        sql = """
        --Store data from RDP table.
        DROP TABLE IF EXISTS {cmnSchema}.{importTable};
        CREATE TABLE {cmnSchema}.{importTable}
        (
            EVENT_KEY int NOT NULL,
            RETAILER_KEY int,
            VENDOR_KEY int,
            STORE_VISIT_DATE date,
            PERIOD_KEY int NOT NULL,
            TYPE varchar(1),
            TYPE_DATE varchar(10),
            ALERT_ID int,
            ALERT_TYPE varchar(64),
            MERCHANDISER_STORE_NUMBER varchar(512),
            STORE_ID varchar(512),
            MERCHANDISER_UPC varchar(512),
            INNER_UPC varchar(512),
            MERCHANDISER varchar(100),
            STORE_REP varchar(1000),
            SOURCE varchar(1000),
            BEGIN_STATUS varchar(255),
            ACTION varchar(255),
            FEEDBACK_DESCRIPTION varchar(255),
            FEEDBACK_HOTLINEREPORTDATE date,
            FEEDBACK_ISININVENTORY varchar(5),
            ZIP_CODE varchar(64),
            ARTS_CHAIN_NAME varchar(255),
            UPC_STATUS varchar(255),
            MSI varchar(255)
        )
        UNSEGMENTED ALL NODES;
        """.format(cmnSchema=self.dct_sync_data["target_dw_schema"],
                   importTable=self.dct_sync_data["target_dw_table"])
        self.logger.info(sql)
        self.osa_dw_conn.execute(sql)

    def load_data(self):
        """
        # Load data from RDP table ds_fact_feedback to local temp tables.
        There is an column event_key which is incremental for all customers in ds_fact_feedback table.
        we can save the snapshot of this column to variable table, and do the incremental every time based on this column.
        There are few cases here:
        1, Routinely, There will be a scheduled job to sync the whole feedback data for valid customers from related RDP silo.
           And save the snapshot of the event_key from previous loading for next incremental loading.
        2, if on-boarding a new vendor & retailer customer. Getting rdp_event_key from variable for related RDP silo. (rdp_event_key is from previous loading)
           and then sync feedback data from related RDP silo only for this given customer when event_key < rdp_event_key.
           Then case1 will take care the rest of feedback data.
        :return:
        """
        rdp_schema = self.dct_sync_data["source_config"].get('dw.schema.name')

        # rdp_aux.ds_fact_feedback
        source_table = "{rdpSchema}.{rdptableName}"\
            .format(rdpSchema=rdp_schema,
                    rdptableName=self.source_table_rdp)
        # common.stage_fact_feedback_rdp
        target_table = "{dwSchema}.{importTable}"\
            .format(dwSchema=self.dct_sync_data["target_dw_schema"],
                    importTable=self.dct_sync_data["target_dw_table"])

        self.logger.info(
            "Ready to load Data from {srouceTable} to {targetTable}".format(
                targetTable=target_table, srouceTable=source_table))

        insert_columns = " EVENT_KEY, RETAILER_KEY, VENDOR_KEY, STORE_VISIT_DATE, PERIOD_KEY, TYPE, TYPE_DATE," \
                         " ALERT_ID, ALERT_TYPE, MERCHANDISER_STORE_NUMBER, STORE_ID, MERCHANDISER_UPC, INNER_UPC," \
                         " MERCHANDISER, STORE_REP, SOURCE, BEGIN_STATUS, ACTION, FEEDBACK_DESCRIPTION," \
                         " FEEDBACK_HOTLINEREPORTDATE, FEEDBACK_ISININVENTORY, ZIP_CODE, ARTS_CHAIN_NAME, UPC_STATUS, MSI "

        try:
            self.logger.info(
                "Getting the previous Event_key from last run for incremental load."
            )
            _event_sql = "SELECT VARIABLE_VALUE FROM variables " \
                         "WHERE VARIABLE_NAME = '{rdpName}'".format(rdpName=rdp_schema)
            self.logger.info(_event_sql)
            event_key = self.osa_app_conn.query_scalar(_event_sql)

            self.logger.info(
                "Getting customer info which only applied OSA services as filter"
            )
            sql = "SELECT DISTINCT retailer_key, vendor_key FROM AP_ALERT_CYCLE_MAPPING " \
                  "UNION " \
                  "SELECT DISTINCT retailer_key, vendor_key FROM AP_ALERT_CYCLE_RC_MAPPING"
            self.logger.info(sql)
            results = self.osa_app_conn.query(sql)
            if not results:
                raise Warning(
                    "There is no data in table AP_ALERT_CYCLE_MAPPING. Please check sql: %s"
                    % sql)
                # exit(StepStatus.SUCCESS.value)

            user_filters = [
                'SELECT ' + str(result.retailer_key) + ',' +
                str(result.vendor_key) for result in results
            ]
            user_filter_str = ' UNION ALL '.join(user_filters)
            self.logger.info("Customer filters are: %s" % user_filter_str)

            # incremental filter from RDP table
            where_sql = "EVENT_KEY > {eventKey} AND SOURCE != 'ARIA' " \
                        "AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter})"\
                .format(eventKey=event_key,
                        userFilter=user_filter_str)

            # TODO2DONE: how to set default value? use -1
            # copy all if there is no value in variables table.
            if not event_key:
                self.logger.warning(
                    "There is no value set in variables table for RDP:{name}, "
                    "So copy the whole table".format(name=rdp_schema))
                where_sql = " SOURCE != 'ARIA' AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter})"\
                    .format(eventKey=event_key,
                            userFilter=user_filter_str)
                event_key = -1  # check if this is the first run.

            if self._init_flag is True:
                if event_key == -1:  # event_key is None
                    self.logger.warning(
                        "There is no event_key logged in variables table for the given RDP: %s."
                        "So Let's wait for the routine job to sync the whole rdp feedback data"
                        % rdp_schema)
                    return False

                self.logger.info("Generating init feedback filters")
                where_sql = "EVENT_KEY <= {eventKey} AND SOURCE != 'ARIA' " \
                            "AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter}) " \
                            "AND RETAILER_KEY={retailerKey} AND VENDOR_KEY={vendorKey} "\
                    .format(eventKey=event_key,
                            userFilter=user_filter_str,
                            retailerKey=self._retailer_key,
                            vendorKey=self._vendor_key)

            self.logger.debug("The filters are: %s" % where_sql)

            # form the fetch query from RDP and then Insert into the target table
            fetch_query = """
            SELECT /*+ label(GX_IRIS_SYNCFEEDBACK)*/ {insertQuery} FROM {sourceTable}
            WHERE {whereSql}
            """.format(insertQuery=insert_columns,
                       sourceTable=source_table,
                       whereSql=where_sql)
            self.logger.info("fetch_query is : %s" % fetch_query)

            self.logger.info(
                ">>Loading {factType} Data from event_key:{eventKey} start at: {timestamp}<<"
                .format(factType=self._fact_type,
                        eventKey=event_key,
                        timestamp=datetime.datetime.now()))

            self.dct_sync_data["target_column"] = insert_columns
            self.dct_sync_data["source_sql"] = fetch_query

            row_count = self.transfer.transfer_data(
                dct_sync_data=self.dct_sync_data)

            self.logger.info(
                ">>Done loaded {cnt} rows from event_key:{eventKey} completed at: {timestamp}<<"
                .format(cnt=row_count,
                        factType=self._fact_type,
                        eventKey=event_key,
                        timestamp=datetime.datetime.now()))

            # if no data transfered, then update variables with previous value.
            sql = "SELECT /*+ label(GX_IRIS_SYNCFEEDBACK)*/ nvl(max(event_key), {oldEventKey}) " \
                  "FROM {schemaName}.{importTable} "\
                .format(schemaName=self.dct_sync_data["target_dw_schema"],
                        importTable=self.dct_sync_data["target_dw_table"],
                        oldEventKey=event_key)
            self.logger.info(sql)
            self.max_event_key = self.osa_dw_conn.query_scalar(sql)

            # max_event_key = -1   # testing purpose
            if self.max_event_key == -1:
                self.logger.warning(
                    "There is no feedback data in RDP table: {0}".format(
                        source_table))
                return False

            return True

        except Exception as e:
            self.logger.warning(e)
            raise

        finally:
            pass

    def process_data(self):
        """
        after load_data part completes. sync data from temp table to related schemas.
        :return:
        """
        try:
            self.logger.info("Processing feedback start...")

            # loop retailer to insert feedback data
            sql = "SELECT DISTINCT retailer_key " \
                  "FROM {cmnSchema}.{importTable}"\
                .format(cmnSchema=self.dct_sync_data["target_dw_schema"],
                        importTable=self.dct_sync_data["target_dw_table"])
            self.logger.info(sql)
            retailers = self.osa_dw_conn.query(sql)

            if retailers.rowcount == 0:
                self.logger.warning(
                    "There is no data in table {cmnSchema}.{importTable}."
                    "It could be no incremental data. Please check fetch_query against RDP database"
                    .format(cmnSchema=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"]))

            for retailer in retailers:
                retailer_key = retailer.retailer_key
                osa_schema = self.capacity.get_retailer_schema_name(
                    retailer_key)

                # Finally, run the sql
                msg = "Processing fdbk data within retailer {retailerKey}:{retailerName}"\
                    .format(retailerKey=retailer_key, retailerName=osa_schema)
                self.logger.info(msg)

                # Normally, There should NOT be duplicated alert_id transfered by incremental.
                # But should consider this case here. Delete existing alertid from target table
                # TODO: delete could have performance issue. consider using switch partition
                delete_sql = "DELETE FROM {osaSchema}.{targetTable} " \
                             "WHERE alert_id IN (SELECT alert_id FROM {cmnSchema}.{importTable} )"\
                    .format(targetTable=self.target_table_osa,
                            osaSchema=osa_schema,
                            cmnSchema=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"])
                self.logger.info(delete_sql)
                self.osa_dw_conn.execute(delete_sql)

                # inserting feedback data into final table fact_feedback from processed table.
                sql = """
                INSERT INTO {osaSchema}.{targetTable}
                (EVENT_KEY, RETAILER_KEY, VENDOR_KEY, STORE_VISITED_PERIOD_KEY, PERIOD_KEY,
                 ALERT_ID, STORE_KEY, MERCHANDISER_STORE_NUMBER,
                 STORE_ID, ITEM_KEY, MERCHANDISER_UPC, UPC, MERCHANDISER, STORE_REP, SOURCE,
                 BEGIN_STATUS, ACTION, FEEDBACK_DESCRIPTION,
                 ON_HAND_PHYSICAL_COUNT, ON_HAND_CAO_COUNT
                )
                SELECT stage.EVENT_KEY, stage.RETAILER_KEY, stage.VENDOR_KEY,
                       TO_CHAR(stage.STORE_VISIT_DATE, 'YYYYMMDD')::int AS STORE_VISITED_PERIOD_KEY,
                       stage.PERIOD_KEY,
                       stage.ALERT_ID,
                       store.STORE_KEY AS STORE_KEY,
                       stage.MERCHANDISER_STORE_NUMBER,
                       COALESCE(store.STORE_ID , alert.STOREID, stage.STORE_ID) AS STORE_ID,
                       item.ITEM_KEY AS ITEM_KEY,
                       stage.MERCHANDISER_UPC,
                       COALESCE(item.UPC, alert.UPC, stage.INNER_UPC, stage.MERCHANDISER_UPC) AS UPC,
                       stage.MERCHANDISER, stage.STORE_REP, stage.SOURCE,
                       stage.BEGIN_STATUS, stage.ACTION, stage.FEEDBACK_DESCRIPTION,
                       0 AS ON_HAND_PHYSICAL_COUNT,
                       0 AS ON_HAND_CAO_COUNT
                FROM {cmnSchema}.{importTable} stage
                LEFT JOIN {osaSchema}.FACT_PROCESSED_ALERT alert
                ON stage.alert_id = alert.alert_id AND alert.issuanceid = 0
                AND alert.retailer_key = {retailerKey} AND stage.vendor_key = alert.vendor_key
                INNER JOIN {cmnSchema}.DIM_PRODUCT item
                ON item.retailer_key = {retailerKey} AND alert.vendor_key = item.vendor_key
                AND item.item_key = alert.item_key
                INNER JOIN {cmnSchema}.DIM_STORE store
                ON store.retailer_key = {retailerKey} AND alert.vendor_key = store.vendor_key
                AND store.store_key = alert.store_key
                WHERE stage.retailer_key = {retailerKey}
                """.format(osaSchema=osa_schema,
                           targetTable=self.target_table_osa,
                           cmnSchema=self.dct_sync_data["target_dw_schema"],
                           importTable=self.dct_sync_data["target_dw_table"],
                           retailerKey=retailer_key)
                self.logger.info(
                    "SQL used to load data to related schema. %s" % sql)
                self.osa_dw_conn.execute(sql)

            self.logger.info("Processing feedback ended...")

        except Exception as e:
            self.logger.warning("Process data for RDP {0} failed: {1}".format(
                self._rdp_id, e))
            raise

        finally:
            if self._debug.upper() == 'N':
                _drop_sql = "DROP TABLE IF EXISTS {schemaName}.{importTable};" \
                    .format(schemaName=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"])
                self.logger.info(_drop_sql)
                self.osa_dw_conn.execute(_drop_sql)
Exemple #15
0
class Scd(object):
    def __init__(self,
                 sql,
                 actioner,
                 log_detail=True,
                 batch_size=100,
                 meta={},
                 db_type=None,
                 table_schema=None,
                 logger=None):
        self.meta = meta
        self.actioner = actioner
        self.log_detail = log_detail
        self.batch_size = batch_size
        self.logger = logger if logger else Logger(log_level="info",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   module_name="scd")
        self.sql = sql
        self.raw_sql_splitted = self.sql.split(' ')
        self.raw_sql_splitted_trimed = [
            str for str in self.raw_sql_splitted if str != ''
        ]
        self.db_type = db_type
        self.table_schema = table_schema
        self.logger.info('[SQL] %s' % self.sql)
        self.logger.info('[SQL splitted] %s' % self.raw_sql_splitted)
        self.mssql = MSOperation(meta=self.meta)
        self.vtc = None
        self.saved_records_count = 0

    def set_insert_delete_update(self):
        cmd = self.raw_sql_splitted_trimed[0].upper()
        if cmd in ('DELETE', 'UPDATE', 'INSERT'):
            self.cmd = cmd
        else:
            raise RuntimeError('%s is not supported.' % cmd)

    def locate_keyword(self, keyword, sql_splitted=[]):
        # find the location of the keyword firstly appears in the sql
        # e.g. locate WHERE
        # UPDATE TEST SET COL = ' WHERE ' WHERE ID=1, the first ' WHERE ' is not a keyword, so return the index of the 2nd WHERE
        keyword = keyword.upper()
        base_seq_splitted = sql_splitted if sql_splitted else self.raw_sql_splitted  # if sql_splitted is specified, locate keyword in sql_splitted, otherwise locate keyword in raw_sql_splitted
        index_keyword = [
            k for k, v in enumerate(base_seq_splitted) if keyword in v.upper()
        ]
        for ind in index_keyword:
            count_of_single_quote_ahead_of_where = sum([
                base_seq_splitted[i].count("'") for i in range(0, ind)
                if "'" in base_seq_splitted[i]
            ])
            i = base_seq_splitted[ind].upper().index(keyword)
            count_of_single_quote_ahead_of_where += base_seq_splitted[ind][
                0:i].count("'")
            if count_of_single_quote_ahead_of_where % 2 == 0:  # the KEYWORD we want to capture is the one has n*2 "'" ahead of it
                return ind
        return -1  # KEYWORD not found

    def locate_keyword_combination(
            self,
            keywords):  #--mainly for parsing merge, may need to be updated
        # find the location of the first keyword, and to check if 2nd, 3rd keywords are just following the 1st keyword in the SQL
        # if match all the keywords, return the index of the first keyword
        # else return -1
        keywords_list = [
            w.upper().strip() for w in keywords.split(' ') if w.strip() != ''
        ]
        sql_splitted = [*self.raw_sql_splitted]
        base_ind = 0
        self.logger.info('Locating %s' % keywords_list)
        while True:
            self.logger.info('Locating 1st keyword "%s"' % keywords_list[0])
            ind = self.locate_keyword(keywords_list[0], sql_splitted)
            if ind == -1:
                self.logger.warning('Keyword not found.')
                return -1
            following = [
                v.upper().strip() for v in sql_splitted[ind + 1:]
                if v.strip() != ''
            ]
            self.logger.info('Getting following %s', following)
            if following[0:len(keywords_list) - 1] == keywords_list[1:]:
                self.logger.info('Keywords matched!')
                return base_ind + ind
            else:
                self.logger.warning('Keywords not matched.')
                sql_splitted = sql_splitted[ind + 1:]
                base_ind += ind

    def get_merge_where(
        self
    ):  #--TODO, not done yet, the sql gramma "update from" for vertica and sqlserver are different, need more time to work on this
        index_using = self.locate_keyword('USING')
        index_on = self.locate_keyword('ON')
        index_when_match = self.locate_keyword_combination('WHEN MATCHED THEN')
        index_when_not_match = self.locate_keyword_combination(
            'WHEN NOT MATCHED THEN')

    def get_where_condition(self):
        index_where = self.locate_keyword('WHERE')
        return '' if index_where == -1 else ' '.join(
            self.raw_sql_splitted[index_where:])

    def get_table_name(self):
        self.set_insert_delete_update()
        return self.raw_sql_splitted_trimed[1].upper(
        ) if self.cmd == 'UPDATE' else self.raw_sql_splitted_trimed[2].upper()

    def get_raw_table_name(self, table_name):
        return table_name.split('.')[-1]

    def get_table_def(self, raw_table_name):
        sql = "SELECT ID, TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA FROM META_SCD_TABLE_DEF WHERE TABLE_NAME='%s'" % raw_table_name
        if self.db_type:
            sql += " AND DB_TYPE='%s'" % self.db_type
        if self.table_schema:
            sql += " AND TABLE_SCHEMA='%s'" % self.table_schema
        rs = self.mssql.query(sql)
        if len(rs) == 0:
            raise ValueError(
                '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] The table is not found.'
                % (raw_table_name, db_type, table_schema))
        elif len(rs) > 1:
            self.logger.info('|ID|TABLE_NAME|DB_TYPE|TABLE_SCHEMA|')
            for r in rs:
                self.logger.info(
                    '|%s|%s|%s|%s|' %
                    (r.ID, r.TABLE_NAME, r.DB_TYPE, r.TABLE_SCHEMA))
            raise ValueError(
                '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] More than one records returned for the table.'
                % (raw_table_name, db_type, table_schema))
        else:
            if rs[0].DB_TYPE == 'MSSQL':
                self.target_conn = self.mssql
            else:
                self.vtc = DWOperation(meta=self.meta)
                self.target_conn = self.vtc
            return rs[0]

    def get_mssql_table_ddl_def(self, db_name, table_name, table_schema):
        ddl_sql = '''SELECT COLUMN_NAME, DATA_TYPE
            FROM %s.INFORMATION_SCHEMA.COLUMNS
            WHERE TABLE_NAME = N'%s'
            AND TABLE_SCHEMA = N'%s'
            ORDER BY ORDINAL_POSITION
        ''' % (db_name, table_name, table_schema)
        rs = self.mssql.query(ddl_sql)
        #return dict([('"%s"'%r.COLUMN_NAME if ' ' in r.COLUMN_NAME else r.COLUMN_NAME, r.DATA_TYPE) for r in rs])
        return dict([(r.COLUMN_NAME, r.DATA_TYPE) for r in rs])

    def get_vertica_table_ddl_def(self, table_name, table_schema):
        ddl_sql = '''SELECT COLUMN_NAME, DATA_TYPE
            FROM COLUMNS
            WHERE TABLE_NAME = '%s'
            AND TABLE_SCHEMA = '%s'
            ORDER BY ORDINAL_POSITION
        ''' % (table_name, table_schema)
        rs = self.vtc.query(ddl_sql)
        #return dict([('"%s"'%r.COLUMN_NAME if ' ' in r.COLUMN_NAME else r.COLUMN_NAME, r.DATA_TYPE) for r in rs])
        return dict([(r.COLUMN_NAME, r.DATA_TYPE) for r in rs])

    def save_to_db(self, table_def, recs):
        pk = json.loads(table_def.PK)
        batch_sqls = []
        columns = 'TABLE_DEF_ID, PK_VALUE, ACTION, RECORD, UPDATE_TIME, ACTIONER' if self.log_detail else 'TABLE_DEF_ID, PK_VALUE, ACTION, UPDATE_TIME, ACTIONER'
        counter = 0
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        for rec in recs:
            pk_dict = collections.OrderedDict()
            for k in pk:
                pk_dict[k] = rec[k]
            pk_value = json.dumps(pk_dict).replace("'", "''")
            if self.log_detail:
                batch_sqls.append(
                    '''SELECT %s, '%s', '%s', '%s', '%s', '%s' ''' %
                    (table_def.ID, pk_value, self.cmd, json.dumps(rec).replace(
                        "'", "''"), now, self.actioner))
            else:
                batch_sqls.append(
                    '''SELECT %s, '%s', '%s', '%s', '%s' ''' %
                    (table_def.ID, pk_value, self.cmd, now, self.actioner))
            counter += 1
            if len(batch_sqls) % self.batch_size == 0:
                self.mssql.execute(
                    '''INSERT INTO META_SCD_CHANGE_LOG(%s) %s''' %
                    (columns, ' UNION ALL '.join(batch_sqls)))
                batch_sqls = []
                self.logger.info('%s has been logged.' % counter)
        if len(batch_sqls) > 0:
            self.mssql.execute('''INSERT INTO META_SCD_CHANGE_LOG(%s) %s''' %
                               (columns, ' UNION ALL '.join(batch_sqls)))
            self.logger.info('%s has been logged.' % counter)

    @Timer()
    def save_data_to_be_changed(self, table_def, where_condition):
        if table_def.DB_TYPE == 'MSSQL':
            table_schema = 'dbo' if table_def.TABLE_SCHEMA == '(COMMON)' else table_def.TABLE_SCHEMA
            table_ddl_def = self.get_mssql_table_ddl_def(
                self.meta['db_conn_mssql_dbname'], table_def.TABLE_NAME,
                table_schema)
        else:  # vertica
            table_schema = self.meta[
                'db_conn_vertica_common_schema'] if table_def.TABLE_SCHEMA == '(COMMON)' else table_def.TABLE_SCHEMA
            table_ddl_def = self.get_vertica_table_ddl_def(
                table_def.TABLE_NAME, table_schema)
        if not self.log_detail:  # only dump primary key columns
            pk = json.loads(table_def.PK)
            table_ddl_def = dict([(c, table_ddl_def[c]) for c in table_ddl_def
                                  if c in pk])
        self.logger.info(table_ddl_def)
        query_columns = ','.join([
            '"%s"' % c if ' ' in c else c for c in list(table_ddl_def.keys())
        ])
        pull_data_sql = 'SELECT %s FROM %s.%s %s' % (
            query_columns, table_schema, table_def.TABLE_NAME, where_condition)
        self.logger.info(pull_data_sql)
        rs = self.target_conn.query(pull_data_sql)
        recs = []
        for r in rs:
            rec = {}
            for c in list(table_ddl_def.keys()):
                #rec[c] = str(eval("r."+c)) if 'TIME' not in table_ddl_def[c].upper() and 'DATE' not in table_ddl_def[c].upper() else eval("r."+c).strftime("%Y-%m-%d %H:%M:%S")
                rec[c] = str(eval('r.__getattribute__("%s")' % c))
            recs.append(rec.copy())
        self.saved_records_count = len(recs)
        self.logger.info('%s records will be saved.' %
                         self.saved_records_count)
        self.save_to_db(table_def, recs)

    @Timer()
    def execute_sql(self):
        self.target_conn.execute(self.sql)

    def main(self):
        try:
            try:
                table_name = self.get_table_name()
                raw_table_name = self.get_raw_table_name(table_name)
                table_def = self.get_table_def(raw_table_name)
                saved = False
                if self.cmd != 'INSERT':
                    where_condition = self.get_where_condition()
                    self.save_data_to_be_changed(table_def, where_condition)
                    saved = True
            except Exception as e:
                self.logger.warning(traceback.format_exc())
                self.logger.warning(
                    'Failed to save the to-be-changed data, start to executing sql.'
                )
            self.execute_sql()
            return '(%(db_type)s)%(table_schema)s.%(table_name)s - impacted %(row_count)s rows, saved %(status)s' % {
                'db_type':
                table_def.DB_TYPE,
                'table_schema':
                table_def.TABLE_SCHEMA,
                'table_name':
                table_def.TABLE_NAME,
                'row_count':
                self.saved_records_count,
                'status':
                'N/A' if self.cmd == 'INSERT' else
                'successfully' if saved else 'unsuccessfully'
            }
        finally:
            if self.mssql:
                self.mssql.close_connection()
            if self.vtc:
                self.vtc.close_connection()