class SyncDimData:
    """
    Get product and store dimension data from the hub.
    DIM_PRODUCT and DIM_STORE hold the pivoted data; the tall tables PRODUCT and STORE
    hold the attributes configured in AP_CONFIG_META_ATTRIBUTE.
    """

    def __init__(self, context):
        self.context = {**context, **context["meta"]}.copy()
        self._hub_ids = context["hub_ids"]
        self._logger = context["logger"]
        self._db = MSOperation(meta=context["meta"])
        self._dw = DWOperation(meta=context["meta"])
        self._common_schema = context["meta"]["db_conn_vertica_common_schema"]
        # self._dw_conn_vertica = self.context["dw_conn_vertica"]
        self._all_dim = self.context["all_dim"]
        self._dim_calendar = "DIM_CALENDAR"
        self._dim_product = "DIM_PRODUCT"
        self._dim_store = "DIM_STORE"
        self._product_cis_src_tbl = "PS_ITEM_DIM_EXT"
        self._store_cis_src_tbl = "PS_STORE_DIM_EXT"
        self._dct_vendor_retailer_hub = {}
        self._dct_vendor_hub = {}
        self._dct_retailer_hub = {}
        self._all_vendor_retailer_hub = self._get_available_vr()

    def _get_column(self):
        """Get the attributes from the config table
        :return:
        """
        self._dct_table = {"DIM_PRODUCT": "", "DIM_STORE": ""}
        self._dct_key = {"DIM_PRODUCT": "ITEM_KEY", "DIM_STORE": "STORE_KEY"}
        self._dct_table_column = {"PRODUCT": [], "STORE": []}
        sql = "SELECT DISTINCT KEY_COLUMN FROM AP_CONFIG_META_ATTRIBUTE WHERE DIMENSION_TYPE = 1"
        self._logger.debug(sql)
        for row in self._db.query(sql):
            self._dct_table_column["PRODUCT"].append(row[0])
        sql = "SELECT DISTINCT KEY_COLUMN FROM AP_CONFIG_META_ATTRIBUTE WHERE DIMENSION_TYPE = 2"
        self._logger.debug(sql)
        for row in self._db.query(sql):
            self._dct_table_column["STORE"].append(row[0])
        for k, v in self._dct_table_column.items():
            self._dct_table_column[k] = """'{}'""".format("', '".join(v))

    def _get_source_config(self, hub_id):
        """Get the source config
        :return:
        """
        self._source_dw = SiloOperation(hub_id=hub_id, meta=self.context["meta"])
        if self.context["meta"]["db_conn_vertica_silo_username"]:
            self._source_dw.config.json_data["configs"]["dw.etluser.id"] = self._source_dw.username
            self._source_dw.config.json_data["configs"]["dw.etluser.password"] = Crypto().encrypt(
                self._source_dw.password)
        silo_type = self._source_dw.config.get_property("etl.silo.type", "None").upper()
        # TODO: for hub silos the type is MD; no parameter found for this yet, so default to host.
        if silo_type in ("WMSSC", "SASSC", "WMCAT", "WMINTL"):
            self._active_status = ""
        else:
            self._active_status = " AND D.ACTIVE = 'T'"

    def _get_available_vr(self):
        """
        Get all available vendor & retailer pairs from the cycle mapping tables,
        and look up the related hub_id via the config service.
        :return: dict like {'vendor,retailer': 'hub_id', ...}
        """
        sql = "SELECT DISTINCT VENDOR_KEY, RETAILER_KEY FROM AP_ALERT_CYCLE_MAPPING " \
              "UNION " \
              "SELECT DISTINCT VENDOR_KEY, RETAILER_KEY FROM AP_ALERT_CYCLE_RC_MAPPING"
        self._logger.debug(sql)
        dct_vendor_retailer_hub = {}
        # dct_vendor_retailer_hub = dict(self._db.query(sql))
        for v_r in self._db.query(sql):
            try:
                config = Config(meta=self.context["meta"],
                                vendor_key=v_r.VENDOR_KEY,
                                retailer_key=v_r.RETAILER_KEY)
                hub_id = config.get_hub_id()
                _key = str(v_r.VENDOR_KEY) + ',' + str(v_r.RETAILER_KEY)
                dct_vendor_retailer_hub[_key] = hub_id
            # In case no config is returned for a given vendor & retailer, skip that pair.
            except Exception as e:
                # self._logger.warning(str(e))
                self._logger.warning(
                    "It seems there is no silo configured for vendor: %s and retailer: %s"
                    % (str(v_r.VENDOR_KEY), str(v_r.RETAILER_KEY)))
                continue
        return dct_vendor_retailer_hub

    def _get_vendor_retailer_from_hub(self, hub_id):
        """Get the vendors and retailers for the hub from CONFIG_CUSTOMERS,
        keeping only the first hub when a pair appears in multiple hubs.
        :return:
        """
        self._str_vendor_retailer_filter = ""
        self._str_vendor_filter = ""
        self._str_retailer_filter = ""
        lst_vendor_retailer_filter = []
        lst_vendor_filter = []
        lst_retailer_filter = []
        dct_vendor_retailer_hub = self._all_vendor_retailer_hub
        sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ "
               " DISTINCT VENDOR_KEY, RETAILER_KEY FROM {self._meta_schema}.CONFIG_CUSTOMERS"
               " WHERE VENDOR_NAME != 'Alert' AND HUB_ID = '{hub_id}'"
               " ORDER BY VENDOR_KEY, RETAILER_KEY".format(hub_id=hub_id, self=self))
        self._logger.debug(sql)
        vendor_retailers = self._source_dw.query(sql)
        for row in vendor_retailers:
            _vr = "{row.VENDOR_KEY},{row.RETAILER_KEY}".format(row=row)
            # The same vendor & retailer could live in multiple hubs,
            # so only take the one that matches the hub from the config service.
            if dct_vendor_retailer_hub.get(_vr) == hub_id or self._all_dim:
                if _vr not in self._dct_vendor_retailer_hub:
                    self._dct_vendor_retailer_hub[_vr] = hub_id
                    lst_vendor_retailer_filter.append(
                        "({row.VENDOR_KEY},{row.RETAILER_KEY})".format(row=row))
                if str(row.VENDOR_KEY) not in self._dct_vendor_hub:
                    self._dct_vendor_hub["{}".format(row.VENDOR_KEY)] = hub_id
                    lst_vendor_filter.append("{}".format(row.VENDOR_KEY))
                if str(row.RETAILER_KEY) not in self._dct_retailer_hub:
                    self._dct_retailer_hub["{}".format(row.RETAILER_KEY)] = hub_id
                    lst_retailer_filter.append("{}".format(row.RETAILER_KEY))
        if lst_vendor_retailer_filter:
            self._str_vendor_retailer_filter = (
                " AND (VENDOR_KEY, RETAILER_KEY) IN ({})"
                "".format(",".join(lst_vendor_retailer_filter)))
            self._logger.debug(self._str_vendor_retailer_filter)
        if lst_vendor_filter:
            self._str_vendor_filter = (" AND VENDOR_KEY IN ({})".format(
                ",".join(list(set(lst_vendor_filter)))))
            self._logger.debug(self._str_vendor_filter)
        if lst_retailer_filter:
            self._str_retailer_filter = (" AND RETAILER_KEY IN ({})".format(
                ",".join(list(set(lst_retailer_filter)))))
            self._logger.debug(self._str_retailer_filter)

    def _recreate_stage_table(self):
        """
        Create the related stage tables on the IRIS DW.
        :return:
        """
        lst_sql = []
        for table in (list(self._dct_table.keys()) + ["PRODUCT", "STORE"]):
            lst_sql.append("""
            DROP TABLE IF EXISTS {self._common_schema}.STAGE_{table};
            CREATE TABLE {self._common_schema}.STAGE_{table}
            LIKE {self._common_schema}.{table} INCLUDING PROJECTIONS;
            """.format(self=self, table=table))
        sql = ''.join(lst_sql)
        self._logger.info(sql)
        self._dw.execute(sql)

        # Create the staging tables for loading CIS source data.
_sql = """ DROP TABLE IF EXISTS {cmnSchema}.STAGE_{itemTable}_CIS; CREATE TABLE {cmnSchema}.STAGE_{itemTable}_CIS LIKE {cmnSchema}.{itemTable} INCLUDING PROJECTIONS; DROP TABLE IF EXISTS {cmnSchema}.STAGE_{storeTable}_CIS; CREATE TABLE {cmnSchema}.STAGE_{storeTable}_CIS LIKE {cmnSchema}.{storeTable} INCLUDING PROJECTIONS; """.format(cmnSchema=self._common_schema, itemTable=self._dim_product, storeTable=self._dim_store) self._logger.info(_sql) self._dw.execute(_sql) def _truncate_stage_table(self): """Truncate stage table :return: """ lst_sql = [] for k in (list(self._dct_table.keys()) + ["PRODUCT", "STORE"]): lst_sql.append( "TRUNCATE TABLE {self._common_schema}.STAGE_{k}".format( self=self, k=k)) sql = ';\n'.join(lst_sql) self._logger.debug(sql) self._dw.execute(sql) def _analyze_table(self): """Analyze table :return: """ lst_sql = [] for k in self._dct_table.keys(): lst_sql.append( "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ ANALYZE_STATISTICS('{self._common_schema}.{k}')" "".format(self=self, k=k)) sql = ';\n'.join(lst_sql) self._logger.debug(sql) self._dw.execute(sql) def _export_calendar_data(self, hub_id): """ Export calendar data to common schema from source schema if no data in DIM_CALENDAR table :param hub_id: :return: """ cnt = self._dw.query_scalar( "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ COUNT(*) " "FROM {self._common_schema}.{self._dim_calendar}".format( self=self)) if cnt != 0: self._logger.info( "Calendar data exists, skip hub_id {}.".format(hub_id)) else: col = ( 'PERIOD_KEY, CALENDAR_KEY, CALENDARNAME, YEAR, YEARNAME, QUARTER, QUARTERNAME, MONTH, MONTHNAME, ' 'PERIOD, PERIODNAME, WEEKENDED, WEEKENDEDNAME, WEEKBEGIN, WEEKBEGINNAME, YEARWEEKNAME, YEARWEEK, ' 'YEARMONTHWEEKNAME, LY_PERIOD_KEY, NY_PERIOD_KEY, DATE_NAME, DATE_VALUE, CAL_PERIOD_KEY, ' '"2YA_PERIOD_KEY", "3YA_PERIOD_KEY", "4YA_PERIOD_KEY"') sql = """ SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ CAL.PERIOD_KEY AS PERIOD_KEY, META.CALENDAR_KEY AS CALENDAR_KEY, MAX(META.CALENDAR_NAME) AS CALENDARNAME, MAX(CAL."YEAR") AS "YEAR", MAX(CAL.YEARNAME) AS YEARNAME, MAX(CAL.YEARQUARTER) AS QUARTER, MAX(CAL.YEARQUARTERNAME) AS QUARTERNAME, MAX(CAL.YEARPERIOD) AS "MONTH", MAX(CAL.YEARPERIODNAME) AS MONTHNAME, MAX(CAL.YEARMONTH) AS PERIOD, MAX(CAL.YEARMONTHNAME) AS PERIODNAME, MAX(CAL.WEEKENDED) AS WEEKENDED, MAX(CAL.WEEKENDEDNAME) AS WEEKENDEDNAME, MAX(CAL.WEEKBEGIN) AS WEEKBEGIN, MAX(CAL.WEEKBEGINNAME) AS WEEKBEGINNAME, (((MAX(CAL.YEAR))::VARCHAR|| ' - Wk '::VARCHAR(6)) || SUBSTR(('00'::VARCHAR(2) ||(MAX(CAL.WEEK))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(MAX(CAL.WEEK))::VARCHAR)) - 2) + 1),2) ) AS YEARWEEKNAME, MAX(CAL.YEARWEEK) AS YEARWEEK, (((((MAX(CAL.YEAR))::VARCHAR|| ' - '::VARCHAR(3)) || SUBSTR (('00'::VARCHAR(2) ||(MAX(CAL.MONTH))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(MAX(CAL.MONTH))::VARCHAR)) - 2) + 1),2)) || ' Wk '::VARCHAR(4)) || SUBSTR(('00'::VARCHAR(2) ||(CEILING(((RANK() OVER (PARTITION BY META.CALENDAR_KEY,MAX(CAL.YEAR),MAX(META.MONTH) ORDER BY CAL.PERIOD_KEY)*1.0) / 7::NUMERIC(18,0))))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(CEILING(((RANK() OVER (PARTITION BY META.CALENDAR_KEY,MAX(CAL.YEAR),MAX(CAL.MONTH) ORDER BY CAL.PERIOD_KEY)*1.0) / 7::NUMERIC(18,0))))::VARCHAR)) - 2) + 1),2)) AS YEARMONTHWEEKNAME, MAX(CAL.LY_PERIOD_KEY) LY_PERIOD_KEY, MAX(CAL.NY_PERIOD_KEY) NY_PERIOD_KEY, TO_CHAR(TO_DATE(CAL.PERIOD_KEY::VARCHAR, 'YYYYMMDD'), 'YYYY-MM-DD') DATE_NAME, TO_DATE(CAL.PERIOD_KEY::VARCHAR, 'YYYYMMDD') DATE_VALUE, CAL.PERIOD_KEY CAL_PERIOD_KEY, MAX("2YA_PERIOD_KEY") "2YA_PERIOD_KEY", MAX("3YA_PERIOD_KEY") 
"3YA_PERIOD_KEY", MAX("4YA_PERIOD_KEY") "4YA_PERIOD_KEY" FROM ( (SELECT 2 AS CALENDAR_KEY, 'STD' AS PREFIX, 1 AS "MONTH", '445 (Standard)' AS CALENDAR_NAME) META JOIN (SELECT C.PERIOD_KEY AS PERIOD_KEY, C.CALENDAR_KEY AS CALENDAR_KEY, C.YEAR, C.WEEK, C.MONTH, (C.YEAR)::VARCHAR(4) AS YEARNAME, QE.YEARQUARTER, QE.YEARQUARTERNAME, ME.YEARPERIOD AS YEARMONTH, ME.YEARPERIODNAME AS YEARMONTHNAME, ME.YEARPERIOD, ME.YEARPERIODNAME, WE.YEARWEEK AS YEARWEEK, WE.WEEKBEGIN, TO_CHAR(TO_DATE ((WE.WEEKBEGIN)::VARCHAR,'YYYYMMDD'::VARCHAR(8)),'MM/DD/YYYY'::VARCHAR(10) ) AS WEEKBEGINNAME, WE.WEEKENDED, TO_CHAR(TO_DATE ((WE.WEEKENDED)::VARCHAR,'YYYYMMDD'::VARCHAR(8)),'MM/DD/YYYY'::VARCHAR(10) ) AS WEEKENDEDNAME, C.LY_PERIOD_KEY, C.NY_PERIOD_KEY, YA2.LY_PERIOD_KEY "2YA_PERIOD_KEY", YA3.LY_PERIOD_KEY "3YA_PERIOD_KEY", YA4.LY_PERIOD_KEY "4YA_PERIOD_KEY" FROM ((({self._dim_schema}.CALENDAR_PERIOD C JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY, MIN(CALENDAR_PERIOD.PERIOD_KEY) AS WEEKBEGIN, MAX(CALENDAR_PERIOD.PERIOD_KEY) AS WEEKENDED, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.WEEK AS WEEK, CASE WHEN (CALENDAR_PERIOD.WEEK > 2000) THEN CALENDAR_PERIOD.WEEK ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.WEEK) END AS YEARWEEK FROM {self._dim_schema}.CALENDAR_PERIOD WHERE ((CALENDAR_PERIOD.WEEK_CODE = 'B'::CHAR(1)) OR (CALENDAR_PERIOD.WEEK_CODE = 'E'::CHAR(1))) GROUP BY CALENDAR_PERIOD.CALENDAR_KEY, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.WEEK) WE ON ( ( (C.CALENDAR_KEY = WE.CALENDAR_KEY) AND (C.YEAR = WE.YEAR) AND (C.WEEK = WE.WEEK)))) JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.MONTH AS PERIOD, CASE WHEN (CALENDAR_PERIOD.MONTH > 2000) THEN (CALENDAR_PERIOD.MONTH)::VARCHAR(6) ELSE CONCAT (CONCAT (CONCAT ('Period '::VARCHAR(7),(CALENDAR_PERIOD.MONTH)::VARCHAR(2)), ', '::VARCHAR(2)),(CALENDAR_PERIOD.YEAR)::VARCHAR(4)) END AS YEARPERIODNAME, CASE WHEN (CALENDAR_PERIOD.MONTH > 2000) THEN CALENDAR_PERIOD.MONTH ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.MONTH) END AS YEARPERIOD FROM {self._dim_schema}.CALENDAR_PERIOD WHERE ((CALENDAR_PERIOD.MONTH_CODE = 'B'::CHAR (1)) OR (CALENDAR_PERIOD.MONTH_CODE = 'E'::CHAR (1))) GROUP BY CALENDAR_PERIOD.CALENDAR_KEY, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.MONTH) ME ON ( ( (C.CALENDAR_KEY = ME.CALENDAR_KEY) AND (C.YEAR = ME.YEAR) AND (C.MONTH = ME.PERIOD)))) JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.QUARTER AS QUARTER, CASE WHEN CALENDAR_PERIOD.QUARTER > 2000 THEN CONCAT(CONCAT (CONCAT ('Q'::VARCHAR(1), (SUBSTR((CALENDAR_PERIOD.QUARTER)::VARCHAR(6), 6))::VARCHAR(2)), ', '::VARCHAR(2)),(CALENDAR_PERIOD.QUARTER)::VARCHAR(4)) ELSE CONCAT(CONCAT (CONCAT ('Q'::VARCHAR(1),(CALENDAR_PERIOD.QUARTER)::VARCHAR(2)), ', '::VARCHAR(2)),(CALENDAR_PERIOD.YEAR)::VARCHAR(4)) END AS YEARQUARTERNAME, CASE WHEN (CALENDAR_PERIOD.QUARTER > 2000) THEN CALENDAR_PERIOD.QUARTER ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.QUARTER) END AS YEARQUARTER FROM {self._dim_schema}.CALENDAR_PERIOD WHERE ((CALENDAR_PERIOD.QUARTER_CODE = 'B'::CHAR (1)) OR (CALENDAR_PERIOD.QUARTER_CODE = 'E'::CHAR (1))) GROUP BY CALENDAR_PERIOD.CALENDAR_KEY, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.QUARTER) QE ON (((C.CALENDAR_KEY = QE.CALENDAR_KEY) AND (C.YEAR = QE.YEAR) AND (C.QUARTER = QE.QUARTER)))) LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA2 ON C.CALENDAR_KEY = YA2.CALENDAR_KEY AND C.LY_PERIOD_KEY = YA2.PERIOD_KEY LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA3 ON C.CALENDAR_KEY = YA3.CALENDAR_KEY AND YA2.LY_PERIOD_KEY = YA3.PERIOD_KEY LEFT JOIN 
{self._dim_schema}.CALENDAR_PERIOD YA4 ON C.CALENDAR_KEY = YA4.CALENDAR_KEY AND YA3.LY_PERIOD_KEY = YA4.PERIOD_KEY WHERE (C.YEAR >= ((DATE_PART ('year'::VARCHAR (4),(NOW ())::TIMESTAMP))::INT- 5))) CAL ON ( (CAL.CALENDAR_KEY = META.CALENDAR_KEY)) ) GROUP BY CAL.PERIOD_KEY,META.CALENDAR_KEY; """.format(self=self) dct_sync_data = self.context.copy() dct_sync_data["source_dw"] = self._source_dw dct_sync_data["source_config"] = self._source_dw.config.json_data[ "configs"] dct_sync_data["target_dw_schema"] = self._common_schema dct_sync_data["target_dw_table"] = self._dim_calendar dct_sync_data["target_column"] = col dct_sync_data["source_sql"] = sql self._logger.debug(sql) sync_data(dct_sync_data) self._logger.info( "Sync hub_id {} dim calendar done".format(hub_id)) def _export_product_data(self, hub_id, cis_source=False): """ Export product data to common schema from source schema :param hub_id: :return: """ _target_stage_table = "STAGE_{}".format(self._dim_product) _src_item_table = "PRODUCT" if cis_source: _target_stage_table = "STAGE_{}_CIS".format(self._dim_product) _src_item_table = "STAGE_{}".format(self._product_cis_src_tbl) # Creating related staging table for CIS source in HUB dw. # Since CIS source tables are external tables, can NOT be used to join other tables directly. sql = """ DROP TABLE IF EXISTS {dimSchema}.{stgItemTable}; CREATE TABLE {dimSchema}.{stgItemTable} AS SELECT RETAILER_KEY, VENDOR_KEY, ITEM_KEY, UPC_ID AS UPC, ATTRIBUTE_NAME as ATTRIBNAME, ATTRIBUTE_VALUE as ATTRIBVALUE, 'T' AS ACTIVE FROM {dimSchema}.{srcItemTable} WHERE 0=0 {filters}; """.format(dimSchema=self._dim_schema, stgItemTable=_src_item_table, srcItemTable=self._product_cis_src_tbl, filters=self._str_vendor_retailer_filter) self._logger.info(sql) self._source_dw.execute(sql) col = ( " RETAILER_KEY, VENDOR_KEY, ITEM_KEY, UPC, ITEM_DESCRIPTION, ITEM_GROUP, OSM_ITEM_NBR, OSM_ITEM_STATUS," "OSM_ITEM_TYPE, OSM_BRAND, OSM_UNIT_PRICE, OSM_CATEGORY, OSM_MAJOR_CATEGORY_NO, OSM_MAJOR_CATEGORY, " "OSM_SUB_CATEGORY_NO, OSM_SUB_CATEGORY, OSM_VENDOR_STK_NBR, OSM_VENDOR_PACK_COST, VENDOR_PACK_QTY, " "OSM_WHSE_PACK_COST, OSM_WHSE_PACK_QTY, DSD_IND, VENDOR_NAME ") sql = """ SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ CC.RETAILER_KEY, CC.VENDOR_KEY, PV.ITEM_KEY, PV.UPC, AGPG.ITEM_DESCRIPTION, CASE WHEN AGPG.ITEM_GROUP IS NULL OR AGPG.ITEM_GROUP = '000000' THEN PV.UPC ELSE AGPG.ITEM_GROUP END AS ITEM_GROUP, AGPG.OSM_ITEM_NBR, AGPG.OSM_ITEM_STATUS, AGPG.OSM_ITEM_TYPE, AGPG.OSM_BRAND,AGPG.OSM_UNIT_PRICE, AGPG.OSM_CATEGORY, AGPG.OSM_MAJOR_CATEGORY_NO, AGPG.OSM_MAJOR_CATEGORY, AGPG.OSM_SUB_CATEGORY_NO, AGPG.OSM_SUB_CATEGORY, AGPG.OSM_VENDOR_STK_NBR, AGPG.OSM_VENDOR_PACK_COST, CASE WHEN length(AGPG.VENDOR_PACK_QTY) < 15 THEN CASE WHEN REGEXP_LIKE(SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15), '^[0-9]*(\.[0-9]+)?$') THEN CASE WHEN SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15)::NUMERIC <= 1 THEN 1 ELSE SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15)::NUMERIC END ELSE 1 END ELSE 1 END VENDOR_PACK_QTY, AGPG.OSM_WHSE_PACK_COST, AGPG.OSM_WHSE_PACK_QTY, CASE WHEN AGPG.OSM_ITEM_TYPE IN ('07','37') THEN '1' ELSE '0' END AS DSD_IND, CC.VENDOR_NAME FROM (SELECT DISTINCT RETAILER_KEY, VENDOR_KEY, VENDOR_NAME FROM {self._meta_schema}.CONFIG_CUSTOMERS WHERE VENDOR_NAME != 'Alert' {self._str_vendor_retailer_filter}) CC JOIN ( SELECT D.VENDOR_KEY, D.ITEM_KEY, MAX(D.UPC) UPC FROM {self._dim_schema}.{itemTable} D JOIN {self._dim_schema}.VENDOR V on V.VENDOR_KEY = D.VENDOR_KEY WHERE D.ITEM_KEY NOT IN (SELECT ITEM_KEY FROM 
                {self._dim_schema}.PRODUCT_GLOBAL_FILTER) {self._active_status}
            GROUP BY D.VENDOR_KEY, D.ITEM_KEY
        ) PV ON PV.VENDOR_KEY = CC.VENDOR_KEY
        LEFT JOIN (
            SELECT AGP.RETAILER_KEY, AGP.VENDOR_KEY, AGP.ITEM_KEY
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'ITEM_GROUP_X' THEN ATTRIBVALUE END) ITEM_GROUP
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'VENDOR_PACK_QTY_X' THEN ATTRIBVALUE END) VENDOR_PACK_QTY
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'ITEM_DESCRIPTION_X' THEN ATTRIBVALUE END) ITEM_DESCRIPTION
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_NBR_X' THEN ATTRIBVALUE END) OSM_ITEM_NBR
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_STATUS_X' THEN ATTRIBVALUE END) OSM_ITEM_STATUS
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_TYPE_X' THEN ATTRIBVALUE END) OSM_ITEM_TYPE
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_WHSE_PACK_QTY_X' THEN ATTRIBVALUE END) OSM_WHSE_PACK_QTY
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_MAJOR_CATEGORY_NO_X' THEN ATTRIBVALUE END) OSM_MAJOR_CATEGORY_NO
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_MAJOR_CATEGORY_X' THEN ATTRIBVALUE END) OSM_MAJOR_CATEGORY
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_CATEGORY_X' THEN ATTRIBVALUE END) OSM_CATEGORY
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_SUB_CATEGORY_NO_X' THEN ATTRIBVALUE END) OSM_SUB_CATEGORY_NO
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_SUB_CATEGORY_X' THEN ATTRIBVALUE END) OSM_SUB_CATEGORY
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_BRAND_X' THEN ATTRIBVALUE END) OSM_BRAND
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_UNIT_PRICE_X' THEN ATTRIBVALUE END) OSM_UNIT_PRICE
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_VENDOR_STK_NBR_X' THEN ATTRIBVALUE END) OSM_VENDOR_STK_NBR
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_VENDOR_PACK_COST_X' THEN ATTRIBVALUE END) OSM_VENDOR_PACK_COST
                 , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_WHSE_PACK_COST_X' THEN ATTRIBVALUE END) OSM_WHSE_PACK_COST
            FROM (
                SELECT AG.RETAILER_KEY, AG.VENDOR_KEY, CONCAT(ATTR_GROUP, '_X') ATTRIBNAME,
                       D.ITEM_KEY, D.ATTRIBVALUE
                FROM {self._meta_schema}.ATTRIBUTEGROUP AG
                JOIN {self._dim_schema}.{itemTable} D
                     ON D.VENDOR_KEY = AG.VENDOR_KEY AND D.ATTRIBNAME = AG.ATTR_NAME {self._active_status}
                WHERE AG.ATTR_GROUP IN ('ITEM_GROUP','VENDOR_PACK_QTY','ITEM_DESCRIPTION','OSM_ITEM_NBR'
                    ,'OSM_ITEM_STATUS','OSM_ITEM_TYPE','OSM_WHSE_PACK_QTY','OSM_MAJOR_CATEGORY_NO'
                    ,'OSM_MAJOR_CATEGORY','OSM_CATEGORY','OSM_SUB_CATEGORY_NO','OSM_SUB_CATEGORY','OSM_BRAND',
                    'OSM_UNIT_PRICE','OSM_VENDOR_STK_NBR','OSM_VENDOR_PACK_COST','OSM_WHSE_PACK_COST')
            ) AGP
            GROUP BY AGP.RETAILER_KEY, AGP.VENDOR_KEY, AGP.ITEM_KEY
        ) AGPG ON AGPG.RETAILER_KEY = CC.RETAILER_KEY AND AGPG.VENDOR_KEY = PV.VENDOR_KEY
                  AND AGPG.ITEM_KEY = PV.ITEM_KEY
        WHERE (CC.RETAILER_KEY, CC.VENDOR_KEY, PV.ITEM_KEY) NOT IN
              (SELECT RETAILER_KEY, VENDOR_KEY, ITEM_KEY FROM {self._dim_schema}.PRODUCT_DEFAULT_FILTER)
        """.format(self=self, itemTable=_src_item_table)
        dct_sync_data = self.context.copy()
        dct_sync_data["source_dw"] = self._source_dw
        dct_sync_data["source_config"] = self._source_dw.config.json_data["configs"]
        dct_sync_data["target_dw_schema"] = self._common_schema
        dct_sync_data["target_dw_table"] = _target_stage_table
        dct_sync_data["target_column"] = col
        dct_sync_data["source_sql"] = sql
        self._logger.debug(sql)
        sync_data(dct_sync_data)

        # sync the tall table (not for CIS).
        if self._str_vendor_filter and not cis_source:
            self._product_columns = self._dct_table_column["PRODUCT"]
            col = " VENDOR_KEY, ITEM_KEY, UPC, ATTRIBNAME, ATTRIBVALUE, ACTIVE "
            sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {col} FROM {self._dim_schema}.PRODUCT WHERE ATTRIBNAME "
                   "IN ({self._product_columns}) {self._str_vendor_filter}".format(self=self, col=col))
            dct_sync_data["target_dw_table"] = "STAGE_PRODUCT"
            dct_sync_data["target_column"] = col
            dct_sync_data["source_sql"] = sql
            self._logger.debug(sql)
            sync_data(dct_sync_data)

    def _export_store_data(self, hub_id, cis_source=False):
        """
        Export store data from the source schema <table_name>
        to the common schema table stage_<table_name>
        :param hub_id:
        :return:
        """
        _target_store_table = "STAGE_{}".format(self._dim_store)
        _src_store_table = "STORE"
        if cis_source:
            _target_store_table = "STAGE_{}_CIS".format(self._dim_store)
            _src_store_table = "STAGE_{}".format(self._store_cis_src_tbl)
            # Create the related staging table for the CIS source in the HUB dw.
            # CIS source tables are external tables, so they can NOT be joined to other tables directly.
            sql = """
            DROP TABLE IF EXISTS {dimSchema}.{stgStoreTable};
            CREATE TABLE {dimSchema}.{stgStoreTable} AS
            SELECT RETAILER_KEY, VENDOR_KEY, STORE_KEY, STORE_ID AS STOREID,
                   ATTRIBUTE_NAME AS ATTRIBNAME, ATTRIBUTE_VALUE AS ATTRIBVALUE, 'T' AS ACTIVE
            FROM {dimSchema}.{srcStoreTable}
            WHERE 0=0 {filters};
            """.format(dimSchema=self._dim_schema,
                       stgStoreTable=_src_store_table,
                       srcStoreTable=self._store_cis_src_tbl,
                       filters=self._str_vendor_retailer_filter)
            self._logger.info(sql)
            self._source_dw.execute(sql)

        col = (
            " RETAILER_KEY, VENDOR_KEY, STORE_KEY, STORE_ID, MARKET_CLUSTER, STATE, CITY, ZIP, OSM_REGION, "
            "OSM_DISTRICT, RSI_ALERT_TYPE, RETAILER_NAME, RSI_BANNER, PRIME_DC, STORE_NAME")
        sql = """
        SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/
            CC.RETAILER_KEY, CC.VENDOR_KEY, SR.STORE_KEY, SR.STOREID, AGSG.MARKET_CLUSTER, SR.STATE,
            CASE WHEN SR.CITY IS NULL AND STATE IS NULL THEN 'N/A'
                 WHEN SR.CITY IS NOT NULL AND STATE IS NULL THEN CONCAT(SR.CITY, ', N/A')
                 WHEN SR.CITY IS NULL AND STATE IS NOT NULL THEN CONCAT('N/A, ', SR.STATE)
                 WHEN SR.CITY IS NOT NULL AND STATE IS NOT NULL THEN CONCAT(CONCAT(SR.CITY, ', '), SR.STATE)
            END CITY,
            SR.ZIP, AGSG.OSM_REGION, AGSG.OSM_DISTRICT,
            COALESCE(SR.RSI_ALERT_TYPE, '0') RSI_ALERT_TYPE, CC.RETAILER_NAME,
            coalesce(AGSG.RSI_BANNER, 'unmapped') AS RSI_BANNER, AGSG.PRIME_DC, AGSG.STORE_NAME
        FROM (SELECT DISTINCT RETAILER_KEY, VENDOR_KEY, RETAILER_NAME
              FROM {self._meta_schema}.CONFIG_CUSTOMERS
              WHERE VENDOR_NAME != 'Alert' {self._str_vendor_retailer_filter}) CC
        JOIN (
            SELECT D.RETAILER_KEY, D.STORE_KEY, D.STOREID,
                   MAX(CASE WHEN D.ATTRIBNAME = 'STATE' THEN D.ATTRIBVALUE END) STATE,
                   MAX(CASE WHEN D.ATTRIBNAME = 'CITY' THEN D.ATTRIBVALUE END) CITY,
                   MAX(CASE WHEN D.ATTRIBNAME = 'ZIP' THEN D.ATTRIBVALUE END) ZIP,
                   MAX(CASE WHEN D.ATTRIBNAME = 'RSI_ALERT_TYPE' THEN D.ATTRIBVALUE END) RSI_ALERT_TYPE
            FROM {self._dim_schema}.{storeTable} D
            JOIN {self._dim_schema}.RETAILER R ON R.RETAILER_KEY = D.RETAILER_KEY
            WHERE STORE_KEY NOT IN (SELECT STORE_KEY FROM {self._dim_schema}.STORES_GLOBAL_FILTER)
                  {self._active_status}
            GROUP BY D.RETAILER_KEY, D.STORE_KEY, D.STOREID
        ) SR ON SR.RETAILER_KEY = CC.RETAILER_KEY
        LEFT JOIN (
            SELECT AGS.RETAILER_KEY, AGS.VENDOR_KEY, AGS.STORE_KEY,
                   MAX(CASE WHEN AGS.ATTRIBNAME = 'MARKET_CLUSTER_X' THEN ATTRIBVALUE END) MARKET_CLUSTER,
                   MAX(CASE WHEN AGS.ATTRIBNAME = 'OSM_REGION_X' THEN ATTRIBVALUE END) OSM_REGION,
                   MAX(CASE WHEN AGS.ATTRIBNAME = 'OSM_DISTRICT_X' THEN ATTRIBVALUE END) OSM_DISTRICT,
                   MAX(CASE WHEN AGS.ATTRIBNAME = 'RSI_BANNER_X' THEN ATTRIBVALUE END) RSI_BANNER,
                   MAX(CASE WHEN AGS.ATTRIBNAME = 'PRIME_DC_X' THEN ATTRIBVALUE END) PRIME_DC,
                   MAX(CASE WHEN AGS.ATTRIBNAME = 'SAR_STORE_NAME_X' THEN ATTRIBVALUE END) STORE_NAME
            FROM (
                SELECT AG.RETAILER_KEY, AG.VENDOR_KEY, CONCAT(AG.ATTR_GROUP, '_X') ATTRIBNAME,
                       D.STORE_KEY, D.ATTRIBVALUE
                FROM {self._meta_schema}.ATTRIBUTEGROUP AG
                JOIN {self._dim_schema}.{storeTable} D
                     ON D.RETAILER_KEY = AG.RETAILER_KEY AND D.ATTRIBNAME = AG.ATTR_NAME {self._active_status}
                WHERE AG.ATTR_GROUP IN ('RSI_BANNER','PRIME_DC', 'MARKET_CLUSTER', 'OSM_REGION',
                                        'OSM_DISTRICT', 'SAR_STORE_NAME')
            ) AGS
            GROUP BY AGS.RETAILER_KEY, AGS.VENDOR_KEY, AGS.STORE_KEY
        ) AGSG ON AGSG.RETAILER_KEY = SR.RETAILER_KEY AND AGSG.VENDOR_KEY = CC.VENDOR_KEY
                  AND AGSG.STORE_KEY = SR.STORE_KEY
        WHERE (CC.RETAILER_KEY, CC.VENDOR_KEY, SR.STORE_KEY) NOT IN
              (SELECT RETAILER_KEY, VENDOR_KEY, STORE_KEY FROM {self._dim_schema}.STORES_DEFAULT_FILTER);
        """.format(self=self, storeTable=_src_store_table)
        dct_sync_data = self.context.copy()
        dct_sync_data["source_dw"] = self._source_dw
        dct_sync_data["source_config"] = self._source_dw.config.json_data["configs"]
        dct_sync_data["target_dw_schema"] = self._common_schema
        dct_sync_data["target_dw_table"] = _target_store_table
        dct_sync_data["target_column"] = col
        dct_sync_data["source_sql"] = sql
        self._logger.debug(sql)
        sync_data(dct_sync_data)

        # sync the tall table (not for CIS)
        if self._str_retailer_filter and not cis_source:
            self._store_columns = self._dct_table_column["STORE"]
            col = " RETAILER_KEY, STORE_KEY, STOREID, ATTRIBNAME, ATTRIBVALUE, ACTIVE "
            sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {col} FROM {self._dim_schema}.STORE WHERE ATTRIBNAME "
                   "IN ({self._store_columns}) {self._str_retailer_filter}".format(self=self, col=col))
            dct_sync_data["target_dw_table"] = "STAGE_STORE"
            dct_sync_data["target_column"] = col
            dct_sync_data["source_sql"] = sql
            self._logger.debug(sql)
            sync_data(dct_sync_data)

    def _load_product_data_cis(self, hub_id):
        """
        Sync data from the CIS source.
        If the same item_key exists in both CIS and NXG HUBs, the CIS one overrides the NXG one.
        :param hub_id:
        :return:
        """
        # dump CIS data into the cis staging table: stage_dim_product_cis
        self._export_product_data(hub_id=hub_id, cis_source=True)

        # This part was moved out of self._export_product_data; it runs once all product data are ready.
        # Override NXG items with CIS where the same items exist on both NXG and CIS.
        sql = """
        DELETE FROM {cmnSchema}.STAGE_{itemTable}
        WHERE (vendor_key, retailer_key, item_key) IN
              (SELECT vendor_key, retailer_key, item_key FROM {cmnSchema}.STAGE_{itemTable}_CIS);
        INSERT INTO {cmnSchema}.STAGE_{itemTable} SELECT * FROM {cmnSchema}.STAGE_{itemTable}_CIS;
        """.format(itemTable=self._dim_product, cmnSchema=self._common_schema)
        self._logger.info(sql)
        self._dw.execute(sql)

        # Update OSM_SUB_CATEGORY / OSM_MAJOR_CATEGORY once the NXG and CIS sources are both loaded.
sql = """ DROP TABLE IF EXISTS TEMP_OSM_SUB_CATEGORY_CONSISTENCY; CREATE LOCAL TEMP TABLE IF NOT EXISTS TEMP_OSM_SUB_CATEGORY_CONSISTENCY ON COMMIT PRESERVE ROWS AS /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ SELECT distinct OSM_SUB_CATEGORY_NO,MAX(OSM_SUB_CATEGORY) as OSM_SUB_CATEGORY FROM {self._common_schema}.{self._dim_product} GROUP BY OSM_SUB_CATEGORY_NO; DROP TABLE IF EXISTS TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY; CREATE LOCAL TEMP TABLE IF NOT EXISTS TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY ON COMMIT PRESERVE ROWS AS /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ SELECT distinct OSM_MAJOR_CATEGORY_NO,MAX(OSM_MAJOR_CATEGORY) as OSM_MAJOR_CATEGORY FROM {self._common_schema}.{self._dim_product} GROUP BY OSM_MAJOR_CATEGORY_NO; """.format(self=self) self._logger.info(sql) self._dw.execute(sql) sql = (""" SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ ANALYZE_STATISTICS('{self._common_schema}.STAGE_{self._dim_product}'); INSERT /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ INTO TEMP_OSM_SUB_CATEGORY_CONSISTENCY SELECT a.OSM_SUB_CATEGORY_NO, MAX(a.OSM_SUB_CATEGORY) AS OSM_SUB_CATEGORY FROM {self._common_schema}.STAGE_{self._dim_product} a LEFT JOIN TEMP_OSM_SUB_CATEGORY_CONSISTENCY b on a.OSM_SUB_CATEGORY_NO = b.OSM_SUB_CATEGORY_NO WHERE a.OSM_SUB_CATEGORY_NO IS NOT NULL AND a.OSM_SUB_CATEGORY_NO <> '' AND b.OSM_SUB_CATEGORY_NO IS NULL GROUP BY a.OSM_SUB_CATEGORY_NO; UPDATE /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {self._common_schema}.STAGE_{self._dim_product} SET OSM_SUB_CATEGORY = b.OSM_SUB_CATEGORY FROM (SELECT * FROM TEMP_OSM_SUB_CATEGORY_CONSISTENCY) b WHERE {self._common_schema}.STAGE_{self._dim_product}.OSM_SUB_CATEGORY_NO = b.OSM_SUB_CATEGORY_NO; /*UPDATE {self._common_schema}.STAGE_{self._dim_product} SET ITEM_GROUP = B.ITEM_GROUP FROM $schemaName.OLAP_ITEM B WHERE {self._common_schema}.STAGE_{self._dim_product}.ITEM_KEY = B.ITEM_KEY;*/ INSERT /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ INTO TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY SELECT a.OSM_MAJOR_CATEGORY_NO, MAX(a.OSM_MAJOR_CATEGORY) AS OSM_MAJOR_CATEGORY FROM {self._common_schema}.STAGE_{self._dim_product} a LEFT JOIN TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY b on a.OSM_MAJOR_CATEGORY_NO = b.OSM_MAJOR_CATEGORY_NO WHERE a.OSM_MAJOR_CATEGORY_NO IS NOT NULL AND a.OSM_MAJOR_CATEGORY_NO <> '' AND b.OSM_MAJOR_CATEGORY_NO IS NULL GROUP BY a.OSM_MAJOR_CATEGORY_NO; UPDATE /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {self._common_schema}.STAGE_{self._dim_product} SET OSM_MAJOR_CATEGORY = b.OSM_MAJOR_CATEGORY FROM (SELECT * FROM TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY) b WHERE {self._common_schema}.STAGE_{self._dim_product}.OSM_MAJOR_CATEGORY_NO = b.OSM_MAJOR_CATEGORY_NO; """.format(self=self)) self._logger.info(sql) self._dw.execute(sql) def _load_store_data_cis(self, hub_id): """ Sync data from CIS source. # if the same store_key exists in CIS and NXG HUBs, the CIS one should override the one in NXG side. 
:param hub_id: :return: """ # dump CIS data into cis staging table: stage_dim_store_cis self._export_store_data(hub_id=hub_id, cis_source=True) # # Override nxg stores with CIS if there are duplicated items on both NXG and CIS sql = """ DELETE FROM {cmnSchema}.STAGE_{storeTable} WHERE (vendor_key, retailer_key, store_key) IN (SELECT vendor_key, retailer_key, store_key FROM {cmnSchema}.STAGE_{storeTable}_CIS ); INSERT INTO {cmnSchema}.STAGE_{storeTable} SELECT * FROM {cmnSchema}.STAGE_{storeTable}_CIS; """.format(storeTable=self._dim_store, cmnSchema=self._common_schema) self._logger.info(sql) self._dw.execute(sql) def _swap_data(self): """ Switch partition from stage table to final table. :return: """ sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT VENDOR_KEY " "FROM {self._common_schema}.STAGE_{self._dim_product}".format( self=self)) self._logger.debug(sql) for row in self._dw.query(sql): self._dw.switch_partition( schema_name=self._common_schema, table_name=self._dim_product, partition_name=row[0], stage_schema_name=self._common_schema, stage_table_name='STAGE_{self._dim_product}'.format(self=self)) sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT VENDOR_KEY " "FROM {self._common_schema}.STAGE_PRODUCT".format(self=self)) self._logger.debug(sql) for row in self._dw.query(sql): self._dw.switch_partition(schema_name=self._common_schema, table_name='PRODUCT', partition_name=row[0], stage_schema_name=self._common_schema, stage_table_name='STAGE_PRODUCT') sql = ( "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT RETAILER_KEY " "FROM {self._common_schema}.STAGE_{self._dim_store}".format( self=self)) self._logger.debug(sql) for row in self._dw.query(sql): self._dw.switch_partition( schema_name=self._common_schema, table_name=self._dim_store, partition_name=row[0], stage_schema_name=self._common_schema, stage_table_name='STAGE_{self._dim_store}'.format(self=self)) sql = ( "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT RETAILER_KEY " "FROM {self._common_schema}.STAGE_STORE".format(self=self)) self._logger.debug(sql) for row in self._dw.query(sql): self._dw.switch_partition(schema_name=self._common_schema, table_name='STORE', partition_name=row[0], stage_schema_name=self._common_schema, stage_table_name='STAGE_STORE') def _process_sync_dim_data(self): """ Process to sync dim data :return: """ try: self._get_column() self._recreate_stage_table() for hub_id in self._hub_ids: try: self._dim_schema = "DIM_{hub_id}".format(hub_id=hub_id) self._meta_schema = "METADATA_{hub_id}".format( hub_id=hub_id) self._logger.info( "Sync dim data start hub_id {hub_id}".format( hub_id=hub_id)) self._get_source_config(hub_id) self._export_calendar_data(hub_id) self._get_vendor_retailer_from_hub(hub_id) if not self._str_vendor_retailer_filter: self._logger.warning( "hub_id {} does not have any matched product and store data" .format(hub_id)) continue self._export_product_data(hub_id) self._export_store_data(hub_id) # handle CIS source. 
self._load_product_data_cis(hub_id) self._load_store_data_cis(hub_id) self._logger.info( "Sync hub_id {hub_id} dim data to stage table.".format( hub_id=hub_id)) except Exception as msg: self._logger.warning(msg) continue self._swap_data() # self._truncate_stage_table() self._logger.info("Sync dim data done from hubs {hub_ids}".format( hub_ids=self._hub_ids)) self._analyze_table() except Exception as msg: self._logger.error(msg) finally: pass def process(self): try: self._process_sync_dim_data() except Exception as e: self._logger.error(e) finally: if hasattr(self, "_dw"): self._dw.close_connection() if hasattr(self, "_source_dw"): self._source_dw.close_connection() if hasattr(self, "_db"): self._db.close_connection()
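
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module). SyncDimData
# expects a `context` dict carrying the framework objects used above; the keys
# shown ("meta", "hub_ids", "logger", "all_dim") come from __init__, while the
# meta loading and logger construction below are assumptions about the
# surrounding framework, not confirmed APIs.
#
#   context = {
#       "meta": meta,               # config dict parsed from config.properties
#       "hub_ids": ["HUB_01"],      # hubs to pull dimension data from (hypothetical id)
#       "logger": logger,           # any logger exposing debug/info/warning/error
#       "all_dim": False,           # True: sync all dims regardless of cycle-mapping match
#   }
#   SyncDimData(context).process()  # process() closes its DB connections in finally
# ---------------------------------------------------------------------------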

class Feedback(object):
    def __init__(self, meta, params=None, init_flag=False, logger=None):
        """
        Sync feedback data from the RDP side to the IRIS (OSA) side, incrementally via event_key.
        1, only sync those vendor & retailer pairs which applied the OSA service.
        2, for new vendor & retailer pairs, copy all historical data on initialization.
        :param meta: [mandatory] config data from the config.properties file
        :param params: two cases, depending on whether we sync RDP feedback for a whole RDP
            or for a new customer; see below.
            1, rdp_id: if rdp_id is given, sync all data for that RDP; otherwise sync data
               from all related RDPs.
               Note: rdp_id is passed when calling this service via the REST API.
            2, vendor_key: mandatory only when init_flag is True.
               retailer_key: mandatory only when init_flag is True.
               Note: these 2 parameters are not passed from the REST API; they are used when
               calling directly from deploy scripts.
        :param init_flag:
            if init_flag is True: only sync feedback data for the given vendor & retailer.
                This is used when introducing a new customer.
            if init_flag is False: sync all customers' data from RDP periodically (e.g. daily).
        :param logger:
        """
        self.meta = meta
        self._params = {} if params is None else params
        self._rdp_id = self._params.get("rdpId", None)
        self._fact_type = 'fdbk'
        self._init_flag = init_flag
        self._vendor_key = self._params.get("vendor_key", None)
        self._retailer_key = self._params.get("retailer_key", None)
        self._debug = self._params.get('debug', 'N')
        self._default_rdp = "RDP_AUX"
        self._log_file = './log/sync_fdbk_%s_%s.log' % (
            self._rdp_id, datetime.datetime.now().strftime('%Y%m%d'))
        self.logger = logger if logger else Logger(log_level="debug", target="console|file",
                                                   vendor_key=-1, retailer_key=-1,
                                                   log_file=self._log_file, sql_conn=None)
        self.osa_app_conn = MSOperation(meta=self.meta, logger=self.logger)
        self.osa_dw_conn = DWOperation(meta=self.meta, logger=self.logger)
        self.max_event_key = None
        # we already know the feedback table name on the RDP side
        self.source_table_rdp = "DS_FACT_FEEDBACK"  # source table on the RDP side.
        # staging table for data synced from the RDP table (same structure as DS_FACT_FEEDBACK)
        self.staging_import_table_osa = "STAGE_FACT_FEEDBACK_RDP"
        self.target_table_osa = "FACT_FEEDBACK"  # final table on the OSA side
        self.capacity = Capacity(meta=meta)
        self.dct_sync_data = copy.deepcopy(self.meta)  # required for calling the sync_data module
        self.dct_sync_data["meta"] = self.meta  # required for calling the sync_data module
        self.dct_sync_data["target_osa_conn"] = self.osa_dw_conn
        self.dct_sync_data["target_dw_schema"] = self.meta['db_conn_vertica_common_schema']
        self.dct_sync_data["target_dw_table"] = self.staging_import_table_osa
        self.dct_sync_data["logger"] = self.logger
        # [True|False(default)] True: direct connection between Vertica clusters. False: using vsql.
        self.dct_sync_data["dw_conn_vertica"] = False
        # self.dct_sync_data["dw_conn_vertica"] = True
        self.transfer = TransferData(dct_sync_data=self.dct_sync_data)

    def _populate_source_config(self, source_config):
        self.logger.debug("The source config is: %s" % source_config)
        _src_config = {}
        if os.name == 'nt':
            _src_config["temp_file_path"] = "d:"
        elif os.name == 'posix':
            _src_config["temp_file_path"] = "/tmp"
        # Get the user account from the config.properties file first.
        if self.meta.get("db_conn_vertica_rdp_username"):
            _src_config["dw.etluser.id"] = self.meta.get("db_conn_vertica_rdp_username")
            if self.meta.get("db_conn_vertica_rdp_password"):
                _src_config["dw.etluser.password"] = self.meta.get("db_conn_vertica_rdp_password")
            else:
                _pmp_pwd = get_password(
                    username=self.meta.get("db_conn_vertica_rdp_username"), meta=self.meta)
                # The pwd should be encrypted in order to: 1, align with the else branch;
                # 2, pass it to the db.sync_data module.
                _src_config["dw.etluser.password"] = Crypto().encrypt(_pmp_pwd)
        # if not configured, get them directly from the RDP config.
        else:
            _src_config["dw.etluser.id"] = source_config.get("dw.etluser.id")
            # the pwd is encrypted
            _src_config["dw.etluser.password"] = source_config.get("dw.etluser.password")
        # required info for calling the sync_data module.
        _src_config["dw.server.name"] = source_config.get("dw.server.name")
        _src_config["dw.db.name"] = source_config.get("dw.db.name")
        _src_config["dw.db.portno"] = source_config.get("dw.db.portno", 5433)
        _src_config["dw.schema.name"] = source_config.get("dw.schema.name")
        self.logger.debug("source config is: %s" % _src_config)
        self.dct_sync_data["source_config"] = _src_config

        # Create the connection to the RDP Vertica cluster, which is the source Vertica cluster.
        rdp_meta = copy.deepcopy(self.meta)
        tmp_rdp_meta = {
            'db_conn_vertica_servername': _src_config["dw.server.name"],
            'db_conn_vertica_port': _src_config["dw.db.portno"],
            'db_conn_vertica_dbname': _src_config["dw.db.name"],
            'db_conn_vertica_username': _src_config["dw.etluser.id"],
            'db_conn_vertica_password': _src_config["dw.etluser.password"],
            'db_conn_vertica_password_encrypted': "true"
        }
        rdp_meta.update(tmp_rdp_meta)
        self.logger.debug("rdp config is: %s" % rdp_meta)
        rdp_connection = DWOperation(meta=rdp_meta)
        self.dct_sync_data["source_dw"] = rdp_connection

    def main_process(self):
        try:
            # If we are not introducing a new customer and _rdp_id was given,
            # sync all feedback data from the given RDP for registered users.
            if self._init_flag is False and self._rdp_id:
                try:
                    rdp_config = Config(meta=self.meta, hub_id=self._rdp_id).json_data
                    if not rdp_config['configs']:
                        raise Warning(
                            "There are no configs returned for RDP: %s. "
                            "Please check whether this RDP is registered in CP with the below URL: "
                            "%s/properties/rdps?factType=fdbk"
                            % (self._rdp_id, self.meta["api_config_str"]))
                        # exit(StepStatus.SUCCESS.value)
                    _rdp_schema = rdp_config['configs'].get('dw.schema.name')
                    self.logger.info("Started to sync data from rdp: %s" % _rdp_schema)
                    # self.dct_sync_data["source_config"] = rdp_config['configs']
                    self._populate_source_config(rdp_config['configs'])
                    self.initialize()
                    _flag = self.load_data()
                    if _flag:  # if no data, no need to process & update the variables table.
                        self.process_data()
                        sql = """
                        IF NOT EXISTS(SELECT * FROM VARIABLES WHERE VARIABLE_NAME = '{eventType}')
                            INSERT INTO VARIABLES (VARIABLE_NAME, VARIABLE_VALUE, PREVIOUS_VALUE, INSERT_TIME, UPDATE_TIME)
                            VALUES ('{eventType}', '{value}', '', getdate(), getdate())
                        ELSE
                            UPDATE VARIABLES
                            SET PREVIOUS_VALUE = VARIABLE_VALUE, VARIABLE_VALUE = '{value}', UPDATE_TIME = getdate()
                            WHERE VARIABLE_NAME = '{eventType}'
                        """.format(eventType=_rdp_schema, value=self.max_event_key)
                        self.logger.info(sql)
                        self.osa_app_conn.execute(sql)
                    self.logger.info("Data sync done for RDP: %s" % _rdp_schema)
                except Exception as e:
                    self.logger.warning(e)
                    raise
                    # exit(StepStatus.SUCCESS.value)  # exit(0), otherwise the Docker container will fail.

            # Else get all RDPs from the REST API: http://10.172.36.75/config/properties/rdps?factType=fdbk
            # There could be multiple RDPs (e.g. for SVR & WM); if so, loop over all of them.
            elif self._init_flag is False and self._rdp_id is None:
                try:
                    rdp_configs = Config(meta=self.meta, rdp_info=True,
                                         rdp_fact_type=self._fact_type).json_data
                    if not rdp_configs:
                        raise Warning(
                            "No feedback related RDP found. "
                            "Please check if any data is returned from the below URL: "
                            "%s/properties/rdps?factType=fdbk" % (self.meta["api_config_str"]))
                        # exit(StepStatus.SUCCESS.value)
                    for rdp_config in rdp_configs:
                        _rdp_schema = rdp_config['configs'].get('dw.schema.name')
                        self.logger.info("Started to sync data from rdp: %s" % _rdp_schema)
                        # self.dct_sync_data["source_config"] = rdp_config['configs']
                        self._populate_source_config(rdp_config['configs'])
                        self.initialize()
                        _flag = self.load_data()
                        if _flag:  # if no data, no need to process & update the variables table.
                            self.process_data()
                            sql = """
                            IF NOT EXISTS(SELECT * FROM VARIABLES WHERE VARIABLE_NAME = '{eventType}')
                                INSERT INTO VARIABLES (VARIABLE_NAME, VARIABLE_VALUE, PREVIOUS_VALUE, INSERT_TIME, UPDATE_TIME)
                                VALUES ('{eventType}', '{value}', '', getdate(), getdate())
                            ELSE
                                UPDATE VARIABLES
                                SET PREVIOUS_VALUE = VARIABLE_VALUE, VARIABLE_VALUE = '{value}', UPDATE_TIME = getdate()
                                WHERE VARIABLE_NAME = '{eventType}'
                            """.format(eventType=_rdp_schema, value=self.max_event_key)
                            self.logger.info(sql)
                            self.osa_app_conn.execute(sql)
                        self.logger.info("Data sync done for RDP: %s" % _rdp_schema)
                except Exception as e:
                    self.logger.warning(e)
                    raise

            elif self._init_flag is True:
                if self._vendor_key is None or self._retailer_key is None:
                    self.logger.warning(
                        "vendor_key and retailer_key are required when initializing feedback for a new customer")
                    raise ValueError
                # get the fdbk-related rdps.
                try:
                    rdp_configs = Config(meta=self.meta, rdp_info=True,
                                         rdp_fact_type=self._fact_type).json_data
                    if not rdp_configs:
                        self.logger.warning(
                            "No feedback related RDP found. "
                            "Please check if any data is returned from the below URL: "
                            "%s/properties/rdps?factType=fdbk" % (self.meta["api_config_str"]))
                        exit(StepStatus.SUCCESS.value)
                    fdbk_rdps = [str(rdp_config["rdpId"]).upper() for rdp_config in rdp_configs]
                    # change the table name in case it conflicts with the normal sync process
                    self.dct_sync_data["target_dw_table"] = "{0}_{1}_{2}".format(
                        self.staging_import_table_osa, self._vendor_key, self._retailer_key)
                    _silo_config = Config(meta=self.meta, vendor_key=self._vendor_key,
                                          retailer_key=self._retailer_key).json_data
                    _silo_type = _silo_config['configs'].get('etl.silo.type', 'SVR')
                    _rdp_id = _silo_config['configs'].get('rdp.db.name')
                    # RDP_AUX is the default rdp id for the feedback ETL on PRODUCTION.
                    # 1, if there is no RDP for the given silo, then exit.
                    if not _rdp_id or str(_rdp_id).strip() == '':
                        self.logger.warning(
                            "There is no RDP silo configured for the given vendor:%s "
                            "and retailer:%s. So no need to sync feedback."
                            % (self._vendor_key, self._retailer_key))
                        exit(StepStatus.SUCCESS.value)
                    # 2, get the configured RDP list and check whether any feedback-related RDPs exist.
                    _tmp_rdp_lst = str(_rdp_id).upper().split(sep=",")
                    _rdp_lst = [_tmp.strip() for _tmp in _tmp_rdp_lst]
                    # common_rdps are the RDP silos configured for syncing feedback data
                    # for the given silo (vendor & retailer)
                    common_rdps = list(set(_rdp_lst).intersection(fdbk_rdps))
                    if not common_rdps:
                        self.logger.warning(
                            "There is no RDP silo configured for the given vendor:%s "
                            "and retailer:%s. So no need to sync feedback."
                            % (self._vendor_key, self._retailer_key))
                        exit(StepStatus.SUCCESS.value)
                    # If 1 or more feedback-related RDPs are configured, loop over them to sync
                    # feedback data. Normally there is only 1, or no feedback RDP configured.
                    for common_rdp in common_rdps:
                        _rdp_id = common_rdp
                        self.logger.info(
                            "Started to sync data from rdp: %s for given vendor:%s and retailer:%s. "
                            % (_rdp_id, self._vendor_key, self._retailer_key))
                        # if the RDP is not RDP_AUX, don't exit, but log a warning.
                        if _rdp_id != self._default_rdp:
                            self.logger.warning(
                                "Please be noted: The RDP is:%s. It is not RDP_AUX." % _rdp_id)
                        # WM silos also follow the above logic.
                        # All hosted silos utilize RDP_AUX to transfer feedback data; not sure about Walmart.
                        # if str(_silo_type).upper() in ["WMSSC", "WMCAT", "SASSC", "WMINTL"]:
                        #     _rdp_id = self._default_rdp  # WM rdp is RDP_AUX as well?
                        rdp_config = Config(meta=self.meta, hub_id=_rdp_id).json_data
                        if not rdp_config['configs']:
                            self.logger.warning(
                                "There is no configs for RDP: %s. Please check the following URL: "
                                "%s/properties/%s/%s"
                                % (_rdp_id, self.meta["api_config_str"], _rdp_id, _rdp_id))
                            exit(StepStatus.SUCCESS.value)
                        _rdp_schema = rdp_config['configs'].get('dw.schema.name')
                        self.logger.info(
                            "Started to init feedback data from rdp: %s for "
                            "given vendor:%s and retailer:%s "
                            % (_rdp_schema, self._vendor_key, self._retailer_key))
                        # self.dct_sync_data["source_config"] = rdp_config['configs']
                        self._populate_source_config(rdp_config['configs'])
                        self.initialize()
                        _flag = self.load_data()
                        if _flag:  # if no data, then no need to process.
                            self.process_data()
                        self.logger.info("Data sync done for RDP: %s" % _rdp_id)
                except Exception as e:
                    self.logger.warning(e)
                    self.logger.warning(
                        "Please check for any warning or error messages during the initialization!")
        finally:
            if self.osa_app_conn:
                self.osa_app_conn.close_connection()
            if self.osa_dw_conn:
                self.osa_dw_conn.close_connection()

    def initialize(self):
        """
        Create the local temp tables and DDLs required to process this fact type
        :return:
        """
        self.logger.info("Initialize...")
        # recreate this table for every RDP; no need to truncate any longer.
        # sql = "TRUNCATE TABLE {cmnSchema}.{targetTable}" \
        #     .format(cmnSchema=self.dct_sync_data["target_dw_schema"],
        #             targetTable=self.staging_import_table_osa)
        # self.logger.info(sql)
        # self.osa_dw.execute(sql)
        sql = """
        --Store data from the RDP table.
        DROP TABLE IF EXISTS {cmnSchema}.{importTable};
        CREATE TABLE {cmnSchema}.{importTable} (
            EVENT_KEY int NOT NULL,
            RETAILER_KEY int,
            VENDOR_KEY int,
            STORE_VISIT_DATE date,
            PERIOD_KEY int NOT NULL,
            TYPE varchar(1),
            TYPE_DATE varchar(10),
            ALERT_ID int,
            ALERT_TYPE varchar(64),
            MERCHANDISER_STORE_NUMBER varchar(512),
            STORE_ID varchar(512),
            MERCHANDISER_UPC varchar(512),
            INNER_UPC varchar(512),
            MERCHANDISER varchar(100),
            STORE_REP varchar(1000),
            SOURCE varchar(1000),
            BEGIN_STATUS varchar(255),
            ACTION varchar(255),
            FEEDBACK_DESCRIPTION varchar(255),
            FEEDBACK_HOTLINEREPORTDATE date,
            FEEDBACK_ISININVENTORY varchar(5),
            ZIP_CODE varchar(64),
            ARTS_CHAIN_NAME varchar(255),
            UPC_STATUS varchar(255),
            MSI varchar(255)
        ) UNSEGMENTED ALL NODES;
        """.format(cmnSchema=self.dct_sync_data["target_dw_schema"],
                   importTable=self.dct_sync_data["target_dw_table"])
        self.logger.info(sql)
        self.osa_dw_conn.execute(sql)

    def load_data(self):
        """
        Load data from the RDP table ds_fact_feedback into local temp tables.
        There is a column event_key which is incremental for all customers in the ds_fact_feedback table;
        we can save a snapshot of this column to the variables table, and base each incremental
        load on it. There are a few cases here:
        1, Routinely, a scheduled job syncs the whole feedback data set for valid customers from the
           related RDP silo, and saves the snapshot of the event_key from the previous load for the
           next incremental load.
        2, When on-boarding a new vendor & retailer customer, get rdp_event_key from the variables
           table for the related RDP silo (rdp_event_key is from the previous load), then sync
           feedback data from the related RDP silo only for this given customer where
           event_key < rdp_event_key. Case 1 will then take care of the rest of the feedback data.
        :return:
        """
        rdp_schema = self.dct_sync_data["source_config"].get('dw.schema.name')
        # rdp_aux.ds_fact_feedback
        source_table = "{rdpSchema}.{rdptableName}".format(
            rdpSchema=rdp_schema, rdptableName=self.source_table_rdp)
        # common.stage_fact_feedback_rdp
        target_table = "{dwSchema}.{importTable}".format(
            dwSchema=self.dct_sync_data["target_dw_schema"],
            importTable=self.dct_sync_data["target_dw_table"])
        self.logger.info("Ready to load data from {sourceTable} to {targetTable}".format(
            targetTable=target_table, sourceTable=source_table))
        insert_columns = " EVENT_KEY, RETAILER_KEY, VENDOR_KEY, STORE_VISIT_DATE, PERIOD_KEY, TYPE, TYPE_DATE," \
                         " ALERT_ID, ALERT_TYPE, MERCHANDISER_STORE_NUMBER, STORE_ID, MERCHANDISER_UPC, INNER_UPC," \
                         " MERCHANDISER, STORE_REP, SOURCE, BEGIN_STATUS, ACTION, FEEDBACK_DESCRIPTION," \
                         " FEEDBACK_HOTLINEREPORTDATE, FEEDBACK_ISININVENTORY, ZIP_CODE, ARTS_CHAIN_NAME, UPC_STATUS, MSI "
        try:
            self.logger.info("Getting the previous event_key from the last run for the incremental load.")
            _event_sql = "SELECT VARIABLE_VALUE FROM variables " \
                         "WHERE VARIABLE_NAME = '{rdpName}'".format(rdpName=rdp_schema)
            self.logger.info(_event_sql)
            event_key = self.osa_app_conn.query_scalar(_event_sql)

            self.logger.info("Getting customer info (only customers which applied OSA services) as a filter")
            sql = "SELECT DISTINCT retailer_key, vendor_key FROM AP_ALERT_CYCLE_MAPPING " \
                  "UNION " \
                  "SELECT DISTINCT retailer_key, vendor_key FROM AP_ALERT_CYCLE_RC_MAPPING"
            self.logger.info(sql)
            results = self.osa_app_conn.query(sql)
            if not results:
                raise Warning("There is no data in table AP_ALERT_CYCLE_MAPPING. Please check sql: %s" % sql)
                # exit(StepStatus.SUCCESS.value)
            user_filters = ['SELECT ' + str(result.retailer_key) + ',' + str(result.vendor_key)
                            for result in results]
            user_filter_str = ' UNION ALL '.join(user_filters)
            self.logger.info("Customer filters are: %s" % user_filter_str)

            # incremental filter against the RDP table
            where_sql = "EVENT_KEY > {eventKey} AND SOURCE != 'ARIA' " \
                        "AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter})" \
                .format(eventKey=event_key, userFilter=user_filter_str)
            # TODO2DONE: how to set a default value? use -1
            # copy the whole table if there is no value in the variables table.
            if not event_key:
                self.logger.warning(
                    "There is no value set in the variables table for RDP:{name}, "
                    "so copy the whole table".format(name=rdp_schema))
                where_sql = " SOURCE != 'ARIA' AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter})" \
                    .format(userFilter=user_filter_str)
                event_key = -1

            # check if this is the first run.
            if self._init_flag is True:
                if event_key == -1:  # event_key is None
                    self.logger.warning(
                        "There is no event_key logged in the variables table for the given RDP: %s. "
                        "So let's wait for the routine job to sync the whole RDP feedback data." % rdp_schema)
                    return False
                self.logger.info("Generating init feedback filters")
                where_sql = "EVENT_KEY <= {eventKey} AND SOURCE != 'ARIA' " \
                            "AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter}) " \
                            "AND RETAILER_KEY={retailerKey} AND VENDOR_KEY={vendorKey} " \
                    .format(eventKey=event_key, userFilter=user_filter_str,
                            retailerKey=self._retailer_key, vendorKey=self._vendor_key)
            self.logger.debug("The filters are: %s" % where_sql)

            # form the fetch query against RDP, then insert into the target table
            fetch_query = """
            SELECT /*+ label(GX_IRIS_SYNCFEEDBACK)*/ {insertQuery}
            FROM {sourceTable} WHERE {whereSql}
            """.format(insertQuery=insert_columns, sourceTable=source_table, whereSql=where_sql)
            self.logger.info("fetch_query is : %s" % fetch_query)
            self.logger.info(
                ">>Loading {factType} data from event_key:{eventKey}, start at: {timestamp}<<".format(
                    factType=self._fact_type, eventKey=event_key, timestamp=datetime.datetime.now()))
            self.dct_sync_data["target_column"] = insert_columns
            self.dct_sync_data["source_sql"] = fetch_query
            row_count = self.transfer.transfer_data(dct_sync_data=self.dct_sync_data)
            self.logger.info(
                ">>Done: loaded {cnt} rows from event_key:{eventKey}, completed at: {timestamp}<<".format(
                    cnt=row_count, eventKey=event_key, timestamp=datetime.datetime.now()))

            # if no data was transferred, update variables with the previous value.
            sql = "SELECT /*+ label(GX_IRIS_SYNCFEEDBACK)*/ nvl(max(event_key), {oldEventKey}) " \
                  "FROM {schemaName}.{importTable} " \
                .format(schemaName=self.dct_sync_data["target_dw_schema"],
                        importTable=self.dct_sync_data["target_dw_table"],
                        oldEventKey=event_key)
            self.logger.info(sql)
            self.max_event_key = self.osa_dw_conn.query_scalar(sql)
            # self.max_event_key = -1  # testing purpose
            if self.max_event_key == -1:
                self.logger.warning("There is no feedback data in RDP table: {0}".format(source_table))
                return False
            return True
        except Exception as e:
            self.logger.warning(e)
            raise
        finally:
            pass

    def process_data(self):
        """
        After load_data completes, sync the data from the temp table to the related schemas.
        :return:
        """
        try:
            self.logger.info("Processing feedback start...")
            # loop over retailers to insert feedback data
            sql = "SELECT DISTINCT retailer_key " \
                  "FROM {cmnSchema}.{importTable}".format(
                      cmnSchema=self.dct_sync_data["target_dw_schema"],
                      importTable=self.dct_sync_data["target_dw_table"])
            self.logger.info(sql)
            retailers = self.osa_dw_conn.query(sql)
            if retailers.rowcount == 0:
                self.logger.warning(
                    "There is no data in table {cmnSchema}.{importTable}. "
                    "It could be that there is no incremental data. Please check fetch_query against the RDP database."
                    .format(cmnSchema=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"]))
            for retailer in retailers:
                retailer_key = retailer.retailer_key
                osa_schema = self.capacity.get_retailer_schema_name(retailer_key)
                # Finally, run the sql
                msg = "Processing fdbk data within retailer {retailerKey}:{retailerName}".format(
                    retailerKey=retailer_key, retailerName=osa_schema)
                self.logger.info(msg)
                # Normally there should NOT be duplicated alert_ids transferred by the incremental
                # load, but consider that case here: delete existing alert_ids from the target table.
                # TODO: the delete could have performance issues; consider using switch partition.
                delete_sql = "DELETE FROM {osaSchema}.{targetTable} " \
                             "WHERE alert_id IN (SELECT alert_id FROM {cmnSchema}.{importTable})" \
                    .format(targetTable=self.target_table_osa, osaSchema=osa_schema,
                            cmnSchema=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"])
                self.logger.info(delete_sql)
                self.osa_dw_conn.execute(delete_sql)

                # insert feedback data into the final table fact_feedback from the processed table.
                sql = """
                INSERT INTO {osaSchema}.{targetTable}
                    (EVENT_KEY, RETAILER_KEY, VENDOR_KEY, STORE_VISITED_PERIOD_KEY, PERIOD_KEY, ALERT_ID,
                     STORE_KEY, MERCHANDISER_STORE_NUMBER, STORE_ID, ITEM_KEY, MERCHANDISER_UPC, UPC,
                     MERCHANDISER, STORE_REP, SOURCE, BEGIN_STATUS, ACTION, FEEDBACK_DESCRIPTION,
                     ON_HAND_PHYSICAL_COUNT, ON_HAND_CAO_COUNT)
                SELECT stage.EVENT_KEY, stage.RETAILER_KEY, stage.VENDOR_KEY,
                       TO_CHAR(stage.STORE_VISIT_DATE, 'YYYYMMDD')::int AS STORE_VISITED_PERIOD_KEY,
                       stage.PERIOD_KEY, stage.ALERT_ID,
                       store.STORE_KEY AS STORE_KEY,
                       stage.MERCHANDISER_STORE_NUMBER,
                       COALESCE(store.STORE_ID, alert.STOREID, stage.STORE_ID) AS STORE_ID,
                       item.ITEM_KEY AS ITEM_KEY,
                       stage.MERCHANDISER_UPC,
                       COALESCE(item.UPC, alert.UPC, stage.INNER_UPC, stage.MERCHANDISER_UPC) AS UPC,
                       stage.MERCHANDISER, stage.STORE_REP, stage.SOURCE, stage.BEGIN_STATUS,
                       stage.ACTION, stage.FEEDBACK_DESCRIPTION,
                       0 AS ON_HAND_PHYSICAL_COUNT,
                       0 AS ON_HAND_CAO_COUNT
                FROM {cmnSchema}.{importTable} stage
                LEFT JOIN {osaSchema}.FACT_PROCESSED_ALERT alert
                       ON stage.alert_id = alert.alert_id AND alert.issuanceid = 0
                      AND alert.retailer_key = {retailerKey} AND stage.vendor_key = alert.vendor_key
                INNER JOIN {cmnSchema}.DIM_PRODUCT item
                        ON item.retailer_key = {retailerKey} AND alert.vendor_key = item.vendor_key
                       AND item.item_key = alert.item_key
                INNER JOIN {cmnSchema}.DIM_STORE store
                        ON store.retailer_key = {retailerKey} AND alert.vendor_key = store.vendor_key
                       AND store.store_key = alert.store_key
                WHERE stage.retailer_key = {retailerKey}
                """.format(osaSchema=osa_schema, targetTable=self.target_table_osa,
                           cmnSchema=self.dct_sync_data["target_dw_schema"],
                           importTable=self.dct_sync_data["target_dw_table"],
                           retailerKey=retailer_key)
                self.logger.info("SQL used to load data into the related schema: %s" % sql)
                self.osa_dw_conn.execute(sql)
            self.logger.info("Processing feedback ended...")
        except Exception as e:
            self.logger.warning("Process data for RDP {0} failed: {1}".format(self._rdp_id, e))
            raise
        finally:
            if self._debug.upper() == 'N':
                _drop_sql = "DROP TABLE IF EXISTS {schemaName}.{importTable};" \
                    .format(schemaName=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"])
                self.logger.info(_drop_sql)
                self.osa_dw_conn.execute(_drop_sql)
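
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): the two ways
# Feedback.main_process() is meant to be driven, per the __init__ docstring above.
# The params keys ("rdpId", "vendor_key", "retailer_key") are the ones read in
# __init__; the meta dict comes from config.properties via the surrounding
# framework, and the key values below are hypothetical.
#
#   # Routine sync: pull incremental feedback for all registered customers of one RDP.
#   Feedback(meta, params={"rdpId": "RDP_AUX"}).main_process()
#
#   # On-boarding: one-off historical sync for a single new vendor & retailer.
#   Feedback(meta, params={"vendor_key": 100, "retailer_key": 200},
#            init_flag=True).main_process()
# ---------------------------------------------------------------------------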
_common_schema = "common" _dim_product = "DIM_PRODUCT" _dim_store = "DIM_STORE" _dim_calendar = "DIM_CALENDAR" _meta_intvn = "ALT_META_INTERVENTION" _dim_base_dir = meta.get("azure_storage_dim_root") _prefix_name = "IRIS_" _dw_conn = DWOperation(meta=meta) # Getting retailer list from dim table retailer_list_in_table = [] _retailer_sql = "SELECT DISTINCT retailer_key FROM {schema}.{table} where retailer_key in (6, 5240)"\ .format(schema=_common_schema, table=_dim_store) res = _dw_conn.query(_retailer_sql) for ele in res: retailer_list_in_table.append(ele.retailer_key) print(retailer_list_in_table) capacity = Capacity(meta=meta) azure_uploader = UploadToBlobStorage(meta=meta) # sql_for_item = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_product) # sql_for_store = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_store) # sql_for_cal = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_calendar) # sql_for_alert = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_meta_intvn) # retailer_list = [6, 5240] retailer_list = retailer_list_in_table for retailer_key in retailer_list:
class Scd(object):
    def __init__(self, sql, actioner, log_detail=True, batch_size=100, meta={},
                 db_type=None, table_schema=None, logger=None):
        self.meta = meta
        self.actioner = actioner
        self.log_detail = log_detail
        self.batch_size = batch_size
        self.logger = logger if logger else Logger(log_level="info", vendor_key=-1,
                                                   retailer_key=-1, module_name="scd")
        self.sql = sql
        self.raw_sql_splitted = self.sql.split(' ')
        self.raw_sql_splitted_trimed = [s for s in self.raw_sql_splitted if s != '']
        self.db_type = db_type
        self.table_schema = table_schema
        self.cmd = None  # set by set_insert_delete_update()
        self.logger.info('[SQL] %s' % self.sql)
        self.logger.info('[SQL splitted] %s' % self.raw_sql_splitted)
        self.mssql = MSOperation(meta=self.meta)
        self.vtc = None
        self.saved_records_count = 0

    def set_insert_delete_update(self):
        cmd = self.raw_sql_splitted_trimed[0].upper()
        if cmd in ('DELETE', 'UPDATE', 'INSERT'):
            self.cmd = cmd
        else:
            raise RuntimeError('%s is not supported.' % cmd)

    def locate_keyword(self, keyword, sql_splitted=[]):
        # Find the location where the keyword first appears as a real keyword in
        # the sql. E.g. when locating WHERE in
        #   UPDATE TEST SET COL = ' WHERE ' WHERE ID=1
        # the first ' WHERE ' is a string literal, not a keyword, so the index of
        # the 2nd WHERE is returned.
        keyword = keyword.upper()
        # If sql_splitted is specified, locate the keyword in it; otherwise in raw_sql_splitted.
        base_seq_splitted = sql_splitted if sql_splitted else self.raw_sql_splitted
        index_keyword = [k for k, v in enumerate(base_seq_splitted) if keyword in v.upper()]
        for ind in index_keyword:
            count_of_single_quote_ahead_of_where = sum([
                base_seq_splitted[i].count("'") for i in range(0, ind)
                if "'" in base_seq_splitted[i]
            ])
            i = base_seq_splitted[ind].upper().index(keyword)
            count_of_single_quote_ahead_of_where += base_seq_splitted[ind][0:i].count("'")
            if count_of_single_quote_ahead_of_where % 2 == 0:
                # The KEYWORD to capture is the one with an even number of "'" ahead of it.
                return ind
        return -1  # KEYWORD not found

    def locate_keyword_combination(self, keywords):
        # Mainly for parsing MERGE; may need to be updated.
        # Find the location of the first keyword, then check whether the 2nd, 3rd,
        # ... keywords immediately follow it in the SQL. If all keywords match,
        # return the index of the first keyword; otherwise return -1.
        keywords_list = [w.upper().strip() for w in keywords.split(' ') if w.strip() != '']
        sql_splitted = [*self.raw_sql_splitted]
        base_ind = 0
        self.logger.info('Locating %s' % keywords_list)
        while True:
            self.logger.info('Locating 1st keyword "%s"' % keywords_list[0])
            ind = self.locate_keyword(keywords_list[0], sql_splitted)
            if ind == -1:
                self.logger.warning('Keyword not found.')
                return -1
            following = [v.upper().strip() for v in sql_splitted[ind + 1:] if v.strip() != '']
            self.logger.info('Getting following %s', following)
            if following[0:len(keywords_list) - 1] == keywords_list[1:]:
                self.logger.info('Keywords matched!')
                return base_ind + ind
            else:
                self.logger.warning('Keywords not matched.')
                sql_splitted = sql_splitted[ind + 1:]
                base_ind += ind + 1  # +1: sql_splitted has been truncated past index ind

    def get_merge_where(self):
        # TODO: not done yet. The "update from" grammar differs between Vertica
        # and SQL Server, so this needs more time to work on.
        index_using = self.locate_keyword('USING')
        index_on = self.locate_keyword('ON')
        index_when_match = self.locate_keyword_combination('WHEN MATCHED THEN')
        index_when_not_match = self.locate_keyword_combination('WHEN NOT MATCHED THEN')

    def get_where_condition(self):
        index_where = self.locate_keyword('WHERE')
        return '' if index_where == -1 else ' '.join(self.raw_sql_splitted[index_where:])
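    # Worked example for locate_keyword: splitting
    #   UPDATE TEST SET COL = ' WHERE ' WHERE ID=1
    # on spaces yields ['UPDATE', 'TEST', 'SET', 'COL', '=', "'", 'WHERE', "'", 'WHERE', 'ID=1'].
    # The WHERE at index 6 has one single quote before it (odd, so it sits inside
    # a string literal and is skipped); the WHERE at index 8 has two quotes before
    # it (even), so locate_keyword('WHERE') returns 8 and get_where_condition()
    # returns 'WHERE ID=1'.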
    def get_table_name(self):
        self.set_insert_delete_update()
        # UPDATE <table> ... vs. DELETE FROM <table> / INSERT INTO <table>
        return self.raw_sql_splitted_trimed[1].upper() if self.cmd == 'UPDATE' \
            else self.raw_sql_splitted_trimed[2].upper()

    def get_raw_table_name(self, table_name):
        return table_name.split('.')[-1]

    def get_table_def(self, raw_table_name):
        sql = "SELECT ID, TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA FROM META_SCD_TABLE_DEF WHERE TABLE_NAME='%s'" % raw_table_name
        if self.db_type:
            sql += " AND DB_TYPE='%s'" % self.db_type
        if self.table_schema:
            sql += " AND TABLE_SCHEMA='%s'" % self.table_schema
        rs = self.mssql.query(sql)
        if len(rs) == 0:
            raise ValueError('[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] The table is not found.'
                             % (raw_table_name, self.db_type, self.table_schema))
        elif len(rs) > 1:
            self.logger.info('|ID|TABLE_NAME|DB_TYPE|TABLE_SCHEMA|')
            for r in rs:
                self.logger.info('|%s|%s|%s|%s|' % (r.ID, r.TABLE_NAME, r.DB_TYPE, r.TABLE_SCHEMA))
            raise ValueError('[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] More than one record returned for the table.'
                             % (raw_table_name, self.db_type, self.table_schema))
        else:
            if rs[0].DB_TYPE == 'MSSQL':
                self.target_conn = self.mssql
            else:
                self.vtc = DWOperation(meta=self.meta)
                self.target_conn = self.vtc
            return rs[0]

    def get_mssql_table_ddl_def(self, db_name, table_name, table_schema):
        ddl_sql = '''SELECT COLUMN_NAME, DATA_TYPE
                     FROM %s.INFORMATION_SCHEMA.COLUMNS
                     WHERE TABLE_NAME = N'%s' AND TABLE_SCHEMA = N'%s'
                     ORDER BY ORDINAL_POSITION''' % (db_name, table_name, table_schema)
        rs = self.mssql.query(ddl_sql)
        # return dict([('"%s"' % r.COLUMN_NAME if ' ' in r.COLUMN_NAME else r.COLUMN_NAME, r.DATA_TYPE) for r in rs])
        return dict([(r.COLUMN_NAME, r.DATA_TYPE) for r in rs])

    def get_vertica_table_ddl_def(self, table_name, table_schema):
        ddl_sql = '''SELECT COLUMN_NAME, DATA_TYPE
                     FROM COLUMNS
                     WHERE TABLE_NAME = '%s' AND TABLE_SCHEMA = '%s'
                     ORDER BY ORDINAL_POSITION''' % (table_name, table_schema)
        rs = self.vtc.query(ddl_sql)
        # return dict([('"%s"' % r.COLUMN_NAME if ' ' in r.COLUMN_NAME else r.COLUMN_NAME, r.DATA_TYPE) for r in rs])
        return dict([(r.COLUMN_NAME, r.DATA_TYPE) for r in rs])

    def save_to_db(self, table_def, recs):
        pk = json.loads(table_def.PK)
        batch_sqls = []
        columns = ('TABLE_DEF_ID, PK_VALUE, ACTION, RECORD, UPDATE_TIME, ACTIONER'
                   if self.log_detail
                   else 'TABLE_DEF_ID, PK_VALUE, ACTION, UPDATE_TIME, ACTIONER')
        counter = 0
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        for rec in recs:
            pk_dict = collections.OrderedDict()
            for k in pk:
                pk_dict[k] = rec[k]
            pk_value = json.dumps(pk_dict).replace("'", "''")
            if self.log_detail:
                batch_sqls.append('''SELECT %s, '%s', '%s', '%s', '%s', '%s' '''
                                  % (table_def.ID, pk_value, self.cmd,
                                     json.dumps(rec).replace("'", "''"), now, self.actioner))
            else:
                batch_sqls.append('''SELECT %s, '%s', '%s', '%s', '%s' '''
                                  % (table_def.ID, pk_value, self.cmd, now, self.actioner))
            counter += 1
            if len(batch_sqls) % self.batch_size == 0:
                self.mssql.execute('''INSERT INTO META_SCD_CHANGE_LOG(%s) %s'''
                                   % (columns, ' UNION ALL '.join(batch_sqls)))
                batch_sqls = []
                self.logger.info('%s has been logged.' % counter)
        if len(batch_sqls) > 0:
            self.mssql.execute('''INSERT INTO META_SCD_CHANGE_LOG(%s) %s'''
                               % (columns, ' UNION ALL '.join(batch_sqls)))
            self.logger.info('%s has been logged.' % counter)
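    # Shape of the SQL emitted by save_to_db with log_detail=True: each batch is
    # one multi-row INSERT built from UNION ALL'ed SELECTs, along the lines of
    #   INSERT INTO META_SCD_CHANGE_LOG(TABLE_DEF_ID, PK_VALUE, ACTION, RECORD, UPDATE_TIME, ACTIONER)
    #   SELECT 7, '{"ID": 1}', 'UPDATE', '{"ID": 1, "NAME": "a"}', '2020-01-01 00:00:00', 'jdoe'
    #   UNION ALL SELECT 7, '{"ID": 2}', 'UPDATE', '{"ID": 2, "NAME": "b"}', '2020-01-01 00:00:00', 'jdoe'
    # (the literal values 7, 'jdoe' and the dates are illustrative only).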
    @Timer()
    def save_data_to_be_changed(self, table_def, where_condition):
        if table_def.DB_TYPE == 'MSSQL':
            table_schema = 'dbo' if table_def.TABLE_SCHEMA == '(COMMON)' else table_def.TABLE_SCHEMA
            table_ddl_def = self.get_mssql_table_ddl_def(
                self.meta['db_conn_mssql_dbname'], table_def.TABLE_NAME, table_schema)
        else:  # Vertica
            table_schema = (self.meta['db_conn_vertica_common_schema']
                            if table_def.TABLE_SCHEMA == '(COMMON)' else table_def.TABLE_SCHEMA)
            table_ddl_def = self.get_vertica_table_ddl_def(table_def.TABLE_NAME, table_schema)
        if not self.log_detail:
            # Only dump the primary key columns.
            pk = json.loads(table_def.PK)
            table_ddl_def = dict([(c, table_ddl_def[c]) for c in table_ddl_def if c in pk])
        self.logger.info(table_ddl_def)
        query_columns = ','.join(['"%s"' % c if ' ' in c else c for c in list(table_ddl_def.keys())])
        pull_data_sql = 'SELECT %s FROM %s.%s %s' % (query_columns, table_schema,
                                                     table_def.TABLE_NAME, where_condition)
        self.logger.info(pull_data_sql)
        rs = self.target_conn.query(pull_data_sql)
        recs = []
        for r in rs:
            rec = {}
            for c in list(table_ddl_def.keys()):
                # rec[c] = str(eval("r."+c)) if 'TIME' not in table_ddl_def[c].upper() and 'DATE' not in table_ddl_def[c].upper() else eval("r."+c).strftime("%Y-%m-%d %H:%M:%S")
                rec[c] = str(r.__getattribute__(c))
            recs.append(rec.copy())
        self.saved_records_count = len(recs)
        self.logger.info('%s records will be saved.' % self.saved_records_count)
        self.save_to_db(table_def, recs)

    @Timer()
    def execute_sql(self):
        self.target_conn.execute(self.sql)

    def main(self):
        table_def = None
        saved = False
        try:
            try:
                table_name = self.get_table_name()
                raw_table_name = self.get_raw_table_name(table_name)
                table_def = self.get_table_def(raw_table_name)
                if self.cmd != 'INSERT':
                    where_condition = self.get_where_condition()
                    self.save_data_to_be_changed(table_def, where_condition)
                    saved = True
            except Exception:
                self.logger.warning(traceback.format_exc())
                self.logger.warning('Failed to save the to-be-changed data; executing the SQL anyway.')
            self.execute_sql()
            return ('(%(db_type)s)%(table_schema)s.%(table_name)s - impacted %(row_count)s rows, saved %(status)s'
                    % {
                        'db_type': table_def.DB_TYPE if table_def else 'N/A',
                        'table_schema': table_def.TABLE_SCHEMA if table_def else 'N/A',
                        'table_name': table_def.TABLE_NAME if table_def else 'N/A',
                        'row_count': self.saved_records_count,
                        'status': 'N/A' if self.cmd == 'INSERT' else
                                  'successfully' if saved else 'unsuccessfully',
                    })
        finally:
            if self.mssql:
                self.mssql.close_connection()
            if self.vtc:
                self.vtc.close_connection()
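
# Usage sketch (assumes `meta` carries the same connection settings used
# elsewhere in this codebase, e.g. db_conn_mssql_dbname and
# db_conn_vertica_common_schema; the values below are placeholders):
if __name__ == '__main__':
    meta = {
        'db_conn_mssql_dbname': '<mssql_db>',          # placeholder
        'db_conn_vertica_common_schema': '<schema>',   # placeholder
    }
    scd = Scd(sql="UPDATE DIM_STORE SET STORE_ID = 'S1' WHERE STORE_KEY = 42",
              actioner='jdoe',
              meta=meta,
              db_type='VERTICA',
              table_schema='(COMMON)')
    # Logs the to-be-changed rows into META_SCD_CHANGE_LOG, then runs the UPDATE.
    print(scd.main())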