def __init__(self, _snapshot_date=None, _config: ConfigObj = None):
    """Initialize this channel with a monthly-partitioned DateTimeManager.

    Builds the DateTimeManager over the ``d_date`` column with monthly
    partitions (``p_yyyymm``), registers the standard 1/3/6/12-month
    periods, then wires up the base Channel, Sales, and the default
    cores/sources.

    :param _snapshot_date: snapshot date forwarded to DateTimeManager;
        ``None`` lets the manager choose its default.
    :param _config: optional pre-built ConfigObj. A fresh one is created
        per call when omitted (replaces the old ``ConfigObj()`` default
        argument, which was a single mutable instance shared by every
        caller that relied on the default).
    """
    if _config is None:
        _config = ConfigObj()
    self.dtm = DateTimeManager(_snapshot_date=_snapshot_date,
                               _dt_col="d_date",
                               _dt_format="%Y-%m-%d %H:%M:%S",
                               _date_format="%Y-%m-%d",
                               _config=_config,
                               _partition_col="p_yyyymm",
                               _partition_dt_format="%Y%m")
    self.dtm.append_periods(["1m", "3m", "6m", "12m"])
    self.config = self.dtm.get_config()
    # NOTE(review): channel name "Store" looks copy-pasted from another
    # channel — confirm it should not name this channel instead.
    Channel.__init__(self, "Store", self.dtm, self.config)
    self.sales = Sales(self.config)
    self._create_default_cores()
    self._create_default_sources()
def _create_from_daterange(
        cls,
        dtm: DateTimeManager,
        time_helpers: ConfigObj,
        period_ranges="date_filters.ranges",
):
    """Build a Multiplier from the date ranges described in *time_helpers*.

    :param dtm: DateTimeManager used to turn each range dict into a
        time/partition filter.
    :param time_helpers: config shaped like::

        {
            snapshot_date: "20190225",
            snapshot_type: "DAILY",
            partition_col: "p_yyyymm",
            date_ranges: {
                "1m": {"start": <val>, "end": <val>},
                "3m": {"start": <val>, "end": <val>}
            }
        }

    :param period_ranges: dotted config path of the range definitions.
    :return: Multiplier over the per-range filters.
    """
    date_ranges = time_helpers.get_config(period_ranges).as_dict()
    partition_col = time_helpers.get_or_else("partition_col", "")
    filter_names = list(date_ranges.keys())
    filter_cols = [partition_col]

    # DAILY snapshots filter on the raw date column in addition to the
    # partition column and use time-scoped filters; anything else uses
    # partition-scoped filters on the partition column alone.
    if time_helpers.get_or_else("snapshot_type", "MONTH") == "DAILY":
        # NOTE(review): this passes an error message as the *default
        # value* for "date_col" — confirm get_or_else treats it as an
        # error and not as a usable column name.
        date_col = time_helpers.get_or_else(
            "date_col",
            "No date col specified in the time_helpers config."
            "Daily snapshots require a date_col to be defined")
        filter_cols = [partition_col, date_col]
        make_filter = dtm.scoped_time_filter
    else:
        make_filter = dtm.scoped_partition_filter

    # Single loop replaces the two previously-duplicated branch bodies;
    # dict .values() preserves the same insertion order as .keys().
    filters = []
    filter_vals = []
    for range_dict in date_ranges.values():
        filters.append([make_filter(range_dict=range_dict)])
        filter_vals.append(range_dict)

    return Multiplier(filter_cols, [filter_vals], [filters], [filter_names])
class Catalog(Channel):
    """Catalog sales channel built on the TPC-DS catalog tables.

    Wires a monthly-partitioned DateTimeManager over ``d_date``, exposes
    Sales and Trends feature builders, and preloads the default cores
    (catalog sales/returns) and sources (item, inventory, date).
    """

    def __init__(self, _snapshot_date=None, _config: ConfigObj = None):
        """Initialize the Catalog channel.

        :param _snapshot_date: snapshot date forwarded to DateTimeManager;
            ``None`` lets the manager choose its default.
        :param _config: optional pre-built ConfigObj. A fresh one is
            created per call when omitted (replaces the old mutable
            ``ConfigObj()`` default argument shared across calls).
        """
        if _config is None:
            _config = ConfigObj()
        self.dtm = DateTimeManager(_snapshot_date=_snapshot_date,
                                   _dt_col="d_date",
                                   _dt_format="%Y-%m-%d %H:%M:%S",
                                   _date_format="%Y-%m-%d",
                                   _config=_config,
                                   _partition_col="p_yyyymm",
                                   _partition_dt_format="%Y%m")
        self.dtm.append_periods(["1m", "3m", "6m", "12m"])
        self.config = self.dtm.get_config()
        # NOTE(review): channel name "Store" looks copy-pasted from the
        # Store channel — confirm it should not be "Catalog".
        Channel.__init__(self, "Store", self.dtm, self.config)
        self.sales = Sales(self.config)
        self._create_default_cores()
        self._create_default_sources()

    def Sales(self):
        """Rebuild and return the Sales feature family for this channel."""
        self.sales = Sales(self.config)
        return self.sales

    def Trends(self, featureSet_to_trend, trend_ranges, _dtm=None,
               _config: ConfigObj = None):
        """Construct the TrendsCommon generic FeatureFamily.

        Returns features, slope and y-intercept so the user can
        understand the trend line and project future values for various
        features.

        :param featureSet_to_trend: a feature set to be trended.
        :param trend_ranges: list of range pairs to be trended, e.g.
            ``[['1m', '12m'], ['1w', '4w']]``. Accepted range periods:
            d = day, w = week, m = month, y = year.
        :param _dtm: a specific DateTimeManager may be passed in, but
            generally the dtm from ``self`` should be used.
        :param _config: a specific ConfigObj may be passed in, but
            generally the config is taken from ``self``.
        :return: a TrendsCommon instance.
        """
        if _config is None:
            time_helpers = self.config.get_config("time_helpers")
        else:
            time_helpers = _config.get_config("time_helpers")
        dtm = self.dtm if _dtm is None else _dtm
        return TrendsCommon(featureSet_to_trend, trend_ranges, dtm,
                            time_helpers)

    def _create_default_cores(self):
        """Load the default core tables (catalog sales and returns).

        Best-effort: failures are logged and printed, never raised, so a
        missing table does not abort construction.

        :return: the channel's core registry.
        """
        try:
            df = spark.read.table("tomes_tpcds_1tb.catalog_sales_enhanced")
            self.add_core("catalog_sales", df, ["p_yyyymm"])
            df = spark.read.table("tomes_tpcds_1tb.catalog_returns_enhanced")
            self.add_core("catalog_returns", df, ["p_yyyymm"])
        except Exception as e:
            # Message repaired: the original literal was split across a
            # raw newline; now matches _create_default_sources' format.
            logger.warning("Error loading default cores. {}".format(str(e)))
            traceback.print_exc(file=sys.stdout)
        return self.cores

    def _create_default_sources(self):
        """Load the default source tables (item, inventory, date).

        Best-effort: failures are logged and printed, never raised.

        :return: the channel's source registry.
        """
        try:
            df = spark.read.table("tomes_tpcds_1tb.item")
            self.add_source("item", df, [])
            df = spark.read.table("tomes_tpcds_1tb.inventory")
            self.add_source("inventory", df, [])
            df = spark.read.table('tomes_tpcds_1tb.date_dim')
            self.add_source('date', df, [])
        except Exception as e:
            logger.warning("Error loading default sources. {}".format(str(e)))
            traceback.print_exc(file=sys.stdout)
        return self.sources

    class _GroupBy:
        """Fluent builder that accumulates group-by columns for catalog
        sales aggregations."""

        def __init__(self, _catalog):
            self.helpers = Helpers()
            self._groupby_cols = []
            self._catalog = _catalog

        def trans_date(self):
            """GroupBy cs_sold_date and name column 'SOLD_DATE'.

            :return: self, for chaining.
            """
            self._groupby_cols.append(
                col("cs_sold_date_sk").alias("SOLD_DATE"))
            return self

        def warehouse_id(self):
            """GroupBy cs_warehouse_sk and name column 'WAREHOUSE_ID'.

            :return: self, for chaining.
            """
            self._groupby_cols.append(
                col("cs_warehouse_sk").alias("WAREHOUSE_ID"))
            return self