Beispiel #1
0
 def __init__(self, _snapshot_date=None, _config: ConfigObj = None):
     """
     Set up the channel: DateTimeManager, config, Sales object and the default cores/sources.

     :param _snapshot_date: forwarded unchanged to DateTimeManager
     :param _config: ConfigObj forwarded to DateTimeManager. When omitted, a fresh ConfigObj()
         is created for this instance.
     """
     # A ConfigObj() default written in the signature is evaluated once, at definition time,
     # and would be shared by every instance created without an explicit config.
     if _config is None:
         _config = ConfigObj()
     self.dtm = DateTimeManager(_snapshot_date=_snapshot_date,
                                _dt_col="d_date",
                                _dt_format="%Y-%m-%d %H:%M:%S",
                                _date_format="%Y-%m-%d",
                                _config=_config,
                                _partition_col="p_yyyymm",
                                _partition_dt_format="%Y%m")
     self.dtm.append_periods(["1m", "3m", "6m", "12m"])
     # From here on use the config handed back by the DateTimeManager, not the raw argument.
     self.config = self.dtm.get_config()
     Channel.__init__(self, "Store", self.dtm, self.config)
     self.sales = Sales(self.config)
     self._create_default_cores()
     self._create_default_sources()
Beispiel #2
0
    def _create_from_daterange(
        cls,
        dtm: DateTimeManager,
        time_helpers: ConfigObj,
        period_ranges="date_filters.ranges",
    ):
        """
        Build a Multiplier holding one filter per named date range found in the config.

        :param dtm: DateTimeManager used to build each filter: scoped_time_filter when the
            snapshot type is "DAILY", scoped_partition_filter otherwise
        :param time_helpers: config object from which the following are read
            - the section at ``period_ranges``: mapping of range name -> range dict, e.g.
              {"1m": {"start": <val>, "end": <val>}, "3m": {"start": <val>, "end": <val>}}
            - partition_col (falls back to "")
            - snapshot_type (falls back to "MONTH")
            - date_col (only read for "DAILY" snapshots)
        :param period_ranges: config path of the section holding the named ranges
        :return: Multiplier(filter_cols, [range dicts], [one single-filter list per range],
            [range names])
        """
        ranges_by_name = time_helpers.get_config(period_ranges).as_dict()
        partition_col = time_helpers.get_or_else("partition_col", "")
        range_names = list(ranges_by_name.keys())

        snapshot_type = time_helpers.get_or_else("snapshot_type", "MONTH")
        if snapshot_type == "DAILY":
            # NOTE(review): when date_col is missing, the fallback below (an error message)
            # ends up being used as the column name instead of raising -- confirm intent.
            date_col = time_helpers.get_or_else(
                "date_col", "No date col specified in the time_helpers config."
                "Daily snapshots require a date_col to be defined")
            filter_cols = [partition_col, date_col]
            range_dicts = [ranges_by_name[name] for name in range_names]
            range_filters = [
                [dtm.scoped_time_filter(range_dict=span)] for span in range_dicts
            ]
        else:
            filter_cols = [partition_col]
            range_dicts = [ranges_by_name[name] for name in range_names]
            range_filters = [
                [dtm.scoped_partition_filter(range_dict=span)] for span in range_dicts
            ]

        return Multiplier(filter_cols, [range_dicts], [range_filters],
                          [range_names])
Beispiel #3
0
class Catalog(Channel):
    """
    Channel that registers the tomes_tpcds_1tb catalog_sales_enhanced / catalog_returns_enhanced
    tables as cores and the item, inventory and date_dim tables as sources.
    """

    def __init__(self, _snapshot_date=None, _config: ConfigObj = None):
        """
        Set up the channel: DateTimeManager, config, Sales object and the default cores/sources.

        :param _snapshot_date: forwarded unchanged to DateTimeManager
        :param _config: ConfigObj forwarded to DateTimeManager. When omitted, a fresh
            ConfigObj() is created for this instance.
        """
        # A ConfigObj() default written in the signature is evaluated once, at definition time,
        # and would be shared by every instance created without an explicit config.
        if _config is None:
            _config = ConfigObj()
        self.dtm = DateTimeManager(_snapshot_date=_snapshot_date,
                                   _dt_col="d_date",
                                   _dt_format="%Y-%m-%d %H:%M:%S",
                                   _date_format="%Y-%m-%d",
                                   _config=_config,
                                   _partition_col="p_yyyymm",
                                   _partition_dt_format="%Y%m")
        self.dtm.append_periods(["1m", "3m", "6m", "12m"])
        # From here on use the config handed back by the DateTimeManager, not the raw argument.
        self.config = self.dtm.get_config()
        # NOTE(review): the channel is registered under the name "Store" although this class
        # is Catalog -- looks like a copy/paste leftover. Left unchanged because the name may
        # be relied on elsewhere; confirm before renaming.
        Channel.__init__(self, "Store", self.dtm, self.config)
        self.sales = Sales(self.config)
        self._create_default_cores()
        self._create_default_sources()
        # NOTE(review): the nested _GroupBy helper is not attached to the instance here.

    def Sales(self):
        """
        Create a new Sales object from self.config, store it on self.sales and return it.

        :return: the newly created Sales object
        """
        self.sales = Sales(self.config)
        return self.sales

    def Trends(self, featureSet_to_trend, trend_ranges, _dtm=None, _config: ConfigObj = None):
        """
        Trends is a method that constructs the TrendsCommon generic FeatureFamily. It returns features, slope and y-intercept
        such that the user can understand the trend line and ideally future values for various features.
        :param featureSet_to_trend: A feature set to be trended
        :param trend_ranges: list of ranges to be trended [['1m','12m'], ['1w', '4w']]. Accepted range periods are
        d = day
        w = week
        m = month
        y = year
        :param _dtm: A specific dtm can be passed in but generally the dtm from self should be used
        :param _config: A specific configObj can be passed in but generally the config is taken from self
        :return: the TrendsCommon object built from the arguments
        """
        # Explicit arguments win; otherwise fall back to the instance's own config / dtm.
        config = self.config if _config is None else _config
        time_helpers = config.get_config("time_helpers")
        dtm = self.dtm if _dtm is None else _dtm
        return TrendsCommon(featureSet_to_trend, trend_ranges, dtm, time_helpers)

    def _create_default_cores(self):
        """
        Register the catalog sales and catalog returns tables as cores.

        Any exception raised while reading or registering is logged as a warning and its
        traceback is printed to stdout; it is not re-raised. Both tables share one try block,
        so a failure on the first one skips the second.

        :return: self.cores
        """
        try:
            df = spark.read.table("tomes_tpcds_1tb.catalog_sales_enhanced")
            # presumably the third argument lists the partition columns -- TODO confirm
            self.add_core("catalog_sales", df, ["p_yyyymm"])
            df = spark.read.table("tomes_tpcds_1tb.catalog_returns_enhanced")
            self.add_core("catalog_returns", df, ["p_yyyymm"])
        except Exception as e:
            logger.warning("Error loading default cores. {}".format(str(e)))
            traceback.print_exc(file=sys.stdout)

        return self.cores

    def _create_default_sources(self):
        """
        Register the item, inventory and date_dim tables as sources.

        Any exception raised while reading or registering is logged as a warning and its
        traceback is printed to stdout; it is not re-raised. All tables share one try block,
        so a failure skips the tables that come after it.

        :return: self.sources
        """
        try:
            df = spark.read.table("tomes_tpcds_1tb.item")
            self.add_source("item", df, [])
            df = spark.read.table("tomes_tpcds_1tb.inventory")
            self.add_source("inventory", df, [])
            df = spark.read.table('tomes_tpcds_1tb.date_dim')
            self.add_source('date', df, [])

        except Exception as e:
            logger.warning("Error loading default sources. {}".format(str(e)))
            traceback.print_exc(file=sys.stdout)

        return self.sources

    class _GroupBy:
        """
        Chainable collector of group-by columns for a catalog channel.
        """

        def __init__(self, _catalog):
            """
            :param _catalog: object stored on self._catalog (presumably the owning Catalog)
            """
            self.helpers = Helpers()
            self._groupby_cols = []
            self._catalog = _catalog

        def trans_date(self):
            """
                GroupBy cs_sold_date_sk and name column 'SOLD_DATE'
            :return: self, so calls can be chained
            """
            self._groupby_cols.append(col("cs_sold_date_sk").alias("SOLD_DATE"))
            return self

        def warehouse_id(self):
            """
                GroupBy cs_warehouse_sk and name column 'WAREHOUSE_ID'
            :return: self, so calls can be chained
            """
            self._groupby_cols.append(col("cs_warehouse_sk").alias("WAREHOUSE_ID"))
            return self