Beispiel #1
0
    def _create_feature(self, frame):
        """
        Build (or reuse) the feature described by the arguments of ``frame``.

        The argument names and values of ``frame`` are read with
        ``inspect.getargvalues``. When ``self._get_feature`` already returns a
        feature for the requested ``_name`` it is registered again unchanged;
        otherwise a new ``Feature`` is created from the captured arguments,
        falling back to defaults for any argument that is absent.

        :param frame: frame object whose arguments describe the feature; they
            must contain ``_name`` and, when a new feature is created,
            ``self`` and ``_base_col``
        :return: the feature stored in ``self._features`` under ``_name``
        """
        arg_names, _, _, frame_values = inspect.getargvalues(frame)
        # Deliberately not called ``dict``: that would shadow the builtin.
        call_args = {arg: frame_values[arg] for arg in arg_names}
        _name = call_args["_name"]

        exist_feat = self._get_feature(_name)
        if exist_feat:
            self._features[_name] = exist_feat
        else:
            del call_args["self"]
            call_args.pop("_col_alias", None)
            base_col = call_args["_base_col"]
            # Strings (including str subclasses) are wrapped with col(),
            # every other value with lit().
            call_args["_base_col"] = (col(base_col) if isinstance(
                base_col, str) else lit(base_col))
            input_config = ConfigObj(call_args)
            self._features[_name] = Feature(
                _name=input_config.get_or_else("_name", ""),
                _base_col=input_config.get_or_else("_base_col", None),
                _filter=input_config.get_or_else("_filter", []),
                _negative_value=input_config.get_or_else("_negative_value", 0),
                _agg_func=input_config.get_or_else("_agg_func", None),
                _agg_alias=input_config.get_or_else("_agg_alias", None),
                _joiners=input_config.get_or_else("_joiners", {}),
                _kind=input_config.get_or_else("_kind", "multipliable"))

        return self._features[_name]
Beispiel #2
0
 def test_add_config(self):
     """A dict added under a dotted key is reachable through dotted lookup."""
     settings = {"setting1": "value1", "setting2": "value2"}
     cfg = ConfigObj({})
     cfg.add("namespace1.namespace2", settings)

     found = cfg.get_or_else("namespace1.namespace2.setting1", "")
     assert found == "value1"
Beispiel #3
0
 def test_get_config(self):
     """get_config yields a nested config for a section and a value for a leaf."""
     expected = [1, 2, 3, 4, 5]
     config_obj = ConfigObj({"a": 1, "b": {"c": [1, 2, 3, 4, 5]}})

     section = config_obj.get_config("b")
     assert section.c == expected, "get_config should return a config obj."

     leaf = config_obj.get_config("b.c")
     assert leaf == expected, "get_config should return a value."

     # "c" is not a top-level key; the result is only printed, not asserted.
     missing = config_obj.get_config("c")
     print("conf: {}".format(missing))
Beispiel #4
0
    def __init__(self,
                 _snapshot_date,
                 _dt_col: str,
                 _dt_format: str,
                 _date_format: str,
                 _config: ConfigObj,
                 _partition_col=None,
                 _partition_dt_format=None):
        """
        Resolve the snapshot date and publish the time settings into the config.

        The snapshot date is taken, in order of precedence, from the config key
        ``time_helpers.snapshot_date``, from ``_snapshot_date``, or from today's
        date. If the config has no ``time_helpers.snapshot_type`` it is set to
        "MONTH" when the snapshot date is the last day of its month and to
        "DAILY" otherwise.

        :param _snapshot_date: snapshot date as a string in ``_date_format``;
            None means "today" unless the config already holds a snapshot date
        :param _dt_col: datetime column if relevant, otherwise just use date column
        :param _dt_format: common datetime format in python str format
            this is generally the column in the date_dim that is used throughout
            the codebase; any code using a different format will need to use the
            clean_time function
        :param _date_format: common date format in python str format
        :param _config: config object passed in; it is updated in place with
            the ``time_helpers.*`` settings
        :param _partition_col: partition column if dataset[s] are partitioned on date/datetime
        :param _partition_dt_format: python format string for the datetime format
            used for partition col. This only works if the partition col is date type
        """
        self.date_time_col = _dt_col
        self.date_time_format = _dt_format
        self.date_format = _date_format
        self.partition_col = _partition_col
        self.partition_dt_format = _partition_dt_format
        self.partition_range = {}
        self.config = _config
        self.dt_tracker = ConfigObj()

        if self.config.contains("time_helpers.snapshot_date"):
            # A snapshot date already stored in the config wins over the argument.
            self.snapshot_date = self.config.get_or_else(
                "time_helpers.snapshot_date", "")
        elif _snapshot_date is None:
            self.snapshot_date = date.today().strftime(_date_format)
        else:
            self.snapshot_date = _snapshot_date

        self.snapshot_date_dt = datetime.strptime(self.snapshot_date,
                                                  self.date_format).date()
        self.end = self.snapshot_date_dt.strftime(self.date_format)
        self.all_dates = set()

        if not self.config.contains("time_helpers.snapshot_type"):
            # monthrange()[1] is the number of days in the snapshot's month.
            mo_max_day = calendar.monthrange(self.snapshot_date_dt.year,
                                             self.snapshot_date_dt.month)[1]
            snapshot_type = "MONTH" if mo_max_day == self.snapshot_date_dt.day else "DAILY"
            self.config.add("time_helpers.snapshot_type", snapshot_type)

        self.config.add("time_helpers.snapshot_date", self.snapshot_date)
        self.config.add("time_helpers.partition_col", self.partition_col)
        self.config.add("time_helpers.date_col", self.date_time_col)
        self.config.add("time_helpers.date_col_format", self.date_format)
        self.config.add("time_helpers.partition_col_format",
                        self.partition_dt_format)
Beispiel #5
0
 def test_key_as_int(self):
     """Integer dictionary keys survive a round trip through add() and as_dict()."""
     channel_config = ConfigObj()
     # promo length -> pre-period length, both in days
     pre_periods = {7: 28, 14: 28, 21: 56}
     channel_config.add("promo_pre_periods", pre_periods)

     stored = channel_config.as_dict()["promo_pre_periods"]
     assert stored[7] == 28
Beispiel #6
0
    def Trends(self,
               featureSet_to_trend,
               trend_ranges,
               _dtm=None,
               _config: ConfigObj = None):
        """
        Construct the generic TrendsCommon FeatureFamily for a feature set.

        The family is intended to expose features, slope and y-intercept so
        the trend line (and ideally future values) of each feature can be
        understood.

        :param featureSet_to_trend: A feature set to be trended
        :param trend_ranges: list of ranges to be trended, e.g.
            [['1m','12m'], ['1w', '4w']]. Accepted range periods are
            d = day, w = week, m = month, y = year
        :param _dtm: optional date-time manager; ``self.dtm`` is used when None
        :param _config: optional ConfigObj; ``self.config`` is used when None
        :return: the TrendsCommon instance built from the "time_helpers"
            section of the chosen config
        """
        source_config = self.config if _config is None else _config
        time_helpers = source_config.get_config("time_helpers")

        date_manager = self.dtm if _dtm is None else _dtm
        return TrendsCommon(featureSet_to_trend, trend_ranges, date_manager,
                            time_helpers)
Beispiel #7
0
    def _create_from_daterange(
        cls,
        dtm: DateTimeManager,
        time_helpers: ConfigObj,
        period_ranges="date_filters.ranges",
    ):
        """
        Build a Multiplier with one filter per configured date range.

        For "DAILY" snapshots each range is filtered with
        ``dtm.scoped_time_filter`` and the filter columns are the partition
        column plus the date column; otherwise ``dtm.scoped_partition_filter``
        is used and only the partition column is listed.

        :param dtm: date-time manager used to build the filters
        :param time_helpers: config holding ``snapshot_type``, ``partition_col``,
            ``date_col`` and the ranges found under ``period_ranges``; each
            range is passed to the filter builders as ``range_dict``
        :param period_ranges: dotted config key of the ranges section
        :return: Multiplier built from the filter columns, range values,
            filters and range names
        """
        ranges = time_helpers.get_config(period_ranges).as_dict()
        partition_col = time_helpers.get_or_else("partition_col", "")
        range_names = list(ranges.keys())

        if time_helpers.get_or_else("snapshot_type", "MONTH") == "DAILY":
            date_col = time_helpers.get_or_else(
                "date_col", "No date col specified in the time_helpers config."
                "Daily snapshots require a date_col to be defined")
            columns = [partition_col, date_col]
            range_filters = [[dtm.scoped_time_filter(range_dict=bounds)]
                             for bounds in ranges.values()]
        else:
            columns = [partition_col]
            range_filters = [[dtm.scoped_partition_filter(range_dict=bounds)]
                             for bounds in ranges.values()]

        range_values = list(ranges.values())
        return Multiplier(columns, [range_values], [range_filters],
                          [range_names])
Beispiel #8
0
    def _create_joiner_df(self, joiner: dict, partner_conf: ConfigObj):
        """
        Try to resolve a joiner's ``target_join_df`` into a dataframe.

        When ``target_join_df`` is a path string such as "cores.<name>" or
        "sources.<name>", the name is looked up in the matching section of
        ``partner_conf`` and, if present, the joiner is updated in place with
        that entry's ``df``. If the section does not contain the name, or the
        path has no "." separated name part, the path string is kept so it can
        be resolved later when more dataframes are added to the partner.

        :param joiner: joiner definition; may contain ``target_join_df`` as a
            DataFrame or a path string
        :param partner_conf: config providing the ``cores`` and ``sources``
            sections
        :return: the (possibly still unresolved) ``target_join_df`` value, or
            None when the joiner has no ``target_join_df``
        """
        if "target_join_df" not in joiner:
            return None

        target = joiner["target_join_df"]
        if isinstance(target, DataFrame):
            return target

        df_parts = [part.strip() for part in target.split(".")]
        cores = partner_conf.get_or_else("cores", {})
        sources = partner_conf.get_or_else("sources", {})
        # Any prefix other than "cores" is looked up in the sources.
        data_source = cores if df_parts[0] == "cores" else sources
        # A path without a name part cannot be looked up; keep it unresolved
        # instead of failing with IndexError.
        if len(df_parts) > 1 and df_parts[1] in data_source:
            joiner["target_join_df"] = data_source[df_parts[1]].df

        return joiner["target_join_df"]
Beispiel #9
0
 def test_contains_config(self):
     """contains() accepts existing dotted paths and rejects missing ones."""
     config_obj = ConfigObj({"a": 1, "b": {"c": [1, 2, 3, 4, 5]}})

     has_nested = config_obj.contains("b.c")
     assert has_nested, "config should contains b.c"

     has_too_deep = config_obj.contains("b.c.e")
     assert not has_too_deep, "config should not contains b.c.e"

     has_top_level = config_obj.contains("a")
     assert has_top_level, "config should contains a"

     has_unknown = config_obj.contains("d")
     assert not has_unknown, "config should not contains d"
Beispiel #10
0
 def test_add_config(self):
     """add() creates nested keys readable by attribute and by dotted lookup."""
     config_obj = ConfigObj({"a": 1, "b": {"c": [1, 2, 3, 4, 5]}})
     config_obj.add("b.c2", 2)
     config_obj.add("e1.e2.e3", 123)
     print(config_obj.configs)

     # The new key must not disturb its existing sibling.
     assert config_obj.b.c2 == 2, "should add config to config obj."
     assert config_obj.b.c == [1, 2, 3, 4, 5], "should add config to config obj."

     assert config_obj.e1.e2.e3 == 123, "should be able to chain add."

     looked_up = config_obj.get_or_else("e1.e2.e3", None)
     assert looked_up == 123, "should be able to get config value."

     fallback = config_obj.get_or_else("e1.e22", "default")
     assert fallback == "default", "default value if key does not exist."
Beispiel #11
0
 def __init__(self, _snapshot_date=None, _config: ConfigObj = None):
     """
     Set up the "Store" channel with its date-time manager and defaults.

     :param _snapshot_date: snapshot date string in "%Y-%m-%d" format, or None
         to let the DateTimeManager choose it
     :param _config: config object to use; a fresh ConfigObj is created when
         None
     """
     # The DateTimeManager writes into the config it receives, so a default
     # built in the signature would be shared (and polluted) across instances.
     if _config is None:
         _config = ConfigObj()

     self.dtm = DateTimeManager(_snapshot_date=_snapshot_date,
                                _dt_col="d_date",
                                _dt_format="%Y-%m-%d %H:%M:%S",
                                _date_format="%Y-%m-%d",
                                _config=_config,
                                _partition_col="p_yyyymm",
                                _partition_dt_format="%Y%m")
     self.dtm.append_periods(["1m", "3m", "6m", "12m"])
     self.config = self.dtm.get_config()
     Channel.__init__(self, "Store", self.dtm, self.config)
     self.sales = Sales(self.config)
     self._create_default_cores()
     self._create_default_sources()
Beispiel #12
0
    def __init__(self, _featureSet_to_trend, _trend_period_ranges,
                 _dtm: DateTimeManager, _time_helprs_config=None):
        """
        Store the trending inputs and initialise the FeatureFamily base.

        :param _featureSet_to_trend: feature set to be trended
        :param _trend_period_ranges: period ranges to trend over
        :param _dtm: This dtm needs to contain a single type of period such as
            only w or only m; passing in mixed ranges is not supported for
            trending yet
        :param _time_helprs_config: time helpers config; a fresh ConfigObj is
            created when None
        """
        # Avoid a signature-level ConfigObj() default: it would be one object
        # shared by every instance created without an explicit config.
        if _time_helprs_config is None:
            _time_helprs_config = ConfigObj()

        self.time_helpers_config = _time_helprs_config
        self.dtm = _dtm
        self.featureSet_to_trend = _featureSet_to_trend
        self.trend_period_ranges = _trend_period_ranges

        FeatureFamily.__init__(self, _time_helprs_config)
Beispiel #13
0
    def __init__(self,
                 _name,
                 _dtm: DateTimeManager,
                 _config: ConfigObj = None):
        """
        Initialise a channel from its name, date-time manager and config.

        The partition bounds are read from the config keys
        ``time_helpers.partition_lower`` / ``time_helpers.partition_upper``.
        When absent they default to one year ago and today, formatted with
        ``_dtm.partition_dt_format``. Empty ``cores`` and ``sources`` sections
        are added to the config if it does not have them yet.

        :param _name: channel name
        :param _dtm: date-time manager providing the snapshot date and the
            partition datetime format
        :param _config: config object to use; it is updated in place. A fresh
            ConfigObj is created when None
        """
        # The config is mutated below, so a signature-level ConfigObj()
        # default would leak cores/sources between unrelated channels.
        if _config is None:
            _config = ConfigObj()

        self.dtm = _dtm
        self.config = _config
        self.channel_name = _name

        # Take the final snapshot from the date-time manager
        self.snapshot_date = _dtm.snapshot_date
        self.snapshot_date_dt = _dtm.snapshot_date_dt

        # TODO - Improve partitions to handle nested partitions
        # Get the partition if it exists
        today = datetime.today()
        self.partition_start = self.config.get_or_else(
            "time_helpers.partition_lower",
            (today - relativedelta(years=1)).strftime(
                _dtm.partition_dt_format))
        self.partition_end = self.config.get_or_else(
            "time_helpers.partition_upper",
            today.strftime(_dtm.partition_dt_format))
        self.groupby_cols = self.config.get_or_else("_groupby_cols", [])

        self._features = OrderedDict()
        if not self.config.contains("cores"):
            self.config.add("cores", {})
        if not self.config.contains("sources"):
            self.config.add("sources", {})
        self.cores = self.config.get_or_else("cores", None)
        self.sources = self.config.get_or_else("sources", None)

        self.ff = Feature_Factory()
        self.helpers = Helpers()
Beispiel #14
0
    def test_merge_config(self):
        """merge() accepts a dict or a ConfigObj and leaves conflicting keys alone."""

        def fresh_config():
            # A new literal each time so no state is shared between the cases.
            return ConfigObj({"a": 1, "b": {"c": [1, 2, 3, 4, 5]}})

        # Case 1: merge a plain dict.
        config_obj = fresh_config()
        config_obj.merge({"e1": {"e2": 12}})
        assert config_obj.e1.e2 == 12, "error merging two configs"
        config_obj.print()

        # Case 2: merge another ConfigObj.
        config_obj = fresh_config()
        other = ConfigObj({"e1": {"e2": 12}})
        config_obj.merge(other)
        assert config_obj.e1.e2 == 12, "error merging two configs"

        # Case 3: "a" already exists, so the top-level key count must stay at 2.
        config_obj = fresh_config()
        conflicting = ConfigObj({"a": {"e2": 12}})
        config_obj.merge(conflicting)
        key_count = len(config_obj.configs)
        assert key_count == 2, "merge should fail because of conflicts in keys."
Beispiel #15
0
 def __init__(self, config=None):
     """
     Bind the feature/joiner builders and initialise the FeatureFamily base.

     :param config: config object handed to ``FeatureFamily.__init__``; a
         fresh ConfigObj is created when None
     """
     # Avoid a signature-level ConfigObj() default: it would be one object
     # shared by every instance created without an explicit config.
     if config is None:
         config = ConfigObj()

     self._multipliable_feat_func = multipliable
     self._base_feat_func = base_feat
     self._joiner_func = joiner_func
     FeatureFamily.__init__(self, config)
Beispiel #16
0
 def __init__(self, config=None):
     """
     Store the config and bind the joiner/feature builders.

     ``FeatureFamily.__init__`` is intentionally not called here.

     :param config: config object kept on ``self.config``; a fresh ConfigObj
         is created when None
     """
     # Avoid a signature-level ConfigObj() default: every instance created
     # without a config would otherwise hold the very same object.
     if config is None:
         config = ConfigObj()

     self.config = config
     self._joiner_func = joiner_func
     self._feat_func = feat_func
Beispiel #17
0
 def _add_joiner(self, join_key: str, partner_conf: ConfigObj):
     """
     Register the joiner stored under ``join_key`` unless it is already known.

     :param join_key: key of the joiner inside ``partner_conf``
     :param partner_conf: config to read the joiner from ({} when missing)
     """
     candidate = partner_conf.get_or_else(join_key, {})
     if join_key not in self.joiners:
         self.joiners[join_key] = candidate
         return
     print("{} has been added already.".format(join_key))
Beispiel #18
0
class DateTimeManager:
    """
    Resolve a snapshot date, derive look-back periods from it and build
    date / partition filters for those periods.

    The resolved settings are published into the supplied config under
    ``time_helpers.*``.
    """

    # Period suffix -> ``relativedelta`` keyword used to step back in time.
    _PERIOD_UNITS = {"m": "months", "w": "weeks", "d": "days", "y": "years"}

    def __init__(self,
                 _snapshot_date,
                 _dt_col: str,
                 _dt_format: str,
                 _date_format: str,
                 _config: ConfigObj,
                 _partition_col=None,
                 _partition_dt_format=None):
        """
        Resolve the snapshot date and publish the time settings into the config.

        The snapshot date is taken, in order of precedence, from the config key
        ``time_helpers.snapshot_date``, from ``_snapshot_date``, or from today's
        date. If the config has no ``time_helpers.snapshot_type`` it is set to
        "MONTH" when the snapshot date is the last day of its month and to
        "DAILY" otherwise.

        :param _snapshot_date: snapshot date as a string in ``_date_format``;
            None means "today" unless the config already holds a snapshot date
        :param _dt_col: datetime column if relevant, otherwise just use date column
        :param _dt_format: common datetime format in python str format
            this is generally the column in the date_dim that is used throughout
            the codebase; any code using a different format will need to use the
            clean_time function
        :param _date_format: common date format in python str format
        :param _config: config object passed in; it is updated in place with
            the ``time_helpers.*`` settings
        :param _partition_col: partition column if dataset[s] are partitioned on date/datetime
        :param _partition_dt_format: python format string for the datetime format
            used for partition col. This only works if the partition col is date type
        """
        self.date_time_col = _dt_col
        self.date_time_format = _dt_format
        self.date_format = _date_format
        self.partition_col = _partition_col
        self.partition_dt_format = _partition_dt_format
        self.partition_range = {}
        self.config = _config
        self.dt_tracker = ConfigObj()

        if self.config.contains("time_helpers.snapshot_date"):
            # A snapshot date already stored in the config wins over the argument.
            self.snapshot_date = self.config.get_or_else(
                "time_helpers.snapshot_date", "")
        elif _snapshot_date is None:
            self.snapshot_date = date.today().strftime(_date_format)
        else:
            self.snapshot_date = _snapshot_date

        self.snapshot_date_dt = datetime.strptime(self.snapshot_date,
                                                  self.date_format).date()
        self.end = self.snapshot_date_dt.strftime(self.date_format)
        self.all_dates = set()

        if not self.config.contains("time_helpers.snapshot_type"):
            # monthrange()[1] is the number of days in the snapshot's month.
            mo_max_day = calendar.monthrange(self.snapshot_date_dt.year,
                                             self.snapshot_date_dt.month)[1]
            snapshot_type = "MONTH" if mo_max_day == self.snapshot_date_dt.day else "DAILY"
            self.config.add("time_helpers.snapshot_type", snapshot_type)

        self.config.add("time_helpers.snapshot_date", self.snapshot_date)
        self.config.add("time_helpers.partition_col", self.partition_col)
        self.config.add("time_helpers.date_col", self.date_time_col)
        self.config.add("time_helpers.date_col_format", self.date_format)
        self.config.add("time_helpers.partition_col_format",
                        self.partition_dt_format)

    def get_config(self):
        """Return the config object this manager reads from and writes to."""
        return self.config

    def _track_date(self, period, dt):
        """
        Append ``dt`` to the tracker list ``<period>.all_dates``.

        :param period: period type key ("m", "w", "d" or "y")
        :param dt: date string to record
        """
        key = "{}.all_dates".format(period)
        if self.dt_tracker.contains(key):
            known = self.dt_tracker.get_or_else(key, "")
            self.dt_tracker.add(key, known + [dt])
        else:
            self.dt_tracker.add(key, [dt])

    def _append_ranges(self):
        """Store the min and max tracked date of every period type in the tracker."""
        for period, dates in self.dt_tracker.configs.items():
            mm = self._get_min_max_dt_from_list(dates['all_dates'],
                                                self.date_format)
            self.dt_tracker.add("{}.min".format(period),
                                mm[0].strftime(self.date_format))
            self.dt_tracker.add("{}.max".format(period),
                                mm[1].strftime(self.date_format))

    def _append_trend_ranges(self, trend_period_ranges):
        """
        Append every period inside each trend range.

        :param trend_period_ranges: list of [first, last] period pairs such as
            [["1m", "12m"]]; the period type is taken from the first entry and
            every length from first to last (inclusive) is appended under
            ``time_helpers.trend_filters.<type>``
        """
        for rng in trend_period_ranges:  # [["1m","12m"]]
            first_len = int(rng[0][:-1])
            last_len = int(rng[1][:-1])
            period_type = rng[0][-1:]
            trend_period = [
                "{}{}".format(num, period_type)
                for num in range(first_len, last_len + 1)
            ]
            self.append_periods(
                trend_period,
                "time_helpers.trend_filters.{}".format(period_type))

    def _append_period(self, period, cfg_prefix):
        """
        Register one look-back period such as "3m" in config and tracker.

        :param period: "<length><type>" string; type is one of m, w, d, y
        :param cfg_prefix: config prefix under which the range is stored
        """
        length = period[:-1]
        period_type = period[-1:]

        step_back = relativedelta(
            **{self._PERIOD_UNITS[period_type]: int(length)})
        start = (self.snapshot_date_dt - step_back).strftime(self.date_format)
        self.all_dates.add(start)
        self._track_date(period_type, start)
        self.config.add(
            "{}.ranges.{}{}".format(cfg_prefix, length, period_type),
            {"start": start, "end": self.end})

        # Track the periods that were appended for later use
        periods_key = "{}.periods".format(period_type)
        if self.dt_tracker.contains(periods_key):
            known = self.dt_tracker.get_or_else(periods_key, "")
            self.dt_tracker.add(periods_key, known + [int(length)])
        else:
            self.dt_tracker.add(periods_key, [int(length)])

    def append_periods(self, periods, cfg_prefix="time_helpers.date_filters"):
        """
        Add look-back periods ending at the snapshot date.

        Each period yields a ``{"start", "end"}`` range stored under
        ``<cfg_prefix>.ranges.<period>``. Afterwards the partition bounds
        ``time_helpers.partition_lower`` / ``partition_upper`` are filled in
        from the known dates unless the config already defines them.

        :param periods: period strings such as ["1m", "4w", "30d", "1y"]
        :param cfg_prefix: config prefix under which the ranges are stored
        :raises AssertionError: if a period does not end in m, w, d or y
        """
        date_format = self.date_format
        config = self.config
        self.all_dates.add(self.end)

        # Validate everything first so a bad entry leaves the config untouched.
        for p in periods:
            assert p[-1:] in self._PERIOD_UNITS, \
                "Only m,w,d,y accepted as period types for insertion"

        for period in periods:
            self._append_period(period, cfg_prefix)
        self._append_ranges()

        # If config is missing min/max partition values, insert them
        # Otherwise set the min/max of this to what's in the config
        all_dates_mm = self._get_min_max_dt_from_list(self.all_dates,
                                                      date_format)
        if not config.contains("time_helpers.partition_lower"):
            self.partition_range['start'] = all_dates_mm[0].strftime(
                self.partition_dt_format)
            config.add("time_helpers.partition_lower",
                       self.partition_range['start'])
        else:
            self.partition_range['start'] = config.get_or_else(
                "time_helpers.partition_lower", "partition_lower is not set")
        if not config.contains("time_helpers.partition_upper"):
            self.partition_range['end'] = all_dates_mm[1].strftime(
                self.partition_dt_format)
            config.add("time_helpers.partition_upper",
                       self.partition_range['end'])
        else:
            self.partition_range['end'] = config.get_or_else(
                "time_helpers.partition_upper", "partition_upper is not set")

    def _resolve_bounds(self, range_dict, start, end, fmt):
        """
        Work out the lower and upper datetime of a filter.

        Each bound is taken from the explicit ``start`` / ``end`` argument if
        given, else from ``range_dict``, else from the earliest / latest date
        in ``self.all_dates``.

        :return: tuple ``(min_date_dt, max_date_dt)`` of datetimes
        """
        if start is not None:
            min_date_dt = datetime.strptime(start, fmt)
        elif range_dict is not None:
            min_date_dt = datetime.strptime(range_dict['start'], fmt)
        else:
            min_date_dt = min(
                datetime.strptime(dt, fmt) for dt in self.all_dates)

        if end is not None:
            max_date_dt = datetime.strptime(end, fmt)
        elif range_dict is not None:
            max_date_dt = datetime.strptime(range_dict['end'], fmt)
        else:
            max_date_dt = max(
                datetime.strptime(dt, fmt) for dt in self.all_dates)

        return min_date_dt, max_date_dt

    def scoped_partition_filter(self,
                                range_dict=None,
                                start=None,
                                end=None,
                                partition_col=None,
                                input_fmt=None,
                                partition_col_fmt=None,
                                snapshot_type=None):
        """
        Build a ``between`` filter on the partition column.

        :param range_dict: dict with "start" and "end" date strings
        :param start: lower bound date string; overrides ``range_dict``
        :param end: upper bound date string; overrides ``range_dict``
        :param partition_col: partition column name; defaults to the config
            value ``time_helpers.partition_col``
        :param input_fmt: format of the date strings; defaults to
            ``self.date_format``
        :param partition_col_fmt: format of the partition values; defaults to
            ``self.partition_dt_format``
        :param snapshot_type: "MONTH" or "DAILY"; defaults to the config value
            ``time_helpers.snapshot_type``. For "MONTH" the lower bound is
            moved to the first day of its month.
        :return: the filter expression built with ``col(...).between(...)``
        :raises AssertionError: if ``snapshot_type`` is neither value
        """
        if snapshot_type is None:
            snapshot_type = self.config.get_or_else(
                "time_helpers.snapshot_type", "MONTH")

        assert (snapshot_type == "MONTH" or snapshot_type == "DAILY"), \
            "snapshot_type must be DAILY or MONTH, got {}".format(snapshot_type)

        fmt = self.date_format if input_fmt is None else input_fmt

        if partition_col_fmt is None:
            partition_col_fmt = self.partition_dt_format

        min_date_dt, max_date_dt = self._resolve_bounds(
            range_dict, start, end, fmt)

        if partition_col is None:
            partition_col = self.config.get_or_else(
                "time_helpers.partition_col",
                "No time_helpers.partition_col specified in config and "
                "no partition column passed into function")

        if snapshot_type == "MONTH":
            # Include the whole first month of the range.
            min_date_dt = min_date_dt.replace(day=1)

        return col(partition_col).between(
            min_date_dt.strftime(partition_col_fmt),
            max_date_dt.strftime(partition_col_fmt))

    def scoped_time_filter(self,
                           range_dict=None,
                           start=None,
                           end=None,
                           _fmt=None):
        """
        Build a ``between`` filter on the datetime column.

        When a partition column is configured the matching partition filter
        is and-ed onto the result.

        :param range_dict: dict with "start" and "end" date strings
        :param start: lower bound date string; overrides ``range_dict``
        :param end: upper bound date string; overrides ``range_dict``
        :param _fmt: format of the date strings; defaults to
            ``self.date_format``
        :return: the combined filter expression
        """
        fmt = self.date_format if _fmt is None else _fmt

        min_date_dt, max_date_dt = self._resolve_bounds(
            range_dict, start, end, fmt)

        min_date_str = min_date_dt.strftime(fmt)
        max_date_str = max_date_dt.strftime(fmt)
        dt_filter = col(self.date_time_col).between(min_date_str, max_date_str)

        if self.partition_col is not None:
            snapshot_type = self.config.get_or_else(
                "time_helpers.snapshot_type", "MONTH")
            # Keyword arguments are required here: the first positional
            # parameters of scoped_partition_filter are range_dict/start/end.
            partition_filter = self.scoped_partition_filter(
                start=min_date_str,
                end=max_date_str,
                partition_col=self.partition_col,
                input_fmt=fmt,
                partition_col_fmt=self.partition_dt_format,
                snapshot_type=snapshot_type)
            dt_filter = dt_filter & partition_filter

        return dt_filter

    def _get_min_max_dt_from_list(self, l, fmt):
        """
        Parse date strings and return their earliest and latest datetime.

        :param l: iterable of date strings in ``fmt``
        :param fmt: python datetime format string
        :return: list ``[min_datetime, max_datetime]``
        """
        parsed = [datetime.strptime(dt, fmt) for dt in l]
        return [min(parsed), max(parsed)]
Beispiel #19
0
 def test_drop_config(self):
     """drop() removes a nested key, leaving its parent section empty."""
     config_obj = ConfigObj({"a": 1, "b": {"c": [1, 2, 3, 4, 5]}})

     after_drop = config_obj.drop("b.c")
     after_drop.print()

     remaining = config_obj.get_or_else("b", None)
     assert len(remaining) == 0, "b.c should be removed"