def _create_feature(self, frame):
    """
    Creates a feature based on the arguments passed in from frame.

    The argument names and values of the function that owns ``frame`` are
    collected. When ``self._get_feature`` already returns a feature for the
    requested ``_name`` that feature is re-registered and returned; otherwise
    a new ``Feature`` is built from the collected arguments.

    :param frame: frame object whose arguments describe the feature; it must
        carry ``_name`` and, when a new feature is built, ``_base_col``
    :return: the feature stored in ``self._features`` under ``_name``
    """
    arg_names, _, _, frame_locals = inspect.getargvalues(frame)
    # Deliberately not called "dict": that would shadow the builtin.
    params = {arg: frame_locals[arg] for arg in arg_names}
    _name = params["_name"]

    exist_feat = self._get_feature(_name)
    if exist_feat:
        self._features[_name] = exist_feat
        return self._features[_name]

    # "self" and "_col_alias" are not forwarded to the config object.
    params.pop("self", None)
    params.pop("_col_alias", None)

    # A string names a column; anything else is wrapped as a literal.
    base_col = params["_base_col"]
    params["_base_col"] = (
        col(base_col) if isinstance(base_col, str) else lit(base_col))

    input_config = ConfigObj(params)
    self._features[_name] = Feature(
        _name=input_config.get_or_else("_name", ""),
        _base_col=input_config.get_or_else("_base_col", None),
        _filter=input_config.get_or_else("_filter", []),
        _negative_value=input_config.get_or_else("_negative_value", 0),
        _agg_func=input_config.get_or_else("_agg_func", None),
        _agg_alias=input_config.get_or_else("_agg_alias", None),
        _joiners=input_config.get_or_else("_joiners", {}),
        _kind=input_config.get_or_else("_kind", "multipliable"))
    return self._features[_name]
def test_add_config(self):
    """A dict added under a dotted namespace is reachable through a dotted key."""
    namespace_settings = {"setting1": "value1", "setting2": "value2"}

    config_obj = ConfigObj({})
    config_obj.add("namespace1.namespace2", namespace_settings)

    looked_up = config_obj.get_or_else("namespace1.namespace2.setting1", "")
    assert looked_up == "value1"
def test_get_config(self):
    """get_config on a namespace exposes its keys as attributes; on a leaf key it yields the value."""
    config_obj = ConfigObj({"a": 1, "b": {"c": [1, 2, 3, 4, 5]}})

    namespace = config_obj.get_config("b")
    assert namespace.c == [1, 2, 3, 4,
                           5], "get_config should return a config obj."

    leaf = config_obj.get_config("b.c")
    assert leaf == [1, 2, 3, 4, 5], "get_config should return a value."

    # The lookup of the top-level key "c" is only printed, never asserted.
    top_level_c = config_obj.get_config("c")
    print("conf: {}".format(top_level_c))
def __init__(self,
             _snapshot_date,
             _dt_col: str,
             _dt_format: str,
             _date_format: str,
             _config: ConfigObj,
             _partition_col=None,
             _partition_dt_format=None):
    """
    Store the date settings, settle on a snapshot date and publish the
    resulting values under the ``time_helpers`` namespace of the config.

    :param _snapshot_date: snapshot date as a string in ``_date_format``;
        only used when the config has no ``time_helpers.snapshot_date``.
        ``None`` falls back to today's date
    :param _dt_col: datetime column if relevant, otherwise just use the
        date column
    :param _dt_format: common datetime format as a python format string
    :param _date_format: common date format as a python format string
    :param _config: config object passed in; it is updated in place
    :param _partition_col: partition column if the dataset[s] are
        partitioned on date/datetime
    :param _partition_dt_format: python format string for the datetime
        format used by the partition column
    """
    self.config = _config
    self.dt_tracker = ConfigObj()
    self.date_time_col = _dt_col
    self.date_time_format = _dt_format
    self.date_format = _date_format
    self.partition_col = _partition_col
    self.partition_dt_format = _partition_dt_format
    self.partition_range = {}

    # Precedence: value already in the config, then the argument, then today.
    if self.config.contains("time_helpers.snapshot_date"):
        self.snapshot_date = self.config.get_or_else(
            "time_helpers.snapshot_date", "")
    elif _snapshot_date is None:
        self.snapshot_date = date.today().strftime(_date_format)
    else:
        self.snapshot_date = _snapshot_date

    parsed_snapshot = datetime.strptime(self.snapshot_date, self.date_format)
    self.snapshot_date_dt = parsed_snapshot.date()
    self.end = self.snapshot_date_dt.strftime(self.date_format)
    self.all_dates = set()

    if not self.config.contains("time_helpers.snapshot_type"):
        # A snapshot taken on the last day of its month counts as monthly.
        _, last_day_of_month = calendar.monthrange(
            self.snapshot_date_dt.year, self.snapshot_date_dt.month)
        if last_day_of_month == self.snapshot_date_dt.day:
            snapshot_type = "MONTH"
        else:
            snapshot_type = "DAILY"
        self.config.add("time_helpers.snapshot_type", snapshot_type)

    published = (
        ("time_helpers.snapshot_date", self.snapshot_date),
        ("time_helpers.partition_col", self.partition_col),
        ("time_helpers.date_col", self.date_time_col),
        ("time_helpers.date_col_format", self.date_format),
        ("time_helpers.partition_col_format", self.partition_dt_format),
    )
    for config_key, config_value in published:
        self.config.add(config_key, config_value)
def test_key_as_int(self):
    """Integer dictionary keys are still integers after add() and as_dict()."""
    # list of promo length / pre-period length pairs in days
    promo_pre_periods = {7: 28, 14: 28, 21: 56}

    channel_config = ConfigObj()
    channel_config.add("promo_pre_periods", promo_pre_periods)

    stored_periods = channel_config.as_dict()["promo_pre_periods"]
    assert stored_periods[7] == 28
def Trends(self,
           featureSet_to_trend,
           trend_ranges,
           _dtm=None,
           _config: ConfigObj = None):
    """
    Construct the TrendsCommon generic FeatureFamily.

    It returns features, slope and y-intercept such that the user can
    understand the trend line and ideally future values for various features.

    :param featureSet_to_trend: A feature set to be trended
    :param trend_ranges: list of ranges to be trended,
        e.g. [['1m','12m'], ['1w', '4w']]. Accepted range periods are
        d = day, w = week, m = month, y = year
    :param _dtm: A specific dtm can be passed in but generally the dtm from
        self should be used
    :param _config: A specific configObj can be passed in but generally the
        config is taken from self
    :return: the TrendsCommon instance
    """
    source_config = self.config if _config is None else _config
    time_helpers = source_config.get_config("time_helpers")

    if _dtm is None:
        dtm = self.dtm
    else:
        dtm = _dtm

    return TrendsCommon(featureSet_to_trend, trend_ranges, dtm, time_helpers)
def _create_from_daterange(
        cls,
        dtm: DateTimeManager,
        time_helpers: ConfigObj,
        period_ranges="date_filters.ranges",
):
    """
    Build a Multiplier with one filter per configured date range.

    :param dtm: manager used to build the filter for every range
    :param time_helpers: config read for ``partition_col``,
        ``snapshot_type``, ``date_col`` and the ranges under
        ``period_ranges``, e.g.
        {
            snapshot_type: "DAILY",
            partition_col: "p_yyyymm",
            date_filters: {
                ranges: {
                    "1m": {"start": <val>, "end": <val>},
                    "3m": {"start": <val>, "end": <val>}
                }
            }
        }
    :param period_ranges: dotted key of the ranges inside ``time_helpers``
    :return: Multiplier over the configured ranges
    """
    date_ranges = time_helpers.get_config(period_ranges).as_dict()
    partition_col = time_helpers.get_or_else("partition_col", "")
    filter_names = list(date_ranges.keys())

    # Daily snapshots filter on the date column, everything else on the
    # partition column only.
    if time_helpers.get_or_else("snapshot_type", "MONTH") == "DAILY":
        date_col = time_helpers.get_or_else(
            "date_col", "No date col specified in the time_helpers config."
            "Daily snapshots require a date_col to be defined")
        filter_cols = [partition_col, date_col]
        build_filter = dtm.scoped_time_filter
    else:
        filter_cols = [partition_col]
        build_filter = dtm.scoped_partition_filter

    filter_vals = list(date_ranges.values())
    filters = [[build_filter(range_dict=bounds)] for bounds in filter_vals]

    return Multiplier(filter_cols, [filter_vals], [filters], [filter_names])
def _create_joiner_df(self, joiner: dict, partner_conf: ConfigObj):
    """
    Tries to create a dataframe from target_join_df of a joiner.

    If the sources/cores do not contain the dataframe, target_join_df is
    kept as a path string. The path may be resolved later when more
    dataframes are added to the partner.

    :param joiner: joiner definition; ``target_join_df`` is replaced in place
        once the path can be resolved
    :param partner_conf: config providing the ``cores`` and ``sources``
    :return: the current ``target_join_df`` value, or None when the joiner
        has no ``target_join_df``
    """
    if "target_join_df" not in joiner:
        return None

    target = joiner["target_join_df"]
    if isinstance(target, DataFrame):
        return target

    # The path reads "<cores|sources>.<name>".
    path_parts = [part.strip() for part in target.split(".")]
    cores = partner_conf.get_or_else("cores", {})
    sources = partner_conf.get_or_else("sources", {})
    if path_parts[0] == "cores":
        data_source = cores
    else:
        data_source = sources

    df_name = path_parts[1]
    if df_name in data_source:
        joiner["target_join_df"] = data_source[df_name].df
    return joiner["target_join_df"]
def test_contains_config(self):
    """contains() reports existing dotted keys and rejects missing ones."""
    config_obj = ConfigObj({"a": 1, "b": {"c": [1, 2, 3, 4, 5]}})

    expectations = (
        ("b.c", True, "config should contains b.c"),
        ("b.c.e", False, "config should not contains b.c.e"),
        ("a", True, "config should contains a"),
        ("d", False, "config should not contains d"),
    )
    for key, should_exist, message in expectations:
        assert bool(config_obj.contains(key)) is should_exist, message
def test_add_config(self):
    """add() supports nested and chained dotted keys; get_or_else falls back to its default."""
    config_obj = ConfigObj({"a": 1, "b": {"c": [1, 2, 3, 4, 5]}})
    config_obj.add("b.c2", 2)
    config_obj.add("e1.e2.e3", 123)
    print(config_obj.configs)

    namespace_b_intact = (config_obj.b.c2 == 2
                          and config_obj.b.c == [1, 2, 3, 4, 5])
    assert namespace_b_intact, "should add config to config obj."

    assert config_obj.e1.e2.e3 == 123, "should be able to chain add."

    chained_value = config_obj.get_or_else("e1.e2.e3", None)
    assert chained_value == 123, "should be able to get config value."

    fallback_value = config_obj.get_or_else("e1.e22", "default")
    assert fallback_value == "default", "default value if key does not exist."
def __init__(self, _snapshot_date=None, _config: ConfigObj = None):
    """
    Set up the "Store" channel with its date manager, sales helper and the
    default cores and sources.

    :param _snapshot_date: snapshot date handed to the DateTimeManager as a
        string in "%Y-%m-%d" format; None lets the manager choose
    :param _config: config object to use. When omitted a fresh ConfigObj is
        created for this instance
    """
    # A ConfigObj() default would be built once at definition time and then
    # mutated by DateTimeManager, leaking state between instances.
    if _config is None:
        _config = ConfigObj()

    self.dtm = DateTimeManager(_snapshot_date=_snapshot_date,
                               _dt_col="d_date",
                               _dt_format="%Y-%m-%d %H:%M:%S",
                               _date_format="%Y-%m-%d",
                               _config=_config,
                               _partition_col="p_yyyymm",
                               _partition_dt_format="%Y%m")
    self.dtm.append_periods(["1m", "3m", "6m", "12m"])
    self.config = self.dtm.get_config()
    Channel.__init__(self, "Store", self.dtm, self.config)
    self.sales = Sales(self.config)
    self._create_default_cores()
    self._create_default_sources()
def __init__(self,
             _featureSet_to_trend,
             _trend_period_ranges,
             _dtm: DateTimeManager,
             _time_helprs_config=None):
    """
    :param _featureSet_to_trend: feature set to be trended
    :param _trend_period_ranges: period ranges to trend over
    :param _dtm: This dtm needs to contain a single type of period such as
        only w or only m; passing in mixed ranges is not supported for
        trending yet
    :param _time_helprs_config: time helpers config. When omitted a fresh
        ConfigObj is created for this instance
    """
    # Avoid a ConfigObj() default: it would be one object shared by every
    # instance created without an explicit config.
    if _time_helprs_config is None:
        _time_helprs_config = ConfigObj()

    self.time_helpers_config = _time_helprs_config
    self.dtm = _dtm
    self.featureSet_to_trend = _featureSet_to_trend
    self.trend_period_ranges = _trend_period_ranges
    FeatureFamily.__init__(self, _time_helprs_config)
def __init__(self,
             _name,
             _dtm: DateTimeManager,
             _config: ConfigObj = None):
    """
    :param _name: name of the channel
    :param _dtm: date time manager supplying the snapshot date and the
        partition date format
    :param _config: config object. When omitted a fresh ConfigObj is created
        for this instance. ``time_helpers.partition_lower`` /
        ``time_helpers.partition_upper`` are read from it as the partition
        bounds, defaulting to one year ago / today formatted with the
        manager's partition format. Empty ``cores`` and ``sources`` entries
        are added when missing
    """
    # A ConfigObj() default would be created once and shared: the "cores" and
    # "sources" added below would then leak between channels.
    if _config is None:
        _config = ConfigObj()

    self.dtm = _dtm
    self.config = _config
    self.channel_name = _name

    # Derive final snapshot and add it config
    self.snapshot_date = _dtm.snapshot_date
    self.snapshot_date_dt = _dtm.snapshot_date_dt

    # TODO - Improve partitions to handle nested partitions
    # Get the partition if it exists
    self.partition_start = self.config.get_or_else(
        "time_helpers.partition_lower",
        (datetime.today() - relativedelta(years=1)).strftime(
            _dtm.partition_dt_format))
    self.partition_end = self.config.get_or_else(
        "time_helpers.partition_upper",
        datetime.today().strftime(_dtm.partition_dt_format))

    self.groupby_cols = self.config.get_or_else("_groupby_cols", [])
    self._features = OrderedDict()

    for section in ("cores", "sources"):
        if not self.config.contains(section):
            self.config.add(section, {})
    self.cores = self.config.get_or_else("cores", None)
    self.sources = self.config.get_or_else("sources", None)

    self.ff = Feature_Factory()
    # self._populate_partition_range()
    self.helpers = Helpers()
def test_merge_config(self):
    """merge() accepts a dict or a ConfigObj and leaves the config unchanged on a key conflict."""

    def fresh_config():
        return ConfigObj({"a": 1, "b": {"c": [1, 2, 3, 4, 5]}})

    # Merge a plain dict.
    config_obj = fresh_config()
    config_obj.merge({"e1": {"e2": 12}})
    assert config_obj.e1.e2 == 12, "error merging two configs"
    config_obj.print()

    # Merge another ConfigObj.
    config_obj = fresh_config()
    config_obj.merge(ConfigObj({"e1": {"e2": 12}}))
    assert config_obj.e1.e2 == 12, "error merging two configs"

    # "a" exists on both sides, so the key count must stay at two.
    config_obj = fresh_config()
    config_obj.merge(ConfigObj({"a": {"e2": 12}}))
    key_count = len(config_obj.configs)
    assert key_count == 2, "merge should fail because of conflicts in keys."
def __init__(self, config=None):
    """
    Bind the feature and joiner functions from the enclosing scope and
    initialise the FeatureFamily base.

    :param config: config object handed to FeatureFamily. When omitted a
        fresh ConfigObj is created for this instance
    """
    # Avoid a ConfigObj() default: it would be one object shared by every
    # instance created without an explicit config.
    if config is None:
        config = ConfigObj()

    self._multipliable_feat_func = multipliable
    self._base_feat_func = base_feat
    self._joiner_func = joiner_func
    # self._build_all()
    FeatureFamily.__init__(self, config)
def __init__(self, config=None):
    """
    Store the config and bind the feature and joiner functions from the
    enclosing scope.

    :param config: config object kept on the instance. When omitted a fresh
        ConfigObj is created for this instance
    """
    # Avoid a ConfigObj() default: it would be one object shared by every
    # instance created without an explicit config.
    if config is None:
        config = ConfigObj()

    # FeatureFamily.__init__(self, config)
    self.config = config
    self._joiner_func = joiner_func
    self._feat_func = feat_func
def _add_joiner(self, join_key: str, partner_conf: ConfigObj):
    """
    Register the joiner stored under ``join_key`` in the partner config.

    A key that is already registered is left untouched and a notice is
    printed instead.

    :param join_key: config key of the joiner
    :param partner_conf: config the joiner is looked up in; a missing key
        yields an empty dict
    """
    candidate = partner_conf.get_or_else(join_key, {})
    if join_key not in self.joiners:
        self.joiners[join_key] = candidate
        return
    print("{} has been added already.".format(join_key))
class DateTimeManager:
    """
    Keeps a snapshot date and the date ranges derived from it, publishes them
    under the ``time_helpers`` namespace of a config object and builds date /
    partition column filters for those ranges.
    """

    # Period suffix -> relativedelta keyword used to step back from the
    # snapshot date.
    _PERIOD_UNITS = {"m": "months", "w": "weeks", "d": "days", "y": "years"}

    def __init__(self,
                 _snapshot_date,
                 _dt_col: str,
                 _dt_format: str,
                 _date_format: str,
                 _config: ConfigObj,
                 _partition_col=None,
                 _partition_dt_format=None):
        """
        :param _snapshot_date: snapshot date as a string in ``_date_format``;
            only used when the config has no ``time_helpers.snapshot_date``.
            ``None`` falls back to today's date
        :param _dt_col: datetime column if relevant, otherwise just use the
            date column
        :param _dt_format: common datetime format as a python format string
        :param _date_format: common date format as a python format string
        :param _config: config object passed in; it is updated in place
        :param _partition_col: partition column if the dataset[s] are
            partitioned on date/datetime
        :param _partition_dt_format: python format string for the datetime
            format used by the partition column
        """
        self.date_time_col = _dt_col
        self.date_time_format = _dt_format
        self.date_format = _date_format
        self.partition_col = _partition_col
        self.partition_dt_format = _partition_dt_format
        self.partition_range = {}
        self.config = _config
        self.dt_tracker = ConfigObj()

        # Precedence: value already in the config, then the argument, then
        # today's date.
        if self.config.contains("time_helpers.snapshot_date"):
            self.snapshot_date = self.config.get_or_else(
                "time_helpers.snapshot_date", "")
        elif _snapshot_date is None:
            self.snapshot_date = date.today().strftime(_date_format)
        else:
            self.snapshot_date = _snapshot_date

        self.snapshot_date_dt = datetime.strptime(self.snapshot_date,
                                                  self.date_format).date()
        self.end = self.snapshot_date_dt.strftime(self.date_format)
        self.all_dates = set()

        if not self.config.contains("time_helpers.snapshot_type"):
            # A snapshot taken on the last day of its month counts as monthly.
            mo_max_day = calendar.monthrange(self.snapshot_date_dt.year,
                                             self.snapshot_date_dt.month)[1]
            snapshot_type = ("MONTH"
                             if mo_max_day == self.snapshot_date_dt.day
                             else "DAILY")
            self.config.add("time_helpers.snapshot_type", snapshot_type)

        self.config.add("time_helpers.snapshot_date", self.snapshot_date)
        self.config.add("time_helpers.partition_col", self.partition_col)
        self.config.add("time_helpers.date_col", self.date_time_col)
        self.config.add("time_helpers.date_col_format", self.date_format)
        self.config.add("time_helpers.partition_col_format",
                        self.partition_dt_format)

    def get_config(self):
        """Return the config object this manager reads from and writes to."""
        return self.config

    def _track_date(self, period, dt):
        """Append ``dt`` to the ``<period>.all_dates`` list of the tracker."""
        key = "{}.all_dates".format(period)
        if not self.dt_tracker.contains(key):
            self.dt_tracker.add(key, [dt])
        else:
            dl = self.dt_tracker.get_or_else(key, "")
            self.dt_tracker.add(key, dl + [dt])

    def _append_ranges(self):
        """Store the min and max tracked date of every period type."""
        # Iterate over a snapshot: add() writes into dt_tracker in the loop.
        for period, dates in list(self.dt_tracker.configs.items()):
            mm = self._get_min_max_dt_from_list(dates['all_dates'],
                                                self.date_format)
            self.dt_tracker.add("{}.min".format(period),
                                mm[0].strftime(self.date_format))
            self.dt_tracker.add("{}.max".format(period),
                                mm[1].strftime(self.date_format))

    def _append_trend_ranges(self, trend_period_ranges):
        """
        Expand every [first, last] pair into all periods in between and
        append them under ``time_helpers.trend_filters.<period type>``.

        :param trend_period_ranges: list of pairs such as [["1m", "12m"]];
            the period type is taken from the first element of each pair
        """
        for rng in trend_period_ranges:  # [["1m","12m"]]
            num_range = range(int(rng[0][:-1]), int(rng[1][:-1]) + 1)
            period_type = rng[0][-1:]
            trend_period = [
                "{}{}".format(num, period_type) for num in num_range
            ]
            self.append_periods(
                trend_period,
                "time_helpers.trend_filters.{}".format(period_type))

    def append_periods(self, periods, cfg_prefix="time_helpers.date_filters"):
        """
        Register look-back periods counted back from the snapshot date.

        Every period adds a ``{"start", "end"}`` entry under
        ``<cfg_prefix>.ranges.<period>``. The partition bounds
        ``time_helpers.partition_lower`` / ``partition_upper`` are filled in
        from all known dates unless the config already defines them.

        :param periods: strings made of a number and one of m, w, d, y,
            e.g. ["1m", "4w"]
        :param cfg_prefix: config namespace the ranges are written to
        """
        date_format = self.date_format
        config = self.config
        self.all_dates.add(self.end)

        for p in periods:
            assert p[-1:] in self._PERIOD_UNITS, \
                "Only m,w,d,y accepted as period types for insertion"

        for period in periods:
            length = period[:-1]
            period_type = period[-1:]

            offset = relativedelta(
                **{self._PERIOD_UNITS[period_type]: int(length)})
            start = (self.snapshot_date_dt - offset).strftime(date_format)
            self.all_dates.add(start)
            self._track_date(period_type, start)
            config.add(
                "{}.ranges.{}{}".format(cfg_prefix, length, period_type),
                {"start": start, "end": self.end})

            # Track the periods that were appended for later use
            periods_key = "{}.periods".format(period_type)
            if not self.dt_tracker.contains(periods_key):
                self.dt_tracker.add(periods_key, [int(length)])
            else:
                pl = self.dt_tracker.get_or_else(periods_key, "")
                self.dt_tracker.add(periods_key, pl + [int(length)])

        self._append_ranges()

        # If config is missing min/max partition values, insert them
        # Otherwise set the min/max of this to what's in the config
        all_dates_mm = self._get_min_max_dt_from_list(self.all_dates,
                                                      date_format)
        if not config.contains("time_helpers.partition_lower"):
            self.partition_range['start'] = all_dates_mm[0].strftime(
                self.partition_dt_format)
            config.add("time_helpers.partition_lower",
                       self.partition_range['start'])
        else:
            self.partition_range['start'] = config.get_or_else(
                "time_helpers.partition_lower", "partition_lower is not set")

        if not config.contains("time_helpers.partition_upper"):
            self.partition_range['end'] = all_dates_mm[1].strftime(
                self.partition_dt_format)
            config.add("time_helpers.partition_upper",
                       self.partition_range['end'])
        else:
            self.partition_range['end'] = config.get_or_else(
                "time_helpers.partition_upper", "partition_upper is not set")

    def _resolve_range(self, range_dict, start, end, fmt):
        """
        Work out the lower and upper datetime of a filter.

        For each bound an explicit ``start`` / ``end`` wins, then the
        matching ``range_dict`` entry, then the min / max of all dates known
        to this manager.

        :return: tuple (min datetime, max datetime)
        """
        if start is not None:
            min_date_dt = datetime.strptime(start, fmt)
        elif range_dict is not None:
            min_date_dt = datetime.strptime(range_dict['start'], fmt)
        else:
            min_date_dt = min(
                datetime.strptime(dt, fmt) for dt in self.all_dates)

        if end is not None:
            max_date_dt = datetime.strptime(end, fmt)
        elif range_dict is not None:
            max_date_dt = datetime.strptime(range_dict['end'], fmt)
        else:
            max_date_dt = max(
                datetime.strptime(dt, fmt) for dt in self.all_dates)

        return min_date_dt, max_date_dt

    def scoped_partition_filter(self,
                                range_dict=None,
                                start=None,
                                end=None,
                                partition_col=None,
                                input_fmt=None,
                                partition_col_fmt=None,
                                snapshot_type=None):
        """
        Build a ``between`` filter on the partition column.

        :param range_dict: dict with "start" and "end" date strings
        :param start: lower date string; takes precedence over ``range_dict``
        :param end: upper date string; takes precedence over ``range_dict``
        :param partition_col: partition column name; defaults to
            ``time_helpers.partition_col`` from the config
        :param input_fmt: format of the date strings; defaults to the
            manager's date format
        :param partition_col_fmt: format the bounds are rendered in; defaults
            to the manager's partition format
        :param snapshot_type: "MONTH" or "DAILY"; defaults to
            ``time_helpers.snapshot_type`` from the config. For "MONTH" the
            lower bound is moved to the first day of its month
        :return: the ``between`` expression on the partition column
        """
        if snapshot_type is None:
            snapshot_type = self.config.get_or_else(
                "time_helpers.snapshot_type", "MONTH")
        assert snapshot_type in ("MONTH", "DAILY"), \
            "snapshot_type must be DAILY or MONTH, got {}".format(snapshot_type)

        fmt = self.date_format if input_fmt is None else input_fmt
        if partition_col_fmt is None:
            partition_col_fmt = self.partition_dt_format

        min_date_dt, max_date_dt = self._resolve_range(
            range_dict, start, end, fmt)

        if partition_col is None:
            partition_col = self.config.get_or_else(
                "time_helpers.partition_col",
                "No time_helpers.partition_col specified in config and "
                "no partition column passed into function")

        if snapshot_type == "MONTH":
            # Include the whole month the range starts in.
            lower_dt = min_date_dt.replace(day=1)
        else:
            lower_dt = min_date_dt

        return col(partition_col).between(
            lower_dt.strftime(partition_col_fmt),
            max_date_dt.strftime(partition_col_fmt))

    def scoped_time_filter(self,
                           range_dict=None,
                           start=None,
                           end=None,
                           _fmt=None):
        """
        Build a ``between`` filter on the date column, combined with the
        matching partition filter when a partition column is configured.

        :param range_dict: dict with "start" and "end" date strings
        :param start: lower date string; takes precedence over ``range_dict``
        :param end: upper date string; takes precedence over ``range_dict``
        :param _fmt: format of the date strings; defaults to the manager's
            date format
        :return: the combined filter expression
        """
        fmt = self.date_format if _fmt is None else _fmt

        min_date_dt, max_date_dt = self._resolve_range(
            range_dict, start, end, fmt)
        min_date_str = min_date_dt.strftime(fmt)
        max_date_str = max_date_dt.strftime(fmt)
        dt_filter = col(self.date_time_col).between(min_date_str,
                                                    max_date_str)

        if self.partition_col is not None:
            snapshot_type = self.config.get_or_else(
                "time_helpers.snapshot_type", "MONTH")
            # Keyword arguments are required: passed positionally the bounds
            # would land in range_dict/start and the column name in end.
            partition_filter = self.scoped_partition_filter(
                start=min_date_str,
                end=max_date_str,
                partition_col=self.partition_col,
                input_fmt=fmt,
                partition_col_fmt=self.partition_dt_format,
                snapshot_type=snapshot_type)
            dt_filter = dt_filter & partition_filter

        return dt_filter

    def _get_min_max_dt_from_list(self, l, fmt):
        """
        Parse the date strings in ``l`` with ``fmt``.

        :return: list [earliest datetime, latest datetime]
        """
        parsed = [datetime.strptime(dt, fmt) for dt in l]
        return [min(parsed), max(parsed)]
def test_drop_config(self):
    """Dropping the only key of a namespace leaves that namespace empty."""
    config_obj = ConfigObj({"a": 1, "b": {"c": [1, 2, 3, 4, 5]}})

    after_drop = config_obj.drop("b.c")
    after_drop.print()

    remaining_in_b = config_obj.get_or_else("b", None)
    assert len(remaining_in_b) == 0, "b.c should be removed"