@classmethod
def _create_from_months(cls, snapshot_dt: datetime.date, trend_time_col, month_durs: int):
    from framework.feature_factory.helpers import Helpers
    helpers = Helpers()
    filters = []
    filter_vals = []
    # One label per trailing month: "1m", "2m", ..., "{month_durs}m"
    filter_names = ["{}m".format(i + 1) for i in range(month_durs)]
    filter_cols = [trend_time_col]
    months_range = helpers.get_months_range(snapshot_dt, month_durs)
    for m in reversed(months_range):
        target_month_id = helpers.get_monthid(m)
        # Equality filter selecting exactly one month of the trend column
        t_filter = col(trend_time_col) == target_month_id
        filters.append([t_filter])
        filter_vals.append(target_month_id)
    return Multiplier(filter_cols, [filter_vals], [filters], [filter_names])
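# Usage sketch (hypothetical, not part of the module): building a per-month
# multiplier for the three months ending at a snapshot date. The column name and
# the date are illustrative assumptions; Multiplier is assumed to be in scope, and
# month IDs follow the YYYYMM convention implied by helpers.get_monthid above.
from datetime import date

monthly = Multiplier._create_from_months(
    snapshot_dt=date(2017, 6, 30),   # assumed snapshot date
    trend_time_col="P_TRANS_MONTH_ID",
    month_durs=3)
# monthly.filter_names -> [["1m", "2m", "3m"]]; each filter compares
# P_TRANS_MONTH_ID to a single month ID in the trailing three-month window.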
class Channel:
    """
    Base class for a channel.
    """

    def __init__(self, _name, _dtm: DateTimeManager, _config: ConfigObj = ConfigObj()):
        """
        :param _name: name of the channel
        :param _dtm: DateTimeManager providing the snapshot date and partition date format
        :param _config: optional ConfigObj; the keys "time_helpers.partition_lower" and
            "time_helpers.partition_upper" are applied as a filter to the partition_cols
            of all sources/cores of this channel. The partition_cols is an argument for
            adding a data source/core to a channel, e.g.
            channel.add_core("sales_item_fact", core_df, ["P_TRANS_MONTH_ID"])
        """
        self.dtm = _dtm
        self.config = _config
        self.channel_name = _name

        # Derive the final snapshot and add it to config
        self.snapshot_date = _dtm.snapshot_date
        self.snapshot_date_dt = _dtm.snapshot_date_dt

        # TODO - Improve partitions to handle nested partitions
        # Get the partition bounds if they exist; default to the trailing year
        self.partition_start = self.config.get_or_else(
            "time_helpers.partition_lower",
            (datetime.today() - relativedelta(years=1)).strftime(_dtm.partition_dt_format))
        self.partition_end = self.config.get_or_else(
            "time_helpers.partition_upper",
            datetime.today().strftime(_dtm.partition_dt_format))
        self.groupby_cols = self.config.get_or_else("_groupby_cols", [])
        self._features = OrderedDict()
        if not self.config.contains("cores"):
            self.config.add("cores", {})
        if not self.config.contains("sources"):
            self.config.add("sources", {})
        self.cores = self.config.get_or_else("cores", None)
        self.sources = self.config.get_or_else("sources", None)
        self.ff = Feature_Factory()
        # self._populate_partition_range()
        self.helpers = Helpers()

    def _create_joiner_df(self, joiner: dict):
        # Resolve a "core.<name>" or "source.<name>" path into a dataframe and
        # cache it back onto the joiner dict.
        if not isinstance(joiner["target_join_df"], DataFrame):
            df_path = joiner["target_join_df"]
            df_parts = [p.strip() for p in df_path.split(".")]
            df = self.get_core(df_parts[1]) if df_parts[0] == "core" else self.get_source(df_parts[1])
            joiner["target_join_df"] = df
            return df
        else:
            return joiner["target_join_df"]

    def _get_groupby_cols(self):
        """
        Returns a list of cols for the groupBy operation on the sources/cores.
        :return:
        """
        return self.groupby_cols

    def list_cores(self):
        """
        Returns a list of keys of cores.
        :return:
        """
        return list(self.cores.keys())

    def list_sources(self):
        """
        Returns a list of keys of sources.
        :return:
        """
        return list(self.sources.keys())

    def get_core(self, name: str):
        """
        Gets the dataframe of a core by name.
        :param name:
        :return:
        """
        if name in self.cores:
            return self._apply_metric_filters(name, self.cores[name].df)
        else:
            return None

    def get_source(self, name: str):
        """
        Gets the dataframe of a source by name.
        :param name:
        :return:
        """
        if name in self.sources:
            return self._apply_metric_filters(name, self.sources[name].df)
        else:
            return None

    def get_data(self, df_path):
        """
        Gets a dataframe from cores or sources using a path such as "sources.collector_dim".
        :param df_path:
        :return:
        """
        df_parts = [p.strip() for p in df_path.split(".")]
        cores = self.config.get_or_else("cores", {})
        sources = self.config.get_or_else("sources", {})
        data_source = cores if df_parts[0] == "cores" else sources
        if df_parts[1] in data_source:
            return data_source[df_parts[1]].df
        else:
            return None

    def _apply_metric_filters(self, name: str, df: DataFrame):
        metric_filters = self.config.get_config("metric_filters")
        metric_filter = metric_filters.get_or_else(name, None)
        if metric_filter is not None:
            # DataFrames are immutable; reassign the filtered result
            df = df.filter(metric_filter)
        return df

    def add_core(self, name: str, table: DataFrame, partition_col=[]):
        """
        Adds a core to the channel.
        :param name:
        :param table: the dataframe of the core
        :param partition_col: the columns to be filtered using partition_start and partition_end
        :return:
        """
        self._add_data(self.cores, name, table, partition_col)

    def add_source(self, name: str, table: DataFrame, partition_col=[]):
        """
        Adds a source to the channel.
        :param name:
        :param table: the dataframe of the source
        :param partition_col: the columns to be filtered using partition_start and partition_end
        :return:
        """
        self._add_data(self.sources, name, table, partition_col)

    def _add_data(self, datalist: dict, name: str, table: DataFrame, partition_cols=[]):
        # if name not in datalist:
        if len(partition_cols) > 0:
            # Only the first (outermost) partition column is filtered today;
            # see the TODO below for nested partition support.
            p_filter = self.dtm.scoped_partition_filter(
                start=self.partition_start,
                end=self.partition_end,
                partition_col=partition_cols[0],
                input_fmt=self.dtm.partition_dt_format)
            d = Data(table.filter(p_filter), partition_cols)
        else:
            d = Data(table, partition_cols)

        # TODO - Add this back in to support nested partition columns
        # TODO - But it will require a few tweaks
        # if len(partition_cols) > 0 and len(self.partition_start) > 0:
        #     tf_filters = [col(tfc) >= self.partition_start[i] for i, tfc in enumerate(d.partition_cols)]
        #     where_clause = tf_filters[0]
        #     for f in tf_filters[1:]:
        #         where_clause &= f
        #     print("Applying filter {} to dataframe {}".format(where_clause, name))
        #     d.df = d.df.filter(where_clause)
        #
        # if len(partition_cols) > 0 and len(self.partition_end) > 0:
        #     tf_filters = [col(tfc) <= self.partition_end[i] for i, tfc in enumerate(d.partition_cols)]
        #     where_clause = tf_filters[0]
        #     for f in tf_filters[1:]:
        #         where_clause &= f
        #     print("Applying filter {} to dataframe {}".format(where_clause, name))
        #     d.df = d.df.filter(where_clause)

        datalist[name] = d

    def remove_core(self, name: str):
        self.config.drop("cores.{}".format(name))

    def remove_source(self, name: str):
        self.config.drop("sources.{}".format(name))

    def get_daterange_multiplier(self, time_helpers: ConfigObj = None):
        time_helpers = self.config.get_config("time_helpers") if time_helpers is None else time_helpers
        return Multiplier._create_from_daterange(self.dtm, time_helpers)

    def _create_groupby(self, joiner_key: str, groupby_col, assign_df: bool = True):
        joiner_config = self.config.get_or_else(joiner_key, dict())
        if assign_df:
            table_name = self.helpers._get_joiner_key(joiner_key)
            joiner_config["target_join_df"] = self.get_source(table_name)
        return {
            "col": groupby_col,
            "joiner": joiner_config,
            "joiner_key": joiner_key
        }
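# Usage sketch (hypothetical): registering a partitioned core on a channel and
# reading it back with partition and metric filters applied. The SparkSession,
# the `dtm` DateTimeManager instance, and the table names are assumptions for
# illustration, not values from this codebase.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
core_df = spark.table("sales_item_fact")              # assumed source table

channel = Channel("sales", dtm)                       # `dtm` is an assumed DateTimeManager
channel.add_core("sales_item_fact", core_df, ["P_TRANS_MONTH_ID"])

filtered = channel.get_core("sales_item_fact")        # partition + metric filters applied
dim_df = channel.get_data("sources.collector_dim")    # path-style lookup; None if absent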
from framework.feature_factory.feature_family import FeatureFamily
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit, when, struct
from framework.feature_factory.feature import Feature
from framework.feature_factory.helpers import Helpers
from framework.configobj import ConfigObj
import inspect
from datetime import date

feat_func = Helpers()._register_feature_func()
joiner_func = Helpers()._register_joiner_func()


# ADD DOCS HERE
class SalesCommon:
    def __init__(self, config=ConfigObj()):
        # FeatureFamily.__init__(self, config)
        self.config = config
        self._joiner_func = joiner_func
        self._feat_func = feat_func

    def transfer_features(self, cls):
        # Copy every registered feature function onto the target class
        for fn, func in self._feat_func.all.items():
            setattr(cls, fn, func)

    # @feat_func
    # def totalAmrmEarned(self,
    #                     _name="total_gross_sales",
    #                     _base_col="gross_sales",
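# Usage sketch (hypothetical): feature functions registered via @feat_func are
# collected by Helpers and can be grafted onto another class at runtime via
# transfer_features. The target class below is an assumption for illustration.
class SalesChannel:
    pass

common = SalesCommon()
common.transfer_features(SalesChannel)
# Every function registered with @feat_func is now a method on SalesChannel.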
class Feature_Factory():
    def __init__(self):
        self.helpers = Helpers()

    def append_features(self, df: DataFrame, groupBy_cols, feature_sets: [FeatureSet],
                        withTrendsForFeatures: [FeatureSet] = None):
        """
        Appends features to the incoming df. The feature columns and groupby cols will be
        deduped and validated. If there is a group by, the groupby cols will be applied
        before appending features. If there is no group by and no agg features, the
        features will be appended directly to df.
        :param df:
        :param groupBy_cols:
        :param feature_sets: input of FeatureSet
        :return:
        """
        # If the groupBy column is passed in as something other than a list, convert it to a list.
        # Validation - if the features passed in are a dict, convert to a list of values, etc.
        # groupBy_cols = self.helpers._to_list(groupBy_cols)
        groupBy_cols, groupBy_joiners = self.helpers._extract_groupby_joiner(groupBy_cols)
        features, dups = self.helpers._dedup_fast(df, [
            feature for feature_set in feature_sets
            for feature in feature_set.features.values()
        ])
        df = self.helpers._resolve_feature_joiners(df, features, groupBy_joiners).repartition(*groupBy_cols)

        # feature_cols = []
        agg_cols = []
        non_agg_cols = {}
        features_to_drop = []
        # base_cols = [f.base_col for f in features]

        # column validation
        # valid_result, undef_cols = self.helpers.validate_col(df, *base_cols)
        # assert valid_result, "base cols {} are not defined in df columns {}".format(undef_cols, df.columns)
        # valid_result, undef_cols = self.helpers._validate_col(df, *groupBy_cols)
        # assert valid_result, "groupby cols {} are not defined in df columns {}".format(undef_cols, df.columns)

        for feature in features:
            # A feature must either carry both aggs and groupBy cols, or be non-aggregating.
            assert (len(feature.aggs) > 0 and len(groupBy_cols) > 0) or feature.agg_func is None, \
                "{} has either aggs or groupBys but not both, ensure both are present".format(feature.name)
            # feature_cols.append(feature.assembled_column)
            # feature_cols.append(F.col(feature.output_alias))
            agg_cols += [agg_col for agg_col in feature.aggs]
            if feature.agg_func is None:
                # Non-aggregated features are appended after the groupBy/agg step
                non_agg_cols[feature.output_alias] = feature.assembled_column
            else:
                df = df.withColumn(feature.output_alias, feature.assembled_column)
            if feature.is_temporary:
                features_to_drop.append(feature.name)

        if len(groupBy_cols) > 0:
            df = df.groupBy(*groupBy_cols).agg(*agg_cols)
            for fn, column in non_agg_cols.items():
                df = df.withColumn(fn, column)
        final_df = df.drop(*features_to_drop)
        # else:
        #     new_df = df.select(*df.columns + feature_cols)
        return final_df
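# Usage sketch (hypothetical): appending one aggregated feature, grouped by a
# customer key. The dataframe `txn_df`, the column names, and the Feature
# constructor arguments are assumptions that mirror the usage visible elsewhere
# in this codebase, not a definitive API reference.
from collections import OrderedDict
from pyspark.sql import functions as F

total_sales = Feature(
    _name="TOTAL_SALES",
    _base_col=F.col("gross_sales"),
    _filter=[],
    _negative_value=0,
    _agg_func=F.sum)

ff = Feature_Factory()
result_df = ff.append_features(
    df=txn_df,                          # assumed transactions dataframe
    groupBy_cols=["customer_id"],
    feature_sets=[FeatureSet(OrderedDict(TOTAL_SALES=total_sales))])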
class FeatureSet:
    def __init__(self, _features: OrderedDict = OrderedDict(), _name_prefix: str = ""):
        from framework.feature_factory.helpers import Helpers
        self.helpers = Helpers()
        _features = _features.features if isinstance(_features, FeatureSet) else _features
        self.features = OrderedDict()
        for fn, f in _features.items():
            if f.name.startswith(_name_prefix):
                self.features[fn] = f
            else:
                # Prepend the prefix and clone so the original feature is untouched
                new_name = self.helpers._clean_alias("{}{}".format(_name_prefix, f.name))
                new_f = f._clone(new_name)
                self.features[new_name] = new_f
        self.columns = self.features.keys()

    # Allow addition of an individual feature; note that `pos` is currently
    # unused, so features are always appended in insertion order.
    def add_feature(self, _feature: Feature, pos: int = 0):
        name = _feature.name
        self.features[name] = _feature

    def remove_feature(self, _name: str):
        del self.features[_name]

    def extract_multipliable_name(self, _name_prefix: str, _feature_name: str):
        if _feature_name.startswith(_name_prefix):
            return '_'.join(_feature_name.split("_")[1:])
        else:
            return _feature_name

    def multiply(self, _multiplier, _name_prefix: str, is_temporary=False):
        _name_prefix = _name_prefix.upper()
        nrow = len(_multiplier.filters)
        results = OrderedDict()
        for base_feature in self.features.values():
            if base_feature.kind == "multipliable":
                for r in range(nrow):
                    for c in range(len(_multiplier.filters[r])):
                        current_name = (_multiplier.filter_names[r][c]
                                        if _multiplier.filter_names is not None
                                        else _multiplier.filter_vals[r][c])
                        multipliable_name = self.extract_multipliable_name(_name_prefix, base_feature.name)
                        feature_name = self.helpers._clean_alias(
                            "{}_{}_{}".format(_name_prefix, current_name.upper(), multipliable_name.upper()))
                        # Combine the base feature's filters with this multiplier cell's filters
                        feature_filter = [*base_feature.filter, *_multiplier.filters[r][c]]
                        results[feature_name] = Feature(
                            _name=feature_name,
                            _base_col=base_feature.base_col,
                            _filter=feature_filter,
                            _negative_value=base_feature.negative_value,
                            _agg_func=base_feature.agg_func,
                            _joiners=base_feature.joiners,
                            _is_temporary=is_temporary)
        return FeatureSet(results)
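# Usage sketch (hypothetical): crossing a multipliable feature set with a
# 12-month multiplier yields one feature per month, named roughly
# SALES_1M_<feature> through SALES_12M_<feature>. `base_feature_set`, the
# snapshot date, and the column name are assumptions for illustration.
from datetime import date

monthly = Multiplier._create_from_months(
    snapshot_dt=date(2017, 6, 30),
    trend_time_col="P_TRANS_MONTH_ID",
    month_durs=12)
monthly_features = base_feature_set.multiply(monthly, "SALES", is_temporary=False)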