# Example 1
    def __init__(self,
                 _name,
                 _dtm: DateTimeManager,
                 _config: ConfigObj = None):
        """
        Initializes a channel with a name, a date/time manager and an
        optional configuration object.

        :param _name: name of this channel.
        :param _dtm: DateTimeManager supplying snapshot dates and the
            partition date format.
        :param _config: optional ConfigObj. Its "time_helpers.partition_lower"
            and "time_helpers.partition_upper" entries are applied as filters
            to the partition_cols of all sources/cores of this partner. The
            partition_cols is an argument for adding a data source/core to a
            partner, e.g. the bound can be [201706] when adding the core:
            channel.add_core("sales_item_fact", core_df, "P_TRANS_MONTH_ID")
        """
        # Build a fresh ConfigObj when none is supplied: a ConfigObj() default
        # argument would be one shared instance — mutated below via add() —
        # for every channel constructed without an explicit config.
        self.dtm = _dtm
        self.config = ConfigObj() if _config is None else _config
        self.channel_name = _name

        # Snapshot dates derived from the DateTimeManager.
        self.snapshot_date = _dtm.snapshot_date
        self.snapshot_date_dt = _dtm.snapshot_date_dt

        # TODO - Improve partitions to handle nested partitions
        # Partition window: defaults to the year ending today, rendered with
        # the manager's partition date format.
        self.partition_start = self.config.get_or_else(
            "time_helpers.partition_lower",
            (datetime.today() - relativedelta(years=1)).strftime(
                _dtm.partition_dt_format))
        self.partition_end = self.config.get_or_else(
            "time_helpers.partition_upper",
            datetime.today().strftime(_dtm.partition_dt_format))
        self.groupby_cols = self.config.get_or_else("_groupby_cols", [])

        self._features = OrderedDict()
        # Ensure the config carries "cores"/"sources" dicts and keep direct
        # references to them.
        if not self.config.contains("cores"):
            self.config.add("cores", {})
        if not self.config.contains("sources"):
            self.config.add("sources", {})
        self.cores = self.config.get_or_else("cores", None)
        self.sources = self.config.get_or_else("sources", None)

        self.ff = Feature_Factory()
        # self._populate_partition_range()
        self.helpers = Helpers()
# Example 2
    def _create_from_months(cls, snapshot_dt: datetime.date, trend_time_col,
                            month_durs: int):
        """
        Builds a Multiplier with one equality filter on `trend_time_col`
        per month in the `month_durs`-month range ending at `snapshot_dt`.

        :param snapshot_dt: end date of the month range.
        :param trend_time_col: name of the month-id column to filter on.
        :param month_durs: number of months to generate filters for.
        :return: a Multiplier over the per-month filters, labeled "1m".."Nm".
        """
        from framework.feature_factory.helpers import Helpers
        helpers = Helpers()

        # Month ids in reverse chronological order of the helper's range.
        month_ids = [
            helpers.get_monthid(m) for m in
            reversed(helpers.get_months_range(snapshot_dt, month_durs))
        ]
        # One single-condition filter cell per month.
        month_filters = [[col(trend_time_col) == mid] for mid in month_ids]
        labels = ["{}m".format(i + 1) for i in range(month_durs)]

        return Multiplier([trend_time_col], [month_ids], [month_filters],
                          [labels])
# Example 3
    def __init__(self,
                 _features: OrderedDict = None,
                 _name_prefix: str = ""):
        """
        Builds the set's feature registry, prefixing feature names.

        :param _features: an OrderedDict of name -> Feature, or another
            FeatureSet whose features are copied; defaults to empty.
        :param _name_prefix: prefix applied to each feature name that does
            not already carry it.
        """
        from framework.feature_factory.helpers import Helpers
        self.helpers = Helpers()
        # None sentinel: an OrderedDict() default would be a single mutable
        # object shared across every call of this constructor.
        _features = OrderedDict() if _features is None else _features
        _features = _features.features if isinstance(_features,
                                                     FeatureSet) else _features
        self.features = OrderedDict()
        for fn, f in _features.items():
            if f.name.startswith(_name_prefix):
                self.features[fn] = f
            else:
                # Clone under the prefixed, alias-cleaned name so the
                # original Feature object is left untouched.
                new_name = self.helpers._clean_alias("{}{}".format(
                    _name_prefix, f.name))
                new_f = f._clone(new_name)
                self.features[new_name] = new_f

        # Live view of the registered feature names.
        self.columns = self.features.keys()
# Example 4
class Channel:
    """
    Base class for a channel: a named registry of "core" and "source"
    dataframes plus the partition window and group-by configuration used
    when deriving features from them.
    """
    def __init__(self,
                 _name,
                 _dtm: DateTimeManager,
                 _config: ConfigObj = None):
        """
        :param _name: name of this channel.
        :param _dtm: DateTimeManager supplying snapshot dates and the
            partition date format.
        :param _config: optional ConfigObj. Its "time_helpers.partition_lower"
            and "time_helpers.partition_upper" entries are applied as filters
            to the partition_cols of all sources/cores of this partner. The
            partition_cols is an argument for adding a data source/core to a
            partner, e.g. the bound can be [201706] when adding the core:
            channel.add_core("sales_item_fact", core_df, "P_TRANS_MONTH_ID")
        """
        # Build a fresh ConfigObj when none is supplied: a ConfigObj() default
        # argument would be one shared instance — mutated below via add() —
        # for every channel constructed without an explicit config.
        self.dtm = _dtm
        self.config = ConfigObj() if _config is None else _config
        self.channel_name = _name

        # Snapshot dates derived from the DateTimeManager.
        self.snapshot_date = _dtm.snapshot_date
        self.snapshot_date_dt = _dtm.snapshot_date_dt

        # TODO - Improve partitions to handle nested partitions
        # Partition window: defaults to the year ending today, rendered with
        # the manager's partition date format.
        self.partition_start = self.config.get_or_else(
            "time_helpers.partition_lower",
            (datetime.today() - relativedelta(years=1)).strftime(
                _dtm.partition_dt_format))
        self.partition_end = self.config.get_or_else(
            "time_helpers.partition_upper",
            datetime.today().strftime(_dtm.partition_dt_format))
        self.groupby_cols = self.config.get_or_else("_groupby_cols", [])

        self._features = OrderedDict()
        # Ensure the config carries "cores"/"sources" dicts and keep direct
        # references to them.
        if not self.config.contains("cores"):
            self.config.add("cores", {})
        if not self.config.contains("sources"):
            self.config.add("sources", {})
        self.cores = self.config.get_or_else("cores", None)
        self.sources = self.config.get_or_else("sources", None)

        self.ff = Feature_Factory()
        # self._populate_partition_range()
        self.helpers = Helpers()

    def _create_joiner_df(self, joiner: dict):
        """
        Resolves joiner["target_join_df"] to a DataFrame.

        When the entry is a string path such as "core.sales_item_fact" or
        "source.collector_dim", the dataframe is looked up and cached back
        into the joiner dict; an actual DataFrame is returned as-is.
        :param joiner: joiner config dict carrying a "target_join_df" entry.
        :return: the resolved DataFrame (None when the name is unknown).
        """
        if not isinstance(joiner["target_join_df"], DataFrame):
            df_path = joiner["target_join_df"]
            df_parts = [p.strip() for p in df_path.split(".")]
            df = self.get_core(
                df_parts[1]) if df_parts[0] == "core" else self.get_source(
                    df_parts[1])
            # Cache the resolved dataframe so the lookup happens only once.
            joiner["target_join_df"] = df
            return df
        else:
            return joiner["target_join_df"]

    def _get_groupby_cols(self):
        """
        Returns a list of cols for the groupBy operation on the sources/cores.
        :return:
        """
        return self.groupby_cols

    def list_cores(self):
        """
        Returns a list of keys of cores.
        :return:
        """
        return list(self.cores.keys())

    def list_sources(self):
        """
        Returns a list of keys of sources.
        :return:
        """
        return list(self.sources.keys())

    def get_core(self, name: str):
        """
        Gets the dataframe of a core by name, with metric filters applied.
        :param name:
        :return: the DataFrame, or None when the core is unknown.
        """
        if name in self.cores:
            return self._apply_metric_filters(name, self.cores[name].df)
        else:
            return None

    def get_source(self, name: str):
        """
        Gets the dataframe of a source by name, with metric filters applied.
        :param name:
        :return: the DataFrame, or None when the source is unknown.
        """
        if name in self.sources:
            return self._apply_metric_filters(name, self.sources[name].df)
        else:
            return None

    def get_data(self, df_path):
        """
        Get dataframe from cores or sources using path such as
        "sources.collector_dim" (no metric filters applied).
        :param df_path: "<cores|sources>.<name>"
        :return: the DataFrame, or None when the name is unknown.
        """
        df_parts = [p.strip() for p in df_path.split(".")]
        cores = self.config.get_or_else("cores", {})
        sources = self.config.get_or_else("sources", {})
        data_source = cores if df_parts[0] == "cores" else sources
        if df_parts[1] in data_source:
            return data_source[df_parts[1]].df
        else:
            return None

    def _apply_metric_filters(self, name: str, df: DataFrame):
        """
        Applies the metric filter configured for `name` (if any) to df.
        :param name: core/source name looked up under "metric_filters".
        :param df: dataframe to filter.
        :return: the filtered dataframe, or df unchanged when no filter exists.
        """
        metric_filters = self.config.get_config("metric_filters")
        metric_filter = metric_filters.get_or_else(name, None)
        if metric_filter is not None:
            # DataFrame.filter returns a new dataframe; the original code
            # discarded the result, so metric filters were never applied.
            df = df.filter(metric_filter)
        return df

    def add_core(self, name: str, table: DataFrame, partition_col=None):
        """
        Adds a core to the partner.
        :param name:
        :param table: the dataframe of the core
        :param partition_col: the column(s) to be filtered using
            partition_start and partition_end (list or single column name)
        :return:
        """
        self._add_data(self.cores, name, table, partition_col)

    def add_source(self, name: str, table: DataFrame, partition_col=None):
        """
        Adds a source to the partner.
        :param name:
        :param table: the dataframe of the source
        :param partition_col: the column(s) to be filtered using
            partition_start and partition_end (list or single column name)
        :return:
        """
        self._add_data(self.sources, name, table, partition_col)

    def _add_data(self,
                  datalist: dict,
                  name: str,
                  table: DataFrame,
                  partition_cols=None):
        """
        Wraps `table` in a Data object — filtered to the channel's partition
        window when partition columns are given — and registers it under
        `name` in `datalist` (self.cores or self.sources).
        """
        # None sentinel instead of a mutable [] default. Also accept a bare
        # column name: the add_core docstring example passes a string, which
        # would otherwise make partition_cols[0] its first character.
        if partition_cols is None:
            partition_cols = []
        elif isinstance(partition_cols, str):
            partition_cols = [partition_cols]

        if len(partition_cols) > 0:
            # Only the first partition column is used today; see TODO below.
            p_filter = self.dtm.scoped_partition_filter(
                start=self.partition_start,
                end=self.partition_end,
                partition_col=partition_cols[0],
                input_fmt=self.dtm.partition_dt_format)
            d = Data(table.filter(p_filter), partition_cols)
        else:
            d = Data(table, partition_cols)

        # TODO - Add nested-partition support back in: apply
        #   col(tfc) >= partition_start[i] and col(tfc) <= partition_end[i]
        #   for each i, tfc in enumerate(d.partition_cols).

        datalist[name] = d

    def remove_core(self, name: str):
        """Drops the core named `name` from the config."""
        self.config.drop("cores.{}".format(name))

    def remove_source(self, name: str):
        """Drops the source named `name` from the config."""
        self.config.drop("sources.{}".format(name))

    def get_daterange_multiplier(self, time_helpers: ConfigObj = None):
        """
        Builds a date-range Multiplier from `time_helpers`, defaulting to the
        channel's own "time_helpers" config section.
        """
        time_helpers = self.config.get_config(
            "time_helpers") if time_helpers is None else time_helpers
        return Multiplier._create_from_daterange(self.dtm, time_helpers)

    def _create_groupby(self,
                        joiner_key: str,
                        groupby_col,
                        assign_df: bool = True):
        """
        Builds a group-by descriptor for `groupby_col` with its joiner config.

        :param joiner_key: config key of the joiner definition.
        :param groupby_col: column (or column expression) to group by.
        :param assign_df: when True, resolve the joiner's target dataframe
            from this channel's sources.
        :return: dict with "col", "joiner" and "joiner_key" entries.
        """
        joiner_config = self.config.get_or_else(joiner_key, dict())
        if assign_df:
            table_name = self.helpers._get_joiner_key(joiner_key)
            joiner_config["target_join_df"] = self.get_source(table_name)
        return {
            "col": groupby_col,
            "joiner": joiner_config,
            "joiner_key": joiner_key
        }
# Example 5
from framework.feature_factory.feature_family import FeatureFamily
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit, when, struct
from framework.feature_factory.feature import Feature
from framework.feature_factory.helpers import Helpers
from framework.configobj import ConfigObj
import inspect
from datetime import date

# Module-level registries populated by the @feat_func / @joiner_func
# decorators used on feature-family methods below. Each line constructs its
# own Helpers instance; presumably the returned registries are independent
# of instance state — TODO confirm against Helpers._register_feature_func.
feat_func = Helpers()._register_feature_func()
joiner_func = Helpers()._register_joiner_func()

# ADD DOCS HERE


class SalesCommon:
    """
    Holder for the sales feature/joiner registries; can copy the registered
    feature functions onto another class via transfer_features.
    """
    def __init__(self, config=None):
        """
        :param config: optional ConfigObj; a fresh one is created when
            omitted. (A ConfigObj() default argument would be one mutable
            instance shared by every SalesCommon built without a config.)
        """
        # FeatureFamily.__init__(self, config)
        self.config = ConfigObj() if config is None else config
        self._joiner_func = joiner_func
        self._feat_func = feat_func

    def transfer_features(self, cls):
        """Attaches every registered feature function to `cls` as a method."""
        for fn, func in self._feat_func.all.items():
            setattr(cls, fn, func)

    # @feat_func
    # def totalAmrmEarned(self,
    #                     _name="total_gross_sales",
    #                     _base_col="gross_sales",
# Example 6
 def __init__(self):
     """Initialize the instance with a Helpers utility object."""
     self.helpers = Helpers()
# Example 7
class Feature_Factory():
    """Appends Feature columns and aggregations to Spark dataframes."""
    def __init__(self):
        self.helpers = Helpers()

    def append_features(self,
                        df: DataFrame,
                        groupBy_cols,
                        feature_sets: [FeatureSet],
                        withTrendsForFeatures: [FeatureSet] = None):
        """
        Appends features to incoming df. The features columns and groupby cols will be deduped and validated.
        If there's a group by, the groupby cols will be applied before appending features.
        If there's not a group by and no agg features then the features will be appended to df.
        :param df: input dataframe.
        :param groupBy_cols: group-by columns, or col/joiner descriptors as
            produced by the channel's _create_groupby.
        :param feature_sets: input of FeatureSet
        :param withTrendsForFeatures: unused; kept for API compatibility.
        :return: dataframe with the feature columns appended.
        """
        # Normalize the group-by input and split out any joiner descriptors,
        # then dedupe the features collected from every feature set.
        groupBy_cols, groupBy_joiners = self.helpers._extract_groupby_joiner(
            groupBy_cols)
        features, dups = self.helpers._dedup_fast(df, [
            feature for feature_set in feature_sets
            for feature in feature_set.features.values()
        ])
        df = self.helpers._resolve_feature_joiners(
            df, features, groupBy_joiners).repartition(*groupBy_cols)

        agg_cols = []
        non_agg_cols = {}
        features_to_drop = []

        for feature in features:
            # An aggregated feature needs both aggs and group-by cols; a
            # non-aggregated feature (agg_func is None) is exempt. The
            # original `assert True if ... else False` with bare and/or is
            # rewritten with explicit parentheses ("and" binds before "or").
            assert (len(feature.aggs) > 0 and len(groupBy_cols) > 0) \
                or feature.agg_func is None, \
                "{} has either aggs or groupBys " \
                "but not both, ensure both are present".format(feature.name)
            agg_cols.extend(feature.aggs)
            if feature.agg_func is None:
                # Defer non-aggregated columns until after the groupBy.
                non_agg_cols[feature.output_alias] = feature.assembled_column
            else:
                df = df.withColumn(feature.output_alias,
                                   feature.assembled_column)

            if feature.is_temporary:
                features_to_drop.append(feature.name)

        if len(groupBy_cols) > 0:
            df = df.groupBy(*groupBy_cols)\
                .agg(*agg_cols)
        # Loop variable renamed from `col`, which shadowed pyspark's col().
        for alias, assembled in non_agg_cols.items():
            df = df.withColumn(alias, assembled)

        # Temporary features are dropped from the final result.
        final_df = df.drop(*features_to_drop)
        return final_df
# Example 8
class FeatureSet:
    """An ordered, named collection of Feature objects."""
    def __init__(self,
                 _features: OrderedDict = None,
                 _name_prefix: str = ""):
        """
        :param _features: an OrderedDict of name -> Feature, or another
            FeatureSet whose features are copied; defaults to empty.
        :param _name_prefix: prefix applied to each feature name that does
            not already carry it.
        """
        from framework.feature_factory.helpers import Helpers
        self.helpers = Helpers()
        # None sentinel: an OrderedDict() default would be a single mutable
        # object shared across every call of this constructor.
        _features = OrderedDict() if _features is None else _features
        _features = _features.features if isinstance(_features,
                                                     FeatureSet) else _features
        self.features = OrderedDict()
        for fn, f in _features.items():
            if f.name.startswith(_name_prefix):
                self.features[fn] = f
            else:
                # Clone under the prefixed, alias-cleaned name so the
                # original Feature object is left untouched.
                new_name = self.helpers._clean_alias("{}{}".format(
                    _name_prefix, f.name))
                new_f = f._clone(new_name)
                self.features[new_name] = new_f

        # Live view of the registered feature names.
        self.columns = self.features.keys()

    # Allow addition of individual feature in a certain position
    def add_feature(self, _feature: Feature, pos: int = 0):
        # NOTE(review): `pos` is currently ignored — the feature is always
        # appended at the end of the OrderedDict. TODO confirm whether
        # positional insertion is actually required by callers.
        name = _feature.name
        self.features[name] = _feature

    def remove_feature(self, _name: str):
        """Removes the feature registered under _name (KeyError if absent)."""
        del self.features[_name]

    def extract_multipliable_name(self, _name_prefix: str, _feature_name: str):
        """
        When the feature name starts with the prefix, drops its first
        underscore-delimited token; otherwise returns the name unchanged.
        """
        if _feature_name.startswith(_name_prefix):
            return '_'.join(_feature_name.split("_")[1:])
        else:
            return _feature_name

    def multiply(self, _multiplier, _name_prefix: str, is_temporary=False):
        """
        Crosses each "multipliable" feature with every filter cell of
        _multiplier, returning a new FeatureSet with one Feature per
        (feature, filter-cell) combination.

        :param _multiplier: object exposing filters, filter_vals and
            (optionally) filter_names as row/column nested lists.
        :param _name_prefix: upper-cased and embedded in each new name.
        :param is_temporary: marks the generated features as temporary.
        :return: a new FeatureSet of the generated features.
        """
        _name_prefix = _name_prefix.upper()
        nrow = len(_multiplier.filters)

        results = OrderedDict()
        for base_feature in self.features.values():
            if base_feature.kind == "multipliable":
                for r in range(nrow):
                    for c in range(len(_multiplier.filters[r])):
                        # Label for the cell: prefer the explicit filter
                        # name, fall back to the raw filter value.
                        current_name = _multiplier.filter_names[r][
                            c] if _multiplier.filter_names is not None else _multiplier.filter_vals[
                                r][c]
                        multipliable_name = self.extract_multipliable_name(
                            _name_prefix, base_feature.name)
                        feature_name = self.helpers._clean_alias(
                            "{}_{}_{}".format(_name_prefix,
                                              current_name.upper(),
                                              multipliable_name.upper()))
                        # Combine the base feature's filters with the cell's.
                        feature_filter = [
                            *base_feature.filter, *_multiplier.filters[r][c]
                        ]
                        results[feature_name] = Feature(
                            _name=feature_name,
                            _base_col=base_feature.base_col,
                            _filter=feature_filter,
                            _negative_value=base_feature.negative_value,
                            _agg_func=base_feature.agg_func,
                            _joiners=base_feature.joiners,
                            _is_temporary=is_temporary)
        return FeatureSet(results)