def get_function(self):
    """Return the pandas callable that computes sample skewness.

    Python 2 errors when a class instance method is handed to pandas
    aggregation, so the underlying plain function is returned there
    instead; Python 3 can use the method directly.
    """
    if is_python_2():
        return pd.Series.skew.__func__
    return pd.Series.skew
def load_primitive_from_file(filepath):
    """Load the single primitive class defined in a python file.

    Args:
        filepath (str): Path to a python module that defines exactly one
            primitive class (a subclass of ``PrimitiveBase``).

    Returns:
        tuple(str, type): The primitive's class name and the class object.

    Raises:
        RuntimeError: If the file defines zero primitives, or more than one.
    """
    # Derive the module name from the file name. splitext is safer than the
    # previous fixed `[:-3]` slice, which silently assumed a ".py" suffix.
    module = os.path.splitext(os.path.basename(filepath))[0]
    if is_python_2():
        # for python 2.7
        module = imp.load_source(module, filepath)
    else:
        # for python >3.5: the first argument is the name the loaded
        # module will be registered under.
        spec = importlib.util.spec_from_file_location(module, filepath)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

    # Collect every concrete primitive class bound in the module, excluding
    # the abstract base classes themselves.
    primitives = []
    for primitive_name in vars(module):
        primitive_class = getattr(module, primitive_name)
        if (isclass(primitive_class) and
                issubclass(primitive_class, PrimitiveBase) and
                primitive_class not in (AggregationPrimitive,
                                        TransformPrimitive)):
            primitives.append((primitive_name, primitive_class))

    if len(primitives) == 0:
        raise RuntimeError("No primitive defined in file %s" % filepath)
    elif len(primitives) > 1:
        raise RuntimeError("More than one primitive defined in file %s" % filepath)

    return primitives[0]
def _calculate_agg_features(self, features, frame, df_trie):
    """Compute a group of aggregation features and merge them into ``frame``.

    All entries of ``features`` are assumed to share the same relationship
    path, ``where`` clause and ``use_previous`` window as ``features[0]``
    (the code reads these only from the first feature) — TODO confirm the
    caller guarantees this grouping.

    Args:
        features: Aggregation feature objects to compute.
        frame: Parent-entity dataframe the results are merged into.
        df_trie: Trie of child dataframes, keyed by relationship path.

    Returns:
        ``frame`` with one column added per feature output name.
    """
    test_feature = features[0]
    child_entity = test_feature.base_features[0].entity
    # Child data to aggregate, looked up by this group's relationship path.
    base_frame = df_trie.get_node(test_feature.relationship_path).value
    # Sometimes approximate features get computed in a previous filter frame
    # and put in the current one dynamically,
    # so there may be existing features here
    features = [f for f in features if f.get_name() not in frame.columns]
    if not len(features):
        return frame

    # handle where: keep only child rows where the boolean `where` column
    # is True before aggregating
    where = test_feature.where
    if where is not None and not base_frame.empty:
        base_frame = base_frame.loc[base_frame[where.get_name()]]

    # when no child data, just add all the features to frame with nan
    if base_frame.empty:
        for f in features:
            frame[f.get_name()] = np.nan
    else:
        relationship_path = test_feature.relationship_path

        # Column in the child frame identifying the parent instance to
        # group by.
        groupby_var = get_relationship_variable_id(relationship_path)

        # if the use_previous property exists on this feature, include only the
        # instances from the child entity included in that Timedelta
        use_previous = test_feature.use_previous
        if use_previous and not base_frame.empty:
            # Filter by use_previous values
            time_last = self.time_last
            if use_previous.is_absolute():
                # Absolute window: keep child rows whose time index falls
                # within [time_last - use_previous, time_last].
                time_first = time_last - use_previous
                ti = child_entity.time_index
                if ti is not None:
                    base_frame = base_frame[base_frame[ti] >= time_first]
            else:
                # Relative window: keep only the last n rows per group.
                n = use_previous.value

                def last_n(df):
                    return df.iloc[-n:]

                base_frame = base_frame.groupby(groupby_var, observed=True,
                                                sort=False).apply(last_n)

        # to_agg: {child column name: [agg callables/names]} for groupby.agg
        # agg_rename: {"<column>-<funcname>": feature output name}
        to_agg = {}
        agg_rename = {}
        to_apply = set()
        # apply multivariable and time-dependent features as we find them, and
        # save aggregable features for later
        for f in features:
            if _can_agg(f):
                variable_id = f.base_features[0].get_name()

                if variable_id not in to_agg:
                    to_agg[variable_id] = []

                func = f.get_function()

                # for some reason, using the string count is significantly
                # faster than any method a primitive can return
                # https://stackoverflow.com/questions/55731149/use-a-function-instead-of-string-in-pandas-groupby-agg
                if is_python_2() and func == pd.Series.count.__func__:
                    func = "count"
                elif func == pd.Series.count:
                    func = "count"

                funcname = func
                if callable(func):
                    # if the same function is being applied to the same
                    # variable twice, wrap it in a partial to avoid
                    # duplicate functions
                    funcname = str(id(func))
                    if u"{}-{}".format(variable_id, funcname) in agg_rename:
                        func = partial(func)
                        funcname = str(id(func))

                    # pandas names agg output columns after __name__, so set
                    # it to the unique id-based name used in agg_rename
                    func.__name__ = funcname

                to_agg[variable_id].append(func)
                # this is used below to rename columns that pandas names for us
                agg_rename[u"{}-{}".format(variable_id, funcname)] = f.get_name()

                continue

            to_apply.add(f)

        # Apply the non-aggregable functions generate a new dataframe, and merge
        # it with the existing one
        if len(to_apply):
            wrap = agg_wrapper(to_apply, self.time_last)
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True, sort=False).apply(wrap)
            frame = pd.merge(left=frame, right=to_merge,
                             left_index=True, right_index=True, how='left')

        # Apply the aggregate functions to generate a new dataframe, and merge
        # it with the existing one
        if len(to_agg):
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True, sort=False).agg(to_agg)
            # rename columns to the correct feature names
            to_merge.columns = [agg_rename["-".join(x)]
                                for x in to_merge.columns.ravel()]
            to_merge = to_merge[list(agg_rename.values())]

            # workaround for pandas bug where categories are in the wrong order
            # see: https://github.com/pandas-dev/pandas/issues/22501
            if pdtypes.is_categorical_dtype(frame.index):
                categories = pdtypes.CategoricalDtype(
                    categories=frame.index.categories)
                to_merge.index = to_merge.index.astype(object).astype(
                    categories)

            frame = pd.merge(left=frame, right=to_merge,
                             left_index=True, right_index=True, how='left')

    # Handle default values: fill NaNs (e.g. parents with no children) with
    # each feature's declared default
    fillna_dict = {}
    for f in features:
        feature_defaults = {name: f.default_value
                            for name in f.get_feature_names()}
        fillna_dict.update(feature_defaults)
    frame.fillna(fillna_dict, inplace=True)

    # convert boolean dtypes to floats as appropriate
    # pandas behavior: https://github.com/pydata/pandas/issues/3752
    for f in features:
        if (f.number_output_features == 1 and
                f.variable_type == variable_types.Numeric and
                frame[f.get_name()].dtype.name in ['object', 'bool']):
            frame[f.get_name()] = frame[f.get_name()].astype(float)

    return frame
import os
from inspect import isclass

import pandas as pd

from .base import AggregationPrimitive, PrimitiveBase, TransformPrimitive

import featuretools
from featuretools.utils import is_python_2

# imp is deprecated in python 3; importlib is used there instead
if is_python_2():
    import imp
else:
    import importlib.util


def get_aggregation_primitives():
    """Return all aggregation primitive classes exposed by featuretools.

    Scans ``featuretools.primitives`` for classes that subclass
    ``AggregationPrimitive`` and have a truthy ``name`` attribute
    (presumably abstract bases leave ``name`` unset — TODO confirm).

    Returns:
        dict: mapping of lowercased primitive name to primitive class.
    """
    aggregation_primitives = set([])
    for attribute_string in dir(featuretools.primitives):
        attribute = getattr(featuretools.primitives, attribute_string)
        if isclass(attribute):
            if issubclass(attribute,
                          featuretools.primitives.AggregationPrimitive):
                if attribute.name:
                    aggregation_primitives.add(attribute)
    return {prim.name.lower(): prim for prim in aggregation_primitives}


def get_transform_primitives():
    transform_primitives = set([])
    for attribute_string in dir(featuretools.primitives):
def get_function(self):
    """Return the pandas callable that computes the median.

    Python 2 errors when a class instance method is handed to pandas
    aggregation, so the underlying plain function is returned there
    instead; Python 3 can use the method directly.
    """
    if is_python_2():
        return pd.Series.median.__func__
    return pd.Series.median