def _handle_time(self, entity_id, df, time_last=None, training_window=None, include_cutoff_time=True):
    """
    Filter a dataframe for all instances before time_last.
    If the DataTable does not have a time index, return the original dataframe.
    """
    dt = self[entity_id]
    if is_instance(df, ks, 'DataFrame') and isinstance(time_last, np.datetime64):
        time_last = pd.to_datetime(time_last)
    if dt.time_index:
        df_empty = df.empty if isinstance(df, pd.DataFrame) else False
        if time_last is not None and not df_empty:
            if include_cutoff_time:
                df = df[df[dt.time_index] <= time_last]
            else:
                df = df[df[dt.time_index] < time_last]
            if training_window is not None:
                training_window = _check_timedelta(training_window)
                if include_cutoff_time:
                    mask = df[dt.time_index] > time_last - training_window
                else:
                    mask = df[dt.time_index] >= time_last - training_window
                if dt.last_time_index is not None:
                    lti_slice = dt.last_time_index.reindex(df.index)
                    if include_cutoff_time:
                        lti_mask = lti_slice > time_last - training_window
                    else:
                        lti_mask = lti_slice >= time_last - training_window
                    mask = mask | lti_mask
                else:
                    warnings.warn(
                        "Using training_window but last_time_index is "
                        "not set on entity %s" % (dt.id)
                    )
                df = df[mask]

    for secondary_time_index, columns in dt.secondary_time_index.items():
        # should we use ignore time last here?
        df_empty = df.empty if isinstance(df, pd.DataFrame) else False
        if time_last is not None and not df_empty:
            mask = df[secondary_time_index] >= time_last
            if isinstance(df, dd.DataFrame):
                for col in columns:
                    df[col] = df[col].mask(mask, np.nan)
            elif is_instance(df, ks, 'DataFrame'):
                df.loc[mask, columns] = None
            else:
                df.loc[mask, columns] = np.nan

    return df
def convert_variable_data(df, column_id, new_type, **kwargs):
    """Convert a dataframe variable to a different type."""
    empty = df[column_id].empty if isinstance(df, pd.DataFrame) else False
    if empty:
        return df
    if new_type == vtypes.Numeric:
        if isinstance(df, dd.DataFrame):
            df[column_id] = dd.to_numeric(df[column_id], errors='coerce')
        elif is_instance(df, ks, 'DataFrame'):
            df[column_id] = ks.to_numeric(df[column_id])
        else:
            orig_nonnull = df[column_id].dropna().shape[0]
            df[column_id] = pd.to_numeric(df[column_id], errors='coerce')
            # This will convert strings to nans
            # If column contained all strings, then we should
            # just raise an error, because that shouldn't have
            # been converted to numeric
            nonnull = df[column_id].dropna().shape[0]
            if nonnull == 0 and orig_nonnull != 0:
                raise TypeError("Attempted to convert all string column {} to numeric".format(column_id))
    elif issubclass(new_type, vtypes.Datetime):
        format = kwargs.get("format", None)
        # TODO: if float convert to int?
        if isinstance(df, dd.DataFrame):
            df[column_id] = dd.to_datetime(df[column_id], format=format,
                                           infer_datetime_format=True)
        elif is_instance(df, ks, 'DataFrame'):
            df[column_id] = ks.to_datetime(df[column_id], format=format,
                                           infer_datetime_format=True)
        else:
            df[column_id] = pd.to_datetime(df[column_id], format=format,
                                           infer_datetime_format=True)
    elif new_type == vtypes.Boolean:
        map_dict = {kwargs.get("true_val", True): True,
                    kwargs.get("false_val", False): False,
                    True: True,
                    False: False}
        # TODO: what happens to nans?
        df[column_id] = df[column_id].map(map_dict).astype(bool)
    elif not issubclass(new_type, vtypes.Discrete):
        raise Exception("Cannot convert column %s to %s" % (column_id, new_type))
    return df
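# A brief usage sketch of convert_variable_data (pandas only). It assumes the
# featuretools variable_types module is importable as `vtypes` and that the
# function above is in scope; the column name is made up for illustration.
import pandas as pd
import pytest
from featuretools import variable_types as vtypes

# mixed numeric strings coerce cleanly; unparseable values become NaN
df = pd.DataFrame({"amount": ["1", "2", "oops"]})
df = convert_variable_data(df, "amount", vtypes.Numeric)
assert df["amount"].isna().sum() == 1

# a column containing only non-numeric strings raises instead of silently nulling out
df = pd.DataFrame({"amount": ["a", "b", "c"]})
with pytest.raises(TypeError):
    convert_variable_data(df, "amount", vtypes.Numeric)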
def to_pandas(df, index=None, sort_index=False, int_index=False):
    '''
    Testing util to convert dataframes to pandas. If a pandas dataframe is passed in, just returns the dataframe.

    Args:
        index (str, optional): column name to set as index, defaults to None
        sort_index (bool, optional): whether to sort the dataframe on the index after setting it, defaults to False
        int_index (bool, optional): Converts computed dask index to Int64Index to avoid errors, defaults to False

    Returns:
        Pandas DataFrame
    '''
    if isinstance(df, (pd.DataFrame, pd.Series)):
        return df

    if isinstance(df, (dd.DataFrame, dd.Series)):
        pd_df = df.compute()
    if is_instance(df, (ks, ks), ('DataFrame', 'Series')):
        pd_df = df.to_pandas()

    if index:
        pd_df = pd_df.set_index(index)
    if sort_index:
        pd_df = pd_df.sort_index()
    if int_index and isinstance(df, dd.DataFrame):
        pd_df.index = pd.Int64Index(pd_df.index)

    return pd_df
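# A minimal usage sketch of the to_pandas testing util, assuming pandas and dask
# are installed and the function above is in scope.
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"id": [2, 0, 1], "value": [20, 0, 10]})
ddf = dd.from_pandas(pdf, npartitions=2)

# pandas input passes straight through
assert to_pandas(pdf) is pdf

# dask input is computed, indexed on 'id', sorted, and given an Int64Index
result = to_pandas(ddf, index="id", sort_index=True, int_index=True)
assert list(result.index) == [0, 1, 2]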
def set_time_index(self, variable_id, already_sorted=False):
    # check time type
    if not isinstance(self.df, pd.DataFrame) or self.df.empty:
        time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[variable_id]._default_pandas_dtype]
    else:
        time_to_check = self.df[variable_id].iloc[0]

    time_type = _check_time_type(time_to_check)
    if time_type is None:
        raise TypeError("%s time index not recognized as numeric or"
                        " datetime" % (self.id))

    if self.entityset.time_type is None:
        self.entityset.time_type = time_type
    elif self.entityset.time_type != time_type:
        raise TypeError("%s time index is %s type which differs from"
                        " other entityset time indexes" % (self.id, time_type))

    if is_instance(self.df, (dd, ks), 'DataFrame'):
        t = time_type  # skip checking values
        already_sorted = True  # skip sorting
    else:
        t = vtypes.NumericTimeIndex
        if col_is_datetime(self.df[variable_id]):
            t = vtypes.DatetimeTimeIndex

    # use stable sort
    if not already_sorted:
        # sort by time variable, then by index
        self.df = self.df.sort_values([variable_id, self.index])

    self.convert_variable_type(variable_id, t, convert_data=False)
    self.time_index = variable_id
def _create_index(index, make_index, df):
    '''Handles index creation logic based on user input'''
    created_index = None

    if index is None:
        # Case 1: user wanted to make index but did not specify column name
        assert not make_index, "Must specify an index name if make_index is True"
        # Case 2: make_index not specified but no index supplied, use first column
        warnings.warn(("Using first column as index. "
                       "To change this, specify the index parameter"))
        index = df.columns[0]
    elif make_index and index in df.columns:
        # Case 3: user wanted to make index but column already exists
        raise RuntimeError("Cannot make index: index variable already present")
    elif index not in df.columns:
        if not make_index:
            # Case 4: user names index, it is not in df. does not specify
            # make_index. Make new index column and warn
            warnings.warn("index {} not found in dataframe, creating new "
                          "integer column".format(index))
        # Case 5: make_index with no errors or warnings
        # (Case 4 also uses this code path)
        if isinstance(df, dd.DataFrame):
            df[index] = 1
            df[index] = df[index].cumsum() - 1
        elif is_instance(df, ks, 'DataFrame'):
            df = df.koalas.attach_id_column('distributed-sequence', index)
        else:
            df.insert(0, index, range(len(df)))
        created_index = index
    # Case 6: user specified index, which is already in df. No action needed.
    return created_index, index, df
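# A short sketch of how _create_index behaves for two of the cases handled above
# (pandas only; the column names are illustrative and the function above is
# assumed to be in scope).
import pandas as pd

df = pd.DataFrame({"value": [10, 20, 30]})

# Case 5: the named index is not in the dataframe, so a new integer column is created
created, index, df = _create_index("id", True, df)
assert created == "id" and list(df["id"]) == [0, 1, 2]

# Case 3: asking to make an index that already exists raises
try:
    _create_index("id", True, df)
except RuntimeError:
    pass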
def _vals_to_series(instance_vals, variable_id):
    """
    instance_vals may be a pd.DataFrame, a pd.Series, a list, a single
    value, or None. This function always returns a Series or None.
    """
    if instance_vals is None:
        return None

    # If this is a single value, make it a list
    if not hasattr(instance_vals, '__iter__'):
        instance_vals = [instance_vals]

    # convert iterable to pd.Series
    if isinstance(instance_vals, pd.DataFrame):
        out_vals = instance_vals[variable_id]
    elif is_instance(instance_vals, (pd, dd, ks), 'Series'):
        out_vals = instance_vals.rename(variable_id)
    else:
        out_vals = pd.Series(instance_vals)

    # no duplicates or NaN values
    out_vals = out_vals.drop_duplicates().dropna()

    # want index to have no name for the merge in query_by_values
    out_vals.index.name = None

    return out_vals
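# A few quick examples of the normalization performed by _vals_to_series,
# assuming pandas is imported and the function above is in scope.
import pandas as pd

# scalars, lists, Series, and DataFrames all normalize to a de-duplicated Series
assert _vals_to_series(None, "id") is None
assert list(_vals_to_series(5, "id")) == [5]
assert list(_vals_to_series([1, 1, 2, None], "id")) == [1, 2]

df = pd.DataFrame({"id": [3, 3, 4]})
assert list(_vals_to_series(df, "id")) == [3, 4]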
def _calculate_direct_features(self, features, child_df, df_trie, progress_callback): path = features[0].relationship_path assert len( path) == 1, "Error calculating DirectFeatures, len(path) != 1" parent_df = df_trie.get_node([path[0]]).value _is_forward, relationship = path[0] merge_col = relationship._child_column_name # generate a mapping of old column names (in the parent dataframe) to # new column names (in the child dataframe) for the merge col_map = {relationship._parent_column_name: merge_col} index_as_feature = None fillna_dict = {} for f in features: feature_defaults = { name: f.default_value for name in f.get_feature_names() if not pd.isna(f.default_value) } fillna_dict.update(feature_defaults) if f.base_features[0].get_name( ) == relationship._parent_column_name: index_as_feature = f base_names = f.base_features[0].get_feature_names() for name, base_name in zip(f.get_feature_names(), base_names): if name in child_df.columns: continue col_map[base_name] = name # merge the identity feature from the parent dataframe into the child merge_df = parent_df[list(col_map.keys())].rename(columns=col_map) if is_instance(merge_df, (dd, ps), "DataFrame"): new_df = child_df.merge(merge_df, left_on=merge_col, right_on=merge_col, how="left") else: if index_as_feature is not None: merge_df.set_index(index_as_feature.get_name(), inplace=True, drop=False) else: merge_df.set_index(merge_col, inplace=True) new_df = child_df.merge(merge_df, left_on=merge_col, right_index=True, how="left") progress_callback(len(features) / float(self.num_features)) return new_df.fillna(fillna_dict)
def write_entity_data(entity, path, format='csv', **kwargs):
    '''Write entity data to disk or S3 path.

    Args:
        entity (Entity) : Instance of :class:`.Entity`.
        path (str) : Location on disk to write entity data.
        format (str) : Format to use for writing entity data. Defaults to csv.
        kwargs (keywords) : Additional keyword arguments to pass as keyword arguments to the underlying serialization method.

    Returns:
        loading_info (dict) : Information on storage location and format of entity data.
    '''
    format = format.lower()
    if isinstance(entity.df, dd.DataFrame) and format == 'csv':
        basename = "{}-*.{}".format(entity.id, format)
    else:
        basename = '.'.join([entity.id, format])
    location = os.path.join('data', basename)
    file = os.path.join(path, location)
    df = entity.df
    if format == 'csv':
        if is_instance(df, ks, 'DataFrame'):
            df = df.copy()
            columns = list(df.select_dtypes('object').columns)
            df[columns] = df[columns].astype(str)
        df.to_csv(
            file,
            index=kwargs['index'],
            sep=kwargs['sep'],
            encoding=kwargs['encoding'],
            compression=kwargs['compression'],
        )
    elif format == 'parquet':
        # Serializing to parquet format raises an error when columns contain tuples.
        # Columns containing tuples are mapped as dtype object.
        # Issue is resolved by casting columns of dtype object to string.
        df = df.copy()
        columns = list(df.select_dtypes('object').columns)
        df[columns] = df[columns].astype(str)
        df.to_parquet(file, **kwargs)
    elif format == 'pickle':
        # Dask currently does not support to_pickle
        if isinstance(df, dd.DataFrame):
            msg = 'Cannot serialize Dask EntitySet to pickle'
            raise ValueError(msg)
        else:
            df.to_pickle(file, **kwargs)
    else:
        error = 'must be one of the following formats: {}'
        raise ValueError(error.format(', '.join(FORMATS)))
    return {'location': location, 'type': format, 'params': kwargs}
def init_ww_and_concat_fm(feature_matrix, ww_init_kwargs):
    for fm in feature_matrix:
        fm.ww.init(**ww_init_kwargs)

    if any(isinstance(fm, dd.DataFrame) for fm in feature_matrix):
        feature_matrix = dd.concat(feature_matrix)
    elif any(is_instance(fm, ks, 'DataFrame') for fm in feature_matrix):
        feature_matrix = ks.concat(feature_matrix)
    else:
        feature_matrix = pd.concat(feature_matrix)

    feature_matrix.ww.init(**ww_init_kwargs)
    return feature_matrix
def init_ww_and_concat_fm(feature_matrix, ww_init_kwargs):
    cols_to_check = {
        col
        for col, ltype in ww_init_kwargs["logical_types"].items()
        if isinstance(ltype, (Age, Boolean, Integer))
    }
    replacement_type = {
        "age": AgeNullable(),
        "boolean": BooleanNullable(),
        "integer": IntegerNullable(),
    }
    for fm in feature_matrix:
        updated_cols = set()
        for col in cols_to_check:
            # Only convert types for pandas if null values are present
            # Always convert for Dask/Spark to avoid pulling data into memory for null check
            is_pandas_df_with_null = (isinstance(fm, pd.DataFrame) and fm[col].isnull().any())
            is_dask_df = isinstance(fm, dd.DataFrame)
            is_spark_df = is_instance(fm, ps, "DataFrame")
            if is_pandas_df_with_null or is_dask_df or is_spark_df:
                current_type = ww_init_kwargs["logical_types"][col].type_string
                ww_init_kwargs["logical_types"][col] = replacement_type[current_type]
                updated_cols.add(col)
        cols_to_check = cols_to_check - updated_cols
        fm.ww.init(**ww_init_kwargs)

    if any(isinstance(fm, dd.DataFrame) for fm in feature_matrix):
        feature_matrix = dd.concat(feature_matrix)
    elif any(is_instance(fm, ps, "DataFrame") for fm in feature_matrix):
        feature_matrix = ps.concat(feature_matrix)
    else:
        feature_matrix = pd.concat(feature_matrix)

    feature_matrix.ww.init(**ww_init_kwargs)
    return feature_matrix
def entity_to_description(entity):
    '''Serialize entity to data description.

    Args:
        entity (Entity) : Instance of :class:`.Entity`.

    Returns:
        dictionary (dict) : Description of :class:`.Entity`.
    '''
    index = entity.df.columns.isin([variable.id for variable in entity.variables])
    indexer = (entity.df.columns[index].to_list()
               if is_instance(entity.df, ks, 'DataFrame')
               else entity.df.columns[index])
    dtypes = entity.df[indexer].dtypes.astype(str).to_dict()
    if isinstance(entity.df, dd.DataFrame):
        entity_type = 'dask'
    elif is_instance(entity.df, ks, 'DataFrame'):
        entity_type = 'koalas'
    else:
        entity_type = 'pandas'

    description = {
        "id": entity.id,
        "index": entity.index,
        "time_index": entity.time_index,
        "properties": {
            'secondary_time_index': entity.secondary_time_index,
            'last_time_index': entity.last_time_index is not None,
        },
        "variables": [variable.to_data_description() for variable in entity.variables],
        "loading_info": {
            'entity_type': entity_type,
            'params': {},
            'properties': {
                'dtypes': dtypes
            }
        }
    }

    return description
def _calculate_direct_features(self, features, child_df, df_trie, progress_callback): path = features[0].relationship_path assert len(path) == 1, \ "Error calculating DirectFeatures, len(path) != 1" parent_df = df_trie.get_node([path[0]]).value _is_forward, relationship = path[0] merge_var = relationship.child_variable.id # generate a mapping of old column names (in the parent entity) to # new column names (in the child entity) for the merge col_map = {relationship.parent_variable.id: merge_var} index_as_feature = None for f in features: if f.base_features[0].get_name( ) == relationship.parent_variable.id: index_as_feature = f base_names = f.base_features[0].get_feature_names() for name, base_name in zip(f.get_feature_names(), base_names): if name in child_df.columns: continue col_map[base_name] = name # merge the identity feature from the parent entity into the child merge_df = parent_df[list(col_map.keys())].rename(columns=col_map) if is_instance(merge_df, (dd, ks), 'DataFrame'): new_df = child_df.merge(merge_df, left_on=merge_var, right_on=merge_var, how='left') else: if index_as_feature is not None: merge_df.set_index(index_as_feature.get_name(), inplace=True, drop=False) else: merge_df.set_index(merge_var, inplace=True) new_df = child_df.merge(merge_df, left_on=merge_var, right_index=True, how='left') progress_callback(len(features) / float(self.num_features)) return new_df
def to_csv(self, path, sep=',', encoding='utf-8', engine='python', compression=None, profile_name=None):
    '''Write entityset to disk in the csv format, location specified by `path`.
        Path could be a local path or an S3 path.
        If writing to S3 a tar archive of files will be written.

        Args:
            path (str) : Location on disk to write to (will be created as a directory)
            sep (str) : String of length 1. Field delimiter for the output file.
            encoding (str) : A string representing the encoding to use in the output file, defaults to 'utf-8'.
            engine (str) : Name of the engine to use. Possible values are: {'c', 'python'}.
            compression (str) : Name of the compression to use. Possible values are: {'gzip', 'bz2', 'zip', 'xz', None}.
            profile_name (str) : Name of AWS profile to use, False to use an anonymous profile, or None.
    '''
    if is_instance(self.entities[0].df, ks, 'DataFrame'):
        compression = str(compression)
    serialize.write_data_description(self, path, format='csv', index=False, sep=sep,
                                     encoding=encoding, engine=engine,
                                     compression=compression, profile_name=profile_name)
    return self
def set_secondary_time_index(self, secondary_time_index):
    for time_index, columns in secondary_time_index.items():
        if is_instance(self.df, (dd, ks), 'DataFrame') or self.df.empty:
            time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[time_index]._default_pandas_dtype]
        else:
            time_to_check = self.df[time_index].head(1).iloc[0]
        time_type = _check_time_type(time_to_check)
        if time_type is None:
            raise TypeError("%s time index not recognized as numeric or"
                            " datetime" % (self.id))
        if self.entityset.time_type != time_type:
            raise TypeError("%s time index is %s type which differs from"
                            " other entityset time indexes" % (self.id, time_type))
        if time_index not in columns:
            columns.append(time_index)

    self.secondary_time_index = secondary_time_index
def test_is_instance_none_module(df):
    assert not is_instance(df, None, "DataFrame")
    assert is_instance(df, (None, pd), "DataFrame")
    assert is_instance(df, (None, pd), ("Series", "DataFrame"))


def test_is_instance_errors_mismatch():
    msg = "Number of modules does not match number of classnames"
    with pytest.raises(ValueError, match=msg):
        is_instance("abc", pd, ("DataFrame", "Series"))


def test_is_instance_multiple_modules(df):
    df2 = dd.from_pandas(df, npartitions=2)
    assert is_instance(df, (dd, pd), "DataFrame")
    assert is_instance(df2, (dd, pd), "DataFrame")
    assert is_instance(df2["id"], (dd, pd), ("Series", "DataFrame"))
    assert not is_instance(df2["id"], (dd, pd), ("DataFrame", "Series"))


def test_is_instance_single_module(df):
    assert is_instance(df, pd, "DataFrame")
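# The is_instance helper exercised by the tests above is not shown in this
# section. A minimal sketch that satisfies those tests might look like the
# following; the real featuretools implementation may differ in detail.
def is_instance(obj, modules, classnames):
    """Check whether obj is an instance of the given classname(s) looked up in
    the given module(s). A module may be None (e.g. an optional dependency such
    as koalas that is not installed), in which case it contributes no class.
    """
    if not isinstance(modules, tuple):
        modules = (modules, )
    if not isinstance(classnames, tuple):
        # a single classname is checked against every module
        classnames = (classnames, ) * len(modules)
    if len(modules) != len(classnames):
        raise ValueError("Number of modules does not match number of classnames")
    to_check = tuple(
        getattr(module, classname, type(None))
        for module, classname in zip(modules, classnames)
        if module is not None
    )
    # with no usable modules, nothing can match
    return isinstance(obj, to_check) if to_check else False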
def __init__(self, target_entity_id, entityset, agg_primitives=None, trans_primitives=None, where_primitives=None, groupby_trans_primitives=None, max_depth=2, max_features=-1, allowed_paths=None, ignore_entities=None, ignore_variables=None, primitive_options=None, seed_features=None, drop_contains=None, drop_exact=None, where_stacking_limit=1): if target_entity_id not in entityset.entity_dict: es_name = entityset.id or 'entity set' msg = 'Provided target entity %s does not exist in %s' % (target_entity_id, es_name) raise KeyError(msg) # need to change max_depth to None because DFs terminates when <0 if max_depth == -1: max_depth = None # if just one entity, set max depth to 1 (transform stacking rule) if len(entityset.entity_dict) == 1 and (max_depth is None or max_depth > 1): warnings.warn("Only one entity in entityset, changing max_depth to " "1 since deeper features cannot be created") max_depth = 1 self.max_depth = max_depth self.max_features = max_features self.allowed_paths = allowed_paths if self.allowed_paths: self.allowed_paths = set() for path in allowed_paths: self.allowed_paths.add(tuple(path)) if ignore_entities is None: self.ignore_entities = set() else: if not isinstance(ignore_entities, list): raise TypeError('ignore_entities must be a list') assert target_entity_id not in ignore_entities,\ "Can't ignore target_entity!" self.ignore_entities = set(ignore_entities) self.ignore_variables = defaultdict(set) if ignore_variables is not None: # check if ignore_variables is not {str: list} if not all(isinstance(i, str) for i in ignore_variables.keys()) or not all(isinstance(i, list) for i in ignore_variables.values()): raise TypeError('ignore_variables should be dict[str -> list]') # check if list values are all of type str elif not all(all(isinstance(v, str) for v in value) for value in ignore_variables.values()): raise TypeError('list values should be of type str') for eid, vars in ignore_variables.items(): self.ignore_variables[eid] = set(vars) self.target_entity_id = target_entity_id self.es = entityset if any(isinstance(entity.df, dd.DataFrame) for entity in self.es.entities): entityset_type = Library.DASK elif any(is_instance(entity.df, ks, 'DataFrame') for entity in self.es.entities): entityset_type = Library.KOALAS else: entityset_type = Library.PANDAS if agg_primitives is None: agg_primitives = [p for p in primitives.get_default_aggregation_primitives() if entityset_type in p.compatibility] self.agg_primitives = [] agg_prim_dict = primitives.get_aggregation_primitives() for a in agg_primitives: if isinstance(a, str): if a.lower() not in agg_prim_dict: raise ValueError("Unknown aggregation primitive {}. 
".format(a), "Call ft.primitives.list_primitives() to get", " a list of available primitives") a = agg_prim_dict[a.lower()] a = handle_primitive(a) if not isinstance(a, AggregationPrimitive): raise ValueError("Primitive {} in agg_primitives is not an " "aggregation primitive".format(type(a))) self.agg_primitives.append(a) self.agg_primitives.sort() if trans_primitives is None: trans_primitives = [p for p in primitives.get_default_transform_primitives() if entityset_type in p.compatibility] self.trans_primitives = [] for t in trans_primitives: t = check_trans_primitive(t) self.trans_primitives.append(t) self.trans_primitives.sort() if where_primitives is None: where_primitives = [primitives.Count] self.where_primitives = [] for p in where_primitives: if isinstance(p, str): prim_obj = agg_prim_dict.get(p.lower(), None) if prim_obj is None: raise ValueError("Unknown where primitive {}. ".format(p), "Call ft.primitives.list_primitives() to get", " a list of available primitives") p = prim_obj p = handle_primitive(p) self.where_primitives.append(p) self.where_primitives.sort() if groupby_trans_primitives is None: groupby_trans_primitives = [] self.groupby_trans_primitives = [] for p in groupby_trans_primitives: p = check_trans_primitive(p) self.groupby_trans_primitives.append(p) self.groupby_trans_primitives.sort() if primitive_options is None: primitive_options = {} all_primitives = self.trans_primitives + self.agg_primitives + \ self.where_primitives + self.groupby_trans_primitives bad_primitives = [prim.name for prim in all_primitives if entityset_type not in prim.compatibility] if bad_primitives: msg = 'Selected primitives are incompatible with {} EntitySets: {}' raise ValueError(msg.format(entityset_type.value, ', '.join(bad_primitives))) self.primitive_options, self.ignore_entities, self.ignore_variables =\ generate_all_primitive_options(all_primitives, primitive_options, self.ignore_entities, self.ignore_variables, self.es) self.seed_features = sorted(seed_features or [], key=lambda f: f.unique_name()) self.drop_exact = drop_exact or [] self.drop_contains = drop_contains or [] self.where_stacking_limit = where_stacking_limit
def calculate_chunk(cutoff_time, chunk_size, feature_set, entityset, approximate, training_window, save_progress, no_unapproximated_aggs, cutoff_df_time_var, target_time, pass_columns, progress_bar=None, progress_callback=None, include_cutoff_time=True): if not isinstance(feature_set, FeatureSet): feature_set = cloudpickle.loads(feature_set) feature_matrix = [] if no_unapproximated_aggs and approximate is not None: if entityset.time_type == NumericTimeIndex: group_time = np.inf else: group_time = datetime.now() if isinstance(cutoff_time, tuple): update_progress_callback = None if progress_bar is not None: def update_progress_callback(done): previous_progress = progress_bar.n progress_bar.update(done * len(cutoff_time[1])) if progress_callback is not None: update, progress_percent, time_elapsed = update_progress_callback_parameters( progress_bar, previous_progress) progress_callback(update, progress_percent, time_elapsed) time_last = cutoff_time[0] ids = cutoff_time[1] calculator = FeatureSetCalculator(entityset, feature_set, time_last, training_window=training_window) _feature_matrix = calculator.run( ids, progress_callback=update_progress_callback, include_cutoff_time=include_cutoff_time) if isinstance(_feature_matrix, pd.DataFrame): time_index = pd.Index([time_last] * len(ids), name='time') _feature_matrix = _feature_matrix.set_index(time_index, append=True) feature_matrix.append(_feature_matrix) else: for _, group in cutoff_time.groupby(cutoff_df_time_var): # if approximating, calculate the approximate features if approximate is not None: precalculated_features_trie = approximate_features( feature_set, group, window=approximate, entityset=entityset, training_window=training_window, include_cutoff_time=include_cutoff_time, ) else: precalculated_features_trie = None @save_csv_decorator(save_progress) def calc_results(time_last, ids, precalculated_features=None, training_window=None, include_cutoff_time=True): update_progress_callback = None if progress_bar is not None: def update_progress_callback(done): previous_progress = progress_bar.n progress_bar.update(done * group.shape[0]) if progress_callback is not None: update, progress_percent, time_elapsed = update_progress_callback_parameters( progress_bar, previous_progress) progress_callback(update, progress_percent, time_elapsed) calculator = FeatureSetCalculator( entityset, feature_set, time_last, training_window=training_window, precalculated_features=precalculated_features) matrix = calculator.run( ids, progress_callback=update_progress_callback, include_cutoff_time=include_cutoff_time) return matrix # if all aggregations have been approximated, can calculate all together if no_unapproximated_aggs and approximate is not None: inner_grouped = [[group_time, group]] else: # if approximated features, set cutoff_time to unbinned time if precalculated_features_trie is not None: group[cutoff_df_time_var] = group[target_time] inner_grouped = group.groupby(cutoff_df_time_var, sort=True) if chunk_size is not None: inner_grouped = _chunk_dataframe_groups( inner_grouped, chunk_size) for time_last, group in inner_grouped: # sort group by instance id ids = group['instance_id'].sort_values().values if no_unapproximated_aggs and approximate is not None: window = None else: window = training_window # calculate values for those instances at time time_last _feature_matrix = calc_results( time_last, ids, precalculated_features=precalculated_features_trie, training_window=window, include_cutoff_time=include_cutoff_time) if is_instance(_feature_matrix, (dd, 
ks), 'DataFrame'): id_name = _feature_matrix.columns[-1] else: id_name = _feature_matrix.index.name # if approximate, merge feature matrix with group frame to get original # cutoff times and passed columns if approximate: cols = [ c for c in _feature_matrix.columns if c not in pass_columns ] indexer = group[['instance_id', target_time] + pass_columns] _feature_matrix = _feature_matrix[cols].merge( indexer, right_on=['instance_id'], left_index=True, how='right') _feature_matrix.set_index(['instance_id', target_time], inplace=True) _feature_matrix.index.set_names([id_name, 'time'], inplace=True) _feature_matrix.sort_index(level=1, kind='mergesort', inplace=True) else: # all rows have same cutoff time. set time and add passed columns num_rows = len(ids) if len(pass_columns) > 0: pass_through = group[ ['instance_id', cutoff_df_time_var] + pass_columns] pass_through.rename(columns={ 'instance_id': id_name, cutoff_df_time_var: 'time' }, inplace=True) if isinstance(_feature_matrix, pd.DataFrame): time_index = pd.Index([time_last] * num_rows, name='time') _feature_matrix = _feature_matrix.set_index( time_index, append=True) if len(pass_columns) > 0: pass_through.set_index([id_name, 'time'], inplace=True) for col in pass_columns: _feature_matrix[col] = pass_through[col] elif isinstance(_feature_matrix, dd.DataFrame) and (len(pass_columns) > 0): _feature_matrix['time'] = time_last for col in pass_columns: pass_df = dd.from_pandas( pass_through[[id_name, 'time', col]], npartitions=_feature_matrix.npartitions) _feature_matrix = _feature_matrix.merge( pass_df, how="outer") _feature_matrix = _feature_matrix.drop( columns=['time']) elif is_instance(_feature_matrix, ks, 'DataFrame') and (len(pass_columns) > 0): _feature_matrix['time'] = time_last for col in pass_columns: pass_df = ks.from_pandas( pass_through[[id_name, 'time', col]]) _feature_matrix = _feature_matrix.merge( pass_df, how="outer") _feature_matrix = _feature_matrix.drop( columns=['time']) feature_matrix.append(_feature_matrix) if any(isinstance(fm, dd.DataFrame) for fm in feature_matrix): feature_matrix = dd.concat(feature_matrix) elif any(is_instance(fm, ks, 'DataFrame') for fm in feature_matrix): feature_matrix = ks.concat(feature_matrix) else: feature_matrix = pd.concat(feature_matrix) return feature_matrix
def add_last_time_indexes(self, updated_entities=None): """ Calculates the last time index values for each entity (the last time an instance or children of that instance were observed). Used when calculating features using training windows Args: updated_entities (list[str]): List of entity ids to update last_time_index for (will update all parents of those entities as well) """ # Generate graph of entities to find leaf entities children = defaultdict(list) # parent --> child mapping child_vars = defaultdict(dict) for r in self.relationships: children[r.parent_entity.id].append(r.child_entity) child_vars[r.parent_entity.id][r.child_entity.id] = r.child_variable updated_entities = updated_entities or [] if updated_entities: # find parents of updated_entities parent_queue = updated_entities[:] parents = set() while len(parent_queue): e = parent_queue.pop(0) if e in parents: continue parents.add(e) for parent_id, _ in self.get_forward_entities(e): parent_queue.append(parent_id) queue = [self[p] for p in parents] to_explore = parents else: to_explore = set([e.id for e in self.entities[:]]) queue = self.entities[:] explored = set() for e in queue: e.last_time_index = None # We will explore children of entities on the queue, # which may not be in the to_explore set. Therefore, # we check whether all elements of to_explore are in # explored, rather than just comparing length while not to_explore.issubset(explored): entity = queue.pop(0) if entity.last_time_index is None: if entity.time_index is not None: lti = entity.df[entity.time_index].copy() if isinstance(entity.df, dd.DataFrame): # The current Dask implementation doesn't set the index of the dataframe # to the entity's index, so we have to do it manually here lti.index = entity.df[entity.index].copy() else: lti = entity.df[entity.index].copy() if isinstance(entity.df, dd.DataFrame): lti.index = entity.df[entity.index].copy() lti = lti.apply(lambda x: None) elif is_instance(entity.df, ks, 'DataFrame'): lti = ks.Series(pd.Series(index=lti.to_list(), name=lti.name)) else: lti[:] = None entity.last_time_index = lti if entity.id in children: child_entities = children[entity.id] # if all children not explored, skip for now if not set([e.id for e in child_entities]).issubset(explored): # Now there is a possibility that a child entity # was not explicitly provided in updated_entities, # and never made it onto the queue. 
If updated_entities # is None then we just load all entities onto the queue # so we didn't need this logic for e in child_entities: if e.id not in explored and e.id not in [q.id for q in queue]: queue.append(e) queue.append(entity) continue # updated last time from all children for child_e in child_entities: # TODO: Figure out if Dask code related to indexes is important for Koalas if child_e.last_time_index is None: continue link_var = child_vars[entity.id][child_e.id].id lti_is_dask = isinstance(child_e.last_time_index, dd.Series) lti_is_koalas = is_instance(child_e.last_time_index, ks, 'Series') if lti_is_dask or lti_is_koalas: to_join = child_e.df[link_var] if lti_is_dask: to_join.index = child_e.df[child_e.index] lti_df = child_e.last_time_index.to_frame(name='last_time').join( to_join.to_frame(name=entity.index) ) if lti_is_dask: new_index = lti_df.index.copy() new_index.name = None lti_df.index = new_index lti_df = lti_df.groupby(lti_df[entity.index]).agg('max') lti_df = entity.last_time_index.to_frame(name='last_time_old').join(lti_df) else: lti_df = pd.DataFrame({'last_time': child_e.last_time_index, entity.index: child_e.df[link_var]}) # sort by time and keep only the most recent lti_df.sort_values(['last_time', entity.index], kind="mergesort", inplace=True) lti_df.drop_duplicates(entity.index, keep='last', inplace=True) lti_df.set_index(entity.index, inplace=True) lti_df = lti_df.reindex(entity.last_time_index.index) lti_df['last_time_old'] = entity.last_time_index if not (lti_is_dask or lti_is_koalas) and lti_df.empty: # Pandas errors out if it tries to do fillna and then max on an empty dataframe lti_df = pd.Series() else: if lti_is_koalas: lti_df['last_time'] = ks.to_datetime(lti_df['last_time']) lti_df['last_time_old'] = ks.to_datetime(lti_df['last_time_old']) # TODO: Figure out a workaround for fillna and replace lti_df = lti_df.max(axis=1) else: lti_df['last_time'] = lti_df['last_time'].astype('datetime64[ns]') lti_df['last_time_old'] = lti_df['last_time_old'].astype('datetime64[ns]') lti_df = lti_df.fillna(pd.to_datetime('1800-01-01 00:00')).max(axis=1) lti_df = lti_df.replace(pd.to_datetime('1800-01-01 00:00'), pd.NaT) # lti_df = lti_df.apply(lambda x: x.dropna().max(), axis=1) entity.last_time_index = lti_df entity.last_time_index.name = 'last_time' explored.add(entity.id) self.reset_data_description()
def _calculate_agg_features(self, features, frame, df_trie, progress_callback): test_feature = features[0] child_dataframe = test_feature.base_features[0].dataframe base_frame = df_trie.get_node(test_feature.relationship_path).value parent_merge_col = test_feature.relationship_path[0][ 1]._parent_column_name # Sometimes approximate features get computed in a previous filter frame # and put in the current one dynamically, # so there may be existing features here fl = [] for f in features: for ind in f.get_feature_names(): if ind not in frame.columns: fl.append(f) break features = fl if not len(features): progress_callback(len(features) / float(self.num_features)) return frame # handle where base_frame_empty = (base_frame.empty if isinstance( base_frame, pd.DataFrame) else False) where = test_feature.where if where is not None and not base_frame_empty: base_frame = base_frame.loc[base_frame[where.get_name()]] # when no child data, just add all the features to frame with nan base_frame_empty = (base_frame.empty if isinstance( base_frame, pd.DataFrame) else False) if base_frame_empty: feature_values = [] for f in features: feature_values.append( (f, np.full(f.number_output_features, np.nan))) progress_callback(1 / float(self.num_features)) frame = update_feature_columns(feature_values, frame) else: relationship_path = test_feature.relationship_path groupby_col = get_relationship_column_id(relationship_path) # if the use_previous property exists on this feature, include only the # instances from the child dataframe included in that Timedelta use_previous = test_feature.use_previous if use_previous: # Filter by use_previous values time_last = self.time_last if use_previous.has_no_observations(): time_first = time_last - use_previous ti = child_dataframe.ww.time_index if ti is not None: base_frame = base_frame[base_frame[ti] >= time_first] else: n = use_previous.get_value("o") def last_n(df): return df.iloc[-n:] base_frame = base_frame.groupby(groupby_col, observed=True, sort=False).apply(last_n) to_agg = {} agg_rename = {} to_apply = set() # apply multi-column and time-dependent features as we find them, and # save aggregable features for later for f in features: if _can_agg(f): column_id = f.base_features[0].get_name() if column_id not in to_agg: to_agg[column_id] = [] if isinstance(base_frame, dd.DataFrame): func = f.get_function(agg_type=Library.DASK) elif is_instance(base_frame, ps, "DataFrame"): func = f.get_function(agg_type=Library.SPARK) else: func = f.get_function() # for some reason, using the string count is significantly # faster than any method a primitive can return # https://stackoverflow.com/questions/55731149/use-a-function-instead-of-string-in-pandas-groupby-agg if func == pd.Series.count: func = "count" funcname = func if callable(func): # if the same function is being applied to the same # column twice, wrap it in a partial to avoid # duplicate functions funcname = str(id(func)) if "{}-{}".format(column_id, funcname) in agg_rename: func = partial(func) funcname = str(id(func)) func.__name__ = funcname if isinstance(func, dd.Aggregation): # TODO: handle aggregation being applied to same column twice # (see above partial wrapping of functions) funcname = func.__name__ to_agg[column_id].append(func) # this is used below to rename columns that pandas names for us agg_rename["{}-{}".format(column_id, funcname)] = f.get_name() continue to_apply.add(f) # Apply the non-aggregable functions generate a new dataframe, and merge # it with the existing one if len(to_apply): wrap = 
agg_wrapper(to_apply, self.time_last) # groupby_col can be both the name of the index and a column, # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) to_merge = base_frame.groupby(base_frame[groupby_col], observed=True, sort=False).apply(wrap) frame = pd.merge( left=frame, right=to_merge, left_index=True, right_index=True, how="left", ) progress_callback(len(to_apply) / float(self.num_features)) # Apply the aggregate functions to generate a new dataframe, and merge # it with the existing one if len(to_agg): # groupby_col can be both the name of the index and a column, # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) if is_instance(base_frame, (dd, ps), "DataFrame"): to_merge = base_frame.groupby(groupby_col).agg(to_agg) else: to_merge = base_frame.groupby(base_frame[groupby_col], observed=True, sort=False).agg(to_agg) # rename columns to the correct feature names to_merge.columns = [ agg_rename["-".join(x)] for x in to_merge.columns ] to_merge = to_merge[list(agg_rename.values())] # Workaround for pandas bug where categories are in the wrong order # see: https://github.com/pandas-dev/pandas/issues/22501 # # Pandas claims that bug is fixed but it still shows up in some # cases. More investigation needed. if pdtypes.is_categorical_dtype(frame.index): categories = pdtypes.CategoricalDtype( categories=frame.index.categories) to_merge.index = to_merge.index.astype(object).astype( categories) if is_instance(frame, (dd, ps), "DataFrame"): frame = frame.merge(to_merge, left_on=parent_merge_col, right_index=True, how="left") else: frame = pd.merge( left=frame, right=to_merge, left_index=True, right_index=True, how="left", ) # determine number of features that were just merged progress_callback( len(to_merge.columns) / float(self.num_features)) # Handle default values fillna_dict = {} for f in features: feature_defaults = { name: f.default_value for name in f.get_feature_names() } fillna_dict.update(feature_defaults) frame = frame.fillna(fillna_dict) return frame
def query_by_values(self, entity_id, instance_vals, variable_id=None, columns=None, time_last=None, training_window=None, include_cutoff_time=True): """Query instances that have variable with given value Args: entity_id (str): The id of the entity to query instance_vals (pd.Dataframe, pd.Series, list[str] or str) : Instance(s) to match. variable_id (str) : Variable to query on. If None, query on index. columns (list[str]) : Columns to return. Return all columns if None. time_last (pd.TimeStamp) : Query data up to and including this time. Only applies if entity has a time index. training_window (Timedelta, optional): Window defining how much time before the cutoff time data can be used when calculating features. If None, all data before cutoff time is used. include_cutoff_time (bool): If True, data at cutoff time are included in calculating features Returns: pd.DataFrame : instances that match constraints with ids in order of underlying dataframe """ entity = self[entity_id] if not variable_id: variable_id = entity.index instance_vals = _vals_to_series(instance_vals, variable_id) training_window = _check_timedelta(training_window) if training_window is not None: assert training_window.has_no_observations(), "Training window cannot be in observations" if instance_vals is None: df = entity.df.copy() elif isinstance(instance_vals, pd.Series) and instance_vals.empty: df = entity.df.head(0) else: if is_instance(instance_vals, (dd, ks), 'Series'): df = entity.df.merge(instance_vals.to_frame(), how="inner", on=variable_id) elif isinstance(instance_vals, pd.Series) and is_instance(entity.df, ks, 'DataFrame'): df = entity.df.merge(ks.DataFrame({variable_id: instance_vals}), how="inner", on=variable_id) else: df = entity.df[entity.df[variable_id].isin(instance_vals)] if isinstance(entity.df, pd.DataFrame): df = df.set_index(entity.index, drop=False) # ensure filtered df has same categories as original # workaround for issue below # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538 if pdtypes.is_categorical_dtype(entity.df[variable_id]): categories = pd.api.types.CategoricalDtype(categories=entity.df[variable_id].cat.categories) df[variable_id] = df[variable_id].astype(categories) df = self._handle_time(entity_id=entity_id, df=df, time_last=time_last, training_window=training_window, include_cutoff_time=include_cutoff_time) if columns is not None: df = df[columns] return df
def calculate_feature_matrix(features, entityset=None, cutoff_time=None, instance_ids=None, entities=None, relationships=None, cutoff_time_in_index=False, training_window=None, approximate=None, save_progress=None, verbose=False, chunk_size=None, n_jobs=1, dask_kwargs=None, progress_callback=None, include_cutoff_time=True): """Calculates a matrix for a given set of instance ids and calculation times. Args: features (list[:class:`.FeatureBase`]): Feature definitions to be calculated. entityset (EntitySet): An already initialized entityset. Required if `entities` and `relationships` not provided cutoff_time (pd.DataFrame or Datetime): Specifies times at which to calculate the features for each instance. The resulting feature matrix will use data up to and including the cutoff_time. Can either be a DataFrame or a single value. If a DataFrame is passed the instance ids for which to calculate features must be in a column with the same name as the target entity index or a column named `instance_id`. The cutoff time values in the DataFrame must be in a column with the same name as the target entity time index or a column named `time`. If the DataFrame has more than two columns, any additional columns will be added to the resulting feature matrix. If a single value is passed, this value will be used for all instances. instance_ids (list): List of instances to calculate features on. Only used if cutoff_time is a single datetime. entities (dict[str -> tuple(pd.DataFrame, str, str, dict[str -> Variable])]): dictionary of entities. Entries take the format {entity id -> (dataframe, id column, (time_column), (variable_types))}. Note that time_column and variable_types are optional. relationships (list[(str, str, str, str)]): list of relationships between entities. List items are a tuple with the format (parent entity id, parent variable, child entity id, child variable). cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex where the second index is the cutoff time (first is instance id). DataFrame will be sorted by (time, instance_id). training_window (Timedelta or str, optional): Window defining how much time before the cutoff time data can be used when calculating features. If ``None``, all data before cutoff time is used. Defaults to ``None``. approximate (Timedelta or str): Frequency to group instances with similar cutoff times by for features with costly calculations. For example, if bucket is 24 hours, all instances with cutoff times on the same day will use the same calculation for expensive features. verbose (bool, optional): Print progress info. The time granularity is per chunk. chunk_size (int or float or None): maximum number of rows of output feature matrix to calculate at time. If passed an integer greater than 0, will try to use that many rows per chunk. If passed a float value between 0 and 1 sets the chunk size to that percentage of all rows. if None, and n_jobs > 1 it will be set to 1/n_jobs n_jobs (int, optional): number of parallel processes to use when calculating feature matrix. dask_kwargs (dict, optional): Dictionary of keyword arguments to be passed when creating the dask client and scheduler. Even if n_jobs is not set, using `dask_kwargs` will enable multiprocessing. Main parameters: cluster (str or dask.distributed.LocalCluster): cluster or address of cluster to send tasks to. If unspecified, a cluster will be created. diagnostics port (int): port number to use for web dashboard. If left unspecified, web interface will not be enabled. 
Valid keyword arguments for LocalCluster will also be accepted. save_progress (str, optional): path to save intermediate computational results. progress_callback (callable): function to be called with incremental progress updates. Has the following parameters: update: percentage change (float between 0 and 100) in progress since last call progress_percent: percentage (float between 0 and 100) of total computation completed time_elapsed: total time in seconds that has elapsed since start of call include_cutoff_time (bool): Include data at cutoff times in feature calculations. Defaults to ``True``. Returns: pd.DataFrame: The feature matrix. """ assert (isinstance(features, list) and features != [] and all([isinstance(feature, FeatureBase) for feature in features])), \ "features must be a non-empty list of features" # handle loading entityset from featuretools.entityset.entityset import EntitySet if not isinstance(entityset, EntitySet): if entities is not None and relationships is not None: entityset = EntitySet("entityset", entities, relationships) if any(isinstance(es.df, dd.DataFrame) for es in entityset.entities): if approximate: msg = "Using approximate is not supported with Dask Entities" raise ValueError(msg) if training_window: msg = "Using training_window is not supported with Dask Entities" raise ValueError(msg) target_entity = entityset[features[0].entity.id] cutoff_time = _validate_cutoff_time(cutoff_time, target_entity) if isinstance(cutoff_time, pd.DataFrame): if instance_ids: msg = "Passing 'instance_ids' is valid only if 'cutoff_time' is a single value or None - ignoring" warnings.warn(msg) pass_columns = [ col for col in cutoff_time.columns if col not in ['instance_id', 'time'] ] # make sure dtype of instance_id in cutoff time # is same as column it references target_entity = features[0].entity dtype = entityset[target_entity.id].df[target_entity.index].dtype cutoff_time["instance_id"] = cutoff_time["instance_id"].astype(dtype) else: pass_columns = [] if cutoff_time is None: if entityset.time_type == NumericTimeIndex: cutoff_time = np.inf else: cutoff_time = datetime.now() if instance_ids is None: index_var = target_entity.index df = target_entity._handle_time( target_entity.df, time_last=cutoff_time, training_window=training_window, include_cutoff_time=include_cutoff_time) instance_ids = df[index_var] if isinstance(instance_ids, dd.Series): instance_ids = instance_ids.compute() elif is_instance(instance_ids, ks, 'Series'): instance_ids = instance_ids.to_pandas() # convert list or range object into series if not isinstance(instance_ids, pd.Series): instance_ids = pd.Series(instance_ids) cutoff_time = (cutoff_time, instance_ids) _check_cutoff_time_type(cutoff_time, entityset.time_type) # Approximate provides no benefit with a single cutoff time, so ignore it if isinstance(cutoff_time, tuple) and approximate is not None: msg = "Using approximate with a single cutoff_time value or no cutoff_time " \ "provides no computational efficiency benefit" warnings.warn(msg) cutoff_time = pd.DataFrame({ "instance_id": cutoff_time[1], "time": [cutoff_time[0]] * len(cutoff_time[1]) }) feature_set = FeatureSet(features) # Get features to approximate if approximate is not None: approximate_feature_trie = gather_approximate_features(feature_set) # Make a new FeatureSet that ignores approximated features feature_set = FeatureSet( features, approximate_feature_trie=approximate_feature_trie) # Check if there are any non-approximated aggregation features no_unapproximated_aggs = True for feature in 
features: if isinstance(feature, AggregationFeature): # do not need to check if feature is in to_approximate since # only base features of direct features can be in to_approximate no_unapproximated_aggs = False break if approximate is not None: all_approx_features = { f for _, feats in feature_set.approximate_feature_trie for f in feats } else: all_approx_features = set() deps = feature.get_dependencies(deep=True, ignored=all_approx_features) for dependency in deps: if isinstance(dependency, AggregationFeature): no_unapproximated_aggs = False break cutoff_df_time_var = 'time' target_time = '_original_time' if approximate is not None: # If there are approximated aggs, bin times binned_cutoff_time = bin_cutoff_times(cutoff_time, approximate) # Think about collisions: what if original time is a feature binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var] cutoff_time_to_pass = binned_cutoff_time else: cutoff_time_to_pass = cutoff_time if isinstance(cutoff_time, pd.DataFrame): cutoff_time_len = cutoff_time.shape[0] else: cutoff_time_len = len(cutoff_time[1]) chunk_size = _handle_chunk_size(chunk_size, cutoff_time_len) tqdm_options = { 'total': (cutoff_time_len / FEATURE_CALCULATION_PERCENTAGE), 'bar_format': PBAR_FORMAT, 'disable': True } if verbose: tqdm_options.update({'disable': False}) elif progress_callback is not None: # allows us to utilize progress_bar updates without printing to anywhere tqdm_options.update({'file': open(os.devnull, 'w'), 'disable': False}) with make_tqdm_iterator(**tqdm_options) as progress_bar: if n_jobs != 1 or dask_kwargs is not None: feature_matrix = parallel_calculate_chunks( cutoff_time=cutoff_time_to_pass, chunk_size=chunk_size, feature_set=feature_set, approximate=approximate, training_window=training_window, save_progress=save_progress, entityset=entityset, n_jobs=n_jobs, no_unapproximated_aggs=no_unapproximated_aggs, cutoff_df_time_var=cutoff_df_time_var, target_time=target_time, pass_columns=pass_columns, progress_bar=progress_bar, dask_kwargs=dask_kwargs or {}, progress_callback=progress_callback, include_cutoff_time=include_cutoff_time) else: feature_matrix = calculate_chunk( cutoff_time=cutoff_time_to_pass, chunk_size=chunk_size, feature_set=feature_set, approximate=approximate, training_window=training_window, save_progress=save_progress, entityset=entityset, no_unapproximated_aggs=no_unapproximated_aggs, cutoff_df_time_var=cutoff_df_time_var, target_time=target_time, pass_columns=pass_columns, progress_bar=progress_bar, progress_callback=progress_callback, include_cutoff_time=include_cutoff_time) # ensure rows are sorted by input order if isinstance(feature_matrix, pd.DataFrame): if isinstance(cutoff_time, pd.DataFrame): feature_matrix = feature_matrix.reindex( pd.MultiIndex.from_frame( cutoff_time[["instance_id", "time"]], names=feature_matrix.index.names)) else: # Maintain index dtype index_dtype = feature_matrix.index.get_level_values(0).dtype feature_matrix = feature_matrix.reindex( cutoff_time[1].astype(index_dtype), level=0) if not cutoff_time_in_index: feature_matrix.reset_index(level='time', drop=True, inplace=True) if save_progress and os.path.exists(os.path.join( save_progress, 'temp')): shutil.rmtree(os.path.join(save_progress, 'temp')) # force to 100% since we saved last 5 percent previous_progress = progress_bar.n progress_bar.update(progress_bar.total - progress_bar.n) if progress_callback is not None: update, progress_percent, time_elapsed = update_progress_callback_parameters( progress_bar, previous_progress) 
progress_callback(update, progress_percent, time_elapsed) progress_bar.refresh() return feature_matrix
def infer_variable_types(df, link_vars, variable_types, time_index, secondary_time_index):
    '''Infer variable types from dataframe

    Args:
        df (DataFrame): Input DataFrame
        link_vars (list[str]): Linked variables
        variable_types (dict[str -> dict[str -> type]]) : An entity's variable_types dict
            maps string variable ids to types (:class:`.Variable`) or (type, kwargs) to pass
            keyword arguments to the Variable.
        time_index (str or None): Name of time_index column
        secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns that
            each map to a list of columns that depend on that secondary time
    '''
    # TODO: set pk and pk types here
    inferred_types = {}
    vids_to_assume_datetime = [time_index]
    if len(list(secondary_time_index.keys())):
        vids_to_assume_datetime.append(list(secondary_time_index.keys())[0])
    inferred_type = vtypes.Unknown
    for variable in df.columns:
        if variable in variable_types:
            continue
        elif isinstance(df, dd.DataFrame):
            msg = 'Variable types cannot be inferred from Dask DataFrames, ' \
                  'use variable_types to provide type metadata for entity'
            raise ValueError(msg)
        elif is_instance(df, ks, 'DataFrame'):
            msg = 'Variable types cannot be inferred from Koalas DataFrames, ' \
                  'use variable_types to provide type metadata for entity'
            raise ValueError(msg)
        elif variable in vids_to_assume_datetime:
            if col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Numeric
        elif variable in link_vars:
            inferred_type = vtypes.Categorical
        elif df[variable].dtype == "object":
            if not len(df[variable]):
                inferred_type = vtypes.Categorical
            elif col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Categorical

                # heuristics to predict a type other than categorical
                sample = df[variable].sample(min(10000, len(df[variable])))

                # catch cases where object dtype cannot be interpreted as a string
                try:
                    avg_length = sample.str.len().mean()
                    if avg_length > 50:
                        inferred_type = vtypes.NaturalLanguage
                except AttributeError:
                    pass
        elif df[variable].dtype == "bool":
            inferred_type = vtypes.Boolean
        elif pdtypes.is_categorical_dtype(df[variable].dtype):
            inferred_type = vtypes.Categorical
        elif pdtypes.is_numeric_dtype(df[variable].dtype):
            inferred_type = vtypes.Numeric
        elif col_is_datetime(df[variable]):
            inferred_type = vtypes.Datetime
        elif len(df[variable]):
            n = min(10000, len(df[variable]))
            sample = df[variable].sample(n=n)
            n_unique = len(sample.unique())
            percent_unique = n_unique / sample.size
            if percent_unique < .05:
                inferred_type = vtypes.Categorical
            else:
                inferred_type = vtypes.Numeric

        inferred_types[variable] = inferred_type
    return inferred_types
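# A small sketch of infer_variable_types on a pandas frame, assuming the
# featuretools variable_types module is importable as `vtypes`, the function
# above (and its col_is_datetime helper) is in scope, and with illustrative
# column names only.
import pandas as pd
from featuretools import variable_types as vtypes

df = pd.DataFrame({
    "id": [0, 1, 2],
    "when": pd.date_range("2020-01-01", periods=3),
    "flag": [True, False, True],
    "amount": [1.5, 2.5, 3.5],
})
inferred = infer_variable_types(df, link_vars=[], variable_types={"id": vtypes.Index},
                                time_index="when", secondary_time_index={})
assert inferred["when"] == vtypes.Datetime
assert inferred["flag"] == vtypes.Boolean
assert inferred["amount"] == vtypes.Numeric
assert "id" not in inferred  # columns with user-provided types are skipped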
def normalize_entity(self, base_entity_id, new_entity_id, index,
                     additional_variables=None, copy_variables=None,
                     make_time_index=None, make_secondary_time_index=None,
                     new_entity_time_index=None,
                     new_entity_secondary_time_index=None):
    """Create a new entity and relationship from unique values of an existing variable.

    Args:
        base_entity_id (str) : Entity id from which to split.

        new_entity_id (str): Id of the new entity.

        index (str): Variable in old entity that will become index of new
            entity. Relationship will be created across this variable.

        additional_variables (list[str]): List of variable ids to remove from
            base_entity and move to new entity.

        copy_variables (list[str]): List of variable ids to copy from old
            entity and move to new entity.

        make_time_index (bool or str, optional): Create time index for new entity based
            on time index in base_entity, optionally specifying which variable in base_entity
            to use for time_index. If specified as True without a specific variable,
            uses the primary time index. Defaults to True if base entity has a time index.

        make_secondary_time_index (dict[str -> list[str]], optional): Create a secondary
            time index from key. Values of dictionary are the variables to associate with
            the secondary time index. Only one secondary time index is allowed.
            If None, only associate the time index.

        new_entity_time_index (str, optional): Rename new entity time index.

        new_entity_secondary_time_index (str, optional): Rename new entity secondary time index.
    """
    base_entity = self.entity_dict[base_entity_id]
    additional_variables = additional_variables or []
    copy_variables = copy_variables or []

    # Check base entity to make sure time index is valid
    if base_entity.time_index is not None:
        t_index = base_entity[base_entity.time_index]
        if not isinstance(t_index, (vtypes.NumericTimeIndex, vtypes.DatetimeTimeIndex)):
            base_error = "Time index '{0}' is not a NumericTimeIndex or DatetimeTimeIndex, but type {1}. Use set_time_index on entity '{2}' to set the time_index."
            raise TypeError(base_error.format(base_entity.time_index, type(t_index), str(base_entity.id)))

    if not isinstance(additional_variables, list):
        raise TypeError("'additional_variables' must be a list, but received type {}"
                        .format(type(additional_variables)))

    if len(additional_variables) != len(set(additional_variables)):
        raise ValueError("'additional_variables' contains duplicate variables. All variables must be unique.")

    if not isinstance(copy_variables, list):
        raise TypeError("'copy_variables' must be a list, but received type {}"
                        .format(type(copy_variables)))

    if len(copy_variables) != len(set(copy_variables)):
        raise ValueError("'copy_variables' contains duplicate variables. All variables must be unique.")

    for v in additional_variables + copy_variables:
        if v == index:
            raise ValueError("Not copying {} as both index and variable".format(v))

    for v in additional_variables:
        if v == base_entity.time_index:
            raise ValueError("Not moving {} as it is the base time index variable. "
                             "Perhaps move it to copy_variables instead.".format(v))

    if isinstance(make_time_index, str):
        if make_time_index not in base_entity.df.columns:
            raise ValueError("'make_time_index' must be a variable in the base entity")
        elif make_time_index not in additional_variables + copy_variables:
            raise ValueError("'make_time_index' must be specified in 'additional_variables' or 'copy_variables'")
    if index == base_entity.index:
        raise ValueError("'index' must be different from the index column of the base entity")

    transfer_types = {}
    transfer_types[index] = type(base_entity[index])
    for v in additional_variables + copy_variables:
        if type(base_entity[v]) == vtypes.DatetimeTimeIndex:
            transfer_types[v] = vtypes.Datetime
        elif type(base_entity[v]) == vtypes.NumericTimeIndex:
            transfer_types[v] = vtypes.Numeric
        else:
            transfer_types[v] = type(base_entity[v])

    # create and add new entity
    new_entity_df = self[base_entity_id].df.copy()

    if make_time_index is None and base_entity.time_index is not None:
        make_time_index = True

    if isinstance(make_time_index, str):
        # Set the new time index to make_time_index.
        base_time_index = make_time_index
        new_entity_time_index = make_time_index
        already_sorted = (new_entity_time_index == base_entity.time_index)
    elif make_time_index:
        # Create a new time index based on the base entity time index.
        base_time_index = base_entity.time_index
        if new_entity_time_index is None:
            new_entity_time_index = "first_%s_time" % (base_entity.id)

        already_sorted = True

        assert base_entity.time_index is not None, \
            "Base entity doesn't have time_index defined"

        if base_time_index not in [v for v in additional_variables]:
            copy_variables.append(base_time_index)

        transfer_types[new_entity_time_index] = type(base_entity[base_entity.time_index])
    else:
        new_entity_time_index = None
        already_sorted = False

    if new_entity_time_index is not None and new_entity_time_index == index:
        raise ValueError("time_index and index cannot be the same value, %s" % (new_entity_time_index))

    selected_variables = [index] +\
        [v for v in additional_variables] +\
        [v for v in copy_variables]

    new_entity_df2 = new_entity_df. \
        drop_duplicates(index, keep='first')[selected_variables]

    if make_time_index:
        new_entity_df2 = new_entity_df2.rename(columns={base_time_index: new_entity_time_index})
    if make_secondary_time_index:
        assert len(make_secondary_time_index) == 1, "Can only provide 1 secondary time index"
        secondary_time_index = list(make_secondary_time_index.keys())[0]

        secondary_variables = [index, secondary_time_index] + list(make_secondary_time_index.values())[0]
        secondary_df = new_entity_df. \
            drop_duplicates(index, keep='last')[secondary_variables]
        if new_entity_secondary_time_index:
            secondary_df = secondary_df.rename(columns={secondary_time_index: new_entity_secondary_time_index})
            secondary_time_index = new_entity_secondary_time_index
        else:
            new_entity_secondary_time_index = secondary_time_index

        secondary_df = secondary_df.set_index(index)
        new_entity_df = new_entity_df2.join(secondary_df, on=index)
    else:
        new_entity_df = new_entity_df2

    base_entity_index = index

    transfer_types[index] = vtypes.Categorical
    if make_secondary_time_index:
        old_ti_name = list(make_secondary_time_index.keys())[0]
        ti_cols = list(make_secondary_time_index.values())[0]
        ti_cols = [c if c != old_ti_name else secondary_time_index for c in ti_cols]
        make_secondary_time_index = {secondary_time_index: ti_cols}

    if is_instance(new_entity_df, ks, 'DataFrame'):
        already_sorted = False

    self.entity_from_dataframe(
        new_entity_id,
        new_entity_df,
        index,
        already_sorted=already_sorted,
        time_index=new_entity_time_index,
        secondary_time_index=make_secondary_time_index,
        variable_types=transfer_types)

    self.entity_dict[base_entity_id].delete_variables(additional_variables)

    new_entity = self.entity_dict[new_entity_id]
    base_entity.convert_variable_type(base_entity_index, vtypes.Id, convert_data=False)
    self.add_relationship(Relationship(new_entity[index], base_entity[base_entity_index]))
    self.reset_data_description()
    return self
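# Illustrative usage sketch for normalize_entity (not part of the library
# code). It assumes the method lives on an EntitySet-like class that also
# exposes entity_from_dataframe, as used above; the entityset class is passed
# in so no import path is asserted, and the data and ids are invented.
def _example_normalize_entity(entityset_cls):
    import pandas as pd

    transactions = pd.DataFrame({
        "transaction_id": [1, 2, 3, 4],
        "session_id": [1, 1, 2, 2],
        "amount": [10.0, 20.0, 5.0, 7.5],
        "transaction_time": pd.to_datetime(
            ["2020-01-01", "2020-01-02", "2020-01-03", "2020-01-04"]),
    })

    es = entityset_cls(id="example")
    es.entity_from_dataframe(entity_id="transactions",
                             dataframe=transactions,
                             index="transaction_id",
                             time_index="transaction_time")

    # Split out one row per session_id; because the base entity has a time
    # index, the new entity gets a "first_transactions_time" time index and a
    # relationship is added from transactions.session_id to the new entity.
    es.normalize_entity(base_entity_id="transactions",
                        new_entity_id="sessions",
                        index="session_id")
    return es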
def run(self, instance_ids, progress_callback=None, include_cutoff_time=True):
    """
    Calculate values of features for the given instances of the target
    dataframe.

    Summary of algorithm:
    1. Construct a trie where the edges are relationships and each node
        contains a set of features for a single dataframe. See
        FeatureSet._build_feature_trie.
    2. Initialize a trie for storing dataframes.
    3. Traverse the trie using depth first search. At each node calculate
        the features and store the resulting dataframe in the dataframe
        trie (so that its values can be used by features which depend on
        these features). See _calculate_features_for_dataframe.
    4. Get the dataframe at the root of the trie (for the target dataframe)
        and return the columns corresponding to the requested features.

    Args:
        instance_ids (np.ndarray or pd.Categorical): Instance ids for which
            to build features.

        progress_callback (callable): function to be called with incremental
            progress updates

        include_cutoff_time (bool): If True, data at cutoff time are included
            in calculating features.

    Returns:
        pd.DataFrame : Pandas DataFrame of calculated feature values.
            Indexed by instance_ids. Columns in same order as features
            passed in.
    """
    assert len(instance_ids) > 0, "0 instance ids provided"

    if progress_callback is None:
        # do nothing for the progress callback if not provided
        def progress_callback(*args):
            pass

    feature_trie = self.feature_set.feature_trie

    df_trie = Trie(path_constructor=RelationshipPath)
    full_dataframe_trie = Trie(path_constructor=RelationshipPath)

    target_dataframe = self.entityset[self.feature_set.target_df_name]

    self._calculate_features_for_dataframe(
        dataframe_name=self.feature_set.target_df_name,
        feature_trie=feature_trie,
        df_trie=df_trie,
        full_dataframe_trie=full_dataframe_trie,
        precalculated_trie=self.precalculated_features,
        filter_column=target_dataframe.ww.index,
        filter_values=instance_ids,
        progress_callback=progress_callback,
        include_cutoff_time=include_cutoff_time,
    )

    # The dataframe for the target dataframe should be stored at the root of
    # df_trie.
    df = df_trie.value

    # Fill in empty rows with default values. This only works for pandas dataframes
    # and is not currently supported for Dask dataframes.
    if isinstance(df, pd.DataFrame):
        index_dtype = df.index.dtype.name
        if df.empty:
            return self.generate_default_df(instance_ids=instance_ids)

        missing_ids = [
            i for i in instance_ids if i not in df[target_dataframe.ww.index]
        ]
        if missing_ids:
            default_df = self.generate_default_df(instance_ids=missing_ids,
                                                  extra_columns=df.columns)

            df = default_df.append(df, sort=True)

        df.index.name = self.entityset[self.feature_set.target_df_name].ww.index

        # Order by instance_ids
        unique_instance_ids = pd.unique(instance_ids)
        unique_instance_ids = unique_instance_ids.astype(instance_ids.dtype)
        df = df.reindex(unique_instance_ids)

        # Keep categorical index if original index was categorical
        if index_dtype == "category":
            df.index = df.index.astype("category")

    column_list = []

    for feat in self.feature_set.target_features:
        column_list.extend(feat.get_feature_names())

    if is_instance(df, (dd, ps), "DataFrame"):
        column_list.extend([target_dataframe.ww.index])

    return df[column_list]
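# Illustrative usage sketch (not part of the library code). It assumes the
# surrounding class is a feature-set calculator already constructed from an
# entityset and a FeatureSet; the helper below only exercises the run()
# signature documented above, with a progress callback that records the
# incremental updates it receives.
def _example_run_calculator(calculator, instance_ids):
    import numpy as np

    progress_updates = []

    def record_progress(update):
        # run() reports incremental progress through this callable.
        progress_updates.append(update)

    feature_matrix = calculator.run(np.asarray(instance_ids),
                                    progress_callback=record_progress,
                                    include_cutoff_time=True)
    return feature_matrix, progress_updates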