def test_relationship_path_dataframes(es):
    """Check the dataframe names yielded by ``RelationshipPath.dataframes()``."""
    # A path without edges yields no dataframe names.
    assert list(RelationshipPath([]).dataframes()) == []

    log_to_sessions = Relationship(es, "sessions", "id", "log", "session_id")
    sessions_to_customers = Relationship(
        es, "customers", "id", "sessions", "customer_id"
    )

    # Each case pairs a list of (flag, relationship) edges with the dataframe
    # names expected when walking it: forward, backward, then mixed.
    cases = [
        (
            [(True, log_to_sessions), (True, sessions_to_customers)],
            ["log", "sessions", "customers"],
        ),
        (
            [(False, sessions_to_customers), (False, log_to_sessions)],
            ["customers", "sessions", "log"],
        ),
        (
            [(True, log_to_sessions), (False, log_to_sessions)],
            ["log", "sessions", "log"],
        ),
    ]
    for edges, expected_names in cases:
        assert list(RelationshipPath(edges).dataframes()) == expected_names
def test_relationship_serialization(es):
    """Round-trip a relationship through ``to_dictionary`` / ``from_dictionary``."""
    rel = Relationship(es, "sessions", "id", "log", "session_id")
    expected = dict(
        parent_dataframe_name="sessions",
        parent_column_name="id",
        child_dataframe_name="log",
        child_column_name="session_id",
    )

    # Serializing produces the expected mapping ...
    assert rel.to_dictionary() == expected
    # ... and deserializing that mapping gives back an equal relationship.
    assert Relationship.from_dictionary(expected, es) == rel
def test_relationship_serialization(es):
    """Round-trip a variable-based relationship through its dictionary form."""
    parent_variable = es['sessions']['id']
    child_variable = es['log']['session_id']
    rel = Relationship(parent_variable, child_variable)
    expected = dict(
        parent_entity_id='sessions',
        parent_variable_id='id',
        child_entity_id='log',
        child_variable_id='session_id',
    )

    # Serializing produces the expected mapping ...
    assert rel.to_dictionary() == expected
    # ... and deserializing that mapping gives back an equal relationship.
    assert Relationship.from_dictionary(expected, es) == rel
def test_relationship_serialization(es):
    """Round-trip a relationship through ``to_dictionary`` / ``from_dictionary``."""
    rel = Relationship(es, 'sessions', 'id', 'log', 'session_id')
    expected = dict(
        parent_dataframe_name='sessions',
        parent_column_name='id',
        child_dataframe_name='log',
        child_column_name='session_id',
    )

    # Serializing produces the expected mapping ...
    assert rel.to_dictionary() == expected
    # ... and deserializing that mapping gives back an equal relationship.
    assert Relationship.from_dictionary(expected, es) == rel
def test_relationship_path(es):
    """Check that a ``RelationshipPath`` supports indexing and iteration."""
    log_to_sessions = Relationship(es, 'sessions', 'id', 'log', 'session_id')
    sessions_to_customers = Relationship(es, 'customers', 'id',
                                         'sessions', 'customer_id')
    edges = [
        (True, log_to_sessions),
        (True, sessions_to_customers),
        (False, sessions_to_customers),
    ]
    path = RelationshipPath(edges)

    # Positional access returns the edge supplied at that position.
    for position, expected_edge in enumerate(edges):
        assert path[position] == expected_edge

    # Iterating the path reproduces the edges in their original order.
    assert list(path) == edges
def test_relationship_path_name(es):
    """Check the ``name`` produced for empty, forward, backward and mixed paths."""
    # A path without edges has an empty name.
    assert RelationshipPath([]).name == ''

    log_to_sessions = Relationship(es, 'sessions', 'id', 'log', 'session_id')
    sessions_to_customers = Relationship(es, 'customers', 'id',
                                         'sessions', 'customer_id')

    # Each case pairs a list of (flag, relationship) edges with its expected name.
    cases = [
        ([(True, log_to_sessions), (True, sessions_to_customers)],
         'sessions.customers'),
        ([(False, sessions_to_customers), (False, log_to_sessions)],
         'sessions.log'),
        ([(True, log_to_sessions), (False, log_to_sessions)],
         'sessions.log'),
    ]
    for edges, expected_name in cases:
        assert RelationshipPath(edges).name == expected_name
def test_relationship_path_name(es):
    """Check the ``name`` produced for empty, forward, backward and mixed paths."""
    # A path without edges has an empty name.
    assert RelationshipPath([]).name == ""

    log_to_sessions = Relationship(es, "sessions", "id", "log", "session_id")
    sessions_to_customers = Relationship(
        es, "customers", "id", "sessions", "customer_id"
    )

    # Each case pairs a list of (flag, relationship) edges with its expected name.
    cases = [
        (
            [(True, log_to_sessions), (True, sessions_to_customers)],
            "sessions.customers",
        ),
        (
            [(False, sessions_to_customers), (False, log_to_sessions)],
            "sessions.log",
        ),
        (
            [(True, log_to_sessions), (False, log_to_sessions)],
            "sessions.log",
        ),
    ]
    for edges, expected_name in cases:
        assert RelationshipPath(edges).name == expected_name
def test_relationship_path(es):
    """Check that a ``RelationshipPath`` supports indexing and iteration."""
    log_to_sessions = Relationship(es, "sessions", "id", "log", "session_id")
    sessions_to_customers = Relationship(
        es, "customers", "id", "sessions", "customer_id"
    )
    edges = [
        (True, log_to_sessions),
        (True, sessions_to_customers),
        (False, sessions_to_customers),
    ]
    path = RelationshipPath(edges)

    # Positional access returns the edge supplied at that position.
    for position, expected_edge in enumerate(edges):
        assert path[position] == expected_edge

    # Iterating the path reproduces the edges in their original order.
    assert list(path) == edges
def description_to_entityset(description, **kwargs):
    '''Build an :class:`.EntitySet` from its data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely
            generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Extra keyword arguments forwarded to
            ``read_woodwork_table`` when dataframes are read from disk.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet

    # 'path' is absent (None) when the description was not read from disk.
    root = description.get('path')
    es = EntitySet(description['id'])

    for df_description in description['dataframes'].values():
        if root is None:
            # Nothing on disk to load: use an empty dataframe built from
            # the description.
            table = empty_dataframe(df_description)
        else:
            location = os.path.join(root, 'data', df_description['name'])
            table = read_woodwork_table(location, validate=False, **kwargs)
        es.add_dataframe(table)

    for rel_description in description['relationships']:
        es.add_relationship(
            relationship=Relationship.from_dictionary(rel_description, es))

    return es
def from_dictionary(cls, arguments, entityset, dependencies, primitive):
    """Rebuild a feature from its serialized ``arguments``.

    Args:
        arguments (dict): Serialized feature. Keys read here are
            ``base_features``, ``relationship_path``, ``use_previous``,
            ``where``, ``name`` and (optionally) ``feature_names``.
        entityset: Passed to ``Relationship.from_dictionary`` to resolve the
            serialized relationships.
        dependencies (dict): Maps feature names to already-built features;
            used to look up the base features and the ``where`` feature.
        primitive: Primitive handed to the constructor unchanged.

    Returns:
        The feature created by calling ``cls``.
    """
    base_features = [
        dependencies[feature_name] for feature_name in arguments["base_features"]
    ]

    relationships = [
        Relationship.from_dictionary(serialized, entityset)
        for serialized in arguments["relationship_path"]
    ]
    # The parent dataframe is taken from the first relationship of the path.
    parent_name = relationships[0].parent_dataframe.ww.name
    # Every edge is stored with a False flag.
    path = RelationshipPath([(False, rel) for rel in relationships])

    # A falsy serialized value is passed through untouched; anything else is
    # turned back into a Timedelta.
    use_previous = arguments["use_previous"]
    if use_previous:
        use_previous = Timedelta.from_dictionary(use_previous)

    # Likewise, only a truthy ``where`` name is resolved to a feature.
    where = arguments["where"]
    if where:
        where = dependencies[where]

    feat = cls(
        base_features=base_features,
        parent_dataframe_name=parent_name,
        primitive=primitive,
        relationship_path=path,
        use_previous=use_previous,
        where=where,
        name=arguments["name"],
    )
    feat._names = arguments.get("feature_names")
    return feat
def description_to_entityset(description, **kwargs):
    '''Build an :class:`.EntitySet` from its data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely
            generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Extra keyword arguments; they are merged into each
            entity's ``loading_info['params']`` before the entity is loaded.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet

    # 'path' is absent (None) when the description was not read from disk.
    root = description.get('path')
    es = EntitySet(description['id'])

    # Ids of entities whose description says they have a last time index.
    entities_with_lti = []
    for entity_description in description['entities'].values():
        entity_description['loading_info']['params'].update(kwargs)
        # With root=None an empty dataframe is created for the entity.
        description_to_entity(entity_description, es, path=root)
        if entity_description['properties']['last_time_index']:
            entities_with_lti.append(entity_description['id'])

    for rel_description in description['relationships']:
        es.add_relationship(Relationship.from_dictionary(rel_description, es))

    if entities_with_lti:
        es.add_last_time_indexes(updated_entities=entities_with_lti)

    return es
def __init__(self, id=None, entities=None, relationships=None):
    """Creates EntitySet

    Args:
        id (str) : Unique identifier to associate with this instance
        entities (dict[str -> tuple]): Maps an entity id to a tuple of
            (dataframe, id column, (time_index), (variable_types),
            (make_index)). The last three items are optional and default
            to None when the tuple is too short to contain them.
        relationships (list[(str, str, str, str)]): Each item has the
            format (parent entity id, parent variable,
            child entity id, child variable).

    Example:
        .. code-block:: python

            entities = {
                "cards" : (card_df, "id"),
                "transactions" : (transactions_df, "id", "transaction_time")
            }

            relationships = [("cards", "id", "transactions", "card_id")]

            ft.EntitySet("my-entity-set", entities, relationships)
    """
    self.id = id
    self.entity_dict = {}
    self.relationships = []
    self.time_type = None

    entities = entities or {}
    relationships = relationships or []

    for entity in entities:
        spec = entities[entity]
        df = spec[0]
        index_column = spec[1]
        # Positions 2, 3 and 4 are optional; missing ones become None.
        time_index = spec[2] if len(spec) > 2 else None
        variable_types = spec[3] if len(spec) > 3 else None
        make_index = spec[4] if len(spec) > 4 else None
        self.entity_from_dataframe(entity_id=entity,
                                   dataframe=df,
                                   index=index_column,
                                   time_index=time_index,
                                   variable_types=variable_types,
                                   make_index=make_index)

    for rel_spec in relationships:
        # Resolve both ends to variables already registered on this instance.
        parent_variable = self[rel_spec[0]][rel_spec[1]]
        child_variable = self[rel_spec[2]][rel_spec[3]]
        self.add_relationship(Relationship(parent_variable, child_variable))

    self.reset_data_description()
def from_dictionary(cls, arguments, entityset, dependencies, primitives_deserializer):
    """Rebuild a feature from its serialized ``arguments``.

    Args:
        arguments (dict): Serialized feature. Keys read here are
            'base_features', 'relationship_path', 'primitive',
            'use_previous', 'where' and 'name'.
        entityset: Passed to ``Relationship.from_dictionary`` to resolve the
            serialized relationships.
        dependencies (dict): Maps feature names to already-built features;
            used to look up the base features and the 'where' feature.
        primitives_deserializer: Object whose ``deserialize_primitive``
            turns ``arguments['primitive']`` back into a primitive.

    Returns:
        The feature created by calling ``cls``.
    """
    base_features = [
        dependencies[feature_name] for feature_name in arguments['base_features']
    ]

    relationships = [
        Relationship.from_dictionary(serialized, entityset)
        for serialized in arguments['relationship_path']
    ]
    # The parent dataframe is taken from the first relationship of the path.
    parent_name = relationships[0].parent_dataframe.ww.name
    # Every edge is stored with a False flag.
    path = RelationshipPath([(False, rel) for rel in relationships])

    primitive = primitives_deserializer.deserialize_primitive(
        arguments['primitive'])

    # A falsy serialized value is passed through untouched; anything else is
    # turned back into a Timedelta.
    use_previous = arguments['use_previous']
    if use_previous:
        use_previous = Timedelta.from_dictionary(use_previous)

    # Likewise, only a truthy 'where' name is resolved to a feature.
    where = arguments['where']
    if where:
        where = dependencies[where]

    return cls(base_features=base_features,
               parent_dataframe_name=parent_name,
               primitive=primitive,
               relationship_path=path,
               use_previous=use_previous,
               where=where,
               name=arguments['name'])
def from_dictionary(cls, arguments, entityset, dependencies, primitives_deserializer):
    """Rebuild a feature from its serialized ``arguments``.

    Args:
        arguments (dict): Serialized feature. Keys read here are
            'base_feature', 'relationship' and 'name'.
        entityset: Passed to ``Relationship.from_dictionary`` to resolve the
            serialized relationship.
        dependencies (dict): Maps feature names to already-built features;
            used to look up the base feature.
        primitives_deserializer: Accepted but not used by this method.

    Returns:
        The feature created by calling ``cls``.
    """
    parent_feature = dependencies[arguments['base_feature']]
    rel = Relationship.from_dictionary(arguments['relationship'], entityset)
    # The child dataframe name comes from the deserialized relationship.
    return cls(base_feature=parent_feature,
               child_dataframe_name=rel.child_dataframe.ww.name,
               relationship=rel,
               name=arguments['name'])
def test_relationship_path_dataframes(es):
    """Check the dataframe names yielded by ``RelationshipPath.dataframes()``."""
    # A path without edges yields no dataframe names.
    assert list(RelationshipPath([]).dataframes()) == []

    log_to_sessions = Relationship(es, 'sessions', 'id', 'log', 'session_id')
    sessions_to_customers = Relationship(es, 'customers', 'id',
                                         'sessions', 'customer_id')

    # Each case pairs a list of (flag, relationship) edges with the dataframe
    # names expected when walking it: forward, backward, then mixed.
    cases = [
        ([(True, log_to_sessions), (True, sessions_to_customers)],
         ['log', 'sessions', 'customers']),
        ([(False, sessions_to_customers), (False, log_to_sessions)],
         ['customers', 'sessions', 'log']),
        ([(True, log_to_sessions), (False, log_to_sessions)],
         ['log', 'sessions', 'log']),
    ]
    for edges, expected_names in cases:
        assert list(RelationshipPath(edges).dataframes()) == expected_names
def from_dictionary(cls, arguments, entityset, dependencies, primitive):
    """Rebuild a feature from its serialized ``arguments``.

    Args:
        arguments (dict): Serialized feature. Keys read here are
            ``base_feature``, ``relationship`` and ``name``.
        entityset: Passed to ``Relationship.from_dictionary`` to resolve the
            serialized relationship.
        dependencies (dict): Maps feature names to already-built features;
            used to look up the base feature.
        primitive: Accepted but not used by this method.

    Returns:
        The feature created by calling ``cls``.
    """
    parent_feature = dependencies[arguments["base_feature"]]
    rel = Relationship.from_dictionary(arguments["relationship"], entityset)
    # The child dataframe name comes from the deserialized relationship.
    return cls(
        base_feature=parent_feature,
        child_dataframe_name=rel.child_dataframe.ww.name,
        relationship=rel,
        name=arguments["name"],
    )
def test_relationship_path_entities(es):
    """Check the entity ids yielded by ``RelationshipPath.entities()``."""
    # A path without edges yields no entity ids.
    assert list(RelationshipPath([]).entities()) == []

    log_to_sessions = Relationship(es['sessions']['id'],
                                   es['log']['session_id'])
    sessions_to_customers = Relationship(es['customers']['id'],
                                         es['sessions']['customer_id'])

    # Each case pairs a list of (flag, relationship) edges with the entity
    # ids expected when walking it: forward, backward, then mixed.
    cases = [
        ([(True, log_to_sessions), (True, sessions_to_customers)],
         ['log', 'sessions', 'customers']),
        ([(False, sessions_to_customers), (False, log_to_sessions)],
         ['customers', 'sessions', 'log']),
        ([(True, log_to_sessions), (False, log_to_sessions)],
         ['log', 'sessions', 'log']),
    ]
    for edges, expected_ids in cases:
        assert list(RelationshipPath(edges).entities()) == expected_ids
def description_to_relationship(description, entityset):
    '''Build a :class:`.Relationship` from a relationship description.

    Args:
        description (dict) : Description of :class:`.Relationship`. Its
            'parent' and 'child' entries each unpack to an
            (entity id, variable id) pair.
        entityset (EntitySet) : Instance of :class:`.EntitySet` containing
            parent and child variables.

    Returns:
        relationship (Relationship) : Relationship between the parent and
            child variables looked up in ``entityset``.
    '''
    def lookup(side):
        # Resolve one end of the relationship to its variable.
        entity_id, variable_id = description[side]
        return entityset[entity_id][variable_id]

    # Arguments are evaluated left to right, so the parent is resolved first.
    return Relationship(lookup('parent'), lookup('child'))
def description_to_entityset(description, **kwargs):
    """Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely
            generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Additional keyword arguments to pass as keywords
            arguments to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    """
    check_schema_version(description, "entityset")

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get("path")
    # The format belongs to the description as a whole, so read it once
    # instead of once per dataframe.
    file_format = description.get("format")
    entityset = EntitySet(description["id"])

    for df in description["dataframes"].values():
        if path is not None:
            data_path = os.path.join(path, "data", df["name"])
            # Build the read arguments per dataframe so that a filename
            # chosen for one dataframe cannot leak into the next read.
            read_kwargs = dict(kwargs)
            if file_format is not None:
                read_kwargs["format"] = file_format
            if (
                file_format == "parquet"
                and df["loading_info"]["table_type"] == "pandas"
            ):
                read_kwargs["filename"] = df["name"] + ".parquet"
            dataframe = read_woodwork_table(
                data_path, validate=False, **read_kwargs
            )
        else:
            # Nothing on disk to load: use an empty dataframe built from
            # the description.
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description["relationships"]:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
def _calculate_agg_features(self, features, entity_frames):
    """Compute aggregation features and store them on the parent frame.

    All features are treated as sharing the entity, child entity, ``where``
    clause and ``use_previous`` window of ``features[0]``.

    Args:
        features (list): Aggregation features to compute. Features whose
            name is already a column of the parent frame are skipped.
        entity_frames (dict): Maps entity id to dataframe. Must contain both
            the parent entity and the child entity of the features.

    Returns:
        None. The result is written to ``entity_frames[entity.id]``.

    Raises:
        ValueError: If the parent index or the child link column cannot be
            cast to object dtype for comparison.
    """
    test_feature = features[0]
    use_previous = test_feature.use_previous
    base_features = test_feature.base_features
    where = test_feature.where
    entity = test_feature.entity
    child_entity = base_features[0].entity

    assert entity.id in entity_frames and child_entity.id in entity_frames

    index_var = entity.index
    frame = entity_frames[entity.id]
    base_frame = entity_frames[child_entity.id]
    # Sometimes approximate features get computed in a previous filter frame
    # and put in the current one dynamically,
    # so there may be existing features here
    features = [f for f in features if f.get_name() not in frame.columns]
    if not len(features):
        return

    # handle where clause for all functions below
    if where is not None:
        base_frame = base_frame[base_frame[where.get_name()]]

    relationship_path = self.entityset.find_backward_path(
        entity.id, child_entity.id)

    groupby_var = Relationship._get_link_variable_name(relationship_path)

    # if the use_previous property exists on this feature, include only the
    # instances from the child entity included in that Timedelta
    if use_previous and not base_frame.empty:
        # Filter by use_previous values
        time_last = self.time_last
        if use_previous.is_absolute():
            time_first = time_last - use_previous
            ti = child_entity.time_index
            if ti is not None:
                base_frame = base_frame[base_frame[ti] >= time_first]
        else:
            n = use_previous.value

            def last_n(df):
                return df.iloc[-n:]

            base_frame = base_frame.groupby(groupby_var).apply(last_n)

    # ``no_instances`` is only assigned when base_frame is non-empty; the
    # check below short-circuits on ``base_frame.empty`` otherwise.
    if not base_frame.empty:
        if groupby_var not in base_frame:
            # This occurred sometimes. I think it might have to do with
            # category but not sure. TODO: look into when this occurs
            no_instances = True
        # if the foreign key column in the child (base_frame) that links to
        # frame is an integer and the id column in the parent is an object or
        # category dtype, the .isin() call errors.
        elif (frame[index_var].dtype != base_frame[groupby_var].dtype or
                frame[index_var].dtype.name.find('category') > -1):
            try:
                frame_as_obj = frame[index_var].astype(object)
                base_frame_as_obj = base_frame[groupby_var].astype(object)
            except ValueError:
                msg = u"Could not join {}.{} (dtype={}) with {}.{} (dtype={})"
                raise ValueError(
                    msg.format(entity.id, index_var, frame[index_var].dtype,
                               child_entity.id, groupby_var,
                               base_frame[groupby_var].dtype))
            else:
                no_instances = check_no_related_instances(
                    frame_as_obj.values, base_frame_as_obj.values)
        else:
            no_instances = check_no_related_instances(
                frame[index_var].values, base_frame[groupby_var].values)

    if base_frame.empty or no_instances:
        # Nothing to aggregate: fill every feature with its default column.
        for f in features:
            set_default_column(entity_frames[entity.id], f)
        return

    def wrap_func_with_name(func, name):
        def inner(x):
            return func(x)
        inner.__name__ = name
        return inner

    to_agg = {}
    agg_rename = {}
    to_apply = set()
    # apply multivariable and time-dependent features as we find them, and
    # save aggregable features for later
    for f in features:
        if _can_agg(f):
            variable_id = f.base_features[0].get_name()
            if variable_id not in to_agg:
                to_agg[variable_id] = []
            func = f.get_function()
            # make sure function names are unique
            random_id = str(uuid.uuid1())
            func = wrap_func_with_name(func, random_id)
            funcname = random_id
            to_agg[variable_id].append(func)
            agg_rename[u"{}-{}".format(variable_id, funcname)] = f.get_name()
            continue
        to_apply.add(f)

    # Apply the non-aggregable functions generate a new dataframe, and merge
    # it with the existing one
    if len(to_apply):
        wrap = agg_wrapper(to_apply, self.time_last)
        # groupby_var can be both the name of the index and a column,
        # to silence pandas warning about ambiguity we explicitly pass
        # the column (in actuality grouping by both index and group would
        # work)
        to_merge = base_frame.groupby(base_frame[groupby_var]).apply(wrap)

        to_merge.reset_index(1, drop=True, inplace=True)
        frame = pd.merge(left=frame, right=to_merge,
                         left_on=index_var, right_index=True, how='left')

    # Apply the aggregate functions to generate a new dataframe, and merge
    # it with the existing one
    # Do the [variables] accessor on to_merge because the agg call returns
    # a dataframe with columns that contain the dataframes we want
    if len(to_agg):
        # groupby_var can be both the name of the index and a column,
        # to silence pandas warning about ambiguity we explicitly pass
        # the column (in actuality grouping by both index and group would
        # work)
        to_merge = base_frame.groupby(base_frame[groupby_var]).agg(to_agg)
        # we apply multiple functions to each column, creating
        # a multiindex as the column
        # rename the columns to a concatenation of the two indexes
        to_merge.columns = [
            u"{}-{}".format(n1, n2) for n1, n2 in to_merge.columns.ravel()
        ]
        # to enable a rename
        to_merge = to_merge.rename(columns=agg_rename)
        # Materialize the dict view: selecting columns with a list works
        # regardless of how pandas treats a dict_values object as a key.
        variables = list(agg_rename.values())
        to_merge = to_merge[variables]
        frame = pd.merge(left=frame, right=to_merge,
                         left_on=index_var, right_index=True, how='left')

    # Handle default values
    # 1. handle non scalar default values
    iterfeats = [
        f for f in features if hasattr(f.default_value, '__iter__')
    ]
    for f in iterfeats:
        nulls = pd.isnull(frame[f.get_name()])
        for ni in nulls[nulls].index:
            frame.at[ni, f.get_name()] = f.default_value

    # 2. handle scalars default values
    fillna_dict = {
        f.get_name(): f.default_value
        for f in features if f not in iterfeats
    }
    frame.fillna(fillna_dict, inplace=True)

    # convert boolean dtypes to floats as appropriate
    # pandas behavior: https://github.com/pydata/pandas/issues/3752
    for f in features:
        if (not f.expanding and
                f.variable_type == variable_types.Numeric and
                frame[f.get_name()].dtype.name in ['object', 'bool']):
            frame[f.get_name()] = frame[f.get_name()].astype(float)

    entity_frames[entity.id] = frame
def test_names_when_multiple_relationships_between_dataframes(games_es):
    """Check that both names carry the child column for the ``games_es`` fixture."""
    rel = Relationship(
        games_es,
        "teams",
        "id",
        "games",
        "home_team_id",
    )
    # Both the child and the parent name are suffixed with the child column.
    assert rel.child_name == "games[home_team_id]"
    assert rel.parent_name == "teams[home_team_id]"
def normalize_entity(self, base_entity_id, new_entity_id, index,
                     additional_variables=None, copy_variables=None,
                     make_time_index=None,
                     make_secondary_time_index=None,
                     new_entity_time_index=None,
                     new_entity_secondary_time_index=None):
    """Create a new entity and relationship from unique values of an existing variable.

    Args:
        base_entity_id (str) : Entity id from which to split.

        new_entity_id (str): Id of the new entity.

        index (str): Variable in old entity that will become index of new
            entity. Relationship will be created across this variable.

        additional_variables (list[str]): List of variable ids to remove
            from base_entity and move to new entity.

        copy_variables (list[str]): List of variable ids to copy from old
            entity and move to new entity. The list passed in is not
            modified.

        make_time_index (bool or str, optional): Create time index for new
            entity based on time index in base_entity, optionally
            specifying which variable in base_entity to use for time_index.
            If specified as True without a specific variable, uses the
            primary time index. Defaults to True if base entity has a time
            index.

        make_secondary_time_index (dict[str -> list[str]], optional):
            Create a secondary time index from key. Values of dictionary
            are the variables to associate with the secondary time index.
            Only one secondary time index is allowed. If None, only
            associate the time index.

        new_entity_time_index (str, optional): Rename new entity time
            index.

        new_entity_secondary_time_index (str, optional): Rename new entity
            secondary time index.

    Returns:
        This instance, to allow chaining.

    Raises:
        TypeError: If ``additional_variables`` or ``copy_variables`` is not
            a list.
        ValueError: If either list contains duplicates, if ``index`` also
            appears in either list, if a string ``make_time_index`` is not
            a column of the base entity or is not listed in either list,
            if ``index`` is the base entity's index, or if the new time
            index equals ``index``.
        AssertionError: If a time index is requested but the base entity
            has none, or if more than one secondary time index is given.
    """
    base_entity = self.entity_dict[base_entity_id]
    additional_variables = additional_variables or []
    copy_variables = copy_variables or []

    if not isinstance(additional_variables, list):
        raise TypeError(
            "'additional_variables' must be a list, but received type {}".
            format(type(additional_variables)))

    if len(additional_variables) != len(set(additional_variables)):
        raise ValueError(
            "'additional_variables' contains duplicate variables. All variables must be unique."
        )

    if not isinstance(copy_variables, list):
        raise TypeError(
            "'copy_variables' must be a list, but received type {}".format(
                type(copy_variables)))

    if len(copy_variables) != len(set(copy_variables)):
        raise ValueError(
            "'copy_variables' contains duplicate variables. All variables must be unique."
        )

    # Work on a private copy: the base time index may be appended below and
    # the caller's list must not change as a side effect.
    copy_variables = list(copy_variables)

    for v in additional_variables + copy_variables:
        if v == index:
            raise ValueError(
                "Not copying {} as both index and variable".format(v))

    if is_string(make_time_index):
        if make_time_index not in base_entity.df.columns:
            raise ValueError(
                "'make_time_index' must be a variable in the base entity")
        elif make_time_index not in additional_variables + copy_variables:
            raise ValueError(
                "'make_time_index' must specified in 'additional_variables' or 'copy_variables'"
            )

    if index == base_entity.index:
        raise ValueError(
            "'index' must be different from the index column of the base entity"
        )

    # Variable types carried over to the new entity, keyed by variable id.
    transfer_types = {}
    transfer_types[index] = type(base_entity[index])
    for v in additional_variables + copy_variables:
        transfer_types[v] = type(base_entity[v])

    # create and add new entity
    new_entity_df = self[base_entity_id].df.copy()

    if make_time_index is None and base_entity.time_index is not None:
        make_time_index = True

    if isinstance(make_time_index, str):
        # Set the new time index to make_time_index.
        base_time_index = make_time_index
        new_entity_time_index = make_time_index
        already_sorted = (new_entity_time_index == base_entity.time_index)
    elif make_time_index:
        # Create a new time index based on the base entity time index.
        base_time_index = base_entity.time_index
        if new_entity_time_index is None:
            new_entity_time_index = "first_%s_time" % (base_entity.id)

        already_sorted = True

        assert base_entity.time_index is not None, \
            "Base entity doesn't have time_index defined"

        if base_time_index not in additional_variables:
            copy_variables.append(base_time_index)

        transfer_types[new_entity_time_index] = type(
            base_entity[base_entity.time_index])
    else:
        new_entity_time_index = None
        already_sorted = False

    if new_entity_time_index is not None and new_entity_time_index == index:
        raise ValueError(
            "time_index and index cannot be the same value, %s" %
            (new_entity_time_index))

    selected_variables = [index] + additional_variables + copy_variables

    # The first row seen for each index value supplies the new entity's
    # columns.
    new_entity_df2 = new_entity_df. \
        drop_duplicates(index, keep='first')[selected_variables]

    if make_time_index:
        new_entity_df2.rename(
            columns={base_time_index: new_entity_time_index}, inplace=True)
    if make_secondary_time_index:
        assert len(make_secondary_time_index
                   ) == 1, "Can only provide 1 secondary time index"
        secondary_time_index = list(make_secondary_time_index.keys())[0]

        secondary_variables = [index, secondary_time_index] + list(
            make_secondary_time_index.values())[0]
        # The last row seen for each index value supplies the secondary
        # time index columns.
        secondary_df = new_entity_df. \
            drop_duplicates(index, keep='last')[secondary_variables]
        if new_entity_secondary_time_index:
            secondary_df.rename(columns={
                secondary_time_index: new_entity_secondary_time_index
            }, inplace=True)
            secondary_time_index = new_entity_secondary_time_index
        else:
            new_entity_secondary_time_index = secondary_time_index
        secondary_df.set_index(index, inplace=True)
        new_entity_df = new_entity_df2.join(secondary_df, on=index)
    else:
        new_entity_df = new_entity_df2

    base_entity_index = index

    transfer_types[index] = vtypes.Categorical
    if make_secondary_time_index:
        # Re-key the secondary time index under its (possibly renamed) name.
        old_ti_name = list(make_secondary_time_index.keys())[0]
        ti_cols = list(make_secondary_time_index.values())[0]
        ti_cols = [
            c if c != old_ti_name else secondary_time_index
            for c in ti_cols
        ]
        make_secondary_time_index = {secondary_time_index: ti_cols}

    self.entity_from_dataframe(
        new_entity_id,
        new_entity_df,
        index,
        already_sorted=already_sorted,
        time_index=new_entity_time_index,
        secondary_time_index=make_secondary_time_index,
        variable_types=transfer_types)

    self.entity_dict[base_entity_id].delete_variables(additional_variables)

    new_entity = self.entity_dict[new_entity_id]
    base_entity.convert_variable_type(base_entity_index, vtypes.Id,
                                      convert_data=False)
    self.add_relationship(
        Relationship(new_entity[index], base_entity[base_entity_index]))
    self.reset_data_description()
    return self
def test_names_when_no_other_relationship_between_dataframes(home_games_es):
    """Check that both names are plain dataframe names for ``home_games_es``."""
    rel = Relationship(
        home_games_es,
        "teams",
        "id",
        "games",
        "home_team_id",
    )
    # Neither name carries a column suffix.
    assert rel.child_name == "games"
    assert rel.parent_name == "teams"
def test_names_when_multiple_relationships_between_entities(games_es):
    """Check that both names carry the child variable for the ``games_es`` fixture."""
    parent_variable = games_es['teams']['id']
    child_variable = games_es['games']['home_team_id']
    rel = Relationship(parent_variable, child_variable)
    # Both the child and the parent name are suffixed with the child variable.
    assert rel.child_name == 'games[home_team_id]'
    assert rel.parent_name == 'teams[home_team_id]'
def test_names_when_no_other_relationship_between_entities(home_games_es):
    """Check that both names are plain entity ids for ``home_games_es``."""
    parent_variable = home_games_es['teams']['id']
    child_variable = home_games_es['games']['home_team_id']
    rel = Relationship(parent_variable, child_variable)
    # Neither name carries a variable suffix.
    assert rel.child_name == 'games'
    assert rel.parent_name == 'teams'
def _calculate_agg_features(self, features, entity_frames):
    """Compute aggregation features and return the updated parent frame.

    All features are treated as sharing the entity, child entity, ``where``
    clause and ``use_previous`` window of ``features[0]``.

    Args:
        features (list): Aggregation features to compute. Features whose
            name is already a column of the parent frame are skipped.
        entity_frames (dict): Maps entity id to dataframe. Must contain both
            the parent entity and the child entity of the features.

    Returns:
        The parent entity's dataframe with one column per computed feature
        merged in. If every feature is already present, the frame is
        returned unchanged.
    """
    test_feature = features[0]
    entity = test_feature.entity
    child_entity = test_feature.base_features[0].entity

    assert entity.id in entity_frames and child_entity.id in entity_frames

    frame = entity_frames[entity.id]
    base_frame = entity_frames[child_entity.id]
    # Sometimes approximate features get computed in a previous filter frame
    # and put in the current one dynamically,
    # so there may be existing features here
    features = [f for f in features if f.get_name() not in frame.columns]
    if not len(features):
        return frame

    # handle where clause for all functions below
    where = test_feature.where
    if where is not None:
        # Keep only child rows for which the where feature's column is True.
        base_frame = base_frame[base_frame[where.get_name()]]

    relationship_path = self.entityset.find_backward_path(
        entity.id, child_entity.id)

    # Name of the child-frame column used to group rows per parent instance.
    groupby_var = Relationship._get_link_variable_name(relationship_path)

    # if the use_previous property exists on this feature, include only the
    # instances from the child entity included in that Timedelta
    use_previous = test_feature.use_previous
    if use_previous and not base_frame.empty:
        # Filter by use_previous values
        time_last = self.time_last
        if use_previous.is_absolute():
            time_first = time_last - use_previous
            ti = child_entity.time_index
            if ti is not None:
                base_frame = base_frame[base_frame[ti] >= time_first]
        else:
            n = use_previous.value

            def last_n(df):
                # Keep the final n rows of each group.
                return df.iloc[-n:]

            base_frame = base_frame.groupby(groupby_var, observed=True, sort=False).apply(last_n)

    to_agg = {}
    agg_rename = {}
    to_apply = set()
    # apply multivariable and time-dependent features as we find them, and
    # save aggregable features for later
    for f in features:
        if _can_agg(f):
            variable_id = f.base_features[0].get_name()

            if variable_id not in to_agg:
                to_agg[variable_id] = []

            func = f.get_function()
            # get_function() may return a non-callable; in that case it is
            # used as the name directly.
            funcname = func
            if callable(func):
                funcname = func.__name__

            to_agg[variable_id].append(func)
            # this is used below to rename columns that pandas names for us
            agg_rename[u"{}-{}".format(variable_id, funcname)] = f.get_name()
            continue

        to_apply.add(f)

    # Apply the non-aggregable functions generate a new dataframe, and merge
    # it with the existing one
    if len(to_apply):
        wrap = agg_wrapper(to_apply, self.time_last)
        # groupby_var can be both the name of the index and a column,
        # to silence pandas warning about ambiguity we explicitly pass
        # the column (in actuality grouping by both index and group would
        # work)
        to_merge = base_frame.groupby(base_frame[groupby_var], observed=True, sort=False).apply(wrap)
        to_merge.reset_index(1, drop=True, inplace=True)
        frame = pd.merge(left=frame, right=to_merge,
                         left_index=True,
                         right_index=True, how='left')

    # Apply the aggregate functions to generate a new dataframe, and merge
    # it with the existing one
    if len(to_agg):
        # groupby_var can be both the name of the index and a column,
        # to silence pandas warning about ambiguity we explicitly pass
        # the column (in actuality grouping by both index and group would
        # work)
        to_merge = base_frame.groupby(base_frame[groupby_var],
                                      observed=True, sort=False).agg(to_agg)
        # rename columns to the correct feature names
        to_merge.columns = [agg_rename["-".join(x)] for x in to_merge.columns.ravel()]
        to_merge = to_merge[list(agg_rename.values())]

        # workaround for pandas bug where categories are in the wrong order
        # see: https://github.com/pandas-dev/pandas/issues/22501
        if pdtypes.is_categorical_dtype(frame.index):
            categories = pdtypes.CategoricalDtype(categories=frame.index.categories)
            to_merge.index = to_merge.index.astype(object).astype(categories)

        frame = pd.merge(left=frame, right=to_merge,
                         left_index=True, right_index=True, how='left')

    # Handle default values
    # 1. handle non scalar default values
    iterfeats = [f for f in features
                 if hasattr(f.default_value, '__iter__')]
    for f in iterfeats:
        nulls = pd.isnull(frame[f.get_name()])
        # Assign cell by cell, one null entry at a time.
        for ni in nulls[nulls].index:
            frame.at[ni, f.get_name()] = f.default_value

    # 2. handle scalars default values
    fillna_dict = {f.get_name(): f.default_value for f in features
                   if f not in iterfeats}
    frame.fillna(fillna_dict, inplace=True)

    # convert boolean dtypes to floats as appropriate
    # pandas behavior: https://github.com/pydata/pandas/issues/3752
    for f in features:
        if (not f.expanding and
                f.variable_type == variable_types.Numeric and
                frame[f.get_name()].dtype.name in ['object', 'bool']):
            frame[f.get_name()] = frame[f.get_name()].astype(float)

    return frame
def test_names_when_no_other_relationship_between_dataframes(home_games_es):
    """Check that both names are plain dataframe names for ``home_games_es``."""
    rel = Relationship(
        home_games_es,
        'teams',
        'id',
        'games',
        'home_team_id',
    )
    # Neither name carries a column suffix.
    assert rel.child_name == 'games'
    assert rel.parent_name == 'teams'
def test_names_when_multiple_relationships_between_dataframes(games_es):
    """Check that both names carry the child column for the ``games_es`` fixture."""
    rel = Relationship(
        games_es,
        'teams',
        'id',
        'games',
        'home_team_id',
    )
    # Both the child and the parent name are suffixed with the child column.
    assert rel.child_name == 'games[home_team_id]'
    assert rel.parent_name == 'teams[home_team_id]'
def _calculate_agg_features(self, features, entity_frames):
    """Compute a batch of aggregation features and attach them to the parent.

    The target entity, child entity, ``where`` clause and ``use_previous``
    window are all read from ``features[0]`` only, so every feature in the
    batch is assumed to share them.

    Args:
        features: Non-empty list of aggregation features to compute.
            Features whose name is already a column of the parent frame are
            skipped.
        entity_frames: Mapping from entity id to dataframe. It must contain
            both the parent entity and the child entity of the features.

    Returns:
        The parent dataframe with one column per computed feature. Missing
        values are filled with each feature's ``default_value``.

    Raises:
        ValueError: If the parent index and the child link column have
            different dtypes and cannot both be cast to ``object``.
    """
    test_feature = features[0]
    use_previous = test_feature.use_previous
    base_features = test_feature.base_features
    where = test_feature.where
    entity = test_feature.entity
    child_entity = base_features[0].entity

    assert entity.id in entity_frames and child_entity.id in entity_frames

    index_var = entity.index
    frame = entity_frames[entity.id]
    base_frame = entity_frames[child_entity.id]

    # Sometimes approximate features get computed in a previous filter frame
    # and put in the current one dynamically, so there may be existing
    # features here.
    features = [f for f in features if f.get_name() not in frame.columns]
    if not features:
        return frame

    # Handle the where clause once for all functions below.
    if where is not None:
        base_frame = base_frame[base_frame[where.get_name()]]

    relationship_path = self.entityset.find_backward_path(entity.id,
                                                          child_entity.id)
    groupby_var = Relationship._get_link_variable_name(relationship_path)

    # If the use_previous property exists on this feature, include only the
    # instances from the child entity included in that Timedelta.
    if use_previous and not base_frame.empty:
        time_last = self.time_last
        if use_previous.is_absolute():
            time_first = time_last - use_previous
            ti = child_entity.time_index
            if ti is not None:
                base_frame = base_frame[base_frame[ti] >= time_first]
        else:
            n = use_previous.value

            # NOTE(review): for n == 0, ``iloc[-0:]`` selects every row
            # rather than none -- confirm a zero-observation window is never
            # passed here.
            def last_n(df):
                return df.iloc[-n:]

            base_frame = base_frame.groupby(groupby_var).apply(last_n)

    # ``no_instances`` is only assigned when base_frame is non-empty; the
    # short-circuiting ``or`` below keeps it from being read otherwise.
    if not base_frame.empty:
        if groupby_var not in base_frame:
            # This occurred sometimes. I think it might have to do with
            # category but not sure. TODO: look into when this occurs
            no_instances = True
        # If the foreign key column in the child (base_frame) that links to
        # frame is an integer and the id column in the parent is an object
        # or category dtype, the .isin() call errors, so compare as objects.
        elif (frame[index_var].dtype != base_frame[groupby_var].dtype or
              'category' in frame[index_var].dtype.name):
            try:
                frame_as_obj = frame[index_var].astype(object)
                base_frame_as_obj = base_frame[groupby_var].astype(object)
            except ValueError:
                msg = u"Could not join {}.{} (dtype={}) with {}.{} (dtype={})"
                raise ValueError(msg.format(entity.id, index_var,
                                            frame[index_var].dtype,
                                            child_entity.id, groupby_var,
                                            base_frame[groupby_var].dtype))
            else:
                no_instances = check_no_related_instances(
                    frame_as_obj.values, base_frame_as_obj.values)
        else:
            no_instances = check_no_related_instances(
                frame[index_var].values, base_frame[groupby_var].values)

    # Nothing to aggregate: every feature just gets its default column.
    if base_frame.empty or no_instances:
        for f in features:
            set_default_column(entity_frames[entity.id], f)
        return frame

    def wrap_func_with_name(func, name):
        # Give each aggregation callable a distinct __name__ so the columns
        # produced by ``.agg`` can be told apart and renamed afterwards.
        def inner(x):
            return func(x)
        inner.__name__ = name
        return inner

    to_agg = {}
    agg_rename = {}
    to_apply = set()
    # Apply multivariable and time-dependent features as we find them, and
    # save aggregable features for later.
    for f in features:
        if not _can_agg(f):
            to_apply.add(f)
            continue

        variable_id = f.base_features[0].get_name()
        # make sure function names are unique
        random_id = str(uuid.uuid1())
        func = wrap_func_with_name(f.get_function(), random_id)
        to_agg.setdefault(variable_id, []).append(func)
        agg_rename[u"{}-{}".format(variable_id, random_id)] = f.get_name()

    # Apply the non-aggregable functions to generate a new dataframe, and
    # merge it with the existing one.
    if to_apply:
        wrap = agg_wrapper(to_apply, self.time_last)
        # groupby_var can be both the name of the index and a column,
        # to silence pandas warning about ambiguity we explicitly pass
        # the column (in actuality grouping by both index and group would
        # work)
        to_merge = base_frame.groupby(base_frame[groupby_var]).apply(wrap)
        to_merge.reset_index(1, drop=True, inplace=True)
        frame = pd.merge(left=frame, right=to_merge,
                         left_on=index_var, right_index=True, how='left')

    # Apply the aggregate functions to generate a new dataframe, and merge
    # it with the existing one.
    if to_agg:
        # Same ambiguity workaround as above: group by the column itself.
        to_merge = base_frame.groupby(base_frame[groupby_var]).agg(to_agg)

        # Several functions are applied per column, so the result has
        # two-level columns; flatten them to "<column>-<function name>" so
        # that agg_rename can map them to the feature names.
        to_merge.columns = [u"{}-{}".format(n1, n2)
                            for n1, n2 in to_merge.columns.ravel()]
        to_merge = to_merge.rename(columns=agg_rename)
        variables = list(agg_rename.values())
        to_merge = to_merge[variables]
        frame = pd.merge(left=frame, right=to_merge,
                         left_on=index_var, right_index=True, how='left')

    # Handle default values. Split the features in a single pass on whether
    # the default is iterable; this avoids a quadratic list-membership test
    # and does not rely on how feature objects implement ``==``.
    iterfeats = []
    fillna_dict = {}
    for f in features:
        if hasattr(f.default_value, '__iter__'):
            iterfeats.append(f)
        else:
            fillna_dict[f.get_name()] = f.default_value

    # 1. non-scalar defaults are assigned cell by cell at the null positions
    for f in iterfeats:
        nulls = pd.isnull(frame[f.get_name()])
        for ni in nulls[nulls].index:
            frame.at[ni, f.get_name()] = f.default_value

    # 2. scalar defaults are filled in one call
    frame.fillna(fillna_dict, inplace=True)

    # convert boolean dtypes to floats as appropriate
    # pandas behavior: https://github.com/pydata/pandas/issues/3752
    for f in features:
        if (not f.expanding and
                f.variable_type == variable_types.Numeric and
                frame[f.get_name()].dtype.name in ['object', 'bool']):
            frame[f.get_name()] = frame[f.get_name()].astype(float)

    return frame