Example #1
def test_relationship_path_dataframes(es):
    assert list(RelationshipPath([]).dataframes()) == []

    log_to_sessions = Relationship(es, "sessions", "id", "log", "session_id")
    sessions_to_customers = Relationship(es, "customers", "id", "sessions",
                                         "customer_id")

    forward_path = [(True, log_to_sessions), (True, sessions_to_customers)]
    assert list(RelationshipPath(forward_path).dataframes()) == [
        "log",
        "sessions",
        "customers",
    ]

    backward_path = [(False, sessions_to_customers), (False, log_to_sessions)]
    assert list(RelationshipPath(backward_path).dataframes()) == [
        "customers",
        "sessions",
        "log",
    ]

    mixed_path = [(True, log_to_sessions), (False, log_to_sessions)]
    assert list(RelationshipPath(mixed_path).dataframes()) == [
        "log", "sessions", "log"
    ]
Example #2
def test_relationship_serialization(es):
    relationship = Relationship(es, "sessions", "id", "log", "session_id")

    dictionary = {
        "parent_dataframe_name": "sessions",
        "parent_column_name": "id",
        "child_dataframe_name": "log",
        "child_column_name": "session_id",
    }
    assert relationship.to_dictionary() == dictionary
    assert Relationship.from_dictionary(dictionary, es) == relationship
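The dictionary returned by to_dictionary() contains only strings, so it can be persisted (for example as JSON) and rebuilt later. A minimal sketch, not part of the original tests, assuming the same es fixture and imports used above:

import json

def roundtrip_relationship_through_json(es):
    relationship = Relationship(es, "sessions", "id", "log", "session_id")
    # to_dictionary() returns a dict of plain strings, so json.dumps handles it directly
    payload = json.dumps(relationship.to_dictionary())
    restored = Relationship.from_dictionary(json.loads(payload), es)
    assert restored == relationship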
Example #3
def test_relationship_serialization(es):
    relationship = Relationship(es['sessions']['id'], es['log']['session_id'])

    dictionary = {
        'parent_entity_id': 'sessions',
        'parent_variable_id': 'id',
        'child_entity_id': 'log',
        'child_variable_id': 'session_id',
    }
    assert relationship.to_dictionary() == dictionary
    assert Relationship.from_dictionary(dictionary, es) == relationship
Example #4
def test_relationship_serialization(es):
    relationship = Relationship(es, 'sessions', 'id', 'log', 'session_id')

    dictionary = {
        'parent_dataframe_name': 'sessions',
        'parent_column_name': 'id',
        'child_dataframe_name': 'log',
        'child_column_name': 'session_id',
    }
    assert relationship.to_dictionary() == dictionary
    assert Relationship.from_dictionary(dictionary, es) == relationship
Example #5
def test_relationship_path(es):
    log_to_sessions = Relationship(es, 'sessions', 'id', 'log', 'session_id')
    sessions_to_customers = Relationship(es, 'customers', 'id', 'sessions',
                                         'customer_id')
    path_list = [(True, log_to_sessions), (True, sessions_to_customers),
                 (False, sessions_to_customers)]
    path = RelationshipPath(path_list)

    for i, edge in enumerate(path_list):
        assert path[i] == edge

    assert [edge for edge in path] == path_list
Example #6
def test_relationship_path_name(es):
    assert RelationshipPath([]).name == ''

    log_to_sessions = Relationship(es, 'sessions', 'id', 'log', 'session_id')
    sessions_to_customers = Relationship(es, 'customers', 'id', 'sessions',
                                         'customer_id')

    forward_path = [(True, log_to_sessions), (True, sessions_to_customers)]
    assert RelationshipPath(forward_path).name == 'sessions.customers'

    backward_path = [(False, sessions_to_customers), (False, log_to_sessions)]
    assert RelationshipPath(backward_path).name == 'sessions.log'

    mixed_path = [(True, log_to_sessions), (False, log_to_sessions)]
    assert RelationshipPath(mixed_path).name == 'sessions.log'
Example #7
def test_relationship_path_name(es):
    assert RelationshipPath([]).name == ""

    log_to_sessions = Relationship(es, "sessions", "id", "log", "session_id")
    sessions_to_customers = Relationship(es, "customers", "id", "sessions",
                                         "customer_id")

    forward_path = [(True, log_to_sessions), (True, sessions_to_customers)]
    assert RelationshipPath(forward_path).name == "sessions.customers"

    backward_path = [(False, sessions_to_customers), (False, log_to_sessions)]
    assert RelationshipPath(backward_path).name == "sessions.log"

    mixed_path = [(True, log_to_sessions), (False, log_to_sessions)]
    assert RelationshipPath(mixed_path).name == "sessions.log"
Example #8
def test_relationship_path(es):
    log_to_sessions = Relationship(es, "sessions", "id", "log", "session_id")
    sessions_to_customers = Relationship(es, "customers", "id", "sessions",
                                         "customer_id")
    path_list = [
        (True, log_to_sessions),
        (True, sessions_to_customers),
        (False, sessions_to_customers),
    ]
    path = RelationshipPath(path_list)

    for i, edge in enumerate(path_list):
        assert path[i] == edge

    assert [edge for edge in path] == path_list
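In each path entry the boolean marks the traversal direction: True walks a relationship from child to parent (log to sessions), False walks it from parent to child, which is what the name and dataframes tests in this collection exercise. A small single-edge sketch, not from the original suite, assuming the same es fixture:

def sketch_single_edge_paths(es):
    log_to_sessions = Relationship(es, "sessions", "id", "log", "session_id")
    # True: start at the child ("log") and step to the parent ("sessions")
    assert list(RelationshipPath([(True, log_to_sessions)]).dataframes()) == ["log", "sessions"]
    # False: start at the parent ("sessions") and step to the child ("log")
    assert list(RelationshipPath([(False, log_to_sessions)]).dataframes()) == ["sessions", "log"]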
Example #9
def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    for df in description['dataframes'].values():
        if path is not None:
            data_path = os.path.join(path, 'data', df['name'])
            dataframe = read_woodwork_table(data_path,
                                            validate=False,
                                            **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description['relationships']:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
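As the docstring notes, the description is usually produced by serialize.entityset_to_description, so the two functions form a round trip. A hedged sketch; the exact import path of the serializer may vary between featuretools versions:

from featuretools.entityset.serialize import entityset_to_description  # assumed import path

def sketch_description_roundtrip(es):
    description = entityset_to_description(es)       # built in memory, so description.get('path') is None
    rebuilt = description_to_entityset(description)  # dataframes are rebuilt empty when there is no path
    assert rebuilt.id == es.id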
Example #10
    def from_dictionary(cls, arguments, entityset, dependencies, primitive):
        base_features = [dependencies[name] for name in arguments["base_features"]]
        relationship_path = [
            Relationship.from_dictionary(r, entityset)
            for r in arguments["relationship_path"]
        ]
        parent_dataframe_name = relationship_path[0].parent_dataframe.ww.name
        relationship_path = RelationshipPath([(False, r) for r in relationship_path])

        use_previous_data = arguments["use_previous"]
        use_previous = use_previous_data and Timedelta.from_dictionary(
            use_previous_data
        )

        where_name = arguments["where"]
        where = where_name and dependencies[where_name]

        feat = cls(
            base_features=base_features,
            parent_dataframe_name=parent_dataframe_name,
            primitive=primitive,
            relationship_path=relationship_path,
            use_previous=use_previous,
            where=where,
            name=arguments["name"],
        )
        feat._names = arguments.get("feature_names")
        return feat
Example #11
def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet
    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    last_time_index = []
    for entity in description['entities'].values():
        entity['loading_info']['params'].update(kwargs)
        # If path is None, an empty dataframe will be created for entity.
        description_to_entity(entity, entityset, path=path)
        if entity['properties']['last_time_index']:
            last_time_index.append(entity['id'])

    for relationship in description['relationships']:
        relationship = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship)

    if len(last_time_index):
        entityset.add_last_time_indexes(updated_entities=last_time_index)

    return entityset
Example #12
    def __init__(self, id=None, entities=None, relationships=None):
        """Creates EntitySet

            Args:
                id (str) : Unique identifier to associate with this instance

                entities (dict[str -> tuple(pd.DataFrame, str, str, dict[str -> Variable])]): dictionary of
                    entities. Entries take the format
                    {entity id -> (dataframe, id column, (time_index), (variable_types), (make_index))}.
                    Note that time_index, variable_types and make_index are optional.

                relationships (list[(str, str, str, str)]): List of relationships
                    between entities. List items are a tuple with the format
                    (parent entity id, parent variable, child entity id, child variable).

            Example:

                .. code-block:: python

                    entities = {
                        "cards" : (card_df, "id"),
                        "transactions" : (transactions_df, "id", "transaction_time")
                    }

                    relationships = [("cards", "id", "transactions", "card_id")]

                    ft.EntitySet("my-entity-set", entities, relationships)
        """
        self.id = id
        self.entity_dict = {}
        self.relationships = []
        self.time_type = None

        entities = entities or {}
        relationships = relationships or []
        for entity in entities:
            df = entities[entity][0]
            index_column = entities[entity][1]
            time_index = None
            variable_types = None
            make_index = None
            if len(entities[entity]) > 2:
                time_index = entities[entity][2]
            if len(entities[entity]) > 3:
                variable_types = entities[entity][3]
            if len(entities[entity]) > 4:
                make_index = entities[entity][4]
            self.entity_from_dataframe(entity_id=entity,
                                       dataframe=df,
                                       index=index_column,
                                       time_index=time_index,
                                       variable_types=variable_types,
                                       make_index=make_index)

        for relationship in relationships:
            parent_variable = self[relationship[0]][relationship[1]]
            child_variable = self[relationship[2]][relationship[3]]
            self.add_relationship(Relationship(parent_variable,
                                               child_variable))
        self.reset_data_description()
Example #13
    def from_dictionary(cls, arguments, entityset, dependencies,
                        primitives_deserializer):
        base_features = [
            dependencies[name] for name in arguments['base_features']
        ]
        relationship_path = [
            Relationship.from_dictionary(r, entityset)
            for r in arguments['relationship_path']
        ]
        parent_dataframe_name = relationship_path[0].parent_dataframe.ww.name
        relationship_path = RelationshipPath([(False, r)
                                              for r in relationship_path])

        primitive = primitives_deserializer.deserialize_primitive(
            arguments['primitive'])

        use_previous_data = arguments['use_previous']
        use_previous = use_previous_data and Timedelta.from_dictionary(
            use_previous_data)

        where_name = arguments['where']
        where = where_name and dependencies[where_name]

        return cls(base_features=base_features,
                   parent_dataframe_name=parent_dataframe_name,
                   primitive=primitive,
                   relationship_path=relationship_path,
                   use_previous=use_previous,
                   where=where,
                   name=arguments['name'])
Example #14
    def from_dictionary(cls, arguments, entityset, dependencies,
                        primitives_deserializer):
        base_feature = dependencies[arguments['base_feature']]
        relationship = Relationship.from_dictionary(arguments['relationship'],
                                                    entityset)
        child_dataframe_name = relationship.child_dataframe.ww.name
        return cls(base_feature=base_feature,
                   child_dataframe_name=child_dataframe_name,
                   relationship=relationship,
                   name=arguments['name'])
Example #15
def test_relationship_path_dataframes(es):
    assert list(RelationshipPath([]).dataframes()) == []

    log_to_sessions = Relationship(es, 'sessions', 'id', 'log', 'session_id')
    sessions_to_customers = Relationship(es, 'customers', 'id', 'sessions',
                                         'customer_id')

    forward_path = [(True, log_to_sessions), (True, sessions_to_customers)]
    assert list(RelationshipPath(forward_path).dataframes()) == [
        'log', 'sessions', 'customers'
    ]

    backward_path = [(False, sessions_to_customers), (False, log_to_sessions)]
    assert list(RelationshipPath(backward_path).dataframes()) == [
        'customers', 'sessions', 'log'
    ]

    mixed_path = [(True, log_to_sessions), (False, log_to_sessions)]
    assert list(RelationshipPath(mixed_path).dataframes()) == [
        'log', 'sessions', 'log'
    ]
Example #16
    def from_dictionary(cls, arguments, entityset, dependencies, primitive):
        base_feature = dependencies[arguments["base_feature"]]
        relationship = Relationship.from_dictionary(
            arguments["relationship"], entityset
        )
        child_dataframe_name = relationship.child_dataframe.ww.name
        return cls(
            base_feature=base_feature,
            child_dataframe_name=child_dataframe_name,
            relationship=relationship,
            name=arguments["name"],
        )
Example #17
def test_relationship_path_entities(es):
    assert list(RelationshipPath([]).entities()) == []

    log_to_sessions = Relationship(es['sessions']['id'],
                                   es['log']['session_id'])
    sessions_to_customers = Relationship(es['customers']['id'],
                                         es['sessions']['customer_id'])

    forward_path = [(True, log_to_sessions), (True, sessions_to_customers)]
    assert list(RelationshipPath(forward_path).entities()) == [
        'log', 'sessions', 'customers'
    ]

    backward_path = [(False, sessions_to_customers), (False, log_to_sessions)]
    assert list(RelationshipPath(backward_path).entities()) == [
        'customers', 'sessions', 'log'
    ]

    mixed_path = [(True, log_to_sessions), (False, log_to_sessions)]
    assert list(
        RelationshipPath(mixed_path).entities()) == ['log', 'sessions', 'log']
Example #18
def description_to_relationship(description, entityset):
    '''Deserialize parent and child variables from relationship description.

    Args:
        description (dict) : Description of :class:`.Relationship`.
        entityset (EntitySet) : Instance of :class:`.EntitySet` containing parent and child variables.

    Returns:
        item (tuple(Variable, Variable)) : Tuple containing parent and child variables.
    '''
    entity, variable = description['parent']
    parent = entityset[entity][variable]
    entity, variable = description['child']
    child = entityset[entity][variable]
    return Relationship(parent, child)
Example #19
def description_to_entityset(description, **kwargs):
    """Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    """
    check_schema_version(description, "entityset")

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get("path")
    entityset = EntitySet(description["id"])

    for df in description["dataframes"].values():
        if path is not None:
            data_path = os.path.join(path, "data", df["name"])
            format = description.get("format")
            if format is not None:
                kwargs["format"] = format
                if format == "parquet" and df["loading_info"][
                        "table_type"] == "pandas":
                    kwargs["filename"] = df["name"] + ".parquet"
            dataframe = read_woodwork_table(data_path,
                                            validate=False,
                                            **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description["relationships"]:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
Example #20
    def _calculate_agg_features(self, features, entity_frames):
        test_feature = features[0]
        use_previous = test_feature.use_previous
        base_features = test_feature.base_features
        where = test_feature.where
        entity = test_feature.entity
        child_entity = base_features[0].entity

        assert entity.id in entity_frames and child_entity.id in entity_frames

        index_var = entity.index
        frame = entity_frames[entity.id]
        base_frame = entity_frames[child_entity.id]
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        features = [f for f in features if f.get_name() not in frame.columns]
        if not len(features):
            return

        # handle where clause for all functions below
        if where is not None:
            base_frame = base_frame[base_frame[where.get_name()]]

        relationship_path = self.entityset.find_backward_path(
            entity.id, child_entity.id)

        groupby_var = Relationship._get_link_variable_name(relationship_path)

        # if the use_previous property exists on this feature, include only the
        # instances from the child entity included in that Timedelta
        if use_previous and not base_frame.empty:
            # Filter by use_previous values
            time_last = self.time_last
            if use_previous.is_absolute():
                time_first = time_last - use_previous
                ti = child_entity.time_index
                if ti is not None:
                    base_frame = base_frame[base_frame[ti] >= time_first]
            else:
                n = use_previous.value

                def last_n(df):
                    return df.iloc[-n:]

                base_frame = base_frame.groupby(groupby_var).apply(last_n)

        if not base_frame.empty:
            if groupby_var not in base_frame:
                # This occurred sometimes. I think it might have to do with category
                # but not sure. TODO: look into when this occurs
                no_instances = True
            # if the foreign key column in the child (base_frame) that links to
            # frame is an integer and the id column in the parent is an object or
            # category dtype, the .isin() call errors.
            elif (frame[index_var].dtype != base_frame[groupby_var].dtype
                  or frame[index_var].dtype.name.find('category') > -1):
                try:
                    frame_as_obj = frame[index_var].astype(object)
                    base_frame_as_obj = base_frame[groupby_var].astype(object)
                except ValueError:
                    msg = u"Could not join {}.{} (dtype={}) with {}.{} (dtype={})"
                    raise ValueError(
                        msg.format(entity.id, index_var,
                                   frame[index_var].dtype, child_entity.id,
                                   groupby_var, base_frame[groupby_var].dtype))
                else:
                    no_instances = check_no_related_instances(
                        frame_as_obj.values, base_frame_as_obj.values)
            else:
                no_instances = check_no_related_instances(
                    frame[index_var].values, base_frame[groupby_var].values)

        if base_frame.empty or no_instances:
            for f in features:
                set_default_column(entity_frames[entity.id], f)

            return

        def wrap_func_with_name(func, name):
            def inner(x):
                return func(x)

            inner.__name__ = name
            return inner

        to_agg = {}
        agg_rename = {}
        to_apply = set()
        # apply multivariable and time-dependent features as we find them, and
        # save aggregable features for later
        for f in features:
            if _can_agg(f):
                variable_id = f.base_features[0].get_name()
                if variable_id not in to_agg:
                    to_agg[variable_id] = []
                func = f.get_function()
                # make sure function names are unique
                random_id = str(uuid.uuid1())
                func = wrap_func_with_name(func, random_id)
                funcname = random_id
                to_agg[variable_id].append(func)
                agg_rename[u"{}-{}".format(variable_id,
                                           funcname)] = f.get_name()

                continue

            to_apply.add(f)

        # Apply the non-aggregable functions to generate a new dataframe, and merge
        # it with the existing one
        if len(to_apply):
            wrap = agg_wrapper(to_apply, self.time_last)
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var]).apply(wrap)

            to_merge.reset_index(1, drop=True, inplace=True)
            frame = pd.merge(left=frame,
                             right=to_merge,
                             left_on=index_var,
                             right_index=True,
                             how='left')

        # Apply the aggregate functions to generate a new dataframe, and merge
        # it with the existing one
        # Do the [variables] accessor on to_merge because the agg call returns
        # a dataframe whose columns contain the aggregated values we want
        if len(to_agg):
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)

            to_merge = base_frame.groupby(base_frame[groupby_var]).agg(to_agg)
            # we apply multiple functions to each column, creating
            # a multiindex as the column
            # rename the columns to a concatenation of the two indexes
            to_merge.columns = [
                u"{}-{}".format(n1, n2) for n1, n2 in to_merge.columns.ravel()
            ]
            # to enable a rename
            to_merge = to_merge.rename(columns=agg_rename)
            variables = list(agg_rename.values())
            to_merge = to_merge[variables]
            frame = pd.merge(left=frame,
                             right=to_merge,
                             left_on=index_var,
                             right_index=True,
                             how='left')

        # Handle default values
        # 1. handle non scalar default values
        iterfeats = [
            f for f in features if hasattr(f.default_value, '__iter__')
        ]
        for f in iterfeats:
            nulls = pd.isnull(frame[f.get_name()])
            for ni in nulls[nulls].index:
                frame.at[ni, f.get_name()] = f.default_value

        # 2. handle scalar default values
        fillna_dict = {
            f.get_name(): f.default_value
            for f in features if f not in iterfeats
        }
        frame.fillna(fillna_dict, inplace=True)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (not f.expanding and f.variable_type == variable_types.Numeric
                    and frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        entity_frames[entity.id] = frame
Example #21
def test_names_when_multiple_relationships_between_dataframes(games_es):
    relationship = Relationship(games_es, "teams", "id", "games",
                                "home_team_id")
    assert relationship.child_name == "games[home_team_id]"
    assert relationship.parent_name == "teams[home_team_id]"
Example #22
    def normalize_entity(self,
                         base_entity_id,
                         new_entity_id,
                         index,
                         additional_variables=None,
                         copy_variables=None,
                         make_time_index=None,
                         make_secondary_time_index=None,
                         new_entity_time_index=None,
                         new_entity_secondary_time_index=None):
        """Create a new entity and relationship from unique values of an existing variable.

        Args:
            base_entity_id (str) : Entity id from which to split.

            new_entity_id (str): Id of the new entity.

            index (str): Variable in the old entity
                that will become the index of the new entity. The relationship
                will be created across this variable.

            additional_variables (list[str]):
                List of variable ids to remove from
                base_entity and move to new entity.

            copy_variables (list[str]): List of
                variable ids to copy from the old entity
                to the new entity (these variables are kept in both entities).

            make_time_index (bool or str, optional): Create time index for new entity based
                on time index in base_entity, optionally specifying which variable in base_entity
                to use for time_index. If specified as True without a specific variable,
                uses the primary time index. Defaults to True if base entity has a time index.

            make_secondary_time_index (dict[str -> list[str]], optional): Create a secondary time index
                from the dictionary key. The dictionary values are the variables to associate
                with the secondary time index. Only one secondary time index is allowed.
                If None, only associate the time index.

            new_entity_time_index (str, optional): Rename new entity time index.

            new_entity_secondary_time_index (str, optional): Rename new entity secondary time index.

        """
        base_entity = self.entity_dict[base_entity_id]
        additional_variables = additional_variables or []
        copy_variables = copy_variables or []

        if not isinstance(additional_variables, list):
            raise TypeError(
                "'additional_variables' must be a list, but received type {}".
                format(type(additional_variables)))

        if len(additional_variables) != len(set(additional_variables)):
            raise ValueError(
                "'additional_variables' contains duplicate variables. All variables must be unique."
            )

        if not isinstance(copy_variables, list):
            raise TypeError(
                "'copy_variables' must be a list, but received type {}".format(
                    type(copy_variables)))

        if len(copy_variables) != len(set(copy_variables)):
            raise ValueError(
                "'copy_variables' contains duplicate variables. All variables must be unique."
            )

        for v in additional_variables + copy_variables:
            if v == index:
                raise ValueError(
                    "Not copying {} as both index and variable".format(v))
                break
        if is_string(make_time_index):
            if make_time_index not in base_entity.df.columns:
                raise ValueError(
                    "'make_time_index' must be a variable in the base entity")
            elif make_time_index not in additional_variables + copy_variables:
                raise ValueError(
                    "'make_time_index' must specified in 'additional_variables' or 'copy_variables'"
                )
        if index == base_entity.index:
            raise ValueError(
                "'index' must be different from the index column of the base entity"
            )

        transfer_types = {}
        transfer_types[index] = type(base_entity[index])
        for v in additional_variables + copy_variables:
            transfer_types[v] = type(base_entity[v])

        # create and add new entity
        new_entity_df = self[base_entity_id].df.copy()

        if make_time_index is None and base_entity.time_index is not None:
            make_time_index = True

        if isinstance(make_time_index, str):
            # Set the new time index to make_time_index.
            base_time_index = make_time_index
            new_entity_time_index = make_time_index
            already_sorted = (new_entity_time_index == base_entity.time_index)
        elif make_time_index:
            # Create a new time index based on the base entity time index.
            base_time_index = base_entity.time_index
            if new_entity_time_index is None:
                new_entity_time_index = "first_%s_time" % (base_entity.id)

            already_sorted = True

            assert base_entity.time_index is not None, \
                "Base entity doesn't have time_index defined"

            if base_time_index not in [v for v in additional_variables]:
                copy_variables.append(base_time_index)

            transfer_types[new_entity_time_index] = type(
                base_entity[base_entity.time_index])
        else:
            new_entity_time_index = None
            already_sorted = False

        if new_entity_time_index is not None and new_entity_time_index == index:
            raise ValueError(
                "time_index and index cannot be the same value, %s" %
                (new_entity_time_index))

        selected_variables = [index] +\
            [v for v in additional_variables] +\
            [v for v in copy_variables]

        new_entity_df2 = new_entity_df. \
            drop_duplicates(index, keep='first')[selected_variables]

        if make_time_index:
            new_entity_df2.rename(
                columns={base_time_index: new_entity_time_index}, inplace=True)
        if make_secondary_time_index:
            assert len(make_secondary_time_index
                       ) == 1, "Can only provide 1 secondary time index"
            secondary_time_index = list(make_secondary_time_index.keys())[0]

            secondary_variables = [index, secondary_time_index] + list(
                make_secondary_time_index.values())[0]
            secondary_df = new_entity_df. \
                drop_duplicates(index, keep='last')[secondary_variables]
            if new_entity_secondary_time_index:
                secondary_df.rename(columns={
                    secondary_time_index:
                    new_entity_secondary_time_index
                },
                                    inplace=True)
                secondary_time_index = new_entity_secondary_time_index
            else:
                new_entity_secondary_time_index = secondary_time_index
            secondary_df.set_index(index, inplace=True)
            new_entity_df = new_entity_df2.join(secondary_df, on=index)
        else:
            new_entity_df = new_entity_df2

        base_entity_index = index

        transfer_types[index] = vtypes.Categorical
        if make_secondary_time_index:
            old_ti_name = list(make_secondary_time_index.keys())[0]
            ti_cols = list(make_secondary_time_index.values())[0]
            ti_cols = [
                c if c != old_ti_name else secondary_time_index
                for c in ti_cols
            ]
            make_secondary_time_index = {secondary_time_index: ti_cols}

        self.entity_from_dataframe(
            new_entity_id,
            new_entity_df,
            index,
            already_sorted=already_sorted,
            time_index=new_entity_time_index,
            secondary_time_index=make_secondary_time_index,
            variable_types=transfer_types)

        self.entity_dict[base_entity_id].delete_variables(additional_variables)

        new_entity = self.entity_dict[new_entity_id]
        base_entity.convert_variable_type(base_entity_index,
                                          vtypes.Id,
                                          convert_data=False)
        self.add_relationship(
            Relationship(new_entity[index], base_entity[base_entity_index]))
        self.reset_data_description()
        return self
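A short usage sketch for normalize_entity based on the docstring above; the entity and column names ("transactions", "customer_id", "customer_name") are illustrative, not taken from the original code:

# Split a hypothetical "transactions" entity on its "customer_id" column,
# creating a new "customers" entity plus the relationship between the two.
es.normalize_entity(base_entity_id="transactions",
                    new_entity_id="customers",
                    index="customer_id",
                    additional_variables=["customer_name"],
                    make_time_index=False)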
Example #23
def test_names_when_no_other_relationship_between_dataframes(home_games_es):
    relationship = Relationship(home_games_es, "teams", "id", "games",
                                "home_team_id")
    assert relationship.child_name == "games"
    assert relationship.parent_name == "teams"
Example #24
def test_names_when_multiple_relationships_between_entities(games_es):
    relationship = Relationship(games_es['teams']['id'],
                                games_es['games']['home_team_id'])
    assert relationship.child_name == 'games[home_team_id]'
    assert relationship.parent_name == 'teams[home_team_id]'
Example #25
def test_names_when_no_other_relationship_between_entities(home_games_es):
    relationship = Relationship(home_games_es['teams']['id'],
                                home_games_es['games']['home_team_id'])
    assert relationship.child_name == 'games'
    assert relationship.parent_name == 'teams'
Example #26
    def _calculate_agg_features(self, features, entity_frames):
        test_feature = features[0]
        entity = test_feature.entity
        child_entity = test_feature.base_features[0].entity

        assert entity.id in entity_frames and child_entity.id in entity_frames

        frame = entity_frames[entity.id]
        base_frame = entity_frames[child_entity.id]
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        features = [f for f in features if f.get_name() not in frame.columns]
        if not len(features):
            return frame

        # handle where clause for all functions below
        where = test_feature.where
        if where is not None:
            base_frame = base_frame[base_frame[where.get_name()]]

        relationship_path = self.entityset.find_backward_path(
            entity.id, child_entity.id)

        groupby_var = Relationship._get_link_variable_name(relationship_path)

        # if the use_previous property exists on this feature, include only the
        # instances from the child entity included in that Timedelta
        use_previous = test_feature.use_previous
        if use_previous and not base_frame.empty:
            # Filter by use_previous values
            time_last = self.time_last
            if use_previous.is_absolute():
                time_first = time_last - use_previous
                ti = child_entity.time_index
                if ti is not None:
                    base_frame = base_frame[base_frame[ti] >= time_first]
            else:
                n = use_previous.value

                def last_n(df):
                    return df.iloc[-n:]

                base_frame = base_frame.groupby(groupby_var,
                                                observed=True,
                                                sort=False).apply(last_n)

        to_agg = {}
        agg_rename = {}
        to_apply = set()
        # apply multivariable and time-dependent features as we find them, and
        # save aggregable features for later
        for f in features:
            if _can_agg(f):
                variable_id = f.base_features[0].get_name()

                if variable_id not in to_agg:
                    to_agg[variable_id] = []

                func = f.get_function()
                funcname = func
                if callable(func):
                    funcname = func.__name__

                to_agg[variable_id].append(func)
                # this is used below to rename columns that pandas names for us
                agg_rename[u"{}-{}".format(variable_id,
                                           funcname)] = f.get_name()
                continue

            to_apply.add(f)

        # Apply the non-aggregable functions to generate a new dataframe, and merge
        # it with the existing one
        if len(to_apply):
            wrap = agg_wrapper(to_apply, self.time_last)
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True,
                                          sort=False).apply(wrap)

            to_merge.reset_index(1, drop=True, inplace=True)
            frame = pd.merge(left=frame,
                             right=to_merge,
                             left_index=True,
                             right_index=True,
                             how='left')

        # Apply the aggregate functions to generate a new dataframe, and merge
        # it with the existing one
        if len(to_agg):
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True,
                                          sort=False).agg(to_agg)
            # rename columns to the correct feature names
            to_merge.columns = [
                agg_rename["-".join(x)] for x in to_merge.columns.ravel()
            ]
            to_merge = to_merge[list(agg_rename.values())]

            # workaround for pandas bug where categories are in the wrong order
            # see: https://github.com/pandas-dev/pandas/issues/22501
            if pdtypes.is_categorical_dtype(frame.index):
                categories = pdtypes.CategoricalDtype(
                    categories=frame.index.categories)
                to_merge.index = to_merge.index.astype(object).astype(
                    categories)

            frame = pd.merge(left=frame,
                             right=to_merge,
                             left_index=True,
                             right_index=True,
                             how='left')

        # Handle default values
        # 1. handle non scalar default values
        iterfeats = [
            f for f in features if hasattr(f.default_value, '__iter__')
        ]
        for f in iterfeats:
            nulls = pd.isnull(frame[f.get_name()])
            for ni in nulls[nulls].index:
                frame.at[ni, f.get_name()] = f.default_value

        # 2. handle scalar default values
        fillna_dict = {
            f.get_name(): f.default_value
            for f in features if f not in iterfeats
        }
        frame.fillna(fillna_dict, inplace=True)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (not f.expanding and f.variable_type == variable_types.Numeric
                    and frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        return frame
Example #27
def test_names_when_no_other_relationship_between_dataframes(home_games_es):
    relationship = Relationship(home_games_es, 'teams', 'id', 'games',
                                'home_team_id')
    assert relationship.child_name == 'games'
    assert relationship.parent_name == 'teams'
Example #28
def test_names_when_multiple_relationships_between_dataframes(games_es):
    relationship = Relationship(games_es, 'teams', 'id', 'games',
                                'home_team_id')
    assert relationship.child_name == 'games[home_team_id]'
    assert relationship.parent_name == 'teams[home_team_id]'
Example #29
    def _calculate_agg_features(self, features, entity_frames):
        test_feature = features[0]
        use_previous = test_feature.use_previous
        base_features = test_feature.base_features
        where = test_feature.where
        entity = test_feature.entity
        child_entity = base_features[0].entity

        assert entity.id in entity_frames and child_entity.id in entity_frames

        index_var = entity.index
        frame = entity_frames[entity.id]
        base_frame = entity_frames[child_entity.id]
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        features = [f for f in features if f.get_name()
                    not in frame.columns]
        if not len(features):
            return frame

        # handle where clause for all functions below
        if where is not None:
            base_frame = base_frame[base_frame[where.get_name()]]

        relationship_path = self.entityset.find_backward_path(entity.id,
                                                              child_entity.id)

        groupby_var = Relationship._get_link_variable_name(relationship_path)

        # if the use_previous property exists on this feature, include only the
        # instances from the child entity included in that Timedelta
        if use_previous and not base_frame.empty:
            # Filter by use_previous values
            time_last = self.time_last
            if use_previous.is_absolute():
                time_first = time_last - use_previous
                ti = child_entity.time_index
                if ti is not None:
                    base_frame = base_frame[base_frame[ti] >= time_first]
            else:
                n = use_previous.value

                def last_n(df):
                    return df.iloc[-n:]

                base_frame = base_frame.groupby(groupby_var).apply(last_n)

        if not base_frame.empty:
            if groupby_var not in base_frame:
                # This occurred sometimes. I think it might have to do with category
                # but not sure. TODO: look into when this occurs
                no_instances = True
            # if the foreign key column in the child (base_frame) that links to
            # frame is an integer and the id column in the parent is an object or
            # category dtype, the .isin() call errors.
            elif (frame[index_var].dtype != base_frame[groupby_var].dtype or
                    frame[index_var].dtype.name.find('category') > -1):
                try:
                    frame_as_obj = frame[index_var].astype(object)
                    base_frame_as_obj = base_frame[groupby_var].astype(object)
                except ValueError:
                    msg = u"Could not join {}.{} (dtype={}) with {}.{} (dtype={})"
                    raise ValueError(msg.format(entity.id, index_var,
                                                frame[index_var].dtype,
                                                child_entity.id, groupby_var,
                                                base_frame[groupby_var].dtype))
                else:
                    no_instances = check_no_related_instances(
                        frame_as_obj.values, base_frame_as_obj.values)
            else:
                no_instances = check_no_related_instances(
                    frame[index_var].values, base_frame[groupby_var].values)

        if base_frame.empty or no_instances:
            for f in features:
                set_default_column(entity_frames[entity.id], f)

            return frame

        def wrap_func_with_name(func, name):
            def inner(x):
                return func(x)
            inner.__name__ = name
            return inner

        to_agg = {}
        agg_rename = {}
        to_apply = set()
        # apply multivariable and time-dependent features as we find them, and
        # save aggregable features for later
        for f in features:
            if _can_agg(f):
                variable_id = f.base_features[0].get_name()
                if variable_id not in to_agg:
                    to_agg[variable_id] = []
                func = f.get_function()
                # make sure function names are unique
                random_id = str(uuid.uuid1())
                func = wrap_func_with_name(func, random_id)
                funcname = random_id
                to_agg[variable_id].append(func)
                agg_rename[u"{}-{}".format(variable_id, funcname)] = \
                    f.get_name()

                continue

            to_apply.add(f)

        # Apply the non-aggregable functions to generate a new dataframe, and merge
        # it with the existing one
        if len(to_apply):
            wrap = agg_wrapper(to_apply, self.time_last)
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var]).apply(wrap)

            to_merge.reset_index(1, drop=True, inplace=True)
            frame = pd.merge(left=frame, right=to_merge,
                             left_on=index_var, right_index=True, how='left')

        # Apply the aggregate functions to generate a new dataframe, and merge
        # it with the existing one
        # Do the [variables] accessor on to_merge because the agg call returns
        # a dataframe whose columns contain the aggregated values we want
        if len(to_agg):
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)

            to_merge = base_frame.groupby(base_frame[groupby_var]).agg(to_agg)
            # we apply multiple functions to each column, creating
            # a multiindex as the column
            # rename the columns to a concatenation of the two indexes
            to_merge.columns = [u"{}-{}".format(n1, n2)
                                for n1, n2 in to_merge.columns.ravel()]
            # to enable a rename
            to_merge = to_merge.rename(columns=agg_rename)
            variables = list(agg_rename.values())
            to_merge = to_merge[variables]
            frame = pd.merge(left=frame, right=to_merge,
                             left_on=index_var, right_index=True, how='left')

        # Handle default values
        # 1. handle non scalar default values
        iterfeats = [f for f in features
                     if hasattr(f.default_value, '__iter__')]
        for f in iterfeats:
            nulls = pd.isnull(frame[f.get_name()])
            for ni in nulls[nulls].index:
                frame.at[ni, f.get_name()] = f.default_value

        # 2. handle scalar default values
        fillna_dict = {f.get_name(): f.default_value for f in features
                       if f not in iterfeats}
        frame.fillna(fillna_dict, inplace=True)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (not f.expanding and
                    f.variable_type == variable_types.Numeric and
                    frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        return frame