Example #1
    def _calculate_agg_features(self, features, entity_frames):
        test_feature = features[0]
        entity = test_feature.entity
        child_entity = test_feature.base_features[0].entity

        assert entity.id in entity_frames and child_entity.id in entity_frames

        frame = entity_frames[entity.id]
        base_frame = entity_frames[child_entity.id]
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        features = [f for f in features if f.get_name() not in frame.columns]
        if not len(features):
            return frame

        # handle where clause for all functions below
        where = test_feature.where
        if where is not None:
            base_frame = base_frame[base_frame[where.get_name()]]

        relationship_path = self.entityset.find_backward_path(
            entity.id, child_entity.id)

        groupby_var = Relationship._get_link_variable_name(relationship_path)

        # if the use_previous property exists on this feature, include only the
        # instances from the child entity included in that Timedelta
        use_previous = test_feature.use_previous
        if use_previous and not base_frame.empty:
            # Filter by use_previous values
            time_last = self.time_last
            if use_previous.is_absolute():
                time_first = time_last - use_previous
                ti = child_entity.time_index
                if ti is not None:
                    base_frame = base_frame[base_frame[ti] >= time_first]
            else:
                n = use_previous.value

                def last_n(df):
                    return df.iloc[-n:]

                base_frame = base_frame.groupby(groupby_var,
                                                observed=True,
                                                sort=False).apply(last_n)

        to_agg = {}
        agg_rename = {}
        to_apply = set()
        # apply multivariable and time-dependent features as we find them, and
        # save aggregable features for later
        for f in features:
            if _can_agg(f):
                variable_id = f.base_features[0].get_name()

                if variable_id not in to_agg:
                    to_agg[variable_id] = []

                func = f.get_function()
                funcname = func
                if callable(func):
                    funcname = func.__name__

                to_agg[variable_id].append(func)
                # this is used below to rename columns that pandas names for us
                agg_rename[u"{}-{}".format(variable_id,
                                           funcname)] = f.get_name()
                continue

            to_apply.add(f)

        # Apply the non-aggregable functions to generate a new dataframe,
        # and merge it with the existing one
        if len(to_apply):
            wrap = agg_wrapper(to_apply, self.time_last)
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True,
                                          sort=False).apply(wrap)

            to_merge.reset_index(1, drop=True, inplace=True)
            frame = pd.merge(left=frame,
                             right=to_merge,
                             left_index=True,
                             right_index=True,
                             how='left')

        # Apply the aggregate functions to generate a new dataframe, and merge
        # it with the existing one
        if len(to_agg):
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True,
                                          sort=False).agg(to_agg)
            # rename columns to the correct feature names
            to_merge.columns = [
                agg_rename["-".join(x)] for x in to_merge.columns.ravel()
            ]
            to_merge = to_merge[list(agg_rename.values())]

            # workaround for pandas bug where categories are in the wrong order
            # see: https://github.com/pandas-dev/pandas/issues/22501
            if pdtypes.is_categorical_dtype(frame.index):
                categories = pdtypes.CategoricalDtype(
                    categories=frame.index.categories)
                to_merge.index = to_merge.index.astype(object).astype(
                    categories)

            frame = pd.merge(left=frame,
                             right=to_merge,
                             left_index=True,
                             right_index=True,
                             how='left')

        # Handle default values
        # 1. handle non-scalar default values
        iterfeats = [
            f for f in features if hasattr(f.default_value, '__iter__')
        ]
        for f in iterfeats:
            nulls = pd.isnull(frame[f.get_name()])
            for ni in nulls[nulls].index:
                frame.at[ni, f.get_name()] = f.default_value

        # 2. handle scalar default values
        fillna_dict = {
            f.get_name(): f.default_value
            for f in features if f not in iterfeats
        }
        frame.fillna(fillna_dict, inplace=True)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (not f.expanding and f.variable_type == variable_types.Numeric
                    and frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        return frame
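
When .agg() receives a dict that maps a column to several functions, as in to_agg above, pandas returns MultiIndex columns that the code flattens with "-".join before looking up the feature names in agg_rename. A minimal sketch of that pattern, using made-up data and feature names:

import pandas as pd

# hypothetical child rows linked to a parent via a "parent_id" key
base_frame = pd.DataFrame({"parent_id": [1, 1, 2], "value": [10, 20, 30]})

# mirrors to_agg / agg_rename above; the feature names are invented
to_agg = {"value": ["sum", "max"]}
agg_rename = {"value-sum": "SUM(child.value)",
              "value-max": "MAX(child.value)"}

to_merge = base_frame.groupby("parent_id").agg(to_agg)
# flatten MultiIndex columns like ("value", "sum") into "value-sum"
# and rename them to the feature names
to_merge.columns = [agg_rename["-".join(col)] for col in to_merge.columns]
print(to_merge)
#            SUM(child.value)  MAX(child.value)
# parent_id
# 1                        30                20
# 2                        30                30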
Example #2
    def _calculate_agg_features(self, features, entity_frames):
        test_feature = features[0]
        use_previous = test_feature.use_previous
        base_features = test_feature.base_features
        where = test_feature.where
        entity = test_feature.entity
        child_entity = base_features[0].entity

        assert entity.id in entity_frames and child_entity.id in entity_frames

        index_var = entity.index
        frame = entity_frames[entity.id]
        base_frame = entity_frames[child_entity.id]
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        features = [f for f in features if f.get_name() not in frame.columns]
        if not len(features):
            return

        # handle where clause for all functions below
        if where is not None:
            base_frame = base_frame[base_frame[where.get_name()]]

        relationship_path = self.entityset.find_backward_path(
            entity.id, child_entity.id)

        groupby_var = Relationship._get_link_variable_name(relationship_path)

        # if the use_previous property exists on this feature, include only the
        # instances from the child entity included in that Timedelta
        if use_previous and not base_frame.empty:
            # Filter by use_previous values
            time_last = self.time_last
            if use_previous.is_absolute():
                time_first = time_last - use_previous
                ti = child_entity.time_index
                if ti is not None:
                    base_frame = base_frame[base_frame[ti] >= time_first]
            else:
                n = use_previous.value

                def last_n(df):
                    return df.iloc[-n:]

                base_frame = base_frame.groupby(groupby_var).apply(last_n)

        if not base_frame.empty:
            if groupby_var not in base_frame:
                # This occurred sometimes. It may have to do with category
                # dtypes, but it is not clear. TODO: look into when this occurs
                no_instances = True
            # if the foreign key column in the child (base_frame) that links to
            # frame is an integer and the id column in the parent is an object or
            # category dtype, the .isin() call errors.
            elif (frame[index_var].dtype != base_frame[groupby_var].dtype
                  or frame[index_var].dtype.name.find('category') > -1):
                try:
                    frame_as_obj = frame[index_var].astype(object)
                    base_frame_as_obj = base_frame[groupby_var].astype(object)
                except ValueError:
                    msg = u"Could not join {}.{} (dtype={}) with {}.{} (dtype={})"
                    raise ValueError(
                        msg.format(entity.id, index_var,
                                   frame[index_var].dtype, child_entity.id,
                                   groupby_var, base_frame[groupby_var].dtype))
                else:
                    no_instances = check_no_related_instances(
                        frame_as_obj.values, base_frame_as_obj.values)
            else:
                no_instances = check_no_related_instances(
                    frame[index_var].values, base_frame[groupby_var].values)

        if base_frame.empty or no_instances:
            for f in features:
                set_default_column(entity_frames[entity.id], f)

            return

        def wrap_func_with_name(func, name):
            def inner(x):
                return func(x)

            inner.__name__ = name
            return inner

        to_agg = {}
        agg_rename = {}
        to_apply = set()
        # apply multivariable and time-dependent features as we find them, and
        # save aggregable features for later
        for f in features:
            if _can_agg(f):
                variable_id = f.base_features[0].get_name()
                if variable_id not in to_agg:
                    to_agg[variable_id] = []
                func = f.get_function()
                # make sure function names are unique
                random_id = str(uuid.uuid1())
                func = wrap_func_with_name(func, random_id)
                funcname = random_id
                to_agg[variable_id].append(func)
                agg_rename[u"{}-{}".format(variable_id,
                                           funcname)] = f.get_name()

                continue

            to_apply.add(f)

        # Apply the non-aggregable functions to generate a new dataframe,
        # and merge it with the existing one
        if len(to_apply):
            wrap = agg_wrapper(to_apply, self.time_last)
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var]).apply(wrap)

            to_merge.reset_index(1, drop=True, inplace=True)
            frame = pd.merge(left=frame,
                             right=to_merge,
                             left_on=index_var,
                             right_index=True,
                             how='left')

        # Apply the aggregate functions to generate a new dataframe, and merge
        # it with the existing one
        # Select only [variables] from to_merge because the agg call
        # returns a dataframe whose columns can be a superset of the
        # features we want
        if len(to_agg):
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)

            to_merge = base_frame.groupby(base_frame[groupby_var]).agg(to_agg)
            # applying multiple functions to each column creates a
            # MultiIndex in the columns; flatten it by joining the two
            # levels so that agg_rename can map them to feature names
            to_merge.columns = [
                u"{}-{}".format(n1, n2) for n1, n2 in to_merge.columns.ravel()
            ]
            to_merge = to_merge.rename(columns=agg_rename)
            variables = list(agg_rename.values())
            to_merge = to_merge[variables]
            frame = pd.merge(left=frame,
                             right=to_merge,
                             left_on=index_var,
                             right_index=True,
                             how='left')

        # Handle default values
        # 1. handle non-scalar default values
        iterfeats = [
            f for f in features if hasattr(f.default_value, '__iter__')
        ]
        for f in iterfeats:
            nulls = pd.isnull(frame[f.get_name()])
            for ni in nulls[nulls].index:
                frame.at[ni, f.get_name()] = f.default_value

        # 2. handle scalar default values
        fillna_dict = {
            f.get_name(): f.default_value
            for f in features if f not in iterfeats
        }
        frame.fillna(fillna_dict, inplace=True)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (not f.expanding and f.variable_type == variable_types.Numeric
                    and frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        entity_frames[entity.id] = frame
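
Example #2 wraps every aggregation function via wrap_func_with_name with a uuid-based name because pandas labels .agg() output columns by each callable's __name__ and rejects duplicates. A small sketch of the problem and the fix (data and column names are hypothetical):

import uuid
import pandas as pd

def wrap_func_with_name(func, name):
    def inner(x):
        return func(x)
    inner.__name__ = name
    return inner

df = pd.DataFrame({"parent_id": [1, 1, 2], "value": [10, 20, 30]})

# both callables share the name "<lambda>"; passing them to .agg()
# directly raises a SpecificationError, since output column labels
# come from __name__ and must be unique
funcs = [lambda s: s.max(), lambda s: s.min()]
named = [wrap_func_with_name(f, str(uuid.uuid1())) for f in funcs]

result = df.groupby("parent_id").agg({"value": named})
# the second column level now holds the uuid names, which agg_rename
# later maps back to readable feature names
print(result.columns.tolist())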
Example #3
    def _calculate_agg_features(self, features, entity_frames):
        test_feature = features[0]
        use_previous = test_feature.use_previous
        base_features = test_feature.base_features
        where = test_feature.where
        entity = test_feature.entity
        child_entity = base_features[0].entity

        assert entity.id in entity_frames and child_entity.id in entity_frames

        index_var = entity.index
        frame = entity_frames[entity.id]
        base_frame = entity_frames[child_entity.id]
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        features = [f for f in features if f.get_name()
                    not in frame.columns]
        if not len(features):
            return frame

        # handle where clause for all functions below
        if where is not None:
            base_frame = base_frame[base_frame[where.get_name()]]

        relationship_path = self.entityset.find_backward_path(entity.id,
                                                              child_entity.id)

        groupby_var = Relationship._get_link_variable_name(relationship_path)

        # if the use_previous property exists on this feature, include only the
        # instances from the child entity included in that Timedelta
        if use_previous and not base_frame.empty:
            # Filter by use_previous values
            time_last = self.time_last
            if use_previous.is_absolute():
                time_first = time_last - use_previous
                ti = child_entity.time_index
                if ti is not None:
                    base_frame = base_frame[base_frame[ti] >= time_first]
            else:
                n = use_previous.value

                def last_n(df):
                    return df.iloc[-n:]

                base_frame = base_frame.groupby(groupby_var).apply(last_n)

        if not base_frame.empty:
            if groupby_var not in base_frame:
                # This occurred sometimes. It may have to do with category
                # dtypes, but it is not clear. TODO: look into when this occurs
                no_instances = True
            # if the foreign key column in the child (base_frame) that links to
            # frame is an integer and the id column in the parent is an object or
            # category dtype, the .isin() call errors.
            elif (frame[index_var].dtype != base_frame[groupby_var].dtype or
                    frame[index_var].dtype.name.find('category') > -1):
                try:
                    frame_as_obj = frame[index_var].astype(object)
                    base_frame_as_obj = base_frame[groupby_var].astype(object)
                except ValueError:
                    msg = u"Could not join {}.{} (dtype={}) with {}.{} (dtype={})"
                    raise ValueError(msg.format(entity.id, index_var,
                                                frame[index_var].dtype,
                                                child_entity.id, groupby_var,
                                                base_frame[groupby_var].dtype))
                else:
                    no_instances = check_no_related_instances(
                        frame_as_obj.values, base_frame_as_obj.values)
            else:
                no_instances = check_no_related_instances(
                    frame[index_var].values, base_frame[groupby_var].values)

        if base_frame.empty or no_instances:
            for f in features:
                set_default_column(entity_frames[entity.id], f)

            return frame

        def wrap_func_with_name(func, name):
            def inner(x):
                return func(x)
            inner.__name__ = name
            return inner

        to_agg = {}
        agg_rename = {}
        to_apply = set()
        # apply multivariable and time-dependent features as we find them, and
        # save aggregable features for later
        for f in features:
            if _can_agg(f):
                variable_id = f.base_features[0].get_name()
                if variable_id not in to_agg:
                    to_agg[variable_id] = []
                func = f.get_function()
                # make sure function names are unique
                random_id = str(uuid.uuid1())
                func = wrap_func_with_name(func, random_id)
                funcname = random_id
                to_agg[variable_id].append(func)
                agg_rename[u"{}-{}".format(variable_id, funcname)] = \
                    f.get_name()

                continue

            to_apply.add(f)

        # Apply the non-aggregable functions to generate a new dataframe,
        # and merge it with the existing one
        if len(to_apply):
            wrap = agg_wrapper(to_apply, self.time_last)
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var]).apply(wrap)

            to_merge.reset_index(1, drop=True, inplace=True)
            frame = pd.merge(left=frame, right=to_merge,
                             left_on=index_var, right_index=True, how='left')

        # Apply the aggregate functions to generate a new dataframe, and merge
        # it with the existing one
        # Select only [variables] from to_merge because the agg call
        # returns a dataframe whose columns can be a superset of the
        # features we want
        if len(to_agg):
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)

            to_merge = base_frame.groupby(base_frame[groupby_var]).agg(to_agg)
            # applying multiple functions to each column creates a
            # MultiIndex in the columns; flatten it by joining the two
            # levels so that agg_rename can map them to feature names
            to_merge.columns = [u"{}-{}".format(n1, n2)
                                for n1, n2 in to_merge.columns.ravel()]
            to_merge = to_merge.rename(columns=agg_rename)
            variables = list(agg_rename.values())
            to_merge = to_merge[variables]
            frame = pd.merge(left=frame, right=to_merge,
                             left_on=index_var, right_index=True, how='left')

        # Handle default values
        # 1. handle non-scalar default values
        iterfeats = [f for f in features
                     if hasattr(f.default_value, '__iter__')]
        for f in iterfeats:
            nulls = pd.isnull(frame[f.get_name()])
            for ni in nulls[nulls].index:
                frame.at[ni, f.get_name()] = f.default_value

        # 2. handle scalar default values
        fillna_dict = {f.get_name(): f.default_value for f in features
                       if f not in iterfeats}
        frame.fillna(fillna_dict, inplace=True)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (not f.expanding and
                    f.variable_type == variable_types.Numeric and
                    frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        return frame
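
check_no_related_instances and set_default_column are featuretools-internal helpers that do not appear in these listings. A plausible stand-in for the first, assuming it only needs to detect that no child row references any parent instance, so the merge can be skipped and defaults filled in:

import numpy as np

def check_no_related_instances(parent_ids, child_ids):
    # True when the two key arrays share no values, i.e. a join would
    # yield only NaNs (hypothetical reimplementation, not the original)
    return not np.any(np.isin(child_ids, parent_ids))

parents = np.array([1, 2, 3], dtype=object)
children = np.array([7, 8], dtype=object)
print(check_no_related_instances(parents, children))  # True: skip the join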