Esempio n. 1
0
def test_cutoff_time_binning(entityset):
    """Check the 'time' column returned by bin_cutoff_times for two windows.

    The same three cutoff times are binned with a 4 hour and a 25 hour
    window, and every resulting time is compared with its expected label.
    """
    raw_times = [
        datetime(2011, 4, 9, 12, 31),
        datetime(2011, 4, 10, 11),
        datetime(2011, 4, 10, 13, 10, 1),
    ]
    cutoff_time = pd.DataFrame({'time': raw_times, 'instance_id': [1, 2, 3]})

    # Each case is (window length in hours, expected binned time per row).
    cases = [
        (4, [datetime(2011, 4, 9, 12),
             datetime(2011, 4, 10, 8),
             datetime(2011, 4, 10, 12)]),
        (25, [datetime(2011, 4, 8, 22),
              datetime(2011, 4, 9, 23),
              datetime(2011, 4, 9, 23)]),
    ]
    for hours, expected in cases:
        binned = bin_cutoff_times(cutoff_time, Timedelta(hours, 'h'))
        binned_times = binned['time']
        for row in binned.index:
            assert binned_times[row] == expected[row]
def test_cutoff_time_binning(entityset):
    """Verify bin_cutoff_times output for a 4 hour and a 25 hour window."""
    cutoff_time = pd.DataFrame({
        'time': [
            datetime(2011, 4, 9, 12, 31),
            datetime(2011, 4, 10, 11),
            datetime(2011, 4, 10, 13, 10, 1),
        ],
        'instance_id': [1, 2, 3],
    })

    def check_bins(window, labels):
        # Bin the shared cutoff times and compare every row with its label.
        binned_cutoff_times = bin_cutoff_times(cutoff_time, window)
        time_column = binned_cutoff_times['time']
        for i in binned_cutoff_times.index:
            assert time_column[i] == labels[i]

    check_bins(
        Timedelta(4, 'h'),
        [
            datetime(2011, 4, 9, 12),
            datetime(2011, 4, 10, 8),
            datetime(2011, 4, 10, 12),
        ],
    )
    check_bins(
        Timedelta(25, 'h'),
        [
            datetime(2011, 4, 8, 22),
            datetime(2011, 4, 9, 23),
            datetime(2011, 4, 9, 23),
        ],
    )
def approximate_features(feature_set,
                         cutoff_time,
                         window,
                         entityset,
                         training_window=None):
    '''Given a set of features and cutoff_times to be passed to
    calculate_feature_matrix, calculates approximate values of some features
    to speed up calculations.  Cutoff times are sorted into
    window-sized buckets and the approximate feature values are only calculated
    at one cutoff time for each bucket.


    .. note:: this only approximates DirectFeatures of AggregationFeatures, on
        the target entity. In future versions, it may also be possible to
        approximate these features on other top-level entities

    Args:
        feature_set (:class:`.FeatureSet`): The features to be calculated.

        cutoff_time (pd.DataFrame): specifies what time to calculate
            the features for each instance at. The resulting feature matrix will use data
            up to and including the cutoff_time. A DataFrame with
            'instance_id' and 'time' columns. The DataFrame is not modified;
            all work is done on a copy.

        window (Timedelta or str): frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if window is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        entityset (:class:`.EntitySet`): An already initialized entityset.

        training_window (`Timedelta`, optional):
            Window defining how much older than the cutoff time data
            can be to be included when calculating the feature. If None, all older data is used.

    Returns:
        Trie: a trie built with ``RelationshipPath`` paths. For every
            relationship path of ``feature_set.approximate_feature_trie`` that
            has feature names, the node at that path holds the feature matrix
            computed at the binned cutoff times (or the result of
            ``gen_empty_approx_features_df`` when no cutoff rows remain).
    '''
    approx_fms_trie = Trie(path_constructor=RelationshipPath)

    target_time_colname = 'target_time'
    cutoff_df_time_var = 'time'
    cutoff_df_instance_var = 'instance_id'

    # Add the original-time column on a private copy so the caller's
    # DataFrame does not silently gain a 'target_time' column.
    cutoffs_with_target = cutoff_time.copy()
    cutoffs_with_target[target_time_colname] = cutoffs_with_target[cutoff_df_time_var]
    approx_cutoffs = bin_cutoff_times(cutoffs_with_target, window)

    # should this order be by dependencies so that calculate_feature_matrix
    # doesn't skip approximating something?
    for relationship_path, approx_feature_names in feature_set.approximate_feature_trie:
        if not approx_feature_names:
            continue

        cutoffs_with_approx_e_ids, new_approx_entity_index_var = \
            _add_approx_entity_index_var(entityset, feature_set.target_eid,
                                         approx_cutoffs.copy(), relationship_path)

        # Select only columns we care about, then drop repeated rows and rows
        # that have no value in the approximated entity's index column.
        columns_we_want = [
            new_approx_entity_index_var, cutoff_df_time_var,
            target_time_colname
        ]
        cutoffs_with_approx_e_ids = (
            cutoffs_with_approx_e_ids[columns_we_want]
            .drop_duplicates()
            .dropna(subset=[new_approx_entity_index_var])
        )

        approx_features = [
            feature_set.features_by_name[name] for name in approx_feature_names
        ]
        if cutoffs_with_approx_e_ids.empty:
            approx_fm = gen_empty_approx_features_df(approx_features)
        else:
            cutoffs_with_approx_e_ids = cutoffs_with_approx_e_ids.sort_values(
                [cutoff_df_time_var, new_approx_entity_index_var])
            # CFM assumes specific column names for cutoff_time argument
            rename = {new_approx_entity_index_var: cutoff_df_instance_var}
            cutoff_time_to_pass = cutoffs_with_approx_e_ids.rename(
                columns=rename)
            # Reassign instead of mutating in place: the column selection may
            # be a view-like object of the renamed frame.
            cutoff_time_to_pass = cutoff_time_to_pass[[
                cutoff_df_instance_var, cutoff_df_time_var
            ]].drop_duplicates()

            # approximate=None: the nested call must not approximate again.
            # chunk_size is the full row count, i.e. a single chunk.
            approx_fm = calculate_feature_matrix(
                approx_features,
                entityset,
                cutoff_time=cutoff_time_to_pass,
                training_window=training_window,
                approximate=None,
                cutoff_time_in_index=False,
                chunk_size=cutoff_time_to_pass.shape[0])

        approx_fms_trie.get_node(relationship_path).value = approx_fm

    return approx_fms_trie
def calculate_feature_matrix(features,
                             entityset=None,
                             cutoff_time=None,
                             instance_ids=None,
                             entities=None,
                             relationships=None,
                             cutoff_time_in_index=False,
                             training_window=None,
                             approximate=None,
                             save_progress=None,
                             verbose=False,
                             chunk_size=None,
                             n_jobs=1,
                             dask_kwargs=None):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[:class:`.FeatureBase`]): Feature definitions to be calculated.

        entityset (EntitySet): An already initialized entityset. Required if `entities` and `relationships`
            not provided

        cutoff_time (pd.DataFrame or Datetime): Specifies at which time to calculate
            the features for each instance. The resulting feature matrix will use data
            up to and including the cutoff_time. Can either be a DataFrame with
            'instance_id' and 'time' columns, DataFrame with the name of the
            index variable in the target entity and a time column, or a single
            value to calculate for all instances. If the dataframe has more than two columns, any additional
            columns will be added to the resulting feature matrix.

        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.

        entities (dict[str -> tuple(pd.DataFrame, str, str)]): dictionary of
            entities. Entries take the format
            {entity id: (dataframe, id column, (time_column))}.

        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (Timedelta or str, optional):
            Window defining how much time before the cutoff time data
            can be used when calculating features. If ``None``, all data before cutoff time is used.
            Defaults to ``None``.

        approximate (Timedelta or str): Frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.

        chunk_size (int or float or None or "cutoff time"): Number of rows of
            output feature matrix to calculate at time. If passed an integer
            greater than 0, will try to use that many rows per chunk. If passed
            a float value between 0 and 1 sets the chunk size to that
            percentage of all instances. If passed the string "cutoff time",
            rows are split per cutoff time.

        n_jobs (int, optional): number of parallel processes to use when
            calculating feature matrix

        dask_kwargs (dict, optional): Dictionary of keyword arguments to be
            passed when creating the dask client and scheduler. Even if n_jobs
            is not set, using `dask_kwargs` will enable multiprocessing.
            Main parameters:

            cluster (str or dask.distributed.LocalCluster):
                cluster or address of cluster to send tasks to. If unspecified,
                a cluster will be created.
            diagnostics port (int):
                port number to use for web dashboard.  If left unspecified, web
                interface will not be enabled.

            Valid keyword arguments for LocalCluster will also be accepted.

        save_progress (str, optional): path to save intermediate computational results.

    Returns:
        pd.DataFrame: the concatenated per-chunk results, sorted on the
            'time' index level. That level is dropped from the index unless
            `cutoff_time_in_index` is True.

    Raises:
        AssertionError: if `features` is not a non-empty list of features, or
            if cutoff_time contains duplicated (instance_id, time) rows.
        TypeError: if cutoff_time is a list, or its time column dtype does not
            match the entityset time type.
        AttributeError: if a cutoff_time DataFrame has neither an
            "instance_id" column nor a column named like the target entity index.
        ValueError: if the first cutoff time value is neither datetime nor numeric.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, FeatureBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    target_entity = entityset[features[0].entity.id]

    if not isinstance(cutoff_time, pd.DataFrame):
        if isinstance(cutoff_time, list):
            raise TypeError("cutoff_time must be a single value or DataFrame")

        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            df = target_entity._handle_time(target_entity.df,
                                            time_last=cutoff_time,
                                            training_window=training_window)
            instance_ids = df[index_var].tolist()

        # Expand the single cutoff value into one (instance_id, time) row per instance.
        cutoff_time = [cutoff_time] * len(instance_ids)
        map_args = [(id, time) for id, time in zip(instance_ids, cutoff_time)]
        cutoff_time = pd.DataFrame(map_args, columns=['instance_id', 'time'])

    # reset_index returns a new frame, so the in-place renames below do not
    # touch the DataFrame supplied by the caller.
    cutoff_time = cutoff_time.reset_index(drop=True)
    # handle how columns are names in cutoff_time
    # maybe add _check_time_dtype helper function
    if "instance_id" not in cutoff_time.columns:
        if target_entity.index not in cutoff_time.columns:
            raise AttributeError(
                'Name of the index variable in the target entity'
                ' or "instance_id" must be present in cutoff_time')
        # rename to instance_id
        cutoff_time.rename(columns={target_entity.index: "instance_id"},
                           inplace=True)

    if "time" not in cutoff_time.columns:
        # take the first column that isn't instance_id and assume it is time
        not_instance_id = [
            c for c in cutoff_time.columns if c != "instance_id"
        ]
        cutoff_time.rename(columns={not_instance_id[0]: "time"}, inplace=True)
    # Check that cutoff_time time type matches entityset time type
    if entityset.time_type == NumericTimeIndex:
        if cutoff_time['time'].dtype.name not in PandasTypes._pandas_numerics:
            raise TypeError("cutoff_time times must be numeric: try casting "
                            "via pd.to_numeric(cutoff_time['time'])")
    elif entityset.time_type == DatetimeTimeIndex:
        if cutoff_time['time'].dtype.name not in PandasTypes._pandas_datetimes:
            raise TypeError(
                "cutoff_time times must be datetime type: try casting via pd.to_datetime(cutoff_time['time'])"
            )
    assert (cutoff_time[['instance_id', 'time']].duplicated().sum() == 0), \
        "Duplicated rows in cutoff time dataframe."

    # Every column other than the id and time columns is passed through to
    # the result. Selecting by name (rather than positionally with
    # columns[2:]) keeps this correct when 'instance_id' and 'time' are not
    # the first two columns of the DataFrame.
    pass_columns = [
        column_name for column_name in cutoff_time.columns
        if column_name not in ('instance_id', 'time')
    ]

    if _check_time_type(cutoff_time['time'].iloc[0]) is None:
        raise ValueError("cutoff_time time values must be datetime or numeric")

    # make sure dtype of instance_id in cutoff time
    # is same as column it references
    target_entity = features[0].entity
    dtype = entityset[target_entity.id].df[target_entity.index].dtype
    cutoff_time["instance_id"] = cutoff_time["instance_id"].astype(dtype)

    feature_set = FeatureSet(features)

    # Get features to approximate
    if approximate is not None:
        approximate_feature_trie = gather_approximate_features(feature_set)
        # Make a new FeatureSet that ignores approximated features
        feature_set = FeatureSet(
            features, approximate_feature_trie=approximate_feature_trie)
        # The set of approximated features is the same for every feature
        # inspected below, so build it once.
        all_approx_features = {
            f
            for _, feats in feature_set.approximate_feature_trie
            for f in feats
        }
    else:
        all_approx_features = set()

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationFeature):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        deps = feature.get_dependencies(deep=True, ignored=all_approx_features)
        if any(isinstance(dependency, AggregationFeature)
               for dependency in deps):
            # One hit settles the flag; no need to inspect further features.
            no_unapproximated_aggs = False
            break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'
    num_per_chunk = calc_num_per_chunk(chunk_size, cutoff_time.shape)

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time.copy(), approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time

    else:
        cutoff_time_to_pass = cutoff_time

    if num_per_chunk == "cutoff time":
        # One chunk per distinct cutoff time.
        chunks = [
            group
            for _, group in cutoff_time_to_pass.groupby(cutoff_df_time_var)
        ]
    else:
        chunks = list(get_next_chunk(cutoff_time=cutoff_time_to_pass,
                                     time_variable=cutoff_df_time_var,
                                     num_per_chunk=num_per_chunk))

    if n_jobs != 1 or dask_kwargs is not None:
        feature_matrix = parallel_calculate_chunks(
            chunks=chunks,
            feature_set=feature_set,
            approximate=approximate,
            training_window=training_window,
            verbose=verbose,
            save_progress=save_progress,
            entityset=entityset,
            n_jobs=n_jobs,
            no_unapproximated_aggs=no_unapproximated_aggs,
            cutoff_df_time_var=cutoff_df_time_var,
            target_time=target_time,
            pass_columns=pass_columns,
            dask_kwargs=dask_kwargs or {})
    else:
        feature_matrix = linear_calculate_chunks(
            chunks=chunks,
            feature_set=feature_set,
            approximate=approximate,
            training_window=training_window,
            verbose=verbose,
            save_progress=save_progress,
            entityset=entityset,
            no_unapproximated_aggs=no_unapproximated_aggs,
            cutoff_df_time_var=cutoff_df_time_var,
            target_time=target_time,
            pass_columns=pass_columns)

    feature_matrix = pd.concat(feature_matrix)

    # mergesort is a stable sort, so rows sharing a time keep their order.
    feature_matrix.sort_index(level='time', kind='mergesort', inplace=True)
    if not cutoff_time_in_index:
        feature_matrix.reset_index(level='time', drop=True, inplace=True)

    # Remove the temporary directory under save_progress, if one exists.
    if save_progress and os.path.exists(os.path.join(save_progress, 'temp')):
        shutil.rmtree(os.path.join(save_progress, 'temp'))

    return feature_matrix
Esempio n. 5
0
def calculate_feature_matrix(features,
                             entityset=None,
                             cutoff_time=None,
                             instance_ids=None,
                             entities=None,
                             relationships=None,
                             cutoff_time_in_index=False,
                             training_window=None,
                             approximate=None,
                             save_progress=None,
                             verbose=False,
                             chunk_size=None,
                             n_jobs=1,
                             dask_kwargs=None,
                             progress_callback=None,
                             include_cutoff_time=True):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[:class:`.FeatureBase`]): Feature definitions to be calculated.

        entityset (EntitySet): An already initialized entityset. Required if `entities` and `relationships`
            not provided

        cutoff_time (pd.DataFrame or Datetime): Specifies times at which to calculate
            the features for each instance. The resulting feature matrix will use data
            up to and including the cutoff_time. Can either be a DataFrame or a single
            value. If a DataFrame is passed the instance ids for which to calculate features
            must be in a column with the same name as the target entity index or a column
            named `instance_id`. The cutoff time values in the DataFrame must be in a column with
            the same name as the target entity time index or a column named `time`. If the
            DataFrame has more than two columns, any additional columns will be added to the
            resulting feature matrix. If a single value is passed, this value will be used for
            all instances.

        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.

        entities (dict[str -> tuple(pd.DataFrame, str, str, dict[str -> Variable])]): dictionary of
            entities. Entries take the format
            {entity id -> (dataframe, id column, (time_column), (variable_types))}.
            Note that time_column and variable_types are optional.

        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (Timedelta or str, optional):
            Window defining how much time before the cutoff time data
            can be used when calculating features. If ``None``, all data before cutoff time is used.
            Defaults to ``None``.

        approximate (Timedelta or str): Frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.

        chunk_size (int or float or None): maximum number of rows of
            output feature matrix to calculate at time. If passed an integer
            greater than 0, will try to use that many rows per chunk. If passed
            a float value between 0 and 1 sets the chunk size to that
            percentage of all rows. if None, and n_jobs > 1 it will be set to 1/n_jobs

        n_jobs (int, optional): number of parallel processes to use when
            calculating feature matrix.

        dask_kwargs (dict, optional): Dictionary of keyword arguments to be
            passed when creating the dask client and scheduler. Even if n_jobs
            is not set, using `dask_kwargs` will enable multiprocessing.
            Main parameters:

            cluster (str or dask.distributed.LocalCluster):
                cluster or address of cluster to send tasks to. If unspecified,
                a cluster will be created.
            diagnostics port (int):
                port number to use for web dashboard.  If left unspecified, web
                interface will not be enabled.

            Valid keyword arguments for LocalCluster will also be accepted.

        save_progress (str, optional): path to save intermediate computational results.

        progress_callback (callable): function to be called with incremental progress updates.
            Has the following parameters:

                update: percentage change (float between 0 and 100) in progress since last call
                progress_percent: percentage (float between 0 and 100) of total computation completed
                time_elapsed: total time in seconds that has elapsed since start of call

        include_cutoff_time (bool): Include data at cutoff times in feature calculations. Defaults to ``True``.

    Returns:
        The feature matrix produced by the chunk calculation. When it is a
        pandas DataFrame its rows are reindexed to follow the row order of
        cutoff_time, and the 'time' index level is dropped unless
        `cutoff_time_in_index` is True.

    Raises:
        AssertionError: if `features` is not a non-empty list of features, or
            if cutoff_time contains duplicated (instance_id, time) rows.
        ValueError: if `approximate` or `training_window` is used with Dask
            entities, or if the first cutoff time value is neither datetime
            nor numeric.
        TypeError: if cutoff_time is a list or a Dask DataFrame, or its time
            column dtype does not match the entityset time type.
        AttributeError: if the cutoff_time DataFrame lacks a usable instance
            id or time column, or names either of them ambiguously.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, FeatureBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    if any(isinstance(es.df, dd.DataFrame) for es in entityset.entities):
        if approximate:
            msg = "Using approximate is not supported with Dask Entities"
            raise ValueError(msg)
        if training_window:
            msg = "Using training_window is not supported with Dask Entities"
            raise ValueError(msg)

    target_entity = entityset[features[0].entity.id]

    if not isinstance(cutoff_time, pd.DataFrame):
        if isinstance(cutoff_time, list):
            raise TypeError("cutoff_time must be a single value or DataFrame")

        if isinstance(cutoff_time, dd.DataFrame):
            msg = "cannot use Dask DataFrame for cutoff_time: "\
                  "cutoff_time must a single value or a Pandas DataFrame"
            raise TypeError(msg)

        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            df = target_entity._handle_time(
                target_entity.df,
                time_last=cutoff_time,
                training_window=training_window,
                include_cutoff_time=include_cutoff_time)
            instance_ids = list(df[index_var])

        # Expand the single cutoff value into one (instance_id, time) row per instance.
        cutoff_time = [cutoff_time] * len(instance_ids)
        map_args = [(id, time) for id, time in zip(instance_ids, cutoff_time)]
        cutoff_time = pd.DataFrame(map_args, columns=['instance_id', 'time'])

    # reset_index returns a new frame, so the in-place renames below do not
    # touch the DataFrame supplied by the caller.
    cutoff_time = cutoff_time.reset_index(drop=True)
    # handle how columns are names in cutoff_time
    # maybe add _check_time_dtype helper function
    if "instance_id" not in cutoff_time.columns:
        if target_entity.index not in cutoff_time.columns:
            raise AttributeError(
                'Cutoff time DataFrame must contain a column with either the same name'
                ' as the target entity index or a column named "instance_id"')
        # rename to instance_id
        cutoff_time.rename(columns={target_entity.index: "instance_id"},
                           inplace=True)

    if "time" not in cutoff_time.columns:
        if target_entity.time_index and target_entity.time_index not in cutoff_time.columns:
            raise AttributeError(
                'Cutoff time DataFrame must contain a column with either the same name'
                ' as the target entity time_index or a column named "time"')
        # rename to time
        cutoff_time.rename(columns={target_entity.time_index: "time"},
                           inplace=True)

    # Make sure user supplies only one valid name for instance id and time columns
    if "instance_id" in cutoff_time.columns and target_entity.index in cutoff_time.columns and \
            "instance_id" != target_entity.index:
        raise AttributeError(
            'Cutoff time DataFrame cannot contain both a column named "instance_id" and a column'
            ' with the same name as the target entity index')
    if "time" in cutoff_time.columns and target_entity.time_index in cutoff_time.columns and \
            "time" != target_entity.time_index:
        raise AttributeError(
            'Cutoff time DataFrame cannot contain both a column named "time" and a column'
            ' with the same name as the target entity time index')

    # Check that cutoff_time time type matches entityset time type
    if entityset.time_type == NumericTimeIndex:
        if cutoff_time['time'].dtype.name not in PandasTypes._pandas_numerics:
            raise TypeError("cutoff_time times must be numeric: try casting "
                            "via pd.to_numeric(cutoff_time['time'])")
    elif entityset.time_type == DatetimeTimeIndex:
        if cutoff_time['time'].dtype.name not in PandasTypes._pandas_datetimes:
            raise TypeError(
                "cutoff_time times must be datetime type: try casting via pd.to_datetime(cutoff_time['time'])"
            )
    assert (cutoff_time[['instance_id', 'time']].duplicated().sum() == 0), \
        "Duplicated rows in cutoff time dataframe."

    # Every column other than the id and time columns is passed through to
    # the resulting feature matrix.
    pass_columns = [
        col for col in cutoff_time.columns
        if col not in ['instance_id', 'time']
    ]

    if _check_time_type(cutoff_time['time'].iloc[0]) is None:
        raise ValueError("cutoff_time time values must be datetime or numeric")

    # make sure dtype of instance_id in cutoff time
    # is same as column it references
    target_entity = features[0].entity
    dtype = entityset[target_entity.id].df[target_entity.index].dtype
    cutoff_time["instance_id"] = cutoff_time["instance_id"].astype(dtype)

    feature_set = FeatureSet(features)

    # Get features to approximate
    if approximate is not None:
        approximate_feature_trie = gather_approximate_features(feature_set)
        # Make a new FeatureSet that ignores approximated features
        feature_set = FeatureSet(
            features, approximate_feature_trie=approximate_feature_trie)
        # The set of approximated features is the same for every feature
        # inspected below, so build it once.
        all_approx_features = {
            f
            for _, feats in feature_set.approximate_feature_trie
            for f in feats
        }
    else:
        all_approx_features = set()

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationFeature):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        deps = feature.get_dependencies(deep=True, ignored=all_approx_features)
        if any(isinstance(dependency, AggregationFeature)
               for dependency in deps):
            # One hit settles the flag; no need to inspect further features.
            no_unapproximated_aggs = False
            break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time.copy(), approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time

    else:
        cutoff_time_to_pass = cutoff_time

    chunk_size = _handle_chunk_size(chunk_size, cutoff_time.shape[0])
    tqdm_options = {
        'total': (cutoff_time.shape[0] / FEATURE_CALCULATION_PERCENTAGE),
        'bar_format': PBAR_FORMAT,
        'disable': True
    }

    # Handle on os.devnull used as a silent sink for the progress bar; it is
    # kept in a local so it can be closed once the calculation is over.
    devnull = None
    if verbose:
        tqdm_options.update({'disable': False})
    elif progress_callback is not None:
        # allows us to utilize progress_bar updates without printing to anywhere
        devnull = open(os.devnull, 'w')
        tqdm_options.update({'file': devnull, 'disable': False})

    try:
        progress_bar = make_tqdm_iterator(**tqdm_options)
        progress_bar._instances.clear()

        if n_jobs != 1 or dask_kwargs is not None:
            feature_matrix = parallel_calculate_chunks(
                cutoff_time=cutoff_time_to_pass,
                chunk_size=chunk_size,
                feature_set=feature_set,
                approximate=approximate,
                training_window=training_window,
                save_progress=save_progress,
                entityset=entityset,
                n_jobs=n_jobs,
                no_unapproximated_aggs=no_unapproximated_aggs,
                cutoff_df_time_var=cutoff_df_time_var,
                target_time=target_time,
                pass_columns=pass_columns,
                progress_bar=progress_bar,
                dask_kwargs=dask_kwargs or {},
                progress_callback=progress_callback,
                include_cutoff_time=include_cutoff_time)
        else:
            feature_matrix = calculate_chunk(
                cutoff_time=cutoff_time_to_pass,
                chunk_size=chunk_size,
                feature_set=feature_set,
                approximate=approximate,
                training_window=training_window,
                save_progress=save_progress,
                entityset=entityset,
                no_unapproximated_aggs=no_unapproximated_aggs,
                cutoff_df_time_var=cutoff_df_time_var,
                target_time=target_time,
                pass_columns=pass_columns,
                progress_bar=progress_bar,
                progress_callback=progress_callback,
                include_cutoff_time=include_cutoff_time)

        # ensure rows are sorted by input order
        if isinstance(feature_matrix, pd.DataFrame):
            feature_matrix = feature_matrix.reindex(
                pd.MultiIndex.from_frame(cutoff_time[["instance_id", "time"]],
                                         names=feature_matrix.index.names))
            if not cutoff_time_in_index:
                feature_matrix.reset_index(level='time', drop=True, inplace=True)

        # Remove the temporary directory under save_progress, if one exists.
        if save_progress and os.path.exists(os.path.join(save_progress, 'temp')):
            shutil.rmtree(os.path.join(save_progress, 'temp'))

        # force to 100% since we saved last 5 percent
        previous_progress = progress_bar.n
        progress_bar.update(progress_bar.total - progress_bar.n)

        if progress_callback is not None:
            update, progress_percent, time_elapsed = update_progress_callback_parameters(
                progress_bar, previous_progress)
            progress_callback(update, progress_percent, time_elapsed)

        progress_bar.refresh()
        progress_bar.close()
    finally:
        # Release the devnull handle on success and on error alike so the
        # file descriptor is not leaked.
        if devnull is not None:
            devnull.close()
    return feature_matrix
Esempio n. 6
0
def approximate_features(features,
                         cutoff_time,
                         window,
                         entityset,
                         backend,
                         training_window=None,
                         profile=None):
    '''Given a list of features and cutoff_times to be passed to
    calculate_feature_matrix, calculates approximate values of some features
    to speed up calculations.  Cutoff times are sorted into
    window-sized buckets and the approximate feature values are only calculated
    at one cutoff time for each bucket.


    .. note:: this only approximates DirectFeatures of AggregationFeatures, on
        the target entity. In future versions, it may also be possible to
        approximate these features on other top-level entities

    .. note:: ``cutoff_time`` is modified in place: a ``'target_time'`` column
        and a column named after the target entity's index are added to it.

    Args:
        features (list[:class:`.FeatureBase`]): if these features are dependent
            on aggregation features on the prediction, the approximate values
            for the aggregation feature will be calculated

        cutoff_time (pd.DataFrame): specifies what time to calculate
            the features for each instance at. The resulting feature matrix will use data
            up to and including the cutoff_time. A DataFrame with
            'instance_id' and 'time' columns.

        window (Timedelta or str): frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        entityset (:class:`.EntitySet`): An already initialized entityset.

        backend: computational backend; only forwarded to
            ``gather_approximate_features``.

        training_window (`Timedelta`, optional):
            Window defining how much older than the cutoff time data
            can be to be included when calculating the feature. If None, all older data is used.

        profile (bool, optional): Enables profiling if True

    Returns:
        tuple: ``(approx_fms_by_entity, all_approx_feature_set)`` where
            ``approx_fms_by_entity`` is a dict mapping each approximated
            entity id to its approximate feature matrix, and
            ``all_approx_feature_set`` is the second value returned by
            ``gather_approximate_features``.
    '''
    approx_fms_by_entity = {}
    target_entity = features[0].entity
    target_index_var = target_entity.index

    to_approximate, all_approx_feature_set = gather_approximate_features(
        features, backend)

    # NOTE: these two assignments add columns to the caller's DataFrame;
    # only the binned frame below is a copy.
    target_time_colname = 'target_time'
    cutoff_time[target_time_colname] = cutoff_time['time']
    target_instance_colname = target_index_var
    cutoff_time[target_instance_colname] = cutoff_time['instance_id']
    approx_cutoffs = bin_cutoff_times(cutoff_time.copy(), window)
    cutoff_df_time_var = 'time'
    cutoff_df_instance_var = 'instance_id'
    # should this order be by dependencies so that calculate_feature_matrix
    # doesn't skip approximating something?
    for approx_entity_id, approx_features in to_approximate.items():
        # Gather associated instance_ids from the approximate entity
        cutoffs_with_approx_e_ids = approx_cutoffs.copy()
        frames = entityset.get_pandas_data_slice(
            [approx_entity_id, target_entity.id], target_entity.id,
            cutoffs_with_approx_e_ids[target_instance_colname])

        if frames is not None:
            path = entityset.find_path(approx_entity_id, target_entity.id)
            rvar = get_relationship_variable_id(path)
            parent_instance_frame = frames[approx_entity_id][target_entity.id]
            cutoffs_with_approx_e_ids[rvar] = \
                cutoffs_with_approx_e_ids.merge(parent_instance_frame[[rvar]],
                                                left_on=target_index_var,
                                                right_index=True,
                                                how='left')[rvar].values
            new_approx_entity_index_var = rvar

            # Select only columns we care about
            columns_we_want = [
                target_instance_colname, new_approx_entity_index_var,
                cutoff_df_time_var, target_time_colname
            ]

            # Rebind instead of using inplace=True so we never operate on a
            # frame pandas may consider a view of another one.
            cutoffs_with_approx_e_ids = (
                cutoffs_with_approx_e_ids[columns_we_want]
                .drop_duplicates()
                .dropna(subset=[new_approx_entity_index_var]))
        else:
            cutoffs_with_approx_e_ids = pd.DataFrame()

        if cutoffs_with_approx_e_ids.empty:
            # Store the empty result for this entity only; rebinding the
            # whole dict here would discard previously computed entities.
            approx_fms_by_entity[approx_entity_id] = \
                gen_empty_approx_features_df(approx_features)
            continue

        cutoffs_with_approx_e_ids = cutoffs_with_approx_e_ids.sort_values(
            [cutoff_df_time_var, new_approx_entity_index_var])
        # CFM assumes specific column names for cutoff_time argument
        rename = {new_approx_entity_index_var: cutoff_df_instance_var}
        cutoff_time_to_pass = cutoffs_with_approx_e_ids.rename(columns=rename)
        cutoff_time_to_pass = cutoff_time_to_pass[[
            cutoff_df_instance_var, cutoff_df_time_var
        ]].drop_duplicates()

        approx_fm = calculate_feature_matrix(
            approx_features,
            entityset,
            cutoff_time=cutoff_time_to_pass,
            training_window=training_window,
            approximate=None,
            cutoff_time_in_index=False,
            chunk_size=cutoff_time_to_pass.shape[0],
            profile=profile)

        approx_fms_by_entity[approx_entity_id] = approx_fm

    # Include entity because we only want to ignore features that
    # are base_features/dependencies of the top level entity we're
    # approximating.
    # For instance, if target entity is sessions, and we're
    # approximating customers.COUNT(sessions.COUNT(log.value)),
    # we could also just want the feature COUNT(log.value)
    # defined on sessions
    # as a first class feature in the feature matrix.
    # Unless we signify to only ignore it as a dependency of
    # a feature defined on customers, we would ignore computing it
    # and pandas_backend would error
    return approx_fms_by_entity, all_approx_feature_set
Esempio n. 7
0
def calculate_feature_matrix(features,
                             entityset=None,
                             cutoff_time=None,
                             instance_ids=None,
                             dataframes=None,
                             relationships=None,
                             cutoff_time_in_index=False,
                             training_window=None,
                             approximate=None,
                             save_progress=None,
                             verbose=False,
                             chunk_size=None,
                             n_jobs=1,
                             dask_kwargs=None,
                             progress_callback=None,
                             include_cutoff_time=True):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[:class:`.FeatureBase`]): Feature definitions to be calculated.

        entityset (EntitySet): An already initialized entityset. Required if `dataframes` and `relationships`
            not provided

        cutoff_time (pd.DataFrame or Datetime): Specifies times at which to calculate
            the features for each instance. The resulting feature matrix will use data
            up to and including the cutoff_time. Can either be a DataFrame or a single
            value. If a DataFrame is passed the instance ids for which to calculate features
            must be in a column with the same name as the target dataframe index or a column
            named `instance_id`. The cutoff time values in the DataFrame must be in a column with
            the same name as the target dataframe time index or a column named `time`. If the
            DataFrame has more than two columns, any additional columns will be added to the
            resulting feature matrix. If a single value is passed, this value will be used for
            all instances.

        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.

        dataframes (dict[str -> tuple(DataFrame, str, str, dict[str -> str/Woodwork.LogicalType], dict[str->str/set], boolean)]):
            Dictionary of DataFrames. Entries take the format
            {dataframe name -> (dataframe, index column, time_index, logical_types, semantic_tags, make_index)}.
            Note that only the dataframe is required. If a Woodwork DataFrame is supplied, any other parameters
            will be ignored.

        relationships (list[(str, str, str, str)]): list of relationships
            between dataframes. List items are a tuple with the format
            (parent dataframe name, parent column, child dataframe name, child column).

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (Timedelta or str, optional):
            Window defining how much time before the cutoff time data
            can be used when calculating features. If ``None``, all data before cutoff time is used.
            Defaults to ``None``.

        approximate (Timedelta or str): Frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.

        chunk_size (int or float or None): maximum number of rows of
            output feature matrix to calculate at time. If passed an integer
            greater than 0, will try to use that many rows per chunk. If passed
            a float value between 0 and 1 sets the chunk size to that
            percentage of all rows. if None, and n_jobs > 1 it will be set to 1/n_jobs

        n_jobs (int, optional): number of parallel processes to use when
            calculating feature matrix.

        dask_kwargs (dict, optional): Dictionary of keyword arguments to be
            passed when creating the dask client and scheduler. Even if n_jobs
            is not set, using `dask_kwargs` will enable multiprocessing.
            Main parameters:

            cluster (str or dask.distributed.LocalCluster):
                cluster or address of cluster to send tasks to. If unspecified,
                a cluster will be created.
            diagnostics port (int):
                port number to use for web dashboard.  If left unspecified, web
                interface will not be enabled.

            Valid keyword arguments for LocalCluster will also be accepted.

        save_progress (str, optional): path to save intermediate computational results.

        progress_callback (callable): function to be called with incremental progress updates.
            Has the following parameters:

                update: percentage change (float between 0 and 100) in progress since last call
                progress_percent: percentage (float between 0 and 100) of total computation completed
                time_elapsed: total time in seconds that has elapsed since start of call

        include_cutoff_time (bool): Include data at cutoff times in feature calculations. Defaults to ``True``.

    Returns:
        pd.DataFrame: The feature matrix.

    Raises:
        AssertionError: If ``features`` is not a non-empty list of features.
        TypeError: If neither a valid EntitySet nor ``dataframes`` is provided.
        ValueError: If ``approximate`` or ``training_window`` is used with
            Dask dataframes.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, FeatureBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if dataframes is not None:
            entityset = EntitySet("entityset", dataframes, relationships)
        else:
            raise TypeError("No dataframes or valid EntitySet provided")

    if entityset.dataframe_type == Library.DASK.value:
        if approximate:
            msg = "Using approximate is not supported with Dask dataframes"
            raise ValueError(msg)
        if training_window:
            msg = "Using training_window is not supported with Dask dataframes"
            raise ValueError(msg)

    target_dataframe = entityset[features[0].dataframe_name]

    cutoff_time = _validate_cutoff_time(cutoff_time, target_dataframe)
    entityset._check_time_indexes()

    if isinstance(cutoff_time, pd.DataFrame):
        # Compare against None rather than relying on truthiness: a pandas
        # Series or multi-element array raises ValueError when used as a bool.
        if instance_ids is not None:
            msg = "Passing 'instance_ids' is valid only if 'cutoff_time' is a single value or None - ignoring"
            warnings.warn(msg)
        pass_columns = [
            col for col in cutoff_time.columns
            if col not in ['instance_id', 'time']
        ]
        # make sure dtype of instance_id in cutoff time
        # is same as column it references
        target_dataframe = features[0].dataframe
        ltype = target_dataframe.ww.logical_types[target_dataframe.ww.index]
        cutoff_time.ww.init(logical_types={'instance_id': ltype})
    else:
        pass_columns = []
        if cutoff_time is None:
            if entityset.time_type == 'numeric':
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_col = target_dataframe.ww.index
            df = entityset._handle_time(
                dataframe_name=target_dataframe.ww.name,
                df=target_dataframe,
                time_last=cutoff_time,
                training_window=training_window,
                include_cutoff_time=include_cutoff_time)
            instance_ids = df[index_col]

        if isinstance(instance_ids, dd.Series):
            instance_ids = instance_ids.compute()
        elif is_instance(instance_ids, ks, 'Series'):
            instance_ids = instance_ids.to_pandas()

        # convert list or range object into series
        if not isinstance(instance_ids, pd.Series):
            instance_ids = pd.Series(instance_ids)

        # A single cutoff value is carried as a (time, instance_ids) tuple
        cutoff_time = (cutoff_time, instance_ids)

    _check_cutoff_time_type(cutoff_time, entityset.time_type)

    # Approximate provides no benefit with a single cutoff time, so ignore it
    if isinstance(cutoff_time, tuple) and approximate is not None:
        msg = "Using approximate with a single cutoff_time value or no cutoff_time " \
            "provides no computational efficiency benefit"
        warnings.warn(msg)
        cutoff_time = pd.DataFrame({
            "instance_id":
            cutoff_time[1],
            "time": [cutoff_time[0]] * len(cutoff_time[1])
        })
        target_dataframe = features[0].dataframe
        ltype = target_dataframe.ww.logical_types[target_dataframe.ww.index]
        cutoff_time.ww.init(logical_types={'instance_id': ltype})

    feature_set = FeatureSet(features)

    # Get features to approximate
    if approximate is not None:
        approximate_feature_trie = gather_approximate_features(feature_set)
        # Make a new FeatureSet that ignores approximated features
        feature_set = FeatureSet(
            features, approximate_feature_trie=approximate_feature_trie)
        # The set of approximated features does not depend on the feature
        # being inspected below, so build it once.
        all_approx_features = {
            f
            for _, feats in feature_set.approximate_feature_trie
            for f in feats
        }
    else:
        all_approx_features = set()

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationFeature):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        deps = feature.get_dependencies(deep=True, ignored=all_approx_features)
        if any(isinstance(dependency, AggregationFeature)
               for dependency in deps):
            # One unapproximated aggregation is enough; stop scanning.
            no_unapproximated_aggs = False
            break

    cutoff_df_time_col = 'time'
    target_time = '_original_time'

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time, approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time.ww[target_time] = cutoff_time[cutoff_df_time_col]

        cutoff_time_to_pass = binned_cutoff_time

    else:
        cutoff_time_to_pass = cutoff_time

    if isinstance(cutoff_time, pd.DataFrame):
        cutoff_time_len = cutoff_time.shape[0]
    else:
        cutoff_time_len = len(cutoff_time[1])

    chunk_size = _handle_chunk_size(chunk_size, cutoff_time_len)
    tqdm_options = {
        'total': (cutoff_time_len / FEATURE_CALCULATION_PERCENTAGE),
        'bar_format': PBAR_FORMAT,
        'disable': True
    }

    # Handle to os.devnull, opened only when progress must be tracked without
    # printing; closed in the ``finally`` below so it is never leaked.
    devnull = None
    if verbose:
        tqdm_options.update({'disable': False})
    elif progress_callback is not None:
        # allows us to utilize progress_bar updates without printing to anywhere
        devnull = open(os.devnull, 'w')
        tqdm_options.update({'file': devnull, 'disable': False})

    try:
        with make_tqdm_iterator(**tqdm_options) as progress_bar:
            if n_jobs != 1 or dask_kwargs is not None:
                feature_matrix = parallel_calculate_chunks(
                    cutoff_time=cutoff_time_to_pass,
                    chunk_size=chunk_size,
                    feature_set=feature_set,
                    approximate=approximate,
                    training_window=training_window,
                    save_progress=save_progress,
                    entityset=entityset,
                    n_jobs=n_jobs,
                    no_unapproximated_aggs=no_unapproximated_aggs,
                    cutoff_df_time_col=cutoff_df_time_col,
                    target_time=target_time,
                    pass_columns=pass_columns,
                    progress_bar=progress_bar,
                    dask_kwargs=dask_kwargs or {},
                    progress_callback=progress_callback,
                    include_cutoff_time=include_cutoff_time)
            else:
                feature_matrix = calculate_chunk(
                    cutoff_time=cutoff_time_to_pass,
                    chunk_size=chunk_size,
                    feature_set=feature_set,
                    approximate=approximate,
                    training_window=training_window,
                    save_progress=save_progress,
                    entityset=entityset,
                    no_unapproximated_aggs=no_unapproximated_aggs,
                    cutoff_df_time_col=cutoff_df_time_col,
                    target_time=target_time,
                    pass_columns=pass_columns,
                    progress_bar=progress_bar,
                    progress_callback=progress_callback,
                    include_cutoff_time=include_cutoff_time)

            # ensure rows are sorted by input order
            if isinstance(feature_matrix, pd.DataFrame):
                if isinstance(cutoff_time, pd.DataFrame):
                    feature_matrix = feature_matrix.ww.reindex(
                        pd.MultiIndex.from_frame(
                            cutoff_time[["instance_id", "time"]],
                            names=feature_matrix.index.names))
                else:
                    # Maintain index dtype
                    index_dtype = feature_matrix.index.get_level_values(0).dtype
                    feature_matrix = feature_matrix.ww.reindex(
                        cutoff_time[1].astype(index_dtype), level=0)
                if not cutoff_time_in_index:
                    feature_matrix.ww.reset_index(level='time',
                                                  drop=True,
                                                  inplace=True)

            if save_progress and os.path.exists(os.path.join(
                    save_progress, 'temp')):
                shutil.rmtree(os.path.join(save_progress, 'temp'))

            # force to 100% since we saved last 5 percent
            previous_progress = progress_bar.n
            progress_bar.update(progress_bar.total - progress_bar.n)

            if progress_callback is not None:
                update, progress_percent, time_elapsed = update_progress_callback_parameters(
                    progress_bar, previous_progress)
                progress_callback(update, progress_percent, time_elapsed)

            progress_bar.refresh()
    finally:
        # Close only after the progress bar's context has exited, so the bar
        # never writes to a closed file.
        if devnull is not None:
            devnull.close()

    return feature_matrix