Example #1
    def transform(self, cutoff_time_ids):
        """Wrapper for calculate_feature_matrix

            Calculates a matrix for a given set of instance ids and calculation
            times.

            Args:
                cutoff_time_ids (list | DataFrame): Instances on which to
                    calculate features.

            See Also:
                :func:`computational_backends.calculate_feature_matrix`
        """
        if isinstance(cutoff_time_ids, (list, np.ndarray, pd.Series)):
            X_transformed = calculate_feature_matrix(
                features=self.feature_defs,
                entityset=self.entityset,
                instance_ids=cutoff_time_ids,
                entities=self.entities,
                relationships=self.relationships,
                verbose=self.verbose)
            X_transformed = X_transformed.loc[cutoff_time_ids]
        elif isinstance(cutoff_time_ids, pd.DataFrame):
            ct = cutoff_time_ids
            X_transformed = calculate_feature_matrix(
                features=self.feature_defs,
                entityset=self.entityset,
                cutoff_time=cutoff_time_ids,
                entities=self.entities,
                relationships=self.relationships,
                verbose=self.verbose)
            X_transformed = X_transformed.loc[ct[ct.columns[0]]]
        else:
            raise TypeError('cutoff_time_ids must be a list, np.ndarray, '
                            'pd.Series, or pd.DataFrame')
        return X_transformed
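A minimal usage sketch of the two call paths above, with hedged assumptions: the wrapper object (here named dfs_transformer) is hypothetical, and its feature_defs, entityset, entities, and relationships attributes are assumed to have been populated earlier, for example by a fit step.

    import pandas as pd

    # A plain list of instance ids: features are calculated for exactly
    # those instances, taking the list branch above.
    fm = dfs_transformer.transform([1, 2, 3])

    # A cutoff-time DataFrame: the first column holds the instance ids and
    # a time column gives the cutoff time per instance, taking the
    # DataFrame branch above.
    cutoffs = pd.DataFrame({
        "instance_id": [1, 2, 3],
        "time": pd.to_datetime(["2019-01-01", "2019-01-02", "2019-01-03"]),
    })
    fm = dfs_transformer.transform(cutoffs)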
Example #2
    def transform(self, cutoff_time_ids):
        """Wrapper for calculate_feature_matrix

            Calculates a matrix for a given set of instance ids and calculation
            times.

            Args:
                cutoff_time_ids (list | DataFrame): Instances on which to
                    calculate features.

            See Also:
                :func:`computational_backends.calculate_feature_matrix`
        """
        if isinstance(cutoff_time_ids, (list, np.ndarray, pd.Series)):
            X_transformed = calculate_feature_matrix(
                features=self.feature_defs,
                entityset=self.entityset,
                instance_ids=cutoff_time_ids,
                entities=self.entities,
                relationships=self.relationships,
                verbose=self.verbose,
                profile=self.profile)
            X_transformed = X_transformed.loc[cutoff_time_ids]
        elif isinstance(cutoff_time_ids, pd.DataFrame):
            ct = cutoff_time_ids
            X_transformed = calculate_feature_matrix(
                features=self.feature_defs,
                entityset=self.entityset,
                cutoff_time=cutoff_time_ids,
                entities=self.entities,
                relationships=self.relationships,
                verbose=self.verbose,
                profile=self.profile)
            X_transformed = X_transformed.loc[ct[ct.columns[0]]]
        else:
            raise TypeError('cutoff_time_ids must be a list, np.ndarray, '
                            'pd.Series, or pd.DataFrame')
        return X_transformed
Example #3
    def transform(self, X):
        """Wrapper for calculate_feature_matrix

            Calculates a feature matrix for the given input data and calculation times.

            Args:
                X (ft.EntitySet or tuple): EntitySet to calculate features on. If a tuple is
                    passed it can take one of these forms: (entityset, cutoff_time_dataframe),
                    (dataframes, relationships), or ((dataframes, relationships), cutoff_time_dataframe)

            See Also:
                :func:`computational_backends.calculate_feature_matrix`
        """
        es, dataframes, relationships, cutoff_time = parse_x_input(X)

        X_transformed = calculate_feature_matrix(features=self.feature_defs,
                                                 instance_ids=None,
                                                 cutoff_time=cutoff_time,
                                                 entityset=es,
                                                 dataframes=dataframes,
                                                 relationships=relationships,
                                                 verbose=self.verbose)

        return X_transformed
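For reference, a sketch of the four input shapes the docstring says parse_x_input accepts. The variable names (pipeline_step, es, dataframes_dict, relationships_list, cutoff_df) are illustrative assumptions, not names from the source.

    # 1) A bare EntitySet:
    fm = pipeline_step.transform(es)

    # 2) (entityset, cutoff_time_dataframe):
    fm = pipeline_step.transform((es, cutoff_df))

    # 3) (dataframes, relationships):
    fm = pipeline_step.transform((dataframes_dict, relationships_list))

    # 4) ((dataframes, relationships), cutoff_time_dataframe):
    fm = pipeline_step.transform(((dataframes_dict, relationships_list), cutoff_df))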
Example #4
def dfs(entities=None,
        relationships=None,
        entityset=None,
        target_entity=None,
        cutoff_time=None,
        instance_ids=None,
        agg_primitives=None,
        trans_primitives=None,
        groupby_trans_primitives=None,
        allowed_paths=None,
        max_depth=2,
        ignore_entities=None,
        ignore_variables=None,
        seed_features=None,
        drop_contains=None,
        drop_exact=None,
        where_primitives=None,
        max_features=-1,
        cutoff_time_in_index=False,
        save_progress=None,
        features_only=False,
        training_window=None,
        approximate=None,
        chunk_size=None,
        n_jobs=1,
        dask_kwargs=None,
        verbose=False,
        return_variable_types=None,
        progress_callback=None):
    '''Calculates a feature matrix and features given a dictionary of entities
    and a list of relationships.


    Args:
        entities (dict[str -> tuple(pd.DataFrame, str, str)]): Dictionary of
            entities. Entries take the format
            {entity id -> (dataframe, id column, (time_column))}.

        relationships (list[(str, str, str, str)]): List of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).

        entityset (EntitySet): An already initialized entityset. Required if
            entities and relationships are not defined.

        target_entity (str): Entity id of entity on which to make predictions.

        cutoff_time (pd.DataFrame or Datetime): Specifies times at which to
            calculate each instance. The resulting feature matrix will use data
            up to and including the cutoff_time. Can either be a DataFrame with
            'instance_id' and 'time' columns, a DataFrame with the name of the
            index variable in the target entity and a time column, a
            list of values, or a single
            value to calculate for all instances. If the dataframe has more than
            two columns, any additional columns will be added to the resulting
            feature matrix.

        instance_ids (list): List of instances on which to calculate features. Only
            used if cutoff_time is a single datetime.

        agg_primitives (list[str or AggregationPrimitive], optional): List of Aggregation
            Feature types to apply.

                Default: ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]

        trans_primitives (list[str or TransformPrimitive], optional):
            List of Transform Feature functions to apply.

                Default: ["day", "year", "month", "weekday", "haversine", "num_words", "num_characters"]

        groupby_trans_primitives (list[str or :class:`.primitives.TransformPrimitive`], optional):
            list of Transform primitives to make GroupByTransformFeatures with

        allowed_paths (list[list[str]]): Allowed entity paths on which to make
            features.

        max_depth (int) : Maximum allowed depth of features.

        ignore_entities (list[str], optional): List of entities to
            blacklist when creating features.

        ignore_variables (dict[str -> list[str]], optional): List of specific
            variables within each entity to blacklist when creating features.

        seed_features (list[:class:`.FeatureBase`]): List of manually defined
            features to use.

        drop_contains (list[str], optional): Drop features
            whose names contain these strings.

        drop_exact (list[str], optional): Drop features whose
            names exactly match these strings.

        where_primitives (list[str or PrimitiveBase], optional):
            List of Primitives names (or types) to apply with where clauses.

                Default:

                    ["count"]

        max_features (int, optional) : Cap the number of generated features to
                this number. If -1, no limit.

        features_only (bool, optional): If True, returns the list of
            features without calculating the feature matrix.

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (Timedelta or str, optional):
            Window defining how much time before the cutoff time data
            can be used when calculating features. If ``None`` , all data
            before cutoff time is used. Defaults to ``None``. Month and year
            units are not relative when Pandas Timedeltas are used. Relative
            units should be passed as a Featuretools Timedelta or a string.

        approximate (Timedelta): Bucket size to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        save_progress (str, optional): Path to save intermediate computational results.

        n_jobs (int, optional): Number of parallel processes to use when
            calculating the feature matrix.

        chunk_size (int or float or None or "cutoff time", optional): Number
            of rows of the output feature matrix to calculate at a time. If passed an
            integer greater than 0, will try to use that many rows per chunk.
            If passed a float value between 0 and 1 sets the chunk size to that
            percentage of all instances. If passed the string "cutoff time",
            rows are split per cutoff time.

        dask_kwargs (dict, optional): Dictionary of keyword arguments to be
            passed when creating the dask client and scheduler. Even if n_jobs
            is not set, using `dask_kwargs` will enable multiprocessing.
            Main parameters:

            cluster (str or dask.distributed.LocalCluster):
                cluster or address of cluster to send tasks to. If unspecified,
                a cluster will be created.
            diagnostics_port (int):
                port number to use for web dashboard. If left unspecified, web
                interface will not be enabled.

            Valid keyword arguments for LocalCluster will also be accepted.

        return_variable_types (list[Variable] or str, optional): Types of
                variables to return. If None, default to
                Numeric, Discrete, and Boolean. If given as
                the string 'all', use all available variable types.

        progress_callback (callable): Function to be called with incremental progress updates.
            Has the following parameters:

                update: percentage change (float between 0 and 100) in progress since last call
                progress_percent: percentage (float between 0 and 100) of total computation completed
                time_elapsed: total time in seconds that has elapsed since start of call

    Examples:
        .. code-block:: python

            from featuretools.primitives import Mean
            # cutoff times per instance
            entities = {
                "sessions" : (session_df, "id"),
                "transactions" : (transactions_df, "id", "transaction_time")
            }
            relationships = [("sessions", "id", "transactions", "session_id")]
            feature_matrix, features = dfs(entities=entities,
                                           relationships=relationships,
                                           target_entity="transactions",
                                           cutoff_time=cutoff_times)
            feature_matrix

            features = dfs(entities=entities,
                           relationships=relationships,
                           target_entity="transactions",
                           features_only=True)
    '''
    if not isinstance(entityset, EntitySet):
        entityset = EntitySet("dfs", entities, relationships)

    dfs_object = DeepFeatureSynthesis(
        target_entity,
        entityset,
        agg_primitives=agg_primitives,
        trans_primitives=trans_primitives,
        groupby_trans_primitives=groupby_trans_primitives,
        max_depth=max_depth,
        where_primitives=where_primitives,
        allowed_paths=allowed_paths,
        drop_exact=drop_exact,
        drop_contains=drop_contains,
        ignore_entities=ignore_entities,
        ignore_variables=ignore_variables,
        max_features=max_features,
        seed_features=seed_features)

    features = dfs_object.build_features(
        verbose=verbose, return_variable_types=return_variable_types)

    if features_only:
        return features

    if isinstance(cutoff_time, pd.DataFrame):
        feature_matrix = calculate_feature_matrix(
            features,
            entityset=entityset,
            cutoff_time=cutoff_time,
            training_window=training_window,
            approximate=approximate,
            cutoff_time_in_index=cutoff_time_in_index,
            save_progress=save_progress,
            chunk_size=chunk_size,
            n_jobs=n_jobs,
            dask_kwargs=dask_kwargs,
            verbose=verbose,
            progress_callback=progress_callback)
    else:
        feature_matrix = calculate_feature_matrix(
            features,
            entityset=entityset,
            cutoff_time=cutoff_time,
            instance_ids=instance_ids,
            training_window=training_window,
            approximate=approximate,
            cutoff_time_in_index=cutoff_time_in_index,
            save_progress=save_progress,
            chunk_size=chunk_size,
            n_jobs=n_jobs,
            dask_kwargs=dask_kwargs,
            verbose=verbose,
            progress_callback=progress_callback)
    return feature_matrix, features
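Since the cutoff_time parameter drives the branch at the bottom of this example, here is a short sketch of the two shapes it documents. The column values are made up for illustration.

    import pandas as pd

    # DataFrame form: one row per instance; features use data up to and
    # including each row's time. Any extra column (here "label") is passed
    # through into the resulting feature matrix.
    cutoff_times = pd.DataFrame({
        "instance_id": [1, 2, 3],
        "time": pd.to_datetime(["2014-01-01", "2014-01-02", "2014-01-03"]),
        "label": [True, False, True],
    })

    # Scalar form: a single datetime applied to all instances, optionally
    # narrowed with the instance_ids argument.
    single_cutoff = pd.Timestamp("2014-01-01")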
Example #5
def dfs(entities=None,
        relationships=None,
        entityset=None,
        target_entity=None,
        cutoff_time=None,
        instance_ids=None,
        agg_primitives=None,
        trans_primitives=None,
        allowed_paths=None,
        max_depth=None,
        ignore_entities=None,
        ignore_variables=None,
        seed_features=None,
        drop_contains=None,
        drop_exact=None,
        where_primitives=None,
        max_features=None,
        cutoff_time_in_index=False,
        save_progress=None,
        features_only=False,
        training_window=None,
        approximate=None,
        chunk_size=None,
        verbose=False):
    '''Calculates a feature matrix and features given a dictionary of entities
    and a list of relationships.


    Args:
        entities (dict[str -> tuple(pd.DataFrame, str, str)]): Dictionary of
            entities. Entries take the format
            {entity id -> (dataframe, id column, (time_column))}.

        relationships (list[(str, str, str, str)]): List of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).

        entityset (EntitySet): An already initialized entityset. Required if
            entities and relationships are not defined.

        target_entity (str): Entity id of entity on which to make predictions.

        cutoff_time (pd.DataFrame or Datetime): Specifies times at which to
            calculate each instance. Can either be a DataFrame with
            'instance_id' and 'time' columns, a DataFrame with the name of the
            index variable in the target entity and a time column, a
            list of values, or a single
            value to calculate for all instances. If the dataframe has more than
            two columns, any additional columns will be added to the resulting
            feature matrix.

        instance_ids (list): List of instances on which to calculate features. Only
            used if cutoff_time is a single datetime.

        agg_primitives (list[str or AggregationPrimitive], optional): List of Aggregation
            Feature types to apply.

                Default: ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "n_unique", "mode"]

        trans_primitives (list[str or TransformPrimitive], optional):
            List of Transform Feature functions to apply.

                Default: ["day", "year", "month", "weekday", "haversine", "num_words", "num_characters"]

        allowed_paths (list[list[str]]): Allowed entity paths on which to make
            features.

        max_depth (int) : Maximum allowed depth of features.

        ignore_entities (list[str], optional): List of entities to
            blacklist when creating features.

        ignore_variables (dict[str -> list[str]], optional): List of specific
            variables within each entity to blacklist when creating features.

        seed_features (list[:class:`.PrimitiveBase`]): List of manually defined
            features to use.

        drop_contains (list[str], optional): Drop features
            whose names contain these strings.

        drop_exact (list[str], optional): Drop features whose
            names exactly match these strings.

        where_primitives (list[str or PrimitiveBase], optional):
            List of Primitives names (or types) to apply with where clauses.

                Default:

                    ["count"]

        max_features (int, optional) : Cap the number of generated features to
                this number. If -1, no limit.

        features_only (bool, optional): If True, returns the list of
            features without calculating the feature matrix.

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (dict[str -> Timedelta] or Timedelta, optional):
            Window or windows defining how much older than the cutoff time data
            can be and still be included when calculating the feature. To specify
            which entities to apply windows to, use a dictionary mapping an entity
            id to a Timedelta. If None, all older data is used.

        approximate (Timedelta): Bucket size to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        save_progress (str, optional): Path to save intermediate computational results.

        chunk_size (int or float or None or "cutoff time", optional): Number
            of rows of the output feature matrix to calculate at a time. If passed an
            integer greater than 0, will try to use that many rows per chunk.
            If passed a float value between 0 and 1 sets the chunk size to that
            percentage of all instances. If passed the string "cutoff time",
            rows are split per cutoff time.

    Examples:
        .. code-block:: python

            from featuretools.primitives import Mean
            # cutoff times per instance
            entities = {
                "sessions" : (session_df, "id"),
                "transactions" : (transactions_df, "id", "transaction_time")
            }
            relationships = [("sessions", "id", "transactions", "session_id")]
            feature_matrix, features = dfs(entities=entities,
                                           relationships=relationships,
                                           target_entity="transactions",
                                           cutoff_time=cutoff_times)
            feature_matrix

            features = dfs(entities=entities,
                           relationships=relationships,
                           target_entity="transactions",
                           features_only=True)
    '''
    if not isinstance(entityset, EntitySet):
        entityset = EntitySet("dfs", entities, relationships)

    dfs_object = DeepFeatureSynthesis(target_entity,
                                      entityset,
                                      agg_primitives=agg_primitives,
                                      trans_primitives=trans_primitives,
                                      max_depth=max_depth,
                                      where_primitives=where_primitives,
                                      allowed_paths=allowed_paths,
                                      drop_exact=drop_exact,
                                      drop_contains=drop_contains,
                                      ignore_entities=ignore_entities,
                                      ignore_variables=ignore_variables,
                                      max_features=max_features,
                                      seed_features=seed_features)

    features = dfs_object.build_features(verbose=verbose)

    if features_only:
        return features

    if isinstance(cutoff_time, pd.DataFrame):
        feature_matrix = calculate_feature_matrix(
            features,
            cutoff_time=cutoff_time,
            training_window=training_window,
            approximate=approximate,
            cutoff_time_in_index=cutoff_time_in_index,
            save_progress=save_progress,
            chunk_size=chunk_size,
            verbose=verbose)
    else:
        feature_matrix = calculate_feature_matrix(
            features,
            cutoff_time=cutoff_time,
            instance_ids=instance_ids,
            training_window=training_window,
            approximate=approximate,
            cutoff_time_in_index=cutoff_time_in_index,
            save_progress=save_progress,
            chunk_size=chunk_size,
            verbose=verbose)
    return feature_matrix, features
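The training_window parameter in this variant accepts either a single Timedelta or a per-entity dict. A hedged sketch follows; the entity names are illustrative and the ft.Timedelta constructor form is an assumption about this older API, so treat it as a shape reference rather than a verified call.

    import featuretools as ft

    # One window applied to all entities (constructor form assumed):
    feature_matrix, features = dfs(entityset=es,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times,
                                   training_window=ft.Timedelta(60, "days"))

    # Per-entity windows via the documented dict[str -> Timedelta] form:
    windows = {"transactions": ft.Timedelta(30, "days"),
               "sessions": ft.Timedelta(90, "days")}
    feature_matrix, features = dfs(entityset=es,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times,
                                   training_window=windows)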
Example #7
def dfs(dataframes=None,
        relationships=None,
        entityset=None,
        target_dataframe_name=None,
        cutoff_time=None,
        instance_ids=None,
        agg_primitives=None,
        trans_primitives=None,
        groupby_trans_primitives=None,
        allowed_paths=None,
        max_depth=2,
        ignore_dataframes=None,
        ignore_columns=None,
        primitive_options=None,
        seed_features=None,
        drop_contains=None,
        drop_exact=None,
        where_primitives=None,
        max_features=-1,
        cutoff_time_in_index=False,
        save_progress=None,
        features_only=False,
        training_window=None,
        approximate=None,
        chunk_size=None,
        n_jobs=1,
        dask_kwargs=None,
        verbose=False,
        return_types=None,
        progress_callback=None,
        include_cutoff_time=True):
    '''Calculates a feature matrix and features given a dictionary of dataframes
    and a list of relationships.


    Args:
        dataframes (dict[str -> tuple(DataFrame, str, str, dict[str -> str/Woodwork.LogicalType], dict[str->str/set], boolean)]):
            Dictionary of DataFrames. Entries take the format
            {dataframe name -> (dataframe, index column, time_index, logical_types, semantic_tags, make_index)}.
            Note that only the dataframe is required. If a Woodwork DataFrame is supplied, any other parameters
            will be ignored.

        relationships (list[(str, str, str, str)]): List of relationships
            between dataframes. List items are a tuple with the format
            (parent dataframe name, parent column, child dataframe name, child column).

        entityset (EntitySet): An already initialized entityset. Required if
            dataframes and relationships are not defined.

        target_dataframe_name (str): Name of dataframe on which to make predictions.

        cutoff_time (pd.DataFrame or Datetime): Specifies times at which to calculate
            the features for each instance. The resulting feature matrix will use data
            up to and including the cutoff_time. Can either be a DataFrame or a single
            value. If a DataFrame is passed the instance ids for which to calculate features
            must be in a column with the same name as the target dataframe index or a column
            named `instance_id`. The cutoff time values in the DataFrame must be in a column with
            the same name as the target dataframe time index or a column named `time`. If the
            DataFrame has more than two columns, any additional columns will be added to the
            resulting feature matrix. If a single value is passed, this value will be used for
            all instances.

        instance_ids (list): List of instances on which to calculate features. Only
            used if cutoff_time is a single datetime.

        agg_primitives (list[str or AggregationPrimitive], optional): List of Aggregation
            Feature types to apply.

                Default: ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]

        trans_primitives (list[str or TransformPrimitive], optional):
            List of Transform Feature functions to apply.

                Default: ["day", "year", "month", "weekday", "haversine", "num_words", "num_characters"]

        groupby_trans_primitives (list[str or TransformPrimitive], optional):
            list of Transform primitives to make GroupByTransformFeatures with

        allowed_paths (list[list[str]]): Allowed dataframe paths on which to make
            features.

        max_depth (int) : Maximum allowed depth of features.

        ignore_dataframes (list[str], optional): List of dataframes to
            blacklist when creating features.

        ignore_columns (dict[str -> list[str]], optional): List of specific
            columns within each dataframe to blacklist when creating features.

        primitive_options (list[dict[str or tuple[str] -> dict]] or dict[str or tuple[str] -> dict], optional):
            Specify options for a single primitive or a group of primitives.
            Lists of option dicts are used to specify options per input for primitives
            with multiple inputs. Each option ``dict`` can have the following keys:

            ``"include_dataframes"``
                List of dataframes to be included when creating features for
                the primitive(s). All other dataframes will be ignored
                (list[str]).
            ``"ignore_dataframes"``
                List of dataframes to be blacklisted when creating features
                for the primitive(s) (list[str]).
            ``"include_columns"``
                List of specific columns within each dataframe to include when
                creating features for the primitive(s). All other columns
                in a given dataframe will be ignored (dict[str -> list[str]]).
            ``"ignore_columns"``
                List of specific columns within each dataframe to blacklist
                when creating features for the primitive(s) (dict[str ->
                list[str]]).
            ``"include_groupby_dataframes"``
                List of dataframes to be included when finding groupbys. All
                other dataframes will be ignored (list[str]).
            ``"ignore_groupby_dataframes"``
                List of dataframes to blacklist when finding groupbys
                (list[str]).
            ``"include_groupby_columns"``
                List of specific columns within each dataframe to include as
                groupbys, if applicable. All other columns in each
                dataframe will be ignored (dict[str -> list[str]]).
            ``"ignore_groupby_columns"``
                List of specific columns within each dataframe to blacklist
                as groupbys (dict[str -> list[str]]).

        seed_features (list[:class:`.FeatureBase`]): List of manually defined
            features to use.

        drop_contains (list[str], optional): Drop features
            whose names contain these strings.

        drop_exact (list[str], optional): Drop features whose
            names exactly match these strings.

        where_primitives (list[str or PrimitiveBase], optional):
            List of Primitives names (or types) to apply with where clauses.

                Default:

                    ["count"]

        max_features (int, optional) : Cap the number of generated features to
                this number. If -1, no limit.

        features_only (bool, optional): If True, returns the list of
            features without calculating the feature matrix.

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (Timedelta or str, optional):
            Window defining how much time before the cutoff time data
            can be used when calculating features. If ``None`` , all data
            before cutoff time is used. Defaults to ``None``. Month and year
            units are not relative when Pandas Timedeltas are used. Relative
            units should be passed as a Featuretools Timedelta or a string.

        approximate (Timedelta): Bucket size to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        save_progress (str, optional): Path to save intermediate computational results.

        n_jobs (int, optional): Number of parallel processes to use when
            calculating the feature matrix.

        chunk_size (int or float or None or "cutoff time", optional): Number
            of rows of the output feature matrix to calculate at a time. If passed an
            integer greater than 0, will try to use that many rows per chunk.
            If passed a float value between 0 and 1 sets the chunk size to that
            percentage of all instances. If passed the string "cutoff time",
            rows are split per cutoff time.

        dask_kwargs (dict, optional): Dictionary of keyword arguments to be
            passed when creating the dask client and scheduler. Even if n_jobs
            is not set, using `dask_kwargs` will enable multiprocessing.
            Main parameters:

            cluster (str or dask.distributed.LocalCluster):
                cluster or address of cluster to send tasks to. If unspecified,
                a cluster will be created.
            diagnostics_port (int):
                port number to use for web dashboard. If left unspecified, web
                interface will not be enabled.

            Valid keyword arguments for LocalCluster will also be accepted.

        return_types (list[woodwork.ColumnSchema] or str, optional):
            List of ColumnSchemas defining the types of
            columns to return. If None, defaults to returning all
            numeric, categorical and boolean types. If given as
            the string 'all', returns all available types.

        progress_callback (callable): Function to be called with incremental progress updates.
            Has the following parameters:

                update: percentage change (float between 0 and 100) in progress since last call
                progress_percent: percentage (float between 0 and 100) of total computation completed
                time_elapsed: total time in seconds that has elapsed since start of call

        include_cutoff_time (bool): Include data at cutoff times in feature calculations. Defaults to ``True``.

    Returns:
        list[:class:`.FeatureBase`], pd.DataFrame:
            The list of generated feature definitions, and the feature matrix.
            If ``features_only`` is ``True``, the feature matrix will not be generated.

    Examples:
        .. code-block:: python

            from featuretools.primitives import Mean
            # cutoff times per instance
            dataframes = {
                "sessions" : (session_df, "id"),
                "transactions" : (transactions_df, "id", "transaction_time")
            }
            relationships = [("sessions", "id", "transactions", "session_id")]
            feature_matrix, features = dfs(dataframes=dataframes,
                                           relationships=relationships,
                                           target_dataframe_name="transactions",
                                           cutoff_time=cutoff_times)
            feature_matrix

            features = dfs(dataframes=dataframes,
                           relationships=relationships,
                           target_dataframe_name="transactions",
                           features_only=True)
    '''
    if not isinstance(entityset, EntitySet):
        entityset = EntitySet("dfs", dataframes, relationships)

    dfs_object = DeepFeatureSynthesis(
        target_dataframe_name,
        entityset,
        agg_primitives=agg_primitives,
        trans_primitives=trans_primitives,
        groupby_trans_primitives=groupby_trans_primitives,
        max_depth=max_depth,
        where_primitives=where_primitives,
        allowed_paths=allowed_paths,
        drop_exact=drop_exact,
        drop_contains=drop_contains,
        ignore_dataframes=ignore_dataframes,
        ignore_columns=ignore_columns,
        primitive_options=primitive_options,
        max_features=max_features,
        seed_features=seed_features)

    features = dfs_object.build_features(verbose=verbose,
                                         return_types=return_types)

    trans, agg, groupby, where = _categorize_features(features)

    trans_unused = get_unused_primitives(trans_primitives, trans)
    agg_unused = get_unused_primitives(agg_primitives, agg)
    groupby_unused = get_unused_primitives(groupby_trans_primitives, groupby)
    where_unused = get_unused_primitives(where_primitives, where)

    unused_primitives = [
        trans_unused, agg_unused, groupby_unused, where_unused
    ]
    if any(unused_primitives):
        warn_unused_primitives(unused_primitives)

    if features_only:
        return features

    feature_matrix = calculate_feature_matrix(
        features,
        entityset=entityset,
        cutoff_time=cutoff_time,
        instance_ids=instance_ids,
        training_window=training_window,
        approximate=approximate,
        cutoff_time_in_index=cutoff_time_in_index,
        save_progress=save_progress,
        chunk_size=chunk_size,
        n_jobs=n_jobs,
        dask_kwargs=dask_kwargs,
        verbose=verbose,
        progress_callback=progress_callback,
        include_cutoff_time=include_cutoff_time)
    return feature_matrix, features
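primitive_options gets the longest description in this variant, so a small sketch of its shape may help. The primitive, dataframe, and column names are illustrative; the option keys come straight from the docstring above.

    # Options keyed by primitive name (or a tuple of names for shared
    # options); each value is an option dict using the documented keys.
    options = {
        "mean": {"ignore_dataframes": ["sessions"]},
        "mode": {"include_columns": {"transactions": ["device"]}},
        ("count", "sum"): {"include_dataframes": ["transactions"]},
    }
    feature_matrix, features = dfs(entityset=es,
                                   target_dataframe_name="transactions",
                                   agg_primitives=["mean", "mode", "count", "sum"],
                                   primitive_options=options)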
Example #8
def dfs(entities=None,
        relationships=None,
        entityset=None,
        target_entity=None,
        cutoff_time=None,
        instance_ids=None,
        agg_primitives=None,
        trans_primitives=None,
        allowed_paths=None,
        max_depth=None,
        ignore_entities=None,
        ignore_variables=None,
        seed_features=None,
        drop_contains=None,
        drop_exact=None,
        where_primitives=None,
        max_features=None,
        cutoff_time_in_index=False,
        save_progress=None,
        features_only=False,
        training_window=None,
        approximate=None,
        verbose=False):
    '''Calculates a feature matrix and features given a dictionary of entities
    and a list of relationships.


    Args:
        entities (dict[str: tuple(pd.DataFrame, str, str)]): dictionary of
            entities. Entries take the format
            {entity id: (dataframe, id column, (time_column))}

        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable)

        entityset (:class:`.EntitySet`): An already initialized entityset. Required if
            entities and relationships are not defined

        target_entity (str): id of entity to predict on

        cutoff_time (pd.DataFrame or Datetime): specifies the times at which to
            calculate the features for each instance. Can either be a DataFrame with
            'instance_id' and 'time' columns, a DataFrame with the name of the
            index variable in the target entity and a time column, a list of values, or a single
            value to calculate for all instances.

        instance_ids (list): list of instances to calculate features on. Only
            used if cutoff_time is a single datetime.

        agg_primitives (list[:class:`AggregationPrimitive <.primitives.AggregationPrimitive>`], optional):
            list of Aggregation Feature types to apply.

                Default:[:class:`Sum <.primitives.Sum>`, \
                         :class:`Std <.primitives.Std>`, \
                         :class:`Max <.primitives.Max>`, \
                         :class:`Skew <.primitives.Skew>`, \
                         :class:`Min <.primitives.Min>`, \
                         :class:`Mean <.primitives.Mean>`, \
                         :class:`Count <.primitives.Count>`, \
                         :class:`PercentTrue <.primitives.PercentTrue>`, \
                         :class:`NUnique <.primitives.NUnique>`, \
                         :class:`Mode <.primitives.Mode>`]

        trans_primitives (list[:class:`TransformPrimitive <.primitives.TransformPrimitive>`], optional):
            list of Transform Feature functions to apply.

                Default:[:class:`Day <.primitives.Day>`, \
                         :class:`Year <.primitives.Year>`, \
                         :class:`Month <.primitives.Month>`, \
                         :class:`Weekday <.primitives.Weekday>`]

        allowed_paths (list[list[str]]): Allowed entity paths to make
            features for

        max_depth (int) : maximum allowed depth of features

        ignore_entities (list[str], optional): List of entities to
            blacklist when creating features

        ignore_variables (dict[str : list[str]], optional): List of specific
            variables within each entity to blacklist when creating features

        seed_features (list[:class:`.PrimitiveBase`]): List of manually defined
            features to use.

        drop_contains (list[str], optional): drop features
            whose names contain these strings

        drop_exact (list[str], optional): drop features
            whose names exactly match these strings

        where_primitives (list[:class:`.primitives.AggregationPrimitive`], optional):
            list of Aggregation Feature types to apply with where clauses.

        max_features (int, optional) : Cap the number of generated features to
                this number. If -1, no limit.

        features_only (boolean, optional): if True, returns the list of
            features without calculating the feature matrix.

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (dict[str -> :class:`Timedelta`] or :class:`Timedelta`, optional):
            Window or windows defining how much older than the cutoff time data
            can be and still be included when calculating the feature. To specify
            which entities to apply windows to, use a dictionary mapping entity
            id -> Timedelta. If None, all older data is used.

        approximate (Timedelta): bucket size to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        save_progress (str, optional): path to save intermediate computational results


    Examples:
        .. code-block:: python

            from featuretools.primitives import Mean
            # cutoff times per instance
            entities = {
                "sessions" : (session_df, "id"),
                "transactions" : (transactions_df, "id", "transaction_time")
            }
            relationships = [("sessions", "id", "transactions", "session_id")]
            feature_matrix, features = dfs(entities=entities,
                                           relationships=relationships,
                                           target_entity="transactions",
                                           cutoff_time=cutoff_times)
            feature_matrix

            features = dfs(entities=entities,
                           relationships=relationships,
                           target_entity="transactions",
                           features_only=True)
    '''
    if not isinstance(entityset, EntitySet):
        entityset = EntitySet("dfs", entities, relationships)

    dfs_object = DeepFeatureSynthesis(target_entity,
                                      entityset,
                                      agg_primitives=agg_primitives,
                                      trans_primitives=trans_primitives,
                                      max_depth=max_depth,
                                      where_primitives=where_primitives,
                                      allowed_paths=allowed_paths,
                                      drop_exact=drop_exact,
                                      drop_contains=drop_contains,
                                      ignore_entities=ignore_entities,
                                      ignore_variables=ignore_variables,
                                      max_features=max_features,
                                      seed_features=seed_features)

    features = dfs_object.build_features(verbose=verbose)

    if features_only:
        return features

    if isinstance(cutoff_time, pd.DataFrame):
        feature_matrix = calculate_feature_matrix(
            features,
            cutoff_time=cutoff_time,
            training_window=training_window,
            approximate=approximate,
            cutoff_time_in_index=cutoff_time_in_index,
            save_progress=save_progress,
            verbose=verbose)
    else:
        feature_matrix = calculate_feature_matrix(
            features,
            cutoff_time=cutoff_time,
            instance_ids=instance_ids,
            training_window=training_window,
            approximate=approximate,
            cutoff_time_in_index=cutoff_time_in_index,
            save_progress=save_progress,
            verbose=verbose)
    return feature_matrix, features
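This oldest variant documents primitives as classes rather than strings. Below is a hedged sketch of a call in that style, reusing entities, relationships, and cutoff_times from the docstring example above; the class imports follow the docstring's :class: references and may differ by featuretools version.

    from featuretools.primitives import Count, Day, Mean, Sum, Weekday

    # Class-based primitive lists, per the defaults documented above.
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times,
                                   agg_primitives=[Sum, Mean, Count],
                                   trans_primitives=[Day, Weekday],
                                   max_depth=2)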