Example #1
def list_primitives():
    try:
        # Older pandas versions use -1 to disable column-width truncation
        with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', -1, 'display.width', 1000):
            print(featuretools.list_primitives())
    except ValueError:
        # Newer pandas versions reject -1 and expect None instead
        with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None, 'display.width', 1000):
            print(featuretools.list_primitives())
Example #2
def list_primitives():
    try:
        with pd.option_context(
                "display.max_rows",
                None,
                "display.max_columns",
                None,
                "display.max_colwidth",
                -1,
                "display.width",
                1000,
        ):
            print(featuretools.list_primitives())
    except ValueError:
        with pd.option_context(
                "display.max_rows",
                None,
                "display.max_columns",
                None,
                "display.max_colwidth",
                None,
                "display.width",
                1000,
        ):
            print(featuretools.list_primitives())
Example #3
    def list_feature_primitives(self):
        """Returns built-in primitive in Featuretools.

        Returns:
            A pandas dataframe that lists and describes each built-in primitives.
        """
        return ft.list_primitives()
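
A minimal usage sketch of the method above; the `pipeline` instance is hypothetical, while the 'name', 'type', and 'description' columns are confirmed by the other examples on this page:

primitives_df = pipeline.list_feature_primitives()
print(primitives_df[primitives_df['type'] == 'aggregation'][['name', 'description']].head())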
Example #4
def test_aggregation(pd_es, dask_es):
    primitives = ft.list_primitives()
    trans_primitives = []
    agg_list = primitives[primitives['type'] == 'aggregation']['name'].tolist()
    agg_primitives = [prim for prim in agg_list if prim not in UNSUPPORTED]

    assert pd_es == dask_es

    # Run DFS using each entity as a target and confirm results match
    for entity in pd_es.entities:
        fm, _ = ft.dfs(entityset=pd_es,
                       target_entity=entity.id,
                       trans_primitives=trans_primitives,
                       agg_primitives=agg_primitives,
                       cutoff_time=pd.Timestamp("2019-01-05 04:00"),
                       max_depth=2)

        dask_fm, _ = ft.dfs(entityset=dask_es,
                            target_entity=entity.id,
                            trans_primitives=trans_primitives,
                            agg_primitives=agg_primitives,
                            cutoff_time=pd.Timestamp("2019-01-05 04:00"),
                            max_depth=2)
        # Use the same columns and make sure both indexes are sorted the same
        dask_computed_fm = dask_fm.compute().set_index(
            entity.index).loc[fm.index][fm.columns]
        pd.testing.assert_frame_equal(fm, dask_computed_fm, check_dtype=False)
Example #5
    def add_agg_primitives(self, agg):
        '''Appends items from agg to the aggregation primitives to be used
        in DFS. Aggregation primitives must be available in the Featuretools
        library.

        agg: list of string values
        '''
        aggs_list = ft.list_primitives().loc[ft.list_primitives()['type'] ==
                                             'aggregation']
        for i in agg:
            if i in aggs_list['name'].values:
                self.agg_primitives.append(i)
            else:
                print(i, "is not in the available aggregation primitives.")
        print("The aggregation primitives have been added: ",
              *self.agg_primitives)
Example #6
    def add_trans_primitives(self, trans):
        '''Appends items from trans to the transform primitives to be used
        in DFS. Transform primitives must be available in the Featuretools
        library.

        trans: list of string values
        '''
        tran_list = ft.list_primitives().loc[ft.list_primitives()['type'] ==
                                             'transform']
        for i in trans:
            if i in tran_list['name'].values:
                self.trans_primitives.append(i)
            else:
                print(i, "is not in the available transform primitives.")
        print("The transform primitives have been added: ",
              *self.trans_primitives)
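
A hedged usage sketch of the two methods above; the `builder` object and its pre-existing primitive lists are hypothetical, while 'mean', 'max', 'month', and 'cum_sum' are real featuretools primitive names:

builder.add_agg_primitives(['mean', 'max', 'not_a_primitive'])
# prints: not_a_primitive is not in the available aggregation primitives.
builder.add_trans_primitives(['month', 'cum_sum'])
print(builder.agg_primitives, builder.trans_primitives)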
Example #7
def _get_primitive_hyperparams():
    """Get one boolean hyperparam object for each available featuretools primitive.

    The hyperparameter will be named {primitive_type}_{primitive_name},
    will have the primitive description, and will default to True or False
    depending on whether the primitive name is in the DEFAULT_PRIMITIVES list.

    An example of such a primitive is::

        aggregation_max = hyperparams.Hyperparameter[bool](
            description='Finds the maximum non-null value of a numeric feature.',
            default=True,
            semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
        )

    Returns:
        dict containing hyperparameter names as keys and hyperparameter objects as values.
    """
    primitive_hyperparams = dict()
    primitives = ft.list_primitives()
    for _, primitive in primitives.iterrows():
        primitive_name = primitive['name']
        if primitive_name in ALL_PRIMITIVES:
            hyperparam_name = '{}_{}'.format(primitive['type'], primitive_name)
            hyperparam = hyperparams.Hyperparameter[bool](
                default=primitive_name in DEFAULT_PRIMITIVES,
                description=primitive['description'],
                semantic_types=[
                    'https://metadata.datadrivendiscovery.org/types/TuningParameter'
                ])
            primitive_hyperparams[hyperparam_name] = hyperparam

    return dict(sorted(primitive_hyperparams.items()))
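
A hedged sketch of consuming the returned mapping; it relies only on the '{type}_{name}' key convention described in the docstring above:

for hyperparam_name in _get_primitive_hyperparams():
    print(hyperparam_name)  # e.g. 'aggregation_max'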
Example #8
def test_aggregation(pd_es, dask_es):
    primitives = ft.list_primitives()
    trans_primitives = []
    agg_list = primitives[primitives['type'] == 'aggregation']['name'].tolist()
    agg_primitives = [prim for prim in agg_list if prim not in UNSUPPORTED]

    assert pd_es == dask_es

    # Run DFS using each dataframe as a target and confirm results match
    for df in pd_es.dataframes:
        fm, _ = ft.dfs(entityset=pd_es,
                       target_dataframe_name=df.ww.name,
                       trans_primitives=trans_primitives,
                       agg_primitives=agg_primitives,
                       cutoff_time=pd.Timestamp("2019-01-05 04:00"),
                       max_depth=2)

        dask_fm, _ = ft.dfs(entityset=dask_es,
                            target_dataframe_name=df.ww.name,
                            trans_primitives=trans_primitives,
                            agg_primitives=agg_primitives,
                            cutoff_time=pd.Timestamp("2019-01-05 04:00"),
                            max_depth=2)

        # Categorical categories can be ordered differently, this makes sure they are the same
        dask_fm = dask_fm.astype(fm.dtypes)

        # Use the same columns and make sure both indexes are sorted the same
        dask_computed_fm = dask_fm.compute().set_index(
            df.ww.index).loc[fm.index][fm.columns]
        pd.testing.assert_frame_equal(fm, dask_computed_fm)
Example #9
File: T.py Project: lokcyi/AI
def listprimitives():
    # List the primitives in a dataframe
    primitives = ft.list_primitives()
    # pd.options.display.max_colwidth = 100

    # Note: the original had a second, unreachable return; the aggregation
    # subset can be obtained the same way with primitives['type'] == 'aggregation'
    return primitives[primitives['type'] == 'transform'].head(78)
Example #10
    def es_set(self):

        print("Generating Features...\n")

        # List the primitives in a dataframe
        primitives = ft.list_primitives()
        pd.options.display.max_colwidth = 100
        print("feature primitives:",
              primitives[primitives['type'] == 'aggregation'].head(10))

        self.__feature_matrix, self.__feature_defs = ft.dfs(
            entityset=self.__es, target_entity="app", verbose=True)

        return self.__feature_matrix
Example #11
def test_transform(pd_es, dask_es):
    pytest.skip(
        "TODO: Dask issue with `series.eq`. Fix once Dask Issue #7957 is closed."
    )
    primitives = ft.list_primitives()
    trans_list = primitives[primitives["type"] == "transform"]["name"].tolist()
    trans_primitives = [prim for prim in trans_list if prim not in UNSUPPORTED]
    agg_primitives = []
    cutoff_time = pd.Timestamp("2019-01-05 04:00")

    assert pd_es == dask_es

    # Run DFS using each dataframe as a target and confirm results match
    for df in pd_es.dataframes:
        features = ft.dfs(
            entityset=pd_es,
            target_dataframe_name=df.ww.name,
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=2,
            features_only=True,
        )

        dask_features = ft.dfs(
            entityset=dask_es,
            target_dataframe_name=df.ww.name,
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=2,
            features_only=True,
        )
        assert features == dask_features

        # Calculate feature matrix values to confirm output is the same between dask and pandas.
        # Not testing on all returned features due to long run times.
        fm = ft.calculate_feature_matrix(features=features[:100],
                                         entityset=pd_es,
                                         cutoff_time=cutoff_time)
        dask_fm = ft.calculate_feature_matrix(features=dask_features[:100],
                                              entityset=dask_es,
                                              cutoff_time=cutoff_time)

        # Categorical categories can be ordered differently, this makes sure they are the same
        dask_fm = dask_fm.astype(fm.dtypes)

        # Use the same columns and make sure both indexes are sorted the same
        dask_computed_fm = (dask_fm.compute().set_index(
            df.ww.index).loc[fm.index][fm.columns])
        pd.testing.assert_frame_equal(fm, dask_computed_fm)
Example #12
def test_list_primitives_order():
    df = list_primitives()
    all_primitives = get_transform_primitives()
    all_primitives.update(get_aggregation_primitives())

    for name, primitive in all_primitives.items():
        assert name in df['name'].values
        row = df.loc[df['name'] == name].iloc[0]
        actual_desc = _get_descriptions([primitive])[0]
        if actual_desc:
            assert actual_desc == row['description']

    types = df['type'].values
    assert 'aggregation' in types
    assert 'transform' in types
Example #13
    def add_where_primitives(self, where):
        '''Appends items from where to the where primitives to be used
        in DFS. Where primitives are applied to specified interesting_values
        to build conditional features and can be aggregation or transform
        primitives.

        where: list of string values
        '''
        all_prims = ft.list_primitives()['name'].values
        for i in where:
            if i in all_prims:
                self.where_primitives.append(i)
            else:
                print(i, "is not in the available primitives")
        print("The where primitives have been added: ", *self.where_primitives)
Example #14
def test_list_primitives_order():
    df = list_primitives()
    all_primitives = get_transform_primitives()
    all_primitives.update(get_aggregation_primitives())

    for name, primitive in all_primitives.items():
        assert name in df["name"].values
        row = df.loc[df["name"] == name].iloc[0]
        actual_desc = _get_descriptions([primitive])[0]
        if actual_desc:
            assert actual_desc == row["description"]
        assert row["dask_compatible"] == (Library.DASK in primitive.compatibility)
        assert row["valid_inputs"] == ", ".join(
            _get_unique_input_types(primitive.input_types)
        )
        assert row["return_type"] == getattr(primitive.return_type, "__name__", None)

    types = df["type"].values
    assert "aggregation" in types
    assert "transform" in types
Example #15
def test_list_primitives_order():
    df = list_primitives()
    all_primitives = get_transform_primitives()
    all_primitives.update(get_aggregation_primitives())

    for name, primitive in all_primitives.items():
        assert name in df['name'].values
        row = df.loc[df['name'] == name].iloc[0]
        actual_desc = _get_descriptions([primitive])[0]
        if actual_desc:
            assert actual_desc == row['description']
        assert row['dask_compatible'] == (Library.DASK
                                          in primitive.compatibility)
        assert row['valid_inputs'] == ', '.join(
            _get_names_valid_inputs(primitive.input_types))
        assert row['return_type'] == getattr(primitive.return_type, '__name__',
                                             None)

    types = df['type'].values
    assert 'aggregation' in types
    assert 'transform' in types
Example #16
def test_transform(pd_es, dask_es):
    primitives = ft.list_primitives()
    trans_list = primitives[primitives['type'] == 'transform']['name'].tolist()
    trans_primitives = [prim for prim in trans_list if prim not in UNSUPPORTED]
    agg_primitives = []
    cutoff_time = pd.Timestamp("2019-01-05 04:00")

    assert pd_es == dask_es

    # Run DFS using each entity as a target and confirm results match
    for entity in pd_es.entities:
        features = ft.dfs(entityset=pd_es,
                          target_entity=entity.id,
                          trans_primitives=trans_primitives,
                          agg_primitives=agg_primitives,
                          max_depth=2,
                          features_only=True)

        dask_features = ft.dfs(entityset=dask_es,
                               target_entity=entity.id,
                               trans_primitives=trans_primitives,
                               agg_primitives=agg_primitives,
                               max_depth=2,
                               features_only=True)
        assert features == dask_features

        # Calculate feature matrix values to confirm output is the same between dask and pandas.
        # Not testing on all returned features due to long run times.
        fm = ft.calculate_feature_matrix(features=features[:100],
                                         entityset=pd_es,
                                         cutoff_time=cutoff_time)
        dask_fm = ft.calculate_feature_matrix(features=dask_features[:100],
                                              entityset=dask_es,
                                              cutoff_time=cutoff_time)

        # Use the same columns and make sure both indexes are sorted the same
        dask_computed_fm = dask_fm.compute().set_index(
            entity.index).loc[fm.index][fm.columns]
        pd.testing.assert_frame_equal(fm, dask_computed_fm)
Example #17
try:
    from inspect import signature
except ImportError:
    # python 2
    from funcsigs import signature

import featuretools as ft
import pytest
from featuretools import (
    calculate_feature_matrix,
    dfs,
    list_primitives,
    load_features,
    save_features,
)
from featuretools.tests.testing_utils import make_ecommerce_entityset

ft.primitives._load_primitives()
PRIMITIVES = list_primitives()


class PrimitiveT:
    primitive = None

    @pytest.fixture(autouse=True, scope="session")
    def es(self):
        es = make_ecommerce_entityset()
        return es

    def test_name_and_desc(self):
        assert self.primitive.name is not None
        assert self.primitive.__doc__ is not None
        docstring = self.primitive.__doc__
        short_description = docstring.splitlines()[0]
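
A hedged sketch of how a concrete test class would plug into the PrimitiveT base above (Max is a real featuretools aggregation primitive; the subclass itself is illustrative):

from featuretools.primitives import Max

class TestMax(PrimitiveT):
    primitive = Max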
Example #18
import pandas as pd
import numpy as np
import featuretools as ft
from sklearn.datasets import load_iris  # load_iris is used below but was not imported



if __name__ == "__main__":
    dataset = load_iris()
    X = dataset.data
    y = dataset.target
    iris_feature_names = dataset.feature_names
    df = pd.DataFrame(X, columns=iris_feature_names)
    es = ft.EntitySet(id='single_dataframe')  # identify the entity set by its id
    # add a dataframe to the entity set and name it iris
    es.entity_from_dataframe(entity_id='iris',
                             dataframe=df,
                             index='index',
                             make_index=True)
    trans_primitives = ['add_numeric', 'subtract_numeric', 'multiply_numeric', 'divide_numeric']  # add, subtract, multiply and divide pairs of columns to generate new features
    feature_matrix, feature_names = ft.dfs(entityset=es,
                                           target_entity='iris',
                                           max_depth=1,  # max_depth=1: only combine the original features to generate new ones
                                           verbose=1,
                                           trans_primitives=trans_primitives
                                           )
    ft.list_primitives()  # view the available primitives
    # features_df = pd.DataFrame(feature_matrix, columns= feature_names)
    # print(features_df.head())
    print(feature_matrix)
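
As a side note, entity_from_dataframe was removed in featuretools 1.x; a hedged sketch of the equivalent call in the newer API:

es = ft.EntitySet(id='single_dataframe')
es = es.add_dataframe(dataframe_name='iris',
                      dataframe=df,
                      index='index',
                      make_index=True)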
 
Example #19
class FTTimeSeriesBuilder:
    """
    Scikit-learn-style feature builder based on featuretools.

    Args:

        num_features: The (maximum) number of features to build.

        memory: How far back in time to go before the
                feature builder starts "forgetting" data.

        column_id: The name of the column containing the ids.

        time_stamp: The name of the column containing the time stamps.

        target: The name of the target column.
    """

    all_primitives = ft.list_primitives()
    agg_primitives = all_primitives[all_primitives.type ==
                                    "aggregation"].name.tolist()
    trans_primitives = all_primitives[all_primitives.type ==
                                      "transform"].name.tolist()

    def __init__(
        self,
        num_features,
        horizon,
        memory,
        column_id,
        time_stamp,
        target,
        allow_lagged_targets=False,
    ):
        self.num_features = num_features
        self.horizon = horizon
        self.memory = memory
        self.column_id = column_id
        self.time_stamp = time_stamp
        self.target = target
        self.allow_lagged_targets = allow_lagged_targets

        self._runtime = None
        self.fitted = False
        self.max_depth = 2

        self.selected_features = []

    def _extract_features(self, data_frame):
        data_frame = data_frame.reset_index()
        del data_frame["index"]
        rolled = _roll_data_frame(data_frame, self.column_id, self.time_stamp,
                                  self.horizon, self.memory)

        data_frame["_featuretools_index"] = np.arange(data_frame.shape[0])

        entityset = _make_entity_set(data_frame, rolled, self.time_stamp)
        df_extracted, _ = ft.dfs(
            entityset=entityset,
            agg_primitives=self.agg_primitives,
            target_dataframe_name="population",
            max_depth=self.max_depth,
            ignore_columns={
                "peripheral": [
                    self.column_id,
                    "index",
                    "join_key",
                    "_featuretools_join_key",
                    "_featuretools_index",
                ]
            },
        )

        # Replace missing numeric values with 0 (use .loc to avoid chained assignment)
        for col in df_extracted:
            if is_numeric_dtype(df_extracted[col]):
                df_extracted.loc[df_extracted[col].isna(), col] = 0

        return df_extracted

    def _select_features(self, data_frame, target):
        colnames = np.asarray(data_frame.columns)
        print("Selecting the best out of " + str(len(colnames)) +
              " features...")
        colnames = np.asarray([
            col for col in colnames if is_numeric_dtype(data_frame[col])
            and np.var(np.asarray(data_frame[col])) > 0.0
        ])
        correlations = np.asarray(
            [np.abs(pearsonr(target, data_frame[col])[0]) for col in colnames])
        correlations[np.isnan(correlations) | np.isinf(correlations)] = 0.0

        self.selected_features = colnames[np.argsort(
            correlations)][::-1][:self.num_features]
        return data_frame[self.selected_features]

    def fit(self, data_frame):
        """
        Fits the DFS on the data frame and returns
        the features for the training set.
        """
        print("featuretools: Trying features...")
        begin = time.time()
        target = np.asarray(data_frame[self.target])
        df_for_extraction = (data_frame if self.allow_lagged_targets else
                             _remove_target_column(data_frame, self.target))
        df_extracted = self._extract_features(df_for_extraction)
        df_selected = self._select_features(df_extracted, target)
        df_selected = _add_original_columns(data_frame, df_selected)
        end = time.time()
        _print_time_taken(begin, end)
        self.fitted = True
        self._runtime = datetime.timedelta(seconds=end - begin)
        return df_selected

    @property
    def runtime(self):
        if self.fitted:
            return self._runtime

    def transform(self, data_frame):
        """
        Applies the fitted DFS features to the data frame
        and returns the selected features.
        """
        df_for_extraction = (data_frame if self.allow_lagged_targets else
                             _remove_target_column(data_frame, self.target))
        df_extracted = self._extract_features(df_for_extraction)
        df_selected = df_extracted[self.selected_features]
        df_selected = _add_original_columns(data_frame, df_selected)
        return df_selected
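
A hedged usage sketch of the builder above; the data frame and column names are hypothetical, pandas Timedeltas for horizon and memory are an assumption, and the helpers it depends on (_roll_data_frame, _make_entity_set, etc.) must come from the same module:

builder = FTTimeSeriesBuilder(
    num_features=20,
    horizon=pd.Timedelta(hours=1),
    memory=pd.Timedelta(days=7),
    column_id='store_id',
    time_stamp='timestamp',
    target='sales',
)
train_features = builder.fit(train_df)
test_features = builder.transform(test_df)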
Example #20
    def test_not_duplicate_of_default(self):
        class_name = self.primitive.__name__
        df = list_primitives()
        primitive_names = df['name'].apply(convert).tolist()
        assert class_name not in primitive_names
Example #21
    agg_primitives=['count', 'mean'],  # specified, otherwise the default primitives will be used
    max_depth=1)
print(feature_matrix.columns.tolist())
print(feature_matrix.head())
print(feature_defs)

print('-----------encode category feature-----------')
feature_matrix_enc, feature_enc = ft.encode_features(feature_matrix,
                                                     feature_defs)
print(feature_matrix_enc.columns.tolist())
print(feature_matrix_enc.head())
print(feature_enc)

print('-----------list primitives---------------------')
print(ft.list_primitives().head())

print('----------custom primitives----------------------')
from featuretools.primitives import make_agg_primitive, make_trans_primitive
from featuretools.variable_types import Text, Numeric


def absolute(column):
    return abs(column)


Absolute = make_trans_primitive(function=absolute,
                                input_types=[Numeric],
                                return_type=Numeric)
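
A hedged sketch of feeding the custom Absolute primitive back into DFS; passing primitive classes in trans_primitives is supported, but the entity set and target entity names here are assumptions:

feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_entity='data',
                                      trans_primitives=[Absolute],
                                      max_depth=1)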

Example #22
def list_primitives():
    # Note: newer pandas versions expect None instead of -1 for max_colwidth (see Example #1)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', -1, 'display.width', 1000):
        print(featuretools.list_primitives())
Example #23
# Now we have to tell ft how these entities are related
# Uses the parent-child metaphor. Create the relationships, then add them to the entity set
r_c_l = ft.Relationship(es['clients']['client_id'],
                        es['loans']['client_id'])
r_l_p = ft.Relationship(es['loans']['loan_id'],
                        es['payments']['loan_id'])
es = es.add_relationship(r_c_l)
es = es.add_relationship(r_l_p)

# Another look at the whole lot
es

# OK, now let's make some features. First, some primitives
# Either aggregations or transformations
primitives = ft.list_primitives()
pd.options.display.max_colwidth = 100
primitives[primitives['type'] == 'aggregation'].head(20)

# And
primitives[primitives['type'] == 'transform'].head(20)


# OK, let's do it! Make some features for the clients
features, feature_names = ft.dfs(entityset=es,
                                 target_entity='clients',
                                 agg_primitives=['median', 'mean', 'std', 'max', 'percent_true', 'last', 'time_since_last'],
                                 trans_primitives=['years', 'month', 'divide'])

# Wow! After setup (which could be done in a library call)
# we have 408 features with 4 lines of code (140 new ones)
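
As an aside, the indexing syntax used for ft.Relationship above was removed in featuretools 1.x; a hedged sketch of the equivalent calls in the newer API:

es = es.add_relationship('clients', 'client_id', 'loans', 'client_id')
es = es.add_relationship('loans', 'loan_id', 'payments', 'loan_id')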
Example #24
# Course Code: DLBDSMLUSL01

# Automated feature generation

#%% import libraries
import pandas as pd
import featuretools as ft

#%% Remove any limit on the number of columns to display
pd.options.display.max_columns = None

#%% Remove any limit on the number of rows to display
pd.options.display.max_rows = None

#%% Display the list of primitives
print(ft.list_primitives())

#%% create sample data
Customers = pd.DataFrame({
    'C_ID': ['C1', 'C2'],
    'Name': ['Martin', 'Julia'],
    'Creation_date': ['2018-08-15', '2020-05-05']},
    columns=['C_ID', 'Name', 'Creation_date'])
Orders = pd.DataFrame({
    'Ord_ID': ['1', '2', '3', '4', '5'],
    'C_ID': ['C1', 'C2', 'C1', 'C1', 'C2']},
    columns=['Ord_ID', 'C_ID'])
Payments = pd.DataFrame({
    'Ord_ID': ['1', '5', '3', '4', '2'],
    'Price': [500, 200, 300, 100, 900]},
    columns=['Ord_ID', 'Price'])
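
The snippet cuts off here; as a hedged continuation sketch, the sample frames could be wired into an EntitySet with the same 0.x API used earlier in this example (the index and relationship choices are assumptions):

es = ft.EntitySet(id='shop')
es.entity_from_dataframe(entity_id='customers', dataframe=Customers, index='C_ID')
es.entity_from_dataframe(entity_id='orders', dataframe=Orders, index='Ord_ID')
es.entity_from_dataframe(entity_id='payments', dataframe=Payments, index='Pay_ID', make_index=True)
es = es.add_relationship(ft.Relationship(es['customers']['C_ID'], es['orders']['C_ID']))
es = es.add_relationship(ft.Relationship(es['orders']['Ord_ID'], es['payments']['Ord_ID']))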