def test_custom_primitive_multiple_inputs(es):
    def mean_sunday(numeric, datetime):
        '''
        Finds the mean of non-null values of a feature that occurred on Sundays
        '''
        days = pd.DatetimeIndex(datetime).weekday.values
        df = pd.DataFrame({'numeric': numeric, 'time': days})
        return df[df['time'] == 6]['numeric'].mean()

    MeanSunday = make_agg_primitive(function=mean_sunday,
                                    input_types=[Numeric, Datetime],
                                    return_type=Numeric)

    fm, features = ft.dfs(entityset=es,
                          target_entity="sessions",
                          agg_primitives=[MeanSunday],
                          trans_primitives=[])
    mean_sunday_value = pd.Series([None, None, None, 2.5, 7, None])
    iterator = zip(fm["MEAN_SUNDAY(log.value, datetime)"], mean_sunday_value)
    for x, y in iterator:
        assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))

    es.add_interesting_values()
    mean_sunday_value_priority_0 = pd.Series([None, None, None, 2.5, 0, None])
    fm, features = ft.dfs(entityset=es,
                          target_entity="sessions",
                          agg_primitives=[MeanSunday],
                          trans_primitives=[],
                          where_primitives=[MeanSunday])
    where_feat = "MEAN_SUNDAY(log.value, datetime WHERE priority_level = 0)"
    for x, y in zip(fm[where_feat], mean_sunday_value_priority_0):
        assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))
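
As a quick sanity check, mean_sunday can be exercised on plain pandas objects
before it is wrapped in a primitive (a minimal sketch with made-up dates;
assumes the mean_sunday helper above is in scope):

import pandas as pd

values = pd.Series([2.0, 3.0, 10.0])
# 2023-01-01 and 2023-01-08 are Sundays; 2023-01-02 is a Monday
times = pd.to_datetime(["2023-01-01", "2023-01-08", "2023-01-02"])
assert mean_sunday(values, times) == 2.5  # mean of the two Sunday values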
def test_transform_consistency():
    # Create dataframe
    df = pd.DataFrame({'a': [14, 12, 10], 'b': [False, False, True],
                       'b1': [True, True, False], 'b12': [4, 5, 6],
                       'P': [10, 15, 12]})
    es = ft.EntitySet(id='test')
    # Add dataframe to entityset
    es.entity_from_dataframe(entity_id='first', dataframe=df,
                             index='index',
                             make_index=True)

    # Generate features
    feature_defs = ft.dfs(entityset=es, target_entity='first',
                          trans_primitives=['and', 'add', 'or'],
                          features_only=True)

    # Check that features are generated consistently, with a single canonical input ordering
    assert feature_with_name(feature_defs, 'a')
    assert feature_with_name(feature_defs, 'b')
    assert feature_with_name(feature_defs, 'b1')
    assert feature_with_name(feature_defs, 'b12')
    assert feature_with_name(feature_defs, 'P')
    assert feature_with_name(feature_defs, 'AND(b, b1)')
    assert not feature_with_name(feature_defs, 'AND(b1, b)')  # make sure it doesn't exist the other way
    assert feature_with_name(feature_defs, 'a + P')
    assert feature_with_name(feature_defs, 'b12 + P')
    assert feature_with_name(feature_defs, 'a + b12')
    assert feature_with_name(feature_defs, 'OR(b, b1)')
    assert feature_with_name(feature_defs, 'OR(AND(b, b1), b)')
    assert feature_with_name(feature_defs, 'OR(AND(b, b1), b1)')
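
For reference, feature_with_name is a small test utility; a minimal sketch of
the check it performs (a hypothetical implementation, matching how the asserts
above use it):

def feature_with_name(feature_defs, name):
    # True if any generated feature definition renders to the given name.
    return any(f.get_name() == name for f in feature_defs)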
    def dfs_run(self):
        self.__train_feature, _ = ft.dfs(
            entityset=self.__es,
            target_entity="application_train",
            agg_primitives=[Sum, Std, Max, Min, Median, Count, Skew, PercentTrue, Trend, AvgTimeBetween],
            where_primitives=[Std, Max, Min, Median, Count],
            verbose=True,
            chunk_size=150,  # a larger chunk_size trades space for time: more memory used, shorter runtime
        )

        self.__train_feature.to_csv(os.path.join(self.__output_path, "train_agg_df.csv"), index=True)
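
Note that where_primitives only yields WHERE-clause features when the
entityset has interesting values defined; a hedged sketch of the setup this
method presumably relies on (the entity and column names are made up):

# Either let featuretools infer interesting values...
es.add_interesting_values()
# ...or set them explicitly on a categorical variable (hypothetical column):
es['previous_application']['NAME_CONTRACT_STATUS'].interesting_values = ['Approved', 'Refused']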
def test_encode_unknown_features():
    # Dataframe with categorical column with "unknown" string
    df = pd.DataFrame({'category': ['unknown', 'b', 'c', 'd', 'e']})

    es = EntitySet('test')
    es.entity_from_dataframe(entity_id='a', dataframe=df, index='index', make_index=True)
    features, feature_defs = dfs(entityset=es, target_entity='a')

    # Specify unknown token for replacement
    features_enc, feature_defs_enc = encode_features(features, feature_defs,
                                                     include_unknown=True)
    assert list(features_enc.columns) == ['category = unknown', 'category = e', 'category = d',
                                          'category = c', 'category = b', 'category is unknown']
Example #5
def test_pickle_features_with_custom_primitive(es):
    NewMax = make_agg_primitive(
        lambda x: max(x),
        name="NewMax",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate the maximum value")

    features_original = ft.dfs(target_entity='sessions',
                               entityset=es,
                               agg_primitives=["Last", "Mean", NewMax],
                               features_only=True)

    assert any(
        [isinstance(feat.primitive, NewMax) for feat in features_original])
    pickle_features_test_helper(asizeof(es), features_original)
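
pickle_features_test_helper is a shared test utility not shown in this
listing; a hedged sketch of what it likely does, mirroring test_pickle_features
below (assumes the usual ft and os imports):

def pickle_features_test_helper(es_size, features_original):
    # Round-trip the features through disk and compare with the originals.
    filepath = 'test_feature'
    ft.save_features(features_original, filepath)
    assert os.path.getsize(filepath) < es_size  # file smaller than the in-memory entityset
    features_deserialized = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_original, features_deserialized):
        assert feat_1.unique_name() == feat_2.unique_name()
    os.remove(filepath)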
def test_seed_multi_output_feature_stacking(es):
    threecommon = NMostCommon(3)
    tc = ft.Feature(es['log']['product_id'],
                    parent_entity=es["sessions"],
                    primitive=threecommon)

    fm, feat = ft.dfs(entityset=es,
                      target_entity="customers",
                      seed_features=[tc],
                      agg_primitives=[NumUnique],
                      trans_primitives=[],
                      max_depth=4)

    for i in range(3):
        f = 'NUM_UNIQUE(sessions.N_MOST_COMMON(log.product_id)[%d])' % i
        assert feature_with_name(feat, f)
    def fit(self, X, y=None, **kwargs):
        self.original_cols = X.columns.to_list()
        if self.selection_args is not None:
            assert y is not None, '`y` must be provided for feature selection.'
            self.selection_args['reserved_cols'] = self.original_cols
            self.selection_transformer = FeatureSelectionTransformer(task=self.task, **self.selection_args)
        # self._check_values(X)
        if self.continuous_cols is None:
            self.continuous_cols = column_number_exclude_timedelta(X)
        if self.datetime_cols is None:
            self.datetime_cols = column_all_datetime(X)

        if self.fix_input:
            _mean = X[self.continuous_cols].mean().to_dict()
            _mode = X[self.datetime_cols].mode().to_dict()
            self._imputed_input = {}
            self._merge_dict(self._imputed_input, _mean, _mode)
            self._replace_invalid_values(X, self._imputed_input)

        feature_type_dict = {}
        self._merge_dict(feature_type_dict,
                         {c: variable_types.Numeric for c in self.continuous_cols},
                         {c: variable_types.Datetime for c in self.datetime_cols})

        es = ft.EntitySet(id='es_hypernets_fit')
        es.entity_from_dataframe(entity_id='e_hypernets_ft', dataframe=X, variable_types=feature_type_dict,
                                 make_index=True, index=self.ft_index)
        feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity="e_hypernets_ft",
                                              ignore_variables={"e_hypernets_ft": []},
                                              return_variable_types="all",
                                              trans_primitives=self.trans_primitives,
                                              max_depth=self.max_depth,
                                              features_only=False,
                                              max_features=-1)
        X.pop(self.ft_index)

        self.feature_defs_ = feature_defs

        if self.selection_transformer is not None:
            self.selection_transformer.fit(feature_matrix, y)
            selected_defs = []
            for fea in self.feature_defs_:
                if fea._name in self.selection_transformer.columns_:
                    selected_defs.append(fea)
            self.feature_defs_ = selected_defs

        return self
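
The matching transform step is not shown here; a hedged sketch of how
feature_defs_ would typically be replayed on new data (attribute names follow
the fit method above; variable_types handling is omitted):

    def transform(self, X):
        # Rebuild an entityset around the new data and replay the fitted
        # feature definitions (sketch).
        es = ft.EntitySet(id='es_hypernets_transform')
        es.entity_from_dataframe(entity_id='e_hypernets_ft', dataframe=X,
                                 make_index=True, index=self.ft_index)
        feature_matrix = ft.calculate_feature_matrix(self.feature_defs_, entityset=es)
        X.pop(self.ft_index)
        return feature_matrix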
Example #8
def test_make_transform_multiple_output_features(es):
    def test_time(x):
        times = pd.Series(x)
        units = ["year", "month", "day", "hour", "minute", "second"]
        return [times.apply(lambda x: getattr(x, unit)) for unit in units]

    def gen_feat_names(self):
        subnames = ["Year", "Month", "Day", "Hour", "Minute", "Second"]
        return [
            "Now.%s(%s)" % (subname, self.base_features[0].get_name())
            for subname in subnames
        ]

    TestTime = make_trans_primitive(
        function=test_time,
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
        cls_attributes={"get_feature_names": gen_feat_names},
    )

    join_time_split = ft.Feature(es["log"]["datetime"], primitive=TestTime)
    alt_features = [
        ft.Feature(es["log"]["datetime"], primitive=Year),
        ft.Feature(es["log"]["datetime"], primitive=Month),
        ft.Feature(es["log"]["datetime"], primitive=Day),
        ft.Feature(es["log"]["datetime"], primitive=Hour),
        ft.Feature(es["log"]["datetime"], primitive=Minute),
        ft.Feature(es["log"]["datetime"], primitive=Second)
    ]
    fm, fl = ft.dfs(entityset=es,
                    target_entity="log",
                    agg_primitives=[],
                    trans_primitives=[
                        TestTime, Year, Month, Day, Hour, Minute, Second, Diff
                    ],
                    max_depth=5)

    subnames = join_time_split.get_feature_names()
    altnames = [f.get_name() for f in alt_features]
    for col1, col2 in zip(subnames, altnames):
        assert (fm[col1] == fm[col2]).all()

    for i in range(6):
        f = 'sessions.customers.DIFF(TEST_TIME(date_of_birth)[%d])' % i
        assert feature_with_name(fl, f)
        assert ('DIFF(TEST_TIME(datetime)[%d])' % i) in fl
Example #9
def test_make_three_most_common(pd_es):
    class NMostCommoner(AggregationPrimitive):
        name = "pd_top3"
        input_types = [ColumnSchema(semantic_tags={"category"})]
        return_type = None
        number_output_features = 3

        def get_function(self):
            def pd_top3(x):
                counts = x.value_counts()
                counts = counts[counts > 0]
                array = np.array(counts[:3].index)
                if len(array) < 3:
                    filler = np.full(3 - len(array), np.nan)
                    array = np.append(array, filler)
                return array

            return pd_top3

    fm, features = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="customers",
        instance_ids=[0, 1, 2],
        agg_primitives=[NMostCommoner],
        trans_primitives=[],
    )

    df = fm[["PD_TOP3(log.product_id)[%s]" % i for i in range(3)]]

    # coke zero and toothpaste are tied for the top count, so only the set of
    # the top two values is deterministic
    assert set(df.iloc[0].values[:2]) == set(["coke zero", "toothpaste"])
    # car and brown bag are tied for third place
    assert df.iloc[0].values[2] in ["car", "brown bag"]

    assert (
        df.iloc[1]
        .reset_index(drop=True)
        .equals(pd.Series(["coke zero", "Haribo sugar-free gummy bears", np.nan]))
    )
    assert (
        df.iloc[2]
        .reset_index(drop=True)
        .equals(pd.Series(["taco clock", np.nan, np.nan]))
    )
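
The core of pd_top3 can be checked on a plain Series (a minimal sketch with
made-up values):

import numpy as np
import pandas as pd

s = pd.Series(["c", "c", "c", "a", "a", "b"])
counts = s.value_counts()           # c: 3, a: 2, b: 1
top3 = np.array(counts[:3].index)   # array(['c', 'a', 'b'], dtype=object)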
Example #10
    def dfs(self, X=None, target_entity=None, entityset=None, entities=None, relationships=None):
        if not entities and not entityset:
            target_entity = 'X'
        else:
            target_entity = target_entity or self.target_entity

        if entityset is None:
            entityset = self._get_entityset(X, target_entity, entities, relationships)

        if self.training_window is not None:
            entityset.add_last_time_indexes()

        cutoff_time = None
        if self.time_index:
            cutoff_time = X[[self.index, self.time_index]]
            cutoff_time = cutoff_time.rename(columns={self.time_index: 'time'})

        self.features = ft.dfs(
            cutoff_time=cutoff_time,
            max_depth=self.max_depth,
            entityset=entityset,
            target_entity=target_entity,
            features_only=True,
            agg_primitives=self.agg_primitives,
            trans_primitives=self.trans_primitives,
            max_features=self.max_features,
            training_window=self.training_window,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )

        if self.encode or self.remove_low_information:
            X = ft.calculate_feature_matrix(
                self.features,
                entityset=entityset,
                cutoff_time=cutoff_time,
                training_window=self.training_window,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
            )

            if self.encode:
                X, self.features = ft.encode_features(X, self.features)

            if self.remove_low_information:
                X, self.features = remove_low_information_features(X, self.features)
Example #11
    def _feature_summary_data(self):
        raceuma_df = self.base_df[[
            "RACE_KEY", "UMABAN", "激走指数", "馬スタート指数", "馬出遅率", "IDM", "騎手指数",
            "テン指数", "ペース指数", "上がり指数", "位置指数", "テンF指数", "中間F指数", "終いF指数",
            "コーナー順位3_1", "コーナー順位4_1", "前3F先頭差_1", "後3F先頭差_1", "レース脚質_1",
            "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1", "追込率_1",
            "コーナー順位3_2", "コーナー順位4_2", "前3F先頭差_2", "後3F先頭差_2", "レース脚質_2",
            "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2", "追込率_2",
            "コーナー順位3_3", "コーナー順位4_3", "前3F先頭差_3", "後3F先頭差_3", "レース脚質_3",
            "テン指数結果_3", "上がり指数結果_3", "ペース指数結果_3", "レースP指数結果_3", "追込率_3",
            "コーナー順位3_4", "コーナー順位4_4", "前3F先頭差_4", "後3F先頭差_4", "レース脚質_4",
            "テン指数結果_4", "上がり指数結果_4", "ペース指数結果_4", "レースP指数結果_4", "追込率_4",
            "コーナー順位3_5", "コーナー順位4_5", "前3F先頭差_5", "後3F先頭差_5", "レース脚質_5",
            "テン指数結果_5", "上がり指数結果_5", "ペース指数結果_5", "レースP指数結果_5", "追込率_5"
        ]]
        raceuma_df.loc[:, "RACE_UMA_KEY"] = raceuma_df["RACE_KEY"].astype(
            str).str.cat(raceuma_df["UMABAN"].astype(str))
        raceuma_df.drop("UMABAN", axis=1, inplace=True)
        es = ft.EntitySet(id="race")

        es.entity_from_dataframe(
            entity_id='race',
            dataframe=self.ld.race_df[["RACE_KEY", "target_date"]],
            index="RACE_KEY")
        es.entity_from_dataframe(entity_id='raceuma',
                                 dataframe=raceuma_df,
                                 index="RACE_UMA_KEY")
        relationship = ft.Relationship(es['race']["RACE_KEY"],
                                       es['raceuma']["RACE_KEY"])
        es = es.add_relationship(relationship)
        print(es)
        # Aggregation functions
        aggregation_list = ['mean', 'skew']
        transform_list = []
        # run dfs
        print("un dfs")
        feature_matrix, features_dfs = ft.dfs(entityset=es,
                                              target_entity='race',
                                              agg_primitives=aggregation_list,
                                              trans_primitives=transform_list,
                                              max_depth=2)
        feature_summary_df = pd.merge(feature_matrix,
                                      self.ld.race_df,
                                      on=["RACE_KEY", "target_date"])
        print("_create_feature: feature_summary_df", feature_summary_df.shape)
        return feature_summary_df
Example #12
def generate_features(data,
                      var_types,
                      trans_primitives=["multiply", 'divide', "diff"],
                      N_FEATURES=1000,
                      index_col_name="id"):
    data = data.copy()

    print("-" * 15)

    start_columns = data.columns

    data = data.reset_index()
    data[index_col_name] = data[index_col_name].astype(np.int64)

    N_FEATURES += data.shape[1]

    es = ft.EntitySet(id='players')

    main_entity_id = 'train_players'

    # Entities with a unique index
    es = es.entity_from_dataframe(
        entity_id=main_entity_id,
        dataframe=data,  # dataframe object
        index=index_col_name,  # unique index
        variable_types=var_types)

    print(es)

    # DFS with specified primitives
    print("Start dfs")

    features, feature_names = ft.dfs(
        entityset=es,
        target_entity=main_entity_id,
        trans_primitives=trans_primitives,
        agg_primitives=[],
        max_depth=1,
        features_only=False,
        verbose=True,
        chunk_size=0.5,
        max_features=N_FEATURES,  # consider removing this cap later; it limits the computational burden
        n_jobs=-1,
    )
    return features.drop(start_columns, axis=1)
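
A hedged usage sketch (the dataframe, columns, and variable types are made up;
assumes a featuretools version where the 'multiply' primitive name from the
function's defaults is available):

import pandas as pd
import featuretools as ft
from featuretools import variable_types as vtypes

players = pd.DataFrame({'id': [1, 2, 3],
                        'height': [180.0, 175.5, 190.2],
                        'weight': [75.0, 68.5, 88.0]}).set_index('id')
var_types = {'height': vtypes.Numeric, 'weight': vtypes.Numeric}
new_feats = generate_features(players, var_types,
                              trans_primitives=['multiply'], N_FEATURES=50)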
Example #13
def test_remove_single_value_features():
    same_vals_df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "all_numeric": [88, 88, 88, 88],
        "with_nan": [1, 1, None, 1],
        "all_nulls": [None, None, None, None],
        "all_categorical": ["a", "a", "a", "a"],
        "all_bools": [True, True, True, True],
        "diff_vals": ["hi", "bye", "bye", "hi"],
    })

    es = ft.EntitySet("data", {"single_vals": (same_vals_df, "id")})
    es["single_vals"].ww.set_types(
        logical_types={
            "all_nulls": "categorical",
            "all_categorical": "categorical",
            "diff_vals": "categorical",
        })
    fm, features = ft.dfs(
        entityset=es,
        target_dataframe_name="single_vals",
        trans_primitives=["is_null"],
        max_depth=2,
    )

    no_params, no_params_features = ft.selection.remove_single_value_features(
        fm, features)
    no_params_cols = set(no_params.columns)
    assert len(no_params_features) == 2
    assert "IS_NULL(with_nan)" in no_params_cols
    assert "diff_vals" in no_params_cols

    nan_as_value, nan_as_value_features = ft.selection.remove_single_value_features(
        fm, features, count_nan_as_value=True)
    nan_cols = set(nan_as_value.columns)
    assert len(nan_as_value_features) == 3
    assert "IS_NULL(with_nan)" in nan_cols
    assert "diff_vals" in nan_cols
    assert "with_nan" in nan_cols

    without_features_param = ft.selection.remove_single_value_features(fm)
    assert len(no_params.columns) == len(without_features_param.columns)
    for i in range(len(no_params.columns)):
        assert no_params.columns[i] == without_features_param.columns[i]
        assert no_params_features[i].get_name() == without_features_param.columns[i]
Example #14
def get_train_data(project,
                   train_file,
                   prediction_key,
                   prediction_target,
                   variable_types={},
                   drop_columns=None):

    # Read the training data
    print("==========Reading the training file {}".format(train_file))
    train_data = pd.read_csv(train_file)
    print(train_data.head(5))

    print("==========Preparing training labels for target {}".format(
        prediction_target))
    train_labels = train_data[prediction_target].values
    train_data = train_data.drop(prediction_target, axis=1)

    if drop_columns is not None:
        print("==========dropping columns {}".format(drop_columns))
        train_data = train_data.drop(drop_columns, axis=1)

    print("==========Generating the feature with featuretools")

    es = ft.EntitySet(project)

    entities = get_ft_entities(es=es,
                               project=project,
                               prediction_key=prediction_key,
                               data=train_data,
                               variable_types=variable_types)

    print("==========entities are:")
    print(entities)

    feature_matrix, feature_defs = ft.dfs(entityset=entities,
                                          target_entity=project)

    feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)
    print("==========columns are:")
    print(feature_matrix_enc.columns)

    print("==========saving features to {}".format(project))
    ft.save_features(feature_defs, "data/{}/ft_features".format(project))

    return feature_matrix_enc, train_labels
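
A hypothetical invocation (the project name, file path, and column names are
assumptions):

X_enc, y = get_train_data(project="churn",
                          train_file="data/churn/train.csv",
                          prediction_key="customer_id",
                          prediction_target="churned",
                          drop_columns=["signup_ip"])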
Example #15
    def _featuretools_agg(self, methods=['count', 'max', 'mean']):
        es = ft.EntitySet(id='index')
        es.entity_from_dataframe(entity_id='data',
                                 dataframe=self.data,
                                 index='index')
        for col in self.cols:
            es.normalize_entity(base_entity_id='data',
                                new_entity_id=col,
                                index=col)
        features, _ = ft.dfs(entityset=es,
                             target_entity='data',
                             agg_primitives=methods,
                             max_depth=2,
                             verbose=1,
                             n_jobs=-1)
        return features
def test_cfm_approximate_correct_ordering():
    trips = {
        'trip_id': [i for i in range(1000)],
        'flight_time': [datetime(1998, 4, 2) for i in range(350)] + [datetime(1997, 4, 3) for i in range(650)],
        'flight_id': [randint(1, 25) for i in range(1000)],
        'trip_duration': [randint(1, 999) for i in range(1000)]
    }
    df = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips",
                             dataframe=df,
                             index="trip_id",
                             time_index='flight_time')
    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    features = dfs(entityset=es, target_entity='trips', features_only=True)
    flight_features = [feature for feature in features
                       if isinstance(feature, DirectFeature) and
                       isinstance(feature.base_features[0],
                                  AggregationPrimitive)]
    property_feature = IdentityFeature(es['trips']['trip_id'])
    # direct_agg_feat = DirectFeature(Sum(es['trips']['trip_duration'],
    #                                     es['flights']),
    #                                 es['trips'])
    cutoff_time = pd.DataFrame.from_dict({'instance_id': df['trip_id'],
                                          'time': df['flight_time']})
    time_feature = IdentityFeature(es['trips']['flight_time'])
    feature_matrix = calculate_feature_matrix(flight_features + [property_feature, time_feature],
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    feature_matrix.index.names = ['instance', 'time']
    assert(np.all(feature_matrix.reset_index('time').reset_index()[['instance', 'time']].values == feature_matrix[['trip_id', 'flight_time']].values))
    feature_matrix_2 = calculate_feature_matrix(flight_features + [property_feature, time_feature],
                                                cutoff_time=cutoff_time,
                                                cutoff_time_in_index=True,
                                                approximate=Timedelta(2, 'd'))
    feature_matrix_2.index.names = ['instance', 'time']
    assert(np.all(feature_matrix_2.reset_index('time').reset_index()[['instance', 'time']].values == feature_matrix_2[['trip_id', 'flight_time']].values))
    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))
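
For intuition, approximate=Timedelta(2, 'd') lets calculate_feature_matrix
round cutoff times into two-day bins so parent-level aggregations are computed
once per bin rather than once per cutoff time; the binning is roughly
equivalent to this sketch (not featuretools internals):

import pandas as pd

cutoffs = pd.Series(pd.to_datetime(['1998-04-02', '1998-04-03', '1998-04-05']))
binned = cutoffs.dt.floor('2D')  # cutoffs in the same two-day window share one value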
Example #17
def make_features(df1, fea_col):
    '''
    Build an entityset from the selected feature columns and run DFS over it.
    '''
    df_fea = df1[fea_col]
    es = ft.EntitySet(id='sales')
    es.entity_from_dataframe(entity_id='bigmart', dataframe=df_fea, index='index')
    feature_matrix, feature_names = ft.dfs(entityset=es,
                                           target_entity='bigmart',
                                           max_depth=2,
                                           agg_primitives=['mean', 'max', 'std'],
                                           trans_primitives=['less_than'],
                                           verbose=1,
                                           n_jobs=1)
    return feature_matrix, feature_names
Example #18
def build_card_one_hot():
    """ Reads in the raw data from train.csv and creates
        one-hot encodings for the feature and date fields.

        :return: Data frame with one-hot encoding
    """

    logger = logging.getLogger(__name__)
    logger.info("Reading in data.")
    df = pd.read_csv('data/raw/train.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")

    logger.info("Creating entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=df,
                                              index='card_id',
                                              time_index="first_active_month",
                                              variable_types=CARD_TYPES)

    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity="transactions")

    logger.info("Creating one-hot training data")
    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)

    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')

    logger.info("Creating one-hot test data")
    df = pd.read_csv('data/raw/test.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")
    df['target'] = 0
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=df,
                                            index='card_id',
                                            time_index="first_active_month",
                                            variable_types=CARD_TYPES)

    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)
    test_feature_matrix_enc.drop(columns='target', inplace=True)

    return train_feature_matrix_enc, test_feature_matrix_enc
def test_encode_features_topn(pd_es):
    topn = Feature(Feature(pd_es['log'].ww['product_id']),
                   parent_dataframe_name='customers',
                   primitive=NMostCommon(n=3))
    features, feature_defs = dfs(entityset=pd_es,
                                 instance_ids=[0, 1, 2],
                                 target_dataframe_name="customers",
                                 agg_primitives=[NMostCommon(n=3)])
    features_enc, feature_defs_enc = encode_features(features,
                                                     feature_defs,
                                                     include_unknown=True)
    assert topn.unique_name() in [
        feat.unique_name() for feat in feature_defs_enc
    ]
    for name in topn.get_feature_names():
        assert name in features_enc.columns
        assert features_enc.columns.tolist().count(name) == 1
Example #20
def valid_dfs(
    es,
    aggregations,
    transforms,
    feature_substrings,
    target_dataframe_name="log",
    multi_output=False,
    max_depth=3,
    max_features=-1,
    instance_ids=[0, 1, 2, 3],
):
    if not isinstance(feature_substrings, list):
        feature_substrings = [feature_substrings]

    features = dfs(
        entityset=es,
        target_dataframe_name=target_dataframe_name,
        agg_primitives=aggregations,
        trans_primitives=transforms,
        max_features=max_features,
        max_depth=max_depth,
        features_only=True,
    )
    applicable_features = []
    for feat in features:
        for x in feature_substrings:
            if x in feat.get_name():
                applicable_features.append(feat)
    if len(applicable_features) == 0:
        raise ValueError("No feature names with %s, verify the name attribute \
                          is defined and/or generate_name() is defined to \
                          return %s " %
                         (feature_substrings, feature_substrings))
    df = ft.calculate_feature_matrix(entityset=es,
                                     features=applicable_features,
                                     instance_ids=instance_ids)

    ft.encode_features(df, applicable_features)

    # TODO: check the multi_output shape by checking
    # feature.number_output_features for each feature
    # and comparing it with the matrix shape
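    # A hedged sketch of that check: each feature's number_output_features
    # gives its output column count, so the totals should match (assumes the
    # FeatureBase.number_output_features attribute).
    expected_cols = sum(f.number_output_features for f in applicable_features)
    assert df.shape[1] == expected_cols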
    if not multi_output:
        assert len(applicable_features) == df.shape[1]
    return
Example #21
    def _create_feature(self):
        """ マージしたデータから特徴量を生成する """
        print("_create_feature")
        raceuma_df = self.base_df[["RACE_KEY", "UMABAN", "脚質", "距離適性", "父馬産駒連対平均距離", "母父馬産駒連対平均距離", "IDM", "テン指数",
                                   "ペース指数", "上がり指数", "位置指数", "IDM結果_1", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1",
                                   "先行率_1", "追込率_1", "fa_1_1", "fa_2_1", "fa_3_1", "fa_4_1", "fa_5_1"]]
        raceuma_df.loc[:, "RACE_UMA_KEY"] = raceuma_df["RACE_KEY"] + raceuma_df["UMABAN"]
        raceuma_df.drop("UMABAN", axis=1, inplace=True)
        # https://qiita.com/daigomiyoshi/items/d6799cc70b2c1d901fb5
        es = ft.EntitySet(id="race")
        es.entity_from_dataframe(entity_id='race', dataframe=self.ld.race_df.drop("NENGAPPI", axis=1), index="RACE_KEY")
        es.entity_from_dataframe(entity_id='raceuma', dataframe=raceuma_df, index="RACE_UMA_KEY")
        relationship = ft.Relationship(es['race']["RACE_KEY"], es['raceuma']["RACE_KEY"])
        es = es.add_relationship(relationship)
        print(es)
        # Aggregation functions
        aggregation_list = ['min', 'max', 'mean', 'skew', 'percent_true']
        transform_list = []
        # run dfs
        print("un dfs")
        feature_matrix, features_dfs = ft.dfs(entityset=es, target_entity='race', agg_primitives=aggregation_list,
                                              trans_primitives=transform_list, max_depth=2)
        print("_create_feature: feature_matrix", feature_matrix.shape)

        # Get the data for the predicted first favorite
        ninki_df = self.base_df.query("基準人気順位==1")[["RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番", "総合印", "IDM印", "情報印", "騎手印",
                                                  "厩舎印", "調教印", "激走印", "展開記号", "輸送区分", "騎手期待単勝率", "騎手期待3着内率", "激走タイプ", "休養理由分類コード", "芝ダ障害フラグ",
                                                    "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ", "放牧先ランク", "厩舎ランク", "調教量評価", "仕上指数変化", "調教評価",
                                                    "IDM", "騎手指数", "情報指数", "総合指数", "人気指数", "調教指数", "厩舎指数", "テン指数", "ペース指数", "上がり指数", "位置指数", "追切指数", "仕上指数",
                                                    "IDM結果_1", "IDM結果_2"]].add_prefix("人気_").rename(columns={"人気_RACE_KEY":"RACE_KEY"})
        # Get the data for the predicted front-runner
        nige_df = self.base_df.query("展開記号=='1'")[["RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番", "総合印", "IDM印", "基準人気順位", "輸送区分", "激走タイプ", "休養理由分類コード", "芝ダ障害フラグ",
                                                    "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ", "IDM", "騎手指数", "テン指数", "ペース指数", "上がり指数", "位置指数", "追切指数", "仕上指数",
                                                    "斤量_1", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1", "斤量_2", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2",
                                                    "先行率_1", "先行率_2"]].add_prefix("逃げ_").rename(columns={"逃げ_RACE_KEY":"RACE_KEY"})
        # Get the data for the horse predicted to close fastest
        agari_df = self.base_df.query("展開記号=='2'")[["RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番", "総合印", "IDM印", "基準人気順位", "輸送区分", "激走タイプ", "休養理由分類コード", "芝ダ障害フラグ",
                                                    "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ", "IDM", "騎手指数", "テン指数", "ペース指数", "上がり指数", "位置指数", "追切指数", "仕上指数",
                                                    "斤量_1", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1", "斤量_2", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2",
                                                    "先行率_1", "先行率_2"]].add_prefix("上り_").rename(columns={"上り_RACE_KEY":"RACE_KEY"})

        self.base_df = pd.merge(feature_matrix, nige_df, on="RACE_KEY", how="left")
        self.base_df = pd.merge(self.base_df, agari_df, on="RACE_KEY", how="left")
        self.base_df = pd.merge(self.base_df, ninki_df, on="RACE_KEY")
        self.base_df = pd.merge(self.base_df, self.ld.race_df[["RACE_KEY", "NENGAPPI"]], on="RACE_KEY")
Example #22
def test_time_since_primitive_matches_all_datetime_types(es):
    if ks and any(isinstance(e.df, ks.DataFrame) for e in es.entities):
        pytest.xfail(
            'TimeSince transform primitive is incompatible with Koalas')
    fm, fl = ft.dfs(target_entity="customers",
                    entityset=es,
                    trans_primitives=[TimeSince],
                    agg_primitives=[],
                    max_depth=1)

    customers_datetime_vars = [
        id for id, t in es['customers'].variable_types.items()
        if issubclass(t, Datetime)
    ]
    expected_names = [f"TIME_SINCE({v})" for v in customers_datetime_vars]

    for name in expected_names:
        assert name in fm.columns
Example #23
def test_transform_subset(X_y_binary, X_y_multi, X_y_regression):
    datasets = locals()
    for dataset in datasets.values():
        X, y = dataset
        X_pd = pd.DataFrame(X)
        X_pd.columns = X_pd.columns.astype(str)
        X_fit = X_pd.iloc[: len(X) // 3]
        X_transform = X_pd.iloc[len(X) // 3:]

        es = ft.EntitySet()
        es = es.entity_from_dataframe(entity_id="X", dataframe=X_transform, index='index', make_index=True)
        feature_matrix, features = ft.dfs(entityset=es, target_entity="X")

        feature = DFSTransformer()
        feature.fit(X_fit)
        X_t = feature.transform(X_transform)

        assert_frame_equal(feature_matrix, X_t.to_dataframe())
Example #24
    def dfs_run(self):
        self.__feature_dataframe, _ = ft.dfs(
            entityset=self.__es,
            target_entity="application_train",
            agg_primitives=[ft.primitives.aggregation_primitives.Sum,
                            ft.primitives.aggregation_primitives.Std,
                            ft.primitives.aggregation_primitives.Max,
                            ft.primitives.aggregation_primitives.Min,
                            ft.primitives.aggregation_primitives.Mean,
                            ft.primitives.aggregation_primitives.Count,
                            ft.primitives.aggregation_primitives.NUnique,
                            ft.primitives.aggregation_primitives.Mode],
            trans_primitives=[],
            verbose=True,
            chunk_size=110  # a larger chunk_size trades space for time: more memory used, shorter runtime
        )

        self.__feature_dataframe.to_csv(os.path.join(self.__output_path, self.__output_file_name), index=False)
def test_pickle_features(es):
    features_original = ft.dfs(target_entity='sessions',
                               entityset=es,
                               features_only=True)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')

    ft.save_features(features_original, filepath)
    features_deserialized = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_original, features_deserialized):
        assert feat_1.unique_name() == feat_2.unique_name()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    os.remove(filepath)
    def __init__(self, data):
        es = ft.EntitySet("transactions")

        es = es.entity_from_dataframe(entity_id='entities_transactions',
                                      dataframe=data,
                                      index='index_col')

        es.normalize_entity(base_entity_id='entities_transactions',
                            new_entity_id='origin',
                            index='type')

        fm, features = ft.dfs(entityset=es,
                              target_entity='entities_transactions')

        self.feature_matrix = fm
        self.features = features

Example #27
    def fit(self, X, y=None):
        """Fits the DFSTransformer Transformer component.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.array): The input data to transform, of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series, np.ndarray, optional): The target training data of length [n_samples]

        Returns:
            self
        """
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        X.columns = X.columns.astype(str)
        es = self._make_entity_set(X)
        self.features = dfs(entityset=es,
                            target_entity='X',
                            features_only=True)
        return self
Example #28
def engineer_features_uk_retail(entities, relationships, label_times, training_window):
    trans_primitives = [Minute, Hour, Day, Week, Month, Weekday, Weekend]

    es = ft.EntitySet("entityset",
                      entities=entities,
                      relationships=relationships)

    es.add_last_time_indexes()

    feature_matrix, features = ft.dfs(entityset=es,
                                      target_entity="customers",
                                      trans_primitives=trans_primitives,
                                      agg_primitives=[Mean, Max, Std],
                                      cutoff_time=label_times[["CustomerID", "cutoff_time"]],
                                      training_window=training_window)
    feature_matrix.drop("Country", axis=1, inplace=True)
    feature_matrix = feature_matrix.sort_index()
    return feature_matrix
def test_time_since_primitive_matches_all_datetime_types(es):
    if es.dataframe_type == Library.KOALAS.value:
        pytest.xfail(
            'TimeSince transform primitive is incompatible with Koalas')
    fm, fl = ft.dfs(target_dataframe_name="customers",
                    entityset=es,
                    trans_primitives=[TimeSince],
                    agg_primitives=[],
                    max_depth=1)

    customers_datetime_cols = [
        id for id, t in es['customers'].ww.logical_types.items()
        if isinstance(t, Datetime)
    ]
    expected_names = [f"TIME_SINCE({v})" for v in customers_datetime_cols]

    for name in expected_names:
        assert name in fm.columns
def test_make_transform_multiple_output_features(es):
    def test_f(x):
        times = pd.Series(x)
        units = ["year", "month", "day", "hour", "minute", "second"]
        return [times.apply(lambda x: getattr(x, unit)) for unit in units]

    def gen_feat_names(self):
        subnames = ["Year", "Month", "Day", "Hour", "Minute", "Second"]
        return [
            "Now.%s(%s)" % (subname, self.base_features[0].get_name())
            for subname in subnames
        ]

    TestTime = make_trans_primitive(
        function=test_f,
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
        cls_attributes={"get_feature_names": gen_feat_names},
    )

    join_time_split = ft.Feature(es["log"]["datetime"], primitive=TestTime)
    alt_features = [
        ft.Feature(es["log"]["datetime"], primitive=Year),
        ft.Feature(es["log"]["datetime"], primitive=Month),
        ft.Feature(es["log"]["datetime"], primitive=Day),
        ft.Feature(es["log"]["datetime"], primitive=Hour),
        ft.Feature(es["log"]["datetime"], primitive=Minute),
        ft.Feature(es["log"]["datetime"], primitive=Second)
    ]
    fm, fl = ft.dfs(
        entityset=es,
        target_entity="log",
        trans_primitives=[TestTime, Year, Month, Day, Hour, Minute, Second])

    subnames = join_time_split.get_feature_names()
    altnames = [f.get_name() for f in alt_features]
    for col1, col2 in zip(subnames, altnames):
        assert (fm[col1] == fm[col2]).all()

    # check no feature stacked on new primitive
    for feature in fl:
        for base_feature in feature.base_features:
            assert base_feature.hash() != join_time_split.hash()
def dfs_run(es, output_path):
    """
    AvgTimeBetween: 不同事件的平均时间间隔 等同于 Mean(Diff(time_index))
    Trend: 线性趋势的斜率
    PercentTrue: Boolean 特征的 True 值 占比

    where_primitives :是应用在 interesting_values 上的
    """

    train_feature, _ = ft.dfs(
        entityset=es,
        target_entity="application_train",
        agg_primitives=[Sum, Std, Max, Min, Median, Count, PercentTrue, Trend, AvgTimeBetween, Skew],
        where_primitives=[Std, Max, Min, Median, Count, Skew],
        verbose=True,
        chunk_size=70,  # a larger chunk_size trades space for time: more memory used, shorter runtime
    )

    train_feature.to_csv(os.path.join(output_path, "train_pre_agg_0-5.csv"), index=True)
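
The AvgTimeBetween equivalence noted in the docstring can be seen on a plain
datetime series (a minimal sketch, independent of featuretools):

import pandas as pd

times = pd.Series(pd.to_datetime(['2020-01-01', '2020-01-03', '2020-01-07']))
avg_gap = times.diff().dropna().mean()  # Timedelta('3 days'): mean of the 2- and 4-day gaps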
Example #34
def test_deserialize_features_s3(es, url, profile_name):
    agg_primitives = [
        Sum, Std, Max, Skew, Min, Mean, Count, PercentTrue, NumUnique, Mode
    ]

    trans_primitives = [
        Day, Year, Month, Weekday, Haversine, NumWords, NumCharacters
    ]

    features_original = sorted(ft.dfs(target_entity='sessions',
                                      entityset=es,
                                      features_only=True,
                                      agg_primitives=agg_primitives,
                                      trans_primitives=trans_primitives),
                               key=lambda x: x.unique_name())
    features_deserialized = sorted(ft.load_features(url,
                                                    profile_name=profile_name),
                                   key=lambda x: x.unique_name())
    assert_features(features_original, features_deserialized)
Example #35
    def get_feats(self, df):
        self.es.entity_from_dataframe('df', df.copy(), index='ID')

        print("把类别当做ID拆成新表")
        # 这只是单key聚合
        for v in self.es['df'].variables:
            if v.dtype == 'categorical':
                self.es.normalize_entity('df', f'df_{v.name}', v.name)

        # todo: 多key聚合(如果不支持,先组合成单key)或者 获得 重要类别的子集组合
        # 多类别交叉得到更细的分组统计信息

        self.es.plot()
        df_feats, _ = ft.dfs(entityset=self.es,
                             target_entity='df',
                             verbose=1,
                             max_depth=3,
                             n_jobs=3)
        return df_feats