Example #1
0
    def test_select_dtypes_include_using_list_like(self):
        """Check each column selector against a frame with one column per dtype."""
        frame = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        # (selector, expected column names); the selectors are called in
        # exactly this order.
        expectations = [
            (column_all,
             ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']),
            (column_number, ['b', 'c', 'd', 'k']),
            (column_number_exclude_timedelta, ['b', 'c', 'd']),
            (column_object_category_bool, ['a', 'e', 'f']),
            (column_object, ['a']),
            (column_category, ['f']),
            (column_bool, ['e']),
            (column_timedelta, ['k']),
            (column_datetimetz, ['h', 'i']),
            (column_datetime, ['g']),
            (column_all_datetime, ['g', 'h', 'i']),
        ]
        for select, expected in expectations:
            assert select(frame) == expected

        # This selector takes an extra positional argument (0.5 here), so it
        # is checked outside the table.
        assert column_skewness_kurtosis(frame, 0.5) == ['b', 'c', 'd']
Example #2
0
 def test_in_dataframe_mapper(self):
     """Run the feature generator inside a DataFrameMapper and check the output shape."""
     bank = dsutils.load_bank()
     bank.drop(['id'], axis=1, inplace=True)
     # Only the training part of the split is used below.
     train_part, _ = train_test_split(bank.head(100),
                                      test_size=0.2,
                                      random_state=42)
     generator = FeatureGenerationTransformer(
         task='binary',
         trans_primitives=['cross_categorical'],
         categories_cols=column_object_category_bool(train_part))
     mapper = DataFrameMapper(
         features=[(train_part.columns.to_list(), generator)],
         input_df=True,
         df_out=True)
     transformed = mapper.fit_transform(train_part)
     assert transformed.shape == (80, 62)
Example #3
0
 def test_pipeline(self):
     """Chain the feature generator with the general preprocessor and check the output shape."""
     bank = dsutils.load_bank()
     bank.drop(['id'], axis=1, inplace=True)
     # Only the training part of the split is used below.
     train_part, _ = train_test_split(bank.head(100),
                                      test_size=0.2,
                                      random_state=42)
     generator = FeatureGenerationTransformer(
         task='binary',
         trans_primitives=['cross_categorical'],
         categories_cols=column_object_category_bool(train_part))
     preprocessor = general_preprocessor()
     steps = [('feature_gen', generator), ('processor', preprocessor)]
     pipeline = Pipeline(steps=steps)
     transformed = pipeline.fit_transform(train_part)
     print(transformed.columns)
     assert transformed.shape == (80, 62)
Example #4
0
 def test_feature_generation_with_selection(self):
     """Fitting with feature selection enabled must fail without `y` and succeed with it.

     With ``feature_selection_args`` set, ``fit(df)`` is expected to raise an
     ``AssertionError`` carrying a specific message; ``fit(df, y)`` followed
     by ``transform(df)`` is expected to yield 35 columns.
     """
     df = dsutils.load_bank().head(1000)
     df.drop(['id'], axis=1, inplace=True)
     y = df.pop('y')
     ftt = FeatureGenerationTransformer(
         task='binary',
         trans_primitives=[
             'add_numeric', 'divide_numeric', 'cross_categorical'
         ],
         categories_cols=column_object_category_bool(df),
         feature_selection_args={'ratio_select_cols': 0.2})
     # Keep only the raising call inside the context manager: statements
     # placed after it in the block are skipped once the exception fires.
     with pytest.raises(AssertionError) as err:
         ftt.fit(df)
     # err.value is the exception instance, so compare its text rather than
     # the object itself.
     assert str(err.value) == '`y` must be provided for feature selection.'
     ftt.fit(df, y)
     x_t = ftt.transform(df)
     assert x_t.shape[1] == 35
Example #5
0
 def test_feature_tools_categorical_cross(self):
     """Every pair of categorical columns must produce a cross feature, in either name order."""
     bank = dsutils.load_bank()
     bank.drop(['id'], axis=1, inplace=True)
     # Only the training part of the split is used below.
     train_part, _ = train_test_split(bank.head(100),
                                      test_size=0.2,
                                      random_state=42)
     cat_cols = column_object_category_bool(train_part)
     generator = FeatureGenerationTransformer(
         task='binary',
         trans_primitives=['cross_categorical'],
         categories_cols=cat_cols)
     generator.fit(train_part)
     generated = set(generator.transform(train_part).columns.to_list())
     # Walk each unordered pair once: every column against those after it.
     for pos, left in enumerate(cat_cols):
         for right in cat_cols[pos + 1:]:
             forward = f'CROSS_CATEGORICAL_{left}__{right}'
             backward = f'CROSS_CATEGORICAL_{right}__{left}'
             assert forward in generated or backward in generated
Example #6
0
    def test_feature_selection(self):
        """Generate features, then select a subset with FeatureSelectionTransformer and check the counts."""
        bank = dsutils.load_bank().head(1000)
        bank.drop(['id'], axis=1, inplace=True)
        target = bank.pop('y')

        primitives = ['add_numeric', 'divide_numeric', 'cross_categorical']
        generator = FeatureGenerationTransformer(
            task='binary',
            trans_primitives=primitives,
            categories_cols=column_object_category_bool(bank))
        generator.fit(bank)
        generated = generator.transform(bank)

        # The generator's original columns are passed as reserved_cols.
        selector = FeatureSelectionTransformer(
            'binary',
            ratio_select_cols=0.2,
            reserved_cols=generator.original_cols)
        selector.fit(generated, target)
        assert len(selector.scores_.items()) == 99
        assert len(selector.columns_) == 35

        selected = selector.transform(generated)
        assert selected.shape[1] == 35
Example #7
0
def get_x_data_character(X_train, get_step):
	"""Count the columns of ``X_train`` per feature kind.

	The datetime, categorical, numeric and text counts come from the
	``col_se`` column selectors. When the ``'feature_generation'`` step
	exposes ``transformer_kwargs`` with ``'text_cols'`` and
	``'latlong_cols'`` entries, non-``None`` values there override the text
	count and supply the location count, and the total column count is
	adjusted to match.

	Args:
		X_train: Data passed to the ``col_se`` column selectors.
		get_step: Callable invoked as ``get_step('feature_generation')``;
			the result is expected to expose ``transformer_kwargs``.

	Returns:
		dict with ``'experimentType'`` set to ``'compete'`` and a
		``'featureDistribution'`` dict holding ``nContinuous``, ``nText``,
		``nDatetime``, ``nCategorical``, ``nLocation`` and ``nOthers``.
	"""
	cnt_x_all = len(col_se.column_all(X_train))
	cnt_x_date = len(col_se.column_all_datetime(X_train))
	cnt_x_category = len(col_se.column_object_category_bool(X_train))
	cnt_x_num = len(col_se.column_number(X_train))
	cnt_detected_text = len(col_se.column_text(X_train))

	# Fallback values, kept whenever the step's kwargs cannot be read.
	cnt_x_text = cnt_detected_text
	cnt_x_latlong = 0

	try:
		kwargs = get_step('feature_generation').transformer_kwargs
		text_cols = kwargs['text_cols']
		latlong_cols = kwargs['latlong_cols']
		cnt_cfg_text = cnt_detected_text if text_cols is None else len(text_cols)
		cnt_cfg_latlong = 0 if latlong_cols is None else len(latlong_cols)
	except Exception:
		# Best effort: a missing step, missing kwargs or unsized values just
		# leave the fallbacks in place. Exception (not a bare except) lets
		# KeyboardInterrupt and SystemExit propagate.
		pass
	else:
		# Commit only after every lookup succeeded, so a late failure can no
		# longer leave cnt_x_all adjusted while cnt_x_text is reset.
		cnt_x_all += (cnt_cfg_text - cnt_detected_text) + cnt_cfg_latlong
		cnt_x_text = cnt_cfg_text
		cnt_x_latlong = cnt_cfg_latlong

	# Whatever no selector or configured column list accounted for.
	cnt_x_others = (cnt_x_all - cnt_x_date - cnt_x_category - cnt_x_num
					- cnt_x_text - cnt_x_latlong)

	return {
		'experimentType': 'compete',
		'featureDistribution': {
			'nContinuous': cnt_x_num,
			'nText': cnt_x_text,
			'nDatetime': cnt_x_date,
			'nCategorical': cnt_x_category,
			'nLocation': cnt_x_latlong,
			'nOthers': cnt_x_others,
		},
	}