def test_drift_detector_lightgbm(self):
    """Fit the default DriftDetector and verify its stratified train/test split is lossless."""
    df = load_bank()
    y = df.pop('y')
    X_train, X_test = train_test_split(df.copy(), train_size=0.7, shuffle=True, random_state=9527)

    detector = DriftDetector()
    detector.fit(X_train, X_test)
    assert len(detector.feature_names_) == 17
    assert len(detector.feature_importances_) == 17
    assert detector.auc_
    assert len(detector.estimator_) == 5

    proba = detector.predict_proba(df)
    assert proba.shape[0] == df.shape[0]

    # Re-split a fresh copy with the fitted detector and check shapes.
    df = load_bank()
    y = df.pop('y')
    n_test = int(df.shape[0] * 0.2)
    X_train, X_test, y_train, y_test = detector.train_test_split(df.copy(), y, test_size=0.2)
    assert X_train.shape == (df.shape[0] - n_test, df.shape[1])
    assert y_train.shape == (df.shape[0] - n_test,)
    assert X_test.shape == (n_test, df.shape[1])
    assert y_test.shape == (n_test,)

    # Recombining the two halves must yield exactly the original rows (order-insensitive).
    df['y'] = y
    X_train['y'] = y_train
    X_test['y'] = y_test
    recombined = pd.concat([X_train, X_test])
    orig_hash = hash_pandas_object(df).sort_values()
    split_hash = hash_pandas_object(recombined).sort_values()
    assert (orig_hash == split_hash).all()
def test_drift_detector_split(self):
    """Same drift-detector round-trip as the pandas case, but on dask dataframes."""
    df = dd.from_pandas(load_bank(), npartitions=2)
    y = df.pop('y')
    X_train, X_test = DaskToolBox.train_test_split(df.copy(), train_size=0.7, shuffle=True, random_state=9527)

    detector = dd_selector().get_detector()
    detector.fit(X_train, X_test)
    assert len(detector.feature_names_) == 17
    assert len(detector.feature_importances_) == 17
    assert detector.auc_
    assert len(detector.estimator_) == 5

    proba = detector.predict_proba(df)
    assert proba.compute().shape[0] == len(df)

    # Split a fresh copy, materialize everything, then check shapes.
    df = dd.from_pandas(load_bank(), npartitions=2)
    y = df.pop('y')
    n_test = int(len(df) * 0.2)
    X_train, X_test, y_train, y_test = detector.train_test_split(df.copy(), y, test_size=0.2,
                                                                 remain_for_train=0.)
    df, X_train, X_test, y_train, y_test = DaskToolBox.compute(df, X_train, X_test, y_train, y_test)
    assert X_train.shape == (df.shape[0] - n_test, df.shape[1])
    assert y_train.shape == (df.shape[0] - n_test,)
    assert X_test.shape == (n_test, df.shape[1])
    assert y_test.shape == (n_test,)

    # The two halves together must contain exactly the original rows.
    df['y'] = y
    X_train['y'] = y_train
    X_test['y'] = y_test
    recombined = pd.concat([X_train, X_test])
    orig_hash = hash_pandas_object(df).sort_values()
    split_hash = hash_pandas_object(recombined).sort_values()
    assert (orig_hash == split_hash).all()
def _create_bankdata_experiment(predefined_kwargs, maker=None, need_test=False, user_kwargs=None):
    """Build a PlainModel experiment over the first 2000 rows of the bank dataset.

    :param predefined_kwargs: base kwargs for the experiment; mutated in place and
        overridden by the fixed search-space/encoder settings below.
    :param maker: optional experiment factory; defaults to ``make_experiment(PlainModel, ...)``.
    :param need_test: when True, pass the held-out 30% split as ``test_data``.
    :param user_kwargs: optional extra kwargs applied last (highest precedence).
    """
    target = 'y'
    df = dsutils.load_bank().head(2000)
    df[target] = LabelEncoder().fit_transform(df[target])
    df_train, df_test = train_test_split(df, test_size=0.3, random_state=9527)

    def maker_(*args, **kwargs):
        return make_experiment(PlainModel, *args, **kwargs)

    default_kwargs = dict(log_level='info')
    predefined_kwargs.update(default_kwargs)
    if maker is None:
        maker = maker_
    predefined_kwargs['search_space'] = PlainSearchSpace(enable_lr=True, enable_nn=False,
                                                         enable_dt=False, enable_dtr=False)
    predefined_kwargs['hyper_model_options'] = {'transformer': MultiLabelEncoder}
    if need_test:
        predefined_kwargs['test_data'] = df_test
    # Fix: dict.update(None) raises TypeError when no user kwargs are supplied.
    if user_kwargs:
        predefined_kwargs.update(user_kwargs)
    return maker(df_train, target=target, task=const.TASK_BINARY, **predefined_kwargs)
def test_shift_score(self):
    """The sequential 'id' column must be flagged as strongly shifted between halves."""
    pdf = load_bank().head(1000)
    gdf = cudf.from_pandas(pdf)
    selector = dd_selector()
    scores = selector._covariate_shift_score(gdf[:700], gdf[700:])
    print('_covariate_shift_score', scores)
    assert scores['id'] >= 0.95
def test_datetime_encoder(self):
    """DatetimeEncoder should expand a datetime column and support custom extractors."""

    def is_holiday(x):
        # Marks May 1st-3rd as holidays (1/0 flags).
        holidays = {'0501', '0502', '0503'}
        return x.apply(lambda t: int(t.strftime('%m%d') in holidays))

    months = {'oct': 10, 'may': 5, 'apr': 4, 'jun': 6, 'feb': 2, 'aug': 8,
              'jan': 1, 'jul': 7, 'nov': 11, 'sep': 9, 'mar': 3, 'dec': 12}

    # Build a real datetime column from the bank data's month/day fields.
    df = dsutils.load_bank().sample(n=1000, random_state=9527)
    df['year'] = 2000
    df['month'] = df['month'].apply(lambda s: months[s])
    df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

    # Default extraction: month/day kept, constant hour/minute dropped.
    encoder = skex.DatetimeEncoder()
    X = encoder.fit_transform(df)
    columns = X.columns.to_list()
    assert 'date' not in columns
    assert all(c in columns for c in ['date_month', 'date_day'])
    assert all(c not in columns for c in ['date_hour', 'date_minute'])

    # Extended extraction: timestamp plus the custom holiday flag, constants kept.
    encoder = skex.DatetimeEncoder(include=skex.DatetimeEncoder.default_include + ['timestamp'],
                                   extra=[('holiday', is_holiday)],
                                   drop_constants=False)
    X = encoder.fit_transform(df)
    columns = X.columns.to_list()
    assert 'date' not in columns
    assert all(c in columns for c in ['date_holiday', 'date_timestamp'])
def test_feature_selection(self):
    """Selector should keep 15/16 features depending on shift-variable removal."""
    gdf = cudf.from_pandas(load_bank())
    y = gdf.pop('y')
    split_at = int(gdf.shape[0] * 0.8)
    X_train, X_test = gdf[:split_at], gdf[split_at:]

    # Without shift-variable removal: pruned down to the minimum of 15 features.
    selector = dd_selector(remove_shift_variable=False,
                           auc_threshold=0.55,
                           min_features=15,
                           remove_size=0.2)
    remain_features, history, scores = selector.select(X_train, X_test, copy_data=True)
    assert len(remain_features) == 15

    # With shift-variable removal: one extra feature survives.
    selector = dd_selector(remove_shift_variable=True,
                           auc_threshold=0.55,
                           min_features=15,
                           remove_size=0.2)
    remain_features, history, scores = selector.select(X_train, X_test, copy_data=True)
    assert len(remain_features) == 16
def test_collinear():
    """Hierarchical clustering of the Spearman correlation should pick one feature per cluster."""
    df = dsutils.load_bank().head(10000)
    df.pop('y')
    df.drop(['id'], axis=1, inplace=True)

    # Cluster features on rank correlation, then cut the dendrogram at distance 1.
    corr = spearmanr(df).correlation
    linkage = hierarchy.ward(corr)
    cluster_ids = hierarchy.fcluster(linkage, 1, criterion='distance')

    members = defaultdict(list)
    for idx, cid in enumerate(cluster_ids):
        members[cid].append(idx)

    # First feature of each cluster represents that cluster.
    selected_features = [df.columns[v[0]] for v in members.values()]
    assert selected_features == [
        'age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
        'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'poutcome'
    ]
def load_data():
    """Return the first 3000 bank rows, label-encoded, with the 'id' column dropped."""
    set_random_state(9527)
    encoded = MultiLabelEncoder().fit_transform(dsutils.load_bank().head(3000))
    encoded.drop(['id'], axis=1, inplace=True)
    return encoded
def test_cache_dask():
    """Repeated fits of CachedDaskMultiLabelEncoder must produce identical output via the cache."""
    clear()
    counters = CachedDaskMultiLabelEncoder.cache_counter
    df = dd.from_pandas(dsutils.load_bank(), npartitions=2)

    # Reference encoding (uncached) to compare hashes against.
    reference = dex.SafeOrdinalEncoder().fit_transform(df.copy())
    hasher = dex.DaskToolBox.data_hasher()

    def check_counters():
        # Two cache entries total: one store (miss) + one apply (hit), in either split.
        assert counters.enter_counter.value == 2
        assert counters.apply_counter.value <= 2
        assert counters.store_counter.value <= 2
        assert counters.apply_counter.value + counters.store_counter.value == 2

    counters.reset()
    X1 = CachedDaskMultiLabelEncoder().fit_transform(df.copy())
    X2 = CachedDaskMultiLabelEncoder().fit_transform(df.copy())
    assert hasher(reference) == hasher(X1) == hasher(X2)
    check_counters()

    counters.reset()
    X3 = CachedDaskMultiLabelEncoder().fit_transform_as_array(df.copy())
    X4 = CachedDaskMultiLabelEncoder().fit_transform_as_array(df.copy())
    assert hasher(X3) == hasher(X4)
    check_counters()
def setup_class(cls):
    """Prepare the shared bank/movielens fixtures and the working directory."""
    from sklearn.preprocessing import LabelEncoder

    bank = dsutils.load_bank()
    bank['y'] = LabelEncoder().fit_transform(bank['y'])
    cls.bank_data = bank
    cls.movie_lens = dsutils.load_movielens()
    os.makedirs(cls.work_dir)
def test_shift_score(self):
    """The sequential 'id' column is near-perfectly separable between dataset halves."""
    ddf = dd.from_pandas(load_bank().head(1000), npartitions=2)
    selector = dd_selector()
    first_half = DaskToolBox.select_df(ddf, np.arange(700))
    second_half = DaskToolBox.select_df(ddf, np.arange(700, 1000))
    scores = selector._covariate_shift_score(first_half, second_half)
    assert scores['id'] > 0.99
def setup_class(cls):
    """Prepare matching pandas and cudf copies of the encoded bank dataset."""
    from sklearn.preprocessing import LabelEncoder

    pdf = dsutils.load_bank()
    pdf['y'] = LabelEncoder().fit_transform(pdf['y'])
    pdf['education'] = LabelEncoder().fit_transform(pdf['education'])
    cls.df = pdf
    cls.cf = cudf.from_pandas(pdf)
def test_dataframe_fs(self):
    """Store/load a DataFrame through the filesystem abstraction and verify round-trip."""
    file_path = f'/{type(self).__name__}/test_df_fs.parquet'
    df = dsutils.load_bank()

    p.store(df, file_path, filesystem=fs)
    assert fs.exists(file_path)

    restored = p.load(file_path, filesystem=fs)
    assert self.is_same_df(df, restored)
def test_shufflesplit(self):
    """A single StratifiedShuffleSplit with test_size=0.3 yields a 700/300 split."""
    df = load_bank().head(1000)
    y = df.pop('y')

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3)
    indices = [(train_idx, test_idx) for train_idx, test_idx in splitter.split(df, y)]

    assert len(indices) == 1
    train_idx, test_idx = indices[0]
    assert len(train_idx) == 700
    assert len(test_idx) == 300
def test_series(self):
    """A stored Series round-trips with its name, length, and values intact."""
    file_path = f'{test_output_dir}/{type(self).__name__}/test_series.parquet'
    df = dsutils.load_bank()

    p.store(df['age'], file_path)
    assert path.exists(file_path)

    restored = p.load(file_path)
    assert isinstance(restored, pd.Series)
    assert restored.name == 'age'
    assert len(restored) == len(df)
    assert all(restored == df['age'])
def test_ndarray(self):
    """A stored ndarray round-trips with its shape and column values intact."""
    file_path = f'{test_output_dir}/{type(self).__name__}/test_ndarray.parquet'
    df = dsutils.load_bank()

    p.store(df.values, file_path)
    assert path.exists(file_path)

    restored = p.load(file_path)
    assert isinstance(restored, np.ndarray)
    assert restored.shape == df.shape

    # Rebuild a frame to compare a representative column element-wise.
    rebuilt = pd.DataFrame(restored, columns=df.columns)
    assert all(rebuilt['y'] == df['y'])
def test_feature_tools_transformer(self):
    """Fitting and transforming with numeric primitives should produce a result."""
    df = dsutils.load_bank()
    df.drop(['id'], axis=1, inplace=True)
    df.pop('y')
    X_train, X_test = train_test_split(df.head(100), test_size=0.2, random_state=42)

    transformer = FeatureGenerationTransformer(task='binary',
                                               trans_primitives=['add_numeric', 'divide_numeric'])
    transformer.fit(X_train)
    transformed = transformer.transform(X_train)
    assert transformed is not None
def setup_class(cls):
    """Prepare preprocessed bank data in both pandas and cudf form, plus the work dir."""
    bank = dsutils.load_bank()
    bank = get_tool_box(bank).general_preprocessor(bank).fit_transform(bank)
    cls.bank_data = bank
    cls.bank_data_cudf = cudf.from_pandas(bank)
    os.makedirs(cls.work_dir)
def experiment_with_bank_data(init_kwargs, run_kwargs, row_count=3000, with_dask=False):
    """Run a CompeteExperiment on the bank dataset and sanity-check the resulting estimator.

    :param init_kwargs: extra CompeteExperiment constructor kwargs (override defaults).
    :param run_kwargs: extra run() kwargs (override max_trials default of 3).
    :param row_count: optional row cap on the dataset; None keeps all rows.
    :param with_dask: when True, run everything through dask dataframes.
    """
    hyper_model = create_plain_model(with_encoder=True, with_dask=with_dask)

    X = dsutils.load_bank()
    if row_count is not None:
        X = X.head(row_count)
    X['y'] = LabelEncoder().fit_transform(X['y'])
    if with_dask:
        setup_dask(None)
        X = dd.from_pandas(X, npartitions=1)
    y = X.pop('y')

    tb = get_tool_box(X, y)
    scorer = tb.metrics.metric_to_scoring(hyper_model.reward_metric)
    # train / test, then carve an eval split out of the train part.
    X_train, X_test, y_train, y_test = tb.train_test_split(X, y, test_size=0.3, random_state=9527)
    X_train, X_eval, y_train, y_eval = tb.train_test_split(X_train, y_train, test_size=0.3,
                                                           random_state=9527)

    # Caller-supplied kwargs take precedence over these defaults.
    init_kwargs = {'X_eval': X_eval, 'y_eval': y_eval, 'X_test': X_test, 'scorer': scorer,
                   'ensemble_size': 0, 'drift_detection': False, **init_kwargs}
    run_kwargs = {'max_trials': 3, **run_kwargs}

    experiment = CompeteExperiment(hyper_model, X_train, y_train, **init_kwargs)
    estimator = experiment.run(**run_kwargs)
    assert estimator

    preds = estimator.predict(X_test)
    proba = estimator.predict_proba(X_test)
    if with_dask:
        preds, proba = tb.to_local(preds, proba)
    score = tb.metrics.calc_score(y_test, preds, proba,
                                  metrics=['auc', 'accuracy', 'f1', 'recall', 'precision'])
    print('evaluate score:', score)
    assert score
def test_dataframe(self):
    """Stored parquet is readable by both plain pandas and the persistence utility."""
    file_path = f'{test_output_dir}/{type(self).__name__}/test_df.parquet'
    df = dsutils.load_bank()

    p.store(df, file_path)
    assert path.exists(file_path)

    # Readable by pandas directly...
    assert self.is_same_df(df, pd.read_parquet(file_path))
    # ...and by our own loader.
    assert self.is_same_df(df, p.load(file_path))
def test_drift_detector_fit_randomforest(self):
    """DriftDetector also works with a user-supplied RandomForest estimator."""
    df = load_bank().head(10000)
    df.pop('y')
    X_train, X_test = train_test_split(df, train_size=0.7, shuffle=True, random_state=9527)

    estimator = RandomForestClassifier(min_samples_leaf=20, min_impurity_decrease=0.01)
    detector = DriftDetector(estimator=estimator)
    detector.fit(X_train, X_test)

    assert len(detector.feature_names_) == 17
    assert len(detector.feature_importances_) == 17
    assert detector.auc_
    assert len(detector.estimator_) == 5
def test_in_dataframe_mapper(self):
    """FeatureGenerationTransformer composes inside a DataFrameMapper."""
    df = dsutils.load_bank()
    df.drop(['id'], axis=1, inplace=True)
    X_train, X_test = train_test_split(df.head(100), test_size=0.2, random_state=42)

    generator = FeatureGenerationTransformer(task='binary',
                                             trans_primitives=['cross_categorical'],
                                             categories_cols=column_object_category_bool(X_train))
    mapper = DataFrameMapper(features=[(X_train.columns.to_list(), generator)],
                             input_df=True, df_out=True)
    result = mapper.fit_transform(X_train)
    assert result.shape == (80, 62)
def test_experiment_with_data_adaption():
    """A memory limit below the dataset size must activate the data_adaption step."""
    df = MultiLabelEncoder().fit_transform(dsutils.load_bank())
    mem_usage = int(df.memory_usage().sum())

    # Force adaption by capping memory at half the frame's footprint.
    experiment = make_experiment(PlainModel, df,
                                 target='y',
                                 search_space=PlainSearchSpace(),
                                 data_adaption_memory_limit=mem_usage // 2,
                                 log_level='info')
    estimator = experiment.run(max_trials=3)

    assert estimator is not None
    assert estimator.steps[0][0] == 'data_adaption'
def setup_class(cls):
    """Prepare pandas and cudf fixtures for the bank, blood, and movielens datasets."""
    from sklearn.preprocessing import LabelEncoder

    bank = dsutils.load_bank()
    bank['y'] = LabelEncoder().fit_transform(bank['y'])  # binary task target
    bank['education'] = LabelEncoder().fit_transform(bank['education'])  # multiclass task target
    cls.bank_data = bank
    cls.bank_data_cudf = cudf.from_pandas(bank)

    # NOTE(review): attribute is named 'boston' but actually holds the blood dataset — confirm.
    cls.boston_data = dsutils.load_blood()
    cls.boston_data_cudf = cudf.from_pandas(cls.boston_data)

    cls.movie_lens = dsutils.load_movielens()
    os.makedirs(cls.work_dir)
def test_pipeline(self):
    """Feature generation chains with the general preprocessor in an sklearn Pipeline."""
    df = dsutils.load_bank()
    df.drop(['id'], axis=1, inplace=True)
    X_train, X_test = train_test_split(df.head(100), test_size=0.2, random_state=42)

    generator = FeatureGenerationTransformer(task='binary',
                                             trans_primitives=['cross_categorical'],
                                             categories_cols=column_object_category_bool(X_train))
    pipe = Pipeline(steps=[('feature_gen', generator),
                           ('processor', general_preprocessor())])
    result = pipe.fit_transform(X_train)
    print(result.columns)
    assert result.shape == (80, 62)
def test_pipeline(self):
    """The dask variant of the feature-generation + preprocessing pipeline."""
    df = dsutils.load_bank()
    df.drop(['id'], axis=1, inplace=True)
    ddf = dd.from_pandas(df.head(100), npartitions=2)

    tb = get_tool_box(ddf)
    generator = tb.transformers['FeatureGenerationTransformer'](
        task='binary',
        trans_primitives=['cross_categorical'],
        categories_cols=tb.column_selector.column_object_category_bool(ddf))
    pipe = Pipeline(steps=[('feature_gen', generator),
                           ('processor', tb.general_preprocessor(ddf))])
    result = pipe.fit_transform(ddf).compute()
    assert result.shape[1] == 62
def test_feature_generation_with_selection(self):
    """Feature selection inside FeatureGenerationTransformer requires a target `y`."""
    df = dsutils.load_bank().head(1000)
    df.drop(['id'], axis=1, inplace=True)
    y = df.pop('y')

    ftt = FeatureGenerationTransformer(
        task='binary',
        trans_primitives=['add_numeric', 'divide_numeric', 'cross_categorical'],
        categories_cols=column_object_category_bool(df),
        feature_selection_args={'ratio_select_cols': 0.2})

    # Fitting without y must fail with the documented message.
    with pytest.raises(AssertionError) as err:
        ftt.fit(df)
    # Fix: err.value is the exception object, not its message — compare via str().
    assert str(err.value) == '`y` must be provided for feature selection.'

    # With y the fit succeeds and selection keeps 20% of the generated columns.
    ftt.fit(df, y)
    x_t = ftt.transform(df)
    assert x_t.shape[1] == 35
def test_feature_tools_categorical_cross(self):
    """Every unordered pair of categorical columns must appear as a crossed feature."""
    df = dsutils.load_bank()
    df.drop(['id'], axis=1, inplace=True)
    X_train, X_test = train_test_split(df.head(100), test_size=0.2, random_state=42)

    cat_cols = column_object_category_bool(X_train)
    transformer = FeatureGenerationTransformer(task='binary',
                                               trans_primitives=['cross_categorical'],
                                               categories_cols=cat_cols)
    transformer.fit(X_train)
    columns = set(transformer.transform(X_train).columns.to_list())

    # The crossed column may carry the pair in either order.
    for left, right in itertools.combinations(cat_cols, 2):
        assert f'CROSS_CATEGORICAL_{left}__{right}' in columns \
               or f'CROSS_CATEGORICAL_{right}__{left}' in columns
def test_feature_selection(self):
    """FeatureSelectionTransformer honors n_max_cols, reserved_cols, and regression mode."""
    df = self.bank_data.copy()
    y = df.pop('y')

    # Classification: cap at 8 selected columns, always keep the reserved ones.
    reserved_cols = ['age', 'poutcome', 'id']
    fse = skex.FeatureSelectionTransformer('classification', 10000, 10000, 10,
                                           n_max_cols=8,
                                           reserved_cols=reserved_cols)
    fse.fit(df, y)
    assert len(fse.scores_.items()) == 10
    assert len(fse.columns_) == 11
    assert len(set(reserved_cols) - set(fse.columns_)) == 0

    transformed = fse.transform(df)
    assert transformed.columns.to_list() == fse.columns_

    # Regression against 'age': all 17 features scored, 10 kept.
    df = dsutils.load_bank()
    y = df.pop('age')
    fse = skex.FeatureSelectionTransformer('regression', 10000, 10000, -1)
    fse.fit(df, y)
    assert len(fse.scores_.items()) == 17
    assert len(fse.columns_) == 10
def test_feature_selection(self):
    """Generated features can be pruned back to 20% while keeping the originals reserved."""
    df = dsutils.load_bank().head(1000)
    df.drop(['id'], axis=1, inplace=True)
    y = df.pop('y')

    generator = FeatureGenerationTransformer(
        task='binary',
        trans_primitives=['add_numeric', 'divide_numeric', 'cross_categorical'],
        categories_cols=column_object_category_bool(df))
    generator.fit(df)
    generated = generator.transform(df)

    selector = FeatureSelectionTransformer('binary',
                                           ratio_select_cols=0.2,
                                           reserved_cols=generator.original_cols)
    selector.fit(generated, y)
    assert len(selector.scores_.items()) == 99
    assert len(selector.columns_) == 35

    selected = selector.transform(generated)
    assert selected.shape[1] == 35