def test_classifier(self):
    """Each text-decomposition transformer should train end-to-end on the
    titanic ``Name`` column (tokenize -> decompose -> random forest) and
    reach accuracy above 0.6."""
    train_df = datasets.load("titanic")[["Name", "Survived"]]
    labels = np.array(train_df.pop("Survived"))
    X_train, X_test, y_train, y_test = train_test_split(
        train_df, labels, test_size=0.2, random_state=0)
    # Wrap the raw splits in the project's container types.
    X_train = DataFrameContainer(
        "TrainSet", dataset_instance=X_train,
        resource_manager=self.mock_resource_manager)
    X_test = DataFrameContainer(
        "TestSet", dataset_instance=X_test,
        resource_manager=self.mock_resource_manager)
    y_train = NdArrayContainer(
        "TrainLabel", dataset_instance=y_train,
        resource_manager=self.mock_resource_manager)
    y_test = NdArrayContainer(
        "TestLabel", dataset_instance=y_test,
        resource_manager=self.mock_resource_manager)
    X_train.set_feature_groups(["text"])
    X_test.set_feature_groups(["text"])
    for transformer_cls in (TsvdTransformer, NmfTransformer, LsiTransformer,
                            LdaTransformer, RpTransformer):
        print("=========================")
        print(transformer_cls.__name__)
        print("=========================")
        tokenizer = SimpleTokenlizer(**get_default_hp_of_cls(SimpleTokenlizer))
        tokenizer.in_feature_groups = "text"
        tokenizer.out_feature_groups = "token"
        transformer = transformer_cls(**get_default_hp_of_cls(transformer_cls))
        transformer.in_feature_groups = "token"
        transformer.out_feature_groups = "num"
        classifier = RandomForestClassifier(
            **get_default_hp_of_cls(RandomForestClassifier))
        pipeline = ML_Workflow(
            [("tokenizer", tokenizer),
             ("transformer", transformer),
             ("classifier", classifier)],
            resource_manager=self.mock_resource_manager)
        start = time()
        pipeline.fit(X_train, y_train, X_test, y_test)
        y_pred = pipeline.predict(X_test)
        score = accuracy_score(y_test.data, y_pred)
        elapsed = time() - start
        print("score:", score)
        print("time:", elapsed)
        self.assertGreater(score, 0.6)
        print('\n' * 2)
def test_procedure(self):
    """Every numeric scaler should plug into an ML_Workflow ahead of a
    LinearSVR and fit/predict on the regression fixtures without error."""
    scaler_classes = [
        MinMaxScaler,
        StandardScaler,
        Normalizer,
        QuantileTransformer,
        RobustScaler,
        KeepGoing,
        # WOEEncoder,  # regression is not supported
    ]
    for scaler_cls in scaler_classes:
        print("=========================")
        print(scaler_cls.__name__)
        print("=========================")
        # KeepGoing exposes no tunable hyper-parameters.
        hp = {} if scaler_cls == KeepGoing else get_default_hp_of_cls(scaler_cls)
        start = time()
        workflow = ML_Workflow(
            steps=[
                ("scaler", scaler_cls(in_feature_groups="num",
                                      out_feature_groups="scaled", **hp)),
                ("rf", LinearSVR(random_state=0)),
            ],
            resource_manager=self.mock_resource_manager)
        workflow.fit(X_train=self.X_train, X_valid=self.X_test,
                     y_train=self.y_train, y_valid=self.y_test)
        y_pred = workflow.predict(self.X_test)
        score = r2_score(self.y_test.data, y_pred)
        print("r2 = ", score)
        print("time = ", time() - start)
        print("\n" * 2)
def test_procedure(self):
    """Each categorical encoder should feed a RandomForestRegressor inside
    an ML_Workflow and produce predictions on the held-out split."""
    for encoder_cls in (
            EntityEncoder,
            TargetEncoder,
            BinaryEncoder,
            CatBoostEncoder,
            OrdinalEncoder,
            LeaveOneOutEncoder,
            OneHotEncoder,
            # WOEEncoder,  # regression is not supported
    ):
        print("=========================")
        print(encoder_cls.__name__)
        print("=========================")
        start = time()
        hp = get_default_hp_of_cls(encoder_cls)
        workflow = ML_Workflow(
            steps=[
                ("encoder", encoder_cls(in_feature_groups="cat",
                                        out_feature_groups="num", **hp)),
                ("rf", RandomForestRegressor(random_state=0)),
            ],
            resource_manager=self.mock_resource_manager)
        workflow.fit(X_train=self.X_train, X_valid=self.X_test,
                     y_train=self.y_train, y_valid=self.y_test)
        y_pred = workflow.predict(self.X_test)
        score = r2_score(self.y_test.data, y_pred)
        print("r2 = ", score)
        print("time = ", time() - start)
        print("\n" * 2)
def test(self):
    """Dimensionality reducers must honour ``_n_components__sp1_ratio``:
    the reduced width has to match ``calc_sp1`` exactly (delta=0)."""
    for reducer_cls in (PCA, FastICA, KernelPCA):
        print("=========================")
        print(reducer_cls.__name__)
        print("=========================")
        for idx in [1]:
            for ratio in (0, 0.25, 0.5, 1):
                hp = get_default_hp_of_cls(reducer_cls)
                hp.update({
                    "in_feature_groups": "num",
                    "out_feature_groups": "reduced",
                    "_n_components__sp1_ratio": ratio,
                })
                start = time()
                reducer = reducer_cls(**hp)
                X = self.Xs[idx]
                y = self.ys[idx]
                reduced = reducer.fit_transform(X, y)["X_train"]
                # delta=0: the component count must match exactly.
                assert very_close(reduced.shape[1],
                                  self.calc_sp1(X.shape, ratio), delta=0)
                print("consuming time :", time() - start)
                print("assign ratio :", ratio)
                print("actual ratio :", reduced.shape[1] / X.shape[1])
                print("origin shape :", X.shape)
                print("actual shape :", reduced.shape)
                print("\n" * 2)
def test_under_sample(self):
    """Every under-sampling balancer should rebalance y_train (verified via
    the stored intermediate result) and still let LinearSVC score > 0.6."""
    sampler_classes = [
        AllKNN,
        ClusterCentroids,
        CondensedNearestNeighbour,
        EditedNearestNeighbours,
        InstanceHardnessThreshold,
        NearMiss,
        NeighbourhoodCleaningRule,
        OneSidedSelection,
        RandomUnderSampler,
        RepeatedEditedNearestNeighbours,
        TomekLinks,
    ]
    for sampler_cls in sampler_classes:
        print("=========================")
        print(sampler_cls.__name__)
        print("=========================")
        balancer = sampler_cls(**get_default_hp_of_cls(sampler_cls))
        classifier = LinearSVC(**get_default_hp_of_cls(LinearSVC))
        # Storing intermediate results lets us fetch the balanced labels.
        pipeline = ML_Workflow(
            [("balancer", balancer),
             ("classifier", classifier)],
            resource_manager=self.mock_resource_manager,
            should_store_intermediate_result=True)
        start = time()
        pipeline.fit(self.X_train, self.y_train, self.X_test, self.y_test)
        balanced_y_train = NdArrayContainer(
            dataset_id=pipeline.intermediate_result["balancer"]["y_train"],
            resource_manager=self.mock_resource_manager)
        print("y_train:")
        print(Counter(self.y_train.data))
        print("balanced y_train:")
        print(Counter(balanced_y_train.data))
        y_pred = pipeline.predict(self.X_test)
        score = accuracy_score(self.y_test.data, y_pred)
        elapsed = time() - start
        print("score:", score)
        print("time:", elapsed)
        self.assertGreater(score, 0.6)
        print('\n' * 2)
def test_over_sample(self):
    """Every over-sampling balancer should rebalance y_train (verified via
    the stored intermediate result) and still let LinearSVC score > 0.6."""
    sampler_classes = [
        RandomOverSampler,
        # ADASYN,
        BorderlineSMOTE,
        KMeansSMOTE,
        SMOTE,
        SVMSMOTE,
    ]
    for sampler_cls in sampler_classes:
        print("=========================")
        print(sampler_cls.__name__)
        print("=========================")
        balancer = sampler_cls(**get_default_hp_of_cls(sampler_cls))
        classifier = LinearSVC(**get_default_hp_of_cls(LinearSVC))
        # Storing intermediate results lets us fetch the balanced labels.
        pipeline = ML_Workflow(
            [("balancer", balancer),
             ("classifier", classifier)],
            resource_manager=self.mock_resource_manager,
            should_store_intermediate_result=True)
        start = time()
        pipeline.fit(self.X_train, self.y_train, self.X_test, self.y_test)
        balanced_y_train = NdArrayContainer(
            dataset_id=pipeline.intermediate_result["balancer"]["y_train"],
            resource_manager=self.mock_resource_manager)
        print("y_train:")
        print(Counter(self.y_train.data))
        print("balanced y_train:")
        print(Counter(balanced_y_train.data))
        y_pred = pipeline.predict(self.X_test)
        score = accuracy_score(self.y_test.data, y_pred)
        elapsed = time() - start
        print("score:", score)
        print("time:", elapsed)
        self.assertGreater(score, 0.6)
        print('\n' * 2)
def test_io(self):
    """Numeric transformers must emit a frame whose feature groups are all
    remapped to "final" while the index and columns are preserved."""
    for transformer_cls in (PowerTransformer, QuantileTransformer, KeepGoing):
        transformer = transformer_cls(**get_default_hp_of_cls(transformer_cls))
        transformer.in_feature_groups = "num"
        transformer.out_feature_groups = "final"
        result = transformer.fit_transform(
            X_train=self.X_train, X_valid=self.X_test,
            y_train=self.y_train)["X_train"]
        assert np.all(result.feature_groups == "final")
        assert np.all(result.index == self.index)
        assert np.all(result.columns == self.X_train.columns)
def test_handle_unknown(self):
    """Encoders must cope with categories in the validation split that never
    appeared during fit, and ``transform`` must reproduce ``fit_transform``'s
    output on both splits."""
    train_frame = pd.DataFrame(
        [['A', 'alpha', 9],
         ['A', 'alpha', 1],
         ['B', 'beta', 2],
         ['B', 'beta', 3],
         ['C', 'gamma', 4],
         ['C', 'gamma', 5]],
        columns=['col1', 'col2', 'col3'])
    # Every category here is unseen at fit time.
    valid_frame = pd.DataFrame(
        [['D', 'kappa', 6],
         ['D', 'kappa', 6],
         ['E', 'sigma', 7],
         ['E', 'sigma', 7],
         ['F', 'mu', 8],
         ['F', 'mu', 8]],
        columns=['col1', 'col2', 'col3'])
    X_train = DataFrameContainer(dataset_instance=train_frame)
    X_valid = DataFrameContainer(dataset_instance=valid_frame)
    X_train.set_feature_groups(['cat'] * 3)
    X_valid.set_feature_groups(['cat'] * 3)
    y_train = NdArrayContainer(dataset_instance=[0, 1, 0, 1, 0, 1])
    for encoder_cls in (EntityEncoder, OrdinalEncoder, OneHotEncoder,
                        TargetEncoder, CatBoostEncoder):
        encoder = encoder_cls(**get_default_hp_of_cls(encoder_cls))
        encoder.in_feature_groups = "cat"
        encoder.out_feature_groups = "ordinal"
        result = encoder.fit_transform(
            X_train=X_train, X_valid=X_valid, y_train=y_train)
        # transform() on either split must agree with the fit-time output.
        assert np.all(
            encoder.transform(X_train)['X_train'].data
            == result['X_train'].data)
        assert np.all(
            encoder.transform(X_valid)['X_train'].data
            == result['X_valid'].data)
def test_classifier(self):
    """Iterative classifiers should early-stop on the digits dataset.

    For each estimator: the final test-set score must equal the best value
    in ``performance_history``; the early-stopping bookkeeping attributes
    are printed, and a train/valid learning curve is saved to
    ``self.plot_dir`` as a PNG named after the estimator class.
    """
    X, y = datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    # Wrap the raw splits in the project's container types.
    X_train = DataFrameContainer(
        "TrainSet", dataset_instance=X_train,
        resource_manager=self.mock_resource_manager)
    X_test = DataFrameContainer(
        "TestSet", dataset_instance=X_test,
        resource_manager=self.mock_resource_manager)
    y_train = NdArrayContainer(
        "TrainLabel", dataset_instance=y_train,
        resource_manager=self.mock_resource_manager)
    y_test = NdArrayContainer(
        "TestLabel", dataset_instance=y_test,
        resource_manager=self.mock_resource_manager)
    est_cls_list = [
        LogisticRegression,
        GradientBoostingClassifier,
        RandomForestClassifier,
        ExtraTreesClassifier,
        SGDClassifier,
    ]
    for cls in est_cls_list:
        print("=========================")
        print(cls.__name__)
        print("=========================")
        est = cls(**get_default_hp_of_cls(cls))
        start = time()
        est.fit(X_train, y_train, X_test, y_test)
        score = est.component.score(X_test.data, y_test.data)
        end = time()
        print("score:", score)
        print("time:", end - start)
        # The final score must be the best one observed during fitting.
        self.assertTrue(score == np.max(est.performance_history))
        print("max_iterations:", est.max_iterations)
        print("best_iteration_:", est.best_iteration_)
        print("early_stopping_rounds:", est.early_stopping_rounds)
        print("early_stopping_tol:", est.early_stopping_tol)
        print("iter_inc:", est.iter_inc)
        print("iteration:", est.iteration)
        print("iter_ix:", est.iter_ix)
        print("min_performance:", np.min(est.performance_history))
        print("max_performance:", np.max(est.performance_history))
        print("learning_curve:", est.learning_curve)
        print("estimator:", est)
        print('\n' * 2)
        learning_curve = est.learning_curve
        plt.grid()
        plt.plot(learning_curve[0], learning_curve[1], label="Train Set")
        plt.plot(learning_curve[0], learning_curve[2], label="Valid Set")
        plt.xlabel(est.iterations_name)
        plt.ylabel("Accuracy")
        title = cls.__name__
        plt.title(title)
        plt.axvline(x=est.best_iteration_, ls="--", c="k")
        plt.legend(loc="best")
        # BUGFIX: the `quality` kwarg was deprecated in matplotlib 3.3 and
        # removed in 3.6 (it raised TypeError here), and it never applied to
        # PNG output anyway; dpi alone controls the saved figure fidelity.
        plt.savefig(f"{self.plot_dir}/{title}.png", dpi=600)
        plt.close()