def test_make_classification_hard_vote_score_mix(self):
    """Check score() when hard voting is mixed with size/depth limits.

    A forest configured with hard_vote, sklearn_max, distr_depth and
    max_depth is fitted on the first half of a synthetic three-class
    dataset and must score above 0.7 on the second half.
    """
    dataset_args = dict(n_samples=3000, n_features=10, n_classes=3,
                        n_informative=4, n_redundant=2, n_repeated=1,
                        n_clusters_per_class=2, shuffle=True,
                        random_state=0)
    samples, labels = make_classification(**dataset_args)
    half_x = len(samples) // 2
    half_y = len(labels) // 2

    x_train = ds.array(samples[:half_x], (300, 10))
    y_train = ds.array(labels[:half_y][:, np.newaxis], (300, 1))
    x_test = ds.array(samples[half_x:], (300, 10))
    y_test = ds.array(labels[half_y:][:, np.newaxis], (300, 1))

    forest = RandomForestClassifier(random_state=0, sklearn_max=100,
                                    distr_depth=2, max_depth=12,
                                    hard_vote=True)
    forest.fit(x_train, y_train)

    # The score is passed through compss_wait_on before it is compared.
    pending_score = forest.score(x_test, y_test)
    accuracy = compss_wait_on(pending_score)
    self.assertGreater(accuracy, 0.7)
def test_make_classification_hard_vote_predict(self):
    """Check predict() on a forest that uses hard voting.

    Even-indexed samples of a synthetic three-class dataset are used for
    training and odd-indexed samples for testing; more than 70% of the
    predicted labels must match the true ones.
    """
    dataset_args = dict(n_samples=3000, n_features=10, n_classes=3,
                        n_informative=4, n_redundant=2, n_repeated=1,
                        n_clusters_per_class=2, shuffle=True,
                        random_state=0)
    samples, labels = make_classification(**dataset_args)

    x_train = ds.array(samples[::2], (300, 10))
    y_train = ds.array(labels[::2][:, np.newaxis], (300, 1))
    x_test = ds.array(samples[1::2], (300, 10))
    expected = labels[1::2]

    forest = RandomForestClassifier(random_state=0, sklearn_max=10,
                                    hard_vote=True)
    forest.fit(x_train, y_train)

    predicted = forest.predict(x_test).collect()
    n_hits = np.count_nonzero(predicted == expected)
    accuracy = n_hits / len(expected)
    self.assertGreater(accuracy, 0.7)
def test_fit(self):
    """Check what GridSearchCV.fit() exposes after a default search.

    Verifies the keys of cv_results_, that every combination of the
    parameter grid was evaluated exactly once, that the best_* and
    scorer_ attributes exist, and that five splits were used.
    """
    iris_x, iris_y = datasets.load_iris(return_X_y=True)
    x = ds.array(iris_x, (30, 4))
    y = ds.array(iris_y[:, np.newaxis], (30, 1))

    grid = {'n_estimators': (2, 4), 'max_depth': range(3, 5)}
    searcher = GridSearchCV(RandomForestClassifier(), grid)
    searcher.fit(x, y)

    # Fixed result columns plus one test-score column per split.
    expected_keys = {
        'param_max_depth', 'param_n_estimators', 'params',
        'mean_test_score', 'std_test_score', 'rank_test_score'
    } | {'split%d_test_score' % split for split in range(5)}
    self.assertSetEqual(set(searcher.cv_results_.keys()), expected_keys)

    # Each (max_depth, n_estimators) pair must show up once and only once.
    pending = [(3, 2), (3, 4), (4, 2), (4, 4)]
    for candidate in searcher.cv_results_['params']:
        combo = (candidate['max_depth'], candidate['n_estimators'])
        self.assertIn(combo, pending)
        pending.remove(combo)
    self.assertEqual(len(pending), 0)

    for attribute in ('best_estimator_', 'best_score_', 'best_params_',
                      'best_index_', 'scorer_'):
        self.assertTrue(hasattr(searcher, attribute))
    self.assertEqual(searcher.n_splits_, 5)
def test_scoring_callable(self):
    """Check GridSearchCV when `scoring` is a user-supplied callable.

    A callable that delegates to the estimator's score() must yield a
    fully populated searcher; a callable returning a string must make
    fit() raise ValueError('scoring must return a number').
    """
    iris_x, iris_y = datasets.load_iris(return_X_y=True)
    x = ds.array(iris_x, (30, 4))
    y = ds.array(iris_y[:, np.newaxis], (30, 1))
    param_grid = {'n_estimators': (2, 4)}
    rf = RandomForestClassifier()

    def scoring(clf, x_score, y_real):
        return clf.score(x_score, y_real)

    searcher = GridSearchCV(rf, param_grid, cv=3, scoring=scoring)
    searcher.fit(x, y)

    for attribute in ('cv_results_', 'best_estimator_', 'best_score_',
                      'best_params_', 'best_index_', 'scorer_'):
        self.assertTrue(hasattr(searcher, attribute))

    def invalid_scoring(clf, x_score, y_score):
        return '2'

    searcher = GridSearchCV(rf, param_grid, cv=3, scoring=invalid_scoring)
    with self.assertRaisesRegex(ValueError, 'scoring must return a number'):
        searcher.fit(x, y)
def main():
    """Run a 5-fold grid search over a random forest on Iris and print it.

    Prints the evaluated parameter sets, their mean test scores, the full
    cv_results_ table as a pandas DataFrame, and the best_* / scorer_ /
    n_splits_ attributes of the fitted searcher.
    """
    iris_x, iris_y = datasets.load_iris(return_X_y=True)
    x = ds.array(iris_x, (30, 4))
    y = ds.array(iris_y[:, np.newaxis], (30, 1))

    parameters = {
        'n_estimators': (1, 2, 4, 8, 16, 32),
        'max_depth': range(3, 5)
    }
    rf = RandomForestClassifier()
    searcher = GridSearchCV(rf, parameters, cv=5)

    # Seed NumPy's global RNG right before fitting.
    np.random.seed(0)
    searcher.fit(x, y)

    results = searcher.cv_results_
    print(results['params'])
    print(results['mean_test_score'])

    table = pd.DataFrame.from_dict(searcher.cv_results_)
    print(table[['params', 'mean_test_score']])
    # Lift pandas' row/column display limits for the full dump only.
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None):
        print(table)

    for attribute in ('best_estimator_', 'best_score_', 'best_params_',
                      'best_index_', 'scorer_', 'n_splits_'):
        print(getattr(searcher, attribute))
def test_refit_callable(self):
    """Check GridSearchCV when `refit` is a callable picking the index.

    With a valid index the searcher exposes everything except
    best_score_. A string index must make fit() raise TypeError and an
    index of -1 must make it raise IndexError.
    """
    iris_x, iris_y = datasets.load_iris(return_X_y=True)
    x = ds.array(iris_x, (30, 4))
    y = ds.array(iris_y[:, np.newaxis], (30, 1))
    param_grid = {'n_estimators': (2, 4)}
    rf = RandomForestClassifier()

    # `refit` reads this variable when it is called, so rebinding it
    # below changes what the callable returns.
    chosen_index = 1

    def refit(results):
        return chosen_index

    searcher = GridSearchCV(rf, param_grid, cv=3, refit=refit)
    searcher.fit(x, y)

    expectations = (
        ('cv_results_', self.assertTrue),
        ('best_estimator_', self.assertTrue),
        ('best_score_', self.assertFalse),
        ('best_params_', self.assertTrue),
        ('best_index_', self.assertTrue),
        ('scorer_', self.assertTrue),
    )
    for attribute, check in expectations:
        check(hasattr(searcher, attribute))

    for bad_index, error in (('str', TypeError), (-1, IndexError)):
        chosen_index = bad_index
        searcher = GridSearchCV(rf, param_grid, cv=3, refit=refit)
        with self.assertRaises(error):
            searcher.fit(x, y)
def test_iris(self):
    """Smoke-test RandomForestClassifier with one depth-1 tree on Iris.

    Even-indexed samples are used for fitting and odd-indexed samples
    for validation.
    """
    samples, labels = datasets.load_iris(return_X_y=True)
    fit_labels = labels[::2].reshape(-1, 1)
    validate_labels = labels[1::2].reshape(-1, 1)

    ds_fit = ds.array(samples[::2], block_size=(30, 2))
    fit_y = ds.array(fit_labels, block_size=(30, 1))
    ds_validate = ds.array(samples[1::2], block_size=(30, 2))
    validate_y = ds.array(validate_labels, block_size=(30, 1))

    stump = RandomForestClassifier(n_estimators=1, max_depth=1,
                                   random_state=0)
    stump.fit(ds_fit, fit_y)
    accuracy = compss_wait_on(stump.score(ds_validate, validate_y))

    # The accuracy is expected to be at most 2/3 whatever the seed, and
    # is often exactly 2/3.
    self.assertAlmostEqual(accuracy, 2 / 3)
def main():
    """Time RandomForestClassifier.fit on the KDD99 and MNIST datasets.

    Both datasets are loaded first; a fresh forest with 100 estimators
    and distr_depth=2 is then measured on each one through
    performance.measure.
    """
    kdd_path = (
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/kdd99/train.csv")
    kdd = ds.load_txt_file(kdd_path, block_size=(11482, 122))
    # Column 121 holds the labels; columns 0..120 are the features.
    y_kdd = kdd[:, 121:122]
    x_kdd = kdd[:, :121]

    mnist_path = (
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/mnist/train.scaled")
    x_mn, y_mn = ds.load_svmlight_file(mnist_path, block_size=(5000, 780),
                                       n_features=780, store_sparse=False)

    runs = (("KDD99", x_kdd, y_kdd), ("mnist", x_mn, y_mn))
    for dataset_name, samples, labels in runs:
        rf = RandomForestClassifier(n_estimators=100, distr_depth=2)
        performance.measure("RF", dataset_name, rf.fit, samples, labels)
def main():
    """Time RandomForestClassifier.fit on the KDD99 training set.

    The CSV is loaded as a single array, split into features and labels,
    and a forest with 100 estimators and distr_depth=2 is measured
    through performance.measure.
    """
    kdd_path = "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train.csv"
    kdd = ds.load_txt_file(kdd_path, block_size=(11482, 122))
    # Column 121 holds the labels; columns 0..120 are the features.
    y_kdd = kdd[:, 121:122]
    x_kdd = kdd[:, :121]

    forest = RandomForestClassifier(n_estimators=100, distr_depth=2)
    performance.measure("RF", "KDD99", forest.fit, x_kdd, y_kdd)
def main():
    """Time RandomForestClassifier.fit on the scaled MNIST training set.

    The SVMLight file is loaded densely with 780 features and a forest
    with 100 estimators and distr_depth=2 is measured through
    performance.measure.
    """
    mnist_path = (
        "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train.scaled")
    x_mn, y_mn = ds.load_svmlight_file(mnist_path, block_size=(5000, 780),
                                       n_features=780, store_sparse=False)

    forest = RandomForestClassifier(n_estimators=100, distr_depth=2)
    performance.measure("RF", "mnist", forest.fit, x_mn, y_mn)
def test_cv_invalid(self):
    """Check that an unsupported `cv` value (a dict) raises ValueError.

    Construction and fit() are both kept inside the assertion context,
    so the error may come from either step.
    """
    iris_x, iris_y = datasets.load_iris(return_X_y=True)
    x = ds.array(iris_x, (30, 4))
    y = ds.array(iris_y[:, np.newaxis], (30, 1))
    estimator = RandomForestClassifier()
    grid = {'n_estimators': (2, 4)}

    with self.assertRaises(ValueError):
        searcher = GridSearchCV(estimator, grid, cv={})
        searcher.fit(x, y)
def test_make_classification_fit_predict(self):
    """Check chained fit().predict() with default forest parameters.

    The forest is fitted on the first half of a synthetic three-class
    dataset and predicts that same half; more than 70% of the labels
    must be recovered.
    """
    dataset_args = dict(n_samples=3000, n_features=10, n_classes=3,
                        n_informative=4, n_redundant=2, n_repeated=1,
                        n_clusters_per_class=2, shuffle=True,
                        random_state=0)
    samples, labels = make_classification(**dataset_args)

    x_train = ds.array(samples[:len(samples) // 2], (300, 10))
    y_train = ds.array(labels[:len(labels) // 2][:, np.newaxis], (300, 1))

    forest = RandomForestClassifier(random_state=0)
    fitted = forest.fit(x_train, y_train)
    predicted = fitted.predict(x_train).collect()
    expected = y_train.collect()

    n_hits = np.count_nonzero(predicted == expected)
    accuracy = n_hits / len(expected)
    self.assertGreater(accuracy, 0.7)
def test_make_classification_score(self):
    """Check fit() followed by score() with default forest parameters.

    The forest is fitted on the first half of a synthetic three-class
    dataset and must score above 0.7 on the second half.
    """
    dataset_args = dict(n_samples=3000, n_features=10, n_classes=3,
                        n_informative=4, n_redundant=2, n_repeated=1,
                        n_clusters_per_class=2, shuffle=True,
                        random_state=0)
    samples, labels = make_classification(**dataset_args)
    half_x = len(samples) // 2
    half_y = len(labels) // 2

    x_train = ds.array(samples[:half_x], (300, 10))
    y_train = ds.array(labels[:half_y][:, np.newaxis], (300, 1))
    x_test = ds.array(samples[half_x:], (300, 10))
    y_test = ds.array(labels[half_y:][:, np.newaxis], (300, 1))

    forest = RandomForestClassifier(random_state=0)
    forest.fit(x_train, y_train)

    # The score is passed through compss_wait_on before it is compared.
    pending_score = forest.score(x_test, y_test)
    accuracy = compss_wait_on(pending_score)
    self.assertGreater(accuracy, 0.7)
def main():
    """Train and evaluate a random forest on a random 80/20 Iris split.

    The sample indices are randomly permuted, the first 80% are used to
    fit a RandomForestClassifier built as RandomForestClassifier(10),
    and the remaining 20% are used to predict and score. The true and
    predicted labels are printed as a pandas DataFrame together with
    the accuracy.
    """
    x, y = load_iris(return_X_y=True)

    # Build the shuffled index array from the *returned* permutation.
    # The previous `shuffle(indices)` call discarded its result, which is
    # only correct for an in-place shuffle.
    # NOTE(review): if `shuffle` is sklearn.utils.shuffle (returns a copy),
    # the old call was a no-op and the split was not random -- confirm
    # against the file's imports.
    indices = np.random.permutation(len(x))

    # use 80% of samples for training
    n_train = int(0.8 * len(x))
    train_idx = indices[:n_train]
    test_idx = indices[n_train:]

    # Train the RF classifier
    print("- Training Random Forest classifier with %s samples of Iris "
          "dataset." % len(train_idx))
    x_train = ds.array(x[train_idx], (10, 4))
    y_train = ds.array(y[train_idx][:, np.newaxis], (10, 1))
    forest = RandomForestClassifier(10)
    forest.fit(x_train, y_train)

    # Test the trained RF classifier
    print("- Testing the classifier.", end='')
    x_test = ds.array(x[test_idx], (10, 4))
    y_real = ds.array(y[test_idx][:, np.newaxis], (10, 1))
    y_pred = forest.predict(x_test)

    # The score is passed through compss_wait_on before it is printed.
    score = compss_wait_on(forest.score(x_test, y_real))

    # Put results in fancy dataframe and print the accuracy
    df = pd.DataFrame(data=list(zip(y[test_idx], y_pred.collect())),
                      columns=['Label', 'Predicted'])
    print(" Predicted values: \n\n%s" % df)
    print("\n- Classifier accuracy: %s" % score)
def test_make_classification_sklearn_max_predict_proba(self):
    """Check predict_proba() on a forest configured with sklearn_max.

    Even-indexed samples of a synthetic three-class dataset are used for
    training and odd-indexed samples for testing. The most probable
    class of each test sample must match the true label in more than
    70% of the cases.
    """
    dataset_args = dict(n_samples=3000, n_features=10, n_classes=3,
                        n_informative=4, n_redundant=2, n_repeated=1,
                        n_clusters_per_class=2, shuffle=True,
                        random_state=0)
    samples, labels = make_classification(**dataset_args)

    x_train = ds.array(samples[::2], (300, 10))
    y_train = ds.array(labels[::2][:, np.newaxis], (300, 1))
    x_test = ds.array(samples[1::2], (300, 10))
    expected = labels[1::2]

    forest = RandomForestClassifier(random_state=0, sklearn_max=10)
    forest.fit(x_train, y_train)
    probabilities = forest.predict_proba(x_test).collect()

    # Synchronise the class labels before using them for indexing.
    forest.classes = compss_wait_on(forest.classes)
    best_columns = np.argmax(probabilities, axis=1)
    predicted = forest.classes[best_columns]

    n_hits = np.count_nonzero(predicted == expected)
    accuracy = n_hits / len(expected)
    self.assertGreater(accuracy, 0.7)
def test_make_classification_predict_and_distr_depth(self):
    """Check fit() and predict() on a forest with an explicit distr_depth.

    The forest is fitted on the first half of a synthetic three-class
    dataset; more than 70% of its predictions on the second half must
    match the true labels.
    """
    dataset_args = dict(n_samples=3000, n_features=10, n_classes=3,
                        n_informative=4, n_redundant=2, n_repeated=1,
                        n_clusters_per_class=2, shuffle=True,
                        random_state=0)
    samples, labels = make_classification(**dataset_args)
    half_x = len(samples) // 2
    half_y = len(labels) // 2

    x_train = ds.array(samples[:half_x], (300, 10))
    y_train = ds.array(labels[:half_y][:, np.newaxis], (300, 1))
    x_test = ds.array(samples[half_x:], (300, 10))
    expected = labels[half_y:]

    forest = RandomForestClassifier(distr_depth=2, random_state=0)
    forest.fit(x_train, y_train)

    predicted = forest.predict(x_test).collect()
    n_hits = np.count_nonzero(predicted == expected)
    accuracy = n_hits / len(expected)
    self.assertGreater(accuracy, 0.7)
def test_scoring_invalid(self):
    """Check that fit() raises ValueError for scoring='roc_auc'.

    The searcher is built outside the assertion context, so the error
    is expected from fit() itself.
    """
    iris_x, iris_y = datasets.load_iris(return_X_y=True)
    x = ds.array(iris_x, (30, 4))
    y = ds.array(iris_y[:, np.newaxis], (30, 1))

    grid = {'n_estimators': (2, 4)}
    estimator = RandomForestClassifier()
    searcher = GridSearchCV(estimator, grid, cv=3, scoring='roc_auc',
                            refit=False)

    with self.assertRaises(ValueError):
        searcher.fit(x, y)
def test_cv_class(self):
    """Check GridSearchCV when `cv` is a splitter object (KFold(4)).

    After fitting, the searcher must expose cv_results_, the best_*
    attributes and scorer_.
    """
    iris_x, iris_y = datasets.load_iris(return_X_y=True)
    x = ds.array(iris_x, (30, 4))
    y = ds.array(iris_y[:, np.newaxis], (30, 1))

    estimator = RandomForestClassifier()
    grid = {'n_estimators': (2, 4)}
    searcher = GridSearchCV(estimator, grid, cv=KFold(4))
    searcher.fit(x, y)

    for attribute in ('cv_results_', 'best_estimator_', 'best_score_',
                      'best_params_', 'best_index_', 'scorer_'):
        self.assertTrue(hasattr(searcher, attribute))
def test_scoring_dict(self):
    """Check GridSearchCV when `scoring` is a dict of several scorers.

    With refit=False the searcher must expose only cv_results_ and
    scorer_ (no best_* attributes). With refit=True the same scoring
    dict must make fit() raise ValueError.
    """
    iris_x, iris_y = datasets.load_iris(return_X_y=True)
    x = ds.array(iris_x, (30, 4))
    y = ds.array(iris_y[:, np.newaxis], (30, 1))
    param_grid = {'n_estimators': (2, 4)}
    rf = RandomForestClassifier()

    def hard_vote_score(rand_forest, x, y):
        # Switch hard voting on for this one score() call, then back off.
        rand_forest.hard_vote = True
        score = rand_forest.score(x, y)
        rand_forest.hard_vote = False
        return score

    scoring = {'default_score': None, 'custom_score': hard_vote_score}

    searcher = GridSearchCV(rf, param_grid, cv=3, scoring=scoring,
                            refit=False)
    searcher.fit(x, y)

    expectations = (
        ('cv_results_', self.assertTrue),
        ('best_estimator_', self.assertFalse),
        ('best_score_', self.assertFalse),
        ('best_params_', self.assertFalse),
        ('best_index_', self.assertFalse),
        ('scorer_', self.assertTrue),
    )
    for attribute, check in expectations:
        check(hasattr(searcher, attribute))

    searcher = GridSearchCV(rf, param_grid, cv=3, scoring=scoring,
                            refit=True)
    with self.assertRaises(ValueError):
        searcher.fit(x, y)
def _load_samples_and_labels(path, svmlight, block_size, n_features, sparse):
    """Load a dataset file and return it as a (samples, labels) pair.

    SVMLight files are read with ds.load_svmlight_file. Any other file is
    read with ds.load_txt_file and split so that the LAST column is the
    label and every preceding column is a feature.
    """
    if svmlight:
        return ds.load_svmlight_file(path, block_size, n_features, sparse)

    data = ds.load_txt_file(path, block_size)
    n_cols = data.shape[1]
    labels = data[:, n_cols - 1:n_cols]
    samples = data[:, :n_cols - 1]
    return samples, labels


def main():
    """Command-line driver: fit a random forest and print run statistics.

    Parses the command line, loads the training file (CSV or SVMLight),
    fits a RandomForestClassifier and prints a list with n_estimators,
    distr_depth, max_depth, read time and fit time. When a test file is
    given, the score on it is appended to that list.

    Timing: read_time is only measured with --detailed_times (a barrier
    is placed after loading and the clock is restarted). Without that
    flag read_time stays 0 and fit_time is counted from the start of
    loading.

    Text files: the label is taken from the last column for both the
    training and the test file. The training branch used to read the
    second-to-last column as the label while leaving it in the feature
    slice, which disagreed with the test branch.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-e", "--estimators", metavar="N_ESTIMATORS",
                        type=int, help="default is 10", default=10)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-md", "--max_depth", metavar="MAX_DEPTH",
                        type=int, help="default is np.inf", required=False)
    parser.add_argument("-dd", "--dist_depth", metavar="DIST_DEPTH", type=int,
                        help="default is auto", required=False)
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="use dense data structures",
                        action="store_true")
    parser.add_argument("-t", "--test-file", metavar="TEST_FILE_PATH",
                        help="test file path", type=str, required=False)
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    x, y = _load_samples_and_labels(train_data, args.svmlight, block_size,
                                    args.features, sparse)

    if args.detailed_times:
        # Wait for loading to finish so it can be timed separately.
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    # A missing (or zero) option falls back to the documented default.
    dist_depth = args.dist_depth if args.dist_depth else "auto"
    max_depth = args.max_depth if args.max_depth else np.inf

    forest = RandomForestClassifier(n_estimators=args.estimators,
                                    max_depth=max_depth,
                                    distr_depth=dist_depth)
    forest.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    out = [
        forest.n_estimators, forest.distr_depth, forest.max_depth, read_time,
        fit_time
    ]

    if args.test_file:
        x_test, y_test = _load_samples_and_labels(
            args.test_file, args.svmlight, block_size, args.features, sparse)
        out.append(compss_wait_on(forest.score(x_test, y_test)))

    print(out)
def main():
    """Plot decision surfaces of three classifiers on three toy datasets.

    First pass: each dataset is standardised, split 60/40 into train and
    test, and every classifier is fitted on it; the score and the
    classifier output over a mesh of the feature plane are stored without
    being collected. Second pass: the stored results are collected and
    drawn, one row per dataset, with the raw data in the first column and
    one column per classifier.
    """
    mesh_step = .02  # step size in the mesh

    names = ["Linear C-SVM", "RBF C-SVM", "Random forest"]
    classifiers = [
        CascadeSVM(kernel="linear", c=0.025, max_iter=5),
        CascadeSVM(gamma=2, c=1, max_iter=5),
        RandomForestClassifier(random_state=1)
    ]

    # Two-feature classification problem with uniform noise added.
    x, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    x += 2 * rng.uniform(size=x.shape)
    linearly_separable = (x, y)

    datasets = [
        make_moons(noise=0.3, random_state=0),
        make_circles(noise=0.2, factor=0.5, random_state=1),
        linearly_separable
    ]

    preprocessed_data = {}
    scores = {}
    mesh_outputs = {}

    for ds_cnt, (raw_x, labels) in enumerate(datasets):
        # preprocess dataset, split into training and test part
        scaled = StandardScaler().fit_transform(raw_x)
        x_train, x_test, y_train, y_test = train_test_split(
            scaled, labels, test_size=.4, random_state=42)

        ds_x_train = ds.array(x_train, block_size=(20, 2))
        ds_y_train = ds.array(y_train.reshape(-1, 1), block_size=(20, 1))
        ds_x_test = ds.array(x_test, block_size=(20, 2))
        ds_y_test = ds.array(y_test.reshape(-1, 1), block_size=(20, 1))

        # Mesh covering the data with a margin of .5 on every side.
        first, second = scaled[:, 0], scaled[:, 1]
        x_min = first.min() - .5
        x_max = first.max() + .5
        y_min = second.min() - .5
        y_max = second.max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step),
                             np.arange(y_min, y_max, mesh_step))

        preprocessed_data[ds_cnt] = (scaled, x_train, x_test, y_train,
                                     y_test, xx, yy)

        for name, clf in zip(names, classifiers):
            key = (ds_cnt, name)
            clf.fit(ds_x_train, ds_y_train)
            scores[key] = clf.score(ds_x_test, ds_y_test)

            mesh = np.c_[xx.ravel(), yy.ravel()]
            mesh_array = ds.array(mesh, (mesh.shape[0], 2))

            # Prefer decision_function when the classifier offers one.
            if hasattr(clf, "decision_function"):
                evaluate = clf.decision_function
            else:
                evaluate = clf.predict_proba
            mesh_outputs[key] = evaluate(mesh_array)

    # Synchronize while plotting the results
    plt.figure(figsize=(27, 9))
    n_rows = len(datasets)
    n_cols = len(classifiers) + 1
    panel = 1

    for ds_cnt, _ in enumerate(datasets):
        (_, x_train, x_test, y_train, y_test,
         xx, yy) = preprocessed_data[ds_cnt]

        cm = plt.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])

        def draw_points(axes):
            # Training points, then the more transparent testing points,
            # then the shared limits and empty tick lists.
            axes.scatter(x_train[:, 0], x_train[:, 1], c=y_train,
                         cmap=cm_bright, edgecolors='k')
            axes.scatter(x_test[:, 0], x_test[:, 1], c=y_test,
                         cmap=cm_bright, alpha=0.6, edgecolors='k')
            axes.set_xlim(xx.min(), xx.max())
            axes.set_ylim(yy.min(), yy.max())
            axes.set_xticks(())
            axes.set_yticks(())

        # just plot the dataset first
        ax = plt.subplot(n_rows, n_cols, panel)
        if ds_cnt == 0:
            ax.set_title("Input data")
        draw_points(ax)
        panel += 1

        # iterate over classifiers
        for name, clf in zip(names, classifiers):
            key = (ds_cnt, name)
            ax = plt.subplot(n_rows, n_cols, panel)

            score = compss_wait_on(scores[key])
            surface = mesh_outputs[key].collect()
            if not hasattr(clf, "decision_function"):
                # Keep only column 1 of the predict_proba output.
                surface = surface[:, 1]

            # Put the result into a color plot
            surface = surface.reshape(xx.shape)
            ax.contourf(xx, yy, surface, cmap=cm, alpha=.8)

            draw_points(ax)
            if ds_cnt == 0:
                ax.set_title(name)
            ax.text(xx.max() - .3, yy.min() + .3,
                    ('%.2f' % score).lstrip('0'),
                    size=15, horizontalalignment='right')
            panel += 1

    plt.tight_layout()
    plt.show()