def test_predict(self):
    seed = 666

    # negative points belong to class 1, positives to 0
    p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1]

    x = ds.array(np.array([p1, p4, p3, p2]), (2, 2))
    y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1))

    csvm = CascadeSVM(cascade_arity=3, max_iter=10, tol=1e-4,
                      kernel='linear', c=2, gamma=0.1,
                      check_convergence=False,
                      random_state=seed, verbose=False)

    csvm.fit(x, y)

    # p5 should belong to class 0, p6 to class 1
    p5, p6 = np.array([1, 1]), np.array([-1, -1])

    x_test = ds.array(np.array([p1, p2, p3, p4, p5, p6]), (2, 2))

    y_pred = csvm.predict(x_test)

    l1, l2, l3, l4, l5, l6 = y_pred.collect()

    self.assertTrue(l1 == l2 == l5 == 0)
    self.assertTrue(l3 == l4 == l6 == 1)
def test_score(self, collect):
    seed = 666

    # negative points belong to class 1, positives to 0
    p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1]

    x = ds.array(np.array([p1, p4, p3, p2]), (2, 2))
    y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1))

    csvm = CascadeSVM(cascade_arity=3, max_iter=10, tol=1e-4,
                      kernel='rbf', c=2, gamma=0.1,
                      check_convergence=True,
                      random_state=seed, verbose=False)

    csvm.fit(x, y)

    # points are separable, scoring the training dataset should have 100%
    # accuracy
    x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2))
    y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1))

    accuracy = csvm.score(x_test, y_test, collect)
    if not collect:
        accuracy = compss_wait_on(accuracy)

    self.assertEqual(accuracy, 1.0)
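# test_score takes a `collect` flag that controls whether score() returns a
# synchronized value or a future. A minimal driver covering both variants is
# sketched below; the method names are hypothetical, and the original suite
# may instead parameterize this (e.g. with parameterized.expand).
def test_score_collect(self):
    self.test_score(collect=True)

def test_score_no_collect(self):
    self.test_score(collect=False)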
def test_init_params(self):
    # Test all parameters with rbf kernel
    cascade_arity = 3
    max_iter = 1
    tol = 1e-4
    kernel = 'rbf'
    c = 2
    gamma = 0.1
    check_convergence = True
    seed = 666
    verbose = False

    csvm = CascadeSVM(cascade_arity=cascade_arity, max_iter=max_iter,
                      tol=tol, kernel=kernel, c=c, gamma=gamma,
                      check_convergence=check_convergence,
                      random_state=seed, verbose=verbose)

    self.assertEqual(csvm._arity, cascade_arity)
    self.assertEqual(csvm._max_iter, max_iter)
    self.assertEqual(csvm._tol, tol)
    self.assertEqual(csvm._clf_params['kernel'], kernel)
    self.assertEqual(csvm._clf_params['C'], c)
    self.assertEqual(csvm._clf_params['gamma'], gamma)
    self.assertEqual(csvm._check_convergence, check_convergence)
    self.assertEqual(csvm._verbose, verbose)

    # test correct linear kernel and c param (others are not changed)
    kernel, c = 'linear', 0.3
    csvm = CascadeSVM(kernel=kernel, c=c)
    self.assertEqual(csvm._clf_params['kernel'], kernel)
    self.assertEqual(csvm._clf_params['C'], c)
def test_fit(self):
    seed = 666
    file_ = "tests/files/libsvm/2"

    x, y = ds.load_svmlight_file(file_, (10, 300), 780, False)

    csvm = CascadeSVM(cascade_arity=3, max_iter=5, tol=1e-4,
                      kernel='linear', c=2, gamma=0.1,
                      check_convergence=True,
                      random_state=seed, verbose=False)

    csvm.fit(x, y)

    self.assertTrue(csvm.converged)

    csvm = CascadeSVM(cascade_arity=3, max_iter=1, tol=1e-4,
                      kernel='linear', c=2, gamma=0.1,
                      check_convergence=False,
                      random_state=seed, verbose=False)

    csvm.fit(x, y)
    self.assertFalse(csvm.converged)
    self.assertEqual(csvm.iterations, 1)
def test_fit_default_gamma(self):
    """ Tests that the fit method converges when using gamma=auto on a
    toy dataset """
    seed = 666
    file_ = "tests/files/libsvm/2"

    x, y = ds.load_svmlight_file(file_, (10, 300), 780, False)

    csvm = CascadeSVM(cascade_arity=3, max_iter=5, tol=1e-4,
                      kernel='linear', c=2, check_convergence=True,
                      random_state=seed, verbose=False)

    csvm.fit(x, y)

    self.assertTrue(csvm.converged)

    csvm = CascadeSVM(cascade_arity=3, max_iter=1, tol=1e-4,
                      kernel='linear', c=2, gamma=0.1,
                      check_convergence=False,
                      random_state=seed, verbose=False)

    csvm.fit(x, y)
    self.assertFalse(csvm.converged)
    self.assertEqual(csvm.iterations, 1)
def test_init_params(self):
    """ Tests constructor parameters """
    cascade_arity = 3
    max_iter = 1
    tol = 1e-4
    kernel = 'rbf'
    c = 2
    gamma = 0.1
    check_convergence = True
    seed = 666
    verbose = False

    csvm = CascadeSVM(cascade_arity=cascade_arity, max_iter=max_iter,
                      tol=tol, kernel=kernel, c=c, gamma=gamma,
                      check_convergence=check_convergence,
                      random_state=seed, verbose=verbose)

    self.assertEqual(csvm.cascade_arity, cascade_arity)
    self.assertEqual(csvm.max_iter, max_iter)
    self.assertEqual(csvm.tol, tol)
    self.assertEqual(csvm.kernel, kernel)
    self.assertEqual(csvm.c, c)
    self.assertEqual(csvm.gamma, gamma)
    self.assertEqual(csvm.check_convergence, check_convergence)
    self.assertEqual(csvm.random_state, seed)
    self.assertEqual(csvm.verbose, verbose)
def test_fit(self):
    """Tests RandomizedSearchCV fit()."""
    x_np, y_np = datasets.load_iris(return_X_y=True)
    p = np.random.permutation(len(x_np))  # Pre-shuffling required for CSVM
    x = ds.array(x_np[p], (30, 4))
    y = ds.array((y_np[p] == 0)[:, np.newaxis], (30, 1))

    param_distributions = {'c': stats.expon(scale=0.5),
                           'gamma': stats.expon(scale=1)}
    csvm = CascadeSVM()
    n_iter = 12
    k = 3
    searcher = RandomizedSearchCV(estimator=csvm,
                                  param_distributions=param_distributions,
                                  n_iter=n_iter, cv=k, random_state=0)
    searcher.fit(x, y)

    expected_keys = {'param_c', 'param_gamma', 'params', 'mean_test_score',
                     'std_test_score', 'rank_test_score'}
    split_keys = {'split%d_test_score' % i for i in range(k)}
    expected_keys.update(split_keys)
    self.assertSetEqual(set(searcher.cv_results_.keys()), expected_keys)

    self.assertEqual(len(searcher.cv_results_['param_c']), n_iter)
    self.assertTrue(hasattr(searcher, 'best_estimator_'))
    self.assertTrue(hasattr(searcher, 'best_score_'))
    self.assertTrue(hasattr(searcher, 'best_params_'))
    self.assertTrue(hasattr(searcher, 'best_index_'))
    self.assertTrue(hasattr(searcher, 'scorer_'))
    self.assertEqual(searcher.n_splits_, k)
def main():
    x_ij, y_ij = ds.load_svmlight_file(
        "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train",
        block_size=(5000, 22), n_features=22, store_sparse=True)

    csvm = CascadeSVM(c=10000, gamma=0.01)

    performance.measure("CSVM", "ijcnn1", csvm.fit, x_ij, y_ij)
def main():
    x_kdd = ds.load_txt_file(
        "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train.csv",
        block_size=(11482, 122))

    x_kdd = shuffle(x_kdd)
    y_kdd = x_kdd[:, 121:122]
    x_kdd = x_kdd[:, :121]

    csvm = CascadeSVM(c=10000, gamma=0.01)

    performance.measure("CSVM", "KDD99", csvm.fit, x_kdd, y_kdd)
def main():
    x_kdd = ds.load_txt_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/kdd99/train.csv",
        block_size=(11482, 122))

    x_kdd = shuffle(x_kdd)
    y_kdd = x_kdd[:, 121:122]
    x_kdd = x_kdd[:, :121]

    x_ij, y_ij = ds.load_svmlight_file(
        "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/ijcnn1/train",
        block_size=(5000, 22), n_features=22, store_sparse=True)

    csvm = CascadeSVM(c=10000, gamma=0.01)

    performance.measure("CSVM", "KDD99", csvm.fit, x_kdd, y_kdd)
    performance.measure("CSVM", "ijcnn1", csvm.fit, x_ij, y_ij)
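# Each of the three main() functions above belongs to a standalone benchmark
# script. A standard entry-point guard (assumed here, since these scripts
# are launched directly under COMPSs) would close each file:
if __name__ == "__main__":
    main()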
def test_fit_2(self):
    """Tests GridSearchCV fit() with different data."""
    x_np, y_np = datasets.load_breast_cancer(return_X_y=True)
    x = ds.array(x_np, block_size=(100, 10))
    x = StandardScaler().fit_transform(x)
    y = ds.array(y_np.reshape(-1, 1), block_size=(100, 1))
    parameters = {'c': [0.1], 'gamma': [0.1]}
    csvm = CascadeSVM()
    searcher = GridSearchCV(csvm, parameters, cv=5)
    searcher.fit(x, y)

    self.assertTrue(hasattr(searcher, 'best_estimator_'))
    self.assertTrue(hasattr(searcher, 'best_score_'))
    self.assertTrue(hasattr(searcher, 'best_params_'))
    self.assertTrue(hasattr(searcher, 'best_index_'))
    self.assertTrue(hasattr(searcher, 'scorer_'))
    self.assertEqual(searcher.n_splits_, 5)
def test_fit_private_params(self):
    kernel = 'rbf'
    c = 2
    gamma = 0.1
    seed = 666
    file_ = "tests/files/libsvm/2"

    x, y = ds.load_svmlight_file(file_, (10, 300), 780, False)
    csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed)
    csvm.fit(x, y)
    self.assertEqual(csvm._clf_params['kernel'], kernel)
    self.assertEqual(csvm._clf_params['C'], c)
    self.assertEqual(csvm._clf_params['gamma'], gamma)

    kernel, c = 'linear', 0.3
    csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed)
    csvm.fit(x, y)
    self.assertEqual(csvm._clf_params['kernel'], kernel)
    self.assertEqual(csvm._clf_params['C'], c)
def test_duplicates(self):
    """ Tests that C-SVM does not generate duplicate support vectors """
    x = ds.array(np.array([[0, 1],
                           [1, 1],
                           [0, 1],
                           [1, 2],
                           [0, 0],
                           [2, 2],
                           [2, 1],
                           [1, 0]]), (2, 2))

    y = ds.array(np.array([1, 0, 1, 0, 1, 0, 0, 1]).reshape(-1, 1), (2, 1))

    csvm = CascadeSVM(c=1, random_state=1, max_iter=100, tol=0)
    csvm.fit(x, y)

    csvm._collect_clf()
    self.assertEqual(csvm._clf.support_vectors_.shape[0], 6)
def test_refit_false(self):
    """Tests GridSearchCV fit() with refit=False."""
    x_np, y_np = datasets.load_iris(return_X_y=True)
    x = ds.array(x_np, (30, 4))
    y = ds.array(y_np[:, np.newaxis], (30, 1))
    seed = 0
    x, y = shuffle(x, y, random_state=seed)
    param_grid = {'max_iter': range(1, 5)}
    csvm = CascadeSVM(check_convergence=False)
    searcher = GridSearchCV(csvm, param_grid, cv=3, refit=False)
    searcher.fit(x, y)

    self.assertFalse(hasattr(searcher, 'best_estimator_'))
    self.assertTrue(hasattr(searcher, 'best_score_'))
    self.assertTrue(hasattr(searcher, 'best_params_'))
    self.assertTrue(hasattr(searcher, 'best_index_'))
    self.assertTrue(hasattr(searcher, 'scorer_'))
    self.assertEqual(searcher.n_splits_, 3)
def test_sparse(self):
    """ Tests that C-SVM produces the same results with sparse and dense
    data """
    seed = 666
    train = "tests/files/libsvm/3"

    x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True)
    x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False)

    csvm_sp = CascadeSVM(random_state=seed)
    csvm_sp.fit(x_sp, y_sp)

    csvm_d = CascadeSVM(random_state=seed)
    csvm_d.fit(x_d, y_d)

    sv_d = csvm_d._clf.support_vectors_
    sv_sp = csvm_sp._clf.support_vectors_.toarray()

    self.assertTrue(np.array_equal(sv_d, sv_sp))

    coef_d = csvm_d._clf.dual_coef_
    coef_sp = csvm_sp._clf.dual_coef_.toarray()

    self.assertTrue(np.array_equal(coef_d, coef_sp))
def test_decision_func(self):
    seed = 666

    # negative points belong to class 1, positives to 0
    # all points lie on the y-axis
    p1, p2, p3, p4 = [0, 2], [0, 1], [0, -2], [0, -1]

    x = ds.array(np.array([p1, p4, p3, p2]), (2, 2))
    y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1))

    csvm = CascadeSVM(cascade_arity=3, max_iter=10, tol=1e-4,
                      kernel='rbf', c=2, gamma=0.1,
                      check_convergence=False,
                      random_state=seed, verbose=False)

    csvm.fit(x, y)

    # p1 should be equidistant to p3, and p2 to p4
    x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2))

    y_pred = csvm.decision_function(x_test)

    d1, d2, d3, d4 = y_pred.collect()

    self.assertTrue(np.isclose(abs(d1) - abs(d3), 0))
    self.assertTrue(np.isclose(abs(d2) - abs(d4), 0))

    # p5 and p6 should lie on the decision boundary (distance = 0)
    p5, p6 = np.array([1, 0]), np.array([-1, 0])

    x_test = ds.array(np.array([p5, p6]), (1, 2))

    y_pred = csvm.decision_function(x_test)

    d5, d6 = y_pred.collect()

    self.assertTrue(np.isclose(d5, 0))
    self.assertTrue(np.isclose(d6, 0))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-k", "--kernel", metavar="KERNEL", type=str,
                        help="linear or rbf (default is rbf)",
                        choices=["linear", "rbf"], default="rbf")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the "
                             "input data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-g", "--gamma", metavar="GAMMA", type=float,
                        help="(only for rbf kernel) default is "
                             "1 / n_features", default=None)
    parser.add_argument("-c", metavar="C", type=float, default=1,
                        help="Penalty parameter C of the error term. "
                             "Default: 1")
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("-t", "--test-file", metavar="TEST_FILE_PATH",
                        help="test file path", type=str, required=False)
    parser.add_argument("-o", "--output_file", metavar="OUTPUT_FILE_PATH",
                        help="output file path", type=str, required=False)
    parser.add_argument("--convergence", help="check for convergence",
                        action="store_true")
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format",
                        type=str)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-s", "--shuffle", help="shuffle input data",
                        action="store_true")
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    if not args.gamma:
        gamma = "auto"
    else:
        gamma = args.gamma

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)
        # labels are assumed to be in the last column, as in the test-file
        # branch below
        y = x[:, x.shape[1] - 1: x.shape[1]]
        x = x[:, :x.shape[1] - 1]

    if args.shuffle:
        x, y = shuffle(x, y)

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    csvm = CascadeSVM(cascade_arity=args.arity, max_iter=args.iteration,
                      c=args.c, gamma=gamma,
                      check_convergence=args.convergence,
                      verbose=args.verbose)

    csvm.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    out = [args.kernel, args.arity, args.block_size,
           csvm._clf_params["gamma"], args.c, csvm.iterations,
           csvm.converged, read_time, fit_time]

    if os.path.isdir(train_data):
        files = os.listdir(train_data)
        out.append(len(files))

    if args.test_file:
        if args.svmlight:
            x_test, y_test = ds.load_svmlight_file(args.test_file,
                                                   block_size,
                                                   args.features, sparse)
        else:
            x_test = ds.load_txt_file(args.test_file, block_size)
            y_test = x_test[:, x_test.shape[1] - 1: x_test.shape[1]]
            x_test = x_test[:, :x_test.shape[1] - 1]

        out.append(compss_wait_on(csvm.score(x_test, y_test)))

    if args.output_file:
        # csv.writer requires a text-mode file in Python 3
        with open(args.output_file, "a", newline="") as f:
            wr = csv.writer(f)
            wr.writerow(out)
    else:
        print(out)
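# Example invocation, based on the flags defined above (the script name and
# data path are hypothetical):
#
#   python csvm_benchmark.py --svmlight -f 780 -b 100,100 --convergence \
#       tests/files/libsvm/2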
def main():
    h = .02  # step size in the mesh

    names = ["Linear C-SVM", "RBF C-SVM", "Random forest"]

    classifiers = [
        CascadeSVM(kernel="linear", c=0.025, max_iter=5),
        CascadeSVM(gamma=2, c=1, max_iter=5),
        RandomForestClassifier(random_state=1)
    ]

    x, y = make_classification(n_features=2, n_redundant=0,
                               n_informative=2, random_state=1,
                               n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    x += 2 * rng.uniform(size=x.shape)
    linearly_separable = (x, y)

    datasets = [
        make_moons(noise=0.3, random_state=0),
        make_circles(noise=0.2, factor=0.5, random_state=1),
        linearly_separable
    ]

    preprocessed_data = dict()
    scores = dict()
    mesh_accuracy_ds = dict()

    for ds_cnt, data in enumerate(datasets):
        # preprocess dataset, split into training and test part
        x, y = data
        x = StandardScaler().fit_transform(x)
        x_train, x_test, y_train, y_test = \
            train_test_split(x, y, test_size=.4, random_state=42)

        ds_x_train = ds.array(x_train, block_size=(20, 2))
        ds_y_train = ds.array(y_train.reshape(-1, 1), block_size=(20, 1))
        ds_x_test = ds.array(x_test, block_size=(20, 2))
        ds_y_test = ds.array(y_test.reshape(-1, 1), block_size=(20, 1))

        x_min, x_max = x[:, 0].min() - .5, x[:, 0].max() + .5
        y_min, y_max = x[:, 1].min() - .5, x[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        preprocessed_data[ds_cnt] = x, x_train, x_test, y_train, y_test, \
            xx, yy

        for name, clf in zip(names, classifiers):
            clf.fit(ds_x_train, ds_y_train)
            scores[(ds_cnt, name)] = clf.score(ds_x_test, ds_y_test)

            mesh = np.c_[xx.ravel(), yy.ravel()]
            mesh_array = ds.array(mesh, (mesh.shape[0], 2))

            if hasattr(clf, "decision_function"):
                mesh_proba = clf.decision_function(mesh_array)
            else:
                mesh_proba = clf.predict_proba(mesh_array)

            mesh_accuracy_ds[(ds_cnt, name)] = mesh_proba

    # Synchronize while plotting the results
    plt.figure(figsize=(27, 9))
    i = 1

    for ds_cnt, data in enumerate(datasets):
        x, x_train, x_test, y_train, y_test, xx, yy = \
            preprocessed_data[ds_cnt]

        # just plot the dataset first
        cm = plt.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)

        if ds_cnt == 0:
            ax.set_title("Input data")

        # Plot the training points
        ax.scatter(x_train[:, 0], x_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(x_test[:, 0], x_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6, edgecolors='k')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        i += 1

        # iterate over classifiers
        for name, clf in zip(names, classifiers):
            ax = plt.subplot(len(datasets), len(classifiers) + 1, i)

            score = compss_wait_on(scores[(ds_cnt, name)])
            mesh_proba = mesh_accuracy_ds[(ds_cnt, name)]

            if hasattr(clf, "decision_function"):
                Z = mesh_proba.collect()
            else:
                Z = mesh_proba.collect()[:, 1]

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

            # Plot the training points
            ax.scatter(x_train[:, 0], x_train[:, 1], c=y_train,
                       cmap=cm_bright, edgecolors='k')
            # Plot the testing points
            ax.scatter(x_test[:, 0], x_test[:, 1], c=y_test,
                       cmap=cm_bright, edgecolors='k', alpha=0.6)

            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())

            if ds_cnt == 0:
                ax.set_title(name)

            ax.text(xx.max() - .3, yy.min() + .3,
                    ('%.2f' % score).lstrip('0'),
                    size=15, horizontalalignment='right')
            i += 1

    plt.tight_layout()
    plt.show()
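# Entry-point guard for the plotting example above (assumed, since the
# script is meant to be run directly):
if __name__ == "__main__":
    main()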