Example #1
0
    def test_predict(self):
        """Fit a linear C-SVM on four separable points and check the labels
        that predict() assigns to those points and to two unseen ones."""
        # Points with positive coordinates are class 0, negative ones class 1.
        pos_a, pos_b = [1, 2], [2, 1]
        neg_a, neg_b = [-1, -2], [-2, -1]

        train_points = np.array([pos_a, neg_b, neg_a, pos_b])
        train_labels = np.array([0, 1, 1, 0]).reshape(-1, 1)
        x = ds.array(train_points, (2, 2))
        y = ds.array(train_labels, (2, 1))

        params = dict(cascade_arity=3, max_iter=10, tol=1e-4,
                      kernel='linear', c=2, gamma=0.1,
                      check_convergence=False, random_state=666,
                      verbose=False)
        csvm = CascadeSVM(**params)
        csvm.fit(x, y)

        # Unseen points: the first is expected in class 0, the second in
        # class 1.
        new_pos, new_neg = np.array([1, 1]), np.array([-1, -1])
        queries = np.array([pos_a, pos_b, neg_a, neg_b, new_pos, new_neg])
        x_test = ds.array(queries, (2, 2))

        predicted = csvm.predict(x_test).collect()
        l1, l2, l3, l4, l5, l6 = predicted

        self.assertTrue(l1 == l2 == l5 == 0)
        self.assertTrue(l3 == l4 == l6 == 1)
Example #2
0
    def test_score(self, collect):
        """Check that score() yields an accuracy of 1.0 on the training
        points of a separable toy problem.

        :param collect: forwarded to score(); when falsy, the returned value
            is synchronized with compss_wait_on before being compared.
        """
        # Points with positive coordinates are class 0, negative ones class 1.
        pos_a, pos_b = [1, 2], [2, 1]
        neg_a, neg_b = [-1, -2], [-2, -1]

        x = ds.array(np.array([pos_a, neg_b, neg_a, pos_b]), (2, 2))
        y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1))

        params = dict(cascade_arity=3, max_iter=10, tol=1e-4, kernel='rbf',
                      c=2, gamma=0.1, check_convergence=True,
                      random_state=666, verbose=False)
        csvm = CascadeSVM(**params)
        csvm.fit(x, y)

        # The points are separable, so scoring the training data itself is
        # expected to give 100% accuracy.
        x_test = ds.array(np.array([pos_a, pos_b, neg_a, neg_b]), (2, 2))
        y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1))

        result = csvm.score(x_test, y_test, collect)
        accuracy = result if collect else compss_wait_on(result)

        self.assertEqual(accuracy, 1.0)
Example #3
0
    def test_init_params(self):
        """Check that constructor arguments are stored in the estimator's
        private attributes, first for an rbf kernel with every argument
        given, then for a linear kernel with only kernel and c given."""
        settings = {
            'cascade_arity': 3,
            'max_iter': 1,
            'tol': 1e-4,
            'kernel': 'rbf',
            'c': 2,
            'gamma': 0.1,
            'check_convergence': True,
            'random_state': 666,
            'verbose': False,
        }
        csvm = CascadeSVM(**settings)

        self.assertEqual(csvm._arity, settings['cascade_arity'])
        self.assertEqual(csvm._max_iter, settings['max_iter'])
        self.assertEqual(csvm._tol, settings['tol'])
        clf_params = csvm._clf_params
        self.assertEqual(clf_params['kernel'], settings['kernel'])
        self.assertEqual(clf_params['C'], settings['c'])
        self.assertEqual(clf_params['gamma'], settings['gamma'])
        self.assertEqual(csvm._check_convergence,
                         settings['check_convergence'])
        self.assertEqual(csvm._verbose, settings['verbose'])

        # Linear kernel with a custom c; every other argument keeps its
        # default value.
        linear_svm = CascadeSVM(kernel='linear', c=0.3)
        self.assertEqual(linear_svm._clf_params['kernel'], 'linear')
        self.assertEqual(linear_svm._clf_params['C'], 0.3)
Example #4
0
    def test_fit(self):
        """Check the converged/iterations attributes after fit(), once with
        convergence checking enabled and once with it disabled."""
        x, y = ds.load_svmlight_file("tests/files/libsvm/2", (10, 300), 780,
                                     False)

        # Arguments shared by both estimators below.
        common = dict(cascade_arity=3, tol=1e-4, kernel='linear', c=2,
                      gamma=0.1, random_state=666, verbose=False)

        checked = CascadeSVM(max_iter=5, check_convergence=True, **common)
        checked.fit(x, y)
        self.assertTrue(checked.converged)

        # With convergence checking off and a single iteration allowed, the
        # estimator must report one iteration and no convergence.
        unchecked = CascadeSVM(max_iter=1, check_convergence=False, **common)
        unchecked.fit(x, y)
        self.assertFalse(unchecked.converged)
        self.assertEqual(unchecked.iterations, 1)
Example #5
0
    def test_fit_default_gamma(self):
        """ Tests that fit() converges on a toy dataset when gamma is left
        at its default value, and that a second estimator limited to one
        iteration without convergence checking does not report convergence
        """
        x, y = ds.load_svmlight_file("tests/files/libsvm/2", (10, 300), 780,
                                     False)

        # Arguments shared by both estimators below.
        common = dict(cascade_arity=3, tol=1e-4, kernel='linear', c=2,
                      random_state=666, verbose=False)

        # gamma is deliberately not passed here.
        default_gamma = CascadeSVM(max_iter=5, check_convergence=True,
                                   **common)
        default_gamma.fit(x, y)
        self.assertTrue(default_gamma.converged)

        single_iter = CascadeSVM(max_iter=1, gamma=0.1,
                                 check_convergence=False, **common)
        single_iter.fit(x, y)
        self.assertFalse(single_iter.converged)
        self.assertEqual(single_iter.iterations, 1)
Example #6
0
    def test_init_params(self):
        """ Test that every constructor argument is exposed as a public
        attribute with the same name and value"""
        expected = {
            'cascade_arity': 3,
            'max_iter': 1,
            'tol': 1e-4,
            'kernel': 'rbf',
            'c': 2,
            'gamma': 0.1,
            'check_convergence': True,
            'random_state': 666,
            'verbose': False,
        }

        csvm = CascadeSVM(**expected)

        # Attributes are checked in the order the arguments are listed above.
        for attribute, value in expected.items():
            self.assertEqual(getattr(csvm, attribute), value)
Example #7
0
    def test_fit(self):
        """Tests RandomizedSearchCV fit().

        Runs a randomized search over the 'c' and 'gamma' parameters of a
        CascadeSVM on the iris data (class 0 versus the rest) and checks the
        keys and length of cv_results_, the presence of the best_* and
        scorer_ attributes, and the number of splits.
        """
        x_np, y_np = datasets.load_iris(return_X_y=True)
        # Pre-shuffling is required for CSVM. A dedicated, seeded generator
        # keeps the test data reproducible between runs and leaves NumPy's
        # global random state untouched.
        p = np.random.RandomState(0).permutation(len(x_np))
        x = ds.array(x_np[p], (30, 4))
        y = ds.array((y_np[p] == 0)[:, np.newaxis], (30, 1))
        param_distributions = {'c': stats.expon(scale=0.5),
                               'gamma': stats.expon(scale=1)}
        csvm = CascadeSVM()
        n_iter = 12
        k = 3
        searcher = RandomizedSearchCV(estimator=csvm,
                                      param_distributions=param_distributions,
                                      n_iter=n_iter, cv=k, random_state=0)
        searcher.fit(x, y)

        # One 'split<i>_test_score' entry is expected per fold, on top of the
        # fixed result keys.
        expected_keys = {'param_c', 'param_gamma', 'params', 'mean_test_score',
                         'std_test_score', 'rank_test_score'}
        expected_keys.update('split%d_test_score' % i for i in range(k))

        self.assertSetEqual(set(searcher.cv_results_.keys()), expected_keys)
        self.assertEqual(len(searcher.cv_results_['param_c']), n_iter)
        self.assertTrue(hasattr(searcher, 'best_estimator_'))
        self.assertTrue(hasattr(searcher, 'best_score_'))
        self.assertTrue(hasattr(searcher, 'best_params_'))
        self.assertTrue(hasattr(searcher, 'best_index_'))
        self.assertTrue(hasattr(searcher, 'scorer_'))
        self.assertEqual(searcher.n_splits_, k)
Example #8
0
def main():
    """Load the ijcnn1 training file in sparse SVMLight format and hand
    CascadeSVM.fit on that data to performance.measure."""
    train_path = "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train"
    n_features = 22

    x_ij, y_ij = ds.load_svmlight_file(train_path,
                                       block_size=(5000, n_features),
                                       n_features=n_features,
                                       store_sparse=True)

    estimator = CascadeSVM(c=10000, gamma=0.01)

    performance.measure("CSVM", "ijcnn1", estimator.fit, x_ij, y_ij)
Example #9
0
def main():
    """Load the KDD99 training CSV, shuffle it, split off the last column as
    labels and hand CascadeSVM.fit on that data to performance.measure."""
    n_columns = 122
    label_col = n_columns - 1  # the labels live in the last column

    data = ds.load_txt_file(
        "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train.csv",
        block_size=(11482, n_columns))
    data = shuffle(data)

    y_kdd = data[:, label_col:n_columns]
    x_kdd = data[:, :label_col]

    estimator = CascadeSVM(c=10000, gamma=0.01)

    performance.measure("CSVM", "KDD99", estimator.fit, x_kdd, y_kdd)
Example #10
0
def main():
    """Load the KDD99 and ijcnn1 training sets and hand CascadeSVM.fit on
    each of them, in that order, to performance.measure."""
    dataset_root = "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib"

    # KDD99: shuffled CSV whose last (122nd) column holds the labels.
    kdd = ds.load_txt_file(dataset_root + "/kdd99/train.csv",
                           block_size=(11482, 122))
    kdd = shuffle(kdd)
    y_kdd = kdd[:, 121:122]
    x_kdd = kdd[:, :121]

    # ijcnn1: SVMLight file with 22 features, stored sparse.
    x_ij, y_ij = ds.load_svmlight_file(dataset_root + "/ijcnn1/train",
                                       block_size=(5000, 22),
                                       n_features=22,
                                       store_sparse=True)

    # The same estimator instance is fitted on both datasets.
    estimator = CascadeSVM(c=10000, gamma=0.01)

    performance.measure("CSVM", "KDD99", estimator.fit, x_kdd, y_kdd)
    performance.measure("CSVM", "ijcnn1", estimator.fit, x_ij, y_ij)
Example #11
0
    def test_fit_2(self):
        """Tests GridSearchCV fit() with different data: scaled breast
        cancer samples, a single-point parameter grid and five folds."""
        n_folds = 5
        samples, labels = datasets.load_breast_cancer(return_X_y=True)

        x = StandardScaler().fit_transform(
            ds.array(samples, block_size=(100, 10)))
        y = ds.array(labels.reshape(-1, 1), block_size=(100, 1))

        grid = {'c': [0.1], 'gamma': [0.1]}
        searcher = GridSearchCV(CascadeSVM(), grid, cv=n_folds)
        searcher.fit(x, y)

        # Attributes that must exist once the search has been fitted.
        for attribute in ('best_estimator_', 'best_score_', 'best_params_',
                          'best_index_', 'scorer_'):
            self.assertTrue(hasattr(searcher, attribute))
        self.assertEqual(searcher.n_splits_, n_folds)
Example #12
0
    def test_fit_private_params(self):
        """Check that, after fit(), the private _clf_params dict holds the
        kernel, C and gamma values that were given to the constructor."""
        seed = 666
        x, y = ds.load_svmlight_file("tests/files/libsvm/2", (10, 300), 780,
                                     False)

        # rbf kernel with explicit c and gamma.
        rbf_svm = CascadeSVM(kernel='rbf', c=2, gamma=0.1, random_state=seed)
        rbf_svm.fit(x, y)

        rbf_params = rbf_svm._clf_params
        self.assertEqual(rbf_params['kernel'], 'rbf')
        self.assertEqual(rbf_params['C'], 2)
        self.assertEqual(rbf_params['gamma'], 0.1)

        # linear kernel with explicit c only.
        linear_svm = CascadeSVM(kernel='linear', c=0.3, random_state=seed)
        linear_svm.fit(x, y)

        linear_params = linear_svm._clf_params
        self.assertEqual(linear_params['kernel'], 'linear')
        self.assertEqual(linear_params['C'], 0.3)
Example #13
0
    def test_duplicates(self):
        """ Tests that C-SVM does not generate duplicate support vectors """
        # The sample [0, 1] appears twice in the training data.
        points = np.array([[0, 1], [1, 1], [0, 1], [1, 2],
                           [0, 0], [2, 2], [2, 1], [1, 0]])
        labels = np.array([1, 0, 1, 0, 1, 0, 0, 1]).reshape(-1, 1)

        x = ds.array(points, (2, 2))
        y = ds.array(labels, (2, 1))

        csvm = CascadeSVM(c=1, random_state=1, max_iter=100, tol=0)
        csvm.fit(x, y)
        csvm._collect_clf()

        n_support_vectors = csvm._clf.support_vectors_.shape[0]
        self.assertEqual(n_support_vectors, 6)
Example #14
0
    def test_refit_false(self):
        """Tests GridSearchCV fit() with refit=False: no best_estimator_ may
        be set, while the remaining result attributes must exist."""
        n_folds = 3
        samples, labels = datasets.load_iris(return_X_y=True)

        x, y = shuffle(ds.array(samples, (30, 4)),
                       ds.array(labels[:, np.newaxis], (30, 1)),
                       random_state=0)

        estimator = CascadeSVM(check_convergence=False)
        searcher = GridSearchCV(estimator, {'max_iter': range(1, 5)},
                                cv=n_folds, refit=False)
        searcher.fit(x, y)

        self.assertFalse(hasattr(searcher, 'best_estimator_'))
        for attribute in ('best_score_', 'best_params_', 'best_index_',
                          'scorer_'):
            self.assertTrue(hasattr(searcher, attribute))
        self.assertEqual(searcher.n_splits_, n_folds)
Example #15
0
    def test_sparse(self):
        """ Tests that C-SVM produces the same support vectors and dual
        coefficients when the same file is loaded as sparse and as dense
        data"""
        seed = 666
        train = "tests/files/libsvm/3"
        block_size, n_features = (10, 300), 780

        x_sp, y_sp = ds.load_svmlight_file(train, block_size, n_features, True)
        x_d, y_d = ds.load_svmlight_file(train, block_size, n_features, False)

        csvm_sp = CascadeSVM(random_state=seed)
        csvm_sp.fit(x_sp, y_sp)

        csvm_d = CascadeSVM(random_state=seed)
        csvm_d.fit(x_d, y_d)

        dense_clf, sparse_clf = csvm_d._clf, csvm_sp._clf

        # Results of the sparse run are densified before being compared.
        same_vectors = np.array_equal(dense_clf.support_vectors_,
                                      sparse_clf.support_vectors_.toarray())
        self.assertTrue(same_vectors)

        same_coefs = np.array_equal(dense_clf.dual_coef_,
                                    sparse_clf.dual_coef_.toarray())
        self.assertTrue(same_coefs)
Example #16
0
    def test_decision_func(self):
        """Check decision_function() on a symmetric toy problem: mirrored
        training points must get values of equal magnitude, and two points
        with second coordinate 0 must get a value close to 0."""
        # Every training point has first coordinate 0; points with a positive
        # second coordinate are class 0, negative ones are class 1.
        up_far, up_near = [0, 2], [0, 1]
        down_far, down_near = [0, -2], [0, -1]

        x = ds.array(np.array([up_far, down_near, down_far, up_near]), (2, 2))
        y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1))

        params = dict(cascade_arity=3, max_iter=10, tol=1e-4, kernel='rbf',
                      c=2, gamma=0.1, check_convergence=False,
                      random_state=666, verbose=False)
        csvm = CascadeSVM(**params)
        csvm.fit(x, y)

        # Mirrored points are expected to be equally far from the boundary.
        mirrored = np.array([up_far, up_near, down_far, down_near])
        distances = csvm.decision_function(ds.array(mirrored, (2, 2)))
        d1, d2, d3, d4 = distances.collect()

        self.assertTrue(np.isclose(abs(d1) - abs(d3), 0))
        self.assertTrue(np.isclose(abs(d2) - abs(d4), 0))

        # These two points are expected to lie on the decision boundary
        # (distance=0).
        on_boundary = np.array([np.array([1, 0]), np.array([-1, 0])])
        distances = csvm.decision_function(ds.array(on_boundary, (1, 2)))
        d5, d6 = distances.collect()

        self.assertTrue(np.isclose(d5, 0))
        self.assertTrue(np.isclose(d6, 0))
Example #17
0
def main():
    """Command-line driver that fits a CascadeSVM and reports one result row.

    The training data is read from a CSV file (labels in the last column) or,
    with --svmlight, from an SVMLight file. After fitting, a row with the
    kernel, arity, block size, gamma, C, number of iterations, convergence
    flag, read time and fit time is built. The number of entries in the
    training path is appended when that path is a directory, and the score on
    the test file is appended when --test-file is given. The row is appended
    as CSV to --output_file if given, otherwise it is printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-k", "--kernel", metavar="KERNEL", type=str,
                        help="linear or rbf (default is rbf)",
                        choices=["linear", "rbf"], default="rbf")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-g", "--gamma", metavar="GAMMA", type=float,
                        help="(only for rbf kernel) default is 1 / n_features",
                        default=None)
    parser.add_argument("-c", metavar="C", type=float, default=1,
                        help="Penalty parameter C of the error term. "
                             "Default:1")
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("-t", "--test-file", metavar="TEST_FILE_PATH",
                        help="test file path", type=str, required=False)
    parser.add_argument("-o", "--output_file", metavar="OUTPUT_FILE_PATH",
                        help="output file path", type=str, required=False)
    parser.add_argument("--convergence", help="check for convergence",
                        action="store_true")
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-s", "--shuffle", help="shuffle input data",
                        action="store_true")
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    # A missing (or zero) gamma falls back to the estimator's "auto" setting.
    gamma = args.gamma if args.gamma else "auto"

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)
        # The labels are the last column, exactly as for the test file
        # below; the remaining columns are the features.
        y = x[:, x.shape[1] - 1: x.shape[1]]
        x = x[:, :x.shape[1] - 1]

    if args.shuffle:
        x, y = shuffle(x, y)

    if args.detailed_times:
        # Wait for the data to be loaded so that reading and fitting are
        # timed separately.
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    # The kernel chosen on the command line must reach the estimator;
    # otherwise -k/--kernel would have no effect.
    csvm = CascadeSVM(cascade_arity=args.arity, max_iter=args.iteration,
                      kernel=args.kernel, c=args.c, gamma=gamma,
                      check_convergence=args.convergence, verbose=args.verbose)

    csvm.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    # The parser defines block_size (there is no part_size option).
    out = [args.kernel, args.arity, args.block_size,
           csvm._clf_params["gamma"], args.c, csvm.iterations,
           csvm.converged, read_time, fit_time]

    if os.path.isdir(train_data):
        n_files = os.listdir(train_data)
        out.append(len(n_files))

    if args.test_file:
        if args.svmlight:
            x_test, y_test = ds.load_svmlight_file(args.test_file, block_size,
                                                   args.features,
                                                   sparse)
        else:
            x_test = ds.load_txt_file(args.test_file, block_size)
            y_test = x_test[:, x_test.shape[1] - 1: x_test.shape[1]]
            x_test = x_test[:, :x_test.shape[1] - 1]

        out.append(compss_wait_on(csvm.score(x_test, y_test)))

    if args.output_file:
        # csv.writer needs a text-mode file in Python 3; newline="" lets the
        # csv module control line endings itself.
        with open(args.output_file, "a", newline="") as f:
            wr = csv.writer(f)
            wr.writerow(out)
    else:
        print(out)
Example #18
0
def main():
    """Compare two CascadeSVM configurations and a random forest on three
    two-dimensional toy datasets.

    Every classifier is fitted on each scaled dataset, and its test score and
    its output over a regular mesh are requested. A grid of plots is then
    shown: one row per dataset, with the input data in the first column and
    one column per classifier.
    """
    step = .02  # step size in the mesh

    names = ["Linear C-SVM", "RBF C-SVM", "Random forest"]

    classifiers = [
        CascadeSVM(kernel="linear", c=0.025, max_iter=5),
        CascadeSVM(gamma=2, c=1, max_iter=5),
        RandomForestClassifier(random_state=1)
    ]

    features, targets = make_classification(n_features=2,
                                            n_redundant=0,
                                            n_informative=2,
                                            random_state=1,
                                            n_clusters_per_class=1)
    # Perturb the generated samples with seeded uniform noise.
    features += 2 * np.random.RandomState(2).uniform(size=features.shape)

    toy_sets = [
        make_moons(noise=0.3, random_state=0),
        make_circles(noise=0.2, factor=0.5, random_state=1),
        (features, targets)
    ]
    n_rows, n_cols = len(toy_sets), len(classifiers) + 1

    def draw_samples(axis, x_train, y_train, x_test, y_test, xx, yy, cmap):
        """Scatter the train and test points on axis, then set the axis
        limits to the mesh extent and remove the ticks."""
        axis.scatter(x_train[:, 0], x_train[:, 1], c=y_train, cmap=cmap,
                     edgecolors='k')
        # Test points are drawn semi-transparent.
        axis.scatter(x_test[:, 0], x_test[:, 1], c=y_test, cmap=cmap,
                     alpha=0.6, edgecolors='k')
        axis.set_xlim(xx.min(), xx.max())
        axis.set_ylim(yy.min(), yy.max())
        axis.set_xticks(())
        axis.set_yticks(())

    prepared, scores, mesh_outputs = {}, {}, {}
    for ds_cnt, (x, y) in enumerate(toy_sets):
        # Scale the dataset and split it into a training and a test part.
        x = StandardScaler().fit_transform(x)
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=.4, random_state=42)
        ds_x_train = ds.array(x_train, block_size=(20, 2))
        ds_y_train = ds.array(y_train.reshape(-1, 1), block_size=(20, 1))
        ds_x_test = ds.array(x_test, block_size=(20, 2))
        ds_y_test = ds.array(y_test.reshape(-1, 1), block_size=(20, 1))

        # Regular mesh covering the data with a margin of 0.5 on every side.
        x_min, x_max = x[:, 0].min() - .5, x[:, 0].max() + .5
        y_min, y_max = x[:, 1].min() - .5, x[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                             np.arange(y_min, y_max, step))
        prepared[ds_cnt] = x, x_train, x_test, y_train, y_test, xx, yy

        for name, clf in zip(names, classifiers):
            clf.fit(ds_x_train, ds_y_train)
            scores[(ds_cnt, name)] = clf.score(ds_x_test, ds_y_test)

            mesh = np.c_[xx.ravel(), yy.ravel()]
            mesh_array = ds.array(mesh, (mesh.shape[0], 2))

            # Use the decision function when the classifier offers one and
            # fall back to class probabilities otherwise.
            if hasattr(clf, "decision_function"):
                mesh_outputs[(ds_cnt, name)] = clf.decision_function(
                    mesh_array)
            else:
                mesh_outputs[(ds_cnt, name)] = clf.predict_proba(mesh_array)

    # Synchronize while plotting the results
    plt.figure(figsize=(27, 9))
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])

    for ds_cnt in range(n_rows):
        x, x_train, x_test, y_train, y_test, xx, yy = prepared[ds_cnt]
        # Subplots are numbered row by row starting at 1; the first panel of
        # each row shows the dataset alone.
        row_start = ds_cnt * n_cols

        ax = plt.subplot(n_rows, n_cols, row_start + 1)
        if ds_cnt == 0:
            ax.set_title("Input data")
        draw_samples(ax, x_train, y_train, x_test, y_test, xx, yy, cm_bright)

        # One panel per classifier, from the second column onwards.
        for column, (name, clf) in enumerate(zip(names, classifiers),
                                             start=2):
            ax = plt.subplot(n_rows, n_cols, row_start + column)

            score = compss_wait_on(scores[(ds_cnt, name)])

            surface = mesh_outputs[(ds_cnt, name)].collect()
            if not hasattr(clf, "decision_function"):
                # Keep only the second column of the probability output.
                surface = surface[:, 1]

            # Put the result into a color plot
            surface = surface.reshape(xx.shape)
            ax.contourf(xx, yy, surface, cmap=cm, alpha=.8)

            draw_samples(ax, x_train, y_train, x_test, y_test, xx, yy,
                         cm_bright)

            if ds_cnt == 0:
                ax.set_title(name)
            # Write the score in the lower right corner of the panel.
            ax.text(xx.max() - .3,
                    yy.min() + .3, ('%.2f' % score).lstrip('0'),
                    size=15,
                    horizontalalignment='right')

    plt.tight_layout()
    plt.show()