def modelselection_grid_search_kernel():
	from numpy.random import rand
	from shogun.Evaluation import CrossValidation, ContingencyTableEvaluation, ACCURACY
	from shogun.Evaluation import StratifiedCrossValidationSplitting
	from shogun.ModelSelection import GridSearchModelSelection
	from shogun.Features import RealFeatures, BinaryLabels
	from shogun.Classifier import LibSVM

	num_subsets=3
	num_vectors=20
	dim_vectors=3

	# create some (nonsense) random data
	matrix=rand(dim_vectors, num_vectors)

	# create num_vectors vectors with dim_vectors dimensions each
	features=RealFeatures()
	features.set_feature_matrix(matrix)

	# create labels, two classes
	labels=BinaryLabels(num_vectors)
	for i in range(num_vectors):
		labels.set_label(i, 1 if i%2==0 else -1)

	# create svm
	classifier=LibSVM()

	# splitting strategy
	splitting_strategy=StratifiedCrossValidationSplitting(labels, num_subsets)

	# accuracy evaluation
	evaluation_criterion=ContingencyTableEvaluation(ACCURACY)

	# cross validation class for evaluation in model selection
	cross=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterion)
	cross.set_num_runs(1)

	# print all parameters available for model selection
	# Don't worry if yours is not included; simply write to the mailing list
	classifier.print_modsel_params()

	# model parameter selection
	param_tree=create_param_tree()
	param_tree.print_tree()

	grid_search=GridSearchModelSelection(param_tree, cross)

	print_state=True
	best_combination=grid_search.select_model(print_state)
	print("best parameter(s):")
	best_combination.print_tree()

	best_combination.apply_to_machine(classifier)

	# larger number of runs to have tighter confidence intervals
	cross.set_num_runs(10)
	cross.set_conf_int_alpha(0.01)
	result=cross.evaluate()
	print("result: ")
	result.print_result()

	return 0
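create_param_tree() is defined elsewhere in the original example; below is a minimal sketch, assuming the ModelSelectionParameters API used in Example #8 further down and searching only the C1/C2 regularization constants (the original presumably also nests kernel parameters):

def create_param_tree():
	from shogun.ModelSelection import ModelSelectionParameters, R_EXP

	# hypothetical tree: search C1 and C2 over the exponential grid 2^-2..2^2
	root=ModelSelectionParameters()

	c1=ModelSelectionParameters("C1")
	root.append_child(c1)
	c1.build_values(-2.0, 2.0, R_EXP)

	c2=ModelSelectionParameters("C2")
	root.append_child(c2)
	c2.build_values(-2.0, 2.0, R_EXP)

	return root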
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """

        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details) + '\n')
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """

        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details) + '\n')
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################
    # set up toy data and svm

    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                 axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    print("unserializing SVM")
    svm2 = load(fn)

    print("comparing objectives")

    svm2.train()

    print("objective before serialization:", svm.get_objective())
    print("objective after serialization:", svm2.get_objective())

    print("comparing predictions")

    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in range(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001

    print("all checks passed.")

    return True
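A hypothetical invocation of the example above; the parameter values are illustrative only:

if __name__ == '__main__':
    print('SVMLight serialization example')
    serialization_svmlight_modular(num=50, dist=1.0, width=2.1, C=1.0)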
Example #3
    elif ctype == 'kernel':
        feats = util.get_features(indata, 'kernel_')
    else:
        feats = util.get_features(indata, prefix)

    machine = _get_machine(indata, prefix, feats)

    try:
        fun = eval(indata[prefix + 'name'])
    except NameError:
        print("%s is disabled/unavailable!" % indata[prefix + 'name'])
        return False

    # cannot refactor into function, because labels is unrefed otherwise
    if prefix + 'labels' in indata:
        labels = BinaryLabels(double(indata[prefix + 'labels']))
        if ctype == 'kernel':
            classifier = fun(indata[prefix + 'C'], machine, labels)
        elif ctype == 'linear':
            classifier = fun(indata[prefix + 'C'], feats['train'], labels)
        elif ctype == 'knn':
            classifier = fun(indata[prefix + 'k'], machine, labels)
        elif ctype == 'lda':
            classifier = fun(indata[prefix + 'gamma'], feats['train'], labels)
        elif ctype == 'perceptron':
            classifier = fun(feats['train'], labels)
        elif ctype == 'wdsvmocas':
            classifier = fun(indata[prefix + 'C'], indata[prefix + 'degree'],
                             indata[prefix + 'degree'], feats['train'], labels)
        else:
            return False
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    import inspect

    def save(filename, myobj):
        """
		save object to file using pickle
		
		@param filename: name of destination file
		@type filename: str
		@param myobj: object to save (has to be pickleable)
		@type myobj: obj
		"""

        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details) + '\n')
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
		Load from filename using pickle
		
		@param filename: name of file to load from
		@type filename: str
		"""

        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details) + '\n')
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################
    # set up toy data and svm

    seed(17)
    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                 axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################

    #print("labels:")
    #print(pickle.dumps(labels))
    #
    #print("features")
    #print(pickle.dumps(feats_train))
    #
    #print("kernel")
    #print(pickle.dumps(kernel))
    #
    #print("svm")
    #print(pickle.dumps(svm))
    #
    #print("#################################")

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)

    save(fn, svm)

    #print("#################################")

    #print("unserializing SVM")
    svm2 = load(fn)

    #print("#################################")
    #print("comparing training")

    svm2.train()

    #print("objective before serialization:", svm.get_objective())
    #print("objective after serialization:", svm2.get_objective())
    return svm, svm.get_objective(), svm2, svm2.get_objective()
def classifier_non_separable_svm(n=100, distance=5, seed=None, C1=1, C2=100):
    '''
    n is the number of examples per class; a couple of points of the positive
    class are moved towards the negative class to make separation harder.

    C1 and C2 are the two regularization values used.
    '''
    from shogun.Features import RealFeatures, BinaryLabels

    # 2D data
    _DIM = 2

    np.random.seed(seed)

    # Produce some (probably) linearly separable training data by hand
    # Two Gaussians at a far enough distance
    X = np.array(2 * np.random.randn(_DIM, n)) + distance
    Y = np.array(1.5 * np.random.randn(_DIM, n)) - distance
    label_train_twoclass = np.hstack((np.ones(n), -np.ones(n)))
    # Move a couple of the trailing positive-class points closer to the negative class
    X[:, -3:-1] = np.random.randn(_DIM, 2) - 0.5 * distance / 2

    fm_train_real = np.hstack((X, Y))
    # add an all-ones feature to implicitly learn a bias
    fm_train_real = np.vstack((fm_train_real, np.ones(2 * n)))
    feats_train = RealFeatures(fm_train_real)
    labels = BinaryLabels(label_train_twoclass)

    # Find limits for visualization
    x_min = min(np.min(X[0, :]), np.min(Y[0, :]))
    x_max = max(np.max(X[0, :]), np.max(Y[0, :]))

    # Train first SVM and plot its hyperplane
    svm1 = train_svm(feats_train, labels, C1)
    plot_hyperplane(svm1, x_min, x_max, 'g')

    # Train second SVM and plot its hyperplane
    svm2 = train_svm(feats_train, labels, C2)
    plot_hyperplane(svm2, x_min, x_max, 'y')

    # Plot the two-class data
    plt.scatter(X[0, :],
                X[1, :],
                s=40,
                marker='o',
                facecolors='none',
                edgecolors='b')
    plt.scatter(Y[0, :],
                Y[1, :],
                s=40,
                marker='s',
                facecolors='none',
                edgecolors='r')

    # Customize the plot

    y_min = min(np.min(X[1, :]), np.min(Y[1, :]))
    y_max = max(np.max(X[1, :]), np.max(Y[1, :]))

    plt.axis([x_min - 1, x_max + 1, y_min - 1, y_max + 1])
    plt.title('SVM trade-off')
    plt.xlabel('x')
    plt.ylabel('y')

    plt.show()

    return svm1, svm2
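The helpers train_svm and plot_hyperplane are not part of this listing; below is a minimal sketch, assuming the LibLinear usage of Example #6 further down and that the bias is carried by the appended all-ones feature row:

import numpy as np
import matplotlib.pyplot as plt

def train_svm(feats_train, labels, C, epsilon=1e-3):
    from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC
    # the all-ones feature row carries the bias, so the built-in bias stays off
    svm = LibLinear(C, feats_train, labels)
    svm.set_liblinear_solver_type(L2R_L2LOSS_SVC)
    svm.set_epsilon(epsilon)
    svm.set_bias_enabled(False)
    svm.train()
    return svm

def plot_hyperplane(svm, x_min, x_max, color):
    # solve w[0]*x + w[1]*y + w[2] = 0 for y; w[2] acts as the bias term
    w = svm.get_w()
    hx = np.linspace(x_min - 1, x_max + 1)
    plt.plot(hx, -(w[0] * hx + w[2]) / w[1], color, linewidth=2.0)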
Example #6
def classifier_non_separable_svm(n=100, m=10, distance=5, seed=None):
    '''
    n is the number of examples per class and m is the number of examples per
    class that get their labels swapped to force non-linearly separable data.
    '''
    from shogun.Features import RealFeatures, BinaryLabels
    from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC

    # 2D data
    _DIM = 2

    np.random.seed(seed)

    # Produce some (probably) linearly separable training data by hand
    # Two Gaussians at a far enough distance
    X = np.array(np.random.randn(_DIM, n)) + distance
    Y = np.array(np.random.randn(_DIM, n))
    # The last m points of each class get their labels swapped to force non-linearly separable data
    label_train_twoclass = np.hstack(
        (np.ones(n - m), -np.ones(m), -np.ones(n - m), np.ones(m)))

    fm_train_real = np.hstack((X, Y))
    feats_train = RealFeatures(fm_train_real)
    labels = BinaryLabels(label_train_twoclass)

    # Train linear SVM
    C = 1
    epsilon = 1e-3
    svm = LibLinear(C, feats_train, labels)
    svm.set_liblinear_solver_type(L2R_L2LOSS_SVC)
    svm.set_epsilon(epsilon)
    svm.set_bias_enabled(True)
    svm.train()

    # Get hyperplane parameters
    b = svm.get_bias()
    w = svm.get_w()

    # Find limits for visualization
    x_min = min(np.min(X[0, :]), np.min(Y[0, :]))
    x_max = max(np.max(X[0, :]), np.max(Y[0, :]))

    y_min = min(np.min(X[1, :]), np.min(Y[1, :]))
    y_max = max(np.max(X[1, :]), np.max(Y[1, :]))

    hx = np.linspace(x_min - 1, x_max + 1)

    # hyperplane: w[0]*x + w[1]*y + b = 0  =>  y = -(w[0]*x + b) / w[1]
    plt.plot(hx, -1 / w[1] * (w[0] * hx + b), 'k', linewidth=2.0)

    # Plot the two-class data
    pos_idxs = label_train_twoclass == +1
    plt.scatter(fm_train_real[0, pos_idxs],
                fm_train_real[1, pos_idxs],
                s=40,
                marker='o',
                facecolors='none',
                edgecolors='b')

    neg_idxs = label_train_twoclass == -1
    plt.scatter(fm_train_real[0, neg_idxs],
                fm_train_real[1, neg_idxs],
                s=40,
                marker='s',
                facecolors='none',
                edgecolors='r')

    # Customize the plot
    plt.axis([x_min - 1, x_max + 1, y_min - 1, y_max + 1])
    plt.title('SVM with non-linearly separable data')
    plt.xlabel('x')
    plt.ylabel('y')

    plt.show()

    return svm
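Both variants assume module-level numpy/matplotlib imports; a hypothetical call to the second variant (which shadows the first, since both share a name), with illustrative arguments:

import numpy as np
import matplotlib.pyplot as plt

if __name__ == '__main__':
    classifier_non_separable_svm(n=100, m=10, distance=5, seed=42)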
def mkl_binclass_modular(fm_train_real=traindat, fm_test_real=testdat, fm_label_twoclass=label_traindat):
    from shogun.Features import RealFeatures, CombinedFeatures, BinaryLabels
    from shogun.Kernel import PolyKernel, CombinedKernel, CustomKernel
    from shogun.Classifier import MKLClassification

    ##################################
    # set up and train

    # create some poly train/test matrix
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10,3)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()

    pfeats = RealFeatures(fm_test_real)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(fm_train_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(PolyKernel(10,2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = BinaryLabels(fm_label_twoclass)
    mkl = MKLClassification()

    # which norm to use for MKL (other valid choices: 2, 3)
    mkl.set_mkl_norm(1)

    # set cost (neg, pos)
    mkl.set_C(1, 1)

    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    # train
    mkl.train()
    #w=kernel.get_subkernel_weights()
    #kernel.set_subkernel_weights(w)


    ##################################
    # test

    # create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(fm_test_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    # and classify
    mkl.set_kernel(kernel)
    mkl.apply()
    return mkl.apply(),kernel
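The defaults traindat, testdat and label_traindat refer to data loaded elsewhere in the original module; a hedged stand-in using random two-Gaussian toy data:

from numpy import concatenate, ones
from numpy.random import randn

# illustrative toy data (assumption; the original loads a real dataset)
toy_traindat = concatenate((randn(2, 50) - 1, randn(2, 50) + 1), axis=1)
toy_testdat = concatenate((randn(2, 50) - 1, randn(2, 50) + 1), axis=1)
toy_labels = concatenate((-ones(50), ones(50)))

predictions, combined_kernel = mkl_binclass_modular(toy_traindat, toy_testdat, toy_labels)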
Example #8
def modelselection_grid_search_liblinear_modular(traindat=traindat,
                                                 label_traindat=label_traindat
                                                 ):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import ContingencyTableEvaluation, ACCURACY
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.ModelSelection import GridSearchModelSelection
    from shogun.ModelSelection import ModelSelectionParameters, R_EXP
    from shogun.ModelSelection import ParameterCombination
    from shogun.Features import BinaryLabels
    from shogun.Features import RealFeatures
    from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC

    # build parameter tree to select C1 and C2
    param_tree_root = ModelSelectionParameters()
    c1 = ModelSelectionParameters("C1")
    param_tree_root.append_child(c1)
    c1.build_values(-2.0, 2.0, R_EXP)

    c2 = ModelSelectionParameters("C2")
    param_tree_root.append_child(c2)
    c2.build_values(-2.0, 2.0, R_EXP)

    # training data
    features = RealFeatures(traindat)
    labels = BinaryLabels(label_traindat)

    # classifier
    classifier = LibLinear(L2R_L2LOSS_SVC)

    # print all parameters available for model selection
    # Don't worry if yours is not included; simply write to the mailing list
    #classifier.print_modsel_params()

    # splitting strategy for cross-validation
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 10)

    # evaluation method
    evaluation_criterion = ContingencyTableEvaluation(ACCURACY)

    # cross-validation instance
    cross_validation = CrossValidation(classifier, features, labels,
                                       splitting_strategy,
                                       evaluation_criterion)
    cross_validation.set_autolock(False)

    # model selection instance
    model_selection = GridSearchModelSelection(param_tree_root,
                                               cross_validation)

    # perform model selection with selected methods
    #print "performing model selection of"
    #param_tree_root.print_tree()
    best_parameters = model_selection.select_model()

    # print best parameters
    #print "best parameters:"
    #best_parameters.print_tree()

    # apply the best parameters and return the cross-validation result
    best_parameters.apply_to_machine(classifier)
    result = cross_validation.evaluate()

    return result
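Here too the defaults point at module-level data; an illustrative call with the same kind of toy data as above:

from numpy import concatenate, ones
from numpy.random import randn

toy_traindat = concatenate((randn(2, 50) - 1, randn(2, 50) + 1), axis=1)
toy_labels = concatenate((-ones(50), ones(50)))
result = modelselection_grid_search_liblinear_modular(toy_traindat, toy_labels)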
Example #9
    def solve(self, C, all_xt, all_lt, task_indicator, M, L):
        """
        wrap shogun solver with same interface as others
        """

        xt = numpy.array(all_xt)
        lt = numpy.array(all_lt)
        tt = numpy.array(task_indicator, dtype=numpy.int32)
        tsm = numpy.array(M)
        laplacian = numpy.array(L)

        print "task_sim:", tsm

        num_tasks = L.shape[0]

        # sanity checks
        assert len(xt) == len(lt) == len(tt)
        assert M.shape == L.shape
        assert num_tasks == len(set(tt))

        # set up shogun objects
        if type(xt[0]) == str or type(xt[0]) == numpy.string_:
            feat = create_hashed_features_wdk(xt, 8)
        else:
            feat = RealFeatures(xt.T)

        lab = BinaryLabels(lt)

        # set up machinery
        svm = LibLinearMTL()
        svm.io.set_loglevel(MSG_DEBUG)
        svm.set_epsilon(self.eps)

        svm.set_C(C, C)
        svm.set_bias_enabled(False)

        # set MTL stuff
        svm.set_task_indicator_lhs(tt)
        svm.set_task_indicator_rhs(tt)
        svm.set_num_tasks(num_tasks)
        svm.set_use_cache(False)

        #print "setting sparse matrix!"
        tsm_sp = csc_matrix(tsm)
        svm.set_task_similarity_matrix(tsm_sp)
        #svm.set_task_similarity_matrix(tsm)
        svm.set_graph_laplacian(laplacian)

        # invoke training
        svm.set_labels(lab)

        # how often do we like to compute objective etc
        svm.set_record_interval(self.record_interval)
        svm.set_min_interval(self.min_interval)
        svm.set_max_iterations(10000000)

        # start training
        start_time = time.time()
        svm.train(feat)

        if self.record_variables:

            self.final_train_time = time.time() - start_time
            print "total training time:", self.final_train_time, "seconds"

            self.primal_objectives = svm.get_primal_objectives()
            self.dual_objectives = svm.get_dual_objectives()
            self.train_times = svm.get_training_times()

            print "computing objectives one last time"
            self.final_primal_obj = svm.compute_primal_obj()
            self.final_dual_obj = svm.compute_dual_obj()

            print "obj primal", self.final_primal_obj
            print "obj dual", self.final_dual_obj
            print "actual duality gap:", self.final_primal_obj - self.final_dual_obj

            #print "V", svm.get_V()
            self.V = svm.get_V()
            self.W = svm.get_W()
            self.alphas = svm.get_alphas()

            # get model parameters
            #V = svm.get_W().T

        if self.sanity_check:
            print "comparing to python implementation"

            #dual_obj_python = compute_dual_objective(alphas, xt, lt, task_indicator, M)
            #print "dual obj python", dual_obj_python
            #print "dual obj C++", dual_obj

            #print alphas
            #W = alphas_to_w(alphas, xt, lt, task_indicator, M)

            #print W

            #primal_obj = compute_primal_objective(W.reshape(W.shape[0] * W.shape[1]), C, xt, lt, task_indicator, L)
            #print "python primal", primal_obj

            # compare dual obj

            #return objectives#, train_times

        return True
Example #10
    def solve(self, C, all_xt, all_lt, task_indicator, M, L):
        """
        implementation using multitask kernel
        """

        xt = numpy.array(all_xt)
        lt = numpy.array(all_lt)
        tt = numpy.array(task_indicator, dtype=numpy.int32)
        tsm = numpy.array(M)

        print "task_sim:", tsm

        num_tasks = L.shape[0]

        # sanity checks
        assert len(xt) == len(lt) == len(tt)
        assert M.shape == L.shape
        assert num_tasks == len(set(tt))

        # set up shogun objects
        if type(xt[0]) == numpy.string_:
            feat = StringCharFeatures(DNA)
            xt = [str(a) for a in xt]
            feat.set_features(xt)
            base_kernel = WeightedDegreeStringKernel(feat, feat, 8)
        else:
            feat = RealFeatures(xt.T)
            base_kernel = LinearKernel(feat, feat)

        lab = BinaryLabels(lt)

        # set up normalizer
        normalizer = MultitaskKernelNormalizer(tt.tolist())

        for i in range(num_tasks):
            for j in range(num_tasks):
                normalizer.set_task_similarity(i, j, M[i, j])

        print "num of unique tasks: ", normalizer.get_num_unique_tasks(
            task_indicator)

        # set up kernel
        base_kernel.set_cache_size(4000)
        base_kernel.set_normalizer(normalizer)
        base_kernel.init_normalizer()

        # set up svm
        svm = SVMLight()  #LibSVM()

        svm.set_epsilon(self.eps)

        #SET THREADS TO 1
        #print "reducing num threads to one"
        #segfaults
        #svm.parallel.set_num_threads(1)
        #print "using one thread"

        # how often do we like to compute objective etc
        svm.set_record_interval(self.record_interval)
        svm.set_min_interval(self.min_interval)
        #svm.set_target_objective(target_obj)

        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        #svm.set_shrinking_enabled(False)
        svm.io.set_loglevel(MSG_DEBUG)

        svm.set_C(C, C)
        svm.set_bias_enabled(False)

        # prepare for training
        svm.set_labels(lab)
        svm.set_kernel(base_kernel)

        # train svm
        svm.train()

        if self.record_variables:

            print "recording variables"

            self.dual_objectives = [-obj for obj in svm.get_dual_objectives()]
            self.train_times = svm.get_training_times()

            # get model parameters
            sv_idx = svm.get_support_vectors()
            sparse_alphas = svm.get_alphas()

            assert len(sv_idx) == len(sparse_alphas)

            # compute dense alpha (remove label)
            self.alphas = numpy.zeros(len(xt))
            for id_sparse, id_dense in enumerate(sv_idx):
                self.alphas[id_dense] = sparse_alphas[id_sparse] * lt[id_dense]

            # print alphas
            W = alphas_to_w(self.alphas, xt, lt, task_indicator, M)
            self.W = W

            #
            self.final_primal_obj = compute_primal_objective(
                W.reshape(W.shape[0] * W.shape[1]), C, all_xt, all_lt,
                task_indicator, L)

            print "MTK duality gap:", self.dual_objectives[
                -1] - self.final_primal_obj

        return True
Example #11
import os
import numpy as np
from shogun.Features import RealFeatures, BinaryLabels
from shogun.Distance import EuclideanDistance
from shogun.Classifier import KNN

# Load training data.
f = open(os.path.join(os.path.dirname(__file__), '../data/arcene_train.data'))
trainData = np.fromfile(f, dtype=np.float64, sep=' ')
trainData = trainData.reshape(-1, 10000)
f.close()

f = open(os.path.join(os.path.dirname(__file__), '../data/arcene_train.label'))
trainLabel = np.fromfile(f, dtype=np.int32, sep=' ')
f.close()

# Load test data.
f = open(os.path.join(os.path.dirname(__file__), '../data/arcene_test.data'))
testData = np.fromfile(f, dtype=np.float64, sep=' ')
testData = testData.reshape(-1, 10000)
f.close()

f = open(os.path.join(os.path.dirname(__file__), '../data/arcene_test.label'))
testLabel = np.fromfile(f, dtype=np.float64, sep=' ')
f.close()

# Construct a KNN classifier with a neighborhood size of 9.
feat = RealFeatures(trainData.T)
distance = EuclideanDistance(feat, feat)
labels = BinaryLabels(trainLabel.astype(np.float64))
testFeat = RealFeatures(testData.T)
knn = KNN(9, distance, labels)
knn.train()

# Predict the classification.
output = knn.apply(testFeat).get_labels()

# Validate the classification.
print(output == testLabel)
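A short follow-up that condenses the element-wise comparison above into a single accuracy figure:

accuracy = np.mean(output == testLabel)
print('test accuracy: %.2f%%' % (100 * accuracy))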