Example 1
def mkl_binclass_modular (fm_train_real=traindat,fm_test_real=testdat,fm_label_twoclass = label_traindat):

    ##################################
    # set up and train

    # create some poly train/test matrix
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10,3)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()

    pfeats = RealFeatures(fm_test_real)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(fm_train_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(PolyKernel(10,2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = Labels(fm_label_twoclass)
    mkl = MKLClassification()

    # which norm to use for MKL
    mkl.set_mkl_norm(1)  # try 2 or 3 for other norms

    # set cost (neg, pos)
    mkl.set_C(1, 1)

    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    # train
    mkl.train()
    #w=kernel.get_subkernel_weights()
    #kernel.set_subkernel_weights(w)


    ##################################
    # test

    # create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(fm_test_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    # and classify
    mkl.set_kernel(kernel)
    return mkl.apply(), kernel
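The snippet assumes module-level imports and globals (traindat, testdat, label_traindat) that are not shown. A hypothetical driver with toy two-class data, purely as a sketch:

from numpy import concatenate, ones
from numpy.random import randn

traindat = concatenate((randn(2, 50) - 1, randn(2, 50) + 1), axis=1)
testdat = concatenate((randn(2, 50) - 1, randn(2, 50) + 1), axis=1)
label_traindat = concatenate((-ones(50), ones(50)))

predictions, combined_kernel = mkl_binclass_modular(traindat, testdat, label_traindat)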
Example 2
		myobj = pickle.load(f)
		f.close()
		return myobj


	##################################################

	seed(17)
	traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
	testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)

	trainlab=concatenate((-ones(num), ones(num)))
	testlab=concatenate((-ones(num), ones(num)))

	feats_train=RealFeatures(traindata_real)
	feats_test=RealFeatures(testdata_real)
	kernel=GaussianKernel(feats_train, feats_train, width)
	#kernel.io.set_loglevel(MSG_DEBUG)

	labels=Labels(trainlab)

	svm=SVMLight(C, kernel, labels)
	svm.train()
	#svm.io.set_loglevel(MSG_DEBUG)

	##################################################

	#print "labels:"
	#print pickle.dumps(labels)
	#
Example 3
X = Xall[0:ntrain, :]
y = yall[0:ntrain]

Xtest = Xall[ntrain:, :]
ytest = yall[ntrain:]

# preprocess data
for i in xrange(p):
    X[:, i] -= np.mean(X[:, i])
    X[:, i] /= np.linalg.norm(X[:, i])
y -= np.mean(y)

# train LASSO (use a separate variable name so the class is not shadowed)
lars = LeastAngleRegression()
lars.set_labels(RegressionLabels(y))
lars.train(RealFeatures(X.T))

# train ordinary LSR
if use_ridge:
    lsr = LinearRidgeRegression(0.01, RealFeatures(X.T), RegressionLabels(y))
    lsr.train()
else:
    lsr = LeastSquaresRegression()
    lsr.set_labels(RegressionLabels(y))
    lsr.train(RealFeatures(X.T))

# gather LASSO path
path = np.zeros((p, lars.get_path_size()))
for i in xrange(path.shape[1]):
    path[:, i] = lars.get_w(i)
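To inspect the path gathered above, a minimal matplotlib sketch (matplotlib is an assumption; the snippet itself does not plot):

import matplotlib.pyplot as plt

# one curve per coefficient, across the LARS iterations collected in 'path'
for j in xrange(p):
    plt.plot(range(path.shape[1]), path[j, :])
plt.xlabel('LARS iteration')
plt.ylabel('coefficient value')
plt.title('LASSO path')
plt.show()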
Example 4
def features_dense_protocols_modular(in_data=data):
    m_real = array(in_data, dtype=float64, order='F')
    f_real = RealFeatures(m_real)

    #print m_real
    #print f_real

    #print f_real[-1]
    #print f_real[1, 2]
    #print f_real[-1:3]
    #print f_real[2, 0:2]
    #print f_real[0:3, 1]
    #print f_real[0:3, 1:2]
    #print f_real[:,1]
    #print f_real[1,:]

    #print m_real[-2]
    f_real[-1] = m_real[-2]
    #print f_real[-1]

    #print m_real[0, 1]
    f_real[1, 2] = m_real[0, 1]
    #print f_real[1, 2]

    #print m_real[0:2]
    f_real[1:3] = m_real[0:2]
    #print f_real[1:3]

    #print m_real[0, 0:2]
    f_real[2, 0:2] = m_real[0, 0:2]
    #print f_real[2, 0:2]

    #print m_real[0:3, 2]
    f_real[0:3, 1] = m_real[0:3, 2]
    #print f_real[0:3, 1]

    #print m_real[0:3, 0:1]
    f_real[0:3, 1:2] = m_real[0:3, 0:1]
    #print f_real[0:3, 1:2]

    f_real[:, 0] = 0
    #print f_real.get_feature_matrix()

    # compare parsed versions; plain string comparison misorders e.g. '1.10' < '1.5'
    from distutils.version import LooseVersion
    if LooseVersion(numpy.__version__) >= LooseVersion('1.5'):
        f_real += m_real
        f_real *= m_real
        f_real -= m_real
    else:
        print("numpy version >= 1.5 is needed")
        return None

    f_real += f_real
    f_real *= f_real
    f_real -= f_real

    #print f_real
    #print f_real.get_feature_matrix()

    try:
        mem_real = memoryview(f_real)
    except NameError:
        print("Python2.7 and later is needed for memoryview class")
        return

    ret_real = array(f_real)
    #print ret_real

    return f_real[:, 0]
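An illustrative call, assuming the surrounding imports (numpy and RealFeatures) are in scope; the 3x3 matrix below is made up so that every indexing protocol above can be exercised:

from numpy import array, float64

data = array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=float64)
first_column = features_dense_protocols_modular(data)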
Example 5
def get_realfeatures(pos, neg):
    arr = array((pos, neg))
    features = concatenate(arr, axis=1)
    return RealFeatures(features)
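Illustrative usage with toy 2-D Gaussian blobs (the array names here are assumptions):

from numpy.random import randn

pos = randn(2, 100) + 1   # positive examples, one per column
neg = randn(2, 100) - 1   # negative examples, one per column
feats = get_realfeatures(pos, neg)  # RealFeatures holding 200 two-dimensional vectors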
Example 6
def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features"""

    if kname == 'gauss' or kname == 'linear' or kname == 'poly':
        examples = numpy.array(examples)
        feats = RealFeatures(examples)
        
    elif kname == 'wd' or kname == 'localalign' or kname == 'localimprove':
        if seq_source == 'dna': 
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con) 
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source '"+seq_source+"' is invalid; select [dna|protein]\n")
            sys.exit(-1)

    elif kname == 'spec' or kname == 'cumspec':
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA) 
        elif seq_source == 'protein':    
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source '"+seq_source+"' is invalid; select [dna|protein]\n")
            sys.exit(-1)
       
        wf = StringUlongFeatures( feats.get_alphabet() )
        wf.obtain_from_char(feats, kparam['degree']-1, kparam['degree'], 0, kname=='cumspec')
        del feats

        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        #assert(ret)

        feats = wf
    elif kname == 'spec2' or kname == 'cumspec2':
        # spectrum kernel on two sequences
        feats = {}
        feats['combined'] = CombinedFeatures()

        # renamed from 'reversed' to avoid shadowing the builtin
        do_reverse = kname=='cumspec2'

        (ex0,ex1) = zip(*examples)

        f0 = StringCharFeatures(list(ex0), DNA)
        wf = StringWordFeatures(f0.get_alphabet())
        wf.obtain_from_char(f0, kparam['degree']-1, kparam['degree'], 0, do_reverse)
        del f0

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f0'] = wf

        f1 = StringCharFeatures(list(ex1), DNA)
        wf = StringWordFeatures( f1.get_alphabet() )
        wf.obtain_from_char(f1, kparam['degree']-1, kparam['degree'], 0, do_reverse)
        del f1

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f1'] = wf

    else:
        sys.stderr.write('Unknown kernel %s\n' % kname)
        sys.exit(-1)

    return (feats,preproc)
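A hypothetical invocation of the 'gauss' branch (the kernel-parameter dict and the trailing arguments are ignored on this path, so placeholder values suffice):

import numpy

examples = numpy.random.rand(2, 10)  # ten 2-D real-valued examples, one per column
feats, preproc = create_features('gauss', examples, {}, True, None, None, None)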
Example 7
# Number of classes
M = 3
# Number of samples of each class
N = 1000
# Dimension of the data
dim = 2

X, y = gen_data()

cnt = 250

X2, y2 = fill_data(cnt, np.min(X), np.max(X))

labels = MulticlassSOLabels(y)
features = RealFeatures(X.T)

model = MulticlassModel(features, labels)
loss = HingeLoss()

lambda_ = 1e1
sosvm = DualLibQPBMSOSVM(model, loss, labels, lambda_)

sosvm.set_cleanAfter(10)	# number of iterations a cutting plane must be inactive before it is removed
sosvm.set_cleanICP(True)	# enables inactive cutting plane removal feature
sosvm.set_TolRel(0.001)		# set relative tolerance
sosvm.set_verbose(True)		# enables verbosity of the solver
sosvm.set_cp_models(16)		# set number of cutting plane models
sosvm.set_solver(BMRM)		# select training algorithm
#sosvm.set_solver(PPBMRM)
#sosvm.set_solver(P3BMRM)
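The snippet stops before the solver is run; a minimal continuation, assuming the usual Machine train/apply calls:

# train the bundle-method solver and label the training features
sosvm.train()
predicted = sosvm.apply()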
Example 8
try:
    swiss_roll_fig = fig.add_subplot(3, 3, 1, projection='3d')
    new_mpl = True
except Exception:
    # fall back for older matplotlib without the 3-D projection argument
    figure = plt.figure()
    swiss_roll_fig = Axes3D(figure)
    new_mpl = False

swiss_roll_fig.scatter(X[0], X[1], X[2], s=10, c=tt, cmap=plt.cm.Spectral)
plt.subplots_adjust(hspace=0.4)

from shogun.Features import RealFeatures

for (i, (converter, label)) in enumerate(converters):
    X = numpy.genfromtxt('../../../../data/toy/swissroll.dat', unpack=True).T
    features = RealFeatures(X)
    converter.set_target_dim(2)
    new_feats = converter.embed(features).get_feature_matrix()
    if not new_mpl:
        embedding_subplot = fig.add_subplot(4, 2, i + 1)
    else:
        embedding_subplot = fig.add_subplot(3, 3, i + 2)
    embedding_subplot.scatter(new_feats[0],
                              new_feats[1],
                              c=tt,
                              cmap=plt.cm.Spectral)
    plt.axis('tight')
    plt.xticks([]), plt.yticks([])
    plt.title(label)
    print converter.get_name(), 'done'
Example 9
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """

        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details) + '\n')
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """

        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details) + '\n')
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################
    # set up toy data and svm

    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                 axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    #print("unserializing SVM")
    svm2 = load(fn)

    #print("comparing objectives")

    svm2.train()

    #print("objective before serialization:", svm.get_objective())
    #print("objective after serialization:", svm2.get_objective())

    #print("comparing predictions")

    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in xrange(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001

    #print("all checks passed.")

    return True
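A sketch of how the function might be driven (the parameter values are arbitrary toy choices):

serialization_svmlight_modular(num=50, dist=1.5, width=2.1, C=1.0)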
Example 10
def statistics_hsic():
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import HSIC
    from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math
    from numpy import array, mean, var, pi

    # note that HSIC has to store full kernel matrices,
    # which upper-bounds the feasible sample size
    n = 250
    difference = 3
    angle = pi / 3

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute the median data distance to use as Gaussian kernel width;
    # 0.5*median_distance is the usual choice (factor two in the Gaussian kernel),
    # but Shogun parametrizes the kernel width differently,
    # so use 0.5*2*median_distance^2 = median_distance^2.
    # Use a subset of only 200 elements for this; the median is stable.
    subset = Math.randperm_vec(features_x.get_num_vectors())
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_y = median_distance**2
    print "median distance for Gaussian kernel on x:", sigma_x
    print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic = hsic.compute_statistic()
    print "HSIC:", statistic
    alpha = 0.05

    print "computing p-value using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_bootstrap_iterations(100)
    # bootstrapping allows usage of unbiased or biased statistic
    p_value = hsic.compute_p_value(statistic)
    thresh = hsic.compute_threshold(alpha)
    print "p_value:", p_value
    print "threshold for 0.05 alpha:", thresh
    print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value < alpha

    print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value = hsic.compute_p_value(statistic)
    thresh = hsic.compute_threshold(alpha)
    print "p_value:", p_value
    print "threshold for 0.05 alpha:", thresh
    print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value < alpha

    # sample from null distribution (these may be plotted, for instance)
    # mean should be close to zero; variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(100)
    null_samples = hsic.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
Example 11
def classifier_non_separable_svm(n=100, m=10, distance=5, seed=None):
    '''
	n is the number of examples per class and m is the number of examples per
	class that get their labels swapped to make the data non-linearly separable
	'''
    from shogun.Features import RealFeatures, BinaryLabels
    from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC

    # 2D data
    _DIM = 2

    # Dummy labels object (not used below)
    dummy = BinaryLabels()

    np.random.seed(seed)

    # Produce some (probably) linearly separable training data by hand
    # Two Gaussians at a far enough distance
    X = np.array(np.random.randn(_DIM, n)) + distance
    Y = np.array(np.random.randn(_DIM, n))
    # The last m points of each class get their labels swapped to make the data non-linearly separable
    label_train_twoclass = np.hstack(
        (np.ones(n - m), -np.ones(m), -np.ones(n - m), np.ones(m)))

    fm_train_real = np.hstack((X, Y))
    feats_train = RealFeatures(fm_train_real)
    labels = BinaryLabels(label_train_twoclass)

    # Train linear SVM
    C = 1
    epsilon = 1e-3
    svm = LibLinear(C, feats_train, labels)
    svm.set_liblinear_solver_type(L2R_L2LOSS_SVC)
    svm.set_epsilon(epsilon)
    svm.set_bias_enabled(True)
    svm.train()

    # Get hyperplane parameters
    b = svm.get_bias()
    w = svm.get_w()

    # Find limits for visualization
    x_min = min(np.min(X[0, :]), np.min(Y[0, :]))
    x_max = max(np.max(X[0, :]), np.max(Y[0, :]))

    y_min = min(np.min(X[1, :]), np.min(Y[1, :]))
    y_max = max(np.max(X[1, :]), np.max(Y[1, :]))

    # decision boundary: w[0]*x + w[1]*y + b = 0  =>  y = -(w[0]*x + b) / w[1]
    hx = np.linspace(x_min - 1, x_max + 1)
    plt.plot(hx, -1 / w[1] * (w[0] * hx + b), 'k', linewidth=2.0)

    # Plot the two-class data
    pos_idxs = label_train_twoclass == +1
    plt.scatter(fm_train_real[0, pos_idxs],
                fm_train_real[1, pos_idxs],
                s=40,
                marker='o',
                facecolors='none',
                edgecolors='b')

    neg_idxs = label_train_twoclass == -1
    plt.scatter(fm_train_real[0, neg_idxs],
                fm_train_real[1, neg_idxs],
                s=40,
                marker='s',
                facecolors='none',
                edgecolors='r')

    # Customize the plot
    plt.axis([x_min - 1, x_max + 1, y_min - 1, y_max + 1])
    plt.title('SVM with non-linearly separable data')
    plt.xlabel('x')
    plt.ylabel('y')

    plt.show()

    return svm
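Illustrative call (np and plt are assumed to be module-level imports, as in the function body):

svm = classifier_non_separable_svm(n=100, m=10, distance=5, seed=42)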
Example 12
def nonabstract_class(name):
    try:
        getattr(Classifier, name)()
    except TypeError:
        return False
    return True


import re
encoders = [
    x for x in dir(Classifier)
    if re.match(r'ECOC.+Encoder', x) and nonabstract_class(x)
]
decoders = [
    x for x in dir(Classifier)
    if re.match(r'ECOC.+Decoder', x) and nonabstract_class(x)
]

fea_train = RealFeatures(traindat)
fea_test = RealFeatures(testdat)
gnd_train = MulticlassLabels(label_traindat)
if label_testdat is None:
    gnd_test = None
else:
    gnd_test = MulticlassLabels(label_testdat)

base_classifier = LibLinear(L2R_L2LOSS_SVC)
base_classifier.set_bias_enabled(True)

print('Testing with %d encoders and %d decoders' %
      (len(encoders), len(decoders)))
print('-' * 70)
format_str = '%%15s + %%-10s  %%-10%s %%-10%s %%-10%s'
print((format_str % ('s', 's', 's')) %
      ('encoder', 'decoder', 'codelen', 'time', 'accuracy'))
Example 13
x_pos, y_pos = np.random.multivariate_normal(mean_pos, cov_pos, 500).T
plot(x_pos, y_pos, 'bo')

# negative examples
mean_neg = [0, -3]
cov_neg = [[100, 50], [20, 3]]  # note: not symmetric/PSD; numpy will warn

x_neg, y_neg = np.random.multivariate_normal(mean_neg, cov_neg, 500).T
plot(x_neg, y_neg, 'ro')

# train qda
labels = MulticlassLabels(np.concatenate([np.zeros(N), np.ones(N)]))  # N = samples per class (500 above)
pos = np.array([x_pos, y_pos])
neg = np.array([x_neg, y_neg])
features = RealFeatures(np.array(np.concatenate([pos, neg], 1)))

qda = QDA()
qda.set_labels(labels)
qda.train(features)

# compute output plot iso-lines
xs = np.array(np.concatenate([x_pos, x_neg]))
ys = np.array(np.concatenate([y_pos, y_neg]))

x1_max = max(1.2 * xs)
x1_min = min(1.2 * xs)
x2_max = max(1.2 * ys)
x2_min = min(1.2 * ys)

x1 = np.linspace(x1_min, x1_max, size)
Example 14
def classifier_non_separable_svm(n=100, distance=5, seed=None, C1=1, C2=100):
    '''
	n is the number of examples per class; a couple of positive examples are
	moved towards the negative class to make the data non-linearly separable

	C1 and C2 are the two regularization values used
	'''
    from shogun.Features import RealFeatures, BinaryLabels

    # 2D data
    _DIM = 2

    # Dummy labels object (not used below)
    dummy = BinaryLabels()

    np.random.seed(seed)

    # Produce some (probably) linearly separable training data by hand
    # Two Gaussians at a far enough distance
    X = np.array(2 * np.random.randn(_DIM, n)) + distance
    Y = np.array(1.5 * np.random.randn(_DIM, n)) - distance
    label_train_twoclass = np.hstack((np.ones(n), -np.ones(n)))
    # Move a couple of positive points towards the negative examples
    X[:, -3:-1] = np.random.randn(_DIM, 2) - 0.5 * distance / 2

    fm_train_real = np.hstack((X, Y))
    # add a feature with all ones to implicitly learn a bias
    fm_train_real = np.vstack((fm_train_real, np.ones(2 * n)))
    feats_train = RealFeatures(fm_train_real)
    labels = BinaryLabels(label_train_twoclass)

    # Find limits for visualization
    x_min = min(np.min(X[0, :]), np.min(Y[0, :]))
    x_max = max(np.max(X[0, :]), np.max(Y[0, :]))

    # Train first SVM and plot its hyperplane
    svm1 = train_svm(feats_train, labels, C1)
    plot_hyperplane(svm1, x_min, x_max, 'g')

    # Train second SVM and plot its hyperplane
    svm2 = train_svm(feats_train, labels, C2)
    plot_hyperplane(svm2, x_min, x_max, 'y')

    # Plot the two-class data
    plt.scatter(X[0, :],
                X[1, :],
                s=40,
                marker='o',
                facecolors='none',
                edgecolors='b')
    plt.scatter(Y[0, :],
                Y[1, :],
                s=40,
                marker='s',
                facecolors='none',
                edgecolors='r')

    # Customize the plot

    y_min = min(np.min(X[1, :]), np.min(Y[1, :]))
    y_max = max(np.max(X[1, :]), np.max(Y[1, :]))

    plt.axis([x_min - 1, x_max + 1, y_min - 1, y_max + 1])
    plt.title('SVM trade-off')
    plt.xlabel('x')
    plt.ylabel('y')

    plt.show()

    return svm1, svm2
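train_svm and plot_hyperplane are not shown in the snippet; one plausible shape for them, inferred from how they are called (this is an assumption, not the original helpers):

from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC

def train_svm(feats_train, labels, C, epsilon=1e-3):
    # plain linear SVM; the bias is learned through the appended all-ones feature
    svm = LibLinear(C, feats_train, labels)
    svm.set_liblinear_solver_type(L2R_L2LOSS_SVC)
    svm.set_epsilon(epsilon)
    svm.train()
    return svm

def plot_hyperplane(svm, x_min, x_max, color):
    # with the all-ones feature, w[2] acts as the bias:
    # w[0]*x + w[1]*y + w[2] = 0  =>  y = -(w[0]*x + w[2]) / w[1]
    w = svm.get_w()
    hx = np.linspace(x_min - 1, x_max + 1)
    plt.plot(hx, -(w[0] * hx + w[2]) / w[1], color, linewidth=2.0)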
Example 15
def modelselection_grid_search_libsvr_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat,\
           width=2.1,C=1,epsilon=1e-5,tube_epsilon=1e-2):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import MeanSquaredError
    from shogun.Evaluation import CrossValidationSplitting
    from shogun.Features import RegressionLabels
    from shogun.Features import RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Regression import LibSVR
    from shogun.ModelSelection import GridSearchModelSelection
    from shogun.ModelSelection import ModelSelectionParameters, R_EXP
    from shogun.ModelSelection import ParameterCombination

    # training data
    features_train = RealFeatures(fm_train)
    labels = RegressionLabels(label_train)

    # kernel
    kernel = GaussianKernel(features_train, features_train, width)

    # print all parameters available for model selection
    # Don't worry if yours is not included; write to the mailing list
    #kernel.print_modsel_params()

    # predictor
    predictor = LibSVR(C, tube_epsilon, kernel, labels)
    predictor.set_epsilon(epsilon)

    # splitting strategy for 5-fold cross-validation (for classification it is
    # better to use StratifiedCrossValidationSplitting, but the standard
    # CrossValidationSplitting is also available)
    splitting_strategy = CrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MeanSquaredError()

    # cross-validation instance
    cross_validation = CrossValidation(predictor, features_train, labels,
                                       splitting_strategy,
                                       evaluation_criterium)

    # (optional) repeat x-val 10 times
    cross_validation.set_num_runs(10)

    # (optional) request 95% confidence intervals for results (not actually needed
    # for this toy example)
    cross_validation.set_conf_int_alpha(0.05)

    # print all parameters available for model selection
    # Don't worry if yours is not included; write to the mailing list
    #predictor.print_modsel_params()

    # build parameter tree to select C1 and C2
    param_tree_root = ModelSelectionParameters()
    c1 = ModelSelectionParameters("C1")
    param_tree_root.append_child(c1)
    c1.build_values(-2.0, 2.0, R_EXP)

    c2 = ModelSelectionParameters("C2")
    param_tree_root.append_child(c2)
    c2.build_values(-2.0, 2.0, R_EXP)

    # model selection instance
    model_selection = GridSearchModelSelection(param_tree_root,
                                               cross_validation)

    # perform model selection with selected methods
    #print "performing model selection of"
    #print "parameter tree"
    #param_tree_root.print_tree()

    #print "starting model selection"
    # print the current parameter combination, if no parameter nothing is printed
    print_state = False
    # lock data beforehand, since model selection will not change the kernel
    # matrix (use with care); this avoids recomputing the kernel matrix in
    # every iteration of the model search
    predictor.data_lock(labels, features_train)
    best_parameters = model_selection.select_model(print_state)

    # print best parameters
    #print "best parameters:"
    #best_parameters.print_tree()

    # apply them and print result
    best_parameters.apply_to_machine(predictor)
    result = cross_validation.evaluate()
Example 16
    pyplot.show()


from shogun.Features import RealFeatures, MulticlassLabels
from shogun.Metric import LMNN

# input features and labels
features = numpy.array([[0, 0], [-1, 0.1], [0.3, -0.05], [0.7, 0.3],
                        [-0.2, -0.6], [-0.15, -0.63], [-0.25, 0.55],
                        [-0.28, 0.67]])
labels = numpy.array([0, 0, 0, 0, 1, 1, 2, 2], dtype=numpy.float64)

plot_data(features, labels)

# wrap them into Shogun objects
features = RealFeatures(features.T)
labels = MulticlassLabels(labels)

# train LMNN
# number of target neighbours per example
k = 1
lmnn = LMNN(features, labels, k)
lmnn.set_maxiter(1000)
lmnn.set_correction(15)
lmnn.train()

L = lmnn.get_linear_transform()
M = numpy.matrix(numpy.dot(L.T, L))

plot_covariance_ellipse(M.I)
#plot_covariance_ellipse(numpy.eye(2))
Example 17
# parameters, change to get different results
m = 1000  # set to 10000 for a good test result
dim = 2

# setting the difference of the first dimension smaller makes a harder test
difference = 1

# number of samples taken from null and alternative distribution
num_null_samples = 500

# use data generator class to produce example data
data = DataGenerator.generate_mean_data(m, dim, difference)

# create shogun feature representation
features = RealFeatures(data)

# compute the median data distance to use as Gaussian kernel width;
# 0.5*median_distance is the usual choice (factor two in the Gaussian kernel),
# but Shogun parametrizes the kernel width differently,
# so use 0.5*2*median_distance^2 = median_distance^2.
# Use a subset of only 200 elements for this; the median is stable.
# Using all pairwise distances here would blow up memory.
subset = Math.randperm_vec(features.get_num_vectors())
subset = subset[0:200]
features.add_subset(subset)
dist = EuclideanDistance(features, features)
distances = dist.get_distance_matrix()
features.remove_subset()
median_distance = Statistics.matrix_median(distances, True)
sigma = median_distance**2
Example 18
def modelselection_grid_search_krr_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat,\
           width=2.1,C=1,epsilon=1e-5,tube_epsilon=1e-2):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import MeanSquaredError
    from shogun.Evaluation import CrossValidationSplitting
    from shogun.Features import RegressionLabels
    from shogun.Features import RealFeatures
    from shogun.Regression import KernelRidgeRegression
    from shogun.ModelSelection import GridSearchModelSelection
    from shogun.ModelSelection import ModelSelectionParameters

    # training data
    features_train = RealFeatures(fm_train)
    features_test = RealFeatures(fm_test)

    # labels
    labels = RegressionLabels(label_train)

    # predictor; the regularization parameter tau is chosen via model selection below
    predictor = KernelRidgeRegression()

    # splitting strategy for 5-fold cross-validation (for classification it is
    # better to use StratifiedCrossValidationSplitting, but the standard
    # CrossValidationSplitting is also available)
    splitting_strategy = CrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MeanSquaredError()

    # cross-validation instance
    cross_validation = CrossValidation(predictor, features_train, labels,
                                       splitting_strategy,
                                       evaluation_criterium)

    # (optional) repeat x-val 10 times
    cross_validation.set_num_runs(10)

    # (optional) request 95% confidence intervals for results (not actually needed
    # for this toy example)
    cross_validation.set_conf_int_alpha(0.05)

    # print all parameters available for model selection
    # Don't worry if yours is not included; write to the mailing list
    #predictor.print_modsel_params()

    # build parameter tree to select regularization parameter
    param_tree_root = create_param_tree()

    # model selection instance
    model_selection = GridSearchModelSelection(param_tree_root,
                                               cross_validation)

    # perform model selection with selected methods
    #print "performing model selection of"
    #print "parameter tree:"
    #param_tree_root.print_tree()

    #print "starting model selection"
    # print the current parameter combination, if no parameter nothing is printed
    print_state = False

    best_parameters = model_selection.select_model(print_state)

    # print best parameters
    #print "best parameters:"
    #best_parameters.print_tree()

    # apply them and print result
    best_parameters.apply_to_machine(predictor)
    result = cross_validation.evaluate()
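create_param_tree is not shown in the snippet; a plausible sketch that selects the ridge parameter on an exponential grid (the parameter name "tau" is an assumption):

from shogun.ModelSelection import ModelSelectionParameters, R_EXP

def create_param_tree():
    root = ModelSelectionParameters()
    tau = ModelSelectionParameters("tau")
    root.append_child(tau)
    tau.build_values(-1.0, 1.0, R_EXP)
    return root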
Example 19
def evaluation_cross_validation_multiclass_storage(
        traindat=traindat, label_traindat=label_traindat):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import CrossValidationPrintOutput
    from shogun.Evaluation import CrossValidationMKLStorage, CrossValidationMulticlassStorage
    from shogun.Evaluation import MulticlassAccuracy, F1Measure
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.Features import MulticlassLabels
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Classifier import MKLMulticlass
    from shogun.Mathematics import Statistics, MSG_DEBUG, Math

    Math.init_random(1)

    # training data, combined features all on same data
    features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels = MulticlassLabels(label_traindat)

    # kernel, different Gaussians combined
    kernel = CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create MKL instance using LibSVM; due to a memory bug, interleaved
    # optimization is not possible
    svm = MKLMulticlass(1.0, kernel, labels)
    svm.set_kernel(kernel)

    # splitting strategy for 3-fold stratified cross-validation (preferred for
    # classification; the plain CrossValidationSplitting is also available)
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 3)

    # evaluation method
    evaluation_criterium = MulticlassAccuracy()

    # cross-validation instance
    cross_validation = CrossValidation(svm, comb_features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross-validation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    #mkl_storage=CrossValidationMKLStorage()
    #cross_validation.add_cross_validation_output(mkl_storage)
    multiclass_storage = CrossValidationMulticlassStorage()
    multiclass_storage.append_binary_evaluation(F1Measure())
    cross_validation.add_cross_validation_output(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result = cross_validation.evaluate()

    roc_0_0_0 = multiclass_storage.get_fold_ROC(0, 0, 0)
    #print roc_0_0_0
    auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0, 0, 0, 0)
    #print auc_0_0_0
    return roc_0_0_0, auc_0_0_0
Example 20
real_gmm.set_nth_mean(array([1.0, 1.0]), 0)
real_gmm.set_nth_mean(array([-1.0, -1.0]), 1)

real_gmm.set_nth_cov(array([[1.0, 0.2],[0.2, 0.1]]), 0)
real_gmm.set_nth_cov(array([[0.3, 0.1],[0.1, 1.0]]), 1)

real_gmm.set_coef(array([0.3, 0.7]))

#generate training set from real GMM
generated=array([real_gmm.sample()])
for i in range(199):
    generated=append(generated, array([real_gmm.sample()]), axis=0)

generated=generated.transpose()
feat_train=RealFeatures(generated)

#train GMM using EM
est_gmm=GMM(2, cov_type)
est_gmm.train(feat_train)
est_gmm.train_em(min_cov, max_iter, min_change)

#get and print estimated means and covariances
est_mean1=est_gmm.get_nth_mean(0)
est_mean2=est_gmm.get_nth_mean(1)
est_cov1=est_gmm.get_nth_cov(0)
est_cov2=est_gmm.get_nth_cov(1)
est_coef=est_gmm.get_coef()
print est_mean1
print est_cov1
print est_mean2
print est_cov2
print est_coef
Example 21
def serialization_complex_example(num=5, dist=1, dim=10, C=2.0, width=10):
	import os
	from numpy import concatenate, zeros, ones
	from numpy.random import randn, seed
	from shogun.Features import RealFeatures, Labels
	from shogun.Classifier import GMNPSVM
	from shogun.Kernel import GaussianKernel
	from shogun.IO import SerializableHdf5File,SerializableAsciiFile, \
			SerializableJsonFile,SerializableXmlFile,MSG_DEBUG
	from shogun.Preprocessor import NormOne, LogPlusOne

	seed(17)

	data=concatenate((randn(dim, num), randn(dim, num) + dist,
					  randn(dim, num) + 2*dist,
					  randn(dim, num) + 3*dist), axis=1)
	lab=concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))

	feats=RealFeatures(data)
	#feats.io.set_loglevel(MSG_DEBUG)
	kernel=GaussianKernel(feats, feats, width)

	labels=Labels(lab)

	svm = GMNPSVM(C, kernel, labels)

	feats.add_preprocessor(NormOne())
	feats.add_preprocessor(LogPlusOne())
	feats.set_preprocessed(1)
	svm.train(feats)

	#svm.print_serializable()

	fstream = SerializableHdf5File("blaah.h5", "w")
	status = svm.save_serializable(fstream)
	check_status(status)

	fstream = SerializableAsciiFile("blaah.asc", "w")
	status = svm.save_serializable(fstream)
	check_status(status)

	fstream = SerializableJsonFile("blaah.json", "w")
	status = svm.save_serializable(fstream)
	check_status(status)

	fstream = SerializableXmlFile("blaah.xml", "w")
	status = svm.save_serializable(fstream)
	check_status(status)


	fstream = SerializableHdf5File("blaah.h5", "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status)
	new_svm.train()

	fstream = SerializableAsciiFile("blaah.asc", "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status)
	new_svm.train()

	#fstream = SerializableJsonFile("blaah.json", "r")
	#new_svm=GMNPSVM()
	#status = new_svm.load_serializable(fstream)
	#check_status(status)
	#new_svm.train()

	fstream = SerializableXmlFile("blaah.xml", "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status)
	new_svm.train()

	os.unlink("blaah.h5")
	os.unlink("blaah.asc")
	os.unlink("blaah.json")
	os.unlink("blaah.xml")
	return svm,new_svm
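Illustrative call with the default toy parameters:

svm, new_svm = serialization_complex_example(num=5, dist=1, dim=10, C=2.0, width=10)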
Example 22
def classifier_multiclass_ecoc(fm_train_real=traindat,
                               fm_test_real=testdat,
                               label_train_multiclass=label_traindat,
                               label_test_multiclass=label_testdat,
                               lawidth=2.1,
                               C=1,
                               epsilon=1e-5):

    import re
    import time
    import shogun.Classifier as Classifier
    from shogun.Classifier import ECOCStrategy, LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
    from shogun.Evaluation import MulticlassAccuracy
    from shogun.Features import RealFeatures, MulticlassLabels

    def nonabstract_class(name):
        try:
            getattr(Classifier, name)()
        except TypeError:
            return False
        return True

    encoders = [
        x for x in dir(Classifier)
        if re.match(r'ECOC.+Encoder', x) and nonabstract_class(x)
    ]
    decoders = [
        x for x in dir(Classifier)
        if re.match(r'ECOC.+Decoder', x) and nonabstract_class(x)
    ]

    fea_train = RealFeatures(fm_train_real)
    fea_test = RealFeatures(fm_test_real)
    gnd_train = MulticlassLabels(label_train_multiclass)
    if label_test_multiclass is None:
        gnd_test = None
    else:
        gnd_test = MulticlassLabels(label_test_multiclass)

    base_classifier = LibLinear(L2R_L2LOSS_SVC)
    base_classifier.set_bias_enabled(True)

    print('Testing with %d encoders and %d decoders' %
          (len(encoders), len(decoders)))
    print('-' * 70)
    format_str = '%%15s + %%-10s  %%-10%s %%-10%s %%-10%s'
    print((format_str % ('s', 's', 's')) %
          ('encoder', 'decoder', 'codelen', 'time', 'accuracy'))

    def run_ecoc(ier, idr):
        encoder = getattr(Classifier, encoders[ier])()
        decoder = getattr(Classifier, decoders[idr])()

        # whether encoder is data dependent
        if hasattr(encoder, 'set_labels'):
            encoder.set_labels(gnd_train)
            encoder.set_features(fea_train)

        strategy = ECOCStrategy(encoder, decoder)
        classifier = LinearMulticlassMachine(strategy, fea_train,
                                             base_classifier, gnd_train)
        classifier.train()
        label_pred = classifier.apply(fea_test)
        if gnd_test is not None:
            evaluator = MulticlassAccuracy()
            acc = evaluator.evaluate(label_pred, gnd_test)
        else:
            acc = None

        return (classifier.get_num_machines(), acc)

    for ier in range(len(encoders)):
        for idr in range(len(decoders)):
            t_begin = time.clock()
            (codelen, acc) = run_ecoc(ier, idr)
            if acc is None:
                acc_fmt = 's'
                acc = 'N/A'
            else:
                acc_fmt = '.4f'

            t_elapse = time.clock() - t_begin
            print((format_str % ('d', '.3f', acc_fmt)) %
                  (encoders[ier][4:-7], decoders[idr][4:-7], codelen, t_elapse,
                   acc))
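A hypothetical driver (the traindat/label_traindat defaults are module-level globals not shown here, so toy arrays are passed explicitly):

import numpy as np

X = np.random.randn(2, 60)
y = np.array([i % 3 for i in range(60)], dtype=np.float64)  # three classes
classifier_multiclass_ecoc(fm_train_real=X, fm_test_real=X,
                           label_train_multiclass=y, label_test_multiclass=y)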
Example 23
import numpy as np
import os
from shogun.Features import RealFeatures, MulticlassLabels
from shogun.Classifier import GaussianNaiveBayes

# Load the data (os.path.join avoids the missing-separator bug of plain
# string concatenation with dirname).
f = open(os.path.join(os.path.dirname(__file__), '../data/fisheriris.data'))
data = np.fromfile(f, dtype=np.float64, sep=' ')
data = data.reshape(-1, 4)
f.close()

f = open(os.path.join(os.path.dirname(__file__), '../data/fisheriris_label.csv'))
label = np.fromfile(f, dtype=np.float64, sep=' ')
f.close()

# Naive Bayes classifier.
feat = RealFeatures(data.T)
test = RealFeatures(data.T)
labels = MulticlassLabels(label)
nbc = GaussianNaiveBayes(feat, labels)
nbc.train()

# Predict labels.
results = nbc.apply(test).get_labels()

print results
Example 24
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
		save object to file using pickle
		
		@param filename: name of destination file
		@type filename: str
		@param myobj: object to save (has to be pickleable)
		@type myobj: obj
		"""

        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details) + '\n')
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
		Load from filename using pickle
		
		@param filename: name of file to load from
		@type filename: str
		"""

        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details) + '\n')
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################

    seed(17)
    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                 axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################

    #print("labels:")
    #print(pickle.dumps(labels))
    #
    #print("features")
    #print(pickle.dumps(feats_train))
    #
    #print("kernel")
    #print(pickle.dumps(kernel))
    #
    #print("svm")
    #print(pickle.dumps(svm))
    #
    #print("#################################")

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)

    save(fn, svm)

    #print("#################################")

    #print("unserializing SVM")
    svm2 = load(fn)

    #print("#################################")
    #print("comparing training")

    svm2.train()

    #print("objective before serialization:", svm.get_objective())
    #print("objective after serialization:", svm2.get_objective())
    return svm, svm.get_objective(), svm2, svm2.get_objective()
Example 25
def features_io_modular(fm_train_real, label_train_twoclass):
    import numpy
    from shogun.Features import SparseRealFeatures, RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Library import AsciiFile, BinaryFile, HDF5File

    feats = SparseRealFeatures(fm_train_real)
    feats2 = SparseRealFeatures()

    f = BinaryFile("fm_train_sparsereal.bin", "w")
    feats.save(f)

    f = AsciiFile("fm_train_sparsereal.ascii", "w")
    feats.save(f)

    f = BinaryFile("fm_train_sparsereal.bin")
    feats2.load(f)

    f = AsciiFile("fm_train_sparsereal.ascii")
    feats2.load(f)

    feats = RealFeatures(fm_train_real)
    feats2 = RealFeatures()

    f = BinaryFile("fm_train_real.bin", "w")
    feats.save(f)

    f = HDF5File("fm_train_real.h5", "w", "/data/doubles")
    feats.save(f)

    f = AsciiFile("fm_train_real.ascii", "w")
    feats.save(f)

    f = BinaryFile("fm_train_real.bin")
    feats2.load(f)
    #print "diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))

    f = AsciiFile("fm_train_real.ascii")
    feats2.load(f)
    #print "diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))

    lab = Labels(numpy.array([1.0, 2.0, 3.0]))
    lab2 = Labels()
    f = AsciiFile("label_train_twoclass.ascii", "w")
    lab.save(f)

    f = BinaryFile("label_train_twoclass.bin", "w")
    lab.save(f)

    f = HDF5File("label_train_real.h5", "w", "/data/labels")
    lab.save(f)

    f = AsciiFile("label_train_twoclass.ascii")
    lab2.load(f)

    f = BinaryFile("label_train_twoclass.bin")
    lab2.load(f)

    f = HDF5File("fm_train_real.h5", "r", "/data/doubles")
    feats2.load(f)
    #print feats2.get_feature_matrix()
    f = HDF5File("label_train_real.h5", "r", "/data/labels")
    lab2.load(f)
    #print lab2.get_labels()

    #clean up
    import os
    for f in [
            'fm_train_sparsereal.bin', 'fm_train_sparsereal.ascii',
            'fm_train_real.bin', 'fm_train_real.h5', 'fm_train_real.ascii',
            'label_train_real.h5', 'label_train_twoclass.ascii',
            'label_train_twoclass.bin'
    ]:
        os.unlink(f)
    return feats, feats2, lab, lab2
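Illustrative call with a random dense matrix (note that the label argument is not actually used by the body, which writes a fixed three-element label vector instead):

import numpy

fm = numpy.random.rand(3, 5)
feats, feats2, lab, lab2 = features_io_modular(fm, numpy.array([1.0, -1.0, 1.0]))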
Example 26
scale = 2.1
size_cache = 10

C = 0.017
epsilon = 1e-5
tube_epsilon = 1e-2
svm = LibSVM()
svm.set_C(C, C)
svm.set_epsilon(epsilon)
svm.set_tube_epsilon(tube_epsilon)

for i in xrange(3):
    data_train = random.rand(num_feats, num_vec)
    data_test = random.rand(num_feats, num_vec)
    feats_train = RealFeatures(data_train)
    feats_test = RealFeatures(data_test)
    labels = Labels(random.rand(num_vec).round() * 2 - 1)

    svm.set_kernel(LinearKernel(size_cache, scale))
    svm.set_labels(labels)

    kernel = svm.get_kernel()
    print "kernel cache size: %s" % (kernel.get_cache_size())

    kernel.init(feats_test, feats_test)
    svm.train()

    kernel.init(feats_train, feats_test)
    print svm.apply().get_labels()
Example 27
def statistics_quadratic_time_mmd():
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import QuadraticTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math
    from numpy import mean, var

    # note that the quadratic-time MMD has to store full kernel matrices,
    # which upper-bounds the feasible sample size
    n = 500
    dim = 2
    difference = 0.5

    # use data generator class to produce example data
    data = DataGenerator.generate_mean_data(n, dim, difference)

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2 * n + 1].T)

    # create shogun feature representation
    features = RealFeatures(data)

    # compute the median data distance to use as Gaussian kernel width;
    # 0.5*median_distance is the usual choice (factor two in the Gaussian kernel),
    # but Shogun parametrizes the kernel width differently,
    # so use 0.5*2*median_distance^2 = median_distance^2.
    # Use a subset of only 200 elements for this; the median is stable.
    subset = Math.randperm_vec(features.get_num_vectors())
    subset = subset[0:200]
    features.add_subset(subset)
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()
    features.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    mmd = QuadraticTimeMMD(kernel, features, n)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic = mmd.compute_statistic()
    alpha = 0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    mmd.set_bootstrap_iterations(10)
    # bootstrapping allows usage of unbiased or biased statistic
    mmd.set_statistic_type(UNBIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    # only can do this if SHOGUN was compiled with LAPACK so check
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        print "computing p-value using spectrum method"
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        # normally, at least 250 iterations should be done, but that takes long
        mmd.set_num_samples_sepctrum(50)  # (sic: the method name is misspelled in this Shogun API)
        mmd.set_num_eigenvalues_spectrum(n - 10)
        # spectrum method computes p-value for biased statistics only
        mmd.set_statistic_type(BIASED)
        p_value = mmd.compute_p_value(statistic)
        print "p_value:", p_value
        print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    print "computing p-value using gamma method"
    mmd.set_null_approximation_method(MMD2_GAMMA)
    # gamma method computes p-value for biased statistics only
    mmd.set_statistic_type(BIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    # sample from null distribution (these may be plotted, for instance)
    # mean should be close to zero; variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(10)
    null_samples = mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)

    # sample from null distribution (these may be plotted, for instance)
    # mean should be close to zero; variance strongly depends on data/kernel
    # spectrum, biased statistic
    print "sampling null distribution using spectrum method"
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_statistic_type(BIASED)
    # 50 samples using 10 eigenvalues
    null_samples = mmd.sample_null_spectrum(50, 10)
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
Example 28
def hsic_graphical():
    # parameters, change to get different results
    m = 250
    difference = 3

    # setting the angle lower makes a harder test
    angle = pi / 30

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute the median data distance to use as Gaussian kernel width;
    # 0.5*median_distance is the usual choice (factor two in the Gaussian kernel),
    # but Shogun parametrizes the kernel width differently,
    # so use 0.5*2*median_distance^2 = median_distance^2.
    # Use a subset of only 200 elements for this; the median is stable.
    subset = int32(array([x for x in range(features_x.get_num_vectors())
                          ]))  # numpy
    subset = random.permutation(subset)  # numpy permutation
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_y = median_distance**2
    print "median distance for Gaussian kernel on x:", sigma_x
    print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    # create HSIC instance. Note that this is a convenience constructor which
    # copies feature data, so features_x and features_y are not the objects
    # used inside hsic. This is only for user-friendliness; usually it is fine.
    # Below, the alternative distribution is sampled, which means that new
    # feature objects have to be created in each iteration (slow).
    # Normally, however, the alternative distribution is not sampled.
    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # sample alternative distribution
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)
        features_x.set_feature_matrix(array([data[0]]))
        features_y.set_feature_matrix(array([data[1]]))

        # re-create the hsic instance every time, since the convenience
        # constructor copies the feature objects
        hsic = HSIC(kernel_x, kernel_y, features_x, features_y)
        alt_samples[i] = hsic.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = hsic.bootstrap_null()

    # fit gamma distribution, biased statistic
    hsic.set_null_approximation_method(HSIC_GAMMA)
    gamma_params = hsic.fit_null_gamma()
    # sample gamma with parameters
    null_samples_gamma = array([
        gamma(gamma_params[0], gamma_params[1])
        for _ in range(num_null_samples)
    ])

    # plot
    figure()

    # plot data x and y
    subplot(2, 2, 1)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of x-ticks
    grid(True)
    plot(data[0], data[1], 'o')
    title('Data, rotation=$\pi$/' + str(1 / angle * pi) + '\nm=' + str(m))
    xlabel('$x$')
    ylabel('$y$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    thresh_boot = null_samples_boot[int(floor(
        len(null_samples_boot) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[int(floor(
        len(null_samples_gamma) * (1 - alpha)))]

    # type I error = fraction of null samples above the (1-alpha) threshold
    type_one_error_boot = sum(
        null_samples_boot > thresh_boot) / float(num_null_samples)
    type_one_error_gamma = sum(
        null_samples_gamma > thresh_gamma) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 2, 2)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    grid(True)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [
        min([min(null_samples_boot),
             min(null_samples_gamma)]),
        max([max(null_samples_boot),
             max(null_samples_gamma)])
    ]

    # plot null distribution with threshold
    subplot(2, 2, 3)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    grid(True)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' +
          str(type_one_error_boot))

    # plot null distribution gamma
    subplot(2, 2, 4)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    grid(True)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))
    grid(True)

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
Example 29
# parameters, change to get different results
m=250
difference=3

# setting the angle lower makes a harder test
angle=pi/30

# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

# create shogun feature representation
features_x=RealFeatures(array([data[0]]))
features_y=RealFeatures(array([data[1]]))

# compute the median data distance to use as Gaussian kernel width;
# 0.5*median_distance is the usual choice (factor two in the Gaussian kernel),
# but Shogun parametrizes the kernel width differently,
# so use 0.5*2*median_distance^2 = median_distance^2.
# Use a subset of only 200 elements for this; the median is stable.
subset=Math.randperm_vec(features_x.get_num_vectors())
subset=subset[0:200]
features_x.add_subset(subset)
dist=EuclideanDistance(features_x, features_x)
distances=dist.get_distance_matrix()
features_x.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma_x=median_distance**2
Example 30
def modelselection_grid_search_liblinear_modular(traindat=traindat,
                                                 label_traindat=label_traindat
                                                 ):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import ContingencyTableEvaluation, ACCURACY
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.ModelSelection import GridSearchModelSelection
    from shogun.ModelSelection import ModelSelectionParameters, R_EXP
    from shogun.ModelSelection import ParameterCombination
    from shogun.Features import BinaryLabels
    from shogun.Features import RealFeatures
    from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC

    # build parameter tree to select C1 and C2
    param_tree_root = ModelSelectionParameters()
    c1 = ModelSelectionParameters("C1")
    param_tree_root.append_child(c1)
    c1.build_values(-2.0, 2.0, R_EXP)

    c2 = ModelSelectionParameters("C2")
    param_tree_root.append_child(c2)
    c2.build_values(-2.0, 2.0, R_EXP)

    # training data
    features = RealFeatures(traindat)
    labels = BinaryLabels(label_traindat)

    # classifier
    classifier = LibLinear(L2R_L2LOSS_SVC)

    # print all parameters available for model selection
    # Don't worry if yours is not included; write to the mailing list
    #classifier.print_modsel_params()

    # splitting strategy for cross-validation
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 10)

    # evaluation method
    evaluation_criterium = ContingencyTableEvaluation(ACCURACY)

    # cross-validation instance
    cross_validation = CrossValidation(classifier, features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)
    cross_validation.set_autolock(False)

    # model selection instance
    model_selection = GridSearchModelSelection(param_tree_root,
                                               cross_validation)

    # perform model selection with selected methods
    #print "performing model selection of"
    #param_tree_root.print_tree()
    best_parameters = model_selection.select_model()

    # print best parameters
    #print "best parameters:"
    #best_parameters.print_tree()

    # apply them and print result
    best_parameters.apply_to_machine(classifier)
    result = cross_validation.evaluate()
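A hypothetical driver (the traindat/label_traindat defaults are globals not shown; toy two-class data assumed):

import numpy as np

traindat = np.random.randn(2, 40)
label_traindat = np.concatenate((-np.ones(20), np.ones(20)))
modelselection_grid_search_liblinear_modular(traindat, label_traindat)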