def metric_lmnn_statistics(k=3, fname_features='../../data/fm_train_multiclass_digits.dat.gz', fname_labels='../../data/label_train_multiclass_digits.dat'):
	try:
		from shogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
		import matplotlib.pyplot as pyplot
	except ImportError:
		print('Error importing shogun or other required modules. Please verify their installation.')
		return

	features = RealFeatures(load_compressed_features(fname_features).T)
	labels = MulticlassLabels(CSVFile(fname_labels))

#	print('number of examples = %d' % features.get_num_vectors())
#	print('number of features = %d' % features.get_num_features())

	assert features.get_num_vectors() == labels.get_num_labels()

	# train LMNN
	lmnn = LMNN(features, labels, k)
	lmnn.set_correction(100)
#	lmnn.io.set_loglevel(MSG_DEBUG)
	print('Training LMNN, this will take about two minutes...')
	lmnn.train()
	print('Training done!')

	# plot objective obtained during training
	statistics = lmnn.get_statistics()

	pyplot.plot(statistics.obj.get())
	pyplot.grid(True)
	pyplot.xlabel('Iterations')
	pyplot.ylabel('LMNN objective')
	pyplot.title('LMNN objective during training for the multiclass digits data set')

	pyplot.show()
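# load_compressed_features (used above) is not defined in this snippet. A
# minimal sketch of such a helper, assuming the .gz file holds a plain
# whitespace-delimited matrix with one example per row (numpy.loadtxt
# decompresses .gz files transparently):
import numpy

def load_compressed_features(fname_features):
	# one example per row in the file, hence the .T above, since shogun's
	# RealFeatures expects one example per column
	return numpy.loadtxt(fname_features)

# The multitask examples below take module-level traindat, testdat and
# label_traindat as default arguments, so stand-ins must exist before those
# defs run. A hypothetical random-data setup (the original examples load
# shogun's toy data files instead):
from numpy import random

traindat = random.rand(10, 100)                      # features as columns
testdat = random.rand(10, 100)
label_traindat = 2.0 * (random.rand(100) > 0.5) - 1  # random +/-1 labels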
def transfer_multitask_logistic_regression(fm_train=traindat,
                                           fm_test=testdat,
                                           label_train=label_traindat):
    from shogun import BinaryLabels, RealFeatures, Task, TaskGroup
    from numpy import hstack
    try:
        from shogun import MultitaskLogisticRegression
    except ImportError:
        print("MultitaskLogisticRegression not available")
        exit()

    features = RealFeatures(hstack((fm_train, fm_train)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 2)
    task_two = Task(n_vectors // 2, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskLogisticRegression(0.1, features, labels, task_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply().get_labels()

    return out
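# Hypothetical invocation, using the random stand-in data defined above:
out = transfer_multitask_logistic_regression(traindat, testdat, label_traindat)
print("first five multitask LR outputs: %s" % out[:5])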
def transfer_multitask_leastsquares_regression(fm_train=traindat,fm_test=testdat,label_train=label_traindat):
	from shogun import RegressionLabels, RealFeatures, Task, TaskGroup
	try:
		from shogun import MultitaskLeastSquaresRegression
	except ImportError:
		print("MultitaskLeastSquaresRegression not available")
		exit(0)

	features = RealFeatures(fm_train)
	labels = RegressionLabels(label_train)

	n_vectors = features.get_num_vectors()
	task_one = Task(0,n_vectors//2)
	task_two = Task(n_vectors//2,n_vectors)
	task_group = TaskGroup()
	task_group.append_task(task_one)
	task_group.append_task(task_two)

	mtlsr = MultitaskLeastSquaresRegression(0.1,features,labels,task_group)
	mtlsr.set_regularization(1) # use regularization ratio
	mtlsr.set_tolerance(1e-2) # use 1e-2 tolerance
	mtlsr.train()
	mtlsr.set_current_task(0)
	out = mtlsr.apply_regression().get_labels()
	return out
def transfer_multitask_clustered_logistic_regression(fm_train=traindat,
                                                     fm_test=testdat,
                                                     label_train=label_traindat
                                                     ):
    from shogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG
    from numpy import hstack, sin, cos
    try:
        from shogun import MultitaskClusteredLogisticRegression
    except ImportError:
        print("MultitaskClusteredLogisticRegression not available")
        exit()

    features = RealFeatures(hstack((fm_train, sin(fm_train), cos(fm_train))))
    labels = BinaryLabels(hstack((label_train, label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 3)
    task_two = Task(n_vectors // 3, 2 * n_vectors // 3)
    task_three = Task(2 * n_vectors // 3, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)
    task_group.append_task(task_three)

    mtlr = MultitaskClusteredLogisticRegression(1.0, 100.0, features, labels,
                                                task_group, 2)
    #mtlr.io.set_loglevel(MSG_DEBUG)
    mtlr.set_tolerance(1e-3)  # use 1e-3 tolerance
    mtlr.set_max_iter(100)
    mtlr.train()
    mtlr.set_current_task(0)
    # print(mtlr.get_w())
    out = mtlr.apply_regression().get_labels()

    return out
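# A sketch of inspecting each task's predictions after training (hypothetical
# helper; set_current_task selects which task's model is applied, using the
# same apply_regression call as the snippet above):
def print_per_task_outputs(mtlr, num_tasks):
    for t in range(num_tasks):
        mtlr.set_current_task(t)
        predictions = mtlr.apply_regression().get_labels()
        print("task %d: first five outputs %s" % (t, predictions[:5]))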
def transfer_multitask_trace_logistic_regression(fm_train=traindat,fm_test=testdat,label_train=label_traindat):
	from shogun import BinaryLabels, RealFeatures, Task, TaskGroup
	from numpy import hstack
	try:
		from shogun import MultitaskTraceLogisticRegression
	except ImportError:
		print("MultitaskTraceLogisticRegression not available")
		exit(0)

	features = RealFeatures(hstack((fm_train,fm_train)))
	labels = BinaryLabels(hstack((label_train,label_train)))

	n_vectors = features.get_num_vectors()
	task_one = Task(0,n_vectors//2)
	task_two = Task(n_vectors//2,n_vectors)
	task_group = TaskGroup()
	task_group.append_task(task_one)
	task_group.append_task(task_two)

	mtlr = MultitaskTraceLogisticRegression(0.1,features,labels,task_group)
	mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
	mtlr.set_max_iter(10)
	mtlr.train()
	mtlr.set_current_task(0)
	out = mtlr.apply_regression().get_labels()

	return out
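# Stand-ins for this example's default arguments: here train/test/labels are
# file names (placeholders; the originals point at shogun's toy data files),
# and feattypes is a boolean vector marking nominal (True) versus continuous
# (False) attributes, per shogun's decision-tree convention:
from numpy import array

traindat = 'fm_train_real.dat'
testdat = 'fm_test_real.dat'
label_traindat = 'label_train_multiclass.dat'
feattypes = array([False] * 2)  # hypothetical: two continuous attributes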
def multiclass_c45classifiertree(train=traindat,
                                 test=testdat,
                                 labels=label_traindat,
                                 ft=feattypes):
    try:
        from shogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # divide train dataset into training and validation subsets in the ratio 2/3 to 1/3
    subset = int32(random.permutation(feats_train.get_num_vectors()))
    vsubset = subset[:subset.size // 3]    # validation third
    trsubset = subset[subset.size // 3:]   # training two thirds

    # C4.5 Tree formation using training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c = C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune tree using validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)

    c.prune_tree(feats_train, train_labels)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # Classify test data
    output = c.apply_multiclass(feats_test).get_labels()
    output_certainty = c.get_certainty_vector()

    return c, output, output_certainty
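# Hypothetical invocation with the placeholder file names defined above:
tree, predictions, certainty = multiclass_c45classifiertree(
    traindat, testdat, label_traindat, feattypes)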
    def train(self, images, labels):
        """
        Train eigenfaces
        """
        print "Train...",
        #copy labels
        self._labels = labels

        #transform the numpe vector to shogun structure
        features = RealFeatures(images)
        #PCA
        self.pca = PCA()
        #set dimension
        self.pca.set_target_dim(self._num_components)
        #compute PCA
        self.pca.init(features)

        for sampleIdx in range(features.get_num_vectors()):
            v = features.get_feature_vector(sampleIdx)
            p = self.pca.apply_to_feature_vector(v)
            self._projections.insert(sampleIdx, p)

        print "ok!"
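# Assumed module-level imports for the graphical example below, in the style
# of the original shogun demos (a sketch; adjust to your installation):
import numpy as np
from numpy import array, zeros, pi, int32, floor, random
from numpy.random import gamma
from pylab import (figure, subplot, gca, grid, plot, title, xlabel, ylabel,
	hist, axvline, subplots_adjust, show)
from matplotlib.ticker import MaxNLocator
from shogun import (RealFeatures, GaussianKernel, EuclideanDistance, HSIC,
	DataGenerator, PERMUTATION, HSIC_GAMMA)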
def hsic_graphical():
	# parameters, change to get different results
	m=250
	difference=3

	# setting the angle lower makes a harder test
	angle=pi/30

	# number of samples taken from null and alternative distribution
	num_null_samples=500

	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shogun's kernel width uses a different parametrization,
	# therefore 0.5*2*median_distance^2.
	# Only a 200-element subset of the data is used; the median is stable.
	subset=int32(array([x for x in range(features_x.get_num_vectors())])) # numpy
	subset=random.permutation(subset) # numpy permutation
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=np.median(distances)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=np.median(distances)
	sigma_y=median_distance**2
	print "median distance for Gaussian kernel on x:", sigma_x
	print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)

	# create hsic instance. Note that this is a convenience constructor which
	# copies the feature data, so features_x and features_y are not the ones
	# used inside hsic. This is only for user-friendliness and is usually fine.
	# Below, the alternative distribution is sampled, which means that new
	# feature objects have to be created in each iteration (slow). Normally,
	# however, the alternative distribution is not sampled.
	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

	# sample alternative distribution
	alt_samples=zeros(num_null_samples)
	for i in range(len(alt_samples)):
		data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)
		features_x.set_feature_matrix(array([data[0]]))
		features_y.set_feature_matrix(array([data[1]]))

		# re-create the hsic instance every time, since the convenience
		# constructor copies the feature objects
		hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
		alt_samples[i]=hsic.compute_statistic()

	# sample from null distribution
	# permutation, biased statistic
	hsic.set_null_approximation_method(PERMUTATION)
	hsic.set_num_null_samples(num_null_samples)
	null_samples_boot=hsic.sample_null()

	# fit gamma distribution, biased statistic
	hsic.set_null_approximation_method(HSIC_GAMMA)
	gamma_params=hsic.fit_null_gamma()
	# sample gamma with parameters
	null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)])

	# plot
	figure()

	# plot data x and y
	subplot(2,2,1)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of y-ticks
	grid(True)
	plot(data[0], data[1], 'o')
	title('Data, rotation=$\pi$/'+str(round(pi/angle))+'\nm='+str(m))
	xlabel('$x$')
	ylabel('$y$')

	# compute threshold for test level
	alpha=0.05
	null_samples_boot.sort()
	null_samples_gamma.sort()
	thresh_boot=null_samples_boot[int(floor(len(null_samples_boot)*(1-alpha)))]
	thresh_gamma=null_samples_gamma[int(floor(len(null_samples_gamma)*(1-alpha)))]

	# empirical type I error: fraction of null samples above the rejection
	# threshold (should be close to alpha)
	type_one_error_boot=sum(null_samples_boot>=thresh_boot)/float(num_null_samples)
	type_one_error_gamma=sum(null_samples_gamma>=thresh_gamma)/float(num_null_samples)

	# plot alternative distribution with threshold
	subplot(2,2,2)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
	grid(True)
	hist(alt_samples, 20, density=True)
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
	title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

	# compute range for all null distribution histograms
	hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)])]

	# plot null distribution with threshold
	subplot(2,2,3)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
	grid(True)
	hist(null_samples_boot, 20, range=hist_range, density=True)
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	title('Sampled Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

	# plot null distribution gamma
	subplot(2,2,4)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
	grid(True)
	hist(null_samples_gamma, 20, range=hist_range, density=True)
	axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
	title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))

	# pull plots a bit apart
	subplots_adjust(hspace=0.5)
	subplots_adjust(wspace=0.5)
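# Possible entry point, mirroring how the original graphical demos are run
# (assumes the module-level imports sketched above):
if __name__ == '__main__':
	hsic_graphical()
	show()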