def multiclass_c45classifiertree(train=traindat, test=testdat, labels=label_traindat, ft=feattypes): try: from shogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree from numpy import random, int32 except ImportError: print("Could not import Shogun and/or numpy modules") return # wrap features and labels into Shogun objects feats_train = RealFeatures(CSVFile(train)) feats_test = RealFeatures(CSVFile(test)) train_labels = MulticlassLabels(CSVFile(labels)) # divide train dataset into training and validation subsets in the ratio 2/3 to 1/3 subset = int32(random.permutation(feats_train.get_num_vectors())) vsubset = subset[1:int(subset.size / 3)] trsubset = subset[1 + int(subset.size / 3):subset.size] # C4.5 Tree formation using training subset train_labels.add_subset(trsubset) feats_train.add_subset(trsubset) c = C45ClassifierTree() c.set_labels(train_labels) c.set_feature_types(ft) c.train(feats_train) train_labels.remove_subset() feats_train.remove_subset() # prune tree using validation subset train_labels.add_subset(vsubset) feats_train.add_subset(vsubset) c.prune_tree(feats_train, train_labels) train_labels.remove_subset() feats_train.remove_subset() # Classify test data output = c.apply_multiclass(feats_test).get_labels() output_certainty = c.get_certainty_vector() return c, output, output_certainty
def stochasticgbmachine(train=traindat, train_labels=label_traindat, ft=feat_types): try: from shogun import RealFeatures, RegressionLabels, CSVFile, CARTree, StochasticGBMachine, SquaredLoss except ImportError: print("Could not import Shogun modules") return # wrap features and labels into Shogun objects feats = RealFeatures(CSVFile(train)) labels = RegressionLabels(CSVFile(train_labels)) # divide into training (90%) and test dataset (10%) p = np.random.permutation(labels.get_num_labels()) num = labels.get_num_labels() * 0.9 cart = CARTree() cart.set_feature_types(ft) cart.set_max_depth(1) loss = SquaredLoss() s = StochasticGBMachine(cart, loss, 500, 0.01, 0.6) # train feats.add_subset(np.int32(p[0:int(num)])) labels.add_subset(np.int32(p[0:int(num)])) s.set_labels(labels) s.train(feats) feats.remove_subset() labels.remove_subset() # apply feats.add_subset(np.int32(p[int(num):len(p)])) labels.add_subset(np.int32(p[int(num):len(p)])) output = s.apply_regression(feats) feats.remove_subset() labels.remove_subset() return s, output
def stochasticgbmachine(train=traindat,train_labels=label_traindat,ft=feat_types): try: from shogun import RealFeatures, RegressionLabels, CSVFile, CARTree, StochasticGBMachine, SquaredLoss except ImportError: print("Could not import Shogun modules") return # wrap features and labels into Shogun objects feats=RealFeatures(CSVFile(train)) labels=RegressionLabels(CSVFile(train_labels)) # divide into training (90%) and test dataset (10%) p=np.random.permutation(labels.get_num_labels()) num=labels.get_num_labels()*0.9 cart=CARTree() cart.set_feature_types(ft) cart.set_max_depth(1) loss=SquaredLoss() s=StochasticGBMachine(cart,loss,500,0.01,0.6) # train feats.add_subset(np.int32(p[0:int(num)])) labels.add_subset(np.int32(p[0:int(num)])) s.set_labels(labels) s.train(feats) feats.remove_subset() labels.remove_subset() # apply feats.add_subset(np.int32(p[int(num):len(p)])) labels.add_subset(np.int32(p[int(num):len(p)])) output=s.apply_regression(feats) feats.remove_subset() labels.remove_subset() return s,output
def hsic_graphical(): # parameters, change to get different results m=250 difference=3 # setting the angle lower makes a harder test angle=pi/30 # number of samples taken from null and alternative distribution num_null_samples=500 # use data generator class to produce example data data=DataGenerator.generate_sym_mix_gauss(m,difference,angle) # create shogun feature representation features_x=RealFeatures(array([data[0]])) features_y=RealFeatures(array([data[1]])) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable subset=int32(array([x for x in range(features_x.get_num_vectors())])) # numpy subset=random.permutation(subset) # numpy permutation subset=subset[0:200] features_x.add_subset(subset) dist=EuclideanDistance(features_x, features_x) distances=dist.get_distance_matrix() features_x.remove_subset() median_distance=np.median(distances) sigma_x=median_distance**2 features_y.add_subset(subset) dist=EuclideanDistance(features_y, features_y) distances=dist.get_distance_matrix() features_y.remove_subset() median_distance=np.median(distances) sigma_y=median_distance**2 print "median distance for Gaussian kernel on x:", sigma_x print "median distance for Gaussian kernel on y:", sigma_y kernel_x=GaussianKernel(10,sigma_x) kernel_y=GaussianKernel(10,sigma_y) # create hsic instance. Note that this is a convienience constructor which copies # feature data. features_x and features_y are not these used in hsic. # This is only for user-friendlyness. Usually, its ok to do this. # Below, the alternative distribution is sampled, which means # that new feature objects have to be created in each iteration (slow) # However, normally, the alternative distribution is not sampled hsic=HSIC(kernel_x,kernel_y,features_x,features_y) # sample alternative distribution alt_samples=zeros(num_null_samples) for i in range(len(alt_samples)): data=DataGenerator.generate_sym_mix_gauss(m,difference,angle) features_x.set_feature_matrix(array([data[0]])) features_y.set_feature_matrix(array([data[1]])) # re-create hsic instance everytime since feature objects are copied due to # useage of convienience constructor hsic=HSIC(kernel_x,kernel_y,features_x,features_y) alt_samples[i]=hsic.compute_statistic() # sample from null distribution # permutation, biased statistic hsic.set_null_approximation_method(PERMUTATION) hsic.set_num_null_samples(num_null_samples) null_samples_boot=hsic.sample_null() # fit gamma distribution, biased statistic hsic.set_null_approximation_method(HSIC_GAMMA) gamma_params=hsic.fit_null_gamma() # sample gamma with parameters null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)]) # plot figure() # plot data x and y subplot(2,2,1) gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks grid(True) plot(data[0], data[1], 'o') title('Data, rotation=$\pi$/'+str(1/angle*pi)+'\nm='+str(m)) xlabel('$x$') ylabel('$y$') # compute threshold for test level alpha=0.05 null_samples_boot.sort() null_samples_gamma.sort() thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))]; thresh_gamma=null_samples_gamma[floor(len(null_samples_gamma)*(1-alpha))]; type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples) type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_null_samples) # plot alternative distribution with threshold subplot(2,2,2) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(alt_samples, 20, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples) title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error)) # compute range for all null distribution histograms hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)])] # plot null distribution with threshold subplot(2,2,3) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(null_samples_boot, 20, range=hist_range, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') title('Sampled Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot)) # plot null distribution gamma subplot(2,2,4) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(null_samples_gamma, 20, range=hist_range, normed=True); axvline(thresh_gamma, 0, 1, linewidth=2, color='red') title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma)) grid(True) # pull plots a bit apart subplots_adjust(hspace=0.5) subplots_adjust(wspace=0.5)