def get_shogun_statistics(self):
    """Compute and print the quadratic-time MMD statistics of the two samples.

    ``self._x`` and ``self._y`` are each reshaped to a single-row matrix and
    wrapped as Shogun features.  A ``QuadraticTimeMMD`` object configured
    with a Gaussian kernel is stored on ``self._mmd``; the biased and the
    unbiased statistics are computed from it, printed, and the unbiased one
    is stored on ``self._statistic``.

    Returns:
        self, so calls can be chained.
    """
    # One row, one column per observation.
    sample_p = sg.RealFeatures(self._x.reshape(1, len(self._x)))
    sample_q = sg.RealFeatures(self._y.reshape(1, len(self._y)))

    # Gaussian kernel used for the test (first argument 10, width 1).
    width = 1
    gaussian = sg.GaussianKernel(10, width)

    mmd = sg.QuadraticTimeMMD()
    self._mmd = mmd
    mmd.set_kernel(gaussian)
    mmd.set_p(sample_p)
    mmd.set_q(sample_q)

    # Biased statistic first, then unbiased; the object is left configured
    # for the unbiased statistic afterwards.
    mmd.set_statistic_type(sg.ST_BIASED_FULL)
    biased = mmd.compute_statistic()
    mmd.set_statistic_type(sg.ST_UNBIASED_FULL)
    unbiased = mmd.compute_statistic()
    self._statistic = unbiased

    print("\nShogun tests statistics:")
    print(f"biased test statistic {len(self._x)} x MMD_b[X,Y]^2={biased:.2f}")
    print(f"unbiased test statistic {len(self._x)} x MMD_u[X,Y]^2={unbiased:.2f}")
    return self
def RunQDAShogun():
    """Train Shogun QDA (and optionally classify a test set), timing the run.

    ``self`` and ``options`` are not parameters; they are taken from the
    enclosing scope.  ``self.dataset[0]`` is read as a CSV training file
    whose last column holds the labels.  When exactly two dataset files are
    given, the second one is read as the test set and classified inside the
    timed section.

    Returns:
        ``totalTimer.ElapsedTime()`` on success, or ``-1`` when anything in
        loading, validation or training raises (this includes leftover,
        unrecognised entries in ``options``).
    """
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    try:
        # Load train and test dataset.
        trainData = np.genfromtxt(self.dataset[0], delimiter=',')
        trainFeat = shogun.RealFeatures(trainData[:, :-1].T)

        hasTestSet = len(self.dataset) == 2
        if hasTestSet:
            # The original stored this under a different name than the one
            # it read back, so every two-file run failed and returned -1.
            testData = np.genfromtxt(self.dataset[1], delimiter=',')
            testFeat = shogun.RealFeatures(testData.T)

        if len(options) > 0:
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        # Labels are the last column of the training set.
        labels = shogun.MulticlassLabels(
            trainData[:, (trainData.shape[1] - 1)])

        with totalTimer:
            model = shogun.QDA(trainFeat, labels)
            model.train()
            if hasTestSet:
                model.apply_multiclass(testFeat).get_labels()
    except Exception:
        # Benchmark convention used by the caller: a negative value signals
        # failure.
        return -1

    return totalTimer.ElapsedTime()
def _read_toy_data(request):
    """Parse the POSTed toy data set into Shogun objects and settings.

    Points labelled ``1`` are training points (their ``x`` becomes the
    feature, their ``y`` the regression label); points labelled ``-1``
    contribute only their ``x`` as inducing points.  Other labels are
    ignored.

    Args:
        request: object exposing a ``POST`` mapping with the keys
            ``point_set``, ``noise_level``, ``scale``, ``inf``,
            ``axis_domain`` and ``learn``.

    Returns:
        A pair ``(arguments, model_sel_error)`` where ``arguments`` is the
        tuple ``(feat_train, labels, noise_level, scale, kernel, domain,
        learn, feat_train_induc, inf)`` and ``model_sel_error`` is True when
        more than 100 training vectors are combined with ``learn == "ML2"``.
        ``feat_train_induc`` is ``None`` when there are no inducing points.

    Raises:
        Http404: if there is no training point.
        ValueError: ``"Argument Error"`` if ``learn`` is missing.
    """
    toy_data = json.loads(request.POST['point_set'])

    points = []
    points_induc = []
    for pt in toy_data:
        # Normalise once so both branches treat the label identically (the
        # original converted it for the first test only).
        label = int(pt['label'])
        if label == 1:
            points.append(pt)
        elif label == -1:
            points_induc.append(pt)

    y_set = []
    x_set = []
    for pt in points:
        y_set.append(float(pt["y"]))
        x_set.append(float(pt["x"]))
    x_set_induc = [float(pt["x"]) for pt in points_induc]

    noise_level = float(request.POST['noise_level'])
    scale = float(request.POST['scale'])
    inf = request.POST['inf']
    domain = json.loads(request.POST['axis_domain'])

    if not x_set:
        raise Http404

    # Single-row matrices: one column per point.
    feat_train = sg.RealFeatures(np.array([x_set], dtype=np.float64))
    labels = sg.RegressionLabels(np.array(y_set, dtype=np.float64))

    # Inducing points are optional.
    if x_set_induc:
        feat_train_induc = sg.RealFeatures(
            np.array([x_set_induc], dtype=np.float64))
    else:
        feat_train_induc = None

    kernel = get_kernel(request, feat_train)

    try:
        learn = request.POST["learn"]
    except KeyError:
        raise ValueError("Argument Error")

    model_sel_error = (int(feat_train.get_num_vectors()) > 100
                       and learn == "ML2")

    return (feat_train, labels, noise_level, scale, kernel, domain, learn,
            feat_train_induc, inf), model_sel_error
def visualise_distribution_test_statistic(self, alpha=0.05):
    """Plot the MMD statistic under the null and under the alternative.

    Null samples are taken from ``self._mmd.sample_null()``.  Alternative
    samples are generated by drawing fresh normal and Laplace data 500 times
    with the parameters stored on ``self`` and computing a quadratic-time
    MMD statistic (Gaussian kernel) for every draw.  Three histograms are
    shown: null, alternative, and both overlaid.  The ``(1 - alpha)``
    empirical quantile of the null samples is drawn as a red vertical line
    on the last panel.

    Args:
        alpha: level used to pick the null quantile (default 0.05).

    Returns:
        self, so calls can be chained.
    """
    num_samples = 500

    # we first sample null distribution
    null_samples = self._mmd.sample_null()

    # we then sample alternative distribution, generate new data for that
    alt_samples = np.zeros(num_samples)
    for i in range(num_samples):
        # NOTE(review): scipy's ``scale`` is a standard deviation, while the
        # attribute is named ``_sigma_squared`` - confirm which is intended.
        x = norm.rvs(size=self._n, loc=self._mu, scale=self._sigma_squared)
        y = laplace.rvs(size=self._n, loc=self._mu, scale=self._b)

        feat_p = sg.RealFeatures(np.reshape(x, (1, len(x))))
        feat_q = sg.RealFeatures(np.reshape(y, (1, len(y))))

        kernel_width = 1
        kernel = sg.GaussianKernel(10, kernel_width)

        mmd = sg.QuadraticTimeMMD()
        mmd.set_kernel(kernel)
        mmd.set_p(feat_p)
        mmd.set_q(feat_q)
        alt_samples[i] = mmd.compute_statistic()

    plt.figure(figsize=(18, 5))

    plt.subplot(131)
    plt.hist(null_samples, 50, color='blue')
    plt.title('Null distribution')

    plt.subplot(132)
    plt.title('Alternative distribution')
    plt.hist(alt_samples, 50, color='green')

    plt.subplot(133)
    plt.hist(null_samples, 50, color='blue')
    plt.hist(alt_samples, 50, color='green', alpha=0.5)
    plt.title('Null and alternative distriution')

    # find (1-alpha) element of null distribution; clamp so that alpha == 0
    # selects the largest sample instead of indexing past the end
    null_samples_sorted = np.sort(null_samples)
    quantile_idx = min(int(len(null_samples) * (1 - alpha)),
                       len(null_samples) - 1)
    quantile = null_samples_sorted[quantile_idx]

    # ymin/ymax are fractions of the axes height, so 0..1 spans the panel.
    # NOTE(review): the label is only visible if a legend is drawn; none is
    # requested here.
    plt.axvline(x=quantile, ymin=0, ymax=1, color='red',
                label=str(int(round((1 - alpha) * 100))) + '% quantile of null')
    plt.show()
    return self
def get_binary_features(request):
    """Build binary-classification features and labels from clicked points.

    ``request.POST['point_set']`` must be a JSON list of objects with the
    keys ``x``, ``y`` and ``label``.  Points whose label equals ``1`` form
    the positive class; every other point forms the negative class.

    Returns:
        ``(features, labels)``: a ``sg.RealFeatures`` built from a 2 x n
        matrix (positive points first, one point per column) and the
        matching ``sg.BinaryLabels`` of +1 / -1.

    Raises:
        ValueError: ``"cannot read click pts"`` if the point set cannot be
            read or parsed; ``"labels not enough"`` if either class has no
            point.
    """
    try:
        point_set_raw = json.loads(request.POST['point_set'])
    except Exception as err:
        raise ValueError("cannot read click pts") from err

    class_a_point_set = []
    class_b_point_set = []
    for point in point_set_raw:
        bucket = class_a_point_set if point['label'] == 1 else class_b_point_set
        bucket.append([point['x'], point['y']])

    # An empty class used to surface as an obscure NumPy shape error when the
    # two matrices were joined; report it explicitly instead.
    if not class_a_point_set or not class_b_point_set:
        raise ValueError("labels not enough")

    class_a = np.array(class_a_point_set, dtype=float).T
    class_b = np.array(class_b_point_set, dtype=float).T

    features = np.concatenate((class_a, class_b), axis=1)
    # The label vectors are 1-D, so they are joined along the default axis 0;
    # asking for axis=1 is an error for 1-D input.
    labels = np.concatenate(
        (np.ones(class_a.shape[1]), -np.ones(class_b.shape[1])))

    return sg.RealFeatures(features), sg.BinaryLabels(labels)
def get_multi_features(request):
    """Build multiclass features and labels from clicked points.

    ``request.POST['point_set']`` must be a JSON list of objects with the
    keys ``x``, ``y`` and ``label``; all three are converted with
    ``float``.

    Returns:
        ``(features, labels)``: ``sg.RealFeatures`` built from the 2 x n
        matrix ``[x, y]`` and ``sg.MulticlassLabels`` of the labels.

    Raises:
        ValueError: ``"cannot read click pts"`` if the point set cannot be
            read or parsed, ``"0-labels"`` if it is empty, and
            ``"1-class-labels"`` if only one distinct label is present.
    """
    try:
        point_set_raw = json.loads(request.POST['point_set'])
    except Exception as err:
        # Keep the original cause attached instead of hiding it.
        raise ValueError("cannot read click pts") from err

    x = []
    y = []
    labels = []
    for pt in point_set_raw:
        x.append(float(pt['x']))
        y.append(float(pt['y']))
        labels.append(float(pt['label']))

    n = len(set(labels))
    if n == 0:
        raise ValueError("0-labels")
    if n == 1:
        raise ValueError("1-class-labels")

    features = sg.RealFeatures(np.array([x, y]))
    labels = sg.MulticlassLabels(np.array(labels))
    return features, labels
def __init__(self, X, y, n_importance, prior_log_pdf, ridge=0.,
             num_shogun_threads=1):
    """Store the data and settings and create the Shogun building blocks.

    Args:
        X: data matrix; its transpose is handed to ``sg.RealFeatures`` and
            ``X.shape[1]`` is read as the dimension.
        y: labels, wrapped in ``sg.BinaryLabels``.
        n_importance: stored unchanged on the instance.
        prior_log_pdf: stored unchanged on the instance.
        ridge: stored unchanged on the instance (default 0.).
        num_shogun_threads: thread count passed to Shogun's parallel
            settings (default 1).
    """
    # Raw inputs and settings.
    self.X = X
    self.y = y
    self.n_importance = n_importance
    self.prior_log_pdf = prior_log_pdf
    self.ridge = ridge
    self.num_shogun_threads = num_shogun_threads

    # Apply the requested thread count through a Shogun object's
    # ``parallel`` handle.
    logger.debug("Using Shogun with %d threads" % self.num_shogun_threads)
    sg.ZeroMean().parallel.set_num_threads(self.num_shogun_threads)

    # Shogun views of the data.
    self.sg_labels = sg.BinaryLabels(self.y)
    self.sg_feats_train = sg.RealFeatures(self.X.T)

    # ARD starting point: one weight per input dimension.
    # NOTE(review): neither value below is used further in this method.
    D = X.shape[1]
    theta_start = np.ones(D)

    self.sg_mean = sg.ZeroMean()
    self.sg_likelihood = sg.LogitLikelihood()
def regress_dump(request):
    """Return one rescaled feature of a stored regression data set as JSON.

    The data set named by ``request.POST['data_set']`` is loaded from CSV,
    passed through ``sg.RescaleFeatures``, and the row selected by
    ``request.POST['feature']`` is paired with the regression labels.

    Returns:
        ``HttpResponse`` whose body is a JSON list of
        ``{'x': ..., 'y': ..., 'label': 0.0}`` objects.

    Raises:
        Http404: if the request is incomplete, the data cannot be loaded, or
            the feature name is not one of CRIM, DIS, INDUS, LSTAT.
    """
    # Row of the feature matrix that holds each supported feature.
    feature_rows = {'CRIM': 0, 'DIS': 7, 'INDUS': 2, 'LSTAT': 12}

    try:
        data_set = request.POST['data_set']
        feature = request.POST['feature']

        temp_feats = sg.RealFeatures(
            sg.CSVFile(REGRESS_DATA_DIR + REGRESS_DATA_SET[data_set]))
        labels = sg.RegressionLabels(
            sg.CSVFile(REGRESS_DATA_DIR + REGRESS_LABELS[data_set]))
        lab = labels.get_labels()

        # rescale to 0...1
        preproc = sg.RescaleFeatures()
        preproc.init(temp_feats)
        temp_feats.add_preprocessor(preproc)
        temp_feats.apply_preprocessor(True)
        mat = temp_feats.get_feature_matrix()

        # An unknown feature name raises KeyError here and becomes a 404;
        # previously it left ``feat`` unbound and crashed after the handler.
        feat = mat[feature_rows[feature]]
    except Exception:
        raise Http404

    toy_data = [{'x': feat[i], 'y': lab[i], 'label': float(0)}
                for i in range(len(feat))]
    return HttpResponse(json.dumps(toy_data))
def _train_clustering(point_set, distance_name, k):
    """Train a Shogun k-means model on 2-D points.

    Args:
        point_set: sequence of mappings with the keys ``x`` and ``y``.
        distance_name: ``"EuclideanDistance"``, ``"ManhattanMetric"`` or
            ``"JensenMetric"``.
        k: number of clusters passed to ``sg.KMeans``.

    Returns:
        The trained ``sg.KMeans`` object.

    Raises:
        TypeError: if ``distance_name`` is not one of the supported names.
    """
    supported = ("EuclideanDistance", "ManhattanMetric", "JensenMetric")
    if distance_name not in supported:
        raise TypeError("unsupported distance: %r" % (distance_name,))

    # 2 x n matrix, one point per column.  Labels are not needed for
    # clustering, so the points' ``label`` entry is no longer read.
    features = np.zeros((2, len(point_set)))
    for i, point in enumerate(point_set):
        features[0, i] = point['x']
        features[1, i] = point['y']

    train = sg.RealFeatures(features)
    distance = getattr(sg, distance_name)(train, train)

    kmeans = sg.KMeans(k, distance)
    kmeans.train()
    return kmeans
def shogunProcess(clustersNumber, dataLessTarget, datasetName, runinfo = None, initialClusters = None):
    """Run Shogun hierarchical clustering on a data frame and write a CSV.

    The output path comes from ``datasetOutFile``; when that file already
    exists the function prints ``"shogun skipped"`` and returns without
    doing anything.

    NOTE(review): ``result`` is never assigned in this function, so the
    final loop raises ``NameError`` unless a module-level ``result`` exists.
    By then ``outputFile`` has already been opened for writing, so later
    calls will take the "skipped" branch.
    NOTE(review): no ``train()`` call is visible before the merge distances
    and cluster pairs are requested - confirm whether that is intended.
    NOTE(review): ``initialClusters``, ``d`` and ``cp`` are not used.
    """
    import shogun
    outputFile = datasetOutFile(datasetName, SHOGUN_ALGO, runinfo=runinfo)
    # Do not redo work whose output file is already present.
    if os.path.exists(outputFile):
        print("shogun skipped")
        return
    # Transposed float64 matrix of the data frame values.
    train_features = shogun.RealFeatures(dataLessTarget.values.astype("float64").transpose())
    # distance metric over feature matrix - Euclidean distance
    distance = shogun.EuclideanDistance(train_features, train_features)
    hierarchical = shogun.Hierarchical(clustersNumber, distance)
    # TODO: the original author reports that this makes the Python process die.
    d = hierarchical.get_merge_distances()
    cp = hierarchical.get_cluster_pairs()
    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        # One CSV row per data-frame row: index and its entry in ``result``.
        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, result[index].item(0)])
def run_kmeans_base(self, nb_clusters, src_file, data_without_target,
                    dataset_name, run_number, config_function,
                    run_info=None, nb_iterations=None):
    """Run Shogun k-means and save the clustering and the centroids.

    Args:
        nb_clusters: number of clusters passed to ``shogun.KMeans``.
        src_file: not used by this method.
        data_without_target: data frame; its values, cast to float64 and
            transposed, are clustered.
        dataset_name: forwarded to ``self._prepare_files``.
        run_number: not used by this method.
        config_function: optional callable invoked with the ``KMeans``
            object before training (``None`` to skip).
        run_info: forwarded to ``self._prepare_files``.
        nb_iterations: optional maximum number of iterations.

    Returns:
        ``(output_file, {"centroids": centroids_file})``.
    """
    self._init()
    output_file, centroids_file = self._prepare_files(
        dataset_name, run_info, True)

    feature_matrix = data_without_target.values.astype("float64").transpose()
    train_features = shogun.RealFeatures(feature_matrix)

    # Euclidean distance between the training vectors.
    euclidean = shogun.EuclideanDistance(train_features, train_features)
    kmeans = shogun.KMeans(nb_clusters, euclidean)

    # Optional customisation hooks.
    if config_function is not None:
        config_function(kmeans)
    if nb_iterations is not None:
        kmeans.set_max_iter(nb_iterations)

    centers, result = Shogun._kmeans_process(kmeans)

    clustering = Shogun._clustering_to_list(data_without_target, result)
    ClusteringToolkit._save_clustering(clustering, output_file)

    centroids = Shogun._centroids_to_list(centers)
    ClusteringToolkit._save_centroids(centroids, centroids_file)

    return output_file, {"centroids": centroids_file}
def regression(request):
    """Fit the requested regression model and return its curve as JSON.

    The model named by ``request.POST['regression']`` is trained on the
    arguments produced by ``_read_data`` and evaluated on 100 evenly spaced
    points across the horizontal axis domain.

    Returns:
        ``HttpResponse`` whose body is a JSON list of ``{'x': ..., 'y': ...}``
        objects.

    Raises:
        Http404: on any failure, including an unknown regression tool.
    """
    try:
        domain = json.loads(request.POST['axis_domain'])
        X = np.linspace(domain['horizontal'][0], domain['horizontal'][1], 100)
        feat = sg.RealFeatures(np.array([X]))

        arguments = _read_data(request)

        tool = request.POST['regression']
        if tool == 'LeastSquaresRegression':
            ls = _train_ls(*arguments)
            y = _apply_ls(feat, ls)
        elif tool == 'LinearRidgeRegression':
            lrr = _train_lrr(*arguments)
            y = _apply_lrr(feat, lrr)
        elif tool == 'KernelRidgeRegression':
            krr, kernel, train = _train_krr(*arguments)
            y = _apply_krr(kernel, train, feat, krr)
        else:
            # Reject explicitly rather than failing later on an unbound ``y``.
            raise ValueError("unknown regression tool: %r" % (tool,))

        # ``range`` replaces the Python-2-only ``xrange``, which made this
        # view fail (and answer 404) on every request under Python 3.
        line_dot = [{'x': X[i], 'y': y[i]} for i in range(len(X))]
        return HttpResponse(json.dumps(line_dot))
    except Exception:
        raise Http404
def load_mult_data(self, x_train, z_train):
    """Store multi-dimensional training data in Shogun's representation.

    Args:
        x_train: 2-D array with one sample per row; ``x_train.shape[1]`` is
            stored as ``self.input_dim``.
        z_train: regression targets, wrapped in ``shogun.RegressionLabels``.

    Sets ``self.input_dim``, ``self.z_train`` and ``self.x_train``.
    """
    self.input_dim = x_train.shape[1]
    self.z_train = shogun.RegressionLabels(z_train)
    # The feature matrix needs one sample per column, i.e. the transpose of
    # x_train.  A plain reshape to (input_dim, n) keeps the flat element
    # order and therefore mixes values of different samples whenever
    # input_dim > 1.
    samples = np.array(x_train).reshape(len(x_train), self.input_dim)
    self.x_train = shogun.RealFeatures(samples.T)
def feature_prepare(self, X):
    """Build Shogun combined features, one sub-feature per kernel entry.

    ``self.kernel_dict`` maps a kernel type to a mapping whose values select
    columns of ``X``.  For every such selection the chosen columns of ``X``
    (cast to float64) are transposed and appended as a ``RealFeatures``
    object.

    Returns:
        The populated ``shogun.CombinedFeatures`` object.
    """
    combined = shogun.CombinedFeatures()
    data = X.astype(np.float64)
    for selections in self.kernel_dict.values():
        for columns in selections.values():
            block = shogun.RealFeatures(data[:, columns].T)
            combined.append_feature_obj(block)
    return combined
def shogun_mmd(X, Y, kernel_width, null_samples=1000, median_samples=1000,
               cache_size=32):
    '''
    Two-sample MMD test with a Gaussian kernel, computed by Shogun.

    Parameters
    ----------
    X : row-instance feature array
    Y : row-instance feature array
    kernel_width : float
        Width handed to ``sg.GaussianKernel`` (converted with ``float``).
    null_samples : int
        Number of samples to draw from the null distribution.
    median_samples : int
        Accepted but not used by this function.
    cache_size : int
        First argument handed to ``sg.GaussianKernel``.

    Returns
    -------
    p_val : float
        Fraction of null samples that are at least as large as the statistic.
    stat : float
        The test statistic.
    null_samples : array
        The samples drawn from the null distribution.
    '''
    import shogun as sg

    test = sg.QuadraticTimeMMD()

    # Shogun wants one instance per column, hence the transposes.
    features_p = sg.RealFeatures(X.T.astype(np.float64))
    test.set_p(features_p)
    features_q = sg.RealFeatures(Y.T.astype(np.float64))
    test.set_q(features_q)

    gaussian = sg.GaussianKernel(cache_size, float(kernel_width))
    test.set_kernel(gaussian)
    test.set_num_null_samples(null_samples)

    null_draws = test.sample_null()
    statistic = test.compute_statistic()
    p_value = np.mean(statistic <= null_draws)
    return p_value, statistic, null_draws
def dftoxz(self, dataframe, data_type):
    """Convert a data frame into Shogun features and (optionally) labels.

    Args:
        dataframe: object with an ``"x"`` column and, for training data, a
            ``"z_train"`` column.
        data_type: ``'train'`` to also build regression labels.

    Returns:
        ``(x, z)`` where ``x`` is a single-row ``shogun.RealFeatures`` built
        from column ``"x"`` and ``z`` is ``shogun.RegressionLabels`` built
        from column ``"z_train"`` for training data, otherwise ``None``.
    """
    inputs = np.array(dataframe["x"])
    x = shogun.RealFeatures(inputs.reshape(1, len(dataframe["x"])))

    if data_type != 'train':
        return x, None

    z = shogun.RegressionLabels(np.array(dataframe['z_train']))
    return x, z
def load_data(self, dataframe):
    '''
    Keep the training data frame and derive the Shogun objects from it.

    Sets ``self.train_dataframe``, ``self.z`` (the ``z_train`` column as a
    single-row ``shogun.RealFeatures``) and, through ``self.dftoxz``,
    ``self.x_train`` and ``self.z_train``.
    '''
    self.train_dataframe = dataframe

    # Targets as a 1 x n feature matrix.
    targets = np.array(self.train_dataframe['z_train'])
    n_targets = len(self.train_dataframe["z_train"])
    self.z = shogun.RealFeatures(targets.reshape(1, n_targets))

    converted = self.dftoxz(self.train_dataframe, 'train')
    self.x_train, self.z_train = converted
def shogunProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None,
                  initialClusters=None):
    """Run Shogun k-means on a data frame and write assignments and centres.

    Two CSV files are written, with paths obtained from ``datasetOutFile``:
    one row per data-frame row (index, assigned cluster) and one row per
    cluster centre.  When both files already exist the function prints
    ``"shogun skipped"`` and returns.

    Args:
        clustersNumber: number of clusters.
        dataLessTarget: data frame whose values are clustered.
        datasetName: forwarded to ``datasetOutFile``.
        runinfo: forwarded to ``datasetOutFile``.
        initialClusters: optional initial centres, one centre per row; when
            ``None`` the k-means++ flag is set instead.
    """
    import shogun

    outputFile = datasetOutFile(datasetName, SHOGUN_ALGO, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName, centroidFor(SHOGUN_ALGO),
                                        runinfo=runinfo)

    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("shogun skipped")
        return

    train_features = shogun.RealFeatures(
        dataLessTarget.values.astype("float64").transpose())

    # distance metric over feature matrix - Euclidean distance
    distance = shogun.EuclideanDistance(train_features, train_features)

    # KMeans object created
    kmeans = shogun.KMeans(clustersNumber, distance)

    if initialClusters is None:
        # set KMeans++ flag
        kmeans.set_use_kmeanspp(True)
    else:
        # set new initial centers
        kmeans.set_initial_centers(
            initialClusters.astype("float64").transpose())

    # KMeans training
    kmeans.train()

    # cluster centers
    centers = kmeans.get_cluster_centers()

    # Labels for data points
    result = kmeans.apply()

    # The csv module expects files opened with newline=''; without it extra
    # blank lines appear on platforms that translate line endings.
    with open(outputFile, 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index in dataLessTarget.index:
            filewriter.writerow([index, result[index].item(0)])

    with open(clustersOutputFile, 'w', newline='') as clusterFile:
        filewriter = csv.writer(clusterFile, quoting=csv.QUOTE_MINIMAL)
        for row in centers.transpose():
            filewriter.writerow(row.tolist())
def support_vector_regression(request):
    """Train an SVR model from the request and return its curve as JSON.

    The model is trained on the arguments produced by ``_read_data`` and
    evaluated on 100 evenly spaced points across the horizontal axis domain.

    Returns:
        ``HttpResponse`` whose body is a JSON list of ``{'x': ..., 'y': ...}``
        objects.

    Raises:
        Http404: on any failure while reading, training or predicting.
    """
    try:
        arguments = _read_data(request)
        svm = _train_svr(*arguments)

        domain = json.loads(request.POST['axis_domain'])
        x = np.linspace(domain['horizontal'][0], domain['horizontal'][1], 100)
        predictions = svm.apply(sg.RealFeatures(np.array([x]))).get_labels()
        y = np.array(predictions, dtype=np.float64)

        # ``range`` replaces the Python-2-only ``xrange``, which made this
        # view fail (and answer 404) on every request under Python 3.
        line_dot = [{'x': x[i], 'y': y[i]} for i in range(len(x))]
        return HttpResponse(json.dumps(line_dot))
    except Exception:
        raise Http404
def mmd_test(Sample1, Sample2):
    """Compute an unbiased quadratic-time MMD statistic column by column.

    For every column index of ``Sample1`` the matching columns of both
    samples are wrapped as single-row Shogun features and compared with a
    Gaussian kernel.

    NOTE(review): ``statistic`` is overwritten on every iteration, so with
    the ``return`` placed after the loop only the last column's value is
    returned - confirm whether a per-column result was intended.
    NOTE(review): if ``Sample1`` has no columns the loop body never runs and
    ``statistic`` is unbound at the ``return``.
    """
    for i in range(Sample1.shape[1]):
        x = Sample1[:, i]
        y = Sample2[:, i]
        # one row, one column per observation
        feat_p = sg.RealFeatures(x.reshape(1, len(x)))
        feat_q = sg.RealFeatures(y.reshape(1, len(y)))
        # choose kernel for testing. Here: Gaussian
        kernel_width = 1
        kernel = sg.GaussianKernel(10, kernel_width)
        # create mmd instance of test-statistic
        mmd = sg.QuadraticTimeMMD()
        mmd.set_kernel(kernel)
        mmd.set_p(feat_p)
        mmd.set_q(feat_q)
        # compute the unbiased test statistic
        mmd.set_statistic_type(sg.ST_UNBIASED_FULL)
        statistic = mmd.compute_statistic()
    return statistic
def RunMetrics(self, options):
    """Time the QDA run and, when labelled test data exist, score it.

    Args:
        options: forwarded to ``self.QDAShogun``.

    Returns:
        The negative value returned by ``self.QDAShogun`` when the timed run
        failed; otherwise a dict containing ``'Runtime'`` and, when
        ``self.dataset`` has at least three entries, the classification
        metrics computed from a freshly trained model.
    """
    Log.Info("Perform QDA.", self.verbose)

    runtime = self.QDAShogun(options)
    if runtime < 0:
        return runtime

    metrics = {'Runtime': runtime}

    # Without a test set and its true labels there is nothing to score.
    if len(self.dataset) < 3:
        return metrics

    trainData, labels = SplitTrainData(self.dataset)
    testData = LoadDataset(self.dataset[1])
    truelabels = LoadDataset(self.dataset[2])

    model = shogun.QDA(shogun.RealFeatures(trainData.T),
                       shogun.MulticlassLabels(labels))
    model.train()
    testFeatures = shogun.RealFeatures(testData.T)
    predictions = model.apply_multiclass(testFeatures).get_labels()

    confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
    metrics.update({
        'Avg Accuracy': Metrics.AverageAccuracy(confusionMatrix),
        'MultiClass Precision': Metrics.AvgPrecision(confusionMatrix),
        'MultiClass Recall': Metrics.AvgRecall(confusionMatrix),
        'MultiClass FMeasure': Metrics.AvgFMeasure(confusionMatrix),
        'MultiClass Lift': Metrics.LiftMultiClass(confusionMatrix),
        'MultiClass MCC': Metrics.MCCMultiClass(confusionMatrix),
        'MultiClass Information': Metrics.AvgMPIArray(
            confusionMatrix, truelabels, predictions),
        'Simple MSE': Metrics.SimpleMeanSquaredError(
            truelabels, predictions),
    })
    return metrics
def classify_gp(features, labels, kernel, domain, lik, learn, scale,
                returnValues=True):
    """Train a GP binary classifier and evaluate it on a 50 x 50 grid.

    Args:
        features: training features.
        labels: training labels.
        kernel: Shogun kernel used by the inference method.
        domain: mapping with ``'horizontal'`` and ``'vertical'`` entries,
            each a (min, max) pair delimiting the grid.
        lik: likelihood model passed to ``sg.EPInferenceMethod``.
        learn: ``'ML2'`` to run gradient-based model selection first.
        scale: kernel scale set on the inference method.
        returnValues: if True the grid holds ``get_values()`` of the
            prediction, otherwise ``get_labels()``.

    Returns:
        ``(x, y, z, best_width, best_param, best_scale)``: ``x`` and ``y``
        are the mesh grids and ``z`` the transposed grid of outputs.
        ``best_width`` and ``best_scale`` are 0.0 unless model selection
        ran; ``best_param`` is always 0.
    """
    mean = sg.ZeroMean()
    inf = sg.EPInferenceMethod(kernel, features, mean, labels, lik)
    inf.set_scale(scale)
    gp = sg.GaussianProcessBinaryClassification(inf)

    best_width = 0.0
    best_param = 0
    best_scale = 0.0

    if learn == 'ML2':
        # Model selection starts from unit scale (and unit width for a
        # Gaussian kernel).
        inf.set_scale(1)
        if kernel.get_name() == 'GaussianKernel':
            kernel.set_width(1)
        grad = sg.GradientEvaluation(gp, features, labels,
                                     sg.GradientCriterion(), False)
        grad.set_function(inf)
        grad_search = sg.GradientModelSelection(grad)
        best_combination = grad_search.select_model()
        best_combination.apply_to_machine(gp)
        try:
            best_width = sg.GaussianKernel.obtain_from_generic(
                inf.get_kernel()).get_width()
        except Exception:
            # Best effort: if the width cannot be obtained (e.g. the kernel
            # is not Gaussian) keep the default.
            pass
        best_scale = inf.get_scale()

    gp.train()

    size = 50
    x1 = np.linspace(domain['horizontal'][0], domain['horizontal'][1], size)
    y1 = np.linspace(domain['vertical'][0], domain['vertical'][1], size)
    x, y = np.meshgrid(x1, y1)

    # One grid point per column.
    test = sg.RealFeatures(np.array((np.ravel(x), np.ravel(y))))

    prediction = gp.apply(test)
    out = prediction.get_values() if returnValues else prediction.get_labels()

    z = np.transpose(out.reshape((size, size)))
    return x, y, z, best_width, best_param, best_scale
def run_hierarchical(self, nb_clusters, src_file, data_without_target,
                     dataset_name, run_number, run_info=None):
    """Set up Shogun hierarchical clustering for the given data frame.

    Only the objects are created: the features, a Euclidean distance over
    them and the ``shogun.Hierarchical`` instance.  Nothing is trained,
    written or returned here.  ``src_file`` and ``run_number`` are not
    used.
    """
    # ``_prepare_files`` must yield exactly one path in this mode.
    (output_file,) = self._prepare_files(dataset_name, run_info, False)

    feature_matrix = data_without_target.values.astype("float64").transpose()
    train_features = shogun.RealFeatures(feature_matrix)

    # Euclidean distance between the training vectors.
    euclidean = shogun.EuclideanDistance(train_features, train_features)
    hierarchical = shogun.Hierarchical(nb_clusters, euclidean)
def predict_mult(self, x_test):
    """Predict for multi-dimensional test inputs.

    Args:
        x_test: test inputs with one sample per row and ``self.input_dim``
            values per sample.

    Returns:
        ``None`` when ``self.model`` is a string; otherwise the pair
        ``(self.z_postmean, self.z_postvar)`` where the second entry is the
        square root of the model's variance vector.

    Always sets ``self.x_test``.
    """
    # The feature matrix needs one sample per column, i.e. the transpose of
    # x_test.  A plain reshape to (input_dim, n) keeps the flat element
    # order and therefore mixes values of different samples whenever
    # input_dim > 1.
    samples = np.array(x_test).reshape(len(x_test), self.input_dim)
    self.x_test = shogun.RealFeatures(samples.T)

    # A string in place of a model means there is nothing to predict with.
    if isinstance(self.model, str):
        return None

    self.z_postmean = self.model.apply_regression(self.x_test)
    self.z_postvar = np.sqrt(self.model.get_variance_vector(self.x_test))
    return self.z_postmean, self.z_postvar
def _predictive_process(feat_train, labels, noise_level, scale, kernel,
                        domain, learn, feat_induc, inf_select):
    """Evaluate the predictive distribution on a grid and package it.

    ``_process`` supplies the predictive means and variances for 75 test
    points spread over the horizontal domain.  For each of them a Gaussian
    density is evaluated at 50 points of the vertical domain, giving the
    matrix ``z``.

    Returns:
        A list with one dict per test point holding ``x``, the mean ``y``,
        the mean +/- two standard deviations (``range_upper`` /
        ``range_lower``), the selected hyper-parameters, and the density
        matrix ``z`` with its padded value range.  Every dict refers to the
        same ``z`` list object.
    """
    variances, means, best_width, best_scale, best_sigma = _process(
        feat_train, labels, noise_level, scale, kernel, domain, learn,
        feat_induc, inf_select, True)

    size = 75
    x_test = np.array(
        [np.linspace(domain['horizontal'][0], domain['horizontal'][1], size)])
    feat_test = sg.RealFeatures(x_test)

    y1 = np.linspace(domain['vertical'][0], domain['vertical'][1], 50)
    D = np.zeros((len(y1), size))
    # log_pdf_multiple expects a matrix; the same one serves every column
    y_grid = y1.reshape(1, len(y1))

    # evaluate normal distribution at every prediction point (column)
    for j in range(np.shape(D)[1]):
        # Gaussian distribution instance: mean vector and covariance matrix
        gauss = sg.GaussianDistribution(
            np.array(means[j]).reshape(1, ),
            np.array(variances[j]).reshape(1, 1))
        D[:, j] = np.exp(gauss.log_pdf_multiple(y_grid))

    z = np.transpose(D)
    z_max = np.nanmax(z)
    z_min = np.nanmin(z)
    z_delta = 0.1 * (z_max - z_min)

    # Fetched/converted once rather than once per result entry.
    x_values = feat_test.get_feature_matrix()[0]
    z_list = z.tolist()

    result = []
    for i in range(len(x_values)):
        two_std = 2 * np.sqrt(variances[i])
        result.append({
            'x': x_values[i],
            'y': means[i],
            'range_upper': means[i] + two_std,
            'range_lower': means[i] - two_std,
            'best_width': float(best_width),
            'best_scale': float(best_scale),
            'best_sigma': float(best_sigma),
            "status": "ok",
            "domain": [z_min - z_delta, z_max + z_delta],
            "max": z_max + z_delta,
            "min": z_min - z_delta,
            "z": z_list
        })
    return result
def run_gaussian(self, nb_clusters, src_file, data_without_target,
                 dataset_name, run_number, run_info=None):
    """Fit a Shogun Gaussian mixture model with EM and print the model.

    The data frame values are cast to float64, transposed and used as
    training features for a ``shogun.GMM`` with ``nb_clusters`` components.
    Nothing is written or returned; ``src_file`` and ``run_number`` are not
    used.
    """
    # ``_prepare_files`` must yield exactly one path in this mode.
    (output_file,) = self._prepare_files(dataset_name, run_info, False)

    feature_matrix = data_without_target.values.astype("float64").transpose()
    train_features = shogun.RealFeatures(feature_matrix)

    mixture = shogun.GMM(nb_clusters)
    mixture.set_features(train_features)
    mixture.train_em()
    print(mixture)
def _process(x1_set, x2_set, kernel_width, kernel_name, degree):
    """Compute the kernel matrix of 2-D points for the chosen kernel.

    Args:
        x1_set: first coordinates of the points.
        x2_set: second coordinates; must provide at least as many values as
            ``x1_set``.
        kernel_width: width used for ``"GaussianKernel"``.
        kernel_name: ``"LinearKernel"``, ``"PolynomialKernel"`` or
            ``"GaussianKernel"``.
        degree: degree used for ``"PolynomialKernel"``.

    Returns:
        The kernel matrix as a nested list.

    Raises:
        Http404: if ``x1_set`` is empty.
        ValueError: if ``kernel_name`` is not one of the supported names.
    """
    num = len(x1_set)
    if num == 0:
        raise Http404

    # 2 x n matrix, one point per column.
    examples = np.zeros((2, num))
    for i in range(num):
        examples[0, i] = x1_set[i]
        examples[1, i] = x2_set[i]
    feat_train = sg.RealFeatures(examples)

    # construct covariance function
    if kernel_name == "LinearKernel":
        kernel = sg.LinearKernel(feat_train, feat_train)
    elif kernel_name == "PolynomialKernel":
        kernel = sg.PolyKernel(feat_train, feat_train, degree, True)
    elif kernel_name == "GaussianKernel":
        kernel = sg.GaussianKernel(feat_train, feat_train, kernel_width)
    else:
        # Previously this fell through and failed on an unbound ``kernel``.
        raise ValueError("unsupported kernel: %r" % (kernel_name,))

    return kernel.get_kernel_matrix().tolist()
def _read_data(request): labels = [] features = [] data = json.loads(request.POST['point_set']) cost = float(request.POST['C']) tubeeps = float(request.POST['tube']) kernel_name = request.POST['kernel'] for pt in data: labels.append(float(pt["y"])) features.append(float(pt["x"])) labels = np.array(labels, dtype=np.float64) num = len(features) if num == 0: raise TypeError examples = np.zeros((1,num)) for i in xrange(num): examples[0,i] = features[i] lab = sg.RegressionLabels(labels) train = sg.RealFeatures(examples) kernel = get_kernel(request, train) return (cost, tubeeps, lab, kernel)
def _read_data(request): labels = [] features = [] data = json.loads(request.POST['point_set']) tau = float(request.POST['Tau']) for pt in data: labels.append(float(pt["y"])) features.append(float(pt["x"])) labels = np.array(labels, dtype=np.float64) num = len(features) if num == 0: raise TypeError examples = np.zeros((1, num)) for i in xrange(num): examples[0, i] = features[i] lab = sg.RegressionLabels(labels) train = sg.RealFeatures(examples) sigma = float(request.POST["sigma"]) kernel = sg.GaussianKernel(train, train, sigma) return (tau, lab, kernel, train)
# Example script: embed the swiss-roll data set with Shogun's Linear Local
# Tangent Space Alignment converter.
import shogun as sg
import data

# load data
feature_matrix = data.swissroll()
# create features instance
features = sg.RealFeatures(feature_matrix)

# create Linear Local Tangent Space Alignment converter instance
converter = sg.LinearLocalTangentSpaceAlignment()

# set target dimensionality
converter.set_target_dim(2)
# set number of neighbors
converter.set_k(10)
# set number of threads
converter.parallel.set_num_threads(2)
# set nullspace shift (optional)
converter.set_nullspace_shift(-1e-6)

# compute embedding with Linear Local Tangent Space Alignment method
embedding = converter.embed(features)