def RunQDAShogun():
    # Note: `self` and `options` come from the enclosing method's scope.
    totalTimer = Timer()
    Log.Info("Loading dataset", self.verbose)
    try:
        # Load train and test dataset.
        trainData = np.genfromtxt(self.dataset[0], delimiter=',')
        trainFeat = modshogun.RealFeatures(trainData[:, :-1].T)
        if len(self.dataset) == 2:
            testSet = np.genfromtxt(self.dataset[1], delimiter=',')
            testFeat = modshogun.RealFeatures(testSet.T)

        if len(options) > 0:
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        # Labels are the last column of the training set.
        labels = modshogun.MulticlassLabels(
            trainData[:, (trainData.shape[1] - 1)])

        with totalTimer:
            model = modshogun.QDA(trainFeat, labels)
            model.train()
            if len(self.dataset) == 2:
                model.apply_multiclass(testFeat).get_labels()
    except Exception as e:
        return -1

    return totalTimer.ElapsedTime()
def get_CosineDistance(xm, ym):
    # Cosine distance via Shogun; RealFeatures expects column-instance
    # data, so transpose the row-instance inputs first.
    xm = np.array(xm).T
    ym = np.array(ym).T
    fxm = modshogun.RealFeatures(xm)
    fym = modshogun.RealFeatures(ym)
    return modshogun.CosineDistance(fxm, fym).get_distance_matrix()
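# A minimal usage sketch for get_CosineDistance with made-up data: each row
# is one point, each column one dimension; the result is the matrix of
# pairwise cosine distances between the two point sets.
xm = np.random.randn(5, 3)
ym = np.random.randn(4, 3)
dist_matrix = get_CosineDistance(xm, ym)
print(dist_matrix.shape)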
def RunMetrics(self, options):
    Log.Info("Perform QDA.", self.verbose)

    results = self.QDAShogun(options)
    if results < 0:
        return results

    metrics = {'Runtime': results}

    if len(self.dataset) >= 3:
        trainData, labels = SplitTrainData(self.dataset)
        testData = LoadDataset(self.dataset[1])
        truelabels = LoadDataset(self.dataset[2])

        model = modshogun.QDA(modshogun.RealFeatures(trainData.T),
                              modshogun.MulticlassLabels(labels))
        model.train()
        predictions = model.apply(
            modshogun.RealFeatures(testData.T)).get_labels()

        confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
        metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
        metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
        metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
        metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
        metrics['MSE'] = Metrics.SimpleMeanSquaredError(
            truelabels, predictions)

    return metrics
def RunMetrics(self, options):
    if len(self.dataset) >= 3:
        trainData, labels = SplitTrainData(self.dataset)
        testData = LoadDataset(self.dataset[1])
        truelabels = LoadDataset(self.dataset[2])

        model = modshogun.QDA(modshogun.RealFeatures(trainData.T),
                              modshogun.MulticlassLabels(labels))
        model.train()
        predictions = model.apply(
            modshogun.RealFeatures(testData.T)).get_labels()

        # Datastructure to store the results.
        metrics = {}

        confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
        metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
        metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
        metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
        metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
        metrics['MSE'] = Metrics.SimpleMeanSquaredError(
            truelabels, predictions)
        return metrics
    else:
        Log.Fatal("This method requires three datasets!")
def RunQDAShogun(q):
    # Note: `self` comes from the enclosing method's scope; `q` is a queue
    # used to report the result back to the caller.
    totalTimer = Timer()
    Log.Info("Loading dataset", self.verbose)
    try:
        # Load train and test dataset.
        trainData = np.genfromtxt(self.dataset[0], delimiter=',')
        trainFeat = modshogun.RealFeatures(trainData[:, :-1].T)
        if len(self.dataset) == 2:
            testSet = np.genfromtxt(self.dataset[1], delimiter=',')
            testFeat = modshogun.RealFeatures(testSet.T)

        # Labels are the last column of the training set.
        labels = modshogun.MulticlassLabels(
            trainData[:, (trainData.shape[1] - 1)])

        with totalTimer:
            model = modshogun.QDA(trainFeat, labels)
            model.train()
            if len(self.dataset) == 2:
                model.apply(testFeat).get_labels()
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def _read_toy_data(request):
    y_set = []
    x_set = []
    x_set_induc = []
    points = []
    points_induc = []
    model_sel_error = False
    toy_data = json.loads(request.POST['point_set'])

    # Split the clicked points into regular points (label 1) and
    # inducing points (label -1).
    for pt in toy_data:
        if int(pt['label']) == 1:
            points.append(pt)
        elif pt['label'] == -1:
            points_induc.append(pt)

    for pt in points:
        y_set.append(float(pt["y"]))
        x_set.append(float(pt["x"]))
    for pt in points_induc:
        x_set_induc.append(float(pt["x"]))

    noise_level = float(request.POST['noise_level'])
    scale = float(request.POST['scale'])
    inf = request.POST['inf']
    domain = json.loads(request.POST['axis_domain'])

    labels = np.array(y_set, dtype=np.float64)
    num = len(x_set)
    if num == 0:
        raise Http404

    examples = np.zeros((1, num))
    for i in xrange(num):
        examples[0, i] = x_set[i]

    feat_train = sg.RealFeatures(examples)
    labels = sg.RegressionLabels(labels)

    # Get inducing points.
    num_induc = len(x_set_induc)
    if num_induc != 0:
        examples_induc = np.zeros((1, num_induc))
        for i in xrange(num_induc):
            examples_induc[0, i] = x_set_induc[i]
        feat_train_induc = sg.RealFeatures(examples_induc)
    else:
        feat_train_induc = None

    kernel = get_kernel(request, feat_train)

    try:
        learn = request.POST["learn"]
    except KeyError:
        raise ValueError("Argument Error")

    if int(feat_train.get_num_vectors()) > 100 and learn == "ML2":
        model_sel_error = True

    return (feat_train, labels, noise_level, scale, kernel, domain, learn,
            feat_train_induc, inf), model_sel_error
def fit(self, x):
    # Shogun expects column-instance features, so transpose the input.
    x = np.array(x).T
    features_train = modshogun.RealFeatures(x)
    distance = self.distance(features_train, features_train)

    self.kmeans = modshogun.KMeans(self.k, distance)
    self.kmeans.train()
    self.cluster_centers_ = self.kmeans.get_cluster_centers().T

    # Assign each training point to its nearest cluster center.
    kcc = modshogun.RealFeatures(self.cluster_centers_.T)
    discc = self.distance(kcc, features_train).get_distance_matrix()
    self.labels_ = np.copy(discc.argsort(axis=0)[0, :]).T
    return self
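# The label assignment above works because the distance matrix has one row
# per cluster center and one column per point, so argsort(axis=0)[0, :]
# picks, for each column (point), the row index (center) with the smallest
# distance. A standalone sketch with made-up numbers:
discc = np.array([[0.1, 2.0, 1.5],
                  [1.2, 0.3, 0.4]])   # 2 centers x 3 points
labels = discc.argsort(axis=0)[0, :]  # -> array([0, 1, 1])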
def linear_mmd_test(X, Y, null_samples=1000):
    # MMD two-sample test with a linear kernel (quadratic-time estimator);
    # the null distribution is approximated by sampling.
    mmd = sg.QuadraticTimeMMD()
    mmd.set_p(sg.RealFeatures(X.T.astype(np.float64)))
    mmd.set_q(sg.RealFeatures(Y.T.astype(np.float64)))
    mmd.set_kernel(sg.LinearKernel())

    mmd.set_num_null_samples(null_samples)
    samps = mmd.sample_null()
    stat = mmd.compute_statistic()

    p_val = np.mean(stat <= samps)
    return p_val, stat, samps
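# A minimal usage sketch with synthetic data (mirroring the standalone MMD
# script further below): under a mean shift the test should reject.
X = np.random.randn(100, 3)
Y = np.random.randn(100, 3) + .5
p_val, stat, samps = linear_mmd_test(X, Y)
print(p_val)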
def regress_dump(request):
    try:
        data_set = request.POST['data_set']
        feature = request.POST['feature']
        temp_feats = sg.RealFeatures(
            sg.CSVFile(REGRESS_DATA_DIR + REGRESS_DATA_SET[data_set]))
        labels = sg.RegressionLabels(
            sg.CSVFile(REGRESS_DATA_DIR + REGRESS_LABELS[data_set]))
        lab = labels.get_labels()

        # Rescale features to 0...1.
        preproc = sg.RescaleFeatures()
        preproc.init(temp_feats)
        temp_feats.add_preprocessor(preproc)
        temp_feats.apply_preprocessor(True)
        mat = temp_feats.get_feature_matrix()

        if feature == 'CRIM':
            feat = mat[0]
        elif feature == 'DIS':
            feat = mat[7]
        elif feature == 'INDUS':
            feat = mat[2]
        elif feature == 'LSTAT':
            feat = mat[12]
    except:
        raise Http404

    toy_data = []
    for i in xrange(len(feat)):
        toy_data.append({'x': feat[i], 'y': lab[i], 'label': float(0)})
    return HttpResponse(json.dumps(toy_data))
def get_binary_features(request):
    try:
        point_set_raw = json.loads(request.POST['point_set'])
    except:
        raise ValueError("cannot read click pts")

    class_a_point_set = []
    class_b_point_set = []
    for point in point_set_raw:
        if point['label'] == 1:
            class_a_point_set.append([point['x'], point['y']])
        else:
            class_b_point_set.append([point['x'], point['y']])

    class_a = np.transpose(np.array(class_a_point_set, dtype=float))
    class_b = np.transpose(np.array(class_b_point_set, dtype=float))

    if not (len(class_a) + len(class_b)):
        raise ValueError("labels not enough")
    else:
        features = np.concatenate((class_a, class_b), axis=1)
        # The label vectors are 1-D, so concatenate along the default axis.
        labels = np.concatenate(
            (np.ones(class_a.shape[1]), -np.ones(class_b.shape[1])))
        features = sg.RealFeatures(features)
        labels = sg.BinaryLabels(labels)
        return features, labels
def get_multi_features(request):
    try:
        point_set_raw = json.loads(request.POST['point_set'])
    except:
        raise ValueError("cannot read click pts")

    x = []
    y = []
    labels = []
    for pt in point_set_raw:
        x.append(float(pt['x']))
        y.append(float(pt['y']))
        labels.append(float(pt['label']))

    n = len(set(labels))
    if not n:
        raise ValueError("0-labels")
    elif n == 1:
        raise ValueError("1-class-labels")
    else:
        features = np.array([x, y])
        features = sg.RealFeatures(features)
        labels = sg.MulticlassLabels(np.array(labels))
        return features, labels
def __init__(self, X, y, n_importance, prior_log_pdf, ridge=0.,
             num_shogun_threads=1):
    self.n_importance = n_importance
    self.prior_log_pdf = prior_log_pdf
    self.ridge = ridge
    self.X = X
    self.y = y
    self.num_shogun_threads = num_shogun_threads

    # Tell Shogun how many threads to use.
    logger.debug("Using Shogun with %d threads" % self.num_shogun_threads)
    sg.ZeroMean().parallel.set_num_threads(self.num_shogun_threads)

    # Shogun representation of data.
    self.sg_labels = sg.BinaryLabels(self.y)
    self.sg_feats_train = sg.RealFeatures(self.X.T)

    # ARD: set theta, which is in log-scale, as kernel weights.
    self.sg_kernel = sg.GaussianARDKernel(10, 1)

    self.sg_mean = sg.ZeroMean()
    self.sg_likelihood = sg.LogitLikelihood()
def classify_perceptron(classifier, features, labels, learn=1, bias=0):
    perceptron = classifier(features, labels)
    perceptron.set_learn_rate(learn)
    perceptron.set_max_iter(100)
    perceptron.set_bias(bias)
    perceptron.train()

    # Evaluate the trained perceptron on a regular grid over the unit square.
    size = 100
    x1 = np.linspace(0, 1, size)
    y1 = np.linspace(0, 1, size)
    x, y = np.meshgrid(x1, y1)

    test = sg.RealFeatures(np.array((np.ravel(x), np.ravel(y))))
    outl = perceptron.apply(test).get_labels()
    outv = perceptron.apply(test).get_values()

    # Normalize output.
    outv /= np.max(outv)

    z_value = outv.reshape((size, size))
    z_value = np.transpose(z_value)
    z_label = outl.reshape((size, size))
    z_label = np.transpose(z_label)
    # Add a tiny random jitter to the label grid.
    z_label = z_label + np.random.rand(*z_label.shape) * 0.01
    return z_value, z_label
def regression(request):
    try:
        domain = json.loads(request.POST['axis_domain'])
        X = np.linspace(domain['horizontal'][0], domain['horizontal'][1], 100)
        x = np.array([X])
        feat = sg.RealFeatures(x)

        arguments = _read_data(request)
        tool = request.POST['regression']
        if tool == 'LeastSquaresRegression':
            ls = _train_ls(*arguments)
            y = _apply_ls(feat, ls)
        elif tool == 'LinearRidgeRegression':
            lrr = _train_lrr(*arguments)
            y = _apply_lrr(feat, lrr)
        elif tool == 'KernelRidgeRegression':
            krr, kernel, train = _train_krr(*arguments)
            y = _apply_krr(kernel, train, feat, krr)

        line_dot = []
        for i in xrange(len(X)):
            line_dot.append({'x': X[i], 'y': y[i]})
        return HttpResponse(json.dumps(line_dot))
    except:
        raise Http404
def _train_clustering(point_set, distance_name, k):
    labels = np.array([0] * len(point_set))
    features = np.zeros((2, len(point_set)))

    for i in xrange(len(point_set)):
        features[0, i] = point_set[i]['x']
        features[1, i] = point_set[i]['y']
        labels[i] = point_set[i]['label']

    lab = sg.BinaryLabels(labels)
    train = sg.RealFeatures(features)

    if distance_name == "EuclideanDistance":
        distance = sg.EuclideanDistance(train, train)
    elif distance_name == "ManhattanMetric":
        distance = sg.ManhattanMetric(train, train)
    elif distance_name == "JensenMetric":
        distance = sg.JensenMetric(train, train)
    else:
        raise TypeError

    kmeans = sg.KMeans(k, distance)
    kmeans.train()
    return kmeans
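# A minimal usage sketch for _train_clustering with made-up click points:
# cluster three points into two groups and read back the learned centers
# (get_cluster_centers is the same accessor used in the KMeans wrapper above).
point_set = [{'x': 0.1, 'y': 0.2, 'label': 0},
             {'x': 0.8, 'y': 0.9, 'label': 0},
             {'x': 0.2, 'y': 0.1, 'label': 0}]
kmeans = _train_clustering(point_set, "EuclideanDistance", 2)
centers = kmeans.get_cluster_centers()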
def shogun_mmd(X, Y, kernel_width, null_samples=1000, median_samples=1000,
               cache_size=32):
    '''
    Run an MMD test using a Gaussian kernel.

    Parameters
    ----------
    X : row-instance feature array
    Y : row-instance feature array
    kernel_width : float
        The bandwidth of the RBF kernel (sigma).
    null_samples : int
        How many times to sample from the null distribution.

    Returns
    -------
    p_val : float
        The obtained p value of the test.
    stat : float
        The test statistic.
    null_samples : array of length null_samples
        The samples from the null distribution.
    '''
    import modshogun as sg

    mmd = sg.QuadraticTimeMMD()
    mmd.set_p(sg.RealFeatures(X.T.astype(np.float64)))
    mmd.set_q(sg.RealFeatures(Y.T.astype(np.float64)))
    mmd.set_kernel(sg.GaussianKernel(cache_size, float(kernel_width)))

    mmd.set_num_null_samples(null_samples)
    samps = mmd.sample_null()
    stat = mmd.compute_statistic()

    p_val = np.mean(stat <= samps)
    return p_val, stat, samps
def support_vector_regression(request):
    try:
        arguments = _read_data(request)
        svm = _train_svr(*arguments)

        domain = json.loads(request.POST['axis_domain'])
        x = np.linspace(domain['horizontal'][0], domain['horizontal'][1], 100)
        y = np.array(svm.apply(sg.RealFeatures(np.array([x]))).get_labels(),
                     dtype=np.float64)

        line_dot = []
        for i in xrange(len(x)):
            line_dot.append({'x': x[i], 'y': y[i]})
        return HttpResponse(json.dumps(line_dot))
    except:
        raise Http404
def classify_gp(features, labels, kernel, domain, lik, learn, scale,
                returnValues=True):
    mean = sg.ZeroMean()
    inf = sg.EPInferenceMethod(kernel, features, mean, labels, lik)
    inf.set_scale(scale)
    gp = sg.GaussianProcessBinaryClassification(inf)

    best_width = 0.0
    best_param = 0
    best_degree = 0
    best_scale = 0.0

    if learn == 'ML2':
        # Gradient-based model selection over the kernel hyperparameters.
        inf.set_scale(1)
        if kernel.get_name() == 'GaussianKernel':
            kernel.set_width(1)
        grad = sg.GradientEvaluation(gp, features, labels,
                                     sg.GradientCriterion(), False)
        grad.set_function(inf)
        grad_search = sg.GradientModelSelection(grad)
        best_combination = grad_search.select_model()
        best_combination.apply_to_machine(gp)
        try:
            best_width = sg.GaussianKernel.obtain_from_generic(
                inf.get_kernel()).get_width()
        except:
            pass
        best_scale = inf.get_scale()

    gp.train()

    # Evaluate the trained GP on a regular grid over the plotting domain.
    size = 50
    x1 = np.linspace(domain['horizontal'][0], domain['horizontal'][1], size)
    y1 = np.linspace(domain['vertical'][0], domain['vertical'][1], size)
    x, y = np.meshgrid(x1, y1)

    test = sg.RealFeatures(np.array((np.ravel(x), np.ravel(y))))

    if returnValues:
        out = gp.apply(test).get_values()
    else:
        out = gp.apply(test).get_labels()

    z = out.reshape((size, size))
    z = np.transpose(z)
    return x, y, z, best_width, best_param, best_scale
def _predictive_process(feat_train, labels, noise_level, scale, kernel,
                        domain, learn, feat_induc, inf_select):
    variances, means, best_width, best_scale, best_sigma = _process(
        feat_train, labels, noise_level, scale, kernel, domain, learn,
        feat_induc, inf_select, True)

    size = 75
    x_test = np.array(
        [np.linspace(domain['horizontal'][0], domain['horizontal'][1], size)])
    feat_test = sg.RealFeatures(x_test)

    y1 = np.linspace(domain['vertical'][0], domain['vertical'][1], 50)
    D = np.zeros((len(y1), size))

    # Evaluate the normal distribution at every prediction point (column).
    for j in range(np.shape(D)[1]):
        # Create a Gaussian distribution instance; it expects a mean vector
        # and a covariance matrix, hence the reshapes.
        gauss = sg.GaussianDistribution(
            np.array(means[j]).reshape(1, ),
            np.array(variances[j]).reshape(1, 1))
        # Evaluate the predictive distribution for the test point; the
        # method expects a matrix.
        D[:, j] = np.exp(gauss.log_pdf_multiple(y1.reshape(1, len(y1))))

    z = np.transpose(D)
    z_max = np.nanmax(z)
    z_min = np.nanmin(z)
    z_delta = 0.1 * (np.nanmax(z) - np.nanmin(z))

    result = []
    for i in xrange(len(feat_test.get_feature_matrix()[0])):
        result.append({
            'x': feat_test.get_feature_matrix()[0][i],
            'y': means[i],
            'range_upper': means[i] + 2 * np.sqrt(variances[i]),
            'range_lower': means[i] - 2 * np.sqrt(variances[i]),
            'best_width': float(best_width),
            'best_scale': float(best_scale),
            'best_sigma': float(best_sigma),
            "status": "ok",
            "domain": [z_min - z_delta, z_max + z_delta],
            "max": z_max + z_delta,
            "min": z_min - z_delta,
            "z": z.tolist()
        })
    return result
def _process(x1_set, x2_set, kernel_width, kernel_name, degree):
    num = len(x1_set)
    if num == 0:
        raise Http404

    examples = np.zeros((2, num))
    for i in xrange(num):
        examples[0, i] = x1_set[i]
        examples[1, i] = x2_set[i]
    feat_train = sg.RealFeatures(examples)

    # Construct covariance function.
    if kernel_name == "LinearKernel":
        kernel = sg.LinearKernel(feat_train, feat_train)
    elif kernel_name == "PolynomialKernel":
        kernel = sg.PolyKernel(feat_train, feat_train, degree, True)
    elif kernel_name == "GaussianKernel":
        kernel = sg.GaussianKernel(feat_train, feat_train, kernel_width)
    else:
        raise TypeError

    kernel_matrix = kernel.get_kernel_matrix()
    return kernel_matrix.tolist()
def _read_data(request):
    labels = []
    features = []
    data = json.loads(request.POST['point_set'])
    cost = float(request.POST['C'])
    tubeeps = float(request.POST['tube'])
    kernel_name = request.POST['kernel']

    for pt in data:
        labels.append(float(pt["y"]))
        features.append(float(pt["x"]))

    labels = np.array(labels, dtype=np.float64)
    num = len(features)
    if num == 0:
        raise TypeError

    examples = np.zeros((1, num))
    for i in xrange(num):
        examples[0, i] = features[i]

    lab = sg.RegressionLabels(labels)
    train = sg.RealFeatures(examples)
    kernel = get_kernel(request, train)
    return (cost, tubeeps, lab, kernel)
def _read_data(request):
    labels = []
    features = []
    data = json.loads(request.POST['point_set'])
    tau = float(request.POST['Tau'])

    for pt in data:
        labels.append(float(pt["y"]))
        features.append(float(pt["x"]))

    labels = np.array(labels, dtype=np.float64)
    num = len(features)
    if num == 0:
        raise TypeError

    examples = np.zeros((1, num))
    for i in xrange(num):
        examples[0, i] = features[i]

    lab = sg.RegressionLabels(labels)
    train = sg.RealFeatures(examples)
    sigma = float(request.POST["sigma"])
    kernel = sg.GaussianKernel(train, train, sigma)
    return (tau, lab, kernel, train)
import modshogun as sg
import data
import numpy as np

# load data
feature_matrix = data.swissroll()
# create features instance
features = sg.RealFeatures(feature_matrix)

# create Isomap converter instance
converter = sg.Isomap()

# set target dimensionality
converter.set_target_dim(2)
# compute embedding with Isomap method
embedding = converter.embed(features)

# enable landmark approximation
converter.set_landmark(True)
# set number of landmarks
converter.set_landmark_number(100)
# set number of threads
converter.parallel.set_num_threads(2)
# compute approximate embedding
approx_embedding = converter.embed(features)

# disable landmark approximation
converter.set_landmark(False)

# compute cosine distance matrix 'manually'
N = features.get_num_vectors()
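# The script above stops short of the 'manual' computation. A plausible
# continuation in plain numpy (an assumption, not part of the original):
# with column-instance data, the cosine distance between columns i and j is
# 1 - <x_i, x_j> / (||x_i|| * ||x_j||), yielding an N x N matrix.
X = feature_matrix
norms = np.linalg.norm(X, axis=0)
cosine_similarity = X.T.dot(X) / np.outer(norms, norms)
distance_matrix = 1.0 - cosine_similarity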
def _process(feat_train, labels, noise_level, scale, kernel, domain, learn,
             feat_induc, inf_select, return_values=False):
    n_dimensions = 1

    likelihood = sg.GaussianLikelihood()
    if learn == 'ML2':
        likelihood.set_sigma(1)
    else:
        likelihood.set_sigma(noise_level)

    covar_parms = np.log([2])
    hyperparams = {'covar': covar_parms, 'lik': np.log([1])}

    # construct covariance function
    SECF = kernel
    covar = SECF
    zmean = sg.ZeroMean()

    if str(inf_select) == 'ExactInferenceMethod':
        inf = sg.ExactInferenceMethod(SECF, feat_train, zmean, labels,
                                      likelihood)
        if learn == 'ML2':
            inf.set_scale(1)
        else:
            inf.set_scale(scale)
    elif str(inf_select) == 'FITCInferenceMethod':
        if feat_induc is not None:
            inf = sg.FITCInferenceMethod(SECF, feat_train, zmean, labels,
                                         likelihood, feat_induc)
            if learn == 'ML2':
                inf.set_scale(1)
            else:
                inf.set_scale(scale)
        else:
            raise ValueError("Argument Error")

    # locations of uniformly spaced predictions
    size = 75
    x_test = np.array(
        [np.linspace(domain['horizontal'][0], domain['horizontal'][1], size)])
    feat_test = sg.RealFeatures(x_test)

    gp = sg.GaussianProcessRegression(inf)

    best_width = 0.0
    best_scale = 0.0
    best_sigma = 0.0

    if learn == 'ML2':
        grad = sg.GradientEvaluation(gp, feat_train, labels,
                                     sg.GradientCriterion(), False)
        grad.set_function(inf)
        grad_search = sg.GradientModelSelection(grad)
        best_combination = grad_search.select_model()
        best_combination.apply_to_machine(gp)
        best_scale = inf.get_scale()
        best_sigma = sg.GaussianLikelihood.obtain_from_generic(
            inf.get_model()).get_sigma()
        if kernel.get_name() == 'GaussianKernel':
            best_width = sg.GaussianKernel.obtain_from_generic(
                inf.get_kernel()).get_width()

    gp.train()

    # gp.set_return_type(sg.GaussianProcessRegression.GP_RETURN_COV)
    covariance = gp.get_variance_vector(feat_test)
    # gp.set_return_type(sg.GaussianProcessRegression.GP_RETURN_MEANS)
    predictions = gp.get_mean_vector(feat_test)

    result = []
    for i in xrange(len(feat_test.get_feature_matrix()[0])):
        result.append({
            'x': feat_test.get_feature_matrix()[0][i],
            'y': predictions[i],
            'range_upper': predictions[i] + 2 * np.sqrt(covariance[i]),
            'range_lower': predictions[i] - 2 * np.sqrt(covariance[i]),
            'best_width': float(best_width),
            'best_scale': float(best_scale),
            'best_sigma': float(best_sigma)
        })

    if not return_values:
        return result
    else:
        return covariance, predictions, best_width, best_scale, best_sigma
import numpy as np
import modshogun as sg

X = np.random.randn(100, 3)
Y = np.random.randn(100, 3) + .5

mmd = sg.QuadraticTimeMMD()
mmd.set_p(sg.RealFeatures(X.T))
mmd.set_q(sg.RealFeatures(Y.T))
mmd.set_kernel(sg.GaussianKernel(32, 1))

mmd.set_num_null_samples(200)
samps = mmd.sample_null()
stat = mmd.compute_statistic()
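# The script stops after drawing null samples; following the MMD test
# helpers above, a p-value can be obtained by comparing the statistic
# against the sampled null distribution:
p_val = np.mean(stat <= samps)
print(p_val)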
def classify_svm(classifier, features, labels, kernel, domain, learn, value,
                 C=1, returnValues=True):
    if learn == 'GridSearch':
        svm = classifier()

        # Build the model selection parameter tree.
        root = sg.ModelSelectionParameters()
        c1 = sg.ModelSelectionParameters("C1")
        root.append_child(c1)
        c1.build_values(1.0, 10.0, sg.R_LINEAR, 2)
        c2 = sg.ModelSelectionParameters("C2")
        root.append_child(c2)
        c2.build_values(1.0, 10.0, sg.R_LINEAR, 2)

        if kernel.get_name() == 'GaussianKernel':
            param_kernel = sg.ModelSelectionParameters("kernel", kernel)
            width = sg.ModelSelectionParameters("width")
            width.build_values(0.0, 10.0, sg.R_LINEAR, 0.5)
            param_kernel.append_child(width)
            root.append_child(param_kernel)
        elif kernel.get_name() == 'PolyKernel':
            param_kernel = sg.ModelSelectionParameters("kernel", kernel)
            degree = sg.ModelSelectionParameters("degree")
            if value:
                degree.build_values(value[0], value[1], sg.R_LINEAR)
            else:
                degree.build_values(0, 5, sg.R_LINEAR)
            param_kernel.append_child(degree)
            root.append_child(param_kernel)
        elif kernel.get_name() == 'LinearKernel':
            param_kernel = sg.ModelSelectionParameters("kernel", kernel)
            root.append_child(param_kernel)

        # Count labels per class; cross-validation needs enough of each.
        pos = 0
        neg = 0
        for i in range(0, labels.get_num_labels()):
            if labels.get_label(i) == 1:
                pos += 1
            else:
                neg += 1

        if pos < 2 or neg < 2:
            class LabelsError(Exception):
                pass
            raise LabelsError('Need at least two labels from one class')
        elif pos < 3 or neg < 3:
            splitting_strategy = sg.StratifiedCrossValidationSplitting(
                labels, 2)
        else:
            splitting_strategy = sg.StratifiedCrossValidationSplitting(
                labels, 3)

        evaluation_criterium = sg.ContingencyTableEvaluation(sg.ACCURACY)
        cross = sg.CrossValidation(svm, features, labels, splitting_strategy,
                                   evaluation_criterium)
        cross.set_num_runs(2)

        grid_search = sg.GridSearchModelSelection(cross, root)
        best_combination = grid_search.select_model()
        best_combination.apply_to_machine(svm)
    else:
        svm = classifier(C, kernel, labels)

    svm.train(features)

    # Evaluate the trained SVM on a regular grid over the plotting domain.
    size = 100
    x1 = np.linspace(domain['horizontal'][0], domain['horizontal'][1], size)
    y1 = np.linspace(domain['vertical'][0], domain['vertical'][1], size)
    x, y = np.meshgrid(x1, y1)

    test = sg.RealFeatures(np.array((np.ravel(x), np.ravel(y))))
    kernel.init(features, test)

    if returnValues:
        out = svm.apply(test).get_values()
    else:
        out = svm.apply(test).get_labels()

    z = out.reshape((size, size))
    z = np.transpose(z)
    return x, y, z
def rbf_mmd_test(X, Y, bandwidth='median', null_samples=1000,
                 median_samples=1000, cache_size=32):
    '''
    Run an MMD test using a Gaussian kernel.

    Parameters
    ----------
    X : row-instance feature array
    Y : row-instance feature array
    bandwidth : float or 'median'
        The bandwidth of the RBF kernel (sigma).
        If 'median', estimates the median pairwise distance in the
        aggregate sample and uses that.
    null_samples : int
        How many times to sample from the null distribution.
    median_samples : int
        How many points to use for estimating the bandwidth.

    Returns
    -------
    p_val : float
        The obtained p value of the test.
    stat : float
        The test statistic.
    null_samples : array of length null_samples
        The samples from the null distribution.
    bandwidth : float
        The used kernel bandwidth.
    '''
    if bandwidth == 'median':
        from sklearn.metrics.pairwise import euclidean_distances
        sub = lambda feats, n: feats[np.random.choice(
            feats.shape[0], min(feats.shape[0], n), replace=False)]
        Z = np.r_[sub(X, median_samples // 2), sub(Y, median_samples // 2)]
        D2 = euclidean_distances(Z, squared=True)
        upper = D2[np.triu_indices_from(D2, k=1)]
        kernel_width = np.median(upper, overwrite_input=True)
        bandwidth = np.sqrt(kernel_width / 2)
        # sigma = median / sqrt(2); works better, sometimes at least
        del Z, D2, upper
    else:
        kernel_width = 2 * bandwidth**2

    mmd = sg.QuadraticTimeMMD()
    mmd.set_p(sg.RealFeatures(X.T.astype(np.float64)))
    mmd.set_q(sg.RealFeatures(Y.T.astype(np.float64)))
    mmd.set_kernel(sg.GaussianKernel(cache_size, kernel_width))

    mmd.set_num_null_samples(null_samples)
    samps = mmd.sample_null()
    stat = mmd.compute_statistic()

    p_val = np.mean(stat <= samps)
    return p_val, stat, samps, bandwidth
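# A minimal usage sketch for rbf_mmd_test with synthetic data, using the
# median heuristic to pick the kernel bandwidth:
X = np.random.randn(100, 3)
Y = np.random.randn(100, 3) + .5
p_val, stat, samps, bw = rbf_mmd_test(X, Y)
print(p_val, bw)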
def fit_predict(self, x):
    # Assign each point to its nearest cluster center, mirroring the
    # distance-based assignment and data orientation used in fit().
    x = np.array(x).T
    features_train = modshogun.RealFeatures(x)
    kcc = modshogun.RealFeatures(self.cluster_centers_.T)
    discc = self.distance(kcc, features_train).get_distance_matrix()
    return np.copy(discc.argsort(axis=0)[0, :]).T
def get_estimates(gen, sigmas=None, n_reps=100, n_null_samps=1000,
                  cache_size=64, rep_states=False, name=None,
                  save_samps=False, thresh_levels=(.2, .1, .05, .01)):
    if sigmas is None:
        sigmas = np.logspace(-1.7, 1.7, num=30)
    sigmas = np.asarray(sigmas)

    # One Gaussian kernel per bandwidth, evaluated through the
    # multi-kernel interface.
    mmd = sg.QuadraticTimeMMD()
    mmd.set_num_null_samples(n_null_samps)
    mmd_mk = mmd.multikernel()
    for s in sigmas:
        mmd_mk.add_kernel(sg.GaussianKernel(cache_size, 2 * s**2))

    info = OrderedDict()
    for k in 'sigma rep mmd_est var_est p'.split():
        info[k] = []
    thresh_names = []
    for l in thresh_levels:
        s = 'thresh_{}'.format(l)
        thresh_names.append(s)
        info[s] = []
    if save_samps:
        info['samps'] = []
    thresh_prob = 1 - np.asarray(thresh_levels)

    bar = pb.ProgressBar()
    if name is not None:
        bar.start()
        bar.widgets.insert(0, '{} '.format(name))

    for rep in bar(xrange(n_reps)):
        if rep_states:
            rep = np.random.randint(0, 2**32)
            X, Y = gen(rs=rep)
        else:
            X, Y = gen()
        n = X.shape[0]
        assert Y.shape[0] == n

        mmd.set_p(sg.RealFeatures(X.T))
        mmd.set_q(sg.RealFeatures(Y.T))

        info['sigma'].extend(sigmas)
        info['rep'].extend([rep] * len(sigmas))

        stat = mmd_mk.compute_statistic()
        info['mmd_est'].extend(stat / (n / 2))

        samps = mmd_mk.sample_null()
        info['p'].extend(np.mean(samps >= stat, axis=0))
        if save_samps:
            info['samps'].extend(samps.T)

        info['var_est'].extend(mmd_mk.compute_variance_h1())

        threshes = np.asarray(mquantiles(samps, prob=thresh_prob, axis=0))
        for s, t in zip(thresh_names, threshes):
            info[s].extend(t)

    info = pd.DataFrame(info)
    info.set_index(['sigma', 'rep'], inplace=True)
    return info
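# A minimal usage sketch for get_estimates; `gen` is any callable returning
# two equally sized row-instance samples (this generator is made up for
# illustration):
def gen():
    X = np.random.randn(200, 2)
    Y = np.random.randn(200, 2) + 0.3
    return X, Y

info = get_estimates(gen, sigmas=np.logspace(-1, 1, num=10), n_reps=5)
print(info.groupby(level='sigma')['p'].mean())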