def testAllKernels(self):
    X = self.X
    y = np.random.rand(X.shape[0], 1)
    Ks = [Kinterface(data=X, kernel=exponential_kernel, kernel_args={"gamma": 0.1}),
          Kinterface(data=X, kernel=matern32_gpy, kernel_args={"lengthscale": 3.0}),
          Kinterface(data=X, kernel=matern52_gpy, kernel_args={"lengthscale": 5.0}),
          # Kinterface(data=X, kernel=periodic_gpy, kernel_args={"lengthscale": 5.0, "period": 4.0}),
          ]
    Km = sum([K[:, :] for K in Ks])

    kern = GPy.kern.RBF(1, lengthscale=FITC.gamma2lengthscale(0.1)) \
           + GPy.kern.Matern32(1, lengthscale=3) \
           + GPy.kern.Matern52(1, lengthscale=5)
    # + GPy.kern.PeriodicExponential(1, lengthscale=5, period=4)
    Ky = kern.K(X, X)
    self.assertAlmostEqual(np.linalg.norm(Ky - Km[:, :]), 0, places=3)

    model = FITC()
    model.fit(Ks, y, optimize=True, fix_kernel=True)
    yp = model.predict([X])
    v1 = np.var(y.ravel())
    v2 = np.var((y - yp).ravel())
    self.assertTrue(v2 < v1)
def ramdom_kernels(kernel_indexs, samples, classes, rbf_par, poly_par):
    """Build a list of randomly chosen kernels (linear, RBF or polynomial) over the given column subsets."""
    kernels = []
    for indexes in kernel_indexs:
        choice = np.random.randint(3)
        if choice == 0:
            kernels.append(Kinterface(data=samples[:, indexes], kernel=linear_kernel))
        elif choice == 1:
            # Pick a random gamma from the provided grid
            gamma = rbf_par["gamma"][np.random.randint(len(rbf_par["gamma"]))]
            kernels.append(Kinterface(data=samples[:, indexes], kernel=rbf_kernel,
                                      kernel_args={"gamma": gamma}))
        else:
            # Pick a random degree from the provided grid
            degree = poly_par["degree"][np.random.randint(len(poly_par["degree"]))]
            kernels.append(Kinterface(data=samples[:, indexes], kernel=poly_kernel,
                                      kernel_args={"degree": degree}))
    return kernels
def createKernelCombination(kernel_indexs, samples, classes, rbf_par, poly_par, scorer):
    """Tune an SVM per feature subset, build the matching kernel, and return a convex kernel combination."""
    kernels = []
    for indexes in kernel_indexs:
        svm = tunning_svm(samples[:, indexes], classes, rbf_par, poly_par, scorer)
        kernel = svm.get_params()["kernel"]
        if kernel == "linear":
            kernels.append(Kinterface(data=samples[:, indexes], kernel=linear_kernel))
        elif kernel == "rbf":
            gamma = svm.get_params()["gamma"]
            kernels.append(Kinterface(data=samples[:, indexes], kernel=rbf_kernel,
                                      kernel_args={"gamma": gamma}))
        else:
            degree = svm.get_params()["degree"]
            coef0 = svm.get_params()["coef0"]  # retrieved separately; not passed to poly_kernel below
            kernels.append(Kinterface(data=samples[:, indexes], kernel=poly_kernel,
                                      kernel_args={"degree": degree}))

    # Learn convex kernel weights by centered alignment
    model = Alignf(typ="convex")
    model.fit(kernels, classes.values)
    mu = model.mu  # kernel weights (convex combination)
    print(mu)

    combined_k = lambda x, y: \
        sum([mu[i] * kernels[i](x[:, kernel_indexs[i]], y[:, kernel_indexs[i]])
             for i in range(len(kernels))])
    return combined_k
def test_bias(self):
    """ Assert least squares solution is valid at each step. """
    n = 100
    rank = 20
    delta = 5
    bias = 20
    X = np.linspace(-10, 10, n).reshape((n, 1))
    Ks = [Kinterface(data=X, kernel=exponential_kernel, kernel_args={"gamma": 0.6}),
          Kinterface(data=X, kernel=exponential_kernel, kernel_args={"gamma": 0.1}), ]
    Kt = 1.0 + Ks[0][:, :] + 0.0 * Ks[1][:, :]
    y = mvn.rvs(mean=np.zeros(n, ), cov=Kt).reshape((n, 1))
    y = y + bias

    model = KMP(rank=rank, delta=delta, lbd=0)
    model.fit(Ks, y)
    ypath = model.predict_path([X, X])

    for i in range(model.rank):
        yp = ypath[:, i] - model.bias
        yu = y.ravel() - model.bias
        assert np.linalg.norm(yp.T.dot(yu - yp)) < 1e-3
def ramdom_kernels_combination(kernel_indexs, samples, classes, rbf_par, poly_par, scorer):
    """Build randomly chosen kernels per feature subset and combine them with convex alignment weights."""
    kernels = []
    for indexes in kernel_indexs:
        choice = np.random.randint(3)
        if choice == 0:
            kernels.append(Kinterface(data=samples[:, indexes], kernel=linear_kernel))
        elif choice == 1:
            gamma = rbf_par["gamma"][np.random.randint(len(rbf_par["gamma"]))]
            kernels.append(Kinterface(data=samples[:, indexes], kernel=rbf_kernel,
                                      kernel_args={"gamma": gamma}))
        else:
            degree = poly_par["degree"][np.random.randint(len(poly_par["degree"]))]
            kernels.append(Kinterface(data=samples[:, indexes], kernel=poly_kernel,
                                      kernel_args={"degree": degree}))

    # Learn convex kernel weights by centered alignment
    model = Alignf(typ="convex")
    model.fit(kernels, classes.values)
    mu = model.mu  # kernel weights (convex combination)

    combined_k = lambda x, y: \
        sum([mu[i] * kernels[i](x[:, kernel_indexs[i]], y[:, kernel_indexs[i]])
             for i in range(len(kernels))])
    return combined_k
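# Usage sketch: the callable returned by ramdom_kernels_combination can be plugged into
# scikit-learn's SVC as a custom kernel (SVC accepts a callable k(X, Y) that returns the Gram
# matrix). The data, feature subsets and parameter grids below are illustrative assumptions,
# not part of the original code.
def _combined_kernel_svc_example():
    import pandas as pd
    from sklearn.svm import SVC

    x_train_demo = np.random.rand(60, 4)                       # synthetic stand-in data
    y_train_demo = pd.Series(np.random.randint(0, 2, size=60))

    subsets = [np.array([0, 1]), np.array([2, 3])]             # hypothetical feature subsets
    rbf_par = {"gamma": [0.01, 0.1, 1.0]}
    poly_par = {"degree": [2, 3]}

    combined_k = ramdom_kernels_combination(subsets, x_train_demo, y_train_demo,
                                            rbf_par, poly_par, scorer=None)
    clf = SVC(kernel=combined_k)
    clf.fit(x_train_demo, y_train_demo)
    return clf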
def testPredictionKernPrecomp(self):
    for t in range(self.trials):
        X = np.random.rand(self.n, self.m)
        Ks = [Kinterface(kernel=exponential_kernel, data=X, kernel_args={"gamma": 0.1}),
              Kinterface(kernel=exponential_kernel, data=X, kernel_args={"gamma": 0.2}), ]
        Ls = [K[:, :] for K in Ks]
        y = X[:, :3].sum(axis=1)
        y = y - y.mean()

        X_te = np.random.rand(10, self.m)
        Ls_te = [K(X_te, X) for K in Ks]

        for method in ["icd", "csi", "nystrom"]:
            print(method)
            # Kinterface model
            model0 = RidgeLowRank(method=method, lbd=0.01)
            model0.fit(Ks, y)
            y0 = model0.predict([X, X])
            yp0 = model0.predict([X_te, X_te])

            # Kernel matrix model (fit on precomputed matrices)
            model1 = RidgeLowRank(method=method, lbd=0.01)
            model1.fit(Ls, y)
            y1 = model1.predict(Xs=None, Ks=Ls)
            yp1 = model1.predict(Xs=None, Ks=Ls_te)

            self.assertAlmostEqual(np.linalg.norm(y0 - y1), 0, places=3)
            self.assertAlmostEqual(np.linalg.norm(yp0 - yp1), 0, places=3)
def testRowNorm(self):
    Kp = poly_kernel(self.X, self.X, degree=2)
    Kr = kernel_row_normalize(Kp)
    Ki = Kinterface(data=self.X, kernel=poly_kernel,
                    kernel_args={"degree": 2}, row_normalize=True)
    self.assertAlmostEqual(np.linalg.norm(Ki.diag().ravel() - np.ones((self.n, ))), 0, delta=3)
    self.assertAlmostEqual(np.linalg.norm(Ki(self.X, self.X) - Kr), 0, delta=3)
    self.assertAlmostEqual(np.linalg.norm(Ki[:, :] - Kr), 0, delta=3)
def create_kinterfce(kernel_list, type_k):
    """
    Creates a Kinterface of type_k for each kernel matrix.

    Parameters
    ----------
    kernel_list : list
        A list of all kernels (sparse matrices).
    type_k : callable
        Kernel function used to build each Kinterface.

    Returns
    -------
    train_array : list
        A list of all training kernels as dense arrays.
    kinterface_kernel : list of Kinterface
        Kernels wrapped as Kinterface objects.
    """
    kinterface_kernel = []
    train_array = []
    for ker in kernel_list:
        arr = ker.toarray()
        train_array.append(arr)
        k_arr = Kinterface(data=arr, kernel=type_k)
        kinterface_kernel.append(k_arr)
    return train_array, kinterface_kernel
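# Usage sketch for create_kinterfce, relying on the Kinterface and linear_kernel names already
# imported at module level above; the sparse input matrices are synthetic and purely illustrative.
def _create_kinterfce_example():
    from scipy.sparse import csr_matrix

    views = [csr_matrix(np.random.rand(50, 10)),    # two sparse "views" of the same 50 samples
             csr_matrix(np.random.rand(50, 20))]
    train_array, kinterface_kernel = create_kinterfce(views, linear_kernel)
    print(kinterface_kernel[0][:5, :5])             # 5x5 block of the first linear Gram matrix
    return train_array, kinterface_kernel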
def testPrediction(self):
    for t in range(self.trials):
        X = np.random.rand(self.n, self.m)
        tr = np.arange(self.n // 2).astype(int)               # necessarily int 1D array
        te = np.arange(self.n // 2, self.n).astype(int)
        Ks = [Kinterface(data=X, kernel=exponential_kernel, kernel_args={"gamma": g})
              for g in self.gamma_range]

        inxs = np.random.choice(tr.ravel(), size=self.n // 3)
        alpha = np.zeros((self.n, 1))
        alpha[inxs] = np.random.randn(len(inxs), 1)
        mu0 = np.random.randn(len(Ks), 1)
        K0 = sum([w * K[:, :] for K, w in zip(Ks, mu0)])
        y = K0.dot(alpha).ravel()
        y = y - y.mean()                                      # y necessarily 1D array
        y += np.random.randn(len(K0), 1).ravel() * 0.001

        for method in RidgeMKL.mkls.keys():
            model = RidgeMKL(method=method)
            model.fit(Ks, y, holdout=te)
            yp = model.predict(te)
            expl_var = (np.var(y[te]) - np.var(y[te] - yp)) / np.var(y[te])
            self.assertGreater(expl_var, 0.5)
def testCallOtherNorm(self):
    Ki = Kinterface(data=self.X, kernel=poly_kernel,
                    kernel_args={"degree": 2}, row_normalize=True)
    Kr = Ki(self.X, self.Y)
    self.assertTrue(np.all(Kr < 1))
def variousKernelVariousMethodsOneTCGA(tcga, X_te, y_tr, y_te, method, rank):
    K_exp = Kinterface(data=tcga, kernel=rbf_kernel, kernel_args={"sigma": 30})    # RBF kernel
    K_poly = Kinterface(data=tcga, kernel=poly_kernel, kernel_args={"degree": 3})  # polynomial kernel with degree=3
    K_lin = Kinterface(data=tcga, kernel=linear_kernel)                            # linear kernel

    model = RidgeLowRank(method=method, rank=rank, lbd=1)
    model.fit([K_exp, K_lin, K_poly], y_tr)
    yp = model.predict([X_te, X_te, X_te])  # the features passed to each kernel
    mse = mean_squared_error(y_te, yp)
    # rmse = np.var(y_tr - yp) ** 0.5
    print("Test MSE:", mse)
def testCallOther(self):
    Kp = poly_kernel(self.X, self.Y, degree=2)
    Ki = Kinterface(data=self.X, kernel=poly_kernel,
                    kernel_args={"degree": 2}, row_normalize=False)
    Kr = Ki(self.X, self.Y)
    self.assertAlmostEqual(np.linalg.norm(Kp - Kr), 0, delta=3)
def testCall(self):
    Kp = poly_kernel(self.X, self.X, degree=2)
    Ki = Kinterface(data=self.X, kernel=poly_kernel, kernel_args={"degree": 2})
    self.assertAlmostEqual(np.linalg.norm(Ki(self.X, self.X) - Kp), 0, delta=3)
def testFITCfit(self):
    n = self.n
    X = self.X
    noise = 1.0

    # Construct a combined kernel
    gamma_range = [0.1, 0.3, 1.0]
    Ks = [Kinterface(data=X, kernel=exponential_kernel, kernel_args={"gamma": gm})
          for gm in gamma_range]
    Km = Kinterface(data=X, kernel=kernel_sum,
                    kernel_args={"kernels": [exponential_kernel] * len(gamma_range),
                                 "kernels_args": [{"gamma": gm} for gm in gamma_range]})

    for seed in range(5):
        # Sample a function from a GP
        f = mvn.rvs(mean=np.zeros((n, )), cov=Km[:, :], random_state=seed)
        y = mvn.rvs(mean=f, cov=np.eye(n, n) * noise, random_state=seed)
        y = y.reshape((n, 1))

        # Fit a model
        model = FITC()
        model.fit(Ks, y, optimize=False, fix_kernel=False)

        # Compare kernels
        self.assertAlmostEqual(np.linalg.norm(model.kernel.K(X, X) - Km[:, :]), 0, places=3)

        # Predictions
        yp = model.predict([X])
        v1 = np.var(y.ravel())
        v2 = np.var((y - yp).ravel())
        self.assertTrue(v2 < v1)

        # Fixed model
        model_fix = FITC()
        model_fix.fit(Ks, y, optimize=False, fix_kernel=True)
        ypf = model_fix.predict([X])
        v3 = np.var((y - ypf).ravel())
        self.assertTrue(v3 < v1)
def variousKernel(tcga, sigmaKernel, degreeKernel, biasKernel, cKernel,
                  sigmaABSKernel, sigmaPerKernel, nuKernel):
    # Kernels
    K_exp = Kinterface(data=np.array(tcga), kernel=rbf_kernel,
                       kernel_args={"sigma": sigmaKernel})             # RBF kernel
    K_poly = Kinterface(data=np.array(tcga), kernel=poly_kernel,
                        kernel_args={"degree": degreeKernel})          # polynomial kernel
    K_lin = Kinterface(data=np.array(tcga), kernel=linear_kernel,
                       kernel_args={'b': biasKernel})                  # linear kernel with bias
    K_sig = Kinterface(data=np.array(tcga), kernel=sigmoid_kernel,
                       kernel_args={'c': cKernel})                     # sigmoid kernel
    K_expoAbs = Kinterface(data=np.array(tcga), kernel=exponential_absolute,
                           kernel_args={"sigma": sigmaABSKernel})      # exponential-absolute kernel
    K_perio = Kinterface(data=np.array(tcga), kernel=periodic_kernel,
                         kernel_args={"sigma": sigmaPerKernel})        # periodic kernel
    K_matern = Kinterface(data=np.array(tcga), kernel=matern_kernel,
                          kernel_args={"nu": nuKernel})                # Matern kernel
    return K_exp, K_poly, K_lin, K_sig, K_expoAbs, K_perio, K_matern
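# Usage sketch: the kernels returned by variousKernel are plain Kinterface objects, so any
# subset can be passed to the low-rank models used elsewhere in this module. The tcga matrix
# and targets below are synthetic stand-ins and the hyperparameter values are illustrative.
def _variousKernel_example():
    tcga_demo = np.random.rand(80, 200)
    y_tr_demo = np.random.rand(80)
    Ks = variousKernel(tcga_demo, sigmaKernel=30, degreeKernel=3, biasKernel=1,
                       cKernel=1, sigmaABSKernel=30, sigmaPerKernel=30, nuKernel=1.5)
    model = RidgeLowRank(method="icd", rank=10, lbd=1)
    model.fit(list(Ks[:3]), y_tr_demo)              # combine the RBF, polynomial and linear kernels
    return model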
def testKernGamma(self):
    for gamma in [0.1, 1.0, 2.0, 10.0]:
        k = GPy.kern.RBF(1, variance=1, lengthscale=FITC.gamma2lengthscale(gamma))
        K = k.K(self.X, self.X)
        Ki = Kinterface(data=self.X, kernel=exponential_kernel, kernel_args={"gamma": gamma})
        self.assertAlmostEqual(np.linalg.norm(K - Ki[:, :]), 0, places=3)
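# Note on the conversion this test relies on: the exponential kernel above is taken to be
# k(x, y) = exp(-gamma * ||x - y||^2), while GPy's RBF kernel is
# k(x, y) = exp(-||x - y||^2 / (2 * lengthscale^2)). Matching the two requires
# lengthscale = 1 / sqrt(2 * gamma). The sketch below spells out that conversion; it is an
# assumption about what FITC.gamma2lengthscale computes, not a copy of the library code.
def _gamma2lengthscale_sketch(gamma):
    # exp(-gamma * d^2) == exp(-d^2 / (2 * l^2))  =>  l = 1 / sqrt(2 * gamma)
    return 1.0 / np.sqrt(2.0 * gamma)

assert np.isclose(_gamma2lengthscale_sketch(0.5), 1.0)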
def setUp(self):
    X, y = generate_data(N=100, L=100, p=0.5, motif="TGTG", mean=0, var=3, seed=42)
    self.Xa = np.array(X)
    self.y = y
    self.Ks = [Kinterface(kernel=string_kernel, data=self.Xa, kernel_args={"mode": SPECTRUM}),
               Kinterface(kernel=string_kernel, data=self.Xa, kernel_args={"mode": SPECTRUM_MISMATCH})]
def NystromOneKernelOneTCGA(tcga, kernel, kernel_args, rank):
    K = Kinterface(data=np.array(tcga), kernel=kernel, kernel_args=kernel_args)
    model = Nystrom(rank=rank)
    model.fit(K)
    G_nyst = model.G
    print("G shape:", G_nyst.shape, "Error:", np.linalg.norm(K[:, :] - G_nyst.dot(G_nyst.T)))
    return model
def ridgeLowRankOneKernel(tcga, y_tr, kernel, kernel_args, rank, method):
    # K = Kinterface(data=X_tr, kernel=rbf_kernel, kernel_args={"sigma": 110})
    K = Kinterface(data=np.array(tcga), kernel=kernel, kernel_args=kernel_args)
    # for method in "nystrom", "icd":
    model = RidgeLowRank(method=method, rank=rank, lbd=1)
    model.fit([K], y_tr)
    # yp = model.predict([np.array(X_te)])
    # mse = mean_squared_error(y_te, yp)
    # rmse = np.var(y_te - yp) ** 0.5
    # print("Method:", method, "Test MSE:", mse)
    return model
def ICDoneKernelOneTCGA(tcga, kernel, kernel_args, rank):
    # K = Kinterface(data=np.array(cnv), kernel=rbf_kernel, kernel_args={"sigma": 110})
    K = Kinterface(data=np.array(tcga), kernel=kernel, kernel_args=kernel_args)
    model = ICD(rank=rank)
    model.fit(K)
    G_icd = model.G
    # inxs = model.active_set_
    print("G shape:", G_icd.shape, "Error:", np.linalg.norm(K[:, :] - G_icd.dot(G_icd.T)))
    return model
def setUp(self):
    self.n = 100
    self.m = 3
    self.gamma_range = np.logspace(-1, 1, 5)
    self.lbd_range = [0, 1, 100, 1000]
    self.X = np.random.rand(self.n, self.m)
    self.Ks = [Kinterface(data=self.X, kernel=exponential_kernel, kernel_args={"gamma": g})
               for g in self.gamma_range]
    self.trials = 5
def test_least_squares_sol(self):
    np.random.seed(1)
    n = 100
    rank = 20
    delta = 5
    X = np.linspace(-10, 10, n).reshape((n, 1))
    Ks = [Kinterface(data=X, kernel=exponential_kernel, kernel_args={"gamma": 0.6}),
          Kinterface(data=X, kernel=exponential_kernel, kernel_args={"gamma": 0.1}), ]
    Kt = 1.0 + Ks[0][:, :] + 0.0 * Ks[1][:, :]
    y = mvn.rvs(mean=np.zeros(n, ), cov=Kt).reshape((n, 1))
    y = y - y.mean()

    model = KMP(rank=rank, delta=delta, lbd=0)
    model.fit(Ks, y)
    yp = model.predict([X, X])
    assert np.linalg.norm(yp.T.dot(y.ravel() - yp)) < 1e-2
def testMklarenPredict(self):
    X_tr = self.Xa[:50]
    X_te = self.Xa[50:]
    y_tr = self.y[:50]
    y_te = self.y[50:]
    Ks = [Kinterface(kernel=string_kernel, data=X_tr, kernel_args={"mode": SPECTRUM}),
          Kinterface(kernel=string_kernel, data=X_tr, kernel_args={"mode": SPECTRUM_MISMATCH})]
    model = Mklaren(rank=10)
    model.fit(Ks, y_tr)
    yp = model.predict([X_te] * len(Ks))
    c, p = st.spearmanr(yp, y_te)
    self.assertGreater(c, 0)
    self.assertLess(p, 0.05)
def testCSIFit(self):
    Ks = [Kinterface(kernel=string_kernel, data=self.Xa, kernel_args={"mode": SPECTRUM})]
    model = RidgeLowRank(rank=5, method="csi", method_init_args={"delta": 5}, lbd=0.01)
    model.fit(Ks, self.y)
    yp = model.predict([self.Xa] * len(Ks))
    c, p = st.spearmanr(yp, self.y)
    self.assertGreater(c, 0)
    self.assertLess(p, 0.05)
def get_kernel_matrix(dframe, n_dim=15):
    r"""
    Return a kernel transformation matrix $\Theta$.

    It uses the kernel approximation offered by the MKlaren package
    (incomplete Cholesky decomposition of a linear-kernel Gram matrix).
    For the sake of completeness (and for my peace of mind), I use the best possible approximation.

    :param dframe: input data as a pandas DataFrame.
    :param n_dim: number of dimensions for the kernel matrix (default=15).
    :return: $\Theta$ matrix of shape (n_samples, n_dim)
    """
    ker = Kinterface(data=dframe.values, kernel=linear_kernel)
    model = ICD(rank=n_dim)
    model.fit(ker)
    g_nystrom = model.G  # low-rank factor from the ICD (variable name kept from the original)
    return g_nystrom
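# Brief usage sketch with a synthetic DataFrame standing in for the caller's real features.
def _get_kernel_matrix_example():
    import pandas as pd
    df = pd.DataFrame(np.random.rand(200, 40))
    theta = get_kernel_matrix(df, n_dim=15)
    print(theta.shape)                              # (200, 15): one n_dim-dimensional row per sample
    return theta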
def testKernelSum(self):
    Ki = Kinterface(data=self.X, kernel=kernel_sum,
                    kernel_args={"kernels": [poly_kernel, poly_kernel, poly_kernel],
                                 "kernels_args": [{"degree": 2}, {"degree": 3}, {"degree": 4}]},
                    row_normalize=False)
    Kc = poly_kernel(self.X, self.X, degree=2) + \
         poly_kernel(self.X, self.X, degree=3) + \
         poly_kernel(self.X, self.X, degree=4)
    self.assertAlmostEqual(np.linalg.norm(Ki[:, :] - Kc), 0, places=3)
def testPolySum(self):
    """ Test expected reconstruction properties of the ICD. Kernels are iteratively summed. """
    K = np.zeros((self.n, self.n))
    for d in range(1, 6):
        K += Kinterface(data=self.X, kernel=poly_kernel,
                        kernel_args={"degree": d}, row_normalize=True)[:, :]

    model = ICD(rank=self.n)
    model.fit(K)

    errors = np.zeros((self.n, ))
    for i in range(self.n):
        Ki = model.G[:, :i + 1].dot(model.G[:, :i + 1].T)
        errors[i] = np.linalg.norm(K - Ki)

    self.assertTrue(np.all(errors[:-1] > errors[1:]))
    self.assertAlmostEqual(errors[-1], 0, delta=3)
def process(dataset, outdir):
    """
    Run experiments with specified parameters.
    :param dataset: Dataset key.
    :param outdir: Output directory.
    :return:
    """
    # List available kernels
    K_range = range(1, 11)
    kargs = [{"mode": SPECTRUM, "K": kl} for kl in K_range]
    kernels = ",".join(set(map(lambda t: t["mode"], kargs)))

    # Fixed settings
    methods = ["Mklaren", "CSI", "Nystrom", "ICD"]
    rank_range = (rnk, )
    trueK = RNA_OPTIMAL_K.get(dataset, None)

    # Fixed output
    # Create output directory
    detname = os.path.join(outdir, "_%s" % dataset)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    if not os.path.exists(detname):
        os.makedirs(detname)
    fname = os.path.join(outdir, "%s.csv" % dataset)
    print("Writing to %s ..." % fname)

    # Output
    header = ["dataset", "n", "L", "kernels", "method", "rank", "iteration", "lambda",
              "pivots", "time", "evar_tr", "evar_va", "evar", "mse"]
    fp = open(fname, "w", buffering=1)  # line-buffered; unbuffered text mode is not allowed
    writer = csv.DictWriter(fp, fieldnames=header, quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writeheader()

    # Load data
    data = load_rna(dataset)
    X = data["data"]
    y = st.zscore(data["target"])
    n, L = len(X), len(X[0])

    # Load feature spaces
    Ys = [pickle.load(gzip.open(dataset2spectrum(dataset, K))) for K in K_range]

    # Generate random datasets and perform prediction
    seed = 0
    for cv in iterations:
        # Select random test/train indices
        np.random.seed(seed)
        inxs = np.arange(n, dtype=int)
        np.random.shuffle(inxs)
        tr = inxs[:n_tr]
        va = inxs[n_tr:n_tr + n_val]
        te = inxs[n_tr + n_val:]

        # Training / test split
        y_tr = y[tr]
        y_va = y[va]
        y_te = y[te]

        # Print after dataset generation
        dat = datetime.datetime.now()
        print("%s\tdataset=%s cv=%d (computing kernels...)" % (dat, dataset, cv))

        # For plotting
        X_te = X[te]
        Ks = [Kinterface(kernel=string_kernel, data=X[tr], kernel_args=arg) for arg in kargs]

        # Precomputed kernel matrices
        Ls_tr = [np.array(Y[tr, :].dot(Y[tr, :].T).todense()) for Y in Ys]
        Ls_va = [np.array(Y[va, :].dot(Y[tr, :].T).todense()) for Y in Ys]
        Ls_te = [np.array(Y[te, :].dot(Y[tr, :].T).todense()) for Y in Ys]
        Ls_tr_sum = [sum(Ls_tr)]
        Ls_va_sum = [sum(Ls_va)]
        Ls_te_sum = [sum(Ls_te)]

        # Modeling
        for rank in rank_range:
            dat = datetime.datetime.now()
            print("\t%s\tdataset=%s cv=%d rank=%d" % (dat, dataset, cv, rank))
            best_models = {"True": {"y": y_te, "color": "black", "fmt": "--", }}

            for method in methods:
                best_models[method] = {"color": meth2color[method], "fmt": "-"}
                best_evar = -np.inf
                for lbd in lbd_range:
                    t1 = time.time()
                    if method == "Mklaren":
                        mkl = Mklaren(rank=rank, lbd=lbd, delta=delta)
                        try:
                            mkl.fit(Ls_tr, y_tr)
                            yt = mkl.predict(Xs=None, Ks=Ls_tr)
                            yv = mkl.predict(Xs=None, Ks=Ls_va)
                            yp = mkl.predict(Xs=None, Ks=Ls_te)
                            pivots = ",".join(map(lambda pi: str(K_range[pi]),
                                                  mkl.G_mask.astype(int)))
                        except Exception as e:
                            print(e)
                            continue
                    else:
                        pivots = ""
                        if method == "CSI":
                            model = RidgeLowRank(rank=rank, method="csi",
                                                 method_init_args={"delta": delta}, lbd=lbd)
                        else:
                            model = RidgeLowRank(rank=rank, method=method.lower(), lbd=lbd)
                        try:
                            model.fit(Ls_tr_sum, y_tr)
                            yt = model.predict(Xs=None, Ks=Ls_tr_sum)
                            yv = model.predict(Xs=None, Ks=Ls_va_sum)
                            yp = model.predict(Xs=None, Ks=Ls_te_sum)
                        except Exception as e:
                            print(e)
                            continue
                    t2 = time.time() - t1

                    # Evaluate explained variance on the three sets
                    evar_tr = (np.var(y_tr) - np.var(yt - y_tr)) / np.var(y_tr)
                    evar_va = (np.var(y_va) - np.var(yv - y_va)) / np.var(y_va)
                    evar = (np.var(y_te) - np.var(yp - y_te)) / np.var(y_te)
                    mse = np.var(yp - y_te)

                    # Select best lambda to plot
                    if evar_va > best_evar:
                        best_evar = evar_va
                        best_yp = yp
                        best_models[method]["y"] = best_yp

                    # Write to output
                    row = {"L": L, "n": len(X), "method": method, "dataset": dataset,
                           "kernels": kernels, "rank": rank, "iteration": cv, "lambda": lbd,
                           "time": t2, "evar_tr": evar_tr, "evar_va": evar_va, "evar": evar,
                           "mse": mse, "pivots": pivots}
                    writer.writerow(row)

            seed += 1

            # Plot a function fit after selecting best lambda
            fname = os.path.join(detname,
                                 "%s.generic_plot_cv-%d_rank-%d.pdf" % (dataset, cv, rank))
            generic_function_plot(f_out=fname, Ks=Ks, X=X_te, models=best_models,
                                  xlabel="K-mer length", xnames=K_range,
                                  truePar=K_range.index(trueK) if trueK else None)
def process(dataset=RNA_DATASETS[0], repl=0):
    """ Process one iteration of a dataset. """
    dat = datetime.datetime.now()
    print("\t%s\tdataset=%s cv=%d rank=%d" % (dat, dataset, repl, rank))

    # Load data
    np.random.seed(repl)
    K_range = range(3, 8)
    data = load_rna(dataset)
    inxs = np.argsort(st.zscore(data["target"]))
    y = st.zscore(data["target"])[inxs]

    # Training/test; return a shuffled list
    sample = np.random.choice(inxs, size=len(inxs), replace=False)
    a, b = int(N * p_tr), int(N)
    tr, va, te = np.sort(sample[:a]), \
                 np.sort(sample[a:b]), \
                 np.sort(sample[b:])

    # Load feature spaces
    try:
        Ys = [pickle.load(gzip.open(dataset2spectrum(dataset, K))) for K in K_range]
    except IOError:
        return None

    # Training kernels
    Ks_tr = [Kinterface(data=Y[tr], kernel=linear_kernel, row_normalize=True) for Y in Ys]

    # Process
    results = dict()
    for m in formats.keys():
        model = LarsMKL(delta=delta, rank=rank, f=penalty[m])
        try:
            model.fit(Ks_tr, y[tr])
        except Exception as e:
            print("%s: %s" % (m, str(e)))
            continue
        y_va = y[va].reshape((len(va), 1))
        y_te = y[te].reshape((len(te), 1))
        ypath_va = model.predict_path_ls([Y[va] for Y in Ys])
        ypath_te = model.predict_path_ls([Y[te] for Y in Ys])
        scores_va = (np.var(y_va) - np.var(ypath_va - y_va, axis=0)) / np.var(y_va)
        scores_te = (np.var(y_te) - np.var(ypath_te - y_te, axis=0)) / np.var(y_te)
        t = np.argmax(scores_va)
        results[m] = np.round(scores_te[t], 3)

    # Compute ranking
    rows = list()
    scores = dict([(m, ev) for m, ev in results.items()])
    scale = np.array(sorted(scores.values(), reverse=True)).ravel()
    for m in results.keys():
        ranking = 1 + np.where(scale == scores[m])[0][0]
        row = {"dataset": dataset, "repl": repl, "method": m,
               "N_tr": len(tr), "N_va": len(va), "N_te": len(te),
               "evar": scores[m], "ranking": ranking}
        rows.append(row)
    return rows
def generate_data(n, rank,
                  inducing_mode="uniform", noise=1, gamma_range=(0.1, ), seed=None,
                  input_dim=1, signal_sampling="GP", data="mesh"):
    """
    Generate an artificial dataset with the given input dimension.

    :param n: Number of data points.
    :param rank: Number of inducing points.
    :param inducing_mode: Biased or uniform distribution of data points.
    :param noise: Noise variance.
    :param gamma_range: Number of kernels and hyperparameters.
    :param seed: Random seed.
    :param input_dim: Input space dimension.
    :param signal_sampling: 'GP' or 'weights'. Weights is more efficient.
    :param data: 'mesh' or 'random'.
    :return:
    """
    if seed is not None:
        np.random.seed(seed)

    # Generate data for arbitrary input_dim
    if data == "mesh":
        x = np.linspace(-10, 10, n).reshape((n, 1))
        M = np.meshgrid(*(input_dim * [x]))
        X = np.array(list(zip(*[m.ravel() for m in M])))
        N = X.shape[0]

        xp = np.linspace(-10, 10, 100).reshape((100, 1))
        Mp = np.meshgrid(*(input_dim * [xp]))
        Xp = np.array(list(zip(*[m.ravel() for m in Mp])))
    elif data == "random":
        # Ensure data is separated at proper lengthscales
        ls = SPGP.gamma2lengthscale(min(gamma_range)) / np.sqrt(input_dim)
        a, b = -n * ls / 2.0, n * ls / 2.0
        X = a + 2 * b * np.random.rand(n, input_dim)
        N = X.shape[0]
        Xp = np.random.rand(100, input_dim)
    else:
        raise ValueError("Unknown data mode: %s" % data)

    # Kernel sum
    Ksum = Kinterface(data=X, kernel=kernel_sum,
                      kernel_args={"kernels": [exponential_kernel] * len(gamma_range),
                                   "kernels_args": [{"gamma": g} for g in gamma_range]})

    # Sum of kernels
    Klist = [Kinterface(data=X, kernel=exponential_kernel, kernel_args={"gamma": g})
             for g in gamma_range]

    a = np.arange(X.shape[0], dtype=int)
    if inducing_mode == "uniform":
        p = None
    elif inducing_mode == "biased":
        af = np.sum(X + abs(X.min(axis=0)), axis=1)
        p = (af ** 2 / (af ** 2).sum())
    else:
        raise ValueError(inducing_mode)
    inxs = np.random.choice(a, p=p, size=rank, replace=False)

    if signal_sampling == "GP":
        Kny = Ksum[:, inxs].dot(np.linalg.inv(Ksum[inxs, inxs])).dot(Ksum[inxs, :])
        f = mvn.rvs(mean=np.zeros((N, )), cov=Kny)
        y = mvn.rvs(mean=f, cov=noise * np.eye(N, N))
    elif signal_sampling == "weights":
        L = Ksum[:, inxs].dot(scipy.linalg.sqrtm(np.linalg.inv(Ksum[inxs, inxs])))
        w = mvn.rvs(mean=np.zeros(rank, ), cov=np.eye(rank, rank)).ravel()
        f = L.dot(w)
        y = f + np.random.rand(N, 1).ravel() * noise  # note: N (not n) points when input_dim > 1
    else:
        raise ValueError(signal_sampling)

    return Ksum, Klist, inxs, X, Xp, y, f
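# Small usage sketch relying on the same module-level imports generate_data already uses
# (numpy, scipy, mklaren); the parameter values are illustrative only.
def _generate_data_example():
    Ksum, Klist, inxs, X, Xp, y, f = generate_data(n=100, rank=10,
                                                   inducing_mode="biased", noise=0.5,
                                                   gamma_range=(0.1, 1.0), seed=0,
                                                   input_dim=1, signal_sampling="weights",
                                                   data="mesh")
    print(X.shape, y.shape, inxs)                   # (100, 1), (100,), indices of 10 inducing points
    return X, y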