def caller(tx, ty, rx, ry):
    nums = [4, 8, 12, 16]
    for n in nums:
        print("PCA")
        print(n)
        compressor = PCA(n_components=n)
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        newrx = compressor.transform(rx)
        nnTable(newtx, ty, newrx, ry, alg="PCA")
    for n in nums:
        print("ICA")
        print(n)
        compressor = ICA(n_components=n)
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        newrx = compressor.transform(rx)
        nnTable(newtx, ty, newrx, ry, alg="ICA")
    for n in nums:
        print("RandProj")
        print(n)
        compressor = RandomProjection(n)
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        newrx = compressor.transform(rx)
        nnTable(newtx, ty, newrx, ry, alg="RandProj")  # was alg="PCA": copy-paste slip
    for n in nums:
        print("kbest")
        print(n)
        compressor = best(k=n)
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        newrx = compressor.transform(rx)
        nnTable(newtx, ty, newrx, ry, alg="KBest")  # was alg="PCA": copy-paste slip
def graphCallerNN(tx, ty, rx, ry):
    n = tx[1].size // 2  # integer number of components

    compressor = PCA(n_components=n)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    newtx = oneem(newtx, ty, newrx, ry)
    myNN(newtx, ty, newrx, ry, "EM-PCA")
    # nnTable(newtx, ty, newrx, ry, alg="EM-PCA")

    compressor = ICA(n_components=n)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    newtx = oneem(newtx, ty, newrx, ry)
    nnTable(newtx, ty, newrx, ry, alg="EM-ICA")
    myNN(newtx, ty, newrx, ry, "EM-ICA")  # was "EM-Ica"

    compressor = RandomProjection(n)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    newtx = oneem(newtx, ty, newrx, ry)
    nnTable(newtx, ty, newrx, ry, alg="EM-RP")
    myNN(newtx, ty, newrx, ry, "EM-RP")

    compressor = best(k=n)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    newtx = oneem(newtx, ty, newrx, ry)
    nnTable(newtx, ty, newrx, ry, alg="EM-KB")
    myNN(newtx, ty, newrx, ry, "EM-KB")
def PComponent_(train_Set, test_Set, var_Threshold=None, components=None):
    if var_Threshold is None and components is None:
        print("please give a threshold for PComponent - either var threshold or components")
        quit()
    if var_Threshold is not None and components is not None:
        print("give only one threshold")
        quit()
    if var_Threshold is not None:
        pca = PCA()
        pca.fit(train_Set)
        # variance ratio in percentage
        explain_Variance = around(pca.explained_variance_ratio_, decimals=4)
        explain_Variance = explain_Variance.tolist()
        explain_Variance = [x * 100 for x in explain_Variance]
        # cumulative variance
        temp = 0
        for x in range(len(explain_Variance)):
            explain_Variance[x] = temp + explain_Variance[x]
            temp = explain_Variance[x]
        explain_Variance = [x for x in explain_Variance if x < var_Threshold]
        n_components = len(explain_Variance)
        pca = PCA(n_components=n_components)
        return (pca.fit_transform(train_Set), pca.transform(test_Set))
    else:
        pca = PCA(n_components=components)
        return (pca.fit_transform(train_Set), pca.transform(test_Set))
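Hedged aside (not part of the snippet above): recent scikit-learn releases can perform the same cumulative-variance cutoff internally when n_components is a float in (0, 1); a minimal sketch on synthetic data:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(200, 20)
pca = PCA(n_components=0.90)  # keep enough PCs to explain ~90% of the variance
X_reduced = pca.fit_transform(X)
print("%d components, %.3f cumulative variance"
      % (pca.n_components_, pca.explained_variance_ratio_.cumsum()[-1]))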
class EnsembleModel:
    def __init__(self, models, **params):
        self.models = models.values()
        self.model_funcs = [j.model for j in models.values()]
        self.params = params
        self._pca = PCA(n_components=0.99)
        self._clf = None

    def fit(self, x, y):
        train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)
        pca_train_x = self._pca.fit_transform(train_x)
        pca_test_x = self._pca.transform(test_x)
        for model, model_func in zip(self.models, self.model_funcs):
            # pick the PCA-reduced or raw features per model; the original
            # rebound train_x here, silently feeding PCA data to every model
            # that followed a PCA model
            if model.json.get("use_pca", False):
                fit_x = pca_train_x
            else:
                fit_x = train_x
            model_func.fit(fit_x, train_y)
        self._fit_meta_estimator(test_x, test_y)
        return self

    def _fit_meta_estimator(self, x, y):
        predictions = self._predictions(x).T
        y = numpy.atleast_2d(y).T
        labels = numpy.argmin(abs(predictions - y * numpy.ones((1, predictions.shape[1]))), 1)
        self._clf = GaussianNB().fit(x, labels)

    def _predictions(self, x):
        pca_x = self._pca.transform(x)
        predictions = []
        weights = []
        for model, model_func in zip(self.models, self.model_funcs):
            if model.json.get("use_pca", False):
                test_x = pca_x
            else:
                test_x = x
            predictions.append(model_func.predict_proba(test_x)[:, 1])
            weights.append(model.best_params()["loss"])
        return numpy.array(predictions)

    def predict_proba(self, x):
        blend = self.params.get("blend", "mean")
        predictions = self._predictions(x)
        if blend == "median":
            return numpy.median(predictions, 0)
        if blend == "meta":
            probs = self._clf.predict_proba(x)
            preds = []
            for row, prob in zip(predictions.T, probs):
                if max(prob) > 0.99:
                    preds.append(row[numpy.argmax(prob)])
                else:
                    preds.append(numpy.median(row))
            return numpy.array(preds)
        return predictions.mean(0)
def PCA佮SVM模型(self, 問題, 答案):
    sample_weight_constant = np.ones(len(問題))
    clf = svm.SVC(C=1)
    pca = PCA(n_components=100)
    # clf = svm.NuSVC()
    print('Training PCA')
    pca.fit(問題)
    print('Training SVM')
    clf.fit(pca.transform(問題), 答案, sample_weight=sample_weight_constant)
    print('Training done')
    return lambda 問: clf.predict(pca.transform(問))
def dimensional(tx, ty, rx, ry, add=None):
    print "pca"
    for j in range(tx[1].size):
        i = j + 1
        print "===" + str(i)
        compressor = PCA(n_components=i)
        t0 = time()
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        runtime = time() - t0
        V = compressor.components_
        print runtime, V.shape, compressor.score(tx)
        distances = np.linalg.norm(tx - compressor.inverse_transform(newtx))
        print distances
    print "pca done"
    print "ica"
    for j in range(tx[1].size):
        i = j + 1
        print "===" + str(i)
        compressor = ICA(whiten=True)  # note: i is unused here; FastICA keeps all components
        t0 = time()
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        runtime = time() - t0
        print newtx.shape, runtime
        distances = np.linalg.norm(tx - compressor.inverse_transform(newtx))
        print distances
    print "ica done"
    print "RP"
    for j in range(tx[1].size):
        i = j + 1
        print "===" + str(i)
        compressor = RandomProjection(n_components=i)
        t0 = time()
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        runtime = time() - t0
        shape = newtx.shape
        print runtime, shape
    print "RP done"
    print "K-best"
    for j in range(tx[1].size):
        i = j + 1
        print "===" + str(i)
        compressor = best(add, k=i)
        t0 = time()
        compressor.fit(tx, y=ty.ravel())
        newtx = compressor.transform(tx)
        runtime = time() - t0
        shape = newtx.shape
        print runtime, shape
    print "K-best done"
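Hedged illustration (my addition, not from the snippet above): the distances printed in the PCA loop are reconstruction errors, which should fall monotonically as components are added. A self-contained check on random data:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(100, 8)
for k in range(1, 9):
    pca = PCA(n_components=k).fit(X)
    # reconstruction error shrinks to ~0 once all components are kept
    err = np.linalg.norm(X - pca.inverse_transform(pca.transform(X)))
    print("%d %.3f" % (k, err))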
def do_train_with_freq():
    tf_mix = TrainFiles(train_path=train_path_mix, labels_file=labels_file, test_size=0.)
    tf_freq = TrainFiles(train_path=train_path_freq, labels_file=labels_file, test_size=0.)

    X_m, Y_m, _, _ = tf_mix.prepare_inputs()
    X_f, Y_f, _, _ = tf_freq.prepare_inputs()
    X = np.c_[X_m, X_f]
    Y = Y_f
    X, Xt, Y, Yt = train_test_split(X, Y, test_size=0.1)

    sl = SKSupervisedLearning(SVC, X, Y, Xt, Yt)
    sl.fit_standard_scaler()

    pca = PCA(250)
    pca.fit(np.r_[sl.X_train_scaled, sl.X_test_scaled])
    X_pca = pca.transform(sl.X_train_scaled)
    X_pca_test = pca.transform(sl.X_test_scaled)

    #sl.train_params = {'C': 100, 'gamma': 0.0001, 'probability' : True}
    #print "Start SVM: ", time_now_str()
    #sl_ll_trn, sl_ll_tst = sl.fit_and_validate()
    #print "Finish Svm: ", time_now_str()

    ## construct a dataset for RBM
    #X_rbm = X[:, 257:]
    #Xt_rbm = X[:, 257:]
    #rng = np.random.RandomState(123)
    #rbm = RBM(X_rbm, n_visible=X_rbm.shape[1], n_hidden=X_rbm.shape[1]/4, numpy_rng=rng)
    #pretrain_lr = 0.1
    #k = 2
    #pretraining_epochs = 200
    #for epoch in xrange(pretraining_epochs):
    #    rbm.contrastive_divergence(lr=pretrain_lr, k=k)
    #    cost = rbm.get_reconstruction_cross_entropy()
    #    print >> sys.stderr, 'Training epoch %d, cost is ' % epoch, cost

    trndata, tstdata = createDataSets(X_pca, Y, X_pca_test, Yt)
    fnn = train(trndata, tstdata, epochs=1000, test_error=0.025,
                momentum=0.2, weight_decay=0.0001)
def pca(target, control, title, name_one, name_two):
    np_fps = []
    for fp in target + control:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    ys_fit = [1] * len(target) + [0] * len(control)
    names = ["PAINS", "Control"]
    pca = PCA(n_components=3)
    pca.fit(np_fps)
    np_fps_r = pca.transform(np_fps)
    p1 = figure(x_axis_label="PC1", y_axis_label="PC2", title=title)
    p1.scatter(np_fps_r[:len(target), 0], np_fps_r[:len(target), 1],
               color="blue", legend=name_one)
    p1.scatter(np_fps_r[len(target):, 0], np_fps_r[len(target):, 1],
               color="red", legend=name_two)
    p2 = figure(x_axis_label="PC2", y_axis_label="PC3", title=title)
    p2.scatter(np_fps_r[:len(target), 1], np_fps_r[:len(target), 2],
               color="blue", legend=name_one)
    p2.scatter(np_fps_r[len(target):, 1], np_fps_r[len(target):, 2],
               color="red", legend=name_two)
    return HBox(p1, p2)
class LogisticClassifier(object):
    def __init__(self, learning_rate=0.01, reg=0., momentum=0.5):
        self.classifier = LogisticRegression(learning_rate, reg, momentum)
        self.pca = None
        self.scaler = None

    def sgd_optimize(self, data, n_epochs, mini_batch_size):
        data = self._preprocess_data(data)
        sgd_optimization(data, self.classifier, n_epochs, mini_batch_size)

    def _preprocess_data(self, data):
        # center data and scale to unit std
        if self.scaler is None:
            self.scaler = StandardScaler()
            data = self.scaler.fit_transform(data)
        else:
            data = self.scaler.transform(data)
        if self.pca is None:
            # use Minka's MLE to guess an appropriate dimension
            self.pca = PCA(n_components='mle')
            data = self.pca.fit_transform(data)
        else:
            data = self.pca.transform(data)
        return data
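Hedged aside (my addition): n_components='mle' uses Minka's MLE to choose the dimensionality and requires n_samples >= n_features; a minimal sketch on low-rank synthetic data:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
# rank-10 signal embedded in 30 dimensions, plus a little noise
X = rng.randn(500, 10).dot(rng.randn(10, 30)) + 0.05 * rng.randn(500, 30)
pca = PCA(n_components='mle').fit(X)
print(pca.n_components_)  # typically close to the true latent rank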
class PCAImpl:
    def __init__(self, n_components=None, copy=True, whiten=False,
                 svd_solver='auto', tol=0.0, iterated_power='auto',
                 random_state=None):
        self._hyperparams = {
            'n_components': n_components,
            'copy': copy,
            'whiten': whiten,
            'svd_solver': svd_solver,
            'tol': tol,
            'iterated_power': iterated_power,
            'random_state': random_state
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
def cross_validate(self, train_x, train_y, test_x, test_y, **params):
    if not params:
        params = {"dummy": [0]}
    keys, values = list(zip(*list(params.items())))
    for param_list in itertools.product(*values):
        cv_params = list(self.params.items()) + list(zip(keys, param_list))
        for use_pca in (False, True):
            if self.have_tested(cv_params, use_pca):
                continue
            if use_pca:
                pca = PCA(n_components=0.99)
                proc_train_x = pca.fit_transform(train_x)
                proc_test_x = pca.transform(test_x)
            else:
                proc_train_x = train_x
                proc_test_x = test_x
            if "dummy" in params:
                model = self.func().fit(proc_train_x, train_y)
            else:
                model = self.func(**dict(cv_params)).fit(proc_train_x, train_y)
            predictions = model.predict_proba(proc_test_x)
            if len(predictions.shape) == 2:
                predictions = predictions[:, 1]
            num_right = (test_y == predictions.round()).sum()
            self.json["tests"].append({})
            test_data = self.json["tests"][-1]
            test_data["use_pca"] = use_pca
            test_data["pct_right"] = 100 * num_right / float(len(test_y))
            test_data["loss"] = log_loss(test_y, predictions)
            test_data["num_right"] = num_right
            test_data["num_tests"] = len(test_y)
            test_data["params"] = dict(cv_params)
            self._write()
            print((self.print_test(test_data)))
def write_predictions(self, model):
    if not os.path.exists(self.pred_dir):
        os.mkdir(self.pred_dir)
    raw_train_x, train_y = features_labels(self.season + 1)
    scaler = StandardScaler()
    train_x = scaler.fit_transform(raw_train_x)
    pca = PCA()
    if model.json.get("use_pca", False):
        train_x = pca.fit_transform(train_x)
    clf = model.func(**model.best_params()["params"]).fit(train_x, train_y)
    features, ids = self.get_features_and_ids()
    features = scaler.transform(features)
    if model.json.get("use_pca", False):
        features = pca.transform(features)
    predictions = clf.predict_proba(features)
    if len(predictions.shape) == 2:
        predictions = predictions[:, 1]
    with open(self.pred_path, 'w') as buff:
        buff.write("id,pred\n")
        for (label, pred) in zip(ids, predictions):
            buff.write("{:s},{:s}\n".format(label, str(pred)))
def pca_plot(fp_list, clusters):
    np_fps = []
    for fp in fp_list:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    pca = PCA(n_components=3)
    pca.fit(np_fps)
    np_fps_r = pca.transform(np_fps)
    p1 = figure(x_axis_label="PC1", y_axis_label="PC2", title="PCA clustering of PAINS")
    p2 = figure(x_axis_label="PC2", y_axis_label="PC3", title="PCA clustering of PAINS")
    color_vector = ["blue", "red", "green", "orange", "pink", "cyan",
                    "magenta", "brown", "purple"]
    print len(set(clusters))
    for clust_num in set(clusters):
        print clust_num
        local_cluster = []
        for i in xrange(len(clusters)):
            if clusters[i] == clust_num:
                local_cluster.append(np_fps_r[i])
        print len(local_cluster)
        local_cluster = numpy.array(local_cluster)
        # plot only this cluster's points; the original scattered every point
        # on each pass and never used local_cluster
        p1.scatter(local_cluster[:, 0], local_cluster[:, 1], color=color_vector[clust_num])
        p2.scatter(local_cluster[:, 1], local_cluster[:, 2], color=color_vector[clust_num])
    return HBox(p1, p2)
class PCACCLayer(Layer):
    def __init__(self, n_out):
        self.pca = PCA(n_components=n_out)

    def get_train_output_for(self, inputX):
        batches, n_in, rows, cols = inputX.shape
        # normalize
        # inputX = norm4d(inputX)
        # inputX, self.P1 = whiten4d(inputX)
        myUtils.visual.save_map(inputX[[10, 100, 1000]], dir_name, 'norm4d')
        inputX = inputX.transpose((0, 2, 3, 1)).reshape((-1, n_in))
        outputX = self.pca.fit_transform(inputX)
        outputX = outputX.reshape((batches, rows, cols, -1)).transpose((0, 3, 1, 2))
        myUtils.visual.save_map(outputX[[10, 100, 1000]], dir_name, 'pca')
        return outputX

    def get_test_output_for(self, inputX):
        batches, n_in, rows, cols = inputX.shape
        # normalize
        # inputX = norm4d(inputX)
        # inputX = whiten4d(inputX, self.P1)
        myUtils.visual.save_map(inputX[[10, 100, 1000]], dir_name, 'norm4dte')
        inputX = inputX.transpose((0, 2, 3, 1)).reshape((-1, n_in))
        outputX = self.pca.transform(inputX)
        outputX = outputX.reshape((batches, rows, cols, -1)).transpose((0, 3, 1, 2))
        myUtils.visual.save_map(outputX[[10, 100, 1000]], dir_name, 'pcate')
        return outputX
class PCADecomposition(AbstractPreProcessor):
    pca = None
    no_components = 2

    def fit(self, data, y=None):
        self.pca = PCA(n_components=self.no_components)
        self.pca.fit(data)

    def fit_transform(self, data, y=None):
        self.fit(data, y)
        return self.transform(data, y)

    def transform(self, data, y=None):
        data = self._check_input(data)
        output = self.pca.transform(data)
        output = self._check_output(data, output)
        return output

    def _check_output(self, data, output):
        if isinstance(data, pd.DataFrame):
            columns = ['Component ' + str(x + 1) for x in range(self.no_components)]
            output = pd.DataFrame(data=output, columns=columns, index=data.index)
        return output
def pca(tx, ty, rx, ry):
    compressor = PCA(n_components=tx[1].size / 2)
    compressor.fit(tx, y=ty)
    # eigenvalues = compressor.explained_variance_
    print "PCA"
    # for eigenvalue, eigenvector in zip(eigenvalues, compressor.components_):
    #     print(eigenvalue)
    # variance = compressor.explained_variance_ratio_  # calculate variance ratios
    # var = np.cumsum(np.round(compressor.explained_variance_ratio_, decimals=3) * 100)
    # print var
    # print compressor.explained_variance_
    # print compressor.explained_variance_ratio_
    print compressor.explained_variance_ratio_.cumsum()
    print compressor.singular_values_
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    # em(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    # km(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    # var = np.cumsum(np.round(compressor.explained_variance_ratio_, decimals=3) * 100)
    # print var
    # plt.ylabel('% Variance Explained')
    # plt.xlabel('# of Features')
    # plt.title('PCA Analysis')
    # plt.ylim(30, 100.5)
    # plt.style.context('seaborn-whitegrid')
    # plt.plot(var)
    # plt.savefig('PCA.png')
    # plt.show()
    nn(newtx, ty, newrx, ry, add="wPCA")
def run(ARGS, data=None, model=None, is_test=False):
    data = data or get_regression_data(ARGS.dataset, split=ARGS.split)
    model = model or get_regression_model(ARGS.model)(is_test=is_test, seed=ARGS.seed)
    model.fit(data.X_train, data.Y_train)

    res = {}
    samples = model.sample(data.X_test, ARGS.num_samples)
    data_tiled = np.tile(data.X_test[None, :, :], [ARGS.num_samples, 1, 1])
    shape = [ARGS.num_samples * data.X_test.shape[0],
             data.X_test.shape[1] + data.Y_test.shape[1]]
    A = np.reshape(np.concatenate([data_tiled, samples], -1), shape)
    B = np.concatenate([data.X_test, data.Y_test], -1)

    if ARGS.pca_dim > 0:
        AB = np.concatenate([A, B], 0)
        pca = PCA(n_components=ARGS.pca_dim).fit(AB)
        A = pca.transform(A)
        B = pca.transform(B)

    # import matplotlib.pyplot as plt
    # plt.scatter(A[:, 0], A[:, 1], color='b')
    # plt.scatter(B[:, 0], B[:, 1], color='r')
    # plt.show()

    kernel = gpflow.kernels.RBF(A.shape[-1])
    res['mmd'] = mmd(A, B, kernel)
    print(res)
    res.update(ARGS.__dict__)
    if not is_test:  # pragma: no cover
        with Database(ARGS.database_path) as db:
            db.write('mmd', res)
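Hedged sketch (my addition; the mmd helper above is project-specific): a biased MMD^2 estimate with an RBF kernel in plain numpy, for intuition about what the statistic measures.

import numpy as np

def rbf_mmd2(A, B, gamma=1.0):
    """Biased MMD^2 estimate between samples A and B under an RBF kernel."""
    def k(X, Y):
        sq = (X ** 2).sum(1)[:, None] + (Y ** 2).sum(1)[None, :] - 2 * X.dot(Y.T)
        return np.exp(-gamma * sq)
    return k(A, A).mean() + k(B, B).mean() - 2 * k(A, B).mean()

rng = np.random.RandomState(0)
# near zero for same-distribution samples, noticeably positive for shifted ones
print(rbf_mmd2(rng.randn(100, 2), rng.randn(100, 2) + 1.0))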
def pca(tx, ty, rx, ry):
    compressor = PCA(n_components=tx[1].size / 2)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    km(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    nn(newtx, ty, newrx, ry, add="wPCAr")
def pca(tx, ty, rx, ry):
    compressor = PCA(n_components=tx[1].size / 2)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    km(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    nn(newtx, ty, newrx, ry, add="wPCAtr")
def pca(tx, ty, rx, ry):
    print "pca"
    compressor = PCA(n_components=tx[1].size / 2)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wPCAtr")
    km(newtx, ty, newrx, ry, add="wPCAtr")
    nn(newtx, ty, newrx, ry, add="wPCAtr")
    print "pca done"
def pca(X, y, components, max_cluster, num_classes, run_nn=False):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, train_size=0.7, shuffle=True)
    pca_compress = PCA(n_components=components, whiten=True)
    pca_compress.fit(X_train, y=y_train)
    X_train_new = pca_compress.transform(X_train)
    X_test_new = pca_compress.transform(X_test)
    X_original = pca_compress.inverse_transform(X_test_new)
    loss = ((X_test - X_original) ** 2).mean()
    print("Reconstruction Error " + str(loss))
    eigenvalues = pca_compress.explained_variance_
    print(eigenvalues)
    if run_nn:
        mlp_classifier(X_train_new, y_train, 0.3, plot=True,
                       X_test=X_test_new, y_test=y_test)
    X_new = np.concatenate((X_train_new, X_test_new), axis=0)
    y = np.concatenate((y_train, y_test), axis=0)
    kmeans(X_new, y, max_cluster, num_classes, run_nn=run_nn,
           plot_cluster=True, reduction_algo='PCA')
    expectation_max(X_new, y, max_cluster, num_classes, run_nn=run_nn,
                    plot_cluster=True, reduction_algo='PCA')
def pca(tx, ty, rx, ry, dataset):
    ncomponents = tx[1].size // 2  # integer division so range() below works
    compressor = PCA(n_components=ncomponents)
    xarr = []
    for i in range(0, ncomponents):
        xarr.append(i + 1)
    compressor.fit(tx, y=ty)
    arr = compressor.explained_variance_
    plt.figure()
    plt.title('Phishing PCA Explained Variance')
    plt.rc('legend', **{'fontsize': 10})
    plt.plot(xarr, arr, '-', label='explained variance')
    plt.legend()
    plt.ylabel('explained variance')
    plt.xlabel('number of components')
    plt.savefig("phishingPCAVar" + dataset + ".png")

    compressor = PCA(n_components=tx[1].size // 2)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wPCAtr", times=21, dataset=dataset, alg="PCA")
    em(newtx, ty, newrx, ry, PCA(n_components=2).fit_transform(tx),
       add="wPCAtr", times=9, dataset=dataset, alg="PCA")
    nn(newtx, ty, newrx, ry, add="wPCAtr")
    km(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    myNN(newtx, ty, newrx, ry, "PCA")
    km(newtx, ty, newrx, ry, [], add="", times=4, dataset=dataset, alg="PCA")
    reduced_data = PCA(n_components=2).fit_transform(tx)
    em(tx, ty, rx, ry, reduced_data, add="", times=4, dataset=dataset, alg="PCA")

    pca = PCA(n_components=2)
    pca.fit(tx)
    result = pd.DataFrame(pca.transform(tx), columns=['PCA%i' % i for i in range(2)])
    my_color = pd.Series(ty).astype('category').cat.codes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(result['PCA0'], result['PCA1'], c=my_color, cmap="Dark2_r", s=60)
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_title("PCA on the phishing data set")
    plt.show()
def pca(data, whiten_bool, components):
    # Set PCA parameters
    pca = PCA(n_components=components, whiten=whiten_bool, svd_solver="full")
    # Fit PCA to data
    pca.fit(data)
    np.set_printoptions(suppress=True)
    print("PCA Components Explained Variance Ratio: " +
          str(np.around(pca.explained_variance_ratio_ * 100, 2)))
    # Calculate loading matrix
    loadings_matrix = (pca.components_.T * np.sqrt(pca.explained_variance_)).T
    # Transform data
    data_transformed = pca.transform(data)
    return data_transformed
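Hedged aside (my addition): for standardized inputs, each entry of a loading matrix like the one above approximates the correlation between an original feature and a principal component, which is why loadings scale eigenvectors by sqrt(eigenvalue). A quick check:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = StandardScaler().fit_transform(rng.randn(300, 4).dot(rng.randn(4, 4)))
pca = PCA().fit(X)
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
print(np.round(loadings[:, 0], 2))  # feature-vs-PC1 loadings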
def apply_pca(X_train, X_test, pca_thresh):
    """ apply principal component analysis to reduce dimensionality of
    feature vectors """
    feature_labels = X_train.columns
    pca = PCA(n_components=pca_thresh)
    shape_orig = X_train.shape
    X_train = pca.fit_transform(X_train)
    shape_reduced = X_train.shape
    X_test = pca.transform(X_test)
    logging.info("reduced dimensionality from {} to {}".format(shape_orig, shape_reduced))
    rows = ["PC-{}".format(i) for i in range(len(pca.components_))]
    pcs = pd.DataFrame(pca.components_, columns=feature_labels, index=rows)
    return X_train, X_test, pcs
def find_distributions(query_areas, query_energies, binned_image, n_classes=2,
                       bimages_path='/home/fin/Documents/Timepix/particle-tk/datagen/images.pkl',
                       segments_path='/home/fin/Documents/Timepix/particle-tk/datagen/segments.pkl'):
    # Load binned images and segments
    b_im = pkl.load(open(bimages_path, 'rb'))
    segments = pkl.load(open(segments_path, 'rb'))

    reductor = PCA(n_components=3)
    b_im = reductor.fit_transform(b_im)
    queried_binned_image = reductor.transform(binned_image.reshape(1, -1))

    areas = [[] for i in range(0, n_classes)]
    pixel_energies = [[] for i in range(0, n_classes)]
    binned_images = [[] for i in range(0, n_classes)]
    binned_images_energies = [[] for i in range(0, n_classes)]
    for segment in segments:
        for lbl in range(1, n_classes + 1):
            if segment.get_metadata('label') == lbl:
                areas[lbl - 1].append(area(segment.get_bitmap()))
                nonzeroE = segment.get_bitmap().flatten()[segment.get_bitmap().flatten() > 0]
                for e in nonzeroE:
                    pixel_energies[lbl - 1].append(e)
                    binned_images_energies[lbl - 1].append(
                        b_im[segment.get_metadata('parent_im_id')])
                binned_images[lbl - 1].append(b_im[segment.get_metadata('parent_im_id')])
                break

    # Estimation of size density given image
    sizes = list()  # for each particle type one array of sizes
    sizes.append(np.linspace(0, 20, 100))
    sizes.append(np.linspace(0, 10, 100))
    energies = list()
    energies.append(np.linspace(0, 400, 100))
    energies.append(np.linspace(0, 400, 100))

    p_SgX = list()
    p_EgX = list()
    for lbl in range(1, n_classes + 1):
        print(areas[lbl - 1])
        estimator_P_SgX = estimate_P_SgX(areas[lbl - 1], binned_images[lbl - 1])
        estimator_P_EgX = estimate_P_SgX(pixel_energies[lbl - 1],
                                         binned_images_energies[lbl - 1])
        p_SgX.append(estimator_P_SgX.score_samples(
            query_areas[lbl - 1, :],
            np.repeat(np.atleast_2d(queried_binned_image),
                      query_areas[lbl - 1, :].shape[0], axis=0)))
        p_EgX.append(estimator_P_EgX.score_samples(
            query_energies[lbl - 1, :],
            np.repeat(np.atleast_2d(queried_binned_image),
                      query_energies[lbl - 1, :].shape[0], axis=0)))
    return np.array(p_SgX), np.array(p_EgX)
def train_pca(pains_fps, num_components=3):
    '''
    Dimensionality reduction of fingerprint bit vectors to principal components
    :param pains_fps: RDKit fingerprint bit vectors
    :return: PCA-reduced fingerprint vectors
    '''
    np_fps = []
    for fp in pains_fps:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    pca = PCA(n_components=num_components)
    pca.fit(np_fps)
    fps_reduced = pca.transform(np_fps)
    return fps_reduced
def reduction(data, params):
    # parse parameters
    # (note: exec-into-locals only binds names on Python 2; a dict-based
    # alternative is sketched after the second reduction() variant below)
    for item in params:
        if isinstance(params[item], str):
            exec(item + '=' + '"' + params[item] + '"')
        else:
            exec(item + '=' + str(params[item]))
    # apply PCA
    pca = PCA(n_components=n_components)
    pca.fit(data)
    X = pca.transform(data)
    return X
def airline_pca():
    X = np.array(pca_data)
    pca = PCA(n_components=3)
    pca.fit(X)
    # note: the PCA is fit on raw X but applied to normalize(X); fitting and
    # transforming the same representation is usually what is intended
    Y = pca.transform(normalize(X))
    fig = plt.figure(1, figsize=(8, 6))
    ax = Axes3D(fig, elev=-150, azim=110)
    colordict = {carrier: i for i, carrier in enumerate(major_carriers)}
    pointcolors = [colordict[carrier] for carrier in target_carrier]
    ax.scatter(Y[:, 0], Y[:, 1], Y[:, 2], c=pointcolors)
    ax.set_title("First three PCA directions")
    ax.set_xlabel("1st eigenvector")
    ax.w_xaxis.set_ticklabels([])
    ax.set_ylabel("2nd eigenvector")
    ax.w_yaxis.set_ticklabels([])
    ax.set_zlabel("3rd eigenvector")
    ax.w_zaxis.set_ticklabels([])
def pca_no_labels(target, title="PCA clustering of PAINS", color="blue"):
    np_fps = []
    for fp in target:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    pca = PCA(n_components=3)
    pca.fit(np_fps)
    np_fps_r = pca.transform(np_fps)
    p3 = figure(x_axis_label="PC1", y_axis_label="PC2", title=title)
    p3.scatter(np_fps_r[:, 0], np_fps_r[:, 1], color=color)
    p4 = figure(x_axis_label="PC2", y_axis_label="PC3", title=title)
    p4.scatter(np_fps_r[:, 1], np_fps_r[:, 2], color=color)
    return HBox(p3, p4)
def plot_embeddings(embeddings, labels, protecteds, plot3d=False,
                    subsample=False, label_names=None, protected_names=None):
    if protected_names is None:
        protected_names = ["A0", "A1"]
    if label_names is None:
        label_names = ["L0", "L1"]
    n = embeddings.shape[0]
    if not subsample:
        subsample = n
    inds = np.random.permutation(n)[:subsample]
    pca = PCA(n_components=3 if plot3d else 2)
    labels = labels.astype(bool)[inds]
    protecteds = protecteds.astype(bool)[inds]
    pca.fit(embeddings)
    embs = pca.transform(embeddings)[inds, :]
    fig = plt.figure()
    if plot3d:
        ax = fig.add_subplot(111, projection='3d')
    else:
        ax = fig.add_subplot(111)
    for l in [False, True]:  # labels
        for p in [False, True]:  # protecteds
            idxs = np.logical_and(labels == l, protecteds == p)
            embs_slice = embs[idxs, :]
            data_vectors = [embs_slice[:, 0], embs_slice[:, 1]]
            if plot3d:
                data_vectors.append(embs_slice[:, 2])
            color = "b" if p else "r"
            marker = "o" if l else "x"
            name = "{} {}".format(protected_names[p], label_names[l])
            ax.scatter(*data_vectors,
                       edgecolors=color,
                       marker=marker,
                       facecolors=[color, 'none'][l],  # only leave circles unfilled
                       label=name)
    ax.legend(fontsize="small")
    plt.show()
class Classifier_pca(Layer):
    def __init__(self, C, n_times):
        self.C = C
        self.n_times = n_times
        self.pca = PCA(n_components=0.99)

    def get_train_output_for(self, inputX, inputy=None):
        inputX = self.pca.fit_transform(inputX)
        n_hidden = int(self.n_times * inputX.shape[1])
        self.W = init.GlorotNormal().sample((inputX.shape[1], n_hidden))
        self.b = init.Normal().sample(n_hidden)
        H = dotbiasact_decomp(inputX, self.W, self.b)
        self.beta = compute_beta(H, inputy, self.C)
        out = dot_decomp(H, self.beta)
        return out

    def get_test_output_for(self, inputX):
        inputX = self.pca.transform(inputX)
        H = dotbiasact_decomp(inputX, self.W, self.b)
        out = dot_decomp(H, self.beta)
        return out
def reduction(data, params):
    # parse parameters
    possible_keys = ['n_components']  # was ['components'], which nothing below uses
    for item in params:
        if item not in possible_keys:
            ERROR(item)
        if isinstance(params[item], str):
            exec(item + '=' + '"' + params[item] + '"')
        else:
            exec(item + '=' + str(params[item]))
    # apply PCA
    pca = PCA(n_components=n_components)
    pca.fit(data)
    X = pca.transform(data)
    return X
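Hedged alternative (my sketch): the exec() parameter binding in both reduction() variants above only creates local variables on Python 2; a version that also works on Python 3 reads the parameter from the dict directly.

import numpy as np
from sklearn.decomposition import PCA

def reduction_py3(data, params):
    n_components = params.get('n_components', 2)  # the default here is an assumption
    pca = PCA(n_components=n_components)
    return pca.fit_transform(data)

print(reduction_py3(np.random.RandomState(0).randn(50, 6), {'n_components': 3}).shape)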
class Model(BaseModel):
    '''
    classdocs
    '''

    def __init__(self, n_components):
        '''
        Constructor
        '''
        self.model = PCA(n_components=n_components)  # was hard-coded to 5, ignoring the argument
        self.model_name = 'pca'

    def fit(self, X):
        '''
        Performs a principal component analysis.
        '''
        self.model.fit(X)
        variance = self.model.explained_variance_ratio_
        print variance

    def transform(self, X):
        '''
        Transforms the given data with the eigenvectors found in the
        principal component analysis.
        '''
        return self.model.transform(X)

    def save(self, filepath):
        '''
        Persists the trained model to a file.
        '''
        joblib.dump(self.model,
                    create_filename(filepath, '%s.pkl' % self.model_name))

    def load(self, filepath):
        '''
        Loads an already trained model from a file to perform predictions.
        '''
        self.model = joblib.load(
            create_filename(filepath, '%s.pkl' % self.model_name))
class SpaceMapper:
    def __init__(self):
        self.npc = 10
        self.scaler = None
        self.pca_transformer = None
        self.lda_transformer = None
        self.nmf_transformer = None
        self.traindata = None
        self.testdata = None
        self.valdata = None
        self.trainlabels = None
        self.trainaudiolabels = None
        self.vallabels = None
        self.valaudiolabels = None
        self.testlabels = None
        self.testaudiolabels = None
        self.pred_frame_labels = None
        self.pred_recording_labels = None
        self.recording_labels = None
        self.max_frame_accuracy = -1
        self.max_recording_accuracy = -1
        self.pca_traindata = None
        self.lda_traindata = None
        self.nmf_traindata = None
        self.pca_testdata = None
        self.lda_testdata = None
        self.nmf_testdata = None
        self.pca_valdata = None
        self.lda_valdata = None
        self.nmf_valdata = None

    def learn_train_space(self, npc=None, traindata=None, trainlabels=None):
        """ learn PCA, LDA, NMF space and transform train data """
        # initialize train data
        if traindata is not None:
            self.traindata = traindata
        if trainlabels is not None:
            self.trainlabels = trainlabels
        if npc is not None:
            self.npc = npc
        # Added a function to remove strings from data_Group_4
        temp = []
        for array in self.traindata:
            new_arr = []
            for item in array:
                if not isinstance(item, basestring):
                    new_arr.append(item)
            temp.append(numpy.array(new_arr))
        self.traindata = numpy.array(temp)
        # learn space and transform train data
        random.seed(254678)
        # standardize (samples)
        self.traindata = scale(self.traindata, axis=1)
        # then pca
        print "training with PCA transform..."
        self.pca_transformer = PCA(n_components=npc).fit(self.traindata)
        self.pca_traindata = self.pca_transformer.transform(self.traindata)
        # then lda
        print "training with LDA transform..."
        self.lda_transformer = LDA(n_components=npc).fit(self.traindata, self.trainlabels)
        self.lda_traindata = self.lda_transformer.transform(self.traindata)
        # then nmf
        print "training with NMF transform..."
        self.nmf_transformer = NMF(n_components=npc).fit(self.traindata - numpy.min(self.traindata))
        self.nmf_traindata = self.nmf_transformer.transform(self.traindata - numpy.min(self.traindata))

    def map_val_space(self, valdata=None, vallabels=None):
        """ map validation space """
        # initialize validation data
        if valdata is not None:
            self.valdata = valdata
        if vallabels is not None:
            self.vallabels = vallabels
        # Added a function to remove strings from data_Group_4
        temp = []
        for array in self.valdata:
            new_arr = []
            for item in array:
                if not isinstance(item, basestring):
                    new_arr.append(item)
            temp.append(numpy.array(new_arr))
        self.valdata = numpy.array(temp)
        # transform validation data
        random.seed(3759137)
        self.valdata = scale(self.valdata, axis=1)
        print "transform val data..."
        self.pca_valdata = self.pca_transformer.transform(self.valdata)
        self.lda_valdata = self.lda_transformer.transform(self.valdata)
        self.nmf_valdata = self.nmf_transformer.transform(self.valdata - numpy.min(self.valdata))

    def map_test_space(self, testdata=None, testlabels=None):
        """ map test space """
        # initialize test data
        if testdata is not None:
            self.testdata = testdata
        if testlabels is not None:
            self.testlabels = testlabels
        # Added a function to remove strings from data_Group_4
        temp = []
        for array in self.testdata:
            new_arr = []
            for item in array:
                if not isinstance(item, basestring):
                    new_arr.append(item)
            temp.append(numpy.array(new_arr))
        self.testdata = numpy.array(temp)
        # transform test data
        random.seed(3759137)
        self.testdata = scale(self.testdata, axis=1)
        print "transform test data..."
        self.pca_testdata = self.pca_transformer.transform(self.testdata)
        self.lda_testdata = self.lda_transformer.transform(self.testdata)
        self.nmf_testdata = self.nmf_transformer.transform(self.testdata - numpy.min(self.testdata))

    def classify_frames_recordings(self, model, transform_name="", model_name=""):
        """ predictions per frame and per recording """
        # classification accuracy per frame
        if transform_name == "":
            acc_frame, preds_frame = self.classify(model, self.traindata, self.trainlabels,
                                                   self.testdata, self.testlabels)
        elif transform_name == "PCA":
            acc_frame, preds_frame = self.classify(model, self.pca_traindata, self.trainlabels,
                                                   self.pca_testdata, self.testlabels)
        elif transform_name == "LDA":
            acc_frame, preds_frame = self.classify(model, self.lda_traindata, self.trainlabels,
                                                   self.lda_testdata, self.testlabels)
        elif transform_name == "NMF":
            acc_frame, preds_frame = self.classify(model, self.nmf_traindata, self.trainlabels,
                                                   self.nmf_testdata, self.testlabels)
        # classification accuracy per recording by a vote count
        acc_vote, preds_vote = self.vote_count(preds_frame)
        print model_name + " " + transform_name + " " + str(acc_frame) + " " + str(acc_vote)
        # update highest accuracy and predictions per frame and per recording
        if acc_vote > self.max_recording_accuracy:
            self.pred_frame_labels = preds_frame
            self.pred_recording_labels = preds_vote
            self.max_frame_accuracy = acc_frame
            self.max_recording_accuracy = acc_vote

    def evaluate_space(self, traindata=None, trainlabels=None, testdata=None,
                       testlabels=None, audiolabels=None):
        """ evaluate space by classification """
        # Fixed spelling mistake for function name_Group_4
        # initialize data (features, labels, audiolabels)
        if self.traindata is None:
            self.learn_train_space(traindata=traindata, trainlabels=trainlabels)
        if self.testdata is None:
            self.map_test_space(testdata=testdata, testlabels=testlabels)
        if self.testaudiolabels is None:
            self.testaudiolabels = audiolabels
        # initialize classifiers
        modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
        modelLDA = LDA()
        modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
        transforms = ["", "PCA", "LDA", "NMF"]
        # predict labels per frame and per recording
        print "classify with KNN..."
        for transform in transforms:
            self.classify_frames_recordings(modelKNN, transform_name=transform, model_name="KNN")
        print "classify with LDA..."
        for transform in transforms:
            self.classify_frames_recordings(modelLDA, transform_name=transform, model_name="LDA")
        print "classify with SVM..."
        for transform in transforms:
            self.classify_frames_recordings(modelSVM, transform_name=transform, model_name="SVM")

    def classify(self, model, traindata, trainlabels, testdata, testlabels):
        """ train classifier and return predictions and accuracy on test set """
        model.fit(traindata, trainlabels)
        predlabels = model.predict(testdata)
        accuracy = metrics.accuracy_score(testlabels, predlabels)
        return accuracy, predlabels

    def vote_count(self, preds_frame):
        """ return predictions per recording by a vote count of predictions per frame """
        # initialize
        uniq_audiolabels = numpy.unique(self.testaudiolabels)
        preds_vote = []
        true_labels = []
        # get prediction vote count and true label for each recording
        for audio_label in uniq_audiolabels:
            inds = numpy.where(self.testaudiolabels == audio_label)[0]
            preds, counts = numpy.unique(preds_frame[inds], return_counts=True)
            preds_vote.append(preds[numpy.argmax(counts)])
            true_labels.append(numpy.unique(self.testlabels[inds]))
        # return accuracy and predictions per recording
        preds_vote = numpy.array(preds_vote)
        true_labels = numpy.array(true_labels)
        accuracy = metrics.accuracy_score(true_labels, preds_vote)
        self.recording_labels = true_labels  # todo: this should only be assigned once
        return accuracy, preds_vote
    ax.set_title('%s (%s)' % (name, 'correlation'))
    pos += 1

plt.savefig(wd + '/reports/Figure5_dendrograms_signif_protein.pdf', bbox_inches='tight')
plt.close('all')

# ---- Figure 6
(f, m_plot), pos = plt.subplots(3, 2, sharex=False, sharey=False, figsize=(12, 22)), 0
for name, dataset in datasets_quant.items():
    plot_df = dataset.loc[:, ['FED' in i.upper() for i in dataset.columns]].T

    n_components = 3
    pca_o = PCA(n_components=n_components).fit(plot_df)
    pcs = pca_o.transform(plot_df)
    explained_var = ['%.2f' % (pca_o.explained_variance_ratio_[i] * 100)
                     for i in range(n_components)]

    # Plot 1
    ax = m_plot[pos][0]
    x_pc, y_pc = 0, 1
    ax.scatter(pcs[:, x_pc], pcs[:, y_pc], s=90, c=datasets_colour[name], linewidths=0)
    ax.set_xlabel('PC 1 (%s%%)' % explained_var[x_pc])
    ax.set_ylabel('PC 2 (%s%%)' % explained_var[y_pc])
    ax.set_title(name)
    sns.despine(ax=ax)
    for i, txt in enumerate(plot_df.index):
        ax.annotate(txt, (pcs[:, x_pc][i], pcs[:, y_pc][i]), size='x-small')

    # Plot 2
def predict():
    tf = TrainFiles('/kaggle/malware/train/mix_lbp',
                    val_path='/kaggle/malware/test/mix_lbp',
                    labels_file="/kaggle/malware/trainLabels.csv")
    X_train, Y_train, X_test, Y_test = tf.prepare_inputs()

    sl_svm = SKSupervisedLearning(SVC, X_train, Y_train, X_test, Y_test)
    sl_svm.fit_standard_scaler()
    sl_svm.train_params = {'C': 100, 'gamma': 0.01, 'probability': True}
    print "Starting SVM: ", time_now_str()
    _, ll_svm = sl_svm.fit_and_validate()
    print "SVM score: {0:.4f}".format(ll_svm if not prediction else _)
    print "Finished training SVM: ", time_now_str()

    # neural net
    print "Starting NN: ", time_now_str()
    trndata = _createDataSet(sl_svm.X_train_scaled, Y_train, one_based=True)
    tstdata = _createUnsupervisedDataSet(sl_svm.X_test_scaled)
    fnn = predict_nn(trndata)
    proba_nn = fnn.activateOnDataset(tstdata)
    print "Finished training NN: ", time_now_str()

    # no validation labels on actual prediction
    if doTrees:
        # random forest
        sl_ccrf = SKSupervisedLearning(CalibratedClassifierCV, X_train, Y_train, X_test, Y_test)
        sl_ccrf.train_params = \
            {'base_estimator': RandomForestClassifier(**{'n_estimators': 7500, 'max_depth': 200}),
             'cv': 10}
        sl_ccrf.fit_standard_scaler()
        print "Starting on RF: ", time_now_str()
        ll_ccrf_trn, ll_ccrf_tst = sl_ccrf.fit_and_validate()
        print "RF score: {0:.4f}".format(ll_ccrf_tst if not prediction else ll_ccrf_trn)
        sl_ccrf.proba_test.tofile("/temp/sl_ccrf.prob")
        sl_svm.proba_test.tofile("/temp/sl_svm.prob")
        proba_nn.tofile("/temp/nn.prob")
        print "Finished training RF: ", time_now_str()

    if prediction:
        proba = vote([sl_svm.proba_test, sl_ccrf.proba_test, proba_nn],
                     [2. / 3., 1. / 6., 1. / 3.])
        out_labels = "/kaggle/malware/submission33.csv"
        task_labels = "/kaggle/malware/testLabels.csv"
        labels = [path.splitext(t)[0] for t in tf.get_val_inputs()]
        out = write_to_csv(task_labels, labels, proba, out_labels)
    else:
        # visualize the decision surface, projected down to the first
        # two principal components of the dataset
        pca = PCA(n_components=2).fit(sl_svm.X_train_scaled)
        X = pca.transform(sl_svm.X_train_scaled)

        x = np.arange(X[:, 0].min() - 1, X[:, 0].max() + 1, 1)  # was X[:, 1].max(): x must span PC1
        y = np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, 1)
        xx, yy = np.meshgrid(x, y)

        # title for the plots
        titles = ['SVC with rbf kernel',
                  'Random Forest \n n_estimators=7500',
                  'Decision Trees \n n_estimators=7500']

        # plt.tight_layout()
        plt.figure(figsize=(12, 5))

        # predict and plot
        for i, clf in enumerate((sl_svm.clf, sl_rfc.clf, sl_trees.clf)):
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, x_max]x[y_min, y_max].
            plt.subplot(1, 3, i + 1)
            clf.fit(X, Y_train)
            Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
            plt.axis('off')
            # Plot also the training points
            plt.scatter(X[:, 0], X[:, 1], c=Y_train, cmap=plt.cm.Paired)
            plt.title(titles[i])
        plt.tight_layout()
        plt.show()
from numpy import loadtxt, genfromtxt, shape, mean, sort, savetxt, size, array, copy
from pylab import figure
from matplotlib.pyplot import plot, savefig, xlabel, ylabel, scatter, axis, xlim, fill_between, legend, text
from sklearn.decomposition import PCA  # sklearn.decomposition.pca is a removed private path

data_dir = '../data_all_types/'
out_dir = './plots/'

der = loadtxt(data_dir + 'derivatives.dat')
flux = loadtxt(data_dir + 'fluxes_not_res.dat.gz')
labels = loadtxt(data_dir + 'labels.dat')
spectra_data = genfromtxt(data_dir + 'spectra_data.dat', dtype=None)

pca = PCA(n_components=4)
pca.fit(der)
X = pca.transform(der)
pred_PCA = pca.inverse_transform(X)

pca = PCA(n_components=15)
pca.fit(der)
X = pca.transform(der)
pred_PCA_15PC = pca.inverse_transform(X)

pred_DL = loadtxt('out_DeepLearning/predictions_120,100,90,50,30,20,4,20,30,50,90,100,120_seed1_dl.dat')

#range_to_plot=[1,2,3,4,5,6,10]
#range_to_plot=range(300)
#range_to_plot=[]
#labels_to_plot=[]
#for i in range(size(labels)):
#    if spectra_data['f3'][i] > 0.2 and spectra_data['f3'][i] < .5:
#        range_to_plot.append(i)
                        sep='\t', index_col=0).loc[:, ko_strains].dropna(how='all', axis=1)
phospho_df = phospho_df[(phospho_df.count(1) / phospho_df.shape[1]) > .25].replace(np.NaN, 0.0)

# PCA analysis
n_components = 10
sns.set(style='ticks')
fig, gs, pos = plt.figure(figsize=(10, 15)), GridSpec(3, 2, hspace=.3), 0
for df, df_type in [(trans.T, 'Transcriptomics'),
                    (phospho_df.T, 'Phospho-proteomics'),
                    (metabolomics.T, 'Metabolomics')]:
    pca = PCA(n_components=n_components).fit(df)
    pca_pc = DataFrame(
        pca.transform(df),
        columns=['PC%d' % i for i in range(1, n_components + 1)],
        index=df.index)

    ax = plt.subplot(gs[pos])
    cor, pvalue, nmeas = pearson(growth[pca_pc.index], pca_pc['PC1'])
    sns.regplot(growth[pca_pc.index], pca_pc['PC1'], ax=ax)
    ax.set_title('%s (pearson: %.2f, p-value: %.2e)' % (df_type, cor, pvalue))
    ax.set_xlabel('Relative growth')
    ax.set_ylabel('PC1 (%.1f%%)' % (pca.explained_variance_ratio_[0] * 100))
    sns.despine(trim=True, ax=ax)

    ax = plt.subplot(gs[pos + 1])
    plot_df = DataFrame(zip(['PC%d' % i for i in range(1, n_components + 1)],
                            pca.explained_variance_ratio_),
                        columns=['PC', 'var'])
lbls = np.array(dgts_lbl)
dt = dgts_data.T

# remove the mean of each column
mn = np.mean(dt, axis=0).reshape(1, dt.shape[1])
print dt.shape
print mn.shape

# now subtract the mean
dt = dt - mn
sigma = np.dot(dt, dt.T) / dt.shape[0]
print sigma

u, s, v = linalg.svd(sigma)
dt_rot = np.dot(u.T, dt)
sigma1 = np.cov(dt_rot)

pc = PCA()
pc.fit(dt)
ab = pc.transform(dt)
print ab
print sigma1

# PCA whitening: scale each rotated coordinate by 1/sqrt(eigenvalue + eps);
# the original computed s/sqrt(s + eps), which neither whitens nor has the
# right shape for the matrix product below
abc = np.diag(1.0 / np.sqrt(s + 0.000001))
pcawhite = np.dot(abc, np.dot(u.T, dt))
print pcawhite
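Hedged cross-check (my addition): sklearn's PCA(whiten=True) performs the same rotation-plus-rescaling, so the whitened output should have roughly identity covariance.

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(500, 5).dot(rng.randn(5, 5))  # correlated data
Xw = PCA(whiten=True).fit_transform(X)
print(np.round(np.cov(Xw.T), 2))  # approximately the identity matrix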
params = dict(gamma=gamma_range, C=C_range)
clfSVM = svm.SVC(kernel="rbf")

# SVM classifier optimized through grid search and cross validation
clfSVM = grid_search.GridSearchCV(clfSVM, param_grid=params, cv=skf, n_jobs=-1, verbose=2)

# fit the model after the reduced dimensionality performed by PCA
clfSVM.fit(train_pca, train_labels)
print("The best classifier is: ", clfSVM.best_estimator_)  # C=10.0; gamma=0.01

## Estimate the score of the classifier
scores = cv.cross_val_score(clfSVM.best_estimator_, train_pca, train_labels,
                            cv=10, n_jobs=-1, verbose=2)
print("Estimated score: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() / 2))

### Predict ###
test = test.astype("float64")
test_minmax = min_max_scaler.transform(test)  # NOT fit_transform
predicted_class = clfSVM.best_estimator_.predict(pca.transform(test_minmax))
predicted_class = predicted_class.astype("int")
# predicted_probs = clfSVM.best_estimator_.predict_proba(pca.transform(test_minmax))

### Save the results ###
f = open("submission.csv", "w")
# headers in the CSV file
f.write("ImageId,Label\n")
id = 1
for x in predicted_class:
    # ImageId, label are numerical values (%d)
    f.write("%d,%d\n" % (id, x))
    id += 1
f.close()
import os
from operator import itemgetter

import numpy
# imports below are assumed from the truncated header of this script
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

data_dir = '../../data/'
n_pca_components = 10
eps_range = numpy.arange(0.01, 20, 0.1)
min_samples_range = [2, 3, 5, 10]
allowed_noise_ratio = 0.2

# data
derivatives = numpy.loadtxt(os.path.join(data_dir, 'derivatives.dat'))

# PCA
pca = PCA(n_components=n_pca_components)
pca.fit(derivatives)
X = pca.transform(derivatives)
X = StandardScaler().fit_transform(X)

results = []
for eps in eps_range:
    for minsamp in min_samples_range:
        model = DBSCAN(eps=eps, min_samples=minsamp, algorithm='kd_tree')
        model.fit(X)
        labels = model.labels_
        noise_ratio = float(sum(labels == -1)) / len(labels)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        if noise_ratio <= allowed_noise_ratio:
            results.append([eps, minsamp, noise_ratio, n_clusters])
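Hedged continuation (my sketch): itemgetter is imported above but unused in the visible fragment; a natural use is ranking the surviving (eps, min_samples) settings, e.g. by lowest noise ratio.

best_settings = sorted(results, key=itemgetter(2))[:5]
for eps, minsamp, noise_ratio, n_clusters in best_settings:
    print('eps=%s min_samples=%s noise=%.2f clusters=%d'
          % (eps, minsamp, noise_ratio, n_clusters))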
from sklearn.decomposition import PCA  # sklearn.decomposition.pca is a removed private path
import numpy as np
import pandas as pd

prices = pd.read_csv("close_prices.csv")
X = prices.loc[:, "AXP":]

est = PCA(n_components=10)
est.fit(X)

total = 0.0
for num, val in enumerate(est.explained_variance_ratio_):
    total += val
    if total >= 0.9:
        with open("1", "w") as f:
            print(num + 1, file=f, end="")
        break

X0 = pd.DataFrame(est.transform(X))[0]
index = pd.read_csv("djia_index.csv")
corr = np.corrcoef(X0, index["^DJI"])[1][0]
with open("2", "w") as f:
    print(round(corr, 2), file=f, end="")

mx_company = X.columns[np.argmax(est.components_[0])]
with open("3", "w") as f:
    print(mx_company, file=f, end="")
print(mx_company)
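Hedged aside (my addition): the running-total loop above is equivalent to a one-liner over the cumulative explained-variance curve.

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
# low-rank signal so a few components carry most of the variance
X = rng.randn(200, 5).dot(rng.randn(5, 30)) + 0.1 * rng.randn(200, 30)
est = PCA(n_components=10).fit(X)
n_for_90 = int(np.searchsorted(np.cumsum(est.explained_variance_ratio_), 0.9) + 1)
print(n_for_90)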
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA  # package for principal component analysis
from sklearn import svm
import csv

X_train = pd.read_csv('train.csv', header=None).as_matrix()
X_test = pd.read_csv('test.csv', header=None).as_matrix()
trainLabels = np.loadtxt(open('trainLabels.csv', 'rb'), delimiter=',', skiprows=0)

pca = PCA(n_components=12, whiten=True)
#pca.fit(np.r_[X_train, X_test], trainLabels)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

clf = svm.SVC(C=3, gamma=0.6)
clf.fit(X_train_pca, trainLabels)
predictions = clf.predict(X_test_pca)

with open('svm_model_submission.csv', 'wb') as prediction_file:
    writer = csv.writer(prediction_file, delimiter=',')
    writer.writerow(['Id', 'Solution'])
    for i in range(0, len(predictions)):
        writer.writerow([i + 1, int(predictions[i])])

# scores around 92%, could maybe get a bit better tweaking parameters for SVC
# -- use GridSearch to do this? Need a way of testing "goodness" of model
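Hedged sketch (my addition, prompted by the closing comment above): a GridSearchCV pass over C and gamma, with cross-validation as the "goodness" measure. It uses the modern sklearn.model_selection API and synthetic stand-ins for the real data.

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X_demo, y_demo = rng.randn(200, 12), rng.randint(0, 2, 200)
grid = GridSearchCV(SVC(), {'C': [1, 3, 10], 'gamma': [0.1, 0.6, 1.0]}, cv=5)
grid.fit(X_demo, y_demo)
print("%s %.3f" % (grid.best_params_, grid.best_score_))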
              read_csv('%s/files/%s' % (wd, growth_file), sep='\t', index_col=0)['relative_growth'])

# Import data-set
df = read_csv('%s/tables/%s.tab' % (wd, df_file), sep='\t', index_col=0)
if df_type == 'Kinases/Phosphatases':
    df = df[(df.count(1) / df.shape[1]) > .75]

# Conditions overlap
conditions = list(set(growth.index).intersection(df))

# PCA analysis
pca = PCA(n_components=n_components).fit(df.T.replace(np.nan, 0))
pca_pc = DataFrame(
    pca.transform(df.T.replace(np.nan, 0)),
    columns=['PC%d' % i for i in range(1, n_components + 1)],
    index=df.columns)

# Plot correlation with PCA
ax = plt.subplot(gs[pos])
cor, pvalue, nmeas = pearson(growth[pca_pc.index], pca_pc[pc])
sns.regplot(growth[pca_pc.index], pca_pc[pc], ax=ax, color='#4c4c4c')
ax.axhline(0, ls='-', lw=0.1, c='black', alpha=.3)
ax.axvline(0, ls='-', lw=0.1, c='black', alpha=.3)
ax.set_title('%s - %s\n(Pearson: %.2f, p-value: %.1e)' % (dataset_type, df_type, cor, pvalue))
ax.set_xlabel('Relative growth (centered)')
ax.set_ylabel('PC%d (%.1f%%)'
              % (int(pc[-1:]), pca.explained_variance_ratio_[int(pc[-1:]) - 1] * 100))
plt.figure(3)
# percentage of variance explained by each of the selected components;
# the sum of explained variances is equal to 1.0
plt.plot(features, pca40.explained_variance_ratio_, 'g--', marker='o')
plt.axis([1, 40, 0, 0.3])
plt.grid(True)
plt.xlabel("principal components"), plt.ylabel("variance explained")
plt.title("scree plot")
plt.savefig("scree_plot.png")

# from the scree plot we choose to pick the first 12 principal components
pca12 = PCA(n_components=12, whiten=True)
pca12.fit(X_train)

# apply dimensionality reduction to the training set and the test set
X_pca_train = pca12.transform(X_train)
X_pca_test = pca12.transform(X_test)

# Kernel Density Plot
def kde_plot(x):
    from scipy.stats.kde import gaussian_kde
    kde = gaussian_kde(x)
    positions = np.linspace(x.min(), x.max())
    smoothed = kde(positions)
    plt.figure()
    plt.plot(positions, smoothed)

# qq plot, to see if this variable follows a gaussian distribution
def qq_plot(x):
    from scipy.stats import probplot
    plt.figure()
import numpy as np
from sklearn import tree
from sklearn.decomposition import PCA  # sklearn.decomposition.pca is a removed private path

import mnist_loader as loader
import mnist_writer as writer

print('Reading data...')
train_data, train_labels = loader.load_train_data()
test_data = loader.load_test_data()

# convert to numpy arrays
train_data = np.array(train_data)
train_labels = np.array(train_labels)
test_data = np.array(test_data)

print('PCA analysis...')
pca = PCA(n_components=35, whiten=True)
pca.fit(train_data)
train_data = pca.transform(train_data)
test_data = pca.transform(test_data)

print('Fitting decision tree...')
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_labels)

print('Making predictions...')
predict = clf.predict(test_data)

print('Writing results...')
writer.write_predictions(predict, '/Users/clint/Development/data/mnist/predict_tree.csv')
pca = PCA(n_components=2)
surrogate_projected = pca.fit_transform(surrogate_X)
print('surrogate_projected', surrogate_projected)

mean = np.asarray([p.mean for p in predictions])
var = np.asarray([p.variance for p in predictions])
print('mean', mean)
print('var', var)

results = task.get_results()
evaluations_config_values = [r.configuration.values for r in results]
evaluations_score = np.asarray([r.score for r in results])
print('evaluations_score', evaluations_score)
evaluations_X = [[c['pin'], c['r'], c['inc'], c['inf'], c['trans']]
                 for c in evaluations_config_values]
print('evaluations_X', evaluations_X)
evaluations_projected = pca.transform(evaluations_X)
print('evaluations_projected', evaluations_projected)

samples = np.concatenate(
    (np.asarray(surrogate_X).reshape(m, 5), surrogate_projected.reshape(m, 2),
     mean.reshape(m, 1), var.reshape(m, 1)),
    axis=1)
evaluations = np.concatenate(
    (np.asarray(evaluations_X).reshape(n, 5),
     evaluations_projected.reshape(n, 2), evaluations_score.reshape(n, 1)),
    axis=1)
np.save('surrogate_results', samples)
np.save('task_results', evaluations)
class PipelineTests(PhotonBaseTest):
    def setUp(self):
        self.X, self.y = load_breast_cancer(True)

        # Photon Version
        self.p_pca = PipelineElement("PCA", {}, random_state=3)
        self.p_svm = PipelineElement("SVC", {}, random_state=3)
        self.p_ss = PipelineElement("StandardScaler", {})
        self.p_dt = PipelineElement("DecisionTreeClassifier", random_state=3)

        dummy_element = DummyYAndCovariatesTransformer()
        self.dummy_photon_element = PipelineElement.create("DummyTransformer", dummy_element, {})

        self.sk_pca = PCA(random_state=3)
        self.sk_svc = SVC(random_state=3)
        self.sk_ss = StandardScaler()
        self.sk_dt = DecisionTreeClassifier(random_state=3)

    def test_regular_use(self):
        photon_pipe = PhotonPipeline([("PCA", self.p_pca), ("SVC", self.p_svm)])
        photon_pipe.fit(self.X, self.y)

        photon_transformed_X, _, _ = photon_pipe.transform(self.X)
        photon_predicted_y = photon_pipe.predict(self.X)

        # the element is given by reference, so it should be fitted right here
        photon_ref_transformed_X, _, _ = self.p_pca.transform(self.X)
        photon_ref_predicted_y = self.p_svm.predict(photon_ref_transformed_X)

        self.assertTrue(np.array_equal(photon_transformed_X, photon_ref_transformed_X))
        self.assertTrue(np.array_equal(photon_predicted_y, photon_ref_predicted_y))

        sk_pipe = SKPipeline([("PCA", self.sk_pca), ("SVC", self.sk_svc)])
        sk_pipe.fit(self.X, self.y)
        sk_predicted_y = sk_pipe.predict(self.X)
        self.assertTrue(np.array_equal(photon_predicted_y, sk_predicted_y))
        # sklearn pipeline does not offer a transform function
        # sk_transformed_X = sk_pipe.transform(X)
        # self.assertTrue(np.array_equal(photon_transformed_X, sk_transformed_X))

    def test_add_preprocessing(self):
        my_preprocessing = Preprocessing()
        my_preprocessing += PipelineElement("LabelEncoder")
        photon_pipe = PhotonPipeline([("PCA", self.p_pca), ("SVC", self.p_svm)])
        photon_pipe._add_preprocessing(my_preprocessing)

        self.assertEqual(len(photon_pipe.named_steps), 3)
        first_element = photon_pipe.elements[0][1]
        self.assertTrue(first_element == my_preprocessing)
        self.assertTrue(photon_pipe.named_steps["Preprocessing"] == my_preprocessing)

    def test_no_estimator(self):
        no_estimator_pipe = PhotonPipeline([("StandardScaler", self.p_ss), ("PCA", self.p_pca)])
        no_estimator_pipe.fit(self.X, self.y)
        photon_no_estimator_transform, _, _ = no_estimator_pipe.transform(self.X)
        photon_no_estimator_predict = no_estimator_pipe.predict(self.X)
        self.assertTrue(np.array_equal(photon_no_estimator_predict,
                                       photon_no_estimator_transform))

        self.sk_ss.fit(self.X)
        standardized_data = self.sk_ss.transform(self.X)
        self.sk_pca.fit(standardized_data)
        pca_data = self.sk_pca.transform(standardized_data)
        self.assertTrue(np.array_equal(photon_no_estimator_transform, pca_data))
        self.assertTrue(np.array_equal(photon_no_estimator_predict, pca_data))

    def test_y_and_covariates_transformation(self):
        X = np.ones((200, 50))
        y = np.ones((200,)) + 2
        kwargs = {"sample1": np.ones((200, 5))}

        photon_pipe = PhotonPipeline([("DummyTransformer", self.dummy_photon_element)])

        # if y is none all y transformers should be ignored
        Xt2, yt2, kwargst2 = photon_pipe.transform(X, None, **kwargs)
        self.assertTrue(np.array_equal(Xt2, X))
        self.assertTrue(np.array_equal(yt2, None))
        self.assertTrue(np.array_equal(kwargst2, kwargs))

        # if y is given, all y transformers should be working
        Xt, yt, kwargst = photon_pipe.transform(X, y, **kwargs)

        # assure that data is delivered to element correctly
        self.assertTrue(np.array_equal(X, self.dummy_photon_element.base_element.X))
        self.assertTrue(np.array_equal(y, self.dummy_photon_element.base_element.y))
        self.assertTrue(np.array_equal(
            kwargs["sample1"],
            self.dummy_photon_element.base_element.kwargs["sample1"]))

        # assure that data is transformed correctly
        self.assertTrue(np.array_equal(Xt, X - 1))
        self.assertTrue(np.array_equal(yt, y + 1))
        self.assertTrue("sample1_edit" in kwargst)
        self.assertTrue(np.array_equal(kwargst["sample1_edit"], kwargs["sample1"] + 5))

    def test_predict_with_training_flag(self):
        # manually edit labels
        sk_pipe = SKPipeline([("SS", self.sk_ss), ("SVC", self.sk_svc)])
        y_plus_one = self.y + 1
        sk_pipe.fit(self.X, y_plus_one)
        sk_pred = sk_pipe.predict(self.X)

        # edit labels during pipeline
        p_pipe = PhotonPipeline([("SS", self.p_ss),
                                 ("YT", self.dummy_photon_element),
                                 ("SVC", self.p_svm)])
        p_pipe.fit(self.X, self.y)
        p_pred = p_pipe.predict(self.X)

        sk_standardized_X = self.sk_ss.transform(self.X)
        input_of_y_transformer = self.dummy_photon_element.base_element.X
        self.assertTrue(np.array_equal(sk_standardized_X, input_of_y_transformer))
        self.assertTrue(np.array_equal(sk_pred, p_pred))

    def test_inverse_transform(self):
        # simple pipe
        sk_pipe = SKPipeline([("SS", self.sk_ss), ("PCA", self.sk_pca)])
        sk_pipe.fit(self.X, self.y)
        sk_transform = sk_pipe.transform(self.X)
        sk_inverse_transformed = sk_pipe.inverse_transform(sk_transform)

        photon_pipe = PhotonPipeline([("SS", self.p_ss), ("PCA", self.p_pca)])
        photon_pipe.fit(self.X, self.y)
        p_transform, _, _ = photon_pipe.transform(self.X)
        p_inverse_transformed, _, _ = photon_pipe.inverse_transform(p_transform)

        self.assertTrue(np.array_equal(sk_inverse_transformed, p_inverse_transformed))

        # now including stack
        stack = Stack("stack", [self.p_pca])
        stack_pipeline = PhotonPipeline([
            ("stack", stack),
            ("StandardScaler", PipelineElement("StandardScaler")),
            ("LinearSVC", PipelineElement("LinearSVC")),
        ])
        stack_pipeline.fit(self.X, self.y)
        feature_importances = stack_pipeline.feature_importances_
        inversed_data, _, _ = stack_pipeline.inverse_transform(feature_importances)
        self.assertEqual(inversed_data.shape[1], self.X.shape[1])

    # Todo: add tests for kwargs

    def test_predict_proba(self):
        sk_pipe = SKPipeline([("SS", self.sk_ss), ("SVC", self.sk_dt)])
        sk_pipe.fit(self.X, self.y)
        sk_proba = sk_pipe.predict_proba(self.X)

        photon_pipe = PhotonPipeline([("SS", self.p_ss), ("SVC", self.p_dt)])
        photon_pipe.fit(self.X, self.y)
        photon_proba = photon_pipe.predict_proba(self.X)

        self.assertTrue(np.array_equal(sk_proba, photon_proba))

    def test_copy_me(self):
        switch = Switch("my_copy_switch")
        switch += PipelineElement("StandardScaler")
        switch += PipelineElement("RobustScaler", test_disabled=True)

        stack = Stack("RandomStack")
        stack += PipelineElement("SVC")
        branch = Branch("Random_Branch")
        pca_hyperparameters = {"n_components": [5, 10]}
        branch += PipelineElement("PCA", hyperparameters=pca_hyperparameters)
        branch += PipelineElement("DecisionTreeClassifier")
        stack += branch

        photon_pipe = PhotonPipeline([
            ("SimpleImputer", PipelineElement("SimpleImputer")),
            ("my_copy_switch", switch),
            ("RandomStack", stack),
            ("Callback1", CallbackElement("tmp_callback", np.mean)),
            ("PhotonVotingClassifier", PipelineElement("PhotonVotingClassifier")),
        ])

        copy_of_the_pipe = photon_pipe.copy_me()

        self.assertEqual(photon_pipe.random_state, copy_of_the_pipe.random_state)
        self.assertTrue(len(copy_of_the_pipe.elements) == 5)
        self.assertTrue(copy_of_the_pipe.elements[2][1].name == "RandomStack")
        self.assertTrue(copy_of_the_pipe.named_steps["my_copy_switch"].elements[1].test_disabled)
        self.assertDictEqual(
            copy_of_the_pipe.elements[2][1].elements[1].elements[0].hyperparameters,
            {"PCA__n_components": [5, 10]})
        self.assertTrue(isinstance(copy_of_the_pipe.elements[3][1], CallbackElement))
        self.assertTrue(copy_of_the_pipe.named_steps["tmp_callback"].delegate_function == np.mean)

    def test_random_state(self):
        photon_pipe = PhotonPipeline([("SS", self.p_ss),
                                      ("PCA", PipelineElement("PCA")),
                                      ("SVC", self.p_dt)])
        photon_pipe.random_state = 666
        photon_pipe.fit(self.X, self.y)
        self.assertEqual(self.p_dt.random_state, photon_pipe.random_state)
        self.assertEqual(photon_pipe.elements[1][-1].random_state, photon_pipe.random_state)
        self.assertEqual(self.p_dt.random_state, 666)
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA  # sklearn.decomposition.pca is a removed private path
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.mixture import GMM
from sklearn.base import BaseEstimator
import matplotlib.pyplot as plt

X_test = pd.read_csv('Data/test.csv', header=None).as_matrix()
y = pd.read_csv('Data/trainLabels.csv', header=None)[0].as_matrix()
X = pd.read_csv('Data/train.csv', header=None).as_matrix()

pca2 = PCA(n_components=2, whiten=True)
pca2.fit(np.r_[X, X_test])
X_pca = pca2.transform(X)

i0 = np.argwhere(y == 0)[:, 0]
i1 = np.argwhere(y == 1)[:, 0]
X0 = X_pca[i0, :]
X1 = X_pca[i1, :]
plt.plot(X0[:, 0], X0[:, 1], 'ro')
plt.plot(X1[:, 0], X1[:, 1], 'b*')

pca = PCA(whiten=True)
X_all = pca.fit_transform(np.r_[X, X_test])
print(pca.explained_variance_ratio_)

def kde_plot(x):
    from scipy.stats.kde import gaussian_kde
    kde = gaussian_kde(x)
    positions = np.linspace(x.min(), x.max())
    ## Generate predictions
    r('predictions_dl <- h2o.predict(dlmodel, test3.hex)')
    r('head(predictions_dl)')

    ## new predictions
    pred = r('as.matrix(predictions_dl)')
    return var(pred - test)

################################################################
figure()
variances_table = []
for i in range(2, 11, 1):
    pca = PCA(n_components=i)
    der = derivatives[train_mask_TL]
    pca.fit(der)
    X = pca.transform(derivatives[test_mask])
    pred_pca_temp = pca.inverse_transform(X)
    # var_fraction_pca_TL = var(pred_pca_temp - derivatives[test_mask]) / var(derivatives[test_mask])
    # plot([i], [var(pred_pca_temp - derivatives[test_mask])], 'D')
    var_fraction_DL_TL = DL(derivatives[train_mask_TL], derivatives[test_mask], i) \
        / var(derivatives[test_mask])
    # plot([i], [var_DL_TL], 'Dk')

    pca = PCA(n_components=i)
    der = derivatives[train_mask_no_TL]
    pca.fit(der)
    X = pca.transform(derivatives[test_mask])
    pred_pca_temp = pca.inverse_transform(X)
gc = GridSearchCV(estimator=Lasso(), param_grid=gs_params)
lasso_model = gc.fit(X_train, Y_train)
Y_pred = lasso_model.predict(X_test)
best_alpha = lasso_model.best_params_['alpha']
print 'The best lasso model is obtained with an alpha of', best_alpha
print 'RMSE of the best standardized lasso regression model:', mean_squared_error(Y_test, Y_pred) ** 0.5

# split data into train, dev and test
# todo: do this before, so that we have errors measured on the same test set
for i in range(1, 9):
    n_components = 2 ** i
    # n_components = i
    pca = PCA(n_components=n_components)
    X_reduced_train = pca.fit_transform(X_train)
    X_reduced_test = pca.transform(X_test)

    vanilla_lr = LinearRegression()
    vanilla_lr = vanilla_lr.fit(X_reduced_train, Y_train)
    Y_pred = vanilla_lr.predict(X_reduced_test)
    print 'RMSE for ', n_components, ' components with LR ', mean_squared_error(Y_test, Y_pred) ** 0.5

    # use j in the comprehension: on Python 2 a list comprehension leaks its
    # variable and would clobber the outer loop's i
    gs_params = {'alpha': [2 ** j for j in range(-10, 20)]}
    gc = GridSearchCV(estimator=Ridge(), param_grid=gs_params)
    ridge_model = gc.fit(X_reduced_train, Y_train)
    Y_pred = ridge_model.predict(X_reduced_test)
    print 'RMSE for ', n_components, ' components with Ridge ', mean_squared_error(Y_test, Y_pred) ** 0.5
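Hedged alternative (my sketch): the manual loop over 2**i components can be folded into a single grid search with a Pipeline, so the PCA size and the ridge alpha are tuned jointly; synthetic data stands in for X_train/Y_train, and a recent scikit-learn is assumed.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)
X_demo, y_demo = rng.randn(300, 64), rng.randn(300)
pipe = Pipeline([('pca', PCA()), ('ridge', Ridge())])
param_grid = {'pca__n_components': [2, 4, 8, 16, 32],
              'ridge__alpha': [2.0 ** j for j in range(-5, 6)]}
search = GridSearchCV(pipe, param_grid, scoring='neg_root_mean_squared_error', cv=5)
search.fit(X_demo, y_demo)
print(search.best_params_)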
        if ((old_extra - extra) ** 2).sum() < tol:
            print "finished at iteration %d" % it
            break
        old_extra = extra.copy()
    return labels


if __name__ == "__main__":
    X, Y = data.libras_movement()
    labels = kernel_k_means(X, k=15)

    # To visualize the data, use PCA
    pca = PCA(n_components=2)
    pca.fit(X)
    Xt = pca.transform(X)

    fig = pl.figure()
    colors = ['#334433', '#6699aa', '#88aaaa', '#aacccc',
              '#447799', '#225533', '#44bbcc', '#88dddd',
              '#bbeeff', '#0055bb', '#220000', '#880022',