def simple_pls_cv(X, y, n_comp):
    # Run PLS with the suggested number of components
    pls = PLSRegression(n_components=n_comp)
    pls.fit(X, y)
    y_c = pls.predict(X)

    # Cross-validation
    y_cv = cross_val_predict(pls, X, y, cv=10)

    # Calculate scores for calibration and cross-validation
    score_c = r2_score(y, y_c)
    score_cv = r2_score(y, y_cv)

    # Calculate mean squared error for calibration and cross-validation
    mse_c = mean_squared_error(y, y_c)
    mse_cv = mean_squared_error(y, y_cv)

    print('R2 calib: %5.3f' % score_c)
    print('R2 CV: %5.3f' % score_cv)
    print('MSE calib: %5.3f' % mse_c)
    print('MSE CV: %5.3f' % mse_cv)

    # Plot regression
    z = np.polyfit(y, y_cv, 1)
    with plt.style.context('ggplot'):
        fig, ax = plt.subplots(figsize=(9, 5))
        ax.scatter(y_cv, y, c='red', edgecolors='k')
        ax.plot(z[1] + z[0] * y, y, c='blue', linewidth=1)
        ax.plot(y, y, color='green', linewidth=1)
        plt.title('$R^{2}$ (CV): ' + str(score_cv))
        plt.xlabel(r'Predicted $^{\circ}$Brix')  # raw strings avoid the invalid \c escape
        plt.ylabel(r'Measured $^{\circ}$Brix')
        plt.show()
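# --- Usage sketch (not from the original source): exercises simple_pls_cv on
# synthetic data. Assumes the same imports the function itself relies on
# (numpy as np, matplotlib.pyplot as plt, PLSRegression, cross_val_predict,
# r2_score, mean_squared_error).
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(100, 30))
y_demo = X_demo[:, 0] + 2 * X_demo[:, 1] + rng.normal(scale=0.1, size=100)
simple_pls_cv(X_demo, y_demo, n_comp=5)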
def PLS(X, y, X_ind, y_ind):
    """
    Cross-validation and independent test for a PLS regression model.

    Arguments:
        X (np.ndarray): m x n feature matrix for cross-validation, where m is
            the number of samples and n is the number of features.
        y (np.ndarray): m-d label array for cross-validation; m equals the
            number of rows of X.
        X_ind (np.ndarray): m x n feature matrix for the independent set.
        y_ind (np.ndarray): m-d label array for the independent set; m equals
            the number of rows of X_ind.

    Returns:
        cvs (np.ndarray): cross-validation predictions, same shape as y.
        inds (np.ndarray): independent-test predictions, averaged over the
            five fold models, same shape as y_ind.
    """
    folds = KFold(5).split(X)
    cvs = np.zeros(y.shape)
    inds = np.zeros(y_ind.shape)
    for i, (trained, valided) in enumerate(folds):
        model = PLSRegression()
        model.fit(X[trained], y[trained])
        cvs[valided] = model.predict(X[valided])[:, 0]
        inds += model.predict(X_ind)[:, 0]
    return cvs, inds / 5
def predicao(self, idmodelo, idamostra):
    print(idmodelo)
    print(idamostra)
    X = self.selectMatrizX(idmodelo, "VALIDACAO")
    Y = self.selectMatrizY(idmodelo, "VALOR", "VALIDACAO")
    amostraPredicao = self.selectAmostra(idamostra, idmodelo)
    valorReferencia = self.selectDadosReferenciaAmostra(idamostra, idmodelo)

    pls = PLSRegression(copy=True, max_iter=500, n_components=20,
                        scale=False, tol=1e-06)
    pls.fit(X, Y)
    print(amostraPredicao)
    valorPredito = pls.predict(amostraPredicao)
    print('Amostra: ' + str(idamostra) + ' - Valor Predito :' + str(valorPredito) +
          ' - Valor Referencia :' + str(valorReferencia))

    cursorDadosCalibracao = db.execute(
        "select rmsec, rmsep, coeficientecal, coeficienteval, dtcalibracao "
        "from calibracao where inativo = 'A' and idmodelo = " + str(idmodelo) + " ")
    for regCodigo in cursorDadosCalibracao:
        rmsec = regCodigo[0]
        rmsep = regCodigo[1]
        coeficienteCal = regCodigo[2]
        coeficienteVal = regCodigo[3]
        dtcalibracao = regCodigo[4]

    print(rmsec)
    print(rmsep)
    print(coeficienteCal)
    print(coeficienteVal)
    print(dtcalibracao)
    dtcalibracao = dtcalibracao.strftime('%d/%m/%Y')
    print(dtcalibracao)

    # Format the values for the JSON payload
    coeficienteCal = round(coeficienteCal, 2)
    coeficienteVal = round(coeficienteVal, 2)
    rmsec = round(rmsec, 2)
    rmsep = round(rmsep, 2)
    valorReferencia = round(valorReferencia, 2)
    valorPreditoString = str(valorPredito).replace("[", "").replace("]", "")

    # Build the JSON response
    json_data = jsonify(idamostra=str(idamostra),
                        valorpredito=str(valorPreditoString),
                        rmsec=str(rmsec),
                        rmsep=str(rmsep),
                        idmodelo=str(idmodelo),
                        dtcalibracao=str(dtcalibracao),
                        valorreferencia=str(valorReferencia),
                        coeficientecal=str(coeficienteCal),
                        coeficienteval=str(coeficienteVal))
    return json_data
class PLSDADummy(BaseEstimator):
    """
    Wrapper of PLSRegression for classification (PLS-DA).
    PLSRegression predicts one-hot encoded vectors; the classifier then
    outputs the class with the maximal score.
    """

    def __init__(self, n_components=2):
        self.pls = PLSRegression(n_components)
        self.classes = None

    def __one_hot_encode(self, Y):
        # Encode labels to integer indices, then one-hot encode them.
        # `n_values` was removed from scikit-learn; `categories` is the
        # current spelling of the same idea.
        Y = np.array([np.where(self.classes == y)[0][0] for y in Y])
        enc = OneHotEncoder(categories=[np.arange(len(self.classes))])
        return enc.fit_transform(Y.reshape(-1, 1)).toarray()

    def fit(self, X, Y):
        """
        :param X: feature matrix
        :param Y: list of labels
        :return: self
        """
        self.classes = np.array(sorted(np.unique(Y)))
        Y = self.__one_hot_encode(Y)
        self.pls.fit(X, Y)
        return self

    def predict(self, X):
        y_pred = np.argmax(self.pls.predict(X), axis=1)
        return np.array([self.classes[cls] for cls in y_pred])
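# --- Usage sketch (illustrative, not from the original source): PLSDADummy
# behaves like a classifier over string labels. Assumes numpy as np plus the
# scikit-learn imports the class uses (BaseEstimator, OneHotEncoder,
# PLSRegression).
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(60, 10))
y_demo = np.array(['a', 'b', 'c'] * 20)
clf = PLSDADummy(n_components=2).fit(X_demo, y_demo)
print(clf.predict(X_demo[:5]))  # array of labels drawn from {'a', 'b', 'c'}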
class MyPLS():
    def __init__(self, n_components=2, scale=True, max_iter=500, tol=1e-06,
                 copy=True):
        # Keyword arguments: recent scikit-learn makes these keyword-only
        self.pls = PLSRegression(n_components=n_components, scale=scale,
                                 max_iter=max_iter, tol=tol, copy=copy)

    def fit(self, X, Y):
        self.pls.fit(X, Y)
        return self.pls

    def predict(self, X, copy=True):
        return self.pls.predict(X, copy=copy).flatten()

    def score(self, X, Y, sample_weight=None):
        return self.pls.score(X, Y, sample_weight)

    def get_params(self, deep=True):
        return self.pls.get_params(deep)

    def set_params(self, **parameters):
        self.pls.set_params(**parameters)
        return self

    @property
    def intercept_(self):
        return 0

    @property
    def coeff_(self):
        return self.pls.coef_
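# --- Usage sketch (illustrative): MyPLS flattens predictions to 1-D, which
# is the wrapper's main point. Note that fit() returns the wrapped
# PLSRegression, so call predict() on the wrapper itself rather than chaining.
import numpy as np

rng = np.random.RandomState(1)
X_demo = rng.normal(size=(50, 8))
y_demo = X_demo[:, 0] - X_demo[:, 3] + rng.normal(scale=0.05, size=50)
model = MyPLS(n_components=3)
model.fit(X_demo, y_demo)
print(model.predict(X_demo).shape)  # (50,) thanks to .flatten()
print(model.score(X_demo, y_demo))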
def get_cal(self, X, y, n_comp=None):
    if n_comp is None:
        n_comp = self.opt_ncomp
    X = self.transform(X)
    pls = PLSRegression(n_components=n_comp)
    pls.fit(X, y)
    return pls
class MSLMultiModel(WebModel):
    '''a multitask version of MSLModel'''

    def __init__(self, output_dir, ccs_dir, lanl_file, n_components=10,
                 **kwargs):
        self.output_dir = output_dir
        self.ccs_dir = ccs_dir
        self.lanl_file = lanl_file
        self.n_components = n_components
        self.model = PLSRegression(n_components=n_components, scale=False)
        self.multitask = True
        self.name = 'msl_multi_model'

    def fit(self, data, composition, elements):
        self.elements = elements  # order matters
        data = libs_norm3(data[:, ALAMOS_MASK])
        self.model.fit(data, composition)

    def predict(self, data, mask=ALAMOS_MASK, clip=True):
        data = libs_norm3(data[:, mask])
        predictions = self.model.predict(data, copy=False)
        # Truth-testing a whole array raises ValueError; branch on the
        # `clip` flag instead, which is what the signature implies.
        if clip:
            predictions = np.clip(predictions, 0, 100)
        else:
            predictions[predictions < 0] = 0
        return predictions
def pls_balances_cmd(table_file, metadata_file, category, output_file):
    metadata = pd.read_table(metadata_file, index_col=0)
    table = load_table(table_file)
    table = pd.DataFrame(np.array(table.matrix_data.todense()).T,
                         index=table.ids(axis='sample'),
                         columns=table.ids(axis='observation'))
    ctable = pd.DataFrame(clr(centralize(table + 1)),
                          index=table.index, columns=table.columns)
    rfc = PLSRegression(n_components=1)
    # np.float/np.int were removed from NumPy; use the builtins
    if metadata[category].dtype != float:
        cats = np.unique(metadata[category])
        groups = (metadata[category] == cats[0]).astype(int)
    else:
        groups = metadata[category]
    rfc.fit(X=ctable.values, Y=groups)

    pls_df = pd.DataFrame(rfc.x_weights_, index=ctable.columns,
                          columns=['PLS1'])
    l, r = round_balance(pls_df.values,
                         means_init=[[pls_df.PLS1.min()], [0],
                                     [pls_df.PLS1.max()]],
                         n_init=100)
    num = pls_df.loc[pls_df.PLS1 > r]
    denom = pls_df.loc[pls_df.PLS1 < l]
    diff_features = list(num.index.values)
    diff_features += list(denom.index.values)
    with open(output_file, 'w') as f:
        f.write(','.join(diff_features))
def do_pls(df, n_components=-1):
    Y_cols = ["slump", "flow", "compressive_strength"]
    X_cols = [
        "cement", "slag", "fly_ash", "water", "superplasticizer",
        "coarse_aggregate", "fine_aggregate"
    ]
    Y = df[Y_cols]
    X = df[X_cols]

    if n_components == -1:
        # Sweep the number of latent variables and pick the MSE minimum
        r2s = []
        mses = []
        rpds = []
        xticks = np.arange(1, X.shape[1] + 1)
        for n_comp in xticks:
            y_cv, r2, mse, rpd = optimise_pls_cv(X, Y, n_comp)
            r2s.append(r2)
            mses.append(mse)
            rpds.append(rpd)
        plot_metrics(mses, 'MSE', 'min', xticks)
        plot_metrics(r2s, 'R2', 'max', xticks)
        # plot_metrics(rpds, 'RPD', 'max', xticks)
        n_components = np.argmin(mses) + 1

    pls = PLSRegression(n_components=n_components, scale=True)
    pls.fit(X, Y)
    loadings = pd.DataFrame(pls.x_loadings_)
    scores = pd.DataFrame(pls.x_scores_)
    X_rows_dict = {i: X_cols[i] for i in range(0, len(X_cols))}
    X_cols_dict = {i: 'LV' + str(i + 1) for i in range(0, n_components)}
    loadings.rename(index=X_rows_dict, columns=X_cols_dict, inplace=True)
    print(loadings)
def varselpls(x, y, ti, nc, step=1):
    '''
    X: m x n data, m: number of samples
    Y: m x 1 reference values
    TI: test indices
    NC: number of principal components (latent variables)

    Returns the following outputs:
    IND_OPT: optimal indices (variables) of X
    RMSE_OPT: the RMSE reached by the optimal variable selection
    '''
    ci = np.arange(x.shape[0])  # calibration index
    ci = np.delete(ci, ti)
    plsModel = PLSRegression(n_components=nc)
    plsModel.fit(x[ci, :], y[ci])
    reg_coe = np.abs(plsModel.coef_[:, 0])
    a = np.sort(reg_coe)[:-nc:step]
    la = len(a)
    rmse_opt = np.zeros(la)  # holds negated CV MSE for each threshold
    for c, k in zip(a, range(la)):
        var_sel = reg_coe >= c
        x_cal = x[ci, :][:, var_sel]
        rmse_opt[k] = cross_val_score(plsModel, x_cal, y[ci], cv=4,
                                      scoring='neg_mean_squared_error').mean()
    # pdb.set_trace()
    k_opt = np.argmax(rmse_opt)  # max of negated MSE = smallest error
    ind_opt = np.arange(x.shape[1])[reg_coe >= a[k_opt]]
    rmse_opt = np.sqrt(-rmse_opt[k_opt])
    return ind_opt, rmse_opt
def fit_pls(self, X_test):
    reg = PLSRegression(n_components=20, scale=False, max_iter=1000)
    reg.fit(self.X.copy().values, self.y.copy().values.flatten())
    preds = reg.predict(X_test.copy().values)
    ids = X_test.index
    pred_df = pd.DataFrame(data=preds, index=ids, columns=['SalePrice'])
    pred_df.to_csv('results/results_pls.csv', sep=',')
def fitcv(self):
    pls = PLSRegression(n_components=self.n_components, scale=False)
    kf = KFold(n_splits=self.n_splits)
    yTrue = None
    yHat = None
    # Check whether Y is 1-D or 2-D
    dimensiony = len(self.Y.shape)
    for train_index, test_index in kf.split(self.X):
        X_train, X_test = self.X[train_index], self.X[test_index]
        y_train, y_test = self.Y[train_index], self.Y[test_index]
        pls.fit(X_train, y_train)
        if dimensiony == 1:
            ypred = pls.predict(X_test)[:, 0]
        else:
            ypred = pls.predict(X_test)
        ypred[ypred > 0] = 1
        ypred[ypred < 0] = -1
        if yTrue is None:
            yTrue = y_test  # ground truth
            yHat = ypred    # predictions
        else:
            yTrue = np.r_[yTrue, y_test]
            yHat = np.r_[yHat, ypred]
    err = yTrue - yHat
    errSampleNo = np.where(err != 0)
    err = err[err != 0]
    # Return the misclassification rate (%) and the misclassified sample indices
    return len(err) / len(self.X) * 100, errSampleNo
def piecewise_ds(A, B, win_size=5, pls=None):
    assert A.shape == B.shape, "Input matrices must be the same shape."
    assert win_size % 2 == 1, "Window size must be odd."
    padding = (win_size - 1) // 2  # integer division so indexing works
    n_feats = A.shape[1]
    coefs = []
    for i in range(n_feats):
        row = np.zeros(n_feats)
        start = max(i - padding, 0)
        end = min(i + padding, n_feats - 1) + 1
        if isinstance(pls, int):
            model = PLSRegression(n_components=pls, scale=False)
            model.fit(B[:, start:end], A[:, i])
            row[start:end] = model.coef_.ravel()  # coef_, not coefs
        elif pls is None:
            row[start:end] = np.dot(np.linalg.pinv(B[:, start:end]), A[:, i])
        else:
            print("ERROR: bad number of PLS components.")
            return
        coefs.append(row)
    proj_to_A = np.array(coefs).T
    proj_B = np.dot(B, proj_to_A)
    return proj_to_A, proj_B
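# --- Usage sketch (illustrative): piecewise direct standardization maps
# spectra B into the space of A using windowed PLS sub-models (here pls=2
# components per window).
import numpy as np

rng = np.random.RandomState(0)
A_demo = rng.normal(size=(20, 15))
B_demo = A_demo + rng.normal(scale=0.01, size=(20, 15))
proj_to_A, B_mapped = piecewise_ds(A_demo, B_demo, win_size=5, pls=2)
print(B_mapped.shape)  # (20, 15): B projected onto A's feature space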
def knn_denoise(X, X_reference, *, k, ncomp):
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.neighbors import NearestNeighbors

    # Xb = X * window
    print('PCA...')
    npca = np.minimum(300, X.shape[0])
    u, s, vh = np.linalg.svd(X)
    features = u[:, 0:npca] * s[0:npca]
    components = vh[0:npca, :]
    # s = s[0:npca]

    print('Nearest neighbors...')
    nbrs = NearestNeighbors(n_neighbors=k + 1,
                            algorithm='ball_tree').fit(features)
    distances, indices = nbrs.kneighbors(features)

    features2 = np.zeros(features.shape, dtype=features.dtype)
    for j in range(X.shape[0]):
        print(f'{j + 1} of {X.shape[0]}')
        inds0 = np.squeeze(indices[j, :])
        inds0 = inds0[1:]  # drop the query point itself
        # Xbneighbors = Xb[inds0, :]
        f_neighbors = features[inds0, :]
        pls = PLSRegression(n_components=ncomp)
        # pls.fit(Xbneighbors.T, Xb[j, :].T)
        pls.fit(f_neighbors.T, features[j, :].T)
        features2[j, :] = pls.predict(f_neighbors.T).T
        # X2[j, :] = pls.predict(Xbneighbors.T).T
    print(features2.shape)
    print(components.shape)
    X2 = features2 @ components
    return X2
def plot_embedding(df, labels, method='tSNE', cmap='tab20', figsize=(6, 6),
                   markersize=50, show_legend=True, return_emb=False,
                   save=False, save_emb=False):
    """
    df: DataFrame, nsamples x nfeatures
    labels: labels for each sample
    """
    df = df.fillna(0)
    if method == 'tSNE':
        from sklearn.manifold import TSNE
        X = TSNE(n_components=2, random_state=124).fit_transform(df)
    if method == 'UMAP':
        from umap import UMAP
        X = UMAP(n_neighbors=30, min_dist=0.1).fit_transform(df)
    if method == 'PCA':
        from sklearn.decomposition import PCA
        X = PCA(n_components=2, random_state=124).fit_transform(df)
    if method == 'PLS':
        from sklearn.cross_decomposition import PLSRegression
        from sklearn.preprocessing import LabelEncoder
        encode = LabelEncoder()
        ref = encode.fit_transform(labels)
        pls2 = PLSRegression(n_components=2)
        pls2.fit(df, ref)
        X = pls2.x_scores_

    plt.figure(figsize=figsize)
    # Pick a palette; fall back on the class count when no cmap is given
    # (the original referenced an undefined `classes` here)
    classes = np.unique(labels)
    if cmap is None:
        if len(classes) <= 10:
            cmap = 'tab10'
        elif len(classes) <= 20:
            cmap = 'tab20'
        else:
            cmap = 'husl'
    palette = sns.color_palette(cmap, n_colors=len(classes))

    ax = sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=labels, palette=palette,
                         marker="o", legend='full', s=markersize)
    ax.tick_params(axis='x', bottom=True, top=False, labeltop=False,
                   labelbottom=True, labelsize=12, length=3, pad=3)
    ax.tick_params(axis='y', left=True, right=False, labelright=False,
                   labelleft=True, labelsize=12, length=3, pad=3)
    ax.set_xlabel('{}_1'.format(method), fontsize=15, labelpad=10, va='center')
    ax.set_ylabel('{}_2'.format(method), rotation=90, fontsize=16,
                  labelpad=10, va='center')

    if save:
        plt.savefig(save, format='pdf', bbox_inches='tight')
    else:
        plt.show()
    if save_emb:
        np.savetxt(save_emb, X)
    if return_emb:
        return X
def PLSCrossValidation(n_components, trainSet, validationSet):
    pls = PLSRegression(n_components=n_components)
    pls.fit(trainSet[predictorList], trainSet['Apps'])
    predictPls = pls.predict(validationSet[predictorList])
    # .ravel() instead of .flat: flatiter does not support arithmetic
    different = predictPls.ravel() - validationSet['Apps']
    error_rate = np.mean(different ** 2)
    return error_rate
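# --- Usage sketch (illustrative): sweep component counts with
# PLSCrossValidation and keep the best. `predictorList` is a module-level
# name the function reads, so the demo defines it before calling.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
predictorList = ['x1', 'x2', 'x3']
demo = pd.DataFrame(rng.normal(size=(80, 3)), columns=predictorList)
demo['Apps'] = 3 * demo['x1'] + rng.normal(scale=0.1, size=80)
trainSet, validationSet = demo.iloc[:60], demo.iloc[60:]
errors = [PLSCrossValidation(n, trainSet, validationSet) for n in range(1, 4)]
print(int(np.argmin(errors)) + 1, 'components minimize validation MSE')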
def train_PLSR(x_filename, y_filename, model_filename, n):
    """
    Train a PLSR model and save it to model_filename.
    X and Y matrices are read from x_filename and y_filename.
    The number of PLSR components is given by n.
    """
    X = loadMatrix(x_filename)[0].todense()
    Y = loadMatrix(y_filename)[0].todense()
    if X.shape[0] != Y.shape[0]:
        sys.stderr.write("X and Y must have equal number of rows!\n")
        raise ValueError
    sys.stderr.write("Learning PLSR...")
    startTime = time.time()
    pls2 = PLSRegression(copy=True, max_iter=10000, n_components=n,
                         scale=True, tol=1e-06)
    pls2.fit(X, Y)
    # Pickle needs a binary-mode file handle
    with open(model_filename, 'wb') as model:
        pickle.dump(pls2, model, 1)
    endTime = time.time()
    sys.stderr.write(" took %ss\n" % str(round(endTime - startTime, 2)))
def fit(self, X, y):
    self.X = X
    pls = PLSRegression(n_components=self.n_comps)
    # Fit data
    pls.fit(self.X, y)
    # Get X scores
    self.T = pls.x_scores_
    # Get X loadings
    self.P = pls.x_loadings_
    # Calculate error array
    self.Err = self.X - np.dot(self.T, self.P.T)
    # Calculate Q-residuals (sum over the rows of the error array)
    self.Q = np.sum(self.Err ** 2, axis=1)
    # Calculate Hotelling's T-squared (data are normalised by default)
    self.Tsq = np.sum((pls.x_scores_ / np.std(pls.x_scores_, axis=0)) ** 2,
                      axis=1)
    # Calculate the confidence level for T-squared from the ppf of the
    # F distribution
    self.Tsq_conf = (f.ppf(q=self.conf, dfn=self.n_comps,
                           dfd=self.X.shape[0])
                     * self.n_comps * (self.X.shape[0] - 1)
                     / (self.X.shape[0] - self.n_comps))
    # Estimate the confidence level for the Q-residuals by stepping the
    # threshold down until the target fraction of points falls below it
    i = np.max(self.Q) + 1
    while 1 - np.sum(self.Q > i) / np.sum(self.Q > 0) > self.conf:
        i -= 1
    self.Q_conf = i
    self._fitted = True
class PLS():
    """
    Implement PLS so it is compliant with the other dimensionality
    reduction methods (simple class rewrapping).
    """

    def __init__(self, n_components=10):
        self.clf = PLSRegression(n_components)

    def get_components_(self):
        return self.clf.x_weights_.transpose()

    def set_components_(self, x):
        pass

    components_ = property(get_components_, set_components_)

    def fit(self, X, y):
        self.clf.fit(X, y)
        return self

    def transform(self, X):
        return self.clf.transform(X)

    def predict(self, X):
        return self.clf.predict(X)
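# --- Usage sketch (illustrative): the PLS wrapper exposes a components_
# property so it can stand in for PCA-style reducers.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(40, 20))
y_demo = rng.normal(size=40)
reducer = PLS(n_components=5).fit(X_demo, y_demo)
print(reducer.transform(X_demo).shape)  # (40, 5) latent scores
print(reducer.components_.shape)        # (5, 20), transposed x_weights_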
def cv_predict(self, n_folds, max_components):
    x_tr, x_te, y_tr, y_te = baseCV.CV(self, self.x, self.y, n_folds)
    y_predict_all = np.ones((1, max_components))
    # pls = _NIPALS(max_components)
    for i in range(n_folds):
        y_predict = np.zeros((x_te[i].shape[0], max_components))
        xtrainmean = np.mean(x_tr[i], axis=0)
        ytrainmean = np.mean(y_tr[i], axis=0)
        xte_center = np.subtract(x_te[i], xtrainmean)
        yte_center = np.subtract(y_te[i], ytrainmean)
        # W, T, P, C, U, Q, lists_coefs_B = pls.fit(x_tr[i], y_tr[i], max_components)
        # print(lists_coefs_B)
        # Component counts start at 1, so column 0 of y_predict stays zero
        for j in range(1, max_components, 1):
            pls2 = PLSRegression(j)
            pls2.fit(x_tr[i], y_tr[i])
            # print(pls2.coef_)
            y_pre_center = np.dot(xte_center, pls2.coef_)
            # y_pre_center = np.dot(xte_center, lists_coefs_B[j])
            Y_pre = y_pre_center + ytrainmean
            y_predict[:, j] = Y_pre.ravel()
        y_predict_all = np.vstack((y_predict_all, y_predict))
    y_predict_all = y_predict_all[1:]  # drop the dummy first row
    return y_predict_all, self.y
def fitPLS(input_data):
    # Step 1 - unpack the input JSON into a plain data array
    input_df = pd.DataFrame(input_data)
    colnames = list(input_df.columns.values)
    colnames[len(colnames) - 1] = 'Intercept'
    train_arr = np.zeros(shape=input_df.shape)
    for i in range(train_arr.shape[0]):
        for j in range(train_arr.shape[1]):
            train_arr[i, j] = float(input_df.values[i, j]['value'])

    # Step 2 - fit the model. X: independent variables; Y: output variable
    X, Y = train_arr[:, :-1], train_arr[:, -1]
    n_best = 2
    best_fit = PLSRegression(n_components=n_best, scale=False)
    best_fit.fit(X, Y)

    # Step 3 - retrieve regression coefficients and pack them into JSON
    model_coef = best_fit.coef_
    y_intercept = best_fit.y_mean_ - np.dot(best_fit.x_mean_, best_fit.coef_)
    model_vector = np.append(model_coef, y_intercept)
    model_list = model_vector.tolist()
    model_dict = dict(zip(colnames, model_list))
    model_js = json.dumps(model_dict)
    return model_js
def getPLSRegression(differenceMatrix):
    similarityRatings = np.zeros([80])
    feature_length = 103
    # (80, 103): 103 difference values for each of the 80 ratings
    featureSet = scale(differenceMatrix)
    # (unused locals X1, normalizedDifference, and differenceArray from the
    # original have been dropped)
    i = 0
    with open('average-similarity-ratings.csv') as csvfile:
        reader = csv.reader(csvfile, delimiter=",")
        for row in reader:
            similarityRatings[i] = float(row[0])
            i += 1
    y = np.nan_to_num(similarityRatings)  # (80,)
    PLSreg = PLSRegression(n_components=16)
    PLSreg.fit(featureSet, y)
    print(featureSet.shape, 'featureset shape')
    return PLSreg, similarityRatings
def PLSR_LOOCV(data):
    '''Performs LOOCV on the data and returns the R2Y value'''
    R2Y = 0
    predVal = []
    for i in range(len(data[:, 0])):
        # Build the train/test split, leaving out sample i
        train = np.zeros((len(data[:, 0]) - 1, 8))
        test = np.zeros((1, 8))
        for j in range(len(data[:, 0])):
            if j < i:
                train[j, :] = data[j, :]
            elif j > i:
                train[j - 1, :] = data[j, :]
            else:
                test[0, :] = data[j, :]
        testScaled = np.zeros((1, 8))
        trainScale = StandardScaler()
        trainScaled = trainScale.fit_transform(train)
        testScaled[0, :] = trainScale.transform(test)
        PLSR = PLSRegression(n_components=2)
        PLSR.fit(trainScaled[:, 2:6], trainScaled[:, 0])
        pred = PLSR.predict(testScaled[:, 2:6])
        predVal.append(np.squeeze(pred))
    scaledData = scaler(data)
    R2Y = 1 - np.sum((predVal - scaledData[:, 0]) ** 2) / np.sum(scaledData[:, 0] ** 2)
    return R2Y
def feature_clustering(x, y, fc):
    if fc == 'True':
        plsca = PLSRegression(n_components=200)
        plsca.fit(x, y)
        x3 = plsca.transform(x)
        string = "pls_"
        # Python 2 backtick repr replaced with str()
        pls_column_name = [string + str(i) for i in range(x3.shape[1])]
def fit_plt(dados, ncomp):
    from sklearn.cross_decomposition import PLSRegression
    colmap = [
        (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.41, 0.41, 0.41),
        (0, 1, 1), (0.58, 0, 0.82), (0, 0.50, 0), (0.98, 0.50, 0.44),
        (1, 1, 0.87), (0.39, 0.58, 0.92), (0.50, 0.50, 0), (1, 0.89, 0.76),
        (0.96, 0.96, 0.86), (0, 1, 1)]
    g = dados['g']
    r = dados['r']
    wn = dados['wn']
    pls = PLSRegression(n_components=ncomp)
    pls.fit(r, g)
    Y_pred = pls.predict(r)

    plt.figure()
    plt.subplot(2, 1, 1)
    for i in range(1, g.max() + 1):
        sel = g == i
        plt.scatter(g[sel], Y_pred[sel], color=colmap[i])
    plt.xlabel('Y_class', fontsize=12)  # kwarg is 'fontsize', not 'Fontsize'
    plt.ylabel('Y_predicted', fontsize=12)
    plt.xticks(np.arange(1, g.max() + 1), str(dados['arqs']).split('::'))

    plt.subplot(2, 1, 2)
    for i in range(1, g.max() + 1):
        sel = g == i
        plt.hist(Y_pred[sel])
    plt.xlabel('Y_class', fontsize=12)
    plt.ylabel('histogram', fontsize=12)
    plt.xticks(np.arange(1, g.max() + 1), str(dados['arqs']).split('::'))
def pls_cv(self, ncomp_range=range(1, 21), plot=False, verbose=False,
           osc_params=(10, 1)):
    # Separate X from Y for PLS
    X = self.df[self.freqs].to_numpy()
    Y = self.df[self.y_name].to_numpy().reshape(-1, 1)
    sample_std = np.std(self.df[self.y_name])

    # CV based on measurement day
    if self.cval == "MD":
        cv = LeaveOneGroupOut()
        folds = list(cv.split(X=X, y=Y, groups=self.df[self.date_name]))
    # k-fold CV
    elif self.cval == "kfold":
        cv = KFold(n_splits=self.cval_param)
        folds = list(cv.split(X))
    else:
        raise InputError("Invalid CV type!")

    # Array for storing CV errors
    cv_RMSE_all = np.zeros([len(folds), len(ncomp_range)])
    i = 0
    for train, val in folds:
        # If an OSC model is specified, pre-filter the fold
        if len(osc_params) == 2:
            osc = OSC(nicomp=osc_params[0], ncomp=osc_params[1])
            osc.fit(X[train], Y[train])
            X_train_osc = osc.X_osc
            X_val_osc = osc.transform(X[val])
        j = 0
        for ncomp in ncomp_range:
            pls = PLSRegression(n_components=ncomp, scale=False)
            if len(osc_params) == 2:
                pls.fit(X_train_osc, Y[train])
                cv_RMSE_all[i, j] = metrics.mean_squared_error(
                    Y[val], pls.predict(X_val_osc)) ** 0.5
            else:
                pls.fit(X[train], Y[train])
                cv_RMSE_all[i, j] = metrics.mean_squared_error(
                    Y[val], pls.predict(X[val])) ** 0.5
            j = j + 1
        i = i + 1

    # Print and plot the CV results
    cv_RMSE_ncomp = np.mean(cv_RMSE_all, axis=0)
    cv_RPD_ncomp = sample_std / cv_RMSE_ncomp
    if plot:
        fig = plt.figure(figsize=(12, 8))
        plt.gca().xaxis.grid(True)
        plt.xticks(ncomp_range)
        plt.ylabel("RPD")
        plt.xlabel("Number of components")
        plt.plot(ncomp_range, cv_RPD_ncomp)

    # Best model
    rpd_best = max(cv_RPD_ncomp)
    ncomp_best = ncomp_range[cv_RMSE_ncomp.argmin()]
    if verbose:
        print("Best RMSE: ", min(cv_RMSE_ncomp))
        print("Best RPD: ", max(cv_RPD_ncomp))
        print("Number of latent components: ",
              ncomp_range[cv_RMSE_ncomp.argmin()])
    return (ncomp_best, rpd_best)
def plot_pls_results(x_data, y_data, pls_components, num_variables):
    pls = PLSRegression(pls_components)
    cv_splitter = GroupShuffleSplit(n_splits=1, test_size=0.35,
                                    random_state=6)
    group_splitter = data_full['Leaf number']
    for train_index, test_index in cv_splitter.split(x_data, y_data,
                                                     group_splitter):
        x_train, x_test = x_data.iloc[train_index], x_data.iloc[test_index]
        y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
        pls.fit(x_train, y_train)
        y_pred_train = pls.predict(x_train)
        y_pred_test = pls.predict(x_test)

        r2_test = r2_score(y_test, y_pred_test)
        r2_train = r2_score(y_train, y_pred_train)
        mae_test = mean_absolute_error(y_test, y_pred_test)
        mae_train = mean_absolute_error(y_train, y_pred_train)
        print(r2_test, mae_test)
        print(r2_train, mae_train)

        plt.scatter(y_train, y_pred_train, c='blue', label='Training Set')
        plt.scatter(y_test, y_pred_test, c='red', label='Test Set')
        _line = np.linspace(0.2, 1.2)
        # plt.plot(_line, _line, c='indigo', linestyle='dashed')
        # plt.plot(_line, _line + .06, c='darkslategray', linestyle='dashed')
        # plt.plot(_line, _line - .06, c='darkslategray', linestyle='dashed')
        # left_annote_pos = 0.20
        # plt.annotate("Training Median Absolute Error = {}".format(0.059),
        #              (left_annote_pos, 1.1), fontsize=12)
        # plt.annotate("Testing Median Absolute Error = {}".format(0.07),
        #              (left_annote_pos, 1.02), fontsize=12)
        # plt.annotate(u"Training R\u00B2 = {}".format(0.83),
        #              (left_annote_pos, .95), fontsize=12)
        # plt.annotate(u"Testing R\u00B2 = {}".format(0.82),
        #              (left_annote_pos, .89), fontsize=12)
        # plt.xlabel('Measured Chlorophyll b (ug/ml)', fontsize=16)
        # plt.ylabel('Predicted Chlorophyll b (ug/ml)', fontsize=16)
        # plt.title("Chlorophyll b Model for AS7262\nbased on 2-Component\n"
        #           "Partial Least Squared Model", fontsize=18)
        # plt.legend(loc='lower right', fontsize=12)
        plt.tight_layout()
        plt.show()
        plt.scatter(y_pred_train, y_train, c='blue', label='Training Set')
        plt.scatter(y_pred_test, y_test, c='red', label='Test Set')
        plt.show()
def compute_q2_pls(tdata, tlabel, vdata, vlabel, Rval):
    test = PLSRegression(n_components=Rval)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        test.fit(matricize(tdata), matricize(tlabel))
        Y_pred = test.predict(matricize(vdata))
        Q2 = qsquared(matricize(vlabel), matricize(Y_pred))
    return Q2
def test_regressor_fit(pls_regressor):
    X = np.random.rand(10, 10)
    y = np.random.rand(10)
    sklearn_regressor = PLSRegression().fit(X, y)
    assert pls_regressor.fit(X, y)
    assert pls_regressor.fit(X[:, 0:1], y)
    with pytest.raises(ValueError):
        sklearn_regressor.fit(X[:, 0:1], y)
def test_n_components_bounds_pls_regression(n_components, err_type, err_msg):
    """Check the validation of `n_components` for `PLSRegression`."""
    rng = np.random.RandomState(0)
    X = rng.randn(10, 5)
    Y = rng.randn(10, 3)
    est = PLSRegression(n_components=n_components)
    with pytest.raises(err_type, match=err_msg):
        est.fit(X, Y)
def run_pls(X, Y, LV):
    model = PLSRegression(n_components=LV, scale=False)
    model.fit(X, Y)
    Yr = [y[0] for y in model.predict(X).tolist()]
    r2, sdec = calc_regr_metrics(Y_exp=Y, Y_pred=Yr)
    q2, sdep, variables.Y_pred = regr_loo(X=np.array(X), Y=np.array(Y),
                                          M=model)
    scores = {'R2': r2, 'Q2': q2, 'SDEC': sdec, 'SDEP': sdep}
    return scores, model
def Training(df, seed, yratio, xratio, index=1):
    snp_matrix = np.array(df.values)
    xdim, ydim = snp_matrix.shape
    ydimlist = list(range(0, ydim))  # list() so shuffle works in Python 3
    xdimlist = list(range(0, xdim))
    random.seed(seed)
    random.shuffle(ydimlist)  # shuffle the individuals
    random.shuffle(xdimlist)  # shuffle the SNPs
    accuracy = 0
    snp_matrix_shuffle = np.copy(snp_matrix[:, ydimlist])
    # Row-shuffle the column-shuffled matrix (the original indexed
    # snp_matrix again here, silently discarding the column shuffle)
    snp_matrix_shuffle = np.copy(snp_matrix_shuffle[xdimlist, :])
    snp_matrix_train = snp_matrix_shuffle[:, 0:int(ydim * yratio)]
    snp_matrix_test = snp_matrix_shuffle[:, int(ydim * yratio):]
    snp_matrix_train_x = snp_matrix_train[0:int(xdim * xratio), :]
    snp_matrix_test_x = snp_matrix_test[0:int(xdim * xratio), :]
    for i in range(int(xdim * xratio), xdim):
        snp_matrix_train_y = snp_matrix_train[i, :]
        snp_matrix_test_y = snp_matrix_test[i, :]
        if index != 7:
            if index == 1:
                clf = AdaBoostClassifier(n_estimators=100)
            elif index == 2:
                clf = RandomForestClassifier(n_estimators=100)
            elif index == 3:
                clf = linear_model.LogisticRegression(C=1e5)
            elif index == 4:
                clf = svm.SVC(kernel='rbf')
            elif index == 5:
                clf = svm.SVC(kernel='poly')
            else:
                clf = svm.SVC(kernel='linear')
            clf = clf.fit(snp_matrix_train_x.T, snp_matrix_train_y)
            Y_pred = clf.predict(snp_matrix_test_x.T)
            prediction = snp_matrix_test_y - Y_pred
            wrong = np.count_nonzero(prediction)
            tmp = 1 - (wrong + 0.0) / len(prediction)
            print(tmp)
            accuracy += tmp
    accuracy = accuracy / (xdim - int(xdim * xratio))
    if index == 7:
        pls2 = PLSRegression(n_components=50, scale=False, max_iter=1000)
        snp_matrix_train_y = snp_matrix_train[int(xdim * xratio):, :]
        pls2.fit(snp_matrix_train_x.T, snp_matrix_train_y.T)
        snp_matrix_test_x = snp_matrix_test[0:int(xdim * xratio), :]
        snp_matrix_test_y = snp_matrix_test[int(xdim * xratio):, :]
        Y_pred = transform(pls2.predict(snp_matrix_test_x.T))
        prediction = snp_matrix_test_y - Y_pred.T
        xdim, ydim = prediction.shape
        wrong = np.count_nonzero(prediction)
        accuracy = 1 - wrong / (xdim * ydim + 0.0)
    return accuracy
def fit(predictors, predictands, log=False, **kwargs):
    model = PLSRegression(n_components=2)
    try:
        model.fit(predictors, predictands)
    except Exception:  # avoid a bare except that would swallow SystemExit
        return None
    return model
def trainmodels(m, x, y, iter=1000):
    '''For the model type m, train a model on x -> y using built-in CV to
    parameterize. Return both this model and an unfit model that can be
    used for CV. Note for PLS we cheat a little bit since there isn't a
    built-in CV trainer.
    '''
    if m == 'pls':
        # have to manually cross-validate to choose the number of components
        kf = KFold(n_splits=3)  # modern sklearn API; old KFold(len(y), n_folds=3)
        bestscore = -10000
        besti = 0
        for i in range(1, min(100, len(x[0]))):
            # try a larger number of components until average CV perf decreases
            pls = PLSRegression(i)
            scores = []
            # TODO: parallelize below
            for train, test in kf.split(x):
                xtrain = x[train]
                ytrain = y[train]
                xtest = x[test]
                ytest = y[test]
                pls.fit(xtrain, ytrain)
                score = scoremodel(pls, xtest, ytest)
                scores.append(score)
            ave = np.mean(scores)
            if ave < bestscore * 0.95:  # getting significantly worse
                break
            elif ave > bestscore:
                bestscore = ave
                besti = i
        model = PLSRegression(besti)
        model.fit(x, y)
        unfit = PLSRegression(besti)  # choose number of components using full data - iffy
        print("PLS components =", besti)
    elif m == 'lasso':
        model = LassoCV(n_jobs=-1, max_iter=iter)
        model.fit(x, y)
        unfit = LassoCV(n_jobs=-1, max_iter=iter)  # (alpha=model.alpha_)
        print("LASSO alpha =", model.alpha_)
        return (model, unfit)
    elif m == 'ridge':
        model = RidgeCV()
        model.fit(x, y)
        print("Ridge alpha =", model.alpha_)
        unfit = RidgeCV()
    else:
        model = ElasticNetCV(n_jobs=-1,
                             l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                             max_iter=iter)
        model.fit(x, y)
        print("Elastic alpha =", model.alpha_, " l1_ratio =", model.l1_ratio_)
        unfit = ElasticNetCV(n_jobs=-1, max_iter=iter)
    return (model, unfit)
def get_correlations(param, spec, wave):
    '''Returns correlations between spec and params by wavelengths'''
    # using PLS
    pls = PLSRegression(10)
    pls.fit(spec, param)
    # get correlations (the attribute is coef_, not coefs)
    nparam = param.shape[1]
    cor = pls.coef_ * np.asarray([pls.x_std_] * nparam).T
    cor /= np.tile(pls.y_std_, (cor.shape[0], 1))
    return cor
def pls_approach():
    from sklearn.cross_decomposition import PLSRegression

    (X, Y), cities = pull_xy_data()

    pls = PLSRegression()
    pls.fit(X, Y)

    plsX, plsY = pls.transform(X, Y)

    plot(plsX, cities, ["Lat01", "Lat02", "Lat03"], ellipse_sigma=1)

    return "OK What Now?"
def do_pls(X, Y):
    pls2 = PLSRegression(n_components=2)
    pls2.fit(X, Y)
    out = pls2.transform(X)
    print(out)
    print(out.shape)
    plt.title("PLS2")
    plt.xlabel("PL1")
    plt.ylabel("PL2")
    plt.grid()
    plt.scatter(out[:, 0], out[:, 1], c=Y, cmap='viridis')
    plt.savefig('pls.png', dpi=125)
class PLSPredictor:
    def __init__(self):
        self.pls2 = PLSRegression(n_components=2,
                                  scale=True,
                                  max_iter=500,
                                  tol=1e-06,
                                  copy=True)

    def predict(self, values):
        # Return the prediction (the original dropped the return value)
        return self.pls2.predict(values)

    def train(self, measured_values, screen_points):
        self.pls2.fit(measured_values, screen_points)
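# --- Usage sketch (illustrative): calibrate PLSPredictor on paired
# (measurement, screen point) data, then map new measurements to screen
# coordinates.
import numpy as np

rng = np.random.RandomState(0)
measured = rng.normal(size=(30, 6))
screen = measured[:, :2] @ rng.normal(size=(2, 2))  # 2-D targets
p = PLSPredictor()
p.train(measured, screen)
print(p.predict(measured[:3]))  # three predicted screen points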
def __one_pls(self, cat):
    np.seterr(all='raise')
    # One-vs-rest labels: +1 for the target category, -1 otherwise
    lcat = np.zeros(self.train_set['labels'].size)
    lcat[self.train_set['labels'] != cat] = -1
    lcat[self.train_set['labels'] == cat] = +1
    pls = PLSRegression(n_components=2, scale=False)
    pls.fit(self.train_set['data'], lcat)
    return pls
def fit(self, predictors, predictands, locations, log=False, **kwargs):
    self.locations = locations
    self.models = []
    self.n = predictors['n']
    id = 0
    for location in locations:
        X = extract_n_by_n(predictors, location, **kwargs)
        Y = predictands[:, id]
        if log:
            Y = np.log(Y)
        # pca = PCA(n_components='mle', whiten=True)
        model = PLSRegression(n_components=2)
        model = model.fit(X, Y)
        # components = pca.components_
        # pca.components_ = components
        self.models.append(model)
        print("pls: ", location, model.score(X, Y), model.x_loadings_.shape,
              np.argmax(model.x_loadings_, axis=0))
        id += 1
def build_model(X, y):
    # gbr = GradientBoostingRegressor(learning_rate=0.03, n_estimators=2000, max_depth=8, subsample=0.9)
    # rf = RandomForestRegressor(n_estimators=200)
    # lr = LinearRegression(fit_intercept=True)
    # knr = KNeighborsRegressor(n_neighbors=10, weights='uniform')
    # svr = SVR(C=5.0, kernel='linear')
    pls = PLSRegression(n_components=35)
    return pls.fit(X, y)
def reduce_PLS(dataframe):
    PLS_file = "data/pls_structure.pickle"
    selectedcolumn = [x for x in dataframe.columns
                      if x not in ["id", "click", "device_id", "device_ip"]]
    X = np.array(dataframe[selectedcolumn])
    y = np.array(dataframe["click"])
    if os.path.exists(PLS_file):
        stand_PLS = pickle.load(open(PLS_file, 'rb'))
        print("PLS structure is loaded.")
    else:
        stand_PLS = PLSRegression(n_components=10, scale=True)
        stand_PLS.fit(X, y[:, np.newaxis])
        # Drop the score matrices so the pickle stays small
        stand_PLS.y_scores_ = None
        stand_PLS.x_scores_ = None
        pickle.dump(stand_PLS, open(PLS_file, "wb"))
        print("PLS transform structure is stored.")
    T = stand_PLS.transform(X)
    print("PLS transformation is performed.")
    return T
def pls_regr(x, y):
    from sklearn.cross_decomposition import PLSRegression
    n = len(x[0])
    if n < 2:
        raise TypeError
    score = -999999999999
    pls = None
    '''
    for i in range(3, n):
        pls2 = PLSRegression(n_components=i)
        pls2.fit(x, y)
        cscore = pls2.score(x, y)
        # print i, cscore
        if cscore > score:
            pls = pls2
            score = cscore
    '''
    pls = PLSRegression(n_components=5)
    pls.fit(x, y)
    return pls
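# --- Usage sketch (illustrative): pls_regr needs at least two feature
# columns (it raises TypeError otherwise) and fits a fixed 5-component model,
# so supply at least five features.
import numpy as np

rng = np.random.RandomState(0)
x_demo = rng.normal(size=(50, 8)).tolist()
y_demo = rng.normal(size=50).tolist()
model = pls_regr(x_demo, y_demo)
print(model.score(x_demo, y_demo))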
def lex_function_learning(class_name, hyper_vec):
    # pls2 = KernelRidge(kernel="rbf", gamma=100)
    # pls2 = KernelRidge()
    pls2 = PLSRegression(n_components=50, max_iter=5000)
    X = extract_postive_features(train_dataset[class_name][0],
                                 train_dataset[class_name][1])
    Y = []
    for hypo_vec in X:
        # Target = difference vector (hypernym_vector - hyponym_vector)
        sub = hyper_vec - hypo_vec
        Y.append(sub)
        # Y.append(hyper_vec)  # Target = hypernym vector
    pls2.fit(X, Y)
    train_acc = pls2.score(X, Y)
    print("class = ", class_name, "train len = ", len(X))
    return pls2, train_acc, len(X)
def hacerPLS(X, Y):
    pls_wild_b = PLSRegression(n_components=9)
    pls_wild_b.fit(X, Y)
    Z = pls_wild_b.transform(X)
    scores = list()
    scores_std = list()
    n_features = np.shape(X)[1]
    # modern sklearn: train_test_split/KFold live in sklearn.model_selection
    X, X_test_tot, Y, Y_test_tot = train_test_split(X, Y, test_size=0.5,
                                                    random_state=0)
    N = np.shape(X)[0]
    for num_comp in range(n_features):
        kf = KFold(n_splits=10)
        aux_scores = list()
        for train, test in kf.split(X):
            X_train, X_test = X[train], X[test]
            y_train, y_test = Y[train], Y[test]
            if num_comp == 0:
                y_pred = np.mean(y_test)
                y_pred = y_pred * np.ones(np.shape(y_test))
                aux_scores.append(metrics.mean_squared_error(y_test, y_pred))
            else:
                pls_foo = PLSRegression(n_components=num_comp)
                pls_foo.fit(X_train, y_train)
                y_pred = pls_foo.predict(X_test)
                # obtain the score
                this_score = metrics.mean_squared_error(y_test, y_pred)
                aux_scores.append(this_score)
        scores.append(np.mean(aux_scores))
        scores_std.append(np.std(aux_scores))
    plt.plot(scores)
    plt.xlabel('Components')
    plt.ylabel("$MSE$")
    plt.title("Animals PLS")
    plt.show()
    num_comp = np.argmin(scores)
    # Note: the original refits with 2 components rather than the CV-chosen
    # num_comp; kept as-is
    pls_pred = PLSRegression(n_components=2)
    pls_pred.fit(X, Y)
    y_pred_test = pls_pred.predict(X_test_tot)
    print("MSE test = " + str(metrics.mean_squared_error(Y_test_tot, y_pred_test)))
train = pd.read_csv('train.csv', index_col='id')
targets = pd.get_dummies(train.target)
train.drop('target', axis=1, inplace=True)
train = train.apply(np.log1p)

test = pd.read_csv('test.csv', index_col='id')
test = test.apply(np.log1p)

Xt, Xv, yt, yv = train_test_split(train, targets, test_size=0.2,
                                  random_state=27)

best = 10.
for n in range(5, 16):
    clf = PLSRegression(n_components=n)
    clf.fit(Xt, yt)
    y_pred = clf.predict(Xv)
    loss = multiclass_log_loss(np.argmax(y_pred, axis=1), y_pred)
    if loss < best:
        n_best = n
        best = loss
        postfix = '(*)'
    else:
        postfix = ''
    print('comps: {:02d}\tLoss:{:5.4f} {}'.format(n, loss, postfix))

clf = PLSRegression(n_components=n_best)
clf.fit(train, targets)
y_pred = clf.predict(test)
if i == 0:
    plt.ylabel('1st component')
elif i == 1:
    plt.ylabel('2nd component')
else:
    plt.ylabel('3rd component')
axis_c = plt.gca()
axis_c.set_xticklabels(wild_boar_ddbb['header'][3:], fontsize=7)
axis_c.set_xticks(axis_c.get_xticks() + 0.5)

# Select the number of components using CV
# %%
# PLSR
pls_wild_b = PLSRegression(n_components=3)
pls_wild_b.fit(X_train_prepro, Y_train)
X_train_pls_proj = pls_wild_b.transform(X_train_prepro)

print("loadings")
for i in range(pls_wild_b.n_components):
    plt.figure()
    plt.bar(np.arange(np.shape(X_train_prepro)[1]),
            pls_wild_b.x_loadings_[:, i])
    if i == 0:
        plt.ylabel('PLS 1st component')
    elif i == 1:
        plt.ylabel('PLS 2nd component')
    else:
        plt.ylabel('PLS 3rd component')
    axis_c = plt.gca()
    axis_c.set_xticklabels(wild_boar_ddbb['header'][3:], fontsize=7)
    axis_c.set_xticks(axis_c.get_xticks() + 0.5)
plt.yticks(())
plt.show()

# #############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
(Xtrain, ytrain) = loadData(xtrainpath, ytrainpath)
(Xtest, ytest) = loadData(xtestpath, ytestpath)

# trim off background and scale
ytrain = ytrain[:, 1:]
# ytrain = scale(ytrain)
Xtrain = standardize(Xtrain)

# trim off background and scale
ytest = ytest[:, 1:]
# ytest = scale(ytest)
Xtest = standardize(Xtest)

pls = PLSRegression(n_components=10)
pls.fit(Xtrain, ytrain)
y_pls = pls.predict(Xtest)
print(1 + pls.score(Xtest, ytest))

pls_rmse = []
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 0], y_pls[:, 0])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 1], y_pls[:, 1])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 2], y_pls[:, 2])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 3], y_pls[:, 3])))

fig = plt.figure(figsize=(20, 10))
ax1 = fig.add_subplot(241)
ax1.plot(y_pls[:, 0], c='r', label='PLS Fit')
ax1.plot(ytest[:, 0], c='grey', label='Target')
def generate(self, input=None):
    dso = input
    _experiment_test = self.config.get('experiment_test')
    _experiment_control = self.config.get('experiment_control')

    data = dso.data

    plsr = PLSRegression(n_components=self.config.get('number_of_components'),
                         scale=self.config.get('autoscale'))
    # , algorithm=self.config.get('algorithm'))
    Y = np.array([0 if c == _experiment_control else 1
                  for c in dso.classes[0]])
    plsr.fit(data, Y)  # Transpose it, as vars need to be along the top

    # Build scores into a dso no_of_samples x no_of_principal_components
    scored = DataSet(size=(len(plsr.x_scores_), len(plsr.x_scores_[0])))
    scored.labels[0] = input.labels[0]
    scored.classes[0] = input.classes[0]

    for n, s in enumerate(plsr.x_scores_.T):
        scored.data[:, n] = s
        scored.labels[1][n] = 'Latent Variable %d' % (n + 1)
        # , plsr.y_weights_[0][n])

    # PLS-DA regions; mean +- 95% confidence in each axis for each cluster.
    # (figure_regions was never initialized in the original; note also that
    # cw_x/cw_y are never populated here, so no regions are produced.)
    figure_regions = []
    cw_x = defaultdict(list)
    cw_y = defaultdict(list)
    for c in list(cw_x.keys()):
        # Calculate mean point
        cx = np.mean(cw_x[c])
        cy = np.mean(cw_y[c])
        # Calculate 95% CI
        rx = np.std(cw_x[c]) * 2  # 2sd = 95%  # 1.95 * SEM => 95% confidence
        ry = np.std(cw_y[c]) * 2
        figure_regions.append((c, cx, cy, rx, ry))

    # Label up the top 50 (the values are retained; just for clarity)
    wmx = np.amax(np.absolute(plsr.x_weights_), axis=1)
    dso_z = list(zip(dso.scales[1], dso.entities[1], dso.labels[1]))
    dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50
    dso_z = [x for x, wmx in dso_z]

    weightsd = DataSet(size=plsr.x_weights_.T.shape)
    weightsd.data = plsr.x_weights_.T
    weightsd.scales[1] = input.scales[1]

    dso_lv = {}
    for n in range(0, plsr.x_weights_.shape[1]):
        lvd = DataSet(size=(1, input.shape[1]))
        lvd.entities[1] = input.entities[1]
        lvd.labels[1] = input.labels[1]
        lvd.scales[1] = input.scales[1]
        lvd.data = plsr.x_weights_[:, n:n + 1].T
        dso_lv['lv%s' % (n + 1)] = lvd
        weightsd.labels[0][n] = "Weights on LV %s" % (n + 1)
        weightsd.classes[0][n] = "LV %s" % (n + 1)

    return dict(list({
        'dso': dso,
        'scores': scored,
        'weights': weightsd,
        # 'figure_data': figure_data,
        # 'figure_regions': figure_regions,
        'y_weights': plsr.y_weights_,
        'x_weights': plsr.x_weights_,
    }.items()) + list(dso_lv.items()))
plt.plot(nComponents, plsCanScores[i, :], lw=3)
plt.xlim(1, np.amax(nComponents))
plt.title('PLS Canonical accuracy')
plt.xlabel('Number of components')
plt.ylabel('accuracy')
plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'], loc='lower right')
plt.grid(True)

if (0):
    # %% PLS Regression
    nComponents = np.arange(1, nClasses + 1)
    plsRegScores = np.zeros((5, len(nComponents)))  # np.alen is deprecated
    for i, n in enumerate(nComponents):
        plsReg = PLSRegression(n_components=n)
        plsReg.fit(Xtrain, Ytrain)
        XtrainT = plsReg.transform(Xtrain)
        XtestT = plsReg.transform(Xtest)
        plsRegScores[:, i] = util.classify(XtrainT, XtestT,
                                           labelsTrain, labelsTest)

    plsReg = PLSRegression(n_components=2)
    plsReg.fit(Xtrain, Ytrain)
    xt = plsReg.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig, xt, labelsTrain, classColors)
    plt.title('First 2 components of projected data')

# %% Plot accuracies for PLSSVD
plt.figure()
def plsvip(X, Y, V, lat_var):
    attributes = len(X[0])
    if not lat_var:
        latent_variables = attributes
    else:
        latent_variables = lat_var
    num_instances = len(X)
    attributes_gone = []
    min_att = -1
    # start_time = time.time()
    # attr_time = time.time()
    while attributes > 0:
        if (latent_variables == 0) or (latent_variables > attributes):
            latent_variables = attributes
        lv_best = best_latent_variable(X, Y, latent_variables, num_instances)
        # print("current best lv: ", lv_best, "num. attr. ", attributes)
        # fin_pls = PLSCanonical(n_components=lv_best)
        fin_pls = PLSRegression(n_components=lv_best)
        fin_pls.fit(X, Y)
        currentR2 = fin_pls.score(X, Y)
        # Alternative R2 computation, kept for reference:
        # meanY4r2 = numpy.mean(Y)
        # predY = fin_pls.predict(X)
        # RSS = sum(numpy.power(Y[i] - predY[i], 2) for i in range(len(Y)))
        # TSS = sum(numpy.power(Y[i] - meanY4r2, 2) for i in range(len(Y)))
        # alterR2 = 1 - (RSS / TSS)
        min_vip = 1000
        if min_att == -1:
            attributes_gone.append(["None", currentR2, attributes, lv_best])
        # (A threaded VIP computation existed here in comments; the working
        # serial version follows.)
        for i in range(0, attributes):
            VIPcurrent = get_vip(fin_pls, lv_best, i, attributes_gone,
                                 attributes)
            if VIPcurrent < min_vip:
                min_vip = VIPcurrent
                min_att = i
        if min_att > -1:
            # CURRENT: to BE popped, NOT already popped
            attributes_gone.append([V[min_att], currentR2, attributes,
                                    lv_best])
            V.pop(min_att)
            for i in range(num_instances):
                X[i].pop(min_att)
            attributes -= 1
        # print(attributes_gone)
    return attributes_gone
# correct, not accurate
from sklearn.model_selection import train_test_split  # cross_validation was removed
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.cross_decomposition import PLSCanonical

df = pd.read_csv('newdata.csv')
x = df.drop(['tag'], axis=1)
y = df.drop(['kx', 'ky', 'kz', 'wa', 'wb', 'wc', 'wd', 'we', 'wf'], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=5)

plsr = PLSRegression()
plsr.fit(X_train, Y_train)
plsc = PLSCanonical()
plsc.fit(X_train, Y_train)
print(plsr.score(X_test, Y_test))
print(plsc.score(X_test, Y_test))
# Partial Least Squares Regression
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import scale
# modern sklearn: KFold/cross_val_score moved out of sklearn.cross_validation
from sklearn.model_selection import KFold, cross_val_score

X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

# Performing cross-validation for PLS
mse = []
n = len(X_train_scaled)
kf_10 = KFold(n_splits=10, shuffle=True, random_state=0)
for i in np.arange(1, 17):
    plsregr = PLSRegression(n_components=i, scale=False)
    plsregr.fit(X_train_scaled, y_train)
    score = -1 * cross_val_score(plsregr, X_train_scaled, y_train, cv=kf_10,
                                 scoring='neg_mean_squared_error').mean()
    mse.append(score)

plt.plot(np.arange(1, 17), np.array(mse), '-v')
plt.title("PLS: MSE vs. Principal Components")
plt.xlabel('Number of principal components in PLS regression')
plt.ylabel('MSE')
plt.xlim((-0.2, 17.2))

# Based on the plot, 12 principal components minimized MSE
plsregr_test = PLSRegression(n_components=12, scale=False)
plsregr_test.fit(X_train_scaled, y_train)
MSE_PLS = np.mean((plsregr_test.predict(X_test_scaled) - y_test) ** 2)
# print("Mean Squared Error: ", MSE_PLS)
X_levelOne = []
y_levelOne = []
level0Classifier = []
for tid, Xp, yp in zip(subjId_train, X_train, y_train):
    print("Predicting subject ", vid, "from subject ", tid)
    y0 = np.zeros(yp.shape)
    y1 = np.ones(Xt.shape[0])
    X = np.vstack([Xp, Xt])
    yd = np.concatenate([y0, y1])

    pls = PLSRegression(n_components)
    Xp_t, Xp_v, yp_t, yp_v = tts(Xp.copy(), yp.copy(), train_size=0.9)
    yp_t = yp_t.astype(bool)
    yp_t_not = np.vstack((yp_t, ~yp_t)).T
    # print("yp_t_not ", yp_t_not.shape)
    pls.fit(Xp_t, yp_t_not.astype(int))

    yp_new = pls.predict(Xp_t, copy=True)
    yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
    yp_t = yp_t.astype(int)
    # print(y_new, y_pred, y_t)
    error = ((yp_t - yp_pred) ** 2).sum()
    print("PLS Training error ", float(error) / yp_t.shape[0])

    yp_new = pls.predict(Xp_v, copy=True)
    yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
    error = ((yp_v - yp_pred) ** 2).sum()
    print("PLS Validation error ", float(error) / yp_v.shape[0])

    X_new = pls.transform(X)
    rf = RandomForestClassifier(n_estimators=500, max_depth=None,
                                max_features=int(math.sqrt(n_components)),
                                min_samples_split=100, random_state=144,
                                n_jobs=4)
'''
General Linear Model -- Elastic Net (disabled)
clf = linear_model.ElasticNet(alpha=0.2, l1_ratio=0.01)
clf.fit(x_scaled, y_scaled)
print(clf.coef_)
yvalid_scaled = clf.predict(xvalid_scaled)
err1 = MAPE(y, scalery.inverse_transform(clf.predict(x_scaled)).reshape(-1, 1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1, 1))
'''

# Partial Least Squares regression
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=20)
pls.fit(x_scaled, y_scaled)
print(pls.coef_)
yvalid_scaled = pls.predict(xvalid_scaled)
err1 = MAPE(y, scalery.inverse_transform(pls.predict(x_scaled)).reshape(-1, 1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1, 1))

from sklearn.decomposition import PCA
reduced_data = PCA(n_components=2).fit_transform(xtrain_minmax)
pca = PCA(n_components=2)
pca.fit(xtrain_minmax)
print(pca.explained_variance_ratio_)
import pandas as pd
import numpy as np

_experiment_test = config['experiment_test']
_experiment_control = config['experiment_control']

plsr = PLSRegression(n_components=config['number_of_components'],
                     scale=config['autoscale'])
# , algorithm=self.config.get('algorithm'))

# We need classes to do the classification; should check and raise an error
class_idx = input_data.index.names.index('Class')
classes = list(input_data.index.levels[class_idx])
Y = input_data.index.codes[class_idx]  # MultiIndex.labels was renamed .codes
plsr.fit(input_data.values, Y)

# Build scores into a dso no_of_samples x no_of_principal_components
scores = pd.DataFrame(plsr.x_scores_)
scores.index = input_data.index

scoresl = []
for n, s in enumerate(plsr.x_scores_.T):
    scoresl.append('Latent Variable %d' % (n + 1))
    # , plsr.y_weights_[0][n])
scores.columns = scoresl

weights = pd.DataFrame(plsr.x_weights_.T)
weights.columns = input_data.columns

dso_lv = {}
X = dataset["data"] y = dataset["target"] # Center each feature and scale the variance to be unitary X = preprocessing.scale(X) # Compute the variance for each column print(numpy.var(X, 0).sum()) # Now use PCA using 3 components pca = PCA(3) X2 = pca.fit_transform(X) print(numpy.var(X2, 0).sum()) pls = PLSRegression(3) pls.fit(X, y) X2 = pls.transform(X) print(numpy.var(X2, 0).sum()) # Make predictions using an SVM with PCA and PLS pca_error = 0 pls_error = 0 n_folds = 10 svc = LinearSVC() for train_inds, test_inds in KFold(X.shape[0], n_folds=n_folds): X_train, X_test = X[train_inds], X[test_inds] y_train, y_test = y[train_inds], y[test_inds] # Use PCA and then classify using an SVM
def bestpls(vipMatrix, X, Y, V):
    # Start from the first entry rather than sentinel values
    bestR2 = vipMatrix[0][1]
    lv_best = vipMatrix[0][3]
    position = 0
    for entries in range(len(vipMatrix)):
        if vipMatrix[entries][1] > bestR2:
            position = entries
            bestR2 = vipMatrix[entries][1]
            lv_best = vipMatrix[entries][3]

    # Not position + 1, as vipMatrix[position] holds the next variable
    # to be removed
    variables = []
    for i in range(1, position):
        variables.append(vipMatrix[i][0])

    V_new_Indices = []
    for i in variables:  # removed variable names, in random order
        V_new_Indices.append(V.index(i))

    V_new = deepcopy(V)
    for i in variables:
        V_new.remove(i)

    X_new = []
    for i in range(len(X)):
        X_new.append([])
    for i in range(len(X)):
        for j in range(len(V)):
            if j not in V_new_Indices:
                X_new[i].append(X[i][j])

    # Sanity check: the remaining variable names must still be sorted
    # (b64encode needs bytes in Python 3, hence b"tobulo")
    if not V_new == sorted(V_new):
        return base64.b64encode(b"tobulo"), [], [], 0

    # best_pls = PLSCanonical(n_components=lv_best)
    best_pls = PLSRegression(n_components=lv_best)
    best_pls.fit(X_new, Y)
    saveas = pickle.dumps(best_pls)
    encoded = base64.b64encode(saveas)
    return encoded, X_new, V_new, lv_best