def fitcv(self):
    # Cross-validated PLS-DA fit; requires numpy as np plus sklearn's
    # PLSRegression and KFold, and self.X, self.Y, self.n_components,
    # self.n_splits to be set on the instance.
    pls = PLSRegression(n_components=self.n_components, scale=False)
    kf = KFold(n_splits=self.n_splits)
    yTrue = None
    yHat = None
    # Determine whether Y is 1-D or 2-D
    dimensiony = len(self.Y.shape)
    for train_index, test_index in kf.split(self.X):
        X_train, X_test = self.X[train_index], self.X[test_index]
        y_train, y_test = self.Y[train_index], self.Y[test_index]
        pls.fit(X_train, y_train)
        if dimensiony == 1:
            # ravel() keeps this working whether predict returns (n,) or (n, 1)
            ypred = np.ravel(pls.predict(X_test))
        else:
            ypred = pls.predict(X_test)
        # Binarize the continuous PLS output into class labels {-1, +1}
        ypred[ypred > 0] = 1
        ypred[ypred < 0] = -1
        if yTrue is None:
            yTrue = y_test  # true values
            yHat = ypred    # predicted values
        else:
            yTrue = np.r_[yTrue, y_test]
            yHat = np.r_[yHat, ypred]
    err = yTrue - yHat
    errSampleNo = np.where(err != 0)
    err = err[err != 0]
    # Return the misclassification rate (%) and the misclassified sample indices
    return len(err) / len(self.X) * 100, errSampleNo
def get_score(X_train, X_test, y_train, y_test, nc):
    '''
    input: training and testing datasets
    output: r2 scores of the two methods -> pca_score, pls_score
    '''
    # PCA approach (principal component regression)
    pca = PCA(n_components=nc)
    X_train_reduced = pca.fit_transform(X_train)
    X_test_reduced = pca.transform(X_test)
    pcr = LinearRegression().fit(X_train_reduced, y_train)
    pca_score = pcr.score(X_test_reduced, y_test)
    predictions = pcr.predict(X_test_reduced)    # test-set predictions
    predictions1 = pcr.predict(X_train_reduced)  # training-set predictions
    print(predictions, predictions1)
    # NOTE: `item` must be defined in the enclosing scope
    plt.title("comparison of PLSR and PCA method (nc={}, {})".format(nc, item))
    plt.xlabel("observed")
    plt.ylabel("fitted")
    plt.scatter(y_test / 100, predictions / 100, label='pca')
    # PLS approach
    pls = PLSRegression(n_components=nc).fit(X_train, y_train.astype(int))
    pls_score = pls.score(X_test, y_test)
    yfit = pls.predict(X_test)
    yfit1 = pls.predict(X_train)
    print(yfit, yfit1)
    plt.scatter(y_test / 100, yfit / 100, label='plsr')
    plt.legend()
    # plt.show()
    return pca_score, pls_score, predictions / 100, predictions1 / 100, yfit / 100, yfit1 / 100
def do_pls(data_x, data_y, train_split_percentage):
    latent_variables = []
    # train_test_split returns (train, test) pairs; the original unpacked them
    # in (test, train) order, which silently swapped the two sets.
    x_train, x_test, y_train, y_test = train_test_split(
        data_x, data_y, test_size=train_split_percentage, random_state=0)
    # Scan 1..20 latent variables and record the held-out RMSE for each
    for i in range(20):
        pls = PLSRegression(n_components=(i + 1), scale=True)
        pls.fit(x_train, y_train)
        predicted_cv_y = pls.predict(x_test)
        mean_squared_error_cv = sqrt(mean_squared_error(y_test, predicted_cv_y))
        latent_variables.append(mean_squared_error_cv)
    best_factor = np.argmin(latent_variables)
    # Refit with the best number of latent variables and report metrics
    pls2 = PLSRegression(n_components=(best_factor + 1), scale=True)
    pls2.fit(x_train, y_train)
    predicted_cal = pls2.predict(x_train)
    rmsec = sqrt(mean_squared_error(y_train, predicted_cal))
    r2c = pls2.score(x_train, y_train)
    predicted_cv_y = pls2.predict(x_test)
    rmsecv = sqrt(mean_squared_error(y_test, predicted_cv_y))
    r2v = pls2.score(x_test, y_test)
    # Final model trained on all of the data
    plsfinal = PLSRegression(n_components=(best_factor + 1), scale=True)
    plsfinal.fit(data_x, data_y)
    return plsfinal, rmsec, r2c, rmsecv, r2v
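# Usage sketch (not part of the original): assumes do_pls and its imports
# (numpy as np, math.sqrt, sklearn's train_test_split, mean_squared_error,
# PLSRegression) are in scope; the data below are synthetic.
import numpy as np
rng = np.random.RandomState(0)
X_demo = rng.rand(100, 25)
y_demo = X_demo[:, 0] * 3.0 + rng.randn(100) * 0.1
model, rmsec, r2c, rmsecv, r2v = do_pls(X_demo, y_demo, train_split_percentage=0.3)
print(rmsec, r2c, rmsecv, r2v)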
def PLS(X, y, X_ind, y_ind):
    """
    Cross-validation and independent test for a PLS regression model.

    Arguments:
        X (np.ndarray): m x n feature matrix for cross-validation, where m is
            the number of samples and n is the number of features.
        y (np.ndarray): m-d label array for cross-validation, where m equals
            the number of rows of X.
        X_ind (np.ndarray): m x n feature matrix for the independent set.
        y_ind (np.ndarray): m-d label array for the independent set, where m
            equals the number of rows of X_ind.

    Returns:
        cvs (np.ndarray): cross-validation predictions, same shape as y.
        inds (np.ndarray): independent-test predictions, averaged over the
            five fold models, same shape as y_ind.
    """
    folds = KFold(5).split(X)
    cvs = np.zeros(y.shape)
    inds = np.zeros(y_ind.shape)
    for i, (trained, valided) in enumerate(folds):
        model = PLSRegression()
        model.fit(X[trained], y[trained])
        # ravel() keeps this working whether predict returns (n,) or (n, 1)
        cvs[valided] = model.predict(X[valided]).ravel()
        inds += model.predict(X_ind).ravel()
    return cvs, inds / 5
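# Usage sketch (not part of the original): assumes PLS and its imports
# (numpy as np, sklearn's KFold and PLSRegression) are in scope.
import numpy as np
rng = np.random.RandomState(1)
X_cv, y_cv = rng.rand(50, 10), rng.rand(50)
X_ind, y_ind = rng.rand(20, 10), rng.rand(20)
cvs, inds = PLS(X_cv, y_cv, X_ind, y_ind)
print(cvs.shape, inds.shape)  # (50,) (20,)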
def plot_pls_results(x_data, y_data, pls_components, num_variables):
    pls = PLSRegression(pls_components)
    cv_splitter = GroupShuffleSplit(n_splits=1, test_size=0.35, random_state=6)
    # NOTE: `data_full` must be defined in the enclosing scope
    group_splitter = data_full['Leaf number']
    for train_index, test_index in cv_splitter.split(x_data, y_data, group_splitter):
        x_train, x_test = x_data.iloc[train_index], x_data.iloc[test_index]
        y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
        pls.fit(x_train, y_train)
        y_pred_train = pls.predict(x_train)
        y_pred_test = pls.predict(x_test)
        r2_test = r2_score(y_test, y_pred_test)
        r2_train = r2_score(y_train, y_pred_train)
        mae_test = mean_absolute_error(y_test, y_pred_test)
        mae_train = mean_absolute_error(y_train, y_pred_train)
        print(r2_test, mae_test)
        print(r2_train, mae_train)
        plt.scatter(y_train, y_pred_train, c='blue', label='Training Set')
        plt.scatter(y_test, y_pred_test, c='red', label='Test Set')
        _line = np.linspace(0.2, 1.2)
        # plt.plot(_line, _line, c='indigo', linestyle='dashed')
        # plt.plot(_line, _line + .06, c='darkslategray', linestyle='dashed')
        # plt.plot(_line, _line - .06, c='darkslategray', linestyle='dashed')
        # left_annote_pos = 0.20
        # plt.annotate("Training Median Absolute Error = {}".format(0.059),
        #              (left_annote_pos, 1.1), fontsize=12)
        # plt.annotate("Testing Median Absolute Error = {}".format(0.07),
        #              (left_annote_pos, 1.02), fontsize=12)
        # plt.annotate(u"Training R\u00B2 = {}".format(0.83),
        #              (left_annote_pos, .95), fontsize=12)
        # plt.annotate(u"Testing R\u00B2 = {}".format(0.82),
        #              (left_annote_pos, .89), fontsize=12)
        # plt.xlabel('Measured Chlorophyll b (ug/ml)', fontsize=16)
        # plt.ylabel('Predicted Chlorophyll b (ug/ml)', fontsize=16)
        # plt.title("Chlorophyll b Model for AS7262\nbased on 2-Component\n"
        #           "Partial Least Squared Model", fontsize=18)
        # plt.legend(loc='lower right', fontsize=12)
        plt.tight_layout()
        plt.show()
        plt.scatter(y_pred_train, y_train, c='blue', label='Training Set')
        plt.scatter(y_pred_test, y_test, c='red', label='Test Set')
        plt.show()
def pls_cv(self, ncomp_range=range(1, 21), plot=False, verbose=False,
           osc_params=(10, 1)):
    # Separate X from Y for PLS
    X = self.df[self.freqs].to_numpy()
    Y = self.df[self.y_name].to_numpy().reshape(-1, 1)
    sample_std = np.std(self.df[self.y_name])
    # CV based on measurement day
    if self.cval == "MD":
        cv = LeaveOneGroupOut()
        folds = list(cv.split(X=X, y=Y, groups=self.df[self.date_name]))
    # k-fold CV
    elif self.cval == "kfold":
        cv = KFold(n_splits=self.cval_param)
        folds = list(cv.split(X))
    else:
        raise InputError("Invalid CV type!")
    # Array for storing CV errors
    cv_RMSE_all = np.zeros([len(folds), len(ncomp_range)])
    for i, (train, val) in enumerate(folds):
        # If an OSC model is specified, pre-filter the spectra per fold
        if len(osc_params) == 2:
            osc = OSC(nicomp=osc_params[0], ncomp=osc_params[1])
            osc.fit(X[train], Y[train])
            X_train_osc = osc.X_osc
            X_val_osc = osc.transform(X[val])
        for j, ncomp in enumerate(ncomp_range):
            pls = PLSRegression(n_components=ncomp, scale=False)
            if len(osc_params) == 2:
                pls.fit(X_train_osc, Y[train])
                cv_RMSE_all[i, j] = metrics.mean_squared_error(
                    Y[val], pls.predict(X_val_osc)) ** 0.5
            else:
                pls.fit(X[train], Y[train])
                cv_RMSE_all[i, j] = metrics.mean_squared_error(
                    Y[val], pls.predict(X[val])) ** 0.5
    # Print and plot CV results
    cv_RMSE_ncomp = np.mean(cv_RMSE_all, axis=0)
    cv_RPD_ncomp = sample_std / cv_RMSE_ncomp
    if plot:
        fig = plt.figure(figsize=(12, 8))
        plt.gca().xaxis.grid(True)
        plt.xticks(ncomp_range)
        plt.ylabel("RPD")
        plt.xlabel("Number of components")
        plt.plot(ncomp_range, cv_RPD_ncomp)
    # Best model
    rpd_best = max(cv_RPD_ncomp)
    ncomp_best = ncomp_range[cv_RMSE_ncomp.argmin()]
    if verbose:
        print("Best RMSE: ", min(cv_RMSE_ncomp))
        print("Best RPD: ", max(cv_RPD_ncomp))
        print("Number of latent components: ", ncomp_best)
    return (ncomp_best, rpd_best)
def PLS_DA(datos):
    global pls_bi
    # Keep only the two classes of interest (labels 5 and 6)
    datos_bi = datos[(datos['etiqueta'] == 5) | (datos['etiqueta'] == 6)]
    # Savitzky-Golay smoothing of the spectra
    X_bi = savgol_filter(datos_bi.values[:, 2:], 15, polyorder=3, deriv=0)
    y_biP = datos_bi["etiqueta"].values
    y_bi = (y_biP == 6).astype('uint8')
    pls_bi = PLSRegression(n_components=2)
    X_pls = pls_bi.fit_transform(X_bi, y_bi)[0]
    labplot = ["60/40 ratio", "50/50 ratio"]
    unique = list(set(y_bi))
    colors = [plt.cm.jet(float(i) / max(unique)) for i in unique]
    with plt.style.context(('ggplot')):
        plt.figure(figsize=(12, 10))
        for i, u in enumerate(unique):
            col = np.expand_dims(np.array(colors[i]), axis=0)
            x = [X_pls[j, 0] for j in range(len(X_pls[:, 0])) if y_bi[j] == u]
            y = [X_pls[j, 1] for j in range(len(X_pls[:, 1])) if y_bi[j] == u]
            plt.scatter(x, y, c=col, s=100, edgecolors='k', label=str(u))
        plt.xlabel('Latent Variable 1')
        plt.ylabel('Latent Variable 2')
        plt.legend(labplot, loc='lower left')
        plt.title('PLS cross decomposition')
        plt.show()
    X_entreno, X_prueba, y_entreno, y_prueba = train_test_split(
        X_bi, y_bi, test_size=0.2, random_state=19)
    pls_bi = PLSRegression(n_components=2)
    pls_bi.fit(X_entreno, y_entreno)
    # ravel() keeps this working whether predict returns (n,) or (n, 1);
    # threshold the continuous PLS output at 0.5 to get binary predictions
    y_prediccion1 = np.ravel(pls_bi.predict(X_prueba))
    prediccion_binaria1 = (y_prediccion1 > 0.5).astype('uint8')
    print(prediccion_binaria1, y_prueba)
    precision = []
    A = []
    cvalor = KFold(n_splits=40, shuffle=True, random_state=19)
    for train, test in cvalor.split(X_bi):
        # NOTE: PLS_DA1 is assumed to be defined elsewhere in this module
        y_prediccion = PLS_DA1(X_bi[train, :], y_bi[train], X_bi[test, :])
        A.append(y_prediccion)
        precision.append(accuracy_score(y_bi[test], y_prediccion))
    # The original message said "10 splits" although KFold uses 40
    print("Average accuracy over 40 folds: ", np.array(precision).mean())
    return prediccion_binaria1, precision
def train_and_predict_PLS(X_train, y_train, X_test, n_components=None):
    # Fit the regression model on the training set
    # (NOTE: callers must pass an integer n_components; PLSRegression does
    # not accept None)
    regr = PLSRegression(n_components=n_components).fit(X_train, y_train)
    # NOTE: `bic` is a project helper assumed to be in scope
    bic_val = bic(X_train, y_train)
    # Make predictions on the test set
    test_preds = regr.predict(X_test)  # predictions of one parameter at n pixels
    train_preds = regr.predict(X_train)
    return test_preds, train_preds, bic_val
def PartialLeastSquares(X_train, X_test, y_train, y_test=None):
    model = PLSRegression()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    if y_test is not None:
        # NOTE: `metrics` is a project helper assumed to be in scope
        return metrics(X_train, y_test, predicted)
    return predicted
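# Usage sketch (not part of the original): exercises the predict-only branch,
# which needs no external `metrics` helper; data are synthetic.
import numpy as np
rng = np.random.RandomState(2)
X_tr, X_te = rng.rand(40, 6), rng.rand(10, 6)
y_tr = X_tr @ rng.rand(6)
preds = PartialLeastSquares(X_tr, X_te, y_tr)
print(preds.shape)  # (10, 1) on older scikit-learn, (10,) on newer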
class PLSPredictor:
    def __init__(self):
        self.pls2 = PLSRegression(n_components=2,
                                  scale=True,
                                  max_iter=500,
                                  tol=1e-06,
                                  copy=True)

    def predict(self, values):
        # The original dropped the result; return it to the caller
        return self.pls2.predict(values)

    def train(self, measured_values, screen_points):
        self.pls2.fit(measured_values, screen_points)
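# Usage sketch (not part of the original): maps measured coordinates to
# screen points with the wrapper above; data are synthetic.
import numpy as np
rng = np.random.RandomState(3)
measured = rng.rand(30, 4)
screen = measured[:, :2] * 2.0 + 0.5
predictor = PLSPredictor()
predictor.train(measured, screen)
print(predictor.predict(measured[:5]))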
class PLS():
    """
    Wrap PLS so it is compliant with the other dimensionality reduction
    methods (simple class rewriting).
    """
    def __init__(self, n_components=10):
        self.clf = PLSRegression(n_components)

    def get_components_(self):
        return self.clf.x_weights_.transpose()

    def set_components_(self, x):
        pass

    components_ = property(get_components_, set_components_)

    def fit(self, X, y):
        self.clf.fit(X, y)
        return self

    def transform(self, X):
        return self.clf.transform(X)

    def predict(self, X):
        return self.clf.predict(X)
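# Usage sketch (not part of the original): uses the wrapper like any other
# sklearn-style reducer; data are synthetic.
import numpy as np
rng = np.random.RandomState(4)
X_demo = rng.rand(60, 20)
y_demo = rng.rand(60)
reducer = PLS(n_components=5).fit(X_demo, y_demo)
print(reducer.transform(X_demo).shape)  # (60, 5)
print(reducer.components_.shape)        # (5, 20)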
class MSLMultiModel(WebModel):
    '''a multitask version of MSLModel'''
    def __init__(self, output_dir, ccs_dir, lanl_file, n_components=10, **kwargs):
        self.output_dir = output_dir
        self.ccs_dir = ccs_dir
        self.lanl_file = lanl_file
        self.n_components = n_components
        self.model = PLSRegression(n_components=n_components, scale=False)
        self.multitask = True
        self.name = 'msl_multi_model'

    def fit(self, data, composition, elements):
        self.elements = elements  # order matters
        data = libs_norm3(data[:, ALAMOS_MASK])
        self.model.fit(data, composition)

    def predict(self, data, mask=ALAMOS_MASK, clip=True):
        data = libs_norm3(data[:, mask])
        predictions = self.model.predict(data, copy=False)
        # The original tested `if predictions:`, which raises ValueError for
        # arrays; the otherwise-unused `clip` flag is what was evidently meant.
        if clip:
            predictions = np.clip(predictions, 0, 100)
        else:
            predictions[predictions < 0] = 0
        return predictions
class PLSDADummy(BaseEstimator):
    """
    Wrapper of PLSRegression for classification.
    PLSRegression predicts one-hot encoded vectors; PLS-DA then outputs the
    class with the maximal score.
    """
    def __init__(self, n_components=2):
        self.pls = PLSRegression(n_components)
        self.classes = None

    def __one_hot_encode(self, Y):
        # Encode labels as indices into self.classes, then one-hot encode.
        # (The original used OneHotEncoder(n_values=...), an API removed from
        # modern scikit-learn; an identity-matrix lookup is equivalent.)
        Y = np.array([np.where(self.classes == y)[0][0] for y in Y])
        return np.eye(len(self.classes))[Y]

    def fit(self, X, Y):
        """
        :param X: feature matrix
        :param Y: list of labels
        :return: self
        """
        self.classes = np.array(sorted(np.unique(Y)))
        Y = self.__one_hot_encode(Y)
        self.pls.fit(X, Y)
        return self

    def predict(self, X):
        y_pred = np.argmax(self.pls.predict(X), axis=1)
        return np.array([self.classes[cls] for cls in y_pred])
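# Usage sketch (not part of the original): PLS-DA on a toy two-class problem;
# data are synthetic.
import numpy as np
rng = np.random.RandomState(5)
X_demo = np.vstack([rng.randn(25, 8) - 1, rng.randn(25, 8) + 1])
y_demo = np.array(['a'] * 25 + ['b'] * 25)
clf = PLSDADummy(n_components=2).fit(X_demo, y_demo)
print((clf.predict(X_demo) == y_demo).mean())  # training accuracy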
def knn_denoise(X, X_reference, *, k, ncomp):
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.neighbors import NearestNeighbors

    # Xb = X * window
    print('PCA...')
    npca = np.minimum(300, X.shape[0])
    u, s, vh = np.linalg.svd(X)
    features = u[:, 0:npca] * s[0:npca]
    components = vh[0:npca, :]
    # s = s[0:npca]

    print('Nearest neighbors...')
    nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm='ball_tree').fit(features)
    distances, indices = nbrs.kneighbors(features)

    features2 = np.zeros(features.shape, dtype=features.dtype)
    for j in range(X.shape[0]):
        print(f'{j + 1} of {X.shape[0]}')
        inds0 = np.squeeze(indices[j, :])
        inds0 = inds0[1:]  # drop the point itself, keep its k neighbors
        # Xbneighbors = Xb[inds0, :]
        f_neighbors = features[inds0, :]
        # Regress this point's feature vector on its neighbors' features
        pls = PLSRegression(n_components=ncomp)
        # pls.fit(Xbneighbors.T, Xb[j, :].T)
        pls.fit(f_neighbors.T, features[j, :].T)
        features2[j, :] = pls.predict(f_neighbors.T).T
        # X2[j, :] = pls.predict(Xbneighbors.T).T

    print(features2.shape)
    print(components.shape)
    X2 = features2 @ components
    return X2
def fit_plt(dados, ncomp):
    from sklearn.cross_decomposition import PLSRegression
    colmap = [
        (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.41, 0.41, 0.41), (0, 1, 1),
        (0.58, 0, 0.82), (0, 0.50, 0), (0.98, 0.50, 0.44), (1, 1, 0.87),
        (0.39, 0.58, 0.92), (0.50, 0.50, 0), (1, 0.89, 0.76), (0.96, 0.96, 0.86),
        (0, 1, 1)]
    g = dados['g']
    r = dados['r']
    wn = dados['wn']
    pls = PLSRegression(n_components=ncomp)
    pls.fit(r, g)
    Y_pred = pls.predict(r)
    plt.figure()
    plt.subplot(2, 1, 1)
    for i in range(1, g.max() + 1):
        sel = g == i
        plt.scatter(g[sel], Y_pred[sel], color=colmap[i])
    # matplotlib's keyword is lowercase `fontsize`, not `Fontsize`
    plt.xlabel('Y_class', fontsize=12)
    plt.ylabel('Y_predicted', fontsize=12)
    plt.xticks(np.arange(1, g.max() + 1), str(dados['arqs']).split('::'))
    plt.subplot(2, 1, 2)
    for i in range(1, g.max() + 1):
        sel = g == i
        plt.hist(Y_pred[sel])
    plt.xlabel('Y_class', fontsize=12)
    plt.ylabel('histogram', fontsize=12)
    plt.xticks(np.arange(1, g.max() + 1), str(dados['arqs']).split('::'))
class MyPLS():
    def __init__(self, n_components=2, scale=True, max_iter=500, tol=1e-06,
                 copy=True):
        # Pass parameters by keyword: recent scikit-learn versions make the
        # constructor arguments after n_components keyword-only.
        self.pls = PLSRegression(n_components=n_components, scale=scale,
                                 max_iter=max_iter, tol=tol, copy=copy)

    def fit(self, X, Y):
        self.pls.fit(X, Y)
        return self.pls

    def predict(self, X, copy=True):
        return self.pls.predict(X, copy=copy).flatten()

    def score(self, X, Y, sample_weight=None):
        return self.pls.score(X, Y, sample_weight)

    def get_params(self, deep=True):
        return self.pls.get_params(deep)

    def set_params(self, **parameters):
        self.pls.set_params(**parameters)
        return self

    @property
    def intercept_(self):
        return 0

    @property
    def coeff_(self):
        return self.pls.coef_
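# Usage sketch (not part of the original): the wrapper flattens predictions,
# which is convenient for 1-D targets; data are synthetic.
import numpy as np
rng = np.random.RandomState(6)
X_demo = rng.rand(40, 8)
y_demo = X_demo[:, 0] - X_demo[:, 1] + rng.randn(40) * 0.05
wrapped = MyPLS(n_components=3)
wrapped.fit(X_demo, y_demo)
print(wrapped.predict(X_demo).shape)  # (40,)
print(wrapped.score(X_demo, y_demo))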
def fit_pls(self, X_test):
    reg = PLSRegression(n_components=20, scale=False, max_iter=1000)
    reg.fit(self.X.copy().values, self.y.copy().values.flatten())
    preds = reg.predict(X_test.copy().values)
    ids = X_test.index
    pred_df = pd.DataFrame(data=preds, index=ids, columns=['SalePrice'])
    pred_df.to_csv('results/results_pls.csv', sep=',')
def simple_pls_cv(X, y, n_comp):
    # Run PLS with the suggested number of components
    pls = PLSRegression(n_components=n_comp)
    pls.fit(X, y)
    y_c = pls.predict(X)
    # Cross-validation
    y_cv = cross_val_predict(pls, X, y, cv=10)
    # Calculate scores for calibration and cross-validation
    score_c = r2_score(y, y_c)
    score_cv = r2_score(y, y_cv)
    # Calculate mean squared error for calibration and cross-validation
    mse_c = mean_squared_error(y, y_c)
    mse_cv = mean_squared_error(y, y_cv)
    print('R2 calib: %5.3f' % score_c)
    print('R2 CV: %5.3f' % score_cv)
    print('MSE calib: %5.3f' % mse_c)
    print('MSE CV: %5.3f' % mse_cv)
    # Plot regression (raw strings avoid the invalid '\c' escape warning)
    z = np.polyfit(y, y_cv, 1)
    with plt.style.context(('ggplot')):
        fig, ax = plt.subplots(figsize=(9, 5))
        ax.scatter(y_cv, y, c='red', edgecolors='k')
        ax.plot(z[1] + z[0] * y, y, c='blue', linewidth=1)
        ax.plot(y, y, color='green', linewidth=1)
        plt.title('$R^{2}$ (CV): ' + str(score_cv))
        plt.xlabel(r'Predicted $^{\circ}$Brix')
        plt.ylabel(r'Measured $^{\circ}$Brix')
        plt.show()
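# Usage sketch (not part of the original): assumes simple_pls_cv and its
# imports (numpy as np, matplotlib.pyplot as plt, sklearn's PLSRegression,
# cross_val_predict, r2_score, mean_squared_error) are in scope.
import numpy as np
rng = np.random.RandomState(7)
X_demo = rng.rand(60, 15)
brix = 10 + 5 * X_demo[:, 0] + rng.randn(60) * 0.2  # synthetic "Brix" values
simple_pls_cv(X_demo, brix, n_comp=3)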
def predicao(self, idmodelo, idamostra):
    print(idmodelo)
    print(idamostra)
    X = self.selectMatrizX(idmodelo, "VALIDACAO")
    Y = self.selectMatrizY(idmodelo, "VALOR", "VALIDACAO")
    amostraPredicao = self.selectAmostra(idamostra, idmodelo)
    valorReferencia = self.selectDadosReferenciaAmostra(idamostra, idmodelo)
    pls = PLSRegression(copy=True, max_iter=500, n_components=20, scale=False,
                        tol=1e-06)
    pls.fit(X, Y)
    print(amostraPredicao)
    valorPredito = pls.predict(amostraPredicao)
    print('Amostra: ' + str(idamostra) + ' - Valor Predito :' + str(valorPredito) +
          ' - Valor Referencia :' + str(valorReferencia))
    cursorDadosCalibracao = db.execute(
        "select rmsec, rmsep, coeficientecal, coeficienteval, dtcalibracao "
        "from calibracao where inativo = 'A' and idmodelo = " + str(idmodelo) + " ")
    for regCodigo in cursorDadosCalibracao:
        rmsec = regCodigo[0]
        rmsep = regCodigo[1]
        coeficienteCal = regCodigo[2]
        coeficienteVal = regCodigo[3]
        dtcalibracao = regCodigo[4]
        print(rmsec)
        print(rmsep)
        print(coeficienteCal)
        print(coeficienteVal)
        print(dtcalibracao)
        dtcalibracao = dtcalibracao.strftime('%d/%m/%Y')
        print(dtcalibracao)
    # Prepare the values for the JSON payload
    coeficienteCal = round(coeficienteCal, 2)
    coeficienteVal = round(coeficienteVal, 2)
    rmsec = round(rmsec, 2)
    rmsep = round(rmsep, 2)
    valorReferencia = round(valorReferencia, 2)
    valorPreditoString = str(valorPredito).replace("[", "").replace("]", "")
    # Build the JSON response
    json_data = jsonify(idamostra=str(idamostra),
                        valorpredito=str(valorPreditoString),
                        rmsec=str(rmsec),
                        rmsep=str(rmsep),
                        idmodelo=str(idmodelo),
                        dtcalibracao=str(dtcalibracao),
                        valorreferencia=str(valorReferencia),
                        coeficientecal=str(coeficienteCal),
                        coeficienteval=str(coeficienteVal))
    return json_data
def PLSR_LOOCV(data):
    '''Performs LOOCV on the data and returns the R2Y value'''
    predVal = []
    n = len(data[:, 0])
    for i in range(n):
        # Build the leave-one-out split by hand: row i is the test sample
        train = np.zeros((n - 1, 8))
        test = np.zeros((1, 8))
        for j in range(n):
            if j < i:
                train[j, :] = data[j, :]
            elif j > i:
                train[j - 1, :] = data[j, :]
            else:
                test[0, :] = data[j, :]
        # Scale the held-out sample with statistics from the training fold only
        testScaled = np.zeros((1, 8))
        trainScale = StandardScaler()
        trainScaled = trainScale.fit_transform(train)
        testScaled[0, :] = trainScale.transform(test)
        PLSR = PLSRegression(n_components=2)
        PLSR.fit(trainScaled[:, 2:6], trainScaled[:, 0])
        pred = PLSR.predict(testScaled[:, 2:6])
        predVal.append(np.squeeze(pred))
    # NOTE: `scaler` is a project helper assumed to be in scope
    scaledData = scaler(data)
    R2Y = 1 - np.sum((predVal - scaledData[:, 0])**2) / np.sum(scaledData[:, 0]**2)
    return R2Y
def PLSCrossValidation(n_components, trainSet, validationSet):
    # NOTE: `predictorList` is assumed to be defined in the enclosing scope
    pls = PLSRegression(n_components=n_components)
    pls.fit(trainSet[predictorList], trainSet['Apps'])
    predictPls = pls.predict(validationSet[predictorList])
    # ravel() gives a plain 1-D array (the original used the `.flat` iterator)
    different = predictPls.ravel() - validationSet['Apps']
    error_rate = np.mean(different ** 2)
    return error_rate
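# Usage sketch (not part of the original): scans component counts on a toy
# DataFrame; `predictorList` is defined here just for the demo.
import numpy as np
import pandas as pd
rng = np.random.RandomState(8)
predictorList = ['x1', 'x2', 'x3']
df = pd.DataFrame(rng.rand(80, 3), columns=predictorList)
df['Apps'] = 100 * df['x1'] + rng.randn(80)
trainSet, validationSet = df.iloc[:60], df.iloc[60:]
for nc in range(1, 4):
    print(nc, PLSCrossValidation(nc, trainSet, validationSet))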
def hacerPLS(X, Y):
    pls_wild_b = PLSRegression(n_components=9)
    pls_wild_b.fit(X, Y)
    Z = pls_wild_b.transform(X)
    scores = list()
    scores_std = list()
    n_features = np.shape(X)[1]
    # Hold out half of the data for the final test
    # (modern API: sklearn.model_selection.train_test_split)
    X, X_test_tot, Y, Y_test_tot = train_test_split(X, Y, test_size=0.5,
                                                    random_state=0)
    N = np.shape(X)[0]
    for num_comp in range(n_features):
        kf = KFold(n_splits=10)
        aux_scores = list()
        for train, test in kf.split(X):
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
            if num_comp == 0:
                # Baseline: predict the mean
                y_pred = np.mean(y_test)
                y_pred = y_pred * np.ones(np.shape(y_test))
                aux_scores.append(metrics.mean_squared_error(y_test, y_pred))
            else:
                pls_foo = PLSRegression(n_components=num_comp)
                pls_foo.fit(X_train, y_train)
                y_pred = pls_foo.predict(X_test)
                # Obtain the score
                this_score = metrics.mean_squared_error(y_test, y_pred)
                aux_scores.append(this_score)
        scores.append(np.mean(aux_scores))
        scores_std.append(np.std(aux_scores))
    plt.plot(scores)
    plt.xlabel('Components')
    plt.ylabel("$MSE$")
    plt.title("Animales PLS")
    plt.show()
    num_comp = np.argmin(scores)
    # The original hard-coded n_components=2 here despite computing argmin;
    # use the CV winner (falling back to 1 if the mean baseline wins).
    pls_pred = PLSRegression(n_components=max(num_comp, 1))
    pls_pred.fit(X, Y)
    y_pred_test = pls_pred.predict(X_test_tot)
    print("MSE test = " + str(metrics.mean_squared_error(Y_test_tot, y_pred_test)))
def test_regressor_predict(pls_regressor):
    X = np.random.rand(10, 10)
    y = np.random.rand(10)
    sklearn_regressor = PLSRegression().fit(X, y)
    pls_regressor.fit(X, y)
    y_pred = pls_regressor.predict(X)
    assert y_pred.shape == y.shape
    assert np.all(y_pred == sklearn_regressor.predict(X).ravel())
def run_pls(X, Y, LV):
    model = PLSRegression(n_components=LV, scale=False)
    model.fit(X, Y)
    Yr = model.predict(X).ravel().tolist()
    r2, sdec = calc_regr_metrics(Y_exp=Y, Y_pred=Yr)
    q2, sdep, variables.Y_pred = regr_loo(X=np.array(X), Y=np.array(Y), M=model)
    scores = {'R2': r2, 'Q2': q2, 'SDEC': sdec, 'SDEP': sdep}
    return scores, model
def do_sigma_pls(data_x, data_y, train_split_percentage):
    latent_variables = []
    # train_test_split returns (train, test); the original unpacked in
    # (test, train) order, silently swapping the sets.
    x_train, x_test, y_train, y_test = train_test_split(
        data_x, data_y, test_size=train_split_percentage, random_state=0)
    for i in range(20):
        pls = PLSRegression(n_components=(i + 1), scale=True)
        pls.fit(x_train, y_train)
        predicted_cv_y = pls.predict(x_test)
        mean_squared_error_cv = sqrt(mean_squared_error(y_test, predicted_cv_y))
        latent_variables.append(mean_squared_error_cv)
    best_factor = np.argmin(latent_variables)

    # Refit on all data and drop samples whose residual exceeds the
    # configured sigma confidence threshold
    pls_sigma = PLSRegression(n_components=(best_factor + 1), scale=True)
    pls_sigma.fit(data_x, data_y)
    predicted_cv_y_sigma = pd.DataFrame(pls_sigma.predict(data_x))
    data_labels = pd.DataFrame(data_y.index)
    data_x = pd.DataFrame(data_x).reset_index(drop=True)
    data_y = pd.DataFrame(data_y).reset_index(drop=True)
    if cfg.sigma_percentage:
        percentual_error = pd.DataFrame(abs(data_y.iloc[:, 0] - predicted_cv_y_sigma.iloc[:, 0]))
        percentual_error = pd.DataFrame((percentual_error.iloc[:, 0] * 100) / data_y.iloc[:, 0])
        df_x = pd.concat([data_x, percentual_error], axis=1)
        df_x = df_x.drop(df_x[df_x.iloc[:, -1] > cfg.sigma_confidence].index)
        df_x.drop(df_x.columns[len(df_x.columns) - 1], axis=1, inplace=True)
        df_y = pd.concat([data_y, data_labels, percentual_error], axis=1)
        df_y = df_y.drop(df_y[df_y.iloc[:, -1] > cfg.sigma_confidence].index)
        df_x.set_index(df_y.iloc[:, 1], inplace=True)
        df_y.set_index(df_x.index, inplace=True)
        df_y.drop(df_y.columns[len(df_y.columns) - 1], axis=1, inplace=True)
        return df_x, df_y
    else:
        abs_error = pd.DataFrame(abs(data_y.iloc[:, 0] - predicted_cv_y_sigma.iloc[:, 0]))
        df_x = pd.concat([data_x, abs_error], axis=1)
        df_x = df_x.drop(df_x[df_x.iloc[:, -1] > cfg.sigma_confidence].index)
        df_x.drop(df_x.columns[len(df_x.columns) - 1], axis=1, inplace=True)
        df_y = pd.concat([data_y, abs_error], axis=1)
        df_y = df_y.drop(df_y[df_y.iloc[:, -1] > cfg.sigma_confidence].index)
        df_x.set_index(df_y.iloc[:, 1], inplace=True)
        df_y.set_index(df_x.index, inplace=True)
        df_y.drop(df_y.columns[len(df_y.columns) - 1], axis=1, inplace=True)
        return df_x, df_y
def compute_q2_pls(tdata, tlabel, vdata, vlabel, Rval):
    test = PLSRegression(n_components=Rval)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # NOTE: `matricize` and `qsquared` are project helpers assumed in scope
        test.fit(matricize(tdata), matricize(tlabel))
        Y_pred = test.predict(matricize(vdata))
        Q2 = qsquared(matricize(vlabel), matricize(Y_pred))
    return Q2
def Training(df, seed, yratio, xratio, index=1):
    snp_matrix = np.array(df.values)
    xdim, ydim = snp_matrix.shape
    ydimlist = list(range(ydim))
    xdimlist = list(range(xdim))
    random.seed(seed)
    random.shuffle(ydimlist)  # shuffle the individuals
    random.shuffle(xdimlist)  # shuffle the SNPs
    accuracy = 0
    # Apply both permutations (the original's second np.copy discarded
    # the column shuffle)
    snp_matrix_shuffle = np.copy(snp_matrix[:, ydimlist][xdimlist, :])
    snp_matrix_train = snp_matrix_shuffle[:, 0:int(ydim * yratio)]
    snp_matrix_test = snp_matrix_shuffle[:, int(ydim * yratio):]
    snp_matrix_train_x = snp_matrix_train[0:int(xdim * xratio), :]
    snp_matrix_test_x = snp_matrix_test[0:int(xdim * xratio), :]
    for i in range(int(xdim * xratio), xdim):
        snp_matrix_train_y = snp_matrix_train[i, :]
        snp_matrix_test_y = snp_matrix_test[i, :]
        if index != 7:
            if index == 1:
                clf = AdaBoostClassifier(n_estimators=100)
            elif index == 2:
                clf = RandomForestClassifier(n_estimators=100)
            elif index == 3:
                clf = linear_model.LogisticRegression(C=1e5)
            elif index == 4:
                clf = svm.SVC(kernel='rbf')
            elif index == 5:
                clf = svm.SVC(kernel='poly')
            else:
                clf = svm.SVC(kernel='linear')
            clf = clf.fit(snp_matrix_train_x.T, snp_matrix_train_y)
            Y_pred = clf.predict(snp_matrix_test_x.T)
            prediction = snp_matrix_test_y - Y_pred
            wrong = np.count_nonzero(prediction)
            tmp = 1 - (wrong + 0.0) / len(prediction)
            print(tmp)
            accuracy += tmp
    accuracy = accuracy / (xdim - int(xdim * xratio))
    if index == 7:
        # Multi-target PLS: predict all held-out SNP rows at once
        pls2 = PLSRegression(n_components=50, scale=False, max_iter=1000)
        snp_matrix_train_y = snp_matrix_train[int(xdim * xratio):, :]
        pls2.fit(snp_matrix_train_x.T, snp_matrix_train_y.T)
        snp_matrix_test_x = snp_matrix_test[0:int(xdim * xratio), :]
        snp_matrix_test_y = snp_matrix_test[int(xdim * xratio):, :]
        Y_pred = transform(pls2.predict(snp_matrix_test_x.T))
        prediction = snp_matrix_test_y - Y_pred.T
        xdim, ydim = prediction.shape
        wrong = np.count_nonzero(prediction)
        accuracy = 1 - wrong / (xdim * ydim + 0.0)
    return accuracy
def train_plsr(matrix, ty, n):
    clf = PLSRegression(n_components=5)
    clf.fit(matrix, ty)
    X_train, X_test, y_train, y_test = train_test_split(matrix, ty,
                                                        test_size=n / 100)
    # scores = cross_val_score(clf, matrix, ty, cv=10)
    scores = clf.score(X_train, y_train)
    print_plsr_importance(clf)
    predict_result = {'predict': [each[0] for each in clf.predict(X_test)],
                      'real': y_test}
    return (scores, predict_result)
def PCA_Red2(Y, X, Y_pred, X_pred):
    # Standardize with a fitted scaler so the same statistics can be applied
    # to the prediction row. The original called the stateless scale() on a
    # single row, which centers it to zeros, and fit the PLS on re-scaled PCA
    # scores while predicting on unscaled ones; both looked like slips.
    # Requires sklearn.preprocessing.StandardScaler.
    sc = StandardScaler()
    X_std = sc.fit_transform(X)
    # Reduce X with PCA (keep 90% of the variance), then fit PLS on the scores
    pca = PCA(0.90)
    X_reduced = pca.fit_transform(X_std)
    pls = PLSRegression(n_components=3, scale=False)
    pls.fit(X_reduced, Y)
    X_pred = np.array(X_pred).reshape(1, -1)
    X_pred = pca.transform(sc.transform(X_pred))
    prediction = pls.predict(X_pred)
    return prediction
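# Usage sketch (not part of the original): assumes PCA_Red2 plus numpy and
# sklearn's PCA, StandardScaler and PLSRegression are in scope; Y_pred is
# unused by the function, so None is passed.
import numpy as np
rng = np.random.RandomState(9)
X_demo = rng.rand(50, 6)
Y_demo = X_demo @ rng.rand(6) + rng.randn(50) * 0.1
print(PCA_Red2(Y_demo, X_demo, None, X_demo[0]))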
def transform_helper(self, data, data_y):
    # Make the mean of each column equal to zero
    data_temp = data - data.mean(axis=0)
    pls = PLSRegression(n_components=2)
    # Fit, then predict back on the training data
    pls.fit(data_temp, data_y)
    res = pls.predict(data_temp)
    return res
class Plsr:
    def __init__(self, features, output):
        self.regressor = None
        # X holds the features as a matrix, e.g. #bathrooms, sq. feet, ...
        self.X = features
        # y is the value to predict
        self.y = output
        # Splitting the dataset into training and test sets
        '''from sklearn.model_selection import train_test_split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=0)'''
        # Feature scaling (note: StandardScaler expects 2-D input, so the
        # target must be shaped (n_samples, 1))
        self.sc_X = StandardScaler()
        self.sc_y = StandardScaler()
        self.X = self.sc_X.fit_transform(self.X)
        self.y = self.sc_y.fit_transform(self.y)

    def fit(self):
        # Fitting partial least squares regression to the dataset
        self.regressor = PLSRegression(n_components=1)
        self.regressor.fit(self.X, self.y)

    def show_(self):
        # Visualizing the regression results (assumes a single feature)
        X_grid = np.arange(min(self.X), max(self.X), 0.1)
        X_grid = X_grid.reshape((len(X_grid), 1))
        plt.scatter(self.X, self.y, color='red')
        # This block can be generalized by changing the data it shows
        plt.plot(X_grid, self.regressor.predict(X_grid), color='blue')
        plt.title('Truth or Bluff (PLSR Model)')
        plt.xlabel('Position level')
        plt.ylabel('Salary')
        plt.show()

    def predict(self, value=6.5):
        if type(value) is np.ndarray:
            y_pred = self.regressor.predict(self.sc_X.transform(value))
        else:
            y_pred = self.regressor.predict(
                self.sc_X.transform(np.array([[value]])))
        return self.sc_y.inverse_transform(y_pred)
def test_compare_to_sklearn(self):
    d = table(10, 5, 1)
    d.X = np.random.RandomState(0).rand(*d.X.shape)
    d.Y = np.random.RandomState(0).rand(*d.Y.shape)
    orange_model = PLSRegressionLearner()(d)
    scikit_model = PLSRegression().fit(d.X, d.Y)
    np.testing.assert_almost_equal(
        scikit_model.predict(d.X).ravel(), orange_model(d))
    np.testing.assert_almost_equal(scikit_model.coef_,
                                   orange_model.coefficients)
targets = pd.get_dummies(train.target)
train.drop('target', axis=1, inplace=True)
train = train.apply(np.log1p)
test = pd.read_csv('test.csv', index_col='id')
test = test.apply(np.log1p)

Xt, Xv, yt, yv = train_test_split(train, targets, test_size=0.2, random_state=27)

best = 10.
for n in range(5, 16):
    clf = PLSRegression(n_components=n)
    clf.fit(Xt, yt)
    y_pred = clf.predict(Xv)
    # NOTE: `multiclass_log_loss` is a project helper. The original scored the
    # predictions against their own argmax rather than the held-out labels,
    # which looks like a slip; the held-out one-hot targets are used here.
    loss = multiclass_log_loss(np.argmax(yv.values, axis=1), y_pred)
    if loss < best:
        n_best = n
        best = loss
        postfix = '(*)'
    else:
        postfix = ''
    print('comps: {:02d}\tLoss:{:5.4f} {}'.format(n, loss, postfix))

clf = PLSRegression(n_components=n_best)
clf.fit(train, targets)
y_pred = clf.predict(test)
regression_params = pandas.DataFrame(0, index=norm.columns, columns=concepts)
predicted_nii1 = pandas.DataFrame(0, index=norm.columns, columns=["nii"])
predicted_nii2 = pandas.DataFrame(0, index=norm.columns, columns=["nii"])

print("Training voxels and building predicted images...")
for voxel in norm.columns:
    train = [x for x in X.index
             if x not in [image1_holdout, image2_holdout] and x in norm.index]
    Y = norm.loc[train, voxel].tolist()
    Xtrain = X.loc[train, :]
    # Use PLS instead of regularized regression
    clf = PLSRegression(n_components=number_components)
    clf.fit(Xtrain, Y)
    # Need to find where regression/intercept params are in this model
    regression_params.loc[voxel, :] = [x[0] for x in clf.coef_]
    predicted_nii1.loc[voxel, "nii"] = clf.predict(holdout1Y.reshape(1, -1))[0][0]
    predicted_nii2.loc[voxel, "nii"] = clf.predict(holdout2Y.reshape(1, -1))[0][0]

predicted_nii1 = predicted_nii1["nii"].tolist()
predicted_nii2 = predicted_nii2["nii"].tolist()

# Turn into nifti images
nii1 = numpy.zeros(standard_mask.shape)
nii2 = numpy.zeros(standard_mask.shape)
nii1[standard_mask.get_data() != 0] = predicted_nii1
nii2[standard_mask.get_data() != 0] = predicted_nii2
nii1 = nibabel.Nifti1Image(nii1, affine=standard_mask.get_affine())
nii2 = nibabel.Nifti1Image(nii2, affine=standard_mask.get_affine())
# Turn the holdout image data back into nifti
# PLS regression with multivariate response, a.k.a. PLS2
n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

# PLS regression with univariate response, a.k.a. PLS1
n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5

pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))

# #############################################################################
# CCA (PLS mode B with symmetric deflation)
print "\n" SVRr2.append(optSVR.score(XTest, yTest)) SVRmse.append( metrics.mean_squared_error(yTest,SVRpreds)) SVRrmse.append(math.sqrt(SVRmse[metcount])) print ("Support Vector Regression prediction statistics for fold %d are; MSE = %5.2f RMSE = %5.2f R2 = %5.2f\n\n" % (metcount+1, SVRmse[metcount], SVRrmse[metcount],SVRr2[metcount])) with open(train_name,'a') as ftrain : ftrain.write("Support Vector Regression prediction statistics for fold %d are, MSE =, %5.2f, RMSE =, %5.2f, R2 =, %5.2f,\n\n" % (metcount+1, SVRmse[metcount], SVRrmse[metcount],SVRr2[metcount])) ftrain.close() # Train partial least squares and predict with optimised parameters print("\n\n------------------- Starting opitimised PLS training -------------------") optPLS = PLSRegression(n_components = nc) optPLS.fit(XTrain, yTrain) # Train the model print("Training R2 = %5.2f" % optPLS.score(XTrain,yTrain)) print("Starting optimised PLS prediction") PLSpreds = optPLS.predict(XTest) print("The predicted values now follow :") PLSpredsdim = PLSpreds.shape[0] i = 0 if PLSpredsdim%5 == 0: while i < PLSpredsdim: print round(PLSpreds[i],2),'\t', round(PLSpreds[i+1],2),'\t', round(PLSpreds[i+2],2),'\t', round(PLSpreds[i+3],2),'\t', round(PLSpreds[i+4],2) i += 5 elif PLSpredsdim%4 == 0: while i < PLSpredsdim: print round(PLSpreds[i],2),'\t', round(PLSpreds[i+1],2),'\t', round(PLSpreds[i+2],2),'\t', round(PLSpreds[i+3],2) i += 4 elif PLSpredsdim%3 == 0 : while i < PLSpredsdim : print round(PLSpreds[i],2),'\t', round(PLSpreds[i+1],2),'\t', round(PLSpreds[i+2],2) i += 3
def pls_train(groups, varname='valence', arrayname='norm', scale=True,
              ncomps=2, cv_folds=None, cv_repeats=None, skip_cv=False,
              xmin=-np.inf, xmax=np.inf, _larch=None, **kws):
    """use a list of data groups to train a Partial Least Squares model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      scale       bool to scale data [True]
      cv_folds    None or number of Cross-Validation folds (see Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (see Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]
      ncomps      number of independent components (see Note 5) [2]

    Returns
    -------
      group with trained PLSRegression, to be used with pls_predict

    Notes
    -----
     1. The group members for the components must match each other
        in data content and array names.
     2. All groups must have an attribute (scalar value) for `varname`.
     3. arrayname can be one of `norm` or `dmude`.
     4. Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be
        used (rounded to integer); if cv_repeats is None,
        sqrt(len(groups))-1 will be used (rounded).
     5. The optimal number of components may be best found from PCA. If
        set to None, a search will be done for the ncomps that gives the
        lowest RMSE_CV.
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise ValueError("group '%s' does not have attribute '%s'"
                             % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)
    nvals = len(groups)

    kws['scale'] = scale
    kws['n_components'] = ncomps
    model = PLSRegression(**kws)

    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)
        resid = []
        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])[:, 0]
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    # final fit without cross-validation
    model = PLSRegression(**kws)
    out = model.fit(spectra, ydat)
    ypred = model.predict(spectra)[:, 0]
    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 coefs=model.x_weights_, loadings=model.x_loadings_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats, rmse_cv=rmse_cv,
                 rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, scale=scale, groupnames=groupnames,
                 keywords=kws)
mse = []
for i in np.arange(1, 17):
    plsregr = PLSRegression(n_components=i, scale=False)
    plsregr.fit(X_train_scaled, y_train)
    # modern API: sklearn.model_selection.cross_val_score with
    # scoring='neg_mean_squared_error' (which returns negated MSE)
    score = -1 * cross_val_score(plsregr, X_train_scaled, y_train, cv=kf_10,
                                 scoring='neg_mean_squared_error').mean()
    mse.append(score)

plt.plot(np.arange(1, 17), np.array(mse), '-v')
plt.title("PLS: MSE vs. Principal Components")
plt.xlabel('Number of principal components in PLS regression')
plt.ylabel('MSE')
plt.xlim((-0.2, 17.2))

# Based on the plot, 12 principal components minimized MSE
plsregr_test = PLSRegression(n_components=12, scale=False)
plsregr_test.fit(X_train_scaled, y_train)
MSE_PLS = np.mean((plsregr_test.predict(X_test_scaled) - y_test) ** 2)
print("Mean Squared Error: ", MSE_PLS)

# Compare the results from above. We use R^2 for all models.
Test_avg = np.mean(y_test)
LS_R2 = 1 - MSE_LS / (np.mean((Test_avg - y_test) ** 2))
R_R2 = 1 - MSE_R / (np.mean((Test_avg - y_test) ** 2))
LA_R2 = 1 - MSE_LA / (np.mean((Test_avg - y_test) ** 2))
PCA_R2 = 1 - MSE_PCA / (np.mean((Test_avg - y_test) ** 2))
PLS_R2 = 1 - MSE_PLS / (np.mean((Test_avg - y_test) ** 2))
print("Least Squares Regression (R)^2: ", LS_R2)
print("Ridge Regression (R)^2: ", R_R2)
print("Lasso Regression (R)^2: ", LA_R2)
print("Principal Component Analysis Regression (R)^2: ", PCA_R2)
print("Partial Least Squares Regression (R)^2: ", PLS_R2)
print(clf.coef_)
yvalid_scaled = clf.predict(xvalid_scaled)
err1 = MAPE(y, scalery.inverse_transform(clf.predict(x_scaled)).reshape(-1, 1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1, 1))

'''
Partial Least Squares regression
(the original header said "Elastic Net", but the code below fits PLS)
'''
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=20)
pls.fit(x_scaled, y_scaled)
print(pls.coef_)
yvalid_scaled = pls.predict(xvalid_scaled)
err1 = MAPE(y, scalery.inverse_transform(pls.predict(x_scaled)).reshape(-1, 1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1, 1))

from sklearn.decomposition import PCA
reduced_data = PCA(n_components=2).fit_transform(xtrain_minmax)
pca = PCA(n_components=2)
pca.fit(xtrain_minmax)
print(pca.explained_variance_ratio_)
data_trainO.head(10)
(Xtrain, ytrain) = loadData(xtrainpath, ytrainpath)
(Xtest, ytest) = loadData(xtestpath, ytestpath)

# Trim off the background column and standardize
ytrain = ytrain[:, 1:]
# ytrain = scale(ytrain)
Xtrain = standardize(Xtrain)

ytest = ytest[:, 1:]
# ytest = scale(ytest)
Xtest = standardize(Xtest)

pls = PLSRegression(n_components=10)
pls.fit(Xtrain, ytrain)
y_pls = pls.predict(Xtest)
print(1 + pls.score(Xtest, ytest))

pls_rmse = []
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 0], y_pls[:, 0])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 1], y_pls[:, 1])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 2], y_pls[:, 2])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 3], y_pls[:, 3])))

fig = plt.figure(figsize=(20, 10))
ax1 = fig.add_subplot(241)
ax1.plot(y_pls[:, 0], c='r', label='PLS Fit')
ax1.plot(ytest[:, 0], c='grey', label='Target')
ax1.set_xlabel('Time')
y_levelOne = []
level0Classifier = []
for tid, Xp, yp in zip(subjId_train, X_train, y_train):
    # NOTE: `vid`, `Xt`, `n_components` and `tts` (train_test_split) come
    # from the surrounding script
    print("Predicting subject ", vid, "from subject ", tid)
    y0 = np.zeros(yp.shape)
    y1 = np.ones(Xt.shape[0])
    X = np.vstack([Xp, Xt])
    yd = np.concatenate([y0, y1])

    pls = PLSRegression(n_components)
    Xp_t, Xp_v, yp_t, yp_v = tts(Xp.copy(), yp.copy(), train_size=0.9)
    yp_t = yp_t.astype(bool)
    # Two-column target: [class, not class]
    yp_t_not = np.vstack((yp_t, ~yp_t)).T
    pls.fit(Xp_t, yp_t_not.astype(int))
    yp_new = pls.predict(Xp_t, copy=True)
    yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
    yp_t = yp_t.astype(int)
    error = ((yp_t - yp_pred) ** 2).sum()
    print("PLS Training error ", float(error) / yp_t.shape[0])

    yp_new = pls.predict(Xp_v, copy=True)
    yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
    error = ((yp_v - yp_pred) ** 2).sum()
    print("PLS Validation error ", float(error) / yp_v.shape[0])

    X_new = pls.transform(X)
    rf = RandomForestClassifier(n_estimators=500,
                                max_depth=None,
                                max_features=int(math.sqrt(n_components)),
                                min_samples_split=100,
                                random_state=144,
                                n_jobs=4)
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

nms = [x[0] for x in Descriptors._descList]

def calculator(mols):
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms)
    res = [calc.CalcDescriptors(mol) for mol in mols]
    return res

trainMols = [mol for mol in Chem.SDMolSupplier("solubility.train.sdf")]
testMols = [mol for mol in Chem.SDMolSupplier("solubility.test.sdf")]

trainDescrs = calculator(trainMols)
testDescrs = calculator(testMols)

trainActs = np.array([float(mol.GetProp('SOL')) for mol in trainMols])
testActs = np.array([float(mol.GetProp('SOL')) for mol in testMols])

pls2 = PLSRegression(n_components=15)
pls2.fit(trainDescrs, trainActs)

sol_pred = pls2.predict(testDescrs)
print(type(sol_pred))
print(type(trainActs))
print(metrics.r2_score(testActs, sol_pred))
"""
for i in range(len(sol_pred)):
    print(testActs[i], sol_pred[i])
"""
# Xpls = pls.x_scores_
# Ypls = pls.y_scores_
# CorrCoef = np.corrcoef(Xpls, Ypls, rowvar=0)
# print('')
# print('Correlation between the two datasets in component 1: {:.3}'.format(CorrCoef[2, 0]))
# print('Correlation between the two datasets in component 2: {:.3}'.format(CorrCoef[1, 3]))

### Determine cross-validation scores using stratified k-folds repeated with
### a new random sorting each time (modern API: sklearn.model_selection;
### stratified folds of 1/test_size, or 5 typically)
cvPLS = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=None)

### Find CV scores using root mean squared error for PLS to help determine
### the appropriate number of components
print('')
predPLS = np.array(pls.predict(Data), dtype='int')
msepPLS = mean_squared_error(predPLS, y)
print('PLS MSEP with {:} PLS components: {:.2e}'.format(nPLS, msepPLS))
# scoring='neg_mean_squared_error' returns negated values by convention,
# hence the abs() below
msePLSScores = cross_val_score(pls, Data, y, cv=cvPLS,
                               scoring='neg_mean_squared_error')
print('k-folds PLS MSEP: {:.2e}'.format(abs(np.mean(msePLSScores))))

### Perform classification, then transform the PLS data to the LDA basis
### (modern API: sklearn.discriminant_analysis.LinearDiscriminantAnalysis)
nLDA = 2
clfLDA = LinearDiscriminantAnalysis(n_components=nLDA)
Xlda = clfLDA.fit_transform(TrnsfrmPls[0], ExampleClasses)
# Predict and calculate the misclassification rate