def setUp(self):
    j = 2
    self.train_set = model.TrainingSet('.train_set_synthesis.csv')
    X = np.array([[1, 1.9], [1.9, 1], [3.8, 4.2], [4, 3.6], [3.6, 4.4]])
    Y = np.array([[1, 0], [1, 0], [0, 1], [0, 1], [0, 1]])
    self.train_set.x = X.copy()
    self.train_set.y = Y.copy()
    self.train_set.autoscale()

    # use the autoscaled matrices for sklearn as well
    X = self.train_set.x.copy()
    Y = self.train_set.y.copy()

    self.nipals = model.nipals(X, Y)
    self.sklearn_pls = sklCD.PLSRegression(n_components=j, scale=True,
                                           max_iter=1e4, tol=1e-6, copy=True)
    self.sklearn_pls.fit(X, Y)

    IO.Log.debug('NIPALS x scores', self.nipals.T)
    IO.Log.debug('sklearn x scores', self.sklearn_pls.x_scores_)
    IO.Log.debug('NIPALS x loadings', self.nipals.P)
    IO.Log.debug('sklearn x loadings', self.sklearn_pls.x_loadings_)
    IO.Log.debug('NIPALS x weights', self.nipals.W)
    IO.Log.debug('sklearn x weights', self.sklearn_pls.x_weights_)
    IO.Log.debug('NIPALS y scores', self.nipals.U)
    IO.Log.debug('sklearn y scores', self.sklearn_pls.y_scores_)
    IO.Log.debug('NIPALS y loadings', self.nipals.Q)
    IO.Log.debug('sklearn y loadings', self.sklearn_pls.y_loadings_)
    IO.Log.debug('sklearn y weights', self.sklearn_pls.y_weights_)
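A minimal sketch of the kind of sign-insensitive comparison this fixture appears to set up; the test method name, the two-component slice, and the tolerance are illustrative assumptions, not the project's actual assertions.

def test_x_scores_match_sklearn(self):
    # Illustrative only: PLS scores are defined up to a per-component sign flip,
    # so compare absolute values of the first two components (setUp fits the
    # sklearn model with n_components=2).
    np.testing.assert_allclose(np.abs(self.nipals.T[:, :2]),
                               np.abs(self.sklearn_pls.x_scores_),
                               atol=1e-5)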
def plsRegressAnalysis(xTrain, yTrain, xTest=None, yTest=None):
    # 5-fold CV; recent sklearn requires shuffle=True when random_state is set
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=10)
    trans, features = xTrain.shape
    lvMax = int(min(trans, features) / 3)
    lvBest = 0
    rmsecvBest = np.inf

    # choose the number of latent variables that minimises RMSECV
    for lvTemp in range(1, lvMax + 1):
        squareArray = np.array([])
        for train, test in kf.split(xTrain):
            xTrainTemp = xTrain[train, :]
            yTrainTemp = yTrain[train]
            xTestTemp = xTrain[test, :]
            yTestTemp = yTrain[test]
            yPredictTemp, coefTemp = PLS(xTestTemp, yTestTemp,
                                         xTrainTemp, yTrainTemp, lvTemp)
            # flatten both arrays so the subtraction cannot broadcast to a matrix
            residual = np.ravel(yPredictTemp) - np.ravel(yTestTemp)
            square = np.dot(residual, residual)
            squareArray = np.append(squareArray, square)
            # squareArray.append(square)
        RMSECV = np.sqrt(np.sum(squareArray) / xTrain.shape[0])
        if RMSECV < rmsecvBest:
            rmsecvBest = RMSECV
            lvBest = lvTemp

    if xTest is None:
        return rmsecvBest, lvBest

    # refit on the full training set with the best number of latent variables
    plsModel = cross_decomposition.PLSRegression(n_components=lvBest)
    plsModel.fit(xTrain, yTrain)
    coef = plsModel.coef_
    yPredict = plsModel.predict(xTest)
    yTrainPredict = plsModel.predict(xTrain)
    R2 = sm.r2_score(yTrain, yTrainPredict)
    MSE = sm.mean_squared_error(yTest, yPredict)
    R2P = sm.r2_score(yTest, yPredict)
    return yPredict, R2, rmsecvBest, R2P, MSE, lvBest
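A self-contained usage sketch for plsRegressAnalysis, not from the original source: the synthetic data, the aliases np / model_selection / cross_decomposition / sm, and the PLS() helper defined later in this collection are all assumed to be in scope.

# Hedged usage sketch (assumptions noted above)
import numpy as np
from sklearn import model_selection, cross_decomposition
import sklearn.metrics as sm

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 8))
y = X[:, 0] * 2.0 - X[:, 1] + rng.normal(scale=0.1, size=60)
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=0)

yPred, R2, rmsecv, R2P, MSE, lv = plsRegressAnalysis(X_tr, y_tr, X_te, y_te)
print(lv, rmsecv, R2, R2P, MSE)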
def test_pls(self):
    """Tests the exactness of ppdire's PLS against sklearn's PLSRegression."""
    skpls = skc.PLSRegression(n_components=4)
    skpls.fit(self.Xs, (self.y - np.mean(self.y)) / np.std(self.y))
    pppls = ppdire(projection_index=dicomo,
                   pi_arguments={'mode': 'cov'},
                   n_components=4,
                   square_pi=True,
                   optimizer='SLSQP',
                   optimizer_options={'maxiter': 500})
    pppls.fit(self.x, self.y)
    np.testing.assert_almost_equal(
        np.abs(np.matmul(self.Xs, skpls.coef_) * np.std(self.y) + np.mean(self.y)),
        np.abs(pppls.fitted_),
        decimal=3)
def Pls(df, df2, string):
    # two-component PLS for the scores used as plotting coordinates
    pls2 = PLSRegression(n_components=2)
    xs, ys = pls2.fit_transform(df, df2)
    t = df2.values
    principalDf = pd.DataFrame(data=xs, columns=['pls 1', 'pls 2'])

    # ten-component PLS, used only to get the variance of the X scores
    pls = cross_decomposition.PLSRegression(n_components=10)
    pls.fit(df, df2)
    variance = np.var(pls.x_scores_, axis=0)

    principalDf[string] = t
    return principalDf, variance
def test_PLSRegression(self):
    n = 1000
    q = 3
    p = 10
    X = np.random.normal(size=n * p).reshape((n, p))
    B = np.array([[1, 2] + [0] * (p - 2)] * q).T
    # each Yj = 1*X1 + 2*X2 + noise
    Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

    df = pdml.ModelFrame(X, target=Y)
    pls1 = df.cross_decomposition.PLSRegression(n_components=3)
    df.fit(pls1)
    result = df.predict(pls1)

    pls2 = cd.PLSRegression(n_components=3)
    pls2.fit(X, Y)
    expected = pls2.predict(X)

    self.assertIsInstance(result, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(result.values, expected)
def mx_PLSRegression(train_x, train_y):
    # fit a PLS regression model with sklearn's default number of components
    mx = cross_decomposition.PLSRegression()
    mx.fit(train_x, train_y)
    return mx
ngrams = textfeature()

## load data
train_data = ngrams.load_data('../holger_train_judgeyear.csv', index_col=0)
test_data = ngrams.load_data('../holger_test_judgeyear.csv', index_col=0)
judge_year_index = ngrams.load_data('../datasets/judge_year2index.pkl',
                                    format='pkl')
ngram_dict = ngrams.load_data(
    '../datasets/grams_dict2002-2016/grams_dict.pkl', format='pkl')
bow_feature = ngrams.load_data(
    '../datasets/grams_dict2002-2016/bow_features.pkl', format='pkl')

## candidate models
model_zoo = Counter()
model_zoo['OLS'] = linear_model.LinearRegression()
model_zoo['PLS'] = cross_decomposition.PLSRegression(n_components=200)
model_zoo['RF'] = RandomForestRegressor()
model_zoo['Elastic Net'] = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.7)

features = bow_feature
ngrams.process_data(train_data, judge_year_index, features, istrain=True)
ngrams.process_data(test_data, judge_year_index, features, istrain=False)
ngrams.get_train_test(features)
bow_train, bow_test = ngrams.get_vector()
X_train, X_test = ngrams.get_tfidf(bow_train, bow_test)
train_total, test_total = ngrams.combine_data(X_train, X_test)

## cross-validate the Elastic Net regularisation strength
cvres = []
alphas = np.linspace(0.01, 1, 50)
for a in alphas:
    ela = ngrams.model_pre(model_zoo['Elastic Net'], train_total,
# Scree plot
plt.bar(np.arange(1, spca.named_steps['pca'].n_components_ + 1) - 0.4,
        spca.named_steps['pca'].explained_variance_ratio_)
cum_evr = np.cumsum(spca.named_steps['pca'].explained_variance_ratio_)
plt.plot(np.arange(1, spca.named_steps['pca'].n_components_ + 1), cum_evr,
         color='black')

'''
Partial least squares (PLS) regression
'''

# Create a pipeline that scales the data and performs PLS regression
spls = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('pls', cross_dec.PLSRegression(scale=False))
])

# Train a PLS regression model with three components
spls.set_params(pls__n_components=3)
spls.fit(boroughs, feelings)

# Define folds for cross-validation
# (cv is assumed to be sklearn's pre-0.18 cross_validation module)
kf = cv.KFold(len(feelings), n_folds=10, shuffle=True)

# Compute average MSE across folds
mses = cv.cross_val_score(spls, boroughs, feelings,
                          scoring='mean_squared_error', cv=kf)
np.mean(-mses)
def PLS_DA(cmode, fold_quantity, components):
    print(
        '''-------------------------------------------------------------------------------
Gastrointestinal Lesion Classifier (by Willie Wu and Linus Chen): PLS-DA
-------------------------------------------------------------------------------'''
    )

    # load data
    feature_data = []
    with open('feature_data.csv', newline='') as csvfile:
        data_reader = csv.reader(csvfile, delimiter=',')
        for row in data_reader:
            feature_data.append([int(row[1])] + [float(r) for r in row[2:]])
    print('Data read in successfully...')

    # check input flags
    if cmode not in ['binary', 'multi', 'debug']:
        raise FlagError('Only binary and multi modes are available.')
    if fold_quantity > len(feature_data) // 2 or fold_quantity < -1:
        raise FlagError('Folds must be >= -1 (LOOCV) and <= length of data.')
    if components > len(feature_data) // 2 or components < 1:
        raise FlagError('Components must be > 0 and <= length of data.')

    # benign and malignant classes
    classes = ['Hyp (b)', 'Ser (m)', 'Ade (m)']

    # number of components in the model
    model = skcd.PLSRegression(components)

    # "pair" indicates that two rows were combined into one
    pair_data = np.array([
        feature_data[i][1:] + feature_data[i + 1][1:]
        for i in range(0, len(feature_data) - 1, 2)
    ])
    pair_answers = np.array(
        [feature_data[i][0] for i in range(0, len(feature_data) - 1, 2)])
    pair_modified_answers = []

    # Multiclass prediction is 3 runs of PLS-DA, one for x vs. not-x;
    # for binary, only the first run (hyp vs. not-hyp) is checked.
    # Build 3 different sets of +1/-1 answers, one per classification.
    for i in range(len(classes)):
        pair_modified_answers.append([
            1 if pair_answers[j] == (i + 1) else -1
            for j in range(0, len(pair_answers))
        ])
    pair_modified_answers = np.array(pair_modified_answers)

    if fold_quantity == -1:
        fold_quantity = len(pair_data)

    # randomly assign each lesion to a fold, with approximately equal sizes
    folds = [[] for a in range(fold_quantity)]
    fold_membership = [i % fold_quantity for i in range(len(pair_data))]
    shuffle(fold_membership)
    for b in range(len(fold_membership)):
        folds[fold_membership[b]].append(b)

    # make predictions for each fold and each class
    test_pair_predictions = []
    test_pair_answers = []
    for f in range(fold_quantity):
        fold_test_pair_predictions = []
        test_fold = folds[f]
        train_fold = [i for i in range(len(pair_data)) if i not in test_fold]
        for c in range(len(classes)):
            # set training data
            train_pair_data = pair_data[train_fold]
            train_pair_modified_answers = pair_modified_answers[c][train_fold]
            # set test data
            test_pair_data = pair_data[test_fold]
            # fit model to training data, predict
            model.fit(train_pair_data, train_pair_modified_answers)
            fold_test_pair_predictions.append(
                model.predict(test_pair_data).flatten())
        test_pair_predictions.append(
            np.swapaxes(fold_test_pair_predictions, 0, 1))
        test_pair_answers.append(pair_answers[test_fold])

    # find which class has the highest predicted value
    if cmode in ['binary', 'debug']:
        predictions = [
            1 if ff[0] > 0 else 0 for f in test_pair_predictions for ff in f
        ]
        answers = [1 if aa == 1 else 0 for a in test_pair_answers for aa in a]
        acc, sens, spec, f1 = binary_calc_model_stats(predictions, answers)
        if cmode != 'debug':
            print('F1 Score: {0:.2f}%'.format(round(f1 * 100, 2)))
            print('Accuracy: {0:.2f}%'.format(round(acc * 100, 2)))
            print('Sensitivity: {0:.2f}%'.format(round(sens * 100, 2)))
            print('Specificity: {0:.2f}%'.format(round(spec * 100, 2)))
            print('===================')
        return (acc, sens, spec, f1, np.array(model.x_loadings_),
                np.array(model.y_loadings_), answers, test_pair_predictions)

    if cmode == 'multi':
        # pick the class with the highest predicted value for each sample
        predictions = [
            np.argmax(ff) + 1 for f in test_pair_predictions for ff in f
        ]
        acc, sens, spec, tots = multi_calc_model_stats(predictions, pair_answers)
        print('Total Accuracy: {0:.2f}%'.format(round(tots[0] * 100)))
        print('F1 Score: {0:.2f}%'.format(round(tots[1] * 100, 2)))
        for ac, se, sp, name in zip(acc, sens, spec,
                                    ['Hyperplasic', 'Serrated', 'Adenoma']):
            print(name + ' stats: ')
            print('Accuracy: {0:.2f}%'.format(round(ac * 100, 2)))
            print('Sensitivity: {0:.2f}%'.format(round(se * 100, 2)))
            print('Specificity: {0:.2f}%'.format(round(sp * 100, 2)))
            print('===================')
modes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Y = pc.projectedWeights[0:len(modes), :].T

#### Generating final models
Recon_models = []
for train_index, test_index in loo:
    test_index = test_index[0]
    X_train, X_test = X[train_index], X[test_index]
    # Y_train, Y_test = Y[train_index], Y[test_index]

    # initialize plsr
    plsr = cross_decomposition.PLSRegression(n_components=5, scale=False)
    plsr.fit(X, Y)

    # prediction
    y = (plsr.predict(X_test)).reshape((len(modes),))

    # Reconstruct test instance
    # P = pc.reconstruct(
    #     pc.getWeightsBySD(modes, y),
    #     modes
    # )
    P = pc.reconstruct(y, modes)

    # Reshape - this will maybe be different for you - hardcoded for mine here
    # (integer division so the shape stays an int in Python 3)
    R = P.reshape((np.shape(Data_flat)[1] // 3, 3))
    Recon_models.append(R)
def pls_inspection(X: np.ndarray, Y: np.ndarray, n_comps: int):
    n_classes = len(np.unique(Y))
    Y_encoded = one_hot_encode(Y)

    model = cross_decomposition.PLSRegression(n_components=n_comps, scale=False)
    model.fit(X, Y_encoded)

    # Extract information
    scores = model.x_scores_
    loadings = model.x_loadings_
    var_scores = np.var(scores, axis=0)
    var_X = np.sum(np.var(X, axis=0))
    var_ratios = var_scores / var_X
    cum_var_ratios = np.cumsum(var_ratios)

    # Colormap
    cmap = plt.cm.jet
    cmaplist = [cmap(i) for i in range(cmap.N)]
    cmap = mpl.colors.LinearSegmentedColormap.from_list(
        'Custom map', cmaplist, cmap.N)
    bounds = np.linspace(0, n_classes, n_classes + 1)
    norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

    # Explained variance plot
    plt.figure(num=3, figsize=(8, 6))
    plt.plot(np.pad(cum_var_ratios, (1, 0), 'constant'))
    plt.title('Explained variance')
    plt.xlabel('Principal components')
    plt.ylabel('Cumulative explained variance')
    plt.xlim((0, n_comps))
    plt.ylim((0, 1))

    # Loadings plots
    plt.figure(num=4, figsize=(8, 6))
    plt.plot(loadings[:, 0])
    plt.title('PC1 loadings')
    plt.figure(num=5, figsize=(8, 6))
    plt.plot(loadings[:, 1])
    plt.title('PC2 loadings')
    plt.figure(num=6, figsize=(8, 6))
    plt.plot(loadings[:, 2])
    plt.title('PC3 loadings')

    # 2D scores plot
    plt.figure(num=7, figsize=(8, 6))
    scat = plt.scatter(scores[:, 0], scores[:, 1], c=Y, s=2,
                       cmap=cmap, norm=norm)
    cb = plt.colorbar(scat, spacing='proportional', ticks=bounds)
    cb.set_label('Classes')
    plt.title('Scores plot (PLS)')
    plt.xlabel('PC1 ({:.2f}% explained variance)'.format(var_ratios[0] * 100))
    plt.ylabel('PC2 ({:.2f}% explained variance)'.format(var_ratios[1] * 100))

    # 3D scores plot
    fig = plt.figure(num=8, figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')
    scat = ax.scatter(scores[:, 0], scores[:, 1], scores[:, 2], c=Y, s=2,
                      cmap=cmap, norm=norm)
    cb = plt.colorbar(scat, spacing='proportional', ticks=bounds)
    cb.set_label('Classes')
    ax.set_title('Scores plot (PLS)')
    ax.set_xlabel('PC1 ({:.2f}% explained variance)'.format(var_ratios[0] * 100))
    ax.set_ylabel('PC2 ({:.2f}% explained variance)'.format(var_ratios[1] * 100))
    ax.set_zlabel('PC3 ({:.2f}% explained variance)'.format(var_ratios[2] * 100))

    plt.show()
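A hedged usage sketch for pls_inspection, not part of the original source: it assumes the usual aliases (np, plt, mpl, cross_decomposition) and supplies a minimal stand-in for the one_hot_encode() helper the function relies on; the synthetic data is illustrative only.

# Hedged usage sketch (assumptions noted above)
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the 3D projection)
from sklearn import cross_decomposition

def one_hot_encode(y):
    # assumed behaviour: map integer class labels to a one-hot matrix
    classes = np.unique(y)
    return (y.reshape(-1, 1) == classes.reshape(1, -1)).astype(float)

rng = np.random.default_rng(42)
X_demo = np.vstack([rng.normal(loc=c, size=(30, 6)) for c in range(3)])
Y_demo = np.repeat(np.arange(3), 30)
pls_inspection(X_demo, Y_demo, n_comps=3)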
#models.append( {"name": "1.6.3. RadiusNeighborsRegressor uniform", \
#                "model": neighbors.RadiusNeighborsRegressor(weights = "uniform")} )
#ZeroDivisionError: Weights sum to zero, can't be normalized
#models.append( {"name": "1.6.3. RadiusNeighborsRegressor distance", \
#                "model": neighbors.RadiusNeighborsRegressor(weights = "distance")} )

models.append( {"name": "1.6.3. NearestCentroid", \
                "model": neighbors.NearestCentroid()} )

## 1.7. Gaussian Processes
## too slow?
#models.append( {"name": "1.7. Gaussian Processes", \
#                "model": gaussian_process.GaussianProcess()} )

## 1.8. Cross decomposition
models.append( {"name": "1.8. Cross decomposition PLSRegression", \
                "model": cross_decomposition.PLSRegression()} )
models.append( {"name": "1.8. Cross decomposition PLSCanonical", \
                "model": cross_decomposition.PLSCanonical()} )
# slow
#models.append( {"name": "1.8. Cross decomposition CCA", \
#                "model": cross_decomposition.CCA()} )

## 1.9. Naive Bayes (for classification?)
#ValueError: Unknown label type: array
#models.append( {"name": "1.9.1. GaussianNB", \
#                "model": naive_bayes.GaussianNB()} )
# doesn't work for this dataset?
#models.append( {"name": "1.9.2. MultinomialNB", \
#                "model": naive_bayes.MultinomialNB()} )
def PLS(xTest, yTest, xTrain, yTrain, nComponents): plsModel = cross_decomposition.PLSRegression(n_components=nComponents) plsModel.fit(xTrain,yTrain) coef = plsModel.coef_ yPredict = plsModel.predict(xTest) return yPredict, plsModel