Example 1
    def fitcv(self):
        pls = PLSRegression(n_components=self.n_components,scale=False)
        kf = KFold(n_splits=self.n_splits)
        yTrue=None
        yHat=None
        # check whether Y is 1-D or 2-D
        dimensiony=len(self.Y.shape)

        for train_index, test_index in kf.split(self.X):
            X_train, X_test = self.X[train_index], self.X[test_index]
            y_train, y_test = self.Y[train_index], self.Y[test_index]
            pls.fit(X_train, y_train)
            if dimensiony==1:
                ypred = pls.predict(X_test)[:,0]
            else:
                ypred = pls.predict(X_test)
            ypred[ypred>0]=1
            ypred[ypred<0]=-1
            if yTrue is None:
                yTrue=y_test  # true values
                yHat=ypred  # predicted values
            else:
                yTrue=np.r_[yTrue,y_test]
                yHat=np.r_[yHat, ypred]
        err=yTrue-yHat
        errSampleNo=np.where(err!=0)
        err=err[err!=0]
        
        return len(err)/len(self.X)*100,errSampleNo  # return misclassification rate (%) and indices of misclassified samples
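The method above thresholds the continuous PLS output at zero to recover ±1 class labels and accumulates a misclassification rate across folds. A minimal standalone sketch of the same idea on synthetic data (all names and shapes below are illustrative, not taken from the original class):

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
X = rng.rand(60, 30)
y = np.where(X[:, 0] > 0.5, 1, -1)            # synthetic +/-1 labels

y_true, y_hat = [], []
for train_idx, test_idx in KFold(n_splits=5).split(X):
    pls = PLSRegression(n_components=3, scale=False)
    pls.fit(X[train_idx], y[train_idx])
    pred = pls.predict(X[test_idx])[:, 0]      # continuous PLS scores
    y_hat.append(np.where(pred > 0, 1, -1))    # threshold at zero
    y_true.append(y[test_idx])

y_true, y_hat = np.concatenate(y_true), np.concatenate(y_hat)
print(100.0 * np.mean(y_true != y_hat))        # misclassification rate in percent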
Example 2
def get_score(X_train, X_test, y_train, y_test, nc):
    '''
    input: training and testing datasets
    output: r2 scores of the two methods -> pca_score, pls_score
    '''
    # PCA method
    pca = PCA(n_components=nc)
    X_train_reduced = pca.fit_transform(X_train)
    X_test_reduced = pca.transform(X_test)
    pcr = LinearRegression().fit(X_train_reduced, y_train)
    pca_score = pcr.score(X_test_reduced, y_test)
    predictions = pcr.predict(X_test_reduced)  # test-set predictions
    predictions1 = pcr.predict(X_train_reduced)  # training-set predictions
    print(predictions, predictions1)
    plt.title("comparison of PLSR and PCA method(nc={},{})".format(nc, item))
    plt.xlabel("observed")
    plt.ylabel("fitted")
    plt.scatter(y_test / 100, predictions / 100, label='pca')

    # PLS method
    pls = PLSRegression(n_components=nc, ).fit(X_train, y_train.astype(int))
    pls_score = pls.score(X_test, y_test)
    yfit = pls.predict(X_test)
    yfit1 = pls.predict(X_train)
    print(yfit, yfit1)
    plt.scatter(y_test / 100, yfit / 100, label='plsr')
    plt.legend()
    # plt.show()

    return pca_score, pls_score, predictions / 100, predictions1 / 100, yfit / 100, yfit1 / 100
Example 3
def do_pls(data_x, data_y, train_split_percentage):
    latent_variables = []

    x_test, x_train, y_test, y_train = train_test_split(data_x, data_y, test_size=train_split_percentage, random_state=0)

    for i in range(20):
        pls = PLSRegression(n_components=(i + 1), scale=True)
        pls.fit(x_train, y_train)
        predicted_cv_y = pls.predict(x_test)
        mean_squared_error_cv = sqrt(mean_squared_error(y_test, predicted_cv_y))
        latent_variables.append(mean_squared_error_cv)

    best_factor = np.argmin(latent_variables)
    pls2 = PLSRegression(n_components=(best_factor + 1), scale=True)
    pls2.fit(x_train, y_train)
    predicted_cal = pls2.predict(x_train)
    rmsec = sqrt(mean_squared_error(y_train, predicted_cal))
    r2c = pls2.score(x_train, y_train)

    predicted_cv_y = pls2.predict(x_test)
    rmsecv = sqrt(mean_squared_error(y_test, predicted_cv_y))
    r2v = pls2.score(x_test, y_test)

    plsfinal = PLSRegression(n_components=(best_factor + 1), scale=True)
    plsfinal.fit(data_x, data_y)

    return plsfinal, rmsec, r2c, rmsecv, r2v
Example 4
def PLS(X, y, X_ind, y_ind):
    """ Cross validation and Independent test for PLS regression model.
        Arguments:
            X (np.ndarray): m x n feature matrix for cross validation, where m is the number of samples
                and n is the number of features.
            y (np.ndarray): m-d label array for cross validation, where m is the number of samples and
                equals to row of X.
            X_ind (np.ndarray): m x n Feature matrix for independent set, where m is the number of samples
                and n is the number of features.
            y_ind (np.ndarray): m-d label array for independent set, where m is the number of samples and
                equals to row of X_ind, and l is the number of types.
            reg (bool): if True, the training is for regression, otherwise for classification.
         Returns:
            cvs (np.ndarray): m x l result matrix for cross validation, where m is the number of samples and
                equals to row of X, and l is the number of types and equals to row of X.
            inds (np.ndarray): m x l result matrix for independent test, where m is the number of samples and
                equals to row of X, and l is the number of types and equals to row of X.
    """
    folds = KFold(5).split(X)
    cvs = np.zeros(y.shape)
    inds = np.zeros(y_ind.shape)
    for i, (trained, valided) in enumerate(folds):
        model = PLSRegression()
        model.fit(X[trained], y[trained])
        cvs[valided] = model.predict(X[valided])[:, 0]
        inds += model.predict(X_ind)[:, 0]
    return cvs, inds / 5
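A short call sketch for the PLS function above, assuming its module already imports numpy, KFold and PLSRegression; the arrays are synthetic and purely illustrative:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(50, 20)                      # cross-validation features
y = X[:, 0] + 0.1 * rng.randn(50)         # cross-validation labels
X_ind = rng.rand(10, 20)                  # independent-set features
y_ind = X_ind[:, 0] + 0.1 * rng.randn(10)

cvs, inds = PLS(X, y, X_ind, y_ind)
print(cvs.shape, inds.shape)              # out-of-fold predictions and fold-averaged independent predictions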
Example 5
def plot_pls_results(x_data, y_data, pls_components, num_variables):
    pls = PLSRegression(pls_components)

    cv_splitter = GroupShuffleSplit(n_splits=1, test_size=0.35,
                                    random_state=6)  # 1
    group_splitter = data_full['Leaf number']
    print('111111111')
    print(x_data)

    for train_index, test_index in cv_splitter.split(x_data, y_data,
                                                     group_splitter):
        # print(train_index, test_index)
        x_train, x_test = x_data.iloc[train_index], x_data.iloc[test_index]
        y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

        pls.fit(x_train, y_train)
        y_pred_train = pls.predict(x_train)
        y_pred_test = pls.predict(x_test)
        r2_test = r2_score(y_test, y_pred_test)
        r2_train = r2_score(y_train, y_pred_train)

        mae_test = mean_absolute_error(y_test, y_pred_test)
        mae_train = mean_absolute_error(y_train, y_pred_train)

        print(r2_test, mae_test)
        print(r2_train, mae_train)
        print(r2_score(y_train, y_pred_train))
        print(r2_score(y_test, y_pred_test))
        plt.scatter(y_train, y_pred_train, c='blue', label='Training Set')
        plt.scatter(y_test, y_pred_test, c='red', label='Test Set')

        _line = np.linspace(0.2, 1.2)

        # plt.plot(_line, _line, c='indigo', linestyle='dashed')
        #
        # plt.plot(_line, _line + .06, c='darkslategray', linestyle='dashed')
        # plt.plot(_line, _line - .06, c='darkslategray', linestyle='dashed')
        #
        # left_annote_pos = 0.20
        # plt.annotate("Training Median Absolute Error = {}".format(0.059),
        #              (left_annote_pos, 1.1), fontsize=12)
        # # plt.annotate("Testing Median Absolute Error = {}".format(0.07),
        # #              (left_annote_pos, 1.02), fontsize=12)
        #
        # plt.annotate(u"Training R\u00B2 = {}".format(0.83),
        #              (left_annote_pos, .95), fontsize=12)
        #
        # # plt.annotate(u"Testing R\u00B2 = {}".format(0.82),
        # #              (left_annote_pos, .89), fontsize=12)
        # plt.xlabel('Meausured Chlorophyll b (ug/ml)', fontsize=16)
        # plt.ylabel('Predicted Chlorophyll b (ug/ml)', fontsize=16)
        # plt.title("Chlorophyll b Model for AS7262\nbased on 2-Component\nPartial Least Squared Model",
        #           fontsize=18)
        # plt.legend(loc='lower right', fontsize=12)
        plt.tight_layout()
        plt.show()
        plt.scatter(y_pred_train, y_train, c='blue', label='Training Set')
        plt.scatter(y_pred_test, y_test, c='red', label='Test Set')
        plt.show()
Example 6
 def pls_cv(self,ncomp_range=range(1,21),plot=False,verbose=False,
            osc_params=(10,1)):
     # Separating X from Y for PLS
     X=self.df[self.freqs].to_numpy()
     Y=self.df[self.y_name].to_numpy().reshape(-1, 1)
     sample_std=np.std(self.df[self.y_name])
     
     # CV based on measurement day
     if self.cval=="MD":
         cv = LeaveOneGroupOut()
         folds=list(cv.split(X=X,y=Y,groups=self.df[self.date_name]))
     # kfold CV
     elif self.cval=="kfold":
         cv = KFold(n_splits=self.cval_param)
         folds=list(cv.split(X))
     else:
         raise InputError("Invalid CV type!")
     
     # Array for storing CV errors
     cv_RMSE_all=np.zeros([len(folds),len(ncomp_range)])
     i=0
     for train, val in folds:
         # If OSC model specified
         if len(osc_params)==2:
             osc=OSC(nicomp=osc_params[0],ncomp=osc_params[1])
             osc.fit(X[train], Y[train])
             X_train_osc=osc.X_osc
             X_val_osc=osc.transform(X[val])
         j=0
         for ncomp in ncomp_range:
             pls = PLSRegression(n_components=ncomp,scale=False)
             if len(osc_params)==2:
                 pls.fit(X_train_osc, Y[train])
                 cv_RMSE_all[i,j]=metrics.mean_squared_error(
                     Y[val], pls.predict(X_val_osc))**0.5
             else:
                 pls.fit(X[train], Y[train])
                 cv_RMSE_all[i,j]=metrics.mean_squared_error(
                         Y[val], pls.predict(X[val]))**0.5
             j=j+1
         i=i+1
     # Printing and plotting CV results
     cv_RMSE_ncomp=np.mean(cv_RMSE_all,axis=0)
     cv_RPD_ncomp=sample_std/cv_RMSE_ncomp
     if plot:
         fig = plt.figure(figsize=(12,8))
         plt.gca().xaxis.grid(True)
         plt.xticks(ncomp_range)
         plt.ylabel("RPD")
         plt.xlabel("Number of components")
         plt.plot(ncomp_range,cv_RPD_ncomp)
     # Best model
     rpd_best=max(cv_RPD_ncomp)
     ncomp_best=ncomp_range[cv_RMSE_ncomp.argmin()]
     if verbose:
         print("Best RMSE: ",min(cv_RMSE_ncomp))
         print("Best RPD: ",max(cv_RPD_ncomp))
         print("Number of latent components: ",ncomp_range[cv_RMSE_ncomp.argmin()])
     return (ncomp_best,rpd_best)
Example 7
def PLS_DA(datos):
        
    global pls_bi
        
    datos_bi = datos[(datos['etiqueta'] == 5 ) | (datos['etiqueta'] == 6)]
    
    X_bi = savgol_filter(datos_bi.values[:,2:], 15, polyorder = 3, deriv=0)
    
    y_biP = datos_bi["etiqueta"].values
    
    y_bi = (y_biP == 6).astype('uint8')
    
    
    pls_bi = PLSRegression(n_components=2)
    
    X_pls = pls_bi.fit_transform(X_bi, y_bi)[0] 
    
    labplot = ["60/40 ratio", "50/50 ratio"]
    
    unique = list(set(y_bi))
    colors = [plt.cm.jet(float(i)/max(unique)) for i in unique]
    with plt.style.context(('ggplot')):
        plt.figure(figsize=(12,10))
        for i, u in enumerate(unique):
            col = np.expand_dims(np.array(colors[i]), axis=0)
            x = [X_pls[j,0] for j in range(len(X_pls[:,0])) if y_bi[j] == u]
            y = [X_pls[j,1] for j in range(len(X_pls[:,1])) if y_bi[j] == u]
            plt.scatter(x, y, c=col, s=100, edgecolors='k',label=str(u))
            plt.xlabel('Variable Latente 1')
            plt.ylabel('Variable Latente 2')
            plt.legend(labplot,loc='lower left')
            plt.title('Descomposición cruzada PLS')
            plt.show()
            
    X_entreno, X_prueba, y_entreno, y_prueba = train_test_split(X_bi, y_bi, test_size=0.2, random_state=19)

    pls_bi = PLSRegression(n_components=2)
    
    pls_bi.fit(X_entreno, y_entreno)
    
    y_prediccion1 = pls_bi.predict(X_prueba)[:,0] 
    prediccion_binaria1 = (pls_bi.predict(X_prueba)[:,0] > 0.5).astype('uint8')
    print(prediccion_binaria1, y_prueba)
    
    precision = []
    A=[]
    m=0
    cvalor = KFold(n_splits=40, shuffle=True, random_state=19)
    for train, test in cvalor.split(X_bi):
        
        y_prediccion = PLS_DA1(X_bi[train,:], y_bi[train], X_bi[test,:])
        A.append(y_prediccion)
        precision.append(accuracy_score(y_bi[test], y_prediccion))
        m=m+1
        print("Precisión Promedio para 10 Divisiones: ", np.array(precision).mean())
    
    return prediccion_binaria1, precision
Example 8
def train_and_predict_PLS(X_train, y_train, X_test, n_components=None):
    # fit regression model on train
    regr = PLSRegression(n_components=n_components).fit(X_train, y_train)
    bic_val = bic(X_train, y_train)

    # make predictions on test set
    test_preds = regr.predict(
        X_test)  # predictions of one parameter at n pixels
    train_preds = regr.predict(X_train)  #
    return test_preds, train_preds, bic_val
Example 9
def PartialLeastSquares(X_train, X_test, y_train, y_test=None):
    if y_test is not None:
        model = PLSRegression()
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        return metrics(X_train, y_test, predicted)
    else:
        model = PLSRegression()
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        return predicted
Example 10
class PLSPredictor:
    def __init__(self):
        self.pls2 = PLSRegression(n_components=2,
                                  scale=True,
                                  max_iter=500,
                                  tol=1e-06,
                                  copy=True)

    def predict(self, values):
        return self.pls2.predict(values)

    def train(self, measured_values, screen_points):
        self.pls2.fit(measured_values, screen_points)
Example 11
class PLS():
    """
    Implement PLS to make it compliant with the other dimensionality
    reduction methodology.
    (Simple class rewritting).
    """
    def __init__(self, n_components=10):
        self.clf = PLSRegression(n_components)

    def get_components_(self):
        return self.clf.x_weights_.transpose()

    def set_components_(self, x):
        pass

    components_ = property(get_components_, set_components_)

    def fit(self, X, y):
        self.clf.fit(X,y)
        return self

    def transform(self, X):
        return self.clf.transform(X)

    def predict(self, X):
        return self.clf.predict(X)
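A brief usage sketch for the wrapper above (synthetic data; assumes numpy and the snippet's PLSRegression import). It shows how the fitted x_weights_ are exposed through components_ like the other dimensionality-reduction classes:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(40, 15)
y = X[:, :2].sum(axis=1) + 0.05 * rng.randn(40)

model = PLS(n_components=3).fit(X, y)
print(model.components_.shape)   # (n_components, n_features), i.e. x_weights_.T
print(model.transform(X).shape)  # latent scores, (n_samples, n_components)
print(model.predict(X).shape)    # predictions, (n_samples, 1)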
Example 12
class MSLMultiModel(WebModel):
    ''' a Multitask version of MSLModel '''
    def __init__(self, output_dir, ccs_dir, lanl_file, n_components=10, **kwargs):
        self.output_dir = output_dir
        self.ccs_dir = ccs_dir
        self.lanl_file = lanl_file

        self.n_components = n_components
        self.model = PLSRegression(n_components=n_components, scale=False)
        self.multitask = True
        self.name = 'msl_multi_model'

    def fit(self, data, composition, elements):
        self.elements = elements  # order matters
        data = libs_norm3(data[:, ALAMOS_MASK])
        self.model.fit(data, composition)

    def predict(self, data, mask=ALAMOS_MASK, clip=True):
        data = libs_norm3(data[:, mask])
        predictions = self.model.predict(data, copy=False)
        if clip:
            predictions = np.clip(predictions, 0, 100)
        else:
            predictions[predictions < 0] = 0
        return predictions
Example 13
class PLSDADummy(BaseEstimator):
    """
    Wrapper of PLSRegression for classification.

    PLSRegression predicts one hot encoded vectors,
    then plsda outputs class with maximal score.
    """
    def __init__(self, n_components=2):
        self.pls = PLSRegression(n_components)
        self.classes = None

    def __one_hot_encode(self, Y):
        # encode labels to numbers
        Y = np.array([np.where(self.classes == y)[0][0] for y in Y])

        enc = OneHotEncoder(n_values=len(self.classes))
        return enc.fit_transform(Y.reshape(-1, 1)).toarray()

    def fit(self, X, Y):
        """

        :param X:
        :param Y: list of labels
        :return :
        """
        self.classes = np.array(sorted(np.unique(Y)))

        Y = self.__one_hot_encode(Y)
        self.pls.fit(X, Y)

        return self

    def predict(self, X):
        y_pred = np.argmax(self.pls.predict(X), axis=1)
        return np.array([self.classes[cls] for cls in y_pred])
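A short usage sketch for PLSDADummy above with string labels (synthetic data; note that OneHotEncoder(n_values=...) targets older scikit-learn releases, so this runs as-is only there):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(30, 10)
Y = np.where(X[:, 0] > 0.5, 'treated', 'control')   # two classes as strings

clf = PLSDADummy(n_components=2).fit(X, Y)
print(clf.classes)         # sorted unique labels
print(clf.predict(X)[:5])  # label of the column with the maximal PLS score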
Example 14
def knn_denoise(X, X_reference, *, k, ncomp):
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.neighbors import NearestNeighbors
    # Xb = X * window

    print('PCA...')
    npca = np.minimum(300, X.shape[0])
    u, s, vh = np.linalg.svd(X)
    features = u[:, 0:npca] * s[0:npca]
    components = vh[0:npca, :]
    # s = s[0:npca]

    print('Nearest neighbors...')
    nbrs = NearestNeighbors(n_neighbors=k + 1,
                            algorithm='ball_tree').fit(features)
    distances, indices = nbrs.kneighbors(features)

    features2 = np.zeros(features.shape, dtype=features.dtype)
    for j in range(X.shape[0]):
        print(f'{j+1} of {X.shape[0]}')
        inds0 = np.squeeze(indices[j, :])
        inds0 = inds0[1:]
        # Xbneighbors = Xb[inds0, :]
        f_neighbors = features[inds0, :]
        pls = PLSRegression(n_components=ncomp)
        # pls.fit(Xbneighbors.T, Xb[j, :].T)
        pls.fit(f_neighbors.T, features[j, :].T)
        features2[j, :] = pls.predict(f_neighbors.T).T
        # X2[j, :] = pls.predict(Xbneighbors.T).T
    print(features2.shape)
    print(components.shape)
    X2 = features2 @ components
    return X2
Example 15
def fit_plt(dados,ncomp):
    from sklearn.cross_decomposition import PLSRegression
    colmap = [ (0,0,0), (1,0,0),(0,1,0),(0,0,1),(0.41,0.41,0.41),(0,1,1),
        (0.58,0,0.82),(0,0.50,0),(0.98,0.50,0.44),(1,1,0.87),
        (0.39,0.58,0.92),(0.50,0.50,0),(1,0.89,0.76),(0.96,0.96,0.86),
        (0,1,1)]    
    g = dados['g']
    r = dados['r']
    wn = dados['wn']
    pls = PLSRegression(n_components=ncomp)
    pls.fit(r,g)
    Y_pred = pls.predict(r)
    plt.figure() 
    plt.subplot(2,1,1)
    for i in range(1,g.max()+1):
        sel = g == i
        plt.scatter(g[sel],Y_pred[sel],color = colmap[i])
    plt.xlabel('Y_class', fontsize=12)
    plt.ylabel('Y_predicted', fontsize=12)
    plt.xticks(np.arange(1,g.max() + 1), str(dados['arqs']).split('::'))
    
    plt.subplot(2,1,2)
    for i in range(1,g.max()+1):
        sel = g == i
        plt.hist(Y_pred[sel])
    plt.xlabel('Y_class', fontsize=12)
    plt.ylabel('histograma', fontsize=12)
    plt.xticks(np.arange(1,g.max() + 1), str(dados['arqs']).split('::'))
Example 16
class MyPLS():
    def __init__(self,
                 n_components=2,
                 scale=True,
                 max_iter=500,
                 tol=1e-06,
                 copy=True):
        self.pls = PLSRegression(n_components=n_components, scale=scale,
                                 max_iter=max_iter, tol=tol, copy=copy)

    def fit(self, X, Y):
        self.pls.fit(X, Y)
        return self.pls

    def predict(self, X, copy=True):
        return self.pls.predict(X, copy).flatten()

    def score(self, X, Y, sample_weight=None):
        return self.pls.score(X, Y, sample_weight)

    def get_params(self, deep=True):
        return self.pls.get_params(deep)

    def set_params(self, **parameters):
        self.pls.set_params(**parameters)
        return self

    @property
    def intercept_(self):
        return 0

    @property
    def coeff_(self):
        return self.pls.coef_
Example 17
 def fit_pls(self, X_test):
     reg = PLSRegression(n_components=20, scale=False, max_iter=1000)
     reg.fit(self.X.copy().values, self.y.copy().values.flatten())
     preds = reg.predict(X_test.copy().values)
     ids = X_test.index
     pred_df = pd.DataFrame(data=preds, index=ids, columns=['SalePrice'])
     pred_df.to_csv('results/results_pls.csv', sep=',')
Example 18
def simple_pls_cv(X, y, n_comp):
    # Run PLS with suggested number of components
    pls = PLSRegression(n_components=n_comp)
    pls.fit(X, y)
    y_c = pls.predict(X)
    # Cross-validation
    y_cv = cross_val_predict(pls, X, y, cv=10)
    # Calculate scores for calibration and cross-validation
    score_c = r2_score(y, y_c)
    score_cv = r2_score(y, y_cv)
    # Calculate mean square error for calibration and cross validation
    mse_c = mean_squared_error(y, y_c)
    mse_cv = mean_squared_error(y, y_cv)
    print('R2 calib: %5.3f' % score_c)
    print('R2 CV: %5.3f' % score_cv)
    print('MSE calib: %5.3f' % mse_c)
    print('MSE CV: %5.3f' % mse_cv)
    # Plot regression
    z = np.polyfit(y, y_cv, 1)
    with plt.style.context(('ggplot')):
        fig, ax = plt.subplots(figsize=(9, 5))
        ax.scatter(y_cv, y, c='red', edgecolors='k')
        ax.plot(z[1] + z[0] * y, y, c='blue', linewidth=1)
        ax.plot(y, y, color='green', linewidth=1)
        plt.title('$R^{2}$ (CV): ' + str(score_cv))
        plt.xlabel('Predicted $^{\circ}$Brix')
        plt.ylabel('Measured $^{\circ}$Brix')

        plt.show()
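A minimal call sketch for simple_pls_cv above on synthetic spectra-like data (illustrative values only, not the original Brix dataset; assumes the snippet's numpy, matplotlib and scikit-learn imports):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(80, 50)                         # 80 samples x 50 wavelengths
y = 10 + 5 * X[:, 10] + 0.2 * rng.randn(80)  # synthetic reference values

simple_pls_cv(X, y, n_comp=3)                # prints calibration/CV R2 and MSE, then shows the plot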
Example 19
  def predicao(self, idmodelo, idamostra):

    idmodelo = idmodelo

    idamostra = idamostra

    print(idmodelo)
    print(idamostra)

    X = self.selectMatrizX(idmodelo, "VALIDACAO")
    Y = self.selectMatrizY(idmodelo, "VALOR", "VALIDACAO")

    amostraPredicao = self.selectAmostra(idamostra, idmodelo)

    valorReferencia = self.selectDadosReferenciaAmostra(idamostra, idmodelo)

    pls = PLSRegression(copy=True, max_iter=500, n_components=20, scale=False, tol=1e-06)

    pls.fit(X, Y)
    print(amostraPredicao)
    valorPredito = pls.predict(amostraPredicao)

    print('Amostra: ' + str(idamostra) + ' - Valor Predito :' + str(valorPredito) + ' - Valor Referencia :' + str(
      valorReferencia))

    cursorDadosCalibracao = db.execute("select rmsec, rmsep, coeficientecal, coeficienteval, dtcalibracao "
                                       "from calibracao where inativo = 'A' and idmodelo = " + str(idmodelo) + " ")
    for regCodigo in cursorDadosCalibracao:
      rmsec = regCodigo[0]
      rmsep = regCodigo[1]
      coeficienteCal = regCodigo[2]
      coeficienteVal = regCodigo[3]
      dtcalibracao = regCodigo[4]

    print(rmsec)
    print(rmsep)
    print(coeficienteCal)
    print(coeficienteVal)
    print(dtcalibracao)

    dtcalibracao = dtcalibracao.strftime('%d/%m/%Y')
    print(dtcalibracao)

    # prepare the data for the JSON response
    coeficienteCal = round(coeficienteCal, 2)
    coeficienteVal = round(coeficienteVal, 2)
    rmsec = round(rmsec, 2)
    rmsep = round(rmsep, 2)
    valorReferencia = round(valorReferencia, 2)

    valorPreditoString = str(valorPredito)
    valorPreditoString = valorPreditoString.replace("[", "")
    valorPreditoString = valorPreditoString.replace("]", "")

    # Build the JSON response
    json_data = jsonify(idamostra=str(idamostra), valorpredito=str(valorPreditoString),
                        rmsec=str(rmsec), rmsep=str(rmsep), idmodelo=str(idmodelo), dtcalibracao=str(dtcalibracao),
                        valorreferencia=str(valorReferencia), coeficientecal=str(coeficienteCal), coeficienteval=str(coeficienteVal))

    return json_data
Example 20
def PLSR_LOOCV(data):
    ''' Performs LOOCV on the data and returns R2Y value '''
    R2Y = 0
    predVal = []
    for i in range(len(data[:, 0])):
        train = np.zeros((len(data[:, 0]) - 1, 8))
        test = np.zeros((1, 8))
        for j in range(len(data[:, 0])):
            if j < i:
                train[j, :] = data[j, :]
            elif j > i:
                train[j - 1, :] = data[j, :]
            else:
                test[0, :] = data[j, :]

        testScaled = np.zeros((1, 8))
        trainScale = StandardScaler()
        trainScaled = trainScale.fit_transform(train)
        testScaled[0, :] = trainScale.transform(test)
        PLSR = PLSRegression(n_components=2)
        PLSR.fit(trainScaled[:, 2:6], trainScaled[:, 0])
        pred = PLSR.predict(testScaled[:, 2:6])
        predVal.append(np.squeeze(pred))
    scaledData = scaler(data)
    R2Y = 1 - np.sum(
        (predVal - scaledData[:, 0])**2) / np.sum(scaledData[:, 0]**2)
    return R2Y
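A call sketch for PLSR_LOOCV above on a synthetic (n, 8) array. The snippet relies on a module-level scaler helper that is not shown; the stand-in below (assumed to standardize columns) is only for illustration:

import numpy as np
from sklearn.preprocessing import StandardScaler

def scaler(data):
    # illustrative stand-in for the snippet's undefined scaler helper
    return StandardScaler().fit_transform(data)

rng = np.random.RandomState(0)
data = rng.rand(20, 8)                                       # column 0 = response, columns 2:6 = predictors
data[:, 0] = data[:, 2:6].sum(axis=1) + 0.1 * rng.randn(20)

print(PLSR_LOOCV(data))                                      # LOOCV R2Y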
Example 21
def PLSCrossValidation(n_components, trainSet, validationSet):
  pls = PLSRegression(n_components=n_components)
  pls.fit(trainSet[predictorList], trainSet['Apps'])
  predictPls = pls.predict(validationSet[predictorList])
  different = predictPls.flat - validationSet['Apps']
  error_rate = np.mean(different ** 2)
  return error_rate
Example 22
def hacerPLS(X,Y):
    pls_wild_b = PLSRegression(n_components = 9) 
    pls_wild_b.fit(X,Y)
    Z = pls_wild_b.transform(X)
    scores = list() 
    scores_std = list()
    n_features = np.shape(X)[1]
    
    X,X_test_tot, Y, Y_test_tot = cross_validation.train_test_split(X,Y,test_size = 0.5,random_state = 0)
    N = np.shape(X)[0]
    
    for num_comp in range(n_features):
        kf = KFold(N,n_folds = 10)
        aux_scores = list()
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
              
            if num_comp == 0:
                y_pred = np.mean(y_test)
                y_pred = y_pred* np.ones(np.shape(y_test))
                aux_scores.append(metrics.mean_squared_error(y_test,y_pred))
            
            else:
                pls_foo = PLSRegression(n_components = num_comp)                        
                pls_foo.fit(X_train,y_train)
                y_pred = pls_foo.predict(X_test)
            
                # obtain the score
                this_score = metrics.mean_squared_error(y_test,y_pred)
                aux_scores.append(this_score)
                
        scores.append(np.mean(aux_scores))
        scores_std.append(np.std(aux_scores))
    
    plt.plot(scores)
    xlabel('Componentes')
    ylabel("$MSE$")
    title("Animales PLS")
    plt.show()
    
    num_comp = np.argmin(scores)
    
    pls_pred = PLSRegression(n_components =2)
    pls_pred.fit(X,Y)
    y_pred_test = pls_pred.predict(X_test_tot)
    
    print "MSE test = " + str(metrics.mean_squared_error(Y_test_tot,y_pred_test))
Example 23
def test_regressor_predict(pls_regressor):
    X = np.random.rand(10, 10)
    y = np.random.rand(10)
    sklearn_regressor = PLSRegression().fit(X, y)
    pls_regressor.fit(X, y)
    y_pred = pls_regressor.predict(X)
    assert y_pred.shape == y.shape
    assert np.all(y_pred == sklearn_regressor.predict(X).ravel())
Example 24
def run_pls(X, Y, LV):
    model = PLSRegression(n_components=LV, scale=False)
    model.fit(X, Y)
    Yr = [ y[0] for y in model.predict(X).tolist() ]
    r2, sdec = calc_regr_metrics(Y_exp=Y, Y_pred=Yr)
    q2, sdep, variables.Y_pred = regr_loo(X=np.array(X), Y=np.array(Y), M=model)
    scores = { 'R2': r2, 'Q2': q2, 'SDEC': sdec,'SDEP': sdep }
    return scores, model
Example 25
def do_sigma_pls(data_x, data_y, train_split_percentage):
    latent_variables = []

    x_test, x_train, y_test, y_train = train_test_split(data_x, data_y, test_size=train_split_percentage, random_state=0)

    for i in range(20):
        pls = PLSRegression(n_components=(i + 1), scale=True)
        pls.fit(x_train, y_train)
        predicted_cv_y = pls.predict(x_test)
        mean_squared_error_cv = sqrt(mean_squared_error(y_test, predicted_cv_y))
        latent_variables.append(mean_squared_error_cv)

    best_factor = np.argmin(latent_variables)
    pls_sigma = PLSRegression(n_components=(best_factor + 1), scale=True)
    pls_sigma.fit(data_x, data_y)
    predicted_cv_y_sigma = pd.DataFrame(pls_sigma.predict(data_x))
    data_labels = pd.DataFrame(data_y.index)
    data_x = pd.DataFrame(data_x).reset_index(drop=True)
    data_y = pd.DataFrame(data_y).reset_index(drop=True)

    if cfg.sigma_percentage:
        percentual_error = pd.DataFrame(abs(data_y.iloc[:, 0] - predicted_cv_y_sigma.iloc[:, 0]))
        percentual_error = pd.DataFrame((percentual_error.iloc[:, 0] * 100) / data_y.iloc[:, 0])
        df_x = pd.DataFrame(pd.DataFrame(pd.concat([data_x, percentual_error], axis=1)))
        df_x = df_x.drop(df_x[df_x.iloc[:, -1] > cfg.sigma_confidence].index)
        df_x.drop(df_x.columns[len(df_x.columns) - 1], axis=1, inplace=True)
        df_y = pd.DataFrame(pd.DataFrame(pd.concat([data_y, data_labels, percentual_error], axis=1)))
        df_y = df_y.drop(df_y[df_y.iloc[:, -1] > cfg.sigma_confidence].index)

        df_x.set_index(df_y.iloc[:, 1], inplace=True)
        df_y.set_index(df_x.index, inplace=True)
        df_y.drop(df_y.columns[len(df_y.columns) - 1], axis=1, inplace=True)

        return df_x, df_y
    else:
        abs_error = pd.DataFrame(abs(data_y.iloc[:, 0] - predicted_cv_y_sigma.iloc[:, 0]))
        df_x = pd.DataFrame(pd.DataFrame(pd.concat([data_x, abs_error], axis=1)))
        df_x = df_x.drop(df_x[df_x.iloc[:, -1] > cfg.sigma_confidence].index)
        df_x.drop(df_x.columns[len(df_x.columns) - 1], axis=1, inplace=True)
        df_y = pd.DataFrame(pd.DataFrame(pd.concat([data_y, abs_error], axis=1)))
        df_y = df_y.drop(df_y[df_y.iloc[:, -1] > cfg.sigma_confidence].index)

        df_x.set_index(df_y.iloc[:, 1], inplace=True)
        df_y.set_index(df_x.index, inplace=True)
        df_y.drop(df_y.columns[len(df_y.columns) - 1], axis=1, inplace=True)
        return df_x, df_y
Example 26
def compute_q2_pls(tdata, tlabel, vdata, vlabel, Rval):
    test = PLSRegression(n_components=Rval)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        test.fit(matricize(tdata), matricize(tlabel))
    Y_pred = test.predict(matricize(vdata))
    Q2 = qsquared(matricize(vlabel), matricize(Y_pred))
    return Q2
Example 27
def Training(df,seed, yratio, xratio, index = 1):
	snp_matrix = np.array(df.values)
	xdim, ydim = snp_matrix.shape

	ydimlist = range(0,ydim)
	xdimlist = range(0,xdim)

	random.seed(seed)
	random.shuffle(ydimlist) # shuffle the individuals
	random.shuffle(xdimlist) # shuffle the SNPs	
	accuracy = 0

	snp_matrix_shuffle = np.copy(snp_matrix[:,ydimlist])
	snp_matrix_shuffle = np.copy(snp_matrix[xdimlist,:])
	snp_matrix_train = snp_matrix_shuffle[:,0:int(ydim*yratio)]
	snp_matrix_test = snp_matrix_shuffle[:,int(ydim*yratio):]

	snp_matrix_train_x = snp_matrix_train[0:int(xdim*xratio),:]
	snp_matrix_test_x = snp_matrix_test[0:int(xdim*xratio),:]

	for i in range(int(xdim*xratio), xdim):
		snp_matrix_train_y = snp_matrix_train[i,:]
		snp_matrix_test_y = snp_matrix_test[i,:]
		if index != 7:
			if index == 1:
				clf = AdaBoostClassifier(n_estimators= 100)
			elif index == 2:
				clf = RandomForestClassifier(n_estimators=100)
			elif index == 3:
				clf = linear_model.LogisticRegression(C=1e5)
			elif index == 4:
				clf = svm.SVC(kernel = 'rbf')
			elif index == 5:
				clf = svm.SVC(kernel = 'poly')
			else:
				clf = svm.SVC(kernel = 'linear')
			clf = clf.fit(snp_matrix_train_x.T, snp_matrix_train_y)
			Y_pred = clf.predict(snp_matrix_test_x.T)
			prediction = snp_matrix_test_y - Y_pred
			wrong = np.count_nonzero(prediction)
			tmp = 1 - (wrong + 0.0) / len(prediction)
			print tmp
			accuracy += tmp

	accuracy = accuracy / (xdim - int(xdim*xratio))

	if index == 7:
		pls2 = PLSRegression(n_components = 50, scale=False, max_iter=1000)
		snp_matrix_train_y = snp_matrix_train[int(xdim*xratio):,:]
		pls2.fit(snp_matrix_train_x.T,snp_matrix_train_y.T)
		snp_matrix_test_x = snp_matrix_test[0:int(xdim*xratio),:]
		snp_matrix_test_y = snp_matrix_test[int(xdim*xratio):,:]		
		Y_pred = transform(pls2.predict(snp_matrix_test_x.T))
		prediction = snp_matrix_test_y - Y_pred.T
		xdim, ydim = prediction.shape
		wrong = np.count_nonzero(prediction)
		accuracy = 1 - wrong / (xdim * ydim + 0.0)
	return accuracy
Example 29
def train_plsr(matrix,ty,n):
	clf = PLSRegression(n_components=5)
	clf.fit(matrix, ty)
	X_train, X_test, y_train, y_test = train_test_split(matrix, ty, test_size=n/100)
	#scores = cross_val_score(clf, matrix, ty, cv =10)
	scores = clf.score(X_train,y_train)
	print_plsr_importance(clf)
	predict_result = {'predict':[each[0] for each in clf.predict(X_test)],'real':y_test}
	return(scores,predict_result)  
Example 30
def PCA_Red2(Y, X, Y_pred, X_pred):
    pca = PCA(0.90)
    X_reduced = pca.fit_transform(scale(X))
    pls = PLSRegression(n_components=3, scale=False)
    pls.fit(scale(X_reduced), Y)
    X_pred = np.array(X_pred).reshape(1, -1)
    X_pred = pca.transform(scale(X_pred))
    prediction = pls.predict(X_pred)
    return prediction
Example 31
 def transform_helper(self, data, data_y):
     data_temp = data - data.mean(
         axis=0)  # make the mean of columns equal to zero
     data_y = data_y
     pls = PLSRegression(n_components=2)
     # Fit
     pls.fit(data_temp, data_y)
     res = pls.predict(data_temp)
     return res
Example 32
class Plsr:
    def __init__(self, features, output):
        self.regressor = None
        # x includes the features, as matrix, e.g. #bathroom, sq.feet, ...
        self.X = features
        # y is the value to predict
        self.y = output

        # splitting the dataset into the Training set and Test set
        '''from sklearn.model_selection import train_test_split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=0)'''

        # Feature scaling
        self.sc_X = StandardScaler()
        self.sc_y = StandardScaler()
        self.X = self.sc_X.fit_transform(self.X)
        self.y = self.sc_y.fit_transform(self.y)

    def fit(self):
        # Fitting Partial Least Squares Regression to the dataset
        self.regressor = PLSRegression(n_components=1)

        # lin_reg.fit(self.X_train, self.y_test)
        self.regressor.fit(self.X, self.y)

    def show_(self):
        # Visualizing the Partial Least Squares Regression results
        X_grid = np.arange(min(self.X), max(self.X), 0.1)
        X_grid = X_grid.reshape((len(X_grid), 1))
        plt.scatter(self.X, self.y, color='red')
        # We don't use X_poly, so this block can be generalized by changing the data shown
        plt.plot(X_grid, self.regressor.predict(X_grid), color='blue')
        plt.title('Truth or Bluff (PLSR Model)')
        plt.xlabel('Position level')
        plt.ylabel('Salary')
        plt.show()

    def predict(self, value=6.5):
        if type(value) is np.ndarray:
            y_pred = self.regressor.predict(self.sc_X.transform(value))
        else:
            y_pred = self.regressor.predict(
                self.sc_X.transform(np.array([[value]])))
        return self.sc_y.inverse_transform(y_pred)
Example 33
 def test_compare_to_sklearn(self):
     d = table(10, 5, 1)
     d.X = np.random.RandomState(0).rand(*d.X.shape)
     d.Y = np.random.RandomState(0).rand(*d.Y.shape)
     orange_model = PLSRegressionLearner()(d)
     scikit_model = PLSRegression().fit(d.X, d.Y)
     np.testing.assert_almost_equal(
         scikit_model.predict(d.X).ravel(), orange_model(d))
     np.testing.assert_almost_equal(scikit_model.coef_,
                                    orange_model.coefficients)
Example 34
targets = pd.get_dummies(train.target)
train.drop('target', axis=1, inplace=True)
train = train.apply(np.log1p)

test = pd.read_csv('test.csv', index_col='id')
test = test.apply(np.log1p)

Xt, Xv, yt, yv = train_test_split(train, targets, test_size=0.2, random_state=27)

best = 10.

for n in range(5,16):
    
    clf = PLSRegression(n_components=n)
    clf.fit(Xt,yt)
    y_pred = clf.predict(Xv)
    loss = multiclass_log_loss(np.argmax(y_pred,axis=1),y_pred)
    if loss < best:
        n_best = n
        best = loss
        postfix = '(*)'
    else:
        postfix = ''
    print ('comps: {:02d}\tLoss:{:5.4f} {}'.format(n,loss,postfix))


clf = PLSRegression(n_components=n_best)  
clf.fit(train,targets)
y_pred = clf.predict(test)

Example 35
regression_params = pandas.DataFrame(0, index=norm.columns, columns=concepts)
predicted_nii1 = pandas.DataFrame(0, index=norm.columns, columns=["nii"])
predicted_nii2 = pandas.DataFrame(0, index=norm.columns, columns=["nii"])

print "Training voxels and building predicted images..."
for voxel in norm.columns:
    train = [x for x in X.index if x not in [image1_holdout, image2_holdout] and x in norm.index]
    Y = norm.loc[train, voxel].tolist()
    Xtrain = X.loc[train, :]
    # Use pls instead of regularized regression
    clf = PLSRegression(n_components=number_components)
    clf.fit(Xtrain, Y)
    # Need to find where regression/intercept params are in this model
    regression_params.loc[voxel, :] = [x[0] for x in clf.coef_]
    predicted_nii1.loc[voxel, "nii"] = clf.predict(holdout1Y.reshape(1, -1))[0][0]
    predicted_nii2.loc[voxel, "nii"] = clf.predict(holdout2Y.reshape(1, -1))[0][0]


predicted_nii1 = predicted_nii1["nii"].tolist()
predicted_nii2 = predicted_nii2["nii"].tolist()

# Turn into nifti images
nii1 = numpy.zeros(standard_mask.shape)
nii2 = numpy.zeros(standard_mask.shape)
nii1[standard_mask.get_data() != 0] = predicted_nii1
nii2[standard_mask.get_data() != 0] = predicted_nii2
nii1 = nibabel.Nifti1Image(nii1, affine=standard_mask.get_affine())
nii2 = nibabel.Nifti1Image(nii2, affine=standard_mask.get_affine())

# Turn the holdout image data back into nifti
Example 36
n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))

# #############################################################################
# CCA (PLS mode B with symmetric deflation)
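The excerpt cuts off at the CCA heading; in the scikit-learn comparison example this section continues roughly as follows (a sketch reusing p, q and B from the block above, not the verbatim continuation):

from sklearn.cross_decomposition import CCA

X_cca = np.random.normal(size=(500, p))
Y_cca = np.dot(X_cca, B) + np.random.normal(size=(500, q))

cca = CCA(n_components=2)
cca.fit(X_cca, Y_cca)
X_c, Y_c = cca.transform(X_cca, Y_cca)      # symmetric projection of both blocks
print(X_c.shape, Y_c.shape)                 # (500, 2) each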
Example 37
        print "\n"
        SVRr2.append(optSVR.score(XTest, yTest))
        SVRmse.append( metrics.mean_squared_error(yTest,SVRpreds))
        SVRrmse.append(math.sqrt(SVRmse[metcount]))
        print ("Support Vector Regression prediction statistics for fold %d are; MSE = %5.2f RMSE = %5.2f R2 = %5.2f\n\n" % (metcount+1, SVRmse[metcount], SVRrmse[metcount],SVRr2[metcount]))
        with open(train_name,'a') as ftrain :
                ftrain.write("Support Vector Regression prediction statistics for fold %d are, MSE =, %5.2f, RMSE =, %5.2f, R2 =, %5.2f,\n\n" % (metcount+1, SVRmse[metcount], SVRrmse[metcount],SVRr2[metcount]))
        ftrain.close()

        # Train partial least squares and predict with optimised parameters
        print("\n\n------------------- Starting opitimised PLS training -------------------")
        optPLS = PLSRegression(n_components = nc)
        optPLS.fit(XTrain, yTrain)       # Train the model
        print("Training R2 = %5.2f" % optPLS.score(XTrain,yTrain))
        print("Starting optimised PLS prediction")
        PLSpreds = optPLS.predict(XTest)
        print("The predicted values now follow :")
        PLSpredsdim = PLSpreds.shape[0]
        i = 0
        if PLSpredsdim%5 == 0:
                while i < PLSpredsdim:
                        print round(PLSpreds[i],2),'\t', round(PLSpreds[i+1],2),'\t', round(PLSpreds[i+2],2),'\t', round(PLSpreds[i+3],2),'\t', round(PLSpreds[i+4],2)
                        i += 5
        elif PLSpredsdim%4 == 0:
                while i < PLSpredsdim:
                        print round(PLSpreds[i],2),'\t', round(PLSpreds[i+1],2),'\t', round(PLSpreds[i+2],2),'\t', round(PLSpreds[i+3],2)
                        i += 4
        elif PLSpredsdim%3 == 0 :
                while i < PLSpredsdim :
                        print round(PLSpreds[i],2),'\t', round(PLSpreds[i+1],2),'\t', round(PLSpreds[i+2],2)
                        i += 3
Example 38
def pls_train(groups, varname='valence', arrayname='norm', scale=True,
              ncomps=2, cv_folds=None, cv_repeats=None, skip_cv=False,
              xmin=-np.inf, xmax=np.inf, _larch=None, **kws):

    """use a list of data groups to train a Partial Least Squares model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      scale       bool to scale data [True]
      cv_folds    None or number of Cross-Validation folds (See Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (See Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [None]
      ncomps      number of independent components  (See Note 5) [2]

    Returns
    -------
      group with trained PLSRegression, to be used with pls_predict

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  all groups must have an attribute (scalar value) for `varname`
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation:  if cv_folds is None, sqrt(len(groups)) will be used
            (rounded to integer).  if cv_repeats is None, sqrt(len(groups))-1
            will be used (rounded).
     5.  The optimal number of components may be best found from PCA. If set to None,
         a search will be done for ncomps that gives the lowest RMSE_CV.
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws['scale'] = scale
    kws['n_components'] = ncomps

    model = PLSRegression(**kws)

    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if  cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        resid = []
        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])[:, 0]
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt( (resid**2).mean() )

    # final fit without cross-validation
    model = PLSRegression(**kws)
    out = model.fit(spectra, ydat)

    ypred = model.predict(spectra)[:, 0]

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 coefs=model.x_weights_, loadings=model.x_loadings_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats, rmse_cv=rmse_cv,
                 rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, scale=scale, groupnames=groupnames,
                 keywords=kws)
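As a concrete instance of the Note 4 defaults above: with a hypothetical 25 training groups, cv_folds defaults to round(sqrt(25)) = 5 and cv_repeats to round(sqrt(25)) - 1 = 4 (illustrative numbers only):

import numpy as np

nvals = 25                                   # hypothetical number of training groups
cv_folds = int(round(np.sqrt(nvals)))        # -> 5
cv_repeats = int(round(np.sqrt(nvals)) - 1)  # -> 4
print(cv_folds, cv_repeats)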
Example 39
for i in np.arange(1,17):
    plsregr = PLSRegression(n_components=i, scale=False)
    plsregr.fit(X_train_scaled,y_train)
    score = -1*cross_validation.cross_val_score(plsregr, X_train_scaled, y_train, cv=kf_10, scoring='mean_squared_error').mean()
    mse.append(score)

plt.plot(np.arange(1,17), np.array(mse), '-v')
plt.title("PLS: MSE vs. Principal Components")
plt.xlabel('Number of principal components in PLS regression')
plt.ylabel('MSE')
plt.xlim((-0.2, 17.2))

#Based off of the plot, 12 principal components minimized MSE
plsregr_test = PLSRegression(n_components=12, scale=False)
plsregr_test.fit(X_train_scaled, y_train)
MSE_PLS = np.mean((plsregr_test.predict(X_test_scaled) - y_test) ** 2)
# print "Mean Squared Error: ", MSE_PLS

#Compare the results from above. We use (R)^2 for all models
Test_avg= np.mean(y_test)

LS_R2 = 1 - MSE_LS/(np.mean((Test_avg-y_test)**2))
R_R2 = 1 - MSE_R/(np.mean((Test_avg-y_test)**2))
LA_R2 = 1 - MSE_LA/(np.mean((Test_avg-y_test)**2))
PCA_R2 = 1 - MSE_PCA/(np.mean((Test_avg-y_test)**2))
PLS_R2 = 1 - MSE_PLS/(np.mean((Test_avg-y_test)**2))

print "Least Squares Regression (R)^2: ", LS_R2
print "Ridge Regression (R)^2: ", R_R2
print "Lasso Regression (R)^2: ", LA_R2
print "Principal Component Analysis Regression (R)^2: ", PCA_R2
Example 40
print(clf.coef_)

yvalid_scaled = clf.predict(xvalid_scaled)

err1= MAPE(y, scalery.inverse_transform(clf.predict(x_scaled)).reshape(-1,1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1,1))

'''
General Linear Model -- Elastic Net
'''
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=20)
pls.fit(x_scaled, y_scaled)
print(pls.coef_)

yvalid_scaled = pls.predict(xvalid_scaled)

err1= MAPE(y, scalery.inverse_transform(pls.predict(x_scaled)).reshape(-1,1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1,1))

from sklearn.decomposition import PCA
reduced_data = PCA(n_components=2).fit_transform(xtrain_minmax)

pca = PCA(n_components=2)
pca.fit(xtrain_minmax)
print(pca.explained_variance_ratio_)


data_trainO.head(10)

Example 41
(Xtrain, ytrain) = loadData(xtrainpath, ytrainpath)
(Xtest, ytest) = loadData(xtestpath, ytestpath)

#trim off background and scale
ytrain=ytrain[:,1:]
#ytrain=scale(ytrain)
Xtrain=standardize(Xtrain)

#trim off background and scale
ytest = ytest[:,1:]
#ytest = scale(ytest)
Xtest = standardize(Xtest)

pls = PLSRegression(n_components=10)
pls.fit(Xtrain, ytrain)
y_pls = pls.predict(Xtest)
print 1 + pls.score(Xtest, ytest)


pls_rmse=[]
pls_rmse.append(sqrt(mean_squared_error(ytest[:,0], y_pls[:,0])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:,1], y_pls[:,1])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:,2], y_pls[:,2])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:,3], y_pls[:,3])))

fig = plt.figure(figsize=(20,10))

ax1 = fig.add_subplot(241)
ax1.plot(y_pls[:,0], c='r', label='PLS Fit')
ax1.plot(ytest[:,0], c='grey', label='Target')
ax1.set_xlabel('Time')
Example 42
    y_levelOne = []
    level0Classifier = []
    for tid,Xp,yp in zip(subjId_train,X_train,y_train):
        print "Predicting subject ", vid, "from subject ", tid
        y0 = np.zeros(yp.shape)
        y1 = np.ones(Xt.shape[0])
        X = np.vstack([Xp,Xt])
        yd = np.concatenate([y0,y1])

        pls = PLSRegression(n_components)
        Xp_t, Xp_v, yp_t, yp_v = tts(Xp.copy(),yp.copy(),train_size=0.9)
        yp_t = yp_t.astype(bool)
        yp_t_not = np.vstack((yp_t,~yp_t)).T
        #print "yp_t_not ", yp_t_not.shape
        pls.fit(Xp_t,yp_t_not.astype(int))
        yp_new = pls.predict(Xp_t, copy=True)
        yp_pred = (yp_new[:,0] > yp_new[:,1]).astype(int)
        yp_t = yp_t.astype(int)
        #print y_new,y_pred, y_t
        error = ((yp_t - yp_pred) ** 2).sum()
        print "PLS Training error " , float(error)/yp_t.shape[0]
        yp_new = pls.predict(Xp_v, copy=True)
        yp_pred = (yp_new[:,0] > yp_new[:,1]).astype(int)
        #print y_new, y_pred, y_v
        #print ((y_v - y_pred) ** 2).sum(), y_v.shape[0]
        error = ((yp_v - yp_pred) ** 2).sum()
        print "PLS Validation error " , float(error)/yp_v.shape[0]

        X_new = pls.transform(X)
        rf = RandomForestClassifier(n_estimators=500, max_depth=None, max_features=int(math.sqrt(n_components)), min_samples_split=100, random_state=144, n_jobs=4)
        #print "shapes ", X_new.shape, y.shape
Example 43
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
nms = [ x[0] for x in Descriptors._descList ]
def calculator( mols ):
    calc = MoleculeDescriptors.MolecularDescriptorCalculator( nms )
    res = [ calc.CalcDescriptors( mol ) for mol in mols ]
    return res

trainMols = [ mol for mol in Chem.SDMolSupplier("solubility.train.sdf") ]
testMols =  [ mol for mol in Chem.SDMolSupplier("solubility.test.sdf") ]

trainDescrs = calculator( trainMols )
testDescrs = calculator( testMols )

trainActs = np.array([ float( mol.GetProp('SOL') ) for mol in trainMols  ])
testActs = np.array([ float( mol.GetProp('SOL') ) for mol in testMols  ])

pls2 = PLSRegression( n_components = 15 )
pls2.fit( trainDescrs, trainActs )

sol_pred = pls2.predict( testDescrs )
print type(sol_pred)
print type(trainActs)
print metrics.r2_score( testActs, sol_pred )

"""
for i in range(len(sol_pred)):
    print testActs[i], sol_pred[i]

"""
Example 44
	#Xpls = pls.x_scores_
	#Ypls = pls.y_scores_
	#CorrCoef = np.corrcoef(Xpls,Ypls,rowvar=0)
	#print('')
	#print('Correlation between the two datasets in component 1: {:.3}'.format(CorrCoef[2,0]))
	#print('Correlation between the two datasets in component 2: {:.3}'.format(CorrCoef[1,3]))


	### Determine cross-validation scores using k-folds repeated n_iter times with a new random sorting
	cvPLS = cross_validation.StratifiedShuffleSplit(y, n_iter=10, test_size=0.2, random_state=None)   # Stratified k-folds of 1/test_size or 5 typically


	### Find CV scores using root means square error for PLS to help determine appropriate number of components
	print('')

	predPLS = np.array(pls.predict(Data), dtype='int')

	msepPLS = mean_squared_error(predPLS,y)
	print('PLS MSEP with {:} PLS components: {:.2e}'.format(nPLS, msepPLS))

	msePLSScores = cross_validation.cross_val_score(
	pls, Data, y, cv=cvPLS, scoring='mean_squared_error') # bug- returns negative values
	print('k-folds PLS MSEP: {:.2e}'.format(abs(np.mean(msePLSScores))))


	### Perform classification then transform PLS data to LDA basis
	nLDA = 2
	clfLDA = lda.LDA(n_components = nLDA)
	Xlda = clfLDA.fit_transform(TrnsfrmPls[0],ExampleClasses)
	
	# Predict and calculate misclassification rate