def simple_pls_cv(X, y, n_comp):
    # Run PLS with suggested number of components
    pls = PLSRegression(n_components=n_comp)
    pls.fit(X, y)
    y_c = pls.predict(X)
    # Cross-validation
    y_cv = cross_val_predict(pls, X, y, cv=10)
    # Calculate scores for calibration and cross-validation
    score_c = r2_score(y, y_c)
    score_cv = r2_score(y, y_cv)
    # Calculate mean square error for calibration and cross validation
    mse_c = mean_squared_error(y, y_c)
    mse_cv = mean_squared_error(y, y_cv)
    print('R2 calib: %5.3f' % score_c)
    print('R2 CV: %5.3f' % score_cv)
    print('MSE calib: %5.3f' % mse_c)
    print('MSE CV: %5.3f' % mse_cv)
    # Plot regression
    z = np.polyfit(y, y_cv, 1)
    with plt.style.context(('ggplot')):
        fig, ax = plt.subplots(figsize=(9, 5))
        ax.scatter(y_cv, y, c='red', edgecolors='k')
        ax.plot(z[1] + z[0] * y, y, c='blue', linewidth=1)
        ax.plot(y, y, color='green', linewidth=1)
        plt.title('$R^{2}$ (CV): ' + str(score_cv))
        plt.xlabel(r'Predicted $^{\circ}$Brix')
        plt.ylabel(r'Measured $^{\circ}$Brix')

        plt.show()
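A quick, hedged usage sketch for simple_pls_cv: the imports below are the ones the function body implicitly relies on, and the synthetic Brix-like data is purely illustrative:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error

# illustrative data: 100 samples, 50 spectral variables
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 50))
y_demo = X_demo[:, 0] - 2 * X_demo[:, 1] + rng.normal(scale=0.1, size=100)

simple_pls_cv(X_demo, y_demo, n_comp=3)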
Example n. 2
def PLS(X, y, X_ind, y_ind):
    """ Cross validation and Independent test for PLS regression model.
        Arguments:
            X (np.ndarray): m x n feature matrix for cross validation, where m is the number of samples
                and n is the number of features.
            y (np.ndarray): m-d label array for cross validation, where m is the number of samples and
                equals to row of X.
            X_ind (np.ndarray): m x n Feature matrix for independent set, where m is the number of samples
                and n is the number of features.
            y_ind (np.ndarray): m-d label array for independent set, where m is the number of samples and
                equals to row of X_ind, and l is the number of types.
            reg (bool): it True, the training is for regression, otherwise for classification.
         Returns:
            cvs (np.ndarray): m x l result matrix for cross validation, where m is the number of samples and
                equals to row of X, and l is the number of types and equals to row of X.
            inds (np.ndarray): m x l result matrix for independent test, where m is the number of samples and
                equals to row of X, and l is the number of types and equals to row of X.
    """
    folds = KFold(5).split(X)
    cvs = np.zeros(y.shape)
    inds = np.zeros(y_ind.shape)
    for trained, valided in folds:
        model = PLSRegression()
        model.fit(X[trained], y[trained])
        cvs[valided] = model.predict(X[valided])[:, 0]
        inds += model.predict(X_ind)[:, 0]
    return cvs, inds / 5
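A minimal sketch of calling the PLS helper above, with made-up arrays standing in for the calibration and independent sets (only the shapes matter here):

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold

rng = np.random.default_rng(1)
X = rng.normal(size=(60, 8))
y = X @ rng.normal(size=8)
X_ind = rng.normal(size=(20, 8))
y_ind = rng.normal(size=20)  # only its shape is used, for the output array

cvs, inds = PLS(X, y, X_ind, y_ind)
print(cvs.shape, inds.shape)  # (60,) (20,)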
Example n. 3
  def predicao(self, idmodelo, idamostra):

    print(idmodelo)
    print(idamostra)

    X = self.selectMatrizX(idmodelo, "VALIDACAO")
    Y = self.selectMatrizY(idmodelo, "VALOR", "VALIDACAO")

    amostraPredicao = self.selectAmostra(idamostra, idmodelo)

    valorReferencia = self.selectDadosReferenciaAmostra(idamostra, idmodelo)

    pls = PLSRegression(copy=True, max_iter=500, n_components=20, scale=False, tol=1e-06)

    pls.fit(X, Y)
    print(amostraPredicao)
    valorPredito = pls.predict(amostraPredicao)

    print('Amostra: ' + str(idamostra) + ' - Valor Predito :' + str(valorPredito) + ' - Valor Referencia :' + str(
      valorReferencia))

    cursorDadosCalibracao = db.execute("select rmsec, rmsep, coeficientecal, coeficienteval, dtcalibracao "
                                       "from calibracao where inativo = 'A' and idmodelo = " + str(idmodelo) + " ")
    for regCodigo in cursorDadosCalibracao:
      rmsec = regCodigo[0]
      rmsep = regCodigo[1]
      coeficienteCal = regCodigo[2]
      coeficienteVal = regCodigo[3]
      dtcalibracao = regCodigo[4]

    print(rmsec)
    print(rmsep)
    print(coeficienteCal)
    print(coeficienteVal)
    print(dtcalibracao)

    dtcalibracao = dtcalibracao.strftime('%d/%m/%Y')
    print(dtcalibracao)

    # prepare the data for the JSON payload
    coeficienteCal = round(coeficienteCal, 2)
    coeficienteVal = round(coeficienteVal, 2)
    rmsec = round(rmsec, 2)
    rmsep = round(rmsep, 2)
    valorReferencia = round(valorReferencia, 2)

    valorPreditoString = str(valorPredito)
    valorPreditoString = valorPreditoString.replace("[", "")
    valorPreditoString = valorPreditoString.replace("]", "")

    # Build the JSON
    json_data = jsonify(idamostra=str(idamostra), valorpredito=str(valorPreditoString),
                        rmsec=str(rmsec), rmsep=str(rmsep), idmodelo=str(idmodelo), dtcalibracao=str(dtcalibracao),
                        valorreferencia=str(valorReferencia), coeficientecal=str(coeficienteCal), coeficienteval=str(coeficienteVal))

    return json_data
Example n. 4
class PLSDADummy(BaseEstimator):
    """
    Wrapper of PLSRegression for classification.

    PLSRegression predicts one hot encoded vectors,
    then plsda outputs class with maximal score.
    """
    def __init__(self, n_components=2):
        self.pls = PLSRegression(n_components)
        self.classes = None

    def __one_hot_encode(self, Y):
        # encode labels to numbers
        Y = np.array([np.where(self.classes == y)[0][0] for y in Y])

        # `n_values` was removed from OneHotEncoder in newer scikit-learn;
        # `categories` is the modern equivalent
        enc = OneHotEncoder(categories=[np.arange(len(self.classes))])
        return enc.fit_transform(Y.reshape(-1, 1)).toarray()

    def fit(self, X, Y):
        """
        :param X: m x n feature matrix
        :param Y: list of labels, one per row of X
        :return: self
        """
        self.classes = np.array(sorted(np.unique(Y)))

        Y = self.__one_hot_encode(Y)
        self.pls.fit(X, Y)

        return self

    def predict(self, X):
        y_pred = np.argmax(self.pls.predict(X), axis=1)
        return np.array([self.classes[cls] for cls in y_pred])
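Given the one-hot-plus-argmax scheme the docstring describes, a small usage sketch (synthetic features, three string labels; illustrative only):

import numpy as np

rng = np.random.default_rng(2)
X = rng.normal(size=(30, 5))
Y = np.array(['a', 'b', 'c'] * 10)

clf = PLSDADummy(n_components=2).fit(X, Y)
print(clf.predict(X[:5]))  # array of labels drawn from 'a', 'b', 'c'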
Example n. 5
class MyPLS():
    def __init__(self,
                 n_components=2,
                 scale=True,
                 max_iter=500,
                 tol=1e-06,
                 copy=True):
        # keyword arguments: recent scikit-learn makes everything after
        # n_components keyword-only
        self.pls = PLSRegression(n_components=n_components, scale=scale,
                                 max_iter=max_iter, tol=tol, copy=copy)

    def fit(self, X, Y):
        self.pls.fit(X, Y)
        return self.pls

    def predict(self, X, copy=True):
        return self.pls.predict(X, copy).flatten()

    def score(self, X, Y, sample_weight=None):
        return self.pls.score(X, Y, sample_weight)

    def get_params(self, deep=True):
        return self.pls.get_params(deep)

    def set_params(self, **parameters):
        self.pls.set_params(**parameters)
        return self

    @property
    def intercept_(self):
        return 0

    @property
    def coeff_(self):
        return self.pls.coef_
Example n. 6
 def get_cal(self, X, y, n_comp=None):
     if n_comp is None:
         n_comp = self.opt_ncomp
     X = self.transform(X)
     pls = PLSRegression(n_components=n_comp)
     pls.fit(X, y)
     return pls
Example n. 7
class MSLMultiModel(WebModel):
    ''' a Multitask version of MSLModel '''
    def __init__(self, output_dir, ccs_dir, lanl_file, n_components=10, **kwargs):
        self.output_dir = output_dir
        self.ccs_dir = ccs_dir
        self.lanl_file = lanl_file

        self.n_components = n_components
        self.model = PLSRegression(n_components=n_components, scale=False)
        self.multitask = True
        self.name = 'msl_multi_model'

    def fit(self, data, composition, elements):
        self.elements = elements  # order matters
        data = libs_norm3(data[:, ALAMOS_MASK])
        self.model.fit(data, composition)

    def predict(self, data, mask=ALAMOS_MASK, clip=True):
        data = libs_norm3(data[:, mask])
        predictions = self.model.predict(data, copy=False)
        # branch on the clip flag; the truth value of the predictions array
        # itself is ambiguous and would raise
        if clip:
            predictions = np.clip(predictions, 0, 100)
        else:
            predictions[predictions < 0] = 0
        return predictions
Example n. 8
def pls_balances_cmd(table_file, metadata_file, category, output_file):
    metadata = pd.read_table(metadata_file, index_col=0)
    table = load_table(table_file)
    table = pd.DataFrame(np.array(table.matrix_data.todense()).T,
                         index=table.ids(axis='sample'),
                         columns=table.ids(axis='observation'))

    ctable = pd.DataFrame(clr(centralize(table + 1)),
                          index=table.index,
                          columns=table.columns)

    rfc = PLSRegression(n_components=1)
    if metadata[category].dtype != float:
        cats = np.unique(metadata[category])
        groups = (metadata[category] == cats[0]).astype(int)
    else:
        groups = metadata[category]

    rfc.fit(X=ctable.values, Y=groups)

    pls_df = pd.DataFrame(rfc.x_weights_,
                          index=ctable.columns,
                          columns=['PLS1'])
    l, r = round_balance(pls_df.values,
                         means_init=[[pls_df.PLS1.min()], [0],
                                     [pls_df.PLS1.max()]],
                         n_init=100)
    num = pls_df.loc[pls_df.PLS1 > r]
    denom = pls_df.loc[pls_df.PLS1 < l]
    diff_features = list(num.index.values)
    diff_features += list(denom.index.values)

    with open(output_file, 'w') as f:
        f.write(','.join(diff_features))
Example n. 9
def do_pls(df, n_components=-1):
    Y_cols = ["slump", "flow", "compressive_strength"]
    X_cols = [
        "cement", "slag", "fly_ash", "water", "superplasticizer",
        "coarse_aggregate", "fine_aggregate"
    ]
    Y = df[Y_cols]
    X = df[X_cols]

    if n_components == -1:
        r2s = []
        mses = []
        rpds = []
        xticks = np.arange(1, X.shape[1] + 1)
        for n_comp in xticks:
            y_cv, r2, mse, rpd = optimise_pls_cv(X, Y, n_comp)
            r2s.append(r2)
            mses.append(mse)
            rpds.append(rpd)
        plot_metrics(mses, 'MSE', 'min', xticks)
        plot_metrics(r2s, 'R2', 'max', xticks)
        #plot_metrics(rpds, 'RPD', 'max', xticks)
        n_components = np.argmin(mses) + 1

    pls = PLSRegression(n_components=n_components, scale=True)
    pls.fit(X, Y)
    loadings = pd.DataFrame(pls.x_loadings_)
    scores = pd.DataFrame(pls.x_scores_)

    X_rows_dict = {i: X_cols[i] for i in range(0, len(X_cols))}
    X_cols_dict = {i: 'LV' + str(i + 1) for i in range(0, n_components)}
    loadings.rename(index=X_rows_dict, columns=X_cols_dict, inplace=True)
    print(loadings)
Example n. 10
def varselpls(x, y, ti, nc, step=1):
    '''
    X: m x n data, m: number of samples
    Y: m x 1 reference values
    TI: test indices
    NC: number of components (latent variables)
    Returns the following outputs:
    IND_OPT: optimal indices (variables) of X
    RMSE_OPT: the RMSE reached by the optimal variable selection
    '''
    ci = np.arange(x.shape[0])  # calibration index
    ci = np.delete(ci, ti)
    plsModel = PLSRegression(n_components=nc)
    plsModel.fit(x[ci, :], y[ci])
    reg_coe = np.abs(plsModel.coef_).ravel()  # single response: robust to either coef_ orientation
    a = np.sort(reg_coe)[:-nc:step]
    la = len(a)
    rmse_opt = np.zeros(la)
    for c, k in zip(a, range(la)):
        var_sel = reg_coe >= c
        x_cal = x[ci, :][:, var_sel]
        rmse_opt[k] = cross_val_score(plsModel,
                                      x_cal,
                                      y[ci],
                                      cv=4,
                                      scoring='neg_mean_squared_error').mean()


    k_opt = np.argmax(rmse_opt)  # scores are negative MSE, so argmax picks the smallest MSE
    ind_opt = np.arange(x.shape[1])[reg_coe >= a[k_opt]]
    rmse_opt = np.sqrt(-rmse_opt[k_opt])
    return ind_opt, rmse_opt
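An illustrative call, assuming x holds spectra-like data and ti indexes the held-out test rows; note the function assumes a single response column:

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(3)
x = rng.normal(size=(40, 30))
y = x[:, :3].sum(axis=1) + rng.normal(scale=0.1, size=40)
ti = np.arange(0, 40, 5)  # every fifth sample held out

ind_opt, rmse_opt = varselpls(x, y, ti, nc=4)
print(len(ind_opt), rmse_opt)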
Example n. 11
 def fit_pls(self, X_test):
     reg = PLSRegression(n_components=20, scale=False, max_iter=1000)
     reg.fit(self.X.copy().values, self.y.copy().values.flatten())
     preds = reg.predict(X_test.copy().values)
     ids = X_test.index
     pred_df = pd.DataFrame(data=preds, index=ids, columns=['SalePrice'])
     pred_df.to_csv('results/results_pls.csv', sep=',')
Example n. 12
    def fitcv(self):
        pls = PLSRegression(n_components=self.n_components, scale=False)
        kf = KFold(n_splits=self.n_splits)
        yTrue = None
        yHat = None
        # determine whether Y is 1-D or 2-D
        dimensiony = len(self.Y.shape)

        for train_index, test_index in kf.split(self.X):
            X_train, X_test = self.X[train_index], self.X[test_index]
            y_train, y_test = self.Y[train_index], self.Y[test_index]
            pls.fit(X_train, y_train)
            if dimensiony == 1:
                ypred = pls.predict(X_test)[:, 0]
            else:
                ypred = pls.predict(X_test)
            ypred[ypred > 0] = 1
            ypred[ypred < 0] = -1
            if yTrue is None:
                yTrue = y_test  # ground truth
                yHat = ypred    # predictions
            else:
                yTrue = np.r_[yTrue, y_test]
                yHat = np.r_[yHat, ypred]
        err = yTrue - yHat
        errSampleNo = np.where(err != 0)
        err = err[err != 0]

        return len(err) / len(self.X) * 100, errSampleNo  # return the misclassification rate (%)
Example n. 13
def piecewise_ds(A, B, win_size=5, pls=None):
    assert A.shape == B.shape, "Input matrices must be the same shape."
    assert win_size % 2 == 1, "Window size must be odd."

    padding = (win_size - 1) // 2  # integer division so the slicing below works
    n_feats = A.shape[1]

    coefs = []
    for i in range(n_feats):
        row = np.zeros(n_feats)
        start = max(i - padding, 0)
        end = min(i + padding, n_feats - 1) + 1
        if isinstance(pls, int):
            model = PLSRegression(n_components=pls, scale=False)
            model.fit(B[:, start:end], A[:, i])
            row[start:end] = model.coef_.ravel()
        elif pls is None:
            row[start:end] = np.dot(np.linalg.pinv(B[:, start:end]), A[:, i])
        else:
            print("ERROR: bad number of PLS components.")
            return
        coefs.append(row)

    proj_to_A = np.array(coefs).T
    proj_B = np.dot(B, proj_to_A)

    return proj_to_A, proj_B
Example n. 14
def knn_denoise(X, X_reference, *, k, ncomp):
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.neighbors import NearestNeighbors
    # Xb = X * window

    print('PCA...')
    npca = np.minimum(300, X.shape[0])
    u, s, vh = np.linalg.svd(X)
    features = u[:, 0:npca] * s[0:npca]
    components = vh[0:npca, :]
    # s = s[0:npca]

    print('Nearest neighbors...')
    nbrs = NearestNeighbors(n_neighbors=k + 1,
                            algorithm='ball_tree').fit(features)
    distances, indices = nbrs.kneighbors(features)

    features2 = np.zeros(features.shape, dtype=features.dtype)
    for j in range(X.shape[0]):
        print(f'{j+1} of {X.shape[0]}')
        inds0 = np.squeeze(indices[j, :])
        inds0 = inds0[1:]
        # Xbneighbors = Xb[inds0, :]
        f_neighbors = features[inds0, :]
        pls = PLSRegression(n_components=ncomp)
        # pls.fit(Xbneighbors.T, Xb[j, :].T)
        pls.fit(f_neighbors.T, features[j, :].T)
        features2[j, :] = pls.predict(f_neighbors.T).T
        # X2[j, :] = pls.predict(Xbneighbors.T).T
    print(features2.shape)
    print(components.shape)
    X2 = features2 @ components
    return X2
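A sketch of invoking knn_denoise on synthetic data. Two properties of the snippet worth noting: the SVD feature construction (u[:, 0:npca] * s[0:npca]) only lines up when there are at least as many columns as rows, and the X_reference argument appears unused:

import numpy as np

rng = np.random.default_rng(7)
X = rng.normal(size=(40, 60))  # n_features >= n_samples, see note above
X2 = knn_denoise(X, X, k=5, ncomp=3)
print(X2.shape)  # (40, 60)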
Example n. 15
def plot_embedding(df, labels, method='tSNE', cmap='tab20', figsize=(6, 6), markersize=50, show_legend=True,
                   return_emb=False, save=False, save_emb=False):
    """
    df: DataFrame nsamples x nfeatures
    labels: labels for each sample
    """
    df = df.fillna(0)
    if method == 'tSNE':
        from sklearn.manifold import TSNE
        X = TSNE(n_components=2, random_state=124).fit_transform(df)
    if method == 'UMAP':
        from umap import UMAP
        X = UMAP(n_neighbors=30, min_dist=0.1).fit_transform(df)
    if method == 'PCA':
        from sklearn.decomposition import PCA
        X = PCA(n_components=2, random_state=124).fit_transform(df)
    if method == 'PLS':
        from sklearn.cross_decomposition import PLSRegression
        from sklearn.preprocessing import LabelEncoder
        encode = LabelEncoder()
        ref = encode.fit_transform(labels)
        pls2 = PLSRegression(n_components=2)
        pls2.fit(df, ref)
        X = pls2.x_scores_
            
    plt.figure(figsize=figsize)

    if cmap is None:
        # fall back on a palette sized to the number of classes
        classes = np.unique(labels)
        if len(classes) <= 10:
            cmap = 'tab10'
        elif len(classes) <= 20:
            cmap = 'tab20'
        else:
            cmap = 'husl'
    palette = sns.color_palette(cmap, n_colors=len(np.unique(labels)))
        
    ax = sns.scatterplot(x=X[:,0],
                         y=X[:,1],
                         hue=labels,
                         palette=palette, 
                         marker="o",
                         legend='full',
                         s=markersize)

    ax.tick_params(axis='x', bottom=True, top=False, labeltop=False, labelbottom=True, labelsize=12, length=3, pad=3)
    ax.tick_params(axis='y', left=True, right=False, labelright=False, labelleft=True, labelsize=12, length=3, pad=3)

    ax.set_xlabel('{}_1'.format(method), fontsize=15, labelpad=10, va='center')
    ax.set_ylabel('{}_2'.format(method), rotation=90, fontsize=16, labelpad=10, va='center')

    if save:
        plt.savefig(save, format='pdf', bbox_inches='tight')
    else:
        plt.show()
        
    if save_emb:
        np.savetxt(save_emb, X)
    if return_emb:
        return X
Example n. 16
def PLSCrossValidation(n_components, trainSet, validationSet):
  pls = PLSRegression(n_components=n_components)
  pls.fit(trainSet[predictorList], trainSet['Apps'])
  predictPls = pls.predict(validationSet[predictorList])
  different = predictPls.ravel() - validationSet['Apps']
  error_rate = np.mean(different ** 2)
  return error_rate
Example n. 17
def train_PLSR(x_filename, y_filename, model_filename, n):
    """
    Train a PLSR model and save it to the model_filename.
    X and Y matrices are read from x_filename and y_filename.
    The no. of PLSR components is given by n. 
    """
    X = loadMatrix(x_filename)[0].todense()
    Y = loadMatrix(y_filename)[0].todense()
    if X.shape[0] != Y.shape[0]:
        sys.stderr.write("X and Y must have equal number of rows!\n")
        raise ValueError
    sys.stderr.write("Learning PLSR...")
    startTime = time.time()
    pls2 = PLSRegression(copy=True,
                         max_iter=10000,
                         n_components=n,
                         scale=True,
                         tol=1e-06)
    pls2.fit(X, Y)
    with open(model_filename, 'wb') as model:  # binary mode for pickle
        pickle.dump(pls2, model, 1)
    endTime = time.time()
    sys.stderr.write(" took %ss\n" % str(round(endTime - startTime, 2)))
Example n. 18
    def fit(self, X, y):
        self.X = X

        pls = PLSRegression(n_components=self.n_comps)
        # Fit data
        pls.fit(self.X, y)

        # Get X scores
        self.T = pls.x_scores_
        # Get X loadings
        self.P = pls.x_loadings_
        # Calculate error array
        self.Err = self.X - np.dot(self.T, self.P.T)

        # Calculate Q-residuals (sum over the rows of the error array)
        self.Q = np.sum(self.Err ** 2, axis=1)
        # Calculate Hotelling's T-squared (note that data are normalised by default)
        self.Tsq = np.sum((pls.x_scores_ / np.std(pls.x_scores_, axis=0)) ** 2, axis=1)

        # set the confidence level
        # conf = self.conf
        # Calculate confidence level for T-squared from the ppf of the F distribution
        self.Tsq_conf = (
            f.ppf(q=self.conf, dfn=self.n_comps, dfd=self.X.shape[0])
            * self.n_comps
            * (self.X.shape[0] - 1)
            / (self.X.shape[0] - self.n_comps)
        )
        # Estimate the confidence level for the Q-residuals
        i = np.max(self.Q) + 1
        while 1 - np.sum(self.Q > i) / np.sum(self.Q > 0) > self.conf:
            i -= 1
        self.Q_conf = i

        self._fitted = True
Example n. 19
class PLS():
    """
    Implement PLS to make it compliant with the other dimensionality
    reduction methodology.
    (Simple class rewritting).
    """
    def __init__(self, n_components=10):
        self.clf = PLSRegression(n_components)

    def get_components_(self):
        return self.clf.x_weights_.transpose()

    def set_components_(self, x):
        pass

    components_ = property(get_components_, set_components_)

    def fit(self, X, y):
        self.clf.fit(X,y)
        return self

    def transform(self, X):
        return self.clf.transform(X)

    def predict(self, X):
        return self.clf.predict(X)
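Since the docstring pitches this as a drop-in for the other dimensionality-reduction classes, a sketch of the intended fit/transform flow (synthetic data, illustrative only):

import numpy as np

rng = np.random.default_rng(4)
X = rng.normal(size=(50, 20))
y = rng.normal(size=50)

reducer = PLS(n_components=5)
X_reduced = reducer.fit(X, y).transform(X)
print(X_reduced.shape)            # (50, 5)
print(reducer.components_.shape)  # (5, 20): x_weights_, transposed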
Example n. 20
    def cv_predict(self, n_folds, max_components):

        x_tr, x_te, y_tr, y_te = baseCV.CV(self, self.x, self.y, n_folds)

        y_predict_all = np.ones((1, max_components))
        for i in range(n_folds):

            y_predict = np.zeros((x_te[i].shape[0], max_components))
            xtrainmean = np.mean(x_tr[i], axis=0)
            ytrainmean = np.mean(y_tr[i], axis=0)

            xte_center = np.subtract(x_te[i], xtrainmean)
            yte_center = np.subtract(y_te[i], ytrainmean)

            for j in range(1, max_components, 1):

                pls2 = PLSRegression(j)
                pls2.fit(x_tr[i], y_tr[i])

                y_pre_center = np.dot(xte_center, pls2.coef_)
                Y_pre = y_pre_center + ytrainmean
                y_predict[:, j] = Y_pre.ravel()

            y_predict_all = np.vstack((y_predict_all, y_predict))

        y_predict_all = y_predict_all[1:]

        return y_predict_all, self.y
Example n. 21
def fitPLS(input_data):
    ### Step 1 - unpack input JSON to normal data array
    input_df = pd.DataFrame(input_data)
    
    colnames = list(input_df.columns.values)  
    colnames[len(colnames) - 1] = 'Intercept'
    
    train_arr = np.zeros(shape = input_df.shape)

    for i in range(train_arr.shape[0]):
        for j in range(train_arr.shape[1]):
            train_arr[i,j] = float(input_df.values[i,j]['value'])
            

    ### Step 2 - fit the model. X - independent variables; Y - output variable
    X, Y = train_arr[:, : - 1], train_arr[:, -1] 
    
    n_best = 2
    
    best_fit = PLSRegression(n_components = n_best, scale = False)
    best_fit.fit(X, Y)
    
    ### Step 3 - retrieve regression coefficients and pack them to JSON
    model_coef = best_fit.coef_
    y_intercept = best_fit.y_mean_ - np.dot(best_fit.x_mean_ , best_fit.coef_)
    
    model_vector = np.append(model_coef, y_intercept)
        
    model_list = model_vector.tolist()
    
    model_dict = dict(zip(colnames, model_list))
    model_js = json.dumps(model_dict)
        
    
    return model_js
Example n. 22
def getPLSRegression(differenceMatrix):
    similarityRatings = np.zeros([80])
    feature_length = 103

    featureSet = scale(differenceMatrix)  # (80, 103) - 103 diff. values for 80 ratings

    i = 0
    with open('average-similarity-ratings.csv') as csvfile:
        reader = csv.reader(csvfile, delimiter=",")
        for row in reader:
            similarityRatings[i] = float(row[0])
            i += 1

    differenceArray = np.zeros([80, feature_length])
    for i in range(80):
        differenceArray[i, :] = differenceMatrix[i]

    y = np.nan_to_num(similarityRatings)  # (80,)

    PLSreg = PLSRegression(n_components=16)
    PLSreg.fit(featureSet, y)
    print(featureSet.shape, 'featureset shape')
    return PLSreg, similarityRatings
Example n. 23
def PLSR_LOOCV(data):
    ''' Performs LOOCV on the data and returns R2Y value '''
    R2Y = 0
    predVal = []
    for i in range(len(data[:, 0])):
        train = np.zeros((len(data[:, 0]) - 1, 8))
        test = np.zeros((1, 8))
        for j in range(len(data[:, 0])):
            if j < i:
                train[j, :] = data[j, :]
            elif j > i:
                train[j - 1, :] = data[j, :]
            else:
                test[0, :] = data[j, :]

        testScaled = np.zeros((1, 8))
        trainScale = StandardScaler()
        trainScaled = trainScale.fit_transform(train)
        testScaled[0, :] = trainScale.transform(test)
        PLSR = PLSRegression(n_components=2)
        PLSR.fit(trainScaled[:, 2:6], trainScaled[:, 0])
        pred = PLSR.predict(testScaled[:, 2:6])
        predVal.append(np.squeeze(pred))
    scaledData = scaler(data)
    R2Y = 1 - np.sum(
        (predVal - scaledData[:, 0])**2) / np.sum(scaledData[:, 0]**2)
    return R2Y
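A usage sketch for PLSR_LOOCV. data is assumed to be an (n, 8) array whose column 0 is the response and columns 2:6 are the predictors; the scaler helper used at the end is not shown in the snippet, so a plausible stand-in is defined here and flagged as an assumption:

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression

def scaler(data):
    # assumed helper: column-wise standardization of the full array
    return StandardScaler().fit_transform(data)

rng = np.random.default_rng(5)
data = rng.normal(size=(25, 8))
print(PLSR_LOOCV(data))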
Example n. 24
def feature_clustering(x, y, fc):
    if fc == 'True':
        plsca = PLSRegression(n_components=200)
        plsca.fit(x, y)
        x3 = plsca.transform(x)
        string = "pls_"
        pls_column_name = [string + str(i) for i in range(x3.shape[1])]
Example n. 25
def fit_plt(dados, ncomp):
    from sklearn.cross_decomposition import PLSRegression
    colmap = [(0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.41, 0.41, 0.41), (0, 1, 1),
              (0.58, 0, 0.82), (0, 0.50, 0), (0.98, 0.50, 0.44), (1, 1, 0.87),
              (0.39, 0.58, 0.92), (0.50, 0.50, 0), (1, 0.89, 0.76), (0.96, 0.96, 0.86),
              (0, 1, 1)]
    g = dados['g']
    r = dados['r']
    wn = dados['wn']
    pls = PLSRegression(n_components=ncomp)
    pls.fit(r, g)
    Y_pred = pls.predict(r)
    plt.figure()
    plt.subplot(2, 1, 1)
    for i in range(1, g.max() + 1):
        sel = g == i
        plt.scatter(g[sel], Y_pred[sel], color=colmap[i])
    plt.xlabel('Y_class', fontsize=12)
    plt.ylabel('Y_predicted', fontsize=12)
    plt.xticks(np.arange(1, g.max() + 1), str(dados['arqs']).split('::'))

    plt.subplot(2, 1, 2)
    for i in range(1, g.max() + 1):
        sel = g == i
        plt.hist(Y_pred[sel])
    plt.xlabel('Y_class', fontsize=12)
    plt.ylabel('histogram', fontsize=12)
    plt.xticks(np.arange(1, g.max() + 1), str(dados['arqs']).split('::'))
Example n. 26
 def pls_cv(self,ncomp_range=range(1,21),plot=False,verbose=False,
            osc_params=(10,1)):
     # Separating X from Y for PLS
     X=self.df[self.freqs].to_numpy()
     Y=self.df[self.y_name].to_numpy().reshape(-1, 1)
     sample_std=np.std(self.df[self.y_name])
     
     # CV based on measurement day
     if self.cval=="MD":
         cv = LeaveOneGroupOut()
         folds=list(cv.split(X=X,y=Y,groups=self.df[self.date_name]))
     # kfold CV
     elif self.cval=="kfold":
         cv = KFold(n_splits=self.cval_param)
         folds=list(cv.split(X))
     else:
         raise InputError("Invalid CV type!")
     
     # Array for storing CV errors
     cv_RMSE_all=np.zeros([len(folds),len(ncomp_range)])
     i=0
     for train, val in folds:
         # If OSC model specified
         if len(osc_params)==2:
             osc=OSC(nicomp=osc_params[0],ncomp=osc_params[1])
             osc.fit(X[train], Y[train])
             X_train_osc=osc.X_osc
             X_val_osc=osc.transform(X[val])
         j=0
         for ncomp in ncomp_range:
             pls = PLSRegression(n_components=ncomp,scale=False)
             if len(osc_params)==2:
                 pls.fit(X_train_osc, Y[train])
                 cv_RMSE_all[i,j]=metrics.mean_squared_error(
                     Y[val], pls.predict(X_val_osc))**0.5
             else:
                 pls.fit(X[train], Y[train])
                 cv_RMSE_all[i,j]=metrics.mean_squared_error(
                         Y[val], pls.predict(X[val]))**0.5
             j=j+1
         i=i+1
     # Printing and plotting CV results
     cv_RMSE_ncomp=np.mean(cv_RMSE_all,axis=0)
     cv_RPD_ncomp=sample_std/cv_RMSE_ncomp
     if plot:
         fig = plt.figure(figsize=(12,8))
         plt.gca().xaxis.grid(True)
         plt.xticks(ncomp_range)
         plt.ylabel("RPD")
         plt.xlabel("Number of components")
         plt.plot(ncomp_range,cv_RPD_ncomp)
     # Best model
     rpd_best=max(cv_RPD_ncomp)
     ncomp_best=ncomp_range[cv_RMSE_ncomp.argmin()]
     if verbose:
         print("Best RMSE: ",min(cv_RMSE_ncomp))
         print("Best RPD: ",max(cv_RPD_ncomp))
         print("Number of latent components: ",ncomp_range[cv_RMSE_ncomp.argmin()])
     return (ncomp_best,rpd_best)
Example n. 27
def plot_pls_results(x_data, y_data, pls_components, num_variables):
    pls = PLSRegression(pls_components)

    cv_splitter = GroupShuffleSplit(n_splits=1, test_size=0.35,
                                    random_state=6)  # 1
    group_splitter = data_full['Leaf number']
    print('111111111')
    print(x_data)

    for train_index, test_index in cv_splitter.split(x_data, y_data,
                                                     group_splitter):
        # print(train_index, test_index)
        x_train, x_test = x_data.iloc[train_index], x_data.iloc[test_index]
        y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

        pls.fit(x_train, y_train)
        y_pred_train = pls.predict(x_train)
        y_pred_test = pls.predict(x_test)
        r2_test = r2_score(y_test, y_pred_test)
        r2_train = r2_score(y_train, y_pred_train)

        mae_test = mean_absolute_error(y_test, y_pred_test)
        mae_train = mean_absolute_error(y_train, y_pred_train)

        print(r2_test, mae_test)
        print(r2_train, mae_train)
        print(r2_score(y_train, y_pred_train))
        print(r2_score(y_test, y_pred_test))
        plt.scatter(y_train, y_pred_train, c='blue', label='Training Set')
        plt.scatter(y_test, y_pred_test, c='red', label='Test Set')

        _line = np.linspace(0.2, 1.2)

        # plt.plot(_line, _line, c='indigo', linestyle='dashed')
        #
        # plt.plot(_line, _line + .06, c='darkslategray', linestyle='dashed')
        # plt.plot(_line, _line - .06, c='darkslategray', linestyle='dashed')
        #
        # left_annote_pos = 0.20
        # plt.annotate("Training Median Absolute Error = {}".format(0.059),
        #              (left_annote_pos, 1.1), fontsize=12)
        # # plt.annotate("Testing Median Absolute Error = {}".format(0.07),
        # #              (left_annote_pos, 1.02), fontsize=12)
        #
        # plt.annotate(u"Training R\u00B2 = {}".format(0.83),
        #              (left_annote_pos, .95), fontsize=12)
        #
        # # plt.annotate(u"Testing R\u00B2 = {}".format(0.82),
        # #              (left_annote_pos, .89), fontsize=12)
        # plt.xlabel('Measured Chlorophyll b (ug/ml)', fontsize=16)
        # plt.ylabel('Predicted Chlorophyll b (ug/ml)', fontsize=16)
        # plt.title("Chlorophyll b Model for AS7262\nbased on 2-Component\nPartial Least Squared Model",
        #           fontsize=18)
        # plt.legend(loc='lower right', fontsize=12)
        plt.tight_layout()
        plt.show()
        plt.scatter(y_pred_train, y_train, c='blue', label='Training Set')
        plt.scatter(y_pred_test, y_test, c='red', label='Test Set')
        plt.show()
Example n. 28
def compute_q2_pls(tdata, tlabel, vdata, vlabel, Rval):
    test = PLSRegression(n_components=Rval)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        test.fit(matricize(tdata), matricize(tlabel))
    Y_pred = test.predict(matricize(vdata))
    Q2 = qsquared(matricize(vlabel), matricize(Y_pred))
    return Q2
Example n. 29
def test_regressor_fit(pls_regressor):
    X = np.random.rand(10, 10)
    y = np.random.rand(10)
    sklearn_regressor = PLSRegression().fit(X, y)
    assert pls_regressor.fit(X, y)
    assert pls_regressor.fit(X[:, 0:1], y)
    with pytest.raises(ValueError):
        sklearn_regressor.fit(X[:, 0:1], y)
Example n. 30
def test_n_components_bounds_pls_regression(n_components, err_type, err_msg):
    """Check the validation of `n_components` for `PLSRegression`."""
    rng = np.random.RandomState(0)
    X = rng.randn(10, 5)
    Y = rng.randn(10, 3)
    est = PLSRegression(n_components=n_components)
    with pytest.raises(err_type, match=err_msg):
        est.fit(X, Y)
Example n. 31
def run_pls(X, Y, LV):
    model = PLSRegression(n_components=LV, scale=False)
    model.fit(X, Y)
    Yr = [ y[0] for y in model.predict(X).tolist() ]
    r2, sdec = calc_regr_metrics(Y_exp=Y, Y_pred=Yr)
    q2, sdep, variables.Y_pred = regr_loo(X=np.array(X), Y=np.array(Y), M=model)
    scores = { 'R2': r2, 'Q2': q2, 'SDEC': sdec,'SDEP': sdep }
    return scores, model
Example n. 32
def Training(df,seed, yratio, xratio, index = 1):
	snp_matrix = np.array(df.values)
	xdim, ydim = snp_matrix.shape

	ydimlist = list(range(0, ydim))
	xdimlist = list(range(0, xdim))

	random.seed(seed)
	random.shuffle(ydimlist) # shuffle the individuals
	random.shuffle(xdimlist) # shuffle the SNPs	
	accuracy = 0

	snp_matrix_shuffle = np.copy(snp_matrix[:, ydimlist])
	snp_matrix_shuffle = np.copy(snp_matrix_shuffle[xdimlist, :])  # shuffle rows of the column-shuffled copy
	snp_matrix_train = snp_matrix_shuffle[:,0:int(ydim*yratio)]
	snp_matrix_test = snp_matrix_shuffle[:,int(ydim*yratio):]

	snp_matrix_train_x = snp_matrix_train[0:int(xdim*xratio),:]
	snp_matrix_test_x = snp_matrix_test[0:int(xdim*xratio),:]

	for i in range(int(xdim*xratio), xdim):
		snp_matrix_train_y = snp_matrix_train[i,:]
		snp_matrix_test_y = snp_matrix_test[i,:]
		if index != 7:
			if index == 1:
				clf = AdaBoostClassifier(n_estimators= 100)
			elif index == 2:
				clf = RandomForestClassifier(n_estimators=100)
			elif index == 3:
				clf = linear_model.LogisticRegression(C=1e5)
			elif index == 4:
				clf = svm.SVC(kernel = 'rbf')
			elif index == 5:
				clf = svm.SVC(kernel = 'poly')
			else:
				clf = svm.SVC(kernel = 'linear')
			clf = clf.fit(snp_matrix_train_x.T, snp_matrix_train_y)
			Y_pred = clf.predict(snp_matrix_test_x.T)
			prediction = snp_matrix_test_y - Y_pred
			wrong = np.count_nonzero(prediction)
			tmp = 1 - (wrong + 0.0) / len(prediction)
			print(tmp)
			accuracy += tmp

	accuracy = accuracy / (xdim - int(xdim*xratio))

	if index == 7:
		pls2 = PLSRegression(n_components = 50, scale=False, max_iter=1000)
		snp_matrix_train_y = snp_matrix_train[int(xdim*xratio):,:]
		pls2.fit(snp_matrix_train_x.T,snp_matrix_train_y.T)
		snp_matrix_test_x = snp_matrix_test[0:int(xdim*xratio),:]
		snp_matrix_test_y = snp_matrix_test[int(xdim*xratio):,:]		
		Y_pred = transform(pls2.predict(snp_matrix_test_x.T))
		prediction = snp_matrix_test_y - Y_pred.T
		xdim, ydim = prediction.shape
		wrong = np.count_nonzero(prediction)
		accuracy = 1 - wrong / (xdim * ydim + 0.0)
	return accuracy
Example n. 33
def fit(predictors, predictands, log=False, **kwargs):
	
	model = PLSRegression(n_components=2)
	try:
		model.fit(predictors, predictands)
	except Exception:
		return None

	return model
Example n. 34
def trainmodels(m, x, y, iter=1000):
    '''For the model type m, train a model on x->y using built-in CV to
    parameterize.  Return both this model and an unfit model that can be used for CV.
    Note for PLS we cheat a little bit since there isn't a built-in CV trainer.
    '''
    
    if m == 'pls':
        #have to manually cross-validate to choose number of components
        kf = KFold(n_splits=3)
        bestscore = -10000
        besti = 0
        for i in range(1, min(100, len(x[0]))):
            #try larger number of components until average CV perf decreases
            pls = PLSRegression(i)
            scores = []
            #TODO: parallelize below
            for train, test in kf.split(x):
                xtrain = x[train]
                ytrain = y[train]
                xtest = x[test]
                ytest = y[test]            
                pls.fit(xtrain,ytrain)
                score = scoremodel(pls,xtest,ytest)
                scores.append(score)
                
            ave = np.mean(scores)
            if ave < bestscore*0.95: #getting significantly worse
                break
            elif ave > bestscore:
                bestscore = ave
                besti = i
        
        model = PLSRegression(besti) 
        model.fit(x,y)
        unfit = PLSRegression(besti)  #choose number of components using full data - iffy
        print "PLS components =",besti

    elif m == 'lasso':
        model = LassoCV(n_jobs=-1,max_iter=iter)
        model.fit(x,y)
        unfit = LassoCV(n_jobs=-1,max_iter=iter) #(alpha=model.alpha_)
        print "LASSO alpha =",model.alpha_
        return (model,unfit)
    elif m == 'ridge':
        model = RidgeCV()
        model.fit(x,y)
        print "Ridge alpha =",model.alpha_
        unfit = RidgeCV()
    else:
        model = ElasticNetCV(n_jobs=-1,l1_ratio=[.1, .5, .7, .9, .95, .99, 1],max_iter=iter)
        model.fit(x,y)
        print "Elastic alpha =",model.alpha_," l1_ratio =",model.l1_ratio_
        unfit = ElasticNetCV(n_jobs=-1,max_iter=iter)

    return (model,unfit)
Example n. 35
def get_correlations(param, spec, wave):
    '''Returns correlations between spec and params by wavelengths'''
    # using PLS
    pls = PLSRegression(10)
    pls.fit(spec, param)

    # get correlations (coef_, x_std_ and y_std_ follow older scikit-learn attribute names)
    nparam = param.shape[1]
    cor = pls.coef_ * np.asarray([pls.x_std_] * nparam).T
    cor /= np.tile(pls.y_std_, (cor.shape[0], 1))

    return cor
Example n. 36
def pls_approach():
    from sklearn.cross_decomposition import PLSRegression

    (X, Y), cities = pull_xy_data()

    pls = PLSRegression()
    pls.fit(X, Y)

    plsX, plsY = pls.transform(X, Y)

    plot(plsX, cities, ["Lat01", "Lat02", "Lat03"], ellipse_sigma=1)

    return "OK What Now?"
Example n. 37
def do_pls(X, Y):
    pls2 = PLSRegression(n_components=2)
    pls2.fit(X,Y)
    out = pls2.transform(X)
    print(out)
    print(out.shape)

    plt.title("PLS2")
    plt.xlabel("PL1")
    plt.ylabel("PL2")
    plt.grid()
    plt.scatter(out[:, 0], out[:, 1], c=Y, cmap='viridis')
    plt.savefig('pls.png', dpi=125)
Example n. 38
class PLSPredictor:
    def __init__(self):
        self.pls2 = PLSRegression(n_components=2,
                                  scale=True,
                                  max_iter=500,
                                  tol=1e-06,
                                  copy=True)

    def predict(self, values):
        return self.pls2.predict(values)

    def train(self, measured_values, screen_points):
        self.pls2.fit(measured_values, screen_points)
Example n. 39
    def __one_pls(self, cat):

        np.seterr(all='raise')

        lcat = np.zeros(self.train_set['labels'].size)

        lcat[self.train_set['labels'] != cat] = -1
        lcat[self.train_set['labels'] == cat] = +1

        pls = PLSRegression(n_components=2, scale=False)

        pls.fit(self.train_set['data'], lcat)

        return pls
Example n. 40
	def fit(self, predictors, predictands, locations, log=False, **kwargs):

		self.locations = locations
		self.models = []
		self.n = predictors['n']

		id = 0
		for location in locations:
			X = extract_n_by_n(predictors, location, **kwargs)
			Y = predictands[:,id]

			if log:
				Y = np.log(Y)

			#pca = PCA(n_components='mle', whiten=True)
			model = PLSRegression(n_components=2)
			
			model = model.fit(X,Y)
			#components = pca.components_
			#pca.components_ = components
			
			self.models.append(model)
			print "pls: ", location, model.score(X, Y), model.x_loadings_.shape, np.argmax(model.x_loadings_, axis=0)

			id += 1
Example n. 41
def build_model(X, y):
	# gbr = GradientBoostingRegressor(learning_rate= 0.03, n_estimators=2000, max_depth=8, subsample=0.9)
	# rf = RandomForestRegressor(n_estimators=200)
	# lr = LinearRegression(fit_intercept=True)
	# knr = KNeighborsRegressor(n_neighbors=10, weights='uniform')
	# svr = SVR(C=5.0, kernel='linear')
	pls = PLSRegression(n_components=35)
	return pls.fit(X, y)
Example n. 42
def reduce_PLS(dataframe):
    PLS_file = "data/pls_structure.pickle"
    selectedcolumn = [x for x in dataframe.columns if x not in ["id", "click", "device_id", "device_ip"]]
    X = np.array(dataframe[selectedcolumn])
    y = np.array(dataframe["click"])
    if os.path.exists(PLS_file):
        stand_PLS = pickle.load(open(PLS_file, 'rb'))
        print("PLS structure is loaded.")
    else:
        stand_PLS = PLSRegression(n_components=10, scale=True)
        stand_PLS.fit(X, y[:, np.newaxis])
        stand_PLS.y_scores_ = None
        stand_PLS.x_scores_ = None
        pickle.dump(stand_PLS, open(PLS_file, "wb"))
        print("PLS transform structure is stored.")
    T = stand_PLS.transform(X)
    print("PLS transformation is performed.")
    return T
Example n. 43
def pls_regr(x, y):
    from sklearn.cross_decomposition import PLSRegression
    n = len(x[0])
    if n < 2:
        raise TypeError
    score = -999999999999
    pls = None
    '''
    for i in range(3, n):
        pls2 = PLSRegression(n_components=i)
        pls2.fit(x,y)
        cscore = pls2.score(x, y)
        #print i, cscore 
        if cscore > score:
            pls = pls2
            score = cscore
    '''
    pls = PLSRegression(n_components=5)
    pls.fit(x,y)
    return pls
Example n. 44
def lex_function_learning(class_name, hyper_vec):

		#pls2 = KernelRidge( kernel = "rbf", gamma= 100)
		#pls2 = KernelRidge( )
		pls2 = PLSRegression(n_components=50, max_iter=5000)

		X = extract_postive_features ( train_dataset[class_name][0], train_dataset[class_name][1] )			

		Y = []

		for hypo_vec in X :

			sub = hyper_vec-hypo_vec
			Y.append(sub) # Target = difference vector ( Hypernym_vector - Hyponym_vector )
			#Y.append(hyper_vec) # Target = Hypernym vector 

		pls2.fit( X, Y)	
		train_acc = pls2.score(X, Y)
		print "class = ", class_name, "train len = ", len(X)
		
		return pls2, train_acc, len(X)
Example n. 46
def hacerPLS(X,Y):
    pls_wild_b = PLSRegression(n_components = 9) 
    pls_wild_b.fit(X,Y)
    Z = pls_wild_b.transform(X)
    scores = list() 
    scores_std = list()
    n_features = np.shape(X)[1]
    
    X, X_test_tot, Y, Y_test_tot = train_test_split(X, Y, test_size=0.5, random_state=0)  # sklearn.model_selection
    N = np.shape(X)[0]
    
    for num_comp in range(n_features):
        kf = KFold(n_splits=10).split(np.arange(N))
        aux_scores = list()
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
              
            if num_comp == 0:
                y_pred = np.mean(y_test)
                y_pred = y_pred* np.ones(np.shape(y_test))
                aux_scores.append(metrics.mean_squared_error(y_test,y_pred))
            
            else:
                pls_foo = PLSRegression(n_components = num_comp)                        
                pls_foo.fit(X_train,y_train)
                y_pred = pls_foo.predict(X_test)
            
                #obtaing the score
                this_score = metrics.mean_squared_error(y_test,y_pred)
                aux_scores.append(this_score)
                
        scores.append(np.mean(aux_scores))
        scores_std.append(np.std(aux_scores))
    
    plt.plot(scores)
    plt.xlabel('Components')
    plt.ylabel("$MSE$")
    plt.title("Animals PLS")
    plt.show()
    
    num_comp = np.argmin(scores)

    pls_pred = PLSRegression(n_components=num_comp)  # use the CV-selected number of components
    pls_pred.fit(X,Y)
    y_pred_test = pls_pred.predict(X_test_tot)
    
    print "MSE test = " + str(metrics.mean_squared_error(Y_test_tot,y_pred_test))
Example n. 47
train = pd.read_csv('train.csv', index_col='id')
targets = pd.get_dummies(train.target)
train.drop('target', axis=1, inplace=True)
train = train.apply(np.log1p)

test = pd.read_csv('test.csv', index_col='id')
test = test.apply(np.log1p)

Xt, Xv, yt, yv = train_test_split(train, targets, test_size=0.2, random_state=27)

best = 10.

for n in range(5,16):
    
    clf = PLSRegression(n_components=n)
    clf.fit(Xt,yt)
    y_pred = clf.predict(Xv)
    loss = multiclass_log_loss(np.argmax(y_pred,axis=1),y_pred)
    if loss < best:
        n_best = n
        best = loss
        postfix = '(*)'
    else:
        postfix = ''
    print ('comps: {:02d}\tLoss:{:5.4f} {}'.format(n,loss,postfix))


clf = PLSRegression(n_components=n_best)  
clf.fit(train,targets)
y_pred = clf.predict(test)

#Select the number of components using CV
#%%
##PLSR
pls_wild_b = PLSRegression(n_components = 3)
pls_wild_b.fit(X_train_prepro,Y_train)
X_train_pls_proj = pls_wild_b.transform(X_train_prepro)
print("loadings")

for i in range(pls_wild_b.n_components):
    plt.figure()
    plt.bar(np.arange(np.shape(X_train_prepro)[1]), pls_wild_b.x_loadings_[:,i])
    if i == 0:
        plt.ylabel('PLS 1st component')
    elif i == 1:
        plt.ylabel('PLS 2nd component')
    else:
        plt.ylabel('PLS 3rd component')
    axis_c = plt.gca()
    axis_c.set_xticklabels(wild_boar_ddbb['header'][3:],fontsize = 7)
    axis_c.set_xticks(axis_c.get_xticks() + 0.5)
plt.yticks(())
plt.show()
Example n. 49

# #############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
Example n. 50
(Xtrain, ytrain) = loadData(xtrainpath, ytrainpath)
(Xtest, ytest) = loadData(xtestpath, ytestpath)

#trim off background and scale
ytrain=ytrain[:,1:]
#ytrain=scale(ytrain)
Xtrain=standardize(Xtrain)

#trim off background and scale
ytest = ytest[:,1:]
#ytest = scale(ytest)
Xtest = standardize(Xtest)

pls = PLSRegression(n_components=10)
pls.fit(Xtrain, ytrain)
y_pls = pls.predict(Xtest)
print(1 + pls.score(Xtest, ytest))


pls_rmse=[]
pls_rmse.append(sqrt(mean_squared_error(ytest[:,0], y_pls[:,0])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:,1], y_pls[:,1])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:,2], y_pls[:,2])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:,3], y_pls[:,3])))

fig = plt.figure(figsize=(20,10))

ax1 = fig.add_subplot(241)
ax1.plot(y_pls[:,0], c='r', label='PLS Fit')
ax1.plot(ytest[:,0], c='grey', label='Target')
Example n. 51
    def generate(self, input=None):   
        dso = input
        
        _experiment_test = self.config.get('experiment_test')
        _experiment_control = self.config.get('experiment_control')
                
        data = dso.data
        
        plsr = PLSRegression(n_components=self.config.get('number_of_components'), scale=self.config.get('autoscale')) #, algorithm=self.config.get('algorithm'))
        Y = np.array([0 if c == _experiment_control else 1 for c in dso.classes[0] ])

        plsr.fit(data, Y)  # variables need to be along the top
        
        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(len(plsr.x_scores_),len(plsr.x_scores_[0])))  
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]

        for n,s in enumerate(plsr.x_scores_.T):
            scored.data[:,n] = s
            scored.labels[1][n] = 'Latent Variable %d' % (n+1) #, plsr.y_weights_[0][n])
                
        # PLS-DA regions; mean +- 95% confidence in each axis for each cluster
        cw_x = defaultdict(list)
        cw_y = defaultdict(list)
        figure_regions = []  # initialise before use below

        for c in list(cw_x.keys()):
            # Calculate mean point
            cx = np.mean( cw_x[c] )
            cy = np.mean( cw_y[c] )
            
            # Calculate 95% CI
            rx = np.std( cw_x[c] ) *2 # 2sd = 95% #1.95 * ( / srn) # 1.95 * SEM => 95% confidence
            ry = np.std( cw_y[c] ) *2 #1.95 * ( / srn)

            figure_regions.append( 
                (c, cx, cy, rx, ry)
            )

        
            
        # Label up the top 50 (the values are retained; just for clarity)
        wmx = np.amax( np.absolute( plsr.x_weights_), axis=1 )
        dso_z = list(zip( dso.scales[1], dso.entities[1], dso.labels[1] ))
        dso_z = sorted( zip( dso_z, wmx ), key=lambda x: x[1])[-50:] # Top 50
        dso_z = [x for x, wmx in dso_z ]    

        weightsd = DataSet(size=plsr.x_weights_.T.shape)
        weightsd.data = plsr.x_weights_.T
        weightsd.scales[1] = input.scales[1]

        dso_lv = {}
        for n in range(0, plsr.x_weights_.shape[1] ):
            lvd =  DataSet( size=(1, input.shape[1] ) )
            lvd.entities[1] = input.entities[1]
            lvd.labels[1] = input.labels[1]
            lvd.scales[1] = input.scales[1]
            lvd.data = plsr.x_weights_[:,n:n+1].T
            dso_lv['lv%s' % (n+1)] = lvd
            weightsd.labels[0][n] = "Weights on LV %s" % (n+1)
            weightsd.classes[0][n] = "LV %s" % (n+1)
                    
        return dict(list({
            'dso': dso,
            'scores':scored,
            'weights':weightsd,
            #'figure_data': figure_data,
            #'figure_regions': figure_regions,
            'y_weights': plsr.y_weights_,
            'x_weights': plsr.x_weights_,
        }.items()) + list(dso_lv.items()) )
Example n. 52
        plt.plot(nComponents,plsCanScores[i,:],lw=3)

    plt.xlim(1,np.amax(nComponents))
    plt.title('PLS Cannonical accuracy')
    plt.xlabel('Number of components')
    plt.ylabel('accuracy')
    plt.legend (['LR','LDA','GNB','Linear SVM','rbf SVM'],loc='lower right')
    plt.grid(True)

if (0):
    #%% PLS Regression
    nComponents = np.arange(1,nClasses+1)
    plsRegScores = np.zeros((5, len(nComponents)))
    for i,n in enumerate(nComponents):
        plsReg = PLSRegression(n_components=n)
        plsReg.fit(Xtrain,Ytrain)
        XtrainT = plsReg.transform(Xtrain)
        XtestT = plsReg.transform(Xtest)
        plsRegScores[:,i] = util.classify(XtrainT,XtestT,labelsTrain,labelsTest)

    
    plsReg = PLSRegression(n_components=2)
    plsReg.fit(Xtrain,Ytrain)
    xt = plsReg.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig,xt,labelsTrain,classColors)
    plt.title('First 2 components of projected data')
    

    #%% Plot accuracies for PLSSVD 
    plt.figure()
Example n. 53
def plsvip (X, Y, V, lat_var):
    attributes = len(X[0])

    if not lat_var:
        latent_variables = attributes
    else:
        latent_variables = lat_var
		
    num_instances = len(X)	
	
    attributes_gone = []

    min_att = -1	

    #start_time = time.time()
    #attr_time = time.time()
    #time_counter = 0
    while attributes>0: 
        #if (attributes +9) %10 ==0:
        #    print "total time: ", time.time() - start_time
        #    print "attr time: ", time.time() - attr_time
        #    attr_time = time.time()

        if (latent_variables == 0) or (latent_variables > attributes):	
            latent_variables = attributes	

        lv_best = best_latent_variable(X, Y, latent_variables, num_instances)
        #print "current best lv: ", lv_best, "num. attr. ", attributes ####
		
        #fin_pls = PLSCanonical(n_components = lv_best)
        fin_pls = PLSRegression(n_components = lv_best)
        fin_pls.fit(X, Y)


        currentR2 = fin_pls.score(X, Y)  

        #######################################w
        # alternative r2
        """
        meanY4r2 = numpy.mean(Y)
        predY = fin_pls.predict(X)
        RSS = 0
        for i in range (len(Y)):
            RSS +=  numpy.power (Y[i] - predY[i], 2)
        TSS = 0
        for i in range (len(Y)):
            TSS += numpy.power (Y[i] - meanY4r2, 2)
        
        alterR2 = 1 - (RSS/TSS)
        #print currentR2, "vs", alterR2
        """
        #######################################w
        
        min_vip = 1000

        if min_att ==-1:
            attributes_gone.append(["None", currentR2, attributes, lv_best])

        ##########################################r
        #threaded version
        """ 
        myThreads = []
        VIPcurrent = []
        for i in range (0,attributes):
            myThreads.append(enthread( target = get_vip, args = (fin_pls, lv_best, i, attributes_gone, attributes  )) )
        for i in range (0,attributes):
            VIPcurrent.append(myThreads[i].get())
      
        min_vip = min(VIPcurrent)
        min_att = VIPcurrent.index(min_vip)
        """ 
        # Working version
        #"""
        for i in range (0,attributes):
            VIPcurrent = get_vip (fin_pls, lv_best, i, attributes_gone, attributes  )
            if VIPcurrent< min_vip:
                min_vip = VIPcurrent
                min_att = i
        #"""
        ##########################################r
        if min_att >-1:
            attributes_gone.append([V[min_att], currentR2, attributes, lv_best]) ####### CURRENT : to BE popped, NOT already popped
        V.pop(min_att)

        for i in range (num_instances):
            X[i].pop(min_att)

        attributes -= 1		
    #print attributes_gone ####
    #time_counter +=1
    return attributes_gone
Example n. 54
# correct but not accurate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.cross_decomposition import PLSCanonical
df=pd.read_csv('newdata.csv')
x=df.drop(['tag'],axis=1)
y=df.drop(['kx','ky','kz','wa','wb','wc','wd','we','wf'],axis=1)
X_train , X_test , Y_train , Y_test = train_test_split(x,y , random_state=5)

plsr=PLSRegression()
plsr.fit(X_train,Y_train)

plsc=PLSCanonical()
plsc.fit(X_train,Y_train)

print (plsr.score(X_test,Y_test))
print (plsc.score(X_test,Y_test))
Example n. 55
#Partial Least Squares Regression
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import scale

X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

#Performing cross-validation for PLS
mse = []
n = len(X_train_scaled)
kf_10 = KFold(n_splits=10, shuffle=True, random_state=0)

for i in np.arange(1,17):
    plsregr = PLSRegression(n_components=i, scale=False)
    plsregr.fit(X_train_scaled,y_train)
    score = -1*cross_val_score(plsregr, X_train_scaled, y_train, cv=kf_10, scoring='neg_mean_squared_error').mean()
    mse.append(score)

plt.plot(np.arange(1,17), np.array(mse), '-v')
plt.title("PLS: MSE vs. Principal Components")
plt.xlabel('Number of principal components in PLS regression')
plt.ylabel('MSE')
plt.xlim((-0.2, 17.2))

#Based on the plot, 12 components minimized the MSE
plsregr_test = PLSRegression(n_components=12, scale=False)
plsregr_test.fit(X_train_scaled, y_train)
MSE_PLS = np.mean((plsregr_test.predict(X_test_scaled) - y_test) ** 2)
# print "Mean Squared Error: ", MSE_PLS
Example n. 56
    X_levelOne = []
    y_levelOne = []
    level0Classifier = []
    for tid, Xp, yp in zip(subjId_train, X_train, y_train):
        print("Predicting subject", vid, "from subject", tid)
        y0 = np.zeros(yp.shape)
        y1 = np.ones(Xt.shape[0])
        X = np.vstack([Xp, Xt])
        yd = np.concatenate([y0, y1])

        pls = PLSRegression(n_components)
        Xp_t, Xp_v, yp_t, yp_v = tts(Xp.copy(), yp.copy(), train_size=0.9)
        yp_t = yp_t.astype(bool)
        yp_t_not = np.vstack((yp_t, ~yp_t)).T
        pls.fit(Xp_t, yp_t_not.astype(int))
        yp_new = pls.predict(Xp_t, copy=True)
        yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
        yp_t = yp_t.astype(int)
        error = ((yp_t - yp_pred) ** 2).sum()
        print("PLS training error", float(error) / yp_t.shape[0])
        yp_new = pls.predict(Xp_v, copy=True)
        yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
        error = ((yp_v - yp_pred) ** 2).sum()
        print("PLS validation error", float(error) / yp_v.shape[0])

        X_new = pls.transform(X)
        rf = RandomForestClassifier(n_estimators=500, max_depth=None, max_features=int(math.sqrt(n_components)), min_samples_split=100, random_state=144, n_jobs=4)
Example n. 57
'''
clf = linear_model.ElasticNet(alpha=0.2, l1_ratio=0.01)
clf.fit(x_scaled, y_scaled)
print(clf.coef_)

yvalid_scaled = clf.predict(xvalid_scaled)

err1= MAPE(y, scalery.inverse_transform(clf.predict(x_scaled)).reshape(-1,1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1,1))

'''
# General Linear Model -- Elastic Net (the commented-out block above)
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=20)
pls.fit(x_scaled, y_scaled)
print(pls.coef_)

yvalid_scaled = pls.predict(xvalid_scaled)

err1= MAPE(y, scalery.inverse_transform(pls.predict(x_scaled)).reshape(-1,1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1,1))

from sklearn.decomposition import PCA
reduced_data = PCA(n_components=2).fit_transform(xtrain_minmax)

pca = PCA(n_components=2)
pca.fit(xtrain_minmax)
print(pca.explained_variance_ratio_)

Example n. 58
import pandas as pd
import numpy as np

_experiment_test = config['experiment_test']
_experiment_control = config['experiment_control']

plsr = PLSRegression(n_components=config['number_of_components'], scale=config['autoscale']) #, algorithm=self.config.get('algorithm'))

# We need classes to do the classification; should check and raise an error
class_idx = input_data.index.names.index('Class')
classes = list( input_data.index.levels[ class_idx ] )

Y = input_data.index.codes[ class_idx ]

plsr.fit(input_data.values, Y)

# Build scores into a DataFrame: no_of_samples x no_of_principal_components
scores = pd.DataFrame(plsr.x_scores_)
scores.index = input_data.index

scoresl = []
for n, s in enumerate(plsr.x_scores_.T):
    scoresl.append('Latent Variable %d' % (n + 1))
scores.columns = scoresl
    

weights = pd.DataFrame( plsr.x_weights_.T )
weights.columns = input_data.columns

dso_lv = {}
Example n. 59
X = dataset["data"]
y = dataset["target"]

# Center each feature and scale the variance to be unitary
X = preprocessing.scale(X)

# Compute the variance for each column
print(numpy.var(X, 0).sum())

# Now use PCA using 3 components
pca = PCA(3)
X2 = pca.fit_transform(X)
print(numpy.var(X2, 0).sum())

pls = PLSRegression(3)
pls.fit(X, y)
X2 = pls.transform(X)
print(numpy.var(X2, 0).sum())

# Make predictions using an SVM with PCA and PLS
pca_error = 0
pls_error = 0
n_folds = 10

svc = LinearSVC()

for train_inds, test_inds in KFold(n_splits=n_folds).split(X):
    X_train, X_test = X[train_inds], X[test_inds]
    y_train, y_test = y[train_inds], y[test_inds]

    # Use PCA and then classify using an SVM
Example n. 60
def bestpls(vipMatrix, X, Y, V):

    ###########################
    #bestR2 = -10000
    #lv_best = 1
    #position = 1
    ###########################
    bestR2 = vipMatrix[0][1]
    lv_best = vipMatrix[0][3]
    position = 0
    ###########################

    # for i in range(len(vipMatrix)):
    #     print(vipMatrix[i])

    for entries in range (len(vipMatrix)):
        #print vipMatrix[entries][1], "=?=", bestR2 #############
        if vipMatrix[entries][1] > bestR2:   
            position = entries
            bestR2 = vipMatrix[entries][1]
            lv_best = vipMatrix[entries][3]

    #################################################################################################qq
    variables = []    
    for i in range (1, position): # not position + 1, as the vipMatrix[position] holds the next variable to be removed
        variables.append(vipMatrix[i][0])
    #print "VAR TO BE REMOVED: ", variables
    V_new_Indices = []
    for i in variables: # removed variable names in random order
        V_new_Indices.append(V.index(i))

    #if V == sorted(V):
    #    print "\nV ok!\n"

    # keep names == separate
    V_new = deepcopy(V)
    for i in variables:
        V_new.remove(i)
        
    X_new = []
    for i in range (len(X)):
        X_new.append([])

    variables_sent = [] ####
    for i in range (len(X)):
        for j in range (len(V)):
            if j not in V_new_Indices:
                #if V[j] not in variables_sent: ####
                #    variables_sent.append(V[j])####
                X_new[i].append(X[i][j])

    # epic test
    if not V_new == sorted(V_new):
        return base64.b64encode("tobulo"), [], [], 0
    #else:
    #    print "v_new ok!"


    #validity tests
    #for i in range (len (variables_sent)):
    #    if variables_sent[i] == V_new[i]:
    #        print "ok", i
    #print "var: ", len(V), "selected: ", len(V_new), "data (var) init length: ", len(X[0]), "data (var) now length: ", len(X_new[0])
    """ 
    # PREVIOUS
    variables = []    
    for i in range (1, position):
        variables.append(vipMatrix[i][0])

    V_new = deepcopy(V)
    for i in variables:
        V_new.remove(i) ################ remove by index??? CHECK!!!!

    X_new = []
    for i in range (len(X)):
        X_new.append([])

    for i in range (len(X)):
        for j in range (len(V_new)): ####### HERE ALSO
            X_new[i].append(X[i][j])
    """
    #################################################################################################qq
    #print V_new, "OOOO\n\n" #var names == cool
    #print "\n\nNumber of variables ", len(V_new), " and latent: ", lv_best
    #best_pls = PLSCanonical(n_components = lv_best)
    best_pls = PLSRegression(n_components = lv_best)
    best_pls.fit(X_new, Y)

    saveas = pickle.dumps(best_pls)
    encoded = base64.b64encode(saveas)	
	
    return encoded, X_new, V_new, lv_best