Example #1
class _PLSSVDImpl:
    """Thin wrapper that delegates to the underlying estimator `Op`."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        # PLSSVD is supervised, so pass y through when it is provided
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
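
A minimal usage sketch of the wrapper, assuming Op aliases scikit-learn's PLSSVD (the alias, e.g. from sklearn.cross_decomposition import PLSSVD as Op, sits outside this excerpt):

import numpy as np
from sklearn.cross_decomposition import PLSSVD as Op  # assumed alias

X = np.random.rand(20, 5)
y = np.random.rand(20, 3)
impl = _PLSSVDImpl(n_components=2)
Xt = impl.fit(X, y).transform(X)  # x-scores, shape (20, 2)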
Example #2

microbe_iv['group'] = microbe_iv['group'].map(catdict)
metabolite_iv['group'] = metabolite_iv['group'].map(catdict)

# highlight features with p-value <= 0.001
max_pval = 0.001

microbe_iv.loc[microbe_iv.pval > max_pval, 'group'] = 'None'
print('Number of significant microbes: %d' %
      microbe_iv[microbe_iv['group'] != 'None'].shape[0])

metabolite_iv.loc[metabolite_iv.pval > max_pval, 'group'] = 'None'
print('Number of significant metabolites: %d' %
      metabolite_iv[metabolite_iv['group'] != 'None'].shape[0])

plssvd = PLSSVD(n_components=3)
plssvd.fit(X=clr(centralize(multiplicative_replacement(microbes))),
           Y=clr(centralize(multiplicative_replacement(metabolites))))


def standardize(A):
    A = (A - np.mean(A, axis=0)) / np.std(A, axis=0)
    return A


# note: the columns hold standardized PLS weights, despite the 'PCA*' labels
pls_microbes = pd.DataFrame(standardize(plssvd.x_weights_),
                            columns=['PCA1', 'PCA2', 'PCA3'],
                            index=microbes.columns)
pls_metabolites = pd.DataFrame(standardize(plssvd.y_weights_),
                               columns=['PCA1', 'PCA2', 'PCA3'],
                               index=metabolites.columns)

color_map = {
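
The preprocessing chain above (zero replacement, centering around the geometric mean, centered log-ratio transform, then PLSSVD) can be sketched end to end; this assumes the compositional helpers come from scikit-bio's skbio.stats.composition and that microbes and metabolites are samples-by-features count tables:

import numpy as np
import pandas as pd
from skbio.stats.composition import clr, centralize, multiplicative_replacement
from sklearn.cross_decomposition import PLSSVD

rng = np.random.default_rng(0)
microbes = pd.DataFrame(rng.integers(0, 50, size=(30, 8)))
metabolites = pd.DataFrame(rng.integers(0, 50, size=(30, 6)))

# replace zeros, center each composition, then apply the clr transform
X = clr(centralize(multiplicative_replacement(microbes)))
Y = clr(centralize(multiplicative_replacement(metabolites)))

plssvd = PLSSVD(n_components=3).fit(X, Y)
print(plssvd.x_weights_.shape)  # (8, 3): one weight vector per component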
Example #3
    def fit(self, X, y, split_type: str = "extreme"):
        """Split multi-label y dataset into train and test subsets.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features).

        y : {array-like, sparse matrix} of shape (n_samples, n_labels).

        split_type : Splitting type of {naive, extreme, iterative}.

        Returns
        -------
        data partition : two lists of indices representing the resulted data split
        """

        if X is None:
            raise ValueError("Please provide a dataset.")
        if y is None:
            raise ValueError("Please provide labels for the dataset.")
        assert X.shape[0] == y.shape[0]

        check, X = check_type(X=X, return_list=False)
        if not check:
            raise TypeError(
                "The method only supports scipy.sparse, numpy.ndarray, and list type of data")

        check, y = check_type(X=y, return_list=False)
        if not check:
            raise TypeError(
                "The method only supports scipy.sparse, numpy.ndarray, and list type of data")

        num_examples, num_labels = y.shape

        # check whether data is singly labeled
        if num_labels == 1:
            # transform it to multi-label data
            classes = list(set([i[0] if i else 0 for i in y.data]))
            mlb = LabelBinarizer(labels=classes)
            y = mlb.transform(y)

        # 1)- Compute covariance of X and y using SVD
        if not self.is_fit:
            model = PLSSVD(n_components=self.num_clusters,
                           scale=True,
                           copy=False)
            optimal_init = self.__optimal_learning_rate(alpha=self.lr)
            list_batches = np.arange(start=0,
                                     stop=num_examples,
                                     step=self.batch_size)
            total_progress = self.num_epochs * len(list_batches)
            for epoch in np.arange(start=1, stop=self.num_epochs + 1):
                for idx, batch_idx in enumerate(list_batches):
                    current_progress = (epoch - 1) * len(list_batches) + idx + 1
                    desc = '\t>> Computing the covariance of X and y using PLSSVD: {0:.2f}%...'.format(
                        (current_progress / total_progress) * 100)
                    if total_progress == current_progress:
                        print(desc)
                    else:
                        print(desc, end="\r")
                    model.fit(
                        X[batch_idx:batch_idx + self.batch_size].toarray(),
                        y[batch_idx:batch_idx + self.batch_size].toarray())
                    U = model.x_weights_
                    learning_rate = 1.0 / (self.lr *
                                           (optimal_init + epoch - 1))
                    # apparent elastic-net-style adjustment of the weights:
                    # an l2 term (0.5 * 2 * U) and an l1 subgradient (0.5 * sign(U))
                    U = U + learning_rate * (0.5 * 2 * U)
                    U = U + learning_rate * (0.5 * np.sign(U))
                    model.x_weights_ = U
            self.U = lil_matrix(model.x_weights_)
            del U, model

        # 2)- Project X onto a low-dimensional space spanned by the orthonormal
        #     basis U obtained from the SVD
        desc = '\t>> Projecting examples onto the low-dimensional orthonormal basis U...'
        print(desc)
        Z = X.dot(self.U)

        # 3)- Cluster the low-dimensional examples
        if not self.is_fit:
            desc = '\t>> Clustering the resulting low-dimensional examples...'
            print(desc)
            self.centroid_kmeans, label_kmeans = kmeans2(data=Z.toarray(),
                                                         k=self.num_clusters,
                                                         iter=self.num_epochs,
                                                         minit='++')
        else:
            label_kmeans = np.array(
                [np.argmin(z.dot(self.centroid_kmeans), 1)[0] for z in Z])

        mlb = LabelBinarizer(labels=list(range(self.num_clusters)))
        y = mlb.reassign_labels(y, mapping_labels=label_kmeans)
        self.is_fit = True

        # perform splitting
        if split_type == "extreme":
            st = ExtremeStratification(
                swap_probability=self.swap_probability,
                threshold_proportion=self.threshold_proportion,
                decay=self.decay,
                shuffle=self.shuffle,
                split_size=self.split_size,
                num_epochs=self.num_epochs,
                verbose=False)
            train_list, test_list = st.fit(X=X, y=y)
        elif split_type == "iterative":
            st = IterativeStratification(shuffle=self.shuffle,
                                         split_size=self.split_size,
                                         verbose=False)
            train_list, test_list = st.fit(y=y)
        else:
            st = NaiveStratification(shuffle=self.shuffle,
                                     split_size=self.split_size,
                                     batch_size=self.batch_size,
                                     num_jobs=self.num_jobs,
                                     verbose=False)
            train_list, test_list = st.fit(y=y)
        return train_list, test_list
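
The core of the fit above (learn an orthonormal basis U from the X/y cross-covariance with PLSSVD, project X onto it, then cluster the projections) can be isolated in a short sketch; shapes are illustrative, and kmeans2 is scipy.cluster.vq.kmeans2:

import numpy as np
from scipy.cluster.vq import kmeans2
from sklearn.cross_decomposition import PLSSVD

rng = np.random.default_rng(0)
X = rng.random((100, 20))              # examples
y = rng.integers(0, 2, size=(100, 5))  # multi-label indicator matrix

model = PLSSVD(n_components=4).fit(X, y)
U = model.x_weights_                   # (20, 4), orthonormal columns
Z = X @ U                              # low-dimensional projection
centroids, labels = kmeans2(Z, 4, minit='++')
print(labels.shape)                    # (100,): one cluster id per example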
Example #4
plt.show()

pos_max = np.argmax(acc_val)
num_opt_feat = rang_feat[pos_max]
test_acc_opt = acc_test[pos_max]
print('Optimal number of features: ' + str(num_opt_feat))
print("The optimum test accuracy is  %2.2f%%" % (100 * test_acc_opt))

########################### PLS ##################################

from sklearn.cross_decomposition import PLSSVD

N_feat_max = n_classes  # the sweep below evaluates 1 .. n_classes - 1 new features
# 1. Obtain PLS projections
pls = PLSSVD(n_components=N_feat_max)
pls.fit(X_train, Y_train_bin)
X_train_pls = pls.transform(X_train)
X_val_pls = pls.transform(X_val)
X_test_pls = pls.transform(X_test)

# 2. Compute and plot accuracy evolution
rang_feat = np.arange(1, N_feat_max, 1)
[acc_tr, acc_val,
 acc_test] = SVM_accuracy_evolution(X_train_pls, Y_train, X_val_pls, Y_val,
                                    X_test_pls, Y_test, rang_feat, C, gamma)
plt.figure()
plot_accuracy_evolution(rang_feat, acc_tr, acc_val, acc_test)
plt.show()

# 3. Find the optimum number of features
pos_max = np.argmax(acc_val)
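
Y_train_bin above is assumed to be a one-hot (binarized) encoding of the training labels: PLSSVD requires a 2-D target, and with one-hot targets at most min(n_samples, n_features, n_classes) components are available. A minimal way to build it, assuming integer class labels (note that for a binary problem label_binarize returns a single column):

import numpy as np
from sklearn.preprocessing import label_binarize

Y_train_bin = label_binarize(Y_train, classes=np.unique(Y_train))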
Example #5
        plt.scatter(xtPCA[labelsTrain == l, 0],
                    xtPCA[labelsTrain == l, 1],
                    alpha=0.5,
                    c=classColors[i, :])

    plt.title('Projected data over the components')
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.show()

if (0):

    #%% PARTIAL LEAST SQUARES
    #%% PLS SVD
    nComponents = np.arange(1, nClasses + 1)
    plsSvdScores = np.zeros((5, len(nComponents)))
    for i, n in enumerate(nComponents):
        plssvd = PLSSVD(n_components=n)
        plssvd.fit(Xtrain, Ytrain)
        XtrainT = plssvd.transform(Xtrain)
        XtestT = plssvd.transform(Xtest)
        plsSvdScores[:, i] = util.classify(XtrainT, XtestT, labelsTrain,
                                           labelsTest)

    plssvd = PLSSVD(n_components=2)
    xt, yt = plssvd.fit_transform(Xtrain, Ytrain)
    fig = plt.figure()
    util.plotData(fig, xt, labelsTrain, classColors)
    plt.title('First 2 components of projected data')

    #%% Plot accuracies for PLSSVD
    plt.figure()
    for i in range(5):
        plt.plot(nComponents, plsSvdScores[i, :], lw=3)
    dataTrainT = pca.fit_transform(dataTrain)
    dataTestT = pca.transform(dataTest)
    pcaScores[:, i] = util.classify(dataTrainT, dataTestT, labelsTrain, labelsTest)

# Training data with 2 dimensions
pca = PCA(n_components=2)
xtPCA = pca.fit_transform(dataTrain)
uPCA = pca.components_

#%% PARTIAL LEAST SQUARES
#%% PLS SVD
nComponents = np.arange(1, nClasses + 1)
plsSvdScores = np.zeros((2, len(nComponents)))
for i, n in enumerate(nComponents):
    plssvd = PLSSVD(n_components=n)
    plssvd.fit(dataTrain, Ytrain)
    dataTrainT = plssvd.transform(dataTrain)
    dataTestT = plssvd.transform(dataTest)
    plsSvdScores[:, i] = util.classify(dataTrainT, dataTestT, labelsTrain, labelsTest)
fig = plt.figure()
util.plotAccuracy(fig, nComponents, plsSvdScores)
plt.title('PLS SVD accuracy', figure=fig)

plssvd = PLSSVD(n_components=2)
xt, yt = plssvd.fit_transform(dataTrain, Ytrain)
fig = plt.figure()
util.plotData(fig, xt, labelsTrain, classColors)

# draw the first PLS direction and its perpendicular over the scatter
u = plssvd.x_weights_
plt.quiver(u[0, 0], u[1, 0], color='k', edgecolor='k', lw=1, scale=0.1, figure=fig)
plt.quiver(-u[1, 0], u[0, 0], color='k', edgecolor='k', lw=1, scale=0.4, figure=fig)
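
util is a project-local helper that these fragments never define. A hypothetical stand-in consistent with how util.classify is called here, returning one test accuracy per classifier to match the two rows of plsSvdScores in this fragment:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

def classify(XtrainT, XtestT, labelsTrain, labelsTest):
    # hypothetical stand-in for util.classify: fit two classifiers on the
    # projected features and return their test accuracies
    clfs = [KNeighborsClassifier(), LogisticRegression(max_iter=1000)]
    return np.array([clf.fit(XtrainT, labelsTrain).score(XtestT, labelsTest)
                     for clf in clfs])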