Code Example #1
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSRegression


def PLS(mol_a, mol_i):
    # Build a feature matrix from active (mol_a) and inactive (mol_i) molecules;
    # `center` is assumed to be defined in the enclosing module.
    X = []
    Y = []
    for mol in mol_a:
        b = np.array([
            mol.x - center[0], mol.y - center[1], mol.z - center[2], mol.dg,
            mol.dh, mol.tds
        ])

        X.append(b)
        Y.append(np.array([1, 1]))

    for mol in mol_i:
        b = np.array([
            mol.x - center[0], mol.y - center[1], mol.z - center[2], mol.dg,
            mol.dh, mol.tds
        ])

        X.append(b)
        Y.append(np.array([0, 0]))

    pls2 = PLSRegression(n_components=2)
    x_scores, y_scores = pls2.fit_transform(X, Y)

    plt.figure(10, figsize=(5, 5))
    plt.scatter(x_scores[:, 0], x_scores[:, 1])
    plt.scatter(y_scores[:, 0], y_scores[:, 1])
    plt.show()
    print(np.shape(x_scores))
    print(np.shape(y_scores))
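A note on the return value used here and in most of the examples below: when a target is passed, scikit-learn's PLSRegression.fit_transform returns an (x_scores, y_scores) tuple, which is why many snippets index the result with [0]. A minimal sketch (the shapes are illustrative assumptions, not from the example above):

import numpy as np
from sklearn.cross_decomposition import PLSRegression

rng = np.random.default_rng(0)
X = rng.random((20, 6))  # 20 samples, 6 features
Y = rng.random((20, 2))  # 2 response columns

pls = PLSRegression(n_components=2)
x_scores, y_scores = pls.fit_transform(X, Y)  # a tuple when a target is given
print(x_scores.shape, y_scores.shape)         # (20, 2) (20, 2)
x_only = pls.transform(X)                     # transform without y returns x_scores alone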
Code Example #2
from sklearn.cross_decomposition import PLSRegression


def pls_da(df, n):
    X = df.iloc[:, 0:-1]
    y = df.iloc[:, -1]
    # y_class = pd.get_dummies(y)
    plsda = PLSRegression(n_components=n)
    reduced_x = plsda.fit_transform(X, y)
    return reduced_x[0]  # fit_transform returns (x_scores, y_scores); keep the X scores
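The commented-out pd.get_dummies line hints at the usual multiclass variant: regress on a one-hot indicator matrix instead of the raw label column. A minimal hedged sketch (the DataFrame layout, with the label in the last column, is taken from the function above):

import pandas as pd
from sklearn.cross_decomposition import PLSRegression

def pls_da_multiclass(df, n):
    X = df.iloc[:, 0:-1]
    y_class = pd.get_dummies(df.iloc[:, -1])   # one indicator column per class
    plsda = PLSRegression(n_components=n)
    return plsda.fit_transform(X, y_class)[0]  # x_scores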
Code Example #3
def train_model(model_name, X, y, save=True):
    # window_sizes = [128, 256, 512, 1024]
    # angles = ["right_shoulder", "left_shoulder", "right_elbow", "left_elbow", "right_hip", "left_hip", "right_knee",
    #           "left_knee"]
    pls = PLSRegression(n_components=5)
    X = pls.fit_transform(X, y)[0]  # keep the supervised PLS projection of X

    if "lscp" in model_name:
        model = construct_lscp()
    elif "xgbod" in model_name:
        model = construct_xgbod()
    elif "simple-mean" in model_name:
        model = construct_simple_aggregator("average")
    elif "simple-max" in model_name:
        model = construct_simple_aggregator("maximization")

    model.fit(X, y)

    model_dict = {"pls": pls, "model": model}
    if save:
        if not os.path.exists("saved_models"):
            os.mkdir("saved_models")
        save_path = os.path.join("saved_models", model_name + ".joblib")
        joblib.dump(model_dict, save_path)

    return model_dict
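Loading the bundle back mirrors the dump above; a minimal hedged sketch (the file name and feature width are hypothetical, and the wrapped detector is assumed to expose predict()):

import joblib
import numpy as np

bundle = joblib.load("saved_models/xgbod-run1.joblib")  # hypothetical file name
X_new = np.random.rand(5, 20)                           # illustrative new samples
X_red = bundle["pls"].transform(X_new)                  # reuse the fitted PLS projection
print(bundle["model"].predict(X_red))                   # assumes the detector exposes predict()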
Code Example #4
File: PLS_DA_method.py Project: JhonG0527/P4
def PLS_DA(datos):
        
    global pls_bi
        
    datos_bi = datos[(datos['etiqueta'] == 5 ) | (datos['etiqueta'] == 6)]
    
    X_bi = savgol_filter(datos_bi.values[:,2:], 15, polyorder = 3, deriv=0)
    
    y_biP = datos_bi["etiqueta"].values
    
    y_bi = (y_biP == 6).astype('uint8')
    
    
    pls_bi = PLSRegression(n_components=2)
    
    X_pls = pls_bi.fit_transform(X_bi, y_bi)[0] 
    
    labplot = ["60/40 ratio", "50/50 ratio"]
    
    unique = list(set(y_bi))
    colors = [plt.cm.jet(float(i)/max(unique)) for i in unique]
    with plt.style.context(('ggplot')):
        plt.figure(figsize=(12,10))
        for i, u in enumerate(unique):
            col = np.expand_dims(np.array(colors[i]), axis=0)
            x = [X_pls[j,0] for j in range(len(X_pls[:,0])) if y_bi[j] == u]
            y = [X_pls[j,1] for j in range(len(X_pls[:,1])) if y_bi[j] == u]
            plt.scatter(x, y, c=col, s=100, edgecolors='k', label=str(u))
        plt.xlabel('Variable Latente 1')
        plt.ylabel('Variable Latente 2')
        plt.legend(labplot, loc='lower left')
        plt.title('Descomposición cruzada PLS')
        plt.show()
            
    X_entreno, X_prueba, y_entreno, y_prueba = train_test_split(X_bi, y_bi, test_size=0.2, random_state=19)

    pls_bi = PLSRegression(n_components=2)
    
    pls_bi.fit(X_entreno, y_entreno)
    
    y_prediccion1 = pls_bi.predict(X_prueba)[:,0]
    prediccion_binaria1 = (y_prediccion1 > 0.5).astype('uint8')
    print(prediccion_binaria1, y_prueba)
    
    precision = []
    A=[]
    m=0
    cvalor = KFold(n_splits=40, shuffle=True, random_state=19)
    for train, test in cvalor.split(X_bi):
        y_prediccion = PLS_DA1(X_bi[train,:], y_bi[train], X_bi[test,:])
        A.append(y_prediccion)
        precision.append(accuracy_score(y_bi[test], y_prediccion))
        m = m + 1
    print("Precisión Promedio para 40 Divisiones: ", np.array(precision).mean())
    
    return prediccion_binaria1, precision
Code Example #5
def test_sanity_check_pls_regression():
    # Sanity check for PLSRegression
    # The results were checked against the R-packages plspm, mixOmics and pls

    d = load_linnerud()
    X = d.data
    Y = d.target

    pls = PLSRegression(n_components=X.shape[1])
    X_trans, _ = pls.fit_transform(X, Y)

    # FIXME: one would expect y_trans == pls.y_scores_ but this is not
    # the case.
    # xref: https://github.com/scikit-learn/scikit-learn/issues/22420
    assert_allclose(X_trans, pls.x_scores_)

    expected_x_weights = np.array([
        [-0.61330704, -0.00443647, 0.78983213],
        [-0.74697144, -0.32172099, -0.58183269],
        [-0.25668686, 0.94682413, -0.19399983],
    ])

    expected_x_loadings = np.array([
        [-0.61470416, -0.24574278, 0.78983213],
        [-0.65625755, -0.14396183, -0.58183269],
        [-0.51733059, 1.00609417, -0.19399983],
    ])

    expected_y_weights = np.array([
        [+0.32456184, 0.29892183, 0.20316322],
        [+0.42439636, 0.61970543, 0.19320542],
        [-0.13143144, -0.26348971, -0.17092916],
    ])

    expected_y_loadings = np.array([
        [+0.32456184, 0.29892183, 0.20316322],
        [+0.42439636, 0.61970543, 0.19320542],
        [-0.13143144, -0.26348971, -0.17092916],
    ])

    assert_array_almost_equal(np.abs(pls.x_loadings_),
                              np.abs(expected_x_loadings))
    assert_array_almost_equal(np.abs(pls.x_weights_),
                              np.abs(expected_x_weights))
    assert_array_almost_equal(np.abs(pls.y_loadings_),
                              np.abs(expected_y_loadings))
    assert_array_almost_equal(np.abs(pls.y_weights_),
                              np.abs(expected_y_weights))

    # The R / Python difference in the signs should be consistent across
    # loadings, weights, etc.
    x_loadings_sign_flip = np.sign(pls.x_loadings_ / expected_x_loadings)
    x_weights_sign_flip = np.sign(pls.x_weights_ / expected_x_weights)
    y_weights_sign_flip = np.sign(pls.y_weights_ / expected_y_weights)
    y_loadings_sign_flip = np.sign(pls.y_loadings_ / expected_y_loadings)
    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip)
    assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip)
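The sign-flip bookkeeping above reflects a general property of PLS (and other SVD-based) decompositions: scores, weights, and loadings are only determined up to a per-component sign. A minimal hedged sketch of one common normalization (the convention chosen here is an assumption, not scikit-learn's internal one):

import numpy as np

def align_signs(mat):
    # Flip each column so its largest-magnitude entry is positive,
    # making two otherwise sign-ambiguous fits directly comparable.
    idx = np.argmax(np.abs(mat), axis=0)
    signs = np.sign(mat[idx, np.arange(mat.shape[1])])
    return mat * signs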
Code Example #6
File: office.py Project: chenchao666/JDMC
def ELM_test(split, source_name, target_name):
    source_path = os.path.join(data_path, source_name + "_SURF_L10.mat")
    target_path = os.path.join(data_path, target_name + "_SURF_L10.mat")
    train_x_ss, train_y_ss, train_x_ts, train_y_ts, test_xs, test_ys = load_mmdt_split(
        split, source_path, target_path)
    accs = []
    for i in range(20):
        print("####### " + str(i + 1) + " #######")
        train_x_s, train_y_s, train_x_t, train_y_t, test_x, test_y = np.squeeze(
            train_x_ss[i]), np.squeeze(train_y_ss[i]), np.squeeze(
                train_x_ts[i]), np.squeeze(train_y_ts[i]), np.squeeze(
                    test_xs[i]), np.squeeze(test_ys[i])
        # source_dict,target_dict = {},{}
        # source_dict['fts'] = np.append(train_x_s,train_x_t,axis=0)
        # source_dict['labels'] = np.append(train_y_s,train_y_t,axis=0).T
        # savemat("./others/source.mat",source_dict)
        # target_dict['fts'] = test_x
        # target_dict['labels'] = test_y.T
        # savemat("./others/target.mat",target_dict)
        #     train_x_s = (train_x_s-np.mean(train_x_s,1,keepdims=True))/np.std(train_x_s,1,keepdims=True)
        #     train_x_t = (train_x_t-np.mean(train_x_t,1,keepdims=True))/np.std(train_x_t,1,keepdims=True)
        #     test_x = (test_x-np.mean(test_x,1,keepdims=True))/np.std(test_x,1,keepdims=True)
        train_y_s = LabelTransform(train_y_s)
        train_y_t = LabelTransform(train_y_t)
        test_y = LabelTransform(test_y)
        pca = PCA(dim)  # `dim` is assumed to be defined elsewhere in the module
        pls = PLSRegression(dim)
        reduc = "pca"
        if reduc == "pls":
            train_x_s, _ = pls.fit_transform(train_x_s, train_y_s)
            paced = pca.fit_transform(np.vstack([train_x_t, test_x]))
            train_x_t, test_x = paced[0:30, :], paced[30:, :]
        elif reduc == "pca":
            paced = pca.fit_transform(
                np.vstack([train_x_s, train_x_t, test_x]))
            if source_name == "amazon":
                train_x_s, train_x_t, test_x = paced[0:200, :], paced[
                    200:230, :], paced[230:, :]
            else:
                train_x_s, train_x_t, test_x = paced[0:80, :], paced[
                    80:110, :], paced[110:, :]
        else:
            raise NotImplementedError

        # train model
        net = ELM(train_x_s, train_y_s, test_x, test_y, 1000)
        net.ParamInit()
        net.Activation('relu')
        net.TrainELM('Lp', 0.01)
        net.TrainAccuracy('relu')
        net.TestAccuracy('relu')
        net.printf()
        accs.append(net.TestAcc)
    return np.mean(accs) * 100, np.std(accs) * 100 / np.sqrt(len(accs))
Code Example #7
File: PLS (sf vs agg).py Project: BeatricePe/pls-
def Pls(df, df2, string):
    pls2 = PLSRegression(n_components=2)
    xs, ys = pls2.fit_transform(df, df2)
    t = df2.values
    principalDf = pd.DataFrame(data=xs, columns=['pls 1', 'pls 2'])
    pls = cross_decomposition.PLSRegression(n_components=10)
    pls.fit(df, df2)
    variance = np.var(pls.x_scores_, axis=0)
    principalDf[string] = t
    return principalDf, variance
Code Example #8
from sklearn.metrics import mean_squared_error, r2_score


# In[136]:


# Split data to train and test on 50-50 ratio
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=None)


# In[137]:


pls = PLSRegression(n_components=27)
X_pls = pls.fit_transform(X_train, y_train)[0]  # fit against the training target, not the test split
x2 = pls.transform(x)


# In[138]:


x2=pd.DataFrame(x2)
print(x2)
#x2= NormalizeData(x2)
#print(X_pls)
#two_arrays = X_pls
#datapls = np.hstack(two_arrays)
#np.savetxt('lungcancerpls111.csv', datapls, delimiter=',')

Code Example #9
import warnings

import numpy as np
import sklearn.base
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cross_decomposition import PLSCanonical, PLSRegression
from sklearn.linear_model import LinearRegression


class PLSClassifier(BaseEstimator, ClassifierMixin):
    __name__ = 'MultiLayeredPLS'

    def __init__(self, estimator=None, n_iter=1500, eps=1e-6, n_comp=10, mode='regression'):
        warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

        self.n_iter = n_iter
        self.eps = eps
        self.n_comp = n_comp
        self.mode = mode
        self.estimator = estimator

        self.estimator_ = None
        self.pls = None

    def fit(self, X, y):
        # if X is not np.array or y is not np.array:
        #     print('x and y must be of type np.array')
        #     raise ValueError
        if X.shape[0] != y.shape[0]:
            raise ValueError('X and y must have the same number of samples')

        if self.estimator is None:
            self.estimator_ = LinearRegression()
        else:
            self.estimator_ = sklearn.base.clone(self.estimator)  # clone the template, not the fitted slot

        self.classes_, target = np.unique(y, return_inverse=True)

        target[target == 0] = -1

        if self.mode == 'canonical':
            self.pls = PLSCanonical(n_components=self.n_comp, scale=True, max_iter=self.n_iter, tol=self.eps)
        elif self.mode == 'regression':
            self.pls = PLSRegression(n_components=self.n_comp, scale=True, max_iter=self.n_iter, tol=self.eps)
        else:
            raise ValueError("mode must be 'canonical' or 'regression'")
        proj_x, proj_y = self.pls.fit_transform(X, target)

        self.estimator_.fit(proj_x, target)

        return self

    def predict_value(self, x):
        resp = self.decision_function(x)
        if resp.ndim == 1:
            ans = np.zeros(resp.shape, dtype=np.int32)
            ans[resp > 0] = self.classes_[1]
            ans[resp <= 0] = self.classes_[0]
        else:
            ans = self.classes_[np.argmax(resp, axis=1)]

        return ans

    def predict_confidence(self, x):
        resp = self.decision_function(x)
        return resp[0]

    def decision_function(self, x):
        x = np.array(x).reshape((1, -1))
        proj = self.pls.transform(x)
        resp = self.estimator_.predict(proj)
        return resp

    def predict_proba(self, x):
        resp = self.decision_function(x)
        resp = np.clip(resp, -1, 1)  # bound the regression response to [-1, 1]
        resp = (resp + 1) / 2        # map [-1, 1] onto [0, 1]
        # resp = np.exp(resp)
        # for r in range(len(resp)):
        #     resp[r] /= np.sum(resp[r])

        return resp
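A hedged usage sketch for the class above (the synthetic data and parameters are illustrative assumptions, not from the original project):

import numpy as np

clf = PLSClassifier(n_comp=2)
X = np.random.rand(40, 6)              # 40 samples, 6 features
y = np.array([0] * 20 + [1] * 20)      # binary labels
clf.fit(X, y)
print(clf.predict_value(X[0]))         # one of clf.classes_
print(clf.predict_confidence(X[0]))    # signed regression response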
Code Example #10
class PLS(Model):

    # X represents the features, Y represents the labels
    X = None
    Y = None
    prediction = None
    model = None


    def __init__(self, X=None, Y=None,  n_components=2, type='regressor', cfg=False):
        self.name = 'PLS'

        if X is not None:
            self.X = X

        if Y is not None:
            self.Y = Y

        self.type = type
        self.cfg = cfg
        self.n_components = n_components
        self.model = PLSRegression(n_components=n_components)


    def fit(self, X=None, Y=None):
        if X is not None:
            self.X = X

        if Y is not None:
            self.Y = Y

        print('PLS Train started............')
        self.model.fit(self.X, self.Y)
        print('PLS completed..........')

        return self.model


    def fit_transform(self, X=None, Y=None):
        if X is not None:
            self.X = X

        if Y is not None:
            self.Y = Y

        print('PLS Train/Transform started............')
        self.X = self.model.fit_transform(self.X, self.Y)[0]  # PLS requires Y to fit; keep the X scores
        print('PLS completed..........')

        self.X = pd.DataFrame(self.X)
        return self.X

    def predict(self, test_features):
        print('Prediction started............')
        self.predictions = self.model.predict(test_features)
        print('Prediction completed..........')
        return self.predictions


    def save(self):
        if self.cfg:
            f = open('pls_configs.txt', 'w')
            f.write(json.dumps(self.model.get_params()))
            f.close()
        print('No models will be saved for PLS')

    def featureImportance(self):
    #    if X_headers is None:
    #        X_headers = list(self.X)
#
#        feature_importance_ = zip(self.model.coef_.reshape(1,-1)[0], X_headers)
#        feature_importance = set(feature_importance_)

        return self.model.coef_


    def getAccuracy(self, test_labels, predictions, origin=0, hitmissr=0.8):
        correct = 0
        df = pd.DataFrame(data=predictions.flatten())
        for i in range(len(df)):
            if 1 - abs(df.values[i] - test_labels.values[i])/abs(df.values[i]) >= hitmissr:
                correct = correct + 1
        return float(correct)/len(df)

    def getConfusionMatrix(self, test_labels, predictions, label_headers):
        return 'No Confusion Matrix for Regression'

    def getRSquare(self, test_labels, predictions, mode='single'):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            if mode == 'multiple':
                errors = r2_score(test_labels, df, multioutput='variance_weighted')
            else:
                errors = r2_score(test_labels, df)
            return errors
        else:
            return 'No RSquare for Classification'

    def getMSE(self, test_labels, predictions):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = mean_squared_error(test_labels, df)
            return errors
        else:
            return 'No MSE for Classification'

    def getMAPE(self, test_labels, predictions):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = np.mean(np.abs((test_labels - df.values) / test_labels)) * 100
            return errors.values[0]
        else:
            return 'No MAPE for Classification'

    def getRMSE(self, test_labels, predictions):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = sqrt(mean_squared_error(test_labels, df))
            return errors
        else:
            return 'No RMSE for Classification'
Code Example #11
    ax.scatter(X_r[y == i, 0], X_r[y == i, 1], X_r[y == i, 2], c=c)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.axis('equal')
ax.set_xlim([-1000,4000])
ax.set_ylim([-1000,4000])
ax.set_zlim([-1000,4000])

plt.show()

# part b
PLS1 = PLS(n_components=3)
number_map = {"M": 0, "B": 1}
numeric_y = np.array([number_map[v] for v in y])  # list comprehension: np.array(map(...)) breaks on Python 3
result = PLS1.fit_transform(x, numeric_y)
X_r = result[0]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for c, i, target_name in zip("rb", target_names, target_names):
    ax.scatter(X_r[y == i, 0], X_r[y == i, 1], X_r[y == i, 2], c=c)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.axis('equal')

plt.show()

validation = data[:100]
test = data[100:200]
train = data[200:]
Code Example #12
def plot_projections(holder,
                     labels,
                     preprocess_lda='PCA',
                     class_name='Antioxidants',
                     only_pca=False,
                     binarize_class=True,
                     standardize=True,
                     cluster=True,
                     return_distances=False):
    '''
    holder should be a dictionary with df's as values and fp-filenames as keys
    labels should be a mapping of DrugCombID: ATC_class
    '''
    if only_pca:
        from sklearn.decomposition import PCA

        df = dict()
        for ind, i in enumerate([
                'fps_e3fp_1024bit', 'fps_morgan_1024bit', 'fps_topo_1024bit',
                'fps_infomax_new', 'fps_VAE_256bit_new', 'fps_VAE_16bit_new',
                'fps_transformer_1024bit_new', 'fps_transformer_64bit_new',
                'fps_gae_64bit_new'
        ]):

            df_cluster = holder[i].copy()
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from mlxtend.preprocessing import standardize as st
                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                df_cluster = st(df_cluster)
            else:
                classes = df_cluster.index.copy()
            pca = PCA(n_components=2)
            temp = pca.fit_transform(df_cluster)
            df[ind] = pd.DataFrame(index=df_cluster.index, data=temp)
            df[ind]['classes'] = classes
            df[ind]['classes'] = df[ind]['classes'].map(labels)
        title = 'PCA'

    else:  # to LDA
        from mlxtend.feature_extraction import LinearDiscriminantAnalysis as LDA
        from sklearn.preprocessing import LabelEncoder
        # binary https://stats.stackexchange.com/questions/178587/why-is-the-rank-of-covariance-matrix-at-most-n-1/180366#180366

        df = dict()
        for ind, i in enumerate([
                'fps_e3fp_1024bit', 'fps_morgan_1024bit', 'fps_topo_1024bit',
                'fps_infomax_new', 'fps_VAE_256bit_new', 'fps_VAE_16bit_new',
                'fps_transformer_1024bit_new', 'fps_transformer_64bit_new',
                'fps_gae_64bit_new'
        ]):

            df_cluster = holder[i].copy()
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from mlxtend.preprocessing import standardize as st
                from sklearn.preprocessing import MinMaxScaler

                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                mms = MinMaxScaler()
                df_cluster = pd.DataFrame(data=mms.fit_transform(df_cluster),
                                          index=df_cluster.index,
                                          columns=df_cluster.columns)
            else:
                classes = df_cluster.index.copy()
            df_cluster['classes'] = classes
            df_cluster['classes'] = df_cluster['classes'].map(labels)
            if binarize_class:
                df_cluster.loc[df_cluster.classes != class_name,
                               'classes'] = 'not ' + class_name

            # change labels from str to int
            enc = LabelEncoder()
            real_classes = df_cluster.loc[:, 'classes']
            df_cluster.loc[:, 'classes'] = enc.fit_transform(
                df_cluster['classes'])
            classes = df_cluster.pop('classes')

            if preprocess_lda == 'PLS':
                from sklearn.cross_decomposition import PLSRegression
                pls = PLSRegression(n_components=10, scale=False)
                temp = pls.fit_transform(df_cluster.values, classes.values)[0]
            elif preprocess_lda == 'PCA':
                from sklearn.decomposition import PCA
                pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'kernelPCA':
                from sklearn.decomposition import KernelPCA
                pca = KernelPCA(kernel="rbf", gamma=5)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'NONE':
                temp = df_cluster.values

            # lda
            lda = LDA(n_discriminants=2)
            lda.fit(temp, classes.values)
            temp = lda.transform(temp)
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    'ignore',
                    'Casting complex values to real discards the imaginary part'
                )
                temp = temp.astype(float)  # in case of complex numbers; np.float was removed from NumPy
            df[ind] = pd.DataFrame(index=df_cluster.index,
                                   columns=[0, 1],
                                   data=temp)
            df[ind]['classes'] = real_classes

        title = 'LDA'

    sns.set_context(context='talk')
    sns.set_style('dark')
    sns.set_style({'font.family': 'serif', 'font.sans-serif': ['Helvetica']})
    fig, ((ax1, ax2, ax3), (ax4, ax5, ax6),
          (ax7, ax8, ax9)) = plt.subplots(3, 3, figsize=(13, 14))
    cm = plt.cm.get_cmap('Spectral')
    my_cmap = cm(np.linspace(0, 1, len(np.unique(df[ind]['classes']))),
                 alpha=0.6)

    if return_distances:
        distances = dict()
        sil_scores = dict()
        chs_scores = dict()
    for ax_n, key, x, name in zip(
        [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9], df.keys(), df.values(),
        [
            'E3FP', 'Morgan_300', 'Topo_1024', 'Infomax', 'VAE_256', 'VAE_16',
            'Trans_1024', 'Trans_64', 'GAE_64'
        ]):
        if not binarize_class:
            for ind, i in enumerate(np.unique(x['classes'])):
                color = my_cmap[ind]
                marker = '.'
                if i == class_name:
                    color = 'black'
                    marker = ','
                ax_n.scatter(
                    x.loc[x.classes == i, 0],
                    x.loc[x.classes == i, 1],
                    marker=marker,
                    label=i +
                    f' (n={str(len(x.loc[x.classes==i, 0]))}) vs Rest ({str(len(x.loc[x.classes!=i, 0]))})',
                    color=color)
                ax_n.title.set_text(name)
        else:
            ax_n.scatter(x.loc[:, 0], x.loc[:, 1], marker='.')
            ax_n.scatter(
                x.loc[x.classes == class_name, 0],
                x.loc[x.classes == class_name, 1],
                marker=',',
                label=class_name +
                f' (n={str(len(x.loc[x.classes==class_name, 0]))}) vs rest (n={str(len(x.loc[x.classes!=class_name, 0]))})',
                color='darkorange')
            ax_n.title.set_text(name)
            if cluster:
                from sklearn.cluster import KMeans
                from scipy.spatial.distance import pdist
                from sklearn.metrics import silhouette_score as sil
                from sklearn.metrics import calinski_harabasz_score as chs

                km = KMeans(init='k-means++', n_clusters=1, n_init=10)
                km.fit(x.loc[x.classes != class_name, [0, 1]])

                km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
                km1.fit(x.loc[x.classes == class_name, [0, 1]])

                ax_n.scatter(km.cluster_centers_[:, 0],
                             km.cluster_centers_[:, 1],
                             marker='X',
                             color='darkblue',
                             s=100,
                             linewidth=3)
                ax_n.scatter(km1.cluster_centers_[:, 0],
                             km1.cluster_centers_[:, 1],
                             marker='X',
                             color='red',
                             s=100,
                             linewidth=3)

                d = round(
                    pdist([km.cluster_centers_[0], km1.cluster_centers_[0]],
                          metric='euclidean')[0], 3)
                d_sc = round(sil(x.loc[:, [0, 1]], x['classes']), 3)
                d_chs = round(chs(x.loc[:, [0, 1]], x['classes']), 3)
                if return_distances:
                    cl_name = class_name + ' ' + name
                    distances[cl_name] = d
                    sil_scores[cl_name] = d_sc
                    chs_scores[cl_name] = d_chs
                name = name + '\n|d:' + str(d) + '|sil:' + str(
                    d_sc) + '|chs:' + str(d_chs)
                ax_n.title.set_text(name)
    for ax in [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]:
        ax.set_xticks([])
        ax.set_yticks([])

    labels = ax_n.get_legend_handles_labels()[1]
    if only_pca:
        fig.suptitle(labels[0] + "\n classified with: " + title)
    else:
        fig.suptitle(labels[0] + "\n classified with: " + title +
                     f', preprocessed with: {preprocess_lda}')
    fig.tight_layout()
    if not return_distances:
        return fig
    else:
        return fig, distances, sil_scores, chs_scores
Code Example #13
def plot_single_projection(holder,
                           labels,
                           class_name='Antioxidants',
                           fp_name='fps_e3fp_1024bit',
                           standardize=True,
                           preprocess_lda='PCA'):
    '''
    holder should be a dictionary with df's as values and fp-filenames as keys
    labels should be a mapping of DrugCombID: ATC_class
    '''

    from mlxtend.preprocessing import standardize as st
    from sklearn.preprocessing import LabelEncoder
    from sklearn.cluster import KMeans
    from mlxtend.feature_extraction import LinearDiscriminantAnalysis  # sklearn's LDA would need a dummy class to yield 2 components after transformation
    from scipy.spatial.distance import pdist

    df_cluster = holder[fp_name].copy()
    df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
    df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]

    if standardize:
        classes = df_cluster.index.copy()
        df_cluster.reset_index(inplace=True, drop=True)
        df_cluster = st(df_cluster)
    else:
        classes = df_cluster.index.copy()
    df_cluster['classes'] = classes  # classes are mapped to the index in the labels dictionary
    df_cluster['classes'] = df_cluster['classes'].map(labels)

    df_cluster.loc[df_cluster.classes != class_name,
                   'classes'] = 'not ' + class_name
    #dummy = [0]*(df_cluster.shape[1]-1) + ['dummy']
    #df_cluster.loc[df_cluster.shape[0]] = dummy

    # change labels from str to int
    enc = LabelEncoder()
    real_classes = df_cluster.loc[:, 'classes']
    df_cluster.loc[:, 'classes'] = enc.fit_transform(df_cluster['classes'])
    classes = df_cluster.pop('classes')

    if preprocess_lda == 'PLS':
        from sklearn.cross_decomposition import PLSRegression
        pls = PLSRegression(n_components=10, scale=False)
        temp = pls.fit_transform(df_cluster.values, classes.values)[0]
    elif preprocess_lda == 'PCA':
        from sklearn.decomposition import PCA
        pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
        temp = pca.fit_transform(df_cluster.values)
    elif preprocess_lda == 'kernelPCA':
        from sklearn.decomposition import KernelPCA
        pca = KernelPCA(kernel="rbf", gamma=5)
        temp = pca.fit_transform(df_cluster.values)
    elif preprocess_lda == 'NONE':
        temp = df_cluster.values
    elif preprocess_lda == 'NCA':
        from sklearn.neighbors import NeighborhoodComponentsAnalysis
        nca = NeighborhoodComponentsAnalysis()
        temp = nca.fit_transform(df_cluster.values, classes.values)

    #lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto')
    #lda.fit(temp, classes.values)
    #temp1 = lda.transform(temp)

    lda = LinearDiscriminantAnalysis(n_discriminants=2)
    lda.fit(temp, classes.values)
    temp = lda.transform(temp)
    with warnings.catch_warnings():
        warnings.filterwarnings(
            'ignore',
            'Casting complex values to real discards the imaginary part')
        temp = temp.astype(float)  # in case of complex numbers

    df = pd.DataFrame(index=df_cluster.index, columns=[0, 1], data=temp)
    df['classes'] = real_classes

    km = KMeans(init='k-means++', n_clusters=1, n_init=10)
    km.fit(df.loc[df.classes != class_name, [0, 1]])

    km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
    km1.fit(df.loc[df.classes == class_name, [0, 1]])

    d = pdist([km.cluster_centers_[0], km1.cluster_centers_[0]])
    d = str(round(d[0], 3))

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(df.loc[df.classes != class_name, 0],
               df.loc[df.classes != class_name, 1],
               marker=',',
               color='grey')
    ax.scatter(df.loc[df.classes == class_name, 0],
               df.loc[df.classes == class_name, 1],
               marker=',',
               color='orange')

    ax.scatter(km.cluster_centers_[:, 0],
               km.cluster_centers_[:, 1],
               marker='X',
               color='green',
               linewidths=30)

    ax.scatter(km1.cluster_centers_[:, 0],
               km1.cluster_centers_[:, 1],
               marker='X',
               color='red',
               linewidths=30)

    fig.suptitle(class_name + ' ' + d)
    return fig
Code Example #14
Axes3D(plt.figure()).scatter(Xiso[:,0],Xiso[:,1], alpha=.3)
#%% t-SNE
tsne = TSNE(n_components=2, n_iter=250)
Xtsne = tsne.fit_transform(X[:500,:200])
Axes3D(plt.figure()).scatter(Xtsne[:,0],Xtsne[:,1], alpha=.3)
#%% PC Regression
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg,
                         X95[:,:10],
                         Y) 
scores.mean()

#%% Partial Least Squares
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=10)
Xpls, Ypls = pls.fit_transform(X,Y)

#%% Visualization with labeling
import ggplot as gg
df1['x1'], df1['x2'] = Xpca[:,0],Xpca[:,1]
chart = gg.ggplot( df1, gg.aes(x='x1', y='x2', color='has_cites') ) \
                  + gg.geom_point(size=10, alpha=.8) 
chart.show()
#%% PLS transformation
df1['x1'], df1['x2'] = Xpls[:,0],Xpls[:,1]
chart = gg.ggplot( df1, gg.aes(x='x1', y='x2', color='has_cites') ) \
                  + gg.geom_point(size=10, alpha=.8) 
chart.show()
#%% Feature Selection with Elastic Net
scaler = StandardScaler()
Xscale = scaler.fit_transform(X)
Code Example #15
def Granger_Causality_Pred(num):
    # loading dataset
    df = pd.read_csv("Y:\\Dropbox\\Dropbox (MIT)\\Robinhood Trading\\Stock Data\\broader_stock.csv")
    df = df.set_index(pd.to_datetime(df['Date']))
    df.drop(['Date'], axis=1, inplace=True)
    pct_df = df.pct_change().shift(1).iloc[2:]

    # set up global variables
    leader_tick_dict = {}
    perf_res = {}
    w_mktre = (1 + pct_df['SPY_Close']).resample('W').prod() - 1

    _ = 0

    # identify leaders
    for tick in pct_df.columns[::3][(num * 100):((num + 1) * 100)]:
        # picking leaders for each stocks
        target_arr = pct_df[tick].dropna()
        w_target = (1 + target_arr).resample('W').prod() - 1
        Y = w_target.shift(-1)
        leader_set = []

        for leader in pct_df.columns[::3]:
            if leader != tick:
                leader_arr = pct_df[leader].dropna()
                w_leader = (1 + leader_arr).resample('W').prod() - 1

                tempreg_dta = pd.concat([Y, w_target, w_mktre, w_leader], axis=1).dropna()
                tempreg_dta.columns = ['Y', 'Y-1', 'Mkt', 'Lead']

                if tempreg_dta.shape[0] >= 36 * 4:
                    ols = sm.OLS(tempreg_dta['Y'].iloc[-36 * 4:],
                                 sm.add_constant(tempreg_dta[['Y-1', 'Mkt', 'Lead']].iloc[-36 * 4:]))
                    res = ols.fit(cov_type='HC0')
                    leader_sig = res.pvalues[3]

                elif tempreg_dta.shape[0] >= 12 * 4:
                    ols = sm.OLS(tempreg_dta['Y'].iloc[-12 * 4:],
                                 sm.add_constant(tempreg_dta[['Y-1', 'Mkt', 'Lead']].iloc[-12 * 4:]))
                    res = ols.fit(cov_type='HC0')
                    leader_sig = res.pvalues[3]

                else:
                    leader_sig = 1

                if leader_sig <= 1e-3:
                    leader_set.append(leader)

        leader_tick_dict[tick] = leader_set

        # evaluate performance
        leader = leader_tick_dict[tick]
        if len(leader) > 1:
            # simple average
            avg_signal = ((1 + pct_df[leader_tick_dict[tick]]).resample('W').prod() - 1).mean(axis=1)
            # only evaluate at short term period
            val_avg = pd.concat([w_target.shift(1).iloc[-12 * 4:], avg_signal.iloc[-12 * 4:]], axis=1).dropna().values
            # metrics
            mu_avg = mean_squared_error(val_avg[:, 0], val_avg[:, 1]) * 100
            acc_avg = accuracy_score((val_avg[:, 0] > 0).astype(int), (val_avg[:, 1] > 0).astype(int))
            perf_res[tick] = [mu_avg, acc_avg]

        _ += 1
        print("{}/100".format(_))

    perf_pls = {}
    N = len(leader_tick_dict.keys())
    count = 0

    for tick in leader_tick_dict.keys():
        leader = leader_tick_dict[tick]
        if len(leader) > 1:
            leader_arr = df[leader_tick_dict[tick]]
            target_arr = pct_df[tick].dropna()
            w_target = (1 + target_arr).resample('W').prod() - 1
            pls_set = []

            for col in leader_arr.columns:
                ind_arr = []
                for t in range(leader_arr.shape[0] - 300, leader_arr.shape[0]):
                    macd = MACD(leader_arr[col], 5, t)
                    booling = BoolingerBands(leader_arr[col], 5, t)
                    volcof = Vol_Coefficient(leader_arr[col], 5, t)
                    anvol = AnnVol(leader_arr[col], 5, t)
                    phl = Price_High_Low(leader_arr[col], 5, t)
                    prev = PriceReverse(leader_arr[col], 5, t)
                    # rsi = RelativeStrengh(leader_arr[col], 5, t)

                    ind_arr.append([macd, booling, volcof, anvol, phl, prev])

                w_X = pd.DataFrame(data=ind_arr, index=leader_arr.index[-300:]).resample('W').mean()
                temp_dta = pd.concat([w_target, w_X], axis=1).dropna().values[-12 * 4:, ]
                pls = PLSRegression(n_components=1)
                pls_x = pls.fit_transform(X=temp_dta[:, 1:], y=temp_dta[:, :1])[0]
                pls_set.append(pls_x)
            pls_X = np.column_stack(pls_set)

            signal = np.mean(pls_X, axis=1)
            actual = w_target.iloc[-49:-1]

            mu_pls = mean_squared_error(actual, signal) * 100
            acc_pls = accuracy_score((actual > 0).astype(int), (signal > 0).astype(int))
            perf_pls[tick] = [mu_pls, acc_pls]

        count += 1
        print("{}/{}".format(count, N))

    avg_res = pd.DataFrame(perf_res).T
    pls_res = pd.DataFrame(perf_pls).T
    ttl_res = pd.concat([pls_res, avg_res], axis=1)
    ttl_res.columns = ['MSE_PLS', 'ACC_PLS', 'MSE_AVG', 'ACC_AVG']
    ttl_res.to_csv('Granger_Causality_Res%s.csv' % num)
Code Example #16

# -----PLS testing--------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    path_to_data = os.path.join(str(Path.home()), 'Deformetrica', 'deterministic_atlas_ct',
                                'output_separate_tmp10_def10_prttpe13_corrected', 'Decomposition')
    data_filename = 'Momenta_Table.csv'

    data, target = load_iris(return_X_y=True)
    data = data[0:80, 0:3]
    target = target[0:80]
    pls = PLSBinaryClassification(dataset_filename=data_filename, dataset_path=path_to_data, X=data, y=target)
    pls.decompose_with_pls(method='da')

    plsr = PLSRegression(3, scale=False)
    x_plsr, y_plsr = plsr.fit_transform(pls.X_centered, pls.y)

    plt.scatter(plsr.x_scores_[pls.y == 1, 0], plsr.x_scores_[pls.y == 1, 1], c='red', marker='d')
    plt.scatter(plsr.x_scores_[pls.y == -1, 0], plsr.x_scores_[pls.y == -1, 1], c='blue', marker='x')
    x = np.linspace(-2, 2, 100)

    print('W:\n {}'.format(pls.W))
    print('xw:\n {}'.format(plsr.x_weights_))
    print('T:\n {}'.format(pls.T))
    print('Xload:\n {}'.format(plsr.x_loadings_.T @ plsr.x_loadings_))
    print('P:\n {}'.format(pls.P))
    print('q:\n {}'.format(pls.q))
    print('----------------')

    print('yload:\n {}'.format(plsr.y_loadings_))
Code Example #17
x_axis = np.arange(1, np.linalg.matrix_rank(X) + 1)
plt.scatter(x_axis, cummulative_variance_explained)
plt.plot(x_axis, cummulative_variance_explained)
plt.title("Scree Plot")
plt.xlabel("Number of latent vectors used")
plt.ylabel("Percentage of variance explained")
plt.xticks(x_axis, x_axis)
plt.yticks()
plt.show()

# compare to sklearn package results to verify accuracy
import numpy as np
np.set_printoptions(threshold=np.inf)
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt

X = [[1, 5, 10], [2, 4, 8], [3, 4, 8], [4, 5, 10]]
y = [41, 49, 69, 65]

X = StandardScaler().fit_transform(X)  # population stdev
y = StandardScaler().fit_transform(np.array(y).reshape(-1, 1))  # population stdev; the scaler needs a 2-D array

pls1 = PLSRegression(n_components=2)
scores = pls1.fit_transform(X, y)
T = pls1.x_scores_
W = pls1.x_weights_
P = pls1.y_loadings_

y_pred = pls1.predict(X)
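To connect the two halves of this check: for the training data the tuple returned by fit_transform matches the fitted score attributes on the X side, while the y-side scores can differ (see the scikit-learn issue cited in Code Example #5). A hedged check, assuming the variables defined just above:

x_scores, y_scores = scores  # fit_transform returned an (x_scores, y_scores) tuple
assert np.allclose(x_scores, pls1.x_scores_)
print(np.allclose(y_scores, pls1.y_scores_))  # may be False; see Code Example #5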
Code Example #18
plt.quiver(u[0,0],u[1,0],color='k',edgecolor='k',lw=1,scale=0.1,figure=fig)
plt.quiver(-u[1,0],u[0,0],color='k',edgecolor='k',lw=1,scale=0.4,figure=fig)

#%% PLS2
lda = LDA()
nComponents = np.arange(1,nFeatures,8)
pls2Scores = np.zeros((2, len(nComponents)))  # np.alen was removed from NumPy
for i,n in enumerate(nComponents):
    pls2 = PLSRegression(n_components=n)
    pls2.fit(dataTrain,Ytrain)
    dataTrainT = pls2.transform(dataTrain)
    dataTestT = pls2.transform(dataTest)
    pls2Scores[:,i] = util.classify(dataTrainT,dataTestT,labelsTrain,labelsTest)

pls2 = PLSRegression(n_components=2)
xtPLS,yt = pls2.fit_transform(dataTrain,Ytrain)

uPLS = pls2.x_weights_

#%% Canonical Correlation Analysis
nComponents = np.arange(1,nClasses+1)
cca = CCA(n_components=nClasses)
cca.fit(dataTrain,Ytrain)
dataTrainT = cca.transform(dataTrain)
dataTestT = cca.transform(dataTest)
ccaScores = np.zeros((2, len(nComponents)))
for i,n in enumerate(nComponents):
    ccaScores[:,i] = util.classify(dataTrainT[:,0:n],dataTestT[:,0:n],labelsTrain,labelsTest)

#%% Linear Discriminant Analysis
nComponents = np.arange(1,nClasses+1)
Code Example #19
File: question2.py Project: MatthewWEdwards/EE-379K-
reduced_college_train_x = pcr_opt.transform(college_train_x)
lrm = LinearRegression()
lrm.fit(reduced_college_train_x, college_train_y)
print "\nPCR RMSE (M = " + str(opt_m) + ")"
print rmse(lrm, reduced_college_test_x, college_test_y)

#%% PLS
from sklearn.cross_decomposition import PLSRegression

pls_components = range(1, 18)

cv_pls = np.array([])
for m in pls_components:
    pls = PLSRegression(n_components=m)
    transformed_college_train_x = pls.fit_transform(college_train_x,
                                                    college_train_y)[0]
    pls_this_rmse = rmse_cv(LinearRegression(), transformed_college_train_x,
                            college_train_y).mean()
    cv_pls = np.append(cv_pls, pls_this_rmse)

min_m = pls_components[np.argmin(cv_pls)]
cv_pls = pd.Series(cv_pls, index=pls_components)
cv_pls.plot(title="PLSRegression Cross Validation")
plt.xlabel("Number of Components (M)")
plt.ylabel("Root Mean Square Error")
if show_plots_flag:
    plt.show()

best_pls = PLSRegression(n_components=min_m)
transformed_college_train_x = best_pls.fit_transform(college_train_x,
                                                     college_train_y)[0]
Code Example #20
plt.figure()
for c, i, target_name in zip("rgb", ["Iris-setosa", "Iris-versicolor", "Iris-virginica"], target_names):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
plt.legend()
plt.title('PCA of IRIS dataset')
plt.axis('equal')
plt.show()

# PLS1
PLS1 = PLS(n_components=2)
X = df.values[:, :4]  # as_matrix() was removed from pandas
y = np.array([number_map[v] for v in df.values[:, 4]])
string_map = {-1.2206555615733703 : "Iris-setosa", 0 : "Iris-versicolor", 1.2206555615733703 : "Iris-virginica"}

result = PLS1.fit_transform(X, y)
# NOTE: this mapping assumes the first y-score column reproduces the float keys above exactly
y = np.array([string_map[v] for v in result[1][:, 0]])
target_names = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
for c, i, target_name in zip("rgb", ["Iris-setosa", "Iris-versicolor", "Iris-virginica"], target_names):
    plt.scatter(result[0][y == i, 0],result[0][y == i, 1], c=c, label=target_name)
plt.legend()
plt.title('PLS1 of IRIS dataset')
plt.axis('equal')

plt.show()

# PLS2
PLS2 = PLS(n_components=2)
X = df.values[:, :4]
y = np.array([number_map[v] for v in df.values[:, 4]])
one_hot_y = np.zeros((len(y),3))
Code Example #21
File: feexp.py Project: santoshchapaneri/tyre-hug
# Make predictions using an SVM with PCA and PLS
pca_error = 0
pls_error = 0
n_folds = 10

svc = LinearSVC()

for train_inds, test_inds in KFold(n_splits=n_folds).split(X):
    X_train, X_test = X[train_inds], X[test_inds]
    y_train, y_test = y[train_inds], y[test_inds]

    # Use PCA and then classify using an SVM
    X_train2 = pca.fit_transform(X_train)
    X_test2 = pca.transform(X_test)

    svc.fit(X_train2, y_train)
    y_pred = svc.predict(X_test2)
    pca_error += zero_one_loss(y_test, y_pred)

    # Use PLS and then classify using an SVM
    X_train2, y_train2 = pls.fit_transform(X_train, y_train)
    X_test2 = pls.transform(X_test)

    svc.fit(X_train2, y_train)
    y_pred = svc.predict(X_test2)
    pls_error += zero_one_loss(y_test, y_pred)

print(pca_error / n_folds)
print(pls_error / n_folds)
Code Example #22
plots = height*width

# fig, axes = plt.subplots(height, width, figsize=(20, 20), sharex=True, sharey=True)

colors = {
    0: "g",
    1: "r"
}

df = pd.read_json(DATA_PATH + "left_shoulder.json")
df_features = pd.DataFrame(df.data.tolist())

plsr = PLSRegression(n_components=2)
X = df_features
y = df["label"]
principal_components = plsr.fit_transform(X, y)
principal_df = pd.DataFrame(data=principal_components[0], columns=["component 1", "component 2"])  # [0] selects x_scores
principal_df = pd.concat([principal_df, df[["label"]]], axis=1)
principal_df = pd.concat([principal_df, df[["id"]]], axis=1)

# axes[i // 2][i % 2].plot(np.cumsum(pca.explained_variance_ratio_))

healthy_id = list(set(principal_df.loc[df["label"] == 0]["id"]))
impaired_id = list(set(principal_df.loc[df["label"] == 1]["id"]))

healthy_ids = np.random.choice(healthy_id, plots//2)
impaired_ids = np.random.choice(impaired_id, plots//2)

all_ids = np.append(healthy_ids, impaired_ids)

# for i in range(len(all_ids)):