Example #1
 def fit(self, X, y):
     #creating a manifold on training data
     self.model = LocallyLinearEmbedding(
         method=self.method,
         n_neighbors=self.n_neighbors,
         n_components=self.n_components,
         reg=self.reg,
         eigen_solver=self.eigen_solver,
         random_state=self.random_state).fit(X, y)
     #determining centroids for given points
     self.centroids = KMeans(n_clusters=self.n_clusters,
                             random_state=self.random_state).fit(
                                 self.model.transform(X))
     labels = self.centroids.predict(self.model.transform(
         X))  # Every point is assigned to a certain cluster.
     #assigning each centroid to the correct cluster
     confusion_m = confusion_matrix(y, labels)
     m = Munkres()
     cost_m = make_cost_matrix(confusion_m)
     target_cluster = m.compute(
         cost_m)  # (target, cluster) assignment pairs.
     #saving mapping for predictions
     self.mapping = {
         cluster: target
         for target, cluster in dict(target_cluster).items()
     }
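A minimal sketch of the matching predict step (not part of the original snippet), assuming the class keeps the model, centroids, and mapping attributes set above and that numpy is imported as np:

 def predict(self, X):
     # Embed new points, assign each to its nearest centroid,
     # then translate cluster ids to class labels via the saved mapping.
     clusters = self.centroids.predict(self.model.transform(X))
     return np.array([self.mapping[c] for c in clusters])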
Example #2
def get_dim_reds_scikit(pct_features):
    n_components = max(int(pct_features * num_features), 1)
    return [
        LinearDiscriminantAnalysis(n_components=n_components),
        TruncatedSVD(n_components=n_components),
        #SparseCoder(n_components=n_components),
        DictionaryLearning(n_components=n_components),
        FactorAnalysis(n_components=n_components),
        SparsePCA(n_components=n_components),
        NMF(n_components=n_components),
        PCA(n_components=n_components),
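        # Note: RandomizedPCA was removed in scikit-learn 0.20; PCA(svd_solver='randomized') is the modern equivalent.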
        RandomizedPCA(n_components=n_components),
        KernelPCA(kernel="linear", n_components=n_components),
        KernelPCA(kernel="poly", n_components=n_components),
        KernelPCA(kernel="rbf", n_components=n_components),
        KernelPCA(kernel="sigmoid", n_components=n_components),
        KernelPCA(kernel="cosine", n_components=n_components),
        Isomap(n_components=n_components),
        LocallyLinearEmbedding(n_components=n_components,
                               eigen_solver='auto',
                               method='standard'),
        LocallyLinearEmbedding(n_neighbors=n_components,
                               n_components=n_components,
                               eigen_solver='auto',
                               method='modified'),
        LocallyLinearEmbedding(n_neighbors=n_components,
                               n_components=n_components,
                               eigen_solver='auto',
                               method='ltsa'),
        SpectralEmbedding(n_components=n_components)
    ]
Example #3
def get_metastable_connections_from_gmm(
        data,
        gmm,
        connection_estimation_method='max_path_distance_diff',
        min_paths=3,
        distance='euclidean',
        low_dimension_distances=True,
        as_graph=False):
    means = gmm.means_
    memberships = gmm.predict(data)
    if connection_estimation_method in [
            'max_path_distance_diff', 'connecting_paths', 'mst'
    ]:
        if low_dimension_distances:
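            # Note: the PCA instance below is created but never used; distances are computed on the LLE embedding.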
            pca = PCA(n_components=2)
            lle = LocallyLinearEmbedding(n_components=2,
                                         n_neighbors=int(0.8 * data.shape[0]))
            distance_matrix = squareform(
                pdist(lle.fit_transform(data), distance))
        else:
            distance_matrix = squareform(pdist(data, distance))
        weighted_graph = nx.Graph(distance_matrix)
    else:
        weighted_graph = None
    return get_metastable_connections(data,
                                      means,
                                      memberships,
                                      method=connection_estimation_method,
                                      weighted_graph=weighted_graph,
                                      min_paths=min_paths,
                                      as_graph=as_graph)
Example #4
 def function(self, data):
     # pylint: disable=not-a-mapping
     lle = LocallyLinearEmbedding(n_neighbors=self.n_neighbors,
                                  n_components=self.n_components,
                                  **self.kwargs)
     emb = lle.fit_transform(data)
     return emb
Example #5
def get_lower_dimensional_projection(cluster_data,
                                     algorithm='tsne',
                                     projection_dim=2):
    if algorithm.lower() == 'tsne':
        tsne_object = TSNE(n_components=projection_dim, random_state=42)
        lower_dimensional_projected_data = tsne_object.fit_transform(
            cluster_data)
        return lower_dimensional_projected_data
    elif algorithm.lower() == 'pca':
        pca_object = PCA(n_components=projection_dim,
                         random_state=42,
                         copy=False)
        lower_dimensional_projected_data = pca_object.fit_transform(
            cluster_data)
        return lower_dimensional_projected_data
    elif algorithm.lower() == "mds":
        mds_object = MDS(n_components=projection_dim, random_state=42)
        lower_dimensional_projected_data = mds_object.fit_transform(
            cluster_data)
        return lower_dimensional_projected_data
    else:
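        # Any algorithm name other than 'tsne', 'pca', or 'mds' falls back to LLE.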
        lle_object = LocallyLinearEmbedding(n_components=projection_dim,
                                            random_state=42)
        lower_dimensional_projected_data = lle_object.fit_transform(
            cluster_data)
        return lower_dimensional_projected_data
Example #6
def lle(space):

    n_neighbors = int(space['n_neighbors'])
    method = space['method']

    vertices, colors = get_all_vertices_dk_atlas_w_colors()
    print(space)

    lle = LLE(n_neighbors=n_neighbors, n_components=2, method=method, neighbors_algorithm='auto')
    lle_xy = lle.fit_transform(vertices)

    centers = get_centers_of_rois_xy(lle_xy)

    avg_distance = avg_distance_between_center_of_masses(centers)

    model_name = 'lle_{}_{}'.format(method, avg_distance)

    result = {
        'loss': -avg_distance,
        'space': space,
        'status': STATUS_OK
    }

    save_json_result(model_name, result)
    save_2d_roi_map(lle_xy, colors, centers, model_name)

    return result
Example #7
def applyLlleWithStandardisation(data, n_components=None):
    X = preprocessing.scale(data)

    lle = LocallyLinearEmbedding(n_components=n_components,
                                 eigen_solver="auto")

    return lle.fit_transform(X)
Example #8
def evaluate_embeddings(D, labels):

    estimators = [
        KMeans(init='k-means++', n_clusters=5, n_init=10)
    ]  #,AgglomerativeClustering(n_clusters=5),AgglomerativeClustering(n_clusters=5,linkage='average')]
    est_names = [
        'KMeans'
    ]  #,'wardAgglomerativeClustering','avgAgglomerativeClustering']
    for e in range(len(estimators)):
        print('!!----------------------------------!!')
        print(est_names[e])
        estim = estimators[e]
        for i in range(2, 6 + 1):

            print('--------------------------------------')
            print('#dim = ' + str(i))

            model_t = TSNE(n_components=i,
                           learning_rate=100,
                           perplexity=10,
                           method='exact')
            x = model_t.fit_transform(D)
            bench_k_means(estim, name="tsne", data=x, labels=labels)

            model_i = Isomap(n_components=i)
            x = model_i.fit_transform(D)
            bench_k_means(estim, name="isomap", data=x, labels=labels)

            model_l = LocallyLinearEmbedding(n_components=i)
            x = model_l.fit_transform(D)
            bench_k_means(estim, name="lle", data=x, labels=labels)
Example #9
def finalData(algo,lb,dm):

    Page3_Util.a.maindata.make_XY(lb)
    pca = LocallyLinearEmbedding(n_components=dm)
    if algo=='PCA':
        pca = PCA(n_components=dm)
    elif algo=='Linear Embading':
        pca = LocallyLinearEmbedding(n_components=dm)
    elif algo== 'Isomap':
        pca = Isomap(n_components=dm)
    elif algo== 'MDS':
        pca = MDS(n_components=dm)
    elif algo== 'SpectralEmbedding':
        pca = SE(n_components=dm)
    else:
        if dm == Page3_Util.a.maindata.N_features:
            dm = dm - 1
        pca = TSNE(n_components=dm)

    principalComponents = pca.fit_transform(Page3_Util.a.maindata.X)
    principalDf = pd.DataFrame(data=principalComponents,
                               columns=["D{}".format(i) for i in range(dm)])
    finalDf = pd.concat([principalDf, Page3_Util.a.maindata.df[[lb]]], axis=1)
    csv_string = finalDf.to_csv(index=False, encoding='utf-8')
    csv_string = "data:text/csv;charset=utf-8," + urllib.parse.quote(csv_string)
    return csv_string
Example #10
 def initial_embed(self, reduce, d):
     reduce = reduce.lower()
     assert reduce in ['isomap', 'ltsa', 'mds', 'lle', 'se', 'pca', 'none']
     if reduce == 'isomap':
         from sklearn.manifold import Isomap
         embed = Isomap(n_components=d)
     elif reduce == 'ltsa':
         from sklearn.manifold import LocallyLinearEmbedding
         embed = LocallyLinearEmbedding(n_components=d,
                                        n_neighbors=5,
                                        method='ltsa')
     elif reduce == 'mds':
         from sklearn.manifold import MDS
         embed = MDS(n_components=d, metric=False)
     elif reduce == 'lle':
         from sklearn.manifold import LocallyLinearEmbedding
         embed = LocallyLinearEmbedding(n_components=d,
                                        n_neighbors=5,
                                        eigen_solver='dense')
     elif reduce == 'se':
         from sklearn.manifold import SpectralEmbedding
         embed = SpectralEmbedding(n_components=d)
     elif reduce == 'pca':
         from sklearn.decomposition import PCA
         embed = PCA(n_components=d)
     if reduce == 'none':
         self.embed = lambda x: x
     else:
         self.embed = lambda x: embed.fit_transform(x)
Example #11
def LLE_plot(data):
    """
    This function print and plots the result of LLE(Local Linear Embedding) algorithm.
    """
    print("Computing LLE embedding")
    t1 = time()
    for n in range(1, 50):
        plt.figure(figsize=(16,9))
        n_neighbors = n
        print("n_neighbors = %d"%n_neighbors)
        for i in range(10):

            condition = data['label'] == i
            subset_data = data[condition]

            clf = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=2, method='standard', eigen_solver='dense')
            t0 = time()
            X_lle = clf.fit_transform(subset_data)

            print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
            print("Locally Linear Embedding of the digits (time %.2fs)" %(time() - t0))
            plt.scatter(X_lle[:, 0], X_lle[:, 1], cmap=plt.cm.hot, s=2, label='digit %d'%i)

        plt.ylim([-0.1, 0.1])
        plt.xlim([-0.2, 0.2])
        plt.legend()
        plt.grid()
        plt.savefig("./img/n-neighbor=%d.png"%n_neighbors, dpi=300)

    print("totally consumed time : (%.2fs)" %(time() - t1))
Example #12
def data_lle_preprocessing(data, feature_columns):
    data = data.dropna()
    sc = preprocessing.StandardScaler()
    data[feature_columns] = sc.fit_transform(data[feature_columns])
    lle = LocallyLinearEmbedding(n_components=4)
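    # Assumes len(feature_columns) == 5: the four embedding dimensions overwrite the first four feature columns.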
    data[feature_columns[:-1]] = lle.fit_transform(data[feature_columns])
    return data, feature_columns[:-1]
Example #13
def wrap_lle(x, required_d, neighbors):
    # Reduce the input x to required_d dimensions with LLE and cache the embedding as a .npy file for reuse.
    lle = LocallyLinearEmbedding(n_components=required_d,
                                 n_neighbors=neighbors)
    lle.fit(x)
    x_lle = lle.embedding_
    np.save('LLE/np_x_LLE_' + str(required_d) + str(neighbors), x_lle)
    return x_lle
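A hypothetical usage line, assuming an array x and an existing LLE/ directory (np.save appends the .npy extension automatically):

x_lle = wrap_lle(x, required_d=3, neighbors=10)  # cached as LLE/np_x_LLE_310.npy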
Example #14
def score_lle(x_train, y_train, x_test, y_test):
    lle = LocallyLinearEmbedding(n_neighbors=5, n_components=4)
    x_train = lle.fit_transform(x_train)
    x_test = lle.transform(x_test)  # project with the fitted embedding; refitting would yield an incompatible coordinate frame
    nb = GaussianNB()
    nb.fit(x_train, y_train)
    y_pred = nb.predict(x_test)
    return accuracy_score(y_pred, y_test)
Example #15
def lle(x):
    """
    Useful link:
    https://stackoverflow.com/questions/42275922/setting-the-parameters-of-locally-linear-embedding-lle-method-in-scikit-learn
    """
    embedding = LocallyLinearEmbedding(n_components=2)  # 2D projection
    x_transformed = embedding.fit_transform(x)
    return embedding, x_transformed
Example #16
    def locally_linear_embedding(self,
                                 n_neighbors=5,
                                 n_components=3,
                                 reg=1e-3,
                                 eigen_solver='auto',
                                 tol=1e-6,
                                 max_iter=100,
                                 method='standard',
                                 hessian_tol=1E-4,
                                 modified_tol=1E-12,
                                 neighbors_algorithm='auto',
                                 random_state=None,
                                 n_jobs=None):
        """Computes the locally linear embedding of x_data.

        Args:
            n_neighbors: An integer, which is the number of neighbors
                         considered for each point
            n_components: An integer, which is the number of coordinates
                          for the manifold
            reg: A float, which is the regularization constant
            eigen_solver: A string ('auto', 'arpack', 'dense'),
                          which is solver for the problem
            tol: A float, which is the convergence tolerance for
                 eigen solvers (arpack)
            max_iter: An integer, which is the max number of iteration
                      for the arpack solver
            method: A string ('standard', 'hessian', 'modified', 'ltsa'),
                    which is the embedding algorithm
            hessian_tol: A float, which is the tolerance for Hessian method
            modified_tol: A float, which is the tolerance for the modified LLE method
            neighbors_algorithm: A string ('auto', 'brute',
                                 'kd_tree', 'ball_tree'), which is the
                                 algorithm for nearest neighbors search
            random_state: An integer, which is a seed for random number
                          generator
            n_jobs: An integer (-1 all), which is the number of parallel
                    jobs to run

        Returns:
            A numpy ndarray, which has a shape like
            (length of x_data, n_components)
        """
        x_data = self.x_data.reshape(
            (self.x_data.shape[0], np.prod(self.x_data.shape[1:])))
        lle = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                     n_components=n_components,
                                     reg=reg,
                                     eigen_solver=eigen_solver,
                                     tol=tol,
                                     max_iter=max_iter,
                                     method=method,
                                     hessian_tol=hessian_tol,
                                     modified_tol=modified_tol,
                                     neighbors_algorithm=neighbors_algorithm,
                                     random_state=random_state,
                                     n_jobs=n_jobs)
        return lle.fit_transform(x_data)
Example #17
def LLE_dr(data_set, com_dimentions):
    
    print("Dimentions of Dataset before LLE: ", data_set.shape)
    
    lle = LocallyLinearEmbedding(n_components=com_dimentions)
    data_transform = lle.fit_transform(data_set)  # Fit the model with X and Apply dimensionality reduction to X.
    
    print("Dimentions of Dataset after LLE: ", data_transform.shape)
    return data_transform
Example #18
def KLLE(feature, dim):
    print("dim:", dim)
    t = time.time()
    lle = LocallyLinearEmbedding(n_components=dim,
                                 n_jobs=4,
                                 neighbors_algorithm='ball_tree')
    feature_ = lle.fit_transform(feature)
    np.save('LLE/feature_' + str(dim), feature_)
    print("time:", time.time() - t)
Example #19
 def fit_transform(self, X, n_components=2):
     if self.method == 'pca':
         self.compresser = PCA(n_components=n_components)
     elif self.method == 'tsne':
         self.compresser = TSNE(n_components=n_components, verbose=1)
     elif self.method == 'lle':
         self.compresser = LocallyLinearEmbedding(n_components=n_components,
                                                  n_jobs=4)
     return self.compresser.fit_transform(X)
Example #20
def LTSA(X, labels, imgs, n_neighbors, **kwargs):
    # LTSA embedding of the dataset dataset
    print("Computing LTSA embedding")
    clf = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=2, method='ltsa')
    t = time()
    X_ltsa = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_ltsa, labels, imgs,
                   "LTSA of the dataset (time %.2fs)" % (time() - t), **kwargs)
Example #21
def lle(numComponents, neighbors=5, hessian=False):
    # if there's time we can try changing n_neighbors
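    # (note: method='hessian' requires n_neighbors > n_components * (n_components + 3) / 2,
    #  so the default neighbors=5 is too small for numComponents >= 2)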
    if hessian:
        return LocallyLinearEmbedding(n_neighbors=neighbors,
                                      n_components=numComponents,
                                      method='hessian')
    else:
        return LocallyLinearEmbedding(n_neighbors=neighbors,
                                      n_components=numComponents)
Example #22
def draw_lle(matrix, spiral_density, layer_distance, k):
    embedding = LocallyLinearEmbedding(n_components=2)
    lle = embedding.fit_transform(matrix)
    plt.clf()
    plt.scatter(lle[:, 0], lle[:, 1])
    title = "lle_spiral_density={0:.2f}_layer_distance={1:.2f}_k={2:.2f}.png".format(
        spiral_density, layer_distance, k)
    plt.title(title)
    plt.savefig(title)
Example #23
 def __init__(self, method):
     assert method in ['pca', 'tsne', 'lle']
     self.method = method
     if self.method == 'pca':
         self.compresser = PCA(n_components=2)
     elif self.method == 'tsne':
         self.compresser = TSNE(n_components=2, verbose=1)
     elif self.method == 'lle':
         self.compresser = LocallyLinearEmbedding(n_components=2)
Example #24
def data_transform(train, test):
    pca = LocallyLinearEmbedding(n_components=80, n_neighbors=60)
    train_tran = pca.fit_transform(train[:, :-1])
    test_tran = pca.transform(test[:, :-1])
    train_cat = np.hstack((train_tran, train[:, -1].reshape((-1, 1))))
    test_cat = np.hstack((test_tran, test[:, -1].reshape((-1, 1))))
    #print("explained variance ratio: %s" % str(pca.lambdas_))
    pass
    return train_cat, test_cat
Example #25
def my_lle(X, y=None, l1=.1, n_components=2, **kwargs):
    rrfs = RRFS(X.shape[1], hidden=n_components)
    model = LocallyLinearEmbedding(n_components=n_components)
    codes = model.fit_transform(X)
    codes = (codes - np.min(codes)) / (np.max(codes) - np.min(codes))
    #rrfs.train_representation_network(x_train, name=dataset+'_rep.hd5', epochs=1000)
    score = rrfs.train_fs_network(X, rep=codes, l1=l1, epochs=300, loss='mse')
    # rank the features by score in descending order
    idx = np.argsort(score)[::-1]
    return idx
Example #26
 def embed_lle(train, test, nn=10, method='standard'):
     traintest = np.concatenate((train, test))
     from sklearn.manifold import LocallyLinearEmbedding
     lle = LocallyLinearEmbedding(n_neighbors=nn,
                                  n_components=2,
                                  method=method)
     lle.fit(traintest)
     X2d = lle.transform(traintest)
     X2d = MinMaxScaler().fit_transform(X2d)
     return X2d[:train.shape[0]], X2d[train.shape[0]:]
Example #27
 def runLLE_KMeans(self):
     """
     Run sklearn-LocallyLinearEmbedding to reduce the dimensionality of the data
     Cluster embedding with K-Means
     """
     lle = LocallyLinearEmbedding(n_components=2)
     self.dlle = lle.fit_transform(self.dataset)
     self.kmeansLLE = KMeans(n_clusters=self.n_clusters,
                             random_state=0).fit_predict(self.dlle)
     return self.dlle, self.kmeansLLE
Example #28
def lle(data, d, k):
    '''
    input:  data (ndarray): the data to reduce
            d (int): target dimensionality
            k (int): number of neighbors per local patch
    output: Z (ndarray): the reduced data
    '''
    lle = LocallyLinearEmbedding(n_components=d, n_neighbors=k)
    Z = lle.fit_transform(data)
    return Z
Example #29
def nn_check(ppd):
    for i in range(8, 26):
        lle = LLE(n_components=3,
                  n_neighbors=i,
                  method='modified',
                  modified_tol=1e-12)
        XT = lle.fit_transform(ppd)
        print('running')
        validity(XT, i)
    print('done')
Example #30
def __manifold_lle(pc, outcome, dim=2):
    """Fit Locally Linear Embedding.
    :return: DataFrame of covariates"""
    lle = LocallyLinearEmbedding(n_components=dim,
                                 n_jobs=-1,
                                 method='standard')
    lle_out = lle.fit_transform(pc)

    df_lle_out = pd.DataFrame(lle_out, columns=["D1", "D2"])
    return df_lle_out
Example #31
def pseudotimes_from_embedding(data_array, n_neighbors=None):
    if n_neighbors is None:
        n_neighbors = int(data_array.shape[0] * 0.5)
    embedding = LocallyLinearEmbedding(n_components=1, n_neighbors=n_neighbors)
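    # Denoise via a rank-2 SVD reconstruction before extracting the 1-D pseudotime.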
    u, s, v = np.linalg.svd(data_array, full_matrices=True)
    l = 2
    denoised_data_array = np.dot(u[:, :l], np.dot(np.diag(s[:l]), v[:l, :]))
    pseudotimes = embedding.fit_transform(denoised_data_array)

    pseudotimes -= pseudotimes.min()
    pseudotimes /= pseudotimes.max()
    return pseudotimes
Example #32
class SemiSupervisedGradientBoosting:
    def __init__(self, max_depth=3, n_estimators=10, learning_rate=0.1,
                 min_samples_leaf=4, n_neighbors=5, n_components=2):
        self.GB = GradientBoosting.GradientBoosting(max_depth, n_estimators,
                                   learning_rate, min_samples_leaf)
        self.Transformator = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                                    n_components=n_components)
        
    def fit_predict(self, Xl, y, Xu):
        print('start collapse space')
        delimeter = Xl.shape[0]        
        X_all = np.vstack((Xl, Xu))
        X_all = self.Transformator.fit_transform(X_all)
        X_l_t = X_all[:delimeter]
        X_u_t = X_all[delimeter:]
        del X_all
        print('start computing similarity')
        Sim = GradientBoosting.Simalirity(X_l_t, X_u_t)
        print('finished computing similarity')
        del X_l_t, X_u_t        
        #Xl = X_all[:delimeter]
        #Xu = X_all[delimeter:]
        print('finished collapsing space successfully')
        return self.GB.fit_predict(Xl, y, Xu, Sim)
        
    def predict(self, X):
        return self.GB.predict(X)

    def score(self, X, y):
        return self.GB.score(X, y)
Example #34
import numpy as np
import pylab as pl
from sklearn.manifold import LocallyLinearEmbedding
from astroML.datasets import fetch_sdss_specgals
from astroML.datasets import fetch_sdss_spectrum

data = fetch_sdss_specgals()
print(data.dtype.names)
ngals = 326
nwavel = 3855
plates = data['plate'][:ngals]
mjds = data['mjd'][:ngals]
fiberIDs = data['fiberID'][:ngals]
h_alpha = data['h_alpha_flux'][:ngals]
bptclass = data['bptclass'][:ngals]
specdata = np.zeros((ngals, nwavel))

i = 0
for plate, mjd, fiberID in zip(plates, mjds, fiberIDs):
    tempdata = fetch_sdss_spectrum(plate, mjd, fiberID)
    specdata[i, :] = tempdata.spectrum/tempdata.spectrum.mean()
    i += 1

# Apply LLE
k = 7
for fignum, n in enumerate([2, 3]):
    lle = LocallyLinearEmbedding(n_neighbors=k, n_components=n)
    lle.fit(specdata)
    proj = lle.transform(specdata)
    pl.subplot(2, 1, fignum+1)
    pl.scatter(proj[:,0], proj[:,1], c=bptclass, s=50)
pl.colorbar()
pl.show()
Example #35
def main():
    
    parser = argparse.ArgumentParser(description=
                                'Perform Dimensionality Reduction')
    parser.add_argument('--alg', type=str, default='MLLE',
        help='Algorithm to reduce dimensionality.')
    parser.add_argument('catalog', type=str,
        help='Specify the catalog on which to perform DimReduce.')
    args = parser.parse_args()

    #dat = Table.read('catalogs/ZEST_catalog_colors.fits')
    #training_sample = dat[0:10000]
    #testing_sample = dat[10001:20000]
    #zkeys = ['cc', 'aa', 'm20', 'gg']

    base = os.path.basename(args.catalog)
    filename = os.path.splitext(base)[0]

    dat = Table.read(args.catalog)
    mkeys = ['elipt', 'C', 'A_1a', 'G', 'M20']#

    #dat.remove_column('color')
    if 'color' not in dat.colnames:
        if 'kaggle' in sample:
            dat = prep_catalog.color_data2(dat, 'gz2class')
        if 'direct' in sample:
            dat = prep_catalog.color_data(dat, 'zclass')
        dat.write(args.catalog, overwrite=True)

    #dat = prep_catalog.adjust_asym(dat, mkeys[2])
    #train, traincols, targets = prep_catalog.whiten_data(dat, mkeys)

    n_neighbors = [10,12,15,20]
    #n_neighbors = [7]
    n_components = 3

    for i, n_neigh in enumerate(n_neighbors):
        
        if args.alg in ['MLLE', 'LLE', 'LTSA', 'HLLE']:
            if args.alg == 'MLLE':
                method = 'modified'
            elif args.alg == 'LLE':
                method = 'standard'
            elif args.alg == 'LTSA':
                method = 'ltsa'
            elif args.alg == 'HLLE':
                method = 'hessian'
                           
            #replace_panoptes(dat)
            #pdb.set_trace()
            #sample = 'directbig_panoptes'

            X, y = prep_catalog.whiten_data(dat, mkeys)

            (dat1, dat2),(thing1,thing2) = split_samples(dat, dat,[0.75, 0.35], 
                                                       random_state=0)
            
            (X_train, X_test), (y_train, y_test) = split_samples(X, y, 
                                                [0.75, 0.35], random_state=0)

            y_train = simplify_classlabels(y_train)
            y_test = simplify_classlabels(y_test)

            #filename = 'modified_7_directbig_new'

            X_train = X
            y_train = simplify_classlabels(y)

            #'''
            #sample ='direct_zcut'

            #Y_train, Y_test = open_previous_LLE(filename)

            #cut = np.where(X1['REDSHIFT'] <= 0.05)
            #X1_cut = X1[cut]
            #QC_plots(X1_cut)
            #Y_train = np.array(Y_train)[cut]
            #col_train = np.array(col_train)[cut]
            #X = Table(X)
            #cut_out_mixedup_region(X, np.array(Y_train))

            #'''
            print "performing "+method+" LLE with",n_neigh,\
                "nearest neighbors"
            print "on training sample of",len(X_train),"objects"

            t0 = time()
            A = LLE(n_neighbors=n_neigh, n_components=n_components, eigen_solver='auto', method=method)
            error = A.fit(X_train).reconstruction_error_
            
            Y_train = A.fit_transform(X_train)
            Y_test = A.transform(X_train)
            t1 = time()
            #'''        

            metadata = {'method':method, 'N':n_neigh, 'd':n_components, 
                        'error':error, 'time':t1-t0, 'sample':filename+'_total'}
            save_dimreduce(dat, Y_train, y_train, metadata, filename+'_total')

            #metadata = {'method':method, 'N':n_neigh, 'd':n_components, 
            #            'error':error, 'time':t1-t0, 'sample':filename+'_test'}
            #save_dimreduce(X2, Y_test, y_test, metadata, filename+'_test')

            # plot in 3D
            plot_dimreduce_3D(Y_train, y_train[:,1], Y_test, y_test[:,1], 
                              method, n_neigh, error, t1-t0, filename, two=False)

        #====================================================================#

        elif args.alg == 'ISO':
            method='IsoMap'
                
            print "performing IsoMap with",n_neigh,"nearest neighbors"
            print "on training sample of",len(dat),"objects"
            
            t0 = time()
            A = Isomap(n_neighbors=n_neigh, n_components=n_components, eigen_solver='dense')
            error = A.fit(train).reconstruction_error()
            
            Y = A.fit_transform(train)
            #Y2 = A.transform(test)
            
            t1 = time()
            print "%s: %.2g sec" %(args.alg, t1-t0)
            print "reconstruction error: ", error
            
            print "begin plotting"
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=0)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=1)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=2)
            plot_dimreduce_3D(Y, traincols, Y, traincols, method, 
                              n_neigh, (t1-t0), error, sample)
            
        elif args.alg == 'LDA':
            
            print "performing LDA"
            
            X, Xc, y = prep_catalog.whiten_data(dat, mkeys)

            (X_train, X_test), (y_train, y_test) = split_samples(X, y, 
                                                [0.75, 0.25], random_state=0)

            DRclf = LDA(3, priors=None)
            #DRclf.fit(X_train, y_train)
            DRtrain = DRclf.fit(X_train, y_train).transform(X_train)
            DRtest = DRclf.fit(X_train, y_train).transform(X_test)

            classes = np.unique(y_train)
            colors = np.array(['darkred', 'red', 'lightsalmon', 
                               'darkgreen', 'lightgreen', 'lightseagreen', 
                               'indigo', 'darkviolet', 'plum'])
            plot_LDA_3D(DRtrain, y_train, classes, colors, sample)

            pdb.set_trace()

            #classifiers = []
            #predictions = []
            #Nparams = np.arange(1, X.shape[1]+1)
            #for nc in Nparams:
            clf = LDA()
            clf.fit(DRtrain, y_train)
            y_pred = clf.predict(DRtest)
            
            matchesLDA = (y_pred == y_test)
            print(np.sum(matchesLDA))

            pdb.set_trace()

            #------------------------------------------

            from sklearn.neighbors import KNeighborsClassifier
            knc = KNeighborsClassifier(5)
            knc.fit(DRtrain, y_train)
            y_pred = knc.predict(DRtest)

            matchesKNN = (y_pred == y_test)
            print(np.sum(matchesKNN))

            pdb.set_trace()
            #------------------------------------------

            from astroML.classification import GMMBayes
            gmmb = GMMBayes(9)
            gmmb.fit(DRtrain, y_train)
            y_pred = gmmb.predict(DRtest)

            matchesGMMB = (y_pred == y_test)
            print(np.sum(matchesGMMB))

            pdb.set_trace()
            #------------------------------------------

            # plot the results
            fig = plt.figure(figsize=(5, 2.5))
            fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0,
                                left=0.1, right=0.95, wspace=0.2)

            # left plot: data and decision boundary
            ax = fig.add_subplot(121)
            pdb.set_trace()
            im = ax.scatter(X[:, 3], X[:, 4], color=Xc, cmap=plt.cm.Spectral, 
                            s=4, lw=0) #cmap=plt.cm.binary,, zorder=2
            im.set_clim(-0.5, 1)
            
            #im = ax.imshow(Z, origin='lower', aspect='auto',
            #               cmap=plt.cm.binary, zorder=1,
            #               extent=xlim + ylim)
            #im.set_clim(0, 1.5)
            
            #ax.contour(xx, yy, Z, [0.5], colors='k')
            
            #ax.set_xlim(xlim)
            #ax.set_ylim(ylim)
            
            ax.set_xlabel('$G$')
            ax.set_ylabel('$M20$')

            #pred, true = classification_loss(predictions, y_test)
            #completeness, contamination = completeness_contamination(pred, true)

            pdb.set_trace()


            #'''
            #t0 = time()
            #A = LDA(n_components, priors=None)
            #Y = A.fit_transform(train, targets)
            #Y2 = A.fit(train, targets).transform(train)
                
            #t1 = time()
            #print "%s: %.2g sec" %(args.alg, t1-t0)
            
            predict = A.predict(train)
            #print "Predicted classes:", predict
            #pdb.set_trace()
            

            #pdb.set_trace()
            #'''
            
            plot_LDA_3D(Y2, targets, classes, colors, sample)
            plot_LDA(Y2, targets, classes, colors, sample, axis=0)
            plot_LDA(Y2, targets, classes, colors, sample, axis=1)
            plot_LDA(Y2, targets, classes, colors, sample, axis=2)
            
            pdb.set_trace()
Example #36
    dic_cl[st_name][5] += int(fields[6].lower().strip().strip('"'))
    dic_cl[st_name][6] += int(fields[7].lower().strip().strip('"'))
f.close()


import numpy as np

N = len(dic_cl.items())
X = np.zeros((N, 7))
for i, (key, val) in enumerate(dic_cl.items()):
    X[i, :] = dic_cl[key]

from sklearn.manifold import LocallyLinearEmbedding
from sklearn.preprocessing import scale

lle = LocallyLinearEmbedding(n_components=3, n_neighbors=20)
print(X.max(axis=0))
Y3 = lle.fit_transform(scale(X))
Y3 -= Y3.min(axis=0)

print(len(dic_cl))
lle = LocallyLinearEmbedding(n_components=1, n_neighbors=20)
Y1 = lle.fit_transform(X)
Y1 -= Y1.min()

o1 = open("1-d.csv", "w")
o3 = open("3-d.csv", "w")
for i, (key, val) in enumerate(dic_cl.items()):
    o1.write("%s,%f\n" % (key, Y1[i - 1]))
    o3.write("%s,%s\n" % (key, ",".join(map(str, Y3[i - 1, :]))))
o1.close()
Example #37
    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features

# nFeatures = np.arange(50, 1000, 50)
nLocally_Linear = np.arange(20, 200, 20)

data = {}

for k in nLocally_Linear:

    features, labels, vectorizer, selector, le, features_data = preprocess("pkl/article_2_people.pkl", "pkl/lable_2_people.pkl")
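    # Note: sklearn.cross_validation was removed in scikit-learn 0.20; sklearn.model_selection.train_test_split is the modern equivalent.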
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.1, random_state=42)

    t0 = time()
    ll = LocallyLinearEmbedding(n_neighbors=15, n_components=k, eigen_solver='auto')
    ll.fit(features_train)
    print ("Dimension Reduction time:", round(time()-t0, 3), "s")


    features_train = ll.transform(features_train)
    features_test = ll.transform(features_test)

    for name, clf in [
        ('AdaBoostClassifier', AdaBoostClassifier(algorithm='SAMME.R')),
        ('BernoulliNB', BernoulliNB(alpha=1)),
        ('GaussianNB', GaussianNB()),
        ('DecisionTreeClassifier', DecisionTreeClassifier(min_samples_split=100)),
        ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=50, algorithm='ball_tree')),
        ('RandomForestClassifier', RandomForestClassifier(min_samples_split=100)),
        ('SVC', SVC(kernel='linear', C=1))
Example #38
iso = Isomap(n_components=3, n_neighbors=15)
fdata = iso.fit_transform(digits["data"])
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")

plt.scatter(fdata[:, 0], fdata[:, 1], zs=fdata[:, 2], c=digits["target"], s=100)

plt.show()


# LLE

from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_neighbors=15, n_components=3, method="modified")
fig = plt.figure()
fdata = lle.fit_transform(digits["data"])
ax = fig.add_subplot(111, projection="3d")

plt.scatter(fdata[:, 0], fdata[:, 1], zs=fdata[:, 2], c=digits["target"], s=100)

plt.show()

# MDS

from sklearn.manifold import MDS

mds = MDS(n_components=3)
fig = plt.figure()
fdata = mds.fit_transform(digits["data"])
Example #39
from sklearn.manifold import LocallyLinearEmbedding
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

tetra_freq = np.load('tetrafreq.npy')
phylum_index = np.load('phylumIndex.npy')
phylum_names = np.load('phylumNames.npy')

lle = LocallyLinearEmbedding(n_components=2)
lle_result = lle.fit_transform(tetra_freq)

plt.figure()
for c, i, name in zip ("bgrcmykw", list(range(7, -1, -1)), phylum_names):
    plt.scatter(lle_result[phylum_index == i, 0], lle_result[phylum_index == i, 1], c=c, label=name)
plt.title('LLE of tetranucleotide')
plt.legend(loc=3, fontsize=10)
plt.savefig('LLE.png')


Example #40
     reducedImages = pca.fit_transform(trimmedImages)
 elif sys.argv[1] == '-isomap':
     trimmedImages = []
     for i in range(len(images)):
         images[i] = np.reshape(images[i], (-1))
         images[i] = images[i][:minSize]
         trimmedImages.append(images[i])
     isomap = Isomap(n_components=136)
     reducedImages = isomap.fit_transform(trimmedImages)
 elif sys.argv[1] == '-lle':
     trimmedImages = []
     for i in range(len(images)):
         images[i] = np.reshape(images[i], (-1))
         images[i] = images[i][:minSize]
         trimmedImages.append(images[i])
     lle = LocallyLinearEmbedding(n_components=136)
     reducedImages = lle.fit_transform(trimmedImages)
 
 # Do cross-fold validation 
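 # Note: KFold(n, n_folds=...) is the pre-0.18 scikit-learn API; newer versions use KFold(n_splits=2).split(images).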
 kf = KFold(len(images), n_folds=2)
 minAreas = {}
 maxAreas = {}
 avgAreas = {}
 totals = {}
 for train_index, test_index in kf:        
     xTrain = reducedImages[train_index]
     yTrain = labels[train_index]
     clf = OneVsRestClassifier(LinearSVC(), 4)
     clf.fit(xTrain, yTrain)
     xTest = reducedImages[test_index]
     yTest = labels[test_index]
Example #41
def localLinearEmbedding(X, y):
	lle = LocallyLinearEmbedding(n_components=1, eigen_solver="dense")
	lle.fit(X)
	transformX = lle.transform(X)
	return transformX
Example #42
def plot2d(X, y, scale=True, normalize=False, embedding='pca', title=''):
	"""
	Plot data transformed into two dimensions by PCA. 
	PCA transforms into a new embedding dimension such that 
	the first dimension contains the maximal variance and following 
	dimensions maximal remaining variance. 
	This shoudl spread the observed n-dimensional data maximal. This 
	is unsupervised and will not consider target values. 
	"""
	if (scale): 
		scaler = StandardScaler()
		X = scaler.fit_transform(X)

	if (normalize): 
		normalizer = Normalizer(norm='l2')
		X = normalizer.fit_transform(X)
		
	if (embedding == 'pca'):  # '==' not 'is': string identity comparison is unreliable
		pca = PCA(n_components=2)
		X_transformed = pca.fit_transform(X)
	elif (embedding == 'isomap'):
		isomap = Isomap(n_components=2, n_neighbors=20)
		X_transformed = isomap.fit_transform(X)
	elif (embedding == 'lle'):
		lle = LocallyLinearEmbedding(n_components=2, n_neighbors=5)
		X_transformed = lle.fit_transform(X)
	elif (embedding == 'tsne'):
		t_sne = TSNE(n_components=2)
		X_transformed = t_sne.fit_transform(X)
	elif (embedding == 'spectral'):
		se = SpectralEmbedding(n_components=2)
		X_transformed = se.fit_transform(X)
	elif (embedding == 'mds'):
		mds = MDS(n_components=2)
		X_transformed = mds.fit_transform(X)
	elif (embedding == 'gallery'):
		plt.figure(1)
		
		plt.subplot(231)
		plt.title('pca')
		X_t = PCA(n_components=2).fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.subplot(232)
		plt.title('isomap')
		X_t = Isomap(n_neighbors=20).fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.subplot(233)
		plt.title('lle')
		X_t = LocallyLinearEmbedding(n_neighbors=20).fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.subplot(234)
		plt.title('tsne')
		X_t = TSNE().fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.subplot(235)
		plt.title('spectral')
		X_t = SpectralEmbedding().fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.subplot(236)
		plt.title('mds')
		X_t = MDS().fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.suptitle('Gallery transforms ' + title)

		return plt
	else:
		raise ValueError("Choose between pca, isomap and tsne")

	plt.title(title + ' ' + embedding + ' plot')
	sc = plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y)
	plt.colorbar(sc)
	return plt
Example #43
n_samples, n_features = D.shape
n_neighbors = 10

#----------------------------------------------------------------------
# Isomap projection 
print "Computing Isomap embedding"
t0 = time()
D_iso = Isomap(n_neighbors, n_components=2).fit_transform(D_scaled)
print "Done in time %.2fs " % (time() - t0)

#----------------------------------------------------------------------
# Locally linear embedding 
n_neighbors = 35
print "Computing LLE embedding"
clf = LocallyLinearEmbedding(n_neighbors, n_components=2,
                                      method='modified')
t0 = time()
D_lle = clf.fit_transform(D_scaled)
print "Done in time %.2fs " % (time() - t0)
print "Reconstruction error: %g" % clf.reconstruction_error_

#----------------------------------------------------------------------
# kernel PCA
print "Computing kPCA embedding"
kpca = KernelPCA(n_components=2, kernel="rbf", gamma=0.0028942661247167516)
t0 = time()
D_kpca = kpca.fit_transform(D_scaled)
print "Done in time %.2fs " % (time() - t0)

plot_embedding(D_pca, 1, rescale=None, title="PCA projection")
plot_embedding(D_iso, 2, rescale=None, title="Isomap projection")
Example #44
from __future__ import division
import sys
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
import random
from colorsys import hsv_to_rgb

pca = PCA(n_components=2)
isomap = Isomap(n_components=2)
lle = LocallyLinearEmbedding(n_components=2)
data = np.genfromtxt('data01_small.txt', delimiter=',')
pca_xform = pca.fit_transform(data)
isomap_xform = isomap.fit_transform(data)
lle_xform = lle.fit_transform(data)
label = [0]*100+[1]*100
rgbs = [(0.5,0,0), (0,0.5,0)]


plt.figure()
xs = pca_xform[:,0]
ys = pca_xform[:,1]
ax = plt.subplot(111)
for i in range(len(xs)):
	ax.text(xs[i], ys[i], str(label[i]), color=rgbs[label[i]], fontdict={'weight': 'bold', 'size': 9})
t = (max(xs)-min(xs))*0.1
Example #45
def lle(X=None, W=None, num_vecs=None, k=None):
    embedder = LocallyLinearEmbedding(n_neighbors=k, n_components=num_vecs)
    return embedder.fit_transform(X)
Example #46
# Build the output arrays
cells = opts.high // opts.step  # integer division: the result is used as an array dimension
lle_gmm_results = np.zeros((cells,opts.iters))

D = scale(X)

n_samples, n_features = D.shape
# chosen by hyperparam search in a separate test.
n_neighbors = 35

# For the specified number of principal components, do the clustering
dimension_list = range(opts.low, opts.high + 1, opts.step)
data_files = []
for i in dimension_list:
    index = (i // opts.step) - 1
    lle = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=i, method='standard')
    X_lle = lle.fit_transform(D)
    
    for j in range(0,opts.iters,1):
        # sklearn.mixture.GMM was removed in 0.20; GaussianMixture is the modern equivalent.
        gaussmix = GaussianMixture(n_components=true_k, covariance_type='tied', n_init=10, max_iter=1000)
        gaussmix.fit(X_lle)
        gaussmix_labels = gaussmix.predict(X_lle)    
        homog = metrics.homogeneity_score(labels[:,0], gaussmix_labels)
        print "Gaussian mixture homogeneity: %0.3f" % homog
        test_result = {"Model": "LLE", "Dimension": i, "Homogeneity": homog}
        index = pd.Index([0], name='rows')
        data_files.append(pd.DataFrame(data=test_result,index=index))        

# Save the data to a file:

print "...Done"
Example #47
'''
Fit a dimensionality-reduction model on the input data and write the
dimension-reduced coefficients to a text file (only the LLE transform is
actually used below; the PCA and Isomap instances are created but unused).
'''

from __future__ import division
import sys
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
import random
from colorsys import hsv_to_rgb

final_dim = 30
data = np.genfromtxt("100examples.txt", delimiter=',')
pca = PCA(n_components=final_dim)
isomap = Isomap(n_components=final_dim)
lle = LocallyLinearEmbedding(n_components=final_dim)
data_xformed = lle.fit_transform(data)
np.savetxt("lle_data_30_dims.txt", data_xformed, delimiter=',')
Example #48
#03-02.py
X, y = preprocess(data, shuffle=False, n_samples=1000, normalization=None)

from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_neighbors=15,
                             n_components=3, method='modified')
X_proj = lle.fit_transform(X)

three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels, trim_outliers=True)