Esempio n. 1
0
 def __init__(self,
              n_clusters=50,
              pca_n_components=20,
              kmpca_n_components=3,
              kernel_n_components=30):
     """Create the unfitted transformers used for the title features.

     Args:
         n_clusters: number of clusters for the mini-batch k-means step.
         pca_n_components: number of components kept by ``self.pca``.
         kmpca_n_components: number of components kept by ``self.kmpca``.
         kernel_n_components: number of components drawn by the RBF
             sampler.
     """
     # Binary unigram + bigram bag of words with English stop words
     # removed; min_df=30 is an absolute document-count threshold.
     vectorizer_options = dict(stop_words='english',
                               ngram_range=(1, 2),
                               min_df=30,
                               binary=True,
                               lowercase=True)
     self.counter = text.CountVectorizer(**vectorizer_options)

     kmeans_options = dict(n_clusters=n_clusters,
                           n_init=10,
                           batch_size=10000,
                           verbose=1)
     self.km = cluster.MiniBatchKMeans(**kmeans_options)

     # Two independent randomized-PCA reducers of different sizes.
     randomized_pca = decomposition.RandomizedPCA
     self.pca = randomized_pca(n_components=pca_n_components)
     self.kmpca = randomized_pca(n_components=kmpca_n_components)

     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)

     forest_options = dict(n_estimators=30, max_depth=5, n_jobs=4)
     self.tree_hasher = ensemble.RandomTreesEmbedding(**forest_options)

     # Names of the feature blocks, one entry per transformer output.
     self.X_names = [
         'Title_CounterX',
         'Title_ClusterdX',
         'Title_KmX',
         'Title_PCAX',
         'Title_PCAClusterdX',
         'Title_RbfX',
         'Title_TreeX',
     ]
     # No linear feature selector is configured at construction time.
     self.linear_feature_selector = None
Esempio n. 2
0
 def __init__(self, X_train, y_train):
     """Fit a RandomTreesEmbedding -> LogisticRegression pipeline.

     The inputs are split 50/50 with ``train_test_split`` and the
     pipeline stored in ``self.model`` is fitted on the first half only.
     """
     # NOTE(review): the second half of the split is never used -- confirm
     # whether it was meant to train the logistic regression separately.
     fit_X, _held_out_X, fit_y, _held_out_y = skms.train_test_split(
         X_train, y_train, test_size=0.5)
     self.model = skp.make_pipeline(
         ske.RandomTreesEmbedding(n_estimators=50),
         skline.LogisticRegression())
     self.model.fit(fit_X, fit_y)
 def __init__(self,
              n_clusters=50,
              pca_n_components=30,
              kmpca_n_components=3,
              kernel_n_components=30):
     """Create the unfitted transformers used for the description features.

     Args:
         n_clusters: number of clusters for the mini-batch k-means step.
         pca_n_components: number of components kept by ``self.pca``.
         kmpca_n_components: number of components kept by ``self.kmpca``.
         kernel_n_components: number of components drawn by the RBF
             sampler.
     """
     # Document-frequency trade-off noted by the original author:
     #   min_df=30,    max_df=0.5  -> many features, more choice for
     #                                feature selection;
     #   min_df=0.001, max_df=0.05 -> fewer features, better clustering
     #                                (the setting used here).
     # NOTE(review): `charset` / `charset_error` are keyword names from old
     # scikit-learn releases -- confirm against the installed version.
     vectorizer_options = dict(stop_words='english',
                               charset='utf-8',
                               charset_error='ignore',
                               ngram_range=(1, 1),
                               min_df=0.001,
                               max_df=0.05,
                               binary=True,
                               lowercase=True)
     self.counter = text.CountVectorizer(**vectorizer_options)

     kmeans_options = dict(n_clusters=n_clusters,
                           n_init=10,
                           batch_size=10000,
                           verbose=1)
     self.km = cluster.MiniBatchKMeans(**kmeans_options)

     # Two independent randomized-PCA reducers of different sizes.
     randomized_pca = decomposition.RandomizedPCA
     self.pca = randomized_pca(n_components=pca_n_components)
     self.kmpca = randomized_pca(n_components=kmpca_n_components)

     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)

     forest_options = dict(n_estimators=30, max_depth=5, n_jobs=4)
     self.tree_hasher = ensemble.RandomTreesEmbedding(**forest_options)

     # Names of the feature blocks, one entry per transformer output.
     self.X_names = [
         'Desc_CounterX',
         'Desc_ClusterdX',
         'Desc_KmX',
         'Desc_PCAX',
         'Desc_PCAClusterdX',
         'Desc_RbfX',
         'Desc_TreeX',
     ]
     # No linear feature selector is configured at construction time.
     self.linear_feature_selector = None
Esempio n. 4
0
def classifier_choice(method='tsne', neighbors=30, dimensions=2):
    """Return an unfitted embedding estimator selected by ``method``.

    ``method`` is tested against each known name with the ``in`` operator,
    so it matches when it is a *substring* of the name. Names are tried in
    this order and the first match wins: tsne, pca, isomap, lle, mlle,
    hlle, ltsa, mds, trees, spectral.

    Args:
        method: name (or substring of a name) of the technique.
        neighbors: neighbourhood size for Isomap and the LLE variants.
        dimensions: number of output components.

    Returns:
        The estimator, or None (after printing a message) when ``method``
        matches none of the names.
    """

    def lle(variant):
        # All four LLE flavours differ only in their `method` argument.
        return manifold.LocallyLinearEmbedding(n_neighbors=neighbors,
                                               n_components=dimensions,
                                               method=variant)

    def trees_then_svd():
        trees = ensemble.RandomTreesEmbedding(n_estimators=200, max_depth=5)
        pca = decomposition.TruncatedSVD(n_components=dimensions)
        return Pipeline([('Random Tree Embedder', trees), ('PCA', pca)])

    # Builders are lazy so only the selected estimator is constructed.
    builders = (
        ("tsne", lambda: TSNE(n_components=dimensions, perplexity=30,
                              verbose=1)),
        ("pca", lambda: decomposition.TruncatedSVD(n_components=dimensions)),
        ("isomap", lambda: manifold.Isomap(n_neighbors=neighbors,
                                           n_components=dimensions)),
        ("lle", lambda: lle('standard')),
        ("mlle", lambda: lle('modified')),
        ("hlle", lambda: lle('hessian')),
        ("ltsa", lambda: lle('ltsa')),
        ("mds", lambda: manifold.MDS(n_components=dimensions, n_init=1,
                                     max_iter=100)),
        ("trees", trees_then_svd),
        ("spectral", lambda: manifold.SpectralEmbedding(
            n_components=dimensions, eigen_solver="arpack")),
    )
    for name, build in builders:
        if method in name:
            return build()
    print('Please use valid method')
def analyse_mode(properties, mode):
    """Project ``properties`` to two components with the chosen technique.

    ``mode`` is converted with ``float()`` and selects:
        0 -> PCA
        1 -> t-SNE (also switches numpy printing to suppress=True)
        2 -> Isomap
        3 -> random trees embedding followed by truncated SVD
        anything else -> spectral embedding

    Returns:
        Whatever the selected estimator's ``fit_transform`` returns.
    """
    selector = float(mode)

    if selector == 0:
        return PCA(n_components=2).fit_transform(properties)

    if selector == 1:
        tsne = TSNE(n_components=2, random_state=0)
        np.set_printoptions(suppress=True)
        return tsne.fit_transform(properties)

    if selector == 2:
        return manifold.Isomap(n_components=2).fit_transform(properties)

    if selector == 3:
        forest = ensemble.RandomTreesEmbedding(n_estimators=200,
                                               random_state=0,
                                               max_depth=5)
        leaf_codes = forest.fit_transform(properties)
        return decomposition.TruncatedSVD(
            n_components=2).fit_transform(leaf_codes)

    spectral = manifold.SpectralEmbedding(n_components=2,
                                          random_state=0,
                                          eigen_solver="arpack")
    return spectral.fit_transform(properties)
Esempio n. 6
0
 def __init__(self, source):
     """Embed ``source`` in two components and store it in ``return_data``.

     Steps: min-max scaling -> RandomTreesEmbedding -> TruncatedSVD(2).

     Args:
         source: data accepted by ``MinMaxScaler.fit_transform``.
     """
     # fit() returns the scaler object itself; fit_transform() is needed
     # to obtain the scaled data that the tree embedding consumes.
     scaled = preprocessing.MinMaxScaler().fit_transform(source)
     hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                            random_state=0,
                                            max_depth=None)
     leaf_codes = hasher.fit_transform(scaled)
     svd = decomposition.TruncatedSVD(n_components=2)
     self.return_data = svd.fit_transform(leaf_codes)
Esempio n. 7
0
def random_tree():
  """Apply a random trees embedding to four datasets and return a TruncatedSVD.

  Prints a status line, runs ``fit_transform`` of a RandomTreesEmbedding on
  ``X_s``, ``X_t``, ``X_s_val`` and ``X_test`` in turn, then returns an
  unfitted ``TruncatedSVD(n_components=n_components)``.
  """
  print("Totally Random Trees embedding is selected")
  embedder = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
                                           max_depth=5)
  # NOTE(review): these four names are assigned here without a
  # global/nonlocal declaration, so Python treats them as locals and the
  # right-hand-side reads raise UnboundLocalError. The transformed arrays
  # are also never returned -- confirm the intended scope.
  # NOTE(review): each fit_transform call refits the embedder, so the four
  # datasets are not encoded with the same trees -- confirm this is intended.
  X_s = embedder.fit_transform(X_s)
  X_t = embedder.fit_transform(X_t)
  X_s_val = embedder.fit_transform(X_s_val)
  X_test = embedder.fit_transform(X_test)
  # `n_components` is not defined in this function; it must come from an
  # enclosing or module scope.
  embedder = decomposition.TruncatedSVD(n_components=n_components)
  return embedder
Esempio n. 8
0
 def RandomForestEmbedding(self, source):
     """Return a two-component random-forest embedding of ``source``.

     Steps: min-max scaling -> RandomTreesEmbedding -> TruncatedSVD(2).

     Args:
         source: data accepted by ``MinMaxScaler.fit_transform``.

     Returns:
         dict with ``'data'`` (the embedded points) and ``'params'``
         (always 0).
     """
     # fit() returns the scaler object itself; fit_transform() is needed
     # to obtain the scaled data that the tree embedding consumes.
     scaled = preprocessing.MinMaxScaler().fit_transform(source)
     hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                            random_state=0,
                                            max_depth=None)
     leaf_codes = hasher.fit_transform(scaled)
     svd = decomposition.TruncatedSVD(n_components=2)
     return {
         'data': svd.fit_transform(leaf_codes),
         'params': 0,
     }
Esempio n. 9
0
def random_trees_embedding():
    """Embed the outer-scope ``X`` with random trees + SVD and plot it.

    Reads ``X`` from the enclosing scope and passes the two-component
    result to ``plot_embedding`` with a title that includes the elapsed
    time.
    """
    print("Computing Totally Random Trees embedding")
    forest = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
                                           max_depth=5)
    started = time()
    leaf_codes = forest.fit_transform(X)
    reduced = decomposition.TruncatedSVD(
        n_components=2).fit_transform(leaf_codes)

    elapsed = time() - started
    title = "Random forest embedding of the digits (time %.2fs)" % elapsed
    plot_embedding(reduced, title)
Esempio n. 10
0
def rtree(X, dim=2, n_estimators=200, max_depth=5, **kargs):
    '''Fit a random-trees-embedding + PCA pipeline on X.

    Returns a ``(pipeline, reduced_X, title)`` tuple on success. If fitting
    raises, the traceback is printed and None is returned. Extra keyword
    arguments are accepted and ignored.
    '''
    print("Computing Totally Random Trees embedding")
    from sklearn.pipeline import Pipeline

    hasher = ensemble.RandomTreesEmbedding(n_estimators=n_estimators,
                                           random_state=0,
                                           max_depth=max_depth)
    reducer = decomposition.PCA(n_components=dim)
    tr = Pipeline([('hasher', hasher), ('pca', reducer)])

    try:
        X_reduced = tr.fit_transform(X)
    except Exception:
        # Best effort: report the failure and fall through to None.
        traceback.print_exc()
        return None
    return tr, X_reduced, "Random forest embedding of the features"
 def __init__(self,
              n_clusters=100,
              pca_n_components=10,
              kmpca_n_components=7,
              kernel_n_components=30):
     """Create the location-feature transformers and the location lookup.

     Args:
         n_clusters: number of clusters for the mini-batch k-means step.
         pca_n_components: number of components kept by ``self.pca``.
         kmpca_n_components: number of components kept by ``self.kmpca``.
         kernel_n_components: number of components drawn by the RBF
             sampler.

     Side effects:
         Reads ``LOCATION_TREE_FILE`` and builds ``self.location_dict``.
     """
     self.counter = text.CountVectorizer(stop_words='english',
                                         ngram_range=(1, 1),
                                         min_df=2,
                                         max_df=0.8,
                                         binary=True,
                                         lowercase=True)
     self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                       n_init=10,
                                       batch_size=10000,
                                       verbose=1)
     self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
     self.kmpca = decomposition.RandomizedPCA(
         n_components=kmpca_n_components)
     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)
     self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                      max_depth=5,
                                                      n_jobs=4)
     # Names of the feature blocks, one entry per transformer output.
     self.X_names = [
         'Loc_CounterX', 'Loc_ClusterdX', 'Loc_KmX', 'Loc_PCAX',
         'Loc_PCAClusterdX', 'Loc_RbfX', 'Loc_TreeX'
     ]
     self.linear_feature_selector = None

     # Build a dictionary from the location tree for fast lookups.
     # The first column of each row is a '~'-separated path; it is
     # lower-cased and reversed. The file is closed as soon as it is read,
     # and blank rows (which csv yields as []) are skipped.
     with open(LOCATION_TREE_FILE) as tree_file:
         location_tree = [
             row[0].lower().split('~')[::-1]
             for row in csv.reader(tree_file)
             if row
         ]

     # Each name maps to the tail of its reversed path starting at the name
     # itself; the first occurrence of a name wins.
     self.location_dict = {}
     for locs in location_tree:
         for depth, name in enumerate(locs):
             if name not in self.location_dict:
                 self.location_dict[name] = locs[depth:]
Esempio n. 12
0
def plot_scatter_2d(ds_merged, method='mds', fig_number=1):
    """Scatter-plot a 2-D embedding of ``ds_merged.samples`` by target.

    Args:
        ds_merged: object exposing ``samples`` (the data to embed) and
            ``targets`` (one label per sample).
        method: 'pca', 'iso', 'forest' or 'embedding'; any other value
            (including the default 'mds') selects multidimensional scaling.
        fig_number: unused; kept so existing callers keep working.

    For every distinct target the class mean is drawn as a large marker and
    every second point of the class as a regular marker, in the same colour.
    """
    # `lda` was imported here but never used, and the module no longer
    # exists in recent scikit-learn releases, so it is not imported.
    from sklearn import decomposition, manifold, ensemble

    data = ds_merged.samples

    if method == 'pca':
        clf = decomposition.RandomizedPCA(n_components=2)
        stringa = 'Principal Component Analysis'
    elif method == 'iso':
        # n_neighbors is passed by keyword: it is keyword-only in
        # scikit-learn >= 1.0.
        clf = manifold.Isomap(n_neighbors=30, n_components=2)
        stringa = 'Iso surfaces '
    elif method == 'forest':
        # The tree embedding replaces the data; PCA then reduces it to 2-D.
        hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                               random_state=0,
                                               max_depth=5)
        data = hasher.fit_transform(data)
        clf = decomposition.RandomizedPCA(n_components=2)
        stringa = 'Random Forests'
    elif method == 'embedding':
        clf = manifold.SpectralEmbedding(n_components=2,
                                         random_state=0,
                                         eigen_solver="arpack")
        stringa = 'Spectral Embedding'
    else:
        clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
        stringa = 'Multidimensional scaling'

    # Function-call form prints the same text on Python 2 and Python 3.
    print(stringa + ' is performing...')

    pos = clf.fit_transform(data)

    colors = cycle('bgrymkybgrcmybgrcmybgrcmy')

    f = plt.figure()
    a = f.add_subplot(111)
    a.set_title(stringa)
    for label in np.unique(ds_merged.targets):
        m = ds_merged.targets == label
        data_m = pos[m]
        # next() works on both Python 2.6+ and Python 3 iterators.
        c = next(colors)
        # Class centroid, drawn larger and carrying the legend entry.
        a.scatter(data_m.T[0].mean(),
                  data_m.T[1].mean(),
                  label=label,
                  color=c,
                  s=120)
        # Every second point of the class.
        a.scatter(data_m.T[0][::2], data_m.T[1][::2], color=c)
        # A covariance-ellipse overlay per class was present here as
        # disabled code and is intentionally not drawn.
    a.legend()
    return
Esempio n. 13
0
            if stru2vec[triple[1]] + cnn2vec[triple[1]] not in entity_vec:
                entity_vec.append(stru2vec[triple[1]])
                entity_type.append(5)
    # x_train, y_train, x_test, y_test,x_test_triple = eTour_Experiment.orgnanizeDataFormat(dkrl_train,trainZ_N,dkrl_test,ClassiferTestTriples,cnn2vec,stru2vec,1)

    ###rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
    ###x_projected = rp.fit_transform(x_train)
    ###x_projected = decomposition.TruncatedSVD(n_components=2).fit_transform(x_train)
    ###x_projected = discriminant_analysis.LinearDiscriminantAnalysis().fit_transform(x_train, y_train)
    ###x_projected = manifold.Isomap(n_neighbors=5, n_components=2).fit_transform(x_train)
    # x_projected = manifold.LocallyLinearEmbedding(n_neighbors=30, n_components=2,
    #                                       method='hessian').fit_transform(x_train)
    ##x_projected = manifold.TSNE(n_components=2, init='pca', random_state=0).fit_transform(x_train)

    hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                           random_state=0,
                                           max_depth=12)
    t0 = time()
    X_transformed = hasher.fit_transform(entity_vec)
    pca = decomposition.TruncatedSVD(n_components=2)
    x_projected = pca.fit_transform(X_transformed)

    font = FontProperties(fname=r"C:\Windows\Fonts\simhei.ttf", size=14)
    print(x_projected)
    xValue_1 = [x[0] for x, y in zip(x_projected, entity_type) if y == 1]
    yValue_1 = [x[1] for x, y in zip(x_projected, entity_type) if y == 1]

    xValue_0 = [x[0] for x, y in zip(x_projected, entity_type) if y == 0]
    yValue_0 = [x[1] for x, y in zip(x_projected, entity_type) if y == 0]

    xValue_3 = [x[0] for x, y in zip(x_projected, entity_type) if y == 3]
Esempio n. 14
0
                                      n_components=level,
                                      method='modified')
X_mlle = clf.fit_transform(X)
plot_embedding(X_mlle, "LLE modifiée")

# Hessian LLE embedding of X into `level` components.
# NOTE(review): n_neighbors is passed positionally; recent scikit-learn
# releases accept it by keyword only -- confirm the targeted version.
clf = manifold.LocallyLinearEmbedding(n_neighbors,
                                      n_components=level,
                                      method='hessian')
X_hlle = clf.fit_transform(X)
plot_embedding(X_hlle, "LLE Hessian")

# Multidimensional scaling, best of 20 initialisations.
clf = manifold.MDS(n_components=level, n_init=20, max_iter=100)
X_mds = clf.fit_transform(X)
plot_embedding(X_mds, "MDS")

# Random trees embedding followed by truncated SVD. No random_state is
# given, so the result can differ from run to run.
hasher = ensemble.RandomTreesEmbedding(n_estimators=100)
X_transformed = hasher.fit_transform(X)
pca = decomposition.TruncatedSVD(n_components=level)
X_reduced = pca.fit_transform(X_transformed)
plot_embedding(X_reduced, "Random forest")

# Spectral embedding, plotted with both plotting helpers.
embedder = manifold.SpectralEmbedding(n_components=level)
X_se = embedder.fit_transform(X)
plot_embedding(X_se, "Spectral embedding")
plotly_embedding(X_se, "Spectral embedding")

# t-SNE initialised from PCA with a fixed seed.
tsne = manifold.TSNE(n_components=level, init='pca', random_state=0)
X_tsne = tsne.fit_transform(X)
plot_embedding(X_tsne, "t-SNE")

# Report the time elapsed since `time4` (set outside this excerpt).
# NOTE(review): time.clock() was removed in Python 3.8; switching to
# time.perf_counter() requires changing where `time4` is set as well.
print("Réduction dimensionnelle et clustering :", time.clock() - time4)
Esempio n. 15
0
def try_all_dim_reduction(X, y, y_label):
    """Run several 2-component reductions of ``X`` and plot each result.

    In order: sparse random projection, PCA, Isomap, standard LLE,
    modified LLE, MDS, random trees embedding + truncated SVD, spectral
    embedding and t-SNE. Every result is passed to
    ``plot_3D_labeled_datapoints`` together with ``y`` and ``y_label``.

    Args:
        X: data to embed.
        y: forwarded unchanged to the plotting helper.
        y_label: forwarded unchanged to the plotting helper.
    """
    # Neighbourhood size shared by Isomap and the LLE variants. It is
    # passed by keyword below because scikit-learn >= 1.0 rejects it as a
    # positional argument.
    n_neighbors = 30

    #----------------------------------------------------------------------
    # Sparse random projection onto 2 components
    print("Computing random projection")
    rp = random_projection.SparseRandomProjection(n_components=2,
                                                  random_state=42)
    X_projected = rp.fit_transform(X)
    plot_3D_labeled_datapoints(X_projected, y, y_label)

    #----------------------------------------------------------------------
    # Projection onto the first 2 principal components
    print("Computing PCA projection")
    X_pca = decomposition.PCA(n_components=2).fit_transform(X)
    plot_3D_labeled_datapoints(X_pca, y, y_label)

    #----------------------------------------------------------------------
    # Isomap projection
    print("Computing Isomap embedding")
    X_iso = manifold.Isomap(n_neighbors=n_neighbors,
                            n_components=2).fit_transform(X)
    plot_3D_labeled_datapoints(X_iso, y, y_label)

    #----------------------------------------------------------------------
    # Locally linear embedding
    print("Computing LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                          n_components=2,
                                          method='standard')
    X_lle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_3D_labeled_datapoints(X_lle, y, y_label)

    #----------------------------------------------------------------------
    # Modified locally linear embedding
    print("Computing modified LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                          n_components=2,
                                          method='modified')

    X_mlle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_3D_labeled_datapoints(X_mlle, y, y_label)

    #----------------------------------------------------------------------
    # MDS embedding
    print("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)

    X_mds = clf.fit_transform(X)
    print("Done. Stress: %f" % clf.stress_)
    plot_3D_labeled_datapoints(X_mds, y, y_label)

    #----------------------------------------------------------------------
    # Random trees embedding, reduced to 2 components with truncated SVD
    print("Computing Totally Random Trees embedding")
    hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                           random_state=0,
                                           max_depth=5)
    X_transformed = hasher.fit_transform(X)
    pca = decomposition.TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)

    plot_3D_labeled_datapoints(X_reduced, y, y_label)

    #----------------------------------------------------------------------
    # Spectral embedding
    print("Computing Spectral embedding")
    embedder = manifold.SpectralEmbedding(n_components=2,
                                          random_state=0,
                                          eigen_solver="arpack")
    X_se = embedder.fit_transform(X)

    plot_3D_labeled_datapoints(X_se, y, y_label)

    #----------------------------------------------------------------------
    # t-SNE embedding
    print("Computing t-SNE embedding")
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
    X_tsne = tsne.fit_transform(X)

    plot_3D_labeled_datapoints(X_tsne, y, y_label)
Esempio n. 16
0
def plot_other_manifold(X, y, n_neighbors, n_estimators=200,
                        max_depth=5, random_state=0):
    """Compute and plot five 2-component embeddings of ``X``.

    In order: modified LLE, Hessian LLE, LTSA, random trees embedding +
    truncated SVD, and spectral embedding. Each result is passed to
    ``plot_embedding`` with ``y`` and a title that includes the elapsed
    time.

    Args:
        X: data to embed.
        y: forwarded unchanged to ``plot_embedding``.
        n_neighbors: neighbourhood size for the LLE variants.
        n_estimators: number of trees in the random trees embedding. The
            previous default was the literal ``00`` (zero), which is not a
            usable forest size; 200 is the default now.
        max_depth: maximum depth of each tree.
        random_state: seed for the tree and spectral embeddings.
    """

    def lle(variant):
        # n_neighbors is passed by keyword: it is keyword-only in
        # scikit-learn >= 1.0.
        return manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                               n_components=2,
                                               method=variant)

    # ----------------------------------------------------------------------
    # Modified locally linear embedding
    print("Computing modified LLE embedding")
    clf = lle('modified')
    t0 = time()
    X_mlle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_mlle, y,
                   "Modified Locally Linear Embedding of the digits (time %.2fs)" %
                   (time() - t0))

    # ----------------------------------------------------------------------
    # Hessian LLE embedding
    print("Computing Hessian LLE embedding")
    clf = lle('hessian')
    t0 = time()
    X_hlle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_hlle, y,
                   "Hessian Locally Linear Embedding of the digits (time %.2fs)" %
                   (time() - t0))

    # ----------------------------------------------------------------------
    # LTSA embedding
    print("Computing LTSA embedding")
    clf = lle('ltsa')
    t0 = time()
    X_ltsa = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_ltsa, y,
                   "Local Tangent Space Alignment of the digits (time %.2fs)" %
                   (time() - t0))

    # ----------------------------------------------------------------------
    # Random trees embedding, reduced to 2 components with truncated SVD
    print("Computing Totally Random Trees embedding")
    hasher = ensemble.RandomTreesEmbedding(n_estimators=n_estimators,
                                           random_state=random_state,
                                           max_depth=max_depth)
    t0 = time()
    X_transformed = hasher.fit_transform(X)
    pca = decomposition.TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)

    plot_embedding(X_reduced, y,
                   "Random forest embedding of the digits (time %.2fs)" %
                   (time() - t0))

    # ----------------------------------------------------------------------
    # Spectral embedding
    print("Computing Spectral embedding")
    embedder = manifold.SpectralEmbedding(n_components=2,
                                          random_state=random_state,
                                          eigen_solver="arpack")
    t0 = time()
    X_se = embedder.fit_transform(X)

    plot_embedding(X_se, y,
                   "Spectral embedding of the digits (time %.2fs)" %
                   (time() - t0))
Esempio n. 17
0
def get_embedding(X, y, type_embeding):
    """Return a 2-component embedding of ``X`` computed with the named method.

    Args:
        X: data to embed.
        y: labels; only used by the "LDA" option.
        type_embeding: one of "Random", "PCA", "LDA", "Isomap", "LLE",
            "mLLE", "hLLE", "ltsa", "MDS", "RF", "Spectral", "T-SNE"
            (exact, case-sensitive match).

    Returns:
        The embedded data, or None (after printing the list of valid
        options) when ``type_embeding`` is not recognised.
    """
    # Neighbourhood size for Isomap and the LLE variants. It is passed by
    # keyword below because scikit-learn >= 1.0 rejects it positionally.
    n_neighbors = 30
    X_projected = None

    # Option name -> LocallyLinearEmbedding `method` argument.
    lle_variants = {
        "LLE": 'standard',
        "mLLE": 'modified',
        "hLLE": 'hessian',
        "ltsa": 'ltsa',
    }

    if type_embeding == "Random":
        rp = random_projection.SparseRandomProjection(n_components=2,
                                                      random_state=42)
        X_projected = rp.fit_transform(X)

    elif type_embeding == "PCA":
        X_projected = decomposition.TruncatedSVD(n_components=2).fit_transform(
            X)

    elif type_embeding == "LDA":
        X2 = X.copy()
        X2.flat[::X.shape[1] + 1] += 0.01  # Make X invertible
        X_projected = discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=2).fit_transform(X2, y)

    elif type_embeding == "Isomap":
        X_projected = manifold.Isomap(n_neighbors=n_neighbors,
                                      n_components=2).fit_transform(X)

    elif type_embeding in lle_variants:
        clf = manifold.LocallyLinearEmbedding(
            n_neighbors=n_neighbors,
            n_components=2,
            method=lle_variants[type_embeding])
        X_projected = clf.fit_transform(X)

    elif type_embeding == "MDS":
        clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
        X_projected = clf.fit_transform(X)

    elif type_embeding == "RF":
        hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
                                               max_depth=5)

        X_transformed = hasher.fit_transform(X)
        pca = decomposition.TruncatedSVD(n_components=2)
        X_projected = pca.fit_transform(X_transformed)

    elif type_embeding == "Spectral":
        embedder = manifold.SpectralEmbedding(n_components=2, random_state=0,
                                              eigen_solver="arpack")
        X_projected = embedder.fit_transform(X)

    elif type_embeding == "T-SNE":
        tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
        X_projected = tsne.fit_transform(X)
    else:
        print("""Valid options are:
         Random    => Random Projections
         PCA       => Principal Component Analysis
         LDA       => Linear Discriminant Analysis
         Isomap    => Isomap
         LLE       => Locally Linear Embedding
         mLLE      => Modified Locally Linear Embedding
         hLLE      => Hessian Locally Linear Embedding
         ltsa      => Locally Linear Embedding (ltsa)
         MDS       => Multidimensional Scaling
         RF        => Random Forest Embeding
         Spectral  => Spectral Embeding
         T-SNE     => T-SNE    """)

    return X_projected
Esempio n. 18
0
# ----------------------------------------------------------------------
# MDS embedding of X; the timer starts after the estimator is built, so
# the reported time covers fitting only.
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X)
print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_mds,
               "MDS embedding of the digits (time %.2fs)" % (time() - t0))

# ----------------------------------------------------------------------
# Random trees embedding of X, reduced to 2 components with truncated SVD.
# Unlike the neighbouring sections, this one is not timed.
print("Computing Totally Random Trees embedding")
RTE = ensemble.RandomTreesEmbedding(n_estimators=200,
                                    random_state=0,
                                    max_depth=5).fit_transform(X)
X_reduced = decomposition.TruncatedSVD(n_components=2).fit_transform(RTE)

plot_embedding(X_reduced, "Random forest embedding of the digits")

# ----------------------------------------------------------------------
# Spectral embedding of X; the timer again starts after construction.
print("Computing Spectral embedding")
embedder = manifold.SpectralEmbedding(n_components=2,
                                      random_state=0,
                                      eigen_solver="arpack")
t0 = time()
X_se = embedder.fit_transform(X)

plot_embedding(X_se,
Esempio n. 19
0
def dimension_reduce(X_train, y_train):
    """Compute several 2-component embeddings of ``X_train`` and plot them.

    In order: Isomap, standard LLE, modified LLE, LTSA, MDS, random trees
    embedding + truncated SVD, spectral embedding and t-SNE. Each result
    is passed to ``plot_embedding`` with ``y_train`` and a title that
    includes the elapsed time; ``plt.show()`` is called once at the end.

    Random projection, PCA, LDA and Hessian LLE steps existed in the
    original only as disabled code and are not run.

    Args:
        X_train: data to embed.
        y_train: forwarded unchanged to ``plot_embedding``.
    """
    # Neighbourhood size for Isomap and the LLE variants. It is passed by
    # keyword below because scikit-learn >= 1.0 rejects it positionally.
    n_neighbors = 30

    def lle(variant):
        return manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                               n_components=2,
                                               method=variant)

    # Isomap projection (the timer includes estimator construction here)
    print("Computing Isomap embedding")
    t0 = time()
    X_iso = manifold.Isomap(n_neighbors=n_neighbors,
                            n_components=2).fit_transform(X_train)
    print("Done.")
    plot_embedding(X_iso, y_train,
                   "Isomap projection of the digits (time %.2fs)" %
                   (time() - t0))

    # Locally linear embedding
    print("Computing LLE embedding")
    clf = lle('standard')
    t0 = time()
    X_lle = clf.fit_transform(X_train)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_lle, y_train,
                   "Locally Linear Embedding of the digits (time %.2fs)" %
                   (time() - t0))

    # Modified locally linear embedding
    print("Computing modified LLE embedding")
    clf = lle('modified')
    t0 = time()
    X_mlle = clf.fit_transform(X_train)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_mlle, y_train,
                   "Modified Locally Linear Embedding of the digits (time %.2fs)" %
                   (time() - t0))

    # LTSA embedding
    print("Computing LTSA embedding")
    clf = lle('ltsa')
    t0 = time()
    X_ltsa = clf.fit_transform(X_train)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_ltsa, y_train,
                   "Local Tangent Space Alignment of the digits (time %.2fs)" %
                   (time() - t0))

    # MDS embedding
    print("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
    t0 = time()
    X_mds = clf.fit_transform(X_train)
    print("Done. Stress: %f" % clf.stress_)
    plot_embedding(X_mds, y_train,
                   "MDS embedding of the digits (time %.2fs)" %
                   (time() - t0))

    # Random trees embedding, reduced to 2 components with truncated SVD
    print("Computing Totally Random Trees embedding")
    hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
                                           max_depth=5)
    t0 = time()
    X_transformed = hasher.fit_transform(X_train)
    pca = decomposition.TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)

    plot_embedding(X_reduced, y_train,
                   "Random forest embedding of the digits (time %.2fs)" %
                   (time() - t0))

    # Spectral embedding
    print("Computing Spectral embedding")
    embedder = manifold.SpectralEmbedding(n_components=2, random_state=0,
                                          eigen_solver="arpack")
    t0 = time()
    X_se = embedder.fit_transform(X_train)

    plot_embedding(X_se, y_train,
                   "Spectral embedding of the digits (time %.2fs)" %
                   (time() - t0))

    # t-SNE embedding
    print("Computing t-SNE embedding")
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
    t0 = time()
    X_tsne = tsne.fit_transform(X_train)

    plot_embedding(X_tsne, y_train,
                   "t-SNE embedding of the digits (time %.2fs)" %
                   (time() - t0))
    plt.show()
def analyze(X=None, y=None, plot_fun=scatter_plot, data_name="data"):
    """Project a dataset to 2-D with a series of embeddings and plot each one.

    The following projections are computed in order: sparse random
    projection, truncated SVD ("PCA"), LDA, Isomap, four LLE variants
    (standard, modified, Hessian, LTSA), MDS, random-trees embedding reduced
    by truncated SVD, spectral embedding and t-SNE.  Each result is rescaled
    to the unit square and handed to ``plot_fun``; ``plt.show()`` is called
    once at the end.

    Args:
        X: 2-D array of samples (rows) by features (columns).  When ``None``
            the 6-class digits dataset is loaded and ``y`` is replaced by the
            digits targets, whatever was passed for ``y``.
        y: Labels; used to fit the LDA projection and forwarded unchanged to
            ``plot_fun``.
        plot_fun: Callable invoked as ``plot_fun(X_2d, y)`` for every
            embedding.
        data_name: Text inserted into each plot title.

    Returns:
        None.
    """
    if X is None:
        digits = datasets.load_digits(n_class=6)
        X = digits.data
        y = digits.target

    n_neighbors = 30

    def plot_embedding(embedding, title=None):
        # Rescale every axis to [0, 1] before plotting.
        lo, hi = np.min(embedding, 0), np.max(embedding, 0)
        span = hi - lo
        # A constant axis has zero span; divide by 1 instead so it maps to 0
        # rather than turning into NaN/inf.
        span = np.where(span == 0, 1, span)
        plot_fun((embedding - lo) / span, y)
        if title is not None:
            plt.title(title)

    def timed_title(label, start):
        # Elapsed time is measured at the moment the title is built, i.e.
        # before the plotting call itself runs.
        return "{} of the {} (time {:.2f})".format(label, data_name,
                                                   time() - start)

    #----------------------------------------------------------------------
    # Random 2D projection using a random unitary matrix
    print("Computing random projection")
    rp = random_projection.SparseRandomProjection(n_components=2,
                                                  random_state=42)
    X_projected = rp.fit_transform(X)
    plot_embedding(X_projected,
                   "Random Projection of the {}".format(data_name))

    #----------------------------------------------------------------------
    # Projection on to the first 2 principal components
    print("Computing PCA projection")
    t0 = time()
    X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform(X)
    plot_embedding(X_pca, timed_title("Principal Components projection", t0))

    #----------------------------------------------------------------------
    # Projection on to the first 2 linear discriminant components
    print("Computing LDA projection")
    # The in-place "+= 0.01" below cannot be cast back into an integer array,
    # so integer input is converted to float; float input keeps its dtype.
    if np.issubdtype(X.dtype, np.floating):
        X2 = X.copy()
    else:
        X2 = X.astype(float)
    X2.flat[::X.shape[1] + 1] += 0.01  # Make X invertible
    t0 = time()
    X_lda = lda.LDA(n_components=2).fit_transform(X2, y)
    plot_embedding(X_lda, timed_title("Linear Discriminant projection", t0))

    #----------------------------------------------------------------------
    # Isomap projection of the dataset
    print("Computing Isomap embedding")
    t0 = time()
    # n_neighbors is passed by keyword: recent scikit-learn releases reject
    # positional constructor arguments for the manifold estimators.
    X_iso = manifold.Isomap(n_neighbors=n_neighbors,
                            n_components=2).fit_transform(X)
    print("Done.")
    plot_embedding(X_iso, timed_title("Isomap projection", t0))

    #----------------------------------------------------------------------
    # Locally linear embedding of the dataset, one pass per LLE variant:
    # (method, progress message, title label)
    lle_variants = (
        ('standard', "Computing LLE embedding", "Locally Linear Embedding"),
        ('modified', "Computing modified LLE embedding",
         "Modified Locally Linear Embedding"),
        ('hessian', "Computing Hessian LLE embedding",
         "Hessian Locally Linear Embedding"),
        ('ltsa', "Computing LTSA embedding", "Local Tangent Space Alignment"),
    )
    for method, banner, label in lle_variants:
        print(banner)
        clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                              n_components=2,
                                              method=method)
        t0 = time()
        X_lle = clf.fit_transform(X)
        print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
        plot_embedding(X_lle, timed_title(label, t0))

    #----------------------------------------------------------------------
    # MDS embedding of the dataset
    print("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
    t0 = time()
    X_mds = clf.fit_transform(X)
    print("Done. Stress: %f" % clf.stress_)
    plot_embedding(X_mds, timed_title("MDS embedding", t0))

    #----------------------------------------------------------------------
    # Random Trees embedding of the dataset, reduced to 2-D by truncated SVD
    print("Computing Totally Random Trees embedding")
    hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                           random_state=0,
                                           max_depth=5)
    t0 = time()
    X_transformed = hasher.fit_transform(X)
    pca = decomposition.TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)
    plot_embedding(X_reduced, timed_title("Random forest embedding", t0))

    #----------------------------------------------------------------------
    # Spectral embedding of the dataset
    print("Computing Spectral embedding")
    embedder = manifold.SpectralEmbedding(n_components=2,
                                          random_state=0,
                                          eigen_solver="arpack")
    t0 = time()
    X_se = embedder.fit_transform(X)
    plot_embedding(X_se, timed_title("Spectral embedding", t0))

    #----------------------------------------------------------------------
    # t-SNE embedding of the dataset
    print("Computing t-SNE embedding")
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
    t0 = time()
    X_tsne = tsne.fit_transform(X)
    plot_embedding(X_tsne, timed_title("t-SNE embedding", t0))

    plt.show()