Exemple #1
0
 def umap(self, n_components, metric, data=None):
     """Reduce data to ``n_components`` dimensions with UMAP.

     A new ``UMAP`` model is built with the given ``n_components`` and
     ``metric``. It is fitted on ``data`` when that argument is supplied,
     otherwise on ``self.data``, and the result of ``fit_transform`` is
     returned.
     """
     reducer = UMAP(n_components=n_components, metric=metric)
     source = self.data if data is None else data
     return reducer.fit_transform(source)
Exemple #2
0
 def plot_umap_proc(self, df):
     """Project ``df.Vector`` to 2-D and 3-D with UMAP and plot three colourings.

     The vectors are embedded once in two and once in three dimensions, then
     ``self.plot_umap`` is called for the ``Categ``, ``subject`` and ``chn``
     columns of ``df``, writing into ``self.plot_path``.
     """
     folder = self.plot_path
     # Build the input matrix once; both projections use the same vectors.
     vectors = np.array(df.Vector.tolist())
     umap_2d = UMAP(n_components=2, spread=1, min_dist=0.5, a=0.7, b=1.2)
     umap_3d = UMAP(n_components=3, spread=1, min_dist=0.5, a=0.7, b=1.2)
     proj_2d = umap_2d.fit_transform(vectors)
     proj_3d = umap_3d.fit_transform(vectors)
     self.plot_umap(folder, proj_2d, proj_3d, df.Categ, "Categ", "category-umap")
     self.plot_umap(folder, proj_2d, proj_3d, df.subject, "subject", "subject-umap")
     self.plot_umap(folder, proj_2d, proj_3d, df.chn, "chn", "channel-umap")
Exemple #3
0
class manifold_umap(base_manifold):
    """UMAP manifold trained on principal components of the data table."""

    def __init__(self, parent=None, name='none'):
        base_manifold.__init__(self,
                               parent=parent,
                               name=name,
                               manifold_type='UMAP')

    def train(self, num_pc, n_neighbors=None, min_dist=0.3):
        """
        **Purpose**
            Train the UMAP on the first <num_pc> components of a PCA

            UMAP is generally too computationally heavy to do on a full dataset, so you
            should choose the first few PCs to train the UMAP. Check the pca module
            for a PCA interface you can use to select the best PCs

        **Arguments**
            num_pc (Required)
                Either an integer (use the first <num_pc> PCs) or a list of
                PC numbers; each entry c selects column c - 1 of the PCA output.

            n_neighbors (Required)
                Estimated number of neighbours

            min_dist (Optional, default=0.3)
                minimum distance between points

        **Returns**
            None. The 2-D embedding is stored in self.npos and self.trained
            is set to True.
        """
        assert self.configured, 'umap is not configured, run configure() first'
        assert n_neighbors, 'You must specify an estimate for n_neighbors'

        if isinstance(num_pc, int):
            self.__model = PCA(n_components=num_pc, whiten=self.whiten)
            self.__transform = self.__model.fit_transform(self.data_table)
            self.__pcas = self.__transform

        elif isinstance(num_pc, list):
            self.__model = PCA(n_components=max(num_pc) + 1,
                               whiten=self.whiten)
            self.__transform = self.__model.fit_transform(self.data_table)
            # get only the specific PCs
            self.__pcas = numpy.array(
                [self.__transform[:, c - 1] for c in num_pc]).T
        else:
            raise AssertionError('num_pcs must be either an integer or a list')

        # min_dist was previously accepted but silently ignored; forward it
        # so the documented argument actually affects the embedding.
        self.__model = UMAP(n_components=2,
                            n_neighbors=n_neighbors,
                            min_dist=min_dist,
                            metric='correlation',
                            random_state=self.random_state,
                            verbose=self.verbose)

        self.npos = self.__model.fit_transform(self.__pcas)

        self.trained = True
Exemple #4
0
def calc_umap(X, n_components, n_neighbors, min_dist, spread, random_state):
    """Fit a UMAP model on ``X`` and return the embedding.

    All arguments after ``X`` are forwarded unchanged to the ``UMAP``
    constructor under the same keyword names.
    """
    settings = dict(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        spread=spread,
        random_state=random_state,
    )
    reducer = UMAP(**settings)
    return reducer.fit_transform(X)
Exemple #5
0
def umap(feats, indices):
    """Embed the selected rows of ``feats`` in 3-D using UI-chosen settings.

    The metric, neighbour count and minimum distance are read from three
    ``st`` widgets (a select box and two sliders); the rows
    ``feats[indices, :]`` are then passed to ``UMAP.fit_transform`` and the
    embedding is returned.
    """
    metric_choices = [
        'euclidean', 'manhattan', 'chebyshev', 'minkowski', 'canberra',
        'braycurtis', 'mahalanobis', 'wminkowski', 'seuclidean', 'cosine',
        'correlation'
    ]
    metric = st.selectbox('Metric', metric_choices)
    n_neighbors = st.slider(
        'N Neighbors', min_value=2, max_value=200, value=15, step=1)
    min_dist = st.slider(
        'Minimum Distance', min_value=0.0, max_value=1.0, value=0.1, step=0.01)

    reducer = UMAP(
        n_components=3,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
    )
    return reducer.fit_transform(feats[indices, :])
def test_umap_transform_embedding_stability(iris, iris_selection):
    """Check that ``transform`` leaves the fitted embedding untouched.

    Issue #217: embedding new data with an already trained UMAP transformer
    modified the stored embedding matrix whenever the new data had the same
    number of rows as the training data.
    """
    train = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)
    embedding_before = model.embedding_.copy()

    # The new data deliberately has the same shape as the fitted data,
    # which is the condition that triggered the issue.
    same_shape_input = np.random.random(train.shape)
    model.transform(same_shape_input)

    assert_array_equal(
        embedding_before,
        model.embedding_,
        "Transforming new data changed the original embeddings",
    )

    # Reproduction taken from issue #217
    fit_block = np.random.random((1000, 10))
    query_block = np.random.random((1000, 5))

    reducer = UMAP()
    snapshot = reducer.fit_transform(fit_block[:, :5]).copy()
    assert_array_equal(snapshot, reducer.embedding_)

    reducer.transform(query_block)
    assert_array_equal(snapshot, reducer.embedding_)
Exemple #7
0
def plot_projections(embeds,
                     speakers,
                     ax=None,
                     colors=None,
                     markers=None,
                     legend=True,
                     title=""):
    """Scatter-plot a UMAP projection of ``embeds``, one series per speaker.

    A new 6x6 figure is created when ``ax`` is not given. Colours fall back
    to ``_my_colors`` when ``colors`` is falsy, and markers default to "o".
    Returns the projected coordinates.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 6))

    # Project with UMAP's default settings. Another number of dimensions or
    # a different reduction method (e.g. PCA or TSNE) could be used instead.
    projs = UMAP().fit_transform(embeds)

    # One scatter call per distinct speaker
    speakers = np.array(speakers)
    palette = colors or _my_colors
    for idx, name in enumerate(np.unique(speakers)):
        points = projs[speakers == name]
        if markers is None:
            marker = "o"
        else:
            marker = markers[idx]
        label = name if legend else None
        ax.scatter(*points.T, c=[palette[idx]], marker=marker, label=label)

    if legend:
        ax.legend(title="Speakers", ncol=2)
    ax.set_title(title)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_aspect("equal")

    return projs
    def calc_umap(self,
                  df,
                  n_neighbors=5,
                  min_dist=0.3,
                  metric='correlation',
                  data_type='original_data'):
        """Embed the feature columns of ``df`` with UMAP and save the result.

        The ``Gene_Name`` column and the column named by ``self.cfg.Y`` are
        excluded from the input, then appended back to the embedding. The
        table is written as a tab-separated file named
        ``"UMAP" + data_type + ".tsv"`` under ``self.cfg.unsuperv_out``.

        Returns:
            A tuple ``(X_umap, total_time)``: the embedding DataFrame
            (columns ``d0``, ``d1``, ... plus the two excluded columns) and
            the seconds spent in ``fit_transform``.
        """
        print(">> Running UMAP from " + data_type + "...")
        tmp_drop_cols = ['Gene_Name', self.cfg.Y]
        X = df.drop(tmp_drop_cols, axis=1)

        umap = UMAP(n_neighbors=n_neighbors, min_dist=min_dist, metric=metric)
        t0 = time()
        embedding = umap.fit_transform(X)
        total_time = time() - t0

        # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a
        # fresh 0..n-1 index would misalign rows (and introduce NaNs)
        # whenever df has been filtered or re-indexed.
        X_umap = pd.DataFrame(embedding, index=df.index)
        X_umap.columns = [f'd{c}' for c in X_umap.columns.values]

        X_umap = pd.concat([X_umap, df[tmp_drop_cols]], axis=1)

        filepath = str(self.cfg.unsuperv_out / ("UMAP" + data_type + ".tsv"))
        X_umap.to_csv(filepath, sep='\t', index=False)

        return X_umap, total_time
Exemple #9
0
def embeddingUmap(n_components, n_neighbors, random_state, tfidf_matrix_fit, tfidf_matrix_transform):
    """Fit UMAP on one matrix and embed a second matrix with the fitted model.

    Args:
        n_components: Number of embedding dimensions.
        n_neighbors: Passed to ``UMAP``.
        random_state: Passed to ``UMAP``.
        tfidf_matrix_fit: Data the UMAP model is fitted on.
        tfidf_matrix_transform: Data projected into the fitted space.

    Returns:
        A tuple ``(umap_df, umap_embedding)``: the embedding as a DataFrame
        with columns ``emb_1`` .. ``emb_<n_components>`` and the raw array.
    """
    umap = UMAP(n_components=n_components, n_neighbors=n_neighbors, random_state=random_state).fit(tfidf_matrix_fit)
    print("reducing vector's dimensionality...")
    # Use transform, not fit_transform: refitting here would throw away the
    # model learned from tfidf_matrix_fit.
    umap_embedding = umap.transform(tfidf_matrix_transform)
    umap_df = pd.DataFrame(umap_embedding, columns=[f'emb_{i + 1}' for i in range(n_components)])

    return umap_df, umap_embedding
Exemple #10
0
def dim_red_kmeans(data, cluster, technique):
    """Reduce features to 2-D, cluster them with KMeans and return a scatter.

    Feature columns run from ``gdp_per_capita`` onward when ``cluster`` is
    ``'renda'``, otherwise from ``cardiovasc_death_rate`` onward.
    ``technique`` selects ``'umap'`` or ``'pca'``; any other value uses
    t-SNE. The cluster ids are written into ``data['Cluster']`` (the input
    frame is modified) and a ``px.scatter`` figure is returned.
    """
    first_feature = ('gdp_per_capita'
                     if cluster == 'renda' else 'cardiovasc_death_rate')
    features = data.loc[:, first_feature:]

    if technique == 'umap':
        reducer = UMAP(n_components=2, init='random', random_state=0)
        proj_2d = reducer.fit_transform(features)
    elif technique == 'pca':
        reducer = PCA(n_components=2, random_state=0)
        proj_2d = reducer.fit(features).transform(features)
    else:
        reducer = TSNE(n_components=2, random_state=0)
        proj_2d = reducer.fit_transform(features)

    clusterer = KMeans(
        n_clusters=7,
        init="k-means++",
        max_iter=500,
        n_init=10,
        random_state=123,
    )
    data['Cluster'] = clusterer.fit_predict(proj_2d)

    return px.scatter(
        proj_2d,
        x=0,
        y=1,
        color=data.Cluster,
        labels={'color': 'Cluster'},
        hover_name=data.location,
    )
class UMAPAnalyzer(BaseAnalyzer):
    """
    UMAP analysis for features.
    """
    def compute(
        self,
        n_neighbors=100,
        n_components=2,
        min_dist=0.5,
        metric="euclidean",
        verbose=True,
        n_epochs=1000,
        **kwargs,
    ):
        """Fit UMAP on ``self.features`` and return the embedding.

        The named arguments and any extra ``kwargs`` are forwarded to the
        ``UMAP`` constructor. The fitted model is stored in ``self.model``
        and the embedding in ``self.embedding``.
        """
        self.model = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            min_dist=min_dist,
            metric=metric,
            # Honour the caller's choice; this was hard-coded to True.
            verbose=verbose,
            n_epochs=n_epochs,
            **kwargs,
        )
        embedding = self.model.fit_transform(self.features)
        self.embedding = embedding

        return self.embedding
Exemple #12
0
def plot_projections(embeds,
                     speakers,
                     ax=None,
                     colors=None,
                     markers=None,
                     legend=True,
                     title="",
                     **kwargs):
    """Plot a UMAP projection of ``embeds`` grouped by speaker and show it.

    Extra ``kwargs`` go to the ``UMAP`` constructor. A new 6x6 figure is
    created when ``ax`` is not given, and colours fall back to
    ``_embedding_colors_`` when ``colors`` is falsy. ``legend`` only decides
    whether each series gets a label; ``plt.legend`` and ``plt.show`` are
    always called. Returns the projected coordinates.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 6))

    projs = UMAP(**kwargs).fit_transform(embeds)

    speakers = np.array(speakers)
    palette = colors or _embedding_colors_
    for idx, name in enumerate(np.unique(speakers)):
        points = projs[speakers == name]
        if markers is None:
            marker = "o"
        else:
            marker = markers[idx]
        label = name if legend else None
        ax.scatter(*points.T, c=[palette[idx]], marker=marker, label=label)

    ax.set_title(title)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_aspect("equal")

    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.show()
    return projs
Exemple #13
0
def update_figure(selected_dataset):
    """Build a 3-D UMAP scatter figure for the chosen MNIST sample.

    The input and label CSVs for "MNIST-Digits" or "MNIST-Fashion" are
    downloaded, the labels are turned into integer codes, and a
    label-supervised 3-D UMAP embedding is plotted with ``px.scatter_3d``.

    NOTE(review): an unknown selection returns a 2-tuple
    ``(None, message)`` while the success path returns only the figure.
    Confirm which arity the caller expects.
    """
    if selected_dataset == "MNIST-Digits":
        input_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/mnist-1000-input.csv"
        labels_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/mnist-1000-labels.csv"
    elif selected_dataset == "MNIST-Fashion":
        input_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/fashion-1000-input.csv"
        labels_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/fashion-1000-labels.csv"
    else:
        return None, "Please select a dataset."

    X = pd.read_csv(input_url)
    raw_labels = pd.read_csv(labels_url)
    # Keep only the inverse indices, i.e. one integer code per label
    y = np.unique(raw_labels, return_inverse=True)[1]

    reducer = UMAP(n_components=3, init="random", random_state=0)
    proj_3d = reducer.fit_transform(X, y=y)

    fig = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=y)
    fig.update_layout(transition_duration=500, height=1000)
    fig.update(layout_coloraxis_showscale=False)
    fig.update_traces(marker_size=2)

    return fig
Exemple #14
0
class UMAP:
    """Estimator-style wrapper around ``UMAP_`` that skips non-finite rows."""

    def __init__(self, rfe_cv, *args, **kwargs):
        """Store the ``rfe_cv`` flag and build the wrapped ``UMAP_`` model.

        Remaining positional and keyword arguments go to ``UMAP_``.
        """
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = UMAP_(*args, **kwargs)

    def fit(self, X, y):
        """Do nothing; the model is fitted inside :meth:`predict`."""
        pass

    def predict(self, X):
        """Embed the finite rows of ``X`` and return a row-aligned array.

        Rows containing NaN or +/-inf (after casting to float32) are left
        out of the UMAP fit and come back as all-NaN rows in the result.

        Args:
            X: 2-D numpy array of samples.

        Returns:
            float64 array with ``X.shape[0]`` rows and one column per
            embedding dimension.

        Raises:
            Exception: if ``self.rfe_cv`` is truthy.
        """
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        # A row is usable only if every value is finite (neither NaN nor inf)
        valid_rows = numpy.isfinite(Z).all(axis=1)
        X_ = X[valid_rows, :]
        # Compare the filtered sample to the input. The old check compared
        # two arrays of identical length, so this message never appeared.
        if X_.shape[0] != X.shape[0]:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("UMAP could not be processed with RFE_CV")

        predicted = self.model.fit_transform(X_)
        Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                       fill_value=numpy.nan,
                       dtype=numpy.float64)
        Z[valid_rows, :] = predicted
        return Z
Exemple #15
0
def project_umap(spk_dict: Dict[str, Tensor], seed):
    """Plot a 2-D UMAP projection of speaker embeddings to ``umap_plot.svg``.

    Args:
        spk_dict: Maps a speaker name to a tensor of embeddings stacked
            along dimension 0.
        seed: ``random_state`` passed to UMAP.

    Raises:
        ModuleNotFoundError: if umap, sklearn or matplotlib is missing.
    """
    sorted_speakers = sorted(list(spk_dict.keys()))
    flat_embs = torch.cat([spk_dict[k] for k in sorted_speakers],
                          dim=0).numpy()
    try:
        from umap import UMAP
        from sklearn.preprocessing import StandardScaler
        import matplotlib.pyplot as plt
    except ModuleNotFoundError as err:
        raise ModuleNotFoundError(
            'Please install umap, sklearn, and matplotlib from pypi to plot umap results.'
        ) from err
    data = StandardScaler().fit_transform(flat_embs)
    reducer = UMAP(metric='cosine',
                   verbose=True,
                   n_neighbors=20,
                   random_state=seed)
    reduced_data = reducer.fit_transform(data)
    print(reduced_data.shape)
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 9))
    # Slice by each speaker's actual row count. Splitting into equal chunks
    # assigned points to the wrong speaker when the counts differed.
    start = 0
    for speaker in sorted_speakers:
        stop = start + spk_dict[speaker].shape[0]
        points = reduced_data[start:stop]
        ax.scatter(points[:, 0], points[:, 1])
        start = stop
    ax.legend(sorted_speakers)
    ax.set_xlabel('umap 1st component')
    ax.set_ylabel('umap 2nd component')
    ax.set_title("2D umap projection with n_neighbors=20")
    ax.grid(True)
    plt.tight_layout()
    plt.savefig('umap_plot.svg')
    print("Saved umap plot to umap_plot.svg")
Exemple #16
0
def run_umap(dist, logger=None, labels=None, **kwargs):
    """
    Run UMAP on distances produced by tree2dmat

    Args:
        dist (np.array):        A distance matrix, square or condensed form
        logger (Logger):        Logger to use. default is no logging
        labels:                 Optional targets passed to UMAP as ``y``
        **kwargs:               Passed to the UMAP constructor. ``n_neighbors``
                                defaults to 100 and ``n_components`` to 2;
                                ``verbose`` is always forced to True

    Return:
        emb (np.array):         the UMAP embedding
    """
    if len(dist.shape) == 1:
        if logger is not None:
            logger.info('computing squareform')
        dist = _squareform(dist)

    kwargs.setdefault('n_neighbors', 100)
    kwargs.setdefault('n_components', 2)

    if logger is not None:
        logger.info(
            'computing {n_components} components with UMAP'.format(**kwargs))
        # min_dist has no default here, so look it up safely; formatting
        # with '{min_dist}' raised KeyError when the caller omitted it.
        logger.info('using {} neighbors and {} min_dist'.format(
            kwargs['n_neighbors'], kwargs.get('min_dist', "UMAP's default")))

    # NOTE(review): dist is handed to UMAP as ordinary feature rows unless
    # the caller passes metric='precomputed' - confirm this is intended.
    kwargs['verbose'] = True
    umap = UMAP(**kwargs)
    emb = umap.fit_transform(dist, y=labels)
    return emb
Exemple #17
0
def vanDongenSpectral(args):
    """Score one UMAP + KMeans configuration with ``vanDongen``.

    ``args`` is a 7-tuple ``(neighbors, min_d, components, metric, dataset,
    scaler, k)``. The named dataset is scaled, embedded with UMAP and
    clustered with KMeans; the clusters are cross-tabulated against
    ``metadata.genre`` and the table is passed to ``vanDongen``.
    """
    neighbors, min_d, components, metric, dataset, scaler, k = args

    run_name = dataset + ', ' + metric + ', ' + scaler
    run_params = (', n_components=' + str(components) + ', n_neighbors=' +
                  str(neighbors) + ', min_dist=' + str(min_d) + ', k=' +
                  str(k))
    print(run_name + run_params)

    # Standardise the data with the requested scaler
    scaled = scalers[scaler].fit_transform(datasets[dataset])

    # Apply UMAP
    reducer = UMAP(
        n_components=components,
        n_neighbors=neighbors,
        min_dist=min_d,
        metric=metric,
    )
    embedding = reducer.fit_transform(scaled)

    # Cluster the embedding with KMeans
    clusterer = KMeans(n_clusters=k, random_state=0).fit(embedding)

    # Build the confusion matrix of genre versus cluster
    pairs = pd.DataFrame({'Generos': metadata.genre, 'data': clusterer.labels_})
    contingency = pd.crosstab(pairs['Generos'], pairs['data'])

    return vanDongen(contingency)
Exemple #18
0
def umapper(embed, metric="euclidean", n_neighbors=30, min_dist=1, **kws):
    """Plot a UMAP embedding of ``embed`` and connect its two halves.

    Points are coloured by the module-level ``labels`` using
    ``CLASS_COLOR_DICT``. Row ``i`` of the first half is linked to row ``i``
    of the second half through ``add_connections``. Extra ``kws`` are
    accepted but not used. Returns ``(fig, ax)``.
    """
    reducer = UMAP(metric=metric, n_neighbors=n_neighbors, min_dist=min_dist)
    coords = reducer.fit_transform(embed)

    plot_df = pd.DataFrame(data=coords)
    plot_df["labels"] = labels

    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    sns.scatterplot(
        data=plot_df,
        ax=ax,
        x=0,
        y=1,
        hue="labels",
        palette=CLASS_COLOR_DICT,
        legend=False,
        s=20,
        linewidth=0.5,
        alpha=0.7,
    )
    ax.axis("off")

    # Pair each row of the first half with the same offset in the second
    half = len(embed) // 2
    first_half = np.arange(half)
    second_half = np.arange(half) + half
    add_connections(
        plot_df.iloc[first_half, 0],
        plot_df.iloc[second_half, 0],
        plot_df.iloc[first_half, 1],
        plot_df.iloc[second_half, 1],
        ax=ax,
    )
    return fig, ax
Exemple #19
0
def umap(data, labels=None, ax=None, **kwargs):
    '''Draw a UMAP embedding plot of the data.

    :param matrix data: Input data. Numpy array recommended.
    :param list labels: (Optional) Corresponding labels to each datum. If specified, data points in the plot will be colored according to the label.
    :param axis ax: (Optional) Matplotlib axis to draw the plot on. The current axis is used when omitted.
    :param kwargs: Any other keyword arguments will be passed onto matplotlib.pyplot.scatter.
    :return: The axis the plot was drawn on.
    '''
    if ax is None:
        # ax is documented as optional, so fall back to the current axis
        # instead of failing on None.scatter.
        import matplotlib.pyplot as plt
        ax = plt.gca()

    # Apply UMAP and get embeddings.
    reducer = UMAP()
    embeddings = reducer.fit_transform(data)

    if labels is None:
        ax.scatter(x=embeddings[:, 0], y=embeddings[:, 1], **kwargs)

    else:
        # If labels are attached, color them in different colors
        labels = np.array(labels)
        distinct_labels = set(labels)
        for label in distinct_labels:
            toDraw = (labels == label)  # only draw these points this time

            ax.scatter(x=embeddings[toDraw, 0],
                       y=embeddings[toDraw, 1],
                       label=label,
                       **kwargs)
        # Build the legend once, after every series has been drawn
        if distinct_labels:
            ax.legend(loc='best')
    return ax
Exemple #20
0
def main(dataset):
    """Save PCA, tSNE and UMAP scatter plots for each marker-selection method.

    For every algorithm name in the list below, the markers stored in
    ``output/{dataset}_{alg}_markers_full.npz`` are loaded, the data is
    restricted to those markers, and three 2-D projections are written as
    PDF and PNG files under ``figures/dimred/``.
    """
    adata = getdata(dataset)

    def saveplot(coords, dimred):
        # Reads alg, n_markers and adataproj from the enclosing loop at call
        # time, so it must only be called after they have been assigned.
        plt.figure()
        plt.scatter(
            coords[:, 0],
            coords[:, 1],
            s=2,
            # Colour index is the "y" observation modulo 9
            c=adataproj.obs["y"].values % 9,
            cmap="Set1",
        )
        # Hide ticks and tick labels on both axes
        plt.tick_params(
            axis="both",
            which="both",
            bottom=False,
            labelbottom=False,
            left=False,
            labelleft=False,
        )
        plt.savefig(
            f"figures/dimred/{dataset}_{alg}_{n_markers}markers_{dimred}.pdf",
            format="pdf",
        )
        plt.savefig(
            f"figures/dimred/{dataset}_{alg}_{n_markers}markers_{dimred}.png",
            format="png",
        )
        plt.close()

    for alg in [
            "cife",
            "bincife",
            "jmi",
            "binmim",
            "logreg",
            "t-test_overestim_var",
            "wilcoxon",
    ]:
        markers = np.load(
            f"output/{dataset}_{alg}_markers_full.npz")["markers"]
        # A 2-D marker array contributes its first column; a 1-D array is
        # cut down to its first 10 entries.
        if len(markers.shape) > 1:
            markers = markers[:, 0].flatten()
        else:
            markers = markers[:10]
        n_markers = len(markers)
        # Work on a copy restricted to the selected marker columns
        adataproj = adata[:, markers].copy()
        plotprep(adataproj)
        print("Computing PCA coords")
        Xpca = pr.plot.pca(adataproj.X, 2, return_info=False)
        saveplot(Xpca, "pca")
        print("Computing tSNE coords")
        t = TSNE()
        # TSNE is given a dense array; UMAP below receives adataproj.X as is
        Xtsne = t.fit_transform(adataproj.X.toarray())
        saveplot(Xtsne, "tsne")
        print("Computing UMAP coords")
        u = UMAP()
        Xumap = u.fit_transform(adataproj.X)
        saveplot(Xumap, "umap")
Exemple #21
0
def umapfigure(adata, **scatterkwargs):
    """Plot the UMAP embedding of ``adata``, computing it first if missing.

    When ``adata.obsm`` has no ``"X_umap"`` entry, a PCA with 30 components
    is run unless ``"X_pca"`` already exists with at least 30 columns. UMAP
    is then fitted on the first 30 PCA columns and stored in
    ``adata.obsm["X_umap"]``. ``scatterkwargs`` go to ``genericplot``.
    """
    if "X_umap" not in adata.obsm_keys():
        pca_ready = ("X_pca" in adata.obsm_keys()
                     and adata.obsm["X_pca"].shape[1] >= 30)
        if not pca_ready:
            pca(adata, 30, zero_center=not scipy.sparse.issparse(adata.X))
        reducer = UMAP()
        leading_pcs = adata.obsm["X_pca"][:, :30]
        adata.obsm["X_umap"] = reducer.fit_transform(leading_pcs)
    return genericplot(adata, adata.obsm["X_umap"], **scatterkwargs)
Exemple #22
0
 def get(self, x, labels, clu, eval):
     """Cluster a label-guided UMAP embedding, keeping the given labels.

     ``x`` is embedded with ``UMAP(**self.kwargs)`` using ``labels`` as the
     target. ``clu.get(embedding, eval)`` supplies new labels, and every
     position whose input label is not -1 is overwritten with that input
     label before returning.
     """
     reducer = UMAP(**self.kwargs)
     self.logger.info("Finding embeddings.")
     embedding = reducer.fit_transform(x, y=labels)
     merged = clu.get(embedding, eval)
     known = np.where(labels != -1)
     merged[known] = labels[known]
     return merged
Exemple #23
0
class UMAP_Preprocessed:
    """Clusterer that runs a 2-D UMAP reduction before clustering.

    ``clusterer`` starts as None and must be assigned an object with a
    ``fit_predict`` method before :meth:`fit_predict` is called.
    """

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted but not used here.
        self.preprocessor = UMAP(n_neighbors=30, min_dist=0, n_components=2)
        self.clusterer = None

    def fit_predict(self, X):
        """Embed ``X`` with UMAP and return the clusterer's predictions.

        Raises:
            AttributeError: if ``self.clusterer`` has not been set.
        """
        # Fail before the expensive UMAP fit rather than after it.
        if self.clusterer is None:
            raise AttributeError(
                "clusterer is not set; assign an object with fit_predict "
                "before calling fit_predict()")
        X = self.preprocessor.fit_transform(X)
        return self.clusterer.fit_predict(X)
 def plot_UMAP(self, features):
     """Show a scatter plot of a 2-D UMAP projection of ``features``.

     The projection uses random initialisation and ``random_state=0``.
     """
     umap_2d = UMAP(n_components=2, init='random', random_state=0)
     print("Computing projections...")
     proj_2d = umap_2d.fit_transform(features)
     print("Plotting...")
     # Pass the components as x and y. A bare 2-D array given as data= is
     # read as wide-form input and each column is plotted against the row
     # index instead.
     sns.scatterplot(x=proj_2d[:, 0], y=proj_2d[:, 1])
     plt.grid(True)
     plt.show()
Exemple #25
0
def umap_reduce(data, **kwargs):
    """Embed ``data`` with ``cumlUMAP``, falling back to ``UMAP`` on failure.

    ``kwargs`` are passed to whichever constructor is used. If building or
    fitting ``cumlUMAP`` raises ``RuntimeError`` or ``TypeError``, a warning
    is issued and ``UMAP`` is used instead.

    Returns:
        A tuple ``(embedding, reducer)`` with the embedding and the fitted
        model that produced it.
    """
    try:
        reducer = cumlUMAP(**kwargs)
        embedding = reducer.fit_transform(data)
    except (RuntimeError, TypeError) as e:
        # warnings.warn expects a string (or a Warning instance); passing
        # the raw exception object breaks message-based warning filters.
        warnings.warn(f"cumlUMAP failed ({e!r}); falling back to UMAP")
        reducer = UMAP(**kwargs)
        embedding = reducer.fit_transform(data)
    return embedding, reducer
Exemple #26
0
def embed_umap(data):
    """Return a UMAP embedding of ``data`` (expected on CPU, as numpy).

    Uses the euclidean metric with 40 neighbours; ``transform_seed`` is
    taken from ``torch.initial_seed()``.
    """
    # angular_rp_forest and random_state are intentionally left at their
    # defaults; only transform_seed is tied to the torch seed.
    seed = torch.initial_seed()
    reducer = UMAP(metric='euclidean', n_neighbors=40, transform_seed=seed)
    return reducer.fit_transform(data)
 def umapDataReductionTo2D(self):
     """
     Reduce the vectorized corpus to 2-D with UMAP (Uniform Manifold
     Approximation and Projection) using a fixed random state, and store
     the result together with the document ids as a FeatureMatrixData.
     :return: None
     """
     reducer = UMAP(n_components=2, random_state=1)
     embedded = reducer.fit_transform(self.__vectorized_corpus)
     self.__reduced_dim_feature_data = FeatureMatrixData(
         embedded, self.__document_ids)
Exemple #28
0
def reduceWithUMAP(vectors, size):
    """Reduce ``vectors`` to ``size`` features with UMAP and return them.

    Uses 15 neighbours, a minimum distance of 0.1 and the euclidean metric.
    A progress message is written with ``log`` first.
    """
    log(f'Reducing data to {size} features using UMAP (slow-ish)')
    reducer = UMAP(
        n_neighbors=15,
        min_dist=0.1,
        metric='euclidean',
        n_components=size,
    )
    return reducer.fit_transform(vectors)
Exemple #29
0
def get_umap_projection(**kwargs):
    '''Write (or reuse) the UMAP layout for the vectors in kwargs['vecs'].

    Returns the cached layout path when it already exists and
    kwargs['use_cache'] is truthy. Otherwise a UMAP model is fitted with
    kwargs['n_neighbors'], kwargs['min_dist'] and kwargs['metric'], and
    the result of write_layout is returned.
    '''
    print(' * creating UMAP layout')
    out_path = get_path('layouts', 'umap', **kwargs)
    cache_hit = os.path.exists(out_path) and kwargs['use_cache']
    if cache_hit:
        return out_path
    settings = {
        name: kwargs[name] for name in ('n_neighbors', 'min_dist', 'metric')
    }
    reducer = UMAP(**settings)
    positions = reducer.fit_transform(kwargs['vecs'])
    return write_layout(out_path, positions, **kwargs)
Exemple #30
0
    def on_epoch_begin(self, model):
        """Score the model on every test set and append the results to a log.

        For each ``(df, seed)`` pair in the module-level ``test_data``, the
        texts are converted to vectors with ``model.infer_vector``, reduced
        with UMAP and then Ivis, turned into outlier predictions and scored
        against ``df["outlier_label"]``. Results are accumulated in
        ``self.result_df`` and written to ``self.log_path`` after every
        pair. ``self.epoch`` is incremented at the end.
        """
        print(
            f"\n----------------\n\nEnd of epoch {self.epoch}. Getting scores..."
        )
        # NOTE(review): this one dict is created once and passed to
        # get_scores for every seed - confirm values are not meant to be
        # reset between test sets.
        scores = defaultdict(list)
        scores["epoch"] = self.epoch
        for df, seed in test_data:
            print(f"Vectorize...")

            # Preprocess each text, then infer one vector per document
            docvecs = df["text"].progress_apply(lambda x: simple_preprocess(x))
            docvecs = docvecs.progress_apply(lambda x: model.infer_vector(x))

            print(f"Reduce dimensions...")
            dim_reducer = UMAP(metric="cosine",
                               set_op_mix_ratio=1.0,
                               n_components=256,
                               random_state=42)

            dim_reduced_vecs = dim_reducer.fit_transform(list(docvecs))

            print(f"Run ivis...")
            # Second reduction stage: Ivis down to a single dimension
            dim_reducer = Ivis(embedding_dims=1,
                               k=15,
                               model="maaten",
                               n_epochs_without_progress=10,
                               verbose=0)
            decision_scores = dim_reducer.fit_transform(dim_reduced_vecs)
            decision_scores = decision_scores.astype(float)

            print(f"Get and save scores...")
            preds = reject_outliers(decision_scores,
                                    iq_range=1.0 - contamination)
            # Truthy entries become -1, everything else becomes 1
            preds = [-1 if x else 1 for x in preds]

            scores = get_scores(scores, df["outlier_label"], preds)
            scores["seed"] = seed
            print(
                f"Scores for epoch {self.epoch} | seed - {seed}:\n{pd.DataFrame(scores, index=[0])}"
            )

            # NOTE(review): DataFrame.append was removed in pandas 2.0;
            # this line needs pd.concat on newer pandas - confirm the
            # pinned version.
            self.result_df = self.result_df.append(scores, ignore_index=True)
            # The log file is rewritten after every test set
            self.result_df.to_csv(self.log_path, sep="\t")
        self.epoch += 1
Exemple #31
0
class TUmap(Transform):
    """
    n_neighbors:
        This determines the number of neighboring points used in local approximations
        of manifold structure. Larger values will result in more global structure being
        preserved at the loss of detailed local structure.
        In general this parameter should often be in the range 5 to 50,
        with a choice of 10 to 15 being a sensible default.
    min_dist:
        This controls how tightly the embedding is allowed compress points together.
        Larger values ensure embedded points are more evenly distributed, while smaller
        values allow the algorithm to optimise more accurately with regard to local structure.
        Sensible values are in the range 0.001 to 0.5, with 0.1 being a reasonable default.
    metric:
        This determines the choice of metric used to measure distance in the input space.
        A wide variety of metrics are already coded, and a user defined function can be passed
        as long as it has been JITd by numba.
    n_components, spread, random_state:
        Passed unchanged to the underlying UMAP instance.
    """

    def __init__(
        self,
        n_neighbors=15,
        min_dist=0.1,
        metric="euclidean",
        n_components=2,
        spread=1.0,
        random_state=None
    ):
        self._inst = UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            metric=metric,
            n_components=n_components,
            spread=spread,
            # random_state was accepted but never forwarded, so callers
            # could not make the embedding reproducible.
            random_state=random_state,
        )

    def transform(self, fp):
        """Yield one ``Feature`` per UMAP dimension of the feature pool.

        This is a generator: the UMAP fit runs when iteration starts.
        """
        x = FeaturePool(fp).array()
        logger.info("TUmap: starting UMAP transform ...")
        x_emb = self._inst.fit_transform(x)
        logger.info("TUamp: Done")

        for f_id in range(x_emb.shape[1]):
            yield Feature(
                "UMAP feature #{}".format(f_id),
                x_emb[:, f_id]
            )

    @staticmethod
    def plot_embedding(efp: FeaturePool, split_by=None):
        """Scatter-plot a 2-column embedding, optionally coloured by a feature.

        ``split_by`` supplies the colours through ``split_by.data`` and the
        title through ``split_by.name``. An AssertionError is raised when
        the embedding does not have exactly two columns.
        """
        x = efp.array()
        assert x.shape[1] == 2, "Embedding is expected to be with the size 2 to plot, got {}".format(x.shape[1])
        fig = plt.figure(figsize=(7, 7))
        ax = fig.add_subplot(111)
        if split_by is not None:
            ax.scatter(x[:, 0], x[:, 1], c=split_by.data, alpha=0.5)
            title = "UMAP for a feature pool splitted by feature `{}`".format(split_by.name)
        else:
            ax.scatter(x[:, 0], x[:, 1], alpha=0.5)
            title = "UMAP for a feature pool"
        ax.set_title(title)
        fig.show()
from datetime import datetime
from util import getKaggleMNIST
from sklearn.linear_model import LogisticRegression
from umap import UMAP

# Load the train/test split
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

# Baseline: logistic regression on the untransformed inputs
print("Score without transformation:")
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
baseline_train_score = model.score(Xtrain, Ytrain)
print(baseline_train_score)
baseline_test_score = model.score(Xtest, Ytest)
print(baseline_test_score)


# Fit UMAP on the training set, then project the test set with the same
# model, timing each step
umapper = UMAP(n_neighbors=5, n_components=10)
started = datetime.now()
Ztrain = umapper.fit_transform(Xtrain)
fit_elapsed = datetime.now() - started
print("umap fit_transform took:", fit_elapsed)
started = datetime.now()
Ztest = umapper.transform(Xtest)
transform_elapsed = datetime.now() - started
print("umap transform took:", transform_elapsed)

# Same classifier, now trained on the UMAP features
print("Score with transformation")
model = LogisticRegression()
started = datetime.now()
model.fit(Ztrain, Ytrain)
train_elapsed = datetime.now() - started
print("logistic regression fit took:", train_elapsed)
embedded_train_score = model.score(Ztrain, Ytrain)
print(embedded_train_score)
embedded_test_score = model.score(Ztest, Ytest)
print(embedded_test_score)