Python UMAP.fit_transform Exemples, umap.UMAP.fit_transform Python Exemples

Exemple #1

0

Afficher le fichier

 def umap(self, n_components, metric, data=None):
     """Embed samples into ``n_components`` dimensions with UMAP.

     Args:
         n_components: Dimensionality of the returned embedding.
         metric: Distance metric forwarded to ``UMAP``.
         data: Samples to embed. When ``None``, ``self.data`` is used.

     Returns:
         The result of ``UMAP.fit_transform`` on the chosen samples.
     """
     reducer = UMAP(n_components=n_components, metric=metric)
     samples = self.data if data is None else data
     return reducer.fit_transform(samples)

Exemple #2

0

Afficher le fichier

 def plot_umap_proc(self, df):
     """Project ``df.Vector`` to 2-D and 3-D with UMAP and plot three colourings.

     The same pair of projections is handed to ``self.plot_umap`` three
     times, once each for the ``Categ``, ``subject`` and ``chn`` columns.

     Args:
         df: Frame exposing ``Vector``, ``Categ``, ``subject`` and ``chn``.
     """
     folder = self.plot_path
     # Build the sample matrix once; both projections use the same input.
     vectors = np.array(df.Vector.tolist())
     umap_2d = UMAP(n_components=2, spread=1, min_dist=0.5, a=0.7, b=1.2)
     umap_3d = UMAP(n_components=3, spread=1, min_dist=0.5, a=0.7, b=1.2)
     proj_2d = umap_2d.fit_transform(vectors)
     proj_3d = umap_3d.fit_transform(vectors)
     self.plot_umap(folder, proj_2d, proj_3d, df.Categ, "Categ", "category-umap")
     self.plot_umap(folder, proj_2d, proj_3d, df.subject, "subject", "subject-umap")
     self.plot_umap(folder, proj_2d, proj_3d, df.chn, "chn", "channel-umap")

Exemple #3

0

Afficher le fichier

class manifold_umap(base_manifold):
    """UMAP manifold built on top of ``base_manifold``."""

    def __init__(self, parent=None, name='none'):
        base_manifold.__init__(self,
                               parent=parent,
                               name=name,
                               manifold_type='UMAP')

    def train(self, num_pc, n_neighbors=None, min_dist=0.3):
        """
        **Purpose**
            Train the UMAP on the first <num_pc> components of a PCA

            UMAP is generally too computationally heavy to do on a full dataset, so you
            should choose the first few PCs to train the UMAP. Check the pca module
            for a PCA interface you can use to select the best PCs

        **Arguments**
            num_pc (Required)
                Either an integer (use the first <num_pc> PCs) or a list of
                PC numbers. List entries are looked up as column ``c - 1``
                of the PCA output, i.e. they are 1-based.

            n_neighbors (Required)
                Estimated number of neighbours

            min_dist (Optional, default=0.3)
                minimum distance between points

        **Returns**
            None. The embedding is stored in ``self.npos`` and
            ``self.trained`` is set to True.
        """
        assert self.configured, 'umap is not configured, run configure() first'
        assert n_neighbors, 'You must specify an estimate for n_neighbors'

        if isinstance(num_pc, int):
            self.__model = PCA(n_components=num_pc, whiten=self.whiten)
            self.__transform = self.__model.fit_transform(self.data_table)
            self.__pcas = self.__transform

        elif isinstance(num_pc, list):
            self.__model = PCA(n_components=max(num_pc) + 1,
                               whiten=self.whiten)
            self.__transform = self.__model.fit_transform(self.data_table)
            # get only the specific PCs
            self.__pcas = numpy.array(
                [self.__transform[:, c - 1] for c in num_pc]).T
        else:
            raise AssertionError('num_pcs must be either an integer or a list')

        # min_dist was previously accepted but silently dropped, so UMAP ran
        # with its own default instead of the documented value.
        self.__model = UMAP(n_components=2,
                            n_neighbors=n_neighbors,
                            min_dist=min_dist,
                            metric='correlation',
                            random_state=self.random_state,
                            verbose=self.verbose)

        self.npos = self.__model.fit_transform(self.__pcas)

        self.trained = True

Exemple #4

0

Afficher le fichier

def calc_umap(X, n_components, n_neighbors, min_dist, spread, random_state):
    """Fit a UMAP model with the given hyper-parameters and return the embedding of ``X``."""
    settings = dict(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        spread=spread,
        random_state=random_state,
    )
    reducer = UMAP(**settings)
    embedding = reducer.fit_transform(X)
    return embedding

Exemple #5

0

Afficher le fichier

def umap(feats, indices):
    """Embed the selected rows of ``feats`` in 3-D using widget-chosen UMAP settings.

    The metric, neighbour count and minimum distance are read from
    ``st`` widgets (one select box and two sliders) each time this runs.

    Args:
        feats: Indexable as ``feats[indices, :]``.
        indices: Row selector applied to ``feats``.

    Returns:
        The result of ``UMAP.fit_transform`` on the selected rows.
    """
    metric_choices = [
        'euclidean', 'manhattan', 'chebyshev', 'minkowski', 'canberra',
        'braycurtis', 'mahalanobis', 'wminkowski', 'seuclidean', 'cosine',
        'correlation'
    ]
    metric = st.selectbox('Metric', metric_choices)
    n_neighbors = st.slider(
        'N Neighbors', min_value=2, max_value=200, value=15, step=1)
    min_dist = st.slider(
        'Minimum Distance', min_value=0.0, max_value=1.0, value=0.1, step=0.01)

    reducer = UMAP(n_components=3,
                   n_neighbors=n_neighbors,
                   min_dist=min_dist,
                   metric=metric)
    selected = feats[indices, :]
    return reducer.fit_transform(selected)

Exemple #6

0

Afficher le fichier

Fichier : test_umap_ops.py Projet : ginihumer/latent-projective-interventions

def test_umap_transform_embedding_stability(iris, iris_selection):
    """Check that ``transform`` leaves the fitted embedding untouched.

    Issue #217 reports that embedding new data with an already trained
    UMAP transformer modified the fitted embedding matrix whenever the new
    data had the same number of rows as the training data.
    """
    train = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)
    embedding_before = model.embedding_.copy()

    # Deliberately use new data with exactly as many rows as the fit data,
    # which is the condition that triggered the issue.
    same_shape_data = np.random.random(train.shape)
    model.transform(same_shape_data)

    assert_array_equal(
        embedding_before,
        model.embedding_,
        "Transforming new data changed the original embeddings",
    )

    # Reproduction taken from issue #217
    a = np.random.random((1000, 10))
    b = np.random.random((1000, 5))

    reducer = UMAP()
    fitted = reducer.fit_transform(a[:, :5])
    fitted_snapshot = fitted.copy()
    assert_array_equal(fitted_snapshot, reducer.embedding_)

    reducer.transform(b)
    assert_array_equal(fitted_snapshot, reducer.embedding_)

Exemple #7

0

Afficher le fichier

def plot_projections(embeds,
                     speakers,
                     ax=None,
                     colors=None,
                     markers=None,
                     legend=True,
                     title=""):
    """Scatter-plot a UMAP projection of ``embeds``, one series per speaker.

    Args:
        embeds: Embeddings passed to ``UMAP().fit_transform``.
        speakers: One speaker identifier per embedding.
        ax: Axes to draw on; a new 6x6 figure is created when ``None``.
        colors: Per-speaker colours; falls back to ``_my_colors`` when falsy.
        markers: Per-speaker markers; ``"o"`` is used when ``None``.
        legend: Whether to label the series and draw a legend.
        title: Axes title.

    Returns:
        The projected coordinates.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 6))

    # Project with UMAP's defaults. Another number of dimensions (e.g. for a
    # 3D plot) or a different dimensionality reduction such as PCA or TSNE
    # could be used instead.
    projs = UMAP().fit_transform(embeds)

    # One scatter call per distinct speaker.
    speakers = np.array(speakers)
    colors = colors or _my_colors
    for idx, speaker in enumerate(np.unique(speakers)):
        points = projs[speakers == speaker]
        if markers is None:
            marker = "o"
        else:
            marker = markers[idx]
        series_label = speaker if legend else None
        ax.scatter(*points.T,
                   c=[colors[idx]],
                   marker=marker,
                   label=series_label)

    if legend:
        ax.legend(title="Speakers", ncol=2)
    ax.set_title(title)
    for clear_ticks in (ax.set_xticks, ax.set_yticks):
        clear_ticks([])
    ax.set_aspect("equal")

    return projs

Exemple #8

0

Afficher le fichier

Fichier : dimens_reduction.py Projet : astrazeneca-cgr-publications/mantis-ml-release

    def calc_umap(self,
                  df,
                  n_neighbors=5,
                  min_dist=0.3,
                  metric='correlation',
                  data_type='original_data'):
        """Embed the feature columns of ``df`` with UMAP and save the result.

        The ``'Gene_Name'`` and ``self.cfg.Y`` columns are excluded from the
        embedding input and re-attached to the output, which is written as a
        tab-separated file under ``self.cfg.unsuperv_out``.

        Args:
            df: Input frame containing the feature columns plus the two
                columns listed above.
            n_neighbors: Forwarded to ``UMAP``.
            min_dist: Forwarded to ``UMAP``.
            metric: Forwarded to ``UMAP``.
            data_type: Tag used in the progress message and output file name.

        Returns:
            Tuple ``(X_umap, total_time)``: the embedding frame (columns
            ``d0``, ``d1``, ... plus the two re-attached columns) and the
            seconds spent in ``fit_transform``.
        """
        print(">> Running UMAP from " + data_type + "...")
        tmp_drop_cols = ['Gene_Name', self.cfg.Y]
        X = df.drop(tmp_drop_cols, axis=1)

        umap = UMAP(n_neighbors=n_neighbors, min_dist=min_dist, metric=metric)
        t0 = time()
        X_umap = umap.fit_transform(X)
        total_time = time() - t0

        X_umap = pd.DataFrame(X_umap)
        X_umap.columns = ['d' + str(c) for c in X_umap.columns.values]

        # The embedding frame has a fresh 0..n-1 index. concat(axis=1) aligns
        # on index labels, so re-attach the metadata positionally; otherwise
        # a filtered or re-indexed ``df`` would yield misaligned NaN rows.
        meta = df[tmp_drop_cols].reset_index(drop=True)
        X_umap = pd.concat([X_umap, meta], axis=1)

        filepath = str(self.cfg.unsuperv_out / ("UMAP" + data_type + ".tsv"))
        X_umap.to_csv(filepath, sep='\t', index=False)

        return X_umap, total_time

Exemple #9

0

Afficher le fichier

def embeddingUmap(n_components, n_neighbors, random_state, tfidf_matrix_fit, tfidf_matrix_transform):
    """Fit UMAP on one matrix and embed another matrix with the fitted model.

    Args:
        n_components: Dimensionality of the embedding.
        n_neighbors: Forwarded to ``UMAP``.
        random_state: Forwarded to ``UMAP``.
        tfidf_matrix_fit: Matrix the model is fitted on.
        tfidf_matrix_transform: Matrix that is projected into the fitted space.

    Returns:
        Tuple ``(umap_df, umap_embedding)``: the embedding as a DataFrame
        with columns ``emb_1`` .. ``emb_<n_components>`` and as the raw array.
    """
    umap = UMAP(n_components=n_components, n_neighbors=n_neighbors, random_state=random_state).fit(tfidf_matrix_fit)
    print("reducing vector's dimensionality...")
    # Use transform(), not fit_transform(): refitting here would discard the
    # model learned from tfidf_matrix_fit on the line above.
    umap_embedding = umap.transform(tfidf_matrix_transform)
    umap_df = pd.DataFrame(umap_embedding, columns=[f'emb_{i + 1}' for i in range(n_components)])

    return umap_df, umap_embedding

Exemple #10

0

Afficher le fichier

def dim_red_kmeans(data, cluster, technique):
    """Reduce the feature columns to 2-D, cluster with KMeans and plot.

    Args:
        data: Frame with a ``location`` column. Features are every column
            from ``'gdp_per_capita'`` onwards when ``cluster == 'renda'``,
            otherwise from ``'cardiovasc_death_rate'`` onwards. A
            ``'Cluster'`` column is written into this frame.
        cluster: Selects which column the feature slice starts at.
        technique: ``'umap'`` or ``'pca'``; any other value uses t-SNE.

    Returns:
        The ``px.scatter`` figure of the 2-D projection coloured by cluster.
    """
    first_feature = ('gdp_per_capita'
                     if cluster == 'renda' else 'cardiovasc_death_rate')
    features = data.loc[:, first_feature:]

    if technique == 'umap':
        reducer = UMAP(n_components=2, init='random', random_state=0)
        proj_2d = reducer.fit_transform(features)
    elif technique == 'pca':
        reducer = PCA(n_components=2, random_state=0)
        fitted = reducer.fit(features)
        proj_2d = fitted.transform(features)
    else:
        reducer = TSNE(n_components=2, random_state=0)
        proj_2d = reducer.fit_transform(features)

    clusterer = KMeans(n_clusters=7,
                       init="k-means++",
                       max_iter=500,
                       n_init=10,
                       random_state=123)
    # Side effect: the cluster assignment is stored on the caller's frame.
    data['Cluster'] = clusterer.fit_predict(proj_2d)

    figure = px.scatter(proj_2d,
                        x=0,
                        y=1,
                        color=data.Cluster,
                        labels={'color': 'Cluster'},
                        hover_name=data.location)
    return figure

Exemple #11

0

Afficher le fichier

Fichier : feature_analyzer.py Projet : sailfish009/bondnet

class UMAPAnalyzer(BaseAnalyzer):
    """
    UMAP analysis for features.
    """
    def compute(
        self,
        n_neighbors=100,
        n_components=2,
        min_dist=0.5,
        metric="euclidean",
        verbose=True,
        n_epochs=1000,
        **kwargs,
    ):
        """Fit UMAP on ``self.features`` and store the model and embedding.

        Args:
            n_neighbors: Forwarded to ``UMAP``.
            n_components: Forwarded to ``UMAP``.
            min_dist: Forwarded to ``UMAP``.
            metric: Forwarded to ``UMAP``.
            verbose: Forwarded to ``UMAP``.
            n_epochs: Forwarded to ``UMAP``.
            **kwargs: Any further keyword arguments for ``UMAP``.

        Returns:
            The embedding, which is also stored as ``self.embedding``. The
            fitted model is stored as ``self.model``.
        """
        self.model = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            min_dist=min_dist,
            metric=metric,
            # Honour the caller's choice; this was hard-coded to True.
            verbose=verbose,
            n_epochs=n_epochs,
            **kwargs,
        )
        embedding = self.model.fit_transform(self.features)
        self.embedding = embedding

        return self.embedding

Exemple #12

0

Afficher le fichier

Fichier : plotting.py Projet : Amuoeba/ARP_modeling

def plot_projections(embeds,
                     speakers,
                     ax=None,
                     colors=None,
                     markers=None,
                     legend=True,
                     title="",
                     **kwargs):
    """Scatter-plot a UMAP projection of ``embeds``, one series per speaker.

    Args:
        embeds: Embeddings passed to ``UMAP.fit_transform``.
        speakers: One speaker identifier per embedding.
        ax: Axes to draw on; a new 6x6 figure is created when ``None``.
        colors: Per-speaker colours; falls back to ``_embedding_colors_``
            when falsy.
        markers: Per-speaker markers; ``"o"`` is used when ``None``.
        legend: Whether to label the series and draw a legend.
        title: Axes title.
        **kwargs: Forwarded to the ``UMAP`` constructor.

    Returns:
        The projected coordinates. ``plt.show()`` is called before returning.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 6))

    reducer = UMAP(**kwargs)

    projs = reducer.fit_transform(embeds)

    speakers = np.array(speakers)
    colors = colors or _embedding_colors_
    for i, speaker in enumerate(np.unique(speakers)):
        speaker_projs = projs[speakers == speaker]
        marker = "o" if markers is None else markers[i]
        label = speaker if legend else None
        ax.scatter(*speaker_projs.T, c=[colors[i]], marker=marker, label=label)

    ax.set_title(title)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_aspect("equal")

    # Attach the legend to the axes that were actually drawn on (plt.legend
    # targets the *current* axes, which need not be ``ax``), and skip it
    # when the caller asked for no legend.
    if legend:
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.show()
    return projs

Exemple #13

0

Afficher le fichier

def update_figure(selected_dataset):
    """Build a 3-D UMAP scatter figure for the selected dataset.

    Args:
        selected_dataset: ``"MNIST-Digits"`` or ``"MNIST-Fashion"``.

    Returns:
        The plotly figure for a recognised dataset. For any other value the
        tuple ``(None, "Please select a dataset.")`` is returned.
        NOTE(review): the two return shapes differ (figure vs. 2-tuple) —
        confirm against how the caller consumes the result.
    """
    if selected_dataset == "MNIST-Digits":
        input_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/mnist-1000-input.csv"
        labels_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/mnist-1000-labels.csv"
    elif selected_dataset == "MNIST-Fashion":
        input_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/fashion-1000-input.csv"
        labels_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/fashion-1000-labels.csv"
    else:
        return None, "Please select a dataset."

    X = pd.read_csv(input_url)
    raw_labels = pd.read_csv(labels_url)
    # Replace each label by its index among the distinct label values.
    y = np.unique(raw_labels, return_inverse=True)[1]

    reducer = UMAP(n_components=3, init="random", random_state=0)
    # The labels are handed to UMAP as ``y``.
    proj_3d = reducer.fit_transform(X, y=y)

    fig = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=y)
    fig.update_layout(transition_duration=500, height=1000)
    fig.update(layout_coloraxis_showscale=False)
    fig.update_traces(marker_size=2)
    return fig

Exemple #14

0

Afficher le fichier

Fichier : new_insane.py Projet : redjerdai/thethWyrm

class UMAP:
    """Wrapper around ``UMAP_`` exposing a ``fit``/``predict`` interface.

    ``predict`` embeds only the rows of the input that are free of NaN and
    infinite values and returns NaN rows for the others.
    """

    def __init__(self, rfe_cv, *args, **kwargs):
        """Store ``rfe_cv`` and build the wrapped model from ``*args``/``**kwargs``."""
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = UMAP_(*args, **kwargs)

    def fit(self, X, y):
        """Do nothing; the wrapped model is fitted inside ``predict``."""
        pass

    def predict(self, X):
        """Embed the valid rows of ``X`` and return a row-aligned result.

        Args:
            X: 2-D numpy array of samples.

        Returns:
            float64 array with one row per row of ``X``. Rows that contained
            NaN or +/-inf (after conversion to float32) are filled with NaN;
            the remaining rows hold the output of ``fit_transform``.

        Raises:
            Exception: If ``self.rfe_cv`` is truthy.
        """
        # Validity is judged on a float32 copy, as before.
        as_float = numpy.array(X, dtype=numpy.float32)
        valid_rows = numpy.isfinite(as_float).all(axis=1)
        X_ = X[valid_rows, :]

        # Compare against the filtered array. The previous check compared two
        # arrays with identical row counts, so the message never appeared.
        n_dropped = X.shape[0] - X_.shape[0]
        if n_dropped:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(n_dropped))

        if self.rfe_cv:
            raise Exception("UMAP could not be processed with RFE_CV")

        predicted = self.model.fit_transform(X_)
        Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                       fill_value=numpy.nan,
                       dtype=numpy.float64)
        Z[valid_rows, :] = predicted
        return Z

Exemple #15

0

Afficher le fichier

def project_umap(spk_dict: Dict[str, Tensor], seed):
    """Plot a 2-D UMAP projection of per-speaker embeddings to ``umap_plot.svg``.

    Args:
        spk_dict: Maps a speaker name to a tensor of that speaker's
            embeddings, stacked along dim 0.
        seed: Passed to ``UMAP`` as ``random_state``.

    Raises:
        ModuleNotFoundError: If umap, sklearn or matplotlib cannot be imported.
    """
    sorted_speakers = sorted(spk_dict.keys())
    flat_embs = torch.cat([spk_dict[k] for k in sorted_speakers],
                          dim=0).numpy()
    try:
        from umap import UMAP
        from sklearn.preprocessing import StandardScaler
        import matplotlib.pyplot as plt
    except ModuleNotFoundError as err:
        raise ModuleNotFoundError(
            'Please install umap, sklearn, and matplotlib from pypi to plot umap results.'
        ) from err
    data = StandardScaler().fit_transform(flat_embs)
    reducer = UMAP(metric='cosine',
                   verbose=True,
                   n_neighbors=20,
                   random_state=seed)
    reduced_data = reducer.fit_transform(data)
    print(reduced_data.shape)
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 9))
    # Split by each speaker's own row count. An equal-sized chunk() assigns
    # points to the wrong speaker as soon as the counts differ.
    rows_per_speaker = [spk_dict[k].shape[0] for k in sorted_speakers]
    reduced_chunks = torch.from_numpy(reduced_data).split(rows_per_speaker,
                                                          dim=0)
    for chunk in reduced_chunks:
        points = chunk.numpy()
        ax.scatter(points[:, 0], points[:, 1])
    # Legend entries follow scatter order, which is sorted_speakers order.
    ax.legend(sorted_speakers)
    ax.set_xlabel('umap 1st component')
    ax.set_ylabel('umap 2nd component')
    ax.set_title("2D umap projection with n_neighbors=20")
    ax.grid(True)
    plt.tight_layout()
    plt.savefig('umap_plot.svg')
    print("Saved umap plot to umap_plot.svg")

Exemple #16

0

Afficher le fichier

def run_umap(dist, logger=None, labels=None, **kwargs):
    """
    Run UMAP on distances produced by tree2dmat

    Args:
        dist (np.array):        A distance matrix, square or condensed form.
                                A 1-D input is expanded with _squareform
        logger (Logger):        Logger to use. default is no logging
        labels:                 Passed to UMAP.fit_transform as ``y``
        **kwargs:               Forwarded to the UMAP constructor. Defaults
                                applied here: n_neighbors=100, n_components=2,
                                min_dist=0.1. ``verbose`` is always set to True

    Return:
        emb (np.array):         the UMAP embedding
    """
    if len(dist.shape) == 1:
        if logger is not None:
            logger.info('computing squareform')
        dist = _squareform(dist)

    kwargs.setdefault('n_neighbors', 100)
    kwargs.setdefault('n_components', 2)
    # 0.1 is UMAP's own default. Setting it here keeps behaviour unchanged
    # and stops the log message below raising KeyError when the caller
    # did not pass min_dist.
    kwargs.setdefault('min_dist', 0.1)

    if logger is not None:
        logger.info(
            'computing {n_components} components with UMAP'.format(**kwargs))
        logger.info(
            'using {n_neighbors} neighbors and {min_dist} min_dist'.format(
                **kwargs))

    # NOTE(review): ``dist`` is described as a distance matrix, but no metric
    # is set here, so UMAP uses its default metric unless the caller passes
    # metric='precomputed' — confirm this is intended.
    kwargs['verbose'] = True
    umap = UMAP(**kwargs)
    emb = umap.fit_transform(dist, y=labels)
    return emb

Exemple #17

0

Afficher le fichier

def vanDongenSpectral(args):
    """Scale a dataset, embed it with UMAP, cluster it and score the clusters.

    Args:
        args: 7-tuple ``(neighbors, min_d, components, metric, dataset,
            scaler, k)``. ``dataset`` and ``scaler`` are keys into the
            module-level ``datasets`` and ``scalers`` mappings.

    Returns:
        ``vanDongen`` applied to the cross-tabulation of ``metadata.genre``
        against the KMeans labels.
    """
    neighbors, min_d, components, metric, dataset, scaler, k = args

    run_description = ', '.join([
        dataset,
        metric,
        scaler,
        'n_components=' + str(components),
        'n_neighbors=' + str(neighbors),
        'min_dist=' + str(min_d),
        'k=' + str(k),
    ])
    print(run_description)

    # Standardise with the requested scaler
    scaled = scalers[scaler].fit_transform(datasets[dataset])

    # Apply UMAP
    reducer = UMAP(n_components=components,
                   n_neighbors=neighbors,
                   min_dist=min_d,
                   metric=metric)
    embedding = reducer.fit_transform(scaled)

    # Run KMeans on the embedding
    clusterer = KMeans(n_clusters=k, random_state=0).fit(embedding)

    # Build the confusion matrix of genre against cluster label
    assignments = pd.DataFrame({
        'Generos': metadata.genre,
        'data': clusterer.labels_
    })
    confusion = pd.crosstab(assignments['Generos'], assignments['data'])

    return vanDongen(confusion)

Exemple #18

0

Afficher le fichier

def umapper(embed, metric="euclidean", n_neighbors=30, min_dist=1, **kws):
    """Plot a UMAP projection of ``embed`` and connect its two halves.

    Points are coloured by the module-level ``labels`` using
    ``CLASS_COLOR_DICT``. Row ``i`` of the first half of ``embed`` is linked
    to row ``i`` of the second half via ``add_connections``.

    Args:
        embed: Data to project.
        metric: Forwarded to ``UMAP``.
        n_neighbors: Forwarded to ``UMAP``.
        min_dist: Forwarded to ``UMAP``.
        **kws: Accepted but not used.

    Returns:
        Tuple ``(fig, ax)``.
    """
    reducer = UMAP(metric=metric, n_neighbors=n_neighbors, min_dist=min_dist)
    projection = reducer.fit_transform(embed)
    plot_df = pd.DataFrame(data=projection)
    plot_df["labels"] = labels

    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    sns.scatterplot(
        data=plot_df,
        ax=ax,
        x=0,
        y=1,
        hue="labels",
        palette=CLASS_COLOR_DICT,
        legend=False,
        s=20,
        linewidth=0.5,
        alpha=0.7,
    )
    ax.axis("off")

    # The first half of the rows is paired with the second half, row by row.
    half = len(embed) // 2
    first_half = np.arange(half)
    second_half = np.arange(half) + half
    add_connections(
        plot_df.iloc[first_half, 0],
        plot_df.iloc[second_half, 0],
        plot_df.iloc[first_half, 1],
        plot_df.iloc[second_half, 1],
        ax=ax,
    )
    return fig, ax

Exemple #19

0

Afficher le fichier

Fichier : plot.py Projet : dohlee/python-dohlee

def umap(data, labels=None, ax=None, **kwargs):
    '''Draw a UMAP embedding plot of the data.

    :param matrix data: Input data. Numpy array recommended.
    :param list labels: (Optional) Corresponding labels to each datum. If specified, data points in the plot will be colored according to the label.
    :param axis ax: (Optional) Matplotlib axis to draw the plot on. Defaults to the current axes.
    :param kwargs: Any other keyword arguments will be passed onto matplotlib.pyplot.scatter.
    :returns: The axis the plot was drawn on.
    '''
    if ax is None:
        # ``ax`` is documented as optional, so fall back to the current axes
        # rather than failing on ``None.scatter``. Imported locally so the
        # block does not rely on a module-level ``plt``.
        import matplotlib.pyplot as plt
        ax = plt.gca()

    # Apply UMAP and get embeddings.
    reducer = UMAP()
    embeddings = reducer.fit_transform(data)

    if labels is None:
        ax.scatter(x=embeddings[:, 0], y=embeddings[:, 1], **kwargs)
        return ax

    # If labels are attached, color them in different colors.
    labels = np.array(labels)
    # np.unique gives a sorted, reproducible order; iterating a set of
    # strings can change order between runs.
    unique_labels = np.unique(labels)
    for label in unique_labels:
        toDraw = (labels == label)  # only draw these points this time

        ax.scatter(x=embeddings[toDraw, 0],
                   y=embeddings[toDraw, 1],
                   label=label,
                   **kwargs)
    # Build the legend once, after every series has been added.
    if len(unique_labels) > 0:
        ax.legend(loc='best')
    return ax

Exemple #20

0

Afficher le fichier

def main(dataset):
    """Save PCA, t-SNE and UMAP scatter plots for each marker-selection algorithm.

    For every algorithm, markers are loaded from
    ``output/<dataset>_<alg>_markers_full.npz``, the data is restricted to
    them, and three 2-D projections are written as PDF and PNG under
    ``figures/dimred/``.

    Args:
        dataset: Dataset name, used to load the data and to build file names.
    """
    adata = getdata(dataset)

    def saveplot(coords, dimred):
        # Reads ``adataproj``, ``alg`` and ``n_markers`` from the enclosing
        # loop at call time.
        plt.figure()
        point_colors = adataproj.obs["y"].values % 9
        plt.scatter(coords[:, 0], coords[:, 1], s=2, c=point_colors, cmap="Set1")
        plt.tick_params(
            axis="both",
            which="both",
            bottom=False,
            labelbottom=False,
            left=False,
            labelleft=False,
        )
        pdf_path = f"figures/dimred/{dataset}_{alg}_{n_markers}markers_{dimred}.pdf"
        plt.savefig(pdf_path, format="pdf")
        png_path = f"figures/dimred/{dataset}_{alg}_{n_markers}markers_{dimred}.png"
        plt.savefig(png_path, format="png")
        plt.close()

    algorithms = [
        "cife",
        "bincife",
        "jmi",
        "binmim",
        "logreg",
        "t-test_overestim_var",
        "wilcoxon",
    ]
    for alg in algorithms:
        loaded = np.load(f"output/{dataset}_{alg}_markers_full.npz")["markers"]
        # Multi-dimensional input: take the first column. One-dimensional
        # input: keep the first ten entries.
        markers = loaded[:, 0].flatten() if len(loaded.shape) > 1 else loaded[:10]
        n_markers = len(markers)
        adataproj = adata[:, markers].copy()
        plotprep(adataproj)

        print("Computing PCA coords")
        pca_coords = pr.plot.pca(adataproj.X, 2, return_info=False)
        saveplot(pca_coords, "pca")

        print("Computing tSNE coords")
        tsne_coords = TSNE().fit_transform(adataproj.X.toarray())
        saveplot(tsne_coords, "tsne")

        print("Computing UMAP coords")
        umap_coords = UMAP().fit_transform(adataproj.X)
        saveplot(umap_coords, "umap")

Exemple #21

0

Afficher le fichier

Fichier : plot.py Projet : umangv/picturedrocks

def umapfigure(adata, **scatterkwargs):
    """Plot the UMAP coordinates of ``adata``, computing them when absent.

    When ``adata.obsm`` has no ``"X_umap"`` entry, UMAP is fitted on the
    first 30 columns of ``"X_pca"``; the PCA itself is (re)computed first if
    it is missing or has fewer than 30 columns.

    Args:
        adata: Object exposing ``obsm``, ``obsm_keys()`` and ``X``.
        **scatterkwargs: Forwarded to ``genericplot``.

    Returns:
        Whatever ``genericplot`` returns.
    """
    if "X_umap" not in adata.obsm_keys():
        pca_is_usable = ("X_pca" in adata.obsm_keys()
                         and adata.obsm["X_pca"].shape[1] >= 30)
        if not pca_is_usable:
            pca(adata, 30, zero_center=not scipy.sparse.issparse(adata.X))
        reducer = UMAP()
        leading_pcs = adata.obsm["X_pca"][:, :30]
        adata.obsm["X_umap"] = reducer.fit_transform(leading_pcs)
    return genericplot(adata, adata.obsm["X_umap"], **scatterkwargs)

Exemple #22

0

Afficher le fichier

Fichier : _ss_cluster.py Projet : ArjunSarathi/cellar

 def get(self, x, labels, clu, eval):
     """Cluster a label-guided UMAP embedding, keeping the given labels.

     Args:
         x: Data to embed.
         labels: Array of labels, passed to UMAP as ``y``. Entries equal
             to ``-1`` are the ones that may be overwritten.
         clu: Object whose ``get(embedding, eval)`` returns new labels.
         eval: Passed through to ``clu.get``.

     Returns:
         The labels from ``clu.get``, with every position whose input
         label is not ``-1`` reset to that input label.
     """
     reducer = UMAP(**self.kwargs)
     self.logger.info("Finding embeddings.")
     embedding = reducer.fit_transform(x, y=labels)
     new_labels = clu.get(embedding, eval)
     # Positions that already carry a label (anything but -1) keep it.
     already_labelled = np.where(labels != -1)
     new_labels[already_labelled] = labels[already_labelled]
     return new_labels

Exemple #23

0

Afficher le fichier

class UMAP_Preprocessed:
    """Clusterer that reduces its input with UMAP before clustering.

    ``clusterer`` starts out as ``None`` and must be assigned before
    ``fit_predict`` is called.
    """

    def __init__(self, *args, **kwargs):
        """Create the fixed UMAP preprocessor; ``*args``/``**kwargs`` are not used."""
        self.preprocessor = UMAP(n_neighbors=30, min_dist=0, n_components=2)
        self.clusterer = None

    def fit_predict(self, X):
        """Return ``clusterer.fit_predict`` applied to the UMAP embedding of ``X``."""
        embedding = self.preprocessor.fit_transform(X)
        predictions = self.clusterer.fit_predict(embedding)
        return predictions

Exemple #24

0

Afficher le fichier

Fichier : visualize_predictions.py Projet : anuprulez/single_cell_analysis

 def plot_UMAP(self, features):
     """Project ``features`` to 2-D with UMAP and show a seaborn scatter plot.

     UMAP is run with random initialisation and ``random_state=0``.
     """
     reducer = UMAP(n_components=2, init='random', random_state=0)
     print("Computing projections...")
     embedding = reducer.fit_transform(features)
     print("Plotting...")
     sns.scatterplot(data=embedding)
     plt.grid(True)
     plt.show()

Exemple #25

0

Afficher le fichier

def umap_reduce(data, **kwargs):
    """Embed ``data`` with ``cumlUMAP``, falling back to ``UMAP`` on failure.

    Args:
        data: Samples to embed.
        **kwargs: Forwarded to whichever reducer class is used.

    Returns:
        Tuple ``(embedding, reducer)``.
    """
    try:
        reducer = cumlUMAP(**kwargs)
        embedding = reducer.fit_transform(data)
    except (RuntimeError, TypeError) as e:
        # warnings.warn expects a string or a Warning instance. Passing the
        # raw exception object can itself raise TypeError when a warning
        # filter with a message pattern is active.
        warnings.warn(str(e))
        reducer = UMAP(**kwargs)
        embedding = reducer.fit_transform(data)
    return embedding, reducer

Exemple #26

0

Afficher le fichier

def embed_umap(data):
    """Return the UMAP embedding of ``data``.

    ``data`` should be on cpu, numpy. The model uses the euclidean metric,
    40 neighbours, and ``torch.initial_seed()`` as its ``transform_seed``.
    """
    reducer = UMAP(
        metric='euclidean',
        n_neighbors=40,
        # angular_rp_forest=True,
        # random_state=torch.initial_seed(),
        transform_seed=torch.initial_seed(),
    )
    projected = reducer.fit_transform(data)
    return projected

Exemple #27

0

Afficher le fichier

Fichier : DocumentFeatureVisualization.py Projet : aidowu1/Ades-NLP-Recepies

 def umapDataReductionTo2D(self):
     """
     Reduce the vectorized corpus to 2-D with UMAP (Uniform Manifold Approximation and Projection) and store the result together with the document ids as a ``FeatureMatrixData``.
     :return: None
     """
     reducer = UMAP(n_components=2, random_state=1)
     projected = reducer.fit_transform(self.__vectorized_corpus)
     self.__reduced_dim_feature_data = FeatureMatrixData(
         projected, self.__document_ids)

Exemple #28

0

Afficher le fichier

def reduceWithUMAP(vectors, size):
    """Reduce ``vectors`` to ``size`` features with UMAP and return the result.

    The model uses 15 neighbours, ``min_dist=0.1`` and the euclidean metric.
    """
    log(f'Reducing data to {size} features using UMAP (slow-ish)')
    reducer = UMAP(
        n_neighbors=15,
        min_dist=0.1,
        metric='euclidean',
        n_components=size,
    )
    return reducer.fit_transform(vectors)

Exemple #29

0

Afficher le fichier

Fichier : pixplot.py Projet : herlai/pix-plot

def get_umap_projection(**kwargs):
    '''Get the x,y positions of images passed through a umap projection.

    Reads ``n_neighbors``, ``min_dist``, ``metric``, ``vecs`` and
    ``use_cache`` from ``kwargs``. Returns the existing layout path when the
    file exists and ``use_cache`` is truthy; otherwise returns the result of
    ``write_layout`` for a freshly computed projection.
    '''
    print(' * creating UMAP layout')
    out_path = get_path('layouts', 'umap', **kwargs)
    if os.path.exists(out_path) and kwargs['use_cache']:
        # An earlier run already produced this layout; reuse it.
        return out_path
    reducer = UMAP(n_neighbors=kwargs['n_neighbors'],
                   min_dist=kwargs['min_dist'],
                   metric=kwargs['metric'])
    positions = reducer.fit_transform(kwargs['vecs'])
    return write_layout(out_path, positions, **kwargs)

Exemple #30

0

Afficher le fichier

    def on_epoch_begin(self, model):
        """Score the model on every test set and append the results to the log.

        For each ``(df, seed)`` in the module-level ``test_data`` the texts
        are vectorised with ``model.infer_vector``, reduced with UMAP and
        then Ivis, turned into outlier predictions, and scored against
        ``df["outlier_label"]``. One row per test set is added to
        ``self.result_df``, which is rewritten to ``self.log_path`` each
        time. ``self.epoch`` is incremented at the end.

        Args:
            model: Object providing ``infer_vector``.
        """
        print(
            f"\n----------------\n\nEnd of epoch {self.epoch}. Getting scores..."
        )
        scores = defaultdict(list)
        scores["epoch"] = self.epoch
        for df, seed in test_data:
            print("Vectorize...")

            docvecs = df["text"].progress_apply(lambda x: simple_preprocess(x))
            docvecs = docvecs.progress_apply(lambda x: model.infer_vector(x))

            print("Reduce dimensions...")
            dim_reducer = UMAP(metric="cosine",
                               set_op_mix_ratio=1.0,
                               n_components=256,
                               random_state=42)

            dim_reduced_vecs = dim_reducer.fit_transform(list(docvecs))

            print("Run ivis...")
            dim_reducer = Ivis(embedding_dims=1,
                               k=15,
                               model="maaten",
                               n_epochs_without_progress=10,
                               verbose=0)
            decision_scores = dim_reducer.fit_transform(dim_reduced_vecs)
            decision_scores = decision_scores.astype(float)

            print("Get and save scores...")
            preds = reject_outliers(decision_scores,
                                    iq_range=1.0 - contamination)
            preds = [-1 if x else 1 for x in preds]

            scores = get_scores(scores, df["outlier_label"], preds)
            scores["seed"] = seed
            print(
                f"Scores for epoch {self.epoch} | seed - {seed}:\n{pd.DataFrame(scores, index=[0])}"
            )

            # DataFrame.append was removed in pandas 2.0; add the row with
            # concat instead.
            score_row = pd.DataFrame([dict(scores)])
            self.result_df = pd.concat([self.result_df, score_row],
                                       ignore_index=True)
            self.result_df.to_csv(self.log_path, sep="\t")
        self.epoch += 1

Exemple #31

0

Afficher le fichier

Fichier : transform.py Projet : alexeyche/alexeyche-junk

class TUmap(Transform):
    """
    Transform that embeds a feature pool with UMAP.

    n_neighbors:
        This determines the number of neighboring points used in local approximations
        of manifold structure. Larger values will result in more global structure being
        preserved at the loss of detailed local structure.
        In general this parameter should often be in the range 5 to 50,
        with a choice of 10 to 15 being a sensible default.
    min_dist:
        This controls how tightly the embedding is allowed to compress points together.
        Larger values ensure embedded points are more evenly distributed, while smaller
        values allow the algorithm to optimise more accurately with regard to local structure.
        Sensible values are in the range 0.001 to 0.5, with 0.1 being a reasonable default.
    metric:
        This determines the choice of metric used to measure distance in the input space.
        A wide variety of metrics are already coded, and a user defined function can be passed
        as long as it has been JITd by numba.
    n_components, spread, random_state:
        Forwarded unchanged to the UMAP constructor.
    """

    def __init__(
        self,
        n_neighbors=15,
        min_dist=0.1,
        metric="euclidean",
        n_components=2,
        spread=1.0,
        random_state=None
    ):
        self._inst = UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            metric=metric,
            n_components=n_components,
            spread=spread,
            # Previously accepted but never forwarded, so runs could not be
            # made reproducible through this wrapper.
            random_state=random_state,
        )

    def transform(self, fp):
        """Yield one ``Feature`` per column of the UMAP embedding of ``fp``."""
        x = FeaturePool(fp).array()
        logger.info("TUmap: starting UMAP transform ...")
        x_emb = self._inst.fit_transform(x)
        logger.info("TUamp: Done")

        for f_id in range(x_emb.shape[1]):
            yield Feature(
                "UMAP feature #{}".format(f_id),
                x_emb[:, f_id]
            )

    @staticmethod
    def plot_embedding(efp: FeaturePool, split_by=None):
        """Scatter-plot a 2-column embedding, optionally coloured by a feature.

        Args:
            efp: Feature pool whose ``array()`` has exactly two columns.
            split_by: Optional object with ``data`` (point colours) and
                ``name`` (used in the title).
        """
        x = efp.array()
        assert x.shape[1] == 2, "Embedding is expected to be with the size 2 to plot, got {}".format(x.shape[1])
        fig = plt.figure(figsize=(7, 7))
        ax = fig.add_subplot(111)
        if split_by is not None:
            ax.scatter(x[:, 0], x[:, 1], c=split_by.data, alpha=0.5)
            ax.set_title(
                "UMAP for a feature pool splitted by feature `{}`".format(split_by.name)
            )
        else:
            ax.scatter(x[:, 0], x[:, 1], alpha=0.5)
            ax.set_title(
                "UMAP for a feature pool"
            )
        fig.show()

Exemple #32

0

Afficher le fichier

Fichier : umap_transformer.py Projet : cmagnusb/machine_learning_examples

from datetime import datetime
from util import getKaggleMNIST
from sklearn.linear_model import LogisticRegression
from umap import UMAP

# Script: compare logistic-regression scores on the raw features with scores
# on a 10-component UMAP embedding of the same data, timing each UMAP step.

# get the data
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

# Baseline: fit and score on the untransformed features.
print("Score without transformation:")
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print(model.score(Xtrain, Ytrain))
print(model.score(Xtest, Ytest))


# Fit UMAP on the training set only, then project the test set with the
# fitted model (transform, not fit_transform). Each step is timed.
umapper = UMAP(n_neighbors=5, n_components=10)
t0 = datetime.now()
Ztrain = umapper.fit_transform(Xtrain)
print("umap fit_transform took:", datetime.now() - t0)
t0 = datetime.now()
Ztest = umapper.transform(Xtest)
print("umap transform took:", datetime.now() - t0)

# Repeat the fit and scoring on the embedded features with a fresh model.
print("Score with transformation")
model = LogisticRegression()
t0 = datetime.now()
model.fit(Ztrain, Ytrain)
print("logistic regression fit took:", datetime.now() - t0)
print(model.score(Ztrain, Ytrain))
print(model.score(Ztest, Ytest))