def learn_umap(data, **kwargs):
    """
    Calculates UMAP transformation for given matrix features.

    Parameters
    --------
    data: pd.DataFrame
        Table of features, one row per sample. Its ``.values`` are embedded
        and its index is reused for the result.
    kwargs: optional
        Parameters for ``umap.UMAP()``. Keys that are not UMAP constructor
        parameters are silently dropped. ``random_state`` defaults to 0 and
        ``min_dist`` to 1; both can be overridden by passing them explicitly.

    Returns
    -------
    Calculated UMAP transform, indexed like ``data``

    Return type
    -------
    pd.DataFrame
    """
    # Only forward the keyword arguments that UMAP's constructor understands.
    valid_params = umap.UMAP().get_params()
    accepted = {k: v for k, v in kwargs.items() if k in valid_params}
    # Defaults come first so caller-supplied values win; passing them as
    # literal keywords next to **kwargs raised "multiple values for keyword
    # argument" whenever the caller set random_state or min_dist.
    params = {"random_state": 0, "min_dist": 1, **accepted}
    embedding = umap.UMAP(**params).fit_transform(data.values)
    return pd.DataFrame(embedding, index=data.index.values)
Esempio n. 2
0
def run_umap(x, y, item, n_neighbors_list, min_dist=0.05, verbose=True):
    """Embed ``x`` with UMAP once per neighbourhood size and plot each result.

    For every value in ``n_neighbors_list`` a new UMAP model is fitted on
    ``x`` and the embedding is handed to ``draw_plot`` together with ``y``,
    ``item`` and a file name of the form ``umap_result<k>neighbors``.
    Nothing is returned.
    """
    for n_neighbors in n_neighbors_list:
        print("UMAP NEIGHBOR NUMBER: ", n_neighbors)
        reducer = umap_.UMAP(n_neighbors=n_neighbors,
                             min_dist=min_dist,
                             verbose=verbose)
        projected = reducer.fit_transform(x)
        draw_plot(projected, y, item, f"umap_result{n_neighbors!s}neighbors")
def find_clusters(embeddings, min_cluster_size):
    """
        Reduces embedding vectors with PCA, projects them to 2-D with UMAP
        and clusters the projected points with HDBSCAN.

        based on the idea in: https://umap-learn.readthedocs.io/en/latest/faq.html - section "From a more practical standpoint"

        Args:
            embeddings (:obj:`DataFrame[float]`):
                DataFrame of embedding vectors, one row per data point
            min_cluster_size (:obj:`int`):
                Minimal cluster size, forwarded to HDBSCAN

        Returns:
           ``labels_`` of the fitted HDBSCAN clusterer (one entry per data point)
    """
    # At most 50 PCA components, fewer when there are fewer rows.
    # NOTE(review): the cap does not look at the number of features; this
    # presumably relies on the vectors being wide (e.g. 768) - confirm.
    pca_dims = min(len(embeddings), 50)
    pca_reduced = PCA(n_components=pca_dims).fit_transform(embeddings)

    projected = umap.UMAP(n_components=2).fit_transform(pca_reduced)

    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    clusterer.fit(projected)
    return clusterer.labels_
Esempio n. 4
0
    def _update(self, umap_data):
        """Load every UMAP-related attribute from ``umap_data``.

        Parameters
        ----------
        umap_data : :obj:`dict`
            Mapping that provides the keys ``z``, ``R``, ``R_desc``,
            ``E_true``, ``F_true``, ``E_pred``, ``F_pred``, ``data_info``,
            ``n_neighbors``, ``min_dist``, ``random_state`` and
            ``embedding``.

        """
        # pylint: disable=undefined-variable

        # Values copied onto the instance unchanged.
        for key in ('z', 'R', 'R_desc', 'E_true', 'F_true', 'E_pred',
                    'F_pred'):
            setattr(self, key, umap_data[key])

        self.data_info = umap_data['data_info'].tolist()

        # Indexing with an empty tuple; presumably these are stored as 0-d
        # arrays and this unwraps the scalar - confirm against the writer.
        for key in ('n_neighbors', 'min_dist', 'random_state'):
            setattr(self, key, umap_data[key][()])

        # Recreate an (unfitted) reducer with the stored hyper-parameters.
        self.reducer = umap.UMAP(
            n_neighbors=self.n_neighbors,
            min_dist=self.min_dist,
            random_state=self.random_state,
        )
        self.embedding = umap_data['embedding']
Esempio n. 5
0
def process_umap(exact_pdh, pca_comp, scale=500):
    """Run UMAP on the first ``pca_comp`` integer-labelled columns and plot it.

    The columns labelled ``0 .. pca_comp - 1`` of ``exact_pdh`` are embedded
    with a default UMAP model. The first two embedding columns are drawn as
    a scatter plot whose marker sizes are ``scale`` times the squared ratio
    of each ``freq`` value to ``freq[0]``.

    Returns the embedding produced by ``fit_transform``.
    """
    reducer = uma.UMAP()
    component_columns = list(range(pca_comp))
    projection = reducer.fit_transform(exact_pdh[component_columns])

    frequencies = exact_pdh['freq']
    relative_weight = (frequencies / frequencies[0])**2
    plt.scatter(projection[:, 0],
                projection[:, 1],
                s=scale * relative_weight)
    return projection
Esempio n. 6
0
def generate_base64(terms, vectors):
    """Render an annotated UMAP scatter plot and return it as base64 PNG text.

    ``vectors`` is embedded with UMAP (``n_neighbors=5``, ``min_dist=0.3``,
    correlation metric). Every embedded point is drawn and annotated with
    the entry of ``terms`` at the same position, using the ``myfont`` font
    properties from the enclosing scope.

    Parameters
    ----------
    terms : sequence
        Labels, indexed by the row position of the matching vector.
    vectors : array-like
        Input passed to ``UMAP.fit_transform``.

    Returns
    -------
    str
        The PNG image, base64-encoded and decoded as UTF-8 text.
    """
    new_values = umap.UMAP(n_neighbors=5, min_dist=0.3,
                           metric='correlation').fit_transform(vectors)

    fig = plt.figure(figsize=(16, 16))
    try:
        # One scatter() call per point is kept on purpose: each call takes
        # the next colour of the colour cycle, as the original plot did.
        for i, point in enumerate(new_values):
            x, y = point[0], point[1]
            plt.scatter(x, y)
            plt.annotate(terms[i],
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         fontproperties=myfont,
                         ha='right',
                         va='bottom')

        save_file = BytesIO()
        plt.savefig(save_file, format='png')
    finally:
        # Release the figure; otherwise every call leaves one more open
        # figure registered with pyplot.
        plt.close(fig)

    return base64.b64encode(save_file.getvalue()).decode('utf8')
Esempio n. 7
0
def vis_high_dims_data_umap_2(X, y, show_label_flg=False):
    """
    Project ``X`` to 2-D with label-supervised UMAP and visualise the result.

    :param X:  features
    :param y:  labels; passed to UMAP as the target and used to colour points
    :param show_label_flg : if false, show a scatter plot with a labelled
        colorbar; if true, delegate the plotting to ``plot_with_labels``
    :return: None
    """
    res_umap = umap.UMAP(n_neighbors=50,
                         min_dist=0.8,
                         metric='correlation',
                         random_state=42).fit_transform(X, y)

    if show_label_flg:
        plot_with_labels(X, y, res_umap, "UMAP", min_dist=2.0)
        return

    fig, ax = plt.subplots(figsize=(12, 7))
    # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer
    # matplotlib releases no longer provide.
    points = ax.scatter(res_umap[:, 0],
                        res_umap[:, 1],
                        c=y,
                        cmap=plt.get_cmap("jet", 8),
                        alpha=0.8)
    # colorbar() needs the mappable returned by scatter(); passing the Axes
    # object (as before) fails.
    cbar = fig.colorbar(points, ticks=[1, 2, 3, 4, 5, 6, 7, 8])
    # The colorbar is vertical (default orientation), so its tick labels
    # live on the y axis.
    cbar.ax.set_yticklabels([
        'Google', 'Twitter', 'Youtube', 'Outlook', 'Github', 'Facebook',
        'Slack', 'Bing'
    ])

    # Hide the axis ticks of the scatter plot itself.
    plt.setp(ax, xticks=[], yticks=[])
    plt.show()
Esempio n. 8
0
def vis_high_dims_data_umap(X, y, show_label_flg=False):
    """
    Project ``X`` to 2-D with label-supervised UMAP and visualise the result.

    :param X:  features
    :param y:  labels; passed to UMAP as the target and used to colour points
    :param show_label_flg : if false, draw a scatter plot with a colorbar and
        save it to ``t_SNE.jpg``; if true, delegate to ``plot_with_labels``
    :return: None
    """
    res_umap = umap.UMAP(n_neighbors=30,
                         min_dist=0.12,
                         spread=1.8,
                         metric='correlation').fit_transform(X, y)

    if not show_label_flg:
        plt.figure(figsize=(10, 5))
        # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer
        # matplotlib releases no longer provide.
        plt.scatter(res_umap[:, 0],
                    res_umap[:, 1],
                    c=y,
                    cmap=plt.get_cmap("jet", 7),
                    alpha=0.7)
        plt.colorbar(ticks=range(7))
        plt.title('umap results')
        # NOTE(review): the output name says t-SNE although this is a UMAP
        # plot; kept unchanged because callers may look for this file.
        plt.savefig("t_SNE.jpg", dpi=400)
    else:
        plot_with_labels(X, y, res_umap, "UMAP", min_dist=2.0)
Esempio n. 9
0
def NDR(data, method, dim, n_neighbors=100):
    """Reduce ``data`` to ``dim`` dimensions with the selected algorithm.

    Parameters
    ----------
    data : array-like
        Input passed to the chosen estimator's ``fit_transform``.
    method : str
        One of ``'standard_LLE'``, ``'hessian_LLE'``, ``'ltsa_LLE'``,
        ``'modified_LLE'``, ``'IsoMap'``, ``'t-SNE'``, ``'MDS'``,
        ``'Spectral_Embedding'``, ``'UMAP'`` or ``'PCA'``.
    dim : int
        Number of output components.
    n_neighbors : int, optional
        Neighbourhood size for the LLE variants, IsoMap, spectral embedding
        and UMAP. Ignored by t-SNE, MDS and PCA.

    Returns
    -------
    The embedding returned by the estimator's ``fit_transform``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names listed above. Previously this
        surfaced as an ``UnboundLocalError`` on the return statement.
    """
    # The four LLE flavours differ only in the ``method`` keyword.
    lle_variants = {
        'standard_LLE': 'standard',
        'hessian_LLE': 'hessian',
        'ltsa_LLE': 'ltsa',
        'modified_LLE': 'modified',
    }

    if method in lle_variants:
        reducer = manifold.LocallyLinearEmbedding(
            n_neighbors=n_neighbors,
            n_components=dim,
            method=lle_variants[method])
    elif method == 'IsoMap':
        reducer = manifold.Isomap(n_neighbors=n_neighbors, n_components=dim)
    elif method == 't-SNE':
        reducer = manifold.TSNE(n_components=dim,
                                init='pca',
                                random_state=0,
                                method='exact')
    elif method == 'MDS':
        reducer = manifold.MDS(n_components=dim, max_iter=100, n_init=1)
    elif method == 'Spectral_Embedding':
        reducer = manifold.SpectralEmbedding(n_components=dim,
                                             n_neighbors=n_neighbors)
    elif method == 'UMAP':
        reducer = umap.UMAP(n_components=dim, n_neighbors=n_neighbors)
    elif method == 'PCA':
        reducer = PCA(n_components=dim, svd_solver='auto')
    else:
        raise ValueError(
            f"Unknown dimensionality reduction method: {method!r}")

    return reducer.fit_transform(data)
Esempio n. 10
0
    def embed(self):
        """Embed ``self.R_desc`` with UMAP and store the result.

        A new reducer is built on every call from the current
        ``n_neighbors``, ``min_dist`` and ``random_state`` attributes and
        stored in ``self.reducer``. The embedding is stored in
        ``self.embedding``.

        Returns
        -------
        :obj:`numpy.ndarray`
            The output of ``fit_transform`` on ``self.R_desc``. Presumably
            one row per structure with two columns (UMAP's default number
            of components) - confirm.

        Notes
        -----
        We recommend first tuning ``n_neighbors`` to provide a balance of
        clustering and overlap/uniformness. Then tune ``min_dist`` to be the
        smallest number that allows you to qualitatively determine number of
        points in a clustered region.

        For more information on these parameters see
        https://umap-learn.readthedocs.io/en/latest/parameters.html.
        """
        # pylint: disable=undefined-variable
        # The reducer is always rebuilt rather than checked for changes:
        # constructing it is cheap, and it guarantees the current
        # random_state is the one used.
        fresh_reducer = umap.UMAP(
            n_neighbors=self.n_neighbors,
            min_dist=self.min_dist,
            random_state=self.random_state,
        )
        self.reducer = fresh_reducer

        self.embedding = fresh_reducer.fit_transform(self.R_desc)
        return self.embedding
Esempio n. 11
0
def get_embeddings(input_features):
    """Embed ``input_features`` with UMAP (``random_state=28``).

    Returns a ``pandas.DataFrame`` with the columns ``'UMAP 1'`` and
    ``'UMAP 2'``.
    """
    embedding = umap.UMAP(random_state=28).fit_transform(input_features)
    # NOTE(review): 'UMAP 1' is filled from embedding column 1 and 'UMAP 2'
    # from column 0 - confirm that this swap is intentional.
    columns = {
        'UMAP 1': embedding[:, 1],
        'UMAP 2': embedding[:, 0],
    }
    return pd.DataFrame(columns)
Esempio n. 12
0
def umap_paint(X_topics, umap_para):
    """Embed ``X_topics`` with UMAP and show the result as a scatter plot.

    ``umap_para`` must provide the keys ``'n_neighbors'``, ``'min_dist'``
    and ``'random_state'``, which are forwarded to ``umap.UMAP``. Nothing is
    returned; the plot is shown with ``plt.show()``.
    """
    reducer = umap.UMAP(n_neighbors=umap_para['n_neighbors'],
                        min_dist=umap_para['min_dist'],
                        random_state=umap_para['random_state'])
    points = reducer.fit_transform(X_topics)

    plt.figure(figsize=(7, 5))
    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, s=10, edgecolor='none')
    plt.show()
Esempio n. 13
0
def umap_analysis():
    """Load a feature CSV, standardise it, embed it with UMAP and plot it.

    Reads ``features_3_sec.csv`` from ``general_path``, drops the first
    column, separates the ``label`` column from the features, standardises
    the features, embeds them in two dimensions and shows a scatter plot
    coloured by label. Progress information is printed; nothing is returned.
    """
    from sklearn import preprocessing
    import umap.umap_ as umap  # pip install umap-learn, pip install ipywidgets

    features_filename = 'features_3_sec.csv'
    data = pd.read_csv(f'{general_path}/{features_filename}')
    print(data.shape, data.head())

    # Drop the first column of the file.
    data = data.iloc[0:, 1:]
    print(data.head(5))

    # Target (genre) column and everything else as features.
    y = data['label']
    feature_mask = data.columns != 'label'
    X = data.loc[:, feature_mask]

    # Standardise the features, keeping the original column names.
    feature_names = X.columns
    scaled_values = preprocessing.StandardScaler().fit_transform(X)
    X = pd.DataFrame(scaled_values, columns=feature_names)
    print(X.shape, X.iloc[:, :2])

    # spread sets the scale of the embedded values; min_dist <= spread.
    # A smaller min_dist packs points more tightly, a larger one spreads
    # them out.
    reducer = umap.UMAP(n_neighbors=20,
                        spread=1,
                        min_dist=0.1,
                        n_epochs=5000,
                        metric='correlation',
                        n_components=2,
                        verbose=True)
    umap_embedding = reducer.fit_transform(X)

    component_names = ['umap component 1', 'umap component 2']
    umapDf = pd.DataFrame(data=umap_embedding, columns=component_names)

    # Put the label next to the two components for plotting.
    finalDf = pd.concat([umapDf, y], axis=1)

    sns.scatterplot(x="umap component 1",
                    y="umap component 2",
                    data=finalDf,
                    hue="label",
                    alpha=0.7,
                    s=10)

    plt.title('umap on Genres', fontsize=12)
    plt.xticks(fontsize=7)
    plt.yticks(fontsize=7)
    plt.xlabel("umap Component 1", fontsize=7)
    plt.ylabel("umap Component 2", fontsize=7)

    plt.show()
    print('Done')
Esempio n. 14
0
def visualise_clusters(feature_vectors_list):
    """Embed feature vectors with UMAP and draw them as an image scatter.

    The first two embedding columns are turned into a list of ``[x, y]``
    pairs and passed to ``visualize_scatter_with_images`` with
    ``image_zoom=0.3``.
    """
    embedding_fv = umap.UMAP().fit_transform(feature_vectors_list)

    first_axis = embedding_fv[:, 0]
    second_axis = embedding_fv[:, 1]
    x_data = [[a, b] for a, b in zip(first_axis, second_axis)]

    # NOTE(review): ``images`` is not a parameter; it is resolved from the
    # enclosing scope - confirm it matches ``feature_vectors_list``.
    visualize_scatter_with_images(x_data, images=images, image_zoom=0.3)
Esempio n. 15
0
def umap_fn(x, y=None, random_state=RS, **kwargs):
    """Run UMAP on ``x`` and report start time and duration on stdout.

    Parameters
    ----------
    x : array-like
        Data to embed.
    y : array-like, optional
        Target values. WARNING: only pass this for supervised
        dimensionality reduction.
    random_state : optional
        Seed forwarded to ``umap.UMAP``; defaults to the module-level ``RS``.
    **kwargs
        Further keyword arguments forwarded to ``umap.UMAP``.

    Returns
    -------
    The embedding returned by ``fit_transform``.
    """
    start = datetime.now()
    print("UMAP dimensionality reduction started at {}".format(
        start.strftime("%H:%M:%S")))
    # Use the caller's random_state; the parameter used to be ignored in
    # favour of the global RS.
    x_umap = umap.UMAP(random_state=random_state,
                       **kwargs).fit_transform(x, y)
    print("UMAP took {} to finish".format(datetime.now() - start))
    return x_umap
Esempio n. 16
0
def umap_iplot(x, df_text, preds):
    """Fit UMAP on ``x`` and show an interactive plot of the result.

    Points are labelled with ``preds``. The hover table has the columns
    ``'index'`` (filled from ``preds``) and ``'label'`` (filled from
    ``df_text``).

    Returns the object returned by ``umap.UMAP().fit(x)``.
    """
    hover_columns = {'index': preds, 'label': df_text}
    hover_data = pd.DataFrame(hover_columns)

    fitted = umap.UMAP().fit(x)
    figure = uplot.interactive(fitted,
                               labels=preds,
                               hover_data=hover_data,
                               point_size=2)
    uplot.show(figure)
    return fitted
Esempio n. 17
0
    def apply_umap(self):
        """
        Embed the sequence profiles with UMAP and cluster them with HDBSCAN.

        Profiles are taken from ``Clusterer.d_profiles`` for every id that is
        also present in ``self.d_sequences``. On success the cluster labels
        and membership probabilities are stored in ``self.clusterlabel`` and
        ``self.probabilities``, and ``(id, label)`` pairs in
        ``self.allCluster``. Unless ``self.subCluster`` is set, a
        ``cluster.txt`` summary and one ``cluster<label>.fa`` file per label
        are written to ``self.outdir``.

        Returns
        -------
        ``1`` if a ``TypeError`` was raised while embedding/clustering or if
        only a single distinct label was produced; in that case the input
        sequence file is copied to ``<outdir>/<basename>_repr.fa``.
        Otherwise ``None`` (implicitly).
        """
        # Keep (id, profile) pairs together so labels can be matched back to
        # ids after clustering.
        profiles = [(idx, profile)
                    for idx, profile in Clusterer.d_profiles.items()
                    if idx in self.d_sequences]
        vector = [x[1] for x in profiles]

        # Sub-clustering uses a smaller neighbourhood and no minimum distance.
        if self.subCluster:
            neighbors, dist = 5, 0.0
        else:
            neighbors, dist = 50, 0.25

        try:
            clusterable_embedding = umap.UMAP(
                n_neighbors=neighbors,
                min_dist=dist,
                n_components=20,
                random_state=42,
                metric='cosine',
            ).fit_transform(vector)

            clusterer = hdbscan.HDBSCAN()
            clusterer.fit(clusterable_embedding)

            self.clusterlabel = clusterer.labels_
            self.probabilities = clusterer.probabilities_
            # A single distinct label is routed through the same fallback
            # path as a TypeError from the calls above.
            if len(set(self.clusterlabel)) == 1:
                raise TypeError

        except TypeError:
            # Fallback: copy the input file to "<basename>_repr.fa" in the
            # output directory and signal it with return value 1.
            import shutil
            shutil.copyfile(
                self.sequenceFile,
                f'{self.outdir}/{os.path.splitext(os.path.basename(self.sequenceFile))[0]}_repr.fa'
            )
            return 1

        # Pair every id with its cluster label (same order as ``profiles``).
        self.allCluster = list(zip([x[0] for x in profiles],
                                   self.clusterlabel))
        if not self.subCluster:
            with open(f'{self.outdir}/cluster.txt', 'w') as outStream:
                for i in set(self.clusterlabel):
                    with open(f'{self.outdir}/cluster{i}.fa', 'w') as fastaOut:
                        outStream.write(f"Cluster: {i}\n")
                        for idx, label in self.allCluster:
                            if label == i:
                                # Record the cluster of every id listed in
                                # Clusterer.goiHeader.
                                if idx in Clusterer.goiHeader:
                                    Clusterer.goi2Cluster[
                                        Clusterer.id2header[idx]] = i
                                outStream.write(
                                    f"{Clusterer.id2header[idx]}\n")
                                # Only the part of the sequence before the
                                # first run of ten 'X' characters is written.
                                fastaOut.write(
                                    f">{Clusterer.id2header[idx]}\n{self.d_sequences[idx].split('X'*10)[0]}\n"
                                )
                    outStream.write("\n")
def plot_umap(X_scaled, class_labels, image_save_directory, y):
    """Plot unsupervised and, when labels exist, supervised UMAP embeddings.

    An unsupervised embedding of ``X_scaled`` is always computed. If ``y`` is
    given, a supervised embedding is computed as well, plotted and saved as
    ``UMAP_Supervised``; the unsupervised embedding is then plotted with the
    class labels. Without ``y`` a warning is issued and the unsupervised
    embedding is plotted unlabelled. In both cases the current figure is
    finally saved as ``UMAP_Unsupervised``.

    ``class_labels`` is a mapping whose values are used as class names.
    """
    sns.set(style='white', context='poster')

    # Both embeddings share the same UMAP settings.
    umap_options = dict(n_neighbors=5, random_state=42, init='random')
    embeddingUnsupervised = umap.UMAP(**umap_options).fit_transform(X_scaled)

    if y is None:
        warnings.warn("No y values.")
        vis.plotUmap(embeddingUnsupervised,
                     None,
                     None,
                     'Dataset unsupervised clustering',
                     cmapString='RdYlGn')
        print("Plot UMAP unsupervised without class labels")
    else:
        embeddingSupervised = umap.UMAP(**umap_options).fit_transform(
            X_scaled, y=y)
        class_names = list(class_labels.values())
        vis.plotUmap(embeddingSupervised, y, class_names,
                     'Dataset supervised clustering')

        vis.save_figure(plt.gcf(),
                        image_save_directory=image_save_directory,
                        filename='UMAP_Supervised')
        print("Plot UMAP supervised")

        vis.plotUmap(embeddingUnsupervised,
                     y,
                     list(class_labels.values()),
                     'Dataset unsupervised clustering',
                     cmapString='RdYlGn')
        print("Plot UMAP unsupervised with class labels")

    # The figure that is current at this point holds the unsupervised plot.
    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='UMAP_Unsupervised')
    print("Plot UMAP unsupervised")
 def load_umap_embeddings(self):
     """Compute ``self.umap_embeddings`` from ``self.embeddings`` if unset.

     Does nothing when ``self.umap_embeddings`` is already populated.
     Otherwise runs UMAP (``n_neighbors=10``, ``min_dist=0.005``,
     correlation metric) and stores each embedded vector as a plain list.
     """
     if self.umap_embeddings is not None:
         return

     logger.info('Running UMAP for {} words'.format(self.lang))
     reducer = umap.UMAP(n_neighbors=10,
                         min_dist=0.005,
                         metric='correlation')
     projected = reducer.fit_transform(self.embeddings)
     self.umap_embeddings = [row.tolist() for row in projected]
Esempio n. 20
0
def process_umap(exact_pdh, pca_comp, scale=500):
    """Run UMAP on the first ``pca_comp`` integer-labelled columns, in place.

    The columns labelled ``0 .. pca_comp - 1`` of ``exact_pdh`` are embedded
    with a default UMAP model. The first two embedding columns are written
    back to ``exact_pdh`` as ``'u1'`` and ``'u2'`` and drawn as a scatter
    plot whose marker sizes are ``scale`` times the squared ratio of each
    ``freq`` value to ``freq[0]``.

    Returns ``None``; the result is delivered through ``exact_pdh``.
    """
    reducer = uma.UMAP()
    component_columns = list(range(pca_comp))
    projection = reducer.fit_transform(exact_pdh[component_columns])

    frequencies = exact_pdh['freq']
    relative_weight = (frequencies / frequencies[0])**2

    first_axis = projection[:, 0]
    exact_pdh['u1'] = first_axis
    second_axis = projection[:, 1]
    exact_pdh['u2'] = second_axis

    plt.scatter(first_axis, second_axis, s=scale * relative_weight)
    return None
Esempio n. 21
0
    def umap_reducer(self):
        """Optionally replace the confounders table by a 2-D UMAP embedding.

        When ``self.umap_reduce`` equals ``"yes"``, every column of
        ``self.confounders_df`` except ``ID`` is embedded with UMAP
        (``random_state=153``). A scatter plot, a CSV with the columns
        ``ID``, ``UMAP_embedding1`` and ``UMAP_embedding2``, and the fitted
        reducer (via ``dump``) are written to files prefixed with
        ``self.run_prefix``. ``self.confounders_df`` is replaced by the
        reduced table and ``self.runplot_out`` ends up holding the path of
        the dumped reducer.

        Returns
        -------
        ``self.confounders_df``: the reduced table, or the untouched
        original when reduction is disabled.
        """
        if self.umap_reduce != "yes":
            return self.confounders_df

        ids = self.confounders_df['ID']
        to_umap = self.confounders_df.drop(columns=['ID'])

        reducer = umap.UMAP(random_state=153)
        embedding = reducer.fit_transform(to_umap)

        # IDs are matched to the embedding rows by position, not by the
        # original index.
        out_data = pd.DataFrame({
            'ID': ids.reset_index(drop=True),
            'UMAP_embedding1': embedding[:, 0],
            'UMAP_embedding2': embedding[:, 1],
        })

        # Plot
        print("Exporting UMAP plot...")
        fig, ax = plt.subplots(figsize=(12, 10))
        # No ``cmap`` here: without ``c`` matplotlib ignores it and only
        # emits a warning.
        ax.scatter(embedding[:, 0], embedding[:, 1])
        ax.set_title("Data Reduction to 2 Dimensions by UMAP", fontsize=18)
        plot_out = self.run_prefix + '.umap_plot.png'
        fig.savefig(plot_out, dpi=600)

        print(
            f"The UMAP plot has been exported and can be found here: {plot_out}"
        )

        out_file = self.runplot_out = self.run_prefix + '.umap_data_reduction.csv'
        out_data.to_csv(out_file, index=False)

        print(
            f"The reduced UMAP 2 dimensions per sample .csv file can be found here: {out_file}"
        )

        # ``fit_transform`` above already fitted the reducer, so it is dumped
        # as is; refitting on the same data only repeated the work.
        algo_out = self.runplot_out = self.run_prefix + '.umap_clustering.joblib'
        dump(reducer, algo_out)

        self.confounders_df = out_data

        print(f"The UMAP .joblib  file can be found here: {algo_out}")

        return self.confounders_df
Esempio n. 22
0
def gen_projections(features, method='tsne', n_components=2):
    """Project ``features`` to ``n_components`` dimensions.

    ``method`` selects the estimator: ``'tsne'`` uses ``TSNE`` and
    ``'umap'`` uses ``umap.UMAP``. Any other value fails the assertion with
    the message ``'<method> error'``. Progress messages are printed before
    and after fitting.

    Returns the output of the estimator's ``fit_transform``.
    """
    supported_methods = ['tsne', 'umap']
    assert method in supported_methods, f'{method} error'

    # n_components is the dimension of the embedded space.
    if method == 'tsne':
        reducer = TSNE(n_components=n_components)
    if method == 'umap':
        reducer = umap.UMAP(n_components=n_components)

    print('generating embeddings...')
    features_reduced = reducer.fit_transform(features)
    print('generating embeddings completed!')
    return features_reduced
def umap_visualization(data, target, color=None):
    """Embed ``data`` with UMAP and draw a scatter plot coloured per point.

    Parameters
    ----------
    data : pandas.DataFrame
        Table passed in full to ``UMAP.fit_transform``.
    target : hashable
        Column of ``data`` whose values choose the colour of each point when
        ``color`` is not supplied.
    color : sequence of int, optional
        Per-point indices into ``sns.color_palette()``. When ``None`` or
        empty, ``data[target]`` is used instead.

    Notes
    -----
    NOTE(review): ``data`` is embedded including the ``target`` column -
    confirm this is intended.
    """
    reducer = umap.UMAP()
    embedding = reducer.fit_transform(data)

    # ``not color`` raised "truth value is ambiguous" for arrays and Series;
    # test for "nothing supplied" explicitly instead.
    if color is None or len(color) == 0:
        color_column = data[target]
    else:
        color_column = color

    palette = sns.color_palette()
    plt.figure(figsize=(40, 30), dpi=80)
    plt.scatter(embedding[:, 0],
                embedding[:, 1],
                c=[palette[x] for x in color_column])
    plt.gca().set_aspect('equal', 'datalim')
    plt.title('UMAP projection of the Cover Type dataset', fontsize=24)
Esempio n. 24
0
def visualize(model):
    """
    Plot a 2D UMAP embedding of the topic model's vectors.

    Does nothing when ``model.method`` is ``'LDA'``. Otherwise
    ``model.vec[model.method]`` is embedded and passed to ``plot_proj``
    together with ``model.cluster_model.labels_``.

    :param model: Topic_Model object
    """
    if model.method != 'LDA':
        reducer = umap.UMAP()
        print('Calculating UMAP projection ...')
        vectors = model.vec[model.method]
        vec_umap = reducer.fit_transform(vectors)
        print('Calculating UMAP projection. Done!')
        plot_proj(vec_umap, model.cluster_model.labels_)
def visualize_dimensionality_reduction(cell_data, columns, category, color_map="Spectral",
                                       algorithm="UMAP", save_dir=None):
    """Plots the dimensionality reduction of specified population columns

    Rows containing missing values are dropped first. For UMAP the selected
    columns are standardised before embedding; PCA and tSNE are run on the
    raw values. The first two components are passed to
    ``plot_dim_reduced_data``.

    Args:
        cell_data (pandas.DataFrame):
            Dataframe containing columns for dimensionality reduction and category
        columns (list):
            List of column names that are included for dimensionality reduction
        category (str):
            Name of column in dataframe used as the plot hue
        color_map (str):
            Name of MatPlotLib ColorMap, default is Spectral
            (accepted but not used by this function)
        algorithm (str):
            Name of dimensionality reduction algorithm, default is UMAP
        save_dir (str):
            Directory to save plots, default is None

    Raises:
        ValueError:
            If ``algorithm`` is not one of ``UMAP``, ``PCA`` or ``tSNE``
    """
    cell_data = cell_data.dropna()

    supported = ['UMAP', 'PCA', 'tSNE']
    if algorithm not in supported:
        raise ValueError(f"The algorithm specified must be one of the following: "
                         f"{supported}")

    graph_title = "%s projection of data" % algorithm
    column_data = cell_data[columns].values

    # Each algorithm has its own figure id and output file name.
    if algorithm == "UMAP":
        scaled_column_data = StandardScaler().fit_transform(column_data)
        projected = umap.UMAP().fit_transform(scaled_column_data)
        fig_id, save_file = 1, "UMAPVisualization.png"
    elif algorithm == "PCA":
        projected = PCA().fit_transform(column_data)
        fig_id, save_file = 2, "PCAVisualization.png"
    else:
        projected = TSNE().fit_transform(column_data)
        fig_id, save_file = 3, "tSNEVisualization.png"

    plot_dim_reduced_data(projected[:, 0], projected[:, 1], fig_id=fig_id,
                          hue=cell_data[category], cell_data=cell_data, title=graph_title,
                          save_dir=save_dir, save_file=save_file)
Esempio n. 26
0
    def fit_umap(self, feature_list, n_neighbors=5, n_jobs=4):
        """Embed ``feature_list`` in two dimensions with UMAP.

        Uses the euclidean metric, ``random_state=42`` and verbose output,
        and prints the elapsed wall-clock time when done.

        NOTE(review): ``n_jobs`` is forwarded to ``umap.UMAP`` together with
        a fixed ``random_state``; check whether the installed umap version
        honours ``n_jobs`` in that case.

        Returns the embedding produced by ``fit_transform``.
        """
        started_at = time.time()
        reducer = umap.UMAP(
            n_neighbors=n_neighbors,
            random_state=42,
            n_components=2,
            verbose=1,
            n_jobs=n_jobs,
            metric='euclidean',
        )
        projection = reducer.fit_transform(feature_list)

        elapsed = time.time() - started_at
        print('UMAP done! Time elapsed: {} seconds'.format(elapsed))
        return projection
Esempio n. 27
0
 def plot_tsne_and_umap(self,
                        training_set,
                        labels,
                        save_path,
                        model_name="tsne"):
     """
     Embed the training set with t-SNE or UMAP and save a scatter plot.

     :param training_set: data passed to the estimator's ``fit_transform``
     :param labels: labels of training set; used as hue, and
         ``max(labels) + 1`` sets the number of palette colours
     :param save_path: where the figure is written
     :param model_name: "tsne" or "umap"
     :return: the axes object returned by ``sns.scatterplot``
     :raises ValueError: if ``model_name`` is neither "tsne" nor "umap"
     """
     if model_name == "tsne":
         tsne_options = dict(n_components=2,
                             perplexity=30.0,
                             early_exaggeration=12.0,
                             learning_rate=200.0,
                             n_iter=1000,
                             n_iter_without_progress=300,
                             min_grad_norm=1e-07,
                             metric='euclidean',
                             init='random',
                             verbose=0,
                             random_state=None,
                             method='barnes_hut',
                             angle=0.5)
         reducer = manifold.TSNE(**tsne_options)
     elif model_name == "umap":
         reducer = umap.UMAP(n_neighbors=10,
                             min_dist=0.3,
                             metric='correlation')
     else:
         raise ValueError("Model name could not be recognized")
     results = reducer.fit_transform(training_set)

     # The column names mention t-SNE for both models.
     plot_data = {
         'tsne-2d-one': results[:, 0],
         'tsne-2d-two': results[:, 1],
         'y': labels,
     }

     plt.figure(figsize=(16, 10))
     palette = sns.color_palette("hls", max(labels) + 1)
     sns_plot = sns.scatterplot(x="tsne-2d-one",
                                y="tsne-2d-two",
                                hue="y",
                                palette=palette,
                                data=plot_data,
                                legend="full",
                                alpha=0.3)
     sns_plot.get_figure().savefig(save_path)
     return sns_plot
Esempio n. 28
0
def plot_umap(Z, labels):
    """Standardise ``Z``, embed it with UMAP and return a labelled scatter figure.

    Parameters
    ----------
    Z : array-like
        Data to standardise and embed.
    labels : array-like
        Per-point labels used as hue; the palette gets one colour per
        distinct label.

    Returns
    -------
    The figure that owns the scatter plot's axes.
    """
    reducer = umap.UMAP()
    Z_scaled = StandardScaler().fit_transform(Z)
    embedding = reducer.fit_transform(Z_scaled)

    palette = sns.color_palette('muted', n_colors=len(np.unique(labels)))
    # x and y must be passed by keyword: recent seaborn releases reject them
    # as positional arguments.
    ax = sns.scatterplot(x=embedding[:, 0],
                         y=embedding[:, 1],
                         hue=labels,
                         palette=palette)
    ax.set(xlabel='UMAP0', ylabel='UMAP1', xticklabels=[], yticklabels=[])
    ax.set_xticks([])
    ax.set_yticks([])
    return ax.get_figure()
def _learn_umap(data, **kwargs):
    """
    Calculates UMAP transformation for given matrix features.

    Parameters
    --------
    data: pd.DataFrame
        Table of features, one row per sample. Its ``.values`` are embedded
        and its index is reused for the result.
    kwargs: optional
        Parameters for ``umap.UMAP()``. Keys that are not UMAP constructor
        parameters are silently dropped. ``random_state`` defaults to 0 and
        can be overridden by passing it explicitly.

    Returns
    -------
    Calculated UMAP transform, indexed like ``data``

    Return type
    -------
    pd.DataFrame
    """
    # Only forward the keyword arguments that UMAP's constructor understands.
    valid_params = umap.UMAP().get_params()
    accepted = {k: v for k, v in kwargs.items() if k in valid_params}
    # The default comes first so a caller-supplied random_state wins; passing
    # it as a literal keyword next to **kwargs raised "multiple values for
    # keyword argument" whenever the caller set it.
    params = {"random_state": 0, **accepted}
    embedding = umap.UMAP(**params).fit_transform(data.values)
    return pd.DataFrame(embedding, index=data.index.values)
Esempio n. 30
0
def plot_UMAP(generated_data: torch.Tensor, labels: np.ndarray, n_classes: int,
              model_name: str, path: str) -> None:
    """Embed generated samples with UMAP and save a per-class scatter plot.

    Parameters
    ----------
    generated_data
        Samples passed to ``UMAP.fit_transform`` (``random_state=42``).
    labels
        Class label per sample; flattened to one dimension before use.
    n_classes
        Classes ``0 .. n_classes - 1`` are each drawn as their own series.
    model_name
        Used in the plot title and in the file name ``<model_name>_UMAP.png``.
    path
        Directory the image is written to.
    """
    reducer = umap.UMAP(random_state=42)
    embedding = reducer.fit_transform(generated_data)
    labels = np.array(labels).flatten()

    fig, ax = plt.subplots(figsize=(12, 10))
    try:
        for i in range(n_classes):
            indices = np.where(labels == i)[0]
            # Colours repeat every ten classes.
            # NOTE(review): assumes COLORS has at least 10 entries - confirm.
            ax.scatter(embedding[indices, 0],
                       embedding[indices, 1],
                       s=5,
                       label=i,
                       color=COLORS[i % 10])
        ax.set_title("{} Generated MNIST Data".format(model_name),
                     fontsize=18)
        ax.legend(markerscale=2)
        fig.savefig(os.path.join(path, '{}_UMAP.png'.format(model_name)))
    finally:
        # The figure only exists to be written to disk; close it so repeated
        # calls do not accumulate open figures.
        plt.close(fig)