Example #1
        exec("df_style_texture = pd.read_csv('style_AML211_VAE_Texture{}.csv', names=range(1, 101))".format(i))
        array_t = df_style_texture.values

    df_z_mask = pd.concat([df_root_mask, df_style_mask], axis=1)
    df_z_texture = pd.concat([df_root_texture, df_style_texture], axis=1)

    ####combine the z dim from both texture and mask
    f = 0.7  # specify contribution from mask
    array_mt = np.concatenate(
        (array_m * f, array_t * (1 - f)), axis=1
    )  # weighted concatenation of the mask and texture latent dimensions

    ####umap reduce dimensions to about 10

    reducer2D = umap.UMAP(n_components=2, random_state=50)
    print('performing combined umap 2D...')
    umap_result2D = reducer2D.fit_transform(array_mt)

    #%%
    print('calculating louvain...')

    G = kneighbors_graph(array_mt, 200, mode='connectivity',
                         include_self=True)  #was 50
    G1 = nx.from_scipy_sparse_matrix(G)
    partition = community.best_partition(G1, resolution=0.9, random_state=50)

    #%%
    #first convert all 'pos', 't', and 'cell' values to int
    for j in ['pos', 't', 'cell']:
        L = list(df_root_mask[j])
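# Minimal standalone sketch of the weighted mask/texture fusion and 2D UMAP step
# used above, with random arrays standing in for array_m and array_t (the real
# latent codes come from the CSV files loaded earlier).
import numpy as np
import umap

array_m_demo = np.random.rand(300, 100)   # stand-in for the mask latent codes
array_t_demo = np.random.rand(300, 100)   # stand-in for the texture latent codes
f = 0.7                                   # contribution from the mask
array_mt_demo = np.concatenate((array_m_demo * f, array_t_demo * (1 - f)), axis=1)
umap2d_demo = umap.UMAP(n_components=2, random_state=50).fit_transform(array_mt_demo)
print(umap2d_demo.shape)                  # (300, 2)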
Example #2
dataset such as MNIST. We first pull the MNIST dataset and
then use UMAP to reduce it to only two dimensions for
easy visualisation.
Note that UMAP manages both to group the individual digit
classes and to retain the overall global structure
among the different digit classes -- keeping 1 far from
0, and grouping the triplets 3, 5, 8 and 4, 7, 9, which can
blend into one another in some cases.
"""
import umap
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(context="paper", style="white")

mnist = fetch_openml("mnist_784", version=1)  # fetch_mldata was removed from scikit-learn

reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(mnist.data)

fig, ax = plt.subplots(figsize=(12, 10))
plt.scatter(embedding[:, 0],
            embedding[:, 1],
            c=mnist.target.astype(int),
            cmap="Spectral",
            s=0.1)
plt.setp(ax, xticks=[], yticks=[])
plt.title("MNIST data embedded into two dimensions by UMAP", fontsize=18)

plt.show()
Example #3
def generate_umap(activations):
    umap_ = umap.UMAP(n_neighbors=200, n_components=2, min_dist=0.5)
    X_2d = umap_.fit_transform(activations)
    X_2d -= X_2d.min(axis=0)
    X_2d /= X_2d.max(axis=0)
    return X_2d
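# Minimal usage sketch for generate_umap above: toy activations in, coordinates
# scaled to the unit square out (assumes numpy and umap-learn are available).
import numpy as np

activations_demo = np.random.rand(500, 64)   # e.g. 500 samples of 64-d activations
coords_demo = generate_umap(activations_demo)
print(coords_demo.shape, coords_demo.min(axis=0), coords_demo.max(axis=0))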
Example #4
#images = [IMS_Bcells, IMS_CD3neg_NK, IMS_debris, IMS_eos, IMS_ery, IMS_erydub, IMS_monos, IMS_neutro, IMS_Tcells]
#images = pd.DataFrame(images)
#all_images = pd.concat(images)


frames_b_test = np.nan_to_num(frames_unknown)
frames_b_test2 = np.nan_to_num(frames_unknown2)
frames_b_CD3pos_NK = np.nan_to_num(frames_CD3pos_NK)
all_labels_test = (np.zeros(labels_unknown))
all_labels_test2 = (np.zeros(labels_unknown2))
all_labels_CD3pos_NK = (np.zeros(labels_CD3pos_NK))


#UMAP embedding
import umap
reducer = umap.UMAP(n_neighbors=15)
print("------------UMAP imported------------------")



##############################################################################################
################################## first embedding #############################################

#embed and time
import time
start = time.time()
embedding = reducer.fit(frames_b, all_labels)  # note: fit() returns the fitted UMAP object; coordinates are in .embedding_
#embedding = reducer.fit(all_frames, all_labels)
#embedding = reducer.fit_transform(blood_image_new_flat)
end = time.time()
print("------------UMAP embedding finished, embedding time = {:.2f} s ------------------".format(end - start))
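# Standalone sketch of the supervised fit-and-time pattern above, on toy data
# (frames_b / all_labels are stand-ins here; umap.UMAP.fit accepts target labels).
import time
import numpy as np
import umap

X_demo = np.random.rand(400, 30)
y_demo = np.random.randint(0, 3, size=400)
reducer_demo = umap.UMAP(n_neighbors=15)
t0 = time.time()
reducer_demo.fit(X_demo, y_demo)          # supervised UMAP: labels guide the layout
print("embedding time = {:.2f} s".format(time.time() - t0))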
Example #5
def visualize(
    model,  # type: thelper.typedefs.ModelType
    task,  # type: thelper.typedefs.TaskType
    loader,  # type: thelper.typedefs.LoaderType
    draw=False,  # type: bool
    color_map=None,  # type: Optional[Dict[int, np.ndarray]]
    max_samples=None,  # type: Optional[int]
    return_meta=False,  # type: Union[bool, List[AnyStr]]
    **kwargs):  # type: (...) -> Dict[AnyStr, Any]
    """
    Creates (and optionally displays) a 2D UMAP visualization of sample embeddings.

    By default, all samples from the data loader will be projected using the model and used
    for the visualization. If the task is related to classification, the prediction and groundtruth
    labels will be highlighted using various colors.

    If the model does not possess a ``get_embedding`` attribute, its raw output will be
    used for projections. Otherwise, ``get_embedding`` will be called.

    Args:
        model: the model which will be used to produce embeddings.
        task: the task object used to decode predictions and color samples (if possible).
        loader: the data loader used to get data samples to project.
        draw: boolean flag used to toggle internal display call on or off.
        color_map: map of RGB triplets used to color predictions (for classification only).
        max_samples: maximum number of samples to draw from the data loader.
        return_meta: toggles whether sample metadata should be provided as output or not.

    Returns:
        A dictionary of the visualization result (an RGB image in numpy format), a list of projected
        embedding coordinates, the labels of the samples, and the predictions of the samples.
    """
    assert thelper.utils.check_installed("umap"), \
        "could not import optional 3rd-party dependency 'umap-learn'; make sure you install it first!"
    import umap
    assert loader is not None and len(loader) > 0, "no available data to load"
    assert model is not None and isinstance(model,
                                            torch.nn.Module), "invalid model"
    assert task is not None and isinstance(task,
                                           thelper.tasks.Task), "invalid task"
    assert max_samples is None or max_samples > 0, "invalid maximum loader sample count"
    thelper.viz.logger.debug(
        "fetching data loader samples for UMAP visualization...")
    embeddings, labels, preds, idxs = [], [], [], []
    if isinstance(task, thelper.tasks.Classification) and not task.multi_label:
        assert all([isinstance(n, str)
                    for n in task.class_names]), "unexpected class name types"
        if not color_map:
            if hasattr(task, "color_map"):
                color_map = task.color_map
            else:
                color_map = {
                    idx: thelper.draw.get_label_color_mapping(idx + 1)
                    for idx in task.class_indices.values()
                }
        color_map = {
            idx: f"#{c[0]:02X}{c[1]:02X}{c[2]:02X}"
            for idx, c in color_map.items()
        }
    if isinstance(return_meta, bool):
        return_meta = task.meta_keys if return_meta else []
    assert isinstance(return_meta, list) and all([isinstance(key, str) for key in return_meta]), \
        "sample metadata keys must be provided as a list of strings"
    meta = {key: [] for key in return_meta}
    for sample_idx, sample in tqdm.tqdm(enumerate(loader),
                                        desc="extracting embeddings"):
        if max_samples is not None and sample_idx > max_samples:
            break
        with torch.no_grad():
            input_tensor = sample[task.input_key]
            if task is not None and isinstance(task, thelper.tasks.Classification) and \
                    not task.multi_label and task.gt_key in sample:
                label = sample[task.gt_key]
                if isinstance(label, torch.Tensor):
                    label = label.cpu().numpy()
                if all([isinstance(lbl, str) for lbl in label]):
                    label = [task.class_indices[lbl] for lbl in label]
                pred = model(input_tensor).topk(k=1, dim=1)[1].view(
                    input_tensor.size(0)).cpu().numpy()
                labels.append(label)
                preds.append(pred)
            if hasattr(model, "get_embedding"):
                embedding = model.get_embedding(input_tensor)
            else:
                if not thelper.viz.warned_missing_get_embedding:
                    thelper.viz.logger.warning(
                        "missing 'get_embedding' function in model object; will use output instead"
                    )
                    thelper.viz.warned_missing_get_embedding = True
                embedding = model(input_tensor)
            if embedding.dim() > 2:  # reshape to BxC
                embedding = embedding.view(embedding.size(0), -1)
        embeddings.append(embedding.cpu().numpy())
        idxs.append(sample_idx)
        for key in return_meta:
            for v in sample[key]:
                meta[key].append(v)
    embeddings = np.concatenate(embeddings)
    if labels and preds:
        labels, preds = np.concatenate(labels), np.concatenate(preds)
    else:
        labels, preds = [0] * len(embeddings), [0] * len(embeddings)
    seed = thelper.utils.get_key_def("seed", kwargs, 0)
    if seed is None:
        seed = np.random.randint(np.iinfo(np.int32).max)
    prev_state = np.random.get_state()
    np.random.seed(seed)
    default_umap_args = {"n_components": 2}
    umap_args = thelper.utils.get_key_def("umap_args", kwargs,
                                          default_umap_args)
    umap_engine = umap.UMAP(**umap_args)
    thelper.viz.logger.debug("computing UMAP projection...")
    embeddings = umap_engine.fit_transform(embeddings)
    np.random.set_state(prev_state)
    fig = plot(embeddings,
               labels,
               preds,
               color_map=color_map,
               task=task,
               **kwargs)
    img = thelper.draw.fig2array(fig).copy()
    if draw:
        thelper.viz.logger.debug("displaying UMAP projection...")
        cv.imshow("thelper.viz.umap",
                  img[..., ::-1])  # RGB to BGR for opencv display
        cv.waitKey(1)
    return {
        # key formatting should be compatible with _write_data in thelper/train/base.py
        "tsne-projs/pickle": embeddings,
        "tsne-labels/json": labels.tolist(),
        "tsne-preds/json": preds.tolist(),
        "tsne-idxs/json": idxs,
        "tsne-meta/json": meta,
        "tsne/image": img
    }
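# Standalone sketch of the seeded-projection pattern used inside visualize():
# temporarily seed numpy's global RNG, run UMAP, then restore the previous state.
import numpy as np
import umap

feats_demo = np.random.rand(400, 64)
prev_state = np.random.get_state()
np.random.seed(0)
proj_demo = umap.UMAP(n_components=2).fit_transform(feats_demo)
np.random.set_state(prev_state)
print(proj_demo.shape)                    # (400, 2)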
Example #6
def main():
    #select cpu or gpu
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(16)

    #pre process data

    #annotation
    an = pd.read_csv(
        "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/data/annotation.csv",
        index_col=0)
    tmp = an["loh_percent"].copy()
    for i in range(len(tmp)):
        if tmp[i] >= 0.05:
            tmp[i] = 1
        elif tmp[i] < 0.05:
            tmp[i] = 0
    an["loh_percent"] = tmp

    tmp2 = an["mutations_per_mb"].copy()
    for i in range(len(tmp2)):
        if tmp2[i] >= 28:
            tmp2[i] = 1
        elif tmp2[i] < 28:
            tmp2[i] = 0
    an["mutations_per_mb"] = tmp2

    #data
    x = pd.read_csv(
        "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/data/salmonE74cDNA_counts_baseline.csv",
        index_col=0)
    x = x.T
    x = (x + 1).apply(np.log2)
    #test = np.median(x, axis=0)
    x_std = np.std(x, axis=0)
    top_gene = runPams.n_top_gene
    top_gene_idx = x_std.argsort()[::-1][0:top_gene]
    data = x.iloc[:, top_gene_idx]
    data = data.values.copy()
    top_gene_names = list(x.columns[top_gene_idx])
    top_gene_names = np.insert(top_gene_names, 0, "bias")

    #data = np.random.rand(10, 200)
    xn, yn = data.shape

    # umap + kmeans
    pams = str(runPams.k) + "_" + str(runPams.n_top_gene)
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_UMAP/"
    # umap
    reducer = umap.UMAP()
    z = reducer.fit_transform(data)
    # kmeans
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)

    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)

    # Hierarchical Clustering
    clst = cluster.AgglomerativeClustering(n_clusters=4)

    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)

    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)

    # TSNE
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_TSNE/"

    # T-sNE
    z = TSNE(n_components=2).fit_transform(data)
    # kmeans
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)

    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)

    # Hierarchical Clustering
    clst = cluster.AgglomerativeClustering(n_clusters=4)

    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)

    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)

    # vae+dnp
    #data = np.random.rand(10, 2000)
    #xn, yn = data.shape
    data = np.reshape(data, (xn, 1, yn))
    data = np.insert(data, 0, 1, axis=2)
    #data = data[:,:,:5000]
    zn, xn, yn = data.shape
    # set s
    set_s = np.zeros(xn * yn)
    set_s[0] = 1

    # set c
    set_c = np.ones(xn * yn)
    set_c[0] = 0

    # np 2 tensor
    data = torch.tensor(data)
    # dataLoader
    dataSet = myData.MyDataset(data, data)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=runPams.batch_size,
                            shuffle=False,
                            num_workers=runPams.n_cpu,
                            pin_memory=torch.cuda.is_available())

    net, optimizer, lossFunc = getVAEPams(xn, yn, device, runPams.lr)

    #np->tensor or gpu
    set_s = torch.tensor(set_s).float().to(device)
    set_c = torch.tensor(set_c).float().to(device)

    # train
    while torch.sum(set_s == 1).item() < (runPams.k + 1):
        print(torch.sum(set_s == 1).item())
        for _ in range(runPams.epoch):
            for step, (x, _) in enumerate(dataLoader):
                b_x = Variable(x.view(-1, xn * yn).float().to(device))
                b_y = Variable(x.view(-1, xn * yn).float().to(device))

                # initialize the weight of set c to be zero and of set s to be normal
                net.fc1.weight.data = net.fc1.weight.data * (set_s)

                # network
                _, decoded, _ = net(b_x)
                loss = lossFunc(decoded, b_y)  # mean square error
                optimizer.zero_grad()  # clear gradients for this training step
                loss.backward()  # backpropagation, compute gradients
                optimizer.step()  # apply gradients
                print(net.fc1.weight.grad)

        #get new J
        newJ = getNewJ(net.fc1.weight.grad.clone(), set_c, device).item()
        print(newJ)

        # initialize the weight of node J by xavier
        tmpWeight = torch.rand(1, net.fc1.out_features)
        tmpWeight = nn.init.xavier_normal_(tmpWeight)
        net.fc1.weight.data[:, newJ] = tmpWeight

        # update set s and set c
        set_s[newJ] = torch.tensor(1)
        set_c[newJ] = torch.tensor(0)

    # test
    #sys.exit()
    predLabelsByVAE = list()
    features = list()
    for (x, _) in dataLoader:
        b_x = Variable(x.view(-1,
                              xn * yn).float().to(device))  # batch x (data)
        feature, _, predicted = net(b_x)
        features.append([feature.cpu().detach().numpy()])
        predicted = torch.max(predicted.data, 1)[1].cpu().numpy()
        predLabelsByVAE.append(predicted)
    # test end

    features = np.hstack(features)
    zn, xn, yn = features.shape
    features = np.reshape(features, (xn, yn))
    features = np.array(features)
    z = features
    pams = str(runPams.k) + "_" + str(runPams.n_top_gene)
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_VAE+DNP/"

    # kmeans
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)

    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)

    # Hierarchical Clustering
    clst = cluster.AgglomerativeClustering(n_clusters=4)

    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)

    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)

    # save gene names
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/genes_selected.csv"
    genes = pd.DataFrame(set_s.cpu().detach().numpy())
    genes = genes.T
    genes.columns = top_gene_names
    genes.to_csv(pathName)
    '''
    kmeans_estimator = KMeans(n_clusters=4, random_state=0).fit(features)
    labelByVAEKmeans = kmeans_estimator.labels_ 
    # get figures
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    # in 'or', 'o' means a circle marker and 'r' means red; the rest follow the same pattern
    for i in range(len(labelByVAEKmeans)):
        plt.plot([features[i, 0]], [features[i, 1]], mark[labelByVAEKmeans[i]], markersize=5)
    #save data
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200702AE_DNP/results/csv_img_res/"
    fileName = pathName + str(runPams.k) + ".png"
    plt.savefig(fileName)

    fileName = pathName + str(runPams.k) + ".csv"
    setS = pd.DataFrame(set_s.cpu().detach().numpy())
    setS = setS.T
    setS.to_csv(fileName)
    #plt.show()
    '''
    return ()
Example #7
X_datasets.append(X_coil20)
Y_datasets.append(y_coil20)

X_digits, y_digits = datasets.load_digits(n_class=10, return_X_y=True)
X_datasets.append(X_digits)
Y_datasets.append(y_digits)

dft = pd.read_csv('./data/fashion-mnist_test.csv', dtype=int)  # read test data
X_fashion = dft.drop('label', axis=1)
y_fashion = dft['label']
X_datasets.append(X_fashion)
Y_datasets.append(y_fashion)

# Set up algorithms
methods = OrderedDict()
methods['umap'] = umap.UMAP()
methods['t-SNE'] = manifold.TSNE(n_components=2, init='pca', random_state=0)

fig = plt.figure(figsize=(15, 8))
# Plot results
labels = ['COIL20', 'MNIST', 'FASHION MNIST']
for i, (label, method) in enumerate(methods.items()):
    for j in range(len(X_datasets)):
        print(X_datasets[j].shape)
        Y = method.fit_transform(X_datasets[j])
        ax = fig.add_subplot(2, 3, i * 3 + j + 1)
        ax.scatter(Y[:, 0], Y[:, 1], c=Y_datasets[j], cmap=plt.cm.Spectral)
        ax.set_title(labels[j])
        ax.set_ylabel(label)
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
Example #8
def do_umap(filename, bm_plot_name, env_plot_name):
    # get the reaction-inclusion vector out of the input file and make it into
    # a bunch of 1/0 columns instead of one column of strings of 1s and 0s
    data = pd.read_csv(filename)
    # want each reaction bit in its own column and only pass those columns to umap
    rxn_incl_cols = data['rxn_incl'].apply(lambda x: pd.Series(list(x)))
    # first and last columns will be empty and all columns will be strings
    umap_ready = rxn_incl_cols.iloc[:, 1:-1].astype('int32')

    # do UMAP
    print('Doing UMAP')
    reducer = umap.UMAP()
    umap_results = reducer.fit_transform(umap_ready)
    umap_df = pd.DataFrame(data=umap_results, columns=['x', 'y'])
    # add in other info for plotting purposes
    plotting_df = pd.concat([umap_df, data], axis=1)

    # make a colormap for biomass reactions
    print('Plotting')
    bm_cdict = {v: k for k, v in enumerate(np.unique(data.biomass))}
    bm_cvals = [bm_cdict[c] for c in data.biomass]

    # make a colormap for input metabolites
    # start by getting the column of input metabolites, splitting it into a list of
    # lists, then pasting the sublists together so that there's one string to use
    # for making the colormap
    in_groups = ['-'.join(sorted(ins.split('-'))) for ins in data.env]
    in_cdict = {v: k for k, v in enumerate(np.unique(in_groups))}
    in_cvals = [in_cdict[c] for c in in_groups]

    # make the figure large
    plt.figure(figsize=(8, 7))

    # make the text legible
    matplotlib.rcParams.update({
        'font.size': 18,
        'xtick.labelsize': 18,
        'ytick.labelsize': 18,
        'axes.labelsize': 18
    })

    # do one scatterplot colored by biomass reactions
    plt.scatter(plotting_df.x,
                plotting_df.y,
                c=bm_cvals,
                cmap='nipy_spectral',
                s=10)
    plt.xlabel('UMAP_1')
    plt.ylabel('UMAP_2')
    plt.savefig(f'data/{bm_plot_name}.png', dpi=600)

    # do one scatterplot colored by environments
    plt.figure(2)
    plt.figure(figsize=(8, 7))
    plt.scatter(plotting_df.x,
                plotting_df.y,
                c=in_cvals,
                cmap='nipy_spectral',
                s=10)
    plt.xlabel('UMAP_1')
    plt.ylabel('UMAP_2')
    plt.savefig(f'data/{env_plot_name}.png', dpi=600)
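# Small standalone sketch of the string-to-bit-columns step described in the
# comments above, on a made-up 'rxn_incl' column of 0/1 strings.
import pandas as pd

demo = pd.DataFrame({'rxn_incl': ['0101', '1100', '0011']})
bit_cols = demo['rxn_incl'].apply(lambda x: pd.Series(list(x))).astype('int32')
print(bit_cols)   # one integer column per reaction bit, ready for UMAP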
Example #9
def layout_umap(
    graph: nx.Graph,
    min_dist: float = 0.75,
    n_neighbors: int = 25,
    max_edges: int = 10000000,
    random_seed: Optional[int] = None,
) -> Tuple[nx.Graph, List[NodePosition]]:
    """
    Automatic graph layout generation by creating a generalized node2vec embedding,
    then using UMAP for dimensionality reduction to 2d space.

    By default, this function automatically attempts to prune each graph to a maximum
    of 10,000,000 edges by removing the lowest weight edges. This pruning is approximate
    and will leave your graph with at most ``max_edges``, but is not guaranteed to be
    precisely ``max_edges``.

    In addition to pruning edges by weight, this function also only operates over the
    largest connected component in the graph.

    After dimensionality reduction, sizes are generated for each node based upon
    their degree centrality, and these sizes and positions are further refined by an
    overlap removal phase. Lastly, a global partitioning algorithm
    (:func:`graspologic.partition.leiden`) is executed for the largest connected
    component and the partition ID is included with each node position.

    Parameters
    ----------
    graph : :class:`networkx.Graph`
        The graph to generate a layout for. This graph may have edges pruned if the
        count is too high and only the largest connected component will be used to
        automatically generate a layout.
    min_dist : float
        The effective minimum distance between embedded points. Default is ``0.75``.
        Smaller values will result in a more clustered/clumped embedding where nearby
        points on the manifold are drawn closer together, while larger values will
        result on a more even dispersal of points. The value should be set relative to
        the ``spread`` value, which determines the scale at which embedded points will
        be spread out.
    n_neighbors : int
        The size of local neighborhood (in terms of number of neighboring sample points)
        used for manifold approximation. Default is ``25``. Larger values result in
        more global views of the manifold, while smaller values result in more local
        data being preserved.
    max_edges : int
        The maximum number of edges to use when generating the embedding.  Default is
        ``10000000``. The edges with the lowest weights will be pruned until at most
        ``max_edges`` exist. Warning: this pruning is approximate and more edges than
        are necessary may be pruned. If you are running in a 32-bit environment you will
        most likely need to reduce this number or you will run out of memory.
    random_seed : int
        Seed to be used for reproducible results. Default is None and will produce
        random results.

    Returns
    -------
    Tuple[nx.Graph, List[NodePosition]]
        The largest connected component and a list of NodePositions for each node in
        the largest connected component. The NodePosition object contains:
        - node_id
        - x coordinate
        - y coordinate
        - size
        - community

    References
    ----------
    .. [1] McInnes, L, Healy, J, UMAP: Uniform Manifold Approximation and Projection
        for Dimension Reduction, ArXiv e-prints 1802.03426, 2018
    .. [2] Böhm, Jan Niklas; Berens, Philipp; Kobak, Dmitry. A Unifying Perspective
        on Neighbor Embeddings along the Attraction-Repulsion Spectrum. ArXiv e-prints
        2007.08902v1, 17 Jul 2020.
    """

    lcc_graph, tensors, labels = _node2vec_for_layout(graph, max_edges, random_seed)
    points = umap.UMAP(
        min_dist=min_dist, n_neighbors=n_neighbors, random_state=random_seed
    ).fit_transform(tensors)
    positions = _node_positions_from(lcc_graph, labels, points, random_seed=random_seed)
    return lcc_graph, positions
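# Hedged usage sketch for layout_umap above, on a small built-in graph; assumes
# the node2vec/Leiden helpers it relies on (graspologic) are importable and that
# edges carry a 'weight' attribute.
import networkx as nx

g_demo = nx.karate_club_graph()
for _, _, d in g_demo.edges(data=True):
    d["weight"] = 1.0
lcc_demo, positions_demo = layout_umap(g_demo, random_seed=1234)
print(len(positions_demo), positions_demo[0].x, positions_demo[0].y)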
Example #10
import pandas as pd
import numpy as np
import umap
from sklearn.feature_extraction.text import TfidfVectorizer

covidtrials = pd.read_csv("C:\\clustering\\allresults.csv")


def TFIDF(X_train, MAX_NB_WORDS=75000):
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    print("tf-idf with", str(np.array(X_train).shape[1]), "features")
    columns = vectorizer_x.get_feature_names()
    return (X_train, columns)


Y = covidtrials['newc']
vectorizer = TfidfVectorizer(max_features=75000)
X = vectorizer.fit_transform(Y)

YY = covidtrials['clusters']

mapper = umap.UMAP().fit(X)

import umap.plot

p = umap.plot.points(mapper, labels=YY, color_key_cmap='Paired')

umap.plot.plt.show()
Example #11
import numpy as np
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
from skimage.io import imread
from skimage.transform import rescale, resize, pyramid_gaussian, pyramid_reduce
import matplotlib.pylab as plt
import umap
#%%
from os.path import join
#%%
iris = load_iris()
reducer = umap.UMAP(metric="correlation")
embedding = reducer.fit_transform(iris.data)
embedding.shape
#%%
face_attr_df = pd.read_csv(r"E:\Datasets\celeba-dataset\list_attr_celeba.csv")
#%%
celeba_dir = r"E:\Datasets\celeba-dataset\img_align_celeba\img_align_celeba"
imgs = []
for imgi in range(1, 10001):
    img = imread(join(celeba_dir, r"%06d.jpg" % imgi))
    #print(img.shape)
    imgs.append(img)
#%%
# img_rd = rescale(img, 0.11, multichannel=True, anti_aliasing=True)
# print(img_rd.shape)
img_rs = resize(img, (24, 20, 3), anti_aliasing=True)
print(img_rs.shape)
plt.imshow(img_rs)
plt.show()
Example #12
subXie = Xiecluster[[
    'Soma_region', 'Brain_id', 'SWC_File', 'Celltype', 'Subtype'
]].copy()
# Change Dr.Xie's Soma_region to 'Xie soma abbr'
subXie.rename(columns={'Soma_region': 'Xie Soma_Abbr'}, inplace=True)
subXie.index = Xiecluster['SWC_File']
subXie = subXie.reindex(index=os.listdir('/home/penglab/Documents/CLA_swc'),
                        fill_value='0')

# %%
# Use umap to map data from high dimension to low dimension
import umap
import matplotlib.pyplot as plt
import seaborn as sns
reducer = umap.UMAP()
embedding = reducer.fit_transform(Feafile.values)
print('\n')
print('Shape of the Umap result are ', embedding.shape)
print('The result is an array with ' + str(embedding.shape[0]) +
      ' samples, but only ' + str(embedding.shape[1]) +
      ' feature columns (instead of the ' + str(Feafile.shape[1]) +
      ' we started with).')

#Show the original subtype
ShowXie = pd.DataFrame(index=subXie.index,
                       columns=['ux', 'uy', 'Subtype', 'plotc'])
typeR, typeC = np.unique(subXie['Subtype'], return_counts=True)
ShowXie['ux'] = embedding[:, 0]
ShowXie['uy'] = embedding[:, 1]
ShowXie['Subtype'] = subXie['Subtype']
Example #13
def eval_other_methods(x, y):
    gmm = mixture.GaussianMixture(covariance_type='full',
                                  n_components=args.n_clusters,
                                  random_state=0)
    gmm.fit(x)
    y_pred_prob = gmm.predict_proba(x)
    y_pred = y_pred_prob.argmax(1)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | GMM clustering on raw data")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")

    y_pred = KMeans(n_clusters=args.n_clusters, random_state=0).fit_predict(x)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | K-Means clustering on raw data")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")

    sc = SpectralClustering(n_clusters=args.n_clusters,
                            random_state=0,
                            affinity='nearest_neighbors')
    y_pred = sc.fit_predict(x)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | Spectral Clustering on raw data")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")

    if args.manifold_learner == 'UMAP':
        md = float(args.umap_min_dist)
        hle = umap.UMAP(random_state=0,
                        metric=args.umap_metric,
                        n_components=args.umap_dim,
                        n_neighbors=args.umap_neighbors,
                        min_dist=md).fit_transform(x)
    elif args.manifold_learner == 'LLE':
        from sklearn.manifold import LocallyLinearEmbedding
        hle = LocallyLinearEmbedding(
            n_components=args.umap_dim,
            n_neighbors=args.umap_neighbors).fit_transform(x)
    elif args.manifold_learner == 'tSNE':
        method = 'exact'
        hle = TSNE(n_components=args.umap_dim,
                   n_jobs=16,
                   random_state=0,
                   verbose=0).fit_transform(x)
    elif args.manifold_learner == 'isomap':
        hle = Isomap(
            n_components=args.umap_dim,
            n_neighbors=5,
        ).fit_transform(x)

    gmm = mixture.GaussianMixture(covariance_type='full',
                                  n_components=args.n_clusters,
                                  random_state=0)
    gmm.fit(hle)
    y_pred_prob = gmm.predict_proba(hle)
    y_pred = y_pred_prob.argmax(1)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | GMM clustering on " + str(args.manifold_learner) +
          " embedding")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")

    plt.scatter(*zip(*hle[:, :2]), c=y, label=y)

    plt.savefig(args.save_dir + '/' + args.dataset + '-' +
                str(args.manifold_learner) + '.png')
    plt.clf()

    y_pred = KMeans(n_clusters=args.n_clusters,
                    random_state=0).fit_predict(hle)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | K-Means clustering on " + str(args.manifold_learner) +
          " embedding")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")

    sc = SpectralClustering(n_clusters=args.n_clusters,
                            random_state=0,
                            affinity='nearest_neighbors')
    y_pred = sc.fit_predict(hle)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | Spectral Clustering on " +
          str(args.manifold_learner) + " embedding")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")
Example #14
def cluster_manifold_in_embedding(hl, y, n_clusters, save_dir, visualize):
    # find manifold on autoencoded embedding
    if args.manifold_learner == 'UMAP':
        md = float(args.umap_min_dist)
        hle = umap.UMAP(random_state=0,
                        metric=args.umap_metric,
                        n_components=args.umap_dim,
                        n_neighbors=args.umap_neighbors,
                        min_dist=md).fit_transform(hl)
    elif args.manifold_learner == 'LLE':
        hle = LocallyLinearEmbedding(
            n_components=args.umap_dim,
            n_neighbors=args.umap_neighbors).fit_transform(hl)
    elif args.manifold_learner == 'tSNE':
        hle = TSNE(n_components=args.umap_dim,
                   n_jobs=16,
                   random_state=0,
                   verbose=0).fit_transform(hl)
    elif args.manifold_learner == 'isomap':
        hle = Isomap(
            n_components=args.umap_dim,
            n_neighbors=5,
        ).fit_transform(hl)

    # clustering on new manifold of autoencoded embedding
    if args.cluster == 'GMM':
        gmm = mixture.GaussianMixture(covariance_type='full',
                                      n_components=n_clusters,
                                      random_state=0)
        gmm.fit(hle)
        y_pred_prob = gmm.predict_proba(hle)
        y_pred = y_pred_prob.argmax(1)
    elif args.cluster == 'KM':
        km = KMeans(init='k-means++',
                    n_clusters=n_clusters,
                    random_state=0,
                    n_init=20)
        y_pred = km.fit_predict(hle)
    elif args.cluster == 'SC':
        sc = SpectralClustering(n_clusters=n_clusters,
                                random_state=0,
                                affinity='nearest_neighbors')
        y_pred = sc.fit_predict(hle)

    y_pred = np.asarray(y_pred)
    y_pred = y_pred.reshape(len(y_pred), )
    y = np.asarray(y)
    y = y.reshape(len(y), )
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | " + args.manifold_learner +
          " on autoencoded embedding with " + args.cluster + " - N2D")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")

    if visualize:
        plt.scatter(*zip(*hle[:, :2]), c=y, label=y)

        plt.savefig(save_dir + '/' + args.dataset + '-n2d.png')
        plt.clf()

    return y_pred, acc, nmi, ari
Example #15
	for i in range(len(n4)):
		elem=n4[i]
		eigs_matrix[k][0],eigs_matrix[k][1]=elem[0],elem[1]
		cols.append(colores['Sparse'])
		k+=1'''
'''
	for i in range(len(n1)):
		elem=n1[i]
		eigs_matrix[k][0],eigs_matrix[k][1]=elem[0],elem[1]
		cols.append(colores['Complete'])
		k+=1 '''

embedding = umap.UMAP(n_neighbors=75,
                      metric='canberra',
                      n_epochs=1000,
                      min_dist=0.01,
                      repulsion_strength=10,
                      negative_sample_rate=50,
                      transform_queue_size=10)

H = embedding.fit_transform(eigs_matrix)
plt.scatter(H[:, 0], H[:, 1], c=cols, s=5)
plt.show()
"""
nx.draw(G_coma[1], with_labels=True, font_weight='bold')
plt.subplot(224)
nx.draw(G_old[1], with_labels=True, font_weight='bold')
plt.show()
np.savetxt("coma_props.csv",coma_props,delimiter=",",fmt="%s")
nx.degree_centrality(Graphs[0])
nx.eigenvector_centrality(Graphs[0])
Example #16
    # sklearn handwritten digits dataset
    digits = datasets.load_digits()
    x_data = digits.data[0:100]
    y_d = digits.target[0:100]

    labels = (2, 3, 7)
    x_list = []
    y_list = []
    for i, j in zip(x_data, y_d):
        if j in labels:
            x_list.append(i)
            y_list.append(j)

    x_data = umap.UMAP(n_neighbors=20,
                       n_components=10,
                       min_dist=0.01,
                       metric='correlation').fit_transform(x_list, y=y_list)
    parameters = []
    sc = StandardScaler()
    sc.fit(x_data)
    x_data = sc.transform(x_data)
    # labels = random.sample(range(10), k=3)

    x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                        y_list,
                                                        test_size=0.1,
                                                        shuffle=False)

    dim = len(x_data[0])
    theta_list = []
    test = QVC(dim, dim, ["0" * dim, "1" * dim], 16384, 1, dim, max(y_d))
"""
UMAP(learning_rate=3.0, local_connectivity=1.0,
     metric=<function dist_eigs at 0x7f6295517b70>, metric_kwds=None,
     min_dist=0, n_components=2, n_epochs=30, n_neighbors=20,
     negative_sample_rate=5, random_state=None, repulsion_strength=1.0,
     set_op_mix_ratio=1.0, spread=1.0, target_metric='categorical',
     target_metric_kwds=None, target_n_neighbors=-1, target_weight=0.5,
     transform_queue_size=4.0, transform_seed=42, verbose=True)
"""

#splitting data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, cols, test_size=0.3)

embedding = umap.UMAP(n_components=2,
                      n_neighbors=25,
                      spread=2,
                      metric=dist_eigs,
                      verbose=True,
                      n_epochs=500)
#H = embedding.fit_transform(data,y=cols)
H = embedding.fit_transform(data, y=cols)
#adding legend
one = mpatches.Patch(facecolor=colores[category1],
                     label=category1,
                     linewidth=0.5,
                     edgecolor='black')
two = mpatches.Patch(facecolor=colores[category2],
                     label=category2,
                     linewidth=0.5,
                     edgecolor='black')
fig = plt.figure()
plt.scatter(H[:, 0], H[:, 1], c=cols, s=5)
Example #18
 def UMAP(self, **kwargs):
     XT = self.data[self.columns_latent_states].values
     XT = StandardScaler().fit_transform(XT)
     XT = umap.UMAP(**kwargs).fit_transform(XT)
     self.data[[f'UMAP {i+1}' for i in range(XT.shape[1])]] = XT
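# Standalone sketch of the same pattern as the UMAP method above: standardize,
# embed, then store the components as new DataFrame columns (toy data, column
# names made up).
import numpy as np
import pandas as pd
import umap
from sklearn.preprocessing import StandardScaler

df_demo = pd.DataFrame(np.random.rand(200, 5), columns=[f"z{i}" for i in range(5)])
XT_demo = StandardScaler().fit_transform(df_demo.values)
XT_demo = umap.UMAP(n_components=2).fit_transform(XT_demo)
for i in range(XT_demo.shape[1]):
    df_demo[f"UMAP {i+1}"] = XT_demo[:, i]
print(df_demo.head())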
Example #19
repulsion_strength = hf['repulsion_strength']
repulsion_strength = (repulsion_strength[0][0])

negative_sample_rate = hf['negative_sample_rate']
negative_sample_rate = (negative_sample_rate[0][0])

transform_queue_size = hf['transform_queue_size']
transform_queue_size = (transform_queue_size[0][0])

target_n_neighbors = hf['target_n_neighbors']
target_n_neighbors = (target_n_neighbors[0][0])

target_weight = hf['target_weight']
target_weight = (target_weight[0][0])

transform_seed = hf['transform_seed']
transform_seed = (transform_seed[0][0])

metric = sys.argv[1]


hf = h5py.File(os.path.join(script_path, 'D.mat'),'r')
D = np.array(hf.get('D'))

reducer = umap.UMAP(metric=metric, n_neighbors=n_neighbors, n_components=n_components,
                    learning_rate=learning_rate, min_dist=min_dist, spread=spread,
                    set_op_mix_ratio=set_op_mix_ratio, local_connectivity=local_connectivity,
                    repulsion_strength=repulsion_strength, negative_sample_rate=negative_sample_rate,
                    transform_queue_size=transform_queue_size, target_n_neighbors=target_n_neighbors,
                    target_weight=target_weight, transform_seed=transform_seed)
embedding = reducer.fit_transform(D)

with h5py.File(os.path.join(script_path, 'data.h5'), 'w') as hf:
    hf.create_dataset('R', data=embedding)
Example #20
plt.gcf().clear()

#%%[markdown]
# ## Clustering grid search
#
# Search for the best clustering hyperparameters from the
# fitted w2v grid search, then plot the results

#%%
print('Reducing dimensionality of word2vec embeddings for clustering...')
# Get the normalized word2vec embeddings
w2v_gscv.best_estimator_.named_steps['w2v'].gensim_model.init_sims(
    replace=True)
vectors = w2v_gscv.best_estimator_.named_steps['w2v'].gensim_model.wv.vectors
# Reduce dimensionality to 3D using UMAP
umapper = umap.UMAP(n_components=3)
umap_vectors = umapper.fit_transform(vectors)

#%%
# Do the clustering
print('Performing grid search for clustering...')
clust_gscv = GridSearchCV(
    clust_pipe,
    CLUST_GRID,
    scoring={'sil': silhouette_scorer_cosine},
    cv=3,
    refit='sil',
    error_score=0,
    return_train_score=False,
    n_jobs=-2,
    verbose=1)
Example #21
for sent in test_dataset:
    concept_scores = {}
    for i in range(5):
        concept_scores[i] = 0

    words = nltk.word_tokenize(sent)
    for word in words:
        for k in concept_words.keys():
            for tup in concept_words[k]:
                if tup[0] == word:
                    concept_scores[k] += tup[1]
                    break
    print(sent)
    print(concept_scores)
    #print(sorted(concept_scores.items(),key = lambda x:x[1], reverse=True))

import umap

X_topics = lsa.fit_transform(X)
embedding = umap.UMAP(n_neighbors=150, min_dist=0.5,
                      random_state=12).fit_transform(X_topics)

plt.figure(figsize=(7, 5))
plt.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=dataset_tar,
    s=10,  # size
    edgecolor='none')
plt.show()
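# Hedged standalone sketch of the LSA -> UMAP pipeline used above, on a toy corpus
# (here `lsa` is assumed to be a TruncatedSVD, as in typical topic-model setups).
import umap
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

docs_demo = ["graphs and networks", "umap embeds manifolds",
             "topic models summarise text", "networks form communities"] * 25
X_demo = TfidfVectorizer().fit_transform(docs_demo)
X_topics_demo = TruncatedSVD(n_components=5).fit_transform(X_demo)
emb_demo = umap.UMAP(n_neighbors=15, min_dist=0.5, random_state=12).fit_transform(X_topics_demo)
print(emb_demo.shape)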
Example #22
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, peratom,
         keepraw, scale, umap_d, pc1, pc2, projectatomic, plotatomic, adtext):
    """

    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file. You can use gen_descriptors.py to compute it.
    fxyz: Location of xyz file for reading the properties.
    ftags: Location of tags for the first M samples. Plot the tags on the umap.
    fcolor: Location of a file or name of the tags in ase xyz file. It should contain properties for all samples (N floats) used to color the scatterplot.
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    prefix: Filename prefix, default is ASAP
    output: The format for output files ([xyz], [matrix]). Default is xyz.
    peratom: Whether to output per-atom UMAP coordinates (True/False)
    keepraw: Whether to keep the high dimensional descriptor when output is an xyz file (True/False)
    scale: Scale the coordinates (True/False). Scaling is highly recommended.
    umap_d: Dimension of the embedded space.
    pc1: Index of the first principal axis to plot
    pc2: Index of the second principal axis to plot
    projectatomic: build the projection using the (big) atomic descriptor matrix
    plotatomic: Plot the PCA coordinates of all atomic environments (True/False)
    adtext: Whether to adjust the texts (True/False)

    Returns
    -------

    """

    foutput = prefix + "-pca-d" + str(umap_d)
    use_atomic_desc = (peratom or plotatomic or projectatomic)

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, use_atomic_desc)
        if projectatomic: desc = desc_atomic.copy()
    else:
        asapxyz = None
        print(
            "Did not provide the xyz file. We can only output descriptor matrix."
        )
        output = 'matrix'
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
            print("loaded the descriptor matrix from file: ", fmat)
        except:
            raise ValueError('Cannot load the descriptor matrix from file')
    # sanity check
    if len(desc) == 0:
        raise ValueError(
            'Please supply the descriptor in an xyz file or a standalone descriptor matrix'
        )
    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")[:]
        ndict = len(tags)
    else:
        tags = []

    # scale & center
    if scale:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        print('Shape of descriptor matrix is {}'.format(desc.shape))
        print(scaler.fit(desc))
        desc = scaler.transform(desc)  # normalizing the features

    # fit UMAP

    reducer = umap.UMAP()
    proj = reducer.fit_transform(desc)
    if peratom or plotatomic and not projectatomic:
        proj_atomic_all = reducer.transform(desc_atomic)

    # save
    if output == 'matrix':
        np.savetxt(foutput + ".coord",
                   proj,
                   fmt='%4.8f',
                   header='low D coordinates of samples')
        if peratom:
            np.savetxt(foutput + "-atomic.coord",
                       proj_atomic_all,
                       fmt='%4.8f',
                       header='low D coordinates of samples')
    if output == 'xyz':
        if os.path.isfile(foutput + ".xyz"):
            os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
        asapxyz.set_descriptors(proj, 'pca_coord')
        if peratom:
            asapxyz.set_atomic_descriptors(proj_atomic_all, 'pca_coord')
        # remove the raw descriptors
        if not keepraw:
            asapxyz.remove_descriptors(fmat)
            asapxyz.remove_atomic_descriptors(fmat)
        asapxyz.write(foutput)

    # color scheme
    plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(
        fcolor, asapxyz, colorscol, 0, (peratom or plotatomic), projectatomic)

    # make plot
    if plotatomic:
        outfile = 'UMAP_4_' + prefix + '-c-' + fcolor + '-plotatomic.png'
    else:
        outfile = 'UMAP_4_' + prefix + '-c-' + fcolor + '.png'

    fig_spec_dict = {
        'outfile': outfile,
        'show': False,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True,
        'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': {
            "first_p": {
                "type": 'scatter',
                'clabel': colorlabel
            },
            "second_p": {
                "type": 'annotate',
                'adtext': adtext
            }
        }
    }
    asap_plot = Plotters(fig_spec_dict)
    asap_plot.plot(proj[::-1, [pc1, pc2]], plotcolor[::-1], [], tags)
    if peratom or plotatomic and not projectatomic:
        asap_plot.plot(proj_atomic_all[::-1, [pc1, pc2]],
                       plotcolor_peratom[::-1], [], [])
    plt.show()
Example #23
normalized_pao1_core_numeric_df = pd.DataFrame(
    normalized_pao1_core_numeric,
    columns=pao1_core_numeric.columns,
    index=pao1_core_numeric.index,
)

normalized_pa14_core_numeric = scaler.fit_transform(pa14_core_numeric)
normalized_pa14_core_numeric_df = pd.DataFrame(
    normalized_pa14_core_numeric,
    columns=pa14_core_numeric.columns,
    index=pa14_core_numeric.index,
)

# +
# model_pao1 = pca.fit(normalized_pao1_expression_numeric_df)
model_pao1 = umap.UMAP(random_state=123).fit(normalized_pao1_core_numeric_df)

normalized_pao1_core_encoded = model_pao1.transform(
    normalized_pao1_core_numeric_df)

normalized_pao1_core_encoded_df = pd.DataFrame(
    data=normalized_pao1_core_encoded,
    index=normalized_pao1_core_numeric_df.index,
    columns=["1", "2"],
)

# Add back label
normalized_pao1_core_encoded_df[["our label", "sra label"]] = pao1_core_label[[
    "our label", "sra label"
]]
Example #24
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Dimension reduction and clustering libraries
import umap
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

sns.set(style='white', rc={'figure.figsize': (10, 8)})
mnist = fetch_openml('mnist_784', version=1)  # fetch_mldata was removed from scikit-learn

standard_embedding = umap.UMAP(random_state=42).fit_transform(mnist.data)
plt.scatter(standard_embedding[:, 0],
            standard_embedding[:, 1],
            c=mnist.target.astype(int),
            s=0.1,
            cmap='Spectral')

kmeans_labels = cluster.KMeans(n_clusters=10).fit_predict(mnist.data)

plt.scatter(standard_embedding[:, 0],
            standard_embedding[:, 1],
            c=kmeans_labels,
            s=0.1,
            cmap='Spectral')

(adjusted_rand_score(mnist.target, kmeans_labels),
 adjusted_mutual_info_score(mnist.target, kmeans_labels))
Example #25
        ax.imshow(images[i_img], cmap='gray', interpolation='none')
        if annosize is not None:  # if set: annotate the image index and its ground-truth label
            ax.annotate("%d" % i_img,
                xy=(0, 0.98), xycoords='axes fraction', ha='left', va='top', color='y', fontsize=annosize)
            ax.annotate("L:%d" % labels[i_img],
                xy=(1, 0.98), xycoords='axes fraction', ha='right', va='top', color='c', fontsize=annosize)
        ax.axis('off')

    plt.show()

draw_digits(list(range(24)))

import umap
from scipy.sparse.csgraph import connected_components

res_umap = umap.UMAP().fit_transform(digits.data)
print(res_umap.shape)

import matplotlib.cm as cm
plt.figure(figsize=(6, 6))
plt.scatter(res_umap[:,0], res_umap[:,1], s=3, c=digits.target, cmap=cm.tab10)
plt.colorbar()
plt.show()

plt.scatter(res_umap[:, 0], res_umap[:, 1], s=10, c=digits.target, cmap=cm.tab10)
plt.axis([-3, 1, 3, 5]); plt.grid(); plt.show()

i_list = np.where((-1.5 < res_umap[:, 0]) & (res_umap[:, 0] < -1) & (3.3 < res_umap[:, 1]) & (res_umap[:, 1] < 4))[0]
i_list

draw_digits(i_list)
Example #26
    correlated_genes = list(set(correlated_genes))
    full_list += correlated_genes

import collections

full_freq = collections.Counter(full_list)
full_list = []
for k, v in full_freq.items():
    if v >= 3:
        full_list.append(k)
full_list.sort()

##-----------------------------------------------------------------------------
##for clustering
embedding = umap.UMAP(n_neighbors=5,
                      min_dist=0.0,
                      n_components=2,
                      metric='cosine').fit_transform(new_y_pred)
kmeans = KMeans(n_clusters=6, random_state=1).fit(embedding)
y_label = kmeans.labels_.copy()

##for visualization
embedding = umap.UMAP(n_neighbors=5,
                      min_dist=0.0,
                      n_components=2,
                      metric='cosine').fit_transform(new_y_pred)

embedding = pd.DataFrame(embedding)
embedding.columns = ['UMAP1', 'UMAP2']
embedding["Proton"] = y_label
f = sns.lmplot(x='UMAP1',
               y='UMAP2',
Example #27
index.verbose = True
faiss_index_file = 'faiss.index'
if os.path.exists(faiss_index_file):
    print('load existing index from %s' % faiss_index_file)
    index = faiss.read_index(faiss_index_file, faiss.IO_FLAG_MMAP)
    index.hnsw.efSearch = 256
else:
    # build lossy faiss index
    print('build new index and save to %s' % faiss_index_file)
    index.hnsw.efConstruction = 40
    data = np.ascontiguousarray(mnist.data, dtype=np.float32)
    # we no longer need mnist data in its original form
    print('train index...')
    index.train(data)
    print('add vectors to index...')
    index.add(data)
    print('save...')
    faiss.write_index(index, faiss_index_file)

reducer = umap.UMAP(random_state=42, init="random", verbose=True, n_epochs=200)
embedding = reducer.fit_faiss_transform(index)
#embedding = reducer.fit_transform(mnist.data)

fig, ax = plt.subplots(figsize=(12, 10))
color = mnist.target.astype(int)
plt.scatter(embedding[:, 0], embedding[:, 1], c=color, cmap="Spectral", s=0.1)
plt.setp(ax, xticks=[], yticks=[])
plt.title("MNIST data embedded into two dimensions by UMAP", fontsize=18)

plt.show()
Example #28
def evaluation(y_pred,
               cluster_method="Kmeans",
               num_cluster=25,
               n_neighbors=20,
               min_dist=0.0):
    '''
    It supports three clustering methods: K-means, spectral clustering (SC), and GMM.
    '''
    if cluster_method == "Kmeans":
        embedding = umap.UMAP(n_neighbors=n_neighbors,
                              min_dist=min_dist,
                              n_components=num_cluster,
                              metric="euclidean").fit_transform(y_pred)

        kmeans = KMeans(n_clusters=num_cluster, random_state=1).fit(embedding)
        centroid = kmeans.cluster_centers_.copy()
        y_label = kmeans.labels_.copy()
        y_pseudo = np.zeros((y_pred.shape[0], num_cluster))
    elif cluster_method == "SC":
        embedding = umap.UMAP(n_neighbors=n_neighbors,
                              min_dist=min_dist,
                              n_components=num_cluster,
                              metric="euclidean").fit_transform(y_pred)
        clustering = SpectralClustering(n_clusters=num_cluster,
                                        assign_labels="discretize",
                                        random_state=0).fit(embedding)
        y_label = clustering.labels_.copy()
        centroid = pd.DataFrame(embedding.copy())
        centroid['label'] = y_label
        centroid = centroid.groupby('label').mean().values
        y_pseudo = np.zeros((y_pred.shape[0], num_cluster))

    else:
        embedding = umap.UMAP(n_neighbors=n_neighbors,
                              min_dist=min_dist,
                              n_components=num_cluster,
                              metric="euclidean").fit_transform(y_pred)
        gmm = GaussianMixture(n_components=num_cluster).fit(embedding)
        y_label = gmm.predict(embedding)
        centroid = pd.DataFrame(embedding.copy())
        centroid['label'] = y_label
        centroid = centroid.groupby('label').mean().values

        y_pseudo = np.zeros((y_pred.shape[0], num_cluster))

    ##alternative approach: soft assignment through a Student's t-distribution kernel
    ##t-distribution kernel soft-assignment, alpha=1
    #for j in range(centroid.shape[0]):
    #    y_pseudo[:,j]=(np.linalg.norm(embedding-centroid[j,:],axis=1)+1)**(-1)
    ##cosine distance
    #y_pseudo[:,j]=((1-cosine_similarity(embedding,centroid[j,:].reshape(1,embedding.shape[1]))+1)**(-1))[:,0]
    #y_pseudo = pd.DataFrame(y_pseudo)
    #y_pseudo2=np.zeros((y_pred.shape[0],centroid.shape[0]))
    #for j in range(centroid.shape[0]):
    #    y_pseudo2[:,j]=y_pseudo.iloc[:,j].values/np.sum(
    #        y_pseudo[y_pseudo.columns.difference([j])].values,axis=1)
    #y_pseudo = y_pseudo2

    ##soft-assignment used in this study
    ##distance based soft-assignment
    for j in range(centroid.shape[0]):
        ##euclidean distance
        y_pseudo[:, j] = 1 / np.linalg.norm(embedding - centroid[j, :], axis=1)
        ##cosine similarity
        #y_pseudo[:,j]=1/(1-cosine_similarity(embedding,centroid[j,:].reshape(1,embedding.shape[1])))[:,0]
    y_pseudo = softmax(y_pseudo, axis=1)

    ##auxiliary target distribution
    f = np.sum(np.square(y_pseudo) / np.sum(y_pseudo, axis=0), axis=1)
    y2 = np.square(y_pseudo) / np.sum(y_pseudo, axis=0)
    au_tar = (y2.T / f).T

    return au_tar, y_label, embedding
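# Hedged usage sketch for evaluation() above: cluster a toy latent matrix with
# K-means (assumes the function's own imports - umap, KMeans, softmax - are in scope).
import numpy as np

au_tar_demo, y_label_demo, emb_eval_demo = evaluation(np.random.rand(300, 32),
                                                      cluster_method="Kmeans",
                                                      num_cluster=5,
                                                      n_neighbors=15)
print(au_tar_demo.shape, emb_eval_demo.shape, np.bincount(y_label_demo))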
Example #29
import pandas as pd
import umap
import matplotlib.pyplot as plt

# load the data
datafile = u'data.csv'
data = pd.read_csv(datafile)
data_fea = data.iloc[:, 1:]  # take the feature/indicator columns
data_fea = data_fea.fillna(0)  # fill missing values with 0

# standardize (z-score)
data_mean = data_fea.mean()
data_std = data_fea.std()
data_fea = (data_fea - data_mean)/data_std

# dimensionality reduction with UMAP
umap_data = umap.UMAP(n_neighbors=5, min_dist=0.3, n_components=3).fit_transform(data_fea.values)

# min-max normalization
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
umap_data = min_max_scaler.fit_transform(umap_data)

# plot
plt.figure(figsize=(12,5))
plt.scatter(umap_data[:,0], umap_data[:,1])
plt.scatter(umap_data[:,1], umap_data[:,2])
plt.scatter(umap_data[:,2], umap_data[:,0])



def plot_embedding_of_points_2PLOTS(embedding,
                                    labels1,
                                    labels2,
                                    path_save1,
                                    path_save2,
                                    name_save1,
                                    name_save2,
                                    n_samples_plot=None,
                                    method='TSNE'):
    n_samples = embedding.shape[0]
    if n_samples_plot is not None:
        indices_to_plot = np.random.choice(range(n_samples),
                                           min(n_samples_plot, n_samples),
                                           replace=False)
    else:
        indices_to_plot = np.random.choice(range(n_samples),
                                           n_samples,
                                           replace=False)
    embedding_sampled = embedding[indices_to_plot, :]
    if embedding.shape[1] == 2:
        # already 2D: keep the sampled rows selected above
        pass
    else:
        if method == 'TSNE':
            embedding_sampled = TSNE(
                n_components=2).fit_transform(embedding_sampled)
        elif method == 'UMAP':
            embedding_sampled = umap.UMAP(
                n_neighbors=500).fit_transform(embedding_sampled)
    labels1 = np.asarray(labels1)
    labels2 = np.asarray(labels2)
    labels1 = labels1.astype(int)
    labels2 = labels2.astype(int)
    labels1_sampled = labels1[indices_to_plot]
    labels2_sampled = labels2[indices_to_plot]
    #### plot1 (for classwise):
    _, ax = plt.subplots(1, figsize=(14, 10))
    n_classes = FLAGS.num_classes
    class_names = [class_list[str(i)] for i in range(len(class_list))]
    plt.scatter(embedding_sampled[:, 0],
                embedding_sampled[:, 1],
                s=10,
                c=labels1_sampled,
                cmap='Spectral',
                alpha=1.0)
    cbar = plt.colorbar(boundaries=np.arange(FLAGS.num_classes + 1) - 0.5)
    cbar.set_ticks(np.arange(FLAGS.num_classes))
    cbar.set_ticklabels(class_names)
    if not os.path.exists(path_save1):
        os.makedirs(path_save1)
    plt.savefig(path_save1 + name_save1 + '.png')
    plt.clf()
    plt.close()
    #### plot2 (for domainwise):
    _, ax = plt.subplots(1, figsize=(14, 10))
    n_classes = len(domain_list)
    class_names = domain_list
    plt.scatter(embedding_sampled[:, 0],
                embedding_sampled[:, 1],
                s=10,
                c=labels2_sampled,
                cmap='Spectral',
                alpha=1.0)
    cbar = plt.colorbar(boundaries=np.arange(n_classes + 1) - 0.5)
    cbar.set_ticks(np.arange(n_classes))
    cbar.set_ticklabels(class_names)
    if not os.path.exists(path_save2):
        os.makedirs(path_save2)
    plt.savefig(path_save2 + name_save2 + '.png')
    plt.clf()
    plt.close()
    np.save('embedding_sampled.npy', embedding_sampled)
    np.save('labels1_sampled.npy', labels1_sampled)
    np.save('labels2_sampled.npy', labels2_sampled)
    np.save('embedding.npy', embedding)
    np.save('labels1.npy', labels1)
    np.save('labels2.npy', labels2)