Exemple #1
0
def main():
    """ Main function """

    # Initialize and configure tmap
    dims = 2048
    enc = tm.Minhash(16384, 42, dims)
    lf = tm.LSHForest(dims * 2, 128, weighted=True)

    images = []
    labels = []
    image_labels = []

    for file in os.listdir("coil_20"):
        labels.append(int(file.split("__")[0].replace("obj", "")) - 1)
        images.append(list(Image.open("coil_20/" + file).getdata()))

    for image in images:
        img = Image.fromarray(np.uint8(np.split(np.array(image), 128)))
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        img_str = base64.b64encode(buffered.getvalue())
        image_labels.append("data:image/bmp;base64," +
                            str(img_str).replace("b'", "").replace("'", ""))

    tmp = []
    for _, image in enumerate(images):
        avg = sum(image) / sum([1 if x > 0 else 0 for x in image])
        tmp.append([i / 255 for i in image])

    lf.batch_add(enc.batch_from_weight_array(tmp))
    lf.index()

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf)

    faerun = Faerun(clear_color="#111111", view="front", coords=False)
    faerun.add_scatter(
        "COIL20",
        {
            "x": x,
            "y": y,
            "c": labels,
            "labels": image_labels
        },
        colormap="tab20",
        shader="smoothCircle",
        point_scale=2.5,
        max_point_size=10,
        has_legend=True,
        categorical=True,
    )
    faerun.add_tree("COIL20_tree", {
        "from": s,
        "to": t
    },
                    point_helper="COIL20",
                    color="#666666")
    faerun.plot("coil", template="url_image")
def main():
    """ Main function """

    # Use 128 permutations to create the MinHash
    enc = tm.Minhash(128)
    lf = tm.LSHForest(128)

    d = 10000
    n = 1000

    data = []

    # Generating some random data
    start = timer()
    for i in range(n):
        data.append(tm.VectorUchar(np.random.randint(0, high=2, size=d)))
    print(f"Generating the data took {(timer() - start) * 1000}ms.")

    # Use batch_add to parallelize the insertion of the arrays
    start = timer()
    lf.batch_add(enc.batch_from_binary_array(data))
    print(f"Adding the data took {(timer() - start) * 1000}ms.")

    # Index the added data
    start = timer()
    lf.index()
    print(f"Indexing took {(timer() - start) * 1000}ms.")

    # The configuration for the MST plot
    # Distribute the tree more evenly
    cfg = tm.LayoutConfiguration()
    cfg.sl_scaling_min = 1
    cfg.sl_scaling_max = 1
    cfg.node_size = 1 / 50

    # Construct the k-nearest neighbour graph
    start = timer()
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)
    print(f"layout_from_lsh_forest took {(timer() - start) * 1000}ms.")

    # Plot spanning tree layout
    start = timer()
    for i in range(len(s)):
        plt.plot(
            [x[s[i]], x[t[i]]],
            [y[s[i]], y[t[i]]],
            "r-",
            linewidth=1.0,
            alpha=0.5,
            zorder=1,
        )

    plt.scatter(x, y, s=0.1, zorder=2)
    plt.tight_layout()
    plt.savefig("lsh_forest_knng_mpl.png")
    print(f"Plotting using matplotlib took {(timer() - start) * 1000}ms.")
Exemple #3
0
def main():
    """ Main function """

    # Initialize and configure tmap
    dims = 1024
    enc = tm.Minhash(dims)
    lf = tm.LSHForest(dims, 128)

    print("Converting images ...")
    for image in tqdm(IMAGES):
        img = Image.fromarray(np.uint8(np.split(np.array(image), 28)))
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        img_str = base64.b64encode(buffered.getvalue())
        IMAGE_LABELS.append("data:image/bmp;base64," +
                            str(img_str).replace("b'", "").replace("'", ""))
    tmp = []
    for _, image in enumerate(IMAGES):
        avg = sum(image) / sum([1 if x > 0 else 0 for x in image])
        tmp.append(tm.VectorUchar([1 if x >= avg else 0 for x in image]))
        # tmp.append(tm.VectorUint(image))

    print("Running tmap ...")
    lf.batch_add(enc.batch_from_binary_array(tmp))
    # LF.batch_add(ENC.batch_from_int_weight_array(tmp))
    lf.index()

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG)

    faerun = Faerun(clear_color="#111111", view="front", coords=False)
    faerun.add_scatter(
        "MNIST",
        {
            "x": x,
            "y": y,
            "c": LABELS,
            "labels": IMAGE_LABELS
        },
        colormap="tab10",
        shader="smoothCircle",
        point_scale=2.5,
        max_point_size=10,
        has_legend=True,
        categorical=True,
    )
    faerun.add_tree("MNIST_tree", {
        "from": s,
        "to": t
    },
                    point_helper="MNIST",
                    color="#666666")
    faerun.plot("i3d-tmap-mnist", path="outputs", template="url_image")
Exemple #4
0
def tree_coords(lf, node_size=1 / 20, k=20, mmm_rps=2):
    print('Converting to tmap coordinates')
    # Create a LayoutConfiguration instance
    cfg = tm.LayoutConfiguration()
    cfg.node_size = node_size
    cfg.mmm_repeats = mmm_rps
    cfg.sl_extra_scaling_steps = 5
    cfg.k = k
    cfg.sl_scaling_type = tm.RelativeToAvgLength

    #Create minimum spanning tree from the LSHForest and LayoutConfiguration instance
    #The x and y coordinates of the vertices, the ids of the vertices spanning the edges
    #information on the graph is ignored

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)
    return list(x), list(y), list(s), list(t)
Exemple #5
0
def main():
    """ Main function """

    # Initialize and configure tmap
    dims = 256
    enc = tm.Minhash(len(DATA.columns), 42, dims)
    lf = tm.LSHForest(dims * 2, 32, weighted=True)

    fps = []
    for _, row in DATA.iterrows():
        fps.append(tm.VectorFloat(list(row)))

    lf.batch_add(enc.batch_from_weight_array(fps))
    lf.index()

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG_TMAP)
    lf.clear()

    legend_labels = {(1, "PRAD"), (2, "LUAD"), (3, "BRCA"), (4, "KIRC"),
                     (5, "COAD")}

    # Create the plot
    faerun = Faerun(view="front", coords=False, legend_title="")
    faerun.add_scatter(
        "RNASEQ",
        {
            "x": x,
            "y": y,
            "c": LABELS,
            "labels": LABELS
        },
        colormap="tab10",
        point_scale=5.0,
        max_point_size=10,
        shader="smoothCircle",
        has_legend=True,
        categorical=True,
        legend_labels=legend_labels,
        legend_title="Tumor Types",
    )
    faerun.add_tree("RNASEQ_tree", {
        "from": s,
        "to": t
    },
                    point_helper="RNASEQ",
                    color="#666666")
    faerun.plot("rnaseq")
Exemple #6
0
    def test_lf_layout(self):
        random.seed(42)
        data = []
        for _ in range(100):
            row = []
            for _ in range(10):
                row.append(random.randint(0, 20))
            data.append(tm.VectorUint(row))

        mh = tm.Minhash()
        lf = tm.LSHForest()

        lf.batch_add(mh.batch_from_sparse_binary_array(data))
        lf.index()

        x, y, s, t, gp = tm.layout_from_lsh_forest(lf)
        assert len(x) == 100
        assert len(s) == 99
Exemple #7
0
def main():
    """ The main function """
    df = pd.read_csv("papers.tar.xz")
    df.drop(df.tail(1).index, inplace=True)
    df["title"] = df["title"].apply(lambda t: t.replace("'", '"'))
    enc = tm.Minhash()
    lf = tm.LSHForest()

    ctr = Counter()
    texts = []
    for _, row in df.iterrows():
        text = re.sub(r"[^a-zA-Z-]+", " ", row["paper_text"])
        text = [t.lower() for t in text.split(" ") if len(t) > 2]
        ctr.update(text)
        texts.append(text)

    # Remove the top n words
    n = 6000
    ctr = ctr.most_common()[: -(len(ctr) - n) - 1 : -1]

    # Make it fast using a lookup map
    all_words = {}
    for i, (key, _) in enumerate(ctr):
        all_words[key] = i

    # Create the fingerprints and also check whether the word
    # "deep" is found in the document
    fingerprints = []
    has_word = []
    for text in texts:
        if "deep" in text:
            has_word.append(1)
        else:
            has_word.append(0)

        fingerprint = []
        for t in text:
            if t in all_words:
                fingerprint.append(all_words[t])
        fingerprints.append(tm.VectorUint(fingerprint))

    # Index the article fingerprints
    lf.batch_add(enc.batch_from_sparse_binary_array(fingerprints))
    lf.index()

    # Create the tmap
    config = tm.LayoutConfiguration()
    config.k = 100
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=config)

    faerun = Faerun(
        view="front", coords=False, legend_title="", legend_number_format="{:.0f}"
    )

    # Add a scatter that is bigger than the one above, to add colored
    # circles.
    faerun.add_scatter(
        "NIPS_word",
        {"x": x, "y": y, "c": has_word, "labels": df["title"]},
        colormap="Set1",
        point_scale=7.5,
        max_point_size=25,
        shader="smoothCircle",
        has_legend=True,
        categorical=True,
        legend_title="Contains word<br/>'deep'",
        legend_labels=[(0, "No"), (1, "Yes")],
        interactive=False,
    )

    # Add a scatter that is colored by year on top
    faerun.add_scatter(
        "NIPS",
        {"x": x, "y": y, "c": df["year"], "labels": df["title"]},
        colormap="gray",
        point_scale=5.0,
        max_point_size=20,
        shader="smoothCircle",
        has_legend=True,
        legend_title="Year of<br/>Publication",
    )

    faerun.add_tree(
        "NIPS_tree", {"from": s, "to": t}, point_helper="NIPS", color="#666666"
    )

    faerun.plot("nips_papers")
Exemple #8
0
def main():
    """ Main function """

    dims = 512
    lf = tm.LSHForest(dims, 128, store=True)

    # Due to the large data size (> 1GB) the following files are not provided directly
    smiles, target_class, activity, chembl_id = pickle.load(
        open("chembl.pickle", "rb"))

    labels = []
    for i, s in enumerate(smiles):
        labels.append(
            s + "__" + chembl_id[i] + "__" +
            f'<a target="_blank" href="https://www.ebi.ac.uk/chembl/compound_report_card/{chembl_id[i]}">{chembl_id[i]}</a>'
        )

    lf.restore("chembl.dat")

    target_class_map = dict([(y, x + 1)
                             for x, y in enumerate(sorted(set(target_class)))])

    classes = [
        "enzyme",
        "kinase",
        "protease",
        "cytochrome p450",
        "ion channel",
        "transporter",
        "transcription factor",
        "membrane receptor",
        "epigenetic regulator",
    ]

    i = 0
    for key, value in target_class_map.items():
        if key not in classes:
            target_class_map[key] = 7
        else:
            target_class_map[key] = i
            i += 1
            if i == 7:
                i = 8

    cfg = tm.LayoutConfiguration()
    cfg.node_size = 1 / 70
    cfg.mmm_repeats = 2
    cfg.sl_repeats = 2

    start = timer()
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)
    end = timer()
    print(end - start)

    activity = np.array(activity)
    activity = np.maximum(0.0, activity)
    activity = np.minimum(100.0, activity)
    activity = 10.0 - activity

    legend_labels = [
        (0, "Cytochrome p450"),
        (1, "Other Enzyme"),
        (2, "Epigenetic Regulator"),
        (3, "Ion Channel"),
        (4, "Kinase"),
        (5, "Membrane Receptor"),
        (6, "Protease"),
        (8, "Transcription Factor"),
        (9, "Transporter"),
        (7, "Other"),
    ]

    vals = [int(target_class_map[x]) for x in target_class]

    faerun = Faerun(view="front", coords=False)
    faerun.add_scatter(
        "chembl",
        {
            "x": x,
            "y": y,
            "c": vals,
            "labels": labels
        },
        colormap="tab10",
        point_scale=1.0,
        max_point_size=10,
        has_legend=True,
        categorical=True,
        shader="smoothCircle",
        legend_labels=legend_labels,
        title_index=1,
    )
    faerun.add_tree("chembl_tree", {
        "from": s,
        "to": t
    },
                    point_helper="chembl",
                    color="#222222")

    faerun.plot("chembl", template="smiles")
Exemple #9
0
def main():
    """ The main function """
    df = pd.read_csv("drugbank.csv").dropna(subset=["SMILES"]).reset_index(
        drop=True)
    enc = MHFPEncoder()
    lf = tm.LSHForest(2048, 128)

    fps = []
    labels = []
    groups = []
    tpsa = []
    logp = []
    mw = []
    h_acceptors = []
    h_donors = []
    ring_count = []
    is_lipinski = []
    has_coc = []
    has_sa = []
    has_tz = []

    substruct_coc = AllChem.MolFromSmiles("COC")
    substruct_sa = AllChem.MolFromSmiles("NS(=O)=O")
    substruct_tz = AllChem.MolFromSmiles("N1N=NN=C1")

    total = len(df)
    for i, row in df.iterrows():
        if i % 1000 == 0 and i > 0:
            print(f"{round(100 * (i / total))}% done ...")

        smiles = row[6]
        mol = AllChem.MolFromSmiles(smiles)

        if mol and mol.GetNumAtoms() > 5 and smiles.count(".") < 2:
            fps.append(tm.VectorUint(enc.encode_mol(mol, min_radius=0)))
            labels.append(
                f'{smiles}__<a href="https://www.drugbank.ca/drugs/{row[0]}" target="_blank">{row[0]}</a>__{row[1]}'
                .replace("'", ""))
            groups.append(row[3].split(";")[0])
            tpsa.append(Descriptors.TPSA(mol))
            logp.append(Descriptors.MolLogP(mol))
            mw.append(Descriptors.MolWt(mol))
            h_acceptors.append(Descriptors.NumHAcceptors(mol))
            h_donors.append(Descriptors.NumHDonors(mol))
            ring_count.append(Descriptors.RingCount(mol))
            is_lipinski.append(lipinski_pass(mol))
            has_coc.append(mol.HasSubstructMatch(substruct_coc))
            has_sa.append(mol.HasSubstructMatch(substruct_sa))
            has_tz.append(mol.HasSubstructMatch(substruct_tz))

    # Create the labels and the integer encoded array for the groups,
    # as they're categorical
    labels_groups, groups = Faerun.create_categories(groups)
    tpsa_ranked = ss.rankdata(np.array(tpsa) / max(tpsa)) / len(tpsa)
    logp_ranked = ss.rankdata(np.array(logp) / max(logp)) / len(logp)
    mw_ranked = ss.rankdata(np.array(mw) / max(mw)) / len(mw)
    h_acceptors_ranked = ss.rankdata(
        np.array(h_acceptors) / max(h_acceptors)) / len(h_acceptors)
    h_donors_ranked = ss.rankdata(
        np.array(h_donors) / max(h_donors)) / len(h_donors)
    ring_count_ranked = ss.rankdata(
        np.array(ring_count) / max(ring_count)) / len(ring_count)

    lf.batch_add(fps)
    lf.index()
    cfg = tm.LayoutConfiguration()
    cfg.k = 100
    # cfg.sl_extra_scaling_steps = 1
    cfg.sl_repeats = 2
    cfg.mmm_repeats = 2
    cfg.node_size = 2
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)

    # Define a colormap highlighting approved vs non-approved
    custom_cmap = ListedColormap(
        [
            "#2ecc71", "#9b59b6", "#ecf0f1", "#e74c3c", "#e67e22", "#f1c40f",
            "#95a5a6"
        ],
        name="custom",
    )

    bin_cmap = ListedColormap(["#e74c3c", "#2ecc71"], name="bin_cmap")

    f = Faerun(
        clear_color="#222222",
        coords=False,
        view="front",
        impress=
        'made with <a href="http://tmap.gdb.tools" target="_blank">tmap</a><br />and <a href="https://github.com/reymond-group/faerun-python" target="_blank">faerun</a><br /><a href="https://gist.github.com/daenuprobst/5cddd0159c0cf4758fb16b4b4acbef89">source</a>',
    )

    f.add_scatter(
        "Drugbank",
        {
            "x":
            x,
            "y":
            y,
            "c": [
                groups,
                is_lipinski,
                has_coc,
                has_sa,
                has_tz,
                tpsa_ranked,
                logp_ranked,
                mw_ranked,
                h_acceptors_ranked,
                h_donors_ranked,
                ring_count_ranked,
            ],
            "labels":
            labels,
        },
        shader="smoothCircle",
        colormap=[
            custom_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
        ],
        point_scale=2.5,
        categorical=[
            True, True, True, True, True, False, False, False, False, False
        ],
        has_legend=True,
        legend_labels=[
            labels_groups,
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
        ],
        selected_labels=["SMILES", "Drugbank ID", "Name"],
        series_title=[
            "Group",
            "Lipinski",
            "Ethers",
            "Sulfonamides",
            "Tetrazoles",
            "TPSA",
            "logP",
            "Mol Weight",
            "H Acceptors",
            "H Donors",
            "Ring Count",
        ],
        max_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(max(tpsa))),
            str(round(max(logp))),
            str(round(max(mw))),
            str(round(max(h_acceptors))),
            str(round(max(h_donors))),
            str(round(max(ring_count))),
        ],
        min_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(min(tpsa))),
            str(round(min(logp))),
            str(round(min(mw))),
            str(round(min(h_acceptors))),
            str(round(min(h_donors))),
            str(round(min(ring_count))),
        ],
        title_index=2,
        legend_title="",
    )

    f.add_tree("drugbanktree", {"from": s, "to": t}, point_helper="Drugbank")

    f.plot("drugbank", template="smiles")
Exemple #10
0
def main():
    """ Main function """

    # Initialize and configure tmap
    dims = 2048
    enc = tm.Minhash(dims)
    lf = tm.LSHForest(dims, 128, store=True)

    fps = []
    # fps_umap = []
    for row in DATA:
        fps.append(tm.VectorUint(list(row)))

    lf.batch_add(enc.batch_from_sparse_binary_array(fps))
    lf.index()

    x_tmap, y_tmap, s, t, _ = tm.layout_from_lsh_forest(lf, CFG_TMAP)
    lf.clear()

    # Prepare custom color map
    tab10 = plt.get_cmap("tab10").colors
    colors_gray = [(0.2, 0.2, 0.2), tab10[0], tab10[1], tab10[2], tab10[3],
                   tab10[4]]
    custom_cm_gray = LinearSegmentedColormap.from_list("custom_cm_gray",
                                                       colors_gray,
                                                       N=len(colors_gray))

    legend_labels = [
        (1, "Rudyard Kipling"),
        (2, "Herbert George Wells"),
        (3, "Charles Darwin"),
        (4, "George Bernard Shaw"),
        (5, "William Wymark Jacobs"),
        (0, "Other"),
    ]

    faerun = Faerun(
        clear_color="#111111",
        view="front",
        coords=False,
        alpha_blending=True,
        legend_title="",
    )
    faerun.add_scatter(
        "gutenberg",
        {
            "x": x_tmap,
            "y": y_tmap,
            "c": LABELS,
            "labels": FAERUN_LABELS
        },
        colormap=custom_cm_gray,
        point_scale=4.2,
        max_point_size=10,
        has_legend=True,
        categorical=True,
        legend_title="Authors",
        legend_labels=legend_labels,
        shader="smoothCircle",
        selected_labels=["Author", "Title"],
    )
    faerun.add_tree(
        "gutenberg_tree",
        {
            "from": s,
            "to": t
        },
        point_helper="gutenberg",
        color="#222222",
    )
    faerun.plot("gutenberg", template="default")
Exemple #11
0
def main():
    """ Main function """

    # Initialize and configure tmap
    dims = 1024
    enc = tm.Minhash(28 * 28, 42, dims)
    lf = tm.LSHForest(dims * 2, 128)

    print("Converting images ...")
    for image in IMAGES:
        img = Image.fromarray(np.uint8(np.split(np.array(image), 28)))
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        img_str = base64.b64encode(buffered.getvalue())
        IMAGE_LABELS.append("data:image/bmp;base64," +
                            str(img_str).replace("b'", "").replace("'", ""))
    tmp = []
    for _, image in enumerate(IMAGES):
        tmp.append(tm.VectorFloat(image / 255))

    print("Running tmap ...")
    start = timer()
    lf.batch_add(enc.batch_from_weight_array(tmp))
    lf.index()
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG)
    print("tmap: " + str(timer() - start))

    legend_labels = [
        (0, "T-shirt/top"),
        (1, "Trouser"),
        (2, "Pullover"),
        (3, "Dress"),
        (4, "Coat"),
        (5, "Sandal"),
        (6, "Shirt"),
        (7, "Sneaker"),
        (8, "Bag"),
        (9, "Ankle boot"),
    ]

    faerun = Faerun(clear_color="#111111", view="front", coords=False)
    faerun.add_scatter(
        "FMNIST",
        {
            "x": x,
            "y": y,
            "c": LABELS,
            "labels": IMAGE_LABELS
        },
        colormap="tab10",
        shader="smoothCircle",
        point_scale=2.5,
        max_point_size=10,
        has_legend=True,
        categorical=True,
        legend_labels=legend_labels,
    )
    faerun.add_tree("FMNIST_tree", {
        "from": s,
        "to": t
    },
                    point_helper="FMNIST",
                    color="#666666")
    faerun.plot("fmnist", template="url_image")