Python Minhash Examples, tmap.Minhash Python Examples

Example #1

0

Show file

def main():
    """Time insertion, indexing and kNN-graph construction on an LSH forest.

    Random 0/1 arrays are MinHash-encoded one by one and added to a
    file-backed ``tm.LSHForest``; every stage prints its elapsed time in
    milliseconds.
    """
    # NOTE(review): the encoder is created with 1024 while the forest is
    # created with 128 -- confirm that this mismatch is intentional.
    encoder = tm.Minhash(1024)
    forest = tm.LSHForest(128, file_backed=True)

    dim = 10000
    count = 1000

    # Encode and insert each random array right away instead of collecting
    # all of them in memory first.
    # NOTE(review): the arrays only hold 0s and 1s yet are fed to
    # from_sparse_binary_array -- presumably deliberate; verify.
    tic = timer()
    for _ in range(count):
        bits = np.random.randint(0, high=2, size=dim)
        forest.add(encoder.from_sparse_binary_array(tm.VectorUint(bits)))
    print(f"Generating the data took {(timer() - tic) * 1000}ms.")

    # Index the added data
    tic = timer()
    forest.index()
    print(f"Indexing took {(timer() - tic) * 1000}ms.")

    # Build the kNN graph (k = 10); the result is written into the three
    # output vectors passed to get_knn_graph.
    tic = timer()
    edges_from = tm.VectorUint()
    edges_to = tm.VectorUint()
    edge_weights = tm.VectorFloat()
    forest.get_knn_graph(edges_from, edges_to, edge_weights, 10)
    print(f"The kNN search took {(timer() - tic) * 1000}ms.")

Example #2

0

Show file

File: test_lsh_forest.py Project: zhangdachuanfoodies/tmap

    def test_knn_graph(self):
        """Check size and leading targets of the 10-NN graph for seeded data."""
        random.seed(42)
        # 100 rows of 10 random integers in [0, 20], drawn row by row so the
        # seeded sequence is consumed in the same order as a nested loop.
        rows = [
            tm.VectorUint([random.randint(0, 20) for _ in range(10)])
            for _ in range(100)
        ]

        encoder = tm.Minhash()
        forest = tm.LSHForest()
        forest.batch_add(encoder.batch_from_sparse_binary_array(rows))
        forest.index()

        sources = tm.VectorUint()
        targets = tm.VectorUint()
        weights = tm.VectorFloat()
        forest.get_knn_graph(sources, targets, weights, 10)

        assert len(sources) == 1000
        for position, expected in enumerate((0, 26, 36, 67, 33, 83)):
            assert targets[position] == expected

Example #3

0

Show file

File: test_minhash.py Project: zhangdachuanfoodies/tmap

 def test_from_weight_array(self):
     """Check hash length and weighted distance for two weight vectors."""
     encoder = tm.Minhash(8, 42, 64)
     weights_a = tm.VectorFloat([0.2, 0.6, 0.22, 0.26, 0.62, 0.66])
     weights_b = tm.VectorFloat([0.26, 0.6, 0.22, 0.26, 0.62, 1.0])

     hash_a = encoder.from_weight_array(weights_a)
     hash_b = encoder.from_weight_array(weights_b)

     assert len(hash_a) == 128
     distance = encoder.get_weighted_distance(hash_a, hash_b)
     assert round(distance, 3) == 0.094

Example #4

0

Show file

File: test_minhash.py Project: zhangdachuanfoodies/tmap

 def test_from_sparse_binary_array(self):
     """Check hash length and distance for two sparse index vectors."""
     encoder = tm.Minhash(8)
     indices_a = tm.VectorUint([6, 22, 26, 62, 626, 226622])
     indices_b = tm.VectorUint([6, 22, 26, 62, 262, 226622])

     hash_a = encoder.from_sparse_binary_array(indices_a)
     hash_b = encoder.from_sparse_binary_array(indices_b)

     assert len(hash_a) == 8
     distance = encoder.get_distance(hash_a, hash_b)
     assert round(distance, 2) == 0.25

Example #5

0

Show file

def main():
    """Lay out the COIL-20 images as a tmap tree and plot it with Faerun.

    Every file in ``coil_20`` is read; its label is parsed from the
    ``obj<N>__`` file-name prefix. Pixel values divided by 255 are encoded
    with a weighted MinHash, indexed in an LSH forest, laid out and written
    to the "coil" Faerun plot with an embedded JPEG thumbnail per point.
    """
    # Initialize and configure tmap
    dims = 2048
    enc = tm.Minhash(16384, 42, dims)
    lf = tm.LSHForest(dims * 2, 128, weighted=True)

    images = []
    labels = []
    for file_name in os.listdir("coil_20"):
        labels.append(int(file_name.split("__")[0].replace("obj", "")) - 1)
        # Close the file handle as soon as the pixel data has been copied.
        with Image.open(os.path.join("coil_20", file_name)) as handle:
            images.append(list(handle.getdata()))

    image_labels = []
    for image in images:
        img = Image.fromarray(np.uint8(np.split(np.array(image), 128)))
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        # The buffer holds JPEG data, so the data URI declares image/jpeg.
        encoded = base64.b64encode(buffered.getvalue()).decode("ascii")
        image_labels.append("data:image/jpeg;base64," + encoded)

    # Scale the pixel values by 1/255 to obtain the weight vectors.
    weights = [[pixel / 255 for pixel in image] for image in images]

    lf.batch_add(enc.batch_from_weight_array(weights))
    lf.index()

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf)

    faerun = Faerun(clear_color="#111111", view="front", coords=False)
    faerun.add_scatter(
        "COIL20",
        {
            "x": x,
            "y": y,
            "c": labels,
            "labels": image_labels
        },
        colormap="tab20",
        shader="smoothCircle",
        point_scale=2.5,
        max_point_size=10,
        has_legend=True,
        categorical=True,
    )
    faerun.add_tree("COIL20_tree", {
        "from": s,
        "to": t
    },
                    point_helper="COIL20",
                    color="#666666")
    faerun.plot("coil", template="url_image")

Example #6

0

Show file

File: lsh_forest_knng_mpl.py Project: zhangdachuanfoodies/tmap

def main():
    """Time MinHash/LSH-forest layout of random data and plot it.

    Generates random binary arrays, indexes them, computes a layout with
    ``tm.layout_from_lsh_forest`` and saves a matplotlib rendering of the
    tree to ``lsh_forest_knng_mpl.png``. Each stage prints its duration in
    milliseconds.
    """
    encoder = tm.Minhash(128)
    forest = tm.LSHForest(128)

    dim = 10000
    count = 1000

    # Generating some random data
    tic = timer()
    samples = [
        tm.VectorUchar(np.random.randint(0, high=2, size=dim))
        for _ in range(count)
    ]
    print(f"Generating the data took {(timer() - tic) * 1000}ms.")

    # Use batch_add to parallelize the insertion of the arrays
    tic = timer()
    forest.batch_add(encoder.batch_from_binary_array(samples))
    print(f"Adding the data took {(timer() - tic) * 1000}ms.")

    # Index the added data
    tic = timer()
    forest.index()
    print(f"Indexing took {(timer() - tic) * 1000}ms.")

    # Layout configuration used for the tree plot
    layout_cfg = tm.LayoutConfiguration()
    layout_cfg.sl_scaling_min = 1
    layout_cfg.sl_scaling_max = 1
    layout_cfg.node_size = 1 / 50

    # Compute coordinates and tree edges from the indexed forest
    tic = timer()
    x, y, s, t, _ = tm.layout_from_lsh_forest(forest, config=layout_cfg)
    print(f"layout_from_lsh_forest took {(timer() - tic) * 1000}ms.")

    # Draw one line segment per tree edge, then the points on top
    tic = timer()
    for src, dst in zip(s, t):
        plt.plot(
            [x[src], x[dst]],
            [y[src], y[dst]],
            "r-",
            linewidth=1.0,
            alpha=0.5,
            zorder=1,
        )

    plt.scatter(x, y, s=0.1, zorder=2)
    plt.tight_layout()
    plt.savefig("lsh_forest_knng_mpl.png")
    print(f"Plotting using matplotlib took {(timer() - tic) * 1000}ms.")

Example #7

0

Show file

File: map4.py Project: iwatobipen/map4

    def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False):
        """
        Store the MAP4 settings and create the matching encoder.

        A folded calculator gets an ``MHFPEncoder``; otherwise a
        ``tm.Minhash`` is used. Either one is built with ``dimensions``.
        """
        self.radius = radius
        self.is_counted = is_counted
        self.is_folded = is_folded

        encoder_cls = MHFPEncoder if self.is_folded else tm.Minhash
        self.encoder = encoder_cls(dimensions)

Example #8

0

Show file

def main():
    """Lay out the MNIST images as a tmap tree and plot it with Faerun.

    Uses the module-level ``IMAGES``, ``IMAGE_LABELS``, ``LABELS`` and
    ``CFG`` objects. Each image is embedded as a JPEG data URI for the
    plot labels and binarised for MinHash encoding: a pixel becomes 1 when
    it is at least the mean of the image's non-zero pixels.
    """
    # Initialize and configure tmap
    dims = 1024
    enc = tm.Minhash(dims)
    lf = tm.LSHForest(dims, 128)

    print("Converting images ...")
    for image in tqdm(IMAGES):
        img = Image.fromarray(np.uint8(np.split(np.array(image), 28)))
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        # The buffer holds JPEG data, so the data URI declares image/jpeg.
        encoded = base64.b64encode(buffered.getvalue()).decode("ascii")
        IMAGE_LABELS.append("data:image/jpeg;base64," + encoded)

    tmp = []
    for image in IMAGES:
        lit = sum(1 for x in image if x > 0)
        if lit:
            avg = sum(image) / lit
            bits = [1 if x >= avg else 0 for x in image]
        else:
            # No pixel is above zero: avoid dividing by zero and encode the
            # image as an all-zero vector.
            bits = [0] * len(image)
        tmp.append(tm.VectorUchar(bits))

    print("Running tmap ...")
    lf.batch_add(enc.batch_from_binary_array(tmp))
    lf.index()

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG)

    faerun = Faerun(clear_color="#111111", view="front", coords=False)
    faerun.add_scatter(
        "MNIST",
        {
            "x": x,
            "y": y,
            "c": LABELS,
            "labels": IMAGE_LABELS
        },
        colormap="tab10",
        shader="smoothCircle",
        point_scale=2.5,
        max_point_size=10,
        has_legend=True,
        categorical=True,
    )
    faerun.add_tree("MNIST_tree", {
        "from": s,
        "to": t
    },
                    point_helper="MNIST",
                    color="#666666")
    faerun.plot("i3d-tmap-mnist", path="outputs", template="url_image")

Example #9

0

Show file

File: transformer_fingerprints.py Project: yingli2009/rxnfp

    def __init__(self,
                 model: BertModel,
                 tokenizer: SmilesTokenizer,
                 permutations=256,
                 seed=42,
                 force_no_cuda=False):
        """Set up the MinHash encoder, the wrapped generator and the device.

        Args:
            model: BERT model; ``model.config.hidden_size`` is passed as the
                first argument of ``tm.Minhash``.
            tokenizer: tokenizer handed to ``RXNBERTFingerprintGenerator``
                together with ``model``.
            permutations: third argument of ``tm.Minhash``.
            seed: second argument of ``tm.Minhash``.
            force_no_cuda: when true, the CPU device is selected even if
                CUDA is available.
        """
        # Zero-argument super() is bound to this instance. The previous
        # one-argument form, super(RXNBERTFingerprintGenerator), created an
        # unbound super object and never ran the parent initialiser.
        super().__init__()
        import tmap as tm

        self.model = model
        self.tokenizer = tokenizer
        self.minhash = tm.Minhash(model.config.hidden_size, seed, permutations)
        self.generator = RXNBERTFingerprintGenerator(model, tokenizer)
        self.device = torch.device("cuda" if (
            torch.cuda.is_available() and not force_no_cuda) else "cpu")

Example #10

0

Show file

File: rnaseq.py Project: zhangdachuanfoodies/tmap

def main():
    """Lay out the RNA-seq samples as a tmap tree and plot it with Faerun.

    Uses the module-level ``DATA`` frame, ``LABELS`` and ``CFG_TMAP``. Each
    row of ``DATA`` is encoded with a weighted MinHash, indexed in an LSH
    forest, laid out and written to the "rnaseq" Faerun plot.
    """
    # Initialize and configure tmap
    dims = 256
    enc = tm.Minhash(len(DATA.columns), 42, dims)
    lf = tm.LSHForest(dims * 2, 32, weighted=True)

    fps = [tm.VectorFloat(list(row)) for _, row in DATA.iterrows()]

    lf.batch_add(enc.batch_from_weight_array(fps))
    lf.index()

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG_TMAP)
    lf.clear()

    # A list, not a set: the entries keep a fixed order from run to run and
    # stay indexable.
    legend_labels = [
        (1, "PRAD"),
        (2, "LUAD"),
        (3, "BRCA"),
        (4, "KIRC"),
        (5, "COAD"),
    ]

    # Create the plot
    faerun = Faerun(view="front", coords=False, legend_title="")
    faerun.add_scatter(
        "RNASEQ",
        {
            "x": x,
            "y": y,
            "c": LABELS,
            "labels": LABELS
        },
        colormap="tab10",
        point_scale=5.0,
        max_point_size=10,
        shader="smoothCircle",
        has_legend=True,
        categorical=True,
        legend_labels=legend_labels,
        legend_title="Tumor Types",
    )
    faerun.add_tree("RNASEQ_tree", {
        "from": s,
        "to": t
    },
                    point_helper="RNASEQ",
                    color="#666666")
    faerun.plot("rnaseq")

Example #11

0

Show file

File: minhash.py Project: zhangdachuanfoodies/tmap

def main():
    """Print the MinHash distances a-b and b-c of three binary arrays."""
    encoder = tm.Minhash()

    bits_a = [1, 1, 1, 1, 0, 1, 0, 1, 1, 0]
    bits_b = [1, 0, 1, 1, 0, 1, 1, 0, 1, 0]
    bits_c = [1, 0, 1, 1, 1, 1, 1, 0, 1, 0]

    # Encode the arrays in the order a, b, c.
    hash_a, hash_b, hash_c = (
        encoder.from_binary_array(tm.VectorUchar(bits))
        for bits in (bits_a, bits_b, bits_c)
    )

    distance_ab = encoder.get_distance(hash_a, hash_b)
    distance_bc = encoder.get_distance(hash_b, hash_c)

    print(distance_ab)
    print(distance_bc)

Example #12

0

Show file

    def test_lf_layout(self):
        """Check point and edge counts of the layout for 100 seeded rows."""
        random.seed(42)
        # 100 rows of 10 random integers in [0, 20], drawn row by row.
        rows = [
            tm.VectorUint([random.randint(0, 20) for _ in range(10)])
            for _ in range(100)
        ]

        encoder = tm.Minhash()
        forest = tm.LSHForest()
        forest.batch_add(encoder.batch_from_sparse_binary_array(rows))
        forest.index()

        layout = tm.layout_from_lsh_forest(forest)
        xs, _ys, sources, _targets, _properties = layout
        assert len(xs) == 100
        assert len(sources) == 99

Example #13

0

Show file

File: test_lsh_forest.py Project: zhangdachuanfoodies/tmap

    def test_query(self):
        """Check forest size and the first two linear-scan hits for entry 0."""
        random.seed(42)
        # 100 rows of 10 random integers in [0, 20], drawn row by row.
        rows = [
            tm.VectorUint([random.randint(0, 20) for _ in range(10)])
            for _ in range(100)
        ]

        encoder = tm.Minhash()
        forest = tm.LSHForest()
        forest.batch_add(encoder.batch_from_sparse_binary_array(rows))
        forest.index()

        assert forest.size() == len(rows)

        hits = forest.query_linear_scan_by_id(0, 10)
        first, second = hits[0], hits[1]
        assert first[1] == 0
        assert second[1] == 26

Example #14

0

Show file

def main():
    """Time encoding, insertion, indexing and a linear-scan query.

    Works on random binary arrays; every stage prints its elapsed time in
    milliseconds.
    """
    encoder = tm.Minhash(128)
    forest = tm.LSHForest(128)

    dim = 1000
    count = 10000

    # Generating some random data
    tic = timer()
    samples = [
        tm.VectorUchar(np.random.randint(0, high=2, size=dim))
        for _ in range(count)
    ]
    print(f"Generating the data took {(timer() - tic) * 1000}ms.")

    # Encode everything in one batch call; the raw arrays are no longer
    # referenced afterwards.
    tic = timer()
    samples = encoder.batch_from_binary_array(samples)
    print(f"Encoding the data took {(timer() - tic) * 1000}ms.")

    # Use batch_add to parallelize the insertion of the arrays
    tic = timer()
    forest.batch_add(samples)
    print(f"Adding the data took {(timer() - tic) * 1000}ms.")

    # Index the added data
    tic = timer()
    forest.index()
    print(f"Indexing took {(timer() - tic) * 1000}ms.")

    # Linear-scan query for entry 0 with k = 10; the result is discarded
    tic = timer()
    forest.query_linear_scan_by_id(0, 10)
    print(f"The kNN search took {(timer() - tic) * 1000}ms.")

Example #15

0

Show file

    def __init__(self,
                 dimensions=1024,
                 radius=2,
                 is_counted=False,
                 is_folded=False,
                 return_strings=False):
        """
        Parameters
        ----------
        dimensions : int
            (default = 1024)
            Number of entries in the output map4 fingerprint.

        radius : int
            (default = 2)
            Number of bonds away from atom centre to consider.

        is_counted : bool
            (default = False)
            Stored on the instance; not used further in this initialiser.

        is_folded : bool
            (default = False)
            If True the encoder is an ``MHFPEncoder``, otherwise a
            ``tm.Minhash``.

        return_strings : bool
            (default = False)
            If True then returns substructure strings rather than hashed fingerprint.
        """
        self.dimensions = int(dimensions)
        self.radius = int(radius)
        self.is_counted = bool(is_counted)
        self.is_folded = bool(is_folded)
        self.return_strings = bool(return_strings)

        # Build the encoder from the normalised integer so it always agrees
        # with ``self.dimensions``, even when the caller passed a value that
        # is merely int-convertible.
        if self.is_folded:
            self.encoder = MHFPEncoder(self.dimensions)
        else:
            self.encoder = tm.Minhash(self.dimensions)

Example #16

0

Show file

def main():
    """Time insertion, indexing and kNN-graph construction.

    Works on random binary arrays; every stage prints its elapsed time in
    milliseconds.
    """
    encoder = tm.Minhash(128)
    forest = tm.LSHForest(128)

    dim = 1000
    count = 10000

    # Generating some random data
    tic = timer()
    samples = [
        tm.VectorUchar(np.random.randint(0, high=2, size=dim))
        for _ in range(count)
    ]
    print(f"Generating the data took {(timer() - tic) * 1000}ms.")

    # Use batch_add to parallelize the insertion of the arrays
    tic = timer()
    forest.batch_add(encoder.batch_from_binary_array(samples))
    print(f"Adding the data took {(timer() - tic) * 1000}ms.")

    # Index the added data
    tic = timer()
    forest.index()
    print(f"Indexing took {(timer() - tic) * 1000}ms.")

    # Build the kNN graph (k = 10); the result is written into the three
    # output vectors passed to get_knn_graph.
    tic = timer()
    edges_from = tm.VectorUint()
    edges_to = tm.VectorUint()
    edge_weights = tm.VectorFloat()
    forest.get_knn_graph(edges_from, edges_to, edge_weights, 10)
    print(f"The kNN search took {(timer() - tic) * 1000}ms.")

Example #17

0

Show file

File: test_minhash.py Project: zhangdachuanfoodies/tmap

 def test_from_binary_array(self):
     """Check hash length and distance for two dense binary arrays."""
     encoder = tm.Minhash(8)
     bits_a = tm.VectorUchar([1, 1, 1, 1, 0, 1])
     bits_b = tm.VectorUchar([1, 1, 1, 1, 1, 0])

     hash_a = encoder.from_binary_array(bits_a)
     hash_b = encoder.from_binary_array(bits_b)

     assert len(hash_a) == 8
     assert encoder.get_distance(hash_a, hash_b) == 0.125

Example #18

0

Show file

File: test_minhash.py Project: zhangdachuanfoodies/tmap

 def test_init(self):
     """A default-constructed Minhash must be a real object."""
     assert tm.Minhash() is not None

Example #19

0

Show file

File: gutenberg.py Project: zhangdachuanfoodies/tmap

def main():
    """Lay out the Gutenberg fingerprints as a tmap tree and plot them.

    Uses the module-level ``DATA``, ``LABELS``, ``FAERUN_LABELS`` and
    ``CFG_TMAP``. Rows of ``DATA`` are MinHash-encoded as sparse binary
    arrays, indexed, laid out and written to the "gutenberg" Faerun plot.
    """
    # Initialize and configure tmap
    dims = 2048
    encoder = tm.Minhash(dims)
    forest = tm.LSHForest(dims, 128, store=True)

    fingerprints = [tm.VectorUint(list(row)) for row in DATA]

    forest.batch_add(encoder.batch_from_sparse_binary_array(fingerprints))
    forest.index()

    x_tmap, y_tmap, sources, targets, _ = tm.layout_from_lsh_forest(
        forest, CFG_TMAP)
    forest.clear()

    # Custom colour map: dark gray followed by the first five tab10 colours
    tab10 = plt.get_cmap("tab10").colors
    palette = [(0.2, 0.2, 0.2)] + [tab10[i] for i in range(5)]
    custom_cm_gray = LinearSegmentedColormap.from_list(
        "custom_cm_gray", palette, N=len(palette))

    author_legend = [
        (1, "Rudyard Kipling"),
        (2, "Herbert George Wells"),
        (3, "Charles Darwin"),
        (4, "George Bernard Shaw"),
        (5, "William Wymark Jacobs"),
        (0, "Other"),
    ]

    plot = Faerun(
        clear_color="#111111",
        view="front",
        coords=False,
        alpha_blending=True,
        legend_title="",
    )
    scatter_data = {
        "x": x_tmap,
        "y": y_tmap,
        "c": LABELS,
        "labels": FAERUN_LABELS
    }
    plot.add_scatter(
        "gutenberg",
        scatter_data,
        colormap=custom_cm_gray,
        point_scale=4.2,
        max_point_size=10,
        has_legend=True,
        categorical=True,
        legend_title="Authors",
        legend_labels=author_legend,
        shader="smoothCircle",
        selected_labels=["Author", "Title"],
    )
    tree_data = {"from": sources, "to": targets}
    plot.add_tree(
        "gutenberg_tree",
        tree_data,
        point_helper="gutenberg",
        color="#222222",
    )
    plot.plot("gutenberg", template="default")

Example #20

0

Show file

File: test_minhash.py Project: zhangdachuanfoodies/tmap

 def test_from_string_array(self):
     """Check hash length and distance for two string arrays."""
     encoder = tm.Minhash(8)
     tokens_a = ["a", "b", "c", "x", "y", "z"]
     tokens_b = ["a", "b", "c", "v", "y", "z"]

     hash_a = encoder.from_string_array(tokens_a)
     hash_b = encoder.from_string_array(tokens_b)

     assert len(hash_a) == 8
     distance = encoder.get_distance(hash_a, hash_b)
     assert round(distance, 3) == 0.375

Example #21

0

Show file

File: nips.py Project: zhangdachuanfoodies/tmap

def main():
    """Lay out the NIPS papers as a tmap tree and plot them with Faerun.

    Reads ``papers.tar.xz``, tokenises each paper text, discards the ``n``
    most frequent words, encodes the remaining vocabulary indices of each
    paper with MinHash and writes the "nips_papers" Faerun plot. Points are
    coloured by year and by whether the paper contains the word "deep".
    """
    df = pd.read_csv("papers.tar.xz")
    df.drop(df.tail(1).index, inplace=True)
    df["title"] = df["title"].apply(lambda t: t.replace("'", '"'))
    enc = tm.Minhash()
    lf = tm.LSHForest()

    ctr = Counter()
    texts = []
    for paper_text in df["paper_text"]:
        text = re.sub(r"[^a-zA-Z-]+", " ", paper_text)
        text = [t.lower() for t in text.split(" ") if len(t) > 2]
        ctr.update(text)
        texts.append(text)

    # Remove the top n words. Slicing with [n:] yields an empty vocabulary
    # when fewer than n distinct words exist (the previous negative-index
    # slice returned the wrong words in that case); reversing keeps the
    # ascending-frequency order used for index assignment.
    n = 6000
    kept_words = ctr.most_common()[n:][::-1]

    # Make it fast using a lookup map
    all_words = {word: i for i, (word, _) in enumerate(kept_words)}

    # Create the fingerprints and also check whether the word
    # "deep" is found in the document
    has_word = [1 if "deep" in text else 0 for text in texts]
    fingerprints = [
        tm.VectorUint([all_words[t] for t in text if t in all_words])
        for text in texts
    ]

    # Index the article fingerprints
    lf.batch_add(enc.batch_from_sparse_binary_array(fingerprints))
    lf.index()

    # Create the tmap
    config = tm.LayoutConfiguration()
    config.k = 100
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=config)

    faerun = Faerun(
        view="front", coords=False, legend_title="", legend_number_format="{:.0f}"
    )

    # Add a scatter that is bigger than the one below, to add colored
    # circles.
    faerun.add_scatter(
        "NIPS_word",
        {"x": x, "y": y, "c": has_word, "labels": df["title"]},
        colormap="Set1",
        point_scale=7.5,
        max_point_size=25,
        shader="smoothCircle",
        has_legend=True,
        categorical=True,
        legend_title="Contains word<br/>'deep'",
        legend_labels=[(0, "No"), (1, "Yes")],
        interactive=False,
    )

    # Add a scatter that is colored by year on top
    faerun.add_scatter(
        "NIPS",
        {"x": x, "y": y, "c": df["year"], "labels": df["title"]},
        colormap="gray",
        point_scale=5.0,
        max_point_size=20,
        shader="smoothCircle",
        has_legend=True,
        legend_title="Year of<br/>Publication",
    )

    faerun.add_tree(
        "NIPS_tree", {"from": s, "to": t}, point_helper="NIPS", color="#666666"
    )

    faerun.plot("nips_papers")

Example #22

0

Show file

def main():
    """Lay out the Fashion-MNIST images as a tmap tree and plot them.

    Uses the module-level ``IMAGES``, ``IMAGE_LABELS``, ``LABELS`` and
    ``CFG`` objects. Each image is embedded as a JPEG data URI for the
    plot labels, and its values divided by 255 are encoded with a weighted
    MinHash. The time spent in tmap is printed.
    """
    # Initialize and configure tmap
    dims = 1024
    enc = tm.Minhash(28 * 28, 42, dims)
    lf = tm.LSHForest(dims * 2, 128)

    print("Converting images ...")
    for image in IMAGES:
        img = Image.fromarray(np.uint8(np.split(np.array(image), 28)))
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        # The buffer holds JPEG data, so the data URI declares image/jpeg.
        encoded = base64.b64encode(buffered.getvalue()).decode("ascii")
        IMAGE_LABELS.append("data:image/jpeg;base64," + encoded)

    weights = [tm.VectorFloat(image / 255) for image in IMAGES]

    print("Running tmap ...")
    start = timer()
    lf.batch_add(enc.batch_from_weight_array(weights))
    lf.index()
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG)
    print("tmap: " + str(timer() - start))

    legend_labels = [
        (0, "T-shirt/top"),
        (1, "Trouser"),
        (2, "Pullover"),
        (3, "Dress"),
        (4, "Coat"),
        (5, "Sandal"),
        (6, "Shirt"),
        (7, "Sneaker"),
        (8, "Bag"),
        (9, "Ankle boot"),
    ]

    faerun = Faerun(clear_color="#111111", view="front", coords=False)
    faerun.add_scatter(
        "FMNIST",
        {
            "x": x,
            "y": y,
            "c": LABELS,
            "labels": IMAGE_LABELS
        },
        colormap="tab10",
        shader="smoothCircle",
        point_scale=2.5,
        max_point_size=10,
        has_legend=True,
        categorical=True,
        legend_labels=legend_labels,
    )
    faerun.add_tree("FMNIST_tree", {
        "from": s,
        "to": t
    },
                    point_helper="FMNIST",
                    color="#666666")
    faerun.plot("fmnist", template="url_image")

Example #23

0

Show file

File: test.py Project: richardjgowers/map4

from rdkit import Chem
import tmap as tm
from map4 import MAP4Calculator

# Size shared by the MAP4 calculator and the MinHash encoder.
dim = 1024

MAP4 = MAP4Calculator(dimensions=dim)
ENC = tm.Minhash(dim)

# Parse both example molecules from their SMILES strings.
smiles_a = 'c1ccccc1'
smiles_b = 'c1cccc(N)c1'
mol_a = Chem.MolFromSmiles(smiles_a)
mol_b = Chem.MolFromSmiles(smiles_b)

# Fingerprint each molecule with its own call ...
map4_a = MAP4.calculate(mol_a)
map4_b = MAP4.calculate(mol_b)

# ... or use the parallelized version for the whole list:
fps = MAP4.calculate_many([mol_a, mol_b])

# Print the distance for the single-call pair, then for the batch pair.
for fp_a, fp_b in ((map4_a, map4_b), (fps[0], fps[1])):
    print(ENC.get_distance(fp_a, fp_b))