def main():
    """Time data generation, indexing and kNN-graph creation on a file-backed forest."""
    # The encoder is created with 1024 and the forest with 128.
    # NOTE(review): these two sizes differ -- confirm the mismatch is intended.
    enc = tm.Minhash(1024)
    lf = tm.LSHForest(128, file_backed=True)

    d = 10000  # length of every random 0/1 vector
    n = 1000  # number of vectors added to the forest

    # Encode each random vector and add it straight to the forest.
    start = timer()
    for _ in range(n):
        random_bits = np.random.randint(0, high=2, size=d)
        lf.add(enc.from_sparse_binary_array(tm.VectorUint(random_bits)))
    print(f"Generating the data took {(timer() - start) * 1000}ms.")

    # Build the index over everything that was added.
    start = timer()
    lf.index()
    print(f"Indexing took {(timer() - start) * 1000}ms.")

    # Fill the edge lists of the kNN graph with k = 10.
    start = timer()
    knng_from, knng_to = tm.VectorUint(), tm.VectorUint()
    knng_weight = tm.VectorFloat()
    _ = lf.get_knn_graph(knng_from, knng_to, knng_weight, 10)
    print(f"The kNN search took {(timer() - start) * 1000}ms.")
def test_knn_graph(self):
    """The 10-NN graph over 100 seeded rows has 1000 edges and known first targets."""
    random.seed(42)
    data = [
        tm.VectorUint([random.randint(0, 20) for _ in range(10)])
        for _ in range(100)
    ]

    mh = tm.Minhash()
    lf = tm.LSHForest()
    lf.batch_add(mh.batch_from_sparse_binary_array(data))
    lf.index()

    # Output vectors that get_knn_graph fills in place.
    f, t, w = tm.VectorUint(), tm.VectorUint(), tm.VectorFloat()
    lf.get_knn_graph(f, t, w, 10)

    assert len(f) == 1000
    for position, expected in enumerate((0, 26, 36, 67, 33, 83)):
        assert t[position] == expected
def test_from_weight_array(self):
    """Weighted hashes have length 128 and the expected rounded weighted distance."""
    mh = tm.Minhash(8, 42, 64)
    weights_a = tm.VectorFloat([0.2, 0.6, 0.22, 0.26, 0.62, 0.66])
    weights_b = tm.VectorFloat([0.26, 0.6, 0.22, 0.26, 0.62, 1.0])

    a = mh.from_weight_array(weights_a)
    b = mh.from_weight_array(weights_b)

    assert len(a) == 128
    distance = mh.get_weighted_distance(a, b)
    assert round(distance, 3) == 0.094
def test_from_sparse_binary_array(self):
    """Two index lists differing in one entry hash to length 8 and distance 0.25."""
    mh = tm.Minhash(8)
    indices_a = tm.VectorUint([6, 22, 26, 62, 626, 226622])
    indices_b = tm.VectorUint([6, 22, 26, 62, 262, 226622])

    a = mh.from_sparse_binary_array(indices_a)
    b = mh.from_sparse_binary_array(indices_b)

    assert len(a) == 8
    distance = mh.get_distance(a, b)
    assert round(distance, 2) == 0.25
def main():
    """Lay out the images found in ``coil_20`` with tmap and plot them with Faerun.

    Every file in the ``coil_20`` directory is read as an image. Its label is
    parsed from the file name (the number after ``obj`` and before ``__``,
    minus one). Pixel values divided by 255 are used as MinHash weights.
    """
    # Initialize and configure tmap
    dims = 2048
    enc = tm.Minhash(16384, 42, dims)
    lf = tm.LSHForest(dims * 2, 128, weighted=True)

    images = []
    labels = []
    image_labels = []

    for filename in os.listdir("coil_20"):
        labels.append(int(filename.split("__")[0].replace("obj", "")) - 1)
        # Close each file as soon as its pixels are copied, so handles do
        # not pile up while the directory is read.
        with Image.open("coil_20/" + filename) as source:
            images.append(list(source.getdata()))

    for image in images:
        # Rebuild a 2-D image from the flat pixel list (128 equal parts).
        img = Image.fromarray(np.uint8(np.split(np.array(image), 128)))
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        # b64encode returns ASCII bytes; decode them instead of stripping
        # the repr() of a bytes object.
        img_str = base64.b64encode(buffered.getvalue()).decode("ascii")
        # NOTE(review): the payload is JPEG but the URI says image/bmp;
        # left unchanged, confirm whether the consumer relies on it.
        image_labels.append("data:image/bmp;base64," + img_str)

    # Scale every pixel by 1/255. The previously computed per-image average
    # was never used and raised ZeroDivisionError on an all-black image.
    tmp = [[pixel / 255 for pixel in image] for image in images]

    lf.batch_add(enc.batch_from_weight_array(tmp))
    lf.index()

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf)

    faerun = Faerun(clear_color="#111111", view="front", coords=False)
    faerun.add_scatter(
        "COIL20",
        {
            "x": x,
            "y": y,
            "c": labels,
            "labels": image_labels
        },
        colormap="tab20",
        shader="smoothCircle",
        point_scale=2.5,
        max_point_size=10,
        has_legend=True,
        categorical=True,
    )
    faerun.add_tree("COIL20_tree", {
        "from": s,
        "to": t
    },
                    point_helper="COIL20",
                    color="#666666")
    faerun.plot("coil", template="url_image")
def main():
    """Time each stage of a random-data tmap run and save the tree as a PNG."""
    enc = tm.Minhash(128)
    lf = tm.LSHForest(128)

    d = 10000  # length of every random 0/1 vector
    n = 1000  # number of vectors

    # Random binary input data.
    start = timer()
    data = [
        tm.VectorUchar(np.random.randint(0, high=2, size=d)) for _ in range(n)
    ]
    print(f"Generating the data took {(timer() - start) * 1000}ms.")

    # Encode and insert everything in one batch call.
    start = timer()
    lf.batch_add(enc.batch_from_binary_array(data))
    print(f"Adding the data took {(timer() - start) * 1000}ms.")

    start = timer()
    lf.index()
    print(f"Indexing took {(timer() - start) * 1000}ms.")

    # Layout settings used for the tree plot.
    cfg = tm.LayoutConfiguration()
    cfg.sl_scaling_min = 1
    cfg.sl_scaling_max = 1
    cfg.node_size = 1 / 50

    # Compute coordinates (x, y) and tree edges (s -> t).
    start = timer()
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)
    print(f"layout_from_lsh_forest took {(timer() - start) * 1000}ms.")

    # Draw one red segment per edge, then the points on top.
    start = timer()
    for src, dst in zip(s, t):
        plt.plot(
            [x[src], x[dst]],
            [y[src], y[dst]],
            "r-",
            linewidth=1.0,
            alpha=0.5,
            zorder=1,
        )
    plt.scatter(x, y, s=0.1, zorder=2)
    plt.tight_layout()
    plt.savefig("lsh_forest_knng_mpl.png")
    print(f"Plotting using matplotlib took {(timer() - start) * 1000}ms.")
def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False):
    """Set up the MAP4 calculator and choose its encoder.

    The encoder is an ``MHFPEncoder`` when ``is_folded`` is truthy and a
    ``tm.Minhash`` otherwise; both are constructed with ``dimensions``.
    """
    self.radius = radius
    self.is_counted = is_counted
    self.is_folded = is_folded

    encoder_factory = MHFPEncoder if self.is_folded else tm.Minhash
    self.encoder = encoder_factory(dimensions)
def main():
    """Binarize the MNIST images, lay them out with tmap and plot with Faerun."""
    dims = 1024
    enc = tm.Minhash(dims)
    lf = tm.LSHForest(dims, 128)

    print("Converting images ...")
    for image in tqdm(IMAGES):
        # Rebuild a 2-D image from the flat pixel sequence (28 equal parts).
        pixels = np.uint8(np.split(np.array(image), 28))
        buffered = BytesIO()
        Image.fromarray(pixels).save(buffered, format="JPEG")
        encoded = base64.b64encode(buffered.getvalue()).decode("ascii")
        IMAGE_LABELS.append("data:image/bmp;base64," + encoded)

    # A pixel becomes 1 when it reaches the mean of the image's non-zero
    # pixels (sum of all pixels / number of pixels > 0), else 0.
    tmp = []
    for image in IMAGES:
        total = sum(image)
        lit_pixels = sum(1 for x in image if x > 0)
        avg = total / lit_pixels
        tmp.append(tm.VectorUchar([1 if x >= avg else 0 for x in image]))

    print("Running tmap ...")
    lf.batch_add(enc.batch_from_binary_array(tmp))
    lf.index()
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG)

    faerun = Faerun(clear_color="#111111", view="front", coords=False)
    scatter_data = {"x": x, "y": y, "c": LABELS, "labels": IMAGE_LABELS}
    faerun.add_scatter(
        "MNIST",
        scatter_data,
        colormap="tab10",
        shader="smoothCircle",
        point_scale=2.5,
        max_point_size=10,
        has_legend=True,
        categorical=True,
    )
    faerun.add_tree(
        "MNIST_tree",
        {"from": s, "to": t},
        point_helper="MNIST",
        color="#666666",
    )
    faerun.plot("i3d-tmap-mnist", path="outputs", template="url_image")
def __init__(self, model: BertModel, tokenizer: SmilesTokenizer, permutations=256, seed=42, force_no_cuda=False):
    """Store the model and tokenizer and build the helpers used for hashing.

    Parameters
    ----------
    model : BertModel
        Model whose ``config.hidden_size`` is passed as the first argument
        of ``tm.Minhash``.
    tokenizer : SmilesTokenizer
        Tokenizer handed to the wrapped ``RXNBERTFingerprintGenerator``.
    permutations : int (default = 256)
        Passed as the third positional argument of ``tm.Minhash``.
    seed : int (default = 42)
        Passed as the second positional argument of ``tm.Minhash``.
    force_no_cuda : bool (default = False)
        If True, ``self.device`` is the CPU even when CUDA is available.
    """
    # Zero-argument super() binds to this class and instance. The earlier
    # one-argument form only built an unbound super object, so the parent
    # initializer was never run.
    super().__init__()

    # Imported here so tmap is only required when this class is instantiated.
    import tmap as tm

    self.model = model
    self.tokenizer = tokenizer
    self.minhash = tm.Minhash(model.config.hidden_size, seed, permutations)
    self.generator = RXNBERTFingerprintGenerator(model, tokenizer)
    use_cuda = torch.cuda.is_available() and not force_no_cuda
    self.device = torch.device("cuda" if use_cuda else "cpu")
def main():
    """Lay out the rows of ``DATA`` with a weighted MinHash and plot them.

    Each row of ``DATA`` becomes one weight vector. The resulting points are
    coloured and labelled by ``LABELS`` and written to the ``rnaseq`` plot.
    """
    # Initialize and configure tmap
    dims = 256
    enc = tm.Minhash(len(DATA.columns), 42, dims)
    lf = tm.LSHForest(dims * 2, 32, weighted=True)

    fps = [tm.VectorFloat(list(row)) for _, row in DATA.iterrows()]

    lf.batch_add(enc.batch_from_weight_array(fps))
    lf.index()
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG_TMAP)
    # The layout is computed; clear the forest's contents.
    lf.clear()

    # A list (not a set) keeps the legend entries in a fixed order; a set of
    # tuples containing strings may iterate differently from run to run.
    legend_labels = [
        (1, "PRAD"),
        (2, "LUAD"),
        (3, "BRCA"),
        (4, "KIRC"),
        (5, "COAD"),
    ]

    # Create the plot
    faerun = Faerun(view="front", coords=False, legend_title="")
    faerun.add_scatter(
        "RNASEQ",
        {
            "x": x,
            "y": y,
            "c": LABELS,
            "labels": LABELS
        },
        colormap="tab10",
        point_scale=5.0,
        max_point_size=10,
        shader="smoothCircle",
        has_legend=True,
        categorical=True,
        legend_labels=legend_labels,
        legend_title="Tumor Types",
    )
    faerun.add_tree("RNASEQ_tree", {
        "from": s,
        "to": t
    },
                    point_helper="RNASEQ",
                    color="#666666")
    faerun.plot("rnaseq")
def main():
    """Print the MinHash distances a-b and b-c for three small binary vectors."""
    enc = tm.Minhash()

    bit_rows = (
        [1, 1, 1, 1, 0, 1, 0, 1, 1, 0],
        [1, 0, 1, 1, 0, 1, 1, 0, 1, 0],
        [1, 0, 1, 1, 1, 1, 1, 0, 1, 0],
    )
    mh_a, mh_b, mh_c = [
        enc.from_binary_array(tm.VectorUchar(bits)) for bits in bit_rows
    ]

    dist_a_b = enc.get_distance(mh_a, mh_b)
    dist_b_c = enc.get_distance(mh_b, mh_c)

    print(dist_a_b)
    print(dist_b_c)
def test_lf_layout(self):
    """A layout of 100 seeded rows yields 100 x-coordinates and 99 edge sources."""
    random.seed(42)
    data = [
        tm.VectorUint([random.randint(0, 20) for _ in range(10)])
        for _ in range(100)
    ]

    mh = tm.Minhash()
    lf = tm.LSHForest()
    lf.batch_add(mh.batch_from_sparse_binary_array(data))
    lf.index()

    layout = tm.layout_from_lsh_forest(lf)
    x, _y, s, _t, _gp = layout

    assert len(x) == 100
    assert len(s) == 99
def test_query(self):
    """Linear-scan query for entry 0 returns ids 0 and 26 as its first two hits."""
    random.seed(42)
    data = [
        tm.VectorUint([random.randint(0, 20) for _ in range(10)])
        for _ in range(100)
    ]

    mh = tm.Minhash()
    lf = tm.LSHForest()
    lf.batch_add(mh.batch_from_sparse_binary_array(data))
    lf.index()

    assert lf.size() == len(data)

    r = lf.query_linear_scan_by_id(0, 10)
    first_hit, second_hit = r[0], r[1]
    assert first_hit[1] == 0
    assert second_hit[1] == 26
def main():
    """Time generation, encoding, insertion, indexing and one linear-scan query."""
    enc = tm.Minhash(128)
    lf = tm.LSHForest(128)

    d = 1000  # length of every random 0/1 vector
    n = 10000  # number of vectors

    # Random binary input data.
    start = timer()
    data = [
        tm.VectorUchar(np.random.randint(0, high=2, size=d)) for _ in range(n)
    ]
    print(f"Generating the data took {(timer() - start) * 1000}ms.")

    # Encode all vectors in one batch call; ``data`` now holds the hashes.
    start = timer()
    data = enc.batch_from_binary_array(data)
    print(f"Encoding the data took {(timer() - start) * 1000}ms.")

    # Insert the hashes in one batch call.
    start = timer()
    lf.batch_add(data)
    print(f"Adding the data took {(timer() - start) * 1000}ms.")

    start = timer()
    lf.index()
    print(f"Indexing took {(timer() - start) * 1000}ms.")

    # Query 10 neighbours of the entry with id 0.
    start = timer()
    _ = lf.query_linear_scan_by_id(0, 10)
    print(f"The kNN search took {(timer() - start) * 1000}ms.")
def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False, return_strings=False):
    """
    Parameters
    ----------
    dimensions : int (default = 1024)
        Number of entries in the output map4 fingerprint.
    radius : int (default = 2)
        Number of bonds away from atom centre to consider.
    is_counted : bool (default = False)
        Stored on the instance; not otherwise used by this initializer.
    is_folded : bool (default = False)
        If True the encoder is an ``MHFPEncoder``, otherwise a
        ``tm.Minhash``.
    return_strings : bool (default = False)
        If True then returns substructure strings rather than hashed fingerprint.
    """
    self.dimensions = int(dimensions)
    self.radius = int(radius)
    self.is_counted = bool(is_counted)
    self.is_folded = bool(is_folded)
    self.return_strings = bool(return_strings)

    # Build the encoder from the coerced value so it always agrees with
    # ``self.dimensions`` (the raw argument may not be a plain int).
    if self.is_folded:
        self.encoder = MHFPEncoder(self.dimensions)
    else:
        self.encoder = tm.Minhash(self.dimensions)
def main():
    """Time generation, batch insertion, indexing and kNN-graph construction."""
    enc = tm.Minhash(128)
    lf = tm.LSHForest(128)

    d = 1000  # length of every random 0/1 vector
    n = 10000  # number of vectors

    # Random binary input data.
    start = timer()
    data = [
        tm.VectorUchar(np.random.randint(0, high=2, size=d)) for _ in range(n)
    ]
    print(f"Generating the data took {(timer() - start) * 1000}ms.")

    # Encode and insert everything in one batch call.
    start = timer()
    hashes = enc.batch_from_binary_array(data)
    lf.batch_add(hashes)
    print(f"Adding the data took {(timer() - start) * 1000}ms.")

    start = timer()
    lf.index()
    print(f"Indexing took {(timer() - start) * 1000}ms.")

    # Fill the edge lists of the kNN graph with k = 10.
    start = timer()
    knng_from, knng_to = tm.VectorUint(), tm.VectorUint()
    knng_weight = tm.VectorFloat()
    _ = lf.get_knn_graph(knng_from, knng_to, knng_weight, 10)
    print(f"The kNN search took {(timer() - start) * 1000}ms.")
def test_from_binary_array(self):
    """Two 6-bit vectors hash to length 8 and are exactly 0.125 apart."""
    mh = tm.Minhash(8)
    bits_a = tm.VectorUchar([1, 1, 1, 1, 0, 1])
    bits_b = tm.VectorUchar([1, 1, 1, 1, 1, 0])

    a = mh.from_binary_array(bits_a)
    b = mh.from_binary_array(bits_b)

    assert len(a) == 8
    distance = mh.get_distance(a, b)
    assert distance == 0.125
def test_init(self):
    """A default-constructed Minhash is a real object."""
    assert tm.Minhash() is not None
def main():
    """Lay out the rows of ``DATA`` with tmap and plot them coloured by author."""
    dims = 2048
    enc = tm.Minhash(dims)
    lf = tm.LSHForest(dims, 128, store=True)

    # One sparse index vector per row of DATA.
    fps = [tm.VectorUint(list(row)) for row in DATA]

    lf.batch_add(enc.batch_from_sparse_binary_array(fps))
    lf.index()
    x_tmap, y_tmap, s, t, _ = tm.layout_from_lsh_forest(lf, CFG_TMAP)
    lf.clear()

    # Colour map: a dark gray followed by the first five tab10 colours.
    tab10 = plt.get_cmap("tab10").colors
    colors_gray = [(0.2, 0.2, 0.2), *tab10[:5]]
    custom_cm_gray = LinearSegmentedColormap.from_list(
        "custom_cm_gray", colors_gray, N=len(colors_gray)
    )

    legend_labels = [
        (1, "Rudyard Kipling"),
        (2, "Herbert George Wells"),
        (3, "Charles Darwin"),
        (4, "George Bernard Shaw"),
        (5, "William Wymark Jacobs"),
        (0, "Other"),
    ]

    faerun = Faerun(
        clear_color="#111111",
        view="front",
        coords=False,
        alpha_blending=True,
        legend_title="",
    )
    scatter_data = {
        "x": x_tmap,
        "y": y_tmap,
        "c": LABELS,
        "labels": FAERUN_LABELS,
    }
    faerun.add_scatter(
        "gutenberg",
        scatter_data,
        colormap=custom_cm_gray,
        point_scale=4.2,
        max_point_size=10,
        has_legend=True,
        categorical=True,
        legend_title="Authors",
        legend_labels=legend_labels,
        shader="smoothCircle",
        selected_labels=["Author", "Title"],
    )
    faerun.add_tree(
        "gutenberg_tree",
        {"from": s, "to": t},
        point_helper="gutenberg",
        color="#222222",
    )
    faerun.plot("gutenberg", template="default")
def test_from_string_array(self):
    """Two string lists differing in one entry hash to length 8 and distance 0.375."""
    mh = tm.Minhash(8)
    words_a = ["a", "b", "c", "x", "y", "z"]
    words_b = ["a", "b", "c", "v", "y", "z"]

    a = mh.from_string_array(words_a)
    b = mh.from_string_array(words_b)

    assert len(a) == 8
    distance = mh.get_distance(a, b)
    assert round(distance, 3) == 0.375
def main():
    """Build a tmap of the papers in ``papers.tar.xz`` and plot it with Faerun.

    Each paper's text is reduced to lower-case words longer than two
    characters. The ``n`` most frequent words over all papers are discarded
    and the remaining vocabulary is MinHash-encoded per paper. The plot has
    one scatter layer showing whether the word "deep" occurs in a paper, a
    second layer coloured by publication year, and the tree.
    """
    df = pd.read_csv("papers.tar.xz")
    # Discard the last row of the table.
    df.drop(df.tail(1).index, inplace=True)
    df["title"] = df["title"].apply(lambda t: t.replace("'", '"'))

    enc = tm.Minhash()
    lf = tm.LSHForest()

    ctr = Counter()
    texts = []
    for _, row in df.iterrows():
        # Keep only letters and hyphens, then split on single spaces.
        text = re.sub(r"[^a-zA-Z-]+", " ", row["paper_text"])
        text = [t.lower() for t in text.split(" ") if len(t) > 2]
        ctr.update(text)
        texts.append(text)

    # Remove the top n words. Slicing from n onwards yields an empty
    # vocabulary when fewer than n distinct words exist; the reversed order
    # (rarest first) keeps the word ids the same as before.
    n = 6000
    kept_words = ctr.most_common()[n:]
    kept_words.reverse()

    # Make it fast using a lookup map
    all_words = {key: i for i, (key, _) in enumerate(kept_words)}

    # Create the fingerprints and also check whether the word
    # "deep" is found in the document
    fingerprints = []
    has_word = []
    for text in texts:
        has_word.append(1 if "deep" in text else 0)
        fingerprints.append(
            tm.VectorUint([all_words[t] for t in text if t in all_words])
        )

    # Index the article fingerprints
    lf.batch_add(enc.batch_from_sparse_binary_array(fingerprints))
    lf.index()

    # Create the tmap
    config = tm.LayoutConfiguration()
    config.k = 100
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=config)

    faerun = Faerun(
        view="front", coords=False, legend_title="", legend_number_format="{:.0f}"
    )

    # Larger, non-interactive circles that show whether "deep" occurs.
    faerun.add_scatter(
        "NIPS_word",
        {"x": x, "y": y, "c": has_word, "labels": df["title"]},
        colormap="Set1",
        point_scale=7.5,
        max_point_size=25,
        shader="smoothCircle",
        has_legend=True,
        categorical=True,
        legend_title="Contains word<br/>'deep'",
        legend_labels=[(0, "No"), (1, "Yes")],
        interactive=False,
    )

    # Add a scatter that is colored by year on top
    faerun.add_scatter(
        "NIPS",
        {"x": x, "y": y, "c": df["year"], "labels": df["title"]},
        colormap="gray",
        point_scale=5.0,
        max_point_size=20,
        shader="smoothCircle",
        has_legend=True,
        legend_title="Year of<br/>Publication",
    )

    faerun.add_tree(
        "NIPS_tree", {"from": s, "to": t}, point_helper="NIPS", color="#666666"
    )

    faerun.plot("nips_papers")
def main():
    """Lay out the Fashion-MNIST images with a weighted MinHash and plot them."""
    dims = 1024
    enc = tm.Minhash(28 * 28, 42, dims)
    lf = tm.LSHForest(dims * 2, 128)

    print("Converting images ...")
    for image in IMAGES:
        # Rebuild a 2-D image from the flat pixel sequence (28 equal parts).
        pixels = np.uint8(np.split(np.array(image), 28))
        buffered = BytesIO()
        Image.fromarray(pixels).save(buffered, format="JPEG")
        encoded = base64.b64encode(buffered.getvalue()).decode("ascii")
        IMAGE_LABELS.append("data:image/bmp;base64," + encoded)

    # Pixel values divided by 255 serve as the MinHash weights.
    tmp = [tm.VectorFloat(image / 255) for image in IMAGES]

    print("Running tmap ...")
    start = timer()
    lf.batch_add(enc.batch_from_weight_array(tmp))
    lf.index()
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG)
    print("tmap: " + str(timer() - start))

    class_names = (
        "T-shirt/top",
        "Trouser",
        "Pullover",
        "Dress",
        "Coat",
        "Sandal",
        "Shirt",
        "Sneaker",
        "Bag",
        "Ankle boot",
    )
    legend_labels = list(enumerate(class_names))

    faerun = Faerun(clear_color="#111111", view="front", coords=False)
    scatter_data = {"x": x, "y": y, "c": LABELS, "labels": IMAGE_LABELS}
    faerun.add_scatter(
        "FMNIST",
        scatter_data,
        colormap="tab10",
        shader="smoothCircle",
        point_scale=2.5,
        max_point_size=10,
        has_legend=True,
        categorical=True,
        legend_labels=legend_labels,
    )
    faerun.add_tree(
        "FMNIST_tree",
        {"from": s, "to": t},
        point_helper="FMNIST",
        color="#666666",
    )
    faerun.plot("fmnist", template="url_image")
from rdkit import Chem
import tmap as tm
from map4 import MAP4Calculator

# The calculator and the distance encoder are created with the same size.
dim = 1024
MAP4 = MAP4Calculator(dimensions=dim)
ENC = tm.Minhash(dim)

# Two example molecules given as SMILES strings.
smiles_a = 'c1ccccc1'
smiles_b = 'c1cccc(N)c1'

mol_a = Chem.MolFromSmiles(smiles_a)
mol_b = Chem.MolFromSmiles(smiles_b)

# One fingerprint per call ...
map4_a = MAP4.calculate(mol_a)
map4_b = MAP4.calculate(mol_b)

# ... or use parallelized version:
fps = MAP4.calculate_many([mol_a, mol_b])

# Print the distance for each way of computing the fingerprints.
for fp_first, fp_second in ((map4_a, map4_b), (fps[0], fps[1])):
    print(ENC.get_distance(fp_first, fp_second))