def main():
    """Print the MinHash distances between three small binary vectors.

    Three 10-element binary arrays are encoded with a default-constructed
    ``tm.Minhash``; the distance between the first and second encoding is
    printed, followed by the distance between the second and third.
    """
    encoder = tm.Minhash()

    bit_patterns = (
        [1, 1, 1, 1, 0, 1, 0, 1, 1, 0],
        [1, 0, 1, 1, 0, 1, 1, 0, 1, 0],
        [1, 0, 1, 1, 1, 1, 1, 0, 1, 0],
    )
    first, second, third = [
        encoder.from_binary_array(tm.VectorUchar(bits)) for bits in bit_patterns
    ]

    # Both distances are computed before anything is printed.
    first_to_second = encoder.get_distance(first, second)
    second_to_third = encoder.get_distance(second, third)

    print(first_to_second)
    print(second_to_third)
def main():
    """Lay out random binary data via an LSH forest and plot the result.

    Generates 1000 random 0/1 vectors of length 10000, MinHash-encodes them,
    adds them to an LSH forest, indexes the forest, computes a layout with
    ``tm.layout_from_lsh_forest`` and draws it with matplotlib into
    ``lsh_forest_knng_mpl.png``. The elapsed time of every stage is printed
    in milliseconds.
    """
    # The MinHash and the forest are both created with 128 permutations.
    num_perm = 128
    enc = tm.Minhash(num_perm)
    forest = tm.LSHForest(num_perm)

    vector_length = 10000
    vector_count = 1000

    # Random 0/1 input vectors
    start = timer()
    vectors = [
        tm.VectorUchar(np.random.randint(0, high=2, size=vector_length))
        for _ in range(vector_count)
    ]
    print(f"Generating the data took {(timer() - start) * 1000}ms.")

    # Encode all vectors in one batch and insert them in one batch
    start = timer()
    forest.batch_add(enc.batch_from_binary_array(vectors))
    print(f"Adding the data took {(timer() - start) * 1000}ms.")

    # The forest has to be indexed after the data was added
    start = timer()
    forest.index()
    print(f"Indexing took {(timer() - start) * 1000}ms.")

    # Layout settings for the tree plot (distribute the tree more evenly)
    layout_cfg = tm.LayoutConfiguration()
    layout_cfg.sl_scaling_min = 1
    layout_cfg.sl_scaling_max = 1
    layout_cfg.node_size = 1 / 50

    # Compute coordinates (x, y) and edge endpoints (s, t); the fifth
    # returned value is not used here.
    start = timer()
    x, y, s, t, _ = tm.layout_from_lsh_forest(forest, config=layout_cfg)
    print(f"layout_from_lsh_forest took {(timer() - start) * 1000}ms.")

    # Draw every edge as its own line segment, then the points on top
    start = timer()
    for src, dst in zip(s, t):
        plt.plot(
            [x[src], x[dst]],
            [y[src], y[dst]],
            "r-",
            linewidth=1.0,
            alpha=0.5,
            zorder=1,
        )
    plt.scatter(x, y, s=0.1, zorder=2)
    plt.tight_layout()
    plt.savefig("lsh_forest_knng_mpl.png")
    print(f"Plotting using matplotlib took {(timer() - start) * 1000}ms.")
def main():
    """Embed the images in ``IMAGES`` with tmap and render them with Faerun.

    Steps:

    1. Every image is converted to a base64 JPEG data URI and appended to
       the module-level ``IMAGE_LABELS`` list.
    2. Every image is binarized against the mean of its positive pixels and
       MinHash-encoded.
    3. The encodings are added to an LSH forest, which is indexed and laid
       out with ``tm.layout_from_lsh_forest`` using the module-level ``CFG``.
    4. A Faerun scatter plot (coloured by ``LABELS``) and the layout tree are
       written to ``outputs`` under the name ``i3d-tmap-mnist``.

    ``IMAGES``, ``LABELS``, ``IMAGE_LABELS`` and ``CFG`` are defined outside
    this function. NOTE(review): each image is assumed to be a flat sequence
    whose length is divisible by 28 - confirm against the loader.
    """
    # Initialize and configure tmap
    dims = 1024
    enc = tm.Minhash(dims)
    lf = tm.LSHForest(dims, 128)

    print("Converting images ...")
    for image in tqdm(IMAGES):
        # Split the flat pixel sequence into 28 equal rows to form a 2D image
        img = Image.fromarray(np.uint8(np.split(np.array(image), 28)))
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        # b64encode returns bytes; decode them instead of stripping the
        # "b'...'" repr by hand.
        encoded = base64.b64encode(buffered.getvalue()).decode("ascii")
        # The payload is JPEG, so the data URI declares image/jpeg.
        IMAGE_LABELS.append("data:image/jpeg;base64," + encoded)

    binarized = []
    for image in IMAGES:
        positive_count = sum(1 for x in image if x > 0)
        if positive_count == 0:
            # No positive pixel: there is no mean to threshold against, so
            # emit an all-zero vector instead of dividing by zero.
            binarized.append(tm.VectorUchar([0 for _ in image]))
            continue
        # Threshold at the mean intensity of the positive pixels
        avg = sum(image) / positive_count
        binarized.append(tm.VectorUchar([1 if x >= avg else 0 for x in image]))

    print("Running tmap ...")
    lf.batch_add(enc.batch_from_binary_array(binarized))
    lf.index()

    # x, y are point coordinates; s, t are the tree's edge endpoints
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG)

    faerun = Faerun(clear_color="#111111", view="front", coords=False)
    faerun.add_scatter(
        "MNIST",
        {"x": x, "y": y, "c": LABELS, "labels": IMAGE_LABELS},
        colormap="tab10",
        shader="smoothCircle",
        point_scale=2.5,
        max_point_size=10,
        has_legend=True,
        categorical=True,
    )
    faerun.add_tree(
        "MNIST_tree",
        {"from": s, "to": t},
        point_helper="MNIST",
        color="#666666",
    )
    faerun.plot("i3d-tmap-mnist", path="outputs", template="url_image")
def main():
    """Time MinHash encoding, LSH forest construction and one kNN query.

    Generates 10000 random 0/1 vectors of length 1000, encodes them,
    adds them to an LSH forest, indexes the forest and runs a linear-scan
    query for the 10 nearest neighbours of entry 0. The elapsed time of
    every stage is printed in milliseconds; the query result is discarded.
    """
    # The MinHash and the forest are both created with 128 permutations.
    num_perm = 128
    enc = tm.Minhash(num_perm)
    forest = tm.LSHForest(num_perm)

    vector_length = 1000
    vector_count = 10000

    # Random 0/1 input vectors
    start = timer()
    entries = [
        tm.VectorUchar(np.random.randint(0, high=2, size=vector_length))
        for _ in range(vector_count)
    ]
    print(f"Generating the data took {(timer() - start) * 1000}ms.")

    # Encode all vectors in one batch; the raw vectors are not needed
    # afterwards, so the name is rebound to the encodings.
    start = timer()
    entries = enc.batch_from_binary_array(entries)
    print(f"Encoding the data took {(timer() - start) * 1000}ms.")

    # Insert all encodings in one batch
    start = timer()
    forest.batch_add(entries)
    print(f"Adding the data took {(timer() - start) * 1000}ms.")

    # The forest has to be indexed after the data was added
    start = timer()
    forest.index()
    print(f"Indexing took {(timer() - start) * 1000}ms.")

    # Query the 10 nearest neighbours of the first entry; only the timing
    # matters here, so the result is dropped.
    start = timer()
    forest.query_linear_scan_by_id(0, 10)
    print(f"The kNN search took {(timer() - start) * 1000}ms.")
def main():
    """Time LSH forest construction and k-nearest-neighbour graph creation.

    Generates 10000 random 0/1 vectors of length 1000, MinHash-encodes and
    adds them to an LSH forest, indexes the forest and asks it for a kNN
    graph with k = 10. The elapsed time of every stage is printed in
    milliseconds; the graph itself is not used afterwards.
    """
    # The MinHash and the forest are both created with 128 permutations.
    num_perm = 128
    enc = tm.Minhash(num_perm)
    forest = tm.LSHForest(num_perm)

    vector_length = 1000
    vector_count = 10000

    # Random 0/1 input vectors
    start = timer()
    vectors = [
        tm.VectorUchar(np.random.randint(0, high=2, size=vector_length))
        for _ in range(vector_count)
    ]
    print(f"Generating the data took {(timer() - start) * 1000}ms.")

    # Encode all vectors in one batch and insert them in one batch
    start = timer()
    forest.batch_add(enc.batch_from_binary_array(vectors))
    print(f"Adding the data took {(timer() - start) * 1000}ms.")

    # The forest has to be indexed after the data was added
    start = timer()
    forest.index()
    print(f"Indexing took {(timer() - start) * 1000}ms.")

    # get_knn_graph fills the three vectors passed to it (edge sources,
    # edge targets, edge weights), so they are created empty beforehand
    # inside the timed section, as in the original measurement.
    start = timer()
    edge_sources = tm.VectorUint()
    edge_targets = tm.VectorUint()
    edge_weights = tm.VectorFloat()
    forest.get_knn_graph(edge_sources, edge_targets, edge_weights, 10)
    print(f"The kNN search took {(timer() - start) * 1000}ms.")
def test_from_binary_array(self):
    """Check ``Minhash.from_binary_array`` on two 6-element binary arrays.

    With a ``tm.Minhash`` constructed with size 8, the encoding must have
    length 8 and the distance between the two encodings must be 0.125.
    """
    encoder = tm.Minhash(8)

    first = encoder.from_binary_array(tm.VectorUchar([1, 1, 1, 1, 0, 1]))
    second = encoder.from_binary_array(tm.VectorUchar([1, 1, 1, 1, 1, 0]))

    # The length check runs before the distance is computed.
    assert len(first) == 8

    distance = encoder.get_distance(first, second)
    assert distance == 0.125