def __init__(
    self,
    fdir=None,
    name=None,
    source="body",
    redis_dir=None,
    basename=None,
    port=6379,
    host="localhost",
    num_perm=128,
    threshold=0.5,
    create=False,
    **kwargs,
):
    self.name = name or self.__class__.__name__
    self.indexed = False
    self.set_storage(fdir)
    self.source = source
    self.redis_dir = redis_dir if redis_dir and Path(redis_dir).exists() else None
    self.basename = str(fdir or self) if (basename is None) else basename
    self.port = port
    self.host = host
    self.num_perm = num_perm
    self.perm = datasketch.MinHash(num_perm=self.num_perm).permutations
    self.threshold = threshold
    self.lshindex = None
    self.ci_tidi = {}
    self.digests: typing.Any = None
    self.digests_list: typing.List[typing.Any] = []
def index(self):
    self.digests = np.array(self.digests_list)
    del self.digests_list
    self.lshindex = self.make_lshindex()
    with self.lshindex.insertion_session() as session:
        ci_tidis = cluster.progress(self.ci_tidi.items(), f"Indexing {self.name}")
        for ci, (ti, di) in ci_tidis:
            mh = self.digests[di]
            name = f"{ti}-{ci}"
            m = datasketch.MinHash(
                num_perm=self.num_perm, permutations=self.perm, hashvalues=mh
            )
            session.insert(name, m, check_duplication=False)
    if self.redis_dir:
        r = self.cli("save")
        log.info(f"Saved redis with code {r.returncode}")
    self.indexed = True
    if self.storage:
        digestsdf = (
            pd.DataFrame(self.digests)
            .reset_index()
            .melt(id_vars=["index"], var_name="dim", value_name="val")
        )
        self.storage.save_df(digestsdf, "digests")
        self.storage.save_pickle(self.ci_tidi, "ci_tidi")
        self.storage.save_pickle(self.lshindex, "lshindex")
    self.close()
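# `make_lshindex` is referenced above but not shown in this section. A minimal
# sketch of what it might return, assuming it simply wraps datasketch.MinHashLSH
# with the stored threshold and num_perm (the Redis-backed storage variant is
# left out); the method body is an assumption, not the original implementation.
import datasketch

def make_lshindex(self):
    # In-memory LSH index whose insertion_session()/query() calls match how
    # index() and block() use it.
    return datasketch.MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)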
def build_lsh(self, permutations=128):
    """Build the LSH object by injecting the minhashes into it.

    :param permutations: Number of permutations to use for minhashing.
    :type permutations: int
    :return: Returning self so that we can chain commands together.
    :rtype: LshDeduper
    """
    logger.info("Building minhashes...")
    for doc in common_tools.parse_data(self.data_dir):
        mh = datasketch.MinHash(num_perm=permutations)
        word_set = set(s.encode('utf-8') for s in doc["content"].split())
        for word in word_set:
            mh.update(word)
        self.minhashes.append((doc["filename"], mh))
    if not self.lsh:
        raise Exception("Please first 'create_lsh' or 'load_lsh'")
    logger.debug("Inserting minhashes into lsh")
    with self.lsh.insertion_session() as session:
        for key, minhash in self.minhashes:
            session.insert(key, minhash)
    return self
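# build_lsh() raises unless self.lsh has already been set, but neither
# create_lsh nor load_lsh appears in this section. A hedged sketch of what
# create_lsh could look like, assuming it only instantiates a
# datasketch.MinHashLSH for the deduper; the method body and defaults are
# assumptions, not the original code.
import datasketch

def create_lsh(self, threshold=0.75, permutations=128):
    # Prepare an empty MinHashLSH index so build_lsh() has something to insert into.
    self.lsh = datasketch.MinHashLSH(threshold=threshold, num_perm=permutations)
    return self

# Chained usage, assuming an LshDeduper instance with data_dir set:
# deduper.create_lsh(threshold=0.75).build_lsh(permutations=128)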
def get_minhash_signatures(sets, num_perm=128):
    assert isinstance(sets, np.ndarray) and len(sets.shape) == 3
    hashes = []
    for i in range(0, len(sets)):
        m = dk.MinHash(num_perm=num_perm)
        # The original snippet breaks off here after a bare split_into_shingles()
        # call and never fills or returns `hashes`. Assumed completion, mirroring
        # calc_jaccard_similarities(): shingle each row of sets[i], hash the
        # joined shingles, and collect the signature.
        for row in sets[i]:
            for shingle in split_into_shingles(row, k=2):
                m.update(''.join(shingle.astype(str)).encode('utf-8'))
        hashes.append(m)
    return hashes
def block(self, ti: int, cis: typing.Collection[int]):
    assert self.lshindex is not None
    for ci in cis:
        if ci in self.ci_tidi:
            ti, di = self.ci_tidi[ci]
            mh = self.digests[di]
            m = datasketch.MinHash(
                num_perm=self.num_perm, permutations=self.perm, hashvalues=mh
            )
            for name in self.lshindex.query(m):
                ti, _ = map(int, name.split("-", 1))
                yield ti
def minhash_data(data: List[Any]) -> datasketch.LeanMinHash:
    minhash = datasketch.MinHash(num_perm=256)
    for element in data:
        try:
            minhash.update(element.encode("utf-8"))
        except AttributeError as e:
            logger.warning(e)
            continue
    return datasketch.LeanMinHash(seed=minhash.seed, hashvalues=minhash.hashvalues)
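# Short usage sketch for minhash_data(): the returned LeanMinHash objects can be
# compared directly with .jaccard(); the token lists below are made up for
# illustration and are not part of the original snippet.
a = minhash_data("the quick brown fox".split())
b = minhash_data("the quick brown dog".split())
print(a.jaccard(b))  # estimated Jaccard similarity of the two token sets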
def __init__(self, process='m'):
    self.songname = ''
    self.fingerprint = datasketch.MinHash(num_perm=256)
    self.framerate = []
    if process == 'a':
        self.ask_user()
    elif process == 'm':
        pass
    else:
        if input(
            'Enter "a" for automated fingerprinting or "m" to proceed manually: '
        ) == 'a':
            self.ask_user()
        else:
            sys.exit('Error: Incorrect entry.')
async def debugvid():
    recommenter = await Recommenter.create()  # type: Recommenter
    videos = recommenter.readFromSQL("videoInfo3.db")[:1000]
    for video in videos:
        if video.id in ["5DGwOJXSxqg", "mWXurqWRA74"]:
            print(f"Generating comment minhash for {video.id}")
            shingles = w_shingle(video.comment_content, SHINGLE_SIZE_COMMENTS)
            minhash = datasketch.MinHash(num_perm=800)
            rated_shingles = await recommenter.word_frequency.sort_shingles(shingles)
            print(f"Top 4 comment shingles for {video.id} are {rated_shingles[:4]}")
            for shingle in rated_shingles[:800]:
                minhash.update(shingle.encode('utf8'))
            minhash_id = f"{video.id}-comment"
            print(f"Storing minhash {minhash_id}")
            await recommenter.store_minhash(minhash_id, minhash)
            print(f"Inserting minhash {minhash_id}")
            await recommenter.comments.insert_minhash_obj(minhash_id, minhash)
def tree_sim(deprels):
    cfg = {
        'use_trunc_leaves': True,
        'use_drop_nodes': False,
        'use_replace_attr': False,
    }
    mhash = []
    for deprel in deprels:
        adjac = [(index + 1, head, dep) for index, (head, dep) in enumerate(deprel)]
        nested = ts.adjac_to_nested_with_attr(adjac)
        nested = ts.remove_node_ids(nested)
        shingled = ts.shingleset(nested, **cfg)
        stringified = [json.dumps(tree).encode('utf-8') for tree in shingled]
        m = datasketch.MinHash(num_perm=256)
        for s in stringified:
            m.update(s)
        mhash.append(m)
    return mhash[0].jaccard(mhash[1])
def add(self, table):
    rows = []
    if self.source != "head":
        rows += list(
            tuple(cell.get("text", "").lower() for cell in r)
            for r in table["tableData"]
        )
    if self.source != "body":
        rows += list(
            tuple(cell.get("text", "").lower() for cell in r)
            for r in table["tableHeaders"]
        )
    cols = list(zip(*rows))

    if not table.get("numericColumns", []):

        def isnum(col):
            num = lambda x: x.translate(str.maketrans("", "", "-.,%")).isnumeric()
            return sum(int(num(c)) for c in col) / len(col) > 0.5

        table["numericColumns"] = [i for i, c in enumerate(zip(*rows)) if isnum(c)]

    ci_range = range(
        table["columnIndexOffset"], table["columnIndexOffset"] + table["numCols"]
    )
    ti = table["tableIndex"]
    for col, (ci, cells) in enumerate(zip(ci_range, cols)):
        if col not in table.get("numericColumns", []):
            cells = set(c for c in cells if c)
            if len(cells) > 0:
                m = datasketch.MinHash(num_perm=self.num_perm, permutations=self.perm)
                for c in cells:
                    m.update(c.encode("utf8"))
                self.ci_tidi[ci] = (ti, len(self.digests_list))
                self.digests_list.append(m.digest())
def minhash_dedupe(data_dir="news_data", threshold=0.75, permutations=128):
    """Deduplicate by creating the minhash approximation of a jaccard score.

    3rd party libraries: https://ekzhu.github.io/datasketch/minhash.html

    :param data_dir: Location of all documents.
    :type data_dir: str
    :param threshold: Threshold above which we consider two documents duplicates.
    :type threshold: float
    :param permutations: Number of permutations to use for the minhash.
    :type permutations: int
    :return: The minhash duplicates.
    :rtype: list of floats
    """
    minhashes = []
    for doc in common_tools.parse_data(data_dir):
        mh = datasketch.MinHash(num_perm=permutations)
        words = [s.encode('utf-8') for s in doc["content"].split()]
        for word in words:
            mh.update(word)
        minhashes.append(mh)

    duplicates = []
    for i_doc in range(len(minhashes)):
        for j_doc in range(i_doc + 1, len(minhashes)):
            minhash_similarity = minhashes[i_doc].jaccard(minhashes[j_doc])
            is_duplicate = minhash_similarity >= threshold
            if is_duplicate:
                duplicates.append(minhash_similarity)
                # TODO cluster duplicates
    logger.info("Number of minhash duplicates with threshold {} = {}".format(
        threshold, len(duplicates)))
    return duplicates
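# The pairwise loop above is O(n^2) in the number of documents. A hedged sketch
# of how the same threshold check could be delegated to datasketch.MinHashLSH
# instead; minhash_dedupe_lsh and the "doc-<i>" keys are illustrative and not
# part of the original code.
import datasketch

def minhash_dedupe_lsh(minhashes, threshold=0.75, permutations=128):
    # Index every MinHash, then query it back: only documents that share an
    # LSH band are considered candidates, instead of comparing every pair.
    lsh = datasketch.MinHashLSH(threshold=threshold, num_perm=permutations)
    for idx, mh in enumerate(minhashes):
        lsh.insert(f"doc-{idx}", mh)
    candidates = {}
    for idx, mh in enumerate(minhashes):
        candidates[f"doc-{idx}"] = [key for key in lsh.query(mh) if key != f"doc-{idx}"]
    return candidates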
def cluster_by_similarites(self, threshold=0.7, num_perm=128):
    from scipy.cluster.hierarchy import linkage, cophenet, fcluster

    n_pieces = len(self.sets)
    n_rows = self.sets.shape[1] if isinstance(self.sets, np.ndarray) else len(self.sets[0])
    minhashes = []  # pieces
    shingles_idx = 0
    for p in range(0, n_pieces):
        piece = self.sets[p]
        minhash = dk.MinHash(num_perm=num_perm)
        n_cols = len(piece[0])
        n_sequence = n_cols + self.k - 1
        n_shingle_elements = n_sequence - self.k + 1
        shingles = np.empty(n_rows * n_shingle_elements, dtype="S" + str(self.k))
        # iterating sequences from a region
        for s in range(0, n_rows):
            # input sequence considering surplus characters
            sequence = np.empty((n_sequence,), dtype="S1")
            sequence[0:n_cols] = piece[s]
            if p != n_pieces - 1:  # if we aren't on the last piece:
                next_piece = self.sets[p + 1]
                sequence[n_cols:] = next_piece[s][0:self.k - 1]  # surplus
            else:
                sequence[n_cols:] = 'Z'  # TODO: possibly replace with the most likely value
            shingled_sequence = self.__split_into_shingles__(sequence)
            assert len(shingled_sequence) == n_shingle_elements, \
                'Shingled sequence: ' + str(len(shingled_sequence)) + ' and fixed len ' + str(n_shingle_elements)
            print(shingles_idx + n_cols)
            shingles[shingles_idx:shingles_idx + n_shingle_elements] = shingled_sequence
            shingles_idx += n_shingle_elements
        for word in shingles:
            minhash.update(word)
        minhashes.append(minhash)
        shingles_idx = 0
    assert len(minhashes) == n_pieces
    print(shingles)

    distance_matrix = np.empty((n_pieces, n_pieces), dtype=float)
    for i in range(0, len(minhashes)):
        for j in range(0, len(minhashes)):
            if i == j:
                distance_matrix[i][j] = 0
            else:
                similarity = minhashes[i].jaccard(minhashes[j])
                if similarity == 0:
                    distance_matrix[i][j] = 1
                else:
                    distance_matrix[i][j] = 1 / similarity

    Z = linkage(distance_matrix)  # TODO: test different metrics
    from scipy.cluster.hierarchy import dendrogram
    dendrogram(Z, show_leaf_counts=True)
    # import matplotlib.pyplot as plt
    # plt.show()
    # plt.savefig('dendrogram_' + str(self.k))
    return fcluster(Z, 0.70)
def handleFlow(self):
    # TODO replace sorting loops with sorted function
    self.targets = {}
    self.api = []
    #flow = []
    addrs = []
    internals = []
    for instr in self.bb_insns:
        if isinstance(instr, CallInsn):
            if instr.is_api:
                self.targets[instr.addr] = "API:" + instr.fcn_name
                self.api.append({"name": instr.fcn_name})
            else:
                internals.append(instr.addr)
        else:
            if instr.jumpout:
                internals.append(instr.addr)
            else:
                addrs.append(instr.addr)
                addrs.append(instr.offset)

    addrs.sort()
    addrs_dict = {}
    for i in range(len(addrs)):
        addrs_dict[addrs[i]] = i

    internals_sorted = internals[:]
    internals_sorted.sort()
    calleds_dict = {}
    for i in range(len(internals_sorted)):
        calleds_dict[internals_sorted[i]] = str(i)

    flowhash = datasketch.MinHash(num_perm=32)
    for instr in self.bb_insns:
        if isinstance(instr, CallInsn):
            if instr.is_api:
                #flow.append(hex(instr.offset) + " API:" + instr.fcn_name)
                flowhash.update("API:" + instr.fcn_name)
            else:
                #flow.append(hex(instr.offset) + " OUT:" + calleds_dict[instr.addr])
                flowhash.update("OUT:" + calleds_dict[instr.addr])
                self.targets[instr.addr] = "OUT:" + calleds_dict[instr.addr]
        else:
            if instr.jumpout:
                #flow.append(hex(instr.offset) + " OUT:" + calleds_dict[instr.addr])
                flowhash.update("OUT:" + calleds_dict[instr.addr])
                self.targets[instr.addr] = "OUT:" + calleds_dict[instr.addr]
            else:
                off = addrs_dict[instr.offset]
                tgt = addrs_dict[instr.addr]
                #flow.append("%x (%d) JMP:%s - %x (%d)" % (instr.offset, off, str(tgt - off), instr.addr, tgt))
                flowhash.update("JMP:" + str(tgt - off))
                self.targets[instr.addr] = "JMP:" + str(tgt - off)

    lean_flowhash = datasketch.LeanMinHash(flowhash)
    flowhash_buf = bytearray(lean_flowhash.bytesize())
    lean_flowhash.serialize(flowhash_buf)
    self.flowhash = str(flowhash_buf)
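# The serialized buffer stored in self.flowhash can be turned back into a
# comparable sketch. A minimal round-trip sketch using
# datasketch.LeanMinHash.deserialize; the helper name and its arguments are
# illustrative, not part of the original code.
import datasketch

def flowhash_similarity(buf_a, buf_b):
    a = datasketch.LeanMinHash.deserialize(buf_a)
    b = datasketch.LeanMinHash.deserialize(buf_b)
    return a.jaccard(b)  # estimated Jaccard similarity of the two flow shingle sets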
def handleInsns(self):
    consts = {}
    ips = []

    # set default value for PC, SP, BP
    pc_offset = self.arch.ip_offset
    regs = {pc_offset: 0, self.arch.sp_offset: 1, self.arch.bp_offset: 2}
    consts = {}
    irsbs = []

    for instr_c in range(len(self.insns_list)):
        off = self.insns_list[instr_c][0]
        instr = self.insns_list[instr_c][1]

        # manage instruction not recognized by libVEX
        if self.arch.name == "X86" or self.arch.name == "AMD64":
            if instr == "\xf4":  # hlt x86 instruction
                irsbs.append("HALT")
                continue
            elif instr.startswith("\xf0"):  # lock x86 prefix
                irsbs.append("LOCK")
                if len(instr) == 1:
                    continue
                instr = instr[1:]

        try:
            irsb = pyvex.IRSB(instr, off, self.arch, opt_level=0)
        except pyvex.errors.PyVEXError as err:
            print("[Please report to the developer] Error with instruction " + instr.encode("hex"))
            raise err

        irsbs.append(irsb)
        stmts = irsb.statements
        n_addr = 0

        for i in range(len(stmts)):
            # TODO PutI GetI
            if isinstance(stmts[i], pyvex.stmt.IMark):
                n_addr = stmts[i].addr + stmts[i].len
            elif isinstance(stmts[i], pyvex.stmt.Put):
                if stmts[i].offset == pc_offset and len(stmts[i].constants) == 1:
                    c = stmts[i].constants[0]
                    if c.value in self.targets:
                        stmts[i].data = StrConst(self.targets[c.value])
                        stmts[i].offset = 0
                        continue
                    elif c.value == n_addr:
                        stmts[i].data = StrConst("_NEXT_")
                        stmts[i].offset = 0
                        continue
                    else:
                        ips.append(c.value)
                        stmts[i].reg_name = 0xABADCAFE
                        stmts[i].offset = 0
                else:
                    # constants replace
                    for j in range(len(stmts[i].constants)):
                        c = stmts[i].constants[j]
                        if c.value in self.targets:
                            stmts[i].constants[j] = StrConst(self.targets[c.value])
                        elif c.value == n_addr:
                            stmts[i].constants[j] = StrConst("_NEXT_")
                        else:
                            # constants abstraction
                            consts[c.value] = consts.get(c.value, len(consts))
                            c.value = consts[c.value]
                    # registers abstraction
                    regs[stmts[i].offset] = regs.get(stmts[i].offset, len(regs))
                    stmts[i].offset = regs[stmts[i].offset]
            elif isinstance(stmts[i], pyvex.stmt.Exit):
                c = stmts[i].dst
                if c.value in self.targets:
                    stmts[i] = "if (%s) { PUT(offset=0) = %s; %s }" % (
                        stmts[i].guard, self.targets[c.value], stmts[i].jumpkind)
                    continue
                else:
                    ips.append(c.value)
                    stmts[i].reg_name = 0xDEADBEEF
            else:
                # constants replace
                for j in range(len(stmts[i].constants)):
                    c = stmts[i].constants[j]
                    if c.value in self.targets:
                        stmts[i].constants[j] = StrConst(self.targets[c.value])
                    elif c.value == n_addr:
                        stmts[i].constants[j] = StrConst("_NEXT_")
                    else:
                        # constants abstraction
                        consts[c.value] = consts.get(c.value, len(consts))
                        c.value = consts[c.value]
                for expr in stmts[i].expressions:
                    if isinstance(expr, pyvex.expr.Get):
                        # registers abstraction
                        regs[expr.offset] = regs.get(expr.offset, len(regs))
                        expr.offset = regs[expr.offset]

    # order addresses
    addrs = {}
    ips.sort()
    for i in range(len(ips)):
        addrs[ips[i]] = i

    #self.vex_code = ""
    #self.shingled_code = ""
    vexhash = datasketch.MinHash(num_perm=64)
    shingled = {}
    last = ""
    for c in range(len(irsbs)):
        irsb = irsbs[c]
        if type(irsb) == type(""):
            ngram = last + irsb
            #self.vex_code += "+++ Instr #%d +++\n%s\n" % (c, irsb)
            shingled[ngram] = shingled.get(ngram, 0) + 1
            last = irsb
            continue

        stmts = irsb.statements
        ins = ""
        for i in range(len(stmts)):
            if isinstance(stmts[i], pyvex.stmt.IMark) or isinstance(stmts[i], pyvex.stmt.AbiHint):
                continue
            if hasattr(stmts[i], "reg_name"):
                if stmts[i].reg_name == 0xABADCAFE:
                    stmts[i].constants[0].value = addrs[stmts[i].constants[0].value]
                elif stmts[i].reg_name == 0xDEADBEEF:
                    stmts[i].dst.value = addrs[stmts[i].dst.value]
            v = str(stmts[i]) + "\n"
            ins += v
            ngram = last + v
            shingled[ngram] = shingled.get(ngram, 0) + 1
            last = v
        #self.vex_code += "+++ Instr #%d +++\n%s\n" % (c, ins)

    for ngram in shingled:
        for c in range(shingled[ngram]):
            vexhash.update("[%d]\n%s" % (c, ngram))
            #self.shingled_code += "[%d]\n%s" % (c, ngram)

    lean_vexhash = datasketch.LeanMinHash(vexhash)
    vexhash_buf = bytearray(lean_vexhash.bytesize())
    lean_vexhash.serialize(vexhash_buf)
    self.vexhash = str(vexhash_buf)
async def populateDatabase():
    recommenter = await Recommenter.create()  # type: Recommenter
    videos = recommenter.readFromSQL("videoInfo4.db")
    """
    i = 0
    for video in videos:
        # first we add the video's words to our word frequency index
        if video.id in recommenter.word_frequency.unique_doc['doc_ids']:
            print(f"Not adding {video.id}'s words to word frequency index, already indexed")
            continue
        else:
            print(f"Adding {video.id}'s words to word frequency index")
            for word in tqdm(' '.join([video.comment_content, video.transcript_content]).split(' ')):
                await recommenter.word_frequency.add_to_index(word, video.id)
            await recommenter.word_frequency.add_doc_id(video.id)
        if i == 10:
            await recommenter.word_frequency.upload_cache()
            i = -1
        i += 1
    """
    for video in videos:
        if video.id in ["5DGwOJXSxqg", "mWXurqWRA74", "jkGtMjkkmn4", "cqkiim_K0sc", "Ft00DUHRCOo", "FtX_oGO9MHo"]:
            print(f"Skipping {video.id} as it's on the blacklist")
            continue
        """
        if video.has_enough_comments():
            minhash_id = f"{video.id}-comment"
            if (await recommenter.retrieve_minhash(minhash_id)) != None:
                print(f"Skipping video {video.id} because comment minhash stored, presumed already indexed")
                continue  # skip video
            print(f"Generating comment minhash for {video.id}")
            shingles = w_shingle(video.comment_content, SHINGLE_SIZE_COMMENTS)
            minhash = datasketch.MinHash(num_perm=800)
            rated_shingles = await recommenter.word_frequency.sort_shingles(shingles)
            print(f"Top 4 comment shingles for {video.id} are {rated_shingles[:4]}")
            for shingle in rated_shingles[:800]:
                minhash.update(shingle.encode('utf8'))
            print(f"Storing minhash {minhash_id}")
            await recommenter.store_minhash(minhash_id, minhash)
            #print(f"Inserting minhash {minhash_id}")
            #await recommenter.comments.insert_minhash_obj(minhash_id, minhash)
        """
        if video.has_enough_transcripts():
            minhash_id = f"{video.id}-transcript-w{SHINGLE_SIZE_TRANSCRIPT}"
            if (await recommenter.retrieve_minhash(minhash_id)) != None:
                print(f"Skipping video {video.id} because transcript minhash stored, presumed already indexed.")
                continue  # skip video
            print(f"Generating transcript minhash for {video.id}")
            shingles = w_shingle(video.transcript_content, SHINGLE_SIZE_TRANSCRIPT)
            minhash = datasketch.MinHash(num_perm=800)
            rated_shingles = await recommenter.word_frequency.sort_shingles(shingles)
            print(f"Top 4 transcript shingles for {video.id} are {rated_shingles[:4]}")
            for shingle in rated_shingles[:800]:
                minhash.update(shingle.encode('utf8'))
            print(f"Storing minhash {minhash_id}")
            await recommenter.store_minhash(minhash_id, minhash)
            #print(f"Inserting minhash {minhash_id}")
            #await recommenter.transcripts.insert_minhash_obj(minhash_id, minhash)
    await recommenter.close()
def calc_jaccard_similarities(sets, k=2, inter_alignments=False):
    assert len(set(len(subset) for subset in sets)) == 1
    assert isinstance(sets, np.ndarray)
    # print sets.shape, len(sets[0])
    # print len(sets[0]) - k + 1
    # print len(sets)
    assert isinstance(inter_alignments, bool)

    minhashes = []
    if inter_alignments:
        assert len(sets.shape) == 3
        shingles = np.empty(
            (sets.shape[0], sets.shape[1] * (sets.shape[2] - k + 1), k), dtype="S2")
        shingle_idx = 0
        set_row_len = sets.shape[2] - k + 1
        for i in range(0, sets.shape[0]):
            m = dk.MinHash()
            for j in range(0, sets.shape[1]):
                # print 'N shingles', len(sets[i][j]) - k + 1
                shingles[i, shingle_idx:shingle_idx + set_row_len] = split_into_shingles(sets[i][j], k=k)
                shingle_idx += set_row_len
            shingle_idx = 0
            shingle_str = [''.join(s) for s in shingles[i].astype(str)]
            for s in shingle_str:
                m.update(s.encode('utf-8'))
            minhashes.append(m)
    # if not inter_alignments:
    else:
        shingles = np.zeros((len(sets), len(sets[0]) - k + 1, k), dtype="S2")
        for i in range(0, len(sets)):
            shingles[i] = split_into_shingles(sets[i], k=k)
            m = dk.MinHash()
            # for s in shingles[i]:
            shingle_str = [''.join(s) for s in shingles[i].astype(str)]
            for s in shingle_str:
                m.update(s.encode('utf-8'))
            minhashes.append(m)
    assert len(sets) == len(minhashes)

    if not inter_alignments:
        n_rows = len(sets) * (len(sets) - 1)
    else:
        n_rows = sets.shape[0] - 1
    # df = pd.DataFrame(data=np.zeros(permutations, 3), index='index', columns=['seq i', 'seq j', 'jaccard'], dtype=float)
    # jaccard_dict = {'jaccard': np.zeros(permutations, dtype=float), 'seq i': np.zeros(permutations, dtype=int), 'seq j': np.zeros(permutations, dtype=int)}
    jaccard_df = pd.DataFrame(
        np.empty((n_rows,), dtype=[('i', np.uint8), ('j', np.uint8), ('jaccard', float)]))
    row = 0
    for i in range(0, len(sets)):
        for j in range(0, len(sets)):
            if i != j and not ((jaccard_df['i'] == 2) & (jaccard_df['j'] == 5)).any():  # excluding intersections
                str1 = [''.join(s) for s in shingles[i].astype(str)]
                str2 = [''.join(s) for s in shingles[j].astype(str)]
                jaccard = minhashes[i].jaccard(minhashes[j])
                # print i, j, float(len(set(str2) & set(str1))) / len(set(str2) | set(str1))
                jaccard_df['i'][row] = i
                jaccard_df['j'][row] = j
                jaccard_df['jaccard'][row] = jaccard
                row += 1
    # df = pd.DataFrame(data=jaccard_dict)
    return jaccard_df
def cluster_by_lsh(sets, k=2, num_perm=128):
    # list of 2d ndarrays or 3d ndarray
    assert (isinstance(sets, np.ndarray) and len(sets.shape) == 3) or (
        isinstance(sets, list)
        and all(isinstance(x, np.ndarray) and len(x.shape) == 2 for x in sets))
    # 3d ndarray
    assert isinstance(k, int) and k > 0

    n_pieces = len(sets)
    n_rows = sets.shape[1]
    n_cols = sets.shape[2]
    n_sequence = n_cols + k - 1
    n_shingle_elements = n_sequence - k + 1
    # shingles = np.empty((n_pieces, n_rows * n_shingle_elements), dtype="S" + str(k))
    minhashes = []  # pieces
    shingles_idx = 0
    for p in range(0, n_pieces):
        piece = sets[p]
        minhash = dk.MinHash(num_perm=num_perm)
        shingles = np.empty(n_rows * n_shingle_elements, dtype="S" + str(k))
        # iterating sequences from a region
        for s in range(0, len(piece)):
            # input sequence considering surplus characters
            sequence = np.empty((n_sequence,), dtype="S1")
            sequence[0:n_cols] = piece[s]
            if p != n_pieces - 1:  # if we aren't on the last piece:
                next_piece = sets[p + 1]
                sequence[n_cols:] = next_piece[s][0:k - 1]  # surplus
            else:
                sequence[n_cols:] = 'Z'  # TODO: possibly replace with the most likely value
            shingled_sequence = split_into_shingles(sequence, k=k)
            assert len(shingled_sequence) == n_shingle_elements, \
                'Shingled sequence: ' + str(len(shingled_sequence)) + ' and fixed len ' + str(n_shingle_elements)
            # print 'Seq len', len(sequence)
            # print 'Len', len(shingled_sequence), 'Seq', shingled_sequence
            # print 'Shingle len', len(shingles[p][shingles_idx : shingles_idx + n_cols + 1])
            # shingles[p][:, shingles_idx: shingles_idx + len(sequence) - 1] = shingled_sequence
            print(shingles_idx + n_cols)
            # shingles[p][shingles_idx : shingles_idx + n_shingle_elements] = shingled_sequence
            shingles[shingles_idx:shingles_idx + n_shingle_elements] = shingled_sequence
            shingles_idx += n_shingle_elements
        for word in shingles:
            minhash.update(word)
        minhashes.append(minhash)
        shingles_idx = 0
        # shingle_str = [''.join(s) for s in shingles[piece].astype(str)]
        # for s in shingles[piece]:
        #     minhash.update(s.encode('utf-8'))
    assert len(minhashes) == n_pieces
    print(shingles)

    distance_matrix = np.empty((n_pieces, n_pieces), dtype=float)
    for i in range(0, len(minhashes)):
        for j in range(0, len(minhashes)):
            if i == j:
                distance_matrix[i][j] = 0
            else:
                similarity = minhashes[i].jaccard(minhashes[j])
                if similarity == 0:
                    distance_matrix[i][j] = 1
                else:
                    distance_matrix[i][j] = 1 / similarity
    print(distance_matrix)

    Z = linkage(distance_matrix)  # TODO: check which metrics and methods are appropriate
    from scipy.cluster.hierarchy import dendrogram
    dendrogram(Z, show_leaf_counts=True)
    import matplotlib.pyplot as plt
    plt.show()
    return fcluster(Z, 0.70)