def get_minhashes_of_unique_str_list(unique_str_list):
    """Index every string in ``unique_str_list`` by its character trigrams.

    Each string is shingled with ``ngrams(s, 3)``, hashed into a
    128-permutation MinHash and inserted into a MinHashLSH index under its
    position in the input. A progress counter and the elapsed time are
    printed every 5000 items, and the total elapsed time once at the end.

    Args:
        unique_str_list: iterable of strings; each string's position is used
            as its key in the index.

    Returns:
        ``(lsh, minhashes)``: the populated index (Jaccard threshold 0.7) and
        a dict mapping position -> MinHash.
    """
    started_at = time.time()

    def _elapsed():
        # Wall time since the start of the call, rendered as HH:MM:SS.
        return str(time.strftime("%H:%M:%S", time.gmtime(time.time() - started_at)))

    threshold = 0.7
    lsh = MinHashLSH(threshold=threshold, num_perm=128)

    minhashes = {}
    for position, text in enumerate(unique_str_list):
        signature = MinHash(num_perm=128)
        for gram in ngrams(text, 3):
            signature.update("".join(gram).encode('utf-8'))
        lsh.insert(position, signature)
        minhashes[position] = signature

        if position % 5000 != 0:
            continue
        print("counter:", position)
        print("[exp msg] elapsed time for subprocess: " + _elapsed())

    print("[exp msg] elapsed time for process: " + _elapsed())
    return (lsh, minhashes)
def find_duplicates(minhashes, threshold, permutations, name_hashes):
    """Print, for every minhash, the other minhashes the LSH index matches.

    Documents are numbered from 1 in input order. One line is printed per
    document that has at least one match: the document number, a tab, and the
    space-separated numbers of the matching documents.

    Arguments:
    - minhashes: a list of minhashes
    - threshold: the Jaccard threshold for similarity / identity
    - permutations: the number of permutations. Must be the same as for the
                    minhash objects
    - name_hashes: list of document hashes (or any ID type), parallel to
                   ``minhashes``. If not empty, matches between entries with
                   the same ID are dropped from the report.
    """
    index = MinHashLSH(threshold=threshold, num_perm=permutations)
    for doc_no, mh in enumerate(minhashes, start=1):
        index.insert(str(doc_no), mh, check_duplication=False)

    for doc_no, mh in enumerate(minhashes, start=1):
        matches = index.query(mh)
        # Drop the document's own key from its result list.
        matches.remove(str(doc_no))
        if name_hashes:
            # Remove matches that occur in the same document
            matches = [
                m for m in matches
                if name_hashes[int(m) - 1] != name_hashes[doc_no - 1]
            ]
        if not matches:
            continue
        print('{}\t{}'.format(doc_no, ' '.join(matches)))
def _train_LSH(self):
    """Build a MinHash LSH index over the character bigrams of new domains.

    For every entry of ``self.new_domains`` the TLD is stripped with
    ``get_tld``; the remaining domain label is split into its set of
    character bigrams, hashed into a 128-permutation MinHash and inserted
    (as a ``LeanMinHash``) under the full domain name.

    Returns:
        The populated ``MinHashLSH`` index.
    """
    num_perm = 128
    # Band layout is passed explicitly via ``params`` instead of a threshold.
    lsh = MinHashLSH(num_perm=num_perm, params=(5, 7))

    for dom in self.new_domains:
        # remove TLD
        tld_info = get_tld('http://' + dom, as_object=True, fail_silently=True)
        try:
            d = tld_info.domain
        except AttributeError:
            # presumably get_tld returned None (fail_silently=True) for a
            # name it could not parse -- skip it. Only AttributeError is
            # caught so that real failures and Ctrl-C are not hidden.
            continue

        # ignore super short new domains
        if len(d) <= 3:
            continue

        # create bigram set
        bigrams = {d[i:i + 2] for i in range(len(d) - 1)}
        minhash = MinHash(num_perm=num_perm)
        for b in bigrams:
            minhash.update(b.encode('utf-8'))
        lsh.insert(dom, LeanMinHash(minhash))

    print('LSH Trained!')
    return lsh
def get_topn_similarity_documents_lsh(keywords, n=3):
    """Return stored English documents whose keywords resemble ``keywords``.

    All documents with ``lang == 'english'`` are read from ``docs_col``,
    their comma-separated keyword strings are MinHashed and indexed, and the
    index is queried with the MinHash of ``keywords``. Each candidate is
    re-fetched from ``docs_col`` by id and returned without its ``_id``.

    Args:
        keywords: comma-separated keyword string to search for.
        n: accepted for interface compatibility; the current implementation
            does not use it and returns every LSH candidate.

    Returns:
        List of document dicts (``_id`` removed). The list is also printed.
    """
    num_perm = 128
    lsh = MinHashLSH(threshold=0.1, num_perm=num_perm)

    documents_en = docs_col.find({"lang": 'english'})
    documents_min = [
        lsh_json(str(item["_id"]), item["keyword"]) for item in documents_en
    ]
    for item in documents_min:
        minhash = MinHash(num_perm=num_perm)
        for k in item["keyword"].split(","):
            minhash.update(k.encode("utf-8"))
        lsh.insert(str(item["id"]), minhash)

    # Named so that it does not shadow the builtin ``min``.
    query_minhash = MinHash(num_perm=num_perm)
    for k in keywords.split(","):
        query_minhash.update(k.encode("utf-8"))

    list_docs = []
    for doc_id in lsh.query(query_minhash):
        doc = docs_col.find_one({"_id": ObjectId(str(doc_id))})
        if doc is None:
            # The document disappeared between indexing and lookup.
            continue
        doc.pop('_id', None)
        list_docs.append(doc)

    print(list_docs)
    return list_docs
def make_lsh_partial(batch_id, batch_size, filename, out_filename, byte_start, nperm=N_PERM, thresh=0.5):
    """
    Generate the LSH index over a subset of the data.

    Lines are expected in the form ``<line number>:<text>``; a line is indexed
    when its text splits into more than three space-separated words.

    :param batch_id: Batch id, used to determine output filename
    :param batch_size: Specifies number of lines of the file to read
    :param filename: Input file, generated using the make_lsh_file family of functions
    :param out_filename: Output file prefix, batch_id is appended to distinguish each block.
    :param byte_start: Offset passed to ``seek()`` so that the middle sections
        of a file can be read.
    :param nperm: number of permutations in the Min-Hash index.
    :param thresh: Jaccard index threshold to return
    :return: filename of the dumped LSH file.
    """
    index = MinHashLSH(threshold=thresh, num_perm=nperm)
    lines_read = 0
    with open(filename, 'r', encoding='utf-8', errors='ignore') as source:
        source.seek(byte_start)
        for raw_line in source:
            fields = raw_line.split(':')
            if len(fields) > 1:
                line_no, text = fields[0], fields[1]
                words = text.split(' ')
                # NOTE(review): membership is tested with the bare text, but
                # entries are stored under b"<line_no>:<text>" -- confirm the
                # check can ever match.
                if len(words) > 3 and text not in index:
                    index.insert((line_no + ':' + text).encode('utf-8'),
                                 make_hash(words, nperm))
            lines_read += 1
            if lines_read >= batch_size:
                break

    target = out_filename + '_' + str(batch_id) + '.obj'
    dump_lsh(index, target)
    return target
def main() -> None:
    """Query an existing Cassandra-backed MinHashLSH index with a random MinHash.

    Builds a 256-permutation MinHash from 200 random five-letter lowercase
    strings, connects to the ``perftest`` index stored in Cassandra at
    127.0.0.1 (keyspace from ``config.KEY_SPACE``, nothing is dropped) and
    prints the keys returned by the query. Each step is wrapped in a
    single-iteration ``tqdm`` loop purely to display a labelled progress bar.
    """
    for _ in tqdm(range(1), desc="Create finding example:"):
        minhash = MinHash(num_perm=256)
        list_strings = [
            ''.join(random.choice(string.ascii_lowercase) for _ in range(5))
            for _ in range(200)
        ]
        minhash.update_batch([s.encode('utf-8') for s in list_strings])

    for _ in tqdm(range(1), desc="Connect to existing db:"):
        lsh = MinHashLSH(threshold=0.5,
                         num_perm=256,
                         storage_config={
                             'type': 'cassandra',
                             'basename': b'perftest',
                             'cassandra': {
                                 'seeds': ['127.0.0.1'],
                                 'keyspace': config.KEY_SPACE,
                                 'replication': {
                                     'class': 'SimpleStrategy',
                                     'replication_factor': '1',
                                 },
                                 'drop_keyspace': False,
                                 'drop_tables': False,
                             }
                         })

    try:
        for _ in tqdm(range(1), desc="Find minHash similarity:"):
            result = lsh.query(minhash)
            print("Approximate neighbours with Jaccard similarity > 0.5", result)
    except Exception as e:
        # Exception rather than BaseException: Ctrl-C and SystemExit must
        # still be able to terminate the script.
        print(str(e))
        print("Error")
class LSH():
    """MinHash LSH index over character shingles of a list of documents.

    The index is built on construction. ``self.minhashes`` maps each document
    to its MinHash and ``self.lsh`` is the MinHashLSH index keyed by the
    document itself, so documents must be hashable and sliceable (in
    practice: strings).
    """

    def __init__(self, rawlist, shingle_length=2, threshold=0.8):
        """Store ``rawlist`` and immediately build the index from it."""
        self.indoc = rawlist
        self.make_lsh(shingle_length=shingle_length, threshold=threshold)

    def make_shingles(self, doc, length=2):
        """Return every contiguous slice of ``doc`` of size ``length``, in order.

        A document shorter than ``length`` yields an empty list.
        """
        return [doc[i:i + length] for i in range(len(doc) - (length - 1))]

    def make_shingle_sets(self, doclst=None, length=2):
        """Map each document to its shingle list.

        ``doclst`` defaults to the documents given at construction time.
        """
        # Identity check: ``== None`` would invoke the argument's own __eq__.
        if doclst is None:
            doclst = self.indoc
        return {d: self.make_shingles(d, length) for d in doclst}

    def make_lsh(self, shingle_length=2, threshold=0.8):
        """(Re)build ``self.minhashes`` and ``self.lsh`` from ``self.indoc``."""
        print(f'Making LSH with threshold of {threshold}, shingle length of {shingle_length}')
        sets = self.make_shingle_sets(self.indoc, shingle_length)
        self.minhashes = {}
        self.lsh = MinHashLSH(threshold=threshold, num_perm=128)
        for k, shingles in sets.items():
            m = MinHash(num_perm=128)
            for item in shingles:
                m.update(item.encode('utf8'))
            self.minhashes[k] = m
            self.lsh.insert(k, m)

    def get_minhash(self, doc):
        """Return the MinHash stored for ``doc`` (KeyError if it was not indexed)."""
        return self.minhashes[doc]

    def get_bucket(self, target_mh):
        """Return the keys the index reports as candidates for ``target_mh``."""
        return self.lsh.query(target_mh)
def LSH():
    """Index the shingle sets from ``part1.readFile(k=4)`` and query each one.

    Every element returned by ``part1.readFile`` is MinHashed (1024
    permutations) and inserted under its 1-based position as a string key.
    Each MinHash is then used to query the index; the candidate lists are
    printed, collected in input order and passed through ``clean_data``.

    Returns:
        Whatever ``clean_data`` returns for the list of per-document
        candidate-key lists.
    """
    threshold = 0.9
    num_perm = 1024
    result = part1.readFile(k=4)

    # MinHashLSH parameters (translated from the original note):
    #   threshold (float) - Jaccard threshold; the library default is 0.5.
    #   num_perm (int, optional) - number of hash permutation functions; for
    #       weighted MinHash this is the sample size.
    #   params (tuple, optional) - number and size of the bands.
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

    # Build each MinHash once and reuse it for the query pass.
    signatures = []
    for index, each in enumerate(result, start=1):
        # per the original comment, every element is a set (of shingles)
        doc = MinHash(num_perm=num_perm)
        for d in each:
            doc.update(d.encode('utf8'))
        lsh.insert(str(index), doc)
        signatures.append(doc)

    return_result = []
    for doc in signatures:
        neighbours = lsh.query(doc)
        # Report the threshold actually in use (the message used to say 0.35).
        print("Approximate neighbours with Jaccard similarity > {}".format(threshold), neighbours)
        return_result.append(neighbours)
    return clean_data(return_result)
def build_content_sim_mh_text(network, mh_signatures):
    """Link nodes whose precomputed MinHash signatures collide in an LSH index.

    Args:
        network: object exposing ``add_relation(nid1, nid2, relation, score)``.
        mh_signatures: iterable of ``(nid, signature)`` pairs; each signature
            is converted to an int array and installed as the hash values of
            a 512-permutation MinHash.

    Returns:
        The populated ``MinHashLSH`` index (threshold 0.7). As a side effect a
        ``Relation.CONTENT_SIM`` relation with score 1 is added from every
        node to each other node returned by its query.
    """
    content_index = MinHashLSH(threshold=0.7, num_perm=512)

    # Rebuild MinHash objects from the raw signatures and index them.
    indexed = []
    for nid, mh_sig in mh_signatures:
        mh_obj = MinHash(num_perm=512)
        mh_obj.hashvalues = np.asarray(mh_sig, dtype=int)
        content_index.insert(nid, mh_obj)
        indexed.append((nid, mh_obj))

    # Query every object against the full index, skipping the self-match.
    for nid, mh_obj in indexed:
        for neighbour in content_index.query(mh_obj):
            if neighbour != nid:
                network.add_relation(nid, neighbour, Relation.CONTENT_SIM, 1)

    return content_index
def pd_text_hash_create_lsh(df, col, sep=" ", threshold=0.7, num_perm=10):
    """Create a MinHash for every entry of ``df[col]`` and index them in an LSH.

    Each entry is split on ``sep``; the distinct tokens are hashed into a
    MinHash which is inserted under the entry's position (as a string).

    Returns:
        ``(hash_lines, lsh)``: the list of MinHash objects in row order and
        the populated ``MinHashLSH`` index.
    """
    from datasketch import MinHash, MinHashLSH

    index = MinHashLSH(threshold=threshold, num_perm=num_perm)
    signatures = []
    for position, sentence in enumerate(df[col].values):
        signature = MinHash(num_perm=num_perm)
        # Only distinct tokens contribute to the signature.
        for token in set(sentence.split(sep)):
            signature.update(token.encode('utf8'))
        signatures.append(signature)
        index.insert(str(position), signature)
    return signatures, index
def search_lsh_jaccard_topk(index_data, query_data, b, r, k):
    """Benchmark LSH retrieval followed by exact-Jaccard re-ranking.

    Args:
        index_data: ``(sets, keys, minhashes)`` for the indexed collection;
            ``minhashes`` is looked up by permutation count (``b * r``).
        query_data: same triple for the query collection.
        b, r: number of bands and rows per band handed to ``MinHashLSH``.
        k: number of best matches kept per query.

    Returns:
        ``(results, times)``: ``results`` holds ``(query_key, matches)`` with
        ``matches`` a list of ``[index_key, jaccard]`` sorted by descending
        similarity and cut to ``k``; ``times`` holds the per-query duration
        in seconds.
    """
    index_sets, index_keys, index_minhashes = index_data
    query_sets, query_keys, query_minhashes = query_data
    num_perm = b * r

    print("Building LSH Index.")
    build_started = time.perf_counter()
    index = MinHashLSH(num_perm=num_perm, params=(b, r))
    # Positions in the indexed collection serve as LSH keys.
    for position, _ in enumerate(index_keys):
        index.insert(position, index_minhashes[num_perm][position])
    build_finished = time.perf_counter()
    print("Indexing time: {:.3f}.".format(build_finished - build_started))

    print("Querying.")
    times = []
    results = []
    queries = zip(query_minhashes[num_perm], query_keys, query_sets)
    for query_minhash, query_key, query_set in queries:
        query_started = time.perf_counter()
        # Score every retrieved candidate with the exact Jaccard similarity,
        # then keep the k most similar.
        scored = [
            [index_keys[i], compute_jaccard(query_set, index_sets[i])]
            for i in index.query(query_minhash)
        ]
        top = sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]
        times.append(time.perf_counter() - query_started)
        results.append((query_key, top))
        sys.stdout.write(f"\rQueried {len(results)} sets")
    sys.stdout.write("\n")
    return (results, times)
def deduplicate_file(file_prefix, output_dir, threshold, permutations):
    """
    Deduplicates a set of minhashed documents (3 files with the same minhash
    prefix) and writes them to output_dir.

    A document is kept only when the index built from the documents kept so
    far returns no match for its minhash.

    Warning: only works for full documents at this point!
    """
    index = MinHashLSH(threshold=threshold, num_perm=permutations)
    file_base = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(file_base))

    total_read = 0
    writer = BatchWriter(sys.maxsize, output_dir, len(file_base), int(file_base))
    with closing(writer) as bw:
        for input_file, results in read_batch(file_prefix):
            minhashes = results['minhash']
            doc_ids = results['id']
            total_read += len(doc_ids)

            kept_ids, kept_minhashes = [], []
            for pos, minhash in enumerate(minhashes):
                if index.query(minhash):
                    continue  # something similar was kept earlier
                index.insert('_'.join(doc_ids[pos]), minhash)
                kept_minhashes.append(minhash)
                kept_ids.append(doc_ids[pos])

            bw.write_results(input_file,
                             {'id': kept_ids, 'minhash': kept_minhashes})
            logging.debug('Kept {} documents out of {}'.format(
                len(kept_ids), len(doc_ids)))
    logging.info('Processed batch {}; kept {} documents out of {}.'.format(
        file_base, bw.total_written, total_read))
def minHash_LSH(data):
    """Find pairs of entries in ``data`` that collide in a MinHash LSH index.

    Every entry (an iterable of strings) is hashed into a 256-permutation
    MinHash and inserted into an index built for Jaccard threshold 0.65 under
    its position. An entry contributes a pair only when its query returns
    exactly two keys (itself and one other).

    Args:
        data: sequence of iterables of strings.

    Returns:
        List of ``[i, j]`` index pairs with ``i < j``, each pair reported
        once, in order of first discovery.
    """
    num_perm = 256
    lsh = MinHashLSH(threshold=0.65, num_perm=num_perm)

    minhashes = {}
    for c, i in enumerate(data):  # c is the index, i is the tuple
        minhash = MinHash(num_perm=num_perm)
        for el in i:
            minhash.update(el.encode('utf8'))
        lsh.insert(c, minhash)
        minhashes[c] = minhash

    res_match = []
    seen_pairs = set()
    for minhash in minhashes.values():
        result = lsh.query(minhash)
        if len(result) != 2:
            continue
        # Canonicalise the pair: the order of keys returned by query() is
        # not specified, so [a, b] and [b, a] must count as the same match.
        pair = tuple(sorted(result))
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            res_match.append(list(pair))
    return res_match
def text2lsh(filename='data/comments_words_std.csv', threshold=0.9, num_perm=128, is_save=True, lshfile='data/output'):
    """Build a MinHash LSH index from the ``words`` column of a CSV file.

    The CSV is expected to provide ``id``, ``score`` and ``words`` columns
    (these are the attributes read below). Each row's space-separated words
    are MinHashed and inserted under the key ``(id, score)``. Rows that raise
    are reported on stdout and skipped.

    Args:
        filename: path of the CSV file to read.
        threshold: Jaccard threshold of the index.
        num_perm: number of MinHash permutations.
        is_save: when true, pickle the index to ``<lshfile>_lsh.pkl`` and the
            hash list to ``<lshfile>_hash_list.pkl`` (pickle protocol 0).
        lshfile: output path prefix used when ``is_save`` is true.

    Returns:
        ``(lsh, hash_list)`` where ``hash_list`` holds ``(id, MinHash)``
        tuples for the rows that were indexed.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    X = pd.read_csv(filename)
    n = X.shape[0]
    hash_list = []
    for i in tqdm(range(0, n)):
        # Reset per row so the error report below never hits an unbound
        # name or shows the previous row's words.
        words = None
        try:
            row_id = int(X.id[i])
            score = int(X.score[i])
            m = MinHash(num_perm=num_perm)
            words = X.words[i].split(' ')
            for d in words:
                m.update(d.encode('utf8'))
            lsh.insert((row_id, score), m)
            hash_list.append((row_id, m))
        except Exception as e:
            # Best effort: report the bad row and carry on.
            print(e, words)

    if is_save:
        # Context managers close the files even if pickling fails.
        with open(lshfile + '_lsh.pkl', 'wb') as f:
            pickle.dump(lsh, f, 0)
        with open(lshfile + '_hash_list.pkl', 'wb') as f:
            pickle.dump(hash_list, f, 0)
    return lsh, hash_list
def main():
    """Deduplicate the ``text`` column of ./all.csv with MinHash LSH.

    Rows are processed in file order. A row whose MinHash matches anything
    already in the index is reported as a duplicate and dropped; otherwise it
    is indexed and kept. The kept rows are written to
    ./all_deduplicated.csv.
    """
    print('Загружаем корпус')
    all_csv = pd.read_csv("./all.csv", encoding="utf-8")
    raw_corpus = all_csv["text"]

    print('Приводим его к стандартному виду')
    normalized_copus: List[List[str]] = [
        normalize(proverb) for proverb in raw_corpus
    ]

    print('Составляем индекс для поиска дублей')
    lsh = MinHashLSH(num_perm=HASH_PERMUTATIONS_COUNT)

    deduplicated_corpus = []
    kept_rows = []  # positions of the rows that survive deduplication
    for i, words in enumerate(normalized_copus):
        words_hash = to_minhash(words)
        duplicates = lsh.query(words_hash)
        if duplicates:
            print(f'Найдены совпадения для ({i}): {raw_corpus[i]}')
            for idx in duplicates:
                print(f'\t{idx:>5d}. {raw_corpus[idx]}')
        else:
            lsh.insert(i, words_hash)
            deduplicated_corpus.append(raw_corpus[i])
            kept_rows.append(i)

    print('Удалено дублей:', len(raw_corpus) - len(deduplicated_corpus))
    print(
        f'Сохраняем дедуплицированный корпус ({len(deduplicated_corpus)} рецензий)'
    )
    # The previous ``all_csv.drop(duplicates)`` discarded its result and named
    # the earlier (kept) rows, so the saved file still held every duplicate.
    # Write only the rows that were kept.
    all_csv.iloc[kept_rows].to_csv("./all_deduplicated.csv",
                                   encoding="utf-8",
                                   index=False)
def remove_similar_tweets(df, text_col="text", lang_col="lang_x", max_jaccard_simularity=0.5): """ use local similarity hashing to efficiently remove tweets that are similar to others (might be autogenerated or retweets) english tweets only """ t0 = time.time() df["tweet_clean"] = np.vectorize(TweetsAnalysis.preprocess_tweet)(df[text_col], df[lang_col]) tweets = [t.split(" ") for t in df["tweet_clean"]] t1 = time.time() print t1 - t0, "cleaned tweets" lsh = MinHashLSH(threshold=max_jaccard_simularity, num_perm=64) # jaccard similarity idx_selected = {} df_indices = df.index.values.tolist() for idx, tweet in zip(df_indices, tweets): s = MinHash(num_perm=64) for word in tweet: s.update(word.encode('utf8')) # only add if the tweet is not similar to existing ones if len(lsh.query(s)) == 0: lsh.insert(idx, s) idx_selected[idx] = True t2 = time.time() print t2 - t1, "created lsh" # only select the first tweet in a group of similar tweets df['select'] = pd.Series([idx_selected.get(idx, False) for idx in df_indices], index=df_indices) print df["select"].value_counts() t3 = time.time() print t3-t2, "selected df" return df[df["select"]]
class DuplicationIndex:
    """Incremental near-duplicate clustering on top of a MinHashLSH index."""

    def __init__(
        self,
        *,
        duplication_jaccard_threshold: float = 0.85,
    ):
        self._duplication_jaccard_threshold = duplication_jaccard_threshold
        self._num_perm = NUM_PERM
        self._index = MinHashLSH(
            threshold=self._duplication_jaccard_threshold,
            num_perm=self._num_perm,
        )
        # base key -> set of keys attached to that base
        self._duplicate_clusters = defaultdict(set)

    def add(self, code_key: Tuple, min_hash: MinHash) -> None:
        """Insert ``code_key`` and attach it to a cluster of close matches.

        The index is queried with ``min_hash`` before the key is inserted.
        If any match is already the base of a cluster, the key joins the
        first such cluster; otherwise a cluster based on the first match is
        started. Clusters therefore depend on insertion order. A key that is
        already indexed is reported on stdout and ignored.

        Args:
            code_key (Tuple of (index, repo_name, path)): any hashable key; a
                tuple is used here so the information can be retrieved later.
            min_hash: MinHash of the code_key.
        """
        close_duplicates = self._index.query(min_hash)
        if code_key in self._index.keys:
            print(f"Duplicate key {code_key}")
            return

        self._index.insert(code_key, min_hash)
        if not close_duplicates:
            return

        # Prefer a match that already heads a cluster; fall back to the
        # first match, which then becomes a new base.
        base = next(
            (dup for dup in close_duplicates if dup in self._duplicate_clusters),
            close_duplicates[0],
        )
        self._duplicate_clusters[base].add(code_key)

    def get_duplicate_clusters(self) -> List[List[Dict]]:
        """Export the duplicate clusters.

        Each cluster lists its base element first, followed by the keys that
        were attached to it. Every element is rendered as a dict with the
        keys ``base_index``, ``repo_name`` and ``path``.

        Returns:
            duplicate_clusters (List[List[Dict]]): List of duplicate clusters.
        """
        return [
            [
                {"base_index": member[0], "repo_name": member[1], "path": member[2]}
                for member in [base, *duplicates]
            ]
            for base, duplicates in self._duplicate_clusters.items()
        ]

    def save(self, filepath) -> None:
        """Write the exported clusters to ``filepath`` as JSON."""
        exported = self.get_duplicate_clusters()
        with open(filepath, "w") as f:
            json.dump(exported, f)
def main():
    """Application entry point.

    Loads every file found under ``corpus/clean``, normalises the texts and
    removes near-duplicates with MinHash LSH (first occurrence wins). A
    report of the matches is written to ``duplicate.txt`` and the surviving
    texts are saved under ``corpus/super clean/``, mirroring the path below
    ``corpus/clean/``.
    """
    import os

    corpus_root = Path('corpus/clean')
    # Find the names of all files
    list_files = file_searcher(corpus_root)

    print('Загружаем корпус')
    raw_corpus = []
    for file in list_files:
        with open(file, 'r', encoding='utf-8') as src:
            text_news = '\n'.join([line.rstrip('\r\n') for line in src])
            raw_corpus.append(text_news)

    print('Приводим его к стандартному виду')
    normalized_copus: List[List[str]] = [
        normalize(news) for news in raw_corpus
    ]

    print('Составляем индекс для поиска дублей')
    lsh = MinHashLSH(num_perm=HASH_PERMUTATIONS_COUNT)
    deduplicated_corpus = []
    # The report file is closed (and flushed) when this block ends;
    # previously the handle was never closed.
    with open('duplicate.txt', 'w', encoding='utf-8') as report:
        for i, (file, words) in enumerate(zip(list_files, normalized_copus)):
            words_hash = to_minhash(words)
            duplicates = lsh.query(words_hash)
            if duplicates:
                print(f'Найдены совпадения для ({file}): {raw_corpus[i]}',
                      file=report)
                for idx in duplicates:
                    print(f'\t{list_files[idx]}. {raw_corpus[idx]}',
                          file=report)
                print('\n\n\n\n', file=report)
            else:
                lsh.insert(i, words_hash)
                deduplicated_corpus.append((raw_corpus[i], list_files[i]))

        print('Удалено дублей:',
              len(raw_corpus) - len(deduplicated_corpus),
              file=report)

    print(
        f'Сохраняем дедуплицированный корпус ({len(deduplicated_corpus)} новостей)'
    )

    # Create the (empty) genre folders
    all_genre = [
        'Политика', 'В мире', 'Экономика', 'Общество', 'Происшествия', 'Армия',
        'Наука', 'Культура', 'Религия', 'Спорт', 'Туризм'
    ]
    for genre in all_genre:
        os.makedirs('corpus/super clean/' + genre, exist_ok=True)

    # Save the corpus. ``name[13:]`` strips the 13-character 'corpus/clean/'
    # prefix (assumes the file names are strings starting with it).
    for text, name in deduplicated_corpus:
        with open('corpus/super clean/' + name[13:], 'w',
                  encoding='utf-8') as dst:
            print(text, file=dst)
def build_lsh(code_set, jaccard):
    """MinHash every item of ``code_set`` and index it under its position.

    Args:
        code_set: iterable of items accepted by ``minhashing``.
        jaccard: Jaccard threshold for the ``MinHashLSH`` index.

    Returns:
        ``(minhashes, lsh)``: a list of ``[position, MinHash]`` pairs and the
        populated index, whose keys are the positions as strings.
    """
    index = MinHashLSH(threshold=jaccard, num_perm=128)
    signatures = []
    for position, code in enumerate(code_set):
        signature = minhashing(code)
        index.insert(str(position), signature)
        signatures.append([position, signature])
    return signatures, index
def __init__(self, lang):
    """Set up per-language helpers and empty containers.

    Creates the ``WikiEntities`` lookup and the Indic normalizer for
    ``lang``, empty lists for articles, entity sets and minhashes, and an
    empty ``MinHashLSH`` index (threshold 0.5, 128 permutations).
    """
    self.lang = lang
    self.wikient = WikiEntities(self.lang)
    self.normalizer = IndicNormalizerFactory().get_normalizer(self.lang)
    self.articles, self.ent_sets, self.minhashes = [], [], []
    self.lsh = MinHashLSH(threshold=0.5, num_perm=128)
def build_lsh(self, threshold=0.5): start = time.time() print 'Buidling LSH...' lsh = MinHashLSH(threshold=threshold, num_perm=128) with lsh.insertion_session() as session: for i, entity in enumerate(self.entities): session.insert(i, self.minhash(entity.value)) print '[{} s]'.format(time.time() - start) return lsh
def _name_to_minhash(name, stop_words):
    """Return a 32-permutation MinHash of the non-stopword tokens of ``name``.

    The name is converted from camelCase to snake_case, '-' and '_' become
    spaces, and the result is lower-cased before tokenising on whitespace.
    """
    normalized = nlp.camelcase_to_snakecase(name)
    normalized = normalized.replace('-', ' ').replace('_', ' ').lower()
    m = MinHash(num_perm=32)
    for token in normalized.split():
        if token not in stop_words:
            m.update(token.encode('utf8'))
    return m


def find_relation_class_name_matchings(network, kr_handlers):
    """Match relation (source) names against ontology class names by token overlap.

    Args:
        network: object whose ``iterate_values()`` yields 4-tuples starting
            with ``(db_name, source_name, ...)``.
        kr_handlers: mapping ``kr_name -> handler`` where ``handler.classes()``
            returns the class names of that knowledge representation.

    Returns:
        List of matches ``((db_name, source_name, "_"), (kr_name, class_name))``
        for every relation/class pair that collides in a MinHash LSH index
        (threshold 0.5, 32 permutations).
    """
    st = time.time()

    # Loaded once: it was previously re-evaluated for every token.
    stop_words = set(stopwords.words('english'))

    # Retrieve relation names (each distinct source name only once)
    names = []
    seen_sources = set()
    for (db_name, source_name, _, _) in network.iterate_values():
        if source_name in seen_sources:
            continue
        seen_sources.add(source_name)
        m = _name_to_minhash(source_name, stop_words)
        names.append(('relation', (db_name, source_name), m))

    num_relations_inserted = len(names)

    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            m = _name_to_minhash(cl, stop_words)
            names.append(('class', (kr_name, cl), m))

    # Index all the minhashes
    lsh_index = MinHashLSH(threshold=0.5, num_perm=32)
    for idx, (_, _, m) in enumerate(names):
        lsh_index.insert(idx, m)

    matchings = []
    # Only relations are used as queries; hits of the same kind are skipped.
    for idx in range(num_relations_inserted):
        kind_q, (db_name, source_name), m = names[idx]
        for n in lsh_index.query(m):
            kind_n = names[n][0]
            if kind_n != kind_q:
                # match.format is db_name, source_name, field_name -> class_name
                match = ((db_name, source_name, "_"), names[n][1])
                matchings.append(match)

    et = time.time()
    print("Time to relation-class (name): " + str(et - st))
    return matchings
def fit(self, trainset):
    '''Fit the base algorithm, then index every user's MinHash signature.

    A fresh ``MinHashLSH`` (threshold ``self.tr``, ``self.n_perm``
    permutations) is stored in ``self.lsh`` and filled with one entry per
    user of ``self.trainset.ur``. Returns ``self``.
    '''
    AlgoBase.fit(self, trainset)
    self.lsh = MinHashLSH(threshold=self.tr, num_perm=self.n_perm)
    users = tqdm(self.trainset.ur, desc='Computing LSH')
    for user in users:
        signature = self.compute_minhash_signature(user)
        self.lsh.insert(user, signature)
    return self
def __init__(
    self,
    *,
    duplication_jaccard_threshold: float = 0.85,
):
    """Create an empty LSH index and an empty cluster map.

    Args:
        duplication_jaccard_threshold: Jaccard threshold handed to
            ``MinHashLSH``; keyword-only.
    """
    self._duplication_jaccard_threshold = duplication_jaccard_threshold
    self._num_perm = NUM_PERM
    self._index = MinHashLSH(
        threshold=self._duplication_jaccard_threshold,
        num_perm=self._num_perm,
    )
    # base key -> set of keys attached to that base
    self._duplicate_clusters = defaultdict(set)
def setup_lsh(self):
    """
    Create the MinHash LSH index on the configured storage backend.

    Raises ValueError when no storage configuration is set.
    """
    if not self.__storage_config:
        raise ValueError("Storage Backend Required Due to Use of Session")
    self.__lsh = MinHashLSH(
        threshold=self.__threshold,
        num_perm=self.__num_perm,
        storage_config=self.__storage_config,
    )
def perform_lsh(lsh_text, standard_labels, title_labels, char_ngram=5, savefile=''):
    """Find standards whose descriptions are near-duplicates of one another.

    Each text is shingled with ``shingles``, hashed into a 200-permutation
    MinHash and indexed under its standard label (threshold 0.8). Every
    description is then queried against the index; the results may contain
    false positives.

    Args:
        lsh_text: iterable of description texts.
        standard_labels: index keys, parallel to ``lsh_text``.
        title_labels: titles, parallel to ``lsh_text`` (used in report keys).
        char_ngram: accepted for interface compatibility; not used here.
        savefile: when non-empty, the candidates are written to this CSV path.

    Returns:
        ``(candidates, shingled_desc, content, lsh)`` where ``candidates``
        maps ``"<label>: <title>"`` to a list of ``(label, title)`` matches
        (titles are looked up in the module-level ``df_nos``), or maps the
        bare label to ``'none'`` when only the description itself matched;
        ``content`` is the list of ``(label, MinHash)`` pairs.
    """
    t0 = time.time()
    shingled_desc = [shingles(desc) for desc in lsh_text]
    print_elapsed(t0, 'splitting the text into groups of characters')

    # Create hash signatures for shingles
    t0 = time.time()
    hash_objects = [MinHash(num_perm=200) for _ in shingled_desc]
    print_elapsed(t0, 'creating hash signatures')

    t0 = time.time()
    for m, desc in zip(hash_objects, shingled_desc):
        for d in desc:
            m.update(d.encode('utf8'))
    print_elapsed(t0, 'encoding hash objects')

    # Define LSH and Jaccard similarity threshold
    lsh = MinHashLSH(threshold=0.8, num_perm=200)
    content = [(standard_labels[ix], hash_objects[ix])
               for ix in range(len(shingled_desc))]
    for label, m in content:
        lsh.insert(label, m)

    # For each standard, collect potential clashes: other standards whose
    # shingle sets the index considers similar enough.
    candidates = {}
    for ix, m in enumerate(hash_objects):
        result = lsh.query(m)
        if len(result) > 1:
            key = standard_labels[ix] + ': ' + title_labels[ix]
            matches = [(res, df_nos['Title'].loc[res]) for res in result]
            candidates[key] = matches
            print(key, ': ', matches)
            print('***************')
        else:
            candidates[standard_labels[ix]] = 'none'

    if len(savefile):
        pd.DataFrame.from_dict(candidates, orient='index').to_csv(savefile)
    # Was ``lhs`` -- an undefined name that raised NameError on every call.
    return candidates, shingled_desc, content, lsh
def compute_lsh(self, entry):
    """
    Indexes the WARC entry using LSH.

    The index is created on first use from the ``lsh_threshold`` and
    ``number_of_minhash_buckets`` options of the ``self.domain`` config
    section; ``entry['minhash']`` is stored under ``entry['item_id']``.
    """
    if not self.lsh:
        settings = self.config
        self.lsh = MinHashLSH(
            threshold=settings.getfloat(self.domain, 'lsh_threshold'),
            num_perm=settings.getint(self.domain, 'number_of_minhash_buckets'),
        )
    item_id, minhash = entry['item_id'], entry['minhash']
    self.lsh.insert(item_id, minhash)
def make_lsh(self,shingle_length=2,threshold=0.8):
    """(Re)build ``self.minhashes`` and ``self.lsh`` from ``self.indoc``.

    Every document is shingled via ``make_shingle_sets``, hashed into a
    128-permutation MinHash and inserted into a fresh index keyed by the
    document itself.
    """
    print(f'Making LSH with threshold of {threshold}, shingle length of {shingle_length}')
    shingle_sets = self.make_shingle_sets(self.indoc, shingle_length)
    self.minhashes = {}
    self.lsh = MinHashLSH(threshold=threshold, num_perm=128)
    for doc, shingles in shingle_sets.items():
        signature = MinHash(num_perm=128)
        for shingle in shingles:
            signature.update(shingle.encode('utf8'))
        self.minhashes[doc] = signature
        self.lsh.insert(doc, signature)
def apply_lsh(group, col):
    """Index the character trigrams of ``group[col]`` by product id.

    Args:
        group: DataFrame with a ``productId`` column and the text column
            ``col``.
        col: name of the text column to shingle with ``ngrams(text, 3)``.

    Returns:
        ``(lsh, minhashes)``: a ``MinHashLSH`` index (threshold 0.9, 256
        permutations) keyed by product id, and a dict product id -> MinHash.
    """
    lsh = MinHashLSH(threshold=0.9, num_perm=256)
    minhashes = {}
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for idx, text in group[col].items():
        minhash = MinHash(num_perm=256)
        for d in ngrams(text, 3):
            minhash.update("".join(d).encode('utf-8'))
        index = group.loc[idx, 'productId']
        lsh.insert(key=index, minhash=minhash)
        minhashes[index] = minhash
    return lsh, minhashes
def reset(self, domain):
    """Start over with a fresh index, frequency table and statistics.

    If ``self.bootstrap`` holds data for ``domain``, its paragraph records
    are re-inserted under 1-based string ids and its document count seeds
    the statistics; otherwise ``self.BOOTSTRAP_TUPLE`` is used.
    """
    self.lsh = MinHashLSH(threshold=self.threshold,
                          num_perm=self.permutations)
    self.freq_ps = {}  # type: Dict[str, PData]
    self.num_dup = 0

    # Bootstrap the domain frequency counts if previous data is available
    _, docs, pdatas = self.bootstrap.get(domain, self.BOOTSTRAP_TUPLE)
    self.stats = CollectStats(domains=1, docs=docs)
    for number, pdata in enumerate(pdatas, start=1):
        key = str(number)
        self.lsh.insert(key, pdata.minhash)
        self.freq_ps[key] = pdata
def benchmark_lsh(threshold, index_data, query_data):
    """Time MinHashLSH queries.

    Args:
        threshold: Jaccard threshold of the index.
        index_data: object with parallel ``filenames`` and ``minhashes``
            sequences; filenames are used as keys. The permutation count is
            taken from the first minhash.
        query_data: object with a ``minhashes`` sequence to query with.

    Returns:
        ``(times, results)``: per-query duration in seconds and the list of
        keys returned for each query, in query order.
    """
    print("Building LSH index")
    num_perm = len(index_data.minhashes[0].hashvalues)
    lsh = MinHashLSH(threshold, num_perm)
    for key, minhash in zip(index_data.filenames, index_data.minhashes):
        lsh.insert(key, minhash)

    print("Querying")
    times = []
    results = []
    for minhash in query_data.minhashes:
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # recommended clock for measuring short durations.
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(result)
    return times, results
def analyze_file(name, f, verbose=False):
    """Print duplicate statistics for the crawl stored in ``f``.

    Every item is MinHashed (ignoring shingles reported as too common in the
    first 300 items) and indexed under ``item_<n>``. One line is printed:
    ``name`` padded to 40 columns, then, tab-separated, the number of URLs,
    of distinct URLs, of distinct netloc+path combinations, and the result
    of ``n_unique``.
    """
    Doc = namedtuple('Doc', ['item', 'min_hash'])
    lsh = MinHashLSH(threshold=0.9, num_perm=128)
    too_common = get_too_common_shingles(f, name, limit=300)

    urls = []
    documents = {}  # key -> Doc
    for position, item in enumerate(item_reader(f, name)):
        url = item['url']
        urls.append(url)
        min_hash = get_min_hash(item['extracted_text'], too_common)
        key = 'item_{}'.format(position)
        documents[key] = Doc({'url': url}, min_hash)
        # Replace any existing entry stored under this key.
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)

    paths = [''.join((parts.netloc, parts.path))
             for parts in map(urlsplit, urls)]
    duplicates = get_duplicates(lsh, documents, verbose=verbose)
    columns = [
        len(urls),
        len(set(urls)),
        len(set(paths)),
        n_unique(documents, duplicates),
    ]
    print(name.ljust(40), '\t'.join(map(str, columns)))
def load(self):
    """Loads the stored model data from previous runs.

    Looks for ``./minhash-<lang>--<permutations>-<threshold>.pkl`` and
    unpickles it into ``self.lsh``; when the file does not exist a fresh
    ``MinHashLSH`` is created instead.
    """
    # The same path is used for the existence check and for opening. The
    # check used to test a differently formatted name, so it could never
    # agree with the file that is actually read.
    model_path = './minhash-%s--%d-%.2f.pkl' % (
        self.lang, self.permutations, self.threshold
    )
    if os.path.isfile(model_path):
        # NOTE: pickle.load executes arbitrary code from the file; only
        # load model files this process wrote itself.
        with open(model_path, 'rb') as model_file:
            self.lsh = pickle.load(model_file)
    else:
        self.lsh = MinHashLSH(
            threshold=self.threshold,
            num_perm=self.permutations
        )
def learn_duplicates(name, f, verbose=False):
    """Replay a crawl and report how well ``DupePredictor`` predicts duplicates.

    For every item the predictor's duplicate probability for the URL is
    recorded as the prediction. The ground truth is whether duplicates were
    found: via ``dupe_predictor.update_model`` when the probability is below
    the 0.98 threshold, otherwise via a separate MinHash LSH index (the
    predictor is not updated in that case). Precision and recall are printed
    every 100 items and once at the end.
    """
    print(name)
    logging.basicConfig(level=logging.DEBUG)
    sample_texts = [
        item['extracted_text'] for item in item_reader(f, name, limit=300)]
    dupe_predictor = DupePredictor(sample_texts)
    lsh = MinHashLSH(threshold=0.9, num_perm=128)  # separate from dupe_predictor
    too_common_shingles = dupe_predictor.too_common_shingles
    threshold = 0.98
    y_pred, y_true = [], []

    def _report_pr():
        # Confusion counts at the fixed probability threshold; predictions
        # exactly equal to the threshold are counted in no bucket.
        tp = fp = fn = 0
        for prob, is_dup in zip(y_pred, y_true):
            if prob > threshold:
                if is_dup:
                    tp += 1
                else:
                    fp += 1
            elif prob < threshold and is_dup:
                fn += 1
        n_dup = tp + fn
        precision = tp / (tp + fp) if tp else 0.
        recall = tp / n_dup if n_dup else 0.
        print('precision: %.3f, recall %.3f at %.2f threshold '
              '(%d duplicates)' % (precision, recall, threshold, n_dup))

    for i, item in enumerate(item_reader(f, name)):
        dupe_prob = dupe_predictor.get_dupe_prob(item['url'])
        y_pred.append(dupe_prob)
        min_hash = get_min_hash(item['extracted_text'], too_common_shingles)
        if dupe_prob < threshold:
            updates = dupe_predictor.update_model(
                item['url'], item['extracted_text'])
            duplicates = [url for url, _ in updates]
        else:
            # We think this is a duplicate: replicate crawling
            # and do not update the model.
            duplicates = list(lsh.query(min_hash))

        key = canonicalize_url(item['url'])
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
        y_true.append(bool(duplicates))

        if verbose:
            if duplicates and dupe_prob < threshold:
                path = _full_path(item['url'])
                # Prefer duplicates that share the item's full path.
                same_path = [url for url in duplicates
                             if _full_path(url) == path]
                sample = same_path or duplicates
                print('false negative %s (%s, %d more)' % (
                    item['url'], sample[0], len(sample) - 1))
            elif not duplicates and dupe_prob > threshold:
                print('false positive', item['url'])

        if i % 100 == 0:
            _report_pr()
    _report_pr()
class NearDuplicate(EtlProcessor):
    """A class that acts over the raw tweets collected from the twitter
    stream in order to detect whether the tweet is duplicate, near-duplicate
    or nothing at all"""

    punct = re.compile(r"[\.,;:]\\xe2", re.IGNORECASE)
    langs = {
        "es": "spanish",
        "en": "english"
    }
    process_count = 0

    def __init__(
        self, connector=None, lang='en',
        threshold=0.8, permutations=90, autostart=True
    ):
        """Store the model parameters; with ``autostart`` load the model and
        start consuming messages immediately."""
        self.permutations = permutations
        self.threshold = threshold
        self.lang = lang
        self.connector = None
        self.lsh = None
        EtlProcessor.__init__(self, connector=connector, autostart=autostart)
        if autostart:
            self.load()
            self.listen()

    def _model_path(self):
        """Path of the pickle file for the current lang/permutations/threshold."""
        return './minhash-%s--%d-%.2f.pkl' % (
            self.lang, self.permutations, self.threshold
        )

    def listen(self):
        """Consume tweets from the connector and forward the unique ones.

        A unique tweet is sent on unchanged and a routing record is logged.
        On ValueError an error record is sent instead. The model is saved
        every 1000 processed messages.
        """
        for msg in self.connector.listen():
            tweet = json.loads(msg.value())
            try:
                if self.is_unique(tweet):
                    self.connector.send(
                        msg.value()
                    )
                    self.connector.log(
                        json.dumps({
                            "id_str": tweet['id_str'],
                            "source": self.connector.consumer_topic,
                            "dest": self.connector.producer_topic
                        })
                    )
            except ValueError:
                self.connector.send(
                    json.dumps({
                        "id_str": tweet['id_str'],
                        "source": self.connector.consumer_topic,
                        "dest": "error"
                    })
                )
                continue
            finally:
                self.process_count += 1
                if self.process_count % 1000 == 0:
                    self.save()

    def load(self):
        """Loads the stored model data from previous runs, or creates an
        empty index when no model file exists."""
        # Same path as save(): the existence check used to test a
        # differently formatted file name than the one that was opened.
        model_path = self._model_path()
        if os.path.isfile(model_path):
            # NOTE: pickle.load executes arbitrary code from the file; only
            # load model files this process wrote itself.
            with open(model_path, 'rb') as model_file:
                self.lsh = pickle.load(model_file)
        else:
            self.lsh = MinHashLSH(
                threshold=self.threshold,
                num_perm=self.permutations
            )

    def save(self):
        """Stores the currently processed data for this model"""
        with open(self._model_path(), 'wb+') as model_file:
            pickle.dump(self.lsh, model_file)

    def replace_urls(self, tweet):
        """Convenience function that replaces the compressed URLs by their
        expanded counterparts, in order to treat the same real URL as it is
        (and not obfuscating the same URL in diferent tweets by a different
        t.co link)"""
        removed_characters = 0
        if 'entities' in tweet and 'urls' in tweet['entities']:
            for url in tweet['entities']['urls']:
                tweet['text'] = tweet['text'][:(url['indices'][0] - removed_characters - 1)] + \
                    tweet['text'][(url['indices'][1] - removed_characters - 1):]
                removed_characters += url['indices'][1] - url['indices'][0]
            for url in tweet['entities']['urls']:
                tweet['text'] += ' ' + url['expanded_url']
        return tweet

    # maxsize must be an int: recent Python versions reject a float such as
    # 1e06 with TypeError when the class body is executed.
    # NOTE(review): lru_cache on a method keys on ``self`` and keeps the
    # instance alive for as long as the cache exists.
    @lru_cache(maxsize=1000000)
    def minhash_tweet(self, tweet_text):
        """Minhashing operation that allows for a caching of up to 1M tweets
        in order to speed up the checking procedure when it's the same tweet
        text"""
        tweet_hash = MinHash(num_perm=self.permutations)
        for word in tweet_text.split():
            # Strip with the text pattern first, then encode: applying a str
            # pattern to bytes raises TypeError on Python 3.
            tweet_hash.update(
                self.punct.sub("", word).encode('utf8')
            )
        return tweet_hash

    def is_unique(self, tweet):
        """Core method to check whether this tweet resembles enough to other
        previous tweets to label it as unique or near-duplicate.

        Unique tweets are inserted into the index under their ``id_str``.
        For near-duplicates the number of similar tweets is added to
        ``tweet['minteressa']['nearduplicates']``.
        """
        is_unique_tweet = False
        urlfied_tweet = self.replace_urls(tweet)
        mht = self.minhash_tweet(
            urlfied_tweet['text']
        )
        if 'minteressa' not in tweet:
            tweet['minteressa'] = {}

        if self.lsh.is_empty() is not True:
            similars = self.lsh.query(mht)
            if len(similars) == 0:
                # It's a unique tweet
                try:
                    self.lsh.insert(
                        tweet['id_str'],
                        mht
                    )
                    is_unique_tweet = True
                except ValueError as err:
                    # Log the actual error instead of the exception class.
                    logging.error(err)
            else:
                # near-duplicate: count one per similar tweet found
                for _ in similars:
                    if 'nearduplicates' not in tweet['minteressa']:
                        tweet['minteressa']['nearduplicates'] = 0
                    tweet['minteressa']['nearduplicates'] += 1
        else:
            # Empty index: the first tweet is unique by definition.
            is_unique_tweet = True
            self.lsh.insert(
                tweet['id_str'],
                mht
            )
        return is_unique_tweet
def print_stats(
        f, show=None, skip_unique=False, max_int_value=5,
        duration_limit=None, print_duplicates=False, print_urls=False,
        limit=None):
    """Count and print statistics over the items read from ``f``.

    Arguments:
    - f: source passed to ``item_reader`` (and to
      ``get_too_common_shingles`` when uniqueness is checked)
    - show: print the url of every item that contributes to this stats key
    - skip_unique: when true, skip the MinHash-based duplicate detection
    - max_int_value: integer metadata values at or above this are bucketed
      as ``"<max_int_value>+"``
    - duration_limit: stop once the timestamp span divided by 1000 exceeds
      this value
    - print_duplicates: print the urls of up to 10 duplicates of an item
    - print_urls: print the url of every item read
    - limit: forwarded to ``item_reader``

    Returns the ``Counter`` holding all collected statistics, after printing
    it sorted by key.
    """
    detect_duplicates = not skip_unique
    stats = Counter()
    if detect_duplicates:
        lsh = MinHashLSH(threshold=0.9, num_perm=128)
        too_common = get_too_common_shingles(f, limit=1000)
    urls = {}
    min_timestamp = None
    max_timestamp = None

    for index, item in enumerate(item_reader(f, limit=limit)):
        if print_urls:
            print(item['url'])

        content_type = item.get('content_type', 'missing')
        stats['content_type: ' + content_type] += 1
        stats['content_type[0]: ' + content_type.split('/')[0]] += 1

        # Track the time span covered by the items seen so far.
        timestamp = item['timestamp']
        if min_timestamp is None:
            min_timestamp = timestamp
        max_timestamp = timestamp
        if duration_limit and \
                (max_timestamp - min_timestamp) / 1000 > duration_limit:
            break

        # Items without extracted text are counted as stored documents only.
        if 'extracted_text' not in item:
            assert item['obj_stored_url']
            stats['documents'] += 1
            continue
        stats['items'] += 1

        for meta_key, meta_value in item['extracted_metadata'].items():
            if meta_key == 'forms':
                for form in meta_value:
                    stats['form_{}'.format(form['form'])] += 1
                    for field in form['fields'].values():
                        stats['form_field {}'.format(field)] += 1
            # Lists are reduced to their length; integers (but not booleans)
            # become part of the key, capped at max_int_value.
            if isinstance(meta_value, list):
                meta_value = len(meta_value)
            if isinstance(meta_value, int) and not isinstance(meta_value, bool):
                if meta_value >= max_int_value:
                    meta_value = '{}+'.format(max_int_value)
                meta_key = '{}_{}'.format(meta_key, meta_value)
            if meta_value:
                stats[meta_key] += 1
                if meta_key == show:
                    print(item['url'])

        if detect_duplicates:
            min_hash = get_min_hash(item['extracted_text'], too_common)
            duplicates = lsh.query(min_hash)
            if not duplicates:
                stats['unique_items'] += 1
            elif print_duplicates:
                print('{} {} duplicates: {}'.format(
                    item['url'],
                    len(duplicates),
                    ' '.join(urls[dup] for dup in duplicates[:10])))
            # Every item is indexed, whether or not it had duplicates.
            item_key = 'item_{}'.format(index)
            lsh.insert(item_key, min_hash)
            urls[item_key] = item['url']

    if max_timestamp and min_timestamp:
        stats['duration'] = (max_timestamp - min_timestamp) / 1000

    for name, count in sorted(stats.items()):
        print(name.ljust(20), count)
    return stats
allshingle.append(''.join(shingle[i][j]).split()) # Create MinHash objects m = [] for i in range(0,allshingle.__len__()): m.append(MinHash(num_perm=128)) for i in range(allshingle.__len__()): for d in allshingle[i]: m[i].update(d.encode('utf8')) # Create an MinHashLSH index optimized for Jaccard threshold 0.5, # that accepts MinHash objects with 128 permutations functions lsh = MinHashLSH(threshold=1, num_perm=128) # Insert m into the index for i in range(0, m.__len__()): lsh.insert("m%d"%i, m[i]) # Search all the frequent shingle which frequency bigger than 100 result = [] for i in range(0, m.__len__()): if len(lsh.query(m[i])) > 100: result.append(lsh.query(m[i])) #Find the frequency of the shingle index = [] for i in range(0,result.__len__()): tem = len(result[i])
def minhash_merger_series(interactive=True):
    """Print series whose alternate names look alike according to MinHash LSH.

    All alternate names are loaded, names of at least 5 characters are
    MinHashed in a process pool and indexed, and every indexed name is then
    queried against the index. Matches that belong to a different series
    than the queried name are printed. When ``interactive`` is false, the
    match log is saved to ``./seriesname-matchset-minhash.json`` at the end.
    """
    matchlogger = MatchLogBuilder()
    # NOTE(review): the selected callback is never invoked in this function.
    callback = askuser_callback_series if interactive else matchlogger.add_match_series

    print("fetching series")
    with app.app_context():
        series_rows = models.Series.query.options(
            joinedload(Series.alternatenames)
        ).all()
        # One tuple per alternate name:
        # (altname id, series id, clean name, series title)
        altn = [
            (name.id, name.series, name.cleanname, row.title)
            for row in series_rows
            for name in row.alternatenames
        ]

    print("Building mapping dictionaries")
    # Map altname id to series id
    altnid_sid_dict = {entry[0]: entry[1] for entry in altn}
    altnid_name_dict = {entry[0]: entry[2] for entry in altn}
    sid_sname_dict = {entry[1]: entry[3] for entry in altn}

    # Reverse mapping: series id to the list of its altname ids
    sid_altnid_dict = {}
    for nid, sid in altnid_sid_dict.items():
        sid_altnid_dict.setdefault(sid, []).append(nid)

    print("Have %s altnames for %s series" % (len(altnid_sid_dict), len(sid_altnid_dict)))

    perms = 512
    gram_sz = 3
    minhashes = {}
    lsh = MinHashLSH(threshold=SIMILARITY_RATIO, num_perm=perms)

    print("Building lsh minhash data structure")
    with ProcessPoolExecutor(max_workers=8) as ex:
        print("Submitting jobs")
        futures = []
        for key, content in altnid_name_dict.items():
            if len(content) >= 5:
                futures.append((key, ex.submit(minhash_str, content, perms, gram_sz)))

        print("Consuming futures")
        for key, future in tqdm.tqdm(futures):
            minhash = future.result()
            lsh.insert(key, minhash)
            minhashes[key] = minhash

    print("Doing search")
    for key, minhash in minhashes.items():
        result = lsh.query(minhash)
        # A name always matches itself; drop that hit.
        if key in result:
            result.remove(key)
        if not result:
            continue

        first_sid = altnid_sid_dict[result[0]]
        src_sid = altnid_sid_dict[key]
        if first_sid == src_sid:
            continue

        sname = sid_sname_dict[first_sid]
        res_sids = {altnid_sid_dict[tmp] for tmp in result}
        # Only matches from other series than the queried name are reported.
        names = sorted(
            (altnid_sid_dict[res_id], res_id, altnid_name_dict[res_id])
            for res_id in result
            if altnid_sid_dict[res_id] != src_sid
        )
        if not names:
            continue

        print("Search returned %s results in %s series for %s:%s" % (len(result), len(res_sids), src_sid, sname))
        for match_sid, match_nid, match_name in names:
            print(" %s -> %s: %s" % (str(match_sid).rjust(8), str(match_nid).rjust(8), match_name))

    if not interactive:
        matchlogger.save_log("./seriesname-matchset-minhash.json")
def consolidate_dupes(self, agg_files):
    """Remove short, text-less and near-duplicate files from ``agg_files``.

    ``agg_files`` maps a key to a dict whose ``'files'`` entry maps file keys
    to per-file dicts. Files without ``'content_text'`` or with fewer than
    100 characters of it are dropped first. The remaining texts are
    MinHashed in a process pool and indexed; then, going from the shortest
    text to the longest, a file is dropped whenever the index reports a
    similar file that has not itself been dropped.

    ``agg_files`` is modified in place and also returned.
    """
    # Remove short items
    for key, value in agg_files.items():
        files = value['files']
        for fkey in list(files):
            entry = files[fkey]
            if 'content_text' not in entry:
                print("Missing file:", key, fkey)
                files.pop(fkey)
            elif len(entry['content_text']) < 100:
                print("Removing short file: ", (key, fkey))
                files.pop(fkey)

    # Flat map of (aggregate key, file key) -> text for everything left
    smap = {
        (key, fkey): fdata['content_text']
        for key, value in agg_files.items()
        for fkey, fdata in value['files'].items()
    }

    perms = 512
    gram_sz = 10
    thresh = 0.5
    lsh = MinHashLSH(threshold=thresh, num_perm=perms)

    print("Loading word hashes")
    minhashes = {}
    with ProcessPoolExecutor(max_workers=10) as ex:
        print("Submitting jobs")
        futures = []
        for key, content in smap.items():
            futures.append((key, ex.submit(minhash_str, content, perms, gram_sz)))

        print("Submitted %s jobs. Consuming futures" % len(futures))
        for key, future in tqdm.tqdm(futures, "Hashing"):
            minhash = future.result()
            lsh.insert(key, minhash)
            minhashes[key] = minhash

    # Group the keys by text length so shorter texts are examined first
    keys_by_length = {}
    for key, content in smap.items():
        keys_by_length.setdefault(len(content), []).append(key)

    print("%s items in file map before dupe elimination" % len(smap))
    for clen in sorted(keys_by_length):
        for key in keys_by_length[clen]:
            if key not in smap or key not in minhashes:
                continue

            result = lsh.query(minhashes[key])
            if key in result:
                result.remove(key)

            # Drop this file only while one of its look-alikes is still kept
            if any(other in smap for other in result):
                smap.pop(key)
                akey, fkey = key
                agg_files[akey]['files'].pop(fkey)

    print("%s items in file map after dupe elimination" % len(smap))
    return agg_files