def deduplicate_other_old(file_prefix, input_dir, output_dir, threshold, permutations):
    """
    Removes all documents from a set of minhashed documents (3 files with the
    same minhash prefix) that occur in other batches in input_dir. Only batches
    whose number is higher than the batch in question are considered (i.e.
    upper triangular matrix).

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    file_base = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(file_base))

    # First, load the (already deduplicated) batch...
    for input_file, results in read_batch(file_prefix):
        for doc_id, minhash in zip(results['id'], results['minhash']):
            lsh.insert('\t'.join(doc_id), minhash)
    initial_len = len(lsh.keys)

    to_match_with = find_all_batches(input_dir,
                                     int(file_prefix.rpartition(os.sep)[-1]))

    # Now, remove all documents in it that are contained in other batches
    # to the "right" of it (with greater batch numbers)
    for batch in to_match_with:
        initial_batch_len = len(lsh.keys)
        for _, results in read_batch(batch):
            for i, minhash in enumerate(results['minhash']):
                for duplicate in lsh.query(minhash):
                    lsh.remove(duplicate)
        logging.info(
            'Cross-deduplicated batch {} with batch {}: {} -> {} documents.'.format(
                file_base, op.basename(batch), initial_batch_len, len(lsh.keys)))

    # Finally, we print the documents left. Unfortunately, in order to
    # keep the format, we have to read the original batch again.
    with closing(
        BatchWriter(sys.maxsize, output_dir, len(file_base), int(file_base))
    ) as bw:
        # OK, we need to re-read the batch unfortunately
        for input_file, results in read_batch(file_prefix):
            doc_ids, minhashes = [], []
            for doc_id, minhash in zip(results['id'], results['minhash']):
                if '\t'.join(doc_id) in lsh:
                    doc_ids.append(doc_id)
                    minhashes.append(minhash)
            bw.write_results(input_file, {'id': doc_ids, 'minhash': minhashes})

    logging.info('Processed batch {}; kept {} out of {} documents.'.format(
        file_base, len(lsh.keys), initial_len))
    return len(lsh.keys), initial_len
def learn_duplicates(name, f, verbose=False):
    print(name)
    logging.basicConfig(level=logging.DEBUG)
    texts_sample = [
        item['extracted_text'] for item in item_reader(f, name, limit=300)]
    dupe_predictor = DupePredictor(texts_sample)

    lsh = MinHashLSH(threshold=0.9, num_perm=128)  # separate from dupe_predictor
    too_common_shingles = dupe_predictor.too_common_shingles
    threshold = 0.98
    y_pred, y_true = [], []

    def _report_pr():
        tp = sum(p > threshold and d for p, d in zip(y_pred, y_true))
        fp = sum(p > threshold and not d for p, d in zip(y_pred, y_true))
        fn = sum(p < threshold and d for p, d in zip(y_pred, y_true))
        n_dup = tp + fn
        print('precision: %.3f, recall %.3f at %.2f threshold '
              '(%d duplicates)' % (
                  tp / (tp + fp) if tp else 0.,
                  tp / n_dup if n_dup else 0.,
                  threshold, n_dup))

    for i, item in enumerate(item_reader(f, name)):
        dupe_prob = dupe_predictor.get_dupe_prob(item['url'])
        y_pred.append(dupe_prob)
        min_hash = get_min_hash(item['extracted_text'], too_common_shingles)
        if dupe_prob < threshold:
            duplicates = [url for url, _ in dupe_predictor.update_model(
                item['url'], item['extracted_text'])]
        else:
            # We think this is a duplicate: replicate crawling
            # and do not update the model.
            duplicates = list(lsh.query(min_hash))
        key = canonicalize_url(item['url'])
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
        y_true.append(bool(duplicates))
        if verbose:
            if duplicates and dupe_prob < threshold:
                path = _full_path(item['url'])
                sample = [url for url in duplicates
                          if _full_path(url) == path] or duplicates
                print('false negative %s (%s, %d more)' % (
                    item['url'], sample[0], len(sample) - 1))
            elif not duplicates and dupe_prob > threshold:
                print('false positive', item['url'])
        if i % 100 == 0:
            _report_pr()
    _report_pr()
class DuplicateChecker:
    def __init__(self):
        self.minhashes = {}
        self.lsh = MinHashLSH(threshold=THRESHOLD)

    def create_minhashes_reading_articles(self, start_date, end_date):
        """Fills the minhashes dict with the file paths as the keys and the
        minhashes built from the article bodies as the values."""
        for category in read_categories_from_file():
            for date_between in get_dates_between(start_date, end_date):
                try:
                    date_between = date_between.strftime('%Y/%m/%d')
                    current_dir_path = f'{DUMP_DIR}/{category}/{date_between}'
                    for filename in os.listdir(current_dir_path):
                        self._create_minhash_from_file(current_dir_path, filename)
                except FileNotFoundError:
                    pass

    def _create_minhash_from_file(self, current_dir_path, filename):
        file_path = f'{current_dir_path}/{filename}'
        with open(file_path) as f:
            article = Article(**json.load(f))
        if not article.body:
            os.remove(file_path)
            return
        minhash = MinHash()
        for word in article.body.split(' '):
            minhash.update(word.encode('utf8'))
        lean_minhash = LeanMinHash(minhash)
        self.minhashes[file_path] = lean_minhash
        self.lsh.insert(file_path, lean_minhash)

    def find_similar_articles(self):
        """Finds every similar article in the LSH index and removes it both
        from the index and from the disk."""
        for path, minhash in self.minhashes.items():
            # The LSH will find at least the path itself, so we need to filter it
            # (compare by value, not identity).
            for similar_article_path in [
                    x for x in self.lsh.query(minhash) if x != path]:
                print(f'\tremoving similar article {similar_article_path}')
                self.lsh.remove(similar_article_path)
                with contextlib.suppress(FileNotFoundError):
                    os.remove(similar_article_path)
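# A hypothetical driver for DuplicateChecker, only to show how the class above
# is meant to be used: the date range is made up, and DUMP_DIR, THRESHOLD and
# the helper functions (read_categories_from_file, get_dates_between, Article)
# are assumed to exist in the surrounding module as referenced above.
from datetime import date

checker = DuplicateChecker()
checker.create_minhashes_reading_articles(date(2021, 1, 1), date(2021, 1, 31))
checker.find_similar_articles()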
def analyze_file(name, f, verbose=False):
    urls = []
    Doc = namedtuple('Doc', ['item', 'min_hash'])
    documents = {}  # key -> Doc
    lsh = MinHashLSH(threshold=0.9, num_perm=128)
    too_common = get_too_common_shingles(f, name, limit=300)
    for i, item in enumerate(item_reader(f, name)):
        urls.append(item['url'])
        min_hash = get_min_hash(item['extracted_text'], too_common)
        key = 'item_{}'.format(i)
        item = {'url': item['url']}
        documents[key] = Doc(item, min_hash)
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
    paths = [''.join([p.netloc, p.path]) for p in map(urlsplit, urls)]
    duplicates = get_duplicates(lsh, documents, verbose=verbose)
    print(name.ljust(40), '\t'.join(map(str, [
        len(urls), len(set(urls)), len(set(paths)),
        n_unique(documents, duplicates),
    ])))
class DupePredictor(object):
    """ Learn to predict whether the content is a duplicate based on the URL.
    """
    def __init__(self, texts_sample=None, jaccard_threshold=0.9, num_perm=128):
        """ Initialize DupePredictor.
        :param jaccard_threshold: the minimal Jaccard similarity at which pages
        are considered duplicates (intersection of content / union of content).
        :param texts_sample: a list of texts to calculate too_common_shingles -
        this allows more precise duplicate detection, because now we know which
        parts are common to all pages, and which are unique for each page.
        """
        self.jaccard_threshold = jaccard_threshold
        self.num_perm = num_perm
        self.lsh = MinHashLSH(
            threshold=self.jaccard_threshold, num_perm=self.num_perm)
        self.too_common_shingles = set()
        if texts_sample:
            self.too_common_shingles = get_too_common_shingles(texts_sample)

        self.seen_urls = {}  # url: URLMeta
        self.urls_by_path = defaultdict(set)  # path: {url}
        self.urls_by_path_q = defaultdict(set)  # (path, q): {url}
        self.urls_by_path_qwp = defaultdict(set)  # (path, param, q): {url}
        self.params_by_path = defaultdict(set)  # path: {param}
        self.param_values = defaultdict(set)  # (path, param): {value}

        # Duplicate hypotheses:
        # (1) All items with the same path are duplicates. Key is (path,)
        self.path_dupstats = defaultdict(DupStat)
        # (2) All items with the same path that differ only in the given param
        # are duplicates. Key is (param,)
        self.param_dupstats = defaultdict(DupStat)
        # (3) Same but conditioned on path, key is (path, param)
        self.path_param_dupstats = defaultdict(DupStat)
        # (4) Same but conditioned on path + the rest of the query.
        # Key is (path, query, param)
        self.path_query_param_dupstats = defaultdict(DupStat)
        # (5) All items with the same path with only param=value added are
        # duplicates. Key is (param, value)
        self.param_value_dupstats = defaultdict(DupStat)
        # (6) Same but conditioned on path, key is (path, param, value)
        self.path_param_value_dupstats = defaultdict(DupStat)
        # TODO - more powerful hypotheses:
        # - param + value without path
        # - more than one GET param

    def get_dupe_prob(self, url):
        """ The probability of the given url being a duplicate of some content
        that has already been seen.
        """
        path, query = _parse_url(url)
        dupestats = []
        extend_ds = lambda x: dupestats.extend([_f for _f in (
            ds_dict.get(key) for ds_dict, key in x) if _f])
        if self.urls_by_path.get(path):
            extend_ds([(self.path_dupstats, path)])
        # If param is in the query
        for param, value in list(query.items()):
            qwp_key = _q_key(_without_key(query, param))
            # Have we seen the query with the param changed or removed?
            has_changed = self.urls_by_path_qwp.get((path, param, qwp_key))
            has_removed = self.urls_by_path_q.get((path, qwp_key))
            if has_changed or has_removed:
                extend_ds(self._param_dupstats(path, param, qwp_key))
                if has_removed:
                    extend_ds(self._param_value_dupstats(path, param, value))
        # If param is not in the query, but we've crawled a page where it is
        q_key = _q_key(query)
        for param in (self.params_by_path.get(path, set()) - set(query)):
            if self.urls_by_path_qwp.get((path, param, q_key)):
                extend_ds(self._param_dupstats(path, param, q_key))
                # FIXME - this could be a long list of param values,
                # it's better to somehow store only high-probability values?
                for value in self.param_values.get((path, param), set()):
                    extend_ds(self._param_value_dupstats(path, param, value))
        return max(ds.get_prob() for ds in dupestats) if dupestats else 0.

    def update_model(self, url, text):
        """ Update the prediction model with a page given by url and text
        content. Return a list of item duplicates (for testing purposes).
""" min_hash = get_min_hash(text, self.too_common_shingles, self.num_perm) item_url = canonicalize_url(url) item_path, item_query = _parse_url(item_url) all_duplicates = [ (url, self.seen_urls[url]) for url in self.lsh.query(min_hash)] duplicates = [(url, m.query) for url, m in all_duplicates if m.path == item_path] # Hypothesis (1) - just paths n_path_nodup = self._nodup_filter(min_hash, ( self.urls_by_path.get(item_path, set()) .difference(url for url, _ in duplicates))) self.path_dupstats[item_path].update(len(duplicates), n_path_nodup) # Other hypotheses, if param is in the query for param, value in list(item_query.items()): self._update_with_param( duplicates, min_hash, item_path, item_query, param, [value]) # Other hypotheses, if param is not in the query for param in ( self.params_by_path.get(item_path, set()) - set(item_query)): self._update_with_param( duplicates, min_hash, item_path, item_query, param, self.param_values.get((item_path, param), set())) # Update indexes for param, value in list(item_query.items()): self.urls_by_path_q[item_path, _q_key(item_query)].add(item_url) item_qwp_key = _q_key(_without_key(item_query, param)) self.urls_by_path_qwp[item_path, param, item_qwp_key].add(item_url) self.params_by_path[item_path].add(param) self.param_values[item_path, param].add(value) if not item_query: self.urls_by_path_q[item_path, ()].add(item_url) self.urls_by_path[item_path].add(item_url) if item_url in self.lsh: self.lsh.remove(item_url) self.lsh.insert(item_url, min_hash) self.seen_urls[item_url] = URLMeta(item_path, item_query, min_hash) if len(self.seen_urls) % 100 == 0: self.log_dupstats() return all_duplicates def _update_with_param(self, duplicates, min_hash, item_path, item_query, param, values): # qwp = "query without param" item_qwp = _without_key(item_query, param) item_qwp_key = _q_key(item_qwp) q_dup = {url for url, q in duplicates if _without_key(q, param) == item_qwp} n_q_nodup = self._nodup_filter(min_hash, ( self.urls_by_path_qwp.get((item_path, param, item_qwp_key), set()) .union(self.urls_by_path_q.get((item_path, item_qwp_key), set())) .difference(q_dup))) if q_dup or n_q_nodup: for ds_dict, key in self._param_dupstats( item_path, param, item_qwp_key): ds_dict[key].update(len(q_dup), n_q_nodup) if values: if param in item_query: qv_dup = {url for url, q in duplicates if q == item_qwp} n_qv_nodup = self._nodup_filter(min_hash, ( self.urls_by_path_q.get((item_path, item_qwp_key), set()) .difference(qv_dup))) # FIXME - this could be a long list of param values, # it's better to somehow store only high-probability values? for value in values: if param not in item_query: qv_dup = {url for url, q in duplicates if q.get(param) == value and _without_key(q, param) == item_qwp} qap_key = _q_key(_with_key_val(item_query, param, value)) n_qv_nodup = self._nodup_filter(min_hash, ( self.urls_by_path_q.get((item_path, qap_key), set()) .difference(qv_dup))) if qv_dup or n_qv_nodup: for ds_dict, key in self._param_value_dupstats( item_path, param, value): ds_dict[key].update(len(qv_dup), n_qv_nodup) def _param_dupstats(self, path, param, qwp_key): return [ (self.param_dupstats, param), (self.path_param_dupstats, (path, param)), (self.path_query_param_dupstats, (path, param, qwp_key)), ] def _param_value_dupstats(self, path, param, value): return [ (self.param_value_dupstats, (param, value)), (self.path_param_value_dupstats, (path, param, value)), ] def _nodup_filter(self, min_hash, all_urls, max_sample=200): """ This filters results that are considered not duplicates. 
        But we really need to check that, because lsh.query does not always
        return ALL duplicates, esp. when there are a lot of them, so here we
        double-check and return only urls that are NOT duplicates.
        Return the estimated number of not-duplicates.
        """
        if not all_urls:
            return 0
        urls = random.sample(all_urls, max_sample) \
            if len(all_urls) > max_sample else all_urls
        filtered = [
            url for url in urls
            if min_hash.jaccard(self.seen_urls[url].min_hash) <
            self.jaccard_threshold]
        return int(len(filtered) / len(urls) * len(all_urls))

    def log_dupstats(self, min_dup=100):
        for ds, name in [
                (self.path_dupstats, 'Path dupstats'),
                (self.param_dupstats, 'Param dupstats'),
                (self.path_param_dupstats, 'Path-param dupstats'),
                (self.path_query_param_dupstats, 'Path-query-param dupstats'),
                (self.param_value_dupstats, 'Param-value dupstats'),
                (self.path_param_value_dupstats, 'Path-param-value dupstats'),
                ]:
            _log_dupstats(ds, name, min_dup=min_dup)
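# A minimal sketch of how the DupePredictor above might be driven, mirroring
# the flow of learn_duplicates: `crawl_results` and the sample texts are
# hypothetical, and the 0.98 cut-off is the evaluation threshold used above,
# not a built-in parameter of the class.
predictor = DupePredictor(texts_sample=["page one text", "page two text"])
for url, text in crawl_results:  # hypothetical iterable of (url, text) pairs
    if predictor.get_dupe_prob(url) > 0.98:
        continue  # the URL pattern looks like a duplicate; skip processing it
    predictor.update_model(url, text)  # learn from the newly seen page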
from datasketch import MinHash, MinHashLSH
import numpy as np

# Example token sets (assumed sample data; the original snippet does not show
# how data1/data2/data3 and m1/m2/m3 were created)
data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure']
data3 = ['minhash', 'is', 'probability', 'data', 'structure']

m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))

print(m1.hashvalues)
print(m2.hashvalues)
print(m3.hashvalues)
print(np.shape(m1.hashvalues))

# Create a MinHashLSH index optimized for Jaccard threshold 0.5,
# that accepts MinHash objects with 128 permutation functions
lsh = MinHashLSH(threshold=0.5, num_perm=128)

# Insert m2 and m3 into the index
lsh.insert("m2", m2)
lsh.insert("m3", m3)

# Check for membership using the key
print("m2" in lsh)
print("m3" in lsh)

# Using m1 as the query, retrieve the keys of the qualifying datasets
result = lsh.query(m1)
print("Candidates with Jaccard similarity > 0.5", result)

# Remove a key from the lsh
lsh.remove("m2")
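# Follow-up to the snippet above (reusing lsh, m1): after the removal the key
# no longer shows up in membership checks or query results. In the datasketch
# releases I have used, removing a key that is not (or no longer) in the index
# raises ValueError, so repeated removals should be guarded.
print("m2" in lsh)    # False: the key was removed from the index
print(lsh.query(m1))  # "m2" no longer appears among the candidates

try:
    lsh.remove("m2")
except ValueError:
    pass  # the key was already gone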
def summarize(self, dataset):
    lilm = self.sm
    N = lilm.shape[0]
    M = (lilm.nnz + np.sum(lilm.diagonal() != 0)) // 2
    nnz = M
    degs = np.array(lilm.sum(axis=1))
    degs = np.squeeze(degs).tolist()
    pt = PriorityTree(((d, i) for i, d in enumerate(degs)))
    sizes = defaultdict(lambda: 1)
    nodes = {}
    for n in range(N):
        nodes[n] = {n}
    deg_len = sum(LN(d) for d in degs)
    length = LN(N)
    length += deg_len
    length += N * LN(1)
    length += c_MDL.LnU(N * (N + 1) // 2, M)
    length += M * LN(1)
    logger.info(f"Init length: {length}")

    # Initialize MinHash LSH
    lsh = MinHashLSH(threshold=0.4, num_perm=64)
    minhashes = [None] * N
    for n in range(N):
        m = MinHash(num_perm=64, hashfunc=hash, seed=1024)
        neighbors = ssp.find(lilm[n])[1]
        for nei in neighbors:
            m.update(nei)
        minhashes[n] = m
        lsh.insert(str(n), m)

    start_time = time.time()
    cnt = N
    end = False
    non_gain = 0
    while not end:
        logger.debug(f"Iteration {N - cnt}")
        while True:
            du, u = pt.pop()
            if du == math.inf:
                logger.debug(f"Choose node {u} and break")
                end = True
                break
            if du != degs[u]:
                logger.warning(f"Degree not match! {du} != {degs[u]}")
            candidates = lsh.query(minhashes[u])
            candidates = candidates[:min(self.C, len(candidates))]
            if len(candidates) == 1:
                non_gain += 1
                if non_gain >= cnt:
                    end = True
                    break
                logger.debug(f"No candidate for node {u}({du})")
                continue
            if not end:
                logger.debug(
                    f"Choose node: {u} with degree {du}, "
                    f"{len(candidates)} candidates.")
            break
        if end:
            break

        neiu = set(ssp.find(lilm[u])[1])
        max_gain, v = 0, -1
        len_matrix = c_MDL.LnU(cnt * (cnt + 1) // 2, nnz)
        g_new_nnz = 0
        neiv = set()
        for c in candidates:
            c = int(c)
            if c == u:
                continue
            dc = degs[c]
            gain = xlogx(du) + xlogx(dc)
            gain -= xlogx(du + dc)
            gain2 = 0
            common_nei = 0
            if lilm[u, u] != 0:
                gain += xlogx(lilm[u, u]) / 2
            if lilm[c, c] != 0:
                gain += xlogx(lilm[c, c]) / 2
            new_weight = lilm[u, u] + lilm[c, c] + 2 * lilm[u, c]
            if new_weight != 0:
                gain -= xlogx(new_weight) / 2
            gain2 += LN(lilm[u, u]) + LN(lilm[c, c]) - LN(new_weight)
            neic = set(ssp.find(lilm[c])[1])
            for nei in neiu:
                if nei not in neic:
                    continue
                if nei == u or nei == v:
                    continue
                gain -= xlogx(lilm[u, nei])
                gain -= xlogx(lilm[c, nei])
                new_weight = lilm[u, nei] + lilm[c, nei]
                gain += xlogx(new_weight)
                gain2 += LN(lilm[u, nei]) + \
                    LN(lilm[c, nei]) - LN(new_weight)
            gain += gain2
            gain += LN(cnt) - LN(cnt - 1)
            size_u, size_v = sizes[u], sizes[v]
            gain += LN(size_u) + LN(size_v) - LN(size_u + size_v)
            gain += c_MDL.log_comb(size_u + size_v, size_u)
            common_nei = len(neiu & neic)
            new_nnz = nnz - common_nei
            if (u in neiu) and (c in neic) and (c not in neiu):
                new_nnz -= 1
            gain += LN(nnz) - LN(new_nnz)
            # gain += common_nei * self.B
            gain += len_matrix
            gain -= c_MDL.LnU(cnt * (cnt - 1) // 2, new_nnz)
            if cnt * (cnt - 1) // 2 < new_nnz:
                logger.warning(f"Wrong parameter: {cnt}, {new_nnz}")
                end = True
                break
            if gain > max_gain:
                max_gain = gain
                g_new_nnz = new_nnz
                v = c
                neiv = neic
        if end:
            break
        if v == -1:
            logger.debug(f"No non-negative gain for node {u}")
            non_gain += 1
            if non_gain >= cnt:
                end = True
            continue
        non_gain = 0

        # Merge u and v
        logger.debug(f"Merge {u} and {v}, gain: {max_gain}")
        nodes[u] = nodes[u] | nodes[v]
        if v in nodes:
            del nodes[v]
        length -= max_gain
        logger.debug(f"Current length: {length}")
        # Update degrees and sizes
        pt.update(u, (degs[u] + degs[v], u))
        degs[u] = degs[u] + degs[v]
        pt.update(v, (math.inf, v))
        degs[v] = 0
        sizes[u] = sizes[u] + sizes[v]
        sizes[v] = 0
        lilm[u] = lilm[u] + lilm[v]
        lilm[u, u] += lilm[u, v]
        for nei in ssp.find(lilm[u])[1]:
            if nei != u and nei != v:
                lilm[nei, u] = lilm[u, nei]
        lilm[v] = 0
        lilm[:, v] = 0
        cnt -= 1
        nnz = g_new_nnz
        mu = minhashes[u]
        for nei in (neiu | neiv):
            if nei not in neiu:
                mu.update(nei)
            if nei == u or nei == v:
                continue
            # Leave v in nei's MinHash
            if nei not in neiu:
                m = minhashes[nei]
                m.update(u)
                if str(nei) in lsh:
                    lsh.remove(str(nei))
                lsh.insert(str(nei), m)
                minhashes[nei] = m
        if str(u) in lsh:
            lsh.remove(str(u))
        if str(v) in lsh:
            lsh.remove(str(v))
        lsh.insert(str(u), mu)
        minhashes[u] = mu
        minhashes[v] = None

    elapsed = time.time() - start_time
    logger.info(
        f"Summarize {N} nodes to {cnt} nodes, costs {elapsed} seconds, "
        f"final length: {length}")
    return lilm, nodes
data = {jsonFile["id_str"] : [lot, lat] } data_dict.update(data) if counter <= upper_limit: counter = counter + 1 geolocated_list.append(jsonFile["id_str"]) m1 = MinHash(num_perm=128) assigned_lot=0;assigned_lat=0; start=[];end=[]; haversine_distances = list() #json file is being splitted from the collection for jsonFile in LHDB.db_collection.find(): splitted_word = jsonFile["text"].split(' ') for tokenized in splitted_word: m1.update(tokenized.encode('utf8')) result1 = lsh1.query(m1) #assigning long and lat values in the non geo items for item in result1: if(item not in geolocated_list): assigned_lot = data_dict[jsonFile["id_str"]][0] assigned_lat = data_dict[jsonFile["id_str"]][1] lot = data_dict[item][0] lat = data_dict[item][1] start = (lat,lot) end = (assigned_lat,assigned_lot) distance = haversine(start, end) haversine_distances.append(distance) lsh1.remove(item) print(haversine_distances)
def randomize_lsh_search(arr, seq_all, valid_events, th_lsh, sample_size=128, beta=1):
    # initialize lsh
    # calculate the occurrence pattern of every event
    vector_all = {}
    for sid in arr:
        vector_all[sid] = arr[sid]['pattern']
    m_all = {}
    for sid in vector_all:
        m_all[sid] = MinHash(num_perm=128)
        for d in vector_all[sid]:
            m_all[sid].update(str(d).encode('utf8'))
    lsh = MinHashLSH(threshold=th_lsh, num_perm=sample_size)
    for sid in m_all:
        # 'sid' is the label to put out, 'm_all' holds the sequence data to put in
        lsh.insert(str(sid), m_all[sid])

    # main loop
    rs = {}
    count = 0
    # find the longest sequence
    longestSeq = 0
    id_to_merge = ""
    for userkey in arr:
        if len(arr[userkey]['pattern']) > longestSeq:
            longestSeq = len(arr[userkey]['pattern'])
            id_to_merge = userkey

    while len(arr) > 0:
        # find the candidates ------ similar in minHash
        lsh_candidates = lsh.query(m_all[id_to_merge])
        del lsh_candidates[lsh_candidates.index(id_to_merge)]
        cost_min = 0
        cost_temp = 0
        el = {}
        el_temp = {}
        Tag = {}
        bound = {}
        if len(arr) == 1:
            rs[id_to_merge] = arr[id_to_merge]
            break
        for id_candidates in lsh_candidates:
            if arr[id_to_merge]['size'] == 1 and arr[id_candidates]['size'] == 1:
                el_temp, cost_temp, Tag[id_candidates] = merge(
                    arr[id_to_merge], arr[id_candidates], seq_all,
                    valid_events, beta=beta)
                bound[id_candidates] = [cost_temp, cost_temp]
            else:
                bound[id_candidates], Tag[id_candidates] = calbound(
                    arr[id_to_merge], arr[id_candidates], seq_all,
                    valid_events, beta=beta)
        if len(bound) > 0:
            minBound = min([bound[key][1] for key in bound])
        # find out which candidate is the best for merging
        for id_candidates in lsh_candidates:
            # take candidates whose minimal LCS distance is too far from the
            # best bound out of consideration
            if (bound[id_candidates][0] - minBound) > 0.001 and \
                    Tag[id_candidates] == False:
                continue
            count += 1
            el_temp, cost_temp, overlapTag = merge(
                arr[id_to_merge], arr[id_candidates], seq_all,
                valid_events, beta=beta)
            if cost_temp < cost_min:
                cost_min = cost_temp
                el = el_temp
                id_merged = id_candidates
        if cost_min < 0:
            del arr[id_to_merge]
            del arr[id_merged]
            del m_all[id_to_merge]
            del m_all[id_merged]
            lsh.remove(id_to_merge)
            lsh.remove(id_merged)
            arr[el['id'][0]] = el
            vector_temp = el['pattern']
            m_all[el['id'][0]] = MinHash(num_perm=128)
            for d in vector_temp:
                m_all[el['id'][0]].update(str(d).encode('utf8'))
            lsh.insert(el['id'][0], m_all[el['id'][0]])
        else:
            rs[id_to_merge] = arr[id_to_merge]
            del arr[id_to_merge]
            del m_all[id_to_merge]
            lsh.remove(id_to_merge)
        # find the longest sequence
        longestSeq = 0
        id_to_merge = ""
        for userkey in arr:
            if len(arr[userkey]['pattern']) > longestSeq:
                longestSeq = len(arr[userkey]['pattern'])
                id_to_merge = userkey
    return rs
    shingle_set = get_k_shingles(exp, shingle_size)
    mh = MinHash()  # create MinHash for exp
    for s in shingle_set:
        mh.update(s.encode('utf8'))  # fold shingle s into the MinHash
    minhash_dict[idx] = LeanMinHash(mh)
print(now_time() + 'Created Minhash')
del sentences  # to save memory

for sim_threshold in sim_thresholds:
    # the MinHashes are created once, then reused when testing multiple similarity values
    lsh = MinHashLSH(threshold=sim_threshold)  # create LSH index
    for idx, mh in minhash_dict.items():
        lsh.insert(str(idx), mh)
    print(now_time() + 'Created LSH for similarity {}'.format(sim_threshold))

    queried_ids = set()  # way more efficient than a list
    exp_id_groups = []
    for idx, mh in minhash_dict.items():
        if idx in queried_ids:
            continue
        one_group_ids_str = lsh.query(mh)  # id list of one group of duplicate sentences
        for i in one_group_ids_str:
            lsh.remove(i)  # for efficiency
        one_group_ids_int = [int(i) for i in one_group_ids_str]
        if len(one_group_ids_int) > group_size:
            exp_id_groups.append(one_group_ids_int)  # only keep groups with enough sentences
        for i in one_group_ids_int:
            queried_ids.add(i)

    pickle.dump(exp_id_groups,
                open(directory + 'groups{}.pickle'.format(sim_threshold), 'wb'))
    print(now_time() + 'Saved a file for similarity {}'.format(sim_threshold))
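# The query-then-remove grouping idiom above generalizes to any key -> MinHash
# mapping. A minimal, self-contained sketch (the function name and parameters
# are illustrative, not part of the original script): each key is assigned to
# exactly one group, because a group's members are removed from the index as
# soon as the group has been queried.
from datasketch import MinHashLSH


def group_near_duplicates(minhash_dict, threshold=0.9, num_perm=128):
    """Group keys whose MinHashes collide in an LSH index at `threshold`."""
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for key, mh in minhash_dict.items():
        lsh.insert(key, mh)

    groups = []
    for key, mh in minhash_dict.items():
        if key not in lsh:           # already claimed by an earlier group
            continue
        group = lsh.query(mh)        # includes `key` itself
        for member in group:
            lsh.remove(member)       # so later queries skip this group
        groups.append(group)
    return groups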
class FrequentCollector:
    """
    Parts of the frequent paragraph collection algorithm in
    :func:`collect_frequent` have been moved here to make the code more
    readable.
    """
    # The default (dummy) bootstrap tuple used when there is no bootstrap data
    BOOTSTRAP_TUPLE = (None, 0, [])

    def __init__(self, threshold: float, permutations: int, decay: float,
                 min_freq: int, bootstrap: Union[RandomPDataReader, None] = None,
                 decay_filter: str = 'score < 0.5',
                 wrap_filter: str = 'count >= min_freq'):
        self.threshold = threshold
        self.permutations = permutations
        self.decay = decay
        self.min_freq = min_freq
        self.bootstrap = bootstrap or {}
        self.decay_filter = Filter(decay_filter)
        self.wrap_filter = Filter(wrap_filter)
        logging.debug('Decay filter: {}'.format(decay_filter))
        logging.debug('Wrap filter: {}'.format(wrap_filter))

    def reset(self, domain):
        """Resets the bookkeeping and statistics objects."""
        self.lsh = MinHashLSH(threshold=self.threshold,
                              num_perm=self.permutations)
        self.freq_ps = {}  # type: Dict[str, PData]
        self.num_dup = 0

        # Bootstrap the domain frequency counts if previous data is available
        _, docs, pdatas = self.bootstrap.get(domain, self.BOOTSTRAP_TUPLE)
        self.stats = CollectStats(domains=1, docs=docs)
        for pdata_id, pdata in enumerate(pdatas, start=1):
            self.lsh.insert(str(pdata_id), pdata.minhash)
            self.freq_ps[str(pdata_id)] = pdata

    def collect_from_doc(self, url: str, paragraphs: List[Any]):
        """
        Runs the algorithm in MMDS (TODO) on a document, does the bookkeeping
        and updates the statistics in the object.

        :param url: the URL of the document (used as key in the LSH).
        :param paragraphs: the minhashes of the paragraphs of the document.
        """
        # Step 1: decrease the score of all paragraphs
        for pdata in self.freq_ps.values():
            pdata *= self.decay

        # Step 2: add new paragraphs to the roster
        already_increased = set()  # type: Set[str]
        for p, mh in enumerate(paragraphs, start=1):
            found_dup = False
            for duplicate in self.lsh.query(mh):
                # Ensure that the paragraph counter is increased by
                # at most one per document
                if duplicate not in already_increased:
                    self.freq_ps[duplicate] += 1
                    already_increased.add(duplicate)
                if not found_dup:
                    found_dup = True
                    self.num_dup += 1
            if not found_dup:
                # OK, this is a new paragraph
                key = url + '_' + str(p)
                self.lsh.insert(key, mh)
                self.freq_ps[key] = PData(mh)
                already_increased.add(key)
        self.stats.docs += 1
        self.stats.ps += p

        # Step 3: drop paragraphs with a low score
        to_drop = [
            key for key, pdata in self.freq_ps.items()
            if self.decay_filter(score=pdata.score, count=pdata.count)
        ]
        for key in to_drop:
            self.freq_ps.pop(key)
            self.lsh.remove(key)

    def wrap_up_domain(self):
        """
        Drops all frequent candidates that are below the minimum frequency and
        updates the statistics.
        """
        # Get rid of paragraphs that only occurred once
        self.freq_ps = {
            key: pdata for key, pdata in self.freq_ps.items()
            if self.wrap_filter(score=pdata.score, count=pdata.count,
                                min_freq=self.min_freq, docs=self.stats.docs)
        }
        self.stats.frequents = len(self.freq_ps)
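# A tiny numeric illustration of the decay/drop bookkeeping in Step 1 and
# Step 3 above, using plain floats instead of PData. It assumes (this is not
# shown in the class itself) that a paragraph's score starts at 1.0 when first
# seen and is boosted by later occurrences; with decay=0.5 and the default
# 'score < 0.5' filter, a paragraph that is never seen again survives exactly
# one more document before it is dropped.
decay, score = 0.5, 1.0
for doc in range(1, 4):
    score *= decay
    print(doc, score, 'dropped' if score < 0.5 else 'kept')
# 1 0.5   kept
# 2 0.25  dropped
# 3 0.125 dropped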
def deduplicate_other(main_batch, batches_to_subtract, output_dir,
                      threshold, permutations):
    """
    Removes all documents from a set of minhashed documents (3 files with the
    same minhash prefix) that occur in other batches. Both main_batch and
    batches_to_subtract should be batch prefixes.

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    main_base = op.basename(main_batch)
    logging.info('Processing input batch {}...'.format(main_base))

    # First, load the (already deduplicated) batch...
    for input_file, results in read_batch(main_batch):
        for doc_id, minhash in zip(results['id'], results['minhash']):
            lsh.insert('\t'.join(doc_id), minhash)
    initial_len = len(lsh.keys)

    # Now, remove all documents in it that are contained in the batches
    # to subtract
    content_duplicates, url_duplicates = 0, 0
    for batch in batches_to_subtract:
        batch_content_duplicates, batch_url_duplicates = 0, 0
        initial_batch_len = len(lsh.keys)
        for _, results in read_batch(batch):
            for doc_id, minhash in zip(results['id'], results['minhash']):
                # Use the same key format as at insertion time
                key = '\t'.join(doc_id)
                if key in lsh:
                    batch_url_duplicates += 1
                    lsh.remove(key)
                else:
                    for duplicate in lsh.query(minhash):
                        lsh.remove(duplicate)
                        batch_content_duplicates += 1
        logging.info(
            'Cross-deduplicated input batch {} with cross batch {}: {} -> {} '
            'documents (removed {} by url, {} by content).'.format(
                main_base, op.basename(batch), initial_batch_len,
                len(lsh.keys), batch_url_duplicates, batch_content_duplicates))
        content_duplicates += batch_content_duplicates
        url_duplicates += batch_url_duplicates

    # Finally, we print the documents left. Unfortunately, in order to
    # keep the format, we have to read the original batch again.
    with closing(
        BatchWriter(sys.maxsize, output_dir, len(main_base), int(main_base))
    ) as bw:
        # OK, we need to re-read the batch unfortunately
        for input_file, results in read_batch(main_batch):
            doc_ids, minhashes = [], []
            for doc_id, minhash in zip(results['id'], results['minhash']):
                if '\t'.join(doc_id) in lsh:
                    doc_ids.append(doc_id)
                    minhashes.append(minhash)
            bw.write_results(input_file, {'id': doc_ids, 'minhash': minhashes})

    logging.info('Processed input batch {}; kept {} out of {} documents '
                 '(removed {} by url, {} by content).'.format(
                     main_base, len(lsh.keys), initial_len,
                     url_duplicates, content_duplicates))
    return len(lsh.keys), initial_len
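# Stripped of the batch I/O, the core of the subtraction above is: fill an LSH
# index with the documents to keep, then drop any key that collides, either by
# exact document key or by near-duplicate content, with a document from the
# other set. A minimal sketch with generic names (not the module's API); keep
# and other are assumed to map document keys to MinHash objects built with
# `num_perm` permutations.
from datasketch import MinHashLSH


def subtract(keep, other, threshold=0.9, num_perm=128):
    """Return the keys of `keep` that have no (near-)duplicate in `other`."""
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for key, mh in keep.items():
        lsh.insert(key, mh)
    for key, mh in other.items():
        if key in lsh:                 # exact key (e.g. URL) match
            lsh.remove(key)
        else:
            for dup in lsh.query(mh):  # near-duplicate content match
                lsh.remove(dup)
    return set(lsh.keys)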