def test_diverse(self):
    hashes = [
        0x00000000, 0x10101000, 0x10100010, 0x10001010, 0x00101010,
        0x01010100, 0x01010001, 0x01000101, 0x00010101
    ]
    expected = [
        (0x00000000, 0x10101000), (0x00000000, 0x10100010),
        (0x00000000, 0x10001010), (0x00000000, 0x00101010),
        (0x00000000, 0x01010100), (0x00000000, 0x01010001),
        (0x00000000, 0x01000101), (0x00000000, 0x00010101),
        (0x00101010, 0x10001010), (0x00101010, 0x10100010),
        (0x00101010, 0x10101000), (0x10001010, 0x10100010),
        (0x10001010, 0x10101000), (0x10100010, 0x10101000),
        (0x00010101, 0x01000101), (0x00010101, 0x01010001),
        (0x00010101, 0x01010100), (0x01000101, 0x01010001),
        (0x01000101, 0x01010100), (0x01010001, 0x01010100)
    ]
    for blocks in range(4, 10):
        self.assertEqual(
            sorted(expected),
            sorted(simhash.find_all(hashes, blocks, 3)))
def query(self, distance=2, blocks='auto'):
    """Find all the nearest neighbours in the dataset.

    Parameters
    ----------
    distance : int, default=2
        Maximum number of differing bits between two simhashes
    blocks : int or 'auto', default='auto'
        Number of blocks into which the simhash is split
        when searching for duplicates,
        see https://github.com/seomoz/simhash-py

    Returns
    -------
    simhash : array
        the simhash value for all documents in the collection
    cluster_id : array
        exact duplicates (documents with the same simhash)
        share the same cluster_id
    dup_pairs : list
        list of tuples for the near-duplicate pairs
    """
    from simhash import find_all

    if distance >= 64:
        raise ValueError(
            'Wrong input parameter distance = {}. '
            'Must be less than 64!'.format(distance))

    _, cluster_id = np.unique(self._fit_shash, return_inverse=True)

    if blocks == 'auto':
        blocks = min(distance * 2, 64)
    matches = find_all(self._fit_shash, blocks, distance)

    return self._fit_shash, cluster_id, matches
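# A minimal, self-contained sketch of the call shape query() relies on, using
# plain 64-bit integer simhashes (the sample values below are illustrative; in
# query() the hashes come from self._fit_shash):
from simhash import find_all, num_differing_bits

hashes = [0x1F0F0F0F0F0F0F0F, 0x1F0F0F0F0F0F0F0B, 0x0123456789ABCDEF]

distance = 2
blocks = min(distance * 2, 64)  # the same 'auto' heuristic used above
for a, b in find_all(hashes, blocks, distance):
    # The first two sample hashes differ in a single bit, so they are reported.
    print(hex(a), hex(b), num_differing_bits(a, b))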
def prune(df):
    adj_list = collections.defaultdict(set)

    # Self matches
    identity_match_count = 0
    for sh, subdf in df.groupby(["simhash", "year_season"]):
        for i in subdf.index:
            for j in subdf.index:
                if i != j:
                    identity_match_count += 1
                    adj_list[i].add(j)

    # Cross matches
    for m1, m2 in simhash.find_all(df.simhash, 2, 1):
        for i in df[df.simhash == m1].index:
            for j in df[(df.simhash == m2)
                        & (df.year_season == df.loc[i].year_season)].index:
                adj_list[i].add(j)
                adj_list[j].add(i)

    rev_map, rep_map = bfs(adj_list)
    to_keep = set((i for i in rep_map if i in rep_map[i])) | (set(df.index) - set(rev_map.keys()))
    df_pruned = df.loc[to_keep]
    logger.info(
        f"Removed {len(df) - len(df_pruned)} of {len(df)} documents due to similarity. "
        f"Wanted to keep {len(to_keep)}"
    )
    return df_pruned
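# The bfs() helper used above (and in later snippets) is not shown. A hedged
# sketch of a connected-components helper consistent with how its return values
# are consumed here -- rev_map maps each node to its component representative,
# rep_map maps each representative to the full member set; this implementation
# is an assumption, not the original code:
import collections

def bfs(adj_list):
    # Breadth-first traversal over the (undirected) similarity graph.
    rev_map = {}  # node -> representative of its component
    rep_map = {}  # representative -> set of all nodes in the component
    for start in adj_list:
        if start in rev_map:
            continue
        component = {start}
        queue = collections.deque([start])
        while queue:
            node = queue.popleft()
            for neighbour in adj_list.get(node, ()):
                if neighbour not in component:
                    component.add(neighbour)
                    queue.append(neighbour)
        for node in component:
            rev_map[node] = start
        rep_map[start] = component
    return rev_map, rep_map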
def find_changed_policies():
    policies_cache = defaultdict(lambda: defaultdict(dict))
    for h, d, y, s in util.get_pool().map(
            fnd.hash_text,
            ((data["policy_text"], data["site_url"], data["year"], data["season"])
             for data, cols in util.load_all_policies())):
        policies_cache[d][y][s] = h

    changed_pols = defaultdict(lambda: defaultdict(lambda: 0))
    all_pols = defaultdict(lambda: defaultdict(lambda: 0))
    for dom in policies_cache:
        print(policies_cache[dom])
        prev_pol = None
        for y, s in util.iter_year_season():
            try:
                print(y, s)
                print(policies_cache[dom][y])
                pol = policies_cache[dom][y][s]
            except KeyError:
                continue
            if prev_pol is not None:
                if len(simhash.find_all([prev_pol, pol], 4, 3)) != 0:
                    changed_pols[y][s] += 1
            all_pols[y][s] += 1
            # remember this season's hash for the next comparison
            prev_pol = pol

    return ([changed_pols[y][s] for y, s in util.iter_year_season()],
            [all_pols[y][s] for y, s in util.iter_year_season()])
def __find_matches(self, hashes, blocks, distance):
    start = time.time()
    m = simhash.find_all(hashes, blocks, distance)
    average_time = (time.time() - start) / len(hashes)
    for i in range(len(hashes)):
        self.find_time_dict.update({i: average_time})
    return m
def hammingCompare(outtweets, innerTwitter):
    client = retinasdk.FullClient(apiKey.retina_token,
                                  apiServer="http://api.cortical.io/rest",
                                  retinaName="en_associative")
    liteClient = retinasdk.LiteClient(apiKey.retina_token)
    res = []
    for index, outtweet in enumerate(outtweets):
        result = {}
        # get simHash
        simhash_pair = getSimHash(outtweet[2], innerTwitter, client)
        if len(simhash_pair) > 1:
            diff_bits = simhash.num_differing_bits(simhash_pair['out_hash'],
                                                   simhash_pair['in_hash'])
            hashes = [simhash_pair['out_hash'], simhash_pair['in_hash']]
            blocks = 4    # Number of blocks to use
            distance = 3  # Number of bits that may differ in matching pairs
            matches = simhash.find_all(hashes, blocks, distance)
            res.append([index, outtweet[2], matches])
    return res
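# Note: for a single pair like the one above, calling find_all on a two-element
# list is equivalent to comparing the already-computed bit distance directly;
# a minimal sketch of that cheaper check (the function name is illustrative):
import simhash

def is_near_duplicate(out_hash, in_hash, distance=3):
    # True exactly when find_all([out_hash, in_hash], blocks, distance) would
    # report the pair, without building the block permutation tables.
    return simhash.num_differing_bits(out_hash, in_hash) <= distance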
def test_basic(self):
    hashes = [
        0x000000FF, 0x000000EF, 0x000000EE, 0x000000CE, 0x00000033,
        0x0000FF00, 0x0000EF00, 0x0000EE00, 0x0000CE00, 0x00003300,
        0x00FF0000, 0x00EF0000, 0x00EE0000, 0x00CE0000, 0x00330000,
        0xFF000000, 0xEF000000, 0xEE000000, 0xCE000000, 0x33000000
    ]
    expected = [
        (0x000000EF, 0x000000FF), (0x000000EE, 0x000000EF),
        (0x000000EE, 0x000000FF), (0x000000CE, 0x000000EE),
        (0x000000CE, 0x000000EF), (0x000000CE, 0x000000FF),
        (0x0000EF00, 0x0000FF00), (0x0000EE00, 0x0000EF00),
        (0x0000EE00, 0x0000FF00), (0x0000CE00, 0x0000EE00),
        (0x0000CE00, 0x0000EF00), (0x0000CE00, 0x0000FF00),
        (0x00EF0000, 0x00FF0000), (0x00EE0000, 0x00EF0000),
        (0x00EE0000, 0x00FF0000), (0x00CE0000, 0x00EE0000),
        (0x00CE0000, 0x00EF0000), (0x00CE0000, 0x00FF0000),
        (0xEF000000, 0xFF000000), (0xEE000000, 0xEF000000),
        (0xEE000000, 0xFF000000), (0xCE000000, 0xEE000000),
        (0xCE000000, 0xEF000000), (0xCE000000, 0xFF000000)
    ]
    for blocks in range(4, 10):
        self.assertEqual(
            sorted(expected),
            sorted(simhash.find_all(hashes, blocks, 3)))
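# Why every block count from 4 to 9 gives the same answer above: with a
# distance of 3, two matching hashes disagree in at most 3 of their blocks, so
# they agree on at least blocks - 3 of them; the simhash-py README requires
# blocks > distance, and under that condition the block-permutation search is
# exact, making the result independent of the exact block count. A small
# illustrative check on the first group of hashes (values copied from the test):
import itertools
import simhash

hashes = [0x000000FF, 0x000000EF, 0x000000EE, 0x000000CE, 0x00000033]
for a, b in itertools.combinations(sorted(hashes), 2):
    # Pairs differing in at most 3 bits are exactly the expected ones;
    # 0x00000033 differs from the others by 4 or more bits and matches nothing.
    print(hex(a), hex(b), simhash.num_differing_bits(a, b))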
if args.random:
    if args.hashes:
        print('Random supplied with --hashes')
        exit(1)
    if not hashes:
        print('Generating %i hashes' % args.random)
        # Inclusive upper bound must stay within 64 bits
        hashes = [random.randint(0, (1 << 64) - 1) for i in range(args.random)]
elif not args.hashes:
    print('No hashes or queries supplied')
    exit(2)


class Timer(object):
    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self.start = -time.time()
        print('Starting %s' % self.name)
        return self

    def __exit__(self, t, v, tb):
        self.start += time.time()
        if t:
            print('    Failed %s in %fs' % (self.name, self.start))
        else:
            print('    Ran %s in %fs' % (self.name, self.start))


with Timer('Find all'):
    len(simhash.find_all(hashes, args.blocks, args.bits))
def dedup_near(infile, outfile, b, k, debug=False):
    """
    """
    #
    removelist = []
    grplist = []
    #
    writer = open(outfile, 'w')
    reader = open(infile, 'rb')
    startColid = getStartColumn(infile)
    #
    duphash = {}  # hash -> set(lineid)
    #
    linecnt = 0
    data_h = []   # list of hash val
    index = {}    # hash val -> lineid
    data_v = {}   # lineid -> data
    for line in reader:
        #apos = line.find(' ')
        apos = getStartPos(line, startColid)
        if apos >= 0:
            hash = compute(line[apos:])
            data_h.append(hash)
            # here a duplicate hash exists
            if hash in index:
                # add the same line into the same group
                # set grpid to the grpid of the last lineid with equal hash value
                if hash in duphash:
                    duphash[hash].append(linecnt)
                else:
                    # init with the first lineid
                    duphash[hash] = [index[hash]]
                    duphash[hash].append(linecnt)
            else:
                index[hash] = linecnt
            data_v[linecnt] = line
            #data_v[linecnt] = line[apos:]
            linecnt += 1

    #logger.info('lines=%s', '\n'.join([data_v[x] for x in range(5)]))
    #
    logger.info('hash=%s', data_h[:5])
    if debug:
        with open('hash.txt', 'w') as hashf:
            for h in data_h:
                hashf.write('%s\n' % h)
        with open('hash_full.txt', 'w') as hashf:
            for idx in range(len(data_h)):
                hashf.write('%s %s' % (data_h[idx], data_v[idx]))

    # output the match group to .log
    grpwriter = open(outfile + '.log', 'w')
    for key in duphash.keys():
        ids = duphash[key]
        # only the first one is kept
        removelist.extend(ids[1:])
        grplist.append(ids)
        grpwriter.write('ids:%s\n' % ' '.join([str(x) for x in ids]))
        # write the group of matches
        for lineid in ids:
            grpwriter.write('%s' % data_v[lineid])
        grpwriter.write('==================\n')

    logger.info('duphash removecnt=%d, linecnt = %s', len(removelist), linecnt)

    # find all pairs of matches
    matches = simhash.find_all(data_h, b, k)

    marks = {}     # lineid -> groupid
    grpindex = {}  # groupid -> [lineids]
    groupid = 0
    for A, B in matches:
        grpidA, grpidB = -1, -1
        if index[A] in marks:
            grpidA = marks[index[A]]
        if index[B] in marks:
            grpidB = marks[index[B]]
        if grpidA == -1 and grpidB == -1:
            # new pair
            marks[index[A]] = groupid
            marks[index[B]] = groupid
            grpindex[groupid] = set([index[A], index[B]])
            groupid += 1
        elif grpidA == -1:
            # add A to B's group
            marks[index[A]] = grpidB
            grpindex[grpidB].add(index[A])
        elif grpidB == -1:
            # add B to A's group
            marks[index[B]] = grpidA
            grpindex[grpidA].add(index[B])
        else:
            # merge two old groups
            for lid in grpindex[grpidB]:
                marks[lid] = grpidA
                grpindex[grpidA].add(lid)
            grpindex[grpidB].clear()

    # output the groups
    #grpwriter = open(outfile + '.log', 'w')
    linecntx = 0
    for grp in grpindex.keys():
        if grpindex[grp]:
            ids = [lid for lid in grpindex[grp]]
            ids = sorted(ids, reverse=True)
            linecntx += len(ids[1:])
            # output the first one
            removelist.extend(ids[1:])
            grplist.append(ids)
            # output all
            grpwriter.write('ids:%s\n' % ids)
            # write the group of matches
            for lineid in ids:
                grpwriter.write('%s' % data_v[lineid])
            grpwriter.write('==================\n')

    logger.info('total removecnt=%d, linecntx = %s, grpcnt=%d',
                len(removelist), linecntx, len(grpindex.keys()))

    # output the final result
    remove = set(removelist)
    for lid in range(linecnt):
        if lid not in remove and lid in data_v:
            writer.write('%s' % data_v[lid])

    # output the grplist
    with open(outfile + '.grp', 'w') as grpf:
        for grp in grplist:
            if len(grp) > 1:
                grpf.write('%s\n' % ' '.join([str(x) for x in grp]))
            else:
                grpf.write('%s\n' % grp[0])

    reader.close()
    writer.close()
def dedup_near(data, k, b):
    removelist = []
    grplist = []
    duphash = {}  # hash -> set(lineid)
    linecnt = 0
    data_h = []   # list of hash val
    index = {}    # hash val -> lineid
    data_v = {}   # lineid -> data
    for line in data:
        hash = compute(line)
        if hash in index:
            if hash in duphash:
                duphash[hash].append(linecnt)
            else:
                duphash[hash] = [index[hash]]
                duphash[hash].append(linecnt)
        else:
            index[hash] = linecnt
        data_v[linecnt] = line
        data_h.append(hash)
        linecnt += 1

    for key in duphash.keys():
        ids = duphash[key]
        removelist.extend(ids[1:])
        grplist.append(ids)

    logger.info('duphash removecnt=%d, linecnt = %s', len(removelist), linecnt)

    matches = simhash.find_all(data_h, b, k)

    marks = {}     # lineid -> groupid
    grpindex = {}  # groupid -> [lineids]
    groupid = 0
    for A, B in matches:
        grpidA, grpidB = -1, -1
        if index[A] in marks:
            grpidA = marks[index[A]]
        if index[B] in marks:
            grpidB = marks[index[B]]
        if grpidA == -1 and grpidB == -1:
            # new pair
            marks[index[A]] = groupid
            marks[index[B]] = groupid
            grpindex[groupid] = set([index[A], index[B]])
            groupid += 1
        elif grpidA == -1:
            # add A to B's group
            marks[index[A]] = grpidB
            grpindex[grpidB].add(index[A])
        elif grpidB == -1:
            # add B to A's group
            marks[index[B]] = grpidA
            grpindex[grpidA].add(index[B])
        else:
            # merge two old groups
            for lid in grpindex[grpidB]:
                marks[lid] = grpidA
                grpindex[grpidA].add(lid)
            grpindex[grpidB].clear()

    linecntx = 0
    for grp in grpindex.keys():
        if grpindex[grp]:
            ids = [lid for lid in grpindex[grp]]
            ids = sorted(ids, reverse=True)
            linecntx += len(ids[1:])
            # keep the first one
            removelist.extend(ids[1:])
            grplist.append(ids)

    logger.info('total removecnt=%d, linecntx = %s, grpcnt=%d',
                len(removelist), linecntx, len(grpindex.keys()))

    remain = []
    remove = set(removelist)
    for lid in range(linecnt):
        if lid not in remove and lid in data_v:
            remain.append(data_v[lid])

    with open('grp', 'w') as grpf:
        for grp in grplist:
            if len(grp) > 1:
                for id in grp:
                    grpf.write('%s\n' % (data_v[id].replace(" ", "")))
                grpf.write('###############\n')

    return remain
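# compute() above is defined elsewhere in the original module. A hedged sketch
# of how it might be filled in with simhash-py primitives and how dedup_near
# could then be driven; compute(), the shingle window and the sample lines are
# illustrative assumptions, not the original helper:
import re
import logging
import simhash

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def compute(line, window=3):
    # Shingle the tokens of one line and fold the shingle hashes into a
    # single 64-bit simhash.
    tokens = re.split(r'\W+', line.lower())
    shingles = [' '.join(tokens[i:i + window])
                for i in range(max(1, len(tokens) - window + 1))]
    return simhash.compute(
        [simhash.unsigned_hash(s.encode('utf-8')) for s in shingles])

lines = [
    'the quick brown fox jumps over the lazy dog',
    'the quick brown fox jumps over the lazy dog',  # exact duplicate
    'an unrelated line of text about something else',
]
print(dedup_near(lines, k=3, b=6))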
            duphash[hash] = [index[hash]]
            duphash[hash].append(linecnt)
    else:
        index[hash] = linecnt
    data_v[linecnt] = line
    linecnt += 1

print('duphash', duphash)

for key in duphash.keys():
    ids = duphash[key]
    removelist.extend(ids[1:])
    grplist.append(ids)

b = 4
k = 3
matches = simhash.find_all(data_h, b, k)

marks = {}     # lineid -> groupid
grpindex = {}  # groupid -> [lineids]
groupid = 0
for A, B in matches:
    grpidA, grpidB = -1, -1
    if index[A] in marks:
        grpidA = marks[index[A]]
    if index[B] in marks:
        grpidB = marks[index[B]]
    if grpidA == -1 and grpidB == -1:
        marks[index[A]] = groupid
        marks[index[B]] = groupid
        grpindex[groupid] = set([index[A], index[B]])
data = []
with Pool(4) as p:
    for d in p.imap_unordered(one, data_one):
        data.append(d)

data.sort(key=lambda t: t['date_parsed'])

by_hash = defaultdict(list)
by_root = defaultdict(list)
for i, d in enumerate(data):
    root = d['root_url']
    by_root[root].append(d)
    if hashing:
        by_hash.setdefault(d['text_simhash'], []).append(i)

if hashing:
    matching_hashes = simhash.find_all(by_hash.keys(), 8, 6)

matching_indices = {}
if hashing:
    for a, b in matching_hashes:
        for i in by_hash[a]:
            for j in by_hash[b]:
                matching_indices.setdefault(i, set()).add(j)
                matching_indices.setdefault(j, set()).add(i)

# starting from the minimal i
follower = {}
following = {}
def make_textsim_graph(filtername):
    try:
        print("making dirs: %s" % ("../data/text_sim/%s/" % filtername))
        os.makedirs("../data/text_sim/%s/" % filtername)
    except:
        pass
    p = util.get_pool()
    args = []
    for data, cols in ioutils.load_all_policies(limit=-1, filtername=filtername):
        text = data["policy_text"]
        domain = data["site_url"]
        year = str(data["year"])
        season = data["season"]
        args.append((text, domain, int(year), season))
    print("Total docs is %d" % len(args))

    simhashes = {}
    all_hashes = []
    sentences = {}
    for h, sentence, domain, year, season in p.map(hash_text, args):
        if sentence not in sentences:
            sentId = len(sentences)
            sentences[sentence] = sentId
        else:
            sentId = sentences[sentence]
        if h not in simhashes:
            simhashes[h] = {}
        simhashes[h][(domain, year, season)] = sentId
        all_hashes.append(h)

    matches = simhash.find_all(all_hashes, SIMHASH_THRESH + 1, SIMHASH_THRESH)

    sentence_inv = {}
    for s in sorted(sentences, key=lambda x: sentences[x]):
        i = sentences[s]
        sentence_inv[i] = s
    del sentences

    lzma_filters = my_filters = [
        {
            "id": lzma.FILTER_LZMA2,
            "preset": 9 | lzma.PRESET_EXTREME,
            "dict_size": 100000,  # ~10k words in an English speaker's vocab, x10 for good measure
            "lc": 3,
            "lp": 0,
            "pb": 0,  # assume ascii
            "mode": lzma.MODE_NORMAL,
            "nice_len": 273,
            "mf": lzma.MF_BT4
        }
    ]

    adj = {}
    adj_sen = {}
    adj_sen_dom = {}
    self_match = [(h, h) for h in simhashes if len(simhashes[h]) > 1]
    if SAMPLE:
        dist_bins = [[] for i in range(10)]
    accepted = 0
    rejected = 0
    rejected_low_pass = 0
    for l, r in itertools.chain(matches, self_match):
        lpols = simhashes[l].keys()
        rpols = simhashes[r].keys()
        ldomains = set((dom for dom, _, _ in simhashes[l]))
        rdomains = set((dom for dom, _, _ in simhashes[r]))
        domains = ldomains.union(rdomains)
        if l == r or len(domains) > max(len(ldomains), len(rdomains)):
            for ld, ly, ls in lpols:
                for rd, ry, rs in rpols:
                    lt = ly * 10 + seasonToOrd[ls]
                    rt = ry * 10 + seasonToOrd[rs]
                    if lt == rt:
                        if CROSS_YEAR_ONLY:
                            continue
                        else:
                            first = "%d%s_%s" % (ly, ls, ld)
                            second = "%d%s_%s" % (ry, rs, rd)
                    elif lt < rt:
                        first = "%d%s_%s" % (ly, ls, ld)
                        second = "%d%s_%s" % (ry, rs, rd)
                    else:
                        first = "%d%s_%s" % (ry, rs, rd)
                        second = "%d%s_%s" % (ly, ls, ld)
                    if first not in adj:
                        adj[first] = []
                    adj[first].append(second)

                    lId = simhashes[l][ld, ly, ls]
                    rId = simhashes[r][rd, ry, rs]
                    if FUZZ_THRESH == 0 and not USE_NCD:
                        # Anything will pass, no need to compute
                        comp_dist = 100
                    else:
                        comp_dist = check_distance(lId, rId, sentence_inv, lzma_filters)
                    if USE_NCD:
                        if len(sentence_inv[lId]) + len(sentence_inv[rId]) < 200:
                            comp_dist -= 0.3  # magic offset because NCD doesn't work well on small text
                        if comp_dist > NCD_THRESH:
                            print("Ruled out %s x %s (%f, %s, %s)" % (ld, rd, comp_dist, hex(l), hex(r)))
                            rejected += 1
                            continue
                    else:
                        if SAMPLE:
                            if comp_dist != 100 and comp_dist >= 90:
                                dist_bins[100 - (comp_dist + 1)].append((lId, rId, comp_dist))
                            if comp_dist < 90:
                                rejected_low_pass += 1
                        if comp_dist < FUZZ_THRESH:
                            # print("Ruled out %s x %s (%f, %s, %s)" % (ld, rd, comp_dist, hex(l), hex(r)))
                            rejected += 1
                            continue
                    accepted += 1
                    if lId not in adj_sen_dom:
                        adj_sen_dom[lId] = set()
                    adj_sen_dom[lId].add(first)
                    adj_sen_dom[lId].add(second)
                    if lId not in adj_sen:
                        adj_sen[lId] = set()
                    adj_sen[lId].add(rId)
                    if rId not in adj_sen:
                        adj_sen[rId] = set()
                    adj_sen[rId].add(lId)

    adj_rev, adj_rep = bfs(adj)
    print("Accepted: %d, rejected: %d, low pass: %d" % (accepted, rejected, rejected_low_pass))

    if SAMPLE:
        for i in range(len(dist_bins)):
            with open("../data/text_sim/sample_0_%d.txt" % i, "w+") as f:
                if len(dist_bins[i]) <= 50:
                    sample = dist_bins[i]
                else:
                    sample = random.sample(dist_bins[i], 10)
                for lId, rId, comp_dist in sample:
                    #print(lId, rId, comp_dist)
                    s1 = sentence_inv[lId]
                    s2 = sentence_inv[rId]
                    with open("../data/text_sim/s1_tmp.txt", "w+") as f1:
                        f1.write(s1)
                    with open("../data/text_sim/s2_tmp.txt", "w+") as f1:
                        f1.write(s2)
                    try:
                        diff = subprocess.check_output(
                            "echo \"diff -y <(fold -s -w72 ../data/text_sim/s1_tmp.txt) "
                            "<(fold -s -w72 ../data/text_sim/s2_tmp.txt) -W 200; exit 0\" | bash",
                            shell=True)
                    except subprocess.CalledProcessError as e:
                        if e.returncode == 2:
                            print(e.output)
                            sys.exit(1)
                        diff = e.output
                    diff = diff.decode()
                    f.write("%s\n%d\n%s\n" % ("=" * 40, comp_dist, "-" * 40))
                    f.write("%s\n" % (diff))

    with open("../data/text_sim/%s/policy_links.json" % filtername, "w+") as f:
        write_obj = []
        i = 0
        for s in adj_rep:
            l = [dom[6:] for dom in adj_rep[s]]
            write_obj.append({"id": i, "domains": l})
            i += 1
        json.dump(write_obj, f)
    #df.swifter.progress_bar(enable=True)
    df["policy_text"] = clean_text(df.policy_text)
    save_data(prune(df))
    return df


if __name__ == "__main__":
    df = main()

    adj_list = collections.defaultdict(set)

    # Self matches
    identity_match_count = 0
    for sh, subdf in df.groupby(["simhash", "year_season"]):
        for i in subdf.index:
            for j in subdf.index:
                if i != j:
                    identity_match_count += 1
                    adj_list[i].add(j)

    # Cross matches
    for m1, m2 in simhash.find_all(df.simhash, 2, 1):
        for i in df[df.simhash == m1].index:
            for j in df[(df.simhash == m2)
                        & (df.year_season == df.loc[i].year_season)].index:
                adj_list[i].add(j)
                adj_list[j].add(i)

    rev_map, rep_map = bfs(adj_list)