def get_simHashindex(hash_list):
    """
    Purpose: build a Simhash index.
    Args: hash_list -- a list of (key, Simhash) pairs.
    Returns: a SimhashIndex.
    """
    return SimhashIndex(hash_list, k=5)  # build the index
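# A minimal usage sketch for get_simHashindex (keys and texts are illustrative,
# not from the original source):
from simhash import Simhash

docs = {'1': 'the cat sat on the mat', '2': 'the cat sat on a mat', '3': 'entirely different text'}
hash_list = [(key, Simhash(text)) for key, text in docs.items()]
idx = get_simHashindex(hash_list)
# With tolerance k=5, the two near-identical sentences typically match:
print(idx.get_near_dups(Simhash('the cat sat on the mat')))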
def _build_index():
    global _INDEX
    index_list = []
    for domain in _TRAIN.keys():
        sim = Simhash(domain)
        index_list.append((domain, sim))
    # NOTE: k is the Hamming-distance tolerance; with the default 64-bit
    # fingerprint, k=100 exceeds the fingerprint width, so every pair of
    # hashes falls within tolerance.
    _INDEX = SimhashIndex(index_list, k=100)
def extract_next_links(url, resp) -> list:
    defrag = urldefrag(url)[0]
    print(defrag)
    if resp.status == 200:
        print("Scanning")
        if defrag not in urls:
            content = resp.raw_response.text
            data = getVisibleText(content)
            simmed = Simhash(data)
            if simmed.value not in sims:
                index = SimhashIndex(objs, k=3)
                if len(index.get_near_dups(simmed)) == 0:
                    urls.add(defrag)
                    sims.add(simmed.value)
                    objs.append((url, simmed))
                    print(len(urls), len(sims), len(objs))
                    with open("data_dump.txt", "a", errors="ignore") as dump:
                        dump.write(url + " \n " + data + "\n" + str(simmed.value) + "\n\n")
            # urls[defrag].add(getVisibleText(content))
            # print(urls[defrag])
            return getAllUrls(url, content)
    else:
        print("Can't scan")
        return []
def __init__(self, vocab_to_freq, f=64, k=32):
    self.vocab_to_freq = vocab_to_freq
    self.simhash_index = SimhashIndex([], f=f, k=k)
    self.f = f
    self.k = k
    simhash_index = self.simhash_index
    for w in vocab_to_freq:
        sh = Simhash(w, f=f)
        simhash_index.add(w, sh)
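# Hypothetical usage of the constructor above. The enclosing class name is not
# shown in the snippet, so VocabSimhashIndex here is an assumed stand-in:
from simhash import Simhash

vocab_to_freq = {'apple': 10, 'apply': 7, 'banana': 3}
vsi = VocabSimhashIndex(vocab_to_freq)
# Look up vocabulary words whose fingerprints are within k bits of a misspelling:
print(vsi.simhash_index.get_near_dups(Simhash('aple', f=64)))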
def clustering():
    fout = open('cluster.txt', 'w', encoding='UTF-8')
    cursor = conn.cursor()
    cursor.execute(
        'SELECT id, title, cluster, sim_count, link, simhash FROM entries where cluster=0'
    )
    entrylist = cursor.fetchall()
    objs = []
    entrydic = {}
    for item in entrylist:
        if not is_en(item[1]):
            # Chinese titles: skip weibo links, use the Chinese feature extractor.
            if not item[4].startswith("https://weibo.com"):
                sim = Simhash(get_features_cn(item[1]))
                objs.append((str(item[0]), sim))
                entrydic[str(item[0])] = {
                    'title': item[1],
                    'cluster': 0,
                    'sim_count': 0,
                    'link': item[4],
                    'simhash': sim.value
                }
        else:
            sim = Simhash(get_features(item[1]))
            objs.append((str(item[0]), sim))
            entrydic[str(item[0])] = {
                'title': item[1],
                'cluster': 0,
                'sim_count': 0,
                'link': item[4],
                'simhash': sim.value
            }
    index = SimhashIndex(objs, k=tolerance)
    cluster_num = last_cluster_num
    for key in entrydic:
        if entrydic[key]['cluster'] == 0:
            # (note: the lookup always uses the Chinese feature extractor,
            # even for English titles)
            sims = index.get_near_dups(
                Simhash(get_features_cn(entrydic[key]['title'])))
            for item in sims:
                entrydic[item]['cluster'] = cluster_num
                # if len(sims) > 1:
                entrydic[item]['sim_count'] = len(sims) - 1
                if len(sims) > 1:
                    fout.write(item + '\t' + str(entrydic[item]['cluster']) +
                               '\t' + entrydic[item]['title'] + '\n')
                cursor.execute(
                    'UPDATE entries SET cluster=%s, sim_count=%s, simhash=%s where id = %s',
                    (entrydic[item]['cluster'], entrydic[item]['sim_count'],
                     str(entrydic[item]['simhash']), item))
                # conn.commit()
                # fout.write(item + '\t' + str(entrydic[item]['cluster']) + '\t' +
                #            entrydic[item]['title'] + '\t' + entrydic[item]['link'] + '\n')
            cluster_num += 1
    # cursor.execute('UPDATE somevariables SET last_cluster=%s', (cluster_num,))
    # conn.commit()
    fout.close()
    conn.close()
def add_to_redis(CONN, hashes):
    logger.info(type(hashes))
    objs = []
    for k, v in hashes.items():
        # k is a stored fingerprint value, v its identifier; Simhash accepts an
        # integer and uses it directly as the fingerprint value.
        objs.append((v, Simhash(int(k))))
    logger.info(objs[0])
    logger.info('Number of objects: {}'.format(len(objs)))
    # Assumes a redis-backed SimhashIndex variant that takes the connection as
    # its first argument; the stock simhash package's SimhashIndex does not.
    index = SimhashIndex(CONN, objs, k=3)
    return index
def create_test_data():
    """For 1 million records, it takes 5 minutes."""
    complexity = 1000 ** 2
    print("create data ...")
    data = [rand_str(8) for i in range(complexity)]
    print("calculate simhash ...")
    objs = [(i, Simhash(item)) for i, item in enumerate(data)]
    print("create index ...")
    index = SimhashIndex(objs, k=3)
    safe_dump_pk(data, datafile)
    safe_dump_pk(index, indexfile)
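# A reload-and-query sketch, assuming safe_dump_pk above wrote an ordinary
# pickle file (indexfile comes from the surrounding module):
import pickle

def query_test_data(text):
    with open(indexfile, 'rb') as f:
        index = pickle.load(f)
    return index.get_near_dups(Simhash(text))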
def main():
    # user_query = input()
    DOCID = 0
    numPartial = 1
    index = SimhashIndex([])
    totaldocs = 0
    docnum = 0
    validDocFile = open('validDocs2', 'w')
    for root, dirs, files in os.walk(DEVPATH):
        for fname in files:
            if not fname.endswith(".json"):
                continue
            totaldocs += 1
            h2t = html2text.HTML2Text()
            # the with-block closes the file promptly to release memory
            with open(root + "/" + fname) as f:
                pageDict = json.loads(f.read())
            # get HTML-formatted content
            htmlContent = pageDict['content']
            print(pageDict['url'])
            plainContent = h2t.handle(htmlContent)
            feat = get_features(plainContent)
            sim = Simhash(feat)
            # skip documents that are near-duplicates of an indexed one
            if len(index.get_near_dups(sim)) > 0:
                continue
            print(docnum, totaldocs)
            index.add(str(docnum), sim)
            validDocFile.write(root + "/" + fname + "\n")
            docnum += 1
    validDocFile.close()
async def gen_simhash_index(conf):
    m = 0
    n = 0
    objs = []
    simhash_answer_index = {}
    for items in conf.DEMO_QUESTION:
        for item in items:
            objs.append((n, Simhash(await _tokenization(conf, item))))
            simhash_answer_index[n] = m
            n += 1
        m += 1
    simhash_index = SimhashIndex(objs, k=6)
    return simhash_index, simhash_answer_index
def get_near_dups(query_simhash, candidates_simhash, k):
    """Mark each candidate fingerprint that is a near-duplicate of the query."""
    res = [0] * len(candidates_simhash)
    query = Simhash(value=query_simhash)
    objs = [(str(i), Simhash(value=v)) for i, v in enumerate(candidates_simhash)]
    index = SimhashIndex(objs, k=k)
    for dup in index.get_near_dups(query):
        res[int(dup)] = 1
    return res
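# Example call for get_near_dups above; fingerprints are passed in as raw
# integer values (the texts are illustrative):
from simhash import Simhash

a = Simhash('how are you i am fine').value
b = Simhash('how are you i am fine thanks').value
c = Simhash('completely unrelated content').value
# Typically [1, 1, 0]: a matches itself, b usually falls within 10 bits, c does not.
print(get_near_dups(a, [a, b, c], k=10))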
def find_near_matches(session, collection, index_size, probability_index_near_match):
    from simhash import Simhash, SimhashIndex
    logging.getLogger().setLevel(logging.CRITICAL)
    tweet_id_simhash_value = session.execute(
        sa.select([model.Tweet.tweet_id, model.Tweet.features['filter', 'simhash']])
        .where(model.Tweet.collection == collection)
    )
    simhash_index = SimhashIndex([], k=7)
    insert_relation_stmt = pg.insert(model.relation)
    # insert_tweet_near_matches_stmt = insert_tweet_near_matches_stmt.on_conflict_do_update(
    #     index_elements=['tweet_id', 'collection'],
    #     set_={
    #         'earliest_near_match_id': insert_tweet_near_matches_stmt.excluded.earliest_near_match_id
    #     }
    # )
    indexed_tweet_ids = []
    for i, (tweet_id, simhash_value) in enumerate(tweet_id_simhash_value):
        if (i % 100000) == 1000:
            logger.info('Processed %s tweets. Committing.', i)
            session.commit()
        simhash = Simhash(simhash_value)
        near_matches_ids = simhash_index.get_near_dups(simhash)
        if not near_matches_ids:
            simhash_index.add(tweet_id, simhash)
            indexed_tweet_ids.append((tweet_id, simhash))
            # keep the index bounded: evict the oldest entry
            if len(indexed_tweet_ids) > index_size:
                simhash_index.delete(*indexed_tweet_ids.pop(0))
        if near_matches_ids:
            near_match_id = min(near_matches_ids)
            logger.debug('A near match %s for tweet %s', near_match_id, tweet_id)
            session.execute(
                insert_relation_stmt.values(
                    [(tweet_id, collection, 'near_match', near_match_id)]
                )
            )
    session.commit()
def simhash_clustering(
    signatures: List[int],
    hamming_distance: int = 3,
    # num_blocks: Optional[int] = 5,
) -> List[List[int]]:
    index = SimhashIndex(
        [(i, Simhash(value=signature)) for i, signature in enumerate(signatures)],
        k=hamming_distance,
    )
    neighbors: List[List[int]] = []
    for signature in signatures:
        neighbors.append(
            list(map(int, index.get_near_dups(Simhash(value=signature)))))
    return neighbors
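# A small driver for simhash_clustering (signatures are illustrative):
from simhash import Simhash

sigs = [Simhash('the quick brown fox').value,
        Simhash('the quick brown fox!').value,
        Simhash('lorem ipsum dolor sit amet').value]
# Typically [[0, 1], [0, 1], [2]] when the first two signatures are within 8 bits.
print(simhash_clustering(sigs, hamming_distance=8))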
def console_test():
    from simhash import Simhash, SimhashIndex
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }
    objs = [(str(k), Simhash(v)) for k, v in data.items()]
    index = SimhashIndex(objs, k=10)
    s1 = Simhash(
        u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
    )
    dups = index.get_near_dups(s1)
    # get_near_dups2 and remove are not part of the stock simhash package;
    # this snippet targets a modified/extended SimhashIndex.
    dups = index.get_near_dups2(s1, 5)
    index.remove(s1)
def process_graph(self, project_id):
    visits = defaultdict(list)
    p = 0
    hashtags_db = Hashtag.objects.filter(project_id=project_id)
    logger.info("Total hashtags to process " + str(len(hashtags_db)))
    for hashtag_entry in hashtags_db:
        visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
        p += 1
    logger.info("Hashtags read")
    logger.info("Hashtags processed " + str(p))
    logger.info("Visits count " + str(len(visits)))
    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    for user, hashtags in visits.items():
        if len(hashtags) > MIN_HASHTAG_PER_USER:
            simhash = Simhash(hashtags, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simhash index built for %i out of %i users" %
                (cant_processed, len(visits)))
    cant_processed = 0
    for user, hashtags in visits.items():
        near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = int(user_near_dups)
            if user_near_dups != int(user):
                hashtag_near_dups = visits[user_near_dups]
                intersect = set(hashtags).intersection(hashtag_near_dups)
                ratio = len(intersect) * 1.0 / len(hashtag_near_dups)
                if ratio >= 0.1:
                    hashtag_graph = HashtagGraph(user_oid_i=user,
                                                 user_oid_j=user_near_dups,
                                                 ratio=ratio)
                    hashtag_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)
def use_simhash_index():
    data = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    print(index.bucket_size())
    s1 = Simhash(
        get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(s1))
    index.add("4", s1)
    print(index.get_near_dups(s1))
def sim_merge(finaldb_cut, simdb):
    d = {}
    index_list = []
    hashurl2sim = {}
    max_distance = 10
    with open(finaldb_cut, 'r') as f:
        for line in f:
            if not line:
                break
            # Columns (1-based):
            # hashurl title author images links text pub_time
            #   1       2     3      4     5     6      7
            # jushi shouji zujin dizhi ditie url crawl_time source ext
            #   8     9      10    11    12   13     14       15    16
            array = line.rstrip('\r\n').split('\t')
            hashurl = array[0]   # string, key
            title = array[1]     # string
            text = array[5]      # string
            pub_time = array[6]  # string
            url = array[12]      # string
            # Python 3: str is already unicode, no .decode('utf-8') needed
            sim = Simhash(title + text)
            d[hashurl] = (title, url, pub_time)
            index_list.append((hashurl, sim))
            hashurl2sim[hashurl] = sim
    index = SimhashIndex(index_list, k=max_distance)
    merged = {}
    while d:
        hashurl, (title, url, pub_time) = d.popitem()
        merged[hashurl] = (title, url, pub_time)
        sim_list = index.get_near_dups(hashurl2sim[hashurl])
        buf_list = []
        for h in sim_list:
            if h != hashurl:
                if h in d:
                    title2, url2, pub_time2 = d.pop(h)
                    merged[h] = (title2, url2, pub_time2)
                else:
                    title2, url2, pub_time2 = merged[h]
            else:
                title2, url2, pub_time2 = title, url, pub_time
            buf_list.append((h, title2, url2, pub_time2))
        if len(buf_list) > 1:
            buf_list = sorted(buf_list, key=lambda i: i[3], reverse=True)
            simdb.insert('\t'.join([buf_list[0][0], json.dumps(buf_list[1:])]))
def process_graph(self, project_id):
    visits = defaultdict(list)
    processed = 0
    urls_db = Urls.objects.filter(project_id=project_id)
    logger.info("Total urls to process " + str(len(urls_db)))
    for url_entry in urls_db:
        visits[url_entry.user_id].append(url_entry.url)
        processed += 1
    logger.info("Urls read")
    logger.info("Urls processed " + str(processed))
    logger.info("Visits count " + str(len(visits)))
    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    for user, urls in visits.items():
        if len(urls) > MIN_URLS_PER_USER:
            simhash = Simhash(urls, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simhash index built for %i out of %i users" %
                (cant_processed, len(visits)))
    cant_processed = 0
    for user, urls in visits.items():
        near_dups = index.get_near_dups(Simhash(urls, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = int(user_near_dups)
            if user_near_dups != int(user):
                urls_near_dups = visits[user_near_dups]
                intersect = set(urls).intersection(urls_near_dups)
                ratio = len(intersect) * 1.0 / len(urls_near_dups)
                if ratio >= 0.1:
                    url_graph = UrlsGraph(user_oid_i=user,
                                          user_oid_j=user_near_dups,
                                          ratio=ratio)
                    url_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)
def simhashsort(datadic, entryset):
    objs = [(id, Simhash(sent)) for id, sent in datadic.items()]
    # k is the tolerance: the larger k is, the more similar texts are retrieved
    index = SimhashIndex(objs, k=tolerance)
    kind = 1  # cluster label
    assigned = set()  # ids that already belong to a cluster
    for id in datadic:
        if str(id) in assigned:  # don't classify an entry twice
            continue
        # collect the set of near-duplicates
        similiarlist = index.get_near_dups(Simhash(datadic[id]))
        similiarlist.append(str(id))
        # write the cluster label back into entryset
        for id in similiarlist:
            assigned.add(id)
        for entry in entryset:
            if str(entry["id"]) in similiarlist:
                entry["cluster"] = kind
        kind += 1
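# Sketch of driving simhashsort; `tolerance` is a module-level global in the
# original, set here just for the example:
tolerance = 6
datadic = {1: 'How are you? I am fine.',
           2: 'How are you, i am fine!',
           3: 'Totally different content.'}
entryset = [{'id': 1, 'cluster': 0}, {'id': 2, 'cluster': 0}, {'id': 3, 'cluster': 0}]
simhashsort(datadic, entryset)
print(entryset)  # entries 1 and 2 typically land in the same cluster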
def save_duplicates(save_path, text2hash_dict, k=5):
    """Group similar docs' titles.

    Despite its name, text2hash_dict is a list of (title, Simhash) pairs,
    indexed positionally below.
    """
    # Construct SimhashIndex object for similar-doc detection. k is tolerance.
    index = SimhashIndex(text2hash_dict, k=k)
    done = set()  # titles already written out (a set keeps membership checks O(1))
    with tqdm(total=len(text2hash_dict)) as pbar:
        with open(save_path, 'w', encoding='utf8') as file:
            for i in range(len(text2hash_dict) - 1):
                # get near duplicates
                near_dups = index.get_near_dups(text2hash_dict[i][1])
                # near_dups includes the origin title, so len > 1 is required
                if len(near_dups) > 1 and text2hash_dict[i][0] not in done:
                    for title in near_dups:
                        file.write(title)
                        file.write('\n')
                    file.write('#' * 5 + '\n')
                    done.update(near_dups)
                pbar.update()
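# Example input for save_duplicates; note that text2hash_dict is a list of
# (title, Simhash) pairs despite its name (titles here are illustrative):
from simhash import Simhash

titles = ['budget passes vote', 'budget passes the vote', 'storm hits coast']
save_duplicates('dups.txt', [(t, Simhash(t)) for t in titles], k=5)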
def __init__(self, hash_size=64, hash_tol=3, num_words_to_complete=10):
    """
    Params:
        hash_size : The number of output bits of the hash function used in
            SimHash. Higher values -> able to handle more noise.
        hash_tol : The number of bits that can differ for a candidate
            near-match in Simhash.
        num_words_to_complete : The number of words to complete given a
            context when a new document is encountered in get_best_match.
    """
    self.num_words_to_complete = num_words_to_complete
    self.hash_size = hash_size
    self.hash_tol = hash_tol
    # This implementation of simhash stores the index in RAM, but it could
    # easily be put on disk.
    self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
    self.author_identifier = LanguageModelAuthorIdentifier()
    self.author_semantic_models = SemanticLanguageModels()
def simhashSort2(datadic, entryset):
    objs = []
    for entry in datadic:
        objs.append((entry[0], Simhash(entry[1])))
    # k is the tolerance: the larger k is, the more similar texts are retrieved
    index = SimhashIndex(objs, k=tolerance)
    kind = 1  # cluster label
    assigned = set()  # ids that already belong to a cluster
    for item in datadic:
        if str(item[0]) in assigned:  # don't classify an entry twice
            continue
        # collect the set of near-duplicates
        similiarlist = index.get_near_dups(Simhash(item[1]))
        similiarlist.append(str(item[0]))  # include this entry's own id
        # write the cluster label back into entryset
        for id in similiarlist:
            assigned.add(id)
        for entry in entryset:
            if str(entry["id"]) in similiarlist:
                entry["sim_count"] = kind
        kind += 1
def __init__(self, config, worker=None):
    self.config = config
    self.host, self.port = config.cache_server
    # self.robots = list of banned paths
    self.robots = {}
    self.simhashes = SimhashIndex([])
    self.link = 1
    self.worker = worker
    # maxWords[0] is the URL, maxWords[1] is the number of words in it
    self.maxWords = ("", 0)
    # a dictionary that keeps track of the # of words
    self.wordCounter = Counter()
    self.stopWords = [
        '1', 'a', 'about', 'above', 'after', 'again', 'against', 'all', 'also',
        'am', 'an', 'and', 'any', 'are', 'are', "aren't", 'as', 'at', 'b', 'be',
        'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but',
        'by', 'can', 'can', "can't", 'cannot', 'could', "couldn't", 'd', 'did',
        "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
        'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', 'has',
        "hasn't", "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll",
        "he's", 'her', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i',
        "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
        "it's", 'its', 'itself', "let's", "ll", 'm', 'may', 'me', 'more', 'most',
        "mustn't", 'my', 'myself', 'next', 'no', 'nor', 'not', 'of', 'off', 'on',
        'once', 'once', 'one', 'only', 'or', 'other', 'ought', 'our', 'ours',
        'ourselves', 'out', 'over', 'own', 's', 'same', 'say', 'says', "shan't",
        'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some',
        'such', 't', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them',
        'themselves', 'then', 'there', "there's", 'these', 'they', "they'd",
        "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too',
        'under', 'under', 'until', 'until', 'up', 've', 'very', 'was', "wasn't",
        'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what',
        "what's", 'when', "when's", 'where', 'which', 'while', 'who', "who's",
        'whom', 'why', "why's", 'will', 'with', "won't", 'would', "wouldn't",
        'x', 'y', 'you', "you'd", "you'll", "you're", "you've", 'your',
        'yourself', 'yourselves'
    ]
def simhash_1(labels, targets, query, query_url, dataset, k=2, width=5):
    dictionary = dict(zip(labels, targets))
    objs = [(str(label), Simhash(get_features(target, width)))
            for label, target in dictionary.items()]
    index = SimhashIndex(objs, k=k)
    query_simhash = Simhash(get_features(query, width))
    near_dups = index.get_near_dups(query_simhash)
    # Save fingerprints for future use
    appendToFingerprints(
        dataset, './dataset/fingerprints.csv', {
            "query": str(query_simhash.value),
            "duplicates": ' '.join([str(obj[1].value) for obj in objs])
        })
    # print("QUERY: {}".format(query_url))
    # pp(near_dups)
    return {
        "dataset": dataset,
        "query": query_url,
        "duplicates": ' '.join(near_dups)
    }
def test(n):
    import time
    import distance
    from simhash import Simhash, SimhashIndex
    WIDTH = 3

    def gg():
        import random
        from random import randint
        from simhash import Simhash, SimhashIndex
        from itertools import groupby
        # text = str(bin(randint(2**63, 2**64-1)))[2:]
        # tokens = [text[i:i + WIDTH] for i in range(max(len(text) - WIDTH + 1, 1))]
        # return text, Simhash({k: sum(1 for _ in g) for k, g in groupby(sorted(tokens))})
        text = ''.join([random.choice('0123456789abcdef') for _ in range(36)])
        return text, Simhash(text)

    hashes = [gg() for _ in range(n)]
    d1, d2 = [], []
    test_string, test_hash = gg()
    start = time.time()
    for s, h in hashes:
        d1.append([distance.hamming(test_string, s), s])
    print(time.time() - start)
    start = time.time()
    index = SimhashIndex(hashes, k=5)
    for st in index.get_near_dups(test_hash):
        d2.append([distance.hamming(test_string, st), st])
    print(time.time() - start)
    print(len(d1), len(d2))
    for a, b in zip(sorted(d1)[:20], sorted(d2)):
        print(a[1] == b[1], '\t', a, '\t', b)
def main(path):
    corpuses = readFiles.normalize(path)
    results = []
    for corpus in corpuses:
        hashset = {}
        listofitems = []
        for item in corpus.keys():
            if item == 'desc':
                continue
            z = Simhash(corpus[item])
            hashset[item] = z
            listofitems += [(item, z)]
        l = SimhashIndex(listofitems)
        # print(l.get_near_dups(hashset['../corpus/bbc/tech1/001.txt']))
        hashlist = {}
        for i, item1 in enumerate(hashset.keys()):
            hashlist[item1] = []
            for j, item2 in enumerate(hashset.keys()):
                if j < i:
                    hashlist[item1] += [' ']
                    continue
                hashlist[item1] += [hashset[item1].distance(hashset[item2])]
                # print(item1, item2, hashset[item1].distance(hashset[item2]))
        results += [[hashset, hashlist, corpus['desc']]]
    # Python 3: open csv files in text mode with newline='' instead of 'wb'
    with open('results.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='{')
        for hashset, hashlist, desc in results:
            writer.writerow([" "])
            writer.writerow([i for i in desc.split()])
            record = []
            record += [['Table'] + [key for key in hashset.keys()]]
            for k in hashset.keys():
                record += [[k] + hashlist[k]]
            for item in record:
                writer.writerow(item)
def setUp(self):
    objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
    self.index = SimhashIndex(objs, k=10)
def init_index(url, initial_data):
    global index
    data[url] = initial_data
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
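# Usage sketch for init_index. get_features is the shingling tokenizer from the
# simhash README, assumed here since the snippet doesn't show it; `data` is the
# module-level dict the function updates:
import re
from simhash import Simhash, SimhashIndex

def get_features(s, width=3):
    s = re.sub(r'[^\w]+', '', s.lower())
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]

data = {}
init_index('http://example.com/a', 'How are you? I am fine. Thanks.')
print(index.get_near_dups(Simhash(get_features('How are you i am fine. Thanks.'))))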
    f_stop.close()
    f_stop_seg_list = f_stop_text.split('\n')
    for myword in liststr.split('/'):
        if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
            mywordlist.append(myword)
    return ''.join(mywordlist)


# data.head()['content'].apply(lambda x: jiebaclearText(str(x)))
data['content'] = data['content'].apply(lambda x: jiebaclearText(str(x)))
data['simhash'] = data['content'].apply(lambda x: Simhash(x).value)
train = data.loc[data['source'] == 'train']
test = data.loc[data['source'] == 'test']
train.drop('source', axis=1, inplace=True)
test.drop(['source'], axis=1, inplace=True)
objs = [(row["id"], Simhash(row["content"])) for _, row in train.iterrows()]
index = SimhashIndex(objs, k=12)
test['result'] = test['content'].apply(
    lambda x: index.get_near_dups(Simhash(x)))
sub['result'] = test['result']
sub.to_csv('../output/simhash.csv', index=False)
import ast
import time
import mysql.connector
from sumy.utils import get_stop_words
import nltk
from config import *

# Note: set password to your root password.
conn = mysql.connector.connect(user='******',
                               password=sql_password,
                               database='test')
conn.autocommit = True
last_cluster_num = 0
objs = []
index = SimhashIndex(objs, k=tolerance)


def restore_simhash():
    global last_cluster_num
    cursor = conn.cursor()
    cursor.execute('select id, simhash from entries where simhash > 0')
    entries = cursor.fetchall()
    for entry in entries:
        index.add(str(entry[0]), Simhash(int(entry[1])))
    cursor.execute('select max(cluster) from entries')
    last_cluster_num = cursor.fetchone()[0] + 1  # no need to add 1 again


def is_en(s):
    ...  # body truncated in the original snippet
for file in os.listdir(news_dir):
    news_file = os.path.join(news_dir, file)
    with open(news_file) as f:
        for line in f:
            news = json.loads(line)
            title_features, content_features = get_news_feature(news)
            print(title_features)
            print(content_features)
            title_data.append((str(news_id), Simhash(title_features)))
            content_data.append((str(news_id), Simhash(content_features)))
            news_id += 1
            if news_id % 1000 == 0:
                logging.info('{} has finished'.format(news_id))

title_index = SimhashIndex(title_data)
content_index = SimhashIndex(content_data)

# saving
with open('title_index.pkl', 'wb') as f1:
    pickle.dump(title_index, f1)
with open('content_index.pkl', 'wb') as f2:
    pickle.dump(content_index, f2)

# loading
# with open('title_index.pkl', 'rb') as f1:
#     title_index = pickle.load(f1)
# with open('content_index.pkl', 'rb') as f2:
#     content_index = pickle.load(f2)
#
# print(title_index.bucket_size)
# print(content_index.bucket_size)