def test_segtree(self):
    sh = Simhash('How are you? I am fine. Thanks. And you?')
    self.assertEqual(sh.value, 6460565663990245323)

    sh2 = Simhash('How old are you ? :-) I am fine. Thanks. And you?')
    self.assertEqual(sh.distance(sh2), 8)

    sh3 = Simhash(sh2)
    self.assertEqual(sh2.distance(sh3), 0)
def test_distance(self):
    sh = Simhash('How are you? I AM fine. Thanks. And you?')
    sh2 = Simhash('How old are you ? :-) i am fine. Thanks. And you?')
    self.assertTrue(sh.distance(sh2) > 0)

    sh3 = Simhash(sh2)
    self.assertEqual(sh2.distance(sh3), 0)

    self.assertNotEqual(Simhash('1').distance(Simhash('2')), 0)
def test_chinese(self):
    self.maxDiff = None

    sh1 = Simhash(u'你好 世界! 呼噜。')
    sh2 = Simhash(u'你好,世界 呼噜')
    sh4 = Simhash(u'How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.')
    sh5 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than')
    sh6 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')

    self.assertEqual(sh1.distance(sh2), 0)
    self.assertTrue(sh4.distance(sh6) < 3)
    self.assertTrue(sh5.distance(sh6) < 3)
def gensim_simhash(content, test_news):
    # Load the accumulated stopwords
    stopwords = load_stopwords()

    # Tokenize the corpus and remove stopwords
    x = [[word for word in line.split() if word not in stopwords] for line in content]

    # Tokenize the query document and remove stopwords
    test_news = [word for word in test_news.split() if word not in stopwords]

    # Compute the simhash of the query document
    test_news_hash = Simhash(test_news)

    # Walk the corpus and record the Hamming distance to each document
    sim = []
    for news in x:
        news_hash = Simhash(news)
        score = test_news_hash.distance(news_hash)
        sim.append(score)

    # Print the six closest documents (smallest Hamming distance first)
    for index, score in sorted(enumerate(sim), key=lambda item: item[1])[:6]:
        print("index:%d similarities:%f content:%s" % (index, score, content[index]))
def simhash_similarity(text1, text2):
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    # Normalize by the fingerprint width (f bits, 64 by default);
    # len(bin(value)) is unreliable because bin() drops leading zeros
    # and prepends '0b'.
    max_hashbit = aa_simhash.f
    # Hamming distance
    distance = aa_simhash.distance(bb_simhash)
    similar = 1 - distance / max_hashbit
    return similar
def test2():
    # po = Pool(10)
    for dirpath, dirnames, filenames in os.walk(driver_path):
        for filename in filenames:
            index = filenames.index(filename)
            print('index', index)
            file_path1 = dirpath + '/' + filename
            cont = news_process(file_path1)
            simhash1 = Simhash(cont)
            print(file_path1)
            key1 = num10_to2_sys(simhash1.value)
            print(key1)
            # Compare against every file seen so far
            for i in filenames[:index]:
                file_path2 = dirpath + '/' + i
                cont2 = news_process(file_path2)
                simhash2 = Simhash(cont2)
                key2 = simhash1.distance(simhash2)
                print('Hamming distance', key2)
                print(file_path1)
                print(simhash1.value)
                print(file_path2)
                print(simhash2.value)
def get_simlar_text(self, text1, text2):
    '''
    1. Text similarity comparison
    2. Uses simhash analysis
    :param text1:
    :param text2:
    :return:
    '''
    new_simhash = SimHash()
    hash_first = new_simhash.getHash(text1)    # compute the hash value
    hash_second = new_simhash.getHash(text2)
    text_first_hash = Simhash(hash_first)
    text_second_hash = Simhash(hash_second)
    distance = text_first_hash.distance(text_second_hash)
    # Normalize by the fingerprint width (64 bits by default)
    max_hashbit = text_first_hash.f
    if max_hashbit == 0:
        return 0
    return 1 - distance / max_hashbit
def simhash_remove_similar(news_list):
    result_list = []
    # Compare simhash values pairwise
    len_news_list = len(news_list)
    for i in range(len_news_list):
        news_i_id = news_list[i]['news_id']
        news_i_news_content = accord_news_id_get_content_list(news_i_id)['news_content']
        sim_hash1 = Simhash(news_i_news_content)
        for j in range(i + 1, len_news_list):
            # Skip items that are already marked for deletion
            if 'del' in news_list[j]:
                continue
            news_j_id = news_list[j]['news_id']
            news_j_news_content = accord_news_id_get_content_list(news_j_id)['news_content']
            sim_hash2 = Simhash(news_j_news_content)
            # If the Hamming distance of the two articles is within the
            # threshold, keep only the one that comes first
            if sim_hash1.distance(sim_hash2) <= SIMHASH_DISTINCT:
                # Mark this article as discarded
                news_list[j]['del'] = 'yes'
    for news in news_list:
        if 'del' not in news:
            result_list.append(news)
    return result_list
def get_sim_simhash(self, text1, text2, f_num=64):
    a_simhash = Simhash(text1, f=f_num)
    b_simhash = Simhash(text2, f=f_num)
    # Both fingerprints are exactly f_num bits wide, so normalize by f_num
    distance = a_simhash.distance(b_simhash)
    return 1 - distance / f_num
def isPageTooSimilar(pageTextString, pageHashes):
    pageHash = Simhash(pageTextString)
    skipPage = False
    for hashedPage in pageHashes:
        if pageHash.distance(hashedPage) < 3:
            skipPage = True
            break
    else:
        # No near-duplicate found: remember this page's fingerprint
        pageHashes.add(pageHash)
    return skipPage
def simhash_distance(self, text1, text2):
    text1_hash = Simhash(text1)
    text2_hash = Simhash(text2)
    # Hamming distance, normalized by the fingerprint width (64 bits by default)
    distance = text1_hash.distance(text2_hash)
    similar = 1 - distance / text1_hash.f
    return similar
def compare_data_simhash(self, data1, data2):
    """
    Approximate comparison of two texts using simhash
    :param data1:
    :param data2:
    :return: True if the texts are near-duplicates, else False
    """
    data1_sim = Simhash(data1)
    data2_sim = Simhash(data2)
    # Hamming distance
    dis = data1_sim.distance(data2_sim)
    return dis < 2
def simhash_similarity(text1, text2):
    a_simhash = Simhash(text1)
    b_simhash = Simhash(text2)
    print(a_simhash.value)
    print(b_simhash.value)
    max_hashbit = a_simhash.f    # fingerprint width in bits (64 by default)
    print(max_hashbit)
    # Hamming distance
    distance = a_simhash.distance(b_simhash)
    print(distance)
    # Similarity, not raw distance: subtract the distance ratio from 1
    similar = 1 - distance / max_hashbit
    return similar
def sim_hash_similarity(text1, text2):
    """
    :param text1: first text
    :param text2: second text
    :return: the similarity of the two texts
    """
    aa_sim_hash = Simhash(text1)
    bb_sim_hash = Simhash(text2)
    # Hamming distance, normalized by the fingerprint width (64 bits by default)
    distance = aa_sim_hash.distance(bb_sim_hash)
    similar = 1 - distance / aa_sim_hash.f
    return similar
def simhash_similarity(text1, text2):
    """
    :param text1: first text
    :param text2: second text
    :return: the similarity of the two texts
    """
    text1 = filter_html(text1)
    text2 = filter_html(text2)
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    # Hamming distance
    distance = aa_simhash.distance(bb_simhash)
    # Similarity, normalized by the fingerprint width (64 bits by default)
    similar = 1 - distance / aa_simhash.f
    return similar
def simhash(sentence1: str, sentence2: str) -> float:
    """
    Compute the similarity of two texts
    :param sentence1:
    :param sentence2:
    :return: similarity in [0, 1]
    """
    if sentence1 and sentence2:
        sentence1 = str(sentence1)
        sentence2 = str(sentence2)
        a_simhash = Simhash(sentence1)
        b_simhash = Simhash(sentence2)
        # Hamming distance, normalized by the fingerprint width
        distance = a_simhash.distance(b_simhash)
        similar = 1 - distance / a_simhash.f
        return similar
    return 0.0
def is_similar_page(res1, res2, radio=3):
    # Judge how similar two pages are using simhash
    if res1 is None or res2 is None:
        return False
    body1 = res1.body
    body2 = res2.body
    # This step is expensive; most of the time goes into split()
    simhash1 = Simhash(body1.split())
    simhash2 = Simhash(body2.split())
    calc_radio = simhash1.distance(simhash2)
    return calc_radio <= radio
def is_content(self, text, word_count):
    if text and word_count >= 150:
        current_sim = Simhash(text)
        # First link: nothing in the simhash set yet
        if len(self.simhashes) == 0:
            self.simhashes.add(current_sim)
            return True
        for x in self.simhashes:
            if current_sim.distance(x) <= 3:
                print("duplicate detected")
                return False
        self.simhashes.add(current_sim)
        return True
    else:
        print("low text count")
        return False
def similarHash(text1, text2):
    '''Compute the similarity of two articles'''
    simhash = GetHash()
    hash1 = simhash.get_str_hash(text1)    # compute the hash
    hash2 = simhash.get_str_hash(text2)
    t1_simhash = Simhash(hash1)
    t2_simhash = Simhash(hash2)
    distance = t1_simhash.distance(t2_simhash)
    # Normalize by the fingerprint width (64 bits by default)
    max_hashbit = t1_simhash.f
    if max_hashbit == 0:
        return 0
    return 1 - distance / max_hashbit
def simhash_similarity(text1, text2):
    """
    :param text1: first text
    :param text2: second text
    :return: the similarity of the two texts
    """
    begin = time.time()
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    # Hamming distance, normalized by the fingerprint width (64 bits by default)
    distance = aa_simhash.distance(bb_simhash)
    similar = 1 - distance / aa_simhash.f
    print("pairwise comparison took: %f" % (time.time() - begin))
    return similar
def simhash_similarity(text1, text2):
    """
    :param text1: first text
    :param text2: second text
    :return: the similarity of the two texts
    """
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    print(1, bin(aa_simhash.value))
    print(2, bin(bb_simhash.value))
    max_hashbit = aa_simhash.f    # fingerprint width in bits (64 by default)
    print(max_hashbit)
    # Hamming distance
    distance = aa_simhash.distance(bb_simhash)
    print(distance)
    similar = 1 - distance / max_hashbit
    return similar
def shrinkdup(self):
    pkv = Simhash('')
    shdist = 10
    for content in self.sortedlog:
        if not content or content.strip() == '':
            continue
        try:
            ret = self._extract_content(content)
            hv = Simhash(ret)
            # Keep a line only if it is far enough from the previously kept line
            if pkv.distance(hv) > shdist:
                self.shrinklogs.append(content)
                pkv = hv
        except Exception as e:
            print(content)
            print(e)
    if len(self.shrinklogs) > 0:
        filen = self.filen + '-shrunk'
        with open(filen, 'w') as f:
            for v in self.shrinklogs:
                f.write('%s' % (v))
def findNeighborVectors(simHashIndex, matrix, vector, topk=10):
    # 32-bit fingerprint of the query vector
    query_hash = Simhash(get_features(vector), f=32)
    # zfill keeps leading zeros so the band boundaries stay aligned
    hashstr = bin(query_hash.value)[2:].zfill(32)
    nearVectorArray = []
    for num in range(4):    # four 8-bit bands of the 32-bit fingerprint
        key = hashstr[num * 8:(num + 1) * 8]
        nearVectorArray.extend(simHashIndex[num].get(int(key, 2), []))
    # Rank the candidates by Hamming distance to the query fingerprint
    distances = {}
    for index in nearVectorArray:
        index = int(index)
        vec = matrix[index]
        vec_hash = Simhash(get_features(vec), f=32)
        distances[index] = query_hash.distance(vec_hash)
    ranked = sorted(distances.items(), key=lambda item: item[1])
    return [index for index, _ in ranked[:topk]]
class FuncHash:
    def __init__(self, data):
        super().__init__()
        self.path = data.get("path")
        self.start = data.get("start")
        self.stop = data.get("stop")
        self.source = data.get("source")
        self.startLoc = data.get("startLoc")
        self.stopLoc = data.get("stopLoc")
        self.blame = data.get("blame")
        self.lineCount = len(self.source.split('\n'))

    def hashSource(self):
        self.simHash = Simhash(self.source)

    def distance(self, to):
        return self.simHash.distance(to.simHash)

    def _asdict(self):
        return dict(path=self.path, start=self.start, stop=self.stop, source=self.source)
def simHashLabel(user1filepath, user2filepath, user1Floder, user2Floder, num_floder):
    ans = 0.0
    for i in range(num_floder):
        labeluser1 = ''
        labeluser2 = ''
        tempmax1 = 0
        tempmax2 = 0
        # Collect the label column (field 4) of each user's stop points
        with open(user1filepath + user1Floder[i] + os.sep + 'RCed_stoppoint.txt') as f1:
            for line in f1:
                labeluser1 += line.split(',')[4]
                labeluser1 += ','
                tempmax1 += 1
        with open(user2filepath + user2Floder[i] + os.sep + 'RCed_stoppoint.txt') as f2:
            for line in f2:
                labeluser2 += line.split(',')[4]
                labeluser2 += ','
                tempmax2 += 1
        sh1 = Simhash(u'%s' % labeluser1)
        sh2 = Simhash(u'%s' % labeluser2)
        # Normalize the Hamming distance by the longer label sequence
        maxlen = tempmax1 if tempmax1 >= tempmax2 else tempmax2
        ans += sh1.distance(sh2) / maxlen
    return ans
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Joshua
# @E-Mail: [email protected]
# @Date: 2015-02-11 12:32:00
# @About demo.py
#
import re
from simhash import Simhash
from simhash import SimhashIndex
from data import news_lists_1, news_lists_2

# For Chinese, simhash yields the same result whether or not the text
# has been segmented into words.
for i, news in enumerate(news_lists_1):
    x = Simhash(news_lists_1[i]['content'], f=64)
    y = Simhash(news_lists_2[i]['content'], f=64)
    print('1.simhash:', x.value)
    print('2.simhash:', y.value)
    print('distance:', x.distance(y))
    print('similarity:', (64 - x.distance(y)) / 64)
    print(news_lists_1[i]['title'])
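A quick way to check the claim in the comment above, assuming the default tokenizer of the simhash package (it lowercases and strips whitespace and punctuation before building character shingles), so segmented and unsegmented Chinese hash identically; the two sample sentences are illustrative only:

from simhash import Simhash

# The same sentence, with and without word segmentation.
segmented = Simhash(u'今天 天气 很 好')
unsegmented = Simhash(u'今天天气很好')

# The default tokenizer removes the whitespace before feature
# extraction, so the two fingerprints coincide.
print(segmented.distance(unsegmented))  # expected: 0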
# coding: utf-8
__author__ = 'lym'

import re
from simhash import Simhash

print(Simhash('aa').distance(Simhash('bb')))
print(Simhash('aa').distance(Simhash('aa')))

sh1 = Simhash(u'你好 世界 呼噜')
sh2 = Simhash(u'你好 世界')
print(sh1.distance(sh2))

print(','.join('fff'))

test = '的阿斯顿多少多少'
print("this is test1: %s" % test)
def sim_hash(self, q, doc):
    '''Smaller distance means more similar'''
    q = self.fomrat_str(q)
    doc = self.fomrat_str(doc)
    s1, s2 = Simhash(q), Simhash(doc)
    return s1.distance(s2)
# coding: utf8
from simhash import Simhash

# 128-bit fingerprints; the two strings differ in a single character
s1 = Simhash('abcdefghijklmnopqrstuvwxyz', f=128)
s2 = Simhash('abcdefghijklmnopqrstuvwayz', f=128)
print(s1.value)
print(s2.value)
print(s1.distance(s2))
def crawl(self, in_dir, num=10):
    # url: string
    # title: string
    # content: string
    # outlinks: list
    i = 0
    while len(self.fetchedurls) <= num:
        # init new Webpage class
        webpage = Webpage()

        # pop url (default: last element)
        if len(self.frontier) > 0:
            webpage.url = self.frontier.pop()
        else:
            print("empty frontier")
            break

        # fetch url and parse
        try:
            html = urlopen(webpage.url)
        except HTTPError:
            continue
        bsObj = BeautifulSoup(html, "html.parser")  # get a bs object

        # check the category
        div = bsObj.find(name='div', id='mw-normal-catlinks')
        tmp_category = ""
        try:
            contents = div.find_all(name='a')
        except AttributeError:
            continue
        for content in contents:
            tmp_category += content.get_text().lower()
        if not any(category in tmp_category for category in self.category_list):
            continue

        # fetch title
        webpage.title = str(bsObj.title).replace(" - Wikipedia</title>", '').replace("<title>", "")

        # fetch content
        tmp_content = ""
        div = bsObj.find(name='div', id='mw-content-text')
        for p in div.find_all(name='p'):
            tmp_content += p.get_text()
        webpage.content = tmp_content
        if not webpage.title or not webpage.content or len(webpage.content) < 100:
            continue

        # check the content: skip the page if it is a near-duplicate
        tmp_simhash = Simhash(webpage.content)
        if any(tmp_simhash.distance(h) <= 5 for h in self.hash):
            continue

        # satisfied url
        self.hash.append(tmp_simhash)
        self.fetchedurls.append(webpage.url)

        # fetch outlinks
        tmp_outlinks = []
        newurls = div.find_all('a', href=re.compile("^(/wiki/)((?!;)\S)*$"))
        for newurl in newurls:
            # obey the robots.txt
            if newurl.attrs['href'].replace("\n", '') in self.rules:
                continue
            myurl = "https://en.wikipedia.org" + newurl.attrs['href']
            # dup URL elim
            if myurl not in self.fetchedurls and myurl not in self.frontier:
                self.frontier.append(myurl)
                tmp_outlinks.append(myurl)
        webpage.outlinks = tmp_outlinks

        # write to file
        with open(in_dir + '/' + str(i), 'wb') as fwrite:
            pickle.dump(webpage, fwrite)
        i += 1
        if i % 100 == 0:
            print(i)

    with open("fetchedurls", "wb") as fopen:
        pickle.dump(self.fetchedurls, fopen)
def extract_next_links(url, resp):
    # Process raw_response.content only if the response exists, the status
    # is within 200 to 599, and it is not 403 or 404
    if resp is None or resp.raw_response is None:
        return list()
    if resp.status < 200 or resp.status > 599 or resp.status in (403, 404):
        return list()
    try:
        # Get the HTML content and make it into a tree with lxml
        parser = lxml.etree.HTMLParser(encoding='UTF-8')
        tree = lxml.etree.parse(
            io.StringIO(resp.raw_response.content.decode(encoding='UTF-8')),
            parser)

        # String of all the text on the page
        pageTextString = ""

        # Check these tags for text
        wantedTags = {
            "p", "span", "blockquote", "code", "br", "a", "ol", "ins",
            "sub", "sup", "h1", "h2", "h3", "h4", "h5", "h6", "li", "ul",
            "title", "b", "strong", "em", "i", "small", "del", "mark", "pre"
        }

        parsed = urlparse(url)
        listofLinks = []
        for elem in tree.iter():
            if elem.tag in wantedTags:
                if elem.text:
                    pageTextString += elem.text + " "
                if elem.tag == "a" and "href" in elem.attrib:
                    link = elem.attrib["href"]
                    if len(link) == 0:
                        continue
                    if link == r"/" or link == parsed.netloc:
                        continue
                    elif link[0] == r"/":
                        link = parsed.netloc + link
                    elif link[0:2] == r"//":
                        link = "https:" + link
                    link = link.split('#')[0]
                    if "replytocom=" in link or "share=" in link:
                        link = link.split('?')[0]
                    listofLinks.append(link)

        # If the distance between this page's hash and any other page is at
        # most 3, return an empty list because this page is too similar to
        # another page to be useful
        pageHash = Simhash(pageTextString)
        for hashedPage in hashes:
            if pageHash.distance(hashedPage) <= 3:
                return list()
        hashes.add(pageHash)

        # Tokenize the page and put the resulting list in pageListofWords
        pageListofWords = []
        currWord = ""
        for char in pageTextString:
            charOrd = ord(char)
            if 65 <= charOrd <= 90:          # uppercase A-Z
                currWord += char.lower()
            elif (48 <= charOrd <= 57) or (97 <= charOrd <= 122):  # 0-9, a-z
                currWord += char
            else:
                if currWord != "":
                    if currWord not in stopWords and len(currWord) > 1:
                        pageListofWords.append(currWord)
                    currWord = ""

        # If the number of words is less than 150, return an empty list
        # because this page is not useful enough
        pageWordCount = len(pageListofWords)
        if pageWordCount < 150:
            return list()

        # If this page has more words than the current longest page,
        # set this page as the new longest page
        global longestPageWordCount
        global longestPageURL
        if pageWordCount > longestPageWordCount:
            longestPageWordCount = pageWordCount
            longestPageURL = url
            print("New longest page: " + url + " " + str(longestPageWordCount))

        # Increase word counters by their occurrences on this page
        for word in pageListofWords:
            if word not in stopWords:
                totalWordDict[word] = totalWordDict.get(word, 0) + 1

        return listofLinks

    # Prints an exception if the page has non-UTF-8 characters
    except Exception as e:
        print(e)

    # There was no response, or no content, or a bad resp_status
    return list()
def test_chinese(self):
    sh1 = Simhash(u'你好 世界! 呼噜。')
    sh2 = Simhash(u'你好,世界 呼噜')
    self.assertEqual(sh1.distance(sh2), 0)
s2 = """We know that there are many ways to deduplicate text. Comparing texts pairwise in full is slow and laborious, so people thought of letting something stand in for each article, such as an abstract; but to a computer an abstract only shortens the length, so keywords were proposed for the comparison instead. That indeed greatly reduces the cost of comparison. How do we obtain an article's keywords? Usually via term frequency (TF), but with TF alone, Chinese function words such as "的" and "我们" appear very often, and removing them by hand is tedious, so we combine TF with inverse document frequency (IDF). This is the well-known TF-IDF, an algorithm for extracting the keywords of an article. Term frequency is easy to understand: the number of times a word occurs in the article divided by the total number of words. How is IDF computed? If a word occurs with very high frequency across all our articles (for example "的" occurs many times in many texts), we consider it unrepresentative and reduce its influence by giving it a smaller weight. That weight is computed as a ratio whose numerator is the total number of articles and whose denominator is the number of articles in the collection (|D|) in which the word appears; one is usually added to the denominator to avoid division by zero, and the logarithm of the ratio is the IDF.

simhash is a locality-sensitive hash. We all know what a hash is; what does locality-sensitive mean? If A and B are somewhat similar and still keep that similarity after hashing, the hash is called locality-sensitive.

Above we obtained a document's keywords, but comparing keyword sets directly would again be inefficient, so we hash the keyword set into a binary string and compare the binary values; their similarity gives the similarity of the two documents. For the comparison we use the Hamming distance: the number of bit positions at which the two binary strings differ. Here each article is simhashed into a 64-bit binary string, and a Hamming distance of 3 is usually taken as the threshold: if at most three of the 64 bits differ, the two documents are considered similar. The threshold can of course be tuned to your needs.

In this way a document is represented by one binary number; the algorithm that hashes a document to such a binary string is called simhash.

The concrete simhash steps are:
(1) Segment the document into words and take the 20 words (features) with the highest TF-IDF weights together with their weights, giving a set of 20 (feature: weight) pairs per document.
(2) Apply an ordinary hash to each word (feature) to obtain a 64-bit binary value, giving a set of 20 (hash: weight) pairs.
(3) For each bit position of the hash from step (2), take +weight where the bit is 1 and -weight where it is 0. For example, a word that becomes (010111: 5) after step (2) yields the list [-5, 5, -5, 5, 5, 5]; so for one document we obtain 20 lists of the form [weight, -weight, ..., weight], each of length 64.
"""
import lshash

a1 = Simhash(s1)
a2 = Simhash(s2)
ret = a1.distance(a2)
print(ret)
exit(0)

usage = """
ftclass.py -r <training set file> -t <test set file> -m <model file>

-r file   training data set
-t file   test data set
-m file   model file
-l label  separator between keywords and the category, "__label__"

ftclass.py -r train.txt -t test.txt -m model.bin -c a,b,c,d,e -l __|||__
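The steps in the passage above translate almost line for line into code. Below is a minimal sketch of that pipeline, assuming raw term frequency as a stand-in for the TF-IDF weights and md5 as the "ordinary hash" of step (2); fingerprint, hamming, and the two sample strings are illustrative names, not taken from any of the snippets here.

import hashlib
import re
from collections import Counter

def fingerprint(text, f=64, top_k=20):
    # Step (1): segment and keep the top_k (feature: weight) pairs;
    # raw term frequency stands in for the TF-IDF weights.
    words = re.findall(r'\w+', text.lower())
    features = Counter(words).most_common(top_k)
    # Steps (2) and (3): hash each feature to an f-bit value, then add
    # +weight for a 1 bit and -weight for a 0 bit at every position.
    v = [0] * f
    for word, weight in features:
        h = int(hashlib.md5(word.encode('utf-8')).hexdigest(), 16) % (1 << f)
        for i in range(f):
            v[i] += weight if (h >> i) & 1 else -weight
    # Final step: the sign of each column sum gives one fingerprint bit.
    return sum(1 << i for i in range(f) if v[i] > 0)

def hamming(a, b):
    # Number of differing bits; at most 3 out of 64 is the usual threshold.
    return bin(a ^ b).count('1')

doc1 = 'the quick brown fox jumps over the lazy dog'
doc2 = 'the quick brown fox jumped over the lazy dog'
print(hamming(fingerprint(doc1), fingerprint(doc2)))  # small distance

Because only one of the twenty weighted features changes between the two sample documents, most column sums keep their sign, which is exactly the locality-sensitive behavior the passage describes.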