class FeatureContainer:
    def __init__(self, word_dict_path):
        # Load the word list from file; each line is "<id> <word>"
        self.word_dict_path = word_dict_path
        self.word_list = []
        with open(word_dict_path, 'r') as ins:
            for line in ins.readlines():
                self.word_list.append(line.split()[1])
        # Map each word (as unicode) to its index
        self.word_dict = {}
        for idx, ascword in enumerate(self.word_list):
            self.word_dict[ascword.decode('utf8')] = idx
        self.fb = FeatureBuilder(self.word_dict)
        self.smb = SimhashBuilder(self.word_list)
        print 'Loaded', len(self.word_list), 'words'

    def compute_feature(self, token_list):
        # Collect tokens not yet in the vocabulary
        new_words = []
        for token in token_list:
            if token not in self.word_dict:
                new_words.append(token)
        if len(new_words) != 0:
            # Update word_list and word_dict
            self.fb.update_words(new_words)
            self.smb.update_words([word.encode('utf8') for word in new_words])
            self.word_dict = self.fb.word_dict
            self.word_list.extend([word.encode('utf8') for word in new_words])
        feature_vec = self.fb.compute(token_list)
        return feature_vec, self.smb.sim_hash(feature_vec)
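A minimal usage sketch for the class above, assuming a word-dict file whose lines look like "<id> <word>" (the loader reads the second whitespace-separated field) and that FeatureBuilder and SimhashBuilder come from this project. The file name and token list are placeholders.

fc = FeatureContainer('word.dict')             # hypothetical path
tokens = [u'some', u'tokenized', u'document']  # output of a tokenizer
feature_vec, fingerprint = fc.compute_feature(tokens)
print 'simhash fingerprint:', fingerprint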
class FeatureContainer:
    def __init__(self, word_dict, keyword_dict=None):
        # Build word_list and word_dict from an existing {word: value}
        # mapping, ordered by descending value
        self.word_list = []
        self.word_dict = {}
        pairs = [(value, key) for key, value in word_dict.items()]
        pairs = sorted(pairs, reverse=True)
        for idx, (value, key) in enumerate(pairs):
            self.word_list.append(key)
            self.word_dict[key.decode('utf8')] = idx
        self.fb = FeatureBuilder(self.word_dict, keyword_dict)
        self.smb = SimhashBuilder(self.word_list)
        self.mnb = MinhashBuilder()
        print 'FeatureContainer OK'

    def compute_feature(self, token_list):
        new_words = []
        for token in token_list:
            if token not in self.word_dict:
                new_words.append(token)
        if len(new_words) != 0:
            # Update word_list and word_dict
            self.fb.update_words(new_words)
            self.smb.update_words([word.encode('utf8') for word in new_words])
            self.word_dict = self.fb.word_dict
            self.word_list.extend([word.encode('utf8') for word in new_words])
        feature_vec = self.fb.compute(token_list)
        sim_hash, hash_vec = self.smb.sim_hash_nonzero(feature_vec)
        min_hash = self.mnb.min_hash(hash_vec)
        return feature_vec, sim_hash, min_hash
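The simhash fingerprints returned by compute_feature are typically compared by Hamming distance. A minimal sketch, assuming 64-bit fingerprints (consistent with min_sim = 64 in the launcher below); this helper is not part of the project code.

def hamming_distance(fp1, fp2, bits=64):
    # Count the bits on which two fingerprints differ
    x = (fp1 ^ fp2) & ((1 << bits) - 1)
    dist = 0
    while x:
        dist += 1
        x &= x - 1  # clear the lowest set bit
    return dist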
def preProcessingData(filename):
    loadData(filename)
    jt_time = time.time()
    global jt
    jt = JiebaTokenizer(stopwords_path, 'c')
    end_jt_time = time.time()
    print('JiebaTokenizer time: %s' % str(end_jt_time - jt_time))
    # Build the vocabulary (word list and word dict) from all labeled data
    wordList, wordDict = buildWords(jt, labelContents)
    end_build_time = time.time()
    print('buildWords time: %s' % str(end_build_time - end_jt_time))
    # Init the feature-vector builder
    global fb
    fb = FeatureBuilder(wordDict)
    end_fb_build_time = time.time()
    print('FeatureBuilder time: %s' % str(end_fb_build_time - end_build_time))
    # Init the simhash fingerprint builder
    global smb
    smb = SimhashBuilder(wordList)
    end_smb_build_time = time.time()
    print('SimhashBuilder time: %s' % str(end_smb_build_time - end_fb_build_time))
    # Generate feature vectors for every labeled document
    for flowId, processLabelDataMap in processFlowMap.items():
        processFlowMap[flowId] = generateDocFeatureVector(
            processLabelDataMap, jt, fb, smb)
    end_docFV_time = time.time()
    print('generateDocFeatureVector time: %s' % str(end_docFV_time - end_smb_build_time))
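buildWords is referenced above but not shown. A plausible reconstruction, assuming it tokenizes every labeled document and assigns each distinct token an index, keeping utf-8 byte strings in wordList and unicode keys in wordDict as the rest of the code expects; the real implementation may differ.

def buildWords(jt, labelContents):
    # Hypothetical sketch: index every distinct token across the labeled data
    wordDict = {}
    wordList = []
    for content in labelContents:
        for token in jt.tokens(content):
            if token not in wordDict:
                wordDict[token] = len(wordList)
                wordList.append(token.encode('utf8'))
    return wordList, wordDict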
# Init the tokenizer; this mainly loads the stopword list
jt = JiebaTokenizer(stopword_path, 'c')
# Tokenize: tokens() returns the list of tokens
doc_token_1 = jt.tokens(doc_data_1)
print 'Loading word dict...'
# Load the word file and build the word dict
word_list = []
with open(word_dict, 'r') as ins:
    for line in ins.readlines():
        word_list.append(line.split()[1])
word_dict = {}
for idx, ascword in enumerate(word_list):
    word_dict[ascword.decode('utf8')] = idx
# Build the nonzero feature vector: compute() returns a vector of
# (idx, value) elements with value > 0
fb = FeatureBuilder(word_dict)
doc_feat_1 = fb.compute(doc_token_1)
# Give every word in the dict a hash value
smb = SimhashBuilder(word_list)
doc_fl_1 = DocFeatLoader(smb, doc_feat_1)
# Test file, used to evaluate the algorithm
out_file = open('/home/lin.xiong/lsh_data/out.file', 'w')
fp_arr = []
fp_post_id_dict = {}
with open('/home/lin.xiong/lsh_data/lsh_clear.fingerprint', 'r') as fp:
    for line in fp:
        post_id, fingerprint = line.split('\t')[0], long(line.split('\t')[1])
        fp_post_id_dict[fingerprint] = post_id
        fp_arr.append(fingerprint)
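A hypothetical continuation: find the stored fingerprint closest to document 1 by Hamming distance (reusing the hamming_distance sketch above) and record the owning post id. The .fingerprint attribute on DocFeatLoader is an assumption; the actual attribute name may differ.

best_fp, best_dist = None, 64
for fp_val in fp_arr:
    d = hamming_distance(doc_fl_1.fingerprint, fp_val)  # assumed attribute
    if d < best_dist:
        best_fp, best_dist = fp_val, d
if best_fp is not None:
    out_file.write('%s\t%d\n' % (fp_post_id_dict[best_fp], best_dist))
out_file.close()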
doc_token_1 = jt.tokens(doc_data_1)
doc_token_2 = jt.tokens(doc_data_2)
print 'Loading word dict...'
# Load word list from word_dict
word_list = []
with open(word_dict, 'r') as ins:
    for line in ins.readlines():
        word_list.append(line.split()[1])
# Build unicode string word dict
word_dict = {}
for idx, ascword in enumerate(word_list):
    word_dict[ascword.decode('utf8')] = idx
# Build nonzero features
fb = FeatureBuilder(word_dict)
doc_feat_1 = fb.compute(doc_token_1)
doc_feat_2 = fb.compute(doc_token_2)
# Init simhash_builder
smb = SimhashBuilder(word_list)
doc_fl_1 = DocFeatLoader(smb, doc_feat_1)
doc_fl_2 = DocFeatLoader(smb, doc_feat_2)
if mode == '-c':
    print 'Matching by VSM + cosine distance'
    dist = cosine_distance_nonzero(doc_fl_1.feat_vec, doc_fl_2.feat_vec, norm=False)
    if dist > float(threshold):
        print 'Matching Result:\t<True:%s>' % dist
    else:
        print 'Matching Result:\t<False:%s>' % dist
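cosine_distance_nonzero is referenced above but not defined here. A minimal sketch over sparse (index, value) features; note that the `dist > threshold` test implies it returns a similarity (larger means closer), and the reading of norm=False as "skip normalization, return the raw dot product" is an assumption.

import math

def cosine_distance_nonzero(feat1, feat2, norm=True):
    # Dot product over the shared nonzero indices
    d1, d2 = dict(feat1), dict(feat2)
    dot = sum(v * d2[i] for i, v in d1.items() if i in d2)
    if not norm:
        return dot
    n1 = math.sqrt(sum(v * v for v in d1.values()))
    n2 = math.sqrt(sum(v * v for v in d2.values()))
    return dot / (n1 * n2) if n1 and n2 else 0.0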
if __name__ == "__main__":
    if len(sys.argv) < 7:
        print "Usage:\tlaunch.py word_dict_path stop_words_path fingerprint_path documents_path test_path result_path"
        exit(-1)
    # Load word list
    word_list = []
    with open(sys.argv[1], 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])
    # Init tokenizer
    jt = JiebaTokenizer(sys.argv[2], 'c')
    # Init feature_builder
    word_dict = {}
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx
    fb = FeatureBuilder(word_dict)
    # Init simhash_builder
    smb = SimhashBuilder(word_list)
    # Load fingerprint list
    fingerprint_list = []
    with open(sys.argv[3], 'r') as ins:
        for line in ins.readlines():
            fingerprint_list.append(int(line))
    # For exp: load document content
    doc_list = []
    with open(sys.argv[4], 'r') as ins:
        for line in ins.readlines():
            doc_list.append(line.strip())
    # Detection process begins
    min_sim = 64
    min_docid = 0
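A hypothetical sketch of how the detection loop might continue: fingerprint each test document from sys.argv[5], report the known document at minimum Hamming distance to sys.argv[6], reusing the hamming_distance sketch above. The decode step assumes the tokenizer expects unicode input.

    with open(sys.argv[5], 'r') as ins, open(sys.argv[6], 'w') as outs:
        for line in ins:
            tokens = jt.tokens(line.strip().decode('utf8'))
            fp = smb.sim_hash(fb.compute(tokens))
            min_sim, min_docid = 64, 0
            for docid, known_fp in enumerate(fingerprint_list):
                d = hamming_distance(fp, known_fp)
                if d < min_sim:
                    min_sim, min_docid = d, docid
            outs.write('%d\t%d\t%s\n' % (min_sim, min_docid, doc_list[min_docid]))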