class FeatureContainer:
    def __init__(self, word_dict_path):
        # Load the word list: one entry per line, word in the second column
        self.word_dict_path = word_dict_path
        self.word_list = []
        with open(word_dict_path, 'r') as ins:
            for line in ins.readlines():
                self.word_list.append(line.split()[1])
        # Map each word (decoded to unicode) to its index in the word list
        self.word_dict = {}
        for idx, ascword in enumerate(self.word_list):
            self.word_dict[ascword.decode('utf8')] = idx
        self.fb = FeatureBuilder(self.word_dict)
        self.smb = SimhashBuilder(self.word_list)
        print 'Loaded', len(self.word_list), 'words'

    def compute_feature(self, token_list):
        # Collect tokens that are not yet in the vocabulary
        new_words = []
        for token in token_list:
            if token not in self.word_dict:
                new_words.append(token)
        if len(new_words) != 0:
            # Update word_list and word_dict with the new tokens
            self.fb.update_words(new_words)
            self.smb.update_words([word.encode('utf8') for word in new_words])
            self.word_dict = self.fb.word_dict
            self.word_list.extend([word.encode('utf8') for word in new_words])
        # Build the feature vector and its simhash fingerprint
        feature_vec = self.fb.compute(token_list)
        return feature_vec, self.smb.sim_hash(feature_vec)
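# A minimal usage sketch for this class. Assumptions: FeatureBuilder and
# SimhashBuilder are provided elsewhere in this project, the word-dict file
# keeps the word in its second whitespace-separated column, and the file name
# and tokens below are hypothetical placeholders for illustration only.
if __name__ == '__main__':
    fc = FeatureContainer('word_dict.txt')  # hypothetical path to the word list
    tokens = [u'simhash', u'feature', u'vector']  # pre-tokenized unicode tokens
    feature_vec, fingerprint = fc.compute_feature(tokens)
    print 'simhash fingerprint:', fingerprint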
class FeatureContainer:
    def __init__(self, word_dict, keyword_dict=None):
        # Build word list and index from the given word dict,
        # sorted by value (e.g. word frequency) in descending order
        self.word_list = []
        self.word_dict = {}
        freq_words = [(value, key) for key, value in word_dict.items()]
        freq_words = sorted(freq_words, reverse=True)
        for idx, (value, key) in enumerate(freq_words):
            self.word_list.append(key)
            self.word_dict[key.decode('utf8')] = idx
        self.fb = FeatureBuilder(self.word_dict, keyword_dict)
        self.smb = SimhashBuilder(self.word_list)
        self.mnb = MinhashBuilder()
        print 'FeatureContainer OK'

    def compute_feature(self, token_list):
        # Collect tokens that are not yet in the vocabulary
        new_words = []
        for token in token_list:
            if token not in self.word_dict:
                new_words.append(token)
        if len(new_words) != 0:
            # Update word_list and word_dict with the new tokens
            self.fb.update_words(new_words)
            self.smb.update_words([word.encode('utf8') for word in new_words])
            self.word_dict = self.fb.word_dict
            self.word_list.extend([word.encode('utf8') for word in new_words])
        # Build the feature vector, then derive its simhash and minhash fingerprints
        feature_vec = self.fb.compute(token_list)
        sim_hash, hash_vec = self.smb.sim_hash_nonzero(feature_vec)
        min_hash = self.mnb.min_hash(hash_vec)
        return feature_vec, sim_hash, min_hash
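# A minimal usage sketch for this variant. Assumptions: FeatureBuilder,
# SimhashBuilder and MinhashBuilder are provided elsewhere in this project;
# the word frequencies and tokens below are hypothetical placeholders.
if __name__ == '__main__':
    word_counts = {'simhash': 3, 'minhash': 2, 'feature': 1}  # {word: frequency}
    fc = FeatureContainer(word_counts)
    feature_vec, sim_hash, min_hash = fc.compute_feature([u'simhash', u'minhash'])
    print 'simhash:', sim_hash, 'minhash:', min_hash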