class TweetHolder:
    def __init__(self, tw):
        self.tw = tw
        self.id, self.retwid = tw.get(tk.key_id), tu.in_reply_to(tw)
        self.tokens = None
        self.retwset = None
        self.tokenize()

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.setdefault(key, value)

    def get(self, key):
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        self.tw.setdefault(key, value)

    # def get_id(self): return self.id
    #
    # def get_retwid(self): return self.retwid

    def get_cluid(self):
        if self.retwset is None:
            raise ValueError('_retwset in twh should not be None when getting cluid')
        return self.retwset.get_cluidarr()

    def tokenize(self):
        self.tokens = IdFreqDict()
        for token in self.tw[tk.key_spacy]:
            word = token.text.lower().strip('#').strip()
            if ClusterService.is_valid_keyword(word) and token_dict().has_word(word):
                self.tokens.count_word(word)

    def into_retwset(self, retwset):
        # if retwset is not None and retwset.can_join_twh(self):
        #     if self._retwset is not None:
        #         self._retwset.update_by_twh(self, factor=-1)
        self.retwset = retwset
        self.retwset.move_twh_into_cluster(self)

    def abandon(self):
        self.retwset.remove_twh_from_cluster(self)
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
def preprocess_twarr(self, twarr):
    """ pre-process the tweet text, including dropping non-common terms """
    key_tokens = tk.key_wordlabels
    self.twarr = twarr
    for word_dict in self.word_dicts:
        word_dict.clear()
    # self.prop_n_dict, self.comm_n_dict, self.verb_dict, self.ht_dict = \
    #     IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict()
    pos_tag2dict_map = dict([(tag, self.prop_n_dict) for tag in self.prop_n_tags] +
                            [(tag, self.comm_n_dict) for tag in self.comm_n_tags] +
                            [(tag, self.verb_dict) for tag in self.verb_tags] +
                            [(tag, self.ht_dict) for tag in self.ht_rags])
    for tw in twarr:
        tokens = tw[key_tokens]
        # iterate backwards so deletion does not shift indices that are still pending
        for i in range(len(tokens) - 1, -1, -1):
            tokens[i][0] = tokens[i][0].lower().strip()
            word, _, pos_tag = tokens[i]
            if not cs.is_valid_keyword(word):
                del tokens[i]
                continue
            if word.startswith('#') and not pos_tag.lower() == 'ht':
                pos_tag = tokens[i][2] = 'HT'
            if pos_tag in pos_tag2dict_map:
                pos_tag2dict_map[pos_tag].count_word(word)
    self.prop_n_dict.drop_words_by_condition(3)
    self.comm_n_dict.drop_words_by_condition(4)
    self.verb_dict.drop_words_by_condition(4)
    self.ht_dict.drop_words_by_condition(3)
    for tw in twarr:
        tw[self.key_prop_n], tw[self.key_comm_n], tw[self.key_verb], tw[self.key_ht] = \
            IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict()
        tw_pos_tag2dict_map = dict([(tag, tw[self.key_prop_n]) for tag in self.prop_n_tags] +
                                   [(tag, tw[self.key_comm_n]) for tag in self.comm_n_tags] +
                                   [(tag, tw[self.key_verb]) for tag in self.verb_tags] +
                                   [(tag, tw[self.key_ht]) for tag in self.ht_rags])
        for token in tw[key_tokens]:
            word, _, pos_tag = token
            if pos_tag in tw_pos_tag2dict_map and pos_tag2dict_map[pos_tag].has_word(word):
                tw_pos_tag2dict_map[pos_tag].count_word(word)
def get_semantic_tokens_multi(file_path):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_prop_dict_file},
        ark.comm_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_comm_dict_file},
        ark.verb_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_verb_dict_file},
        ark.hstg_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_hstg_dict_file},
    }
    total_doc_num = 0
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 40)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_semantic_tokens, [(file_list,) for file_list in file_list_block])
    for res_type_info, doc_num in res_list:
        total_doc_num += doc_num
        for label in res_type_info.keys():
            pos_type_info[label][K_IFD].merge_freq_from(res_type_info[label][K_IFD])
    print('total_doc_num', total_doc_num)
    for label in pos_type_info.keys():
        ifd, file_name = pos_type_info[label][K_IFD], pos_type_info[label][K_FILE]
        ifd.drop_words_by_condition(3)
        if label != ark.hstg_label:
            ifd.drop_words_by_condition(lambda word, _: word.startswith('#'))
        ifd.dump_dict(file_name)
        print('{}; vocab size:{}'.format(file_name, ifd.vocabulary_size()))
class TweetHolder:
    def __init__(self, doc):
        self.cluster = None
        self.text = doc.text
        self.topic = doc.topic
        self.tokenids = doc.tokenids
        self.ifd = IdFreqDict()
        for t in self.tokenids:
            self.ifd.count_word(t)

    def get_cluid(self):
        return self.cluster.cluid

    def update_cluster(self, cluster):
        if self.cluster is not None:
            self.cluster.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is not None:
            cluster.update_by_twh(self, factor=1)
class IfdGetter:
    K_IFD_FILE = 'ifd_file'

    def __init__(self, ifd_file=None):
        self.ifd_file = ifd_file
        self.ifd = None

    def __call__(self, *args, **kwargs):
        if IfdGetter.K_IFD_FILE in kwargs:
            self.ifd_file = kwargs.get(IfdGetter.K_IFD_FILE)
        if self.ifd_file is None:
            raise ValueError('An id freq dict should be specified.')
        if self.ifd is None:
            self.ifd = IdFreqDict()
            self.ifd.load_dict(self.ifd_file)
        return self.ifd

    def reload(self, ifd_file):
        if self.ifd is not None:
            self.ifd.load_dict(ifd_file)
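# Usage sketch for IfdGetter (illustrative only; the file path below is hypothetical).
# The getter loads its IdFreqDict lazily on the first call and then serves the cached
# instance, so module-level getters such as token_dict act as process-wide singletons.
token_dict = IfdGetter('conf/post_dict.csv')
ifd = token_dict()       # first call: loads the dict from disk
same_ifd = token_dict()  # later calls: return the cached instance without reloading
assert ifd is same_ifd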
def get_semantic_tokens(file_list):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict()},
        ark.comm_label: {K_IFD: IdFreqDict()},
        ark.verb_label: {K_IFD: IdFreqDict()},
        ark.hstg_label: {K_IFD: IdFreqDict()},
    }
    total_doc_num = 0
    for file in file_list:
        twarr = ark.twarr_ark(fu.load_array(file))
        total_doc_num += len(twarr)
        pos_tokens = au.merge_array([tw[tk.key_ark] for tw in twarr])
        for pos_token in pos_tokens:
            word = pos_token[0].strip().lower()
            if len(word) <= 2 or not pu.is_valid_keyword(word):
                continue
            real_label = ark.pos_token2semantic_label(pos_token)
            if real_label:
                pos_type_info[real_label][K_IFD].count_word(word)
    return pos_type_info, total_doc_num
def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
    """
    Perform the actual clustering and sampling. If old_twharr is empty, new_twharr is
    treated as the full corpus and a complete clustering pass is run over it, which is
    relatively slow; if old_twharr is non-empty, it is assumed to already carry the
    results of previous clustering, and each tweet holder in new_twharr is sampled
    into one of the existing clusters.
    :param old_twharr: list whose elements are of type TweetHolder
    :param new_twharr: list whose elements are of type TweetHolder
    :param iter_num: number of iterations of the clustering loop
    :return:
    """
    cludict = self.cludict
    """ recalculate the valid dictionary """
    valid_dict = IdFreqDict()
    D = len(old_twharr) + len(new_twharr)
    for twh in old_twharr + new_twharr:
        valid_dict.merge_freq_from(twh.tokens, newest=False)
    valid_dict.drop_words_by_condition(3)
    """ reallocate & parameter """
    for cluster in cludict.values():
        cluster.clear()
    for old_twh in old_twharr:
        # if old_twh.get_cluid() not in cludict:
        #     continue
        old_twh.validate(valid_dict)
        old_cluster = old_twh.cluster
        old_twh.cluster = None
        old_twh.update_cluster(old_cluster)
    for new_twh in new_twharr:
        new_twh.validate(valid_dict)
        if old_twharr:
            new_cluid = self.sample(new_twh, D, using_max=True, no_new_clu=True)
        else:
            new_cluid = self.max_cluid
        cluster = cludict[new_cluid]
        new_twh.update_cluster(cluster)
    self.beta0 = self.beta * valid_dict.vocabulary_size()
    """ start iteration """
    for i in range(iter_num):
        print(' {} th clustering, clu num: {}'.format(i + 1, len(cludict)))
        for twh in new_twharr:
            cluster = twh.cluster
            twh.update_cluster(None)
            if cluster.twnum == 0:
                cludict.pop(cluster.cluid)
            cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
            if cluid not in cludict:
                self.max_cluid = cluid
                cludict[cluid] = ClusterHolder(cluid)
            twh.update_cluster(cludict[cluid])
    for twh in new_twharr:
        twh.update_cluid_into_tw()
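# The sample method called above is not shown in these snippets. The sketch below is a
# plausible reconstruction following the standard GSDPMM conditional (Yin & Wang, KDD
# 2014): an existing cluster is weighted by its popularity times a word-likelihood
# ratio, and a brand-new cluster by the alpha term. alpha/beta/beta0, cludict and
# max_cluid appear in the surrounding code; freq_of_word and get_freq_sum are assumed
# accessor names on IdFreqDict, so treat this as a sketch rather than the original.
import random

def sample(self, twh, D, using_max=False, no_new_clu=False):
    cluids, probs = list(), list()
    for cluid, cluster in self.cludict.items():
        prob = cluster.twnum / (D - 1 + self.alpha * D)  # popularity of cluster z
        ii = 0
        for word, freq in twh.valid_tokens.word_freq_enumerate(newest=False):
            n_zw = cluster.tokens.freq_of_word(word) if cluster.tokens.has_word(word) else 0
            for jj in range(freq):
                prob *= (n_zw + self.beta + jj) / (cluster.tokens.get_freq_sum() + self.beta0 + ii)
                ii += 1
        cluids.append(cluid)
        probs.append(prob)
    if not no_new_clu:
        prob = self.alpha * D / (D - 1 + self.alpha * D)  # weight of opening a new cluster
        ii = 0
        for word, freq in twh.valid_tokens.word_freq_enumerate(newest=False):
            for jj in range(freq):
                prob *= (self.beta + jj) / (self.beta0 + ii)
                ii += 1
        cluids.append(self.max_cluid + 1)
        probs.append(prob)
    if using_max:  # in the final iteration, take the argmax instead of sampling
        return cluids[probs.index(max(probs))]
    return random.choices(cluids, weights=probs, k=1)[0]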
class TweetHolder:
    # using_ifd = token_dict()

    def __init__(self, tw):
        self.tw = tw
        self.id = tw.get(tk.key_id)
        self.cluster = None
        self.tokens = IdFreqDict()
        self.valid_tokens = IdFreqDict()
        self.tokenize()

    def __contains__(self, key):
        return key in self.tw

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.setdefault(key, value)

    def get(self, key):
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        self.tw.setdefault(key, value)

    def get_cluid(self):
        return self.cluster.cluid

    def update_cluid_into_tw(self):
        self.tw[tk.key_event_cluid] = self.cluster.cluid if self.cluster is not None else None

    def tokenize(self):
        # tokens = (t.text.lower() for t in self.tw[tk.key_spacy])
        tokens = pu.findall(pu.tokenize_pattern, self.tw[tk.key_text].lower())
        tokens = [t.strip() for t in tokens if pu.is_valid_keyword(t) and not pu.is_stop_word(t)]
        for token in tokens:
            self.tokens.count_word(token)

    def validate(self, using_ifd):
        self.valid_tokens.clear()
        for word, freq in self.tokens.word_freq_enumerate(newest=False):
            if using_ifd.has_word(word):
                self.valid_tokens.count_word(word, freq)

    def update_cluster(self, cluster):
        if self.cluster is not None:
            self.cluster.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is not None:
            cluster.update_by_twh(self, factor=1)
def get_ifd_from_docarr(docarr):
    """ assume that docarr has been tokenized """
    ifd = IdFreqDict()
    for doc in docarr:
        ifd.count_words(doc.tokens)
    ifd.reset_id()
    return ifd
def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
    cludict = self.cludict
    valid_dict = IdFreqDict()
    if len(old_twharr) > 0:
        for cluster in cludict.values():
            cluster.clear()
    D = len(old_twharr) + len(new_twharr)
    """ recalculate the valid dictionary """
    for twh in old_twharr + new_twharr:
        valid_dict.merge_freq_from(twh.tokens, newest=False)
    valid_dict.drop_words_by_condition(3)
    """ reallocate & parameter """
    for old_twh in old_twharr:
        if old_twh.get_cluid() not in cludict:
            continue
        old_twh.validate(valid_dict)
        old_cluster = old_twh.cluster
        old_twh.cluster = None
        old_twh.update_cluster(old_cluster)
    for new_twh in new_twharr:
        new_twh.validate(valid_dict)
        if len(old_twharr) > 0:
            new_cluid = self.sample(new_twh, D, using_max=True, no_new_clu=True)
        else:
            new_cluid = self.max_cluid
        new_twh.update_cluster(cludict[new_cluid])
    self.beta0 = self.beta * valid_dict.vocabulary_size()
    """ start iteration """
    for i in range(iter_num):
        print(' {} th clustering, clu num: {}'.format(i, len(cludict)))
        for twh in new_twharr:
            cluster = twh.cluster
            twh.update_cluster(None)
            if cluster.twnum == 0:
                cludict.pop(cluster.cluid)
            cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
            if cluid not in cludict:
                self.max_cluid = cluid
                cludict[self.max_cluid] = ClusterHolder(self.max_cluid)
            twh.update_cluster(cludict[cluid])
    for twh in new_twharr:
        twh.update_cluid_into_tw()
def get_tokens(file_list):
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for file in file_list:
        twarr = fu.load_array(file)
        total_doc_num += len(twarr)
        for tw in twarr:
            tokens = re.findall(r'[a-zA-Z_#\-]{3,}', tw[tk.key_text].lower())
            real_tokens = list()
            for token in tokens:
                if len(token) >= 16:
                    real_tokens.extend(pu.segment(token))
                else:
                    real_tokens.append(token)
            for token in real_tokens:
                if (not pu.is_stop_word(token)) and pu.has_azAZ(token) and 3 <= len(token):
                    id_freq_dict.count_word(token)
    id_freq_dict.drop_words_by_condition(2)
    print(id_freq_dict.vocabulary_size())
    return id_freq_dict, total_doc_num
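# Quick illustration of the token extraction above (standalone; the sample text is
# made up). The regex keeps runs of three or more letters/underscores/#/-, so short
# function words drop out; runs of 16+ characters (typically glued-together hashtags)
# are then split further by pu.segment before stop-word filtering and counting.
import re
sample = 'Breaking: #earthquakehitscity is trending now'
print(re.findall(r'[a-zA-Z_#\-]{3,}', sample.lower()))
# -> ['breaking', '#earthquakehitscity', 'trending', 'now']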
class ClusterHolder:
    def __init__(self, cluid):
        self.cluid = cluid
        self.twhdict = dict()
        self.tokens = IdFreqDict()
        self.twnum = 0

    """ basic functions """

    def get_twharr(self):
        return list(self.twhdict.values())

    def get_twarr(self):
        return [twh.tw for twh in self.twhdict.values()]

    def get_lbarr(self):
        return [twh[tk.key_event_label] for twh in self.twhdict.values()]

    def clear(self):
        self.twhdict.clear()
        self.tokens.clear()
        self.twnum = 0

    def update_by_twh(self, twh, factor):
        twh_tokens = twh.valid_tokens
        twh_id = twh.id
        if factor > 0:
            self.tokens.merge_freq_from(twh_tokens, newest=False)
            self.twhdict[twh_id] = twh
            self.twnum += 1
        else:
            self.tokens.drop_freq_from(twh_tokens, newest=False)
            if twh_id in self.twhdict:
                self.twhdict.pop(twh_id)
            self.twnum -= 1

    """ extra functions """

    def get_rep_label(self, rep_thres):
        lb_count = Counter(self.get_lbarr())
        max_label, max_lbnum = lb_count.most_common(1)[0]
        rep_label = -1 if max_lbnum < self.twnum * rep_thres else max_label
        return rep_label
class WordFreqCounter:
    def __init__(self, capignore=True, worddict=None):
        self.doc_num = 0
        self.capignore = capignore
        self.worddict = worddict if worddict else IdFreqDict()
        self.posdict = IdFreqDict()
        # pos_dict_file = os.path.abspath(os.path.dirname(__file__)) + os.path.sep + 'posdict.txt'
        # self.posdict.load_worddict(pos_dict_file)
        # self.notional = {'NN': 0, 'NNP': 0, 'NNPS': 0, 'NNS': 0, 'RB': 0, 'RBR': 0, 'RBS': 0,
        #                  'UH': 0, 'VB': 0, 'VBD': 0, 'VBG': 0, 'VBN': 0, 'VBP': 0, 'VBZ': 0, }
        # self.verb = {'VB': 0, 'VBD': 0, 'VBG': 0, 'VBN': 0, 'VBP': 0, 'VBZ': 0, }

    def vocabulary_size(self):
        # return self.worddict.vocabulary_size() + self.posdict.vocabulary_size()
        return self.worddict.vocabulary_size()

    @staticmethod
    def is_valid_wordlabel(wordlabel):
        isnotentity = wordlabel[1].startswith('O')
        return isnotentity

    def calculate_idf(self):
        if self.doc_num == 0:
            raise ValueError('No valid word has been recorded yet.')
        for word in self.worddict.dictionary:
            df = self.worddict.dictionary[word]['df']
            self.worddict.dictionary[word]['idf'] = 10 / np.log((self.doc_num + 1) / df)

    def feature_matrix_of_twarr(self, twarr):
        mtx = list()
        for tw in twarr:
            idfvec, added, num_entity = self.wordlabel_vector(tw[tk.key_wordlabels])
            mtx.append(idfvec * (np.log(len(added) + 1) + 1) * (np.log(num_entity + 1) + 1))
        return np.array(mtx)

    def wordlabel_vector(self, wordlabels):
        added_word_dict = dict()
        word_vector = np.array([0] * self.worddict.vocabulary_size(), dtype=np.float32)
        pos_vector = np.array([0] * self.posdict.vocabulary_size(), dtype=np.float32)
        for wordlabel in wordlabels:
            word = wordlabel[0].lower().strip("#") if self.capignore else wordlabel[0]
            # word = get_root_word(word) if wordlabel[2] in self.verb else word
            # if not wordlabel[0].lower().strip("#") == word:
            #     print(wordlabel[2], wordlabel[0].lower().strip("#"), '->', word)
            if not (pu.is_valid_keyword(word) and self.is_valid_wordlabel(wordlabel)):
                continue
            if word in added_word_dict:
                continue
            added_word_dict[word] = True
            if not self.worddict.is_word_in_dict(word):
                pos_tag = wordlabel[2]
                pos_vector[self.posdict.word2id(pos_tag)] += 1
            else:
                wordid = self.worddict.word_2_id(word)
                word_vector[wordid] = self.worddict.dictionary[word]['idf']
        added_word = sorted(added_word_dict.keys())
        added_entity = sorted([1 for w in wordlabels if not self.is_valid_wordlabel(w)])
        return word_vector, added_word, len(added_entity)
        # return np.concatenate([word_vector, pos_vector]), added_word, len(added_entity)

    def expand_dict_and_count_df_from_wordlabel(self, wordlabels):
        added_word_dict = dict()
        for wordlabel in wordlabels:
            word = wordlabel[0].lower().strip("#") if self.capignore else wordlabel[0]
            # word = get_root_word(word) if wordlabel[2] in self.verb else word
            if not (pu.is_valid_keyword(word) and self.is_valid_wordlabel(wordlabel)):
                continue
            else:
                if word in added_word_dict:
                    continue
                added_word_dict[word] = True
                # "word" is by now neither an entity, nor an invalid keyword, nor a duplicate
                self.worddict.expand_dict_from_word(word)
                if 'df' not in self.worddict.dictionary[word]:
                    self.worddict.dictionary[word]['df'] = 1
                else:
                    self.worddict.dictionary[word]['df'] += 1
        self.doc_num += 1

    def expand_from_wordlabel_array(self, wordlabel_arr):
        for wordlabel in wordlabel_arr:
            self.expand_dict_and_count_df_from_wordlabel(wordlabel)
        self.worddict.reset_ids()

    def reserve_word_by_idf_condition(self, rsv_cond):
        self.calculate_idf()
        for word in list(self.worddict.dictionary.keys()):
            word_idf = self.worddict.dictionary[word]['idf']
            if not rsv_cond(word_idf):
                self.worddict.remove_word(word)
        self.worddict.reset_ids()

    def merge_from(self, othercounter):
        thisdict = self.worddict.dictionary
        otherdict = othercounter.worddict.dictionary
        for otherword, otherwordattr in otherdict.items():
            if otherword not in thisdict:
                thisdict[otherword] = otherwordattr
                thisdict[otherword]['idf'] /= 5

    # def most_common_words(self, rank):
    #     wordnum = self.worddict.vocabulary_size()
    #     if 0 < rank < 1:
    #         top_k = wordnum * rank
    #     elif rank > 1 and type(rank) is int:
    #         top_k = rank
    #     else:
    #         raise ValueError('rank is not a valid number' + str(rank))
    #     dic = self.worddict.dictionary
    #     return sorted(dic.keys(), key=lambda w: dic[w]['idf'])[:top_k]

    def dump_worddict(self, dict_file, overwrite=True):
        self.worddict.dump_worddict(dict_file, overwrite)

    def load_worddict(self, dict_file):
        self.worddict.load_worddict(dict_file)
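# Worked example of calculate_idf's weighting (numbers hypothetical): with
# doc_num = 9999 and df = 100, idf = 10 / ln((9999 + 1) / 100) = 10 / ln(100) ≈ 2.17,
# while df = 5000 gives 10 / ln(2) ≈ 14.43. Note this is the reciprocal of the usual
# log-IDF, so words with a higher document frequency receive the larger weights in
# wordlabel_vector.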
def __init__(self):
    self.twarr = None
    self.word_dicts = (self.prop_n_dict, self.comm_n_dict, self.verb_dict, self.ht_dict) = \
        (IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict())
def __init__(self):
    self.type_ifd_dict = dict([(k_type, IdFreqDict()) for k_type in TokenSet.KEY_LIST])
class ClusterHolder:
    def __init__(self, cluid):
        self.cluid = cluid
        self.twhdict = dict()
        self.tokens = IdFreqDict()
        self.twnum = 0

    """ basic functions """

    def get_twharr(self):
        """
        Return the tweet holder objects currently held by this cluster, in no particular order.
        :return: list, each element of type TweetHolder
        """
        return list(self.twhdict.values())

    def get_twarr(self):
        """
        Return the tweet objects currently held by this cluster, in no particular order.
        :return: list of tweets
        """
        return [twh.tw for twh in self.twhdict.values()]

    def get_lbarr(self):
        """
        Return the labels of the tweets currently held by this cluster (if that
        information exists), in no particular order.
        :return: list of int, each a tweet's original label (the cluster it originally belonged to)
        """
        return [twh[tk.key_event_label] for twh in self.twhdict.values()]

    def clear(self):
        """
        Clear the statistics of this cluster, including its token table, tweet dict and tweet count.
        :return:
        """
        self.twhdict.clear()
        self.tokens.clear()
        self.twnum = 0

    def update_by_twh(self, twh, factor):
        """
        Add the given tweet holder to / remove it from this cluster, and update the
        cluster's token table and other statistics according to its valid_tokens.
        :param twh: TweetHolder, the tweet holder to add or remove
        :param factor: int, a positive value adds the tweet, otherwise it is removed
        :return:
        """
        twh_tokens = twh.valid_tokens
        twh_id = twh.id
        if factor > 0:
            self.tokens.merge_freq_from(twh_tokens, newest=False)
            self.twhdict[twh_id] = twh
            self.twnum += 1
        else:
            self.tokens.drop_freq_from(twh_tokens, newest=False)
            if twh_id in self.twhdict:
                self.twhdict.pop(twh_id)
            self.twnum -= 1

    """ extra functions """

    def get_rep_label(self, rep_thres):
        """
        Determine whether some label accounts for a proportion of this cluster's tweets
        greater than the threshold rep_thres.
        :param rep_thres: float, the decision threshold
        :return: int, the label if one is sufficiently represented, otherwise -1
        """
        lb_count = Counter(self.get_lbarr())
        max_label, max_lbnum = lb_count.most_common(1)[0]
        rep_label = -1 if max_lbnum < self.twnum * rep_thres else max_label
        return rep_label
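# Bookkeeping contract of update_by_twh (illustrative; twh stands for an already
# validated TweetHolder, i.e. twh.valid_tokens is populated). An add/remove pair
# restores the cluster statistics exactly, which is what the Gibbs sampling loop in
# GSDPMM_twarr relies on when it detaches a tweet, re-samples its cluster id, and
# re-attaches it.
cluster = ClusterHolder(cluid=0)
cluster.update_by_twh(twh, factor=1)   # merges twh.valid_tokens into the cluster, twnum -> 1
cluster.update_by_twh(twh, factor=-1)  # drops the same counts again, twnum -> 0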
def __init__(self, cluid):
    self.cluid = cluid
    self.twhdict = dict()
    self.tokens = IdFreqDict()
    self.twnum = 0
class TweetHolder:
    # using_ifd = token_dict()

    def __init__(self, tw):
        self.tw = tw
        self.id = tw.get(tk.key_id)
        self.cluster = None
        self.tokens = IdFreqDict()
        self.valid_tokens = IdFreqDict()
        self.tokenize()

    def __contains__(self, key):
        return key in self.tw

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.setdefault(key, value)

    def get(self, key):
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        self.tw.setdefault(key, value)

    def get_cluid(self):
        """
        Return the ID of the cluster this tweet is currently assigned to.
        :return: int, the cluster ID
        """
        return self.cluster.cluid

    def update_cluid_into_tw(self):
        """
        Set the tk.key_event_cluid field of the tweet object (self.tw, a dict) to the ID
        of the currently assigned cluster, or to None if no cluster has been assigned yet.
        :return:
        """
        self.tw[tk.key_event_cluid] = self.cluster.cluid if self.cluster is not None else None

    def tokenize(self):
        """
        Tokenize the tweet's text and store the result, counting the tokens into self.tokens.
        :return:
        """
        # tokens = (t.text.lower() for t in self.tw[tk.key_spacy])
        tokens = pu.valid_tokenize(self.tw[tk.key_text].lower())
        for token in tokens:
            self.tokens.count_word(token)

    def validate(self, using_ifd):
        """
        Refresh the valid token table: re-count into self.valid_tokens those words of
        self.tokens that also occur in using_ifd.
        :param using_ifd: utils.id_freq_dict.IdFreqDict holding the valid tokens of the current iteration
        :return:
        """
        self.valid_tokens.clear()
        for word, freq in self.tokens.word_freq_enumerate(newest=False):
            if using_ifd.has_word(word):
                self.valid_tokens.count_word(word, freq)

    def update_cluster(self, cluster):
        """
        If this tweet already belongs to a cluster, detach it from self.cluster first;
        then merge it into cluster (if not None) and update self.cluster.
        :param cluster: the target cluster object
        :return:
        """
        if self.cluster is not None:
            self.cluster.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is not None:
            cluster.update_by_twh(self, factor=1)
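# Typical lifecycle of a TweetHolder within one clustering pass (illustrative;
# tw_json stands for a hypothetical tweet dict containing tk.key_id and tk.key_text,
# and valid_dict / cluster_a / cluster_b come from the surrounding GSDPMM code).
twh = TweetHolder(tw_json)     # tokenizes tw[tk.key_text] into twh.tokens
twh.validate(valid_dict)       # keeps only tokens present in the shared valid_dict
twh.update_cluster(cluster_a)  # attach: cluster_a gains twh.valid_tokens
twh.update_cluster(cluster_b)  # move: detaches from cluster_a, attaches to cluster_b
twh.update_cluid_into_tw()     # writes cluster_b.cluid back into the raw tweet dict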
        self.ifd.load_dict(ifd_file)


# pre_dict_file = getcfg().pre_dict_file
post_dict_file = getcfg().post_dict_file
token_dict = IfdGetter(post_dict_file)

pre_list = [getcfg().pre_prop_file, getcfg().pre_comm_file, getcfg().pre_verb_file, getcfg().pre_hstg_file]
post_list = [getcfg().post_prop_file, getcfg().post_comm_file, getcfg().post_verb_file, getcfg().post_hstg_file]
# prop_dict, comm_dict, verb_dict, hstg_dict = [IfdGetter(post_file) for post_file in post_list]

if __name__ == '__main__':
    import utils.pattern_utils as pu

    def word_remove(word, freq):
        # drop words that contain special characters or occur fewer than 10 times
        if pu.search_pattern(r'!?<>.,&\'`\^*', word) is not None or freq < 10:
            return True
        return False

    pre2post = dict(zip(pre_list, post_list))
    for pre, post in pre2post.items():
        ifd = IdFreqDict()
        ifd.load_dict(pre)
        pre_vocab = ifd.vocabulary_size()
        print('{} loaded, {} words'.format(pre, pre_vocab))
        ifd.drop_words_by_condition(word_remove)
        print('{} words dropped, remain {} words'.format(pre_vocab - ifd.vocabulary_size(),
                                                         ifd.vocabulary_size()))
        ifd.dump_dict(post)
    print('dump over')
def __init__(self, capignore=True, worddict=None):
    self.doc_num = 0
    self.capignore = capignore
    self.worddict = worddict if worddict else IdFreqDict()
    self.posdict = IdFreqDict()
def load_ifd(self):
    from utils.id_freq_dict import IdFreqDict
    return IdFreqDict().load_dict(self.dict_file)