def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
    """
    Performs the actual clustering and sampling.
    If old_twharr is empty, new_twharr is treated as the whole corpus and a complete
    clustering pass is run over it, which is relatively time-consuming.
    If old_twharr is not empty, it is assumed to already hold the information from a
    previous clustering, and each tweet holder in new_twharr is sampled into one of
    the existing cluster objects.
    :param old_twharr: list, elements are of type TweetHolder
    :param new_twharr: list, elements are of type TweetHolder
    :param iter_num: number of iterations used by the clustering loop
    :return:
    """
    cludict = self.cludict
    """ recalculate the valid dictionary """
    valid_dict = IdFreqDict()
    D = len(old_twharr) + len(new_twharr)
    for twh in old_twharr + new_twharr:
        valid_dict.merge_freq_from(twh.tokens, newest=False)
    valid_dict.drop_words_by_condition(3)
    """ reallocate & parameter """
    for cluster in cludict.values():
        cluster.clear()
    for old_twh in old_twharr:
        # if old_twh.get_cluid() not in cludict:
        #     continue
        old_twh.validate(valid_dict)
        old_cluster = old_twh.cluster
        old_twh.cluster = None
        old_twh.update_cluster(old_cluster)
    for new_twh in new_twharr:
        new_twh.validate(valid_dict)
        if old_twharr:
            new_cluid = self.sample(new_twh, D, using_max=True, no_new_clu=True)
        else:
            new_cluid = self.max_cluid
        cluster = cludict[new_cluid]
        new_twh.update_cluster(cluster)
    self.beta0 = self.beta * valid_dict.vocabulary_size()
    """ start iteration """
    for i in range(iter_num):
        print(' {} th clustering, clu num: {}'.format(i + 1, len(cludict)))
        for twh in new_twharr:
            cluster = twh.cluster
            twh.update_cluster(None)
            if cluster.twnum == 0:
                cludict.pop(cluster.cluid)
            cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
            if cluid not in cludict:
                self.max_cluid = cluid
                cludict[cluid] = ClusterHolder(cluid)
            twh.update_cluster(cludict[cluid])
    for twh in new_twharr:
        twh.update_cluid_into_tw()
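# Illustrative usage sketch (not from the original module): how the two modes described in
# the docstring above would typically be driven. The clusterer class name
# "GSDPMMStreamClusterer" and its construction are assumptions made only for this example;
# TweetHolder is the wrapper type named in the docstring.
#
#   clusterer = GSDPMMStreamClusterer()                              # hypothetical construction
#   first_batch = [TweetHolder(tw) for tw in first_twarr]
#   clusterer.GSDPMM_twarr([], first_batch, iter_num=30)             # full clustering over the corpus
#   second_batch = [TweetHolder(tw) for tw in second_twarr]
#   clusterer.GSDPMM_twarr(first_batch, second_batch, iter_num=5)    # sample new tweets into existing clusters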
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
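# Illustrative usage sketch (not from the original module): get_tokens_multi expects a
# directory of tweet files. It splits the file list across 20 worker processes, merges the
# per-process IdFreqDict results, and dumps the merged dictionary to getcfg().post_dict_file.
# The path below is a placeholder.
#
#   get_tokens_multi('/data/tweets/')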
def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
    cludict = self.cludict
    valid_dict = IdFreqDict()
    if len(old_twharr) > 0:
        for cluster in cludict.values():
            cluster.clear()
    D = len(old_twharr) + len(new_twharr)
    """ recalculate the valid dictionary """
    for twh in old_twharr + new_twharr:
        valid_dict.merge_freq_from(twh.tokens, newest=False)
    valid_dict.drop_words_by_condition(3)
    """ reallocate & parameter """
    for old_twh in old_twharr:
        if old_twh.get_cluid() not in cludict:
            continue
        old_twh.validate(valid_dict)
        old_cluster = old_twh.cluster
        old_twh.cluster = None
        old_twh.update_cluster(old_cluster)
    for new_twh in new_twharr:
        new_twh.validate(valid_dict)
        if len(old_twharr) > 0:
            new_cluid = self.sample(new_twh, D, using_max=True, no_new_clu=True)
        else:
            new_cluid = self.max_cluid
        new_twh.update_cluster(cludict[new_cluid])
    self.beta0 = self.beta * valid_dict.vocabulary_size()
    """ start iteration """
    for i in range(iter_num):
        print(' {} th clustering, clu num: {}'.format(i, len(cludict)))
        for twh in new_twharr:
            cluster = twh.cluster
            twh.update_cluster(None)
            if cluster.twnum == 0:
                cludict.pop(cluster.cluid)
            cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
            if cluid not in cludict:
                self.max_cluid = cluid
                cludict[self.max_cluid] = ClusterHolder(self.max_cluid)
            twh.update_cluster(cludict[cluid])
    for twh in new_twharr:
        twh.update_cluid_into_tw()
def get_tokens(file_list):
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for file in file_list:
        twarr = fu.load_array(file)
        total_doc_num += len(twarr)
        for tw in twarr:
            tokens = re.findall(r'[a-zA-Z_#\-]{3,}', tw[tk.key_text].lower())
            real_tokens = list()
            for token in tokens:
                if len(token) >= 16:
                    real_tokens.extend(pu.segment(token))
                else:
                    real_tokens.append(token)
            for token in real_tokens:
                if (not pu.is_stop_word(token)) and pu.has_azAZ(token) and 3 <= len(token):
                    id_freq_dict.count_word(token)
    id_freq_dict.drop_words_by_condition(2)
    print(id_freq_dict.vocabulary_size())
    return id_freq_dict, total_doc_num
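# A minimal, self-contained sketch of the token extraction rule used in get_tokens above:
# the text is lowercased, then re.findall keeps runs of letters, '_', '#', or '-' of length
# at least 3; tokens of 16+ characters would additionally be split by pu.segment.
# The helper name _extract_raw_tokens and the sample text are assumptions for this example only.
import re

def _extract_raw_tokens(text):
    # keep tokens of at least 3 characters drawn from the allowed character set
    return re.findall(r'[a-zA-Z_#\-]{3,}', text.lower())

if __name__ == '__main__':
    sample = "Breaking: #earthquake hits the city_center at 5 AM, me too"
    print(_extract_raw_tokens(sample))
    # -> ['breaking', '#earthquake', 'hits', 'the', 'city_center', 'too']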
class WordFreqCounter:
    def __init__(self, capignore=True, worddict=None):
        self.doc_num = 0
        self.capignore = capignore
        self.worddict = worddict if worddict else IdFreqDict()
        self.posdict = IdFreqDict()
        # pos_dict_file = os.path.abspath(os.path.dirname(__file__)) + os.path.sep + 'posdict.txt'
        # self.posdict.load_worddict(pos_dict_file)
        # self.notional = {'NN': 0, 'NNP': 0, 'NNPS': 0, 'NNS': 0, 'RB': 0, 'RBR': 0, 'RBS': 0,
        #                  'UH': 0, 'VB': 0, 'VBD': 0, 'VBG': 0, 'VBN': 0, 'VBP': 0, 'VBZ': 0, }
        # self.verb = {'VB': 0, 'VBD': 0, 'VBG': 0, 'VBN': 0, 'VBP': 0, 'VBZ': 0, }

    def vocabulary_size(self):
        # return self.worddict.vocabulary_size() + self.posdict.vocabulary_size()
        return self.worddict.vocabulary_size()

    @staticmethod
    def is_valid_wordlabel(wordlabel):
        isnotentity = wordlabel[1].startswith('O')
        return isnotentity

    def calculate_idf(self):
        if self.doc_num == 0:
            raise ValueError('No valid word has been recorded yet.')
        for word in self.worddict.dictionary:
            df = self.worddict.dictionary[word]['df']
            self.worddict.dictionary[word]['idf'] = 10 / np.log((self.doc_num + 1) / df)

    def feature_matrix_of_twarr(self, twarr):
        mtx = list()
        for tw in twarr:
            idfvec, added, num_entity = self.wordlabel_vector(tw[tk.key_wordlabels])
            mtx.append(idfvec * (np.log(len(added) + 1) + 1) * (np.log(num_entity + 1) + 1))
        return np.array(mtx)

    def wordlabel_vector(self, wordlabels):
        added_word_dict = dict()
        word_vector = np.array([0] * self.worddict.vocabulary_size(), dtype=np.float32)
        pos_vector = np.array([0] * self.posdict.vocabulary_size(), dtype=np.float32)
        for wordlabel in wordlabels:
            word = wordlabel[0].lower().strip("#") if self.capignore else wordlabel[0]
            # word = get_root_word(word) if wordlabel[2] in self.verb else word
            # if not wordlabel[0].lower().strip("#") == word:
            #     print(wordlabel[2], wordlabel[0].lower().strip("#"), '->', word)
            if not (pu.is_valid_keyword(word) and self.is_valid_wordlabel(wordlabel)):
                continue
            if word in added_word_dict:
                continue
            added_word_dict[word] = True
            if not self.worddict.is_word_in_dict(word):
                pos_tag = wordlabel[2]
                pos_vector[self.posdict.word2id(pos_tag)] += 1
            else:
                wordid = self.worddict.word_2_id(word)
                word_vector[wordid] = self.worddict.dictionary[word]['idf']
        added_word = sorted(added_word_dict.keys())
        added_entity = sorted([1 for w in wordlabels if not self.is_valid_wordlabel(w)])
        return word_vector, added_word, len(added_entity)
        # return np.concatenate([word_vector, pos_vector]), added_word, len(added_entity)

    def expand_dict_and_count_df_from_wordlabel(self, wordlabels):
        added_word_dict = dict()
        for wordlabel in wordlabels:
            word = wordlabel[0].lower().strip("#") if self.capignore else wordlabel[0]
            # word = get_root_word(word) if wordlabel[2] in self.verb else word
            if not (pu.is_valid_keyword(word) and self.is_valid_wordlabel(wordlabel)):
                continue
            if word in added_word_dict:
                continue
            added_word_dict[word] = True
            # "word" is neither an entity, an invalid keyword, nor a duplicate by now
            self.worddict.expand_dict_from_word(word)
            if 'df' not in self.worddict.dictionary[word]:
                self.worddict.dictionary[word]['df'] = 1
            else:
                self.worddict.dictionary[word]['df'] += 1
        self.doc_num += 1

    def expand_from_wordlabel_array(self, wordlabel_arr):
        for wordlabel in wordlabel_arr:
            self.expand_dict_and_count_df_from_wordlabel(wordlabel)
        self.worddict.reset_ids()

    def reserve_word_by_idf_condition(self, rsv_cond):
        self.calculate_idf()
        for word in list(self.worddict.dictionary.keys()):
            word_idf = self.worddict.dictionary[word]['idf']
            if not rsv_cond(word_idf):
                self.worddict.remove_word(word)
        self.worddict.reset_ids()

    def merge_from(self, othercounter):
        thisdict = self.worddict.dictionary
        otherdict = othercounter.worddict.dictionary
        for otherword, otherwordattr in otherdict.items():
            if otherword not in thisdict:
                thisdict[otherword] = otherwordattr
                thisdict[otherword]['idf'] /= 5

    # def most_common_words(self, rank):
    #     wordnum = self.worddict.vocabulary_size()
    #     if 0 < rank < 1:
    #         top_k = wordnum * rank
    #     elif rank > 1 and type(rank) is int:
    #         top_k = rank
    #     else:
    #         raise ValueError('rank is not a valid number' + str(rank))
    #     dic = self.worddict.dictionary
    #     return sorted(dic.keys(), key=lambda w: dic[w]['idf'])[:top_k]

    def dump_worddict(self, dict_file, overwrite=True):
        self.worddict.dump_worddict(dict_file, overwrite)

    def load_worddict(self, dict_file):
        self.worddict.load_worddict(dict_file)
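# A minimal numeric sketch of the value computed in WordFreqCounter.calculate_idf above,
# assuming doc_num documents have been counted and a word appeared in df of them; the
# counts below are chosen arbitrarily for illustration.
import numpy as np

doc_num, df = 999, 10
idf = 10 / np.log((doc_num + 1) / df)   # same expression as in calculate_idf
print(round(float(idf), 3))             # -> 2.171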
        self.ifd.load_dict(ifd_file)


# pre_dict_file = getcfg().pre_dict_file
post_dict_file = getcfg().post_dict_file

token_dict = IfdGetter(post_dict_file)
pre_list = [getcfg().pre_prop_file, getcfg().pre_comm_file, getcfg().pre_verb_file, getcfg().pre_hstg_file]
post_list = [getcfg().post_prop_file, getcfg().post_comm_file, getcfg().post_verb_file, getcfg().post_hstg_file]
# prop_dict, comm_dict, verb_dict, hstg_dict = [IfdGetter(post_file) for post_file in post_list]


if __name__ == '__main__':
    import utils.pattern_utils as pu

    def word_remove(word, freq):
        if pu.search_pattern(r'!?<>.,&\'`\^*', word) is not None or freq < 10:
            return True
        return False

    pre2post = dict(zip(pre_list, post_list))
    for pre, post in pre2post.items():
        ifd = IdFreqDict()
        ifd.load_dict(pre)
        pre_vocab = ifd.vocabulary_size()
        print('{} loaded, {} words'.format(pre, pre_vocab))
        ifd.drop_words_by_condition(word_remove)
        print('{} words dropped, remain {} words'.format(pre_vocab - ifd.vocabulary_size(), ifd.vocabulary_size()))
        ifd.dump_dict(post)
        print('dump over')