class ClusterHolder:
    """One cluster's state: its retweet-set map, token frequencies and tweet count."""

    def __init__(self, cluid):
        self.cluid = cluid            # cluster identifier
        self.retwsetdict = dict()     # master tweet-holder id -> retweet set
        self.tokens = IdFreqDict()    # aggregated token frequencies of member tweets
        self.twnum = 0                # number of tweets currently in this cluster

    def update_by_retwset(self, retwset, factor):
        """Add (factor > 0) or remove (otherwise) a whole retweet set from this cluster."""
        master_id = retwset.master_twhid
        if factor > 0:
            self.retwsetdict[master_id] = retwset
            for holder in retwset.get_twharr():
                self.update_by_twh(holder, 1)
        else:
            for holder in retwset.get_twharr():
                self.update_by_twh(holder, -1)
            self.retwsetdict.pop(master_id)

    def update_by_twh(self, twh, factor):
        """Merge (factor > 0) or drop (otherwise) one tweet holder's tokens, adjusting the count."""
        freq = twh.tokens
        if factor > 0:
            self.tokens.merge_freq_from(freq)
            self.twnum += 1
        else:
            self.tokens.drop_freq_from(freq)
            self.twnum -= 1
def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
    """
    Perform the actual clustering and Gibbs sampling.
    If old_twharr is empty, new_twharr is treated as the whole corpus and a
    complete clustering pass is run over it (time-consuming); if old_twharr
    is non-empty, it is assumed to already hold the result of a previous
    clustering, and each tweet-holder object in new_twharr is only sampled
    into one of the existing cluster objects.
    :param old_twharr: list, elements are TweetHolder
    :param new_twharr: list, elements are TweetHolder
    :param iter_num: number of iterations used by the clustering loop
    :return:
    """
    cludict = self.cludict
    """ recalculate the valid dictionary """
    valid_dict = IdFreqDict()
    # D: total number of documents (tweets) the sampler considers this round
    D = len(old_twharr) + len(new_twharr)
    for twh in old_twharr + new_twharr:
        valid_dict.merge_freq_from(twh.tokens, newest=False)
    # keep only words meeting the frequency condition (threshold 3)
    valid_dict.drop_words_by_condition(3)
    """ reallocate & parameter """
    # wipe every cluster's statistics; they are rebuilt from the validated tokens below
    for cluster in cludict.values():
        cluster.clear()
    for old_twh in old_twharr:
        # if old_twh.get_cluid() not in cludict:
        #     continue
        old_twh.validate(valid_dict)
        # detach then re-attach so the cleared cluster statistics are rebuilt
        old_cluster = old_twh.cluster
        old_twh.cluster = None
        old_twh.update_cluster(old_cluster)
    for new_twh in new_twharr:
        new_twh.validate(valid_dict)
        if old_twharr:
            # incremental mode: only existing clusters may be chosen
            new_cluid = self.sample(new_twh, D, using_max=True, no_new_clu=True)
        else:
            # corpus mode: start every tweet in the current max cluster
            new_cluid = self.max_cluid
        cluster = cludict[new_cluid]
        new_twh.update_cluster(cluster)
    # beta0: total Dirichlet prior mass over the current vocabulary
    self.beta0 = self.beta * valid_dict.vocabulary_size()
    """ start iteration """
    for i in range(iter_num):
        print(' {} th clustering, clu num: {}'.format(i + 1, len(cludict)))
        for twh in new_twharr:
            cluster = twh.cluster
            twh.update_cluster(None)
            if cluster.twnum == 0:
                # drop clusters emptied by detaching this tweet
                cludict.pop(cluster.cluid)
            # on the last iteration take the argmax instead of sampling
            cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
            if cluid not in cludict:
                # a brand-new cluster id was sampled; allocate its holder
                self.max_cluid = cluid
                cludict[cluid] = ClusterHolder(cluid)
            twh.update_cluster(cludict[cluid])
    for twh in new_twharr:
        twh.update_cluid_into_tw()
def get_tokens_multi(file_path):
    """Tokenize every file under file_path in parallel, merge the per-process
    frequency dictionaries, prune rare words and dump the final dictionary."""
    file_path = fi.add_sep_if_needed(file_path)
    children = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    full_paths = [file_path + name for name in children]
    # fan the file list out to 20 worker processes
    path_blocks = mu.split_multi_format(full_paths, process_num=20)
    results = mu.multi_process(get_tokens, [(block,) for block in path_blocks])
    merged_ifd, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in results:
        total_doc_num += doc_num
        merged_ifd.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', merged_ifd.vocabulary_size())
    merged_ifd.drop_words_by_condition(3)
    merged_ifd.dump_dict(getcfg().post_dict_file)
class ClusterHolder:
    """Lightweight cluster record: token-frequency statistics plus a tweet counter."""

    def __init__(self, cluid):
        self.cluid = cluid            # cluster identifier
        self.tokens = IdFreqDict()    # aggregated token frequencies of member tweets
        self.twnum = 0                # number of tweets currently assigned here

    def update_by_twh(self, twh, factor):
        """Fold one tweet holder's tokens into (factor > 0) or out of (otherwise) the cluster."""
        freq = twh.ifd
        if factor > 0:
            step, apply = 1, self.tokens.merge_freq_from
        else:
            step, apply = -1, self.tokens.drop_freq_from
        apply(freq, newest=False)
        self.twnum += step
def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
    """
    Run GSDPMM clustering / Gibbs sampling over the tweet holders.
    When old_twharr is non-empty it is assumed to carry a previous clustering
    result: cluster statistics are rebuilt from it and each new tweet is only
    sampled into an existing cluster; when it is empty, new_twharr is treated
    as the whole corpus and clustered from scratch.
    :param old_twharr: list of TweetHolder, previously clustered tweets
    :param new_twharr: list of TweetHolder, newly arrived tweets
    :param iter_num: int, number of sampling iterations
    :return:
    """
    cludict = self.cludict
    valid_dict = IdFreqDict()
    if len(old_twharr) > 0:
        # incremental mode: wipe cluster statistics, they are rebuilt below
        for cluster in cludict.values():
            cluster.clear()
    # D: total document count used by the sampler
    D = len(old_twharr) + len(new_twharr)
    """ recalculate the valid dictionary """
    for twh in old_twharr + new_twharr:
        valid_dict.merge_freq_from(twh.tokens, newest=False)
    # keep only words meeting the frequency condition (threshold 3)
    valid_dict.drop_words_by_condition(3)
    """ reallocate & parameter """
    for old_twh in old_twharr:
        # skip tweets whose previous cluster no longer exists
        if old_twh.get_cluid() not in cludict:
            continue
        old_twh.validate(valid_dict)
        # detach then re-attach so cluster statistics are rebuilt
        # from the re-validated tokens
        old_cluster = old_twh.cluster
        old_twh.cluster = None
        old_twh.update_cluster(old_cluster)
    for new_twh in new_twharr:
        new_twh.validate(valid_dict)
        if len(old_twharr) > 0:
            # incremental mode: only existing clusters may be chosen
            new_cluid = self.sample(new_twh, D, using_max=True, no_new_clu=True)
        else:
            new_cluid = self.max_cluid
        new_twh.update_cluster(cludict[new_cluid])
    # beta0: total Dirichlet prior mass over the current vocabulary
    self.beta0 = self.beta * valid_dict.vocabulary_size()
    """ start iteration """
    for i in range(iter_num):
        print(' {} th clustering, clu num: {}'.format(i, len(cludict)))
        for twh in new_twharr:
            cluster = twh.cluster
            twh.update_cluster(None)
            if cluster.twnum == 0:
                # drop clusters emptied by detaching this tweet
                cludict.pop(cluster.cluid)
            # on the last iteration take the argmax instead of sampling
            cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
            if cluid not in cludict:
                # a brand-new cluster id was sampled; allocate its holder
                self.max_cluid = cluid
                cludict[self.max_cluid] = ClusterHolder(self.max_cluid)
            twh.update_cluster(cludict[cluid])
    for twh in new_twharr:
        twh.update_cluid_into_tw()
class ClusterHolder:
    """A single cluster: its member tweet holders, token statistics and size."""

    def __init__(self, cluid):
        self.cluid = cluid            # cluster identifier
        self.twhdict = dict()         # tweet id -> TweetHolder
        self.tokens = IdFreqDict()    # aggregated valid-token frequencies
        self.twnum = 0                # member tweet count

    """ basic functions """

    def get_twharr(self):
        # member TweetHolder objects, in no particular order
        return [holder for holder in self.twhdict.values()]

    def get_twarr(self):
        # raw tweet objects of the members, in no particular order
        return [holder.tw for holder in self.twhdict.values()]

    def get_lbarr(self):
        # original event labels carried by the member tweets (when present)
        return [holder[tk.key_event_label] for holder in self.twhdict.values()]

    def clear(self):
        # reset every statistic this cluster keeps
        self.twhdict.clear()
        self.tokens.clear()
        self.twnum = 0

    def update_by_twh(self, twh, factor):
        # add (factor > 0) or remove (otherwise) one tweet holder,
        # keeping the token table and counter in sync
        valid = twh.valid_tokens
        key = twh.id
        if factor > 0:
            self.tokens.merge_freq_from(valid, newest=False)
            self.twhdict[key] = twh
            self.twnum += 1
        else:
            self.tokens.drop_freq_from(valid, newest=False)
            if key in self.twhdict:
                self.twhdict.pop(key)
            self.twnum -= 1

    """ extra functions """

    def get_rep_label(self, rep_thres):
        # the dominant label if it covers at least rep_thres of the tweets, else -1
        top_label, top_count = Counter(self.get_lbarr()).most_common(1)[0]
        return top_label if top_count >= self.twnum * rep_thres else -1
class ClusterHolder:
    """A cluster of tweet holders together with its aggregated token statistics."""

    def __init__(self, cluid):
        self.cluid = cluid            # cluster identifier
        self.twhdict = dict()         # tweet id -> TweetHolder
        self.tokens = IdFreqDict()    # aggregated valid-token frequencies
        self.twnum = 0                # number of tweets in the cluster

    """ basic functions """

    def get_twharr(self):
        """
        Return the tweet holder objects currently held by this cluster, order unspecified.
        :return: list, each element is a TweetHolder
        """
        return [holder for holder in self.twhdict.values()]

    def get_twarr(self):
        """
        Return the tweet objects currently held by this cluster, order unspecified.
        :return: list of tweets
        """
        return [holder.tw for holder in self.twhdict.values()]

    def get_lbarr(self):
        """
        Return the labels (where that information exists) carried by this
        cluster's tweets, order unspecified.
        :return: list of int, each the tweet's original label (which cluster it belonged to)
        """
        return [holder[tk.key_event_label] for holder in self.twhdict.values()]

    def clear(self):
        """
        Reset this cluster's statistics: token table, tweet table and tweet count.
        :return:
        """
        self.twhdict.clear()
        self.tokens.clear()
        self.twnum = 0

    def update_by_twh(self, twh, factor):
        """
        Move the given tweet holder into / out of this cluster and update the
        cluster's token table and counters from its valid_tokens.
        :param twh: TweetHolder, the tweet wrapper to move
        :param factor: int, positive to add, otherwise remove
        :return:
        """
        valid = twh.valid_tokens
        key = twh.id
        if factor > 0:
            self.tokens.merge_freq_from(valid, newest=False)
            self.twhdict[key] = twh
            self.twnum += 1
        else:
            self.tokens.drop_freq_from(valid, newest=False)
            if key in self.twhdict:
                self.twhdict.pop(key)
            self.twnum -= 1

    """ extra functions """

    def get_rep_label(self, rep_thres):
        """
        Check whether some label's share of this cluster's tweets exceeds the
        threshold rep_thres.
        :param rep_thres: float, coverage threshold
        :return: int, the representative label if its share is large enough, otherwise -1
        """
        top_label, top_count = Counter(self.get_lbarr()).most_common(1)[0]
        return top_label if top_count >= self.twnum * rep_thres else -1