class TweetHolder:
    """Pair a tweet object with its token counts and its current cluster.

    ``tw`` is stored as-is; it must support ``get``, ``setdefault``, ``in``
    and item access/assignment (presumably a plain dict -- confirm against
    callers). Token counts are kept in two ``IdFreqDict`` instances:
    ``tokens`` (everything produced by :meth:`tokenize`) and ``valid_tokens``
    (the subset rebuilt by :meth:`validate`).
    """

    def __init__(self, tw):
        """Store ``tw``, read its id, and tokenize its text immediately."""
        self.tw = tw
        self.id = tw.get(tk.key_id)
        self.cluster = None
        self.tokens = IdFreqDict()
        self.valid_tokens = IdFreqDict()
        self.tokenize()

    def __contains__(self, key):
        """Return whether ``key`` is present in the wrapped tweet."""
        return key in self.tw

    def __getitem__(self, key):
        """Return the tweet field ``key``, or ``None`` when it is absent."""
        return self.get(key)

    def __setitem__(self, key, value):
        """Set ``key`` only if it is not already present.

        NOTE: this delegates to ``setdefault``, so ``holder[key] = value``
        never overwrites an existing field.
        """
        self.setdefault(key, value)

    def get(self, key):
        """Return the tweet field ``key``, or ``None`` when it is absent."""
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        """Insert ``key`` with ``value`` into the tweet unless already set."""
        self.tw.setdefault(key, value)

    def get_cluid(self):
        """Return the ``cluid`` of the assigned cluster.

        Fails with ``AttributeError`` when no cluster has been assigned.
        """
        return self.cluster.cluid

    def update_cluid_into_tw(self):
        """Write the current cluster id (or ``None``) into the tweet."""
        cluid = None
        if self.cluster is not None:
            cluid = self.cluster.cluid
        self.tw[tk.key_event_cluid] = cluid

    def tokenize(self):
        """Count the usable words of the lower-cased tweet text.

        A token is kept when ``pu.is_valid_keyword`` accepts it and
        ``pu.is_stop_word`` rejects it; kept tokens are stripped of
        surrounding whitespace before being counted in ``self.tokens``.
        """
        lowered = self.tw[tk.key_text].lower()
        kept = []
        for candidate in pu.findall(pu.tokenize_pattern, lowered):
            if not pu.is_valid_keyword(candidate):
                continue
            if pu.is_stop_word(candidate):
                continue
            kept.append(candidate.strip())
        # Counting happens only after the whole text has been filtered.
        for word in kept:
            self.tokens.count_word(word)

    def validate(self, using_ifd):
        """Rebuild ``valid_tokens`` from the words known to ``using_ifd``.

        :param using_ifd: object exposing ``has_word(word)``; words of
            ``self.tokens`` it recognises are copied with their frequency.
        """
        self.valid_tokens.clear()
        for word, freq in self.tokens.word_freq_enumerate(newest=False):
            if not using_ifd.has_word(word):
                continue
            self.valid_tokens.count_word(word, freq)

    def update_cluster(self, cluster):
        """Move this holder from its current cluster into ``cluster``.

        The previous cluster (if any) is notified with ``factor=-1`` and the
        new one (if not ``None``) with ``factor=1``.

        :param cluster: target cluster, or ``None`` to only detach.
        """
        previous = self.cluster
        if previous is not None:
            previous.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is None:
            return
        cluster.update_by_twh(self, factor=1)
class TweetHolder:
    """Pair a tweet object with its token counts and its retweet set.

    ``id`` and ``retwid`` are plain attributes read once at construction.
    ``retwset`` stays ``None`` until :meth:`into_retwset` is called.
    """

    def __init__(self, tw):
        """Store ``tw``, read its id / reply target, and tokenize it."""
        self.tw = tw
        self.id = tw.get(tk.key_id)
        self.retwid = tu.in_reply_to(tw)
        self.tokens = None
        self.retwset = None
        self.tokenize()

    def __getitem__(self, key):
        """Return the tweet field ``key``, or ``None`` when it is absent."""
        return self.get(key)

    def __setitem__(self, key, value):
        """Set ``key`` only if it is not already present.

        NOTE: this delegates to ``setdefault``, so ``holder[key] = value``
        never overwrites an existing field.
        """
        self.setdefault(key, value)

    def get(self, key):
        """Return the tweet field ``key``, or ``None`` when it is absent."""
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        """Insert ``key`` with ``value`` into the tweet unless already set."""
        self.tw.setdefault(key, value)

    def get_cluid(self):
        """Return ``retwset.get_cluidarr()`` for the assigned retweet set.

        :raises ValueError: if no retweet set has been assigned yet.
        """
        retwset = self.retwset
        if retwset is not None:
            return retwset.get_cluidarr()
        raise ValueError(
            '_retwset in twh should not be None when getting cluid')

    def tokenize(self):
        """Count the tweet's words into a fresh ``self.tokens``.

        Each entry of ``tw[tk.key_spacy]`` contributes its ``text``,
        lower-cased with ``#`` and surrounding whitespace stripped, provided
        ``ClusterService.is_valid_keyword`` accepts it and the dictionary
        returned by ``token_dict()`` contains it.
        """
        counter = IdFreqDict()
        self.tokens = counter
        for spacy_token in self.tw[tk.key_spacy]:
            word = spacy_token.text.lower().strip('#').strip()
            if not ClusterService.is_valid_keyword(word):
                continue
            if not token_dict().has_word(word):
                continue
            counter.count_word(word)

    def into_retwset(self, retwset):
        """Adopt ``retwset`` and ask it to move this holder into a cluster."""
        self.retwset = retwset
        retwset.move_twh_into_cluster(self)

    def abandon(self):
        """Ask the current retweet set to remove this holder from its cluster."""
        self.retwset.remove_twh_from_cluster(self)
def get_tokens(file_list):
    """Count word occurrences over the text of every tweet in ``file_list``.

    Each file is loaded with ``fu.load_array``; every tweet's lower-cased
    ``tk.key_text`` is scanned for runs of three or more letters, ``_``,
    ``#`` or ``-``. Runs of 16+ characters are passed through ``pu.segment``
    (presumably splitting concatenated words -- confirm). A resulting word is
    counted when it is not a stop word, ``pu.has_azAZ`` accepts it, and it is
    at least three characters long.

    After all files are processed ``drop_words_by_condition(2)`` is applied
    (presumably pruning rare words -- confirm) and the vocabulary size is
    printed.

    :param file_list: iterable of paths accepted by ``fu.load_array``.
    :return: ``(IdFreqDict, total number of loaded tweets)``.
    """
    word_pattern = re.compile(r'[a-zA-Z_#\-]{3,}')
    vocab = IdFreqDict()
    doc_total = 0
    for path in file_list:
        twarr = fu.load_array(path)
        doc_total += len(twarr)
        for tw in twarr:
            candidates = []
            for raw in word_pattern.findall(tw[tk.key_text].lower()):
                if len(raw) < 16:
                    candidates.append(raw)
                else:
                    # Very long runs are handed to the segmenter.
                    candidates.extend(pu.segment(raw))
            for word in candidates:
                if pu.is_stop_word(word):
                    continue
                if not pu.has_azAZ(word):
                    continue
                # Segmentation may yield pieces shorter than the regex minimum.
                if len(word) < 3:
                    continue
                vocab.count_word(word)
    vocab.drop_words_by_condition(2)
    print(vocab.vocabulary_size())
    return vocab, doc_total
class TweetHolder:
    """Hold one document's text, topic and token-id counts plus its cluster."""

    def __init__(self, doc):
        """Copy ``text``, ``topic`` and ``tokenids`` from ``doc`` and count ids.

        :param doc: object exposing ``text``, ``topic`` and an iterable
            ``tokenids``; every token id is counted into ``self.ifd``.
        """
        self.cluster = None
        self.text = doc.text
        self.topic = doc.topic
        self.tokenids = doc.tokenids
        self.ifd = IdFreqDict()
        for token_id in self.tokenids:
            self.ifd.count_word(token_id)

    def get_cluid(self):
        """Return the ``cluid`` of the assigned cluster.

        Fails with ``AttributeError`` when no cluster has been assigned.
        """
        return self.cluster.cluid

    def update_cluster(self, cluster):
        """Move this holder from its current cluster into ``cluster``.

        The previous cluster (if any) is notified with ``factor=-1`` and the
        new one (if not ``None``) with ``factor=1``.

        :param cluster: target cluster, or ``None`` to only detach.
        """
        previous = self.cluster
        if previous is not None:
            previous.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is None:
            return
        cluster.update_by_twh(self, factor=1)
class TweetHolder:
    """Pair a tweet object with its token counts and its current cluster.

    ``tw`` is the tweet object (a dict); it is stored as-is. Token counts
    are kept in two ``IdFreqDict`` instances: ``tokens`` (everything produced
    by :meth:`tokenize`) and ``valid_tokens`` (the subset rebuilt by
    :meth:`validate`).
    """

    def __init__(self, tw):
        """Store ``tw``, read its id, and tokenize its text immediately."""
        self.tw = tw
        self.id = tw.get(tk.key_id)
        self.cluster = None
        self.tokens = IdFreqDict()
        self.valid_tokens = IdFreqDict()
        self.tokenize()

    def __contains__(self, key):
        """Return whether ``key`` is present in the wrapped tweet."""
        return key in self.tw

    def __getitem__(self, key):
        """Return the tweet field ``key``, or ``None`` when it is absent."""
        return self.get(key)

    def __setitem__(self, key, value):
        """Set ``key`` only if it is not already present.

        NOTE: this delegates to ``setdefault``, so ``holder[key] = value``
        never overwrites an existing field.
        """
        self.setdefault(key, value)

    def get(self, key):
        """Return the tweet field ``key``, or ``None`` when it is absent."""
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        """Insert ``key`` with ``value`` into the tweet unless already set."""
        self.tw.setdefault(key, value)

    def get_cluid(self):
        """
        Return the ID of the cluster this tweet is currently assigned to.
        :return: int, the cluster's ID number
        """
        return self.cluster.cluid

    def update_cluid_into_tw(self):
        """
        Set the tk.key_event_cluid field of the tweet object (self.tw, a dict)
        to the ID of the currently assigned cluster, or to None when no
        cluster has been assigned yet.
        :return:
        """
        cluid = None
        if self.cluster is not None:
            cluid = self.cluster.cluid
        self.tw[tk.key_event_cluid] = cluid

    def tokenize(self):
        """
        Tokenize the tweet's lower-cased text with pu.valid_tokenize and
        record the result by counting every token in self.tokens.
        :return:
        """
        lowered = self.tw[tk.key_text].lower()
        for word in pu.valid_tokenize(lowered):
            self.tokens.count_word(word)

    def validate(self, using_ifd):
        """
        Refresh the valid-token table: every word of self.tokens that also
        exists in using_ifd is re-counted into self.valid_tokens.
        :param using_ifd: utils.id_freq_dict.IdFreqDict holding the tokens
            that are legal in the current iteration
        :return:
        """
        self.valid_tokens.clear()
        for word, freq in self.tokens.word_freq_enumerate(newest=False):
            if not using_ifd.has_word(word):
                continue
            self.valid_tokens.count_word(word, freq)

    def update_cluster(self, cluster):
        """
        If the tweet already belongs to a cluster, detach it from that
        self.cluster; then merge it into cluster (when not None) and update
        self.cluster.
        :param cluster: the target cluster object
        :return:
        """
        previous = self.cluster
        if previous is not None:
            previous.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is None:
            return
        cluster.update_by_twh(self, factor=1)