Example #1
class TweetHolder:
    # using_ifd = token_dict()

    def __init__(self, tw):
        self.tw = tw
        self.id = tw.get(tk.key_id)
        self.cluster = None
        self.tokens = IdFreqDict()
        self.valid_tokens = IdFreqDict()
        self.tokenize()

    def __contains__(self, key):
        return key in self.tw

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.setdefault(key, value)

    def get(self, key):
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        self.tw.setdefault(key, value)

    def get_cluid(self):
        return self.cluster.cluid

    def update_cluid_into_tw(self):
        self.tw[tk.key_event_cluid] = self.cluster.cluid if self.cluster is not None else None

    def tokenize(self):
        # tokens = (t.text.lower() for t in self.tw[tk.key_spacy])
        tokens = pu.findall(pu.tokenize_pattern, self.tw[tk.key_text].lower())
        tokens = [
            t.strip() for t in tokens
            if pu.is_valid_keyword(t) and not pu.is_stop_word(t)
        ]
        for token in tokens:
            self.tokens.count_word(token)

    def validate(self, using_ifd):
        self.valid_tokens.clear()
        for word, freq in self.tokens.word_freq_enumerate(newest=False):
            if using_ifd.has_word(word):
                self.valid_tokens.count_word(word, freq)

    def update_cluster(self, cluster):
        if self.cluster is not None:
            self.cluster.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is not None:
            cluster.update_by_twh(self, factor=1)
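
IdFreqDict itself is not included in these examples. Below is a minimal sketch of the counter interface the holder above relies on (count_word, has_word, clear, word_freq_enumerate, plus drop_words_by_condition and vocabulary_size used in Example #3), assuming a plain dict-backed word counter; the real utils.id_freq_dict.IdFreqDict may differ.

class IdFreqDictSketch:
    """Hypothetical stand-in for IdFreqDict: a word -> frequency counter."""

    def __init__(self):
        self._freq = {}

    def count_word(self, word, freq=1):
        # Add freq occurrences of word (one by default).
        self._freq[word] = self._freq.get(word, 0) + freq

    def has_word(self, word):
        return word in self._freq

    def clear(self):
        self._freq.clear()

    def word_freq_enumerate(self, newest=False):
        # Return (word, frequency) pairs; the newest flag is ignored in this sketch.
        return list(self._freq.items())

    def drop_words_by_condition(self, min_freq):
        # Drop words that occur fewer than min_freq times (interpretation assumed).
        self._freq = {w: f for w, f in self._freq.items() if f >= min_freq}

    def vocabulary_size(self):
        return len(self._freq)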
Example #2
class TweetHolder:
    def __init__(self, tw):
        self.tw = tw
        self.id, self.retwid = tw.get(tk.key_id), tu.in_reply_to(tw)
        self.tokens = None
        self.retwset = None
        self.tokenize()

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.setdefault(key, value)

    def get(self, key):
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        self.tw.setdefault(key, value)

    # def get_id(self): return self.id
    #
    # def get_retwid(self): return self.retwid

    def get_cluid(self):
        if self.retwset is None:
            raise ValueError(
                'retwset in twh should not be None when getting cluid')
        return self.retwset.get_cluidarr()

    def tokenize(self):
        self.tokens = IdFreqDict()
        for token in self.tw[tk.key_spacy]:
            word = token.text.lower().strip('#').strip()
            if ClusterService.is_valid_keyword(word) and token_dict().has_word(word):
                self.tokens.count_word(word)

    def into_retwset(self, retwset):
        # if retwset is not None and retwset.can_join_twh(self):
        # if self._retwset is not None:
        #     self._retwset.update_by_twh(self, factor=-1)
        self.retwset = retwset
        self.retwset.move_twh_into_cluster(self)

    def abandon(self):
        self.retwset.remove_twh_from_cluster(self)
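
The retwset object used in this variant is likewise not shown. A minimal sketch of the three calls the holder makes on it (move_twh_into_cluster, remove_twh_from_cluster, get_cluidarr); everything beyond those names is an assumption.

class RetwSetSketch:
    """Hypothetical reply/retweet set that tracks its member tweet holders and cluster ids."""

    def __init__(self):
        self.twhs = []
        self.cluidarr = []

    def move_twh_into_cluster(self, twh):
        # Register the holder as a member of this set.
        if twh not in self.twhs:
            self.twhs.append(twh)

    def remove_twh_from_cluster(self, twh):
        # Detach the holder from this set.
        if twh in self.twhs:
            self.twhs.remove(twh)

    def get_cluidarr(self):
        # Ids of the clusters this set has been assigned to.
        return self.cluidarr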
Example #3
def get_tokens(file_list):
    """Build a corpus-level IdFreqDict from the tweet files in file_list and return it with the document count."""
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for file in file_list:
        twarr = fu.load_array(file)
        total_doc_num += len(twarr)
        for tw in twarr:
            tokens = re.findall(r'[a-zA-Z_#\-]{3,}', tw[tk.key_text].lower())
            real_tokens = list()
            for token in tokens:
                if len(token) >= 16:
                    real_tokens.extend(pu.segment(token))
                else:
                    real_tokens.append(token)
            for token in real_tokens:
                if not pu.is_stop_word(token) and pu.has_azAZ(token) and len(token) >= 3:
                    id_freq_dict.count_word(token)
    id_freq_dict.drop_words_by_condition(2)
    print(id_freq_dict.vocabulary_size())
    return id_freq_dict, total_doc_num
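
A hypothetical call of get_tokens; the file names are placeholders, and fu.load_array is assumed to return a list of tweet dicts that carry a tk.key_text field.

files = ['tweets_day1.json', 'tweets_day2.json']   # placeholder paths
ifd, n_docs = get_tokens(files)
print('documents:', n_docs, 'vocabulary:', ifd.vocabulary_size())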
Example #4
class TweetHolder:
    def __init__(self, doc):
        self.cluster = None
        self.text = doc.text
        self.topic = doc.topic
        self.tokenids = doc.tokenids
        self.ifd = IdFreqDict()
        for t in self.tokenids:
            self.ifd.count_word(t)

    def get_cluid(self):
        return self.cluster.cluid

    def update_cluster(self, cluster):
        if self.cluster is not None:
            self.cluster.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is not None:
            cluster.update_by_twh(self, factor=1)
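
The cluster objects these holders attach to are not part of the examples. A minimal sketch of the update_by_twh contract they assume, where the cluster aggregates the token counts of its members (everything beyond update_by_twh and cluid is an assumption; Example #4's holder exposes its counts as self.ifd rather than self.tokens).

class ClusterSketch:
    """Hypothetical cluster: aggregates the token counts of its member tweet holders."""

    def __init__(self, cluid):
        self.cluid = cluid
        self.twnum = 0
        self.tokens = IdFreqDict()

    def update_by_twh(self, twh, factor=1):
        # factor=1 adds the holder's tokens to the cluster, factor=-1 removes them.
        self.twnum += factor
        for word, freq in twh.tokens.word_freq_enumerate(newest=False):
            self.tokens.count_word(word, factor * freq)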
Example #5
class TweetHolder:
    # using_ifd = token_dict()

    def __init__(self, tw):
        self.tw = tw
        self.id = tw.get(tk.key_id)
        self.cluster = None
        self.tokens = IdFreqDict()
        self.valid_tokens = IdFreqDict()
        self.tokenize()

    def __contains__(self, key):
        return key in self.tw

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.setdefault(key, value)

    def get(self, key):
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        self.tw.setdefault(key, value)

    def get_cluid(self):
        """
        返回当前已被分配的聚类的ID
        :return: int,聚类的ID编号
        """
        return self.cluster.cluid

    def update_cluid_into_tw(self):
        """
        更新推特对象(self.tw,dict类型)的 tk.key_event_cluid 字段为当前已被分配的聚类的ID,
        若尚未被分配聚类则置为None
        :return:
        """
        self.tw[
            tk.
            key_event_cluid] = self.cluster.cluid if self.cluster is not None else None

    def tokenize(self):
        """
        将推特对象的text进行分词并保存分词结果,使用 self.tokens 进行分词计数
        :return:
        """
        # tokens = (t.text.lower() for t in self.tw[tk.key_spacy])
        tokens = pu.valid_tokenize(self.tw[tk.key_text].lower())
        for token in tokens:
            self.tokens.count_word(token)

    def validate(self, using_ifd):
        """
        更新分词表,将 self.tokens 中存在于 using_ifd 的单词重新计数到 self.valid_tokens 中
        :param using_ifd: utils.id_freq_dict.IdFreqDict,包含当前迭代中的合法分词
        :return:
        """
        self.valid_tokens.clear()
        for word, freq in self.tokens.word_freq_enumerate(newest=False):
            if using_ifd.has_word(word):
                self.valid_tokens.count_word(word, freq)

    def update_cluster(self, cluster):
        """
        若原本有聚类,则将当前推特从原本的 self.cluster 中分离;
        并将当前推特合并至 cluster 中(若不为None),更新 self.cluster
        :param cluster: 目标聚类对象
        :return:
        """
        if self.cluster is not None:
            self.cluster.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is not None:
            cluster.update_by_twh(self, factor=1)
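
Putting the documented class to use, a hypothetical assign-and-write-back flow; tw_dict, using_ifd and some_cluster are assumed to come from the surrounding clustering pipeline.

twh = TweetHolder(tw_dict)          # tokenizes tw_dict[tk.key_text] on construction
twh.validate(using_ifd)             # keep only tokens known to the current iteration's IdFreqDict
twh.update_cluster(some_cluster)    # detach from any previous cluster, attach to the new one
twh.update_cluid_into_tw()          # write the assigned cluster id back into the tweet dict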