Example #1
class TweetHolder:
    def __init__(self, tw):
        self.tw = tw
        self.id, self.retwid = tw.get(tk.key_id), tu.in_reply_to(tw)
        self.tokens = None
        self.retwset = None
        self.tokenize()

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.setdefault(key, value)

    def get(self, key):
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        self.tw.setdefault(key, value)

    # def get_id(self): return self.id
    #
    # def get_retwid(self): return self.retwid

    def get_cluid(self):
        if self.retwset is None:
            raise ValueError('retwset in twh should not be None when getting cluid')
        return self.retwset.get_cluidarr()

    def tokenize(self):
        self.tokens = IdFreqDict()
        for token in self.tw[tk.key_spacy]:
            word = token.text.lower().strip('#').strip()
            if ClusterService.is_valid_keyword(word) and token_dict().has_word(word):
                self.tokens.count_word(word)

    def into_retwset(self, retwset):
        # if retwset is not None and retwset.can_join_twh(self):
        # if self._retwset is not None:
        #     self._retwset.update_by_twh(self, factor=-1)
        self.retwset = retwset
        self.retwset.move_twh_into_cluster(self)

    def abandon(self):
        self.retwset.remove_twh_from_cluster(self)
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
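
Note: mu.split_multi_format and mu.multi_process above are project-specific helpers. As a rough, self-contained sketch of the same fan-out/merge pattern, here is what the flow might look like with only the standard library, using collections.Counter in place of IdFreqDict and a hypothetical count_tokens worker in place of get_tokens:

from collections import Counter
from multiprocessing import Pool


def count_tokens(file_list):
    # Worker sketch: count whitespace tokens per file block, return (Counter, doc_num).
    counter, doc_num = Counter(), 0
    for path in file_list:
        with open(path, encoding='utf8') as fin:
            for line in fin:
                counter.update(line.lower().split())
                doc_num += 1
    return counter, doc_num


def count_tokens_multi(file_paths, process_num=20):
    # Split the file list into one block per process, run the workers, merge the results.
    blocks = [file_paths[i::process_num] for i in range(process_num)]
    with Pool(process_num) as pool:
        results = pool.map(count_tokens, [b for b in blocks if b])
    total, total_doc_num = Counter(), 0
    for counter, doc_num in results:
        total_doc_num += doc_num
        total.update(counter)  # analogous to IdFreqDict.merge_freq_from
    return total, total_doc_num
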
Example #3
 def preprocess_twarr(self, twarr):
     """pre-process the tweet text, including dropping non-common terms"""
     key_tokens = tk.key_wordlabels
     self.twarr = twarr
     for word_dict in self.word_dicts:
         word_dict.clear()
     # self.prop_n_dict, self.comm_n_dict, self.verb_dict, self.ht_dict = \
     #     IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict()
     pos_tag2dict_map = dict([(tag, self.prop_n_dict)
                              for tag in self.prop_n_tags] +
                             [(tag, self.comm_n_dict)
                              for tag in self.comm_n_tags] +
                             [(tag, self.verb_dict)
                              for tag in self.verb_tags] +
                             [(tag, self.ht_dict) for tag in self.ht_rags])
     for tw in twarr:
         tokens = tw[key_tokens]
         for i in range(len(tokens) - 1, -1, -1):
             tokens[i][0] = tokens[i][0].lower().strip()
             word, _, pos_tag = tokens[i]
             if not cs.is_valid_keyword(word):
                 del tokens[i]
                 continue  # after deletion, index i refers to a different token
             if word.startswith('#') and not pos_tag.lower() == 'ht':
                 pos_tag = tokens[i][2] = 'HT'
             if pos_tag in pos_tag2dict_map:
                 pos_tag2dict_map[pos_tag].count_word(word)
     self.prop_n_dict.drop_words_by_condition(3)
     self.comm_n_dict.drop_words_by_condition(4)
     self.verb_dict.drop_words_by_condition(4)
     self.ht_dict.drop_words_by_condition(3)
     for tw in twarr:
         tw[self.key_prop_n], tw[self.key_comm_n], tw[self.key_verb], tw[self.key_ht] = \
             IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict()
         tw_pos_tag2dict_map = dict([(tag, tw[self.key_prop_n])
                                     for tag in self.prop_n_tags] +
                                    [(tag, tw[self.key_comm_n])
                                     for tag in self.comm_n_tags] +
                                    [(tag, tw[self.key_verb])
                                     for tag in self.verb_tags] +
                                    [(tag, tw[self.key_ht])
                                     for tag in self.ht_rags])
         for token in tw[key_tokens]:
             word, _, pos_tag = token
             if pos_tag in tw_pos_tag2dict_map and pos_tag2dict_map[
                     pos_tag].has_word(word):
                 tw_pos_tag2dict_map[pos_tag].count_word(word)
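
In the method above, pos_tag2dict_map routes every token into the IdFreqDict matching its POS tag. A stripped-down illustration of that routing with plain Counters (the tag sets below are illustrative stand-ins, not the class's actual prop_n_tags, comm_n_tags, verb_tags and ht_rags):

from collections import Counter

prop_n_tags, comm_n_tags, verb_tags, ht_tags = {'NNP', 'NNPS'}, {'NN', 'NNS'}, {'VB', 'VBD'}, {'HT'}
prop_n_dict, comm_n_dict, verb_dict, ht_dict = Counter(), Counter(), Counter(), Counter()
tag2dict = {**{t: prop_n_dict for t in prop_n_tags},
            **{t: comm_n_dict for t in comm_n_tags},
            **{t: verb_dict for t in verb_tags},
            **{t: ht_dict for t in ht_tags}}

for word, _, tag in [['paris', 'O', 'NNP'], ['#quake', 'O', 'HT'], ['hits', 'O', 'VBD']]:
    if tag in tag2dict:
        tag2dict[tag][word] += 1  # count the word in the dictionary for its tag
print(prop_n_dict, ht_dict, verb_dict)
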
def get_semantic_tokens_multi(file_path):
    pos_type_info = {
        ark.prop_label: {
            K_IFD: IdFreqDict(),
            K_FILE: getcfg().pre_prop_dict_file
        },
        ark.comm_label: {
            K_IFD: IdFreqDict(),
            K_FILE: getcfg().pre_comm_dict_file
        },
        ark.verb_label: {
            K_IFD: IdFreqDict(),
            K_FILE: getcfg().pre_verb_dict_file
        },
        ark.hstg_label: {
            K_IFD: IdFreqDict(),
            K_FILE: getcfg().pre_hstg_dict_file
        },
    }
    total_doc_num = 0
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 40)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile)
                                             for subfile in subfiles],
                                            process_num=20)
    res_list = mu.multi_process(get_semantic_tokens,
                                [(file_list, )
                                 for file_list in file_list_block])
    for res_type_info, doc_num in res_list:
        total_doc_num += doc_num
        for label in res_type_info.keys():
            pos_type_info[label][K_IFD].merge_freq_from(
                res_type_info[label][K_IFD])
    print('total_doc_num', total_doc_num)
    for label in pos_type_info.keys():
        ifd, file_name = pos_type_info[label][K_IFD], pos_type_info[label][K_FILE]
        ifd.drop_words_by_condition(3)
        if label != ark.hstg_label:
            ifd.drop_words_by_condition(lambda word, _: word.startswith('#'))
        ifd.dump_dict(file_name)
        print('{}; vocab size:{}'.format(file_name, ifd.vocabulary_size()))
Example #5
    class TweetHolder:
        def __init__(self, doc):
            self.cluster = None
            self.text = doc.text
            self.topic = doc.topic
            self.tokenids = doc.tokenids
            self.ifd = IdFreqDict()
            for t in self.tokenids:
                self.ifd.count_word(t)

        def get_cluid(self):
            return self.cluster.cluid

        def update_cluster(self, cluster):
            if self.cluster is not None:
                self.cluster.update_by_twh(self, factor=-1)
            self.cluster = cluster
            if cluster is not None:
                cluster.update_by_twh(self, factor=1)
Example #6
class IfdGetter:
    K_IFD_FILE = 'ifd_file'

    def __init__(self, ifd_file=None):
        self.ifd_file = ifd_file
        self.ifd = None

    def __call__(self, *args, **kwargs):
        if IfdGetter.K_IFD_FILE in kwargs:
            self.ifd_file = kwargs.get(IfdGetter.K_IFD_FILE)
        if self.ifd_file is None:
            raise ValueError('An id freq dict should be specified.')
        if self.ifd is None:
            self.ifd = IdFreqDict()
            self.ifd.load_dict(self.ifd_file)
        return self.ifd

    def reload(self, ifd_file):
        if self.ifd is not None:
            self.ifd.load_dict(ifd_file)
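
IfdGetter is a lazy-loading callable: the dictionary file is read on the first call and the IdFreqDict instance is cached for later calls. A brief usage sketch (the file path is a placeholder, not a real config value):

token_dict = IfdGetter('/path/to/post_dict.csv')  # placeholder path
ifd = token_dict()       # first call: loads the IdFreqDict from disk
same_ifd = token_dict()  # later calls: return the cached instance
assert ifd is same_ifd
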
def get_semantic_tokens(file_list):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict()},
        ark.comm_label: {K_IFD: IdFreqDict()},
        ark.verb_label: {K_IFD: IdFreqDict()},
        ark.hstg_label: {K_IFD: IdFreqDict()},
    }
    total_doc_num = 0
    for file in file_list:
        twarr = ark.twarr_ark(fu.load_array(file))
        total_doc_num += len(twarr)
        pos_tokens = au.merge_array([tw[tk.key_ark] for tw in twarr])
        for pos_token in pos_tokens:
            word = pos_token[0].strip().lower()
            if len(word) <= 2 or not pu.is_valid_keyword(word):
                continue
            real_label = ark.pos_token2semantic_label(pos_token)
            if real_label:
                pos_type_info[real_label][K_IFD].count_word(word)
    return pos_type_info, total_doc_num
Example #8
 def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
     """
     实际执行聚类以及采样,若old_twharr为空,则认为new_twharr是corpus,
     并在其上进行完整的聚类过程,耗时较多;
     若old_twharr不为空,则认为old_twharr已经持有了之前聚类的结果信息,
     并对new_twharr中的每条推特包装对象采样已有的聚类对象
     :param old_twharr: list,元素类型为 TweetHolder
     :param new_twharr: list,元素类型为 TweetHolder
     :param iter_num: 聚类循环所用的迭代次数
     :return:
     """
     cludict = self.cludict
     """ recalculate the valid dictionary """
     valid_dict = IdFreqDict()
     D = len(old_twharr) + len(new_twharr)
     for twh in old_twharr + new_twharr:
         valid_dict.merge_freq_from(twh.tokens, newest=False)
     valid_dict.drop_words_by_condition(3)
     """ reallocate & parameter """
     for cluster in cludict.values():
         cluster.clear()
     for old_twh in old_twharr:
         # if old_twh.get_cluid() not in cludict:
         #     continue
         old_twh.validate(valid_dict)
         old_cluster = old_twh.cluster
         old_twh.cluster = None
         old_twh.update_cluster(old_cluster)
     for new_twh in new_twharr:
         new_twh.validate(valid_dict)
         if old_twharr:
             new_cluid = self.sample(new_twh,
                                     D,
                                     using_max=True,
                                     no_new_clu=True)
         else:
             new_cluid = self.max_cluid
         cluster = cludict[new_cluid]
         new_twh.update_cluster(cluster)
     self.beta0 = self.beta * valid_dict.vocabulary_size()
     """ start iteration """
     for i in range(iter_num):
         print('  {} th clustering, clu num: {}'.format(
             i + 1, len(cludict)))
         for twh in new_twharr:
             cluster = twh.cluster
             twh.update_cluster(None)
             if cluster.twnum == 0:
                 cludict.pop(cluster.cluid)
             cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
             if cluid not in cludict:
                 self.max_cluid = cluid
                 cludict[cluid] = ClusterHolder(cluid)
             twh.update_cluster(cludict[cluid])
     for twh in new_twharr:
         twh.update_cluid_into_tw()
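
self.sample is not shown in this example. In GSDPMM/GSDMM-style models that step usually draws a cluster from the Dirichlet-process posterior predictive (Yin & Wang's collapsed Gibbs move). The following self-contained sketch over plain Counters is an assumption about what such a sampler computes, not the author's actual implementation:

import random
from collections import Counter


def gsdpmm_sample(doc_tokens, clusters, D, alpha, beta, V, new_cluid, using_max=False):
    # doc_tokens: Counter of the tweet's valid tokens.
    # clusters: dict cluid -> {'tokens': Counter, 'twnum': int}; V: vocabulary size.
    beta0 = beta * V
    cluids, weights = [], []
    for cluid, clu in clusters.items():
        w = clu['twnum'] / (D - 1 + alpha * D)
        n_clu = sum(clu['tokens'].values())
        i = 0
        for word, freq in doc_tokens.items():
            for j in range(freq):
                w *= (clu['tokens'][word] + beta + j) / (n_clu + beta0 + i)
                i += 1
        cluids.append(cluid)
        weights.append(w)
    # weight of opening a brand-new cluster
    w = alpha * D / (D - 1 + alpha * D)
    i = 0
    for word, freq in doc_tokens.items():
        for j in range(freq):
            w *= (beta + j) / (beta0 + i)
            i += 1
    cluids.append(new_cluid)
    weights.append(w)
    if using_max:
        return cluids[max(range(len(weights)), key=weights.__getitem__)]
    return random.choices(cluids, weights=weights, k=1)[0]
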
Example #9
class TweetHolder:
    # using_ifd = token_dict()

    def __init__(self, tw):
        self.tw = tw
        self.id = tw.get(tk.key_id)
        self.cluster = None
        self.tokens = IdFreqDict()
        self.valid_tokens = IdFreqDict()
        self.tokenize()

    def __contains__(self, key):
        return key in self.tw

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.setdefault(key, value)

    def get(self, key):
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        self.tw.setdefault(key, value)

    def get_cluid(self):
        return self.cluster.cluid

    def update_cluid_into_tw(self):
        cluid = self.cluster.cluid if self.cluster is not None else None
        self.tw[tk.key_event_cluid] = cluid

    def tokenize(self):
        # tokens = (t.text.lower() for t in self.tw[tk.key_spacy])
        tokens = pu.findall(pu.tokenize_pattern, self.tw[tk.key_text].lower())
        tokens = [
            t.strip() for t in tokens
            if pu.is_valid_keyword(t) and not pu.is_stop_word(t)
        ]
        for token in tokens:
            self.tokens.count_word(token)

    def validate(self, using_ifd):
        self.valid_tokens.clear()
        for word, freq in self.tokens.word_freq_enumerate(newest=False):
            if using_ifd.has_word(word):
                self.valid_tokens.count_word(word, freq)

    def update_cluster(self, cluster):
        if self.cluster is not None:
            self.cluster.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is not None:
            cluster.update_by_twh(self, factor=1)
Example #10
def get_ifd_from_docarr(docarr):
    """ assume that docarr has been tokenized """
    ifd = IdFreqDict()
    for doc in docarr:
        ifd.count_words(doc.tokens)
    ifd.reset_id()
    return ifd
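
get_ifd_from_docarr only relies on each element having a tokens attribute; a minimal hedged usage with a stand-in document type (hypothetical, not the project's actual document class):

from collections import namedtuple

Doc = namedtuple('Doc', 'tokens')  # stand-in: only .tokens is needed
docarr = [Doc(['quake', 'city']), Doc(['quake', 'rescue'])]
ifd = get_ifd_from_docarr(docarr)
print(ifd.vocabulary_size())  # 3 distinct words: quake, city, rescue
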
Example #11
 def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
     cludict = self.cludict
     valid_dict = IdFreqDict()
     if len(old_twharr) > 0:
         for cluster in cludict.values():
             cluster.clear()
     D = len(old_twharr) + len(new_twharr)
     """ recalculate the valid dictionary """
     for twh in old_twharr + new_twharr:
         valid_dict.merge_freq_from(twh.tokens, newest=False)
     valid_dict.drop_words_by_condition(3)
     """ reallocate & parameter """
     for old_twh in old_twharr:
         if old_twh.get_cluid() not in cludict:
             continue
         old_twh.validate(valid_dict)
         old_cluster = old_twh.cluster
         old_twh.cluster = None
         old_twh.update_cluster(old_cluster)
     for new_twh in new_twharr:
         new_twh.validate(valid_dict)
         if len(old_twharr) > 0:
             new_cluid = self.sample(new_twh,
                                     D,
                                     using_max=True,
                                     no_new_clu=True)
         else:
             new_cluid = self.max_cluid
         new_twh.update_cluster(cludict[new_cluid])
     self.beta0 = self.beta * valid_dict.vocabulary_size()
     """ start iteration """
     for i in range(iter_num):
         print('  {} th clustering, clu num: {}'.format(i, len(cludict)))
         for twh in new_twharr:
             cluster = twh.cluster
             twh.update_cluster(None)
             if cluster.twnum == 0:
                 cludict.pop(cluster.cluid)
             cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
             if cluid not in cludict:
                 self.max_cluid = cluid
                 cludict[self.max_cluid] = ClusterHolder(self.max_cluid)
             twh.update_cluster(cludict[cluid])
     for twh in new_twharr:
         twh.update_cluid_into_tw()
Example #12
def get_tokens(file_list):
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for file in file_list:
        twarr = fu.load_array(file)
        total_doc_num += len(twarr)
        for tw in twarr:
            tokens = re.findall(r'[a-zA-Z_#\-]{3,}', tw[tk.key_text].lower())
            real_tokens = list()
            for token in tokens:
                if len(token) >= 16:
                    real_tokens.extend(pu.segment(token))
                else:
                    real_tokens.append(token)
            for token in real_tokens:
                if (not pu.is_stop_word(token)) and pu.has_azAZ(token) and 3 <= len(token):
                    id_freq_dict.count_word(token)
    id_freq_dict.drop_words_by_condition(2)
    print(id_freq_dict.vocabulary_size())
    return id_freq_dict, total_doc_num
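
The regex keeps runs of letters, '#', '_' and '-' of length three or more, and tokens of 16+ characters (typically glued-together hashtags) are handed to pu.segment for splitting. The regex part can be checked in isolation with the standard re module (pu.segment is project-specific and not reproduced here):

import re

text = 'Breaking: #earthquake near the city-centre, please RT!!'
print(re.findall(r'[a-zA-Z_#\-]{3,}', text.lower()))
# ['breaking', '#earthquake', 'near', 'the', 'city-centre', 'please']
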
Example #13
class ClusterHolder:
    def __init__(self, cluid):
        self.cluid = cluid
        self.twhdict = dict()
        self.tokens = IdFreqDict()
        self.twnum = 0

    """ basic functions """

    def get_twharr(self):
        return list(self.twhdict.values())

    def get_twarr(self):
        return [twh.tw for twh in self.twhdict.values()]

    def get_lbarr(self):
        return [twh[tk.key_event_label] for twh in self.twhdict.values()]

    def clear(self):
        self.twhdict.clear()
        self.tokens.clear()
        self.twnum = 0

    def update_by_twh(self, twh, factor):
        twh_tokens = twh.valid_tokens
        twh_id = twh.id
        if factor > 0:
            self.tokens.merge_freq_from(twh_tokens, newest=False)
            self.twhdict[twh_id] = twh
            self.twnum += 1
        else:
            self.tokens.drop_freq_from(twh_tokens, newest=False)
            if twh_id in self.twhdict:
                self.twhdict.pop(twh_id)
            self.twnum -= 1

    """ extra functions """

    def get_rep_label(self, rep_thres):
        lb_count = Counter(self.get_lbarr())
        max_label, max_lbnum = lb_count.most_common(1)[0]
        rep_label = -1 if max_lbnum < self.twnum * rep_thres else max_label
        return rep_label
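
update_by_twh keeps the cluster's token counts and tweet count in sync with cluster membership: a positive factor merges the tweet's valid_tokens in, a negative factor subtracts them. A small hedged sketch of that bookkeeping, using a minimal stand-in for TweetHolder (only .id and .valid_tokens are touched by update_by_twh):

class FakeTwh:
    # Stand-in for TweetHolder; IdFreqDict is assumed importable as in these examples.
    def __init__(self, twid, words):
        self.id = twid
        self.valid_tokens = IdFreqDict()
        for w in words:
            self.valid_tokens.count_word(w)


clu = ClusterHolder(cluid=0)
twh = FakeTwh('t1', ['quake', 'rescue'])
clu.update_by_twh(twh, factor=1)   # join: token counts merged, twnum becomes 1
clu.update_by_twh(twh, factor=-1)  # leave: token counts subtracted, twnum back to 0
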
class WordFreqCounter:
    def __init__(self, capignore=True, worddict=None):
        self.doc_num = 0
        self.capignore = capignore

        self.worddict = worddict if worddict else IdFreqDict()
        self.posdict = IdFreqDict()
        # pos_dict_file = os.path.abspath(os.path.dirname(__file__)) + os.path.sep + 'posdict.txt'
        # self.posdict.load_worddict(pos_dict_file)
        # self.notional = {'NN': 0, 'NNP': 0, 'NNPS': 0, 'NNS': 0, 'RB': 0, 'RBR': 0, 'RBS': 0,
        #                  'UH': 0, 'VB': 0, 'VBD': 0, 'VBG': 0, 'VBN': 0, 'VBP': 0, 'VBZ': 0, }
        # self.verb = {'VB': 0, 'VBD': 0, 'VBG': 0, 'VBN': 0, 'VBP': 0, 'VBZ': 0, }

    def vocabulary_size(self):
        # return self.worddict.vocabulary_size() + self.posdict.vocabulary_size()
        return self.worddict.vocabulary_size()

    @staticmethod
    def is_valid_wordlabel(wordlabel):
        isnotentity = wordlabel[1].startswith('O')
        return isnotentity

    def calculate_idf(self):
        if self.doc_num == 0:
            raise ValueError('No valid word has been recorded yet.')
        for word in self.worddict.dictionary:
            df = self.worddict.dictionary[word]['df']
            self.worddict.dictionary[word]['idf'] = 10 / np.log(
                (self.doc_num + 1) / df)

    def feature_matrix_of_twarr(self, twarr):
        mtx = list()
        for tw in twarr:
            idfvec, added, num_entity = self.wordlabel_vector(
                tw[tk.key_wordlabels])
            mtx.append(idfvec * (np.log(len(added) + 1) + 1) *
                       (np.log(num_entity + 1) + 1))
        return np.array(mtx)

    def wordlabel_vector(self, wordlabels):
        added_word_dict = dict()
        word_vector = np.array([0] * self.worddict.vocabulary_size(),
                               dtype=np.float32)
        pos_vector = np.array([0] * self.posdict.vocabulary_size(),
                              dtype=np.float32)
        for wordlabel in wordlabels:
            word = wordlabel[0].lower().strip("#") if self.capignore else wordlabel[0]
            # word = get_root_word(word) if wordlabel[2] in self.verb else word
            # if not wordlabel[0].lower().strip("#") == word:
            #     print(wordlabel[2], wordlabel[0].lower().strip("#"), '->', word)
            if not (pu.is_valid_keyword(word)
                    and self.is_valid_wordlabel(wordlabel)):
                continue
            if word in added_word_dict:
                continue
            added_word_dict[word] = True
            if not self.worddict.is_word_in_dict(word):
                pos_tag = wordlabel[2]
                pos_vector[self.posdict.word2id(pos_tag)] += 1
            else:
                wordid = self.worddict.word_2_id(word)
                word_vector[wordid] = self.worddict.dictionary[word]['idf']
        added_word = sorted(added_word_dict.keys())
        added_entity = sorted(
            [1 for w in wordlabels if not self.is_valid_wordlabel(w)])
        return word_vector, added_word, len(added_entity)
        # return np.concatenate([word_vector, pos_vector]), added_word, len(added_entity)

    def expand_dict_and_count_df_from_wordlabel(self, wordlabels):
        added_word_dict = dict()
        for wordlabel in wordlabels:
            word = wordlabel[0].lower().strip("#") if self.capignore else wordlabel[0]
            # word = get_root_word(word) if wordlabel[2] in self.verb else word
            if not (pu.is_valid_keyword(word)
                    and self.is_valid_wordlabel(wordlabel)):
                continue
            else:
                if word in added_word_dict:
                    continue
                added_word_dict[word] = True
                # "word" is now neither entity nor invalid keyword_info or duplicated word by now
                self.worddict.expand_dict_from_word(word)
                if 'df' not in self.worddict.dictionary[word]:
                    self.worddict.dictionary[word]['df'] = 1
                else:
                    self.worddict.dictionary[word]['df'] += 1
        self.doc_num += 1

    def expand_from_wordlabel_array(self, wordlabel_arr):
        for wordlabel in wordlabel_arr:
            self.expand_dict_and_count_df_from_wordlabel(wordlabel)
        self.worddict.reset_ids()

    def reserve_word_by_idf_condition(self, rsv_cond):
        self.calculate_idf()
        for word in list(self.worddict.dictionary.keys()):
            word_idf = self.worddict.dictionary[word]['idf']
            if not rsv_cond(word_idf):
                self.worddict.remove_word(word)
        self.worddict.reset_ids()

    def merge_from(self, othercounter):
        thisdict = self.worddict.dictionary
        otherdict = othercounter.worddict.dictionary
        for otherword, otherwordattr in otherdict.items():
            if otherword not in thisdict:
                thisdict[otherword] = otherwordattr
                thisdict[otherword]['idf'] /= 5

    # def most_common_words(self, rank):
    #     wordnum = self.worddict.vocabulary_size()
    #     if 0 < rank < 1:
    #         top_k = wordnum * rank
    #     elif rank > 1 and type(rank) is int:
    #         top_k = rank
    #     else:
    #         raise ValueError('rank is not a valid number' + str(rank))
    #     dic = self.worddict.dictionary
    #     return sorted(dic.keys(), key=lambda w: dic[w]['idf'])[:top_k]

    def dump_worddict(self, dict_file, overwrite=True):
        self.worddict.dump_worddict(dict_file, overwrite)

    def load_worddict(self, dict_file):
        self.worddict.load_worddict(dict_file)
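
Note that calculate_idf uses an inverted variant of the usual idf, idf(w) = 10 / ln((doc_num + 1) / df(w)), so a word's weight grows with its document frequency rather than shrinking. A quick numeric check (plain numpy, illustrative values):

import numpy as np

doc_num = 100
for df in (1, 10, 99):
    print(df, 10 / np.log((doc_num + 1) / df))
# df=1  -> ~2.17 (rare word, small weight)
# df=10 -> ~4.32
# df=99 -> ~500  (near-ubiquitous word, very large weight)
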
Example #15
 def __init__(self):
     self.twarr = None
     self.word_dicts = (self.prop_n_dict, self.comm_n_dict, self.verb_dict, self.ht_dict) = \
         (IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict())
 def __init__(self):
     self.type_ifd_dict = dict([(k_type, IdFreqDict())
                                for k_type in TokenSet.KEY_LIST])
Example #17
class ClusterHolder:
    def __init__(self, cluid):
        self.cluid = cluid
        self.twhdict = dict()
        self.tokens = IdFreqDict()
        self.twnum = 0

    """ basic functions """

    def get_twharr(self):
        """
        返回聚类当前持有的推特包装对象的列表,不考虑排列顺序
        :return:  list,每个元素类型为TweetHolder
        """
        return list(self.twhdict.values())

    def get_twarr(self):
        """
        返回聚类当前持有的推特对象的列表,不考虑排列顺序
        :return: list,推特列表
        """
        return [twh.tw for twh in self.twhdict.values()]

    def get_lbarr(self):
        """
        返回聚类当前的推特对象所持有的标记(若存在该信息)的列表,不考虑排列顺序
        :return: list,元素为int,表示推特原本的标记值(原本属于哪个聚类)
        """
        return [twh[tk.key_event_label] for twh in self.twhdict.values()]

    def clear(self):
        """
        清空当前聚类的统计信息,包括分词表、推特列表、推特计数
        :return:
        """
        self.twhdict.clear()
        self.tokens.clear()
        self.twnum = 0

    def update_by_twh(self, twh, factor):
        """
        将输入的推特包装对象加入/移出当前聚类,并根据其 valid_tokens 更新当前聚类的分词表等统计信息
        :param twh: TweetHolder,要加入的推特包装对象
        :param factor: int,1表示加入,0表示移出
        :return:
        """
        twh_tokens = twh.valid_tokens
        twh_id = twh.id
        if factor > 0:
            self.tokens.merge_freq_from(twh_tokens, newest=False)
            self.twhdict[twh_id] = twh
            self.twnum += 1
        else:
            self.tokens.drop_freq_from(twh_tokens, newest=False)
            if twh_id in self.twhdict:
                self.twhdict.pop(twh_id)
            self.twnum -= 1

    """ extra functions """

    def get_rep_label(self, rep_thres):
        """
        计算当前聚类中是否存在标记数占推特列表总数的比例大于阈值 rep_thres 的标记
        :param rep_thres: float,判定阈值
        :return: int,若存在足够占比的标记则返回该标记,否则返回-1
        """
        lb_count = Counter(self.get_lbarr())
        max_label, max_lbnum = lb_count.most_common(1)[0]
        rep_label = -1 if max_lbnum < self.twnum * rep_thres else max_label
        return rep_label
Example #18
 def __init__(self, cluid):
     self.cluid = cluid
     self.twhdict = dict()
     self.tokens = IdFreqDict()
     self.twnum = 0
Example #19
class TweetHolder:
    # using_ifd = token_dict()

    def __init__(self, tw):
        self.tw = tw
        self.id = tw.get(tk.key_id)
        self.cluster = None
        self.tokens = IdFreqDict()
        self.valid_tokens = IdFreqDict()
        self.tokenize()

    def __contains__(self, key):
        return key in self.tw

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.setdefault(key, value)

    def get(self, key):
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        self.tw.setdefault(key, value)

    def get_cluid(self):
        """
        返回当前已被分配的聚类的ID
        :return: int,聚类的ID编号
        """
        return self.cluster.cluid

    def update_cluid_into_tw(self):
        """
        更新推特对象(self.tw,dict类型)的 tk.key_event_cluid 字段为当前已被分配的聚类的ID,
        若尚未被分配聚类则置为None
        :return:
        """
        self.tw[
            tk.
            key_event_cluid] = self.cluster.cluid if self.cluster is not None else None

    def tokenize(self):
        """
        将推特对象的text进行分词并保存分词结果,使用 self.tokens 进行分词计数
        :return:
        """
        # tokens = (t.text.lower() for t in self.tw[tk.key_spacy])
        tokens = pu.valid_tokenize(self.tw[tk.key_text].lower())
        for token in tokens:
            self.tokens.count_word(token)

    def validate(self, using_ifd):
        """
        更新分词表,将 self.tokens 中存在于 using_ifd 的单词重新计数到 self.valid_tokens 中
        :param using_ifd: utils.id_freq_dict.IdFreqDict,包含当前迭代中的合法分词
        :return:
        """
        self.valid_tokens.clear()
        for word, freq in self.tokens.word_freq_enumerate(newest=False):
            if using_ifd.has_word(word):
                self.valid_tokens.count_word(word, freq)

    def update_cluster(self, cluster):
        """
        若原本有聚类,则将当前推特从原本的 self.cluster 中分离;
        并将当前推特合并至 cluster 中(若不为None),更新 self.cluster
        :param cluster: 目标聚类对象
        :return:
        """
        if self.cluster is not None:
            self.cluster.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is not None:
            cluster.update_by_twh(self, factor=1)
Example #20
            self.ifd.load_dict(ifd_file)


# pre_dict_file = getcfg().pre_dict_file
post_dict_file = getcfg().post_dict_file
token_dict = IfdGetter(post_dict_file)

# pre_list = [getcfg().pre_prop_file, getcfg().pre_comm_file, getcfg().pre_verb_file, getcfg().pre_hstg_file]
# post_list = [getcfg().post_prop_file, getcfg().post_comm_file, getcfg().post_verb_file, getcfg().post_hstg_file]
# prop_dict, comm_dict, verb_dict, hstg_dict = [IfdGetter(post_file) for post_file in post_list]


if __name__ == '__main__':
    import utils.pattern_utils as pu
    
    def word_remove(word, freq):
        if pu.search_pattern(r'!?<>.,&\'`\^*', word) is not None or freq < 10:
            return True
        return False
    
    pre2post = dict(zip(pre_list, post_list))
    for pre, post in pre2post.items():
        ifd = IdFreqDict()
        ifd.load_dict(pre)
        pre_vocab = ifd.vocabulary_size()
        print('{} loaded, {} words'.format(pre, pre_vocab))
        ifd.drop_words_by_condition(word_remove)
        print('{} words dropped, remain {} words'.format(pre_vocab - ifd.vocabulary_size(), ifd.vocabulary_size()))
        ifd.dump_dict(post)
        print('dump over')
    def __init__(self, capignore=True, worddict=None):
        self.doc_num = 0
        self.capignore = capignore

        self.worddict = worddict if worddict else IdFreqDict()
        self.posdict = IdFreqDict()
Example #22
            self.ifd.load_dict(ifd_file)


# pre_dict_file = getcfg().pre_dict_file
post_dict_file = getcfg().post_dict_file
token_dict = IfdGetter(post_dict_file)

# pre_list = [getcfg().pre_prop_file, getcfg().pre_comm_file, getcfg().pre_verb_file, getcfg().pre_hstg_file]
# post_list = [getcfg().post_prop_file, getcfg().post_comm_file, getcfg().post_verb_file, getcfg().post_hstg_file]
# prop_dict, comm_dict, verb_dict, hstg_dict = [IfdGetter(post_file) for post_file in post_list]

if __name__ == '__main__':
    import utils.pattern_utils as pu

    def word_remove(word, freq):
        if pu.search_pattern(r'!?<>.,&\'`\^*', word) is not None or freq < 10:
            return True
        return False

    pre2post = dict(zip(pre_list, post_list))
    for pre, post in pre2post.items():
        ifd = IdFreqDict()
        ifd.load_dict(pre)
        pre_vocab = ifd.vocabulary_size()
        print('{} loaded, {} words'.format(pre, pre_vocab))
        ifd.drop_words_by_condition(word_remove)
        print('{} words dropped, remain {} words'.format(
            pre_vocab - ifd.vocabulary_size(), ifd.vocabulary_size()))
        ifd.dump_dict(post)
        print('dump over')
Example #23
 def load_ifd(self):
     from utils.id_freq_dict import IdFreqDict
     return IdFreqDict().load_dict(self.dict_file)