Example 1
 def __init__(self, tw):
     self.tw = tw
     self.id = tw.get(tk.key_id)
     self.cluster = None
     self.tokens = IdFreqDict()
     self.valid_tokens = IdFreqDict()
     self.tokenize()
Example 2
def get_semantic_tokens(file_list):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict()},
        ark.comm_label: {K_IFD: IdFreqDict()},
        ark.verb_label: {K_IFD: IdFreqDict()},
        ark.hstg_label: {K_IFD: IdFreqDict()},
    }
    total_doc_num = 0
    for file in file_list:
        twarr = ark.twarr_ark(fu.load_array(file))
        total_doc_num += len(twarr)
        pos_tokens = au.merge_array([tw[tk.key_ark] for tw in twarr])
        for pos_token in pos_tokens:
            word = pos_token[0].strip().lower()
            if len(word) <= 2 or not pu.is_valid_keyword(word):
                continue
            real_label = ark.pos_token2semantic_label(pos_token)
            if real_label:
                pos_type_info[real_label][K_IFD].count_word(word)
    return pos_type_info, total_doc_num
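The return value pairs each POS label with a nested dict holding its IdFreqDict under K_IFD. A self-contained sketch of that shape, using plain dicts as stand-ins for IdFreqDict (the label strings and counts below are made up for the demo, not from the project):

pos_type_info = {
    'prop_label': {'K_IFD': {'nepal': 12, 'kathmandu': 7}},
    'comm_label': {'K_IFD': {'earthquake': 40, 'rescue': 18}},
}
total_doc_num = 58
for label, info in pos_type_info.items():
    print(label, 'vocab size:', len(info['K_IFD']))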
Example 3
def get_semantic_tokens_multi(file_path):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_prop_dict_file},
        ark.comm_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_comm_dict_file},
        ark.verb_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_verb_dict_file},
        ark.hstg_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_hstg_dict_file},
    }
    total_doc_num = 0
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 40)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_semantic_tokens, [(file_list,) for file_list in file_list_block])
    for res_type_info, doc_num in res_list:
        total_doc_num += doc_num
        for label in res_type_info.keys():
            pos_type_info[label][K_IFD].merge_freq_from(res_type_info[label][K_IFD])
    print('total_doc_num', total_doc_num)
    for label in pos_type_info.keys():
        ifd, file_name = pos_type_info[label][K_IFD], pos_type_info[label][K_FILE]
        ifd.drop_words_by_condition(3)
        if label != ark.hstg_label:
            ifd.drop_words_by_condition(lambda word, _: word.startswith('#'))
        ifd.dump_dict(file_name)
        print('{}; vocab size:{}'.format(file_name, ifd.vocabulary_size()))
Example 4
 def tokenize(self):
     self.tokens = IdFreqDict()
     for token in self.tw[tk.key_spacy]:
         word = token.text.lower().strip('#').strip()
         if ClusterService.is_valid_keyword(word) and token_dict().has_word(word):
             self.tokens.count_word(word)
Example 5
def get_ifd_from_docarr(docarr):
    """ assume that docarr has been tokenized """
    ifd = IdFreqDict()
    for doc in docarr:
        ifd.count_words(doc.tokens)
    ifd.reset_id()
    return ifd
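Since collections.Counter behaves like the counting half of IdFreqDict, the function above can be mirrored in a fully self-contained form. Counter is only a stand-in here; reset_id, which presumably compacts word ids after counting, has no Counter equivalent:

from collections import Counter
from types import SimpleNamespace

def get_counter_from_docarr(docarr):
    # Same loop as above, with Counter.update standing in for count_words.
    counter = Counter()
    for doc in docarr:
        counter.update(doc.tokens)
    return counter

docs = [SimpleNamespace(tokens=['quake', 'city']), SimpleNamespace(tokens=['quake'])]
print(get_counter_from_docarr(docs))  # Counter({'quake': 2, 'city': 1})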
Example 6
 def __init__(self, doc):
     self.cluster = None
     self.text = doc.text
     self.topic = doc.topic
     self.tokenids = doc.tokenids
     self.ifd = IdFreqDict()
     for t in self.tokenids:
         self.ifd.count_word(t)
Example 7
 def preprocess_twarr(self, twarr):
     """pre-process the tweet text, including dropping non-common terms"""
     key_tokens = tk.key_wordlabels
     self.twarr = twarr
     for word_dict in self.word_dicts:
         word_dict.clear()
     # self.prop_n_dict, self.comm_n_dict, self.verb_dict, self.ht_dict = \
     #     IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict()
     pos_tag2dict_map = dict(
         [(tag, self.prop_n_dict) for tag in self.prop_n_tags] +
         [(tag, self.comm_n_dict) for tag in self.comm_n_tags] +
         [(tag, self.verb_dict) for tag in self.verb_tags] +
         [(tag, self.ht_dict) for tag in self.ht_rags])
     for tw in twarr:
         tokens = tw[key_tokens]
         for i in range(len(tokens) - 1, -1, -1):
             tokens[i][0] = tokens[i][0].lower().strip()
             word, _, pos_tag = tokens[i]
             if not cs.is_valid_keyword(word):
                 del tokens[i]
                 continue  # slot i now holds the next token; falling through would touch the wrong one
             if word.startswith('#') and not pos_tag.lower() == 'ht':
                 pos_tag = tokens[i][2] = 'HT'
             if pos_tag in pos_tag2dict_map:
                 pos_tag2dict_map[pos_tag].count_word(word)
     self.prop_n_dict.drop_words_by_condition(3)
     self.comm_n_dict.drop_words_by_condition(4)
     self.verb_dict.drop_words_by_condition(4)
     self.ht_dict.drop_words_by_condition(3)
     for tw in twarr:
         tw[self.key_prop_n], tw[self.key_comm_n], tw[self.key_verb], tw[self.key_ht] = \
             IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict()
         tw_pos_tag2dict_map = dict(
             [(tag, tw[self.key_prop_n]) for tag in self.prop_n_tags] +
             [(tag, tw[self.key_comm_n]) for tag in self.comm_n_tags] +
             [(tag, tw[self.key_verb]) for tag in self.verb_tags] +
             [(tag, tw[self.key_ht]) for tag in self.ht_rags])
         for token in tw[key_tokens]:
             word, _, pos_tag = token
             if pos_tag in tw_pos_tag2dict_map and pos_tag2dict_map[pos_tag].has_word(word):
                 tw_pos_tag2dict_map[pos_tag].count_word(word)
Example 8
 def __call__(self, *args, **kwargs):
     if IfdGetter.K_IFD_FILE in kwargs:
         self.ifd_file = kwargs.get(IfdGetter.K_IFD_FILE)
     if self.ifd_file is None:
         raise ValueError('An id freq dict should be specified.')
     if self.ifd is None:
         self.ifd = IdFreqDict()
         self.ifd.load_dict(self.ifd_file)
     return self.ifd
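This makes IfdGetter a lazy, cached loader: the dict file is read on the first call and the same IdFreqDict instance is returned afterwards. Example 15 wires it up as token_dict = IfdGetter(post_dict_file) and Example 4 consumes it via token_dict().has_word(word). A minimal usage sketch, assuming the project's own modules are importable:

token_dict = IfdGetter(post_dict_file)  # no file I/O happens yet
ifd = token_dict()                      # first call: loads post_dict_file
assert token_dict() is ifd              # later calls return the cached dict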
Example 9
 def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
     """
     Perform the actual clustering and sampling. If old_twharr is empty,
     new_twharr is treated as the corpus and a full clustering pass is run
     over it, which takes more time; if old_twharr is non-empty, it is
     assumed to already hold the results of the previous clustering, and
     each tweet wrapper in new_twharr is sampled into an existing cluster.
     :param old_twharr: list whose elements are TweetHolder
     :param new_twharr: list whose elements are TweetHolder
     :param iter_num: number of iterations used by the clustering loop
     :return:
     """
     cludict = self.cludict
     """ recalculate the valid dictionary """
     valid_dict = IdFreqDict()
     D = len(old_twharr) + len(new_twharr)
     for twh in old_twharr + new_twharr:
         valid_dict.merge_freq_from(twh.tokens, newest=False)
     valid_dict.drop_words_by_condition(3)
     """ reallocate & parameter """
     for cluster in cludict.values():
         cluster.clear()
     for old_twh in old_twharr:
         # if old_twh.get_cluid() not in cludict:
         #     continue
         old_twh.validate(valid_dict)
         old_cluster = old_twh.cluster
         old_twh.cluster = None
         old_twh.update_cluster(old_cluster)
     for new_twh in new_twharr:
         new_twh.validate(valid_dict)
         if old_twharr:
             new_cluid = self.sample(new_twh,
                                     D,
                                     using_max=True,
                                     no_new_clu=True)
         else:
             new_cluid = self.max_cluid
         cluster = cludict[new_cluid]
         new_twh.update_cluster(cluster)
     self.beta0 = self.beta * valid_dict.vocabulary_size()
     """ start iteration """
     for i in range(iter_num):
         print('  {}-th clustering, clu num: {}'.format(i + 1, len(cludict)))
         for twh in new_twharr:
             cluster = twh.cluster
             twh.update_cluster(None)
             if cluster.twnum == 0:
                 cludict.pop(cluster.cluid)
             cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
             if cluid not in cludict:
                 self.max_cluid = cluid
                 cludict[cluid] = ClusterHolder(cluid)
             twh.update_cluster(cludict[cluid])
     for twh in new_twharr:
         twh.update_cluid_into_tw()
Example 10
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
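Examples 3 and 10 share the same fan-out/merge shape: split the file list into blocks, run one worker per block, then merge the partial results. A self-contained sketch of that shape using only the standard library; the toy worker and plain-dict merge stand in for get_tokens, mu.split_multi_format, mu.multi_process, and merge_freq_from (all names below are illustrative):

from multiprocessing import Pool

def count_chars(words):  # toy stand-in for the get_tokens worker
    return {w: len(w) for w in words}

if __name__ == '__main__':
    blocks = [['alpha', 'beta'], ['gamma']]  # what split_multi_format would yield
    with Pool(processes=2) as pool:
        partials = pool.map(count_chars, blocks)
    merged = {}
    for part in partials:  # mirrors the merge_freq_from loop above
        merged.update(part)
    print(merged)  # {'alpha': 5, 'beta': 4, 'gamma': 5}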
Example 11
 def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
     cludict = self.cludict
     valid_dict = IdFreqDict()
     if len(old_twharr) > 0:
         for cluster in cludict.values():
             cluster.clear()
     D = len(old_twharr) + len(new_twharr)
     """ recalculate the valid dictionary """
     for twh in old_twharr + new_twharr:
         valid_dict.merge_freq_from(twh.tokens, newest=False)
     valid_dict.drop_words_by_condition(3)
     """ reallocate & parameter """
     for old_twh in old_twharr:
         if old_twh.get_cluid() not in cludict:
             continue
         old_twh.validate(valid_dict)
         old_cluster = old_twh.cluster
         old_twh.cluster = None
         old_twh.update_cluster(old_cluster)
     for new_twh in new_twharr:
         new_twh.validate(valid_dict)
         if len(old_twharr) > 0:
             new_cluid = self.sample(new_twh,
                                     D,
                                     using_max=True,
                                     no_new_clu=True)
         else:
             new_cluid = self.max_cluid
         new_twh.update_cluster(cludict[new_cluid])
     self.beta0 = self.beta * valid_dict.vocabulary_size()
     """ start iteration """
     for i in range(iter_num):
         print('  {}-th clustering, clu num: {}'.format(i + 1, len(cludict)))
         for twh in new_twharr:
             cluster = twh.cluster
             twh.update_cluster(None)
             if cluster.twnum == 0:
                 cludict.pop(cluster.cluid)
             cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
             if cluid not in cludict:
                 self.max_cluid = cluid
                 cludict[self.max_cluid] = ClusterHolder(self.max_cluid)
             twh.update_cluster(cludict[cluid])
     for twh in new_twharr:
         twh.update_cluid_into_tw()
Example 12
def get_tokens(file_list):
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for file in file_list:
        twarr = fu.load_array(file)
        total_doc_num += len(twarr)
        for tw in twarr:
            tokens = re.findall(r'[a-zA-Z_#\-]{3,}', tw[tk.key_text].lower())
            real_tokens = list()
            for token in tokens:
                if len(token) >= 16:
                    real_tokens.extend(pu.segment(token))
                else:
                    real_tokens.append(token)
            for token in real_tokens:
                if (not pu.is_stop_word(token)) and pu.has_azAZ(token) and 3 <= len(token):
                    id_freq_dict.count_word(token)
    id_freq_dict.drop_words_by_condition(2)
    print(id_freq_dict.vocabulary_size())
    return id_freq_dict, total_doc_num
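The token regex keeps runs of at least three letters, underscores, hashes, or hyphens, so digits split tokens apart. A quick standalone check of what it extracts from a made-up tweet text:

import re

text = 'Magnitude-6 #earthquake hits the east_coast at 3am'.lower()
print(re.findall(r'[a-zA-Z_#\-]{3,}', text))
# ['magnitude-', '#earthquake', 'hits', 'the', 'east_coast']

Tokens of 16+ characters are then split further by pu.segment, and stop words and tokens without any letters are filtered out before counting.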
Example 13
 def __init__(self):
     self.twarr = None
     self.word_dicts = (self.prop_n_dict, self.comm_n_dict, self.verb_dict, self.ht_dict) = \
         (IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict())
Example 14
 def __init__(self):
     self.type_ifd_dict = dict([(k_type, IdFreqDict())
                                for k_type in TokenSet.KEY_LIST])
Example 15
# pre_dict_file = getcfg().pre_dict_file
post_dict_file = getcfg().post_dict_file
token_dict = IfdGetter(post_dict_file)

# pre_list / post_list are required by the __main__ block below
pre_list = [getcfg().pre_prop_file, getcfg().pre_comm_file, getcfg().pre_verb_file, getcfg().pre_hstg_file]
post_list = [getcfg().post_prop_file, getcfg().post_comm_file, getcfg().post_verb_file, getcfg().post_hstg_file]
# prop_dict, comm_dict, verb_dict, hstg_dict = [IfdGetter(post_file) for post_file in post_list]

if __name__ == '__main__':
    import utils.pattern_utils as pu

    def word_remove(word, freq):
        if pu.search_pattern(r'!?<>.,&\'`\^*', word) is not None or freq < 10:
            return True
        return False

    pre2post = dict(zip(pre_list, post_list))
    for pre, post in pre2post.items():
        ifd = IdFreqDict()
        ifd.load_dict(pre)
        pre_vocab = ifd.vocabulary_size()
        print('{} loaded, {} words'.format(pre, pre_vocab))
        ifd.drop_words_by_condition(word_remove)
        print('{} words dropped, remain {} words'.format(
            pre_vocab - ifd.vocabulary_size(), ifd.vocabulary_size()))
        ifd.dump_dict(post)
        print('dump over')
Example 16
 def __init__(self, cluid):
     self.cluid = cluid
     self.twhdict = dict()
     self.tokens = IdFreqDict()
     self.twnum = 0
Example 17
    def __init__(self, capignore=True, worddict=None):
        self.doc_num = 0
        self.capignore = capignore

        self.worddict = worddict if worddict else IdFreqDict()
        self.posdict = IdFreqDict()
Example 18
 def load_ifd(self):
     from utils.id_freq_dict import IdFreqDict
     return IdFreqDict().load_dict(self.dict_file)
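Taken together, the call sites above pin down most of the IdFreqDict surface. The skeleton below is reconstructed purely from those call sites for reference; it is not the original utils.id_freq_dict source, and all internals (field names, on-disk format) are assumptions:

import json

class IdFreqDict:
    """Skeleton inferred from the examples above, not the original class."""

    def __init__(self):
        self._freq = {}      # word -> frequency; internal names are guesses
        self._word2id = {}   # word -> contiguous integer id

    def count_word(self, word):
        self._freq[word] = self._freq.get(word, 0) + 1

    def count_words(self, words):
        for word in words:
            self.count_word(word)

    def has_word(self, word):
        return word in self._freq

    def vocabulary_size(self):
        return len(self._freq)

    def merge_freq_from(self, other, newest=True):
        # `newest` is passed at some call sites (Examples 9/11); its exact
        # semantics are not recoverable from them, so it is ignored here.
        for word, freq in other._freq.items():
            self._freq[word] = self._freq.get(word, 0) + freq

    def drop_words_by_condition(self, condition):
        # Called both with an int (minimum frequency) and with a predicate
        # that returns True for words to drop (see Examples 3 and 15).
        if isinstance(condition, int):
            min_freq = condition
            condition = lambda word, freq: freq < min_freq
        for word in [w for w, f in self._freq.items() if condition(w, f)]:
            self._freq.pop(word)

    def reset_id(self):
        self._word2id = {w: i for i, w in enumerate(sorted(self._freq))}

    def dump_dict(self, file_name):
        with open(file_name, 'w') as fp:  # on-disk format assumed to be JSON
            json.dump(self._freq, fp)

    def load_dict(self, file_name):
        with open(file_name) as fp:
            self._freq = json.load(fp)
        return self  # Example 18 chains load_dict after the constructor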