Code example #1
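All four snippets appear to come from the same project and assume its imports: numpy as np, collections.Counter, and project modules aliased as tk (tweet key constants), cs (keyword filtering), and au (sampling utilities), plus the project's IdFreqDict word-count class.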
 def preprocess_twarr(self, twarr):
     """pre-process the tweet text, including dropping non-common terms"""
     key_tokens = tk.key_wordlabels
     self.twarr = twarr
     for word_dict in self.word_dicts:
         word_dict.clear()
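     # map each POS tag onto the corpus-level IdFreqDict that counts words of that tag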
     pos_tag2dict_map = dict([(tag, self.prop_n_dict)
                              for tag in self.prop_n_tags] +
                             [(tag, self.comm_n_dict)
                              for tag in self.comm_n_tags] +
                             [(tag, self.verb_dict)
                              for tag in self.verb_tags] +
                             [(tag, self.ht_dict) for tag in self.ht_rags])
     for tw in twarr:
         tokens = tw[key_tokens]
         for i in range(len(tokens) - 1, -1, -1):
             tokens[i][0] = tokens[i][0].lower().strip()
             word, _, pos_tag = tokens[i]
             if not cs.is_valid_keyword(word):
                 del tokens[i]
                 continue  # token removed; the tag fixes below must not touch tokens[i]
             if word.startswith('#') and not pos_tag.lower() == 'ht':
                 pos_tag = tokens[i][2] = 'HT'
             if pos_tag in pos_tag2dict_map:
                 pos_tag2dict_map[pos_tag].count_word(word)
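     # prune infrequent words from each corpus-level dictionary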
     self.prop_n_dict.drop_words_by_condition(3)
     self.comm_n_dict.drop_words_by_condition(4)
     self.verb_dict.drop_words_by_condition(4)
     self.ht_dict.drop_words_by_condition(3)
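     # second pass: give every tweet its own per-tag IdFreqDicts, counting only words that survived the pruning above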
     for tw in twarr:
         tw[self.key_prop_n], tw[self.key_comm_n], tw[self.key_verb], tw[self.key_ht] = \
             IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict()
         tw_pos_tag2dict_map = dict([(tag, tw[self.key_prop_n])
                                     for tag in self.prop_n_tags] +
                                    [(tag, tw[self.key_comm_n])
                                     for tag in self.comm_n_tags] +
                                    [(tag, tw[self.key_verb])
                                     for tag in self.verb_tags] +
                                    [(tag, tw[self.key_ht])
                                     for tag in self.ht_rags])
         for token in tw[key_tokens]:
             word, _, pos_tag = token
             if pos_tag in tw_pos_tag2dict_map and pos_tag2dict_map[pos_tag].has_word(word):
                 tw_pos_tag2dict_map[pos_tag].count_word(word)
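
A minimal call sketch (hypothetical: proc stands for an instance of the enclosing class, with the *_tags attributes and the four corpus-level IdFreqDicts already initialized):

 # hypothetical driver; proc and twarr come from the surrounding project
 proc.preprocess_twarr(twarr)
 # every tweet in twarr now carries four per-tag IdFreqDicts under the key_* fields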
Code example #2
File: gsdmm_hashtag.py Project: leeyanghaha/my_merge
    def GSDMM_twarr_hashtag(twarr, alpha, beta, gamma, K, iter_num):
        ner_pos_token = tk.key_wordlabels
        twarr = twarr[:]
        key_dict = dict()
        ht_dict = dict()

        def word_count_id(word_dict, w):
            if w in word_dict:
                word_dict[w]['freq'] += 1
            else:
                word_dict[w] = {'freq': 1, 'id': len(word_dict)}

        def rearrange_id(word_dict):
            for idx, w in enumerate(sorted(word_dict.keys())):
                word_dict[w]['id'] = idx

        def drop_words_freq_less_than(word_dict, min_freq):
            for w in list(word_dict.keys()):
                if word_dict[w]['freq'] < min_freq:
                    del word_dict[w]
            rearrange_id(word_dict)

        """pre-process the tweet text, including dropping non-common terms"""
        for tw in twarr:
            wordlabels = tw[ner_pos_token]
            for i in range(len(wordlabels) - 1, -1, -1):
                key = wordlabels[i][0] = wordlabels[i][0].lower().strip()  # '#' is not stripped here, so hashtags are preserved
                if not cs.is_valid_keyword(key):
                    del wordlabels[i]
                else:
                    if key.startswith('#'):
                        word_count_id(ht_dict, key)
                    else:
                        word_count_id(key_dict, key)
        drop_words_freq_less_than(ht_dict, 3)
        drop_words_freq_less_than(key_dict, 5)
        for tw in twarr:
            tw['key'] = dict(
                Counter([
                    wlb[0] for wlb in tw[ner_pos_token] if wlb[0] in key_dict
                ]))
            tw['ht'] = dict(
                Counter([
                    wlb[0] for wlb in tw[ner_pos_token] if wlb[0] in ht_dict
                ]))
        """definitions of parameters"""
        D = len(twarr)
        V = len(key_dict)
        H = len(ht_dict)
        alpha0 = K * alpha
        beta0 = V * beta  # hyperparam for keyword_info
        gamma0 = H * gamma  # hyperparam for hashtag

        z = [0] * D
        m_z = [0] * K
        n_z_key = [0] * K
        n_z_ht = [0] * K
        n_zw_key = [[0] * V for _ in range(K)]
        n_zw_ht = [[0] * H for _ in range(K)]
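        # z[d]: cluster of tweet d; m_z[k]: tweets in cluster k;
        # n_z_key/n_z_ht[k]: token totals per cluster; n_zw_key/n_zw_ht[k][w]: per-cluster word counts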
        """initialize the counting arrays"""
        for d in range(D):
            cluster = int(K * np.random.random())
            z[d] = cluster
            m_z[cluster] += 1
            key_freq_dict = twarr[d]['key']
            ht_freq_dict = twarr[d]['ht']
            for key, freq in key_freq_dict.items():
                n_z_key[cluster] += freq
                n_zw_key[cluster][key_dict[key]['id']] += freq
            for ht, freq in ht_freq_dict.items():
                n_z_ht[cluster] += freq
                n_zw_ht[cluster][ht_dict[ht]['id']] += freq
        """make sampling using current counting information"""

        def rule_value_of(tw_freq_dict_, word_id_dict_, n_z_, n_zw_, p, p0,
                          cluster):
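            # product over the tweet's token occurrences of
            # (n_zw + p + j) / (n_z + p0 + i): the cluster-likelihood term of GSDMM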
            i_, value = 0, 1.0
            for w_, w_freq in tw_freq_dict_.items():
                for i in range(0, w_freq):
                    value *= (n_zw_[cluster][word_id_dict_[w_]['id']] + i +
                              p) / (n_z_[cluster] + i_ + p0)
                    i_ += 1
            return value

        def sample_cluster(tw, cur_iter=None):
            key_freq_dict = tw['key']
            ht_freq_dict = tw['ht']
            prob = [0] * K
            for k in range(K):
                prob[k] = (m_z[k] + alpha) / (D - 1 + alpha0)
                prob[k] *= rule_value_of(key_freq_dict, key_dict, n_z_key,
                                         n_zw_key, beta, beta0, k)
                prob[k] *= rule_value_of(ht_freq_dict, ht_dict, n_z_ht,
                                         n_zw_ht, gamma, gamma0, k)
            if cur_iter is not None and cur_iter > iter_num - 5:
                # in the last few sweeps, take the argmax for a stable assignment
                return np.argmax(prob)
            else:
                return au.sample_index(np.array(prob))

        """start iteration"""

        def update_using_freq_dict(tw_freq_dict_, word_id_dict_, n_z_, n_zw_,
                                   factor):
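            # note: cluster is read from the enclosing scope at call time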
            for w, w_freq in tw_freq_dict_.items():
                w_freq *= factor
                n_z_[cluster] += w_freq
                n_zw_[cluster][word_id_dict_[w]['id']] += w_freq

        """ start iteration """
        z_iter = list()
        for i in range(iter_num):
            z_iter.append(z[:])

            for d in range(D):
                cluster = z[d]
                m_z[cluster] -= 1
                key_freq_dict = twarr[d]['key']
                ht_freq_dict = twarr[d]['ht']

                update_using_freq_dict(key_freq_dict, key_dict, n_z_key,
                                       n_zw_key, -1)
                update_using_freq_dict(ht_freq_dict, ht_dict, n_z_ht, n_zw_ht,
                                       -1)

                cluster = sample_cluster(twarr[d], i)

                z[d] = cluster
                m_z[cluster] += 1
                update_using_freq_dict(key_freq_dict, key_dict, n_z_key,
                                       n_zw_key, 1)
                update_using_freq_dict(ht_freq_dict, ht_dict, n_z_ht, n_zw_ht,
                                       1)

        z_iter.append(z[:])
        return z_iter
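
A minimal call sketch; the hyperparameter values are illustrative assumptions, and twarr must already hold word-label triples under tk.key_wordlabels:

# hypothetical invocation; the numeric values are assumptions, not project defaults
z_iter = GSDMM_twarr_hashtag(twarr, alpha=0.01, beta=0.05, gamma=0.05, K=30, iter_num=60)
final_z = z_iter[-1]  # cluster id of every tweet after the final sweep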
Code example #3
    def GSDMM_twarr(twarr, alpha, beta, K, iter_num):
        ner_pos_token = tk.key_wordlabels
        twarr = twarr[:]
        words = dict()
        """pre-process the tweet text, including dropping non-common terms"""
        for tw in twarr:
            wordlabels = tw[ner_pos_token]
            for i in range(len(wordlabels) - 1, -1, -1):
                wordlabels[i][0] = wordlabels[i][0].lower().strip('#').strip()
                if not cs.is_valid_keyword(wordlabels[i][0]):
                    del wordlabels[i]
            for wordlabel in wordlabels:
                word = wordlabel[0]
                if word in words:
                    words[word]['freq'] += 1
                else:
                    words[word] = {'freq': 1, 'id': len(words)}
        min_df = 3
        for w in list(words.keys()):
            if words[w]['freq'] < min_df:
                del words[w]
        for idx, w in enumerate(sorted(words.keys())):
            words[w]['id'] = idx
        for tw in twarr:
            tw['dup'] = dict(
                Counter(
                    [wlb[0] for wlb in tw[ner_pos_token] if wlb[0] in words]))
        """definitions of parameters"""
        V = len(words.keys())
        D = len(twarr)
        alpha0 = K * alpha
        beta0 = V * beta
        z = [0] * D
        m_z = [0] * K
        n_z = [0] * K
        n_zw = [[0] * V for _ in range(K)]
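        # z: assignments; m_z: tweets per cluster; n_z: tokens per cluster; n_zw: per-cluster word counts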
        """initialize the counting arrays"""
        for d in range(D):
            cluster = int(K * np.random.random())
            z[d] = cluster
            m_z[cluster] += 1
            freq_dict = twarr[d]['dup']
            for word in freq_dict.keys():
                n_z[cluster] += freq_dict[word]
                n_zw[cluster][words[word]['id']] += freq_dict[word]
        """make sampling using current counting information"""
        small_double = 1e-150
        large_double = 1e150
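        # rule_value in sample_cluster can underflow for long tweets: each time it
        # drops below 1e-150 it is scaled up by 1e150 and the shift is recorded in
        # underflowcount; recompute() then rescales all clusters to a common exponent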

        def recompute(prob, underflowcount):
            max_count = max(underflowcount)
            return [
                prob[k] * (large_double**(underflowcount[k] - max_count))
                for k in range(len(prob))
            ]

        def sample_cluster(tw, cur_iter=None):
            prob = [0] * K
            underflowcount = [0] * K
            for k in range(K):
                prob[k] = (m_z[k] + alpha) / (D - 1 + alpha0)
                rule_value = 1.0
                i = 0
                freq_dict = tw['dup']
                for w, freq in freq_dict.items():
                    for j in range(freq):
                        wid = words[w]['id']
                        rule_value *= (n_zw[k][wid] + beta + j) / (n_z[k] +
                                                                   beta0 + i)
                        if rule_value < small_double:
                            underflowcount[k] -= 1
                            rule_value *= large_double
                        i += 1
                prob[k] *= rule_value

            prob = recompute(prob, underflowcount)
            if cur_iter is not None and cur_iter > iter_num - 5:
                # greedy argmax in the last few sweeps for a stable assignment
                return np.argmax(prob)
            else:
                return au.sample_index(np.array(prob))

        """start iteration"""
        z_iter = list()
        for i in range(iter_num):
            z_iter.append(z[:])

            for d in range(D):
                cluster = z[d]
                m_z[cluster] -= 1
                freq_dict = twarr[d]['dup']
                for word in freq_dict.keys():
                    wordid = words[word]['id']
                    wordfreq = freq_dict[word]
                    n_zw[cluster][wordid] -= wordfreq
                    n_z[cluster] -= wordfreq

                cluster = sample_cluster(twarr[d], i)

                z[d] = cluster
                m_z[cluster] += 1
                for word in freq_dict.keys():
                    wordid = words[word]['id']
                    wordfreq = freq_dict[word]
                    n_zw[cluster][wordid] += wordfreq
                    n_z[cluster] += wordfreq

        z_iter.append(z[:])
        return z_iter
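
A minimal call sketch; hyperparameter values are illustrative assumptions, and twarr must hold word-label triples under tk.key_wordlabels:

# hypothetical invocation
z_iter = GSDMM_twarr(twarr, alpha=0.01, beta=0.05, K=30, iter_num=100)
final_z = z_iter[-1]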
Code example #4
 def GSDPMM_twarr(self, old_twarr, old_z, new_twarr, iter_num):
     pos_token = tk.key_ark
     twarr = old_twarr + new_twarr
     words = dict()
     """pre-process the tweet text, including dropping non-common terms"""
     for tw in twarr:
         tokens = tw[pos_token]
         for i in range(len(tokens) - 1, -1, -1):
             tokens[i][0] = tokens[i][0].lower().strip('#').strip()
             if not cs.is_valid_keyword(tokens[i][0]):
                 del tokens[i]
         for wordlabel in tokens:
             word = wordlabel[0]
             if word in words:
                 words[word]['freq'] += 1
             else:
                 words[word] = {'freq': 1, 'id': len(words)}
     min_df = 3
     for w in list(words.keys()):
         if words[w]['freq'] < min_df:
             del words[w]
     for idx, w in enumerate(sorted(words.keys())):
         words[w]['id'] = idx
     for tw in twarr:
         tw['dup'] = dict(Counter([wlb[0] for wlb in tw[pos_token] if wlb[0] in words]))
     """definitions of parameters"""
     D = len(twarr)
     V = len(words.keys())
     alpha, beta = self.alpha, self.beta
     beta0 = V * beta
     new_z = [0] * len(new_twarr)
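     # K is the live set of cluster ids; counts are dicts keyed by id because
     # GSDPMM creates and retires clusters during sampling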
     K = {self.max_cluid} if not old_z else set(old_z)
     m_z = dict([(k, 0) for k in K])
     n_z = dict([(k, 0) for k in K])
     n_zw = dict([(k, [0] * V) for k in K])
     """initialize the counting arrays"""
     for old_d in range(len(old_twarr)):
         old_cluid = old_z[old_d]
         m_z[old_cluid] += 1
         for word, freq in old_twarr[old_d]['dup'].items():
             n_z[old_cluid] += freq
             n_zw[old_cluid][words[word]['id']] += freq
     for new_d in range(len(new_twarr)):
         new_cluid = np.random.choice(list(K))
         new_z[new_d] = new_cluid
         m_z[new_cluid] += 1
         for word, freq in new_twarr[new_d]['dup'].items():
             n_z[new_cluid] += freq
             n_zw[new_cluid][words[word]['id']] += freq
     """make sampling using current counting information"""
     
     def sample_cluster(_tw, cur_iter=None):
         prob = {}
         tw_freq_dict = _tw['dup']
         for k in K:
             prob[k] = m_z[k] / (D - 1 + alpha)
             _i = 0
             for _word, _freq in tw_freq_dict.items():
                 for _j in range(_freq):
                     prob[k] *= (n_zw[k][words[_word]['id']] + beta + _j) / (n_z[k] + beta0 + _i)
                     _i += 1
         new_clu_prob = alpha / (D - 1 + alpha)
         _i = 0
         for _word, _freq in tw_freq_dict.items():
             for _j in range(_freq):
                 new_clu_prob *= (beta + _j) / (beta0 + _i)
                 _i += 1
         
         prob[self.max_cluid + 1] = new_clu_prob
         cluid_arr = sorted(prob.keys())
         prob_arr = [prob[__cluid] for __cluid in cluid_arr]
         if cur_iter is not None and cur_iter >= iter_num - 1:
             return cluid_arr[np.argmax(prob_arr)]
         else:
             return cluid_arr[au.sample_index(np.array(prob_arr))]
     
     """start iteration"""
     for i in range(iter_num):
         for new_d in range(len(new_twarr)):
             freq_dict = new_twarr[new_d]['dup']
             old_cluid = new_z[new_d]
             new_z[new_d] = -1
             m_z[old_cluid] -= 1
             for word, freq in freq_dict.items():
                 n_z[old_cluid] -= freq
                 n_zw[old_cluid][words[word]['id']] -= freq
             
             for _cluid in list(m_z.keys()):
                 if m_z[_cluid] == 0:
                     # retire empty clusters so they no longer compete in sampling
                     m_z.pop(_cluid)
                     n_z.pop(_cluid)
                     n_zw.pop(_cluid)
                     K.remove(_cluid)
             
             new_cluid = sample_cluster(new_twarr[new_d], i)
             
             if new_cluid > self.max_cluid:
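                 # the sampler chose the "new cluster" option: allocate a fresh id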
                 new_cluid = self.max_cluid = self.max_cluid + 1
                 m_z[self.max_cluid] = 0
                 n_z[self.max_cluid] = 0
                 n_zw[self.max_cluid] = [0] * V
                 K.add(self.max_cluid)
             
             new_z[new_d] = new_cluid
             m_z[new_cluid] += 1
             for word, freq in freq_dict.items():
                 n_z[new_cluid] += freq
                 n_zw[new_cluid][words[word]['id']] += freq
     
     return new_z
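
A minimal call sketch (hypothetical: clusterer stands for an instance of the enclosing class, with alpha, beta, and max_cluid set; tweets must carry token triples under tk.key_ark):

# hypothetical driver for the online setting: old tweets keep their labels in old_z,
# while new tweets are sampled into existing or freshly created clusters
new_z = clusterer.GSDPMM_twarr(old_twarr, old_z, new_twarr, iter_num=30)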