    def sample(self,
               twh,
               D,
               using_max=False,
               no_new_clu=False,
               cluid_range=None):
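        """Draw a cluster id for the tweet wrapper `twh`.

        `using_max` returns the most probable cluster instead of sampling,
        `no_new_clu` disables proposing a brand-new cluster, and
        `cluid_range` restricts which existing clusters are scored
        (defaults to all clusters in `self.cludict`).
        """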
        alpha = self.alpha
        p_dict = self.p_dict
        p0_dict = self.p0_dict
        cludict = self.cludict
        tw_valid_token_set = twh.valid_token_set
        etap, etac, etav, etah = [
            p_dict.get(k_type) for k_type in TokenSet.KEY_LIST
        ]
        etap0, etac0, etav0, etah0 = [
            p0_dict.get(k_type) for k_type in TokenSet.KEY_LIST
        ]
        twhp, twhc, twhv, twhh = [
            tw_valid_token_set.get(k_type) for k_type in TokenSet.KEY_LIST
        ]

        if cluid_range is None:
            cluid_range = cludict.keys()

        cluid_prob = dict()
        old_prob_delta = self.old_prob_delta
        new_prob_delta = self.new_prob_delta

        for cluid in cluid_range:
            cluster = cludict[cluid]
            clu_token_set = cluster.token_set
            # Prior weight of an existing cluster: proportional to its current tweet count.
            old_clu_prob = cluster.twnum / (D - 1 + alpha)
            # Likelihood of the tweet's tokens under this cluster, one factor per
            # token type (proper nouns, common nouns, verbs, hashtags).
            prob_delta = 1.0
            prob_delta *= old_prob_delta(twhp, clu_token_set.get(su.pos_prop),
                                         etap, etap0)
            prob_delta *= old_prob_delta(twhc, clu_token_set.get(su.pos_comm),
                                         etac, etac0)
            prob_delta *= old_prob_delta(twhv, clu_token_set.get(su.pos_verb),
                                         etav, etav0)
            prob_delta *= old_prob_delta(twhh, clu_token_set.get(su.pos_hstg),
                                         etah, etah0)
            cluid_prob[cluid] = old_clu_prob * prob_delta

        if not no_new_clu:
            # Prior weight of opening a new (empty) cluster, controlled by alpha.
            new_clu_prob = alpha / (D - 1 + alpha)
            prob_delta = 1.0
            prob_delta *= new_prob_delta(twhp, etap, etap0)
            prob_delta *= new_prob_delta(twhc, etac, etac0)
            prob_delta *= new_prob_delta(twhv, etav, etav0)
            prob_delta *= new_prob_delta(twhh, etah, etah0)
            cluid_prob[self.max_cluid + 1] = new_clu_prob * prob_delta
            # print('new:{} init new:{} delta:{}'.format(cluid_prob[self.max_cluid + 1], new_clu_prob, prob_delta))
            # print()

        cluid_arr = list(cluid_prob.keys())
        prob_arr = [cluid_prob[cluid] for cluid in cluid_arr]
        if using_max:
            sampled_idx = np.argmax(prob_arr)
        else:
            sampled_idx = au.sample_index(np.array(prob_arr))
        return cluid_arr[sampled_idx]
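Every example on this page relies on au.sample_index to draw a cluster index from an unnormalized probability vector. The helper itself is not shown; a minimal sketch of the assumed behavior (normalize the weights, then draw once from the resulting categorical distribution):

import numpy as np

def sample_index(prob_arr):
    # Assumed behavior of au.sample_index: normalize the unnormalized
    # weights and draw a single index from the categorical distribution.
    prob_arr = np.asarray(prob_arr, dtype=np.float64)
    return int(np.random.choice(len(prob_arr), p=prob_arr / prob_arr.sum()))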
Example #2
 def sample_cluster(tw, cur_iter):
     prob = [0] * K
     for k in range(K):
         prob[k] = (m_z[k] + alpha) / (D - 1 + alpha0)
         prob[k] *= rule_value_of(tw[self.key_prop_n], prop_n_dict, n_z_p, n_zw_p, etap, etap0, k)
         prob[k] *= rule_value_of(tw[self.key_comm_n], comm_n_dict, n_z_c, n_zw_c, etac, etac0, k)
         prob[k] *= rule_value_of(tw[self.key_verb], verb_dict, n_z_v, n_zw_v, etav, etav0, k)
         prob[k] *= rule_value_of(tw[self.key_ht], ht_dict, n_z_h, n_zw_h, etah, etah0, k)
     if cur_iter >= iter_num - 1:
         return np.argmax(prob)
     else:
         return au.sample_index(np.array(prob))
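Examples #2, #3, and #5 delegate the per-token-type factor to a shared helper rule_value_of that is not reproduced on this page. Judging from the inline loops in Examples #6-#8 (and the commented-out expansion in Example #5), it computes the Dirichlet-multinomial predictive value for one token type; a sketch under that assumption, using 0-based count offsets as in the inline versions:

def rule_value_of(freq_dict, word_id_dict, n_z, n_zw, eta, eta0, k):
    # Predictive probability of the words in freq_dict under cluster k:
    # n_zw[k][wid] is the per-word count in cluster k, n_z[k] the total
    # count for this token type, eta / eta0 the Dirichlet prior and its sum.
    value, i = 1.0, 0
    for word, freq in freq_dict.items():
        wid = word_id_dict[word]['id']
        for j in range(freq):
            value *= (n_zw[k][wid] + eta + j) / (n_z[k] + eta0 + i)
            i += 1
    return value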
Example #3
 def sample_cluster(tw, iter=None):
     prob = [0] * K
     for k in range(K):
         prob[k] = (m_z[k] + alpha) / (D - 1 + alpha0)
         key_freq_dict = tw['key']
         ht_freq_dict = tw['ht']
         prob[k] *= rule_value_of(key_freq_dict, key_dict, n_z_key,
                                  n_zw_key, beta, beta0, k)
         prob[k] *= rule_value_of(ht_freq_dict, ht_dict, n_z_ht,
                                  n_zw_ht, gamma, gamma0, k)
     if iter is not None and iter > iter_num - 5:
         return np.argmax(prob)
     else:
         return au.sample_index(np.array(prob))
Example #4
    def sample(self, retwset, D, using_max=False, no_new_clu=False):
        assert len(retwset.get_twharr()) > 0
        prob_twset = []
        alpha, beta, beta0 = self.alpha, self.beta, self.beta0
        for twh in retwset.get_twharr():
            cluid_prob = dict()
            for cluid, cluster in self.cludict.items():
                m_zk, clu_tokens = cluster.twnum, cluster.tokens
                clu_freq_sum = clu_tokens.get_freq_sum()
                old_clu_prob = m_zk / (D - 1 + alpha)
                prob_delta = 1.0
                ii = 0
                for word, freq in twh.tokens.word_freq_enumerate(newest=False):
                    clu_word_freq = clu_tokens.freq_of_word(
                        word) if clu_tokens.has_word(word) else 0
                    # print('w:{} cid:{} cwf:{} cfs:{}, beta:{} beta0:{}'.format(
                    #     word, cluid, clu_word_freq, clu_freq_sum, beta, beta0))
                    for jj in range(freq):
                        prob_delta *= (clu_word_freq + beta +
                                       jj) / (clu_freq_sum + beta0 + ii)
                        ii += 1
                cluid_prob[cluid] = old_clu_prob * prob_delta
                # print('old:{} init old:{} delta:{}'.format(cluid_prob[cluid], old_clu_prob, prob_delta))
            ii = 0
            new_clu_prob = alpha / (D - 1 + alpha)
            prob_delta = 1.0
            for word, freq in twh.tokens.word_freq_enumerate(newest=False):
                for jj in range(freq):
                    prob_delta *= (beta + jj) / (beta0 + ii)
                    ii += 1
            cluid_prob[self.max_cluid + 1] = new_clu_prob * prob_delta
            # print('new:{} init new:{} delta:{}'.format(cluid_prob[self.max_cluid + 1], new_clu_prob, prob_delta))
            # print()
            if no_new_clu:
                cluid_prob.pop(self.max_cluid + 1)

            cluid_arr = sorted(cluid_prob.keys())
            prob_arr = [cluid_prob[__cluid] for __cluid in cluid_arr]
            # print(prob_arr)
            if using_max:
                pred_cluid = cluid_arr[np.argmax(prob_arr)]
            else:
                pred_cluid = cluid_arr[au.sample_index(np.array(prob_arr))]
            prob_twset.append(pred_cluid)

        if len(retwset.get_twharr()) > 1:
            final_sample = int(au.choice(prob_twset))
        else:
            final_sample = prob_twset[0]
        return final_sample
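Example #4 scores every tweet in a retweet set separately and then resolves the candidate cluster ids with au.choice. Assuming that helper simply picks one element of the list uniformly at random, a sketch:

import numpy as np

def choice(arr):
    # Assumed behavior of au.choice: uniformly pick one element of arr.
    return arr[np.random.randint(len(arr))]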
Example #5
        def sample_cluster(tw):
            geo_freq_dict = tw['geo_and_time']
            ent_freq_dict = tw['ent']
            key_freq_dict = tw['key']
            prob = [0] * K
            for k in range(K):
                prob[k] = (m_z[k] + alpha) / (D + alpha0)
                prob[k] *= rule_value_of(geo_freq_dict, geo_word_id_dict,
                                         n_z_geo, n_zw_geo, eta, eta0, k)
                prob[k] *= rule_value_of(ent_freq_dict, ent_word_id_dict,
                                         n_z_ent, n_zw_ent, beta, beta0, k)
                prob[k] *= rule_value_of(key_freq_dict, key_word_id_dict,
                                         n_z_key, n_zw_key, lambd, lambd0, k)
                # bb=1.0
                # b = 1
                # rule_value = 1.0
                # for geo_w, w_count in geo_freq_dict.items():
                #     for idx in range(1, w_count + 1):
                #         wid = geo_word_id_dict[geo_w]['id']
                #         rule_value *= (n_zw_geo[k][wid] + idx + eta)/(n_z_geo[k] + b + eta0)
                #         b += 1
                # bb*=rule_value
                # b = 1
                # rule_value = 1.0
                # for ent_w, w_count in ent_freq_dict.items():
                #     for idx in range(1, w_count + 1):
                #         wid = ent_word_id_dict[ent_w]['id']
                #         rule_value *= (n_zw_ent[k][wid] + idx + beta)/(n_z_ent[k] + b + beta0)
                #         b += 1
                # bb *= rule_value
                # b = 1
                # rule_value = 1.0
                # for key_w, w_count in key_freq_dict.items():
                #     for idx in range(1, w_count + 1):
                #         wid = key_word_id_dict[key_w]['id']
                #         rule_value *= (n_zw_key[k][wid] + idx + lambd)/(n_z_key[k] + b + lambd0)
                #         b += 1
                # bb *= rule_value
                # print(aa-bb)

                # if rule_value < smallDouble:
                #     underflowcount[k] -= 1
                #     rule_value *= largeDouble
                # prob = recompute(prob, underflowcount)
            return au.sample_index(np.array(prob))
Example #6
 def sample_cluster(tw, iter=None):
     prob = [0] * K
     freq_dict = tw['dup']
     for k in range(K):
         prob[k] = m_z[k] / (D - 1 + alpha)
         rule_value = 1.0
         i_ = 0
         for word, freq in freq_dict.items():
             for j_ in range(freq):
                 rule_value *= (n_zw[k][words[word]['id']] + beta +
                                j_) / (n_z[k] + beta0 + i_)
                 i_ += 1
         prob[k] *= rule_value
     new_cluster_prob = alpha / (D - 1 + alpha)
     i_ = 0
     for word, freq in freq_dict.items():
         for j_ in range(freq):
             new_cluster_prob *= (beta + j_) / (beta0 + i_)
             i_ += 1
     if iter is not None and iter > iter_num - 5:
         return np.argmax(prob + [new_cluster_prob])
     else:
         return au.sample_index(np.array(prob + [new_cluster_prob]))
Example #7
        def sample_cluster(tw, iter=None):
            prob = [0] * K
            underflowcount = [0] * K
            for k in range(K):
                prob[k] = (m_z[k] + alpha) / (D - 1 + alpha0)
                rule_value = 1.0
                i = 0
                freq_dict = tw['dup']
                for w, freq in freq_dict.items():
                    for j in range(freq):
                        wid = words[w]['id']
                        rule_value *= (n_zw[k][wid] + beta + j) / (n_z[k] +
                                                                   beta0 + i)
                        if rule_value < small_double:
                            underflowcount[k] -= 1
                            rule_value *= large_double
                        i += 1
                prob[k] *= rule_value

            prob = recompute(prob, underflowcount)
            if iter is not None and iter > 95:
                return np.argmax(prob)
            else:
                return au.sample_index(np.array(prob))
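Example #7 guards against floating-point underflow: whenever rule_value drops below small_double it is scaled up by large_double and the number of rescalings is recorded per cluster in underflowcount. The recompute helper that puts all clusters back on a comparable scale is not shown; a sketch of one consistent implementation, assuming large_double is the same rescaling constant available in the enclosing scope:

def recompute(prob, underflowcount):
    # Each prob[k] was multiplied by large_double a total of -underflowcount[k]
    # times; undo the rescaling relative to the least-shifted cluster so the
    # values stay comparable without underflowing again.
    max_count = max(underflowcount)
    return [p * large_double ** (underflowcount[k] - max_count)
            for k, p in enumerate(prob)]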
Example #8
 def sample_cluster(_tw, cur_iter=None):
     prob = {}
     tw_freq_dict = _tw['dup']
     for k in K:
         prob[k] = m_z[k] / (D - 1 + alpha)
         _i = 0
         for _word, _freq in tw_freq_dict.items():
             for _j in range(_freq):
                 prob[k] *= (n_zw[k][words[_word]['id']] + beta + _j) / (n_z[k] + beta0 + _i)
                 _i += 1
     new_clu_prob = alpha / (D - 1 + alpha)
     _i = 0
     for _word, _freq in tw_freq_dict.items():
         for _j in range(_freq):
             new_clu_prob *= (beta + _j) / (beta0 + _i)
             _i += 1
     
     prob[self.max_cluid + 1] = new_clu_prob
     cluid_arr = sorted(prob.keys())
     prob_arr = [prob[__cluid] for __cluid in cluid_arr]
     if cur_iter is not None and cur_iter >= iter_num - 1:
         return cluid_arr[np.argmax(prob_arr)]
     else:
         return cluid_arr[au.sample_index(np.array(prob_arr))]