def sample(self, twh, D, using_max=False, no_new_clu=False, cluid_range=None):
    """Draw a cluster id for tweet handle `twh` under the collapsed conditional.

    Existing clusters are weighted by size, the (optional) new-cluster slot by
    `alpha`; each weight is scaled by four per-POS likelihood factors.
    With `using_max` the argmax is returned instead of a random sample;
    `cluid_range` restricts the candidate clusters (default: all clusters).
    """
    concentration = self.alpha
    clu_dict = self.cludict
    # priors, prior sums, and the tweet's tokens, one entry per token type
    eta_p, eta_c, eta_v, eta_h = (self.p_dict.get(k) for k in TokenSet.KEY_LIST)
    eta_p0, eta_c0, eta_v0, eta_h0 = (self.p0_dict.get(k) for k in TokenSet.KEY_LIST)
    tok_p, tok_c, tok_v, tok_h = (twh.valid_token_set.get(k) for k in TokenSet.KEY_LIST)
    # (tweet tokens, cluster token-set key, prior, prior sum) per POS group,
    # in the same proper/common/verb/hashtag order as the factors below
    groups = (
        (tok_p, su.pos_prop, eta_p, eta_p0),
        (tok_c, su.pos_comm, eta_c, eta_c0),
        (tok_v, su.pos_verb, eta_v, eta_v0),
        (tok_h, su.pos_hstg, eta_h, eta_h0),
    )
    if cluid_range is None:
        cluid_range = clu_dict.keys()
    probs = {}
    for cluid in cluid_range:
        cluster = clu_dict[cluid]
        # prior weight of an existing cluster: proportional to its tweet count
        weight = cluster.twnum / (D - 1 + concentration)
        for tokens, pos_key, eta, eta0 in groups:
            weight *= self.old_prob_delta(tokens, cluster.token_set.get(pos_key), eta, eta0)
        probs[cluid] = weight
    if not no_new_clu:
        # prior weight of opening a brand-new cluster
        weight = concentration / (D - 1 + concentration)
        for tokens, _pos_key, eta, eta0 in groups:
            weight *= self.new_prob_delta(tokens, eta, eta0)
        probs[self.max_cluid + 1] = weight
    cluids = list(probs.keys())
    weights = [probs[c] for c in cluids]
    chosen = np.argmax(weights) if using_max else au.sample_index(np.array(weights))
    return cluids[chosen]
def sample_cluster(tw, cur_iter):
    """Draw a cluster index for tweet `tw`; greedy argmax on the final iteration."""
    # (tweet freq dict, vocabulary dict, per-cluster token totals,
    #  per-cluster word counts, prior, prior sum) for each token type
    specs = (
        (tw[self.key_prop_n], prop_n_dict, n_z_p, n_zw_p, etap, etap0),
        (tw[self.key_comm_n], comm_n_dict, n_z_c, n_zw_c, etac, etac0),
        (tw[self.key_verb], verb_dict, n_z_v, n_zw_v, etav, etav0),
        (tw[self.key_ht], ht_dict, n_z_h, n_zw_h, etah, etah0),
    )
    prob = [0] * K
    for k in range(K):
        # cluster prior, then one likelihood factor per token type
        weight = (m_z[k] + alpha) / (D - 1 + alpha0)
        for freq_dict, vocab, n_z_k, n_zw_k, eta_k, eta0_k in specs:
            weight *= rule_value_of(freq_dict, vocab, n_z_k, n_zw_k, eta_k, eta0_k, k)
        prob[k] = weight
    if cur_iter >= iter_num - 1:
        return np.argmax(prob)
    return au.sample_index(np.array(prob))
def sample_cluster(tw, iter=None):
    """Draw a cluster index for tweet `tw`; argmax during the last few iterations."""
    key_freqs = tw['key']
    ht_freqs = tw['ht']
    prob = []
    for k in range(K):
        # cluster prior times keyword and hashtag likelihood factors
        weight = (m_z[k] + alpha) / (D - 1 + alpha0)
        weight *= rule_value_of(key_freqs, key_dict, n_z_key, n_zw_key, beta, beta0, k)
        weight *= rule_value_of(ht_freqs, ht_dict, n_z_ht, n_zw_ht, gamma, gamma0, k)
        prob.append(weight)
    if iter is not None and iter > iter_num - 5:
        return np.argmax(prob)
    return au.sample_index(np.array(prob))
def sample(self, retwset, D, using_max=False, no_new_clu=False):
    """Sample a cluster id for retweet-set `retwset`.

    For each tweet handle in the set, compute an (unnormalized) probability
    for every existing cluster and — unless `no_new_clu` — for opening a new
    cluster (id `self.max_cluid + 1`), then sample (or argmax, if
    `using_max`) a cluster per handle; with multiple handles the final id is
    chosen uniformly among the per-handle picks.

    Fix vs. original: the new-cluster probability was always computed and
    then popped when `no_new_clu` was set; it is now only computed when
    needed (consistent with the other `sample` method). `get_twharr()` is
    also called once instead of three times.
    """
    twharr = retwset.get_twharr()
    assert len(twharr) > 0
    pred_per_twh = []
    alpha, beta, beta0 = self.alpha, self.beta, self.beta0
    for twh in twharr:
        cluid_prob = dict()
        for cluid, cluster in self.cludict.items():
            m_zk, clu_tokens = cluster.twnum, cluster.tokens
            clu_freq_sum = clu_tokens.get_freq_sum()
            old_clu_prob = m_zk / (D - 1 + alpha)
            prob_delta = 1.0
            ii = 0  # running count of tweet tokens consumed so far
            for word, freq in twh.tokens.word_freq_enumerate(newest=False):
                clu_word_freq = clu_tokens.freq_of_word(word) if clu_tokens.has_word(word) else 0
                for jj in range(freq):
                    prob_delta *= (clu_word_freq + beta + jj) / (clu_freq_sum + beta0 + ii)
                    ii += 1
            cluid_prob[cluid] = old_clu_prob * prob_delta
        if not no_new_clu:
            # probability of spawning a brand-new (empty) cluster
            new_clu_prob = alpha / (D - 1 + alpha)
            prob_delta = 1.0
            ii = 0
            for word, freq in twh.tokens.word_freq_enumerate(newest=False):
                for jj in range(freq):
                    prob_delta *= (beta + jj) / (beta0 + ii)
                    ii += 1
            cluid_prob[self.max_cluid + 1] = new_clu_prob * prob_delta
        cluid_arr = sorted(cluid_prob.keys())
        prob_arr = [cluid_prob[__cluid] for __cluid in cluid_arr]
        pred_cluid = (cluid_arr[np.argmax(prob_arr)] if using_max
                      else cluid_arr[au.sample_index(np.array(prob_arr))])
        pred_per_twh.append(pred_cluid)
    if len(twharr) > 1:
        # several handles may disagree: pick one of their votes at random
        return int(au.choice(pred_per_twh))
    return pred_per_twh[0]
def sample_cluster(tw):
    """Sample a cluster index for tweet `tw` from the collapsed conditional.

    Combines the cluster-size prior with likelihood factors for the
    geo/time, entity, and keyword token groups.
    """
    # (tweet freq dict, word-id dict, per-cluster token totals,
    #  per-cluster word counts, prior, prior sum) per feature group
    feature_specs = (
        (tw['geo_and_time'], geo_word_id_dict, n_z_geo, n_zw_geo, eta, eta0),
        (tw['ent'], ent_word_id_dict, n_z_ent, n_zw_ent, beta, beta0),
        (tw['key'], key_word_id_dict, n_z_key, n_zw_key, lambd, lambd0),
    )
    prob = []
    for k in range(K):
        # NOTE(review): denominator uses D (not D - 1) unlike the sibling
        # samplers in this file — confirm that is intended.
        weight = (m_z[k] + alpha) / (D + alpha0)
        for freq_dict, word_ids, n_z_k, n_zw_k, prior, prior0 in feature_specs:
            weight *= rule_value_of(freq_dict, word_ids, n_z_k, n_zw_k, prior, prior0, k)
        prob.append(weight)
    return au.sample_index(np.array(prob))
def sample_cluster(tw, iter=None):
    """Sample a cluster for tweet `tw`; index K means 'open a new cluster'.

    During the last few iterations (`iter > iter_num - 5`) the argmax is
    returned instead of a random draw.
    """
    word_freqs = tw['dup']
    probs = []
    for k in range(K):
        weight = m_z[k] / (D - 1 + alpha)
        pos = 0  # running count of tweet tokens consumed so far
        for word, freq in word_freqs.items():
            numer_base = n_zw[k][words[word]['id']] + beta
            for offset in range(freq):
                weight *= (numer_base + offset) / (n_z[k] + beta0 + pos)
                pos += 1
        probs.append(weight)
    # probability of spawning a brand-new (empty) cluster
    new_weight = alpha / (D - 1 + alpha)
    pos = 0
    for word, freq in word_freqs.items():
        for offset in range(freq):
            new_weight *= (beta + offset) / (beta0 + pos)
            pos += 1
    probs.append(new_weight)
    if iter is not None and iter > iter_num - 5:
        return np.argmax(probs)
    return au.sample_index(np.array(probs))
def sample_cluster(tw, iter=None):
    """Sample a cluster for tweet `tw`, guarding the product against underflow.

    Whenever the running likelihood drops below `small_double` it is rescaled
    by `large_double` and the rescale is tallied per cluster; `recompute`
    folds the tallies back into comparable probabilities. After iteration 95
    the argmax is returned instead of a random draw.
    """
    freq_dict = tw['dup']
    prob = [0] * K
    underflowcount = [0] * K
    for k in range(K):
        rule_value = 1.0
        token_pos = 0  # running count of tweet tokens consumed so far
        for w, freq in freq_dict.items():
            wid = words[w]['id']
            for j in range(freq):
                rule_value *= (n_zw[k][wid] + beta + j) / (n_z[k] + beta0 + token_pos)
                # rescale before the product vanishes to 0.0
                if rule_value < small_double:
                    underflowcount[k] -= 1
                    rule_value *= large_double
                token_pos += 1
        prob[k] = (m_z[k] + alpha) / (D - 1 + alpha0) * rule_value
    prob = recompute(prob, underflowcount)
    if iter is not None and iter > 95:
        return np.argmax(prob)
    return au.sample_index(np.array(prob))
def sample_cluster(_tw, cur_iter=None):
    """Sample a cluster id for tweet `_tw`; `self.max_cluid + 1` is a new cluster.

    On the final iteration (`cur_iter >= iter_num - 1`) the argmax cluster id
    is returned; otherwise one is drawn proportionally to the probabilities.
    """
    prob = {}
    tw_freq_dict = _tw['dup']
    # NOTE(review): sibling samplers in this file iterate `range(K)`; if K is an
    # int here this raises TypeError. Confirm K is an iterable of cluster ids.
    for k in K:
        # existing-cluster prior times the word-likelihood product
        prob[k] = m_z[k] / (D - 1 + alpha)
        _i = 0  # running count of tweet tokens consumed so far
        for _word, _freq in tw_freq_dict.items():
            for _j in range(_freq):
                prob[k] *= (n_zw[k][words[_word]['id']] + beta + _j) / (n_z[k] + beta0 + _i)
                _i += 1
    # probability of spawning a brand-new (empty) cluster
    new_clu_prob = alpha / (D - 1 + alpha)
    _i = 0
    for _word, _freq in tw_freq_dict.items():
        for _j in range(_freq):
            new_clu_prob *= (beta + _j) / (beta0 + _i)
            _i += 1
    prob[self.max_cluid + 1] = new_clu_prob
    # evaluate candidates in a deterministic (sorted) id order
    cluid_arr = sorted(prob.keys())
    prob_arr = [prob[__cluid] for __cluid in cluid_arr]
    if cur_iter is not None and cur_iter >= iter_num - 1:
        return cluid_arr[np.argmax(prob_arr)]
    else:
        return cluid_arr[au.sample_index(np.array(prob_arr))]