def preprocess_twarr(self, twarr):
    """pre-process the tweet text, including dropping non-common terms"""
    key_tokens = tk.key_wordlabels
    self.twarr = twarr
    for word_dict in self.word_dicts:
        word_dict.clear()
    # self.prop_n_dict, self.comm_n_dict, self.verb_dict, self.ht_dict = \
    #     IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict()
    # route every POS tag to the corpus-level dictionary it should be counted in
    pos_tag2dict_map = dict([(tag, self.prop_n_dict) for tag in self.prop_n_tags] +
                            [(tag, self.comm_n_dict) for tag in self.comm_n_tags] +
                            [(tag, self.verb_dict) for tag in self.verb_tags] +
                            [(tag, self.ht_dict) for tag in self.ht_rags])
    for tw in twarr:
        tokens = tw[key_tokens]
        for i in range(len(tokens) - 1, -1, -1):
            tokens[i][0] = tokens[i][0].lower().strip()
            word, _, pos_tag = tokens[i]
            if not cs.is_valid_keyword(word):
                del tokens[i]
                continue
            if word.startswith('#') and not pos_tag.lower() == 'ht':
                pos_tag = tokens[i][2] = 'HT'
            if pos_tag in pos_tag2dict_map:
                pos_tag2dict_map[pos_tag].count_word(word)
    # drop non-common terms from the corpus-level dictionaries
    self.prop_n_dict.drop_words_by_condition(3)
    self.comm_n_dict.drop_words_by_condition(4)
    self.verb_dict.drop_words_by_condition(4)
    self.ht_dict.drop_words_by_condition(3)
    # per-tweet dictionaries only keep words that survived the corpus-level filtering
    for tw in twarr:
        tw[self.key_prop_n], tw[self.key_comm_n], tw[self.key_verb], tw[self.key_ht] = \
            IdFreqDict(), IdFreqDict(), IdFreqDict(), IdFreqDict()
        tw_pos_tag2dict_map = dict([(tag, tw[self.key_prop_n]) for tag in self.prop_n_tags] +
                                   [(tag, tw[self.key_comm_n]) for tag in self.comm_n_tags] +
                                   [(tag, tw[self.key_verb]) for tag in self.verb_tags] +
                                   [(tag, tw[self.key_ht]) for tag in self.ht_rags])
        for token in tw[key_tokens]:
            word, _, pos_tag = token
            if pos_tag in tw_pos_tag2dict_map and pos_tag2dict_map[pos_tag].has_word(word):
                tw_pos_tag2dict_map[pos_tag].count_word(word)
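
# Illustrative, self-contained sketch (not part of the original module): it mirrors the
# tag-routing idea of preprocess_twarr above with plain collections.Counter objects standing
# in for the project's IdFreqDict, and with assumed placeholder tag sets rather than the
# class's real prop_n_tags/comm_n_tags/verb_tags.
from collections import Counter

def _route_words_by_tag_sketch(tagged_tokens, min_freq=2):
    prop_n, comm_n = Counter(), Counter()
    # several POS tags can share one counting dictionary, as in pos_tag2dict_map above
    tag2dict = {'NNP': prop_n, 'NNPS': prop_n, 'NN': comm_n, 'NNS': comm_n}
    for word, tag in tagged_tokens:
        if tag in tag2dict:
            tag2dict[tag][word] += 1
    # drop words below min_freq, mirroring drop_words_by_condition
    return [Counter({w: c for w, c in d.items() if c >= min_freq}) for d in (prop_n, comm_n)]

# Example:
# _route_words_by_tag_sketch([('paris', 'NNP'), ('fire', 'NN'), ('paris', 'NNP')], min_freq=2)
# -> [Counter({'paris': 2}), Counter()]
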
def GSDMM_twarr_hashtag(twarr, alpha, beta, gamma, K, iter_num):
    ner_pos_token = tk.key_wordlabels
    twarr = twarr[:]
    key_dict = dict()
    ht_dict = dict()

    def word_count_id(word_dict, w):
        if w in word_dict:
            word_dict[w]['freq'] += 1
        else:
            word_dict[w] = {'freq': 1, 'id': len(word_dict)}

    def rearrange_id(word_dict):
        for idx, w in enumerate(sorted(word_dict.keys())):
            word_dict[w]['id'] = idx

    def drop_words_freq_less_than(word_dict, min_freq):
        for w in list(word_dict.keys()):
            if word_dict[w]['freq'] < min_freq:
                del word_dict[w]
        rearrange_id(word_dict)

    """pre-process the tweet text, including dropping non-common terms"""
    for tw in twarr:
        wordlabels = tw[ner_pos_token]
        for i in range(len(wordlabels) - 1, -1, -1):
            key = wordlabels[i][0] = wordlabels[i][0].lower().strip()  # hashtags are reserved here
            if not cs.is_valid_keyword(key):
                del wordlabels[i]
            else:
                if key.startswith('#'):
                    word_count_id(ht_dict, key)
                else:
                    word_count_id(key_dict, key)
    drop_words_freq_less_than(ht_dict, 3)
    drop_words_freq_less_than(key_dict, 5)
    for tw in twarr:
        tw['key'] = dict(Counter([wlb[0] for wlb in tw[ner_pos_token] if wlb[0] in key_dict]))
        tw['ht'] = dict(Counter([wlb[0] for wlb in tw[ner_pos_token] if wlb[0] in ht_dict]))
    # pos_tw_num = len([1 for label in ref_labels if label <= 11])
    # neg_tw_num = len(twarr) - pos_tw_num
    # print('hashtag in pos:', len([1 for tw in twarr[:pos_tw_num] if tw['ht'].__len__() > 0]) / pos_tw_num)
    # print('hashtag in pos = 1:', len([1 for tw in twarr[:pos_tw_num] if tw['ht'].__len__() == 1]) / pos_tw_num)
    # print('hashtag in pos = 2:', len([1 for tw in twarr[:pos_tw_num] if tw['ht'].__len__() == 2]) / pos_tw_num)
    # print('hashtag in pos >= 3:', len([1 for tw in twarr[:pos_tw_num] if tw['ht'].__len__() >= 3]) / pos_tw_num)
    # print('hashtag in neg:', len([1 for tw in twarr[pos_tw_num:] if tw['ht'].__len__() > 0]) / neg_tw_num)
    # print('hashtag in neg = 1:', len([1 for tw in twarr[pos_tw_num:] if tw['ht'].__len__() == 1]) / neg_tw_num)
    # print('hashtag in neg = 2:', len([1 for tw in twarr[pos_tw_num:] if tw['ht'].__len__() == 2]) / neg_tw_num)
    # print('hashtag in neg >= 3:', len([1 for tw in twarr[pos_tw_num:] if tw['ht'].__len__() >= 3]) / neg_tw_num)
    # print('tw num:', len(twarr), 'pos_tw_num', pos_tw_num, 'neg_tw_num', neg_tw_num)
    # print('----')
    """definitions of parameters"""
    D = len(twarr)
    V = len(key_dict)
    H = len(ht_dict)
    alpha0 = K * alpha
    beta0 = V * beta      # hyperparam for keyword_info
    gamma0 = H * gamma    # hyperparam for hashtag
    z = [0] * D
    m_z = [0] * K
    n_z_key = [0] * K
    n_z_ht = [0] * K
    n_zw_key = [[0] * V for _ in range(K)]
    n_zw_ht = [[0] * H for _ in range(K)]
    """initialize the counting arrays"""
    for d in range(D):
        cluster = int(K * np.random.random())
        z[d] = cluster
        m_z[cluster] += 1
        key_freq_dict = twarr[d]['key']
        ht_freq_dict = twarr[d]['ht']
        for key, freq in key_freq_dict.items():
            n_z_key[cluster] += freq
            n_zw_key[cluster][key_dict[key]['id']] += freq
        for ht, freq in ht_freq_dict.items():
            n_z_ht[cluster] += freq
            n_zw_ht[cluster][ht_dict[ht]['id']] += freq
    """make sampling using current counting information"""

    def rule_value_of(tw_freq_dict_, word_id_dict_, n_z_, n_zw_, p, p0, cluster):
        # i_ counts how many words of this tweet have already been placed; it starts at 0,
        # matching the denominator offset used in GSDMM_twarr below
        i_ = 0
        value = 1.0
        for w_, w_freq in tw_freq_dict_.items():
            for i in range(0, w_freq):
                value *= (n_zw_[cluster][word_id_dict_[w_]['id']] + i + p) / (n_z_[cluster] + i_ + p0)
                i_ += 1
        return value

    def sample_cluster(tw, iter=None):
        prob = [0] * K
        for k in range(K):
            prob[k] = (m_z[k] + alpha) / (D - 1 + alpha0)
            key_freq_dict = tw['key']
            ht_freq_dict = tw['ht']
            prob[k] *= rule_value_of(key_freq_dict, key_dict, n_z_key, n_zw_key, beta, beta0, k)
            prob[k] *= rule_value_of(ht_freq_dict, ht_dict, n_z_ht, n_zw_ht, gamma, gamma0, k)
        if iter is not None and iter > iter_num - 5:
            return np.argmax(prob)
        else:
            return au.sample_index(np.array(prob))

    # helper that adds (factor=1) or removes (factor=-1) a tweet's counts for the cluster
    # currently bound to the enclosing variable `cluster`
    def update_using_freq_dict(tw_freq_dict_, word_id_dict_, n_z_, n_zw_, factor):
        for w, w_freq in tw_freq_dict_.items():
            w_freq *= factor
            n_z_[cluster] += w_freq
            n_zw_[cluster][word_id_dict_[w]['id']] += w_freq

    """start iteration"""
    z_iter = list()
    for i in range(iter_num):
        z_iter.append(z[:])
        for d in range(D):
            cluster = z[d]
            m_z[cluster] -= 1
            key_freq_dict = twarr[d]['key']
            ht_freq_dict = twarr[d]['ht']
            update_using_freq_dict(key_freq_dict, key_dict, n_z_key, n_zw_key, -1)
            update_using_freq_dict(ht_freq_dict, ht_dict, n_z_ht, n_zw_ht, -1)
            cluster = sample_cluster(twarr[d], i)
            z[d] = cluster
            m_z[cluster] += 1
            update_using_freq_dict(key_freq_dict, key_dict, n_z_key, n_zw_key, 1)
            update_using_freq_dict(ht_freq_dict, ht_dict, n_z_ht, n_zw_ht, 1)
    z_iter.append(z[:])
    return z_iter
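
# Illustrative, self-contained sketch (not part of the original module): the per-cluster
# likelihood term that rule_value_of computes above is, for one tweet's word-frequency dict,
# the product over its word occurrences of (n_zw[k][w] + beta + j) / (n_z[k] + beta0 + i),
# where j counts prior occurrences of the same word and i counts all of the tweet's words
# already placed.  The counts used in the example below are made up.
def _cluster_word_term_sketch(tw_freq, n_zw_k, n_z_k, beta, beta0):
    value, i = 1.0, 0
    for w, freq in tw_freq.items():
        for j in range(freq):
            value *= (n_zw_k.get(w, 0) + beta + j) / (n_z_k + beta0 + i)
            i += 1
    return value

# Example with toy counts: a cluster that has already seen 'quake' 5 times out of 20 words
# scores the tweet {'quake': 2} much higher than a cluster that has never seen it:
# _cluster_word_term_sketch({'quake': 2}, {'quake': 5}, 20, 0.1, 100) >
# _cluster_word_term_sketch({'quake': 2}, {}, 20, 0.1, 100)
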
def GSDMM_twarr(twarr, alpha, beta, K, iter_num):
    ner_pos_token = tk.key_wordlabels
    twarr = twarr[:]
    words = dict()
    """pre-process the tweet text, including dropping non-common terms"""
    for tw in twarr:
        wordlabels = tw[ner_pos_token]
        for i in range(len(wordlabels) - 1, -1, -1):
            wordlabels[i][0] = wordlabels[i][0].lower().strip('#').strip()
            if not cs.is_valid_keyword(wordlabels[i][0]):
                del wordlabels[i]
        for wordlabel in wordlabels:
            word = wordlabel[0]
            if word in words:
                words[word]['freq'] += 1
            else:
                words[word] = {'freq': 1, 'id': len(words.keys())}
    min_df = 3
    for w in list(words.keys()):
        if words[w]['freq'] < min_df:
            del words[w]
    for idx, w in enumerate(sorted(words.keys())):
        words[w]['id'] = idx
    for tw in twarr:
        tw['dup'] = dict(Counter([wlb[0] for wlb in tw[ner_pos_token] if wlb[0] in words]))
    """definitions of parameters"""
    V = len(words.keys())
    D = len(twarr)
    alpha0 = K * alpha
    beta0 = V * beta
    z = [0] * D
    m_z = [0] * K
    n_z = [0] * K
    n_zw = [[0] * V for _ in range(K)]
    """initialize the counting arrays"""
    for d in range(D):
        cluster = int(K * np.random.random())
        z[d] = cluster
        m_z[cluster] += 1
        freq_dict = twarr[d]['dup']
        for word in freq_dict.keys():
            n_z[cluster] += freq_dict[word]
            n_zw[cluster][words[word]['id']] += freq_dict[word]
    """make sampling using current counting information"""
    small_double = 1e-150
    large_double = 1e150

    def recompute(prob, underflowcount):
        # rescale every cluster to a common exponent so their ratios are preserved
        max_count = max(underflowcount)
        return [prob[k] * (large_double ** (underflowcount[k] - max_count)) for k in range(len(prob))]

    def sample_cluster(tw, iter=None):
        prob = [0] * K
        underflowcount = [0] * K
        for k in range(K):
            prob[k] = (m_z[k] + alpha) / (D - 1 + alpha0)
            rule_value = 1.0
            i = 0
            freq_dict = tw['dup']
            for w, freq in freq_dict.items():
                for j in range(freq):
                    wid = words[w]['id']
                    rule_value *= (n_zw[k][wid] + beta + j) / (n_z[k] + beta0 + i)
                    if rule_value < small_double:
                        underflowcount[k] -= 1
                        rule_value *= large_double
                    i += 1
            prob[k] *= rule_value
        prob = recompute(prob, underflowcount)
        if iter is not None and iter > iter_num - 5:
            # take the most probable cluster in the last few iterations instead of sampling
            return np.argmax(prob)
        else:
            return au.sample_index(np.array(prob))

    """start iteration"""
    z_iter = list()
    for i in range(iter_num):
        z_iter.append(z[:])
        for d in range(D):
            cluster = z[d]
            m_z[cluster] -= 1
            freq_dict = twarr[d]['dup']
            for word in freq_dict.keys():
                wordid = words[word]['id']
                wordfreq = freq_dict[word]
                n_zw[cluster][wordid] -= wordfreq
                n_z[cluster] -= wordfreq
            cluster = sample_cluster(twarr[d], i)
            z[d] = cluster
            m_z[cluster] += 1
            for word in freq_dict.keys():
                wordid = words[word]['id']
                wordfreq = freq_dict[word]
                n_zw[cluster][wordid] += wordfreq
                n_z[cluster] += wordfreq
    z_iter.append(z[:])
    return z_iter
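
# Illustrative, self-contained sketch (not part of the original module): GSDMM_twarr above
# guards against floating-point underflow by multiplying the running product by 1e150 each
# time it falls below 1e-150 and remembering how often that happened per cluster, after which
# recompute() rescales all clusters to a common exponent.  An equivalent route works in log
# space; this sketch shows that alternative with made-up per-cluster log-probabilities.
import numpy as np

def _normalize_from_logs_sketch(log_probs):
    # subtract the max before exponentiating so the largest term becomes exp(0) = 1
    log_probs = np.asarray(log_probs, dtype=float)
    shifted = np.exp(log_probs - log_probs.max())
    return shifted / shifted.sum()

# Example: log-probabilities around -700 would underflow a plain exp(),
# but the shifted version recovers the correct relative weights:
# _normalize_from_logs_sketch([-700.0, -701.0, -705.0])
# -> roughly [0.73, 0.27, 0.005]
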
def GSDPMM_twarr(self, old_twarr, old_z, new_twarr, iter_num):
    pos_token = tk.key_ark
    twarr = old_twarr + new_twarr
    words = dict()
    """pre-process the tweet text, including dropping non-common terms"""
    for tw in twarr:
        tokens = tw[pos_token]
        for i in range(len(tokens) - 1, -1, -1):
            tokens[i][0] = tokens[i][0].lower().strip('#').strip()
            if not cs.is_valid_keyword(tokens[i][0]):
                del tokens[i]
        for wordlabel in tokens:
            word = wordlabel[0]
            if word in words:
                words[word]['freq'] += 1
            else:
                words[word] = {'freq': 1, 'id': len(words.keys())}
    min_df = 3
    for w in list(words.keys()):
        if words[w]['freq'] < min_df:
            del words[w]
    for idx, w in enumerate(sorted(words.keys())):
        words[w]['id'] = idx
    for tw in twarr:
        tw['dup'] = dict(Counter([wlb[0] for wlb in tw[pos_token] if wlb[0] in words]))
    """definitions of parameters"""
    D = len(twarr)
    V = len(words.keys())
    alpha, beta = self.alpha, self.beta
    beta0 = V * beta
    new_z = [0] * len(new_twarr)
    K = {self.max_cluid} if not old_z else set(old_z)
    m_z = dict([(k, 0) for k in K])
    n_z = dict([(k, 0) for k in K])
    n_zw = dict([(k, [0] * V) for k in K])
    """initialize the counting arrays"""
    for old_d in range(len(old_twarr)):
        old_cluid = old_z[old_d]
        m_z[old_cluid] += 1
        for word, freq in old_twarr[old_d]['dup'].items():
            n_z[old_cluid] += freq
            n_zw[old_cluid][words[word]['id']] += freq
    for new_d in range(len(new_twarr)):
        new_cluid = np.random.choice(list(K))
        new_z[new_d] = new_cluid
        m_z[new_cluid] += 1
        for word, freq in new_twarr[new_d]['dup'].items():
            n_z[new_cluid] += freq
            n_zw[new_cluid][words[word]['id']] += freq
    """make sampling using current counting information"""

    def sample_cluster(_tw, cur_iter=None):
        prob = {}
        tw_freq_dict = _tw['dup']
        for k in K:
            prob[k] = m_z[k] / (D - 1 + alpha)
            _i = 0
            for _word, _freq in tw_freq_dict.items():
                for _j in range(_freq):
                    prob[k] *= (n_zw[k][words[_word]['id']] + beta + _j) / (n_z[k] + beta0 + _i)
                    _i += 1
        # probability of opening a brand-new cluster
        new_clu_prob = alpha / (D - 1 + alpha)
        _i = 0
        for _word, _freq in tw_freq_dict.items():
            for _j in range(_freq):
                new_clu_prob *= (beta + _j) / (beta0 + _i)
                _i += 1
        prob[self.max_cluid + 1] = new_clu_prob
        cluid_arr = sorted(prob.keys())
        prob_arr = [prob[__cluid] for __cluid in cluid_arr]
        if cur_iter is not None and cur_iter >= iter_num - 1:
            return cluid_arr[np.argmax(prob_arr)]
        else:
            return cluid_arr[au.sample_index(np.array(prob_arr))]

    """start iteration"""
    for i in range(iter_num):
        for new_d in range(len(new_twarr)):
            freq_dict = new_twarr[new_d]['dup']
            old_cluid = new_z[new_d]
            new_z[new_d] = -1
            m_z[old_cluid] -= 1
            for word, freq in freq_dict.items():
                n_z[old_cluid] -= freq
                n_zw[old_cluid][words[word]['id']] -= freq
            # remove clusters that became empty
            for _cluid in list(m_z.keys()):
                if m_z[_cluid] == 0:
                    m_z.pop(_cluid)
                    n_z.pop(_cluid)
                    n_zw.pop(_cluid)
                    K.remove(_cluid)
            new_cluid = sample_cluster(new_twarr[new_d], i)
            if new_cluid > self.max_cluid:
                # a brand-new cluster was sampled; allocate its counting structures
                self.max_cluid += 1
                new_cluid = self.max_cluid
                m_z[self.max_cluid] = 0
                n_z[self.max_cluid] = 0
                n_zw[self.max_cluid] = [0] * V
                K.add(self.max_cluid)
            new_z[new_d] = new_cluid
            m_z[new_cluid] += 1
            for word, freq in freq_dict.items():
                n_z[new_cluid] += freq
                n_zw[new_cluid][words[word]['id']] += freq
    return new_z
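
# Illustrative, self-contained sketch (not part of the original module): the cluster prior
# used by GSDPMM_twarr's sample_cluster above is the usual Chinese-restaurant-process form,
# where an existing cluster k is weighted by m_z[k] / (D - 1 + alpha) and a brand-new cluster
# by alpha / (D - 1 + alpha), each then multiplied by the word-likelihood terms.  The cluster
# sizes below are made up.
def _dp_prior_sketch(cluster_sizes, alpha):
    D = sum(cluster_sizes) + 1          # the tweet being sampled counts as the D-th document
    denom = D - 1 + alpha
    existing = [m / denom for m in cluster_sizes]
    new_cluster = alpha / denom
    return existing, new_cluster

# Example: _dp_prior_sketch([50, 30, 20], alpha=1.0)
# -> ([0.495..., 0.297..., 0.198...], 0.0099...)
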