def input_batch_with_label(self, tw_batch, lb_batch=None):
    tu.twarr_nlp(tw_batch)
    self.history_length.append(len(tw_batch))
    if len(self.history_length) < self.hold_batch_num:
        # not enough batches accumulated yet; just buffer the tweets (and labels)
        self.twarr.extend(tw_batch)
        if lb_batch is not None:
            self.label.extend(lb_batch)
        return (None, None) if lb_batch is not None else None
    elif not self.init_batch_ready:
        # the first time len(self.history_length) reaches self.hold_batch_num;
        # cluster the whole buffered window from scratch
        self.init_batch_ready = True
        self.twarr += tw_batch
        if lb_batch is not None:
            self.label.extend(lb_batch)
        self.z = self.GSDPMM_twarr(list(), list(), self.twarr, iter_num=30)
        z_ = [int(i) for i in self.z]
        return (z_, self.label[:]) if lb_batch is not None else z_
    else:
        # normal processing of a new batch: sample clusters for the new tweets
        # conditioned on the tweets already held in the window
        new_z = self.GSDPMM_twarr(self.twarr, self.z, tw_batch, iter_num=5)
        self.twarr += tw_batch
        if lb_batch is not None:
            self.label.extend(lb_batch)
        self.z += new_z
        # slide the window: drop the oldest batch and its assignments
        oldest_len = self.history_length.pop(0)
        self.z = self.z[oldest_len:]
        self.twarr = self.twarr[oldest_len:]
        if lb_batch is not None:
            self.label = self.label[oldest_len:]
        z_ = [int(i) for i in self.z]
        return (z_, self.label[:]) if lb_batch is not None else z_
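# Usage sketch (assumption): a driver loop feeding labelled batches into the
# sliding-window clustering above. `clusterer`, `tw_batches` and `lb_batches`
# are hypothetical names, not part of this repo.
def run_stream_example(clusterer, tw_batches, lb_batches):
    results = list()
    for tw_batch, lb_batch in zip(tw_batches, lb_batches):
        z, labels = clusterer.input_batch_with_label(tw_batch, lb_batch)
        if z is None:
            # window not full yet (fewer than hold_batch_num batches seen so far)
            continue
        # z and labels cover exactly the tweets currently held in the window
        results.append((z, labels))
    return results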
def load_tw_batches(self, load_cluid_arr):
    tw_batches = fu.load_array(self.labelled_batch_file)
    tu.twarr_nlp(au.merge_array(tw_batches))
    print("twarr nlp over")
    if load_cluid_arr:
        cluid_batches = fu.load_array(self.cluid_batch_file)
        assert len(tw_batches) == len(cluid_batches)
        for b_idx in range(len(tw_batches)):
            tw_batch, cluid_batch = tw_batches[b_idx], cluid_batches[b_idx]
            assert len(tw_batch) == len(cluid_batch)
            for idx in range(len(tw_batch)):
                tw, cluid = tw_batch[idx], cluid_batch[idx]
                tw[tk.key_event_cluid] = cluid
    return tw_batches
def multi(file):
    # ent_tags = {'FAC', 'GPE', 'LOC', 'ORG', 'NORP'}
    word_type = list()
    twarr = fu.load_array(file)
    twarr = tu.twarr_nlp(twarr)
    for tw in twarr:
        doc = tw[tk.key_spacy]
        for token in doc:
            word_type.append([token.text, token.ent_type_, token.tag_])
    return word_type
def pre_process_twarr(twarr, lbarr):
    # every tw should only get processed once
    twarr = tu.twarr_nlp(twarr)
    twharr = list()
    for idx in range(len(twarr)):
        tw, lb = twarr[idx], lbarr[idx]
        twh = TweetHolder(tw)
        twh.label = lb
        twharr.append(twh)
    return twharr
def load_tw_batches(self, load_cluid_arr):
    temp_len = 60000
    twarr = fu.load_array(self.filtered_twarr_file)[:temp_len]
    print("load_tw_batches, len(twarr)=", len(twarr))
    if load_cluid_arr:
        cluidarr = fu.load_array(self.filtered_cluidarr_file)[:temp_len]
        assert len(twarr) == len(cluidarr)
        for idx in range(len(twarr)):
            tw, twid = twarr[idx], twarr[idx][tk.key_id]
            origin_id, cluid = cluidarr[idx]
            assert twid == origin_id
            tw[tk.key_event_cluid] = cluid
    twarr = tu.twarr_nlp(twarr)
    tw_batches = split_array_into_batches(twarr, self.batch_size)
    print("batch distrb {}, {} batches, total {} tweets".format(
        [len(b) for b in tw_batches], len(tw_batches), len(twarr)))
    return tw_batches
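# `split_array_into_batches` is referenced above but not defined in this section.
# A minimal sketch of such a helper, assuming it simply chunks the list into
# consecutive slices of at most `batch_size` tweets; the repo's version may differ.
def split_array_into_batches(array, batch_size):
    batches = list()
    for start in range(0, len(array), batch_size):
        batches.append(array[start: start + batch_size])
    return batches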
def GSDMM_new_twarr(self, old_twarr, old_z, new_twarr, alpha, etap, etac, etav, etah, K, iter_num):
    new_twarr = tu.twarr_nlp(new_twarr)
    prop_n_dict, comm_n_dict, verb_dict, ht_dict = \
        self.prop_n_dict, self.comm_n_dict, self.verb_dict, self.ht_dict
    D_old, D_new = len(old_twarr), len(new_twarr)
    D = D_old + D_new
    VP = prop_n_dict.vocabulary_size()
    VC = comm_n_dict.vocabulary_size()
    VV = verb_dict.vocabulary_size()
    VH = ht_dict.vocabulary_size()
    alpha0 = K * alpha
    etap0 = VP * etap
    etac0 = VC * etac
    etav0 = VV * etav
    etah0 = VH * etah
    new_z = [-1] * D_new
    m_z = [0] * K
    n_z_p = [0] * K
    n_z_c = [0] * K
    n_z_v = [0] * K
    n_z_h = [0] * K
    n_zw_p = [[0] * VP for _ in range(K)]
    n_zw_c = [[0] * VC for _ in range(K)]
    n_zw_v = [[0] * VV for _ in range(K)]
    n_zw_h = [[0] * VH for _ in range(K)]

    # initialize the counting arrays
    def update_clu_dicts_by_tw(tw, clu_id, factor=1):
        # add (factor=1) or remove (factor=-1) one tweet's counts for all four word types
        count_tw_into_tables(tw[self.key_prop_n], prop_n_dict, n_z_p, n_zw_p, clu_id, factor)
        count_tw_into_tables(tw[self.key_comm_n], comm_n_dict, n_z_c, n_zw_c, clu_id, factor)
        count_tw_into_tables(tw[self.key_verb], verb_dict, n_z_v, n_zw_v, clu_id, factor)
        count_tw_into_tables(tw[self.key_ht], ht_dict, n_z_h, n_zw_h, clu_id, factor)

    def count_tw_into_tables(tw_freq_dict_, ifd_, n_z_, n_zw_, clu_id, factor):
        for word, freq in tw_freq_dict_.word_freq_enumerate():
            if factor > 0:
                n_z_[clu_id] += freq
                n_zw_[clu_id][ifd_.word2id(word)] += freq
            else:
                n_z_[clu_id] -= freq
                n_zw_[clu_id][ifd_.word2id(word)] -= freq

    # count the already-clustered tweets under their fixed assignments
    for d in range(D_old):
        k = old_z[d]
        m_z[k] += 1
        update_clu_dicts_by_tw(old_twarr[d], k, factor=1)
    # assign the new tweets to random clusters as a starting point
    for d in range(D_new):
        k = int(K * np.random.random())
        new_z[d] = k
        m_z[k] += 1
        update_clu_dicts_by_tw(new_twarr[d], k, factor=1)

    # make sampling using current counting information
    def rule_value_of(tw_freq_dict_, word_id_dict_, n_z_, n_zw_, p, p0, clu_id):
        i_ = value = 1.0
        for word, freq in tw_freq_dict_.word_freq_enumerate():
            for ii in range(0, freq):
                value *= (n_zw_[clu_id][word_id_dict_.word2id(word)] + ii + p) / (n_z_[clu_id] + i_ + p0)
                i_ += 1
        return value

    def sample_cluster(tw, cur_iter):
        prob = [0] * K
        for k in range(K):
            prob[k] = (m_z[k] + alpha) / (D - 1 + alpha0)
            prob[k] *= rule_value_of(tw[self.key_prop_n], prop_n_dict, n_z_p, n_zw_p, etap, etap0, k)
            prob[k] *= rule_value_of(tw[self.key_comm_n], comm_n_dict, n_z_c, n_zw_c, etac, etac0, k)
            prob[k] *= rule_value_of(tw[self.key_verb], verb_dict, n_z_v, n_zw_v, etav, etav0, k)
            prob[k] *= rule_value_of(tw[self.key_ht], ht_dict, n_z_h, n_zw_h, etah, etah0, k)
        if cur_iter >= iter_num - 1:
            # final iteration: take the most probable cluster deterministically
            return np.argmax(prob)
        else:
            return au.sample_index(np.array(prob))

    # start iteration: Gibbs sampling over the new tweets only
    for i in range(iter_num):
        for d in range(D_new):
            k = new_z[d]
            m_z[k] -= 1
            update_clu_dicts_by_tw(new_twarr[d], k, factor=-1)
            k = sample_cluster(new_twarr[d], i)
            new_z[d] = k
            m_z[k] += 1
            update_clu_dicts_by_tw(new_twarr[d], k, factor=1)
    return new_z
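# `au.sample_index` is used above to draw a cluster index in proportion to the
# unnormalized probabilities. A minimal sketch of such a helper, assuming it only
# performs categorical sampling; the repo's actual implementation may differ.
import numpy as np

def sample_index(prob):
    prob = np.asarray(prob, dtype=float)
    prob = prob / prob.sum()  # normalize the unnormalized weights
    return int(np.random.choice(len(prob), p=prob))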
def pre_process_twarr(twarr):
    twarr = tu.twarr_nlp(twarr)
    return [TweetHolder(tw) for tw in twarr]