Exemple #1
0
 def input_batch_with_label(self, tw_batch, lb_batch=None):
     """Push one batch of tweets (and optionally their labels) into the window.

     The batch is first passed to ``tu.twarr_nlp`` and its length is recorded
     in ``self.history_length``. What happens next depends on how many batches
     have been seen so far:

     * fewer than ``self.hold_batch_num`` batches: the batch is only buffered;
     * first time the threshold is reached: all buffered tweets are clustered
       together with ``iter_num=30``;
     * afterwards: only the new batch is clustered (``iter_num=5``) against
       the retained tweets, then the oldest batch is dropped from the window.

     :param tw_batch: tweets of the incoming batch.
     :param lb_batch: labels parallel to ``tw_batch``, or None when unlabelled.
     :return: while buffering, ``(None, None)`` if labels were given, else
         None; otherwise the current assignments as a list of ints, paired
         with a copy of ``self.label`` when labels were given.
     """
     labelled = lb_batch is not None
     tu.twarr_nlp(tw_batch)
     self.history_length.append(len(tw_batch))

     # Still collecting batches: buffer only, no clustering yet.
     if len(self.history_length) < self.hold_batch_num:
         self.twarr.extend(tw_batch)
         if labelled:
             self.label.extend(lb_batch)
         if labelled:
             return None, None
         return None

     # Threshold reached for the first time: cluster everything buffered so far.
     if not self.init_batch_ready:
         self.init_batch_ready = True
         self.twarr += tw_batch
         if labelled:
             self.label.extend(lb_batch)
         self.z = self.GSDPMM_twarr([], [], self.twarr, iter_num=30)
         assignments = list(map(int, self.z))
         if labelled:
             return assignments, self.label[:]
         return assignments

     # Steady state: cluster the new batch against the retained window ...
     batch_z = self.GSDPMM_twarr(self.twarr, self.z, tw_batch, iter_num=5)
     self.twarr += tw_batch
     if labelled:
         self.label.extend(lb_batch)
     self.z += batch_z

     # ... then slide the window forward by discarding the oldest batch.
     dropped = self.history_length.pop(0)
     self.z = self.z[dropped:]
     self.twarr = self.twarr[dropped:]
     if labelled:
         self.label = self.label[dropped:]

     assignments = list(map(int, self.z))
     if labelled:
         return assignments, self.label[:]
     return assignments
Exemple #2
0
 def load_tw_batches(self, load_cluid_arr):
     """Load the labelled tweet batches and optionally attach cluster ids.

     All batches are read from ``self.labelled_batch_file``, merged with
     ``au.merge_array`` and handed to ``tu.twarr_nlp``. When
     ``load_cluid_arr`` is truthy, the parallel batches stored in
     ``self.cluid_batch_file`` are read and each id is written onto the
     matching tweet under ``tk.key_event_cluid``.

     :param load_cluid_arr: whether to load and attach the cluster ids.
     :return: the loaded batches (the same object returned by the loader).
     """
     batches = fu.load_array(self.labelled_batch_file)
     tu.twarr_nlp(au.merge_array(batches))
     print("twarr nlp over")
     if not load_cluid_arr:
         return batches

     id_batches = fu.load_array(self.cluid_batch_file)
     assert len(batches) == len(id_batches)
     for batch_no, tweets in enumerate(batches):
         ids = id_batches[batch_no]
         # tweets and ids must line up one-to-one inside every batch
         assert len(tweets) == len(ids)
         for pos, tweet in enumerate(tweets):
             tweet[tk.key_event_cluid] = ids[pos]
     return batches
Exemple #3
0
def multi(file):
    """Return ``[text, ent_type_, tag_]`` for every token of every tweet in *file*.

    The tweets are loaded with ``fu.load_array`` and processed by
    ``tu.twarr_nlp``; the tokens are read from each tweet's
    ``tk.key_spacy`` entry, in tweet order then token order.
    """
    tweets = tu.twarr_nlp(fu.load_array(file))
    word_type = []
    for tweet in tweets:
        word_type.extend(
            [tok.text, tok.ent_type_, tok.tag_] for tok in tweet[tk.key_spacy]
        )
    return word_type
 def pre_process_twarr(twarr, lbarr):
     """Wrap each processed tweet in a TweetHolder carrying its label.

     :param twarr: tweets; passed through ``tu.twarr_nlp`` exactly once here.
     :param lbarr: labels, indexed in parallel with the processed tweets.
     :return: list of TweetHolder objects with ``label`` set.
     """
     # every tw should only get processed once
     processed = tu.twarr_nlp(twarr)
     holders = []
     for pos, tweet in enumerate(processed):
         label = lbarr[pos]
         holder = TweetHolder(tweet)
         holder.label = label
         holders.append(holder)
     return holders
Exemple #5
0
 def load_tw_batches(self, load_cluid_arr):
     """Load the filtered tweets, optionally tag cluster ids, and batch them.

     At most the first 60000 tweets of ``self.filtered_twarr_file`` are used.
     When ``load_cluid_arr`` is truthy, ``self.filtered_cluidarr_file`` is
     read as parallel ``(tweet id, cluster id)`` pairs; each pair's tweet id
     is checked against the tweet's ``tk.key_id`` and the cluster id is
     stored under ``tk.key_event_cluid``. The tweets then go through
     ``tu.twarr_nlp`` and are split into batches of ``self.batch_size``.

     :param load_cluid_arr: whether to load and attach the cluster ids.
     :return: the batches produced by ``split_array_into_batches``.
     """
     max_tweets = 60000
     tweets = fu.load_array(self.filtered_twarr_file)[:max_tweets]
     print("load_tw_batches, len(twarr)=", len(tweets))

     if load_cluid_arr:
         id_pairs = fu.load_array(self.filtered_cluidarr_file)[:max_tweets]
         assert len(tweets) == len(id_pairs)
         for pos, tweet in enumerate(tweets):
             tweet_id = tweet[tk.key_id]
             origin_id, cluid = id_pairs[pos]
             # the id file must be in the same order as the tweet file
             assert tweet_id == origin_id
             tweet[tk.key_event_cluid] = cluid

     tweets = tu.twarr_nlp(tweets)
     batches = split_array_into_batches(tweets, self.batch_size)
     batch_sizes = [len(batch) for batch in batches]
     print("batch distrb {}, {} batches, total {} tweets".format(
         batch_sizes, len(batches), len(tweets)))
     return batches
    def GSDMM_new_twarr(self, old_twarr, old_z, new_twarr, alpha, etap, etac,
                        etav, etah, K, iter_num):
        """Assign a cluster index to every tweet in ``new_twarr``.

        Counting tables are filled from ``old_twarr`` using the fixed
        assignments in ``old_z``; every new tweet then gets a random initial
        cluster. For ``iter_num`` sweeps each new tweet is removed from the
        tables, scored against all ``K`` clusters and re-inserted. Old tweets
        are never re-assigned. In the last sweep the best-scoring cluster is
        taken (``np.argmax``); in earlier sweeps the index comes from
        ``au.sample_index``.

        Four word tables are kept in parallel, one per tweet entry
        ``self.key_prop_n`` / ``self.key_comm_n`` / ``self.key_verb`` /
        ``self.key_ht`` (presumably proper nouns, common nouns, verbs and
        hashtags -- confirm against where these keys are filled).

        :param old_twarr: tweets whose cluster assignment is kept fixed.
        :param old_z: cluster index of each tweet in ``old_twarr``; used to
            index the ``K``-length tables.
        :param new_twarr: tweets to cluster; passed through ``tu.twarr_nlp``.
        :param alpha: constant added to a cluster's tweet count when scoring.
        :param etap: constant added to word counts of the ``key_prop_n`` table.
        :param etac: constant added to word counts of the ``key_comm_n`` table.
        :param etav: constant added to word counts of the ``key_verb`` table.
        :param etah: constant added to word counts of the ``key_ht`` table.
        :param K: number of clusters.
        :param iter_num: number of sweeps over ``new_twarr``.
        :return: list with one cluster index per tweet of ``new_twarr``.
        """
        new_twarr = tu.twarr_nlp(new_twarr)
        prop_n_dict, comm_n_dict, verb_dict, ht_dict = \
            self.prop_n_dict, self.comm_n_dict, self.verb_dict, self.ht_dict
        D_old, D_new = len(old_twarr), len(new_twarr)
        D = D_old + D_new
        # vocabulary size of each of the four word dictionaries
        VP = prop_n_dict.vocabulary_size()
        VC = comm_n_dict.vocabulary_size()
        VV = verb_dict.vocabulary_size()
        VH = ht_dict.vocabulary_size()
        # priors scaled by the number of clusters / vocabulary sizes; they
        # appear in the denominators of the scoring terms below
        alpha0 = K * alpha
        etap0 = VP * etap
        etac0 = VC * etac
        etav0 = VV * etav
        etah0 = VH * etah

        new_z = [-1] * D_new  # -1 marks "not yet assigned"
        m_z = [0] * K  # number of tweets currently in each cluster
        # total word frequency per cluster, one table per word kind
        n_z_p = [0] * K
        n_z_c = [0] * K
        n_z_v = [0] * K
        n_z_h = [0] * K
        # per-cluster, per-word-id frequency, one table per word kind
        n_zw_p = [[0] * VP for _ in range(K)]
        n_zw_c = [[0] * VC for _ in range(K)]
        n_zw_v = [[0] * VV for _ in range(K)]
        n_zw_h = [[0] * VH for _ in range(K)]
        """initialize the counting arrays"""
        def update_clu_dicts_by_tw(tw, clu_id, factor=1):
            # Add (factor > 0) or remove (otherwise) all four word kinds of
            # `tw` to/from the tables of cluster `clu_id`.
            count_tw_into_tables(tw[self.key_prop_n], prop_n_dict, n_z_p,
                                 n_zw_p, clu_id, factor)
            count_tw_into_tables(tw[self.key_comm_n], comm_n_dict, n_z_c,
                                 n_zw_c, clu_id, factor)
            count_tw_into_tables(tw[self.key_verb], verb_dict, n_z_v, n_zw_v,
                                 clu_id, factor)
            count_tw_into_tables(tw[self.key_ht], ht_dict, n_z_h, n_zw_h,
                                 clu_id, factor)

        def count_tw_into_tables(tw_freq_dict_, ifd_, n_z_, n_zw_, clu_id,
                                 factor):
            # Only the sign of `factor` is used: positive adds the word
            # frequencies, anything else subtracts them.
            for word, freq in tw_freq_dict_.word_freq_enumerate():
                if factor > 0:
                    n_z_[clu_id] += freq
                    n_zw_[clu_id][ifd_.word2id(word)] += freq
                else:
                    n_z_[clu_id] -= freq
                    n_zw_[clu_id][ifd_.word2id(word)] -= freq

        # old tweets contribute to the counts with their given assignment
        for d in range(D_old):
            k = old_z[d]
            m_z[k] += 1
            update_clu_dicts_by_tw(old_twarr[d], k, factor=1)
        # new tweets start in a random cluster drawn from [0, K)
        for d in range(D_new):
            k = int(K * np.random.random())
            new_z[d] = k
            m_z[k] += 1
            update_clu_dicts_by_tw(new_twarr[d], k, factor=1)
        """make sampling using current counting information"""

        def rule_value_of(tw_freq_dict_, word_id_dict_, n_z_, n_zw_, p, p0,
                          clu_id):
            # Product, over every word occurrence of the tweet, of
            # (cluster word count + ii + p) / (cluster total + i_ + p0),
            # where ii restarts for each word and i_ runs across all words.
            # NOTE(review): i_ starts at 1.0, so the first denominator is
            # n_z + 1 + p0 rather than n_z + p0 -- confirm this offset is
            # intended.
            i_ = value = 1.0
            for word, freq in tw_freq_dict_.word_freq_enumerate():
                for ii in range(0, freq):
                    value *= (n_zw_[clu_id][word_id_dict_.word2id(word)] + ii +
                              p) / (n_z_[clu_id] + i_ + p0)
                    i_ += 1
            return value

        def sample_cluster(tw, cur_iter):
            # Unnormalised score of `tw` for each cluster: cluster-size term
            # times the four word-kind terms.
            prob = [0] * K
            for k in range(K):
                prob[k] = (m_z[k] + alpha) / (D - 1 + alpha0)
                prob[k] *= rule_value_of(tw[self.key_prop_n], prop_n_dict,
                                         n_z_p, n_zw_p, etap, etap0, k)
                prob[k] *= rule_value_of(tw[self.key_comm_n], comm_n_dict,
                                         n_z_c, n_zw_c, etac, etac0, k)
                prob[k] *= rule_value_of(tw[self.key_verb], verb_dict, n_z_v,
                                         n_zw_v, etav, etav0, k)
                prob[k] *= rule_value_of(tw[self.key_ht], ht_dict, n_z_h,
                                         n_zw_h, etah, etah0, k)
            # last sweep: pick the best cluster instead of sampling
            if cur_iter >= iter_num - 1:
                return np.argmax(prob)
            else:
                return au.sample_index(np.array(prob))

        """start iteration"""
        for i in range(iter_num):
            for d in range(D_new):
                # take the tweet out of its current cluster before scoring it
                k = new_z[d]
                m_z[k] -= 1
                update_clu_dicts_by_tw(new_twarr[d], k, factor=-1)
                # choose a cluster and put the tweet back into the tables
                k = sample_cluster(new_twarr[d], i)
                new_z[d] = k
                m_z[k] += 1
                update_clu_dicts_by_tw(new_twarr[d], k, factor=1)
        return new_z
Exemple #7
0
 def pre_process_twarr(twarr):
     """Run ``tu.twarr_nlp`` over *twarr* and wrap each result in a TweetHolder."""
     return list(map(TweetHolder, tu.twarr_nlp(twarr)))