コード例 #1
0
ファイル: sgd4lda.py プロジェクト: gblackout/very_large
 def gen_alias_table(self, MH_max, perplexity=False):
     """
         Slice phi out of self.theta and run util_funcs.gen_alias_table on it.

         With perplexity=True the columns are picked with self.test_doc[2] and the
         word samples come from self.test_doc[3]. Otherwise self.current_set[2] and
         self.current_set[1] are used, self.time_bak and self.w4_cnt are updated and
         self.batch_map_4w is passed to util_funcs.gen_batch_map.

         :param MH_max: passed through to util_funcs.gen_alias_table unchanged
         :param perplexity: take the test_doc path instead of the current_set path
         :return: tuple (samples, phi); phi is theta[:, mask] / norm_const -- the
                  division by the column sums is applied only to the array handed
                  to util_funcs.gen_alias_table, not to the returned phi
     """
     # original author's note: here the phi is [:,4w]
     if perplexity:
         word_mask = self.test_doc[2]
         phi = self.theta[:, word_mask] / self.norm_const
         word_samples = self.test_doc[3]
     else:
         tic = time.time()
         sliced = self.theta[:, self.current_set[2]]
         elapsed = time.time() - tic
         # charge the slicing time to time_bak, minus a term proportional to the
         # number of sliced cells (scaled by self.apprx)
         self.time_bak += elapsed - 1.5 * self.apprx * (sliced.shape[0] * sliced.shape[1]) / 1e9

         phi = sliced / self.norm_const
         # drop the local reference to the slice before calling collect()
         sliced = None
         collect()

         # when a column count is already recorded, pass it and self.samples to
         # kill_obj (presumably releases the previous samples object -- TODO confirm;
         # self.samples is not assigned inside this method)
         if self.w4_cnt is not None:
             util_funcs.kill_obj(self.w4_cnt, self.samples)
         self.w4_cnt = phi.shape[1]

         word_mask = self.current_set[2]
         word_samples = self.current_set[1]

     # original author's note: samples has shape (w, 1e3 * I)
     samples = util_funcs.gen_obj(phi.shape[1])
     if not perplexity:
         # only the current_set path rebuilds the batch map
         util_funcs.gen_batch_map(word_mask, self.batch_map_4w, self.W)

     util_funcs.gen_alias_table(
         table_h=self.table_h,
         table_l=self.table_l,
         table_p=self.table_p,
         phi=phi / np.sum(phi, 0),
         batch_mask=word_mask,
         w_sample=word_samples,
         samples=samples,
         iter_per_update=self.samples_per_update,
         MH_max=MH_max)
     return samples, phi
コード例 #2
0
 def gen_alias_table(self, MH_max, theta, norm_const, perplexity=False):
     """
         generate alias table for fast per-token sampling

         With perplexity=True the columns of theta are picked with self.test_doc[2]
         and the word samples come from self.test_doc[3]. Otherwise
         self.current_set[2] and self.current_set[1] are used, self.time_bak and
         self.w4_cnt are updated and self.batch_map_4w is passed to
         util_funcs.gen_batch_map.

         :param MH_max: passed through to util_funcs.gen_alias_table unchanged
         :param theta: matrix whose columns are selected by the word mask
         :param norm_const: divisor applied to the selected columns
         :param perplexity: take the test_doc path instead of the current_set path
         :return: tuple (samples, phi); phi is theta[:, mask] / norm_const -- the
                  division by the column sums is applied only to the array handed
                  to util_funcs.gen_alias_table, not to the returned phi
     """
     # original author's note: here the phi is [:,4w]
     if perplexity:
         word_mask = self.test_doc[2]
         phi = theta[:, word_mask] / norm_const
         word_samples = self.test_doc[3]
     else:
         tic = time.time()
         sliced = theta[:, self.current_set[2]]
         elapsed = time.time() - tic
         # charge the slicing time to time_bak, minus a term proportional to the
         # number of sliced cells (scaled by self.apprx)
         self.time_bak += elapsed - 1.5 * self.apprx * (sliced.shape[0] * sliced.shape[1]) / 1e9

         phi = sliced / norm_const
         # drop the local reference to the slice before calling collect()
         sliced = None
         collect()

         # when a column count is already recorded, pass it and self.samples to
         # kill_obj (presumably releases the previous samples object -- TODO confirm;
         # self.samples is not assigned inside this method)
         if self.w4_cnt is not None:
             util_funcs.kill_obj(self.w4_cnt, self.samples)
         self.w4_cnt = phi.shape[1]

         word_mask = self.current_set[2]
         word_samples = self.current_set[1]

     # original author's note: samples has shape (w, 1e3 * I)
     samples = util_funcs.gen_obj(phi.shape[1])
     if not perplexity:
         # only the current_set path rebuilds the batch map
         util_funcs.gen_batch_map(word_mask, self.batch_map_4w, self.W)

     util_funcs.gen_alias_table(
         table_h=self.table_h,
         table_l=self.table_l,
         table_p=self.table_p,
         phi=phi / np.sum(phi, 0),
         batch_mask=word_mask,
         w_sample=word_samples,
         samples=samples,
         iter_per_update=self.samples_per_update,
         MH_max=MH_max)
     return samples, phi
コード例 #3
0
    def get_perp_just_in_time(self, iter, MH_max):
        """
        Compute a perplexity value on self.test_doc from the current counts.

        phi is self.nkw / self.nk restricted to the words selected by both
        self.test_doc[2] and self.mask. Topic counts for self.test_doc[0] are
        obtained through util_funcs.sample_z_par_alias_per, and the log
        probabilities of the words in self.test_doc[1] are accumulated.

        :param iter: passed to the util_funcs helpers; iter // 2 is used as
                     the burn_in argument
        :param MH_max: forwarded unchanged to the util_funcs helpers
        :return: np.exp(-log_avg_probs / num), with num the total number of
                 words in self.test_doc[1]
        """
        # ------------------------------ parameters ------------------------------
        phi_mask = np.logical_and(self.test_doc[2], self.mask)
        topic_word = np.float32(self.nkw[:, phi_mask])
        topic_total = np.float32(self.nk[:, np.newaxis])
        phi = topic_word / topic_total
        w_cnt = phi.shape[1]

        samples = util_funcs.gen_obj(w_cnt)
        table_h = np.zeros(self.K, dtype=np.int32)
        table_l = np.zeros(self.K, dtype=np.int32)
        table_p = np.zeros(self.K, dtype=np.float32)
        batch_map = np.zeros(self.V, dtype=np.int32)
        util_funcs.gen_batch_map(phi_mask, batch_map, self.V)

        # ------------------------------- sampling -------------------------------
        # the helper receives the column-normalised copy; phi itself is untouched
        util_funcs.gen_alias_table(table_h=table_h, table_l=table_l,
                                   table_p=table_p, phi=phi / np.sum(phi, 0),
                                   batch_mask=phi_mask,
                                   w_sample=self.test_doc[3], samples=samples,
                                   iter_per_update=iter, MH_max=MH_max)

        observed_docs = self.test_doc[0]
        n_docs = len(observed_docs)
        n_tokens = sum(len(doc) for doc in observed_docs)
        assignments = [None] * n_docs
        doc_topic = np.zeros((n_docs, self.K), dtype=np.int32)
        doc_topic_mean = np.zeros(doc_topic.shape, dtype=np.float32)
        burn_in = iter // 2
        init_topics = np.random.randint(self.K, size=n_tokens)

        # doc_topic_mean is presumably filled in place by the helper -- TODO confirm
        util_funcs.sample_z_par_alias_per(
            n_docs, observed_docs, assignments, w_cnt, self.K, iter, burn_in,
            self.alpha, self.alpha_bar, self.beta, self.beta_bar, doc_topic,
            doc_topic_mean, batch_map, phi, samples, MH_max, init_topics,
            phi_mask, True)

        # ------------------------------ perplexity ------------------------------
        # add alpha, then make every row sum to one
        doc_topic_mean += self.alpha
        doc_topic_mean /= np.sum(doc_topic_mean, 1)[:, np.newaxis]

        log_avg_probs = 0
        for d, held_out in enumerate(self.test_doc[1]):
            for w in held_out:
                # words rejected by self.mask contribute nothing
                if self.mask[w]:
                    log_avg_probs += np.log(
                        np.dot(doc_topic_mean[d, :], phi[:, batch_map[w]]))

        # NOTE(review): num counts every held-out word, including the ones
        # skipped by the mask test above -- confirm this is intended
        num = sum(len(doc) for doc in self.test_doc[1])
        util_funcs.kill_obj(w_cnt, samples)
        return np.exp(-log_avg_probs / num)
コード例 #4
0
ファイル: sampler_g.py プロジェクト: gblackout/very_large
    def get_perp_just_in_time(self, iter, MH_max):
        """
        Compute a perplexity value on self.test_doc from the current counts.

        phi is self.nkw / self.nk restricted to the words selected by both self.test_doc[2] and
        self.mask. Topic counts for self.test_doc[0] are obtained through
        util_funcs.sample_z_par_alias_per, and the log probabilities of the words in
        self.test_doc[1] are accumulated.

        :param iter: passed to the util_funcs helpers; iter // 2 is used as the burn_in argument
        :param MH_max: forwarded unchanged to the util_funcs helpers
        :return: np.exp(-log_avg_probs / num), with num the total number of words in self.test_doc[1]
        """
        # ------------------------------ parameters ------------------------------
        phi_mask = np.logical_and(self.test_doc[2], self.mask)
        topic_word = np.float32(self.nkw[:, phi_mask])
        topic_total = np.float32(self.nk[:, np.newaxis])
        phi = topic_word / topic_total
        w_cnt = phi.shape[1]

        samples = util_funcs.gen_obj(w_cnt)
        table_h = np.zeros(self.K, dtype=np.int32)
        table_l = np.zeros(self.K, dtype=np.int32)
        table_p = np.zeros(self.K, dtype=np.float32)
        batch_map = np.zeros(self.V, dtype=np.int32)
        util_funcs.gen_batch_map(phi_mask, batch_map, self.V)

        # ------------------------------- sampling -------------------------------
        # the helper receives the column-normalised copy; phi itself is untouched
        util_funcs.gen_alias_table(
            table_h=table_h,
            table_l=table_l,
            table_p=table_p,
            phi=phi / np.sum(phi, 0),
            batch_mask=phi_mask,
            w_sample=self.test_doc[3],
            samples=samples,
            iter_per_update=iter,
            MH_max=MH_max)

        observed_docs = self.test_doc[0]
        n_docs = len(observed_docs)
        n_tokens = sum(len(doc) for doc in observed_docs)
        assignments = [None] * n_docs
        doc_topic = np.zeros((n_docs, self.K), dtype=np.int32)
        doc_topic_mean = np.zeros(doc_topic.shape, dtype=np.float32)
        burn_in = iter // 2
        init_topics = np.random.randint(self.K, size=n_tokens)

        # doc_topic_mean is presumably filled in place by the helper -- TODO confirm
        util_funcs.sample_z_par_alias_per(
            n_docs, observed_docs, assignments, w_cnt, self.K, iter, burn_in, self.alpha, self.alpha_bar,
            self.beta, self.beta_bar, doc_topic, doc_topic_mean, batch_map, phi, samples, MH_max,
            init_topics, phi_mask, True)

        # ------------------------------ perplexity ------------------------------
        # add alpha, then make every row sum to one
        doc_topic_mean += self.alpha
        doc_topic_mean /= np.sum(doc_topic_mean, 1)[:, np.newaxis]

        log_avg_probs = 0
        for d, held_out in enumerate(self.test_doc[1]):
            for w in held_out:
                # words rejected by self.mask contribute nothing
                if self.mask[w]:
                    log_avg_probs += np.log(np.dot(doc_topic_mean[d, :], phi[:, batch_map[w]]))

        # NOTE(review): num counts every held-out word, including the ones skipped by the mask
        # test above -- confirm this is intended
        num = sum(len(doc) for doc in self.test_doc[1])
        util_funcs.kill_obj(w_cnt, samples)
        return np.exp(- log_avg_probs / num)
コード例 #5
0
ファイル: sgd4lda.py プロジェクト: gblackout/very_large
    def get_perp_just_in_time(self, MH_max):
        """
        Compute a perplexity value on self.test_doc.

        the form of test_doc is: [ [[w], [..], ...], [[test_w], [..], ..], mask[], map[] ]

        phi and the samples object come from self.gen_alias_table(perplexity=True), the per-document
        counts come from self.sample_counts on test_doc[0], and the log probabilities of the words
        in test_doc[1] are accumulated.

        :param MH_max: forwarded unchanged to gen_alias_table and sample_counts
        :return: np.exp(-log_avg_probs / num), with num the total number of words in test_doc[1]
        """
        samples, phi = self.gen_alias_table(MH_max, perplexity=True)
        util_funcs.gen_batch_map(self.test_doc[2], self.batch_map, self.W)

        observed_docs = self.test_doc[0]
        # self.batch_map is deliberately passed twice, exactly as in the positional signature
        doc_topic_mean = self.sample_counts(
            observed_docs, phi, len(observed_docs), self.samples_per_update, samples, phi.shape[1],
            self.batch_map, self.batch_map, MH_max, perplexity=True)
        collect()

        # add alpha, then make every row sum to one
        doc_topic_mean += self.alpha
        doc_topic_mean /= np.sum(doc_topic_mean, 1)[:, np.newaxis]

        log_avg_probs = 0
        for d, held_out in enumerate(self.test_doc[1]):
            for w in held_out:
                log_avg_probs += np.log(np.dot(doc_topic_mean[d, :], phi[:, self.batch_map[w]]))

        num = sum(len(doc) for doc in self.test_doc[1])
        util_funcs.kill_obj(phi.shape[1], samples)
        return np.exp(- log_avg_probs / num)
コード例 #6
0
    def get_perp_just_in_time(self, MH_max, theta=None, norm_const=None):
        """
            Compute a perplexity value on self.test_doc.

            note:
                assume the form of test_doc is: [ [[w], [..], ...], [[test_w], [..], ..], mask[], map[] ]

            :param MH_max: forwarded unchanged to gen_alias_table and sample_counts
            :param theta: matrix passed to gen_alias_table; self.theta is used when None
            :param norm_const: divisor passed to gen_alias_table; self.norm_const is used when None
            :return: np.exp(-log_avg_probs / num), with num the total number of words in test_doc[1]
        """
        # fall back to the instance state for whichever argument was not supplied
        if theta is None:
            theta = self.theta
        if norm_const is None:
            norm_const = self.norm_const

        samples, phi = self.gen_alias_table(MH_max, theta, norm_const, perplexity=True)
        util_funcs.gen_batch_map(self.test_doc[2], self.batch_map, self.W)

        observed_docs = self.test_doc[0]
        # self.batch_map is deliberately passed twice, exactly as in the positional signature
        doc_topic_mean = self.sample_counts(
            observed_docs, phi, len(observed_docs), self.samples_per_update, samples, phi.shape[1],
            self.batch_map, self.batch_map, MH_max, perplexity=True)
        collect()

        # add alpha, then make every row sum to one
        doc_topic_mean += self.alpha
        doc_topic_mean /= np.sum(doc_topic_mean, 1)[:, np.newaxis]

        log_avg_probs = 0
        for d, held_out in enumerate(self.test_doc[1]):
            for w in held_out:
                log_avg_probs += np.log(np.dot(doc_topic_mean[d, :], phi[:, self.batch_map[w]]))

        num = sum(len(doc) for doc in self.test_doc[1])
        util_funcs.kill_obj(phi.shape[1], samples)
        return np.exp(- log_avg_probs / num)