def __init__(self, V, K, rank, doc_per_set, dir='./', alpha=0.01, beta=0.0001,
             word_partition=2000, set_name='t_saved', test_doc='test_doc',
             silent=False, single=True):
    """
    dir: root folder of the per-rank data folders; the tmp file folder is created in here
    NOTICE: the mask is incomplete in the distributed (dis) setting; add it manually
    NOTICE: digits in the set name are used to identify the matching ndk, nd and z
    """
    # module-level imports assumed elsewhere in this file: time, h5py,
    # numpy as np, util_funcs, and makedirs/listdir from os
    # ******************************* store parameters ************************************************
    self.K = K
    self.V = V
    self.doc_per_set = doc_per_set
    self.suffix = time.strftime('_%m%d_%H%M%S', time.localtime()) + str(rank)
    self.data_dir = dir + str(rank) + '/'
    self.tmp_dir = dir + 'tmp' + self.suffix + '/'
    makedirs(self.tmp_dir)
    self.alpha = alpha
    self.beta = beta
    self.beta_bar = beta * V
    self.alpha_bar = alpha * K
    util_funcs.set_srand()
    self.set_name = set_name
    self.rank = rank
    self.bak_time = 0

    # ******************************* init the matrices ***********************************************
    self.name = self.tmp_dir + 'nkw' + self.suffix + '.h5'
    self.node_name = 'nkw'
    nkw_file = h5py.File(self.name, 'w')
    self.nkw = nkw_file.create_dataset('nkw', (K, self.V), dtype='int32')
    self.nk = np.zeros(K, dtype=np.int32)
    # the earlier np.ones initialization of self.mask was dead code (it was
    # immediately overwritten), so only the zeros initialization is kept
    self.mask = np.zeros(self.V, dtype=bool)
    self.test_doc = np.load(self.data_dir + test_doc + '.npy').tolist()

    # ******************************* init counts & mask **********************************************
    self.train_cts_set = []
    for file_name in listdir(self.data_dir):
        if self.set_name in file_name:
            self.train_cts_set.append(
                (file_name, np.load(self.data_dir + file_name).tolist()))

    if single:
        ndk = np.zeros((self.doc_per_set, self.K), dtype=np.int32)
        nd = np.zeros(self.doc_per_set, dtype=np.int32)
        z = np.array([None for _ in xrange(self.doc_per_set)], dtype=object)

        # fill nkw one vocabulary slice at a time to bound peak memory
        start = 0
        while start < self.V:
            end = min(start + word_partition, self.V)
            nkw_part = np.zeros((K, end - start), dtype=np.int32)
            self.init_cnts(nkw_part, ndk, nd, z, start, end, silent)
            self.nkw[:, start:end] = nkw_part
            start = end
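# ******************************* demo: chunked h5 write ******************************************
# A minimal standalone sketch of the slice-by-slice fill used above, which keeps
# peak memory at O(K * word_partition) rather than O(K * V). The file name and
# sizes are illustrative only; min() replaces the original's arithmetic clamp
# `end * (end <= V) + V * (end > V)`, which computes the same thing.
def _demo_chunked_nkw_write(K=8, V=10000, word_partition=2000):
    import h5py                     # local imports keep this sketch self-contained
    import numpy as np
    with h5py.File('nkw_demo.h5', 'w') as f:    # hypothetical demo file name
        nkw = f.create_dataset('nkw', (K, V), dtype='int32')
        start = 0
        while start < V:
            end = min(start + word_partition, V)        # clamp to the vocab size
            part = np.zeros((K, end - start), dtype=np.int32)
            # ... fill `part` from the training counts for words [start, end) ...
            nkw[:, start:end] = part                    # one slice hits disk
            start = end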
def __init__(self, H, dir, rank, D, K, W, max_len, apprx, batch_size=50, alpha=0.01,
             beta=0.0001, a=10**5.2, b=10**(-6), c=0.33, samples_per_update=50,
             test_doc='test_doc', suffix=None):
    """
    H: the value sqrt(m) * sigma^(1 + 0.3)
    dir: root folder of the per-rank data folders; the tmp file folder is created in here
    rank: indicates the subfolder where the docs live
    D: total number of docs in the local training set
    K: the number of topics
    W: the vocabulary size
    max_len: maximum number of vocabulary slices we can load into memory (ignore for the 10708 prj)
    apprx: a hack for counting the time (ignore for the 10708 prj)
    samples_per_update: how many iterations are used to approximate the expectation term in the SGLD update
    test_doc: file path of the test documents
    time_bak: time spent loading, corrected by 1.5 s; reset it to 0 every time you use it (ignore for the 10708 prj)
    train_set has the form [ [[d],[d],[d],[d]], map[], mask[], flag, [maskd[], maskd[], maskd[], maskd[]] ]
    test_doc has the form [ [[w], [..], ...], [[test_w], [..], ..], mask[], map[] ]
    """
    # module-level imports assumed elsewhere in this file: time, h5py,
    # numpy as np, util_funcs, makedirs from os, collect from gc, and
    # load from pickle (or cPickle)
    # set the related parameters
    self.K = K
    self.batch_size = batch_size
    self.step_size_params = (a, b, c)
    self.samples_per_update = samples_per_update
    self.W = W
    self.D = D
    self.H = H
    self.alpha = alpha
    self.beta = beta
    self.beta_bar = beta * self.W
    self.alpha_bar = alpha * K
    self.update_ct = 0
    self.rank = rank

    if suffix is None:
        suffix = time.strftime('_%m%d_%H%M%S', time.localtime()) + '_' + str(rank)
    self.dir = dir
    self.data_dir = dir + str(rank) + '/'
    self.tmp_dir = dir + 'tmp' + suffix + '/'
    makedirs(self.tmp_dir)

    self.current_set = None
    self.batch_loc = [0, 0]
    self.time_bak = 0
    self.apprx = apprx

    # used to map between the real w and the sliced cnts matrix in memory
    self.batch_map = np.zeros(self.W, dtype=np.int32)
    self.batch_map_4w = np.zeros(self.W, dtype=np.int32)
    self.w4_cnt = None

    util_funcs.set_srand()

    # allocate the file
    theta_file = h5py.File(self.tmp_dir + 'theta' + suffix + '.h5', 'w')
    self.theta = theta_file.create_dataset('theta', (K, self.W), dtype='float32')
    self.norm_const = np.zeros((self.K, 1), dtype=np.float32)

    # comments for the 10708 prj:
    # theta here is the T matrix referred to in the paper; the paper's theta is
    # named g_theta (global theta) in this code. theta is initialized one chunk
    # at a time for the sake of memory, and norm_const is its normalizing
    # constant, so that phi (the topic-word matrix) = theta / norm_const
    start = 0
    while start < self.W:
        end = min(start + max_len, self.W)
        tmp = np.random.gamma(1, 1, (self.K, end - start))
        collect()
        self.theta[:, start:end] = tmp
        self.norm_const[:] += np.sum(tmp, 1)[:, np.newaxis]
        start = end
    tmp = None
    collect()

    # ndk, nd and nk are count matrices similar to those used in vanilla
    # collapsed Gibbs sampling; we use them to count within each minibatch
    self.ndk = np.zeros((self.batch_size, K), dtype=np.int32)
    self.ndk_avg = np.zeros((self.batch_size, K), dtype=np.float32)
    self.nd = np.zeros(self.batch_size, dtype=np.int32)
    self.nk = np.zeros(K, dtype=np.int32)

    # alias tables for LightLDA-style fast per-token sampling; see the summary
    # in the paper for a refresher
    self.table_h = np.zeros(self.K, dtype=np.int32)
    self.table_l = np.zeros(self.K, dtype=np.int32)
    self.table_p = np.zeros(self.K, dtype=np.float32)
    self.samples = None

    with open(self.data_dir + test_doc, 'r') as f:
        self.test_doc = load(f)
    self.mask = np.ones(self.W, dtype=bool)
    self.iters_per_doc = 50
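# ******************************* demo: alias table ***********************************************
# table_h, table_l and table_p above back LightLDA's O(1) per-token proposal
# draws. This repo builds them inside util_funcs, so the layout below is an
# assumption: a generic Vose alias construction in which table_p[i] is the
# probability of keeping outcome table_l[i], with table_h[i] as the fallback
# alias. It illustrates the technique, not the exact code used here.
def _demo_build_alias(prob):
    import numpy as np              # local import keeps this sketch self-contained
    prob = np.asarray(prob, dtype=np.float64)
    K = len(prob)
    p = prob * K / prob.sum()                   # rescale so mean bucket mass is 1
    table_l = np.arange(K, dtype=np.int32)      # outcome each bucket keeps
    table_h = np.arange(K, dtype=np.int32)      # alias outcome it falls back to
    table_p = np.ones(K, dtype=np.float32)      # probability of keeping table_l
    small = [i for i in range(K) if p[i] < 1.0]
    large = [i for i in range(K) if p[i] >= 1.0]
    while small and large:
        s, l = small.pop(), large.pop()
        table_p[s] = p[s]                       # bucket s keeps s with prob p[s]...
        table_h[s] = l                          # ...and yields l otherwise
        p[l] -= 1.0 - p[s]                      # l donates mass to fill bucket s
        (small if p[l] < 1.0 else large).append(l)
    return table_l, table_h, table_p

def _demo_draw_alias(table_l, table_h, table_p):
    import numpy as np
    # O(1) sample: pick a uniform bucket, then keep it or take its alias
    i = np.random.randint(len(table_p))
    return table_l[i] if np.random.rand() < table_p[i] else table_h[i]

# usage sketch (hypothetical): any per-word proposal distribution, e.g. a
# column of phi = theta / norm_const, is a valid `prob` input:
#   l, h, p = _demo_build_alias(np.random.dirichlet([0.1] * 64))
#   topic = _demo_draw_alias(l, h, p)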