def gen_lda_model(self, vectorizer, n_topics, finalCorpus, update_mat=False):
    # vectorizer = gen_feature_vectorizer(self, t)
    t0 = time()
    # print(finalCorpus)
    if update_mat or self.tfMatrix is None:
        # Don't re-create the term matrix every time we generate an lda_model
        # (e.g. for different topic numbers), unless this is a new vectorizer or a new corpus.
        t0 = time()
        # print('[gen_lda_model] finalCorpus:', len(finalCorpus), type(finalCorpus),
        #       len(finalCorpus[0]), finalCorpus.columns, "||", finalCorpus[0][1])
        # Try this instead:
        # finalCorpus = ['This is the first document.', 'This is the second second document.',
        #                'And the third one.', 'Is this the first document?']
        # finalCorpus = finalCorpus.iloc[0:5]
        self.tfMatrix = vectorizer.fit_transform(finalCorpus)
        # transformer = TfidfTransformer()  # would be a substitute for TfidfVectorizer, but already using it...
        # self.tfMatrix = transformer.fit_transform(TermDocMatrix)
        print("[tffeature]: gen_lda_model: transform done in %0.3fs." % (time() - t0))
        print("[tffeature]: gen_lda_model: tfMatrix shape:", self.tfMatrix.shape)
        self.tfMatrix = normalize(self.tfMatrix, norm='l1', axis=1)

    # Rules of thumb:
    #   alpha >> beta  - simpler topic distributions, with more topic weight appearing in documents
    #   alpha << beta  - more complicated topic distributions, with less topic weight appearing in documents
    #   alpha, beta small - contention between simpler topics, but fewer topics per document
    #   alpha, beta big   - contention between complicated topics, but many topics per document
    # Default: alpha = beta = 1/n_topics (which, depending on the number of topics, can be big or small).
    alpha = 1.0 / float(n_topics)
    beta = (1.0 / float(n_topics)) ** 2
    # beta = 1.0 / float(n_topics)

    # NOTE: newer scikit-learn releases renamed n_topics to n_components.
    lda_model = LatentDirichletAllocation(n_topics=n_topics, max_iter=20,
                                          learning_method='online',
                                          random_state=0, n_jobs=-1)
    t0 = time()
    lda_model.fit(self.tfMatrix)
    print("[tffeature]: gen_lda_model: topics done in %0.3fs." % (time() - t0))
    print("\n[tffeature]: gen_lda_model: Topics in LDA model:")
    print_top_words(lda_model, vectorizer.get_feature_names(), 20)
    return lda_model
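A minimal sketch (not part of the original function) of how the alpha/beta priors computed above could actually be wired into scikit-learn's LDA, which the snippet never does: recent scikit-learn exposes them as doc_topic_prior and topic_word_prior, and uses n_components instead of n_topics. The helper name below is an illustrative assumption.

import sklearn
from sklearn.decomposition import LatentDirichletAllocation

def build_lda_with_priors(n_topics, random_state=0):
    # Same rule-of-thumb values as in gen_lda_model above.
    alpha = 1.0 / n_topics          # document-topic prior
    beta = (1.0 / n_topics) ** 2    # topic-word prior
    return LatentDirichletAllocation(n_components=n_topics,
                                     doc_topic_prior=alpha,
                                     topic_word_prior=beta,
                                     max_iter=20,
                                     learning_method='online',
                                     random_state=random_state,
                                     n_jobs=-1)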
def main():
    # Report url
    all_reports_url = 'http://www.gov.cn/guowuyuan/baogao.htm'
    x = html_parse(gov_url=all_reports_url)
    # plot_word_cloud(x.corpus[2], max_words=30)

    # LDA
    n_features = 1000

    # Load the stop-word list and remove stop words
    stop_word_txt = "中文停用词表.txt"
    with open(stop_word_txt, 'rb') as fp:
        stopword = fp.read().decode('utf-8')
    stopword_list = stopword.splitlines()

    tf_vectorizer = CountVectorizer(stop_words=stopword_list, max_features=n_features)
    tf = tf_vectorizer.fit_transform(x.corpus)

    n_topics = 5
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=500, learning_method='batch')
    lda.fit(tf)

    # Get feature names
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words=20)
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default="configs/1k.yaml",
                        help="Which configuration to use. See into 'config' folder")
    opt = parser.parse_args()

    with open(opt.config, 'r') as ymlfile:
        config = yaml.load(ymlfile, Loader=yaml.FullLoader)
    print(config)

    # dataset
    config_dataset = config['dataset']
    data_path = osp.join(config_dataset['folder-path'], config_dataset['data-file'])
    vocab_path = osp.join(config_dataset['folder-path'], config_dataset['vocab-file'])
    seedword_path = osp.join(config_dataset['folder-path'], config_dataset['sw-file'])
    labels = config_dataset['labels']

    # model
    config_model = config['model']
    n_encoder_1 = config_model['n_encoder_1']
    n_encoder_2 = config_model['n_encoder_2']
    n_latent = config_model['n_latent']
    dr = config_model['dropout']
    ld = config_model['lambda']
    al = config_model['alpha']

    # training
    config_training = config['training']
    lr = config_training['lr']
    bs = config_training['bs']
    d_step = config_training['d_step']
    epochs = config_training['epochs']
    n_topwords = config_training['n_topwords']
    ratio = config_training['ratio']
    exp = config_training['exp']
    result = config_training['result']
    write = config_training['write']

    # create result folders
    os.makedirs(result, exist_ok=True)

    dataset = np.load(data_path)
    with open(vocab_path, 'rb') as vocabfile:
        vocab = pickle.load(vocabfile)

    dataset_x, dataset_y = dataset[:, 0], dataset[:, 1]
    id_vocab = utils.sort_values(vocab)
    vocab_size = len(vocab)
    seedwords = utils.read_seedword(seedword_path)

    tfms_x = [Onehotify(vocab_size)]
    tfms_y = []

    # Split data
    sss = StratifiedShuffleSplit(n_splits=exp, test_size=ratio, random_state=0)
    splitted_data = sss.split(dataset_x, dataset_y)

    gamma_prior = np.zeros((vocab_size, n_latent))
    gamma_prior_batch = np.zeros((bs, vocab_size, n_latent))
    for idx_topic, seed_topic in enumerate(seedwords):
        for idx_word, seed_word in enumerate(seed_topic):
            idx_vocab = vocab[seed_word]
            gamma_prior[idx_vocab, idx_topic] = 1.0   # V x K
            gamma_prior_batch[:, idx_vocab, :] = 1.0  # N x V x K

    model = AVIAD(n_encoder_1, n_encoder_2, vocab_size, n_latent,
                  gamma_prior=gamma_prior, ld=ld, al=al, lr=lr, dr=dr)

    for ds_train_idx, ds_test_idx in splitted_data:
        train_ds = URSADataset(dataset_x[ds_train_idx], dataset_y[ds_train_idx], tfms_x, tfms_y)
        test_ds = URSADataset(dataset_x[ds_test_idx], dataset_y[ds_test_idx], tfms_x, tfms_y)
        train_dl, test_dl = DataLoader(train_ds, bs, False), DataLoader(test_ds, bs, False)

        beta = None
        for epoch in range(epochs):
            avg_cost = 0.
            sum_t_c = 0.
            for batch_train_x, batch_train_y in train_dl:
                t_c = time.time()
                cost, beta = model.partial_fit(batch_train_x, gamma_prior_batch)
                c_elap = time.time() - t_c

                # Compute average loss
                avg_cost += cost / len(train_ds) * bs
                # Compute avg time
                sum_t_c += c_elap

                if np.isnan(avg_cost):
                    print('Encountered NaN, stopping training. '
                          'Please check the learning_rate settings and the momentum.')
                    sys.exit()

            # Display logs per epoch step
            if (epoch + 1) % d_step == 0:
                print("##################################################")
                print("Epoch:", '%04d' % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))
                utils.print_top_words(epoch + 1, beta, id_vocab, n_topwords, result, write)
                print("##################################################")
            print("epoch={}, cost={:.9f}, sum_t_c={:.2f}".format(epoch + 1, avg_cost, sum_t_c))

        gamma = model.gamma_test()
        utils.print_gamma(gamma, vocab, seedwords)
        utils.classification_evaluate_dl(model, train_dl, n_latent, labels, show=True)
        utils.classification_evaluate_dl(model, test_dl, n_latent, labels, show=True)
        utils.print_top_words(epoch + 1, beta, id_vocab, n_topwords, result, write)
        utils.calc_perp(model, test_dl, gamma_prior_batch)
def train(sess, model, train_url, test_url, batch_size, vocab_size,
          training_epochs=200, alternate_epochs=1,  # 10
          lexicon=[], result_file='test.txt', B=1, warm_up_period=100):
    """train nvdm model."""
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)

    # hold-out development dataset
    train_size = len(train_set)
    validation_size = int(train_size * 0.1)
    dev_set = train_set[:validation_size]
    dev_count = train_count[:validation_size]
    train_set = train_set[validation_size:]
    train_count = train_count[validation_size:]
    print('sizes', train_size, validation_size, len(dev_set), len(train_set))

    optimize_jointly = True
    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)

    warm_up = 0
    start_min_alpha = 0.00001
    min_alpha = start_min_alpha
    warm_up_alpha = False
    start_B = 4
    curr_B = B

    # for early stopping
    best_print_ana_ppx = 1e10
    early_stopping_iters = 30
    no_improvement_iters = 0
    stopped = False
    epoch = -1

    # for epoch in range(training_epochs):
    while not stopped:
        epoch += 1
        train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
        if warm_up < 1.:
            warm_up += 1. / warm_up_period
        else:
            warm_up = 1.

        # train
        # for switch in range(0, 2):
        if optimize_jointly:
            optim = model.optim_all
            print_mode = 'updating encoder and decoder'
        elif switch == 0:
            optim = model.optim_dec
            print_mode = 'updating decoder'
        else:
            optim = model.optim_enc
            print_mode = 'updating encoder'

        for i in range(alternate_epochs):
            loss_sum = 0.0
            ana_loss_sum = 0.0
            ppx_sum = 0.0
            kld_sum = 0.0
            ana_kld_sum = 0.0
            word_count = 0
            doc_count = 0
            recon_sum = 0.0
            for idx_batch in train_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    train_set, train_count, idx_batch, vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask,
                              model.keep_prob.name: 0.75, model.warm_up.name: warm_up,
                              model.min_alpha.name: min_alpha, model.B.name: curr_B}
                _, (loss, recon, kld, ana_loss, ana_kld) = sess.run(
                    (optim, [model.true_objective, model.recons_loss, model.kld,
                             model.analytical_objective, model.analytical_kld]),
                    input_feed)
                loss_sum += np.sum(loss)
                ana_loss_sum += np.sum(ana_loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                ana_kld_sum += np.sum(ana_kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                # to avoid nan error
                count_batch = np.add(count_batch, 1e-12)
                # per document loss
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
                recon_sum += np.sum(recon)

            print_loss = recon_sum / len(train_batches)
            dec_vars = utils.variable_parser(tf.trainable_variables(), 'decoder')
            phi = dec_vars[0]
            phi = sess.run(phi)
            utils.print_top_words(phi, lexicon, result_file=None)

            print_ppx = np.exp(loss_sum / word_count)
            print_ana_ppx = np.exp(ana_loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(train_batches)
            print_ana_kld = ana_kld_sum / len(train_batches)
            print('| Epoch train: {:d} |'.format(epoch + 1), print_mode, '{:d}'.format(i),
                  '| Corpus ppx: {:.5f}'.format(print_ppx),          # perplexity for all docs
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity per doc
                  '| KLD: {:.5}'.format(print_kld),
                  '| Loss: {:.5}'.format(print_loss),
                  '| ppx anal.: {:.5f}'.format(print_ana_ppx),
                  '|KLD anal.: {:.5f}'.format(print_ana_kld))

        if warm_up_alpha:
            if min_alpha > 0.0001:
                min_alpha -= (start_min_alpha - 0.0001) / training_epochs

        # -------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        recon_sum = 0.0
        print_ana_ppx = 0.0
        ana_loss_sum = 0.0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask,
                          model.keep_prob.name: 1.0, model.warm_up.name: 1.0,
                          model.min_alpha.name: min_alpha, model.B.name: B}
            loss, recon, kld, ana_loss = sess.run(
                [model.objective, model.recons_loss,
                 model.analytical_kld, model.analytical_objective],
                input_feed)
            loss_sum += np.sum(loss)
            ana_loss_sum += np.sum(ana_loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
            recon_sum += np.sum(recon)

        print_ana_ppx = np.exp(ana_loss_sum / word_count)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print_loss = recon_sum / len(dev_batches)

        if print_ana_ppx < best_print_ana_ppx:
            no_improvement_iters = 0
            best_print_ana_ppx = print_ana_ppx
            # check on validation set; if ppx is better -> save improved model
            tf.train.Saver().save(sess, 'models/improved_model_bernoulli')
        else:
            no_improvement_iters += 1
            print('no_improvement_iters', no_improvement_iters, 'best ppx', best_print_ana_ppx)
            if no_improvement_iters >= early_stopping_iters:
                # if the model has not improved for 30 iterations, stop training
                ###########STOP TRAINING############
                stopped = True
                print('stop training after', epoch, 'iterations, no_improvement_iters', no_improvement_iters)
                ###########LOAD BEST MODEL##########
                print('load stored model')
                tf.train.Saver().restore(sess, 'models/improved_model_bernoulli')

        print('| Epoch dev: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld),
              '| Loss: {:.5}'.format(print_loss))

        # -------------------------------
        # test
        # if epoch % 10 == 0 or epoch == training_epochs - 1:
        if FLAGS.test:
            # if epoch == training_epochs - 1:
            if stopped:
                # only do it once at the end
                coherence = utils.topic_coherence(test_set, phi, lexicon)
                print('topic coherence', str(coherence))

            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            recon_sum = 0.0
            ana_loss_sum = 0.0
            ana_kld_sum = 0.0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask,
                              model.keep_prob.name: 1.0, model.warm_up.name: 1.0,
                              model.min_alpha.name: min_alpha, model.B.name: B}
                loss, recon, kld, ana_loss, ana_kld = sess.run(
                    [model.objective, model.recons_loss, model.kld,
                     model.analytical_objective, model.analytical_kld],
                    input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                ana_loss_sum += np.sum(ana_loss)
                ana_kld_sum += np.sum(ana_kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
                recon_sum += np.sum(recon)

            print_loss = recon_sum / len(test_batches)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print_ana_ppx = np.exp(ana_loss_sum / word_count)
            print_ana_kld = ana_kld_sum / len(train_batches)
            print('| Epoch test: {:d} |'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld),
                  '| Loss: {:.5}'.format(print_loss),
                  '| ppx anal.: {:.5f}'.format(print_ana_ppx),
                  '|KLD anal.: {:.5f}'.format(print_ana_kld))
clf = LatentDirichletAllocation(n_topics)  # positional arg = number of topics (n_components in newer scikit-learn)
count_vect = CountVectorizer(stop_words=stop_words)

print('Fitting count_vectorizer')
mat = count_vect.fit_transform(mc.get_text())
print('Done!')
print('-' * 100)

print('Fitting LDA model')
clf.fit(mat)
print('Done!')
print('-' * 100)

print('Perplexity = {}'.format(clf.perplexity(mat)))
print('-' * 100)

print('Topics:')
print_top_words(clf, count_vect.get_feature_names(), 10)
print('-' * 100)

mc.disconnect()

print('Saving to ' + save_path)
if not exists(save_path):
    print('Path doesn\'t exist, creating one')
    makedirs(save_path)
with open(save_path + 'lda_model_' + str(int(time())) + '.pickle', 'wb') as f:
    pickle.dump(clf, f)
print('-' * 100)
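A minimal sketch (not in the original script) of reloading the pickled model later. The filename and timestamp are illustrative assumptions; note that count_vect is not pickled above, so the vectorizer would also need to be saved separately if feature names are required at load time.

import pickle

# Hypothetical path produced by the save step above.
with open(save_path + 'lda_model_1514764800.pickle', 'rb') as f:
    clf_loaded = pickle.load(f)
# The reloaded model can be reused for inference on a new document-term matrix.
doc_topic = clf_loaded.transform(mat)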
print("preprocessing data") df = utils.preprocess_data(df, analyzer, tt) df.to_csv("data/tesi_US_preprocessed.csv", index=None) else: print("loading preprocessed data") df = pd.read_csv("data/tesi_US_preprocessed.csv") print("training vectorizer") TDmat = cv.fit_transform(df['preprocessed']) joblib.dump(cv, "models/cv_{}.pkl".format(n_features)) if isinstance(n_topics, list): topic_numbers = n_topics else: topic_numbers = [n_topics] for num in topic_numbers: lda = LatentDirichletAllocation(n_components=num, max_iter=12, learning_method='online', learning_offset=30., random_state=0, n_jobs=6) print("training lda with {} topics".format(num)) lda.fit(cv.transform(df['preprocessed'])) utils.print_top_words(lda, cv.get_feature_names(), n_top_words) joblib.dump(lda, "models/lda_{}_{}.pkl".format(num, n_features)) utils.visualize_lda(lda, TDmat, cv, True, "html/lda_{}_{}.html".format(num, n_features))
def train(self):
    logging.info('Training...')
    best_hr = [0.]
    best_ndcg = [0.]

    # file to store the generated topic words
    if os.path.exists('res/topics_' + self.dataset + '.txt'):
        os.remove('res/topics_' + self.dataset + '.txt')
        logging.info('Successfully remove existing topic file!')

    batch_num = int(len(self.train_pairs) / self.batch_size) + 1
    for epoch in range(self.max_epoch):
        if (epoch + 1) % 20 == 0:
            self.init_lr = self.init_lr * self.lr_decay
        logging.info('Training at epoch ' + str(epoch + 1) + ' ...')
        loss_total, gen_loss_total, latent_loss_total, reg_loss_total, cf_loss_total = 0., 0., 0., 0., 0.
        for batch in range(batch_num):
            batch_data, batch_labels = utils.load_batch(
                self.train_pairs, self.train_labels, batch, self.batch_size)
            batch_data = np.transpose(batch_data)
            docu1 = batch_data[0]
            docu2 = batch_data[1]
            docus = np.concatenate((docu1, docu2), axis=0)
            get_emb = [
                list(range(len(docu1))),
                list(range(len(docu1), len(docu1) + len(docu2)))
            ]
            feed_dict = {
                self.batch_data: self.doc_contents[docus],
                self.batch_labels: np.array(batch_labels),
                self.learning_rate: self.init_lr,
                self.keep_prob: 1.,
                self.get_emb: get_emb
            }
            _, loss_tmp, gen_loss_tmp, latent_loss_tmp, reg_loss_tmp, cf_loss_tmp = self.sess.run(
                (self.train_op, self.loss, self.gen_loss, self.latent_loss,
                 self.reg_loss, self.cf_loss),
                feed_dict=feed_dict)
            loss_total += loss_tmp
            gen_loss_total += gen_loss_tmp
            latent_loss_total += latent_loss_tmp
            reg_loss_total += reg_loss_tmp
            cf_loss_total += cf_loss_tmp

        if (epoch + 1) % self.trained_print_step == 0:
            logging.info(
                'Epoch {0}: avg batch loss = {1}, gen loss = {2}, latent loss = {3}, reg loss = {4}, cf loss = {5}\n'
                .format(epoch + 1, loss_total / batch_num,
                        gen_loss_total / batch_num,
                        latent_loss_total / batch_num,
                        reg_loss_total / batch_num,
                        1000. * cf_loss_total / batch_num))

        if (epoch + 1) % self.test_step == 0:
            logging.info('Testing at epoch ' + str(epoch + 1) + ' ...')
            z_test = self.sess.run(self.z,
                                   feed_dict={
                                       self.batch_data: self.doc_contents,
                                       self.keep_prob: 1.0
                                   })
            feed_dict = {self.z_test: z_test, self.keep_prob: 1.0}
            # ave_rank, ave_auc = self._auc_test(feed_dict)
            # logging.info('ave rank = ' + str(ave_rank) + ', ave auc = ' + str(ave_auc) + '\n')
            hits, ndcgs = self._hit_test(feed_dict)
            logging.info('HR = ' + str(hits))
            logging.info('NDCGS = ' + str(ndcgs) + '\n')
            if best_hr[-1] < hits[-1]:
                best_hr = hits
            if best_ndcg[-1] < ndcgs[-1]:
                best_ndcg = ndcgs

        if (epoch + 1) % self.print_words_step == 0:
            utils.print_top_words(self.sess.run(self.weights_words),
                                  self.vocab, self.dataset)

    logging.info('BEST HR = ' + str(best_hr))
    logging.info('BEST NDCGS = ' + str(best_ndcg) + '\n\n\n')
def get_top_N_topic_terms(self, N, lda_model, vectorizer):
    top_words_list = print_top_words(lda_model, vectorizer.get_feature_names(), N)
    flat_top_words_list = set(
        [item for sublist in top_words_list for item in sublist])
    return list(flat_top_words_list)
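get_top_N_topic_terms relies on print_top_words returning a list of per-topic word lists. A minimal sketch of such a helper, compatible with the scikit-learn style calls above (model, feature_names, n_top_words); this is an assumed implementation, not necessarily the one these snippets use.

def print_top_words(model, feature_names, n_top_words):
    top_words_list = []
    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights in this topic.
        top_idx = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_idx]
        print("Topic #%d: %s" % (topic_idx, " ".join(top_words)))
        top_words_list.append(top_words)
    return top_words_list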
def _train_all(self, train_doc, x_valid):
    logging.info('Combined pre-training...')
    tf.reset_default_graph()
    input_dim = train_doc.shape[1]  # the same as self.layers_list[-1]

    with tf.variable_scope('inference'):
        rec = {
            'W1': tf.get_variable(name='W1',
                                  initializer=tf.constant(self.encode_weights[0]),
                                  dtype=tf.float32),
            'b1': tf.get_variable(name='b1',
                                  initializer=tf.constant(self.encode_biases[0]),
                                  dtype=tf.float32)
        }
        for layer_id in range(1, len(self.layers_list)):
            key_w = 'W' + str(layer_id + 1)
            key_b = 'b' + str(layer_id + 1)
            rec[key_w] = tf.get_variable(
                name=key_w,
                initializer=tf.constant(self.encode_weights[layer_id]),
                dtype=tf.float32)
            rec[key_b] = tf.get_variable(
                name=key_b,
                initializer=tf.constant(self.encode_biases[layer_id]),
                dtype=tf.float32)
        rec['W_z_mean'] = tf.get_variable(
            name='W_z_mean',
            initializer=tf.constant(self.encode_weights[-2]),
            dtype=tf.float32)
        rec['b_z_mean'] = tf.get_variable(
            name='b_z_mean',
            initializer=tf.constant(self.encode_biases[-2]),
            dtype=tf.float32)
        rec['W_z_log_sigma'] = tf.get_variable(
            name='W_z_log_sigma',
            initializer=tf.constant(self.encode_weights[-1]),
            dtype=tf.float32)
        rec['b_z_log_sigma'] = tf.get_variable(
            name='b_z_log_sigma',
            initializer=tf.constant(self.encode_biases[-1]),
            dtype=tf.float32)

    with tf.variable_scope('generation'):
        gen = {}
        key_w = 'Wz'
        key_b = 'bz'
        gen[key_w] = tf.get_variable(name=key_w,
                                     initializer=tf.constant(self.decode_weights[-1]),
                                     dtype=tf.float32)
        gen[key_b] = tf.get_variable(name=key_b,
                                     initializer=tf.constant(self.decode_biases[-1]),
                                     dtype=tf.float32)
        for layer_id in reversed(range(1, len(self.layers_list))):
            key_w = 'W' + str(layer_id + 1)
            key_b = 'b' + str(layer_id + 1)
            gen[key_w] = tf.transpose(rec[key_w])
            gen[key_b] = rec['b' + str(layer_id)]
        gen['W1'] = tf.transpose(rec['W1'])
        gen['b1'] = tf.get_variable(
            'b1', shape=[input_dim],
            initializer=tf.constant_initializer(0.),
            dtype=tf.float32)

    for key in rec:
        self.weights.append(rec[key])
    self.weights += [gen['Wz'], gen['bz'], gen['b1']]
    self.saver = tf.train.Saver(self.weights)

    doc_x_ = tf.placeholder(name='doc_x_', shape=[None, input_dim], dtype=tf.float32)
    net = utils.activate(tf.matmul(doc_x_, rec['W1']) + rec['b1'],
                         activator=self.activations[0])
    learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')
    for layer_id in range(1, len(self.layers_list)):
        key_w = 'W' + str(layer_id + 1)
        key_b = 'b' + str(layer_id + 1)
        net = utils.activate(tf.matmul(net, rec[key_w]) + rec[key_b],
                             activator=self.activations[layer_id])

    z_mean = tf.matmul(net, rec['W_z_mean']) + rec['b_z_mean']
    z_log_sigma_sq = tf.matmul(net, rec['W_z_log_sigma']) + rec['b_z_log_sigma']
    eps = tf.random_normal((self.batch_size, self.hidden_dim), 0, 1, seed=0, dtype=tf.float32)
    z = z_mean + tf.sqrt(tf.maximum(tf.exp(z_log_sigma_sq), 1e-10)) * eps

    net = utils.activate(tf.matmul(z, gen['Wz']) + gen['bz'],
                         activator=self.activations[-1])
    self.weights_words = gen['Wz']
    for layer_id in reversed(range(1, len(self.layers_list))):
        key_w = 'W' + str(layer_id + 1)
        key_b = 'b' + str(layer_id + 1)
        net = utils.activate(tf.matmul(net, gen[key_w]) + gen[key_b],
                             activator=self.activations[layer_id])
        self.weights_words = tf.matmul(self.weights_words, gen[key_w])
    x_recon = tf.squeeze(tf.matmul(net, gen['W1']) + gen['b1'])
    self.weights_words = tf.matmul(self.weights_words, gen['W1'])

    gen_loss = tf.reduce_mean(
        tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=doc_x_, logits=x_recon),
                      axis=1))
    latent_loss = 0.5 * tf.reduce_mean(
        tf.reduce_sum(tf.square(z_mean) + tf.exp(z_log_sigma_sq) - z_log_sigma_sq - 1,
                      axis=1))
    loss = gen_loss + latent_loss
    train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    batch_num = int(len(train_doc) / self.batch_size) + 1
    for epoch in range(self.max_epoch * 2):
        loss_total, gen_loss_total, latent_loss_total = 0., 0., 0.
        for batch in range(batch_num):
            batch_x_, idx = utils.get_batch(train_doc, self.batch_size)
            feed_dict = {doc_x_: batch_x_, learning_rate: self.init_lr}
            _, loss_tmp, gen_loss_tmp, latent_loss_tmp = sess.run(
                (train_op, loss, gen_loss, latent_loss), feed_dict=feed_dict)
            loss_total += loss_tmp
            gen_loss_total += gen_loss_tmp
            latent_loss_total += latent_loss_tmp

        if (epoch + 1) % self.print_step == 0:
            if x_valid:
                valid_loss = self._validate_test(train_doc, sess, gen_loss, doc_x_, learning_rate)
                logging.info(
                    'Epoch {0}: avg batch loss = {1}, gen loss = {2}, latent loss = {3}, valid loss = {4}'
                    .format(epoch + 1, loss_total / batch_num,
                            gen_loss_total / batch_num,
                            latent_loss_total / batch_num, valid_loss))
            else:
                logging.info(
                    'Epoch {0}: avg batch loss = {1}, gen loss = {2}, latent loss = {3}'
                    .format(epoch + 1, loss_total / batch_num,
                            gen_loss_total / batch_num,
                            latent_loss_total / batch_num))

        # print out the topic words generated by the stacked variational auto-encoder
        if (epoch + 1) % self.print_words_step == 0:
            utils.print_top_words(sess.run(self.weights_words), self.vocab, self.dataset)

    self.saver.save(sess, self.pretrain_dir)
    logging.info('Weights saved at ' + self.pretrain_dir)
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default="configs/1k.yaml",
                        help="Which configuration to use. See into 'config' folder")
    opt = parser.parse_args()

    with open(opt.config, 'r') as ymlfile:
        config = yaml.load(ymlfile, Loader=yaml.FullLoader)
    print(config)

    # dataset
    config_dataset = config['dataset']
    folder_path = config_dataset['folder_path']
    data_path = osp.join(folder_path, config_dataset['data_file'])
    vocab_path = osp.join(folder_path, config_dataset['vocab_file'])
    labels = config_dataset['labels']
    maxlen = config_dataset['maxlen']
    num_classes = len(labels)

    # model
    config_model = config['model']
    n_encoder_1 = config_model['n_encoder_1']
    n_encoder_2 = config_model['n_encoder_2']
    n_latent = config_model['n_latent']
    n_sentiment = config_model['n_sentiment']
    dr = config_model['dropout']
    ld = config_model['lambda']
    al = config_model['alpha']

    # training
    config_training = config['training']
    lr = config_training['lr']
    cls_lr = config_training['cls_lr']
    bs = config_training['bs']
    d_step = config_training['d_step']
    epochs = config_training['epochs']
    n_topwords = config_training['n_topwords']
    n_labeled = config_training['n_labeled']
    ratio = config_training['ratio']
    exp = config_training['exp']
    result = config_training['result']
    write = config_training['write']

    # create result folders
    os.makedirs(result, exist_ok=True)

    dataset = np.load(data_path)
    with open(vocab_path, 'rb') as vocabfile:
        vocab = pickle.load(vocabfile)

    (dataset_train_x, dataset_train_y), (dataset_test_x, dataset_test_y) = dataset
    id_vocab = utils.sort_values(vocab)
    vocab_size = len(vocab)

    tfms_unlabeled_x = [Onehotify(vocab_size)]
    tfms_labeled_x = [Padify(maxlen)]
    tfms_y = [YOnehotify(num_classes)]

    network_architecture = dict(
        n_hidden_recog_1=n_encoder_1,  # 1st layer encoder neurons
        n_hidden_recog_2=n_encoder_2,  # 2nd layer encoder neurons
        n_hidden_gener_1=vocab_size,   # 1st layer decoder neurons
        n_input=vocab_size,            # input dimension (bag-of-words over the vocabulary)
        n_input_pi=maxlen,
        n_z=n_latent,
        n_p=num_classes)
    model = AVIJST(network_architecture,
                   learning_rate=lr,
                   cls_learning_rate=cls_lr,
                   batch_size=bs)

    # Split data
    sss = StratifiedShuffleSplit(n_splits=exp, test_size=n_labeled, random_state=0)
    splitted_train = sss.split(dataset_train_x, dataset_train_y)

    for _, ds_train_labeled_idx in splitted_train:
        train_unlabeled_ds = Dataset(np.concatenate((dataset_train_x, dataset_test_x), axis=0),
                                     np.concatenate((dataset_train_y, dataset_test_y), axis=0),
                                     tfms_unlabeled_x, tfms_y)
        train_unlabeled_pi_ds = Dataset(np.concatenate((dataset_train_x, dataset_test_x), axis=0),
                                        np.concatenate((dataset_train_y, dataset_test_y), axis=0),
                                        tfms_labeled_x, tfms_y)
        train_labeled_ds = Dataset(dataset_train_x[ds_train_labeled_idx],
                                   dataset_train_y[ds_train_labeled_idx],
                                   tfms_labeled_x, tfms_y)
        test_ds = Dataset(dataset_test_x, dataset_test_y, tfms_labeled_x, tfms_y)

        train_unlabeled_dl = DataLoader(train_unlabeled_ds, bs, False)
        train_unlabeled_pi_dl = DataLoader(train_unlabeled_pi_ds, bs, False)
        train_labeled_dl = DataLoader(train_labeled_ds, bs, False)
        test_dl = DataLoader(test_ds, bs, False)

        start = time.time()
        print("train_labeled_ds: ", len(train_labeled_ds))
        print("train_unlabeled_pi_ds: ", len(train_unlabeled_pi_ds))
        print("train_unlabeled_ds: ", len(train_unlabeled_ds))

        train_labeled_iter = iter(train_labeled_dl)
        for epoch in range(epochs):
            avg_loss = 0.
            avg_kl_s_loss = 0.
            avg_kl_z_loss = 0.
            avg_cls_loss = 0.
            sum_t_c = 0.
            for idx, ((train_unlabeled_x, _), (train_unlabeled_pi_x, _)) in \
                    enumerate(zip(train_unlabeled_dl, train_unlabeled_pi_dl)):
                t_c = time.time()
                # Labeled
                if len(train_labeled_ds) > 0:
                    train_labeled_x, train_labeled_y = None, None
                    try:
                        train_labeled_x, train_labeled_y = next(train_labeled_iter)
                    except StopIteration:
                        train_labeled_iter = iter(train_labeled_dl)
                        train_labeled_x, train_labeled_y = next(train_labeled_iter)
                    loss_l = model.cls_fit(train_labeled_x, train_labeled_y)
                    avg_cls_loss += loss_l / len(train_unlabeled_ds) * bs
                # Unlabeled
                loss_u, kl_s_loss, kl_z_loss, emb = model.partial_fit(train_unlabeled_x, train_unlabeled_pi_x)
                avg_loss += loss_u / len(train_unlabeled_ds) * bs
                avg_kl_s_loss += kl_s_loss / len(train_unlabeled_ds) * bs
                avg_kl_z_loss += kl_z_loss / len(train_unlabeled_ds) * bs
                c_elap = time.time() - t_c
                # Compute avg time
                sum_t_c += c_elap

            # Display logs per epoch step
            if (epoch + 1) % d_step == 0:
                weights = model.get_weights()
                print("##################################################")
                print("Epoch:", "%04d" % (epoch + 1),
                      "cls_cost=", "{:.9f}".format(avg_cls_loss),
                      "cost=", "{:.9f}".format(avg_loss),
                      "kl_s=", "{:.9f}".format(avg_kl_s_loss),
                      "kl_z=", "{:.9f}".format(avg_kl_z_loss),
                      "sum_t_c={:.2f}".format(sum_t_c))
                utils.classification_evaluate_dl(lambda x: model.senti_prop(x), test_dl,
                                                 n_sentiment, labels=['negative', 'positive'])
                utils.print_top_words(epoch + 1, weights, id_vocab, n_topwords, result, write,
                                      printout=False)
                print("##################################################")

        weights = model.get_weights()
        utils.classification_evaluate_dl(model, train_dl, n_latent, labels, show=True)
        utils.classification_evaluate_dl(model, test_dl, n_latent, labels, show=True)
        utils.print_top_words(epoch + 1, weights, id_vocab, n_topwords, result, write)
        utils.calc_perp(model, test_dl, gamma_prior_batch)
    ngram_range=(1, 1),
)
bow = cvec.fit_transform(lyrics)

# TF-IDF
tvec = TfidfVectorizer(
    min_df=5,
    max_df=0.95,
    stop_words=stop_words,
    ngram_range=(1, 3),
)
tfidf = tvec.fit_transform(lyrics)

# Topic Modelling
n_top_words = 7  # number of top words to display per topic
bow_feature_names = cvec.get_feature_names()
tfidf_feature_names = tvec.get_feature_names()

# NMF with TF-IDF
nmf2 = NMF(n_components=5,
           random_state=42,
           beta_loss='kullback-leibler',
           solver='mu',
           max_iter=1000,
           alpha=.01,
           l1_ratio=.5).fit(tfidf)
print("\nTopics in NMF model (tfidf):")
print_top_words(nmf2, tfidf_feature_names, n_top_words)