def gen_sent_on_doc(docs, tags, idxvocab, vocabxid, start_symbol, end_symbol, cf):
    #get the current topics (as word lists) from the topic model
    topics, _ = tm.get_topics(sess, topn=topn)
    topics = [ " ".join([idxvocab[w] for w in t]) for t in topics ]
    doc_text = [ item.replace("\t", "\n") for item in codecs.open(args.input_doc, "r", "utf-8").readlines() ]
    output = codecs.open(args.gen_sent_on_doc, "w", "utf-8")
    #build a batch-size-1, single-step language model for generation, reusing the trained variables
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mgen = LM(is_training=False, vocab_size=len(idxvocab), batch_size=1, num_steps=1, config=cf, \
            reuse_conv_variables=True)

    for d in range(len(docs)):
        output.write("\n" + "="*100 + "\n")
        output.write("Doc " + str(d) + ":\n")
        output.write(doc_text[d])

        doc, _, _, t, _ = get_batch_doc(docs, None, tags, d, cf.doc_len, cf.tag_len, 1, vocabxid[pad_symbol])
        best_topics, best_words = mgen.get_topics_on_doc(sess, doc, t, topn)

        output.write("\nRepresentative topics:\n")
        output.write("\n".join([ ("[%.3f] %s: %s" % (item[1], str(item[0]).zfill(3), topics[item[0]])) \
            for item in best_topics ]) + "\n")

        output.write("\nRepresentative words:\n")
        output.write("\n".join([ ("[%.3f] %s" % (item[1], idxvocab[item[0]])) for item in best_words ]) + "\n")

        #greedy (argmax) generation; temperature = 0
        output.write("\nSentence generation (greedy; argmax):" + "\n")
        s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol], 0, cf.lm_sent_len+10, vocabxid[end_symbol])
        output.write("[0] " + " ".join([ idxvocab[item] for item in s ]) + "\n")

        #stochastic generation at each sampling temperature
        for temp in gen_temps:
            output.write("\nSentence generation (random; temperature = " + str(temp) + "):\n")

            for i in xrange(gen_num):
                s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol], temp, cf.lm_sent_len+10, \
                    vocabxid[end_symbol])
                output.write("[" + str(i) + "] " + " ".join([ idxvocab[item] for item in s ]) + "\n")
def compute_dt_dist(docs, labels, tags, model, max_len, batch_size, pad_id, idxvocab, output_file):
    #generate batches
    num_batches = int(math.ceil(float(len(docs)) / batch_size))
    dt_dist = []
    docid = 0
    for i in xrange(num_batches):
        x, _, _, t, s = get_batch_doc(docs, labels, tags, i, max_len, cf.tag_len, batch_size, pad_id)
        attention, mean_topic = sess.run([model.attention, model.mean_topic], {model.doc: x, model.tag: t})
        #keep the document-topic distributions for the real (non-padding) documents in the batch
        dt_dist.extend(attention[:s])

        if debug:
            for si in xrange(s):
                d = x[si]
                print "\n\nDoc", docid, "=", " ".join([idxvocab[item] for item in d if (item != pad_id)])
                sorted_dist = matutils.argsort(attention[si], reverse=True)
                for ti in sorted_dist:
                    print "Topic", ti, "=", attention[si][ti]
                docid += 1

    np.save(open(output_file, "wb"), dt_dist)
def run_epoch_doc(docs, labels, tags, tm, pad_id, cf):
    batches = int(math.ceil(float(len(docs))/cf.batch_size))
    accs = []
    for b in xrange(batches):
        d, y, m, t, num_docs = get_batch_doc(docs, labels, tags, b, cf.doc_len, cf.tag_len, cf.batch_size, pad_id)
        prob = sess.run(tm.sup_probs, {tm.doc: d, tm.label: y, tm.sup_mask: m, tm.tag: t})
        pred = np.argmax(prob, axis=1)
        accs.extend(pred[:num_docs] == y[:num_docs])

    print "\ntest classification accuracy = %.3f" % np.mean(accs)
def run_epoch(sents, docs, labels, tags, models, is_training):

    ####unsupervised topic and language model training####

    #generate the batches
    tm_num_batches, lm_num_batches = int(math.ceil(float(len(sents[0]))/cf.batch_size)), \
        int(math.ceil(float(len(sents[1]))/cf.batch_size))
    batch_ids = [ (item, 0) for item in range(tm_num_batches) ] + [ (item, 1) for item in range(lm_num_batches) ]
    seq_lens = (cf.tm_sent_len, cf.lm_sent_len)
    #shuffle batches and sentences
    random.shuffle(batch_ids)
    random.shuffle(sents[0])
    random.shuffle(sents[1])

    #set training and cost ops for topic and language model training
    tm_cost_ops = (tf.no_op(), tf.no_op(), tf.no_op(), tf.no_op())
    lm_cost_ops = (tf.no_op(), tf.no_op(), tf.no_op(), tf.no_op())
    if models[0] is not None:
        tm_cost_ops = (models[0].tm_cost, (models[0].tm_train_op if is_training else tf.no_op()), tf.no_op(), tf.no_op())
    if models[1] is not None:
        lm_cost_ops = (tf.no_op(), tf.no_op(), models[1].lm_cost, (models[1].lm_train_op if is_training else tf.no_op()))
    cost_ops = (tm_cost_ops, lm_cost_ops)

    start_time = time.time()
    lm_costs, tm_costs, lm_words, tm_words = 0.0, 0.0, 0.0, 0.0
    for bi, (b, model_id) in enumerate(batch_ids):
        tm_costs, tm_words, lm_costs, lm_words = fetch_batch_and_train(sents[model_id], docs[model_id], tags, \
            models[model_id], seq_lens[model_id], b, (tm_costs, tm_words, lm_costs, lm_words), cost_ops[model_id])

        #print progress
        output_string = "%d/%d: tm ppl = %.3f; lm ppl = %.3f; word/sec = %.1f" % \
            (bi+1, len(batch_ids), np.exp(tm_costs/max(tm_words, 1.0)), np.exp(lm_costs/max(lm_words, 1.0)), \
            float(tm_words + lm_words)/(time.time()-start_time))
        print_progress(bi, len(batch_ids), is_training, output_string)

    ####supervised classification training####
    if labels is not None:
        #randomise the batches
        batch_ids = range(int(math.ceil(float(len(docs[0])) / cf.batch_size)))
        random.shuffle(batch_ids)

        start_time = time.time()
        costs, accs = 0.0, []
        for bi, b in enumerate(batch_ids):
            d, y, m, t, num_docs = get_batch_doc(docs[0], labels, tags, b, cf.doc_len, cf.tag_len, cf.batch_size, 0)
            cost, prob, _ = sess.run([models[0].sup_cost, models[0].sup_probs, \
                (models[0].sup_train_op if is_training else tf.no_op())], \
                {models[0].doc: d, models[0].label: y, models[0].sup_mask: m, models[0].tag: t})
            costs += cost * cf.batch_size #keep track of full cost
            pred = np.argmax(prob, axis=1)
            accs.extend(pred[:num_docs] == y[:num_docs])

            #print progress
            output_string = "%d/%d: sup loss = %.3f; sup acc = %.3f; doc/sec = %.1f" % \
                (bi+1, len(batch_ids), costs/((bi+1)*cf.batch_size), np.mean(accs), \
                (bi+1)*cf.batch_size/(time.time()-start_time))
            print_progress(bi, len(batch_ids), is_training, output_string)
    else:
        accs = None

    #return a "lower is better" metric: negative accuracy if supervised, otherwise language model perplexity
    return -np.mean(accs) if accs is not None else np.exp(lm_costs / max(lm_words, 1.0))