def gen_sent_on_topic(idxvocab, vocabxid, start_symbol, end_symbol, cf):
    """Generate sentences conditioned on each learned topic and write them to file.

    For every topic, writes the topic's top words, one greedy (argmax)
    sentence, then gen_num sampled sentences per temperature in gen_temps.

    Args:
        idxvocab: sequence mapping word id -> word string.
        vocabxid: dict mapping word string -> word id.
        start_symbol: sentence-start token (key into vocabxid).
        end_symbol: sentence-end token (key into vocabxid).
        cf: config object (uses topic_number and lm_sent_len).

    NOTE(review): depends on module-level globals (args, tm, sess, topn,
    initializer, gen_temps, gen_num, LM, tf, codecs) — assumed defined by
    the enclosing script.
    """
    # "w" (not "wb") for consistency with gen_sent_on_doc; codecs.open
    # forces binary mode internally either way, so behavior is identical.
    output = codecs.open(args.gen_sent_on_topic, "w", "utf-8")
    topics, entropy = tm.get_topics(sess, topn=topn)

    # Reuse the trained model's variables; batch_size/num_steps of 1 for
    # incremental word-by-word generation.
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mgen = LM(is_training=False, vocab_size=len(idxvocab), batch_size=1, num_steps=1, config=cf, \
            reuse_conv_variables=True)

    for t in range(cf.topic_number):
        output.write("\n" + "=" * 100 + "\n")
        output.write("Topic " + str(t) + ":\n")
        output.write(" ".join([idxvocab[item] for item in topics[t]]) + "\n\n")

        # Greedy decoding: temperature 0 means argmax at each step.
        output.write("\nSentence generation (greedy; argmax):" + "\n")
        s = mgen.generate_on_topic(sess, t, vocabxid[start_symbol], 0, cf.lm_sent_len + 10, vocabxid[end_symbol])
        output.write("[0] " + " ".join([idxvocab[item] for item in s]) + "\n")

        # Stochastic sampling at each configured temperature.
        for temp in gen_temps:
            output.write("\nSentence generation (random; temperature = " + str(temp) + "):\n")
            for i in range(gen_num):
                s = mgen.generate_on_topic(sess, t, vocabxid[start_symbol], temp, cf.lm_sent_len + 10, \
                    vocabxid[end_symbol])
                output.write("[" + str(i) + "] " + " ".join([idxvocab[item] for item in s]) + "\n")

    # fix: close the handle so buffered text is flushed (was leaked before).
    output.close()
def gen_sent_on_doc(docs, tags, idxvocab, vocabxid, start_symbol, end_symbol, cf):
    """Generate sentences conditioned on each input document and write them to file.

    For every document, writes the raw document text, its most
    representative topics and words, one greedy (argmax) sentence, then
    gen_num sampled sentences per temperature in gen_temps.

    Args:
        docs: encoded documents (passed through to get_batch_doc).
        tags: document tags (passed through to get_batch_doc).
        idxvocab: sequence mapping word id -> word string.
        vocabxid: dict mapping word string -> word id.
        start_symbol: sentence-start token (key into vocabxid).
        end_symbol: sentence-end token (key into vocabxid).
        cf: config object (uses doc_len, tag_len, lm_sent_len).

    NOTE(review): depends on module-level globals (args, tm, sess, topn,
    initializer, gen_temps, gen_num, pad_symbol, LM, get_batch_doc, tf,
    codecs) — assumed defined by the enclosing script.
    """
    topics, _ = tm.get_topics(sess, topn=topn)
    topics = [" ".join([idxvocab[w] for w in t]) for t in topics]

    # fix: close the input handle after reading (was leaked before).
    doc_file = codecs.open(args.input_doc, "r", "utf-8")
    try:
        doc_text = [item.replace("\t", "\n") for item in doc_file.readlines()]
    finally:
        doc_file.close()

    output = codecs.open(args.gen_sent_on_doc, "w", "utf-8")

    # Reuse the trained model's variables; batch_size/num_steps of 1 for
    # incremental word-by-word generation.
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mgen = LM(is_training=False, vocab_size=len(idxvocab), batch_size=1, num_steps=1, config=cf, \
            reuse_conv_variables=True)

    for d in range(len(docs)):
        output.write("\n" + "=" * 100 + "\n")
        output.write("Doc " + str(d) + ":\n")
        output.write(doc_text[d])

        doc, _, _, t, _ = get_batch_doc(docs, None, tags, d, cf.doc_len, cf.tag_len, 1, vocabxid[pad_symbol])
        best_topics, best_words = mgen.get_topics_on_doc(sess, doc, t, topn)

        output.write("\nRepresentative topics:\n")
        output.write("\n".join([("[%.3f] %s: %s" % (item[1], str(item[0]).zfill(3), topics[item[0]])) \
            for item in best_topics]) + "\n")

        output.write("\nRepresentative words:\n")
        output.write("\n".join([("[%.3f] %s" % (item[1], idxvocab[item[0]])) for item in best_words]) + "\n")

        # Greedy decoding: temperature 0 means argmax at each step.
        output.write("\nSentence generation (greedy; argmax):" + "\n")
        s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol], 0, cf.lm_sent_len + 10, vocabxid[end_symbol])
        output.write("[0] " + " ".join([idxvocab[item] for item in s]) + "\n")

        # Stochastic sampling at each configured temperature.
        # range (not xrange) for consistency with gen_sent_on_topic; the
        # count is tiny, so materializing it is harmless, and it stays
        # py2/py3-compatible.
        for temp in gen_temps:
            output.write("\nSentence generation (random; temperature = " + str(temp) + "):\n")
            for i in range(gen_num):
                s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol], temp, cf.lm_sent_len + 10, \
                    vocabxid[end_symbol])
                output.write("[" + str(i) + "] " + " ".join([idxvocab[item] for item in s]) + "\n")

    # fix: close the handle so buffered text is flushed (was leaked before).
    output.close()
# Report corpus/vocabulary statistics before training (Python 2 print statements).
print "Vocab size =", len(idxvocab)
if cf.num_classes > 0:
    print "Class size (supervised) =", cf.num_classes
if cf.num_tags > 0:
    # NOTE(review): one tag appears to be reserved (hence the -1) — presumably
    # a padding/dummy tag; confirm against the tag-vocabulary construction.
    print "Tag size =", cf.num_tags - 1
print_corpus_stats("Train corpus", train_sents, train_docs, train_stats)
print_corpus_stats("Valid corpus", valid_sents, valid_docs, valid_stats)

#train model: build the graph and session, then construct train/valid models.
with tf.Graph().as_default(), tf.Session() as sess:
    tf.set_random_seed(cf.seed)
    initializer = tf.contrib.layers.xavier_initializer()
    # Training models created fresh (reuse=None); each component is optional:
    # topic model only if topic_number > 0, language model only if
    # rnn_hidden_size > 0.
    with tf.variable_scope("model", reuse=None, initializer=initializer):
        tm_train = TM(is_training=True, vocab_size=len(idxvocab), batch_size=cf.batch_size, \
            num_steps=cf.tm_sent_len, num_classes=num_classes, config=cf) if cf.topic_number > 0 else None
        lm_train = LM(is_training=True, vocab_size=len(idxvocab), batch_size=cf.batch_size, \
            num_steps=cf.lm_sent_len, config=cf, reuse_conv_variables=True) \
            if cf.rnn_hidden_size > 0 else None
    # Validation models share the training variables (reuse=True).
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        tm_valid = TM(is_training=False, vocab_size=len(idxvocab), batch_size=cf.batch_size, \
            num_steps=cf.tm_sent_len, num_classes=num_classes, config=cf) if cf.topic_number > 0 else None
        lm_valid = LM(is_training=False, vocab_size=len(idxvocab), batch_size=cf.batch_size, \
            num_steps=cf.lm_sent_len, config=cf) if cf.rnn_hidden_size > 0 else None
    # Legacy TF1 initializer (pre tf.global_variables_initializer).
    tf.initialize_all_variables().run()

    #initialise word embedding from a pretrained model, if configured.
    if cf.word_embedding_model:
        word_emb = init_embedding(mword, idxvocab)
        if cf.rnn_hidden_size > 0:
            sess.run(lm_train.lstm_word_embedding.assign(word_emb))
        if cf.topic_number > 0: