def __init__(self):
    # data files
    self.vocab_file = "data/vocab"
    self.train_file = "data/train.txt"
    self.dev_file = "data/dev.txt"
    self.dict_file = "data/dict.p"
    self.max_vocab_size = 5e4
    self.debug = True

    # training and model hyperparameters
    self.num_epochs = 20
    self.batch_size = 16
    self.dropout = 0.1
    self.vocab_size = 5e4
    self.embedding_size = 300
    self.lr = 1e-3
    self.lstm_size = 128
    self.filter_size = 96
    self.attention_size = 128
    self.grad_clip = 5
    self.alpha = 1e-1
    self.beta = 1e-1
    self.l2_lambda = 3e-7
    self.num_heads = 8
    self.ans_limit = 20

    # pre-trained embeddings and output directories
    self.embeddings = load_glove("data/glove.npz")
    self.dir_output = "results/save/"
    self.dir_model = self.dir_output + "model.weights/"
    if not os.path.exists(self.dir_output):
        os.makedirs(self.dir_output)
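# A minimal sketch of the load_glove helper assumed by the config above. The
# "data/glove.npz" path comes from the config; the "embeddings" array name
# inside the archive is an assumption, not confirmed by the source. The point
# is only that the config expects a ready-made [vocab_size, embedding_size]
# float matrix back.
import numpy as np

def load_glove(npz_path):
    """Load a pre-computed GloVe embedding matrix from a .npz archive."""
    with np.load(npz_path) as archive:
        return archive["embeddings"]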
def _build_vars(self):
    with tf.variable_scope(self._name):
        vocab_path = "vocab.txt"
        vocab = Vocab(vocab_path, self._vocab_size)
        if self.word_init:
            word2vec = utils.load_glove(self._embedding_size)
            self.embeddings = utils.create_embedding(
                self, word2vec, vocab.id_to_word, self._embedding_size)
        else:
            self.embeddings = np.random.uniform(
                -0.5, 0.5, (len(vocab.id_to_word), self._embedding_size))
        self.embeddings = tf.Variable(
            self.embeddings.astype(np.float32), name="Embedding")
# derive the score range from the data (essay sets 7 and 8 use fixed ranges)
max_score = max(resolved_scores)
min_score = min(resolved_scores)
if essay_set_id == 7:
    min_score, max_score = 0, 30
elif essay_set_id == 8:
    min_score, max_score = 0, 60

print 'max_score is {} \t min_score is {}\n'.format(max_score, min_score)
with open(out_dir + '/params', 'a') as f:
    f.write('max_score is {} \t min_score is {} \n'.format(max_score, min_score))
# include max score
score_range = range(min_score, max_score + 1)

# word_idx, _ = data_utils.build_vocab(essay_list, vocab_limit)
# load glove
word_idx, word2vec = data_utils.load_glove(num_tokens, embedding_size)
vocab_size = len(word_idx) + 1

# stat info on data set
sent_size_list = map(len, [essay for essay in essay_list])
max_sent_size = max(sent_size_list)
mean_sent_size = int(np.mean(map(len, [essay for essay in essay_list])))

print 'max sentence size: {} \nmean sentence size: {}\n'.format(max_sent_size, mean_sent_size)
with open(out_dir + '/params', 'a') as f:
    f.write('max sentence size: {} \nmean sentence size: {}\n'.format(max_sent_size, mean_sent_size))
print 'The length of score range is {}'.format(len(score_range))

E = data_utils.vectorize_data(essay_list, word_idx, max_sent_size)
labeled_data = zip(E, resolved_scores, sent_size_list)
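# Hedged sketch of the data_utils.load_glove(num_tokens, embedding_size) call
# above, assuming the standard "word v1 v2 ... vd" GloVe text format. The file
# path is an assumption, and reading at most num_tokens rows is one plausible
# reading of the num_tokens argument. Index 0 is left free for padding, which
# matches vocab_size = len(word_idx) + 1 above.
import numpy as np

def load_glove(num_tokens, embedding_size, path='data/glove.6B.{}d.txt'):
    word_idx, word2vec = {}, []
    with open(path.format(embedding_size)) as f:
        for i, line in enumerate(f):
            if i >= num_tokens:
                break
            parts = line.rstrip().split(' ')
            word_idx[parts[0]] = i + 1  # 0 is reserved for padding
            word2vec.append(np.asarray(parts[1:], dtype=np.float32))
    return word_idx, word2vec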
def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False,
             memory_size=50, random_state=None, batch_size=32, learning_rate=0.001,
             epsilon=1e-8, max_grad_norm=40.0, evaluation_interval=10, hops=3,
             epochs=200, embedding_size=100):
    self.data_dir = data_dir
    self.task_id = task_id
    self.model_dir = model_dir
    # self.isTrain = isTrain
    self.isInteractive = isInteractive
    self.OOV = OOV
    self.memory_size = memory_size
    self.random_state = random_state
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.max_grad_norm = max_grad_norm
    self.evaluation_interval = evaluation_interval
    self.hops = hops
    self.epochs = epochs
    self.embedding_size = embedding_size
    self.vocab = {}
    self.ivocab = {}
    self.word2vec = {}
    self.word2vec_init = True

    if self.word2vec_init:
        # assert config.embed_size == 100
        self.word2vec = load_glove(self.embedding_size)
        process_word(word="<eos>",
                     word2vec=self.word2vec,
                     vocab=self.vocab,
                     ivocab=self.ivocab,
                     word_vector_size=self.embedding_size,
                     to_return="index")
        # Define an index and vector for an uncertain/unknown word, for use
        # later when training on out-of-context data.
        self.uncertain_word_index = process_word(
            word="sdfsssdf",
            word2vec=self.word2vec,
            vocab=self.vocab,
            ivocab=self.ivocab,
            word_vector_size=self.embedding_size,
            to_return="index")

    candidates, self.candid2indx = load_candidates(self.data_dir, self.task_id)
    self.n_cand = len(candidates)
    print("Candidate Size", self.n_cand)
    self.indx2candid = dict(
        (self.candid2indx[key], key) for key in self.candid2indx)

    # task data
    self.trainData, self.testData, self.valData = load_dialog_task(
        self.data_dir, self.task_id, self.candid2indx, self.OOV)
    data = self.trainData + self.testData + self.valData
    self.build_vocab(data, candidates)
    self.set_max_sentence_length()

    # self.candidates_vec = vectorize_candidates_sparse(candidates, self.word_idx)
    self.trainS, self.trainQ, self.trainA = vectorize_data_match(
        self.trainData, self.word2vec, self.max_sentence_size, self.batch_size,
        self.n_cand, self.memory_size, self.vocab, self.ivocab,
        self.embedding_size, uncertain=self.uncertain_word_index)
    self.valS, self.valQ, self.valA = vectorize_data_match(
        self.valData, self.word2vec, self.max_sentence_size, self.batch_size,
        self.n_cand, self.memory_size, self.vocab, self.ivocab,
        self.embedding_size, uncertain_word=True,
        uncertain=self.uncertain_word_index)
    self.candidates_vec = vectorize_candidates(
        candidates, self.word2vec, self.candidate_sentence_size, self.vocab,
        self.ivocab, self.embedding_size)

    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                       epsilon=self.epsilon)
    self.sess = tf.Session()

    # Set max sentence vector size
    self.build_vocab(data, candidates)

    # n-hot encode each candidate answer over the vocabulary
    answer_n_hot = np.zeros((self.vocab_size, len(self.candid2indx)))
    for ans_it in range(len(self.indx2candid)):
        ans = self.indx2candid[ans_it]
        n_hot = np.zeros((self.vocab_size,))
        for w in tokenize(ans):
            assert w in self.word_idx
            n_hot[self.word_idx[w]] = 1
        answer_n_hot[:, ans_it] = n_hot
    # TODO: understand sentence size better. Is the model failing because
    # sentence size > candidate_sentence_size? Are answers longer than queries?
    self.model = MemN2NDialogHybridMatch(
        self.batch_size, self.vocab_size, self.max_sentence_size,
        self.memory_size, self.embedding_size, answer_n_hot,
        match=FLAGS.match, session=self.sess, hops=self.hops,
        max_grad_norm=self.max_grad_norm, optimizer=optimizer,
        task_id=self.task_id)
    # self.model = MemN2NDialogHybrid(
    #     self.batch_size, self.vocab_size, self.n_cand, self.max_sentence_size,
    #     self.embedding_size, self.candidates_vec, session=self.sess,
    #     hops=self.hops, max_grad_norm=self.max_grad_norm, optimizer=optimizer,
    #     task_id=task_id)
    self.saver = tf.train.Saver(max_to_keep=50)
    self.summary_writer = tf.summary.FileWriter(
        self.model.root_dir, self.model.graph_output.graph)
    self.kb = parse_kb(FLAGS.kb_file)
def train(restore=False):
    config = Config()
    data, vocab = utils.load_sentiment_treebank(SST_DIR, config.fine_grained)
    train_set, dev_set, test_set = data['train'], data['dev'], data['test']

    num_emb = len(vocab)
    num_labels = 5 if config.fine_grained else 3
    for _, dataset in data.items():
        labels = [label for _, label in dataset]
        assert set(labels) <= set(xrange(num_labels)), set(labels)

    config.num_emb = num_emb
    config.output_dim = num_labels
    config.maxseqlen = utils.get_max_len_data(data)
    config.maxnodesize = utils.get_max_node_size(data)

    if config.fine_grained:
        classify_type = "fine_grained"
    else:
        classify_type = "binary"

    random.seed()
    np.random.seed()

    with tf.Graph().as_default():
        #model = seq_att.tf_seqLSTM(config)
        #model = seq_att.tf_seqLSTMAtt(config)
        #model = seq_att.tf_seqbiLSTM(config)
        model = seq_att.tf_seqbiLSTMAtt(config)
        model_name = model.__class__.__name__
        print 'Running model:', model_name, classify_type

        ckpt_base = os.path.join(ckpt_dir, model_name, classify_type)
        summary_base = os.path.join(summaries_dir, model_name, classify_type)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            tf.summary.FileWriter(summary_base, sess.graph)
            sess.run(init)
            if restore:
                f = os.path.join(ckpt_base, 'lstm_weights')
                saver.restore(sess, f)
                test_score = model.evaluate(test_set, sess, isDev=False)
                print 'test_score:', test_score
                if config.use_attention:
                    visualize_attention(model, test_set, vocab, sess, ckpt_base)
            else:
                if config.emb_dim == 300:
                    glove_file = os.path.join(GLOVE_DIR, 'glove.840B.300d.txt')
                else:
                    tmp = 'glove.twitter.27B.' + str(config.emb_dim) + 'd.txt'
                    glove_file = os.path.join(GLOVE_DIR, tmp)
                glove_embeddings = utils.load_glove(glove_file, vocab, config.emb_dim)
                sess.run(model.embedding_init,
                         feed_dict={model.initial_emb: glove_embeddings})
                avg_loss = model.train(train_set, test_set, sess, saver)
                print 'avg loss', avg_loss
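# Hedged sketch of utils.load_glove(glove_file, vocab, emb_dim) as used in
# train() above: build an embedding matrix row-aligned with the vocabulary,
# falling back to small random vectors for words GloVe does not cover.
# Treating vocab as a word -> index mapping is an assumption.
import numpy as np

def load_glove(glove_file, vocab, emb_dim):
    embeddings = np.random.uniform(
        -0.05, 0.05, (len(vocab), emb_dim)).astype(np.float32)
    with open(glove_file) as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, values = parts[0], parts[1:]
            if word in vocab and len(values) == emb_dim:
                embeddings[vocab[word]] = np.asarray(values, dtype=np.float32)
    return embeddings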
# Network Parameters
n_input = FLAGS.embedding_size
# n_steps = ?  # timesteps = number of words in an article, cannot be defined here
n_hidden = FLAGS.num_hidden_nodes  # hidden layer number of features
n_classes = 2  # total classes (binary labels 0/1)

# deal with input data
training_path = '200_clean_graded_data.csv'  # put the name of the training file here
essay_list, label, problem_id, count_one, question_list = \
    data_utils.load_open_response_data(training_path)

# majority class
print('majority class accuracy is: \n', count_one / len(label))

# load glove
word_idx, word2vec = data_utils.load_glove(n_input)
vocab_size = len(word_idx) + 1

# stat info on data set
sent_size_list = map(len, [essay for essay in essay_list])
max_sent_size = max(sent_size_list)
mean_sent_size = int(np.mean(map(len, [essay for essay in essay_list])))
question_sent_size_list = map(len, [question for question in question_list])
question_max_sent_size = max(question_sent_size_list)
question_mean_sent_size = int(
    np.mean(map(len, [question for question in question_list])))

print('max sentence size: {} \nmean sentence size: {}\n'.format(
    max_sent_size, mean_sent_size))
def __init__(self):
    # load word embedding
    glove = data_utils.load_glove(FLAGS.glove_file)
    word2vec = data_utils.load_word2vec(FLAGS.word2vec_file)
    merged_embed, self.vocab_size = self.merge_glove_word2vec(glove, word2vec)
    dim = len(merged_embed[0])
    merged_embed.append([0. for _ in xrange(dim)])

    # load doc embedding
    self.doc_embedding, doc_dim = data_utils.load_fastText_embed(
        FLAGS.fastText_doc_file, FLAGS.fastText_vector_file)
    self.zero_doc_key = self.doc_key([self.vocab_size], [self.vocab_size])
    self.doc_embedding[self.zero_doc_key] = [0. for _ in xrange(doc_dim)]

    FLAGS.fc_units = map(int, FLAGS.fc_units.split(','))

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.session = tf.Session(config=config)

    ''' graph '''
    print 'Initializing model graph...'

    with tf.variable_scope('inputs'):
        self.training = tf.placeholder(tf.bool, name='training')
        # [batch size, sequence length]
        self.title = tf.placeholder(tf.int32, shape=[None, None], name='title')
        self.content = tf.placeholder(tf.int32, shape=[None, None], name='content')
        self.title_length = tf.placeholder(tf.int32, shape=[None],
                                           name='title_length')
        self.content_length = tf.placeholder(tf.int32, shape=[None],
                                             name='content_length')
        self.prices = tf.placeholder(tf.float32, name='prices',
                                     shape=[None, None, 7])
        self.price_length = tf.placeholder(tf.int32, shape=[None],
                                           name='price_length')
        self.docs = tf.placeholder(tf.float32, name='docs',
                                   shape=[None, None, doc_dim])
        self.doc_length = tf.placeholder(tf.int32, shape=[None],
                                         name='doc_length')
        self.label = tf.placeholder(tf.int32, shape=[None, 2], name='label')

    with tf.variable_scope('birnn_embed'):
        self.word_embedding = tf.Variable(merged_embed, dtype=tf.float32,
                                          name='word_embedding_matrix')
        title_embed = self.embed_birnn(FLAGS.title_units, FLAGS.title_layers,
                                       self.title, self.title_length,
                                       scope='title_embed_birnn')
        content_embed = self.embed_birnn(FLAGS.content_units, FLAGS.content_layers,
                                         self.content, self.content_length,
                                         scope='content_embed_birnn')
        price_embed = self.birnn(FLAGS.price_units, FLAGS.price_layers,
                                 self.prices, self.price_length,
                                 scope='price_birnn')
        doc_embed = self.birnn(FLAGS.doc_units, FLAGS.doc_layers,
                               self.docs, self.doc_length, scope='doc_birnn')
        final_embed = tf.concat(
            [title_embed, content_embed, doc_embed, price_embed], 1)

    with tf.variable_scope('full_connect'):
        fc_inputs = final_embed
        for i in range(FLAGS.fc_layers):
            with tf.variable_scope('full_connect_layer_%d' % i):
                fc_outputs = tf.contrib.layers.legacy_fully_connected(
                    fc_inputs, FLAGS.fc_units[i], activation_fn=tf.nn.relu,
                    weight_regularizer=tf.contrib.layers.l2_regularizer(
                        FLAGS.l2_coef))
                fc_inputs = fc_outputs

    with tf.variable_scope('dropout'):
        dropout = tf.layers.dropout(fc_outputs, training=self.training)

    with tf.variable_scope('output'):
        W = tf.get_variable('W', shape=[FLAGS.fc_units[-1], 2],
                            initializer=tf.truncated_normal_initializer())
        biases = tf.get_variable('biases', shape=[2],
                                 initializer=tf.random_normal_initializer())
        logits = tf.matmul(dropout, W) + biases
        self.result = tf.nn.softmax(logits)

    with tf.variable_scope('train'):
        self.cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.label,
                                                    logits=logits))
        self.learning_rate = tf.Variable(FLAGS.init_lr, trainable=False,
                                         name="learning_rate")
        self.lr_decay_op = self.learning_rate.assign(
            self.learning_rate * FLAGS.lr_decay)
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.train_op = tf.train.AdamOptimizer(FLAGS.init_lr).minimize(
            self.cross_entropy, self.global_step)

    with tf.variable_scope('logs'):
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
        self.log_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.train_dir, 'logs/'), self.session.graph)
        self.summary = tf.Summary()
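# A minimal sketch of the merge_glove_word2vec step assumed above (the real
# method lives on the class): one plausible reading is to union the two
# pre-trained tables, prefer GloVe vectors on collisions, and return the
# merged list of vectors plus its size so a zero padding row can be appended
# right after the call. Dict-valued inputs and equal vector dimensions are
# assumptions, not confirmed by the source.
def merge_glove_word2vec(glove, word2vec):
    merged = dict(word2vec)
    merged.update(glove)  # GloVe wins when a word appears in both tables
    vectors = [merged[w] for w in sorted(merged)]
    return vectors, len(vectors)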