def build_graph(self, reps, context_mask):
    """Build the ops that predict the answer start and end positions.

    Args:
        reps: final output representation
            [batch_sz, context_length, hidden_sz]
        context_mask: [batch_sz, context_length]
    Return:
        (logits_start, probdist_start, logits_end, probdist_end),
        each of shape [batch_sz, context_length]
    """
    context_len = context_mask.shape[1]

    with vs.variable_scope(self.scope):
        # Start position: project the representation, then score it.
        projected = tf.contrib.layers.fully_connected(
            reps, num_outputs=self.hidden_sz)
        logits_start, probdist_start = self._pred_start(
            projected, context_mask)

        # End position: append the start distribution as one extra feature
        # per token and run the result through a dedicated encoder.
        start_feature = tf.expand_dims(probdist_start, 2)
        end_input = tf.concat([reps, start_feature], 2)
        end_encoder = RNNEncoder(self.hidden_sz, 1, "lstm", "end_encoder")
        end_hidden = end_encoder.build_graph(end_input, context_mask)
        logits_end, probdist_end = self._pred_end(end_hidden, context_mask)

        if self.is_training:
            return (logits_start, probdist_start, logits_end, probdist_end)

        # Outside training, scale down end scores that fall before the
        # most likely start position.
        # [batch_sz]: index of the most probable start word
        best_start = tf.argmax(probdist_start, 1)
        # [batch_sz, context_length]: 1.0 for positions before best_start
        before_start = tf.cast(
            tf.sequence_mask(best_start, context_len, dtype=tf.int32),
            tf.float32)
        # 0.001 before the predicted start, 1 everywhere else
        start_mask = 1 - 0.999 * before_start
        logits_end = logits_end * start_mask
        probdist_end = probdist_end * start_mask
        return (logits_start, probdist_start, logits_end, probdist_end)
class OutputDoubleLSTM(object):
    """Output representation built from two stacked LSTM encoders."""

    def __init__(self, output_sz, keep_prob):
        """
        Args:
            output_sz: size passed to both encoders as their first argument
            keep_prob: stored on the instance and passed to the second encoder
        """
        self.scope = "double_lstm"
        self.output_sz = output_sz
        self.keep_prob = keep_prob
        # The first encoder receives a literal 1 in the argument slot where
        # the second encoder receives keep_prob.
        self.lstm_encoder1 = RNNEncoder(output_sz, 1, "lstm", "encoder1")
        self.lstm_encoder2 = RNNEncoder(output_sz, keep_prob, "lstm",
                                        "encoder2")

    def build_graph(self, reps, context_mask):
        """
        Args:
            reps: [batch_sz, context_length, reps_sz]
            context_mask: mask forwarded unchanged to both encoders
        Return:
            [batch_sz, context_length, output_sz]
        """
        with vs.variable_scope(self.scope):
            first_pass = self.lstm_encoder1.build_graph(reps, context_mask)
            second_pass = self.lstm_encoder2.build_graph(
                first_pass, context_mask)
            return second_pass
class OutputDoubleLSTMAct(object):
    """Output representation: two stacked encoders followed by an activation.

    NOTE(review): despite the class name, the second encoder is constructed
    with the "gru" type string -- confirm whether that is intentional.
    """

    def __init__(self, output_sz, keep_prob, activation):
        """
        Args:
            output_sz: size passed to both encoders as their first argument
            keep_prob: passed to both encoders and stored on the instance
            activation: "tanh" or "relu"; anything else makes build_graph
                terminate the process
        """
        self.output_sz = output_sz
        self.activation = activation
        self.scope = "double_lstm_{}".format(activation)
        self.keep_prob = keep_prob
        self.lstm_encoder1 = RNNEncoder(output_sz, keep_prob, "lstm",
                                        "encoder1")
        self.lstm_encoder2 = RNNEncoder(output_sz, keep_prob, "gru",
                                        "encoder2")
        logger.error(
            "Output Layer with Double LSTM and Activation {} created ...".
            format(activation))

    def build_graph(self, reps, context_mask):
        """
        Args:
            reps: [batch_sz, context_length, reps_sz]
            context_mask: mask forwarded unchanged to both encoders
        Return:
            [batch_sz, context_length, output_sz]
        Raises:
            SystemExit: if self.activation is neither "tanh" nor "relu"
        """
        with vs.variable_scope(self.scope):
            lstm_1_out = self.lstm_encoder1.build_graph(reps, context_mask)
            lstm_2_out = self.lstm_encoder2.build_graph(
                lstm_1_out, context_mask)
            if self.activation == "tanh":
                return tf.nn.tanh(lstm_2_out)
            elif self.activation == "relu":
                return tf.nn.relu(lstm_2_out)
        # sys.exit accepts a single argument; the previous two-argument call
        # raised TypeError instead of exiting. A string argument is printed
        # to stderr and the process exits with status 1.
        sys.exit("No such activation: {}!".format(self.activation))
def __init__(self, output_sz, keep_prob):
    """Set up the single LSTM encoder used by this output layer.

    Args:
        output_sz: stored on the instance and passed to the encoder
        keep_prob: passed to the encoder as its second argument
    """
    self.scope = "output_lstm"
    self.output_sz = output_sz
    encoder = RNNEncoder(output_sz, keep_prob, "lstm")
    self.lstm_encoder = encoder
def __init__(self, output_sz, keep_prob):
    """Set up the two LSTM encoders used by this output layer.

    Args:
        output_sz: stored on the instance and passed to both encoders
        keep_prob: stored on the instance and passed to the second encoder
    """
    self.scope = "double_lstm"
    self.output_sz = output_sz
    self.keep_prob = keep_prob
    # The first encoder receives a literal 1 in the argument slot where the
    # second encoder receives keep_prob.
    first_encoder = RNNEncoder(output_sz, 1, "lstm", "encoder1")
    second_encoder = RNNEncoder(output_sz, keep_prob, "lstm", "encoder2")
    self.lstm_encoder1 = first_encoder
    self.lstm_encoder2 = second_encoder
def __init__(self, output_sz, keep_prob, activation):
    """Set up the two encoders and remember which activation to apply.

    Args:
        output_sz: stored on the instance and passed to both encoders
        keep_prob: stored on the instance and passed to both encoders
        activation: activation name; also embedded in the scope name
    """
    self.activation = activation
    self.scope = "double_lstm_{}".format(activation)
    self.output_sz = output_sz
    self.keep_prob = keep_prob
    # First encoder is built as "lstm", second as "gru".
    first_encoder = RNNEncoder(output_sz, keep_prob, "lstm", "encoder1")
    second_encoder = RNNEncoder(output_sz, keep_prob, "gru", "encoder2")
    self.lstm_encoder1 = first_encoder
    self.lstm_encoder2 = second_encoder
    created_msg = (
        "Output Layer with Double LSTM and Activation {} created ...".
        format(activation))
    logger.error(created_msg)
def add_encoder(self, vocab_sizes, embedding_sizes, rnn_type, hidden_size,
                num_layers, bidirectional):
    """Create an RNN encoder over discrete features and attach it to self.

    The input module is built from ``vocab_sizes`` and ``embedding_sizes``;
    the remaining arguments are forwarded positionally to ``RNNEncoder``.

    Returns:
        self, so calls can be chained.
    """
    input_module = DiscreteFeatureSequenceInput(vocab_sizes, embedding_sizes)
    self.encoder = RNNEncoder(input_module, rnn_type, hidden_size,
                              num_layers, bidirectional)
    return self
def from_args(cls, args, encoder_input_modules, decoder_input_modules,
              dropout=None, rnn_type=None, target_vocab_size=None,
              attention_type=None, bidirectional=None, learn_init=None,
              bridge_type=None):
    """Assemble encoder, bridge and decoder from ``args`` plus overrides.

    Args:
        args: namespace read for ``learn_init``, ``bridge_type`` and
            ``rnn_type`` when the matching override is None; also forwarded
            to every sub-module's ``from_args``.
        encoder_input_modules: supplies ``embedding_size`` for the encoder.
        decoder_input_modules: supplies ``embedding_size`` for the decoder.
        dropout, rnn_type, target_vocab_size, attention_type, bidirectional:
            optional overrides forwarded unchanged to the sub-modules.
        learn_init: optional override; defaults to ``bool(args.learn_init)``.
        bridge_type: optional override; defaults to ``args.bridge_type``.

    Returns:
        A ``cls`` instance wired with the constructed modules.
    """
    if learn_init is None:
        learn_init = bool(args.learn_init)
    if bridge_type is None:
        bridge_type = args.bridge_type

    # The bridge layout must follow the RNN type actually used by the
    # encoder/decoder, i.e. honour the ``rnn_type`` override rather than
    # always reading ``args.rnn_type``.
    effective_rnn_type = args.rnn_type if rnn_type is None else rnn_type

    encoder = RNNEncoder.from_args(
        args,
        encoder_input_size=encoder_input_modules.embedding_size,
        dropout=dropout,
        rnn_type=rnn_type,
        bidirectional=bidirectional)

    def make_bridge():
        return be.from_args(args, bridge_type=bridge_type,
                            bidirectional=bidirectional)

    if effective_rnn_type == "lstm":
        # Two parallel bridges -- presumably one per LSTM state tensor
        # (hidden and cell); confirm against ParallelModule.
        bridge = ParallelModule([make_bridge(), make_bridge()])
    else:
        bridge = make_bridge()

    decoder = RNNDecoder.from_args(
        args,
        decoder_input_size=decoder_input_modules.embedding_size,
        dropout=dropout,
        rnn_type=rnn_type,
        target_vocab_size=target_vocab_size,
        attention_type=attention_type)

    return cls(encoder_input_modules, decoder_input_modules, encoder, bridge,
               decoder, learn_init=learn_init)
class OutputLSTM(object):
    """Output representation produced by a single LSTM encoder."""

    def __init__(self, output_sz, keep_prob):
        """
        Args:
            output_sz: stored on the instance and passed to the encoder
            keep_prob: passed to the encoder as its second argument
        """
        self.scope = "output_lstm"
        self.output_sz = output_sz
        self.lstm_encoder = RNNEncoder(output_sz, keep_prob, "lstm")

    def build_graph(self, reps, context_mask):
        """
        Args:
            reps: [batch_sz, context_length, reps_sz]
            context_mask: mask forwarded unchanged to the encoder
        Return:
            [batch_sz, context_length, output_sz]
        """
        with vs.variable_scope(self.scope):
            encoded = self.lstm_encoder.build_graph(reps, context_mask)
            return encoded
def from_args(cls, args, encoder_input_modules, dropout=None, rnn_type=None,
              target_vocab_size=None, bidirectional=None, learn_init=None):
    """Assemble an encoder followed by an MLP from ``args`` plus overrides.

    ``learn_init``, ``target_vocab_size`` and ``dropout`` fall back to the
    attribute of the same name on ``args`` when passed as None.

    Returns:
        A ``cls`` instance wired with the encoder and the MLP.
    """
    learn_init = bool(args.learn_init) if learn_init is None else learn_init
    if target_vocab_size is None:
        target_vocab_size = args.target_vocab_size
    if dropout is None:
        dropout = args.dropout

    encoder = RNNEncoder.from_args(
        args,
        encoder_input_size=encoder_input_modules.embedding_size,
        dropout=dropout,
        rnn_type=rnn_type,
        bidirectional=bidirectional)

    # MLP input width: for every state-dims entry, element 0 times element 2,
    # summed over all entries.
    mlp_input_size = sum(dims[0] * dims[2] for dims in encoder.rnn_state_dims)
    mlp = MLP(mlp_input_size, target_vocab_size, dropout=dropout)

    return cls(encoder_input_modules, encoder, mlp, learn_init=learn_init)
def __init__(self, src_vocab_size, tgt_vocab_size, cue_vocab_size,
             goal_vocab_size, embed_size, hidden_size, padding_idx=None,
             num_layers=1, bidirectional=True, attn_mode="mlp",
             with_bridge=False, tie_embedding=False, dropout=0.0,
             use_gpu=False, use_bow=False, use_kd=False,
             use_posterior=False, device=None, unk_idx=None,
             force_copy=True, stage=None):
    """Build the encoders, attention modules, decoder and losses.

    Args:
        src_vocab_size, tgt_vocab_size, cue_vocab_size, goal_vocab_size:
            vocabulary sizes for the utterance, target, knowledge and goal
            embedders. All four must be equal when ``tie_embedding`` is set.
        embed_size: embedding dimension shared by every embedder.
        hidden_size: hidden size shared by encoders, attention and decoder.
        padding_idx: padding token index, or None. When given, that index
            gets weight 0 in the NLL loss.
        num_layers, bidirectional, dropout: forwarded to the RNN encoders
            (``num_layers`` and ``dropout`` also go to the decoder).
        attn_mode: attention mode forwarded to the decoder.
        with_bridge: create the utterance/goal bridge layers.
        tie_embedding: share one embedder across all four vocabularies.
        use_gpu: move the module (and the loss weight, if any) to CUDA.
        use_bow: create the bag-of-words output layer.
        use_kd: create the knowledge dropout layer.
        use_posterior: stored on the instance.
        device: device index; None or a negative value selects "cpu".
        unk_idx, force_copy: forwarded to the copy-generator loss.
        stage: stored on the instance.
    """
    super().__init__()

    self.src_vocab_size = src_vocab_size
    self.tgt_vocab_size = tgt_vocab_size
    self.cue_vocab_size = cue_vocab_size
    self.goal_vocab_size = goal_vocab_size
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.padding_idx = padding_idx
    self.num_layers = num_layers
    self.bidirectional = bidirectional
    self.attn_mode = attn_mode
    self.with_bridge = with_bridge
    self.tie_embedding = tie_embedding
    self.dropout = dropout
    self.use_gpu = use_gpu
    self.use_bow = use_bow
    self.use_kd = use_kd
    self.use_posterior = use_posterior
    self.baseline = 0
    # The default device=None used to hit `None >= 0`, which raises
    # TypeError on Python 3; treat None like a negative index (CPU).
    self.device = device if device is not None and device >= 0 else "cpu"
    self.unk_idx = unk_idx
    self.force_copy = force_copy
    self.stage = stage

    # the utterance embedding
    enc_embedder = Embedder(num_embeddings=self.src_vocab_size,
                            embedding_dim=self.embed_size,
                            padding_idx=self.padding_idx)

    self.utt_encoder = RNNEncoder(input_size=self.embed_size,
                                  hidden_size=self.hidden_size,
                                  embedder=enc_embedder,
                                  num_layers=self.num_layers,
                                  bidirectional=self.bidirectional,
                                  dropout=self.dropout)

    if self.with_bridge:
        self.utt_bridge = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size), nn.Tanh())
        self.goal_bridge = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size), nn.Tanh())

    self.fc1 = nn.Linear(self.hidden_size, self.hidden_size)
    self.fc2 = nn.Linear(self.hidden_size, self.hidden_size)
    self.fc3 = nn.Linear(self.hidden_size * 2, 1)

    if self.tie_embedding:
        # share the same embedding with utt encoder
        assert self.src_vocab_size == self.tgt_vocab_size == self.cue_vocab_size == self.goal_vocab_size
        self.dec_embedder = enc_embedder
        knowledge_embedder = enc_embedder
        goal_embedder = enc_embedder
    else:
        self.dec_embedder = Embedder(num_embeddings=self.tgt_vocab_size,
                                     embedding_dim=self.embed_size,
                                     padding_idx=self.padding_idx)
        knowledge_embedder = Embedder(num_embeddings=self.cue_vocab_size,
                                      embedding_dim=self.embed_size,
                                      padding_idx=self.padding_idx)
        goal_embedder = Embedder(num_embeddings=self.goal_vocab_size,
                                 embedding_dim=self.embed_size,
                                 padding_idx=self.padding_idx)

    self.knowledge_encoder = RNNEncoder(input_size=self.embed_size,
                                        hidden_size=self.hidden_size,
                                        embedder=knowledge_embedder,
                                        num_layers=self.num_layers,
                                        bidirectional=self.bidirectional,
                                        dropout=self.dropout)

    self.goal_encoder = RNNEncoder(input_size=self.embed_size,
                                   hidden_size=self.hidden_size,
                                   embedder=goal_embedder,
                                   num_layers=self.num_layers,
                                   bidirectional=self.bidirectional,
                                   dropout=self.dropout)

    self.prior_attention = Attention(query_size=self.hidden_size,
                                     memory_size=self.hidden_size,
                                     hidden_size=self.hidden_size,
                                     mode="dot",
                                     device=self.device)

    self.posterior_attention = Attention(query_size=self.hidden_size,
                                         memory_size=self.hidden_size,
                                         hidden_size=self.hidden_size,
                                         mode="dot",
                                         device=self.device)

    self.decoder = Decoder(input_size=self.embed_size,
                           hidden_size=self.hidden_size,
                           output_size=self.tgt_vocab_size,
                           embedder=self.dec_embedder,
                           num_layers=self.num_layers,
                           attn_mode=self.attn_mode,
                           memory_size=self.hidden_size,
                           dropout=self.dropout,
                           device=self.device)

    self.softmax = nn.Softmax(dim=-1)
    self.sigmoid = nn.Sigmoid()
    self.tanh = nn.Tanh()

    if self.use_bow:
        self.bow_output_layer = nn.Sequential(
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.hidden_size),
            nn.Tanh(),
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.tgt_vocab_size),
            nn.LogSoftmax(dim=-1))

    if self.use_kd:
        self.knowledge_dropout = nn.Dropout(self.dropout)

    # Per-class loss weight: all ones, with the padding index zeroed out.
    if self.padding_idx is not None:
        self.weight = torch.ones(self.tgt_vocab_size)
        self.weight[self.padding_idx] = 0
    else:
        self.weight = None

    self.nll_loss = NLLLoss(weight=self.weight,
                            ignore_index=self.padding_idx,
                            reduction='mean')
    self.copy_gen_loss = CopyGeneratorLoss(vocab_size=self.tgt_vocab_size,
                                           force_copy=self.force_copy,
                                           unk_index=self.unk_idx,
                                           ignore_index=self.padding_idx)
    self.kl_loss = torch.nn.KLDivLoss(reduction="mean")

    if self.use_gpu:
        self.cuda()
        # self.weight is None when no padding index was given; calling
        # .cuda() on it unconditionally raised AttributeError.
        if self.weight is not None:
            self.weight = self.weight.cuda()
class CoAttn(BasicAttn):
    """Co-attention between keys and values, fused by an LSTM encoder."""

    def __init__(self, keep_prob, key_vec_size, value_vec_size):
        BasicAttn.__init__(self, keep_prob, key_vec_size, value_vec_size)
        self.scope = "CoAttn"
        self.encoder = RNNEncoder(key_vec_size, keep_prob, "lstm")

    def build_graph(self, values, values_mask, keys, keys_mask):
        """Build the co-attention graph.

        Args:
            values: [batch_sz, M, h]
            values_mask: [batch_sz, M]
            keys: [batch_sz, N, h]
            keys_mask: [batch_sz, N]
            (N = n_keys, M = n_values, h = hidden_size)
        Return:
            A pair ``(first, attn)``. ``attn`` is the encoder output after
            dropout. ``first`` is the first element returned by the
            masked softmax taken over the keys axis.
            NOTE(review): the first element is not the attention
            distribution itself -- confirm what callers expect here.
        """
        hidden = self.key_vec_size
        num_values = values.shape[1]
        assert (values.shape[-1] == hidden)
        logger.error("values: {}".format(values.shape))

        with vs.variable_scope(self.scope):
            # projection applied to the values: weight [h, h], bias [h]
            proj_w = tf.get_variable("W", [hidden, hidden], tf.float32,
                                     tf.contrib.layers.xavier_initializer())
            proj_b = tf.get_variable("b", [hidden], tf.float32,
                                     tf.zeros_initializer())

            # [batch_sz * M, h] -> tanh(values W + b) -> [batch_sz, M, h]
            flat_values = tf.reshape(values, [-1, hidden])
            projected = tf.nn.tanh(tf.matmul(flat_values, proj_w) + proj_b)
            projected = tf.reshape(projected, [-1, num_values, hidden])

            # affinity matrix: [batch_sz, N, M]
            affinity = tf.matmul(keys, tf.transpose(projected, [0, 2, 1]))
            logger.error("L: {}".format(affinity.shape))

            # ---- keys attend over values ----
            # [batch_sz, 1, M]
            values_mask_exp = tf.expand_dims(values_mask, 1)
            # [batch_sz, N, 1]
            keys_mask_exp = tf.expand_dims(keys_mask, 2)
            # softmax of the affinity over the values axis: [batch_sz, N, M]
            _unused, alpha = masked_softmax(affinity, values_mask_exp, 2)
            logger.error("alpha: {}".format(alpha.shape))
            # [batch_sz, N, h]
            key_to_value = tf.matmul(alpha, values)
            logger.error("k2v: {}".format(key_to_value.shape))

            # ---- values attend over keys ----
            # softmax of the affinity over the keys axis: [batch_sz, N, M]
            keys_axis_first, beta = masked_softmax(affinity, keys_mask_exp, 1)
            logger.error("beta: {}".format(beta.shape))
            logger.error("beta: {}".format(beta.shape))
            # [batch_sz, M, h]
            value_to_key = tf.matmul(tf.transpose(beta, [0, 2, 1]), keys)
            logger.error("v2k: {}".format(value_to_key.shape))

            # ---- second-level attention ----
            # [batch_sz, N, M] x [batch_sz, M, h] -> [batch_sz, N, h]
            second_level = tf.matmul(alpha, value_to_key)
            logger.error("s: {}".format(second_level.shape))

            # [batch_sz, N, 2 * h]
            lstm_inputs = tf.concat([second_level, key_to_value], 2)
            logger.error("lstm_inputs: {}".format(lstm_inputs.shape))
            logger.error("keys mask: {}".format(keys_mask.shape))
            attn = self.encoder.build_graph(lstm_inputs, keys_mask)
            logger.error("attn: {}".format(attn.shape))

            # Apply dropout
            attn = tf.nn.dropout(attn, self.keep_prob)
            return keys_axis_first, attn
if __name__ == '__main__': from encoder import CNNEncoder, RNNEncoder x1 = tf.placeholder(tf.int32, [None, 20], name="input_x1") x2 = tf.placeholder(tf.int32, [None, 20], name="input_x2") y = tf.placeholder(tf.float32, [None], name="input_y") cnn_encoder = CNNEncoder( sequence_length=20, embedding_dim=128, filter_sizes=[3, 4, 5], num_filters=100, ) rnn_encoder = RNNEncoder( rnn_cell='lstm', hidden_units=100, num_layers=2, dropout_keep_prob=0.7, use_dynamic=False, use_attention=False, ) model1 = SiameseSimilarityNets(input_x1=x1, input_x2=x2, input_y=y, word_embedding_type='rand', vocab_size=10000, embedding_size=128, encoder_type='cnn', cnn_encoder=cnn_encoder, rnn_encoder=rnn_encoder, dense_layer=False, l2_reg_lambda=0, pred_threshold=0.5,
import time
import os
import pprint
import util
import tensorflow as tf
import datetime
from encoder import RNNEncoder, RNNEncoderTrainer, RNNEncoderEvaluator

pp = pprint.PrettyPrinter(indent=2)

# Encoder parameters (flags registered by the encoder class itself)
RNNEncoder.add_flags()

# Training parameters
tf.flags.DEFINE_integer("max_sequence_length", 525,
                        "Examples will be padded/truncated to this length")
tf.flags.DEFINE_integer("num_epochs", 20, "Number of training epochs")
# The help text used to be a copy of the evaluation flag's and described the
# wrong action.
tf.flags.DEFINE_integer("checkpoint_every", 1,
                        "Save a model checkpoint after this number of steps")
tf.flags.DEFINE_integer(
    "evaluate_every", 1,
    "Evaluate model on dev set after this number of steps")

# Session Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", False,
                        "Allow soft device placement (e.g. no GPU)")
tf.flags.DEFINE_boolean("log_device_placement", True,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
# NOTE(review): bare attribute read with no assignment -- presumably here to
# force flag parsing before the values are dumped below; confirm. The flag is
# not defined in this script, so it presumably comes from add_flags().
FLAGS.batch_size

print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
def train():
    """Train a siamese similarity or classification model from FLAGS.

    Steps, in order: print the flag values, load and split the dataset,
    build the placeholders/encoders/model, set up the optimizer (with
    optional global-norm clipping), summaries and the checkpoint saver,
    optionally initialise the embedding matrix from pre-trained word2vec
    vectors, then run the training loop with periodic evaluation on the
    held-out split. A checkpoint is written whenever the dev F1 improves
    and exceeds 0.5; training stops early once more than 4000 steps pass
    without a dev F1 improvement.

    Raises:
        AssertionError: if the installed TensorFlow is older than 1.5.
        SystemExit: if ``FLAGS.data_file`` is empty.
        ValueError: if ``FLAGS.model_class`` is not recognised, or a
            vocabulary word has no pre-trained vector.
    """
    print("Using TensorFlow Version %s" % tf.__version__)
    # Compare numerically: as strings, "1.10" sorts before "1.5", so the old
    # string comparison rejected TensorFlow 1.10 and later.
    tf_major_minor = tuple(
        int(part) for part in tf.__version__.split(".")[:2])
    assert tf_major_minor >= (1, 5), "Need TensorFlow 1.5 or Later."

    print("\nParameters:")
    for attr in FLAGS:
        value = FLAGS[attr].value
        print("{}={}".format(attr.upper(), value))
    print("")

    if not FLAGS.data_file:
        exit("Train data file is empty. Set --data_file argument.")

    dataset = Dataset(data_file=FLAGS.data_file,
                      char_level=FLAGS.char_model,
                      embedding_dim=FLAGS.embedding_dim)
    vocab, word2id = dataset.read_vocab()
    print("Vocabulary Size: {:d}".format(len(vocab)))

    # Generate batches
    data = dataset.process_data(
        data_file=FLAGS.data_file,
        sequence_length=FLAGS.max_document_length)  # (x1, x2, y)
    train_data, eval_data = dataset.train_test_split(
        data, test_size=FLAGS.val_percentage, random_seed=FLAGS.random_seed)
    train_batches = dataset.batch_iter(train_data,
                                       FLAGS.batch_size,
                                       FLAGS.num_epochs,
                                       shuffle=True)

    with tf.Graph().as_default():
        tf.set_random_seed(FLAGS.random_seed)
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)

        input_x1 = tf.placeholder(tf.int32,
                                  [None, FLAGS.max_document_length],
                                  name="input_x1")
        input_x2 = tf.placeholder(tf.int32,
                                  [None, FLAGS.max_document_length],
                                  name="input_x2")
        input_y = tf.placeholder(tf.float32, [None], name="input_y")
        # NOTE(review): this placeholder reuses the name "input_y", which
        # looks like a copy-paste slip. Left unchanged because renaming it
        # alters graph node names that other scripts may look up -- confirm
        # before fixing.
        dropout_keep_prob = tf.placeholder(tf.float32, name="input_y")

        cnn_encoder = CNNEncoder(
            sequence_length=FLAGS.max_document_length,
            embedding_dim=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
        )

        rnn_encoder = RNNEncoder(
            rnn_cell=FLAGS.rnn_cell,
            hidden_units=FLAGS.hidden_units,
            num_layers=FLAGS.num_layers,
            dropout_keep_prob=dropout_keep_prob,
            use_dynamic=FLAGS.use_dynamic,
            use_attention=FLAGS.use_attention,
        )

        with sess.as_default():
            if FLAGS.model_class == 'similarity':
                model = SiameseSimilarityNets(
                    input_x1=input_x1,
                    input_x2=input_x2,
                    input_y=input_y,
                    encoder_type=FLAGS.model_type,
                    cnn_encoder=cnn_encoder,
                    rnn_encoder=rnn_encoder,
                    vocab_size=len(vocab),
                    embedding_size=FLAGS.embedding_dim,
                    word_embedding_type=FLAGS.word_embedding_type,
                    dense_layer=FLAGS.dense_layer,
                    pred_threshold=FLAGS.pred_threshold,
                    l2_reg_lambda=FLAGS.l2_reg_lambda,
                    energy_func=FLAGS.energy_function,
                    loss_func=FLAGS.loss_function,
                    margin=FLAGS.margin,
                    contrasive_loss_pos_weight=FLAGS.scale_pos_weight,
                    weight_sharing=FLAGS.weight_sharing)
                print("Initialized SiameseSimilarityNets model.")
            elif FLAGS.model_class == 'classification':
                model = SiameseClassificationNets(
                    input_x1=input_x1,
                    input_x2=input_x2,
                    input_y=input_y,
                    word_embedding_type=FLAGS.word_embedding_type,
                    vocab_size=len(vocab),
                    embedding_size=FLAGS.embedding_dim,
                    encoder_type=FLAGS.model_type,
                    cnn_encoder=cnn_encoder,
                    rnn_encoder=rnn_encoder,
                    dense_layer=FLAGS.dense_layer,
                    l2_reg_lambda=FLAGS.l2_reg_lambda,
                    interaction='multiply',
                    weight_sharing=FLAGS.weight_sharing)
                print("Initialized SiameseClassificationNets model.")
            else:
                raise ValueError(
                    "Invalid model class. Expected one of {`similarity`, `classification`} "
                )
            model.forward()

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            learning_rate = tf.train.exponential_decay(
                FLAGS.lr,
                global_step,
                decay_steps=int(40000 / FLAGS.batch_size),
                decay_rate=FLAGS.weight_decay_rate,
                staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate)

            if FLAGS.clip_norm:
                # Clipping improves the loss, but the smaller weights yield
                # smaller scores, so the prediction threshold may need
                # re-tuning for the best F1.
                variables = tf.trainable_variables()
                grads, _ = tf.clip_by_global_norm(
                    tf.gradients(model.loss, variables), FLAGS.clip_norm)
                # Materialise the pairs once so the same list serves both
                # apply_gradients and the summary loop below.
                grads_and_vars = list(zip(grads, variables))
            else:
                grads_and_vars = optimizer.compute_gradients(model.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)
            print("Defined gradient summaries.")

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", model.loss)
            f1_summary = tf.summary.scalar("F1-score", model.f1)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, f1_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(FLAGS.model_dir, "summaries",
                                             "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, f1_summary])
            dev_summary_dir = os.path.join(FLAGS.model_dir, "summaries",
                                           "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory
            # already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(FLAGS.model_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            graph_def = tf.get_default_graph().as_graph_def()
            with open(os.path.join(checkpoint_dir, "graphpb.txt"), 'w') as f:
                f.write(str(graph_def))
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            if FLAGS.word_embedding_type != 'rand':
                embedding_init = np.zeros(shape=(len(vocab),
                                                 FLAGS.embedding_dim))
                # load vectors from the word2vec
                print("Initializing word embedding with pre-trained word2vec.")
                words, vectors = dataset.load_word2vec()
                # One pass to index the pre-trained words instead of a
                # linear words.index() search per vocabulary entry.
                # setdefault keeps the first occurrence, like list.index.
                row_of_word = {}
                for row, word in enumerate(words):
                    row_of_word.setdefault(word, row)
                for idx, w in enumerate(vocab):
                    if w not in row_of_word:
                        # list.index raised ValueError for a missing word;
                        # keep the same exception type.
                        raise ValueError(
                            "{!r} is not in the pre-trained word list".format(
                                w))
                    vec = vectors[row_of_word[w]]
                    embedding_init[idx] = np.asarray(vec).astype(np.float32)
                sess.run(model.W.assign(embedding_init))

            print("Starting training...")
            F1_best = 0.0
            last_improved_step = 0
            for batch in train_batches:
                x1_batch, x2_batch, y_batch = zip(*batch)
                feed_dict = {
                    input_x1: x1_batch,
                    input_x2: x2_batch,
                    input_y: y_batch,
                    dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, loss, cm, acc, precision, recall, f1, summaries = sess.run(
                    [
                        train_op, global_step, model.loss, model.cm,
                        model.acc, model.precision, model.recall, model.f1,
                        train_summary_op
                    ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                if step % FLAGS.log_every_steps == 0:
                    train_summary_writer.add_summary(summaries, step)
                    print(
                        "{} step {} TRAIN loss={:g} acc={:.3f} P={:.3f} R={:.3f} F1={:.6f}"
                        .format(time_str, step, loss, acc, precision, recall,
                                f1))
                if step % FLAGS.evaluate_every_steps == 0:
                    # Evaluate on the whole held-out split with dropout off.
                    x1_batch, x2_batch, y_batch = zip(*eval_data)
                    feed_dict = {
                        input_x1: x1_batch,
                        input_x2: x2_batch,
                        input_y: y_batch,
                        dropout_keep_prob: 1
                    }
                    loss, cm, acc, precision, recall, f1, summaries = sess.run(
                        [
                            model.loss, model.cm, model.acc, model.precision,
                            model.recall, model.f1, dev_summary_op
                        ], feed_dict)
                    dev_summary_writer.add_summary(summaries, step)
                    if f1 > F1_best:
                        F1_best = f1
                        last_improved_step = step
                        # Only checkpoint models that clear a minimum F1.
                        if F1_best > 0.5:
                            path = saver.save(sess,
                                              checkpoint_prefix,
                                              global_step=step)
                            print(
                                "Saved model with F1={} checkpoint to {}\n".format(
                                    F1_best, path))
                        improved_token = '*'
                    else:
                        improved_token = ''
                    print(
                        "{} step {} DEV loss={:g} acc={:.3f} cm{} P={:.3f} R={:.3f} F1={:.6f} {}"
                        .format(time_str, step, loss, acc, cm, precision,
                                recall, f1, improved_token))
                # Early stopping: give up after more than 4000 steps without
                # a dev F1 improvement.
                if step - last_improved_step > 4000:
                    print(
                        "No improvement for a long time, early-stopping at best F1={}"
                        .format(F1_best))
                    break
def __init__(self, keep_prob, key_vec_size, value_vec_size):
    """Initialise the base attention state and the co-attention encoder.

    All three arguments are forwarded to ``BasicAttn.__init__``; the LSTM
    encoder is built from ``key_vec_size`` and ``keep_prob``.
    """
    BasicAttn.__init__(self, keep_prob, key_vec_size, value_vec_size)
    fusion_encoder = RNNEncoder(key_vec_size, keep_prob, "lstm")
    self.encoder = fusion_encoder
    self.scope = "CoAttn"
def __init__(self, params):
    """Build the question/relation ranking graph and open a session.

    Creates the input placeholders, encodes the question and the positive /
    negative relations with the encoder family selected by
    ``params['encode_name']`` (one of 'CNN', 'ADD', 'RNN'), applies dropout,
    defines a margin ranking loss over dot similarities, sets up an Adam
    train op with global-norm gradient clipping, and finally either restores
    variables from ``params['load_path']`` or initialises them.

    Raises:
        ValueError: if ``params['encode_name']`` is not CNN, ADD or RNN.
    """
    max_sentence_len = params['max_sentence_len']
    question_cfg = params['question_config']
    relation_cfg = params['relation_config']

    # ---- inputs ----
    self.pos_relation_ids = tf.placeholder(tf.int32, [None, 3])
    self.neg_relation_ids = tf.placeholder(tf.int32, [None, 3])
    self.q_word_ids = tf.placeholder(tf.int32, [None, max_sentence_len],
                                     name='q_word_ids')
    self.q_sentence_lengths = tf.placeholder(tf.int64, [None],
                                             name="q_sentence_lengths")
    self.q_char_ids = tf.placeholder(
        tf.int32, [None, max_sentence_len, params['max_word_len']],
        name='q_char_ids')
    self.q_word_lengths = tf.placeholder(tf.int64, [None, max_sentence_len])
    self.dropout_keep_prob = tf.placeholder(tf.float32,
                                            name='dropout_keep_prob')
    self.pattern_positions = tf.placeholder(
        tf.float32, [None, max_sentence_len, question_cfg['word_dim']])
    self.relation_positions = tf.placeholder(
        tf.float32, [None, 3, relation_cfg['word_dim']])

    def unit_rows(tensor):
        # Divide each row by its L2 norm (sum of squares taken over axis 1).
        return tensor / tf.sqrt(
            tf.reduce_sum(tensor ** 2, 1, keep_dims=True))

    encoder_kind = params['encode_name']
    with tf.device('/gpu:%s' % params.get('gpu', 0)):
        if encoder_kind == 'CNN':
            question_encoder = CNNEncoder(question_cfg, 'question_cnn')
            relation_encoder = CNNEncoder(relation_cfg, 'relation_cnn')
            # Character ids are used when the question config has a char_dim.
            if 'char_dim' in question_cfg:
                question = question_encoder.encode(self.q_char_ids)
            else:
                question = question_encoder.encode(self.q_word_ids)
            pos_relation = relation_encoder.encode(self.pos_relation_ids,
                                                   False)
            neg_relation = relation_encoder.encode(self.neg_relation_ids,
                                                   True)
            question = unit_rows(question)
            pos_relation = unit_rows(pos_relation)
            neg_relation = unit_rows(neg_relation)
        elif encoder_kind == 'ADD':
            if question_cfg.get("use_position", False):
                question_encoder = PositionADDEncoder(question_cfg,
                                                      "question_add")
                question = question_encoder.encode(self.q_word_ids,
                                                   self.pattern_positions)
            else:
                question_encoder = ADDEncoder(question_cfg, "question_add")
                question = question_encoder.encode(self.q_word_ids,
                                                   self.q_sentence_lengths)
            question = unit_rows(question)

            if relation_cfg.get("use_position", False):
                relation_encoder = PositionADDEncoder(relation_cfg,
                                                      'relation_add')
                pos_relation = relation_encoder.encode(
                    self.pos_relation_ids, self.relation_positions)
                neg_relation = relation_encoder.encode(
                    self.neg_relation_ids, self.relation_positions)
            else:
                relation_encoder = ADDEncoder(relation_cfg, 'relation_add')
                pos_relation = relation_encoder.encode(self.pos_relation_ids,
                                                       None)
                neg_relation = relation_encoder.encode(self.neg_relation_ids,
                                                       None)
            pos_relation = unit_rows(pos_relation)
            neg_relation = unit_rows(neg_relation)
        elif encoder_kind == 'RNN':
            # The RNN outputs are used as-is, without row normalisation.
            question_encoder = RNNEncoder(question_cfg, 'question_rnn')
            relation_encoder = RNNEncoder(relation_cfg, 'relation_rnn')
            question = question_encoder.encode(self.q_word_ids,
                                               self.q_sentence_lengths,
                                               self.q_char_ids,
                                               self.q_word_lengths, False)
            pos_relation = relation_encoder.encode(self.pos_relation_ids,
                                                   None, None, None, False)
            neg_relation = relation_encoder.encode(self.neg_relation_ids,
                                                   None, None, None, True)
        else:
            raise ValueError('encoder_name should be one of [CNN, ADD, RNN]')

        # ---- dropout and similarities ----
        self.question_drop = tf.nn.dropout(question, self.dropout_keep_prob)
        self.pos_relation_drop = tf.nn.dropout(pos_relation,
                                               self.dropout_keep_prob)
        self.neg_relation_drop = tf.nn.dropout(neg_relation,
                                               self.dropout_keep_prob)
        self.pos_sim = self.dot_sim(self.question_drop,
                                    self.pos_relation_drop)
        self.neg_sim = self.dot_sim(self.question_drop,
                                    self.neg_relation_drop)

        # Hinge loss: the negative similarity plus the margin should not
        # exceed the positive similarity.
        hinge = tf.maximum(0., self.neg_sim + params['margin'] - self.pos_sim)
        self.loss = tf.reduce_mean(hinge)

        # ---- optimisation with global-norm clipping ----
        tvars = tf.trainable_variables()
        max_grad_norm = 2
        raw_grads = tf.gradients(self.loss, tvars)
        self.grads, _ = tf.clip_by_global_norm(raw_grads, max_grad_norm)
        optimizer = tf.train.AdamOptimizer(params['lr'])
        self.train_op = optimizer.apply_gradients(zip(self.grads, tvars))

    # ---- session ----
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    config.log_device_placement = False
    self.session = tf.Session(config=config)
    self.saver = tf.train.Saver(tf.all_variables(), max_to_keep=1)
    if params['load_path']:
        self.saver.restore(self.session, params['load_path'])
    else:
        self.session.run(tf.initialize_all_variables())

    self.params = params
def __init__(self, keep_prob, key_size, value_size):
    """Initialise the base attention state and the self-attention encoder.

    All three arguments are forwarded to ``BasicAttn.__init__``; the GRU
    encoder is built from ``key_size`` and ``keep_prob``.
    """
    BasicAttn.__init__(self, keep_prob, key_size, value_size)
    # width of the intermediate projection used when scoring key pairs
    self.v_size = 35
    gru_encoder = RNNEncoder(key_size, keep_prob, "gru")
    self.encoder = gru_encoder
    self.scope = "SelfAttn"
class SelfAttn(BasicAttn):
    """Self-attention over the output of the basic attention layer."""

    def __init__(self, keep_prob, key_size, value_size):
        BasicAttn.__init__(self, keep_prob, key_size, value_size)
        self.scope = "SelfAttn"
        self.encoder = RNNEncoder(key_size, keep_prob, "gru")
        # width of the intermediate projection used when scoring key pairs
        self.v_size = 35

    def build_graph(self, values, values_mask, keys, keys_mask):
        """Build the self-attention graph.

        Args:
            values: [batch_sz, M, h]
            values_mask: [batch_sz, M]
            keys: [batch_sz, N, h]
            keys_mask: [batch_sz, N]
        Return:
            A pair ``(None, attn)`` where ``attn`` is the encoder output
            after dropout.
        """
        hidden = self.key_vec_size
        num_keys = keys_mask.shape[1]
        proj = self.v_size

        # First-level attention: the parent layer's second output replaces
        # the keys used from here on.
        _, keys = super(SelfAttn, self).build_graph(values, values_mask,
                                                    keys, keys_mask)

        with vs.variable_scope(self.scope):
            xavier = tf.contrib.layers.xavier_initializer
            W_1 = tf.get_variable('W_1', [hidden, proj], tf.float32,
                                  xavier())
            W_2 = tf.get_variable('W_2', [hidden, proj], tf.float32,
                                  xavier())
            v_weight = tf.get_variable('v', [proj, 1], tf.float32, xavier())

            flat_keys = tf.reshape(keys, [-1, hidden])

            def tiled_projection(weight):
                # project every key, then repeat along a new third axis:
                # [batch_sz, N, N, v]
                projected = tf.reshape(tf.matmul(flat_keys, weight),
                                       [-1, num_keys, proj])
                return tf.tile(tf.expand_dims(projected, 2),
                               [1, 1, num_keys, 1])

            first_term = tiled_projection(W_1)
            second_term = tiled_projection(W_2)

            # restore keys to [batch_sz, N, h]
            keys = tf.reshape(flat_keys, [-1, num_keys, hidden])

            # [batch_sz, N, N, v]; entry (i, j) is W_1 key_i + W_2 key_j
            pair_sum = first_term + tf.transpose(second_term, [0, 2, 1, 3])

            # one score per key pair: [batch_sz * N * N, 1] -> [batch_sz, N, N]
            scores = tf.matmul(tf.reshape(pair_sum, [-1, proj]), v_weight)
            scores = tf.reshape(scores, [-1, num_keys, num_keys])

            # The scores are transposed to [N, batch_sz, N] for the masked
            # softmax with keys_mask, then transposed back.
            _, alpha = masked_softmax(tf.transpose(scores, [1, 0, 2]),
                                      keys_mask, 2)
            alpha = tf.transpose(alpha, [1, 0, 2])

            # attention-weighted keys: [batch_sz, N, h]
            attended = tf.matmul(alpha, keys)

            # keys and their attended summary, side by side, go to the GRU
            bidirectional_gru_input = tf.concat([keys, attended], 2)
            attn = self.encoder.build_graph(bidirectional_gru_input,
                                            keys_mask)
            attn = tf.nn.dropout(attn, self.keep_prob)
            return None, attn