def __init__(self, args, is_training=True):
    """Build the graph of a recurrent sequence model with a 2-way output.

    Args:
        args: Hyper-parameter object. The attributes read here are
            ``model``, ``rnn_size``, ``num_layers``, ``batch_size``,
            ``seq_length``, ``vocab_size`` and ``grad_clip``.
        is_training: When false, ``args.batch_size`` and
            ``args.seq_length`` are overwritten with 1 on the caller's
            object before the graph is built.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    # NOTE(review): this method reached review with its line breaks lost;
    # the nesting of the scope/device blocks below is reconstructed.
    # Confirm it against the variable names of existing checkpoints.
    self.args = args
    if not is_training:
        # Mutates the caller's args object in place.
        args.batch_size = 1
        args.seq_length = 1

    if args.model == 'rnn':
        cell_type = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_type = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_type = rnn_cell.BasicLSTMCell
    else:
        raise Exception('model type not supported: {}'.format(args.model))

    # One cell object is repeated for every layer of the stack.
    single_layer = cell_type(args.rnn_size)
    self.cell = rnn_cell.MultiRNNCell([single_layer] * args.num_layers)

    self.input_data = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length])
    # Target replication
    self.targets = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length])
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnn'):
        # The output projection has exactly two classes.
        softmax_w = tf.get_variable('softmax_w', [args.rnn_size, 2])
        softmax_b = tf.get_variable('softmax_b', [2])
        with tf.device('/cpu:0'):
            embedding = tf.get_variable(
                'embedding', [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # One tensor per time step, with the time axis squeezed out.
            step_inputs = [
                tf.squeeze(step, [1])
                for step in tf.split(1, args.seq_length, embedded)
            ]

    outputs, last_state = seq2seq.rnn_decoder(
        step_inputs, self.initial_state, self.cell, loop_function=None)
    flat_outputs = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.nn.xw_plus_b(flat_outputs, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights])
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # The learning rate starts at 0.0 and is not trainable.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    # NOTE(review): aggregation_method=2 is presumably
    # tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N - confirm.
    raw_grads = tf.gradients(self.cost, tvars, aggregation_method=2)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, config, is_training=False):
    """Build the graph of a sequence model driven by an ACT cell.

    Args:
        config: Hyper-parameter object. The attributes read here are
            ``batch_size``, ``num_steps``, ``hidden_size``, ``vocab_size``,
            ``max_grad_norm``, ``use_lstm``, ``epsilon``,
            ``max_computation``, ``ponder_time_penalty`` and
            ``learning_rate``.
        is_training: When true, the gradient-clipping / Adam training op
            (``self.train_op``) and ``self.lr`` are created as well.
    """
    # NOTE(review): this method reached review with its line breaks lost;
    # the nesting of the variable scopes below is reconstructed.
    # Confirm it against the variable names of existing checkpoints.
    self.config = config
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    self.hidden_size = hidden_size = config.hidden_size
    self.num_layers = 1
    vocab_size = config.vocab_size
    self.max_grad_norm = config.max_grad_norm
    self.use_lstm = config.use_lstm

    # Placeholders for inputs.
    self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # Tensor.set_shape() refines the static shape in place and returns
    # None, so it must not be chained onto the assignment: doing so left
    # self.initial_state set to None instead of the zeros tensor.
    initial_state = array_ops.zeros(
        array_ops.pack([self.batch_size, self.num_steps]),
        dtype=tf.float32)
    initial_state.set_shape([None, self.num_steps])
    self.initial_state = initial_state

    embedding = tf.get_variable(
        'embedding', [self.config.vocab_size, self.config.hidden_size])

    # Set up ACT cell and inner rnn-type cell for use inside the ACT cell.
    with tf.variable_scope("rnn"):
        if self.use_lstm:
            inner_cell = rnn_cell.BasicLSTMCell(self.config.hidden_size)
        else:
            inner_cell = rnn_cell.GRUCell(self.config.hidden_size)

    with tf.variable_scope("ACT"):
        act = ACTCell(self.config.hidden_size, inner_cell, config.epsilon,
                      max_computation=config.max_computation,
                      batch_size=self.batch_size)

    # Embed the inputs, then split them into one tensor per time step
    # with the time axis squeezed out.
    inputs = tf.nn.embedding_lookup(embedding, self.input_data)
    inputs = [
        tf.squeeze(single_input, [1])
        for single_input in tf.split(1, self.config.num_steps, inputs)
    ]

    # The state returned by rnn() is not used; see self.final_state below.
    self.outputs, _ = rnn(act, inputs, dtype=tf.float32)

    # Softmax to get probability distribution over vocab.
    output = tf.reshape(tf.concat(1, self.outputs), [-1, hidden_size])
    softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    # dim (numsteps*batchsize, vocabsize)
    self.logits = tf.matmul(output, softmax_w) + softmax_b

    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([batch_size * num_steps])],
        vocab_size)

    # Add up loss and retrieve batch-normalised ponder cost:
    # sum N + sum Remainder.
    ponder_cost = act.calculate_ponder_cost(
        time_penalty=self.config.ponder_time_penalty)
    self.cost = (tf.reduce_sum(loss) / batch_size) + ponder_cost
    # NOTE(review): final_state is the last step's output, not the state
    # returned by rnn() - confirm this is intended.
    self.final_state = self.outputs[-1]

    if is_training:
        # NOTE(review): the optimizer is built from config.learning_rate;
        # self.lr is created here but not passed to it.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, tvars), self.max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, is_training=True):
    """Build a generator RNN feeding a discriminator RNN, plus summaries.

    The generator maps the input tokens to per-step logits over the
    vocabulary. The discriminator consumes those logits, projected through
    its own embedding matrix, and emits per-step 2-way logits, which are
    scored against ``self.targets`` to form ``self.gen_cost``.

    Args:
        args: Hyper-parameter object. The attributes read here are
            ``model``, ``rnn_size``, ``num_layers``, ``batch_size``,
            ``seq_length``, ``vocab_size`` and ``grad_clip``.
        is_training: When false the graph is built for a sequence length
            of 1 and no gradient / training ops are created.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    # NOTE(review): this method reached review with its line breaks lost;
    # the nesting of the scope/device blocks below is reconstructed.
    # Confirm it against the variable names of existing checkpoints.
    if not is_training:
        seq_length = 1
    else:
        seq_length = args.seq_length

    if args.model == 'rnn':
        cell_type = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_type = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_type = rnn_cell.BasicLSTMCell
    else:
        raise Exception('model type not supported: {}'.format(args.model))
    cell_gen = cell_type(args.rnn_size)
    cell_dis = cell_type(args.rnn_size)

    # Pass the generated sequences and targets (1)
    with tf.name_scope('input'):
        with tf.name_scope('data'):
            self.input_data = tf.placeholder(
                tf.int32, [args.batch_size, seq_length])
        with tf.name_scope('targets'):
            self.targets = tf.placeholder(
                tf.int32, [args.batch_size, seq_length])

    ############
    # Generator
    ############
    with tf.variable_scope('generator'):
        self.cell_gen = rnn_cell.MultiRNNCell([cell_gen] * args.num_layers)
        self.initial_state_gen = self.cell_gen.zero_state(
            args.batch_size, tf.float32)

        with tf.variable_scope('rnn'):
            softmax_w = tf.get_variable(
                'softmax_w', [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable('softmax_b', [args.vocab_size])
            with tf.device('/cpu:0'):
                embedding = tf.get_variable(
                    'embedding', [args.vocab_size, args.rnn_size])
                inputs_gen = tf.split(
                    1, seq_length,
                    tf.nn.embedding_lookup(embedding, self.input_data))
                inputs_gen = [tf.squeeze(i, [1]) for i in inputs_gen]

        outputs_gen, last_state_gen = seq2seq.rnn_decoder(
            inputs_gen, self.initial_state_gen, self.cell_gen,
            loop_function=None)

        # One logits tensor over the vocabulary per time step.
        self.logits_sequence = [
            tf.nn.xw_plus_b(output_gen, softmax_w, softmax_b)
            for output_gen in outputs_gen
        ]
        self.final_state_gen = last_state_gen

    ################
    # Discriminator
    ################
    with tf.variable_scope('discriminator'):
        self.cell_dis = rnn_cell.MultiRNNCell([cell_dis] * args.num_layers)
        self.initial_state_dis = self.cell_dis.zero_state(
            args.batch_size, tf.float32)

        with tf.variable_scope('rnn'):
            # These rebind softmax_w / softmax_b / embedding to the
            # discriminator's own variables (2-way output).
            softmax_w = tf.get_variable('softmax_w', [args.rnn_size, 2])
            softmax_b = tf.get_variable('softmax_b', [2])
            embedding = tf.get_variable(
                'embedding', [args.vocab_size, args.rnn_size])
            # The raw generator logits (not their softmax) are projected
            # through the embedding matrix.
            inputs_dis = [
                tf.matmul(logit, embedding)
                for logit in self.logits_sequence
            ]

        outputs_dis, last_state_dis = seq2seq.rnn_decoder(
            inputs_dis, self.initial_state_dis, self.cell_dis,
            loop_function=None)

        probs, logits = [], []
        for output_dis in outputs_dis:
            logit = tf.nn.xw_plus_b(output_dis, softmax_w, softmax_b)
            prob = tf.nn.softmax(logit)
            logits.append(logit)
            probs.append(prob)

        with tf.name_scope('summary'):
            probs = tf.pack(probs)
            # Slice out class index 1 for every step and batch element.
            # The graph has `seq_length` steps (1 when not training), so
            # the slice uses that rather than args.seq_length, which
            # would ask for more steps than exist in inference mode.
            probs_real = tf.slice(
                probs, [0, 0, 1], [seq_length, args.batch_size, 1])
            variable_summaries(probs_real, 'probability of real')

        self.final_state_dis = last_state_dis

    #########
    # Train
    #########
    with tf.name_scope('train'):
        # The loss is computed on the discriminator's per-step logits,
        # with a weight of one for every target position.
        gen_loss = seq2seq.sequence_loss_by_example(
            logits,
            tf.unpack(tf.transpose(self.targets)),
            tf.unpack(tf.transpose(
                tf.ones_like(self.targets, dtype=tf.float32))))
        self.gen_cost = tf.reduce_sum(gen_loss) / args.batch_size
        tf.scalar_summary('training loss', self.gen_cost)

        self.lr_gen = tf.Variable(0.0, trainable=False)
        self.tvars = tf.trainable_variables()
        # Only variables outside the discriminator scope are updated.
        gen_vars = [v for v in self.tvars
                    if not v.name.startswith("discriminator/")]

        if is_training:
            gen_grads = tf.gradients(self.gen_cost, gen_vars)
            # Gradients for every trainable variable; these are only
            # used for the summaries below.
            self.all_grads = tf.gradients(self.gen_cost, self.tvars)
            gen_grads_clipped, _ = tf.clip_by_global_norm(
                gen_grads, args.grad_clip)
            gen_optimizer = tf.train.AdamOptimizer(self.lr_gen)
            self.gen_train_op = gen_optimizer.apply_gradients(
                zip(gen_grads_clipped, gen_vars))

    with tf.name_scope('summary'):
        with tf.name_scope('weight_summary'):
            for v in self.tvars:
                variable_summaries(v, v.op.name)
        if is_training:
            with tf.name_scope('grad_summary'):
                for var, grad in zip(self.tvars, self.all_grads):
                    variable_summaries(grad, 'grad/' + var.op.name)

    self.merged = tf.merge_all_summaries()
def __init__(self, sess, config, data_feed, log_dir):
    """Build an RNN utterance encoder with dialog-act and sentiment heads.

    Args:
        sess: Session object; only ``sess.graph`` is read, and only when
            ``log_dir`` is not None.
        config: Hyper-parameter object. The attributes read here are
            ``init_lr``, ``lr_decay``, ``embed_size``, ``cell_type``,
            ``cell_size``, ``keep_prob``, ``num_layer``, ``op`` and
            ``grad_clip``.
        data_feed: Data source; ``vocab``, ``feature_size``, ``DA_ID`` and
            ``SENTI_ID`` are read from it.
        log_dir: Directory for summaries, or None to skip creating the
            summary writer.

    Raises:
        ValueError: If ``config.cell_type`` is not 'gru', 'lstm' or 'rnn'.
    """
    # NOTE(review): this method reached review with its line breaks lost;
    # the nesting of the scope blocks below is reconstructed.
    # Confirm it against the variable names of existing checkpoints.
    vocab_size = len(data_feed.vocab)
    self.data_feed = data_feed
    senti_dim = data_feed.feature_size[data_feed.SENTI_ID]

    with tf.name_scope("io"):
        self.inputs = tf.placeholder(
            dtype=tf.int32, shape=(None, None), name="input_seq")
        self.input_lens = tf.placeholder(
            dtype=tf.int32, shape=(None, ), name="seq_len")
        self.da_labels = tf.placeholder(
            dtype=tf.int32, shape=(None, ), name="dialog_acts")
        self.senti_labels = tf.placeholder(
            dtype=tf.float32, shape=(None, senti_dim), name="sentiments")

        self.learning_rate = tf.Variable(
            float(config.init_lr), trainable=False)
        # Running this op multiplies the learning rate by lr_decay.
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * config.lr_decay)

    # Dynamic sizes, read from the fed input tensor at run time.
    input_shape_len = array_ops.shape(self.inputs)[1]
    input_shape_batch = array_ops.shape(self.inputs)[0]
    max_sent_len = input_shape_len
    batch_size = input_shape_batch

    with variable_scope.variable_scope("word-embedding"):
        embedding = tf.get_variable(
            "embedding", [vocab_size, config.embed_size], dtype=tf.float32)
        # Flatten the ids, look them up, then restore the batch layout.
        flat_ids = tf.squeeze(
            tf.reshape(self.inputs, [-1, 1]), squeeze_dims=[1])
        flat_embedded = embedding_ops.embedding_lookup(embedding, flat_ids)
        input_embedding = tf.reshape(
            flat_embedded, [-1, max_sent_len, config.embed_size])

    with variable_scope.variable_scope("rnn"):
        kind = config.cell_type
        if kind == "gru":
            cell = rnn_cell.GRUCell(config.cell_size)
        elif kind == "lstm":
            cell = rnn_cell.LSTMCell(
                config.cell_size, use_peepholes=False, forget_bias=1.0)
        elif kind == "rnn":
            cell = rnn_cell.BasicRNNCell(config.cell_size)
        else:
            raise ValueError("unknown RNN type")

        if config.keep_prob < 1.0:
            cell = rnn_cell.DropoutWrapper(
                cell,
                output_keep_prob=config.keep_prob,
                input_keep_prob=config.keep_prob)

        if config.num_layer > 1:
            # One (possibly dropout-wrapped) cell object is repeated for
            # every layer.
            cell = rnn_cell.MultiRNNCell(
                [cell] * config.num_layer, state_is_tuple=True)

        # The state returned by dynamic_rnn is discarded.
        outputs, _ = tf.nn.dynamic_rnn(
            cell,
            input_embedding,
            dtype=tf.float32,
            sequence_length=self.input_lens,
        )

        # Keep only the output at step (length - 1) of each sequence:
        # mask the outputs with a one-hot over time, then sum over time.
        last_step_mask = tf.expand_dims(
            tf.one_hot(self.input_lens - 1, max_sent_len), -1)
        last_outputs = tf.reduce_sum(tf.mul(outputs, last_step_mask), 1)

    self.dialog_acts = self.fnn(
        last_outputs, data_feed.feature_size[data_feed.DA_ID], [100],
        "dialog_act_fnn")
    self.sentiments = self.fnn(
        last_outputs, data_feed.feature_size[data_feed.SENTI_ID], [100],
        "setiment_fnn")

    # Sum of both cross-entropy terms, divided by the dynamic batch size.
    da_loss = tf.reduce_sum(
        nn_ops.sparse_softmax_cross_entropy_with_logits(
            self.dialog_acts, self.da_labels))
    senti_loss = tf.reduce_sum(
        nn_ops.softmax_cross_entropy_with_logits(
            self.sentiments, self.senti_labels))
    self.loss = da_loss + senti_loss
    self.loss = self.loss / tf.to_float(batch_size)

    tf.scalar_summary("entropy_loss", self.loss)
    self.summary_op = tf.merge_all_summaries()

    # List the trainable variables (no weight-decay term is added here).
    tvars = tf.trainable_variables()
    for v in tvars:
        print("Trainable %s" % v.name)

    # optimization: anything other than "adam" / "rmsprop" means SGD
    if config.op == "adam":
        banner = "Use Adam"
        optimizer_type = tf.train.AdamOptimizer
    elif config.op == "rmsprop":
        banner = "Use RMSProp"
        optimizer_type = tf.train.RMSPropOptimizer
    else:
        banner = "Use SGD"
        optimizer_type = tf.train.GradientDescentOptimizer
    print(banner)
    optimizer = optimizer_type(self.learning_rate)

    raw_grads = tf.gradients(self.loss, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, config.grad_clip)
    self.train_ops = optimizer.apply_gradients(zip(grads, tvars))
    self.saver = tf.train.Saver(
        tf.all_variables(), write_version=tf.train.SaverDef.V2)

    # NOTE(review): train_summary_writer is only set when log_dir is
    # given; callers must not rely on the attribute otherwise.
    if log_dir is not None:
        train_log_dir = os.path.join(log_dir, "train")
        print("Save summary to %s" % log_dir)
        self.train_summary_writer = tf.train.SummaryWriter(
            train_log_dir, sess.graph)