def __init__(self, is_training, length):
    """Build the graph for a recurrent sequence-regression model.

    Creates the input/target placeholders, a recurrent cell selected by
    ``FLAGS.model`` ("rnn", "lstm" or "gru"), unrolls it for ``length``
    steps, applies a linear output layer and exposes the mean squared
    error as the cost. When ``is_training`` is true, a non-trainable
    learning-rate variable and a gradient-descent train op with global
    norm clipping are created as well.

    Args:
        is_training: When true, dropout is applied to the cell output
            (only if ``FLAGS.keep_prob < 1``) and the training ops are
            built; otherwise construction stops after the cost.
        length: Number of time steps the cell is unrolled for.

    Raises:
        ValueError: If ``FLAGS.model`` is not one of the supported models,
            or if ``FLAGS.layer`` is neither 1 nor 2 for an "rnn"/"lstm"
            model.
    """
    self.batch_size = batch_size = FLAGS.batch_size
    self.num_steps = num_steps = length
    hidden_size = FLAGS.hidden_dim

    # The time dimension is left unspecified in both placeholders.
    self._input_data = tf.placeholder(
        tf.float32, [batch_size, None, FLAGS.input_dim])
    self._targets = tf.placeholder(
        tf.float32, [batch_size, None, FLAGS.output_dim])

    # Pick the base cell for the requested model type.
    if FLAGS.model == "rnn":
        base_cell = rnn_cell.BasicRNNCell(num_units=hidden_size)
    elif FLAGS.model == "lstm":
        base_cell = rnn_cell.BasicLSTMCell(
            num_units=hidden_size, forget_bias=1.0)
    elif FLAGS.model == "gru":
        base_cell = rnn_cell.GRUCell(num_units=hidden_size)
    else:
        # Interpolate the offending value so the message is readable.
        raise ValueError("Invalid model: %s" % FLAGS.model)

    # Dropout is only applied on the cell output, and only while training.
    if is_training and FLAGS.keep_prob < 1:
        base_cell = rnn_cell.DropoutWrapper(
            base_cell, output_keep_prob=FLAGS.keep_prob)

    if FLAGS.model == "gru" or FLAGS.layer == 1:
        # GRU models are always single-layer; FLAGS.layer is not consulted.
        cell = base_cell
    elif FLAGS.layer == 2:
        # The same cell object is used for both stacked layers.
        cell = rnn_cell.MultiRNNCell([base_cell] * 2)
    else:
        # Fail here rather than with an unbound `cell` further down.
        raise ValueError("Invalid number of layers: %s" % FLAGS.layer)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    # Unroll the cell manually; variables are shared after the first step.
    outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(
                self._input_data[:, time_step, :], state)
            outputs.append(cell_output)
    self._final_state = state

    # Concatenate the per-step outputs along axis 1, then flatten to rows
    # of width hidden_size for the output projection.
    hidden_output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])

    # Linear output layer.
    init_bound = tf.sqrt(1. / hidden_size)
    V_1 = tf.get_variable(
        "v_1",
        shape=[hidden_size, FLAGS.output_dim],
        initializer=tf.random_uniform_initializer(-init_bound, init_bound))
    b_1 = tf.get_variable(
        "b_1",
        shape=[FLAGS.output_dim],
        initializer=tf.constant_initializer(0.1))
    logits = tf.add(tf.matmul(hidden_output, V_1), b_1)

    target = tf.reshape(self._targets, [-1, FLAGS.output_dim])
    squared_error = tf.pow(logits - target, 2)
    # Gradients are taken on the halved sum of squares; the reported cost
    # is the mean squared error.
    training_loss = tf.reduce_sum(squared_error) / 2
    mse = tf.reduce_mean(squared_error)
    self._cost = mse

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(training_loss, tvars), FLAGS.max_grad_norm)
    # NOTE(review): `self.lr` is not defined in this method; presumably a
    # property over `self._lr` declared elsewhere on the class - confirm.
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self):
    """Build the graph for the learned RNN optimizer.

    The graph has three parts:

    * a training rollout: ``seq_length`` optimizer steps executed inside a
      ``tf.while_loop``, starting at ``self.point`` on the SNF loss defined
      by ``self.hyperplanes``, ``self.variances`` and ``self.weights``;
    * the training objective (asymmetric loss change combined with an
      oscillation cost) and an Adam update that is fed through gradient
      placeholders;
    * a single-step "comparison" path that maps externally supplied
      gradients (``self.input_grads``) and an RNN state
      (``self.initial_rnn_state``) to an update, reusing the variables
      created by the training rollout.

    Raises:
        ValueError: If the module-level ``rnn_type`` is not "rnn", "gru"
            or "lstm".
    """
    # Input
    self.point = tf.placeholder(tf.float32, [m, 1], 'points')

    # Used in training only
    self.variances = tf.placeholder(tf.float32, [k, 1], 'variances')
    self.weights = tf.placeholder(tf.float32, [k, 1], 'weights')
    self.hyperplanes = tf.placeholder(
        tf.float32, [m, m, k],
        'hyperplanes')  # Points which define the hyperplanes

    # Width of the flattened RNN state. The LSTM state is taken to be
    # twice as wide as the other cell types; the same width is used for
    # the rollout below so both paths agree.
    state_width = num_rnn_layers * rnn_size
    if rnn_type == 'lstm':
        state_width *= 2

    # initial_rnn_state is passed during evaluation but not during training.
    # Each dimension has an independent hidden state, required in order to
    # simulate Adam, RMSProp etc.
    self.initial_rnn_state = tf.placeholder_with_default(
        input=tf.zeros([m, state_width]), shape=[None, state_width])

    # The scope allows these variables to be excluded from being
    # reinitialized during the comparison phase.
    # NOTE(review): the rest of the constructor is assumed to sit inside
    # this scope so every optimizer variable carries the "optimizer"
    # prefix - confirm against the original layout.
    with tf.variable_scope("optimizer"):
        if rnn_type == 'rnn':
            cell = rnn_cell.BasicRNNCell(rnn_size)
        elif rnn_type == 'gru':
            cell = rnn_cell.GRUCell(rnn_size)
        elif rnn_type == 'lstm':
            cell = rnn_cell.LSTMCell(rnn_size)
        else:
            # Fail here rather than with an unbound `cell` on the next line.
            raise ValueError("Invalid rnn_type: %s" % rnn_type)

        self.cell = rnn_cell.MultiRNNCell([cell] * num_rnn_layers)

        # Arguments passed to the condition and body functions
        time = tf.constant(0)
        point = self.point

        snf_loss = snf.calc_snf_loss_tf(point, self.hyperplanes,
                                        self.variances, self.weights)
        snf_grads = snf.calc_grads_tf(snf_loss, point)
        snf_grads = tf.squeeze(snf_grads, [0])

        snf_loss_ta = tf.TensorArray(dtype=tf.float32, size=seq_length)
        update_ta = tf.TensorArray(dtype=tf.float32, size=seq_length)
        rnn_state = tf.zeros([m, state_width])

        loop_vars = [
            time, point, snf_grads, rnn_state, snf_loss_ta, update_ta,
            self.hyperplanes, self.variances, self.weights
        ]

        def condition(time, point, snf_grads, rnn_state, snf_loss_ta,
                      update_ta, hyperplanes, variances, weights):
            # Run exactly seq_length optimizer steps.
            return tf.less(time, seq_length)

        def body(time, point, snf_grads, rnn_state, snf_loss_ta, update_ta,
                 hyperplanes, variances, weights):
            h, rnn_state_out = self.cell(snf_grads, rnn_state)

            # Final layer of the optimizer
            # Cannot use fc_layer due to a 'must be from the same frame'
            # error
            d = np.sqrt(1.0) / np.sqrt(
                rnn_size + 1)  ### should be sqrt(2, 3 or 6?)
            initializer = tf.random_uniform_initializer(-d, d)
            W = tf.get_variable("W", [rnn_size, 1], initializer=initializer)

            # No bias, linear activation function
            update = tf.matmul(h, W)
            update = tf.reshape(update, [m, 1])
            update = inv_scale_grads(update)

            new_point = point + update

            # Loss at the updated point; this is what gets recorded for
            # the current step.
            snf_loss = snf.calc_snf_loss_tf(new_point, hyperplanes,
                                            variances, weights)

            snf_loss_ta = snf_loss_ta.write(time, snf_loss)
            update_ta = update_ta.write(time, update)

            snf_grads_out = snf.calc_grads_tf(snf_loss, point)
            snf_grads_out = tf.reshape(snf_grads_out, [m, 1])

            time += 1
            return [
                time, new_point, snf_grads_out, rnn_state_out, snf_loss_ta,
                update_ta, hyperplanes, variances, weights
            ]

        # Do the computation
        with tf.variable_scope("o1"):
            res = tf.while_loop(condition, body, loop_vars)

        self.new_point = res[1]
        self.rnn_state_out = res[3]
        losses = res[4].pack()
        updates = res[5].pack()

        # Total change in the SNF loss
        # Improvement: 2 - 3 = -1 (small loss)
        # losses[0] is the loss recorded after the first update, not the
        # loss at the starting point.
        snf_loss_change = losses[seq_length - 1] - losses[0]
        snf_loss_change = tf.maximum(
            snf_loss_change,
            loss_asymmetry * snf_loss_change)  # Asymmetric loss
        self.loss_change_sign = tf.sign(snf_loss_change)

        # Oscillation cost: sum of the per-step update norms divided by
        # the norm of the summed update.
        overall_update = tf.zeros([m, 1])
        norm_sum = 0.0
        for i in range(seq_length):
            overall_update += updates[i, :, :]
            norm_sum += tf_norm(updates[i, :, :])
        osc_cost = norm_sum / tf_norm(overall_update)  # > 1

        self.total_loss = snf_loss_change * tf.pow(
            osc_cost, tf.sign(snf_loss_change))

        #===# Model training #===#
        #opt = tf.train.RMSPropOptimizer(0.01,momentum=0.5)
        opt = tf.train.AdamOptimizer()
        train_vars = tf.trainable_variables()

        gvs = opt.compute_gradients(self.total_loss, train_vars)

        # Clipped gradients are exposed via self.gvs; the update itself is
        # applied from placeholders (self.grads_input), one per variable.
        self.gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var)
                    for (grad, var) in gvs]
        self.grads_input = [(tf.placeholder(tf.float32, shape=v.get_shape()),
                             v) for (g, v) in gvs]
        self.train_step = opt.apply_gradients(self.grads_input)

        #===# Comparison code #===#
        self.input_grads = tf.placeholder(
            tf.float32, [1, None, 1],
            'input_grads')  ### Remove first dimension?
        input_grads = tf.squeeze(self.input_grads, [0])

        # Reuse the cell weights and the output matrix W created by the
        # training rollout in scope "o1".
        with tf.variable_scope("o1", reuse=True) as scope:
            h, self.rnn_state_out_compare = self.cell(
                input_grads, self.initial_rnn_state)

            W = tf.get_variable("W")
            update = tf.matmul(h, W)
            update = tf.reshape(update, [-1, 1])
            self.update = inv_scale_grads(update)