def _build_model(self): """Our model takes in a vector of q_states from a segment and returns a reward for each one""" self.segment_placeholder = tf.placeholder( dtype=tf.float32, shape=(None, None, self.q_state_size), name="obs_placeholder") self.segment_alt_placeholder = tf.placeholder( dtype=tf.float32, shape=(None, None, self.q_state_size), name="obs_placeholder") # A vanilla MLP maps a q_state to a reward self.mlp = FullyConnectedMLP(self.q_state_size) self.q_state_reward_pred = self._predict_rewards(self.segment_placeholder) q_state_alt_reward_pred = self._predict_rewards(self.segment_alt_placeholder) # We use trajectory segments rather than individual q_states because video clips of segments are easier for # humans to evaluate segment_reward_pred_left = tf.reduce_sum(self.q_state_reward_pred, axis=1) segment_reward_pred_right = tf.reduce_sum(q_state_alt_reward_pred, axis=1) reward_logits = tf.stack([segment_reward_pred_left, segment_reward_pred_right], axis=1) # (batch_size, 2) self.labels = tf.placeholder(dtype=tf.float32, shape=(None,2), name="comparison_labels") # delta = 1e-5 # clipped_comparison_labels = tf.clip_by_value(self.comparison_labels, delta, 1.0-delta) data_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels,logits=reward_logits) self.loss_op = tf.reduce_mean(data_loss) self.global_step = tf.Variable(0, name='global_step', trainable=False) self.train_op = tf.train.AdamOptimizer().minimize(self.loss_op, global_step=self.global_step)
def _build_model(self): """ Our model takes in path segments with states and actions, and generates Q values. These Q values serve as predictions of the true reward. We can compare two segments and sum the Q values to get a prediction of a label of which segment is better. We then learn the weights for our model by comparing these labels with an authority (either a human or synthetic labeler). """ # Set up observation placeholders self.segment_obs_placeholder = tf.placeholder(dtype=tf.float32, shape=self.obs_shape, name="obs_placeholder") self.segment_alt_obs_placeholder = tf.placeholder( dtype=tf.float32, shape=self.obs_shape, name="alt_obs_placeholder") self.segment_act_placeholder = tf.placeholder(dtype=tf.float32, shape=self.act_shape, name="act_placeholder") self.segment_alt_act_placeholder = tf.placeholder( dtype=tf.float32, shape=self.act_shape, name="alt_act_placeholder") # A vanilla multi-layer perceptron maps a (state, action) pair to a reward (Q-value) mlp = FullyConnectedMLP(self.obs_shape, self.act_shape) self.q_value = self._predict_rewards(self.segment_obs_placeholder, self.segment_act_placeholder, mlp) alt_q_value = self._predict_rewards(self.segment_alt_obs_placeholder, self.segment_alt_act_placeholder, mlp) # We use trajectory segments rather than individual (state, action) pairs because # video clips of segments are easier for humans to evaluate segment_reward_pred_left = tf.reduce_sum(self.q_value, axis=1) segment_reward_pred_right = tf.reduce_sum(alt_q_value, axis=1) reward_logits = tf.stack( [segment_reward_pred_left, segment_reward_pred_right], axis=1) # (batch_size, 2) self.labels = tf.placeholder(dtype=tf.int32, shape=(None, ), name="comparison_labels") # delta = 1e-5 # clipped_comparison_labels = tf.clip_by_value(self.comparison_labels, delta, 1.0-delta) data_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=reward_logits, labels=self.labels) self.loss_op = tf.reduce_mean(data_loss) global_step = tf.Variable(0, name='global_step', trainable=False) self.train_op = tf.train.AdamOptimizer().minimize( self.loss_op, global_step=global_step) return tf.get_default_graph()
def _build_model(self): """Our model takes in path segments with observations and actions, and generates rewards (Q-values).""" # Set up observation placeholder self.obs_placeholder = tf.placeholder(dtype=tf.float32, shape=(None, None) + self.obs_shape, name="obs_placeholder") # Set up action placeholder if self.discrete_action_space: self.act_placeholder = tf.placeholder(dtype=tf.float32, shape=(None, None), name="act_placeholder") # Discrete actions need to become one-hot vectors for the model segment_act = tf.one_hot(tf.cast(self.act_placeholder, tf.int32), self.act_shape[0]) # HACK Use a convolutional network for Atari # TODO Should check the input space dimensions, not the output space! net = SimpleConvolveObservationQNet(self.obs_shape, self.act_shape) else: self.act_placeholder = tf.placeholder(dtype=tf.float32, shape=(None, None) + self.act_shape, name="act_placeholder") # Assume the actions are how we want them segment_act = self.act_placeholder # In simple environments, default to a basic Multi-layer Perceptron (see TODO above) print('obs shape', self.obs_shape) print('act shape', self.act_shape) net = FullyConnectedMLP(self.obs_shape, self.act_shape) # Our neural network maps a (state, action) pair to a reward self.rewards = nn_predict_rewards(self.obs_placeholder, segment_act, net, self.obs_shape, self.act_shape) # We use trajectory segments rather than individual (state, action) pairs because # video clips of segments are easier for humans to evaluate self.segment_rewards = tf.reduce_sum(self.rewards, axis=1) self.targets = tf.placeholder(dtype=tf.float32, shape=(None, ), name="reward_targets") self.loss = tf.reduce_mean( tf.square(self.targets - self.segment_rewards)) self.train_op = tf.train.AdamOptimizer().minimize(self.loss) return tf.get_default_graph()
def _build_model(self): """ Our model takes in path segments with states and actions, and generates Q values. These Q values serve as predictions of the true reward. We can compare two segments and sum the Q values to get a prediction of a label of which segment is better. We then learn the weights for our model by comparing these labels with an authority (either a human or synthetic labeler). """ # Set up observation placeholders self.segment_obs_placeholder = tf.placeholder( dtype=tf.float32, shape=(None, None) + self.obs_shape, name="obs_placeholder") self.segment_alt_obs_placeholder = tf.placeholder( dtype=tf.float32, shape=(None, None) + self.obs_shape, name="alt_obs_placeholder") self.segment_act_placeholder = tf.placeholder( dtype=tf.float32, shape=(None, None) + self.act_shape, name="act_placeholder") self.segment_alt_act_placeholder = tf.placeholder( dtype=tf.float32, shape=(None, None) + self.act_shape, name="alt_act_placeholder") # A vanilla multi-layer perceptron maps a (state, action) pair to a reward (Q-value) # make a list for reward networks mlps = [] self.q_values = [] self.loss_ops = [] self.train_ops = [] self.labels = tf.placeholder(dtype=tf.int32, shape=(None,), name="comparison_labels") # loop over the num_r to cluster of NNs for i in range(self.num_r): # NN for each reward mlp = FullyConnectedMLP(self.obs_shape, self.act_shape) mlps.append(mlp) # q_vlaue and alt_q_value for each reward network q_value = self._predict_rewards(self.segment_obs_placeholder, self.segment_act_placeholder, mlp) self.q_values.append(q_value) alt_q_value = self._predict_rewards(self.segment_alt_obs_placeholder, self.segment_alt_act_placeholder, mlp) # mlp = FullyConnectedMLP(self.obs_shape, self.act_shape) # self.q_value = self._predict_rewards(self.segment_obs_placeholder, self.segment_act_placeholder, mlp) # alt_q_value = self._predict_rewards(self.segment_alt_obs_placeholder, self.segment_alt_act_placeholder, mlp) # We use trajectory segments rather than individual (state, action) pairs because # video clips of segments are easier for humans to evaluate segment_reward_pred_left = tf.reduce_sum(q_value, axis=1) segment_reward_pred_right = tf.reduce_sum(alt_q_value, axis=1) reward_logits = tf.stack([segment_reward_pred_left, segment_reward_pred_right], axis=1) # (batch_size, 2) # delta = 1e-5 # clipped_comparison_labels = tf.clip_by_value(self.comparison_labels, delta, 1.0-delta) data_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=reward_logits, labels=self.labels) loss_op = tf.reduce_mean(data_loss) self.loss_ops.append(loss_op) global_step = tf.Variable(0, name='global_step', trainable=False) train_op = tf.train.AdamOptimizer().minimize(loss_op, global_step=global_step) self.train_ops.append(train_op) # segment quality classifier # placeholder for the concatenated obs and act self.segment_placeholder = tf.placeholder( dtype=tf.float32, shape=(None,(np.prod(self.obs_shape)+np.prod(self.act_shape))*self._frames_per_segment), name="input_placeholder_classifier") # model mlp_classifier = FullyConnected_classifier(self.obs_shape, self.act_shape, self._frames_per_segment) # labels from human self.labels_from_human = tf.placeholder(dtype=tf.int32, shape=(None,), name="softmax_labels") # raw output from the classifier self.softmax_predicted_labels = mlp_classifier.run(self.segment_placeholder) # loss for classifier self.loss_softmax_classifier = tf.nn.sparse_softmax_cross_entropy_with_logits(logits= self.softmax_predicted_labels, labels=self.labels_from_human) 
self.train_softmax_classifier = tf.train.AdamOptimizer().minimize(self.loss_softmax_classifier) return tf.get_default_graph()
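
# Hedged sketch: with one loss/train op per ensemble member, each reward network can be
# updated independently on the same batch of comparisons (assumed/hypothetical names:
# `model`, `sess`, `left_obs`, `left_acts`, `right_obs`, `right_acts`, `labels`).
comparison_feed = {
    model.segment_obs_placeholder: left_obs,
    model.segment_act_placeholder: left_acts,
    model.segment_alt_obs_placeholder: right_obs,
    model.segment_alt_act_placeholder: right_acts,
    model.labels: labels,
}
for i in range(model.num_r):
    _, member_loss = sess.run([model.train_ops[i], model.loss_ops[i]], feed_dict=comparison_feed)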
def _build_model(self): """ Our model takes in path segments with states and actions, and generates Q values. These Q values serve as predictions of the true reward. We can compare two segments and sum the Q values to get a prediction of a label of which segment is better. We then learn the weights for our model by comparing these labels with an authority (either a human or synthetic labeler). """ # Set up observation placeholders self.segment_obs_placeholder = tf.placeholder(dtype=tf.float32, shape=(None, None) + self.obs_shape, name="obs_placeholder") self.segment_alt_obs_placeholder = tf.placeholder( dtype=tf.float32, shape=(None, None) + self.obs_shape, name="alt_obs_placeholder") self.segment_act_placeholder = tf.placeholder(dtype=tf.float32, shape=(None, None) + self.act_shape, name="act_placeholder") self.segment_alt_act_placeholder = tf.placeholder( dtype=tf.float32, shape=(None, None) + self.act_shape, name="alt_act_placeholder") if self.use_bnn: print("Using BNN to generate more efficient queries") input_dim = np.prod(self.obs_shape) + np.prod(self.act_shape) self.rew_bnn = BNN(input_dim, [64, 64], 1, 10, self.sess, batch_size=1, trans_func=tf.nn.relu, n_samples=self.bnn_samples, out_func=None) self.bnn_q_value = self._predict_bnn_rewards( self.segment_obs_placeholder, self.segment_act_placeholder, self.rew_bnn) bnn_alt_q_value = self._predict_bnn_rewards( self.segment_alt_obs_placeholder, self.segment_alt_act_placeholder, self.rew_bnn) # A vanilla multi-layer perceptron maps a (state, action) pair to a reward (Q-value) mlp = FullyConnectedMLP(self.obs_shape, self.act_shape) self.q_value = self._predict_rewards(self.segment_obs_placeholder, self.segment_act_placeholder, mlp) alt_q_value = self._predict_rewards(self.segment_alt_obs_placeholder, self.segment_alt_act_placeholder, mlp) print("Constructed Reward Model") # We use trajectory segments rather than individual (state, action) pairs because # video clips of segments are easier for humans to evaluate segment_reward_pred_left = tf.reduce_sum(self.q_value, axis=1) segment_reward_pred_right = tf.reduce_sum(alt_q_value, axis=1) reward_logits = tf.stack( [segment_reward_pred_left, segment_reward_pred_right], axis=1) # (batch_size, 2) self.labels = tf.placeholder(dtype=tf.int32, shape=(None, ), name="comparison_labels") # delta = 1e-5f # clipped_comparison_labels = tf.clip_by_value(self.comparison_labels, delta, 1.0-delta) self.data_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=reward_logits, labels=self.labels) self.loss_op = tf.reduce_mean(self.data_loss) global_step = tf.Variable(0, name='global_step', trainable=False) self.train_op = tf.train.AdamOptimizer().minimize( self.loss_op, global_step=global_step) if self.use_bnn: segment_reward_bnn_left = tf.reduce_sum(self.bnn_q_value, axis=1) segment_reward_bnn_right = tf.reduce_sum(bnn_alt_q_value, axis=1) segment_reward_mean_left = tf.reduce_mean(self.bnn_q_value, axis=1) segment_reward_mean_right = tf.reduce_mean(bnn_alt_q_value, axis=1) # self.mean_rew_logits = tf.stack([segment_reward_mean_left, segment_reward_mean_right], axis=1) self.softmax_rew = tf.nn.softmax(reward_logits / self.softmax_beta) self.bnn_data_loss = self.rew_bnn.loss(segment_reward_bnn_left, segment_reward_bnn_right, self.labels) self.bnn_loss_op = tf.reduce_mean(self.bnn_data_loss) self.train_bnn_op = tf.train.AdamOptimizer().minimize( self.bnn_loss_op) self.plan_labels = tf.placeholder(dtype=tf.int32, shape=(None, ), name="plan_labels") self.planning_loss = self.rew_bnn.loss_last_sample( 
segment_reward_mean_left, segment_reward_mean_right, self.plan_labels) self.planning_kl = self.rew_bnn.fast_kl_div( self.planning_loss, self.rew_bnn.get_mus(), self.rew_bnn.get_rhos(), 0.01) print("Constructed Training Ops") return tf.get_default_graph()
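
# Hedged sketch: the temperature-scaled softmax defined above (`self.softmax_rew`, built
# only when use_bnn is True) can be read out as the model's predicted preference
# probability for each pair of segments (assumed/hypothetical names: `model`, `sess`,
# `left_obs`, `left_acts`, `right_obs`, `right_acts`).
pref_probs = sess.run(model.softmax_rew, feed_dict={
    model.segment_obs_placeholder: left_obs,
    model.segment_act_placeholder: left_acts,
    model.segment_alt_obs_placeholder: right_obs,
    model.segment_alt_act_placeholder: right_acts,
})
# pref_probs[:, 0] is the predicted probability that the left segment is preferred.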