Example #1
    def _build_model(self):
        """Our model takes in a vector of q_states from a segment and returns a reward for each one"""
        self.segment_placeholder = tf.placeholder(
            dtype=tf.float32, shape=(None, None, self.q_state_size), name="obs_placeholder")
        self.segment_alt_placeholder = tf.placeholder(
            dtype=tf.float32, shape=(None, None, self.q_state_size), name="alt_obs_placeholder")

        # A vanilla MLP maps a q_state to a reward
        self.mlp = FullyConnectedMLP(self.q_state_size)
        self.q_state_reward_pred = self._predict_rewards(self.segment_placeholder)
        q_state_alt_reward_pred = self._predict_rewards(self.segment_alt_placeholder)

        # We use trajectory segments rather than individual q_states because video clips of segments are easier for
        # humans to evaluate
        segment_reward_pred_left = tf.reduce_sum(self.q_state_reward_pred, axis=1)
        segment_reward_pred_right = tf.reduce_sum(q_state_alt_reward_pred, axis=1)
        reward_logits = tf.stack([segment_reward_pred_left, segment_reward_pred_right], axis=1)  # (batch_size, 2)

        self.labels = tf.placeholder(dtype=tf.float32, shape=(None, 2), name="comparison_labels")

        # delta = 1e-5
        # clipped_comparison_labels = tf.clip_by_value(self.comparison_labels, delta, 1.0-delta)

        data_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=reward_logits)

        self.loss_op = tf.reduce_mean(data_loss)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.train_op = tf.train.AdamOptimizer().minimize(self.loss_op, global_step=self.global_step)
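Stacking the two summed segment rewards into a (batch_size, 2) logit tensor and training against comparison labels amounts to a Bradley-Terry style model: a softmax over the two sums gives the predicted probability that the left segment is preferred (the sparse_softmax_cross_entropy_with_logits loss used in the later examples matches this directly). Below is a minimal NumPy sketch of that probability, with made-up reward values and illustrative variable names.

import numpy as np

# Illustrative per-step reward predictions for one pair of segments
left_rewards = np.array([0.2, 0.5, -0.1])   # shape: (segment_length,)
right_rewards = np.array([0.1, 0.0, 0.3])

# Sum rewards over each segment, as the tf.reduce_sum calls above do
logits = np.array([left_rewards.sum(), right_rewards.sum()])

# Softmax over the two sums: predicted probability that each segment is preferred
probs = np.exp(logits - logits.max())
probs /= probs.sum()
print(probs)  # [P(left preferred), P(right preferred)]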
Example #2
    def _build_model(self):
        """
        Our model takes in path segments with states and actions, and generates Q values.
        These Q values serve as predictions of the true reward.
        We can compare two segments and sum the Q values to get a prediction of a label
        of which segment is better. We then learn the weights for our model by comparing
        these labels with an authority (either a human or synthetic labeler).
        """
        # Set up observation placeholders
        self.segment_obs_placeholder = tf.placeholder(dtype=tf.float32,
                                                      shape=self.obs_shape,
                                                      name="obs_placeholder")
        self.segment_alt_obs_placeholder = tf.placeholder(
            dtype=tf.float32, shape=self.obs_shape, name="alt_obs_placeholder")

        self.segment_act_placeholder = tf.placeholder(dtype=tf.float32,
                                                      shape=self.act_shape,
                                                      name="act_placeholder")
        self.segment_alt_act_placeholder = tf.placeholder(
            dtype=tf.float32, shape=self.act_shape, name="alt_act_placeholder")

        # A vanilla multi-layer perceptron maps a (state, action) pair to a reward (Q-value)
        mlp = FullyConnectedMLP(self.obs_shape, self.act_shape)

        self.q_value = self._predict_rewards(self.segment_obs_placeholder,
                                             self.segment_act_placeholder, mlp)
        alt_q_value = self._predict_rewards(self.segment_alt_obs_placeholder,
                                            self.segment_alt_act_placeholder,
                                            mlp)

        # We use trajectory segments rather than individual (state, action) pairs because
        # video clips of segments are easier for humans to evaluate
        segment_reward_pred_left = tf.reduce_sum(self.q_value, axis=1)
        segment_reward_pred_right = tf.reduce_sum(alt_q_value, axis=1)
        reward_logits = tf.stack(
            [segment_reward_pred_left, segment_reward_pred_right],
            axis=1)  # (batch_size, 2)

        self.labels = tf.placeholder(dtype=tf.int32,
                                     shape=(None, ),
                                     name="comparison_labels")

        # delta = 1e-5
        # clipped_comparison_labels = tf.clip_by_value(self.comparison_labels, delta, 1.0-delta)

        data_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=reward_logits, labels=self.labels)

        self.loss_op = tf.reduce_mean(data_loss)

        global_step = tf.Variable(0, name='global_step', trainable=False)
        self.train_op = tf.train.AdamOptimizer().minimize(
            self.loss_op, global_step=global_step)

        return tf.get_default_graph()
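None of these snippets show how the graph is actually driven. The sketch below is a condensed, self-contained guess at one training step in the same spirit as Example #2, assuming the TensorFlow 1.x API (tf.compat.v1 under TF2), a trivial linear stand-in for FullyConnectedMLP, and random arrays in place of real segment data; every name in it is illustrative rather than taken from the original code.

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API assumed (tf.compat.v1 under TF2)

batch, seg_len, obs_dim = 4, 10, 8

# Placeholders for a batch of left/right segment observations (observations only, for brevity)
left_obs = tf.placeholder(tf.float32, (None, None, obs_dim))
right_obs = tf.placeholder(tf.float32, (None, None, obs_dim))
labels = tf.placeholder(tf.int32, (None,))  # 0 = left segment preferred, 1 = right

# Tiny stand-in reward net: a single linear layer applied per timestep
w = tf.Variable(tf.random_normal((obs_dim,), stddev=0.1))
per_step_reward = lambda obs: tf.reduce_sum(obs * w, axis=2)  # (batch, seg_len)

# Sum per-step rewards over each segment and stack into comparison logits
reward_logits = tf.stack([tf.reduce_sum(per_step_reward(left_obs), axis=1),
                          tf.reduce_sum(per_step_reward(right_obs), axis=1)], axis=1)
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=reward_logits, labels=labels))
train_op = tf.train.AdamOptimizer().minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {left_obs: np.random.randn(batch, seg_len, obs_dim),
            right_obs: np.random.randn(batch, seg_len, obs_dim),
            labels: np.random.randint(0, 2, size=batch)}
    _, step_loss = sess.run([train_op, loss], feed_dict=feed)
    print("one training step, loss =", step_loss)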
Example #3
    def _build_model(self):
        """Our model takes in path segments with observations and actions, and generates rewards (Q-values)."""
        # Set up observation placeholder
        self.obs_placeholder = tf.placeholder(dtype=tf.float32,
                                              shape=(None, None) +
                                              self.obs_shape,
                                              name="obs_placeholder")

        # Set up action placeholder
        if self.discrete_action_space:
            self.act_placeholder = tf.placeholder(dtype=tf.float32,
                                                  shape=(None, None),
                                                  name="act_placeholder")
            # Discrete actions need to become one-hot vectors for the model
            segment_act = tf.one_hot(tf.cast(self.act_placeholder, tf.int32),
                                     self.act_shape[0])
            # HACK Use a convolutional network for Atari
            # TODO Should check the input space dimensions, not the output space!
            net = SimpleConvolveObservationQNet(self.obs_shape, self.act_shape)
        else:
            self.act_placeholder = tf.placeholder(dtype=tf.float32,
                                                  shape=(None, None) +
                                                  self.act_shape,
                                                  name="act_placeholder")
            # Assume the actions are how we want them
            segment_act = self.act_placeholder
            # In simple environments, default to a basic Multi-layer Perceptron (see TODO above)
            print('obs shape', self.obs_shape)
            print('act shape', self.act_shape)
            net = FullyConnectedMLP(self.obs_shape, self.act_shape)

        # Our neural network maps a (state, action) pair to a reward
        self.rewards = nn_predict_rewards(self.obs_placeholder, segment_act,
                                          net, self.obs_shape, self.act_shape)

        # We use trajectory segments rather than individual (state, action) pairs because
        # video clips of segments are easier for humans to evaluate
        self.segment_rewards = tf.reduce_sum(self.rewards, axis=1)

        self.targets = tf.placeholder(dtype=tf.float32,
                                      shape=(None, ),
                                      name="reward_targets")

        self.loss = tf.reduce_mean(
            tf.square(self.targets - self.segment_rewards))

        self.train_op = tf.train.AdamOptimizer().minimize(self.loss)

        return tf.get_default_graph()
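Example #3 one-hot encodes discrete actions with tf.one_hot before passing them to the network. As a standalone reference, the NumPy sketch below reproduces that encoding for a hypothetical (batch, segment_length) array of integer action indices; the values and names are illustrative.

import numpy as np

n_actions = 4
# A batch of 2 segments, each 3 steps long, holding integer action indices
actions = np.array([[0, 2, 1],
                    [3, 3, 0]])

# Equivalent of tf.one_hot(actions, n_actions): result shape (2, 3, 4)
one_hot = np.eye(n_actions)[actions]
print(one_hot.shape)
print(one_hot[0, 1])  # step 1 of segment 0 took action 2 -> [0., 0., 1., 0.]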
Example #4
    def _build_model(self):
        """
        Our model takes in path segments with states and actions, and generates Q values.
        These Q values serve as predictions of the true reward.
        We can compare two segments and sum the Q values to get a prediction of a label
        of which segment is better. We then learn the weights for our model by comparing
        these labels with an authority (either a human or synthetic labeler).
        """
        # Set up observation placeholders
        self.segment_obs_placeholder = tf.placeholder(
            dtype=tf.float32, shape=(None, None) + self.obs_shape, name="obs_placeholder")
        self.segment_alt_obs_placeholder = tf.placeholder(
            dtype=tf.float32, shape=(None, None) + self.obs_shape, name="alt_obs_placeholder")

        self.segment_act_placeholder = tf.placeholder(
            dtype=tf.float32, shape=(None, None) + self.act_shape, name="act_placeholder")
        self.segment_alt_act_placeholder = tf.placeholder(
            dtype=tf.float32, shape=(None, None) + self.act_shape, name="alt_act_placeholder")


        # A vanilla multi-layer perceptron maps a (state, action) pair to a reward (Q-value)
        # Keep lists of the reward networks and their associated ops
        mlps = []
        self.q_values = []
        self.loss_ops = []
        self.train_ops = []
        self.labels = tf.placeholder(dtype=tf.int32, shape=(None,), name="comparison_labels")
        # Loop to build num_r reward networks
        for i in range(self.num_r):
            # NN for each reward
            mlp = FullyConnectedMLP(self.obs_shape, self.act_shape)
            mlps.append(mlp)
            # q_value and alt_q_value for each reward network
            q_value = self._predict_rewards(self.segment_obs_placeholder, self.segment_act_placeholder, mlp)
            self.q_values.append(q_value)

            alt_q_value = self._predict_rewards(self.segment_alt_obs_placeholder, self.segment_alt_act_placeholder, mlp)

            # We use trajectory segments rather than individual (state, action) pairs because
            # video clips of segments are easier for humans to evaluate
            segment_reward_pred_left = tf.reduce_sum(q_value, axis=1)
            segment_reward_pred_right = tf.reduce_sum(alt_q_value, axis=1)
            reward_logits = tf.stack([segment_reward_pred_left, segment_reward_pred_right], axis=1)  # (batch_size, 2)



            # delta = 1e-5
            # clipped_comparison_labels = tf.clip_by_value(self.comparison_labels, delta, 1.0-delta)

            data_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=reward_logits, labels=self.labels)

            loss_op = tf.reduce_mean(data_loss)
            self.loss_ops.append(loss_op)

            global_step = tf.Variable(0, name='global_step', trainable=False)
            train_op = tf.train.AdamOptimizer().minimize(loss_op, global_step=global_step)
            self.train_ops.append(train_op)


        # segment quality classifier
        # placeholder for the concatenated obs and act
        self.segment_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=(None, (np.prod(self.obs_shape) + np.prod(self.act_shape)) * self._frames_per_segment),
            name="input_placeholder_classifier")

        # model
        mlp_classifier = FullyConnected_classifier(self.obs_shape, self.act_shape, self._frames_per_segment)

        # labels from human
        self.labels_from_human = tf.placeholder(dtype=tf.int32, shape=(None,), name="softmax_labels")

        # raw output from the classifier
        self.softmax_predicted_labels = mlp_classifier.run(self.segment_placeholder)
        # loss for classifier
        self.loss_softmax_classifier = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.softmax_predicted_labels, labels=self.labels_from_human)


        self.train_softmax_classifier = tf.train.AdamOptimizer().minimize(self.loss_softmax_classifier)


        return tf.get_default_graph()
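The classifier placeholder in Example #4 expects each segment flattened into a single vector of length (prod(obs_shape) + prod(act_shape)) * frames_per_segment. The snippet does not show how that vector is assembled, so the NumPy sketch below shows one plausible layout (flattened observation and action concatenated per step, then the whole segment flattened); the shapes and names are made up for illustration.

import numpy as np

obs_shape, act_shape, frames_per_segment = (4,), (2,), 3

# Illustrative per-step data for one segment
obs = np.random.randn(frames_per_segment, *obs_shape)
act = np.random.randn(frames_per_segment, *act_shape)

# Concatenate flattened obs and act per step, then flatten the whole segment
row = np.concatenate([obs.reshape(frames_per_segment, -1),
                      act.reshape(frames_per_segment, -1)], axis=1).ravel()

expected = (np.prod(obs_shape) + np.prod(act_shape)) * frames_per_segment
print(row.shape, expected)  # (18,) 18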
Example #5
    def _build_model(self):
        """
        Our model takes in path segments with states and actions, and generates Q values.
        These Q values serve as predictions of the true reward.
        We can compare two segments and sum the Q values to get a prediction of a label
        of which segment is better. We then learn the weights for our model by comparing
        these labels with an authority (either a human or synthetic labeler).
        """
        # Set up observation placeholders
        self.segment_obs_placeholder = tf.placeholder(dtype=tf.float32,
                                                      shape=(None, None) +
                                                      self.obs_shape,
                                                      name="obs_placeholder")
        self.segment_alt_obs_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=(None, None) + self.obs_shape,
            name="alt_obs_placeholder")

        self.segment_act_placeholder = tf.placeholder(dtype=tf.float32,
                                                      shape=(None, None) +
                                                      self.act_shape,
                                                      name="act_placeholder")
        self.segment_alt_act_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=(None, None) + self.act_shape,
            name="alt_act_placeholder")

        if self.use_bnn:
            print("Using BNN to generate more efficient queries")
            input_dim = np.prod(self.obs_shape) + np.prod(self.act_shape)
            self.rew_bnn = BNN(input_dim, [64, 64],
                               1,
                               10,
                               self.sess,
                               batch_size=1,
                               trans_func=tf.nn.relu,
                               n_samples=self.bnn_samples,
                               out_func=None)
            self.bnn_q_value = self._predict_bnn_rewards(
                self.segment_obs_placeholder, self.segment_act_placeholder,
                self.rew_bnn)
            bnn_alt_q_value = self._predict_bnn_rewards(
                self.segment_alt_obs_placeholder,
                self.segment_alt_act_placeholder, self.rew_bnn)
        # A vanilla multi-layer perceptron maps a (state, action) pair to a reward (Q-value)
        mlp = FullyConnectedMLP(self.obs_shape, self.act_shape)
        self.q_value = self._predict_rewards(self.segment_obs_placeholder,
                                             self.segment_act_placeholder, mlp)
        alt_q_value = self._predict_rewards(self.segment_alt_obs_placeholder,
                                            self.segment_alt_act_placeholder,
                                            mlp)

        print("Constructed Reward Model")

        # We use trajectory segments rather than individual (state, action) pairs because
        # video clips of segments are easier for humans to evaluate
        segment_reward_pred_left = tf.reduce_sum(self.q_value, axis=1)
        segment_reward_pred_right = tf.reduce_sum(alt_q_value, axis=1)
        reward_logits = tf.stack(
            [segment_reward_pred_left, segment_reward_pred_right],
            axis=1)  # (batch_size, 2)
        self.labels = tf.placeholder(dtype=tf.int32,
                                     shape=(None, ),
                                     name="comparison_labels")

        # delta = 1e-5
        # clipped_comparison_labels = tf.clip_by_value(self.comparison_labels, delta, 1.0-delta)

        self.data_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=reward_logits, labels=self.labels)

        self.loss_op = tf.reduce_mean(self.data_loss)

        global_step = tf.Variable(0, name='global_step', trainable=False)
        self.train_op = tf.train.AdamOptimizer().minimize(
            self.loss_op, global_step=global_step)

        if self.use_bnn:
            segment_reward_bnn_left = tf.reduce_sum(self.bnn_q_value, axis=1)
            segment_reward_bnn_right = tf.reduce_sum(bnn_alt_q_value, axis=1)
            segment_reward_mean_left = tf.reduce_mean(self.bnn_q_value, axis=1)
            segment_reward_mean_right = tf.reduce_mean(bnn_alt_q_value, axis=1)
            # self.mean_rew_logits = tf.stack([segment_reward_mean_left, segment_reward_mean_right], axis=1)
            self.softmax_rew = tf.nn.softmax(reward_logits / self.softmax_beta)
            self.bnn_data_loss = self.rew_bnn.loss(segment_reward_bnn_left,
                                                   segment_reward_bnn_right,
                                                   self.labels)
            self.bnn_loss_op = tf.reduce_mean(self.bnn_data_loss)
            self.train_bnn_op = tf.train.AdamOptimizer().minimize(
                self.bnn_loss_op)
            self.plan_labels = tf.placeholder(dtype=tf.int32,
                                              shape=(None, ),
                                              name="plan_labels")
            self.planning_loss = self.rew_bnn.loss_last_sample(
                segment_reward_mean_left, segment_reward_mean_right,
                self.plan_labels)
            self.planning_kl = self.rew_bnn.fast_kl_div(
                self.planning_loss, self.rew_bnn.get_mus(),
                self.rew_bnn.get_rhos(), 0.01)

        print("Constructed Training Ops")

        return tf.get_default_graph()