Example 1
def create_model(input_shape, num_actions, model_name, create_network_fn, learning_rate):  # noqa: D103
    """Create the Q-network model."""
    with tf.name_scope(model_name):
        input_frames = tf.placeholder(tf.float32, [None, input_shape],
                                      name='input_frames')
        q_network, network_parameters = create_network_fn(
            input_frames, input_shape, num_actions)

        mean_max_Q = tf.reduce_mean(tf.reduce_max(q_network, axis=[1]),
                                    name='mean_max_Q')

        Q_vector_indexes = tf.placeholder(tf.int32, [None, 2],
                                          name='Q_vector_indexes')
        gathered_outputs = tf.gather_nd(q_network, Q_vector_indexes,
                                        name='gathered_outputs')

        y_ph = tf.placeholder(tf.float32, name='y_ph')
        loss = mean_huber_loss(y_ph, gathered_outputs)
        train_step = tf.train.RMSPropOptimizer(
            learning_rate, decay=RMSP_DECAY, momentum=RMSP_MOMENTUM,
            epsilon=RMSP_EPSILON).minimize(loss)

    model = {
        'q_network': q_network,
        'input_frames': input_frames,
        'Q_vector_indexes': Q_vector_indexes,
        'y_ph': y_ph,
        'train_step': train_step,
        'mean_max_Q': mean_max_Q,
    }
    return model, network_parameters
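
A minimal usage sketch for the dictionary returned above (TF1 style). The network builder my_network and the minibatch arrays states, actions, and targets are assumptions for illustration, not part of the original code:

import numpy as np
import tensorflow as tf

# Hypothetical training step with the returned model dict.
model, params = create_model(input_shape=4, num_actions=2, model_name='online',
                             create_network_fn=my_network, learning_rate=2.5e-4)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_size = len(states)
    # gather_nd expects [row, action] index pairs, one per sample in the batch.
    indexes = np.stack([np.arange(batch_size), actions], axis=1)
    _, max_q = sess.run(
        [model['train_step'], model['mean_max_Q']],
        feed_dict={model['input_frames']: states,
                   model['Q_vector_indexes']: indexes,
                   model['y_ph']: targets})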
Example 2
def testHuberLoss():
    with tf.Session() as sess:
        y_true = tf.constant([0.7, 0.3, 0.8, 0.1])
        y_pred = tf.constant([2.1, 0.4, 0.9, 3.2])

        loss = sess.run(huber_loss(y_true, y_pred))
        mean_loss = sess.run(mean_huber_loss(y_true, y_pred))

        assert np.isclose(loss, [0.9, 0.005, 0.005, 2.6]).all()
        assert np.isclose(mean_loss, 0.8775)
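
The huber_loss and mean_huber_loss helpers exercised by this test are not shown in these examples. A minimal sketch consistent with the values asserted above, assuming the standard Huber loss with delta = 1.0:

import tensorflow as tf

def huber_loss(y_true, y_pred, delta=1.0):
    # Quadratic for |error| <= delta, linear beyond it.
    error = tf.abs(y_true - y_pred)
    quadratic = 0.5 * tf.square(error)
    linear = delta * (error - 0.5 * delta)
    return tf.where(error <= delta, quadratic, linear)

def mean_huber_loss(y_true, y_pred, delta=1.0):
    # Huber loss averaged over all elements.
    return tf.reduce_mean(huber_loss(y_true, y_pred, delta))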
Example 3
    def create_dueling_dqn_model(self, window, input_shape, num_actions,
                                 model_name):  # noqa: D103

        with tf.name_scope(model_name) as scope:

            self.x = tf.placeholder(
                tf.float32,
                shape=[None, input_shape[0], input_shape[1], window],
                name='input_state')
            self.y_true = tf.placeholder(tf.float32,
                                         shape=[None, 1],
                                         name='target_q_val')

            # conv1 layer
            self.W_conv1 = tf.Variable(tf.truncated_normal([8, 8, window, 16],
                                                           stddev=0.1),
                                       name='W_conv1')
            self.b_conv1 = tf.Variable(tf.constant(0.1, shape=[16]),
                                       name='b_conv1')

            self.conv1 = tf.nn.conv2d(
                self.x, self.W_conv1, strides=[1, 4, 4, 1],
                padding='VALID') + self.b_conv1
            self.relu_conv1 = tf.nn.relu(self.conv1)

            # conv2 layer
            self.W_conv2 = tf.Variable(tf.truncated_normal([4, 4, 16, 32],
                                                           stddev=0.1),
                                       name='W_conv2')
            self.b_conv2 = tf.Variable(tf.constant(0.1, shape=[32]),
                                       name='b_conv2')

            self.conv2 = tf.nn.conv2d(self.relu_conv1,
                                      self.W_conv2,
                                      strides=[1, 2, 2, 1],
                                      padding='VALID') + self.b_conv2
            self.relu_conv2 = tf.nn.relu(self.conv2)
            self.relu_conv2_flat = tf.reshape(self.relu_conv2,
                                              [-1, 9 * 9 * 32])

            # NOTE: the convolutional layers are shared, but the fully
            # connected layers are not.

            # Split into advantage and value streams, each with 512 hidden
            # units (dueling architecture).

            # Advantage Stream:
            self.W_fc3_adv = tf.Variable(tf.truncated_normal([9 * 9 * 32, 512],
                                                             stddev=0.1),
                                         name='W_fc3_adv')
            self.b_fc3_adv = tf.Variable(tf.constant(0.1, shape=[512]),
                                         name='b_fc3_adv')

            self.fc3_adv = tf.matmul(self.relu_conv2_flat,
                                     self.W_fc3_adv) + self.b_fc3_adv
            self.relu_fc3_adv = tf.nn.relu(self.fc3_adv)

            self.W_fc4_adv = tf.Variable(
                tf.truncated_normal([512, num_actions], stddev=0.1),
                name='W_fc4_adv')
            self.b_fc4_adv = tf.Variable(tf.constant(0.1, shape=[num_actions]),
                                         name='b_fc4_adv')

            self.fc4_adv = tf.matmul(self.relu_fc3_adv,
                                     self.W_fc4_adv) + self.b_fc4_adv

            # Value Stream:
            self.W_fc3_val = tf.Variable(tf.truncated_normal([9 * 9 * 32, 512],
                                                             stddev=0.1),
                                         name='W_fc3_val')
            self.b_fc3_val = tf.Variable(tf.constant(0.1, shape=[512]),
                                         name='b_fc3_val')

            self.fc3_val = tf.matmul(self.relu_conv2_flat,
                                     self.W_fc3_val) + self.b_fc3_val
            self.relu_fc3_val = tf.nn.relu(self.fc3_val)

            self.W_fc4_val = tf.Variable(tf.truncated_normal([512, 1],
                                                             stddev=0.1),
                                         name='W_fc4_val')
            self.b_fc4_val = tf.Variable(tf.constant(0.1, shape=[1]),
                                         name='b_fc4_val')

            self.fc4_val = tf.matmul(self.relu_fc3_val,
                                     self.W_fc4_val) + self.b_fc4_val

            # Merge into Q values, subtracting the per-state mean advantage
            # so the value and advantage streams are identifiable.
            self.pred_q = tf.add(
                self.fc4_val,
                tf.subtract(self.fc4_adv,
                            tf.reduce_mean(self.fc4_adv, axis=1,
                                           keepdims=True)),
                name='pred_q')
            self.selected_action = tf.placeholder(
                tf.float32,
                shape=[None, self.num_actions],
                name='selected_action')

            if model_name.startswith('source'):
                self.pred_y = tf.reduce_sum(
                    tf.multiply(self.pred_q, self.selected_action), axis=1)
                self.loss = mean_huber_loss(self.y_true, self.pred_y)
                self.accumulated_avg_reward = tf.placeholder(
                    tf.float32, shape=(), name='accumulated_avg_reward')
                self.train = tf.train.AdamOptimizer(1e-4).minimize(
                    self.loss, name='Adam_minimizer')
                self.maxq_summary = tf.summary.scalar(
                    'Max_Q', tf.reduce_max(self.pred_q))
                self.loss_summary = tf.summary.scalar('Loss', self.loss)

                self.merged = tf.summary.merge_all()

                self.train_reward_val = tf.placeholder(tf.float32,
                                                       shape=(),
                                                       name='train_reward_val')
                self.reward_train_summary = tf.summary.scalar(
                    'Training_reward', self.train_reward_val)

                self.reward_summary = tf.summary.scalar(
                    'Average_reward', self.accumulated_avg_reward)
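
For reference, the merge above implements the dueling aggregation Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)). A small NumPy sketch with made-up values shows the arithmetic:

import numpy as np

value = np.array([[1.0], [2.0]])            # V(s), shape [batch, 1]
advantage = np.array([[0.5, 1.0, 1.5],      # A(s, a), shape [batch, num_actions]
                      [3.0, 0.0, 0.0]])
# Subtract the per-state mean advantage before adding the value stream.
q = value + (advantage - advantage.mean(axis=1, keepdims=True))
print(q)
# [[0.5 1.  1.5]
#  [4.  1.  1. ]]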
Example 4
    def create_dqn_model(self, window, input_shape, num_actions,
                         model_name):  # noqa: D103
        """Create the Q-network model.

                We highly recommend that you use tf.name_scope as discussed in
                class when creating the model and the layers. This will make it
                far easier to understnad your network architecture if you are
                logging with tensorboard.

                Parameters
                ----------
                window: int
                  Each input to the network is a sequence of frames. This value
                  defines how many frames are in the sequence.
                input_shape: tuple(int, int)
                  The expected input image size.
                num_actions: int
                  Number of possible actions. Defined by the gym environment.

                """
        # input placeholders

        with tf.name_scope(model_name) as scope:

            # Input and target placeholders.
            self.x = tf.placeholder(
                tf.float32,
                shape=[None, input_shape[0], input_shape[1], window],
                name='input_state')
            self.y_true = tf.placeholder(tf.float32,
                                         shape=[None, 1],
                                         name='target_q_val')

            # conv1 layer
            self.W_conv1 = tf.Variable(tf.truncated_normal([8, 8, window, 16],
                                                           stddev=0.1),
                                       name='W_conv1')
            self.b_conv1 = tf.Variable(tf.constant(0.1, shape=[16]),
                                       name='b_conv1')

            self.conv1 = tf.nn.conv2d(
                self.x, self.W_conv1, strides=[1, 4, 4, 1],
                padding='VALID') + self.b_conv1
            self.relu_conv1 = tf.nn.relu(self.conv1)

            # conv2 layer
            self.W_conv2 = tf.Variable(tf.truncated_normal([4, 4, 16, 32],
                                                           stddev=0.1),
                                       name='W_conv2')
            self.b_conv2 = tf.Variable(tf.constant(0.1, shape=[32]),
                                       name='b_conv2')

            self.conv2 = tf.nn.conv2d(self.relu_conv1,
                                      self.W_conv2,
                                      strides=[1, 2, 2, 1],
                                      padding='VALID') + self.b_conv2
            self.relu_conv2 = tf.nn.relu(self.conv2)
            self.relu_conv2_flat = tf.reshape(self.relu_conv2,
                                              [-1, 9 * 9 * 32])

            # fc3 layer
            self.W_fc3 = tf.Variable(tf.truncated_normal([9 * 9 * 32, 256],
                                                         stddev=0.1),
                                     name='W_fc3')
            self.b_fc3 = tf.Variable(tf.constant(0.1, shape=[256]),
                                     name='b_fc3')

            self.fc3 = tf.matmul(self.relu_conv2_flat, self.W_fc3) + self.b_fc3
            self.relu_fc3 = tf.nn.relu(self.fc3)

            # output layer
            self.W_fc4 = tf.Variable(tf.truncated_normal([256, num_actions],
                                                         stddev=0.1),
                                     name='W_output')
            self.b_fc4 = tf.Variable(tf.constant(0.1, shape=[num_actions]),
                                     name='b_output')

            # Selected Action is a one-hot encoding of which actions were chosen.
            self.selected_action = tf.placeholder(
                tf.float32,
                shape=[None, self.num_actions],
                name='selected_action')
            # Extract predicted Q values.
            self.pred_q = tf.add(tf.matmul(self.relu_fc3, self.W_fc4),
                                 self.b_fc4,
                                 name='pred_q')

            # For the source network only, not the target network.
            if model_name.startswith('source'):

                # Predicted Q at the executed action.
                self.pred_y = tf.reduce_sum(
                    tf.multiply(self.pred_q, self.selected_action), axis=1)
                # Loss value.
                self.loss = mean_huber_loss(self.y_true, self.pred_y)

                # Evaluation reward.
                self.accumulated_avg_reward = tf.placeholder(
                    tf.float32, shape=(), name='accumulated_avg_reward')

                # Train with ADAM.
                self.train = tf.train.AdamOptimizer(1e-4).minimize(
                    self.loss, name='Adam_minimizer')

                self.maxq_summary = tf.summary.scalar(
                    'Max_Q', tf.reduce_max(self.pred_q))
                self.loss_summary = tf.summary.scalar('Loss', self.loss)

                self.merged = tf.summary.merge_all()

                # Placeholder and summary for the training reward.
                self.train_reward_val = tf.placeholder(tf.float32,
                                                       shape=(),
                                                       name='train_reward_val')
                self.reward_train_summary = tf.summary.scalar(
                    'Training_reward', self.train_reward_val)

                self.reward_summary = tf.summary.scalar(
                    'Average_reward', self.accumulated_avg_reward)
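
The target_q_val placeholder above receives the usual one-step DQN targets. A hedged sketch of how they might be computed with a second (target) network instance; sess, target_model, rewards, next_states, dones, and gamma are assumptions for illustration:

import numpy as np

gamma = 0.99
# Q values of the next states from the (hypothetical) target network.
next_q = sess.run(target_model.pred_q,
                  feed_dict={target_model.x: next_states})
# One-step target: r + gamma * max_a Q_target(s', a), zeroed at terminal states.
y_true = rewards + gamma * np.max(next_q, axis=1) * (1.0 - dones)
y_true = y_true.reshape(-1, 1)  # the target_q_val placeholder is [None, 1]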
Example 5
    def create_linear_model(self, window, input_shape, num_actions,
                            model_name):
        """
                Create Linear network

                Parameters
                ----------
                window: int
                  Each input to the network is a sequence of frames. This value
                  defines how many frames are in the sequence.
                input_shape: tuple(int, int)
                  The expected input image size.
                num_actions: int
                  Number of possible actions. Defined by the gym environment.
                """
        # input placeholders

        with tf.name_scope(model_name) as scope:

            # Input and target placeholders.
            self.x = tf.placeholder(
                tf.float32,
                shape=[None, input_shape[0], input_shape[1], window],
                name='input_state')
            self.y_true = tf.placeholder(tf.float32,
                                         shape=[None, 1],
                                         name='target_q_val')

            # Reshape input.
            self.x_flat = tf.reshape(self.x, [-1, 84 * 84 * 4],
                                     name='flat_input')

            # linear layer
            self.W = tf.Variable(tf.truncated_normal(
                [84 * 84 * 4, num_actions], stddev=0.1),
                                 name='weight')
            self.b = tf.Variable(tf.constant(0.1, shape=[num_actions]),
                                 name='bias')

            # Extract predicted Q values.
            self.pred_q = tf.add(tf.matmul(self.x_flat, self.W),
                                 self.b,
                                 name='pred_q')
            # Selected Action is a one-hot encoding of which actions were chosen.
            self.selected_action = tf.placeholder(
                tf.float32,
                shape=[None, self.num_actions],
                name='selected_action')

            # Create the following summaries only for the source network.
            if model_name.startswith('source'):

                # Predicted y: Q values for the selected action.
                self.pred_y = tf.reduce_sum(
                    tf.multiply(self.pred_q, self.selected_action), axis=1)

                # Loss
                self.loss = mean_huber_loss(self.y_true, self.pred_y)

                # Evaluation reward.
                self.accumulated_avg_reward = tf.placeholder(
                    tf.float32, shape=(), name='accumulated_avg_reward')

                # Training op (Adam).
                self.train = tf.train.AdamOptimizer(1e-4).minimize(
                    self.loss, name='Adam_minimizer')
                self.maxq_summary = tf.summary.scalar(
                    'Max_Q', tf.reduce_max(self.pred_q))
                self.loss_summary = tf.summary.scalar('Loss', self.loss)

                self.merged = tf.summary.merge_all()

                # Placeholder and summary for the training reward.
                self.train_reward_val = tf.placeholder(tf.float32,
                                                       shape=(),
                                                       name='train_reward_val')
                self.reward_train_summary = tf.summary.scalar(
                    'Training_reward', self.train_reward_val)

                # Evaluation reward summary.
                self.reward_summary = tf.summary.scalar(
                    'Average_reward', self.accumulated_avg_reward)
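
A minimal, hypothetical training step for the class-based models above; agent (an instance holding the graph), the minibatch arrays states, actions, targets, and the summary writer are assumptions for illustration:

import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # One-hot encode the executed actions, as expected by selected_action.
    one_hot = np.eye(agent.num_actions)[actions]
    _, loss_summ = sess.run(
        [agent.train, agent.loss_summary],
        feed_dict={agent.x: states,
                   agent.selected_action: one_hot,
                   agent.y_true: targets.reshape(-1, 1)})
    writer.add_summary(loss_summ)  # e.g. a tf.summary.FileWriter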