Code Example #1
File: network.py  Project: sunjieee/tensorflow-rl
    def __init__(self, conf):
        """ Initialize hyper-parameters, set up optimizer and network 
        layers common across Q and Policy/V nets. """

        self.name = conf['name']
        self.num_actions = conf['num_act']
        self.arch = conf['args'].arch
        self.batch_size = conf['args'].batch_size
        self.optimizer_type = conf['args'].opt_type
        self.optimizer_mode = conf['args'].opt_mode
        self.clip_loss_delta = conf['args'].clip_loss_delta
        self.clip_norm = conf['args'].clip_norm
        self.clip_norm_type = conf['args'].clip_norm_type
        self.input_shape = conf['input_shape']
        self.use_recurrent = conf['args'].alg_type.endswith('-lstm')

        with tf.name_scope(self.name):
            self.selected_action_ph = tf.placeholder(
                'float32', [self.batch_size, self.num_actions],
                name='selected_action')

            if self.arch == 'FC':
                self.input_ph = tf.placeholder('float32', [self.batch_size] +
                                               self.input_shape + [4],
                                               name='input')
                self.w1, self.b1, self.o1 = layers.fc('fc1',
                                                      layers.flatten(
                                                          self.input_ph),
                                                      40,
                                                      activation='relu')
                self.w2, self.b2, self.o2 = layers.fc('fc2',
                                                      self.o1,
                                                      40,
                                                      activation='relu')
                self.ox = self.o2
            elif self.arch == 'ATARI-TRPO':
                self.input_ph = tf.placeholder('float32',
                                               [self.batch_size, 84, 84, 4],
                                               name='input')
                self.w1, self.b1, self.o1 = layers.conv2d(
                    'conv1', self.input_ph, 16, 4, 4, 2)
                self.w2, self.b2, self.o2 = layers.conv2d(
                    'conv2', self.o1, 16, 4, 16, 2)
                self.w3, self.b3, self.o3 = layers.fc('fc3',
                                                      layers.flatten(self.o2),
                                                      20,
                                                      activation='relu')
                self.ox = self.o3
            elif self.arch == 'NIPS':
                self.input_ph = tf.placeholder('float32',
                                               [self.batch_size, 84, 84, 4],
                                               name='input')
                self.w1, self.b1, self.o1 = layers.conv2d(
                    'conv1', self.input_ph, 16, 8, 4, 4)
                self.w2, self.b2, self.o2 = layers.conv2d(
                    'conv2', self.o1, 32, 4, 16, 2)
                self.w3, self.b3, self.o3 = layers.fc('fc3',
                                                      layers.flatten(self.o2),
                                                      256,
                                                      activation='relu')
                self.ox = self.o3
            elif self.arch == 'NATURE':
                self.input_ph = tf.placeholder('float32',
                                               [self.batch_size, 84, 84, 4],
                                               name='input')
                self.w1, self.b1, self.o1 = layers.conv2d(
                    'conv1', self.input_ph, 32, 8, 4, 4)
                self.w2, self.b2, self.o2 = layers.conv2d(
                    'conv2', self.o1, 64, 4, 32, 2)
                self.w3, self.b3, self.o3 = layers.conv2d(
                    'conv3', self.o2, 64, 3, 64, 1)
                self.w4, self.b4, self.o4 = layers.fc('fc4',
                                                      layers.flatten(self.o3),
                                                      512,
                                                      activation='relu')
                self.ox = self.o4
            else:
                raise Exception('Invalid architecture `{}`'.format(self.arch))

            if self.use_recurrent:
                layer_name = 'lstm_layer'
                self.hidden_state_size = 256
                with tf.variable_scope(self.name + '/' + layer_name) as vs:
                    self.lstm_cell = CustomBasicLSTMCell(
                        self.hidden_state_size, forget_bias=1.0)

                    self.step_size = tf.placeholder(tf.float32, [None],
                                                    name='step_size')
                    self.initial_lstm_state = tf.placeholder(
                        tf.float32, [None, 2 * self.hidden_state_size],
                        name='initial_state')

                    batch_size = tf.shape(self.step_size)[0]
                    ox_reshaped = tf.reshape(
                        self.ox,
                        [batch_size, -1,
                         self.ox.get_shape().as_list()[-1]])

                    lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
                        self.lstm_cell,
                        ox_reshaped,
                        initial_state=self.initial_lstm_state,
                        sequence_length=self.step_size,
                        time_major=False,
                        scope=vs)

                    self.ox = tf.reshape(lstm_outputs, [-1, 256],
                                         name='reshaped_lstm_outputs')

                    # Get all LSTM trainable params
                    self.lstm_trainable_variables = [
                        v for v in tf.trainable_variables()
                        if v.name.startswith(vs.name)
                    ]
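
The constructor above only reads hyper-parameters out of the conf dictionary. As a rough sketch of how such a dictionary might be assembled (the attribute names mirror the conf['args'].* fields accessed above; the concrete values and the Network class name are illustrative assumptions, not taken from the project):

from types import SimpleNamespace

# Hypothetical settings; only the keys/attributes read by __init__ above are filled in.
args = SimpleNamespace(
    arch='NIPS',              # one of 'FC', 'ATARI-TRPO', 'NIPS', 'NATURE'
    batch_size=32,
    opt_type='rmsprop',
    opt_mode='shared',
    clip_loss_delta=1.0,
    clip_norm=40.0,
    clip_norm_type='global',
    alg_type='a3c-lstm',      # endswith('-lstm') switches on the recurrent branch
)

conf = {
    'name': 'local_network_0',
    'num_act': 6,
    'input_shape': [84, 84],  # only consulted by the 'FC' architecture
    'args': args,
}

# network = Network(conf)    # assuming the enclosing class is called Network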
Code Example #2
    def _build_policy_head(self, input_state):
        self.adv_actor_ph = tf.placeholder("float", [self.batch_size],
                                           name='advantage')

        with tf.variable_scope(self.name + '/lstm_decoder') as vs:
            self.action_outputs = tf.placeholder(
                tf.float32, [self.batch_size, None, self.num_actions + 1],
                name='action_outputs')
            self.action_inputs = tf.placeholder(
                tf.float32, [self.batch_size, None, self.num_actions + 1],
                name='action_inputs')

            self.decoder_seq_lengths = tf.placeholder(
                tf.int32, [self.batch_size], name='decoder_seq_lengths')
            self.allowed_actions = tf.placeholder(
                tf.float32, [self.batch_size, None, self.num_actions + 1],
                name='allowed_actions')
            self.use_fixed_action = tf.placeholder(tf.bool,
                                                   name='use_fixed_action')
            self.temperature = tf.placeholder(tf.float32, name='temperature')

            self.decoder_hidden_state_size = input_state.get_shape().as_list(
            )[-1]
            self.decoder_lstm_cell = CustomBasicLSTMCell(
                self.decoder_hidden_state_size, forget_bias=1.0)
            self.decoder_initial_state = tf.placeholder(
                tf.float32,
                [self.batch_size, 2 * self.decoder_hidden_state_size],
                name='decoder_initial_state')

            self.network_state = tf.concat(
                axis=1,
                values=[
                    tf.zeros_like(input_state), input_state
                    # input_state, tf.zeros_like(input_state)
                ])

            self.W_actions = tf.get_variable(
                'W_actions',
                shape=[self.decoder_hidden_state_size, self.num_actions + 1],
                dtype='float32',
                initializer=tf.contrib.layers.xavier_initializer())
            self.b_actions = tf.get_variable(
                'b_actions',
                shape=[self.num_actions + 1],
                dtype='float32',
                initializer=tf.zeros_initializer())

            self.decoder_state, self.logits, self.actions = decoder(
                self.action_inputs,
                self.network_state,
                self.decoder_lstm_cell,
                self.decoder_seq_lengths,
                self.W_actions,
                self.b_actions,
                self.max_decoder_steps,
                vs,
                self.use_fixed_action,
                self.action_outputs,
                loop_function=loop_gumbel_softmax(self.temperature),
            )

            self.decoder_trainable_variables = [
                v for v in tf.trainable_variables()
                if v.name.startswith(vs.name)
            ]

        print('Decoder out: s,l,a=', self.decoder_state.get_shape(),
              self.logits.get_shape(), self.actions.get_shape())

        #mask softmax by allowed actions
        exp_logits = tf.exp(self.logits) * self.allowed_actions
        Z = tf.expand_dims(tf.reduce_sum(exp_logits, 2), 2)
        self.action_probs = exp_logits / Z
        log_action_probs = self.logits - tf.log(Z)

        sequence_probs = tf.reduce_prod(
            tf.reduce_sum(self.action_probs * self.action_outputs, 2), 1)
        log_sequence_probs = tf.reduce_sum(
            tf.reduce_sum(log_action_probs * self.action_outputs, 2), 1)

        # ∏a_i * ∑ log a_i
        self.output_layer_entropy = -tf.reduce_sum(
            tf.stop_gradient(1 + log_sequence_probs) * log_sequence_probs)
        self.entropy = -tf.reduce_sum(log_sequence_probs)

        print('sp, lsp:', sequence_probs.get_shape(),
              log_sequence_probs.get_shape())

        self.actor_advantage_term = tf.reduce_sum(
            log_sequence_probs[:self.max_local_steps] * self.adv_actor_ph)
        self.actor_entropy_term = self.beta * self.output_layer_entropy
        self.actor_objective = -(self.actor_advantage_term +
                                 self.actor_entropy_term)

        return self.actor_objective
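
The block that computes exp_logits, Z and action_probs above is a softmax masked by the allowed actions: disallowed actions are zeroed before normalization, so the probabilities renormalize over the legal actions only. A small NumPy sketch of the same computation with made-up numbers:

import numpy as np

# One decoding step for one batch element, num_actions + 1 = 4 outputs.
logits = np.array([2.0, 1.0, 0.5, -1.0])
allowed = np.array([1.0, 0.0, 1.0, 1.0])   # action 1 is masked out

exp_logits = np.exp(logits) * allowed      # disallowed actions contribute 0
Z = exp_logits.sum()
action_probs = exp_logits / Z              # sums to 1 over the allowed actions
log_action_probs = logits - np.log(Z)      # matches log(action_probs) wherever allowed == 1

print(action_probs)        # approximately [0.786, 0., 0.175, 0.039]
print(action_probs.sum())  # 1.0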
Code Example #3
    def __init__(
            self,
            action_size,
            thread_index,  # -1 for global
            device="/cpu:0"):
        GameACNetwork.__init__(self, action_size, device)

        with tf.device(self._device):
            self.W_conv1 = self._conv_weight_variable([8, 8, 4,
                                                       16])  # stride=4
            self.b_conv1 = self._conv_bias_variable([16], 8, 8, 4)

            self.W_conv2 = self._conv_weight_variable([4, 4, 16,
                                                       32])  # stride=2
            self.b_conv2 = self._conv_bias_variable([32], 4, 4, 16)

            self.W_fc1 = self._fc_weight_variable([2592, 256])
            self.b_fc1 = self._fc_bias_variable([256], 2592)

            # lstm
            self.lstm = CustomBasicLSTMCell(256)

            # weight for policy output layer
            self.W_fc2 = self._fc_weight_variable([256, action_size])
            self.b_fc2 = self._fc_bias_variable([action_size], 256)

            # weight for value output layer
            self.W_fc3 = self._fc_weight_variable([256, 1])
            self.b_fc3 = self._fc_bias_variable([1], 256)

            # state (input)
            self.s = tf.placeholder("float", [None, 84, 84, 4])

            h_conv1 = tf.nn.relu(
                self._conv2d(self.s, self.W_conv1, 4) + self.b_conv1)
            h_conv2 = tf.nn.relu(
                self._conv2d(h_conv1, self.W_conv2, 2) + self.b_conv2)

            h_conv2_flat = tf.reshape(h_conv2, [-1, 2592])
            h_fc1 = tf.nn.relu(
                tf.matmul(h_conv2_flat, self.W_fc1) + self.b_fc1)
            # h_fc1 shape=(5,256)

            h_fc1_reshaped = tf.reshape(h_fc1, [1, -1, 256])
            # h_fc_reshaped = (1,5,256)

            # place holder for LSTM unrolling time step size.
            self.step_size = tf.placeholder(tf.float32, [1])

            self.initial_lstm_state = tf.placeholder(tf.float32,
                                                     [1, self.lstm.state_size])

            scope = "net_" + str(thread_index)

            # Unrolling the LSTM up to LOCAL_T_MAX time steps (= 5 time steps).
            # When an episode terminates, the unrolling length becomes less than LOCAL_TIME_STEP.
            # The unrolling step size is applied via the self.step_size placeholder.
            # When forward propagating, step_size is 1.
            # (time_major=False, so the output shape is [batch_size, max_time, cell.output_size])
            lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
                self.lstm,
                h_fc1_reshaped,
                initial_state=self.initial_lstm_state,
                sequence_length=self.step_size,
                time_major=False,
                scope=scope)

            # lstm_outputs: (1,5,256) for back prop, (1,1,256) for forward prop.

            lstm_outputs = tf.reshape(lstm_outputs, [-1, 256])

            # policy (output)
            self.pi = tf.nn.softmax(
                tf.matmul(lstm_outputs, self.W_fc2) + self.b_fc2)

            # value (output)
            v_ = tf.matmul(lstm_outputs, self.W_fc3) + self.b_fc3
            self.v = tf.reshape(v_, [-1])

            self.reset_state()
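
The comments above describe how step_size and initial_lstm_state drive the unrolling. A hedged sketch of how a caller could use the network for a single forward step (the method names and the zero-state reset are assumptions about the surrounding class, which is not shown here):

import numpy as np

def reset_state(self):
    # A plausible implementation of the reset_state() called at the end of __init__:
    # shape (1, state_size) because the graph is built for a single rollout thread.
    self.lstm_state_out = np.zeros([1, self.lstm.state_size], dtype=np.float32)

def run_policy_and_value(self, sess, s_t):
    # Forward pass for one frame stack: step_size is 1 and the recurrent state
    # is carried between calls through self.lstm_state_out.
    pi_out, v_out, self.lstm_state_out = sess.run(
        [self.pi, self.v, self.lstm_state],
        feed_dict={
            self.s: [s_t],                                 # shape (1, 84, 84, 4)
            self.initial_lstm_state: self.lstm_state_out,  # shape (1, state_size)
            self.step_size: [1],
        })
    return pi_out[0], v_out[0]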
Code Example #4
    def __init__(
            self,
            action_size,
            thread_index,  # -1 for global
            device="/cpu:0"):
        GameACNetwork.__init__(self, action_size, device)
        print("Initializing LSTM Network ")

        with tf.device(self._device):
            self.W_conv1 = self._conv_weight_variable(
                [8, 1, len(FEATURES_LIST), 16])  # stride=4
            self.b_conv1 = self._conv_bias_variable([16], 8, 1,
                                                    len(FEATURES_LIST))

            self.W_conv2 = self._conv_weight_variable([1, 1, 16,
                                                       32])  # stride=2
            self.b_conv2 = self._conv_bias_variable([32], 1, 1, 16)

            self.W_fc1 = self._fc_weight_variable([2592, 256])
            self.b_fc1 = self._fc_bias_variable([256], 2592)

            # lstm
            self.lstm = CustomBasicLSTMCell(256)

            # 256 must be larger than SEQUENCE_LENGTH
            # weight for policy output layer
            self.W_fc2 = self._fc_weight_variable([256, action_size])
            self.b_fc2 = self._fc_bias_variable([action_size], 256)

            # weight for value output layer
            self.W_fc3 = self._fc_weight_variable([256, 1])
            self.b_fc3 = self._fc_bias_variable([1], 256)

            # state (input)
            #self.s = tf.placeholder("float", [None, 84, 84, 4])
            self.s = tf.placeholder(
                "float", [None, SEQUENCE_LENGTH, 1,
                          len(FEATURES_LIST)])

            h_conv1 = tf.nn.relu(
                self._conv2d(self.s, self.W_conv1, 1) + self.b_conv1)
            h_conv2 = tf.nn.relu(
                self._conv2d(h_conv1, self.W_conv2, 2) + self.b_conv2)

            h_conv2_flat = tf.reshape(h_conv2, [-1, 2592])
            h_fc1 = tf.nn.relu(
                tf.matmul(h_conv2_flat, self.W_fc1) + self.b_fc1)
            # h_fc1 shape=(5,256)
            ##h_fc1 = tf.Print(h_fc1, [h_fc1], message="NN This is h_fc1: ", summarize=40)

            h_fc1_reshaped = tf.reshape(h_fc1, [1, -1, 256])
            # h_fc_reshaped = (1,5,256)

            self.step_size = tf.placeholder(tf.float32, [1])

            self.initial_lstm_state = tf.placeholder(tf.float32,
                                                     [1, self.lstm.state_size])

            scope = "net_" + str(thread_index)

            # time_major = False, so output shape is [batch_size, max_time, cell.output_size]
            lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
                self.lstm,
                h_fc1_reshaped,
                initial_state=self.initial_lstm_state,
                sequence_length=self.step_size,
                time_major=False,
                scope=scope)

            # lstm_outputs: (1,5,256), (1,1,256)

            lstm_outputs = tf.reshape(lstm_outputs, [-1, 256])

            # policy (output)
            self.pi = tf.nn.softmax(
                tf.matmul(lstm_outputs, self.W_fc2) + self.b_fc2)
            ##self.pi = tf.Print(self.pi, [self.pi], message="NN This is self.pi: ", summarize=40)

            # value (output)
            v_ = tf.matmul(lstm_outputs, self.W_fc3) + self.b_fc3
            ##v_ = tf.Print(v_, [v_], message="NN This is v_ ", summarize=40)
            self.v = tf.reshape(v_, [-1])
            ##self.v = tf.Print(self.v, [self.v], message="NN This is self.v: ", summarize=40)

            # in OK  tensorflow/core/kernels/logging_ops.cc:79] NN This is self.v: [-0.036351625]
            #I tensorflow/core/kernels/logging_ops.cc:79] NN This is self.pi: [0.49193981 0.50806022]
            #I tensorflow/core/kernels/logging_ops.cc:79] NN This is self.v: [-0.03456594]

            self.reset_state()
            print("Initializing Network finished")
Code Example #5
File: actor_net.py  Project: walternie/RDPG
    def __init__(self, N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE):
        self.N_STATES = N_STATES
        self.N_ACTIONS = N_ACTIONS
        self.MAX_STEP = MAX_STEP
        self.BATCH_SIZE = BATCH_SIZE
        self.g = tf.Graph()
        with self.g.as_default():
            self.sess = tf.InteractiveSession()
            """
            Actor network:
            """
            self.a_input_states = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='input_placeholder')
            self.a_grad_from_critic = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='input_placeholder')

            self.W_a = tf.Variable(
                tf.random_normal([HIDDEN_UNITS, self.N_ACTIONS]))
            self.B_a = tf.Variable(tf.random_normal([1], -0.003, 0.003))
            #lstms
            with tf.variable_scope('actor'):
                self.lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS
                )  #basiclstmcell modified to get access to cell weights
                self.lstm_layers = [self.lstm_cell] * N_LAYERS
                self.lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.lstm_layers, state_is_tuple=True)
                self.init_state = self.lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
                    self.lstm_cell,
                    self.a_input_states,
                    initial_state=self.init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.a_input_states))
            self.lstm_outputs_list = tf.transpose(self.lstm_outputs, [1, 0, 2])
            self.lstm_outputs_list = tf.reshape(self.lstm_outputs_list,
                                                [-1, HIDDEN_UNITS])
            self.lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                              self.lstm_outputs_list)
            #prediction(output) at each time step:(list of tensors)
            self.pred_t = [
                tf.matmul(self.lstm_output, self.W_a) + self.B_a
                for self.lstm_output in self.lstm_outputs_list
            ]
            self.pred_t_array = tf.pack(self.pred_t)
            self.pred_t_array = tf.transpose(
                self.pred_t_array,
                [1, 0, 2])  #(to get shape of batch sizexstepxdimension)
            """
            last relevant action (while evaluating actor during testing)
            """
            self.last_lstm_output = self.last_relevant(
                self.lstm_outputs, self.length(self.a_input_states))
            self.action_last_state = tf.matmul(self.last_lstm_output,
                                               self.W_a) + self.B_a
            #optimizer:
            #self.params = tf.trainable_variables()
            self.params = [
                self.lstm_layers[0].weights, self.lstm_layers[0].bias,
                self.lstm_layers[1].weights, self.lstm_layers[1].bias,
                self.W_a, self.B_a
            ]
            self.a_grad_from_criticT = tf.transpose(self.a_grad_from_critic,
                                                    perm=[1, 0, 2])
            self.gradient = tf.gradients(
                tf.pack(self.pred_t), self.params, -self.a_grad_from_criticT /
                (self.MAX_STEP *
                 BATCH_SIZE))  #- because we are interested in maximization
            self.opt = tf.train.AdamOptimizer(LEARNING_RATE)
            self.optimizer = self.opt.apply_gradients(
                zip(self.gradient, self.params))
            print("Initialized Actor Network...")
            """
            Target Actor network:
            """
            self.t_a_input_states = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='input_placeholder')
            self.t_a_grad_from_critic = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='input_placeholder')

            self.t_W_a = tf.Variable(
                tf.random_normal([HIDDEN_UNITS, self.N_ACTIONS]))
            self.t_B_a = tf.Variable(tf.random_normal([1], -0.003, 0.003))
            #lstms
            with tf.variable_scope('target_actor'):
                self.t_lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS
                )  #basiclstmcell modified to get access to cell weights
                self.t_lstm_layers = [self.t_lstm_cell] * N_LAYERS
                self.t_lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.t_lstm_layers, state_is_tuple=True)
                self.t_init_state = self.t_lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.t_lstm_outputs, self.t_final_state = tf.nn.dynamic_rnn(
                    self.t_lstm_cell,
                    self.t_a_input_states,
                    initial_state=self.t_init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.t_a_input_states))
            self.t_lstm_outputs_list = tf.transpose(self.t_lstm_outputs,
                                                    [1, 0, 2])
            self.t_lstm_outputs_list = tf.reshape(self.t_lstm_outputs_list,
                                                  [-1, HIDDEN_UNITS])
            self.t_lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                                self.t_lstm_outputs_list)
            #prediction(output) at each time step:(list of tensors)
            self.t_pred_t = [
                tf.matmul(self.t_lstm_output, self.t_W_a) + self.t_B_a
                for self.t_lstm_output in self.t_lstm_outputs_list
            ]
            self.t_pred_t = tf.pack(self.t_pred_t)
            self.t_pred_t = tf.transpose(
                self.t_pred_t,
                [1, 0, 2])  #(to get shape of batch sizexstepxdimension)
            """
            last relevant action (while evaluating actor during testing)
            """
            self.t_last_lstm_output = self.last_relevant(
                self.t_lstm_outputs, self.length(self.t_a_input_states))
            self.t_action_last_state = tf.matmul(self.t_last_lstm_output,
                                                 self.t_W_a) + self.t_B_a
            print("Initialized Target Actor Network...")
            self.sess.run(tf.initialize_all_variables())

            #To initialize critic and target with the same values:
            # copy target parameters
            self.sess.run([
                self.t_lstm_layers[0].weights.assign(
                    self.lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(self.lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    self.lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(self.lstm_layers[1].bias),
                self.t_W_a.assign(self.W_a),
                self.t_B_a.assign(self.B_a)
            ])

            self.update_target_actor_op = [
                self.t_lstm_layers[0].weights.assign(
                    TAU * self.lstm_layers[0].weights +
                    (1 - TAU) * self.t_lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(
                    TAU * self.lstm_layers[0].bias +
                    (1 - TAU) * self.t_lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    TAU * self.lstm_layers[1].weights +
                    (1 - TAU) * self.t_lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(
                    TAU * self.lstm_layers[1].bias +
                    (1 - TAU) * self.t_lstm_layers[1].bias),
                self.t_W_a.assign(TAU * self.W_a + (1 - TAU) * self.t_W_a),
                self.t_B_a.assign(TAU * self.B_a + (1 - TAU) * self.t_B_a)
            ]
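
The update_target_actor_op list above is a soft ("Polyak") update of the target parameters, theta_target <- TAU * theta + (1 - TAU) * theta_target. A tiny NumPy sketch of the same rule with an illustrative TAU (the snippet assumes TAU is a module-level constant):

import numpy as np

TAU = 0.001                             # illustrative value

theta = np.array([0.5, -1.2])           # online (actor) parameter
theta_target = np.array([0.0, 0.0])     # target-network copy

# One soft update step, mirroring t_W_a.assign(TAU * W_a + (1 - TAU) * t_W_a):
theta_target = TAU * theta + (1 - TAU) * theta_target
print(theta_target)                     # moves a small fraction of the way towards theta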
Code Example #6
File: actor_net.py  Project: walternie/RDPG
class ActorNet:
    """ Actor Neural Network model of the RDPG algorithm """
    def __init__(self, N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE):
        self.N_STATES = N_STATES
        self.N_ACTIONS = N_ACTIONS
        self.MAX_STEP = MAX_STEP
        self.BATCH_SIZE = BATCH_SIZE
        self.g = tf.Graph()
        with self.g.as_default():
            self.sess = tf.InteractiveSession()
            """
            Actor network:
            """
            self.a_input_states = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='input_placeholder')
            self.a_grad_from_critic = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='input_placeholder')

            self.W_a = tf.Variable(
                tf.random_normal([HIDDEN_UNITS, self.N_ACTIONS]))
            self.B_a = tf.Variable(tf.random_normal([1], -0.003, 0.003))
            #lstms
            with tf.variable_scope('actor'):
                self.lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS
                )  #basiclstmcell modified to get access to cell weights
                self.lstm_layers = [self.lstm_cell] * N_LAYERS
                self.lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.lstm_layers, state_is_tuple=True)
                self.init_state = self.lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
                    self.lstm_cell,
                    self.a_input_states,
                    initial_state=self.init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.a_input_states))
            self.lstm_outputs_list = tf.transpose(self.lstm_outputs, [1, 0, 2])
            self.lstm_outputs_list = tf.reshape(self.lstm_outputs_list,
                                                [-1, HIDDEN_UNITS])
            self.lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                              self.lstm_outputs_list)
            #prediction(output) at each time step:(list of tensors)
            self.pred_t = [
                tf.matmul(self.lstm_output, self.W_a) + self.B_a
                for self.lstm_output in self.lstm_outputs_list
            ]
            self.pred_t_array = tf.pack(self.pred_t)
            self.pred_t_array = tf.transpose(
                self.pred_t_array,
                [1, 0, 2])  #(to get shape of batch sizexstepxdimension)
            """
            last relevant action (while evaluating actor during testing)
            """
            self.last_lstm_output = self.last_relevant(
                self.lstm_outputs, self.length(self.a_input_states))
            self.action_last_state = tf.matmul(self.last_lstm_output,
                                               self.W_a) + self.B_a
            #optimizer:
            #self.params = tf.trainable_variables()
            self.params = [
                self.lstm_layers[0].weights, self.lstm_layers[0].bias,
                self.lstm_layers[1].weights, self.lstm_layers[1].bias,
                self.W_a, self.B_a
            ]
            self.a_grad_from_criticT = tf.transpose(self.a_grad_from_critic,
                                                    perm=[1, 0, 2])
            self.gradient = tf.gradients(
                tf.pack(self.pred_t), self.params, -self.a_grad_from_criticT /
                (self.MAX_STEP *
                 BATCH_SIZE))  #- because we are interested in maximization
            self.opt = tf.train.AdamOptimizer(LEARNING_RATE)
            self.optimizer = self.opt.apply_gradients(
                zip(self.gradient, self.params))
            print("Initialized Actor Network...")
            """
            Target Actor network:
            """
            self.t_a_input_states = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='input_placeholder')
            self.t_a_grad_from_critic = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='input_placeholder')

            self.t_W_a = tf.Variable(
                tf.random_normal([HIDDEN_UNITS, self.N_ACTIONS]))
            self.t_B_a = tf.Variable(tf.random_normal([1], -0.003, 0.003))
            #lstms
            with tf.variable_scope('target_actor'):
                self.t_lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS
                )  #basiclstmcell modified to get access to cell weights
                self.t_lstm_layers = [self.t_lstm_cell] * N_LAYERS
                self.t_lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.t_lstm_layers, state_is_tuple=True)
                self.t_init_state = self.t_lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.t_lstm_outputs, self.t_final_state = tf.nn.dynamic_rnn(
                    self.t_lstm_cell,
                    self.t_a_input_states,
                    initial_state=self.t_init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.t_a_input_states))
            self.t_lstm_outputs_list = tf.transpose(self.t_lstm_outputs,
                                                    [1, 0, 2])
            self.t_lstm_outputs_list = tf.reshape(self.t_lstm_outputs_list,
                                                  [-1, HIDDEN_UNITS])
            self.t_lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                                self.t_lstm_outputs_list)
            #prediction(output) at each time step:(list of tensors)
            self.t_pred_t = [
                tf.matmul(self.t_lstm_output, self.t_W_a) + self.t_B_a
                for self.t_lstm_output in self.t_lstm_outputs_list
            ]
            self.t_pred_t = tf.pack(self.t_pred_t)
            self.t_pred_t = tf.transpose(
                self.t_pred_t,
                [1, 0, 2])  #(to get shape of batch sizexstepxdimension)
            """
            last relevant action (while evaluating actor during testing)
            """
            self.t_last_lstm_output = self.last_relevant(
                self.t_lstm_outputs, self.length(self.t_a_input_states))
            self.t_action_last_state = tf.matmul(self.t_last_lstm_output,
                                                 self.t_W_a) + self.t_B_a
            print("Initialized Target Actor Network...")
            self.sess.run(tf.initialize_all_variables())

            #To initialize critic and target with the same values:
            # copy target parameters
            self.sess.run([
                self.t_lstm_layers[0].weights.assign(
                    self.lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(self.lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    self.lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(self.lstm_layers[1].bias),
                self.t_W_a.assign(self.W_a),
                self.t_B_a.assign(self.B_a)
            ])

            self.update_target_actor_op = [
                self.t_lstm_layers[0].weights.assign(
                    TAU * self.lstm_layers[0].weights +
                    (1 - TAU) * self.t_lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(
                    TAU * self.lstm_layers[0].bias +
                    (1 - TAU) * self.t_lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    TAU * self.lstm_layers[1].weights +
                    (1 - TAU) * self.t_lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(
                    TAU * self.lstm_layers[1].bias +
                    (1 - TAU) * self.t_lstm_layers[1].bias),
                self.t_W_a.assign(TAU * self.W_a + (1 - TAU) * self.t_W_a),
                self.t_B_a.assign(TAU * self.B_a + (1 - TAU) * self.t_B_a)
            ]

    def length(self, data):
        used = tf.sign(tf.reduce_max(tf.abs(data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    def last_relevant(
        self, output, length
    ):  #method used while evaluating target net: where input is one or few time steps
        L_BATCH_SIZE = tf.shape(output)[0]
        max_length = int(output.get_shape()[1])
        out_size = int(output.get_shape()[2])
        index = tf.range(0, L_BATCH_SIZE) * max_length + (length - 1)
        flat = tf.reshape(output, [-1, out_size])
        relevant = tf.gather(flat, index)
        return relevant

    def train_actor(self, o_n_t, del_Q_a):
        self.sess.run(self.optimizer,
                      feed_dict={
                          self.a_input_states: o_n_t,
                          self.a_grad_from_critic: del_Q_a
                      })

    def evaluate_actor(self, o_n_t):
        return self.sess.run(self.action_last_state,
                             feed_dict={self.a_input_states: o_n_t})[0]

    def evaluate_actor_batch(self, o_n_t):
        return self.sess.run(self.pred_t_array,
                             feed_dict={self.a_input_states: o_n_t})

    def evaluate_target_actor(self, o_n_t):
        return self.sess.run(self.t_pred_t,
                             feed_dict={self.t_a_input_states: o_n_t})

    def update_target_actor(self):
        self.sess.run(self.update_target_actor_op)
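
Driving the class only requires the constructor arguments and zero-padded rollout arrays. A minimal usage sketch, assuming the module-level constants (HIDDEN_UNITS, N_LAYERS, TAU, LEARNING_RATE) are defined wherever ActorNet is imported from and that the batch fed in matches BATCH_SIZE:

import numpy as np
# from actor_net import ActorNet   # assuming the class above lives in actor_net.py

N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE = 3, 1, 20, 32
actor = ActorNet(N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE)

# Zero-padded observation histories; length() infers the true episode lengths
# from the all-zero padding steps.
o_n_t = np.zeros((BATCH_SIZE, MAX_STEP, N_STATES), dtype=np.float32)
o_n_t[:, :5, :] = np.random.randn(BATCH_SIZE, 5, N_STATES)

# Action at the last relevant time step of the first sequence in the batch:
a_last = actor.evaluate_actor(o_n_t)

# Gradient of Q w.r.t. the actions, e.g. from CriticNet.compute_critic_gradient:
del_Q_a = np.random.randn(BATCH_SIZE, MAX_STEP, N_ACTIONS).astype(np.float32)
actor.train_actor(o_n_t, del_Q_a)
actor.update_target_actor()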
Code Example #7
    def _create_network(self):
        state_dim = self._state_dim
        state_chn = self._state_chn
        action_dim = self._action_dim
        with tf.device(self._device):
            # state input
            self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_chn])

            # conv1
            self.W_conv1 = weight_variable([8, 8, state_chn, 16])
            self.b_conv1 = bias_variable([16])
            h_conv1 = tf.nn.relu(conv2d(self.state_input, self.W_conv1, 4) + self.b_conv1)

            # conv2
            self.W_conv2 = weight_variable([4, 4, 16, 32])
            self.b_conv2 = bias_variable([32])
            h_conv2 = tf.nn.relu(conv2d(h_conv1, self.W_conv2, 2) + self.b_conv2)

            h_conv2_out_size = np.prod(h_conv2.get_shape().as_list()[1:])
            print('h_conv2_out_size', h_conv2_out_size)
            h_conv2_flat = tf.reshape(h_conv2, [-1, h_conv2_out_size])

            # conv3
            # self.W_conv3 = weight_variable([3, 3, 32, 64])
            # self.b_conv3 = bias_variable([64])
            # h_conv3 = tf.nn.relu(conv2d(h_conv2, self.W_conv3, 1) + self.b_conv3)

            # h_conv3_out_size = np.prod(h_conv3.get_shape().as_list()[1:])
            # print 'h_conv3_out_size', h_conv3_out_size
            # h_conv3_flat = tf.reshape(h_conv3, [-1, h_conv3_out_size])

            # fc1
            self.W_fc1 = weight_variable([h_conv2_out_size, 256])
            self.b_fc1 = bias_variable([256])
            h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, self.W_fc1) + self.b_fc1)

            # reshape to fit lstm (1, 5, 256)
            h_fc1_reshaped = tf.reshape(h_fc1, [1, -1, 256])

            self.lstm = CustomBasicLSTMCell(256)
            self.step_size = tf.placeholder('float', [1])
            self.initial_lstm_state = tf.placeholder('float', [1, self.lstm.state_size])
            scope = 'net_' + str(self._thread_index)

            # Unrolling LSTM up to LOCAL_T_MAX time steps. (= 5time steps.)
            # (time_major = False, so output shape is [batch_size, max_time, cell.output_size])
            # refer: https://www.tensorflow.org/versions/r0.11/api_docs/python/nn.html#dynamic_rnn
            lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
                self.lstm,
                h_fc1_reshaped,
                initial_state=self.initial_lstm_state,
                sequence_length=self.step_size,
                time_major=False,
                scope=scope
            )
            print(lstm_outputs.get_shape())
            lstm_outputs = tf.reshape(lstm_outputs, [-1, 256])

            # fc2: (pi) for policy output
            self.W_fc2 = weight_variable([256, action_dim])
            self.b_fc2 = bias_variable([action_dim])
            self.policy_output = tf.nn.softmax(tf.matmul(lstm_outputs, self.W_fc2) + self.b_fc2)

            # fc3: (v)  for value output
            self.W_fc3 = weight_variable([256, 1])
            self.b_fc3 = bias_variable([1])
            v_ = tf.matmul(lstm_outputs, self.W_fc3) + self.b_fc3
            self.value_output = tf.reshape(v_, [-1])

            self.reset_lstm_state()
        return
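
This variant relies on weight_variable, bias_variable and conv2d helpers that are not included in the snippet. A plausible set of definitions in the style of the classic TensorFlow tutorials (these are assumptions about the missing helpers, not the project's actual code; with 'VALID' padding the two convolutions reduce an 84x84x4 input to 9x9x32 = 2592 features, matching the related snippets above):

import tensorflow as tf

def weight_variable(shape):
    # Small truncated-normal initialization, as in the TF MNIST tutorials.
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.01, shape=shape)
    return tf.Variable(initial)

def conv2d(x, W, stride):
    # The padding mode is a guess; h_conv2_out_size above depends on it.
    return tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='VALID')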
Code Example #8
    def __init__(self, N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE):
        self.N_STATES = N_STATES
        self.N_ACTIONS = N_ACTIONS
        self.MAX_STEP = MAX_STEP
        self.BATCH_SIZE = BATCH_SIZE
        self.g = tf.Graph()
        with self.g.as_default():
            self.sess = tf.InteractiveSession()
            """
            Critic Q network:
            """
            self.c_input_state = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='inputstate_placeholder')
            self.c_input_action = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='inputaction_placeholder')
            self.c_input = tf.concat(2,
                                     [self.c_input_state, self.c_input_action])
            self.c_target = tf.placeholder(
                "float", [None, self.MAX_STEP, TARGET_VALUE_DIMENSION],
                name='label_placeholder')
            self.W_c = tf.Variable(tf.random_normal([HIDDEN_UNITS, 1]))
            self.B_c = tf.Variable(tf.random_normal([1], -0.003, 0.003))
            #lstms
            with tf.variable_scope('critic'):
                self.lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS
                )  #basiclstmcell modified to get access to cell weights
                self.lstm_layers = [self.lstm_cell] * N_LAYERS
                self.lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.lstm_layers, state_is_tuple=True)
                self.init_state = self.lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
                    self.lstm_cell,
                    self.c_input,
                    initial_state=self.init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.c_input))
            self.lstm_outputs_list = tf.transpose(self.lstm_outputs, [1, 0, 2])
            self.lstm_outputs_list = tf.reshape(self.lstm_outputs_list,
                                                [-1, HIDDEN_UNITS])
            self.lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                              self.lstm_outputs_list)
            #prediction(output) at each time step:(list of tensors)
            self.pred_t = [
                tf.matmul(self.lstm_output, self.W_c) + self.B_c
                for self.lstm_output in self.lstm_outputs_list
            ]
            #converting target value to list of tensors
            self.c_target_list = tf.transpose(self.c_target, [1, 0, 2])
            self.c_target_list = tf.reshape(self.c_target_list,
                                            [-1, TARGET_VALUE_DIMENSION])
            self.c_target_list = tf.split(0, self.MAX_STEP, self.c_target_list)
            #optimizer:

            self.predt_T = tf.pack(self.pred_t)
            #transposing it to shape BatchsizeXtimestepXdimension:
            self.predt_T = tf.transpose(self.predt_T, [1, 0, 2])

            self.square_diff = tf.pow((self.predt_T - self.c_target), 2)
            #no. of time_steps:
            self.eff_time_step = tf.reduce_sum(self.length(self.c_input))
            self.eff_time_step = tf.to_float(self.eff_time_step)
            #mean loss over time step:
            self.loss_t = tf.reduce_sum(
                self.square_diff, reduction_indices=1) / self.eff_time_step
            self.loss_n = tf.reduce_sum(self.loss_t,
                                        reduction_indices=0) / self.BATCH_SIZE

            #self.params = tf.trainable_variables()
            self.params = [
                self.lstm_layers[0].weights, self.lstm_layers[0].bias,
                self.lstm_layers[1].weights, self.lstm_layers[1].bias,
                self.W_c, self.B_c
            ]
            #self.gradient = tf.gradients(tf.pack(self.pred_t),self.params,tf.sub(self.pred_t,self.c_target_list)/(self.MAX_STEP*self.BATCH_SIZE))
            self.gradient = tf.gradients(self.loss_n, self.params)
            #self.gradient = tf.gradients(self.predt_T,self.params,(self.predt_T-self.c_target)/(self.MAX_STEP*self.BATCH_SIZE))
            self.critic_gradient = tf.gradients(self.predt_T,
                                                self.c_input_action)
            self.opt = tf.train.AdamOptimizer(LEARNING_RATE)
            self.optimizer = self.opt.apply_gradients(
                zip(self.gradient, self.params))
            print("Initialized Critic Network...")
            """
            Target critic Q network:
            """
            #critic_q_model_parameters:
            self.t_c_input_state = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='inputstate_placeholder')
            self.t_c_input_action = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='inputaction_placeholder')
            self.t_c_input = tf.concat(
                2, [self.t_c_input_state, self.t_c_input_action])
            self.t_c_target = tf.placeholder(
                "float", [None, self.MAX_STEP, TARGET_VALUE_DIMENSION],
                name='label_placeholder')
            self.t_W_c = tf.Variable(tf.random_normal([HIDDEN_UNITS, 1]))
            self.t_B_c = tf.Variable(tf.random_normal([1], -0.003, 0.003))
            #lstms
            with tf.variable_scope('target_critic'):
                self.t_lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS
                )  #basiclstmcell modified to get access to cell weights
                self.t_lstm_layers = [self.t_lstm_cell] * N_LAYERS
                self.t_lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.t_lstm_layers, state_is_tuple=True)
                self.t_init_state = self.t_lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.t_lstm_outputs, self.t_final_state = tf.nn.dynamic_rnn(
                    self.t_lstm_cell,
                    self.t_c_input,
                    initial_state=self.t_init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.t_c_input))
            self.t_lstm_outputs_list = tf.transpose(self.t_lstm_outputs,
                                                    [1, 0, 2])
            self.t_lstm_outputs_list = tf.reshape(self.t_lstm_outputs_list,
                                                  [-1, HIDDEN_UNITS])
            self.t_lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                                self.t_lstm_outputs_list)
            #prediction(output) at each time step:(list of tensors)
            self.t_pred_t = [
                tf.matmul(self.t_lstm_output, self.t_W_c) + self.t_B_c
                for self.t_lstm_output in self.t_lstm_outputs_list
            ]
            self.t_pred_t = tf.pack(self.t_pred_t)
            self.t_pred_t = tf.transpose(self.t_pred_t, [1, 0, 2])
            #converting target value to list of tensors
            self.t_c_target_list = tf.transpose(self.t_c_target, [1, 0, 2])
            self.t_c_target_list = tf.reshape(self.t_c_target_list,
                                              [-1, TARGET_VALUE_DIMENSION])
            self.t_c_target_list = tf.split(0, self.MAX_STEP,
                                            self.t_c_target_list)
            print("Initialized Target Critic Network...")
            self.sess.run(tf.initialize_all_variables())

            #To initialize critic and target with the same values:
            # copy target parameters
            self.sess.run([
                self.t_lstm_layers[0].weights.assign(
                    self.lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(self.lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    self.lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(self.lstm_layers[1].bias),
                self.t_W_c.assign(self.W_c),
                self.t_B_c.assign(self.B_c)
            ])

            self.update_target_critic_op = [
                self.t_lstm_layers[0].weights.assign(
                    TAU * self.lstm_layers[0].weights +
                    (1 - TAU) * self.t_lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(
                    TAU * self.lstm_layers[0].bias +
                    (1 - TAU) * self.t_lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    TAU * self.lstm_layers[1].weights +
                    (1 - TAU) * self.t_lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(
                    TAU * self.lstm_layers[1].bias +
                    (1 - TAU) * self.t_lstm_layers[1].bias),
                self.t_W_c.assign(TAU * self.W_c + (1 - TAU) * self.t_W_c),
                self.t_B_c.assign(TAU * self.B_c + (1 - TAU) * self.t_B_c)
            ]
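
The loss built above is a squared TD error summed over time, divided by the number of non-padded steps summed over the whole batch (eff_time_step), and then averaged over the batch. A NumPy sketch of the same reduction with made-up shapes:

import numpy as np

BATCH_SIZE, MAX_STEP = 4, 10
pred = np.random.randn(BATCH_SIZE, MAX_STEP, 1)     # corresponds to self.predt_T
target = np.random.randn(BATCH_SIZE, MAX_STEP, 1)   # corresponds to self.c_target
eff_time_step = 28.0                                # e.g. 4 sequences of 7 non-padded steps

square_diff = (pred - target) ** 2
loss_t = square_diff.sum(axis=1) / eff_time_step    # sum over time, divide by effective steps
loss_n = loss_t.sum(axis=0) / BATCH_SIZE            # average over the batch
print(loss_n.shape)                                 # (1,)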
Code Example #9
class CriticNet:
    """ Critic Q value Neural Network model of the RDPG algorithm """
    def __init__(self, N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE):
        self.N_STATES = N_STATES
        self.N_ACTIONS = N_ACTIONS
        self.MAX_STEP = MAX_STEP
        self.BATCH_SIZE = BATCH_SIZE
        self.g = tf.Graph()
        with self.g.as_default():
            self.sess = tf.InteractiveSession()
            """
            Critic Q network:
            """
            self.c_input_state = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='inputstate_placeholder')
            self.c_input_action = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='inputaction_placeholder')
            self.c_input = tf.concat(2,
                                     [self.c_input_state, self.c_input_action])
            self.c_target = tf.placeholder(
                "float", [None, self.MAX_STEP, TARGET_VALUE_DIMENSION],
                name='label_placeholder')
            self.W_c = tf.Variable(tf.random_normal([HIDDEN_UNITS, 1]))
            self.B_c = tf.Variable(tf.random_normal([1], -0.003, 0.003))
            #lstms
            with tf.variable_scope('critic'):
                self.lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS
                )  #basiclstmcell modified to get access to cell weights
                self.lstm_layers = [self.lstm_cell] * N_LAYERS
                self.lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.lstm_layers, state_is_tuple=True)
                self.init_state = self.lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
                    self.lstm_cell,
                    self.c_input,
                    initial_state=self.init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.c_input))
            self.lstm_outputs_list = tf.transpose(self.lstm_outputs, [1, 0, 2])
            self.lstm_outputs_list = tf.reshape(self.lstm_outputs_list,
                                                [-1, HIDDEN_UNITS])
            self.lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                              self.lstm_outputs_list)
            #prediction(output) at each time step:(list of tensors)
            self.pred_t = [
                tf.matmul(self.lstm_output, self.W_c) + self.B_c
                for self.lstm_output in self.lstm_outputs_list
            ]
            #converting target value to list of tensors
            self.c_target_list = tf.transpose(self.c_target, [1, 0, 2])
            self.c_target_list = tf.reshape(self.c_target_list,
                                            [-1, TARGET_VALUE_DIMENSION])
            self.c_target_list = tf.split(0, self.MAX_STEP, self.c_target_list)
            #optimizer:

            self.predt_T = tf.pack(self.pred_t)
            #transposing it to shape BatchsizeXtimestepXdimension:
            self.predt_T = tf.transpose(self.predt_T, [1, 0, 2])

            self.square_diff = tf.pow((self.predt_T - self.c_target), 2)
            #no. of time_steps:
            self.eff_time_step = tf.reduce_sum(self.length(self.c_input))
            self.eff_time_step = tf.to_float(self.eff_time_step)
            #mean loss over time step:
            self.loss_t = tf.reduce_sum(
                self.square_diff, reduction_indices=1) / self.eff_time_step
            self.loss_n = tf.reduce_sum(self.loss_t,
                                        reduction_indices=0) / self.BATCH_SIZE

            #self.params = tf.trainable_variables()
            self.params = [
                self.lstm_layers[0].weights, self.lstm_layers[0].bias,
                self.lstm_layers[1].weights, self.lstm_layers[1].bias,
                self.W_c, self.B_c
            ]
            #self.gradient = tf.gradients(tf.pack(self.pred_t),self.params,tf.sub(self.pred_t,self.c_target_list)/(self.MAX_STEP*self.BATCH_SIZE))
            self.gradient = tf.gradients(self.loss_n, self.params)
            #self.gradient = tf.gradients(self.predt_T,self.params,(self.predt_T-self.c_target)/(self.MAX_STEP*self.BATCH_SIZE))
            self.critic_gradient = tf.gradients(self.predt_T,
                                                self.c_input_action)
            self.opt = tf.train.AdamOptimizer(LEARNING_RATE)
            self.optimizer = self.opt.apply_gradients(
                zip(self.gradient, self.params))
            print("Initialized Critic Network...")
            """
            Target critic Q network:
            """
            #critic_q_model_parameters:
            self.t_c_input_state = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='inputstate_placeholder')
            self.t_c_input_action = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='inputaction_placeholder')
            self.t_c_input = tf.concat(
                2, [self.t_c_input_state, self.t_c_input_action])
            self.t_c_target = tf.placeholder(
                "float", [None, self.MAX_STEP, TARGET_VALUE_DIMENSION],
                name='label_placeholder')
            self.t_W_c = tf.Variable(tf.random_normal([HIDDEN_UNITS, 1]))
            self.t_B_c = tf.Variable(tf.random_normal([1], -0.003, 0.003))
            #lstms
            with tf.variable_scope('target_critic'):
                self.t_lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS
                )  #basiclstmcell modified to get access to cell weights
                self.t_lstm_layers = [self.t_lstm_cell] * N_LAYERS
                self.t_lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.t_lstm_layers, state_is_tuple=True)
                self.t_init_state = self.t_lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.t_lstm_outputs, self.t_final_state = tf.nn.dynamic_rnn(
                    self.t_lstm_cell,
                    self.t_c_input,
                    initial_state=self.t_init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.t_c_input))
            self.t_lstm_outputs_list = tf.transpose(self.t_lstm_outputs,
                                                    [1, 0, 2])
            self.t_lstm_outputs_list = tf.reshape(self.t_lstm_outputs_list,
                                                  [-1, HIDDEN_UNITS])
            self.t_lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                                self.t_lstm_outputs_list)
            #prediction(output) at each time step:(list of tensors)
            self.t_pred_t = [
                tf.matmul(self.t_lstm_output, self.t_W_c) + self.t_B_c
                for self.t_lstm_output in self.t_lstm_outputs_list
            ]
            self.t_pred_t = tf.pack(self.t_pred_t)
            self.t_pred_t = tf.transpose(self.t_pred_t, [1, 0, 2])
            #converting target value to list of tensors
            self.t_c_target_list = tf.transpose(self.t_c_target, [1, 0, 2])
            self.t_c_target_list = tf.reshape(self.t_c_target_list,
                                              [-1, TARGET_VALUE_DIMENSION])
            self.t_c_target_list = tf.split(0, self.MAX_STEP,
                                            self.t_c_target_list)
            print("Initialized Target Critic Network...")
            self.sess.run(tf.initialize_all_variables())

            #To initialize critic and target with the same values:
            # copy target parameters
            self.sess.run([
                self.t_lstm_layers[0].weights.assign(
                    self.lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(self.lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    self.lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(self.lstm_layers[1].bias),
                self.t_W_c.assign(self.W_c),
                self.t_B_c.assign(self.B_c)
            ])

            self.update_target_critic_op = [
                self.t_lstm_layers[0].weights.assign(
                    TAU * self.lstm_layers[0].weights +
                    (1 - TAU) * self.t_lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(
                    TAU * self.lstm_layers[0].bias +
                    (1 - TAU) * self.t_lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    TAU * self.lstm_layers[1].weights +
                    (1 - TAU) * self.t_lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(
                    TAU * self.lstm_layers[1].bias +
                    (1 - TAU) * self.t_lstm_layers[1].bias),
                self.t_W_c.assign(TAU * self.W_c + (1 - TAU) * self.t_W_c),
                self.t_B_c.assign(TAU * self.B_c + (1 - TAU) * self.t_B_c)
            ]

    def length(self, data):
        used = tf.sign(tf.reduce_max(tf.abs(data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    def last_relevant(
        self, output, length
    ):  #method used while evaluating target net: where input is one or few time steps
        L_BATCH_SIZE = tf.shape(output)[0]  # local value; do not overwrite self.BATCH_SIZE
        max_length = int(output.get_shape()[1])
        out_size = int(output.get_shape()[2])
        index = tf.range(0, L_BATCH_SIZE) * max_length + (length - 1)
        flat = tf.reshape(output, [-1, out_size])
        relevant = tf.gather(flat, index)
        return relevant

    def train_critic(self, o_n_t, a_n_t, y_n_t):
        self.sess.run(self.optimizer,
                      feed_dict={
                          self.c_input_state: o_n_t,
                          self.c_input_action: a_n_t,
                          self.c_target: y_n_t
                      })

    def compute_critic_gradient(
            self, o_n_t, a_n_t):  #critic gradient with respect to action
        #check = np.array(self.sess.run(self.critic_gradient, feed_dict={self.c_input_state: o_n_t,self.c_input_action: a_n_t})[0])
        #print check.shape
        #raw_input('check shape')
        return self.sess.run(self.critic_gradient,
                             feed_dict={
                                 self.c_input_state: o_n_t,
                                 self.c_input_action: a_n_t
                             })[0]

    def evaluate_target_critic(self, o_n_t, a_n_t):
        return self.sess.run(self.t_pred_t,
                             feed_dict={
                                 self.t_c_input_state: o_n_t,
                                 self.t_c_input_action: a_n_t
                             })

    def update_target_critic(self):
        self.sess.run(self.update_target_critic_op)
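
A short usage sketch of the critic alongside the actor from the earlier examples (the constants and the array shapes are illustrative assumptions that follow the placeholder definitions above, with TARGET_VALUE_DIMENSION taken to be 1):

import numpy as np
# from critic_net import CriticNet   # assuming the class above lives in critic_net.py

N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE = 3, 1, 20, 32
critic = CriticNet(N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE)

o_n_t = np.random.randn(BATCH_SIZE, MAX_STEP, N_STATES).astype(np.float32)    # observations
a_n_t = np.random.randn(BATCH_SIZE, MAX_STEP, N_ACTIONS).astype(np.float32)   # actions
y_n_t = np.random.randn(BATCH_SIZE, MAX_STEP, 1).astype(np.float32)           # TD targets

critic.train_critic(o_n_t, a_n_t, y_n_t)                # one gradient step on the Q network
del_Q_a = critic.compute_critic_gradient(o_n_t, a_n_t)  # dQ/da, fed to ActorNet.train_actor
critic.update_target_critic()                           # soft update of the target critic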