def __init__(self, conf): """ Initialize hyper-parameters, set up optimizer and network layers common across Q and Policy/V nets. """ self.name = conf['name'] self.num_actions = conf['num_act'] self.arch = conf['args'].arch self.batch_size = conf['args'].batch_size self.optimizer_type = conf['args'].opt_type self.optimizer_mode = conf['args'].opt_mode self.clip_loss_delta = conf['args'].clip_loss_delta self.clip_norm = conf['args'].clip_norm self.clip_norm_type = conf['args'].clip_norm_type self.input_shape = conf['input_shape'] self.use_recurrent = conf['args'].alg_type.endswith('-lstm') with tf.name_scope(self.name): self.selected_action_ph = tf.placeholder( 'float32', [self.batch_size, self.num_actions], name='selected_action') if self.arch == 'FC': self.input_ph = tf.placeholder('float32', [self.batch_size] + self.input_shape + [4], name='input') self.w1, self.b1, self.o1 = layers.fc('fc1', layers.flatten( self.input_ph), 40, activation='relu') self.w2, self.b2, self.o2 = layers.fc('fc2', self.o1, 40, activation='relu') self.ox = self.o2 elif self.arch == 'ATARI-TRPO': self.input_ph = tf.placeholder('float32', [self.batch_size, 84, 84, 4], name='input') self.w1, self.b1, self.o1 = layers.conv2d( 'conv1', self.input_ph, 16, 4, 4, 2) self.w2, self.b2, self.o2 = layers.conv2d( 'conv2', self.o1, 16, 4, 16, 2) self.w3, self.b3, self.o3 = layers.fc('fc3', layers.flatten(self.o2), 20, activation='relu') self.ox = self.o3 elif self.arch == 'NIPS': self.input_ph = tf.placeholder('float32', [self.batch_size, 84, 84, 4], name='input') self.w1, self.b1, self.o1 = layers.conv2d( 'conv1', self.input_ph, 16, 8, 4, 4) self.w2, self.b2, self.o2 = layers.conv2d( 'conv2', self.o1, 32, 4, 16, 2) self.w3, self.b3, self.o3 = layers.fc('fc3', layers.flatten(self.o2), 256, activation='relu') self.ox = self.o3 elif self.arch == 'NATURE': self.input_ph = tf.placeholder('float32', [self.batch_size, 84, 84, 4], name='input') self.w1, self.b1, self.o1 = layers.conv2d( 'conv1', self.input_ph, 32, 8, 4, 4) self.w2, self.b2, self.o2 = layers.conv2d( 'conv2', self.o1, 64, 4, 32, 2) self.w3, self.b3, self.o3 = layers.conv2d( 'conv3', self.o2, 64, 3, 64, 1) self.w4, self.b4, self.o4 = layers.fc('fc4', layers.flatten(self.o3), 512, activation='relu') self.ox = self.o4 else: raise Exception('Invalid architecture `{}`'.format(self.arch)) if self.use_recurrent: layer_name = 'lstm_layer' self.hidden_state_size = 256 with tf.variable_scope(self.name + '/' + layer_name) as vs: self.lstm_cell = CustomBasicLSTMCell( self.hidden_state_size, forget_bias=1.0) self.step_size = tf.placeholder(tf.float32, [None], name='step_size') self.initial_lstm_state = tf.placeholder( tf.float32, [None, 2 * self.hidden_state_size], name='initital_state') batch_size = tf.shape(self.step_size)[0] ox_reshaped = tf.reshape( self.ox, [batch_size, -1, self.ox.get_shape().as_list()[-1]]) lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn( self.lstm_cell, ox_reshaped, initial_state=self.initial_lstm_state, sequence_length=self.step_size, time_major=False, scope=vs) self.ox = tf.reshape(lstm_outputs, [-1, 256], name='reshaped_lstm_outputs') # Get all LSTM trainable params self.lstm_trainable_variables = [ v for v in tf.trainable_variables() if v.name.startswith(vs.name) ]
def _build_policy_head(self, input_state):
    self.adv_actor_ph = tf.placeholder(
        "float", [self.batch_size], name='advantage')

    with tf.variable_scope(self.name + '/lstm_decoder') as vs:
        self.action_outputs = tf.placeholder(
            tf.float32, [self.batch_size, None, self.num_actions + 1],
            name='action_outputs')
        self.action_inputs = tf.placeholder(
            tf.float32, [self.batch_size, None, self.num_actions + 1],
            name='action_inputs')
        self.decoder_seq_lengths = tf.placeholder(
            tf.int32, [self.batch_size], name='decoder_seq_lengths')
        self.allowed_actions = tf.placeholder(
            tf.float32, [self.batch_size, None, self.num_actions + 1],
            name='allowed_actions')
        self.use_fixed_action = tf.placeholder(tf.bool, name='use_fixed_action')
        self.temperature = tf.placeholder(tf.float32, name='temperature')

        self.decoder_hidden_state_size = input_state.get_shape().as_list()[-1]
        self.decoder_lstm_cell = CustomBasicLSTMCell(
            self.decoder_hidden_state_size, forget_bias=1.0)
        self.decoder_initial_state = tf.placeholder(
            tf.float32, [self.batch_size, 2 * self.decoder_hidden_state_size],
            name='decoder_initial_state')

        self.network_state = tf.concat(
            axis=1,
            values=[
                tf.zeros_like(input_state),
                input_state
                # input_state, tf.zeros_like(input_state)
            ])

        self.W_actions = tf.get_variable(
            'W_actions',
            shape=[self.decoder_hidden_state_size, self.num_actions + 1],
            dtype='float32',
            initializer=tf.contrib.layers.xavier_initializer())
        self.b_actions = tf.get_variable(
            'b_actions',
            shape=[self.num_actions + 1],
            dtype='float32',
            initializer=tf.zeros_initializer())

        self.decoder_state, self.logits, self.actions = decoder(
            self.action_inputs,
            self.network_state,
            self.decoder_lstm_cell,
            self.decoder_seq_lengths,
            self.W_actions,
            self.b_actions,
            self.max_decoder_steps,
            vs,
            self.use_fixed_action,
            self.action_outputs,
            loop_function=loop_gumbel_softmax(self.temperature),
        )

        self.decoder_trainable_variables = [
            v for v in tf.trainable_variables() if v.name.startswith(vs.name)
        ]

    print 'Decoder out: s,l,a=', self.decoder_state.get_shape(), \
        self.logits.get_shape(), self.actions.get_shape()

    # mask softmax by allowed actions
    exp_logits = tf.exp(self.logits) * self.allowed_actions
    Z = tf.expand_dims(tf.reduce_sum(exp_logits, 2), 2)
    self.action_probs = exp_logits / Z
    log_action_probs = self.logits - tf.log(Z)

    sequence_probs = tf.reduce_prod(
        tf.reduce_sum(self.action_probs * self.action_outputs, 2), 1)
    log_sequence_probs = tf.reduce_sum(
        tf.reduce_sum(log_action_probs * self.action_outputs, 2), 1)

    # ∏ a_i * ∑ log a_i
    self.output_layer_entropy = -tf.reduce_sum(
        tf.stop_gradient(1 + log_sequence_probs) * log_sequence_probs)
    self.entropy = -tf.reduce_sum(log_sequence_probs)

    print 'sp, lsp:', sequence_probs.get_shape(), log_sequence_probs.get_shape()

    self.actor_advantage_term = tf.reduce_sum(
        log_sequence_probs[:self.max_local_steps] * self.adv_actor_ph)
    self.actor_entropy_term = self.beta * self.output_layer_entropy
    self.actor_objective = -(self.actor_advantage_term +
                             self.actor_entropy_term)

    return self.actor_objective
def __init__(self,
             action_size,
             thread_index,  # -1 for global
             device="/cpu:0"):
    GameACNetwork.__init__(self, action_size, device)

    with tf.device(self._device):
        self.W_conv1 = self._conv_weight_variable([8, 8, 4, 16])  # stride=4
        self.b_conv1 = self._conv_bias_variable([16], 8, 8, 4)

        self.W_conv2 = self._conv_weight_variable([4, 4, 16, 32])  # stride=2
        self.b_conv2 = self._conv_bias_variable([32], 4, 4, 16)

        self.W_fc1 = self._fc_weight_variable([2592, 256])
        self.b_fc1 = self._fc_bias_variable([256], 2592)

        # lstm
        self.lstm = CustomBasicLSTMCell(256)

        # weight for policy output layer
        self.W_fc2 = self._fc_weight_variable([256, action_size])
        self.b_fc2 = self._fc_bias_variable([action_size], 256)

        # weight for value output layer
        self.W_fc3 = self._fc_weight_variable([256, 1])
        self.b_fc3 = self._fc_bias_variable([1], 256)

        # state (input)
        self.s = tf.placeholder("float", [None, 84, 84, 4])

        h_conv1 = tf.nn.relu(
            self._conv2d(self.s, self.W_conv1, 4) + self.b_conv1)
        h_conv2 = tf.nn.relu(
            self._conv2d(h_conv1, self.W_conv2, 2) + self.b_conv2)

        h_conv2_flat = tf.reshape(h_conv2, [-1, 2592])
        h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, self.W_fc1) + self.b_fc1)
        # h_fc1 shape=(5,256)

        h_fc1_reshaped = tf.reshape(h_fc1, [1, -1, 256])
        # h_fc1_reshaped shape=(1,5,256)

        # placeholder for LSTM unrolling time step size.
        self.step_size = tf.placeholder(tf.float32, [1])

        self.initial_lstm_state = tf.placeholder(tf.float32,
                                                 [1, self.lstm.state_size])

        scope = "net_" + str(thread_index)

        # Unrolling the LSTM up to LOCAL_T_MAX time steps (= 5 time steps).
        # When an episode terminates, the unrolling length becomes less than
        # LOCAL_TIME_STEP; the actual length is supplied via the step_size
        # placeholder. When forward propagating, step_size is 1.
        # (time_major=False, so output shape is [batch_size, max_time, cell.output_size])
        lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
            self.lstm,
            h_fc1_reshaped,
            initial_state=self.initial_lstm_state,
            sequence_length=self.step_size,
            time_major=False,
            scope=scope)

        # lstm_outputs: (1,5,256) for back prop, (1,1,256) for forward prop.
        lstm_outputs = tf.reshape(lstm_outputs, [-1, 256])

        # policy (output)
        self.pi = tf.nn.softmax(
            tf.matmul(lstm_outputs, self.W_fc2) + self.b_fc2)

        # value (output)
        v_ = tf.matmul(lstm_outputs, self.W_fc3) + self.b_fc3
        self.v = tf.reshape(v_, [-1])

    self.reset_state()
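# --- Usage sketch (not part of the original source) ----------------------
# A rough illustration of a forward pass through the network above, under
# the assumption (common in A3C-LSTM implementations) that reset_state()
# keeps a numpy array of shape [1, lstm.state_size] in self.lstm_state_out.
# One call feeds a single frame stack with step_size = 1 and carries the
# returned LSTM state into the next call.
def run_policy_and_value(self, sess, s_t):
    pi_out, v_out, self.lstm_state_out = sess.run(
        [self.pi, self.v, self.lstm_state],
        feed_dict={
            self.s: [s_t],                               # batch of one
            self.initial_lstm_state: self.lstm_state_out,
            self.step_size: [1],
        })
    return pi_out[0], v_out[0]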
def __init__(self,
             action_size,
             thread_index,  # -1 for global
             device="/cpu:0"):
    GameACNetwork.__init__(self, action_size, device)
    print("Initializing LSTM Network ")

    with tf.device(self._device):
        self.W_conv1 = self._conv_weight_variable(
            [8, 1, len(FEATURES_LIST), 16])  # stride=4
        self.b_conv1 = self._conv_bias_variable([16], 8, 1, len(FEATURES_LIST))

        self.W_conv2 = self._conv_weight_variable([1, 1, 16, 32])  # stride=2
        self.b_conv2 = self._conv_bias_variable([32], 1, 1, 16)

        self.W_fc1 = self._fc_weight_variable([2592, 256])
        self.b_fc1 = self._fc_bias_variable([256], 2592)

        # lstm
        self.lstm = CustomBasicLSTMCell(256)  # 256 must be larger than SEQUENCE_LENGTH

        # weight for policy output layer
        self.W_fc2 = self._fc_weight_variable([256, action_size])
        self.b_fc2 = self._fc_bias_variable([action_size], 256)

        # weight for value output layer
        self.W_fc3 = self._fc_weight_variable([256, 1])
        self.b_fc3 = self._fc_bias_variable([1], 256)

        # state (input)
        # self.s = tf.placeholder("float", [None, 84, 84, 4])
        self.s = tf.placeholder(
            "float", [None, SEQUENCE_LENGTH, 1, len(FEATURES_LIST)])

        h_conv1 = tf.nn.relu(
            self._conv2d(self.s, self.W_conv1, 1) + self.b_conv1)
        h_conv2 = tf.nn.relu(
            self._conv2d(h_conv1, self.W_conv2, 2) + self.b_conv2)

        h_conv2_flat = tf.reshape(h_conv2, [-1, 2592])
        h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, self.W_fc1) + self.b_fc1)
        # h_fc1 shape=(5,256)
        ##h_fc1 = tf.Print(h_fc1, [h_fc1], message="NN This is h_fc1: ", summarize=40)

        h_fc1_reshaped = tf.reshape(h_fc1, [1, -1, 256])
        # h_fc1_reshaped shape=(1,5,256)

        self.step_size = tf.placeholder(tf.float32, [1])
        self.initial_lstm_state = tf.placeholder(tf.float32,
                                                 [1, self.lstm.state_size])

        scope = "net_" + str(thread_index)

        # time_major=False, so output shape is [batch_size, max_time, cell.output_size]
        lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
            self.lstm,
            h_fc1_reshaped,
            initial_state=self.initial_lstm_state,
            sequence_length=self.step_size,
            time_major=False,
            scope=scope)

        # lstm_outputs: (1,5,256), (1,1,256)
        lstm_outputs = tf.reshape(lstm_outputs, [-1, 256])

        # policy (output)
        self.pi = tf.nn.softmax(
            tf.matmul(lstm_outputs, self.W_fc2) + self.b_fc2)
        ##self.pi = tf.Print(self.pi, [self.pi], message="NN This is self.pi: ", summarize=40)

        # value (output)
        v_ = tf.matmul(lstm_outputs, self.W_fc3) + self.b_fc3
        ##v_ = tf.Print(v_, [v_], message="NN This is v_ ", summarize=40)
        self.v = tf.reshape(v_, [-1])
        ##self.v = tf.Print(self.v, [self.v], message="NN This is self.v: ", summarize=40)

        # Example log output when the tf.Print calls above are enabled:
        # I tensorflow/core/kernels/logging_ops.cc:79] NN This is self.v: [-0.036351625]
        # I tensorflow/core/kernels/logging_ops.cc:79] NN This is self.pi: [0.49193981 0.50806022]
        # I tensorflow/core/kernels/logging_ops.cc:79] NN This is self.v: [-0.03456594]

    self.reset_state()
    print("Initializing Network finished")
class ActorNet:
    """ Actor Neural Network model of the RDPG algorithm """

    def __init__(self, N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE):
        self.N_STATES = N_STATES
        self.N_ACTIONS = N_ACTIONS
        self.MAX_STEP = MAX_STEP
        self.BATCH_SIZE = BATCH_SIZE

        self.g = tf.Graph()
        with self.g.as_default():
            self.sess = tf.InteractiveSession()

            """ Actor network: """
            self.a_input_states = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='input_placeholder')
            self.a_grad_from_critic = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='input_placeholder')
            self.W_a = tf.Variable(
                tf.random_normal([HIDDEN_UNITS, self.N_ACTIONS]))
            self.B_a = tf.Variable(tf.random_normal([1], -0.003, 0.003))

            # lstms
            with tf.variable_scope('actor'):
                self.lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS)  # BasicLSTMCell modified to expose cell weights
                # NOTE: the same cell object is reused for every layer,
                # so all layers share one set of weights.
                self.lstm_layers = [self.lstm_cell] * N_LAYERS
                self.lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.lstm_layers, state_is_tuple=True)
                self.init_state = self.lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
                    self.lstm_cell,
                    self.a_input_states,
                    initial_state=self.init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.a_input_states))

            self.lstm_outputs_list = tf.transpose(self.lstm_outputs, [1, 0, 2])
            self.lstm_outputs_list = tf.reshape(self.lstm_outputs_list,
                                                [-1, HIDDEN_UNITS])
            self.lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                              self.lstm_outputs_list)

            # prediction (output) at each time step: (list of tensors)
            self.pred_t = [
                tf.matmul(lstm_output, self.W_a) + self.B_a
                for lstm_output in self.lstm_outputs_list
            ]
            self.pred_t_array = tf.pack(self.pred_t)
            self.pred_t_array = tf.transpose(
                self.pred_t_array,
                [1, 0, 2])  # to get shape batch_size x step x dimension

            """ last relevant action (while evaluating actor during testing) """
            self.last_lstm_output = self.last_relevant(
                self.lstm_outputs, self.length(self.a_input_states))
            self.action_last_state = tf.matmul(self.last_lstm_output,
                                               self.W_a) + self.B_a

            # optimizer:
            # self.params = tf.trainable_variables()
            self.params = [
                self.lstm_layers[0].weights, self.lstm_layers[0].bias,
                self.lstm_layers[1].weights, self.lstm_layers[1].bias,
                self.W_a, self.B_a
            ]
            self.a_grad_from_criticT = tf.transpose(self.a_grad_from_critic,
                                                    perm=[1, 0, 2])
            self.gradient = tf.gradients(
                tf.pack(self.pred_t), self.params,
                -self.a_grad_from_criticT /
                (self.MAX_STEP * BATCH_SIZE))  # negated because we maximize
            self.opt = tf.train.AdamOptimizer(LEARNING_RATE)
            self.optimizer = self.opt.apply_gradients(
                zip(self.gradient, self.params))
            print("Initialized Actor Network...")

            """ Target Actor network: """
            self.t_a_input_states = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='input_placeholder')
            self.t_a_grad_from_critic = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='input_placeholder')
            self.t_W_a = tf.Variable(
                tf.random_normal([HIDDEN_UNITS, self.N_ACTIONS]))
            self.t_B_a = tf.Variable(tf.random_normal([1], -0.003, 0.003))

            # lstms
            with tf.variable_scope('target_actor'):
                self.t_lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS)  # BasicLSTMCell modified to expose cell weights
                self.t_lstm_layers = [self.t_lstm_cell] * N_LAYERS
                self.t_lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.t_lstm_layers, state_is_tuple=True)
                self.t_init_state = self.t_lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.t_lstm_outputs, self.t_final_state = tf.nn.dynamic_rnn(
                    self.t_lstm_cell,
                    self.t_a_input_states,
                    initial_state=self.t_init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.t_a_input_states))

            self.t_lstm_outputs_list = tf.transpose(self.t_lstm_outputs,
                                                    [1, 0, 2])
            self.t_lstm_outputs_list = tf.reshape(self.t_lstm_outputs_list,
                                                  [-1, HIDDEN_UNITS])
            self.t_lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                                self.t_lstm_outputs_list)

            # prediction (output) at each time step: (list of tensors)
            self.t_pred_t = [
                tf.matmul(t_lstm_output, self.t_W_a) + self.t_B_a
                for t_lstm_output in self.t_lstm_outputs_list
            ]
            self.t_pred_t = tf.pack(self.t_pred_t)
            self.t_pred_t = tf.transpose(
                self.t_pred_t,
                [1, 0, 2])  # to get shape batch_size x step x dimension

            """ last relevant action (while evaluating actor during testing) """
            self.t_last_lstm_output = self.last_relevant(
                self.t_lstm_outputs, self.length(self.t_a_input_states))
            self.t_action_last_state = tf.matmul(self.t_last_lstm_output,
                                                 self.t_W_a) + self.t_B_a
            print("Initialized Target Actor Network...")

            self.sess.run(tf.initialize_all_variables())

            # To initialize actor and target with the same values:
            # copy target parameters
            self.sess.run([
                self.t_lstm_layers[0].weights.assign(
                    self.lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(self.lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    self.lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(self.lstm_layers[1].bias),
                self.t_W_a.assign(self.W_a),
                self.t_B_a.assign(self.B_a)
            ])

            self.update_target_actor_op = [
                self.t_lstm_layers[0].weights.assign(
                    TAU * self.lstm_layers[0].weights +
                    (1 - TAU) * self.t_lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(
                    TAU * self.lstm_layers[0].bias +
                    (1 - TAU) * self.t_lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    TAU * self.lstm_layers[1].weights +
                    (1 - TAU) * self.t_lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(
                    TAU * self.lstm_layers[1].bias +
                    (1 - TAU) * self.t_lstm_layers[1].bias),
                self.t_W_a.assign(TAU * self.W_a + (1 - TAU) * self.t_W_a),
                self.t_B_a.assign(TAU * self.B_a + (1 - TAU) * self.t_B_a)
            ]

    def length(self, data):
        used = tf.sign(tf.reduce_max(tf.abs(data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    def last_relevant(self, output, length):
        # used while evaluating the target net, where the input is one or a few time steps
        L_BATCH_SIZE = tf.shape(output)[0]
        max_length = int(output.get_shape()[1])
        out_size = int(output.get_shape()[2])
        index = tf.range(0, L_BATCH_SIZE) * max_length + (length - 1)
        flat = tf.reshape(output, [-1, out_size])
        relevant = tf.gather(flat, index)
        return relevant

    def train_actor(self, o_n_t, del_Q_a):
        self.sess.run(self.optimizer,
                      feed_dict={
                          self.a_input_states: o_n_t,
                          self.a_grad_from_critic: del_Q_a
                      })

    def evaluate_actor(self, o_n_t):
        return self.sess.run(self.action_last_state,
                             feed_dict={self.a_input_states: o_n_t})[0]

    def evaluate_actor_batch(self, o_n_t):
        return self.sess.run(self.pred_t_array,
                             feed_dict={self.a_input_states: o_n_t})

    def evaluate_target_actor(self, o_n_t):
        return self.sess.run(self.t_pred_t,
                             feed_dict={self.t_a_input_states: o_n_t})

    def update_target_actor(self):
        self.sess.run(self.update_target_actor_op)
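# --- Usage sketch (not part of the original source) ----------------------
# A toy driver for the ActorNet class above, assuming HIDDEN_UNITS,
# N_LAYERS, LEARNING_RATE and TAU are defined as module constants as the
# class expects. o_batch stands in for a padded batch of observation
# histories and dq_da for the critic gradient w.r.t. the action (same
# shape as the actor output); both are random placeholders here.
import numpy as np

N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE = 4, 2, 10, 32
actor = ActorNet(N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE)

o_batch = np.random.randn(BATCH_SIZE, MAX_STEP, N_STATES).astype(np.float32)
dq_da = np.random.randn(BATCH_SIZE, MAX_STEP, N_ACTIONS).astype(np.float32)

actor.train_actor(o_batch, dq_da)       # one ascent step along the critic gradient
actor.update_target_actor()             # soft update: t <- TAU * online + (1 - TAU) * t
a_last = actor.evaluate_actor(o_batch)  # action at the last relevant time step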
def _create_network(self):
    state_dim = self._state_dim
    state_chn = self._state_chn
    action_dim = self._action_dim

    with tf.device(self._device):
        # state input
        self.state_input = tf.placeholder(
            'float', [None, state_dim, state_dim, state_chn])

        # conv1
        self.W_conv1 = weight_variable([8, 8, state_chn, 16])
        self.b_conv1 = bias_variable([16])
        h_conv1 = tf.nn.relu(
            conv2d(self.state_input, self.W_conv1, 4) + self.b_conv1)

        # conv2
        self.W_conv2 = weight_variable([4, 4, 16, 32])
        self.b_conv2 = bias_variable([32])
        h_conv2 = tf.nn.relu(conv2d(h_conv1, self.W_conv2, 2) + self.b_conv2)
        h_conv2_out_size = np.prod(h_conv2.get_shape().as_list()[1:])
        print 'h_conv2_out_size', h_conv2_out_size
        h_conv2_flat = tf.reshape(h_conv2, [-1, h_conv2_out_size])

        # conv3
        # self.W_conv3 = weight_variable([3, 3, 32, 64])
        # self.b_conv3 = bias_variable([64])
        # h_conv3 = tf.nn.relu(conv2d(h_conv2, self.W_conv3, 1) + self.b_conv3)
        # h_conv3_out_size = np.prod(h_conv3.get_shape().as_list()[1:])
        # print 'h_conv3_out_size', h_conv3_out_size
        # h_conv3_flat = tf.reshape(h_conv3, [-1, h_conv3_out_size])

        # fc1
        self.W_fc1 = weight_variable([h_conv2_out_size, 256])
        self.b_fc1 = bias_variable([256])
        h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, self.W_fc1) + self.b_fc1)

        # reshape to fit lstm (1, 5, 256)
        h_fc1_reshaped = tf.reshape(h_fc1, [1, -1, 256])

        self.lstm = CustomBasicLSTMCell(256)
        self.step_size = tf.placeholder('float', [1])
        self.initial_lstm_state = tf.placeholder('float',
                                                 [1, self.lstm.state_size])

        scope = 'net_' + str(self._thread_index)
        # Unrolling LSTM up to LOCAL_T_MAX time steps. (= 5 time steps.)
        # (time_major = False, so output shape is [batch_size, max_time, cell.output_size])
        # refer: https://www.tensorflow.org/versions/r0.11/api_docs/python/nn.html#dynamic_rnn
        lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
            self.lstm,
            h_fc1_reshaped,
            initial_state=self.initial_lstm_state,
            sequence_length=self.step_size,
            time_major=False,
            scope=scope)
        print lstm_outputs.get_shape()
        lstm_outputs = tf.reshape(lstm_outputs, [-1, 256])

        # fc2: (pi) for policy output
        self.W_fc2 = weight_variable([256, action_dim])
        self.b_fc2 = bias_variable([action_dim])
        self.policy_output = tf.nn.softmax(
            tf.matmul(lstm_outputs, self.W_fc2) + self.b_fc2)

        # fc3: (v) for value output
        self.W_fc3 = weight_variable([256, 1])
        self.b_fc3 = bias_variable([1])
        v_ = tf.matmul(lstm_outputs, self.W_fc3) + self.b_fc3
        self.value_output = tf.reshape(v_, [-1])

    self.reset_lstm_state()
    return
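# --- Companion sketch (not part of the original source) ------------------
# _create_network() ends by calling reset_lstm_state(); a minimal version,
# assuming the class keeps the running LSTM state in self.lstm_state_out
# and that numpy is imported as np, would simply zero a [1, state_size]
# array (state_size = 2 * 256 for the CustomBasicLSTMCell above, cell and
# hidden parts concatenated):
def reset_lstm_state(self):
    self.lstm_state_out = np.zeros([1, self.lstm.state_size], dtype=np.float32)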
class CriticNet:
    """ Critic Q value Neural Network model of the RDPG algorithm """

    def __init__(self, N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE):
        self.N_STATES = N_STATES
        self.N_ACTIONS = N_ACTIONS
        self.MAX_STEP = MAX_STEP
        self.BATCH_SIZE = BATCH_SIZE

        self.g = tf.Graph()
        with self.g.as_default():
            self.sess = tf.InteractiveSession()

            """ Critic Q network: """
            self.c_input_state = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='inputstate_placeholder')
            self.c_input_action = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='inputaction_placeholder')
            self.c_input = tf.concat(2,
                                     [self.c_input_state, self.c_input_action])
            self.c_target = tf.placeholder(
                "float", [None, self.MAX_STEP, TARGET_VALUE_DIMENSION],
                name='label_placeholder')
            self.W_c = tf.Variable(tf.random_normal([HIDDEN_UNITS, 1]))
            self.B_c = tf.Variable(tf.random_normal([1], -0.003, 0.003))

            # lstms
            with tf.variable_scope('critic'):
                self.lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS)  # BasicLSTMCell modified to expose cell weights
                self.lstm_layers = [self.lstm_cell] * N_LAYERS
                self.lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.lstm_layers, state_is_tuple=True)
                self.init_state = self.lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
                    self.lstm_cell,
                    self.c_input,
                    initial_state=self.init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.c_input))

            self.lstm_outputs_list = tf.transpose(self.lstm_outputs, [1, 0, 2])
            self.lstm_outputs_list = tf.reshape(self.lstm_outputs_list,
                                                [-1, HIDDEN_UNITS])
            self.lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                              self.lstm_outputs_list)

            # prediction (output) at each time step: (list of tensors)
            self.pred_t = [
                tf.matmul(lstm_output, self.W_c) + self.B_c
                for lstm_output in self.lstm_outputs_list
            ]

            # converting target value to list of tensors
            self.c_target_list = tf.transpose(self.c_target, [1, 0, 2])
            self.c_target_list = tf.reshape(self.c_target_list,
                                            [-1, TARGET_VALUE_DIMENSION])
            self.c_target_list = tf.split(0, self.MAX_STEP, self.c_target_list)

            # optimizer:
            self.predt_T = tf.pack(self.pred_t)
            # transposing it to shape batch_size x time_step x dimension:
            self.predt_T = tf.transpose(self.predt_T, [1, 0, 2])
            self.square_diff = tf.pow((self.predt_T - self.c_target), 2)

            # no. of time_steps:
            self.eff_time_step = tf.reduce_sum(self.length(self.c_input))
            self.eff_time_step = tf.to_float(self.eff_time_step)

            # mean loss over time step:
            self.loss_t = tf.reduce_sum(
                self.square_diff, reduction_indices=1) / self.eff_time_step
            self.loss_n = tf.reduce_sum(self.loss_t,
                                        reduction_indices=0) / self.BATCH_SIZE

            # self.params = tf.trainable_variables()
            self.params = [
                self.lstm_layers[0].weights, self.lstm_layers[0].bias,
                self.lstm_layers[1].weights, self.lstm_layers[1].bias,
                self.W_c, self.B_c
            ]
            # self.gradient = tf.gradients(tf.pack(self.pred_t), self.params,
            #     tf.sub(self.pred_t, self.c_target_list) / (self.MAX_STEP * self.BATCH_SIZE))
            self.gradient = tf.gradients(self.loss_n, self.params)
            # self.gradient = tf.gradients(self.predt_T, self.params,
            #     (self.predt_T - self.c_target) / (self.MAX_STEP * self.BATCH_SIZE))
            self.critic_gradient = tf.gradients(self.predt_T,
                                                self.c_input_action)
            self.opt = tf.train.AdamOptimizer(LEARNING_RATE)
            self.optimizer = self.opt.apply_gradients(
                zip(self.gradient, self.params))
            print("Initialized Critic Network...")

            """ Target critic Q network: """
            # critic_q_model_parameters:
            self.t_c_input_state = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_STATES],
                name='inputstate_placeholder')
            self.t_c_input_action = tf.placeholder(
                "float", [None, self.MAX_STEP, self.N_ACTIONS],
                name='inputaction_placeholder')
            self.t_c_input = tf.concat(
                2, [self.t_c_input_state, self.t_c_input_action])
            self.t_c_target = tf.placeholder(
                "float", [None, self.MAX_STEP, TARGET_VALUE_DIMENSION],
                name='label_placeholder')
            self.t_W_c = tf.Variable(tf.random_normal([HIDDEN_UNITS, 1]))
            self.t_B_c = tf.Variable(tf.random_normal([1], -0.003, 0.003))

            # lstms
            with tf.variable_scope('target_critic'):
                self.t_lstm_cell = CustomBasicLSTMCell(
                    HIDDEN_UNITS)  # BasicLSTMCell modified to expose cell weights
                self.t_lstm_layers = [self.t_lstm_cell] * N_LAYERS
                self.t_lstm_cell = tf.nn.rnn_cell.MultiRNNCell(
                    self.t_lstm_layers, state_is_tuple=True)
                self.t_init_state = self.t_lstm_cell.zero_state(
                    self.BATCH_SIZE, tf.float32)
                self.t_lstm_outputs, self.t_final_state = tf.nn.dynamic_rnn(
                    self.t_lstm_cell,
                    self.t_c_input,
                    initial_state=self.t_init_state,
                    dtype=tf.float32,
                    sequence_length=self.length(self.t_c_input))

            self.t_lstm_outputs_list = tf.transpose(self.t_lstm_outputs,
                                                    [1, 0, 2])
            self.t_lstm_outputs_list = tf.reshape(self.t_lstm_outputs_list,
                                                  [-1, HIDDEN_UNITS])
            self.t_lstm_outputs_list = tf.split(0, self.MAX_STEP,
                                                self.t_lstm_outputs_list)

            # prediction (output) at each time step: (list of tensors)
            self.t_pred_t = [
                tf.matmul(t_lstm_output, self.t_W_c) + self.t_B_c
                for t_lstm_output in self.t_lstm_outputs_list
            ]
            self.t_pred_t = tf.pack(self.t_pred_t)
            self.t_pred_t = tf.transpose(self.t_pred_t, [1, 0, 2])

            # converting target value to list of tensors
            self.t_c_target_list = tf.transpose(self.t_c_target, [1, 0, 2])
            self.t_c_target_list = tf.reshape(self.t_c_target_list,
                                              [-1, TARGET_VALUE_DIMENSION])
            self.t_c_target_list = tf.split(0, self.MAX_STEP,
                                            self.t_c_target_list)
            print("Initialized Target Critic Network...")

            self.sess.run(tf.initialize_all_variables())

            # To initialize critic and target with the same values:
            # copy target parameters
            self.sess.run([
                self.t_lstm_layers[0].weights.assign(
                    self.lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(self.lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    self.lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(self.lstm_layers[1].bias),
                self.t_W_c.assign(self.W_c),
                self.t_B_c.assign(self.B_c)
            ])

            self.update_target_critic_op = [
                self.t_lstm_layers[0].weights.assign(
                    TAU * self.lstm_layers[0].weights +
                    (1 - TAU) * self.t_lstm_layers[0].weights),
                self.t_lstm_layers[0].bias.assign(
                    TAU * self.lstm_layers[0].bias +
                    (1 - TAU) * self.t_lstm_layers[0].bias),
                self.t_lstm_layers[1].weights.assign(
                    TAU * self.lstm_layers[1].weights +
                    (1 - TAU) * self.t_lstm_layers[1].weights),
                self.t_lstm_layers[1].bias.assign(
                    TAU * self.lstm_layers[1].bias +
                    (1 - TAU) * self.t_lstm_layers[1].bias),
                self.t_W_c.assign(TAU * self.W_c + (1 - TAU) * self.t_W_c),
                self.t_B_c.assign(TAU * self.B_c + (1 - TAU) * self.t_B_c)
            ]

    def length(self, data):
        used = tf.sign(tf.reduce_max(tf.abs(data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    def last_relevant(self, output, length):
        # used while evaluating the target net, where the input is one or a few time steps
        l_batch_size = tf.shape(output)[0]  # local, so self.BATCH_SIZE is not clobbered
        max_length = int(output.get_shape()[1])
        out_size = int(output.get_shape()[2])
        index = tf.range(0, l_batch_size) * max_length + (length - 1)
        flat = tf.reshape(output, [-1, out_size])
        relevant = tf.gather(flat, index)
        return relevant

    def train_critic(self, o_n_t, a_n_t, y_n_t):
        self.sess.run(self.optimizer,
                      feed_dict={
                          self.c_input_state: o_n_t,
                          self.c_input_action: a_n_t,
                          self.c_target: y_n_t
                      })

    def compute_critic_gradient(self, o_n_t, a_n_t):
        # critic gradient with respect to the action
        # check = np.array(self.sess.run(self.critic_gradient,
        #     feed_dict={self.c_input_state: o_n_t, self.c_input_action: a_n_t})[0])
        # print check.shape
        # raw_input('check shape')
        return self.sess.run(self.critic_gradient,
                             feed_dict={
                                 self.c_input_state: o_n_t,
                                 self.c_input_action: a_n_t
                             })[0]

    def evaluate_target_critic(self, o_n_t, a_n_t):
        return self.sess.run(self.t_pred_t,
                             feed_dict={
                                 self.t_c_input_state: o_n_t,
                                 self.t_c_input_action: a_n_t
                             })

    def update_target_critic(self):
        self.sess.run(self.update_target_critic_op)
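# --- Usage sketch (not part of the original source) ----------------------
# A toy driver for the CriticNet class above (HIDDEN_UNITS, N_LAYERS, TAU,
# LEARNING_RATE and TARGET_VALUE_DIMENSION assumed to be module constants,
# with TARGET_VALUE_DIMENSION = 1). GAMMA and the random reward/next-step
# arrays are illustrative only.
import numpy as np

GAMMA = 0.99
N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE = 4, 2, 10, 32
critic = CriticNet(N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE)

o_t = np.random.randn(BATCH_SIZE, MAX_STEP, N_STATES).astype(np.float32)
a_t = np.random.randn(BATCH_SIZE, MAX_STEP, N_ACTIONS).astype(np.float32)
r_t = np.random.randn(BATCH_SIZE, MAX_STEP, 1).astype(np.float32)
o_tp1 = np.random.randn(BATCH_SIZE, MAX_STEP, N_STATES).astype(np.float32)
a_tp1 = np.random.randn(BATCH_SIZE, MAX_STEP, N_ACTIONS).astype(np.float32)

# Bellman target from the target critic, then one mean-squared-error step.
q_tp1 = critic.evaluate_target_critic(o_tp1, a_tp1)   # [batch, step, 1]
y_t = r_t + GAMMA * q_tp1
critic.train_critic(o_t, a_t, y_t)
dq_da = critic.compute_critic_gradient(o_t, a_t)      # feeds the actor update
critic.update_target_critic()                         # soft target update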