def process_pc(self, batch):
    """
    Returns feed dictionary for `pixel control` loss estimation subgraph.
    """
    if not self.use_off_policy_aac:
        # Use single pass of network on same off-policy batch:
        feeder = {
            pl: value for pl, value in zip(
                self.local_network.pc_lstm_state_pl_flatten,
                flatten_nested(batch['context'])
            )
        }
        feeder.update({
            self.local_network.pc_state_in: batch['state'],
            self.local_network.pc_a_r_in: batch['last_action_reward'],
            self.pc_action: batch['action'],
            self.pc_target: batch['pixel_change']
        })
    else:
        feeder = {
            self.pc_action: batch['action'],
            self.pc_target: batch['pixel_change']
        }
    return feeder
def __init__(self, x_in, ob_space, ac_space, lstm_class, lstm_layers):
    # Flatten and expand with fake time dim to feed to LSTM bank:
    x = tf.expand_dims(batch_flatten(x_in), [0])

    # Train-phase flag placeholder (created only once per instance):
    if not hasattr(self, 'train_phase'):
        self.train_phase = tf.placeholder_with_default(
            tf.constant(False, dtype=tf.bool),
            shape=(),
            name='train_phase_flag_pl'
        )
    self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    # Define LSTM layers:
    lstm = []
    for size in lstm_layers:
        lstm += [lstm_class(size, state_is_tuple=True)]

    self.lstm = rnn.MultiRNNCell(lstm, state_is_tuple=True)

    # Get time_dimension as [1]-shaped tensor:
    step_size = tf.expand_dims(tf.shape(x)[1], [0])

    self.lstm_init_state = self.lstm.zero_state(1, dtype=tf.float32)

    lstm_state_pl = self.rnn_placeholders(self.lstm.zero_state(1, dtype=tf.float32))
    self.lstm_state_pl_flatten = flatten_nested(lstm_state_pl)

    lstm_outputs, self.lstm_state_out = tf.nn.dynamic_rnn(
        self.lstm,
        x,
        initial_state=lstm_state_pl,
        sequence_length=step_size,
        time_major=False
    )
    x = tf.reshape(lstm_outputs, [-1, lstm_layers[-1]])

    self.logits = self.linear(x, ac_space, "action", self.normalized_columns_initializer(0.01))
    self.vf = tf.reshape(self.linear(x, 1, "value", self.normalized_columns_initializer(1.0)), [-1])
    self.sample = self.categorical_sample(self.logits, ac_space)[0, :]
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

    # Add moving averages to save list (meant for Batch_norm layer):
    moving_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*moving.*')
    renorm_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*renorm.*')
    self.var_list += moving_var_list + renorm_var_list
def value(self, ob, lstm_state):
    sess = tf.get_default_session()
    feeder = {pl: value for pl, value in zip(self.lstm_state_pl_flatten, flatten_nested(lstm_state))}
    feeder.update({self.x: [ob], self.train_phase: False})
    return sess.run(self.vf, feeder)[0]
def act(self, ob, lstm_state):
    sess = tf.get_default_session()
    feeder = {pl: value for pl, value in zip(self.lstm_state_pl_flatten, flatten_nested(lstm_state))}
    feeder.update({self.x: [ob], self.train_phase: False})
    return sess.run([self.sample, self.vf, self.lstm_state_out], feeder)
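# Usage sketch (illustrative only, not part of the original code): a runner-style loop
# that threads the recurrent context returned by `act()` back into the next call.
# `env`, `policy` (an instance of the class above) and `sess` are assumed to exist;
# the sampled action is assumed to be a one-hot vector, hence `argmax()`.
def example_episode_rollout(env, policy, sess, max_steps=100):
    with sess.as_default():  # act()/value() rely on tf.get_default_session()
        ob = env.reset()
        lstm_state = sess.run(policy.lstm_init_state)  # zero LSTM context
        for _ in range(max_steps):
            action, value, lstm_state = policy.act(ob, lstm_state)
            ob, reward, done, info = env.step(action.argmax())
            if done:
                break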
def get_a3c_value(self, observation, lstm_state, action_reward):
    """Called by thread-runner."""
    sess = tf.get_default_session()
    feeder = {pl: value for pl, value in zip(self.a3c_lstm_state_pl_flatten, flatten_nested(lstm_state))}
    feeder.update(
        {self.a3c_state_in: [observation], self.a3c_a_r_in: [action_reward], self.train_phase: False}
    )
    return sess.run(self.a3c_vf, feeder)[0]
def process(self, sess):
    """
    Algorithm single training step.
    Grabs an on_policy_rollout that's been produced by the thread runner
    and updates the parameters. The update is then sent to the parameter server.
    """
    # Copy weights from shared to local new_policy:
    sess.run(self.sync)

    # Get and process rollout:
    on_policy_rollout = self.pull_batch_from_queue()
    on_policy_batch = on_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

    # Feeder for on-policy AAC loss estimation graph:
    feed_dict = {
        pl: value for pl, value in zip(
            self.local_network.on_lstm_state_pl_flatten,
            flatten_nested(on_policy_batch['context'])
        )
    }
    feed_dict.update({
        self.local_network.on_state_in: on_policy_batch['state'],
        self.local_network.on_a_r_in: on_policy_batch['last_action_reward'],
        self.on_pi_act_target: on_policy_batch['action'],
        self.on_pi_adv_target: on_policy_batch['advantage'],
        self.on_pi_r_target: on_policy_batch['r'],
        self.local_network.train_phase: True,
    })

    # Every worker writes model summaries:
    should_compute_summary = self.local_steps % self.model_summary_freq == 0

    if should_compute_summary:
        fetches = [self.train_op, self.model_summary_op, self.inc_step]
    else:
        fetches = [self.train_op, self.inc_step]

    fetched = sess.run(fetches, feed_dict=feed_dict)

    if should_compute_summary:
        self.summary_writer.add_summary(tf.Summary.FromString(fetched[-2]), fetched[-1])
        self.summary_writer.flush()

    self.local_steps += 1
def feed_dict_rnn_context(placeholders, values):
    """
    Creates tf.feed_dict for flat placeholders and nested values.

    Args:
        placeholders:   flat structure of placeholders
        values:         nested structure of values

    Returns:
        flat feed dictionary
    """
    return {key: value for key, value in zip(placeholders, flatten_nested(values))}
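# Minimal usage sketch (assumed names, not from the original code): the flat placeholder
# list created at graph-build time and the nested LSTM context fetched at run time are
# flattened in the same order, so zipping them yields a valid feed dict. This mirrors the
# dict comprehensions written inline throughout the trainer code above.
def example_rnn_context_feed(policy, sess, observation):
    context = sess.run(policy.lstm_init_state)  # nested tuple of LSTMStateTuple arrays
    feed_dict = feed_dict_rnn_context(policy.lstm_state_pl_flatten, context)
    feed_dict.update({policy.x: [observation], policy.train_phase: False})
    return sess.run(policy.vf, feed_dict)[0]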
def flat_placeholders(ob_space, batch_dim=None, name='flt'):
    """
    Given nested observation space as dictionary of shape tuples,
    returns flattened dictionary of batch-wise placeholders.

    Args:
        ob_space:   [nested dict] of tuples
        batch_dim:  batch dimension
        name:       name_scope

    Returns:
        flat dictionary of tf.placeholders
    """
    return flatten_nested(nested_placeholders(ob_space, batch_dim=batch_dim, name=name))
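# Illustrative sketch (the observation space and names are made up, not from the original
# code): a nested observation space given as a dict of shape tuples is turned into a flat
# collection of batch-wise placeholders, one per leaf shape.
def example_flat_placeholders():
    ob_space = {
        'raw_state': (30, 4, 1),
        'metadata': {'trial_num': (1,), 'timestamp': (1,)},
    }
    # Per the docstring above: a flat dictionary of tf.placeholders with a leading batch dim.
    return flat_placeholders(ob_space, batch_dim=None, name='observation')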
def process(self, sess):
    """
    Grabs a rollout that's been produced by the thread runner,
    and updates the parameters. The update is then sent to the parameter server.
    """
    sess.run(self.sync)  # copy weights from shared to local
    rollout = self.pull_batch_from_queue()
    batch = process_rollout(rollout, gamma=self.model_gamma, lambda_=self.model_lambda)

    # Every worker writes model summaries:
    should_compute_summary = self.local_steps % self.model_summary_freq == 0

    if should_compute_summary:
        fetches = [self.model_summary_op, self.train_op, self.global_step]
    else:
        fetches = [self.train_op, self.global_step]

    feed_dict = {
        pl: value for pl, value in zip(self.local_network.lstm_state_pl_flatten, flatten_nested(batch.features))
    }
    feed_dict.update({
        self.local_network.x: batch.si,
        self.ac: batch.a,
        self.adv: batch.adv,
        self.r: batch.r,
        self.local_network.train_phase: True,
    })

    fetched = sess.run(fetches, feed_dict=feed_dict)

    if should_compute_summary:
        self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]), fetched[-1])
        self.summary_writer.flush()

    self.local_steps += 1
def lstm_network(x, lstm_sequence_length, lstm_class=rnn.BasicLSTMCell, lstm_layers=(256,), name='lstm', reuse=False, **kwargs):
    """
    Stage2 network: from features to flattened LSTM output.
    Defines [multi-layered] dynamic [possibly shared] LSTM network.

    Returns:
        batch-wise flattened output tensor;
        lstm initial state tensor;
        lstm state output tensor;
        lstm flattened feed placeholders as tuple.
    """
    with tf.variable_scope(name, reuse=reuse):
        # Define LSTM layers:
        lstm = []
        for size in lstm_layers:
            lstm += [lstm_class(size, state_is_tuple=True)]

        lstm = rnn.MultiRNNCell(lstm, state_is_tuple=True)

        lstm_init_state = lstm.zero_state(1, dtype=tf.float32)

        lstm_state_pl = rnn_placeholders(lstm.zero_state(1, dtype=tf.float32))
        lstm_state_pl_flatten = flatten_nested(lstm_state_pl)

        lstm_outputs, lstm_state_out = tf.nn.dynamic_rnn(
            lstm,
            x,
            initial_state=lstm_state_pl,
            sequence_length=lstm_sequence_length,
            time_major=False
        )
        x_out = lstm_outputs

    return x_out, lstm_init_state, lstm_state_out, lstm_state_pl_flatten
def process_pc(self, batch):
    """
    Returns feed dictionary for `pixel control` loss estimation subgraph.
    """
    if not self.use_off_policy_a3c:
        # Use single pass of network on same off-policy batch:
        feeder = {
            pl: value for pl, value in zip(
                self.local_network.pc_lstm_state_pl_flatten,
                flatten_nested(batch.features)
            )
        }
        feeder.update({
            self.local_network.pc_state_in: batch.si,
            self.local_network.pc_a_r_in: batch.last_ar,
            self.pc_action: batch.a,
            self.pc_target: batch.pc
        })
    else:
        feeder = {self.pc_action: batch.a, self.pc_target: batch.pc}
    return feeder
def process_vr(self, batch):
    """
    Returns feed dictionary for `value replay` loss estimation subgraph.
    """
    if not self.use_off_policy_aac:
        # Use single pass of network on same off-policy batch, passing in lstm context:
        feeder = {
            pl: value for pl, value in zip(
                self.local_network.vr_lstm_state_pl_flatten,
                flatten_nested(batch['context'])
            )
        }
        feeder.update({
            self.local_network.vr_state_in: batch['state'],
            self.local_network.vr_a_r_in: batch['last_action_reward'],
            self.vr_target: batch['r'],
        })
    else:
        feeder = {self.vr_target: batch['r']}  # redundant actually :)
    return feeder
def process_vr(self, batch):
    """
    Returns feed dictionary for `value replay` loss estimation subgraph.
    """
    if not self.use_off_policy_a3c:
        # Use single pass of network on same off-policy batch, passing in lstm context.
        # Note: action and advantage are not needed for value fn. estimation.
        feeder = {
            pl: value for pl, value in zip(
                self.local_network.vr_lstm_state_pl_flatten,
                flatten_nested(batch.features)
            )
        }
        feeder.update({
            self.local_network.vr_state_in: batch.si,
            self.local_network.vr_a_r_in: batch.last_ar,
            self.vr_target: batch.r,
        })
    else:
        feeder = {self.vr_target: batch.r}  # redundant actually :)
    return feeder
def lstm_network(
        x,
        lstm_sequence_length,
        lstm_class=rnn.BasicLSTMCell,
        lstm_layers=(256,),
        static=False,
        name='lstm',
        reuse=False,
        **kwargs):
    """
    Stage2 network: from features to flattened LSTM output.
    Defines [multi-layered] dynamic or static [possibly shared] LSTM network.

    Returns:
        batch-wise flattened output tensor;
        lstm initial state tensor;
        lstm state output tensor;
        lstm flattened feed placeholders as tuple.
    """
    with tf.variable_scope(name, reuse=reuse):
        # Prepare rnn type:
        if static:
            rnn_net = tf.nn.static_rnn
            # Remove time dimension (suppose always get one) and wrap to list:
            x = [x[:, 0, :]]
        else:
            rnn_net = tf.nn.dynamic_rnn

        # Define LSTM layers:
        lstm = []
        for size in lstm_layers:
            lstm += [lstm_class(size)]

        lstm = rnn.MultiRNNCell(lstm, state_is_tuple=True)

        lstm_init_state = lstm.zero_state(1, dtype=tf.float32)

        lstm_state_pl = rnn_placeholders(lstm.zero_state(1, dtype=tf.float32))
        lstm_state_pl_flatten = flatten_nested(lstm_state_pl)

        lstm_outputs, lstm_state_out = rnn_net(
            cell=lstm,
            inputs=x,
            initial_state=lstm_state_pl,
            sequence_length=lstm_sequence_length,
        )
        # Unwrap and expand:
        if static:
            x_out = lstm_outputs[0][:, None, :]
        else:
            x_out = lstm_outputs
        state_out = lstm_state_out

    return x_out, lstm_init_state, state_out, lstm_state_pl_flatten
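# Wiring sketch (illustrative; tensor shapes and names are assumptions, not from the
# original code): features shaped [1, time, n_features] are passed through `lstm_network`,
# and at run time the flattened state placeholders are paired with a flattened zero
# (or carried-over) context.
def example_lstm_network_wiring(features, sess):
    sequence_length = tf.expand_dims(tf.shape(features)[1], [0])  # [1]-shaped time dim
    x_out, init_state, state_out, state_pl_flatten = lstm_network(
        features,
        lstm_sequence_length=sequence_length,
        lstm_layers=(256,),
        name='lstm_example',
    )
    # Run-time feed: flatten the nested context in the same order as the placeholders.
    context = sess.run(init_state)
    feed_dict = {pl: value for pl, value in zip(state_pl_flatten, flatten_nested(context))}
    return x_out, state_out, feed_dict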
def process(self, sess):
    """
    Grabs an on_policy_rollout that's been produced by the thread runner,
    samples off_policy rollout[s] from replay memory and updates the parameters.
    The update is then sent to the parameter server.
    """
    sess.run(self.sync)  # copy weights from shared to local

    # Get and process on_policy_rollout for A3C train step:
    on_policy_rollout = self.pull_batch_from_queue()
    on_policy_batch = on_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

    # Feeder for on-policy A3C loss estimation graph, passing lstm context:
    feed_dict = {
        pl: value for pl, value in zip(
            self.local_network.a3c_lstm_state_pl_flatten,
            flatten_nested(on_policy_batch.features)
        )
    }
    feed_dict.update({
        self.local_network.a3c_state_in: on_policy_batch.si,
        self.local_network.a3c_a_r_in: on_policy_batch.last_ar,
        self.a3c_act_target: on_policy_batch.a,
        self.a3c_adv_target: on_policy_batch.adv,
        self.a3c_r_target: on_policy_batch.r,
        self.local_network.train_phase: True,
    })

    if self.use_off_policy_a3c or self.use_pixel_control or self.use_value_replay:
        # Get sample from replay memory:
        if self.use_rebalanced_replay:
            off_policy_sample = self.memory.sample_priority(
                self.replay_rollout_length,
                skewness=self.rebalance_skewness,
                exact_size=False
            )
        else:
            off_policy_sample = self.memory.sample_uniform(self.replay_rollout_length)

        off_policy_rollout = Rollout()
        off_policy_rollout.add_memory_sample(off_policy_sample)
        off_policy_batch = off_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

        # Feeder for off-policy A3C loss estimation graph:
        off_policy_feeder = {
            pl: value for pl, value in zip(
                self.local_network.off_a3c_lstm_state_pl_flatten,
                flatten_nested(off_policy_batch.features)
            )
        }
        off_policy_feeder.update({
            self.local_network.off_a3c_state_in: off_policy_batch.si,
            self.local_network.off_a3c_a_r_in: off_policy_batch.last_ar,
            self.off_policy_act_target: off_policy_batch.a,
            self.off_policy_adv_target: off_policy_batch.adv,
            self.off_policy_r_target: off_policy_batch.r,
        })
        feed_dict.update(off_policy_feeder)

    # Update with reward prediction subgraph:
    if self.use_reward_prediction:
        # Rebalanced 50/50 sample for RP:
        rp_sample = self.memory.sample_priority(self.rp_sequence_size, skewness=2, exact_size=True)
        feed_dict.update(self.process_rp(rp_sample))

    # Pixel control...
    if self.use_pixel_control:
        feed_dict.update(self.process_pc(off_policy_batch))

    # VR...
    if self.use_value_replay:
        feed_dict.update(self.process_vr(off_policy_batch))

    if self.use_memory:
        # Save on_policy_rollout to replay memory:
        self.memory.add_rollout(on_policy_rollout)

    # Every worker writes model summaries:
    should_compute_summary = self.local_steps % self.model_summary_freq == 0

    if should_compute_summary:
        fetches = [self.model_summary_op, self.train_op, self.global_step]
    else:
        fetches = [self.train_op, self.global_step]

    # And finally...
    fetched = sess.run(fetches, feed_dict=feed_dict)

    if should_compute_summary:
        self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]), fetched[-1])
        self.summary_writer.flush()

    self.local_steps += 1
def process(self, sess):
    """
    Grabs an on_policy_rollout that's been produced by the thread runner,
    samples off_policy rollout[s] from replay memory and updates the parameters.
    The update is then sent to the parameter server.
    """
    # Copy weights from shared to local new_policy:
    sess.run(self.sync)

    # Get and process rollout for on-policy train step:
    on_policy_rollout = self.pull_batch_from_queue()
    on_policy_batch = on_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

    # Feeder for on-policy AAC loss estimation graph:
    feed_dict = {
        pl: value for pl, value in zip(
            self.local_network.on_lstm_state_pl_flatten,
            flatten_nested(on_policy_batch['context'])
        )
    }
    feed_dict.update({
        self.local_network.on_state_in: on_policy_batch['state'],
        self.local_network.on_a_r_in: on_policy_batch['last_action_reward'],
        self.on_pi_act_target: on_policy_batch['action'],
        self.on_pi_adv_target: on_policy_batch['advantage'],
        self.on_pi_r_target: on_policy_batch['r'],
        self.local_network.train_phase: True,
    })

    if self.use_off_policy_aac or self.use_pixel_control or self.use_value_replay:
        # Get sample from replay memory:
        if self.use_rebalanced_replay:
            off_policy_sample = self.memory.sample_priority(
                self.replay_rollout_length,
                skewness=self.rebalance_skewness,
                exact_size=False
            )
        else:
            off_policy_sample = self.memory.sample_uniform(self.replay_rollout_length)

        off_policy_rollout = Rollout()
        off_policy_rollout.add_memory_sample(off_policy_sample)
        off_policy_batch = off_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

        # Feeder for off-policy AAC loss estimation graph:
        off_policy_feeder = {
            pl: value for pl, value in zip(
                self.local_network.off_lstm_state_pl_flatten,
                flatten_nested(off_policy_batch['context'])
            )
        }
        off_policy_feeder.update({
            self.local_network.off_state_in: off_policy_batch['state'],
            self.local_network.off_a_r_in: off_policy_batch['last_action_reward'],
            self.off_pi_act_target: off_policy_batch['action'],
            self.off_pi_adv_target: off_policy_batch['advantage'],
            self.off_pi_r_target: off_policy_batch['r'],
        })
        feed_dict.update(off_policy_feeder)

    # Update with reward prediction subgraph:
    if self.use_reward_prediction:
        # Rebalanced 50/50 sample for RP:
        rp_sample = self.memory.sample_priority(self.rp_sequence_size, skewness=2, exact_size=True)
        feed_dict.update(self.process_rp(rp_sample))

    # Pixel control...
    if self.use_pixel_control:
        feed_dict.update(self.process_pc(off_policy_batch))

    # VR...
    if self.use_value_replay:
        feed_dict.update(self.process_vr(off_policy_batch))

    if self.use_memory:
        # Save on_policy_rollout to replay memory:
        self.memory.add_rollout(on_policy_rollout)

    # Every worker writes model summaries:
    should_compute_summary = self.local_steps % self.model_summary_freq == 0

    if should_compute_summary:
        fetches = [self.train_op, self.model_summary_op, self.inc_step]
    else:
        fetches = [self.train_op, self.inc_step]

    fetched = sess.run(fetches, feed_dict=feed_dict)

    if should_compute_summary:
        self.summary_writer.add_summary(tf.Summary.FromString(fetched[-2]), fetched[-1])
        self.summary_writer.flush()

    self.local_steps += 1