def act(self, observation, lstm_state, last_action, last_reward):
    """
    Predicts action.

    Args:
        observation:    dictionary containing a single observation
        lstm_state:     LSTM context value
        last_action:    action value from previous step
        last_reward:    reward value from previous step

    Returns:
        Action as dictionary of several action encodings, action logits,
        V-fn value, output RNN state
    """
    sess = tf.get_default_session()
    feeder = {
        pl: value for pl, value in
        zip(self.on_lstm_state_pl_flatten, flatten_nested(lstm_state))
    }
    feeder.update(
        feed_dict_from_nested(self.on_state_in, observation, expand_batch=True)
    )
    feeder.update(
        {
            self.on_last_a_in: last_action,
            self.on_last_reward_in: last_reward,
            self.on_batch_size: 1,
            self.on_time_length: 1,
            self.train_phase: False
        }
    )
    logits, value, context = sess.run(
        [self.on_logits, self.on_vf, self.on_lstm_state_out],
        feeder
    )
    logits = logits[0, ...]

    if self.ac_space.is_discrete:
        # Sample from categorical distribution (discrete actions):
        sample = np.random.multinomial(1, softmax(logits))
        sample = self.ac_space._cat_to_vec(np.argmax(sample))

    else:
        # Sample via Dirichlet process (continuous actions):
        sample = sample_dp(logits, alpha=self.action_dp_alpha)

    # Get all needed action encodings:
    action = self.ac_space._vec_to_action(sample)
    one_hot = self.ac_space._vec_to_one_hot(sample)

    action_pack = {
        'environment': action,
        'encoded': self.ac_space.encode(action),
        'one_hot': one_hot,
    }
    return action_pack, logits, value, context

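# The sketch below is illustrative only and not part of the original module: it shows
# how `act()` is expected to be driven from a gym-style stepping loop. It assumes
# `policy` exposes `get_initial_features()` and `act()` as defined above, that `env`
# follows the `reset()`/`step()` protocol used by the runner further down, that the
# action space is discrete (so `env.action_space.n` exists), and that the 'environment'
# entry of the returned `action_pack` is the encoding the environment accepts
# (a hedged assumption based on the key name).

def _act_usage_sketch(policy, env, num_steps=10):
    """Illustrative sketch: drive `act()` for a few environment steps."""
    observation = env.reset()
    lstm_state = policy.get_initial_features(state=observation)
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1  # start from the first (e.g. 'hold') action as one-hot
    last_reward = 0.0
    for _ in range(num_steps):
        action_pack, logits, value, lstm_state = policy.act(
            observation, lstm_state, last_action, last_reward
        )
        observation, last_reward, terminal, info = env.step(action_pack['environment'])
        last_action = action_pack['one_hot']
        if terminal:
            break
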
def get_ep_render(self, is_test=False):
    """
    Collects episode, environment and policy visualisations. Relies on environment
    renderer class methods, so it is only valid when environment rendering is enabled
    (typically true for the master runner).

    Returns:
        dictionary of images as RGB arrays
    """
    # Only render on chief worker and at the configured frequency:
    if self.task < 1 and self.local_episode % self.env_render_freq == 0:
        # Render environment (chief worker only):
        render_stat = {
            mode: self.env.render(mode)[None, :] for mode in self.env.render_modes
        }
        # Update renderings with auxiliary plots.
        # Unpack LSTM states accumulated over the episode:
        rnn_1, rnn_2 = zip(*self.ep_accum['context'])
        rnn_1 = [state[0] for state in rnn_1]
        rnn_2 = [state[0] for state in rnn_2]
        c1, h1 = zip(*rnn_1)
        c2, h2 = zip(*rnn_2)

        # Render everything implemented:
        implemented_aux_images = {
            'action_prob': self.env.renderer.draw_plot(
                data=softmax(np.asarray(self.ep_accum['logits'])),
                title='Episode actions probabilities',
                figsize=(12, 4),
                box_text='',
                xlabel='Backward env. steps',
                ylabel='R+',
                line_labels=['Hold', 'Buy', 'Sell', 'Close']
            )[None, ...],
            'value_fn': self.env.renderer.draw_plot(
                data=np.asarray(self.ep_accum['value']),
                title='Episode Value function',
                figsize=(12, 4),
                xlabel='Backward env. steps',
                ylabel='R',
                line_labels=['Value']
            )[None, ...],
            'lstm_1_h': self.norm_image(np.asarray(h1).T[None, :, 0, :, None]),
            'lstm_2_h': self.norm_image(np.asarray(h2).T[None, :, 0, :, None]),
        }
        # Pick only the summaries that have been requested:
        aux_images = {
            summary: implemented_aux_images[summary] for summary in self.aux_render_modes
        }
        render_stat.update(aux_images)

    else:
        render_stat = None

    return render_stat

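# Illustrative only (not part of the original class): `get_ep_render()` expects
# `self.ep_accum` to hold per-step 'logits', 'value' and 'context' lists collected
# while the episode was running, e.g. from successive `act()` calls (the runner below
# follows the same pattern with its `ep_a_logits` / `ep_value` / `ep_context` lists).
# A minimal accumulation step, assuming the attributes used above, could look like:

def _accumulate_episode_step_sketch(self, observation, lstm_state, last_action, last_reward):
    """Illustrative sketch: append one step of `act()` outputs to `self.ep_accum`."""
    action_pack, logits, value, context = self.act(observation, lstm_state, last_action, last_reward)
    self.ep_accum['logits'].append(logits)
    self.ep_accum['value'].append(value)
    self.ep_accum['context'].append(context)
    return action_pack, context
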
def VerboseEnvRunnerFn(
        sess,
        env,
        policy,
        task,
        rollout_length,
        summary_writer,
        episode_summary_freq,
        env_render_freq,
        atari_test,
        ep_summary,
        memory_config,
        log,
        aux_summaries=('action_prob', 'value_fn', 'lstm_1_h', 'lstm_2_h'),
):
    """
    More verbose function implementing the runtime logic of the thread runner.
    Extends per-episode summaries with visualisation of: action probability
    distributions, value function and hidden LSTM state.
    In its default configuration it is supposed to be used with the stacked_LSTM
    architecture.

    Args:
        env:                    environment instance
        policy:                 policy instance
        task:                   int
        rollout_length:         int
        episode_summary_freq:   int
        env_render_freq:        int
        atari_test:             bool, Atari or BTGym
        ep_summary:             dict of tf.summary op and placeholders
        memory_config:          replay memory configuration dictionary
        log:                    logbook logger
        aux_summaries:          list of str, additional summaries to compute

    Yields:
        collected data as dictionary of on_policy, off_policy rollouts,
        episode statistics and summaries.
    """
    if memory_config is not None:
        memory = memory_config['class_ref'](**memory_config['kwargs'])

    else:
        memory = _DummyMemory()

    # Pass sample config to environment:
    last_state = env.reset(**policy.get_sample_config())
    last_context = policy.get_initial_features(state=last_state)
    length = 0
    local_episode = 0
    reward_sum = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

    # Summary averages accumulators:
    total_r = []
    cpu_time = []
    final_value = []
    total_steps = []
    total_steps_atari = []

    # Aux accumulators:
    ep_a_logits = []
    ep_value = []
    ep_context = []

    ep_stat = None
    test_ep_stat = None
    render_stat = None

    norm_image = lambda x: np.round((x - x.min()) / np.ptp(x) * 255)

    if env.data_master is True:
        # Hacky, but we need env.renderer methods ready:
        env.renderer.initialize_pyplot()

    while True:
        terminal_end = False
        rollout = Rollout()

        action, logits, value_, context = policy.act(last_state, last_context, last_action_reward)
        ep_a_logits.append(logits)
        ep_value.append(value_)
        ep_context.append(context)

        # Argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())

        # Partially collect first experience of rollout:
        last_experience = {
            'position': {'episode': local_episode, 'step': length},
            'state': last_state,
            'action': action,
            'reward': reward,
            'value': value_,
            'terminal': terminal,
            'context': last_context,
            'last_action_reward': last_action_reward,
        }
        # Execute user-defined callbacks to policy, if any:
        for key, callback in policy.callback.items():
            last_experience[key] = callback(**locals())

        length += 1
        reward_sum += reward
        last_state = state
        last_context = context
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, rollout_length):
            if not terminal:
                # Continue adding experiences to rollout:
                action, logits, value_, context = policy.act(last_state, last_context, last_action_reward)
                ep_a_logits.append(logits)
                ep_value.append(value_)
                ep_context.append(context)

                # Argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())

                # Partially collect next experience:
                experience = {
                    'position': {'episode': local_episode, 'step': length},
                    'state': last_state,
                    'action': action,
                    'reward': reward,
                    'value': value_,
                    'terminal': terminal,
                    'context': last_context,
                    'last_action_reward': last_action_reward,
                }
                for key, callback in policy.callback.items():
                    experience[key] = callback(**locals())

                # Bootstrap to complete and push previous experience:
                last_experience['r'] = value_
                rollout.add(last_experience)
                memory.add(last_experience)

                # Housekeeping:
                length += 1
                reward_sum += reward
                last_state = state
                last_context = context
                last_action = action
                last_reward = reward
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)
                last_experience = experience

            if terminal:
                # Finished episode within last taken step:
                terminal_end = True
                # All environment-specific summaries live here because only the runner
                # is allowed to interact with the environment.

                # Accumulate values for averaging:
                total_r += [reward_sum]
                total_steps_atari += [length]
                if not atari_test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[-1]  # pull most recent info
                    cpu_time += [episode_stat['runtime'].total_seconds()]
                    final_value += [last_i['broker_value']]
                    total_steps += [episode_stat['length']]

                # Episode statistics:
                try:
                    # Was it a test episode (`type` in metadata is not zero)?
                    if not atari_test and state['metadata']['type']:
                        is_test_episode = True

                    else:
                        is_test_episode = False

                except KeyError:
                    is_test_episode = False

                if is_test_episode:
                    test_ep_stat = dict(
                        total_r=total_r[-1],
                        final_value=final_value[-1],
                        steps=total_steps[-1]
                    )
                else:
                    if local_episode % episode_summary_freq == 0:
                        if not atari_test:
                            # BTgym:
                            ep_stat = dict(
                                total_r=np.average(total_r),
                                cpu_time=np.average(cpu_time),
                                final_value=np.average(final_value),
                                steps=np.average(total_steps)
                            )
                        else:
                            # Atari:
                            ep_stat = dict(
                                total_r=np.average(total_r),
                                steps=np.average(total_steps_atari)
                            )
                        total_r = []
                        cpu_time = []
                        final_value = []
                        total_steps = []
                        total_steps_atari = []

                if task == 0 and local_episode % env_render_freq == 0:
                    if not atari_test:
                        # Render environment (chief worker only, not in atari_test mode):
                        render_stat = {
                            mode: env.render(mode)[None, :] for mode in env.render_modes
                        }
                        # Update renderings with aux summaries.
                        # Unpack LSTM states:
                        rnn_1, rnn_2 = zip(*ep_context)
                        rnn_1 = [state[0] for state in rnn_1]
                        rnn_2 = [state[0] for state in rnn_2]
                        c1, h1 = zip(*rnn_1)
                        c2, h2 = zip(*rnn_2)

                        aux_images = {
                            'action_prob': env.renderer.draw_plot(
                                data=softmax(np.asarray(ep_a_logits)[:, 0, :]),
                                title='Episode actions probabilities',
                                figsize=(12, 4),
                                box_text='',
                                xlabel='Backward env. steps',
                                ylabel='R+',
                                line_labels=['Hold', 'Buy', 'Sell', 'Close']
                            )[None, ...],
                            'value_fn': env.renderer.draw_plot(
                                data=np.asarray(ep_value),
                                title='Episode Value function',
                                figsize=(12, 4),
                                xlabel='Backward env. steps',
                                ylabel='R',
                                line_labels=['Value']
                            )[None, ...],
                            'lstm_1_h': norm_image(np.asarray(h1).T[None, :, 0, :, None]),
                            'lstm_2_h': norm_image(np.asarray(h2).T[None, :, 0, :, None]),
                        }
                        render_stat.update(aux_images)

                    else:
                        # Atari:
                        render_stat = dict(render_atari=state['external'][None, :] * 255)

                # New episode:
                last_state = env.reset(**policy.get_sample_config())
                last_context = policy.get_initial_features(state=last_state, context=last_context)
                length = 0
                reward_sum = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

                # Reset per-episode accumulators:
                ep_a_logits = []
                ep_value = []
                ep_context = []

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `rollout_length` steps or less (if `terminal` was reached),
        # complete the final experience of the rollout:
        if not terminal_end:
            # Bootstrap:
            last_experience['r'] = np.asarray(
                [policy.get_value(last_state, last_context, last_action_reward)]
            )

        else:
            last_experience['r'] = np.asarray([0.0])

        rollout.add(last_experience)

        # Only training rollouts are added to replay memory:
        try:
            # Was it a test episode (`type` in metadata is not zero)?
            if not atari_test and last_experience['state']['metadata']['type']:
                is_test = True

            else:
                is_test = False

        except KeyError:
            is_test = False

        if not is_test:
            memory.add(last_experience)

        # Once we have enough experience and memory can be sampled, yield it,
        # and have the ThreadRunner place it on a queue:
        if memory.is_full():
            data = dict(
                on_policy=rollout,
                off_policy=memory.sample_uniform(sequence_size=rollout_length),
                off_policy_rp=memory.sample_priority(exact_size=True),
                ep_summary=ep_stat,
                test_ep_summary=test_ep_stat,
                render_summary=render_stat,
            )
            yield data

            ep_stat = None
            test_ep_stat = None
            render_stat = None

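# Illustrative only: a minimal consumption pattern for the generator above, assuming a
# standard queue-based thread runner. `queue` and `_runner_loop_sketch` are hypothetical
# names introduced here; `sess`, `env`, `policy` and the remaining arguments stand for
# whatever the enclosing runner already provides.

def _runner_loop_sketch(queue, sess, env, policy, task, rollout_length,
                        summary_writer, episode_summary_freq, env_render_freq,
                        atari_test, ep_summary, memory_config, log):
    """Illustrative sketch: pull yielded rollouts and enqueue them for the trainer."""
    rollout_provider = VerboseEnvRunnerFn(
        sess, env, policy, task, rollout_length, summary_writer,
        episode_summary_freq, env_render_freq, atari_test, ep_summary,
        memory_config, log,
    )
    for data in rollout_provider:
        # Blocks when the trainer falls behind; `data` carries on/off-policy rollouts
        # plus episode, test and render summaries (each of which may be None):
        queue.put(data, block=True)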