def train_controller(self, batch):
    # Compute discounted expected returns for every episode in the batch
    expected_rewards = []
    for i, episode_rewards in enumerate(batch['rewards']):
        expected_rewards.append(get_expected_rewards(episode_rewards, self.discount))

    # Fit the critic for a fixed number of iterations before touching the policy
    av_critic_loss = []
    for i in range(self.nb_critic_iter):
        _, critic_loss = self.sess.run([self.train_critic_op, self.critic_loss], feed_dict={
            self.inputs: batch['states']
            , self.actions: batch['actions']
            , self.expected_rewards: expected_rewards
            , self.rewards: batch['rewards']
            , self.mask_plh: batch['mask']
            , self.next_states: batch['next_states']
        })
        av_critic_loss.append(critic_loss)
    # Sync the fixed (target) critic variables with the freshly fitted ones
    self.sess.run(self.update_fixed_vars_op)

    # One policy update against the fitted critic
    _, policy_loss = self.sess.run([self.train_policy_op, self.policy_loss], feed_dict={
        self.inputs: batch['states']
        , self.actions: batch['actions']
        , self.expected_rewards: expected_rewards
        , self.rewards: batch['rewards']
        , self.mask_plh: batch['mask']
        , self.next_states: batch['next_states']
    })

    summary, _, episode_id = self.sess.run([self.all_summary_t, self.inc_ep_id_op, self.episode_id], feed_dict={
        self.policy_loss_plh: policy_loss,
        self.critic_loss_plh: np.mean(av_critic_loss),
    })
    self.sw.add_summary(summary, episode_id)

    return
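# get_expected_rewards is used throughout these training routines but is not
# defined in this excerpt. A minimal sketch of what it presumably computes,
# assuming a (rewards, discount) signature with discount defaulting to 1
# (one call site below passes rewards only): the discounted return
# G_t = r_t + discount * G_{t+1} for each step of a single episode.
import numpy as np

def get_expected_rewards(episode_rewards, discount=1.0):
    returns = np.zeros(len(episode_rewards), dtype=np.float32)
    running = 0.0
    # Walk the episode backwards, accumulating the discounted return
    for t in reversed(range(len(episode_rewards))):
        running = episode_rewards[t] + discount * running
        returns[t] = running
    return returns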
def learn_from_episode(self, env, render):
    obs = env.reset()
    act, _ = self.act(obs)
    score = 0

    # SARSA-style transitions; observations are augmented with one extra
    # dimension holding a terminal flag (always 0 for the current state,
    # 1 for the next state when the episode is done)
    historyType = np.dtype([
        ('states', 'float32', (env.observation_space.shape[0] + 1, )),
        ('actions', 'int32', (1, )),
        ('rewards', 'float32'),
        ('next_states', 'float32', (env.observation_space.shape[0] + 1, )),
        ('next_actions', 'int32'),
    ])
    history = np.array([], dtype=historyType)
    while True:
        if render:
            env.render()

        next_obs, reward, done, info = env.step(act)
        next_act, _ = self.act(next_obs)

        memory = np.array([(
            np.concatenate((obs, [0])),
            [act],
            reward,
            np.concatenate((next_obs, [1 if done else 0])),
            next_act,
        )], dtype=historyType)
        history = np.append(history, memory)

        score += reward
        obs = next_obs
        act = next_act
        if done:
            break

    # Learning
    _, policy_loss, q_loss, loss = self.sess.run(
        [self.train_op, self.policy_loss, self.q_loss, self.loss],
        feed_dict={
            self.inputs: history['states'],
            self.actions: history['actions'],
            self.rewards: get_expected_rewards(history['rewards']),
            self.next_states: history['next_states'],
            self.next_actions: history['next_actions'],
        })

    summary, _, episode_id = self.sess.run(
        [self.all_summary_t, self.inc_ep_id_op, self.episode_id],
        feed_dict={
            self.score_plh: score,
            self.policy_loss_plh: policy_loss,
            self.q_loss_plh: q_loss,
            self.loss_plh: loss,
        })
    self.sw.add_summary(summary, episode_id)

    return
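# A hypothetical driver for the agent above (the train function, agent
# argument, and environment name are illustrative; only learn_from_episode
# comes from this excerpt). Each call collects one full episode and runs
# one learning step on it:
import gym

def train(agent, env_name='CartPole-v0', nb_episodes=500, render=False):
    env = gym.make(env_name)
    for _ in range(nb_episodes):
        agent.learn_from_episode(env, render)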
def learn_from_episode(self, env, render=False):
    t = 0
    score = 0
    av_loss = []
    historyType = np.dtype([
        ('states', 'int32', ()),
        ('actions', 'int32', ()),
        ('rewards', 'float32'),
        ('estimates', 'float32'),
    ])
    history = np.array([], dtype=historyType)

    done = False
    obs = env.reset()
    act, state_id, estimate = self.act(obs)
    while not done:
        if render:
            env.render()

        next_obs, reward, done, info = env.step(act)
        next_act, next_state_id, next_estimate = self.act(next_obs, done)

        # Store the transition along with the bootstrap estimate of the next state
        memory = np.array([(state_id, act, reward, next_estimate)], dtype=historyType)
        history = np.append(history, memory)

        if t >= self.n_step - 1:
            # Once n_step transitions are buffered, update the oldest one.
            # In this case, it is a lot faster to use Python directly to compute the targets
            targets = capacities.get_n_step_expected_rewards(
                history['rewards'][-self.n_step:],
                history['estimates'][-self.n_step:],
                self.discount,
                self.n_step,
            )
            _, loss = self.sess.run([self.train_op, self.loss], feed_dict={
                self.inputs_plh: [history['states'][-self.n_step]],
                self.actions_t: [history['actions'][-self.n_step]],
                self.targets_t: [targets[0]],
            })
            av_loss.append(loss)

        t += 1
        score += reward
        obs = next_obs
        state_id = next_state_id
        act = next_act

    # We now have to finish the learning for the last n_step - 1 transitions
    if self.n_step > 1:
        min_step = min(self.n_step, len(history))
        targets = capacities.get_expected_rewards(history['rewards'][-min_step:], self.discount)
        _, loss = self.sess.run([self.train_op, self.loss], feed_dict={
            self.inputs_plh: history['states'][-min_step:],
            self.actions_t: history['actions'][-min_step:],
            self.targets_t: targets,
        })
        av_loss.append(loss)

    summary, _, episode_id = self.sess.run([self.all_summary_t, self.inc_ep_id_op, self.episode_id], feed_dict={
        self.score_plh: score,
        self.loss_plh: np.mean(av_loss),
    })
    self.sw.add_summary(summary, episode_id)
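# capacities.get_n_step_expected_rewards is not shown in this excerpt. Given
# how it is called above (rewards and value estimates over the last n_step
# transitions), it presumably builds n-step TD targets
#   G_t = r_t + discount * r_{t+1} + ... + discount^{n-1} * r_{t+n-1}
#         + discount^n * V(s_{t+n}),
# where the stored 'estimates' already hold next-state values. A minimal
# sketch under that assumption (only targets[0], the oldest transition in
# the window, is consumed above):
def get_n_step_expected_rewards(rewards, estimates, discount, n_step):
    targets = []
    for t in range(len(rewards)):
        end = min(t + n_step, len(rewards))
        target = 0.0
        for k in range(t, end):
            target += discount ** (k - t) * rewards[k]
        # Bootstrap from the next-state estimate stored with the last reward used
        target += discount ** (end - t) * estimates[end - 1]
        targets.append(target)
    return targets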
def train_controller(self, batch):
    for i, episode_rewards in enumerate(batch['rewards']):
        batch['rewards'][i] = get_expected_rewards(episode_rewards, self.discount)

    _, loss = self.sess.run([self.train_op, self.loss], feed_dict={
        self.inputs: batch['states']
        , self.actions: batch['actions']
        , self.rewards: batch['rewards']
        , self.mask_plh: batch['mask']
    })

    summary, episode_id = self.sess.run([self.loss_sum_t, self.episode_id], feed_dict={
        self.loss_plh: np.mean(loss),
    })
    self.sw.add_summary(summary, episode_id)

    return
def train_controller(self, batch):
    # Replace raw per-episode rewards with discounted expected returns
    for i, episode_rewards in enumerate(batch['rewards']):
        batch['rewards'][i] = get_expected_rewards(episode_rewards, self.discount)

    _, c_sum, time, _ = self.sess.run(
        [self.c_train_op, self.all_c_summary_t, self.time, self.inc_time_op],
        feed_dict={
            self.state_input_plh: batch['states'],
            self.actions_t: batch['actions'],
            self.c_rewards_plh: batch['rewards'],
            self.mask_plh: batch['mask'],
        })
    self.sw.add_summary(c_sum, time)

    return
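# The batch dict consumed by the train_controller variants above is assumed
# to hold rectangular arrays padded to the longest episode, with 'mask'
# marking real timesteps (1.0) versus padding (0.0) so padded entries can be
# zeroed out of the loss. A hypothetical batching helper under that
# assumption (make_batch and the episode dict layout are illustrative, not
# taken from this excerpt):
import numpy as np

def make_batch(episodes):
    # episodes: list of dicts with per-timestep 'states', 'actions', 'rewards'
    max_len = max(len(ep['rewards']) for ep in episodes)
    batch = {'states': [], 'actions': [], 'rewards': [], 'mask': []}
    for ep in episodes:
        pad = max_len - len(ep['rewards'])
        batch['states'].append(np.pad(np.asarray(ep['states']), [(0, pad), (0, 0)], 'constant'))
        batch['actions'].append(np.pad(np.asarray(ep['actions']), (0, pad), 'constant'))
        batch['rewards'].append(np.pad(np.asarray(ep['rewards'], dtype=np.float32), (0, pad), 'constant'))
        batch['mask'].append([1.0] * len(ep['rewards']) + [0.0] * pad)
    return {key: np.asarray(value) for key, value in batch.items()}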