def learn_REINFORCE(self):
    """Learn using updates like in the REINFORCE algorithm."""
    reporter = Reporter()
    total_n_trajectories = 0
    iteration = self.start_at_iter
    while iteration < self.n_iter and not self.master.stop_requested:
        iteration += 1
        # Collect trajectories until we get timesteps_per_batch total timesteps
        batch = self.task_runner.get_trajectories()
        total_n_trajectories += len(batch)
        all_state = np.concatenate([traj["state"] for traj in batch])
        # Discounted sum of rewards for every timestep of every episode
        returns_per_ep = [
            discount_rewards(traj["reward"], self.config["gamma"])
            for traj in batch
        ]
        longest = max(len(ret) for ret in returns_per_ep)
        padded = [
            np.concatenate([ret, np.zeros(longest - len(ret))])
            for ret in returns_per_ep
        ]
        # Time-dependent baseline: mean return at each timestep across episodes
        baseline = np.mean(padded, axis=0)
        # Advantage = return minus baseline, truncated to each episode's length
        advantages = [ret - baseline[:len(ret)] for ret in returns_per_ep]
        all_action = np.concatenate([traj["action"] for traj in batch])
        all_adv = np.concatenate(advantages)
        # Per-episode statistics for reporting
        episode_rewards = np.array([traj["reward"].sum() for traj in batch])
        episode_lengths = np.array([len(traj["reward"]) for traj in batch])
        # Do policy gradient update step
        results = self.master.session.run(
            [self.loss, self.apply_grad],
            feed_dict={
                self.master.states: all_state,
                self.master.action_taken: all_action,
                self.master.advantage: all_adv
            })
        print("Task:", self.task_id)
        reporter.print_iteration_stats(
            iteration, episode_rewards, episode_lengths, total_n_trajectories)
        summary = self.master.session.run(
            [self.master.summary_op],
            feed_dict={
                self.master.loss: results[0],
                self.master.reward: np.mean(episode_rewards),
                self.master.episode_length: np.mean(episode_lengths)
            })
        self.writer.add_summary(summary[0], iteration)
        self.writer.flush()
def learn(self):
    """Run learning algorithm"""
    reporter = Reporter()
    config = self.config
    total_n_trajectories = 0
    for iteration in range(config["n_iter"]):
        # Collect trajectories until we get timesteps_per_batch total timesteps
        batch = self.env_runner.get_trajectories()
        total_n_trajectories += len(batch)
        all_state = np.concatenate([traj.states for traj in batch])
        # Discounted sums of rewards, one array per episode
        returns_per_ep = [
            discount_rewards(traj.rewards, config["gamma"])
            for traj in batch
        ]
        longest = max(len(ret) for ret in returns_per_ep)
        padded = [
            np.concatenate([ret, np.zeros(longest - len(ret))])
            for ret in returns_per_ep
        ]
        # Time-dependent baseline: mean padded return per timestep
        baseline = np.mean(padded, axis=0)
        # Advantage of each timestep relative to the baseline
        advantages = [ret - baseline[:len(ret)] for ret in returns_per_ep]
        all_action = np.concatenate([traj.actions for traj in batch])
        all_adv = np.concatenate(advantages)
        # Episode totals/lengths for summaries and reporting
        episode_rewards = np.array([sum(traj.rewards) for traj in batch])
        episode_lengths = np.array([len(traj.rewards) for traj in batch])
        # TODO: deal with RNN state
        summary, _ = self.session.run(
            [self.summary_op, self.train],
            feed_dict={
                self.states: all_state,
                self.a_n: all_action,
                self.adv_n: all_adv,
                self.episode_lengths: np.mean(episode_lengths),
                self.rewards: np.mean(episode_rewards)
            })
        self.writer.add_summary(summary, iteration)
        self.writer.flush()
        reporter.print_iteration_stats(
            iteration, episode_rewards, episode_lengths, total_n_trajectories)
    if self.config["save_model"]:
        self.saver.save(self.session, os.path.join(self.monitor_path, "model"))
def learn(self):
    """Run learning algorithm.

    Actor-critic training loop: each iteration collects a batch of
    trajectories, computes discounted returns as critic targets, and runs
    one combined critic/actor update step.
    """
    reporter = Reporter()
    config = self.config
    possible_actions = np.arange(self.env_runner.nA)
    total_n_trajectories = 0
    for iteration in range(config["n_iter"]):
        # Collect trajectories until we get timesteps_per_batch total timesteps
        trajectories = self.env_runner.get_trajectories()
        total_n_trajectories += len(trajectories)
        all_action = np.concatenate(
            [trajectory["action"] for trajectory in trajectories])
        # One-hot encode the taken actions against the full action set
        all_action = (possible_actions == all_action[:, None]).astype(
            np.float32)
        all_state = np.concatenate(
            [trajectory["state"] for trajectory in trajectories])
        # Compute discounted sums of rewards (critic regression targets)
        returns = np.concatenate([
            discount_rewards(trajectory["reward"], config["gamma"])
            for trajectory in trajectories
        ])
        # Critic's current value estimates, fed back to the actor update
        qw_new = self.get_critic_value(all_state)
        episode_rewards = np.array([
            trajectory["reward"].sum() for trajectory in trajectories
        ])  # episode total rewards
        episode_lengths = np.array([
            len(trajectory["reward"]) for trajectory in trajectories
        ])  # episode lengths
        # BUG FIX: the original feed_dict listed `self.states: all_state`
        # twice; Python dict literals silently keep only the last duplicate
        # key, so the redundant entry has been removed.
        results = self.session.run(
            [self.summary_op, self.critic_train, self.actor_train],
            feed_dict={
                self.states: all_state,
                self.critic_target: returns,
                self.actions_taken: all_action,
                self.critic_feedback: qw_new,
                self.critic_rewards: returns,
                self.rewards: np.mean(episode_rewards),
                self.episode_lengths: np.mean(episode_lengths)
            })
        self.writer.add_summary(results[0], iteration)
        self.writer.flush()
        reporter.print_iteration_stats(iteration, episode_rewards,
                                       episode_lengths, total_n_trajectories)
    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver.save(self.session,
                        os.path.join(self.monitor_path, "model"))
def learn(self):
    """Run learning algorithm"""
    # Multi-task REINFORCE loop: gradients from every active task are
    # accumulated, then applied once per iteration.
    reporter = Reporter()
    config = self.config
    # One trajectory counter per environment.
    # NOTE(review): sized by len(self.envs) but indexed by task-runner
    # index below — assumes len(self.envs) == len(self.task_runners); confirm.
    total_n_trajectories = np.zeros(len(self.envs))
    for iteration in range(config["n_iter"]):
        # Zero the gradient accumulators before summing this iteration's
        # per-task gradients.
        self.session.run([self.reset_accum_grads])
        for i, task_runner in enumerate(self.task_runners):
            if self.config["switch_at_iter"] is not None:
                # Task gating: before iteration `switch_at_iter`, train on
                # every task except the last; from that iteration on, train
                # only on the last task.
                if iteration >= self.config["switch_at_iter"] and i != (len(self.task_runners) - 1):
                    continue
                elif iteration < self.config["switch_at_iter"] and i == len(self.task_runners) - 1:
                    continue
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = task_runner.get_trajectories()
            total_n_trajectories[i] += len(trajectories)
            all_state = np.concatenate([trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [discount_rewards(trajectory["reward"], config["gamma"]) for trajectory in trajectories]
            # Pad per-episode returns to equal length so they can be averaged
            # timestep-wise.
            max_len = max(len(ret) for ret in rets)
            padded_rets = [np.concatenate([ret, np.zeros(max_len - len(ret))]) for ret in rets]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate([trajectory["action"] for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([trajectory["reward"].sum() for trajectory in trajectories])  # episode total rewards
            episode_lengths = np.array([len(trajectory["reward"]) for trajectory in trajectories])  # episode lengths
            # Add this task's gradients into the accumulators (applied below,
            # after all tasks have contributed).
            results = self.session.run([self.losses[i], self.add_accum_grads[i], self.accum_grads],
                                       feed_dict={
                                           self.states: all_state,
                                           self.action_taken: all_action,
                                           self.advantage: all_adv
                                       })
            # Separate run to log loss/reward/length summaries for this task.
            summary = self.session.run([self.summary_op], feed_dict={
                self.loss: results[0],
                self.rewards: np.mean(episode_rewards),
                self.episode_lengths: np.mean(episode_lengths)
            })
            # One writer per task.
            self.writers[i].add_summary(summary[0], iteration)
            self.writers[i].flush()
            print("Task:", i)
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths, total_n_trajectories[i])
        # Apply accumulated gradient after all the gradients of each task are summed
        self.session.run([self.apply_gradients])
    if self.config["save_model"]:
        if not os.path.exists(self.monitor_path):
            os.makedirs(self.monitor_path)
        self.saver.save(self.session, os.path.join(self.monitor_path, "model"))
def learn(self):
    """Accumulate per-episode policy gradients and apply them batch-wise."""
    reporter = Reporter()
    self.session.run([self.reset_accumulative_grads])
    iteration = 0  # amount of batches processed
    episode_nr = 0
    batch_size = self.config["batch_size"]
    episode_lengths = np.zeros(batch_size)
    episode_rewards = np.zeros(batch_size)
    mean_rewards = []
    # Keep executing episodes
    while True:
        trajectory = self.get_trajectory()
        slot = episode_nr % batch_size
        episode_rewards[slot] = sum(trajectory["reward"])
        episode_lengths[slot] = len(trajectory["reward"])
        episode_nr += 1
        # one-hot encoding of the actions taken
        action_taken = (np.arange(self.nA) ==
                        trajectory["action"][:, None]).astype(np.float32)
        discounted_episode_rewards = discount_rewards(
            trajectory["reward"], self.config["gamma"])
        # standardize the returns; fall back to 1 when std is zero
        discounted_episode_rewards -= np.mean(discounted_episode_rewards)
        std = np.std(discounted_episode_rewards)
        if not std > 0:
            std = 1
        discounted_episode_rewards /= std
        # Repeat each return across the action dimension
        feedback = np.reshape(
            np.repeat(discounted_episode_rewards, self.nA),
            (len(discounted_episode_rewards), self.nA))
        self.session.run(
            [self.accumulate_grads],
            feed_dict={
                self.states: trajectory["state"],
                self.action_taken: action_taken,
                self.feedback: feedback
            })
        if episode_nr % batch_size == 0:  # batch is done
            iteration += 1
            self.session.run([self.apply_gradients])
            self.session.run([self.reset_accumulative_grads])
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths, episode_nr)
            mean_rewards.append(episode_rewards.mean())
            if episode_nr % self.config["draw_frequency"] == 0:
                reporter.draw_rewards(mean_rewards)
    # NOTE(review): the loop above has no break, so this save code appears
    # unreachable — confirm intended placement.
    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver.save(self.session,
                        os.path.join(self.monitor_path, "model"))
def learn(self):
    """Pure-NumPy REINFORCE loop with hand-rolled RMSProp updates.

    Accumulates policy gradients over ``batch_size`` episodes, then applies
    an RMSProp-scaled gradient-ascent step to the two weight matrices
    ``self.w1`` and ``self.w2``.
    """
    reporter = Reporter()
    gradient1 = np.zeros_like(self.w1)
    gradient2 = np.zeros_like(self.w2)
    rmsprop1 = np.zeros_like(self.w1)
    rmsprop2 = np.zeros_like(self.w2)
    iteration = 0  # amount of batches processed
    episode_nr = 0
    episode_lengths = np.zeros(self.config["batch_size"])
    episode_rewards = np.zeros(self.config["batch_size"])
    mean_rewards = []
    while True:  # Keep executing episodes
        trajectory = self.get_trajectory(self.config["episode_max_length"])
        slot = episode_nr % self.config["batch_size"]
        episode_rewards[slot] = sum(trajectory["reward"])
        episode_lengths[slot] = len(trajectory["reward"])
        episode_nr += 1
        action_taken = (np.arange(
            self.nA) == trajectory["action"][:, None]).astype(
                np.float32)  # one-hot encoding
        # Gradient of log-probability of the taken action
        epdlogp = action_taken - trajectory["prob"]
        discounted_episode_rewards = discount_rewards(
            trajectory["reward"], self.config["gamma"])
        # standardize
        discounted_episode_rewards -= np.mean(discounted_episode_rewards)
        # BUG FIX: guard against zero std (e.g. constant-reward episode),
        # which previously produced NaN/inf gradients; mirrors the guard
        # used by the TensorFlow variant of this algorithm.
        std = np.std(discounted_episode_rewards)
        discounted_episode_rewards /= std if std > 0 else 1
        # Weight each action's log-prob gradient by its standardized return
        epdlogp *= np.reshape(
            np.repeat(discounted_episode_rewards, self.nA),
            (len(discounted_episode_rewards), self.nA))
        change_w1, change_w2 = self.backward_step(trajectory["state"],
                                                  trajectory['x1'], epdlogp)
        gradient1 += change_w1
        gradient2 += change_w2
        if episode_nr % self.config["batch_size"] == 0:  # batch is done
            iteration += 1
            # RMSProp: exponential moving average of squared gradients
            rmsprop1 = self.config["decay_rate"] * rmsprop1 + (
                1 - self.config["decay_rate"]) * gradient1**2
            rmsprop2 = self.config["decay_rate"] * rmsprop2 + (
                1 - self.config["decay_rate"]) * gradient2**2
            # Gradient ascent step, scaled by the RMSProp denominators
            self.w1 += self.config["learning_rate"] * gradient1 / (
                np.sqrt(rmsprop1) + 1e-5)
            self.w2 += self.config["learning_rate"] * gradient2 / (
                np.sqrt(rmsprop2) + 1e-5)
            gradient1 = np.zeros_like(self.w1)
            gradient2 = np.zeros_like(self.w2)
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths, episode_nr)
            mean_rewards.append(episode_rewards.mean())
            if episode_nr % self.config["draw_frequency"] == 0:
                reporter.draw_rewards(mean_rewards)