def learn(self, env):
    """Run the learning algorithm: accumulate policy gradients over a batch of episodes, then apply them."""
    reporter = Reporter()
    self.session.run([self.reset_accumulative_grads])
    iteration = 0  # amount of batches processed
    episode_nr = 0
    episode_lengths = np.zeros(self.config['batch_size'])
    episode_rewards = np.zeros(self.config['batch_size'])
    mean_rewards = []
    while True:  # Keep executing episodes
        trajectory = self.get_trajectory(env, self.config["episode_max_length"])
        episode_rewards[episode_nr % self.config['batch_size']] = sum(trajectory['reward'])
        episode_lengths[episode_nr % self.config['batch_size']] = len(trajectory['reward'])
        episode_nr += 1
        action_taken = (np.arange(self.nA) == trajectory['action'][:, None]).astype(np.float32)  # one-hot encoding
        discounted_episode_rewards = discount_rewards(trajectory['reward'], self.config['gamma'])
        # standardize the returns to zero mean and unit variance
        discounted_episode_rewards -= np.mean(discounted_episode_rewards)
        std = np.std(discounted_episode_rewards)
        std = std if std > 0 else 1  # avoid division by zero for constant rewards
        discounted_episode_rewards /= std
        # tile the per-timestep return across all actions so it can weight the log-probability gradients
        feedback = np.reshape(np.repeat(discounted_episode_rewards, self.nA),
                              (len(discounted_episode_rewards), self.nA))
        self.session.run([self.accumulate_grads], feed_dict={
            self.state: trajectory["state"],
            self.action_taken: action_taken,
            self.feedback: feedback
        })
        if episode_nr % self.config['batch_size'] == 0:  # batch is done
            iteration += 1
            self.session.run([self.apply_gradients])
            self.session.run([self.reset_accumulative_grads])
            reporter.print_iteration_stats(iteration, episode_rewards, episode_lengths, episode_nr)
            mean_rewards.append(episode_rewards.mean())
            if episode_nr % self.config['draw_frequency'] == 0:
                reporter.draw_rewards(mean_rewards)
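# Hypothetical sketch of the discount_rewards helper used above (and in the
# variant below), which is not defined in this section. Assuming it computes
# the standard discounted return G_t = r_t + gamma * G_{t+1}; the signature
# discount_rewards(rewards, gamma) is taken from the call sites.
import numpy as np

def discount_rewards(rewards, gamma):
    """Compute the discounted return for every timestep of an episode."""
    discounted = np.zeros_like(rewards, dtype=np.float64)
    running_sum = 0.0
    # Walk backwards so each entry is its reward plus the discounted future sum
    for t in reversed(range(len(rewards))):
        running_sum = rewards[t] + gamma * running_sum
        discounted[t] = running_sum
    return discounted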
def learn(self):
    """Run the learning algorithm: accumulate gradients over a batch of episodes, then update the weights with RMSProp."""
    reporter = Reporter()
    gradient1 = np.zeros_like(self.w1)
    gradient2 = np.zeros_like(self.w2)
    # RMSProp caches: running averages of the squared gradients
    rmsprop1 = np.zeros_like(self.w1)
    rmsprop2 = np.zeros_like(self.w2)
    iteration = 0  # amount of batches processed
    episode_nr = 0
    episode_lengths = np.zeros(self.config['batch_size'])
    episode_rewards = np.zeros(self.config['batch_size'])
    mean_rewards = []
    while True:  # Keep executing episodes
        trajectory = self.get_trajectory(self.config["episode_max_length"])
        episode_rewards[episode_nr % self.config['batch_size']] = sum(trajectory['reward'])
        episode_lengths[episode_nr % self.config['batch_size']] = len(trajectory['reward'])
        episode_nr += 1
        action_taken = (np.arange(self.nA) == trajectory['action'][:, None]).astype(np.float32)  # one-hot encoding
        epdlogp = action_taken - trajectory['prob']  # gradient of the log-probability of the taken actions
        discounted_episode_rewards = discount_rewards(trajectory['reward'], self.config['gamma'])
        # standardize the returns to zero mean and unit variance
        discounted_episode_rewards -= np.mean(discounted_episode_rewards)
        std = np.std(discounted_episode_rewards)
        std = std if std > 0 else 1  # avoid division by zero for constant rewards
        discounted_episode_rewards /= std
        # weight the log-probability gradients by the standardized returns
        epdlogp *= np.reshape(np.repeat(discounted_episode_rewards, self.nA),
                              (len(discounted_episode_rewards), self.nA))
        change_w1, change_w2 = self.backward_step(trajectory['state'], trajectory['x1'], epdlogp)
        gradient1 += change_w1
        gradient2 += change_w2
        if episode_nr % self.config['batch_size'] == 0:  # batch is done
            iteration += 1
            # RMSProp update: scale each gradient by the root of its running squared average
            rmsprop1 = self.config['decay_rate'] * rmsprop1 + (1 - self.config['decay_rate']) * gradient1**2
            rmsprop2 = self.config['decay_rate'] * rmsprop2 + (1 - self.config['decay_rate']) * gradient2**2
            self.w1 += self.config['learning_rate'] * gradient1 / (np.sqrt(rmsprop1) + 1e-5)
            self.w2 += self.config['learning_rate'] * gradient2 / (np.sqrt(rmsprop2) + 1e-5)
            gradient1 = np.zeros_like(self.w1)
            gradient2 = np.zeros_like(self.w2)
            reporter.print_iteration_stats(iteration, episode_rewards, episode_lengths, episode_nr)
            mean_rewards.append(episode_rewards.mean())
            if episode_nr % self.config['draw_frequency'] == 0:
                reporter.draw_rewards(mean_rewards)
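# Hypothetical sketch of the backward_step method called above, which is not
# shown in this section. Assuming a two-layer network where
# x1 = relu(states @ w1) and prob = softmax(x1 @ w2) (consistent with the
# 'x1' and 'prob' trajectory keys and with epdlogp = action_taken - prob),
# the backward pass would look roughly like this:
def backward_step(self, states, x1, epdlogp):
    """Backpropagate the return-weighted log-probability gradient through the net."""
    change_w2 = np.dot(x1.T, epdlogp)      # gradient w.r.t. hidden-to-output weights
    dhidden = np.dot(epdlogp, self.w2.T)   # backprop the error into the hidden layer
    dhidden[x1 <= 0] = 0                   # ReLU derivative: no gradient for inactive units
    change_w1 = np.dot(states.T, dhidden)  # gradient w.r.t. input-to-hidden weights
    return change_w1, change_w2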