def train(self):
    # Assumes module-level imports: random, numpy as np, tqdm.tqdm, matplotlib.pyplot as plt.
    epsilon = 1.00
    episode_rewards = []
    for episode in range(1, self.num_episodes + 1):
        state, reward = self.tsm.initialize()
        rewards = []
        for _ in tqdm(range(self.tsm.episode_length)):
            # Epsilon-greedy exploration: random action with probability epsilon,
            # otherwise the deterministic policy action.
            if random.random() < epsilon:
                action = self.random_action()
            else:
                action = self.actor_trainer.select_action(
                    inputs=np.array([state.features]))[0][0]
            trans_state, reward = self.tsm.step(action)
            rewards.append(reward)
            self.rpb.store(old_state=state, action=action, reward=reward,
                           new_state=trans_state)

            if self.rpb.ready(self.batch_size):
                transitions = self.rpb.sample(batch_size=self.batch_size,
                                              recurrent=False)
                batch_states = []   # [batch_size, num_assets, num_features]
                batch_actions = []  # [batch_size, 1]
                batch_y = []        # [batch_size, 1]
                for transition in transitions:
                    # Local names avoid shadowing the outer-loop action/reward.
                    old_state, sampled_action, sampled_reward, new_state = transition
                    # Bellman target: y = r + gamma * Q'(s', mu'(s')).
                    target_action = self.actor_target.select_action(
                        inputs=np.array([new_state.features]))
                    target_q = self.critic_target.get_q_value(
                        inputs=np.array([new_state.features]),
                        actions=target_action)[0]
                    y = sampled_reward + self.gamma * target_q
                    batch_y.append(y)
                    batch_states.append(old_state.features)
                    batch_actions.append([sampled_action])

                # Critic update towards the Bellman targets.
                self.critic_trainer.train_step(
                    inputs=np.array(batch_states),
                    actions=np.array(batch_actions),
                    predicted_q_value=np.array(batch_y))

                # Actor update along the critic's action gradients.
                policy_actions = self.actor_trainer.select_action(
                    inputs=np.array(batch_states))  # [batch_size, num_assets]
                action_grads = self.critic_trainer.get_action_gradients(
                    inputs=np.array(batch_states),
                    actions=policy_actions)[0]
                self.actor_trainer.train_step(
                    inputs=np.array(batch_states),
                    action_gradient=np.array(action_grads))

                # Soft update of the target networks.
                ActorNetwork.update_actor(self.sess, self.tau)
                CriticNetwork.update_critic(self.sess, self.tau)

            state = trans_state

        episode_rewards.append(np.sum(rewards))
        # Decay epsilon linearly down to a floor of 0.1.
        if epsilon > 0.1:
            epsilon -= 2.0 / self.num_episodes
        # Evaluate after every episode.
        if (episode % 1) == 0:
            self.infer(train=False, episode=episode)

    plt.plot(episode_rewards)
    plt.savefig("./episode_rewards.png")
    self.infer(train=False, episode=episode)
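# The ActorNetwork.update_actor / CriticNetwork.update_critic calls above are assumed to run
# the usual DDPG soft ("Polyak") target update: theta_target <- tau * theta + (1 - tau) * theta_target.
# A minimal TensorFlow 1.x-style sketch of building such update ops (illustrative only;
# the scope names "actor" and "actor_target" are assumptions, not names from this code):
import tensorflow as tf

def build_soft_update_ops(source_scope, target_scope, tau):
    # Pair trainable variables by position and blend source weights into the target.
    source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=source_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope)
    return [target.assign(tau * source + (1.0 - tau) * target)
            for source, target in zip(source_vars, target_vars)]

# Usage sketch: actor_update_ops = build_soft_update_ops("actor", "actor_target", tau=0.001),
# then sess.run(actor_update_ops) after each training step.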
def train(self):
    global_step = 0
    training_rewards = []
    for episode in range(1, self.num_episodes + 1):
        state = self.env.reset()
        state = np.reshape(state, (self.actor_trainer.state_dimension,))
        episode_rewards = 0
        episode_ave_max_q = 0
        for _ in range(self.episode_length):
            # Deterministic policy action plus exploration noise.
            action = self.actor_trainer.select_action(
                inputs=np.array([state]))[0] + self.actor_noise()  # [action_dim]
            trans_state, reward, terminal, info = self.env.step(action)
            trans_state = np.reshape(
                trans_state, (self.actor_trainer.state_dimension,))
            episode_rewards += reward
            self.rpb.store_w_terminal(old_state=state, action=action,
                                      reward=reward, terminal=terminal,
                                      new_state=trans_state)

            if self.rpb.ready(self.batch_size):
                batch_states, batch_actions, batch_rewards, batch_terminal, batch_trans_state = \
                    self.rpb.sample_batch(batch_size=self.batch_size)

                # Bellman targets: y = r for terminal transitions,
                # y = r + gamma * Q'(s', mu'(s')) otherwise.
                target_actions = self.actor_target.select_action(
                    inputs=batch_trans_state)  # [batch_size, action_dim]
                target_q = self.critic_target.get_q_value(
                    inputs=batch_trans_state,
                    actions=target_actions)  # [batch_size, 1]
                batch_y = []
                for ind in range(self.batch_size):
                    if batch_terminal[ind]:
                        batch_y.append([batch_rewards[ind]])
                    else:
                        batch_y.append(batch_rewards[ind] +
                                       self.gamma * target_q[ind])
                batch_y = np.array(batch_y)  # [batch_size, 1]

                # Critic update towards the Bellman targets.
                self.critic_trainer.train_step(inputs=batch_states,
                                               actions=batch_actions,
                                               predicted_q_value=batch_y)

                # Actor update along the critic's action gradients.
                policy_actions = self.actor_trainer.select_action(
                    inputs=batch_states)  # [batch_size, action_dim]
                action_grads = self.critic_trainer.get_action_gradients(
                    inputs=batch_states, actions=policy_actions)[0]
                self.actor_trainer.train_step(
                    inputs=batch_states,
                    action_gradient=np.array(action_grads))

                # Soft update of the target networks.
                ActorNetwork.update_actor(self.sess, self.tau)
                CriticNetwork.update_critic(self.sess, self.tau)
                global_step += 1

            state = trans_state
            if terminal:
                print("Episode number:", episode)
                summary = self.sess.run(
                    self.summary_ops,
                    feed_dict={self.episode_reward: episode_rewards})
                self.writer.add_summary(summary, episode)
                print("Reward:", episode_rewards)
                break
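# self.actor_noise() above is assumed to be an additive exploration-noise process; a common
# choice for DDPG is Ornstein-Uhlenbeck noise. Minimal sketch (class name and parameter
# defaults are assumptions, not taken from this code):
import numpy as np

class OrnsteinUhlenbeckNoise:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x = np.zeros_like(mu)

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        self.x = (self.x + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x

# Usage sketch: self.actor_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(action_dim))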