def __init__(self, config):
    Base_Agent.__init__(self, config)
    self.hyperparameters = config.hyperparameters
    self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size,
                                       output_dim=1, key_to_use="Critic")
    self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size,
                                        output_dim=1, key_to_use="Critic")
    Base_Agent.copy_model_over(self.critic_local, self.critic_target)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.hyperparameters["Critic"]["learning_rate"])
    self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                self.hyperparameters["batch_size"],
                                self.config.seed, self.config.use_GPU)
    self.actor_local = self.create_NN(input_dim=self.state_size,
                                      output_dim=self.action_size, key_to_use="Actor")
    self.actor_target = self.create_NN(input_dim=self.state_size,
                                       output_dim=self.action_size, key_to_use="Actor")
    Base_Agent.copy_model_over(self.actor_local, self.actor_target)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=self.hyperparameters["Actor"]["learning_rate"])
    self.exploration_strategy = OU_Noise_Exploration(self.config)
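# A minimal sketch of what the Base_Agent.copy_model_over helper used above is
# assumed to do: a hard update that clones every parameter of one network into
# another. This is an assumption about its behaviour, not the source definition.
def copy_model_over(from_model, to_model):
    """Copies the parameters of from_model into to_model (hard update)."""
    for to_param, from_param in zip(to_model.parameters(), from_model.parameters()):
        to_param.data.copy_(from_param.data.clone())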
def __init__(self, config):
    Base_Agent.__init__(self, config)
    self.hyperparameters = config.hyperparameters
    self.critic_local = Neural_Network(self.state_size + self.action_size, 1,
                                       self.random_seed, self.hyperparameters["Critic"],
                                       "VANILLA_NN").to(self.device)
    self.critic_target = copy.deepcopy(self.critic_local).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.hyperparameters["Critic"]["learning_rate"])
    self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                self.hyperparameters["batch_size"], self.random_seed)
    self.actor_local = Neural_Network(self.state_size, self.action_size,
                                      self.random_seed, self.hyperparameters["Actor"],
                                      "VANILLA_NN").to(self.device)
    self.actor_target = copy.deepcopy(self.actor_local).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=self.hyperparameters["Actor"]["learning_rate"])
    self.noise = OU_Noise(self.action_size, self.random_seed,
                          self.hyperparameters["mu"], self.hyperparameters["theta"],
                          self.hyperparameters["sigma"])
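# A minimal sketch of the OU_Noise class these constructors assume (the real
# implementation may differ). Ornstein-Uhlenbeck noise is mean-reverting, so it
# gives temporally correlated exploration noise for continuous actions:
# dx = theta * (mu - x) + sigma * N(0, 1).
import numpy as np

class OU_Noise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Resets the internal state back to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advances the OU process one step and returns the new state as noise."""
        dx = self.theta * (self.mu - self.state) + \
             self.sigma * np.random.standard_normal(len(self.state))
        self.state += dx
        return self.state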
def __init__(self, config, global_action_id_to_primitive_actions,
             action_length_reward_bonus, end_of_episode_symbol="/"):
    super().__init__(config)
    self.end_of_episode_symbol = end_of_episode_symbol
    self.global_action_id_to_primitive_actions = global_action_id_to_primitive_actions
    self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                self.hyperparameters["batch_size"], config.seed)
    self.exploration_strategy = Epsilon_Greedy_Exploration(config)
    self.oracle = self.create_oracle()
    self.oracle_optimizer = optim.Adam(self.oracle.parameters(),
                                       lr=self.hyperparameters["learning_rate"])
    self.q_network_local = self.create_NN(input_dim=self.state_size + 1,
                                          output_dim=self.action_size)
    self.q_network_local.print_model_summary()
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
    self.q_network_target = self.create_NN(input_dim=self.state_size + 1,
                                           output_dim=self.action_size)
    Base_Agent.copy_model_over(from_model=self.q_network_local,
                               to_model=self.q_network_target)
    self.action_length_reward_bonus = action_length_reward_bonus
    self.abandon_ship = config.hyperparameters["abandon_ship"]
def __init__(self, config):
    Base_Agent.__init__(self, config)
    assert self.action_types == "DISCRETE", \
        "Action types must be discrete. Use SAC instead for continuous actions"
    assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", \
        "Final actor layer must be softmax"
    self.hyperparameters = config.hyperparameters
    self.critic_local = self.create_NN(input_dim=self.state_size,
                                       output_dim=self.action_size, key_to_use="Critic")
    self.critic_local_2 = self.create_NN(input_dim=self.state_size,
                                         output_dim=self.action_size, key_to_use="Critic",
                                         override_seed=self.config.seed + 1)
    self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                             lr=self.hyperparameters["Critic"]["learning_rate"])
    self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
                                               lr=self.hyperparameters["Critic"]["learning_rate"])
    self.critic_target = self.create_NN(input_dim=self.state_size,
                                        output_dim=self.action_size, key_to_use="Critic")
    self.critic_target_2 = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size, key_to_use="Critic")
    Base_Agent.copy_model_over(self.critic_local, self.critic_target)
    Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
    self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                self.hyperparameters["batch_size"],
                                self.config.seed, self.config.use_GPU)
    self.actor_local = self.create_NN(input_dim=self.state_size,
                                      output_dim=self.action_size, key_to_use="Actor")
    self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                            lr=self.hyperparameters["Actor"]["learning_rate"])
    self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
    if self.automatic_entropy_tuning:
        # heuristic target entropy value from the paper
        self.target_entropy = -torch.prod(torch.Tensor(
            self.environment.action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = Adam([self.log_alpha],
                                lr=self.hyperparameters["Actor"]["learning_rate"])
    else:
        self.alpha = self.hyperparameters["entropy_term_weight"]
    assert not self.hyperparameters["add_extra_noise"], \
        "There is no add extra noise option for the discrete version of SAC at the moment"
    self.add_extra_noise = False
    self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
def __init__(self, config):
    Base_Agent.__init__(self, config)
    self.hyperparameters = config.hyperparameters
    self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size,
                                       output_dim=1, key_to_use="Critic")
    self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size,
                                        output_dim=1, key_to_use="Critic")
    self.critic_target.load_state_dict(copy.deepcopy(self.critic_local.state_dict()))
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.hyperparameters["Critic"]["learning_rate"])
    self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                self.hyperparameters["batch_size"], self.config.seed)
    self.actor_local = self.create_NN(input_dim=self.state_size,
                                      output_dim=self.action_size, key_to_use="Actor")
    self.actor_target = self.create_NN(input_dim=self.state_size,
                                       output_dim=self.action_size, key_to_use="Actor")
    self.actor_target.load_state_dict(copy.deepcopy(self.actor_local.state_dict()))
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=self.hyperparameters["Actor"]["learning_rate"])
    self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
                          self.hyperparameters["theta"], self.hyperparameters["sigma"])
def __init__(self, config, num_agents, input_dim, hidden_dim, output_dim):
    self.num_agents = num_agents
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim
    self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
    self.config = config
    # Replay-buffer-related parameters
    self.batch_size = config['batch_size']
    self.buffer_size = config['buffer_size']
    self.buffer = Replay_Buffer(self.buffer_size, self.batch_size)
    self.lr = config['lr']
    self.tau = config['tau']
    self.agents = []
    self.update_step = config['update_step']
    self.curr_step = 0
    self._init_agents()
def put_adapted_experiences_in_a_replay_buffer(self, action_id_to_actions):
    """Adds experiences to the replay buffer after re-imagining that the actions taken were
    macro-actions according to action_rules as well as primitive actions.

    NOTE that we want to put both primitive actions and macro-actions into the replay buffer
    so that the agent can learn that it's better to do a macro-action rather than the same
    primitive actions (which we will enforce with a reward penalty)"""
    actions_to_action_id = {v: k for k, v in action_id_to_actions.items()}
    self.num_actions = len(action_id_to_actions)
    print(actions_to_action_id)
    for key in actions_to_action_id.keys():
        assert isinstance(key, tuple)
        assert isinstance(actions_to_action_id[key], int)
    episodes = len(self.states)
    for data_type in [self.states, self.next_states, self.rewards, self.actions, self.dones]:
        assert len(data_type) == episodes
    max_action_length = self.calculate_max_action_length(actions_to_action_id)
    if self.action_balanced_replay_buffer:
        print("Using action balanced replay buffer")
        replay_buffer = Action_Balanced_Replay_Buffer(self.buffer_size, self.batch_size,
                                                      self.seed, self.use_GPU,
                                                      num_actions=self.num_actions)
    else:
        print("Using ordinary replay buffer")
        replay_buffer = Replay_Buffer(self.buffer_size, self.batch_size, self.seed)
    for episode_ix in range(episodes):
        self.add_adapted_experience_for_an_episode(episode_ix, actions_to_action_id,
                                                   max_action_length, replay_buffer)
    return replay_buffer
def __init__(self, env, sess, batch_size=32, tau=0.125, learning_rate=0.0001):
    self.env = env
    self.sess = sess
    self.obs_dim = self.env.num_states
    self.act_dim = self.env.num_actions
    # hyperparameters
    self.lr = learning_rate
    self.bs = batch_size
    self.eps = 1.0
    self.eps_decay = 0.995
    self.gamma = 0.95
    self.tau = tau
    self.buffer_size = 5000
    self.hidden_dim = 32
    # replay buffer
    self.replay_buffer = Replay_Buffer(self.buffer_size)
    # create model
    self.model, self.weights, self.state = self.create_actor()
    self.target_model, self.target_weights, self.target_state = self.create_actor()
    # gradients
    self.action_gradient = tf.placeholder(tf.float32, [None, self.act_dim])
    self.params_grad = tf.gradients(self.model.output, self.weights,
                                    -self.action_gradient)  # negative for gradient ascent
    grads = zip(self.params_grad, self.weights)
    # optimizer & run
    self.optimize = tf.train.AdamOptimizer(self.lr).apply_gradients(grads)
    self.sess.run(tf.initialize_all_variables())
    self.writer = tf.summary.FileWriter("./logs", graph=tf.get_default_graph())
    self.merge_op = tf.summary.merge_all()
class Basic_Agents:
    def __init__(self, config, num_agents, input_dim, hidden_dim, output_dim):
        self.num_agents = num_agents
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.config = config
        # Replay-buffer-related parameters
        self.batch_size = config['batch_size']
        self.buffer_size = config['buffer_size']
        self.buffer = Replay_Buffer(self.buffer_size, self.batch_size)
        self.lr = config['lr']
        self.tau = config['tau']
        self.agents = []
        self.update_step = config['update_step']
        self.curr_step = 0
        self._init_agents()

    def _init_agents(self):
        self.embedding = Embedding_Layer(self.input_dim, self.hidden_dim).to(self.device)
        self.embedding_target = Embedding_Layer(self.input_dim, self.hidden_dim).to(self.device)
        Dueling_DDQN_Learner.copy_network(self.embedding, self.embedding_target)
        self.share_para = self.embedding.parameters()
        self.all_para = self.embedding.parameters()
        # init the optimizer
        for i in range(self.num_agents):
            self.agents.append(Dueling_DDQN_Learner(self.config))
            self.all_para = chain(self.all_para, self.agents[i].get_q_network().parameters())
            # para = chain(self.embedding.parameters(), self.agents[i].get_q_network().parameters())
            # self.optimizer.append(optim.Adam(self.agents[i].get_q_network().parameters(), lr=1e-3))
        # self.all_para = chain(self.all_para)
        self.share_optimizer = optim.RMSprop(self.all_para, lr=self.lr, weight_decay=1e-4)

    def get_agent(self, i):
        return self.agents[i]

    def step(self, state, test=False):
        state_embedding = self._get_embedding(state)
        action = []
        for i in range(self.num_agents):
            action.append(self.agents[i].step(state_embedding[:, i], test))
        action = np.asarray(action)
        self.curr_step += 1
        return action

    def learn(self):
        # if self.curr_step > 0 and self.curr_step % self.update_step == 0:
        for i in range(self.update_step):
            states, actions, rewards, next_states, is_dones = self.sample_experience()
            actions = torch.from_numpy(actions).long().to(self.device)
            rewards = torch.from_numpy(rewards).float().to(self.device)
            is_dones = torch.from_numpy(is_dones).float().to(self.device)
            states_embedding = self._get_embedding(states)
            next_states_embedding = self._get_embedding(next_states)
            next_states_embedding_target = self._get_embedding_target(next_states)
            total_loss = 0
            for i in range(self.num_agents):
                actions_values_current = self.agents[i].cal_current_actions_value(
                    next_states_embedding[:, i], next_states_embedding_target[:, i],
                    rewards[:, i], is_dones)
                actions_values_expected = self.agents[i].cal_expected_actions_value(
                    states_embedding[:, i], actions[:, i])
                loss = F.mse_loss(actions_values_expected, actions_values_current)
                # loss.backward(retain_graph=True)
                total_loss += loss
            # backpropagation
            # self.optimizer[i].zero_grad()
            self.share_optimizer.zero_grad()
            total_loss.backward()
            # self._scale_shared_grads()
            torch.nn.utils.clip_grad_value_(self.all_para, 1)
            self.share_optimizer.step()
            for i in range(self.num_agents):
                # torch.nn.utils.clip_grad_value_(self.agents[i].q_network_current.parameters(), 1)
                # self.optimizer[i].step()
                # update the target network
                Dueling_DDQN_Learner.soft_update_of_target_network(
                    self.agents[i].q_network_current, self.agents[i].q_network_target, self.tau)
            self._update_sharing_target_network()
            # self.share_optimizer.zero_grad()

    def get_share_para(self):
        return dict(self.embedding.named_parameters())

    def store_experience(self, states, actions, rewards, next_states, is_dones):
        self.buffer.store_experience(states, actions, rewards, next_states, is_dones)

    def sample_experience(self):
        states, actions, rewards, next_states, is_dones = self.buffer.sample_experience()
        return states, actions, rewards, next_states, is_dones

    def _get_embedding(self, state):
        return self.embedding(state)

    def _get_embedding_target(self, state):
        return self.embedding_target(state)

    def _update_sharing_target_network(self):
        Dueling_DDQN_Learner.soft_update_of_target_network(self.embedding,
                                                           self.embedding_target, self.tau)

    def get_attention_score(self, i):
        return -1

    def _scale_shared_grads(self):
        """Scale gradients for parameters that are shared since they accumulate
        gradients from the critic loss function multiple times"""
        for p in self.share_para:
            p.grad.data.mul_(1. / self.num_agents)

    def save_model(self, path):
        share_model_name = path + '/share_model.pkl'
        torch.save(self.embedding.state_dict(), share_model_name)
        for i in range(self.num_agents):
            unique_model_name = path + '/q_network_%d.pkl' % i
            torch.save(self.agents[i].q_network_current.state_dict(), unique_model_name)

    def load_model(self, path):
        share_model_name = path + '/share_model.pkl'
        self.embedding.load_state_dict(torch.load(share_model_name, map_location=self.device))
        for i in range(self.num_agents):
            unique_model_name = path + '/q_network_%d.pkl' % i
            self.agents[i].q_network_current.load_state_dict(
                torch.load(unique_model_name, map_location=self.device))
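# A minimal replay buffer sketch consistent with the (buffer_size, batch_size)
# constructor and the store_experience / sample_experience calls used by
# Basic_Agents above. This is an assumption about the interface, not the
# project's actual Replay_Buffer.
import random
from collections import deque
import numpy as np

class Replay_Buffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences fall off the end
        self.batch_size = batch_size

    def store_experience(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def sample_experience(self):
        """Uniformly samples a batch and stacks each field into a numpy array."""
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states, dones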
def __init__(self, config):
    Base_Agent.__init__(self, config)
    self.hyperparameters = config.hyperparameters
    self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size,
                                       output_dim=1, key_to_use="Critic")
    self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size,
                                         output_dim=1, key_to_use="Critic",
                                         override_seed=self.config.seed + 1)
    self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                             lr=self.hyperparameters["Critic"]["learning_rate"])
    self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
                                               lr=self.hyperparameters["Critic"]["learning_rate"])
    self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size,
                                        output_dim=1, key_to_use="Critic")
    self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size,
                                          output_dim=1, key_to_use="Critic")
    Base_Agent.copy_model_over(self.critic_local, self.critic_target)
    Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
    self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                self.hyperparameters["batch_size"], self.config.seed)
    self.actor_local = self.create_NN(input_dim=self.state_size,
                                      output_dim=self.action_size * 2, key_to_use="Actor")
    self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                            lr=self.hyperparameters["Actor"]["learning_rate"])
    # heuristic target entropy value from the paper
    self.target_entropy = -np.prod(self.environment.action_space.shape).item()
    self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
    if self.automatic_entropy_tuning:
        self.target_entropy = -torch.prod(torch.Tensor(
            self.environment.action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = Adam([self.log_alpha],
                                lr=self.hyperparameters["Actor"]["learning_rate"])
    else:
        self.alpha = self.hyperparameters["entropy_term_weight"]
    self.add_extra_noise = self.hyperparameters["add_extra_noise"]
    if self.add_extra_noise:
        self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
                              self.hyperparameters["theta"], self.hyperparameters["sigma"])
    self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
class DDPG_Agent(Base_Agent):
    agent_name = "DDPG"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = Neural_Network(self.state_size + self.action_size, 1,
                                           self.random_seed, self.hyperparameters["Critic"],
                                           "VANILLA_NN").to(self.device)
        self.critic_target = copy.deepcopy(self.critic_local).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.hyperparameters["Critic"]["learning_rate"])
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                    self.hyperparameters["batch_size"], self.random_seed)
        self.actor_local = Neural_Network(self.state_size, self.action_size,
                                          self.random_seed, self.hyperparameters["Actor"],
                                          "VANILLA_NN").to(self.device)
        self.actor_target = copy.deepcopy(self.actor_local).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"])
        self.noise = OU_Noise(self.action_size, self.random_seed, self.hyperparameters["mu"],
                              self.hyperparameters["theta"], self.hyperparameters["sigma"])

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        Base_Agent.reset_game(self)
        self.noise.reset()

    def step(self):
        """Runs a step in the game"""
        while not self.done:
            self.pick_and_conduct_action()
            self.update_next_state_reward_done_and_score()
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
                    states, actions, rewards, next_states, dones = self.memory.sample()  # Sample experiences
                    self.critic_learn(states, actions, rewards, next_states, dones)
                    self.actor_learn(states)
            self.save_experience()
            self.state = self.next_state  # this is to set the state for the next iteration
            self.episode_step_number += 1
        self.episode_number += 1

    def pick_action(self):
        """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
        state = torch.from_numpy(self.state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action += self.noise.sample()
        return action

    def critic_learn(self, states, actions, rewards, next_states, dones):
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(self.critic_optimizer, self.critic_local, loss,
                                    self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.critic_local, self.critic_target,
                                           self.hyperparameters["Critic"]["tau"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        with torch.no_grad():
            critic_targets = self.compute_critic_targets(next_states, rewards, dones)
        critic_expected = self.compute_expected_critic_values(states, actions)
        loss = functional.mse_loss(critic_expected, critic_targets)
        return loss

    def compute_critic_targets(self, next_states, rewards, dones):
        critic_targets_next = self.compute_critic_values_for_next_states(next_states)
        critic_targets = self.compute_critic_values_for_current_states(rewards,
                                                                       critic_targets_next, dones)
        return critic_targets

    def compute_critic_values_for_next_states(self, next_states):
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            critic_targets_next = self.critic_target(torch.cat((next_states, actions_next), 1))
        return critic_targets_next

    def compute_critic_values_for_current_states(self, rewards, critic_targets_next, dones):
        critic_targets_current = rewards + (self.hyperparameters["discount_rate"] *
                                            critic_targets_next * (1 - dones))
        return critic_targets_current

    def compute_expected_critic_values(self, states, actions):
        critic_expected = self.critic_local(torch.cat((states, actions), 1))
        return critic_expected

    def time_for_critic_and_actor_to_learn(self):
        return self.enough_experiences_to_learn_from() and \
               self.episode_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def actor_learn(self, states):
        if self.done:  # we only update the learning rate at the end of each episode
            self.update_learning_rate(self.hyperparameters["Actor"]["learning_rate"],
                                      self.actor_optimizer)
        actor_loss = self.calculate_actor_loss(states)
        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.actor_local, self.actor_target,
                                           self.hyperparameters["Actor"]["tau"])

    def calculate_actor_loss(self, states):
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(torch.cat((states, actions_pred), 1)).mean()
        return actor_loss
# state = state.unsqueeze(1)

"""_____TESTING ENSEMBLE CRITIC_____
ensemble_critic = EnsembleCritic(1, 8, 1)
state = torch.FloatTensor(env.reset()).unsqueeze(0)
action = torch.FloatTensor(1).unsqueeze(1)
all_qs = ensemble_critic(state, action)
print(all_qs)
"""

ANNEAL_RATE = .00003
TEMP_MIN = 0.005
default_temp = 1.0
log_interval = 10

replay = Replay_Buffer(1000)
state = torch.FloatTensor(env.reset()).unsqueeze(0)
num_qs = 1
state_dim = 8
num_samples_match = 10
NUM_EPISODES = 50
batch_size = 250
bear = BEAR(num_qs, state_dim, 1, 10, ANNEAL_RATE, TEMP_MIN, default_temp)
running_rewards = []

if __name__ == "__main__":
    running_reward = 0
    for i in range(NUM_EPISODES):
        state = env.reset()
        done = False
env = gym.make(env_name)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
device = torch.device("cuda:%d" % gpu_id if use_cuda else "cpu")

# critic
critic_local = Critic(n_states, n_actions).to(device)
critic_target = Critic(n_states, n_actions).to(device)
model_deep_copy(from_model=critic_local, to_model=critic_target)
optim_critic = optim.Adam(critic_local.parameters(), lr=lr_critic, eps=1e-4)
memory = Replay_Buffer(buffer_size, batch_size, mem_seed)

# actor
actor_local = Actor(n_states).to(device)
actor_target = Actor(n_states).to(device)
model_deep_copy(from_model=actor_local, to_model=actor_target)
optim_actor = optim.Adam(actor_local.parameters(), lr=lr_actor, eps=1e-4)

# ou noise
ou_noise = OU_Noise(size=n_actions, seed=ou_seed, mu=mu, theta=theta, sigma=sigma)
ou_noise.reset()
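# A hedged sketch of one DDPG-style critic update using the pieces set up above.
# Assumptions (not from the source): memory.sample() returns torch tensors, the
# Critic forward pass takes (state, action), and gamma is the discount factor.
import torch.nn.functional as F

def critic_update(gamma=0.99):
    states, actions, rewards, next_states, dones = memory.sample()
    with torch.no_grad():
        next_actions = actor_target(next_states)
        q_next = critic_target(next_states, next_actions)
        q_targets = rewards + gamma * q_next * (1 - dones)  # zero bootstrap at terminals
    critic_loss = F.mse_loss(critic_local(states, actions), q_targets)
    optim_critic.zero_grad()
    critic_loss.backward()
    optim_critic.step()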
class SAC(Base_Agent):
    """Soft Actor-Critic model based on the 2018 paper https://arxiv.org/abs/1812.05905 and on
    this github implementation https://github.com/pranz24/pytorch-soft-actor-critic. It is an
    actor-critic algorithm where the agent is also trained to maximise the entropy of its
    actions as well as its cumulative reward"""
    agent_name = "SAC"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        assert self.action_types == "CONTINUOUS", \
            "Action types must be continuous. Use SAC Discrete instead for discrete actions"
        assert self.config.hyperparameters["Actor"]["final_layer_activation"] != "Softmax", \
            "Final actor layer must not be softmax"
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size,
                                           output_dim=1, key_to_use="Critic")
        self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size,
                                             output_dim=1, key_to_use="Critic",
                                             override_seed=self.config.seed + 1)
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                                 lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
                                                   lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size,
                                            output_dim=1, key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size,
                                              output_dim=1, key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                    self.hyperparameters["batch_size"], self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size * 2, key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                                lr=self.hyperparameters["Actor"]["learning_rate"])
        self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            # heuristic target entropy value from the paper
            self.target_entropy = -torch.prod(torch.Tensor(
                self.environment.action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam([self.log_alpha],
                                    lr=self.hyperparameters["Actor"]["learning_rate"])
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]
        self.add_extra_noise = self.hyperparameters["add_extra_noise"]
        if self.add_extra_noise:
            self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
                                  self.hyperparameters["theta"], self.hyperparameters["sigma"])
        self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]

    def save_result(self):
        """Saves the result of an episode of the game. Overrides the method in Base_Agent
        because we only want to keep track of the results during the evaluation episodes"""
        if self.episode_number == 1 or not self.do_evaluation_iterations:
            self.game_full_episode_scores.extend([self.total_episode_score_so_far])
            self.rolling_results.append(np.mean(
                self.game_full_episode_scores[-1 * self.rolling_score_window:]))
            self.save_max_result_seen()
        elif (self.episode_number - 1) % TRAINING_EPISODES_PER_EVAL_EPISODE == 0:
            self.game_full_episode_scores.extend(
                [self.total_episode_score_so_far for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)])
            self.rolling_results.extend(
                [np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:])
                 for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)])
            self.save_max_result_seen()

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        Base_Agent.reset_game(self)
        if self.add_extra_noise:
            self.noise.reset()

    def step(self):
        """Runs an episode on the game, saving the experience and running a learning step if appropriate"""
        eval_ep = self.episode_number % TRAINING_EPISODES_PER_EVAL_EPISODE == 0 and \
                  self.do_evaluation_iterations
        self.episode_step_number_val = 0
        while not self.done:
            self.episode_step_number_val += 1
            self.action = self.pick_action(eval_ep)
            self.conduct_action(self.action)
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
                    self.learn()
            mask = False if self.episode_step_number_val >= self.environment._max_episode_steps \
                else self.done
            if not eval_ep:
                self.save_experience(experience=(self.state, self.action, self.reward,
                                                 self.next_state, mask))
            self.state = self.next_state
            self.global_step_number += 1
        print(self.total_episode_score_so_far)
        if eval_ep:
            self.print_summary_of_latest_evaluation_episode()
        self.episode_number += 1

    def pick_action(self, eval_ep, state=None):
        """Picks an action using one of three methods:
        1) Randomly if we haven't passed a certain number of steps,
        2) Using the actor in evaluation mode if eval_ep is True,
        3) Using the actor in training mode if eval_ep is False.
        The difference between evaluation and training mode is that training mode does more exploration"""
        if state is None:
            state = self.state
        if eval_ep:
            action = self.actor_pick_action(state=state, eval=True)
        elif self.global_step_number < self.hyperparameters["min_steps_before_learning"]:
            action = self.environment.action_space.sample()
            print("Picking random action ", action)
        else:
            action = self.actor_pick_action(state=state)
        if self.add_extra_noise:
            action += self.noise.sample()
        return action

    def actor_pick_action(self, state=None, eval=False):
        """Uses the actor to pick an action in one of two ways:
        1) If eval = False it picks an action that has partly been randomly sampled,
        2) If eval = True it picks the action that comes directly from the network and so
        did not involve any random sampling"""
        if state is None:
            state = self.state
        state = torch.FloatTensor([state]).to(self.device)
        if len(state.shape) == 1:
            state = state.unsqueeze(0)
        if eval == False:
            action, _, _ = self.produce_action_and_action_info(state)
        else:
            with torch.no_grad():
                _, z, action = self.produce_action_and_action_info(state)
        action = action.detach().cpu().numpy()
        return action[0]

    def produce_action_and_action_info(self, state):
        """Given the state, produces an action, the log probability of the action, and the
        tanh of the mean action"""
        actor_output = self.actor_local(state)
        mean, log_std = actor_output[:, :self.action_size], actor_output[:, self.action_size:]
        std = log_std.exp()
        normal = Normal(mean, std)
        x_t = normal.rsample()  # rsample means it is sampled using the reparameterisation trick
        action = torch.tanh(x_t)
        log_prob = normal.log_prob(x_t)
        log_prob -= torch.log(1 - action.pow(2) + EPSILON)
        log_prob = log_prob.sum(1, keepdim=True)
        return action, log_prob, torch.tanh(mean)

    def time_for_critic_and_actor_to_learn(self):
        """Returns boolean indicating whether there are enough experiences to learn from and
        it is time for the actor and critic to learn"""
        return self.global_step_number > self.hyperparameters["min_steps_before_learning"] and \
               self.enough_experiences_to_learn_from() and \
               self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def learn(self):
        """Runs a learning iteration for the actor, both critics and (if specified) the
        temperature parameter"""
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = \
            self.sample_experiences()
        qf1_loss, qf2_loss = self.calculate_critic_losses(state_batch, action_batch,
                                                          reward_batch, next_state_batch,
                                                          mask_batch)
        policy_loss, log_pi = self.calculate_actor_loss(state_batch)
        if self.automatic_entropy_tuning:
            alpha_loss = self.calculate_entropy_tuning_loss(log_pi)
        else:
            alpha_loss = None
        self.update_all_parameters(qf1_loss, qf2_loss, policy_loss, alpha_loss)

    def sample_experiences(self):
        return self.memory.sample()

    def calculate_critic_losses(self, state_batch, action_batch, reward_batch,
                                next_state_batch, mask_batch):
        """Calculates the losses for the two critics. This is the ordinary Q-learning loss
        except that the additional entropy term is taken into account"""
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = \
                self.produce_action_and_action_info(next_state_batch)
            qf1_next_target = self.critic_target(
                torch.cat((next_state_batch, next_state_action), 1))
            qf2_next_target = self.critic_target_2(
                torch.cat((next_state_batch, next_state_action), 1))
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - \
                self.alpha * next_state_log_pi
            next_q_value = reward_batch + (1.0 - mask_batch) * \
                self.hyperparameters["discount_rate"] * min_qf_next_target
        qf1 = self.critic_local(torch.cat((state_batch, action_batch), 1))
        qf2 = self.critic_local_2(torch.cat((state_batch, action_batch), 1))
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        return qf1_loss, qf2_loss

    def calculate_actor_loss(self, state_batch):
        """Calculates the loss for the actor. This loss includes the additional entropy term"""
        action, log_pi, _ = self.produce_action_and_action_info(state_batch)
        qf1_pi = self.critic_local(torch.cat((state_batch, action), 1))
        qf2_pi = self.critic_local_2(torch.cat((state_batch, action), 1))
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()
        return policy_loss, log_pi

    def calculate_entropy_tuning_loss(self, log_pi):
        """Calculates the loss for the entropy temperature parameter. This is only relevant
        if self.automatic_entropy_tuning is True"""
        alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
        return alpha_loss

    def update_all_parameters(self, critic_loss_1, critic_loss_2, actor_loss, alpha_loss):
        """Updates the parameters for the actor, both critics and (if specified) the
        temperature parameter"""
        self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1,
                                    self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2,
                                    self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.critic_local, self.critic_target,
                                           self.hyperparameters["Critic"]["tau"])
        self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2,
                                           self.hyperparameters["Critic"]["tau"])
        if alpha_loss is not None:
            self.take_optimisation_step(self.alpha_optim, None, alpha_loss, None)
            self.alpha = self.log_alpha.exp()

    def print_summary_of_latest_evaluation_episode(self):
        """Prints a summary of the latest evaluation episode"""
        print(" ")
        print("----------------------------")
        print("Episode score {} ".format(self.total_episode_score_so_far))
        print("----------------------------")
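# A quick numerical illustration (a sketch, not from the source) of the
# change-of-variables correction used in produce_action_and_action_info above:
# for a = tanh(x) with x ~ Normal(mean, std),
# log p(a) = log p(x) - log(1 - tanh(x)^2), and EPSILON guards against log(0).
import torch
from torch.distributions import Normal

normal = Normal(torch.zeros(1), torch.ones(1))
x_t = normal.rsample()              # reparameterised sample, keeps gradients flowing
action = torch.tanh(x_t)            # squash into (-1, 1)
log_prob = normal.log_prob(x_t) - torch.log(1 - action.pow(2) + 1e-6)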
def play_game(train_indicator=1):
    env = environment.Environment()  # Rohit's custom environment
    obs_dim = env.num_states
    act_dim = env.num_actions
    buffer_size = 5000
    batch_size = 32
    gamma = 0.95
    tau = 0.001
    np.random.seed(1337)
    vision = False
    explore = 100000.
    eps_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # actor, critic and buffer
    actor = Actor_Network(env, sess)
    critic = Critic_Network(env, sess)
    replay_buffer = Replay_Buffer()

    # try:
    #     actor.model.load_weights("actormodel.h5")
    #     critic.model.load_weights("criticmodel.h5")
    #     actor.target_model.load_weights("actormodel.h5")
    #     critic.target_model.load_weights("criticmodel.h5")
    #     print("Weight load successfully")
    # except:
    #     print("WOW WOW WOW, Cannot find the weight")

    for e in range(eps_count):
        # receive initial observation state
        s_t = env._reset()  # cos theta, sin theta, theta dot
        s_t = np.asarray(s_t)
        total_reward = 0
        done = False
        step = 0
        while done == False:
            if step > 200:
                break
            loss = 0
            epsilon -= 1.0 / explore
            a_t = np.zeros([1, act_dim])
            noise_t = np.zeros([1, act_dim])
            # select action according to current policy and exploration noise
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.0, 0.60, 0.30)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.0, 0.60, 0.30)
            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            s_t1, r_t, done, _ = env._step(a_t[0])
            s_t1 = np.asarray(s_t1)

            # add to replay buffer
            replay_buffer.add(s_t, a_t[0], r_t, s_t1, done)
            # pdb.set_trace()

            # sample from replay buffer
            batch = replay_buffer.sample_batch()
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + gamma * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1
            step += 1
            print('step: {}'.format(step))

        if np.mod(e, 3) == 0:
            if (train_indicator):
                print('saving model')
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print('episode: ', e, ' total rewards: ', total_reward)

        # Plotting states
        states = env.plotState
        xs = states[:, 0]
        ys = states[:, 1]
        zs = states[:, 2]
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.plot(xs, ys, zs)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('Z')
        # plt.show()
        save_path = './plots/' + str(e) + '.png'
        plt.savefig(save_path)
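# A hedged sketch of the ou_func / OU.function helper the scripts above call:
# a single Ornstein-Uhlenbeck step evaluated at the current action x. The
# (x, mu, theta, sigma) signature is inferred from the call sites, not taken
# from the source.
import numpy as np

def ou_func(x, mu, theta, sigma):
    return theta * (mu - x) + sigma * np.random.randn()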
if i % log_interval == 0:
    print("Episode {}: started at {:.1f}, finished at {:.1f} because {} @ t={}, "
          "last reward {:.1f}, running reward {:.1f}".format(
              i, env.starting_portfolio_value, env.portfolio_value(), msg["msg"],
              env.cur_timestep, reward, running_reward))

dqn = DQN_Agent()
serieslength = 250
env = TradingEnvironment(max_stride=4, series_length=serieslength, starting_cash_mean=100,
                         randomize_cash_std=100, starting_shares_mean=100,
                         randomize_shares_std=10, inaction_penalty=100.0)
BATCH_SIZE = 250

if __name__ == "__main__":
    num_episodes = 50
    gamma = .97
    target_update = 10
    replay_buffer = Replay_Buffer(1000)
    optimizer = optim.RMSprop(dqn.policy_net.parameters())
    train(num_episodes, target_update, gamma, env, dqn, replay_buffer, optimizer)

    # sample trading run
    total_rewards = 0
    total_profits = 0
    failed_goes = 0
    num_goes = 120
    env = TradingEnvironment(max_stride=4, series_length=serieslength, starting_cash_mean=100,
                             randomize_cash_std=100, starting_shares_mean=100,
                             randomize_shares_std=10)
    for i in range(num_goes):
        done = False
        env.reset()
        reward_this_go = 1e-8
        for j in range(0, env.series_length + 1):
class DDQN_Wrapper(Base_Agent):
    def __init__(self, config, global_action_id_to_primitive_actions,
                 action_length_reward_bonus, end_of_episode_symbol="/"):
        super().__init__(config)
        self.end_of_episode_symbol = end_of_episode_symbol
        self.global_action_id_to_primitive_actions = global_action_id_to_primitive_actions
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"], config.seed)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)
        self.oracle = self.create_oracle()
        self.oracle_optimizer = optim.Adam(self.oracle.parameters(),
                                           lr=self.hyperparameters["learning_rate"])
        self.q_network_local = self.create_NN(input_dim=self.state_size + 1,
                                              output_dim=self.action_size)
        self.q_network_local.print_model_summary()
        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                              lr=self.hyperparameters["learning_rate"])
        self.q_network_target = self.create_NN(input_dim=self.state_size + 1,
                                               output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)
        self.action_length_reward_bonus = action_length_reward_bonus
        self.abandon_ship = config.hyperparameters["abandon_ship"]

    def create_oracle(self):
        """Creates the network we will use to predict the next state"""
        oracle_hyperparameters = copy.deepcopy(self.hyperparameters)
        oracle_hyperparameters["columns_of_data_to_be_embedded"] = []
        oracle_hyperparameters["embedding_dimensions"] = []
        oracle_hyperparameters["linear_hidden_units"] = [5, 5]
        oracle_hyperparameters["final_layer_activation"] = [None, "tanh"]
        oracle = self.create_NN(input_dim=self.state_size + 2,
                                output_dim=[self.state_size + 1, 1],
                                hyperparameters=oracle_hyperparameters)
        oracle.print_model_summary()
        return oracle

    def run_n_episodes(self, num_episodes, episodes_to_run_with_no_exploration):
        self.turn_on_any_epsilon_greedy_exploration()
        self.round_of_macro_actions = []
        self.episode_actions_scores_and_exploration_status = []
        num_episodes_to_get_to = self.episode_number + num_episodes
        while self.episode_number < num_episodes_to_get_to:
            self.reset_game()
            self.step()
            self.save_and_print_result()
            if num_episodes_to_get_to - self.episode_number == episodes_to_run_with_no_exploration:
                self.turn_off_any_epsilon_greedy_exploration()
        assert len(self.episode_actions_scores_and_exploration_status) == num_episodes, \
            "{} vs. {}".format(len(self.episode_actions_scores_and_exploration_status),
                               num_episodes)
        assert len(self.episode_actions_scores_and_exploration_status[0]) == 3
        assert self.episode_actions_scores_and_exploration_status[0][2] in [True, False]
        assert isinstance(self.episode_actions_scores_and_exploration_status[0][1], list)
        assert isinstance(self.episode_actions_scores_and_exploration_status[0][1][0], int)
        assert isinstance(self.episode_actions_scores_and_exploration_status[0][0], int) or \
               isinstance(self.episode_actions_scores_and_exploration_status[0][0], float)
        return self.episode_actions_scores_and_exploration_status, self.round_of_macro_actions

    def step(self):
        """Runs a step within a game including a learning step if required"""
        step_number = 0.0
        # Divide by 200 because there are 200 steps in cart pole
        self.state = np.append(self.state, step_number / 200.0)
        self.total_episode_score_so_far = 0
        episode_macro_actions = []
        while not self.done:
            surprised = False
            macro_action = self.pick_action()
            primitive_actions = self.global_action_id_to_primitive_actions[macro_action]
            primitive_actions_conducted = 0
            for ix, action in enumerate(primitive_actions):
                if self.abandon_ship and primitive_actions_conducted > 0:
                    if self.abandon_macro_action(action):
                        break
                step_number += 1
                self.action = action
                self.next_state, self.reward, self.done, _ = self.environment.step(action)
                # Divide by 200 because there are 200 steps in cart pole
                self.next_state = np.append(self.next_state, step_number / 200.0)
                self.total_episode_score_so_far += self.reward
                if self.hyperparameters["clip_rewards"]:
                    self.reward = max(min(self.reward, 1.0), -1.0)
                primitive_actions_conducted += 1
                self.track_episodes_data()
                self.save_experience()
                if len(primitive_actions) > 1:
                    surprised = self.am_i_surprised()
                self.state = self.next_state
                if self.time_for_q_network_to_learn():
                    for _ in range(self.hyperparameters["learning_iterations"]):
                        self.q_network_learn()
                        self.oracle_learn()
                if self.done or surprised:
                    break
            episode_macro_actions.append(macro_action)
            self.round_of_macro_actions.append(macro_action)
        if random.random() < 0.1:
            print(Counter(episode_macro_actions))
        self.save_episode_actions_with_score()
        self.episode_number += 1
        self.logger.info("END OF EPISODE")

    def am_i_surprised(self):
        """Returns boolean indicating whether the next_state was a surprise or not"""
        with torch.no_grad():
            state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
            action = torch.Tensor([[self.action]])
            # must change this for all games besides cart pole
            states_and_actions = torch.cat((state, action), dim=1)
            predictions = self.oracle(states_and_actions)
            predicted_next_state = predictions[0, :-1]
            difference = F.mse_loss(predicted_next_state, torch.Tensor(self.next_state))
            if difference > 0.5:
                print("Surprise! Loss {} -- {} vs. {}".format(
                    difference, predicted_next_state, self.next_state))
                return True
            else:
                return False

    def abandon_macro_action(self, action):
        """Returns boolean indicating whether to abandon macro action or not"""
        state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
        with torch.no_grad():
            primitive_q_values = self.calculate_q_values(state, local=True,
                                                         primitive_actions_only=True)
        q_value_highest = torch.max(primitive_q_values)
        q_values_action = primitive_q_values[:, action]
        if q_value_highest > 0.0:
            multiplier = 0.7
        else:
            multiplier = 1.3
        if q_values_action < multiplier * q_value_highest:
            print("BREAKING Action {} -- Q Values {}".format(action, primitive_q_values))
            return True
        else:
            return False

    def pick_action(self, state=None):
        """Uses the local Q network and an epsilon greedy policy to pick an action"""
        if state is None:
            state = self.state
        if isinstance(state, np.int64) or isinstance(state, int):
            state = np.array([state])
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        if len(state.shape) < 2:
            state = state.unsqueeze(0)
        self.q_network_local.eval()  # puts network in evaluation mode
        with torch.no_grad():
            action_values = self.calculate_q_values(state, local=True,
                                                    primitive_actions_only=False)
        self.q_network_local.train()  # puts network back in training mode
        action = self.exploration_strategy.perturb_action_for_exploration_purposes(
            {"action_values": action_values,
             "turn_off_exploration": self.turn_off_exploration,
             "episode_number": self.episode_number})
        self.logger.info("Q values {} -- Action chosen {}".format(action_values, action))
        return action

    def calculate_q_values(self, states, local, primitive_actions_only):
        """Calculates the q values using the local q network"""
        if local:
            primitive_q_values = self.q_network_local(states)
        else:
            primitive_q_values = self.q_network_target(states)
        num_actions = len(self.global_action_id_to_primitive_actions)
        if primitive_actions_only or num_actions <= self.action_size:
            return primitive_q_values
        extra_q_values = self.calculate_macro_action_q_values(states, num_actions)
        extra_q_values = torch.Tensor([extra_q_values])
        all_q_values = torch.cat((primitive_q_values, extra_q_values), dim=1)
        return all_q_values

    def calculate_macro_action_q_values(self, state, num_actions):
        assert state.shape[0] == 1
        q_values = []
        for action_id in range(self.action_size, num_actions):
            macro_action = self.global_action_id_to_primitive_actions[action_id]
            predicted_next_state = state
            cumulated_reward = 0
            action_ix = 0
            for action in macro_action[:-1]:
                predictions = self.oracle(torch.cat((predicted_next_state,
                                                     torch.Tensor([[action]])), dim=1))
                rewards = predictions[:, -1]
                predicted_next_state = predictions[:, :-1]
                cumulated_reward += (rewards.item() + self.action_length_reward_bonus) * \
                    self.hyperparameters["discount_rate"] ** action_ix
                action_ix += 1
            final_action = macro_action[-1]
            final_q_value = self.q_network_local(predicted_next_state)[0, final_action]
            total_q_value = cumulated_reward + final_q_value * \
                self.hyperparameters["discount_rate"] ** action_ix
            q_values.append(total_q_value)
        return q_values

    def time_for_q_network_to_learn(self):
        """Returns boolean indicating whether enough steps have been taken for learning
        to begin and there are enough experiences in the replay buffer to learn from"""
        return self.right_amount_of_steps_taken() and self.enough_experiences_to_learn_from()

    def right_amount_of_steps_taken(self):
        """Returns boolean indicating whether enough steps have been taken for learning to begin"""
        return self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def q_network_learn(self, experiences=None):
        """Runs a learning iteration for the Q network"""
        if experiences is None:
            states, actions, rewards, next_states, dones = self.sample_experiences()
        else:
            states, actions, rewards, next_states, dones = experiences
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss,
                                    self.hyperparameters["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.q_network_local, self.q_network_target,
                                           self.hyperparameters["tau"])

    def sample_experiences(self):
        """Draws a random sample of experience from the memory buffer"""
        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences
        return states, actions, rewards, next_states, dones

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Computes the loss required to train the Q network"""
        with torch.no_grad():
            max_action_indexes = self.calculate_q_values(
                next_states, local=True, primitive_actions_only=True).detach().argmax(1)
            Q_targets_next = self.calculate_q_values(
                next_states, local=False, primitive_actions_only=True).gather(
                    1, max_action_indexes.unsqueeze(1))
            Q_targets = rewards + (self.hyperparameters["discount_rate"] *
                                   Q_targets_next * (1 - dones))
        # must convert actions to long so they can be used as an index
        Q_expected = self.calculate_q_values(
            states, local=True, primitive_actions_only=True).gather(1, actions.long())
        loss = F.mse_loss(Q_expected, Q_targets)
        return loss

    def save_episode_actions_with_score(self):
        self.episode_actions_scores_and_exploration_status.append([
            self.total_episode_score_so_far,
            self.episode_actions + [self.end_of_episode_symbol],
            self.turn_off_exploration])

    def oracle_learn(self):
        states, actions, rewards, next_states, _ = self.sample_experiences()
        # must change this for all games besides cart pole
        states_and_actions = torch.cat((states, actions), dim=1)
        predictions = self.oracle(states_and_actions)
        loss = F.mse_loss(torch.cat((next_states, rewards), dim=1),
                          predictions) / float(next_states.shape[1] + 1.0)
        self.take_optimisation_step(self.oracle_optimizer, self.oracle, loss,
                                    self.hyperparameters["gradient_clipping_norm"])
        self.logger.info("Oracle Loss {}".format(loss))
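# A hedged sketch of the epsilon-greedy perturbation that pick_action above
# delegates to Epsilon_Greedy_Exploration. Hypothetical interface: the real
# class derives epsilon from the episode number and its own hyperparameters.
import random
import torch

def epsilon_greedy_action(action_values, epsilon, turn_off_exploration=False):
    """action_values: tensor of shape (1, num_actions)."""
    if turn_off_exploration or random.random() > epsilon:
        return int(torch.argmax(action_values, dim=1).item())  # greedy choice
    return random.randrange(action_values.shape[1])  # random exploratory choice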
def train_quad(debug=True):
    env = environment.QuadCopterEnv(debug)  # Rohit's custom environment
    obs_dim = env.num_states
    act_dim = env.num_actions
    buffer_size = 5000
    batch_size = 32
    gamma = 0.98
    tau = 0.001
    np.random.seed(1337)
    vision = False
    explore = 1000  # 100000
    eps_count = 500  # 1000
    max_steps = 40  # 100000
    reward = 0
    done = False
    epsilon = 1
    indicator = 0
    plot_state = False
    plot_reward = True
    episode_rewards = []
    episode = []

    # Configure tensorflow CPU/GPU
    config = tf.ConfigProto(device_count={'GPU': 0})
    sess = tf.Session(config=config)
    # from tensorflow.keras import backend as K
    # K.set_session(sess)
    tf.compat.v1.keras.backend.set_session(sess)

    # Define actor, critic and buffer
    actor = Actor_Network(env, sess)
    critic = Critic_Network(env, sess)
    replay_buffer = Replay_Buffer()

    # Save location
    save_dir = os.path.join(os.getcwd(), save_path)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    os.chdir(save_dir)

    # Plot total reward
    plt.ion()
    plt.title('Training Curve')
    plt.xlabel('Episodes')
    plt.ylabel('Total Reward')
    plt.grid()

    # Episode loop
    for epi in range(eps_count):
        # Receive initial observation state
        s_t = env.reset()  # Initial position info
        s_t = np.asarray(s_t)
        total_reward = 0
        done = False
        step = 0
        # Step loop
        while done == False:
            if step > max_steps:  # Episode length limit
                break
            step += 1
            if debug:
                print('--------------------------------')
                print('step: {}'.format(step))
            loss = 0
            epsilon -= 1.0 / explore  # Reduce every step
            a_t = np.zeros([1, act_dim])
            noise_t = np.zeros([1, act_dim])
            # Select action according to current policy and exploration noise
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            print('epsilon: {}'.format(epsilon))
            # noise_t[0][0] = max(epsilon, 0.0) * ou_func(a_t_original[0][0], 0.0, 0.60, 1)
            # noise_t[0][1] = max(epsilon, 0.0) * ou_func(a_t_original[0][1], 0.0, 0.60, 1)
            # noise_t[0][2] = max(epsilon, 0.0) * ou_func(a_t_original[0][2], 0.0, 0.60, 1)
            noise_t[0][0] = max(epsilon, 0.0) * ou_func(a_t_original[0][0], 0.0, 0.1, 0.4)
            noise_t[0][1] = max(epsilon, 0.0) * ou_func(a_t_original[0][1], 0.0, 0.1, 0.4)
            noise_t[0][2] = max(epsilon, 0.0) * ou_func(a_t_original[0][2], 0.0, 0.1, 0.4)
            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            # a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            a_t[0][2] = 0

            s_t1, r_t, done, _ = env.step(a_t[0])
            s_t1 = np.asarray(s_t1)

            # Add current data to replay buffer
            replay_buffer.add(s_t, a_t[0], r_t, s_t1, done)

            # Sample from replay buffer
            batch = replay_buffer.sample_batch()
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])  # Just an empty array with the same shape

            # Calculate target Q values; y_t acts as the training label
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + gamma * target_q_values[k]

            # Train critic model
            loss += critic.model.train_on_batch([states, actions], y_t)
            a_for_grad = actor.model.predict(states)
            grads = critic.gradients(states, a_for_grad)
            actor.train(states, grads)
            actor.target_train()
            critic.target_train()

            total_reward += r_t
            s_t = s_t1

        # Episode finished; save models periodically
        if (epi + 1) % 50 == 0:
            a_model_name = '%d_actor_model.h5' % (epi + 1)
            c_model_name = '%d_critic_model.h5' % (epi + 1)
            filepath = os.path.join(save_dir, a_model_name)
            actor.model.save(a_model_name)
            critic.model.save(c_model_name)

        print('episode: {}, num_steps: {}, total rewards: {:.2f}, final state: ({:.2f},{:.2f},{:.2f})'
              .format(epi + 1, step, total_reward, s_t[0], s_t[1], s_t[2]))

        if plot_reward:
            episode_rewards.append(total_reward)
            episode.append(epi + 1)
            plt.plot(episode, episode_rewards, 'b')
            plt.pause(0.001)
            plt.savefig("Training Curve.png")
def train_quad(debug=True):
    env = environment.Environment(debug)  # Rohit's custom quadrotor environment
    obs_dim = env.num_states
    act_dim = env.num_actions

    # Hyperparameters
    buffer_size = 5000
    batch_size = 32
    gamma = 0.98       # discount factor
    tau = 0.001        # soft target-update rate
    np.random.seed(1337)
    vision = False
    explore = 100000   # steps over which exploration noise is annealed
    eps_count = 1000
    max_steps = 100000
    reward = 0
    done = False
    epsilon = 1
    indicator = 0
    plot_state = False
    plot_reward = True
    episode_rewards = []
    episode = []

    # TensorFlow GPU optimisation
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Actor, critic and replay buffer
    actor = Actor_Network(env, sess)
    critic = Critic_Network(env, sess)
    replay_buffer = Replay_Buffer()

    # try:
    #     actor.model.load_weights("actormodel.h5")
    #     critic.model.load_weights("criticmodel.h5")
    #     actor.target_model.load_weights("actormodel.h5")
    #     critic.target_model.load_weights("criticmodel.h5")
    #     print("Weight load successfully")
    # except:
    #     print("WOW WOW WOW, Cannot find the weight")

    # Directory for periodic model checkpoints
    timestr = time.strftime("%Y%m%d-%H%M%S")
    save_path = 'saved_models_rohit_' + timestr
    save_dir = os.path.join(os.getcwd(), save_path)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    os.chdir(save_dir)

    plt.ion()
    plt.title('Training Curve')
    plt.xlabel('Episodes')
    plt.ylabel('Total Reward')
    plt.grid()

    for epi in range(eps_count):
        # Receive initial observation state: cos(theta), sin(theta), theta_dot
        s_t = np.asarray(env._reset())
        total_reward = 0
        done = False
        step = 0
        while not done:
            if step > 200:
                break
            step += 1
            if debug:
                print('--------------------------------')
                print('step: {}'.format(step))
            loss = 0
            epsilon -= 1.0 / explore
            a_t = np.zeros([1, act_dim])
            noise_t = np.zeros([1, act_dim])

            # Select action according to the current policy, plus OU exploration
            # noise that is annealed towards zero as epsilon decays
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            for i in range(act_dim):
                noise_t[0][i] = max(epsilon, 0) * ou_func(a_t_original[0][i], 0.0, 0.60, 0.30)
                a_t[0][i] = a_t_original[0][i] + noise_t[0][i]

            s_t1, r_t, done, _ = env._step(a_t[0])
            s_t1 = np.asarray(s_t1)

            # Add transition to the replay buffer, then sample a minibatch
            replay_buffer.add(s_t, a_t[0], r_t, s_t1, done)
            batch = replay_buffer.sample_batch()
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])

            # TD targets y_t = r + gamma * Q'(s', mu'(s')), bootstrapped unless terminal
            # (y_t was previously initialised from the actions column, which had the
            # wrong shape for the critic's scalar output)
            y_t = np.zeros((len(batch), 1))
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + gamma * target_q_values[k]

            # Critic update, then actor update along the critic's action-gradient,
            # then soft updates of both target networks
            loss += critic.model.train_on_batch([states, actions], y_t)
            a_for_grad = actor.model.predict(states)
            grads = critic.gradients(states, a_for_grad)
            actor.train(states, grads)
            actor.target_train()
            critic.target_train()

            total_reward += r_t
            s_t = s_t1

        # Checkpoint the models every 50 episodes
        if (epi + 1) % 50 == 0:
            a_model_name = '%d_actor_model.h5' % (epi + 1)
            c_model_name = '%d_critic_model.h5' % (epi + 1)
            actor.model.save(os.path.join(save_dir, a_model_name))
            critic.model.save(os.path.join(save_dir, c_model_name))
            # print('saving model')
            # actor.model.save_weights("actormodel.h5", overwrite=True)
            # with open("actormodel.json", "w") as outfile:
            #     json.dump(actor.model.to_json(), outfile)
            # critic.model.save_weights("criticmodel.h5", overwrite=True)
            # with open("criticmodel.json", "w") as outfile:
            #     json.dump(critic.model.to_json(), outfile)

        print('episode: {}, num_steps: {}, total rewards: {:.2f}, final state: ({:.2f},{:.2f},{:.2f})'
              .format(epi + 1, step, total_reward, s_t[0], s_t[1], s_t[2]))

        ############# Plotting states ############
        # if plot_state:
        #     states = env.plotState
        #     xs, ys, zs = states[:, 0], states[:, 1], states[:, 2]
        #     fig = plt.figure()
        #     ax = fig.add_subplot(111, projection='3d')
        #     ax.plot(xs, ys, zs)
        #     ax.set_xlabel('X')
        #     ax.set_ylabel('Y')
        #     ax.set_zlabel('Z')
        #     plt.savefig('./plots/' + str(epi) + '.png')
        ##########################################

        ############# Plotting rewards ###########
        if plot_reward:
            episode_rewards.append(total_reward)
            episode.append(epi + 1)
            plt.plot(episode, episode_rewards, 'b')
            plt.pause(0.001)
            plt.savefig("Training Curve.png")
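# The loop above calls ou_func(x, mu, theta, sigma), which is defined elsewhere in
# the repo. Below is a minimal sketch consistent with the (x, 0.0, 0.60, 0.30) call
# signature used here; the body is an assumption, not the repo's actual helper.

import numpy as np

def ou_func(x, mu, theta, sigma):
    """One Ornstein-Uhlenbeck increment: mean-reverting drift towards mu plus
    Gaussian noise. x is the current action component being perturbed."""
    return theta * (mu - x) + sigma * np.random.randn()

# With mu = 0 the drift term pulls the perturbed action back towards zero, so the
# noise stays temporally correlated but bounded; the max(epsilon, 0) factor in the
# training loop then anneals its magnitude to zero over the first `explore` steps.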
class DDPG(Base_Agent):
    """A DDPG Agent"""
    agent_name = "DDPG"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                           key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                            key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.hyperparameters["Critic"]["learning_rate"])
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                    self.hyperparameters["batch_size"], self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                           key_to_use="Actor")
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"])
        self.exploration_strategy = OU_Noise_Exploration(self.config)

    def step(self):
        """Runs a step in the game"""
        while not self.done:
            # print("State ", self.state.shape)
            self.action = self.pick_action()
            self.conduct_action(self.action)
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
                    states, actions, rewards, next_states, dones = self.sample_experiences()
                    self.critic_learn(states, actions, rewards, next_states, dones)
                    self.actor_learn(states)
            self.save_experience()
            self.state = self.next_state  # set the state for the next iteration
            self.global_step_number += 1
        self.episode_number += 1

    def sample_experiences(self):
        return self.memory.sample()

    def pick_action(self, state=None):
        """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
        if state is None:
            state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action": action})
        return action.squeeze(0)

    def critic_learn(self, states, actions, rewards, next_states, dones):
        """Runs a learning iteration for the critic"""
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(self.critic_optimizer, self.critic_local, loss,
                                    self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.critic_local, self.critic_target,
                                           self.hyperparameters["Critic"]["tau"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Computes the loss for the critic"""
        with torch.no_grad():
            critic_targets = self.compute_critic_targets(next_states, rewards, dones)
        critic_expected = self.compute_expected_critic_values(states, actions)
        loss = functional.mse_loss(critic_expected, critic_targets)
        return loss

    def compute_critic_targets(self, next_states, rewards, dones):
        """Computes the critic target values to be used in the loss for the critic"""
        critic_targets_next = self.compute_critic_values_for_next_states(next_states)
        critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones)
        return critic_targets

    def compute_critic_values_for_next_states(self, next_states):
        """Computes the critic values for next states to be used in the loss for the critic"""
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            critic_targets_next = self.critic_target(torch.cat((next_states, actions_next), 1))
        return critic_targets_next

    def compute_critic_values_for_current_states(self, rewards, critic_targets_next, dones):
        """Computes the critic values for current states to be used in the loss for the critic"""
        critic_targets_current = rewards + (
            self.hyperparameters["discount_rate"] * critic_targets_next * (1.0 - dones))
        return critic_targets_current

    def compute_expected_critic_values(self, states, actions):
        """Computes the expected critic values to be used in the loss for the critic"""
        critic_expected = self.critic_local(torch.cat((states, actions), 1))
        return critic_expected

    def time_for_critic_and_actor_to_learn(self):
        """Returns a boolean indicating whether there are enough experiences to learn from
        and it is time for the actor and critic to learn"""
        return self.enough_experiences_to_learn_from() and \
               self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def actor_learn(self, states):
        """Runs a learning iteration for the actor"""
        if self.done:  # we only update the learning rate at the end of each episode
            self.update_learning_rate(self.hyperparameters["Actor"]["learning_rate"], self.actor_optimizer)
        actor_loss = self.calculate_actor_loss(states)
        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.actor_local, self.actor_target,
                                           self.hyperparameters["Actor"]["tau"])

    def calculate_actor_loss(self, states):
        """Calculates the loss for the actor"""
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(torch.cat((states, actions_pred), 1)).mean()
        return actor_loss
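# Both critic_learn and actor_learn finish with soft_update_of_target_network, which
# lives in Base_Agent rather than in this class. For reference, a sketch of the
# standard Polyak-averaging update such a method performs; this mirrors the usual
# implementation but is not quoted from the base class itself.

def soft_update_of_target_network(self, local_model, target_model, tau):
    """Polyak averaging: theta_target <- tau * theta_local + (1 - tau) * theta_target.
    The target networks trail the learned ones, stabilising the TD targets."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

# With a small tau (e.g. the 0.001 used elsewhere in this document), the targets move
# only a fraction of the way towards the local networks after every learning step.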
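# For orientation, a hypothetical wiring of the DDPG agent above. The Config
# container and every field and hyperparameter key below are inferred from the
# attributes the class reads (config.seed, nested "Actor"/"Critic" dicts), not
# copied from the surrounding repo; create_NN will additionally expect
# network-architecture keys that are omitted here.

import gym

class Config:
    """Hypothetical plain-attribute container; the real repo supplies its own."""
    pass

config = Config()
config.seed = 1
config.environment = gym.make("Pendulum-v0")  # any continuous-action environment
config.hyperparameters = {
    "batch_size": 256,
    "discount_rate": 0.99,
    "update_every_n_steps": 1,
    "learning_updates_per_learning_session": 1,
    "Actor": {"learning_rate": 3e-4, "gradient_clipping_norm": 5.0, "tau": 5e-3},
    "Critic": {"learning_rate": 3e-4, "gradient_clipping_norm": 5.0, "tau": 5e-3,
               "buffer_size": 1_000_000},
}

agent = DDPG(config)  # episodes are then driven by agent.step() in the training loop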