Example #1
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size +
                                            self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"])
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed,
            self.config.use_GPU)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Actor")
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"])
        self.exploration_strategy = OU_Noise_Exploration(self.config)
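Every snippet in this listing constructs a Replay_Buffer, but none of them shows the class itself. The sketch below is only an assumption about the interface the examples rely on (a (buffer_size, batch_size, seed) constructor, an add/sample pair and len()); the real implementation may differ.

import random
from collections import deque, namedtuple

import numpy as np
import torch


class MinimalReplayBuffer:
    """Minimal sketch of the buffer interface the examples above appear to assume."""

    def __init__(self, buffer_size, batch_size, seed, device="cpu"):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add_experience(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        stack = lambda xs: torch.from_numpy(np.vstack(xs)).float().to(self.device)
        return (stack([e.state for e in batch]),
                stack([e.action for e in batch]),
                stack([e.reward for e in batch]),
                stack([e.next_state for e in batch]),
                stack([float(e.done) for e in batch]))

    def __len__(self):
        return len(self.memory)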
Example #2
 def __init__(self, config):
     Base_Agent.__init__(self, config)
     self.hyperparameters = config.hyperparameters
     self.critic_local = Neural_Network(self.state_size + self.action_size,
                                        1, self.random_seed,
                                        self.hyperparameters["Critic"],
                                        "VANILLA_NN").to(self.device)
     self.critic_target = copy.deepcopy(self.critic_local).to(self.device)
     self.critic_optimizer = optim.Adam(
         self.critic_local.parameters(),
         lr=self.hyperparameters["Critic"]["learning_rate"])
     self.memory = Replay_Buffer(
         self.hyperparameters["Critic"]["buffer_size"],
         self.hyperparameters["batch_size"], self.random_seed)
     self.actor_local = Neural_Network(self.state_size, self.action_size,
                                       self.random_seed,
                                       self.hyperparameters["Actor"],
                                       "VANILLA_NN").to(self.device)
     self.actor_target = copy.deepcopy(self.actor_local).to(self.device)
     self.actor_optimizer = optim.Adam(
         self.actor_local.parameters(),
         lr=self.hyperparameters["Actor"]["learning_rate"])
     self.noise = OU_Noise(self.action_size, self.random_seed,
                           self.hyperparameters["mu"],
                           self.hyperparameters["theta"],
                           self.hyperparameters["sigma"])
Example #3
    def __init__(self,
                 config,
                 global_action_id_to_primitive_actions,
                 action_length_reward_bonus,
                 end_of_episode_symbol="/"):
        super().__init__(config)
        self.end_of_episode_symbol = end_of_episode_symbol
        self.global_action_id_to_primitive_actions = global_action_id_to_primitive_actions
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"],
                                    config.seed)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)

        self.oracle = self.create_oracle()
        self.oracle_optimizer = optim.Adam(
            self.oracle.parameters(), lr=self.hyperparameters["learning_rate"])

        self.q_network_local = self.create_NN(input_dim=self.state_size + 1,
                                              output_dim=self.action_size)
        self.q_network_local.print_model_summary()
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
        self.q_network_target = self.create_NN(input_dim=self.state_size + 1,
                                               output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)

        self.action_length_reward_bonus = action_length_reward_bonus
        self.abandon_ship = config.hyperparameters["abandon_ship"]
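Base_Agent.copy_model_over is called in several examples but never defined here. A hard weight copy, as sketched below, is the usual implementation; a later snippet achieves the same effect with load_state_dict on a deep copy of the state dict.

import torch.nn as nn


def copy_model_over(from_model: nn.Module, to_model: nn.Module) -> None:
    """Hard update: overwrite to_model's parameters with from_model's."""
    for to_param, from_param in zip(to_model.parameters(), from_model.parameters()):
        to_param.data.copy_(from_param.data.clone())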
Example #4
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions"
        assert self.config.hyperparameters["Actor"][
            "final_layer_activation"] == "Softmax", "Final actor layer must be softmax"
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Critic")
        self.critic_local_2 = self.create_NN(input_dim=self.state_size,
                                             output_dim=self.action_size,
                                             key_to_use="Critic",
                                             override_seed=self.config.seed +
                                             1)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_optimizer_2 = torch.optim.Adam(
            self.critic_local_2.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_target = self.create_NN(input_dim=self.state_size,
                                            output_dim=self.action_size,
                                            key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size,
                                              output_dim=self.action_size,
                                              key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed,
            self.config.use_GPU)

        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"])
        self.automatic_entropy_tuning = self.hyperparameters[
            "automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.environment.action_space.shape).to(
                    self.device)).item()  # heuristic value from the paper
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam(
                [self.log_alpha],
                lr=self.hyperparameters["Actor"]["learning_rate"])
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]
        assert not self.hyperparameters[
            "add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at the moment"
        self.add_extra_noise = False
        self.do_evaluation_iterations = self.hyperparameters[
            "do_evaluation_iterations"]
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_target.load_state_dict(copy.deepcopy(self.critic_local.state_dict()))

        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.hyperparameters["Critic"]["learning_rate"])
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_target.load_state_dict(copy.deepcopy(self.actor_local.state_dict()))

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"])
        self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
                              self.hyperparameters["theta"], self.hyperparameters["sigma"])
    def __init__(self, config, num_agents, input_dim, hidden_dim, output_dim):

        self.num_agents = num_agents
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.config = config

        # Replay buffer related parameters
        self.batch_size = config['batch_size']
        self.buffer_size = config['buffer_size']
        self.buffer = Replay_Buffer(self.buffer_size, self.batch_size)

        self.lr = config['lr']
        self.tau = config['tau']
        self.agents = []

        self.update_step = config['update_step']
        self.curr_step = 0
        self._init_agents()
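The constructor above (it reappears below as Basic_Agents.__init__) takes its settings from a plain dict rather than a config object. The keys shown here are exactly the ones the snippet reads; the values are illustrative only, and the Dueling_DDQN_Learner it builds will likely read further keys that are not visible in this snippet.

config = {
    "batch_size": 64,
    "buffer_size": 100_000,
    "lr": 1e-3,
    "tau": 0.01,
    "update_step": 4,
    # ... plus whatever Dueling_DDQN_Learner expects from the same dict
}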
Example #7
    def put_adapted_experiences_in_a_replay_buffer(self, action_id_to_actions):
        """Adds experiences to the replay buffer after re-imagining that the actions taken were macro-actions according to
         action_rules as well as primitive actions.

         NOTE that we want to put both primitive actions and macro-actions into the replay buffer so that it can learn that
         it's better to do a macro-action rather than the same primitive actions (which we will enforce with a reward penalty)
         """

        actions_to_action_id = {v: k for k, v in action_id_to_actions.items()}

        self.num_actions = len(action_id_to_actions)

        print(actions_to_action_id)

        for key in actions_to_action_id.keys():
            assert isinstance(key, tuple)
            assert isinstance(actions_to_action_id[key], int)

        episodes = len(self.states)
        for data_type in [
                self.states, self.next_states, self.rewards, self.actions,
                self.dones
        ]:
            assert len(data_type) == episodes

        max_action_length = self.calculate_max_action_length(
            actions_to_action_id)

        if self.action_balanced_replay_buffer:
            print("Using action balanced replay buffer")
            replay_buffer = Action_Balanced_Replay_Buffer(
                self.buffer_size,
                self.batch_size,
                self.seed,
                num_actions=self.num_actions,
                use_GPU=self.use_GPU)  # was positional after a keyword argument, which is a SyntaxError
        else:
            print("Using ordinary replay buffer")
            replay_buffer = Replay_Buffer(self.buffer_size, self.batch_size,
                                          self.seed)

        for episode_ix in range(episodes):
            self.add_adapted_experience_for_an_episode(episode_ix,
                                                       actions_to_action_id,
                                                       max_action_length,
                                                       replay_buffer)

        return replay_buffer
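Action_Balanced_Replay_Buffer is only constructed above, not defined. Its name and the num_actions argument suggest one queue per action id, sampled evenly so that rare macro-actions are not drowned out by frequent primitive ones; the sketch below illustrates that idea only and is not the class's actual code.

import random
from collections import deque


class ActionBalancedBufferSketch:
    """Keeps a separate queue per action id and samples each queue evenly."""

    def __init__(self, buffer_size, batch_size, seed, num_actions):
        random.seed(seed)
        self.batch_size = batch_size
        self.buffers = {a: deque(maxlen=max(1, buffer_size // num_actions))
                        for a in range(num_actions)}

    def add_experience(self, state, action, reward, next_state, done):
        self.buffers[action].append((state, action, reward, next_state, done))

    def sample(self):
        per_action = max(1, self.batch_size // len(self.buffers))
        batch = []
        for buf in self.buffers.values():
            if buf:
                batch.extend(random.sample(buf, k=min(per_action, len(buf))))
        return batch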
    def __init__(self,
                 env,
                 sess,
                 batch_size=32,
                 tau=0.125,
                 learning_rate=0.0001):
        self.env = env
        self.sess = sess

        self.obs_dim = self.env.num_states
        self.act_dim = self.env.num_actions

        # hyperparameters
        self.lr = learning_rate
        self.bs = batch_size
        self.eps = 1.0
        self.eps_decay = 0.995
        self.gamma = 0.95
        self.tau = tau
        self.buffer_size = 5000
        self.hidden_dim = 32

        # replay buffer
        self.replay_buffer = Replay_Buffer(self.buffer_size)

        # create model
        self.model, self.weights, self.state = self.create_actor()
        self.target_model, self.target_weights, self.target_state = self.create_actor(
        )

        # gradients
        self.action_gradient = tf.placeholder(tf.float32, [None, self.act_dim])
        self.params_grad = tf.gradients(
            self.model.output, self.weights,
            -self.action_gradient)  # negative sign for gradient ascent
        grads = zip(self.params_grad, self.weights)

        # optimizer & run
        self.optimize = tf.train.AdamOptimizer(self.lr).apply_gradients(grads)
        self.sess.run(tf.initialize_all_variables())

        self.writer = tf.summary.FileWriter("./logs",
                                            graph=tf.get_default_graph())
        self.merge_op = tf.summary.merge_all()
class Basic_Agents:
    def __init__(self, config, num_agents, input_dim, hidden_dim, output_dim):

        self.num_agents = num_agents
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.config = config

        # Replay buffer related parameters
        self.batch_size = config['batch_size']
        self.buffer_size = config['buffer_size']
        self.buffer = Replay_Buffer(self.buffer_size, self.batch_size)

        self.lr = config['lr']
        self.tau = config['tau']
        self.agents = []

        self.update_step = config['update_step']
        self.curr_step = 0
        self._init_agents()

    def _init_agents(self):
        self.embedding = Embedding_Layer(self.input_dim,
                                         self.hidden_dim).to(self.device)
        self.embedding_target = Embedding_Layer(
            self.input_dim, self.hidden_dim).to(self.device)
        Dueling_DDQN_Learner.copy_network(self.embedding,
                                          self.embedding_target)

        self.share_para = self.embedding.parameters()
        self.all_para = self.embedding.parameters()
        # init the optimizer
        for i in range(self.num_agents):
            self.agents.append(Dueling_DDQN_Learner(self.config))
            self.all_para = chain(self.all_para,
                                  self.agents[i].get_q_network().parameters())
            # para = chain(self.embedding.parameters(), self.agents[i].get_q_network().parameters())
            # self.optimizer.append(optim.Adam(self.agents[i].get_q_network().parameters(), lr=1e-3))
        # self.all_para = chain(self.all_para)
        self.share_optimizer = optim.RMSprop(self.all_para,
                                             lr=self.lr,
                                             weight_decay=1e-4)

    def get_agent(self, i):
        return self.agents[i]

    def step(self, state, test=False):
        state_embedding = self._get_embedding(state)
        action = []
        for i in range(self.num_agents):
            action.append(self.agents[i].step(state_embedding[:, i], test))
        action = np.asarray(action)
        self.curr_step += 1
        return action

    def learn(self):
        # if self.curr_step > 0 and self.curr_step % self.update_step == 0:
        for i in range(self.update_step):
            states, actions, rewards, next_states, is_dones = self.sample_experience(
            )
            actions = torch.from_numpy(actions).long().to(self.device)
            rewards = torch.from_numpy(rewards).float().to(self.device)
            is_dones = torch.from_numpy(is_dones).float().to(self.device)
            states_embedding = self._get_embedding(states)
            next_states_embedding = self._get_embedding(next_states)
            next_states_embedding_target = self._get_embedding_target(
                next_states)
            total_loss = 0
            for i in range(self.num_agents):
                actions_values_current = self.agents[
                    i].cal_current_actions_value(
                        next_states_embedding[:, i],
                        next_states_embedding_target[:, i], rewards[:, i],
                        is_dones)
                actions_values_expected = self.agents[
                    i].cal_expected_actions_value(states_embedding[:, i],
                                                  actions[:, i])
                loss = F.mse_loss(actions_values_expected,
                                  actions_values_current)
                # loss.backward(retain_graph=True)
                total_loss += loss
                # backpropagation
                # self.optimizer[i].zero_grad()
            self.share_optimizer.zero_grad()
            total_loss.backward()
            # self._scale_shared_grads()
            torch.nn.utils.clip_grad_value_(self.all_para, 1)
            self.share_optimizer.step()
            for i in range(self.num_agents):
                # torch.nn.utils.clip_grad_value_(self.agents[i].q_network_current.parameters(), 1)
                # self.optimizer[i].step()
                # update the target net
                Dueling_DDQN_Learner.soft_update_of_target_network(
                    self.agents[i].q_network_current,
                    self.agents[i].q_network_target, self.tau)
            self._update_sharing_target_network()
            # self.share_optimizer.zero_grad()

    def get_share_para(self):
        return dict(self.embedding.named_parameters())

    def store_experience(self, states, actions, rewards, next_states,
                         is_dones):
        self.buffer.store_experience(states, actions, rewards, next_states,
                                     is_dones)

    def sample_experience(self):
        states, actions, rewards, next_states, is_dones = self.buffer.sample_experience(
        )
        return states, actions, rewards, next_states, is_dones

    def _get_embedding(self, state):
        return self.embedding(state)

    def _get_embedding_target(self, state):
        return self.embedding_target(state)

    def _update_sharing_target_network(self):
        Dueling_DDQN_Learner.soft_update_of_target_network(
            self.embedding, self.embedding_target, self.tau)

    def get_attention_score(self, i):
        return -1

    def _scale_shared_grads(self):
        """
        Scale gradients for parameters that are shared since they accumulate
        gradients from the critic loss function multiple times
        """
        for p in self.share_para:
            p.grad.data.mul_(1. / self.num_agents)

    def save_model(self, path):
        share_model_name = path + '/share_model.pkl'
        torch.save(self.embedding.state_dict(), share_model_name)
        for i in range(self.num_agents):
            unique_model_name = path + '/q_network_%d.pkl' % i
            torch.save(self.agents[i].q_network_current.state_dict(),
                       unique_model_name)

    def load_model(self, path):
        share_model_name = path + '/share_model.pkl'
        self.embedding.load_state_dict(
            torch.load(share_model_name, map_location=self.device))
        for i in range(self.num_agents):
            unique_model_name = path + '/q_network_%d.pkl' % i
            self.agents[i].q_network_current.load_state_dict(
                torch.load(unique_model_name, map_location=self.device))
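Dueling_DDQN_Learner.soft_update_of_target_network is used throughout the class above but not shown. The standard Polyak averaging it presumably performs is sketched below.

import torch.nn as nn


def soft_update_of_target_network(local_model: nn.Module,
                                  target_model: nn.Module,
                                  tau: float) -> None:
    """theta_target <- tau * theta_local + (1 - tau) * theta_target."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)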
Example #10
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_local_2 = self.create_NN(
            input_dim=self.state_size + self.action_size,
            output_dim=1,
            key_to_use="Critic",
            override_seed=self.config.seed + 1)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_optimizer_2 = torch.optim.Adam(
            self.critic_local_2.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_target = self.create_NN(input_dim=self.state_size +
                                            self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size +
                                              self.action_size,
                                              output_dim=1,
                                              key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size * 2,
                                          key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"])
        self.target_entropy = -np.prod(
            self.environment.action_space.shape).item(
            )  # heuristic value from the paper
        self.automatic_entropy_tuning = self.hyperparameters[
            "automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.environment.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam(
                [self.log_alpha],
                lr=self.hyperparameters["Actor"]["learning_rate"])
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]

        self.add_extra_noise = self.hyperparameters["add_extra_noise"]
        if self.add_extra_noise:
            self.noise = OU_Noise(self.action_size, self.config.seed,
                                  self.hyperparameters["mu"],
                                  self.hyperparameters["theta"],
                                  self.hyperparameters["sigma"])

        self.do_evaluation_iterations = self.hyperparameters[
            "do_evaluation_iterations"]
Example #11
class DDPG_Agent(Base_Agent):
    agent_name = "DDPG"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = Neural_Network(self.state_size + self.action_size,
                                           1, self.random_seed,
                                           self.hyperparameters["Critic"],
                                           "VANILLA_NN").to(self.device)
        self.critic_target = copy.deepcopy(self.critic_local).to(self.device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"])
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.random_seed)
        self.actor_local = Neural_Network(self.state_size, self.action_size,
                                          self.random_seed,
                                          self.hyperparameters["Actor"],
                                          "VANILLA_NN").to(self.device)
        self.actor_target = copy.deepcopy(self.actor_local).to(self.device)
        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"])
        self.noise = OU_Noise(self.action_size, self.random_seed,
                              self.hyperparameters["mu"],
                              self.hyperparameters["theta"],
                              self.hyperparameters["sigma"])

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        Base_Agent.reset_game(self)
        self.noise.reset()

    def step(self):
        """Runs a step in the game"""
        while not self.done:
            self.pick_and_conduct_action()
            self.update_next_state_reward_done_and_score()
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters[
                        "learning_updates_per_learning_session"]):
                    states, actions, rewards, next_states, dones = self.memory.sample(
                    )  # Sample experiences
                    self.critic_learn(states, actions, rewards, next_states,
                                      dones)
                    self.actor_learn(states)
            self.save_experience()
            self.state = self.next_state  # set the state for the next iteration
            self.episode_step_number += 1
        self.episode_number += 1

    def pick_action(self):
        """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
        state = torch.from_numpy(self.state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action += self.noise.sample()
        return action

    def critic_learn(self, states, actions, rewards, next_states, dones):
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(
            self.critic_optimizer, self.critic_local, loss,
            self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(
            self.critic_local, self.critic_target,
            self.hyperparameters["Critic"]["tau"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        with torch.no_grad():
            critic_targets = self.compute_critic_targets(
                next_states, rewards, dones)
        critic_expected = self.compute_expected_critic_values(states, actions)
        loss = functional.mse_loss(critic_expected, critic_targets)
        return loss

    def compute_critic_targets(self, next_states, rewards, dones):
        critic_targets_next = self.compute_critic_values_for_next_states(
            next_states)
        critic_targets = self.compute_critic_values_for_current_states(
            rewards, critic_targets_next, dones)
        return critic_targets

    def compute_critic_values_for_next_states(self, next_states):
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            critic_targets_next = self.critic_target(
                torch.cat((next_states, actions_next), 1))
        return critic_targets_next

    def compute_critic_values_for_current_states(self, rewards,
                                                 critic_targets_next, dones):
        critic_targets_current = rewards + (
            self.hyperparameters["discount_rate"] * critic_targets_next *
            (1 - dones))
        return critic_targets_current

    def compute_expected_critic_values(self, states, actions):
        critic_expected = self.critic_local(torch.cat((states, actions), 1))
        return critic_expected

    def time_for_critic_and_actor_to_learn(self):
        return self.enough_experiences_to_learn_from(
        ) and self.episode_step_number % self.hyperparameters[
            "update_every_n_steps"] == 0

    def actor_learn(self, states):
        if self.done:  # we only update the learning rate at the end of each episode
            self.update_learning_rate(
                self.hyperparameters["Actor"]["learning_rate"],
                self.actor_optimizer)
        actor_loss = self.calculate_actor_loss(states)
        self.take_optimisation_step(
            self.actor_optimizer, self.actor_local, actor_loss,
            self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(
            self.actor_local, self.actor_target,
            self.hyperparameters["Actor"]["tau"])

    def calculate_actor_loss(self, states):
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(torch.cat(
            (states, actions_pred), 1)).mean()
        return actor_loss
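For reference, these are the hyperparameter keys the DDPG agent above actually reads, collected into one dict. The values are placeholders, not recommendations from the original source, and the nested Actor/Critic dicts are also handed to Neural_Network, which will expect additional architecture keys not listed here.

ddpg_hyperparameters = {
    "batch_size": 256,
    "discount_rate": 0.99,
    "update_every_n_steps": 1,
    "learning_updates_per_learning_session": 1,
    "mu": 0.0, "theta": 0.15, "sigma": 0.2,   # OU noise parameters
    "Actor": {"learning_rate": 3e-4, "gradient_clipping_norm": 5, "tau": 5e-3},
    "Critic": {"learning_rate": 3e-4, "buffer_size": 1_000_000,
               "gradient_clipping_norm": 5, "tau": 5e-3},
}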
#state = state.unsqueeze(1) 
"""

"""
_____TESTING ENSEMBLE CRITIC_____
ensemble_critic = EnsembleCritic(1, 8, 1)
state = torch.FloatTensor(env.reset()).unsqueeze(0)
action = torch.FloatTensor(1).unsqueeze(1)
all_qs = ensemble_critic(state, action)
print(all_qs)
"""
ANNEAL_RATE = .00003
TEMP_MIN = 0.005  
default_temp = 1.0
log_interval = 10 
replay = Replay_Buffer(1000)
state = torch.FloatTensor(env.reset()).unsqueeze(0) 
num_qs = 1
state_dim = 8 
num_samples_match = 10 
NUM_EPISODES = 50  
batch_size = 250  
bear = BEAR(num_qs, state_dim, 1, 10, ANNEAL_RATE, TEMP_MIN, default_temp)
running_rewards = [] 

if __name__ == "__main__":
    
    running_reward = 0  
    for i in range(NUM_EPISODES): 
        state = env.reset()
        done = False
    env = gym.make(env_name)
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]

    device = torch.device("cuda: %d" % gpu_id if use_cuda else "cpu")

    # critic
    critic_local = Critic(n_states, n_actions).to(device)
    critic_target = Critic(n_states, n_actions).to(device)
    model_deep_copy(from_model=critic_local, to_model=critic_target)

    optim_critic = optim.Adam(critic_local.parameters(),
                              lr=lr_critic,
                              eps=1e-4)

    memory = Replay_Buffer(buffer_size, batch_size, mem_seed)

    # actor
    actor_local = Actor(n_states).to(device)
    actor_target = Actor(n_states).to(device)
    model_deep_copy(from_model=actor_local, to_model=actor_target)

    optim_actor = optim.Adam(actor_local.parameters(), lr=lr_actor, eps=1e-4)

    # ou noise
    ou_noise = OU_Noise(size=n_actions,
                        seed=ou_seed,
                        mu=mu,
                        theta=theta,
                        sigma=sigma)
    ou_noise.reset()
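The script above stops after building the networks, buffer and noise process. Below is a hedged sketch of the DDPG-style interaction loop that would typically follow; it only reuses names created above, while gamma, the buffer's add_experience/sample/len interface, device placement of the sampled tensors and the 4-tuple env.step return value are assumptions.

import torch
import torch.nn.functional as F

gamma = 0.99  # discount factor (assumed, not defined in the snippet above)

state = env.reset()
for t in range(100_000):
    with torch.no_grad():
        action = actor_local(torch.FloatTensor(state).unsqueeze(0).to(device))
    action = action.cpu().numpy()[0] + ou_noise.sample()
    next_state, reward, done, _ = env.step(action)
    memory.add_experience(state, action, reward, next_state, done)  # assumed method name
    state = env.reset() if done else next_state

    if len(memory) > batch_size:  # assumes the buffer exposes __len__
        states, actions, rewards, next_states, dones = memory.sample()
        with torch.no_grad():
            next_actions = actor_target(next_states)
            q_next = critic_target(torch.cat((next_states, next_actions), 1))
            q_targets = rewards + gamma * q_next * (1 - dones)
        critic_loss = F.mse_loss(critic_local(torch.cat((states, actions), 1)), q_targets)
        optim_critic.zero_grad()
        critic_loss.backward()
        optim_critic.step()

        actor_loss = -critic_local(torch.cat((states, actor_local(states)), 1)).mean()
        optim_actor.zero_grad()
        actor_loss.backward()
        optim_actor.step()
        # a full implementation would also Polyak-update actor_target / critic_target here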
class SAC(Base_Agent):
    """Soft Actor-Critic model based on the 2018 paper https://arxiv.org/abs/1812.05905 and on this github implementation
      https://github.com/pranz24/pytorch-soft-actor-critic. It is an actor-critic algorithm where the agent is also trained
      to maximise the entropy of its actions as well as its cumulative reward"""
    agent_name = "SAC"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions"
        assert self.config.hyperparameters["Actor"][
            "final_layer_activation"] != "Softmax", "Final actor layer must not be softmax"
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_local_2 = self.create_NN(
            input_dim=self.state_size + self.action_size,
            output_dim=1,
            key_to_use="Critic",
            override_seed=self.config.seed + 1)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_optimizer_2 = torch.optim.Adam(
            self.critic_local_2.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_target = self.create_NN(input_dim=self.state_size +
                                            self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size +
                                              self.action_size,
                                              output_dim=1,
                                              key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size * 2,
                                          key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"])
        self.automatic_entropy_tuning = self.hyperparameters[
            "automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.environment.action_space.shape).to(
                    self.device)).item()  # heuristic value from the paper
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam(
                [self.log_alpha],
                lr=self.hyperparameters["Actor"]["learning_rate"])
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]

        self.add_extra_noise = self.hyperparameters["add_extra_noise"]
        if self.add_extra_noise:
            self.noise = OU_Noise(self.action_size, self.config.seed,
                                  self.hyperparameters["mu"],
                                  self.hyperparameters["theta"],
                                  self.hyperparameters["sigma"])

        self.do_evaluation_iterations = self.hyperparameters[
            "do_evaluation_iterations"]

    def save_result(self):
        """Saves the result of an episode of the game. Overriding the method in Base Agent that does this because we only
        want to keep track of the results during the evaluation episodes"""
        if self.episode_number == 1 or not self.do_evaluation_iterations:
            self.game_full_episode_scores.extend(
                [self.total_episode_score_so_far])
            self.rolling_results.append(
                np.mean(
                    self.game_full_episode_scores[-1 *
                                                  self.rolling_score_window:]))
            self.save_max_result_seen()

        elif (self.episode_number -
              1) % TRAINING_EPISODES_PER_EVAL_EPISODE == 0:
            self.game_full_episode_scores.extend([
                self.total_episode_score_so_far
                for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)
            ])
            self.rolling_results.extend([
                np.mean(
                    self.game_full_episode_scores[-1 *
                                                  self.rolling_score_window:])
                for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)
            ])
            self.save_max_result_seen()

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        Base_Agent.reset_game(self)
        if self.add_extra_noise: self.noise.reset()

    def step(self):
        """Runs an episode on the game, saving the experience and running a learning step if appropriate"""
        eval_ep = self.episode_number % TRAINING_EPISODES_PER_EVAL_EPISODE == 0 and self.do_evaluation_iterations
        self.episode_step_number_val = 0
        while not self.done:
            self.episode_step_number_val += 1
            self.action = self.pick_action(eval_ep)
            self.conduct_action(self.action)
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters[
                        "learning_updates_per_learning_session"]):
                    self.learn()
            mask = False if self.episode_step_number_val >= self.environment._max_episode_steps else self.done
            if not eval_ep:
                self.save_experience(experience=(self.state, self.action,
                                                 self.reward, self.next_state,
                                                 mask))
            self.state = self.next_state
            self.global_step_number += 1
        print(self.total_episode_score_so_far)
        if eval_ep: self.print_summary_of_latest_evaluation_episode()
        self.episode_number += 1

    def pick_action(self, eval_ep, state=None):
        """Picks an action using one of three methods: 1) Randomly if we haven't passed a certain number of steps,
         2) Using the actor in evaluation mode if eval_ep is True  3) Using the actor in training mode if eval_ep is False.
         The difference between evaluation and training mode is that training mode does more exploration"""
        if state is None: state = self.state
        if eval_ep: action = self.actor_pick_action(state=state, eval=True)
        elif self.global_step_number < self.hyperparameters[
                "min_steps_before_learning"]:
            action = self.environment.action_space.sample()
            print("Picking random action ", action)
        else:
            action = self.actor_pick_action(state=state)
        if self.add_extra_noise:
            self.action += self.noise.sample()
        return action

    def actor_pick_action(self, state=None, eval=False):
        """Uses actor to pick an action in one of two ways: 1) If eval = False and we aren't in eval mode then it picks
        an action that has partly been randomly sampled 2) If eval = True then we pick the action that comes directly
        from the network and so did not involve any random sampling"""
        if state is None: state = self.state
        state = torch.FloatTensor([state]).to(self.device)
        if len(state.shape) == 1: state = state.unsqueeze(0)
        if eval == False:
            action, _, _ = self.produce_action_and_action_info(state)
        else:
            with torch.no_grad():
                _, z, action = self.produce_action_and_action_info(state)
        action = action.detach().cpu().numpy()
        return action[0]

    def produce_action_and_action_info(self, state):
        """Given the state, produces an action, the log probability of the action, and the tanh of the mean action"""
        actor_output = self.actor_local(state)
        mean, log_std = actor_output[:, :self.
                                     action_size], actor_output[:, self.
                                                                action_size:]
        std = log_std.exp()
        normal = Normal(mean, std)
        x_t = normal.rsample(
        )  #rsample means it is sampled using reparameterisation trick
        action = torch.tanh(x_t)
        log_prob = normal.log_prob(x_t)
        log_prob -= torch.log(1 - action.pow(2) + EPSILON)
        log_prob = log_prob.sum(1, keepdim=True)
        return action, log_prob, torch.tanh(mean)

    def time_for_critic_and_actor_to_learn(self):
        """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
        actor and critic"""
        return self.global_step_number > self.hyperparameters["min_steps_before_learning"] and \
               self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def learn(self):
        """Runs a learning iteration for the actor, both critics and (if specified) the temperature parameter"""
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.sample_experiences(
        )
        qf1_loss, qf2_loss = self.calculate_critic_losses(
            state_batch, action_batch, reward_batch, next_state_batch,
            mask_batch)
        policy_loss, log_pi = self.calculate_actor_loss(state_batch)
        if self.automatic_entropy_tuning:
            alpha_loss = self.calculate_entropy_tuning_loss(log_pi)
        else:
            alpha_loss = None
        self.update_all_parameters(qf1_loss, qf2_loss, policy_loss, alpha_loss)

    def sample_experiences(self):
        return self.memory.sample()

    def calculate_critic_losses(self, state_batch, action_batch, reward_batch,
                                next_state_batch, mask_batch):
        """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy
         term is taken into account"""
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.produce_action_and_action_info(
                next_state_batch)
            qf1_next_target = self.critic_target(
                torch.cat((next_state_batch, next_state_action), 1))
            qf2_next_target = self.critic_target_2(
                torch.cat((next_state_batch, next_state_action), 1))
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + (
                1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (
                    min_qf_next_target)
        qf1 = self.critic_local(torch.cat((state_batch, action_batch), 1))
        qf2 = self.critic_local_2(torch.cat((state_batch, action_batch), 1))
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        return qf1_loss, qf2_loss

    def calculate_actor_loss(self, state_batch):
        """Calculates the loss for the actor. This loss includes the additional entropy term"""
        action, log_pi, _ = self.produce_action_and_action_info(state_batch)
        qf1_pi = self.critic_local(torch.cat((state_batch, action), 1))
        qf2_pi = self.critic_local_2(torch.cat((state_batch, action), 1))
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()
        return policy_loss, log_pi

    def calculate_entropy_tuning_loss(self, log_pi):
        """Calculates the loss for the entropy temperature parameter. This is only relevant if self.automatic_entropy_tuning
        is True."""
        alpha_loss = -(self.log_alpha *
                       (log_pi + self.target_entropy).detach()).mean()
        return alpha_loss

    def update_all_parameters(self, critic_loss_1, critic_loss_2, actor_loss,
                              alpha_loss):
        """Updates the parameters for the actor, both critics and (if specified) the temperature parameter"""
        self.take_optimisation_step(
            self.critic_optimizer, self.critic_local, critic_loss_1,
            self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.take_optimisation_step(
            self.critic_optimizer_2, self.critic_local_2, critic_loss_2,
            self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.take_optimisation_step(
            self.actor_optimizer, self.actor_local, actor_loss,
            self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(
            self.critic_local, self.critic_target,
            self.hyperparameters["Critic"]["tau"])
        self.soft_update_of_target_network(
            self.critic_local_2, self.critic_target_2,
            self.hyperparameters["Critic"]["tau"])
        if alpha_loss is not None:
            self.take_optimisation_step(self.alpha_optim, None, alpha_loss,
                                        None)
            self.alpha = self.log_alpha.exp()

    def print_summary_of_latest_evaluation_episode(self):
        """Prints a summary of the latest episode"""
        print(" ")
        print("----------------------------")
        print("Episode score {} ".format(self.total_episode_score_so_far))
        print("----------------------------")
Example #15
def play_game(train_indicator=1):

    env = environment.Environment()  # Rohit's custom environment

    obs_dim = env.num_states
    act_dim = env.num_actions

    buffer_size = 5000
    batch_size = 32
    gamma = 0.95
    tau = 0.001

    np.random.seed(1337)

    vision = False

    explore = 100000.
    eps_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # actor, critic and buffer
    actor = Actor_Network(env, sess)
    critic = Critic_Network(env, sess)
    replay_buffer = Replay_Buffer()

    # try:
    #   actor.model.load_weights("actormodel.h5")
    #   critic.model.load_weights("criticmodel.h5")
    #   actor.target_model.load_weights("actormodel.h5")
    #   critic.target_model.load_weights("criticmodel.h5")
    #   print("Weight load successfully")
    # except:
    #   print("WOW WOW WOW, Cannot find the weight")

    for e in range(eps_count):

        # receive initial observation state
        s_t = env._reset()  # cos theta, sin theta, theta dot
        s_t = np.asarray(s_t)
        total_reward = 0
        done = False
        step = 0

        while (done == False):
            if step > 200:
                break

            loss = 0
            epsilon -= 1.0 / explore

            a_t = np.zeros([1, act_dim])
            noise_t = np.zeros([1, act_dim])

            # select action according to current policy and exploration noise
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.0, 0.60, 0.30)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], 0.0, 0.60, 0.30)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            s_t1, r_t, done, _ = env._step(a_t[0])
            s_t1 = np.asarray(s_t1)

            # add to replay buffer
            replay_buffer.add(s_t, a_t[0], r_t, s_t1, done)
            # pdb.set_trace()
            # sample from replay buffer
            batch = replay_buffer.sample_batch()
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])  # placeholder with the right shape; overwritten below

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + gamma * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            step += 1
            print('step: {}'.format(step))

        if np.mod(e, 3) == 0:
            if (train_indicator):
                print('saving model')
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print('episode: ', e, ' total rewards: ', total_reward)

        # Plotting states
        states = env.plotState
        xs = states[:, 0]
        ys = states[:, 1]
        zs = states[:, 2]

        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')

        ax.plot(xs, ys, zs)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('Z')
        # plt.show()
        save_path = './plots/' + str(e) + '.png'
        plt.savefig(save_path)
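OU.function, used to build the exploration noise above, is not defined in this snippet. In the Keras DDPG code this style comes from, it appears to be a single Ornstein-Uhlenbeck step evaluated at the current action value; the version below is an assumption about its signature and body.

import numpy as np


class OU:
    @staticmethod
    def function(x, mu, theta, sigma):
        """One OU step at the current action value x (assumed signature)."""
        return theta * (mu - x) + sigma * np.random.randn(1)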
        if i % log_interval == 0:
            print("""Episode {}: started at {:.1f}, finished at {:.1f} because {} @ t={}, \
            last reward {:.1f}, running reward {:.1f}""".format(i, env.starting_portfolio_value, \
                env.portfolio_value(), msg["msg"], env.cur_timestep, reward, running_reward))


dqn = DQN_Agent()  
serieslength = 250  
env = TradingEnvironment(max_stride=4, series_length=serieslength,starting_cash_mean=100, randomize_cash_std=100, starting_shares_mean=100,randomize_shares_std=10, inaction_penalty=100.0)
BATCH_SIZE = 250 

if __name__ == "__main__": 
    num_episodes = 50 
    gamma = .97 
    target_update = 10 
    replay_buffer = Replay_Buffer(1000)
    optimizer = optim.RMSprop(dqn.policy_net.parameters())
    train(num_episodes, target_update, gamma, env, dqn, replay_buffer, optimizer) 
    #sample trading run 

    total_rewards = 0 
    total_profits = 0
    failed_goes = 0 
    num_goes = 120 
    env = TradingEnvironment(max_stride=4, series_length=serieslength,starting_cash_mean=100, randomize_cash_std=100, starting_shares_mean=100,randomize_shares_std=10)

    for i in range(num_goes):
        done = False 
        env.reset()
        reward_this_go = 1e-8 
        for j in range(0, env.series_length+1):  
Example #17
class DDQN_Wrapper(Base_Agent):
    def __init__(self,
                 config,
                 global_action_id_to_primitive_actions,
                 action_length_reward_bonus,
                 end_of_episode_symbol="/"):
        super().__init__(config)
        self.end_of_episode_symbol = end_of_episode_symbol
        self.global_action_id_to_primitive_actions = global_action_id_to_primitive_actions
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"],
                                    config.seed)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)

        self.oracle = self.create_oracle()
        self.oracle_optimizer = optim.Adam(
            self.oracle.parameters(), lr=self.hyperparameters["learning_rate"])

        self.q_network_local = self.create_NN(input_dim=self.state_size + 1,
                                              output_dim=self.action_size)
        self.q_network_local.print_model_summary()
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
        self.q_network_target = self.create_NN(input_dim=self.state_size + 1,
                                               output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)

        self.action_length_reward_bonus = action_length_reward_bonus
        self.abandon_ship = config.hyperparameters["abandon_ship"]

    def create_oracle(self):
        """Creates the network we will use to predict the next state"""
        oracle_hyperparameters = copy.deepcopy(self.hyperparameters)
        oracle_hyperparameters["columns_of_data_to_be_embedded"] = []
        oracle_hyperparameters["embedding_dimensions"] = []
        oracle_hyperparameters["linear_hidden_units"] = [5, 5]
        oracle_hyperparameters["final_layer_activation"] = [None, "tanh"]
        oracle = self.create_NN(input_dim=self.state_size + 2,
                                output_dim=[self.state_size + 1, 1],
                                hyperparameters=oracle_hyperparameters)
        oracle.print_model_summary()
        return oracle

    def run_n_episodes(self, num_episodes,
                       episodes_to_run_with_no_exploration):
        self.turn_on_any_epsilon_greedy_exploration()
        self.round_of_macro_actions = []
        self.episode_actions_scores_and_exploration_status = []
        num_episodes_to_get_to = self.episode_number + num_episodes
        while self.episode_number < num_episodes_to_get_to:
            self.reset_game()
            self.step()
            self.save_and_print_result()
            if num_episodes_to_get_to - self.episode_number == episodes_to_run_with_no_exploration:
                self.turn_off_any_epsilon_greedy_exploration()
        assert len(self.episode_actions_scores_and_exploration_status
                   ) == num_episodes, "{} vs. {}".format(
                       len(self.episode_actions_scores_and_exploration_status),
                       num_episodes)
        assert len(self.episode_actions_scores_and_exploration_status[0]) == 3
        assert self.episode_actions_scores_and_exploration_status[0][2] in [
            True, False
        ]
        assert isinstance(
            self.episode_actions_scores_and_exploration_status[0][1], list)
        assert isinstance(
            self.episode_actions_scores_and_exploration_status[0][1][0], int)
        assert isinstance(
            self.episode_actions_scores_and_exploration_status[0][0],
            int) or isinstance(
                self.episode_actions_scores_and_exploration_status[0][0],
                float)
        return self.episode_actions_scores_and_exploration_status, self.round_of_macro_actions

    def step(self):
        """Runs a step within a game including a learning step if required"""
        step_number = 0.0
        self.state = np.append(
            self.state, step_number /
            200.0)  #Divide by 200 because there are 200 steps in cart pole

        self.total_episode_score_so_far = 0
        episode_macro_actions = []
        while not self.done:
            surprised = False
            macro_action = self.pick_action()
            primitive_actions = self.global_action_id_to_primitive_actions[
                macro_action]
            primitive_actions_conducted = 0
            for ix, action in enumerate(primitive_actions):
                if self.abandon_ship and primitive_actions_conducted > 0:
                    if self.abandon_macro_action(action):
                        break

                step_number += 1
                self.action = action
                self.next_state, self.reward, self.done, _ = self.environment.step(
                    action)
                # Append the normalised step counter (CartPole caps episodes at 200 steps)
                self.next_state = np.append(self.next_state, step_number / 200.0)

                self.total_episode_score_so_far += self.reward
                if self.hyperparameters["clip_rewards"]:
                    self.reward = max(min(self.reward, 1.0), -1.0)
                primitive_actions_conducted += 1
                self.track_episodes_data()
                self.save_experience()

                if len(primitive_actions) > 1:
                    surprised = self.am_i_surprised()
                self.state = self.next_state
                if self.time_for_q_network_to_learn():
                    for _ in range(
                            self.hyperparameters["learning_iterations"]):
                        self.q_network_learn()
                        self.oracle_learn()
                if self.done or surprised: break
            episode_macro_actions.append(macro_action)
            self.round_of_macro_actions.append(macro_action)
        if random.random() < 0.1: print(Counter(episode_macro_actions))
        self.save_episode_actions_with_score()
        self.episode_number += 1
        self.logger.info("END OF EPISODE")

    def am_i_surprised(self):
        """Returns boolean indicating whether the next_state was a surprise or not"""
        with torch.no_grad():
            state = torch.from_numpy(self.state).float().unsqueeze(0).to(
                self.device)
            action = torch.Tensor([[self.action]])

            # Concatenating a single scalar action assumes CartPole; this must change
            # for environments with other action representations
            states_and_actions = torch.cat((state, action), dim=1)
            predictions = self.oracle(states_and_actions)
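            # The oracle outputs the predicted next state followed by the predicted
            # reward in its final column; keep only the state part for this comparison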
            predicted_next_state = predictions[0, :-1]

            difference = F.mse_loss(predicted_next_state,
                                    torch.Tensor(self.next_state))
            if difference > 0.5:
                print("Surprise! Loss {} -- {} vs. {}".format(
                    difference, predicted_next_state, self.next_state))
                return True
            else:
                return False

    def abandon_macro_action(self, action):
        """Returns boolean indicating whether to abandon macro action or not"""
        state = torch.from_numpy(self.state).float().unsqueeze(0).to(
            self.device)
        with torch.no_grad():
            primitive_q_values = self.calculate_q_values(
                state, local=True, primitive_actions_only=True)
        q_value_highest = torch.max(primitive_q_values)
        q_values_action = primitive_q_values[:, action]
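        # Abandon the macro action if the committed action's Q value falls below a
        # fraction of the best primitive Q value (70% when that value is positive,
        # 130% when it is not)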
        if q_value_highest > 0.0: multiplier = 0.7
        else: multiplier = 1.3
        if q_values_action < multiplier * q_value_highest:
            print("BREAKING Action {} -- Q Values {}".format(
                action, primitive_q_values))
            return True
        else:
            return False

    def pick_action(self, state=None):
        """Uses the local Q network and an epsilon greedy policy to pick an action"""
        if state is None: state = self.state
        if isinstance(state, np.int64) or isinstance(state, int):
            state = np.array([state])
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        if len(state.shape) < 2: state = state.unsqueeze(0)
        self.q_network_local.eval()  #puts network in evaluation mode
        with torch.no_grad():
            action_values = self.calculate_q_values(
                state, local=True, primitive_actions_only=False)
        self.q_network_local.train()  #puts network back in training mode
        action = self.exploration_strategy.perturb_action_for_exploration_purposes(
            {
                "action_values": action_values,
                "turn_off_exploration": self.turn_off_exploration,
                "episode_number": self.episode_number
            })
        self.logger.info("Q values {} -- Action chosen {}".format(
            action_values, action))
        return action

    def calculate_q_values(self, states, local, primitive_actions_only):
        """Calculates the q values using the local q network"""
        if local:
            primitive_q_values = self.q_network_local(states)
        else:
            primitive_q_values = self.q_network_target(states)

        num_actions = len(self.global_action_id_to_primitive_actions)
        if primitive_actions_only or num_actions <= self.action_size:
            return primitive_q_values

        extra_q_values = self.calculate_macro_action_q_values(
            states, num_actions)
        extra_q_values = torch.Tensor([extra_q_values])
        all_q_values = torch.cat((primitive_q_values, extra_q_values), dim=1)

        return all_q_values

    def calculate_macro_action_q_values(self, state, num_actions):
        assert state.shape[0] == 1
        q_values = []
        for action_id in range(self.action_size, num_actions):
            macro_action = self.global_action_id_to_primitive_actions[
                action_id]
            predicted_next_state = state
            cumulated_reward = 0
            action_ix = 0
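            # Roll the macro action forward with the oracle: accumulate discounted
            # predicted rewards (plus the length bonus) for all but the last primitive
            # action, then bootstrap with the local Q value of that final action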
            for action in macro_action[:-1]:
                predictions = self.oracle(
                    torch.cat((predicted_next_state, torch.Tensor([[action]])),
                              dim=1))
                rewards = predictions[:, -1]
                predicted_next_state = predictions[:, :-1]
                cumulated_reward += (
                    rewards.item() + self.action_length_reward_bonus
                ) * self.hyperparameters["discount_rate"]**(action_ix)
                action_ix += 1
            final_action = macro_action[-1]
            final_q_value = self.q_network_local(predicted_next_state)[
                0, final_action]
            total_q_value = cumulated_reward + final_q_value * self.hyperparameters[
                "discount_rate"]**(action_ix)
            q_values.append(total_q_value)
        return q_values

    def time_for_q_network_to_learn(self):
        """Returns boolean indicating whether enough steps have been taken for learning to begin and there are
        enough experiences in the replay buffer to learn from"""
        return self.right_amount_of_steps_taken(
        ) and self.enough_experiences_to_learn_from()

    def right_amount_of_steps_taken(self):
        """Returns boolean indicating whether enough steps have been taken for learning to begin"""
        return self.global_step_number % self.hyperparameters[
            "update_every_n_steps"] == 0

    def q_network_learn(self, experiences=None):
        """Runs a learning iteration for the Q network"""
        if experiences is None:
            states, actions, rewards, next_states, dones = self.sample_experiences(
            )  #Sample experiences
        else:
            states, actions, rewards, next_states, dones = experiences
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(
            self.q_network_optimizer, self.q_network_local, loss,
            self.hyperparameters["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.q_network_local,
                                           self.q_network_target,
                                           self.hyperparameters["tau"])

    def sample_experiences(self):
        """Draws a random sample of experience from the memory buffer"""
        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences
        return states, actions, rewards, next_states, dones

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Computes the loss required to train the Q network"""
        with torch.no_grad():
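            # Double DQN target: the local network selects the greedy next action and
            # the target network evaluates it, which reduces overestimation bias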
            max_action_indexes = self.calculate_q_values(
                next_states, local=True,
                primitive_actions_only=True).detach().argmax(1)
            Q_targets_next = self.calculate_q_values(
                next_states, local=False, primitive_actions_only=True).gather(
                    1, max_action_indexes.unsqueeze(1))
            Q_targets = rewards + (self.hyperparameters["discount_rate"] *
                                   Q_targets_next * (1 - dones))
        Q_expected = self.calculate_q_values(
            states, local=True, primitive_actions_only=True
        ).gather(1, actions.long())  # actions must be long so they can be used as indices
        loss = F.mse_loss(Q_expected, Q_targets)
        return loss

    def save_episode_actions_with_score(self):
        """Stores the episode's score, its action sequence, and the exploration status"""
        self.episode_actions_scores_and_exploration_status.append([
            self.total_episode_score_so_far,
            self.episode_actions + [self.end_of_episode_symbol],
            self.turn_off_exploration
        ])

    def oracle_learn(self):
        states, actions, rewards, next_states, _ = self.sample_experiences(
        )  # Sample experiences
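        # The oracle is trained to predict the concatenation of the next state and
        # the reward from the current state and action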
        # Concatenating states and scalar actions assumes CartPole; this must change
        # for environments with other action representations
        states_and_actions = torch.cat((states, actions), dim=1)
        predictions = self.oracle(states_and_actions)
        loss = F.mse_loss(torch.cat((next_states, rewards), dim=1),
                          predictions) / float(next_states.shape[1] + 1.0)
        self.take_optimisation_step(
            self.oracle_optimizer, self.oracle, loss,
            self.hyperparameters["gradient_clipping_norm"])
        self.logger.info("Oracle Loss {}".format(loss))
Beispiel #18
0
def train_quad(debug=True):

    env = environment.QuadCopterEnv(debug)  # Rohit's custom environment

    obs_dim = env.num_states
    act_dim = env.num_actions

    buffer_size = 5000
    batch_size = 32
    gamma = 0.98
    tau = 0.001

    np.random.seed(1337)

    vision = False

    explore = 1000  #100000
    eps_count = 500  #1000
    max_steps = 40  #100000
    reward = 0
    done = False
    epsilon = 1
    indicator = 0

    plot_state = False
    plot_reward = True

    episode_rewards = []
    episode = []

    # Configure TensorFlow to run on the CPU only (GPU device count set to 0)
    config = tf.ConfigProto(device_count={'GPU': 0})
    sess = tf.Session(config=config)
    #from tensorflow.keras import backend as K
    #K.set_session(sess)
    tf.compat.v1.keras.backend.set_session(sess)

    # Define actor, critic and buffer
    actor = Actor_Network(env, sess)
    critic = Critic_Network(env, sess)
    replay_buffer = Replay_Buffer()

    # Save location
    save_dir = os.path.join(os.getcwd(), save_path)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    os.chdir(save_dir)

    # Plot total reward
    plt.ion()
    plt.title('Training Curve')
    plt.xlabel('Episodes')
    plt.ylabel('Total Reward')
    plt.grid()

    # Episode loop
    for epi in range(eps_count):
        # Receive initial observation state
        s_t = env.reset()  # Initial position info
        s_t = np.asarray(s_t)
        total_reward = 0
        done = False
        step = 0

        # Step loop
        while not done:
            if step > max_steps:  # cap the episode length at max_steps
                break

            step += 1
            if debug:
                print('--------------------------------')
                print('step: {}'.format(step))

            loss = 0
            epsilon -= 1.0 / explore  # Reduce every step

            a_t = np.zeros([1, act_dim])
            noise_t = np.zeros([1, act_dim])

            # Select action according to current policy and exploration noise
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            print('epsilon: {}'.format(epsilon))

            #noise_t[0][0] = max(epsilon,0.0) * ou_func(a_t_original[0][0],  0.0 , 0.60, 1)
            #noise_t[0][1] = max(epsilon,0.0) * ou_func(a_t_original[0][1],  0.0 , 0.60, 1)
            #noise_t[0][2] = max(epsilon,0.0) * ou_func(a_t_original[0][2],  0.0 , 0.60, 1)

            noise_t[0][0] = max(epsilon, 0.0) * ou_func(
                a_t_original[0][0], 0.0, 0.1, 0.4)
            noise_t[0][1] = max(epsilon, 0.0) * ou_func(
                a_t_original[0][1], 0.0, 0.1, 0.4)
            noise_t[0][2] = max(epsilon, 0.0) * ou_func(
                a_t_original[0][2], 0.0, 0.1, 0.4)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            #a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            a_t[0][2] = 0  # third action dimension held fixed at zero
            s_t1, r_t, done, _ = env.step(a_t[0])
            s_t1 = np.asarray(s_t1)

            # Add current data to replay buffer
            replay_buffer.add(s_t, a_t[0], r_t, s_t1, done)

            # Sample from replay buffer
            batch = replay_buffer.sample_batch()
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            # Placeholder array with the same shape as the actions batch; it is
            # filled with the TD targets below
            y_t = np.asarray([e[1] for e in batch])

            # Target Q values from the target critic, evaluated at the target actor's actions
            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            # y_t holds the TD targets (labels) used to train the critic
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + gamma * target_q_values[k]

            # Train critic model
            loss += critic.model.train_on_batch([states, actions], y_t)
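            # Deterministic policy gradient update: the critic's action gradients for
            # the actor's proposed actions are fed to the actor's training step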
            a_for_grad = actor.model.predict(states)
            grads = critic.gradients(states, a_for_grad)
            actor.train(states, grads)
            actor.target_train()
            critic.target_train()

            total_reward += r_t
            s_t = s_t1

        # Save the actor and critic models every 50 episodes
        if (epi + 1) % 50 == 0:
            a_model_name = '%d_actor_model.h5' % (epi + 1)
            c_model_name = '%d_critic_model.h5' % (epi + 1)
            actor.model.save(os.path.join(save_dir, a_model_name))
            critic.model.save(os.path.join(save_dir, c_model_name))
        print(
            'episode: {}, num_steps: {}, total rewards: {:.2f}, final state: ({:.2f},{:.2f},{:.2f})'
            .format(epi + 1, step, total_reward, s_t[0], s_t[1], s_t[2]))

        if plot_reward:
            episode_rewards.append(total_reward)
            episode.append(epi + 1)
            plt.plot(episode, episode_rewards, 'b')
            plt.pause(0.001)

    plt.savefig("Training Curve.png")
Beispiel #19
0
def train_quad(debug=True):
    
    env = environment.Environment(debug)  # Rohit's custom environment

    obs_dim = env.num_states
    act_dim = env.num_actions

    buffer_size = 5000
    batch_size = 32
    gamma = 0.98
    tau = 0.001

    np.random.seed(1337)

    vision = False

    explore = 100000
    eps_count = 1000
    max_steps = 100000
    reward = 0
    done = False
    epsilon = 1
    indicator = 0

    plot_state = False
    plot_reward = True

    episode_rewards = []
    episode = []

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # actor, critic and buffer
    actor = Actor_Network(env, sess)
    critic = Critic_Network(env, sess)
    replay_buffer = Replay_Buffer()

    # try:
    #   actor.model.load_weights("actormodel.h5")
    #   critic.model.load_weights("criticmodel.h5")
    #   actor.target_model.load_weights("actormodel.h5")
    #   critic.target_model.load_weights("criticmodel.h5")
    #   print("Weight load successfully")
    # except:
    #   print("WOW WOW WOW, Cannot find the weight")

    # timestr = time.strftime("%Y%m%d-%H%M%S")
    # save_path = 'saved_models_rohit_' + timestr
    save_dir = os.path.join(os.getcwd(), save_path)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    os.chdir(save_dir)

    plt.ion()
    plt.title('Training Curve')
    plt.xlabel('Episodes')
    plt.ylabel('Total Reward')
    plt.grid()

    for epi in range(eps_count):

        # receive initial observation state
        s_t = env._reset()  # cos theta, sin theta, theta dot
        s_t = np.asarray(s_t)
        total_reward = 0
        done = False
        step = 0

        while not done:
            if step > 200:  # note: the step cap is hard-coded here (max_steps is unused)
                break

            step += 1
            if debug:
                print('--------------------------------')
                print('step: {}'.format(step))

            loss = 0
            epsilon -= 1.0 / explore

            a_t = np.zeros([1, act_dim])
            noise_t = np.zeros([1, act_dim])

            # select action according to current policy and exploration noise
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            noise_t[0][0] = max(epsilon, 0) * ou_func(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = max(epsilon, 0) * ou_func(a_t_original[0][1], 0.0, 0.60, 0.30)
            noise_t[0][2] = max(epsilon, 0) * ou_func(a_t_original[0][2], 0.0, 0.60, 0.30)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            s_t1, r_t, done, _ = env._step(a_t[0])
            s_t1 = np.asarray(s_t1)

            # add to replay buffer
            replay_buffer.add(s_t, a_t[0], r_t, s_t1, done)

            # sample from replay buffer
            batch = replay_buffer.sample_batch()
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            # Target Q values: the target critic evaluated at the target actor's actions
            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + gamma * target_q_values[k]

            loss += critic.model.train_on_batch([states, actions], y_t)
            a_for_grad = actor.model.predict(states)
            grads = critic.gradients(states, a_for_grad)
            actor.train(states, grads)
            actor.target_train()
            critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # pdb.set_trace()

        # Save the actor and critic models every 50 episodes
        if (epi + 1) % 50 == 0:
            a_model_name = '%d_actor_model.h5' % (epi + 1)
            c_model_name = '%d_critic_model.h5' % (epi + 1)
            actor.model.save(os.path.join(save_dir, a_model_name))
            critic.model.save(os.path.join(save_dir, c_model_name))

                # print ('saving model')
                # actor.model.save_weights("actormodel.h5", overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                # critic.model.save_weights("criticmodel.h5", overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print('episode: {}, num_steps: {}, total rewards: {:.2f}, final state: ({:.2f},{:.2f},{:.2f})'.format(epi+1, step, total_reward, s_t[0], s_t[1], s_t[2]))
        ############# Plotting states ############
        # if plot_state:
        #     states = env.plotState
        #     xs = states[:,0]
        #     ys = states[:,1]
        #     zs = states[:,2]

        #     fig = plt.figure()
        #     ax = fig.add_subplot(111, projection='3d')

        #     ax.plot(xs, ys, zs)
        #     ax.set_xlabel('X')
        #     ax.set_ylabel('Y')
        #     ax.set_zlabel('Z')
        #     # plt.show()
        #     save_path = './plots/'+str(e)+'.png'
        #     plt.savefig(save_path)
        #########################################

        ################ Plotting rewards ##############
        if plot_reward:
            episode_rewards.append(total_reward)
            episode.append(epi + 1)
            plt.plot(episode, episode_rewards, 'b')
            plt.pause(0.001)

    plt.savefig("Training Curve.png")
Beispiel #20
0
class DDPG(Base_Agent):
    """A DDPG Agent"""
    agent_name = "DDPG"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.hyperparameters["Critic"]["learning_rate"])
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"])
        self.exploration_strategy = OU_Noise_Exploration(self.config)

    def step(self):
        """Runs a step in the game"""
        while not self.done:
            # print("State ", self.state.shape)
            self.action = self.pick_action()
            self.conduct_action(self.action)
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
                    states, actions, rewards, next_states, dones = self.sample_experiences()
                    self.critic_learn(states, actions, rewards, next_states, dones)
                    self.actor_learn(states)
            self.save_experience()
            self.state = self.next_state  # set the state for the next iteration
            self.global_step_number += 1
        self.episode_number += 1

    def sample_experiences(self):
        return self.memory.sample()

    def pick_action(self, state=None):
        """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
        if state is None: state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action": action})
        return action.squeeze(0)

    def critic_learn(self, states, actions, rewards, next_states, dones):
        """Runs a learning iteration for the critic"""
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(self.critic_optimizer, self.critic_local, loss, self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Computes the loss for the critic"""
        with torch.no_grad():
            critic_targets = self.compute_critic_targets(next_states, rewards, dones)
        critic_expected = self.compute_expected_critic_values(states, actions)
        loss = functional.mse_loss(critic_expected, critic_targets)
        return loss

    def compute_critic_targets(self, next_states, rewards, dones):
        """Computes the critic target values to be used in the loss for the critic"""
        critic_targets_next = self.compute_critic_values_for_next_states(next_states)
        critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones)
        return critic_targets

    def compute_critic_values_for_next_states(self, next_states):
        """Computes the critic values for next states to be used in the loss for the critic"""
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            critic_targets_next = self.critic_target(torch.cat((next_states, actions_next), 1))
        return critic_targets_next

    def compute_critic_values_for_current_states(self, rewards, critic_targets_next, dones):
        """Computes the critic values for current states to be used in the loss for the critic"""
        critic_targets_current = rewards + (self.hyperparameters["discount_rate"] * critic_targets_next * (1.0 - dones))
        return critic_targets_current

    def compute_expected_critic_values(self, states, actions):
        """Computes the expected critic values to be used in the loss for the critic"""
        critic_expected = self.critic_local(torch.cat((states, actions), 1))
        return critic_expected

    def time_for_critic_and_actor_to_learn(self):
        """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
        actor and critic"""
        return self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def actor_learn(self, states):
        """Runs a learning iteration for the actor"""
        if self.done: #we only update the learning rate at end of each episode
            self.update_learning_rate(self.hyperparameters["Actor"]["learning_rate"], self.actor_optimizer)
        actor_loss = self.calculate_actor_loss(states)
        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.actor_local, self.actor_target, self.hyperparameters["Actor"]["tau"])

    def calculate_actor_loss(self, states):
        """Calculates the loss for the actor"""
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(torch.cat((states, actions_pred), 1)).mean()
        return actor_loss