def __init__(self, alpha, beta, input_dims, tau, env, n_actions=2,
             gamma=0.99, update_actor_interval=2, fc1Dms=400, fc2Dms=300,
             max_size=1000000, batch_size=100, warmup=1000, noise=0.1):
        """Store the hyper-parameters and build the replay buffer and networks.

        Two critics and one actor are created together with one target copy
        of each; ``update_network_parameters(tau=1)`` is called last.

        Args:
            alpha: first positional argument of both ``ActorNet`` instances
                (presumably the actor learning rate -- confirm in ActorNet).
            beta: first positional argument of all ``CriticNet`` instances
                (presumably the critic learning rate -- confirm in CriticNet).
            input_dims: forwarded to the replay buffer and to every network.
            tau: stored on the agent as ``self.tau``.
            env: only ``env.action_space.high`` / ``.low`` are read here.
            n_actions: forwarded to the replay buffer and to every network.
            gamma: stored as ``self.gamma``.
            update_actor_interval: stored as ``self.update_actor_iter``.
            fc1Dms, fc2Dms: forwarded to every network.
            max_size: first argument of ``ReplayBuffer``.
            batch_size: stored as ``self.batch_size``.
            warmup: stored as ``self.warmup``.
            noise: stored as ``self.noise``.
        """
        # Plain hyper-parameters and counters.
        self.alpha, self.beta = alpha, beta
        self.tau, self.gamma = tau, gamma
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.warmup = warmup
        self.noise = noise
        self.update_actor_iter = update_actor_interval
        self.learn_step_cntr = 0
        self.time_step = 0

        # Action bounds come straight from the environment's action space.
        action_space = env.action_space
        self.max_action = action_space.high
        self.min_action = action_space.low

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        # Positional arguments shared by every critic / every actor.
        critic_args = (beta, input_dims, n_actions, fc1Dms, fc2Dms)
        actor_args = (alpha, input_dims, n_actions, fc1Dms, fc2Dms)

        # Online networks.
        self.critic_1 = CriticNet(*critic_args, name='Critic1')
        self.critic_2 = CriticNet(*critic_args, name='Critic2')
        self.actor = ActorNet(*actor_args, name='actor')

        # Target networks.
        self.target_critic_1 = CriticNet(*critic_args, name='Target_critic1')
        self.target_critic_2 = CriticNet(*critic_args, name='Target_critic2')
        self.target_actor = ActorNet(*actor_args, name='Target_actor')

        # tau=1 is meant to make the targets start as exact copies of the
        # online networks (update_network_parameters is defined elsewhere).
        self.update_network_parameters(tau=1)
    def __init__(self, task):
        """Create actor/critic pairs, exploration noise and replay memory.

        Args:
            task: object providing ``state_size``, ``action_size``,
                ``action_low`` and ``action_high``.
        """
        self.task = task
        self.state_size, self.action_size = task.state_size, task.action_size
        self.action_low, self.action_high = task.action_low, task.action_high

        actor_args = (self.state_size, self.action_size,
                      self.action_low, self.action_high)
        critic_args = (self.state_size, self.action_size)

        # Policy networks: a local one and its target twin.
        self.actor_local = Actor(*actor_args)
        self.actor_target = Actor(*actor_args)

        # Value networks: a local one and its target twin.
        self.critic_local = Critic(*critic_args)
        self.critic_target = Critic(*critic_args)

        # Targets start with the same weights as the local models.
        critic_weights = self.critic_local.model.get_weights()
        self.critic_target.model.set_weights(critic_weights)
        actor_weights = self.actor_local.model.get_weights()
        self.actor_target.model.set_weights(actor_weights)

        # Exploration noise; the three values are handed to OUNoise as-is.
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size,
                             self.exploration_mu,
                             self.exploration_theta,
                             self.exploration_sigma)

        # Experience replay.
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Learning constants.
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
Exemple #3
0
    def __init__(self, args, env, env_params):
        """Build networks, optimizers, HER replay buffer and normalizers.

        On MPI rank 0 the model output directory
        ``<args.save_dir>/<args.env_name>`` is created and stored as
        ``self.model_path``; other ranks do not get that attribute.

        Args:
            args: configuration object. Read here: ``cuda``,
                ``learning_rate_actor``, ``learning_rate_critic``,
                ``replay_strategy``, ``replay_ratio``, ``buffer_size``,
                ``clip_range``, ``save_dir`` and ``env_name``.
            env: environment; only ``env.compute_reward`` is used here.
            env_params: mapping read with the keys ``'obs'`` and
                ``'d_goal'`` and forwarded whole to the networks and buffer.
        """
        self.args = args
        self.env = env
        self.env_params = env_params

        # build up the actor/critic evaluated network
        self.actor_net = Actor(env_params, hidden_units=256)
        self.critic_net = Critic(env_params, hidden_units=256)

        # sync the networks across the cpus for parallel training
        sync_networks(self.actor_net)
        sync_networks(self.critic_net)

        # build up the actor/critic target network
        self.actor_target_net = Actor(env_params, hidden_units=256)
        self.critic_target_net = Critic(env_params, hidden_units=256)

        # move everything to the GPU when requested
        if self.args.cuda:
            self.actor_net.cuda()
            self.critic_net.cuda()
            self.actor_target_net.cuda()
            self.critic_target_net.cuda()

        # the optimizers of the networks
        self.actor_optimizer = torch.optim.Adam(
            self.actor_net.parameters(), lr=self.args.learning_rate_actor)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_net.parameters(), lr=self.args.learning_rate_critic)

        # HER sample function
        self.her_sample = HER(self.args.replay_strategy,
                              self.args.replay_ratio, self.env.compute_reward)

        # experience buffer
        self.exp_buffer = ReplayBuffer(self.env_params, self.args.buffer_size,
                                       self.her_sample.her_sample_transitions)

        # the normalization of the observation and goal
        self.obs_norm = Normalizer(size=env_params['obs'],
                                   clip_range=self.args.clip_range)
        self.goal_norm = Normalizer(size=env_params['d_goal'],
                                    clip_range=self.args.clip_range)

        # create the directory used to save the model (rank 0 only)
        if MPI.COMM_WORLD.Get_rank() == 0:
            self.model_path = os.path.join(self.args.save_dir,
                                           self.args.env_name)
            # makedirs creates save_dir and any missing parents in one call
            # and does not fail when the directory already exists.
            os.makedirs(self.model_path, exist_ok=True)
    def __init__(self, env, sess):
        """Create the networks, replay buffer and exploration noise.

        Args:
            env: environment; the first entry of its observation-space and
                action-space shapes gives the state and action sizes.
            sess: session object kept on the agent and handed to both
                networks.
        """
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.n_state, self.n_action = state_dim, action_dim
        self.sess = sess

        # Actor and critic share the session and the problem dimensions.
        network_args = (self.sess, self.n_state, self.n_action)
        self.actor = Actor(*network_args)
        self.critic = Critic(*network_args)

        # Experience storage and Ornstein-Uhlenbeck exploration noise.
        self.replay_buffer = ReplayBuffer(BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.n_action)
    def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
                 fc1Dms=400, fc2Dms=300, max_size=1000000, batch_size=64):
        """Store hyper-parameters and build memory, noise and the four networks.

        Args:
            alpha: passed as ``alpha`` to both ``ActorNet`` instances.
            beta: passed as ``beta`` to both ``CriticNet`` instances.
            input_dims: forwarded to the replay buffer and every network.
            tau: stored as ``self.tau``.
            n_actions: forwarded to the buffer and networks; also the length
                of the zero vector used as the noise mean.
            gamma: stored as ``self.gamma``.
            fc1Dms, fc2Dms: forwarded to every network.
            max_size: first argument of ``ReplayBuffer``.
            batch_size: stored as ``self.batch_size``.
        """
        self.alpha, self.beta = alpha, beta
        self.tau, self.gamma = tau, gamma
        self.batch_size = batch_size
        self.n_actions = n_actions
        print(batch_size, fc1Dms, fc2Dms)

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # Keyword arguments common to all four networks.
        shared = dict(input_dims=input_dims,
                      n_actions=n_actions,
                      fc1Dms=fc1Dms,
                      fc2Dms=fc2Dms)

        self.actor = ActorNet(alpha=alpha, name='actor', **shared)
        self.critic = CriticNet(beta=beta, name='critic', **shared)
        self.target_actor = ActorNet(alpha=alpha, name='target_actor', **shared)
        self.target_critic = CriticNet(beta=beta, name='target_critic', **shared)

        # Called with tau=1 right after construction; presumably this copies
        # the online weights into the targets (method defined elsewhere).
        self.update_network_parameters(tau=1)
Exemple #6
0
    # ---- Environment: create it and record its dimensions on args ----------
    env = gym.make(args.env_name)
    args.action_dim = env.action_space.shape[0]
    # Only the first component of the upper action bound is kept, as an int.
    args.max_action = int(env.action_space.high[0])
    args.state_dim = env.observation_space.shape[0]

    # ---- Seeding: same seed for the env, random, numpy and torch -----------
    for seed_fn in (env.seed, random.seed, np.random.seed, torch.manual_seed):
        seed_fn(args.seed)

    # ---- Replay buffer, RL agent, ES agent and a detached actor copy -------
    replay_buffer = ReplayBuffer(args,
                                 args.state_dim,
                                 args.action_dim,
                                 max_size=args.buffer_size)
    rl_agent = TD3(args.state_dim, args.action_dim, args.max_action, args)
    # The ES search is sized by and starts from the RL actor's parameters.
    es_agent = CEM(args,
                   num_params=rl_agent.actor.get_size(),
                   mu_init=rl_agent.actor.get_params())
    actor = copy.deepcopy(rl_agent.actor)

    # ---- Empty data frames that will hold the logged results ---------------
    individual_cols = [f"Ind{i}" for i in range(1, args.pop_size + 1)]
    reward_cols = [f"Reward{i}" for i in range(1, args.n_eval + 1)]

    df_log = pd.DataFrame(columns=["Step", "AvgES", "BestES", "RL"])
    df_steps = pd.DataFrame(columns=["Step"] + individual_cols)
    df_fitness = pd.DataFrame(columns=["Step"] + individual_cols)
    df_mu = pd.DataFrame(columns=["Step", "Mean", "Std"] + reward_cols)
Exemple #7
0
class DDPGAgent:
    """DDPG agent with hindsight experience replay, trained across MPI ranks.

    The agent owns an actor/critic pair with target copies, one Adam
    optimizer per online network, a HER-backed replay buffer and running
    normalizers for observations and goals. ``learning`` is the public entry
    point; the underscore-prefixed methods are internal helpers.
    """

    def __init__(self, args, env, env_params):
        """Build networks, optimizers, HER replay buffer and normalizers.

        On MPI rank 0 the model output directory
        ``<args.save_dir>/<args.env_name>`` is created and stored as
        ``self.model_path``; other ranks do not get that attribute.

        Args:
            args: configuration object (learning rates, buffer size, noise
                settings, epoch/cycle counts, ``save_dir``, ``env_name``...).
            env: goal-based environment exposing ``reset``, ``step`` and
                ``compute_reward``.
            env_params: mapping read with the keys ``'obs'``, ``'d_goal'``,
                ``'action'``, ``'action_max'`` and ``'max_timesteps'``.
        """
        self.args = args
        self.env = env
        self.env_params = env_params

        # build up the actor/critic evaluated network
        self.actor_net = Actor(env_params, hidden_units=256)
        self.critic_net = Critic(env_params, hidden_units=256)

        # sync the networks across the cpus for parallel training
        sync_networks(self.actor_net)
        sync_networks(self.critic_net)

        # build up the actor/critic target network
        self.actor_target_net = Actor(env_params, hidden_units=256)
        self.critic_target_net = Critic(env_params, hidden_units=256)

        # move everything to the GPU when requested
        if self.args.cuda:
            self.actor_net.cuda()
            self.critic_net.cuda()
            self.actor_target_net.cuda()
            self.critic_target_net.cuda()

        # the optimizers of the networks
        self.actor_optimizer = torch.optim.Adam(
            self.actor_net.parameters(), lr=self.args.learning_rate_actor)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_net.parameters(), lr=self.args.learning_rate_critic)

        # HER sample function
        self.her_sample = HER(self.args.replay_strategy,
                              self.args.replay_ratio, self.env.compute_reward)

        # experience buffer
        self.exp_buffer = ReplayBuffer(self.env_params, self.args.buffer_size,
                                       self.her_sample.her_sample_transitions)

        # the normalization of the observation and goal
        self.obs_norm = Normalizer(size=env_params['obs'],
                                   clip_range=self.args.clip_range)
        self.goal_norm = Normalizer(size=env_params['d_goal'],
                                    clip_range=self.args.clip_range)

        # create the directory used to save the model (rank 0 only)
        if MPI.COMM_WORLD.Get_rank() == 0:
            self.model_path = os.path.join(self.args.save_dir,
                                           self.args.env_name)
            # makedirs creates save_dir and any missing parents in one call
            # and does not fail when the directory already exists.
            os.makedirs(self.model_path, exist_ok=True)

    def learning(self):
        """Train the model and record the per-epoch evaluation success rate.

        Each epoch runs ``args.n_cycles`` cycles. A cycle collects
        ``args.num_exp_per_mpi`` episodes, stores them, updates the
        normalizers, performs ``args.n_batches`` network updates and then
        soft-updates both target networks. After every epoch the agent is
        evaluated and rank 0 saves the normalizer statistics and the actor
        weights. The success-rate history is written to
        ``Plot_Data/Pen_HER.txt`` at the end.
        """
        success_rate_history = []
        for epoch in range(self.args.n_epochs):
            for _ in range(self.args.n_cycles):
                exp_obs_buff, exp_a_goal_buff = [], []
                exp_d_goal_buff, exp_actions_buff = [], []
                for _ in range(self.args.num_exp_per_mpi):
                    (exp_obs, exp_a_goal, exp_d_goal,
                     exp_actions) = self._collect_episode()
                    exp_obs_buff.append(exp_obs)
                    exp_a_goal_buff.append(exp_a_goal)
                    exp_d_goal_buff.append(exp_d_goal)
                    exp_actions_buff.append(exp_actions)
                exp_obs_buff = np.array(exp_obs_buff)
                exp_a_goal_buff = np.array(exp_a_goal_buff)
                exp_d_goal_buff = np.array(exp_d_goal_buff)
                exp_actions_buff = np.array(exp_actions_buff)
                # store the transitions
                self.exp_buffer.store_transition([
                    exp_obs_buff, exp_a_goal_buff, exp_d_goal_buff,
                    exp_actions_buff
                ])
                self._update_normalizer([
                    exp_obs_buff, exp_a_goal_buff, exp_d_goal_buff,
                    exp_actions_buff
                ])
                for _ in range(self.args.n_batches):
                    self._update_network()  # training the network
                # soft update the network parameters
                self._soft_update_target_network(self.actor_target_net,
                                                 self.actor_net)
                self._soft_update_target_network(self.critic_target_net,
                                                 self.critic_net)
            # start evaluation
            success_rate = self._evaluate_agent()
            if MPI.COMM_WORLD.Get_rank() == 0:
                print('[{}] epoch is: {}, eval success rate is: {:.3f}'.format(
                    datetime.now(), epoch, success_rate))
                torch.save([
                    self.obs_norm.mean, self.obs_norm.std, self.goal_norm.mean,
                    self.goal_norm.std,
                    self.actor_net.state_dict()
                ], self.model_path + '/model.pt')
            success_rate_history.append(success_rate)
        success_rate_history = np.array(success_rate_history)
        # Make sure the output directory exists so a finished training run
        # does not end in FileNotFoundError; safe when several ranks race.
        os.makedirs('Plot_Data', exist_ok=True)
        np.savetxt('Plot_Data/Pen_HER.txt',
                   success_rate_history,
                   fmt='%f',
                   delimiter=',')

    def _collect_episode(self):
        """Roll out one exploratory episode and return its trajectory.

        Returns:
            Tuple ``(obs, achieved_goals, desired_goals, actions)`` of lists.
            ``obs`` and ``achieved_goals`` hold one more entry than the other
            two because the final state is appended after the last step.
        """
        exp_obs, exp_a_goal, exp_d_goal, exp_actions = [], [], [], []
        observations = self.env.reset()
        obs = observations['observation']
        a_goal = observations['achieved_goal']
        d_goal = observations['desired_goal']

        # interact with the environment
        for _ in range(self.env_params['max_timesteps']):
            with torch.no_grad():
                input_tensor = self._pre_process_inputs(obs, d_goal)
                policy_predictions = self.actor_net(input_tensor)
                action = self._choose_action(policy_predictions)

            # reward, done flag and info from the step are not used here
            observations_next, _, _, _ = self.env.step(action)
            obs_next = observations_next['observation']
            a_goal_next = observations_next['achieved_goal']
            exp_obs.append(obs.copy())
            exp_a_goal.append(a_goal.copy())
            exp_d_goal.append(d_goal.copy())
            exp_actions.append(action.copy())
            # update the state
            obs = obs_next
            a_goal = a_goal_next
        exp_obs.append(obs.copy())
        exp_a_goal.append(a_goal.copy())
        return exp_obs, exp_a_goal, exp_d_goal, exp_actions

    def _pre_process_inputs(self, obs, goal):
        """Normalize one observation/goal pair into an actor input tensor.

        Returns:
            float32 tensor with a leading batch dimension of 1, moved to the
            GPU when ``args.cuda`` is set.
        """
        obs_norm = self.obs_norm.normalize(obs)
        goal_norm = self.goal_norm.normalize(goal)
        # concatenate observation and goal into a single input vector
        inputs = np.concatenate([obs_norm, goal_norm])
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.cuda()

        return inputs

    def _choose_action(self, policy_predictions):
        """Turn the actor output into an exploratory action.

        Gaussian noise scaled by ``noise_epsilon * action_max`` is added, the
        result is clipped to ``[-action_max, action_max]`` and, with
        probability ``args.random_epsilon``, replaced by a uniformly random
        action.
        """
        action_max = self.env_params['action_max']
        action = policy_predictions.cpu().numpy().squeeze()
        # create the noise
        action += self.args.noise_epsilon * action_max * np.random.randn(
            *action.shape)
        action = np.clip(action, -action_max, action_max)
        random_action = np.random.uniform(low=-action_max,
                                          high=action_max,
                                          size=self.env_params['action'])
        # the Bernoulli draw is 0 (keep action) or 1 (switch to random_action)
        action += np.random.binomial(1, self.args.random_epsilon,
                                     1)[0] * (random_action - action)

        return action

    def _update_normalizer(self, experience_buff):
        """Update the observation/goal normalizers from fresh episodes.

        The episodes are resampled through the HER sampler first, so the
        statistics are computed on relabelled transitions.
        """
        exp_obs, exp_a_goal, exp_d_goal, exp_actions = experience_buff
        exp_obs_next = exp_obs[:, 1:, :]
        exp_a_goal_next = exp_a_goal[:, 1:, :]
        num_exps = exp_actions.shape[1]
        buffer_temp = {
            'obs': exp_obs,
            'a_goal': exp_a_goal,
            'd_goal': exp_d_goal,
            'actions': exp_actions,
            'obs_next': exp_obs_next,
            'a_goal_next': exp_a_goal_next,
        }
        transitions = self.her_sample.her_sample_transitions(
            buffer_temp, num_exps)
        obs, d_goal = transitions['obs'], transitions['d_goal']
        transitions['obs'], transitions['d_goal'] = self._pre_process_obs_goal(
            obs, d_goal)
        # update
        self.obs_norm.update(transitions['obs'])
        self.goal_norm.update(transitions['d_goal'])
        # recompute the stats
        self.obs_norm.recompute_stats()
        self.goal_norm.recompute_stats()

    def _pre_process_obs_goal(self, obs, goal):
        """Clip observation and goal to ``[-args.clip_obs, args.clip_obs]``."""
        obs_proceed = np.clip(obs, -self.args.clip_obs, self.args.clip_obs)
        goal_proceed = np.clip(goal, -self.args.clip_obs, self.args.clip_obs)

        return obs_proceed, goal_proceed

    def _soft_update_target_network(self, target_net, eval_net):
        """Blend ``eval_net`` parameters into ``target_net`` in place.

        ``args.avg_coeff`` is the weight kept from the old target value;
        ``1 - args.avg_coeff`` is taken from the online network.
        """
        for target_param, param in zip(target_net.parameters(),
                                       eval_net.parameters()):
            target_param.data.copy_((1 - self.args.avg_coeff) * param.data +
                                    self.args.avg_coeff * target_param.data)

    def _update_network(self):
        """Run one gradient step on the actor and on the critic."""
        # sample the transitions
        transitions = self.exp_buffer.sample(self.args.batch_size)
        obs, obs_next, d_goal = transitions['obs'], transitions[
            'obs_next'], transitions['d_goal']
        transitions['obs'], transitions['d_goal'] = self._pre_process_obs_goal(
            obs, d_goal)
        transitions['obs_next'], transitions[
            'd_goal_next'] = self._pre_process_obs_goal(obs_next, d_goal)
        observation_norm = self.obs_norm.normalize(transitions['obs'])
        d_goal_norm = self.goal_norm.normalize(transitions['d_goal'])
        inputs_norm = np.concatenate([observation_norm, d_goal_norm], axis=1)

        observation_next_norm = self.obs_norm.normalize(
            transitions['obs_next'])
        d_goal_next_norm = self.goal_norm.normalize(transitions['d_goal_next'])
        inputs_next_norm = np.concatenate(
            [observation_next_norm, d_goal_next_norm], axis=1)

        inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32)
        inputs_next_norm_tensor = torch.tensor(inputs_next_norm,
                                               dtype=torch.float32)
        actions_tensor = torch.tensor(transitions['actions'],
                                      dtype=torch.float32)
        reward_tensor = torch.tensor(transitions['reward'],
                                     dtype=torch.float32)

        if self.args.cuda:
            inputs_norm_tensor = inputs_norm_tensor.cuda()
            inputs_next_norm_tensor = inputs_next_norm_tensor.cuda()
            actions_tensor = actions_tensor.cuda()
            reward_tensor = reward_tensor.cuda()

        # calculate the target Q value; no_grad already keeps every tensor
        # created here out of the autograd graph, so no detach() is needed
        with torch.no_grad():
            actions_next = self.actor_target_net(inputs_next_norm_tensor)
            q_next_value = self.critic_target_net(inputs_next_norm_tensor,
                                                  actions_next)
            target_q_value = reward_tensor + self.args.gamma * q_next_value
            # 1 / (1 - gamma) is the largest magnitude a discounted return
            # can reach when every reward lies in [-1, 0].
            # NOTE(review): assumes env.compute_reward is such a -1/0 sparse
            # reward -- confirm before reusing with another reward scale.
            clip_return = 1 / (1 - self.args.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)

        # the critic loss
        real_q_value = self.critic_net(inputs_norm_tensor, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()

        # the actor loss, with an L2 penalty on the normalized action
        actions_real = self.actor_net(inputs_norm_tensor)
        actor_loss = -self.critic_net(inputs_norm_tensor, actions_real).mean()
        actor_loss += self.args.action_l2 * (
            actions_real / self.env_params['action_max']).pow(2).mean()

        # actor step; gradients are synchronized before the optimizer step
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_net)
        self.actor_optimizer.step()

        # critic step; zero_grad also clears what the actor loss left on
        # the critic parameters
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        sync_grads(self.critic_net)
        self.critic_optimizer.step()

    def _evaluate_agent(self):
        """Evaluate the noise-free policy and return the mean success rate.

        Runs ``args.n_eval`` episodes, takes ``info['is_success']`` of the
        final step of each, and averages the per-rank mean over all ranks.
        """
        all_success_rate = []
        for _ in range(self.args.n_eval):
            per_success_rate = []
            observations = self.env.reset()
            obs = observations['observation']
            d_goal = observations['desired_goal']
            for _ in range(self.env_params['max_timesteps']):
                with torch.no_grad():
                    input_tensor = self._pre_process_inputs(obs, d_goal)
                    policy_predictions = self.actor_net(input_tensor)
                    action = policy_predictions.detach().cpu().numpy().squeeze(
                    )
                observations_next, _, _, info = self.env.step(action)
                obs = observations_next['observation']
                d_goal = observations_next['desired_goal']
                per_success_rate.append(info['is_success'])
            all_success_rate.append(per_success_rate)
        all_success_rate = np.array(all_success_rate)
        # only the outcome at the last time step of each episode counts
        local_success_rate = np.mean(all_success_rate[:, -1])
        global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate,
                                                       op=MPI.SUM)

        return global_success_rate / MPI.COMM_WORLD.Get_size()
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        """Create actor/critic pairs, exploration noise and replay memory.

        Args:
            task: object providing ``state_size``, ``action_size``,
                ``action_low``, ``action_high`` and ``reset()``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        """Reset the noise process and the task; return the initial state."""
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Store one transition and learn once enough samples are available."""
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy.

        Exactly one noise sample is drawn per call, so the noise process
        advances one step per action.
        """
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Drop empty slots once, then split the tuples into per-field arrays.
        batch = [e for e in experiences if e is not None]
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local);
        # (1 - dones) removes the bootstrap term on terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local).
        # NOTE(review): the trailing 0 / 1 look like Keras learning-phase
        # flags (0 = inference, 1 = training) -- confirm in Actor / Critic.
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target weight to ``tau * local + (1 - tau) * target``.

        Raises:
            ValueError: if the two models expose a different number of
                weight arrays.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        if len(local_weights) != len(target_weights):
            raise ValueError("Local and target model parameters must have the same size")

        # Blend layer by layer: the weight arrays have different shapes, so
        # they cannot be packed into a single NumPy array.
        new_weights = [
            self.tau * local_w + (1 - self.tau) * target_w
            for local_w, target_w in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
class Agent:
    '''Actor-critic agent with a replay buffer and Ornstein-Uhlenbeck exploration noise'''

    def __init__(self, env, sess):
        '''Create the networks, replay buffer and noise process for env'''
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.n_state, self.n_action = state_dim, action_dim
        self.sess = sess

        # Actor and critic share the session and the problem dimensions
        network_args = (self.sess, self.n_state, self.n_action)
        self.actor = Actor(*network_args)
        self.critic = Critic(*network_args)

        # Experience storage and exploration noise
        self.replay_buffer = ReplayBuffer(BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.n_action)

    def noise_action(self, state):
        '''Return the online actor's action plus one exploration-noise sample'''
        clean_action = self.action(state)
        return clean_action + self.exploration_noise.noise()

    def action(self, state):
        '''Return the online actor's action for state'''
        chosen = self.actor.action(state)
        return chosen

    def train(self):
        '''Run one critic update, one actor update and one target update'''

        def column(idx):
            # Gather field idx of every sampled transition into one array
            return np.asarray([transition[idx] for transition in minibatch])

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = column(0)
        action_batch = column(1)
        reward_batch = column(2)
        next_state_batch = column(3)
        done_batch = column(4)

        # Critic target: r when the episode ended, r + gamma * Q' otherwise
        next_action_batch = self.actor.target_actions(next_state_batch)
        target_q_value_batch = self.critic.target_q(next_state_batch,
                                                    next_action_batch)
        rewards = reward_batch.reshape((BATCH_SIZE, 1))
        not_done = 1. - done_batch.reshape(BATCH_SIZE, 1).astype(float)
        q_batch = rewards + not_done * GAMMA * target_q_value_batch
        self.critic.train(q_batch, state_batch, action_batch)

        # Actor: critic gradients evaluated at the actor's current actions
        policy_actions = self.actor.actions(state_batch)
        q_grads_batch = self.critic.gradients(state_batch, policy_actions)
        self.actor.train(q_grads_batch, state_batch)

        # Move both target networks towards the online ones
        self.actor.update_target()
        self.critic.update_target()

    def perceive(self, state, action, reward, next_state, done):
        '''Store a transition, train once the buffer holds more than REPLAY_START samples, reset the noise when done'''
        self.replay_buffer.add(state, action, reward, next_state, done)

        enough_samples = self.replay_buffer.count() > REPLAY_START
        if enough_samples:
            self.train()

        # A finished episode restarts the noise process
        if done:
            self.exploration_noise.reset()
class DDpgAgent():
    """Actor-critic agent with target networks, a replay memory and action noise.

    The agent owns an online actor/critic pair plus a target copy of each.
    ``learn`` fits the critic to a bootstrapped target computed by the target
    networks, improves the actor through the critic, and then moves the
    targets towards the online networks with interpolation rate ``tau``.
    """

    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 n_actions,
                 gamma=0.99,
                 fc1Dms=400,
                 fc2Dms=300,
                 max_size=1000000,
                 batch_size=64):
        """Store the hyper-parameters and build memory, noise and the four networks.

        Args:
            alpha: forwarded as ``alpha`` to both ``ActorNet`` instances
                (presumably the actor learning rate -- confirm in ActorNet).
            beta: forwarded as ``beta`` to both ``CriticNet`` instances
                (presumably the critic learning rate -- confirm in CriticNet).
            input_dims: forwarded to the networks and to the replay buffer.
            tau: default interpolation rate of ``update_network_parameters``.
            n_actions: number of action components; sizes the noise mean.
            gamma: factor applied to the bootstrapped value in ``learn``.
            fc1Dms: forwarded to the networks (presumably first hidden size).
            fc2Dms: forwarded to the networks (presumably second hidden size).
            max_size: first argument of ``ReplayBuffer``.
            batch_size: number of transitions sampled by each ``learn`` call.
        """
        self.alpha = alpha
        self.tau = tau
        self.beta = beta
        self.batch_size = batch_size
        self.gamma = gamma
        self.n_actions = n_actions
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.actor = ActorNet(alpha=alpha,
                              input_dims=input_dims,
                              n_actions=n_actions,
                              fc1Dms=fc1Dms,
                              fc2Dms=fc2Dms,
                              name='actor')
        self.critic = CriticNet(beta=beta,
                                input_dims=input_dims,
                                n_actions=n_actions,
                                fc1Dms=fc1Dms,
                                fc2Dms=fc2Dms,
                                name='critic')
        self.target_actor = ActorNet(alpha=alpha,
                                     input_dims=input_dims,
                                     n_actions=n_actions,
                                     fc1Dms=fc1Dms,
                                     fc2Dms=fc2Dms,
                                     name='target_actor')
        self.target_critic = CriticNet(beta=beta,
                                       input_dims=input_dims,
                                       n_actions=n_actions,
                                       fc1Dms=fc1Dms,
                                       fc2Dms=fc2Dms,
                                       name='target_critic')

        # tau=1 makes the targets exact copies of the online networks.
        self.update_network_parameters(tau=1)

    def choose_action(self, state):
        """Return the actor's action for a single ``state`` plus exploration noise.

        Args:
            state: one observation (array-like); a batch axis is added here.

        Returns:
            numpy array holding the noisy action for that observation.
        """
        device = self.actor.device
        # Evaluation mode for this forward pass (the original author's note
        # attributes this to batch normalisation inside the network).
        self.actor.eval()
        # Going through one ndarray avoids torch's slow list-of-arrays path.
        state = T.tensor(np.asarray([state]), dtype=T.float).to(device)
        # The result is only used as an action, so no autograd graph is needed.
        with T.no_grad():
            mu = self.actor.forward(state)
        # The actor output is deterministic; noise provides the exploration.
        mu_prime = mu + T.tensor(self.noise(), dtype=T.float).to(device)
        # Back to training mode for subsequent learn() calls.
        self.actor.train()
        # Move to CPU / numpy and drop the batch axis for the environment.
        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay memory."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def save_models(self):
        """Call ``save_checkpoint`` on all four networks."""
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        """Call ``load_checkpoint`` on all four networks."""
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        """Run one critic update, one actor update and one target update.

        Does nothing until the memory counter reaches ``batch_size``.
        """
        if self.memory.mem_cntr < self.batch_size:
            return

        device = self.actor.device
        states, actions, rewards, new_states, dones = \
                self.memory.sampling(self.batch_size)
        actions = T.tensor(actions, dtype=T.float).to(device)
        rewards = T.tensor(rewards, dtype=T.float).to(device)
        states = T.tensor(states, dtype=T.float).to(device)
        dones = T.tensor(dones).to(device)
        new_states = T.tensor(new_states, dtype=T.float).to(device)

        # The bootstrapped target is a constant for the critic regression.
        # Computing it without autograd keeps gradients from piling up in
        # the target networks, which no optimizer ever clears.
        with T.no_grad():
            target_actions = self.target_actor.forward(new_states)
            target_critic_value = self.target_critic.forward(
                new_states, target_actions)
            # Terminal transitions contribute no future value.
            target_critic_value[dones] = 0.0
            # Flatten so it lines up with `rewards` before the sum.
            target = rewards + self.gamma * target_critic_value.view(-1)
            # Back to the same shape as the critic output.
            target = target.view(self.batch_size, 1)

        critic_value = self.critic.forward(states, actions)

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(critic_value, target)
        critic_loss.backward()
        self.critic.optimizer.step()

        # Actor objective: maximise the critic's value of the actor's actions.
        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        """Move the target networks towards the online networks.

        Every target parameter becomes
        ``tau * online + (1 - tau) * target``. Only parameters are touched;
        buffers of the target networks are left as they are, so networks
        that own buffers are handled as well.

        Args:
            tau: interpolation rate; ``None`` means ``self.tau`` and ``1``
                copies the online parameters verbatim.
        """
        if tau is None:
            tau = self.tau

        pairs = (
            (self.critic, self.target_critic),
            (self.actor, self.target_actor),
        )
        # In-place update without autograd: no graph is built and the
        # target parameter objects keep their identity.
        with T.no_grad():
            for online, target in pairs:
                target_params = dict(target.named_parameters())
                for name, param in online.named_parameters():
                    target_param = target_params[name]
                    target_param.copy_(tau * param +
                                       (1 - tau) * target_param)
class TD3Agent():
    def __init__(self, alpha, beta, input_dims, tau, env, n_actions=2,
                 gamma=0.99, update_actor_interval=2, fc1Dms=400, fc2Dms=300,
                 max_size=1000000, batch_size=100, warmup=1000, noise=0.1):
        """Store the hyper-parameters and build the memory and the six networks.

        Args:
            alpha: first argument of both ``ActorNet`` instances (presumably
                the actor learning rate -- confirm in ActorNet).
            beta: first argument of all ``CriticNet`` instances (presumably
                the critic learning rate -- confirm in CriticNet).
            input_dims: forwarded to the networks and the replay buffer.
            tau: default interpolation rate of ``update_network_parameters``.
            env: environment; only ``env.action_space.high`` / ``.low`` are read.
            n_actions: number of action components.
            gamma: factor applied to the bootstrapped value in ``learn``.
            update_actor_interval: the actor and the targets are updated on
                every ``update_actor_interval``-th ``learn`` step.
            fc1Dms, fc2Dms: forwarded to the networks.
            max_size: first argument of ``ReplayBuffer``.
            batch_size: number of transitions sampled by each ``learn`` call.
            warmup: number of ``choose_action`` calls that use random actions.
            noise: standard deviation of the exploration noise.
        """
        def build_critic(label):
            # All critics share their constructor arguments except the name.
            return CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms,
                             name=label)

        def build_actor(label):
            # Same for the two actors.
            return ActorNet(alpha, input_dims, n_actions, fc1Dms, fc2Dms,
                            name=label)

        # Plain hyper-parameters.
        self.alpha, self.beta, self.tau = alpha, beta, tau
        self.gamma = gamma
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.warmup = warmup
        self.noise = noise
        self.update_actor_iter = update_actor_interval

        # Action bounds as reported by the environment.
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low

        # Counters: learn() steps taken and choose_action() calls made.
        self.learn_step_cntr = 0
        self.time_step = 0

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        # Online networks (construction order: critics first, then the actor).
        self.critic_1 = build_critic('Critic1')
        self.critic_2 = build_critic('Critic2')
        self.actor = build_actor('actor')

        # Target networks, built in the same order.
        self.target_critic_1 = build_critic('Target_critic1')
        self.target_critic_2 = build_critic('Target_critic2')
        self.target_actor = build_actor('Target_actor')

        # tau=1 makes the targets exact copies of the online networks.
        self.update_network_parameters(tau=1)


    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward,
                                new_state, done)

    def choose_action(self, state):
        # check if we are in time after the warmup
        if self.time_step < self.warmup:
            # scale is the standard deviation
            mu = T.tensor(np.random.normal(scale=self.noise, size=(self.n_actions,)))
        else:
            # print("the warmup is done")
            state = T.tensor(state, dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state).to(self.actor.device)
            
        mu_prime = mu + T.tensor(np.random.normal(scale=self.noise),
                     dtype=T.float).to(self.actor.device)

        # we want to make sure the mu is not out of the max action the env can take
        mu_prime = T.clamp(mu_prime, self.min_action[0], self.max_action[0])
        self.time_step +=1
        
        # action.shape= (2,)
        # print(action)
        return mu_prime.cpu().detach().numpy()
    
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            # print("not learning")
            return 
        
        states, actions, rewards, new_states, dones = \
                self.memory.sampling(self.batch_size)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        dones = T.tensor(dones).to(self.actor.device)
        new_states = T.tensor(new_states, dtype=T.float).to(self.actor.device)

        # regularization
        # might breake if elements of min and max are not equal
        target_action = self.target_actor.forward(new_states) + \
                T.clamp(T.tensor(np.random.normal(scale=0.2)), -0.5, 0.5)
        target_action = T.clamp(target_action, self.min_action[0], self.max_action[0])

        target_critic1_q = self.target_critic_1.forward(new_states, target_action)
        target_critic2_q = self.target_critic_2.forward(new_states, target_action)

        target_critic1_q[dones] = 0
        target_critic2_q[dones] = 0

        target_critic1_q = target_critic1_q.view(-1)
        target_critic2_q = target_critic2_q.view(-1)

        q1 = self.critic_1.forward(states, actions)
        q2 = self.critic_2.forward(states, actions)
        

        y = rewards + self.gamma * T.min(target_critic1_q, target_critic2_q)
        y = y.view(self.batch_size, 1)

        
        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        q1_loss = F.mse_loss(y, q1)
        q2_loss = F.mse_loss(y, q2)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()

        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.learn_step_cntr +=1

        if self.learn_step_cntr % self.update_actor_iter != 0:
            return 
        # print("learning>>")
        self.actor.optimizer.zero_grad()
        actor_loss = self.critic_1.forward(states, self.actor.forward(states))
        actor_loss = -T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()
        
        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        target_actor_params = self.target_actor.named_parameters()

        critic1_params = self.critic_1.named_parameters()
        critic2_params = self.critic_2.named_parameters()
        target_critic1_params = self.target_critic_1.named_parameters()
        target_critic2_params = self.target_critic_2.named_parameters()

        critic1_state_dict = dict(critic1_params)
        critic2_state_dict = dict(critic2_params)
        target_critic1_state_dict = dict(target_critic1_params)
        target_critic2_state_dict = dict(target_critic2_params)

        actor_state_dict = dict(actor_params)
        target_actor_state_dict = dict(target_actor_params)

        # for name in target_actor_state_dict:
        #     target_actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
        #             (1-tau) * target_actor_state_dict[name].clone()

        # for name in target_critic1_state_dict:
        #     target_critic1_state_dict[name] = tau * critic1_state_dict[name].clone() +\
        #             (1-tau) * target_critic1_state_dict[name].clone()

        # for name in target_critic2_state_dict:
        #     target_critic2_state_dict[name] = tau * critic2_state_dict[name].clone() +\
        #             (1-tau) * target_critic2_state_dict[name].clone()

           
        # self.target_actor.load_state_dict(target_actor_state_dict) 
        # self.target_critic_1.load_state_dict(target_critic1_state_dict)
        # self.target_critic_2.load_state_dict(target_critic2_state_dict)


        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                    (1-tau) * target_actor_state_dict[name].clone()

        for name in critic1_state_dict:
            critic1_state_dict[name] = tau*critic1_state_dict[name].clone() + \
                    (1-tau) * target_critic1_state_dict[name].clone()

        for name in critic2_state_dict:
            critic2_state_dict[name] = tau*critic2_state_dict[name].clone() + \
                    (1-tau) * target_critic2_state_dict[name].clone()

           
        self.target_actor.load_state_dict(actor_state_dict) 
        self.target_critic_1.load_state_dict(critic1_state_dict)
        self.target_critic_2.load_state_dict(critic2_state_dict)


    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()