Example #1
class DDPGAgents():
    """ Agent that interacts with and learns from the environment """
    def __init__(self, state_size, action_size, config):
        """ Initialize an agent object """

        self.state_size = state_size
        self.action_size = action_size
        self.config = config

        # retrieve number of agents
        self.num_agents = config["DDPG"]["num_agents"]

        # logging for this class
        self.logger = logging.getLogger(self.__class__.__name__)

        # gpu support
        self.device = pick_device(config, self.logger)

        ## Actor local and target networks
        self.actor_local = Actor(state_size, action_size,
                                 config).to(self.device)
        self.actor_target = Actor(state_size, action_size,
                                  config).to(self.device)
        self.actor_optimizer = getattr(
            optim, config["optimizer_actor"]["optimizer_type"])(
                self.actor_local.parameters(),
                betas=tuple(config["optimizer_actor"]["betas"]),
                **config["optimizer_actor"]["optimizer_params"])

        ## Critic local and target networks
        self.critic_local = Critic(state_size, action_size,
                                   config).to(self.device)
        self.critic_target = Critic(state_size, action_size,
                                    config).to(self.device)
        self.critic_optimizer = getattr(
            optim, config["optimizer_critic"]["optimizer_type"])(
                self.critic_local.parameters(),
                betas=tuple(config["optimizer_critic"]["betas"]),
                **config["optimizer_critic"]["optimizer_params"])

        ## Noise process
        self.noise = OUNoise((self.num_agents, action_size))

        ## Replay memory
        self.memory = ReplayBuffer(config=config,
                                   action_size=action_size,
                                   buffer_size=int(
                                       config["DDPG"]["buffer_size"]),
                                   batch_size=config["trainer"]["batch_size"])

    def step(self, state, action, reward, next_state, done):
        """ Save experience in replay memory, 
		and use random sample from buffer to learn """

        # Save experience in replay memory shared by all agents
        for agent in range(self.num_agents):
            self.memory.add(state[agent, :], action[agent, :], reward[agent],
                            next_state[agent, :], done[agent])

        # learn every timestep as long as enough samples are available in memory
        if len(self.memory) > self.config["trainer"]["batch_size"]:
            experiences = self.memory.sample()
            self.learn(experiences, self.config["DDPG"]["gamma"])

    def act(self, states, add_noise=False):
        """ Returns actions for given state as per current policy """

        # Convert states to a tensor
        states = torch.from_numpy(states).float().to(self.device)

        # prepare actions numpy array for all agents
        actions = np.zeros((self.num_agents, self.action_size))

        ## Evaluation mode
        self.actor_local.eval()
        with torch.no_grad():
            # Forward pass of local actor network
            for agent, state in enumerate(states):
                action_values = self.actor_local.forward(
                    state).cpu().data.numpy()
                actions[agent, :] = action_values

        ## Training mode
        self.actor_local.train()
        if add_noise:
            # Add noise to improve exploration to our actor policy
            # action_values += torch.from_numpy(self.noise.sample()).type(torch.FloatTensor).to(self.device)
            actions += self.noise.sample()
        # Clip action to stay in the range [-1, 1] for our task
        actions = np.clip(actions, -1, 1)

        return actions

    def learn(self, experiences, gamma):
        """ Update value parameters using given batch of experience tuples """

        states, actions, rewards, next_states, dones = experiences

        ## Update actor (policy) network using the sampled policy gradient
        # Compute actor loss
        actions_pred = self.actor_local.forward(states)
        actor_loss = -self.critic_local.forward(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ## Update critic (value) network
        # Get predicted next-state actions and Q-values from target models
        actions_next = self.actor_target.forward(next_states)
        Q_targets_next = self.critic_target.forward(next_states, actions_next)
        # Compute Q-targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q-values from local critic model
        Q_expected = self.critic_local.forward(states, actions)
        # Compute loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        ## Update target networks with a soft update
        self.soft_update(self.actor_local, self.actor_target,
                         self.config["DDPG"]["tau"])
        self.soft_update(self.critic_local, self.critic_target,
                         self.config["DDPG"]["tau"])

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters,
		improves the stability of learning """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        """ Reset noise """
        self.noise.reset()
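
The class above assumes an external driver; a minimal training-loop sketch for DDPGAgents might look like the following. The environment wrapper API (reset/step returning per-agent arrays) and the config layout are assumptions, not part of the original code.

# Hypothetical driver for DDPGAgents; the env wrapper API and config keys are assumed.
import numpy as np

def train_ddpg(env, config, n_episodes=200, max_t=1000):
    agent = DDPGAgents(state_size=env.state_size,
                       action_size=env.action_size,
                       config=config)
    scores = []
    for episode in range(n_episodes):
        states = env.reset()                      # assumed shape: (num_agents, state_size)
        agent.reset()                             # reset the OU noise between episodes
        episode_score = np.zeros(agent.num_agents)
        for _ in range(max_t):
            actions = agent.act(states, add_noise=True)
            next_states, rewards, dones = env.step(actions)   # assumed per-agent arrays
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            episode_score += rewards
            if np.any(dones):
                break
        scores.append(float(episode_score.mean()))
    return scores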
Example #2
class TD3Agent():
    def __init__(self, env: object, gamma: float, delay_step: int, tau: float,
                 buffer_maxlen: int, noise_std: float, noise_bound: float,
                 critic_lr: float, actor_lr: float):

        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        # The Gym environment used for training and evaluation
        self.env = env
        # Get max and min values of the action of this environment
        self.action_range = [
            self.env.action_space.low, self.env.action_space.high
        ]
        # Get the dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Total_step initialization
        self.steps = 0

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.delay_step = delay_step

        # Scale and bias for the actions: each environment has a different
        # action range, so the actor's [-1, 1] output must be rescaled.
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.target_critic1 = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)
        self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.target_critic2 = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.obs_dim,
                                  self.action_dim).to(self.device)

        # copy weight parameters to the target Q network and actor network
        for target_param, param in zip(self.target_critic1.parameters(),
                                       self.critic1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_critic2.parameters(),
                                       self.critic2.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(),
                                            lr=self.critic_lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(),
                                            lr=self.critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)

    def update(self, batch_size: int, steps: int):
        self.steps = steps

        # Sampling experiences from the replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)

        # Convert numpy arrays of experience tuples into pytorch tensors
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Critic update (computing the loss)
        # Sample actions for the next states (s_t+1) using the target actor
        next_actions = self.target_actor.forward(next_states)
        next_actions = self.rescale_action(next_actions)

        # Adding gaussian noise to the actions
        noise = self.get_noise(next_actions, self.noise_std + 0.1,
                               -self.noise_bound, self.noise_bound)
        noisy_next_actions = next_actions + noise

        # Compute Q(s_t+1,a_t+1)
        next_q1 = self.target_critic1(next_states, noisy_next_actions)
        next_q2 = self.target_critic2(next_states, noisy_next_actions)

        # Choose minimum Q
        min_q = torch.min(next_q1, next_q2)

        # Find expected Q, i.e., r(t) + gamma*next_q
        expected_q = rewards + (1 - dones) * self.gamma * min_q

        # Find current Q values for the given states and actions from replay buffer
        curr_q1 = self.critic1.forward(states, actions)
        curr_q2 = self.critic2.forward(states, actions)

        # Compute loss between Q network and expected Q
        critic1_loss = F.mse_loss(curr_q1, expected_q.detach())
        critic2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # Backpropagate the losses and update Q network parameters
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()

        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # actor update (computing the loss)

        if self.steps % self.delay_step == 0:
            # Sample new actions for the current states (s_t) using the current actor
            new_actions = self.actor.forward(states)

            # Compute Q(s_t,a_t)
            new_q1 = self.critic1.forward(states, new_actions)

            # Compute the actor loss, i.e., -Q1
            actor_loss = -new_q1.mean()

            # Backpropagate the losses and update actor network parameters
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the target networks
            for target_param, param in zip(self.target_critic1.parameters(),
                                           self.critic1.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

            for target_param, param in zip(self.target_critic2.parameters(),
                                           self.critic2.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

            for target_param, param in zip(self.target_actor.parameters(),
                                           self.actor.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

    def get_noise(self, action: torch.Tensor, sigma: float, bottom: float,
                  top: float) -> torch.Tensor:
        # sigma: standard deviation of the noise
        # bottom, top: minimum and maximum values for the generated noise
        return torch.normal(torch.zeros(action.size()),
                            sigma).clamp(bottom, top).to(self.device)

    def get_action(self, state: np.ndarray, stochastic: bool) -> np.ndarray:
        # state: the state input to the pi network
        # stochastic: boolean (True -> use noisy action, False -> use noiseless,deterministic action)
        # Convert state numpy to tensor
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)

        if stochastic:
            # Add gaussian noise to the rescaled action
            action = self.rescale_action(action) + self.get_noise(
                action, self.noise_std, -self.noise_bound, self.noise_bound)
        else:
            action = self.rescale_action(action)

        # Convert action tensor to numpy
        action = action.squeeze(0).cpu().detach().numpy()
        return action

    def rescale_action(self, action: torch.Tensor) -> torch.Tensor:
        # Rescale the action: the actor network outputs values in [-1, 1], while
        # MuJoCo environments may use an arbitrary action range [-n, n].
        # scale -> scalar multiplication
        # bias -> scalar offset
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # Save the actor model for each node
        print("Save the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"
        torch.save(self.actor.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # Load the actor model for each node
        print("load the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"  # Best
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor.load_state_dict(torch.load(savePath))
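
A sketch of how this TD3Agent could be driven, assuming a Gym-style env and that `BasicBuffer` exposes `push(...)` and `__len__` (both assumptions, since the buffer is not shown here):

# Hypothetical TD3 training loop; the replay-buffer method names are assumed.
import numpy as np

def train_td3(agent, max_steps=100_000, batch_size=100, warmup=1_000):
    state = agent.env.reset()
    for step in range(1, max_steps + 1):
        action = agent.get_action(np.asarray(state), stochastic=True)
        next_state, reward, done, _ = agent.env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)  # assumed API
        state = agent.env.reset() if done else next_state
        if len(agent.replay_buffer) > max(batch_size, warmup):
            agent.update(batch_size, step)
    return agent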
Example #3
def main():
    env = gym.make('InvertedPendulum-v2')
    # states: [x, theta, x', theta']
    # action: [horizontal force]
    nstates = 4
    nactions = 1

    T = 2048  # environment steps per update
    batch_size = 64
    epochs = 10
    lr = 0.01
    discount = 0.99
    clipping_epsilon = 0.2
    lam = 0.95  # GAE parameter
    max_ep_length = 1000  # maximum episode length per rollout
    total_timesteps = 1000000

    actor = Actor(nstates, nactions)
    critic = Critic(nstates)

    n_updates = total_timesteps // T
    if total_timesteps % T != 0:
        n_updates += 1

    n_batches_per_update = T // batch_size
    if T % batch_size != 0:
        n_batches_per_update += 1

    episode_rewards = []
    actor_losses = []
    critic_losses = []

    for update in tqdm(range(n_updates)):
        states, actions, rewards, dones, values, log_probs, ep_rewards = rollout(
            env, actor, critic, T, nstates, max_ep_length)

        episode_rewards += ep_rewards

        advantages, returns = get_advantages_and_returns(
            dones, rewards, values, discount, lam, T)

        idx = np.arange(T)

        for k in range(epochs):
            np.random.default_rng().shuffle(idx)

            for b in range(n_batches_per_update):
                batch_idx = idx[b * batch_size:(b + 1) * batch_size]
                batch_states = states[batch_idx]
                batch_actions = actions[batch_idx]
                batch_log_probs = log_probs[batch_idx]
                batch_A = advantages[batch_idx]
                batch_returns = returns[batch_idx]

                _, current_log_probs = actor.forward(batch_states,
                                                     batch_actions,
                                                     requires_grad=True)
                ratios = np.exp(current_log_probs - batch_log_probs)
                clipped_ratios = np.minimum(
                    1 + clipping_epsilon,
                    np.maximum(1 - clipping_epsilon, ratios))

                unclipped_surrogate = ratios * batch_A
                clipped_surrogate = clipped_ratios * batch_A
                actor_loss = -np.minimum(unclipped_surrogate,
                                         clipped_surrogate).mean()

                current_state_values = critic.forward(batch_states,
                                                      requires_grad=True)
                critic_loss = ((current_state_values -
                                batch_returns)**2).mean()

                # derivative of actor_loss w.r.t current_log_probs
                dAL_dlp = -unclipped_surrogate
                # derivative of clipped_ratios w.r.t ratios
                dcr_dr = np.zeros_like(ratios)
                dcr_dr[(ratios < 1 + clipping_epsilon)
                       & (ratios > 1 - clipping_epsilon)] = 1.0
                # only include the derivative of the clipped_ratio if the clipped_ratio was used
                clipped_used_idx = clipped_surrogate < unclipped_surrogate
                dAL_dlp[clipped_used_idx] *= dcr_dr[clipped_used_idx]

                # derivative of critic_loss w.r.t current_state_values
                dCL_dsv = current_state_values - batch_returns

                actor.backward(dAL_dlp)
                critic.backward(dCL_dsv)

                actor.optimization_step(lr)
                critic.optimization_step(lr)

                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

    env.close()

    fig, ax = plt.subplots()
    ax.plot(moving_average(episode_rewards, 100))
    plt.show()
    plt.close()

    fig, ax = plt.subplots()
    ax.plot(moving_average(critic_losses, 10))
    plt.show()
    plt.close()
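
`rollout`, `get_advantages_and_returns`, and `moving_average` are not shown above. As an illustration, a NumPy GAE(λ) helper consistent with the call in the loop might look like this; it assumes `values` carries one extra bootstrap entry (length T + 1), which is an assumption about the missing `rollout`.

import numpy as np

def get_advantages_and_returns(dones, rewards, values, discount, lam, T):
    # Generalized Advantage Estimation (GAE); values is assumed to have length T + 1,
    # with values[T] the bootstrap value of the state reached after the last step.
    advantages = np.zeros(T)
    last_adv = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + discount * values[t + 1] * nonterminal - values[t]
        last_adv = delta + discount * lam * nonterminal * last_adv
        advantages[t] = last_adv
    returns = advantages + values[:T]
    # Normalizing advantages is optional but common.
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages, returns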
Example #4
class AgentDDPG:
    def __init__(self, params):

        action_size = params['action_size']
        state_size = params['state_size']
        buf_params = params['buf_params']

        nn_params = params['nn_params']
        nn_params['nn_actor']['l1'][0] = state_size
        nn_params['nn_actor']['l3'][1] = action_size
        nn_params['nn_critic']['l1'][0] = state_size + action_size

        self.__actor_local = Actor(nn_params['nn_actor']).to(device)
        self.__actor_target = Actor(nn_params['nn_actor']).to(device)
        self.__critic_local = Critic(nn_params['nn_critic']).to(device)
        self.__critic_target = Critic(nn_params['nn_critic']).to(device)

        self.__action_size = action_size
        self.__state_size = state_size
        self.__memory = ReplayBuffer(buf_params)
        self.__t = 0

        self.gamma = params['gamma']
        self.learning_rate_actor = params['learning_rate_actor']
        self.learning_rate_critic = params['learning_rate_critic']
        self.tau = params['tau']

        self.__optimiser_actor = optim.Adam(self.__actor_local.parameters(),
                                            self.learning_rate_actor)
        self.__optimiser_critic = optim.Adam(self.__critic_local.parameters(),
                                             self.learning_rate_critic)
        self.__uo_process = UOProcess()
        # other parameters
        self.agent_loss = 0.0

    # Set methods
    def set_learning_rate(self, lr_actor, lr_critic):
        self.learning_rate_actor = lr_actor
        self.learning_rate_critic = lr_critic
        for param_group in self.__optimiser_actor.param_groups:
            param_group['lr'] = lr_actor
        for param_group in self.__optimiser_critic.param_groups:
            param_group['lr'] = lr_critic

    # Get methods
    def get_actor(self):
        return self.__actor_local

    def get_critic(self):
        return self.__critic_local

    # Other methods
    def step(self, state, action, reward, next_state, done):
        # add experience to memory
        self.__memory.add(state, action, reward, next_state, done)

        if self.__memory.is_ready():
            experiences = self.__memory.sample()
            self.__update(experiences)

    def choose_action(self, state, mode='train'):
        if mode == 'train':
            # state should be transformed to a tensor
            state = torch.from_numpy(
                np.array(state)).float().unsqueeze(0).to(device)
            self.__actor_local.eval()
            with torch.no_grad():
                action = self.__actor_local(state) + self.__uo_process.sample()
            self.__actor_local.train()
            return list(np.clip(action.cpu().numpy().squeeze(), -1, 1))
        elif mode == 'test':
            # state should be transformed to a tensor
            state = torch.from_numpy(
                np.array(state)).float().unsqueeze(0).to(device)
            self.__actor_local.eval()
            with torch.no_grad():
                action = self.__actor_local(state)
            self.__actor_local.train()
            return list(np.clip(action.cpu().numpy().squeeze(), -1, 1))
        else:
            print("Invalid mode value")

    def reset(self, sigma):
        self.__uo_process.reset(sigma)

    def __update(self, experiences):

        states, actions, rewards, next_states, dones = experiences

        # update critic
        # ----------------------------------------------------------
        loss_fn = nn.MSELoss()
        self.__optimiser_critic.zero_grad()
        # form target
        next_actions = self.__actor_target(next_states)
        Q_target_next = self.__critic_target.forward(
            torch.cat((next_states, next_actions), dim=1)).detach()
        targets = rewards + self.gamma * Q_target_next * (1 - dones)
        # form output
        outputs = self.__critic_local.forward(
            torch.cat((states, actions), dim=1))
        mean_loss_critic = loss_fn(
            outputs, targets)  # MSE between the local critic output and the TD target
        mean_loss_critic.backward()
        self.__optimiser_critic.step()

        # update actor
        # ----------------------------------------------------------
        self.__optimiser_actor.zero_grad()
        predicted_actions = self.__actor_local(states)
        mean_loss_actor = -self.__critic_local.forward(
            torch.cat((states, predicted_actions), dim=1)).mean()
        mean_loss_actor.backward()
        self.__optimiser_actor.step()  # update actor

        self.__soft_update(self.__critic_local, self.__critic_target, self.tau)
        self.__soft_update(self.__actor_local, self.__actor_target, self.tau)

    @staticmethod
    def __soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
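
`UOProcess` is referenced but not defined in this snippet. A minimal Ornstein-Uhlenbeck process compatible with the `sample()` and `reset(sigma)` calls above could look like the following; the default parameters and the noise dimension are assumptions.

import numpy as np

class UOProcess:
    """Ornstein-Uhlenbeck noise: dx = theta * (mu - x) + sigma * N(0, 1)."""
    def __init__(self, size=1, mu=0.0, theta=0.15, sigma=0.2):
        self.size, self.mu, self.theta, self.sigma = size, mu, theta, sigma
        self.state = np.ones(self.size) * self.mu

    def reset(self, sigma=None):
        if sigma is not None:
            self.sigma = sigma
        self.state = np.ones(self.size) * self.mu

    def sample(self):
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(self.size)
        self.state = self.state + dx
        return self.state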
Example #5
class AgentDDPG:
    """Deep Deterministic Policy Gradient implementation for continuous action space reinforcement learning tasks"""
    def __init__(self,
                 state_size,
                 hidden_size,
                 action_size,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 gamma=0.99,
                 tau=1e-2,
                 use_cuda=False,
                 actor_path=None,
                 critic_path=None):
        # Params
        self.state_size, self.hidden_size, self.action_size = state_size, hidden_size, action_size
        self.gamma, self.tau = gamma, tau
        self.use_cuda = use_cuda

        # Networks
        self.actor = Actor(state_size, hidden_size, action_size)
        self.actor_target = Actor(state_size, hidden_size, action_size)

        self.critic = Critic(state_size + action_size, hidden_size,
                             action_size)
        self.critic_target = Critic(state_size + action_size, hidden_size,
                                    action_size)

        # Load model state_dicts from saved file
        if actor_path and path.exists(actor_path):
            self.actor.load_state_dict(torch.load(actor_path))

        if critic_path and path.exists(critic_path):
            self.critic.load_state_dict(torch.load(critic_path))

        # Hard copy params from original networks to target networks
        copy_params(self.actor, self.actor_target)
        copy_params(self.critic, self.critic_target)

        if self.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        # Create replay buffer for storing experience
        self.replay_buffer = ReplayBuffer(cache_size=int(1e6))

        # Training
        self.critic_criterion = nn.MSELoss()
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)

    def save_to_file(self, actor_file, critic_file):
        # Save the state_dict's of the Actor and Critic networks
        torch.save(self.actor.state_dict(), actor_file)
        torch.save(self.critic.state_dict(), critic_file)

    def get_action(self, state):
        """Select action with respect to state according to current policy and exploration noise"""
        state = torch.from_numpy(state).float()

        if self.use_cuda:
            state = state.cuda()

        a = self.actor.forward(state)

        if self.use_cuda:
            return a.detach().cpu().numpy()

        return a.detach().numpy()

    def save_experience(self, state_t, action_t, reward_t, state_t1):
        self.replay_buffer.add_sample(state_t, action_t, reward_t, state_t1)

    def update(self, batch_size):
        states, actions, rewards, next_states = self.replay_buffer.get_samples(
            batch_size)
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)

        if self.use_cuda:
            states = states.cuda()
            next_states = next_states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()

        # Critic loss
        Qvals = self.critic.forward(states, actions)
        next_actions = self.actor_target.forward(next_states)
        next_Q = self.critic_target.forward(next_states, next_actions.detach())
        Qprime = rewards + self.gamma * next_Q
        critic_loss = self.critic_criterion(Qvals, Qprime)

        # Update critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss
        policy_loss = -self.critic.forward(states,
                                           self.actor.forward(states)).mean()

        # Update actor
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        soft_copy_params(self.actor, self.actor_target, self.tau)
        soft_copy_params(self.critic, self.critic_target, self.tau)

    def add_noise_to_weights(self, amount=0.1):
        self.actor.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.actor_target.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic_target.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
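
`copy_params` and `soft_copy_params` are called above but not defined in this snippet; sketches consistent with those call sites might be:

def copy_params(source, target):
    # Hard copy: target <- source (used once, right after construction).
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_copy_params(source, target, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)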
Example #6
class TD3Agent:
    def __init__(self, env, gamma, tau, buffer_maxlen, delay_step, noise_std,
                 noise_bound, critic_lr, actor_lr):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.update_step = 0
        self.delay_step = delay_step

        # initialize actor and critic networks
        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic1_target = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)
        self.critic2_target = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim,
                                  self.action_dim).to(self.device)

        # Copy critic target parameters
        for target_param, param in zip(self.critic1_target.parameters(),
                                       self.critic1.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.critic2_target.parameters(),
                                       self.critic2.parameters()):
            target_param.data.copy_(param.data)

        # initialize optimizers
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(),
                                            lr=critic_lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(),
                                            lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()

        return action

    def update(self, batch_size):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(
            batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        action_space_noise = self.generate_action_space_noise(action_batch)
        next_actions = self.actor_target.forward(
            next_state_batch) + action_space_noise
        next_Q1 = self.critic1_target.forward(next_state_batch, next_actions)
        next_Q2 = self.critic2_target.forward(next_state_batch, next_actions)
        expected_Q = reward_batch + self.gamma * torch.min(next_Q1, next_Q2)

        # critic loss
        curr_Q1 = self.critic1.forward(state_batch, action_batch)
        curr_Q2 = self.critic2.forward(state_batch, action_batch)
        critic1_loss = F.mse_loss(curr_Q1, expected_Q.detach())
        critic2_loss = F.mse_loss(curr_Q2, expected_Q.detach())

        # update critics
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()

        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # delayed update for actor & target networks
        if (self.update_step % self.delay_step == 0):
            # actor
            self.actor_optimizer.zero_grad()
            policy_gradient = -self.critic1(state_batch,
                                            self.actor(state_batch)).mean()
            policy_gradient.backward()
            self.actor_optimizer.step()

            # target networks
            self.update_targets()

        self.update_step += 1

    def generate_action_space_noise(self, action_batch):
        noise = torch.normal(torch.zeros(action_batch.size()),
                             self.noise_std).clamp(-self.noise_bound,
                                                   self.noise_bound).to(
                                                       self.device)
        return noise

    def update_targets(self):
        for target_param, param in zip(self.critic1_target.parameters(),
                                       self.critic1.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.critic2_target.parameters(),
                                       self.critic2.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))
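
`BasicBuffer` is not shown in either TD3 snippet. A minimal deque-based replay buffer matching the `sample(batch_size)` call above (state/action/reward/next-state/done batches, with rewards returned as column vectors so they broadcast against the (batch, 1) Q-values) could be:

import random
from collections import deque

import numpy as np

class BasicBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        # Rewards as a column vector so they broadcast against (batch, 1) Q-values.
        return states, actions, rewards.reshape(-1, 1), next_states, dones

    def __len__(self):
        return len(self.buffer)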
Example #7
class Agent:
    def __init__(self, env, env_params, args, models=None,
                 record_episodes=[0, .1, .25, .5, .75, 1.]):
        self.env = env
        self.env_params = env_params
        self.args = args


        # networks
        if models is None:
            self.actor = Actor(self.env_params).double()
            self.critic = Critic(self.env_params).double()
        else:
            self.actor, self.critic = self.LoadModels()
        # target networks used to compute stable training targets
        self.actor_target = Actor(self.env_params).double()
        self.critic_target = Critic(self.env_params).double()

        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        if self.args.cuda:
            self.actor.cuda()
            self.critic.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()


        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=0.001)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=0.001)

        self.normalize = Normalizer(env_params,self.args.gamma)
        self.buffer = ReplayBuffer(1_000_000, self.env_params)
        self.tensorboard = ModifiedTensorBoard(log_dir = f"logs")
        self.record_episodes = [int(eps * self.args.n_epochs) for eps in record_episodes]

    def ModelsEval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def ModelsTrain(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def GreedyAction(self, state):
        self.ModelsEval()
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0)
            if self.args.cuda:
                state = state.cuda()
            action = self.actor.forward(state).detach().cpu().numpy().squeeze()
        return action

    def NoiseAction(self, state):
        self.ModelsEval()
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0)
            if self.args.cuda:
                state = state.cuda()
            action = self.actor.forward(state).detach().cpu().numpy()
            action += self.args.noise_eps * self.env_params['max_action'] * np.random.randn(*action.shape)
            action = np.clip(action, -self.env_params['max_action'], self.env_params['max_action'])
        return action.squeeze()

    def Update(self):
        self.ModelsTrain()
        for i in range(self.args.n_batch):
            state, a_batch, r_batch, nextstate, d_batch = self.buffer.SampleBuffer(self.args.batch_size)
            a_batch = torch.tensor(a_batch,dtype=torch.double)
            r_batch = torch.tensor(r_batch,dtype=torch.double)
            # d_batch = torch.tensor(d_batch,dtype=torch.double)
            state = torch.tensor(state,dtype=torch.double)
            nextstate = torch.tensor(nextstate,dtype=torch.double)
            # d_batch = 1 - d_batch

            if self.args.cuda:
                a_batch = a_batch.cuda()
                r_batch = r_batch.cuda()
                # d_batch = d_batch.cuda()
                state = state.cuda()
                nextstate = nextstate.cuda()

            with torch.no_grad():
                action_next = self.actor_target.forward(nextstate)
                q_next = self.critic_target.forward(nextstate,action_next)
                q_next = q_next.detach().squeeze()
                q_target = r_batch + self.args.gamma * q_next
                q_target = q_target.detach().squeeze()

            q_prime = self.critic.forward(state, a_batch).squeeze()
            critic_loss = F.mse_loss(q_target, q_prime)

            action = self.actor.forward(state)
            actor_loss = -self.critic.forward(state, action).mean()
            # params = torch.cat([x.view(-1) for x in self.actor.parameters()])
            # l2_reg = self.args.l2_norm *torch.norm(params,2)
            # actor_loss += l2_reg

            self.actor_optim.zero_grad()
            actor_loss.backward()
            self.actor_optim.step()

            self.critic_optim.zero_grad()
            critic_loss.backward()
            self.critic_optim.step()

        self.SoftUpdateTarget(self.critic, self.critic_target)
        self.SoftUpdateTarget(self.actor, self.actor_target)

    def Explore(self):
        for epoch in range(self.args.n_epochs +1):
            start_time = time.process_time()
            for cycle in range(self.args.n_cycles):
                for _ in range(self.args.num_rollouts_per_mpi):
                    state = self.env.reset()
                    for t in range(self.env_params['max_timesteps']):
                        action = self.NoiseAction(state)
                        nextstate, reward, done, info = self.env.step([action])
                        nextstate = nextstate.squeeze()
                        reward = self.normalize.normalize_reward(reward)
                        self.buffer.StoreTransition(state, action, reward, nextstate, done)
                        state = nextstate
                    self.Update()
            avg_reward = self.Evaluate()
            self.tensorboard.step = epoch
            elapsed_time = time.process_time() - start_time
            print(f"Epoch {epoch} of total of {self.args.n_epochs +1} epochs, average reward is: {avg_reward}.\
                    Elapsedtime: {int(elapsed_time /60)} minutes {int(elapsed_time %60)} seconds")
            if epoch % 5 or epoch + 1 == self.args.n_epochs:
                self.SaveModels(epoch)
                self.record(epoch)


    def Evaluate(self):
        self.ModelsEval()
        total_reward = []
        episode_reward = 0
        success_rate = []
        for episode in range(self.args.n_evaluate):
            state = self.env.reset()
            episode_reward = 0
            for t in range(self.env_params['max_timesteps']):
                action = self.GreedyAction(state)
                nextstate, reward, done, info = self.env.step([action])
                episode_reward += reward
                state = nextstate
                if done or t + 1 == self.env_params['max_timesteps']:
                    total_reward.append(episode_reward)
                    episode_reward = 0

        average_reward = sum(total_reward)/len(total_reward)
        min_reward = min(total_reward)
        max_reward = max(total_reward)
        self.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward)
        return average_reward

    def record(self, epoch):
        self.ModelsEval()
        try:
            if not os.path.exists("videos"):
                os.mkdir('videos')
            recorder = VideoRecorder(self.env, path=f'videos/epoch-{epoch}.mp4')
            for _ in range(self.args.n_record):
                done = False
                state = self.env.reset()
                while not done:
                    recorder.capture_frame()
                    action = self.GreedyAction(state)
                    nextstate, reward, done, info = self.env.step([action])
                    state = nextstate
            recorder.close()
        except Exception as e:
            print(e)

    def SaveModels(self, ep):
        if not os.path.exists("models"):
            os.mkdir('models')
        torch.save(self.actor.state_dict(), os.path.join('models', 'Actor.pt'))
        torch.save(self.critic.state_dict(), os.path.join('models', 'Critic.pt'))

    def LoadModels(self,
                   actorpath=os.path.join('models', 'Actor.pt'),
                   criticpath=os.path.join('models', 'Critic.pt')):
        actor = Actor(self.env_params).double()
        critic = Critic(self.env_params).double()
        actor.load_state_dict(torch.load(actorpath))
        critic.load_state_dict(torch.load(criticpath))
        return actor, critic

    def SoftUpdateTarget(self, source, target):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data)
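
A hypothetical entry point for the Agent class above; the env_params keys and the argparse-style fields simply mirror what the class reads, and all concrete values (including the environment name) are assumptions.

import argparse
import gym

def main():
    env = gym.make('Pendulum-v1')   # any continuous-control task; the name is an assumption
    env_params = {
        'max_action': float(env.action_space.high[0]),
        'max_timesteps': 200,
        # Actor/Critic may need additional keys (e.g. observation size) not shown here.
    }
    args = argparse.Namespace(
        cuda=False, gamma=0.98, polyak=0.95, noise_eps=0.2,
        n_epochs=50, n_cycles=10, num_rollouts_per_mpi=2,
        n_batch=40, batch_size=256, n_evaluate=10, n_record=1,
    )
    agent = Agent(env, env_params, args)
    agent.Explore()

if __name__ == '__main__':
    main()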
Example #8
class DDPGAgent:
    def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate,
                 actor_learning_rate):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize actor and critic networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim,
                                    self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim,
                                  self.action_dim).to(self.device)

        # Copy critic target parameters
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        self.noise = OUNoise(self.env.action_space)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()

        return action

    def update(self, batch_size):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(
            batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch,
                                            next_actions.detach())
        expected_Q = reward_batch + self.gamma * next_Q

        # update critic
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())

        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # update actor
        policy_loss = -self.critic.forward(
            state_batch, self.actor.forward(state_batch)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))
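
`OUNoise` is constructed from the action space here but not defined (and not used in the methods shown). A sketch of an action-space-aware OU noise class it might correspond to, with assumed default parameters:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck exploration noise scaled to a Gym action space."""
    def __init__(self, action_space, mu=0.0, theta=0.15, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.low, self.high = action_space.low, action_space.high
        self.dim = action_space.shape[0]
        self.reset()

    def reset(self):
        self.state = np.ones(self.dim) * self.mu

    def get_action(self, action):
        # Evolve the OU state, add it to the action, and clip to the valid range.
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(self.dim)
        self.state = self.state + dx
        return np.clip(action + self.state, self.low, self.high)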
Example #9
class Agents:
    def __init__(self, params):

        action_size = params['action_size']
        state_size = params['state_size']
        buf_params = params['buf_params']
        num_agents = params['num_of_agents']

        nn_params = params['nn_params']
        nn_params['nn_actor']['l1'][0] = state_size
        nn_params['nn_actor']['l3'][1] = action_size
        nn_params['nn_critic']['l1'][0] = (state_size + action_size) * num_agents

        self.__actors_local = [Actor(nn_params['nn_actor']).to(device), Actor(nn_params['nn_actor']).to(device)]
        self.__actors_target = [Actor(nn_params['nn_actor']).to(device), Actor(nn_params['nn_actor']).to(device)]
        self.__critic_local = Critic(nn_params['nn_critic']).to(device)
        self.__critic_target = Critic(nn_params['nn_critic']).to(device)

        self.__action_size = action_size
        self.__state_size = state_size
        self.__num_agents = num_agents
        self.__memory = ReplayBuffer(buf_params)
        self.__t = 0

        self.gamma = params['gamma']
        self.learning_rate_actor = params['learning_rate_actor']
        self.learning_rate_critic = params['learning_rate_critic']
        self.tau = params['tau']

        self.__optimisers_actor = [optim.Adam(self.__actors_local[0].parameters(), self.learning_rate_actor),
                                   optim.Adam(self.__actors_local[1].parameters(), self.learning_rate_actor)]
        self.__optimiser_critic = optim.Adam(self.__critic_local.parameters(), self.learning_rate_critic)
        self.__uo_process = UOProcess(shape=(self.__num_agents, self.__action_size))
        # other parameters
        self.agent_loss = 0.0

    # Set methods
    def set_learning_rate(self, lr_actor, lr_critic):
        self.learning_rate_actor = lr_actor
        self.learning_rate_critic = lr_critic
        for n in range(self.__num_agents):
            for param_group in self.__optimisers_actor[n].param_groups:
                param_group['lr'] = lr_actor
        for param_group in self.__optimiser_critic.param_groups:
            param_group['lr'] = lr_critic

    # Get methods
    def get_actor(self):
        return self.__actors_local

    def get_critic(self):
        return self.__critic_local

    # Other methods
    def step(self, state, action, reward, next_state, done):
        # add experience to memory
        self.__memory.add(state, action, reward, next_state, done)

        if self.__memory.is_ready():
            self.__update()

    def choose_action(self, states, mode='train'):
        if mode == 'train':
            # state should be transformed to a tensor
            states = torch.from_numpy(np.array(states)).float().to(device)
            actions = np.zeros((self.__num_agents, self.__action_size))
            for i, actor in enumerate(self.__actors_local):
                state = states[i, :]
                actor.eval()
                with torch.no_grad():
                    action = actor(state)
                actor.train()
                actions[i, :] = action.cpu().numpy()
            actions += np.array(self.__uo_process.sample())
            return np.clip(actions, -1, 1)
        elif mode == 'test':
            # state should be transformed to a tensor
            states = torch.from_numpy(np.array(states)).float().to(device)
            actions = np.zeros((self.__num_agents, self.__action_size))
            for i, actor in enumerate(self.__actors_local):
                state = states[i, :]
                actor.eval()
                with torch.no_grad():
                    action = actor(state)
                actions[i, :] = action.cpu().numpy()
            return np.clip(actions, -1, 1)
        else:
            print("Invalid mode value")

    def reset(self, sigma):
        self.__uo_process.reset(sigma)

    def __update(self):

        for i in range(self.__num_agents):

            # update critic
            # ----------------------------------------------------------
            #
            states, actions, rewards, next_states, dones = self.__memory.sample()

            states_i = states[:, i, :]
            actions_i = actions[:, i, :]
            rewards_i = rewards[:, i]
            next_states_i = next_states[:, i, :]
            dones_i = dones[:, i]

            loss_fn = nn.MSELoss()
            self.__optimiser_critic.zero_grad()

            # form target
            next_states_actions = torch.cat((next_states[:, 0, :], next_states[:, 1, :],
                                             self.__actors_target[0].forward(next_states[:, 0, :]),
                                             self.__actors_target[1].forward(next_states[:, 1, :])), dim=1)
            Q_target_next = self.__critic_target.forward(next_states_actions).detach()
            targets = (rewards_i + self.gamma * Q_target_next[:, i] * (1 - dones_i))

            # form output
            states_actions = torch.cat((states[:, 0, :], states[:, 1, :],
                                        actions[:, 0, :], actions[:, 1, :]), dim=1)
            outputs = self.__critic_local.forward(states_actions)
            mean_loss_critic = loss_fn(outputs[:, i], targets)  # MSE between local critic output and TD target
            mean_loss_critic.backward()
            self.__optimiser_critic.step()

            # update actor
            # ----------------------------------------------------------
            self.__optimisers_actor[i].zero_grad()
            predicted_actions = copy.copy(actions)
            predicted_actions[:, i, :] = self.__actors_local[i](states_i)
            mean_loss_actor = - self.__critic_local.forward(torch.cat((states[:, 0, :], states[:, 1, :],
                                                                       predicted_actions[:, 0, :],
                                                                       predicted_actions[:, 1, :]), dim=1))[:, i].mean()
            mean_loss_actor.backward()
            self.__optimisers_actor[i].step()   # update actor

            self.__soft_update(self.__critic_local, self.__critic_target, self.tau)
            self.__soft_update(self.__actors_local[i], self.__actors_target[i], self.tau)

    @staticmethod
    def __soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
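
The layout of `params` can be inferred from the constructor; a hypothetical configuration for the two-agent setup is shown below. The hidden-layer sizes, the buffer keys, and the per-layer list format consumed by `Actor`/`Critic` are assumptions.

params = {
    'state_size': 24,
    'action_size': 2,
    'num_of_agents': 2,
    'gamma': 0.99,
    'tau': 1e-3,
    'learning_rate_actor': 1e-4,
    'learning_rate_critic': 1e-3,
    'buf_params': {'buffer_size': int(1e6), 'batch_size': 256},   # keys assumed
    'nn_params': {
        # Assumed format: [in_features, out_features] per layer; the l1/l3 entries
        # touched in __init__ are overwritten with the state/action sizes.
        'nn_actor': {'l1': [None, 128], 'l2': [128, 128], 'l3': [128, None]},
        'nn_critic': {'l1': [None, 128], 'l2': [128, 128], 'l3': [128, 2]},
    },
}
agents = Agents(params)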
Example #10
class RaCT():
    def __init__(self, M, eh1, eh2, dh2, ci, lr_ac=0.001, lr_cr=0.001):
        ## Network initializations
        # Actor
        self.actor = VAE(
            M, eh1, eh2, dh2
        )  # Number of inputs, units in encoder_hidden_layer1, encoder_hidden_layer2,
        #decoder_hidden_layer1
        # Critic
        self.critic = Critic(ci)  # Length of feature vector
        # Optimizers
        self.optim_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_ac)
        self.optim_critic = torch.optim.Adam(self.critic.parameters(),
                                             lr=lr_cr)
        self.mse = torch.nn.MSELoss()

    def pretrain_actor(self,
                       X,
                       batch_size,
                       beta_max,
                       epochs,
                       epochs_annealing,
                       val_set,
                       masked=True):
        '''
        Pretraining of the actor using the MLE cost = NLL + beta * KL.

        Minimizing the NLL maximizes the probability of the interactions that are 1 in the input
        under the reconstruction.
        KL: regularization term that keeps the distribution of z close to the prior.

        X: interaction matrix, training dataset
        beta_max: maximum beta reached after annealing
        epochs: total number of epochs
        epochs_annealing: number of annealing epochs; since beta_max is fixed, this controls how quickly beta grows
        val_set: validation set

        masked: controls the training task. If True, only a partial history is given to the actor and only
        unobserved interactions are considered in the NLL. The proposed method is not clearly described in the paper.
        '''
        beta = 0
        beta_increase = beta_max / epochs_annealing  # Linear Growth of Beta
        for epoch in range(epochs):
            self.optim_actor.zero_grad()
            ## Sample a batch
            batch_ind = np.random.choice(X.shape[0], batch_size)
            xbatch = X[batch_ind, :]
            xbatch = torch.tensor(
                xbatch.toarray(),
                dtype=torch.float32)  # Scipy Sparse to Tensor Dense

            ## UNMASKED
            if not masked:
                xlog, KL = self.actor.forward(xbatch)
                nll = -torch.mean(xlog * xbatch, dim=1)
                elbo_beta = torch.mean(nll + beta * KL)

            ## MASKED
            else:
                # Sample masks
                mask, xbatch_masked = self.mask(xbatch)
                xbatch_reverse_masked = xbatch * (1 - mask)
                xlog, KL = self.actor.forward(xbatch_masked)
                nll = -torch.mean(xlog * xbatch_reverse_masked, dim=1)
                elbo_beta = torch.mean(nll + beta * KL)

            print('NLL : ', torch.mean(nll.detach()))
            print('Elbo : ', elbo_beta.detach())
            elbo_beta.backward()
            self.optim_actor.step()  # Update the actor
            if epoch < epochs_annealing:
                beta = beta + beta_increase
            if epoch % 20 == 0:
                self.evaluate(val_set)

    def pretrain_critic(self, X, batch_size, epochs):
        '''
        Pretraining of the critic using the MSE between the critic's score predictions and NDCG@100.
        The critic learns to approximate NDCG.
        There is no unmasked option here, since NDCG only accounts for unobserved interactions.
        '''
        for epoch in range(epochs):
            self.optim_actor.zero_grad()
            self.optim_critic.zero_grad()
            # Sample a batch
            batch_ind = np.random.choice(X.shape[0], batch_size)
            xbatch_spr = X[batch_ind, :]
            xbatch = torch.tensor(xbatch_spr.toarray(), dtype=torch.float32)
            # Prepare masks
            mask, xbatch_masked = self.mask(xbatch)
            xbatch_reverse_masked = xbatch * (1 - mask)
            # Find score prediction of critic given masked input
            xlog, KL = self.actor.forward(xbatch_masked)
            nll = -torch.mean(xlog * xbatch_reverse_masked, dim=1)
            score_pred = self.critic.forward(xbatch, nll, mask)

            ## Following the reference implementation: 1st arg = prediction,
            # 2nd arg = reverse-masked input, 4th arg = masked input
            ndcg = NDCG_binary_at_k_batch(xlog.detach().numpy(),
                                          xbatch_reverse_masked, 100,
                                          xbatch_masked)
            ndcg = torch.tensor(ndcg.reshape(-1, 1), dtype=torch.float32)
            print('NDCG mean :', torch.mean(ndcg))
            mse_loss = self.mse(
                score_pred,
                ndcg)  ## Minimize the difference between Critic and NDCG
            print('MSE : ', mse_loss.detach())
            mse_loss.backward()
            self.optim_critic.step()

    def alternative_training(self,
                             X,
                             batch_size,
                             beta,
                             epochs,
                             recalculate_actor=False):
        '''
        Train the actor and the critic together. Repeat the following for `epochs` iterations.

        1. Train the actor to maximize the score of its predictions, using the critic as a
           differentiable (and hopefully accurate) metric.
        2. Train the critic with an MSE cost against NDCG, so that it can still predict the score of
           the distributions produced by the updated actor.

        Note that in the tests this stage is quite unstable; unlucky seeds can collapse
        the whole training run.

        TODO: work on the instability.

        recalculate_actor: experimental parameter for the critic phase. If True, rebuild the actor's
        computation graph when training the critic; if False, reuse the results from the actor phase as constants.
        '''
        for epoch in range(epochs):
            # Sample a batch. Will use the same batch for both phases.
            batch_ind = np.random.choice(X.shape[0], batch_size)
            xbatch_spr = X[batch_ind, :]
            xbatch = torch.tensor(xbatch_spr.toarray(), dtype=torch.float32)
            # Mask it
            mask, xbatch_masked = self.mask(xbatch)
            xbatch_reverse_masked = xbatch * (1 - mask)
            ### Actor Phase
            self.optim_actor.zero_grad()
            self.optim_critic.zero_grad()
            xlog, KL = self.actor.forward(xbatch_masked)
            nll = -torch.mean(xlog * xbatch_reverse_masked, dim=1)
            actor_loss = -self.critic.forward(
                xbatch, nll, mask).mean()  # Use -critic_score as the loss,
            # i.e. maximize the critic score
            actor_loss.backward()
            self.optim_actor.step()
            print('Actor loss ', epoch, ' : ', actor_loss.detach())
            print('NLL : ', torch.mean(nll.detach()))

            ### Critic Phase
            self.optim_actor.zero_grad()
            self.optim_critic.zero_grad()

            if recalculate_actor:
                xlog, KL = self.actor.forward(xbatch)
                nll = -torch.mean(xlog * xbatch, dim=1)
            else:
                nll.detach_()
            score_pred = self.critic.forward(xbatch, nll, mask)
            ndcg = NDCG_binary_at_k_batch(xlog.detach().numpy(),
                                          xbatch_reverse_masked, 100,
                                          xbatch_masked)
            ndcg = torch.tensor(ndcg.reshape(-1, 1), dtype=torch.float32)
            print('NDCG mean :', torch.mean(ndcg))
            mse_loss = self.mse(score_pred, ndcg)
            mse_loss.backward()
            #             print('MSE Loss : ',mse_loss.detach())
            self.optim_critic.step()

    def mask(self, X, p=0.5):
        '''
        Generates a random (Bernoulli) mask of the same shape as X. p is the probability of each element being 1.
        Note that the elements of the mask are sampled independently.
        '''
        mask = torch.distributions.bernoulli.Bernoulli(p).sample(
            sample_shape=X.shape)
        X_masked = X * mask
        return mask, X_masked

    def evaluate(self, val_set):
        with torch.no_grad():
            ## Convert from Scipy sparse to Torch Tensor.
            xbatch = torch.tensor(val_set.toarray(), dtype=torch.float32)
            mask, xbatch_masked = self.mask(xbatch)
            xbatch_reverse_masked = xbatch * (
                1 - mask)  # Reverse Mask, 1 if not observed
            xlog, KL = self.actor.forward(
                xbatch_masked
            )  # Accepts a 'partial'(masked) interaction history.
            nll = -torch.mean(
                xlog * xbatch_reverse_masked,
                dim=1)  # We only care about predicting unobserved interactions
            score_pred = self.critic.forward(xbatch, nll,
                                             mask)  # Note that first argument
            # should be the original(unmasked) matrix.
            # Calculate NDCG@100.
            ndcg = NDCG_binary_at_k_batch(xlog.detach().numpy(),
                                          xbatch_reverse_masked, 100,
                                          xbatch_masked)
            ndcg = torch.tensor(ndcg.reshape(-1, 1), dtype=torch.float32)
            print('NDCG mean :', torch.mean(ndcg))
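
A sketch of how the three training stages above might be chained; `X_train` and `X_val` are SciPy sparse user-item matrices of shape (num_users, M), and every dimension and hyperparameter below is an illustrative assumption.

def run_ract(X_train, X_val, M):
    # Hidden sizes, critic input length, and schedules below are assumptions.
    model = RaCT(M, eh1=600, eh2=200, dh2=600, ci=3)

    # Stage 1: MLE pretraining of the actor (VAE) with beta annealing.
    model.pretrain_actor(X_train, batch_size=500, beta_max=0.2,
                         epochs=200, epochs_annealing=100,
                         val_set=X_val, masked=True)

    # Stage 2: pretrain the critic to regress NDCG@100.
    model.pretrain_critic(X_train, batch_size=500, epochs=50)

    # Stage 3: alternate actor and critic updates.
    model.alternative_training(X_train, batch_size=500, beta=0.2, epochs=50)
    return model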