Example no. 1
    def __init__(self, env, initial_act=30, gamma=0.98, tau=0.01, actor_lr=1e-4, critic_lr=1e-3, reward_scale=1., buffer_size=100, writer=None):

        self.env = env
        self.gamma = gamma
        self.tau = tau
        self.initial_act = initial_act
        self.reward_scale = reward_scale
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.buffer = PrioritizedReplayBuffer(buffer_size)
        self.actor = Actor(self.obs_dim, self.action_dim)
        self.target_actor = Actor(self.obs_dim, self.action_dim)
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.critic = Crtic(self.obs_dim, self.action_dim)
        self.target_critic = Crtic(self.obs_dim, self.action_dim)
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.actor = self.actor.to(self.device)
        self.critic = self.critic.to(self.device)
        self.target_actor = self.target_actor.to(self.device)
        self.target_critic = self.target_critic.to(self.device)

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

        self.criterion = nn.MSELoss(reduction='none')

        self.writer = writer
Example no. 2
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        if self.config.prioritized_replay:
            self.buffer = PrioritizedReplayBuffer(
                self.config.max_buff,
                alpha=self.config.prioritized_replay_alpha)
            prioritized_replay_beta_iters = self.config.prioritized_replay_beta_iters
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = self.config.frames
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.config.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.buffer = ReplayBuffer(self.config.max_buff)
            self.beta_schedule = None

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape,
                                   self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(),
                                lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()
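
Example no. 2 anneals the importance-sampling exponent beta with a LinearSchedule. A minimal sketch of such a schedule, assuming the constructor arguments used above plus a value(t) accessor (the accessor name is an assumption, not shown in the snippet):

class LinearSchedule:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps steps."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule already completed, clipped to [0, 1]
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

With the configuration above, beta would start at prioritized_replay_beta0 and reach 1.0 after prioritized_replay_beta_iters steps (or after config.frames steps when that option is left unset).
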
Example no. 3
    def __init__(self, state_size, action_size, seed, index=0, num_agents=2):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int):   Dimension of each state
            action_size (int):  Dimension of each action
            seed (int):         Random seed
            index (int):        Index assigned to the agent
            num_agents (int):   Number of agents in the environment
        """

        self.state_size = state_size  # State size
        self.action_size = action_size  # Action size
        self.seed = torch.manual_seed(seed)  # Random seed
        self.index = index  # Index of this agent, not used at the moment
        self.tau = TAU  # Parameter for soft weight update
        self.num_updates = N_UPDATES  # Number of updates to perform when updating
        self.num_agents = num_agents  # Number of agents in the environment
        self.tstep = 0  # Simulation step (modulo (%) UPDATE_EVERY)
        self.gamma = GAMMA  # Gamma for the reward discount
        self.alpha = ALPHA  # PER: degree of prioritization (0 = uniform sampling, 1 = full prioritization)

        # Set up actor and critic networks
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise
        self.noise = OUNoise((1, action_size), seed)

        # Replay buffer
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                              BATCH_SIZE, seed, self.alpha)
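
Example no. 3 draws exploration noise from OUNoise((1, action_size), seed) and later calls sample() and reset() on it (see Example no. 8). A minimal sketch of such an Ornstein-Uhlenbeck process, assuming that interface; the mu/theta/sigma defaults are assumptions, not values from the snippet:

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise centred on mu."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state
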
Example no. 4
    def __init__(self, state_size, action_size):
        # if you want to see Cartpole learning, then change to True
        self.render = False
        self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.memory_size = 20000
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 5000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.explore_step
        self.batch_size = 64
        self.train_start = 1000

        # create prioritized replay memory using SumTree
        self.memory = PrioritizedReplayBuffer(self.memory_size)

        # create main model and target model
        self.model = DQN(state_size, action_size)
        self.model.apply(self.weights_init)
        self.target_model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)

        # initialize target model
        self.update_target_model()

        if self.load_model:
            self.model = torch.load('save_model/cartpole_dqn')
Example no. 5
    def __init__(self,
                 env,
                 model,
                 target_model,
                 config,
                 name_agent="prioritized-dqn"):
        self.name_agent = name_agent

        self.dim_space = env.observation_space.shape[0]
        self.nb_actions = env.action_space.n

        self.epsilon = config.epsilon_start
        self.epsilon_final = config.epsilon_final
        self.epsilon_start = config.epsilon_start
        self.epsilon_decay = config.epsilon_decay

        self.gamma = config.gamma
        self.update_nb_iter = config.update_nb_iter

        # changing the buffer (taking a prioritized buffer
        # instead of a uniform-probability buffer)
        self.replay_buffer = PrioritizedReplayBuffer(10000, config.batch_size,
                                                     config.w,
                                                     config.beta_final,
                                                     config.beta_start,
                                                     config.beta_decay)
        self.environment = env
        self.batch_size = config.batch_size

        self.model = model
        self.target_model = target_model
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=config.learning_rate)

        self.loss_data = []
        self.rewards = []
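
Examples no. 5 and no. 9 keep epsilon_start, epsilon_final and epsilon_decay in the config; a common way to turn those three numbers into an exploration rate is an exponential decay over frames. A minimal sketch of that schedule (a standard choice, not taken from the snippet):

import math


def epsilon_by_frame(frame_idx, epsilon_start, epsilon_final, epsilon_decay):
    """Exponentially decay epsilon from epsilon_start towards epsilon_final."""
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)
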
Example no. 6
    def _create_buffer(self, buffer_type, action_size, buffer_size, batch_size,
                       alpha, beta, seed, device):
        if buffer_type == 'prioritized':
            self._update_buffer_priorities = True
            return PrioritizedReplayBuffer(action_size,
                                           buffer_size,
                                           batch_size,
                                           seed,
                                           alpha=alpha,
                                           beta=beta,
                                           device=device)
        elif buffer_type == 'sample':
            return ReplayBuffer(action_size,
                                buffer_size,
                                batch_size,
                                seed,
                                device=device)
        else:
            raise Exception(
                'Unknown buffer type - must be one of prioritized or sample')
Example no. 7
class DQNAgent():
    def __init__(self, state_size, action_size):
        # if you want to see Cartpole learning, then change to True
        self.render = False
        self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.memory_size = 20000
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 5000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.explore_step
        self.batch_size = 64
        self.train_start = 1000

        # create prioritized replay memory using SumTree
        self.memory = PrioritizedReplayBuffer(self.memory_size)

        # create main model and target model
        self.model = DQN(state_size, action_size)
        self.model.apply(self.weights_init)
        self.target_model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)

        # initialize target model
        self.update_target_model()

        if self.load_model:
            self.model = torch.load('save_model/cartpole_dqn')

    # weight xavier initialize
    def weights_init(self, m):
        classname = m.__class__.__name__
        if classname.find('Linear') != -1:
            torch.nn.init.xavier_uniform_(m.weight)

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = torch.from_numpy(state)
            state = Variable(state).float().cpu()
            q_value = self.model(state)
            _, action = torch.max(q_value, 1)
            return int(action)

    # save sample (error,<s,a,r,s'>) to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        target = self.model(Variable(torch.FloatTensor(state))).data
        old_val = target[0][action]
        target_val = self.target_model(Variable(
            torch.FloatTensor(next_state))).data
        if done:
            target[0][action] = reward
        else:
            target[0][
                action] = reward + self.discount_factor * torch.max(target_val)

        error = abs(old_val - target[0][action])

        self.memory.add(error, (state, action, reward, next_state, done))

    # pick samples from prioritized replay memory (with batch_size)
    def train_model(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)
        mini_batch = np.array(mini_batch, dtype=object).transpose()

        states = np.vstack(mini_batch[0])
        actions = list(mini_batch[1])
        rewards = list(mini_batch[2])
        next_states = np.vstack(mini_batch[3])
        dones = mini_batch[4]

        # bool to binary
        dones = dones.astype(int)

        # Q function of current state
        states = torch.Tensor(states)
        states = Variable(states).float()
        pred = self.model(states)

        # one-hot encoding
        a = torch.LongTensor(actions).view(-1, 1)

        one_hot_action = torch.FloatTensor(self.batch_size,
                                           self.action_size).zero_()
        one_hot_action.scatter_(1, a, 1)

        pred = torch.sum(pred.mul(Variable(one_hot_action)), dim=1)

        # Q function of next state
        next_states = torch.Tensor(next_states)
        next_states = Variable(next_states).float()
        next_pred = self.target_model(next_states).data

        rewards = torch.FloatTensor(rewards)
        dones = torch.FloatTensor(dones)

        # Q Learning: get maximum Q value at s' from target model
        target = rewards + (1 -
                            dones) * self.discount_factor * next_pred.max(1)[0]
        target = Variable(target)

        errors = torch.abs(pred - target).data.numpy()

        # update priority
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        self.optimizer.zero_grad()

        # Weighted MSE loss: apply the importance-sampling weights per sample
        loss = (torch.FloatTensor(is_weights) *
                F.mse_loss(pred, target, reduction='none')).mean()
        loss.backward()

        # and train
        self.optimizer.step()
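
Examples no. 4 and no. 7 push |TD error| values into a SumTree-backed buffer and get importance-sampling weights back from sample(). A minimal sketch of the proportional-prioritization math those calls imply; the EPS/ALPHA/BETA constants here are assumptions, not values from the snippets:

import numpy as np

# Assumed PER hyperparameters (not taken from the snippets above)
EPS = 0.01    # small constant so no transition has zero priority
ALPHA = 0.6   # how strongly the TD error shapes the sampling distribution
BETA = 0.4    # importance-sampling correction strength, annealed towards 1.0


def priority_from_error(td_error):
    """Proportional prioritization: p_i = (|delta_i| + eps)^alpha."""
    return (np.abs(td_error) + EPS) ** ALPHA


def importance_weights(sampled_priorities, total_priority, buffer_len, beta=BETA):
    """w_i = (N * P(i))^(-beta), normalized by the largest weight for stability."""
    probs = sampled_priorities / total_priority      # P(i) for each sampled transition
    weights = (buffer_len * probs) ** (-beta)
    return weights / weights.max()

The SumTree stores the p_i values so that drawing a sample proportional to priority and updating a single leaf after training are both O(log N) operations.
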
Example no. 8
class DDPG_Agent:
    def __init__(self, state_size, action_size, seed, index=0, num_agents=2):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int):   Dimension of each state
            action_size (int):  Dimension of each action
            seed (int):         Random seed
            index (int):        Index assigned to the agent
            num_agents (int):   Number of agents in the environment
        """

        self.state_size = state_size  # State size
        self.action_size = action_size  # Action size
        self.seed = torch.manual_seed(seed)  # Random seed
        self.index = index  # Index of this agent, not used at the moment
        self.tau = TAU  # Parameter for soft weight update
        self.num_updates = N_UPDATES  # Number of updates to perform when updating
        self.num_agents = num_agents  # Number of agents in the environment
        self.tstep = 0  # Simulation step (modulo (%) UPDATE_EVERY)
        self.gamma = GAMMA  # Gamma for the reward discount
        self.alpha = ALPHA  # PER: degree of prioritization (0 = uniform sampling, 1 = full prioritization)

        # Set up actor and critic networks
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise
        self.noise = OUNoise((1, action_size), seed)

        # Replay buffer
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                              BATCH_SIZE, seed, self.alpha)

    # act and act_targets similar to exercises and MADDPG Lab
    def act(self, states, noise=1.0):
        """Returns actions for given state as per current policy.
    
        Params
        ======
            state [n_agents, state_size]: current state
            noise (float):    control whether or not noise is added
        """
        # Convert the state (numpy array) to a float tensor on the device
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((1, self.action_size))

        # Put model into evaluation mode
        self.actor_local.eval()

        # Get actions for current state, transformed from probabilities
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()

        # Put actor back into training mode
        self.actor_local.train()

        # Ornstein-Uhlenbeck noise addition
        actions += noise * self.noise.sample()

        #  Transform probability into valid action ranges
        return np.clip(actions, -1, 1)

    def step(self, states, actions, rewards, next_states, dones, beta):
        """Save experience in replay memory, use random samples from buffer to learn.
        
        PARAMS
        ======
            states:     [n_agents, state_size]  current state
            actions:    [n_agents, action_size] taken action
            rewards:    [n_agents]              earned reward
            next_states:[n_agents, state_size]  next state
            dones:      [n_agents]              Whether episode has finished
            beta:       [0..1]                  PER: toggles correction for importance weights (0 - no corrections, 1 - full correction)
        """
        # ------------------------------------------------------------------
        # Save experience in replay memory - slightly more effort due to Prioritization
        # We need to calculate priorities for the experience tuple.
        # This is in our case (Q_expected - Q_target)**2
        # -----------------------------------------------------------------
        # Set all networks to evaluation mode
        self.actor_target.eval()
        self.critic_target.eval()
        self.critic_local.eval()

        state = torch.from_numpy(states).float().to(device)
        next_state = torch.from_numpy(next_states).float().to(device)
        action = torch.from_numpy(actions).float().to(device)
        #reward = torch.from_numpy(rewards).float().to(device)
        #done = torch.from_numpy(dones).float().to(device)

        with torch.no_grad():
            next_actions = self.actor_target(next_state)
            own_action = action[:, self.index *
                                self.action_size:(self.index + 1) *
                                self.action_size]
            if self.index:
                # Agent 1
                next_actions_agent = torch.cat((own_action, next_actions),
                                               dim=1)
            else:
                # Agent 0: flipped order
                next_actions_agent = torch.cat((next_actions, own_action),
                                               dim=1)

            # Predicted Q value from Critic target network
            Q_targets_next = self.critic_target(next_state,
                                                next_actions_agent).float()
            #print(f"Type Q_t_n: {type(Q_targets_next)}")
            #print(f"Type gamma: {type(self.gamma)}")
            #print(f"Type dones: {type(dones)}")
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
            Q_expected = self.critic_local(state, action)

        # Use error between Q_expected and Q_targets as priority in buffer
        error = (Q_expected - Q_targets)**2
        self.memory.add(state, action, rewards, next_state, dones, error)

        # Set all networks back to training mode
        self.actor_target.train()
        self.critic_target.train()
        self.critic_local.train()

        # ------------------------------------------------------------------
        # Usual learning procedure
        # -----------------------------------------------------------------
        # Learn every UPDATE_EVERY time steps
        self.tstep = (self.tstep + 1) % UPDATE_EVERY

        # If UPDATE_EVERY and enough samples are available in memory, get random subset and learn
        if self.tstep == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(self.num_updates):
                experiences = self.memory.sample(beta)
                self.learn(experiences)

    def reset(self):
        """Reset the noise parameter of the agent."""
        self.noise.reset()

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples. 
        Update according to 
            Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        
        According to the lessons: 
            actor_target  (state)           gives   action
            critic_target (state, action)   gives   Q-value

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of 
                    states          states visited
                    actions         actions taken by all agents
                    rewards         rewards received
                    next states     all next states
                    dones           whether or not a final state is reached 
                    weights         weights of the experiences
                    indices         indices of the experiences            
        """

        # Load experiences from sample
        states, actions, rewards, next_states, dones, weights_cur, indices = experiences

        # ------------------- update critic ------------------- #

        # Get next actions via actor network
        next_actions = self.actor_target(next_states)

        # Stack action together with action of the agent
        own_actions = actions[:,
                              self.index * self.action_size:(self.index + 1) *
                              self.action_size]
        if self.index:
            # Agent 1
            next_actions_agent = torch.cat((own_actions, next_actions), dim=1)
        else:
            # Agent 0: flipped order
            next_actions_agent = torch.cat((next_actions, own_actions), dim=1)

        # Predicted Q value from Critic target network
        Q_targets_next = self.critic_target(next_states, next_actions_agent)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        Q_expected = self.critic_local(states, actions)

        # Update priorities in ReplayBuffer
        loss = (Q_expected - Q_targets).pow(2).reshape(
            weights_cur.shape) * weights_cur
        self.memory.update(indices, loss.data.cpu().numpy())

        # Compute critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradients
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING)
        self.critic_optimizer.step()

        # ------------------- update actor ------------------- #
        actions_expected = self.actor_local(states)

        # Stack action together with action of the agent
        own_actions = actions[:,
                              self.index * self.action_size:(self.index + 1) *
                              self.action_size]
        if self.index:
            # Agent 1:
            actions_expected_agent = torch.cat((own_actions, actions_expected),
                                               dim=1)
        else:
            # Agent 0: flipped order
            actions_expected_agent = torch.cat((actions_expected, own_actions),
                                               dim=1)

        # Compute actor loss based on expectation from actions_expected
        actor_loss = -self.critic_local(states, actions_expected_agent).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.target_soft_update(self.critic_local, self.critic_target)
        self.target_soft_update(self.actor_local, self.actor_target)

    def target_soft_update(self, local_model, target_model):
        """Soft update model parameters for actor and critic of all MADDPG agents.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def save(self, filename):
        """Saves the agent to the local workplace

        Params
        ======
            filename (string): where to save the weights
        """

        checkpoint = {
            'input_size':
            self.state_size,
            'output_size':
            self.action_size,
            'actor_hidden_layers': [
                each.out_features for each in self.actor_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'actor_state_dict':
            self.actor_local.state_dict(),
            'critic_hidden_layers': [
                each.out_features for each in self.critic_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'critic_state_dict':
            self.critic_local.state_dict()
        }

        torch.save(checkpoint, filename)

    def load_weights(self, filename):
        """ Load weights to update agent's actor and critic networks.
        Expected is a format like the one produced by self.save()

        Params
        ======
            filename (string): where to load data from. 
        """
        checkpoint = torch.load(filename)
        if not checkpoint['input_size'] == self.state_size:
            print(
                f"Error when loading weights from checkpoint {filename}: input size {checkpoint['input_size']} doesn't match state size of agent {self.state_size}"
            )
            return None
        if not checkpoint['output_size'] == self.action_size:
            print(
                f"Error when loading weights from checkpoint {filename}: output size {checkpoint['output_size']} doesn't match action space size of agent {self.action_size}"
            )
            return None
        my_actor_hidden_layers = [
            each.out_features for each in self.actor_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['actor_hidden_layers'] == my_actor_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: actor hidden layers {checkpoint['actor_hidden_layers']} don't match agent's actor hidden layers {my_actor_hidden_layers}"
            )
            return None
        my_critic_hidden_layers = [
            each.out_features for each in self.critic_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['critic_hidden_layers'] == my_critic_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: critic hidden layers {checkpoint['critic_hidden_layers']} don't match agent's critic hidden layers {my_critic_hidden_layers}"
            )
            return None
        self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
        self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
Example no. 9
class Prioritized(DQN):
    def __init__(self,
                 env,
                 model,
                 target_model,
                 config,
                 name_agent="prioritized-dqn"):
        self.name_agent = name_agent

        self.dim_space = env.observation_space.shape[0]
        self.nb_actions = env.action_space.n

        self.epsilon = config.epsilon_start
        self.epsilon_final = config.epsilon_final
        self.epsilon_start = config.epsilon_start
        self.epsilon_decay = config.epsilon_decay

        self.gamma = config.gamma
        self.update_nb_iter = config.update_nb_iter

        # changing the buffer (taking a prioritized buffer
        # instead of a uniform-probability buffer)
        self.replay_buffer = PrioritizedReplayBuffer(10000, config.batch_size,
                                                     config.w,
                                                     config.beta_final,
                                                     config.beta_start,
                                                     config.beta_decay)
        self.environment = env
        self.batch_size = config.batch_size

        self.model = model
        self.target_model = target_model
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=config.learning_rate)

        self.loss_data = []
        self.rewards = []

    def loss(self):
        """ 
            the loss is equal to:
                    Rt+1+γt+1qθ(St+1,argmax qθ(St+1,a′))−qθ(St,At))^2
        """
        states, actions, rewards, next_states, finish, indices, weight = self.replay_buffer.sample(
        )
        actions = actions.long()

        # qθ(St,At)
        q0 = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # max_a' qθ(St+1, a') from the online network, zeroed for terminal states
        max_next_q0 = self.model(next_states).max(1)[0] * (1 - finish)

        Rt_gamma_max = (rewards + self.gamma * max_next_q0)

        loss = (q0 - Rt_gamma_max).pow(2) * weight

        # update the priority of the buffer
        self.replay_buffer.add_p(indices, loss.detach().numpy())

        loss = loss.sum()

        return loss
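
The docstring in Example no. 9 writes the target as q(S_{t+1}, argmax_{a'} q(S_{t+1}, a')), which is the Double-DQN form, while the code above takes the max over the online network only. A minimal sketch of the Double-DQN variant of that target, assuming the same model/target_model attributes and tensor shapes as in loss() (a sketch, not the author's code):

        # Double-DQN target (sketch): the online network selects a', the target network evaluates it
        with torch.no_grad():
            next_a = self.model(next_states).argmax(dim=1, keepdim=True)        # argmax_a' q_theta(S', a')
            max_next_q0 = self.target_model(next_states).gather(1, next_a)      # q_theta_bar(S', a')
            max_next_q0 = max_next_q0.squeeze(1) * (1 - finish)                 # zero for terminal states
        Rt_gamma_max = rewards + self.gamma * max_next_q0
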
Example no. 10
import MahjongPy
from naiveAI import AgentNaive, NMnaive
import tensorflow as tf
import numpy as np
from copy import deepcopy
from buffer import PrioritizedReplayBuffer

sess = tf.InteractiveSession()

if __name__ == '__main__':

    nn = NMnaive(sess)
    env = EnvMahjong()

    # before training starts, create the replay buffer and the agent.
    memory = PrioritizedReplayBuffer(state_dim=34 * 4, action_dim=34)
    agent = AgentNaive(nn, memory)
    n_games = 2

    for n in range(n_games):
        done = 0
        this_state = env.reset()
        step = 0
        while not done and step < 10000:
            next_aval_states = env.get_aval_actions()
            action, policy = agent.select(next_aval_states)
            next_state, score, done, info = env.step(action)
            agent.remember(this_state, action, next_state, score, done,
                           next_aval_states, policy)
            agent.learn()
            this_state = deepcopy(next_state)
Example no. 11
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 lr_decay=9999e-4,
                 double_dqn=False,
                 dueling_network=False,
                 prioritized_replay=False):
        """ Initialize an Agent instance.
        
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            lr_decay (float): Multiplicative factor of learning rate decay
            double_dqn (bool): Toggle for using the Double-DQN method
            dueling_network (bool): Toggle for using the Dueling Network (DN) method
            prioritized_replay (bool): Toggle for using the Prioritized Replay method
        """

        # Set the parameters.
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn
        self.dueling_network = dueling_network
        self.prioritized_replay = prioritized_replay

        # Q-Network hidden layers.
        hidden_layers = [128, 32]

        # Use the Dueling Network (DN) method.
        if self.dueling_network:

            # DN requires a hidden state value.
            hidden_state_value = [64, 32]

            self.qnetwork_local = DuelingQNetwork(
                state_size, action_size, seed, hidden_layers,
                hidden_state_value).to(device)
            self.qnetwork_target = DuelingQNetwork(
                state_size, action_size, seed, hidden_layers,
                hidden_state_value).to(device)
            self.qnetwork_target.eval()

        else:  # Use the Deep Q-Network (DQN) method.

            self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                           hidden_layers).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                            hidden_layers).to(device)
            self.qnetwork_target.eval()

        # Optimize using Adam.
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=LEARNING_RATE)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(
            self.optimizer, lr_decay)

        # Use the Prioritized Replay memory buffer if enabled.
        if self.prioritized_replay:

            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  device,
                                                  alpha=0.6,
                                                  beta=0.4,
                                                  beta_scheduler=1.0)

        else:  # Use the Replay memory buffer instead.
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed, device)

        # Initialize the time step (until the THRESHOLD is reached).
        self.t_step = 0
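
Example no. 11 can swap in a DuelingQNetwork; the defining piece of that architecture is a head that splits into a state-value stream V(s) and an advantage stream A(s, a) and recombines them. A minimal sketch of that recombination (class name and layer sizes are assumptions, not the snippet's DuelingQNetwork):

import torch
import torch.nn as nn


class DuelingHead(nn.Module):
    """Dueling recombination: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)."""

    def __init__(self, feature_dim, action_size):
        super().__init__()
        self.value_stream = nn.Linear(feature_dim, 1)
        self.advantage_stream = nn.Linear(feature_dim, action_size)

    def forward(self, features):
        value = self.value_stream(features)            # V(s): shape (batch, 1)
        advantage = self.advantage_stream(features)    # A(s, a): shape (batch, actions)
        # Subtract the mean advantage so the V/A decomposition is identifiable
        return value + advantage - advantage.mean(dim=1, keepdim=True)
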
Example no. 12
class RDPG:
    def __init__(self, env, initial_act=30, gamma=0.98, tau=0.01, actor_lr=1e-4, critic_lr=1e-3, reward_scale=1., buffer_size=100, writer=None):

        self.env = env
        self.gamma = gamma
        self.tau = tau
        self.initial_act = initial_act
        self.reward_scale = reward_scale
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.buffer = PrioritizedReplayBuffer(buffer_size)
        self.actor = Actor(self.obs_dim, self.action_dim)
        self.target_actor = Actor(self.obs_dim, self.action_dim)
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.critic = Crtic(self.obs_dim, self.action_dim)
        self.target_critic = Crtic(self.obs_dim, self.action_dim)
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.actor = self.actor.to(self.device)
        self.critic = self.critic.to(self.device)
        self.target_actor = self.target_actor.to(self.device)
        self.target_critic = self.target_critic.to(self.device)

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

        self.criterion = nn.MSELoss(reduction='none')

        self.writer = writer

    def store_episode(self, episode):
        self.buffer.add(episode)

    def get_action(self, obs, action, hidden_in, epoch, train=False):
        history = torch.cat([torch.FloatTensor(obs), torch.FloatTensor(action)]).to(torch.float).reshape(1, 1, self.obs_dim+self.action_dim).to(self.device)
        action, hidden_out = self.actor(history, hidden_in)
        if not train:
            return action[0, 0].detach().cpu().numpy(), hidden_out
        action = action[0, 0].detach().cpu().numpy() + np.random.normal(0, 0.1)
        return np.clip(action, -1, 1), hidden_out

    def soft_update(self, target_net, net):
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(
                self.tau * param.data + (1 - self.tau) * target_param.data
            )

    def update(self, epoch, batch_size=10, beta=0.4):
        if len(self.buffer) < batch_size:
            return 
        batch, indices, weights = self.buffer.replay(batch_size=batch_size, beta=beta)
        indices = indices.to(self.device)
        weights = weights.to(self.device)

        obs_batch, action_batch, reward_batch, done_batch = [], [], [], []
        for episode in batch:
            obs_batch.append(episode[0])
            action_batch.append(episode[1])
            reward_batch.append(episode[2])
            done_batch.append(episode[3])

        obs_tensor = torch.cat(obs_batch).reshape(batch_size, *obs_batch[0].shape[1:]).to(self.device)  # Shape(batch_size, episode_length+1, 3)
        next_obs_tensor = obs_tensor[:, 1:, :]  # Shape(batch_size, episode_length, 3)
        obs_tensor = obs_tensor[:, :-1, :]  # Shape(batch_size, episode_length, 3)
        action_tensor = torch.FloatTensor(action_batch).to(self.device)  # Shape(batch_size, episode_length, 1)
        next_action_tensor = action_tensor[:, 1:, :]
        action_tensor = action_tensor[:, :-1, :]
        reward_tensor = torch.FloatTensor(reward_batch).unsqueeze(dim=-1).to(self.device)  # Shape(batch_size, episode_length, 1)
        done_tensor = torch.FloatTensor(done_batch).unsqueeze(dim=-1).to(self.device)  # Shape(batch_size, episode_length, 1)

        hidden = (torch.randn(1, batch_size, 64).to(self.device),
                  torch.randn(1, batch_size, 64).to(self.device))  # Shape(1, batch_size, hidden_size)

        with torch.no_grad():
            target_action, _ = self.target_actor(torch.cat([next_obs_tensor, next_action_tensor], dim=2), hidden)  # Shape(batch_size, episode_length, 1)
            target_q, _ = self.target_critic(torch.cat([next_obs_tensor, target_action], dim=2), hidden)  # Shape(batch_size, episode_length, 1)
            y = reward_tensor * self.reward_scale + done_tensor * self.gamma * target_q  # Shape(batch_size, episode_length, 1)

        q_values, _ = self.critic(torch.cat([obs_tensor, action_tensor], dim=2), hidden)
        critic_loss = (weights * self.criterion(q_values, y).mean(1).squeeze()).mean()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        action, _ = self.actor(torch.cat([obs_tensor, action_tensor], dim=2), hidden)
        actor_loss = -(weights * self.critic(torch.cat([obs_tensor, action], dim=2), hidden)[0].mean(1).squeeze()).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.target_critic, self.critic)
        self.soft_update(self.target_actor, self.actor)

        # Update the priorities of the episodes that were just replayed
        self.buffer.update_priority(indices.cpu(), (y.mean(1).squeeze() - q_values.mean(1).squeeze()).abs().detach().cpu().numpy())

        if self.writer:
            self.writer.add_scalar("Train/ActorLoss", actor_loss.item(), epoch)
            self.writer.add_scalar("Train/CriticLoss", critic_loss.item(), epoch)