Example #1
    def __init__(self,
                 action_size=2,
                 seed=0,
                 load_file=None,
                 n_agents=2,
                 buffer_size=int(3e4),
                 batch_size=128,
                 gamma=0.99,
                 update_every=2,
                 noise_start=1.0,
                 noise_decay=1.0,
                 evaluation_only=False):
        """
        Params
        ======
            action_size (int): dimension of each action
            seed (int): Random seed
            load_file (str): path of checkpoint file to load
            n_agents (int): number of distinct agents
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            noise_start (float): initial noise weighting factor
            noise_decay (float): noise decay rate
            update_every (int): how often to update the network
            evaluation_only (bool): set to True to disable updating gradients and adding noise
        """

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.n_agents = n_agents
        self.noise_weight = noise_start
        self.noise_decay = noise_decay
        self.t_step = 0
        self.evaluation_only = evaluation_only
        # create two agents, each with their own actor and critic
        models = [model.LowDim2x(n_agents=n_agents) for _ in range(n_agents)]
        self.agents = [
            DDPG(0, models[0], load_file=None),
            DDPG(1, models[1], load_file=None)
        ]
        # create shared replay buffer
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)
        if load_file:
            for i, agent in enumerate(self.agents):
                actor_file = torch.load(load_file + '.' + str(i) + '.actor.pth',
                                        map_location='cpu')
                critic_file = torch.load(load_file + '.' + str(i) + '.critic.pth',
                                         map_location='cpu')
                agent.actor_local.load_state_dict(actor_file)
                agent.actor_target.load_state_dict(actor_file)
                agent.critic_local.load_state_dict(critic_file)
                agent.critic_target.load_state_dict(critic_file)
            print('Loaded: {}.actor.pth'.format(load_file))
            print('Loaded: {}.critic.pth'.format(load_file))
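A minimal construction sketch for the meta agent above. The maddpg_agent module name and the checkpoint prefix are assumptions for illustration; the constructor itself appends '.<agent index>.actor.pth' and '.<agent index>.critic.pth' to load_file.

# hypothetical usage -- module path and checkpoint prefix are assumptions
from maddpg_agent import Agent

# train from scratch
agent = Agent(action_size=2, n_agents=2, seed=0)

# or evaluate from a saved checkpoint prefix
eval_agent = Agent(action_size=2, n_agents=2,
                   load_file='checkpoints/episode-1500',
                   evaluation_only=True)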
Example #2
    def __init__(self,
                 model,
                 action_size,
                 seed=0,
                 load_file=None,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 use_double_dqn=True,
                 use_prioritized_experience_replay=False,
                 alpha_start=0.5,
                 alpha_decay=0.9992,
                 action_map=None):
        """
        Params
        ======
            model: model object
            action_size (int): dimension of each action
            seed (int): Random seed
            load_file (str): path of checkpoint file to load
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            use_double_dqn (bool): whether to use the double DQN algorithm
            use_prioritized_experience_replay (bool): whether to use the PER algorithm
            alpha_start (float): initial value for alpha, used in PER
            alpha_decay (float): decay rate for alpha, used in PER
            action_map (dict): maps action indices from the model output to the gym environment's actions
        """
        random.seed(seed)

        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.use_double_dqn = use_double_dqn
        self.use_prioritized_experience_replay = use_prioritized_experience_replay

        self.loss_list = []  # track loss across steps
        self.entropy_list = []  # track entropy across steps

        # Q-Network
        self.qnetwork_local = model.local
        self.qnetwork_target = model.target

        # DEBUG weight initialization
        #print(self.qnetwork_local.fc_s.weight.data[0])
        #print(self.qnetwork_target.fc_s.weight.data[0])
        #self.qnetwork_local.fc_s.weight.data[0] = torch.tensor([0.0, 0.0, 0.0, 0.0])
        #print(self.qnetwork_local.fc_s.weight.data[0])
        #print(self.qnetwork_target.fc_s.weight.data[0])
        #input('->')

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        #self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=.00025, momentum=0.95)

        # Replay memory
        if use_prioritized_experience_replay:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  self.buffer_size,
                                                  self.batch_size, seed)
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, seed)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0
        # initialize alpha (used in the prioritized experience sampling probability)
        self.alpha_start = alpha_start
        self.alpha_decay = alpha_decay
        self.alpha = self.alpha_start

        if load_file:
            self.qnetwork_local.load_state_dict(torch.load(load_file + '.pth'))
            self.qnetwork_target.load_state_dict(torch.load(load_file +
                                                            '.pth'))
            #self.memory = dill.load(open(load_file + '.buffer.pck','rb'))
            print('Loaded: {}'.format(load_file))

        self.action_map = action_map
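The constructor above only requires that the model argument expose .local and .target networks. A minimal sketch of such a wrapper is shown below; the QNetwork architecture and layer sizes are assumptions, not the project's actual model.

# hypothetical model wrapper -- QNetwork and its sizes are illustrative
import torch.nn as nn


class QNetwork(nn.Module):
    def __init__(self, state_size=4, action_size=2, fc_units=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_size, fc_units), nn.ReLU(),
                                 nn.Linear(fc_units, action_size))

    def forward(self, state):
        return self.net(state)


class DQNModel:
    """Holds the online (local) and target Q-networks expected by Agent."""
    def __init__(self, state_size=4, action_size=2):
        self.local = QNetwork(state_size, action_size)
        self.target = QNetwork(state_size, action_size)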
Example #3
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 model,
                 action_size,
                 seed=0,
                 load_file=None,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 use_double_dqn=True,
                 use_prioritized_experience_replay=False,
                 alpha_start=0.5,
                 alpha_decay=0.9992,
                 action_map=None):
        """
        Params
        ======
            model: model object
            action_size (int): dimension of each action
            seed (int): Random seed
            load_file (str): path of checkpoint file to load
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            use_double_dqn (bool): whether to use the double DQN algorithm
            use_prioritized_experience_replay (bool): whether to use the PER algorithm
            alpha_start (float): initial value for alpha, used in PER
            alpha_decay (float): decay rate for alpha, used in PER
            action_map (dict): maps action indices from the model output to the gym environment's actions
        """
        random.seed(seed)

        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.use_double_dqn = use_double_dqn
        self.use_prioritized_experience_replay = use_prioritized_experience_replay

        self.loss_list = []  # track loss across steps
        self.entropy_list = []  # track entropy across steps

        # Q-Network
        self.qnetwork_local = model.local
        self.qnetwork_target = model.target

        # DEBUG weight initialization
        #print(self.qnetwork_local.fc_s.weight.data[0])
        #print(self.qnetwork_target.fc_s.weight.data[0])
        #self.qnetwork_local.fc_s.weight.data[0] = torch.tensor([0.0, 0.0, 0.0, 0.0])
        #print(self.qnetwork_local.fc_s.weight.data[0])
        #print(self.qnetwork_target.fc_s.weight.data[0])
        #input('->')

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        #self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=.00025, momentum=0.95)

        # Replay memory
        if use_prioritized_experience_replay:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  self.buffer_size,
                                                  self.batch_size, seed)
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, seed)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0
        # initialize alpha (used in the prioritized experience sampling probability)
        self.alpha_start = alpha_start
        self.alpha_decay = alpha_decay
        self.alpha = self.alpha_start

        if load_file:
            self.qnetwork_local.load_state_dict(torch.load(load_file + '.pth'))
            self.qnetwork_target.load_state_dict(torch.load(load_file +
                                                            '.pth'))
            #self.memory = dill.load(open(load_file + '.buffer.pck','rb'))
            print('Loaded: {}'.format(load_file))

        self.action_map = action_map

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        if self.use_prioritized_experience_replay:
            priority = 100.0  # set initial priority to max value
            self.memory.add(state, action, reward, next_state, done, priority)
        else:
            self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                # if prioritized experience replay is enabled
                if self.use_prioritized_experience_replay:
                    self.memory.sort()
                    indexes, experiences = self.memory.sample(self.alpha)
                    self.learn(experiences, self.gamma, indexes)
                    self.alpha = self.alpha_decay * self.alpha
                else:
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # reshape 1-D states into 2-D (as expected by the model)
        if len(state.shape) == 1:
            state = np.expand_dims(state, axis=0)
        state = torch.from_numpy(state).float().to(device)
        # calculate action values
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, indexes=None):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        if self.use_prioritized_experience_replay:
            states, actions, rewards, next_states, dones, priorities = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        # DEBUG replay memory
        #print('learning:')
        #show_frames(states)
        #show_frames(next_states)

        # Select double DQN or regular DQN
        if self.use_double_dqn:
            # get greedy actions (for next states) from local model
            q_local_argmax = self.qnetwork_local(next_states).detach().argmax(
                dim=1).unsqueeze(1)
            # get predicted q values (for next states) from target model indexed by q_local_argmax
            q_targets_next = self.qnetwork_target(next_states).gather(
                1, q_local_argmax).detach()
        else:
            # get max predicted q values (for next states) from target model
            q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # get q values from local model
        q_local = self.qnetwork_local(states)
        # get q values for chosen action
        predictions = q_local.gather(1, actions)
        # calculate td targets
        targets = rewards + (gamma * q_targets_next * (1 - dones))

        # calculate new priorities
        if self.use_prioritized_experience_replay:
            with torch.no_grad():
                new_priorities = torch.abs(targets - predictions).to(device)
                self.memory.batch_update(indexes,
                                         (states, actions, rewards,
                                          next_states, dones, new_priorities))

        # calculate loss using mean squared error: (targets - predictions).pow(2).mean()
        loss = F.mse_loss(predictions, targets)
        # minimize loss
        self.optimizer.zero_grad()
        loss.backward()
        # clip gradients
        #for param in self.qnetwork_local.parameters():
        #    param.grad.data.clamp_(-10, 10)
        self.optimizer.step()

        # update stats
        with torch.no_grad():
            self.loss_list.append(loss.item())
            # calculate sparse softmax cross entropy between Q values and chosen actions
            self.entropy_list.append(
                F.cross_entropy(q_local, actions.squeeze(1)).item())

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
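A hypothetical training loop for the Agent class above, using the DQNModel wrapper sketched earlier. The CartPole-v1 environment, the classic gym step API, and the epsilon schedule are assumptions for illustration only.

# hypothetical training loop -- environment and hyperparameters are assumptions
import gym
import numpy as np

env = gym.make('CartPole-v1')
agent = Agent(model=DQNModel(state_size=4, action_size=2), action_size=2)

eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(np.asarray(state, dtype=np.float32), eps)
        next_state, reward, done, _ = env.step(int(action))
        agent.step(state, action, reward, next_state, done)  # store and (maybe) learn
        state = next_state
    eps = max(0.01, eps * 0.995)  # decay epsilon toward greedy behavior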
Example #4
class Agent():
    """Meta agent that contains the two DDPG agents and shared replay buffer."""
    def __init__(self,
                 action_size=2,
                 seed=0,
                 load_file=None,
                 n_agents=2,
                 buffer_size=int(3e4),
                 batch_size=128,
                 gamma=0.99,
                 update_every=2,
                 noise_start=1.0,
                 noise_decay=1.0,
                 evaluation_only=False):
        """
        Params
        ======
            action_size (int): dimension of each action
            seed (int): Random seed
            load_file (str): path of checkpoint file to load
            n_agents (int): number of distinct agents
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            noise_start (float): initial noise weighting factor
            noise_decay (float): noise decay rate
            update_every (int): how often to update the network
            evaluation_only (bool): set to True to disable updating gradients and adding noise
        """

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.n_agents = n_agents
        self.noise_weight = noise_start
        self.noise_decay = noise_decay
        self.t_step = 0
        self.evaluation_only = evaluation_only
        # create two agents, each with their own actor and critic
        models = [model.LowDim2x(n_agents=n_agents) for _ in range(n_agents)]
        self.agents = [
            DDPG(0, models[0], load_file=None),
            DDPG(1, models[1], load_file=None)
        ]
        # create shared replay buffer
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)
        if load_file:
            for i, agent in enumerate(self.agents):
                actor_file = torch.load(load_file + '.' + str(i) + '.actor.pth',
                                        map_location='cpu')
                critic_file = torch.load(load_file + '.' + str(i) + '.critic.pth',
                                         map_location='cpu')
                agent.actor_local.load_state_dict(actor_file)
                agent.actor_target.load_state_dict(actor_file)
                agent.critic_local.load_state_dict(critic_file)
                agent.critic_target.load_state_dict(critic_file)
            print('Loaded: {}.actor.pth'.format(load_file))
            print('Loaded: {}.critic.pth'.format(load_file))

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        # reshape the 2x24 state matrices into 1x48 vectors before storing
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)
        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0 and not self.evaluation_only:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                # each agent does its own sampling from the replay buffer
                experiences = [
                    self.memory.sample() for _ in range(self.n_agents)
                ]
                self.learn(experiences, self.gamma)

    def act(self, all_states, add_noise=True):
        # pass each agent's state from the environment and calculate its action
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state,
                               noise_weight=self.noise_weight,
                               add_noise=add_noise)
            self.noise_weight *= self.noise_decay
            all_actions.append(action)
        # reshape the 2x2 action matrix into a 1x4 vector
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        # each agent uses its own target actor to calculate next_actions
        all_next_actions = []
        for i, agent in enumerate(self.agents):
            _, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            next_state = next_states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)
        # each agent uses its own local actor to calculate actions
        all_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, _, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)
        # each agent learns from its own experience sample
        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)
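An illustrative single interaction step for the meta agent above, using random placeholder data in place of a real two-player environment. The 2x24 observation shape mirrors the reshape(-1, 2, 24) used in learn(); the placeholder values themselves are assumptions.

# hypothetical single step with placeholder data -- shapes mirror the code above
import numpy as np

agent = Agent(action_size=2, n_agents=2)

all_states = np.random.randn(2, 24).astype(np.float32)   # one 24-dim observation per agent
all_actions = agent.act(all_states)                      # (1, 4) after the internal reshape
all_rewards = [0.0, 0.1]
all_next_states = np.random.randn(2, 24).astype(np.float32)
all_dones = [False, False]
agent.step(all_states, all_actions, all_rewards, all_next_states, all_dones)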
Example #5
    def __init__(self,
                 model,
                 action_size,
                 seed=0,
                 load_file=None,
                 n_agents=1,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0.0001,
                 clip_gradients=False,
                 theta=0.15,
                 sigma=0.2,
                 update_every=1,
                 use_prioritized_experience_replay=False,
                 alpha_start=0.5,
                 alpha_decay=0.9992,
                 evaluation_only=False):
        """
        Params
        ======
            model: model object
            action_size (int): dimension of each action
            seed (int): Random seed
            load_file (str): path of checkpoint file to load
            n_agents (int): number of agents to train simultaneously
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate for actor
            lr_critic (float): learning rate for critic
            weight_decay (float): L2 weight decay
            clip_gradients (bool): whether to clip gradients on both actor and critic
            theta (float): OU noise parameter
            sigma (float): OU noise parameter
            update_every (int): how often to update the network
            use_prioritized_experience_replay (bool): whether to use the PER algorithm
            alpha_start (float): initial value for alpha, used in PER
            alpha_decay (float): decay rate for alpha, used in PER
            evaluation_only (bool): set to True to disable updating gradients and adding noise
        """
        random.seed(seed)

        self.action_size = action_size
        self.n_agents = n_agents
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        self.clip_gradients = clip_gradients
        self.evaluation_only = evaluation_only

        self.loss_list = []  # track loss across steps

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # DEBUG weight initialization
        #print(self.actor_local.fcs1.weight.data[0])
        #print(self.actor_target.fcs1.weight.data[0])
        #print(self.critic_local.fcs1.weight.data[0])
        #print(self.critic_target.fcs1.weight.data[0])
        #input('->')

        # Noise process
        self.noise = OUNoise((n_agents, action_size),
                             seed,
                             theta=theta,
                             sigma=sigma)

        # Replay memory
        if use_prioritized_experience_replay:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  self.buffer_size,
                                                  self.batch_size, seed)
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, seed)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0
        # initialize alpha (used in the prioritized experience sampling probability)
        self.alpha_start = alpha_start
        self.alpha_decay = alpha_decay
        self.alpha = self.alpha_start

        if load_file:
            if device.type == 'cpu':
                self.actor_local.load_state_dict(
                    torch.load(load_file + '.actor.pth', map_location='cpu'))
                self.actor_target.load_state_dict(
                    torch.load(load_file + '.actor.pth', map_location='cpu'))
                self.critic_local.load_state_dict(
                    torch.load(load_file + '.critic.pth', map_location='cpu'))
                self.critic_target.load_state_dict(
                    torch.load(load_file + '.critic.pth', map_location='cpu'))
                #self.memory = dill.load(open(load_file + '.buffer.pck','rb'))
            elif device.type == 'cuda':  # device.type is 'cuda' even when the device is 'cuda:0'
                self.actor_local.load_state_dict(
                    torch.load(load_file + '.actor.pth'))
                self.actor_target.load_state_dict(
                    torch.load(load_file + '.actor.pth'))
                self.critic_local.load_state_dict(
                    torch.load(load_file + '.critic.pth'))
                self.critic_target.load_state_dict(
                    torch.load(load_file + '.critic.pth'))
                #self.memory = dill.load(open(load_file + '.buffer.pck','rb'))
            print('Loaded: {}'.format(load_file))
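The constructor above expects the model argument to expose actor_local, actor_target, critic_local, and critic_target networks. A minimal sketch of such a wrapper follows; the Actor/Critic architectures and the 33/4 state and action sizes are illustrative assumptions.

# hypothetical actor-critic wrapper -- architectures and sizes are illustrative
import torch
import torch.nn as nn


class Actor(nn.Module):
    def __init__(self, state_size=33, action_size=4, fc_units=128):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_size, fc_units), nn.ReLU(),
                                 nn.Linear(fc_units, action_size), nn.Tanh())

    def forward(self, state):
        return self.net(state)


class Critic(nn.Module):
    def __init__(self, state_size=33, action_size=4, fc_units=128):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_size + action_size, fc_units),
                                 nn.ReLU(), nn.Linear(fc_units, 1))

    def forward(self, state, action):
        return self.net(torch.cat((state, action), dim=1))


class DDPGModel:
    """Holds the four networks expected by the DDPG Agent."""
    def __init__(self, state_size=33, action_size=4):
        self.actor_local = Actor(state_size, action_size)
        self.actor_target = Actor(state_size, action_size)
        self.critic_local = Critic(state_size, action_size)
        self.critic_target = Critic(state_size, action_size)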
Example #6
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 model,
                 action_size,
                 seed=0,
                 load_file=None,
                 n_agents=1,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0.0001,
                 clip_gradients=False,
                 theta=0.15,
                 sigma=0.2,
                 update_every=1,
                 use_prioritized_experience_replay=False,
                 alpha_start=0.5,
                 alpha_decay=0.9992,
                 evaluation_only=False):
        """
        Params
        ======
            model: model object
            action_size (int): dimension of each action
            seed (int): Random seed
            load_file (str): path of checkpoint file to load
            n_agents (int): number of agents to train simultaneously
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate for actor
            lr_critic (float): learning rate for critic
            weight_decay (float): L2 weight decay
            clip_gradients (bool): whether to clip gradients on both actor and critic
            theta (float): OU noise parameter
            sigma (float): OU noise parameter
            update_every (int): how often to update the network
            use_prioritized_experience_replay (bool): whether to use the PER algorithm
            alpha_start (float): initial value for alpha, used in PER
            alpha_decay (float): decay rate for alpha, used in PER
            evaluation_only (bool): set to True to disable updating gradients and adding noise
        """
        random.seed(seed)

        self.action_size = action_size
        self.n_agents = n_agents
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        self.clip_gradients = clip_gradients
        self.evaluation_only = evaluation_only

        self.loss_list = []  # track loss across steps

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # DEBUG weight initialization
        #print(self.actor_local.fcs1.weight.data[0])
        #print(self.actor_target.fcs1.weight.data[0])
        #print(self.critic_local.fcs1.weight.data[0])
        #print(self.critic_target.fcs1.weight.data[0])
        #input('->')

        # Noise process
        self.noise = OUNoise((n_agents, action_size),
                             seed,
                             theta=theta,
                             sigma=sigma)

        # Replay memory
        if use_prioritized_experience_replay:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  self.buffer_size,
                                                  self.batch_size, seed)
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, seed)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0
        # initialize alpha (used in the prioritized experience sampling probability)
        self.alpha_start = alpha_start
        self.alpha_decay = alpha_decay
        self.alpha = self.alpha_start

        if load_file:
            if device.type == 'cpu':
                self.actor_local.load_state_dict(
                    torch.load(load_file + '.actor.pth', map_location='cpu'))
                self.actor_target.load_state_dict(
                    torch.load(load_file + '.actor.pth', map_location='cpu'))
                self.critic_local.load_state_dict(
                    torch.load(load_file + '.critic.pth', map_location='cpu'))
                self.critic_target.load_state_dict(
                    torch.load(load_file + '.critic.pth', map_location='cpu'))
                #self.memory = dill.load(open(load_file + '.buffer.pck','rb'))
            elif device.type == 'cuda':  # device.type is 'cuda' even when the device is 'cuda:0'
                self.actor_local.load_state_dict(
                    torch.load(load_file + '.actor.pth'))
                self.actor_target.load_state_dict(
                    torch.load(load_file + '.actor.pth'))
                self.critic_local.load_state_dict(
                    torch.load(load_file + '.critic.pth'))
                self.critic_target.load_state_dict(
                    torch.load(load_file + '.critic.pth'))
                #self.memory = dill.load(open(load_file + '.buffer.pck','rb'))
            print('Loaded: {}'.format(load_file))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        if self.use_prioritized_experience_replay:
            priority = 100.0  # set initial priority to max value
            if self.n_agents == 1:
                self.memory.add(state, action, reward, next_state, done,
                                priority)
            else:
                for i in range(self.n_agents):
                    self.memory.add(state[i, :], action[i, :], reward[i],
                                    next_state[i, :], done[i], priority)
        else:
            if self.n_agents == 1:
                self.memory.add(state, action, reward, next_state, done)
            else:
                for i in range(self.n_agents):
                    self.memory.add(state[i, :], action[i, :], reward[i],
                                    next_state[i, :], done[i])

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0 and not self.evaluation_only:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                # if prioritized experience replay is enabled
                if self.use_prioritized_experience_replay:
                    self.memory.sort()
                    indexes, experiences = self.memory.sample(self.alpha)
                    self.learn(experiences, self.gamma, indexes)
                    self.alpha = self.alpha_decay * self.alpha
                else:
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        # reshape 1-D states into 2-D (as expected by the model)
        if len(state.shape) == 1:
            state = np.expand_dims(state, axis=0)
        state = torch.from_numpy(state).float().to(device)
        # calculate action values
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        #print('pre: {}'.format(action))  # DEBUG
        if add_noise:
            action += self.noise.sample()
        #print('pst: {}'.format(action))  # DEBUG
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, indexes=None):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        if self.use_prioritized_experience_replay:
            states, actions, rewards, next_states, dones, priorities = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        # DEBUG replay memory
        #print('learning:')
        #show_frames(states)
        #show_frames(next_states)

        # ---------------------------- update critic ---------------------------- #
        # get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        # get expected Q values from the local critic
        q_expected = self.critic_local(states, actions)
        # compute Q targets for current states (y_i)
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))
        # compute critic loss
        critic_loss = F.mse_loss(q_expected, q_targets)
        # minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # DEBUG gradients
        #for m in self.critic_local.parameters():
        #    print(m.grad)
        if self.clip_gradients:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                           10,
                                           norm_type=2)
            #for param in self.qnetwork_local.parameters():
            #    param.grad.data.clamp_(-10, 10)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        # DEBUG gradients
        #for m in self.actor_local.parameters():
        #    print(m.grad)
        if self.clip_gradients:
            torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(),
                                           10,
                                           norm_type=2)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # ---------------- update prioritized experience replay ---------------- #
        if self.use_prioritized_experience_replay:
            with torch.no_grad():
                new_priorities = torch.abs(q_targets - q_expected).to(device)
                self.memory.batch_update(indexes,
                                         (states, actions, rewards,
                                          next_states, dones, new_priorities))

        # ---------------------------- update stats ---------------------------- #
        with torch.no_grad():
            self.loss_list.append(critic_loss.item())

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
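A hypothetical interaction sketch for the DDPG Agent above, using the DDPGModel wrapper sketched earlier and random placeholder transitions in place of a real continuous-control environment; the 33/4 sizes are assumptions.

# hypothetical interaction sketch -- placeholder data, sizes are assumptions
import numpy as np

agent = Agent(model=DDPGModel(state_size=33, action_size=4),
              action_size=4, n_agents=1)

agent.reset()                                            # reset the OU noise process
state = np.random.randn(33).astype(np.float32)
for t in range(100):
    action = agent.act(state)                            # noisy action clipped to [-1, 1]
    next_state = np.random.randn(33).astype(np.float32)
    reward, done = 0.0, False
    agent.step(state, action, reward, next_state, done)  # store and (maybe) learn
    state = next_state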