Example 1
class MADDPGAgent:
    """
    Defines a Multi-Agent Deep Deterministic Policy Gradient (MADDPG) agent
    """
    def __init__(self,
                 num_agents=2,
                 obs_size=24,
                 act_size=2,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3,
                 weight_decay_actor=1e-5,
                 weight_decay_critic=1e-4,
                 clip_grad=1.0):
        super(MADDPGAgent, self).__init__()

        # Write parameters
        self.num_agents = num_agents
        self.gamma = gamma
        self.tau = tau
        self.clip_grad = clip_grad

        # Create all the networks
        self.actor = ActorNetwork(obs_size, act_size).to(device)
        self.critic = CriticNetwork(num_agents, obs_size, act_size).to(device)
        self.target_actor = ActorNetwork(obs_size, act_size).to(device)
        self.target_critic = CriticNetwork(num_agents, obs_size,
                                           act_size).to(device)

        # Copy initial network parameters to target networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Initialize training optimizers and OU noise
        self.noise = OUNoise(act_size, scale=1.0)
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=lr_actor,
                                    weight_decay=weight_decay_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay_critic)

    def act(self, obs, noise=0.0):
        """ Act using the online actor network """
        obs = obs.to(device)
        action = self.actor(obs) + (noise * self.noise.noise()).to(device)
        action = torch.clamp(action, -1, 1)
        return action

    def target_act(self, obs, noise=0.0):
        """ Act using the target actor network (used for training) """
        obs = obs.to(device)
        action = self.target_actor(obs) + (noise *
                                           self.noise.noise()).to(device)
        action = torch.clamp(action, -1, 1)
        return action

    def update_targets(self):
        """
        Perform soft update of target network parameters based on latest actor/critic parameters
        """
        soft_update(self.target_critic, self.critic, self.tau)
        soft_update(self.target_actor, self.actor, self.tau)

    def train(self, samples):
        """
        Perform a training step for critic and actor networks with soft update
        """

        # Unpack data from replay buffer and convert to tensors
        obs = torch.tensor([exp[0] for exp in samples],
                           dtype=torch.float,
                           device=device)
        act = torch.tensor([exp[1] for exp in samples],
                           dtype=torch.float,
                           device=device)
        reward = torch.tensor([exp[2] for exp in samples],
                              dtype=torch.float,
                              device=device)
        next_obs = torch.tensor([exp[3] for exp in samples],
                                dtype=torch.float,
                                device=device)
        done = torch.tensor([exp[4] for exp in samples],
                            dtype=torch.float,
                            device=device)
        obs_full = torch.tensor([exp[5] for exp in samples],
                                dtype=torch.float,
                                device=device)
        next_obs_full = torch.tensor([exp[6] for exp in samples],
                                     dtype=torch.float,
                                     device=device)
        act_full = torch.tensor([exp[7] for exp in samples],
                                dtype=torch.float,
                                device=device)

        # Critic update
        self.critic_optimizer.zero_grad()
        target_critic_obs = [next_obs_full[:,i,:].squeeze() \
                        for i in range(self.num_agents)]
        target_critic_obs = torch.cat(target_critic_obs, dim=1)
        target_act = [self.target_act(next_obs_full[:,i,:].squeeze()) \
                        for i in range(self.num_agents)]
        target_act = torch.cat(target_act, dim=1)
        with torch.no_grad():
            q_next = self.target_critic(target_critic_obs, target_act)
        q_target = reward + self.gamma * q_next * (1 - done)

        critic_obs = [obs_full[:,i,:].squeeze() \
                        for i in range(self.num_agents)]
        critic_obs = torch.cat(critic_obs, dim=1)
        critic_act = [act_full[:,i,:].squeeze() \
                        for i in range(self.num_agents)]
        critic_act = torch.cat(critic_act, dim=1)
        q = self.critic(critic_obs, critic_act)

        critic_loss = torch.nn.functional.mse_loss(q, q_target.detach())
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                       self.clip_grad)
        self.critic_optimizer.step()

        # Actor update using policy gradient
        self.actor_optimizer.zero_grad()
        actor_act = [self.act(obs_full[:,i,:].squeeze()) \
                     for i in range(self.num_agents)]
        actor_act = torch.cat(actor_act, dim=1)
        actor_loss = -self.critic(critic_obs, actor_act).mean()
        actor_loss.backward()
        # Clip gradients only after backward() has populated them
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_grad)
        self.actor_optimizer.step()

        # Update target networks
        self.update_targets()
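Example 1 calls hard_update and soft_update without defining them. Below is a minimal sketch of what such helpers typically look like; the signatures follow the call sites above (target network first, source network second), but the exact implementation in the original project may differ.

def hard_update(target, source):
    """Copy every parameter of `source` into `target` verbatim."""
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(target, source, tau):
    """Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)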
Example 2
class DdpgAgent:
    """
    A Deep Deterministic Policy Gradient Agent.
    Interacts with and learns from the environment.
    """
    def __init__(self, num_agents, state_size, action_size, random_seed):
        """
        Initialize an Agent object.
        
        Params
        ======
            num_agents (int): number of agents observed at the same time; multiple agents are handled within the class.
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """

        if random_seed is not None:
            random.seed(random_seed)
            np.random.seed(random_seed)

        self.t_step = 0  # A counter that increases each time the "step" function is executed
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = ActorNetwork(state_size,
                                        action_size,
                                        USE_BATCH_NORM,
                                        random_seed,
                                        fc1_units=FC1_UNITS,
                                        fc2_units=FC2_UNITS,
                                        fc3_units=FC3_UNITS).to(device)
        self.actor_target = ActorNetwork(state_size,
                                         action_size,
                                         USE_BATCH_NORM,
                                         random_seed,
                                         fc1_units=FC1_UNITS,
                                         fc2_units=FC2_UNITS,
                                         fc3_units=FC3_UNITS).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR,
                                          weight_decay=WEIGHT_DECAY_ACTOR)
        # self.actor_optimizer = optim.RMSprop(self.actor_local.parameters(), lr=LR_ACTOR,
        #                                      weight_decay=WEIGHT_DECAY_ACTOR)  # Also solves it, but Adam quicker

        # Critic Network (w/ Target Network)
        self.critic_local = CriticNetwork(state_size,
                                          action_size,
                                          USE_BATCH_NORM,
                                          random_seed,
                                          fc1_units=FC1_UNITS,
                                          fc2_units=FC2_UNITS,
                                          fc3_units=FC3_UNITS).to(device)
        self.critic_target = CriticNetwork(state_size,
                                           action_size,
                                           USE_BATCH_NORM,
                                           random_seed,
                                           fc1_units=FC1_UNITS,
                                           fc2_units=FC2_UNITS,
                                           fc3_units=FC3_UNITS).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY_CRITIC)
        # self.critic_optimizer = optim.RMSprop(self.critic_local.parameters(), lr=LR_CRITIC,
        #                                       weight_decay=WEIGHT_DECAY_CRITIC)  # Also solves it, but Adam quicker

        # Make sure target is initiated with the same weight as the local network
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.soft_update(self.critic_local, self.critic_target, 1)

        # Setting default modes for the networks
        # Target networks do not need to train, so always eval()
        # Local networks, in training mode, unless altered in code - eg when acting.
        self.actor_local.train()
        self.actor_target.eval()
        self.critic_local.train()
        self.critic_target.eval()

        # Action Noise process (encouraging exploration during training)
        # Could consider parameter noise in future as a potentially better alternative / addition
        if ACTION_NOISE_METHOD == 'initial':
            self.noise = InitialOrnsteinUhlenbeckActionNoise(
                shape=(num_agents, action_size),
                random_seed=random_seed,
                x0=0,
                mu=0,
                theta=NOISE_THETA,
                sigma=NOISE_SIGMA)
        elif ACTION_NOISE_METHOD == 'adjusted':
            self.noise = AdjustedOrnsteinUhlenbeckActionNoise(
                shape=(num_agents, action_size),
                random_seed=random_seed,
                x0=0,
                mu=0,
                sigma=NOISE_SIGMA,
                theta=NOISE_THETA,
                dt=NOISE_DT,
                sigma_delta=NOISE_SIGMA_DELTA,
            )
        else:
            raise ValueError('Unknown action noise method: ' +
                             ACTION_NOISE_METHOD)

        # Replay memory
        self.memory = ReplayBuffer(
            buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            sampling_method=REPLAY_BUFFER_SAMPLING_METHOD,
            random_seed=random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.t_step += 1

        # Save experience / reward
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory, every UPDATE_EVERY steps
        if self.t_step % UPDATE_EVERY == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_action_noise=False):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()  # train state is set right before actual training
        # All calcs here with no_grad, but many examples didn't do this. Weirdly, this is slower..
        with torch.no_grad():
            return np.clip(
                self.actor_local(states).cpu().data.numpy() +
                (self.noise.sample() if add_action_noise else 0), -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): reward discount factor
        """

        states, actions, rewards, next_states, dones = experiences
        # critic_local is always in train mode, but actor_local switches to eval while acting
        self.actor_local.train()

        # Critic
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_GRADIENT_CRITIC:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if CLIP_GRADIENT_ACTOR:
            torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # Soft-Update of Target Networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update target model parameters from local model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
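For reference, a driver loop for DdpgAgent might look like the sketch below. The function name train_ddpg, the gym-style environment interface, and the specific sizes and episode count are assumptions for illustration, not part of the original code.

def train_ddpg(env, n_episodes=200):
    # Hypothetical training driver; assumes env.reset()/env.step() return per-agent arrays.
    agent = DdpgAgent(num_agents=2, state_size=24, action_size=2, random_seed=0)
    scores = []
    for _ in range(n_episodes):
        states = env.reset()
        agent.reset()  # restart the action-noise process each episode
        score, dones = 0.0, [False]
        while not any(dones):
            actions = agent.act(states, add_action_noise=True)
            next_states, rewards, dones, _ = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += float(np.mean(rewards))
        scores.append(score)
    return scores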
Example 3
class Agent:
    """ Proximal Policy Optimization (PPO) agent """
    def __init__(self,
                 n_actions,
                 n_states,
                 obs_shape,
                 gamma=0.99,
                 lr=0.0003,
                 gae_lambda=0.95,
                 entropy_coeff=0.0005,
                 ppo_clip=0.2,
                 mini_batch_size=64,
                 n_epochs=10,
                 clip_value_loss=True,
                 normalize_observation=False,
                 stop_normalize_obs_after_timesteps=50000,
                 fc1=64,
                 fc2=64,
                 environment='None',
                 run=0):

        self.entropy_coeff = entropy_coeff
        self.clip_value_loss = clip_value_loss
        self.gamma = gamma
        self.ppo_clip = ppo_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda
        self.normalize_observation = normalize_observation
        self.stop_obs_timesteps = stop_normalize_obs_after_timesteps
        self.timestep = 0

        self.actor = ActorNetwork(n_states=n_states,
                                  n_actions=n_actions,
                                  lr=lr,
                                  fc1_dims=fc1,
                                  fc2_dims=fc2,
                                  chkpt_dir=environment,
                                  run=run)
        self.critic = CriticNetwork(n_states=n_states,
                                    lr=lr,
                                    fc1_dims=fc1,
                                    fc2_dims=fc2,
                                    chkpt_dir=environment,
                                    run=run)
        self.memory = PPOMemory(mini_batch_size, gamma, gae_lambda)
        self.running_stats = RunningStats(shape_states=obs_shape,
                                          chkpt_dir=environment,
                                          run=run)
        # self.optimizer = optim.Adam(list(self.actor.parameters()) + list(self.critic.parameters()), lr=lr, eps=1e-5)

    def remember(self, state, action, log_probs, value, reward, done):
        self.memory.store_memory(state, action, log_probs, value, reward, done)

    def remember_adv(self, advantage_list):
        self.memory.store_advantage(advantage_list)

    def save_networks(self):
        print('--saving networks--')
        self.actor.save_actor()
        self.critic.save_critic()
        if self.normalize_observation:
            self.running_stats.save_stats()

    def load_networks(self):
        print('--loading networks--')
        self.actor.load_actor()
        self.critic.load_critic()
        if self.normalize_observation:
            self.running_stats.load_stats()

    def normalize_obs(self, obs):
        mean, std = self.running_stats()
        obs_norm = (obs - mean) / (std + 1e-6)
        return obs_norm

    def choose_action(self, observation):
        if self.normalize_observation:
            if self.timestep < self.stop_obs_timesteps:
                self.running_stats.online_update(observation)
            elif self.timestep == self.stop_obs_timesteps:
                print('No online update for obs Normalization anymore')
            # Normalize observations
            observation = self.normalize_obs(observation)

        state = T.tensor([observation], dtype=T.float).to(self.actor.device)

        dist, _ = self.actor(state)
        value = self.critic(state)
        action = dist.sample()

        log_probs = dist.log_prob(action)
        log_probs = T.sum(log_probs, dim=1,
                          keepdim=True).squeeze().detach().cpu().numpy()

        value = T.squeeze(value).item()

        # action = T.squeeze(action).detach().numpy()
        if action.shape[0] == 1 and action.shape[1] == 1:
            action = action.detach().cpu().numpy()[0].reshape(1, )
        else:
            action = T.squeeze(action).detach().cpu().numpy()
        self.timestep += 1

        return action, log_probs, value

    def choose_deterministic_action(self, observation):
        if self.normalize_observation:
            # Normalize observations
            observation = self.normalize_obs(observation)

        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        _, mean = self.actor(state)
        action = T.squeeze(mean).detach().cpu().numpy()  #.reshape(1, )
        return action

    def learn(self):
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr, \
            reward_arr, dones_arr, advantage_arr, batches = \
                self.memory.generate_batches()

            if self.normalize_observation:
                #print(state_arr[0:5,:])
                state_arr = self.normalize_obs(state_arr)
                #print(state_arr[0:5,:])

            for batch in batches:
                states = T.tensor(state_arr[batch],
                                  dtype=T.float).to(self.actor.device)
                old_log_probs = T.tensor(old_prob_arr[batch]).to(
                    self.actor.device).detach()
                actions = T.tensor(action_arr[batch]).to(
                    self.actor.device).detach()
                critic_value_old = T.tensor(vals_arr[batch]).to(
                    self.actor.device).detach()
                advantage = T.tensor(advantage_arr[batch]).to(
                    self.actor.device)
                #returns = T.tensor(reward_arr[batch]).to(self.actor.device)
                #advantage = returns - critic_value_old

                # Advantage Normalization per Mini-Batch
                advantage = (advantage - advantage.mean()) / (advantage.std() +
                                                              1e-8)
                advantage = advantage.detach()

                ## Actor-Loss
                dist, _ = self.actor(states)
                critic_value_new = self.critic(states)
                critic_value_new = T.squeeze(critic_value_new)

                new_log_probs = dist.log_prob(actions)
                new_log_probs = T.sum(new_log_probs, dim=1,
                                      keepdim=True).squeeze()

                prob_ratio = (new_log_probs - old_log_probs).exp()

                weighted_probs = advantage * prob_ratio
                weighted_clipped_probs = T.clamp(prob_ratio, 1 - self.ppo_clip,
                                                 1 + self.ppo_clip) * advantage
                ppo_surr_loss = -T.min(weighted_probs,
                                       weighted_clipped_probs).mean()
                entropy_loss = -self.entropy_coeff * dist.entropy().mean()
                actor_loss = ppo_surr_loss + entropy_loss

                ## Critic-Loss
                returns = advantage + critic_value_old
                # Clipping Value Loss
                if self.clip_value_loss:
                    v_loss_unclipped = ((critic_value_new - returns)**2)
                    v_clipped = critic_value_old + T.clamp(
                        critic_value_new - critic_value_old, -self.ppo_clip,
                        self.ppo_clip)
                    v_loss_clipped = (v_clipped - returns)**2
                    v_loss_max = T.max(v_loss_unclipped, v_loss_clipped)
                    critic_loss = 0.5 * v_loss_max.mean()
                else:
                    critic_loss = 0.5 * (
                        (critic_value_new - returns)**2).mean()

                ## Backprop Actor
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(parameters=self.actor.parameters(),
                                         max_norm=0.5,
                                         norm_type=2)
                self.actor.optimizer.step()

                ## Backprop Critic
                self.critic.optimizer.zero_grad()
                critic_loss.backward()
                nn.utils.clip_grad_norm_(parameters=self.critic.parameters(),
                                         max_norm=0.5,
                                         norm_type=2)
                self.critic.optimizer.step()

                # loss = critic_loss + actor_loss
                # self.optimizer.zero_grad()
                # loss.backward()
                # nn.utils.clip_grad_norm_(parameters=list(self.actor.parameters()) + list(self.critic.parameters()),
                #                        max_norm=0.8,
                #                        norm_type=2)
                # self.optimizer.step()

        self.memory.clear_memory()  # Clear memory to save new samples for the next iteration
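The learn() method above reads advantages that were stored earlier via remember_adv(). A common way to build that advantage_list is Generalized Advantage Estimation over the collected rollout, using the same gamma and gae_lambda. The helper below is an illustrative sketch; the function name compute_gae and its arguments (rewards, values, dones, last_value) are assumptions, and the original project may compute the advantages differently.

def compute_gae(rewards, values, dones, last_value, gamma=0.99, gae_lambda=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    # A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
    advantages = [0.0] * len(rewards)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        mask = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * mask - values[t]
        gae = delta + gamma * gae_lambda * mask * gae
        advantages[t] = gae
        next_value = values[t]
    return advantages

The result would then be handed to the agent with agent.remember_adv(advantages) before calling agent.learn().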
Example 4
def main(args):
    args = parse_arguments()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    env = gym.make(args.env_name)
    os.environ['OMP_NUM_THREADS'] = '1'
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed_all(args.seed)
    torch.set_num_threads(1)
    writer = SummaryWriter(log_dir=args.save_dir)
    actor = ActorNetwork(env.observation_space.shape[0], env.action_space.n)
    critic = CriticNetwork(env.observation_space.shape[0])
    if args.continue_training:
        try:
            actorState = torch.load(args.load_dir,
                                    map_location=lambda storage, loc: storage)
            actor.load_state_dict(actorState)
        except (FileNotFoundError, RuntimeError) as err:
            raise RuntimeError("Unable to find a model to load") from err
    if args.cuda:
        actor.cuda()
        critic.cuda()
    actor_optimizer = optim.Adam(actor.parameters(), lr=args.lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=args.lr)
    N = args.nsteps
    eps = 1.0
    obsarr = []
    rewardarr = []
    actionlossarr = []
    actionarr = []
    valuearr = []
    ep_len = 0
    for ep in range(args.num_episodes):
        done = False
        obs = env.reset()

        while not done:
            ep_len += 1
            # Collect the rollout without building a graph (replaces the deprecated volatile Variable)
            with torch.no_grad():
                obs_var = torch.from_numpy(obs).float()
                action = actor.get_action(obs_var)
                value = critic(obs_var)
            action = action.item()
            next_obs, reward, done, _ = env.step(action)
            if args.render:
                env.render()
            obsarr.append(obs)
            actionarr.append(action)
            rewardarr.append(reward)
            valuearr.append(value)
            obs = next_obs

        T = len(obsarr)
        G = [0] * T

        batch_obs = torch.from_numpy(np.stack(obsarr)).float()
        batch_act = torch.from_numpy(np.array(actionarr))
        logprobvar = actor.evaluate_actions(batch_obs, batch_act)
        valvar = critic(batch_obs)
        logprobvar = logprobvar.squeeze(1)
        valvar = valvar.squeeze(1)

        for t in reversed(range(T)):
            V = 0
            if t + N < T:
                V = valvar[t + N].item()
            G[t] = pow(args.gamma, N) * V
            u = min(N, T - t)
            for k in range(u):
                G[t] += pow(args.gamma, k) * rewardarr[t + k]

        Gtensor = torch.tensor(G, dtype=torch.float)
        adv = 0.01 * Gtensor - valvar.detach()
        action_loss = -(adv * logprobvar).mean()
        value_loss = (0.01 * Gtensor - valvar).pow(2).mean()
        actionlossarr.append(action_loss)

        critic_optimizer.zero_grad()
        value_loss.backward()
        torch.nn.utils.clip_grad_norm_(critic.parameters(), 3)
        critic_optimizer.step()

        if ep % args.update_freq == 0:
            actor_optimizer.zero_grad()
            l = torch.stack(actionlossarr).mean()  # stack the 0-dim losses (torch.cat fails on scalars)
            l.backward()
            torch.nn.utils.clip_grad_norm_(actor.parameters(), 3)
            actor_optimizer.step()
            r = np.array(rewardarr).sum() / args.update_freq
            print("Episode: {} | Reward: {:.3f}| Length: {}".format(
                ep, r, ep_len / args.update_freq))
            obsarr = []
            rewardarr = []
            actionlossarr = []
            actionarr = []
            ep_len = 0

        if ep % 500 == 0:
            torch.save(actor.state_dict(),
                       args.save_dir + '/' + args.env_name + '.pt')
            rm, rs, em = test(env, actor, False)
            writer.add_scalar('test/reward_mean', rm, ep)
            writer.add_scalar('test/reward_std', rs, ep)
            writer.add_scalar('test/ep_len_mean', em, ep)
            writer.export_scalars_to_json(args.save_dir + '/' + args.env_name +
                                          '_scalars.json')

        writer.add_scalar('train/reward', r, ep)
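The doubly nested loop in main() builds N-step bootstrapped returns, G_t = sum over k < min(N, T - t) of gamma^k * r_{t+k}, plus gamma^N * V(s_{t+N}) when t + N < T. The same computation is shown below as a standalone helper purely to make the recursion explicit; the function name n_step_returns is hypothetical and not part of the original script.

def n_step_returns(rewards, values, n, gamma):
    # Mirrors the G[t] loop in main(): bootstrap with V(s_{t+n}) when it exists, else 0.
    length = len(rewards)
    returns = [0.0] * length
    for t in range(length):
        bootstrap = values[t + n] if t + n < length else 0.0
        g = (gamma ** n) * bootstrap
        for k in range(min(n, length - t)):
            g += (gamma ** k) * rewards[t + k]
        returns[t] = g
    return returns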
Example 5
class Agent:
    """ This class represents the reinforcement learning agent """
    def __init__(self,
                 state_size: int,
                 action_size: int,
                 gamma: float = 0.99,
                 lr_actor: float = 0.001,
                 lr_critic: float = 0.003,
                 weight_decay: float = 0.0001,
                 tau: float = 0.001,
                 buffer_size: int = 100000,
                 batch_size: int = 64):
        """
        :param state_size: how many states does the agent get as input (input size of neural networks)
        :param action_size: from how many actions can the agent choose
        :param gamma: discount factor
        :param lr_actor: learning rate of the actor network
        :param lr_critic: learning rate of the critic network
        :param weight_decay: L2 weight decay used by the critic optimizer
        :param tau: soft update parameter
        :param buffer_size: size of replay buffer
        :param batch_size: size of learning batch (mini-batch)
        """
        self.tau = tau
        self.gamma = gamma

        self.batch_size = batch_size

        self.actor_local = ActorNetwork(state_size, action_size).to(device)
        self.actor_target = ActorNetwork(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        print(self.actor_local)

        self.critic_local = CriticNetwork(state_size, action_size).to(device)
        self.critic_target = CriticNetwork(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)
        print(self.critic_local)

        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        self.memory = ReplayBuffer(action_size, buffer_size, batch_size)
        # this would probably also work with Gaussian noise instead of Ornstein-Uhlenbeck process
        self.noise = OUNoise(action_size)

    def step(self, experience: tuple):
        """
        :param experience: tuple consisting of (state, action, reward, next_state, done)
        :return:
        """
        self.memory.add(*experience)

        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state, add_noise: bool = True):
        """ Actor uses the policy to act given a state """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local.forward(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        # Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        # the actor_target returns the next action, this next action is then used (with the state) to estimate
        # the Q-value with the critic_target network

        states, actions, rewards, next_states, dones = experiences

        # region Update Critic
        actions_next = self.actor_target.forward(next_states)
        q_expected = self.critic_local.forward(states, actions)
        q_targets_next = self.critic_target.forward(next_states, actions_next)

        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))

        # minimize the loss
        critic_loss = F.mse_loss(q_expected, q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        # endregion Update Critic

        # region Update actor
        # Compute actor loss
        actions_predictions = self.actor_local.forward(states)
        actor_loss = -self.critic_local.forward(states,
                                                actions_predictions).mean()
        # Minimize actor loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # endregion Update actor

        # region update target network
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)
        # endregion update target network

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def hard_update(self, local_model, target_model):
        """Copy the weights and biases from the local to the target network"""
        for target_param, param in zip(target_model.parameters(),
                                       local_model.parameters()):
            target_param.data.copy_(param.data)

    def reset(self):
        self.noise.reset()
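This last agent (like Example 1) constructs an OUNoise object that is not shown in the snippet. A minimal Ornstein-Uhlenbeck noise process along these lines is sketched below; the constructor arguments and method names differ between the examples (scale and noise() in Example 1, sample()/reset() here), so treat this as an illustration under those assumptions rather than the original implementation.

import numpy as np


class OUNoise:
    """Sketch of an Ornstein-Uhlenbeck exploration-noise process."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, seed=None):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # Restart the process at its long-run mean
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated noise around mu
        dx = (self.theta * (self.mu - self.state) +
              self.sigma * self.rng.standard_normal(self.mu.shape))
        self.state = self.state + dx
        return self.state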