def __init__(self, state_size, action_size, n_agents, seed):
        """Create the actor/critic networks, their optimizers and the noise.

        Params
        ======
            state_size (int): per-agent state size, passed to the actors
            action_size (int): per-agent action size, passed to the actors
                and to the exploration noise
            n_agents (int): multiplier applied to both sizes to obtain the
                stacked sizes handed to the critics
            seed (int): random seed forwarded to the networks and the noise
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator first and then
        # keep the real value on the instance.
        random.seed(seed)
        self.seed = seed

        self.stacked_state_size = state_size * n_agents
        self.stacked_action_size = action_size * n_agents

        # Actor networks (act on a single agent's state)
        self.actor_local = ActorNetwork(state_size, action_size,
                                        seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=ACTOR_LR)

        # Critic networks (receive the stacked state/action sizes)
        self.critic_local = CriticNetwork(self.stacked_state_size,
                                          self.stacked_action_size,
                                          seed).to(device)
        self.critic_target = CriticNetwork(self.stacked_state_size,
                                           self.stacked_action_size,
                                           seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=CRITIC_LR)

        # OUNoise
        self.exploration_noise = OUNoise(action_size, seed)
Beispiel #2
0
    def __init__(self, state_size, action_size, agent_id):
        """Set up the actor networks, replay memory, noise and counters.

        Settings come from an ``args`` mapping that is not a parameter of
        this method (it is resolved from an enclosing/global scope); the keys
        read here are 'seed', 'device', 'LR_ACTOR', 'BUFFER_SIZE' and
        'BATCH_SIZE'.

        ``agent_id`` is accepted but not used by the active code; it was only
        used by a disabled warm-start that loaded saved weights from
        args['agent_p0_path'] / args['agent_p1_path'].
        """
        seed = args['seed']
        device = args['device']

        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        self.device = device

        # Online and target actor, plus the optimizer for the online one.
        self.actor_network = ActorNetwork(state_size, action_size).to(device)
        self.actor_target = ActorNetwork(state_size, action_size).to(device)
        actor_params = self.actor_network.parameters()
        self.actor_optimizer = optim.Adam(actor_params, lr=args['LR_ACTOR'])

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   args['BUFFER_SIZE'],
                                   args['BATCH_SIZE'],
                                   device,
                                   seed)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Step counter (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Loss bookkeeping, both start at zero.
        self.mCriticLoss = 0
        self.actorLoss = 0
Beispiel #3
0
    def __init__(self, state_size, obs_size, action_size, num_agents):
        """Build actor/critic pairs, sync the targets and create optimizers.

        The actors are constructed from (obs_size, action_size); the critics
        from (state_size, action_size * num_agents).
        """
        super(DDPGAgent, self).__init__()

        critic_action_size = action_size * num_agents

        self.actor = ActorNetwork(obs_size, action_size).to(device)
        self.critic = CriticNetwork(state_size, critic_action_size).to(device)
        self.target_actor = ActorNetwork(obs_size, action_size).to(device)
        self.target_critic = CriticNetwork(state_size,
                                           critic_action_size).to(device)

        self.noise = OUNoise(action_size, scale=1.0)

        # Start each target as an exact copy of its online network.
        for target_net, online_net in ((self.target_actor, self.actor),
                                       (self.target_critic, self.critic)):
            hard_update(target_net, online_net)

        actor_params = self.actor.parameters()
        self.actor_optimizer = Adam(actor_params, lr=LR_ACTOR)
        critic_params = self.critic.parameters()
        self.critic_optimizer = Adam(critic_params,
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)
Beispiel #4
0
    def __init__(self, state_size, action_size, random_seed, agent_size=1):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            agent_size (int): stored on the instance; not otherwise used in
                this constructor
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; seed first, then store the real value.
        random.seed(random_seed)
        self.seed = random_seed
        self.agent_size = agent_size

        # Local/target actor and critic, all built with the same seed.
        self.local_actor = ActorNetwork(state_size, action_size,
                                        random_seed).to(device)
        self.target_actor = ActorNetwork(state_size, action_size,
                                         random_seed).to(device)
        self.local_critic = CriticNetwork(state_size, action_size,
                                          random_seed).to(device)
        self.target_critic = CriticNetwork(state_size, action_size,
                                           random_seed).to(device)

        # Only the local networks are optimized here.
        self.opt_actor = optim.Adam(self.local_actor.parameters(), lr=LR_ACTOR)
        self.opt_critic = optim.Adam(self.local_critic.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
Beispiel #5
0
class Actor():
    """Local/target actor pair with an optimizer and an injected noise source."""

    def __init__(self, state_size, action_size, random_seed, learning_rate,
                 noise, device):
        """Build the actor networks and the optimizer.

        Params
        ======
            state_size (int): state size passed to ActorNetwork
            action_size (int): action size passed to ActorNetwork
            random_seed (int): random seed
            learning_rate (float): learning rate of the Adam optimizer
            noise: object providing ``sample()`` and ``reset()``
            device: device the networks (and input states) are moved to
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; seed first, then store the real value.
        random.seed(random_seed)
        self.seed = random_seed
        self.learning_rate = learning_rate

        self.actor_local = ActorNetwork(state_size, action_size,
                                        random_seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         random_seed).to(device)
        # Target starts as a copy of the local network.
        hard_update(self.actor_target, self.actor_local)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate)

        self.noise = noise
        self.device = device

    def act(self, state, noise_factor, add_noise):
        """Return the local actor's action for ``state``, clipped to [-1, 1].

        Params
        ======
            state (numpy.ndarray): input converted to a float tensor
            noise_factor (float): multiplier applied to the noise sample
            add_noise (bool): when true, the scaled noise sample is added
                before clipping
        """
        state = torch.from_numpy(state).float().to(self.device)
        # Forward pass in eval mode and without gradient tracking, then put
        # the network back into train mode.
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += (noise_factor * self.noise.sample())
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the noise source."""
        self.noise.reset()
Beispiel #6
0
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; seed first, then store the real value.
        random.seed(seed)
        self.seed = seed

        # create the local and target actor networks
        self.actor_local = ActorNetwork(state_size, action_size,
                                        seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         seed).to(device)

        # create the local and target critic networks
        self.critic_local = CriticNetwork(state_size, action_size,
                                          seed).to(device)
        self.critic_target = CriticNetwork(state_size, action_size,
                                           seed).to(device)

        # optimizers for local actor and critic
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR,
                                           weight_decay=0.0)

        # MSE loss for updating the critic
        self.critic_loss_function = nn.MSELoss()

        # ensure that the local and target networks are initialized with the
        # same weights by copying every local parameter into its target
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor_local.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic_local.parameters()):
            target_param.data.copy_(param.data)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # init the noise class to sample from
        self.noise = GaussianNoise(self.action_size)
def create_model(context):
    """Create the TF session, actor, critic and saver and attach them to ``context``.

    Reads ``assets``, ``n``, ``features``, ``actor_learning_rate``,
    ``critic_learning_rate``, ``tau``, ``gamma`` and ``minibatch_size`` from
    ``context`` and writes ``sess``, ``actor``, ``critic`` and ``saver`` back.
    """
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session = tf.InteractiveSession(config=session_config)
    context.sess = session

    # Both networks receive the same shape description and the same
    # "number of assets + 1" size; each gets its own list object.
    shape = (len(context.assets), context.n, len(context.features))
    assets_plus_one = len(context.assets) + 1

    context.actor = ActorNetwork(session, list(shape), assets_plus_one,
                                 context.actor_learning_rate, context.tau,
                                 context.minibatch_size)

    context.critic = CriticNetwork(session, list(shape), assets_plus_one,
                                   context.critic_learning_rate, context.tau,
                                   context.gamma,
                                   context.actor.get_num_trainable_vars())

    # Initialize the TensorFlow variables
    session.run(tf.global_variables_initializer())

    context.saver = tf.train.Saver()

    # Initialize the weights of the target networks
    for network in (context.actor, context.critic):
        network.update_target_network()
Beispiel #8
0
class Learner:
    """Consumes batches from a queue and updates a shared actor and critic."""

    def __init__(self, opt, q_batch):
        """Create the environment, the shared networks and their optimizers.

        ``opt`` supplies ``env``, ``seed`` and ``lr``; ``q_batch`` is the
        queue that ``learning`` reads batches from.
        """
        self.opt = opt
        self.q_batch = q_batch

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        # The environment is only queried for its space sizes here.
        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed)
        obs_space = self.env.observation_space
        self.n_state = obs_space.shape[0]
        self.n_act = self.env.action_space.n

        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)
        for network in (self.actor, self.critic):
            network.share_memory()

        learning_rate = opt.lr
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=learning_rate)

    def learning(self):
        """Loop forever: take a batch from the queue, update actor then critic."""
        torch.manual_seed(self.opt.seed)

        while True:
            # Blocks until a (states, actions, rewards) batch is available.
            states, actions, rewards = self.q_batch.get(block=True)

            action_mask = torch.FloatTensor(
                index2onehot(actions, self.n_act)).to(self.device)

            # --- actor update ---
            self.actor_optimizer.zero_grad()
            actor_out = self.actor(states)
            # Keep only the entry of the action that was taken.
            taken = (actor_out * action_mask).sum(1)
            values = self.critic(states)
            # Detached so the actor loss does not back-propagate into the critic.
            advantages = rewards - values.detach()
            actor_loss = -(taken * advantages).sum()
            actor_loss.backward()
            self.actor_optimizer.step()

            # --- critic update: regress the values onto the rewards ---
            self.critic_optimizer.zero_grad()
            critic_loss = nn.MSELoss()(values, rewards)
            critic_loss.backward()
            self.critic_optimizer.step()
Beispiel #9
0
    def __init__(self, state_size, action_size, random_seed, learning_rate,
                 noise, device):
        """Build the local/target actor networks and the optimizer.

        Params
        ======
            state_size (int): state size passed to ActorNetwork
            action_size (int): action size passed to ActorNetwork
            random_seed (int): random seed
            learning_rate (float): learning rate of the Adam optimizer
            noise: noise object stored on the instance
            device: device the networks are moved to
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; seed first, then store the real value.
        random.seed(random_seed)
        self.seed = random_seed
        self.learning_rate = learning_rate

        self.actor_local = ActorNetwork(state_size, action_size,
                                        random_seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         random_seed).to(device)
        # Target starts as a copy of the local network.
        hard_update(self.actor_target, self.actor_local)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate)

        self.noise = noise
        self.device = device
Beispiel #10
0
def build_actor_critic(sess, env):
    """Build training-mode actor and critic networks and return them.

    Both networks are created inside the "model" variable scope with a
    Xavier initializer; after the global variables are initialized, each
    network's target weights are updated once.
    """
    xavier = tflearn.initializations.xavier_initializer()
    model_scope = tf.variable_scope("model", reuse=None, initializer=xavier)

    with model_scope:
        with tf.name_scope("actor"):
            actor_net = ActorNetwork(sess, env, config, is_training=True)

        with tf.name_scope("critic"):
            critic_net = CriticNetwork(sess, env, config, is_training=True)

    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # Initialize target network weights
    for network in (actor_net, critic_net):
        network.update_target_network()

    return actor_net, critic_net
Beispiel #11
0
    def __init__(self, opt, q_batch):
        """Create the environment, the shared networks and their optimizers.

        ``opt`` supplies ``env``, ``seed`` and ``lr``; ``q_batch`` is stored
        on the instance unchanged.
        """
        self.opt = opt
        self.q_batch = q_batch

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        # The environment is only queried for its space sizes here.
        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed)
        obs_space = self.env.observation_space
        self.n_state = obs_space.shape[0]
        self.n_act = self.env.action_space.n

        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)
        for network in (self.actor, self.critic):
            network.share_memory()

        learning_rate = opt.lr
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=learning_rate)
    def __init__(self, state_size, action_size, memory, seed=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory: replay memory object stored on the instance
            seed (int): random seed (optional; forwarded to the networks)
        """
        
        self.state_size = state_size
        self.action_size = action_size
        
        # Always define the attribute (None when no seed was given) so that
        # reading self.seed can never raise AttributeError.
        self.seed = seed

        # create the local and target actor networks
        self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)
        
        # create the local and target critic networks
        self.critic_local = CriticNetwork(state_size, action_size, seed).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, seed).to(device)
        
        # optimizers for local actor and critic 
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR, weight_decay=0.0)
        
        # Smooth L1 loss for updating the critic (MSE was used previously)
        self.critic_loss_function = nn.SmoothL1Loss()

        # copy the local networks weights to the target network 
        self.copy_weights_from_local_to_target()
        
        # Replay memory
        self.memory = memory
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
        # init the noise class to sample from
        self.noise = GaussianNoise(self.action_size)
Beispiel #13
0
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 hidden_in_actor=512,
                 hidden_out_actor=256,
                 lr_actor=1e-4,
                 hidden_in_critic=512,
                 hidden_out_critic=256,
                 lr_critic=3e-4,
                 weight_decay_critic=0,
                 seed=1,
                 device='cpu'):
        """Build actor/critic pairs, their optimizers and the noise process.

        The hidden-size arguments are forwarded to the network constructors,
        ``lr_*`` and ``weight_decay_critic`` to the Adam optimizers, and
        ``seed`` to the networks and the noise. Every network is moved to
        ``device``; each target ends up as a hard copy of its online network.
        """
        super(DDPGAgent, self).__init__()

        self.device = device

        # Constructor arguments shared by each online/target pair.
        actor_args = (state_size, hidden_in_actor, hidden_out_actor,
                      action_size, seed)
        critic_args = (state_size, action_size, num_agents, hidden_in_critic,
                       hidden_out_critic, seed)

        # Actor
        self.actor = ActorNetwork(*actor_args).to(device)
        self.target_actor = ActorNetwork(*actor_args).to(device)
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)

        # Critic
        self.critic = CriticNetwork(*critic_args).to(device)
        self.target_critic = CriticNetwork(*critic_args).to(device)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay_critic)

        # Noise
        self.noise = OUNoise(action_size, seed, scale=1.0)

        # Targets start as exact copies of the online networks.
        for target_net, online_net in ((self.target_actor, self.actor),
                                       (self.target_critic, self.critic)):
            hard_update(target_net, online_net)
Beispiel #14
0
def build_actor_critic(sess, env):
    """Build non-training actor and critic networks and return them.

    Both networks are created inside the "model" variable scope with
    ``is_training=False``; the global variables are initialized afterwards.
    """
    model_scope = tf.variable_scope("model", reuse=None)

    with model_scope:
        with tf.name_scope("actor"):
            actor_net = ActorNetwork(sess, env, config, is_training=False)

        with tf.name_scope("critic"):
            critic_net = CriticNetwork(sess, env, config, is_training=False)

    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    return actor_net, critic_net
Beispiel #15
0
class DDPGAgent:
    """Actor/critic pair with target copies, optimizers and action noise."""

    def __init__(self, state_size, obs_size, action_size, num_agents):
        """Build the networks, sync the targets and create the optimizers.

        The actors are constructed from (obs_size, action_size); the critics
        from (state_size, action_size * num_agents).
        """
        super(DDPGAgent, self).__init__()

        self.actor = ActorNetwork(obs_size, action_size).to(device)
        self.critic = CriticNetwork(state_size,
                                    action_size * num_agents).to(device)
        self.target_actor = ActorNetwork(obs_size, action_size).to(device)
        self.target_critic = CriticNetwork(state_size,
                                           action_size * num_agents).to(device)

        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=LR_ACTOR)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

    def act(self, obs, noise=0.0):
        """Return the online actor's output for ``obs`` plus scaled noise.

        Params
        ======
            obs: observation; numpy arrays (including subclasses) are
                converted to a float tensor on ``device``, anything else is
                passed to the actor unchanged
            noise (float): multiplier applied to ``self.noise.noise()``
        """
        if isinstance(obs, np.ndarray):
            obs = torch.from_numpy(obs).float().to(device)
        # Out-of-place add: an in-place "+=" would overwrite the network
        # output itself, which can alias the caller's tensor and invalidates
        # the value autograd saved for layers such as tanh.
        action = self.actor(obs) + noise * self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        """Return the target actor's output for ``obs`` plus scaled noise.

        Parameters behave exactly as in ``act``.
        """
        if isinstance(obs, np.ndarray):
            obs = torch.from_numpy(obs).float().to(device)
        # Out-of-place for the same reason as in act().
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action
Beispiel #16
0
    def __init__(self, opt, actor_id, q_trace, learner):
        """Create this worker's environment, counters, epsilon and networks.

        ``opt`` supplies ``env``, ``seed``, ``gamma`` and ``n_actors``;
        ``actor_id`` offsets the environment seed and selects the epsilon.
        """
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        # Each worker seeds its environment with seed + actor_id.
        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed + actor_id)
        obs_space = self.env.observation_space
        self.n_state = obs_space.shape[0]
        self.n_act = self.env.action_space.n

        self.n_episodes = 0
        self.n_steps = 0
        self.gamma = opt.gamma

        # epsilon: 0.4 raised to an exponent that grows linearly with
        # actor_id (1 for the first worker, 8 for the last); a single worker
        # simply uses 0.4.
        if opt.n_actors > 1:
            exponent = 1 + actor_id * 7 / (opt.n_actors - 1)
            self.eps_greedy = 0.4 ** exponent
        else:
            self.eps_greedy = 0.4

        # Models
        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)
Beispiel #17
0
def build_actor_critic(sess, env, env_eval):
    """Build training and evaluation actor/critic networks.

    The training pair is created in the "model" variable scope with a Xavier
    initializer; the evaluation pair is created on ``env_eval`` in the same
    scope with ``reuse=True`` and ``is_training=False``. Returns
    ``(actor, critic, actor_eval, critic_eval)``.
    """
    xavier = tflearn.initializations.xavier_initializer()

    train_scope = tf.variable_scope("model", reuse=None, initializer=xavier)
    with train_scope:
        with tf.name_scope("actor"):
            actor_net = ActorNetwork(sess, env, config, is_training=True)

        with tf.name_scope("critic"):
            critic_net = CriticNetwork(sess, env, config, is_training=True)

        # An optional Ornstein-Uhlenbeck action-noise object (driven by
        # config.noise_std / config.noise_dec) used to be created here; it is
        # currently disabled and not returned.

    eval_scope = tf.variable_scope("model", reuse=True)
    with eval_scope:
        with tf.name_scope("actor"):
            actor_eval_net = ActorNetwork(sess, env_eval, config,
                                          is_training=False)

        with tf.name_scope("critic"):
            critic_eval_net = CriticNetwork(sess, env_eval, config,
                                            is_training=False)

    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # Initialize target network weights
    for network in (actor_net, critic_net):
        network.update_target_network()

    return actor_net, critic_net, actor_eval_net, critic_eval_net
    def __init__(self,
        device,
        key,
        state_size, action_size, random_seed,
        memory, noise,
        lr, weight_decay,
        checkpoint_folder = './Saved_Model/'):
        """Build the local/target actor and restore a checkpoint if one exists.

        Params
        ======
            device: device the networks are moved to
            key: identifier embedded in the checkpoint file name
            state_size (int): state size passed to ActorNetwork
            action_size (int): action size passed to ActorNetwork
            random_seed (int): random seed
            memory: replay memory object stored on the instance
            noise: noise object stored on the instance
            lr (float): learning rate of the Adam optimizer
            weight_decay (float): stored as WEIGHT_DECAY; not used by the
                optimizer created here
            checkpoint_folder (str): folder prefix of the checkpoint file
        """

        self.DEVICE = device

        self.KEY = key

        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; seed first, then store the real value.
        random.seed(random_seed)
        self.seed = random_seed

        # Hyperparameters
        self.LR = lr
        self.WEIGHT_DECAY = weight_decay

        self.CHECKPOINT_FOLDER = checkpoint_folder

        # Actor Network (w/ Target Network)
        self.local = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.target = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.optimizer = optim.Adam(self.local.parameters(), lr=self.LR)

        self.checkpoint_full_name = self.CHECKPOINT_FOLDER + 'checkpoint_actor_' + str(self.KEY) + '.pth'
        if os.path.isfile(self.checkpoint_full_name):
            # Deserialize once and feed both networks; map_location lets a
            # checkpoint saved on another device load on this one.
            state_dict = torch.load(self.checkpoint_full_name,
                                    map_location=self.DEVICE)
            self.local.load_state_dict(state_dict)
            self.target.load_state_dict(state_dict)

        # Replay memory
        self.memory = memory

        # Noise process
        self.noise = noise
Beispiel #19
0
    def __init__(self, opt, actor_id,  q_trace, learner):
        """Create this worker's environment, parameters, epsilon and networks.

        ``opt`` supplies ``env``, ``batch_size``, ``roll_out_n_steps``,
        ``gamma`` and ``n_actors``; ``actor_id`` selects the epsilon.
        """
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner

        # The environment is reset immediately and its first state is kept.
        self.env = gym.make(self.opt.env)
        self.env_state = self.env.reset()
        obs_space = self.env.observation_space
        self.n_state = obs_space.shape[0]
        self.n_act = self.env.action_space.n

        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

        # Parameters
        self.batch_size = opt.batch_size
        self.roll_out_n_steps = opt.roll_out_n_steps
        self.gamma = opt.gamma

        # epsilon: 0.4 raised to an exponent that grows linearly with
        # actor_id (1 for the first worker, 8 for the last); a single worker
        # simply uses 0.4.
        if opt.n_actors > 1:
            exponent = 1 + actor_id * 7 / (opt.n_actors - 1)
            self.eps_greedy = 0.4 ** exponent
        else:
            self.eps_greedy = 0.4

        self.n_episodes = 0

        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)
Beispiel #20
0
    def __init__(self, path_to_weights, weights_id):
        """Create the actor network and load its first weight file.

        ``self.init`` ends up True only when ``path_to_weights`` is an
        existing directory and the weight file
        ``<path_to_weights>/<weights_id>/actormodel.h5`` loaded successfully;
        in that case ``self.next_weights_id`` is advanced by one.
        """

        print("Init ANN")

        self.init = False

        # avoid TF allocating all GPU memory
        # https://stackoverflow.com/questions/34199233/how-to-prevent-tensorflow-from-allocating-the-totality-of-a-gpu-memory
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        K.set_session(sess)

        self.path_to_weights = path_to_weights
        self.next_weights_id = weights_id

        # check if path is valid; without the folder there is nothing to load
        if not os.path.isdir(self.path_to_weights):
            print("Folder with weights does not exist!")
            print(self.path_to_weights)
            return

        self.init = True

        # create actor network
        self.actor = ActorNetwork.ActorNetwork(sess, 5, 2)

        # load first weights
        weights_file = (str(self.path_to_weights) + "/" +
                        str(self.next_weights_id) + "/actormodel.h5")
        print("Loading first weights for ANN from " + weights_file)
        try:
            self.actor.model.load_weights(weights_file)
        except Exception:
            # Best-effort load: report and mark the object as uninitialized.
            # "except Exception" (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate.
            print("Cannot find the weight (.h5) file")
            print(weights_file)
            self.init = False
        else:
            self.next_weights_id = self.next_weights_id + 1
Beispiel #21
0
class DDPGAgent:
    """Actor/critic pair with target copies, optimizers and action noise."""

    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 hidden_in_actor=512,
                 hidden_out_actor=256,
                 lr_actor=1e-4,
                 hidden_in_critic=512,
                 hidden_out_critic=256,
                 lr_critic=3e-4,
                 weight_decay_critic=0,
                 seed=1,
                 device='cpu'):
        """Build actor/critic pairs, their optimizers and the noise process.

        The hidden-size arguments are forwarded to the network constructors,
        ``lr_*`` and ``weight_decay_critic`` to the Adam optimizers, and
        ``seed`` to the networks and the noise. Every network is moved to
        ``device``; each target ends up as a hard copy of its online network.
        """
        super(DDPGAgent, self).__init__()

        self.device = device

        # Constructor arguments shared by each online/target pair.
        actor_args = (state_size, hidden_in_actor, hidden_out_actor,
                      action_size, seed)
        critic_args = (state_size, action_size, num_agents, hidden_in_critic,
                       hidden_out_critic, seed)

        # Actor
        self.actor = ActorNetwork(*actor_args).to(device)
        self.target_actor = ActorNetwork(*actor_args).to(device)
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)

        # Critic
        self.critic = CriticNetwork(*critic_args).to(device)
        self.target_critic = CriticNetwork(*critic_args).to(device)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay_critic)

        # Noise
        self.noise = OUNoise(action_size, seed, scale=1.0)

        # Targets start as exact copies of the online networks.
        for target_net, online_net in ((self.target_actor, self.actor),
                                       (self.target_critic, self.critic)):
            hard_update(target_net, online_net)

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()

    def act(self, obs, noise_factor=0.0):
        """Return the online actor's action for ``obs`` as a clipped array.

        Tensors are used as given; anything else is converted with
        ``torch.from_numpy`` and moved to ``self.device``. The noise sample,
        scaled by ``noise_factor``, is added before clipping to [-1, 1].
        """
        states = (obs if torch.is_tensor(obs) else
                  torch.from_numpy(obs).float().to(self.device))

        # Forward pass in eval mode without gradients, then back to train.
        self.actor.eval()
        with torch.no_grad():
            raw = self.actor(states).cpu().data.numpy()
        self.actor.train()

        raw += noise_factor * self.noise.sample()
        return np.clip(raw, -1, 1)

    def target_act(self, obs):
        """Return the target actor's action for ``obs`` as a clipped array.

        Input handling matches ``act``; no noise is added.
        """
        states = (obs if torch.is_tensor(obs) else
                  torch.from_numpy(obs).float().to(self.device))

        self.target_actor.eval()
        with torch.no_grad():
            raw = self.target_actor(states).cpu().data.numpy()
        self.target_actor.train()

        return np.clip(raw, -1, 1)
Beispiel #22
0
def main(args):
    """Build the actor/critic in a TF session, then train or evaluate.

    ``args`` is a mapping; the keys read here are 'env', 'random_seed',
    'actor_lr', 'critic_lr', 'tau', 'gamma', 'minibatch_size', 'train',
    'save_dir' and 'to_pickle'.

    When args['train'] is truthy the configuration is dumped to
    ``<save_dir>/config.json`` and ``train`` is called. Otherwise the saved
    checkpoint(s) listed in ``indexes`` are restored, the target actor is
    rolled out on sampled tasks, and the per-bucket success rates are pickled
    to ``./pkls/<save_dir>.pkl``.
    """

    with tf.Session() as sess:
        env = gym.make(args['env'])
        # Seed numpy, TensorFlow and the environment with the same value.
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        # Ensure action bound is symmetric
        # assert (env.action_space.high == -env.action_space.low)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))

        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']),
                               actor.get_num_trainable_vars())

        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

        if args['train']:
            # Training: persist the configuration next to the outputs first.
            if not os.path.exists(args['save_dir']):
                os.makedirs(args['save_dir'])
            with open(os.path.join(args['save_dir'], 'config.json'), 'w') as f:
                json.dump(args, f, indent=2)
            train(sess, env, args, actor, critic, actor_noise)
        else:
            # Evaluation: restore checkpoints and measure the success rate.
            # ddpg = []
            # indexes = [e for e in range(400) if e % 10 == 9]
            # indexes = [0] + indexes
            indexes = [399]
            num_test_tasks = 100
            buckets = 1
            successes = []
            directory = args['to_pickle']
            for index in indexes:
                # times = []
                task_success = []
                # NOTE(review): a new Saver is created for every index.
                saver = tf.train.Saver()
                saver.restore(
                    sess, "../final_models/multitask/fixed/{0}/model-{1}.ckpt".
                    format(directory, index))
                for _ in range(buckets):
                    tasks = env.unwrapped.sample_tasks(num_test_tasks)
                    # tasks = [{'goal': np.array([0., 0.])} for e in range(num_test_tasks)]
                    success = 0
                    for task in tasks:
                        s = env.reset_task(task)
                        step = 0
                        d = False
                        # Roll out the target actor until the env reports done.
                        while not d:
                            # env.render()
                            action = actor.predict_target(
                                np.reshape(s, (1, actor.s_dim)))[0]
                            step += 1
                            s, r, d, _ = env.step(action)
                        # Success is judged from the reward of the last step.
                        if r == 1:
                            success += 1
                        # times.append(step)
                    # NOTE(review): the env is closed inside the bucket loop
                    # and again after the loops; with buckets > 1 (or several
                    # indexes) later rollouts would use a closed env - confirm.
                    env.close()
                    task_success.append(success / num_test_tasks)
                successes.append(task_success)
                # ddpg.append(times)
            # out = [successes, ddpg]
            env.close()
            if not os.path.exists('./pkls'):
                os.makedirs('./pkls')
            # NOTE(review): the file name is built from args['save_dir'], not
            # args['to_pickle'] - confirm this is intended.
            with open('./pkls/{0}.pkl'.format(args['save_dir']), 'wb') as f:
                pickle.dump(successes, f)
Beispiel #23
0
class Agent():
    """Interacts with and learns from the environment.

    The agent owns an actor network with a target copy, a replay buffer and
    an exploration-noise process. It does not own a critic: ``train`` reads
    the critic (``network``, ``target`` and ``optimizer`` attributes) from the
    module-level ``mCritic`` object, and the hyperparameters ``GAMMA`` and
    ``TAU`` from module-level constants.
    """
    def __init__(self, state_size, action_size, agent_id, args):
        """Build the actor networks, restore saved weights and create the buffer.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            agent_id (int): a falsy id restores from ``args['agent_p0_path']``,
                any other id from ``args['agent_p1_path']``
            args (dict): configuration; the keys read here are 'seed',
                'device', 'LR_ACTOR', 'agent_p0_path', 'agent_p1_path',
                'BUFFER_SIZE' and 'BATCH_SIZE'
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = args['seed']
        self.device = args['device']
        self.args = args

        # Actor network and its target copy
        self.actor_network = ActorNetwork(state_size, action_size,
                                          args).to(self.device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         args).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(),
                                          lr=args['LR_ACTOR'])

        # Model takes too long to train from scratch (> 24 hours according to
        # the original author), so resume from the weights of a previous run.
        checkpoint_key = 'agent_p1_path' if agent_id else 'agent_p0_path'
        # Read the file once; map_location lets a checkpoint that was saved on
        # a GPU be restored on a CPU-only machine.
        state_dict = torch.load(args[checkpoint_key],
                                map_location=self.device)
        self.actor_network.load_state_dict(state_dict, strict=False)
        self.actor_target.load_state_dict(state_dict, strict=False)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'],
                                   args['BATCH_SIZE'], self.seed)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Time-step counter; initialised here but not read inside this class
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and train once the buffer holds more than a batch."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences)

    def act(self, current_state):
        """Return the noisy action for ``current_state``, clipped to [-1, 1].

        Params
        ======
            current_state (numpy.ndarray): observation fed to the actor
        """
        input_state = torch.from_numpy(current_state).float().to(self.device)

        # Evaluate in eval mode without tracking gradients, then switch the
        # network back to train mode for the next learning step.
        self.actor_network.eval()
        with torch.no_grad():
            action = self.actor_network(input_state).cpu().data.numpy()
        self.actor_network.train()

        # Exploration noise is always added
        action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def train(self, experiences):
        """Run one critic update, one actor update and the soft target updates.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states, dones)
                as returned by ``self.memory.sample()``
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = mCritic.target(next_states, actions_next)

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = mCritic.network(states, actions)
        mCritic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        mCritic.optimizer.zero_grad()
        mCritic_loss.backward()
        mCritic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss: maximise the critic's value of the actor's actions
        actions_pred = self.actor_network(states)
        actor_loss = -mCritic.network(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(mCritic.network, mCritic.target, TAU)
        self.soft_update(self.actor_network, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Beispiel #24
0
class Learner(object):
    """Consumes trajectory batches from a queue and updates an actor and a critic.

    The update in ``learning`` uses clipped importance ratios and a
    backward-recursive, V-trace-style value target.
    """
    def __init__(self, opt, q_batch):
        """Create the environment, the two networks and their optimizers.

        Args:
            opt: options object; this class reads ``env``, ``seed``, ``lr``,
                ``batch_size``, ``n_step``, ``coef_hat``, ``rho_hat`` and
                ``gamma`` from it.
            q_batch: queue-like object; ``get(block=True)`` must return the
                5-tuple unpacked in ``learning``.
        """
        self.opt = opt
        self.q_batch = q_batch

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # The environment is only used here to read the observation/action sizes
        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)
        # share_memory() makes the parameters visible to other processes
        self.actor.share_memory()
        self.critic.share_memory()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=opt.lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=opt.lr)

    def learning(self):
        """Train forever: block on the batch queue, then update actor and critic.

        This method never returns. Each iteration expects a batch
        ``(states, actions, rewards, dones, action_log_probs)``.
        NOTE(review): the batch tensors are assumed to already live on
        ``self.device`` and to be laid out as (batch_size, n_step, ...) —
        confirm against the code that fills ``q_batch``.
        """
        torch.manual_seed(self.opt.seed)
        # Constant clipping thresholds for the importance ratios. They are
        # created on self.device so they can be combined with the ratios
        # computed from the networks (a CPU tensor here breaks torch.min on CUDA).
        clip_shape = (self.opt.batch_size, self.opt.n_step)
        coef_hat = torch.full(clip_shape, self.opt.coef_hat,
                              dtype=torch.float32, device=self.device)
        rho_hat = torch.full(clip_shape, self.opt.rho_hat,
                             dtype=torch.float32, device=self.device)
        while True:
            # batch-trace
            states, actions, rewards, dones, action_log_probs = self.q_batch.get(block=True)

            # Log-probabilities under the current (learner) policy and state
            # values, with values zeroed where the step is flagged done.
            logit_log_probs = self.actor(states)
            V = self.critic(states).view(self.opt.batch_size, self.opt.n_step) * (1 - dones)

            action_probs = torch.exp(action_log_probs)
            logit_probs = torch.exp(logit_log_probs)

            # Importance ratio between learner and behaviour policy; the
            # product is taken over the last dimension and the result is
            # treated as a constant (detached).
            is_rate = torch.prod(logit_probs / (action_probs + 1e-6), dim=-1).detach()
            coef = torch.min(coef_hat, is_rate) * (1 - dones)
            rho = torch.min(rho_hat, is_rate) * (1 - dones)

            # V-trace: backward recursion over the time axis. The last time
            # index is never written and therefore stays at zero.
            v_trace = torch.zeros((self.opt.batch_size, self.opt.n_step)).to(self.device)
            target_V = V.detach()
            for rev_step in reversed(range(states.size(1) - 1)):
                v_trace[:, rev_step] = target_V[:, rev_step] \
                                       + rho[:, rev_step] * (rewards[:, rev_step] + self.opt.gamma*target_V[:, rev_step+1] - target_V[:, rev_step]) \
                                       + self.opt.gamma * coef[:, rev_step] * (v_trace[:, rev_step+1] - target_V[:, rev_step+1])

            # actor loss: policy gradient on the log-probability of the taken
            # action, weighted by a detached advantage estimate
            onehot_actions = torch.FloatTensor(
                idx2onehot(actions.cpu().numpy(), self.opt.batch_size, self.n_act)).to(self.device)
            logit_log_probs = torch.sum(logit_log_probs * onehot_actions, dim=-1)
            advantages = rewards + self.opt.gamma * v_trace - V
            pg_loss = -torch.sum(logit_log_probs * advantages.detach())
            actor_loss = pg_loss

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # critic: regress V onto the (detached) V-trace target
            critic_loss = torch.mean((v_trace.detach() - V)**2)

            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()
Beispiel #25
0
class Actor(object):
    """Environment worker: collects n-step traces with a local policy copy.

    Each worker owns its own environment and local actor/critic networks,
    refreshes them from ``learner`` before every episode, and pushes the
    collected traces onto ``q_trace``.
    """
    def __init__(self, opt, actor_id, q_trace, learner):
        """Create the environment and the local networks.

        Args:
            opt: options object; this class reads ``env``, ``seed``,
                ``gamma``, ``n_actors`` and ``n_step`` from it.
            actor_id (int): index of this worker; offsets the environment
                seed and selects the exploration rate.
            q_trace: queue-like object receiving the collected traces.
            learner: object exposing ``actor`` and ``critic`` modules whose
                weights are copied by ``load_model``.
        """
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed + actor_id)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.n_episodes = 0
        self.n_steps = 0
        self.gamma = opt.gamma

        # epsilon: each worker gets a different exploration rate, from 0.4
        # for the first worker down to 0.4**8 for the last one
        self.eps_greedy = 0.4 ** (1 + actor_id * 7 / (opt.n_actors - 1)) \
            if opt.n_actors > 1 else 0.4

        # Models
        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)

    def performing(self):
        """Run forever: sync weights, play one episode, periodically evaluate."""
        torch.manual_seed(self.opt.seed)

        while True:
            self.load_model()
            self.train_episode()
            if self.n_episodes % 100 == 0:
                rewards = self.evaluation(self.env)
                rewards_mu = np.array(
                    [np.sum(np.array(l_i), 0) for l_i in rewards]).mean()
                print("Episode %d, Average Reward %.2f" %
                      (self.n_episodes, rewards_mu))

    def _softmax_action(self, state):
        """Return the action probabilities for ``state`` as a numpy array."""
        state = torch.FloatTensor([state]).to(self.device)
        # The actor output is exponentiated to obtain action probabilities
        softmax_action = torch.exp(self.actor(state))
        softmax_action = softmax_action.cpu().detach().numpy()
        return softmax_action

    def exploration_action(self, state):
        """Epsilon-greedy choice: greedy action, or a uniformly random one."""
        softmax_action = self._softmax_action(state)

        if np.random.rand() > self.eps_greedy:
            return np.argmax(softmax_action)
        else:
            return np.random.choice(self.n_act)

    def train_episode(self):
        """Play one episode and push it to ``q_trace`` in chunks of ``n_step`` steps.

        Every chunk is a tuple ``(states, actions, rewards, dones, log_probs)``
        of fixed-size arrays; entries after an early episode end keep their
        initial values (zeros, and ones for ``dones``).
        """
        done = False
        self.env_state = self.env.reset()
        self.next_done = done

        while not done:
            # NOTE(review): n_steps is incremented once per chunk, not once
            # per environment step, yet it is compared with 190 below —
            # confirm this is the intended reward shaping.
            self.n_steps += 1
            states = np.zeros((self.opt.n_step, self.n_state))
            actions = np.zeros(self.opt.n_step)
            rewards = np.zeros(self.opt.n_step)
            log_probs = np.zeros((self.opt.n_step, self.n_act))
            dones = np.ones(self.opt.n_step)
            for i in range(self.opt.n_step):
                # Act on the observation the environment is currently in, so
                # the recorded state, log-probabilities and action all refer
                # to the same step.
                state = self.env_state
                states[i] = state
                dones[i] = self.next_done
                log_prob = self.actor(
                    torch.FloatTensor([state]).to(
                        self.device)).detach().cpu().numpy()[0]
                action = self.exploration_action(state)
                next_state, reward, done, info = self.env.step(action)

                # The environment reward is replaced: 0 while the episode
                # runs, +1 / -1 on the terminal step.
                reward = 0
                if done:
                    if self.n_steps > 190:
                        reward = 1
                    else:
                        reward = -1

                log_probs[i] = log_prob
                actions[i] = action
                rewards[i] = reward
                self.env_state = next_state
                self.next_done = done
                if done:
                    self.env_state = self.env.reset()
                    break

            # The chunk is finished: n_step steps were taken or the episode ended
            if done:
                self.n_steps = 0
                self.n_episodes += 1
                self.episode_done = True
            else:
                self.episode_done = False

            self.q_trace.put((states, actions, rewards, dones, log_probs),
                             block=True)

    # choose an action based on state for execution
    def action(self, state):
        """Return the greedy action for ``state``."""
        softmax_action = self._softmax_action(state)
        action = np.argmax(softmax_action)
        return action

    def value(self, state):
        """Return the critic's output for ``state`` as a numpy array."""
        state_var = torch.FloatTensor([state]).to(self.device)
        q_var = self.critic(state_var)
        q = q_var.cpu().detach().numpy()
        return q

    def _discount_reward(self, rewards, final_value):
        """Return the discounted returns of ``rewards``, bootstrapped from ``final_value``."""
        discounted_r = np.zeros_like(rewards)
        R = final_value
        for t in reversed(range(0, len(rewards))):
            R = rewards[t] + self.gamma * R
            discounted_r[t] = R
        return discounted_r

    def evaluation(self, env_eval):
        """Play 10 greedy episodes on ``env_eval``; return their per-step reward lists."""
        rewards = []
        for _ in range(10):
            rewards_i = []

            state = env_eval.reset()
            done = False
            while not done:
                action = self.action(state)
                state, reward, done, _ = env_eval.step(action)
                rewards_i.append(reward)
            rewards.append(rewards_i)

        return rewards

    def load_model(self):
        """Copy the learner's current weights into the local networks (best effort)."""
        try:
            self.actor.load_state_dict(self.learner.actor.state_dict())
            self.critic.load_state_dict(self.learner.critic.state_dict())
        except Exception:
            # Keep the previous weights on failure, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare except would.
            print('load error')
Beispiel #26
0
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.autograd import Variable

import gym
import numpy as np
import matplotlib.pyplot as plt

from model import ActorNetwork, CriticNetwork

# Policy and value networks. The sizes 4 and 2 presumably match the
# observation and action dimensions of the CartPole environment created
# below — TODO confirm against model.py.
actor = ActorNetwork(4, 2)
critic = CriticNetwork(4)
# One Adam optimizer per network; the critic uses the larger learning rate.
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-4)
critic_optimizer = optim.Adam(critic.parameters(), lr=8e-4)
env = gym.make('CartPole-v0')
# NOTE(review): the constants below are not used in the visible part of this
# script; their meanings are inferred from the names — confirm at the use sites.
GAMMA = 0.99  # presumably the reward discount factor
N_EPISODES = 20000  # presumably the number of training episodes
LOG_STEPS = 100  # presumably the logging interval
SAVE_STEPS = 100  # presumably the checkpoint interval


def select_action(S):
    '''
    select action based on current state
    args:
        S: current state
    returns:
        action to take, log probability of the chosen action
    '''
    # NOTE(review): only the docstring is present in this excerpt; the body
    # that would produce the documented return value is missing.
Beispiel #27
0
class Agent():
    """Interacts with and learns from the environment.

    DDPG-style agent with local/target actor and critic networks, an
    exploration-noise process and a replay buffer. Hyperparameters
    (``LR_ACTOR``, ``LR_CRITIC``, ``WEIGHT_DECAY``, ``BUFFER_SIZE``,
    ``BATCH_SIZE``, ``GAMMA``, ``TAU``) and ``device`` are module-level names.
    """
    def __init__(self, state_size, action_size, random_seed, agent_size=1):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            agent_size (int): number of action rows that receive noise in ``act``
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator and keep the
        # seed value itself on the instance.
        random.seed(random_seed)
        self.seed = random_seed
        self.agent_size = agent_size

        self.local_actor = ActorNetwork(state_size, action_size,
                                        random_seed).to(device)
        self.target_actor = ActorNetwork(state_size, action_size,
                                         random_seed).to(device)
        self.local_critic = CriticNetwork(state_size, action_size,
                                          random_seed).to(device)
        self.target_critic = CriticNetwork(state_size, action_size,
                                           random_seed).to(device)

        self.opt_actor = optim.Adam(self.local_actor.parameters(), lr=LR_ACTOR)
        self.opt_critic = optim.Adam(self.local_critic.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def save_experience(self, state, action, reward, next_state, done):
        """Save one experience tuple in replay memory (no learning happens here)."""
        self.memory.add(state, action, reward, next_state, done)

    def multi_step(self, t):
        """Run 10 learning passes on every 20th time step ``t``.

        Nothing happens until the buffer holds more than ``BATCH_SIZE`` samples.
        """
        if len(self.memory) > BATCH_SIZE and t % 20 == 0:
            for _ in range(10):
                self.learn(self.memory.sample(), GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.local_actor.eval()
        with torch.no_grad():
            action = self.local_actor(state).cpu().data.numpy()
        self.local_actor.train()
        if add_noise:
            # One fresh noise sample per agent row
            for a in range(0, self.agent_size):
                action[a] += self.noise.sample()
        return np.clip(action, -1, 1)  # all actions between -1 and 1

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Target and Local Critics-Actors are used to solve the moving targets problem.
        TargetActor generates the next action, and TargetCritic generates the corresponding Q-value.
        This function updates policy and value parameters using given batch of experience tuples.

        Q_targets = r + gamma * critic_t(next_state, actor_t(next_state))

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The TD target is a constant for the critic loss, so it is built
        # without tracking gradients through the target networks.
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.target_actor(next_states)
            Q_targets_next = self.target_critic(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.local_critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.opt_critic.zero_grad()
        critic_loss.backward()
        #use gradient clipping when training the critic network
        torch.nn.utils.clip_grad_norm_(self.local_critic.parameters(), 1)
        self.opt_critic.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.local_actor(states)
        actor_loss = -self.local_critic(states, actions_pred).mean()
        # Minimize the loss
        self.opt_actor.zero_grad()
        actor_loss.backward()
        # Clip after backward(): clipping right after zero_grad() would act on
        # cleared gradients and have no effect.
        torch.nn.utils.clip_grad_norm_(self.local_actor.parameters(), 1)
        self.opt_actor.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.local_critic, self.target_critic, TAU)
        self.soft_update(self.local_actor, self.target_actor, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        this function manages the update of local and target models syncing
        theta_target = tau*theta_local + (1 - tau)*theta_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment.

    DDPG-style agent with local/target actor and critic networks that share
    an externally supplied replay memory. ``device``, ``LR``, ``UPDATE_EVERY``,
    ``BATCH_SIZE``, ``LEARN_TIMES``, ``GAMMA`` and ``TAU`` are module-level names.
    """

    def __init__(self, state_size, action_size, memory, seed=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory: replay buffer exposing ``add``, ``sample`` and ``__len__``
            seed (int): random seed passed to the networks (may be None)
        """
        
        self.state_size = state_size
        self.action_size = action_size
        
        # Always define the attribute so reading self.seed never raises
        # AttributeError when no seed was given.
        self.seed = seed

        # create the local and target actor networks
        self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)
        
        # create the local and target critic networks
        self.critic_local = CriticNetwork(state_size, action_size, seed).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, seed).to(device)
        
        # optimizers for local actor and critic 
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR, weight_decay=0.0)
        
        # Smooth L1 (Huber-style) loss for updating the critic
        self.critic_loss_function = nn.SmoothL1Loss()

        # copy the local networks weights to the target network 
        self.copy_weights_from_local_to_target()
        
        # Replay memory
        self.memory = memory
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
        # init the noise class to sample from
        self.noise = GaussianNoise(self.action_size)
        
    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY steps, learn LEARN_TIMES times."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                for _ in range(LEARN_TIMES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)
                # targets are moved once per learning round, not once per batch
                self.soft_update_all()


    def copy_weights_from_local_to_target(self):
        """Hard-copy the local actor/critic weights into their target networks."""
        # ensure that the local and target networks are initialized with the same random weights
        # or copy your saved weights after loading into local
        for target_param, param in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target_param.data.copy_(param.data)

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy. 
        
        Params
        ======
            state (array_like): current state
            add_noise (bool): when true, add a sample of exploration noise

        The returned array carries a leading batch dimension of 1 and is not clipped.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        
        # get predicted actions for current state from actor network
        self.actor_local.eval()
        with torch.no_grad():
            action_values = self.actor_local(state)
        self.actor_local.train()

        # take the predicted actions and add noise, used as exploration in a continuous environment
        action_values = action_values.cpu().data.numpy()
        
        if add_noise:
            action_values += self.noise.sample()
        
        return action_values
        
    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        
        # unpack the experiences tuple 
        states, actions, rewards, next_states, dones = experiences
        
        # compute the loss for the actor network per the DDPG algorithm
        actor_local_predicted_actions = self.actor_local(states)
        policy_loss = -self.critic_local(states, actor_local_predicted_actions).mean()
        
        # compute the loss for the critic network per the DDPG algorithm
        predicted_Q_vals = self.critic_local(states, actions)
        # The TD target is a constant for the critic loss: build it without
        # autograd so no gradients accumulate on the target networks.
        with torch.no_grad():
            predicted_actions = self.actor_target(next_states)
            Q_next = self.critic_target(next_states, predicted_actions)
            Q_targets = rewards + (gamma * Q_next * (1 - dones))
        
        critic_loss = self.critic_loss_function(predicted_Q_vals, Q_targets)
        
        # update the networks
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()
        
        # zero_grad also discards the critic gradients left by policy_loss.backward()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        
    
    def soft_update_all(self):
        """Soft-update both target networks with the module-level TAU."""
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)
        
        
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter   # use percent tau local_param.data and rest target_param.data
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent():
    """One agent of a multi-agent actor-critic setup.

    The actor sees only this agent's state; the critic is built for the
    stacked states and actions of all ``n_agents`` agents. ``device``,
    ``ACTOR_LR``, ``CRITIC_LR``, ``GAMMA`` and ``TAU`` are module-level names.
    """
    def __init__(self, state_size, action_size, n_agents, seed):
        """Create the actor/critic networks, their optimizers and the noise process.

        Params
        ======
            state_size (int): dimension of one agent's state
            action_size (int): dimension of one agent's action
            n_agents (int): number of agents whose states/actions the critic sees
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator and keep the
        # seed value itself on the instance.
        random.seed(seed)
        self.seed = seed

        self.stacked_state_size = state_size * n_agents
        self.stacked_action_size = action_size * n_agents

        # Actor networks
        self.actor_local = ActorNetwork(state_size, action_size,
                                        seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=ACTOR_LR)

        # Critic networks
        self.critic_local = CriticNetwork(self.stacked_state_size,
                                          self.stacked_action_size,
                                          seed).to(device)
        self.critic_target = CriticNetwork(self.stacked_state_size,
                                           self.stacked_action_size,
                                           seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=CRITIC_LR)

        # OUNoise
        self.exploration_noise = OUNoise(action_size, seed)

    def act(self, state):
        """Return the noisy action for ``state``, clipped to [-1, 1].

        The result carries a leading batch dimension of 1.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # Add exploration noise
        action += self.exploration_noise.sample()

        return np.clip(action, -1, 1)

    def update(self, states, current_agent_states, actions,
               current_agent_actions, target_next_actions, rewards,
               current_agent_rewards, next_states, dones, current_agent_dones,
               action_preds):
        """Run one critic step, one actor step and the soft target updates.

        ``states``, ``next_states`` and ``actions`` are flattened per sample
        before they are passed to the critic. ``target_next_actions`` and
        ``action_preds`` are passed to the critic as given.
        ``current_agent_states``, ``current_agent_actions``, ``rewards`` and
        ``dones`` are accepted for interface compatibility but not used.
        NOTE(review): ``action_preds`` presumably depends on
        ``self.actor_local`` so that the actor loss can reach it — confirm
        at the call site.
        """
        # Use the batch size of the data itself instead of a global constant
        batch_size = states.shape[0]
        flatten_states = states.reshape(batch_size, -1)
        flatten_next_states = next_states.reshape(batch_size, -1)
        flatten_actions = actions.reshape(batch_size, -1)

        # The TD target is a constant for the critic loss; building it under
        # no_grad keeps backward() out of the target critic and out of
        # whatever graph produced target_next_actions.
        with torch.no_grad():
            y = current_agent_rewards + GAMMA * self.critic_target(
                flatten_next_states,
                target_next_actions) * (1 - current_agent_dones)

        # Critic loss
        critic_loss = F.mse_loss(
            y, self.critic_local(flatten_states, flatten_actions))

        # Critic backprop
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss
        actor_loss = -self.critic_local(flatten_states, action_preds).mean()

        # Actor backprop
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft updates
        self.update_target_network()

    def update_target_network(self):
        """Soft-update both targets: θ_target = TAU*θ_local + (1 - TAU)*θ_target."""
        for target_param, local_param in zip(self.actor_target.parameters(),
                                             self.actor_local.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)

        for target_param, local_param in zip(self.critic_target.parameters(),
                                             self.critic_local.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)
class Actor:
    """Actor half of an actor-critic agent, with checkpoint save/restore.

    Holds a local policy network, its target copy and an optimizer. The
    replay memory and the noise process are supplied by the caller.
    """

    def __init__(self,
        device,
        key,
        state_size, action_size, random_seed,
        memory, noise,
        lr, weight_decay,
        checkpoint_folder = './Saved_Model/'):
        """Build the networks and restore them from a checkpoint if one exists.

        Params
        ======
            device: torch device the networks are moved to
            key: identifier used in the checkpoint file name
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            memory: replay buffer exposing ``add``
            noise: noise process exposing ``sample`` and ``reset``
            lr (float): learning rate of the Adam optimizer
            weight_decay (float): stored as ``WEIGHT_DECAY``; not passed to
                the optimizer in this class
            checkpoint_folder (str): folder holding ``checkpoint_actor_<key>.pth``
        """

        self.DEVICE = device

        self.KEY = key

        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator and keep the
        # seed value itself on the instance.
        random.seed(random_seed)
        self.seed = random_seed

        # Hyperparameters
        self.LR = lr
        self.WEIGHT_DECAY = weight_decay

        self.CHECKPOINT_FOLDER = checkpoint_folder

        # Actor Network (w/ Target Network)
        self.local = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.target = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.optimizer = optim.Adam(self.local.parameters(), lr=self.LR)

        # os.path.join gives the same path as plain concatenation for a folder
        # ending in a separator, and a valid one when the separator is missing.
        self.checkpoint_full_name = os.path.join(
            self.CHECKPOINT_FOLDER,
            'checkpoint_actor_' + str(self.KEY) + '.pth')
        if os.path.isfile(self.checkpoint_full_name):
            # Read the file once; map_location lets a checkpoint saved on a
            # GPU be restored on a CPU-only machine.
            state_dict = torch.load(self.checkpoint_full_name,
                                    map_location=self.DEVICE)
            self.local.load_state_dict(state_dict)
            self.target.load_state_dict(state_dict)

        # Replay memory
        self.memory = memory

        # Noise process
        self.noise = noise

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.DEVICE)

        self.local.eval()
        with torch.no_grad():
            action = self.local(state).cpu().data.numpy()
        self.local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save one experience tuple in replay memory (no learning happens here)."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def checkpoint(self):
        """Save the local network's weights to ``self.checkpoint_full_name``."""
        folder = os.path.dirname(self.checkpoint_full_name)
        if folder:
            # torch.save does not create missing directories
            os.makedirs(folder, exist_ok=True)
        torch.save(self.local.state_dict(), self.checkpoint_full_name)