Example #1
File: agent.py Project: Kavka1/RL
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.action_boundary = env_params['action_boundary']

        self.lr_a = args.lr_a
        self.lr_c = args.lr_c
        self.gamma = args.gamma
        self.tau = args.tau
        self.noise_eps = args.noise_eps
        self.batch_size = args.batch_size

        self.device = torch.device(args.device)

        self.actor = DeterministicPolicy(self.o_dim,
                                         self.a_dim).to(self.device)
        self.actor_tar = DeterministicPolicy(self.o_dim,
                                             self.a_dim).to(self.device)
        self.critic = QFunction(self.o_dim, self.a_dim).to(self.device)
        self.critic_tar = QFunction(self.o_dim, self.a_dim).to(self.device)

        self.optimizer_a = optim.Adam(self.actor.parameters(), lr=self.lr_a)
        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=self.lr_c)

        self.hard_update()
Example #2
File: agent.py Project: Kavka1/RL
    def __init__(self, s_dim, a_dim, action_space, args):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.action_space = action_space
        self.lr_pi = args.lr_pi
        self.lr_q = args.lr_q
        self.gamma = args.gamma
        self.tau = args.tau
        self.noise_std = args.noise_std
        self.noise_clip = args.noise_clip
        self.batch_size = args.batch_size
        self.policy_update_interval = args.policy_update_interval
        self.device = torch.device(args.device)
        self.policy_loss_log = torch.tensor(0.).to(self.device)

        self.policy = DeterministicPolicy(self.s_dim,
                                          self.a_dim,
                                          self.device,
                                          action_space=self.action_space).to(
                                              self.device)
        self.policy_target = DeterministicPolicy(
            self.s_dim,
            self.a_dim,
            self.device,
            action_space=self.action_space).to(self.device)
        self.Q1 = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q1_target = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q2 = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q2_target = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.hard_update_target()

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr_pi)
        self.optimizer_q1 = optim.Adam(self.Q1.parameters(), lr=self.lr_q)
        self.optimizer_q2 = optim.Adam(self.Q2.parameters(), lr=self.lr_q)
Example #3
    def __init__(self, input_shape, action_n, gamma=0.99, N=50000):
        self.shape = input_shape
        self.batch_size = input_shape[0]
        self.N = N

        Q = QFunction(input_shape, action_n, scope="Q")
        target_Q = QFunction(input_shape, action_n, scope="target_Q")

        # Forward Q
        self.s = tf.placeholder(shape=[None] + input_shape[1:],
                                dtype=tf.float32)
        self.a = tf.placeholder(shape=[self.batch_size, 1], dtype=tf.int32)
        self.probs = Q(self.s, s_bias=False)

        # add offset
        first = tf.expand_dims(tf.range(self.batch_size), axis=1)
        indices = tf.concat(values=[first, self.a], axis=1)
        # gather corresponding q_vals
        self.q_val = tf.expand_dims(tf.gather_nd(self.probs, indices), axis=1)

        # TD target
        self.done = tf.placeholder(shape=[self.batch_size, 1],
                                   dtype=tf.float32)
        self.r = tf.placeholder(shape=[self.batch_size, 1], dtype=tf.float32)
        self.s_ = tf.placeholder(shape=input_shape, dtype=tf.float32)

        # D-DQN
        a_max = tf.expand_dims(tf.argmax(Q(self.s_, reuse=True), axis=1),
                               axis=1)
        a_max = tf.to_int32(a_max)
        target_q_val = tf.expand_dims(tf.gather_nd(
            target_Q(self.s_), tf.concat(values=[first, a_max], axis=1)),
                                      axis=1)
        self.y = self.r + gamma * (1.0 - self.done) * target_q_val
        # Error Clipping
        self.loss = tf.reduce_mean(Hurber_loss(self.q_val, self.y))

        # Update Q
        opt = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        grads_and_vars = opt.compute_gradients(self.loss)
        grads_and_vars = [[grad, var] for grad, var in grads_and_vars \
                          if grad is not None and (var.name.startswith("Q") or var.name.startswith("shared"))]
        self.train_op = opt.apply_gradients(grads_and_vars)

        # Update target Q
        self.target_train_op = copy_params(Q, target_Q)

        # replay buffer
        self.D = []
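
The graph above implements the Double-DQN target: the online network Q selects the greedy next action and the target network evaluates it. Below is a minimal NumPy sketch of that same computation, with illustrative array names that are not part of the project:

import numpy as np

def double_dqn_target(q_online_next, q_target_next, r, done, gamma=0.99):
    # action selection with the online Q, action evaluation with the target Q
    a_max = np.argmax(q_online_next, axis=1)
    q_eval = q_target_next[np.arange(len(a_max)), a_max]
    return r + gamma * (1.0 - done) * q_eval

# toy batch of two transitions
q_online_next = np.array([[1.0, 2.0], [0.5, 0.1]])
q_target_next = np.array([[0.9, 1.5], [0.4, 0.2]])
r = np.array([1.0, 0.0])
done = np.array([0.0, 1.0])
print(double_dqn_target(q_online_next, q_target_next, r, done))  # [2.485 0.   ]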
Example #4
    def __init__(self, action_n):
        # create Q networks
        Q = QFunction(action_n, scope="Q")
        target_Q = QFunction(action_n, scope="target_Q")

        # define placeholders
        self.s = tf.placeholder(
            shape=[None, cfg.height, cfg.width, cfg.state_length],
            dtype=tf.float32)
        self.a = tf.placeholder(shape=[cfg.batch_size, 1], dtype=tf.int32)
        self.r = tf.placeholder(shape=[cfg.batch_size, 1], dtype=tf.float32)
        self.done = tf.placeholder(shape=[cfg.batch_size, 1], dtype=tf.float32)
        self.next_s = tf.placeholder(
            shape=[cfg.batch_size, cfg.height, cfg.width, cfg.state_length],
            dtype=tf.float32)

        # predict Q values
        self.probs = Q(self.s)

        # add offset
        first = tf.expand_dims(tf.range(cfg.batch_size), axis=1)

        # choose Q value
        q_val = tf.expand_dims(tf.gather_nd(self.probs,
                                            tf.concat([first, self.a],
                                                      axis=1)),
                               axis=1)

        # create teacher
        a_max = tf.expand_dims(tf.argmax(Q(self.next_s, reuse=True),
                                         axis=1,
                                         output_type=tf.int32),
                               axis=1)
        target_q_val = tf.expand_dims(tf.gather_nd(
            target_Q(self.next_s), tf.concat([first, a_max], axis=1)),
                                      axis=1)
        y = self.r + cfg.gamma * (1.0 - self.done) * target_q_val

        # calculate loss
        self.loss = huber_loss(y, q_val)

        # update Q
        opt = tf.train.AdamOptimizer(cfg.eta)
        self.train_op = opt.minimize(self.loss)

        # update target Q
        self.target_train_op = copy_params(Q, target_Q)
Example #5
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.g_dim = env_params['g_dim']
        self.action_bound = env_params['action_max']

        self.lr = args.lr
        self.l2_coefficient = args.l2_coefficient
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.device = torch.device(args.device)
        self.tau = args.tau
        self.noise_eps = args.noise_eps

        self.policy = Policy(o_dim=self.o_dim,
                             a_dim=self.a_dim,
                             g_dim=self.g_dim).to(self.device)
        self.policy_target = Policy(o_dim=self.o_dim,
                                    a_dim=self.a_dim,
                                    g_dim=self.g_dim).to(self.device)
        self.Q = QFunction(o_dim=self.o_dim,
                           a_dim=self.a_dim,
                           g_dim=self.g_dim).to(self.device)
        self.Q_target = QFunction(o_dim=self.o_dim,
                                  a_dim=self.a_dim,
                                  g_dim=self.g_dim).to(self.device)
        sync_networks(self.policy)
        sync_networks(self.Q)

        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q.parameters(), lr=self.lr)

        self.normalizer_o = Normalizer(size=self.o_dim,
                                       eps=1e-2,
                                       clip_range=1.)
        self.normalizer_g = Normalizer(size=self.g_dim,
                                       eps=1e-2,
                                       clip_range=1.)

        self.hard_update()
Example #6
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.g_dim = env_params['g_dim']
        self.action_boundary = env_params['action_boundary']
        self.max_episode_steps = env_params['max_episode_steps']

        self.evaluate_episodes = args.evaluate_episodes
        self.lr_pi = args.lr_pi_TD3
        self.lr_q = args.lr_q
        self.gamma = args.gamma
        self.tau = args.tau
        self.action_var = args.action_var
        self.noise_std = args.noise_std
        self.noise_clip = args.noise_clip
        self.K_updates = args.K_updates_TD3
        self.policy_update_interval = args.policy_update_interval
        self.batch_size = args.batch_size
        self.device = torch.device(args.device)
        self.load_model_remark = args.load_model_remark

        self.total_trained_goal_num = 0
        self.total_episode_num = 0
        self.total_update_num = 0
        self.policy_loss_log = 0.
        self.q1_loss_log = 0.
        self.q2_loss_log = 0.

        self.memory = MemoryBuffer(args.memory_capacity, self.o_dim, self.g_dim, self.a_dim)

        self.policy = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
        self.policy_target = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
        self.Q1 = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q1_target = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q2 = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q2_target = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr_pi)
        self.optimizer_q1 = optim.Adam(self.Q1.parameters(), lr=self.lr_q)
        self.optimizer_q2 = optim.Adam(self.Q2.parameters(), lr=self.lr_q)

        self.hard_update()
Example #7
File: agent.py Project: Kavka1/RL
class DDPGAgent():
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.action_boundary = env_params['action_boundary']

        self.lr_a = args.lr_a
        self.lr_c = args.lr_c
        self.gamma = args.gamma
        self.tau = args.tau
        self.noise_eps = args.noise_eps
        self.batch_size = args.batch_size

        self.device = torch.device(args.device)

        self.actor = DeterministicPolicy(self.o_dim,
                                         self.a_dim).to(self.device)
        self.actor_tar = DeterministicPolicy(self.o_dim,
                                             self.a_dim).to(self.device)
        self.critic = QFunction(self.o_dim, self.a_dim).to(self.device)
        self.critic_tar = QFunction(self.o_dim, self.a_dim).to(self.device)

        self.optimizer_a = optim.Adam(self.actor.parameters(), lr=self.lr_a)
        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=self.lr_c)

        self.hard_update()

    def hard_update(self):
        self.actor_tar.load_state_dict(self.actor.state_dict())
        self.critic_tar.load_state_dict(self.critic.state_dict())

    def soft_update(self):
        for params, params_tar in zip(self.actor.parameters(),
                                      self.actor_tar.parameters()):
            params_tar.data.copy_(self.tau * params.data +
                                  (1 - self.tau) * params_tar.data)
        for params, params_tar in zip(self.critic.parameters(),
                                      self.critic_tar.parameters()):
            params_tar.data.copy_(self.tau * params.data +
                                  (1 - self.tau) * params_tar.data)

    def choose_action(self, obs, is_evaluate=False):
        obs = torch.from_numpy(obs).float().to(self.device)
        with torch.no_grad():
            action = self.actor(obs)
        if not is_evaluate:
            action += torch.normal(torch.tensor(0.),
                                   torch.tensor(self.noise_eps))
        action = torch.clamp(action, -self.action_boundary,
                             self.action_boundary).cpu().detach().numpy()
        return action

    def rollout(self, env, memory, is_evaluate=False):
        total_reward = 0.
        obs = env.reset()
        done = False
        while not done:
            a = self.choose_action(obs, is_evaluate)
            obs_, r, done, info = env.step(a)

            memory.store(obs, a, r, obs_, done)

            total_reward += r
            obs = obs_
        return total_reward

    def update(self, memory):
        obs, a, r, obs_, done = memory.sample_batch(self.batch_size)

        obs = torch.from_numpy(obs).float().to(self.device)
        a = torch.from_numpy(a).float().to(self.device)
        r = torch.from_numpy(r).float().to(self.device)
        obs_ = torch.from_numpy(obs_).float().to(self.device)
        done = torch.from_numpy(done).float().to(self.device)

        with torch.no_grad():
            next_action_tar = self.actor_tar(obs_)
            next_q_tar = self.critic_tar(obs_, next_action_tar)
            critic_target = r + (1 - done) * self.gamma * next_q_tar
        critic_eval = self.critic(obs, a)
        loss_critic = F.mse_loss(critic_eval, critic_target.detach())
        self.optimizer_c.zero_grad()
        loss_critic.backward()
        self.optimizer_c.step()

        loss_actor = -self.critic(obs, self.actor(obs)).mean()
        self.optimizer_a.zero_grad()
        loss_actor.backward()
        self.optimizer_a.step()

        self.soft_update()

    def save_model(self, remark):
        if not os.path.exists('pretrained_model/'):
            os.mkdir('pretrained_model/')
        path = 'pretrained_model/{}.pt'.format(remark)
        print('Saving model to {}'.format(path))
        torch.save(self.actor.state_dict(), path)

    def load_model(self, remark):
        path = 'pretrained_model/{}.pt'.format(remark)
        print('Loading model from {}'.format(path))
        model = torch.load(path)
        self.actor.load_state_dict(model)
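
hard_update and soft_update above are the standard target-network updates: a full copy at initialization, then Polyak averaging theta_target <- tau*theta + (1 - tau)*theta_target after each learning step. A self-contained sketch of the same idea follows; the Linear modules stand in for the actor/critic networks, and tau=0.005 is only a placeholder for args.tau:

import torch.nn as nn

def soft_update(net, net_target, tau=0.005):
    # Polyak averaging of target parameters towards the online parameters
    for p, p_tar in zip(net.parameters(), net_target.parameters()):
        p_tar.data.copy_(tau * p.data + (1 - tau) * p_tar.data)

actor = nn.Linear(4, 2)
actor_tar = nn.Linear(4, 2)
actor_tar.load_state_dict(actor.state_dict())  # hard update: exact copy
soft_update(actor, actor_tar, tau=0.005)       # soft update: small step towards actor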
Example #8
File: agent.py Project: Kavka1/RL
class TD3Agent():
    def __init__(self, s_dim, a_dim, action_space, args):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.action_space = action_space
        self.lr_pi = args.lr_pi
        self.lr_q = args.lr_q
        self.gamma = args.gamma
        self.tau = args.tau
        self.noise_std = args.noise_std
        self.noise_clip = args.noise_clip
        self.batch_size = args.batch_size
        self.policy_update_interval = args.policy_update_interval
        self.device = torch.device(args.device)
        self.policy_loss_log = torch.tensor(0.).to(self.device)

        self.policy = DeterministicPolicy(self.s_dim,
                                          self.a_dim,
                                          self.device,
                                          action_space=self.action_space).to(
                                              self.device)
        self.policy_target = DeterministicPolicy(
            self.s_dim,
            self.a_dim,
            self.device,
            action_space=self.action_space).to(self.device)
        self.Q1 = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q1_target = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q2 = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q2_target = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.hard_update_target()

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr_pi)
        self.optimizer_q1 = optim.Adam(self.Q1.parameters(), lr=self.lr_q)
        self.optimizer_q2 = optim.Adam(self.Q2.parameters(), lr=self.lr_q)

    def hard_update_target(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q1_target.load_state_dict(self.Q1.state_dict())
        self.Q2_target.load_state_dict(self.Q2.state_dict())

    def soft_update_target(self):
        for param, param_target in zip(self.policy.parameters(),
                                       self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data *
                                    (1 - self.tau))
        for param, param_target in zip(self.Q1.parameters(),
                                       self.Q1_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data *
                                    (1 - self.tau))
        for param, param_target in zip(self.Q2.parameters(),
                                       self.Q2_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data *
                                    (1 - self.tau))

    def choose_action(self, s):
        s = torch.from_numpy(s).to(self.device).float()
        return self.policy.sample(s).cpu().detach().numpy()

    def learn(self, memory, total_step):
        s, a, r, s_, done = memory.sample_batch(self.batch_size)
        s = torch.from_numpy(s).to(self.device)
        a = torch.from_numpy(a).to(self.device)
        r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
        s_ = torch.from_numpy(s_).to(self.device)
        done = torch.from_numpy(done).to(self.device).unsqueeze(dim=1)

        noise = (torch.randn_like(a) * self.noise_std).clamp(
            -self.noise_clip, self.noise_clip)
        a_target_next = self.policy_target.sample(s_) + noise
        q1_next = self.Q1_target(s_, a_target_next)
        q2_next = self.Q2_target(s_, a_target_next)
        q_next_min = torch.min(q1_next, q2_next)
        q_loss_target = r + (1 - done) * self.gamma * q_next_min

        #update q1
        q1_loss_pred = self.Q1(s, a)
        q1_loss = F.mse_loss(q1_loss_pred, q_loss_target.detach()).mean()
        self.optimizer_q1.zero_grad()
        q1_loss.backward()
        self.optimizer_q1.step()

        #update q2
        q2_loss_pred = self.Q2(s, a)
        q2_loss = F.mse_loss(q2_loss_pred, q_loss_target.detach()).mean()
        self.optimizer_q2.zero_grad()
        q2_loss.backward()
        self.optimizer_q2.step()

        # delayed policy update
        if total_step % self.policy_update_interval == 0:
            policy_loss = -self.Q1(s, self.policy.sample(s)).mean()
            self.optimizer_pi.zero_grad()
            policy_loss.backward()
            self.optimizer_pi.step()
            self.soft_update_target()

            self.policy_loss_log = policy_loss

        return q1_loss.item(), q2_loss.item(), self.policy_loss_log.item()

    def save_model(self,
                   env_name,
                   remarks='',
                   pi_path=None,
                   q1_path=None,
                   q2_path=None):
        if not os.path.exists('pretrained_models/'):
            os.mkdir('pretrained_models/')

        if pi_path is None:
            pi_path = 'pretrained_models/policy_{}_{}'.format(
                env_name, remarks)
        if q1_path is None:
            q1_path = 'pretrained_models/q1_{}_{}'.format(env_name, remarks)
        if q2_path is None:
            q2_path = 'pretrained_models/q2_{}_{}'.format(env_name, remarks)
        print('Saving model to {} , {} and {}'.format(pi_path, q1_path,
                                                      q2_path))
        torch.save(self.policy.state_dict(), pi_path)
        torch.save(self.Q1.state_dict(), q1_path)
        torch.save(self.Q2.state_dict(), q2_path)

    def load_model(self, pi_path, q1_path, q2_path):
        print('Loading models from {} , {} and {}'.format(
            pi_path, q1_path, q2_path))
        self.policy.load_state_dict(torch.load(pi_path))
        self.Q1.load_state_dict(torch.load(q1_path))
        self.Q2.load_state_dict(torch.load(q2_path))
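
The learn method above shows the two TD3 ingredients: target policy smoothing (clipped Gaussian noise added to the target action) and the clipped double-Q target (minimum of the two target critics). Here is a minimal sketch of just the target computation, assuming tensors of matching batch shapes; the default values are placeholders for the corresponding args fields, and the sketch also clamps the smoothed action to the action bound, which canonical TD3 does and the class above leaves to the policy:

import torch

def smoothed_target_action(a_target, noise_std=0.2, noise_clip=0.5, a_bound=1.0):
    # target policy smoothing: clipped Gaussian noise, then clamp to action bounds
    noise = (torch.randn_like(a_target) * noise_std).clamp(-noise_clip, noise_clip)
    return (a_target + noise).clamp(-a_bound, a_bound)

def clipped_double_q_target(r, done, q1_next, q2_next, gamma=0.99):
    # pessimistic target: take the smaller of the two target critics
    return r + (1 - done) * gamma * torch.min(q1_next, q2_next)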
Example #9
class TD3Agent():
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.g_dim = env_params['g_dim']
        self.action_boundary = env_params['action_boundary']
        self.max_episode_steps = env_params['max_episode_steps']

        self.evaluate_episodes = args.evaluate_episodes
        self.lr_pi = args.lr_pi_TD3
        self.lr_q = args.lr_q
        self.gamma = args.gamma
        self.tau = args.tau
        self.action_var = args.action_var
        self.noise_std = args.noise_std
        self.noise_clip = args.noise_clip
        self.K_updates = args.K_updates_TD3
        self.policy_update_interval = args.policy_update_interval
        self.batch_size = args.batch_size
        self.device = torch.device(args.device)
        self.load_model_remark = args.load_model_remark

        self.total_trained_goal_num = 0
        self.total_episode_num = 0
        self.total_update_num = 0
        self.policy_loss_log = 0.
        self.q1_loss_log = 0.
        self.q2_loss_log = 0.

        self.memory = MemoryBuffer(args.memory_capacity, self.o_dim, self.g_dim, self.a_dim)

        self.policy = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
        self.policy_target = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
        self.Q1 = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q1_target = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q2 = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q2_target = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr_pi)
        self.optimizer_q1 = optim.Adam(self.Q1.parameters(), lr=self.lr_q)
        self.optimizer_q2 = optim.Adam(self.Q2.parameters(), lr=self.lr_q)

        self.hard_update()

    def hard_update(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q1_target.load_state_dict(self.Q1.state_dict())
        self.Q2_target.load_state_dict(self.Q2.state_dict())

    def soft_update(self):
        for param, param_target in zip(self.policy.parameters(), self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))
        for param, param_target in zip(self.Q1.parameters(), self.Q1_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))
        for param, param_target in zip(self.Q2.parameters(), self.Q2_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))

    def select_action(self, observation, goal):
        observation = torch.from_numpy(observation).float().to(self.device)
        goal = torch.from_numpy(goal).float().to(self.device)
        input_tensor = torch.cat([observation, goal], dim=0)
        with torch.no_grad():
            dist = self.policy(input_tensor)
            action = dist.sample()

            action = action.cpu().detach().numpy()
            action = np.clip(action, -self.action_boundary, self.action_boundary)

        return action

    def train_and_evaluate(self, goals, env, logger = None):
        returns = np.zeros(shape=[len(goals)], dtype=np.float32)
        
        for i_goal, goal in enumerate(goals):
            success_count = 0
            cumulative_r = 0. #for log
            used_steps = 0
            cumulative_loss_pi, cumulative_loss_q1, cumulative_loss_q2 = 0., 0., 0.
            print('--{} goal: ({:.4f}, {:.4f}):-----------------------'.format(self.total_trained_goal_num, goal[0], goal[1]))

            for i_episode in range(self.evaluate_episodes):
                success_flag = 0
                
                _ = env.reset()
                obs = env.set_goal(goal)
                for i_step in range(self.max_episode_steps):
                    a = self.select_action(obs['observation'], obs['desired_goal'])
                    obs_, reward, done, info = env.step(a)
                    
                    self.memory.store(obs['observation'], a, reward, obs_['observation'], obs['desired_goal'])
                    
                    cumulative_r += reward
                    used_steps += 1

                    if success_flag == 0 and info['is_success'] == 1:
                        success_flag = 1
                        break

                    obs = obs_

                if len(self.memory) > self.batch_size:
                    loss_q1, loss_q2, loss_pi = self.update() #need change
                    cumulative_loss_pi += loss_pi
                    cumulative_loss_q1 += loss_q1
                    cumulative_loss_q2 += loss_q2

                success_count += success_flag

            average_success = success_count / self.evaluate_episodes
            returns[i_goal] = average_success
            
            self.total_trained_goal_num += 1
            self.total_episode_num += self.evaluate_episodes
            if logger is not None:
                logger.add_scalar('Indicator/reward_per_step', cumulative_r/used_steps, self.total_trained_goal_num)
                logger.add_scalar('Indicator/goal_success_rate', average_success, self.total_trained_goal_num)
                logger.add_scalar('loss/loss_pi', cumulative_loss_pi/self.evaluate_episodes, self.total_update_num)
                logger.add_scalar('loss/loss_q1', cumulative_loss_q1/self.evaluate_episodes, self.total_update_num)
                logger.add_scalar('loss/loss_q2', cumulative_loss_q2/self.evaluate_episodes, self.total_update_num)
            
            print('\t success_rate: {:.2f}'.format(average_success))
            print('\t average_episode_return: {:.4f}'.format(cumulative_r/self.evaluate_episodes))

        return returns

    def evaluate_goal(self, goals, env):
        returns = np.zeros(shape=[len(goals)], dtype=np.float32)
        for i_goal, goal in enumerate(goals):
            success_count = 0
            for i_episode in range(self.evaluate_episodes):
                _ = env.reset()
                obs = env.set_goal(goal)
                for i_step in range(self.max_episode_steps):
                    a = self.select_action(obs['observation'], obs['desired_goal'])
                    obs_, reward, done, info = env.step(a)
                    if info['is_success'] == 1:
                        success_count += 1
                        break
                    obs = obs_
            average_success = success_count / self.evaluate_episodes
            print('{} goal: {} {}   return: {}'.format(i_goal, goal[0], goal[1], average_success))
            returns[i_goal] = average_success
        return returns

    def update(self):
        for i in range(self.K_updates):
            o, a, r, o_, g = self.memory.sample(self.batch_size)
            o = torch.from_numpy(o).to(self.device)
            a = torch.from_numpy(a).to(self.device)
            r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
            o_ = torch.from_numpy(o_).to(self.device)
            g = torch.from_numpy(g).to(self.device)

            o_g_input = torch.cat([o, g], dim=1)
            next_o_g_input = torch.cat([o_, g], dim=1)
            o_g_a_input = torch.cat([o, g, a], dim=1)

            noise = (torch.randn_like(a) * self.noise_std).clamp(-self.noise_clip, self.noise_clip)
            a_target_next = self.policy_target(next_o_g_input).sample() + noise

            next_o_a_target_g_input = torch.cat([o_, g, a_target_next], dim=1)
            q1_next = self.Q1_target(next_o_a_target_g_input)
            q2_next = self.Q2_target(next_o_a_target_g_input)
            q_next_min = torch.min(q1_next, q2_next)
            q_loss_tar = r + self.gamma * q_next_min

            q1_loss_pred = self.Q1(o_g_a_input)
            q1_loss = F.mse_loss(q1_loss_pred, q_loss_tar.detach())
            self.optimizer_q1.zero_grad()
            q1_loss.backward()
            self.optimizer_q1.step()

            q2_loss_pred = self.Q2(o_g_a_input)
            q2_loss = F.mse_loss(q2_loss_pred, q_loss_tar.detach())
            self.optimizer_q2.zero_grad()
            q2_loss.backward()
            self.optimizer_q2.step()

            self.total_update_num += 1
            self.q1_loss_log = q1_loss.cpu().detach().numpy()
            self.q2_loss_log = q2_loss.cpu().detach().numpy()

            if self.total_update_num % self.policy_update_interval == 0:
                actions = self.policy(o_g_input).sample()
                policy_loss = - self.Q1(torch.cat([o_g_input, actions], dim=1)).mean()
                self.optimizer_pi.zero_grad()
                policy_loss.backward()
                self.optimizer_pi.step()

                self.policy_loss_log = policy_loss.cpu().detach().numpy()

                self.soft_update()

        return self.q1_loss_log, self.q2_loss_log, self.policy_loss_log

    def save_model(self, remark):
        if not os.path.exists('pretrained_models_TD3/'):
            os.mkdir('pretrained_models_TD3/')
        path = 'pretrained_models_TD3/{}.pt'.format(remark)
        print('Saving model to {}'.format(path))
        torch.save(self.policy.state_dict(), path)

    def load_model(self):
        print('Loading models with remark {}'.format(self.load_model_remark))
        policy_model = torch.load('pretrained_models_TD3/{}.pt'.format(self.load_model_remark), map_location=lambda storage, loc: storage)
        self.policy.load_state_dict(policy_model)
Example #10
    def __init__(self, args, env_params):
        self.s_dim = env_params['o_dim'] + env_params['g_dim']
        self.a_dim = env_params['a_dim']
        self.f_dim = args.f_dim
        self.action_bound = env_params['action_max']
        self.max_timestep = env_params['max_timestep']
        self.max_episode = args.max_episode
        self.evaluate_episode = args.evaluate_episode
        self.evaluate_interval = args.evaluate_interval
        self.log_interval = args.log_interval
        self.save_model_interval = args.save_model_interval
        self.save_model_start = args.save_model_start

        self.lr = args.lr
        self.lr_model = args.lr_model
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.eta = args.eta
        self.noise_eps = args.noise_eps
        self.device = torch.device(args.device)

        self.normalizer_s = Normalizer(size=self.s_dim,
                                       eps=1e-2,
                                       clip_range=1.)

        self.memory = Memory(size=args.memory_size,
                             s_dim=self.s_dim,
                             a_dim=self.a_dim)

        self.policy = Policy(s_dim=self.s_dim,
                             a_dim=self.a_dim).to(self.device)
        self.policy_target = Policy(s_dim=self.s_dim,
                                    a_dim=self.a_dim).to(self.device)
        self.Q = QFunction(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.Q_target = QFunction(s_dim=self.s_dim,
                                  a_dim=self.a_dim).to(self.device)

        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q.parameters(), lr=self.lr)

        self.encoder = StateEncoder(s_dim=self.s_dim,
                                    f_dim=self.f_dim).to(self.device)
        self.EnvForward = ForwardModel(f_dim=self.f_dim,
                                       a_dim=self.a_dim).to(self.device)
        self.EnvInverse = InverseModel(f_dim=self.f_dim,
                                       a_dim=self.a_dim).to(self.device)

        self.optimizer_forward = optim.Adam(
            [{
                'params': self.EnvForward.parameters()
            }, {
                'params': self.encoder.parameters()
            }],
            lr=self.lr_model)
        self.optimizer_inverse = optim.Adam(
            [{
                'params': self.EnvInverse.parameters()
            }, {
                'params': self.encoder.parameters()
            }],
            lr=self.lr_model)

        self.hard_update()

        self.update_num = 0
Example #11
class DDPG_Agent():
    def __init__(self, args, env_params):
        self.s_dim = env_params['o_dim'] + env_params['g_dim']
        self.a_dim = env_params['a_dim']
        self.f_dim = args.f_dim
        self.action_bound = env_params['action_max']
        self.max_timestep = env_params['max_timestep']
        self.max_episode = args.max_episode
        self.evaluate_episode = args.evaluate_episode
        self.evaluate_interval = args.evaluate_interval
        self.log_interval = args.log_interval
        self.save_model_interval = args.save_model_interval
        self.save_model_start = args.save_model_start

        self.lr = args.lr
        self.lr_model = args.lr_model
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.eta = args.eta
        self.noise_eps = args.noise_eps
        self.device = torch.device(args.device)

        self.normalizer_s = Normalizer(size=self.s_dim,
                                       eps=1e-2,
                                       clip_range=1.)

        self.memory = Memory(size=args.memory_size,
                             s_dim=self.s_dim,
                             a_dim=self.a_dim)

        self.policy = Policy(s_dim=self.s_dim,
                             a_dim=self.a_dim).to(self.device)
        self.policy_target = Policy(s_dim=self.s_dim,
                                    a_dim=self.a_dim).to(self.device)
        self.Q = QFunction(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.Q_target = QFunction(s_dim=self.s_dim,
                                  a_dim=self.a_dim).to(self.device)

        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q.parameters(), lr=self.lr)

        self.encoder = StateEncoder(s_dim=self.s_dim,
                                    f_dim=self.f_dim).to(self.device)
        self.EnvForward = ForwardModel(f_dim=self.f_dim,
                                       a_dim=self.a_dim).to(self.device)
        self.EnvInverse = InverseModel(f_dim=self.f_dim,
                                       a_dim=self.a_dim).to(self.device)

        self.optimizer_forward = optim.Adam(
            [{
                'params': self.EnvForward.parameters()
            }, {
                'params': self.encoder.parameters()
            }],
            lr=self.lr_model)
        self.optimizer_inverse = optim.Adam(
            [{
                'params': self.EnvInverse.parameters()
            }, {
                'params': self.encoder.parameters()
            }],
            lr=self.lr_model)

        self.hard_update()

        self.update_num = 0

    def select_action(self, state, train_mode=True):
        s = self.normalize_input(state)
        s = torch.tensor(s, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            action = self.policy(s).cpu().numpy()

        if train_mode:
            action += np.random.randn(
                self.a_dim
            ) * self.noise_eps * self.action_bound  #Gaussian Noise
        else:
            pass

        action = np.clip(action,
                         a_min=-self.action_bound,
                         a_max=self.action_bound)
        return action

    def get_intrinsic_reward(self, s, a, s_):
        s, a, s_ = torch.from_numpy(s).to(
            self.device).float(), torch.from_numpy(a).to(
                self.device).float(), torch.from_numpy(s_).to(
                    self.device).float()
        with torch.no_grad():
            feature = self.encoder(s)
            next_feature_pred = self.EnvForward(feature, a)
            next_feature = self.encoder(s_)
        r_i = self.eta * torch.norm(next_feature_pred - next_feature)
        r_i = torch.clamp(r_i, min=-0.1, max=0.1)
        return r_i.cpu().detach().numpy()

    def train(self, env, logger=None):
        total_step = 0
        loss_pi, loss_q, loss_forward, loss_inverse = 0., 0., 0., 0.
        for i_episode in range(self.max_episode):
            obs = env.reset()
            s = get_state(obs)

            cumulative_r = 0.
            for i_step in range(self.max_timestep):
                a = self.select_action(s)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)

                r_i = self.get_intrinsic_reward(s, a, s_)
                r = r_e + r_i

                self.memory.store(s, a, r, s_)
                s = s_

                if len(self.memory) > self.batch_size:
                    loss_pi, loss_q, loss_forward, loss_inverse = self.learn()
                cumulative_r += r_e
                total_step += 1

            print(
                'i_episode: {} total step: {} cumulative reward: {:.4f} is_success: {} '
                .format(i_episode, total_step, cumulative_r,
                        info['is_success']))
            if logger is not None and i_episode % self.log_interval == 0:
                logger.add_scalar('Indicator/cumulative reward', cumulative_r,
                                  i_episode)
                logger.add_scalar('Loss/pi_loss', loss_pi, i_episode)
                logger.add_scalar('Loss/q_loss', loss_q, i_episode)
                logger.add_scalar('Loss/forward_loss', loss_forward, i_episode)
                logger.add_scalar('Loss/inverse_loss', loss_inverse, i_episode)
            if i_episode % self.evaluate_interval == 0:
                success_rate = self.evaluate(env)
                if logger is not None:
                    logger.add_scalar('Indicator/success rate', success_rate,
                                      i_episode)

            if i_episode > self.save_model_start and i_episode % self.save_model_interval == 0:
                self.save_model(remarks='{}_{}'.format(env.spec.id, i_episode))

    def evaluate(self, env, render=False):
        success_count = 0
        for i_episode in range(self.evaluate_episode):
            obs = env.reset()
            s = get_state(obs)
            for i_step in range(self.max_timestep):
                if render:
                    env.render()
                a = self.select_action(s, train_mode=False)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)
                s = s_
            success_count += info['is_success']

        return success_count / self.evaluate_episode

    def learn(self):
        s, a, r, s_ = self.memory.sample_batch(batch_size=self.batch_size)
        self.normalizer_s.update(s)

        s, s_ = self.normalize_input(s, s_)
        s = torch.from_numpy(s).to(self.device)
        a = torch.from_numpy(a).to(self.device)
        r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
        s_ = torch.from_numpy(s_).to(self.device)

        #update policy and Q
        with torch.no_grad():
            a_next_tar = self.policy_target(s_)
            Q_next_tar = self.Q_target(s_, a_next_tar)
            loss_q_tar = r + self.gamma * Q_next_tar
        loss_q_pred = self.Q(s, a)
        loss_q = F.mse_loss(loss_q_pred, loss_q_tar.detach())
        self.optimizer_q.zero_grad()
        loss_q.backward()
        self.optimizer_q.step()

        loss_p = -self.Q(s, self.policy(s)).mean()
        self.optimizer_p.zero_grad()
        loss_p.backward()
        self.optimizer_p.step()

        self.soft_update()

        #update env model and encoder
        feature = self.encoder(s)
        next_feature = self.encoder(s_)
        a_pred = self.EnvInverse(feature, next_feature)
        loss_inverse = F.mse_loss(a_pred, a)

        next_feature_pred = self.EnvForward(feature, a)
        with torch.no_grad():
            next_feature_tar = self.encoder(s_)
        loss_forward = F.mse_loss(next_feature_pred, next_feature_tar.detach())

        self.optimizer_forward.zero_grad()
        self.optimizer_inverse.zero_grad()
        loss_forward.backward(retain_graph=True)
        loss_inverse.backward()
        self.optimizer_forward.step()
        self.optimizer_inverse.step()

        self.update_num += 1
        return loss_p.cpu().detach().numpy(), loss_q.cpu().detach().numpy(
        ), loss_forward.cpu().detach().numpy(), loss_inverse.cpu().detach(
        ).numpy()

    def update_normalizer(self, states):
        states = np.array(states, dtype=np.float32)
        self.normalizer_s.update(states)

    def hard_update(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q_target.load_state_dict(self.Q.state_dict())

    def soft_update(self):
        for param, param_target in zip(self.policy.parameters(),
                                       self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data *
                                    (1 - self.tau))
        for param, param_target in zip(self.Q.parameters(),
                                       self.Q_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data *
                                    (1 - self.tau))

    def normalize_input(self, s, s_=None):
        s = self.normalizer_s.normalize(s)
        if s_ is not None:
            s_ = self.normalizer_s.normalize(s_)
            return s, s_
        else:
            return s

    def save_model(self, remarks):
        if not os.path.exists('pretrained_models_DDPG/'):
            os.mkdir('pretrained_models_DDPG/')
        path = 'pretrained_models_DDPG/{}.pt'.format(remarks)
        print('Saving model to {}'.format(path))
        torch.save([
            self.normalizer_s.mean, self.normalizer_s.std,
            self.policy.state_dict()
        ], path)

    def load_model(self, remark):
        print('Loading models with remark {}'.format(remark))
        self.normalizer_s.mean, self.normalizer_s.std, policy_model = torch.load(
            'pretrained_models_DDPG/{}.pt'.format(remark),
            map_location=lambda storage, loc: storage)
        self.policy.load_state_dict(policy_model)
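
The intrinsic reward above is ICM-style curiosity: the prediction error of the forward model in the learned feature space, scaled by eta and clipped. A stripped-down sketch is shown below, with plain tensors standing in for encoder(s_) and EnvForward(encoder(s), a), and eta/clip values that are only placeholders for the corresponding args fields:

import torch

def intrinsic_reward(next_feature_pred, next_feature, eta=0.01, clip=0.1):
    # forward-model prediction error in feature space, scaled and clipped
    r_i = eta * torch.norm(next_feature_pred - next_feature)
    return torch.clamp(r_i, min=-clip, max=clip)

pred = torch.tensor([0.2, 0.1, 0.0])
real = torch.tensor([0.3, 0.0, 0.1])
print(intrinsic_reward(pred, real).item())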
Example #12
    def __init__(self,
                 input_shape,
                 action_n,
                 N,
                 alpha=0.5,
                 beta=0.5,
                 beta_decay=50000,
                 gamma=0.99):
        self.shape = input_shape
        self.batch_size = input_shape[0]

        # Prioritized Replay Memory
        self.pr = PrioritizedReplayBuf(N=N,
                                       alpha=alpha,
                                       beta=beta,
                                       beta_decay=beta_decay,
                                       batch_size=self.batch_size)
        # Importance Sampling weights
        self.is_w = tf.placeholder(shape=[self.batch_size, 1],
                                   dtype=tf.float32)

        Q = QFunction(input_shape, action_n, scope="Q")
        target_Q = QFunction(input_shape, action_n, scope="target_Q")

        # Forward Q
        self.s = tf.placeholder(shape=[None] + input_shape[1:],
                                dtype=tf.float32)
        self.a = tf.placeholder(shape=[self.batch_size, 1], dtype=tf.int32)
        self.probs = Q(self.s, s_bias=False)

        # add offset
        first = tf.expand_dims(tf.range(self.batch_size), axis=1)
        indices = tf.concat(values=[first, self.a], axis=1)
        # gather corresponding q_vals
        self.q_val = tf.expand_dims(tf.gather_nd(self.probs, indices), axis=1)

        # TD target
        self.done = tf.placeholder(shape=[self.batch_size, 1],
                                   dtype=tf.float32)
        self.r = tf.placeholder(shape=[self.batch_size, 1], dtype=tf.float32)
        self.s_ = tf.placeholder(shape=input_shape, dtype=tf.float32)

        # D-DQN
        a_max = tf.expand_dims(tf.argmax(Q(self.s_, reuse=True), axis=1),
                               axis=1)
        a_max = tf.to_int32(a_max)
        target_q_val = tf.expand_dims(tf.gather_nd(
            target_Q(self.s_), tf.concat(values=[first, a_max], axis=1)),
                                      axis=1)
        self.y = self.r + gamma * (1.0 - self.done) * target_q_val
        # Error Clipping
        # TD-error
        self.delta = Hurber_loss(self.q_val, self.y)
        # Importance sampling
        max_is = tf.reduce_max(self.is_w)
        self.loss = tf.reduce_mean((self.is_w / max_is) * self.delta)

        # Update Q
        # reducing step-size by a factor of four
        opt = tf.train.RMSPropOptimizer(0.00025 / 4, 0.99, 0.0, 1e-6)
        grads_and_vars = opt.compute_gradients(self.loss)
        grads_and_vars = [[grad, var] for grad, var in grads_and_vars \
                          if grad is not None and (var.name.startswith("Q") or var.name.startswith("shared"))]
        self.train_op = opt.apply_gradients(grads_and_vars)

        # Update target Q
        self.target_train_op = copy_params(Q, target_Q)
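
Here the per-sample TD errors (self.delta) are weighted by the importance-sampling weights, normalized by their batch maximum, before averaging, and the RMSProp step size is divided by four as in the prioritized replay paper. A NumPy sketch of that weighted loss follows, assuming the project's Hurber_loss helper is the standard element-wise Huber loss with delta=1:

import numpy as np

def per_weighted_loss(td_errors, is_weights, delta=1.0):
    # element-wise Huber loss, weighted by IS weights normalized by their max
    abs_err = np.abs(td_errors)
    huber = np.where(abs_err <= delta,
                     0.5 * td_errors ** 2,
                     delta * (abs_err - 0.5 * delta))
    return np.mean((is_weights / is_weights.max()) * huber)

print(per_weighted_loss(np.array([0.5, -2.0]), np.array([0.7, 1.0])))  # 0.79375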
Example #13
class DDPG_Her_Agent():
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.g_dim = env_params['g_dim']
        self.action_bound = env_params['action_max']

        self.lr = args.lr
        self.l2_coefficient = args.l2_coefficient
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.device = torch.device(args.device)
        self.tau = args.tau
        self.noise_eps = args.noise_eps

        self.policy = Policy(o_dim=self.o_dim,
                             a_dim=self.a_dim,
                             g_dim=self.g_dim).to(self.device)
        self.policy_target = Policy(o_dim=self.o_dim,
                                    a_dim=self.a_dim,
                                    g_dim=self.g_dim).to(self.device)
        self.Q = QFunction(o_dim=self.o_dim,
                           a_dim=self.a_dim,
                           g_dim=self.g_dim).to(self.device)
        self.Q_target = QFunction(o_dim=self.o_dim,
                                  a_dim=self.a_dim,
                                  g_dim=self.g_dim).to(self.device)
        sync_networks(self.policy)
        sync_networks(self.Q)

        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q.parameters(), lr=self.lr)

        self.normalizer_o = Normalizer(size=self.o_dim,
                                       eps=1e-2,
                                       clip_range=1.)
        self.normalizer_g = Normalizer(size=self.g_dim,
                                       eps=1e-2,
                                       clip_range=1.)

        self.hard_update()

    def hard_update(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q_target.load_state_dict(self.Q.state_dict())

    def soft_update(self):
        for param, param_target in zip(self.policy.parameters(),
                                       self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data *
                                    (1 - self.tau))
        for param, param_target in zip(self.Q.parameters(),
                                       self.Q_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data *
                                    (1 - self.tau))

    def normalize_input(self, o, g, o_=None):
        o = self.normalizer_o.normalize(o)
        g = self.normalizer_g.normalize(g)
        if o_ is not None:
            o_ = self.normalizer_o.normalize(o_)
            return o, g, o_
        else:
            return o, g

    def select_action(self, observation, goal, train_mode=True):
        observation, goal = self.normalize_input(observation, goal)
        observation, goal = torch.tensor(observation, dtype=torch.float32).to(
            self.device), torch.tensor(goal,
                                       dtype=torch.float32).to(self.device)
        o_g = torch.cat([observation, goal], dim=0)
        with torch.no_grad():
            action = self.policy(o_g).cpu().numpy()

        if train_mode:
            action += np.random.randn(
                self.a_dim
            ) * self.noise_eps * self.action_bound  #Gaussian Noise
        else:
            pass
        action = np.clip(action,
                         a_min=-self.action_bound,
                         a_max=self.action_bound)
        return action

    def learn(self, memory):
        o, a, r, o_, g = memory.sample_batch(batch_size=self.batch_size)
        o, g, o_ = self.normalize_input(o, g, o_)
        o = torch.from_numpy(o).to(self.device)
        a = torch.from_numpy(a).to(self.device)
        r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
        o_ = torch.from_numpy(o_).to(self.device)
        g = torch.from_numpy(g).to(self.device)

        #update Q
        a_next_target = self.policy_target(torch.cat([o_, g], dim=1))
        q_tar = r + self.gamma * self.Q_target(
            torch.cat([o_, a_next_target, g], dim=1))
        q_tar = torch.clamp(q_tar, -1 / (1 - self.gamma), 0)
        q_pred = self.Q(torch.cat([o, a, g], dim=1))
        loss_q = F.mse_loss(q_pred, q_tar.detach())
        self.optimizer_q.zero_grad()
        loss_q.backward()
        sync_grads(self.Q)
        self.optimizer_q.step()

        #update policy
        a_eval = self.policy(torch.cat([o, g], dim=1))
        loss_p = -self.Q(torch.cat(
            [o, a_eval, g], dim=1)).mean() + self.l2_coefficient * (
                a_eval / self.action_bound).pow(2).mean()  # L2 penalty on actions
        self.optimizer_p.zero_grad()
        loss_p.backward()
        sync_grads(self.policy)
        self.optimizer_p.step()

        return loss_q.cpu().item(), loss_p.cpu().item(), q_pred.mean().cpu(
        ).item()

    def update_normalizer(self, observations, goals):
        observations, goals = np.array(
            observations, dtype=np.float32), np.array(goals, dtype=np.float32)
        self.normalizer_o.update(observations)
        self.normalizer_g.update(goals)

    def save_model(self, remarks):
        if not os.path.exists('pretrained_models_DDPG/'):
            os.mkdir('pretrained_models_DDPG/')
        path = 'pretrained_models_DDPG/{}.pt'.format(remarks)
        print('Saving model to {}'.format(path))
        torch.save([
            self.normalizer_o.mean, self.normalizer_o.std,
            self.normalizer_g.mean, self.normalizer_g.std,
            self.policy.state_dict()
        ], path)

    def load_model(self, remarks):
        print('Loading models with remark {}'.format(remarks))
        self.normalizer_o.mean, self.normalizer_o.std, self.normalizer_g.mean, self.normalizer_g.std, policy_model = torch.load(
            'pretrained_models_DDPG/{}.pt'.format(remarks),
            map_location=lambda storage, loc: storage)
        self.policy.load_state_dict(policy_model)
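
The clamp on q_tar in learn reflects the sparse reward convention of the HER environments: with per-step rewards in {-1, 0}, the discounted return lies in [-1/(1-gamma), 0], so the TD target is clipped to that range. A tiny illustration, where gamma=0.98 is only a placeholder for args.gamma:

import torch

def clipped_q_target(r, q_next, gamma=0.98):
    # with rewards in {-1, 0}, the return is bounded below by -1/(1-gamma) and above by 0
    q_tar = r + gamma * q_next
    return torch.clamp(q_tar, -1.0 / (1.0 - gamma), 0.0)

print(clipped_q_target(torch.tensor([-1.0]), torch.tensor([-80.0])))  # tensor([-50.])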