def train(args):
    """Meta-train a behaviour-cloning policy on the configured dataset.

    Arguments
    ---------
    args: namespace providing at least `batch_size`, `num_workers`,
          `device` and `num_batches`.
    """
    logger.warning(
        'This script is an example to showcase the MetaModule and '
        'data-loading features of Torchmeta, and as such has been '
        'very lightly tested. For a better tested implementation of '
        'Model-Agnostic Meta-Learning (MAML) using Torchmeta with '
        'more features (including multi-step adaptation and '
        'different datasets), please check `https://github.com/'
        'tristandeleu/pytorch-maml`.')

    inputs = [
        "new_models/210429/dataset/BedBathingBaxterHuman-v0217_0-v1-human-coop-robot-coop_10k"
    ]
    env_name = "BedBathingBaxterHuman-v0217_0-v1"
    env = gym.make('assistive_gym:' + env_name)

    dataset = behaviour(inputs, shots=400, test_shots=1)
    dataloader = BatchMetaDataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers)

    model = PolicyNetwork(env.observation_space_human.shape[0],
                          env.action_space_human.shape[0])
    # Zero-initialise the feature extractor. `zeros_` already works in
    # place (and returns its argument), so re-assigning the result to
    # `v.data`, as the original did, was redundant.
    for _, v in model.features.named_parameters():
        torch.nn.init.zeros_(v)
    model.to(device=args.device)
    model.train()
    meta_optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Training loop
    pbar = tqdm(total=args.num_batches)
    batch_idx = 0
    while batch_idx < args.num_batches:
        for batch in dataloader:
            # Stop exactly at `num_batches`, even mid-epoch; the original
            # only re-checked the counter once per full dataloader pass,
            # so it could overshoot by up to one epoch.
            if batch_idx >= args.num_batches:
                break

            model.zero_grad()

            train_inputs, train_targets = batch['train']
            train_inputs = train_inputs.to(device=args.device).float()
            train_targets = train_targets.to(device=args.device).float()

            # Accumulate the behaviour-cloning loss over all tasks in the
            # meta-batch.
            loss = torch.tensor(0., device=args.device)
            for train_input, train_target in zip(train_inputs, train_targets):
                train_output = model(train_input)
                loss += get_loss(train_output, train_target)

            # NOTE(review): dividing by len(dataloader) (number of batches
            # per epoch) looks suspicious -- the usual normalisation would
            # be the number of tasks in this batch (train_inputs.size(0)).
            # Kept as-is to preserve training behaviour; confirm intent.
            loss.div_(len(dataloader))
            loss.backward()
            meta_optimizer.step()

            pbar.update(1)
            pbar.set_postfix(loss='{0:.4f}'.format(loss.item()))
            batch_idx += 1
Beispiel #2
0
class Agent():
    """DDPG-style agent: online/target actor and critic networks, a
    shared replay buffer for several parallel agents, and one Adam
    optimizer per online network."""

    def __init__(self, state_size, action_size, num_agents):
        self.num_agents = num_agents

        # Replay buffer shared by all parallel agents.
        self.replay = Replay(100000)

        hidden_dim = 128

        # Critic (Q-value) networks: online and target copies.
        self.critic_net = ValueNetwork(state_size, action_size,
                                       hidden_dim).to(device)
        self.target_critic_net = ValueNetwork(state_size, action_size,
                                              hidden_dim).to(device)

        # Actor (policy) networks: online and target copies.
        self.actor_net = PolicyNetwork(state_size, action_size,
                                       hidden_dim).to(device)
        self.target_actor_net = PolicyNetwork(state_size, action_size,
                                              hidden_dim).to(device)

        # Each target network starts as an exact copy of its online net.
        self._sync(self.critic_net, self.target_critic_net)
        self._sync(self.actor_net, self.target_actor_net)

        self.critic_optimizer = optim.Adam(self.critic_net.parameters(),
                                           lr=CRITIC_LEARNING_RATE)
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(),
                                          lr=ACTOR_LEARNING_RATE)

    @staticmethod
    def _sync(source, target):
        """Hard-copy every parameter of `source` into `target`."""
        for dst, src in zip(target.parameters(), source.parameters()):
            dst.data.copy_(src.data)

    def get_action(self, state):
        """Query the online actor for an action for `state`."""
        result = self.actor_net.get_action(state)
        return result[0]

    def add_replay(self, state, action, reward, next_state, done):
        """Store one transition per parallel agent in the replay buffer."""
        for idx in range(self.num_agents):
            self.replay.add(state[idx], action[idx], reward[idx],
                            next_state[idx], done[idx])

    def learning_step(self):
        """Run one DDPG update from a sampled mini-batch.

        Does nothing until the buffer holds at least one batch of samples.
        """
        if self.replay.cursize < BATCH_SIZE:
            return

        state, action, reward, next_state, done = self.replay.get(BATCH_SIZE)

        # Actor loss: maximise Q(s, pi(s)), i.e. minimise its negative mean.
        actor_loss = -self.critic_net(state, self.actor_net(state)).mean()

        # One-step TD target built from the target networks.
        # NOTE(review): assumes reward/done broadcast correctly against the
        # critic output -- confirm the replay buffer's tensor shapes.
        next_action = self.target_actor_net(next_state)
        bootstrapped = self.target_critic_net(next_state,
                                              next_action.detach())
        expected_value = reward + (1.0 - done) * DISCOUNT_RATE * bootstrapped

        critic_loss = F.mse_loss(self.critic_net(state, action),
                                 expected_value.detach())

        # Backprop: actor first, then critic (each with its own optimizer).
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Polyak-average the target networks towards the online ones.
        self.soft_update(self.critic_net, self.target_critic_net, TAU)
        self.soft_update(self.actor_net, self.target_actor_net, TAU)

    def save(self, name):
        """Persist online network weights to `name`_critic / `name`_actor."""
        torch.save(self.critic_net.state_dict(), name + "_critic")
        torch.save(self.actor_net.state_dict(), name + "_actor")

    def load(self, name):
        """Restore online weights, switch them to eval mode, and re-sync
        both target networks to match."""
        self.critic_net.load_state_dict(torch.load(name + "_critic"))
        self.critic_net.eval()
        self.actor_net.load_state_dict(torch.load(name + "_actor"))
        self.actor_net.eval()

        self._sync(self.critic_net, self.target_critic_net)
        self._sync(self.actor_net, self.target_actor_net)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        pairs = zip(target_model.parameters(), local_model.parameters())
        for tgt, src in pairs:
            blended = tau * src.data + (1.0 - tau) * tgt.data
            tgt.data.copy_(blended)
Beispiel #3
0
class DDPG:
    """Deep Deterministic Policy Gradient agent.

    Keeps online and target copies of both the critic (value) and the
    actor (policy) networks; `update` draws a mini-batch from the replay
    buffer, takes one gradient step on each online network and then
    soft-updates the targets.
    """

    def __init__(self, cfg):
        self.device = cfg.device
        self.gamma = cfg.gamma
        self.batch_size = cfg.batch_size

        self.value_net = ValueNetwork(cfg.state_dim, cfg.action_dim,
                                      cfg.hidden_dim).to(self.device)
        self.policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim,
                                        cfg.hidden_dim).to(self.device)
        # Target networks start as exact copies of the online networks.
        self.target_value_net = ValueNetwork(cfg.state_dim, cfg.action_dim,
                                             cfg.hidden_dim).to(self.device)
        self.target_value_net.load_state_dict(self.value_net.state_dict())
        self.target_policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim,
                                               cfg.hidden_dim).to(self.device)
        self.target_policy_net.load_state_dict(self.policy_net.state_dict())
        self.soft_tau = cfg.soft_tau

        self.value_lr = cfg.value_lr
        self.policy_lr = cfg.policy_lr
        self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                          lr=self.value_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=self.policy_lr)

        # mean squared error
        self.value_criterion = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(cfg.replay_buffer_size)

    def update(self, cfg):
        """Run one DDPG learning step from a sampled mini-batch."""
        state, action, reward, next_state, done = self.replay_buffer.sample(
            cfg.batch_size)
        # Typical shapes: (128, 3) (128, 1) (128,) (128, 3) (128,)
        state = torch.FloatTensor(state).to(cfg.device)
        action = torch.FloatTensor(action).to(cfg.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(cfg.device)
        next_state = torch.FloatTensor(next_state).to(cfg.device)
        done = torch.FloatTensor(done).unsqueeze(1).to(cfg.device)

        # Actor loss: maximise Q(s, pi(s)) => minimise its negative mean.
        # (The original evaluated this forward pass twice and discarded
        # the first result; the dead duplicate call was removed.)
        policy_loss = self.value_net(state, self.policy_net(state))
        policy_loss = -policy_loss.mean()

        # One-step TD target from the target networks.
        next_action = self.target_policy_net(next_state)
        target_value = self.target_value_net(next_state, next_action.detach())
        TD_target = reward + (1.0 - done) * self.gamma * target_value
        # (A torch.clamp(TD_target, -inf, inf) no-op was removed here.)

        value = self.value_net(state, action)
        # Critic Loss
        value_loss = self.value_criterion(value, TD_target.detach())

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Polyak (soft) update of both target networks.
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)

        for target_param, param in zip(self.target_policy_net.parameters(),
                                       self.policy_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
Beispiel #4
0
class A2CAgent():
    """Advantage Actor-Critic (A2C) agent for a discrete-action gym
    environment (e.g. CartPole-v0: 4-dim observations, 2 actions).

    Owns a value network (obs_dim -> 1) and a policy network
    (obs_dim -> action_dim), each trained with its own Adam optimizer.
    """

    def __init__(self, env, gamma, lr):
        """
        Arguments
        ---------
        env: gym environment with Box observations and Discrete actions
        gamma (float): reward discount factor (e.g. 0.99)
        lr (float): learning rate for both optimizers (e.g. 1e-4)
        """
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)

    def get_action(self, state):
        """Sample an action index from the softmax policy for `state`."""
        state = torch.FloatTensor(state)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        """Compute (value_loss, policy_loss) for one trajectory.

        `trajectory` is a list of (state, action, reward, next_state, done)
        tuples covering a single rollout.
        """
        states = torch.FloatTensor([sars[0] for sars in trajectory])
        actions = torch.LongTensor([sars[1]
                                    for sars in trajectory]).view(-1, 1)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory])
        next_states = torch.FloatTensor([sars[3] for sars in trajectory])
        # `dones` is currently unused by the loss (see NOTE below).
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1)

        # Discounted return-to-go for every timestep j:
        #   G_j = sum_{i >= j} gamma^(i-j) * r_i
        discounted_rewards = [
            torch.sum(
                torch.FloatTensor(
                    [self.gamma ** i for i in range(rewards[j:].size(0))])
                * rewards[j:])
            for j in range(rewards.size(0))
        ]
        # NOTE(review): G_j already contains r_j, so adding `rewards` again
        # double-counts the immediate reward; episode boundaries (`dones`)
        # are also ignored here. Kept as-is to preserve the original
        # training behaviour -- confirm intent before changing.
        value_targets = rewards.view(
            -1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1)

        # Critic loss: MSE between V(s) and the (detached) targets.
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # Policy distribution over actions for every visited state.
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # Entropy bonus: H = -sum p * log(p), summed over all timesteps.
        # (Fixed: the original computed -sum(dist.mean() * log(dist)),
        # which weights every log-prob by the constant 1/action_dim
        # instead of by its own probability.)
        entropy = -torch.sum(dists * torch.log(dists))

        # Actor loss: -log pi(a|s) * advantage, minus the entropy bonus.
        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(
            -1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory):
        """One optimisation step for both networks.

        zero_grad clears old gradients, backward computes new ones via
        backpropagation, and step applies the optimizer update.
        """
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
class PPOAgent():
    """Proximal Policy Optimization agent for parallel environments.

    Collects fixed-length trajectories from a parallel (Unity ML-Agents
    style) environment, estimates advantages -- plain discounted returns
    or GAE when `use_gae` -- and optimises a clipped surrogate objective
    for several epochs over shuffled mini-batches of the collected data.
    """

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers,
                 opt_epoch=OPT_EPOCH,
                 use_gae=True,
                 batch_size=BATCH_SIZE):
        """Initialize a PPO Agent object.

           Arguments
           ---------
           state_size (int): Dimension of state
           action_size (int): Total number of possible actions
           seed (int): Random seed
           hidden_layers (list): List of integers, each element represents for the size of a hidden layer
           opt_epoch (int): Number of updates using the collected trajectories
           use_gae (bool): Indicator of using GAE, True or False
        """
        self.state_size = state_size
        self.action_size = action_size
        self.state_normalizer = MeanStdNormalizer()
        self.opt_epoch = opt_epoch
        self.use_gae = use_gae
        self.batch_size = batch_size
        # NOTE(review): random.seed(...) returns None, so self.seed is
        # always None; the call is kept for its global-seeding side effect.
        self.seed = random.seed(seed)

        # networks
        # GAE needs value estimates, hence the actor-critic network.
        if use_gae:
            # self.network = ActorCriticNetwork(state_size, action_size, seed, hidden_layers).to(device)
            self.network = ActorCriticNetwork(state_size, action_size, seed,
                                              hidden_layers)
        else:
            # self.network = PolicyNetwork(state_size, action_size, seed, hidden_layers).to(device)
            self.network = PolicyNetwork(state_size, action_size, seed,
                                         hidden_layers)
        self.optimizer = optim.Adam(self.network.parameters(), lr=LR)

    # collect trajectories for a parallelized parallelEnv object
    def collect_trajectories(self,
                             envs,
                             brain_name,
                             tmax=ROLLOUT_LEN,
                             nrand=5,
                             n_agents=N,
                             discount=GAMMA,
                             lamda=LAMDA):
        """Collect trajectories.

           Arguments
           ---------
           envs: Environment
           brain_name: brain name of given environment
           tmax: Maximum length of collected trajectories
           nrand: Random steps performed before collecting trajectories
           n_agents: Number of parallel agents in the environment
           discount: Reward discount factor
           lamda: GAE lambda parameter
        """

        # number of parallel instances
        n = n_agents

        #initialize returning lists and start the game!
        state_list = []
        reward_list = []
        log_prob_list = []
        action_list = []
        done_list = []
        prediction_list = []

        # reset environment
        env_info = envs.reset(train_mode=True)[brain_name]
        '''
        # perform nrand random steps
        for _ in range(nrand):
            actions = np.random.randn(n, self.action_size)
            actions = np.clip(actions, -1, 1)
            env_info = envs.step(actions)[brain_name]
        '''

        states = env_info.vector_observations
        # states = self.state_normalizer(states)

        for _ in range(tmax):
            # probs will only be used as the pi_old
            # no gradient propagation is needed
            # so we move it to the cpu
            #states_input = torch.tensor(states, dtype=torch.float, device=device)
            states_input = torch.tensor(states, dtype=torch.float)
            predictions = self.network(states_input)
            actions = to_np(predictions['a'])
            actions = np.clip(actions, -1, 1)
            env_info = envs.step(actions)[brain_name]
            next_states = env_info.vector_observations
            # next_states = self.state_normalizer(next_states)
            rewards = env_info.rewards
            dones = env_info.local_done

            # store the result
            state_list.append(states)
            reward_list.append(rewards)
            log_prob_list.append(to_np(predictions['log_pi_a']))
            action_list.append(actions)
            done_list.append(dones)
            prediction_list.append(predictions)

            states = next_states.copy()

            # stop if any of the trajectories is done
            # we want all the lists to be retangular
            if np.stack(dones).any():
                break

        # store one more step's prediction
        # (used below as the bootstrap value V(s_T) for the GAE recursion)
        #states_input = torch.tensor(states, dtype=torch.float, device=device)
        states_input = torch.tensor(states, dtype=torch.float)
        predictions = self.network(states_input)
        prediction_list.append(predictions)

        # # return pi_theta, states, actions, rewards, probability
        #  return np.stack(log_prob_list), np.stack(state_list), np.stack(action_list), \
        #      np.stack(reward_list), np.stack(done_list), np.stack(prediction_list)

        # calculate accumulated discounted rewards and advantage values
        # Arrays are (T, n_agents[, dim]) after stacking along time.
        log_old_probs = np.stack(log_prob_list)
        states = np.stack(state_list)
        actions = np.stack(action_list)
        rewards = np.stack(reward_list)
        dones = np.stack(done_list)
        predictions = np.stack(prediction_list)

        # calculate accumulated discounted rewards and advantage functions
        if not self.use_gae:
            # Discount rewards by global timestep, then reverse-cumsum to
            # get return-to-go for every step; advantages are the returns.
            discount_seq = discount**np.arange(len(rewards))
            rewards_discounted = np.asarray(rewards) * discount_seq[:,
                                                                    np.newaxis]

            rewards_future = rewards_discounted[::-1].cumsum(axis=0)[::-1]
            advantages = rewards_future.copy()
        else:
            # Generalized Advantage Estimation: walk the rollout backwards
            # accumulating lambda/gamma-discounted TD errors; the extra
            # prediction appended above provides the bootstrap value.
            T = log_old_probs.shape[0]
            rewards_future = np.zeros_like(log_old_probs)
            advantages = np.zeros_like(log_old_probs)
            tmp_adv = np.zeros(log_old_probs.shape[1])

            for i in reversed(range(T)):
                td_error = rewards[i, :] + discount * (1 - dones[i, :]) * to_np(predictions[i+1]['v']) - \
                    to_np(predictions[i]['v'])
                tmp_adv = tmp_adv * lamda * discount * (1 -
                                                        dones[i, :]) + td_error
                advantages[i] = tmp_adv.copy()
                rewards_future[i] = tmp_adv + to_np(predictions[i]['v'])

        # Normalise advantages over the whole rollout (stabilises updates).
        mean = np.mean(advantages)
        std = np.std(advantages) + 1.0e-10

        adv_normalized = (advantages - mean) / std

        # return
        return log_old_probs, states, actions, rewards, rewards_future, adv_normalized

    # clipped surrogate function
    # similar as -policy_loss for REINFORCE, but for PPO
    def clipped_surrogate(self,
                          log_old_probs,
                          states,
                          actions,
                          rewards_future,
                          adv_normalized,
                          epsilon=EPSILON,
                          beta=BETA):
        """Clipped surrogate function.

           Arguments
           ---------
           log_old_probs: Log probability of old policy, array with dim batch_size * 1
           states: States, array with dim batch_size * state_size
           actions: Actions, array with dim batch_size * action_size
           rewards_future: Accumulated discounted rewards, array with dim batch_size * 1
           adv_normalized: Advantage values, array with dim batch_size * 1
        """

        # convert everything into pytorch tensors and move to gpu if available
        # state_count = (states.shape[0], states.shape[1])
        '''
        log_old_probs = torch.tensor(log_old_probs.copy(), dtype=torch.float, device=device)
        adv = torch.tensor(adv_normalized.copy(), dtype=torch.float, device=device)
        rewards_future = torch.tensor(rewards_future.copy(), dtype=torch.float, device=device)
        states = torch.tensor(states.copy(), dtype=torch.float, device=device)
        actions = torch.tensor(actions.copy(), dtype=torch.float, device=device)
        '''
        log_old_probs = torch.tensor(log_old_probs.copy(), dtype=torch.float)
        adv = torch.tensor(adv_normalized.copy(), dtype=torch.float)
        rewards_future = torch.tensor(rewards_future.copy(), dtype=torch.float)
        states = torch.tensor(states.copy(), dtype=torch.float)
        actions = torch.tensor(actions.copy(), dtype=torch.float)

        # convert states to policy (or probability)
        # states_input = states.view(-1, self.state_size)
        # actions_input = actions.view(-1, self.action_size)
        new_predictions = self.network(states, actions)
        # log_new_probs = new_predictions['log_pi_a'].view(state_count)
        log_new_probs = new_predictions['log_pi_a'].view(-1, 1)

        # ratio for clipping
        # exp(log_new - log_old) == pi_new(a|s) / pi_old(a|s), per sample
        ratio = (log_new_probs - log_old_probs).exp()

        # clipped function
        clip = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
        clipped_surrogate = torch.min(ratio * adv, clip * adv)

        # include entropy as a regularization term
        # entropy = new_predictions['entropy'].view(state_count)
        entropy = new_predictions['entropy'].view(-1, 1)

        # policy/actor loss
        policy_loss = -clipped_surrogate.mean() - beta * entropy.mean()

        # value/cirtic loss, if use GAE
        if self.use_gae:
            # value_loss = (rewards_future - new_predictions['v'].view(state_count)).pow(2).mean()
            value_loss = 1.0 * (rewards_future - new_predictions['v'].view(
                -1, 1)).pow(2).mean()
            loss = policy_loss + value_loss
        else:
            loss = policy_loss

        # this returns an average of all the loss entries
        return loss

    def step(self, envs, brain_name, grad_clip=10):
        """Collect one rollout, then run `opt_epoch` epochs of clipped-PPO
        mini-batch updates with gradient-norm clipping; returns the raw
        per-step rewards of the rollout."""
        # first, collect trajectories
        log_old_probs, states, actions, rewards, rewards_future, adv_normalized = \
            self.collect_trajectories(envs, brain_name)

        # reshape the data
        # Flatten (T, n_agents, ...) into one big batch of samples.
        log_old_probs_flat = log_old_probs.reshape(-1, 1)
        states_flat = states.reshape(-1, self.state_size)
        actions_flat = actions.reshape(-1, self.action_size)
        rewards_future_flat = rewards_future.reshape(-1, 1)
        adv_normalized_flat = adv_normalized.reshape(-1, 1)

        # update parameters using collected trajectories
        for _ in range(self.opt_epoch):
            # random sample from the collected trajectories by mini-batches
            sampler = random_sample(np.arange(states_flat.shape[0]),
                                    self.batch_size)

            # then updates parameters using the sampled mini-batch
            for batch_indices in sampler:
                self.network.train()
                L = self.clipped_surrogate(log_old_probs_flat[batch_indices],
                                           states_flat[batch_indices],
                                           actions_flat[batch_indices],
                                           rewards_future_flat[batch_indices],
                                           adv_normalized_flat[batch_indices])
                self.optimizer.zero_grad()
                L.backward()
                torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                                               grad_clip)
                self.optimizer.step()
                # del L

        return rewards
Beispiel #6
0
class SAC(object):
    """Soft Actor-Critic agent (single-Q variant with a separate state-value
    network and a target value network)."""

    def __init__(self, config, env):
        self.device = config.device

        self.gamma = config.gamma  # discount factor

        self.tau = config.tau

        # learning rates
        self.value_lr = config.value_lr
        self.soft_q_lr = config.soft_q_lr
        self.policy_lr = config.policy_lr

        self.replace_target_iter = config.replace_target_iter  # target-network update frequency
        self.replay_size = config.replay_size  # replay buffer capacity
        self.batch_size = config.batch_size  # mini-batch size

        self.num_states = env.observation_space.shape[0]  # state-space dimension
        self.num_actions = env.action_space.shape[0]  # action-space dimension

        self.learn_start = self.batch_size * 3  # min. stored transitions before learning starts

        self.learn_step_counter = 0  # total number of learning steps taken

        self.memory = ReplayMemory(self.replay_size)  # replay buffer

        # value (V) network
        self.value_net = ValueNetwork(self.num_states, 256).to(self.device)
        # target V network
        self.target_value_net = ValueNetwork(self.num_states,
                                             256).to(self.device)

        # the target V network starts with the same parameters as the V network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param.data)

        # soft Q network
        self.soft_q_net = SoftQNetwork(self.num_states, self.num_actions,
                                       256).to(self.device)

        # policy network
        self.policy_net = PolicyNetwork(self.num_states, self.num_actions,
                                        256).to(self.device)

        # optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                          lr=self.value_lr)
        self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(),
                                           lr=self.soft_q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=self.policy_lr)

        # mean-squared-error losses
        self.value_criterion = nn.MSELoss()
        self.soft_q_criterion = nn.MSELoss()

    # store one transition in the replay buffer
    def store_transition(self, state, action, reward, next_state, done):
        self.memory.push((state, action, reward, next_state, done))

    # choose an action (tanh-squashed sample from the Gaussian policy)
    def choose_action(self, s):
        s = torch.FloatTensor(s).to(self.device)
        mean, log_std = self.policy_net(s)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        action = action.detach().cpu().numpy()
        # NOTE(review): returns only the first component of `action`;
        # assumes a leading batch dimension (or scalar action) -- confirm.
        return action[0]

    # sample an action and compute its log-probability under the policy
    def get_action_log_prob(self, s, epsilon=1e-6):
        mean, log_std = self.policy_net(s)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        # tanh-squash correction: log pi(a|s) = log N(z) - log(1 - tanh(z)^2 + eps)
        log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
        log_prob = log_prob.sum(-1, keepdim=True)
        # log_prob = Normal(mean, std).log_prob(mean + std * z.to(self.device)) - torch.log(1 - action.pow(2) + epsilon)  # reparameterization

        return action, log_prob, z, mean, log_std

    # sample a mini-batch of transitions from the replay buffer
    def get_batch(self):
        transitions, _, _ = self.memory.sample(self.batch_size)  # batch of transitions

        # unzip the batch:
        # e.g. if zipped is [(1, 4), (2, 5), (3, 6)], zip(*zipped) yields [(1, 2, 3), (4, 5, 6)]
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
            *transitions)

        # convert the samples to tensors
        batch_state = torch.tensor(batch_state,
                                   device=self.device,
                                   dtype=torch.float)
        # NOTE(review): squeeze().view(-1, 1) assumes scalar (1-D) actions;
        # this would mangle multi-dimensional actions -- confirm.
        batch_action = torch.tensor(batch_action,
                                    device=self.device,
                                    dtype=torch.float).squeeze().view(
                                        -1, 1)  # view as a column tensor
        batch_reward = torch.tensor(batch_reward,
                                    device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)
        batch_next_state = torch.tensor(batch_next_state,
                                        device=self.device,
                                        dtype=torch.float)
        batch_done = torch.tensor(batch_done,
                                  device=self.device,
                                  dtype=torch.float).squeeze().view(-1, 1)
        # print("state:", batch_state.shape) 128,4
        # print("action:", batch_action.shape)
        # print("reward:", batch_reward.shape)
        # print("done:", batch_done.shape)
        #
        return batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _

    # one learning step: update soft-Q, value, policy, then the target net
    def learn(self):
        # sample a mini-batch
        batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _ = self.get_batch(
        )

        # print("state:", batch_state)
        # print("action:", batch_action)
        # print("done:", batch_done)

        expected_q_value = self.soft_q_net(batch_state, batch_action)  # q(s,a)
        expected_value = self.value_net(batch_state)  # v(s)
        new_action, log_prob, z, mean, log_std = self.get_action_log_prob(
            batch_state)  # a~, logpi(a~|s), z, mean, log-std

        target_value = self.target_value_net(batch_next_state)  # vtar(s')
        next_q_value = batch_reward + (
            1 -
            batch_done) * self.gamma * target_value  # r + gamma*(1-d)*vtar(s')
        q_value_loss = self.soft_q_criterion(expected_q_value,
                                             next_q_value.detach()).mean()

        expected_new_q_value = self.soft_q_net(batch_state,
                                               new_action)  # q(s,a~)
        # soft value target: q(s,a~) - logpi(a~|s)
        next_value = expected_new_q_value - log_prob
        value_loss = self.value_criterion(expected_value,
                                          next_value.detach()).mean()

        log_prob_target = expected_new_q_value - expected_value  # q(s,a) - v(s)
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        self.soft_q_optimizer.zero_grad()
        q_value_loss.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Polyak (soft) update of the target value network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)

        # increment the learning-step counter
        self.learn_step_counter += 1

    # save the models
    def save(self):
        torch.save(self.soft_q_net, 'sac1_q.pkl')
        torch.save(self.value_net, 'sac1_v.pkl')
        torch.save(self.policy_net, 'sac1_policy.pkl')

    # load the models
    def load(self):
        self.soft_q_net = torch.load('sac1_q.pkl')
        self.value_net = torch.load('sac1_v.pkl')
        self.policy_net = torch.load('sac1_policy.pkl')
Beispiel #7
0
class SAC:
    """Soft Actor-Critic agent (original 2018 formulation).

    Uses twin Q-networks, an explicit state-value network with a
    Polyak-averaged target copy, and a fixed entropy temperature ``alpha``.
    Transitions are stored in a replay ``Memory`` and updates are performed
    on uniformly sampled minibatches.
    """
    def __init__(self, env_name, n_states, n_actions, memory_size, batch_size,
                 gamma, alpha, lr, action_bounds, reward_scale):
        """Create networks, optimizers, losses and the replay memory.

        Args:
            env_name: prefix used for the weight-checkpoint file name.
            n_states: observation dimensionality.
            n_actions: action dimensionality.
            memory_size: replay-memory capacity.
            batch_size: minibatch size used by ``train``.
            gamma: discount factor.
            alpha: fixed entropy temperature.
            lr: learning rate shared by all four optimizers.
            action_bounds: action bounds forwarded to the policy network.
            reward_scale: scale applied to rewards in the Q-value target.
        """
        self.env_name = env_name
        self.n_states = n_states
        self.n_actions = n_actions
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.alpha = alpha
        self.lr = lr
        self.action_bounds = action_bounds
        self.reward_scale = reward_scale
        self.memory = Memory(memory_size=self.memory_size)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.policy_network = PolicyNetwork(
            n_states=self.n_states,
            n_actions=self.n_actions,
            action_bounds=self.action_bounds).to(self.device)
        self.q_value_network1 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(
                                                  self.device)
        self.q_value_network2 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(
                                                  self.device)
        self.value_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        # The target value network starts as an exact copy, stays in eval
        # mode, and is only modified via soft_update_target_network.
        self.value_target_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        self.value_target_network.load_state_dict(
            self.value_network.state_dict())
        self.value_target_network.eval()

        self.value_loss = torch.nn.MSELoss()
        self.q_value_loss = torch.nn.MSELoss()

        self.value_opt = Adam(self.value_network.parameters(), lr=self.lr)
        self.q_value1_opt = Adam(self.q_value_network1.parameters(),
                                 lr=self.lr)
        self.q_value2_opt = Adam(self.q_value_network2.parameters(),
                                 lr=self.lr)
        self.policy_opt = Adam(self.policy_network.parameters(), lr=self.lr)

    def store(self, state, reward, done, action, next_state):
        """Store one transition in replay memory (tensors kept on CPU)."""
        state = from_numpy(state).float().to("cpu")
        reward = torch.Tensor([reward]).to("cpu")
        done = torch.Tensor([done]).to("cpu")
        action = torch.Tensor([action]).to("cpu")
        next_state = from_numpy(next_state).float().to("cpu")
        self.memory.add(state, reward, done, action, next_state)

    def unpack(self, batch):
        """Turn a list of transitions into batched tensors on ``self.device``.

        Returns:
            (states, rewards, dones, actions, next_states) with shapes
            (batch_size, n_states), (batch_size, 1), (batch_size, 1),
            (-1, n_actions) and (batch_size, n_states) respectively.
        """
        batch = Transition(*zip(*batch))

        states = torch.cat(batch.state).view(self.batch_size,
                                             self.n_states).to(self.device)
        rewards = torch.cat(batch.reward).view(self.batch_size,
                                               1).to(self.device)
        dones = torch.cat(batch.done).view(self.batch_size, 1).to(self.device)
        actions = torch.cat(batch.action).view(-1,
                                               self.n_actions).to(self.device)
        next_states = torch.cat(batch.next_state).view(
            self.batch_size, self.n_states).to(self.device)

        return states, rewards, dones, actions, next_states

    def train(self):
        """Run one SAC update on a sampled minibatch.

        Returns:
            (value_loss, mean_q_loss, policy_loss) as Python floats, or
            (0, 0, 0) when the memory holds fewer than ``batch_size``
            transitions.
        """
        if len(self.memory) < self.batch_size:
            return 0, 0, 0
        else:
            batch = self.memory.sample(self.batch_size)
            states, rewards, dones, actions, next_states = self.unpack(batch)

            # Calculating the value target
            # V(s) target: min_i Q_i(s, a~) - alpha * log pi(a~|s), with a~
            # freshly sampled from the policy; both terms are detached so
            # only the value network receives this gradient.
            reparam_actions, log_probs = self.policy_network.sample_or_likelihood(
                states)
            q1 = self.q_value_network1(states, reparam_actions)
            q2 = self.q_value_network2(states, reparam_actions)
            q = torch.min(q1, q2)
            target_value = q.detach() - self.alpha * log_probs.detach()

            value = self.value_network(states)
            value_loss = self.value_loss(value, target_value)

            # Calculating the Q-Value target
            # target_q = scale * r + gamma * V_target(s') * (1 - done);
            # computed without grad since it only serves as a regression target.
            with torch.no_grad():
                target_q = self.reward_scale * rewards + \
                           self.gamma * self.value_target_network(next_states) * (1 - dones)
            # NOTE: q1/q2 are re-bound here to Q(s, a) for the *replayed*
            # actions; `q` above still holds min-Q of the fresh actions.
            q1 = self.q_value_network1(states, actions)
            q2 = self.q_value_network2(states, actions)
            q1_loss = self.q_value_loss(q1, target_q)
            q2_loss = self.q_value_loss(q2, target_q)

            # Policy objective: alpha * log pi - min-Q; gradients flow
            # through the reparameterized actions into the policy.
            policy_loss = (self.alpha * log_probs - q).mean()

            # The policy step runs first; the Q/value optimizers zero any
            # gradients the policy backward pushed into their networks.
            self.policy_opt.zero_grad()
            policy_loss.backward()
            self.policy_opt.step()

            self.value_opt.zero_grad()
            value_loss.backward()
            self.value_opt.step()

            self.q_value1_opt.zero_grad()
            q1_loss.backward()
            self.q_value1_opt.step()

            self.q_value2_opt.zero_grad()
            q2_loss.backward()
            self.q_value2_opt.step()

            # Polyak-average the online value network into the target copy.
            self.soft_update_target_network(self.value_network,
                                            self.value_target_network)

            return value_loss.item(), 0.5 * (
                q1_loss + q2_loss).item(), policy_loss.item()

    def choose_action(self, states):
        """Sample one action for a single state (numpy in, numpy out)."""
        states = np.expand_dims(states, axis=0)
        states = from_numpy(states).float().to(self.device)
        action, _ = self.policy_network.sample_or_likelihood(states)
        return action.detach().cpu().numpy()[0]

    @staticmethod
    def soft_update_target_network(local_network, target_network, tau=0.005):
        """Polyak update: target <- tau * local + (1 - tau) * target."""
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

    def save_weights(self):
        """Save the policy state_dict to '<env_name>_weights.pth'."""
        torch.save(self.policy_network.state_dict(),
                   self.env_name + "_weights.pth")

    def load_weights(self):
        """Load policy weights previously saved by ``save_weights``."""
        self.policy_network.load_state_dict(
            torch.load(self.env_name + "_weights.pth"))

    def set_to_eval_mode(self):
        """Switch the policy network to eval mode for deployment."""
        self.policy_network.eval()
Beispiel #8
0
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        """Set up the SAC networks, their optimizers and the replay buffer.

        Args:
            env: environment; only ``observation_space`` is queried here.
            gamma: discount factor.
            tau: Polyak coefficient for the target value network.
            v_lr: learning rate for the value network.
            q_lr: learning rate shared by both Q-networks.
            policy_lr: learning rate for the policy network.
            buffer_maxlen: replay-buffer capacity.
        """
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        # TODO(review): action range and dimension are hard-coded for a demo
        # (originally taken from env.action_space.low/high); they should be
        # passed in as parameters for the real implementation.
        self.action_range = [[-1, 1], [-1, 1]]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = 2

        # Hyperparameters.
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2  # policy/target nets refresh every 2nd update

        # Networks (construction order kept — it fixes the RNG draw sequence
        # used for weight initialization).
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # Start the target value net as an exact copy of the online one.
        for tgt, src in zip(self.target_value_net.parameters(),
                            self.value_net.parameters()):
            tgt.data.copy_(src)

        # One optimizer per network.
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    # pi: state -> action
    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        '''if action < 0.5:
            return 0
        else:
            return 1'''
        scaled_action = []
        for idx, a in enumerate(action):
            action_range = self.action_range[idx]
            a = (action_range[1] - action_range[0]) / 2.0 + (
                action_range[1] + action_range[0]) / 2.0
            scaled_action.append(a)
        return scaled_action

    def update(self, batch_size):
        """One SAC update: value net and both Q-nets every call, plus the
        policy and target value net every ``delay_step`` calls.

        Args:
            batch_size: number of transitions sampled from the replay buffer.
        """
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)  # column vector for broadcasting

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value Loss
        # NOTE(review): the target is built from *next_states* while the
        # prediction is V(states); canonical SAC evaluates both at the same
        # states — verify this asymmetry is intentional.
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        #TODO: Question: why using 2 Q-networks?
        # To reduce bias in training.
        # (Taking the min of two Q estimates counters overestimation.)

        # q loss
        # Target: r + gamma * (1 - done) * V_target(s'); detached so only
        # the Q-networks receive this gradient.
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy net and target value nets
        # TODO: Question: what does this part do?
        # The original paper mentioned 2 methods for approximating the value function
        # 1. the EMA of policy weights to update the Q network
        # 2. periodical update of the policy network, which is used in this code
        if self.update_step % self.delay_step == 0:
            # Policy objective: log pi - min-Q (entropy temperature is
            # implicitly 1 here).
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            # Polyak averaging: target <- tau * online + (1 - tau) * target.
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        self.update_step += 1