Example #1
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)
"""create agent"""
agent = Agent(env_factory,
              policy_net,
              device,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)


def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
Example #2
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n, hidden_size=(20,20))
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std, hidden_size=(3,3))
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = 5
optim_batch_size = 64

"""create agent"""
agent = Agent(env_factory, policy_net, running_state=running_state, render=args.render, num_threads=args.num_threads)


def update_params(batch, i_iter):
    states = torch.from_numpy(np.stack(batch.state))
    actions = torch.from_numpy(np.stack(batch.action))
    rewards = torch.from_numpy(np.stack(batch.reward))
    masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
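The truncated update_params functions above go on to pass these tensors to estimate_advantages, which is not shown in the snippets. As a rough, self-contained sketch of what such a helper typically computes (assuming standard GAE with discount gamma, GAE parameter tau, 1-D tensors, and mask = 0 at episode boundaries; the name and normalization step here are illustrative, not the repository's actual implementation):

import torch


def estimate_advantages_sketch(rewards, masks, values, gamma, tau):
    # delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
    # A_t     = delta_t + gamma * tau * mask_t * A_{t+1}
    advantages = torch.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    for t in reversed(range(rewards.size(0))):
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = delta + gamma * tau * prev_advantage * masks[t]
        prev_value, prev_advantage = values[t], advantages[t]
    returns = advantages + values
    # normalizing advantages is a common final step before the PPO update
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages, returns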
Example #3
    else:
        policy_mgr = DiscretePolicy(state_dim, 4)
        policy_wrk = Policy(state_dim + subgoal_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)

optim_epochs = 10
optim_batch_size = 50
"""create agent"""
agent = Agent(env,
              policy_mgr,
              policy_wrk,
              device,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)
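Several of these examples restore policy_net, value_net and running_state from a single pickle file. A matching save step could look like the sketch below; the file path is hypothetical, and moving the networks to CPU first is a common convenience (so the file loads on GPU-less machines), not something taken from the snippets themselves.

import pickle
import torch

# move the networks to CPU before pickling so the checkpoint can be unpickled
# without a GPU (illustrative; adapt to however the training script handles devices)
policy_net.to(torch.device("cpu"))
value_net.to(torch.device("cpu"))
with open("learned_models/model.p", "wb") as f:   # hypothetical path
    pickle.dump((policy_net, value_net, running_state), f)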
Example #4
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim,
                                    env.action_space.n,
                                    hidden_size=(64, 32),
                                    activation='relu')
    else:
        policy_net = Policy(state_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std,
                            hidden_size=(64, 32),
                            activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')
    if args.WGAN:
        discrim_net = SNDiscriminator(state_dim + action_dim,
                                      hidden_size=(32, 16),
                                      activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=64,
                                        activation='relu',
                                        slope=0.1,
                                        dropout=False,
                                        dprob=0.2)
    elif args.GEOMGAN:
        # new kernel
        #discrim_net = KernelNet(state_dim + action_dim,state_dim + action_dim)
        noise_dim = 64
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=noise_dim,
                                        activation='relu',
                                        slope=0.1,
                                        dropout=False,
                                        dprob=0.2)
        kernel_net = NoiseNet(noise_dim,
                              hidden_size=(32, ),
                              encode_size=noise_dim,
                              activation='relu',
                              slope=0.1,
                              dropout=False,
                              dprob=0.2)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(),
                                            lr=args.learning_rate / 2)
        scheduler_kernel = MultiStepLR(optimizer_kernel,
                                       milestones=args.milestones,
                                       gamma=args.lr_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim,
                                    hidden_size=(32, 16),
                                    activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(),
                                         lr=args.learning_rate)

    scheduler_policy = MultiStepLR(optimizer_policy,
                                   milestones=args.milestones,
                                   gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value,
                                  milestones=args.milestones,
                                  gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim,
                                    milestones=args.milestones,
                                    gamma=args.lr_decay)

    if args.WGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()
                    # return -discrim_net(state_action).sum().item()

        learned_reward = ExpertReward()
    elif args.EBGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(
                        recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.GEOMGAN:

        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()
    """create agent"""
    agent = Agent(env,
                  policy_net,
                  device,
                  custom_reward=learned_reward,
                  running_state=None,
                  render=args.render,
                  num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(
            batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(
            batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(
            batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(
            batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)
        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  args.gamma, args.tau, device)
        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            #dataSize = states.size()[0]
            # expert_state_actions = torch.from_numpy(expert_traj).to(dtype).to(device)
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(
                expert_traj[exp_idx, :]).to(dtype).to(device)

            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN or args.GMMIL or args.GEOMGAN:
                # AESNDiscriminator is unpacked as (encoding, reconstruction)
                # in expert_reward above; recover both outputs here as well
                g_o_enc, g_o = discrim_net(dis_input_fake)
                e_o_enc, e_o = discrim_net(dis_input_real)
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.GEOMGAN:
                optimizer_kernel.zero_grad()

            if args.WGAN:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real,
                                   dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                #mmd2_D,K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                mmd2_D, K = mix_rbf_mmd2(dis_input_real, dis_input_fake,
                                         args.sigma_list)
                #tbd
                #rewards = K[0]+K[1]-2*K[2]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                # exp - gen, maximize (gen label negative)
                rewards = -rewards.detach()
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD

                # prep for generator
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            elif args.GEOMGAN:
                # larger, better, but slower
                noise_num = 100
                mmd2_D, K = mix_imp_mmd2(e_o_enc, g_o_enc, noise_num,
                                         noise_dim, kernel_net, cuda)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()
                errD = mmd2_D  #+ args.lambda_rg * one_side_errD
                discrim_loss = -errD  # maximize errD

                # prep for generator
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                               discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))
            # GMMIL computes its MMD directly on the raw state-action pairs
            # above, so in that case there is nothing to backpropagate here
            if discrim_loss.requires_grad:
                discrim_loss.backward()
                optimizer_discrim.step()
            if args.GEOMGAN:
                optimizer_kernel.step()
        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(
                    i * args.ppo_batch_size,
                    min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy,
                         optimizer_value, 1, states_b, actions_b, returns_b,
                         advantages_b, fixed_log_probs_b, args.clip_epsilon,
                         args.l2_reg)

        return rewards

    if args.GEOMGAN:
        return (policy_net, value_net, discrim_net, kernel_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                optimizer_kernel, agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim,
                scheduler_kernel)
    else:
        return (policy_net, value_net, discrim_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim)
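The GMMIL branch above relies on mix_rbf_mmd2, which is not shown in the snippet. For orientation, a minimal biased estimator of MMD^2 under a mixture of RBF kernels looks roughly like the sketch below; the repository's mix_rbf_mmd2 also returns the kernel terms it uses to build rewards, which this simplified version omits.

import torch


def rbf_mmd2_sketch(x, y, sigma_list):
    # biased estimate of MMD^2(x, y) = E[k(x,x')] + E[k(y,y')] - 2 E[k(x,y)]
    # under a sum of RBF kernels with bandwidths in sigma_list
    m = x.size(0)
    z = torch.cat([x, y], dim=0)
    zz = z @ z.t()
    diag = zz.diag().unsqueeze(0)
    dist2 = diag + diag.t() - 2.0 * zz          # pairwise squared distances
    k = sum(torch.exp(-dist2 / (2.0 * sigma ** 2)) for sigma in sigma_list)
    k_xx, k_yy, k_xy = k[:m, :m], k[m:, m:], k[:m, m:]
    return k_xx.mean() + k_yy.mean() - 2.0 * k_xy.mean()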
Example #5
class BC(object):
    """
    A vanilla Behavior Cloning model.
    """
    def __init__(self,
                 args,
                 state_dim,
                 action_dim,
                 is_dict_action=False,
                 is_atari=False):

        self.device = args.device
        self.config = args

        self.is_dict_action = is_dict_action
        self.is_atari = is_atari

        self.state_dim = state_dim

        self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
            Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.learning_rate)
        self.actor_loss = nn.CrossEntropyLoss() if self.is_dict_action \
            else nn.MSELoss()

    def set_expert(self, expert_traj, num_trajs):
        """
        Set the expert trajectories.
        :param expert_traj:
        :param num_trajs:
        :return:
        """
        # self.expert_traj_pool = expert_traj
        # self.expert_traj = np.vstack(expert_traj[:num_trajs])

    def set_expert2(self, expert_traj):
        """
        Set the expert trajectories.
        :param expert_traj:
        :return:
        """
        self.expert_traj = expert_traj

    def train(self):
        """
        :param num_traj:
        :return:
        """
        if self.expert_traj is not None:
            expert_traj = self.expert_traj
        expert_state_actions = torch.DoubleTensor(expert_traj)

        expert_states = expert_state_actions[:, :self.state_dim].to(self.device)
        expert_actions = expert_state_actions[:, self.state_dim:].to(self.device)

        predicted_actions = self.actor(expert_states)[0]

        self.actor_optimizer.zero_grad()
        loss = self.actor_loss(predicted_actions, expert_actions)
        loss.backward()
        self.actor_optimizer.step()

        return loss.to('cpu').detach().numpy()

    def train2(self):
        """
        :return:
        """
        if self.expert_traj is not None:
            expert_traj = self.expert_traj
        # expert_state_actions = torch.DoubleTensor(expert_traj)
        #
        # expert_states = expert_state_actions[:,:self.state_dim].to(self.device)
        # expert_actions = expert_state_actions[:,self.state_dim:].to(self.device)
        expert_states = torch.DoubleTensor(self.expert_traj.state)
        expert_actions = torch.DoubleTensor(self.expert_traj.action)
        predicted_actions = self.actor(expert_states)[0]

        self.actor_optimizer.zero_grad()
        loss = self.actor_loss(predicted_actions, expert_actions)
        loss.backward()
        self.actor_optimizer.step()

        return loss.to('cpu').detach().numpy()
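For context, the same behavior-cloning idea in a fully self-contained form, with a hypothetical two-layer policy and random placeholder data standing in for the expert (state, action) pairs that BC slices out of expert_traj:

import torch
import torch.nn as nn

state_dim, action_dim = 4, 2
policy = nn.Sequential(nn.Linear(state_dim, 64), nn.Tanh(),
                       nn.Linear(64, action_dim))
optimizer = torch.optim.Adam(policy.parameters(), lr=3e-4)
loss_fn = nn.MSELoss()  # continuous actions; a discrete policy would use CrossEntropyLoss

expert_states = torch.randn(256, state_dim)     # placeholder expert data
expert_actions = torch.randn(256, action_dim)

for _ in range(100):
    predicted_actions = policy(expert_states)
    loss = loss_fn(predicted_actions, expert_actions)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()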
Example #6
                                              hidden_size=(64, 64),
                                              log_std=args.log_std)

        else:
            policy_net = Policy(state_dim,
                                env.action_space.shape[0],
                                log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))

policy_net.to(device)
value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=args.learning_rate)

params = list(policy_net.parameters()) + list(value_net.parameters())
unique_optimizer = torch.optim.Adam(params, lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = args.optim_epochs
optim_batch_size = args.optim_batch_size
"""create agent"""
agent = Agent(env,
              policy_net,
              device,
              running_state=running_state,
Example #7
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim,
                                    env.action_space.n,
                                    hidden_size=(64, 32),
                                    activation='relu')
    else:
        policy_net = Policy(state_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std,
                            hidden_size=(64, 32),
                            activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')
    if args.AL:
        discrim_net = SNDiscriminator(state_dim + action_dim,
                                      hidden_size=(32, 16),
                                      activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=64,
                                        activation='leakyrelu',
                                        slope=0.1,
                                        dropout=True,
                                        dprob=0.2)
    elif args.VAKLIL:
        noise_dim = 64
        mid_dim = 32
        discrim_net = VAEDiscriminator(state_dim + action_dim,
                                       num_outputs=noise_dim,
                                       sigmoid_out=False,
                                       sn=True,
                                       test=False,
                                       w_init=False,
                                       hidden_size_enc=(),
                                       hidden_size_dec=(),
                                       encode_size=mid_dim,
                                       activation='relu',
                                       dropout=False)
        kernel_net = NoiseNet(noise_dim,
                              hidden_size=(32, ),
                              encode_size=noise_dim,
                              activation='relu',
                              dropout=False)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(),
                                            lr=args.learning_rate)
        scheduler_kernel = MultiStepLR(optimizer_kernel,
                                       milestones=args.milestones,
                                       gamma=args.lr_kernel_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim,
                                    hidden_size=(32, 16),
                                    activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(),
                                         lr=args.learning_rate)

    scheduler_policy = MultiStepLR(optimizer_policy,
                                   milestones=args.milestones,
                                   gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value,
                                  milestones=args.milestones,
                                  gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim,
                                    milestones=args.milestones,
                                    gamma=args.lr_kernel_decay)

    if args.AL:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()

        learned_reward = ExpertReward()
    elif args.EBGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(
                        recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.VAKLIL:

        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()
    """create agent"""
    agent = Agent(env,
                  policy_net,
                  device,
                  custom_reward=learned_reward,
                  running_state=None,
                  render=args.render,
                  num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(
            batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(
            batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(
            batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(
            batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)
        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  args.gamma, args.tau, device)
        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(
                expert_traj[exp_idx, :]).to(dtype).to(device)

            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN or args.GMMIL or args.VAKLIL:
                g_o_enc, g_mu, g_sigma = discrim_net(dis_input_fake,
                                                     mean_mode=False)
                e_o_enc, e_mu, e_sigma = discrim_net(dis_input_real,
                                                     mean_mode=False)
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.VAKLIL:
                optimizer_kernel.zero_grad()

            if args.AL:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real,
                                   dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                mmd2_D, K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                # exp - gen, maximize (gen label negative)
                rewards = -rewards.detach()
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD

                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            elif args.VAKLIL:
                noise_num = 20000
                mmd2_D_net, _, penalty = mix_imp_with_bw_mmd2(
                    e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda,
                    args.sigma_list)
                mmd2_D_rbf, _ = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                errD = (mmd2_D_net + mmd2_D_rbf) / 2
                # information-bottleneck capacity and KL(N(mu, sigma^2) || N(0, I));
                # 1e-8 keeps the log numerically stable
                i_c = 0.2
                mu_cat = torch.cat((e_mu, g_mu), dim=0)
                sigma_cat = torch.cat((e_sigma, g_sigma), dim=0)
                bottleneck_loss = torch.mean(0.5 * torch.sum(
                    mu_cat**2 + sigma_cat**2 -
                    torch.log(sigma_cat**2 + 1e-8) - 1, dim=1)) - i_c
                discrim_loss = -errD + (args.beta * bottleneck_loss) + (
                    args.lambda_h * penalty)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                               discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))

            discrim_loss.backward()
            optimizer_discrim.step()
            if args.VAKLIL:
                optimizer_kernel.step()

        if args.VAKLIL:
            with torch.no_grad():
                noise_num = 20000
                g_o_enc, _, _ = discrim_net(dis_input_fake)
                e_o_enc, _, _ = discrim_net(dis_input_real)
                _, K_net, _ = mix_imp_with_bw_mmd2(e_o_enc, g_o_enc, noise_num,
                                                   noise_dim, kernel_net, cuda,
                                                   args.sigma_list)
                _, K_rbf = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                K = [sum(x) / 2 for x in zip(K_net, K_rbf)]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards  #.detach()
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(
                    i * args.ppo_batch_size,
                    min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy,
                         optimizer_value, 1, states_b, actions_b, returns_b,
                         advantages_b, fixed_log_probs_b, args.clip_epsilon,
                         args.l2_reg)

        return rewards

    if args.VAKLIL:
        return (policy_net, value_net, discrim_net, kernel_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                optimizer_kernel, agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim,
                scheduler_kernel)
    else:
        return (policy_net, value_net, discrim_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim)
Example #8
class PPO(object):
    def __init__(self,
                 args,
                 state_dim,
                 action_dim,
                 is_dict_action=False,
                 is_atari=False):

        self.device = args.device
        self.config = args
        if is_atari:
            self.actor = CNNPolicy(state_dim, action_dim).to(self.device)
            self.critic = CNNCritic(state_dim).to(self.device)
        else:
            self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
                Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
            self.critic = Value(state_dim).to(self.device)

        # initialize optimizer for actor and critic
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.learning_rate)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.learning_rate)

        # optimization epoch number and batch size for PPO
        self.optim_epochs = 10
        self.optim_batch_size = 64

    def train(self, batch):
        """
        Train the policy using the given batch.
        :param batch:
        :return:
        """

        states = torch.DoubleTensor(np.stack(batch.state)).to(self.device)
        actions = torch.DoubleTensor(np.stack(batch.action)).to(self.device)
        rewards = torch.DoubleTensor(np.stack(batch.reward)).to(self.device)
        masks = torch.DoubleTensor(np.stack(batch.mask)).to(self.device)

        with torch.no_grad():
            values = self.critic(states)
            fixed_log_probs = self.actor.get_log_prob(states, actions)

        # get advantage estimation from the trajectories
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  self.config.gamma,
                                                  self.config.tau, self.device)

        # compute minibatch size
        optim_iter_num = int(math.ceil(states.shape[0] /
                                       self.optim_batch_size))

        # PPO updates
        for _ in range(self.optim_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = torch.LongTensor(perm).to(self.device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(
                    i * self.optim_batch_size,
                    min((i + 1) * self.optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                self.ppo_step(states_b, actions_b, returns_b, advantages_b,
                              fixed_log_probs_b)

    def ppo_step(self, states, actions, returns, advantages, fixed_log_probs):
        """
        A PPO policy gradient update step.
        :param states:
        :param actions:
        :param returns:
        :param advantages:
        :param fixed_log_probs:
        :return:
        """
        # update critic, for now assume one epoch
        values_pred = self.critic(states)
        value_loss = (values_pred - returns).pow(2).mean()
        # weight decay
        for param in self.critic.parameters():
            value_loss += param.pow(2).sum() * self.config.l2_reg
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

        # update actor
        log_probs = self.actor.get_log_prob(states, actions)
        ratio = torch.exp(log_probs - fixed_log_probs)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1.0 - self.config.clip_epsilon,
                            1.0 + self.config.clip_epsilon) * advantages
        policy_surr = -torch.min(surr1, surr2).mean()
        self.actor.zero_grad()
        policy_surr.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 40)
        self.actor_optimizer.step()
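As a small numerical illustration of the clipped surrogate used in ppo_step (toy values, not from any real rollout): with a positive advantage, a probability ratio above 1 + clip_epsilon earns no additional credit, because torch.min falls back to the clipped term.

import torch

ratio = torch.tensor([0.9, 1.0, 1.5])       # pi_new / pi_old for three samples
advantages = torch.tensor([1.0, 1.0, 1.0])
clip_epsilon = 0.2

surr1 = ratio * advantages                                   # [0.9, 1.0, 1.5]
surr2 = torch.clamp(ratio, 1.0 - clip_epsilon,
                    1.0 + clip_epsilon) * advantages         # [0.9, 1.0, 1.2]
policy_surr = -torch.min(surr1, surr2).mean()                # -(0.9 + 1.0 + 1.2) / 3
print(policy_surr)  # tensor(-1.0333)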