Example 1
        policy_wrk = Policy(state_dim + subgoal_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)

optim_epochs = 10
optim_batch_size = 50
"""create agent"""
agent = Agent(env,
              policy_mgr,
              policy_wrk,
              device,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)


def update_params(batch_mgr, batch_wrk):
Example 2
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)
"""create agent"""
agent = Agent(env_factory,
              policy_net,
              device,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)


def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
Example 3
class DDPG:
    def __init__(
        self,
        env=None,
        render=False,
        num_process=1,
        memory_size=1000000,
        lr_p=1e-3,
        lr_v=1e-3,
        gamma=0.99,
        polyak=0.995,
        explore_size=10000,
        batch_size=100,
        min_update_step=1000,
        update_step=50,
        action_noise=0.1,
        seed=1,
    ):
        self.env = env
        self.render = render
        self.gamma = gamma
        self.polyak = polyak
        self.memory = FixedMemory(memory_size)
        self.explore_size = explore_size
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.action_noise = action_noise
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]

        self.action_low = self.env.action_space.low[0]
        self.action_high = self.env.action_space.high[0]
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions,
                                self.action_high).to(device)
        self.policy_net_target = Actor(self.num_states, self.num_actions,
                                       self.action_high).to(device)

        self.value_net = Value(self.num_states + self.num_actions).to(device)
        self.value_net_target = Value(self.num_states +
                                      self.num_actions).to(device)

        self.policy_net_target.load_state_dict(self.policy_net.state_dict())
        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)

    def choose_action(self, state, noise_scale):
        """select action"""
        self.policy_net.eval()
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action = self.policy_net(state)
        self.policy_net.train()
        action = action.cpu().numpy()[0]
        # add noise
        noise = noise_scale * np.random.randn(self.num_actions)
        action += noise
        action = np.clip(action, self.action_low, self.action_high)
        return action

    def eval(self, i_iter, render=False):
        """evaluate model"""
        self.policy_net.eval()
        self.value_net.eval()

        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action = self.choose_action(state, 0)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter, step):
        """interact"""
        self.policy_net.train()
        self.value_net.train()

        state = self.env.reset()
        episode_reward = 0

        while True:
            if self.render:
                self.env.render()

            action = self.choose_action(state, self.action_noise)

            next_state, reward, done, _ = self.env.step(action)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask')
            self.memory.push(state, action, reward, next_state, mask)

            episode_reward += reward

            if step >= self.min_update_step and step % self.update_step == 0:
                for _ in range(self.update_step):
                    batch = self.memory.sample(
                        self.batch_size)  # random sample batch
                    self.update(batch)

            if done:
                break

            state = next_state

        self.env.close()

        print(f"Iter: {i_iter}, reward: {episode_reward}")

        # record reward information
        writer.add_scalar("ddpg/reward", episode_reward, i_iter)

    def update(self, batch):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by DDPG
        ddpg_step(self.policy_net, self.policy_net_target, self.value_net,
                  self.value_net_target, self.optimizer_p, self.optimizer_v,
                  batch_state, batch_action, batch_reward, batch_next_state,
                  batch_mask, self.gamma, self.polyak)

    def load(self, model_path):
        print(f"Loading Saved Model from {model_path}")
        self.policy_net, self.value_net = torch.load(model_path,
                                                     map_location=device)

    def save(self, save_path):
        """save model"""
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        torch.save((self.policy_net, self.value_net),
                   f"{save_path}/WebEye_ddpg.pt")
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n, hidden_size=(20,20))
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std, hidden_size=(3,3))
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = 5
optim_batch_size = 64

"""create agent"""
agent = Agent(env_factory, policy_net, running_state=running_state, render=args.render, num_threads=args.num_threads)


def update_params(batch, i_iter):
    states = torch.from_numpy(np.stack(batch.state))
    actions = torch.from_numpy(np.stack(batch.action))
    rewards = torch.from_numpy(np.stack(batch.reward))
    masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
    if use_gpu:
Example 5
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim,
                                    env.action_space.n,
                                    hidden_size=(64, 32),
                                    activation='relu')
    else:
        policy_net = Policy(state_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std,
                            hidden_size=(64, 32),
                            activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')
    if args.WGAN:
        discrim_net = SNDiscriminator(state_dim + action_dim,
                                      hidden_size=(32, 16),
                                      activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=64,
                                        activation='relu',
                                        slope=0.1,
                                        dropout=False,
                                        dprob=0.2)
    elif args.GEOMGAN:
        # new kernel
        #discrim_net = KernelNet(state_dim + action_dim,state_dim + action_dim)
        noise_dim = 64
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=noise_dim,
                                        activation='relu',
                                        slope=0.1,
                                        dropout=False,
                                        dprob=0.2)
        kernel_net = NoiseNet(noise_dim,
                              hidden_size=(32, ),
                              encode_size=noise_dim,
                              activation='relu',
                              slope=0.1,
                              dropout=False,
                              dprob=0.2)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(),
                                            lr=args.learning_rate / 2)
        scheduler_kernel = MultiStepLR(optimizer_kernel,
                                       milestones=args.milestones,
                                       gamma=args.lr_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim,
                                    hidden_size=(32, 16),
                                    activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(),
                                         lr=args.learning_rate)

    scheduler_policy = MultiStepLR(optimizer_policy,
                                   milestones=args.milestones,
                                   gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value,
                                  milestones=args.milestones,
                                  gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim,
                                    milestones=args.milestones,
                                    gamma=args.lr_decay)

    if args.WGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()
                    # return -discrim_net(state_action).sum().item()

        learned_reward = ExpertReward()
    elif args.EBGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(
                        recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.GEOMGAN:

        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()
    """create agent"""
    agent = Agent(env,
                  policy_net,
                  device,
                  custom_reward=learned_reward,
                  running_state=None,
                  render=args.render,
                  num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(
            batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(
            batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(
            batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(
            batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)
        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  args.gamma, args.tau, device)
        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            #dataSize = states.size()[0]
            # expert_state_actions = torch.from_numpy(expert_traj).to(dtype).to(device)
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(
                expert_traj[exp_idx, :]).to(dtype).to(device)

            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN or args.GMMIL or args.GEOMGAN:
                # tbd, no discriminator learning
                pass
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.GEOMGAN:
                optimizer_kernel.zero_grad()

            if args.WGAN:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real,
                                   dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                #mmd2_D,K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                mmd2_D, K = mix_rbf_mmd2(dis_input_real, dis_input_fake,
                                         args.sigma_list)
                #tbd
                #rewards = K[0]+K[1]-2*K[2]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()  # exp - gen, maximize (gen label negative)
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD

                # prep for generator
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            elif args.GEOMGAN:
                # larger, better, but slower
                noise_num = 100
                mmd2_D, K = mix_imp_mmd2(e_o_enc, g_o_enc, noise_num,
                                         noise_dim, kernel_net, cuda)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()
                errD = mmd2_D  #+ args.lambda_rg * one_side_errD
                discrim_loss = -errD  # maximize errD

                # prep for generator
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                               discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))
            if not (args.EBGAN or args.GMMIL or args.GEOMGAN):
                # only the WGAN / vanilla GAN branches actually train discrim_net here
                discrim_loss.backward()
                optimizer_discrim.step()
            if args.GEOMGAN:
                optimizer_kernel.step()
        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(
                    i * args.ppo_batch_size,
                    min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy,
                         optimizer_value, 1, states_b, actions_b, returns_b,
                         advantages_b, fixed_log_probs_b, args.clip_epsilon,
                         args.l2_reg)

        return rewards

    if args.GEOMGAN:
        return (policy_net, value_net, discrim_net, kernel_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                optimizer_kernel, agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim,
                scheduler_kernel)
    else:
        return (policy_net, value_net, discrim_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim)
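
# Hypothetical consumer sketch (an addition, not from this example): how the
# tuple returned by create_networks() might be unpacked and driven. max_iter is
# an assumed argument; agent.collect_samples follows the pattern used elsewhere
# in these examples.
nets = create_networks()
if args.GEOMGAN:
    (policy_net, value_net, discrim_net, kernel_net,
     optimizer_policy, optimizer_value, optimizer_discrim, optimizer_kernel,
     agent, update_params,
     scheduler_policy, scheduler_value, scheduler_discrim,
     scheduler_kernel) = nets
else:
    (policy_net, value_net, discrim_net,
     optimizer_policy, optimizer_value, optimizer_discrim,
     agent, update_params,
     scheduler_policy, scheduler_value, scheduler_discrim) = nets

for i_iter in range(args.max_iter):
    batch, log = agent.collect_samples(args.min_batch_size)  # roll out trajectories
    update_params(batch, i_iter)                             # discriminator + PPO update
    scheduler_policy.step()
    scheduler_value.step()
    scheduler_discrim.step()
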
Example 6
def update_params(batch, i_iter, opt):
    """update discriminator"""
    reirl_weights.write(
        reirl(expert_traj[:, :-action_dim], np.stack(batch.state), opt))
    value_net = Value(state_dim)
    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate)
    if i_iter > 0:
        j_max = 3  #if i_iter < 20 else 15
        for j in range(j_max):  #3):
            batch, log = ppo_agent.collect_samples(3000)
            print('{}\tT_sample {}\texpert_R_avg {}\tR_avg {}'.format(
                j, log['sample_time'], log['avg_c_reward'], log['avg_reward']))
            states = torch.from_numpy(np.stack(
                batch.state)).to(dtype).to(device)
            player_actions = torch.from_numpy(np.stack(
                batch.player_action)).to(dtype).to(device)
            opponent_actions = torch.from_numpy(np.stack(
                batch.opponent_action)).to(dtype).to(device)
            rewards = torch.from_numpy(np.stack(
                batch.reward)).to(dtype).to(device)
            masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
            with torch.no_grad():
                values = value_net(states)
                fixed_log_probs = policy_net.get_log_prob(
                    states, player_actions)
                opponent_fixed_log_probs = opponent_net.get_log_prob(
                    states, opponent_actions)
            """get advantage estimation from the trajectories"""
            advantages, returns = estimate_advantages(rewards, masks, values,
                                                      args.gamma, args.tau,
                                                      device)
            """perform mini-batch PPO update"""
            optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
            for _ in range(optim_epochs):
                perm = np.arange(states.shape[0])
                np.random.shuffle(perm)
                perm = LongTensor(perm).to(device)

                states, player_actions, opponent_actions, returns, advantages, fixed_log_probs, opponent_fixed_log_probs = \
                    states[perm].clone(), player_actions[perm].clone(), \
                    opponent_actions[perm].clone(), returns[perm].clone(), \
                    advantages[perm].clone(), \
                    fixed_log_probs[perm].clone(), opponent_fixed_log_probs[
                        perm].clone()

                for i in range(optim_iter_num):
                    ind = slice(
                        i * optim_batch_size,
                        min((i + 1) * optim_batch_size, states.shape[0]))
                    states_b, player_actions_b, opponent_actions_b, advantages_b, returns_b, fixed_log_probs_b, opponent_fixed_log_probs_b = \
                        states[ind], player_actions[ind], opponent_actions[ind], \
                        advantages[ind], returns[ind], fixed_log_probs[ind], \
                        opponent_fixed_log_probs[ind]

                    # Update the player
                    ppo_step(policy_net,
                             value_net,
                             optimizer_policy,
                             optimizer_value,
                             1,
                             states_b,
                             player_actions_b,
                             returns_b,
                             advantages_b,
                             fixed_log_probs_b,
                             args.clip_epsilon,
                             args.l2_reg,
                             max_grad=max_grad)
                    # Update the opponent
                    ppo_step(opponent_net,
                             value_net,
                             optimizer_opponent,
                             optimizer_value,
                             1,
                             states_b,
                             opponent_actions_b,
                             returns_b,
                             advantages_b,
                             opponent_fixed_log_probs_b,
                             args.clip_epsilon,
                             args.l2_reg,
                             opponent=True,
                             max_grad=max_grad)
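
# Hypothetical calling sketch (an addition, not from this example): how the
# update above might be driven each outer iteration. max_iter_num and
# min_batch_size are assumed arguments; opt carries the RE-IRL options and is
# assumed to come from the surrounding script.
for i_iter in range(args.max_iter_num):
    batch, log = ppo_agent.collect_samples(args.min_batch_size)
    update_params(batch, i_iter, opt)
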
Example 7
        else:
            policy_net = Policy(state_dim,
                                env.action_space.shape[0],
                                log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))

policy_net.to(device)
value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=args.learning_rate)

params = list(policy_net.parameters()) + list(value_net.parameters())
unique_optimizer = torch.optim.Adam(params, lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = args.optim_epochs
optim_batch_size = args.optim_batch_size
"""create agent"""
agent = Agent(env,
              policy_net,
              device,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)
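
# Hypothetical continuation (an addition, not from this example): the mini-batch
# PPO update that optim_epochs / optim_batch_size would typically feed, modeled
# on the update_params functions in the neighboring examples. dtype, LongTensor,
# estimate_advantages, ppo_step and the args fields are assumed to come from the
# same utilities used there.
def update_params(batch, i_iter):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)
        fixed_log_probs = policy_net.get_log_prob(states, actions)

    # generalized advantage estimation over the sampled trajectories
    advantages, returns = estimate_advantages(rewards, masks, values,
                                              args.gamma, args.tau, device)

    optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
    for _ in range(optim_epochs):
        perm = LongTensor(np.random.permutation(states.shape[0])).to(device)
        states, actions, returns, advantages, fixed_log_probs = \
            states[perm], actions[perm], returns[perm], advantages[perm], fixed_log_probs[perm]

        for i in range(optim_iter_num):
            ind = slice(i * optim_batch_size,
                        min((i + 1) * optim_batch_size, states.shape[0]))
            ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1,
                     states[ind], actions[ind], returns[ind], advantages[ind],
                     fixed_log_probs[ind], args.clip_epsilon, args.l2_reg)
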
Example 8
class SAC_Alpha:
    def __init__(self,
                 env,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_a=3e-4,
                 lr_q=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 target_update_delay=1,
                 seed=1,
                 ):
        self.env = env
        self.gamma = gamma
        self.polyak = polyak
        self.memory = FixedMemory(memory_size)
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_a = lr_a
        self.lr_q = lr_q
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.target_update_delay = target_update_delay
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]

        self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0]

        self.target_entropy = - np.prod(self.env.action_space.shape)
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions, action_limit=self.action_high).to(device)

        self.q_net_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_2 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_2 = Value(self.num_states + self.num_actions).to(device)

        # entropy temperature alpha, initialized to exp(0) = 1 and optimized directly
        self.alpha = torch.exp(torch.zeros(1, device=device)).requires_grad_()

        self.q_net_target_1.load_state_dict(self.q_net_1.state_dict())
        self.q_net_target_2.load_state_dict(self.q_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, _ = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        return action, None

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action, _ = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter, step):
        """interact"""

        state = self.env.reset()
        episode_reward = 0

        while True:

            if self.render:
                self.env.render()

            action, _ = self.choose_action(state)

            next_state, reward, done, _ = self.env.step(action)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask')
            self.memory.push(state, action, reward, next_state, mask)

            episode_reward += reward

            if step >= self.min_update_step and step % self.update_step == 0:
                for k in range(1, self.update_step + 1):
                    batch = self.memory.sample(self.batch_size)  # random sample batch
                    self.update(batch, k)

            if done:
                break

            state = next_state

        self.env.close()

        print(f"Iter: {i_iter}, reward: {episode_reward}")
        # record reward information
        writer.add_scalar("sac_alpha/reward", episode_reward, i_iter)

    def update(self, batch, k_iter):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by SAC Alpha
        sac_alpha_step(self.policy_net, self.q_net_1, self.q_net_2, self.alpha, self.q_net_target_1,
                       self.q_net_target_2,
                       self.optimizer_p, self.optimizer_q_1, self.optimizer_q_2, self.optimizer_a, batch_state,
                       batch_action, batch_reward, batch_next_state, batch_mask, self.gamma, self.polyak,
                       self.target_entropy,
                       k_iter % self.target_update_delay == 0)

    def load(self, model_path):
        print(f"Loading Saved Model from {model_path}")
        self.policy_net, self.q_net_1, self.q_net_2, self.alpha = torch.load(model_path, map_location=device)

    def save(self, save_path):
        """save model"""
        if not os.path.exists(save_path):
            os.mkdir(save_path)

        torch.save((self.policy_net, self.q_net_1, self.q_net_2, self.alpha), f"{save_path}/WebEye_sac_alpha.pt")
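
# Hypothetical usage sketch (an addition, not from the original example),
# mirroring the DDPG driver above; environment name and step counters are
# assumptions.
if __name__ == "__main__":
    import gym
    from torch.utils.tensorboard import SummaryWriter

    sac = SAC_Alpha(env=gym.make("Pendulum-v0"))
    writer = SummaryWriter(log_dir="runs/sac_alpha_demo")

    global_step = 0
    for i_iter in range(1, 201):
        sac.learn(writer, i_iter, global_step)
        global_step += 200  # rough per-episode step count (assumption)
        if i_iter % 20 == 0:
            sac.eval(i_iter)
    sac.save("checkpoints")
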
Example 9
def train_v_upper_envelope(states,
                           actions,
                           returns,
                           state_dim,
                           device,
                           seed,
                           upper_learning_rate=3e-3,
                           weight_decay=0.02,
                           max_step_num=int(1e6),
                           consecutive_steps=4,
                           k=10000):

    states = torch.from_numpy(np.array(states))
    actions = torch.from_numpy(np.array(actions))
    returns = torch.from_numpy(np.array(returns))  # returns is actually Gts

    use_gpu = True if device == "cuda:0" else False

    # Init upper_envelope net (*use relu as activation function
    upper_envelope = Value(state_dim, activation='relu')
    upper_envelope_retrain = Value(state_dim, activation='relu')

    optimizer_upper = torch.optim.Adam(upper_envelope.parameters(),
                                       lr=upper_learning_rate,
                                       weight_decay=weight_decay)
    optimizer_upper_retrain = torch.optim.Adam(
        upper_envelope_retrain.parameters(),
        lr=upper_learning_rate,
        weight_decay=weight_decay)

    if use_gpu:
        upper_envelope = upper_envelope.cuda()
        upper_envelope_retrain = upper_envelope_retrain.cuda()

    # =========================== #
    # Split data into training and testing #
    # But make sure the highest Ri is in the training set

    # pick out the highest data point
    highestR, indice = torch.max(returns, 0)
    highestR = highestR.view(-1, 1)
    highestS = states[indice]
    highestA = actions[indice]
    print("HighestR:", highestR)

    statesW = torch.cat((states[:indice], states[indice + 1:]))
    actionsW = torch.cat((actions[:indice], actions[indice + 1:]))
    returnsW = torch.cat((returns[:indice], returns[indice + 1:]))

    # shuffle the data
    perm = np.arange(statesW.shape[0])
    np.random.shuffle(perm)
    perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)
    statesW, actionsW, returnsW = statesW[perm], actionsW[perm], returnsW[perm]

    # divide data into train/test
    divide = int(states.shape[0] * 0.8)
    train_states, train_actions, train_returns = \
        statesW[:divide], actionsW[:divide], returnsW[:divide]
    test_states, test_actions, test_returns = \
        statesW[divide:], actionsW[divide:], returnsW[divide:]

    # add the highest data into training
    print(train_states.size(), highestS.size())
    print(train_actions.size(), highestA.size())
    print(train_returns.size(), highestR.size())
    train_states = torch.cat((train_states.squeeze(), highestS.unsqueeze(0)))
    train_actions = torch.cat((train_actions.squeeze(), highestA.unsqueeze(0)))
    train_returns = torch.cat(
        (train_returns.squeeze(), highestR.squeeze().unsqueeze(0)))

    # train upper envelope
    # env_dummy = env_factory(0)
    # state_dim = env_dummy.observation_space.shape[0]
    # upper_envelope = Value(state_dim)
    # optimizer = torch.optim.Adam(upper_envelope.parameters(), lr=0.003, weight_decay=20)

    epoch_n = 100
    batch_size = 64
    optim_iter_num = int(math.ceil(train_states.shape[0] / batch_size))

    num_increase = 0
    previous_loss = math.inf

    calculate_vali = 2
    best_parameters = upper_envelope.state_dict()
    running_training_steps = 0
    best_training_steps = running_training_steps

    # Upper Envelope Training starts
    upper_envelope.train()

    while num_increase < consecutive_steps:
        # update theta for n steps, n = calculate_vali
        # train calculate_vali steps
        for i in range(calculate_vali):
            train_loss = 0
            perm = np.arange(train_states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)

            train_states, train_actions, train_returns = train_states[
                perm], train_actions[perm], train_returns[perm]

            for i in range(optim_iter_num):
                ind = slice(i * batch_size,
                            min((i + 1) * batch_size, train_states.shape[0]))
                states_b, returns_b = train_states[ind], train_returns[ind]
                states_b = Variable(states_b.float())
                returns_b = Variable(returns_b.float())
                Vsi = upper_envelope(states_b)
                # loss = loss_fn(Vsi, returns_b)
                loss = L2PenaltyLoss(Vsi, returns_b, k_val=k)
                train_loss += loss.detach()
                upper_envelope.zero_grad()
                loss.backward()
                optimizer_upper.step()

        # early stopping

        running_training_steps += calculate_vali

        # calculate validation error
        test_iter = int(math.ceil(test_states.shape[0] / batch_size))
        validation_loss = 0
        for n in range(test_iter):
            ind = slice(n * batch_size,
                        min((n + 1) * batch_size, test_states.shape[0]))
            states_t, returns_t = test_states[ind], test_returns[ind]
            states_t = Variable(states_t.float())
            returns_t = Variable(returns_t.float())
            Vsi = upper_envelope(states_t)
            loss = L2PenaltyLoss(Vsi, returns_t, k_val=k)
            validation_loss += loss

        if validation_loss < previous_loss:
            best_training_steps = running_training_steps
            previous_loss = validation_loss
            best_parameters = upper_envelope.state_dict()
            num_increase = 0
        else:
            num_increase += 1

    print("best_training_steps:", best_training_steps)
    upper_envelope.load_state_dict(best_parameters)

    # retrain on the whole set
    upper_envelope_retrain.train()

    optim_iter_num = int(math.ceil(states.shape[0] / batch_size))
    for i in range(best_training_steps):
        train_loss = 0
        perm = np.arange(states.shape[0])
        np.random.shuffle(perm)
        perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)

        states, actions, returns = states[perm], actions[perm], returns[perm]

        for i in range(optim_iter_num):
            ind = slice(i * batch_size,
                        min((i + 1) * batch_size, states.shape[0]))
            states_b, returns_b = states[ind], returns[ind]
            states_b = Variable(states_b.float())
            returns_b = Variable(returns_b.float())
            Vsi = upper_envelope_retrain(states_b)
            #loss = loss_fn(Vsi, returns_b)
            loss = L2PenaltyLoss(Vsi, returns_b, k_val=k)
            train_loss += loss.detach()
            upper_envelope_retrain.zero_grad()
            loss.backward()
            optimizer_upper_retrain.step()

    upper_envelope.load_state_dict(upper_envelope_retrain.state_dict())
    print("Policy training is complete.")

    return upper_envelope
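
# Hypothetical calling sketch (an addition, not from the original example): the
# three lists are assumed to come from rolled-out trajectories, with `returns`
# holding per-state Monte-Carlo returns Gt.
upper_envelope = train_v_upper_envelope(
    states=traj_states,      # list/array of states, each of shape (state_dim,)
    actions=traj_actions,    # matching actions; carried along, the fit uses only states/returns
    returns=traj_returns,    # Monte-Carlo returns Gt, one per state
    state_dim=state_dim,
    device="cuda:0",
    seed=1)
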
Example 10
class MAGAIL:
    def __init__(self, config, log_dir, exp_name):
        self.config = config
        self.exp_name = exp_name
        self.writer = SummaryWriter(log_dir=f"{log_dir}/{self.exp_name}")
        """seeding"""
        seed = self.config["general"]["seed"]
        torch.manual_seed(seed)
        np.random.seed(seed)

        self._load_expert_data()
        self._init_model()

    def _init_model(self):
        self.V = Value(num_states=self.config["value"]["num_states"],
                       num_hiddens=self.config["value"]["num_hiddens"],
                       drop_rate=self.config["value"]["drop_rate"],
                       activation=self.config["value"]["activation"])
        self.P = JointPolicy(
            initial_state=self.expert_dataset.state.to(device),
            config=self.config["jointpolicy"])
        self.D = Discriminator(
            num_states=self.config["discriminator"]["num_states"],
            num_actions=self.config["discriminator"]["num_actions"],
            num_hiddens=self.config["discriminator"]["num_hiddens"],
            drop_rate=self.config["discriminator"]["drop_rate"],
            use_noise=self.config["discriminator"]["use_noise"],
            noise_std=self.config["discriminator"]["noise_std"],
            activation=self.config["discriminator"]["activation"])

        print("Model Structure")
        print(self.P)
        print(self.V)
        print(self.D)
        print()

        self.optimizer_policy = optim.Adam(
            self.P.parameters(),
            lr=self.config["jointpolicy"]["learning_rate"])
        self.optimizer_value = optim.Adam(
            self.V.parameters(), lr=self.config["value"]["learning_rate"])
        self.optimizer_discriminator = optim.Adam(
            self.D.parameters(),
            lr=self.config["discriminator"]["learning_rate"])
        self.scheduler_discriminator = optim.lr_scheduler.StepLR(
            self.optimizer_discriminator, step_size=2000, gamma=0.95)

        self.discriminator_func = nn.BCELoss()

        to_device(self.V, self.P, self.D, self.discriminator_func)

    def _load_expert_data(self):
        num_expert_states = self.config["general"]["num_states"]
        num_expert_actions = self.config["general"]["num_actions"]
        expert_batch_size = self.config["general"]["expert_batch_size"]

        self.expert_dataset = ExpertDataSet(
            data_set_path=self.config["general"]["expert_data_path"],
            num_states=num_expert_states,
            num_actions=num_expert_actions)
        self.expert_data_loader = DataLoader(
            dataset=self.expert_dataset,
            batch_size=expert_batch_size,
            shuffle=True,
            num_workers=multiprocessing.cpu_count() // 2)

    def train(self, epoch):
        self.P.train()
        self.D.train()
        self.V.train()

        # collect generated batch
        gen_batch = self.P.collect_samples(
            self.config["ppo"]["sample_batch_size"])
        # batch: ('state', 'action', 'next_state', 'log_prob', 'mask')
        gen_batch_state = trans_shape_func(
            torch.stack(gen_batch.state
                        ))  # [trajectory length * parallel size, state size]
        gen_batch_action = trans_shape_func(
            torch.stack(gen_batch.action
                        ))  # [trajectory length * parallel size, action size]
        gen_batch_next_state = trans_shape_func(
            torch.stack(gen_batch.next_state)
        )  # [trajectory length * parallel size, state size]
        gen_batch_old_log_prob = trans_shape_func(
            torch.stack(
                gen_batch.log_prob))  # [trajectory length * parallel size, 1]
        gen_batch_mask = trans_shape_func(torch.stack(
            gen_batch.mask))  # [trajectory length * parallel size, 1]

        # grad_collect_func = lambda d: torch.cat([grad.view(-1) for grad in torch.autograd.grad(d, self.D.parameters(), retain_graph=True)]).unsqueeze(0)
        ####################################################
        # update discriminator
        ####################################################
        for expert_batch_state, expert_batch_action in self.expert_data_loader:
            gen_r = self.D(gen_batch_state, gen_batch_action)
            expert_r = self.D(expert_batch_state.to(device),
                              expert_batch_action.to(device))

            # label smoothing for discriminator
            expert_labels = torch.ones_like(expert_r)
            gen_labels = torch.zeros_like(gen_r)

            if self.config["discriminator"]["use_label_smoothing"]:
                smoothing_rate = self.config["discriminator"][
                    "label_smooth_rate"]
                expert_labels *= (1 - smoothing_rate)
                gen_labels += torch.ones_like(gen_r) * smoothing_rate

            e_loss = self.discriminator_func(expert_r, expert_labels)
            g_loss = self.discriminator_func(gen_r, gen_labels)
            d_loss = e_loss + g_loss

            # """ WGAN with Gradient Penalty"""
            # d_loss = gen_r.mean() - expert_r.mean()
            # differences_batch_state = gen_batch_state[:expert_batch_state.size(0)] - expert_batch_state
            # differences_batch_action = gen_batch_action[:expert_batch_action.size(0)] - expert_batch_action
            # alpha = torch.rand(expert_batch_state.size(0), 1)
            # interpolates_batch_state = gen_batch_state[:expert_batch_state.size(0)] + (alpha * differences_batch_state)
            # interpolates_batch_action = gen_batch_action[:expert_batch_action.size(0)] + (alpha * differences_batch_action)
            # gradients = torch.cat([x for x in map(grad_collect_func, self.D(interpolates_batch_state, interpolates_batch_action))])
            # slopes = torch.norm(gradients, p=2, dim=-1)
            # gradient_penalty = torch.mean((slopes - 1.) ** 2)
            # d_loss += 10 * gradient_penalty

            self.optimizer_discriminator.zero_grad()
            d_loss.backward()
            self.optimizer_discriminator.step()

            self.scheduler_discriminator.step()

        self.writer.add_scalar('train/loss/d_loss', d_loss.item(), epoch)
        self.writer.add_scalar("train/loss/e_loss", e_loss.item(), epoch)
        self.writer.add_scalar("train/loss/g_loss", g_loss.item(), epoch)
        self.writer.add_scalar('train/reward/expert_r',
                               expert_r.mean().item(), epoch)
        self.writer.add_scalar('train/reward/gen_r',
                               gen_r.mean().item(), epoch)

        with torch.no_grad():
            gen_batch_value = self.V(gen_batch_state)
            gen_batch_reward = self.D(gen_batch_state, gen_batch_action)

        gen_batch_advantage, gen_batch_return = estimate_advantages(
            gen_batch_reward, gen_batch_mask, gen_batch_value,
            self.config["gae"]["gamma"], self.config["gae"]["tau"],
            self.config["jointpolicy"]["trajectory_length"])

        ####################################################
        # update policy by ppo [mini_batch]
        ####################################################
        ppo_optim_epochs = self.config["ppo"]["ppo_optim_epochs"]
        ppo_mini_batch_size = self.config["ppo"]["ppo_mini_batch_size"]
        gen_batch_size = gen_batch_state.shape[0]
        optim_iter_num = int(math.ceil(gen_batch_size / ppo_mini_batch_size))

        for _ in range(ppo_optim_epochs):
            perm = torch.randperm(gen_batch_size)

            for i in range(optim_iter_num):
                ind = perm[slice(
                    i * ppo_mini_batch_size,
                    min((i + 1) * ppo_mini_batch_size, gen_batch_size))]
                mini_batch_state, mini_batch_action, mini_batch_next_state, mini_batch_advantage, mini_batch_return, \
                mini_batch_old_log_prob = gen_batch_state[ind], gen_batch_action[ind], gen_batch_next_state[ind], \
                                          gen_batch_advantage[ind], gen_batch_return[ind], gen_batch_old_log_prob[ind]

                v_loss, p_loss = ppo_step(
                    self.P,
                    self.V,
                    self.optimizer_policy,
                    self.optimizer_value,
                    states=mini_batch_state,
                    actions=mini_batch_action,
                    next_states=mini_batch_next_state,
                    returns=mini_batch_return,
                    old_log_probs=mini_batch_old_log_prob,
                    advantages=mini_batch_advantage,
                    ppo_clip_ratio=self.config["ppo"]["clip_ratio"],
                    value_l2_reg=self.config["value"]["l2_reg"])

                self.writer.add_scalar('train/loss/p_loss', p_loss, epoch)
                self.writer.add_scalar('train/loss/v_loss', v_loss, epoch)

        print(f" Training episode:{epoch} ".center(80, "#"))
        print('gen_r:', gen_r.mean().item())
        print('expert_r:', expert_r.mean().item())
        print('d_loss', d_loss.item())

    def eval(self, epoch):
        self.P.eval()
        self.D.eval()
        self.V.eval()

        gen_batch = self.P.collect_samples(
            self.config["ppo"]["sample_batch_size"])
        gen_batch_state = torch.stack(gen_batch.state)
        gen_batch_action = torch.stack(gen_batch.action)

        gen_r = self.D(gen_batch_state, gen_batch_action)
        for expert_batch_state, expert_batch_action in self.expert_data_loader:
            expert_r = self.D(expert_batch_state.to(device),
                              expert_batch_action.to(device))

            print(f" Evaluating episode:{epoch} ".center(80, "-"))
            print('validate_gen_r:', gen_r.mean().item())
            print('validate_expert_r:', expert_r.mean().item())

        self.writer.add_scalar("validate/reward/gen_r",
                               gen_r.mean().item(), epoch)
        self.writer.add_scalar("validate/reward/expert_r",
                               expert_r.mean().item(), epoch)

    def save_model(self, save_path):
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        # dump model from pkl file
        # torch.save((self.D, self.P, self.V), f"{save_path}/{self.exp_name}.pt")
        torch.save(self.D, f"{save_path}/{self.exp_name}_Discriminator.pt")
        torch.save(self.P, f"{save_path}/{self.exp_name}_JointPolicy.pt")
        torch.save(self.V, f"{save_path}/{self.exp_name}_Value.pt")

    def load_model(self, model_path):
        # load entire model
        # self.D, self.P, self.V = torch.load((self.D, self.P, self.V), f"{save_path}/{self.exp_name}.pt")
        self.D = torch.load(f"{model_path}_Discriminator.pt",
                            map_location=device)
        self.P = torch.load(f"{model_path}_JointPolicy.pt",
                            map_location=device)
        self.V = torch.load(f"{model_path}_Value.pt", map_location=device)
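
# Hypothetical driver sketch (an addition, not from the original example): the
# config dict, epoch budget, and save directory are assumptions.
magail = MAGAIL(config=config, log_dir="runs", exp_name="magail_demo")
for epoch in range(1, 5001):
    magail.train(epoch)
    if epoch % 50 == 0:
        magail.eval(epoch)
        magail.save_model("checkpoints")
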
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim,
                                    env.action_space.n,
                                    hidden_size=(64, 32),
                                    activation='relu')
    else:
        policy_net = Policy(state_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std,
                            hidden_size=(64, 32),
                            activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')
    if args.AL:
        discrim_net = SNDiscriminator(state_dim + action_dim,
                                      hidden_size=(32, 16),
                                      activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=64,
                                        activation='leakyrelu',
                                        slope=0.1,
                                        dropout=True,
                                        dprob=0.2)
    elif args.VAKLIL:
        noise_dim = 64
        mid_dim = 32
        discrim_net = VAEDiscriminator(state_dim + action_dim,
                                       num_outputs=noise_dim,
                                       sigmoid_out=False,
                                       sn=True,
                                       test=False,
                                       w_init=False,
                                       hidden_size_enc=(),
                                       hidden_size_dec=(),
                                       encode_size=mid_dim,
                                       activation='relu',
                                       dropout=False)
        kernel_net = NoiseNet(noise_dim,
                              hidden_size=(32, ),
                              encode_size=noise_dim,
                              activation='relu',
                              dropout=False)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(),
                                            lr=args.learning_rate)
        scheduler_kernel = MultiStepLR(optimizer_kernel,
                                       milestones=args.milestones,
                                       gamma=args.lr_kernel_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim,
                                    hidden_size=(32, 16),
                                    activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(),
                                         lr=args.learning_rate)

    scheduler_policy = MultiStepLR(optimizer_policy,
                                   milestones=args.milestones,
                                   gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value,
                                  milestones=args.milestones,
                                  gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim,
                                    milestones=args.milestones,
                                    gamma=args.lr_kernel_decay)

    if args.AL:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()

        learned_reward = ExpertReward()
    elif args.EBGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(
                        recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.VAKLIL:

        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()
    """create agent"""
    agent = Agent(env,
                  policy_net,
                  device,
                  custom_reward=learned_reward,
                  running_state=None,
                  render=args.render,
                  num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(
            batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(
            batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(
            batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(
            batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)
        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  args.gamma, args.tau, device)
        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(
                expert_traj[exp_idx, :]).to(dtype).to(device)

            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN:
                # The EBGAN discriminator is an autoencoder; use its reconstruction output,
                # matching the two-value return used in ExpertReward above.
                _, g_o = discrim_net(dis_input_fake)
                _, e_o = discrim_net(dis_input_real)
            elif args.GMMIL or args.VAKLIL:
                g_o_enc, g_mu, g_sigma = discrim_net(dis_input_fake,
                                                     mean_mode=False)
                e_o_enc, e_mu, e_sigma = discrim_net(dis_input_real,
                                                     mean_mode=False)
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.VAKLIL:
                optimizer_kernel.zero_grad()

            if args.AL:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real,
                                   dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                mmd2_D, K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                rewards = K[1] - K[2]        # -(exp - gen): kyy - kxy
                rewards = -rewards.detach()  # exp - gen: higher reward when the policy matches the expert
                errD = mmd2_D
                discrim_loss = -errD  # gradient ascent on the MMD estimate

                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            elif args.VAKLIL:
                noise_num = 20000
                mmd2_D_net, _, penalty = mix_imp_with_bw_mmd2(
                    e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda,
                    args.sigma_list)
                mmd2_D_rbf, _ = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                errD = (mmd2_D_net + mmd2_D_rbf) / 2
                i_c = 0.2  # target information constraint for the bottleneck
                mu_all = torch.cat((e_mu, g_mu), dim=0)
                sigma_all = torch.cat((e_sigma, g_sigma), dim=0)
                # KL(N(mu, sigma) || N(0, I)) per sample; 1e-8 keeps the log numerically stable.
                bottleneck_loss = torch.mean(0.5 * torch.sum(
                    mu_all**2 + sigma_all**2 - torch.log(sigma_all**2 + 1e-8) - 1,
                    dim=1)) - i_c
                discrim_loss = -errD + (args.beta * bottleneck_loss) + (
                    args.lambda_h * penalty)
            else:
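                # Label convention (inferred from the -log(D) reward above): discrim_net outputs
                # the probability that a state-action pair was generated by the policy, so
                # generator samples are labelled 1 and expert samples 0.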
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                               discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))

            discrim_loss.backward()
            optimizer_discrim.step()
            if args.VAKLIL:
                optimizer_kernel.step()

        if args.VAKLIL:
            with torch.no_grad():
                noise_num = 20000
                g_o_enc, _, _ = discrim_net(dis_input_fake)
                e_o_enc, _, _ = discrim_net(dis_input_real)
                _, K_net, _ = mix_imp_with_bw_mmd2(e_o_enc, g_o_enc, noise_num,
                                                   noise_dim, kernel_net, cuda,
                                                   args.sigma_list)
                _, K_rbf = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                K = [sum(x) / 2 for x in zip(K_net, K_rbf)]  # average implicit-kernel and RBF statistics
                rewards = K[1] - K[2]  # -(exp - gen): kyy - kxy
                rewards = -rewards     # exp - gen; no detach needed inside torch.no_grad()
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(
                    i * args.ppo_batch_size,
                    min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy,
                         optimizer_value, 1, states_b, actions_b, returns_b,
                         advantages_b, fixed_log_probs_b, args.clip_epsilon,
                         args.l2_reg)

        # Returned for logging; for GMMIL / VAKLIL these are the kernel-based rewards
        # recomputed above rather than the rewards stored in the batch.
        return rewards

    if args.VAKLIL:
        return (policy_net, value_net, discrim_net, kernel_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                optimizer_kernel, agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim,
                scheduler_kernel)
    else:
        return (policy_net, value_net, discrim_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim)
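A minimal usage sketch of the factory above, for orientation only: it assumes the enclosing function is called create_gail_components, that Agent exposes a collect_samples(min_batch_size) method as in the neighbouring examples, that args carries max_iter_num and min_batch_size, and that the LR schedulers are stepped once per iteration; none of these names appear in the original snippet.

# Hypothetical driver loop (non-VAKLIL case); all names flagged above are assumptions.
(policy_net, value_net, discrim_net,
 optimizer_policy, optimizer_value, optimizer_discrim,
 agent, update_params,
 scheduler_policy, scheduler_value, scheduler_discrim) = create_gail_components(args)

for i_iter in range(args.max_iter_num):
    batch, log = agent.collect_samples(args.min_batch_size)  # on-policy rollouts with the learned reward
    rewards = update_params(batch, i_iter)                   # discriminator update + PPO policy update
    for scheduler in (scheduler_policy, scheduler_value, scheduler_discrim):
        scheduler.step()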
Esempio n. 12
0
def learn_model(args):

    print("RL result will be saved at %s" % args.rl_filename)
    print("RL model will be saved at %s" % args.rl_model_filename)
    if use_gpu:
        print("Using CUDA.")

    torch.manual_seed(args.rl_seed)
    if use_gpu:
        torch.cuda.manual_seed_all(args.rl_seed)
        torch.backends.cudnn.deterministic = True
    np.random.seed(args.rl_seed)
    random.seed(args.rl_seed)

    env = gym.make(args.env_name)
    env.seed(args.rl_seed)

    env_test = gym.make(args.env_name)
    env_test.seed(args.rl_seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    a_bound = float(env.action_space.high[0])  # np.asscalar is removed in recent NumPy; float() is equivalent
    a_low = float(env.action_space.low[0])
    assert a_bound == -a_low

    # Flag for manually clipping actions passed to env.step() after Gaussian exploration noise is added.
    clip = (args.env_name == "LunarLanderContinuous-v2"
            or args.env_name == "BipedalWalker-v2")

    print(env.observation_space)
    print(env.action_space)
    """define actor and critic"""
    policy_net = Policy(state_dim,
                        action_dim,
                        log_std=args.log_std,
                        a_bound=a_bound,
                        hidden_size=args.hidden_size,
                        activation=args.activation).to(device)
    value_net = Value(state_dim,
                      hidden_size=args.hidden_size,
                      activation=args.activation).to(device)

    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate_v)
    decayed_lambda_td = args.lambda_td
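    # decayed_lambda_td is multiplied by args.decay_td after every policy update (see update_params_p below).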

    def update_params_c(batch, i_iter):
        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(
            np.float32)).to(device)
        """get advantage estimation from the trajectories"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(
            rewards, masks, values, args.gamma, args.tau)

        if args.lamret:
            returns = lambda_returns
        else:
            returns = mc_returns
        """perform critic update"""
        #gae_step(value_net, optimizer_value, states, lambda_returns, args.l2_reg)  # full batch GD
        gae_step_epoch(value_net, optimizer_value, states, returns,
                       args.l2_reg)  # Stochastic GD

    """ Function to update the parameters of value and policy networks"""

    def update_params_p(batch, i_iter):

        nonlocal decayed_lambda_td

        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        next_states = torch.from_numpy(np.stack(
            batch.next_state)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(
            np.float32)).to(device)
        """get advantage estimation from the trajectories, this is done after gae_step update"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(
            rewards, masks, values, gamma=args.gamma, tau=args.tau)

        if args.method_name == "TRPO-RET-MC":
            # detach() is not strictly needed here: only the policy network is backpropagated.
            returns = mc_returns.detach()
        elif args.method_name == "TRPO-RET-GAE":
            returns = lambda_returns.detach()
        else:
            returns = 0  # returns is unused for TRPO and TRPO-TD.

        # Standardize advantages: mean-centered and std-scaled if args.mgae, otherwise std-scaled only.
        if args.mgae:
            advantages = (advantages - advantages.mean()) / advantages.std()
        else:
            advantages = advantages / advantages.std()

        trpo_step_td(policy_net=policy_net, value_net=value_net, states=states,
                     actions=actions, next_states=next_states, rewards=rewards,
                     masks=masks, gamma=args.gamma, advantages=advantages,
                     max_kl=args.max_kl, damping=args.damping,
                     lambda_td=decayed_lambda_td, method_name=args.method_name,
                     returns=returns, mtd=args.mtd)
        """ decay the td_reg parameter after update """
        decayed_lambda_td = decayed_lambda_td * args.decay_td

    """create agent"""
    agent = Agent(env, policy_net, render=False)
    agent_test = Agent(env_test,
                       policy_net,
                       mean_action=True,
                       render=args.render)
    """ The actual learning loop"""
    for i_iter in range(args.rl_max_iter_num):
        """ Save the learned policy model """
        # Save at the configured interval, at iteration 0, and at the final iteration.
        if ((i_iter % args.rl_save_model_interval == 0 and args.rl_save_model_interval > 0)
                or i_iter == args.rl_max_iter_num - 1 or i_iter == 0):

            policy_net = policy_net.to(device_cpu)
            value_net = value_net.to(device_cpu)

            pickle.dump((policy_net, value_net),
                        open(args.rl_model_filename + ("_I%d.p" % (i_iter)),
                             'wb'))

            policy_net = policy_net.to(device)
            value_net = value_net.to(device)
        """ Test the policy before update """
        if i_iter % args.log_interval == 0 or i_iter + 1 == args.rl_max_iter_num:
            _, log_test = agent_test.collect_samples_test(max_num_episodes=20,
                                                          render=args.render,
                                                          clip=clip)
        """generate multiple trajectories that reach the minimum batch_size"""
        t0 = time.time()
        batch, log = agent.collect_samples_train(
            args.min_batch_size, render=False,
            clip=clip)  # this is on-policy samples
        t1 = time.time()
        """ update parameters """
        t0_d = time.time()
        update_params_c(batch, i_iter)  # critic update
        update_params_p(batch, i_iter)  # actor update
        t1_d = time.time()
        """ Print out result to stdout and save it to a text file for later usage"""
        if i_iter % args.log_interval == 0:

            result_text = t_format("Iter %6d (%2.2fs)+(%2.2fs)" %
                                   (i_iter, t1 - t0, t1_d - t0_d))
            result_text += " | [R] " + t_format(
                "Avg: %.2f (%.2f)" % (log['avg_reward'], log['std_reward']), 2)
            result_text += " | [R_test] " + t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                            + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            print(result_text)

            with open(args.rl_filename, 'a') as f:
                print(result_text, file=f)
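For reference, a minimal sketch of the generalized advantage estimation that estimate_advantages is assumed to perform in this example (the helper used above additionally returns Monte-Carlo returns; this simplified version shows only the GAE recursion and the lambda-returns):

import torch

def estimate_advantages_sketch(rewards, masks, values, gamma, tau):
    # delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
    # A_t     = delta_t + gamma * tau * mask_t * A_{t+1}   (computed backwards in time)
    advantages = torch.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    for t in reversed(range(rewards.size(0))):
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = delta + gamma * tau * masks[t] * prev_advantage
        prev_value, prev_advantage = values[t], advantages[t]
    lambda_returns = advantages + values  # TD(lambda) targets for the critic
    return advantages, lambda_returns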
Esempio n. 13
0
class PPO(object):
    def __init__(self,
                 args,
                 state_dim,
                 action_dim,
                 is_dict_action=False,
                 is_atari=False):

        self.device = args.device
        self.config = args
        if is_atari:
            self.actor = CNNPolicy(state_dim, action_dim).to(self.device)
            self.critic = CNNCritic(state_dim).to(self.device)
        else:
            self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
                Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
            self.critic = Value(state_dim).to(self.device)

        # initialize optimizer for actor and critic
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.learning_rate)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.learning_rate)

        # optimization epoch number and batch size for PPO
        self.optim_epochs = 10
        self.optim_batch_size = 64

    def train(self, batch):
        """
        Train the policy using the given batch.
        :param batch:
        :return:
        """

        states = torch.DoubleTensor(np.stack(batch.state)).to(self.device)
        actions = torch.DoubleTensor(np.stack(batch.action)).to(self.device)
        rewards = torch.DoubleTensor(np.stack(batch.reward)).to(self.device)
        masks = torch.DoubleTensor(np.stack(batch.mask)).to(self.device)

        with torch.no_grad():
            values = self.critic(states)
            fixed_log_probs = self.actor.get_log_prob(states, actions)

        # get advantage estimation from the trajectories
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  self.config.gamma,
                                                  self.config.tau, self.device)

        # compute minibatch size
        optim_iter_num = int(math.ceil(states.shape[0] /
                                       self.optim_batch_size))

        # PPO updates
        for _ in range(self.optim_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = torch.LongTensor(perm).to(self.device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(
                    i * self.optim_batch_size,
                    min((i + 1) * self.optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                self.ppo_step(states_b, actions_b, returns_b, advantages_b,
                              fixed_log_probs_b)

    def ppo_step(self, states, actions, returns, advantages, fixed_log_probs):
        """
        A PPO policy gradient update step.
        :param states:
        :param actions:
        :param returns:
        :param advantages:
        :param fixed_log_probs:
        :return:
        """
        # update critic, for now assume one epoch
        values_pred = self.critic(states)
        value_loss = (values_pred - returns).pow(2).mean()
        # weight decay
        for param in self.critic.parameters():
            value_loss += param.pow(2).sum() * self.config.l2_reg
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

        # update actor
        log_probs = self.actor.get_log_prob(states, actions)
        ratio = torch.exp(log_probs - fixed_log_probs)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1.0 - self.config.clip_epsilon,
                            1.0 + self.config.clip_epsilon) * advantages
        policy_surr = -torch.min(surr1, surr2).mean()
        self.actor_optimizer.zero_grad()
        policy_surr.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 40)
        self.actor_optimizer.step()
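A minimal usage sketch for the PPO class above, not part of the original snippet: the Agent construction, the collect_samples call and the args attributes (env_name, device, max_iter_num, min_batch_size) are assumptions modelled on the neighbouring examples, and inputs may need casting to double to match the DoubleTensor batches used in train().

import gym

env = gym.make(args.env_name)                     # args.env_name is an assumption
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

ppo = PPO(args, state_dim, action_dim, is_dict_action=False, is_atari=False)
agent = Agent(env, ppo.actor, args.device)        # assumed Agent signature (see earlier examples)

for i_iter in range(args.max_iter_num):           # args.max_iter_num is an assumption
    batch, _log = agent.collect_samples(args.min_batch_size)  # assumed sampling API
    ppo.train(batch)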