def train_trpo():

    R_avg = []
    R_max = []
    R_min = []

    for i_iter in range(args.max_iter_num):
        """generate multiple trajectories that reach the minimum batch_size"""
        batch, log = agent.collect_samples(args.min_batch_size)

        t0 = time.time()

        sampled_states = torch.from_numpy(np.stack(
            batch.state)).to(dtype).to(device)
        sampled_actions = torch.from_numpy(np.stack(
            batch.action)).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
        terminals = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)

        with torch.no_grad():
            values = value_net(sampled_states)
        advantages, returns = estimate_advantages(rewards, terminals, values,
                                                  args.gamma, args.tau, device)
        trpo_step(policy_net, value_net, sampled_states, sampled_actions,
                  returns, advantages, args.max_kl, args.damping, args.l2_reg)

        t1 = time.time()

        if i_iter % args.log_interval == 0:
            print(
                '{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'
                .format(i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                        log['max_reward'], log['avg_reward']))

        if args.save_model_interval > 0 and (
                i_iter + 1) % args.save_model_interval == 0:
            to_device(torch.device('cpu'), policy_net, value_net)
            pickle.dump(
                (policy_net, value_net, running_state),
                open(
                    os.path.join(
                        assets_dir(),
                        'learned_models/expert-policies/{}_trpo.p'.format(
                            args.env_name)), 'wb'))
            to_device(device, policy_net, value_net)
        torch.cuda.empty_cache()
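
# estimate_advantages() used above comes from the surrounding codebase and is not shown here.
# A minimal sketch of what it is assumed to compute -- GAE(lambda) with `tau` as the lambda
# parameter and `masks` equal to 0 at episode boundaries; the name *_sketch is illustrative:
import torch


def estimate_advantages_sketch(rewards, masks, values, gamma, tau, device):
    rewards, masks, values = rewards.cpu(), masks.cpu(), values.cpu()
    deltas = torch.zeros(rewards.size(0), 1)
    advantages = torch.zeros(rewards.size(0), 1)
    prev_value, prev_advantage = 0.0, 0.0
    for i in reversed(range(rewards.size(0))):
        # TD residual; masks[i] == 0 cuts the bootstrap at episode ends
        deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values[i]
        advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i]
        prev_value = values[i, 0]
        prev_advantage = advantages[i, 0]
    returns = values + advantages
    advantages = (advantages - advantages.mean()) / advantages.std()  # Normalise advantages
    return advantages.to(device), returns.to(device)
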
def airl_step(batch, i_iter):
    to_device(device, policy_net, value_net, discriminator)

    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)
    X = torch.cat([states, actions], 1).to(dtype).to(device) # Concatenate s,a pairs of agent
    Y = torch.from_numpy(expert_traj).to(dtype).to(device)
    rewards = []
    rs = discriminator(X).detach().clone()
    for r in rs:
        # AIRL-style reward: log D(s, a) - log(1 - D(s, a))
        rewards.append(math.log(r.item()) - math.log(1 - r.item()))
    rewards = torch.tensor(rewards)
    rewards = torch.clamp(rewards, max=10, min=-10)
    survival_bonus = 11  # A survival bonus improved performance on environments where the agent needs to stay alive
    rewards = rewards + survival_bonus
    print(rewards.mean())  # Log the mean shaped reward
    advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)
    sampled_episodes = []
    epis = []
    for pair in range(len(masks)): # Split to episodes (for tracking eval only)
        epis.append(X[pair].cpu().numpy())
        if masks[pair] == 0:
            sampled_episodes.append(epis)
            epis = []
    batch_size = args.batch
    for ep in range(1):  # Single pass over the collected samples for the discriminator update
        permutation = torch.randperm(X.size(0))
        for i in range(0, X.size(0), batch_size):
            indices = permutation[i:i + batch_size]
            batch_x = X[indices]
            learner_samples_disc = discriminator(batch_x)
            expert_samples_disc = discriminator(Y)
            optimizer_discrim.zero_grad()
            discrim_loss = discrim_criterion(learner_samples_disc, zeros((batch_x.shape[0], 1), device=device)) + \
                discrim_criterion(expert_samples_disc, ones((expert_traj.shape[0], 1), device=device))
            discrim_loss.backward()
            optimizer_discrim.step()
    trpo_step(policy_net, value_net, states, actions, returns, advantages, args.max_kl, args.damping, args.l2_reg) # Update policy
    return len(sampled_episodes)
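
# The reward loop in airl_step() evaluates log D(s, a) - log(1 - D(s, a)) one element at a time.
# A vectorised equivalent (a sketch, assuming the discriminator ends in a sigmoid so its outputs
# lie strictly in (0, 1); names ending in _sketch are illustrative, not from the original code):
import torch


def airl_rewards_sketch(discriminator, sa_pairs, clip=10.0, survival_bonus=11.0):
    with torch.no_grad():
        d = discriminator(sa_pairs).squeeze(-1)   # D(s, a) in (0, 1)
        r = torch.log(d) - torch.log1p(-d)        # log D - log(1 - D), i.e. logit(D)
        r = torch.clamp(r, min=-clip, max=clip)   # Same clipping range as above
    return r + survival_bonus                     # Same survival bonus as above
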
def gail_step(batch, i_iter):
    to_device(device, policy_net, value_net, discriminator)
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)
    X = torch.cat([states, actions], 1).to(dtype).to(device) # Concatenate s,a pairs of agent
    Y = torch.from_numpy(expert_traj).to(dtype).to(device)
    rewards = []
    rs = discriminator(X).detach().clone()
    for r in rs:
        rewards.append(-math.log(r.item()))  # GAIL reward: -log D(s, a)
    rewards = torch.tensor(rewards)
    advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)
    sampled_episodes = []
    epis = []
    for pair in range(len(masks)): # Split to episodes for evaluation only
        epis.append(X[pair].cpu().numpy())
        if masks[pair] == 0:
            sampled_episodes.append(epis)
            epis = []
    batch_size = args.batch
    for ep in range(1):  # Single pass over the collected samples for the discriminator update
        permutation = torch.randperm(X.size(0))
        for i in range(0, X.size(0), batch_size):
            indices = permutation[i:i + batch_size]
            batch_x = X[indices]
            learner_samples_disc = discriminator(batch_x)
            expert_samples_disc = discriminator(Y)
            optimizer_discrim.zero_grad()
            discrim_loss = discrim_criterion(learner_samples_disc, ones((batch_x.shape[0], 1), device=device)) + \
                discrim_criterion(expert_samples_disc, zeros((expert_traj.shape[0], 1), device=device))
            discrim_loss.backward()
            optimizer_discrim.step()
    trpo_step(policy_net, value_net, states, actions, returns, advantages, args.max_kl, args.damping, args.l2_reg) # Update policy (TRPO from: https://github.com/Khrylx/PyTorch-RL)
    return len(sampled_episodes)
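
# Both airl_step() and gail_step() rely on globals -- discriminator, discrim_criterion and
# optimizer_discrim -- that are created elsewhere in the script. A minimal sketch of such a
# setup (hidden sizes, dimensions and learning rate are assumptions, not original values):
import torch
import torch.nn as nn

state_dim, action_dim = 17, 6  # Illustrative dimensions (e.g. a MuJoCo locomotion task)

discriminator_sketch = nn.Sequential(
    nn.Linear(state_dim + action_dim, 128), nn.Tanh(),
    nn.Linear(128, 128), nn.Tanh(),
    nn.Linear(128, 1), nn.Sigmoid(),  # D(s, a) in (0, 1), as both reward formulas require
)
discrim_criterion_sketch = nn.BCELoss()
optimizer_discrim_sketch = torch.optim.Adam(discriminator_sketch.parameters(), lr=3e-4)
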
def sil_step(batch, i_iter):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    next_states = torch.from_numpy(np.stack(batch.next_state)).to(dtype).to(
        device)  # Used for state/next-state distribution matching
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    to_device(device, policy_net, value_net, critic_net)
    X = torch.cat([states, actions],
                  1).to(dtype).to(device)  # Concatenate s,a pairs of agent
    with torch.no_grad():
        values = value_net(states)
    for _ in range(1):
        sampled_episodes = []
        epis = []
        for pair in range(
                len(masks)
        ):  # Split to episodes to do random matching (Corresponds to Step 3 of Algorithm 1)
            epis.append(X[pair].cpu().numpy())
            if masks[pair] == 0:
                sampled_episodes.append(epis)
                epis = []
        total_wasserstein = 0  # Keeps track of all Wassersteins for one episode
        rewards = []  # Logs rewards to update TRPO
        min_wasserstein = 10e10  # Used for logging at command line
        max_wasserstein = 0  # Used for logging at command line
        best_trajectory = None  # Used for logging at command line
        worst_trajectory = None  # Used for logging at command line
        index = 0  # Used for logging at command line
        best_idx = 0  # Used for logging at command line
        worst_idx = 0  # Used for logging at command line
        per_trajectory_dis = []  # Used for logging at command line
        cost_loss = []
        num_of_samples = len(sampled_episodes) - 1
        threshold = num_of_samples - 3
        episodic_eval_sinkhorn = []
        for trajectory in sampled_episodes:
            X = torch.tensor(trajectory).to(dtype).to(
                device)  # Convert trajectory to tensor.
            sample_traj_index = random.randint(0, (args.dataset_size - 1))
            Y = torch.from_numpy(expert_traj[sample_traj_index]).to(dtype).to(
                device
            )  # Randomly match (Corresponds to Step 3 of Algorithm 1)
            cost_matrix = cosine_critic(
                X, Y, critic_net
            )  # Get cost matrix for samples using critic network.
            transport_plan = optimal_transport_plan(
                X, Y, cost_matrix,
                method='sinkhorn_gpu')  # Getting optimal coupling
            per_sample_costs = torch.diag(
                torch.mm(transport_plan, cost_matrix.T)
            )  # Get diagonals W = MC^T, where M is the optimal transport map and C the cost matrix
            distance = torch.sum(
                per_sample_costs
            )  # Calculate Wasserstein by summing diagonals, i.e., W=Trace[MC^T]
            wasserstein_distance = -distance  # Negate so gradient descent maximises the distance when an adversarial critic is trained.

            per_trajectory_dis.append(distance.detach().cpu().numpy())  # Track the Wasserstein distance of every trajectory in the batch.

            #=========FOR EVALUATION ONLY=============#
            if args.log_actual_sinkhorn:
                evaluation_cost_matrix = cosine_distance(X, Y)
                evaluation_transport_plan = optimal_transport_plan(
                    X, Y, evaluation_cost_matrix, method='sinkhorn_gpu')
                eval_wasserstein_distance = torch.sum(
                    torch.diag(
                        torch.mm(evaluation_transport_plan,
                                 evaluation_cost_matrix.T)))
                episodic_eval_sinkhorn.append(eval_wasserstein_distance.item())
            #=========================================#

            if distance < min_wasserstein and index != (
                    len(sampled_episodes)
            ):  # Keep track of best trajectory based on Wasserstein distance
                min_wasserstein = distance
                best_trajectory = X
                best_idx = index
            if distance > max_wasserstein and index != (
                    len(sampled_episodes)
            ):  # Keep track of worst trajectory based on Wasserstein distance
                max_wasserstein = distance
                worst_trajectory = X
                worst_idx = index
            index += 1
            counter = 0
            survival_bonus = 4 / X.shape[0]
            for per_sample_cost in per_sample_costs:
                with torch.no_grad():
                    temp_r = -2 * per_sample_cost + survival_bonus
                    temp_r.unsqueeze_(0)
                    temp_r = running_reward(temp_r.cpu())
                    rewards.append(temp_r)
                    counter += 1
            total_wasserstein += distance
            torch.cuda.empty_cache()
    total_wasserstein = -total_wasserstein / num_of_samples
    optimizer_ot.zero_grad()
    total_wasserstein.backward()  # Only backpropagates through the critic network.
    optimizer_ot.step()
    # Decreasing the critic learning rate over time may help stabilise training:
    # args.critic_lr *= 0.992
    # for param_group in optimizer_ot.param_groups:
    #     param_group['lr'] = args.critic_lr
    with torch.no_grad():
        rewards = torch.tensor(rewards)
    advantages, returns = estimate_advantages(
        rewards, masks, values, args.gamma, args.tau,
        device)  # Get Advantages for TRPO
    torch.cuda.empty_cache()
    trpo_step(
        policy_net, value_net, states, actions, returns, advantages,
        args.max_kl, args.damping, args.l2_reg
    )  # Update policy (TRPO from: https://github.com/Khrylx/PyTorch-RL)
    return (total_wasserstein**2)**(1 / 2), episodic_eval_sinkhorn, len(
        sampled_episodes
    ), min_wasserstein, best_trajectory, best_idx, max_wasserstein, worst_trajectory, worst_idx, per_trajectory_dis
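
# cosine_distance(), cosine_critic() and optimal_transport_plan() are imported from elsewhere in
# the repository. A minimal sketch of what they are assumed to compute -- a cosine cost matrix and
# an entropic-regularised Sinkhorn transport plan with uniform marginals (epsilon and the number
# of iterations are illustrative choices, not the original hyperparameters):
import torch


def cosine_cost_sketch(X, Y):
    # C[i, j] = 1 - cosine_similarity(x_i, y_j), so identical samples have zero transport cost
    Xn = X / (X.norm(dim=1, keepdim=True) + 1e-8)
    Yn = Y / (Y.norm(dim=1, keepdim=True) + 1e-8)
    return 1.0 - Xn @ Yn.t()


def sinkhorn_plan_sketch(cost_matrix, epsilon=0.1, n_iters=100):
    n, m = cost_matrix.shape
    mu = torch.full((n,), 1.0 / n, device=cost_matrix.device)  # Uniform marginal over agent samples
    nu = torch.full((m,), 1.0 / m, device=cost_matrix.device)  # Uniform marginal over expert samples
    K = torch.exp(-cost_matrix / epsilon)                      # Gibbs kernel
    u, v = torch.ones_like(mu), torch.ones_like(nu)
    for _ in range(n_iters):                                   # Sinkhorn fixed-point iterations
        u = mu / (K @ v + 1e-8)
        v = nu / (K.t() @ u + 1e-8)
    return u.unsqueeze(1) * K * v.unsqueeze(0)                 # Plan approximately matching the marginals
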
def sil_step(batch, i_iter):  # Variant that uses a fixed cosine transport cost instead of the learned critic network
    # Get the agent's (s, a, r) interaction with the environment; this is what agent.collect_samples returns in the main method.
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    next_states = torch.from_numpy(np.stack(batch.next_state)).to(dtype).to(device)  # Used for state/next-state distribution matching
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    to_device(device, policy_net, value_net, critic_net)
    X = torch.cat([states, actions],
                  1).to(dtype).to(device)  # Concatenate s,a pairs of agent
    with torch.no_grad():
        values = value_net(states)
    for _ in range(1):
        sampled_episodes = []
        epis = []
        for pair in range(len(masks)):  # Split to episodes
            epis.append(X[pair].cpu().numpy())
            if masks[pair] == 0:
                sampled_episodes.append(epis)
                epis = []
        total_wasserstein = 0  # Keeps track of all Wassersteins for one episode
        rewards = []  # Logs rewards to update TRPO
        min_wasserstein = 10e10  # Used for logging at command line
        max_wasserstein = 0  # Used for logging at command line
        best_trajectory = None  # Used for logging at command line
        worst_trajectory = None  # Used for logging at command line
        index = 0  # Used for logging at command line
        best_idx = 0  # Used for logging at command line
        worst_idx = 0  # Used for logging at command line
        per_trajectory_dis = []  # Used for logging at command line
        cost_loss = []
        num_of_samples = len(sampled_episodes) - 1
        threshold = num_of_samples - 3
        episodic_eval_sinkhorn = []
        for trajectory in sampled_episodes:
            X = torch.tensor(trajectory).to(dtype).to(
                device)  # Convert trajectory to tensor.
            sample_traj_index = random.randint(0, (args.dataset_size - 1))
            Y = torch.from_numpy(expert_traj[sample_traj_index]).to(dtype).to(
                device
            )  # Comment this out if you do not want to use expert trajectories and instead use direct expert feedback below.
            cost_matrix = cosine_distance(
                X, Y
            )  # Get cost matrix for samples using fixed cosine transport cost.
            transport_plan = optimal_transport_plan(
                X, Y, cost_matrix,
                method='sinkhorn_gpu')  # Getting optimal coupling
            per_sample_costs = torch.diag(
                torch.mm(transport_plan, cost_matrix.T)
            )  # Get diagonals W = MC^T, where M is the optimal transport map and C the cost matrix
            distance = torch.sum(
                per_sample_costs
            )  # Calculate Wasserstein by summing diagonals, i.e., W=Trace[MC^T]
            wasserstein_distance = -distance  # Negate so gradient descent maximises the distance when an adversarial critic is trained.
            per_trajectory_dis.append(distance.detach().cpu().numpy())  # Track the Wasserstein distance of every trajectory in the batch.
            episodic_eval_sinkhorn.append(distance.item())
            if distance < min_wasserstein and index != (
                    len(sampled_episodes)
            ):  # Keep track of best trajectory based on Wasserstein distance
                min_wasserstein = distance
                best_trajectory = X
                best_idx = index
            if distance > max_wasserstein and index != (
                    len(sampled_episodes)
            ):  # Keep track of worst trajectory based on Wasserstein distance
                max_wasserstein = distance
                worst_trajectory = X
                worst_idx = index
            index += 1
            counter = 0
            survival_bonus = 4 / X.shape[0]
            for per_sample_cost in per_sample_costs:  # Add rewards
                with torch.no_grad():
                    temp_r = -2 * per_sample_cost + survival_bonus
                    temp_r.unsqueeze_(0)
                    temp_r = running_reward(temp_r.cpu())
                    rewards.append(temp_r)
                    counter += 1
            total_wasserstein += distance
            torch.cuda.empty_cache()
    with torch.no_grad():
        rewards = torch.tensor(rewards)
    advantages, returns = estimate_advantages(
        rewards, masks, values, args.gamma, args.tau,
        device)  # Get Advantages for TRPO
    torch.cuda.empty_cache()
    trpo_step(policy_net, value_net, states, actions, returns, advantages,
              args.max_kl, args.damping, args.l2_reg)  # Update policy
    return (total_wasserstein**2)**(1 / 2), episodic_eval_sinkhorn, len(
        sampled_episodes
    ), min_wasserstein, best_trajectory, best_idx, max_wasserstein, worst_trajectory, worst_idx, per_trajectory_dis
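
# running_reward used in both sil_step() variants is a running reward filter defined elsewhere in
# the script (likely a running mean/std normaliser). A minimal sketch of such a filter, using
# Welford's online mean/variance update; the clipping range and whether the mean is subtracted
# are assumptions, not taken from the original code:
import torch


class RunningRewardSketch:
    def __init__(self, clip=10.0):
        self.n, self.mean, self.m2, self.clip = 0, 0.0, 0.0, clip

    def __call__(self, r):
        x = float(r)                                   # r is a 1-element tensor in sil_step()
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n                    # Online mean update
        self.m2 += delta * (x - self.mean)             # Online sum of squared deviations
        std = (self.m2 / self.n) ** 0.5 if self.n > 1 else 1.0
        scaled = max(-self.clip, min(self.clip, x / (std + 1e-8)))
        return torch.tensor([scaled])                  # Same shape as the input, ready to append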