Example 1
    print("Starting Ensemble!")
    imitator.train_ensemble()
    print("Finished Ensemble!")
    # create agent
    agent = Agent(env, imitator.policy.actor, args.device,
                  running_state=running_state, render=args.render, num_threads=args.num_threads)

    log_list = {"bc_loss": [],
                "uncertainty_cost": [],
                "avg_reward": [],
                "std_reward": []}

    total_timesteps = 0

    for i_iter in range(args.max_iter_num):
        batch, log = agent.collect_samples(args.min_batch_size)
        # train DRIL
        t0 = time.time()
        loss = imitator.train(batch)
        t1 = time.time()

    # evaluate the current actor on CPU, then move it back to the training device
    imitator.policy.actor.to('cpu')
    episode_rewards = evaluate_model(
        env, imitator.policy.actor, running_state=running_state, verbose=False
    )['episodes_rewards']
    imitator.policy.actor.to(args.device)

        if i_iter % args.log_interval == 0:
            print('{}\tT_update: {:.4f}\t training loss: {:.2f}\t uncertainty cost: {:.4f}'
                  '\t R_avg: {:.2f}\t R_std: {:.2f}'.format(
                i_iter, t1 - t0, loss['bc_loss'], loss['uncertainty_cost'],
                episode_rewards.mean(), episode_rewards.std()))
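
The uncertainty_cost logged above is, in DRIL-style setups, derived from disagreement among the policies trained in train_ensemble(). A minimal sketch of such a cost, assuming ensemble is a list of networks that map a state batch to action means (the names and the fixed threshold are illustrative, not the imitator's actual API):

import torch

def ensemble_uncertainty_cost(ensemble, states, threshold=0.5):
    # Predicted mean actions of every ensemble member: (n_members, batch, action_dim)
    with torch.no_grad():
        preds = torch.stack([member(states) for member in ensemble])
    # Disagreement = variance across members, averaged over action dimensions
    disagreement = preds.var(dim=0).mean(dim=-1)
    # Clipped cost: +1 where members disagree beyond the threshold, -1 elsewhere
    return torch.where(disagreement > threshold,
                       torch.ones_like(disagreement),
                       -torch.ones_like(disagreement))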
Example 2
    # args.expert_traj_path = "assets/expert_traj/{}_{}_0.p".format(args.env_name, args.expert_model)
    # expert_traj, running_state, _ = pickle.load(open(args.expert_traj_path, "rb"))
    # running_state.fix = True

    args.expert_path = "assets/expert_models/{}_ppo_0.p".format(args.env_name)

    # load expert and state normalization
    expert, _, running_state, _ = pickle.load(open(args.expert_path, "rb"))
    running_state.fix = True
    expert_agent = Agent(env,
                         expert,
                         args.device,
                         running_state=running_state,
                         render=args.render,
                         num_threads=args.num_threads)
    expert_traj, expert_log = expert_agent.collect_samples(
        args.min_batch_size * args.num_trajs, return_memory=True)

    # sanity check: average reward of the collected expert rollouts
    print('expert average reward: {:.2f}'.format(expert_log['avg_reward']))

    # (optional) pre-train a BC regressor on the expert trajectories
    # regressor = BC(args, state_dim, action_dim, is_disc_action)
    # regressor.set_expert2(expert_traj.sample())
    # t0 = time.time()
    #
    # for _ in range(10):
    #     regressor.train2()
    # t1 = time.time()

    # print("Finished training BC regressor, took {}".format(t1-t0))
    # running_state.fix = False
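
The commented-out regressor above points at a behaviour-cloning pre-training step on the collected expert data. A minimal sketch of such a step, assuming policy maps a batch of states to action means and expert_states / expert_actions are tensors built from expert_traj (these names are illustrative, not the BC class's actual interface):

import torch
import torch.nn.functional as F

def bc_pretrain(policy, expert_states, expert_actions, epochs=10, lr=3e-4):
    optimizer = torch.optim.Adam(policy.parameters(), lr=lr)
    for _ in range(epochs):
        optimizer.zero_grad()
        # Behaviour cloning: regress the policy's actions onto the expert's actions
        loss = F.mse_loss(policy(expert_states), expert_actions)
        loss.backward()
        optimizer.step()
    return loss.item()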
Example 3
optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=exp_args["config"]["lr"])
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=exp_args["config"]["lr"])
""" Create Agent """

agent = Agent(env,
              policy_net,
              device,
              running_state=running_state,
              render=exp_args["config"]["render"],
              num_threads=exp_args["config"]["num-threads"],
              horizon=exp_args["config"]["horizon"])

# collect an initial batch of samples (the returned batch and log are discarded here)
agent.collect_samples(2048)


def update_params(batch):

    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)

    with torch.no_grad():
        values = value_net(states)

    # estimate_advantages is assumed to take (rewards, masks, values, gamma, tau, device)
    advantages, returns = estimate_advantages(rewards, masks, values,
                                              exp_args["config"]["gamma"],
                                              exp_args["config"]["tau"],
                                              device)
Example 4
# running_reward = ZFilter((1,), demean=False, clip=10)
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# env.seed(args.seed)


try:
    # some checkpoints also contain a discriminator network
    policy_net, value_net, discrim_net, running_state = pickle.load(open(args.file, "rb"))
    print('loaded policy, value, discriminator and running state')
except ValueError:
    policy_net, value_net, running_state = pickle.load(open(args.file, "rb"))
    print('loaded policy, value and running state')

print('type of running_state =', type(running_state))

policy_net.to(device)
value_net.to(device)


"""create agent"""
agent = Agent(env, policy_net, device, running_state=running_state, num_threads=1, mean_action=True)


batch, log = agent.collect_samples(args.max_timesteps)

print('R_min {0:.2f}\tR_max {1:.2f}\tR_avg {2:.2f}\tNum_episodes {3:.2f}'.format(log['min_reward'], log['max_reward'], log['avg_reward'], log['num_episodes']))


env.shutdown()
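
For reference, a checkpoint compatible with the load call above can be written with a plain pickle dump; a minimal sketch (the file name is illustrative):

import pickle

# move the networks to CPU so the checkpoint loads regardless of the training device
policy_net.to('cpu')
value_net.to('cpu')
with open('checkpoint.p', 'wb') as f:
    pickle.dump((policy_net, value_net, running_state), f)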