Example 1
def main():
    parser = argparse.ArgumentParser(
        description='DAgger using a pre-trained expert policy and expert rollouts.')
    parser.add_argument('--rollout_file', type=str,
                        default='expert_data/Humanoid-v2.pkl')
    # added: referenced below as args.expert_policy_file / args.dagger_epochs
    # but missing from the original parser; defaults are placeholders
    parser.add_argument('--expert_policy_file', type=str,
                        default='experts/Humanoid-v2.pkl')
    parser.add_argument('--dagger_epochs', type=int, default=10)
    parser.add_argument('--envname', type=str, default='Humanoid-v2')
    parser.add_argument('--max_timesteps', type=int, default=1000)
    parser.add_argument('--training_epochs', type=int, default=2000)
    parser.add_argument('--save_model', type=str, default='./DAgger_Humanoid_lstm-v2.pth')
    parser.add_argument('--render', type=bool, default=True)

    args = parser.parse_args()

    # load expert rollout and model
    rollout = load_rollout(args.rollout_file)
    train = torch.tensor(rollout['observations'], dtype=torch.double)
    target = torch.tensor(rollout['actions'], dtype=torch.double)

    policy_net = load_policy.load_policy(args.expert_policy_file)

    train = train.to(dev)
    target = target.to(dev)

    db.printTensor(train)
    db.printTensor(target)

    # make the environment
    env = gym.make(args.envname)

    # build model
    model = Model(input_dim=env.observation_space.shape[0],
                  output_dim=env.action_space.shape[0])
    model.double()
    model.to(torch.device(dev))
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    # train
    for i in range(args.dagger_epochs):
        for epoch in range(args.training_epochs):
            data_generator = recurrent_generator(train, target, batch_size=20)
            t_start = time.time()
            for train_sample, target_sample in data_generator:
                out = model(train_sample)
                loss = criterion(out, target_sample)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            if epoch % 10 == 0:
                db.printInfo('Epoch: {} Loss: {:.4f} Time: {:.2f}s'.format(
                    epoch, loss.item(), time.time() - t_start))

        obs = env.reset()
        done = False
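        # DAgger rollout / relabel / aggregation step for iteration i is
        # omitted in this excerpt (see the sketch after this example)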

    save(epoch, model, optimizer, loss, args.save_model)
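The outer loop above resets the environment but the excerpt stops before the actual DAgger step. Below is a minimal sketch of the missing rollout-and-relabel logic, under the assumptions (not confirmed by the excerpt) that numpy is imported as np, that the rollout tensors are flat (N, obs_dim) / (N, act_dim), that policy_net(obs[None, :]) returns a (1, act_dim) expert action, and that model accepts a one-row observation batch:

        # sketch: DAgger rollout + relabel + aggregation, to go inside the
        # outer `for i in range(args.dagger_epochs):` loop after env.reset()
        new_obs, new_acts = [], []
        steps = 0
        while not done and steps < args.max_timesteps:
            obs_t = torch.tensor(obs, dtype=torch.double, device=dev).unsqueeze(0)
            with torch.no_grad():
                action = model(obs_t).squeeze(0).cpu().numpy()
            new_obs.append(obs)                           # state visited by the learner
            new_acts.append(policy_net(obs[None, :])[0])  # expert label for that state
            obs, reward, done, _ = env.step(action)
            steps += 1
        # aggregate the relabelled data into the training set
        train = torch.cat([train, torch.tensor(np.array(new_obs), dtype=torch.double, device=dev)])
        target = torch.cat([target, torch.tensor(np.array(new_acts), dtype=torch.double, device=dev)])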
Example 2
def rtg_solution(re_n, gamma, reward_to_go=True):
    db.printTensor(re_n)
    # YOUR_CODE_HERE
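    # lfilter with a = [1, -gamma] on the reversed rewards computes the
    # discounted cumulative sum y[t] = x[t] + gamma * y[t-1]; reversing the
    # output gives the reward-to-go, and its last element is the full return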
    if reward_to_go:
        q_n = [
            scipy.signal.lfilter(b=[1], a=[1, -gamma], x=re[::-1])[::-1]
            for re in re_n
        ]
    else:
        q_n = [
            np.full_like(
                re,
                scipy.signal.lfilter(b=[1], a=[1, -gamma], x=re[::-1])[-1])
            for re in re_n
        ]
    db.printInfo(q_n)
    q_n = np.concatenate(q_n).astype(np.float32)
    db.printInfo(q_n)
    return q_n
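For reference, a small self-contained check (not part of the original) that the lfilter trick above matches the naive O(T^2) definition of reward-to-go, Q(t) = sum over t' >= t of gamma**(t' - t) * r(t'):

import numpy as np
import scipy.signal

gamma = 0.9
re = np.array([1.0, 2.0, 3.0, 4.0])

# reward-to-go via the lfilter trick used above
rtg_fast = scipy.signal.lfilter(b=[1], a=[1, -gamma], x=re[::-1])[::-1]

# naive reference: Q(t) = sum_{t' >= t} gamma**(t' - t) * r(t')
rtg_naive = np.array([
    sum(gamma**(tp - t) * re[tp] for tp in range(t, len(re)))
    for t in range(len(re))
])

assert np.allclose(rtg_fast, rtg_naive)
print(rtg_fast)  # approximately [8.146 7.94  6.6   4.   ]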
Example 3
def rtg(re_n, gamma, reward_to_go=True):
    db.printTensor(re_n)
    q_n = []
    if reward_to_go:
        for traj in re_n:
            q_path = []
            for reward in traj[::-1]:
                try:
                    q_path.append(q_path[-1] * gamma + reward)
                except IndexError:
                    q_path.append(reward)
            q_n.append(q_path[::-1])
    else:
        for traj in re_n:
            # full discounted return of the whole trajectory
            # (reset q_path per trajectory; the original carried it over)
            q_path = 0.0
            for t, reward in enumerate(traj):
                q_path += reward * gamma**t
            # use the same return for every time step of the trajectory
            q_n.append([q_path for _ in range(len(traj))])
    db.printInfo(q_n)
    q_n = np.concatenate(q_n).astype(np.float32)

    db.printInfo(q_n)
    return q_n
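A quick self-contained sanity check (not part of the original, plain numpy/Python) of the values both branches should produce for a toy trajectory with rewards [1, 1, 1] and gamma = 0.5:

import numpy as np

gamma = 0.5
traj = [1.0, 1.0, 1.0]

# reward-to-go, computed backwards: Q(t) = r(t) + gamma * Q(t+1)
q = []
for r in traj[::-1]:
    q.append(r + gamma * (q[-1] if q else 0.0))
print(np.array(q[::-1]))        # [1.75 1.5  1.  ]

# full discounted return, repeated at every time step
ret = sum(gamma**t * r for t, r in enumerate(traj))
print(np.full(len(traj), ret))  # [1.75 1.75 1.75]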
Example 4
def main():
    parser = argparse.ArgumentParser(
        description='Behaviour cloning using pre-trained expert rollouts.')
    parser.add_argument('--rollout_file',
                        type=str,
                        default='expert_data/Ant-v2.pkl')
    parser.add_argument('--envname', type=str, default='Ant-v2')
    parser.add_argument('--max_timesteps', type=int, default=1000)
    parser.add_argument('--training_epochs', type=int, default=2000)
    parser.add_argument('--save_model', type=str, default='./BC_Ant-v2.pth')
    parser.add_argument('--render', type=bool, default=True)
    parser.add_argument('--recurrent', type=bool, default=False)
    parser.add_argument('--hidden_size', type=int, default=512)

    args = parser.parse_args()

    # load expert rollout
    rollout = load_rollout(args.rollout_file)
    train = torch.tensor(rollout['observations'], dtype=torch.double)
    target = torch.tensor(rollout['actions'], dtype=torch.double)

    train = train.to(dev)
    target = target.to(dev)

    db.printTensor(train)
    db.printTensor(target)

    # make the environment
    env = gym.make(args.envname)

    # build model
    model = Model(input_dim=env.observation_space.shape[0],
                  output_dim=env.action_space.shape[0])
    model.double()
    model.to(torch.device(dev))
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    for epoch in range(args.training_epochs):
        data_generator = feed_forward_generator(train, target, batch_size=20)
        for train_sample, target_sample in data_generator:
            out = model(train_sample)
            loss = criterion(out, target_sample)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if epoch % 10 == 0:
            db.printInfo('Epoch: {} Loss: {:.4f}'.format(epoch, loss))
    save(epoch, model, optimizer, loss, args.save_model)
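The script ends right after saving the checkpoint. Below is a minimal evaluation sketch (not part of the original), assuming the classic Gym API used above (env.reset() returns an observation, env.step() returns four values) and that Model maps a (1, obs_dim) observation batch to a (1, act_dim) action tensor:

    # sketch: roll out the behaviour-cloned policy for a few episodes
    model.eval()
    returns = []
    for _ in range(5):
        obs = env.reset()
        done, total_r, steps = False, 0.0, 0
        while not done and steps < args.max_timesteps:
            if args.render:
                env.render()
            obs_t = torch.tensor(obs, dtype=torch.double, device=dev).unsqueeze(0)
            with torch.no_grad():
                action = model(obs_t).squeeze(0).cpu().numpy()
            obs, reward, done, _ = env.step(action)
            total_r += reward
            steps += 1
        returns.append(total_r)
    db.printInfo('Mean return over {} episodes: {:.2f}'.format(
        len(returns), sum(returns) / len(returns)))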