def main():
    parser = argparse.ArgumentParser(
        'Behaviour cloning using pre-trained expert rollouts.')
    parser.add_argument('--rollout_file', type=str,
                        default='expert_data/Humanoid-v2.pkl')
    parser.add_argument('--envname', type=str, default='Humanoid-v2')
    # the defaults for the expert policy path and the number of DAgger
    # iterations are assumptions
    parser.add_argument('--expert_policy_file', type=str,
                        default='experts/Humanoid-v2.pkl')
    parser.add_argument('--dagger_epochs', type=int, default=10)
    parser.add_argument('--max_timesteps', type=int, default=1000)
    parser.add_argument('--training_epochs', type=int, default=2000)
    parser.add_argument('--save_model', type=str,
                        default='./DAgger_Humanoid_lstm-v2.pth')
    # boolean options are exposed as flags (argparse's type=bool is truthy
    # for any non-empty string)
    parser.add_argument('--render', action='store_true')
    args = parser.parse_args()

    # load expert rollout and the expert policy used for DAgger relabelling
    rollout = load_rollout(args.rollout_file)
    train = torch.tensor(rollout['observations'], dtype=torch.double)
    target = torch.tensor(rollout['actions'], dtype=torch.double)
    policy_net = load_policy.load_policy(args.expert_policy_file)
    train = train.to(dev)
    target = target.to(dev)
    db.printTensor(train)
    db.printTensor(target)

    # make the environment
    env = gym.make(args.envname)

    # build model
    model = Model(input_dim=env.observation_space.shape[0],
                  output_dim=env.action_space.shape[0])
    model.double()
    model.to(torch.device(dev))
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    # train: fit the policy to the aggregated dataset on each DAgger iteration
    for i in range(args.dagger_epochs):
        for epoch in range(args.training_epochs):
            data_generator = recurrent_generator(train, target, batch_size=20)
            t_start = time.time()
            for train_sample, target_sample in data_generator:
                out = model(train_sample)
                loss = criterion(out, target_sample)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            if epoch % 10 == 0:
                db.printInfo('Epoch: {} Loss: {:.4f} Time: {:.2f}'.format(
                    epoch, loss.item(), time.time() - t_start))
        # start a fresh episode for the DAgger rollout / expert relabelling step
        obs = env.reset()
        done = False
        save(epoch, model, optimizer, loss, args.save_model)
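# `recurrent_generator` is used by main() above but is not defined in this
# file. The sketch below is a hypothetical stand-in, assuming the rollout
# tensors are laid out as (num_steps, feature_dim) and that the LSTM policy
# wants contiguous windows of ordered time steps rather than shuffled
# individual steps.
def recurrent_generator(observations, actions, batch_size):
    num_windows = observations.size(0) // batch_size
    # shuffle the order of the windows, not the steps inside a window,
    # so the recurrent model still sees temporally ordered samples
    for w in torch.randperm(num_windows).tolist():
        start = w * batch_size
        yield (observations[start:start + batch_size],
               actions[start:start + batch_size])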
def rtg_solution(re_n, gamma, reward_to_go=True):
    db.printTensor(re_n)
    if reward_to_go:
        # lfilter with b=[1], a=[1, -gamma] computes y[t] = x[t] + gamma*y[t-1];
        # running it over the reversed rewards and reversing the result gives
        # the reward-to-go q[t] = r[t] + gamma*q[t+1] at every step
        q_n = [
            scipy.signal.lfilter(b=[1], a=[1, -gamma], x=re[::-1])[::-1]
            for re in re_n
        ]
    else:
        # the last filter output is the full discounted return of the
        # trajectory; broadcast it to every time step
        q_n = [
            np.full_like(
                re,
                scipy.signal.lfilter(b=[1], a=[1, -gamma], x=re[::-1])[-1])
            for re in re_n
        ]
    db.printInfo(q_n)
    q_n = np.concatenate(q_n).astype(np.float32)
    db.printInfo(q_n)
    return q_n
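# A hypothetical sanity check (helper name assumed) of the filter trick in
# rtg_solution: for rewards [1, 1, 1] and gamma = 0.9 the rewards-to-go are
# [1 + 0.9 * 1.9, 1 + 0.9, 1] = [2.71, 1.9, 1.0], and the flat trajectory
# return is 2.71 broadcast to every step.
def _check_rtg_solution():
    re_n = [np.array([1.0, 1.0, 1.0])]
    assert np.allclose(rtg_solution(re_n, 0.9), [2.71, 1.9, 1.0])
    assert np.allclose(rtg_solution(re_n, 0.9, reward_to_go=False),
                       [2.71, 2.71, 2.71])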
def rtg(re_n, gamma, reward_to_go=True):
    db.printTensor(re_n)
    q_n = []
    if reward_to_go:
        for traj in re_n:
            # walk each trajectory backwards, accumulating the discounted sum
            # so that q[t] = r[t] + gamma * q[t+1]
            q_path = []
            for reward in traj[::-1]:
                if q_path:
                    q_path.append(q_path[-1] * gamma + reward)
                else:
                    q_path.append(reward)
            q_n.append(q_path[::-1])
    else:
        for traj in re_n:
            # full discounted return from t = 0, reset for each trajectory
            q_path = 0.0
            for t, reward in enumerate(traj):
                q_path += reward * gamma**t
            # every time step of the trajectory gets the same return
            q_n.append([q_path for _ in range(len(traj))])
    db.printInfo(q_n)
    q_n = np.concatenate(q_n).astype(np.float32)
    db.printInfo(q_n)
    return q_n
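# Hypothetical consistency check (helper name assumed): the loop-based rtg
# should agree with the vectorised rtg_solution, including across multiple
# trajectories, which exercises the per-trajectory reset of the running
# return in the reward_to_go=False branch.
def _check_rtg_matches_solution():
    re_n = [np.random.rand(5), np.random.rand(3)]
    for reward_to_go in (True, False):
        assert np.allclose(rtg(re_n, 0.99, reward_to_go),
                           rtg_solution(re_n, 0.99, reward_to_go))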
def main():
    parser = argparse.ArgumentParser(
        'Behaviour cloning using pre-trained expert rollouts.')
    parser.add_argument('--rollout_file', type=str,
                        default='expert_data/Ant-v2.pkl')
    parser.add_argument('--envname', type=str, default='Ant-v2')
    parser.add_argument('--max_timesteps', type=int, default=1000)
    parser.add_argument('--training_epochs', type=int, default=2000)
    parser.add_argument('--save_model', type=str, default='./BC_Ant-v2.pth')
    # boolean options are exposed as flags (argparse's type=bool is truthy
    # for any non-empty string)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--recurrent', action='store_true')
    parser.add_argument('--hidden_size', type=int, default=512)
    args = parser.parse_args()

    # load expert rollout
    rollout = load_rollout(args.rollout_file)
    train = torch.tensor(rollout['observations'], dtype=torch.double)
    target = torch.tensor(rollout['actions'], dtype=torch.double)
    train = train.to(dev)
    target = target.to(dev)
    db.printTensor(train)
    db.printTensor(target)

    # make the environment
    env = gym.make(args.envname)

    # build model
    model = Model(input_dim=env.observation_space.shape[0],
                  output_dim=env.action_space.shape[0])
    model.double()
    model.to(torch.device(dev))
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    # supervised regression of expert actions onto observations
    for epoch in range(args.training_epochs):
        data_generator = feed_forward_generator(train, target, batch_size=20)
        for train_sample, target_sample in data_generator:
            out = model(train_sample)
            loss = criterion(out, target_sample)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if epoch % 10 == 0:
            db.printInfo('Epoch: {} Loss: {:.4f}'.format(epoch, loss.item()))
    save(epoch, model, optimizer, loss, args.save_model)
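# `feed_forward_generator` is used by main() above but is not defined in
# this file. The sketch below is a hypothetical stand-in, assuming that for
# a feed-forward policy the (observation, action) pairs can be sampled
# i.i.d., so the step order is free to be shuffled:
def feed_forward_generator(observations, actions, batch_size):
    num_steps = observations.size(0)
    perm = torch.randperm(num_steps)
    # drop the ragged tail so every minibatch has exactly batch_size samples
    for start in range(0, num_steps - batch_size + 1, batch_size):
        idx = perm[start:start + batch_size]
        yield observations[idx], actions[idx]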