def main():
    parser = argparse.ArgumentParser(
        'Behaviour cloning using pre-trained expert rollouts.')
    parser.add_argument('--rollout_file', type=str,
                        default='expert_data/Humanoid-v2.pkl')
    # the two arguments below are referenced later but were missing from the
    # parser; the defaults are assumptions, adjust them to your setup
    parser.add_argument('--expert_policy_file', type=str,
                        default='experts/Humanoid-v2.pkl')
    parser.add_argument('--dagger_epochs', type=int, default=10)
    parser.add_argument('--envname', type=str, default='Humanoid-v2')
    parser.add_argument('--max_timesteps', type=int, default=1000)
    parser.add_argument('--training_epochs', type=int, default=2000)
    parser.add_argument('--save_model', type=str,
                        default='./DAgger_Humanoid_lstm-v2.pth')
    parser.add_argument('--render', type=bool, default=True)
    args = parser.parse_args()

    # load expert rollout and model
    rollout = load_rollout(args.rollout_file)
    train = torch.tensor(rollout['observations'], dtype=torch.double)
    target = torch.tensor(rollout['actions'], dtype=torch.double)
    policy_net = load_policy.load_policy(args.expert_policy_file)
    train = train.to(dev)
    target = target.to(dev)
    db.printTensor(train)
    db.printTensor(target)

    # make the environment
    env = gym.make(args.envname)

    # build model
    model = Model(input_dim=env.observation_space.shape[0],
                  output_dim=env.action_space.shape[0])
    model.double()
    model.to(torch.device(dev))
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    # train
    for i in range(args.dagger_epochs):
        for epoch in range(args.training_epochs):
            data_generator = recurrent_generator(train, target, batch_size=20)
            t_start = time.time()
            for train_sample, target_sample in data_generator:
                out = model(train_sample)
                loss = criterion(out, target_sample)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            if epoch % 10 == 0:
                db.printInfo('Epoch: {} Loss: {:.4f} Time: {:.2f}'.format(
                    epoch, loss, time.time() - t_start))
        # DAgger aggregation step: roll out the current policy and relabel the
        # visited states with the expert (see the sketch below); the original
        # code stopped after resetting the environment.
        obs = env.reset()
        done = False
        save(epoch, model, optimizer, loss, args.save_model)
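# The DAgger aggregation step is missing from the loop above.  Below is a
# minimal sketch of it under these assumptions (not the original
# implementation): `model` maps a (1, obs_dim) double tensor to a
# (1, act_dim) action, `policy_net` maps a (1, obs_dim) numpy array to the
# expert action, and `train`/`target` are (N, obs_dim)/(N, act_dim) tensors.
import numpy as np
import torch


def aggregate_dagger_data(env, model, policy_net, train, target,
                          max_timesteps=1000):
    """Roll out the current policy, relabel the visited states with the
    expert, and append the relabelled pairs to the training set."""
    device = train.device
    new_obs, new_act = [], []
    obs = env.reset()
    done = False
    steps = 0
    while not done and steps < max_timesteps:
        with torch.no_grad():
            action = model(
                torch.tensor(obs, dtype=torch.double).unsqueeze(0).to(device))
        # the expert relabels the state the learner actually visited
        expert_action = policy_net(obs[None, :])
        new_obs.append(obs)
        new_act.append(np.squeeze(expert_action))
        obs, _, done, _ = env.step(action.cpu().numpy())
        steps += 1
    train = torch.cat(
        [train, torch.tensor(np.array(new_obs), dtype=torch.double).to(device)])
    target = torch.cat(
        [target, torch.tensor(np.array(new_act), dtype=torch.double).to(device)])
    return train, target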
def testSampleAction(self):
    ob_dim, ac_dim, n_layers, hidden_size = 5, 3, 20, 10
    batch = 1
    neural_network_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': False,
        'size': hidden_size
    }
    network = PolicyNet(neural_network_args)
    inputs, outputs = [batch, ob_dim], [batch, ac_dim]
    obs = torch.randn(inputs)

    if neural_network_args['discrete']:
        out = network(obs)
        # the probs and logits parameterisations define the same distribution
        action_probs = torch.nn.functional.softmax(out, dim=-1)
        dist = torch.distributions.Categorical(action_probs)
        sampled_action = dist.sample()
        dist.log_prob(sampled_action)
        dist1 = torch.distributions.Categorical(probs=action_probs)
        dist2 = torch.distributions.Categorical(logits=out)
        db.printInfo(dist1)
        db.printInfo(dist2)
    else:
        ts_mean, ts_logstd = network(obs)
        # overwrite the network output with larger random tensors to exercise
        # broadcasting of a per-dimension log-std across a batch
        ts_mean = torch.randn(2000, 6)
        ts_logstd = torch.randn(6)
        # the scale of a Normal is the std, so the log-std must be exponentiated
        dist = torch.distributions.Normal(loc=ts_mean, scale=ts_logstd.exp())
        # manual alternative: tile the log-std to the batch shape and sample
        ts_logstd_na = torch.stack(
            [ts_logstd for _ in range(ts_mean.shape[0])])
        sampled_action = torch.normal(mean=ts_mean, std=ts_logstd_na.exp())
        db.printInfo(sampled_action)
        db.printInfo(obs)
        db.printInfo()
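# A shorter way to draw the same Gaussian samples: torch.distributions.Normal
# broadcasts a per-dimension std across the batch, so the manual stacking of
# ts_logstd above is not needed.  Standalone sketch with stand-in shapes.
import torch

mean = torch.randn(2000, 6)                   # (batch, action_dim) means
logstd = torch.randn(6)                       # one log-std per action dimension
dist = torch.distributions.Normal(loc=mean, scale=logstd.exp())
sample = dist.sample()                        # broadcasts to (2000, 6)
logprob = dist.log_prob(sample).sum(dim=-1)   # joint log-prob per sample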
def main():
    parser = argparse.ArgumentParser(
        'Behaviour cloning using pre-trained expert rollouts.')
    parser.add_argument('--rollout_file', type=str,
                        default='expert_data/Ant-v2.pkl')
    parser.add_argument('--envname', type=str, default='Ant-v2')
    parser.add_argument('--max_timesteps', type=int, default=1000)
    parser.add_argument('--training_epochs', type=int, default=2000)
    parser.add_argument('--save_model', type=str, default='./BC_Ant-v2.pth')
    parser.add_argument('--render', type=bool, default=True)
    parser.add_argument('--recurrent', type=bool, default=False)
    parser.add_argument('--hidden_size', type=int, default=512)
    args = parser.parse_args()

    # load expert rollout
    rollout = load_rollout(args.rollout_file)
    train = torch.tensor(rollout['observations'], dtype=torch.double)
    target = torch.tensor(rollout['actions'], dtype=torch.double)
    train = train.to(dev)
    target = target.to(dev)
    db.printTensor(train)
    db.printTensor(target)

    # make the environment
    env = gym.make(args.envname)

    # build model
    model = Model(input_dim=env.observation_space.shape[0],
                  output_dim=env.action_space.shape[0])
    model.double()
    model.to(torch.device(dev))
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    # train (the debug printTensor/input() pair inside the batch loop was
    # removed; it blocked training on keyboard input)
    for epoch in range(args.training_epochs):
        data_generator = feed_forward_generator(train, target, batch_size=20)
        for train_sample, target_sample in data_generator:
            out = model(train_sample)
            loss = criterion(out, target_sample)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if epoch % 10 == 0:
            db.printInfo('Epoch: {} Loss: {:.4f}'.format(epoch, loss))
            # checkpoint periodically
            save(epoch, model, optimizer, loss, args.save_model)
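# `feed_forward_generator` is used above but not defined in this excerpt.  A
# minimal sketch of what it might look like, assuming `train` is (N, obs_dim)
# and `target` is (N, act_dim); an illustration, not the original helper.
import torch


def feed_forward_generator(train, target, batch_size=20):
    """Yield shuffled (observation, action) minibatches for one epoch."""
    n = train.shape[0]
    perm = torch.randperm(n, device=train.device)
    for start in range(0, n, batch_size):
        idx = perm[start:start + batch_size]
        yield train[idx], target[idx]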
def save(epoch, model, optimizer, loss, path, overwrite=False):
    # unless overwriting is requested, append an incrementing revision suffix
    # so an existing checkpoint is never clobbered (the original condition was
    # inverted, and the suffixes stacked up as name_1_2.pth)
    if not overwrite:
        base = path[:-4]  # strip '.pth'
        rev = 1
        while os.path.exists(path):
            path = '{}_{}.pth'.format(base, rev)
            rev += 1
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    }, path)
    db.printInfo('Model saved to {}'.format(path))
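# Counterpart to save() used by the evaluation script further down; it is not
# shown in this excerpt, so this is a sketch assuming the checkpoint dict
# layout written above.
def load_model(path):
    """Return the model state_dict stored in a checkpoint written by save()."""
    checkpoint = torch.load(path, map_location='cpu')
    return checkpoint['model_state_dict']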
def rtg_solution(re_n, gamma, reward_to_go=True):
    db.printTensor(re_n)
    # lfilter with a = [1, -gamma] computes y[t] = x[t] + gamma * y[t-1];
    # running it over the reversed rewards gives the discounted reward-to-go
    if reward_to_go:
        q_n = [
            scipy.signal.lfilter(b=[1], a=[1, -gamma], x=re[::-1])[::-1]
            for re in re_n
        ]
    else:
        # every timestep of a trajectory gets that trajectory's full
        # discounted return
        q_n = [
            np.full_like(
                re,
                scipy.signal.lfilter(b=[1], a=[1, -gamma], x=re[::-1])[-1])
            for re in re_n
        ]
    db.printInfo(q_n)
    q_n = np.concatenate(q_n).astype(np.float32)
    db.printInfo(q_n)
    return q_n
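# Quick numeric check of the lfilter trick (illustrative, not in the original
# file): for rewards [1, 2, 3] and gamma = 0.5 the reward-to-go is
# [1 + 0.5*2 + 0.25*3, 2 + 0.5*3, 3] = [2.75, 3.5, 3.0].
def _check_lfilter_rtg():
    rewards = np.array([1.0, 2.0, 3.0])
    rtg_check = scipy.signal.lfilter(b=[1], a=[1, -0.5], x=rewards[::-1])[::-1]
    assert np.allclose(rtg_check, [2.75, 3.5, 3.0])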
def rtg(re_n, gamma, reward_to_go=True):
    """Compute per-timestep returns for a list of reward trajectories.

    With reward_to_go=True each timestep gets the discounted sum of its future
    rewards; otherwise every timestep of a trajectory gets that trajectory's
    full discounted return.
    """
    db.printTensor(re_n)
    q_n = []
    if reward_to_go:
        for traj in re_n:
            q_path = []
            running = 0.0
            for reward in traj[::-1]:
                running = running * gamma + reward
                q_path.append(running)
            q_n.append(q_path[::-1])
    else:
        for traj in re_n:
            # reset per trajectory; the old try/except version leaked the
            # previous trajectory's return into the next one
            q_path = 0.0
            for t, reward in enumerate(traj):
                q_path += reward * gamma**t
            # every timestep of the trajectory gets the same total return
            q_n.append([q_path for _ in range(len(traj))])
    db.printInfo(q_n)
    q_n = np.concatenate(q_n).astype(np.float32)
    db.printInfo(q_n)
    return q_n
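# Illustrative sanity check (not in the original file): the hand-rolled rtg()
# should agree with the lfilter-based rtg_solution() on a toy batch.  A float
# input is used because rtg_solution's np.full_like keeps the input dtype and
# would truncate the return on integer rewards.
def _check_rtg_against_solution():
    toy = np.arange(1.0, 5.0).reshape(1, 4)
    assert np.allclose(rtg(toy, 0.5, True), rtg_solution(toy, 0.5, True))
    assert np.allclose(rtg(toy, 0.5, False), rtg_solution(toy, 0.5, False))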
def main():
    parser = argparse.ArgumentParser(
        'Behaviour cloning using pre-trained expert rollouts.')
    parser.add_argument('--save_file', type=str, default='./BC_Ant-v2.pth')
    parser.add_argument('--envname', type=str, default='Ant-v2')
    parser.add_argument('--iter', type=int, default=20)
    parser.add_argument('--max_timesteps', type=int, default=1000)
    parser.add_argument('--render', type=bool, default=False)
    args = parser.parse_args()

    env = gym.make(args.envname)

    # build model
    model = Model(input_dim=env.observation_space.shape[0],
                  output_dim=env.action_space.shape[0])
    model.double()
    model.load_state_dict(load_model(args.save_file))

    returns = []
    for i in range(args.iter):
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            with torch.no_grad():
                action = model(
                    torch.tensor(obs, dtype=torch.double).unsqueeze(0))
            action = action.numpy()
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            if args.render:
                env.render()
            if steps >= args.max_timesteps:
                break
        db.printInfo("Iter {} {}/{} Reward: {:.2f}".format(
            i, steps, args.max_timesteps, totalr))
        returns.append(totalr)

    db.printInfo("Mean return {}".format(np.mean(returns)))
    db.printInfo("Std of return {}".format(np.std(returns)))
import torch
import torch.nn as nn
import torch.optim as optim

import load_policy
import print_custom as db

# pick the compute device once at import time
if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
# dev = "cpu"  # uncomment to force CPU
db.printInfo(dev)


class Model(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_size=64):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.fc1 = nn.Linear(input_dim, 200)
        self.fc2 = nn.Linear(200, 100)
        self.fc3 = nn.Linear(100, input_dim)
        self.lstm = nn.LSTM(input_dim, hidden_size, batch_first=True)
        self.fc4 = nn.Linear(hidden_size, output_dim)

    def forward(self, inputs):
        # fresh zero hidden/cell states for each forward pass
        hidden = (torch.zeros(1, len(inputs), self.hidden_size,
                              dtype=torch.double).to(dev),
                  torch.zeros(1, len(inputs), self.hidden_size,
                              dtype=torch.double).to(dev))
        # The original forward pass was cut off after building the hidden
        # state; the remainder is a plausible completion based on the layers
        # declared in __init__ (the ReLU activations are an assumption).
        x = torch.relu(self.fc1(inputs))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        out, hidden = self.lstm(x, hidden)
        return self.fc4(out)
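# Minimal shape smoke test for the recurrent Model above (illustrative only;
# it assumes the completed forward pass and a (batch, seq_len, obs_dim) input).
def _smoke_test_model():
    model = Model(input_dim=11, output_dim=3).double().to(dev)
    dummy = torch.randn(4, 8, 11, dtype=torch.double).to(dev)  # batch=4, seq=8
    out = model(dummy)
    assert out.shape == (4, 8, 3)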
def mlp(input_size, output_size, n_layers, hidden_size, activation=nn.Tanh):
    """Build an MLP with n_layers hidden-to-hidden blocks."""
    layers = []
    layers.append(nn.Linear(input_size, hidden_size))
    layers.append(activation())
    for _ in range(n_layers):
        layers.append(nn.Linear(hidden_size, hidden_size))
        layers.append(activation())
    layers.append(nn.Linear(hidden_size, output_size))
    return nn.Sequential(*layers)


if __name__ == "__main__":
    # unittest.main()
    # other test inputs used while debugging:
    # re_n = np.arange(1, 21).reshape(2, 10)
    # re_n = np.ones(5).reshape(1, 5)
    # re_n = np.ones(10).reshape(1, 10)
    re_n = np.arange(1, 5).reshape(1, 4)
    db.printInfo(re_n)
    q_n = rtg(re_n, 0.5, False)
    q_n_sol = rtg_solution(re_n, 0.5, False)
    # db.printInfo('Equal {}'.format(np.allclose(q_n, q_n_sol)))
    # mlp_sol(5, 3, 2, 64)
    # mlp(5, 3, 2, 64)
    # module = build_mlp(5, 3, 3, 64)
    # print(module)
def run(self):
    print("Agent {} started, Process ID {}".format(self.name, os.getpid()))

    # lists to collect agent experience
    actions = []
    rewards = []
    states = []
    logprobs = []
    is_terminal = []
    timestep = 0

    # variables for logging
    running_reward = 0

    for i_episodes in range(1, self.max_episode + 2):
        state = self.env.reset()

        if i_episodes == self.max_episode + 1:
            db.printInfo("Max episodes reached")
            msg = MsgMaxReached(self.proc_id, True)
            self.pipe.send(msg)
            break

        for i in range(self.max_timestep):
            timestep += 1
            states.append(state)
            with torch.no_grad():
                action, logprob = self.memory.agent_policy.act(state, False)
            state, reward, done, _ = self.env.step(action)

            actions.append(action)
            logprobs.append(logprob)
            rewards.append(reward)
            is_terminal.append(done)
            running_reward += reward

            if timestep % self.update_timestep == 0:
                stateT, actionT, logprobT, disReturn = \
                    self.experience_to_tensor(
                        states, actions, rewards, logprobs, is_terminal)
                self.add_experience_to_pool(stateT, actionT, logprobT,
                                            disReturn)
                msg = MsgUpdateRequest(int(self.proc_id), True)
                self.pipe.send(msg)
                msg = self.pipe.recv()
                if msg == "RENDER":
                    self.render = True
                timestep = 0
                actions = []
                rewards = []
                states = []
                logprobs = []
                is_terminal = []

            if done:
                break

            if self.render:
                time.sleep(0.005)
                self.env.render()

        if i_episodes % self.log_interval == 0:
            running_reward = running_reward / self.log_interval
            # db.printInfo("sending reward msg")
            msg = MsgRewardInfo(self.proc_id, i_episodes, running_reward)
            self.pipe.send(msg)
            running_reward = 0
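# The message classes sent over the pipe above are not defined in this
# excerpt.  A minimal sketch of what they might look like (field names are
# assumptions inferred from the call sites, not the original definitions):
from collections import namedtuple

MsgMaxReached = namedtuple('MsgMaxReached', ['proc_id', 'done'])
MsgUpdateRequest = namedtuple('MsgUpdateRequest', ['proc_id', 'update'])
MsgRewardInfo = namedtuple('MsgRewardInfo', ['proc_id', 'episode', 'reward'])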