import numpy as np
import torch
import torch.nn as nn
# MLP (the behavior-cloned human model) is assumed to be defined or imported elsewhere in this repo.


class I_MLP(nn.Module):

    def __init__(self):
        super(I_MLP, self).__init__()
        self.name = "models/inf_model.pt"
        self.n_steps = 10
        # learned human prediction network: (state, goal) -> predicted human action
        self.fc_1 = nn.Linear(4, 4)
        self.fc_2 = nn.Linear(4, 2)
        # model_dict = torch.load("models/h_model.pt", map_location='cpu')
        # self.load_state_dict(model_dict)
        # robot policy network: (state, human action) -> robot action
        self.rc_1 = nn.Linear(4, 8)
        self.rc_2 = nn.Linear(8, 8)
        self.rc_3 = nn.Linear(8, 2)
        # current (behavior-cloned) human model, frozen during training
        self.human = MLP()
        model_dict = torch.load("models/h_model.pt", map_location='cpu')
        self.human.load_state_dict(model_dict)
        self.human.eval()

    def prediction(self, x):
        h1 = self.fc_1(x)
        return torch.tanh(self.fc_2(h1))

    def policy(self, x):
        h1 = torch.tanh(self.rc_1(x))
        h2 = torch.tanh(self.rc_2(h1))
        return self.rc_3(h2)

    def rollout(self, s_star, s_0):
        error = 0.0
        s = torch.FloatTensor(s_0)
        for t in range(self.n_steps):
            x = torch.cat((s, s_star), 0)
            # behavior-cloned human prediction (no gradient) vs. learned prediction
            ah_hat = self.human.prediction(x).detach()
            ah = self.prediction(x)
            context = torch.cat((s, ah), 0)
            ar = self.policy(context)
            s = s + 0.1 * ar
            error += torch.norm(s - s_star)**2              # distance to goal
            error += torch.tanh(ah[0]) + torch.tanh(ah[1])  # penalize positive action components
            error += torch.norm(ah)                         # penalize predicted human effort
            error += 1.0 * torch.norm(ah - ah_hat)          # stay close to the behavior-cloned human
        return error

    def loss(self):
        Q = 0.0
        g1 = torch.FloatTensor([1.0, 0.0])
        g2 = torch.FloatTensor([0.0, 1.0])
        for round in range(10):
            s_0 = np.random.random(2)
            for s_star in [g1, g2]:
                error = self.rollout(s_star, s_0)
                Q += error
        return Q
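# Hypothetical usage sketch (not part of the original file): one way to train I_MLP is to
# minimize loss() directly with Adam and save the weights to self.name. The function name,
# learning rate, and epoch count below are illustrative assumptions; the frozen behavior-cloned
# human receives no gradient because ah_hat is detached in rollout().
def train_i_mlp(epochs=1000, lr=1e-3):
    model = I_MLP()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        optimizer.zero_grad()
        Q = model.loss()   # roll out from random starts toward both goals
        Q.backward()
        optimizer.step()
        if epoch % 100 == 0:
            print(epoch, Q.item())
    torch.save(model.state_dict(), model.name)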
import gym
import numpy as np
import torch
# QNetwork (the trained Q-network) and MLP (the behavior-cloned human model)
# are assumed to be defined or imported elsewhere in this repo.


def main():
    env = gym.make("LanderCustom-v0")

    # trained Q-network used to score actions
    qnetwork = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork.load_state_dict(torch.load('basic_lander.pth'))
    qnetwork.eval()

    # behavior-cloned human policy
    human = MLP()
    human.load_state_dict(torch.load('expert_bc.pt'))
    human.eval()

    softmax = torch.nn.Softmax(dim=0)

    episodes = 30
    scores = []
    Q_threshold = 1e-2

    for episode in range(episodes):
        # vary the horizontal disturbance force across episodes
        if episode < 10:
            force_x = 0.0
        elif episode < 20:
            force_x = +500.0
        else:
            force_x = -500.0
        env.start_state(force_x, 0.0)

        state = env.reset()
        score = 0
        while True:
            with torch.no_grad():
                state = torch.from_numpy(state).float()
                Q_values = qnetwork(state).data.numpy()
                action_pred_dist = softmax(human(state).data).numpy()
            # optimal action under the Q-network vs. action sampled from the human model
            action_star = np.argmax(Q_values)
            action = np.random.choice(np.arange(4), p=action_pred_dist)
            # Q-value regret of the sampled action; the intervention rule is disabled here
            loss = Q_values[action_star] - Q_values[action]
            # if loss > Q_threshold:
            #     action = action_star
            # env.render()
            state, reward, done, _ = env.step(action)
            score += reward
            if done:
                print("episode: ", episode, "score: ", score)
                break
        scores.append(score)

    env.close()
    print("The average score is: ", np.mean(np.array(scores)))
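# Standard script entry point (an assumed addition, applicable if this main() lives in its own file).
if __name__ == "__main__":
    main()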
import torch
import torch.nn as nn
# MLP (the behavior-cloned human model) is assumed to be defined or imported elsewhere in this repo.


class I_MLP_MLP(nn.Module):

    def __init__(self):
        super(I_MLP_MLP, self).__init__()
        # current (behavior-cloned) human model, frozen during training
        self.bc_human = MLP()
        model_dict = torch.load(self.bc_human.name, map_location='cpu')
        self.bc_human.load_state_dict(model_dict)
        self.bc_human.eval()
        self.name = "models/inf_robot.pt"
        # learned human model: (state, target) -> human action
        self.fc_1 = nn.Linear(4, 4)
        self.fc_2 = nn.Linear(4, 2)
        # robot policy: (state, human action) -> robot action
        self.rc_1 = nn.Linear(4, 4)
        self.rc_2 = nn.Linear(4, 2)

    def human(self, x):
        h1 = torch.tanh(self.fc_1(x))
        return self.fc_2(h1)

    def robot(self, x):
        h1 = torch.tanh(self.rc_1(x))
        return self.rc_2(h1)

    def loss(self, n_samples):
        loss = 0.0
        regulate = 1.0
        for iteration in range(n_samples):
            state = torch.rand(2)
            target = torch.rand(2)
            error = target - state
            # "ideal" human action: the error vector rotated 90 degrees
            ah_star = torch.FloatTensor([-error[1], error[0]])
            # behavior-cloned human prediction (no gradient), rescaled to the ideal action's magnitude
            ah_hat = self.bc_human.human(torch.cat((state, target), 0)).detach()
            ah_hat *= torch.norm(ah_star) / torch.norm(ah_hat)
            ah = self.human(torch.cat((state, target), 0))
            ar = self.robot(torch.cat((state, ah), 0))
            state += 1.0 * ar
            loss += torch.norm(target - state)**2                       # distance to target after the step
            loss += (1.0 - regulate) * torch.norm(ah_star - ah)**2      # match the ideal human action
            loss += regulate * torch.norm(ah_hat - ah)**2               # match the behavior-cloned human
            loss += abs(torch.norm(ah) - torch.norm(ar))                # balance human and robot effort
        return loss
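# Hypothetical inference sketch (not in the original file): after training, the learned human
# model proposes an action ah from (state, target) and the robot policy maps (state, ah) to ar;
# the unit step size mirrors loss() above. The helper name step_once is an assumption.
def step_once(model, state, target):
    with torch.no_grad():
        ah = model.human(torch.cat((state, target), 0))
        ar = model.robot(torch.cat((state, ah), 0))
    return state + 1.0 * ar

# Example: move a random 2D state toward a random target for a few steps.
# model = I_MLP_MLP()
# model.load_state_dict(torch.load(model.name, map_location='cpu'))
# state, target = torch.rand(2), torch.rand(2)
# for _ in range(5):
#     state = step_once(model, state, target)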
import sys
import time
import pickle
import gym
import numpy as np
import torch
# QNetwork, MLP, and Joystick are assumed to be defined or imported elsewhere in this repo.


def main():
    env = gym.make("LanderCustom-v0")

    # initial horizontal disturbance force and intervention threshold from the command line
    fx_init = float(sys.argv[1])
    Q_threshold = float(sys.argv[2])
    savename = 'test1.pkl'

    joystick = Joystick()

    # trained Q-network used to score actions
    qnetwork = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork.load_state_dict(torch.load('basic_lander.pth'))
    qnetwork.eval()

    # behavior-cloned human policy
    human = MLP()
    human.load_state_dict(torch.load('mlp_model.pt'))
    human.eval()

    episodes = 10
    scores = []
    data = []
    env.start_state(fx_init, 0.0)

    for episode in range(episodes):
        state = env.reset()
        env.render()
        score = 0

        # wait for the start button before the episode begins
        while True:
            action, start, stop = joystick.input()
            if start:
                break

        while True:
            action, start, stop = joystick.input()
            data.append(list(state) + [action])
            with torch.no_grad():
                state = torch.from_numpy(state).float()
                Q_values = qnetwork(state).data.numpy()
                action_pred_dist = human(state).data.numpy()
            action_star = np.argmax(Q_values)
            action_pred = np.argmax(action_pred_dist)
            # action = action_pred
            # override the human's joystick action when its Q-value regret exceeds the threshold
            loss = Q_values[action_star] - Q_values[action]
            if loss > Q_threshold:
                action = action_star
            env.render()
            state, reward, done, _ = env.step(action)
            score += reward
            if done or stop:
                print(episode, score)
                # pickle.dump(data, open(savename, "wb"))
                break
            time.sleep(0.025)
        scores.append(score)

    env.close()
    print(scores)
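# Hypothetical post-processing sketch: if the commented-out pickle.dump above is enabled,
# each row of `data` is the 8-dimensional lander state followed by the chosen action. The
# helper below (name and filename are assumptions mirroring savename) shows one way to split
# the log into state and action arrays, e.g. for behavior cloning.
def load_demos(filename='test1.pkl'):
    data = np.array(pickle.load(open(filename, 'rb')))
    states = data[:, :8]
    actions = data[:, -1].astype(int)
    return states, actions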