import gym
import numpy as np
import torch

# QNetwork (the pretrained DQN policy) and MLP (the behavior-cloned human model)
# are defined elsewhere in the repo.


def main():
    env = gym.make("LanderCustom-v0")

    # Load the pretrained Q-network and the behavior-cloned human model.
    qnetwork = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork.load_state_dict(torch.load('basic_lander.pth'))
    qnetwork.eval()
    human = MLP()
    human.load_state_dict(torch.load('expert_bc.pt'))
    human.eval()
    softmax = torch.nn.Softmax(dim=0)

    episodes = 30
    scores = []
    Q_threshold = 1e-2

    for episode in range(episodes):
        # Lateral disturbance force changes every 10 episodes.
        if episode < 10:
            force_x = 0.0
        elif episode < 20:
            force_x = +500.0
        else:
            force_x = -500.0
        env.start_state(force_x, 0.0)
        state = env.reset()
        score = 0

        while True:
            with torch.no_grad():
                state = torch.from_numpy(state).float()
                Q_values = qnetwork(state).data.numpy()
                action_pred_dist = softmax(human(state).data).numpy()

            # Optimal action under the Q-network vs. an action sampled from
            # the behavior-cloned human policy.
            action_star = np.argmax(Q_values)
            action = np.random.choice(np.arange(4), p=action_pred_dist)
            loss = Q_values[action_star] - Q_values[action]
            # if loss > Q_threshold:
            #     action = action_star

            # env.render()
            state, reward, done, _ = env.step(action)
            score += reward
            if done:
                print("episode: ", episode, "score: ", score)
                break

        scores.append(score)

    env.close()
    print("The average score is: ", np.mean(np.array(scores)))
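# QNetwork's definition does not appear in this excerpt; the sketch below is only an
# assumed stand-in showing the interface main() relies on (an 8-dimensional state in,
# 4 action values out). Hidden-layer sizes are guesses, not the repo's values, and the
# same caveat applies to MLP, which is not sketched here.
import torch
import torch.nn as nn


class QNetwork(nn.Module):
    """Assumed DQN-style value network: state (8) -> Q-values over 4 actions."""

    def __init__(self, state_size=8, action_size=4, seed=0, hidden=64):
        super(QNetwork, self).__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, action_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)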
def __init__(self):
    super(R_MLP, self).__init__()
    self.name = "models/r_model.pt"
    self.n_steps = 10

    # Load the pretrained, behavior-cloned human model and put it in eval mode.
    self.human = MLP()
    model_dict = torch.load("models/h_model.pt", map_location='cpu')
    self.human.load_state_dict(model_dict)
    self.human.eval()

    # Robot layers: 4-dimensional input -> 2-dimensional output.
    self.rc_1 = nn.Linear(4, 8)
    self.rc_2 = nn.Linear(8, 8)
    self.rc_3 = nn.Linear(8, 2)
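# The wrapped human model above is loaded from a checkpoint and is normally kept fixed
# while the rc_* layers train. A minimal sketch of one way to do that (the optimizer and
# learning rate are assumptions, not taken from this repo):
model = R_MLP()
for param in model.human.parameters():
    param.requires_grad = False  # exclude the behavior-cloned human net from updates
optimizer = torch.optim.Adam(
    [p for p in model.parameters() if p.requires_grad], lr=1e-3)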
def __init__(self):
    super(I_MLP_MLP, self).__init__()

    # Current behavior-cloned human model, loaded from its checkpoint and put in eval mode.
    self.bc_human = MLP()
    model_dict = torch.load(self.bc_human.name, map_location='cpu')
    self.bc_human.load_state_dict(model_dict)
    self.bc_human.eval()

    self.name = "models/inf_robot.pt"

    # Two small heads: each maps a 4-dimensional input to a 2-dimensional output.
    self.fc_1 = nn.Linear(4, 4)
    self.fc_2 = nn.Linear(4, 2)
    self.rc_1 = nn.Linear(4, 4)
    self.rc_2 = nn.Linear(4, 2)
import sys
import time
import pickle  # needed only if the commented-out data dump below is enabled
import gym
import numpy as np
import torch

# QNetwork, MLP, and the Joystick interface are defined elsewhere in the repo.


def main():
    env = gym.make("LanderCustom-v0")

    # Initial lateral disturbance force and intervention threshold from the command line.
    fx_init = float(sys.argv[1])
    Q_threshold = float(sys.argv[2])
    savename = 'test1.pkl'

    joystick = Joystick()

    # Load the pretrained Q-network and the behavior-cloned human model.
    qnetwork = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork.load_state_dict(torch.load('basic_lander.pth'))
    qnetwork.eval()
    human = MLP()
    human.load_state_dict(torch.load('mlp_model.pt'))
    human.eval()

    episodes = 10
    scores = []
    data = []
    env.start_state(fx_init, 0.0)

    for episode in range(episodes):
        state = env.reset()
        env.render()
        score = 0

        # Wait for the start button before the episode begins.
        while True:
            action, start, stop = joystick.input()
            if start:
                break

        while True:
            # Read the human's joystick action and log the (state, action) pair.
            action, start, stop = joystick.input()
            data.append(list(state) + [action])

            with torch.no_grad():
                state = torch.from_numpy(state).float()
                Q_values = qnetwork(state).data.numpy()
                action_pred_dist = human(state).data.numpy()

            action_star = np.argmax(Q_values)
            action_pred = np.argmax(action_pred_dist)
            # action = action_pred

            # Override the human's action when its Q-value falls too far below optimal.
            loss = Q_values[action_star] - Q_values[action]
            if loss > Q_threshold:
                action = action_star

            env.render()
            state, reward, done, _ = env.step(action)
            score += reward
            if done or stop:
                print(episode, score)
                # pickle.dump(data, open(savename, "wb"))
                break
            time.sleep(0.025)

        scores.append(score)

    env.close()
    print(scores)
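# The Joystick class is not shown in this excerpt; only its interface is visible above
# (input() returning an action index, a start flag, and a stop flag). The pygame-based
# sketch below is a hypothetical stand-in with an assumed stick-to-action mapping, not
# the repo's implementation. Example invocation (script name hypothetical):
#     python lander_joystick.py 500.0 0.01
import pygame


class Joystick(object):
    """Hypothetical gamepad reader matching the (action, start, stop) interface used above."""

    def __init__(self):
        pygame.init()
        pygame.joystick.init()
        self.gamepad = pygame.joystick.Joystick(0)
        self.gamepad.init()

    def input(self):
        pygame.event.pump()
        x = self.gamepad.get_axis(0)
        y = self.gamepad.get_axis(1)
        # LunarLander actions: 0 = noop, 1 = fire left engine, 2 = fire main, 3 = fire right.
        action = 0
        if y < -0.5:
            action = 2
        elif x < -0.5:
            action = 1
        elif x > 0.5:
            action = 3
        start = bool(self.gamepad.get_button(0))
        stop = bool(self.gamepad.get_button(1))
        return action, start, stop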