import gym
import numpy as np
import torch

# NOTE: QNetwork (the DQN architecture used to train the saved policies) is assumed
# to be importable from elsewhere in this repo, e.g.:
# from dqn_model import QNetwork


def get_optimal_tilt(episodes):
    # roll out the trained "tilt" policy in the CartPole environment and
    # record the action taken at each state
    env = gym.make("CartpoleTheta-v0")
    qnetwork = QNetwork(state_size=4, action_size=2, seed=0)
    qnetwork.load_state_dict(torch.load("models/cartpole_tilt.pth"))
    qnetwork.eval()
    softmax = torch.nn.Softmax(dim=1)
    dataset = []
    for episode in range(episodes):
        state = env.reset(theta="tilt")
        xi = []
        for t in range(500):
            # greedy action from the q-network
            with torch.no_grad():
                state_t = torch.from_numpy(state).float().unsqueeze(0)
                action_values = qnetwork(state_t)
                action_values = softmax(action_values).cpu().data.numpy()[0]
                action = np.argmax(action_values)
            # log [action, state] for this timestep
            xi.append([action] + list(state))
            # env.render()  # can always toggle visualization
            next_state, _, done, _ = env.step(action)
            state = next_state
            if done:
                dataset.append(xi)
                break
    env.close()
    return dataset
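
# Minimal usage sketch (not part of the original repo): collect optimal tilt
# demonstrations and pickle them to disk. The savename "data/cartpole_tilt.pkl"
# and the episode count are illustrative assumptions. Each row of a trajectory
# is [action] followed by the 4-dimensional CartPole state.
def _demo_collect_tilt(n_episodes=10, savename="data/cartpole_tilt.pkl"):
    import pickle
    demos = get_optimal_tilt(n_episodes)  # list of trajectories, one per episode
    with open(savename, "wb") as f:
        pickle.dump(demos, f)
    return demos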
def get_human(episodes, t_delay=8, type="regular"):
    # simulate a human demonstrator in the LunarLander environment using the
    # trained "center" policy as the underlying intent
    env = gym.make('LunarLanderTheta-v0')
    qnetwork = QNetwork(state_size=8, action_size=4, seed=1)
    qnetwork.load_state_dict(torch.load("models/dqn_center.pth"))
    qnetwork.eval()
    softmax = torch.nn.Softmax(dim=1)
    dataset = []
    # demonstration style: how early the human may stop acting and how noisy they are
    if type == "regular":
        stoptime_lb = 200
        noise_threshold = 0.0
    elif type == "noise":
        stoptime_lb = 200
        noise_threshold = 0.2
    elif type == "counterfactual":
        stoptime_lb = 0
        noise_threshold = 0.2
    for episode in range(episodes):
        stoptime = np.random.randint(stoptime_lb, 201)
        state = env.reset(theta="center")
        xi = []
        action = 0
        episode_reward = 0
        for t in range(1000):
            if t < stoptime and t % t_delay == 0:
                # query the q-network every t_delay steps until the stop time
                with torch.no_grad():
                    state_t = torch.from_numpy(state).float().unsqueeze(0)
                    action_values = qnetwork(state_t)
                    action_values = softmax(action_values).cpu().data.numpy()[0]
                    action = np.argmax(action_values)
            elif t > stoptime:
                # after the stop time the human does nothing
                action = 0
            if np.random.random() < noise_threshold and t < 200:
                # inject a random action during the first 200 steps
                action = np.random.randint(0, 4)
            # env.render()  # can always toggle visualization
            next_state, _, done, info = env.step(action)
            awake = info["awake"]
            reward = info["reward"]
            # log [time, action, lander-awake flag, raw state] for this timestep
            xi.append([t] + [action] + [awake] + [state])
            state = next_state
            episode_reward += reward
            if done:
                print("\rReward: {:.2f}\tLanded: {}\tTheta: {}"
                      .format(episode_reward, awake, "center"), end="")
                dataset.append(xi)
                break
    env.close()
    return dataset
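
# Sketch of how the three simulated-human variants above might be collected
# side by side (the episode count and the output dict are illustrative
# assumptions). "regular" acts until at least t=200 with no noise, "noise"
# adds a random action 20% of the time, and "counterfactual" may stop acting
# anywhere in [0, 200).
def _collect_human_variants(episodes_per_type=25):
    human_data = {}
    for demo_type in ["regular", "noise", "counterfactual"]:
        human_data[demo_type] = get_human(episodes_per_type, type=demo_type)
    return human_data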
def gen_traj(episodes, t_delay=8, theta=None):
    # load environment
    env = gym.make('LunarLanderTheta-v0')
    # load our trained q-network for the given theta
    path = "models/dqn_" + theta + ".pth"
    qnetwork = QNetwork(state_size=8, action_size=4, seed=1)
    qnetwork.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
    qnetwork.eval()
    softmax = torch.nn.Softmax(dim=1)
    dataset = []
    for episode in range(episodes):
        state = env.reset(theta=theta)
        xi = []
        episode_reward = 0
        for t in range(1000):
            if t % t_delay == 0:
                # query the q-network every t_delay steps, otherwise repeat the last action
                with torch.no_grad():
                    state_t = torch.from_numpy(state).float().unsqueeze(0)
                    action_values = qnetwork(state_t)
                    action_values = softmax(action_values).cpu().data.numpy()[0]
                    action = np.argmax(action_values)
            # env.render()  # can always toggle visualization
            next_state, _, done, info = env.step(action)
            awake = info["awake"]
            reward = info["reward"]
            # log [time, action, lander-awake flag, raw state] for this timestep
            xi.append([t] + [action] + [awake] + [state])
            state = next_state
            episode_reward += reward
            if done:
                print("\rReward: {:.2f}\tLanded: {}\tTheta: {}"
                      .format(episode_reward, awake, theta), end="")
                dataset.append(xi)
                break
    env.close()
    return dataset
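
# Helper sketch for reading back one timestep stored by gen_traj / get_human
# above. Each entry of a trajectory is [t, action, awake, state], where state
# is the raw 8-dimensional LunarLander observation (a numpy array). The dict
# keys returned here are illustrative, not part of the original code.
def _unpack_timestep(step):
    t, action, awake, state = step
    return {"t": t, "action": action, "landed": awake, "state": state}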

def get_human(episodes, t_delay=8, type="regular"):
    # simulate a human demonstrator in the CartPole environment using the
    # trained "up" policy as the underlying intent
    env = gym.make("CartpoleTheta-v0")
    qnetwork = QNetwork(state_size=4, action_size=2, seed=0)
    qnetwork.load_state_dict(torch.load("models/cartpole_up.pth"))
    qnetwork.eval()
    softmax = torch.nn.Softmax(dim=1)
    dataset = []
    # demonstration style: how early the human may stop acting and how noisy they are
    if type == "regular":
        stoptime_lb = 200
        noise_threshold = 0.0
    elif type == "noise":
        stoptime_lb = 200
        noise_threshold = 0.05
    elif type == "counterfactual":
        stoptime_lb = 0
        noise_threshold = 0.0
    for episode in range(episodes):
        stoptime = np.random.randint(stoptime_lb, 201)
        state = env.reset(theta="up")
        xi = []
        action = 0
        for t in range(500):
            if t < stoptime and t % t_delay == 0:
                # query the q-network every t_delay steps until the stop time
                with torch.no_grad():
                    state_t = torch.from_numpy(state).float().unsqueeze(0)
                    action_values = qnetwork(state_t)
                    action_values = softmax(action_values).cpu().data.numpy()[0]
                    action = np.argmax(action_values)
            if np.random.random() < noise_threshold:
                # inject a random action
                action = np.random.randint(0, 2)
            # log [action, state] for this timestep
            xi.append([action] + list(state))
            # img = env.render(mode="rgb_array")  # can always toggle visualization
            next_state, _, done, _ = env.step(action)
            state = next_state
            if done:
                dataset.append(xi)
                break
    env.close()
    return dataset
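
# Sketch: t_delay models the simulated human's reaction time -- a new action is
# only chosen every t_delay steps and otherwise the previous one is repeated.
# The particular delay values below are illustrative assumptions.
def _collect_reaction_times(episodes=10):
    return {delay: get_human(episodes, t_delay=delay, type="regular")
            for delay in [4, 8, 16]}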