max_episode_steps=10) nb_steps = 4000 agent = VPG(env, MLP_Multihead, gamma=1, verbose=False, learning_rate=1e-3, regularize=REGULARIZE, lam=LAMBDA) print(agent.seed) agent.learn(timesteps=nb_steps) obs, _ = env.reset() scores = [] for rp in RANDOMIZATION_SPACE: first_action = 0 env = GridworldEnv(randomized_params=[rp], randomize=True, regularize=False, randomization_space=[rp], goal_reward=GOAL_REWARD, lava_reward=LAVA_REWARD, step_reward=STEP_REWARD, out_of_grid=OUT_OF_GRID_REWARD, max_episode_steps=10) obs, _ = env.reset() score = 0
ct = 0 for i in range(4): for j in range(4): if state == (i + 1, j + 1): print(True) obs = ct ct += 1 return obs for i in range(NUM_EPISODES): state = env.reset() #state = convert(env.agentPos) steps = 0 while True: if np.random.sample() < EPSILON: action = np.random.choice(env.action_space.n) else: action = np.argmax(Q[state]) next_state, reward, done, _ = env.step(action) #next_state = convert(env.agentPos) #if done and reward > 0: # next_state = 15
def her_experiment():
    """Compare DQN training with and without Hindsight Experience Replay (HER)
    on a 30x30 gridworld, averaged over five fixed seeds, then plot and pickle
    the per-epoch episode durations.

    Side effects: prints the initial env observation per seed, draws a plot via
    `line_plot_var`, and writes `./results/her_[30, 30].pkl` containing the
    tuple (x_epochs, means, l_stds, h_stds).
    """
    # --- experiment hyperparameters ---
    batch_size = 256
    discount_factor = 0.8
    learn_rate = 1e-3
    num_hidden = 128
    num_episodes = 2        # episodes collected per epoch (small on purpose)
    epochs = 200
    training_steps = 10     # gradient steps per epoch
    memory_size = 100000
    # her = False
    # seeds = [42, 30, 2,19,99]
    # This is not randomly chosen
    seeds = [42, 30, 2, 19, 99]
    shape = [30, 30]
    # Goals are the four corner cells of an x-by-y grid (flat state indices).
    targets = lambda x, y: [0, x * y - 1, x - 1, (y - 1) * x]
    env = GridworldEnv(shape=shape, targets=targets(*shape))

    # --- goal helpers for the grid world ---
    def sample_goal():
        # Pick one of the corner targets uniformly at random.
        # NOTE(review): np.random.choice(..., 1) returns a length-1 array,
        # not a scalar — presumably run_her_episodes expects that; confirm.
        return np.random.choice(env.targets, 1)

    # Flat index of the active cell in a one-hot state, as a 1-D array.
    # assumes `state` is a one-hot encoding over grid cells — TODO confirm.
    extract_goal = lambda state: np.reshape(np.array(np.argmax(state)), -1)

    def calc_reward(state, action, goal):
        # Sparse reward: 0 at the goal, -1 everywhere else (action unused).
        if state == goal:
            return 0.0
        else:
            return -1.0

    # # maze
    # def sample_goal():
    #     return env.maze.end_pos
    # extract_goal = lambda state: np.reshape(np.array(np.argmax(state)),-1)
    # def calc_reward(state, action, goal):
    #     if state == goal:
    #         return 0.0
    #     else:
    #         return -1.0

    # Aggregated curves: one entry per condition (HER first, then random replay).
    means = []
    x_epochs = []
    l_stds = []   # mean - std (lower band for the plot)
    h_stds = []   # mean + std (upper band for the plot)
    for her in [True, False]:
        episode_durations_all = []
        for seed in seeds:
            # Seed every RNG in play so both conditions see identical streams.
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            env.seed(seed)
            print(env.reset())
            # Fresh replay buffer per seed so runs are independent.
            memory = ReplayMemory(memory_size)
            if her:
                # model = QNetwork(env.observation_space.shape[0]+2, num_hidden, env.action_space.n)
                # HER input is state concatenated with goal, hence 2x the
                # observation size — presumably both are one-hot; verify.
                model = QNetwork(2 * env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train, model, memory, env, num_episodes, training_steps,
                    epochs, batch_size, discount_factor, learn_rate,
                    sample_goal, extract_goal, calc_reward, use_her=True)
            else:
                # Baseline: plain replay, state-only input.
                model = QNetwork(env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train, model, memory, env, num_episodes, training_steps,
                    epochs, batch_size, discount_factor, learn_rate,
                    sample_goal, extract_goal, calc_reward, use_her=False)
            # Smooth each seed's duration curve (window 10) before averaging.
            episode_durations_all.append(
                loop_environments.smooth(episode_durations, 10))
        # Per-epoch mean and sample std (ddof=1) across the five seeds.
        mean = np.mean(episode_durations_all, axis=0)
        means.append(mean)
        std = np.std(episode_durations_all, ddof=1, axis=0)
        l_stds.append(mean - std)
        h_stds.append(mean + std)
        x_epochs.append(list(range(len(mean))))
        # print(len(mean),mean,std)

    # Plot both conditions with their +/-1 std bands, then persist raw curves.
    line_plot_var(x_epochs, means, l_stds, h_stds, "Epoch", "Duration",
                  ["HindsightReplay", "RandomReplay"],
                  "Episode duration per epoch", ["orange", "blue"])
    name = "her_" + str(shape)
    file_name = os.path.join("./results", name)
    with open(file_name + ".pkl", "wb") as f:
        pickle.dump((x_epochs, means, l_stds, h_stds), f)
if __name__ == '__main__': size_x = 4 size_y = 4 env = GridworldEnv(size_x, size_y) env.make_start(0, 0) env.make_goal(0, 3) env.make_goal(3, 0) agent = GridworldAgent(size_x, size_y) total_episodes = 1000 for i in range(total_episodes): obs = env.reset() agent.reset() agent.append_trajectory(t_step=0, prev_action=None, observation=obs, reward=None, done=None) prev_action = agent.pick_action(obs) while True: # --- time step rolls here --- #print('---- time step {0} ----'.format(env.t_step))