Code Example #1
    # NOTE: the constructor call was truncated in the original excerpt; the
    # keyword arguments below are assumed to mirror the evaluation env
    # created later in this same example.
    env = GridworldEnv(goal_reward=GOAL_REWARD,
                       lava_reward=LAVA_REWARD,
                       step_reward=STEP_REWARD,
                       out_of_grid=OUT_OF_GRID_REWARD,
                       max_episode_steps=10)

    nb_steps = 4000

    agent = VPG(env,
                MLP_Multihead,
                gamma=1,
                verbose=False,
                learning_rate=1e-3,
                regularize=REGULARIZE,
                lam=LAMBDA)
    print(agent.seed)

    agent.learn(timesteps=nb_steps)

    obs, _ = env.reset()
    scores = []
    for rp in RANDOMIZATION_SPACE:
        first_action = 0
        env = GridworldEnv(randomized_params=[rp],
                           randomize=True,
                           regularize=False,
                           randomization_space=[rp],
                           goal_reward=GOAL_REWARD,
                           lava_reward=LAVA_REWARD,
                           step_reward=STEP_REWARD,
                           out_of_grid=OUT_OF_GRID_REWARD,
                           max_episode_steps=10)

        obs, _ = env.reset()
        score = 0
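
The excerpt ends at `score = 0`. A minimal sketch of how the per-parameter evaluation presumably continues, assuming a greedy `agent.act` inference method (the method name is an assumption) and the five-value gymnasium step API implied by `obs, _ = env.reset()`:

        # Hypothetical continuation: roll out one episode per randomized
        # parameter and record its return.
        done = False
        while not done:
            action = agent.act(obs)  # assumed inference API of the VPG agent
            obs, reward, terminated, truncated, _ = env.step(action)
            score += reward
            done = terminated or truncated
        scores.append(score)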
Code Example #2
import numpy as np


def convert(state):
    # Map a 1-indexed (row, col) position on the 4x4 grid to a flat
    # state index in 0..15 by scanning the grid row by row.
    ct = 0
    for i in range(4):
        for j in range(4):
            if state == (i + 1, j + 1):
                obs = ct
            ct += 1
    return obs


for i in range(NUM_EPISODES):

    state = env.reset()
    #state = convert(env.agentPos)
    steps = 0

    while True:

        if np.random.sample() < EPSILON:
            action = np.random.choice(env.action_space.n)
        else:
            action = np.argmax(Q[state])

        next_state, reward, done, _ = env.step(action)

        #next_state = convert(env.agentPos)
        #if done and reward > 0:
        #    next_state = 15
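
The excerpt breaks off after the transition is sampled. The standard tabular Q-learning update that presumably follows is sketched below; `ALPHA` and `GAMMA` are assumed hyperparameters not shown in the excerpt:

        # Hypothetical continuation: tabular Q-learning update and bookkeeping.
        Q[state][action] += ALPHA * (
            reward + GAMMA * np.max(Q[next_state]) - Q[state][action])
        state = next_state
        steps += 1
        if done:
            break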
Code Example #3
File: her_experiment.py  Project: m0re4u/rl-project
import os
import pickle
import random

import numpy as np
import torch

# GridworldEnv, QNetwork, ReplayMemory, run_her_episodes, train,
# loop_environments and line_plot_var come from the surrounding project.


def her_experiment():
    batch_size = 256
    discount_factor = 0.8
    learn_rate = 1e-3
    num_hidden = 128
    num_episodes = 2
    epochs = 200
    training_steps = 10
    memory_size = 100000
    seeds = [42, 30, 2, 19, 99]  # fixed seeds, not randomly chosen
    shape = [30, 30]
    # Targets are the four corner cells of the flattened x-by-y grid.
    targets = lambda x, y: [0, x * y - 1, x - 1, (y - 1) * x]
    env = GridworldEnv(shape=shape, targets=targets(*shape))

    # functions for grid world
    def sample_goal():
        return np.random.choice(env.targets, 1)

    # The goal is the index of the hot entry in the one-hot state vector.
    extract_goal = lambda state: np.reshape(np.array(np.argmax(state)), -1)

    def calc_reward(state, action, goal):
        if state == goal:
            return 0.0
        else:
            return -1.0
    # Maze variant (commented out in the original):
    # def sample_goal():
    #     return env.maze.end_pos
    # extract_goal = lambda state: np.reshape(np.array(np.argmax(state)), -1)
    # def calc_reward(state, action, goal):
    #     if state == goal:
    #         return 0.0
    #     else:
    #         return -1.0

    means = []
    x_epochs = []
    l_stds = []
    h_stds = []
    for her in [True, False]:
        episode_durations_all = []
        for seed in seeds:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            env.seed(seed)
            print(env.reset())
            memory = ReplayMemory(memory_size)
            if her:
                # model = QNetwork(env.observation_space.shape[0]+2, num_hidden, env.action_space.n)
                model = QNetwork(2 * env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train,
                    model,
                    memory,
                    env,
                    num_episodes,
                    training_steps,
                    epochs,
                    batch_size,
                    discount_factor,
                    learn_rate,
                    sample_goal,
                    extract_goal,
                    calc_reward,
                    use_her=True)
            else:
                model = QNetwork(env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train,
                    model,
                    memory,
                    env,
                    num_episodes,
                    training_steps,
                    epochs,
                    batch_size,
                    discount_factor,
                    learn_rate,
                    sample_goal,
                    extract_goal,
                    calc_reward,
                    use_her=False)

            episode_durations_all.append(
                loop_environments.smooth(episode_durations, 10))
        mean = np.mean(episode_durations_all, axis=0)
        means.append(mean)
        std = np.std(episode_durations_all, ddof=1, axis=0)
        l_stds.append(mean - std)
        h_stds.append(mean + std)
        x_epochs.append(list(range(len(mean))))
        # print(len(mean),mean,std)
    line_plot_var(x_epochs, means, l_stds, h_stds, "Epoch", "Duration",
                  ["HindsightReplay", "RandomReplay"],
                  "Episode duration per epoch", ["orange", "blue"])
    name = "her_" + str(shape)
    file_name = os.path.join("./results", name)

    with open(file_name + ".pkl", "wb") as f:
        pickle.dump((x_epochs, means, l_stds, h_stds), f)
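
The HER branch builds its network with input size `2 * env.observation_space.n`, which presumably corresponds to concatenating a one-hot state with a one-hot goal. A minimal, self-contained sketch of that encoding (the function name `encode_state_goal` is an assumption, not part of the project):

import numpy as np

def encode_state_goal(state, goal, n):
    # Concatenate one-hot encodings of state and goal: a vector of length
    # 2 * n, matching QNetwork(2 * env.observation_space.n, ...) above.
    x = np.zeros(2 * n, dtype=np.float32)
    x[state] = 1.0
    x[n + goal] = 1.0
    return x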
Code Example #4
if __name__ == '__main__':

    size_x = 4
    size_y = 4

    env = GridworldEnv(size_x, size_y)
    env.make_start(0, 0)
    env.make_goal(0, 3)
    env.make_goal(3, 0)

    agent = GridworldAgent(size_x, size_y)

    total_episodes = 1000
    for i in range(total_episodes):

        obs = env.reset()
        agent.reset()

        agent.append_trajectory(t_step=0,
                                prev_action=None,
                                observation=obs,
                                reward=None,
                                done=None)

        prev_action = agent.pick_action(obs)

        while True:

            #      --- time step rolls here ---
            #print('----  time step {0}  ----'.format(env.t_step))
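
The excerpt stops at the top of the time-step loop. A sketch of how the loop presumably continues, reusing the `append_trajectory` and `pick_action` calls shown above (the classic four-value gym step API and the `env.t_step` counter referenced in the comment are assumptions):

            # Hypothetical continuation: advance the environment, record the
            # transition, and pick the next action until the episode ends.
            obs, reward, done, _ = env.step(prev_action)
            agent.append_trajectory(t_step=env.t_step,
                                    prev_action=prev_action,
                                    observation=obs,
                                    reward=reward,
                                    done=done)
            if done:
                break
            prev_action = agent.pick_action(obs)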