def run(seed, episodes, evaluation_episodes, batch_size, gamma,
        inverting_gradients, initial_memory_threshold, replay_memory_size,
        epsilon_steps, tau_actor, tau_actor_param, use_ornstein_noise,
        learning_rate_actor, learning_rate_actor_param, epsilon_final,
        zero_index_gradients, initialise_params, scale_actions, clip_grad,
        split, indexed, layers, multipass, weighted, average, random_weighted,
        render_freq, save_freq, save_dir, save_frames, visualise,
        action_input_layer, title):

    if save_freq > 0 and save_dir:
        save_dir = os.path.join(save_dir, title + "{}".format(str(seed)))
        os.makedirs(save_dir, exist_ok=True)
    assert not (save_frames and visualise)
    if visualise:
        assert render_freq > 0
    if save_frames:
        assert render_freq > 0
        vidir = os.path.join(save_dir, "frames")
        os.makedirs(vidir, exist_ok=True)

    env = gym.make('Platform-v0')
    initial_params_ = [3., 10., 400.]
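    # Initial continuous-parameter values for the environment's three discrete actions
    # (run, hop, leap); when scale_actions is set they are linearly rescaled into
    # [-1, 1] to match the scaled action wrappers applied below.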
    if scale_actions:
        for a in range(env.action_space.spaces[0].n):
            initial_params_[a] = 2. * (
                initial_params_[a] - env.action_space.spaces[1].spaces[a].low
            ) / (env.action_space.spaces[1].spaces[a].high -
                 env.action_space.spaces[1].spaces[a].low) - 1.

    env = ScaledStateWrapper(env)  # scale the state space to [-1, 1]
    env = PlatformFlattenedActionWrapper(env)  # flatten the action space
    if scale_actions:  # scale the action-parameter space to [-1, 1]
        env = ScaledParameterisedActionWrapper(env)

    dir = os.path.join(save_dir, title)
    env = Monitor(env,
                  directory=os.path.join(dir, str(seed)),
                  video_callable=False,
                  write_upon_reset=False,
                  force=True)
    env.seed(seed)
    np.random.seed(seed)

    print(env.observation_space)

    from agents.pdqn import PDQNAgent
    from agents.pdqn_split import SplitPDQNAgent
    from agents.pdqn_multipass import MultiPassPDQNAgent
    assert not (split and multipass)
    agent_class = PDQNAgent
    if split:
        agent_class = SplitPDQNAgent
    elif multipass:
        agent_class = MultiPassPDQNAgent
    agent = agent_class(env.observation_space.spaces[0],
                        env.action_space,
                        batch_size=batch_size,
                        learning_rate_actor=learning_rate_actor,
                        learning_rate_actor_param=learning_rate_actor_param,
                        epsilon_steps=epsilon_steps,
                        gamma=gamma,
                        tau_actor=tau_actor,
                        tau_actor_param=tau_actor_param,
                        clip_grad=clip_grad,
                        indexed=indexed,
                        weighted=weighted,
                        average=average,
                        random_weighted=random_weighted,
                        initial_memory_threshold=initial_memory_threshold,
                        use_ornstein_noise=use_ornstein_noise,
                        replay_memory_size=replay_memory_size,
                        epsilon_final=epsilon_final,
                        inverting_gradients=inverting_gradients,
                        actor_kwargs={
                            'hidden_layers': layers,
                            'action_input_layer': action_input_layer,
                        },
                        actor_param_kwargs={
                            'hidden_layers': layers,
                            'squashing_function': False,
                            'output_layer_init_std': 0.0001,
                        },
                        zero_index_gradients=zero_index_gradients,
                        seed=seed)

    if initialise_params:
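        # Zero passthrough weights with initial_params_ as biases make the
        # action-parameter network initially output these constant values for any state.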
        initial_weights = np.zeros((env.action_space.spaces[0].n,
                                    env.observation_space.spaces[0].shape[0]))
        initial_bias = np.zeros(env.action_space.spaces[0].n)
        for a in range(env.action_space.spaces[0].n):
            initial_bias[a] = initial_params_[a]
        agent.set_action_parameter_passthrough_weights(initial_weights,
                                                       initial_bias)
    print(agent)
    max_steps = 250
    total_reward = 0.
    returns = []
    start_time = time.time()
    video_index = 0
    # agent.epsilon_final = 0.
    # agent.epsilon = 0.
    # agent.noise = None

    for i in range(episodes):
        if save_freq > 0 and save_dir and i % save_freq == 0:
            agent.save_models(os.path.join(save_dir, str(i)))
        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)
        if visualise and i % render_freq == 0:
            env.render()

        act, act_param, all_action_parameters = agent.act(state)
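        # pad_action packs the discrete action and its parameter into the joint-action
        # format expected by the environment, zero-padding the unselected actions' parameters.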
        action = pad_action(act, act_param)

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):

            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_action_parameters = agent.act(
                next_state)
            next_action = pad_action(next_act, next_act_param)
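            # Store the transition in replay memory and, once initial_memory_threshold
            # samples have been collected, update the Q-network and parameter network.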
            agent.step(state, (act, all_action_parameters), reward, next_state,
                       (next_act, next_all_action_parameters), terminal, steps)
            act, act_param, all_action_parameters = next_act, next_act_param, next_all_action_parameters
            action = next_action
            state = next_state

            episode_reward += reward
            if visualise and i % render_freq == 0:
                env.render()

            if terminal:
                break
        agent.end_episode()

        if save_frames and i % render_freq == 0:
            video_index = env.unwrapped.save_render_states(
                vidir, title, video_index)

        returns.append(episode_reward)
        total_reward += episode_reward
        if i % 100 == 0:
            print('{0:5s} R:{1:.4f} r100:{2:.4f}'.format(
                str(i), total_reward / (i + 1),
                np.array(returns[-100:]).mean()))
    end_time = time.time()
    print("Took %.2f seconds" % (end_time - start_time))
    env.close()
    if save_freq > 0 and save_dir:
        agent.save_models(os.path.join(save_dir, str(i)))

    returns = env.get_episode_rewards()
    print("Ave. return =", sum(returns) / len(returns))
    print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.)

    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =",
              sum(evaluation_returns) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))),
                evaluation_returns)
Example #2
def run(seed, episodes, evaluation_episodes, batch_size, gamma,
        inverting_gradients, initial_memory_threshold, replay_memory_size,
        epsilon_steps, epsilon_final, tau_actor, tau_actor_param,
        tau_actor_param_critic, use_ornstein_noise, learning_rate_actor,
        learning_rate_actor_param, learning_rate_actor_param_critic,
        reward_scale, clip_grad, title, scale_actions, zero_index_gradients,
        split, layers, multipass, indexed, weighted, average, random_weighted,
        render_freq, action_input_layer, initialise_params, save_freq,
        save_dir, save_frames, visualise):

    env = gym.make('Goal-v0')
    env = GoalObservationWrapper(env)

    if save_freq > 0 and save_dir:
        save_dir = os.path.join(save_dir, title + "{}".format(str(seed)))
        os.makedirs(save_dir, exist_ok=True)
    assert not (save_frames and visualise)
    if visualise:
        assert render_freq > 0
    if save_frames:
        assert render_freq > 0
        vidir = os.path.join(save_dir, "frames")
        os.makedirs(vidir, exist_ok=True)

    if scale_actions:
        kickto_weights = np.array(
            [[-0.375, 0.5, 0, 0.0625, 0],
             [0, 0, 0.8333333333333333333, 0, 0.111111111111111111111111]])
        shoot_goal_left_weights = np.array([0.857346647646219686, 0])
        shoot_goal_right_weights = np.array([-0.857346647646219686, 0])
    else:
        xfear = 50.0 / PITCH_LENGTH
        yfear = 50.0 / PITCH_WIDTH
        caution = 5.0 / PITCH_WIDTH
        kickto_weights = np.array([[2.5, 1, 0, xfear, 0],
                                   [0, 0, 1 - caution, 0, yfear]])
        shoot_goal_left_weights = np.array([GOAL_WIDTH / 2 - 1, 0])
        shoot_goal_right_weights = np.array([-GOAL_WIDTH / 2 + 1, 0])
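    # The hand-coded kick-to / shoot-goal linear policies above are embedded into the
    # passthrough layer below; the column indices select the observation features each
    # policy reads.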

    initial_weights = np.zeros((4, 20))  # np.zeros((4, 17))
    initial_weights[0, [10, 11, 14, 15]] = kickto_weights[0, 1:]
    initial_weights[1, [10, 11, 14, 15]] = kickto_weights[1, 1:]
    initial_weights[2, 16] = shoot_goal_left_weights[1]
    initial_weights[3, 16] = shoot_goal_right_weights[1]

    initial_bias = np.zeros((4, ))
    initial_bias[0] = kickto_weights[0, 0]
    initial_bias[1] = kickto_weights[1, 0]
    initial_bias[2] = shoot_goal_left_weights[0]
    initial_bias[3] = shoot_goal_right_weights[0]

    if not scale_actions:
        # rescale initial action-parameters for a scaled state space
        for a in range(env.action_space.spaces[0].n):
            mid = (env.observation_space.spaces[0].high +
                   env.observation_space.spaces[0].low) / 2.
            initial_bias[a] += np.sum(initial_weights[a] * mid)
            initial_weights[a] = (initial_weights[a] *
                                  env.observation_space.spaces[0].high -
                                  initial_weights[a] * mid)

    env = GoalFlattenedActionWrapper(env)
    if scale_actions:
        env = ScaledParameterisedActionWrapper(env)
    env = ScaledStateWrapper(env)
    dir = os.path.join(save_dir, title)
    env = Monitor(env,
                  directory=os.path.join(dir, str(seed)),
                  video_callable=False,
                  write_upon_reset=False,
                  force=True)
    env.seed(seed)
    np.random.seed(seed)

    assert not (split and multipass)
    agent_class = HHQNAgent

    agent = agent_class(
        observation_space=env.observation_space.spaces[0],
        action_space=env.action_space,
        batch_size=batch_size,
        learning_rate_actor=learning_rate_actor,  # 0.0001
        learning_rate_actor_param=learning_rate_actor_param,  # 0.001
        learning_rate_actor_param_critic=learning_rate_actor_param_critic,
        epsilon_steps=epsilon_steps,
        epsilon_final=epsilon_final,
        gamma=gamma,
        clip_grad=clip_grad,
        indexed=indexed,
        average=average,
        random_weighted=random_weighted,
        tau_actor=tau_actor,
        weighted=weighted,
        tau_actor_param=tau_actor_param,
        tau_actor_param_critic=tau_actor_param_critic,
        initial_memory_threshold=initial_memory_threshold,
        use_ornstein_noise=use_ornstein_noise,
        replay_memory_size=replay_memory_size,
        inverting_gradients=inverting_gradients,
        actor_kwargs={
            'hidden_layers': layers,
            'output_layer_init_std': 1e-5,
            'action_input_layer': action_input_layer,
        },
        actor_param_kwargs={
            'hidden_layers': layers,
            'output_layer_init_std': 1e-5,
            'squashing_function': False
        },
        zero_index_gradients=zero_index_gradients,
        seed=seed)

    if initialise_params:
        agent.set_action_parameter_passthrough_weights(initial_weights,
                                                       initial_bias)
    print(agent)
    max_steps = 150
    total_reward = 0.
    returns = []
    start_time = time.time()
    video_index = 0
    Reward = []
    possibility = []
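    # Reward tracks the running average return; possibility tracks the empirical
    # success rate (an episode return of 50 corresponds to scoring a goal).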
    for i in range(episodes):
        if save_freq > 0 and save_dir and i % save_freq == 0:
            agent.save_models(os.path.join(save_dir, str(i)))

        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)
        act, act_param, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        if visualise and i % render_freq == 0:
            env.render()

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_action_parameters = agent.act(
                next_state)
            next_action = pad_action(next_act, next_act_param)
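            # Scale the reward for the learning update; the unscaled reward is still
            # used for the logged episode return.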
            r = reward * reward_scale
            agent.step(state, (act, all_action_parameters), r, next_state,
                       (next_act, next_all_action_parameters), terminal, steps)
            act, act_param, all_action_parameters = next_act, next_act_param, next_all_action_parameters
            action = next_action
            state = next_state
            episode_reward += reward

            if visualise and i % render_freq == 0:
                env.render()

            if terminal:
                break
        agent.end_episode()

        if save_frames:
            video_index = env.unwrapped.save_render_states(
                vidir, title, video_index)

        returns.append(episode_reward)
        total_reward += episode_reward
        if (i + 1) % 100 == 0:
            print('{0:5s} R:{1:.5f} P(S):{2:.4f}'.format(
                str(i + 1), total_reward / (i + 1),
                (np.array(returns) == 50.).sum() / len(returns)))
        Reward.append(total_reward / (i + 1))
        possibility.append((np.array(returns) == 50.).sum() / len(returns))

    plot_reward(Reward)
    plot_p(possibility)
    end_time = time.time()
    print("Training took %.2f seconds" % (end_time - start_time))
    env.close()

    if save_freq > 0 and save_dir:
        agent.save_models(os.path.join(save_dir, str(i)))

    returns = env.get_episode_rewards()
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =",
              sum(evaluation_returns) / len(evaluation_returns))
        print("Ave. evaluation prob. =",
              sum(evaluation_returns == 50.) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))),
                evaluation_returns)
Example #3
def run(seed, episodes, evaluation_episodes, batch_size, gamma,
        inverting_gradients, initial_memory_threshold, replay_memory_size,
        epsilon_steps, tau_actor, tau_actor_param, use_ornstein_noise,
        learning_rate_actor, learning_rate_actor_param, epsilon_final,
        zero_index_gradients, initialise_params, scale_actions, clip_grad,
        split, indexed, layers, multipass, weighted, average, random_weighted,
        render_freq, save_freq, save_dir, save_frames, visualise,
        action_input_layer, title, window):
    pic_name = filename_generator("./results/imgs/", "capacity60-5-10", seed,
                                  title)
    print(pic_name)
    if save_freq > 0 and save_dir:
        save_dir = os.path.join(save_dir, title + "{}".format(str(seed)))
        os.makedirs(save_dir, exist_ok=True)
    assert not (save_frames and visualise)
    if visualise:
        assert render_freq > 0
    if save_frames:
        assert render_freq > 0
        vidir = os.path.join(save_dir, "frames")
        os.makedirs(vidir, exist_ok=True)

    env = gym.make('Cloud-v0')
    # initial_params_ = [0.0, 0.0, 0.0]
    initial_params_ = [0.5, 0.5, 0.5]
    # initial_params_ = [1.0, 1.0, 1.0]
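    # Initial continuous-parameter values for the discrete actions; rescaled to
    # [-1, 1] below when scale_actions is set.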
    if scale_actions:
        for a in range(env.action_space.spaces[0].n):
            initial_params_[a] = 2. * (
                initial_params_[a] - env.action_space.spaces[1].spaces[a].low
            ) / (env.action_space.spaces[1].spaces[a].high -
                 env.action_space.spaces[1].spaces[a].low) - 1.

    env = ScaledStateWrapper(env)  # scale the state space to [-1, 1]
    env = PlatformFlattenedActionWrapper(env)  # flatten the action space
    if scale_actions:  # scale the action-parameter space to [-1, 1]
        env = ScaledParameterisedActionWrapper(env)

    # dir = os.path.join(save_dir,title)
    # env = Monitor(env, directory=os.path.join(dir,str(seed)), video_callable=False, write_upon_reset=False, force=True)
    # env.seed(seed)
    np.random.seed(seed)

    print(env.observation_space)

    from agents.pdqn import PDQNAgent
    from agents.pdqn_split import SplitPDQNAgent
    from agents.pdqn_multipass import MultiPassPDQNAgent
    assert not (split and multipass)
    agent_class = PDQNAgent
    if split:
        agent_class = SplitPDQNAgent
    elif multipass:
        agent_class = MultiPassPDQNAgent
    agent = agent_class(env.observation_space.spaces[0],
                        env.action_space,
                        batch_size=batch_size,
                        learning_rate_actor=learning_rate_actor,
                        learning_rate_actor_param=learning_rate_actor_param,
                        epsilon_steps=epsilon_steps,
                        gamma=gamma,
                        tau_actor=tau_actor,
                        tau_actor_param=tau_actor_param,
                        clip_grad=clip_grad,
                        indexed=indexed,
                        weighted=weighted,
                        average=average,
                        random_weighted=random_weighted,
                        initial_memory_threshold=initial_memory_threshold,
                        use_ornstein_noise=use_ornstein_noise,
                        replay_memory_size=replay_memory_size,
                        epsilon_final=epsilon_final,
                        inverting_gradients=inverting_gradients,
                        actor_kwargs={
                            'hidden_layers': layers,
                            'action_input_layer': action_input_layer,
                        },
                        actor_param_kwargs={
                            'hidden_layers': layers,
                            'squashing_function': False,
                            'output_layer_init_std': 0.0001,
                        },
                        zero_index_gradients=zero_index_gradients,
                        seed=seed,
                        spot_bound=-0.4167)  # <=8

    if initialise_params:
        initial_weights = np.zeros((env.action_space.spaces[0].n,
                                    env.observation_space.spaces[0].shape[0]))
        initial_bias = np.zeros(env.action_space.spaces[0].n)
        for a in range(env.action_space.spaces[0].n):
            initial_bias[a] = initial_params_[a]
        agent.set_action_parameter_passthrough_weights(initial_weights,
                                                       initial_bias)
    print(agent)
    max_steps = 5000
    total_reward = 0.
    returns = []
    start_time = time.time()
    video_index = 0
    # agent.epsilon_final = 0.
    # agent.epsilon = 0.
    # agent.noise = None
    best = -float("inf")

    for i in range(episodes):
        if save_freq > 0 and save_dir and i % save_freq == 0:
            agent.save_models(os.path.join(save_dir, str(i)))
        state = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):

            ret = env.step(
                action
            )  # execute action in environment, and observe next state
            next_state, reward, terminal, _ = ret  # obtain result
            next_state = np.array(next_state, dtype=np.float32,
                                  copy=False)  # convert to nparray

            next_act, next_act_param, next_all_action_parameters = agent.act(
                next_state)  # choose action according to next state
            next_action = pad_action(
                next_act, next_act_param)  # package action and param
            agent.step(
                state,
                (act, all_action_parameters),
                reward,
                next_state,  # add sample and learn
                (next_act, next_all_action_parameters),
                terminal,
                time_steps=1)
            act, act_param, all_action_parameters = next_act, next_act_param, next_all_action_parameters  # transfer state and action
            action = next_action
            state = next_state

            episode_reward += reward  # calculate the episode reward

            if terminal:
                break
        agent.end_episode()

        returns.append(episode_reward)
        total_reward += episode_reward

        if episode_reward > best:
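            # Record the best episode return seen so far and write it (multiplied by
            # 500) to results/res.txt.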
            best = episode_reward
            with open('results/res.txt', "w") as f:
                f.write(str(best * 500.0))

        print('Episode{0:5s} R:{1:.4f} Avg:{2:.4f} r10:{3:.4f}'.format(
            str(i), episode_reward, total_reward / (i + 1),
            np.array(returns[-window:]).mean()))

        if visualise and i % window == 0 and i != 0:
            plot_window_reward(returns, filename=pic_name, window=window)
            # plot_reward(returns, filename=pic_name)
    end_time = time.time()
    print("Took %.2f seconds" % (end_time - start_time))
    print(best * 500.0)
    # env.close()
    if save_freq > 0 and save_dir:
        agent.save_models(os.path.join(save_dir, str(i)))

    print("Ave. return =", sum(returns) / len(returns))
    print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.)
Example #4
def run(seed, episodes, evaluation_episodes, batch_size, gamma,
        inverting_gradients, initial_memory_threshold, replay_memory_size,
        scale_actions, epsilon_steps, epsilon_final, tau_actor, tau_critic,
        use_ornstein_noise, learning_rate_actor, learning_rate_critic,
        reward_scale, clip_grad, initialise_params, layers, save_dir, title):
    env = gym.make('Goal-v0')
    env = GoalObservationWrapper(env)

    if scale_actions:
        kickto_weights = np.array(
            [[-0.375, 0.5, 0, 0.0625, 0],
             [0, 0, 0.8333333333333333333, 0, 0.111111111111111111111111]])
        shoot_goal_left_weights = np.array([0.857346647646219686, 0])
        shoot_goal_right_weights = np.array([-0.857346647646219686, 0])
    else:
        xfear = 50.0 / PITCH_LENGTH
        yfear = 50.0 / PITCH_WIDTH
        caution = 5.0 / PITCH_WIDTH
        kickto_weights = np.array([[2.5, 1, 0, xfear, 0],
                                   [0, 0, 1 - caution, 0, yfear]])
        shoot_goal_left_weights = np.array([GOAL_WIDTH / 2 - 1, 0])
        shoot_goal_right_weights = np.array([-GOAL_WIDTH / 2 + 1, 0])

    initial_weights = np.zeros((4, 17))
    initial_weights[0, [10, 11, 14, 15]] = kickto_weights[0, 1:]
    initial_weights[1, [10, 11, 14, 15]] = kickto_weights[1, 1:]
    initial_weights[2, 16] = shoot_goal_left_weights[1]
    initial_weights[3, 16] = shoot_goal_right_weights[1]

    initial_bias = np.zeros((4, ))
    initial_bias[0] = kickto_weights[0, 0]
    initial_bias[1] = kickto_weights[1, 0]
    initial_bias[2] = shoot_goal_left_weights[0]
    initial_bias[3] = shoot_goal_right_weights[0]

    env = GoalFlattenedActionWrapper(env)
    if scale_actions:
        env = ScaledParameterisedActionWrapper(env)
    env = ScaledStateWrapper(env)
    dir = os.path.join(save_dir, title)
    env = Monitor(env,
                  directory=os.path.join(dir, str(seed)),
                  video_callable=False,
                  write_upon_reset=False,
                  force=True)
    print(env.action_space)
    print(env.observation_space)
    env.seed(seed)
    np.random.seed(seed)
    agent = PADDPGAgent(
        observation_space=env.observation_space.spaces[0],
        action_space=env.action_space,
        batch_size=batch_size,
        learning_rate_actor=learning_rate_actor,
        learning_rate_critic=learning_rate_critic,
        epsilon_steps=epsilon_steps,
        epsilon_final=epsilon_final,
        gamma=gamma,
        clip_grad=clip_grad,
        tau_actor=tau_actor,
        tau_critic=tau_critic,
        initial_memory_threshold=initial_memory_threshold,
        use_ornstein_noise=use_ornstein_noise,
        replay_memory_size=replay_memory_size,
        inverting_gradients=inverting_gradients,
        n_step_returns=False,
        adam_betas=(0.9, 0.999),
        critic_kwargs={
            'hidden_layers': layers,
            'init_type': "kaiming"
        },
        actor_kwargs={
            'hidden_layers': layers,
            'init_type': "kaiming",  # 'init_std': 1e-5,  # 0.0001,
            'squashing_function': False
        },
        seed=seed)

    if initialise_params:
        agent.set_action_parameter_passthrough_weights(initial_weights,
                                                       initial_bias)
    print(agent)
    max_steps = 150
    total_reward = 0.
    returns = []
    start_time = time.time()

    log_f = open("log_paddpg_GoalEnv.txt", "w+")

    for i in range(episodes):
        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_actions, all_action_parameters = agent.act(state)
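        # PADDPG returns the chosen discrete action and its parameter together with the
        # actor's full discrete-action output and all action parameters, which are
        # stored for the learning update.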
        action = pad_action(act, act_param)

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_actions, next_all_action_parameters = agent.act(
                next_state)
            next_action = pad_action(next_act, next_act_param)

            r = reward * reward_scale
            agent.step(state,
                       (act, act_param, all_actions, all_action_parameters),
                       r,
                       next_state, (next_act, next_act_param, next_all_actions,
                                    next_all_action_parameters),
                       terminal,
                       optimise=True)

            act, act_param, all_actions, all_action_parameters = next_act, next_act_param, next_all_actions, next_all_action_parameters
            action = next_action
            state = next_state
            episode_reward += reward

            if terminal:
                break
        agent.end_episode()

        returns.append(episode_reward)
        total_reward += episode_reward
        if (i + 1) % 100 == 0:
            print('{0:5s} R:{1:.5f} P(S):{2:.4f}'.format(
                str(i + 1), total_reward / (i + 1),
                (np.array(returns) == 50.).sum() / len(returns)))

            # Columns, left to right: episode number, episode reward, average return
            # over all past episodes, mean return of the last 100 episodes, and
            # success rate.
            log_f.write('{},{},{},{},{}\n'.format(
                i, episode_reward, total_reward / (i + 1),
                np.array(returns[-100:]).mean(),
                (np.array(returns) == 50.).sum() / len(returns)))

            log_f.flush()

    end_time = time.time()
    print("Took %.2f seconds" % (end_time - start_time))
    env.close()
    print(agent)

    returns = env.get_episode_rewards()
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =",
              sum(evaluation_returns) / len(evaluation_returns))
        print("Ave. evaluation prob. =",
              sum(evaluation_returns == 50.) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))),
                evaluation_returns)
Example #5
def run(seed, episodes, evaluation_episodes, batch_size, gamma,
        inverting_gradients, initial_memory_threshold, replay_memory_size,
        save_dir, epsilon_steps, epsilon_final, tau_actor, tau_critic,
        use_ornstein_noise, learning_rate_actor, learning_rate_critic,
        clip_grad, layers, initialise_params, title):
    env = gym.make('Platform-v0')
    env = ScaledStateWrapper(env)

    initial_params_ = [3., 10., 400.]
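    # Rescale the initial action-parameter values into [-1, 1] to match the scaled
    # action wrapper applied below.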
    for a in range(env.action_space.spaces[0].n):
        initial_params_[a] = 2. * (
            initial_params_[a] - env.action_space.spaces[1].spaces[a].low) / (
                env.action_space.spaces[1].spaces[a].high -
                env.action_space.spaces[1].spaces[a].low) - 1.

    env = PlatformFlattenedActionWrapper(env)
    env = ScaledParameterisedActionWrapper(env)

    dir = os.path.join(save_dir, title)
    env = Monitor(env,
                  directory=os.path.join(dir, str(seed)),
                  video_callable=False,
                  write_upon_reset=False,
                  force=True)
    env.seed(seed)
    np.random.seed(seed)

    agent = PADDPGAgent(observation_space=env.observation_space.spaces[0],
                        action_space=env.action_space,
                        batch_size=batch_size,
                        learning_rate_actor=learning_rate_actor,
                        learning_rate_critic=learning_rate_critic,
                        epsilon_steps=epsilon_steps,
                        epsilon_final=epsilon_final,
                        gamma=gamma,
                        clip_grad=clip_grad,
                        tau_actor=tau_actor,
                        tau_critic=tau_critic,
                        initial_memory_threshold=initial_memory_threshold,
                        use_ornstein_noise=use_ornstein_noise,
                        replay_memory_size=replay_memory_size,
                        inverting_gradients=inverting_gradients,
                        adam_betas=(0.9, 0.999),
                        critic_kwargs={
                            'hidden_layers': layers,
                            'init_type': "kaiming"
                        },
                        actor_kwargs={
                            'hidden_layers': layers,
                            'init_type': "kaiming",
                            'init_std': 0.0001,
                            'squashing_function': False
                        },
                        seed=seed)
    print(agent)
    if initialise_params:
        initial_weights = np.zeros((env.action_space.spaces[0].n,
                                    env.observation_space.spaces[0].shape[0]))
        initial_bias = np.zeros(env.action_space.spaces[0].n)
        for a in range(env.action_space.spaces[0].n):
            initial_bias[a] = initial_params_[a]
        agent.set_action_parameter_passthrough_weights(initial_weights,
                                                       initial_bias)

    max_steps = 250
    total_reward = 0.
    returns = []
    start_time = time.time()
    for i in range(episodes):
        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_actions, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_actions, next_all_action_parameters = agent.act(
                next_state)
            next_action = pad_action(next_act, next_act_param)
            agent.step(state,
                       (act, act_param, all_actions, all_action_parameters),
                       reward, next_state,
                       (next_act, next_act_param, next_all_actions,
                        next_all_action_parameters), terminal, steps)
            act, act_param, all_actions, all_action_parameters = next_act, next_act_param, next_all_actions, next_all_action_parameters
            action = next_action
            state = next_state  # .copy()

            episode_reward += reward

            if terminal:
                break
        agent.end_episode()

        returns.append(episode_reward)
        total_reward += episode_reward
        if (i + 1) % 100 == 0:
            print('{0:5s} R:{1:.5f}'.format(str(i + 1),
                                            total_reward / (i + 1)))
    end_time = time.time()
    print("Took %.2f seconds" % (end_time - start_time))

    env.close()

    returns = env.get_episode_rewards()
    print("Ave. return =", sum(returns) / len(returns))
    print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.)
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =",
              sum(evaluation_returns) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))),
                evaluation_returns)