Example #1
        # load a previously saved expert Q-table from disk
        with open('q_vals.pickle', 'rb') as handle:
            self.Q_expert = pickle.load(handle)
        print(self.Q_expert)


if __name__ == '__main__':
    # DiscretisedEnv
    env = DiscretisedEnv(gym.make('CartPole-v0'))

    # hyperparameters
    n_episodes = 1000
    goal_duration = 150
    decay_steps = 5000
    all_rewards = list()
    durations = collections.deque(maxlen=100)
    Epsilon = AnnealingSchedule(start=1.0, end=0.01, decay_steps=decay_steps)
    Alpha = AnnealingSchedule(start=1.0, end=0.01, decay_steps=decay_steps)
    agent = Q_Agent(env)
    agent.load_Q()

    global_timestep = tf.train.get_or_create_global_step()
    for episode in range(n_episodes):
        current_state = env.reset()

        done = False
        duration = 0

        # one episode of q learning
        while not done:
            # env.render()
            duration += 1
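
The episode loop is truncated at this point in the listing. For orientation, a tabular Q-learning step with an annealed learning rate reduces to the update sketched below; this is a minimal, self-contained sketch with a hypothetical q_update helper and the textbook update rule, not code from the repository:

import numpy as np

def q_update(Q, state, action, reward, next_state, done, alpha, gamma=0.99):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    target = reward + (0.0 if done else gamma * np.max(Q[next_state]))
    Q[state][action] += alpha * (target - Q[state][action])

# toy usage: a 2-state, 2-action table
Q = np.zeros((2, 2))
q_update(Q, state=0, action=1, reward=1.0, next_state=1, done=False, alpha=0.1)
print(Q)  # Q[0][1] becomes 0.1
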
Example #2
                        default=100,
                        type=int,
                        help="game env type")
    args = parser.parse_args()

    if args.mode == "CartPole":
        env = MyWrapper(gym.make("CartPole-v0"))
    elif args.mode == "Atari":
        env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))

    params = Parameters(algo="DQfD", mode=args.mode)
    params.num_episodes = args.num_episodes
    replay_buffer = PrioritizedReplayBuffer(
        params.memory_size, alpha=params.prioritized_replay_alpha)
    Beta = AnnealingSchedule(start=params.prioritized_replay_beta_start,
                             end=params.prioritized_replay_beta_end,
                             decay_steps=params.decay_steps)
    agent = DQfD(args.mode, Model, Model, env.action_space.n, params,
                 logdirs.model_DQN)
    if params.policy_fn == "Eps":
        Epsilon = AnnealingSchedule(start=params.epsilon_start,
                                    end=params.epsilon_end,
                                    decay_steps=params.decay_steps)
        policy = EpsilonGreedyPolicy_eager(Epsilon_fn=Epsilon)
    elif params.policy_fn == "Boltzmann":
        policy = BoltzmannQPolicy_eager()

    reward_buffer = deque(maxlen=params.reward_buffer_ep)
    summary_writer = tf.contrib.summary.create_file_writer(logdirs.log_DQfD)

    expert = DQN(args.mode, Model_CartPole_DQN, Model_CartPole_DQN,

Example #3
parser.add_argument("--google_colab",
                    default=False,
                    type=bool,
                    help="if you are executing this on GoogleColab")
params = parser.parse_args()
params.goal = 195
params.test_episodes = 10
params.prioritized_replay_alpha = 0.6
params.prioritized_replay_beta_start = 0.4
params.prioritized_replay_beta_end = 1.0
params.prioritized_replay_noise = 1e-6

replay_buffer = PrioritizedReplayBuffer(params.memory_size,
                                        alpha=params.prioritized_replay_alpha)
Beta = AnnealingSchedule(start=params.prioritized_replay_beta_start,
                         end=params.prioritized_replay_beta_end,
                         decay_steps=params.decay_steps)
Epsilon = AnnealingSchedule(start=params.epsilon_start,
                            end=params.epsilon_end,
                            decay_steps=params.decay_steps)
policy = EpsilonGreedyPolicy_eager(Epsilon_fn=Epsilon)

reward_buffer = deque(maxlen=params.reward_buffer_ep)
anneal_lr = AnnealingSchedule(start=0.0025,
                              end=0.00025,
                              decay_steps=params.decay_steps,
                              decay_type="linear")
# RMSPropOptimizer positional args: learning_rate, decay, momentum, epsilon
optimizer = tf.train.RMSPropOptimizer(anneal_lr.get_value(), 0.99, 0.0, 1e-6)
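
An epsilon-greedy policy such as the EpsilonGreedyPolicy_eager used above typically reduces to the logic below. This is a minimal sketch under that assumption, not the tf_rl implementation:

import numpy as np

def epsilon_greedy(q_values, epsilon, rng=np.random):
    # with probability epsilon take a random action, otherwise be greedy
    if rng.random() < epsilon:
        return rng.randint(len(q_values))  # explore
    return int(np.argmax(q_values))        # exploit

# usage: epsilon comes from the annealing schedule at the current step
print(epsilon_greedy(q_values=[0.1, 0.5, 0.2], epsilon=0.05))
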
Example #4
    if params.google_colab:
        # mount your drive on google colab
        from google.colab import drive

        drive.mount("/content/gdrive")
        params.log_dir = "/content/gdrive/My Drive/logs/logs/DQN/{}".format(
            params.env_name)
        params.model_dir = "/content/gdrive/My Drive/logs/models/DQN/{}".format(
            params.env_name)
        os.makedirs(params.log_dir, exist_ok=True)
        os.makedirs(params.model_dir, exist_ok=True)
        assert os.path.isdir(
            params.log_dir
        ), "Failed to create a directory on your My Drive, please check it"
        assert os.path.isdir(
            params.model_dir
        ), "Failed to create a directory on your My Drive, please check it"

    # agent construction is identical on Colab and locally
    agent = DQN(params, env.action_space.n)

Epsilon = AnnealingSchedule(start=params.epsilon_start,
                            end=params.epsilon_end,
                            decay_steps=params.decay_steps)
policy = EpsilonGreedyPolicy(Epsilon_fn=Epsilon)
replay_buffer = ReplayBuffer(params.memory_size)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)
train_DQN(agent, env, policy, replay_buffer, reward_buffer, params,
          summary_writer)
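
In contrast to the PrioritizedReplayBuffer of the earlier examples, the plain ReplayBuffer here samples transitions uniformly. A minimal sketch of that idea (assumed behavior, not the tf_rl implementation):

import random
from collections import deque

class SimpleReplayBuffer:
    """Fixed-size FIFO buffer with uniform random sampling (sketch)."""
    def __init__(self, capacity):
        self.storage = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # uniform sampling; a prioritized buffer would weight by TD error instead
        return random.sample(self.storage, batch_size)
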
Example #5
        # apply processed gradients to the network
        self.optimizer.apply_gradients(zip(grads,
                                           self.model.trainable_weights))

        return loss, batch_loss


if __name__ == '__main__':
    env = gym.make('MountainCarContinuous-v0')

    # hyperparameters
    all_rewards = list()
    params = Parameters(algo="DQN", mode="CartPole")
    Epsilon = AnnealingSchedule(start=params.epsilon_start,
                                end=params.epsilon_end,
                                decay_steps=params.decay_steps)
    Alpha = AnnealingSchedule(start=params.epsilon_start,
                              end=params.epsilon_end,
                              decay_steps=params.decay_steps)
    agent = Continuous_Q_Agent(env, params)
    global_step = 0

    for episode in range(params.num_episodes):
        state = env.reset()
        episode_loss = 0
        episode_reward = 0

        for t in itertools.count():
            # env.render()
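
The apply_gradients call at the top of this example is the tail of a standard eager-mode training step. A self-contained sketch of the whole pattern, using the TF 1.x eager API these examples target (the model and loss here are placeholders):

import tensorflow as tf

tf.enable_eager_execution()  # TF 1.x eager mode, matching these examples

model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)

def train_step(states, targets):
    with tf.GradientTape() as tape:
        q_values = model(states)
        loss = tf.reduce_mean(tf.square(targets - q_values))  # MSE loss
    grads = tape.gradient(loss, model.trainable_weights)
    # apply gradients to the network, as in the snippet above
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss

print(train_step(tf.ones((4, 3)), tf.zeros((4, 2))))
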
Example #6
from tf_rl.common.utils import AnnealingSchedule
from tf_rl.common.params import Parameters
from tf_rl.common.policy import EpsilonGreedyPolicy

params = Parameters("CartPole")
Epsilon = AnnealingSchedule(start=params.epsilon_start,
                            end=params.epsilon_end,
                            decay_steps=params.decay_steps)
policy = EpsilonGreedyPolicy(Epsilon_fn=Epsilon)
num_episodes = 80

for ep in range(num_episodes):
    print(Epsilon.get_value(ep))
    policy.index_episode = ep
    print(policy.current_epsilon())
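
Every example above constructs an AnnealingSchedule from start, end, and decay_steps. The library's internals are not shown in these snippets, but a linear schedule with that interface can be sketched as follows (the class body is an assumption; only the constructor parameters come from the examples):

class LinearAnnealingSchedule:
    """Linearly interpolate from start to end over decay_steps, then hold end."""
    def __init__(self, start, end, decay_steps):
        self.start, self.end, self.decay_steps = start, end, decay_steps

    def get_value(self, step):
        fraction = min(step, self.decay_steps) / self.decay_steps
        return self.start + fraction * (self.end - self.start)

schedule = LinearAnnealingSchedule(start=1.0, end=0.01, decay_steps=5000)
print(schedule.get_value(0))     # 1.0
print(schedule.get_value(2500))  # 0.505
print(schedule.get_value(5000))  # 0.01
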