Example #1
def main():
    state_size = 17
    action_size = 4
    buffer_size = 1024
    batch_size = 32
    num_steps = 4096
    num_samples = 1024
    num_repeat = 10

    gym_memory = GymReplayBuffer(buffer_size)
    memory = ReplayBuffer(state_size, action_size, buffer_size, batch_size, 0)

    # Make some convenient aliases.
    n = num_steps
    ns = state_size
    na = action_size

    # Generate random experiences ...
    states = np.zeros((n, ns), dtype=np.float32)
    actions = np.random.randint(0, na, n)
    rewards = np.random.uniform(0, 1, n)
    next_states = np.zeros((n, ns), dtype=np.float32)
    dones = np.random.randint(2, size=n, dtype=bool)

    ts = []
    ts.append(time.time())

    print('Memory')
    for _ in range(num_repeat):
        for s0, a, r, s1, d in zip(states, actions, rewards, next_states, dones):
            memory.add(s0, a, r, s1, d)
    ts.append(time.time())
    for _ in range(num_repeat):
        for _ in range(num_samples):
            sample = memory.sample()
    ts.append(time.time())

    print('Gym-Memory')
    for _ in range(num_repeat):
        for s0, a, r, s1, d in zip(states, actions, rewards, next_states, dones):
            gym_memory.add(s0, a, r, s1, d)
    ts.append(time.time())
    for _ in range(num_repeat):
        for _ in range(num_samples):
            sample = gym_memory.sample(batch_size)
    ts.append(time.time())

    print('Result')
    print(np.diff(ts))
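
For context, here is a minimal sketch of the kind of preallocated, NumPy-backed buffer that the ReplayBuffer(state_size, action_size, buffer_size, batch_size, seed) call above suggests. The class name and internals are assumptions, not the benchmarked implementation; the point is that contiguous arrays keep add/sample cheap compared with a Python-object deque.

import numpy as np


class ArrayReplayBuffer:
    """Hypothetical ring buffer that stores transitions in preallocated arrays."""

    def __init__(self, state_size, action_size, buffer_size, batch_size, seed):
        # action_size is kept for interface parity; discrete actions are stored as indices.
        self.rng = np.random.RandomState(seed)
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.pos = 0          # next write index
        self.full = False     # True once the buffer has wrapped around
        self.states = np.zeros((buffer_size, state_size), dtype=np.float32)
        self.actions = np.zeros(buffer_size, dtype=np.int64)
        self.rewards = np.zeros(buffer_size, dtype=np.float32)
        self.next_states = np.zeros((buffer_size, state_size), dtype=np.float32)
        self.dones = np.zeros(buffer_size, dtype=bool)

    def add(self, s0, a, r, s1, d):
        i = self.pos
        self.states[i], self.actions[i], self.rewards[i] = s0, a, r
        self.next_states[i], self.dones[i] = s1, d
        self.pos = (self.pos + 1) % self.buffer_size
        self.full = self.full or self.pos == 0

    def sample(self):
        # Sample uniformly from the filled portion of the buffer.
        high = self.buffer_size if self.full else self.pos
        idx = self.rng.randint(0, high, size=self.batch_size)
        return (self.states[idx], self.actions[idx], self.rewards[idx],
                self.next_states[idx], self.dones[idx])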
Example #2
class DeepqLearner:
    def __init__(self, env, q_func, config=DEEPQ_CONFIG, callback=None):
        self.env = env
        self.q_func = q_func
        self.config = config
        self.callback = callback

        # Create all the functions necessary to train the model
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=config["gpu_memory_fraction"])
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        sess.__enter__()

        # capture the shape outside the closure so that the env object is not serialized
        # by cloudpickle when serializing make_obs_ph

        def make_obs_ph(name):
            return ObservationInput(env.observation_space, name=name)

        act, self.train, self.update_target, self.debug = deepq.build_train(
            make_obs_ph=make_obs_ph,
            q_func=q_func,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]),
            gamma=config["gamma"],
            grad_norm_clipping=10,
            param_noise=config["param_noise"])

        act_params = {
            # 'make_obs_ph': make_obs_ph,
            # 'q_func': q_func,
            'num_actions': env.action_space.n,
        }

        self.act = ActWrapper(act, act_params)

        # Create the replay buffer
        self.config = config
        self.replay_buffer = None
        self.beta_schedule = None
        self.make_replay_buffer()

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

        self.t = 0
        self.episode_rewards = [0.0]
        self.num_episodes = 1
        self.saved_mean_reward = None
        self.saved_episode_num = None
        self.episode_frames = 0
        self.model_file = None
        self.start_time = 0
        self.episode_start_time = 0

    def make_replay_buffer(self):
        if self.config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.config["buffer_size"],
                alpha=self.config["prioritized_replay_alpha"])
            if self.config["prioritized_replay_beta_iters"] is None:
                self.config["prioritized_replay_beta_iters"] = self.config[
                    "max_timesteps"]
            self.beta_schedule = LinearSchedule(
                self.config["prioritized_replay_beta_iters"],
                initial_p=self.config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.config["buffer_size"])
            self.beta_schedule = None

    def run(self):
        reset = True
        obs = self.env.reset()
        self.start_time = time.time()
        self.episode_start_time = time.time()

        with tempfile.TemporaryDirectory() as td:
            td = self.config["checkpoint_path"] or td

            self.model_file = os.path.join(td, "model")
            if tf.train.latest_checkpoint(td) is not None:
                load_state(self.model_file)
                logger.log('Loaded model from {}'.format(self.model_file))

            for self.t in range(self.config["max_timesteps"]):
                if self.callback is not None:
                    if self.callback(locals(), globals()):
                        break

                # Determine next action to take, then take that action and observe results
                action = self._action(obs, reset)
                env_action = action
                new_obs, rew, done, _ = self.env.step(env_action)
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                # Increment typical values
                reset = False
                self.episode_frames += 1
                self.episode_rewards[-1] += rew

                # See if done with episode
                if done:
                    obs = self._reset()
                    reset = True

                # Do training and deepq updating as needed
                if self.t > self.config["learning_starts"]:
                    if self.t % self.config["train_freq"] == 0:
                        self._train()
                    if self.t % self.config["target_network_update_freq"] == 0:
                        self.update_target()

    def _action(self, obs, reset):
        # Take action and update exploration to the newest value
        kwargs = {}
        if not self.config["param_noise"]:
            update_eps = self.exploration.value(self.t)
            # update_param_noise_threshold = 0.
        else:
            update_eps = 0.
            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
            # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
            # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
            # for detailed explanation.
            update_param_noise_threshold = -np.log(
                1. - self.exploration.value(self.t) +
                self.exploration.value(self.t) /
                float(self.env.action_space.n))
            kwargs['reset'] = reset
            kwargs[
                'update_param_noise_threshold'] = update_param_noise_threshold
            kwargs['update_param_noise_scale'] = True
        return self.act(np.array(obs)[None], update_eps=update_eps,
                        **kwargs)[0]

    def _train(self):
        try:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if self.config["prioritized_replay"]:
                experience = self.replay_buffer.sample(
                    self.config["batch_size"],
                    beta=self.beta_schedule.value(self.t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                    self.config["batch_size"])
                weights, batch_idxes = np.ones_like(rewards), None

            # Determine errors
            td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones,
                                   weights)
            if self.config["prioritized_replay"]:
                new_priorities = np.abs(
                    td_errors) + self.config["prioritized_replay_eps"]
                self.replay_buffer.update_priorities(batch_idxes,
                                                     new_priorities)
        except Exception as e:
            # If sampling or training fails, rebuild the replay buffer and keep going.
            self.make_replay_buffer()
            print(e)

    def _reset(self):
        self.attempt_print()
        self.attempt_checkpoint()
        self.episode_rewards.append(0.0)
        self.num_episodes += 1
        self.episode_frames = 0
        self.episode_start_time = time.time()

        return self.env.reset()

    def calc_mean_100ep_reward(self):
        if self.num_episodes <= 1:
            return None
        return round(np.mean(self.episode_rewards[-101:-1]), 1)

    def attempt_print(self):
        p_freq = self.config["print_freq"]
        if p_freq is not None and self.num_episodes % p_freq == 0:
            logger.record_tabular("% time spent exploring",
                                  int(100 * self.exploration.value(self.t)))
            logger.record_tabular("reward - current", self.episode_rewards[-1])
            logger.record_tabular("reward - mean",
                                  self.calc_mean_100ep_reward())
            logger.record_tabular("reward - saved", self.saved_mean_reward)
            logger.record_tabular("episode # - current", self.num_episodes)
            logger.record_tabular("episode # - saved", self.saved_episode_num)
            logger.record_tabular("steps - total", self.t)
            logger.record_tabular("steps - episode", self.episode_frames)
            logger.record_tabular(
                "time - ep duration",
                str(time.time() - self.episode_start_time) + "s")
            logger.record_tabular("time - remaining",
                                  self.estimate_time_remaining())
            logger.dump_tabular()

    def estimate_time_remaining(self):
        duration = time.time() - self.start_time
        if duration <= 0 or self.t == 0:
            return "Unknown"

        # Seconds remaining = average seconds per step * steps left.
        time_remaining = duration / self.t * (self.config["max_timesteps"] -
                                              self.t)
        suffix = ""

        # Format based on time
        if time_remaining < MINUTE:
            suffix = " seconds"
        elif time_remaining < HOUR:
            suffix = " minutes"
            time_remaining = time_remaining / MINUTE
        elif time_remaining < DAY:
            suffix = " hours"
            time_remaining = time_remaining / HOUR
        else:
            suffix = " days"
            time_remaining = time_remaining / DAY

        # Round remaining time and return
        time_remaining = round(time_remaining * 100.0) / 100.0
        return str(time_remaining) + suffix

    def attempt_checkpoint(self):
        # Determine if we're going to checkpoint
        c_freq = self.config["checkpoint_freq"]
        if c_freq is not None \
                and self.num_episodes > 100 \
                and self.t > self.config["learning_starts"] \
                and self.num_episodes % c_freq == 0:

            # Determine if reward is growing
            mean_100ep_reward = self.calc_mean_100ep_reward()
            if self.saved_mean_reward is None or mean_100ep_reward > self.saved_mean_reward:
                if self.config["print_freq"] is not None:
                    logger.log(
                        "Saving model due to mean reward increase: {} -> {}".
                        format(self.saved_mean_reward, mean_100ep_reward))
                self.saved_mean_reward = mean_100ep_reward
                self.saved_episode_num = self.num_episodes
                save_state(self.model_file)

    def save(self, save_path):
        print("Saving model to " + save_path)
        self.act.save(save_path)
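
A hypothetical driver for the class above, assuming a Gym environment and that DEEPQ_CONFIG supplies the keys referenced in __init__ (lr, gamma, buffer_size, max_timesteps, ...). The build_q_func call mirrors Example #6; none of these names come from this snippet itself.

import gym
from baselines.deepq.models import build_q_func

env = gym.make("CartPole-v0")
q_func = build_q_func('mlp', hiddens=[64])   # small fully connected Q-network
learner = DeepqLearner(env, q_func)          # falls back to the DEEPQ_CONFIG defaults
learner.run()                                # trains for config["max_timesteps"] steps
learner.save("cartpole_deepq.pkl")           # persists the wrapped act function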
Example #3
def main(_):
    print("Used flags:", FLAGS)
    config = configparser.ConfigParser()
    config.read(FLAGS.config_file)
    timer = time.time()

    ps_hosts = FLAGS.ps_hosts.split(",") if FLAGS.ps_hosts else config.get(FLAGS.config, 'ps_hosts').split(",")
    worker_hosts = FLAGS.worker_hosts.split(",") if FLAGS.worker_hosts else config.get(FLAGS.config, 'worker_hosts').split(",")
    job = FLAGS.job_name
    task = FLAGS.task_index
    learning_rate = config.getfloat(FLAGS.config, 'learning_rate')
    batch_size = config.getint(FLAGS.config, 'batch_size')
    memory_size = config.getint(FLAGS.config, 'memory_size')
    target_update = config.getint(FLAGS.config, 'target_update')
    seed = FLAGS.seed if FLAGS.seed else config.getint(FLAGS.config, 'seed')
    max_comm_rounds = config.getint(FLAGS.config, 'comm_rounds')
    epochs = config.getint(FLAGS.config, 'start_epoch')
    end_epoch = config.getint(FLAGS.config, 'end_epoch')
    epoch_decay = config.getint(FLAGS.config, 'epoch_decay')
    # epoch_decay_rate = (epochs - end_epoch) / epoch_decay
    epoch = LinearSchedule(epoch_decay, end_epoch, epochs)
    backup = config.getint(FLAGS.config, 'backup')  # unused in async
    sync = config.getboolean(FLAGS.config, 'sync')
    gradient_prio = False if not sync else config.getboolean(FLAGS.config, 'gradient_prio')
    sync_workers = len(worker_hosts)-backup
    mute = FLAGS.mute if FLAGS.mute else config.getboolean(FLAGS.config, 'mute')
    animate = 0
    draw = 0

    print("Config:\nps_hosts={}\nworker_hosts={}\njob_name={}\ntask_index={}\nlearning_rate={}\n"
          "batch_size={}\nmemory_size={}\ntarget_update={}\nseed={}\ncomm_rounds={}\nepochs={}\n"
          "end_epoch={}\nepoch_decay={}\nbackup={}\nsync={}"
          .format(ps_hosts, worker_hosts, job, task, learning_rate, batch_size, memory_size, target_update,
                  seed, max_comm_rounds, epochs, end_epoch, epoch_decay, backup, sync))

    cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
    chief = (job == 'worker' and task == 0)
    print("/job:", job, "/task:", task, " - Chief: ", chief, sep='')

    # Create server
    server = tf.train.Server(cluster, job_name=job, task_index=task)

    run_code = "{}-{}-p-{}-w-{}-E-{}-b-{}-m-{}-N-{}-lr-{}-B-{}-s-{}-".\
        format(datetime.now().strftime("%y%m%d-%H%M%S"), env_name, len(ps_hosts), len(worker_hosts),
               epochs, batch_size, memory_size, target_update, learning_rate, backup, seed)
    run_code += "-sync" if sync else "-async"

    # Set a unique random seed for each client
    seed = ((seed * 10) + task)
    random.seed(seed)

    if not mute:
        print("Run code:", run_code)

    # Start parameter servers
    if job == 'ps':
        server.join()

    # Start training
    with U.make_session(num_cpu=4, target=server.target) as sess:
        # Create the environment
        env = gym.make(env_name)
        env.seed(seed)
        tf.set_random_seed(seed)

        # Create all the functions necessary to train the model
        act, train, global_opt,  update_target, update_weights, sync_opt, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
            # optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
            chief=chief,
            server=server,
            workers=sync_workers
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(memory_size)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

        if not chief:
            if not mute:
                print("Worker {}/{} will sleep (4s) for chief to initialize variables".format(task+1, len(worker_hosts)))
            time.sleep(4)

        # Initialize the parameters and copy them to the target network.
        U.initialize(chief=chief)

        if chief:
            sess.run(debug['run_code'].assign(run_code))
            if not mute:
                print("Set global run code to:", run_code)

        if not mute:
            print("initialized variables, sleeping for 2 sec")
        time.sleep(2)

        if not chief:
            while not sess.run(tf.is_variable_initialized(debug['run_code'])):
                if not mute:
                    print("Global run code not yet initialized")
                time.sleep(2)
            run_code = str(sess.run(debug['run_code']).decode())
            if run_code == '':
                if not mute:
                    print("Run code empty. Trying to fetch again...")
                time.sleep(5)
                run_code = str(sess.run(debug['run_code']).decode())
            if not mute:
                print("Read global run code:", run_code)

        run_code += "(w" + str(task) + ")"
        print("Final run_code:", run_code)

        t_global_old = update_weights()[0][0]
        update_target()
        exp_gen = 1000  # For how many timesteps should we only generate experience (not train)
        t_start = exp_gen
        comm_rounds = 0
        comm_rounds_global = 0
        dt = 0
        write_csv(run_code, log=["episode", "reward" + str(task), "avg_reward" + str(task), "t_global", "cr"])

        episode_rewards = [0.0]
        cr_reward = 0
        obs = env.reset()
        for t in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            cr_reward += rew

            # Animate every <animate> episodes
            if not mute and chief and animate > 0 and (len(episode_rewards) % animate) == 0:
                if done:
                    print("ep", len(episode_rewards), "ended with reward:", episode_rewards[-1])
                env.render()

            if done:
                if not mute and chief and draw > 0 and len(episode_rewards) % draw == 0:
                    env.render()
                avg_rew = np.round(np.mean(np.array(episode_rewards[-100:])), 1)
                write_csv(run_code, [len(episode_rewards), episode_rewards[-1], avg_rew, debug['t_global']()[0], comm_rounds_global])

                obs = env.reset()
                episode_rewards.append(0)

            [converged] = sync_opt['check_converged']()
            is_solved = (t > 100 and np.mean(episode_rewards[-101:-1]) >= max_reward) or converged
            if is_solved or comm_rounds >= max_comm_rounds:
                sync_opt['set_converged']([True])
                if not mute:
                    print("Converged was set to", sync_opt['check_converged']()[0])
                write_csv_final(run_code, str(len(episode_rewards)), worker_hosts, chief, comm_rounds_global, mute)
                print("Converged after:  ", len(episode_rewards), "episodes")
                print("Agent total steps:", t)
                print("Global steps:     ", debug['t_global']()[0])
                sec = round(time.time() - timer)
                print("Total time:", sec // 3600, "h", (sec % 3600) // 60, "min", sec % 60, "s")
                return
            else:
                if t >= exp_gen:
                # if t >= batch_size:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    td_error = train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))

                    if t - t_start >= np.round(epoch.value(comm_rounds)):  

                        cr_old = comm_rounds_global

                        # Apply gradients to weights in PS
                        if sync:
                            # Tell the ps we are done and want to submit score
                            [[comm_rounds_global], [worker_count]] = sync_opt['request_submit']()

                            if comm_rounds_global == comm_rounds:
                                if worker_count <= sync_workers:
                                    # If allowed to submit score, do it
                                    [comm_rounds_global] = sync_opt['submit_score']([cr_reward])

                                    if chief: 
                                        [submits] = sync_opt['set_submit']([0])
                                        while worker_count != sync_workers:
                                            if sync_opt['check_converged']()[0]:
                                                if not mute:
                                                    print("Other worker converged! Finishing in check_wc")
                                                break
                                            worker_count = sync_opt['check_wc']()[0]

                                    while sync_opt['check_submit']()[0] == -1:
                                        if sync_opt['check_converged']()[0]:
                                            if not mute:
                                                print("Other worker converged! Finishing in check_submit")
                                            break
                                      
                                        pass

                                    if sync_opt['check_converged']()[0]:
                                        if not mute:
                                            print("Other worker converged! Continuing before submit")
                                        continue

                                    # Now all eligible workers have sent their score and gradient round has started
                                    # Submit gradient
                                    # TODO 4th argument overrides everything else unless it is set to -1 in the code
                                    [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old],
                                                                              [cr_reward], [1/len(worker_hosts)], [True])

                                    submits = sync_opt['inc_submit']()
                                    if chief:
                                        while not sync_opt['check_submit']()[0] == sync_workers:
                                            if sync_opt['check_converged']()[0]:
                                                if not mute:
                                                    print("Other worker converged! Finishing in check_submit (chief)")
                                                break
                                          
                                            pass
                                        # print("Round", comm_rounds, "finished")
                                        [w] = sync_opt['reset_wc']()[0]
                                        # print("Worker count reset to:", w)
                                        sync_opt['reset_score']()
                                        submits = sync_opt['set_submit']([-1])
                                        # print("Submit round finished. Submits set to:", submits[0])
                                        [r] = sync_opt['inc_comm_round']()[0]
                                        # print("New round started:", r)

                                    # Normal workers wait until GCR > CR
                                    if not chief:
                                        while sync_opt['check_round']()[0] <= comm_rounds:
                                            if sync_opt['check_converged']()[0]:
                                                if not mute:
                                                    print("Other worker converged! Finishing in check_round")
                                                break
                                            # print("Worker submitted, waiting for next round:", comm_rounds + 1)
                                            # time.sleep(0.1)
                                            pass

                                else: #elif worker_count > sync_workers:
                                    # If not allowed to submit score, wait for next round to start
                                    if not mute:
                                        print("Worker finished too late but before new round started (", comm_rounds_global, ")")
                                        print("WC(", worker_count, ") > N(", sync_workers, ")", sep="")
                                    target = np.floor(comm_rounds_global + 1)  # +1 if x.0, +0.5 if x.5
                                    while not sync_opt['check_round']()[0] >= target:
                                        pass

                            elif comm_rounds_global > comm_rounds:
                                # This means the worker is behind. Do nothing and start next round
                                if not mute:
                                    print("Communication round ", comm_rounds, "missed. Actual round:", comm_rounds_global)
                                # TODO How to handle round count when skipping rounds?
                                comm_rounds = comm_rounds_global - 1

                            elif comm_rounds_global < comm_rounds:
                                print("WARNING! Worker ahead of global:", comm_rounds, ">", comm_rounds_global)
                                time.sleep(5)

                        else:
                            sync_opt['inc_comm_round']()
                            [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old], [0], [-1], [False])

                        # Update the local weights with the new global weights from PS
                        t_global_old = update_weights()[0][0]

                        comm_rounds += 1
                        # print("Round finished. Increasing local comm_round to:", comm_rounds)
                        cr_reward = 0
                        # TODO RE-ENABLE comm-rounds LOGGING
                        # write_csv(run_code, [comm_rounds, t, dt, epoch.value(comm_rounds)], comm_rounds=True)

                        t_start = t
                if t % target_update == 0:
                    update_target()

            if not mute and done and len(episode_rewards) % 10 == 0:
                last_rewards = episode_rewards[-101:-1]
                logger.record_tabular("steps", t)
                logger.record_tabular("global steps", debug['t_global']()[0])
                logger.record_tabular("communication rounds", comm_rounds)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", np.round(np.mean(episode_rewards[-101:-1]), 4))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.record_tabular("last gradient factor", np.round(factor, 4))
                logger.dump_tabular()
                rew_ill = ['●' if x >= max_reward else str(int(np.floor(x / (max_reward/10)))) if x >= (max_reward/10) else '_' for x in last_rewards]
                streak = 0
                for i in reversed(rew_ill):
                    if i == "●":
                        streak += 1
                    else:
                        break
                # print("[" + ''.join(rew_ill) + "] ([● " + str(rew_ill.count('●')) + " | " + str(rew_ill.count('9')) +
                #       " | " + str(rew_ill.count('8')) + " | " + str(rew_ill.count('7')) +
                #       " | " + str(rew_ill.count('6')) + " | " + str(rew_ill.count('5')) +
                #       " | " + str(rew_ill.count('4')) + " | " + str(rew_ill.count('3')) +
                #       " | " + str(rew_ill.count('2')) + " | " + str(rew_ill.count('1')) +
                #       " | " + str(rew_ill.count('_')) + " _]/" + str(len(rew_ill)) + " {S:" + str(streak) + "})", sep='')
Example #4
                    replay_buffer.add(obs, action, rew, new_obs, float(done))
                    obs = new_obs
                env.render()
            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)

            is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
            if is_solved:
                # Show off the result
                print("Total Number of Episodes: ", len(episode_rewards))
                print("t final value: ", t)
                break
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("episode reward", episode_rewards[-2])
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()
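
Example #4 starts mid-loop, so the objects it references (act, train, update_target, replay_buffer, exploration) are created elsewhere. A sketch of the setup that typically precedes such a loop, mirroring Examples #6 and #7; the hyperparameters and the 'mlp' network choice are assumptions:

import gym
import tensorflow as tf
from baselines import deepq
from baselines.common.schedules import LinearSchedule
from baselines.deepq.models import build_q_func
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.deepq.utils import ObservationInput

env = gym.make("CartPole-v0")
# Build the act/train/update_target functions used in the fragment above.
act, train, update_target, debug = deepq.build_train(
    make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
    q_func=build_q_func('mlp', hiddens=[64]),
    num_actions=env.action_space.n,
    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
)
replay_buffer = ReplayBuffer(50000)
exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)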
Example #5
    IsPlot = True
else:
    IsPlot = False

if sample_time % train_freq == 0:
    states = np.vstack(states)
    actions_idx = np.vstack(actions_idx)
    actions = np.array(actions)

    rewards_tmp = rewards.copy()
    last_value = expected_sarsa(model, last_state, K, C, action_low, action_high, False, random_choose, num=100)
    rewards_tmp.append(last_value)
    Q_target = discount_with_dones(rewards_tmp, dones + [last_done], gamma)
    Q_target = np.float32(np.vstack(Q_target))[:-1]

    R_buffer_sample = replay_buffer.sample(np.min([minibatch, timestep]))
    next_states_sampled = np.squeeze(R_buffer_sample[3], axis=1)
    dones_sampled = R_buffer_sample[4]
    reward_sampled = R_buffer_sample[2]

    last_v = [expected_sarsa(model, np.reshape(state_tmp, (1, -1)), K, C, action_low, action_high, True, random_choose, num=100) for state_tmp in next_states_sampled]
    last_v = np.vstack(last_v)
    Q_target_hist = reward_sampled + last_v * (1 - dones_sampled) * gamma

    states_sampled1 = np.squeeze(R_buffer_sample[0], axis=1)
    states_sampled2 = states
    states_sampled = np.concatenate((states_sampled1, states_sampled2), axis=0)
    actions_sampled1 = R_buffer_sample[1]
    actions_sampled2 = actions
    actions_sampled = np.concatenate((actions_sampled1, actions_sampled2), axis=0)
    target = np.reshape(np.concatenate((Q_target_hist, Q_target), axis=0), (-1))
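
For reference, the discount_with_dones helper used above is typically defined as in baselines' A2C utilities; this snippet's own version may differ, so treat this as a sketch of the intent: compute discounted returns that stop bootstrapping at episode boundaries.

def discount_with_dones(rewards, dones, gamma):
    """Discounted returns, resetting the running return wherever done is True."""
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)  # done == 1 cuts the bootstrap
        discounted.append(r)
    return discounted[::-1]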
Example #6
def train_policy(arglist):
    with U.single_threaded_session():
        # Create the environment
        if arglist.use_dense_rewards:
            print("Will use env MineRLNavigateDense-v0")
            env = gym.make("MineRLNavigateDense-v0")
            env_name = "MineRLNavigateDense-v0"
        else:
            print("Will use env MineRLNavigate-v0")
            env = gym.make('MineRLNavigate-v0')
            env_name = "MineRLNavigate-v0"

        if arglist.force_forward:
            env = MineCraftWrapperSimplified(env)
        else:
            env = MineCraftWrapper(env)

        if not arglist.use_demonstrations:
            # Use stack of last 4 frames as obs
            env = FrameStack(env, 4)

        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space,
                                                      name=name),
            q_func=build_q_func('conv_only', dueling=True),
            num_actions=env.action_space.n,
            gamma=0.9,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )

        # Create the replay buffer(s) (TODO: Use prioritized replay buffer)
        if arglist.use_demonstrations:
            replay_buffer = ReplayBuffer(int(arglist.replay_buffer_len / 2))
            demo_buffer = load_demo_buffer(env_name,
                                           int(arglist.replay_buffer_len / 2))
        else:
            replay_buffer = ReplayBuffer(arglist.replay_buffer_len)

        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(
            schedule_timesteps=arglist.num_exploration_steps *
            arglist.num_episodes * arglist.max_episode_steps,
            initial_p=1.0,
            final_p=arglist.final_epsilon)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()

        episode_rewards = [0.0]
        n_episodes = 0
        n_steps = 0
        obs = env.reset()
        log_path = "./learning_curves/minerl_" + str(date.today()) + "_" + str(
            time.time()) + ".dat"
        log_file = open(log_path, "a")
        for episode in range(arglist.num_episodes):
            print("Episode: ", str(episode))
            done = False
            episode_steps = 0
            while not done:

                # Take action and update exploration to the newest value
                action = act(obs[None],
                             update_eps=exploration.value(n_steps))[0]
                new_obs, rew, done, _ = env.step(action)
                n_steps += 1
                episode_steps += 1

                # Break episode
                if episode_steps > arglist.max_episode_steps:
                    done = True

                # Store transition in the replay buffer.
                replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                # Store rewards
                episode_rewards[-1] += rew
                if done:
                    obs = env.reset()
                    episode_rewards.append(0)
                    n_episodes += 1

                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if (n_steps > arglist.learning_starts_at_steps) and (n_steps %
                                                                     4 == 0):
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        32)
                    train(obses_t, actions, rewards, obses_tp1, dones,
                          np.ones_like(rewards))

                if arglist.use_demonstrations:
                    if (n_steps < arglist.learning_starts_at_steps) and (
                            n_steps % 4 == 0):
                        obses_t, actions, rewards, obses_tp1, dones = demo_buffer.sample(
                            32)
                        train(obses_t, actions, rewards, obses_tp1, dones,
                              np.ones_like(rewards))
                    if (n_steps > arglist.learning_starts_at_steps) and (
                            n_steps % 4 == 0):
                        obses_t, actions, rewards, obses_tp1, dones = demo_buffer.sample(
                            32)
                        train(obses_t, actions, rewards, obses_tp1, dones,
                              np.ones_like(rewards))

                # Update target network periodically.
                if n_steps % arglist.target_net_update_freq == 0:
                    update_target()

                # Log data for analysis
                if done and len(episode_rewards) % 10 == 0:
                    logger.record_tabular("steps", n_steps)
                    logger.record_tabular("episodes", len(episode_rewards))
                    logger.record_tabular(
                        "mean episode reward",
                        round(np.mean(episode_rewards[-101:-1]), 1))
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * exploration.value(n_steps)))
                    logger.dump_tabular()

                #TODO: Save checkpoints
                if n_steps % arglist.checkpoint_rate == 0:
                    checkpoint_path = "./checkpoints/minerl_" + str(
                        episode) + "_" + str(date.today()) + "_" + str(
                            time.time()) + ".pkl"
                    save_variables(checkpoint_path)
                    print("%s,%s,%s,%s" %
                          (n_steps, episode,
                           round(np.mean(episode_rewards[-101:-1]),
                                 1), int(100 * exploration.value(n_steps))),
                          file=log_file)
        log_file.close()
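
For the TODO above ("Use prioritized replay buffer"), a sketch of the swap following the pattern of Examples #2 and #8. The alpha/beta values are assumptions, and arglist, train, and n_steps refer to the surrounding code:

import numpy as np
from baselines.common.schedules import LinearSchedule
from baselines.deepq.replay_buffer import PrioritizedReplayBuffer

# Replace the plain buffer with a prioritized one plus a beta annealing schedule.
total_steps = arglist.num_episodes * arglist.max_episode_steps
replay_buffer = PrioritizedReplayBuffer(arglist.replay_buffer_len, alpha=0.6)
beta_schedule = LinearSchedule(total_steps, initial_p=0.4, final_p=1.0)

# Inside the training loop, sample with importance weights ...
(obses_t, actions, rewards, obses_tp1, dones, weights,
 batch_idxes) = replay_buffer.sample(32, beta=beta_schedule.value(n_steps))
td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

# ... and feed the new priorities back into the buffer.
new_priorities = np.abs(td_errors) + 1e-6
replay_buffer.update_priorities(batch_idxes, new_priorities)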
Example #7
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          callback=None):

    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer

    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False
        if tf.train.latest_checkpoint(td) is not None:
            load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}

            update_eps = exploration.value(t)

            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)

    return act
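
A hypothetical invocation of this trimmed learn() function; the CartPole environment and the 'mlp' q_func are assumptions made for illustration, in the style of the other examples:

import gym
from baselines.deepq.models import build_q_func

env = gym.make("CartPole-v0")
act = learn(
    env,
    q_func=build_q_func('mlp', hiddens=[64]),
    lr=1e-3,
    max_timesteps=100000,
    print_freq=10,
)
act.save("cartpole_model.pkl")  # the ActWrapper can persist the learned policy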
Example #8
class DeepQ(object):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> bool
        function called at every step with the state of the algorithm.
        If the callback returns True, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    def __init__(self,
                 env,
                 q_func,
                 lr=5e-4,
                 max_timesteps=100000,
                 buffer_size=50000,
                 exploration_fraction=0.1,
                 exploration_final_eps=0.02,
                 train_freq=1,
                 batch_size=32,
                 print_freq=100,
                 checkpoint_freq=10000,
                 learning_starts=1000,
                 gamma=1.0,
                 target_network_update_freq=500,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6,
                 param_noise=False,
                 callback=None,
                 max_episodes=100):

        self.env = env
        self.q_func = q_func
        self.lr = lr
        self.max_timesteps = max_timesteps
        self.buffer_size = buffer_size
        self.exploration_fraction = exploration_fraction
        self.exploration_final_eps = exploration_final_eps
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.print_freq = print_freq
        self.checkpoint_freq = checkpoint_freq
        self.learning_starts = learning_starts
        self.gamma = gamma
        self.target_network_update_freq = target_network_update_freq
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta0 = prioritized_replay_beta0
        self.prioritized_replay_beta_iters = prioritized_replay_beta_iters
        self.prioritized_replay_eps = prioritized_replay_eps
        self.param_noise = param_noise
        self.callback = callback
        self.max_episodes = max_episodes
        # Create all the functions necessary to train the model

        self.sess = tf.Session()
        self.sess.__enter__()

        # capture the shape outside the closure so that the env object is not serialized
        # by cloudpickle when serializing make_obs_ph
        self.observation_space_shape = env.observation_space.shape

    def make_obs_ph(self, name):
        return U.BatchInput(self.observation_space_shape, name=name)

    def make_build_train(self):
        # Build act and train networks
        self.act, self.train, self.update_target, self.debug = deepq.build_train(
            make_obs_ph=self.make_obs_ph,
            q_func=self.q_func,
            num_actions=self.env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.lr),
            gamma=self.gamma,
            grad_norm_clipping=10,
            param_noise=self.param_noise)

        self.act_params = {
            'make_obs_ph': self.make_obs_ph,
            'q_func': self.q_func,
            'num_actions': self.env.action_space.n,
        }

        self.act = ActWrapper(self.act, self.act_params)

        return 'make_build_train() complete'

    def initialize(self):
        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                self.prioritized_replay_beta_iters = self.max_timesteps
            self.beta_schedule = LinearSchedule(
                self.prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        # self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * self.max_timesteps),
        #                                   initial_p=1.0,
        #                                   final_p=self.exploration_final_eps)

        self.exploration = ConstantSchedule(self.exploration_final_eps)
        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

        return 'initialize() complete'

    def transfer_pretrain(self,
                          transferred_instances,
                          epochs,
                          tr_batch_size,
                          keep_in_replay_buffer=True):
        """
        This is a custom function from the University of Toronto group that first
        pretrains the deepq train network with transferred instances. These instances
        must be zip([s], [a], [r], [s']) tuples mapped onto the same state and action
        spaces as the target task environment.

        No output - just updates parameters of train and target networks.
        """
        # TODO - function that trains self.act and self.train using mapped instances
        done = False
        # pack all instances into replay buffer
        for obs, action, rew, new_obs in transferred_instances:
            self.replay_buffer.add(obs, action, rew, new_obs, float(done))

        for epoch in range(epochs):
            obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                tr_batch_size)
            weights, batch_idxes = np.ones_like(rewards), None
            td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones,
                                   weights)

        self.update_target()

        if keep_in_replay_buffer is not True:
            self.replay_buffer = ReplayBuffer(self.buffer_size)

        return 'transfer_pretrain() complete'

    def task_train(self):
        self.episode_rewards = [0.0]
        self.episode_steps = [0.0]
        self.saved_mean_reward = None
        obs = self.env.reset()
        reset = True
        with tempfile.TemporaryDirectory() as td:
            model_saved = False
            model_file = os.path.join(td, "model")
            for t in range(self.max_timesteps):
                if self.callback is not None:
                    if self.callback(locals(), globals()):
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(t)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - self.exploration.value(t) +
                        self.exploration.value(t) /
                        float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                action = self.act(np.array(obs)[None],
                                  update_eps=update_eps,
                                  **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, _ = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                self.episode_rewards[-1] += rew
                self.episode_steps[-1] += 1
                if done:
                    obs = self.env.reset()
                    self.episode_rewards.append(0.0)
                    self.episode_steps.append(0.0)
                    reset = True

                if t > self.learning_starts and t % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size, beta=self.beta_schedule.value(t))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    td_errors = self.train(obses_t, actions, rewards,
                                           obses_tp1, dones, weights)
                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if t > self.learning_starts and t % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target()

                mean_100ep_reward = round(
                    np.mean(self.episode_rewards[-101:-1]), 1)
                num_episodes = len(self.episode_rewards)
                if done and self.print_freq is not None and len(
                        self.episode_rewards) % self.print_freq == 0:
                    logger.record_tabular("steps", t)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular("% time spent exploring",
                                          int(100 * self.exploration.value(t)))
                    logger.dump_tabular()

                if (self.checkpoint_freq is not None
                        and t > self.learning_starts and num_episodes > 100
                        and t % self.checkpoint_freq == 0):
                    if self.saved_mean_reward is None or mean_100ep_reward > self.saved_mean_reward:
                        if self.print_freq is not None:
                            logger.log(
                                "Saving model due to mean reward increase: {} -> {}"
                                .format(self.saved_mean_reward,
                                        mean_100ep_reward))
                        U.save_state(model_file)
                        model_saved = True
                        self.saved_mean_reward = mean_100ep_reward

                if num_episodes >= self.max_episodes:
                    break

            if model_saved:
                if self.print_freq is not None:
                    logger.log("Restored model with mean reward: {}".format(
                        self.saved_mean_reward))
                U.load_state(model_file)
        return self.act, self.episode_rewards, self.episode_steps

    def get_q_values(self, obs):
        '''
        Input:
            obs should be a numpy array with shape (?,state_space)
        Output:
            returns Q values for each possible action with shape (?,action_space)
        '''
        return self.debug['q_values'](obs)
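
A hypothetical end-to-end use of the class above, showing the call order its methods imply: build the graph, create the buffer and schedules, optionally pretrain on transferred instances, then train on the task. The environment and q_func here are assumptions:

import gym
from baselines.deepq.models import build_q_func

env = gym.make("CartPole-v0")
agent = DeepQ(env, q_func=build_q_func('mlp', hiddens=[64]),
              max_timesteps=50000, max_episodes=200)
agent.make_build_train()   # builds the act/train/update_target ops
agent.initialize()         # replay buffer, exploration schedule, tf variable init
# agent.transfer_pretrain(transferred_instances, epochs=10, tr_batch_size=32)
act, episode_rewards, episode_steps = agent.task_train()
print(agent.get_q_values(env.reset()[None]))   # Q-values for a single observation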
Example #9
    def learn(self):
        act = self.act
        train = self.train
        update_target = self.update_target

        env = self.env
        with self.session.as_default():
            replay_buffer = ReplayBuffer(self._replay_buffer_size)
            exploration = LinearSchedule(
                schedule_timesteps=self._exploration_schedule_steps,
                initial_p=self._exploration_initial_prob,
                final_p=self._exploration_final_prob)

            tf_util.initialize()
            update_target()

            episode_rewards = [0.0]
            episode_errors = []
            episode_rw_errors = []
            episode_error_diffs = []
            observation = env.reset()
            cnt = itertools.count()
            for t in itertools.count():
                # print("iter: ", t)
                # Take action and update exploration to the newest value
                action = act(observation[None],
                             update_eps=exploration.value(t))[0]
                new_observation, reward, done, _ = env.step(action)
                # Store transition in the replay buffer.
                replay_buffer.add(observation, action, reward, new_observation,
                                  float(done))
                observation = new_observation

                episode_rewards[-1] += reward

                if done:
                    episode_errors.append(env.error)
                    episode_rewards.append(0)
                    if self._random_walk_sampling_args is not None:
                        sampling_args = self._random_walk_sampling_args
                        sampling_args.update({"graph": env.graph})
                        rw_error = random_walk_error(sampling_args)
                        episode_rw_errors.append(rw_error)
                        episode_error_diffs.append(rw_error - env.error)

                    if len(episode_rewards) % 10 == 0:
                        nmse = env.get_current_nmse()
                        logger.record_tabular("steps", t)
                        logger.record_tabular("episodes", len(episode_rewards))
                        logger.record_tabular(
                            "mean episode reward",
                            round(np.mean(episode_rewards[-101:-1]), 3))
                        logger.record_tabular(
                            "mean episode error",
                            round(np.mean(episode_errors[-101:-1]), 3))
                        logger.record_tabular("nmse", nmse)
                        logger.record_tabular(
                            "sampling set", [int(v) for v in env.sampling_set])
                        logger.record_tabular("% time spent exploring",
                                              int(100 * exploration.value(t)))
                        if self._random_walk_sampling_args is not None:
                            logger.record_tabular(
                                "mean random walk error",
                                round(np.mean(episode_rw_errors[-101:-1]), 3))
                            logger.record_tabular(
                                "mean error diff",
                                round(np.mean(episode_error_diffs[-101:-1]),
                                      3))
                        logger.dump_tabular()

                    observation = env.reset()

                # Minimize the Bellman equation error on replay buffer sample batch
                if t > 1000:
                    (observations_t, actions, rewards, observations_tp1,
                     dones) = replay_buffer.sample(32)
                    train(observations_t, actions, rewards, observations_tp1,
                          dones, np.ones_like(rewards))
                if t % 1000 == 0:
                    # Update target network periodically.
                    update_target()
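
Several of these examples build their epsilon decay from `LinearSchedule(schedule_timesteps, initial_p, final_p)`. A minimal sketch of a schedule with that interface, assuming the usual linear anneal followed by a constant tail (the baselines implementation may differ in details):

class SimpleLinearSchedule:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps, then hold final_p."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule that has elapsed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)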
Ejemplo n.º 10
0
def learn(
        env,
        max_timesteps=50000000,
        # Human level control hyperparameters
        batch_size=32,
        buffer_size=1000000,
        agent_history_length=4,
        target_network_update_freq=10000,
        discount_factor=0.99,
        # "action_repeat=4" handled by gym environment(equivalent to frame skip)
        train_freq=4,  # agent "update frequency" in human level control paper
        initial_exploration_rate=1,
        final_exploration_rate=0.1,
        final_exploration_frame=1000000,
        replay_start_size=50000,
        print_freq=10,
        checkpoint_freq=100,
        episode_render_freq=None,
        log_dir='./tensorboard',
        start_from_checkpoint=False):

    writer = tf.summary.FileWriter(log_dir + '/' + env.spec.id)

    # Linear decay as used in the DeepMind paper
    epsilon = lambda t: max(
        initial_exploration_rate -
        (t / final_exploration_frame), final_exploration_rate)

    preprocess = _preprocess if len(
        env.observation_space.shape) == 3 else lambda x: x

    replay_buffer = ReplayBuffer(buffer_size)
    num_actions = env.action_space.n

    # Here, we'll use a simple feed forward nn for representing
    # Q(s) -> [r_1, r_2, ..., r_n] where r_k is the reward for taking action
    # `k` in state `s`
    if start_from_checkpoint:
        model = load_model('tmp_model',
                           custom_objects={'huber_loss': huber_loss})
    else:
        model = q_nn(env.observation_space, num_actions, agent_history_length)
    target_model = clone_model(model)

    # Keep some state about the current episode
    num_episodes = 0
    episode_total_reward = 0
    episode_timesteps = 0
    episode_rewards = [0.0]

    last_checkpoint_mean_reward = -inf
    mean_100ep_reward = -inf

    # Start off with a fresh environment
    ob = preprocess(env.reset())
    obs = [ob for i in range(agent_history_length)]

    # Play breakout for max_timesteps
    for t in range(max_timesteps):
        # With probability epsilon, take a random action
        if (random.uniform(0, 1) < epsilon(t)):
            action = env.action_space.sample()
        else:
            observations = np.array([obs])
            actions = np.reshape(np.ones(num_actions), [1, -1])
            q_values = model.predict_on_batch([observations, actions])
            action = np.argmax(q_values, axis=1)[0]

        # Collect observations and store them for replay
        new_ob, reward, is_done, info = env.step(action)
        # Treat any life loss as the end of an episode (Breakout starts with 5 lives).
        is_done = info['ale.lives'] != 5
        new_obs = list(obs)
        new_obs.pop(0)
        new_obs.append(preprocess(new_ob))

        replay_buffer.add(obs, action, reward, new_obs, is_done)
        obs = new_obs

        # Update logging info
        episode_total_reward += reward
        episode_timesteps += 1

        if t > replay_start_size and t % train_freq == 0:
            fit_batch(model, target_model, num_actions, discount_factor,
                      replay_buffer.sample(batch_size), writer,
                      t // train_freq)

        if t > replay_start_size and t % target_network_update_freq == 0:
            # Must checkpoint model and clear sess to avoid OOM https://github.com/keras-team/keras/issues/5345
            model.save('tmp_model')
            K.clear_session()
            target_model = load_model(
                'tmp_model', custom_objects={'huber_loss': huber_loss})
            model = load_model('tmp_model',
                               custom_objects={'huber_loss': huber_loss})
            print('Setting model to target model')

        if is_done:
            ob = preprocess(env.reset())
            obs = np.array([ob for i in range(agent_history_length)])
            episode_timesteps = 0
            num_episodes += 1
            episode_rewards.append(episode_total_reward)
            episode_total_reward = 0
            if len(episode_rewards) > 100:
                episode_rewards.pop(0)
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)

        if is_done and num_episodes % print_freq == 0:
            print("timesteps", t)
            print("episodes run", num_episodes)
            print("last episode reward", episode_rewards[-1])
            print("mean_100ep_reward", mean_100ep_reward)
            print("% time spent exploring", int(100 * epsilon(t)))

        if t % checkpoint_freq == 0 and mean_100ep_reward > last_checkpoint_mean_reward:
            print("Saving model due to mean reward increase: ",
                  last_checkpoint_mean_reward, " -> ", mean_100ep_reward)
            model.save('models/' + env.spec.id + '_deepq.h5py')
            last_checkpoint_mean_reward = mean_100ep_reward

        if episode_render_freq is not None and num_episodes % episode_render_freq == 0:
            env.render()
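
`fit_batch`, `q_nn`, and `_preprocess` are not shown in this excerpt. Purely as an illustration of the Bellman update this loop relies on, here is a hedged sketch of what a `fit_batch`-style step could look like, assuming (as the action-mask trick at the `predict_on_batch` call above suggests) that the Keras model takes `[observations, action_mask]` inputs and returns Q-values masked by that one-hot input; the TensorBoard `writer` and step arguments are omitted:

import numpy as np

def fit_batch_sketch(model, target_model, num_actions, discount_factor, batch):
    # Hypothetical stand-in, not the original fit_batch.
    observations, actions, rewards, next_observations, dones = batch
    # Q-values of the next states under the target network (all actions unmasked).
    next_q = target_model.predict_on_batch(
        [next_observations, np.ones((len(rewards), num_actions))])
    # One-step Bellman targets; terminal transitions do not bootstrap.
    targets = rewards + discount_factor * (1.0 - dones) * np.max(next_q, axis=1)
    # Only the taken action contributes to the loss, via the one-hot mask.
    action_mask = np.eye(num_actions)[np.asarray(actions, dtype=int)]
    model.train_on_batch([observations, action_mask],
                         action_mask * targets[:, None])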
Ejemplo n.º 11
0
    def learn(self):

        with U.make_session(8):
            # Create the environment
            env = gym.make(self._args.env)
            # Create all the functions necessary to train the model
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: ObservationInput(
                    env.observation_space, name=name),
                q_func=self.model,
                num_actions=env.action_space.n,
                optimizer=tf.train.AdamOptimizer(
                    learning_rate=self._args.learning_rate),
            )
            # Create the replay buffer
            replay_buffer = ReplayBuffer(self._args.replay_buffer_size)
            # Create the schedule for exploration starting from 1 till min_exploration_rate.
            exploration = LinearSchedule(
                schedule_timesteps=self._args.exploration_duration,
                initial_p=1.0,
                final_p=self._args.min_exploration_rate)

            # Initialize the parameters and copy them to the target network.
            U.initialize()
            update_target()

            episode_rewards = [0.0]
            obs = env.reset()
            for t in itertools.count():
                # Take action and update exploration to the newest value
                action = act(obs[None], update_eps=exploration.value(t))[0]
                new_obs, rew, done, _ = env.step(action)
                # Store transition in the replay buffer.
                replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                episode_rewards[-1] += rew
                if done:
                    obs = env.reset()
                    episode_rewards.append(0)

                mean_episode_reward = np.mean(episode_rewards[-101:-1])
                # Show learned agent:
                if mean_episode_reward >= self._render_reward_threshold:
                    env.render()

                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        32)
                    train(obses_t, actions, rewards, obses_tp1, dones,
                          np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()

                if done and len(episode_rewards) % 10 == 0:
                    self._reward_buffer_mutex.acquire()
                    self._reward_buffer.append(mean_episode_reward)

                    logger.record_tabular("steps", t)
                    logger.record_tabular("episodes", len(episode_rewards))
                    logger.record_tabular("mean episode reward",
                                          round(mean_episode_reward, 1))
                    logger.record_tabular("% time spent exploring",
                                          int(100 * exploration.value(t)))
                    logger.dump_tabular()

                    self._reward_buffer_changed = True
                    self._reward_buffer_mutex.release()
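
The `ReplayBuffer(...)` used throughout these examples only needs an `add(obs, action, reward, new_obs, done)` and a `sample(batch_size)` method. A minimal sketch of a FIFO buffer with that interface (the baselines version additionally preallocates storage and handles several edge cases):

import random
from collections import deque

import numpy as np

class SimpleReplayBuffer:
    """Minimal FIFO replay buffer with the add/sample interface used above."""

    def __init__(self, size):
        self._storage = deque(maxlen=size)

    def __len__(self):
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        self._storage.append((obs_t, action, reward, obs_tp1, done))

    def sample(self, batch_size):
        # Uniformly sample transitions and stack each field into an array.
        idxes = random.sample(range(len(self._storage)), batch_size)
        batch = [self._storage[i] for i in idxes]
        obses_t, actions, rewards, obses_tp1, dones = map(np.array, zip(*batch))
        return obses_t, actions, rewards, obses_tp1, dones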
Ejemplo n.º 12
0
def main():
    print('main')
    stats_file = pathlib.Path('stats.csv')
    if stats_file.exists():
        stats_file.unlink()

    broker = dqn.env.Broker('http://localhost:5000')
    env = dqn.env.HaliteEnv(broker)

    with U.make_session(num_cpu=4):
        observation_shape = env.observation_space.shape

        def make_obs_ph(name):
            import dqn.tf_util as U
            return U.BatchInput(observation_shape, name=name)

        # Create all the functions necessary to train the model
        act, train, update_target, debug = dqn.graph.build_train(
            make_obs_ph=make_obs_ph,
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )

        act = dqn.play.ActWrapper(
            act, {
                'make_obs_ph': make_obs_ph,
                'q_func': model,
                'num_actions': env.action_space.n,
            })

        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.03 (97% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=30000,
                                     initial_p=1.0,
                                     final_p=0.03)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()

        learning_starts = 1000
        target_network_update_freq = 500
        checkpoint_freq = 20

        episode_rewards = [0.0]
        wins = [False]
        saved_mean_reward = None
        obs = env.reset()
        for t in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, info = env.step(action)

            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)
                wins.append(info['win'])

            win_rate = round(np.mean(wins[-100:]), 4)
            is_solved = t > 100 and win_rate >= 0.99
            if is_solved:
                print('solved')
                break
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > learning_starts:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        32)
                    actions = np.argmax(actions, axis=1)
                    train(obses_t, actions, rewards, obses_tp1, dones,
                          np.ones_like(rewards))
                # Update target network periodically.
                if t > learning_starts and t % target_network_update_freq == 0:
                    update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 4)
            num_episodes = len(episode_rewards)
            exploration_rate = int(100 * exploration.value(t))

            if done:
                info = {
                    'date': str(dt.datetime.now()),
                    'episode': len(episode_rewards),
                    **info,
                    'win_rate': win_rate,
                    'mean_100ep_reward': mean_100ep_reward,
                    'exploration_rate': exploration_rate,
                }
                print('episode', info)
                if not stats_file.exists():
                    with stats_file.open('w') as fp:
                        fp.write(','.join(info.keys()) + '\n')
                with stats_file.open('a') as fp:
                    fp.write(','.join(map(str, info.values())) + '\n')

            if done and num_episodes % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", mean_100ep_reward)
                logger.record_tabular("mean win rate", win_rate)
                logger.record_tabular("% time spent exploring",
                                      exploration_rate)
                logger.dump_tabular()

            if done and (t > learning_starts and num_episodes > 100
                         and num_episodes % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    logger.log(
                        "Saving model due to mean reward increase: {} -> {}".
                        format(saved_mean_reward, mean_100ep_reward))
                    act.save('dqn_model.pkl')
                    saved_mean_reward = mean_100ep_reward

    act.save('dqn_model.pkl')
    env.close()
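
Writing the stats file by joining values with commas (as above) breaks as soon as a value itself contains a comma. A small sketch of the same per-episode logging done with the standard library's csv module; the stats.csv path is reused from the example:

import csv
import pathlib

def append_stats(info, stats_file=pathlib.Path('stats.csv')):
    """Append one episode's stats as a CSV row, writing the header on first use."""
    write_header = not stats_file.exists()
    with stats_file.open('a', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=list(info.keys()))
        if write_header:
            writer.writeheader()
        writer.writerow(info)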
Ejemplo n.º 13
0
def startTraining():
    # Create the environment
    print('START ENV', RC.GB_CLIENT_ID(), RC.gbRobotHandle())
    env = RobotOperationEnvironment(RC.GB_CLIENT_ID(), RC.GB_CSERVER_ROBOT_ID,
                                    RC.gbRobotHandle())
    #print('ACTION_SPACE', env.action_space.shape)
    # Create all the functions necessary to train the model
    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: BatchInput(env.observation_space.shape,
                                            name=name),
        q_func=model,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    )
    # Create the replay buffer
    replay_buffer = ReplayBuffer(50000)
    # Create the schedule for exploration starting from 1 (every action is random) down to
    # 0.02 (98% of actions are selected according to values predicted by the model).
    exploration = LinearSchedule(schedule_timesteps=10000,
                                 initial_p=1.0,
                                 final_p=0.02)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    obs = env.reset()
    print("Manipulator DEEPQ Training Experiment Start.")
    for t in itertools.count():
        print('Episode ', len(episode_rewards), 'Step ', t, '--------------')
        print('Start waiting for the next action',
              env._robot.getOperationState())
        while (env._robot.getOperationState() != RC.CROBOT_STATE_READY):
            time.sleep(0.01)

        # Take action and update exploration to the newest value
        action = act(obs[None], update_eps=exploration.value(t))[0]
        print('Generated action:', action)
        new_obs, rew, done, _ = env.step(action)
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0)

        is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
        if is_solved:
            # Show off the result
            #env.render()
            pass
        else:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    32)
                print('Generated actions:', actions)
                train(obses_t, actions, rewards, obses_tp1, dones,
                      np.ones_like(rewards))
            # Update target network periodically.
            if t % 1000 == 0:
                update_target()

        if done and len(episode_rewards) % 10 == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", len(episode_rewards))
            logger.record_tabular("mean episode reward",
                                  round(np.mean(episode_rewards[-101:-1]), 1))
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.dump_tabular()
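
The busy-wait on env._robot.getOperationState() above spins forever if the controller never reports ready. A hedged sketch of the same polling loop with an explicit timeout (RC.CROBOT_STATE_READY and the robot handle are taken from the example; the timeout value is an arbitrary assumption):

import time

def wait_until_ready(robot, ready_state, timeout=30.0, poll_interval=0.01):
    """Poll the robot's operation state until it is ready or the timeout expires."""
    deadline = time.time() + timeout
    while robot.getOperationState() != ready_state:
        if time.time() > deadline:
            raise TimeoutError('robot did not reach the ready state in time')
        time.sleep(poll_interval)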
Ejemplo n.º 14
0
class DQNAgent:

    def __init__(self, identifier, actions, observation_shape, num_steps, x=0.0, y=0.0):
        self.id = identifier
        self.actions = actions
        self.x = x
        self.y = y
        self.yellow_steps = 0
        self.postponed_action = None
        self.obs = None
        self.current_action = None
        self.weights = np.ones(32)
        self.td_errors = np.ones(32)

        self.pre_train = 2500
        self.prioritized = False
        self.prioritized_eps = 1e-4
        self.batch_size = 32
        self.buffer_size = 30000
        self.learning_freq = 500
        self.target_update = 5000

        # Create all the functions necessary to train the model
        self.act, self.train, self.update_target, self.debug = deepq.build_train(
            make_obs_ph=lambda name: TrafficTfInput(observation_shape, name=name),
            q_func=dueling_model,
            num_actions=len(actions),
            optimizer=tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-4),
            gamma=0.99,
            double_q=True,
            scope="deepq" + identifier
        )

        # Create the replay buffer
        if self.prioritized:
            self.replay_buffer = PrioritizedReplayBuffer(size=self.buffer_size, alpha=0.6)
            self.beta_schedule = LinearSchedule(num_steps // 4, initial_p=0.4, final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        self.exploration = LinearSchedule(schedule_timesteps=int(num_steps * 0.1), initial_p=1.0, final_p=0.01)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

    def take_action(self, t):
        if self.postponed_action is None:
            # Take action and update exploration to the newest value
            action = self.act(np.array(self.obs)[None], update_eps=self.exploration.value(t))[0]
        else:
            # Take action postponed by yellow light transition
            action = self.postponed_action
            self.postponed_action = None

        return action

    def store(self, rew, new_obs, done):
        # Store transition in the replay buffer.
        self.replay_buffer.add(self.obs, self.current_action, rew, new_obs, float(done))

    def learn(self, t):
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if t > self.pre_train:
            if self.prioritized:
                experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, self.weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                self.weights = np.ones_like(rewards)

            # Minimize the error in Bellman's equation and compute TD-error
            self.td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones, self.weights)

            # Update the priorities in the replay buffer
            if self.prioritized:
                new_priorities = np.abs(self.td_errors) + self.prioritized_eps
                self.replay_buffer.update_priorities(batch_idxes, new_priorities)

        self.update_target_network(t)

    def update_target_network(self, t):
        # Update target network periodically.
        if t % self.target_update == 0:
            self.update_target()

    def add_fingerprint_to_obs(self, obs, weights, identifier, td_errors):
        idx = 0

        for w in weights:
            obs[2, identifier, idx] = w
            idx += 1

        for td in td_errors:
            obs[2, identifier, idx] = td
            idx += 1

        return obs

    def add_fingerprint(self, weights, identifier, td_errors):
        self.obs = self.add_fingerprint_to_obs(self.obs, weights, identifier, td_errors)
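
With self.prioritized enabled above, transitions are replayed in proportion to their TD error and reweighted by importance sampling. A minimal numpy sketch of proportional prioritization with the usual p_i = |delta_i| + eps, P(i) proportional to p_i^alpha, w_i = (N * P(i))^-beta convention (the PrioritizedReplayBuffer itself uses a segment tree rather than this dense form):

import numpy as np

def prioritized_sample_indices(td_errors, batch_size, alpha=0.6, beta=0.4, eps=1e-4):
    """Sample indices proportionally to |TD error|^alpha and return the
    matching importance-sampling weights, normalized so the largest is 1."""
    priorities = (np.abs(td_errors) + eps) ** alpha
    probs = priorities / priorities.sum()
    idxes = np.random.choice(len(td_errors), size=batch_size, p=probs)
    weights = (len(td_errors) * probs[idxes]) ** (-beta)
    weights /= weights.max()
    return idxes, weights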
Ejemplo n.º 15
0
def learn(env,
          q_func,
          policy_fn,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    # Create all the functions necessary to train the model

    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape
    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)
    
    scope = "ampi"
    reuse = None
    grad_norm_clipping = None
    num_actions = env.action_space.n
    optimizer_q = tf.train.AdamOptimizer(learning_rate=lr)
    optimizer_pi = tf.train.AdamOptimizer(learning_rate=lr)
    act = build_act(make_obs_ph, q_func, num_actions=env.action_space.n, scope=scope, reuse=reuse)
    
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
        
        # add
        ob_space = env.observation_space
        ac_space = env.action_space
        pi, act = policy_fn(obs_t_input.get(), ob_space, ac_space, scope="pi_func") # train pi
        pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/pi_func")
        
        pi_tp1, act_tp1 = policy_fn(obs_tp1_input.get(), ob_space, ac_space, scope="target_pi_func") # target pi
        target_pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_pi_func")
 
        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")
        
        # Q_{train}(s, a)
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)
        
        # y_j
        act_best = tf.argmax(pi, axis=1) # argmax \pi(s_{j+1})
        q_tp1_sampled = tf.reduce_sum(q_tp1 * tf.one_hot(act_best, num_actions), 1) # Q_{target}(s_{j+1}, argmax_a \pi(s_{j+1}, a))
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_sampled
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked
        
        # Regression loss
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        
        # z_j = argmax_a Q_{target}(s_{j+1}, a)
        z_j = tf.argmax(q_tp1, axis=1) # greedy action under the target Q network

        # classification loss
        cl_error = tf.nn.sparse_softmax_cross_entropy_with_logits(
                      logits=pi, labels=z_j)
        
        # Q optimization
        if grad_norm_clipping is not None:
            gradients_q = optimizer_q.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients_q):
                if grad is not None:
                    gradients_q[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_q = optimizer_q.apply_gradients(gradients_q)
        else:
            optimize_q = optimizer_q.minimize(weighted_error, var_list=q_func_vars)

        # pi optimization
        if grad_norm_clipping is not None:
            gradients_pi = optimizer_pi.compute_gradients(cl_error, var_list=pi_func_vars)
            for i, (grad, var) in enumerate(gradients_pi):
                if grad is not None:
                    gradients_pi[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_pi = optimizer_pi.apply_gradients(gradients_pi)
        else:
            optimize_pi = optimizer_pi.minimize(cl_error, var_list=pi_func_vars)

        # update_target Q
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # update_target pi
        update_target_pi = []
        for var, var_target in zip(sorted(pi_func_vars, key=lambda v: v.name),
                                   sorted(target_pi_func_vars, key=lambda v: v.name)):
            update_target_pi.append(var_target.assign(var))
        update_target_pi = tf.group(*update_target_pi)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, cl_error],
            updates=[optimize_q, optimize_pi]
        )
        update_target = U.function([], [], updates=[update_target_expr, update_target_pi])

        q_values = U.function([obs_t_input], q_t)

        debug = {'q_values': q_values}

    # Create the replay buffer
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            
            action = env.action_space.sample() # not used, just so we have the datatype
            stochastic = True
            ac1, vpred1 = act(stochastic, np.array(obs)[None])
            action = ac1[0]
            #action, _ = pi.act(stochastic, obs)
            
            #action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()
            

            # Log training progress
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)

    return act
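
The graph above trains Q with a Huber regression loss on the TD error and trains pi with a cross-entropy loss towards the greedy action of the target Q network. A small numpy sketch of the two per-batch targets described by the comments (argument names are illustrative, not the graph's tensors):

import numpy as np

def ampi_targets(rewards, dones, q_tp1_target, pi_probs, gamma=1.0):
    """Compute the regression target y_j and the classification labels z_j
    for one sampled batch, following the comments in the graph above."""
    # y_j = r_j + gamma * (1 - d_j) * Q_target(s_{j+1}, argmax_a pi(a))
    greedy_by_pi = np.argmax(pi_probs, axis=1)
    q_selected = q_tp1_target[np.arange(len(rewards)), greedy_by_pi]
    y = rewards + gamma * (1.0 - dones) * q_selected
    # z_j = argmax_a Q_target(s_{j+1}, a), the label for the cross-entropy loss on pi
    z = np.argmax(q_tp1_target, axis=1)
    return y, z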
Ejemplo n.º 16
0
def train(model_file, game="CartPole-v1"):
    """Train at a game."""
    with tf_util.make_session(8):
        env = gym.make(game)

        def make_placeholder(name):
            """Make a placeholder input."""
            return tf_util.BatchInput(env.observation_space.shape, name=name)

        act_params = {
            'make_obs_ph': make_placeholder,
            'q_func': model,
            'num_actions': env.action_space.n
        }
        act, train, update_target, debug = deepq.build_train(
            **act_params,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4)
        )
        act = ActWrapper(act, act_params)

        replay_buffer = ReplayBuffer(50000)

        exploration = LinearSchedule(
            schedule_timesteps=100000,
            initial_p=1.0,
            final_p=0.02
        )

        tf_util.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)
            if not len(episode_rewards) % 100:
                env.render()

            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = (
                    replay_buffer.sample(32)
                )
                train(
                    obses_t, actions, rewards, obses_tp1, dones,
                    np.ones_like(rewards)
                )
            if not t % 1000:
                update_target()
            if not t % 3000:
                if model_file:
                    tf_util.save_state(model_file)
                yield act

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular(
                    "mean episode reward",
                    round(np.mean(episode_rewards[-101:-1]), 1)
                )
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * exploration.value(t))
                )
                logger.dump_tabular()
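
Unlike the other examples, train() above is a generator: it yields the current act function every 3000 steps so the caller can evaluate or deploy intermediate policies. A hedged usage sketch (evaluate_policy is a hypothetical helper, not part of the example):

def run_training(model_file='cartpole_model', max_snapshots=100):
    """Drive the train() generator above and evaluate each yielded policy."""
    for i, act in enumerate(train(model_file)):
        evaluate_policy(act)  # hypothetical evaluation helper
        if i + 1 >= max_snapshots:
            break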
Ejemplo n.º 17
0
def train_dqn(opts,
              seed=None,
              lr=1e-3,
              total_timesteps=500000,
              buffer_size=50000,
              exploration_fraction=0.1,
              exploration_final_eps=0.02,
              train_freq=1,
              batch_size=32,
              checkpoint_freq=500000,
              learning_starts=1000,
              gamma=1.000,
              target_network_update_freq=3000,
              load_path=None):
    """
    Train a DQN agent on the configured gym environment and periodically checkpoint its weights.
    """
    if os.path.exists(opts.model_dir):
        print('Path already exists. Remove? y for yes')
        input_char = getch.getch()
        if not input_char == 'y':
            print('Exiting')
            return
        shutil.rmtree(opts.model_dir)
    os.makedirs(opts.model_dir)
    os.makedirs(os.path.join(opts.model_dir, 'logs'))
    os.makedirs(os.path.join(opts.model_dir, 'weights'))

    #env = gym.make('MountainCar-v0')
    env = gym.make('LunarLander-v2')
    env._max_episode_steps = 1200

    sess = get_session()
    set_global_seeds(seed)

    train_writer = tf.summary.FileWriter(os.path.join(opts.model_dir, 'logs'),
                                         sess.graph)

    q_func = build_q_func('mlp')

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    replay_buffer = ReplayBuffer(buffer_size)

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    obs = env.reset()

    for t in range(total_timesteps):
        # Take action and update exploration to the newest value
        env.render()
        update_eps = exploration.value(t)
        action = act(np.array(obs)[None], update_eps=update_eps)[0]
        new_obs, rew, done, _ = env.step(action)
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            print("Exploration value: {}".format(exploration.value(t)))
            print("Last 25 episode rewards: {}".format(episode_rewards[-25:]))

            reward_summary = tf.Summary(value=[
                tf.Summary.Value(tag='reward',
                                 simple_value=episode_rewards[-1])
            ])
            train_writer.add_summary(reward_summary, t)

            obs = env.reset()
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            weights, batch_idxes = np.ones_like(rewards), None
            td_errors, summary = train(obses_t, actions, rewards, obses_tp1,
                                       dones, weights)
            train_writer.add_summary(summary, t)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            update_target()

        if t > learning_starts and t % checkpoint_freq == 0:
            save_variables(
                os.path.join(opts.model_dir, 'weights', '{}.model'.format(t)))
    save_variables(os.path.join(opts.model_dir, 'weights', 'last.model'))
Ejemplo n.º 18
0
def sobolev_learn_episode(
        env,
        q_func,
        lr=5e-4,
        max_episodes=1000,
        buffer_size=50000,
        epsilon=.1,
        #exploration_fraction=0.1,
        #exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=100,
        checkpoint_freq=10000,
        learning_starts=1000,
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        param_noise=False,
        callback=None,
        alpha=1.0,
        grad_norm_clipping=10.0):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_episodes: int
        number of episodes to train for
    buffer_size: int
        size of the replay buffer
    epsilon: float
        constant probability of taking a random action (exploration uses a
        ConstantSchedule instead of an annealed one)
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> bool
        function called at every step with the state of the algorithm.
        If the callback returns True, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return U.BatchInput(observation_space_shape, name=name)

    act, train, update_target, debug = deepq.build_sobolev_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=grad_norm_clipping,
        param_noise=param_noise,
        alpha=alpha)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    '''
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    '''
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
    exploration = ConstantSchedule(epsilon)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    episode_lengths = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        e = 0  # num of current episode
        t = 0  # timestep
        while e < max_episodes:
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            episode_lengths[-1] += 1
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                episode_lengths.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)

            # increment counters
            t += 1  # increment timestep
            if done:
                e += 1  # increment episode

            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return act
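
This variant keeps exploration fixed through ConstantSchedule(epsilon) instead of annealing it. A minimal sketch of a constant schedule with the same value(t) interface used above:

class SimpleConstantSchedule:
    """Schedule whose value does not depend on the timestep."""

    def __init__(self, value):
        self._value = value

    def value(self, t):
        # t is ignored; the schedule always returns the same probability.
        return self._value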
Ejemplo n.º 19
0
            if is_solved:
                # Capture N samples and save them into a csv file
                env.render()
                if len(exp_demo) < N:
                    temp_list = list(obs)
                    # temp_list.append(done)
                    # temp_list.append(action)
                    exp_demo.append(temp_list)
                else:
                    with open('mentor_demonstrations_NN.csv', 'w', newline='') as csvfile:
                        data_writer = csv.writer(csvfile, delimiter=',')
                        for row in exp_demo:
                            data_writer.writerow(row)
                    break

            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones, ment_obs, ment_obs_tp1, ment_act = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()
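
Once mentor_demonstrations_NN.csv has been written above, the stored observations can be read back for later use (for example as mentor demonstrations in a pre-training step). A minimal sketch, assuming the file contains only numeric observation rows as written by the loop above:

import numpy as np

def load_demonstrations(path='mentor_demonstrations_NN.csv'):
    """Load the saved observation rows into a (num_samples, obs_dim) array."""
    return np.loadtxt(path, delimiter=',')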
Ejemplo n.º 20
0
class DQN(BaseAgent):
    def __init__(self,
                 env,
                 name='default',
                 alg_name='dqn',
                 network_type='mini-mlp',
                 total_timesteps=5e7,
                 batch_size=32,
                 lr=1e-3,
                 gamma=0.99,
                 buffer_size=1e6,
                 final_eps=0.05,
                 exploration_fraction=0.1,
                 training_start=1e5,
                 target_update_freq=1e4,
                 optimizer=tf.train.AdamOptimizer,
                 gradient_clipping=None,
                 reward_clipping=False,
                 tau=1.,
                 double_q=False,
                 dueling=False,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.5,
                 prioritized_replay_beta_init=0.4,
                 prioritized_replay_beta_fraction=1.0,
                 prioritized_replay_eps=1e-6,
                 rolling_reward_mean=20,
                 solved_callback=None,
                 render_training=False,
                 **kwargs):
        """
        Implementation of the Deep Q Learning (DQN) algorithm formulated by Mnih et. al.
        Contains some well known improvements over the vanilla DQN.

        Parameters
        ----------
        env: gym.Environment
            (gym) Environment the agent shall learn from and act on

        name: str
            descriptive name of this DQN configuration, e.g. 'atari-breakout'

        network_type: str
            which network architecture from 'networks.py' to use

        total_timesteps: int or float
            number of training timesteps

        batch_size: int
            size of minibatch per backprop

        lr: float
            learning rate

        gamma: float
            discount factor gamma for bellman target

        buffer_size: int or float
            maximum number of transitions in the replay buffer

        final_eps: float
            value to which epsilon is annealed

        exploration_fraction: float
            fraction of training timesteps over which epsilon is annealed

        training_start: int
            timestep at which training of the q network begins

        target_update_freq: int
            frequency of target network updates (in timesteps)

        optimizer: tf.Optimizer
            optimizer class which shall be used such as Adam or RMSprop

        gradient_clipping: int
            if not None, gradients are clipped by this value by norm

        reward_clipping: float
            rewards will be clipped to this value if not None

        tau: float
            interpolation constant for soft update. 1.0 corresponds to
            a full synchronisation of network weights, as in the original DQN paper

        double_q: bool
            enables Double Q Learning for DQN

        dueling: bool
            splits the network architecture into an advantage and a value stream. V(s)
            gets more frequent updates, which should stabilize learning

        prioritized_replay: bool
            use (proportional) prioritized replay

        prioritized_replay_alpha: float
            alpha for weighting priorization

        prioritized_replay_beta_init: float
            initial value of beta for prioritized replay buffer

        prioritized_replay_beta_fraction: float
            fraction of total timesteps to anneal beta to 1.0

        prioritized_replay_eps: float
            epsilon to add to the TD errors when updating priorities.

        rolling_reward_mean: int
            window of which the rolling mean in the statistics is computed

        solved_callback: function
            function which receives the episode rewards as an array and must return a bool.
            If it returns True, training is considered solved and is stopped early.

        render_training: bool
            whether to render the environment while training

        """

        # instance name
        self.name = name

        # environment to act on / learn from
        self.env = env

        # basic DQN parameters
        self.total_timesteps = float(total_timesteps)
        self.buffer_size = int(float(buffer_size))
        self.batch_size = batch_size
        self.final_eps = final_eps
        self.lr = float(lr)
        self.gamma = float(gamma)
        self.exploration_fraction = float(exploration_fraction)
        self.training_start = int(float(training_start))
        self.target_update_freq = int(float(target_update_freq))

        # tf.Optimizer
        self.optimizer = optimizer

        # minor changes as suggested in some papers
        self.gradient_clipping = int(
            gradient_clipping) if gradient_clipping is not None else None
        self.reward_clipping = int(
            reward_clipping) if reward_clipping is not None else None

        # enhancements to DQN published in papers
        self.tau = float(tau)
        self.double_q = double_q
        self.dueling = dueling
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_alpha = float(prioritized_replay_alpha)
        self.prioritized_replay_beta_init = float(prioritized_replay_beta_init)
        self.prioritized_replay_beta_fraction = float(
            prioritized_replay_beta_fraction)
        self.prioritized_replay_eps = float(prioritized_replay_eps)

        # function to determine whether agent is able to act well enough
        self.solved_callback = solved_callback

        # call env.render() each training step
        self.render_training = render_training

        # sliding window for reward calc
        self.rolling_reward_mean = rolling_reward_mean

        # stores latest measure for best policy, e.g. best mean over last N episodes
        self.latest_best = 0.0

        super().__init__(env, alg_name, name, **kwargs)

        # calculate timestep where epsilon reaches its final value
        self.schedule_timesteps = int(self.total_timesteps *
                                      self.exploration_fraction)

        # sanity checks
        assert 0.0 < self.tau <= 1.0

        # env specific parameter
        self.obs_shape = env.observation_space.shape
        self.num_actions = env.action_space.n

        # tf scopes
        self.Q_SCOPE = 'q_network'
        self.TARGET_SCOPE = 'target_network'

        # build Q and target network; using different scopes to distinguish variables for gradient computation
        self.q_t_in, self.q_t = build_network(self.obs_shape,
                                              self.num_actions,
                                              network_type=network_type,
                                              dueling=self.dueling,
                                              scope=self.Q_SCOPE,
                                              summaries=True)
        self.target_tp1_in, self.target_tp1 = build_network(
            self.obs_shape,
            self.num_actions,
            dueling=self.dueling,
            network_type=network_type,
            scope=self.TARGET_SCOPE)

        # double Q learning needs to pass observations t+1 to the q networks for action selection
        # so we reuse already created q network variables but with different input
        if self.double_q:
            self.q_tp1_in, self.q_tp1 = build_network(
                self.obs_shape,
                self.num_actions,
                dueling=self.dueling,
                network_type=network_type,
                scope=self.Q_SCOPE,
                reuse=True)

        # create replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)

        # list of variables of the different networks. required for copying
        # Q to target network and excluding target network variables from backprop
        self.q_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope=self.Q_SCOPE)
        self.target_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope=self.TARGET_SCOPE)

        # placeholders used in loss function
        self._L_r = tf.placeholder(tf.float32, (None, ), name='loss_rewards')
        self._L_a = tf.placeholder(tf.int32, (None, ), name='loss_actions')
        self._L_d = tf.placeholder(tf.float32, (None, ), name='loss_dones')

        # pointer to td error vector
        # placeholder for the TD-error vector; it is replaced by the actual
        # TD-error tensor once _loss() is built
        self._td_errors = tf.placeholder(tf.float32, (None, ),
                                         name='td_errors')

        # configure prioritized replay
        if self.prioritized_replay:
            self._is_weights = tf.placeholder(
                tf.float32, (None, ), name='importance_sampling_weights')

            # schedule for PR beta
            beta_steps = int(self.total_timesteps *
                             self.prioritized_replay_beta_fraction)
            self.pr_beta = LinearSchedule(
                beta_steps,
                initial_p=prioritized_replay_beta_init,
                final_p=1.0)
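
            # Annealing beta towards 1.0 follows the prioritized experience
            # replay paper (Schaul et al.): the importance sampling correction
            # w_i = (N * P(i)) ** (-beta) becomes fully unbiased as beta -> 1,
            # so the bias from non-uniform sampling vanishes towards the end
            # of training. (The weight computation itself is assumed to live
            # in the PrioritizedReplayBuffer implementation, not in this class.)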

        # epsilon schedule
        self.eps = LinearSchedule(self.schedule_timesteps, final_p=final_eps)

        # init optimizer
        self.opt = self.optimizer(self.lr)

        # specify loss function, only include Q network variables for gradient computation
        self.gradients = self.opt.compute_gradients(self._loss(),
                                                    var_list=self.q_net_vars)

        # clip gradients by norm
        if self.gradient_clipping is not None:
            for idx, (grad, var) in enumerate(self.gradients):
                if grad is not None:
                    self.gradients[idx] = (tf.clip_by_norm(
                        grad, self.gradient_clipping), var)

        # create training op
        self.train_op = self.opt.apply_gradients(self.gradients)

        # update_target_fn will be called periodically to copy Q network to target Q network
        # variable lists are sorted by name to ensure that correct values are copied
        self.update_target_ops = []
        for var_q, var_target in zip(
                sorted(self.q_net_vars, key=lambda v: v.name),
                sorted(self.target_net_vars, key=lambda v: v.name)):
            v_update = var_target.assign(self.tau * var_q +
                                         (1 - self.tau) * var_target)
            self.update_target_ops.append(v_update)
        self.update_target_ops = tf.group(*self.update_target_ops)
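
        # This implements the soft (Polyak) update
        #     theta_target <- tau * theta_q + (1 - tau) * theta_target,
        # which with the default tau = 1.0 degenerates to the hard copy used
        # in the original DQN paper.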

        # global tf.Session and Graph init
        self.sess = tf.Session()

        # init tensorboard, variables and debug
        self._finalize_init()

        # sync networks before training
        self.sess.run(self.update_target_ops)

    def _setup_tensorboard(self):
        """
        Adds all variables that might help debugging to Tensorboard.
        At the end, the FileWriter is constructed pointing to the specified directory.

        """

        # additional placeholders for summarised variables, along with their summaries
        self.eps_ph = tf.placeholder(tf.float32, (), name='epsilon')
        self.rew_ph = tf.placeholder(tf.float32, (), name='rolling-reward')

        scalar_summary('epsilon', self.eps_ph)
        scalar_summary('reward', self.rew_ph)

        # display q_values while training
        for a_i in range(self.num_actions):
            scalar_summary('QTa_{}'.format(a_i + 1),
                           tf.reduce_mean(self.target_tp1[:, a_i]),
                           scope='Q-Values')
            scalar_summary('Qa_{}'.format(a_i + 1),
                           tf.reduce_mean(self.q_t[:, a_i]),
                           scope='Q-Values')

        # plot network weights
        with tf.variable_scope('weights'):
            for qv in self.q_net_vars:
                tf.summary.histogram('{}'.format(qv.name), qv)
            for tv in self.target_net_vars:
                tf.summary.histogram('{}'.format(tv.name), tv)

        # gradient histograms
        with tf.variable_scope('gradients'):
            for g in self.gradients:
                tf.summary.histogram('{}-grad'.format(g[1].name), g[0])

    def _loss(self):
        """ Defines loss as layed out in the original Nature paper """

        with tf.variable_scope('loss'):

            # either take the maximum target Q value, or (double Q) evaluate the
            # target network at the action selected by the online Q network
            if self.double_q:
                act_tp1_idxs = tf.stop_gradient(tf.argmax(self.q_tp1, axis=1))
                q_tp1 = tf.reduce_sum(
                    self.target_tp1 *
                    tf.one_hot(act_tp1_idxs, self.num_actions),
                    axis=1)
            else:
                q_tp1 = tf.reduce_max(self.target_tp1, axis=1)

            # Bellman target: y = r + gamma * (1 - done) * q_tp1, where the
            # (1 - done) factor zeroes the bootstrap term for terminal states
            y = self._L_r + (self.gamma * (1.0 - self._L_d) * q_tp1)

            # select q value of taken action
            qj = tf.reduce_sum(self.q_t *
                               tf.one_hot(self._L_a, self.num_actions),
                               axis=1)

            # TD errors
            self._td_errors = qj - y

            # apply Huber loss per sample (no reduction) so that the importance
            # sampling weights below can be applied element-wise
            loss = tf.losses.huber_loss(y, qj,
                                        reduction=tf.losses.Reduction.NONE)

        if self.use_tensorboard:
            scalar_summary('target', tf.reduce_mean(y))
            scalar_summary('huber-loss', tf.reduce_mean(loss))
            tf.summary.histogram('selected_Q', qj)

        #  importance sampling weights
        if self.prioritized_replay:
            updates = tf.reduce_mean(self._is_weights * loss)
        else:
            updates = tf.reduce_mean(loss)

        return updates

    def _build_feed_dict(self,
                         obs_t,
                         ac_t,
                         rew_t,
                         obs_tp1,
                         dones,
                         eps,
                         rolling_rew,
                         weights=None):
        """ Takes minibatch and returns feed dict for a tf.Session based on the algorithms configuration. """

        # first, add data required in all DQN configs
        feed_d = {
            self.q_t_in: obs_t,
            self.target_tp1_in: obs_tp1,
            self._L_r: rew_t,
            self._L_a: ac_t,
            self._L_d: dones
        }

        # pass obs t+1 to q network
        if self.double_q:
            feed_d[self.q_tp1_in] = obs_tp1

        # importance sampling weights
        if self.prioritized_replay:
            feed_d[self._is_weights] = weights

        # variables only necessary for TensorBoard visualisation
        if self.use_tensorboard:
            feed_d[self.eps_ph] = eps
            feed_d[self.rew_ph] = rolling_rew

        return feed_d

    def learn(self):
        """ Learns Q function for a given amount of timesteps """

        # reset env, store first observation
        obs_t = self.env.reset()

        # save all episode rewards
        episode_reward_series = [[0.0]]
        episode_rewards = []

        self.logger.info(
            'Starting Exploration, training will start at step {}.'.format(
                self.training_start))

        for t in tqdm(range(int(self.total_timesteps))):

            # decide on an action: either greedily from the policy or a random one
            epsilon = self.eps.value(t)
            _rand = np.random.choice([True, False], p=[epsilon, 1 - epsilon])
            if _rand:
                action = self.env.action_space.sample()
            else:
                action = np.argmax(self.sess.run(self.q_t,
                                                 {self.q_t_in: [obs_t]}),
                                   axis=1)
                assert len(action) == 1, 'only one action can be taken!'
                action = action[0]

            # act on environment with chosen action
            obs_tp1, reward, done, _ = self.env.step(action)

            # clip the reward to its sign, as in the Nature DQN setup
            if self.reward_clipping:
                reward = 1 if reward > 0 else -1 if reward < 0 else 0

            # store new transition
            self.replay_buffer.add(obs_t, action, reward, obs_tp1, float(done))

            # new observation will be current one in next iteration
            obs_t = obs_tp1

            # append current rewards to episode reward series
            episode_reward_series[-1].append(reward)

            if self.render_training:
                self.env.render()

            if t == self.training_start:
                self.logger.info('Training starts now! (t = {})'.format(t))

            # final calculations and env reset
            if done:
                # calculate total reward
                episode_rewards.append(np.sum(episode_reward_series[-1]))
                episode_reward_series.append([0.0])

                # reset env to initial state
                obs_t = self.env.reset()

            # start training after warmup period
            if t >= self.training_start:

                # calculate rolling reward
                rolling_r = np.mean(episode_rewards[-self.rolling_reward_mean:]
                                    ) if len(episode_rewards) > 0 else 0.0

                # post episode stuff: printing and saving
                if done:
                    result_table = [['t', t],
                                    ['episode',
                                     len(episode_rewards)],
                                    ['mean_reward [{}]'.format(self.rolling_reward_mean), rolling_r],
                                    ['epsilon', epsilon]]
                    print('\n{}'.format(tabulate(result_table)))

                    # if the policy improved, save it as the new best. Achieving a good
                    # reward in a single episode might not be the best metric; continuously
                    # achieving good rewards would be better.
                    if len(episode_rewards) >= 25:
                        mr = np.mean(
                            episode_rewards[-self.rolling_reward_mean:])
                        if mr >= self.latest_best:
                            self.latest_best = mr
                            self.logger.info(
                                'Saving new best policy with mean[{}]_r = {} ...'
                                .format(self.rolling_reward_mean, mr))
                            self._save('best')

                    # save latest policy
                    self._save()

                    # write current values to csv log
                    self.csvlog.write('{}, {}, {}\n'.format(
                        len(episode_rewards), epsilon, episode_rewards[-1]))

                # sample batch of transitions randomly for training and build feed dictionary
                # prioritized replay needs a beta and returns weights.
                if self.prioritized_replay:
                    o_t, a_t, r_t, o_tp1, do, is_ws, batch_idxs = self.replay_buffer.sample(
                        self.batch_size, self.pr_beta.value(t))
                    feed = self._build_feed_dict(o_t,
                                                 a_t,
                                                 r_t,
                                                 o_tp1,
                                                 do,
                                                 epsilon,
                                                 rolling_r,
                                                 weights=is_ws)
                else:
                    o_t, a_t, r_t, o_tp1, do = self.replay_buffer.sample(
                        self.batch_size)
                    feed = self._build_feed_dict(o_t, a_t, r_t, o_tp1, do,
                                                 epsilon, rolling_r)

                # run training (and summary) operations
                if self.use_tensorboard:
                    summary, _, td_errors = self.sess.run(
                        [self.merge_op, self.train_op, self._td_errors],
                        feed_dict=feed)
                    self.writer.add_summary(summary, t)
                else:
                    # also fetch the TD errors so prioritized replay can update its priorities
                    _, td_errors = self.sess.run(
                        [self.train_op, self._td_errors], feed_dict=feed)

                # new td errors needed to update buffer weights
                if self.prioritized_replay:
                    new_prios = np.abs(td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(batch_idxs, new_prios)

                # sync target network every C steps
                if (t - self.training_start) % self.target_update_freq == 0:
                    self.sess.run(self.update_target_ops)

            if self.solved_callback is not None:
                if self.solved_callback(episode_rewards):
                    self.logger.info('Solved!')
                    break

        # total reward of last episode
        episode_rewards.append(np.sum(episode_reward_series[-1]))

        # finalize training, e.g. set flags, write done-file
        self._finalize_training()

    def run(self, render=True):
        """ Runs policy on given environment """

        if not self.is_trained:
            self.logger.warning('Trying to run untrained model!')

        # set necessary parameters to their defaults
        epsilon = self.final_eps
        reward = 0.0
        obs = self.env.reset()

        while True:

            # decide on an action: either greedily from the policy or a random one
            _rand = np.random.choice([True, False], p=[epsilon, 1 - epsilon])
            if _rand:
                action = self.env.action_space.sample()
            else:
                action = np.argmax(self.sess.run(self.q_t,
                                                 {self.q_t_in: [obs]}),
                                   axis=1)
                assert len(action) == 1, 'only one action can be taken!'
                action = action[0]

            # act on environment with chosen action
            obs, rew, done, _ = self.env.step(action)
            reward += rew

            if render:
                self.env.render()

            if done:
                self.logger.info('Done! Reward {}'.format(reward))
                reward = 0.0
                obs = self.env.reset()
Ejemplo n.º 21
0
def main():
    with U.make_session(8):
        env = gym.make("CartPole-v0")

        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape,
                                                  name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )

        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000,
                                     initial_p=1.0,
                                     final_p=0.02)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            env.render()

            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)

            is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
            if is_solved:
                # Show off the result
                env.render()
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        32)
                    train(obses_t, actions, rewards, obses_tp1, dones,
                          np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular(
                    "mean episode reward",
                    round(np.mean(episode_rewards[-101:-1]), 1))
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()
Ejemplo n.º 22
0
def main():

    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000

    batch_size = 32
    n_size = 84
    discount = 0.99

    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight_2.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    #replay_buffer = PrioritizedReplayBuffer(MAX_BUFFER_SIZE, alpha=prioritized_replay_alpha)
    replay_buffer = ReplayBuffer(MAX_BUFFER_SIZE)
    sess = tf.Session()

    mainDQN = DQN(sess, name="main")
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list

    sess.run(tf.global_variables_initializer())

    copy_ops = get_copy_var_ops(dest_scope_name="target",
                                src_scope_name="main")
    sess.run(copy_ops)

    saver = tf.train.Saver(var_list=dqn_var_list)

    for eps in range(MAX_EPISODES):
        # decaying epsilon greedy
        e = 1. / ((eps / 10) + 1)
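        # e.g. episode 0 -> e = 1.0, episode 10 -> 0.5, episode 90 -> 0.1,
        # i.e. exploration decays hyperbolically with the episode index rather
        # than on a fixed linear schedule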
        done = False
        step_count = 0
        state = env.reset()
        state_queue = deque(maxlen=4)
        next_state_queue = deque(maxlen=4)

        state_queue.append(state)
        next_state_queue.append(state)

        prev_100 = 0
        curr_100 = 0

        while not done:
            step_count += 1

            # cumulate 4 frames
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                next_state_queue.append(next_state)
                continue

            # training starts
            if np.random.rand() < e:
                action = env.action_space.sample()
            else:
                # Choose an action greedily from the Q-network
                action = np.argmax(
                    mainDQN.predict(
                        np.reshape(np.array(state_queue),
                                   [1, n_size, n_size, 4])))

            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)

            if done:  # Penalty
                reward = -100

            curr_100 += reward

            next_state_queue.append(next_state)

            replay_buffer.add(np.array(state_queue), action, reward,
                              np.array(next_state_queue), done)

            if step_count % TRAIN_EPISODE == 0:
                states, actions, rewards, next_states, dones = replay_buffer.sample(
                    batch_size)
                states, next_states = np.reshape(
                    states, [batch_size, n_size, n_size, 4]), np.reshape(
                        next_states, [batch_size, n_size, n_size, 4])

                Q_t = targetDQN.predict(next_states)
                Q_m = mainDQN.predict(states)
                Q_t = np.max(Q_t, axis=1)

                # mask the bootstrap term for terminal transitions
                estimates = rewards + discount * Q_t * (1 - dones)
                Q_m[np.arange(batch_size), actions] = estimates

                loss = mainDQN.update(states, Q_m)
                print("eps: {} step: {} loss: {}".format(
                    eps, step_count, loss))

                if curr_100 > prev_100:
                    save_path = saver.save(
                        sess, os.path.join(checkpoint_dir, save_file_name))
                    print("Model saved in file: %s" % save_path)

                prev_100 = curr_100
                curr_100 = 0

            if step_count % TARGET_UPDATE_EPS == 0:
                sess.run(copy_ops)

            state_queue.append(next_state)
Ejemplo n.º 23
0
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)

            is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
            if is_solved:
                # Show off the result
                env.render()
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()
Ejemplo n.º 24
0
class MemBufferThread(threading.Thread):
    # note the concept of variable (keyword) arguments here
    def __init__(self,
                 mem_queue,
                 max_timesteps=1000000,
                 buffer_size=50000,
                 batch_size=32,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6):

        threading.Thread.__init__(self)
        self.mem_queue = mem_queue
        self.prioritized_replay = prioritized_replay
        self.batch_size = batch_size
        self.batch_idxes = None
        self.prioritized_replay_eps = prioritized_replay_eps

        # Create the replay buffer
        if prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = max_timesteps
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(buffer_size)
            self.beta_schedule = None

    def __len__(self):
        return self.replay_buffer.__len__()

    def sample(self, t):
        if self.prioritized_replay:
            experience = self.replay_buffer.sample(
                self.batch_size,
                beta=self.beta_schedule.value(t))  # the choice of t here is debatable
            (obses_t, actions, rewards, obses_tp1, dones, weights,
             self.batch_idxes) = experience
        else:
            obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                self.batch_size)
            #  np.ones_like() : Return an array of ones with the same shape and type as a given array.
            weights, self.batch_idxes = np.ones_like(rewards), None

        return obses_t, actions, rewards, obses_tp1, dones, weights

    def update_priorities(self, td_errors):
        new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
        self.replay_buffer.update_priorities(self.batch_idxes, new_priorities)

    def run(self):
        # flag = 1
        while True:
            if self.mem_queue.full():
                print("the mem_queue is full")
            # if self.replay_buffer.__len__() >= 100000 and self.replay_buffer.__len__() % 100 == 0:  # bool(flag):
            #     # print("replay_buffer is 100000 !")
            #     print('')
            #    flag = 0
            if not self.mem_queue.empty():
                single_mem = self.mem_queue.get()
                self.replay_buffer.add(single_mem[0], single_mem[1],
                                       single_mem[2], single_mem[3],
                                       single_mem[4])
Ejemplo n.º 25
0
                         and info['ale.lives'] > 0)
        prev_lives = info['ale.lives']

        replay_buffer.add(obs, action, np.sign(rew), new_obs, float(death))
        obs = new_obs
        episode_rewards[-1] += rew

        if done:
            log.add_scalar('reward', episode_rewards[-1], num_iters)
            episode_rewards.append(0.0)
            obs = env.reset()
            num_episodes += 1

        if num_iters > args.learning_starts and num_iters % args.learning_freq == 0:

            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                args.batch_size)
            # Reshape state to (batch, channels, x_dim, y_dim)
            obses_t = np.transpose(obses_t, [0, 3, 1, 2])
            obses_tp1 = np.transpose(obses_tp1, [0, 3, 1, 2])

            # TODO
            td_errors = agent.learn(obses_t, actions, rewards, obses_tp1,
                                    dones)
            td_errors_list.append(td_errors.item())
            log.add_scalar('td_error', td_errors.item(), num_iters)

            num_updates += 1

            # Update target network.
        if num_iters > args.learning_starts and num_iters % args.target_update_freq == 0:
            # TODO
Ejemplo n.º 26
0
    def evaluate(self, num_episodes, render=False):
        with U.make_session(NUM_CORES):
            self.t0 = time.time()
            env = self.env.env

            # Create all the functions necessary to train the model
            act, train, update_target, debug = deepq.build_train(
                    make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
                    q_func=model,
                    num_actions=env.action_space.n,
                    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4)
            )
            # Create the replay buffer
            replay_buffer = ReplayBuffer(50000)
            # Create the schedule for exploration starting from 1 (every action is random) down to
            # 0.02 (98% of actions are selected according to values predicted by the model).
            exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

            # Initialize the parameters and copy them to the target network.
            U.initialize()
            update_target()

            self.episode_count += 1
            state = env.reset()
            self.scores = [0.0]
            episode_q = []

            for t in itertools.count():
                action = act(state[None], update_eps=exploration.value(t))[0]
                observation, reward, done, _ = env.step(action)
                replay_buffer.add(state, action, reward, observation, float(done))

                state = observation
                self.scores[-1] += reward

                episode_q.append(float(debug['q_values'](state[None]).max()))

                if render:
                    env.render()

                if done:
                    print('{0}, score: {1} ({2})'.format(len(self.scores), self.scores[-1], np.mean(self.scores[-100:])))
                    self.evaluation.info['q_values'].append(np.mean(episode_q))

                    if len(self.scores) >= num_episodes:
                        return self.final_evaluation()

                    state = env.reset()
                    episode_q = []
                    self.scores.append(0)

                    if self.env.solved(self.scores):
                        self.evaluation.info['solved'] = len(self.scores)

                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))

                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()

            U.reset()
            return self.final_evaluation()
Ejemplo n.º 27
0
class DQNLearningAgent(Agent):
    def __init__(
            self,
            env,
            # observation_space,
            # action_space,
            network=None,
            scope='deepq',
            seed=None,
            lr=None,  # Was 5e-4
            lr_mc=5e-4,
            total_episodes=None,
            total_timesteps=100000,
            buffer_size=50000,
            exploration_fraction=0.1,
            exploration_final_eps=None,  # was 0.02
            train_freq=1,
            train_log_freq=100,
            batch_size=32,
            print_freq=100,
            checkpoint_freq=10000,
            # checkpoint_path=None,
            learning_starts=1000,
            gamma=None,
            target_network_update_freq=500,
            prioritized_replay=False,
            prioritized_replay_alpha=0.6,
            prioritized_replay_beta0=0.4,
            prioritized_replay_beta_iters=None,
            prioritized_replay_eps=1e-6,
            save_path=None,
            load_path=None,
            save_reward_threshold=None,
            **network_kwargs):
        super().__init__(env, seed)
        if train_log_freq % train_freq != 0:
            raise ValueError(
                'Train log frequency should be a multiple of train frequency')
        elif checkpoint_freq % train_log_freq != 0:
            raise ValueError(
                'Checkpoint freq should be a multiple of train log frequency, or model saving will not be logged properly'
            )
        print('init dqnlearningagent')
        self.train_log_freq = train_log_freq
        self.scope = scope
        self.learning_starts = learning_starts
        self.save_reward_threshold = save_reward_threshold
        self.batch_size = batch_size
        self.train_freq = train_freq
        self.total_episodes = total_episodes
        self.total_timesteps = total_timesteps
        # TODO: scope not doing anything.
        if network is None and 'lunar' in env.unwrapped.spec.id.lower():
            if lr is None:
                lr = 1e-3
            if exploration_final_eps is None:
                exploration_final_eps = 0.02
            #exploration_fraction = 0.1
            #exploration_final_eps = 0.02
            target_network_update_freq = 1500
            #print_freq = 100
            # num_cpu = 5
            if gamma is None:
                gamma = 0.99

            network = 'mlp'
            network_kwargs = {
                'num_layers': 2,
                'num_hidden': 64,
            }

        self.target_network_update_freq = target_network_update_freq
        self.gamma = gamma

        get_session()
        # set_global_seeds(seed)
        # TODO: Check whether below is ok to substitue for set_global_seeds.
        try:
            import tensorflow as tf
            tf.set_random_seed(seed)
        except ImportError:
            pass

        self.q_func = build_q_func(network, **network_kwargs)

        # capture the shape outside the closure so that the env object is not serialized
        # by cloudpickle when serializing make_obs_ph

        def make_obs_ph(name):
            return ObservationInput(env.observation_space, name=name)

        act, self.train, self.train_mc, self.update_target, debug = deepq.build_train(
            make_obs_ph=make_obs_ph,
            q_func=self.q_func,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            optimizer_mc=tf.train.AdamOptimizer(learning_rate=lr_mc),
            gamma=gamma,
            grad_norm_clipping=10,
            param_noise=False,
            scope=scope,
            # reuse=reuse,
        )

        act_params = {
            'make_obs_ph': make_obs_ph,
            'q_func': self.q_func,
            'num_actions': env.action_space.n,
        }

        self._act = ActWrapper(act, act_params)

        self.print_freq = print_freq
        self.checkpoint_freq = checkpoint_freq
        # Create the replay buffer
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps

        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                buffer_size,
                alpha=prioritized_replay_alpha,
            )
            if prioritized_replay_beta_iters is None:
                if total_episodes is not None:
                    raise NotImplementedError(
                        'Need to check how to set exploration based on episodes'
                    )
                prioritized_replay_beta_iters = total_timesteps
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=prioritized_replay_beta0,
                final_p=1.0,
            )
        else:
            self.replay_buffer = ReplayBuffer(buffer_size)
            self.replay_buffer_mc = ReplayBuffer(buffer_size)
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(
                exploration_fraction *
                total_timesteps if total_episodes is None else total_episodes),
            initial_p=1.0,
            final_p=exploration_final_eps,
        )

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

        self.episode_lengths = [0]
        self.episode_rewards = [0.0]
        self.discounted_episode_rewards = [0.0]
        self.start_values = [None]
        self.lunar_crashes = [0]
        self.lunar_goals = [0]
        self.saved_mean_reward = None

        self.td = None
        if save_path is None:
            self.td = tempfile.mkdtemp()
            outdir = self.td
            self.model_file = os.path.join(outdir, "model")
        else:
            outdir = os.path.dirname(save_path)
            os.makedirs(outdir, exist_ok=True)
            self.model_file = save_path
        print('DQN agent saving to:', self.model_file)
        self.model_saved = False

        if tf.train.latest_checkpoint(outdir) is not None:
            # TODO: Check scope addition
            load_variables(self.model_file, scope=self.scope)
            # load_variables(self.model_file)
            logger.log('Loaded model from {}'.format(self.model_file))
            self.model_saved = True
            raise Exception('Check that we want to load previous model')
        elif load_path is not None:
            # TODO: Check scope addition
            load_variables(load_path, scope=self.scope)
            # load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))

        self.train_log_file = None
        if save_path and load_path is None:
            self.train_log_file = self.model_file + '.log.csv'
            with open(self.train_log_file, 'w') as f:
                cols = [
                    'episode',
                    't',
                    'td_max',
                    'td_mean',
                    '100ep_r_mean',
                    '100ep_r_mean_discounted',
                    '100ep_v_mean',
                    '100ep_n_crashes_mean',
                    '100ep_n_goals_mean',
                    'saved_model',
                    'smoothing',
                ]
                f.write(','.join(cols) + '\n')

        self.training_episode = 0
        self.t = 0
        self.episode_t = 0
        """
        n = observation_space.n
        m = action_space.n
        self.Q = np.zeros((n, m))

        self._lr_schedule = lr_schedule
        self._eps_schedule = eps_schedule
        self._boltzmann_schedule = boltzmann_schedule
        """

        # Make placeholder for Q values
        self.q_values = debug['q_values']

    def _log_training_details(
        self,
        episode=None,
        t=None,
        td_max=None,
        td_mean=None,
        r_mean=None,
        r_mean_discounted=None,
        v_mean=None,
        n_crashes_mean=None,
        n_goals_mean=None,
        saved_model=False,
        smoothing=False,
    ):
        if self.train_log_file is not None:
            with open(self.train_log_file, 'a+') as f:
                f.write('{}\n'.format(','.join([
                    str(episode),
                    str(t),
                    '{:.5f}'.format(td_max) if td_max is not None else '',
                    '{:.5f}'.format(td_mean) if td_mean is not None else '',
                    '{:.1f}'.format(r_mean) if r_mean is not None else '',
                    '{:.1f}'.format(r_mean_discounted)
                    if r_mean_discounted is not None else '',
                    '{:.1f}'.format(v_mean) if v_mean is not None else '',
                    '{:.1f}'.format(n_crashes_mean)
                    if n_crashes_mean is not None else '',
                    '{:.1f}'.format(n_goals_mean)
                    if n_goals_mean is not None else '',
                    str(int(saved_model)),
                    str(int(smoothing)),
                ])))

    def get_q_values(self, s):
        return self.q_values(s)[0]
        """
        q_t = self.q_func(
            self.obs_t_input.get(),
            self.n_actions,
            scope='q_func',
            reuse=True,  # reuse parameters from act
        )
            Q = sess.run(
                Q_values,
                feed_dict={Q_obs: np.array(states)}
            )

        raise NotImplementedError
        """

    def act(self, s, explore, explore_eps=None):
        # Take action and update exploration to the newest value
        # get_session()
        obs = s
        if explore and explore_eps is None:
            update_eps = self.exploration.value(
                self.t if self.total_episodes is None else self.training_episode)
        elif explore:
            update_eps = explore_eps
        else:
            update_eps = 0
        return self._act(
            np.array(obs)[None],
            update_eps=update_eps,
        )[0]

    def smooth(
        self,
        behavior_policy,
        evaluation_timesteps,
        max_k_random_actions=50,
    ):
        """Sample episodes to use for monte-carlo rollouts."""
        obs = self.env.reset()
        ep = 0
        episode_rewards = []
        episode_states = []
        episode_actions = []

        # TODO: Don't hard-code, and bias towards smaller.

        def get_random_k_t():
            k_random = self.np_random.randint(0, max_k_random_actions)
            random_t = self.np_random.randint(k_random, 200)
            return k_random, random_t

        k_random_actions, random_t = get_random_k_t()
        for t in range(evaluation_timesteps):
            episode_t = len(episode_actions)
            if IS_LOCAL and episode_t >= random_t:
                self.env.render()
            if episode_t < k_random_actions or episode_t == random_t:
                next_action = behavior_policy.act(
                    obs,
                    explore=True,
                    explore_eps=1,
                )
            else:
                next_action = behavior_policy.act(obs, explore=False)
            obs1, reward, done, _ = self.env.step(next_action)
            episode_rewards.append(reward)
            episode_states.append(obs)
            episode_actions.append(next_action)
            obs = obs1
            if done:
                for i, (o, a) in enumerate(
                        zip(episode_states[random_t:],
                            episode_actions[random_t:])):
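                    # discounted reward-to-go from step (random_t + i):
                    #     G = sum_j gamma**j * r_{random_t + i + j},
                    # used below as a Monte-Carlo regression target for train_mc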
                    weighted_rewards = [
                        r * self.gamma**j
                        for j, r in enumerate(episode_rewards[random_t + i:])
                    ]
                    reward_to_go = sum(weighted_rewards)
                    self.replay_buffer_mc.add(
                        o,
                        a,
                        reward_to_go,
                        None,
                        None,
                    )

                    # Update model.
                    obses_t, actions, rewards, _, _ = self.replay_buffer_mc.sample(
                        self.batch_size)
                    weights = np.ones_like(rewards)
                    td_errors = self.train_mc(obses_t, actions, rewards,
                                              weights)
                    # print(rewards)
                    # print(td_errors)
                    #print(self.get_q_values(o)[a], reward_to_go)
                    # print('----')
                    simulated_t = t - len(episode_rewards) + random_t + i
                    if simulated_t % self.train_log_freq == 0:
                        self._log_training_details(
                            episode=ep,
                            t=simulated_t,
                            td_max=np.max(np.abs(td_errors)),
                            td_mean=np.mean(np.abs(td_errors)),
                            smoothing=True,
                        )

                    # Save model
                    if (self.checkpoint_freq is not None
                            and simulated_t % self.checkpoint_freq == 0):
                        if self.print_freq is not None:
                            logger.log("Saving model due to smoothing")
                        # TODO: Check scope addition
                        save_variables(self.model_file, scope=self.scope)
                        # save_variables(self.model_file)
                        self.model_saved = True

                obs = self.env.reset()
                episode_rewards = []
                episode_states = []
                episode_actions = []
                ep += 1
                k_random_actions, random_t = get_random_k_t()
            """
            # Finish
            obs = obs1
            self.t += 1
            if done:
                self.episode_rewards.append(0.0)
                self.training_episode += 1
                obs = self.env.reset()
            """
        # TODO: Check that model isn't getting worse?
        # TODO: Reload last best saved model like in self.end_learning?

    @property
    def mean_100ep_reward(self):
        return round(np.mean(self.episode_rewards[-101:-1]), 1)

    @property
    def mean_100ep_discounted_reward(self):
        return round(np.mean(self.discounted_episode_rewards[-101:-1]), 1)

    @property
    def mean_100ep_start_value(self):
        return round(np.mean(self.start_values[-100:]), 1)

    @property
    def mean_100ep_lunar_crashes(self):
        return round(np.mean(self.lunar_crashes[-100:]), 1)

    @property
    def mean_100ep_lunar_goals(self):
        return round(np.mean(self.lunar_goals[-100:]), 1)

    @property
    def mean_100ep_length(self):
        return round(np.mean(self.episode_lengths[-100:]), 1)

    def update(self, s, a, s1, r, done, verbose=False, freeze_buffer=False):
        # get_session()
        obs = s
        new_obs = s1
        action = a
        rew = r
        # Store transition in the replay buffer.
        if not freeze_buffer:
            self.replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        self.discounted_episode_rewards[-1] += rew * \
            self.gamma ** self.episode_t
        if self.start_values[-1] is None:
            self.start_values[-1] = max(self.get_q_values(s))
        if rew == -100:
            self.lunar_crashes[-1] = 1
        elif rew == 100:
            self.lunar_goals[-1] = 1

        mean_100ep_reward = self.mean_100ep_reward

        td_errors = None
        if self.t > self.learning_starts and self.t % self.train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if self.prioritized_replay:
                experience = self.replay_buffer.sample(
                    self.batch_size,
                    beta=self.beta_schedule.value(self.t),
                )
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                    self.batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones,
                                   weights)
            if self.prioritized_replay:
                new_priorities = np.abs(td_errors) + \
                    self.prioritized_replay_eps
                self.replay_buffer.update_priorities(batch_idxes,
                                                     new_priorities)

        if self.t > self.learning_starts and self.t % self.target_network_update_freq == 0:
            # Update target network periodically.
            self.update_target()

        saved = False
        if (self.checkpoint_freq is not None and self.t > self.learning_starts
                and self.training_episode > 100
                and self.t % self.checkpoint_freq == 0):
            if (self.saved_mean_reward is None
                    or mean_100ep_reward > self.saved_mean_reward
                    or (self.save_reward_threshold is not None
                        and mean_100ep_reward >= self.save_reward_threshold)):
                saved = True
                if self.print_freq is not None:
                    logger.log(
                        "Saving model due to mean reward increase (or mean reward above {}): {} -> {}"
                        .format(
                            self.save_reward_threshold if
                            self.save_reward_threshold is not None else 'NULL',
                            self.saved_mean_reward, mean_100ep_reward))
                # TODO: Check scope addition
                save_variables(self.model_file, scope=self.scope)
                # save_variables(self.model_file)
                self.model_saved = True
                self.saved_mean_reward = mean_100ep_reward

        if self.t > self.learning_starts and self.t % self.train_log_freq == 0:
            self._log_training_details(
                episode=self.training_episode,
                t=self.t,
                td_max=np.max(np.abs(td_errors)),
                td_mean=np.mean(np.abs(td_errors)),
                r_mean=mean_100ep_reward,
                r_mean_discounted=self.mean_100ep_discounted_reward,
                v_mean=self.mean_100ep_start_value,
                n_crashes_mean=self.mean_100ep_lunar_crashes,
                n_goals_mean=self.mean_100ep_lunar_goals,
                saved_model=saved,
            )

        self.t += 1
        self.episode_t += 1
        if done:
            self.start_values.append(None)
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0)
            self.lunar_crashes.append(0)
            self.lunar_goals.append(0)
            self.discounted_episode_rewards.append(0.0)
            self.training_episode += 1
            self.episode_t = 0

    def end_learning(self):
        if self.model_saved:
            if self.print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    self.saved_mean_reward))
            # TODO: Check scope addition
            load_variables(self.model_file, scope=self.scope)
            # load_variables(self.model_file)

    def close(self):
        if self.td is not None:
            import shutil
            shutil.rmtree(self.td)
Ejemplo n.º 28
0
					action = act(obs[None], update_eps=exploration.value(t))[0]
					new_obs, rew, done, _ = env.step(action)
					# Store transition in the replay buffer.
					replay_buffer.add(obs, action, rew, new_obs, float(done))
					obs = new_obs

					episode_rewards += rew
					if done:
						env.render()
						obs = env.reset()
						y_s[i,j] = episode_rewards
						break
				
					# Minimize the error in Bellman's equation on a batch sampled from replay buffer.
					if t > 1000:
						obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) #change to dynamic
						train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))				

					#is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
					#if is_solved:
						# Show off the result
						#env.render()
					#else:
						# Minimize the error in Bellman's equation on a batch sampled from replay buffer.
						#if t > 1000:
							#obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) #change to dynamic
							#train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
					# Update target network periodically.
					if t % target_update == 0:
						update_target()