Example 1
    def train(self):
        config = self.config
        sample_time, learn_time = 0, 0

        for _ in range(config["timesteps_per_iteration"]):
            self.num_timesteps += 1
            dt = time.time()
            # Take action and update exploration to the newest value
            action = self.dqn_graph.act(
                self.sess,
                np.array(self.obs)[None],
                self.exploration.value(self.num_timesteps))[0]
            new_obs, rew, done, _ = self.env.step(action)
            # Store transition in the replay buffer.
            self.replay_buffer.add(self.obs, action, rew, new_obs, float(done))
            self.obs = new_obs

            self.episode_rewards[-1] += rew
            self.episode_lengths[-1] += 1
            if done:
                self.obs = self.env.reset()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            sample_time += time.time() - dt

            if self.num_timesteps > config["learning_starts"] and \
                    self.num_timesteps % config["train_freq"] == 0:
                dt = time.time()
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                if config["prioritized_replay"]:
                    experience = self.replay_buffer.sample(
                        config["batch_size"],
                        beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, _,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = (
                        self.replay_buffer.sample(config["batch_size"]))
                    batch_idxes = None
                td_errors = self.dqn_graph.train(self.sess, obses_t, actions,
                                                 rewards, obses_tp1, dones,
                                                 np.ones_like(rewards))
                if config["prioritized_replay"]:
                    new_priorities = np.abs(td_errors) + (
                        config["prioritized_replay_eps"])
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)
                learn_time += (time.time() - dt)

            if self.num_timesteps > config["learning_starts"] and (
                    self.num_timesteps % config["target_network_update_freq"]
                    == 0):
                # Update target network periodically.
                self.dqn_graph.update_target(self.sess)

        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
        num_episodes = len(self.episode_rewards)

        info = {
            "sample_time": sample_time,
            "learn_time": learn_time,
            "steps": self.num_timesteps,
            "episodes": num_episodes,
            "exploration":
            int(100 * self.exploration.value(self.num_timesteps))
        }

        logger.record_tabular("sample_time", sample_time)
        logger.record_tabular("learn_time", learn_time)
        logger.record_tabular("steps", self.num_timesteps)
        logger.record_tabular("buffer_size", len(self.replay_buffer))
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
        logger.record_tabular(
            "% time spent exploring",
            int(100 * self.exploration.value(self.num_timesteps)))
        logger.dump_tabular()

        res = TrainingResult(self.experiment_id.hex, self.num_iterations,
                             mean_100ep_reward, mean_100ep_length, info)
        self.num_iterations += 1
        return res
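The loop above reads the exploration epsilon from self.exploration.value(self.num_timesteps) and, when prioritized replay is enabled, the importance-sampling exponent from self.beta_schedule.value(...). Those schedule objects are not shown in the snippet; below is a minimal sketch of the linearly annealed schedule such objects are typically instances of. The LinearSchedule name and its constructor arguments are assumptions for illustration, not necessarily the project's actual class.

# Minimal sketch of a linearly annealed schedule of the kind commonly used
# for the DQN exploration epsilon and the prioritized-replay beta.
# Class name and arguments are assumptions, not the project's actual API.
class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p, final_p):
        # Interpolate from initial_p to final_p over schedule_timesteps
        # steps, then hold final_p.
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# Example: epsilon decays from 1.0 to 0.02 over the first 10000 timesteps,
# while beta anneals from 0.4 to 1.0 over a 100000-step run.
exploration = LinearSchedule(10000, initial_p=1.0, final_p=0.02)
beta_schedule = LinearSchedule(100000, initial_p=0.4, final_p=1.0)
assert abs(exploration.value(5000) - 0.51) < 1e-9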
Example 2
File: dqn.py Project: zcli/ray
    def _train_sync(self):
        config = self.config
        sample_time, sync_time, learn_time, apply_time = 0, 0, 0, 0
        iter_init_timesteps = self.cur_timestep

        num_loop_iters = 0
        while (self.cur_timestep - iter_init_timesteps <
               config["timesteps_per_iteration"]):
            dt = time.time()
            if self.workers:
                worker_steps = ray.get([
                    w.do_steps.remote(config["sample_batch_size"] //
                                      len(self.workers),
                                      self.cur_timestep,
                                      store=False) for w in self.workers
                ])
                for steps in worker_steps:
                    for obs, action, rew, new_obs, done in steps:
                        self.actor.replay_buffer.add(obs, action, rew, new_obs,
                                                     done)
            else:
                self.actor.do_steps(config["sample_batch_size"],
                                    self.cur_timestep,
                                    store=True)
            num_loop_iters += 1
            self.cur_timestep += config["sample_batch_size"]
            self.steps_since_update += config["sample_batch_size"]
            sample_time += time.time() - dt

            if self.cur_timestep > config["learning_starts"]:
                if config["multi_gpu_optimize"]:
                    dt = time.time()
                    times = self.actor.do_multi_gpu_optimize(self.cur_timestep)
                    if num_loop_iters <= 1:
                        print("Multi-GPU times", times)
                    learn_time += (time.time() - dt)
                else:
                    # Minimize the error in Bellman's equation on a batch
                    # sampled from replay buffer.
                    for _ in range(
                            max(
                                1, config["train_batch_size"] //
                                config["sgd_batch_size"])):
                        dt = time.time()
                        gradients = [
                            self.actor.sample_buffer_gradient(
                                self.cur_timestep)
                        ]
                        learn_time += (time.time() - dt)
                        dt = time.time()
                        for grad in gradients:
                            self.actor.apply_gradients(grad)
                        apply_time += (time.time() - dt)
                dt = time.time()
                self._update_worker_weights()
                sync_time += (time.time() - dt)

            if (self.cur_timestep > config["learning_starts"]
                    and self.steps_since_update >
                    config["target_network_update_freq"]):
                # Update target network periodically.
                self.actor.dqn_graph.update_target(self.actor.sess)
                self.steps_since_update -= config["target_network_update_freq"]
                self.num_target_updates += 1

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        buffer_size_sum = 0
        if not self.workers:
            stats = self.actor.stats(self.cur_timestep)
            mean_100ep_reward += stats[0]
            mean_100ep_length += stats[1]
            num_episodes += stats[2]
            exploration = stats[3]
            buffer_size_sum += stats[4]
        for mean_rew, mean_len, episodes, exploration, buf_sz in ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers]):
            mean_100ep_reward += mean_rew
            mean_100ep_length += mean_len
            num_episodes += episodes
            buffer_size_sum += buf_sz
        mean_100ep_reward /= config["num_workers"]
        mean_100ep_length /= config["num_workers"]

        info = [
            ("mean_100ep_reward", mean_100ep_reward),
            ("exploration_frac", exploration),
            ("steps", self.cur_timestep),
            ("episodes", num_episodes),
            ("buffer_sizes_sum", buffer_size_sum),
            ("target_updates", self.num_target_updates),
            ("sample_time", sample_time),
            ("weight_sync_time", sync_time),
            ("apply_time", apply_time),
            ("learn_time", learn_time),
            ("samples_per_s", num_loop_iters *
             np.float64(config["sample_batch_size"]) / sample_time),
            ("learn_samples_per_s", num_loop_iters *
             np.float64(config["train_batch_size"]) / learn_time),
        ]

        for k, v in info:
            logger.record_tabular(k, v)
        logger.dump_tabular()

        result = TrainingResult(episode_reward_mean=mean_100ep_reward,
                                episode_len_mean=mean_100ep_length,
                                timesteps_this_iter=self.cur_timestep -
                                iter_init_timesteps,
                                info=info)

        return result
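_train_sync fans each sampling step out to the workers with w.do_steps.remote(...) and blocks on ray.get(...) to gather the results. The sketch below isolates that fan-out/gather pattern with a toy Ray actor; SamplerWorker and its do_steps method are hypothetical stand-ins for illustration, not the worker class used above.

# Minimal sketch of the ray.get-over-remote-calls fan-out used in _train_sync.
# SamplerWorker and do_steps are hypothetical stand-ins.
import ray

ray.init()

@ray.remote
class SamplerWorker:
    def do_steps(self, num_steps, cur_timestep):
        # A real worker would step its own copy of the environment here;
        # this stub just returns placeholder transitions.
        return [(None, 0, 0.0, None, False) for _ in range(num_steps)]

workers = [SamplerWorker.remote() for _ in range(4)]

# Launch all workers in parallel, then block until every result is back.
batches = ray.get([w.do_steps.remote(25, cur_timestep=0) for w in workers])
transitions = [t for batch in batches for t in batch]
assert len(transitions) == 100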
Example 3
    def _train(self):
        config = self.config
        sample_time, sync_time, learn_time, apply_time = 0, 0, 0, 0
        iter_init_timesteps = self.cur_timestep

        num_loop_iters = 0
        steps_per_iter = config["sample_batch_size"] * len(self.workers)
        while (self.cur_timestep - iter_init_timesteps <
               config["timesteps_per_iteration"]):
            dt = time.time()
            ray.get([
                w.do_steps.remote(config["sample_batch_size"],
                                  self.cur_timestep) for w in self.workers
            ])
            num_loop_iters += 1
            self.cur_timestep += steps_per_iter
            self.steps_since_update += steps_per_iter
            sample_time += time.time() - dt

            if self.cur_timestep > config["learning_starts"]:
                dt = time.time()
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                self._update_worker_weights()
                sync_time += (time.time() - dt)
                dt = time.time()
                gradients = ray.get([
                    w.get_gradient.remote(self.cur_timestep)
                    for w in self.workers
                ])
                learn_time += (time.time() - dt)
                dt = time.time()
                for grad in gradients:
                    self.actor.apply_gradients(grad)
                apply_time += (time.time() - dt)

            if (self.cur_timestep > config["learning_starts"]
                    and self.steps_since_update >
                    config["target_network_update_freq"]):
                self.actor.dqn_graph.update_target(self.actor.sess)
                # Update target network periodically.
                self._update_worker_weights()
                self.steps_since_update -= config["target_network_update_freq"]
                self.num_target_updates += 1

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        buffer_size_sum = 0
        for mean_rew, mean_len, episodes, exploration, buf_sz in ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers]):
            mean_100ep_reward += mean_rew
            mean_100ep_length += mean_len
            num_episodes += episodes
            buffer_size_sum += buf_sz
        mean_100ep_reward /= len(self.workers)
        mean_100ep_length /= len(self.workers)

        info = [
            ("mean_100ep_reward", mean_100ep_reward),
            ("exploration_frac", exploration),
            ("steps", self.cur_timestep),
            ("episodes", num_episodes),
            ("buffer_sizes_sum", buffer_size_sum),
            ("target_updates", self.num_target_updates),
            ("sample_time", sample_time),
            ("weight_sync_time", sync_time),
            ("apply_time", apply_time),
            ("learn_time", learn_time),
            ("samples_per_s",
             num_loop_iters * np.float64(steps_per_iter) / sample_time),
            ("learn_samples_per_s",
             num_loop_iters * np.float64(config["train_batch_size"]) *
             np.float64(config["num_workers"]) / learn_time),
        ]

        for k, v in info:
            logger.record_tabular(k, v)
        logger.dump_tabular()

        result = TrainingResult(episode_reward_mean=mean_100ep_reward,
                                episode_len_mean=mean_100ep_length,
                                timesteps_this_iter=self.cur_timestep -
                                iter_init_timesteps,
                                info=info)

        return result
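Both _train_sync and _train call self._update_worker_weights() to push the central actor's parameters out to the workers before gradients are computed, but that helper is not part of the snippets. Under the assumption that the actor and workers expose get_weights/set_weights, it plausibly looks roughly like the sketch below.

# Hedged sketch of what _update_worker_weights plausibly does: broadcast the
# central actor's weights to every worker. get_weights/set_weights are
# assumptions about the actor and worker interfaces.
import ray

def update_worker_weights(actor, workers):
    weights = actor.get_weights()
    # ray.put stores the weights in the object store once, so every worker
    # fetches the same object instead of re-serializing them per call.
    weights_id = ray.put(weights)
    ray.get([w.set_weights.remote(weights_id) for w in workers])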
Example 4
File: dqn.py Project: zcli/ray
    def _train_async(self):
        apply_time = RunningStat(())
        wait_time = RunningStat(())
        gradient_lag = RunningStat(())
        iter_init_timesteps = self.cur_timestep
        num_gradients_applied = 0
        gradient_list = [
            worker.do_async_step.remote(i, self.cur_timestep,
                                        self.actor.get_weights(),
                                        num_gradients_applied)
            for i, worker in enumerate(self.workers)
        ]
        steps = self.config["sample_batch_size"] * len(gradient_list)
        self.cur_timestep += steps
        self.steps_since_update += steps

        while gradient_list:
            dt = time.time()
            gradient, info = ray.get(gradient_list[0])
            gradient_list = gradient_list[1:]
            wait_time.push(time.time() - dt)

            if gradient is not None:
                dt = time.time()
                self.actor.apply_gradients(gradient)
                apply_time.push(time.time() - dt)
                gradient_lag.push(num_gradients_applied - info["gradient_id"])
                num_gradients_applied += 1

            if (self.cur_timestep - iter_init_timesteps <
                    self.config["timesteps_per_iteration"]):
                worker_id = info["id"]
                gradient_list.append(
                    self.workers[info["id"]].do_async_step.remote(
                        worker_id, self.cur_timestep, self.actor.get_weights(),
                        num_gradients_applied))
                self.cur_timestep += self.config["sample_batch_size"]
                self.steps_since_update += self.config["sample_batch_size"]

            if (self.cur_timestep > self.config["learning_starts"]
                    and self.steps_since_update >
                    self.config["target_network_update_freq"]):
                # Update target network periodically.
                self.actor.dqn_graph.update_target(self.actor.sess)
                self.steps_since_update -= (
                    self.config["target_network_update_freq"])
                self.num_target_updates += 1

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        buffer_size_sum = 0
        stats = ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers])
        for stat in stats:
            mean_100ep_reward += stat[0]
            mean_100ep_length += stat[1]
            num_episodes += stat[2]
            exploration = stat[3]
            buffer_size_sum += stat[4]
            set_weights_time = stat[5]
            sample_time = stat[6]
            grad_time = stat[7]
        mean_100ep_reward /= self.config["num_workers"]
        mean_100ep_length /= self.config["num_workers"]

        info = [
            ("mean_100ep_reward", mean_100ep_reward),
            ("exploration_frac", exploration),
            ("steps", self.cur_timestep),
            ("episodes", num_episodes),
            ("buffer_sizes_sum", buffer_size_sum),
            ("target_updates", self.num_target_updates),
            ("mean_set_weights_time", set_weights_time),
            ("mean_sample_time", sample_time),
            ("mean_grad_time", grad_time),
            ("mean_apply_time", float(apply_time.mean)),
            ("mean_ray_wait_time", float(wait_time.mean)),
            ("gradient_lag_mean", float(gradient_lag.mean)),
            ("gradient_lag_stdev", float(gradient_lag.std)),
        ]

        for k, v in info:
            logger.record_tabular(k, v)
        logger.dump_tabular()

        result = TrainingResult(episode_reward_mean=mean_100ep_reward,
                                episode_len_mean=mean_100ep_length,
                                timesteps_this_iter=self.cur_timestep -
                                iter_init_timesteps,
                                info=info)

        return result
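_train_async keeps a list of in-flight gradient tasks, always blocks on the oldest one (gradient_list[0]), and resubmits work to whichever worker just returned. A closely related idiom, shown below for comparison only, uses ray.wait to process whichever task finishes first; GradientWorker and compute_gradient are hypothetical stand-ins, not the interface used above.

# Sketch of the "process whichever task finishes first, then resubmit" idiom
# with ray.wait, as an alternative to always blocking on the oldest task.
# GradientWorker and compute_gradient are hypothetical stand-ins.
import ray

ray.init()

@ray.remote
class GradientWorker:
    def compute_gradient(self, worker_id, weights):
        # A real worker would sample its replay buffer and return a gradient.
        return {"worker_id": worker_id, "gradient": None}

workers = [GradientWorker.remote() for _ in range(4)]
in_flight = {
    w.compute_gradient.remote(i, weights=None): i
    for i, w in enumerate(workers)
}

results_needed = 20
while results_needed > 0:
    # Block until at least one in-flight task completes.
    done_refs, _ = ray.wait(list(in_flight), num_returns=1)
    ref = done_refs[0]
    worker_id = in_flight.pop(ref)
    result = ray.get(ref)  # a real loop would apply the gradient here
    results_needed -= 1
    # Keep the pipeline full by immediately resubmitting to that worker.
    if results_needed > len(in_flight):
        in_flight[workers[worker_id].compute_gradient.remote(
            worker_id, weights=None)] = worker_id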