Beispiel #1
0
class Runner(object):
    """Actor object to start running simulation on workers.

    The gradient computation is also executed from this object.
    """

    def __init__(self, env_name, actor_id, logdir="/tmp/ray/a3c/", start=True):
        """Build the environment, policy and rollout thread for one actor.

        Args:
            env_name: Name passed to ``create_env``.
            actor_id: Stored as ``self.id``; also passed to the policy and
                used in the name of the summary directory.
            logdir: Base directory under which summaries are written.
            start: When true, ``start()`` is called before returning.
        """
        env = create_env(env_name)
        self.id = actor_id
        num_actions = env.action_space.n
        self.policy = LSTMPolicy(env.observation_space.shape, num_actions,
                                 actor_id)
        self.runner = RunnerThread(env, self.policy, 20)
        self.env = env
        self.logdir = logdir
        if start:
            self.start()

    def pull_batch_from_queue(self):
        """Take a rollout from the queue of the thread runner.

        Waits up to 600 seconds for the first item. While that rollout is
        not terminal, any further pieces that are already queued are passed
        to its ``extend()``; the loop stops as soon as the queue is empty.

        Returns:
            The first rollout taken from the queue, after extension.

        Raises:
            queue.Empty: If no item arrives within the timeout.
            BaseException: Any exception object found on the queue is
                re-raised here.
        """
        rollout = self.runner.queue.get(timeout=600.0)
        if isinstance(rollout, BaseException):
            raise rollout
        while not rollout.terminal:
            # Only the non-blocking read may signal an empty queue, so keep
            # the try body down to that single call.
            try:
                part = self.runner.queue.get_nowait()
            except queue.Empty:
                break
            if isinstance(part, BaseException):
                # Re-raise the queued error itself, not the partial rollout
                # gathered so far (which is not an exception).
                raise part
            rollout.extend(part)
        return rollout

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        completed = []
        while True:
            try:
                completed.append(self.runner.metrics_queue.get_nowait())
            except queue.Empty:
                break
        return completed

    def start(self):
        """Create this actor's summary writer and start the runner thread.

        Summaries go to ``<logdir>/agent_<id>``. The writer is kept on
        ``self.summary_writer`` and handed to the runner together with the
        policy's session.
        """
        summary_writer = tf.summary.FileWriter(
            os.path.join(self.logdir, "agent_%d" % self.id))
        self.summary_writer = summary_writer
        self.runner.start_runner(self.policy.sess, summary_writer)

    def compute_gradient(self, params):
        """Compute a gradient from the next available rollout.

        Args:
            params: Weights loaded into the policy before the rollout is
                pulled from the queue.

        Returns:
            A ``(gradient, info)`` tuple. ``info`` is a dict holding this
            actor's ``"id"`` and the batch ``"size"`` (``len(batch.a)``).
        """
        self.policy.set_weights(params)
        rollout = self.pull_batch_from_queue()
        batch = process_rollout(rollout, gamma=0.99, lambda_=1.0)
        gradient = self.policy.get_gradients(batch)
        info = {"id": self.id, "size": len(batch.a)}
        return gradient, info
Beispiel #2
0
 def __init__(self, env_name, config):
     """Set up the local policy and launch the remote runners.

     After the base ``Algorithm`` initializer runs, an environment and an
     ``LSTMPolicy`` are created locally, one ``Runner`` actor is started
     for each index below ``config["num_workers"]``, and the policy's
     weights are stored in ``self.parameters``. The iteration counter
     starts at zero.
     """
     Algorithm.__init__(self, env_name, config)
     environment = create_env(env_name)
     self.env = environment
     observation_shape = environment.observation_space.shape
     action_count = environment.action_space.n
     self.policy = LSTMPolicy(observation_shape, action_count, 0)
     # Each runner receives its index as its actor id.
     workers = []
     for worker_index in range(config["num_workers"]):
         workers.append(Runner.remote(env_name, worker_index))
     self.agents = workers
     self.parameters = self.policy.get_weights()
     self.iteration = 0
Beispiel #3
0
 def __init__(self, env_name, actor_id, logdir="/tmp/ray/a3c/", start=True):
     """Build the environment, policy and rollout thread for one actor.

     ``actor_id`` is stored as ``self.id`` and also passed to the policy.
     ``logdir`` is stored on the instance. When ``start`` is true,
     ``self.start()`` is called before returning.
     """
     environment = create_env(env_name)
     self.id = actor_id
     action_count = environment.action_space.n
     observation_shape = environment.observation_space.shape
     policy = LSTMPolicy(observation_shape, action_count, actor_id)
     self.policy = policy
     self.runner = RunnerThread(environment, policy, 20)
     self.env = environment
     self.logdir = logdir
     if not start:
         return
     self.start()
Beispiel #4
0
Datei: a3c.py Projekt: xgong/ray
 def __init__(self, env_name, config, upload_dir=None):
     """Set up the local policy and launch the remote runners.

     ``config`` is mutated: its ``"alg"`` entry is set to ``"A3C"`` before
     the base ``Algorithm`` initializer runs (``upload_dir`` is forwarded
     to it). An environment and an ``LSTMPolicy`` are then created
     locally, one ``Runner`` actor is started for each index below
     ``config["num_workers"]``, and the policy's weights are stored in
     ``self.parameters``. The iteration counter starts at zero.
     """
     config.update({"alg": "A3C"})
     Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
     environment = create_env(env_name)
     self.env = environment
     observation_shape = environment.observation_space.shape
     action_count = environment.action_space.n
     self.policy = LSTMPolicy(observation_shape, action_count, 0)
     # Each runner receives its index as its actor id, plus self.logdir.
     workers = []
     for worker_index in range(config["num_workers"]):
         workers.append(Runner.remote(env_name, worker_index, self.logdir))
     self.agents = workers
     self.parameters = self.policy.get_weights()
     self.iteration = 0
Beispiel #5
0
Datei: a3c.py Projekt: xgong/ray
class A3C(Algorithm):
    """Trainer that applies gradients computed by remote ``Runner`` actors.

    A local ``LSTMPolicy`` holds the weights. Runners compute gradients from
    the weights they are sent, and the trainer applies them one at a time.
    """

    def __init__(self, env_name, config, upload_dir=None):
        """Create the local policy and start one runner per worker.

        Args:
            env_name: Name passed to ``create_env`` and to every runner.
            config: Configuration mapping. It is mutated: ``"alg"`` is set
                to ``"A3C"``. ``"num_workers"`` gives the runner count.
            upload_dir: Forwarded unchanged to ``Algorithm.__init__``.
        """
        config.update({"alg": "A3C"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
        self.env = create_env(env_name)
        self.policy = LSTMPolicy(self.env.observation_space.shape,
                                 self.env.action_space.n, 0)
        # NOTE(review): self.logdir is not assigned in this class;
        # presumably Algorithm.__init__ provides it -- confirm.
        self.agents = [
            Runner.remote(env_name, i, self.logdir)
            for i in range(config["num_workers"])
        ]
        self.parameters = self.policy.get_weights()
        self.iteration = 0

    def train(self):
        """Run one training iteration and return its metrics.

        One gradient task is started per agent. Each time a task finishes,
        its gradient is applied to the local policy and ``self.parameters``
        is refreshed. While fewer than
        ``config["num_batches_per_iteration"]`` tasks have been launched,
        the agent that just finished is given a new task with the updated
        weights. The iteration ends when no task is outstanding.

        Returns:
            The result of ``fetch_metrics_from_workers()``.
        """
        gradient_list = [
            agent.compute_gradient.remote(self.parameters)
            for agent in self.agents
        ]
        max_batches = self.config["num_batches_per_iteration"]
        # Counts tasks launched so far, including the initial one per agent.
        batches_so_far = len(gradient_list)
        while gradient_list:
            done_id, gradient_list = ray.wait(gradient_list)
            gradient, info = ray.get(done_id)[0]
            self.policy.model_update(gradient)
            self.parameters = self.policy.get_weights()
            if batches_so_far < max_batches:
                batches_so_far += 1
                # info["id"] is used as an index into self.agents, so it is
                # expected to be the index the runner was created with.
                gradient_list.append(
                    self.agents[info["id"]].compute_gradient.remote(
                        self.parameters))
        res = self.fetch_metrics_from_workers()
        self.iteration += 1
        return res

    def fetch_metrics_from_workers(self):
        """Collect completed-episode metrics from every runner.

        Returns:
            A ``TrainingResult`` built from the experiment id, the current
            iteration, the mean episode reward, the mean episode length and
            an empty dict. Both means are NaN when no episode was reported.
        """
        metric_lists = [
            a.get_completed_rollout_metrics.remote() for a in self.agents
        ]
        episode_rewards = []
        episode_lengths = []
        # Fetch every worker's metrics with one call instead of blocking on
        # each handle in turn.
        for metrics in ray.get(metric_lists):
            for episode in metrics:
                episode_lengths.append(episode.episode_length)
                episode_rewards.append(episode.episode_reward)
        if episode_rewards:
            mean_reward = np.mean(episode_rewards)
            mean_length = np.mean(episode_lengths)
        else:
            # np.mean([]) would also give NaN but emits a RuntimeWarning;
            # state the "no finished episodes" case explicitly instead.
            mean_reward = mean_length = np.nan
        res = TrainingResult(self.experiment_id.hex, self.iteration,
                             mean_reward, mean_length, dict())
        return res