Example #1
    def _train(self):
        # Mock training step: reward and episode length follow a scaled
        # tanh curve of the iteration count.
        i = max(0, self.iteration - self.config["offset"])
        v = np.tanh(float(i) / self.config["width"])
        v *= self.config["height"]
        return TrainingResult(
            episode_reward_mean=v, episode_len_mean=v,
            timesteps_this_iter=self.config["iter_timesteps"],
            time_this_iter_s=self.config["iter_time"], info={})
Example #2
    def _train(self):
        # Simulate a training failure, optionally persisting across restores,
        # for failure-recovery tests.
        if self.config["mock_error"] and self.iteration == 1 \
                and (self.config["persistent_error"] or not self.restored):
            raise Exception("mock error")
        return TrainingResult(episode_reward_mean=10,
                              episode_len_mean=10,
                              timesteps_this_iter=10,
                              info={})
Example #3
    def __call__(self, **kwargs):
        """Report updated training status.

        Args:
            kwargs: Latest training result status, used to construct a
                TrainingResult. You must at least define `timesteps_total`,
                but probably want to report some of the other metrics as well.
        """

        with self._lock:
            self._latest_result = self._last_result = TrainingResult(**kwargs)
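
A hypothetical caller of this reporter might look like the sketch below; `run_training` and its internals are placeholders rather than part of the snippet above, and only the keyword-argument call mirrors the `__call__` signature shown:

def run_training(reporter, num_steps=100):
    # Sketch only: report at least `timesteps_total` on every call, plus any
    # other metrics worth tracking (the accuracy here is a placeholder).
    for step in range(num_steps):
        accuracy = 0.9  # stand-in for a value computed by real training code
        reporter(timesteps_total=step, mean_accuracy=accuracy)
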
Example #4
def main(_):
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])

    # Build the graph for the deep net
    y_conv, keep_prob = deepnn(x)

    with tf.name_scope('loss'):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=y_, logits=y_conv)
    cross_entropy = tf.reduce_mean(cross_entropy)

    with tf.name_scope('adam_optimizer'):
        train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
        correct_prediction = tf.cast(correct_prediction, tf.float32)
    accuracy = tf.reduce_mean(correct_prediction)

    graph_location = tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(20000):
            batch = mnist.train.next_batch(50)
            if i % 10 == 0:
                train_accuracy = accuracy.eval(feed_dict={
                        x: batch[0], y_: batch[1], keep_prob: 1.0})

                # !!! Report status to ray.tune !!!
                if status_reporter:
                    status_reporter.report(TrainingResult(
                        timesteps_total=i,
                        mean_accuracy=train_accuracy))

                print('step %d, training accuracy %g' % (i, train_accuracy))
            train_step.run(
                feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})

        print('test accuracy %g' % accuracy.eval(feed_dict={
                x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
Example #5
    def _train(self):
        self.classifier.train(input_fn=lambda: iris_data.train_input_fn(
            self.train_x, self.train_y, 10),
                              steps=100)

        self.steps = self.steps + 100

        eval_result = self.classifier.evaluate(
            input_fn=lambda: iris_data.eval_input_fn(self.test_x, self.test_y,
                                                     10))

        return TrainingResult(timesteps_this_iter=100,
                              timesteps_total=self.steps,
                              mean_validation_accuracy=eval_result['accuracy'])
Example #6
File: bc.py Project: velconia/ray
    def _train(self):
        self.optimizer.step()
        metric_lists = [re.get_metrics.remote() for re in
                        self.remote_evaluators]
        total_samples = 0
        total_loss = 0
        for metrics in metric_lists:
            for m in ray.get(metrics):
                total_samples += m["num_samples"]
                total_loss += m["loss"]
        result = TrainingResult(
            mean_loss=total_loss / total_samples,
            timesteps_this_iter=total_samples,
        )
        return result
Example #7
File: ddpg.py Project: qyccc/ray
    def _train(self):
        start_timestep = self.global_timestep

        self.optimizer.step()
        self.local_evaluator.update_target()
        self.last_target_update_ts = self.global_timestep
        self.num_target_updates += 1

        self.local_evaluator.set_global_timestep(self.global_timestep)
        for e in self.remote_evaluators:
            e.set_global_timestep.remote(self.global_timestep)

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0

        if self.remote_evaluators:
            stats = ray.get([e.stats.remote() for e in self.remote_evaluators])
        else:
            stats = self.local_evaluator.stats()
            if not isinstance(stats, list):
                stats = [stats]

        if self.config["per_worker_exploration"]:
            # Return stats from workers with the lowest 20% of exploration
            test_stats = stats[-int(max(1, len(stats) * 0.2)):]
        else:
            test_stats = stats

        for s in test_stats:
            mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
            mean_100ep_length += s["mean_100ep_length"] / len(test_stats)

        for s in stats:
            num_episodes += s["num_episodes"]

        result = TrainingResult(episode_reward_mean=mean_100ep_reward,
                                episode_len_mean=mean_100ep_length,
                                episodes_total=num_episodes,
                                timesteps_this_iter=self.global_timestep -
                                start_timestep,
                                info={})

        return result
Example #8
    def _train(self):
        start_timestep = self.global_timestep

        while (self.global_timestep - start_timestep <
               self.config["timesteps_per_iteration"]):

            if self.global_timestep < self.config["learning_starts"]:
                self._populate_replay_buffer()
            else:
                self.optimizer.step()

            stats = self._update_global_stats()

            if self.global_timestep - self.last_target_update_ts > \
                    self.config["target_network_update_freq"]:
                self.local_evaluator.update_target()
                self.last_target_update_ts = self.global_timestep
                self.num_target_updates += 1

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        exploration = -1

        for s in stats:
            mean_100ep_reward += s["mean_100ep_reward"] / len(stats)
            mean_100ep_length += s["mean_100ep_length"] / len(stats)
            num_episodes += s["num_episodes"]
            exploration = s["exploration"]

        result = TrainingResult(
            episode_reward_mean=mean_100ep_reward,
            episode_len_mean=mean_100ep_length,
            episodes_total=num_episodes,
            timesteps_this_iter=self.global_timestep - start_timestep,
            info=dict(
                {
                    "exploration": exploration,
                    "num_target_updates": self.num_target_updates,
                }, **self.optimizer.stats()))

        return result
Example #9
File: dqn.py Project: zionzheng/ray
    def _train_stats(self, start_timestep):
        if self.remote_evaluators:
            stats = ray.get([
                e.stats.remote() for e in self.remote_evaluators])
        else:
            stats = self.local_evaluator.stats()
            if not isinstance(stats, list):
                stats = [stats]

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        explorations = []

        if self.config["per_worker_exploration"]:
            # Return stats from workers with the lowest 20% of exploration
            test_stats = stats[-int(max(1, len(stats)*0.2)):]
        else:
            test_stats = stats

        for s in test_stats:
            mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
            mean_100ep_length += s["mean_100ep_length"] / len(test_stats)

        for s in stats:
            num_episodes += s["num_episodes"]
            explorations.append(s["exploration"])

        opt_stats = self.optimizer.stats()

        result = TrainingResult(
            episode_reward_mean=mean_100ep_reward,
            episode_len_mean=mean_100ep_length,
            episodes_total=num_episodes,
            timesteps_this_iter=self.global_timestep - start_timestep,
            info=dict({
                "min_exploration": min(explorations),
                "max_exploration": max(explorations),
                "num_target_updates": self.num_target_updates,
            }, **opt_stats))

        return result
Example #10
    def _fetch_metrics_from_remote_evaluators(self):
        episode_rewards = []
        episode_lengths = []
        metric_lists = [a.get_completed_rollout_metrics.remote()
                        for a in self.remote_evaluators]
        for metrics in metric_lists:
            for episode in ray.get(metrics):
                episode_lengths.append(episode.episode_length)
                episode_rewards.append(episode.episode_reward)
        avg_reward = (
            np.mean(episode_rewards) if episode_rewards else float('nan'))
        avg_length = (
            np.mean(episode_lengths) if episode_lengths else float('nan'))
        timesteps = np.sum(episode_lengths) if episode_lengths else 0

        result = TrainingResult(
            episode_reward_mean=avg_reward,
            episode_len_mean=avg_length,
            timesteps_this_iter=timesteps)

        return result
Example #11
    def _train(self):
        start_timestep = self.global_timestep
        num_steps = 0

        while (self.global_timestep - start_timestep <
               self.config["timesteps_per_iteration"]):

            self.global_timestep += self.optimizer.step()
            num_steps += 1

            if self.global_timestep - self.last_target_update_ts > \
                    self.config["target_network_update_freq"]:
                self.local_evaluator.update_target()
                self.last_target_update_ts = self.global_timestep
                self.num_target_updates += 1

        test_stats = self._update_global_stats()
        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        explorations = []

        for s in test_stats:
            mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
            mean_100ep_length += s["mean_100ep_length"] / len(test_stats)
            num_episodes += s["num_episodes"]

        opt_stats = self.optimizer.stats()

        result = TrainingResult(
            episode_reward_mean=mean_100ep_reward,
            episode_len_mean=mean_100ep_length,
            episodes_total=num_episodes,
            timesteps_this_iter=self.global_timestep - start_timestep,
            info=dict({
                "num_target_updates": self.num_target_updates,
            }, **opt_stats))

        return result
Example #12
    def _train(self):
        self.optimizer.step()

        episode_rewards = []
        episode_lengths = []
        metric_lists = [a.get_completed_rollout_metrics.remote()
                        for a in self.remote_evaluators]
        for metrics in metric_lists:
            for episode in ray.get(metrics):
                episode_lengths.append(episode.episode_length)
                episode_rewards.append(episode.episode_reward)
        avg_reward = np.mean(episode_rewards)
        avg_length = np.mean(episode_lengths)
        timesteps = np.sum(episode_lengths)

        result = TrainingResult(
            episode_reward_mean=avg_reward,
            episode_len_mean=avg_length,
            timesteps_this_iter=timesteps,
            info={})

        return result
Example #13
def collect_metrics(local_evaluator, remote_evaluators=[]):
    """Gathers episode metrics from CommonPolicyEvaluator instances."""

    episode_rewards = []
    episode_lengths = []
    policy_rewards = collections.defaultdict(list)
    metric_lists = ray.get([
        a.apply.remote(lambda ev: ev.sampler.get_metrics())
        for a in remote_evaluators
    ])
    metric_lists.append(local_evaluator.sampler.get_metrics())
    for metrics in metric_lists:
        for episode in metrics:
            episode_lengths.append(episode.episode_length)
            episode_rewards.append(episode.episode_reward)
            for (_, policy_id), reward in episode.agent_rewards.items():
                policy_rewards[policy_id].append(reward)
    if episode_rewards:
        min_reward = min(episode_rewards)
        max_reward = max(episode_rewards)
    else:
        min_reward = float('nan')
        max_reward = float('nan')
    avg_reward = np.mean(episode_rewards)
    avg_length = np.mean(episode_lengths)
    timesteps = np.sum(episode_lengths)

    for policy_id, rewards in policy_rewards.copy().items():
        policy_rewards[policy_id] = np.mean(rewards)

    return TrainingResult(episode_reward_max=max_reward,
                          episode_reward_min=min_reward,
                          episode_reward_mean=avg_reward,
                          episode_len_mean=avg_length,
                          episodes_total=len(episode_lengths),
                          timesteps_this_iter=timesteps,
                          policy_reward_mean=dict(policy_rewards))
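
For context, an agent's _train could hand metrics gathering off to this helper; a minimal sketch, assuming the agent holds `self.optimizer`, `self.local_evaluator`, and `self.remote_evaluators` as in the surrounding examples:

    def _train(self):
        # One optimization round, then summarize the episodes the local and
        # remote evaluators have completed since the last call.
        self.optimizer.step()
        return collect_metrics(self.local_evaluator, self.remote_evaluators)
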
Example #14
File: es.py Project: ml-squad/ray
    def _train(self):
        config = self.config

        step_tstart = time.time()
        theta = self.policy.get_weights()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results, num_episodes, num_timesteps = self._collect_results(
            theta_id, config["episodes_per_batch"],
            config["timesteps_per_batch"])

        all_noise_indices = []
        all_training_returns = []
        all_training_lengths = []
        all_eval_returns = []
        all_eval_lengths = []

        # Loop over the results.
        for result in results:
            all_eval_returns += result.eval_returns
            all_eval_lengths += result.eval_lengths

            all_noise_indices += result.noise_indices
            all_training_returns += result.noisy_returns
            all_training_lengths += result.noisy_lengths

        assert len(all_eval_returns) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_returns) ==
                len(all_training_lengths))

        self.episodes_so_far += num_episodes
        self.timesteps_so_far += num_timesteps

        # Assemble the results.
        eval_returns = np.array(all_eval_returns)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_returns = np.array(all_training_returns)
        noisy_lengths = np.array(all_training_lengths)

        # Process the returns.
        if config["return_proc_mode"] == "centered_rank":
            proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)
        else:
            raise NotImplementedError(config["return_proc_mode"])

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1],
            (self.noise.get(index, self.policy.num_params)
             for index in noise_indices),
            batch_size=500)
        g /= noisy_returns.size
        assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
                and count == len(noise_indices))
        # Compute the new weights theta.
        theta, update_ratio = self.optimizer.update(-g +
                                                    config["l2_coeff"] * theta)
        # Set the new weights in the local copy of the policy.
        self.policy.set_weights(theta)

        step_tend = time.time()
        tlogger.record_tabular("EvalEpRewMean", eval_returns.mean())
        tlogger.record_tabular("EvalEpRewStd", eval_returns.std())
        tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean())

        tlogger.record_tabular("EpRewMean", noisy_returns.mean())
        tlogger.record_tabular("EpRewStd", noisy_returns.std())
        tlogger.record_tabular("EpLenMean", noisy_lengths.mean())

        tlogger.record_tabular("Norm", float(np.square(theta).sum()))
        tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
        tlogger.record_tabular("UpdateRatio", float(update_ratio))

        tlogger.record_tabular("EpisodesThisIter", noisy_lengths.size)
        tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
        tlogger.record_tabular("TimestepsThisIter", noisy_lengths.sum())
        tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)

        tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
        tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
        tlogger.dump_tabular()

        info = {
            "weights_norm": np.square(theta).sum(),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": noisy_lengths.size,
            "episodes_so_far": self.episodes_so_far,
            "timesteps_this_iter": noisy_lengths.sum(),
            "timesteps_so_far": self.timesteps_so_far,
            "time_elapsed_this_iter": step_tend - step_tstart,
            "time_elapsed": step_tend - self.tstart
        }

        result = TrainingResult(episode_reward_mean=eval_returns.mean(),
                                episode_len_mean=eval_lengths.mean(),
                                timesteps_this_iter=noisy_lengths.sum(),
                                info=info)

        return result
Example #15
    def _train(self):
        return TrainingResult(
            episode_reward_mean=self.config["reward_amt"] * self.iteration,
            episode_len_mean=self.config["reward_amt"],
            timesteps_this_iter=self.config["iter_timesteps"],
            time_this_iter_s=self.config["iter_time"], info={})
Example #16
def result2(t, rew):
    return TrainingResult(time_total_s=t, neg_mean_loss=rew)
Example #17
def result(t, rew):
    return TrainingResult(time_total_s=t,
                          episode_reward_mean=rew,
                          training_iteration=int(t))
Example #18
def result2(t, rew):
    return TrainingResult(training_iteration=t, neg_mean_loss=rew)
Example #19
    def _train(self):
        config = self.config
        sample_time, sync_time, learn_time, apply_time = 0, 0, 0, 0
        iter_init_timesteps = self.cur_timestep

        num_loop_iters = 0
        steps_per_iter = config["sample_batch_size"] * len(self.workers)
        while (self.cur_timestep - iter_init_timesteps <
               config["timesteps_per_iteration"]):
            dt = time.time()
            ray.get([
                w.do_steps.remote(config["sample_batch_size"],
                                  self.cur_timestep) for w in self.workers
            ])
            num_loop_iters += 1
            self.cur_timestep += steps_per_iter
            self.steps_since_update += steps_per_iter
            sample_time += time.time() - dt

            if self.cur_timestep > config["learning_starts"]:
                dt = time.time()
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                self._update_worker_weights()
                sync_time += (time.time() - dt)
                dt = time.time()
                gradients = ray.get([
                    w.get_gradient.remote(self.cur_timestep)
                    for w in self.workers
                ])
                learn_time += (time.time() - dt)
                dt = time.time()
                for grad in gradients:
                    self.actor.apply_gradients(grad)
                apply_time += (time.time() - dt)

            if (self.cur_timestep > config["learning_starts"]
                    and self.steps_since_update >
                    config["target_network_update_freq"]):
                self.actor.dqn_graph.update_target(self.actor.sess)
                # Update target network periodically.
                self._update_worker_weights()
                self.steps_since_update -= config["target_network_update_freq"]
                self.num_target_updates += 1

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        buffer_size_sum = 0
        for mean_rew, mean_len, episodes, exploration, buf_sz in ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers]):
            mean_100ep_reward += mean_rew
            mean_100ep_length += mean_len
            num_episodes += episodes
            buffer_size_sum += buf_sz
        mean_100ep_reward /= len(self.workers)
        mean_100ep_length /= len(self.workers)

        info = [
            ("mean_100ep_reward", mean_100ep_reward),
            ("exploration_frac", exploration),
            ("steps", self.cur_timestep),
            ("episodes", num_episodes),
            ("buffer_sizes_sum", buffer_size_sum),
            ("target_updates", self.num_target_updates),
            ("sample_time", sample_time),
            ("weight_sync_time", sync_time),
            ("apply_time", apply_time),
            ("learn_time", learn_time),
            ("samples_per_s",
             num_loop_iters * np.float64(steps_per_iter) / sample_time),
            ("learn_samples_per_s",
             num_loop_iters * np.float64(config["train_batch_size"]) *
             np.float64(config["num_workers"]) / learn_time),
        ]

        for k, v in info:
            logger.record_tabular(k, v)
        logger.dump_tabular()

        result = TrainingResult(episode_reward_mean=mean_100ep_reward,
                                episode_len_mean=mean_100ep_length,
                                timesteps_this_iter=self.cur_timestep -
                                iter_init_timesteps,
                                info=info)

        return result
Example #20
    def _train(self):
        return TrainingResult(
            episode_reward_mean=10, episode_len_mean=10,
            timesteps_this_iter=10, info={})
Example #21
File: ppo.py Project: zcli/ray
    def _train(self):
        agents = self.agents
        config = self.config
        model = self.model

        print("===> iteration", self.iteration)

        iter_start = time.time()
        weights = ray.put(model.get_weights())
        [a.load_weights.remote(weights) for a in agents]
        trajectory, total_reward, traj_len_mean = collect_samples(
            agents, config, self.model.observation_filter,
            self.model.reward_filter)
        print("total reward is ", total_reward)
        print("trajectory length mean is ", traj_len_mean)
        print("timesteps:", trajectory["dones"].shape[0])
        if self.file_writer:
            traj_stats = tf.Summary(value=[
                tf.Summary.Value(tag="ppo/rollouts/mean_reward",
                                 simple_value=total_reward),
                tf.Summary.Value(tag="ppo/rollouts/traj_len_mean",
                                 simple_value=traj_len_mean)
            ])
            self.file_writer.add_summary(traj_stats, self.global_step)
        self.global_step += 1

        def standardized(value):
            # Divide by the maximum of value.std() and 1e-4
            # to guard against the case where all values are equal
            return (value - value.mean()) / max(1e-4, value.std())

        if config["use_gae"]:
            trajectory["advantages"] = standardized(trajectory["advantages"])
        else:
            trajectory["returns"] = standardized(trajectory["returns"])

        rollouts_end = time.time()
        print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
              ", stepsize=" + str(config["sgd_stepsize"]) + "):")
        names = [
            "iter", "total loss", "policy loss", "vf loss", "kl", "entropy"
        ]
        print(("{:>15}" * len(names)).format(*names))
        trajectory = shuffle(trajectory)
        shuffle_end = time.time()
        tuples_per_device = model.load_data(
            trajectory, self.iteration == 0 and config["full_trace_data_load"])
        load_end = time.time()
        rollouts_time = rollouts_end - iter_start
        shuffle_time = shuffle_end - rollouts_end
        load_time = load_end - shuffle_end
        sgd_time = 0
        for i in range(config["num_sgd_iter"]):
            sgd_start = time.time()
            batch_index = 0
            num_batches = (int(tuples_per_device) //
                           int(model.per_device_batch_size))
            loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], []
            permutation = np.random.permutation(num_batches)
            # Prepare to drop into the debugger
            if self.iteration == config["tf_debug_iteration"]:
                model.sess = tf_debug.LocalCLIDebugWrapperSession(model.sess)
            while batch_index < num_batches:
                full_trace = (i == 0 and self.iteration == 0 and batch_index
                              == config["full_trace_nth_sgd_batch"])
                batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \
                    batch_entropy = model.run_sgd_minibatch(
                        permutation[batch_index] * model.per_device_batch_size,
                        self.kl_coeff, full_trace,
                        self.file_writer)
                loss.append(batch_loss)
                policy_loss.append(batch_policy_loss)
                vf_loss.append(batch_vf_loss)
                kl.append(batch_kl)
                entropy.append(batch_entropy)
                batch_index += 1
            loss = np.mean(loss)
            policy_loss = np.mean(policy_loss)
            vf_loss = np.mean(vf_loss)
            kl = np.mean(kl)
            entropy = np.mean(entropy)
            sgd_end = time.time()
            print("{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format(
                i, loss, policy_loss, vf_loss, kl, entropy))

            values = []
            if i == config["num_sgd_iter"] - 1:
                metric_prefix = "ppo/sgd/final_iter/"
                values.append(
                    tf.Summary.Value(tag=metric_prefix + "kl_coeff",
                                     simple_value=self.kl_coeff))
                values.extend([
                    tf.Summary.Value(tag=metric_prefix + "mean_entropy",
                                     simple_value=entropy),
                    tf.Summary.Value(tag=metric_prefix + "mean_loss",
                                     simple_value=loss),
                    tf.Summary.Value(tag=metric_prefix + "mean_kl",
                                     simple_value=kl)
                ])
                if self.file_writer:
                    sgd_stats = tf.Summary(value=values)
                    self.file_writer.add_summary(sgd_stats, self.global_step)
            self.global_step += 1
            sgd_time += sgd_end - sgd_start
        if kl > 2.0 * config["kl_target"]:
            self.kl_coeff *= 1.5
        elif kl < 0.5 * config["kl_target"]:
            self.kl_coeff *= 0.5

        info = {
            "kl_divergence": kl,
            "kl_coefficient": self.kl_coeff,
            "rollouts_time": rollouts_time,
            "shuffle_time": shuffle_time,
            "load_time": load_time,
            "sgd_time": sgd_time,
            "sample_throughput": len(trajectory["observations"]) / sgd_time
        }

        print("kl div:", kl)
        print("kl coeff:", self.kl_coeff)
        print("rollouts time:", rollouts_time)
        print("shuffle time:", shuffle_time)
        print("load time:", load_time)
        print("sgd time:", sgd_time)
        print("sgd examples/s:", len(trajectory["observations"]) / sgd_time)
        print("total time so far:", time.time() - self.start_time)

        result = TrainingResult(
            episode_reward_mean=total_reward,
            episode_len_mean=traj_len_mean,
            timesteps_this_iter=trajectory["dones"].shape[0],
            info=info)

        return result
Example #22
def result(t, rew):
    return TrainingResult(time_total_s=t, episode_reward_mean=rew)
Example #23
File: es.py Project: zcli/ray
    def _train(self):
        config = self.config

        step_tstart = time.time()
        theta = self.policy.get_trainable_flat()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results = self._collect_results(theta_id, config["episodes_per_batch"],
                                        config["timesteps_per_batch"])

        curr_task_results = []
        ob_count_this_batch = 0
        # Loop over the results
        for result in results:
            assert result.eval_length is None, "We aren't doing eval rollouts."
            assert result.noise_inds_n.ndim == 1
            assert result.returns_n2.shape == (len(result.noise_inds_n), 2)
            assert result.lengths_n2.shape == (len(result.noise_inds_n), 2)
            assert result.returns_n2.dtype == np.float32

            result_num_eps = result.lengths_n2.size
            result_num_timesteps = result.lengths_n2.sum()
            self.episodes_so_far += result_num_eps
            self.timesteps_so_far += result_num_timesteps

            curr_task_results.append(result)
            # Update ob stats.
            if self.policy.needs_ob_stat and result.ob_count > 0:
                self.ob_stat.increment(result.ob_sum, result.ob_sumsq,
                                       result.ob_count)
                ob_count_this_batch += result.ob_count

        # Assemble the results.
        noise_inds_n = np.concatenate(
            [r.noise_inds_n for r in curr_task_results])
        returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
        lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
        assert (noise_inds_n.shape[0] == returns_n2.shape[0] ==
                lengths_n2.shape[0])
        # Process the returns.
        if config["return_proc_mode"] == "centered_rank":
            proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
        else:
            raise NotImplementedError(config["return_proc_mode"])

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
            (self.noise.get(idx, self.policy.num_params)
             for idx in noise_inds_n),
            batch_size=500)
        g /= returns_n2.size
        assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
                and count == len(noise_inds_n))
        update_ratio = self.optimizer.update(-g + config["l2coeff"] * theta)

        # Update ob stat (we're never running the policy in the master, but we
        # might be snapshotting the policy).
        if self.policy.needs_ob_stat:
            self.policy.set_ob_stat(self.ob_stat.mean, self.ob_stat.std)

        step_tend = time.time()
        tlogger.record_tabular("EpRewMean", returns_n2.mean())
        tlogger.record_tabular("EpRewStd", returns_n2.std())
        tlogger.record_tabular("EpLenMean", lengths_n2.mean())

        tlogger.record_tabular(
            "Norm", float(np.square(self.policy.get_trainable_flat()).sum()))
        tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
        tlogger.record_tabular("UpdateRatio", float(update_ratio))

        tlogger.record_tabular("EpisodesThisIter", lengths_n2.size)
        tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
        tlogger.record_tabular("TimestepsThisIter", lengths_n2.sum())
        tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)

        tlogger.record_tabular("ObCount", ob_count_this_batch)

        tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
        tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
        tlogger.dump_tabular()

        info = {
            "weights_norm": np.square(self.policy.get_trainable_flat()).sum(),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": lengths_n2.size,
            "episodes_so_far": self.episodes_so_far,
            "timesteps_this_iter": lengths_n2.sum(),
            "timesteps_so_far": self.timesteps_so_far,
            "ob_count": ob_count_this_batch,
            "time_elapsed_this_iter": step_tend - step_tstart,
            "time_elapsed": step_tend - self.tstart
        }

        result = TrainingResult(episode_reward_mean=returns_n2.mean(),
                                episode_len_mean=lengths_n2.mean(),
                                timesteps_this_iter=lengths_n2.sum(),
                                info=info)

        return result
Example #24
File: dqn.py Project: zcli/ray
    def _train_async(self):
        apply_time = RunningStat(())
        wait_time = RunningStat(())
        gradient_lag = RunningStat(())
        iter_init_timesteps = self.cur_timestep
        num_gradients_applied = 0
        gradient_list = [
            worker.do_async_step.remote(i, self.cur_timestep,
                                        self.actor.get_weights(),
                                        num_gradients_applied)
            for i, worker in enumerate(self.workers)
        ]
        steps = self.config["sample_batch_size"] * len(gradient_list)
        self.cur_timestep += steps
        self.steps_since_update += steps

        while gradient_list:
            dt = time.time()
            gradient, info = ray.get(gradient_list[0])
            gradient_list = gradient_list[1:]
            wait_time.push(time.time() - dt)

            if gradient is not None:
                dt = time.time()
                self.actor.apply_gradients(gradient)
                apply_time.push(time.time() - dt)
                gradient_lag.push(num_gradients_applied - info["gradient_id"])
                num_gradients_applied += 1

            if (self.cur_timestep - iter_init_timesteps <
                    self.config["timesteps_per_iteration"]):
                worker_id = info["id"]
                gradient_list.append(
                    self.workers[info["id"]].do_async_step.remote(
                        worker_id, self.cur_timestep, self.actor.get_weights(),
                        num_gradients_applied))
                self.cur_timestep += self.config["sample_batch_size"]
                self.steps_since_update += self.config["sample_batch_size"]

            if (self.cur_timestep > self.config["learning_starts"]
                    and self.steps_since_update >
                    self.config["target_network_update_freq"]):
                # Update target network periodically.
                self.actor.dqn_graph.update_target(self.actor.sess)
                self.steps_since_update -= (
                    self.config["target_network_update_freq"])
                self.num_target_updates += 1

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        buffer_size_sum = 0
        stats = ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers])
        for stat in stats:
            mean_100ep_reward += stat[0]
            mean_100ep_length += stat[1]
            num_episodes += stat[2]
            exploration = stat[3]
            buffer_size_sum += stat[4]
            set_weights_time = stat[5]
            sample_time = stat[6]
            grad_time = stat[7]
        mean_100ep_reward /= self.config["num_workers"]
        mean_100ep_length /= self.config["num_workers"]

        info = [
            ("mean_100ep_reward", mean_100ep_reward),
            ("exploration_frac", exploration),
            ("steps", self.cur_timestep),
            ("episodes", num_episodes),
            ("buffer_sizes_sum", buffer_size_sum),
            ("target_updates", self.num_target_updates),
            ("mean_set_weights_time", set_weights_time),
            ("mean_sample_time", sample_time),
            ("mean_grad_time", grad_time),
            ("mean_apply_time", float(apply_time.mean)),
            ("mean_ray_wait_time", float(wait_time.mean)),
            ("gradient_lag_mean", float(gradient_lag.mean)),
            ("gradient_lag_stdev", float(gradient_lag.std)),
        ]

        for k, v in info:
            logger.record_tabular(k, v)
        logger.dump_tabular()

        result = TrainingResult(episode_reward_mean=mean_100ep_reward,
                                episode_len_mean=mean_100ep_length,
                                timesteps_this_iter=self.cur_timestep -
                                iter_init_timesteps,
                                info=info)

        return result
Example #25
File: dqn.py Project: zcli/ray
    def _train_sync(self):
        config = self.config
        sample_time, sync_time, learn_time, apply_time = 0, 0, 0, 0
        iter_init_timesteps = self.cur_timestep

        num_loop_iters = 0
        while (self.cur_timestep - iter_init_timesteps <
               config["timesteps_per_iteration"]):
            dt = time.time()
            if self.workers:
                worker_steps = ray.get([
                    w.do_steps.remote(config["sample_batch_size"] //
                                      len(self.workers),
                                      self.cur_timestep,
                                      store=False) for w in self.workers
                ])
                for steps in worker_steps:
                    for obs, action, rew, new_obs, done in steps:
                        self.actor.replay_buffer.add(obs, action, rew, new_obs,
                                                     done)
            else:
                self.actor.do_steps(config["sample_batch_size"],
                                    self.cur_timestep,
                                    store=True)
            num_loop_iters += 1
            self.cur_timestep += config["sample_batch_size"]
            self.steps_since_update += config["sample_batch_size"]
            sample_time += time.time() - dt

            if self.cur_timestep > config["learning_starts"]:
                if config["multi_gpu_optimize"]:
                    dt = time.time()
                    times = self.actor.do_multi_gpu_optimize(self.cur_timestep)
                    if num_loop_iters <= 1:
                        print("Multi-GPU times", times)
                    learn_time += (time.time() - dt)
                else:
                    # Minimize the error in Bellman's equation on a batch
                    # sampled from replay buffer.
                    for _ in range(
                            max(
                                1, config["train_batch_size"] //
                                config["sgd_batch_size"])):
                        dt = time.time()
                        gradients = [
                            self.actor.sample_buffer_gradient(
                                self.cur_timestep)
                        ]
                        learn_time += (time.time() - dt)
                        dt = time.time()
                        for grad in gradients:
                            self.actor.apply_gradients(grad)
                        apply_time += (time.time() - dt)
                dt = time.time()
                self._update_worker_weights()
                sync_time += (time.time() - dt)

            if (self.cur_timestep > config["learning_starts"]
                    and self.steps_since_update >
                    config["target_network_update_freq"]):
                # Update target network periodically.
                self.actor.dqn_graph.update_target(self.actor.sess)
                self.steps_since_update -= config["target_network_update_freq"]
                self.num_target_updates += 1

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        buffer_size_sum = 0
        if not self.workers:
            stats = self.actor.stats(self.cur_timestep)
            mean_100ep_reward += stats[0]
            mean_100ep_length += stats[1]
            num_episodes += stats[2]
            exploration = stats[3]
            buffer_size_sum += stats[4]
        for mean_rew, mean_len, episodes, exploration, buf_sz in ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers]):
            mean_100ep_reward += mean_rew
            mean_100ep_length += mean_len
            num_episodes += episodes
            buffer_size_sum += buf_sz
        mean_100ep_reward /= config["num_workers"]
        mean_100ep_length /= config["num_workers"]

        info = [
            ("mean_100ep_reward", mean_100ep_reward),
            ("exploration_frac", exploration),
            ("steps", self.cur_timestep),
            ("episodes", num_episodes),
            ("buffer_sizes_sum", buffer_size_sum),
            ("target_updates", self.num_target_updates),
            ("sample_time", sample_time),
            ("weight_sync_time", sync_time),
            ("apply_time", apply_time),
            ("learn_time", learn_time),
            ("samples_per_s", num_loop_iters *
             np.float64(config["sample_batch_size"]) / sample_time),
            ("learn_samples_per_s", num_loop_iters *
             np.float64(config["train_batch_size"]) / learn_time),
        ]

        for k, v in info:
            logger.record_tabular(k, v)
        logger.dump_tabular()

        result = TrainingResult(episode_reward_mean=mean_100ep_reward,
                                episode_len_mean=mean_100ep_length,
                                timesteps_this_iter=self.cur_timestep -
                                iter_init_timesteps,
                                info=info)

        return result
Example #26
    def _train(self):
        return TrainingResult(timesteps_this_iter=1, done=True)
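
The pattern these examples share: _train performs one iteration of work and returns a TrainingResult describing it. A minimal end-to-end sketch of that shape, assuming the ray.tune API these snippets were written against (the class name, import paths, and metric values are illustrative, not drawn from any example above):

from ray.tune.result import TrainingResult
from ray.tune.trainable import Trainable


class MinimalTrainable(Trainable):
    def _setup(self):
        pass  # sketch only: no state to initialize

    def _train(self):
        # One unit of work per call; the Tune driver runs the outer loop.
        return TrainingResult(
            timesteps_this_iter=1,
            episode_reward_mean=0.0,  # placeholder metric
            info={})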