Example #1
    def __init__(self, num_shards, learning_starts, buffer_size,
                 replay_batch_size, prioritized_replay_alpha,
                 prioritized_replay_beta, prioritized_replay_eps):
        self.replay_starts = learning_starts // num_shards
        self.buffer_size = buffer_size // num_shards
        self.replay_batch_size = replay_batch_size
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps

        def gen_replay():
            while True:
                yield self.replay()

        ParallelIteratorWorker.__init__(self, gen_replay, False)

        def new_buffer():
            return PrioritizedReplayBuffer(
                self.buffer_size, alpha=prioritized_replay_alpha)

        self.replay_buffers = collections.defaultdict(new_buffer)

        # Metrics
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()
        self.num_added = 0
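The collections.defaultdict(new_buffer) idiom above builds one replay buffer per policy ID, lazily, the first time that ID is looked up. A minimal sketch of the same pattern, using a hypothetical stand-in class rather than RLlib's PrioritizedReplayBuffer:

import collections

class StubBuffer:
    """Hypothetical stand-in for a per-policy replay buffer."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.items = []

    def add(self, item):
        self.items.append(item)
        # Evict the oldest entry once capacity is exceeded.
        if len(self.items) > self.capacity:
            self.items.pop(0)

def new_buffer():
    return StubBuffer(capacity=100)

# Buffers are created on first access, one per policy ID.
replay_buffers = collections.defaultdict(new_buffer)
replay_buffers["policy_0"].add({"obs": 1})
replay_buffers["policy_1"].add({"obs": 2})
print(sorted(replay_buffers))  # ['policy_0', 'policy_1']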
Example #2
    def __init__(self, local_worker, minibatch_buffer_size, num_sgd_iter,
                 learner_queue_size, learner_queue_timeout):
        """Initialize the learner thread.

        Arguments:
            local_worker (RolloutWorker): process local rollout worker holding
                policies this thread will call learn_on_batch() on
            minibatch_buffer_size (int): max number of train batches to store
                in the minibatching buffer
            num_sgd_iter (int): number of passes to learn on per train batch
            learner_queue_size (int): max size of queue of inbound
                train batches to this thread
            learner_queue_timeout (int): raise an exception if the queue has
                stayed empty for this many seconds
        """
        threading.Thread.__init__(self)
        self.learner_queue_size = WindowStat("size", 50)
        self.local_worker = local_worker
        self.inqueue = queue.Queue(maxsize=learner_queue_size)
        self.outqueue = queue.Queue()
        self.minibatch_buffer = MinibatchBuffer(inqueue=self.inqueue,
                                                size=minibatch_buffer_size,
                                                timeout=learner_queue_timeout,
                                                num_passes=num_sgd_iter)
        self.queue_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.load_timer = TimerStat()
        self.load_wait_timer = TimerStat()
        self.daemon = True
        self.weights_updated = False
        self.stats = {}
        self.stopped = False
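The thread above is a standard producer/consumer setup: a bounded inqueue of train batches (samplers block when it is full), an outqueue of results, and daemon = True so the thread does not keep the process alive. A minimal, self-contained sketch of that layout; the batch contents and the sum() "training" step are placeholders, not RLlib APIs:

import queue
import threading

class MiniLearnerThread(threading.Thread):
    def __init__(self, max_inflight=16, timeout=5):
        threading.Thread.__init__(self)
        self.inqueue = queue.Queue(maxsize=max_inflight)  # bounded queue of inbound batches
        self.outqueue = queue.Queue()
        self.timeout = timeout
        self.daemon = True  # die together with the main process
        self.stopped = False

    def run(self):
        while not self.stopped:
            try:
                batch = self.inqueue.get(timeout=self.timeout)
            except queue.Empty:
                continue  # the real thread raises here; we simply retry
            self.outqueue.put(sum(batch))  # placeholder for learn_on_batch()

learner = MiniLearnerThread()
learner.start()
learner.inqueue.put([1, 2, 3])
print(learner.outqueue.get())  # 6
learner.stopped = True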
Example #3
    def __init__(self,
                 workers,
                 num_sgd_iter=1,
                 train_batch_size=1,
                 sgd_minibatch_size=0,
                 standardize_fields=frozenset([]),
                 aux_loss_every_k=16,
                 aux_loss_num_sgd_iter=9,
                 aux_loss_start_after_num_steps=0):
        PolicyOptimizer.__init__(self, workers)

        self.update_weights_timer = TimerStat()
        self.standardize_fields = standardize_fields
        self.sample_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.throughput = RunningStat()
        self.num_sgd_iter = num_sgd_iter
        self.sgd_minibatch_size = sgd_minibatch_size
        self.train_batch_size = train_batch_size
        self.learner_stats = {}
        self.policies = dict(
            self.workers.local_worker().foreach_trainable_policy(lambda p, i:
                                                                 (i, p)))
        logger.debug("Policies to train: {}".format(self.policies))

        self.aux_loss_every_k = aux_loss_every_k
        self.aux_loss_num_sgd_iter = aux_loss_num_sgd_iter
        self.aux_loss_start_after_num_steps = aux_loss_start_after_num_steps
        self.memory = []
        # Assert that train batch size is divisible by sgd minibatch size to make populating
        # policy logits simpler.
        assert train_batch_size % sgd_minibatch_size == 0, (
            f"train_batch_size: {train_batch_size}, "
            f"sgd_minibatch_size: {sgd_minibatch_size}")
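The assert requires train_batch_size to be an exact multiple of sgd_minibatch_size, so the train batch tiles into equal minibatches with nothing left over. A quick worked check with hypothetical numbers:

train_batch_size = 4000
sgd_minibatch_size = 500
assert train_batch_size % sgd_minibatch_size == 0

# The train batch splits into train_batch_size // sgd_minibatch_size equal slices.
offsets = list(range(0, train_batch_size, sgd_minibatch_size))
print(len(offsets))  # 8 minibatches per SGD pass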
Example #4
    def _init(self,
              sgd_batch_size=128,
              num_sgd_iter=10,
              train_batch_size=1024,
              num_gpus=0,
              standardize_fields=[]):
        self.batch_size = sgd_batch_size
        self.num_sgd_iter = num_sgd_iter
        self.train_batch_size = train_batch_size
        if not num_gpus:
            self.devices = ["/cpu:0"]
        else:
            self.devices = [
                "/gpu:{}".format(i) for i in range(int(math.ceil(num_gpus)))
            ]
        self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
            self.devices)
        assert self.batch_size % len(self.devices) == 0
        assert self.batch_size >= len(self.devices), "batch size too small"
        self.per_device_batch_size = int(self.batch_size / len(self.devices))
        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()
        self.standardize_fields = standardize_fields

        logger.info("LocalMultiGPUOptimizer devices {}".format(self.devices))

        self.policies = dict(
            self.local_evaluator.foreach_trainable_policy(lambda p, i: (i, p)))
        logger.debug("Policies to train: {}".format(self.policies))
        for policy_id, policy in self.policies.items():
            if not isinstance(policy, TFPolicyGraph):
                raise ValueError(
                    "Only TF policies are supported with multi-GPU. Try using "
                    "the simple optimizer instead.")

        # per-GPU graph copies created below must share vars with the policy
        # reuse is set to AUTO_REUSE because Adam nodes are created after
        # all of the device copies are created.
        self.optimizers = {}
        with self.local_evaluator.tf_sess.graph.as_default():
            with self.local_evaluator.tf_sess.as_default():
                for policy_id, policy in self.policies.items():
                    with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE):
                        if policy._state_inputs:
                            rnn_inputs = policy._state_inputs + [
                                policy._seq_lens
                            ]
                        else:
                            rnn_inputs = []
                        self.optimizers[policy_id] = (
                            LocalSyncParallelOptimizer(
                                policy._optimizer, self.devices,
                                [v
                                 for _, v in policy._loss_inputs], rnn_inputs,
                                self.per_device_batch_size, policy.copy))

                self.sess = self.local_evaluator.tf_sess
                self.sess.run(tf.global_variables_initializer())
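The batch-size arithmetic above rounds sgd_batch_size down to the nearest multiple of the device count so that every device receives the same number of samples. A worked example with hypothetical numbers:

import math

sgd_batch_size = 500
num_gpus = 3
devices = ["/gpu:{}".format(i) for i in range(int(math.ceil(num_gpus)))]

batch_size = int(sgd_batch_size / len(devices)) * len(devices)
per_device_batch_size = int(batch_size / len(devices))
print(devices)                # ['/gpu:0', '/gpu:1', '/gpu:2']
print(batch_size)             # 498, rounded down to a multiple of 3
print(per_device_batch_size)  # 166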
Example #5
    def __init__(self, capacity: int, replay_ratio: float):
        """Initializes MixInReplay instance.

        Args:
            capacity (int): Number of batches to store in total.
            replay_ratio (float): Ratio of replayed samples in the returned
                batches. E.g. a ratio of 0.0 means only return new samples
                (no replay), a ratio of 0.5 means always return newest sample
                plus one old one (1:1), a ratio of 0.66 means always return
                the newest sample plus 2 old (replayed) ones (1:2), etc...
        """
        self.capacity = capacity
        self.replay_ratio = replay_ratio
        self.replay_proportion = None
        if self.replay_ratio != 1.0:
            self.replay_proportion = self.replay_ratio / (1.0 -
                                                          self.replay_ratio)

        def new_buffer():
            return SimpleReplayBuffer(num_slots=capacity)

        self.replay_buffers = collections.defaultdict(new_buffer)

        # Metrics.
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()

        # Added timesteps over lifetime.
        self.num_added = 0

        # Last added batch(es).
        self.last_added_batches = collections.defaultdict(list)
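The replay_proportion computed above is the number of old (replayed) samples to return per new sample, derived from the requested ratio of replayed samples in the output. A few worked values; the ratio-1.0 case stays None because the formula would divide by zero:

def replay_proportion(replay_ratio):
    if replay_ratio == 1.0:
        return None  # replay only; the proportion is unbounded
    return replay_ratio / (1.0 - replay_ratio)

print(replay_proportion(0.0))   # 0.0   -> new samples only
print(replay_proportion(0.5))   # 1.0   -> one replayed sample per new one (1:1)
print(replay_proportion(0.66))  # ~1.94 -> roughly two replayed samples per new one (1:2)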
Example #6
    def _init(self):
        assert isinstance(self.local_evaluator, TFMultiGPUSupport)
        self.batch_size = self.config.get("sgd_batch_size", 128)
        gpu_ids = ray.get_gpu_ids()
        if not gpu_ids:
            self.devices = ["/cpu:0"]
        else:
            self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
        assert self.batch_size > len(self.devices), "batch size too small"
        self.per_device_batch_size = self.batch_size // len(self.devices)
        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()

        print("LocalMultiGPUOptimizer devices", self.devices)
        print("LocalMultiGPUOptimizer batch size", self.batch_size)

        # List of (feature name, feature placeholder) tuples
        self.loss_inputs = self.local_evaluator.tf_loss_inputs()

        # per-GPU graph copies created below must share vars with the policy
        tf.get_variable_scope().reuse_variables()

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config.get("sgd_stepsize",
                                                   5e-5)), self.devices,
            [ph for _, ph in self.loss_inputs], self.per_device_batch_size,
            lambda *ph: self.local_evaluator.build_tf_loss(ph),
            self.config.get("logdir", os.getcwd()))

        self.sess = self.local_evaluator.sess
        self.sess.run(tf.global_variables_initializer())
Example #7
    def _init(self,
              learning_starts=1000,
              buffer_size=10000,
              prioritized_replay=True,
              prioritized_replay_alpha=0.6,
              prioritized_replay_beta=0.4,
              prioritized_replay_eps=1e-6,
              train_batch_size=32,
              sample_batch_size=4,
              clip_rewards=True):

        self.replay_starts = learning_starts
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps
        self.train_batch_size = train_batch_size

        # Stats
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.throughput = RunningStat()

        # Set up replay buffer
        if prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                buffer_size,
                alpha=prioritized_replay_alpha,
                clip_rewards=clip_rewards)
        else:
            self.replay_buffer = ReplayBuffer(buffer_size, clip_rewards)

        assert buffer_size >= self.replay_starts
Example #8
    def _init(self, num_sgd_iter=1, timesteps_per_batch=1):
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.throughput = RunningStat()
        self.num_sgd_iter = num_sgd_iter
        self.timesteps_per_batch = timesteps_per_batch
Example #9
    def __init__(self,
                 workers,
                 expected_batch_size,
                 num_sgd_iter=1,
                 sgd_minibatch_size=0,
                 standardize_fields=frozenset([]),
                 keep_local_weights_in_sync=True,
                 backend="gloo"):
        PolicyOptimizer.__init__(self, workers)
        self.learner_stats = {}
        self.num_sgd_iter = num_sgd_iter
        self.expected_batch_size = expected_batch_size
        self.sgd_minibatch_size = sgd_minibatch_size
        self.standardize_fields = standardize_fields
        self.keep_local_weights_in_sync = keep_local_weights_in_sync
        self.sync_down_timer = TimerStat()
        self.sync_up_timer = TimerStat()
        self.learn_timer = TimerStat()

        # Setup the distributed processes.
        if not self.workers.remote_workers():
            raise ValueError("This optimizer requires >0 remote workers.")
        ip = ray.get(workers.remote_workers()[0].get_node_ip.remote())
        port = ray.get(workers.remote_workers()[0].find_free_port.remote())
        address = "tcp://{ip}:{port}".format(ip=ip, port=port)
        logger.info(
            "Creating torch process group with leader {}".format(address))

        # Get setup tasks in order to throw errors on failure.
        ray.get([
            worker.setup_torch_data_parallel.remote(
                address, i, len(workers.remote_workers()), backend)
            for i, worker in enumerate(workers.remote_workers())
        ])
        logger.info("Torch process group init completed")
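The setup_torch_data_parallel.remote(...) calls presumably have each remote worker join a torch distributed process group at the leader's TCP address. A rough sketch of what that per-worker setup might look like, assuming PyTorch's standard torch.distributed API (an illustration, not RLlib's actual implementation):

import torch.distributed as dist

def setup_torch_data_parallel(url, world_rank, world_size, backend):
    """Hypothetical per-worker setup: join the process group at `url`."""
    dist.init_process_group(
        backend=backend,   # e.g. "gloo" or "nccl"
        init_method=url,   # e.g. "tcp://10.0.0.1:12345"
        rank=world_rank,
        world_size=world_size)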
Example #10
    def _init(self,
              sgd_batch_size=128,
              sgd_stepsize=5e-5,
              num_sgd_iter=10,
              timesteps_per_batch=1024,
              standardize_fields=[]):
        self.batch_size = sgd_batch_size
        self.sgd_stepsize = sgd_stepsize
        self.num_sgd_iter = num_sgd_iter
        self.timesteps_per_batch = timesteps_per_batch
        gpu_ids = ray.get_gpu_ids()
        if not gpu_ids:
            self.devices = ["/cpu:0"]
        else:
            self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
        self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
            self.devices)
        assert self.batch_size % len(self.devices) == 0
        assert self.batch_size >= len(self.devices), "batch size too small"
        self.per_device_batch_size = int(self.batch_size / len(self.devices))
        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()
        self.standardize_fields = standardize_fields

        print("LocalMultiGPUOptimizer devices", self.devices)

        if set(self.local_evaluator.policy_map.keys()) != {"default"}:
            raise ValueError(
                "Multi-agent is not supported with multi-GPU. Try using the "
                "simple optimizer instead.")
        self.policy = self.local_evaluator.policy_map["default"]
        if not isinstance(self.policy, TFPolicyGraph):
            raise ValueError(
                "Only TF policies are supported with multi-GPU. Try using the "
                "simple optimizer instead.")

        # per-GPU graph copies created below must share vars with the policy
        # reuse is set to AUTO_REUSE because Adam nodes are created after
        # all of the device copies are created.
        with self.local_evaluator.tf_sess.graph.as_default():
            with self.local_evaluator.tf_sess.as_default():
                with tf.variable_scope("default", reuse=tf.AUTO_REUSE):
                    if self.policy._state_inputs:
                        rnn_inputs = self.policy._state_inputs + [
                            self.policy._seq_lens
                        ]
                    else:
                        rnn_inputs = []
                    self.par_opt = LocalSyncParallelOptimizer(
                        tf.train.AdamOptimizer(
                            self.sgd_stepsize), self.devices,
                        [v for _, v in self.policy.loss_inputs()], rnn_inputs,
                        self.per_device_batch_size, self.policy.copy,
                        os.getcwd())

                self.sess = self.local_evaluator.tf_sess
                self.sess.run(tf.global_variables_initializer())
Example #11
    def _init(self, grads_per_step=100):
        self.apply_timer = TimerStat()
        self.wait_timer = TimerStat()
        self.dispatch_timer = TimerStat()
        self.grads_per_step = grads_per_step
        if not self.remote_evaluators:
            raise ValueError(
                "Async optimizer requires at least 1 remote evaluator")
Example #12
    def _init(self, num_sgd_iter=1, train_batch_size=1):
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.throughput = RunningStat()
        self.num_sgd_iter = num_sgd_iter
        self.train_batch_size = train_batch_size
        self.learner_stats = {}
Example #13
    def __init__(self, local_evaluator):
        threading.Thread.__init__(self)
        self.learner_queue_size = WindowStat("size", 50)
        self.local_evaluator = local_evaluator
        self.inqueue = queue.Queue(maxsize=LEARNER_QUEUE_MAX_SIZE)
        self.outqueue = queue.Queue()
        self.queue_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.daemon = True
Example #14
    def __init__(self, workers, num_sgd_iter=1, train_batch_size=1):
        PolicyOptimizer.__init__(self, workers)

        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.throughput = RunningStat()
        self.num_sgd_iter = num_sgd_iter
        self.train_batch_size = train_batch_size
        self.learner_stats = {}
Example #15
    def __init__(self, learner, share_stats):
        threading.Thread.__init__(self)
        self.learner = learner
        self.daemon = True
        if share_stats:
            self.queue_timer = learner.queue_timer
            self.load_timer = learner.load_timer
        else:
            self.queue_timer = TimerStat()
            self.load_timer = TimerStat()
Example #16
    def __init__(self, workers, grads_per_step=100):
        PolicyOptimizer.__init__(self, workers)

        self.apply_timer = TimerStat()
        self.wait_timer = TimerStat()
        self.dispatch_timer = TimerStat()
        self.grads_per_step = grads_per_step
        self.learner_stats = {}
        if not self.workers.remote_workers():
            raise ValueError(
                "Async optimizer requires at least 1 remote worker")
Example #17
    def __init__(self, multi_gpu_learner_thread: MultiGPULearnerThread,
                 share_stats: bool):
        threading.Thread.__init__(self)
        self.multi_gpu_learner_thread = multi_gpu_learner_thread
        self.daemon = True
        if share_stats:
            self.queue_timer = multi_gpu_learner_thread.queue_timer
            self.load_timer = multi_gpu_learner_thread.load_timer
        else:
            self.queue_timer = TimerStat()
            self.load_timer = TimerStat()
Example #18
    def __init__(self, local_evaluator, remote_evaluators, grads_per_step=100):
        PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators)

        self.apply_timer = TimerStat()
        self.wait_timer = TimerStat()
        self.dispatch_timer = TimerStat()
        self.grads_per_step = grads_per_step
        self.learner_stats = {}
        if not self.remote_evaluators:
            raise ValueError(
                "Async optimizer requires at least 1 remote evaluator")
Example #19
    def __init__(self, local_worker):
        threading.Thread.__init__(self)
        self.learner_queue_size = WindowStat("size", 50)
        self.local_worker = local_worker
        self.inqueue = queue.Queue(maxsize=LEARNER_QUEUE_MAX_SIZE)
        self.outqueue = queue.Queue()
        self.queue_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.daemon = True
        self.weights_updated = False
        self.stopped = False
        self.stats = {}
Example #20
    def __init__(self,
                 num_shards,
                 learning_starts,
                 buffer_size,
                 replay_batch_size,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta=0.4,
                 prioritized_replay_eps=1e-6,
                 replay_mode="independent",
                 replay_sequence_length=1):
        self.replay_starts = learning_starts // num_shards
        self.buffer_size = buffer_size // num_shards
        self.replay_batch_size = replay_batch_size
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps
        self.replay_mode = replay_mode
        self.replay_sequence_length = replay_sequence_length

        if replay_sequence_length > 1:
            self.replay_batch_size = int(
                max(1, replay_batch_size // replay_sequence_length))
            logger.info(
                "Since replay_sequence_length={} and replay_batch_size={}, "
                "we will replay {} sequences at a time.".format(
                    replay_sequence_length, replay_batch_size,
                    self.replay_batch_size))

        if replay_mode not in ["lockstep", "independent"]:
            raise ValueError("Unsupported replay mode: {}".format(replay_mode))

        def gen_replay():
            while True:
                yield self.replay()

        ParallelIteratorWorker.__init__(self, gen_replay, False)

        def new_buffer():
            return PrioritizedReplayBuffer(self.buffer_size,
                                           alpha=prioritized_replay_alpha)

        self.replay_buffers = collections.defaultdict(new_buffer)

        # Metrics
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()
        self.num_added = 0

        # Make externally accessible for testing.
        global _local_replay_buffer
        _local_replay_buffer = self
        # If set, return this instead of the usual data for testing.
        self._fake_batch = None
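With replay_sequence_length > 1, the batch size is reinterpreted as a number of stored sequences rather than single timesteps, which is what the log message above reports. A quick worked example with hypothetical values:

replay_batch_size = 512        # requested timesteps per replay call
replay_sequence_length = 20    # timesteps per stored sequence

num_sequences = max(1, replay_batch_size // replay_sequence_length)
print(num_sequences)                           # 25 sequences replayed at a time
print(num_sequences * replay_sequence_length)  # 500 timesteps actually returned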
Example #21
    def __init__(self,
                 workers,
                 learning_starts=1000,
                 buffer_size=10000,
                 prioritized_replay=True,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta=0.4,
                 schedule_max_timesteps=100000,
                 beta_annealing_fraction=0.2,
                 final_prioritized_replay_beta=0.4,
                 prioritized_replay_eps=1e-6,
                 train_batch_size=32,
                 sample_batch_size=4,
                 before_learn_on_batch=None,
                 synchronize_sampling=False):
        PolicyOptimizer.__init__(self, workers)

        self.replay_starts = learning_starts
        # Linearly anneal beta, as in the Rainbow paper.
        self.prioritized_replay_beta = LinearSchedule(
            schedule_timesteps=int(schedule_max_timesteps *
                                   beta_annealing_fraction),
            initial_p=prioritized_replay_beta,
            final_p=final_prioritized_replay_beta)
        self.prioritized_replay_eps = prioritized_replay_eps
        self.train_batch_size = train_batch_size
        self.before_learn_on_batch = before_learn_on_batch
        self.synchronize_sampling = synchronize_sampling

        # Stats
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.learner_stats = {}

        # Set up replay buffer
        if prioritized_replay:

            def new_buffer():
                return PrioritizedReplayBuffer(buffer_size,
                                               alpha=prioritized_replay_alpha)
        else:

            def new_buffer():
                return ReplayBuffer(buffer_size)

        self.replay_buffers = collections.defaultdict(new_buffer)

        if buffer_size < self.replay_starts:
            logger.warning("buffer_size={} < replay_starts={}".format(
                buffer_size, self.replay_starts))
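The LinearSchedule anneals beta from initial_p to final_p over the first schedule_timesteps steps and then holds the final value. A minimal stand-in with the same behavior (illustrative only, not RLlib's class), annealing over the first 20% of 100k timesteps:

def linear_schedule(t, schedule_timesteps, initial_p, final_p):
    """Interpolate linearly from initial_p to final_p, then stay at final_p."""
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

schedule_timesteps = int(100000 * 0.2)
for t in (0, 10000, 20000, 50000):
    print(t, linear_schedule(t, schedule_timesteps, 0.4, 1.0))
# 0 -> 0.4, 10000 -> 0.7, 20000 -> 1.0, 50000 -> 1.0 (clamped)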
Example #22
    def _init(self, sgd_batch_size=128, sgd_stepsize=5e-5, num_sgd_iter=10,
              timesteps_per_batch=1024):
        assert isinstance(self.local_evaluator, TFMultiGPUSupport)
        self.batch_size = sgd_batch_size
        self.sgd_stepsize = sgd_stepsize
        self.num_sgd_iter = num_sgd_iter
        self.timesteps_per_batch = timesteps_per_batch
        gpu_ids = ray.get_gpu_ids()
        if not gpu_ids:
            self.devices = ["/cpu:0"]
        else:
            self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
        self.batch_size = int(
                sgd_batch_size / len(self.devices)) * len(self.devices)
        assert self.batch_size % len(self.devices) == 0
        assert self.batch_size >= len(self.devices), "batch size too small"
        self.per_device_batch_size = int(self.batch_size / len(self.devices))
        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()

        print("LocalMultiGPUOptimizer devices", self.devices)
        print("LocalMultiGPUOptimizer batch size", self.batch_size)

        # List of (feature name, feature placeholder) tuples
        self.loss_inputs = self.local_evaluator.tf_loss_inputs()

        # per-GPU graph copies created below must share vars with the policy
        main_thread_scope = tf.get_variable_scope()
        # reuse is set to AUTO_REUSE because Adam nodes are created after
        # all of the device copies are created.
        with tf.variable_scope(main_thread_scope, reuse=tf.AUTO_REUSE):
            self.par_opt = LocalSyncParallelOptimizer(
                tf.train.AdamOptimizer(self.sgd_stepsize),
                self.devices,
                [ph for _, ph in self.loss_inputs],
                self.per_device_batch_size,
                lambda *ph: self.local_evaluator.build_tf_loss(ph),
                os.getcwd())

        # TODO(rliaw): Find more elegant solution for this
        if hasattr(self.local_evaluator, "init_extra_ops"):
            self.local_evaluator.init_extra_ops(
                self.par_opt.get_device_losses())

        self.sess = self.local_evaluator.sess
        self.sess.run(tf.global_variables_initializer())
Example #23
    def _init(self, train_batch_size=512, sample_batch_size=50, debug=False):

        self.debug = debug
        self.learning_started = False
        self.train_batch_size = train_batch_size

        self.learner = LearnerThread(self.local_evaluator)
        self.learner.start()

        assert len(self.remote_evaluators) > 0

        # Stats
        self.timers = {
            k: TimerStat()
            for k in
            ["put_weights", "enqueue", "sample_processing", "train", "sample"]
        }
        self.num_weight_syncs = 0
        self.learning_started = False

        # Kick off async background sampling
        self.sample_tasks = TaskPool()
        weights = self.local_evaluator.get_weights()
        for ev in self.remote_evaluators:
            ev.set_weights.remote(weights)
            for _ in range(SAMPLE_QUEUE_DEPTH):
                self.sample_tasks.add(ev, ev.sample.remote())

        self.batch_buffer = []
Example #24
    def __init__(self, num_shards, learning_starts, buffer_size,
                 train_batch_size, prioritized_replay_alpha,
                 prioritized_replay_beta, prioritized_replay_eps):
        self.replay_starts = learning_starts // num_shards
        self.buffer_size = buffer_size // num_shards
        self.train_batch_size = train_batch_size
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps

        self.replay_buffer = PrioritizedReplayBuffer(
            self.buffer_size, alpha=prioritized_replay_alpha)

        # Metrics
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()
Example #25
    def _init(self,
              learning_starts=1000,
              buffer_size=10000,
              prioritized_replay=True,
              prioritized_replay_alpha=0.6,
              prioritized_replay_beta=0.4,
              prioritized_replay_eps=1e-6,
              train_batch_size=512,
              sample_batch_size=50,
              num_replay_buffer_shards=1,
              max_weight_sync_delay=400,
              clip_rewards=True,
              debug=False):

        self.debug = debug
        self.replay_starts = learning_starts
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps
        self.train_batch_size = train_batch_size
        self.sample_batch_size = sample_batch_size
        self.max_weight_sync_delay = max_weight_sync_delay

        self.learner = LearnerThread(self.local_evaluator)
        self.learner.start()

        self.replay_actors = create_colocated(ReplayActor, [
            num_replay_buffer_shards, learning_starts, buffer_size,
            train_batch_size, prioritized_replay_alpha,
            prioritized_replay_beta, prioritized_replay_eps, clip_rewards
        ], num_replay_buffer_shards)
        assert len(self.remote_evaluators) > 0

        # Stats
        self.timers = {
            k: TimerStat()
            for k in [
                "put_weights", "get_samples", "enqueue", "sample_processing",
                "replay_processing", "update_priorities", "train", "sample"
            ]
        }
        self.num_weight_syncs = 0
        self.learning_started = False

        # Number of worker steps since the last weight update
        self.steps_since_update = {}

        # Kick off replay tasks for local gradient updates
        self.replay_tasks = TaskPool()
        for ra in self.replay_actors:
            for _ in range(REPLAY_QUEUE_DEPTH):
                self.replay_tasks.add(ra, ra.replay.remote())

        # Kick off async background sampling
        self.sample_tasks = TaskPool()
        weights = self.local_evaluator.get_weights()
        for ev in self.remote_evaluators:
            ev.set_weights.remote(weights)
            self.steps_since_update[ev] = 0
            for _ in range(SAMPLE_QUEUE_DEPTH):
                self.sample_tasks.add(ev, ev.sample.remote())
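The steps_since_update / max_weight_sync_delay pair throttles weight syncing: a worker only receives fresh weights after it has contributed at least max_weight_sync_delay steps since its last sync. A schematic sketch of that bookkeeping in plain Python (no Ray actor calls):

max_weight_sync_delay = 400
steps_since_update = {"worker_0": 0}
num_weight_syncs = 0

def on_sample_returned(worker, step_count):
    """Called each time a worker's sample batch has been processed."""
    global num_weight_syncs
    steps_since_update[worker] += step_count
    if steps_since_update[worker] >= max_weight_sync_delay:
        # This is where the optimizer would issue set_weights.remote(...).
        steps_since_update[worker] = 0
        num_weight_syncs += 1

for _ in range(10):
    on_sample_returned("worker_0", 50)
print(num_weight_syncs)  # 1 -> one sync after the first 400 steps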
Example #26
    def _init(self, sgd_batch_size=128, sgd_stepsize=5e-5, num_sgd_iter=10,
              timesteps_per_batch=1024):
        self.batch_size = sgd_batch_size
        self.sgd_stepsize = sgd_stepsize
        self.num_sgd_iter = num_sgd_iter
        self.timesteps_per_batch = timesteps_per_batch
        gpu_ids = ray.get_gpu_ids()
        if not gpu_ids:
            self.devices = ["/cpu:0"]
        else:
            self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
        self.batch_size = int(
                sgd_batch_size / len(self.devices)) * len(self.devices)
        assert self.batch_size % len(self.devices) == 0
        assert self.batch_size >= len(self.devices), "batch size too small"
        self.per_device_batch_size = int(self.batch_size / len(self.devices))
        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()

        print("LocalMultiGPUOptimizer devices", self.devices)
        print("LocalMultiGPUOptimizer batch size", self.batch_size)

        assert set(self.local_evaluator.policy_map.keys()) == {"default"}, \
            "Multi-agent is not supported"
        self.policy = self.local_evaluator.policy_map["default"]
        assert isinstance(self.policy, TFPolicyGraph), \
            "Only TF policies are supported"

        # per-GPU graph copies created below must share vars with the policy
        # reuse is set to AUTO_REUSE because Adam nodes are created after
        # all of the device copies are created.
        with self.local_evaluator.tf_sess.graph.as_default():
            with self.local_evaluator.tf_sess.as_default():
                main_scope = tf.get_variable_scope()
                with tf.variable_scope(main_scope, reuse=tf.AUTO_REUSE):
                    self.par_opt = LocalSyncParallelOptimizer(
                        tf.train.AdamOptimizer(self.sgd_stepsize),
                        self.devices,
                        self.policy.loss_inputs(),
                        self.per_device_batch_size,
                        self.policy.copy,
                        os.getcwd())

                self.sess = self.local_evaluator.tf_sess
                self.sess.run(tf.global_variables_initializer())
Example #27
    def __init__(self, local_evaluator, minibatch_buffer_size, num_sgd_iter):
        threading.Thread.__init__(self)
        self.learner_queue_size = WindowStat("size", 50)
        self.local_evaluator = local_evaluator
        self.inqueue = queue.Queue(maxsize=LEARNER_QUEUE_MAX_SIZE)
        self.outqueue = queue.Queue()
        self.minibatch_buffer = MinibatchBuffer(self.inqueue,
                                                minibatch_buffer_size,
                                                num_sgd_iter)
        self.queue_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.load_timer = TimerStat()
        self.load_wait_timer = TimerStat()
        self.daemon = True
        self.weights_updated = False
        self.stats = {}
        self.stopped = False
Example #28
    def __init__(
        self,
        capacity: int,
        replay_ratio: float,
        replay_mode: ReplayMode = ReplayMode.INDEPENDENT,
    ):
        """Initializes MixInReplay instance.

        Args:
            capacity: Number of batches to store in total.
            replay_ratio: Ratio of replayed samples in the returned
                batches. E.g. a ratio of 0.0 means only return new samples
                (no replay), a ratio of 0.5 means always return newest sample
                plus one old one (1:1), a ratio of 0.66 means always return
                the newest sample plus 2 old (replayed) ones (1:2), etc...
        """
        self.capacity = capacity
        self.replay_ratio = replay_ratio
        self.replay_proportion = None
        if self.replay_ratio != 1.0:
            self.replay_proportion = self.replay_ratio / (1.0 -
                                                          self.replay_ratio)

        if replay_mode in ["lockstep", ReplayMode.LOCKSTEP]:
            self.replay_mode = ReplayMode.LOCKSTEP
        elif replay_mode in ["independent", ReplayMode.INDEPENDENT]:
            self.replay_mode = ReplayMode.INDEPENDENT
        else:
            raise ValueError("Unsupported replay mode: {}".format(replay_mode))

        def new_buffer():
            return SimpleReplayBuffer(num_slots=capacity)

        self.replay_buffers = collections.defaultdict(new_buffer)

        # Metrics.
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()

        # Added timesteps over lifetime.
        self.num_added = 0

        # Last added batch(es).
        self.last_added_batches = collections.defaultdict(list)
Example #29
    def _init(self,
              learning_starts=1000,
              buffer_size=10000,
              prioritized_replay=True,
              prioritized_replay_alpha=0.6,
              prioritized_replay_beta=0.4,
              schedule_max_timesteps=100000,
              beta_annealing_fraction=0.2,
              final_prioritized_replay_beta=0.4,
              prioritized_replay_eps=1e-6,
              train_batch_size=32,
              sample_batch_size=4):

        self.replay_starts = learning_starts
        # Linearly anneal beta, as in the Rainbow paper.
        self.prioritized_replay_beta = LinearSchedule(
            schedule_timesteps=int(schedule_max_timesteps *
                                   beta_annealing_fraction),
            initial_p=prioritized_replay_beta,
            final_p=final_prioritized_replay_beta)
        self.prioritized_replay_eps = prioritized_replay_eps
        self.train_batch_size = train_batch_size

        # Stats
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.throughput = RunningStat()
        self.learner_stats = {}

        # Set up replay buffer
        if prioritized_replay:

            def new_buffer():
                return PrioritizedReplayBuffer(buffer_size,
                                               alpha=prioritized_replay_alpha)
        else:

            def new_buffer():
                return ReplayBuffer(buffer_size)

        self.replay_buffers = collections.defaultdict(new_buffer)

        assert buffer_size >= self.replay_starts
Example #30
    def _init(self,
              learning_starts=1000,
              buffer_size=10000,
              train_batch_size=32):
        self.replay_starts = learning_starts
        self.max_buffer_size = buffer_size
        self.train_batch_size = train_batch_size
        assert self.max_buffer_size >= self.replay_starts

        # List of buffered sample batches
        self.replay_buffer = []
        self.buffer_size = 0

        # Stats
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.learner_stats = {}
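This simplest variant accumulates sample batches in a plain Python list; training is gated on having at least learning_starts buffered timesteps, and max_buffer_size caps how much is kept (the eviction itself happens elsewhere in the optimizer). A schematic sketch of that gating plus a straightforward oldest-first eviction, with plain counts standing in for SampleBatch objects:

learning_starts = 1000
max_buffer_size = 10000

replay_buffer = []   # list of (batch, count) pairs
buffer_size = 0      # total timesteps currently buffered

def add_batch(batch, count):
    global buffer_size
    replay_buffer.append((batch, count))
    buffer_size += count
    # Evict the oldest batches once over capacity.
    while buffer_size > max_buffer_size:
        _, old_count = replay_buffer.pop(0)
        buffer_size -= old_count

def can_learn():
    return buffer_size >= learning_starts

add_batch("batch_a", 600)
print(can_learn())  # False
add_batch("batch_b", 600)
print(can_learn())  # True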