def __init__(self,
             workers,
             num_sgd_iter=1,
             train_batch_size=1,
             sgd_minibatch_size=0,
             standardize_fields=frozenset([]),
             keep_local_weights_in_sync=True,
             backend="gloo"):
    PolicyOptimizer.__init__(self, workers)

    self.learner_stats = {}
    self.num_sgd_iter = num_sgd_iter
    self.train_batch_size = train_batch_size
    self.sgd_minibatch_size = sgd_minibatch_size
    self.standardize_fields = standardize_fields
    self.keep_local_weights_in_sync = keep_local_weights_in_sync
    self.update_weights_timer = TimerStat()
    self.learn_timer = TimerStat()

    # Setup the distributed processes.
    if not self.workers.remote_workers():
        raise ValueError("This optimizer requires >0 remote workers.")
    ip = ray.get(workers.remote_workers()[0].get_node_ip.remote())
    port = ray.get(workers.remote_workers()[0].find_free_port.remote())
    address = "tcp://{ip}:{port}".format(ip=ip, port=port)
    logger.info(
        "Creating torch process group with leader {}".format(address))

    # Get setup tasks in order to throw errors on failure.
    ray.get([
        worker.setup_torch_data_parallel.remote(
            address, i, len(workers.remote_workers()), backend)
        for i, worker in enumerate(workers.remote_workers())
    ])
    logger.info("Torch process group init completed")
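# A minimal sketch (an assumption, not the actual worker implementation)
# of what the `setup_torch_data_parallel` remote call above would do on
# each worker: join the torch process group at the leader's address with
# the given rank, world size, and backend.
import torch.distributed

def setup_torch_data_parallel_sketch(url, world_rank, world_size, backend):
    """Join a torch process group for distributed SGD (illustrative)."""
    torch.distributed.init_process_group(
        backend=backend,    # e.g. "gloo" or "nccl"
        init_method=url,    # the leader's "tcp://<ip>:<port>" address
        rank=world_rank,
        world_size=world_size)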
def __init__(self, workers, learning_starts=1000, buffer_size=10000, train_batch_size=32): """Initialize a batch replay optimizer. Arguments: workers (WorkerSet): set of all workers learning_starts (int): start learning after this number of timesteps have been collected buffer_size (int): max timesteps to keep in the replay buffer train_batch_size (int): number of timesteps to train on at once """ PolicyOptimizer.__init__(self, workers) self.replay_starts = learning_starts self.max_buffer_size = buffer_size self.train_batch_size = train_batch_size assert self.max_buffer_size >= self.replay_starts # List of buffered sample batches self.replay_buffer = [] self.buffer_size = 0 # Stats self.update_weights_timer = TimerStat() self.sample_timer = TimerStat() self.grad_timer = TimerStat() self.learner_stats = {}
def add_batch(self, batch):
    PolicyOptimizer._check_not_multiagent(batch)
    with self.add_batch_timer:
        for row in batch.rows():
            self.replay_buffer.add(row["obs"], row["actions"],
                                   row["rewards"], row["new_obs"],
                                   row["dones"], row["weights"])
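# For illustration only: `batch.rows()` above is assumed to yield one
# dict per timestep, keyed by the fields the replay buffer stores. A
# hypothetical row might look like:
example_row = {
    "obs": [0.1, 0.2],      # observation at time t
    "actions": 1,           # action taken
    "rewards": 0.5,         # reward received
    "new_obs": [0.2, 0.3],  # observation at time t+1
    "dones": False,         # episode-termination flag
    "weights": 1.0,         # importance-sampling weight
}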
def __init__(self,
             workers,
             num_sgd_iter=1,
             train_batch_size=1,
             sgd_minibatch_size=0,
             standardize_fields=frozenset([]),
             aux_loss_every_k=16,
             aux_loss_num_sgd_iter=9,
             aux_loss_start_after_num_steps=0):
    PolicyOptimizer.__init__(self, workers)

    self.update_weights_timer = TimerStat()
    self.standardize_fields = standardize_fields
    self.sample_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.throughput = RunningStat()
    self.num_sgd_iter = num_sgd_iter
    self.sgd_minibatch_size = sgd_minibatch_size
    self.train_batch_size = train_batch_size
    self.learner_stats = {}
    self.policies = dict(self.workers.local_worker()
                         .foreach_trainable_policy(lambda p, i: (i, p)))
    logger.debug("Policies to train: {}".format(self.policies))

    self.aux_loss_every_k = aux_loss_every_k
    self.aux_loss_num_sgd_iter = aux_loss_num_sgd_iter
    self.aux_loss_start_after_num_steps = aux_loss_start_after_num_steps
    self.memory = []

    # Assert that the train batch size is divisible by the SGD minibatch
    # size, to make populating the policy logits simpler.
    assert train_batch_size % sgd_minibatch_size == 0, (
        f"train_batch_size: {train_batch_size}, "
        f"sgd_minibatch_size: {sgd_minibatch_size}")
def __init__(self, workers, num_sgd_iter=1, train_batch_size=1):
    PolicyOptimizer.__init__(self, workers)

    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.throughput = RunningStat()
    self.num_sgd_iter = num_sgd_iter
    self.train_batch_size = train_batch_size
    self.learner_stats = {}
def __init__(self, local_evaluator, remote_evaluators, grads_per_step=100):
    PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators)

    self.apply_timer = TimerStat()
    self.wait_timer = TimerStat()
    self.dispatch_timer = TimerStat()
    self.grads_per_step = grads_per_step
    self.learner_stats = {}
    if not self.remote_evaluators:
        raise ValueError(
            "Async optimizer requires at least 1 remote evaluator")
def __init__(self, workers, grads_per_step=100):
    PolicyOptimizer.__init__(self, workers)

    self.apply_timer = TimerStat()
    self.wait_timer = TimerStat()
    self.dispatch_timer = TimerStat()
    self.grads_per_step = grads_per_step
    self.learner_stats = {}
    if not self.workers.remote_workers():
        raise ValueError(
            "Async optimizer requires at least 1 remote worker")
def __init__(self,
             workers,
             learning_starts=1000,
             buffer_size=10000,
             prioritized_replay=True,
             prioritized_replay_alpha=0.6,
             prioritized_replay_beta=0.4,
             schedule_max_timesteps=100000,
             beta_annealing_fraction=0.2,
             final_prioritized_replay_beta=0.4,
             prioritized_replay_eps=1e-6,
             train_batch_size=32,
             sample_batch_size=4,
             before_learn_on_batch=None,
             synchronize_sampling=False):
    PolicyOptimizer.__init__(self, workers)

    self.replay_starts = learning_starts
    # Linearly annealed beta, as used in the Rainbow paper.
    self.prioritized_replay_beta = LinearSchedule(
        schedule_timesteps=int(schedule_max_timesteps *
                               beta_annealing_fraction),
        initial_p=prioritized_replay_beta,
        final_p=final_prioritized_replay_beta)
    self.prioritized_replay_eps = prioritized_replay_eps
    self.train_batch_size = train_batch_size
    self.before_learn_on_batch = before_learn_on_batch
    self.synchronize_sampling = synchronize_sampling

    # Stats
    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.replay_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.learner_stats = {}

    # Set up replay buffer
    if prioritized_replay:

        def new_buffer():
            return PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
    else:

        def new_buffer():
            return ReplayBuffer(buffer_size)

    self.replay_buffers = collections.defaultdict(new_buffer)

    if buffer_size < self.replay_starts:
        logger.warning("buffer_size={} < replay_starts={}".format(
            buffer_size, self.replay_starts))
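# A minimal sketch of the linear annealing performed by `LinearSchedule`
# above (names follow the call site; this is an assumption about its
# behavior, not the library source): beta is interpolated from
# `initial_p` to `final_p` over `schedule_timesteps` steps, then held.
def linear_schedule_value(t, schedule_timesteps, initial_p, final_p):
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# E.g. with schedule_timesteps=20000, initial_p=0.4, final_p=1.0, beta
# ramps from 0.4 at t=0 to 1.0 at t>=20000.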
def stats(self):
    timing = {
        "{}_time_ms".format(k): round(1000 * self.timers[k].mean, 3)
        for k in self.timers
    }
    timing["learner_grad_time_ms"] = round(
        1000 * self.learner.grad_timer.mean, 3)
    timing["learner_load_time_ms"] = round(
        1000 * self.learner.load_timer.mean, 3)
    timing["learner_load_wait_time_ms"] = round(
        1000 * self.learner.load_wait_timer.mean, 3)
    timing["learner_dequeue_time_ms"] = round(
        1000 * self.learner.queue_timer.mean, 3)
    stats = {
        "sample_throughput": round(self.timers["sample"].mean_throughput,
                                   3),
        "train_throughput": round(self.timers["train"].mean_throughput, 3),
        "num_weight_syncs": self.num_weight_syncs,
        "num_steps_replayed": self.num_replayed,
        "timing_breakdown": timing,
        "learner_queue": self.learner.learner_queue_size.stats(),
    }
    if self.learner.stats:
        stats["learner"] = self.learner.stats
    return dict(PolicyOptimizer.stats(self), **stats)
def stats(self):
    replay_stats = ray.get(self.replay_actors[0].stats.remote(self.debug))
    timing = {
        "{}_time_ms".format(k): round(1000 * self.timers[k].mean, 3)
        for k in self.timers
    }
    timing["learner_grad_time_ms"] = round(
        1000 * self.learner.grad_timer.mean, 3)
    timing["learner_dequeue_time_ms"] = round(
        1000 * self.learner.queue_timer.mean, 3)
    stats = {
        "sample_throughput": round(self.timers["sample"].mean_throughput,
                                   3),
        "train_throughput": round(self.timers["train"].mean_throughput, 3),
        "num_weight_syncs": self.num_weight_syncs,
        "num_samples_dropped": self.num_samples_dropped,
        "learner_queue": self.learner.learner_queue_size.stats(),
        "replay_shard_0": replay_stats,
    }
    debug_stats = {
        "timing_breakdown": timing,
        "pending_sample_tasks": self.sample_tasks.count,
        "pending_replay_tasks": self.replay_tasks.count,
    }
    if self.debug:
        stats.update(debug_stats)
    if self.learner.stats:
        stats["learner"] = self.learner.stats
    return dict(PolicyOptimizer.stats(self), **stats)
def stats(self): return dict(PolicyOptimizer.stats(self), **{ "sample_time_ms": round(1000 * self.sample_timer.mean, 3), "load_time_ms": round(1000 * self.load_timer.mean, 3), "grad_time_ms": round(1000 * self.grad_timer.mean, 3), "update_time_ms": round(1000 * self.update_weights_timer.mean, 3), })
def stats(self):
    replay_stats = ray_get_and_free(self.replay_actors[0].stats.remote(
        self.debug))
    timing = {
        "{}_time_ms".format(k): round(1000 * self.timers[k].mean, 3)
        for k in self.timers
    }
    timing["learner_grad_time_ms"] = round(
        1000 * self.learner.grad_timer.mean, 3)
    timing["learner_dequeue_time_ms"] = round(
        1000 * self.learner.queue_timer.mean, 3)
    stats = {
        "sample_throughput": round(self.timers["sample"].mean_throughput,
                                   3),
        "train_throughput": round(self.timers["train"].mean_throughput, 3),
        "num_weight_syncs": self.num_weight_syncs,
        "num_samples_dropped": self.num_samples_dropped,
        "learner_queue": self.learner.learner_queue_size.stats(),
        "replay_shard_0": replay_stats,
    }
    debug_stats = {
        "timing_breakdown": timing,
        "pending_sample_tasks": self.sample_tasks.count,
        "pending_replay_tasks": self.replay_tasks.count,
    }
    if self.debug:
        stats.update(debug_stats)
    if self.learner.stats:
        stats["learner"] = self.learner.stats
    return dict(PolicyOptimizer.stats(self), **stats)
def stats(self):
    timing = {
        "{}_time_ms".format(k): round(1000 * self.timers[k].mean, 3)
        for k in self.timers
    }
    timing["learner_grad_time_ms"] = round(
        1000 * self.learner.grad_timer.mean, 3)
    timing["learner_load_time_ms"] = round(
        1000 * self.learner.load_timer.mean, 3)
    timing["learner_load_wait_time_ms"] = round(
        1000 * self.learner.load_wait_timer.mean, 3)
    timing["learner_dequeue_time_ms"] = round(
        1000 * self.learner.queue_timer.mean, 3)
    stats = {
        "sample_throughput": round(self.timers["sample"].mean_throughput,
                                   3),
        "train_throughput": round(self.timers["train"].mean_throughput, 3),
        "num_weight_syncs": self.num_weight_syncs,
        "num_steps_replayed": self.num_replayed,
        "timing_breakdown": timing,
        "learner_queue": self.learner.learner_queue_size.stats(),
    }
    if self.learner.stats:
        stats["learner"] = self.learner.stats
    return dict(PolicyOptimizer.stats(self), **stats)
def stats(self): return dict( PolicyOptimizer.stats(self), **{ "wait_time_ms": round(1000 * self.wait_timer.mean, 3), "apply_time_ms": round(1000 * self.apply_timer.mean, 3), "dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3), })
def stats(self): return dict( PolicyOptimizer.stats(self), **{ "update_weights_time_ms": round( 1000 * self.update_weights_timer.mean, 3), "learn_time_ms": round(1000 * self.learn_timer.mean, 3), "learner": self.learner_stats, })
def stats(self): return dict(PolicyOptimizer.stats(self), **{ "sample_time_ms": round(1000 * self.sample_timer.mean, 3), "grad_time_ms": round(1000 * self.grad_timer.mean, 3), "update_time_ms": round(1000 * self.update_weights_timer.mean, 3), "opt_peak_throughput": round(self.grad_timer.mean_throughput, 3), "opt_samples": round(self.grad_timer.mean_units_processed, 3), })
def stats(self): return dict( PolicyOptimizer.stats(self), **{ "sample_time_ms": round(1000 * self.sample_timer.mean, 3), "load_time_ms": round(1000 * self.load_timer.mean, 3), "grad_time_ms": round(1000 * self.grad_timer.mean, 3), "update_time_ms": round(1000 * self.update_weights_timer.mean, 3), })
def __init__(self, workers, grads_per_step=100): """Initialize an async gradients optimizer. Arguments: grads_per_step (int): The number of gradients to collect and apply per each call to step(). This number should be sufficiently high to amortize the overhead of calling step(). """ PolicyOptimizer.__init__(self, workers) self.apply_timer = TimerStat() self.wait_timer = TimerStat() self.dispatch_timer = TimerStat() self.grads_per_step = grads_per_step self.learner_stats = {} if not self.workers.remote_workers(): raise ValueError( "Async optimizer requires at least 1 remote workers")
def stats(self): return dict( PolicyOptimizer.stats(self), **{ "sync_weights_up_time": round(1000 * self.sync_up_timer.mean, 3), "sync_weights_down_time": round( 1000 * self.sync_down_timer.mean, 3), "learn_time_ms": round(1000 * self.learn_timer.mean, 3), "learner": self.learner_stats, })
def __init__(self, workers, train_batch_size=10000, microbatch_size=1000):
    PolicyOptimizer.__init__(self, workers)

    if train_batch_size <= microbatch_size:
        raise ValueError(
            "The microbatch size must be smaller than the train batch "
            "size, got {} vs {}".format(microbatch_size, train_batch_size))

    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.throughput = RunningStat()
    self.train_batch_size = train_batch_size
    self.microbatch_size = microbatch_size
    self.learner_stats = {}
    self.policies = dict(self.workers.local_worker()
                         .foreach_trainable_policy(lambda p, i: (i, p)))
    logger.debug("Policies to train: {}".format(self.policies))
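# A minimal sketch of the microbatching idea behind the optimizer above
# (an illustration of the technique with a hypothetical policy
# interface, not the class's actual step()): accumulate gradients over
# several microbatches and apply them once roughly `train_batch_size`
# samples have been processed.
def microbatch_step_sketch(policy, sample_fn, train_batch_size,
                           microbatch_size):
    accumulated = None
    samples_so_far = 0
    while samples_so_far < train_batch_size:
        batch = sample_fn(microbatch_size)      # collect one microbatch
        grad = policy.compute_gradients(batch)  # grads only, no update
        if accumulated is None:
            accumulated = grad
        else:
            accumulated = [a + g for a, g in zip(accumulated, grad)]
        samples_so_far += microbatch_size
    policy.apply_gradients(accumulated)         # single weight update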
def __init__(self,
             workers,
             num_sgd_iter=1,
             train_batch_size=1,
             sgd_minibatch_size=0,
             standardize_fields=frozenset([])):
    PolicyOptimizer.__init__(self, workers)

    self.update_weights_timer = TimerStat()
    self.standardize_fields = standardize_fields
    self.sample_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.throughput = RunningStat()
    self.num_sgd_iter = num_sgd_iter
    self.sgd_minibatch_size = sgd_minibatch_size
    self.train_batch_size = train_batch_size
    self.learner_stats = {}
    self.policies = dict(self.workers.local_worker()
                         .foreach_trainable_policy(lambda p, i: (i, p)))
    logger.debug("Policies to train: {}".format(self.policies))
def __init__(self,
             workers,
             learning_starts=1000,
             buffer_size=10000,
             train_batch_size=32):
    PolicyOptimizer.__init__(self, workers)

    self.replay_starts = learning_starts
    self.max_buffer_size = buffer_size
    self.train_batch_size = train_batch_size
    assert self.max_buffer_size >= self.replay_starts

    # List of buffered sample batches
    self.replay_buffer = []
    self.buffer_size = 0

    # Stats
    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.learner_stats = {}
def stats(self): return dict( PolicyOptimizer.stats(self), **{ "sample_time_ms": round(1000 * self.sample_timer.mean, 3), "grad_time_ms": round(1000 * self.grad_timer.mean, 3), "update_time_ms": round(1000 * self.update_weights_timer.mean, 3), "opt_peak_throughput": round(self.grad_timer.mean_throughput, 3), "opt_samples": round(self.grad_timer.mean_units_processed, 3), "learner": self.learner_stats, })
def stats(self):
    def timer_to_ms(timer):
        return round(1000 * timer.mean, 3)

    stats = self.aggregator.stats()
    stats.update(self.get_mean_stats_and_reset())
    stats["timing_breakdown"] = {
        "optimizer_step_time_ms": timer_to_ms(self._optimizer_step_timer),
        "learner_grad_time_ms": timer_to_ms(self.learner.grad_timer),
        "learner_load_time_ms": timer_to_ms(self.learner.load_timer),
        "learner_load_wait_time_ms": timer_to_ms(
            self.learner.load_wait_timer),
        "learner_dequeue_time_ms": timer_to_ms(self.learner.queue_timer),
    }
    stats["learner_queue"] = self.learner.learner_queue_size.stats()
    if self.learner.stats:
        stats["learner"] = self.learner.stats
    return dict(PolicyOptimizer.stats(self), **stats)
def stats(self):
    def timer_to_ms(timer):
        return round(1000 * timer.mean, 3)

    stats_list = []
    learner_info = {}
    for ws_id in self.aggregator_set.keys():
        aggregator = self.aggregator_set[ws_id]
        learner = self.learner_set[ws_id]

        stats = aggregator.stats()
        stats.update(self.get_mean_stats_and_reset())
        stats["timing_breakdown"] = {
            "optimizer_step_time_ms": timer_to_ms(
                self._optimizer_step_timer),
            "learner_grad_time_ms": timer_to_ms(learner.grad_timer),
            "learner_load_time_ms": timer_to_ms(learner.load_timer),
            "learner_load_wait_time_ms": timer_to_ms(
                learner.load_wait_timer),
            "learner_dequeue_time_ms": timer_to_ms(learner.queue_timer),
        }
        stats["learner_queue"] = learner.learner_queue_size.stats()
        if learner.stats:
            learner_info["policy{}".format(ws_id)] = learner.stats
            if not self.sync_sampling:
                learner_info["policy{}".format(ws_id)]["train_timesteps"] \
                    = int(learner.stats["train_timesteps"] //
                          learner.num_sgd_iter)
            learner_info["policy{}".format(ws_id)]["sample_timesteps"] = \
                stats["sample_timesteps"]
            learner_info["policy{}".format(ws_id)]["training_iteration"] = \
                int(stats["sample_timesteps"] // self.train_batch_size)
        stats.pop("sample_timesteps")
        stats_list.append(stats)

    ret_stat = wrap_dict_list(stats_list)
    ret_stat["learner"] = learner_info
    original_stat = PolicyOptimizer.stats(self)
    original_stat.update(ret_stat)
    return original_stat
def stats(self):
    def timer_to_ms(timer):
        return round(1000 * timer.mean, 3)

    timing = {
        "optimizer_step_time_ms": timer_to_ms(self._optimizer_step_timer),
        "learner_grad_time_ms": timer_to_ms(self.learner.grad_timer),
        "learner_load_time_ms": timer_to_ms(self.learner.load_timer),
        "learner_load_wait_time_ms": timer_to_ms(
            self.learner.load_wait_timer),
        "learner_dequeue_time_ms": timer_to_ms(self.learner.queue_timer),
    }
    stats = dict(
        {
            "num_weight_syncs": self.num_weight_syncs,
            "num_steps_replayed": self.num_replayed,
            "timing_breakdown": timing,
            "learner_queue": self.learner.learner_queue_size.stats(),
        }, **self.get_mean_stats_and_reset())
    self._last_stats_val.clear()
    if self.learner.stats:
        stats["learner"] = self.learner.stats
    return dict(PolicyOptimizer.stats(self), **stats)
def stats(self):
    def timer_to_ms(timer):
        return round(1000 * timer.mean, 3)

    timing = {
        "optimizer_step_time_ms": timer_to_ms(self._optimizer_step_timer),
        "learner_grad_time_ms": timer_to_ms(self.learner.grad_timer),
        "learner_load_time_ms": timer_to_ms(self.learner.load_timer),
        "learner_load_wait_time_ms": timer_to_ms(
            self.learner.load_wait_timer),
        "learner_dequeue_time_ms": timer_to_ms(self.learner.queue_timer),
    }
    stats = dict(
        {
            "num_weight_syncs": self.num_weight_syncs,
            "num_steps_replayed": self.num_replayed,
            "timing_breakdown": timing,
            "learner_queue": self.learner.learner_queue_size.stats(),
        }, **self.get_mean_stats_and_reset())
    self._last_stats_val.clear()
    if self.learner.stats:
        stats["learner"] = self.learner.stats
    return dict(PolicyOptimizer.stats(self), **stats)
def __init__(self,
             workers,
             learning_starts=1000,
             buffer_size=10000,
             prioritized_replay=True,
             prioritized_replay_alpha=0.6,
             prioritized_replay_beta=0.4,
             prioritized_replay_eps=1e-6,
             train_batch_size=512,
             rollout_fragment_length=50,
             num_replay_buffer_shards=1,
             max_weight_sync_delay=400,
             debug=False,
             batch_replay=False):
    """Initialize an async replay optimizer.

    Arguments:
        workers (WorkerSet): all workers
        learning_starts (int): wait until this many steps have been
            sampled before starting optimization.
        buffer_size (int): max size of the replay buffer
        prioritized_replay (bool): whether to enable prioritized replay
        prioritized_replay_alpha (float): replay alpha hyperparameter
        prioritized_replay_beta (float): replay beta hyperparameter
        prioritized_replay_eps (float): replay eps hyperparameter
        train_batch_size (int): size of batches to learn on
        rollout_fragment_length (int): size of batches to sample from
            workers.
        num_replay_buffer_shards (int): number of actors to use to store
            replay samples
        max_weight_sync_delay (int): update the weights of a rollout
            worker after collecting this number of timesteps from it
        debug (bool): return extra debug stats
        batch_replay (bool): replay entire sequential batches of
            experiences instead of sampling steps individually
    """
    PolicyOptimizer.__init__(self, workers)

    self.debug = debug
    self.batch_replay = batch_replay
    self.replay_starts = learning_starts
    self.prioritized_replay_beta = prioritized_replay_beta
    self.prioritized_replay_eps = prioritized_replay_eps
    self.max_weight_sync_delay = max_weight_sync_delay

    self.learner = LearnerThread(self.workers.local_worker())
    self.learner.start()

    if self.batch_replay:
        replay_cls = BatchReplayActor
    else:
        replay_cls = ReplayActor
    self.replay_actors = create_colocated(replay_cls, [
        num_replay_buffer_shards,
        learning_starts,
        buffer_size,
        train_batch_size,
        prioritized_replay_alpha,
        prioritized_replay_beta,
        prioritized_replay_eps,
    ], num_replay_buffer_shards)

    # Stats
    self.timers = {
        k: TimerStat()
        for k in [
            "put_weights", "get_samples", "sample_processing",
            "replay_processing", "update_priorities", "train", "sample"
        ]
    }
    self.num_weight_syncs = 0
    self.num_samples_dropped = 0
    self.learning_started = False

    # Number of worker steps since the last weight update
    self.steps_since_update = {}

    # Kick off replay tasks for local gradient updates
    self.replay_tasks = TaskPool()
    for ra in self.replay_actors:
        for _ in range(REPLAY_QUEUE_DEPTH):
            self.replay_tasks.add(ra, ra.replay.remote())

    # Kick off async background sampling
    self.sample_tasks = TaskPool()
    if self.workers.remote_workers():
        self._set_workers(self.workers.remote_workers())
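# A minimal sketch (hypothetical helper, not the class's actual sampling
# loop) of how `max_weight_sync_delay` above bounds policy staleness:
# each worker's collected steps are counted, and its weights are
# refreshed once the count reaches the delay.
def maybe_sync_weights(optimizer, worker_id, steps_collected,
                       broadcast_weights):
    counts = optimizer.steps_since_update
    counts[worker_id] = counts.get(worker_id, 0) + steps_collected
    if counts[worker_id] >= optimizer.max_weight_sync_delay:
        broadcast_weights(worker_id)  # push the latest weights
        optimizer.num_weight_syncs += 1
        counts[worker_id] = 0         # reset the staleness counter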
def __init__(
        self,
        workers,
        train_batch_size=500,
        sample_batch_size=50,
        # num_envs_per_worker=1,
        num_gpus=0,
        # lr=0.0005,
        replay_buffer_num_slots=0,
        replay_proportion=0.0,
        num_data_loader_buffers=1,
        max_sample_requests_in_flight_per_worker=2,
        broadcast_interval=1,
        num_sgd_iter=1,
        sgd_minibatch_size=1,
        learner_queue_size=16,
        learner_queue_timeout=300,
        num_aggregation_workers=0,
        shuffle_sequences=True,
        sync_sampling=False,
        minibatch_buffer_size=1,
        _fake_gpus=False):
    PolicyOptimizer.__init__(self, workers)

    self._stats_start_time = time.time()
    self._last_stats_time = {}
    self._last_stats_sum = {}

    self.learner_set = {}
    self.aggregator_set = {}
    self.sync_sampling = sync_sampling

    assert isinstance(workers, SuperWorkerSet)

    for ws_id, ws in workers.items():
        if num_gpus > 1 or num_data_loader_buffers > 1:
            # logger.info(
            #     "Enabling multi-GPU mode, {} GPUs, {} parallel
            #     loaders".format(
            #         num_gpus, num_data_loader_buffers))
            # if num_data_loader_buffers < minibatch_buffer_size:
            #     raise ValueError(
            #         "In multi-gpu mode you must have at least as many "
            #         "parallel data loader buffers as minibatch buffers: "
            #         "{} vs {}".format(num_data_loader_buffers,
            #                           minibatch_buffer_size))
            # self.learner = TFMultiGPULearner(
            #     self.workers.local_worker(),
            #     lr=lr,
            #     num_gpus=num_gpus,
            #     train_batch_size=train_batch_size,
            #     num_data_loader_buffers=num_data_loader_buffers,
            #     minibatch_buffer_size=minibatch_buffer_size,
            #     num_sgd_iter=num_sgd_iter,
            #     learner_queue_size=learner_queue_size,
            #     learner_queue_timeout=learner_queue_timeout,
            #     _fake_gpus=_fake_gpus)
            raise NotImplementedError()
        else:
            if self.sync_sampling:
                learner = SyncLearnerThread(
                    ws.local_worker(),
                    minibatch_buffer_size=minibatch_buffer_size,
                    num_sgd_iter=num_sgd_iter,
                    learner_queue_size=learner_queue_size,
                    learner_queue_timeout=learner_queue_timeout,
                    num_gpus=num_gpus,
                    sgd_batch_size=sgd_minibatch_size)
            else:
                learner = AsyncLearnerThread(
                    ws.local_worker(),
                    minibatch_buffer_size=minibatch_buffer_size,
                    num_sgd_iter=num_sgd_iter,
                    learner_queue_size=learner_queue_size,
                    learner_queue_timeout=learner_queue_timeout)
            learner.start()
            self.learner_set[ws_id] = learner

        if num_aggregation_workers > 0:
            raise NotImplementedError()
            # self.aggregator = TreeAggregator(
            #     workers,
            #     num_aggregation_workers,
            #     replay_proportion=replay_proportion,
            #     max_sample_requests_in_flight_per_worker=(
            #         max_sample_requests_in_flight_per_worker),
            #     replay_buffer_num_slots=replay_buffer_num_slots,
            #     train_batch_size=train_batch_size,
            #     sample_batch_size=sample_batch_size,
            #     broadcast_interval=broadcast_interval)
        else:
            aggregator = DRAggregator(
                ws,
                replay_proportion=replay_proportion,
                max_sample_requests_in_flight_per_worker=(
                    max_sample_requests_in_flight_per_worker
                    if not self.sync_sampling else 1),
                replay_buffer_num_slots=replay_buffer_num_slots,
                train_batch_size=train_batch_size,
                sample_batch_size=sample_batch_size,
                broadcast_interval=broadcast_interval,
                sync_sampling=sync_sampling)
            self.aggregator_set[ws_id] = aggregator

    self.train_batch_size = train_batch_size
    self.shuffle_sequences = shuffle_sequences

    logger.debug(
        "===== Sync sampling mode: {} =====".format(sync_sampling))

    # Stats
    self._optimizer_step_timer = TimerStat()
    self._stats_start_time = time.time()
    self._last_stats_time = {}
    self.episode_history = {
        ws_id: []
        for ws_id, _ in self.workers.items()
    }
    self.to_be_collected = {
        ws_id: []
        for ws_id, _ in self.workers.items()
    }
def __init__(
        self,
        workers,
        learning_starts=1000,
        buffer_size=10000,
        prioritized_replay=True,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta=0.4,
        prioritized_replay_eps=1e-6,
        final_prioritized_replay_beta=0.4,
        train_batch_size=32,
        before_learn_on_batch=None,
        synchronize_sampling=False,
        prioritized_replay_beta_annealing_timesteps=100000 * 0.2,
):
    """Initialize a sync replay optimizer.

    Args:
        workers (WorkerSet): all workers
        learning_starts (int): wait until this many steps have been
            sampled before starting optimization.
        buffer_size (int): max size of the replay buffer
        prioritized_replay (bool): whether to enable prioritized replay
        prioritized_replay_alpha (float): replay alpha hyperparameter
        prioritized_replay_beta (float): replay beta hyperparameter
        prioritized_replay_eps (float): replay eps hyperparameter
        final_prioritized_replay_beta (float): Final value of beta.
        train_batch_size (int): size of batches to learn on
        before_learn_on_batch (function): callback to run before passing
            the sampled batch to learn on
        synchronize_sampling (bool): whether to sample the experiences
            for all policies with the same indices (used in MADDPG).
        prioritized_replay_beta_annealing_timesteps (int): The timestep
            at which PR-beta annealing should end.
    """
    PolicyOptimizer.__init__(self, workers)

    self.replay_starts = learning_starts

    # Linearly anneal beta, as in the Rainbow paper, stopping at
    # `final_prioritized_replay_beta`.
    self.prioritized_replay_beta = PiecewiseSchedule(
        endpoints=[(0, prioritized_replay_beta),
                   (prioritized_replay_beta_annealing_timesteps,
                    final_prioritized_replay_beta)],
        outside_value=final_prioritized_replay_beta,
        framework=None)
    self.prioritized_replay_eps = prioritized_replay_eps
    self.train_batch_size = train_batch_size
    self.before_learn_on_batch = before_learn_on_batch
    self.synchronize_sampling = synchronize_sampling

    # Stats
    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.replay_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.learner_stats = {}

    # Set up replay buffer
    if prioritized_replay:

        def new_buffer():
            return PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
    else:

        def new_buffer():
            return ReplayBuffer(buffer_size)

    self.replay_buffers = collections.defaultdict(new_buffer)

    if buffer_size < self.replay_starts:
        logger.warning("buffer_size={} < replay_starts={}".format(
            buffer_size, self.replay_starts))
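# A minimal sketch of the piecewise-linear interpolation assumed above
# (illustrative; not the PiecewiseSchedule source): between consecutive
# endpoints (t0, v0) and (t1, v1), the value at step t is a linear
# blend; outside all endpoints, `outside_value` is returned.
def piecewise_value(t, endpoints, outside_value):
    for (t0, v0), (t1, v1) in zip(endpoints[:-1], endpoints[1:]):
        if t0 <= t < t1:
            alpha = float(t - t0) / (t1 - t0)
            return v0 + alpha * (v1 - v0)
    return outside_value

# E.g. endpoints=[(0, 0.4), (20000, 1.0)] gives 0.7 at t=10000.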
def __init__(self,
             workers,
             learning_starts=1000,
             buffer_size=10000,
             prioritized_replay=True,
             prioritized_replay_alpha=0.6,
             prioritized_replay_beta=0.4,
             prioritized_replay_eps=1e-6,
             schedule_max_timesteps=100000,
             beta_annealing_fraction=0.2,
             final_prioritized_replay_beta=0.4,
             train_batch_size=32,
             sample_batch_size=4,
             before_learn_on_batch=None,
             synchronize_sampling=False):
    """Initialize a sync replay optimizer.

    Arguments:
        workers (WorkerSet): all workers
        learning_starts (int): wait until this many steps have been
            sampled before starting optimization.
        buffer_size (int): max size of the replay buffer
        prioritized_replay (bool): whether to enable prioritized replay
        prioritized_replay_alpha (float): replay alpha hyperparameter
        prioritized_replay_beta (float): replay beta hyperparameter
        prioritized_replay_eps (float): replay eps hyperparameter
        schedule_max_timesteps (int): number of timesteps in the schedule
        beta_annealing_fraction (float): fraction of schedule to anneal
            beta over
        final_prioritized_replay_beta (float): final value of beta
        train_batch_size (int): size of batches to learn on
        sample_batch_size (int): size of batches to sample from workers
        before_learn_on_batch (function): callback to run before passing
            the sampled batch to learn on
        synchronize_sampling (bool): whether to sample the experiences
            for all policies with the same indices (used in MADDPG).
    """
    PolicyOptimizer.__init__(self, workers)

    self.replay_starts = learning_starts
    # Linearly annealed beta, as used in the Rainbow paper.
    self.prioritized_replay_beta = LinearSchedule(
        schedule_timesteps=int(schedule_max_timesteps *
                               beta_annealing_fraction),
        initial_p=prioritized_replay_beta,
        final_p=final_prioritized_replay_beta)
    self.prioritized_replay_eps = prioritized_replay_eps
    self.train_batch_size = train_batch_size
    self.before_learn_on_batch = before_learn_on_batch
    self.synchronize_sampling = synchronize_sampling

    # Stats
    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.replay_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.learner_stats = {}

    # Set up replay buffer
    if prioritized_replay:

        def new_buffer():
            return PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
    else:

        def new_buffer():
            return ReplayBuffer(buffer_size)

    self.replay_buffers = collections.defaultdict(new_buffer)

    if buffer_size < self.replay_starts:
        logger.warning("buffer_size={} < replay_starts={}".format(
            buffer_size, self.replay_starts))
def __init__(self,
             workers,
             train_batch_size=500,
             sample_batch_size=50,
             num_envs_per_worker=1,
             num_gpus=0,
             lr=0.0005,
             replay_buffer_num_slots=0,
             replay_proportion=0.0,
             num_data_loader_buffers=1,
             max_sample_requests_in_flight_per_worker=2,
             broadcast_interval=1,
             num_sgd_iter=1,
             minibatch_buffer_size=1,
             learner_queue_size=16,
             num_aggregation_workers=0,
             _fake_gpus=False):
    PolicyOptimizer.__init__(self, workers)

    self._stats_start_time = time.time()
    self._last_stats_time = {}
    self._last_stats_sum = {}

    if num_gpus > 1 or num_data_loader_buffers > 1:
        logger.info(
            "Enabling multi-GPU mode, {} GPUs, {} parallel loaders".format(
                num_gpus, num_data_loader_buffers))
        if num_data_loader_buffers < minibatch_buffer_size:
            raise ValueError(
                "In multi-gpu mode you must have at least as many "
                "parallel data loader buffers as minibatch buffers: "
                "{} vs {}".format(num_data_loader_buffers,
                                  minibatch_buffer_size))
        self.learner = TFMultiGPULearner(
            self.workers.local_worker(),
            lr=lr,
            num_gpus=num_gpus,
            train_batch_size=train_batch_size,
            num_data_loader_buffers=num_data_loader_buffers,
            minibatch_buffer_size=minibatch_buffer_size,
            num_sgd_iter=num_sgd_iter,
            learner_queue_size=learner_queue_size,
            _fake_gpus=_fake_gpus)
    else:
        self.learner = LearnerThread(self.workers.local_worker(),
                                     minibatch_buffer_size, num_sgd_iter,
                                     learner_queue_size)
    self.learner.start()

    # Stats
    self._optimizer_step_timer = TimerStat()
    self._stats_start_time = time.time()
    self._last_stats_time = {}

    if num_aggregation_workers > 0:
        self.aggregator = TreeAggregator(
            workers,
            num_aggregation_workers,
            replay_proportion=replay_proportion,
            max_sample_requests_in_flight_per_worker=(
                max_sample_requests_in_flight_per_worker),
            replay_buffer_num_slots=replay_buffer_num_slots,
            train_batch_size=train_batch_size,
            sample_batch_size=sample_batch_size,
            broadcast_interval=broadcast_interval)
    else:
        self.aggregator = SimpleAggregator(
            workers,
            replay_proportion=replay_proportion,
            max_sample_requests_in_flight_per_worker=(
                max_sample_requests_in_flight_per_worker),
            replay_buffer_num_slots=replay_buffer_num_slots,
            train_batch_size=train_batch_size,
            sample_batch_size=sample_batch_size,
            broadcast_interval=broadcast_interval)
def __init__(self,
             workers,
             sgd_batch_size=128,
             num_sgd_iter=10,
             sample_batch_size=200,
             num_envs_per_worker=1,
             train_batch_size=1024,
             num_gpus=0,
             standardize_fields=[],
             straggler_mitigation=False):
    PolicyOptimizer.__init__(self, workers)

    self.batch_size = sgd_batch_size
    self.num_sgd_iter = num_sgd_iter
    self.num_envs_per_worker = num_envs_per_worker
    self.sample_batch_size = sample_batch_size
    self.train_batch_size = train_batch_size
    self.straggler_mitigation = straggler_mitigation
    if not num_gpus:
        self.devices = ["/cpu:0"]
    else:
        self.devices = [
            "/gpu:{}".format(i) for i in range(int(math.ceil(num_gpus)))
        ]
    self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
        self.devices)
    assert self.batch_size % len(self.devices) == 0
    assert self.batch_size >= len(self.devices), "batch size too small"
    self.per_device_batch_size = int(self.batch_size / len(self.devices))
    self.sample_timer = TimerStat()
    self.load_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.update_weights_timer = TimerStat()
    self.standardize_fields = standardize_fields

    logger.info("LocalMultiGPUOptimizer devices {}".format(self.devices))

    self.policies = dict(self.workers.local_worker()
                         .foreach_trainable_policy(lambda p, i: (i, p)))
    logger.debug("Policies to train: {}".format(self.policies))
    for policy_id, policy in self.policies.items():
        if not isinstance(policy, TFPolicy):
            raise ValueError(
                "Only TF policies are supported with multi-GPU. Try using "
                "the simple optimizer instead.")

    # per-GPU graph copies created below must share vars with the policy
    # reuse is set to AUTO_REUSE because Adam nodes are created after
    # all of the device copies are created.
    self.optimizers = {}
    with self.workers.local_worker().tf_sess.graph.as_default():
        with self.workers.local_worker().tf_sess.as_default():
            for policy_id, policy in self.policies.items():
                with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE):
                    if policy._state_inputs:
                        rnn_inputs = policy._state_inputs + [
                            policy._seq_lens
                        ]
                    else:
                        rnn_inputs = []
                    self.optimizers[policy_id] = (
                        LocalSyncParallelOptimizer(
                            policy._optimizer, self.devices,
                            [v for _, v in policy._loss_inputs],
                            rnn_inputs, self.per_device_batch_size,
                            policy.copy))

            self.sess = self.workers.local_worker().tf_sess
            self.sess.run(tf.global_variables_initializer())
def __init__(self,
             workers,
             sgd_batch_size=128,
             num_sgd_iter=10,
             rollout_fragment_length=200,
             num_envs_per_worker=1,
             train_batch_size=1024,
             num_gpus=0,
             standardize_fields=[],
             shuffle_sequences=True,
             _fake_gpus=False):
    """Initialize a synchronous multi-gpu optimizer.

    Arguments:
        workers (WorkerSet): all workers
        sgd_batch_size (int): SGD minibatch size within train batch size
        num_sgd_iter (int): number of passes to learn on per train batch
        rollout_fragment_length (int): size of batches to sample from
            workers.
        num_envs_per_worker (int): num envs in each rollout worker
        train_batch_size (int): size of batches to learn on
        num_gpus (int): number of GPUs to use for data-parallel SGD
        standardize_fields (list): list of fields in the training batch
            to normalize
        shuffle_sequences (bool): whether to shuffle the train batch
            prior to SGD to break up correlations
        _fake_gpus (bool): Whether to use fake-GPUs (CPUs) instead of
            actual GPUs (should only be used for testing on non-GPU
            machines).
    """
    PolicyOptimizer.__init__(self, workers)

    self.batch_size = sgd_batch_size
    self.num_sgd_iter = num_sgd_iter
    self.num_envs_per_worker = num_envs_per_worker
    self.rollout_fragment_length = rollout_fragment_length
    self.train_batch_size = train_batch_size
    self.shuffle_sequences = shuffle_sequences

    # Collect actual devices to use.
    if not num_gpus:
        _fake_gpus = True
        num_gpus = 1
    type_ = "cpu" if _fake_gpus else "gpu"
    self.devices = [
        "/{}:{}".format(type_, i) for i in range(int(math.ceil(num_gpus)))
    ]

    self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
        self.devices)
    assert self.batch_size % len(self.devices) == 0
    assert self.batch_size >= len(self.devices), "batch size too small"
    self.per_device_batch_size = int(self.batch_size / len(self.devices))
    self.sample_timer = TimerStat()
    self.load_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.update_weights_timer = TimerStat()
    self.standardize_fields = standardize_fields

    logger.info("LocalMultiGPUOptimizer devices {}".format(self.devices))

    self.policies = dict(self.workers.local_worker()
                         .foreach_trainable_policy(lambda p, i: (i, p)))
    logger.debug("Policies to train: {}".format(self.policies))
    for policy_id, policy in self.policies.items():
        if not isinstance(policy, TFPolicy):
            raise ValueError(
                "Only TF graph policies are supported with multi-GPU. "
                "Try setting `simple_optimizer=True` instead.")

    # per-GPU graph copies created below must share vars with the policy
    # reuse is set to AUTO_REUSE because Adam nodes are created after
    # all of the device copies are created.
    self.optimizers = {}
    with self.workers.local_worker().tf_sess.graph.as_default():
        with self.workers.local_worker().tf_sess.as_default():
            for policy_id, policy in self.policies.items():
                with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE):
                    if policy._state_inputs:
                        rnn_inputs = policy._state_inputs + [
                            policy._seq_lens
                        ]
                    else:
                        rnn_inputs = []
                    self.optimizers[policy_id] = (
                        LocalSyncParallelOptimizer(
                            policy._optimizer, self.devices,
                            [v for _, v in policy._loss_inputs],
                            rnn_inputs, self.per_device_batch_size,
                            policy.copy))

            self.sess = self.workers.local_worker().tf_sess
            self.sess.run(tf.global_variables_initializer())
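# A quick worked example of the per-device batch arithmetic above: the
# SGD batch is rounded down to a multiple of the device count, then
# split evenly across devices (device names here are hypothetical).
devices = ["/gpu:0", "/gpu:1", "/gpu:2"]  # 3 devices
sgd_batch_size = 128
batch_size = int(sgd_batch_size / len(devices)) * len(devices)  # 126
per_device_batch_size = int(batch_size / len(devices))          # 42
assert batch_size % len(devices) == 0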
def __init__(self,
             local_evaluator,
             remote_evaluators,
             learning_starts=1000,
             buffer_size=10000,
             prioritized_replay=True,
             prioritized_replay_alpha=0.6,
             prioritized_replay_beta=0.4,
             prioritized_replay_eps=1e-6,
             train_batch_size=512,
             sample_batch_size=50,
             num_replay_buffer_shards=1,
             max_weight_sync_delay=400,
             debug=False,
             batch_replay=False):
    PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators)

    self.debug = debug
    self.batch_replay = batch_replay
    self.replay_starts = learning_starts
    self.prioritized_replay_beta = prioritized_replay_beta
    self.prioritized_replay_eps = prioritized_replay_eps
    self.max_weight_sync_delay = max_weight_sync_delay

    self.learner = LearnerThread(self.local_evaluator)
    self.learner.start()

    if self.batch_replay:
        replay_cls = BatchReplayActor
    else:
        replay_cls = ReplayActor
    self.replay_actors = create_colocated(replay_cls, [
        num_replay_buffer_shards,
        learning_starts,
        buffer_size,
        train_batch_size,
        prioritized_replay_alpha,
        prioritized_replay_beta,
        prioritized_replay_eps,
    ], num_replay_buffer_shards)

    # Stats
    self.timers = {
        k: TimerStat()
        for k in [
            "put_weights", "get_samples", "sample_processing",
            "replay_processing", "update_priorities", "train", "sample"
        ]
    }
    self.num_weight_syncs = 0
    self.num_samples_dropped = 0
    self.learning_started = False

    # Number of worker steps since the last weight update
    self.steps_since_update = {}

    # Kick off replay tasks for local gradient updates
    self.replay_tasks = TaskPool()
    for ra in self.replay_actors:
        for _ in range(REPLAY_QUEUE_DEPTH):
            self.replay_tasks.add(ra, ra.replay.remote())

    # Kick off async background sampling
    self.sample_tasks = TaskPool()
    if self.remote_evaluators:
        self._set_evaluators(self.remote_evaluators)