def __init__(self, num_shards, learning_starts, buffer_size,
             replay_batch_size, prioritized_replay_alpha,
             prioritized_replay_beta, prioritized_replay_eps):
    self.replay_starts = learning_starts // num_shards
    self.buffer_size = buffer_size // num_shards
    self.replay_batch_size = replay_batch_size
    self.prioritized_replay_beta = prioritized_replay_beta
    self.prioritized_replay_eps = prioritized_replay_eps

    def gen_replay():
        while True:
            yield self.replay()

    ParallelIteratorWorker.__init__(self, gen_replay, False)

    def new_buffer():
        return PrioritizedReplayBuffer(
            self.buffer_size, alpha=prioritized_replay_alpha)

    self.replay_buffers = collections.defaultdict(new_buffer)

    # Metrics
    self.add_batch_timer = TimerStat()
    self.replay_timer = TimerStat()
    self.update_priorities_timer = TimerStat()
    self.num_added = 0
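# Since the buffer is sharded, the global learning_starts and buffer_size
# limits above are divided evenly across shards. A quick standalone check
# with made-up numbers (4 shards, illustrative sizes only):
num_shards, learning_starts, buffer_size = 4, 50000, 2000000
assert learning_starts // num_shards == 12500   # per-shard replay start
assert buffer_size // num_shards == 500000      # per-shard capacity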
def __init__(self, local_worker, minibatch_buffer_size, num_sgd_iter,
             learner_queue_size, learner_queue_timeout):
    """Initialize the learner thread.

    Arguments:
        local_worker (RolloutWorker): process local rollout worker holding
            policies this thread will call learn_on_batch() on
        minibatch_buffer_size (int): max number of train batches to store
            in the minibatching buffer
        num_sgd_iter (int): number of passes to learn on per train batch
        learner_queue_size (int): max size of queue of inbound
            train batches to this thread
        learner_queue_timeout (int): raise an exception if the queue has
            been empty for this long in seconds
    """
    threading.Thread.__init__(self)
    self.learner_queue_size = WindowStat("size", 50)
    self.local_worker = local_worker
    self.inqueue = queue.Queue(maxsize=learner_queue_size)
    self.outqueue = queue.Queue()
    self.minibatch_buffer = MinibatchBuffer(
        inqueue=self.inqueue,
        size=minibatch_buffer_size,
        timeout=learner_queue_timeout,
        num_passes=num_sgd_iter)
    self.queue_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.load_timer = TimerStat()
    self.load_wait_timer = TimerStat()
    self.daemon = True
    self.weights_updated = False
    self.stats = {}
    self.stopped = False
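# For reference, a minimal sketch of a minibatch buffer with the constructor
# signature used above (inqueue, size, timeout, num_passes). This is only an
# illustrative assumption about how such a buffer could behave (each batch is
# served num_passes times before its slot is refilled), not the actual
# MinibatchBuffer implementation.
import queue


class _SketchMinibatchBuffer:
    def __init__(self, inqueue, size, timeout, num_passes):
        self.inqueue = inqueue        # queue.Queue of inbound train batches
        self.timeout = timeout        # seconds to wait for a new batch
        self.max_ttl = num_passes     # SGD passes served per batch
        self.buffers = [None] * size
        self.ttl = [0] * size         # remaining passes per slot
        self.idx = 0

    def get(self):
        """Returns (batch, released): `released` is True when this was the
        last pass over the batch, so its slot will be refilled next time."""
        if self.ttl[self.idx] <= 0:
            # Mirrors learner_queue_timeout: raises queue.Empty on timeout.
            self.buffers[self.idx] = self.inqueue.get(timeout=self.timeout)
            self.ttl[self.idx] = self.max_ttl
        batch = self.buffers[self.idx]
        self.ttl[self.idx] -= 1
        released = self.ttl[self.idx] <= 0
        self.idx = (self.idx + 1) % len(self.buffers)
        return batch, released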
def __init__(self,
             workers,
             num_sgd_iter=1,
             train_batch_size=1,
             sgd_minibatch_size=0,
             standardize_fields=frozenset([]),
             aux_loss_every_k=16,
             aux_loss_num_sgd_iter=9,
             aux_loss_start_after_num_steps=0):
    PolicyOptimizer.__init__(self, workers)

    self.update_weights_timer = TimerStat()
    self.standardize_fields = standardize_fields
    self.sample_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.throughput = RunningStat()
    self.num_sgd_iter = num_sgd_iter
    self.sgd_minibatch_size = sgd_minibatch_size
    self.train_batch_size = train_batch_size
    self.learner_stats = {}
    self.policies = dict(self.workers.local_worker()
                         .foreach_trainable_policy(lambda p, i: (i, p)))
    logger.debug("Policies to train: {}".format(self.policies))

    self.aux_loss_every_k = aux_loss_every_k
    self.aux_loss_num_sgd_iter = aux_loss_num_sgd_iter
    self.aux_loss_start_after_num_steps = aux_loss_start_after_num_steps
    self.memory = []
    # Assert that the train batch size is divisible by the SGD minibatch
    # size to make populating policy logits simpler.
    assert train_batch_size % sgd_minibatch_size == 0, (
        f"train_batch_size: {train_batch_size}, "
        f"sgd_minibatch_size: {sgd_minibatch_size}")
def _init(self,
          sgd_batch_size=128,
          num_sgd_iter=10,
          train_batch_size=1024,
          num_gpus=0,
          standardize_fields=[]):
    self.batch_size = sgd_batch_size
    self.num_sgd_iter = num_sgd_iter
    self.train_batch_size = train_batch_size
    if not num_gpus:
        self.devices = ["/cpu:0"]
    else:
        self.devices = [
            "/gpu:{}".format(i) for i in range(int(math.ceil(num_gpus)))
        ]
    self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
        self.devices)
    assert self.batch_size % len(self.devices) == 0
    assert self.batch_size >= len(self.devices), "batch size too small"
    self.per_device_batch_size = int(self.batch_size / len(self.devices))
    self.sample_timer = TimerStat()
    self.load_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.update_weights_timer = TimerStat()
    self.standardize_fields = standardize_fields

    logger.info("LocalMultiGPUOptimizer devices {}".format(self.devices))

    self.policies = dict(
        self.local_evaluator.foreach_trainable_policy(lambda p, i: (i, p)))
    logger.debug("Policies to train: {}".format(self.policies))

    for policy_id, policy in self.policies.items():
        if not isinstance(policy, TFPolicyGraph):
            raise ValueError(
                "Only TF policies are supported with multi-GPU. Try using "
                "the simple optimizer instead.")

    # per-GPU graph copies created below must share vars with the policy
    # reuse is set to AUTO_REUSE because Adam nodes are created after
    # all of the device copies are created.
    self.optimizers = {}
    with self.local_evaluator.tf_sess.graph.as_default():
        with self.local_evaluator.tf_sess.as_default():
            for policy_id, policy in self.policies.items():
                with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE):
                    if policy._state_inputs:
                        rnn_inputs = policy._state_inputs + [
                            policy._seq_lens
                        ]
                    else:
                        rnn_inputs = []
                    self.optimizers[policy_id] = (
                        LocalSyncParallelOptimizer(
                            policy._optimizer, self.devices,
                            [v for _, v in policy._loss_inputs],
                            rnn_inputs, self.per_device_batch_size,
                            policy.copy))

    self.sess = self.local_evaluator.tf_sess
    self.sess.run(tf.global_variables_initializer())
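# The batch-size rounding above forces the effective SGD batch size to be a
# multiple of the device count. Standalone check with illustrative numbers:
# sgd_batch_size=130 split over 4 devices becomes 128, i.e. 32 per device.
sgd_batch_size, num_devices = 130, 4
batch_size = int(sgd_batch_size / num_devices) * num_devices
assert batch_size == 128
assert batch_size // num_devices == 32   # per_device_batch_size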
def __init__(self, capacity: int, replay_ratio: float):
    """Initializes MixInReplay instance.

    Args:
        capacity (int): Number of batches to store in total.
        replay_ratio (float): Ratio of replayed samples in the returned
            batches. E.g. a ratio of 0.0 means only return new samples
            (no replay), a ratio of 0.5 means always return newest sample
            plus one old one (1:1), a ratio of 0.66 means always return
            the newest sample plus 2 old (replayed) ones (1:2), etc...
    """
    self.capacity = capacity
    self.replay_ratio = replay_ratio
    self.replay_proportion = None
    if self.replay_ratio != 1.0:
        self.replay_proportion = self.replay_ratio / (
            1.0 - self.replay_ratio)

    def new_buffer():
        return SimpleReplayBuffer(num_slots=capacity)

    self.replay_buffers = collections.defaultdict(new_buffer)

    # Metrics.
    self.add_batch_timer = TimerStat()
    self.replay_timer = TimerStat()
    self.update_priorities_timer = TimerStat()

    # Added timesteps over lifetime.
    self.num_added = 0

    # Last added batch(es).
    self.last_added_batches = collections.defaultdict(list)
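# To make the ratio -> proportion conversion above concrete: the proportion p
# satisfies p / (1 + p) == replay_ratio, i.e. roughly p old batches are
# replayed per new batch. Standalone example (plain Python, no dependencies):
for ratio in (0.0, 0.5, 0.66):
    proportion = ratio / (1.0 - ratio)
    print("replay_ratio={} -> replay_proportion={:.2f}".format(
        ratio, proportion))
# 0.0 -> 0.00 (no replay), 0.5 -> 1.00 (1:1), 0.66 -> 1.94 (~1:2)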
def _init(self):
    assert isinstance(self.local_evaluator, TFMultiGPUSupport)
    self.batch_size = self.config.get("sgd_batch_size", 128)
    gpu_ids = ray.get_gpu_ids()
    if not gpu_ids:
        self.devices = ["/cpu:0"]
    else:
        self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
    assert self.batch_size > len(self.devices), "batch size too small"
    self.per_device_batch_size = self.batch_size // len(self.devices)
    self.sample_timer = TimerStat()
    self.load_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.update_weights_timer = TimerStat()

    print("LocalMultiGPUOptimizer devices", self.devices)
    print("LocalMultiGPUOptimizer batch size", self.batch_size)

    # List of (feature name, feature placeholder) tuples
    self.loss_inputs = self.local_evaluator.tf_loss_inputs()

    # per-GPU graph copies created below must share vars with the policy
    tf.get_variable_scope().reuse_variables()
    self.par_opt = LocalSyncParallelOptimizer(
        tf.train.AdamOptimizer(self.config.get("sgd_stepsize", 5e-5)),
        self.devices,
        [ph for _, ph in self.loss_inputs],
        self.per_device_batch_size,
        lambda *ph: self.local_evaluator.build_tf_loss(ph),
        self.config.get("logdir", os.getcwd()))

    self.sess = self.local_evaluator.sess
    self.sess.run(tf.global_variables_initializer())
def _init(self,
          learning_starts=1000,
          buffer_size=10000,
          prioritized_replay=True,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta=0.4,
          prioritized_replay_eps=1e-6,
          train_batch_size=32,
          sample_batch_size=4,
          clip_rewards=True):
    self.replay_starts = learning_starts
    self.prioritized_replay_beta = prioritized_replay_beta
    self.prioritized_replay_eps = prioritized_replay_eps
    self.train_batch_size = train_batch_size

    # Stats
    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.replay_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.throughput = RunningStat()

    # Set up replay buffer
    if prioritized_replay:
        self.replay_buffer = PrioritizedReplayBuffer(
            buffer_size,
            alpha=prioritized_replay_alpha,
            clip_rewards=clip_rewards)
    else:
        self.replay_buffer = ReplayBuffer(buffer_size, clip_rewards)

    assert buffer_size >= self.replay_starts
def _init(self, num_sgd_iter=1, timesteps_per_batch=1):
    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.throughput = RunningStat()
    self.num_sgd_iter = num_sgd_iter
    self.timesteps_per_batch = timesteps_per_batch
def __init__(self,
             workers,
             expected_batch_size,
             num_sgd_iter=1,
             sgd_minibatch_size=0,
             standardize_fields=frozenset([]),
             keep_local_weights_in_sync=True,
             backend="gloo"):
    PolicyOptimizer.__init__(self, workers)

    self.learner_stats = {}
    self.num_sgd_iter = num_sgd_iter
    self.expected_batch_size = expected_batch_size
    self.sgd_minibatch_size = sgd_minibatch_size
    self.standardize_fields = standardize_fields
    self.keep_local_weights_in_sync = keep_local_weights_in_sync
    self.sync_down_timer = TimerStat()
    self.sync_up_timer = TimerStat()
    self.learn_timer = TimerStat()

    # Setup the distributed processes.
    if not self.workers.remote_workers():
        raise ValueError("This optimizer requires >0 remote workers.")
    ip = ray.get(workers.remote_workers()[0].get_node_ip.remote())
    port = ray.get(workers.remote_workers()[0].find_free_port.remote())
    address = "tcp://{ip}:{port}".format(ip=ip, port=port)
    logger.info(
        "Creating torch process group with leader {}".format(address))

    # Get setup tasks in order to throw errors on failure.
    ray.get([
        worker.setup_torch_data_parallel.remote(
            address, i, len(workers.remote_workers()), backend)
        for i, worker in enumerate(workers.remote_workers())
    ])
    logger.info("Torch process group init completed")
def _init(self,
          sgd_batch_size=128,
          sgd_stepsize=5e-5,
          num_sgd_iter=10,
          timesteps_per_batch=1024,
          standardize_fields=[]):
    self.batch_size = sgd_batch_size
    self.sgd_stepsize = sgd_stepsize
    self.num_sgd_iter = num_sgd_iter
    self.timesteps_per_batch = timesteps_per_batch
    gpu_ids = ray.get_gpu_ids()
    if not gpu_ids:
        self.devices = ["/cpu:0"]
    else:
        self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
    self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
        self.devices)
    assert self.batch_size % len(self.devices) == 0
    assert self.batch_size >= len(self.devices), "batch size too small"
    self.per_device_batch_size = int(self.batch_size / len(self.devices))
    self.sample_timer = TimerStat()
    self.load_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.update_weights_timer = TimerStat()
    self.standardize_fields = standardize_fields

    print("LocalMultiGPUOptimizer devices", self.devices)

    if set(self.local_evaluator.policy_map.keys()) != {"default"}:
        raise ValueError(
            "Multi-agent is not supported with multi-GPU. Try using the "
            "simple optimizer instead.")
    self.policy = self.local_evaluator.policy_map["default"]
    if not isinstance(self.policy, TFPolicyGraph):
        raise ValueError(
            "Only TF policies are supported with multi-GPU. Try using the "
            "simple optimizer instead.")

    # per-GPU graph copies created below must share vars with the policy
    # reuse is set to AUTO_REUSE because Adam nodes are created after
    # all of the device copies are created.
    with self.local_evaluator.tf_sess.graph.as_default():
        with self.local_evaluator.tf_sess.as_default():
            with tf.variable_scope("default", reuse=tf.AUTO_REUSE):
                if self.policy._state_inputs:
                    rnn_inputs = self.policy._state_inputs + [
                        self.policy._seq_lens
                    ]
                else:
                    rnn_inputs = []
                self.par_opt = LocalSyncParallelOptimizer(
                    tf.train.AdamOptimizer(self.sgd_stepsize),
                    self.devices,
                    [v for _, v in self.policy.loss_inputs()],
                    rnn_inputs,
                    self.per_device_batch_size,
                    self.policy.copy,
                    os.getcwd())

    self.sess = self.local_evaluator.tf_sess
    self.sess.run(tf.global_variables_initializer())
def _init(self, grads_per_step=100):
    self.apply_timer = TimerStat()
    self.wait_timer = TimerStat()
    self.dispatch_timer = TimerStat()
    self.grads_per_step = grads_per_step
    if not self.remote_evaluators:
        raise ValueError(
            "Async optimizer requires at least 1 remote evaluator")
def _init(self, num_sgd_iter=1, train_batch_size=1):
    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.throughput = RunningStat()
    self.num_sgd_iter = num_sgd_iter
    self.train_batch_size = train_batch_size
    self.learner_stats = {}
def __init__(self, local_evaluator):
    threading.Thread.__init__(self)
    self.learner_queue_size = WindowStat("size", 50)
    self.local_evaluator = local_evaluator
    self.inqueue = queue.Queue(maxsize=LEARNER_QUEUE_MAX_SIZE)
    self.outqueue = queue.Queue()
    self.queue_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.daemon = True
def __init__(self, workers, num_sgd_iter=1, train_batch_size=1):
    PolicyOptimizer.__init__(self, workers)

    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.throughput = RunningStat()
    self.num_sgd_iter = num_sgd_iter
    self.train_batch_size = train_batch_size
    self.learner_stats = {}
def __init__(self, learner, share_stats):
    threading.Thread.__init__(self)
    self.learner = learner
    self.daemon = True
    if share_stats:
        self.queue_timer = learner.queue_timer
        self.load_timer = learner.load_timer
    else:
        self.queue_timer = TimerStat()
        self.load_timer = TimerStat()
def __init__(self, workers, grads_per_step=100):
    PolicyOptimizer.__init__(self, workers)

    self.apply_timer = TimerStat()
    self.wait_timer = TimerStat()
    self.dispatch_timer = TimerStat()
    self.grads_per_step = grads_per_step
    self.learner_stats = {}
    if not self.workers.remote_workers():
        raise ValueError(
            "Async optimizer requires at least 1 remote worker")
def __init__(self, multi_gpu_learner_thread: MultiGPULearnerThread,
             share_stats: bool):
    threading.Thread.__init__(self)
    self.multi_gpu_learner_thread = multi_gpu_learner_thread
    self.daemon = True
    if share_stats:
        self.queue_timer = multi_gpu_learner_thread.queue_timer
        self.load_timer = multi_gpu_learner_thread.load_timer
    else:
        self.queue_timer = TimerStat()
        self.load_timer = TimerStat()
def __init__(self, local_evaluator, remote_evaluators, grads_per_step=100):
    PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators)

    self.apply_timer = TimerStat()
    self.wait_timer = TimerStat()
    self.dispatch_timer = TimerStat()
    self.grads_per_step = grads_per_step
    self.learner_stats = {}
    if not self.remote_evaluators:
        raise ValueError(
            "Async optimizer requires at least 1 remote evaluator")
def __init__(self, local_worker):
    threading.Thread.__init__(self)
    self.learner_queue_size = WindowStat("size", 50)
    self.local_worker = local_worker
    self.inqueue = queue.Queue(maxsize=LEARNER_QUEUE_MAX_SIZE)
    self.outqueue = queue.Queue()
    self.queue_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.daemon = True
    self.weights_updated = False
    self.stopped = False
    self.stats = {}
def __init__(self,
             num_shards,
             learning_starts,
             buffer_size,
             replay_batch_size,
             prioritized_replay_alpha=0.6,
             prioritized_replay_beta=0.4,
             prioritized_replay_eps=1e-6,
             replay_mode="independent",
             replay_sequence_length=1):
    self.replay_starts = learning_starts // num_shards
    self.buffer_size = buffer_size // num_shards
    self.replay_batch_size = replay_batch_size
    self.prioritized_replay_beta = prioritized_replay_beta
    self.prioritized_replay_eps = prioritized_replay_eps
    self.replay_mode = replay_mode
    self.replay_sequence_length = replay_sequence_length

    if replay_sequence_length > 1:
        self.replay_batch_size = int(
            max(1, replay_batch_size // replay_sequence_length))
        logger.info(
            "Since replay_sequence_length={} and replay_batch_size={}, "
            "we will replay {} sequences at a time.".format(
                replay_sequence_length, replay_batch_size,
                self.replay_batch_size))

    if replay_mode not in ["lockstep", "independent"]:
        raise ValueError("Unsupported replay mode: {}".format(replay_mode))

    def gen_replay():
        while True:
            yield self.replay()

    ParallelIteratorWorker.__init__(self, gen_replay, False)

    def new_buffer():
        return PrioritizedReplayBuffer(
            self.buffer_size, alpha=prioritized_replay_alpha)

    self.replay_buffers = collections.defaultdict(new_buffer)

    # Metrics
    self.add_batch_timer = TimerStat()
    self.replay_timer = TimerStat()
    self.update_priorities_timer = TimerStat()
    self.num_added = 0

    # Make externally accessible for testing.
    global _local_replay_buffer
    _local_replay_buffer = self

    # If set, return this instead of the usual data for testing.
    self._fake_batch = None
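# The sequence-length adjustment above is easiest to read with concrete
# numbers (illustrative only): replay_batch_size=64 with
# replay_sequence_length=4 means 16 sequences (64 // 4) are replayed per
# replay() call, preserving roughly 64 timesteps per call.
replay_batch_size, replay_sequence_length = 64, 4
assert int(max(1, replay_batch_size // replay_sequence_length)) == 16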
def __init__(self,
             workers,
             learning_starts=1000,
             buffer_size=10000,
             prioritized_replay=True,
             prioritized_replay_alpha=0.6,
             prioritized_replay_beta=0.4,
             schedule_max_timesteps=100000,
             beta_annealing_fraction=0.2,
             final_prioritized_replay_beta=0.4,
             prioritized_replay_eps=1e-6,
             train_batch_size=32,
             sample_batch_size=4,
             before_learn_on_batch=None,
             synchronize_sampling=False):
    PolicyOptimizer.__init__(self, workers)

    self.replay_starts = learning_starts
    # Linearly annealed beta, as used in the Rainbow paper.
    self.prioritized_replay_beta = LinearSchedule(
        schedule_timesteps=int(
            schedule_max_timesteps * beta_annealing_fraction),
        initial_p=prioritized_replay_beta,
        final_p=final_prioritized_replay_beta)
    self.prioritized_replay_eps = prioritized_replay_eps
    self.train_batch_size = train_batch_size
    self.before_learn_on_batch = before_learn_on_batch
    self.synchronize_sampling = synchronize_sampling

    # Stats
    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.replay_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.learner_stats = {}

    # Set up replay buffer
    if prioritized_replay:

        def new_buffer():
            return PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
    else:

        def new_buffer():
            return ReplayBuffer(buffer_size)

    self.replay_buffers = collections.defaultdict(new_buffer)

    if buffer_size < self.replay_starts:
        logger.warning("buffer_size={} < replay_starts={}".format(
            buffer_size, self.replay_starts))
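# A minimal sketch of the linear beta annealing set up above, assuming a
# LinearSchedule that interpolates from initial_p to final_p over
# schedule_timesteps and then stays at final_p (illustrative helper, not the
# library class itself).
def _linear_beta(t, schedule_timesteps, initial_p, final_p):
    frac = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + frac * (final_p - initial_p)

# With the defaults above (initial == final == 0.4) beta stays constant;
# Rainbow-style annealing would instead pass e.g. final_p=1.0:
assert _linear_beta(0, 20000, 0.4, 0.4) == 0.4
assert abs(_linear_beta(10000, 20000, 0.4, 1.0) - 0.7) < 1e-9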
def _init(self,
          sgd_batch_size=128,
          sgd_stepsize=5e-5,
          num_sgd_iter=10,
          timesteps_per_batch=1024):
    assert isinstance(self.local_evaluator, TFMultiGPUSupport)
    self.batch_size = sgd_batch_size
    self.sgd_stepsize = sgd_stepsize
    self.num_sgd_iter = num_sgd_iter
    self.timesteps_per_batch = timesteps_per_batch
    gpu_ids = ray.get_gpu_ids()
    if not gpu_ids:
        self.devices = ["/cpu:0"]
    else:
        self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
    self.batch_size = int(
        sgd_batch_size / len(self.devices)) * len(self.devices)
    assert self.batch_size % len(self.devices) == 0
    assert self.batch_size >= len(self.devices), "batch size too small"
    self.per_device_batch_size = int(self.batch_size / len(self.devices))
    self.sample_timer = TimerStat()
    self.load_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.update_weights_timer = TimerStat()

    print("LocalMultiGPUOptimizer devices", self.devices)
    print("LocalMultiGPUOptimizer batch size", self.batch_size)

    # List of (feature name, feature placeholder) tuples
    self.loss_inputs = self.local_evaluator.tf_loss_inputs()

    # per-GPU graph copies created below must share vars with the policy
    main_thread_scope = tf.get_variable_scope()
    # reuse is set to AUTO_REUSE because Adam nodes are created after
    # all of the device copies are created.
    with tf.variable_scope(main_thread_scope, reuse=tf.AUTO_REUSE):
        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.sgd_stepsize),
            self.devices,
            [ph for _, ph in self.loss_inputs],
            self.per_device_batch_size,
            lambda *ph: self.local_evaluator.build_tf_loss(ph),
            os.getcwd())

    # TODO(rliaw): Find more elegant solution for this
    if hasattr(self.local_evaluator, "init_extra_ops"):
        self.local_evaluator.init_extra_ops(
            self.par_opt.get_device_losses())

    self.sess = self.local_evaluator.sess
    self.sess.run(tf.global_variables_initializer())
def _init(self, train_batch_size=512, sample_batch_size=50, debug=False):
    self.debug = debug
    self.learning_started = False
    self.train_batch_size = train_batch_size

    self.learner = LearnerThread(self.local_evaluator)
    self.learner.start()

    assert len(self.remote_evaluators) > 0

    # Stats
    self.timers = {
        k: TimerStat()
        for k in [
            "put_weights", "enqueue", "sample_processing", "train", "sample"
        ]
    }
    self.num_weight_syncs = 0

    # Kick off async background sampling
    self.sample_tasks = TaskPool()
    weights = self.local_evaluator.get_weights()
    for ev in self.remote_evaluators:
        ev.set_weights.remote(weights)
        for _ in range(SAMPLE_QUEUE_DEPTH):
            self.sample_tasks.add(ev, ev.sample.remote())

    self.batch_buffer = []
def __init__(self, num_shards, learning_starts, buffer_size,
             train_batch_size, prioritized_replay_alpha,
             prioritized_replay_beta, prioritized_replay_eps):
    self.replay_starts = learning_starts // num_shards
    self.buffer_size = buffer_size // num_shards
    self.train_batch_size = train_batch_size
    self.prioritized_replay_beta = prioritized_replay_beta
    self.prioritized_replay_eps = prioritized_replay_eps

    self.replay_buffer = PrioritizedReplayBuffer(
        self.buffer_size, alpha=prioritized_replay_alpha)

    # Metrics
    self.add_batch_timer = TimerStat()
    self.replay_timer = TimerStat()
    self.update_priorities_timer = TimerStat()
def _init(self,
          learning_starts=1000,
          buffer_size=10000,
          prioritized_replay=True,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta=0.4,
          prioritized_replay_eps=1e-6,
          train_batch_size=512,
          sample_batch_size=50,
          num_replay_buffer_shards=1,
          max_weight_sync_delay=400,
          clip_rewards=True,
          debug=False):
    self.debug = debug
    self.replay_starts = learning_starts
    self.prioritized_replay_beta = prioritized_replay_beta
    self.prioritized_replay_eps = prioritized_replay_eps
    self.train_batch_size = train_batch_size
    self.sample_batch_size = sample_batch_size
    self.max_weight_sync_delay = max_weight_sync_delay

    self.learner = LearnerThread(self.local_evaluator)
    self.learner.start()

    self.replay_actors = create_colocated(ReplayActor, [
        num_replay_buffer_shards, learning_starts, buffer_size,
        train_batch_size, prioritized_replay_alpha,
        prioritized_replay_beta, prioritized_replay_eps, clip_rewards
    ], num_replay_buffer_shards)
    assert len(self.remote_evaluators) > 0

    # Stats
    self.timers = {
        k: TimerStat()
        for k in [
            "put_weights", "get_samples", "enqueue", "sample_processing",
            "replay_processing", "update_priorities", "train", "sample"
        ]
    }
    self.num_weight_syncs = 0
    self.learning_started = False

    # Number of worker steps since the last weight update
    self.steps_since_update = {}

    # Kick off replay tasks for local gradient updates
    self.replay_tasks = TaskPool()
    for ra in self.replay_actors:
        for _ in range(REPLAY_QUEUE_DEPTH):
            self.replay_tasks.add(ra, ra.replay.remote())

    # Kick off async background sampling
    self.sample_tasks = TaskPool()
    weights = self.local_evaluator.get_weights()
    for ev in self.remote_evaluators:
        ev.set_weights.remote(weights)
        self.steps_since_update[ev] = 0
        for _ in range(SAMPLE_QUEUE_DEPTH):
            self.sample_tasks.add(ev, ev.sample.remote())
def _init(self,
          sgd_batch_size=128,
          sgd_stepsize=5e-5,
          num_sgd_iter=10,
          timesteps_per_batch=1024):
    self.batch_size = sgd_batch_size
    self.sgd_stepsize = sgd_stepsize
    self.num_sgd_iter = num_sgd_iter
    self.timesteps_per_batch = timesteps_per_batch
    gpu_ids = ray.get_gpu_ids()
    if not gpu_ids:
        self.devices = ["/cpu:0"]
    else:
        self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
    self.batch_size = int(
        sgd_batch_size / len(self.devices)) * len(self.devices)
    assert self.batch_size % len(self.devices) == 0
    assert self.batch_size >= len(self.devices), "batch size too small"
    self.per_device_batch_size = int(self.batch_size / len(self.devices))
    self.sample_timer = TimerStat()
    self.load_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.update_weights_timer = TimerStat()

    print("LocalMultiGPUOptimizer devices", self.devices)
    print("LocalMultiGPUOptimizer batch size", self.batch_size)

    assert set(self.local_evaluator.policy_map.keys()) == {"default"}, \
        "Multi-agent is not supported"
    self.policy = self.local_evaluator.policy_map["default"]
    assert isinstance(self.policy, TFPolicyGraph), \
        "Only TF policies are supported"

    # per-GPU graph copies created below must share vars with the policy
    # reuse is set to AUTO_REUSE because Adam nodes are created after
    # all of the device copies are created.
    with self.local_evaluator.tf_sess.graph.as_default():
        with self.local_evaluator.tf_sess.as_default():
            main_scope = tf.get_variable_scope()
            with tf.variable_scope(main_scope, reuse=tf.AUTO_REUSE):
                self.par_opt = LocalSyncParallelOptimizer(
                    tf.train.AdamOptimizer(self.sgd_stepsize),
                    self.devices,
                    self.policy.loss_inputs(),
                    self.per_device_batch_size,
                    self.policy.copy,
                    os.getcwd())

    self.sess = self.local_evaluator.tf_sess
    self.sess.run(tf.global_variables_initializer())
def __init__(self, local_evaluator, minibatch_buffer_size, num_sgd_iter):
    threading.Thread.__init__(self)
    self.learner_queue_size = WindowStat("size", 50)
    self.local_evaluator = local_evaluator
    self.inqueue = queue.Queue(maxsize=LEARNER_QUEUE_MAX_SIZE)
    self.outqueue = queue.Queue()
    self.minibatch_buffer = MinibatchBuffer(
        self.inqueue, minibatch_buffer_size, num_sgd_iter)
    self.queue_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.load_timer = TimerStat()
    self.load_wait_timer = TimerStat()
    self.daemon = True
    self.weights_updated = False
    self.stats = {}
    self.stopped = False
def __init__(
    self,
    capacity: int,
    replay_ratio: float,
    replay_mode: ReplayMode = ReplayMode.INDEPENDENT,
):
    """Initializes MixInReplay instance.

    Args:
        capacity: Number of batches to store in total.
        replay_ratio: Ratio of replayed samples in the returned batches.
            E.g. a ratio of 0.0 means only return new samples (no replay),
            a ratio of 0.5 means always return the newest sample plus one
            old one (1:1), a ratio of 0.66 means always return the newest
            sample plus 2 old (replayed) ones (1:2), etc...
        replay_mode: Either ReplayMode.INDEPENDENT or ReplayMode.LOCKSTEP
            (the equivalent strings "independent" and "lockstep" are also
            accepted).
    """
    self.capacity = capacity
    self.replay_ratio = replay_ratio
    self.replay_proportion = None
    if self.replay_ratio != 1.0:
        self.replay_proportion = self.replay_ratio / (
            1.0 - self.replay_ratio)

    if replay_mode in ["lockstep", ReplayMode.LOCKSTEP]:
        self.replay_mode = ReplayMode.LOCKSTEP
    elif replay_mode in ["independent", ReplayMode.INDEPENDENT]:
        self.replay_mode = ReplayMode.INDEPENDENT
    else:
        raise ValueError("Unsupported replay mode: {}".format(replay_mode))

    def new_buffer():
        return SimpleReplayBuffer(num_slots=capacity)

    self.replay_buffers = collections.defaultdict(new_buffer)

    # Metrics.
    self.add_batch_timer = TimerStat()
    self.replay_timer = TimerStat()
    self.update_priorities_timer = TimerStat()

    # Added timesteps over lifetime.
    self.num_added = 0

    # Last added batch(es).
    self.last_added_batches = collections.defaultdict(list)
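# Usage sketch (hypothetical values): both the ReplayMode enum and its plain
# string form are accepted for `replay_mode`, per the checks above.
mixin_independent = MixInReplay(capacity=100, replay_ratio=0.5,
                                replay_mode=ReplayMode.INDEPENDENT)
mixin_lockstep = MixInReplay(capacity=100, replay_ratio=0.5,
                             replay_mode="lockstep")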
def _init(self,
          learning_starts=1000,
          buffer_size=10000,
          prioritized_replay=True,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta=0.4,
          schedule_max_timesteps=100000,
          beta_annealing_fraction=0.2,
          final_prioritized_replay_beta=0.4,
          prioritized_replay_eps=1e-6,
          train_batch_size=32,
          sample_batch_size=4):
    self.replay_starts = learning_starts
    # Linearly annealed beta, as used in the Rainbow paper.
    self.prioritized_replay_beta = LinearSchedule(
        schedule_timesteps=int(
            schedule_max_timesteps * beta_annealing_fraction),
        initial_p=prioritized_replay_beta,
        final_p=final_prioritized_replay_beta)
    self.prioritized_replay_eps = prioritized_replay_eps
    self.train_batch_size = train_batch_size

    # Stats
    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.replay_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.throughput = RunningStat()
    self.learner_stats = {}

    # Set up replay buffer
    if prioritized_replay:

        def new_buffer():
            return PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
    else:

        def new_buffer():
            return ReplayBuffer(buffer_size)

    self.replay_buffers = collections.defaultdict(new_buffer)

    assert buffer_size >= self.replay_starts
def _init(self, learning_starts=1000, buffer_size=10000,
          train_batch_size=32):
    self.replay_starts = learning_starts
    self.max_buffer_size = buffer_size
    self.train_batch_size = train_batch_size
    assert self.max_buffer_size >= self.replay_starts

    # List of buffered sample batches
    self.replay_buffer = []
    self.buffer_size = 0

    # Stats
    self.update_weights_timer = TimerStat()
    self.sample_timer = TimerStat()
    self.grad_timer = TimerStat()
    self.learner_stats = {}