def make_learner_thread(local_worker, config):
    """Create the (not yet started) learner thread for this trainer.

    Args:
        local_worker: Local RolloutWorker whose policies the learner
            thread will train on.
        config (dict): Trainer config. NOTE: mutated in place when
            `minibatch_buffer_size` has to be clamped down to the number
            of multi-GPU tower stacks.

    Returns:
        A MultiGPULearnerThread (default) or a plain LearnerThread
        (when `simple_optimizer` is set).
    """
    if not config["simple_optimizer"]:
        logger.info(
            "Enabling multi-GPU mode, {} GPUs, {} parallel tower-stacks".
            format(config["num_gpus"], config["num_multi_gpu_tower_stacks"]))
        num_stacks = config["num_multi_gpu_tower_stacks"]
        buffer_size = config["minibatch_buffer_size"]
        # Each slot in the minibatch buffer holds a tower-stack index, so
        # the buffer cannot usefully be larger than the number of stacks:
        # warn and clamp it down.
        if num_stacks < buffer_size:
            logger.warning(
                "In multi-GPU mode you should have at least as many "
                "multi-GPU tower stacks (to load data into on one device) as "
                "you have stack-index slots in the buffer! You have "
                f"configured {num_stacks} stacks and a buffer of size "
                f"{buffer_size}. Setting "
                f"`minibatch_buffer_size={num_stacks}`.")
            config["minibatch_buffer_size"] = num_stacks
        learner_thread = MultiGPULearnerThread(
            local_worker,
            num_gpus=config["num_gpus"],
            lr=config["lr"],
            train_batch_size=config["train_batch_size"],
            num_multi_gpu_tower_stacks=config["num_multi_gpu_tower_stacks"],
            # Fix: forward the (possibly clamped) buffer size. Previously it
            # was dropped here, so MultiGPULearnerThread silently fell back
            # to its default of 1 despite the warning above claiming it was
            # being set.
            minibatch_buffer_size=config["minibatch_buffer_size"],
            num_sgd_iter=config["num_sgd_iter"],
            learner_queue_size=config["learner_queue_size"],
            learner_queue_timeout=config["learner_queue_timeout"])
    else:
        learner_thread = LearnerThread(
            local_worker,
            minibatch_buffer_size=config["minibatch_buffer_size"],
            num_sgd_iter=config["num_sgd_iter"],
            learner_queue_size=config["learner_queue_size"],
            learner_queue_timeout=config["learner_queue_timeout"])
    return learner_thread
def make_learner_thread(local_worker, config):
    """Build the learner thread matching `config` (single- or multi-GPU).

    Returns a TFMultiGPULearner when more than one GPU or data-loader
    buffer is configured, otherwise a plain LearnerThread. Raises
    ValueError if fewer data-loader buffers than minibatch buffers are
    configured in multi-GPU mode.
    """
    multi_gpu = (config["num_gpus"] > 1
                 or config["num_data_loader_buffers"] > 1)
    if not multi_gpu:
        # Single-GPU / CPU case: the simple learner thread suffices.
        return LearnerThread(
            local_worker,
            minibatch_buffer_size=config["minibatch_buffer_size"],
            num_sgd_iter=config["num_sgd_iter"],
            learner_queue_size=config["learner_queue_size"],
            learner_queue_timeout=config["learner_queue_timeout"])

    n_loaders = config["num_data_loader_buffers"]
    n_minibatch = config["minibatch_buffer_size"]
    logger.info(
        "Enabling multi-GPU mode, {} GPUs, {} parallel loaders".format(
            config["num_gpus"], n_loaders))
    # Every minibatch buffer needs a loader buffer to draw from.
    if n_loaders < n_minibatch:
        raise ValueError(
            "In multi-gpu mode you must have at least as many "
            "parallel data loader buffers as minibatch buffers: "
            "{} vs {}".format(n_loaders, n_minibatch))
    return TFMultiGPULearner(
        local_worker,
        num_gpus=config["num_gpus"],
        lr=config["lr"],
        train_batch_size=config["train_batch_size"],
        num_data_loader_buffers=n_loaders,
        minibatch_buffer_size=n_minibatch,
        num_sgd_iter=config["num_sgd_iter"],
        learner_queue_size=config["learner_queue_size"],
        learner_queue_timeout=config["learner_queue_timeout"])
def make_learner_thread(local_worker, config):
    """Construct the learner thread for this trainer.

    Returns a MultiGPULearnerThread unless `simple_optimizer` is set, in
    which case a plain LearnerThread is built. Raises ValueError when
    fewer tower stacks than minibatch buffers are configured.
    """
    if config["simple_optimizer"]:
        # Simple path: single learner, no multi-GPU tower stacks.
        return LearnerThread(
            local_worker,
            minibatch_buffer_size=config["minibatch_buffer_size"],
            num_sgd_iter=config["num_sgd_iter"],
            learner_queue_size=config["learner_queue_size"],
            learner_queue_timeout=config["learner_queue_timeout"])

    stacks = config["num_multi_gpu_tower_stacks"]
    buffers = config["minibatch_buffer_size"]
    logger.info(
        "Enabling multi-GPU mode, {} GPUs, {} parallel tower-stacks".
        format(config["num_gpus"], stacks))
    # Each minibatch buffer slot must be backed by its own tower stack.
    if stacks < buffers:
        raise ValueError(
            "In multi-GPU mode you must have at least as many "
            "parallel multi-GPU towers as minibatch buffers: "
            "{} vs {}".format(stacks, buffers))
    return MultiGPULearnerThread(
        local_worker,
        num_gpus=config["num_gpus"],
        lr=config["lr"],
        train_batch_size=config["train_batch_size"],
        num_multi_gpu_tower_stacks=stacks,
        minibatch_buffer_size=buffers,
        num_sgd_iter=config["num_sgd_iter"],
        learner_queue_size=config["learner_queue_size"],
        learner_queue_timeout=config["learner_queue_timeout"])
def __init__(
        self,
        local_worker: RolloutWorker,
        num_gpus: int = 1,
        lr=None,  # deprecated.
        train_batch_size: int = 500,
        num_multi_gpu_tower_stacks: int = 1,
        minibatch_buffer_size: int = 1,
        num_sgd_iter: int = 1,
        learner_queue_size: int = 16,
        learner_queue_timeout: int = 300,
        num_data_load_threads: int = 16,
        _fake_gpus: bool = False):
    """Initializes a MultiGPULearnerThread instance.

    Args:
        local_worker (RolloutWorker): Local RolloutWorker holding
            policies this thread will call load_data() and optimizer()
            on.
        num_gpus (int): Number of GPUs to use for data-parallel SGD.
        lr: Deprecated; unused in this constructor (kept for backward
            compatibility of the signature).
        train_batch_size (int): Size of batches (minibatches if
            `num_sgd_iter` > 1) to learn on.
        num_multi_gpu_tower_stacks (int): Number of buffers to
            parallelly load data into on one device. Each buffer is of
            size of `train_batch_size` and hence increases GPU memory
            usage accordingly.
        minibatch_buffer_size (int): Max number of train batches to
            store in the minibatch buffer.
        num_sgd_iter (int): Number of passes to learn on per train
            batch (minibatch if `num_sgd_iter` > 1).
        learner_queue_size (int): Max size of queue of inbound train
            batches to this thread.
        learner_queue_timeout (int): Timeout (sec) forwarded to the
            LearnerThread base class and the MinibatchBuffer.
        num_data_load_threads (int): Number of threads to use to load
            data into GPU memory in parallel.
        _fake_gpus (bool): Not referenced in this constructor —
            presumably consumed by the policy's own device setup; TODO
            confirm.
    """
    LearnerThread.__init__(self, local_worker, minibatch_buffer_size,
                           num_sgd_iter, learner_queue_size,
                           learner_queue_timeout)
    self.train_batch_size = train_batch_size
    # TODO: (sven) Allow multi-GPU to work for multi-agent as well.
    self.policy = self.local_worker.policy_map[DEFAULT_POLICY_ID]
    logger.info("MultiGPULearnerThread devices {}".format(
        self.policy.devices))
    # The train batch must split evenly (and non-trivially) across the
    # policy's devices.
    assert self.train_batch_size % len(self.policy.devices) == 0
    assert self.train_batch_size >= len(self.policy.devices),\
        "batch too small"
    if set(self.local_worker.policy_map.keys()) != {DEFAULT_POLICY_ID}:
        raise NotImplementedError("Multi-gpu mode for multi-agent")
    # Tower stacks cycle between the "idle" queue (free to be loaded
    # with new data) and the "ready" queue (loaded, awaiting SGD).
    self.tower_stack_indices = list(range(num_multi_gpu_tower_stacks))
    self.idle_tower_stacks = queue.Queue()
    self.ready_tower_stacks = queue.Queue()
    # All stacks start out idle.
    for idx in self.tower_stack_indices:
        self.idle_tower_stacks.put(idx)
    # Start the loader threads; only the first shares its stats.
    # NOTE(review): `self.loader_thread` keeps only the LAST thread's
    # handle; earlier threads keep running but are unreferenced.
    for i in range(num_data_load_threads):
        self.loader_thread = _MultiGPULoaderThread(
            self, share_stats=(i == 0))
        self.loader_thread.start()
    # Minibatch buffer draws ready stack indices and replays each one
    # `num_sgd_iter` times.
    self.minibatch_buffer = MinibatchBuffer(self.ready_tower_stacks,
                                            minibatch_buffer_size,
                                            learner_queue_timeout,
                                            num_sgd_iter)
def __init__(self,
             local_worker,
             num_gpus=1,
             lr=0.0005,
             train_batch_size=500,
             num_data_loader_buffers=1,
             minibatch_buffer_size=1,
             num_sgd_iter=1,
             learner_queue_size=16,
             learner_queue_timeout=300,
             num_data_load_threads=16,
             _fake_gpus=False):
    """Initialize a multi-gpu learner thread.

    Arguments:
        local_worker (RolloutWorker): process local rollout worker
            holding policies this thread will call learn_on_batch() on
        num_gpus (int): number of GPUs to use for data-parallel SGD
        lr (float): learning rate for the AdamOptimizer built below
        train_batch_size (int): size of batches to learn on
        num_data_loader_buffers (int): number of buffers to load data
            into in parallel. Each buffer is of size of
            train_batch_size and increases GPU memory usage
            proportionally.
        minibatch_buffer_size (int): max number of train batches to
            store in the minibatching buffer
        num_sgd_iter (int): number of passes to learn on per train
            batch
        learner_queue_size (int): max size of queue of inbound train
            batches to this thread
        learner_queue_timeout (int): timeout (sec) forwarded to the
            LearnerThread base class and the MinibatchBuffer
        num_data_load_threads (int): number of threads to use to load
            data into GPU memory in parallel
        _fake_gpus (bool): if True, use CPU devices named like GPUs
            (debugging without real GPUs)
    """
    LearnerThread.__init__(self, local_worker, minibatch_buffer_size,
                           num_sgd_iter, learner_queue_size,
                           learner_queue_timeout)
    self.lr = lr
    self.train_batch_size = train_batch_size
    # Pick TF device strings: CPU-only, fake (CPU-backed) "GPUs", or
    # real GPUs. A fractional num_gpus is rounded up to whole devices.
    if not num_gpus:
        self.devices = ["/cpu:0"]
    elif _fake_gpus:
        self.devices = [
            "/cpu:{}".format(i) for i in range(int(math.ceil(num_gpus)))
        ]
    else:
        self.devices = [
            "/gpu:{}".format(i) for i in range(int(math.ceil(num_gpus)))
        ]
    logger.info("TFMultiGPULearner devices {}".format(self.devices))
    # The train batch must split evenly (and non-trivially) across
    # devices.
    assert self.train_batch_size % len(self.devices) == 0
    assert self.train_batch_size >= len(self.devices), "batch too small"
    if set(self.local_worker.policy_map.keys()) != {DEFAULT_POLICY_ID}:
        raise NotImplementedError("Multi-gpu mode for multi-agent")
    self.policy = self.local_worker.policy_map[DEFAULT_POLICY_ID]
    # per-GPU graph copies created below must share vars with the policy
    # reuse is set to AUTO_REUSE because Adam nodes are created after
    # all of the device copies are created.
    self.par_opt = []
    with self.local_worker.tf_sess.graph.as_default():
        with self.local_worker.tf_sess.as_default():
            with tf.variable_scope(DEFAULT_POLICY_ID, reuse=tf.AUTO_REUSE):
                # RNN policies additionally feed their state inputs and
                # sequence lengths.
                if self.policy._state_inputs:
                    rnn_inputs = self.policy._state_inputs + [
                        self.policy._seq_lens
                    ]
                else:
                    rnn_inputs = []
                adam = tf.train.AdamOptimizer(self.lr)
                # One parallel optimizer per data-loader buffer, all
                # sharing the same Adam instance and device list.
                for _ in range(num_data_loader_buffers):
                    self.par_opt.append(
                        LocalSyncParallelOptimizer(
                            adam,
                            self.devices,
                            [v for _, v in self.policy._loss_inputs],
                            rnn_inputs,
                            999999,  # it will get rounded down
                            self.policy.copy))

            self.sess = self.local_worker.tf_sess
            self.sess.run(tf.global_variables_initializer())

    # Optimizers cycle between the "idle" queue (free to be loaded with
    # new data) and the "ready" queue (loaded, awaiting SGD).
    self.idle_optimizers = queue.Queue()
    self.ready_optimizers = queue.Queue()
    for opt in self.par_opt:
        self.idle_optimizers.put(opt)
    # Start the loader threads; only the first shares its stats.
    # NOTE(review): `self.loader_thread` keeps only the LAST thread's
    # handle; earlier threads keep running but are unreferenced.
    for i in range(num_data_load_threads):
        self.loader_thread = _LoaderThread(self, share_stats=(i == 0))
        self.loader_thread.start()
    self.minibatch_buffer = MinibatchBuffer(
        self.ready_optimizers, minibatch_buffer_size,
        learner_queue_timeout, num_sgd_iter)