def feed_parallel(self, iterable, num_places=None):
    """
    Convert several mini-batches, one per device.

    This is a generator. For every mini-batch it temporarily points
    ``self.place`` at the matching device, calls ``self.feed`` on the
    mini-batch, restores ``self.place`` and then yields the result.

    Args:
        iterable(list|tuple): the mini-batches, one per device. It must
            support ``len()``.
        num_places(int): forwarded to ``self._get_number_of_places_`` to
            decide how many devices are used. Default None.

    Yields:
        The value returned by ``self.feed`` for each mini-batch, in the
        order of ``iterable``.

    Raises:
        ValueError: if the number of mini-batches differs from the number
            of devices. Because this is a generator, the check runs when
            iteration starts, not when the method is called.
    """
    device_count = self._get_number_of_places_(num_places)
    # six exposes its renamed builtins under ``six.moves``.
    if isinstance(self.place, core.CUDAPlace):
        places = [core.CUDAPlace(i) for i in six.moves.xrange(device_count)]
    else:
        places = [core.CPUPlace() for _ in six.moves.xrange(device_count)]

    if len(iterable) != len(places):
        raise ValueError("feed_parallel takes multiple mini-batches. Each "
                         "mini-batch will be feed on each device. The "
                         "number of devices and number of mini-batches "
                         "must be same.")

    original_place = self.place
    for device, batch in six.moves.zip(places, iterable):
        self.place = device
        try:
            converted = self.feed(batch)
        finally:
            # Restore before yielding so an exception in feed() or an
            # abandoned generator never leaves self.place on a device.
            self.place = original_place
        yield converted
def feed_parallel(self, iterable, num_places=None):
    """
    Takes multiple mini-batches. Each mini-batch will be fed on each
    device in advance.

    This is a generator. For every mini-batch it temporarily points
    ``self.place`` at the matching device, calls ``self.feed`` on the
    mini-batch, restores ``self.place`` and then yields the result.

    Args:
        iterable(list|tuple): the input data, one mini-batch per device.
            It must support ``len()``.
        num_places(int): the number of devices, forwarded to
            ``self._get_number_of_places_``. Default None.

    Yields:
        The result of ``self.feed`` for each mini-batch, in the order of
        ``iterable``.

    Raises:
        ValueError: if the number of devices and the number of
            mini-batches differ. Because this is a generator, the check
            runs when iteration starts, not when the method is called.

    Notes:
        The number of devices and number of mini-batches must be same.
    """
    device_count = self._get_number_of_places_(num_places)
    # six exposes its renamed builtins under ``six.moves``.
    if isinstance(self.place, core.CUDAPlace):
        places = [core.CUDAPlace(i) for i in six.moves.xrange(device_count)]
    else:
        places = [core.CPUPlace() for _ in six.moves.xrange(device_count)]

    if len(iterable) != len(places):
        raise ValueError("feed_parallel takes multiple mini-batches. Each "
                         "mini-batch will be feed on each device. The "
                         "number of devices and number of mini-batches "
                         "must be same.")

    original_place = self.place
    for device, batch in six.moves.zip(places, iterable):
        self.place = device
        try:
            converted = self.feed(batch)
        finally:
            # Restore before yielding so an exception in feed() or an
            # abandoned generator never leaves self.place on a device.
            self.place = original_place
        yield converted
def check_and_get_place(place):
    """
    Validate a user-supplied place, or pick a default one.

    Args:
        place(None|core.CUDAPlace|core.CPUPlace): the place that trainer
            will be executed on.

    Raises:
        TypeError: if ``place`` is not None and is neither a CUDAPlace nor
            a CPUPlace.

    Returns:
        ``place`` itself when it is not None. Otherwise ``CUDAPlace(0)`` if
        fluid is compiled with CUDA, else ``CPUPlace()``.
    """
    if place is not None:
        # An explicit place is returned untouched once its type is accepted.
        supported = (isinstance(place, core.CUDAPlace) or
                     isinstance(place, core.CPUPlace))
        if not supported:
            raise TypeError("Place should be either CUDAPlace or CPUPlace")
        return place

    # No place given: prefer the first GPU when CUDA support is compiled in.
    if core.is_compiled_with_cuda():
        return core.CUDAPlace(0)
    return core.CPUPlace()
def __init__(self,
             use_cuda,
             loss_name=None,
             main_program=None,
             share_vars_from=None,
             exec_strategy=None,
             build_strategy=None,
             num_trainers=1,
             trainer_id=0,
             **kwargs):
    """
    ParallelExecutor can run program in parallel.

    Args:
        use_cuda(bool): Whether to use CUDA or not. When true, one place
            is created per CUDA device; otherwise one CPU place is
            created per CPU core reported by ``multiprocessing``.
        loss_name(str, default None): The loss name must set in training.
        main_program(Program, default None): The program that need to run,
            if not provided, then default_main_program will be used.
        share_vars_from(ParallelExecutor, default None): If provided,
            it will share variables from the specified ParallelExecutor.
        exec_strategy(ExecutionStrategy, default None): If not provided,
            a fresh ExecutionStrategy is created. Note that the object is
            modified in place: ``use_event`` is overwritten according to
            ``use_cuda`` and ``num_threads`` is filled in when it is 0.
        build_strategy(BuildStrategy, default None): If not provided, a
            fresh BuildStrategy is created.
        num_trainers(int, default 1): If greater than 1, NCCL will be
            initialized with multiple rank of nodes, each node should have
            same number of GPUs. Distributed training will be enabled then.
        trainer_id(int, default 0): Must use together with num_trainers.
            trainer_id is the "rank" of current node starts from 0.
        **kwargs: Deprecated. Passing any extra keyword argument raises
            ValueError with a hint on which strategy object to use.

    Returns:
        A ParallelExecutor object.

    Raises:
        ValueError: If any extra keyword argument is passed.
        TypeError: If share_vars_from is provided, but not ParallelExecutor
            object.

    Examples:

        .. code-block:: python

          train_exe = fluid.ParallelExecutor(
              use_cuda=True, loss_name=loss.name)
          test_exe = fluid.ParallelExecutor(
              use_cuda=True,
              main_program=test_program,
              share_vars_from=train_exe)

          train_loss, = train_exe.run([loss.name], feed=feed_dict)
          test_loss, = test_exe.run([loss.name], feed=feed_dict)
    """
    if kwargs:
        # Look the attribute lists up once instead of once per key.
        exec_strategy_attrs = dir(ExecutionStrategy)
        build_strategy_attrs = dir(BuildStrategy)
        err_msg = ""
        for key in kwargs:
            if key in exec_strategy_attrs:
                err_msg += \
                    "Setting {0} by constructor is deprecated. Use " \
                    "strategy=ExecutionStrategy(); strategy.{0}=xxx; " \
                    "pe=ParallelExecutor(exec_strategy=strategy) " \
                    "instead.\n ".format(key)
            elif key in build_strategy_attrs:
                err_msg += \
                    "Setting {0} by constructor is deprecated. Use " \
                    "strategy=BuildStrategy(); See help(" \
                    "paddle.fluid.ParallelExecutor.BuildStrategy) \n".format(
                        key)
            else:
                err_msg += \
                    "Setting {0} by constructor is deprecated. " \
                    "Use strategy.\n".format(key)
        raise ValueError(err_msg)

    # ``range`` (not ``xrange``) so this also runs on Python 3.
    if use_cuda:
        self._act_places = [
            core.CUDAPlace(i) for i in range(core.get_cuda_device_count())
        ]
    else:
        self._act_places = [
            core.CPUPlace() for _ in range(multiprocessing.cpu_count())
        ]

    # Wrap every concrete place in a generic core.Place for the executor.
    self._places = []
    for act_place in self._act_places:
        p = core.Place()
        p.set_place(act_place)
        self._places.append(p)
    assert self._places, "no place for execution"

    if exec_strategy is None:
        exec_strategy = ExecutionStrategy()
    exec_strategy.use_event = True if use_cuda else False

    # num_threads == 0 means "not set": derive it from the place count.
    if exec_strategy.num_threads == 0:
        if use_cuda:
            # Experiments on se-resnext shows that too many threads hurt
            # performance. Worth tunning for other models in the future.
            exec_strategy.num_threads = len(self._places) * 2
        else:
            exec_strategy.num_threads = min(
                len(self._places) * 2, multiprocessing.cpu_count())

    if build_strategy is None:
        build_strategy = BuildStrategy()

    main = main_program if main_program else framework.default_main_program()
    scope = executor.global_scope()

    if share_vars_from and not isinstance(share_vars_from,
                                          ParallelExecutor):
        raise TypeError("share_vars_from must be ParallelExecutor.")
    local_scopes = share_vars_from.executor.local_scopes(
    ) if share_vars_from else []

    # Names of persistable variables, skipping those of RAW type.
    self.persistable_vars = [
        var.name for var in main.list_vars()
        if var.persistable and var.type != core.VarDesc.VarType.RAW
    ]

    # Names of parameters that do not have stop_gradient set.
    trainable_param_names = {
        param.name
        for param in main.global_block().iter_parameters()
        if not param.stop_gradient
    }

    self.executor = core.ParallelExecutor(
        self._places, trainable_param_names,
        set(self.persistable_vars), main.desc, loss_name
        if loss_name else '', scope, local_scopes, exec_strategy,
        build_strategy, num_trainers, trainer_id)
    self.scope = scope