Example 1
def set_gradient_clip(clip, param_list=None, program=None):
    """
        To specify parameters that require gradient clip.
        Args:
            clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr, 
                    which describes the type and detailed attributes of required gradient clip.
            param_list(list, None by default): Parameters that require gradient clip. 
                    It can be a list of parameter or a list of parameter's name. 
                    When it's None, all parameters in the program will be included. 
            program(Program, None by default): The program where parameters are. 
                    Will be the default main program when assigned with None.
    """
    if not isinstance(clip, BaseGradientClipAttr):
        raise TypeError(
            "'clip' should be an instance of BaseGradientClipAttr's derived class"
        )
    if program is None:
        program = framework.default_main_program()
    if param_list is None:
        param_list = program.block(0).all_parameters()
    if all(isinstance(elem, basestring) for elem in param_list):
        param_list = [program.block(0).var(elem) for elem in param_list]
    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
        raise TypeError(
            "'param_list' should be a list of Parameter or basestring(parameter's name)."
        )

    for param in param_list:
        param.gradient_clip_attr = copy.deepcopy(clip)
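
A minimal usage sketch for the function above, assuming (as in PaddlePaddle) that a GradientClipByGlobalNorm subclass of BaseGradientClipAttr is exposed under fluid.clip; the parameter names passed in param_list are hypothetical:

import paddle.fluid as fluid

clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
# Clip every parameter in the default main program ...
fluid.clip.set_gradient_clip(clip)
# ... or only the named ones (hypothetical parameter names).
fluid.clip.set_gradient_clip(clip, param_list=["fc_0.w_0", "fc_0.b_0"])
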
Example 2
    def __init__(self, feed_list, place, program=None):
        self.feed_dtypes = []
        self.feed_names = []
        self.feed_shapes = []
        self.feed_lod_level = []
        if program is None:
            program = default_main_program()
        for each_var in feed_list:
            if isinstance(each_var, basestring):
                each_var = program.block(0).var(each_var)
            if not isinstance(each_var, Variable):
                raise TypeError("Feed list should contain a list of variable")
            self.feed_dtypes.append(each_var.dtype)
            self.feed_names.append(each_var.name)
            shape = each_var.shape
            batch_size_dim = -1
            for i, s in enumerate(shape):
                if s < 0:
                    batch_size_dim = i
                    break
            if batch_size_dim == -1:
                raise ValueError("Variable {0} must has a batch size dimension",
                                 each_var.name)
            self.feed_lod_level.append(each_var.lod_level)
            self.feed_shapes.append(shape)

        self.place = place
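
A usage sketch for the constructor above, assuming it is exposed as fluid.DataFeeder (as in PaddlePaddle); the variable names are illustrative. Note that layers.data creates variables with a -1 batch dimension, which satisfies the batch_size_dim check above:

import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder(feed_list=[image, label], place=fluid.CPUPlace())
# feeder.feed(minibatch) would convert a list of (image, label) tuples into
# the {name: LoDTensor} dict that Executor.run expects as its feed argument.
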
Example 5
    def run(self,
            program=None,
            feed=None,
            fetch_list=None,
            feed_var_name='feed',
            fetch_var_name='fetch',
            scope=None,
            return_numpy=True):
        if feed is None:
            feed = {}
        if fetch_list is None:
            fetch_list = []

        if program is None:
            program = default_main_program()

        if not isinstance(program, Program):
            raise TypeError("program should be a Program instance")

        if scope is None:
            scope = g_scope

        program = program.clone()
        global_block = program.global_block()
        feed_var = global_block.create_var(
            name=feed_var_name,
            type=core.VarDesc.VarType.FEED_MINIBATCH,
            persistable=True)

        for i, name in enumerate(feed):
            out = global_block.var(name)
            global_block.prepend_op('feed',
                                    inputs={'X': [feed_var]},
                                    outputs={'Out': [out]},
                                    attrs={'col': i})
            cur_feed = feed[name]
            if not isinstance(cur_feed, core.LoDTensor):
                cur_feed = self.aslodtensor(cur_feed)
            core.set_feed_variable(scope, cur_feed, feed_var.name, i)

        fetch_var = global_block.create_var(
            name=fetch_var_name,
            type=core.VarDesc.VarType.FETCH_LIST,
            persistable=True)
        for i, var in enumerate(fetch_list):
            global_block.append_op(type='fetch',
                                   inputs={'X': [var]},
                                   outputs={'Out': [fetch_var]},
                                   attrs={'col': i})

        self.executor.run(program.desc, scope, 0, True)
        outs = [
            core.get_fetch_variable(scope, fetch_var_name, i)
            for i in xrange(len(fetch_list))
        ]

        if return_numpy:
            outs = as_numpy(outs)
        return outs
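
A minimal call sketch for this early run(); the names (loss, the 'X' feed key) are assumed to come from a previously built network, and the Executor is assumed to be constructed from a single place in this version:

import numpy

exe = Executor(core.CPUPlace())
x = numpy.random.random((32, 1)).astype('float32')
# run() clones the program, prepends one feed op per feed key and appends
# one fetch op per fetch target, then executes the whole cloned program.
loss_value, = exe.run(feed={'X': x}, fetch_list=[loss])
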
Example 6
    def global_learning_rate(self, program=None):
        """
        Get the global decayed learning rate.
        :return: the learning rate Variable for the given program, or None.
        """
        if program is None:
            program = framework.default_main_program()
        return self._learning_rate_map.get(program, None)
    def transpile(self,
                  optimize_ops,
                  params_grads,
                  program=None,
                  pservers="127.0.0.1:6174",
                  trainers=1,
                  split_method=round_robin):
        """
            Transpile the program to a distributed data-parallelism programs.

            The main_program will be transform to use a remote parameter server
            to do parameter optimization. And the optimization graph will be put
            in to a parameter server program.

            Use different methods to split trainable varialbles to different
            parameter servers.

            Example to run:

            exe = fluid.Executor(place)
            t = fluid.DistributeTranspiler()
            t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)

            pserver_endpoint = os.getenv("PSERVER")
            if pserver_endpoint:
                pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops)
                exe.run(fluid.default_startup_program())
                exe.run(pserver_prog)
            else:
                feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
                exe.run(fluid.default_startup_program())

                for pass_id in range(PASS_NUM):
                    ...

            :param optimize_ops: op list of optimization, should be the
                                 return value of Optimizer.minimize
            :type optimize_ops: list
            :param program: the program to optimize; defaults to default_main_program.
            :param pservers: parameter server endpoints like "m1:6174,m2:6174"
            :type pservers: string

            :return: a list of programs
        """
        if program is None:
            program = default_main_program()
        self.program = program
        self.trainers = trainers
        self.optimize_ops = optimize_ops
        self._optimize_distributed(optimize_ops,
                                   program,
                                   params_grads,
                                   pservers=pservers,
                                   trainers=trainers,
                                   split_method=split_method)
Example 8
    def create_optimization_pass(self,
                                 parameters_and_grads,
                                 loss,
                                 startup_program=None):
        """Add optimization operators to update gradients to variables.

        Args:
          loss: the target that this optimization is for.
          parameters_and_grads: a list of (variable, gradient) pair to update.

        Returns:
          return_op_list: a list of operators that will complete one step of
          optimization. This will include parameter update ops, global step
          update ops and any other custom ops required by subclasses to manage
          their internal state.
          :param startup_program:
        """
        # This is a default implementation of create_optimization_pass that
        # can be shared by most optimizers. This implementation assumes that
        # the subclass will implement the _append_optimize_op method and the
        #  _initialize_tensors method. The subclass can extend the
        # _create_accumulators method if it needs to create accumulators
        # for parameters and extend _finish_update method to add custom ops.

        # Create any accumulators
        program = loss.block.program
        self._dtype = loss.dtype
        with program_guard(program, startup_program):
            global_block = framework.default_main_program().global_block()
            start = len(global_block.ops)
            self.helper = LayerHelper(self.__class__.__name__)
            self._create_accumulators(loss.block,
                                      [p[0] for p in parameters_and_grads])
            self._create_global_learning_rate()

            optimize_ops = []
            for param_and_grad in parameters_and_grads:
                with param_and_grad[0].block.program.optimized_guard(
                        param_and_grad[0]):
                    if param_and_grad[0].trainable is True and param_and_grad[
                            1] is not None:
                        optimize_op = self._append_optimize_op(
                            loss.block, param_and_grad)
                        optimize_ops.append(optimize_op)

            # Get custom finish ops for subclasses
            # FIXME: Need to fix this once we figure out how to handle dependencies
            self._finish_update(loss.block)

            end = len(global_block.ops)
            return global_block.slice_ops(start, end)
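
The comment above says each subclass supplies _append_optimize_op. A sketch of such an override, modeled on a plain SGD update; _create_param_lr is assumed to be the base-class helper that returns the learning rate variable to use for a given parameter:

    def _append_optimize_op(self, block, param_and_grad):
        # Append one 'sgd' op that reads Param/Grad/LearningRate and writes
        # the updated value back into the parameter in place.
        sgd_op = block.append_op(
            type="sgd",
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "LearningRate": self._create_param_lr(param_and_grad)
            },
            outputs={"ParamOut": param_and_grad[0]})
        return sgd_op
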
Example 9
    def __init__(self, learning_rate, regularization=None):
        if not isinstance(learning_rate, float) and \
                not isinstance(learning_rate, framework.Variable):
            raise TypeError("learning rate should be float or Variable")
        self.regularization = regularization
        self._learning_rate = learning_rate
        # each program should have an independent learning rate
        # program -> Variable(learning_rate)
        self._learning_rate_map = dict()
        if isinstance(self._learning_rate, framework.Variable):
            self._learning_rate_map[
                framework.default_main_program()] = self._learning_rate
        # Dictionary of accumulators. Some optimizer subclasses need to
        # allocate and manage extra variables associated with the parameters
        # to train. These variables are called accumulators.
        # {accum_name : { parameter_name : accumulator_for_parameter, ...}, ...}
        self._accumulators = defaultdict(lambda: dict())
        self.helper = None
Example 10
    def _create_global_learning_rate(self):
        lr = self.global_learning_rate()

        if isinstance(lr, framework.Variable):
            return
        else:
            if not isinstance(self._learning_rate, float):
                raise TypeError(
                    "learning rate variable was created outside the optimizer; "
                    "cannot create a new learning rate variable for a new program"
                )

        # create learning rate in the current main program
        self._learning_rate_map[
            framework.default_main_program()] = layers.create_global_var(
                name=unique_name.generate("learning_rate"),
                shape=[1],
                value=float(self._learning_rate),
                dtype='float32',
                persistable=True)
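
A short sketch of the two ways a learning rate can reach this code path: a plain float, for which the branch above creates the global variable lazily, or a Variable built up front, which __init__ binds to the default main program. Only APIs already used above are assumed; SGDOptimizer stands for any concrete Optimizer subclass:

lr_var = layers.create_global_var(
    name=unique_name.generate("custom_lr"), shape=[1], value=0.1,
    dtype='float32', persistable=True)
opt_a = SGDOptimizer(learning_rate=0.01)    # float: variable created lazily
opt_b = SGDOptimizer(learning_rate=lr_var)  # Variable: bound at construction
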
Example 11
    def __init__(self,
                 average_window_rate,
                 params_grads=None,
                 min_average_window=10000,
                 max_average_window=10000,
                 **kwargs):
        super(ModelAverage, self).__init__(0.0, **kwargs)
        self.average_window = average_window_rate
        self.min_average_window = min_average_window
        self.max_average_window = max_average_window

        self.params_grads = [] if params_grads is None else params_grads
        params = {}
        for param, grad in self.params_grads:
            if param.do_model_average != False:
                params[param.name] = (param, grad)
        for param in framework.default_main_program().global_block(
        ).all_parameters():
            if param.name not in params and param.do_model_average != False:
                grad = param.block.create_var(
                    name=unique_name.generate(".".join([param.name, 'tmp'])),
                    dtype=param.dtype,
                    persistable=False,
                    stop_gradient=True)
                params[param.name] = (param, grad)
        self.params_grads = params.values()

        for param, grad in self.params_grads:
            self._append_average_accumulate_op(param)

        self.apply_program = Program()
        block = self.apply_program.global_block()
        with program_guard(main_program=self.apply_program):
            for param_grad in self.params_grads:
                self._add_average_apply_op(block, param_grad)

        self.restore_program = Program()
        block = self.restore_program.global_block()
        with program_guard(main_program=self.restore_program):
            for param_grad in self.params_grads:
                self._add_average_restore_op(block, param_grad)
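
A usage sketch, assuming (as in PaddlePaddle) this class is exposed as fluid.optimizer.ModelAverage and offers an apply(executor) context manager that runs apply_program on entry and restore_program on exit; exe, test_program, feed and loss are illustrative names:

model_average = fluid.optimizer.ModelAverage(
    0.15, min_average_window=10000, max_average_window=20000)
with model_average.apply(exe):
    # Inside this block the parameters are temporarily replaced by their
    # running averages (apply_program); they are restored on exit
    # (restore_program). Typically used for evaluation.
    test_loss, = exe.run(test_program, feed=feed, fetch_list=[loss])
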
Example 12
    def __append_optimize_op__(op, block, grad_to_block_id):
        if self._is_opt_op(op):
            self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
                                     default_main_program())
        else:
            self._append_pserver_non_opt_ops(block, op)
Example 13
    def main_program(self):
        prog = self.kwargs.get('main_program', None)
        if prog is None:
            return default_main_program()
        else:
            return prog
Example 14
    def run(self,
            program=None,
            feed=None,
            fetch_list=None,
            feed_var_name='feed',
            fetch_var_name='fetch',
            scope=None,
            return_numpy=True,
            use_program_cache=False):
        """ Run program by this Executor. Feed data by feed map, fetch result by fetch_list.

        Python executor takes a program, add feed operators and fetch operators to this program according
        to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
        the variables(or names) that user want to get after program run. Note: the executor will run all
        operators in the program but not only the operators dependent by the fetch_list

        :param program: the program that need to run, if not provied, then default_main_program will be used.
        :param feed: feed variable map, e.g. {"image": ImageData, "label": LableData}
        :param fetch_list: a list of variable or variable names that user want to get, run will return them according
        to this list.
        :param feed_var_name: the name for the input variable of feed Operator.
        :param fetch_var_name: the name for the output variable of feed Operator.
        :param scope: the scope used to run this program, you can switch it to different scope. default is global_scope
        :param return_numpy: if convert the fetched tensor to numpy
        :param use_program_cache: set use_program_cache to true if program not changed compare to the last step.
        :return: result according to fetch_list.
        """
        if feed is None:
            feed = {}
        if not isinstance(feed, dict):
            raise TypeError(
                "feed requires dict as its Parameter. But you passed in %s" %
                (type(feed)))
        if fetch_list is None:
            fetch_list = []
        if program is None:
            program = default_main_program()

        if not isinstance(program, Program):
            raise TypeError(
                "Executor requires Program as its Parameter. But you passed in %s"
                % (type(program)))

        if scope is None:
            scope = global_scope()

        cache_key = get_program_cache_key(feed, fetch_list)
        if use_program_cache:
            cached_program = self._get_program_cache(cache_key)
            if cached_program is None:
                cached_program = self._add_feed_fetch_ops(
                    program=program,
                    feed=feed,
                    fetch_list=fetch_list,
                    feed_var_name=feed_var_name,
                    fetch_var_name=fetch_var_name)
                self._add_program_cache(cache_key, cached_program)
            program = cached_program
        else:
            self.program_caches.pop(cache_key, None)
            program = self._add_feed_fetch_ops(
                program=program,
                feed=feed,
                fetch_list=fetch_list,
                feed_var_name=feed_var_name,
                fetch_var_name=fetch_var_name)

        self._feed_data(program, feed, feed_var_name, scope)
        self.executor.run(program.desc, scope, 0, True, True)
        outs = self._fetch_data(fetch_list, fetch_var_name, scope)
        if return_numpy:
            outs = as_numpy(outs)
        return outs
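
A sketch of the caching path (the loop and names are illustrative): when the feed keys and the fetch_list are identical across steps, use_program_cache=True lets the executor reuse the program that already has feed/fetch ops inserted instead of rebuilding it on every call:

for x in train_batches:
    loss_value, = exe.run(feed={'X': x},
                          fetch_list=[loss],
                          use_program_cache=True)
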
Example 16
    def run(self,
            program=None,
            feed=None,
            fetch_list=None,
            feed_var_name='feed',
            fetch_var_name='fetch',
            scope=None,
            return_numpy=True,
            use_program_cache=False):
        """
        Run program by this Executor. Feed data by feed map, fetch result by fetch_list.
        Python executor takes a program, add feed operators and fetch operators to this program according
        to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
        the variables(or names) that user want to get after program run.

        Note: the executor will run all
        operators in the program but not only the operators dependent by the fetch_list

        Args:
            program(Program): the program that need to run, if not provied, then default_main_program will be used.
            feed(dict): feed variable map, e.g. {"image": ImageData, "label": LableData}
            fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list.
            feed_var_name(str): the name for the input variable of feed Operator.
            fetch_var_name(str): the name for the output variable of fetch Operator.
            scope(Scope): the scope used to run this program, you can switch it to different scope. default is global_scope
            return_numpy(bool): if convert the fetched tensor to numpy
            use_program_cache(bool): set use_program_cache to true if program not changed compare to the last step.

        Returns:

            list(numpy.array): fetch result according to fetch_list.


        Examples:

            >>> data = layers.data(name='X', shape=[1], dtype='float32')
            >>> out = layers.create_tensor(dtype='float32')
            >>> hidden = layers.fc(input=data, size=10)
            >>> layers.assign(hidden, out)
            >>> loss = layers.mean(out)
            >>> adam = fluid.optimizer.Adam()
            >>> adam.minimize(loss)

            >>> cpu = core.CPUPlace()
            >>> exe = Executor(cpu)
            >>> exe.run(default_startup_program())

            >>> x = numpy.random.random(size=(10, 1)).astype('float32')
            >>> outs = exe.run(
            >>>     feed={'X': x},
            >>>     fetch_list=[loss.name])
        """
        if feed is None:
            feed = {}
        if not isinstance(feed, dict):
            raise TypeError(
                "feed requires dict as its Parameter. But you passed in %s" %
                (type(feed)))
        if fetch_list is None:
            fetch_list = []
        if program is None:
            program = default_main_program()

        if not isinstance(program, Program):
            raise TypeError(
                "Executor requires Program as its Parameter. But you passed in %s"
                % (type(program)))

        if scope is None:
            scope = global_scope()

        cache_key = _get_program_cache_key(feed, fetch_list)
        if use_program_cache:
            cached_program = self._get_program_cache(cache_key)
            if cached_program is None:
                cached_program = self._add_feed_fetch_ops(
                    program=program,
                    feed=feed,
                    fetch_list=fetch_list,
                    feed_var_name=feed_var_name,
                    fetch_var_name=fetch_var_name)
                self._add_program_cache(cache_key, cached_program)
            program = cached_program
        else:
            self.program_caches.pop(cache_key, None)
            program = self._add_feed_fetch_ops(program=program,
                                               feed=feed,
                                               fetch_list=fetch_list,
                                               feed_var_name=feed_var_name,
                                               fetch_var_name=fetch_var_name)

        self._feed_data(program, feed, feed_var_name, scope)
        self.executor.run(program.desc, scope, 0, True, True)
        outs = self._fetch_data(fetch_list, fetch_var_name, scope)
        if return_numpy:
            outs = as_numpy(outs)
        return outs
Example 17
    def __init__(self,
                 use_cuda,
                 loss_name=None,
                 main_program=None,
                 share_vars_from=None,
                 exec_strategy=None,
                 build_strategy=None,
                 num_trainers=1,
                 trainer_id=0,
                 **kwargs):
        """
        ParallelExecutor can run program in parallel.

        Args:
            use_cuda(bool): Whether to use CUDA or not.
            loss_name(str, default None): The loss name must set in training.
            main_program(Program, default None): The program that need to run,
                if not provided, then default_main_program will be used.
            share_vars_from(ParallelExecutor, default None): If provied,
                it will share variables from the specified ParallelExecutor.
            num_trainers(int, default 1): If greater than 1, NCCL will be
                initialized with multpile rank of nodes, each node should have
                same number of GPUs. Distributed training will be enabled then.
            trainer_id(int, default 0): Must use together with num_trainers.
                trainer_id is the "rank" of current node starts from 0.

        Returns:
            A ParallelExecutor object.

        Raises:
            TypeError: If share_vars_from is provided, but not ParallelExecutor
                object.

        Examples:
            .. code-block:: python

              train_exe = fluid.ParallelExecutor(
                  use_cuda=True, loss_name=loss.name)
              test_exe = fluid.ParallelExecutor(
                  use_cuda=True,
                  main_program=test_program,
                  share_vars_from=train_exe)

              train_loss, = train_exe.run([loss.name], feed=feed_dict)
              test_loss, = test_exe.run([loss.name], feed=feed_dict)
        """
        if len(kwargs) != 0:
            err_msg = ""
            for key in kwargs:
                if key in dir(ExecutionStrategy):
                    err_msg += \
                        "Setting {0} by constructor is deprecated. Use " \
                        "strategy=ExecutionStrategy(); strategy.{0}=xxx; " \
                        "pe=ParallelExecutor(exec_strategy=strategy) " \
                        "instead.\n ".format(key)
                elif key in dir(BuildStrategy):
                    err_msg += \
                        "Setting {0} by constructor is deprecated. Use " \
                        "strategy=BuildStrategy(); See help(" \
                        "paddle.fluid.ParallelExecutor.BuildStrategy) \n".format(
                            key)
                else:
                    err_msg += "Setting {0} by constructor is deprecated. Use strategy.\n".format(
                        key)
            raise ValueError(err_msg)

        self._places = []
        self._act_places = []
        if use_cuda:
            for i in xrange(core.get_cuda_device_count()):
                p = core.Place()
                self._act_places.append(core.CUDAPlace(i))
                p.set_place(self._act_places[-1])
                self._places.append(p)
        else:
            for i in xrange(multiprocessing.cpu_count()):
                p = core.Place()
                self._act_places.append(core.CPUPlace())
                p.set_place(self._act_places[-1])
                self._places.append(p)
        assert self._places, "no place for execution"

        if exec_strategy is None:
            exec_strategy = ExecutionStrategy()
            if use_cuda:
                exec_strategy.use_event = True
            else:
                exec_strategy.use_event = False

        if exec_strategy.num_threads == 0:
            if use_cuda:
                # Experiments on se-resnext show that too many threads hurt
                # performance. Worth tuning for other models in the future.
                exec_strategy.num_threads = len(self._places) * 2
            else:
                exec_strategy.num_threads = min(
                    len(self._places) * 2, multiprocessing.cpu_count())

        if build_strategy is None:
            build_strategy = BuildStrategy()

        main = main_program
        main = main if main else framework.default_main_program()
        scope = executor.global_scope()

        if share_vars_from and not isinstance(share_vars_from,
                                              ParallelExecutor):
            raise TypeError("share_vars_from must be ParallelExecutor.")
        local_scopes = share_vars_from.executor.local_scopes(
        ) if share_vars_from else []

        self.persistable_vars = [
            v.name for v in filter(
                lambda var: var.persistable and var.type != core.VarDesc.
                VarType.RAW, main.list_vars())
        ]

        self.executor = core.ParallelExecutor(
            self._places,
            set([
                p.name for p in main.global_block().iter_parameters()
                if not p.stop_gradient
            ]), set(self.persistable_vars), main.desc,
            loss_name if loss_name else '', scope, local_scopes, exec_strategy,
            build_strategy, num_trainers, trainer_id)
        self.scope = scope
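
A sketch of the non-deprecated configuration path that err_msg above points users to; num_threads is the attribute the constructor itself inspects, and loss is an illustrative name:

exec_strategy = ExecutionStrategy()
exec_strategy.num_threads = 4
build_strategy = BuildStrategy()
pe = ParallelExecutor(use_cuda=True,
                      loss_name=loss.name,
                      exec_strategy=exec_strategy,
                      build_strategy=build_strategy)
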
Example 18
    def run(self,
            program=None,
            feed=None,
            fetch_list=None,
            feed_var_name='feed',
            fetch_var_name='fetch',
            scope=None,
            return_numpy=True,
            use_program_cache=False):
        """ Run program by this Executor. Feed data by feed map, fetch result by fetch_list.

        Python executor takes a program, add feed operators and fetch operators to this program according
        to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
        the variables(or names) that user want to get after program run. Note: the executor will run all
        operators in the program but not only the operators dependent by the fetch_list

        :param program: the program that need to run, if not provied, then default_main_program will be used.
        :param feed: feed variable map, e.g. {"image": ImageData, "label": LableData}
        :param fetch_list: a list of variable or variable names that user want to get, run will return them according
        to this list.
        :param feed_var_name: the name for the input variable of feed Operator.
        :param fetch_var_name: the name for the output variable of feed Operator.
        :param scope: the scope used to run this program, you can switch it to different scope. default is global_scope
        :param return_numpy: if convert the fetched tensor to numpy
        :param use_program_cache: set use_program_cache to true if program not changed compare to the last step.
        :return: result according to fetch_list.
        """
        if feed is None:
            feed = {}
        if not isinstance(feed, dict):
            raise TypeError("feed should be a map")
        if fetch_list is None:
            fetch_list = []
        if program is None:
            program = default_main_program()

        if not isinstance(program, Program):
            raise TypeError("program should be a Program instance")

        if scope is None:
            scope = global_scope()

        cache_key = get_program_cache_key(feed, fetch_list)
        if use_program_cache:
            cached_program = self._get_program_cache(cache_key)
            if cached_program is None:
                cached_program = self._add_feed_fetch_ops(
                    program=program,
                    feed=feed,
                    fetch_list=fetch_list,
                    feed_var_name=feed_var_name,
                    fetch_var_name=fetch_var_name)
                self._add_program_cache(cache_key, cached_program)
            program = cached_program
        else:
            self.program_caches.pop(cache_key, None)
            program = self._add_feed_fetch_ops(program=program,
                                               feed=feed,
                                               fetch_list=fetch_list,
                                               feed_var_name=feed_var_name,
                                               fetch_var_name=fetch_var_name)

        self._feed_data(program, feed, feed_var_name, scope)
        self.executor.run(program.desc, scope, 0, True, True)
        outs = self._fetch_data(fetch_list, fetch_var_name, scope)
        if return_numpy:
            outs = as_numpy(outs)
        return outs
Example 19
    def main_program(self):
        return default_main_program()
Example 22
    def transpile(self,
                  trainer_id,
                  program=None,
                  pservers="127.0.0.1:6174",
                  trainers=1,
                  split_method=splitter.round_robin,
                  sync_mode=True):
        """
            Transpile the program to distributed data-parallelism programs.
            The main_program will be transformed to use a remote parameter server
            to do parameter optimization. And the optimization graph will be put
            into a parameter server program.

            Use different methods to split trainable variables to different
            parameter servers.

            Steps to transpile trainer:
            1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
            2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
            3. modify trainer program add split_op to each grad variable.
            4. append send_op to send splited variables to server and fetch
               params(splited blocks or origin param) from server.
            5. append concat_op to merge splited blocks to update local weights.

            Steps to transpile pserver:
            1. create new program for parameter server.
            2. create params and grad variables that assigned to current server instance.
            3. create a sub-block in the server side program
            4. append ops that should run on current server instance.
            5. add listen_and_serv op

            :param trainer_id: one unique id for each trainer in a job.
            :type trainer_id: int
            :param program: program to transpile, default is default_main_program
            :type program: Program
            :param pservers: parameter server endpoints like "m1:6174,m2:6174"
            :type pservers: string
            :param trainers: total number of workers/trainers in the job
            :type trainers: int
            :param split_method: A function to determin how to split variables
                to different servers equally.
            :type split_method: function
            :type sync_mode: boolean default True
            :param sync_mode: if sync_mode is set True, it means that dist transpiler
            will transpile the program into sync_mode pserver and trainer program.
        """
        assert (callable(split_method))
        if program is None:
            program = default_main_program()
        self.origin_program = program
        self.trainer_num = trainers
        self.sync_mode = sync_mode
        # TODO(typhoonzero): currently trainer_id is fetched from cluster system
        # like Kubernetes, we should port this to use etcd later when developing
        # fluid distributed training with fault-tolerance.
        self.trainer_id = trainer_id
        pserver_endpoints = pservers.split(",")
        self.pserver_endpoints = pserver_endpoints
        self.optimize_ops, params_grads = self._get_optimize_pass()

        # process lookup_table_op
        # 1. check all lookup_table_op is distributed
        # 2. check all lookup_table_op share the same table.
        distributed_lookup_table_ops = []
        # support only one distributed_lookup_table now
        self.table_name = None
        for op in program.global_block().ops:
            if op.type == LOOKUP_TABLE_TYPE:
                if op.attrs['is_distributed'] is True:
                    if self.table_name is None:
                        self.table_name = op.input("W")[0]
                    if self.table_name != op.input("W")[0]:
                        raise RuntimeError("all distributed lookup_table_ops"
                                           " should have only one table")
                    distributed_lookup_table_ops.append(op)
                else:
                    if self.table_name is not None:
                        assert op.input("W")[0] != self.table_name

        self.has_distributed_lookup_table = len(
            distributed_lookup_table_ops) > 0

        # step1: For large parameters and gradients, split them into smaller
        # blocks.
        param_list = []
        grad_list = []
        for p, g in params_grads:
            # skip parameter marked not trainable
            if isinstance(p, Parameter) and p.trainable is False:
                continue
            param_list.append(p)
            grad_list.append(g)

        if self.has_distributed_lookup_table:
            param_list = [
                param for param in param_list if param.name != self.table_name
            ]
            grad_list = [
                grad for grad in grad_list
                if grad.name != framework.grad_var_name(self.table_name)
            ]
            self.table_param_grad = [
                param_grad for param_grad in params_grads
                if param_grad[0].name == self.table_name
            ][0]
            table_grad_var = self.table_param_grad[1]
            self.table_grad_list = [
                program.global_block().create_var(
                    name="%s.trainer_%d.pserver_%d" %
                    (table_grad_var.name, trainer_id, index),
                    type=table_grad_var.type,
                    shape=table_grad_var.shape,
                    dtype=table_grad_var.dtype)
                for index in range(len(self.pserver_endpoints))
            ]

        grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
        param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
        # step2: Create new vars for the parameters and gradients blocks and
        # add ops to do the split.
        grad_var_mapping = self._append_split_op(program, grad_blocks)
        param_var_mapping = self._create_vars_from_blocklist(
            program, param_blocks)
        # step3: Add gradients as send op inputs and parameters as send
        # op outputs.
        send_inputs = []
        send_outputs = []
        for b in grad_blocks:  # append by order
            varname, block_id, _ = b.split(":")
            send_inputs.append(grad_var_mapping[varname][int(block_id)])
        for b in param_blocks:
            varname, block_id, _ = b.split(":")
            send_outputs.append(param_var_mapping[varname][int(block_id)])
        # let send_op know which endpoint to send which var to, eplist has the same
        # order as send_inputs.
        eplist = split_method(send_inputs, pserver_endpoints)
        # create mapping of endpoint -> split var to create pserver side program
        self.param_grad_ep_mapping = dict()
        for i, ep in enumerate(eplist):
            param = send_outputs[i]
            grad = send_inputs[i]
            if ep not in self.param_grad_ep_mapping:
                self.param_grad_ep_mapping[ep] = {"params": [], "grads": []}
            self.param_grad_ep_mapping[ep]["params"].append(param)
            self.param_grad_ep_mapping[ep]["grads"].append(grad)

        rpc_client_var = program.global_block().create_var(
            name=RPC_CLIENT_VAR_NAME,
            persistable=True,
            type=core.VarDesc.VarType.RAW)

        # create send_op
        program.global_block().append_op(type="send",
                                         inputs={"X": send_inputs},
                                         outputs={
                                             "Out": send_outputs,
                                             "RPCClient": rpc_client_var
                                         },
                                         attrs={
                                             "endpoints": pserver_endpoints,
                                             "epmap": eplist,
                                             "sync_mode": self.sync_mode
                                         })
        # step4: Concat the parameters splits together after recv.
        for varname, splited_var in param_var_mapping.iteritems():
            if len(splited_var) <= 1:
                continue
            orig_param = program.global_block().vars[varname]
            program.global_block().append_op(type="concat",
                                             inputs={"X": splited_var},
                                             outputs={"Out": [orig_param]},
                                             attrs={"axis": 0})

        if self.has_distributed_lookup_table:
            self._replace_lookup_table_op_with_prefetch(
                program, rpc_client_var, eplist)
            self._split_table_grad_and_add_send_vars(program, rpc_client_var,
                                                     pserver_endpoints)
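
An end-to-end usage sketch; get_trainer_program, get_pserver_program and get_startup_program are assumed companion methods of this transpiler (as in fluid.DistributeTranspiler), and the environment variable names are illustrative:

t = DistributeTranspiler()
t.transpile(trainer_id=int(os.getenv("TRAINER_ID", "0")),
            pservers="192.168.0.1:6174,192.168.0.2:6174",
            trainers=2)
if os.getenv("TRAINING_ROLE") == "PSERVER":
    current = os.getenv("PSERVER_ENDPOINT")
    pserver_prog = t.get_pserver_program(current)
    startup_prog = t.get_startup_program(current, pserver_prog)
    exe.run(startup_prog)
    exe.run(pserver_prog)   # blocks in listen_and_serv, serving the trainers
else:
    exe.run(default_startup_program())
    trainer_prog = t.get_trainer_program()
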