Example #1
def set_cuda_rng_state(state_list):
    """

    Sets the generator state for all CUDA generators.

    Args:
        state_list(list|tuple): The CUDA states to set back to the CUDA generators. state_list is obtained from get_cuda_rng_state().

    Returns:
        None.

    Examples:
        .. code-block:: python

            import paddle
            sts = paddle.get_cuda_rng_state()
            paddle.set_cuda_rng_state(sts)

    """
    if core.is_compiled_with_cuda():
        if not len(state_list) == core.get_cuda_device_count():
            raise ValueError(
                "Length of cuda state list shoule be equal to the cuda device count"
            )
        for i in range(core.get_cuda_device_count()):
            core.default_cuda_generator(i).set_state(state_list[i])
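A minimal round-trip sketch built from the two APIs on this page (assuming a CUDA-enabled Paddle build): capture the CUDA generator states, draw a random tensor, restore the states, and draw again; the two tensors should match because every device generator is reset to its saved state.

import paddle

if paddle.is_compiled_with_cuda():
    paddle.set_device('gpu')
    states = paddle.get_cuda_rng_state()   # one state per CUDA device
    first = paddle.randn([2, 3])
    paddle.set_cuda_rng_state(states)      # restore all device generators
    second = paddle.randn([2, 3])
    # first and second are identical because the generator state was restored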
Example #2
def _get_batch_size(self, use_cuda, use_parallel_executor):
    batch_size_times = 1
    if use_parallel_executor:
        batch_size_times = core.get_cuda_device_count(
        ) if use_cuda else int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    return self.base_batch_size * batch_size_times
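For illustration, a standalone sketch of the same scaling rule with the device count passed in explicitly so it runs without Paddle (the function name and values are hypothetical): a parallel executor consumes one base batch per device each step, otherwise the step size stays at the base batch.

import os
import multiprocessing

def effective_batch_size(base_batch_size, use_cuda, use_parallel_executor, cuda_device_count):
    if not use_parallel_executor:
        # a plain executor consumes exactly one base batch per step
        return base_batch_size
    # a parallel executor consumes one base batch per device per step
    devices = cuda_device_count if use_cuda else int(
        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    return base_batch_size * devices

print(effective_batch_size(32, use_cuda=True, use_parallel_executor=True, cuda_device_count=4))  # 128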
Example #3
def seed(seed):
    """

    Sets the seed for the global default generator, which manages random number generation.

    Args:
        seed(int): The random seed to set. It is recommended to set a large integer.

    Returns:
        Generator: The global default generator object.

    Examples:
        .. code-block:: python

            import paddle
            gen = paddle.seed(102)

    """
    #TODO(zhiqiu): 1. remove program.random_seed when all random-related ops are upgraded
    # 2. support gpu generator by global device

    seed = int(seed)

    if core.is_compiled_with_cuda():
        for i in range(core.get_cuda_device_count()):
            core.default_cuda_generator(i).manual_seed(seed)

    return core.default_cpu_generator().manual_seed(seed)
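A short reproducibility sketch (the seed value is arbitrary): reseeding the global default generator with the same value makes the subsequent draws identical; on a CUDA build every device generator is seeded as well.

import paddle

paddle.seed(102)
a = paddle.rand([2, 2])
paddle.seed(102)
b = paddle.rand([2, 2])
# a and b hold the same values because both draws start from the same seed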
Example #4
def get_gpus(selected_gpus):
    if selected_gpus is None:
        from paddle.fluid import core
        gpus_num = core.get_cuda_device_count()
        gpus = [str(x) for x in range(0, gpus_num)]
    else:
        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
        if cuda_visible_devices is None or cuda_visible_devices == "":
            gpus = [x.strip() for x in selected_gpus.split(',')]
        else:
            # change selected_gpus into relative values
            # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
            # therefore selected_gpus=0,1,2,3
            cuda_visible_devices_list = cuda_visible_devices.split(',')
            for x in selected_gpus.split(','):
                assert x in cuda_visible_devices_list, "Can't find "\
                "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
                % (x, cuda_visible_devices)
            gpus = [
                cuda_visible_devices_list.index(x.strip())
                for x in selected_gpus.split(',')
            ]
            logger.info("Change selected_gpus into reletive values. --ips:{} "
                        "will change into relative_ips:{} according to your "
                        "CUDA_VISIBLE_DEVICES:{}".format(
                            selected_gpus, gpus, cuda_visible_devices_list))

    return gpus
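To make the remapping branch concrete, a hypothetical standalone run of the same logic: with CUDA_VISIBLE_DEVICES=4,5,6,7 and selected_gpus='4,5', the returned ids are the positions of those cards inside the visible list, i.e. [0, 1].

# hypothetical inputs mirroring the remapping branch above
cuda_visible_devices_list = "4,5,6,7".split(',')
selected_gpus = "4,5"
gpus = [cuda_visible_devices_list.index(x.strip()) for x in selected_gpus.split(',')]
print(gpus)  # [0, 1] -- relative ids of cards 4 and 5 within CUDA_VISIBLE_DEVICES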
Example #5
    def test_get_default_nprocs(self):
        paddle.set_device('cpu')
        nprocs = _get_default_nprocs()
        self.assertEqual(nprocs, multiprocessing.cpu_count())

        paddle.set_device('gpu')
        nprocs = _get_default_nprocs()
        self.assertEqual(nprocs, core.get_cuda_device_count())
Example #6
def _get_default_nprocs():
    device = get_device()
    if 'gpu' in device:
        return core.get_cuda_device_count()
    elif 'xpu' in device:
        return core.get_xpu_device_count()
    elif 'cpu' in device:
        return multiprocessing.cpu_count()
    else:
        raise RuntimeError(
            "`paddle.distributed.spawn` does not support parallel training on device `{}` now."
            .format(device))
Example #7
    def main(self,
             use_cuda=True,
             use_parallel_executor=False,
             use_double_buffer=False):
        assert not use_cuda or use_cuda and core.is_compiled_with_cuda()

        self.use_cuda = use_cuda
        self.use_parallel_executor = use_parallel_executor
        self.use_double_buffer = use_double_buffer

        startup_program = fluid.Program()
        main_program = fluid.Program()

        with fluid.program_guard(main_program, startup_program):
            in_data, label, loss, optimizer, feed_queue = simple_fc_net(
                in_size=self.in_size,
                class_num=self.class_num,
                hidden_sizes=self.hidden_sizes,
                batch_size=self.batch_size,
                queue_capacity=self.queue_capacity,
                use_double_buffer=self.use_double_buffer)

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

            startup_exe = fluid.Executor(place)
            startup_exe.run(startup_program)

            if use_parallel_executor:
                main_exe = fluid.ParallelExecutor(use_cuda,
                                                  loss_name=loss.name)
                if use_cuda:
                    self.batch_size_times = core.get_cuda_device_count()
                else:
                    self.batch_size_times = int(
                        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
            else:
                main_exe = startup_exe
                self.batch_size_times = 1

            reader = self.random_reader()
            thread = threading.Thread(target=feed_data,
                                      args=(feed_queue, reader))
            thread.start()

            self.outputs = []
            for _ in range(self.iterations):
                fetches = main_exe.run(fetch_list=[in_data.name, label.name])
                fetches = [as_numpy(fetch) for fetch in fetches]
                self.outputs.append(fetches)

            feed_queue.close()
            self.validate()
Example #8
def _build_program(self,
                   place,
                   layout,
                   seed,
                   sync_bn=False,
                   only_forward=False):
    """Build program."""
    main = fluid.Program()
    startup = fluid.Program()
    main.random_seed = seed
    startup.random_seed = seed
    use_cudnn = self.dtype == np.float16
    with fluid.unique_name.guard():
        with fluid.program_guard(main, startup):
            data = fluid.layers.data(
                name='input',
                shape=self.dshape,
                dtype=self.dtype,
                append_batch_size=False)
            conv = fluid.layers.conv2d(
                input=data,
                num_filters=32,
                filter_size=1,
                param_attr=fluid.ParamAttr(name='conv2d_weight'),
                bias_attr=False,
                use_cudnn=use_cudnn)
            bn = fluid.layers.batch_norm(
                conv,
                param_attr=fluid.ParamAttr(name='bn_scale'),
                bias_attr=fluid.ParamAttr(name='bn_bias'),
                moving_mean_name='bn_moving_mean',
                moving_variance_name='bn_moving_variance',
                data_layout=layout,
                is_test=only_forward)
            if core.is_compiled_with_rocm():
                bn = fluid.layers.cast(bn, 'float32')
            else:
                bn = fluid.layers.cast(bn, 'float64')
            sigmoid = fluid.layers.sigmoid(bn)
            out = fluid.layers.reduce_sum(sigmoid)
            if not sync_bn:
                out = out / core.get_cuda_device_count()
            if not only_forward:
                sgd_opt = fluid.optimizer.SGD(learning_rate=0.0)
                sgd_opt.backward(out)
    return main, startup, [out, conv, bn]
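A toy numeric sketch (values are illustrative) of why the non-sync-BN reference program divides its loss by the device count: the data-parallel run averages per-device losses over sub-batches of size N / device_count, so the single-program loss over the full batch must be scaled by 1 / device_count to be comparable.

import numpy as np

device_count = 4
# per-device loss sums over sub-batches of size N / device_count
per_device_losses = np.array([1.0, 1.2, 0.8, 1.0])
parallel_loss = per_device_losses.mean()        # what the data-parallel run reports
full_batch_loss = per_device_losses.sum()       # what one program sees on the whole batch
print(np.isclose(parallel_loss, full_batch_loss / device_count))  # True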
Example #9
def device_count():
    '''
    Return the number of GPUs available.
    
    Returns:
        int: the number of GPUs available.

    Examples:
        .. code-block:: python

            import paddle

            paddle.device.cuda.device_count()

    '''

    num_gpus = core.get_cuda_device_count() if hasattr(
        core, 'get_cuda_device_count') else 0

    return num_gpus
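A small usage sketch: because `device_count()` returns 0 on CPU-only builds, it can be used to guard device selection without first checking how Paddle was compiled.

import paddle

n = paddle.device.cuda.device_count()
if n == 0:
    paddle.set_device('cpu')        # CPU-only build or no visible GPU
else:
    paddle.set_device('gpu:0')      # at least one CUDA device is available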
Example #10
def get_cuda_rng_state():
    """

    Get the random states of all CUDA generators.

    Args:
        None.

    Returns:
        list: A list of GeneratorState objects, one for each CUDA device.

    Examples:
        .. code-block:: python

            import paddle
            sts = paddle.get_cuda_rng_state()

    """
    state_list = []
    if core.is_compiled_with_cuda():
        for i in range(core.get_cuda_device_count()):
            state_list.append(core.default_cuda_generator(i).get_state())

    return state_list
Example #11
    def run_main(self, place, with_data_parallel):
        self.place = place
        self.with_data_parallel = with_data_parallel

        if not core.is_compiled_with_cuda() and isinstance(
                self.place, core.CUDAPlace):
            return

        if isinstance(self.place, core.CUDAPlace):
            device_cnt = core.get_cuda_device_count(
            ) if self.with_data_parallel else 1
        else:
            device_cnt = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count())
            ) if self.with_data_parallel else 1

        d0 = layers.data("d0",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')
        d1 = layers.data("d1",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')
        d2 = layers.data("d2",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True

        init = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=init, i=i)
        data_array = layers.array_write(x=d0, i=i)

        i = layers.increment(i)
        layers.array_write(d1, i, array=data_array)

        i = layers.increment(i)
        layers.array_write(d2, i, array=data_array)

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True

        array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
        array_len.stop_gradient = True
        cond = layers.less_than(x=i, y=array_len)

        j = layers.fill_constant(shape=[1], dtype='int64', value=1)
        j.stop_gradient = True

        array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
        array_len2.stop_gradient = True
        cond2 = layers.less_than(x=j, y=array_len2)

        while_op = layers.While(cond=cond)
        while_op2 = layers.While(cond=cond2)
        with while_op.block():
            d = layers.array_read(array=data_array, i=i)
            prev = layers.array_read(array=mem_array, i=i)
            d = layers.reshape(d, shape=[10])
            prev = layers.reshape(prev, shape=[10])
            result = layers.sums(input=[d, prev])

            i = layers.increment(x=i, in_place=True)
            layers.array_write(result, i=i, array=mem_array)
            layers.less_than(x=i, y=array_len, cond=cond)
            with while_op2.block():
                d2 = layers.array_read(array=data_array, i=j)
                prev2 = layers.array_read(array=mem_array, i=j)
                d2 = layers.reshape(d2, shape=[10])
                prev2 = layers.reshape(prev2, shape=[10])
                result2 = layers.sums(input=[d2, prev2])

                j = layers.increment(x=j, in_place=True)
                layers.array_write(result2, i=j, array=mem_array)
                layers.less_than(x=j, y=array_len2, cond=cond2)

        sum_result = layers.array_read(array=mem_array, i=j)
        sum_result.persistable = True
        tmp = layers.unsqueeze(sum_result, axes=[0])
        tmp = layers.expand(tmp, expand_times=[10, 1])
        fc = layers.fc(tmp, size=256)
        loss = layers.mean(sum_result)

        optim = fluid.optimizer.Adam(learning_rate=1e-3)
        optim.minimize(loss)

        exe = Executor(self.place)
        exe.run(fluid.default_startup_program())

        prog = fluid.default_main_program()
        if self.with_data_parallel:
            prog = compiler.CompiledProgram(
                fluid.default_main_program()).with_data_parallel(
                    loss_name=loss.name)

        for _ in range(5):
            d = []
            for i in range(3):
                tmp = numpy.random.random(size=[10]).astype('float32')
                if not self.with_data_parallel:
                    d.append(tmp)
                else:
                    d.append(numpy.array([tmp] * device_cnt))

            outs = exe.run(program=prog,
                           feed={
                               'd0': d[0],
                               'd1': d[1],
                               'd2': d[2]
                           },
                           fetch_list=[sum_result])
            self.assertAlmostEqual(numpy.sum(d),
                                   numpy.sum(outs[0]),
                                   delta=0.01)
Example #12
def _get_device_count(self, use_cuda):
    return core.get_cuda_device_count() if use_cuda else int(
        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
Example #13
    def _compare(self, place, layout, only_forward):
        """Compare results."""
        seed = 10
        os.environ['FLAGS_cudnn_deterministic'] = "1"
        scope = core.Scope()
        data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
        data = create_or_get_tensor(scope, "input",
                                    OpTest.np_dtype_to_fluid_dtype(data),
                                    place)

        # Single-GPU, N = 32 per GPU
        main, startup, outs = self._build_program(place, layout, seed, False,
                                                  only_forward)
        exe = fluid.Executor(place)
        exe.run(startup)
        fetch_names = [v.name for v in outs] + [
            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
        ]
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD',
                'conv2d_0.tmp_0@GRAD'
            ]
            fetch_names += others
        bn_fetches = exe.run(program=main,
                             feed={'input': data},
                             fetch_list=fetch_names)

        #####################################################################
        # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
        assert core.get_cuda_device_count() > 1
        main, startup, outs = self._build_program(place, layout, seed, True,
                                                  only_forward)
        exe = fluid.Executor(place)
        exe.run(startup)
        fetch_names = [v.name for v in outs] + [
            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
        ]
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD',
                'conv2d_0.tmp_0@GRAD'
            ]
            fetch_names += others
        for nm in fetch_names:
            fv = fluid.framework._get_var(str(nm), program=main)
            fv.persistable = True
        build_strategy = fluid.BuildStrategy()
        build_strategy.sync_batch_norm = True
        build_strategy.enable_inplace = False
        build_strategy.memory_optimize = False
        comp_prog = compiler.CompiledProgram(main).with_data_parallel(
            outs[0].name if not only_forward else None,
            build_strategy=build_strategy)
        sync_bn_fetches = exe.run(program=comp_prog,
                                  feed={'input': data},
                                  fetch_list=fetch_names)

        for i in six.moves.xrange(1, len(sync_bn_fetches)):
            bn_val = bn_fetches[i]
            sync_bn_val = sync_bn_fetches[i]
            if sync_bn_val.shape != bn_val.shape:
                sync_bn_val = sync_bn_val[:bn_val.shape[0]]
            self.assertTrue(
                np.allclose(bn_val, sync_bn_val, atol=self.atol),
                "Output (" + fetch_names[i] + ") has diff. \n" + "\nBN     " +
                str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))
Example #14
def _get_subprocess_env_list(nprocs, options):
    # NOTE (xiongkun03) Why put backend deduction here?
    # Because _get_subprocess_env_list is used by many test cases.
    # So for compatibility, we put backend deduction here.

    # logic for handle backend option
    if 'backend' not in options or options['backend'] == 'auto':
        options['backend'] = _get_default_backend()
    check_backend(options['backend'])
    block_windows_and_macos(options['backend'])

    # construct processes env list
    processes_env_list = []

    # get args from kwargs
    args = ParallelEnvArgs()

    # deal with `ips`
    args.cluster_node_ips = options.get('ips', None)
    if args.cluster_node_ips is None:
        args.cluster_node_ips = options.get('cluster_node_ips', None)
        if args.cluster_node_ips is None:
            args.cluster_node_ips = "127.0.0.1"

    # deal with `gpus` or `xpus`
    # set default selected devices(gpus or xpus)
    # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3"
    # NOTE(chenweihang): [ why not use FLAGS_selected_gpus or FLAGS_selected_xpus directly? ]
    # because the FLAGS_selected_gpus or FLAGS_selected_xpus may be used in other place,
    # if we set FLAGS_selected_gpus or FLAGS_selected_xpus to be `0,1,2,3`, it may cause error
    # when using `ParallelEnv`
    # NOTE(chenweihang): use absolute gpu or xpu card id
    if options['backend'] == 'nccl':
        args.selected_devices = options.get('gpus', None)
        if args.selected_devices is None:
            args.selected_devices = options.get('selected_devices', None)
        env_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
        if env_devices is None or env_devices == "":
            env_devices_list = [
                str(x) for x in six.moves.range(core.get_cuda_device_count())
            ]
        else:
            env_devices_list = env_devices.split(',')
        if args.selected_devices is None:
            if len(env_devices_list) < nprocs:
                raise RuntimeError(
                    "the number of visible devices(%d) is less than the number "
                    "of spawn processes(%d), please ensure that the correct "
                    "`nprocs` argument is passed or the environment variable "
                    "`CUDA_VISIBLE_DEVICES` is correctly configured." %
                    (len(env_devices_list), nprocs))
            args.selected_devices = ",".join(
                [str(env_devices_list[x]) for x in range(0, nprocs)])
        else:
            selected_device_list = args.selected_devices.split(',')
            if len(selected_device_list) != nprocs:
                raise ValueError(
                    "The number of selected devices(%s) is not equal to "
                    "the number of spawn processes(%d), please ensure that the "
                    "correct `nprocs` and `gpus` arguments are passed." %
                    (len(selected_device_list), nprocs))
            for card_id in selected_device_list:
                if card_id not in env_devices_list:
                    raise ValueError("The selected gpu card %s cannot found in "
                                     "CUDA_VISIBLE_DEVICES (%s)." %
                                     (card_id, ",".join(env_devices_list)))

    elif options['backend'] == 'bkcl':
        args.selected_devices = options.get('xpus', None)
        if args.selected_devices is None:
            args.selected_devices = options.get('selected_devices', None)
        env_devices = os.getenv("XPU_VISIBLE_DEVICES", None)
        if env_devices is None or env_devices == "":
            env_devices_list = [
                str(x) for x in six.moves.range(core.get_xpu_device_count())
            ]
        else:
            env_devices_list = env_devices.split(',')
        if args.selected_devices is None:
            if len(env_devices_list) < nprocs:
                raise RuntimeError(
                    "the number of visible devices(%d) is less than the number "
                    "of spawn processes(%d), please ensure that the correct "
                    "`nprocs` argument is passed or the environment variable "
                    "`XPU_VISIBLE_DEVICES` is correctly configured." %
                    (len(env_devices_list), nprocs))
            args.selected_devices = ",".join(
                [str(env_devices_list[x]) for x in range(0, nprocs)])
        else:
            selected_device_list = args.selected_devices.split(',')
            if len(selected_device_list) != nprocs:
                raise ValueError(
                    "The number of selected devices(%s) is not equal to "
                    "the number of spawn processes(%d), please ensure that the "
                    "correct `nprocs` and `xpus` arguments are passed." %
                    (len(selected_device_list), nprocs))
            for card_id in selected_device_list:
                if card_id not in env_devices_list:
                    raise ValueError("The selected xpu card %s cannot found in "
                                     "XPU_VISIBLE_DEVICES (%s)." %
                                     (card_id, ",".join(env_devices_list)))
    elif options['backend'] == 'cncl':
        args.selected_devices = options.get('mlus', None)
        if args.selected_devices is None:
            args.selected_devices = options.get('selected_devices', None)
        env_devices = os.getenv("MLU_VISIBLE_DEVICES", None)
        if env_devices is None or env_devices == "":
            env_devices_list = [
                str(x) for x in six.moves.range(core.get_mlu_device_count())
            ]
        else:
            env_devices_list = env_devices.split(',')
        if args.selected_devices is None:
            if len(env_devices_list) < nprocs:
                raise RuntimeError(
                    "the number of visible devices(%d) is less than the number "
                    "of spawn processes(%d), please ensure that the correct "
                    "`nprocs` argument is passed or the environment variable "
                    "`MLU_VISIBLE_DEVICES` is correctly configured." %
                    (len(env_devices_list), nprocs))
            args.selected_devices = ",".join(
                [str(env_devices_list[x]) for x in range(0, nprocs)])
        else:
            selected_device_list = args.selected_devices.split(',')
            if len(selected_device_list) != nprocs:
                raise ValueError(
                    "The number of selected devices(%s) is not equal to "
                    "the number of spawn processes(%d), please ensure that the "
                    "correct `nprocs` and `mlus` arguments are passed." %
                    (len(selected_device_list), nprocs))
            for card_id in selected_device_list:
                if card_id not in env_devices_list:
                    raise ValueError("The selected mlu card %s cannot found in "
                                     "MLU_VISIBLE_DEVICES (%s)." %
                                     (card_id, ",".join(env_devices_list)))
    elif options['backend'] == 'gloo':
        # TODO check gpu / xpu flag must not exist
        warnings.warn(
            "Your model will be trained under CPUONLY mode by using GLOO,"
            "because CPUPlace is specified manually or your installed PaddlePaddle only support CPU Device."
        )
        args.paddle_cpuonly = True
        args.selected_devices = None
        args.ips = args.cluster_node_ips
        assert options.get(
            'use_paddlecloud',
            None) is None, "CPUONLY spawn doesn't support use paddle cloud"
        assert len(
            args.cluster_node_ips.split(',')
        ) <= 1, "CPUONLY spawn only supports a single trainer, i.e. len(ips)=1, but got %s." % args.cluster_node_ips
        assert _get_trainers_num(
        ) == 1, "CPUONLY spawn doesn't support multi-trainer"

    # set other inner args
    args.node_ip = options.get('node_ip', None)
    if args.node_ip is None:
        args.node_ip = _get_node_ip(args.cluster_node_ips)

    args.started_port = options.get('started_port', None)

    args.use_paddlecloud = options.get('use_paddlecloud', None)
    if args.use_paddlecloud is None:
        args.use_paddlecloud = use_paddlecloud()

    # get cluster and pod config
    if options['backend'] == 'gloo':
        devices_per_proc = [x for x in range(0, nprocs)]
        cluster, pod = get_cluster_from_args(args, DeviceMode.CPU,
                                             devices_per_proc)
    else:
        cluster, pod = get_cluster_and_pod(args)

    # prepare subprocess env list
    for trainer in pod.trainers:
        processes_env_list.append(
            _prepare_trainer_env(cluster, trainer, options['backend']))

    # [Debug] print config
    args.print_config = options.get('print_config', False)
    if args.print_config:
        _print_arguments(args)

    return processes_env_list
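A hypothetical standalone sketch of the device-selection rule shared by the nccl/bkcl/cncl branches above: without an explicit selection the first `nprocs` visible cards are taken, and an explicit selection must match `nprocs` in length and stay within the visible list.

def pick_devices(nprocs, visible_list, selected=None):
    # visible_list: card ids taken from *_VISIBLE_DEVICES (or all cards when unset)
    # selected: optional comma-separated card ids requested by the caller
    if selected is None:
        if len(visible_list) < nprocs:
            raise RuntimeError("only %d visible devices for %d processes" %
                               (len(visible_list), nprocs))
        return ",".join(visible_list[:nprocs])
    chosen = selected.split(',')
    if len(chosen) != nprocs:
        raise ValueError("%d devices selected for %d processes" %
                         (len(chosen), nprocs))
    for card_id in chosen:
        if card_id not in visible_list:
            raise ValueError("card %s is not in the visible device list" % card_id)
    return selected

print(pick_devices(2, ["4", "5", "6", "7"]))         # "4,5"
print(pick_devices(2, ["4", "5", "6", "7"], "6,7"))  # "6,7"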
Example #15
    def main(self,
             use_cuda=True,
             use_parallel_executor=False,
             use_double_buffer=False,
             use_feed_list=False,
             use_decorate_paddle_reader=False):
        assert not use_cuda or use_cuda and core.is_compiled_with_cuda()

        self.use_cuda = use_cuda
        self.use_parallel_executor = use_parallel_executor
        self.use_double_buffer = use_double_buffer
        self.use_feed_list = use_feed_list
        self.use_decorate_paddle_reader = use_decorate_paddle_reader

        startup_program = fluid.Program()
        main_program = fluid.Program()

        with fluid.program_guard(main_program, startup_program):
            in_data, label, loss, optimizer, feed_queue, py_reader = simple_fc_net(
                in_size=self.in_size,
                class_num=self.class_num,
                hidden_sizes=self.hidden_sizes,
                batch_size=self.batch_size,
                queue_capacity=self.queue_capacity,
                use_double_buffer=self.use_double_buffer,
                use_feed_list=self.use_feed_list)

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

            exe = fluid.Executor(place)
            exe.run(startup_program)

            train_cp = main_program
            if use_parallel_executor:
                train_cp = compiler.CompiledProgram(
                    main_program).with_data_parallel(loss_name=loss.name)
                if use_cuda:
                    self.batch_size_times = core.get_cuda_device_count()
                else:
                    self.batch_size_times = int(
                        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
            else:
                self.batch_size_times = 1

            reader = self.tensor_reader(use_decorate_paddle_reader)
            batch_reader = paddle.batch(reader, batch_size=self.batch_size)

            self.inputs = []
            self.outputs = []

            if use_decorate_paddle_reader:
                if use_feed_list:
                    py_reader.decorate_paddle_reader(batch_reader)
                else:
                    py_reader.decorate_sample_list_generator(batch_reader)
                py_reader.start()
            else:
                thread = threading.Thread(target=feed_data,
                                          args=(feed_queue, batch_reader))
                thread.daemon = True
                thread.start()

            try:
                while True:
                    fetches = exe.run(train_cp,
                                      fetch_list=[in_data.name, label.name])
                    fetches = [as_numpy(fetch) for fetch in fetches]
                    self.outputs.append(fetches)
            except fluid.core.EOFException:
                pass

            feed_queue.close()
            self.validate()
            if use_decorate_paddle_reader:
                py_reader.exited = True
                py_reader.thread.join()
            else:
                thread.join()
Example #16
def _get_subprocess_env_list(nprocs, options):
    # construct processes env list
    processes_env_list = []

    # get args from kwargs
    args = ParallelEnvArgs()

    # deal with `ips`
    args.cluster_node_ips = options.get('ips', None)
    if args.cluster_node_ips is None:
        args.cluster_node_ips = options.get('cluster_node_ips', None)
        if args.cluster_node_ips is None:
            args.cluster_node_ips = "127.0.0.1"

    # deal with `gpus` or `xpus`
    # set default selected devices(gpus or xpus)
    # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3"
    # NOTE(chenweihang): [ why not use FLAGS_selected_gpus or FLAGS_selected_xpus directly? ]
    # because the FLAGS_selected_gpus or FLAGS_selected_xpus may be used in other place,
    # if we set FLAGS_selected_gpus or FLAGS_selected_xpus to be `0,1,2,3`, it may cause error
    # when using `ParallelEnv`
    # NOTE(chenweihang): use absolute gpu or xpu card id
    if core.is_compiled_with_cuda():
        args.selected_devices = options.get('gpus', None)
        if args.selected_devices is None:
            args.selected_devices = options.get('selected_devices', None)
        env_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
        if env_devices is None or env_devices == "":
            env_devices_list = [
                str(x) for x in six.moves.range(core.get_cuda_device_count())
            ]
        else:
            env_devices_list = env_devices.split(',')
        if args.selected_devices is None:
            if len(env_devices_list) < nprocs:
                raise RuntimeError(
                    "the number of visible devices(%d) is less than the number "
                    "of spawn processes(%d), please ensure that the correct "
                    "`nprocs` argument is passed or the environment variable "
                    "`CUDA_VISIBLE_DEVICES` is correctly configured." %
                    (len(env_devices_list), nprocs))
            args.selected_devices = ",".join(
                [str(env_devices_list[x]) for x in range(0, nprocs)])
        else:
            selected_device_list = args.selected_devices.split(',')
            if len(selected_device_list) != nprocs:
                raise ValueError(
                    "The number of selected devices(%s) is not equal to "
                    "the number of spawn processes(%d), please ensure that the "
                    "correct `nprocs` and `gpus` arguments are passed." %
                    (len(selected_device_list), nprocs))
            for card_id in selected_device_list:
                if card_id not in env_devices_list:
                    raise ValueError("The selected gpu card %s cannot found in "
                                     "CUDA_VISIBLE_DEVICES (%s)." %
                                     (card_id, ",".join(env_devices_list)))

    elif core.is_compiled_with_xpu():
        args.selected_devices = options.get('xpus', None)
        if args.selected_devices is None:
            args.selected_devices = options.get('selected_devices', None)
        env_devices = os.getenv("XPU_VISIBLE_DEVICES", None)
        if env_devices is None or env_devices == "":
            env_devices_list = [
                str(x) for x in six.moves.range(core.get_xpu_device_count())
            ]
        else:
            env_devices_list = env_devices.split(',')
        if args.selected_devices is None:
            if len(env_devices_list) < nprocs:
                raise RuntimeError(
                    "the number of visible devices(%d) is less than the number "
                    "of spawn processes(%d), please ensure that the correct "
                    "`nprocs` argument is passed or the environment variable "
                    "`XPU_VISIBLE_DEVICES` is correctly configured." %
                    (len(env_devices_list), nprocs))
            args.selected_devices = ",".join(
                [str(env_devices_list[x]) for x in range(0, nprocs)])
        else:
            selected_device_list = args.selected_devices.split(',')
            if len(selected_device_list) != nprocs:
                raise ValueError(
                    "The number of selected devices(%s) is not equal to "
                    "the number of spawn processes(%d), please ensure that the "
                    "correct `nprocs` and `xpus` arguments are passed." %
                    (len(selected_device_list), nprocs))
            for card_id in selected_device_list:
                if card_id not in env_devices_list:
                    raise ValueError("The selected xpu card %s cannot found in "
                                     "XPU_VISIBLE_DEVICES (%s)." %
                                     (card_id, ",".join(env_devices_list)))

    # set other inner args
    args.node_ip = options.get('node_ip', None)
    if args.node_ip is None:
        args.node_ip = _get_node_ip(args.cluster_node_ips)

    args.started_port = options.get('started_port', None)

    args.use_paddlecloud = options.get('use_paddlecloud', None)
    if args.use_paddlecloud is None:
        args.use_paddlecloud = use_paddlecloud()

    # get cluster and pod config
    cluster, pod = get_cluster_and_pod(args)

    # prepare subprocess env list
    for trainer in pod.trainers:
        processes_env_list.append(_prepare_trainer_env(cluster, trainer))

    # [Debug] print config
    args.print_config = options.get('print_config', False)
    if args.print_config:
        _print_arguments(args)

    return processes_env_list
Example #17
def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
    """
    Start multiple processes with ``spawn`` method for parallel training.

    .. note::
        ``spawn`` now only supports GPU collective mode.

    Args:
        func (function): The target function called by the spawned process.
            This function must be picklable, so it must be defined
            at the top level of a module.
        args (tuple, optional): Arguments passed to ``func``.
        nprocs (int, optional): Number of processes to start. Default: -1.
            When nprocs is -1, the available devices are obtained from
            the environment when the model is executed: for GPU training,
            the currently available device IDs are read from the environment
            variable CUDA_VISIBLE_DEVICES; for CPU training, the number of
            available CPUs is read from the environment variable CPU_NUM,
            e.g. export CPU_NUM=4. If that environment variable is not set,
            the spawn method sets it to a default value of 1.
        join (bool, optional): Perform a blocking join on all spawned processes.
            Default: True.
        daemon (bool, optional): The spawned processes' daemon flag. Default: False.
        **options(dict, optional): Other initial parallel execution environment 
            configuration options. The following options are currently supported: 
            (1) start_method (string): the way to start a process. 
            The start method can be ``spawn`` , ``fork`` , ``forkserver`` . 
            Because the CUDA runtime does not support the ``fork`` start method,
            processes that use CUDA should be started with the ``spawn``
            or ``forkserver`` method. Default: "spawn" ;
            (2) gpus (string): The training process will run on the 
            selected gpus, such as "0,1,2,3". Default: None; 
            (3) ips (string): Paddle cluster nodes ips, such as 
            "192.168.0.16,192.168.0.17". Default: "127.0.0.1" . 

    Returns:
        A ``MultiprocessContext`` object that holds the spawned processes.

    Examples:
        .. code-block:: python

            from __future__ import print_function

            import paddle
            import paddle.nn as nn
            import paddle.optimizer as opt
            import paddle.distributed as dist

            class LinearNet(nn.Layer):
                def __init__(self):
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)
                    
                def forward(self, x):
                    return self._linear2(self._linear1(x))

            def train(print_result=False): 
                # 1. initialize parallel environment
                dist.init_parallel_env()

                # 2. create data parallel layer & optimizer
                layer = LinearNet()
                dp_layer = paddle.DataParallel(layer)

                loss_fn = nn.MSELoss()
                adam = opt.Adam(
                    learning_rate=0.001, parameters=dp_layer.parameters())

                # 3. run layer
                inputs = paddle.randn([10, 10], 'float32')
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)
                
                if print_result is True:
                    print("loss:", loss.numpy())
                
                loss.backward()

                adam.step()
                adam.clear_grad()

            # Usage 1: only pass function. 
            # If your training method needs no arguments and
            # uses all visible devices for parallel training.
            if __name__ == '__main__':
                dist.spawn(train)

            # Usage 2: pass function and arguments.
            # If your training method needs some arguments and
            # uses all visible devices for parallel training.
            if __name__ == '__main__':
                dist.spawn(train, args=(True,))

            # Usage 3: pass function, arguments and nprocs.
            # If your training method needs some arguments, and
            # only uses some of the visible devices for parallel training.
            # If your machine holds 8 cards {0,1,2,3,4,5,6,7},
            # this case will use cards {0,1}; if you set
            # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
            # cards {4,5}.
            if __name__ == '__main__':
                dist.spawn(train, args=(True,), nprocs=2)

            # Usage 4: pass function, arguments, nprocs and gpus.
            # If your training method needs some arguments, and
            # only uses some of the visible devices for parallel training,
            # but you can't set your machine's environment variable
            # CUDA_VISIBLE_DEVICES (for example, it is unset or lists all cards
            # {0,1,2,3,4,5,6,7}), you can pass `gpus` to
            # select the GPU cards you want to use. For example,
            # this case will use cards {4,5} if your machine holds 8 cards.
            if __name__ == '__main__':
                dist.spawn(train, args=(True,), nprocs=2, gpus='4,5')
    """
    # NOTE(chenweihang): [ why only supports python3.4+ ? ]
    # Python has supported setting the child process start method
    # since 3.4. Earlier versions can only use the default start
    # method; on Unix the default is fork, which cannot be used with
    # the CUDA runtime in multi-process programs.
    _py_supported_check()

    # Give an error hint when the users enter a configuration option 
    # that does not exist
    _options_valid_check(options)

    # get default nprocs
    if nprocs == -1:
        device = get_device()
        if device == 'cpu':
            # TODO: cpu parallel is not supported yet
            nprocs = _cpu_num()
        elif device == 'gpu':
            nprocs = core.get_cuda_device_count()
        elif device == 'xpu':
            nprocs = core.get_xpu_device_count()
        else:
            raise ValueError(
                "`device` should be one of 'cpu', 'gpu' or 'xpu', but got {}".
                format(device))

    # NOTE(chenweihang): [ why do we need cluster info before running? ]
    # when using `paddle.distributed.spawn` to start parallel training,
    # we should get cluster info before starting subprocess, and pass 
    # correct info to each subprocess
    procs_env_list = _get_subprocess_env_list(nprocs, options)

    # start processes
    # NOTE(chenweihang): [ why is the default start method spawn? ]
    # The CUDA runtime does not support the fork start method;
    # either the spawn or forkserver start method is required
    # to use CUDA in subprocesses.
    start_method = options.get('start_method', None)
    if start_method is None:
        start_method = 'spawn'
    mp = multiprocessing.get_context(start_method)

    error_queues = []
    return_queues = []
    processes = []
    for i in range(nprocs):
        error_queue = mp.SimpleQueue()
        return_queue = mp.SimpleQueue()
        process = mp.Process(
            target=_func_wrapper,
            args=(func, args, error_queue, return_queue, procs_env_list[i]))
        process.daemon = daemon
        process.start()
        error_queues.append(error_queue)
        return_queues.append(return_queue)
        processes.append(process)

    context = MultiprocessContext(processes, error_queues, return_queues)
    if not join:
        return context

    # loop until all process end
    while not context.join():
        pass

    # finally return context
    return context
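A hedged sketch of the `join=False` path (the worker below is a minimal stand-in, not the docstring's training function): the caller receives the `MultiprocessContext` immediately and drives the join loop itself, mirroring the loop at the end of `spawn`.

import paddle.distributed as dist

def worker():
    # minimal spawned worker: set up the parallel environment and report the rank
    dist.init_parallel_env()
    print("rank:", dist.get_rank())

if __name__ == '__main__':
    context = dist.spawn(worker, nprocs=2, join=False)
    # ... do other work in the parent process while training runs ...
    while not context.join():
        pass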
Example #18
def _get_subprocess_env_list(nprocs, options):
    # construct processes env list
    processes_env_list = []

    # get args from kwargs
    args = ParallelEnvArgs()

    # set default `node_ip` and `cluster_node_ips`
    args.cluster_node_ips = options.get('cluster_node_ips', None)
    args.node_ip = options.get('node_ip', None)
    if args.cluster_node_ips is not None and args.node_ip is None:
        raise ValueError("please input current node ip, "
                         "cannot only give `cluster_node_ips`.")
    default_node_ip = "127.0.0.1"
    if args.node_ip is None:
        args.node_ip = default_node_ip
    if args.cluster_node_ips is None:
        args.cluster_node_ips = default_node_ip

    # set default selected gpus
    # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3"
    # NOTE(chenweihang): [ why not use FLAGS_selected_gpus directly? ]
    # because the FLAGS_selected_gpus may be used in other place,
    # if we set FLAGS_selected_gpus to be `0,1,2,3`, it may cause error
    # when using `ParallelEnv`
    # NOTE(chenweihang): use absolute gpu card id
    args.selected_gpus = options.get('selected_gpus', None)
    env_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
    if env_devices is None or env_devices == "":
        env_devices_list = [
            str(x) for x in six.moves.range(core.get_cuda_device_count())
        ]
    else:
        env_devices_list = env_devices.split(',')
    if args.selected_gpus is None:
        if len(env_devices_list) < nprocs:
            raise RuntimeError(
                "the number of visible devices(%d) is less than the number "
                "of spawn processes(%d), please ensure that the correct "
                "`nprocs` argument is passed or the environment variable "
                "`CUDA_VISIBLE_DEVICES` is correctly configured." %
                (len(env_devices_list), nprocs))
        args.selected_gpus = ",".join(
            [str(env_devices_list[x]) for x in range(0, nprocs)])
    else:
        for card_id in args.selected_gpus.split(','):
            if card_id not in env_devices_list:
                raise ValueError("The selected gpu card %s cannot found in "
                                 "CUDA_VISIBLE_DEVICES (%s)." %
                                 (card_id, ",".join(env_devices_list)))

    # set other arguments
    args.started_port = options.get('started_port', None)
    args.use_paddlecloud = options.get('use_paddlecloud', False)
    args.print_config = options.get('print_config', False)

    # reuse code of launch.py
    cluster, pod = get_cluster_and_pod(args)

    # prepare subprocess env list
    for trainer in pod.trainers:
        processes_env_list.append(_prepare_trainer_env(cluster, trainer))

    # print config
    if args.print_config:
        _print_arguments(args)

    return processes_env_list