Example #1
    def _transpile(self, startup_program, main_program):
        """
        Transpile the programs to distributed programs. And add the variables.
        """
        worker_endpoints = fleet.worker_endpoints()
        trainer_id = fleet.worker_index()
        current_endpoint = worker_endpoints[trainer_id]
        worker_endpoints_env = ','.join(worker_endpoints)
        trainers_num = fleet.worker_num()

        if self.print_config:
            print("worker_endpoints:{} trainers_num:{} "
                  "current_endpoint:{} trainer_id:{}".format(
                      worker_endpoints, trainers_num, current_endpoint,
                      trainer_id))

        # call transpiler
        config = dist_transpiler.DistributeTranspilerConfig()
        config.mode = self._strategy.mode
        config.collective_mode = self._strategy.collective_mode

        config.nccl_comm_num = self._strategy.nccl_comm_num
        config.use_hierarchical_allreduce = self._strategy.use_hierarchical_allreduce
        config.hierarchical_allreduce_inter_nranks = self._strategy.hierarchical_allreduce_inter_nranks

        t = dist_transpiler.DistributeTranspiler(config=config)
        t.transpile(trainer_id=trainer_id,
                    trainers=worker_endpoints_env,
                    startup_program=startup_program,
                    program=main_program,
                    current_endpoint=current_endpoint)
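
For context, a minimal sketch of how such a collective transpile might be driven outside a class (the import paths, the two endpoints, and the trainer id are illustrative assumptions, not values taken from the snippet above):

import paddle.fluid as fluid
import paddle.fluid.transpiler as dist_transpiler

# Build the (single-process) programs that the transpiler will rewrite.
startup_program = fluid.default_startup_program()
main_program = fluid.default_main_program()

config = dist_transpiler.DistributeTranspilerConfig()
config.mode = "collective"                 # collective (allreduce) training
config.collective_mode = "grad_allreduce"  # allreduce gradients across trainers

t = dist_transpiler.DistributeTranspiler(config=config)
t.transpile(trainer_id=0,  # placeholder: index of this trainer
            trainers="127.0.0.1:6170,127.0.0.1:6171",  # placeholder endpoints
            startup_program=startup_program,
            program=main_program,
            current_endpoint="127.0.0.1:6170")
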
Example #2
def dist_transpile(trainer_id, args, train_prog, startup_prog):
    if trainer_id < 0:
        return None, None

    # the port of all pservers, needed by both trainer and pserver
    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    # comma-separated IPs of all pservers, needed by both trainer and
    # pserver
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    # total number of workers/trainers in the job, needed by
    # trainer and pserver
    trainers = int(os.getenv("PADDLE_TRAINERS"))
    # the endpoint (IP:port) of the local machine, needed by pserver only
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
    # the role, should be either PSERVER or TRAINER
    training_role = os.getenv("PADDLE_TRAINING_ROLE")

    config = distribute_transpiler.DistributeTranspilerConfig()
    config.slice_var_up = not args.no_split_var
    t = distribute_transpiler.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        # NOTE: *MUST* use train_prog here, because a `with` guard is used
        # to generate different programs for train and test.
        program=train_prog,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=not args.async_mode,
        startup_program=startup_prog)
    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(current_endpoint)
        pserver_startup_program = t.get_startup_program(
            current_endpoint, pserver_program, startup_program=startup_prog)
        return pserver_program, pserver_startup_program
    elif training_role == "TRAINER":
        train_program = t.get_trainer_program()
        return train_program, startup_prog
    else:
        raise ValueError(
            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
        )
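
dist_transpile reads the cluster layout from environment variables, so a hedged usage sketch looks like the following (all values are illustrative placeholders; args, train_prog, and startup_prog are assumed to come from the surrounding training script):

import os

# Illustrative environment for a two-pserver, four-trainer job; a real
# launcher or cluster scheduler would normally export these instead.
os.environ["PADDLE_PSERVER_PORT"] = "6174"
os.environ["PADDLE_PSERVER_IPS"] = "192.168.1.2,192.168.1.3"
os.environ["PADDLE_TRAINERS"] = "4"
os.environ["PADDLE_CURRENT_IP"] = "192.168.1.2"
os.environ["PADDLE_TRAINING_ROLE"] = "TRAINER"  # or "PSERVER"

# `args` is assumed to carry the `.no_split_var` and `.async_mode` flags
# that dist_transpile dereferences.
main_prog, start_prog = dist_transpile(trainer_id=0,
                                       args=args,
                                       train_prog=train_prog,
                                       startup_prog=startup_prog)
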
Example #3
    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """
        minimize a program through loss
        Args:
            loss (Variable|Variable List): loss variable or loss variable list to run optimization.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): set of Variables should be ignored.
        Returns:
            tuple: (optimize_ops, params_grads) which are, list of operators appended;
            and list of (param, grad) Variables pair for optimization.
        Note that in parameter server mode, a worker will not get anything about optimize_os
        Because optmizer algorithms run on pserver side. We will make this usable in pserver
        process, but currently the optimization part is written into Fleet(). A user does not
        need to care about how to startup a pserver node.
        """
        optimize_ops, param_grads = self._optimizer.minimize(
            loss, startup_program, parameter_list, no_grad_set)

        worker_endpoints = fleet.worker_endpoints()
        trainer_id = fleet.worker_index()
        current_endpoint = worker_endpoints[trainer_id]

        startup_program = startup_program if startup_program else \
            fluid.framework.default_startup_program()

        # call transpiler
        config = dist_transpiler.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = dist_transpiler.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=','.join(worker_endpoints),
            startup_program=startup_program,
            current_endpoint=current_endpoint)

        return optimize_ops, param_grads
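
In "nccl2" mode the transpile step bootstraps the NCCL communicators between trainers. A hedged, fleet-free sketch of the same transpile call (the import paths and endpoint values are illustrative assumptions) might look like:

import paddle.fluid as fluid
import paddle.fluid.transpiler as dist_transpiler

# Placeholder cluster layout; in the method above these come from fleet.
worker_endpoints = ["127.0.0.1:6170", "127.0.0.1:6171"]
trainer_id = 0
current_endpoint = worker_endpoints[trainer_id]

config = dist_transpiler.DistributeTranspilerConfig()
config.mode = "nccl2"
t = dist_transpiler.DistributeTranspiler(config=config)
t.transpile(trainer_id,
            trainers=','.join(worker_endpoints),
            startup_program=fluid.default_startup_program(),
            current_endpoint=current_endpoint)
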
Example #4
    def test(self):
        self._check()

        trainer_id = self.trainer_id
        num_trainers = self.num_trainers

        # If the test program is not built, which means this is the first time
        # the test method is called, first build the test program and add ops
        # to broadcast bn-related parameters from trainer 0 to the other
        # trainers for distributed tests.
        if not self.test_initialized:
            emb, loss, _, _, _ = self.build_program(False,
                                                    self.num_trainers > 1)
            emb_name = emb.name
            assert self._get_info('emb_name') is None
            self._set_info('emb_name', emb.name)

            if num_trainers > 1 and self.has_run_train:
                self._append_broadcast_ops(self.test_program)

            if num_trainers > 1 and not self.has_run_train:
                worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
                current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")

                config = dist_transpiler.DistributeTranspilerConfig()
                config.mode = "collective"
                config.collective_mode = "grad_allreduce"
                t = dist_transpiler.DistributeTranspiler(config=config)
                t.transpile(trainer_id=trainer_id,
                            trainers=worker_endpoints,
                            startup_program=self.startup_program,
                            program=self.test_program,
                            current_endpoint=current_endpoint)
        else:
            emb_name = self._get_info('emb_name')

        gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
        place = fluid.CUDAPlace(gpu_id)
        exe = fluid.Executor(place)
        if not self.has_run_train:
            exe.run(self.startup_program)

        if not self.test_reader:
            test_reader = reader.test
        else:
            test_reader = self.test_reader
        if not self.test_initialized:
            test_list, test_name_list = test_reader(self.dataset_dir,
                                                    self.val_targets)
            assert self._get_info('test_list') is None
            assert self._get_info('test_name_list') is None
            self._set_info('test_list', test_list)
            self._set_info('test_name_list', test_name_list)
        else:
            test_list = self._get_info('test_list')
            test_name_list = self._get_info('test_name_list')

        test_program = self.test_program

        if not self.has_run_train:
            assert self.checkpoint_dir, "No checkpoint found for test."
            self.load_checkpoint(executor=exe,
                                 main_program=test_program,
                                 load_for_train=False)

        feeder = fluid.DataFeeder(place=place,
                                  feed_list=['image', 'label'],
                                  program=test_program)
        fetch_list = [emb_name]

        self.test_initialized = True

        test_start = time.time()
        self._run_test(exe, test_list, test_name_list, feeder, fetch_list)
        test_end = time.time()
        logger.info("test time: {:.4f} s".format(test_end - test_start))
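
The test() method above reads its distributed layout from environment variables; a hedged sketch of the per-process values a launcher might export (the endpoints are illustrative placeholders; normally a tool such as paddle.distributed.launch sets them):

import os

# Illustrative per-process environment for a two-trainer run of test().
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6170,127.0.0.1:6171"
os.environ["PADDLE_CURRENT_ENDPOINT"] = "127.0.0.1:6170"
os.environ["FLAGS_selected_gpus"] = "0"  # GPU index assigned to this process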