Code example #1
import paddle.distributed as dist


def train():
    # 1. initialize parallel environment
    dist.init_parallel_env()

    # 2. get current ParallelEnv
    parallel_env = dist.ParallelEnv()
    print("rank: ", parallel_env.rank)
    print("world_size: ", parallel_env.world_size)
Code example #2
import paddle.distributed as dist


def train():
    """Check ParallelEnv rank and world_size in a two-process job."""
    # 1. initialize parallel env
    dist.init_parallel_env()

    # 2. get current ParallelEnv
    parallel_env = dist.ParallelEnv()
    assert parallel_env.rank == 0
    assert parallel_env.world_size == 2
    print("test_ParallelEnv ... ok")
Code example #3
File: program.py  Project: hacunamatada/PaddleOCR
def preprocess(is_train=False):
    FLAGS = ArgsParser().parse_args()
    profiler_options = FLAGS.profiler_options
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    profile_dic = {"profiler_options": FLAGS.profiler_options}
    merge_config(profile_dic)

    if is_train:
        # save_config
        save_model_dir = config['Global']['save_model_dir']
        os.makedirs(save_model_dir, exist_ok=True)
        with open(os.path.join(save_model_dir, 'config.yml'), 'w') as f:
            yaml.dump(dict(config),
                      f,
                      default_flow_style=False,
                      sort_keys=False)
        log_file = '{}/train.log'.format(save_model_dir)
    else:
        log_file = None
    logger = get_logger(name='root', log_file=log_file)

    # check whether use_gpu=True is set while running the CPU build of paddlepaddle
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)

    alg = config['Architecture']['algorithm']
    assert alg in [
        'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN',
        'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE',
        'SEED', 'SDMGR'
    ]
    windows_not_support_list = ['PSE']
    if platform.system() == "Windows" and alg in windows_not_support_list:
        logger.warning('{} is not supported on Windows yet'.format(alg))
        sys.exit()

    device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'
    device = paddle.set_device(device)

    config['Global']['distributed'] = dist.get_world_size() != 1

    if config['Global']['use_visualdl']:
        from visualdl import LogWriter
        save_model_dir = config['Global']['save_model_dir']
        vdl_writer_path = '{}/vdl/'.format(save_model_dir)
        os.makedirs(vdl_writer_path, exist_ok=True)
        vdl_writer = LogWriter(logdir=vdl_writer_path)
    else:
        vdl_writer = None
    print_dict(config, logger)
    logger.info('train with paddle {} and device {}'.format(
        paddle.__version__, device))
    return config, device, logger, vdl_writer
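A hedged usage sketch (hypothetical entry point, not taken from PaddleOCR): preprocess() is called once at startup and its return values are handed to the training loop.

if __name__ == "__main__":
    # Illustrative only: build the shared training context.
    config, device, logger, vdl_writer = preprocess(is_train=True)
    logger.info("ready to train on device {}".format(device))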
Code example #4
def preprocess(is_train=False):
    FLAGS = ArgsParser().parse_args()
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)

    # check whether use_gpu=True is set while running the CPU build of paddlepaddle
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)

    alg = config['Architecture']['algorithm']
    assert alg in [
        'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN',
        'CLS'
    ]

    device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'
    device = paddle.set_device(device)

    config['Global']['distributed'] = dist.get_world_size() != 1
    if is_train:
        # save_config
        save_model_dir = config['Global']['save_model_dir']
        os.makedirs(save_model_dir, exist_ok=True)
        with open(os.path.join(save_model_dir, 'config.yml'), 'w') as f:
            yaml.dump(dict(config),
                      f,
                      default_flow_style=False,
                      sort_keys=False)
        log_file = '{}/train.log'.format(save_model_dir)
    else:
        log_file = None
    logger = get_logger(name='root', log_file=log_file)
    if config['Global']['use_visualdl']:
        from visualdl import LogWriter
        save_model_dir = config['Global']['save_model_dir']
        vdl_writer_path = '{}/vdl/'.format(save_model_dir)
        os.makedirs(vdl_writer_path, exist_ok=True)
        vdl_writer = LogWriter(logdir=vdl_writer_path)
    else:
        vdl_writer = None
    print_dict(config, logger)
    logger.info('train with paddle {} and device {}'.format(
        paddle.__version__, device))
    return config, device, logger, vdl_writer
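The `distributed` flag derived from dist.get_world_size() is what a caller would typically check before setting up data parallelism; a minimal sketch of that pattern (an assumption, not from the original file):

config, device, logger, vdl_writer = preprocess(is_train=True)
if config['Global']['distributed']:
    # Initialize the collective environment before wrapping the model
    # with paddle.DataParallel in the training code.
    dist.init_parallel_env()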
Code example #5
"""
  * @file test.py
  * @author [email protected]
  * @date 2020-12-30 15:53
  * @brief
  *
  **************************************************************************/
"""
import os
import sys
import paddle
import paddle.distributed as dist
from utils import run_priority
os.system("unset CUDA_VISIBLE_DEVICES")
os.system("export CUDA_VISIBLE_DEVICES=1")
dist.init_parallel_env()
parallel_env = dist.ParallelEnv()


@run_priority(level="P0")
def test_get_rank():
    """parallelenv"""
    assert parallel_env.rank == 0
    print("{} ... ok".format(sys._getframe().f_code.co_name))


@run_priority(level="P0")
def test_get_world_size():
    """parallelenv"""
    assert parallel_env.world_size == 1
    print("{} ... ok".format(sys._getframe().f_code.co_name))
Code example #6
    def read_datasets(self, splits=None, data_files=None):
        def remove_if_exit(filepath):
            if isinstance(filepath, (list, tuple)):
                for file in filepath:
                    try:
                        os.remove(file)
                    except OSError:
                        pass
            else:
                try:
                    os.remove(filepath)
                except OSError:
                    pass

        if data_files is None:
            if splits is None:
                splits = list(self.BUILDER_CONFIGS[
                    self.name]['splits'].keys()) if hasattr(
                        self, "BUILDER_CONFIGS") else list(self.SPLITS.keys())

            assert isinstance(
                splits, str
            ) or (isinstance(splits, list) and isinstance(splits[0], str)) or (
                isinstance(splits, tuple) and isinstance(splits[0], str)
            ), "`splits` should be a string, a list of strings, or a tuple of strings."

            if isinstance(splits, str):
                splits = [splits]
            datasets = DatasetTuple(splits)
            parallel_env = dist.ParallelEnv()
            unique_endpoints = _get_unique_endpoints(
                parallel_env.trainer_endpoints[:])
            # Register the atexit cleanup hook first, for all lock files together.
            lock_files = []
            for split in splits:
                lock_file = os.path.join(DATA_HOME, self.__class__.__name__)
                if self.name is not None:
                    lock_file = lock_file + "." + self.name
                lock_file += "." + split + ".done" + "." + str(os.getppid())
                lock_files.append(lock_file)
            # Must register in all procs so that the lock files can be removed
            # when any proc breaks. Otherwise, the single registered proc may
            # not receive the proper signal sent by the parent proc to exit.
            atexit.register(lambda: remove_if_exit(lock_files))
            for split in splits:
                filename = self._get_data(split)
                lock_file = os.path.join(DATA_HOME, self.__class__.__name__)
                if self.name is not None:
                    lock_file = lock_file + "." + self.name
                lock_file += "." + split + ".done" + "." + str(os.getppid())
                # `lock_file` indicates the finished status of `_get_data`.
                # `_get_data` only runs in the proc specified by
                # `unique_endpoints`, since `get_path_from_url` only works
                # there. The other procs wait for `_get_data` to finish.
                if parallel_env.current_endpoint in unique_endpoints:
                    f = open(lock_file, "w")
                    f.close()
                else:
                    while not os.path.exists(lock_file):
                        time.sleep(1)
                datasets[split] = self.read(filename=filename, split=split)
        else:
            assert isinstance(data_files, str) or isinstance(
                data_files, tuple
            ) or isinstance(
                data_files, list
            ), "`data_files` should be a string or tuple or list of strings."
            if isinstance(data_files, str):
                data_files = [data_files]
            default_split = 'train'
            if splits:
                if isinstance(splits, str):
                    splits = [splits]
                datasets = DatasetTuple(splits)
                assert len(splits) == len(
                    data_files
                ), "Number of `splits` and number of `data_files` should be the same if you want to specify the split of a local data file."
                for i in range(len(data_files)):
                    datasets[splits[i]] = self.read(filename=data_files[i],
                                                    split=splits[i])
            else:
                datasets = DatasetTuple(
                    ["split" + str(i) for i in range(len(data_files))])
                for i in range(len(data_files)):
                    datasets["split" + str(i)] = self.read(
                        filename=data_files[i], split=default_split)

        return datasets if len(datasets) > 1 else datasets[0]
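A hedged call sketch (MyDatasetBuilder is a hypothetical subclass that defines SPLITS, _get_data and read; file names are illustrative): read_datasets either downloads the named canonical splits, with non-unique endpoints waiting on the lock files, or reads local files paired with explicit split names.

builder = MyDatasetBuilder()  # hypothetical dataset builder subclass
# 1) canonical splits: downloaded once, then read on every proc
datasets = builder.read_datasets(splits=["train", "dev"])
train_ds = datasets["train"]
# 2) local file: a single split returns a single dataset
train_ds = builder.read_datasets(splits="train", data_files="./train.tsv")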
Code example #7
File: pp_layers.py  Project: wuhuachaocoding/Paddle
    def __init__(self,
                 layers,
                 num_stages=None,
                 topology=None,
                 loss_fn=None,
                 seg_method="uniform",
                 recompute_interval=0,
                 recompute_offload=False,
                 recompute_partition=False):
        super(PipelineLayer, self).__init__()
        if num_stages is None and topology is None:
            raise ValueError("should provide num_stages or topology")

        # lazy import
        import paddle.distributed as dist
        from paddle.distributed import fleet

        self.device_id = dist.ParallelEnv().device_id
        self.layers = layers
        self._loss_fn = loss_fn
        self._topo = topology
        self._recompute_interval = recompute_interval
        self._recompute_offload = recompute_offload
        self._recompute_partition = recompute_partition

        if recompute_interval > 0:
            logger.info(
                "Start Recompute for PipeLineParallel. recompute_offload: {}, recompute_partition: {}"
                .format(recompute_offload, recompute_partition))
        _initialize_recompute_setting(recompute_offload, recompute_partition)

        world_size = dist.get_world_size()
        self.global_rank = dist.get_rank()

        if self._topo:
            self._stage_id = self._topo.get_coord(self.global_rank).pipe
            self._num_stages = self._topo.get_dim_size("pipe")
            if num_stages:
                assert self._num_stages == num_stages, "num_stages should be equal to %d" % (
                    self._num_stages)
        else:
            # construct default topology
            if world_size % num_stages != 0:
                raise ValueError(
                    "should provide correct num_stages({}) "
                    "which can be divided by world_size({})".format(
                        num_stages, world_size))
            dp_num = world_size // num_stages
            self._topo = fleet.CommunicateTopology(["data", "pipe", "model"],
                                                   [dp_num, num_stages, 1])
            self._stage_id = self._topo.get_coord(self.global_rank).pipe
            self._num_stages = self._topo.get_dim_size("pipe")

        # initialize segment
        self._layers_desc = list(self.layers)
        self._num_layers = len(self._layers_desc)
        self._start_pos = 0
        self._end_pos = self._num_layers - 1
        self._segment_network(seg_method)
        self.shared_layers = paddle.nn.LayerDict()
        self.shared_weight_attrs = {}

        # construct layer
        self.run_function = []
        self._build_layer()

        self.shared_comm = self._construct_shared_comm()
        self._synchronize_shared_weights()
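A hedged construction sketch (illustrative layer list and stage count; it needs to run inside a job started with paddle.distributed.launch so that world_size is divisible by num_stages and the default topology can be built):

import paddle.nn as nn
from paddle.distributed.fleet.meta_parallel import LayerDesc

# Describe the network as a flat list of layer descriptors; PipelineLayer then
# segments it into `num_stages` pipeline stages using the default "uniform" method.
descs = [LayerDesc(nn.Linear, 256, 256) for _ in range(8)]
pipe_model = PipelineLayer(layers=descs,
                           num_stages=2,
                           loss_fn=nn.CrossEntropyLoss())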
Code example #8
def train_model_multigrid(cfg, world_size=1, validate=True):
    """Train model entry

    Args:
    	cfg (dict): configuration.
    	parallel (bool): Whether multi-card training. Default: Treu
        validate (bool): Whether to do evaluation. Default: False.

    """
    # Init multigrid.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)
    multi_save_epoch = [i[-1] - 1 for i in multigrid.schedule]

    parallel = world_size != 1
    logger = get_logger("paddlevideo")
    batch_size = cfg.DATASET.get('batch_size', 2)
    places = paddle.set_device('gpu')
    model_name = cfg.model_name
    output_dir = cfg.get("output_dir", f"./output/{model_name}")
    mkdir(output_dir)
    local_rank = dist.ParallelEnv().local_rank
    precise_bn = cfg.get("PRECISEBN")
    num_iters_precise_bn = cfg.PRECISEBN.num_iters_preciseBN

    # 1. Construct model
    model = build_model(cfg.MODEL)
    if parallel:
        model = paddle.DataParallel(model)

    # 2. Construct dataloader
    train_loader, valid_loader, precise_bn_loader = \
        construct_loader(cfg,
                         places,
                         validate,
                         precise_bn,
                         num_iters_precise_bn,
                         world_size,
                         )

    # 3. Construct optimizer
    lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))
    optimizer = build_optimizer(cfg.OPTIMIZER,
                                lr,
                                parameter_list=model.parameters())

    # Resume
    resume_epoch = cfg.get("resume_epoch", 0)
    if resume_epoch:
        filename = osp.join(
            output_dir,
            model_name + str(local_rank) + '_' + f"{resume_epoch:05d}")
        subn_load(model, filename, optimizer)

    # 4. Train Model
    best = 0.
    total_epochs = int(cfg.epochs * cfg.MULTIGRID.epoch_factor)
    for epoch in range(total_epochs):
        if epoch < resume_epoch:
            logger.info(
                f"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... "
            )
            continue

        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, changed = multigrid.update_long_cycle(cfg, epoch)
            if changed:
                logger.info("====== Rebuild model/optimizer/loader =====")
                (
                    model,
                    lr,
                    optimizer,
                    train_loader,
                    valid_loader,
                    precise_bn_loader,
                ) = build_trainer(cfg, places, parallel, validate, precise_bn,
                                  num_iters_precise_bn, world_size)

                # load checkpoint after rebuilding the model
                if epoch != 0:
                    # no need to subtract 1 from epoch: 1 was already added when saving
                    filename = osp.join(
                        output_dir,
                        model_name + str(local_rank) + '_' + f"{(epoch):05d}")
                    subn_load(model, filename, optimizer)
                # update lr to the last epoch rather than using the saved value
                lr.last_epoch = epoch
                lr.step(rebuild=True)

        model.train()
        record_list = build_record(cfg.MODEL)
        tic = time.time()
        for i, data in enumerate(train_loader):
            record_list['reader_time'].update(time.time() - tic)
            # 4.1 forward
            if parallel:
                outputs = model._layers.train_step(data)
                # required for DataParallel; will be removed in a future version
                model._reducer.prepare_for_backward(
                    list(model._find_varbase(outputs)))
            else:
                outputs = model.train_step(data)
            # 4.2 backward
            avg_loss = outputs['loss']
            avg_loss.backward()
            # 4.3 minimize
            optimizer.step()
            optimizer.clear_grad()

            # log record
            record_list['lr'].update(
                optimizer._global_learning_rate().numpy()[0], batch_size)
            for name, value in outputs.items():
                record_list[name].update(value.numpy()[0], batch_size)
            record_list['batch_time'].update(time.time() - tic)
            tic = time.time()

            if i % cfg.get("log_interval", 10) == 0:
                ips = "ips: {:.5f} instance/sec.".format(
                    batch_size / record_list["batch_time"].val)
                log_batch(record_list, i, epoch + 1, total_epochs, "train", ips)

            # learning rate iter step
            if cfg.OPTIMIZER.learning_rate.get("iter_step"):
                lr.step()

        # learning rate epoch step
        if not cfg.OPTIMIZER.learning_rate.get("iter_step"):
            lr.step()

        ips = "ips: {:.5f} instance/sec.".format(
            batch_size * record_list["batch_time"].count /
            record_list["batch_time"].sum)
        log_epoch(record_list, epoch + 1, "train", ips)

        def evaluate(best):
            model.eval()
            record_list = build_record(cfg.MODEL)
            record_list.pop('lr')
            tic = time.time()
            for i, data in enumerate(valid_loader):
                if parallel:
                    outputs = model._layers.val_step(data)
                else:
                    outputs = model.val_step(data)

                # log_record
                for name, value in outputs.items():
                    record_list[name].update(value.numpy()[0], batch_size)

                record_list['batch_time'].update(time.time() - tic)
                tic = time.time()

                if i % cfg.get("log_interval", 10) == 0:
                    ips = "ips: {:.5f} instance/sec.".format(
                        batch_size / record_list["batch_time"].val)
                    log_batch(record_list, i, epoch + 1, total_epochs, "val",
                              ips)

            ips = "ips: {:.5f} instance/sec.".format(
                batch_size * record_list["batch_time"].count /
                record_list["batch_time"].sum)
            log_epoch(record_list, epoch + 1, "val", ips)

            best_flag = False
            if record_list.get('top1') and record_list['top1'].avg > best:
                best = record_list['top1'].avg
                best_flag = True
            return best, best_flag

        # use precise bn to improve acc
        if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):
            logger.info(f"do precise BN in {epoch+1} ...")
            do_preciseBN(model, precise_bn_loader, parallel,
                         min(num_iters_precise_bn, len(precise_bn_loader)))

        #  aggregate sub_BN stats
        logger.info("Aggregate sub_BatchNorm stats...")
        aggregate_sub_bn_stats(model)

        # 5. Validation
        if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):
            logger.info(f"eval in {epoch+1} ...")
            with paddle.fluid.dygraph.no_grad():
                best, save_best_flag = evaluate(best)
            # save best
            if save_best_flag:
                save(optimizer.state_dict(),
                     osp.join(output_dir, model_name + "_best.pdopt"))
                save(model.state_dict(),
                     osp.join(output_dir, model_name + "_best.pdparams"))
                logger.info(
                    f"Already save the best model (top1 acc){int(best * 10000) / 10000}"
                )

        # 6. Save model and optimizer
        if is_eval_epoch(
                cfg, epoch,
                total_epochs, multigrid.schedule) or epoch % cfg.get(
                    "save_interval", 10) == 0 or epoch in multi_save_epoch:
            logger.info("[Save parameters] ======")
            subn_save(output_dir, model_name + str(local_rank) + '_', epoch + 1,
                      model, optimizer)

    logger.info(f'training {model_name} finished')
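A hedged driver sketch (the config loader and launch setup are assumptions, not part of the original function):

if __name__ == "__main__":
    # load_multigrid_config is a hypothetical helper that parses a
    # PaddleVideo-style YAML config into an attribute-style dict.
    cfg = load_multigrid_config("configs/example_multigrid.yaml")
    world_size = dist.get_world_size()
    if world_size > 1:
        dist.init_parallel_env()
    train_model_multigrid(cfg, world_size=world_size, validate=True)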