Example #1
import os

import torch
import torch.distributed as dist

# ConfigParser and main() are project-specific helpers assumed to be importable
# from the surrounding training code.


def entry_point(config: ConfigParser):
    '''
    Entry-point function for a single worker in distributed training.
    '''

    local_world_size = config['local_world_size']

    # check distributed environment cfgs
    if config['distributed']:  # distributed gpu mode
        # check gpu available
        if torch.cuda.is_available():
            if torch.cuda.device_count() < local_world_size:
                raise RuntimeError(
                    f'the number of GPUs ({torch.cuda.device_count()}) is less than '
                    f'the number of processes ({local_world_size}) running on each node'
                )
            local_master = (config['local_rank'] == 0)
        else:
            raise RuntimeError(
                'CUDA is not available; distributed training is not supported.'
            )
    else:  # one gpu or cpu mode
        if config['local_world_size'] != 1:
            raise RuntimeError(
                'local_world_size must be set to 1 if distributed is set to false.'
            )
        config.update_config('local_rank', 0)
        local_master = True
        config.update_config('global_rank', 0)

    logger = config.get_logger('train') if local_master else None
    if local_master:
        if config['distributed']:
            logger.info('Distributed GPU training mode start...')
        else:
            logger.info('One GPU or CPU training mode start...')

    if config['distributed']:
        # these are the parameters used to initialize the process group
        env_dict = {
            key: os.environ[key]
            for key in ('MASTER_ADDR', 'MASTER_PORT', 'RANK', 'WORLD_SIZE')
        }
        if local_master:
            logger.info(
                f'[Process {os.getpid()}] Initializing process group with: {env_dict}'
            )

        # init process group
        dist.init_process_group(backend='nccl', init_method='env://')
        config.update_config('global_rank', dist.get_rank())
        # log distributed training cfg
        if local_master:
            logger.info(
                f'[Process {os.getpid()}] world_size = {dist.get_world_size()}, '
                f'rank = {dist.get_rank()}, backend = {dist.get_backend()}'
            )

    # start train
    main(config, local_master, logger if local_master else None)

    # tear down the process group (only initialized in distributed mode)
    if config['distributed']:
        dist.destroy_process_group()
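Note: the env:// init method used above expects MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE to be present in the environment; a launcher such as torchrun (or the older torch.distributed.launch) normally exports them for every worker process. As a minimal, self-contained sketch of that handshake (not part of the example above), the snippet below sets the variables by hand and brings up a single-process group on the gloo backend, so it runs even without a GPU:

import os

import torch.distributed as dist

# Rendezvous variables normally exported by the launcher for each worker.
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
os.environ.setdefault('RANK', '0')
os.environ.setdefault('WORLD_SIZE', '1')

# gloo works on CPU, so this sketch does not require CUDA.
dist.init_process_group(backend='gloo', init_method='env://')
print(f'rank={dist.get_rank()} world_size={dist.get_world_size()} '
      f'backend={dist.get_backend()}')
dist.destroy_process_group()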
Example #2
import os

import torch
import torch.distributed as dist

# ConfigParser, main() and fix_random_seed_for_reproduce() are project-specific
# helpers assumed to be importable from the surrounding training code.


def entry_point(config: ConfigParser):
    '''
    Entry-point function for a single worker in distributed training.
    A single worker contains (torch.cuda.device_count() / local_world_size) GPUs.
    '''

    local_world_size = config['local_world_size']

    # check distributed environment cfgs
    if config['distributed']:  # distributed gpu mode
        # check gpu available
        if torch.cuda.is_available():
            if torch.cuda.device_count() < local_world_size:
                raise RuntimeError(
                    f'the number of GPUs ({torch.cuda.device_count()}) is less than '
                    f'the number of processes ({local_world_size}) running on each node'
                )
            local_master = (config['local_rank'] == 0)
        else:
            raise RuntimeError(
                'CUDA is not available; distributed training is not supported.'
            )
    else:  # one gpu or cpu mode
        if config['local_world_size'] != 1:
            raise RuntimeError(
                'local_world_size must be set to 1 if distributed is set to false.'
            )
        config.update_config('local_rank', 0)
        local_master = True
        config.update_config('global_rank', 0)

    logger = config.get_logger('train') if local_master else None
    if local_master:
        if config['distributed']:
            logger.info('Distributed GPU training mode start...')
        else:
            logger.info('One GPU or CPU training mode start...')

    # configure whether CUDNN runs deterministically
    if config['deterministic']:
        fix_random_seed_for_reproduce(config['seed'])
        if local_master:
            logger.warning(
                'You have chosen deterministic training. '
                'This will fix the random seed, turn on the CUDNN deterministic setting and '
                'turn off the CUDNN benchmark, which can slow down your training considerably!'
            )
    else:
        torch.backends.cudnn.benchmark = True
        if local_master:
            logger.warning(
                'You have chosen to benchmark training. '
                'This will turn on the CUDNN benchmark setting, '
                'which can speed up your training considerably! '
                'You may see unexpected behavior when restarting from checkpoints, '
                'because RandomizedMultiLinearMap requires the deterministic setting to be turned on.'
            )

    if config['distributed']:
        # init process group
        dist.init_process_group(backend='nccl', init_method='env://')
        config.update_config('global_rank', dist.get_rank())
        # log distributed training cfg
        if local_master:
            logger.info(
                f'[Process {os.getpid()}] world_size = {dist.get_world_size()}, '
                f'rank = {dist.get_rank()}, backend = {dist.get_backend()}'
            )

    # start train
    main(config, local_master, logger if local_master else None)

    # tear down the process group (only initialized in distributed mode)
    if config['distributed']:
        dist.destroy_process_group()
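Note: fix_random_seed_for_reproduce is a project-specific helper that is not shown in these examples. A minimal sketch of what such a helper typically does, assuming it only needs to seed the common RNGs and set the CUDNN flags described in the warning above:

import random

import numpy as np
import torch


def fix_random_seed_for_reproduce(seed: int) -> None:
    # Seed every RNG the training loop is likely to touch.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade speed for reproducibility, matching the warning in the example.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False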