Code example #1
    torch.distributed.init_process_group(
        backend='nccl',
        #init_method='/gpfs/gpfs0/groups/mozafari/ruixliu/tmp/misc/sharedfile',
        rank=local_rank,
        world_size=local_size)

    set_environment_variables_for_nccl_backend(local_size == global_size)

    # Prepare Logger
    job_id = rutils.get_current_time()
    logger = rutils.FileLogging('%s_bert_pretrain_%d' % (job_id, local_rank))
    #logger = Logger(cuda=torch.cuda.is_available())
    logger.info('job id: %s' % job_id)
    logger.info(rutils.parser_args_to_dict(args))

    # Extract config file from blob storage
    job_config = BertJobConfiguration(
        config_file_path=os.path.join(args.config_file_path, config_file))
    logger.info(job_config.config)

    job_name = job_config.get_name()
    # Setting the distributed variables

    #run = Run.get_context()

    if not use_multigpu_with_single_device_per_process:
        device = torch.device("cuda")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend=args.backend)
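
Both examples call set_environment_variables_for_nccl_backend before initializing the process group, but its body is not shown. The sketch below is a minimal, hypothetical version that illustrates what the NCCL backend of torch.distributed needs to find in the environment (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE); the AZ_BATCH_MASTER_NODE and OMPI_COMM_WORLD_* variable names are assumptions based on a typical Azure Batch / Open MPI launch, not taken from the snippets above.

import os

def set_environment_variables_for_nccl_backend(single_node=False, master_port='6105'):
    """Hypothetical sketch: expose the variables torch.distributed's NCCL
    backend expects, derived from MPI/Azure Batch environment variables."""
    if single_node:
        # All processes live on one node; a local address suffices as master.
        os.environ['MASTER_ADDR'] = '127.0.0.1'
    else:
        # Assumption: the scheduler publishes the master node as "<ip>:<port>".
        os.environ['MASTER_ADDR'] = os.environ['AZ_BATCH_MASTER_NODE'].split(':')[0]
    os.environ['MASTER_PORT'] = master_port
    # Assumption: the job was launched with Open MPI, so rank/size are available.
    os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
    os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
    # Keep NCCL off the docker/loopback interfaces for inter-node traffic.
    os.environ['NCCL_SOCKET_IFNAME'] = '^docker0,lo'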
Code example #2
File: train.py  Project: skaarthik/AzureML-BERT
    local_rank = get_local_rank()
    global_size = get_global_size()
    local_size = get_local_size()
    # TODO use logger
    print('local_rank = {}'.format(local_rank))
    print('global_size = {}'.format(global_size))
    print('local_size = {}'.format(local_size))

    set_environment_variables_for_nccl_backend(local_size == global_size)

    # Prepare Logger
    logger = Logger(cuda=torch.cuda.is_available())

    # Extract config file from blob storage
    job_config = BertJobConfiguration(
        config_file_path=os.path.join(path, config_file))
    # Replace placeholder path prefix by path corresponding to "ds.path('data/bert_data/').as_mount()"
    job_config.replace_path_placeholders(path)

    job_name = job_config.get_name()
    # Setting the distributed variables

    run = Run.get_context()

    if not use_multigpu_with_single_device_per_process:
        device = torch.device("cuda")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend=args.backend)
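
Example #2 relies on get_local_rank, get_global_size, and get_local_size to recover the process topology, but those helpers are not shown. A minimal sketch, assuming the job was launched through Open MPI (as Azure ML typically does for distributed PyTorch jobs), follows; the OMPI_COMM_WORLD_* names are an assumption, not taken from the snippet.

import os

def get_local_rank():
    # Rank of this process among the processes on the same node.
    return int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])

def get_global_size():
    # Total number of processes across all nodes.
    return int(os.environ['OMPI_COMM_WORLD_SIZE'])

def get_local_size():
    # Number of processes on this node; equals get_global_size() on a single
    # node, which is why both examples pass (local_size == global_size) as the
    # single-node flag to set_environment_variables_for_nccl_backend.
    return int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])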