def cli_main():
    parser = options.get_training_parser()
    parser.add_argument('--train-subtransformer', action='store_true', default=False,
                        help='whether to train a SuperTransformer or a SubTransformer')
    parser.add_argument('--sub-configs', required=False, is_config_file=True,
                        help='when training a SubTransformer, use --configs to specify the '
                             'architecture and --sub-configs to specify other settings')

    # for profiling
    parser.add_argument('--profile-flops', action='store_true',
                        help='measure the FLOPs of a SubTransformer')
    parser.add_argument('--latgpu', action='store_true',
                        help='measure SubTransformer latency on GPU')
    parser.add_argument('--latcpu', action='store_true',
                        help='measure SubTransformer latency on CPU')
    parser.add_argument('--latiter', type=int, default=300,
                        help='how many iterations to run when measuring latency')
    parser.add_argument('--latsilent', action='store_true',
                        help='keep silent when measuring latency')

    parser.add_argument('--validate-subtransformer', action='store_true',
                        help='evaluate the SubTransformer on the validation set')

    options.add_generation_args(parser)

    args = options.parse_args_and_arch(parser)

    if args.latcpu:
        args.cpu = True
        args.fp16 = False

    if args.latgpu or args.latcpu or args.profile_flops:
        args.distributed_world_size = 1

    if args.pdb:
        pdb.set_trace()

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main(modify_parser=None):
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
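# The single-node fallback above relies on a simple rendezvous: every spawned
# worker dials the same tcp://localhost:<port> address and takes its spawn
# index as its rank. Below is a minimal self-contained sketch of that pattern,
# separate from fairseq (`_demo_worker`, `_demo_spawn`, the gloo backend, and
# the world size are illustrative assumptions, not part of the original code):
def _demo_worker(rank, world_size, init_method):
    import torch.distributed as dist
    # all workers join the same process group; the rank is spawn's index
    dist.init_process_group('gloo', init_method=init_method,
                            world_size=world_size, rank=rank)
    print('rank {} of {} initialized'.format(dist.get_rank(), world_size))


def _demo_spawn(world_size=2):
    # pick a random rendezvous port, as the training scripts above do
    init_method = 'tcp://localhost:{}'.format(random.randint(10000, 20000))
    torch.multiprocessing.spawn(_demo_worker,
                                args=(world_size, init_method),
                                nprocs=world_size)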
def cli_main():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)
    # params_file = os.path.join(os.path.dirname(args.save_dir),
    #                            "({0})-params.log".format(os.path.basename(args.save_dir)))
    # with open(params_file, "w", encoding="utf-8") as w:
    #     w.write(str(args).replace(", ", ",\n"))
    # print("saving params file into {}...".format(params_file))

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.debug:
        args.distributed_world_size = 1
        args.train_subset = args.valid_subset
        # args.cpu = True

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main(main_fn):
    argv = sys.argv[1:]
    # '---' is a marker that separates meta-learning arguments from downstream
    # training arguments
    split_index = argv.index('---')
    meta_argv = argv[:split_index]
    maybe_downstream_argv = argv[split_index + 1:]
    parser = options.get_meta_training_parser()
    meta_learning_args = options.parse_args_and_arch(parser, input_args=meta_argv)
    fine_tune_args = None
    if meta_learning_args.baseline:
        split_index = maybe_downstream_argv.index('---')
        downstream_argv = maybe_downstream_argv[:split_index]
        baseline_argv = maybe_downstream_argv[split_index + 1:]
        parser = options.get_meta_learning_parser()
        fine_tune_args = options.parse_args_and_arch(parser, input_args=baseline_argv)
    else:
        downstream_argv = maybe_downstream_argv
    parser = options.get_meta_learning_parser()
    downstream_args = options.parse_args_and_arch(parser, input_args=downstream_argv)
    print('Meta-learning Arguments: ')
    print(meta_learning_args)
    print('Downstream Arguments: ')
    print(downstream_args)
    print('Fine-tune Args: ')
    print(fine_tune_args)

    if meta_learning_args.distributed_init_method is None:
        distributed_utils.infer_init_method(meta_learning_args)

    if meta_learning_args.distributed_init_method is not None:
        # distributed training
        distributed_main(meta_learning_args.device_id,
                         meta_learning_args=meta_learning_args,
                         downstream_args=downstream_args,
                         fine_tune_args=fine_tune_args,
                         main_fn=main_fn)
    elif meta_learning_args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        meta_learning_args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        meta_learning_args.distributed_rank = None  # set based on device id
        if max(meta_learning_args.update_freq) > 1 and meta_learning_args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(meta_learning_args, downstream_args, fine_tune_args, main_fn),
            nprocs=meta_learning_args.distributed_world_size,
        )
    else:
        # single GPU training
        main_fn(meta_learning_args=meta_learning_args,
                downstream_args=downstream_args,
                fine_tune_args=fine_tune_args)
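# A hypothetical invocation (not from the original repository) illustrating the
# '---' convention parsed above: everything before the first '---' goes to the
# meta-learning parser; with --baseline set, the remainder is split once more
# into downstream arguments and fine-tuning (baseline) arguments.
#
#   python train.py data-bin/meta --task meta_learning --baseline \
#       --- data-bin/downstream --task translation \
#       --- data-bin/downstream --lr 1e-4
#
# sys.argv[1:] is then partitioned as:
#   meta_argv       = ['data-bin/meta', '--task', 'meta_learning', '--baseline']
#   downstream_argv = ['data-bin/downstream', '--task', 'translation']
#   baseline_argv   = ['data-bin/downstream', '--lr', '1e-4']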
def cli_main():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    # print the model hparams
    for k, v in sorted(args.__dict__.items(), key=lambda x: x[0]):
        print('{:40s} = {}'.format(k, v))

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)
        # args.distributed_world_size = 1
        # args.cpu = True

    if args.debug:
        args.distributed_world_size = 1
        args.train_subset = args.valid_subset

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    try:
        git_branch = subprocess.check_output(['git', 'symbolic-ref', '--short', 'HEAD'])
        git_revision = subprocess.check_output(['git', 'rev-parse', 'HEAD'])
    except Exception:
        git_branch = 'unknown'
        git_revision = 'unknown'
    print('GIT: {} {}'.format(git_branch, git_revision))
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print('-' * 80)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def load_megatron_lm(args):
    """
    Load Megatron-LM in fp16. A single Tesla V100 is enough for inference.
    Parallel fine-tuning is not implemented yet; refer to the
    fairseq_eval_lm.py implementation for a parallel version.
    :return: TransformerLanguageModelWrapper
    """
    megatron_path = join(args.checkpoint_dir, 'Megatron_11b', 'megatron_11b')
    # init args for task initialization
    if os.path.exists(join(megatron_path, 'task.pkl')):
        task = _pickle.load(open(join(megatron_path, 'task.pkl'), 'rb'))
    else:
        sys.argv.append(megatron_path)
        task_args = get_task_args()
        distributed_utils.infer_init_method(task_args)
        task_args.distributed_rank = None
        task = tasks.setup_task(task_args)
        _pickle.dump(task, open(join(megatron_path, 'task.pkl'), 'wb'))

    # load model partitions
    if os.path.exists(join(megatron_path, 'model.pt')):
        merge_partition = torch.load(join(megatron_path, 'model.pt'))
    else:
        merge_partition = dict()
        for i in range(8):
            # load checkpoints
            ckpt = torch.load(join(megatron_path, 'model-model_part-{}.pt'.format(i)),
                              map_location='cpu')
            if i == 0:
                merge_partition = ckpt
            else:
                print("Load from partition {}".format(i))
                for param_name, param in tqdm(ckpt['model'].items()):
                    if 'version' in param_name:
                        continue
                    src_param, tgt_param = merge_partition['model'][param_name], param
                    if param_name.endswith('out_proj.weight') or param_name.endswith('fc2.weight'):
                        # row-parallel weights: concatenate along the input dimension
                        res = torch.cat((src_param, tgt_param), dim=1)
                    elif param_name.endswith('k_proj.weight') or param_name.endswith('k_proj.bias') or \
                            param_name.endswith('v_proj.weight') or param_name.endswith('v_proj.bias') or \
                            param_name.endswith('q_proj.weight') or param_name.endswith('q_proj.bias') or \
                            param_name.endswith('fc1.weight') or param_name.endswith('fc1.bias') or \
                            param_name.endswith('output_projection.weight') or \
                            param_name.endswith('embed_tokens.weight'):
                        # column-parallel weights: concatenate along the output dimension
                        res = torch.cat((src_param, tgt_param), dim=0)
                    else:
                        res = src_param
                    merge_partition['model'][param_name] = res

    # build model
    args = merge_partition['args']
    args.model_parallel_size = 0
    # torch.save(merge_partition, join(CKPT_DIR, 'Megatron_11b/megatron_11b/model.pt'))
    model = TransformerLanguageModelWrapper.build_model(args, task)
    model.load_state_dict(merge_partition['model'])
    return model.half().cuda()
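# Why the concatenation dimension differs per parameter: in Megatron-style
# model parallelism, the q/k/v projections, fc1, the output projection, and
# the token embeddings are sharded along their output dimension (dim 0 of the
# weight), while out_proj and fc2 are sharded along their input dimension
# (dim 1). A worked check of that rule on random tensors (the helper name, the
# toy model dimension d, and the 8-way split below are illustrative):
def _demo_merge_rule(d=16, parts=8):
    # fc1 shards have shape (ffn_dim/parts, d) -> concatenate rows (dim 0)
    fc1 = torch.cat([torch.randn(4 * d // parts, d) for _ in range(parts)], dim=0)
    # fc2 shards have shape (d, ffn_dim/parts) -> concatenate columns (dim 1)
    fc2 = torch.cat([torch.randn(d, 4 * d // parts) for _ in range(parts)], dim=1)
    assert fc1.shape == (4 * d, d) and fc2.shape == (d, 4 * d)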
def cli_main(): parser = options.get_training_parser() parser.add_argument("--mask", action="store_true") parser.add_argument("--decoder-wise-training", action="store_true") parser.add_argument("--load", default="", type=str) parser.add_argument("--focus", default=-1, type=int) parser.add_argument("--masking", action="store_true") parser.add_argument("--early-stop", action="store_true") parser.add_argument("--save-path", default="", type=str) parser.add_argument("--train-decoder-only", action="store_true") args = options.parse_args_and_arch(parser) if getattr(args, "pnet", False) and args.load == "": print("training pnet requires loading a pretrained model") sys.exit() if args.distributed_init_method is None: distributed_utils.infer_init_method(args) if args.distributed_init_method is not None: # distributed training if torch.cuda.device_count() > 1 and not args.distributed_no_spawn: start_rank = args.distributed_rank args.distributed_rank = None # assign automatically torch.multiprocessing.spawn( fn=distributed_main, args=(args, start_rank), nprocs=torch.cuda.device_count(), ) else: distributed_main(args.device_id, args) elif args.distributed_world_size > 1: # fallback for single node with multiple GPUs assert args.distributed_world_size <= torch.cuda.device_count() port = random.randint(10000, 20000) args.distributed_init_method = 'tcp://localhost:{port}'.format( port=port) args.distributed_rank = None # set based on device id if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d': print( '| NOTE: you may get better performance with: --ddp-backend=no_c10d' ) torch.multiprocessing.spawn( fn=distributed_main, args=(args, ), nprocs=args.distributed_world_size, ) else: # single GPU training main(args) if args.mask: args.masking = True main(args)
def cli_main(modify_parser=None):
    parser = options.get_training_parser()
    parser.add_argument(
        '--remove-bpe',
        nargs='?',
        const='@@ ',
        default=None,
        help='remove BPE tokens before scoring '
             '(can be set to sentencepiece); used for monitoring '
             'and validation')
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)

    print_options_meaning_changes(args)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            logger.info('NOTE: you may get faster training with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main(training_args, modify_parser=None):
    # print(training_args)
    # get args for fairseq by converting the hyperparameters as if they
    # were command-line arguments
    argv_copy = copy.deepcopy(sys.argv)
    # some arguments are pre-defined for SageMaker
    sys.argv[1:] = training_args
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    try:
        logger.info("ENV MASTER_ADDR={}, MASTER_PORT={}".format(
            os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']))
    except KeyError:
        logger.info("ENV MASTER_ADDR and MASTER_PORT not configured! "
                    "Probably running without SageMaker!")

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main():
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)
    print(args)
    # if os.path.exists(f"{args.save_dir}/checkpoint_best.pt"):
    #     raise Exception('Already Trained!')

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main(modify_parser=None): """ Dongxu: 1) Parse arguments; 2) choose distribution strategy; 3) call main() with args. :param modify_parser: :return: """ parser = options.get_training_parser() args = options.parse_args_and_arch(parser, modify_parser=modify_parser) if args.distributed_init_method is None: distributed_utils.infer_init_method(args) if args.distributed_init_method is not None: # distributed training if torch.cuda.device_count() > 1 and not args.distributed_no_spawn: start_rank = args.distributed_rank args.distributed_rank = None # assign automatically torch.multiprocessing.spawn( fn=distributed_main, args=(args, start_rank), nprocs=torch.cuda.device_count(), ) else: distributed_main(args.device_id, args) elif args.distributed_world_size > 1: # fallback for single node with multiple GPUs assert args.distributed_world_size <= torch.cuda.device_count() port = random.randint(10000, 20000) args.distributed_init_method = 'tcp://localhost:{port}'.format( port=port) args.distributed_rank = None # set based on device id if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d': logger.info( 'NOTE: you may get faster training with: --ddp-backend=no_c10d' ) torch.multiprocessing.spawn( fn=distributed_main, args=(args, ), nprocs=args.distributed_world_size, ) else: # single GPU training main(args)
def cli_main(modify_parser=None):
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)

    print_options_meaning_changes(args)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        if not getattr(args, 'tpu', False):
            # fallback for single node with multiple GPUs
            assert args.distributed_world_size <= torch.cuda.device_count()
            port = random.randint(10000, 20000)
            args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
            args.distributed_rank = None  # set based on device id
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, ),
                nprocs=args.distributed_world_size,
            )
        else:
            import torch_xla.distributed.xla_multiprocessing as xmp
            torch.multiprocessing.set_sharing_strategy('file_system')
            xmp.spawn(
                fn=distributed_main,
                args=(args, ),
                nprocs=8,  # use all 8 TPU cores
            )
    else:
        # single GPU training
        main(args)
def cli_main(): parser = options.get_training_parser() parser.add_argument("--adv_sr", action='store_true', default=False, help='whether to train with Adv SR') parser.add_argument("--num_cands", default=9, type=int, help='pre-defined number of segmentation candidates') parser.add_argument("--src_pert_prob", default=0.33, type=float, help='perturbation ratio for the source sentence') parser.add_argument("--tgt_pert_prob", default=0.33, type=float, help='perturbation ratio for the target sentence') parser.add_argument("--sp_model", help='directory to sentencepiece module for pre-segmenting candidates') args = options.parse_args_and_arch(parser) if args.distributed_init_method is None: distributed_utils.infer_init_method(args) if args.distributed_init_method is not None: # distributed training if torch.cuda.device_count() > 1 and not args.distributed_no_spawn: start_rank = args.distributed_rank args.distributed_rank = None # assign automatically torch.multiprocessing.spawn( fn=distributed_main, args=(args, start_rank), nprocs=torch.cuda.device_count(), ) else: distributed_main(args.device_id, args) elif args.distributed_world_size > 1: # fallback for single node with multiple GPUs assert args.distributed_world_size <= torch.cuda.device_count() port = random.randint(10000, 20000) args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port) args.distributed_rank = None # set based on device id if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d': print('| NOTE: you may get better performance with: --ddp-backend=no_c10d') torch.multiprocessing.spawn( fn=distributed_main, args=(args, ), nprocs=args.distributed_world_size, ) else: # single GPU training main(args)
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument('--do-evaluate', action='store_true', default=False,
                        help='only do evaluation (for SQuAD)')
    parser.add_argument('--do-layer-decay', action='store_true', default=False,
                        help='do layer-wise learning rate decay')
    parser.add_argument('--layer-decay', default=1.0, type=float,
                        help='the coefficient of layer decay')
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main():
    parser = options.get_training_parser()  # build the training argument parser
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:
        # distributed_init_method normally keeps its default of None; this branch
        # configures multi-node training. Each process passes the GPU id it uses
        # on this node as its local_rank. infer_init_method sets
        # args.distributed_world_size (the total number of GPUs across all nodes:
        # nnodes * nproc_per_node) and args.distributed_rank (the id of the
        # current GPU among all GPUs on all nodes).
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # multi-node, multi-GPU distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            # the current process uses torch.multiprocessing to spawn one process
            # per GPU; but since torch.distributed.launch already starts one
            # train.py process per GPU, it is best to set distributed_no_spawn to
            # True so that each process uses exactly one GPU
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            # with distributed_no_spawn set to True, each process created by
            # torch.distributed.launch calls distributed_main directly;
            # args.device_id equals args.local_rank, i.e. the GPU id on this node
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # single-node, multi-GPU training:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
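# A hypothetical launch command (not from the original file) matching the
# comments above: torch.distributed.launch starts one train.py process per GPU
# and hands each its local rank, so passing --distributed-no-spawn keeps every
# process on a single device instead of spawning again.
#
#   python -m torch.distributed.launch --nproc_per_node 8 train.py \
#       data-bin/wmt16_en_de --distributed-no-spawn ...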
def cli_main_helper(args):
    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_backend == "ccl":
        main(args, init_distributed=True)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        if not getattr(args, "tpu", False):
            # fallback for single node with multiple GPUs
            assert args.distributed_world_size <= torch.cuda.device_count()
            port = random.randint(10000, 20000)
            args.distributed_init_method = "tcp://localhost:{port}".format(port=port)
            args.distributed_rank = None  # set based on device id
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, ),
                nprocs=args.distributed_world_size,
            )
        else:
            import torch_xla.distributed.xla_multiprocessing as xmp
            torch.multiprocessing.set_sharing_strategy("file_system")
            xmp.spawn(
                fn=distributed_main,
                args=(args, ),
                nprocs=8,  # use all 8 TPU cores
            )
    else:
        # single GPU training
        main(args)
def cli_main(modify_parser=None):
    parser = options.get_training_parser()
    parser.add_argument('--comet', action='store_true',
                        help='Log results on comet')
    parser.add_argument('--comet-tag', default="",
                        help='Set experiment.set_tag(args.comet_tag), or use an '
                             'auto-generated tag if this is empty')
    parser.add_argument('--comet-real-tag', default="", type=str,
                        help='Log results on comet')
    parser.add_argument('--comet-project', default='normalizations', type=str,
                        help='Log results on comet')
    parser.add_argument('--api-key', default="", type=str)
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main():
    args = parse()

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main():
    parser = options.get_training_parser()
    add_damethod_args(parser)
    args = options.parse_args_and_arch(parser)

    os.makedirs(args.save_dir, exist_ok=True)
    sys.stdout = Logger(args.save_dir + "/log.txt", "w", sys.stdout)

    if args.multidatasource == 'mixed' and args.damethod != "naive":
        args.task = "translation_da"
        assert args.criterion == "cross_entropy_da"
    else:
        args.task = "translation"

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        '--config',
        type=str,
        nargs='*',
        help='paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument('--exp-name', type=str, default='',
                        help='name of the experiment')
    parser.add_argument(
        '--debug',
        default=False,
        action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument('--path-attributes', type=str, nargs='*',
                        default=['task', 'arch', 'lr'])
    parser.add_argument(
        '--filter_best_last_ckpts',
        type=str,
        default=False,
        help='whether to filter out checkpoint_best and checkpoint_last from the checkpoint list',
    )
    parser.add_argument('--log_valid_progress', type=str, default=False,
                        help='whether to log validation progress')

    pre_parsed_args, unknown = parser.parse_known_args()

    config_dict = {}
    for config_path in pre_parsed_args.config:
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)

    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)

    update_namespace(args, config_dict)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info('NOTE: you may get faster training with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
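# Under the "from high to low priority" contract documented above, a merge
# helper must let keys that are already set win, because higher-priority config
# files are merged first. A minimal recursive sketch of such a merge (assumed
# semantics; the real update_config is a project helper not shown here and may
# differ):
def _demo_update_config(base, overrides):
    merged = dict(overrides)
    for key, value in base.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = _demo_update_config(value, merged[key])  # recurse into sections
        else:
            merged[key] = value  # the earlier (higher-priority) value wins
    return merged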
def cli_main(): parser = options.get_training_parser() parser.add_argument( "--comet-logging", action="store_true", help="Whether to use Comet.ML for logging", ) args = options.parse_args_and_arch(parser) logging = getattr(args, "comet_logging", False) config = None if logging: PROJECT = "machine-translation" if not keyring.get_password("comet", PROJECT): comet_ml_api_key = getpass("Please enter the comet.ml API key: ") keyring.set_password("comet", PROJECT, comet_ml_api_key) else: comet_ml_api_key = keyring.get_password("comet", PROJECT) experiment = Experiment( api_key=comet_ml_api_key, project_name="machine-translation", workspace="machine-translation", auto_output_logging=None, ) config = { "api_key": comet_ml_api_key, "experiment_key": experiment.get_key() } print("Proceeding with Comet.ML logging...") if args.distributed_init_method is None: distributed_utils.infer_init_method(args) if args.distributed_init_method is not None: # distributed training if torch.cuda.device_count() > 1 and not args.distributed_no_spawn: start_rank = args.distributed_rank args.distributed_rank = None # assign automatically torch.multiprocessing.spawn( fn=distributed_main, args=(args, config, start_rank), nprocs=torch.cuda.device_count(), ) else: distributed_main(args.device_id, args, config) elif args.distributed_world_size > 1: # fallback for single node with multiple GPUs assert args.distributed_world_size <= torch.cuda.device_count() port = random.randint(10000, 20000) args.distributed_init_method = "tcp://localhost:{port}".format( port=port) args.distributed_rank = None # set based on device id if max(args.update_freq) > 1 and args.ddp_backend != "no_c10d": print( "| NOTE: you may get better performance with: --ddp-backend=no_c10d" ) torch.multiprocessing.spawn(fn=distributed_main, args=(args, config), nprocs=args.distributed_world_size) else: # single GPU training main(args, config=config) if config: experiment.end()
def train_main(alpha, beta, save_path):
    parser = options.get_training_parser()
    input_args = [
        data_set,
        '--share-decoder-input-output-embed',
        '--arch', 'transformer_iwslt_de_en',
        '--max-tokens', '4000',
        '--lr', '5e-4',
        '--save-interval', '2',
        '--max-epoch', '85',
        '--patience', '5',
        '--optimizer', 'adam',
        '--adam-betas', '(0.9, 0.98)',
        '--clip-norm', '0.0',
        '--weight-decay', '0.0001',
        '--dropout', '0.3',
        '--lr-scheduler', 'inverse_sqrt',
        '--warmup-updates', '4000',
        '--keep-last-epochs', '4',
        '--criterion', 'jensen_cross_entropy',
        '--alpha', str(alpha),
        '--beta', str(beta),
        '--use-uniform',
        '--fp16',
        '--save-dir', save_path,
    ]
    args = options.parse_args_and_arch(parser, input_args=input_args)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)

    ckpts = os.listdir(args.save_dir)
    try:
        ckpts.remove('checkpoint_last.pt')
    except ValueError:
        print("no checkpoint_last.pt in folder", args.save_dir)

    f = open(os.path.join(args.save_dir, "final_entropies.txt"), "a+")
    results = {}
    entropies = {}
    for ckpt in ckpts:
        if '.pt' in ckpt:
            path = os.path.join(args.save_dir, ckpt)
            f.write(path + '\n')
            run_generation(path, results, entropies)
            f.write('{entropy: ' + str(entropies[path]) +
                    ', bleu: ' + str(results[path]) + '}\n')
    f.close()
    return results
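# A hypothetical driver (name and hyperparameter values illustrative) for the
# sweep entry point above: each call trains one (alpha, beta) setting of the
# jensen_cross_entropy criterion and returns per-checkpoint BLEU scores.
def _demo_sweep():
    all_results = {}
    for alpha, beta in [(0.5, 0.5), (1.0, 0.5)]:
        save_path = 'checkpoints/jensen_a{}_b{}'.format(alpha, beta)
        all_results[save_path] = train_main(alpha, beta, save_path)
    return all_results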
import socket


def distributed_main(i, args):
    args.device_id = i
    if args.distributed_rank is None:  # torch.multiprocessing.spawn
        args.distributed_rank = i
    args.distributed_rank = distributed_utils.distributed_init(args)
    print('| initialized host {} as rank {}'.format(socket.gethostname(),
                                                    args.distributed_rank))
    main(args)


if __name__ == '__main__':
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        print('''| NOTE: you may get better performance with:
            python -m torch.distributed.launch --nproc_per_node {ngpu} train.py {no_c10d}(...)
        '''.format(ngpu=args.distributed_world_size,
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        '--config',
        type=str,
        nargs='*',
        help='paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument('--exp-name', type=str, default='',
                        help='name of the experiment')
    parser.add_argument(
        '--debug',
        default=False,
        action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument('--path-attributes', type=str, nargs='*',
                        default=['task', 'arch', 'lr'])
    parser.add_argument('--torch-file-system', action='store_true')

    pre_parsed_args, unknown = parser.parse_known_args()

    config_dict = {}
    for config_path in pre_parsed_args.config:
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)

    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)

    update_namespace(args, config_dict)

    # set the sharing strategy to file_system in case /dev/shm/ limits are small
    if args.torch_file_system:
        torch.multiprocessing.set_sharing_strategy('file_system')

    training_name = get_training_name(args)
    base_save_dir = generate_save_dir(args, training_name, sys.argv[1:])
    setattr(args, 'training_name', training_name)
    setattr(args, 'save_dir', os.path.join(base_save_dir, 'checkpoints'))
    setattr(args, 'tensorboard_logdir', os.path.join(base_save_dir, 'tensorboard'))
    save_config(vars(args), base_save_dir)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info('NOTE: you may get faster training with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)