def multiprocess_train(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    """
    Run a single multiprocessing training worker.

    Enters a distributed context for this worker's rank, marks the
    options as multiprocessing, and runs the standard training loop.

    :param rank: worker rank within the process group
    :param opt: ParlAI options dict
    :param port: TCP port for the distributed rendezvous
    :param rank_offset: offset added to rank for multi-node setups
    :param gpu: GPU index override, or None to derive from rank
    :param hostname: address of the rank-0 host
    """
    with distributed_utils.distributed_context(
        rank, opt, port, rank_offset, gpu, hostname
    ) as opt:
        # Flag the run as multiprocessing so downstream code can adapt.
        opt['multiprocessing'] = True
        # Run the actual training
        return single_train.TrainLoop(opt).train()
def multiprocess_eval(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    """
    Run a multiprocessing evaluation.

    Invoked by launch_and_eval, not instantiated directly.

    :param rank: worker rank within the process group
    :param opt: ParlAI options dict
    :param port: TCP port for the distributed rendezvous
    :param rank_offset: offset added to rank for multi-node setups
    :param gpu: GPU index override, or None to derive from rank
    :param hostname: address of the rank-0 host
    """
    with distributed_utils.distributed_context(
        rank, opt, port, rank_offset, gpu, hostname
    ) as opt:
        # Consistency fix: the sibling worker entry points
        # (multiprocess_train and the init_method-based multiprocess_eval)
        # all set this flag inside the distributed context; downstream
        # code keys off it, so set it here as well.
        opt['multiprocessing'] = True
        return eval_model.eval_model(opt)
def multiprocess_eval(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    """
    Run a multiprocessing evaluation.

    Invoked by launch_and_eval, not instantiated directly.

    :param rank: worker rank within the process group
    :param opt: ParlAI options dict
    :param port: TCP port used to build the rendezvous address
    :param rank_offset: offset added to rank for multi-node setups
    :param gpu: GPU index override, or None to derive from rank
    :param hostname: address of the rank-0 host
    """
    # Rendezvous endpoint for the process group.
    endpoint = f'tcp://{hostname}:{port}'
    with distributed_utils.distributed_context(
        rank, opt, rank_offset, gpu, init_method=endpoint
    ) as opt:
        # Mark the run as multiprocessing so downstream code can adapt.
        opt['multiprocessing'] = True
        return eval_model.eval_model(opt)
def multiprocess_train(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    """
    Run a single multiprocessing training worker.

    Enters a distributed context via a TCP init method, runs the
    standard training loop, and logs any crash before re-raising so
    worker failures are visible in multi-process runs.

    :param rank: worker rank within the process group
    :param opt: ParlAI options dict
    :param port: TCP port used to build the rendezvous address
    :param rank_offset: offset added to rank for multi-node setups
    :param gpu: GPU index override, or None to derive from rank
    :param hostname: address of the rank-0 host
    """
    # Rendezvous endpoint for the process group.
    endpoint = f"tcp://{hostname}:{port}"
    with distributed_utils.distributed_context(
        rank, opt, rank_offset, gpu, init_method=endpoint
    ) as opt:
        # Mark the run as multiprocessing so downstream code can adapt.
        opt['multiprocessing'] = True
        try:
            # Run the actual training
            return single_train.TrainLoop(opt).train()
        except Exception:
            # Imported lazily; surface the traceback before re-raising
            # so the failing worker is identifiable in the logs.
            import parlai.utils.logging as logging

            logging.critical(traceback.format_exc())
            logging.critical(
                f"Got the above exception on worker {rank + rank_offset}. "
                "This may cause hangs requiring manual killing of processes."
            )
            raise