parser = argparse.ArgumentParser()
parser.add_argument("--run_num", default='00', type=str)
parser.add_argument("--yaml_config", default='./config/UNet.yaml', type=str)
parser.add_argument("--config", default='default', type=str)
parser.add_argument("--comm_mode", default='slurm-nccl', type=str)
parser.add_argument("--io_only", action="store_true")
parser.add_argument("--enable_amp", action="store_true")
parser.add_argument("--cpu_pipeline", action="store_true")
parser.add_argument("--no_copy", action="store_true")
args = parser.parse_args()

run_num = args.run_num
params = YParams(os.path.abspath(args.yaml_config), args.config)

# get env variables
if args.comm_mode == "openmpi-nccl":
    # use pmix server address: only works for single node
    addrport = os.getenv("PMIX_SERVER_URI2").split("//")[1]
    comm_addr = addrport.split(":")[0]
    comm_rank = int(os.getenv('OMPI_COMM_WORLD_RANK', 0))
    comm_size = int(os.getenv("OMPI_COMM_WORLD_SIZE", 0))
elif args.comm_mode == "slurm-nccl":
    comm_addr = os.getenv("SLURM_SRUN_COMM_HOST")
    comm_size = int(os.getenv("SLURM_NTASKS"))
    comm_rank = int(os.getenv("PMI_RANK"))

# common stuff
comm_local_rank = comm_rank % torch.cuda.device_count()
self.model.load_state_dict(checkpoint['model_state'])
self.iters = checkpoint['iters']
self.startEpoch = checkpoint['epoch'] + 1
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", default=0, type=int)
    parser.add_argument("--yaml_config", default='./config/cifar100.yaml', type=str)
    parser.add_argument("--config", default='default', type=str)
    args = parser.parse_args()

    params = YParams(os.path.abspath(args.yaml_config), args.config)

    # set up distributed training variables and initialize the cluster if used
    params['world_size'] = 1
    if 'WORLD_SIZE' in os.environ:
        params['world_size'] = int(os.environ['WORLD_SIZE'])

    params['local_rank'] = args.local_rank
    params['world_rank'] = 0
    if params['world_size'] > 1:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')
        params['world_rank'] = dist.get_rank()

    params['global_batch_size'] = params.batch_size
    params['batch_size'] = int(params.batch_size // params['world_size'])
stage_target = "lustre"

# get env variables
comm_addr = os.getenv("SLURM_SRUN_COMM_HOST")
comm_size = int(os.getenv("SLURM_NTASKS"))
comm_rank = int(os.getenv("PMI_RANK"))
comm_local_rank = comm_rank % torch.cuda.device_count()
comm_port = "29500"
os.environ["MASTER_ADDR"] = comm_addr
os.environ["MASTER_PORT"] = comm_port

# init process group
dist.init_process_group(backend="nccl", rank=comm_rank, world_size=comm_size)

# load parameters
params = YParams("config/UNet_transpose.yaml", "default")
device = torch.device("cuda:{}".format(comm_local_rank))

# setup
dist.barrier()
tstart = time.time()

# stage in?
if stage_target == "dram":
    # copy the input file into local DRAM for each socket:
    tmpfs_root = '/dev/shm'
    #tmpfs_root = '/tmp'
    #tmpfs_root = '/run/cosmo_data'
    gpus_per_socket = torch.cuda.device_count() // 2
    socket = 0 if comm_local_rank < gpus_per_socket else 1
    new_data_path = os.path.join(tmpfs_root, 'numa{}'.format(socket), os.path.basename(params.data_path))
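# The listing stops before the copy itself. Below is a minimal sketch of how the
# stage-in could proceed from new_data_path, assuming `import shutil` and that one
# rank per socket performs the copy before everyone synchronizes; this continuation
# is illustrative and not taken from the original script:
if stage_target == "dram":
    os.makedirs(os.path.dirname(new_data_path), exist_ok=True)
    # first rank on each socket copies the file into the socket-local tmpfs
    if comm_local_rank % gpus_per_socket == 0 and not os.path.isfile(new_data_path):
        shutil.copyfile(params.data_path, new_data_path)
    # wait until every rank can see the staged copy, then point the loader at it
    dist.barrier()
    params.data_path = new_data_path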
        'G_state': netG.state_dict(),
        'D_state': netD.state_dict(),
        'optimizerG_state_dict': optimizerG.state_dict(),
        'optimizerD_state_dict': optimizerD.state_dict()
    }, params.checkpoint_file)


if __name__ == '__main__':
    torch.backends.cudnn.benchmark = True

    if len(sys.argv) != 3:
        logging.error("Usage: %s configuration_YAML_file configuration", sys.argv[0])
        exit()

    params = YParams(os.path.abspath(sys.argv[1]), sys.argv[2])

    if not os.path.exists(params.experiment_dir):
        os.makedirs(os.path.abspath(params.experiment_dir))

    logging_utils.log_to_file(logger_name=None,
                              log_filename=os.path.join(params.experiment_dir, 'out.log'))
    params.log()
    tboard_writer = SummaryWriter(log_dir=os.path.join(params.experiment_dir, 'logs/'))

    params.experiment_dir = os.path.abspath(params.experiment_dir)
    params.checkpoint_file = os.path.join(params.experiment_dir, 'checkpt.tar')

    if params.seed:
        random.seed(params.seed)
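# Restoring such a checkpoint mirrors the save above. A minimal sketch, assuming
# netG, netD, optimizerG, and optimizerD have already been constructed with the
# same architectures (this restore step is not shown in the original listing):
checkpoint = torch.load(params.checkpoint_file, map_location='cpu')
netG.load_state_dict(checkpoint['G_state'])
netD.load_state_dict(checkpoint['D_state'])
optimizerG.load_state_dict(checkpoint['optimizerG_state_dict'])
optimizerD.load_state_dict(checkpoint['optimizerD_state_dict'])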
print(upars.shape)
print(unnormalized(upars[:, 0], 'R0').shape)
pars_norm[i * batch:(i + 1) * batch, :] = upars
pars_orig[i * batch:(i + 1) * batch, :] = np.stack([unnormalized(upars[:, 0], 'R0'),
                                                    unnormalized(upars[:, 1], 'WFHcomp'),
                                                    unnormalized(upars[:, 2], 'WFHdays')], axis=-1)

print("Output: shape %s, type %s, size %f MB" % (str(out.shape), str(out.dtype), out.nbytes / 1e6))
with h5py.File(outname, 'w') as f:
    f.create_dataset('symptomatic3D', data=out)
    f.create_dataset('parBio', data=pars_orig)
    f.create_dataset('uniBio', data=pars_norm)
print("Saved output to %s" % (outname))


if __name__ == '__main__':
    torch.backends.cudnn.benchmark = True

    if len(sys.argv) != 5:
        print("Usage", sys.argv[0], "configuration_YAML_file", "configuration", "checkpoint", "outfile")
        exit()

    params = YParams(os.path.abspath(sys.argv[1]), sys.argv[2])
    N = 64
    bs = 64
    generate(params, sys.argv[3], sys.argv[4], num=N, batch=bs)
from utils.YParams import YParams
import torch
from utils.data_loader_opt import get_data_loader_distributed
import time

# load parameters
params = YParams("config/UNet.yaml", "default")
device = torch.device("cuda:0")

train_data_loader = get_data_loader_distributed(params, 0)

it = 0
tstart = time.time()
for inp, tar in train_data_loader:
    inp = inp.to(device)
    tar = tar.to(device)
    it += 1
tend = time.time()

print("Iterations took {}s for {} iterations ({} iter/s)".format(tend - tstart, it, float(it) / (tend - tstart)))
from utils.YParams import YParams
import torch
from utils.data_loader_dali_smooth import get_data_loader_distributed
import time

# load parameters
params = YParams("config/UNet_transpose.yaml", "default")
device = torch.device("cuda:0")

# get data loader
train_data_loader = get_data_loader_distributed(params, 0)

it = 0
tstart = time.time()
for inp, tar in train_data_loader:
    it += 1
tend = time.time()

print("Iterations took {}s for {} iterations ({} iter/s)".format(tend - tstart, it, float(it) / (tend - tstart)))
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", default=0, type=int)
    parser.add_argument("--run_num", default='00', type=str)
    parser.add_argument("--yaml_config", default='./config/UNet.yaml', type=str)
    parser.add_argument("--config", default='default', type=str)
    args = parser.parse_args()

    run_num = args.run_num
    params = YParams(os.path.abspath(args.yaml_config), args.config)

    params.distributed = False
    if 'WORLD_SIZE' in os.environ:
        params.distributed = int(os.environ['WORLD_SIZE']) > 1

    world_rank = 0
    if params.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.gpu = args.local_rank
        world_rank = torch.distributed.get_rank()

    torch.backends.cudnn.benchmark = True
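    # Once the process group is up, the usual next step is wrapping the network for
    # data-parallel training. A minimal sketch, assuming a `model` has already been
    # built (it is not part of the listing above):
    model = model.to(torch.device("cuda", args.local_rank))
    if params.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank])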
parser = argparse.ArgumentParser()
parser.add_argument("--run_num", default='00', type=str)
parser.add_argument("--yaml_config", default='./config/UNet.yaml', type=str)
parser.add_argument("--config", default='default', type=str)
parser.add_argument("--comm_mode", default='slurm-nccl', type=str)
parser.add_argument("--io_only", action="store_true")
parser.add_argument("--enable_amp", action="store_true")
parser.add_argument("--global_timing", action="store_true")
args = parser.parse_args()

run_num = args.run_num
params = YParams(os.path.abspath(args.yaml_config), args.config)
params.distributed = True

# get env variables
if args.comm_mode == "openmpi-nccl":
    # use pmix server address: only works for single node
    addrport = os.getenv("PMIX_SERVER_URI2").split("//")[1]
    comm_addr = addrport.split(":")[0]
    comm_rank = int(os.getenv('OMPI_COMM_WORLD_RANK', 0))
    comm_size = int(os.getenv("OMPI_COMM_WORLD_SIZE", 0))
elif args.comm_mode == "slurm-nccl":
    comm_addr = os.getenv("SLURM_SRUN_COMM_HOST")
    comm_size = int(os.getenv("SLURM_NTASKS"))
    comm_rank = int(os.getenv("PMI_RANK"))

# common stuff
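# The listing is truncated at "# common stuff". Based on the other scripts in this
# section, a minimal sketch of how these variables are typically consumed, assuming
# torch and torch.distributed (as dist) are imported; the port value mirrors the
# staging script above and is an assumption here:
comm_local_rank = comm_rank % torch.cuda.device_count()
os.environ["MASTER_ADDR"] = comm_addr
os.environ["MASTER_PORT"] = "29500"
dist.init_process_group(backend="nccl", rank=comm_rank, world_size=comm_size)
device = torch.device("cuda:{}".format(comm_local_rank))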