def entry(rank, world_size, config, resume, only_validation):
    """Per-process entry point for distributed training.

    Seeds all RNGs, joins the ``gloo`` process group, builds the dataloaders,
    model, optimizer and loss from *config*, then hands everything to the
    configured trainer class and starts training.

    Args:
        rank: Rank of this process within the process group.
        world_size: Total number of processes in the group.
        config: Parsed configuration dict (dataset/model/optimizer/trainer sections).
        resume: Whether the trainer should resume from a checkpoint.
        only_validation: Whether the trainer should skip training and only validate.
    """
    # Seed every RNG source so all processes start from identical state.
    torch.manual_seed(config["meta"]["seed"])  # Covers both CPU and GPU
    np.random.seed(config["meta"]["seed"])
    random.seed(config["meta"]["seed"])

    os.environ["MASTER_ADDR"] = "localhost"
    # All ranks must agree on the rendezvous port, so it is fixed rather than
    # random. (A previous revision bound a throwaway socket to an ephemeral
    # port here and then neither used nor closed it — a fd leak; removed.)
    os.environ["MASTER_PORT"] = "1111"

    # Initialize the process group.
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # The DistributedSampler (expected in the dataloader kwargs) splits the
    # dataset across processes. With "sampler=None, shuffle=True" every GPU
    # would instead see the whole dataset.
    train_dataloader = DataLoader(
        dataset=initialize_module(config["train_dataset"]["path"], args=config["train_dataset"]["args"]),
        **config["train_dataset"]["dataloader"],
    )

    # Validation runs single-sample, single-process.
    valid_dataloader = DataLoader(
        dataset=initialize_module(config["validation_dataset"]["path"], args=config["validation_dataset"]["args"]),
        num_workers=0,
        batch_size=1,
    )

    model = initialize_module(config["model"]["path"], args=config["model"]["args"])

    optimizer = torch.optim.Adam(
        params=model.parameters(),
        lr=config["optimizer"]["lr"],
        betas=(config["optimizer"]["beta1"], config["optimizer"]["beta2"]),
    )

    # Loss classes live in the project-local `loss` module; name + kwargs
    # come straight from the config.
    loss_function = getattr(loss, config["loss_function"]["name"])(**config["loss_function"]["args"])

    # The trainer is resolved as a class (initialize=False) and constructed here.
    trainer_class = initialize_module(config["trainer"]["path"], initialize=False)

    trainer = trainer_class(
        dist=dist,
        rank=rank,
        config=config,
        resume=resume,
        only_validation=only_validation,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
        train_dataloader=train_dataloader,
        validation_dataloader=valid_dataloader,
    )

    trainer.train()
def _load_dataloader(dataset_config):
    """Build a single-process, batch-size-1 DataLoader for the configured dataset.

    Args:
        dataset_config: Dict with "path" (dotted class path) and "args"
            (constructor kwargs) for the dataset.

    Returns:
        A DataLoader iterating the dataset one sample at a time.
    """
    ds = initialize_module(dataset_config["path"], args=dataset_config["args"], initialize=True)
    return DataLoader(dataset=ds, batch_size=1, num_workers=0)
def main(config, checkpoint_path, output_dir):
    """Resolve the configured inferencer class, instantiate it, and run it.

    Args:
        config: Parsed configuration dict; "inferencer.path" names the class.
        checkpoint_path: Path to the model checkpoint to load.
        output_dir: Directory where inference results are written.
    """
    # initialize=False returns the class itself so it can be constructed here.
    inferencer_cls = initialize_module(config["inferencer"]["path"], initialize=False)
    inferencer_cls(config, checkpoint_path, output_dir)()
def _load_model(model_config, checkpoint_path, device):
    """Instantiate the configured model and restore its weights from a checkpoint.

    Args:
        model_config: Dict with "path" (dotted class path) and "args"
            (constructor kwargs) for the model.
        checkpoint_path: Path to a .tar checkpoint containing "model" (state
            dict) and "epoch" entries.
        device: Device the checkpoint is mapped to and the model moved onto.

    Returns:
        Tuple of (model in eval mode on *device*, checkpoint epoch).
    """
    model = initialize_module(model_config["path"], args=model_config["args"], initialize=True)

    checkpoint = torch.load(checkpoint_path, map_location=device)
    state_dict = checkpoint["model"]
    epoch = checkpoint["epoch"]
    print(f"当前正在处理 tar 格式的模型断点,其 epoch 为:{epoch}.")

    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model, epoch