Example #1
def entry(rank, world_size, config, resume, only_validation):
    torch.manual_seed(config["meta"]["seed"])  # For both CPU and GPU
    np.random.seed(config["meta"]["seed"])
    random.seed(config["meta"]["seed"])

    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "1111"  # A fixed local port; it must be free and shared by all ranks

    # Initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # The DistributedSampler splits the dataset into disjoint per-process shards, so each
    # GPU only sees its own portion. With "sampler=None, shuffle=True", every GPU would
    # instead iterate over the entire dataset.
    train_dataloader = DataLoader(
        dataset=initialize_module(config["train_dataset"]["path"], args=config["train_dataset"]["args"]),
        **config["train_dataset"]["dataloader"],
    )

    valid_dataloader = DataLoader(
        dataset=initialize_module(config["validation_dataset"]["path"], args=config["validation_dataset"]["args"]),
        num_workers=0,
        batch_size=1
    )

    model = initialize_module(config["model"]["path"], args=config["model"]["args"])

    optimizer = torch.optim.Adam(
        params=model.parameters(),
        lr=config["optimizer"]["lr"],
        betas=(config["optimizer"]["beta1"], config["optimizer"]["beta2"])
    )

    loss_function = getattr(loss, config["loss_function"]["name"])(**config["loss_function"]["args"])
    trainer_class = initialize_module(config["trainer"]["path"], initialize=False)

    trainer = trainer_class(
        dist=dist,
        rank=rank,
        config=config,
        resume=resume,
        only_validation=only_validation,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
        train_dataloader=train_dataloader,
        validation_dataloader=valid_dataloader
    )

    trainer.train()
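
Every example here builds its objects through initialize_module, whose definition is not shown in these snippets. A minimal sketch of such a helper, assuming it resolves a dotted "package.module.Name" path taken from the config and either returns the attribute itself (initialize=False) or an instance constructed from the args dict:

import importlib

def initialize_module(path, args=None, initialize=True):
    # Resolve a dotted path such as "package.module.ClassName".
    module_path, _, attr_name = path.rpartition(".")
    module = importlib.import_module(module_path)
    class_or_function = getattr(module, attr_name)

    # Either instantiate/call it with the keyword arguments from the config,
    # or hand back the class/function object untouched.
    if initialize:
        return class_or_function(**(args or {}))
    return class_or_function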
Example #2
def _load_dataloader(dataset_config):
    dataset = initialize_module(dataset_config["path"], args=dataset_config["args"], initialize=True)
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=1,
        num_workers=0,
    )
    return dataloader
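
A brief usage sketch for the helper above. The config mirrors the path/args layout used in the other examples; the dotted path and arguments below are purely illustrative:

# Hypothetical config; the dotted path and arguments are illustrative only.
dataset_config = {
    "path": "dataset.test_dataset.Dataset",
    "args": {"dataset_dir": "~/data/noisy_testset"},
}

dataloader = _load_dataloader(dataset_config)
for batch in dataloader:
    # The structure of each batch depends on the dataset class named in the config.
    pass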
Example #3
def main(config, checkpoint_path, output_dir):
    inferencer_class = initialize_module(config["inferencer"]["path"], initialize=False)
    inferencer = inferencer_class(
        config,
        checkpoint_path,
        output_dir
    )
    inferencer()
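
main only requires that the class referenced by config["inferencer"]["path"] accept (config, checkpoint_path, output_dir) in its constructor and be callable with no arguments. A hypothetical skeleton satisfying that contract:

class Inferencer:
    # Hypothetical skeleton matching the call pattern in Example #3.
    def __init__(self, config, checkpoint_path, output_dir):
        self.config = config
        self.checkpoint_path = checkpoint_path
        self.output_dir = output_dir

    def __call__(self):
        # Load the model from self.checkpoint_path, run it over the configured
        # dataset, and write the results to self.output_dir.
        ...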
Example #4
    def _load_model(model_config, checkpoint_path, device):
        model = initialize_module(model_config["path"], args=model_config["args"], initialize=True)
        model_checkpoint = torch.load(checkpoint_path, map_location=device)
        model_state_dict = model_checkpoint["model"]
        epoch = model_checkpoint["epoch"]
        print(f"Loading the model checkpoint (tar format) saved at epoch {epoch}.")

        model.load_state_dict(model_state_dict)
        model.to(device)
        model.eval()  # Inference mode: disables dropout and uses running BatchNorm statistics
        return model, epoch
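
A usage sketch for _load_model, assuming it is exposed as a static helper and that model_config follows the same path/args layout as the other examples. The dotted path and file locations below are hypothetical:

import torch

model_config = {
    "path": "model.crn.CRN",  # Hypothetical dotted path to a model class
    "args": {},
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model, epoch = _load_model(model_config, "checkpoints/best_model.tar", device)
print(f"Restored model weights from epoch {epoch}.")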