Example #1
def test_net_arch():
    hp = load_hparam("config/default.yaml")
    net = Net_arch(hp)

    # TODO: This is example code. Change this part as you need; it can be implemented as the network's forward pass.
    x = torch.rand(64, 10)
    x = net.fc1(x)
    assert x.shape == (64, 10)
    x = net.fc2(x)
    assert x.shape == (64, 1)
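Neither load_hparam nor Net_arch is shown in this snippet. A minimal sketch of a Net_arch that would satisfy the asserted shapes (10 -> 10 -> 1) might look like the following; the layer sizes are inferred from the test and are assumptions, not the project's actual definition.

import torch
import torch.nn as nn


class Net_arch(nn.Module):
    # Minimal sketch: two fully connected layers whose output shapes
    # match the asserts above: (batch, 10) -> (batch, 10) -> (batch, 1).
    def __init__(self, hp):
        super().__init__()
        self.hp = hp  # kept for parity with the snippet, unused in this sketch
        self.fc1 = nn.Linear(10, 10)
        self.fc2 = nn.Linear(10, 1)

    def forward(self, x):
        return self.fc2(self.fc1(x))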
Example #2
def test_net_arch():
    hp = load_hparam("config/default.yaml")
    net = Net_arch(hp)

    # TODO: This is example code. Change this part as you need; it can be implemented as the network's forward pass.
    x = torch.rand(8, 1, 28, 28)
    x = net.conv1(x)  # x: (B,4,14,14)
    assert x.shape == (8, 4, 14, 14)
    x = net.conv2(x)  # x: (B,4,7,7)
    assert x.shape == (8, 4, 7, 7)
    x = torch.flatten(x, 1)  # x: (B,4*7*7)
    assert x.shape == (8, 4 * 7 * 7)
    x = net.fc(x)  # x: (B,10)
    assert x.shape == (8, 10)
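The test only constrains shapes, so one possible Net_arch sketch uses two stride-2 convolutions (28x28 -> 14x14 -> 7x7) followed by a linear head. Kernel sizes, padding, and channel counts below are assumptions chosen to satisfy the asserts, not the project's actual layers.

import torch
import torch.nn as nn


class Net_arch(nn.Module):
    # Sketch: conv1 halves 28x28 to 14x14, conv2 halves 14x14 to 7x7,
    # fc maps the flattened 4*7*7 features to 10 outputs, as asserted above.
    def __init__(self, hp):
        super().__init__()
        self.hp = hp
        self.conv1 = nn.Conv2d(1, 4, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(4, 4, kernel_size=3, stride=2, padding=1)
        self.fc = nn.Linear(4 * 7 * 7, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = torch.flatten(x, 1)
        return self.fc(x)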
Example #3
def train_loop(hp, logger, writer):
    # make dataloader
    logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train)
    logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.MSELoss()
    model = Model(hp, net_arch, loss_f)

    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    else:
        logger.info("Starting new training run.")

    try:
        for model.epoch in itertools.count(model.epoch + 1):
            if model.epoch > hp.train.num_iter:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer)
        logger.info("End of Train")
    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
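train_model and test_model are not part of this snippet. A hypothetical train_model, consistent with how the loop above uses it (it trains model.net with model.loss_f and advances model.step), could look like this; the model.optimizer attribute is an assumption.

def train_model(hp, model, train_loader, writer, logger):
    # Hypothetical sketch, not the project's implementation.
    # writer is accepted for signature parity; logging to it is omitted here.
    model.net.train()
    for data, target in train_loader:
        output = model.net(data)
        loss = model.loss_f(output, target)
        model.optimizer.zero_grad()  # optimizer attribute assumed to exist on Model
        loss.backward()
        model.optimizer.step()
        model.step += 1
    if logger is not None:
        logger.info("Finished training epoch %d (step %d)" % (model.epoch, model.step))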
Example #4
def test_net_arch():
    os.makedirs(TEST_DIR, exist_ok=True)
    with initialize(config_path="../../config"):
        cfg = compose(config_name="default",
                      overrides=[f"working_dir={TEST_DIR}"])

    net = Net_arch(cfg)

    # TODO: This is example code. Change this part as you need; it can be implemented as the network's forward pass.
    x = torch.rand(8, 1, 28, 28)
    x = net.conv1(x)  # x: (B,4,14,14)
    assert x.shape == (8, 4, 14, 14)
    x = net.conv2(x)  # x: (B,4,7,7)
    assert x.shape == (8, 4, 7, 7)
    x = torch.flatten(x, 1)  # x: (B,4*7*7)
    assert x.shape == (8, 4 * 7 * 7)
    x = net.fc(x)  # x: (B,10)
    assert x.shape == (8, 10)
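This variant composes the config through Hydra instead of load_hparam. The snippet assumes a module-level TEST_DIR plus the usual imports; a preamble along these lines would make it runnable (the directory name and the Net_arch import path are assumptions).

import os

import torch
from hydra import compose, initialize

from model.model_arch import Net_arch  # import path is an assumption

TEST_DIR = "test_output"  # scratch working_dir for the composed config; name is arbitrary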
Example #5
    def test_save_load_network(self):
        local_net = Net_arch(self.cfg)
        self.loss_f = nn.MSELoss()
        local_model = Model(self.cfg, local_net, self.loss_f)

        self.model.save_network()
        save_filename = "%s_%d.pt" % (self.cfg.name, self.model.step)
        save_path = os.path.join(self.cfg.log.chkpt_dir, save_filename)
        self.cfg.load.network_chkpt_path = save_path

        assert os.path.exists(save_path) and os.path.isfile(save_path)

        local_model.load_network()
        parameters = zip(list(local_model.net.parameters()),
                         list(self.model.net.parameters()))
        for load, origin in parameters:
            assert (load == origin).all()
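save_network and load_network belong to the Model class, which is not shown. A minimal sketch of the pair, assuming checkpoints are plain state_dicts written with torch.save (the real methods may add rank checks and remote logging):

import os

import torch


def save_network(model, cfg):
    # Sketch: write only the network weights, named "{name}_{step}.pt"
    # to match the path the test above reconstructs.
    save_filename = "%s_%d.pt" % (cfg.name, model.step)
    save_path = os.path.join(cfg.log.chkpt_dir, save_filename)
    torch.save(model.net.state_dict(), save_path)


def load_network(model, cfg):
    # Sketch: load the weights from cfg.load.network_chkpt_path back into the net.
    state_dict = torch.load(cfg.load.network_chkpt_path, map_location="cpu")
    model.net.load_state_dict(state_dict)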
Example #6
    def test_save_load_state(self):
        local_net = Net_arch(self.hp)
        self.loss_f = nn.MSELoss()
        local_model = Model(self.hp, local_net, self.loss_f)

        self.model.save_training_state(self.logger)
        save_filename = "%s_%d.state" % (self.hp.log.name, self.model.step)
        save_path = os.path.join(self.hp.log.chkpt_dir, save_filename)
        self.hp.load.resume_state_path = save_path

        assert os.path.exists(save_path) and os.path.isfile(save_path)
        assert os.path.exists(self.hp.log.log_file_path) and os.path.isfile(
            self.hp.log.log_file_path)

        local_model.load_training_state(logger=self.logger)
        parameters = zip(list(local_model.net.parameters()),
                         list(self.model.net.parameters()))
        for load, origin in parameters:
            assert (load == origin).all()
        assert local_model.epoch == self.model.epoch
        assert local_model.step == self.model.step
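Unlike the .pt file above, the .state checkpoint has to carry enough to resume training, since the test also compares epoch and step. A sketch of what save_training_state / load_training_state might persist, assuming a dict bundling weights, optimizer state, and the counters:

import os

import torch


def save_training_state(model, hp):
    # Sketch: bundle everything needed to resume, named "{name}_{step}.state".
    state = {
        "model": model.net.state_dict(),
        "optimizer": model.optimizer.state_dict(),  # optimizer attribute is assumed
        "step": model.step,
        "epoch": model.epoch,
    }
    torch.save(state, os.path.join(hp.log.chkpt_dir,
                                   "%s_%d.state" % (hp.log.name, model.step)))


def load_training_state(model, hp):
    # Sketch: restore weights, optimizer state, and counters from the .state file.
    state = torch.load(hp.load.resume_state_path, map_location="cpu")
    model.net.load_state_dict(state["model"])
    model.optimizer.load_state_dict(state["optimizer"])
    model.step = state["step"]
    model.epoch = state["epoch"]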
Example #7
def train_loop(rank, hp, world_size=1):
    # reload hp
    hp = DotDict(hp)
    if hp.model.device.lower() == "cuda" and world_size != 0:
        setup(hp, rank, world_size)
    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, hp.log.log_dir)
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")

    if hp.model.device.lower() == "cuda" and world_size != 0:
        hp.model.device = rank
        torch.cuda.set_device(rank)
    else:
        hp.model.device = hp.model.device.lower()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank,
                                     world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.MSELoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        epoch_step = 1 if hp.data.divide_dataset_per_gpu else world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_iter:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer)
        cleanup()
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
        cleanup()
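setup and cleanup are not shown. In a typical torch.distributed workflow they initialize and tear down the default process group, roughly as below; the backend choice and the master address/port are assumptions.

import os

import torch.distributed as dist


def setup(hp, rank, world_size):
    # Sketch: initialize the default process group so DDP can be used.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)


def cleanup():
    # Sketch: destroy the process group once training ends.
    if dist.is_initialized():
        dist.destroy_process_group()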
Example #8
    def setup_method(self, method):
        super(TestModel, self).setup_method()
        self.net = Net_arch(self.cfg)
        self.loss_f = nn.CrossEntropyLoss()
        self.model = Model(self.cfg, self.net, self.loss_f)
Example #9
def train_loop(rank, cfg):
    logger = get_logger(cfg, os.path.basename(__file__))
    if cfg.device == "cuda" and cfg.dist.gpus != 0:
        cfg.device = rank
        # turn off background generator when distributed run is on
        cfg.data.use_background_generator = False
        setup(cfg, rank)
        torch.cuda.set_device(cfg.device)

    # setup writer
    if is_logging_process():
        # set log/checkpoint dir
        os.makedirs(cfg.log.chkpt_dir, exist_ok=True)
        # set writer (tensorboard / wandb)
        writer = Writer(cfg, "tensorboard")
        cfg_str = OmegaConf.to_yaml(cfg)
        logger.info("Config:\n" + cfg_str)
        if cfg.data.train_dir == "" or cfg.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")
        logger.info("BackgroundGenerator is turned off when Distributed running is on")

        # download MNIST dataset before making dataloader
        # TODO: This is example code. You should change this part as you need
        _ = torchvision.datasets.MNIST(
            root=hydra.utils.to_absolute_path("dataset/meta"),
            train=True,
            transform=torchvision.transforms.ToTensor(),
            download=True,
        )
        _ = torchvision.datasets.MNIST(
            root=hydra.utils.to_absolute_path("dataset/meta"),
            train=False,
            transform=torchvision.transforms.ToTensor(),
            download=True,
        )
    # Sync dist processes (because of download MNIST Dataset)
    if cfg.dist.gpus != 0:
        dist.barrier()

    # make dataloader
    if is_logging_process():
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(cfg, DataloaderMode.train, rank)
    if is_logging_process():
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(cfg, DataloaderMode.test, rank)

    # init Model
    net_arch = Net_arch(cfg)
    loss_f = torch.nn.CrossEntropyLoss()
    model = Model(cfg, net_arch, loss_f, rank)

    # load training state / network checkpoint
    if cfg.load.resume_state_path is not None:
        model.load_training_state()
    elif cfg.load.network_chkpt_path is not None:
        model.load_network()
    else:
        if is_logging_process():
            logger.info("Starting new training run.")

    try:
        if cfg.dist.gpus == 0 or cfg.data.divide_dataset_per_gpu:
            epoch_step = 1
        else:
            epoch_step = cfg.dist.gpus
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > cfg.num_epoch:
                break
            train_model(cfg, model, train_loader, writer)
            if model.epoch % cfg.log.chkpt_interval == 0:
                model.save_network()
                model.save_training_state()
            test_model(cfg, model, test_loader, writer)
        if is_logging_process():
            logger.info("End of Train")
    except Exception as e:
        if is_logging_process():
            logger.error(traceback.format_exc())
        else:
            traceback.print_exc()
    finally:
        if cfg.dist.gpus != 0:
            cleanup()
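This Hydra-based loop expects to be spawned once per GPU. A hypothetical entry point that composes the config with @hydra.main and launches the train_loop above (assumed to live in the same module) through torch.multiprocessing could look like this; the config path/name and the single-process fallback are assumptions.

import hydra
import torch.multiprocessing as mp


@hydra.main(config_path="config", config_name="default")
def main(cfg):
    # Sketch: run in-process when DDP is not requested, otherwise spawn
    # one worker per GPU; mp.spawn passes the rank as the first argument.
    if cfg.dist.gpus == 0 or cfg.device != "cuda":
        train_loop(0, cfg)
    else:
        mp.spawn(train_loop, args=(cfg,), nprocs=cfg.dist.gpus, join=True)


if __name__ == "__main__":
    main()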
Example #10
def train_loop(rank, hp, world_size=0):
    if hp.model.device == "cuda" and world_size != 0:
        hp.model.device = rank
        # turn off background generator when distributed run is on
        hp.data.use_background_generator = False
        setup(hp, rank, world_size)
        torch.cuda.set_device(hp.model.device)

    # setup logger / writer
    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, os.path.join(hp.log.log_dir, "tensorboard"))
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")
        logger.info(
            "BackgroundGenerator is turned off when Distributed running is on")

        # download MNIST dataset before making dataloader
        # TODO: This is example code. You should change this part as you need
        _ = torchvision.datasets.MNIST(
            root="dataset/meta",
            train=True,
            transform=torchvision.transforms.ToTensor(),
            download=True,
        )
        _ = torchvision.datasets.MNIST(
            root="dataset/meta",
            train=False,
            transform=torchvision.transforms.ToTensor(),
            download=True,
        )
    # Sync dist processes (because of download MNIST Dataset)
    if world_size != 0:
        dist.barrier()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank,
                                     world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.CrossEntropyLoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state / network checkpoint
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    elif hp.load.network_chkpt_path is not None:
        model.load_network(logger=logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        if world_size == 0 or hp.data.divide_dataset_per_gpu:
            epoch_step = 1
        else:
            epoch_step = world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_epoch:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer, logger)
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.error(traceback.format_exc())
        else:
            traceback.print_exc()
    finally:
        if world_size != 0:
            cleanup()
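Writer is a thin project-specific wrapper around the logging backend; a minimal sketch backed by TensorBoard's SummaryWriter is below. The method names are assumptions, and the real wrapper reportedly also supports wandb (only the TensorBoard path is sketched).

from torch.utils.tensorboard import SummaryWriter


class Writer:
    # Sketch: forward scalar logging to TensorBoard.
    def __init__(self, hp, log_dir):
        self.hp = hp
        self.tensorboard = SummaryWriter(log_dir=log_dir)

    def add_scalar(self, tag, value, step):
        # Hypothetical method name; adjust to the project's actual interface.
        self.tensorboard.add_scalar(tag, value, step)

    def close(self):
        self.tensorboard.close()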
Example #11
    def setup_method(self, method):
        super(TestModel, self).setup_method()
        self.net = Net_arch(self.hp)
        self.loss_f = nn.MSELoss()
        self.model = Model(self.hp, self.net, self.loss_f)