def test_net_arch():
    hp = load_hparam("config/default.yaml")
    net = Net_arch(hp)
    # TODO: This is example code. You should change this part as you need. You can code this part as forward
    x = torch.rand(64, 10)
    x = net.fc1(x)
    assert x.shape == (64, 10)
    x = net.fc2(x)
    assert x.shape == (64, 1)
def test_net_arch():
    hp = load_hparam("config/default.yaml")
    net = Net_arch(hp)
    # TODO: This is example code. You should change this part as you need. You can code this part as forward
    x = torch.rand(8, 1, 28, 28)
    x = net.conv1(x)  # x: (B,4,14,14)
    assert x.shape == (8, 4, 14, 14)
    x = net.conv2(x)  # x: (B,4,7,7)
    assert x.shape == (8, 4, 7, 7)
    x = torch.flatten(x, 1)  # x: (B,4*7*7)
    assert x.shape == (8, 4 * 7 * 7)
    x = net.fc(x)  # x: (B,10)
    assert x.shape == (8, 10)
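For orientation, here is a minimal Net_arch sketch that would satisfy the shape assertions in this test. The specific layers (3x3 convolutions followed by max pooling) and the class internals are assumptions for illustration; any architecture producing the same intermediate shapes would pass.

# Hypothetical Net_arch consistent with the assertions above:
# (B,1,28,28) -> conv1 -> (B,4,14,14) -> conv2 -> (B,4,7,7) -> flatten -> fc -> (B,10)
import torch
import torch.nn as nn


class Net_arch(nn.Module):
    def __init__(self, hp):
        super().__init__()
        self.hp = hp
        # (B,1,28,28) -> (B,4,14,14)
        self.conv1 = nn.Sequential(nn.Conv2d(1, 4, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2))
        # (B,4,14,14) -> (B,4,7,7)
        self.conv2 = nn.Sequential(nn.Conv2d(4, 4, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2))
        # (B,4*7*7) -> (B,10)
        self.fc = nn.Linear(4 * 7 * 7, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = torch.flatten(x, 1)
        return self.fc(x)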
def train_loop(hp, logger, writer):
    # make dataloader
    logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train)
    logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.MSELoss()
    model = Model(hp, net_arch, loss_f)

    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    else:
        logger.info("Starting new training run.")

    try:
        for model.epoch in itertools.count(model.epoch + 1):
            if model.epoch > hp.train.num_iter:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer)
        logger.info("End of Train")
    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
def test_net_arch():
    os.makedirs(TEST_DIR, exist_ok=True)
    with initialize(config_path="../../config"):
        cfg = compose(config_name="default", overrides=[f"working_dir={TEST_DIR}"])
    net = Net_arch(cfg)
    # TODO: This is example code. You should change this part as you need. You can code this part as forward
    x = torch.rand(8, 1, 28, 28)
    x = net.conv1(x)  # x: (B,4,14,14)
    assert x.shape == (8, 4, 14, 14)
    x = net.conv2(x)  # x: (B,4,7,7)
    assert x.shape == (8, 4, 7, 7)
    x = torch.flatten(x, 1)  # x: (B,4*7*7)
    assert x.shape == (8, 4 * 7 * 7)
    x = net.fc(x)  # x: (B,10)
    assert x.shape == (8, 10)
def test_save_load_network(self):
    local_net = Net_arch(self.cfg)
    self.loss_f = nn.MSELoss()
    local_model = Model(self.cfg, local_net, self.loss_f)

    self.model.save_network()
    save_filename = "%s_%d.pt" % (self.cfg.name, self.model.step)
    save_path = os.path.join(self.cfg.log.chkpt_dir, save_filename)
    self.cfg.load.network_chkpt_path = save_path

    assert os.path.exists(save_path) and os.path.isfile(save_path)

    local_model.load_network()
    parameters = zip(
        list(local_model.net.parameters()), list(self.model.net.parameters())
    )
    for load, origin in parameters:
        assert (load == origin).all()
def test_save_load_state(self):
    local_net = Net_arch(self.hp)
    self.loss_f = nn.MSELoss()
    local_model = Model(self.hp, local_net, self.loss_f)

    self.model.save_training_state(self.logger)
    save_filename = "%s_%d.state" % (self.hp.log.name, self.model.step)
    save_path = os.path.join(self.hp.log.chkpt_dir, save_filename)
    self.hp.load.resume_state_path = save_path

    assert os.path.exists(save_path) and os.path.isfile(save_path)
    assert os.path.exists(self.hp.log.log_file_path) and os.path.isfile(
        self.hp.log.log_file_path
    )

    local_model.load_training_state(logger=self.logger)
    parameters = zip(
        list(local_model.net.parameters()), list(self.model.net.parameters())
    )
    for load, origin in parameters:
        assert (load == origin).all()
    assert local_model.epoch == self.model.epoch
    assert local_model.step == self.model.step
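This test pins down the checkpoint contract rather than the implementation: a "<log.name>_<step>.state" file in log.chkpt_dir that carries the network weights together with the current epoch and step, so a fresh Model can restore all three. The sketch below is a hypothetical stand-in for those Model methods under that contract; the key names, the free-function form, and the omission of optimizer state are assumptions, not the template's actual code.

# Hypothetical save/load helpers matching the contract exercised by the test above.
import os
import torch


def save_training_state(model, hp, logger=None):
    save_filename = "%s_%d.state" % (hp.log.name, model.step)
    save_path = os.path.join(hp.log.chkpt_dir, save_filename)
    state = {
        "model": model.net.state_dict(),
        "step": model.step,
        "epoch": model.epoch,
        # a real implementation would typically also store optimizer state here
    }
    torch.save(state, save_path)
    if logger is not None:
        logger.info("Saved training state: %s" % save_path)


def load_training_state(model, hp, logger=None):
    state = torch.load(hp.load.resume_state_path, map_location="cpu")
    model.net.load_state_dict(state["model"])
    model.step = state["step"]
    model.epoch = state["epoch"]
    if logger is not None:
        logger.info("Resumed training state: %s" % hp.load.resume_state_path)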
def train_loop(rank, hp, world_size=1):
    # reload hp
    hp = DotDict(hp)

    if hp.model.device.lower() == "cuda" and world_size != 0:
        setup(hp, rank, world_size)

    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, hp.log.log_dir)
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")

    if hp.model.device.lower() == "cuda" and world_size != 0:
        hp.model.device = rank
        torch.cuda.set_device(rank)
    else:
        hp.model.device = hp.model.device.lower()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank, world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.MSELoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        epoch_step = 1 if hp.data.divide_dataset_per_gpu else world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_iter:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer)
        cleanup()
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
        cleanup()
def setup_method(self, method):
    super(TestModel, self).setup_method()
    self.net = Net_arch(self.cfg)
    self.loss_f = nn.CrossEntropyLoss()
    self.model = Model(self.cfg, self.net, self.loss_f)
def train_loop(rank, cfg):
    logger = get_logger(cfg, os.path.basename(__file__))
    if cfg.device == "cuda" and cfg.dist.gpus != 0:
        cfg.device = rank
        # turn off background generator when distributed run is on
        cfg.data.use_background_generator = False
        setup(cfg, rank)
        torch.cuda.set_device(cfg.device)

    # setup writer
    if is_logging_process():
        # set log/checkpoint dir
        os.makedirs(cfg.log.chkpt_dir, exist_ok=True)
        # set writer (tensorboard / wandb)
        writer = Writer(cfg, "tensorboard")
        cfg_str = OmegaConf.to_yaml(cfg)
        logger.info("Config:\n" + cfg_str)
        if cfg.data.train_dir == "" or cfg.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")
        logger.info("BackgroundGenerator is turned off when Distributed running is on")
    else:
        # non-logging processes still pass a writer handle around; keep it None
        writer = None

    # download MNIST dataset before making dataloader
    # TODO: This is example code. You should change this part as you need
    _ = torchvision.datasets.MNIST(
        root=hydra.utils.to_absolute_path("dataset/meta"),
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    _ = torchvision.datasets.MNIST(
        root=hydra.utils.to_absolute_path("dataset/meta"),
        train=False,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    # Sync dist processes (because of download MNIST Dataset)
    if cfg.dist.gpus != 0:
        dist.barrier()

    # make dataloader
    if is_logging_process():
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(cfg, DataloaderMode.train, rank)
    if is_logging_process():
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(cfg, DataloaderMode.test, rank)

    # init Model
    net_arch = Net_arch(cfg)
    loss_f = torch.nn.CrossEntropyLoss()
    model = Model(cfg, net_arch, loss_f, rank)

    # load training state / network checkpoint
    if cfg.load.resume_state_path is not None:
        model.load_training_state()
    elif cfg.load.network_chkpt_path is not None:
        model.load_network()
    else:
        if is_logging_process():
            logger.info("Starting new training run.")

    try:
        if cfg.dist.gpus == 0 or cfg.data.divide_dataset_per_gpu:
            epoch_step = 1
        else:
            epoch_step = cfg.dist.gpus
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > cfg.num_epoch:
                break
            train_model(cfg, model, train_loader, writer)
            if model.epoch % cfg.log.chkpt_interval == 0:
                model.save_network()
                model.save_training_state()
            test_model(cfg, model, test_loader, writer)
        if is_logging_process():
            logger.info("End of Train")
    except Exception as e:
        if is_logging_process():
            logger.error(traceback.format_exc())
        else:
            traceback.print_exc()
    finally:
        if cfg.dist.gpus != 0:
            cleanup()
def train_loop(rank, hp, world_size=0):
    if hp.model.device == "cuda" and world_size != 0:
        hp.model.device = rank
        # turn off background generator when distributed run is on
        hp.data.use_background_generator = False
        setup(hp, rank, world_size)
        torch.cuda.set_device(hp.model.device)

    # setup logger / writer
    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, os.path.join(hp.log.log_dir, "tensorboard"))
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")
        logger.info("BackgroundGenerator is turned off when Distributed running is on")

    # download MNIST dataset before making dataloader
    # TODO: This is example code. You should change this part as you need
    _ = torchvision.datasets.MNIST(
        root="dataset/meta",
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    _ = torchvision.datasets.MNIST(
        root="dataset/meta",
        train=False,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    # Sync dist processes (because of download MNIST Dataset)
    if world_size != 0:
        dist.barrier()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank, world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.CrossEntropyLoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state / network checkpoint
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    elif hp.load.network_chkpt_path is not None:
        model.load_network(logger=logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        if world_size == 0 or hp.data.divide_dataset_per_gpu:
            epoch_step = 1
        else:
            epoch_step = world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_epoch:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer, logger)
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.error(traceback.format_exc())
        else:
            traceback.print_exc()
    finally:
        if world_size != 0:
            cleanup()
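This variant of train_loop is written to run as one process per GPU. Below is a minimal launcher sketch, assuming torch.multiprocessing.spawn and a main() entry point; the function name, the way world_size is derived, and how hp reaches the worker processes are assumptions rather than part of the template.

# Hypothetical launcher for the train_loop(rank, hp, world_size) signature above.
# mp.spawn passes the process rank as the first positional argument;
# world_size == 0 keeps the original single-process (non-distributed) behaviour.
import torch
import torch.multiprocessing as mp


def main(hp):
    world_size = torch.cuda.device_count() if hp.model.device == "cuda" else 0
    if world_size != 0:
        # one training process per visible GPU
        mp.spawn(train_loop, args=(hp, world_size), nprocs=world_size, join=True)
    else:
        # CPU / single-device run without torch.distributed
        train_loop(rank=0, hp=hp, world_size=0)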
def setup_method(self, method):
    super(TestModel, self).setup_method()
    self.net = Net_arch(self.hp)
    self.loss_f = nn.MSELoss()
    self.model = Model(self.hp, self.net, self.loss_f)