def init_logger(self):
    # Only rank 0 prints log messages.
    print_ranks = [0]

    # Training logger: iteration counter, running-average loss, and step latency.
    self.train_logger = log.make_logger(self.rank, print_ranks)
    self.train_logger.register_metric("iter", log.IterationMeter(), "iter: {}/{}")
    self.train_logger.register_metric(
        "loss", log.AverageMeter(), "loss: {:.16f}", True
    )
    self.train_logger.register_metric(
        "latency", log.LatencyMeter(), "latency(ms): {:.16f}", True
    )

    # Validation logger: iteration counter and evaluation AUC.
    self.val_logger = log.make_logger(self.rank, print_ranks)
    self.val_logger.register_metric("iter", log.IterationMeter(), "iter: {}/{}")
    self.val_logger.register_metric("auc", log.IterationMeter(), "eval_auc: {}")
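# A minimal sketch (an assumption, not this project's actual code) of the meter
# objects passed to register_metric above: each meter is assumed to expose an
# update() method plus a running value that the format string (e.g. "loss: {:.16f}")
# consumes when the logger prints.
class AverageMeterSketch:
    """Hypothetical running-average meter with an assumed update()/avg interface."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        # Accumulate one observation, e.g. a per-iteration loss.
        self.sum += value * n
        self.count += n

    @property
    def avg(self):
        # Running mean of everything seen so far; 0.0 before the first update.
        return self.sum / self.count if self.count else 0.0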
def setup_method(self):
    # set log/checkpoint dir
    self.TEST_DIR = pathlib.Path(TEST_DIR)
    self.log_dir = (self.TEST_DIR / "logs").resolve()
    self.chkpt_dir = (self.TEST_DIR / "chkpt").resolve()
    os.makedirs(self.TEST_DIR, exist_ok=True)
    os.makedirs(self.log_dir, exist_ok=True)
    os.makedirs(self.chkpt_dir, exist_ok=True)

    # set hp
    self.hp = load_hparam("config/default.yaml")
    self.hp.model.device = "cpu"
    self.hp.log.log_dir = self.log_dir
    self.hp.log.chkpt_dir = self.chkpt_dir
    self.hp.log.use_wandb = False
    self.hp.log.use_tensorboard = False

    # set logger
    self.logger = make_logger(self.hp)
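# A companion teardown sketch, assuming the pytest setup_method/teardown_method
# convention used above. It removes the temporary log/checkpoint tree so each
# test starts from a clean directory; TEST_DIR is the same module-level constant
# assumed by setup_method.
import shutil

def teardown_method(self):
    # Delete everything created under TEST_DIR during the test.
    shutil.rmtree(self.TEST_DIR, ignore_errors=True)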
# Training configuration
d_tar = 50.0
lr = 0.01
num_epoch = 10

loss_fn = CrossEntropyLoss()

# Fine-tune the whole pretrained model.
model = VGG_Face_PubFig(saved=True)
for param in model.parameters():
    param.requires_grad = True
model.train()

# Register a forward pre-hook on the fc8 layer (`hook` is defined elsewhere in the script).
model[1].fc8.register_forward_pre_hook(hook)

optimizer = SGD(model.parameters(), lr=lr, weight_decay=1e-5)

logger = make_logger('aug_face')
logger.info('config\nlamb : {}\nd_tar : {}\nlr : {}\nnum_epoch : {}\n'
            .format(lamb, d_tar, lr, num_epoch))

print('attack validation')
# val_model(model, test_loader)
print('''val Acc: 96.3077%\ncorrect : 626, total : 650\n''')
print('original prediction rate')
# val_model(model, orig_loader)
print('''val Acc: 98.6154%\ncorrect : 641, total : 650\n''')

for epoch in range(num_epoch):
    epoch_log = 'Epoch {}/{}'.format(epoch + 1, num_epoch) + '\n'
    epoch_log += '-' * 30 + '\n'
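# A hedged sketch of what the forward pre-hook registered on model[1].fc8 might
# look like; the script's real `hook` is defined elsewhere. With PyTorch's
# register_forward_pre_hook, the hook receives (module, inputs) just before
# fc8.forward() runs, so a hypothetical version could simply record the incoming
# features for a later distance/penalty term.
captured_features = {}

def hook(module, inputs):
    # `inputs` is the tuple of tensors about to be passed to fc8.forward().
    captured_features["fc8_input"] = inputs[0].detach()
    # Returning None leaves the forward pass unchanged.
    return None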
def train_loop(rank, hp, world_size=1):
    # reload hp
    hp = DotDict(hp)
    if hp.model.device.lower() == "cuda" and world_size != 0:
        setup(hp, rank, world_size)

    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, hp.log.log_dir)
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")

    if hp.model.device.lower() == "cuda" and world_size != 0:
        hp.model.device = rank
        torch.cuda.set_device(rank)
    else:
        hp.model.device = hp.model.device.lower()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank, world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.MSELoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        epoch_step = 1 if hp.data.divide_dataset_per_gpu else world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_iter:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer)
        cleanup()
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
        cleanup()
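# A hedged sketch of the setup()/cleanup() helpers the train_loop variants rely
# on, assuming the standard torch.distributed process-group pattern; the
# project's own helpers may take the rendezvous address/port from hp instead of
# the placeholder environment defaults used here.
import os
import torch.distributed as dist

def setup(hp, rank, world_size):
    # Assumed rendezvous settings (hypothetical defaults).
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()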
def train_loop(rank, hp, world_size=0):
    if hp.model.device == "cuda" and world_size != 0:
        hp.model.device = rank
        # turn off background generator when distributed run is on
        hp.data.use_background_generator = False
        setup(hp, rank, world_size)
        torch.cuda.set_device(hp.model.device)

    # setup logger / writer
    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, os.path.join(hp.log.log_dir, "tensorboard"))
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")
        logger.info(
            "BackgroundGenerator is turned off when Distributed running is on"
        )

    # download MNIST dataset before making dataloader
    # TODO: This is example code. You should change this part as you need
    _ = torchvision.datasets.MNIST(
        root="dataset/meta",
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    _ = torchvision.datasets.MNIST(
        root="dataset/meta",
        train=False,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    # Sync dist processes (because of download MNIST Dataset)
    if world_size != 0:
        dist.barrier()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank, world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.CrossEntropyLoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state / network checkpoint
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    elif hp.load.network_chkpt_path is not None:
        model.load_network(logger=logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        if world_size == 0 or hp.data.divide_dataset_per_gpu:
            epoch_step = 1
        else:
            epoch_step = world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_epoch:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer, logger)
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.error(traceback.format_exc())
        else:
            traceback.print_exc()
    finally:
        if world_size != 0:
            cleanup()
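# A hedged sketch of how this train_loop could be launched, assuming the common
# torch.multiprocessing.spawn pattern (one process per GPU). The `launch` name
# and the world_size=0 single-process convention follow the checks inside
# train_loop above; the surrounding project may wire this up differently.
import torch
import torch.multiprocessing as mp

def launch(hp):
    if hp.model.device == "cuda" and torch.cuda.is_available():
        world_size = torch.cuda.device_count()
        # spawn passes the process index (rank) as the first positional argument.
        mp.spawn(train_loop, args=(hp, world_size), nprocs=world_size, join=True)
    else:
        # Non-distributed run: world_size=0 skips setup()/cleanup() in train_loop.
        train_loop(0, hp, world_size=0)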