def get_callbacks(self):
    """Define any callbacks for the training"""

    model_filename = os.path.join(self.output_path, self.inference_filename)
    print("Writing model to '{}'".format(model_filename))

    # Save model whenever we get better validation loss
    model_checkpoint = K.callbacks.ModelCheckpoint(model_filename,
                                                   verbose=1,
                                                   monitor="val_loss",
                                                   save_best_only=True)

    directoryName = "unet_block{}_inter{}_intra{}".format(
        self.blocktime, self.num_threads, self.num_inter_threads)

    # TensorBoard callbacks
    if self.use_upsampling:
        tensorboard_filename = os.path.join(
            self.output_path,
            "keras_tensorboard_upsampling"
            "_batch{}/{}".format(self.batch_size, directoryName))
    else:
        tensorboard_filename = os.path.join(
            self.output_path,
            "keras_tensorboard_transposed"
            "_batch{}/{}".format(self.batch_size, directoryName))

    tensorboard_checkpoint = K.callbacks.TensorBoard(
        log_dir=tensorboard_filename,
        write_graph=True,
        write_images=True)

    foundations.set_tensorboard_logdir(tensorboard_filename)

    return model_filename, [model_checkpoint, tensorboard_checkpoint]

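# A minimal usage sketch for get_callbacks above, written as another method
# of the same trainer class (illustrative only: self.model, self.ds_train,
# self.ds_val, and self.epochs are assumed attributes, not names from the
# original snippet).
def fit_model(self):
    model_filename, callbacks = self.get_callbacks()
    self.model.fit(self.ds_train,
                   validation_data=self.ds_val,
                   epochs=self.epochs,
                   callbacks=callbacks)
    return model_filename
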
def __init__(self, train_dl, val_dl, test_dl, model: torch.nn.Module,
             optimizer, scheduler, criterion, params):
    self.train_dl = train_dl
    self.val_dl = val_dl
    self.test_dl = test_dl
    self.visual_iter = iter(val_dl)
    self.unnorm = Unnormalize(training_mean, training_std)
    self.model = model
    self.optimizer = optimizer
    self.num_epochs = params["num_epochs"]
    self.lr = params["max_lr"]
    self.scheduler = scheduler
    self.criterion = criterion

    os.makedirs('checkpoints', exist_ok=True)
    os.makedirs('tensorboard', exist_ok=True)
    if settings.USE_FOUNDATIONS:
        foundations.set_tensorboard_logdir('tensorboard')
    self.writer = SummaryWriter("tensorboard")

    self.meter_train = Meter(self.writer, 'train', 0)
    self.meter_val = Meter(self.writer, 'val', 0)
    self.current_epoch = 0
    self.best_metric = 1e9
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.phase = 'train'
    self.train()

def __init__(self, train_dl, val_dl, test_dl, model: torch.nn.Module,
             optimizer, scheduler, criterion, params):
    self.train_dl = train_dl
    self.val_dl = val_dl
    self.test_dl = test_dl
    self.visual_iter = iter(val_dl)
    self.unnorm = Unnormalize(model.input_mean, model.input_std)
    # self.model = torch.nn.DataParallel(convert_model(model)).cuda()
    # DataParallel caused serious bugs here, possibly due to BatchNorm and apex
    self.model = model
    self.optimizer = optimizer
    self.num_epochs = params["num_epochs"]
    self.lr = params["max_lr"]
    self.clip_gradient = params["clip_gradient"]
    self.scheduler = scheduler
    self.criterion = criterion
    self.batch_repeat = params["batch_repeat"]

    os.makedirs('checkpoints', exist_ok=True)
    os.makedirs('tensorboard', exist_ok=True)
    if settings.USE_FOUNDATIONS:
        foundations.set_tensorboard_logdir('tensorboard')
    self.writer = SummaryWriter("tensorboard")

    self.meter_train = Classification_Meter(self.writer, 'train', 0)
    self.meter_val = Classification_Meter(self.writer, 'val', 0)
    self.current_epoch = 0
    self.best_metric = 1e9
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.phase = 'train'
    # stdlib random.randint needs integer bounds, so cast 2e9 explicitly
    self.seeds = [np.random.randint(0, int(2e9)), random.randint(0, int(2e9))]
    # initialize history_best before training starts, not after
    self.history_best = {}
    self.train()

def train(self, xtrain, ytrain, xval, yval):
    callbacks = []

    tb = TensorBoard(log_dir='tflogs', write_graph=True, write_grads=False)
    callbacks.append(tb)
    try:
        foundations.set_tensorboard_logdir('tflogs')
    except Exception:
        print("foundations command not found")

    # tb was appended above; only the remaining callbacks are added here
    es = EarlyStopping(monitor='val_loss', mode='min', patience=5,
                       min_delta=0.0001, verbose=1)
    callbacks.append(es)

    rp = ReduceLROnPlateau(monitor='val_loss', factor=0.6, patience=2, verbose=1)
    callbacks.append(rp)

    f1_callback = f1_score_callback(xval, yval,
                                    model_save_filename=self.model_save_filename)
    callbacks.append(f1_callback)

    class_weights = {1: 5, 0: 1}
    train_generator = DataGenerator(xtrain, ytrain)
    validation_generator = DataGenerator(xval, yval)

    self.model.fit_generator(train_generator,
                             steps_per_epoch=len(train_generator),
                             epochs=model_params['epochs'],
                             validation_data=validation_generator,
                             callbacks=callbacks,
                             shuffle=False,
                             use_multiprocessing=True,
                             verbose=1,
                             class_weight=class_weights)

    # Reload the best checkpoint written by the F1 callback
    self.model = load_model(self.model_save_filename,
                            custom_objects={'customPooling': customPooling})
    try:
        foundations.save_artifact(self.model_save_filename, key='trained_model.h5')
    except Exception:
        print("foundations command not found")

print("____________________________________________") print("_________________DATA_______________________") print("____________________________________________") import DessaCallback as dc import PyArrowDataExtraction as de tensorboard_callback = keras.callbacks.TensorBoard( log_dir=logdir) #tensorboard csv_callback = keras.callbacks.CSVLogger("experiment_training.log", separator=",", append=False) #csvlogger csv_callback_test = keras.callbacks.CSVLogger("experiment_testing.log", separator=",", append=False) #csvloggertesting foundations.set_tensorboard_logdir(logdir) #foundations pds = pq.ParquetDataset(parquet_files) pds.split_row_groups = True table = pds.read() print(str(table.num_rows)) xy = de.getXandYFromPyArrow(table) pds2 = pq.ParquetDataset(parquet_files2) pds2.split_row_groups = True table2 = pds2.read() print(str(table2.num_rows)) if not table2.num_rows > 0: table2 = table xy2 = de.getXandYFromPyArrow(table2)
from foundations import load_parameters, log_params

print("using atlas framework")
params = load_parameters()
seed_everything(params['seed'])
log_params(params)
params = parse_params(params)
print(params)

model = CIFAR_Module(params).cuda()
lr_logger = LearningRateLogger()
logger = TensorBoardLogger("../logs", name=params["backbone"])

if USE_FOUNDATIONS:
    from foundations import set_tensorboard_logdir
    set_tensorboard_logdir(f'../logs/{params["backbone"]}')

checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor='acc',
                                      prefix=str(params["seed"]))
t_params = get_trainer_params(params)
trainer = Trainer(callbacks=[lr_logger], logger=logger,
                  checkpoint_callback=checkpoint_callback, **t_params)
trainer.fit(model)

if USE_FOUNDATIONS and checkpoint_callback.best_model_path != "":
    from foundations import log_metric, save_artifact
    save_artifact(checkpoint_callback.best_model_path, key='best_model_checkpoint')
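    # The excerpt imports log_metric but ends before calling it; a plausible,
    # purely illustrative continuation, assuming the installed Lightning
    # version exposes ModelCheckpoint.best_model_score (the monitored value
    # of the best saved checkpoint):
    if checkpoint_callback.best_model_score is not None:
        log_metric('best_acc', float(checkpoint_callback.best_model_score))
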
def train(train_dl, val_dl, test_dl, val_dl_iter, model, optimizer, scheduler,
          criterion, params, train_sampler, val_sampler, rank):
    n_epochs = params['n_epochs']
    max_lr = params['max_lr']
    val_rate = params['val_rate']
    batch_repeat = params['batch_repeat']
    history_best = {}
    best_metric = 0

    if rank == 0:
        os.makedirs('checkpoints', exist_ok=True)
        os.makedirs('tensorboard', exist_ok=True)
        if settings.USE_FOUNDATIONS:
            foundations.set_tensorboard_logdir('tensorboard')
        writer = SummaryWriter("tensorboard")
    else:
        writer = None

    for epoch in range(n_epochs):
        train_records = DistributedClassificationMeter(writer=writer, phase="train",
                                                       epoch=epoch, workers=params["gpus"],
                                                       criterion=criterion)
        if train_sampler:
            train_sampler.set_epoch(epoch)
        train_one_epoch(epoch, model, train_dl, max_lr, optimizer, criterion,
                        scheduler, train_records, batch_repeat, rank, writer, params)

        if epoch % val_rate == 0:
            val_records = DistributedClassificationMeter(writer=writer, phase="validation",
                                                         epoch=epoch, workers=params["gpus"],
                                                         criterion=criterion)
            if val_sampler:
                val_sampler.set_epoch(epoch)
            validate(model, val_dl, criterion, val_records, rank)

            # If you switch to a loss-style metric, remember to flip the >=
            # comparison below -- and to change the initial best_metric too!
            info = val_records.log_metric(write_scalar=False)
            selection_metric = info["acc"]

            if selection_metric >= best_metric and rank == 0:
                # Print before updating best_metric so the message really
                # shows the previous best
                print(f'>>> Saving best model metric={selection_metric:.4f} '
                      f'compared to previous best {best_metric:.4f}')
                best_metric = selection_metric
                checkpoint = {
                    'model': model.module.state_dict(),
                    'params': params
                }
                history_best = {"train_" + key: value
                                for key, value in train_records.get_metric().items()}
                for key, value in val_records.get_metric().items():
                    history_best["val_" + key] = value
                torch.save(checkpoint, 'checkpoints/best_model.pth')
                if settings.USE_FOUNDATIONS:
                    foundations.save_artifact('checkpoints/best_model.pth',
                                              key='best_model_checkpoint')

    # Log metrics to GUI
    if rank == 0:
        for metric, value in history_best.items():
            if settings.USE_FOUNDATIONS:
                foundations.log_metric(metric, float(value))
            else:
                print(metric, float(value))

import foundations

foundations.set_tensorboard_logdir('tensorboard_files/')
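# A minimal end-to-end sketch of the pattern shared by the examples above:
# register the log directory with Foundations/Atlas, then point a TensorBoard
# SummaryWriter at the same path. The scalar name and values are illustrative
# placeholders, not taken from any of the snippets.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('tensorboard_files/')
for step in range(10):
    writer.add_scalar('loss', 1.0 / (step + 1), step)  # dummy decreasing loss
writer.close()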