def main(
        model,
        config=None,
        comment="No comment",
        checkpoint=None,
):
    """Train ``model`` with early stopping, then evaluate it on the test set.

    Loads an optional checkpoint, builds dataloaders/metrics/optimizer from
    the module-level ``cfg`` dict, runs the epoch loop with a
    ReduceLROnPlateau scheduler and EarlyStopping callback, and finally
    reloads the best checkpoint and logs a classification report.

    Args:
        model: the torch module to train (moved to GPU if available).
        config: NOTE(review) — this parameter is never read; the body uses a
            module-level ``cfg`` instead. Confirm whether ``config`` should
            be bound to ``cfg`` here.
        comment: free-text description (currently unused in this body).
        checkpoint: optional path to a checkpoint whose ``state_dict`` is
            loaded into ``model`` before training.

    Side effects: writes ``Checkpoint.pt`` under the module-level
    ``log_dir`` (via EarlyStopping), logs to ``logging`` and the
    module-level ``tb_writer``, and prints progress to stdout.
    """
    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        # `checkpoint` is rebound from a path string to the loaded dict.
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint['state_dict'])
        print("...Checkpoint loaded")
    # Checking cuda
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    # Convert to suitable device
    model = model.to(device)
    print("Number of parameters: ",
          sum(p.numel() for p in model.parameters()))
    logging.info("Model created...")
    # using parsed configurations to create a dataset
    # NOTE(review): num_of_class is computed but never used below.
    num_of_class = len(cfg["data"]["label_dict"])
    # Create dataset
    train_loader, valid_loader, test_loader = get_data_loader(cfg)
    logging.info("Dataset and Dataloaders created")
    # create a metric for evaluating (separate trackers for train/val)
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")
    # read settings from json file
    # initlize optimizing methods : lr, scheduler of lr, optimizer
    learning_rate = cfg["optimizer"]["lr"]
    optimizer = get_optimizer(cfg)
    # `get_optimizer` returns the optimizer *class*; instantiate it here.
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    loss_fn = get_loss_fn(cfg)
    criterion = loss_fn()
    ## Learning rate decay
    # NOTE(review): max_lr/t_max are only used by the commented-out
    # CosineAnnealingLR alternative below.
    max_lr = 3e-3  # Maximum LR
    min_lr = cfg["optimizer"]["min_lr"]  # Minimum LR
    t_max = 10  # How many epochs to go from max_lr to min_lr
    save_method = cfg["optimizer"]["lr_scheduler_factor"]
    lr_patiences = cfg["optimizer"]["lr_patience"]
    lr_factor = cfg["optimizer"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode=save_method,
                                  min_lr=min_lr,
                                  patience=lr_patiences,
                                  factor=lr_factor)
    # scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)
    print("\nTraing shape: {} samples".format(len(train_loader.dataset)))
    print("Validation shape: {} samples".format(len(valid_loader.dataset)))
    print("Beginning training...")
    # export the result to log file
    logging.info("--------------------------------")
    logging.info("session name: {}".format(cfg["session"]["sess_name"]))
    # logging.info(model)
    logging.info("CONFIGS:")
    logging.info(cfg)
    # initialize the early_stopping object
    # NOTE(review): log_dir is a module-level global — confirm it is set
    # before this function runs.
    checkpoint_path = os.path.join(log_dir, "Checkpoint.pt")
    save_mode = cfg["train"]["mode"]
    early_patience = cfg["train"]["early_patience"]
    early_stopping = callbacks.EarlyStopping(patience=early_patience,
                                             mode=save_mode,
                                             path=checkpoint_path)
    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    t0 = time.time()
    for epoch in range(num_epoch):
        t1 = time.time()
        train_loss, train_acc, val_loss, val_acc, train_result, val_result = trainer.train_one_epoch(
            epoch,
            num_epoch,
            model,
            device,
            train_loader,
            valid_loader,
            criterion,
            optimizer,
            train_metrics,
            val_metrics,
        )
        # Snapshot everything EarlyStopping needs to restore the best epoch.
        train_checkpoint = {
            'epoch': epoch + 1,
            'valid_loss': val_loss,
            'model': model,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        scheduler.step(val_loss)
        ## lr scheduling
        logging.info(
            "\n------Epoch %d / %d, Training time: %.4f seconds------" %
            (epoch + 1, num_epoch, (time.time() - t1)))
        logging.info("Training loss: {} - Other training metrics: {}".format(
            train_loss, train_result))
        logging.info(
            "Validation loss: {} - Other validation metrics: {}".format(
                val_loss, val_result))
        ## tensorboard
        tb_writer.add_scalar("Training Loss", train_loss, epoch + 1)
        tb_writer.add_scalar("Valid Loss", val_loss, epoch + 1)
        tb_writer.add_scalar("Training Accuracy",
                             train_result["accuracy_score"], epoch + 1)
        tb_writer.add_scalar("Valid Accuracy", val_result["accuracy_score"],
                             epoch + 1)
        # tb_writer.add_scalar("training f1_score", train_result["f1_score"], epoch + 1)
        # tb_writer.add_scalar("valid f1_score", val_result["f1_score"], epoch + 1)
        # Save model: track val_loss when mode is "min", otherwise val_acc.
        if save_mode == "min":
            early_stopping(val_loss, train_checkpoint)
        else:
            early_stopping(val_acc, train_checkpoint)
        if early_stopping.early_stop:
            logging.info("Early Stopping!!!")
            break
    # testing on test set
    # load the test model and making inference
    print("\nInference on the testing set")
    checkpoint = torch.load(checkpoint_path)
    test_model = checkpoint['model']
    test_model.load_state_dict(checkpoint['state_dict'])
    test_model = test_model.to(device)
    # logging report
    report = tester.test_result(test_model, test_loader, device, cfg)
    logging.info("\nClassification Report: \n {}".format(report))
    logging.info('Completed in %.3f seconds.' % (time.time() - t0))
    print("Classification Report: \n{}".format(report))
    print('Completed in %.3f seconds.' % (time.time() - t0))
    print(
        'Start Tensorboard with tensorboard --logdir {}, view at http://localhost:6006/'
        .format(log_dir))
def main(model,
         dataset,
         validation_flag,
         comment="No comment",
         checkpoint=None,
         num_of_class=2):
    """Train ``model`` on CSV-defined data, keep the best-accuracy epoch, then test.

    Reads train/validation CSVs via the module-level ``cfg``, wraps them in
    ``dataset`` instances, runs the epoch loop with ReduceLROnPlateau,
    saves a full checkpoint whenever validation accuracy improves, and
    finally reloads that checkpoint for test-set inference.

    Args:
        model: torch module to train.
        dataset: dataset class called as ``dataset(df, data_path, transform)``.
        validation_flag: 0 → read validation set from its own CSV;
            otherwise split it off the training CSV.
        comment: free-text description (unused in this body).
        checkpoint: optional path; its ``state_dict`` is loaded into ``model``.
        num_of_class: unused in this body — kept for signature compatibility.

    Side effects: writes ``<log_dir>/Checkpoint.pt`` (module-level
    ``log_dir``), logs to ``logging`` and module-level ``tb_writer``.
    """
    # Checking cuda
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint['state_dict'])
        print("...Checkpoint loaded")
    # Convert to suitable device
    model = model.to(device)
    print("Number of parameters: ",
          sum(p.numel() for p in model.parameters()))
    logging.info("Model created...")
    # using parsed configurations to create a dataset
    data = cfg["data"]["data_csv_name"]
    print("Reading training data from file: ", data)
    training_set = pd.read_csv(data)
    # check if validation flag is on
    if validation_flag == 0:
        # using custom validation set
        print("Creating validation set from file")
        valid = cfg["data"]["validation_csv_name"]
        print("Reading validation data from file: ", valid)
        valid_set = pd.read_csv(valid)
    else:
        # auto divide validation set
        print("Splitting dataset into train and valid....")
        validation_split = float(cfg["data"]["validation_ratio"])
        training_set, valid_set, _, _ = data_split(training_set,
                                                   validation_split)
        print("Done Splitting !!!")
    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])
    # Create dataset (DataFrames are rebound to dataset objects here)
    training_set = dataset(training_set, data_path, transform.train_transform)
    valid_set = dataset(valid_set, data_path, transform.val_transform)
    # End sampler
    train_loader = torch.utils.data.DataLoader(training_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(valid_set,
                                             batch_size=batch_size,
                                             shuffle=False)
    logging.info("Dataset and Dataloaders created")
    # create a metric for evaluating
    # global train_metrics
    # global val_metrics
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")
    # method to optimize the model
    # read settings from json file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]
    # initlize optimizing methods : lr, scheduler of lr, optimizer
    # NOTE(review): getattr with a default never raises, so this except
    # branch appears unreachable; if `loss_function` is missing from nn,
    # `criterion` becomes the error *string* and `criterion()` below will
    # raise TypeError. Confirm the custom_loss fallback is ever exercised.
    try:
        # if the loss function comes from nn package
        criterion = getattr(
            nn, loss_function,
            "The loss {} is not available".format(loss_function))
    except:
        # use custom loss
        criterion = getattr(
            custom_loss,
            loss_function,
            "The loss {} is not available".format(loss_function),
        )
    criterion = criterion()
    optimizer = getattr(torch.optim, optimizers,
                        "The optimizer {} is not available".format(optimizers))
    # NOTE(review): max_lr/t_max only feed the commented-out alternatives.
    max_lr = 3e-3  # Maximum LR
    min_lr = 1e-5  # Minimum LR
    t_max = 10  # How many epochs to go from max_lr to min_lr
    # optimizer = torch.optim.Adam(
    #     params=model.parameters(), lr=max_lr, amsgrad=False)
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode=save_method,
                                  min_lr=min_lr,
                                  patience=patiences,
                                  factor=lr_factor)
    # scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)
    print("\nTraing shape: {} samples".format(len(train_loader.dataset)))
    print("Validation shape: {} samples".format(len(val_loader.dataset)))
    print("Beginning training...")
    # export the result to log file
    logging.info("--------------------------------")
    logging.info("session name: {}".format(cfg["session"]["sess_name"]))
    # logging.info(model)
    logging.info("CONFIGS:")
    logging.info(cfg)
    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    t0 = time.time()
    for epoch in range(0, num_epoch):
        t1 = time.time()
        print(('\n' + '%13s' * 3) % ('Epoch', 'gpu_mem', 'mean_loss'))
        train_loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            epoch,
            num_epoch,
            model,
            device,
            train_loader,
            val_loader,
            criterion,
            optimizer,
            train_metrics,
            val_metrics,
        )
        scheduler.step(val_loss)
        # lr scheduling
        logging.info(
            "\n------Epoch %d / %d, Training time: %.4f seconds------" %
            (epoch + 1, num_epoch, (time.time() - t1)))
        logging.info("Training loss: {} - Other training metrics: {}".format(
            train_loss, train_result))
        logging.info(
            "Validation loss: {} - Other validation metrics: {}".format(
                val_loss, val_result))
        tb_writer.add_scalar("Training Loss", train_loss, epoch + 1)
        tb_writer.add_scalar("Valid Loss", val_loss, epoch + 1)
        tb_writer.add_scalar("Training Accuracy",
                             train_result["accuracy_score"], epoch + 1)
        tb_writer.add_scalar("Valid Accuracy", val_result["accuracy_score"],
                             epoch + 1)
        # tb_writer.add_scalar("training f1_score", train_result["f1_score"], epoch + 1)
        # tb_writer.add_scalar("valid f1_score", val_result["f1_score"], epoch + 1)
        # saving epoch with best validation accuracy
        if best_val_acc < float(val_result["accuracy_score"]):
            logging.info("Validation accuracy= " +
                         str(val_result["accuracy_score"]))
            logging.info("====> Save best at epoch {}".format(epoch + 1))
            best_val_acc = val_result["accuracy_score"]
            checkpoint = {
                'epoch': epoch + 1,
                'valid_loss': val_loss,
                'model': model,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(checkpoint, log_dir + "/Checkpoint.pt")
    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_df = pd.read_csv(test_data)
    # prepare the dataset
    testing_set = dataset(test_df, data_path, transform.val_transform)
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=32,
                                              shuffle=False)
    print("\nInference on the testing set")
    # load the test model and making inference
    checkpoint = torch.load(log_dir + "/Checkpoint.pt")
    test_model = checkpoint['model']
    test_model.load_state_dict(checkpoint['state_dict'])
    test_model = test_model.to(device)
    # logging report
    report = tester.test_result(test_model, test_loader, device, cfg)
    logging.info("\nClassification Report: \n {}".format(report))
    logging.info('%d epochs completed in %.3f seconds.' %
                 (num_epoch, (time.time() - t0)))
    print("Classification Report: \n{}".format(report))
    print('%d epochs completed in %.3f seconds.' %
          (num_epoch, (time.time() - t0)))
    print(
        f'Start Tensorboard with "tensorboard --logdir {log_dir}", view at http://localhost:6006/'
    )
def main():
    """CLI entry point: train a classification model from a JSON config file.

    Parses ``-c/--configure`` (JSON config path) and ``-cp/--checkpoint``,
    builds dataloaders from the configured CSVs, creates the feature
    extractor via ``cls.ClassificationModel`` with a custom 1408→512→6
    classifier head, trains with CosineAnnealingLR, saves the best
    state_dict by validation accuracy, and runs test-set inference.

    Side effects: writes a text log under ``saved/logs/``, the best model
    under ``saved/models/``, and tensorboard scalars via ``logger.make_writer``.
    """
    parser = argparse.ArgumentParser(description='NA')
    parser.add_argument('-c',
                        '--configure',
                        default='cfgs/chexphoto.cfg',
                        help='JSON file')
    parser.add_argument('-cp',
                        '--checkpoint',
                        default=None,
                        help='checkpoint path')
    args = parser.parse_args()
    checkpoint = args.checkpoint
    # read configure file
    with open(args.configure) as f:
        cfg = json.load(f)
    time_str = str(datetime.now().strftime("%Y%m%d-%H%M"))
    tensorboard_writer = logger.make_writer(cfg["session"]["sess_name"],
                                            time_str)
    # using parsed configurations to create a dataset
    # NOTE(review): "validation" here is read from the *test* CSV.
    data = cfg["data"]["data_csv_name"]
    valid = cfg['data']['test_csv_name']
    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])
    validation_split = float(cfg["data"]["validation_ratio"])
    # create dataset
    training_set = pd.read_csv(data, usecols=["file_name", "label"])
    valid_set = pd.read_csv(valid, usecols=["file_name", "label"])
    # train, test, _, _ = dataloader.data_split(training_set, validation_split)
    training_set = dataloader.ClassificationDataset(training_set, data_path,
                                                    transform.train_transform)
    testing_set = dataloader.ClassificationDataset(valid_set, data_path,
                                                   transform.val_transform)
    # create dataloaders
    # global train_loader
    # global val_loader
    #SAmpler to prevent inbalance data label
    # train_loader = torch.utils.data.DataLoader(training_set,sampler=ImbalancedDatasetSampler(training_set, callback_get_label=lambda x, i: tuple(x[i][1].tolist())),batch_size=batch_size,)
    #End sampler
    train_loader = torch.utils.data.DataLoader(
        training_set,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        testing_set,
        batch_size=batch_size,
        shuffle=False,
    )
    # val_loader = torch.utils.data.DataLoader(testing_set,sampler=ImbalancedDatasetSampler(testing_set, callback_get_label=lambda x, i: tuple(x[i][1].tolist())),batch_size=batch_size,)
    logging.info("Dataset and Dataloaders created")
    # create a model
    extractor_name = cfg["train"]["extractor"]
    model = cls.ClassificationModel(model_name=extractor_name).create_model()
    #load checkpoint to continue training
    if checkpoint is not None:
        print('...Load checkpoint from {}'.format(checkpoint))
        checkpoint = torch.load(checkpoint)
        # NOTE(review): here the checkpoint is a raw state_dict (no
        # 'state_dict' key), unlike the other mains in this file — confirm.
        model.load_state_dict(checkpoint)
        print('...Checkpoint loaded')
    # Custom head: 1408-dim features → 512 → 6 classes. The 1408 input
    # width must match the extractor's feature size — TODO confirm.
    classifier = nn.Sequential(nn.Linear(1408, 512, bias=True),
                               nn.ReLU(inplace=True),
                               nn.Linear(512, 6, bias=True))
    # create classfier
    # replace the last linear layer with your custom classifier
    # model._avg_pooling = SPPLayer([1,2,4])
    model._fc = classifier
    # model.last_linear = self.cls
    # select with layers to unfreeze
    params = list(model.parameters())
    len_param = len(params)
    # for index,param in enumerate(model.parameters()):
    #     if index == (len_param -1):
    #         param.requires_grad = True
    #     else:
    #         param.requires_grad = False
    # for param in model.parameters():
    #     print(param.requires_grad)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    # convert to suitable device
    # global model
    model = model.to(device)
    logging.info("Model created...")
    # create a metric for evaluating
    # global train_metrics
    # global val_metrics
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")
    # method to optimize the model
    # read settings from json file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]
    # initlize optimizing methods : lr, scheduler of lr, optimizer
    # NOTE(review): getattr with a default never raises — the except branch
    # appears unreachable, and a missing loss name yields a string that
    # fails at `criterion()` with TypeError.
    try:
        # if the loss function comes from nn package
        criterion = getattr(
            nn, loss_function,
            "The loss {} is not available".format(loss_function))
    except:
        # use custom loss
        criterion = getattr(
            custom_loss,
            loss_function,
            "The loss {} is not available".format(loss_function),
        )
    criterion = criterion()
    optimizer = getattr(torch.optim, optimizers,
                        "The optimizer {} is not available".format(optimizers))
    max_lr = 3e-3  # Maximum LR
    min_lr = 1e-5  # Minimum LR
    t_max = 10  # How many epochs to go from max_lr to min_lr
    # optimizer = torch.optim.Adam(
    #     params=model.parameters(), lr=max_lr, amsgrad=False)
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    # scheduler = ReduceLROnPlateau(
    #     optimizer, save_method, patience=patiences, factor=lr_factor
    # )
    scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)
    # before training, let's create a file for logging model result
    log_file = logger.make_file(cfg["session"]["sess_name"], time_str)
    logger.log_initilize(log_file)
    print("Beginning training...")
    # export the result to log file
    # NOTE(review): `f` is opened in append mode and never closed/flushed.
    f = open("saved/logs/traning_{}.txt".format(cfg["session"]["sess_name"]),
             "a")
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")
    # logging the configs:
    # logging.info(f.read())
    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    for i in range(0, num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
        )
        # lr scheduling
        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
            format(i + 1, num_epoch, loss))
        print("Epoch {} / {} \n Training acc: {} - Other training metrics: ".
              format(i + 1, num_epoch, train_result["accuracy_score"]))
        print("Epoch {} / {} \n Training loss: {} - Other training metrics: ".
              format(i + 1, num_epoch, loss))
        f.write(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
            format(i + 1, num_epoch, loss))
        f.write(
            "Epoch {} / {} \n Training acc: {} - Other training metrics: ".
            format(i + 1, num_epoch, train_result["accuracy_score"]))
        tensorboard_writer.add_scalar("training accuracy",
                                      train_result["accuracy_score"], i + 1)
        tensorboard_writer.add_scalar("training f1_score",
                                      train_result["f1_score"], i + 1)
        tensorboard_writer.add_scalar("training metrics", loss, i + 1)
        logging.info(train_result)
        logging.info(
            " \n Validation loss : {} - Other validation metrics:".format(
                val_loss))
        print(
            "Epoch {} / {} \n valid acc: {} - Other training metrics: ".format(
                i + 1, num_epoch, val_result["accuracy_score"]))
        f.write(" \n Validation loss : {} - Other validation metrics:".format(
            val_loss))
        tensorboard_writer.add_scalar("valid accuracy",
                                      val_result["accuracy_score"], i + 1)
        tensorboard_writer.add_scalar("valid f1_score",
                                      val_result["f1_score"], i + 1)
        tensorboard_writer.add_scalar("valid metrics", val_loss, i + 1)
        logging.info(val_result)
        logging.info("\n")
        # saving epoch with best validation accuracy
        if best_val_acc < float(val_result["accuracy_score"]):
            logging.info("Validation accuracy= " +
                         str(val_result["accuracy_score"]) +
                         "===> Save best epoch")
            f.write("Validation accuracy= " +
                    str(val_result["accuracy_score"]) + "===> Save best epoch")
            best_val_acc = val_result["accuracy_score"]
            torch.save(
                model.state_dict(),
                "saved/models/" + time_str + "-" +
                cfg["train"]["save_as_name"],
            )
        # NOTE(review): CosineAnnealingLR.step() interprets a positional
        # argument as the epoch number, not a metric — passing val_loss
        # here looks wrong (leftover from the ReduceLROnPlateau variant).
        scheduler.step(val_loss)
        # else:
        #     # logging.info(
        #     #     "Validation accuracy= "+ str(val_result["accuracy_score"])+ "===> No saving"
        #     # )
        #     continue
    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_df = pd.read_csv(test_data, usecols=["file_name", "label"])
    # prepare the dataset
    testing_set = dataloader.ClassificationDataset(test_df, data_path,
                                                   transform.val_transform)
    # make dataloader
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=32,
                                              shuffle=False)
    print("Inference on the testing set")
    # load the test model and making inference
    test_model = cls.ClassificationModel(
        model_name=extractor_name).create_model()
    model_path = os.path.join("saved/models",
                              time_str + "-" + cfg["train"]["save_as_name"])
    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)
    logging.info(tester.test_result(test_model, test_loader, device, cfg))
def main():
    """Train a melanoma-style classifier from ``cfgs/tenes.cfg`` and test it.

    Reads the config JSON, samples 25 000 rows from the training CSV,
    splits off a validation set, trains with FocalLoss + ReduceLROnPlateau,
    saves the best state_dict by validation F1, and runs inference on the
    configured test CSV.

    Side effects: writes a log file (via ``logger.make_file``) and the best
    model under ``saved/models/``.
    """
    # read configure file
    with open("cfgs/tenes.cfg") as f:
        cfg = json.load(f)
    # using parsed configurations to create a dataset
    data = cfg["data"]["data_csv_name"]
    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])
    validation_split = float(cfg["data"]["validation_ratio"])
    # create dataset
    training_set = pd.read_csv(data, usecols=["image_name", "target"])
    # CSV stores bare image ids; the files on disk carry a .jpg extension.
    training_set["image_name"] = training_set["image_name"] + '.jpg'
    training_set = shuffle(training_set)
    # NOTE(review): hard-coded subsample of 25000 rows — raises if the CSV
    # has fewer rows; confirm this cap is intentional.
    training_set = training_set.sample(25000)
    print(training_set['target'].value_counts())
    train, test, _, _ = dataloader.data_split(training_set, validation_split)
    training_set = dataloader.ClassificationDataset(train, data_path,
                                                    transform.train_transform)
    testing_set = dataloader.ClassificationDataset(test, data_path,
                                                   transform.val_transform)
    # create dataloaders
    # global train_loader
    # global val_loader
    train_loader = torch.utils.data.DataLoader(
        training_set,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        testing_set,
        batch_size=batch_size,
        shuffle=False,
    )
    logging.info("Dataset and Dataloaders created")
    # create a model
    extractor_name = cfg["train"]["extractor"]
    model = cls.ClassificationModel(model_name=extractor_name).create_model()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    # convert to suitable device
    # global model
    model = model.to(device)
    logging.info("Model created...")
    # create a metric for evaluating
    # global train_metrics
    # global val_metrics
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")
    # method to optimize the model
    # read settings from json file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]
    # initlize optimizing methods : lr, scheduler of lr, optimizer
    # NOTE(review): the looked-up `criterion` is discarded immediately
    # below in favor of a hard-coded FocalLoss — the whole try/except is
    # dead code here.
    try:
        # if the loss function comes from nn package
        criterion = getattr(
            nn, loss_function,
            "The loss {} is not available".format(loss_function))
    except:
        # use custom loss
        criterion = getattr(
            custom_loss,
            loss_function,
            "The loss {} is not available".format(loss_function),
        )
    criterion = custom_loss.FocalLoss()
    optimizer = getattr(torch.optim, optimizers,
                        "The optimizer {} is not available".format(optimizers))
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer,
                                  save_method,
                                  patience=patiences,
                                  factor=lr_factor)
    # before training, let's create a file for logging model result
    time_str = str(datetime.now().strftime("%Y%m%d-%H%M"))
    log_file = logger.make_file(cfg["session"]["sess_name"], time_str)
    logger.log_initilize(log_file)
    print("Beginning training...")
    # export the result to log file
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info("Training size: " + str(len(train)))
    logging.info("Validation size: " + str(len(test)))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")
    # logging the configs:
    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    # NOTE(review): despite the name, this tracks best validation F1.
    best_val_acc = 0
    for i in range(0, num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
        )
        # lr scheduling
        scheduler.step(val_loss)
        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
            format(i + 1, num_epoch, loss))
        logging.info(train_result)
        logging.info(
            " \n Validation loss : {} - Other validation metrics:".format(
                val_loss))
        logging.info(val_result)
        logging.info("\n")
        # saving epoch with best validation accuracy
        if best_val_acc < float(val_result["f1_score"]):
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         "===> Save best epoch")
            best_val_acc = val_result["f1_score"]
            torch.save(
                model.state_dict(),
                "saved/models/" + time_str + "-" +
                cfg["train"]["save_as_name"],
            )
        else:
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         "===> No saving")
            continue
    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_df = pd.read_csv(test_data, usecols=["image_name", "target"])
    test_df['image_name'] = test_df['image_name'] + '.jpg'
    # prepare the dataset
    # NOTE(review): test images are read from the hard-coded path
    # 'dataset/test/test', not from cfg's data_path.
    testing_set = dataloader.TestDataset(test_df, 'dataset/test/test',
                                         transform.test_transform)
    # make dataloader
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=16,
                                              shuffle=False)
    print("\n Inference on the testing set")
    # load the test model and making inference
    test_model = cls.ClassificationModel(
        model_name=extractor_name).create_model()
    model_path = os.path.join("saved/models",
                              time_str + "-" + cfg["train"]["save_as_name"])
    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)
    logging.info(tester.test_result(test_model, test_loader, device))
def main(collocation,
         model,
         dataset,
         validation_flag,
         current_fold,
         comment="No comment",
         checkpoint=None,
         logger=None,
         num_of_class=2):
    """Run one cross-validation fold: train, pick best-F1 epoch, test, log to Neptune.

    Substitutes ``current_fold`` into the fold-templated CSV paths from the
    module-level ``cfg``, trains a ``cls``-built model with
    WeightedFocalLoss + ReduceLROnPlateau, tracks every metric in Neptune,
    saves the best state_dict by validation F1, then evaluates on the
    fold's test CSV and uploads a test report plus source artifacts.

    Args:
        collocation: collate_fn passed to every DataLoader.
        model: NOTE(review) — this parameter is immediately overwritten by
            a freshly constructed ``cls(...)`` model below; confirm intent.
        dataset: dataset class called as ``dataset(df, path, padding=..., normalize=...)``.
        validation_flag: 1 → read validation CSV; otherwise split from train.
        current_fold: fold id substituted for "fold[0-9]" in CSV paths.
        comment: experiment description, used in the Neptune experiment name.
        checkpoint: optional path to a raw state_dict to resume from.
        logger: unused in this body.
        num_of_class: forwarded to train_one_epoch and adaptive_test_result.
    """
    # read training set (path template has a "foldN" token to substitute)
    data = cfg["data"]["data_csv_name"]
    data = re.sub(r"fold[0-9]", str(current_fold), data)
    print("Reading training data from file: ", data)
    training_set = pd.read_csv(data, delimiter="*", header=None)
    # check if validation flag is on
    if validation_flag == 1:
        # using custom validation set
        print("Creating validation set from file")
        valid = cfg["data"]["validation_csv_name"]
        valid = re.sub(r"fold[0-9]", str(current_fold), valid)
        print("Reading validation data from file: ", valid)
        valid_set = pd.read_csv(valid, delimiter="*", header=None)
    else:
        # auto divide validation set
        validation_split = float(cfg["data"]["validation_ratio"])
        training_set, valid_set = data_split(training_set, validation_split)
    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])
    # create dataset
    training_set = dataset(training_set,
                           data_path,
                           padding=True,
                           normalize=True)
    testing_set = dataset(valid_set, data_path, padding=True, normalize=True)
    # End sampler
    train_loader = torch.utils.data.DataLoader(training_set,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               collate_fn=collocation)
    val_loader = torch.utils.data.DataLoader(testing_set,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             collate_fn=collocation)
    # val_loader = torch.utils.data.DataLoader(testing_set,sampler=ImbalancedDatasetSampler(testing_set, callback_get_label=lambda x, i: tuple(x[i][1].tolist())),batch_size=batch_size,)
    logging.info("Dataset and Dataloaders created")
    # create a model
    # extractor_name = cfg["train"]["extractor"]
    # model = cls(model_name=extractor_name).create_model()
    # model = cls(
    #     num_blocks=6,
    #     in_channels=1,
    #     out_channels=64,
    #     bottleneck_channels=0,
    #     kernel_sizes=8,
    #     num_pred_classes=2
    # )
    model = cls(class_num=2,
                num_of_blocks=9,
                training=True,
                dense_layers=[256, 256])
    # explicitly unfreeze everything
    for param in model.parameters():
        param.requires_grad = True
    # load checkpoint to continue training
    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint)
        print("...Checkpoint loaded")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    # convert to suitable device
    # global model
    model = model.to(device)
    print(sum(p.numel() for p in model.parameters()))
    # pause so the parameter count is visible before the log stream starts
    time.sleep(4)
    logging.info("Model created...")
    # create a metric for evaluating
    # global train_metrics
    # global val_metrics
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")
    # method to optimize the model
    # read settings from json file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]
    # initlize optimizing methods : lr, scheduler of lr, optimizer
    # NOTE(review): the looked-up `criterion` is discarded right below in
    # favor of a hard-coded WeightedFocalLoss — this try/except is dead.
    try:
        # if the loss function comes from nn package
        criterion = getattr(
            nn, loss_function,
            "The loss {} is not available".format(loss_function))
    except:
        # use custom loss
        criterion = getattr(
            custom_loss,
            loss_function,
            "The loss {} is not available".format(loss_function),
        )
    criterion = custom_loss.WeightedFocalLoss(weight=None,
                                              gamma=2,
                                              reduction="sum")
    # criterion = nn.CrossEntropyLoss(reduction='none')
    optimizer = getattr(torch.optim, optimizers,
                        "The optimizer {} is not available".format(optimizers))
    max_lr = 3e-3  # Maximum LR
    min_lr = 1e-5  # Minimum LR
    t_max = 10  # How many epochs to go from max_lr to min_lr
    # NOTE(review): momentum=0.9 assumes an SGD-family optimizer from cfg;
    # Adam etc. would raise TypeError here — confirm.
    optimizer = optimizer(model.parameters(), lr=learning_rate, momentum=0.9)
    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    # scheduler = ReduceLROnPlateau(
    #     optimizer, save_method, patience=patiences, factor=lr_factor
    # )
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode=save_method,
        factor=lr_factor,
        min_lr=0.00001,
        verbose=True,
        patience=patiences,
    )
    # before training, let's create a neptune protocol for tracking experiment
    neptune.init("deepbox/gtopia-ml")
    # NOTE(review): "loss_function" is listed twice in this literal; the
    # duplicate key is silently collapsed by Python.
    PARAMS = {
        "loss_function": cfg["optimizer"]["loss"],
        "optimizers": cfg["optimizer"]["name"],
        "learning_rate": cfg["optimizer"]["lr"],
        "lr_factor": cfg["train"]["reduce_lr_factor"],
        "patiences": cfg["train"]["patience"],
        "loss_function": cfg["optimizer"]["loss"],
        "data_path": cfg["data"]["data_csv_name"],
        "batch_size": batch_size,
    }
    # create neptune experiment
    neptune.create_experiment(
        name=comment + "_" + str(current_fold),
        params=PARAMS,
        tags=[
            str(current_fold), cfg["train"]["model.class"], cfg["data"]["mode"]
        ],
    )
    logging.info("Created experiment tracking protocol")
    print("Beginning training...")
    print("Traing shape: ", len(train_loader.dataset))
    print("Validation shape: ", len(val_loader.dataset))
    time.sleep(3)
    # export the result to log file
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info("session description: {} \n".format(comment))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")
    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    # NOTE(review): despite the name, this tracks best validation F1.
    best_val_acc = 0
    for i in range(0, num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
            num_of_class,
        )
        # neptune logging
        neptune.log_metric("train_loss", loss)
        neptune.log_metric("validation_loss", val_loss)
        for single_metric in train_result.keys():
            neptune.log_metric("train_" + single_metric,
                               train_result[single_metric])
            neptune.log_metric("val_" + single_metric,
                               val_result[single_metric])
        # lr scheduling
        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
            format(i + 1, num_epoch, loss))
        logging.info(train_result)
        logging.info(
            " \n Validation loss : {} - Other validation metrics:".format(
                val_loss))
        logging.info(val_result)
        logging.info("\n")
        # saving epoch with best validation accuracy
        if best_val_acc < float(val_result["f1_score"]):
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         "===> Save best epoch \n")
            best_val_acc = val_result["f1_score"]
            # NOTE(review): `time_str` is not defined in this function —
            # it must come from module scope; confirm it is set.
            torch.save(
                model.state_dict(),
                "saved/models/" + time_str + "-" + str(current_fold) + "-" +
                cfg["train"]["save_as_name"],
            )
        scheduler.step(val_loss)
    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_data = re.sub(r"fold[0-9]", str(current_fold), test_data)
    print("reading testing data from file: ", test_data)
    test_df = pd.read_csv(test_data, delimiter="*", header=None)
    # prepare the dataset (no padding at test time; batch_size=1 below)
    testing_set = dataset(test_df, data_path, padding=False, normalize=True)
    # make dataloader
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=1,
                                              shuffle=False,
                                              collate_fn=collocation)
    print("Inference on the testing set")
    # load the test model and making inference
    test_model = cls(class_num=2,
                     num_of_blocks=9,
                     training=True,
                     dense_layers=[256, 256])
    # test_model = cls(
    #     num_blocks=6,
    #     in_channels=1,
    #     out_channels=64,
    #     bottleneck_channels=0,
    #     kernel_sizes=8,
    #     num_pred_classes=2,
    # )
    model_path = os.path.join(
        "saved/models",
        time_str + "-" + str(current_fold) + "-" +
        cfg["train"]["save_as_name"],
    )
    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)
    # NOTE(review): adaptive_test_result runs twice (once for logging,
    # once for the report file) — doubling inference cost.
    logging.info(
        tester.adaptive_test_result(test_model, test_loader, device, cfg,
                                    num_of_class))
    f = open("test_report.txt", "w")
    f.write("Test results \n : {}".format(
        tester.adaptive_test_result(test_model, test_loader, device, cfg)))
    f.close()
    # send some versions of code
    neptune.log_artifact("test_report.txt")
    neptune.log_artifact("data_loader/dataloader.py")
    neptune.log_artifact("cfgs/tenes.cfg")
    neptune.log_artifact("trainer.py")
    neptune.log_artifact("test.py")
    neptune.log_artifact("run_exp_2.py")
    if (cfg["train"]["model.class"] == "Lecnet"):
        neptune.log_artifact("model/classification.py")
    else:
        neptune.log_artifact("model/benchmark.py")
    # saving torch models
    print("---End of testing phase----")
    neptune.stop()
def main(collocation, model, dataset, validation_flag, current_fold,
         comment="No comment", checkpoint=None, logger=None):
    """Train a classifier for one cross-validation fold, then evaluate it.

    Reads the fold-specific CSV paths from the module-level ``cfg`` dict,
    builds train/validation/test dataloaders, trains for
    ``cfg["train"]["num_epoch"]`` epochs while keeping the weights of the
    epoch with the best validation f1, and finally runs inference on the
    test split with those best weights.

    Args:
        collocation: ``collate_fn`` passed to every DataLoader.
        model: NOTE(review): currently unused — the argument is discarded
            and a fresh ``cls(...)`` instance is built below; confirm intent.
        dataset: Dataset class, called as
            ``dataset(df, data_path, padding=..., normalize=...)``.
        validation_flag: ``1`` to read the validation split from its own
            fold-specific CSV; any other value carves the validation split
            out of the training CSV via ``data_split``.
        current_fold: fold index substituted into the ``fold[0-9]`` part of
            the configured CSV names.
        comment: free-text session description written to the log.
        checkpoint: optional path of a state-dict file to warm-start from.
        logger: NOTE(review): unused — kept for interface compatibility.

    Side effects: saves the best model under ``saved/models/`` and writes
    training/validation/test metrics through ``logging``.
    """
    # ---------------- data ----------------
    # Substitute the current fold into the configured training CSV name.
    data = cfg["data"]["data_csv_name"]
    data = re.sub(r"fold[0-9]", str(current_fold), data)
    print("Reading training data from file: ", data)
    training_set = pd.read_csv(data, delimiter='*', header=None)

    if validation_flag == 1:
        # Use the dedicated validation CSV for this fold.
        print("Creating validation set from file")
        valid = cfg["data"]["validation_csv_name"]
        valid = re.sub(r"fold[0-9]", str(current_fold), valid)
        print("Reading validation data from file: ", valid)
        valid_set = pd.read_csv(valid, delimiter='*', header=None)
    else:
        # Auto-split the validation set off the training data.
        validation_split = float(cfg["data"]["validation_ratio"])
        training_set, valid_set = data_split(training_set, validation_split)

    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])

    training_set = dataset(training_set, data_path, padding=True, normalize=True)
    testing_set = dataset(valid_set, data_path, padding=True, normalize=True)

    train_loader = torch.utils.data.DataLoader(
        training_set, batch_size=batch_size, shuffle=True, collate_fn=collocation
    )
    val_loader = torch.utils.data.DataLoader(
        testing_set, batch_size=batch_size, shuffle=False, collate_fn=collocation
    )
    logging.info("Dataset and Dataloaders created")

    # ---------------- model ----------------
    # NOTE(review): the incoming `model` argument is replaced by a fresh
    # instance here — confirm this is intentional.
    model = cls(class_num=2, num_of_blocks=9, training=True, dense_layers=[256, 256])

    if checkpoint is not None:
        # Warm start from a saved state dict.
        # NOTE(review): loaded as-is here, whereas other call sites load
        # checkpoint['state_dict'] — confirm which layout the files use.
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint)
        print("...Checkpoint loaded")

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    model = model.to(device)
    print(sum(p.numel() for p in model.parameters()))
    time.sleep(4)
    logging.info("Model created...")

    # ---------------- metrics / optimization ----------------
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    learning_rate = cfg["optimizer"]["lr"]
    # NOTE(review): cfg["optimizer"]["loss"] is currently ignored — the loss
    # is hard-coded to WeightedFocalLoss.  (The previous dynamic getattr
    # lookup was dead code: its result was always overwritten by this line.)
    criterion = custom_loss.WeightedFocalLoss(weight=None, gamma=2, reduction='mean')

    # Resolve the optimizer class by name.  An unknown name now raises a
    # clear AttributeError at this line instead of failing later with
    # "'str' object is not callable" (the old getattr default was a string).
    optimizer = getattr(torch.optim, cfg["optimizer"]["name"])
    optimizer = optimizer(model.parameters(), lr=learning_rate)

    # NOTE(review): the scheduler settings in cfg["train"]
    # (lr_scheduler_factor / patience / reduce_lr_factor) are currently
    # ignored in favour of these hard-coded values — confirm intent.
    scheduler = ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, min_lr=0.00001, verbose=True, patience=5
    )

    print("Beginning training...")
    time.sleep(3)

    # Export the session header to the log file.
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info("session description: {} \n".format(comment))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")

    # ---------------- training loop ----------------
    # Build the checkpoint path once so the save path and the later test
    # reload path cannot drift apart.
    model_path = os.path.join(
        "saved/models",
        time_str + "-" + str(current_fold) + "-" + cfg["train"]["save_as_name"],
    )
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    for i in range(num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model, train_loader, val_loader, device, optimizer, criterion,
            train_metrics, val_metrics,
        )
        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(
                i + 1, num_epoch, loss
            )
        )
        logging.info(train_result)
        logging.info(
            " \n Validation loss : {} - Other validation metrics:".format(val_loss)
        )
        logging.info(val_result)
        logging.info("\n")

        # Keep only the weights of the epoch with the best validation f1.
        if best_val_acc < float(val_result["f1_score"]):
            logging.info(
                "Validation f1= " + str(val_result["f1_score"]) + "===> Save best epoch \n"
            )
            best_val_acc = val_result["f1_score"]
            torch.save(model.state_dict(), model_path)
        # ReduceLROnPlateau steps on the validation loss every epoch.
        scheduler.step(val_loss)

    # ---------------- testing ----------------
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_data = re.sub(r"fold[0-9]", str(current_fold), test_data)
    print("reading testing data from file: ", test_data)
    test_df = pd.read_csv(test_data, delimiter='*', header=None)

    testing_set = dataset(test_df, data_path, padding=False, normalize=True)
    test_loader = torch.utils.data.DataLoader(
        testing_set, batch_size=1, shuffle=False, collate_fn=collocation
    )
    print("Inference on the testing set")

    # Reload the best-epoch weights into a fresh model for evaluation.
    test_model = cls(class_num=2, num_of_blocks=9, training=True, dense_layers=[256, 256])
    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)
    logging.info(tester.adaptive_test_result(test_model, test_loader, device, cfg))
    print("---End of testing phase----")