Exemple #1
0
def main(
    model,
    config=None,
    comment="No comment",
    checkpoint=None,
):
    """Train ``model``, track the best checkpoint, then evaluate on the test set.

    All settings are read from the module-level ``cfg`` mapping; ``log_dir``
    and ``tb_writer`` are also expected to exist at module level.

    Args:
        model: Network to train. It is moved to the selected device.
        config: Accepted for interface compatibility; not read here.
        comment: Accepted for interface compatibility; not read here.
        checkpoint: Optional path of a ``torch.load``-able file holding a
            ``'state_dict'`` entry used to initialise ``model``.
    """
    # Resolve the device first so that a checkpoint written on a GPU can still
    # be restored on a CPU-only machine (torch.load needs map_location for it).
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint, map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
        print("...Checkpoint loaded")

    logging.info("Using device: {} ".format(device))

    # Convert to suitable device
    model = model.to(device)
    print("Number of parameters: ", sum(p.numel() for p in model.parameters()))
    logging.info("Model created...")

    # Create dataset
    train_loader, valid_loader, test_loader = get_data_loader(cfg)
    logging.info("Dataset and Dataloaders created")

    # Separate metric trackers for the training and validation passes
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # Optimizer and loss: the helpers return classes/factories that are
    # instantiated here.
    learning_rate = cfg["optimizer"]["lr"]
    optimizer = get_optimizer(cfg)
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    loss_fn = get_loss_fn(cfg)
    criterion = loss_fn()

    # Learning rate decay.
    # NOTE(review): the scheduler *mode* is read from the
    # "lr_scheduler_factor" key - confirm the key name against the config.
    min_lr = cfg["optimizer"]["min_lr"]
    scheduler_mode = cfg["optimizer"]["lr_scheduler_factor"]
    lr_patiences = cfg["optimizer"]["lr_patience"]
    lr_factor = cfg["optimizer"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode=scheduler_mode,
                                  min_lr=min_lr,
                                  patience=lr_patiences,
                                  factor=lr_factor)

    print("\nTraing shape: {} samples".format(len(train_loader.dataset)))
    print("Validation shape: {} samples".format(len(valid_loader.dataset)))
    print("Beginning training...")

    # export the result to log file
    logging.info("--------------------------------")
    logging.info("session name: {}".format(cfg["session"]["sess_name"]))
    logging.info("CONFIGS:")
    logging.info(cfg)

    # Early stopping receives the checkpoint dict every epoch; the file at
    # checkpoint_path is read back after training, so the callback is
    # presumably what writes it - verify against callbacks.EarlyStopping.
    checkpoint_path = os.path.join(log_dir, "Checkpoint.pt")
    save_mode = cfg["train"]["mode"]
    early_patience = cfg["train"]["early_patience"]
    early_stopping = callbacks.EarlyStopping(patience=early_patience,
                                             mode=save_mode,
                                             path=checkpoint_path)

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    t0 = time.time()

    for epoch in range(num_epoch):
        t1 = time.time()
        (train_loss, _train_acc, val_loss, val_acc, train_result,
         val_result) = trainer.train_one_epoch(
             epoch,
             num_epoch,
             model,
             device,
             train_loader,
             valid_loader,
             criterion,
             optimizer,
             train_metrics,
             val_metrics,
         )

        train_checkpoint = {
            'epoch': epoch + 1,
            'valid_loss': val_loss,
            'model': model,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        # lr scheduling is driven by the validation loss
        scheduler.step(val_loss)

        logging.info(
            "\n------Epoch %d / %d, Training time: %.4f seconds------" %
            (epoch + 1, num_epoch, (time.time() - t1)))
        logging.info("Training loss: {} - Other training metrics: {}".format(
            train_loss, train_result))
        logging.info(
            "Validation loss: {} - Other validation metrics: {}".format(
                val_loss, val_result))

        ## tensorboard
        tb_writer.add_scalar("Training Loss", train_loss, epoch + 1)
        tb_writer.add_scalar("Valid Loss", val_loss, epoch + 1)
        tb_writer.add_scalar("Training Accuracy",
                             train_result["accuracy_score"], epoch + 1)
        tb_writer.add_scalar("Valid Accuracy", val_result["accuracy_score"],
                             epoch + 1)

        # Monitor the loss in "min" mode, the accuracy otherwise
        monitored = val_loss if save_mode == "min" else val_acc
        early_stopping(monitored, train_checkpoint)
        if early_stopping.early_stop:
            logging.info("Early Stopping!!!")
            break

    # testing on test set
    # load the test model and making inference
    print("\nInference on the testing set")
    # NOTE(review): the checkpoint stores the whole model object, so loading
    # needs full unpickling; recent PyTorch releases may require
    # weights_only=False here - confirm against the installed version.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    test_model = checkpoint['model']
    test_model.load_state_dict(checkpoint['state_dict'])
    test_model = test_model.to(device)

    # logging report
    report = tester.test_result(test_model, test_loader, device, cfg)
    logging.info("\nClassification Report: \n {}".format(report))
    logging.info('Completed in %.3f seconds.' % (time.time() - t0))

    print("Classification Report: \n{}".format(report))
    print('Completed in %.3f seconds.' % (time.time() - t0))
    print(
        'Start Tensorboard with tensorboard --logdir {}, view at http://localhost:6006/'
        .format(log_dir))
Exemple #2
0
def main(model,
         dataset,
         validation_flag,
         comment="No comment",
         checkpoint=None,
         num_of_class=2):
    """Train ``model`` on CSV-described data, keep the best epoch, then test it.

    Settings come from the module-level ``cfg`` mapping; ``log_dir`` and
    ``tb_writer`` are also expected at module level.

    Args:
        model: Network to train.
        dataset: Callable building a dataset from
            ``(dataframe, data_path, transform)``.
        validation_flag: ``0`` reads the validation CSV named in ``cfg``;
            any other value splits the training frame with ``data_split``.
        comment: Accepted for interface compatibility; not read here.
        checkpoint: Optional path of a ``torch.load``-able file holding a
            ``'state_dict'`` entry used to initialise ``model``.
        num_of_class: Accepted for interface compatibility; not read here.

    Raises:
        ValueError: If the configured loss or optimizer name cannot be found.
    """
    # Checking cuda
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))

    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        # map_location lets a GPU-saved checkpoint load on a CPU-only host
        checkpoint = torch.load(checkpoint, map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
        print("...Checkpoint loaded")

    # Convert to suitable device
    model = model.to(device)
    print("Number of parameters: ", sum(p.numel() for p in model.parameters()))
    logging.info("Model created...")

    # using parsed configurations to create a dataset
    data = cfg["data"]["data_csv_name"]
    print("Reading training data from file: ", data)
    training_set = pd.read_csv(data)

    # check if validation flag is on
    if validation_flag == 0:
        # using custom validation set
        print("Creating validation set from file")
        valid = cfg["data"]["validation_csv_name"]
        print("Reading validation data from file: ", valid)
        valid_set = pd.read_csv(valid)
    else:
        # auto divide validation set
        print("Splitting dataset into train and valid....")
        validation_split = float(cfg["data"]["validation_ratio"])
        training_set, valid_set, _, _ = data_split(training_set,
                                                   validation_split)
        print("Done Splitting !!!")

    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])

    # Create dataset
    training_set = dataset(training_set, data_path, transform.train_transform)
    valid_set = dataset(valid_set, data_path, transform.val_transform)

    train_loader = torch.utils.data.DataLoader(training_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(valid_set,
                                             batch_size=batch_size,
                                             shuffle=False)
    logging.info("Dataset and Dataloaders created")

    # create a metric for evaluating
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # method to optimize the model
    # read settings from json file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # Look the loss up in torch.nn first, then in the project's custom
    # losses. getattr with a default never raises, so the fallback has to be
    # an explicit None check rather than a try/except.
    loss_cls = getattr(nn, loss_function, None)
    if loss_cls is None:
        loss_cls = getattr(custom_loss, loss_function, None)
    if loss_cls is None:
        raise ValueError(
            "The loss {} is not available".format(loss_function))
    criterion = loss_cls()

    optimizer_cls = getattr(torch.optim, optimizers, None)
    if optimizer_cls is None:
        raise ValueError(
            "The optimizer {} is not available".format(optimizers))
    optimizer = optimizer_cls(model.parameters(), lr=learning_rate)

    # Learning rate decay.
    # NOTE(review): the scheduler *mode* is read from the
    # "lr_scheduler_factor" key - confirm the key name against the config.
    min_lr = 1e-5  # Minimum LR
    scheduler_mode = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode=scheduler_mode,
                                  min_lr=min_lr,
                                  patience=patiences,
                                  factor=lr_factor)

    print("\nTraing shape: {} samples".format(len(train_loader.dataset)))
    print("Validation shape: {} samples".format(len(val_loader.dataset)))
    print("Beginning training...")

    # export the result to log file
    logging.info("--------------------------------")
    logging.info("session name: {}".format(cfg["session"]["sess_name"]))
    logging.info("CONFIGS:")
    logging.info(cfg)

    # Single location used both for saving the best epoch and for reloading it
    checkpoint_file = log_dir + "/Checkpoint.pt"

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    # NOTE(review): if no epoch ever scores above 0 no checkpoint is written
    # and the reload below will fail - confirm this cannot happen in practice.
    best_val_acc = 0
    t0 = time.time()
    for epoch in range(num_epoch):
        t1 = time.time()
        print(('\n' + '%13s' * 3) % ('Epoch', 'gpu_mem', 'mean_loss'))
        train_loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            epoch,
            num_epoch,
            model,
            device,
            train_loader,
            val_loader,
            criterion,
            optimizer,
            train_metrics,
            val_metrics,
        )
        # lr scheduling is driven by the validation loss
        scheduler.step(val_loss)

        logging.info(
            "\n------Epoch %d / %d, Training time: %.4f seconds------" %
            (epoch + 1, num_epoch, (time.time() - t1)))
        logging.info("Training loss: {} - Other training metrics: {}".format(
            train_loss, train_result))
        logging.info(
            "Validation loss: {} - Other validation metrics: {}".format(
                val_loss, val_result))

        tb_writer.add_scalar("Training Loss", train_loss, epoch + 1)
        tb_writer.add_scalar("Valid Loss", val_loss, epoch + 1)
        tb_writer.add_scalar("Training Accuracy",
                             train_result["accuracy_score"], epoch + 1)
        tb_writer.add_scalar("Valid Accuracy", val_result["accuracy_score"],
                             epoch + 1)

        # saving epoch with best validation accuracy
        if best_val_acc < float(val_result["accuracy_score"]):
            logging.info("Validation accuracy= " +
                         str(val_result["accuracy_score"]))
            logging.info("====> Save best at epoch {}".format(epoch + 1))
            best_val_acc = val_result["accuracy_score"]
            checkpoint = {
                'epoch': epoch + 1,
                'valid_loss': val_loss,
                'model': model,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(checkpoint, checkpoint_file)

    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_df = pd.read_csv(test_data)

    # prepare the dataset
    testing_set = dataset(test_df, data_path, transform.val_transform)
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=32,
                                              shuffle=False)
    print("\nInference on the testing set")

    # load the test model and making inference
    checkpoint = torch.load(checkpoint_file, map_location=device)
    test_model = checkpoint['model']
    test_model.load_state_dict(checkpoint['state_dict'])
    test_model = test_model.to(device)

    # logging report
    report = tester.test_result(test_model, test_loader, device, cfg)
    logging.info("\nClassification Report: \n {}".format(report))
    logging.info('%d epochs completed in %.3f seconds.' % (num_epoch,
                                                           (time.time() - t0)))

    print("Classification Report: \n{}".format(report))
    print('%d epochs completed in %.3f seconds.' % (num_epoch,
                                                    (time.time() - t0)))
    print(
        f'Start Tensorboard with "tensorboard --logdir {log_dir}", view at http://localhost:6006/'
    )
Exemple #3
0
def main():
    """Command-line entry point: train a classifier, then evaluate it.

    Reads a JSON configuration (``-c/--configure``) and an optional
    state-dict checkpoint (``-cp/--checkpoint``), trains for the configured
    number of epochs, saves the state dict of the epoch with the best
    validation accuracy and finally logs the test-set result.

    Raises:
        ValueError: If the configured loss or optimizer name cannot be found.
    """
    parser = argparse.ArgumentParser(description='NA')
    parser.add_argument('-c',
                        '--configure',
                        default='cfgs/chexphoto.cfg',
                        help='JSON file')
    parser.add_argument('-cp',
                        '--checkpoint',
                        default=None,
                        help='checkpoint path')
    args = parser.parse_args()
    checkpoint = args.checkpoint
    # read configure file
    with open(args.configure) as f:
        cfg = json.load(f)
    time_str = str(datetime.now().strftime("%Y%m%d-%H%M"))
    tensorboard_writer = logger.make_writer(cfg["session"]["sess_name"],
                                            time_str)
    # using parsed configurations to create a dataset
    data = cfg["data"]["data_csv_name"]
    # NOTE(review): the validation frame is read from "test_csv_name", i.e.
    # the same file used for the final test below - confirm this is intended.
    valid = cfg['data']['test_csv_name']
    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])
    # create dataset
    training_set = pd.read_csv(data, usecols=["file_name", "label"])
    valid_set = pd.read_csv(valid, usecols=["file_name", "label"])

    training_set = dataloader.ClassificationDataset(training_set, data_path,
                                                    transform.train_transform)

    testing_set = dataloader.ClassificationDataset(valid_set, data_path,
                                                   transform.val_transform)
    # create dataloaders
    train_loader = torch.utils.data.DataLoader(
        training_set,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        testing_set,
        batch_size=batch_size,
        shuffle=False,
    )

    logging.info("Dataset and Dataloaders created")

    # Resolve the device before touching any checkpoint so that weights saved
    # on a GPU can be restored on a CPU-only machine.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # create a model
    extractor_name = cfg["train"]["extractor"]
    model = cls.ClassificationModel(model_name=extractor_name).create_model()
    # load checkpoint to continue training
    if checkpoint is not None:
        print('...Load checkpoint from {}'.format(checkpoint))
        checkpoint = torch.load(checkpoint, map_location=device)
        model.load_state_dict(checkpoint)
        print('...Checkpoint loaded')

        # replace the last linear layer with a custom classifier head
        classifier = nn.Sequential(nn.Linear(1408, 512, bias=True),
                                   nn.ReLU(inplace=True),
                                   nn.Linear(512, 6, bias=True))
        model._fc = classifier

    logging.info("Using device: {} ".format(device))
    # convert to suitable device
    model = model.to(device)
    logging.info("Model created...")

    # create a metric for evaluating
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # method to optimize the model
    # read settings from json file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # Look the loss up in torch.nn first, then in the project's custom
    # losses. getattr with a default never raises, so the fallback has to be
    # an explicit None check rather than a try/except.
    loss_cls = getattr(nn, loss_function, None)
    if loss_cls is None:
        loss_cls = getattr(custom_loss, loss_function, None)
    if loss_cls is None:
        raise ValueError(
            "The loss {} is not available".format(loss_function))
    criterion = loss_cls()

    optimizer_cls = getattr(torch.optim, optimizers, None)
    if optimizer_cls is None:
        raise ValueError(
            "The optimizer {} is not available".format(optimizers))
    optimizer = optimizer_cls(model.parameters(), lr=learning_rate)

    min_lr = 1e-5  # Minimum LR
    t_max = 10  # How many epochs to go from the initial LR to min_lr
    scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)

    # before training, let's create a file for logging model result
    log_file = logger.make_file(cfg["session"]["sess_name"], time_str)

    logger.log_initilize(log_file)
    print("Beginning training...")
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")

    model_path = os.path.join("saved/models",
                              time_str + "-" + cfg["train"]["save_as_name"])

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    # The plain-text training log is opened in a context manager so the
    # handle is flushed and closed even if an epoch raises.
    with open("saved/logs/traning_{}.txt".format(cfg["session"]["sess_name"]),
              "a") as f:
        for i in range(num_epoch):
            loss, val_loss, train_result, val_result = trainer.train_one_epoch(
                model,
                train_loader,
                val_loader,
                device,
                optimizer,
                criterion,
                train_metrics,
                val_metrics,
            )

            logging.info(
                "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
                format(i + 1, num_epoch, loss))
            print("Epoch {} / {} \n Training acc: {} - Other training metrics: ".
                  format(i + 1, num_epoch, train_result["accuracy_score"]))
            print("Epoch {} / {} \n Training loss: {} - Other training metrics: ".
                  format(i + 1, num_epoch, loss))
            f.write(
                "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
                format(i + 1, num_epoch, loss))
            f.write("Epoch {} / {} \n Training acc: {} - Other training metrics: ".
                    format(i + 1, num_epoch, train_result["accuracy_score"]))
            tensorboard_writer.add_scalar("training accuracy",
                                          train_result["accuracy_score"], i + 1)
            tensorboard_writer.add_scalar("training f1_score",
                                          train_result["f1_score"], i + 1)
            tensorboard_writer.add_scalar("training metrics", loss, i + 1)
            logging.info(train_result)
            logging.info(
                " \n Validation loss : {} - Other validation metrics:".format(
                    val_loss))
            print(
                "Epoch {} / {} \n valid acc: {} - Other training metrics: ".format(
                    i + 1, num_epoch, val_result["accuracy_score"]))
            f.write(" \n Validation loss : {} - Other validation metrics:".format(
                val_loss))
            tensorboard_writer.add_scalar("valid accuracy",
                                          val_result["accuracy_score"], i + 1)
            tensorboard_writer.add_scalar("valid f1_score", val_result["f1_score"],
                                          i + 1)
            tensorboard_writer.add_scalar("valid metrics", val_loss, i + 1)
            logging.info(val_result)
            logging.info("\n")
            # saving epoch with best validation accuracy
            if best_val_acc < float(val_result["accuracy_score"]):
                logging.info("Validation accuracy= " +
                             str(val_result["accuracy_score"]) +
                             "===> Save best epoch")
                f.write("Validation accuracy= " +
                        str(val_result["accuracy_score"]) + "===> Save best epoch")
                best_val_acc = val_result["accuracy_score"]
                torch.save(
                    model.state_dict(),
                    "saved/models/" + time_str + "-" +
                    cfg["train"]["save_as_name"],
                )
            # CosineAnnealingLR follows a fixed schedule: step() takes no
            # metric. Passing the validation loss would be read as an epoch
            # index and corrupt the learning-rate curve.
            scheduler.step()

    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_df = pd.read_csv(test_data, usecols=["file_name", "label"])

    # prepare the dataset
    testing_set = dataloader.ClassificationDataset(test_df, data_path,
                                                   transform.val_transform)

    # make dataloader
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=32,
                                              shuffle=False)
    print("Inference on the testing set")

    # load the test model and making inference
    # NOTE(review): when a checkpoint was supplied, model._fc was replaced
    # above, while this fresh model keeps its default head; the saved state
    # dict may then not match - confirm.
    test_model = cls.ClassificationModel(
        model_name=extractor_name).create_model()
    test_model.load_state_dict(torch.load(model_path, map_location=device))
    test_model = test_model.to(device)
    logging.info(tester.test_result(test_model, test_loader, device, cfg))
def main():
    """Train a classifier with focal loss on a sampled subset, then evaluate it.

    Reads ``cfgs/tenes.cfg``, samples up to 25000 rows of the training CSV,
    splits them into train/validation parts, trains for the configured number
    of epochs while saving the state dict of the epoch with the best
    validation F1 score, and finally logs the test-set result.

    Raises:
        ValueError: If the configured optimizer name cannot be found.
    """
    # read configure file
    with open("cfgs/tenes.cfg") as f:
        cfg = json.load(f)

    # using parsed configurations to create a dataset
    data = cfg["data"]["data_csv_name"]
    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])
    validation_split = float(cfg["data"]["validation_ratio"])
    # create dataset
    training_set = pd.read_csv(data, usecols=["image_name", "target"])
    training_set["image_name"] = training_set["image_name"] + '.jpg'
    training_set = shuffle(training_set)
    # Cap the sample size at the number of available rows: sampling more rows
    # than exist (without replacement) raises in pandas.
    training_set = training_set.sample(min(25000, len(training_set)))

    print(training_set['target'].value_counts())
    train, test, _, _ = dataloader.data_split(training_set, validation_split)

    training_set = dataloader.ClassificationDataset(train, data_path,
                                                    transform.train_transform)

    testing_set = dataloader.ClassificationDataset(test, data_path,
                                                   transform.val_transform)
    # create dataloaders
    train_loader = torch.utils.data.DataLoader(
        training_set,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        testing_set,
        batch_size=batch_size,
        shuffle=False,
    )

    logging.info("Dataset and Dataloaders created")
    # create a model
    extractor_name = cfg["train"]["extractor"]
    model = cls.ClassificationModel(model_name=extractor_name).create_model()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    # convert to suitable device
    model = model.to(device)
    logging.info("Model created...")

    # create a metric for evaluating
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # method to optimize the model
    # read settings from json file
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # The loss is always the project's focal loss; the "loss" entry of the
    # configuration is not consulted by this script.
    criterion = custom_loss.FocalLoss()

    # getattr with a default never raises, so an unknown optimizer has to be
    # detected explicitly instead of ending up calling a string.
    optimizer_cls = getattr(torch.optim, optimizers, None)
    if optimizer_cls is None:
        raise ValueError(
            "The optimizer {} is not available".format(optimizers))
    optimizer = optimizer_cls(model.parameters(), lr=learning_rate)

    # NOTE(review): the scheduler *mode* is read from the
    # "lr_scheduler_factor" key - confirm the key name against the config.
    scheduler_mode = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer,
                                  scheduler_mode,
                                  patience=patiences,
                                  factor=lr_factor)

    # before training, let's create a file for logging model result
    time_str = str(datetime.now().strftime("%Y%m%d-%H%M"))
    log_file = logger.make_file(cfg["session"]["sess_name"], time_str)
    logger.log_initilize(log_file)
    print("Beginning training...")
    # export the result to log file
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info("Training size: " + str(len(train)))
    logging.info("Validation size: " + str(len(test)))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")

    model_path = os.path.join("saved/models",
                              time_str + "-" + cfg["train"]["save_as_name"])

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    # Best validation F1 seen so far.
    # NOTE(review): if no epoch ever scores above 0 nothing is saved and the
    # reload below will fail - confirm this cannot happen in practice.
    best_val_f1 = 0
    for i in range(num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
        )

        # lr scheduling is driven by the validation loss
        scheduler.step(val_loss)
        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
            format(i + 1, num_epoch, loss))
        logging.info(train_result)
        logging.info(
            " \n Validation loss : {} - Other validation metrics:".format(
                val_loss))
        logging.info(val_result)
        logging.info("\n")
        # saving epoch with best validation f1
        if best_val_f1 < float(val_result["f1_score"]):
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         "===> Save best epoch")
            best_val_f1 = val_result["f1_score"]
            torch.save(
                model.state_dict(),
                "saved/models/" + time_str + "-" +
                cfg["train"]["save_as_name"],
            )
        else:
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         "===> No saving")

    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    test_df = pd.read_csv(test_data, usecols=["image_name", "target"])
    test_df['image_name'] = test_df['image_name'] + '.jpg'
    # prepare the dataset
    testing_set = dataloader.TestDataset(test_df, 'dataset/test/test',
                                         transform.test_transform)

    # make dataloader
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=16,
                                              shuffle=False)
    print("\n Inference on the testing set")

    # load the test model and making inference
    test_model = cls.ClassificationModel(
        model_name=extractor_name).create_model()
    # map_location keeps the reload working regardless of the device the
    # weights were saved from
    test_model.load_state_dict(torch.load(model_path, map_location=device))
    test_model = test_model.to(device)
    logging.info(tester.test_result(test_model, test_loader, device))
Exemple #5
0
def main(collocation,
         model,
         dataset,
         validation_flag,
         current_fold,
         comment="No comment",
         checkpoint=None,
         logger=None,
         num_of_class=2):
    """Train, validate and test a classifier for one fold, tracked in neptune.

    The function reads the CSV files named in the module-level ``cfg``,
    builds the data loaders, trains for ``cfg["train"]["num_epoch"]`` epochs
    while keeping the checkpoint with the best validation F1 score, then
    reloads that checkpoint and evaluates it on the test CSV.

    It relies on names defined elsewhere in the file (``cfg``, ``cls``,
    ``time_str``, ``data_split``, ``trainer``, ``tester``, ``metrics``,
    ``custom_loss``, ``neptune``).

    Args:
        collocation: Passed as ``collate_fn`` to every ``DataLoader``.
        model: Overwritten by a freshly built ``cls(...)`` instance before
            it is ever read, so the passed value is effectively ignored.
        dataset: Callable invoked as
            ``dataset(frame, data_path, padding=..., normalize=...)`` to
            build the train, validation and test datasets.
        validation_flag: When equal to ``1`` the validation CSV named in the
            config is read; otherwise the training frame is split with
            ``data_split`` using ``cfg["data"]["validation_ratio"]``.
        current_fold: Substituted for the ``fold<digit>`` pattern in the CSV
            paths; also part of the checkpoint file name and the experiment
            name/tags.
        comment: Prefix of the experiment name; also written to the log.
        checkpoint: Optional path given to ``torch.load``; the result is
            loaded into the model as a state dict.
        logger: Unused.
        num_of_class: Forwarded to ``trainer.train_one_epoch`` and
            ``tester.adaptive_test_result``.

    Raises:
        ValueError: If ``cfg["optimizer"]["name"]`` is not an attribute of
            ``torch.optim``.
    """
    # read training set
    data = cfg["data"]["data_csv_name"]
    data = re.sub(r"fold[0-9]", str(current_fold), data)
    print("Reading training data from file: ", data)
    training_set = pd.read_csv(data, delimiter="*", header=None)

    # check if validation flag is on
    if validation_flag == 1:
        # using custom validation set
        print("Creating validation set from file")
        valid = cfg["data"]["validation_csv_name"]
        valid = re.sub(r"fold[0-9]", str(current_fold), valid)
        print("Reading validation data from file: ", valid)
        valid_set = pd.read_csv(valid, delimiter="*", header=None)
    else:
        # auto divide validation set
        validation_split = float(cfg["data"]["validation_ratio"])
        training_set, valid_set = data_split(training_set, validation_split)

    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])

    # create dataset
    training_set = dataset(training_set,
                           data_path,
                           padding=True,
                           normalize=True)
    testing_set = dataset(valid_set, data_path, padding=True, normalize=True)

    train_loader = torch.utils.data.DataLoader(training_set,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               collate_fn=collocation)
    val_loader = torch.utils.data.DataLoader(testing_set,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             collate_fn=collocation)

    logging.info("Dataset and Dataloaders created")

    # create a model
    # NOTE(review): this discards the `model` argument and builds the network
    # from the module-level `cls` with fixed hyper-parameters -- confirm that
    # callers do not expect their own model to be trained.
    model = cls(class_num=2,
                num_of_blocks=9,
                training=True,
                dense_layers=[256, 256])
    for param in model.parameters():
        param.requires_grad = True

    # load checkpoint to continue training
    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint)
        print("...Checkpoint loaded")

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    # convert to suitable device
    model = model.to(device)
    print(sum(p.numel() for p in model.parameters()))
    time.sleep(4)

    logging.info("Model created...")

    # create a metric for evaluating
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # method to optimize the model
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # The loss is fixed to the weighted focal loss. cfg["optimizer"]["loss"]
    # is only recorded in the experiment parameters below; the former
    # getattr-based lookup was dead code because its result was always
    # overwritten by this assignment.
    criterion = custom_loss.WeightedFocalLoss(weight=None,
                                              gamma=2,
                                              reduction="sum")

    # Fail with a readable message instead of trying to call a string.
    optimizer_cls = getattr(torch.optim, optimizers, None)
    if optimizer_cls is None:
        raise ValueError(
            "The optimizer {} is not available".format(optimizers))
    # NOTE(review): `momentum` is not accepted by every torch.optim class;
    # confirm the configured optimizer supports it.
    optimizer = optimizer_cls(model.parameters(),
                              lr=learning_rate,
                              momentum=0.9)

    min_lr = 1e-5  # Minimum LR the scheduler may decay to
    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode=save_method,
        factor=lr_factor,
        min_lr=min_lr,
        verbose=True,
        patience=patiences,
    )

    # before training, let's create a neptune protocol for tracking experiment
    neptune.init("deepbox/gtopia-ml")

    PARAMS = {
        "loss_function": cfg["optimizer"]["loss"],
        "optimizers": optimizers,
        "learning_rate": learning_rate,
        "lr_factor": lr_factor,
        "patiences": patiences,
        "data_path": cfg["data"]["data_csv_name"],
        "batch_size": batch_size,
    }
    # create neptune experiment
    neptune.create_experiment(
        name=comment + "_" + str(current_fold),
        params=PARAMS,
        tags=[
            str(current_fold), cfg["train"]["model.class"], cfg["data"]["mode"]
        ],
    )

    logging.info("Created experiment tracking protocol")
    print("Beginning training...")
    print("Traing shape: ", len(train_loader.dataset))
    print("Validation shape: ", len(val_loader.dataset))
    time.sleep(3)

    # export the result to log file
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info("session description: {} \n".format(comment))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")

    # Single source of truth for the checkpoint location: written during
    # training, read back for testing.
    model_path = os.path.join(
        "saved/models",
        time_str + "-" + str(current_fold) + "-" +
        cfg["train"]["save_as_name"],
    )

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    # Start below any real score so the first epoch is always saved; with a
    # start value of 0 a run whose F1 never rose above 0 left no checkpoint
    # for the test phase to load.
    best_val_acc = float("-inf")
    for i in range(0, num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
            num_of_class,
        )

        # neptune logging
        neptune.log_metric("train_loss", loss)
        neptune.log_metric("validation_loss", val_loss)
        for single_metric in train_result.keys():
            neptune.log_metric("train_" + single_metric,
                               train_result[single_metric])
            neptune.log_metric("val_" + single_metric,
                               val_result[single_metric])

        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
            format(i + 1, num_epoch, loss))

        logging.info(train_result)
        logging.info(
            " \n Validation loss : {} - Other validation metrics:".format(
                val_loss))

        logging.info(val_result)
        logging.info("\n")
        # saving epoch with best validation F1
        current_f1 = float(val_result["f1_score"])
        if best_val_acc < current_f1:
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         "===> Save best epoch \n")

            best_val_acc = current_f1
            torch.save(model.state_dict(), model_path)
        # lr scheduling
        scheduler.step(val_loss)

    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    test_data = re.sub(r"fold[0-9]", str(current_fold), test_data)
    print("reading testing data from file: ", test_data)
    test_df = pd.read_csv(test_data, delimiter="*", header=None)

    # prepare the dataset
    testing_set = dataset(test_df, data_path, padding=False, normalize=True)

    # make dataloader
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=1,
                                              shuffle=False,
                                              collate_fn=collocation)
    print("Inference on the testing set")

    # load the test model and making inference
    # NOTE(review): the evaluation model is built with training=True, exactly
    # like the training model -- confirm this is intended for inference.
    test_model = cls(class_num=2,
                     num_of_blocks=9,
                     training=True,
                     dense_layers=[256, 256])

    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)

    # Evaluate once and reuse the result so the log and the report file agree
    # (the report used to come from a second run without num_of_class).
    test_report = tester.adaptive_test_result(test_model, test_loader, device,
                                              cfg, num_of_class)
    logging.info(test_report)
    with open("test_report.txt", "w") as f:
        f.write("Test results \n : {}".format(test_report))

    # send some versions of code
    neptune.log_artifact("test_report.txt")
    neptune.log_artifact("data_loader/dataloader.py")
    neptune.log_artifact("cfgs/tenes.cfg")
    neptune.log_artifact("trainer.py")
    neptune.log_artifact("test.py")
    neptune.log_artifact("run_exp_2.py")

    if cfg["train"]["model.class"] == "Lecnet":
        neptune.log_artifact("model/classification.py")
    else:
        neptune.log_artifact("model/benchmark.py")

    # saving torch models
    print("---End of testing phase----")
    neptune.stop()
Exemple #6
0
def main(collocation, model, dataset, validation_flag, current_fold, comment="No comment", checkpoint=None, logger=None):
    """Train, validate and test a classifier for one cross-validation fold.

    The function reads the CSV files named in the module-level ``cfg``,
    builds the data loaders, trains for ``cfg["train"]["num_epoch"]`` epochs
    while keeping the checkpoint with the best validation F1 score, then
    reloads that checkpoint and logs its result on the test CSV.

    It relies on names defined elsewhere in the file (``cfg``, ``cls``,
    ``time_str``, ``data_split``, ``trainer``, ``tester``, ``metrics``,
    ``custom_loss``).

    Args:
        collocation: Passed as ``collate_fn`` to every ``DataLoader``.
        model: Overwritten by a freshly built ``cls(...)`` instance before
            it is ever read, so the passed value is effectively ignored.
        dataset: Callable invoked as
            ``dataset(frame, data_path, padding=..., normalize=...)`` to
            build the train, validation and test datasets.
        validation_flag: When equal to ``1`` the validation CSV named in the
            config is read; otherwise the training frame is split with
            ``data_split`` using ``cfg["data"]["validation_ratio"]``.
        current_fold: Substituted for the ``fold<digit>`` pattern in the CSV
            paths; also part of the checkpoint file name.
        comment: Session description written to the log.
        checkpoint: Optional path given to ``torch.load``; the result is
            loaded into the model as a state dict.
        logger: Unused.

    Raises:
        ValueError: If ``cfg["optimizer"]["name"]`` is not an attribute of
            ``torch.optim``.
    """
    # read training set
    data = cfg["data"]["data_csv_name"]
    data = re.sub(r"fold[0-9]", str(current_fold), data)
    print("Reading training data from file: ", data)
    training_set = pd.read_csv(data, delimiter='*', header=None)

    # check if validation flag is on
    if validation_flag == 1:
        # using custom validation set
        print("Creating validation set from file")
        valid = cfg["data"]["validation_csv_name"]
        valid = re.sub(r"fold[0-9]", str(current_fold), valid)
        print("Reading validation data from file: ", valid)
        valid_set = pd.read_csv(valid, delimiter='*', header=None)
    else:
        # auto divide validation set
        validation_split = float(cfg["data"]["validation_ratio"])
        training_set, valid_set = data_split(training_set, validation_split)

    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])

    # create dataset
    training_set = dataset(
        training_set, data_path, padding=True, normalize=True
    )
    testing_set = dataset(
        valid_set, data_path, padding=True, normalize=True
    )

    # create dataloaders
    train_loader = torch.utils.data.DataLoader(
        training_set, batch_size=batch_size, shuffle=True, collate_fn=collocation
    )
    val_loader = torch.utils.data.DataLoader(
        testing_set, batch_size=batch_size, shuffle=False, collate_fn=collocation
    )

    logging.info("Dataset and Dataloaders created")

    # create a model
    # NOTE(review): this discards the `model` argument and builds the network
    # from the module-level `cls` with fixed hyper-parameters -- confirm that
    # callers do not expect their own model to be trained.
    model = cls(class_num=2, num_of_blocks=9, training=True, dense_layers=[256, 256])

    # load checkpoint to continue training
    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint)
        print("...Checkpoint loaded")
        # The replacement classifier head and parameter list that used to be
        # built here were never attached to the model, so they were removed.

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    # convert to suitable device
    model = model.to(device)
    print(sum(p.numel() for p in model.parameters()))
    time.sleep(4)

    logging.info("Model created...")

    # create a metric for evaluating
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # method to optimize the model
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # The loss is fixed to the weighted focal loss. The former getattr-based
    # lookup of cfg["optimizer"]["loss"] was dead code because its result was
    # always overwritten by this assignment.
    criterion = custom_loss.WeightedFocalLoss(weight=None, gamma=2, reduction='mean')

    # Fail with a readable message instead of trying to call a string.
    optimizer_cls = getattr(torch.optim, optimizers, None)
    if optimizer_cls is None:
        raise ValueError("The optimizer {} is not available".format(optimizers))
    optimizer = optimizer_cls(model.parameters(), lr=learning_rate)

    # NOTE(review): the scheduler settings are hard-coded; the cfg["train"]
    # entries (lr_scheduler_factor, patience, reduce_lr_factor) are not
    # consulted here -- confirm that is intended.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, min_lr=0.00001, verbose=True, patience=5)

    print("Beginning training...")
    time.sleep(3)
    # export the result to log file
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info("session description: {} \n".format(comment))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")

    # Single source of truth for the checkpoint location: written during
    # training, read back for testing.
    model_path = os.path.join(
        "saved/models", time_str + "-" + str(current_fold) + "-" + cfg["train"]["save_as_name"]
    )

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    # Start below any real score so the first epoch is always saved; with a
    # start value of 0 a run whose F1 never rose above 0 left no checkpoint
    # for the test phase to load.
    best_val_acc = float("-inf")
    for i in range(0, num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
        )

        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(
                i + 1, num_epoch, loss
            )
        )
        logging.info(train_result)
        logging.info(
            " \n Validation loss : {} - Other validation metrics:".format(val_loss)
        )
        logging.info(val_result)
        logging.info("\n")

        # saving epoch with best validation F1
        current_f1 = float(val_result["f1_score"])
        if best_val_acc < current_f1:
            logging.info(
                "Validation f1= "
                + str(val_result["f1_score"])
                + "===> Save best epoch \n"
            )
            best_val_acc = current_f1
            torch.save(model.state_dict(), model_path)
        # lr scheduling
        scheduler.step(val_loss)

    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    test_data = re.sub(r"fold[0-9]", str(current_fold), test_data)
    print("reading testing data from file: ", test_data)
    test_df = pd.read_csv(test_data, delimiter='*', header=None)

    # prepare the dataset
    testing_set = dataset(
        test_df, data_path, padding=False, normalize=True
    )

    # make dataloader
    test_loader = torch.utils.data.DataLoader(testing_set, batch_size=1, shuffle=False, collate_fn=collocation)
    print("Inference on the testing set")

    # load the test model and making inference
    # NOTE(review): the evaluation model is built with training=True, exactly
    # like the training model -- confirm this is intended for inference.
    test_model = cls(class_num=2, num_of_blocks=9, training=True, dense_layers=[256, 256])

    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)
    logging.info(tester.adaptive_test_result(test_model, test_loader, device, cfg))

    # saving torch models
    print("---End of testing phase----")