Example #1
def test_fun(dataset, opts):
    test_arg_list = []
    for test_rot in [None]:
        for test_flip in [None]:
            test_arg_list.append((test_rot, test_flip))
    for test_arg in test_arg_list:
        #for test_arg in [(None, None)]:
        opts.test_arg = test_arg
        # dataset and iterator
        dataset_val = dataset.get_dataset(opts)
        iterator = dataset_val.make_one_shot_iterator()
        volume, joint_coord, shape, data_num = iterator.get_next()
        inputs = tf.placeholder(
            tf.float32,
            shape=[None, None, None, None, 1 + opts.temporal * opts.nJoint])
        dn_p = 0

        # network
        outputs, _ = get_network(inputs, opts)

        # save and load
        saver = tf.train.Saver(var_list=tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope=opts.network))

        start = time.time()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            print('restore trained model')
            saver.restore(
                sess,
                os.path.join(opts.output_path, opts.name,
                             'model%d.ckpt' % opts.epochs))
            print('test start')
            res = []
            while True:
                try:
                    v, joint, s, dn = sess.run(
                        [volume, joint_coord, shape, data_num])
                    if opts.temporal:
                        if np.squeeze(dn) != dn_p:
                            output_val = first_heatmap_p(joint, s, opts)
                            dn_p = np.squeeze(dn)
                        else:
                            output_val = get_heatmap_p(output_val, opts)
                        output_val = sess.run(outputs[-1],
                                              feed_dict={
                                                  inputs:
                                                  np.concatenate(
                                                      [v, output_val], axis=-1)
                                              })
                    else:
                        output_val = sess.run(outputs[-1],
                                              feed_dict={inputs: v})
                    res.append(test_result(output_val, joint, s, dn, opts))
                except tf.errors.OutOfRangeError:
                    break
            save_test_result(res, opts)
            reset_dict()
        tf.reset_default_graph()
        print("test end, elapsed time: ", time.time() - start)
Example #2
def main(
    cfg,
    model,
    log_dir,
    checkpoint=None,
):
    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint['state_dict'])
        print("...Checkpoint loaded")

    # Checking cuda
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))

    # Convert to suitable device
    # logging.info(model)
    model = model.to(device)
    logging.info("Number parameters of model: {:,}".format(
        sum(p.numel() for p in model.parameters())))

    # using parsed configurations to create a dataset
    # Create dataset
    num_of_class = len(cfg["data"]["label_dict"])
    train_loader, valid_loader, test_loader = get_data_loader(cfg)
    print("Dataset and Dataloaders created")

    # create a metric for evaluating
    metric_names = cfg["train"]["metrics"]
    train_metrics = metrics_loader.Metrics(metric_names)
    val_metrics = metrics_loader.Metrics(metric_names)
    print("Metrics implemented successfully")

    ## read settings from json file
    ## initialize optimizer from config
    optimizer_module, optimizer_params = get_optimizer(cfg)
    optimizer = optimizer_module(model.parameters(), **optimizer_params)
    ## initialize scheduler from config
    scheduler_module, scheduler_params = get_lr_scheduler(cfg)
    scheduler = scheduler_module(optimizer, **scheduler_params)
    # scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)
    loss_fn = get_loss_fn(cfg)
    criterion = loss_fn()

    print("\nTraing shape: {} samples".format(len(train_loader.dataset)))
    print("Validation shape: {} samples".format(len(valid_loader.dataset)))
    print("Beginning training...")

    # initialize the early_stopping object
    save_mode = cfg["train"]["mode"]
    early_patience = cfg["train"]["patience"]
    checkpoint_path = os.path.join(log_dir, "Checkpoint.ckpt")
    early_stopping = callbacks.EarlyStopping(patience=early_patience,
                                             mode=save_mode,
                                             path=checkpoint_path)

    # training models
    logging.info("--" * 50)
    num_epochs = int(cfg["train"]["num_epochs"])
    t0 = time.time()
    for epoch in range(num_epochs):
        t1 = time.time()
        if epoch == 3:
            # unfreeze all parameters after the first three epochs
            print('\tUnfreezing all model parameters')
            for param in model.parameters():
                param.requires_grad = True

        print(('\n' + '%13s' * 3) % ('Epoch', 'gpu_mem', 'mean_loss'))
        train_loss, train_acc, val_loss, val_acc, train_result, val_result = trainer.train_one_epoch(
            epoch,
            num_epochs,
            model,
            device,
            train_loader,
            valid_loader,
            criterion,
            optimizer,
            train_metrics,
            val_metrics,
        )
        ## lr scheduling
        scheduler.step(val_loss)

        ## log to file
        logging.info(
            "\n------Epoch {} / {}, Training time: {:.4f} seconds------".
            format(epoch, num_epochs, (time.time() - t1)))
        logging.info(
            f"Training loss: {train_loss} \n Training metrics: {train_result}")
        logging.info(
            f"Validation loss: {val_loss} \n Validation metrics: {val_result}")

        ## tensorboard writer
        tb_writer.add_scalar("Training Loss", train_loss, epoch)
        tb_writer.add_scalar("Valid Loss", val_loss, epoch)
        for metric_name in metric_names:
            tb_writer.add_scalar(f"Training {metric_name}",
                                 train_result[metric_name], epoch)
            tb_writer.add_scalar(f"Validation {metric_name}",
                                 val_result[metric_name], epoch)

        train_checkpoint = {
            'epoch': epoch,
            'valid_loss': val_loss,
            'model': model,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        # Save model
        if save_mode == "min":
            early_stopping(val_loss, train_checkpoint)
        else:
            early_stopping(val_acc, train_checkpoint)
        if early_stopping.early_stop:
            logging.info("Early Stopping!!!")
            break

    # testing on test set
    # load the test model and making inference
    print("\n==============Inference on the testing set==============")
    best_checkpoint = torch.load(checkpoint_path)
    test_model = best_checkpoint['model']
    test_model.load_state_dict(best_checkpoint['state_dict'])
    test_model = test_model.to(device)
    test_model.eval()

    # logging report
    report = tester.test_result(test_model, test_loader, device, cfg)
    logging.info(f"\nClassification Report: \n {report}")
    logging.info("Completed in {:.3f} seconds. ".format(time.time() - t0))

    print(f"Classification Report: \n {report}")
    print("Completed in {:.3f} seconds.".format(time.time() - t0))
    print(f"-------- Checkpoints and logs are saved in ``{log_dir}`` --------")

    return checkpoint_path
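
A minimal driver for the function above might look like the sketch below; the config path, the resnet18 backbone, and the log directory are illustrative placeholders, not part of the example.

import json
import torchvision

# hypothetical setup: parse the JSON config and build a backbone
with open("cfgs/example.json") as f:  # placeholder path
    cfg = json.load(f)
model = torchvision.models.resnet18(num_classes=len(cfg["data"]["label_dict"]))

# train and keep the path of the best checkpoint returned by main()
ckpt_path = main(cfg, model, log_dir="runs/exp1", checkpoint=None)
print("Best checkpoint saved to", ckpt_path)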
Example #3
def main(
    model,
    config=None,
    comment="No comment",
    checkpoint=None,
):
    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint['state_dict'])
        print("...Checkpoint loaded")

    # Checking cuda
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))

    # Convert to suitable device
    model = model.to(device)
    print("Number of parameters: ", sum(p.numel() for p in model.parameters()))
    logging.info("Model created...")

    # using parsed configurations to create a dataset
    num_of_class = len(cfg["data"]["label_dict"])

    # Create dataset
    train_loader, valid_loader, test_loader = get_data_loader(cfg)
    logging.info("Dataset and Dataloaders created")

    # create a metric for evaluating
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # read settings from json file
    # initialize optimization settings: lr, lr scheduler, optimizer
    learning_rate = cfg["optimizer"]["lr"]
    optimizer = get_optimizer(cfg)
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    loss_fn = get_loss_fn(cfg)
    criterion = loss_fn()

    ## Learning rate decay
    max_lr = 3e-3  # Maximum LR
    min_lr = cfg["optimizer"]["min_lr"]  # Minimum LR
    t_max = 10  # How many epochs to go from max_lr to min_lr
    save_method = cfg["optimizer"]["lr_scheduler_factor"]
    lr_patiences = cfg["optimizer"]["lr_patience"]
    lr_factor = cfg["optimizer"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode=save_method,
                                  min_lr=min_lr,
                                  patience=lr_patiences,
                                  factor=lr_factor)
    # scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)

    print("\nTraing shape: {} samples".format(len(train_loader.dataset)))
    print("Validation shape: {} samples".format(len(valid_loader.dataset)))
    print("Beginning training...")

    # export the result to log file
    logging.info("--------------------------------")
    logging.info("session name: {}".format(cfg["session"]["sess_name"]))
    # logging.info(model)
    logging.info("CONFIGS:")
    logging.info(cfg)

    # initialize the early_stopping object
    checkpoint_path = os.path.join(log_dir, "Checkpoint.pt")
    save_mode = cfg["train"]["mode"]
    early_patience = cfg["train"]["early_patience"]
    early_stopping = callbacks.EarlyStopping(patience=early_patience,
                                             mode=save_mode,
                                             path=checkpoint_path)

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    t0 = time.time()

    for epoch in range(num_epoch):
        t1 = time.time()
        train_loss, train_acc, val_loss, val_acc, train_result, val_result = trainer.train_one_epoch(
            epoch,
            num_epoch,
            model,
            device,
            train_loader,
            valid_loader,
            criterion,
            optimizer,
            train_metrics,
            val_metrics,
        )

        train_checkpoint = {
            'epoch': epoch + 1,
            'valid_loss': val_loss,
            'model': model,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        ## lr scheduling
        scheduler.step(val_loss)

        ## log to file
        logging.info(
            "\n------Epoch %d / %d, Training time: %.4f seconds------" %
            (epoch + 1, num_epoch, (time.time() - t1)))
        logging.info("Training loss: {} - Other training metrics: {}".format(
            train_loss, train_result))
        logging.info(
            "Validation loss: {} - Other validation metrics: {}".format(
                val_loss, val_result))

        ## tensorboard
        tb_writer.add_scalar("Training Loss", train_loss, epoch + 1)
        tb_writer.add_scalar("Valid Loss", val_loss, epoch + 1)
        tb_writer.add_scalar("Training Accuracy",
                             train_result["accuracy_score"], epoch + 1)
        tb_writer.add_scalar("Valid Accuracy", val_result["accuracy_score"],
                             epoch + 1)
        # tb_writer.add_scalar("training f1_score", train_result["f1_score"], epoch + 1)
        # tb_writer.add_scalar("valid f1_score", val_result["f1_score"], epoch + 1)

        # Save model
        if save_mode == "min":
            early_stopping(val_loss, train_checkpoint)
        else:
            early_stopping(val_acc, train_checkpoint)
        if early_stopping.early_stop:
            logging.info("Early Stopping!!!")
            break

    # testing on test set
    # load the test model and making inference
    print("\nInference on the testing set")
    checkpoint = torch.load(checkpoint_path)
    test_model = checkpoint['model']
    test_model.load_state_dict(checkpoint['state_dict'])
    test_model = test_model.to(device)

    # logging report
    report = tester.test_result(test_model, test_loader, device, cfg)
    logging.info("\nClassification Report: \n {}".format(report))
    logging.info('Completed in %.3f seconds.' % (time.time() - t0))

    print("Classification Report: \n{}".format(report))
    print('Completed in %.3f seconds.' % (time.time() - t0))
    print(
        'Start Tensorboard with tensorboard --logdir {}, view at http://localhost:6006/'
        .format(log_dir))
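
Examples 2 and 3 depend on a callbacks.EarlyStopping helper that is not shown. A minimal sketch of the interface they assume (patience, mode, path, a __call__ that receives the monitored value plus a checkpoint dict, and an early_stop flag) is given below; the comparison and saving details are assumptions, not the project's actual code.

import torch

class EarlyStopping:
    """Stop training when the monitored value has not improved for `patience` epochs."""

    def __init__(self, patience=5, mode="min", path="Checkpoint.pt"):
        self.patience = patience
        self.mode = mode  # "min" monitors a loss, "max" monitors an accuracy
        self.path = path
        self.best = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, value, checkpoint):
        improved = (
            self.best is None
            or (self.mode == "min" and value < self.best)
            or (self.mode == "max" and value > self.best)
        )
        if improved:
            self.best = value
            self.counter = 0
            torch.save(checkpoint, self.path)  # keep only the best checkpoint
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True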
Example #4
def main(model,
         dataset,
         validation_flag,
         comment="No comment",
         checkpoint=None,
         num_of_class=2):

    # Checking cuda
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))

    if checkpoint is not None:
        print("...Load checkpoint from {}".format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint['state_dict'])
        print("...Checkpoint loaded")

    # Convert to suitable device
    model = model.to(device)
    print("Number of parameters: ", sum(p.numel() for p in model.parameters()))
    logging.info("Model created...")

    # using parsed configurations to create a dataset
    data = cfg["data"]["data_csv_name"]
    print("Reading training data from file: ", data)
    training_set = pd.read_csv(data)

    # check if validation flag is on
    if validation_flag == 0:
        # using custom validation set
        print("Creating validation set from file")
        valid = cfg["data"]["validation_csv_name"]
        print("Reading validation data from file: ", valid)
        valid_set = pd.read_csv(valid)
    else:
        # auto divide validation set
        print("Splitting dataset into train and valid....")
        validation_split = float(cfg["data"]["validation_ratio"])
        training_set, valid_set, _, _ = data_split(training_set,
                                                   validation_split)
        print("Done Splitting !!!")

    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])

    # Create dataset
    training_set = dataset(training_set, data_path, transform.train_transform)
    valid_set = dataset(valid_set, data_path, transform.val_transform)

    # End sampler
    train_loader = torch.utils.data.DataLoader(training_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(valid_set,
                                             batch_size=batch_size,
                                             shuffle=False)
    logging.info("Dataset and Dataloaders created")

    # create a metric for evaluating
    # global train_metrics
    # global val_metrics
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # method to optimize the model
    # read settings from json file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # initialize optimization settings: lr, lr scheduler, optimizer
    try:
        # if the loss function comes from the nn package
        criterion = getattr(nn, loss_function)
    except AttributeError:
        # otherwise fall back to a custom loss defined in custom_loss
        criterion = getattr(custom_loss, loss_function)
    criterion = criterion()
    optimizer = getattr(torch.optim, optimizers)
    max_lr = 3e-3  # Maximum LR
    min_lr = 1e-5  # Minimum LR
    t_max = 10  # How many epochs to go from max_lr to min_lr
    # optimizer = torch.optim.Adam(
    # params=model.parameters(), lr=max_lr, amsgrad=False)
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode=save_method,
                                  min_lr=min_lr,
                                  patience=patiences,
                                  factor=lr_factor)
    # scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)

    print("\nTraing shape: {} samples".format(len(train_loader.dataset)))
    print("Validation shape: {} samples".format(len(val_loader.dataset)))
    print("Beginning training...")

    # export the result to log file
    logging.info("--------------------------------")
    logging.info("session name: {}".format(cfg["session"]["sess_name"]))
    # logging.info(model)
    logging.info("CONFIGS:")
    logging.info(cfg)

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    t0 = time.time()
    for epoch in range(0, num_epoch):
        t1 = time.time()
        print(('\n' + '%13s' * 3) % ('Epoch', 'gpu_mem', 'mean_loss'))
        train_loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            epoch,
            num_epoch,
            model,
            device,
            train_loader,
            val_loader,
            criterion,
            optimizer,
            train_metrics,
            val_metrics,
        )
        # lr scheduling
        scheduler.step(val_loss)

        # log to file
        logging.info(
            "\n------Epoch %d / %d, Training time: %.4f seconds------" %
            (epoch + 1, num_epoch, (time.time() - t1)))
        logging.info("Training loss: {} - Other training metrics: {}".format(
            train_loss, train_result))
        logging.info(
            "Validation loss: {} - Other validation metrics: {}".format(
                val_loss, val_result))

        tb_writer.add_scalar("Training Loss", train_loss, epoch + 1)
        tb_writer.add_scalar("Valid Loss", val_loss, epoch + 1)
        tb_writer.add_scalar("Training Accuracy",
                             train_result["accuracy_score"], epoch + 1)
        tb_writer.add_scalar("Valid Accuracy", val_result["accuracy_score"],
                             epoch + 1)
        # tb_writer.add_scalar("training f1_score", train_result["f1_score"], epoch + 1)
        # tb_writer.add_scalar("valid f1_score", val_result["f1_score"], epoch + 1)

        # saving epoch with best validation accuracy
        if best_val_acc < float(val_result["accuracy_score"]):
            logging.info("Validation accuracy= " +
                         str(val_result["accuracy_score"]))
            logging.info("====> Save best at epoch {}".format(epoch + 1))
            best_val_acc = val_result["accuracy_score"]
            checkpoint = {
                'epoch': epoch + 1,
                'valid_loss': val_loss,
                'model': model,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(checkpoint, log_dir + "/Checkpoint.pt")

    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_df = pd.read_csv(test_data)

    # prepare the dataset
    testing_set = dataset(test_df, data_path, transform.val_transform)
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=32,
                                              shuffle=False)
    print("\nInference on the testing set")

    # load the test model and making inference
    checkpoint = torch.load(log_dir + "/Checkpoint.pt")
    test_model = checkpoint['model']
    test_model.load_state_dict(checkpoint['state_dict'])
    test_model = test_model.to(device)

    # logging report
    report = tester.test_result(test_model, test_loader, device, cfg)
    logging.info("\nClassification Report: \n {}".format(report))
    logging.info('%d epochs completed in %.3f seconds.' % (num_epoch,
                                                           (time.time() - t0)))

    print("Classification Report: \n{}".format(report))
    print('%d epochs completed in %.3f seconds.' % (num_epoch,
                                                    (time.time() - t0)))
    print(
        f'Start Tensorboard with "tensorboard --logdir {log_dir}", view at http://localhost:6006/'
    )
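
Examples 2 through 6 finish with tester.test_result(model, test_loader, device, cfg), which is expected to return a printable classification report. A plausible minimal version, assuming the loader yields (image, label) batches and that sklearn produces the report, is sketched here; it is not the project's actual implementation.

import torch
from sklearn.metrics import classification_report

def test_result(model, test_loader, device, cfg=None):
    # cfg is accepted to match the call sites above but is unused in this sketch
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for images, labels in test_loader:  # assumes (image, label) batches
            outputs = model(images.to(device))
            y_pred.extend(outputs.argmax(dim=1).cpu().tolist())
            y_true.extend(labels.tolist())
    return classification_report(y_true, y_pred)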
Example #5
def main():
    parser = argparse.ArgumentParser(description='NA')
    parser.add_argument('-c',
                        '--configure',
                        default='cfgs/chexphoto.cfg',
                        help='JSON file')
    parser.add_argument('-cp',
                        '--checkpoint',
                        default=None,
                        help='checkpoint path')
    args = parser.parse_args()
    checkpoint = args.checkpoint
    # read configure file
    with open(args.configure) as f:
        cfg = json.load(f)
    time_str = str(datetime.now().strftime("%Y%m%d-%H%M"))
    tensorboard_writer = logger.make_writer(cfg["session"]["sess_name"],
                                            time_str)
    # using parsed configurations to create a dataset
    data = cfg["data"]["data_csv_name"]
    valid = cfg['data']['test_csv_name']
    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])
    validation_split = float(cfg["data"]["validation_ratio"])
    # create dataset
    training_set = pd.read_csv(data, usecols=["file_name", "label"])
    valid_set = pd.read_csv(valid, usecols=["file_name", "label"])
    # train, test, _, _ = dataloader.data_split(training_set, validation_split)

    training_set = dataloader.ClassificationDataset(training_set, data_path,
                                                    transform.train_transform)

    testing_set = dataloader.ClassificationDataset(valid_set, data_path,
                                                   transform.val_transform)
    # create dataloaders
    # global train_loader
    # global val_loader
    # Sampler to prevent imbalanced data labels
    # train_loader = torch.utils.data.DataLoader(training_set,sampler=ImbalancedDatasetSampler(training_set, callback_get_label=lambda x, i: tuple(x[i][1].tolist())),batch_size=batch_size,)

    #End sampler
    train_loader = torch.utils.data.DataLoader(
        training_set,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        testing_set,
        batch_size=batch_size,
        shuffle=False,
    )
    # val_loader = torch.utils.data.DataLoader(testing_set,sampler=ImbalancedDatasetSampler(testing_set, callback_get_label=lambda x, i: tuple(x[i][1].tolist())),batch_size=batch_size,)

    logging.info("Dataset and Dataloaders created")
    # create a model
    extractor_name = cfg["train"]["extractor"]
    model = cls.ClassificationModel(model_name=extractor_name).create_model()
    #load checkpoint to continue training
    if checkpoint is not None:
        print('...Load checkpoint from {}'.format(checkpoint))
        checkpoint = torch.load(checkpoint)
        model.load_state_dict(checkpoint)
        print('...Checkpoint loaded')

        classifier = nn.Sequential(nn.Linear(1408, 512, bias=True),
                                   nn.ReLU(inplace=True),
                                   nn.Linear(512, 6, bias=True))

        # create classifier
        # replace the last linear layer with your custom classifier
        # model._avg_pooling = SPPLayer([1,2,4])
        model._fc = classifier
        # model.last_linear = self.cls
        # select which layers to unfreeze
        params = list(model.parameters())
        len_param = len(params)
        # for index,param in enumerate(model.parameters()):
        #     if index == (len_param -1):
        #         param.requires_grad = True
        #     else:
        #         param.requires_grad = False
        # for param in model.parameters():
        #     print(param.requires_grad)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    # convert to suitable device
    # global model
    model = model.to(device)
    logging.info("Model created...")

    # create a metric for evaluating
    # global train_metrics
    # global val_metrics
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # method to optimize the model
    # read settings from json file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # initialize optimization settings: lr, lr scheduler, optimizer
    try:
        # if the loss function comes from the nn package
        criterion = getattr(nn, loss_function)
    except AttributeError:
        # otherwise fall back to a custom loss defined in custom_loss
        criterion = getattr(custom_loss, loss_function)
    criterion = criterion()
    optimizer = getattr(torch.optim, optimizers)
    max_lr = 3e-3  # Maximum LR
    min_lr = 1e-5  # Minimum LR
    t_max = 10  # How many epochs to go from max_lr to min_lr
    # optimizer = torch.optim.Adam(
    # params=model.parameters(), lr=max_lr, amsgrad=False)
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    # scheduler = ReduceLROnPlateau(
    #     optimizer, save_method, patience=patiences, factor=lr_factor
    # )
    scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=min_lr)

    # before training, let's create a file for logging model result

    log_file = logger.make_file(cfg["session"]["sess_name"], time_str)

    logger.log_initilize(log_file)
    print("Beginning training...")
    # export the result to log file
    f = open("saved/logs/traning_{}.txt".format(cfg["session"]["sess_name"]),
             "a")
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")
    # logging the configs:
    # logging.info(f.read())
    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    for i in range(0, num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
        )

        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
            format(i + 1, num_epoch, loss))
        print("Epoch {} / {} \n Training acc: {} - Other training metrics: ".
              format(i + 1, num_epoch, train_result["accuracy_score"]))
        print("Epoch {} / {} \n Training loss: {} - Other training metrics: ".
              format(i + 1, num_epoch, loss))
        f.write(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
            format(i + 1, num_epoch, loss))
        f.write("Epoch {} / {} \n Training acc: {} - Other training metrics: ".
                format(i + 1, num_epoch, train_result["accuracy_score"]))
        tensorboard_writer.add_scalar("training accuracy",
                                      train_result["accuracy_score"], i + 1)
        tensorboard_writer.add_scalar("training f1_score",
                                      train_result["f1_score"], i + 1)
        tensorboard_writer.add_scalar("training metrics", loss, i + 1)
        logging.info(train_result)
        logging.info(
            " \n Validation loss : {} - Other validation metrics:".format(
                val_loss))
        print(
            "Epoch {} / {} \n Validation acc: {} - Other validation metrics: ".
            format(i + 1, num_epoch, val_result["accuracy_score"]))
        f.write(" \n Validation loss : {} - Other validation metrics:".format(
            val_loss))
        tensorboard_writer.add_scalar("valid accuracy",
                                      val_result["accuracy_score"], i + 1)
        tensorboard_writer.add_scalar("valid f1_score", val_result["f1_score"],
                                      i + 1)
        tensorboard_writer.add_scalar("valid metrics", val_loss, i + 1)
        logging.info(val_result)
        logging.info("\n")
        # saving epoch with best validation accuracy
        if best_val_acc < float(val_result["accuracy_score"]):
            logging.info("Validation accuracy= " +
                         str(val_result["accuracy_score"]) +
                         "===> Save best epoch")
            f.write("Validation accuracy= " +
                    str(val_result["accuracy_score"]) + "===> Save best epoch")
            best_val_acc = val_result["accuracy_score"]
            torch.save(
                model.state_dict(),
                "saved/models/" + time_str + "-" +
                cfg["train"]["save_as_name"],
            )
        # lr scheduling (CosineAnnealingLR.step takes no metric argument)
        scheduler.step()
        # else:
        #     # logging.info(
        #     #     "Validation accuracy= "+ str(val_result["accuracy_score"])+ "===> No saving"
        #     # )
        #     continue

    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_df = pd.read_csv(test_data, usecols=["file_name", "label"])

    # prepare the dataset
    testing_set = dataloader.ClassificationDataset(test_df, data_path,
                                                   transform.val_transform)

    # make dataloader
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=32,
                                              shuffle=False)
    print("Inference on the testing set")

    # load the test model and making inference
    test_model = cls.ClassificationModel(
        model_name=extractor_name).create_model()
    model_path = os.path.join("saved/models",
                              time_str + "-" + cfg["train"]["save_as_name"])
    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)
    logging.info(tester.test_result(test_model, test_loader, device, cfg))
Example #6
def main():
    # read configure file
    with open("cfgs/tenes.cfg") as f:
        cfg = json.load(f)

    # using parsed configurations to create a dataset
    data = cfg["data"]["data_csv_name"]
    data_path = cfg["data"]["data_path"]
    batch_size = int(cfg["data"]["batch_size"])
    validation_split = float(cfg["data"]["validation_ratio"])
    # create dataset
    training_set = pd.read_csv(data, usecols=["image_name", "target"])
    training_set["image_name"] = training_set["image_name"] + '.jpg'
    training_set = shuffle(training_set)
    training_set = training_set.sample(25000)

    print(training_set['target'].value_counts())
    train, test, _, _ = dataloader.data_split(training_set, validation_split)

    training_set = dataloader.ClassificationDataset(train, data_path,
                                                    transform.train_transform)

    testing_set = dataloader.ClassificationDataset(test, data_path,
                                                   transform.val_transform)
    # create dataloaders
    # global train_loader
    # global val_loader
    train_loader = torch.utils.data.DataLoader(
        training_set,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        testing_set,
        batch_size=batch_size,
        shuffle=False,
    )

    logging.info("Dataset and Dataloaders created")
    # create a model
    extractor_name = cfg["train"]["extractor"]
    model = cls.ClassificationModel(model_name=extractor_name).create_model()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info("Using device: {} ".format(device))
    # convert to suitable device
    # global model
    model = model.to(device)
    logging.info("Model created...")

    # create a metric for evaluating
    # global train_metrics
    # global val_metrics
    train_metrics = metrics.Metrics(cfg["train"]["metrics"])
    val_metrics = metrics.Metrics(cfg["train"]["metrics"])
    print("Metrics implemented successfully")

    # method to optimize the model
    # read settings from json file
    loss_function = cfg["optimizer"]["loss"]
    optimizers = cfg["optimizer"]["name"]
    learning_rate = cfg["optimizer"]["lr"]

    # initialize optimization settings: lr, lr scheduler, optimizer
    try:
        # if the loss function comes from the nn package
        criterion = getattr(nn, loss_function)
    except AttributeError:
        # use custom loss
        criterion = getattr(custom_loss, loss_function)
    # NOTE: the configured loss is then overridden with focal loss
    criterion = custom_loss.FocalLoss()
    optimizer = getattr(torch.optim, optimizers)
    optimizer = optimizer(model.parameters(), lr=learning_rate)
    save_method = cfg["train"]["lr_scheduler_factor"]
    patiences = cfg["train"]["patience"]
    lr_factor = cfg["train"]["reduce_lr_factor"]
    scheduler = ReduceLROnPlateau(optimizer,
                                  save_method,
                                  patience=patiences,
                                  factor=lr_factor)

    # before training, let's create a file for logging model result
    time_str = str(datetime.now().strftime("%Y%m%d-%H%M"))
    log_file = logger.make_file(cfg["session"]["sess_name"], time_str)
    logger.log_initilize(log_file)
    print("Beginning training...")
    # export the result to log file
    logging.info("-----")
    logging.info("session name: {} \n".format(cfg["session"]["sess_name"]))
    logging.info("Training size: " + str(len(train)))
    logging.info("Validation size: " + str(len(test)))
    logging.info(model)
    logging.info("\n")
    logging.info("CONFIGS \n")
    # logging the configs:

    # training models
    num_epoch = int(cfg["train"]["num_epoch"])
    best_val_acc = 0
    for i in range(0, num_epoch):
        loss, val_loss, train_result, val_result = trainer.train_one_epoch(
            model,
            train_loader,
            val_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            val_metrics,
        )

        # lr scheduling
        scheduler.step(val_loss)
        logging.info(
            "Epoch {} / {} \n Training loss: {} - Other training metrics: ".
            format(i + 1, num_epoch, loss))
        logging.info(train_result)
        logging.info(
            " \n Validation loss : {} - Other validation metrics:".format(
                val_loss))
        logging.info(val_result)
        logging.info("\n")
        # saving epoch with best validation accuracy
        if best_val_acc < float(val_result["f1_score"]):
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         "===> Save best epoch")
            best_val_acc = val_result["f1_score"]
            torch.save(
                model.state_dict(),
                "saved/models/" + time_str + "-" +
                cfg["train"]["save_as_name"],
            )
        else:
            logging.info("Validation f1= " + str(val_result["f1_score"]) +
                         "===> No saving")
            continue

    # testing on test set
    test_data = cfg["data"]["test_csv_name"]
    data_path = cfg["data"]["data_path"]
    test_df = pd.read_csv(test_data, usecols=["image_name", "target"])
    test_df['image_name'] = test_df['image_name'] + '.jpg'
    # prepare the dataset
    testing_set = dataloader.TestDataset(test_df, 'dataset/test/test',
                                         transform.test_transform)

    # make dataloader
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    test_loader = torch.utils.data.DataLoader(testing_set,
                                              batch_size=16,
                                              shuffle=False)
    print("\n Inference on the testing set")

    # load the test model and making inference
    test_model = cls.ClassificationModel(
        model_name=extractor_name).create_model()
    model_path = os.path.join("saved/models",
                              time_str + "-" + cfg["train"]["save_as_name"])
    test_model.load_state_dict(torch.load(model_path))
    test_model = test_model.to(device)
    logging.info(tester.test_result(test_model, test_loader, device))
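
The configuration keys read in the last two examples imply a JSON file with roughly the structure below (written here as the Python dict that json.load would return); the key names are taken from the code, the values are illustrative guesses.

cfg = {
    "session": {"sess_name": "baseline_run"},
    "data": {
        "data_csv_name": "dataset/train.csv",
        "test_csv_name": "dataset/test.csv",
        "data_path": "dataset/train",
        "batch_size": 32,
        "validation_ratio": 0.2,
    },
    "train": {
        "extractor": "efficientnet-b0",
        "metrics": ["accuracy_score", "f1_score"],
        "num_epoch": 30,
        "patience": 3,
        "reduce_lr_factor": 0.5,
        "lr_scheduler_factor": "min",
        "save_as_name": "model.pt",
    },
    "optimizer": {"name": "Adam", "loss": "CrossEntropyLoss", "lr": 1e-4},
}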