def evaluate_acc(model, bert_model, dataloader, engineered_features=False):
    """Evaluate accuracy of a model.

    Written by Leo Nguyen. Contact Xenovortex if problems arise.

    Args:
        model (torch.nn.Module): PyTorch model of a Regression Neural Network
        bert_model (torch.nn.Module): BERT PyTorch model for feature extraction
        dataloader (PyTorch dataloader): PyTorch dataloader of dataset
        engineered_features (bool, optional): concatenate engineered features onto the vectorized sentence
    
    Returns:
        accuracy (float): accuracy of the model on the dataset, in percent
    """

    # check if GPU available
    device = gpu.check_gpu()

    # move model to device
    model = model.to(device)

    # sigmoid
    sigmoid = torch.nn.Sigmoid()

    # counters for accuracy computation
    correct = 0
    total = 0

    with torch.no_grad():
        for i, data in enumerate(tqdm(dataloader)):
            # move batch to device (the model was already moved above)
            input_id = data[0].to(device)
            segment = data[1].to(device)
            label = data[2].to(device)
            if engineered_features:
                extra_feat = data[3].to(device)

            # BERT feature extraction
            features = bert_model.get_features(input_id, segment)

            # add engineered features
            if engineered_features:
                features = torch.cat((features, extra_feat), 1)

            # prediction
            output = sigmoid(model(features))
            pred = torch.round(output)

            # count correct predictions
            total += label.size(0)
            correct += (pred == label).sum().item()

    return 100 * correct / total
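
# Hedged, self-contained demo of the prediction-counting logic above (pure
# PyTorch, no repo dependencies; the logits and labels are fabricated
# stand-ins for a real batch):
def _demo_accuracy_counting():
    import torch

    sigmoid = torch.nn.Sigmoid()
    logits = torch.tensor([[2.0], [-1.0], [0.5]])  # fake model outputs
    labels = torch.tensor([[1.0], [0.0], [0.0]])   # fake binary labels
    pred = torch.round(sigmoid(logits))            # round() thresholds at 0.5
    return 100 * (pred == labels).sum().item() / labels.size(0)  # 66.67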
Example 2

class BERT:
    def __init__(self):
        """BERT wrapper class

        Written by Leo Nguyen. Contact Xenovortex if problems arise.
        """
        self.device = gpu.check_gpu()

        # Load BERT tokenizer (note: do_lower_case=True lowercases input even
        # though the checkpoint is cased)
        self.tokenizer = BertTokenizer.from_pretrained(
            "bert-base-german-cased", do_lower_case=True)

        # Load pretrained BERT and move to GPU (if available)
        self.model = BertModel.from_pretrained("bert-base-german-cased")
        self.model = self.model.to(self.device)
        self.model.eval()
        print("BERT model moved to device:", self.device)
def evaluate_model(model,
                   bert_model,
                   dataloader,
                   engineered_features=False,
                   multiple_dataset=False):
    """Evaluate regression metrics of a model on a dataset

    Written by Leo Nguyen. Contact Xenovortex if problems arise.

    Args:
        model (torch.nn.Module): PyTorch model of a Regression Neural Network
        bert_model (torch.nn.Module): BERT PyTorch model for feature extraction
        dataloader (PyTorch dataloader): PyTorch dataloader of dataset
        engineered_features (bool, optional): concatenate engineered features onto the vectorized sentence
        multiple_dataset (bool, optional): use multiple datasets

    Returns:
        MSE_mean (float): Mean Square Error
        RMSE_mean (float): Root Mean Square Error
        MAE_mean (float): Mean Absolute Error
        r_square_mean (float): R squared (coefficient of determination)
    """

    # check if GPU available
    device = gpu.check_gpu()

    # move model to device
    model = model.to(device)

    # record metrics per batch
    MSE_lst = []
    RMSE_lst = []
    MAE_lst = []
    r_square_lst = []

    # iterate through dataset
    with torch.no_grad():
        for i, data in enumerate(tqdm(dataloader)):
            # move batch to device (the model was already moved above)
            input_id = data[0].to(device)
            segment = data[1].to(device)
            label = data[2].to(device)
            if engineered_features and multiple_dataset:
                extra_feat = data[3].to(device)
                dataset_label = data[4].to(device)
            elif engineered_features:
                extra_feat = data[3].to(device)
            elif multiple_dataset:
                dataset_label = data[3].to(device)

            # BERT feature extraction
            features = bert_model.get_features(input_id, segment)

            # add engineered features
            if engineered_features:
                features = torch.cat((features, extra_feat), 1)

            # add dataset conditional label (always 0)
            if multiple_dataset:
                features = torch.cat(
                    (features, torch.zeros_like(dataset_label)), 1)

            # prediction
            output = model(features)

            # evaluate
            MSE, RMSE, MAE, r_square = evaluate(label.cpu(), output.cpu())
            MSE_lst.append(MSE)
            RMSE_lst.append(RMSE)
            MAE_lst.append(MAE)
            r_square_lst.append(r_square)

    # compute mean over all batches
    MSE_mean = sum(MSE_lst) / len(MSE_lst)
    RMSE_mean = sum(RMSE_lst) / len(RMSE_lst)
    MAE_mean = sum(MAE_lst) / len(MAE_lst)
    r_square_mean = sum(r_square_lst) / len(r_square_lst)

    return MSE_mean, RMSE_mean, MAE_mean, r_square_mean
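
# The `evaluate` helper called above is defined elsewhere in the repo; this is
# a hedged stand-in with the assumed signature (per-batch MSE, RMSE, MAE and
# R^2), sketched with scikit-learn:
def _evaluate_sketch(label, output):
    import numpy as np
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    y_true = np.asarray(label).ravel()
    y_pred = np.asarray(output).ravel()
    MSE = mean_squared_error(y_true, y_pred)
    return MSE, np.sqrt(MSE), mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred)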
Example 4
def train_pretask(pretask_epoch, model, bert_model, dataloader, criterion, optimizer, engineered_features, log_path, fig_path, model_path, save_name):
    """Train a model on a pretask

    Written by Leo Nguyen. Contact Xenovortex if problems arise.

    Args:
        pretask_epoch (int): number of epochs to spend on the pretask
        model (torch.nn.Module): PyTorch model of a Regression Neural Network
        bert_model (torch.nn.Module): BERT PyTorch model for feature extraction
        dataloader (PyTorch dataloader): PyTorch dataloader of dataset
        criterion (function): loss function
        optimizer (PyTorch optimizer): optimizer of model parameters
        engineered_features (bool): concatenate engineered features onto the vectorized sentence
        log_path (str): path to save log files
        fig_path (str): path to save figures
        model_path (str): path to save model
        save_name (str): name under which to save trained model and results
    
    Returns:
        model (torch.nn.Module): trained PyTorch model
    """

    # log
    pretask_loss_log = []
    pretask_acc_log = []
   
    # set device
    device = gpu.check_gpu()
    print("Pretask Training on:", device)

    # sigmoid 
    sigmoid = torch.nn.Sigmoid()

    # create directories
    for path in [model_path, log_path, fig_path]:
        if not exists(join(path, "pretask")):
            os.makedirs(join(path, "pretask"))

    for epoch in range(pretask_epoch):
        start = time.time()
        model.train()

        print("Training:")

        # training
        for i, data in enumerate(tqdm(dataloader)):
            # move batch and model to device (the checkpoint step below moves
            # the model back to the CPU each epoch, so this re-move is needed)
            model.to(device)
            input_id = data[0].to(device)
            segment = data[1].to(device)
            label = data[2].to(device)
            if engineered_features:
                extra_feat = data[3].to(device)

            # clear gradients
            optimizer.zero_grad()

            # BERT feature extraction
            features = bert_model.get_features(input_id, segment)

            # add engineered features
            if engineered_features:
                features = torch.cat((features, extra_feat), 1)
            
            # prediction
            output = sigmoid(model(features))
 
            # loss evaluation
            loss = criterion(output, label)
            loss.backward()

            # backpropagation
            optimizer.step()

            # record loss (reset the running loss at the first batch of each epoch)
            curr_loss = torch.mean(loss).item()
            running_loss = curr_loss if (i == 0) else running_loss + curr_loss

        # evaluation
        print("Evaluation:")
        model.eval()
        running_loss /= len(dataloader)
        accuracy = evaluater.evaluate_acc(model, bert_model, dataloader, engineered_features)
       

        # log evaluation result
        pretask_loss_log.append(running_loss)
        pretask_acc_log.append(accuracy)

        # save logs
        with open(join(log_path, "pretask", save_name + ".txt"), "w") as file:
            print("Last Epoch:", epoch + 1, file=file)
            print("Final Loss:", pretask_loss_log[-1], file=file)
            print("Final Train Accuracy:", pretask_acc_log[-1], file=file)

        # save variables 
        with open(join(log_path, "pretask", save_name + ".pkl"), "wb") as f:
            pickle.dump(
                [
                    pretask_loss_log,
                    pretask_acc_log,
                ],
                f,
            )

        # save model weights
        torch.save(
            model.to("cpu").state_dict(), join(model_path, "pretask", save_name + ".pt")
        )
        
        # print stats
        print(
            "epoch {} \t loss {:.5f} \t train_acc {:.3f} \t time {:.1f} sec".format(
                epoch + 1, running_loss, accuracy, time.time() - start
            )
        )

    # plots 
    plot_names = [
        "loss",
        "accuracy",
    ]
    for i, log in enumerate(
        [
            pretask_loss_log,
            pretask_acc_log
        ]
    ):
        plt.figure(num=None, figsize=(15, 10))
        plt.plot(log)
        plt.grid(True, which="both")
        plt.xlabel("epoch", fontsize=14)
        plt.ylabel(plot_names[i], fontsize=14)
        plt.savefig(join(fig_path, "pretask", save_name + "_" + plot_names[i] + ".png"))
        plt.close("all")
    
    return model 
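
# Self-contained sketch of the "freeze first layer" step that train_model
# (below) applies after pretask training; the Sequential is a toy stand-in
# for reg_model.model:
def _demo_freeze_first_layer():
    import torch

    net = torch.nn.Sequential(
        torch.nn.Linear(8, 4), torch.nn.ReLU(), torch.nn.Linear(4, 1)
    )
    for params in net[0].parameters():  # same pattern as reg_model.model[0]
        params.requires_grad = False
    return [p.requires_grad for p in net.parameters()]  # [False, False, True, True]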
Example 5
def train_model(
    filename,
    num_epoch,
    step_epochs,
    batch_size,
    lr,
    save_name,
    engineered_features=False,
    multiple_dataset=False,
    pretask_epoch=None,
    pretask_file=None,
    dropout=False,
    batchnorm=False,
    no_freeze=False
):
    """Train a model on the given dataset

    Written by Leo Nguyen. Contact Xenovortex if problems arise.

    Args:
        filename (str): name of h5 file containing dataset
        num_epoch (int): number of epochs
        step_epochs (list): epoch numbers at which the learning rate is reduced by a factor of 10
        batch_size (int): batch size
        lr (float): learning rate
        save_name (str): name under which to save trained model and results
        engineered_features (bool, optional): concatenate engineered features onto the vectorized sentence
        multiple_dataset (bool, optional): use multiple datasets for conditional training
        pretask_epoch (int, optional): number of epochs to spend on the pretask
        pretask_file (str, optional): filename of dataset for pretask
        dropout (bool, optional): use network architecture with dropout
        batchnorm (bool, optional): use network architecture with batch normalization
        no_freeze (bool, optional): in pretask training, don't freeze first layer
    """

    # save paths
    model_path = join(dirname(dirname(dirname(abspath(__file__)))), "model", "BERT")
    log_path = join(dirname(dirname(dirname(abspath(__file__)))), "result", "BERT")
    fig_path = join(dirname(dirname(dirname(abspath(__file__)))), "figures", "BERT")

    # create directories
    for path in [model_path, log_path, fig_path]:
        if not exists(dirname(path)):
            os.makedirs(dirname(path))
        if not exists(path):
            os.makedirs(path)

    # set device
    device = gpu.check_gpu()
    num_workers = multiprocessing.cpu_count()
    print("Training on:", device)
    print("Number of CPU cores detected:", num_workers)

    # read data
    df_train, df_test = to_dataframe.read_augmented_h5(filename)

    # setup BERT model
    bert_model = BERT.BERT()

    # prepare BERT input
    train_sentences = df_train.raw_text.values
    test_sentences = df_test.raw_text.values
    train_input_tensor, train_segment_tensor = bert_model.preprocessing(train_sentences)
    test_input_tensor, test_segment_tensor = bert_model.preprocessing(test_sentences)

    # extract labels and cast to PyTorch tensor
    train_labels = torch.tensor(
        list(df_train.rating.values), dtype=torch.float
    ).unsqueeze_(1)
    test_labels = torch.tensor(
        list(df_test.rating.values), dtype=torch.float
    ).unsqueeze_(1)

    # prepare dataset
    if engineered_features and multiple_dataset:
        extra_train_feat = torch.from_numpy(
            np.nan_to_num(sentencestats.construct_features(df_train.raw_text).values)
        ).float()
        extra_test_feat = torch.from_numpy(
            np.nan_to_num(sentencestats.construct_features(df_test.raw_text).values)
        ).float()
        train_dataset_label = torch.tensor(
            list(df_train.source.values), dtype=torch.float
        ).unsqueeze_(1)
        test_dataset_label = torch.tensor(
            list(df_test.source.values), dtype=torch.float
        ).unsqueeze_(1)
        trainset = TensorDataset(
            train_input_tensor,
            train_segment_tensor,
            train_labels,
            extra_train_feat,
            train_dataset_label,
        )
        testset = TensorDataset(
            test_input_tensor,
            test_segment_tensor,
            test_labels,
            extra_test_feat,
            test_dataset_label,
        )
    elif engineered_features:
        extra_train_feat = torch.from_numpy(
            np.nan_to_num(sentencestats.construct_features(df_train.raw_text).values)
        ).float()
        extra_test_feat = torch.from_numpy(
            np.nan_to_num(sentencestats.construct_features(df_test.raw_text).values)
        ).float()
        trainset = TensorDataset(
            train_input_tensor, train_segment_tensor, train_labels, extra_train_feat
        )
        testset = TensorDataset(
            test_input_tensor, test_segment_tensor, test_labels, extra_test_feat
        )
    elif multiple_dataset:
        train_dataset_label = torch.tensor(
            list(df_train.source.values), dtype=torch.float
        ).unsqueeze_(1)
        test_dataset_label = torch.tensor(
            list(df_test.source.values), dtype=torch.float
        ).unsqueeze_(1)
        trainset = TensorDataset(
            train_input_tensor, train_segment_tensor, train_labels, train_dataset_label
        )
        testset = TensorDataset(
            test_input_tensor, test_segment_tensor, test_labels, test_dataset_label
        )
    else:
        trainset = TensorDataset(train_input_tensor, train_segment_tensor, train_labels)
        testset = TensorDataset(test_input_tensor, test_segment_tensor, test_labels)

    # dataloader
    trainloader = DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
    )
    testloader = DataLoader(
        testset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )

    # setup pretask
    if pretask_epoch is not None and pretask_file is not None:
        # read data
        df_pretask, _ = to_dataframe.read_augmented_h5(pretask_file)
        
        # prepare BERT input
        pretask_sentences = df_pretask.raw_text.values
        pretask_input_tensor, pretask_segment_tensor = bert_model.preprocessing(pretask_sentences)
        
        # extract labels + cast to PyTorch tensors
        pretask_labels = torch.tensor(
            list(df_pretask.rating.values), dtype=torch.float
        ).unsqueeze_(1)
        
        # prepare dataset
        if engineered_features:
            extra_pretask_feat = torch.from_numpy(
                np.nan_to_num(sentencestats.construct_features(df_pretask.raw_text).values)
            ).float()
            pretask_set = TensorDataset(
                pretask_input_tensor, pretask_segment_tensor, pretask_labels, extra_pretask_feat
            )
        else:
            pretask_set = TensorDataset(pretask_input_tensor, pretask_segment_tensor, pretask_labels)
        
        # dataloader 
        pretask_loader = DataLoader(
            pretask_set,
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
            pin_memory=True,
        )


    # prepare regression model
    feat_size = 768
    if engineered_features:
        feat_size += 21
    if multiple_dataset:
        feat_size += 1

    if dropout and batchnorm:
        reg_model = architectures.BN_Dropout_Net(feat_size, 512, 1)
    elif batchnorm:
        reg_model = architectures.BN_Net(feat_size, 512, 1)
    elif dropout:
        reg_model = architectures.Dropout_Net(feat_size, 512, 1)
    else:
        reg_model = architectures.Net(feat_size, 512, 1)
    reg_model = reg_model.to(device)

    # optimizer
    optimizer = torch.optim.Adam(reg_model.parameters(), lr=lr)

    # criterion
    criterion = torch.nn.MSELoss()
    pretask_criterion = torch.nn.BCELoss()

    # scheduler
    scheduler = opt.lr_scheduler.MultiStepLR(optimizer, step_epochs, 0.1)


    if pretask_epoch is not None and pretask_file is not None:
        # training on pretask
        reg_model = train_pretask(pretask_epoch, reg_model, bert_model, pretask_loader, pretask_criterion, optimizer, engineered_features, log_path, fig_path, model_path, save_name)

        if not no_freeze:
            # freeze first layer of the model 
            for params in reg_model.model[0].parameters():
                params.requires_grad = False

    # log
    loss_log = []
    train_MSE_log = []
    train_RMSE_log = []
    train_MAE_log = []
    train_r2_log = []
    test_MSE_log = []
    test_RMSE_log = []
    test_MAE_log = []
    test_r2_log = []

    for epoch in range(num_epoch):
        start = time.time()
        reg_model.train()

        print("Training:")

        # training
        for i, data in enumerate(tqdm(trainloader)):
            # move batch and model to device (the checkpoint step below moves
            # the model back to the CPU each epoch, so this re-move is needed)
            reg_model.to(device)
            input_id = data[0].to(device)
            segment = data[1].to(device)
            label = data[2].to(device)
            if engineered_features and multiple_dataset:
                extra_feat = data[3].to(device)
                dataset_label = data[4].to(device)
            elif engineered_features:
                extra_feat = data[3].to(device)
            elif multiple_dataset:
                dataset_label = data[3].to(device)

            # clear gradients
            optimizer.zero_grad()

            # BERT feature extraction
            features = bert_model.get_features(input_id, segment)

            # add engineered features
            if engineered_features:
                features = torch.cat((features, extra_feat), 1)

            # add dataset conditional label
            if multiple_dataset:
                features = torch.cat((features, dataset_label), 1)

            # prediction
            output = reg_model(features)

            # loss evaluation
            loss = criterion(output, label)
            loss.backward()

            # backpropagation
            optimizer.step()

            # record loss (reset the running loss at the first batch of each epoch)
            curr_loss = torch.mean(loss).item()
            running_loss = curr_loss if (i == 0) else running_loss + curr_loss

        # update training schedule
        scheduler.step()

        # evaluation
        print("Evaluation:")
        reg_model.eval()
        running_loss /= len(trainloader)
        MSE_train, RMSE_train, MAE_train, r2_train = evaluater.evaluate_model(
            reg_model, bert_model, trainloader, engineered_features, multiple_dataset
        )
        MSE_test, RMSE_test, MAE_test, r2_test = evaluater.evaluate_model(
            reg_model, bert_model, testloader, engineered_features, multiple_dataset
        )

        # log evaluation result
        loss_log.append(running_loss)
        train_MSE_log.append(MSE_train)
        train_RMSE_log.append(RMSE_train)
        train_MAE_log.append(MAE_train)
        train_r2_log.append(r2_train)
        test_MSE_log.append(MSE_test)
        test_RMSE_log.append(RMSE_test)
        test_MAE_log.append(MAE_test)
        test_r2_log.append(r2_test)

        # save logs
        with open(join(log_path, save_name + ".txt"), "w") as file:
            print("Last Epoch:", epoch + 1, file=file)
            print("Final Loss:", loss_log[-1], file=file)
            print("Final Train MSE:", train_MSE_log[-1], file=file)
            print("Final Train RMSE:", train_RMSE_log[-1], file=file)
            print("Final Train MAE:", train_MAE_log[-1], file=file)
            print("Final Train R2:", train_r2_log[-1], file=file)
            print("Final Test MSE:", test_MSE_log[-1], file=file)
            print("Final Test RMSE:", test_RMSE_log[-1], file=file)
            print("Final Test MAE:", test_MAE_log[-1], file=file)
            print("Final Test R2:", test_r2_log[-1], file=file)

        # save variables
        with open(join(log_path, save_name + ".pkl"), "wb") as f:
            pickle.dump(
                [
                    loss_log,
                    train_MSE_log,
                    train_RMSE_log,
                    train_MAE_log,
                    train_r2_log,
                    test_MSE_log,
                    test_RMSE_log,
                    test_MAE_log,
                    test_r2_log,
                ],
                f,
            )

        # save model weights
        torch.save(
            reg_model.to("cpu").state_dict(), join(model_path, save_name + ".pt")
        )

        # print stats
        print(
            "epoch {} \t loss {:.5f} \t train_r2 {:.3f} \t test_r2 {:.3f} \t time {:.1f} sec".format(
                epoch + 1, running_loss, r2_train, r2_test, time.time() - start
            )
        )

    # plots
    plot_names = [
        "loss",
        "train_MSE",
        "train_RMSE",
        "train_MAE",
        "train_r2",
        "test_MSE",
        "test_RMSE",
        "test_MAE",
        "test_r2",
    ]
    for i, log in enumerate(
        [
            loss_log,
            train_MSE_log,
            train_RMSE_log,
            train_MAE_log,
            train_r2_log,
            test_MSE_log,
            test_RMSE_log,
            test_MAE_log,
            test_r2_log,
        ]
    ):
        plt.figure(num=None, figsize=(15, 10))
        plt.plot(log)
        plt.grid(True, which="both")
        plt.xlabel("epoch", fontsize=14)
        plt.ylabel(plot_names[i], fontsize=14)
        plt.savefig(join(fig_path, save_name + "_" + plot_names[i] + ".png"))
        plt.close("all")
Example 6
    print("Inference complete!")

def run(model_file, count=None):
    image_shape = IMAGE_SHAPE_KITI  # images are resized to 160x576 for the KITTI dataset

    assert model_file is not None, "Model file must be provided for inference to be run"

    with tf.Session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        input_layer, keep_prob, correct_label, learning_rate, output_layer, logits, _, _ = build_model(model_file=model_file, reload_model=True, sess=sess)
        run_inference(sess, image_shape, logits, keep_prob, input_layer, count=count)

if __name__ == '__main__':
    print("###############################################")
    print("#         IMAGE SEGMENTATION                  #")
    print("###############################################")

    current_time_millis = lambda: int(round(time.time() * 1000))

    parser = argparse.ArgumentParser(description='Image Segmentation Inference')
    parser.add_argument('-o', dest='model_folder', default=str(current_time_millis()), type=str, help='Location of model on disk')
    parser.add_argument('-n', dest='count', default=None, type=int, help='Number of images to segment (default: All)')
    args = parser.parse_args()

    check_gpu()

    model_file = os.path.join(MODELS_DIR, args.model_folder, MODEL_FILE_PATTERN)
    run(model_file=model_file, count=args.count)
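
# Hedged CLI sketch (assumes this script is saved as e.g. inference.py and
# that MODELS_DIR contains the named model folder; values are illustrative):
#
#   python inference.py -o my_model_folder -n 10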