def binary_naive_bayes():
    model = nb.NaiveBayesModel()
    clean = cn.DataCLean()
    doc_vector = dv.DocumentVector()
    df_clean, uniqueWords = clean.Clean()
    df_clean_test, df_clean_train = split(
        df_clean, 0, int(.3 * (df_clean['class'].count())))
    docVector = doc_vector.binary_docvector(df_clean_train, uniqueWords)
    # print(docVector)
    df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI, numWordsInPI, numWordsInNoPI = model.TrainModel(
        docVector, uniqueWords)
    # print("Model Trained")
    predict_df, test_data = model.predict(Prob_PI, Prob_NoPI, uniqueWords,
                                          df_WordGivenPI, df_WordGivenNoPi,
                                          numWordsInPI, numWordsInNoPI,
                                          df_clean_test, clean)

    print(
        "--------------Binary Naive Bayes Accuracy Stats---------------------------"
    )
    stats = em.Evaluate()
    TP, FN, TN, FP = stats.confusion_matrix(test_data, predict_df)
    print("Accuracy = ", stats.Accuracy(TP, TN, FP, FN))
    print("Precision = ", stats.Precision(TP, FP))
    print("Recall = ", stats.Recall(TP, FN))
    print("fScore = ", stats.fScore(TP, FN, FP))
    print("True Negative = ", stats.TrueNegative(TN, FP))
    print(
        "---------------------------------------------------------------------"
    )
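
The `em.Evaluate` helper is not shown in these examples. Assuming it implements the standard confusion-matrix metrics, the printed quantities correspond to something like the sketch below (the method names mirror the calls above, but the bodies are assumptions, not the project's actual code).

class Evaluate:
    # sketch of the assumed metric definitions, not the real em.Evaluate class
    def Accuracy(self, TP, TN, FP, FN):
        return (TP + TN) / (TP + TN + FP + FN)

    def Precision(self, TP, FP):
        return TP / (TP + FP)

    def Recall(self, TP, FN):
        return TP / (TP + FN)

    def fScore(self, TP, FN, FP):
        # harmonic mean of precision and recall
        p, r = self.Precision(TP, FP), self.Recall(TP, FN)
        return 2 * p * r / (p + r)

    def TrueNegative(self, TN, FP):
        # true negative rate (specificity)
        return TN / (TN + FP)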
Example #2
def test():
    model = nb.NaiveBayesModel()
    path = 'E:/DATA/Sem8/fyp/Training.csv'
    final_df, df = model.extract('E:/DATA/Sem8/fyp/merge.csv')
    count = 0
    start = -200
    end = 0
    accuracy = []
    precision = []
    recall = []
    fscore = []
    stats = em.Evaluate()
    for count in range(5):
        # advance the 200-row test window each fold (mirrors binary_naive_bayes_kfold below)
        start += 200
        end += 200
        df_test, df_train = split(final_df, start, end)
        # print(df_train)
        li_clean_text = model.clean_data(df_train)
        uniqueWords = model.make_unique_li(li_clean_text)
        # print(uniqueWords)
        docVector = model.binary_docvector(final_df, uniqueWords)
        df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI, numWordsInPI, numWordsInNoPI = model.TrainModel(
            docVector, uniqueWords)
        predict_df, test_data = model.Predict(Prob_PI, Prob_NoPI, uniqueWords,
                                              df_WordGivenPI, df_WordGivenNoPi,
                                              numWordsInPI, numWordsInNoPI)
        # print("--------------Naive Bayes Accuracy Stats---------------------------")
        TP, FN, TN, FP = stats.confusion_matrix(test_data, predict_df)
        accuracy.append(stats.Accuracy(TP, TN, FP, FN))
        precision.append(stats.Precision(TP, FP))
        recall.append(stats.Recall(TP, FN))
        fscore.append(stats.fScore(TP, FN, FP))
        # print("---------------------------------------------------------------------")
    print("accuracy = ",Average(accuracy))
    print("precison = ", Average(precision))
    print("recall = ", Average(recall))
    print("f-score = ", Average(fscore))
def binary_naive_bayes_kfold():
    model = nb.NaiveBayesModel()
    clean = cn.DataCLean()
    doc_vector = dv.DocumentVector()
    final_df, df = clean.extract(pathData)
    count = 0
    start = -200
    end = 0
    accuracy = []
    precision = []
    recall = []
    fscore = []
    true_neg = []
    stats = em.Evaluate()
    for count in range(5):
        start = start + 200
        end = end + 200
        df_test, df_train = split(final_df, start, end)
        # print(df_train)
        li_clean_text, df_clean = clean.clean_data(df_train)
        uniqueWords = clean.make_unique_li(li_clean_text)
        # # print(uniqueWords)
        docVector = doc_vector.binary_docvector(df_clean, uniqueWords)
        df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI, numWordsInPI, numWordsInNoPI = model.TrainModel(
            docVector, uniqueWords)
        predict_df, punc_df = model.predict(Prob_PI, Prob_NoPI, uniqueWords,
                                            df_WordGivenPI, df_WordGivenNoPi,
                                            numWordsInPI, numWordsInNoPI,
                                            df_test, clean)
        # print("--------------Naive Bayes Accuracy Stats---------------------------")
        TP, FN, TN, FP = stats.confusion_matrix(punc_df, predict_df)
        accuracy.append(stats.Accuracy(TP, TN, FP, FN))
        precision.append(stats.Precision(TP, FP))
        recall.append(stats.Recall(TP, FN))
        fscore.append(stats.fScore(TP, FN, FP))
        true_neg.append(stats.TrueNegative(TN, FP))
        # print("---------------------------------------------------------------------")
    print(
        "---------------------------------------------------------------------"
    )
    print("Binary Naive Bayes wit k-fold Accuracy Stats")
    print("accuracy = ", accuracy)
    print("precison = ", precision)
    print("recall = ", recall)
    print("f-score = ", fscore)
    print("True Negative = ", true_neg)
    print("accuracy = ", Average(accuracy))
    print("precison = ", Average(precision))
    print("recall = ", Average(recall))
    print("f-score = ", Average(fscore))
    print("true negative = ", Average(true_neg))
Example #4
def binary_naive_bayes():
    model = nb.NaiveBayesModel()
    path = 'E:/DATA/Sem8/fyp/Training.csv'
    final_df, df = model.extract(path)
    li_clean_text = model.clean_data(final_df)
    uniqueWords = model.make_unique_li(li_clean_text)
    # print(uniqueWords)
    docVector = model.binary_docvector(final_df, uniqueWords)
    df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI, numWordsInPI, numWordsInNoPI = model.TrainModel(
        docVector, uniqueWords)
    predict_df, test_data = model.Predict(Prob_PI, Prob_NoPI, uniqueWords,
                                          df_WordGivenPI, df_WordGivenNoPi,
                                          numWordsInPI, numWordsInNoPI)

    print("--------------Naive Bayes Accuracy Stats---------------------------")
    stats = em.Evaluate()
    TP, FN, TN, FP = stats.confusion_matrix(test_data, predict_df)
    print("Accuracy = ",stats.Accuracy(TP, TN, FP, FN))
    print("Precision = ",stats.Precision(TP, FP))
    print("Recall = ",stats.Recall(TP, FN))
    print("fScore = ",stats.fScore(TP, FN, FP))
    print("---------------------------------------------------------------------")
Example #5
def text_blob():
    model = nb.NaiveBayesModel()
    path = 'E:/DATA/Sem8/fyp/Training.csv'
    final_df, df = model.extract(path)
    corpus = model.text_concat(final_df)
    li_clean_text = model.clean_data(corpus)
    uniqueWords = model.make_unique_li(li_clean_text)
    docVector = model.DocVector(final_df, uniqueWords)
    polarity_docVector = tb.text_blob(docVector, uniqueWords)
    print(polarity_docVector['bad'])
    df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI, numWordsInPI, numWordsInNoPI = model.TrainModel(
        polarity_docVector, uniqueWords)
    predict_df, test_data = model.Predict(Prob_PI, Prob_NoPI, uniqueWords,
                                          df_WordGivenPI, df_WordGivenNoPi,
                                          numWordsInPI, numWordsInNoPI)

    print("--------------Naive Bayes with Text Blob Accuracy Stats---------------------------")
    stats = em.Evaluate()
    TP, FN, TN, FP = stats.confusion_matrix(test_data, predict_df)
    print("Accuracy = ", stats.Accuracy(TP, TN, FP, FN))
    print("Precision = ", stats.Precision(TP, FP))
    print("Recall = ", stats.Recall(TP, FN))
    print("fScore = ", stats.fScore(TP, FN, FP))
    print("---------------------------------------------------------------------")
Example #6
def train_cnn(PATH_TO_IMAGES,
              PATH_TO_LABELS,
              learning_rate,
              WEIGHT_DECAY,
              use_gpu=False):
    """
    Train torchvision model to NIH data given high level hyperparameters.

    Args:
        PATH_TO_IMAGES: path to NIH images
        PATH_TO_LABELS: path to csv which contains labels
        learning_rate: learning rate
        WEIGHT_DECAY: weight decay parameter for SGD

    Returns:
        preds: torchvision model predictions on test fold with ground truth for comparison
        aucs: AUCs for each train,test tuple

    """
    NUM_EPOCHS = 100
    BATCH_SIZE = 16

    try:
        rmtree('results/')
    except FileNotFoundError:
        pass  # directory doesn't yet exist, no need to clear it
    os.makedirs("results/")

    # use imagenet mean,std for normalization
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    N_LABELS = 14  # we are predicting 14 labels

    # load labels
    # df = pd.read_csv("nih_labels.csv", index_col=0)

    # define torchvision transforms
    data_transforms = {
        'train':
        transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.Resize(224),  # Scale was renamed to Resize in newer torchvision
            # because a single-number resize doesn't always give 224 x 224, the
            # CenterCrop below ensures 224 x 224
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
        'val':
        transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
    }

    # create train/val data_loaders
    transformed_datasets = {
        'train':
        CXR.ChexDataset(path_to_images=PATH_TO_IMAGES,
                        path_to_labels_csv=PATH_TO_LABELS,
                        fold='train',
                        transform=data_transforms['train']),
        'val':
        CXR.ChexDataset(path_to_images=PATH_TO_IMAGES,
                        path_to_labels_csv=PATH_TO_LABELS,
                        fold='val',
                        transform=data_transforms['val'])
    }

    data_loaders = {
        'train':
        torch.utils.data.DataLoader(transformed_datasets['train'],
                                    batch_size=BATCH_SIZE,
                                    shuffle=True,
                                    num_workers=8),
        'val':
        torch.utils.data.DataLoader(transformed_datasets['val'],
                                    batch_size=BATCH_SIZE,
                                    shuffle=True,
                                    num_workers=8)
    }

    # training without a GPU is not recommended, as it will take excessively long
    # if not use_gpu:
    #     raise ValueError("Error, requires GPU")
    model = models.densenet121(pretrained=True)
    num_features = model.classifier.in_features
    # add final layer with # outputs in same dimension of labels with sigmoid
    # activation
    model.classifier = nn.Sequential(nn.Linear(num_features, N_LABELS),
                                     nn.Sigmoid())

    # put model on GPU
    if use_gpu:
        if not torch.cuda.is_available():
            raise ValueError(
                "Error: use_gpu=True but no CUDA-capable GPU is available."
            )
        gpu_count = torch.cuda.device_count()
        print("Using GPU. Available GPU count: " + str(gpu_count))

        model = model.cuda()

    # define criterion, optimizer for training
    criterion = nn.BCELoss()
    optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                 model.parameters()),
                          lr=learning_rate,
                          momentum=0.9,
                          weight_decay=WEIGHT_DECAY)
    dataset_sizes = {x: len(transformed_datasets[x]) for x in ['train', 'val']}

    # train model
    model, best_epoch = train_model(model,
                                    criterion,
                                    optimizer,
                                    learning_rate,
                                    num_epochs=NUM_EPOCHS,
                                    data_loaders=data_loaders,
                                    dataset_sizes=dataset_sizes,
                                    weight_decay=WEIGHT_DECAY)

    # get preds and AUCs on test fold
    preds, aucs = EM.make_pred_multilabel(data_transforms, model,
                                          PATH_TO_IMAGES)

    return preds, aucs
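
A typical invocation, with placeholder paths and hyperparameters (these values are illustrative, not taken from the original):

if __name__ == '__main__':
    # placeholder paths and hyperparameters; adjust to your environment
    preds, aucs = train_cnn(PATH_TO_IMAGES='/path/to/nih/images',
                            PATH_TO_LABELS='/path/to/nih_labels.csv',
                            learning_rate=0.01,
                            WEIGHT_DECAY=1e-4,
                            use_gpu=True)
    print(aucs)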
# Discretizing the attribute values of the class labels.
# Resultant: binary problem for identifying the ratings of a business
# for idx in range(numRecords):
#     classLabels[idx] = round(classLabels[idx], 0)
#     if classLabels[idx] >= 7:
#         classLabels[idx] = 1
#     else:
#         classLabels[idx] = 0
#
# print(np.unique(classLabels))

# SVM classifier
print('SVM classifier:')
svm_classifier = SVC(kernel='rbf', gamma=0.8, C=1.0)
attribute_values_svm, accuracy_values_svm = EvaluateModel.evaluate_model(
    svm_classifier, dataRecords, classLabels, labels)

# Logistic regression
print('Logistic regression:')
lr_classifier = linear_model.LogisticRegression(C=1.0)
attribute_values_lr, accuracy_values_lr = EvaluateModel.evaluate_model(
    lr_classifier, dataRecords, classLabels, labels)

# Naive Bayes classifier
print('Naive Bayes classifier:')
nb_classifier = GaussianNB()
attribute_values_gnb, accuracy_values_gnb = EvaluateModel.evaluate_model(
    nb_classifier, dataRecords, classLabels, labels)


# Graphs
plt.plot(attribute_values_svm, accuracy_values_svm)
plt.plot(attribute_values_lr, accuracy_values_lr)
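
The snippet ends before the figure is labelled or displayed. A minimal continuation with standard matplotlib calls might look like the following; plotting the GaussianNB curve and the axis names are assumptions, since they do not appear in the original.

# assumed continuation: plot the remaining curve, label the figure, and show it
plt.plot(attribute_values_gnb, accuracy_values_gnb)
plt.xlabel('parameter setting')          # assumed axis label
plt.ylabel('cross-validated accuracy')   # assumed axis label
plt.legend(['SVM (RBF)', 'Logistic regression', 'Gaussian Naive Bayes'])
plt.show()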