Ejemplo n.º 1
0
def labelClassificationRNN(learn=True):
    """Evaluate linear SVM label classifiers on the RNN report vectors.

    For every entry of REPORT_FILES_LABELLED a linear SVC is trained for each
    candidate C value over ``numFolds`` random train/cv/test splits. The
    per-report test predictions are written to ``labelClassification.csv`` in
    a time-stamped directory under ``label_classification/``, and the ROC
    curves (test, cv, train) of the C value with the highest cv AUC are saved
    as a figure in the same directory.

    Args:
        learn: When true, search a fixed grid of C values and pickle the best
            value per diagnosis to ``./model_files/svm_c_values.pkl``. When
            false, load the C values from that file and evaluate only those.
    """
    c_values_path = './model_files/svm_c_values.pkl'
    if learn:
        # One row per candidate C; a row repeats the candidate once per
        # diagnosis column (the column is selected with index j below).
        # NOTE(review): the previous code also assigned a 0.001 row but
        # overwrote it on the very next line, so 0.001 was never evaluated.
        # It is left out here to keep the searched grid unchanged.
        c_vals = [[c] * 4 for c in (0.005, 0.01, 0.05, 0.1, 0.5, 1)]
        optimal_c = [[0, 0, 0, 0]]
    else:
        # Pickle streams are bytes, so the file is opened in binary mode.
        # NOTE: only unpickle files that this project wrote itself.
        with open(c_values_path, 'rb') as c_file:
            c_vals = pickle.load(c_file)
        optimal_c = c_vals
    reports = preprocess.getReports()
    reportVectors = rnn.loadReportVecs()

    numFolds = 5  # number of random splits evaluated per C value
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            # two blank rows, then the diagnosis name as a section header
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])
            # The labelled data is at the start of each report file, so the
            # labelled examples of file j occupy a contiguous index range.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            last = first + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            labelledReports = [reports[i] for i in range(first, last)]
            labelledCorpus = [reportVectors[i][:] for i in range(first, last)]
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]
            # Plain-list copy of the vectors, used to map a test vector back
            # to the position of its report text.
            corpusList = [list(x) for x in labelledCorpus]
            # NOTE: unlike the sibling classification functions, no class
            # re-balancing (dropping surplus negatives) is applied here.

            best_area_cv = -1
            for c_value in c_vals:
                all_test_labels = ()
                all_output_scores_test = ()
                all_cv_labels = ()
                all_output_scores_cv = ()
                all_train_labels = ()
                all_output_scores_train = ()
                for n in range(numFolds):
                    # split training and test data
                    (train_labelledCorpus, test_labelledCorpus, train_labels,
                     test_labels) = train_test_split(
                         labelledCorpus, labels, test_size=0.15)
                    # Split off the last 20% of the training set for cross
                    # validation.
                    split = int(0.8 * len(train_labelledCorpus))
                    cv_labelledCorpus = train_labelledCorpus[split:]
                    train_labelledCorpus = train_labelledCorpus[:split]
                    cv_labels = train_labels[split:]
                    train_labels = train_labels[:split]
                    # build classifier
                    classifier = svm.SVC(C=c_value[j], kernel='linear').fit(
                        train_labelledCorpus, train_labels)
                    # compute output label and corresponding score
                    output_test = classifier.predict(test_labelledCorpus)
                    output_scores_test = classifier.decision_function(
                        test_labelledCorpus)
                    output_scores_train = classifier.decision_function(
                        train_labelledCorpus)
                    output_scores_cv = classifier.decision_function(
                        cv_labelledCorpus)

                    # pool labels and scores over all folds of this C value
                    all_test_labels += tuple(test_labels)
                    all_output_scores_test += tuple(output_scores_test)
                    all_cv_labels += tuple(cv_labels)
                    all_output_scores_cv += tuple(output_scores_cv)
                    all_train_labels += tuple(train_labels)
                    all_output_scores_train += tuple(output_scores_train)

                    # save result for fold to file
                    for r in range(len(test_labels)):
                        reportIdx = corpusList.index(
                            list(test_labelledCorpus[r]))
                        # writerow expects a sequence of fields; passing the
                        # bare string would emit one column per character.
                        writer.writerow(
                            ["With c value: " + str(c_value[j])])
                        writer.writerow([
                            output_scores_test[r], output_test[r],
                            test_labels[r]
                        ])
                        writer.writerow([labelledReports[reportIdx]])
                # generate the roc curves from the pooled fold results
                fp_test, tp_test, _ = roc_curve(all_test_labels,
                                                all_output_scores_test,
                                                pos_label="positive")
                fp_cv, tp_cv, _ = roc_curve(all_cv_labels,
                                            all_output_scores_cv,
                                            pos_label="positive")
                fp_train, tp_train, _ = roc_curve(all_train_labels,
                                                  all_output_scores_train,
                                                  pos_label="positive")

                # Calculate the area under the curves
                area_test = auc(fp_test, tp_test)
                area_cv = auc(fp_cv, tp_cv)
                area_train = auc(fp_train, tp_train)
                # Store c value, tps, fps and aucs if cv auc is new best
                if area_cv > best_area_cv:
                    optimal_c[0][j] = c_value[j]
                    best_fp_test = fp_test
                    best_tp_test = tp_test
                    best_fp_cv = fp_cv
                    best_tp_cv = tp_cv
                    best_fp_train = fp_train
                    best_tp_train = tp_train
                    best_area_test = area_test
                    best_area_cv = area_cv
                    best_area_train = area_train
            # initialise and plot the pooled ROC curves for the optimal c value
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC: c value of " +
                      str(optimal_c[0][j]))
            plt.plot(best_fp_test,
                     best_tp_test,
                     'b',
                     label='test(area = %0.2f)' % best_area_test)
            plt.plot(best_fp_cv,
                     best_tp_cv,
                     'g',
                     label='cv(area = %0.2f)' % best_area_cv)
            plt.plot(best_fp_train,
                     best_tp_train,
                     'r',
                     label='train(area = %0.2f)' % best_area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
    if learn:
        with open(c_values_path, 'wb') as c_file:
            pickle.dump(optimal_c, c_file)
Ejemplo n.º 2
0
def precisionRecall(testFile):
    """Write precision/recall tables and plots for every search model.

    Each row of ``testFile`` is read as ``[search term, report category]``,
    where the category is compared against "Brains", "CTPA", "Plainab" and
    "Pvab". For every row, every model is queried through ``search.search``
    and precision/recall are computed at a fixed list of similarity
    thresholds. A result counts as a true positive when its report index
    falls inside the index range of the row's category. Results go to
    ``<search term>.csv`` and a figure of the same name inside a time-stamped
    directory under ``precision_recall/``.

    Args:
        testFile: Path of the CSV file holding the test queries.
    """
    models = ["bow", "tfidf", "lsi", "lda", "doc2vec", "rnn"]
    # Create the output directory
    directory = "precision_recall/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    # 'rb' is the mode the Python 2 csv module expects for reading.
    with open(testFile, 'rb') as testCsv:
        tests = [row for row in csv.reader(testCsv)]

    thres = [
        0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
        0.9
    ]

    # Cumulative report counts: a report index below numReports[k] (and not
    # below numReports[k - 1]) belongs to categories[k].
    categories = ["Brains", "CTPA", "Plainab", "Pvab"]
    numReports = [
        preprocess.getNumReports(REPORT_FILES[:1]),
        preprocess.getNumReports(REPORT_FILES[:2]),
        preprocess.getNumReports(REPORT_FILES[:3]),
        preprocess.getNumReports()
    ]
    numResults = preprocess.getNumReports()
    for searchTerm in tests:
        print(searchTerm)
        plt.figure(searchTerm[0])
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title(searchTerm[0])
        # relevant = truePositive + falseNegative; it depends only on the
        # category of the query, so it is computed once per search term.
        relevant = preprocess.getNumReports(
            ["nlp_data/Cleaned" + searchTerm[1] + "Full.csv"])
        with open(directory + searchTerm[0] + ".csv", 'w') as writeFile:
            writer = csv.writer(writeFile)

            for model in models:
                writer.writerow([model])
                precision = []
                recall = []

                allReports = search.search(model, numResults, searchTerm[0])
                for threshold in thres:
                    truePositive = 0
                    similarReports = [
                        report for report in allReports
                        if report[1] > threshold
                    ]

                    for report in similarReports:
                        for bound, category in zip(numReports, categories):
                            if report[0] < bound:
                                # predicted category found; count a hit when
                                # it matches the expected one
                                if searchTerm[1] == category:
                                    truePositive = truePositive + 1
                                break
                        else:
                            # index beyond the total number of reports
                            print("error")
                    # retrieved = truePositive + falsePositive
                    retrieved = len(similarReports)

                    # float() keeps the ratio fractional under Python 2,
                    # where int / int would truncate to 0 or 1.
                    precision.append((float(truePositive) /
                                      retrieved) if retrieved else 0)
                    recall.append((float(truePositive) /
                                   relevant) if relevant else 0)
                    # write the values of the current threshold (the old
                    # [i - 1] index lagged one threshold behind)
                    writer.writerow([precision[-1], recall[-1]])

                writer.writerow("")

                # plot the precision/recall curve of this model
                plt.plot(recall, precision, label=model)

        plt.legend(loc='lower right')
        fileName = directory + searchTerm[0]
        plt.savefig(fileName)
Ejemplo n.º 3
0
def labelClassification():
    """Evaluate linear SVM label classifiers on the LSI report vectors.

    For every entry of REPORT_FILES_LABELLED the labelled reports are
    re-balanced, a linear SVC is trained on ``numFolds`` random train/test
    splits, the per-report test predictions are written to
    ``labelClassification.csv`` and the ROC curves pooled over all splits
    (test and train) are saved as a figure. Output goes to a time-stamped
    directory under ``label_classification/``.
    """
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it
    # to a list of per-report vectors
    corpusList = [
        list(x) for x in zip(*gensim.matutils.corpus2dense(
            corpus, corpus.num_terms, dtype=np.float64))
    ]
    reports = preprocess.getReports()

    numFolds = 5  # number of random train/test splits
    # Create the output directory
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # The labelled data is at the start of each report file, so the
            # labelled examples of file j occupy a contiguous index range.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            last = first + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            labelledCorpus = [corpusList[i] for i in range(first, last)]
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # Equalise the class distribution by dropping the first surplus
            # negatives (TO BE REMOVED IN FUTURE). When there is no surplus
            # nothing is dropped; the previous loop removed every negative
            # in that case because its counter stepped past the target.
            surplus = len(labels) - 2 * list(labels).count("positive")
            negatives = [
                x for x, label in enumerate(labels) if label == "negative"
            ]
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            all_test_labels = ()
            all_output_scores_test = ()
            all_train_labels = ()
            all_output_scores_train = ()
            for n in range(numFolds):
                # split training and test data
                (train_labelledCorpus, test_labelledCorpus, train_labels,
                 test_labels) = train_test_split(
                     labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)

                # Sort the test results by (score, output, label). The
                # vectors are kept out of the sort key: comparing two numpy
                # rows on a full tie raises "truth value is ambiguous".
                sortList = list(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus))
                sortList.sort(key=lambda item: item[:3])
                (output_scores_test, output_test, test_labels,
                 test_labelledCorpus) = zip(*sortList)

                # pool labels and scores over all folds
                all_test_labels += test_labels
                all_output_scores_test += output_scores_test
                all_train_labels += tuple(train_labels)
                all_output_scores_train += tuple(output_scores_train)

                # save result to file
                for r in range(len(test_labels)):
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([
                        output_scores_test[r], output_test[r], test_labels[r]
                    ])
                    writer.writerow([reports[reportIdx]])
            # generate the roc curves from the pooled fold results
            fp_test, tp_test, _ = roc_curve(all_test_labels,
                                            all_output_scores_test,
                                            pos_label="positive")
            fp_train, tp_train, _ = roc_curve(all_train_labels,
                                              all_output_scores_train,
                                              pos_label="positive")

            # Calculate the area under the curves
            area_test = auc(fp_test, tp_test)
            area_train = auc(fp_train, tp_train)
            # Plot the pooled ROC curves
            plt.plot(fp_test,
                     tp_test,
                     'b',
                     label='test(area = %0.2f)' % area_test)
            plt.plot(fp_train,
                     tp_train,
                     'r',
                     label='train(area = %0.2f)' % area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
Ejemplo n.º 4
0
def labelClassificationD2V():
    """Evaluate linear SVM label classifiers on inferred Doc2Vec vectors.

    For every entry of REPORT_FILES_LABELLED a vector is inferred for each
    labelled report with the stored Doc2Vec model, the labelled set is
    re-balanced, and a linear SVC is trained on ``numFolds`` random
    train/test splits. One test and one train ROC curve is drawn per split
    and the per-report test predictions are written to
    ``labelClassification.csv``. Output goes to a time-stamped directory
    under ``label_classification/``.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")

    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()

    numFolds = 5  # number of random train/test splits
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # The labelled data is at the start of each report file, so the
            # labelled examples of file j occupy a contiguous index range.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            last = first + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            labelledReports = []
            labelledCorpus = []
            for i in range(first, last):
                labelledReports.append(reports[i])
                labelledCorpus.append(model.infer_vector(processedReports[i]))
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]
            # Plain-list copy taken before re-balancing so that its positions
            # line up with labelledReports.
            corpusList = [list(x) for x in labelledCorpus]

            # Equalise the class distribution by dropping the first surplus
            # negatives (TO BE REMOVED IN FUTURE). When there is no surplus
            # nothing is dropped; the previous loop removed every negative
            # in that case because its counter stepped past the target.
            surplus = len(labels) - 2 * list(labels).count("positive")
            negatives = [
                x for x, label in enumerate(labels) if label == "negative"
            ]
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            for n in range(numFolds):
                # split training and test data
                (train_labelledCorpus, test_labelledCorpus, train_labels,
                 test_labels) = train_test_split(
                     labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)

                # Sort the test results by (score, output, label). The
                # vectors are kept out of the sort key: comparing two numpy
                # rows on a full tie raises "truth value is ambiguous".
                sortList = list(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus))
                sortList.sort(key=lambda item: item[:3])
                (output_scores_test, output_test, test_labels,
                 test_labelledCorpus) = zip(*sortList)

                # build roc curve and plot
                fp_test, tp_test, _ = roc_curve(test_labels,
                                                output_scores_test,
                                                pos_label="positive")
                fp_train, tp_train, _ = roc_curve(train_labels,
                                                  output_scores_train,
                                                  pos_label="positive")

                # Test curves in blue, train curves in red (the same colours
                # labelClassification uses); only the first fold carries a
                # legend label. The labels used to be swapped.
                plt.plot(fp_test,
                         tp_test,
                         'b',
                         label="test" if n == 0 else "")
                plt.plot(fp_train,
                         tp_train,
                         'r',
                         label="train" if n == 0 else "")

                # save result to file
                for r in range(len(test_labels)):
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([
                        output_scores_test[r], output_test[r], test_labels[r]
                    ])
                    writer.writerow([labelledReports[reportIdx]])

            # The figure only needs to be written once all folds are drawn.
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
Ejemplo n.º 5
0
def testClassification():
    """List unlabelled reports that lie close to the SVM decision boundary.

    For every entry of REPORT_FILES_LABELLED a linear SVC is trained on the
    re-balanced labelled LSI vectors and applied to the remaining
    (unlabelled) reports of the same file. Reports whose absolute decision
    score is below a fixed threshold are written, in ascending score order,
    to ``labelClassification.csv`` in a time-stamped directory under
    ``label_tests/``.
    """
    threshold = 0.001
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it
    # to a list of per-report vectors
    corpusList = [
        list(x) for x in zip(*gensim.matutils.corpus2dense(
            corpus, corpus.num_terms, dtype=np.float64))
    ]
    reports = preprocess.getReports()

    # Create the output directory. The existence check matches the sibling
    # functions and avoids a crash when run twice within the same minute.
    directory = "label_tests/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # The labelled data is at the start of each report file: indices
            # [first, labelledEnd) are labelled, [labelledEnd, fileEnd) are
            # the unlabelled remainder of file j.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            labelledEnd = first + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            fileEnd = first + preprocess.getNumReports([REPORT_FILES[j]])
            labelledCorpus = [corpusList[i] for i in range(first, labelledEnd)]
            unlabelledCorpus = [
                corpusList[i] for i in range(labelledEnd, fileEnd)
            ]
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # Equalise the class distribution by dropping the first surplus
            # negatives (TO BE REMOVED IN FUTURE). When there is no surplus
            # nothing is dropped; the previous loop removed every negative
            # in that case because its counter stepped past the target.
            surplus = len(labels) - 2 * list(labels).count("positive")
            negatives = [
                x for x, label in enumerate(labels) if label == "negative"
            ]
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(unlabelledCorpus)

            # sort scores and labels in order
            sortList = sorted(
                zip(output_scores_test, output_test, unlabelledCorpus))

            # save the near-boundary results to file
            for score, output, vector in sortList:
                if abs(score) < threshold:
                    reportIdx = corpusList.index(list(vector))
                    writer.writerow("")
                    writer.writerow([reportIdx, score, output])
                    writer.writerow([reports[reportIdx]])
Ejemplo n.º 6
0
def testClassification(threshold,fileType):
    """List unlabelled reports of one file type near the SVM boundary.

    A linear SVC is trained on the re-balanced labelled LSI vectors of
    ``'Cleaned' + fileType + 'Labelled.csv'`` and applied to the remaining
    reports of ``'Cleaned' + fileType + 'Full.csv'``. Reports whose absolute
    decision score is below ``threshold`` are written, in ascending score
    order, to ``labelClassification.csv``; the model coefficients are printed
    and written to ``coef.csv``. Both files are created in the current
    working directory.

    Args:
        threshold: Upper bound (exclusive) on the absolute decision score of
            the reports to report.
        fileType: Name fragment used to build the report file names and the
            diagnosis header row.
    """
    # Local, single-entry versions of the module-level lists of the same name.
    REPORT_FILES = [('Cleaned' + fileType + 'Full.csv')]
    REPORT_FILES_LABELLED = [('Cleaned' + fileType + 'Labelled.csv')]
    DIAGNOSES = [fileType]

    corpus = gensim.corpora.MmCorpus('../model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it
    # to a list of per-report vectors
    corpusList = [list(x) for x in zip(*gensim.matutils.corpus2dense(
        corpus, corpus.num_terms, dtype=np.float64))]
    reports = preprocess.getReports(fileType)

    with open("labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow([DIAGNOSES[j], "", ""])  # Added "" for csv parsing

            # The labelled data is at the start of each report file: indices
            # [first, labelledEnd) are labelled, [labelledEnd, fileEnd) are
            # the unlabelled remainder of file j.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            labelledEnd = first + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            fileEnd = first + preprocess.getNumReports([REPORT_FILES[j]])
            labelledCorpus = [corpusList[i] for i in range(first, labelledEnd)]
            unlabelledCorpus = [
                corpusList[i] for i in range(labelledEnd, fileEnd)
            ]
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # Equalise the class distribution by dropping the first surplus
            # negatives (TO BE REMOVED IN FUTURE). When there is no surplus
            # nothing is dropped; the previous loop removed every negative
            # in that case because its counter stepped past the target.
            surplus = len(labels) - 2 * list(labels).count("positive")
            negatives = [
                x for x, label in enumerate(labels) if label == "negative"
            ]
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("")
            print("Model parameters:")
            print(classifier.coef_)
            print("")
            print("L2 norm of current model: " +
                  str(np.linalg.norm(classifier.coef_)))
            print("")

            # NOTE(review): `parameters` is not defined inside this function;
            # presumably it is a module-level list -- confirm it exists.
            for coefRow in classifier.coef_:
                parameters.append(coefRow)

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(unlabelledCorpus)

            # sort scores and labels in order
            sortList = sorted(
                zip(output_scores_test, output_test, unlabelledCorpus))

            # save the near-boundary results to file
            for score, output, vector in sortList:
                if abs(score) < threshold:
                    reportIdx = corpusList.index(list(vector))
                    writer.writerow([reportIdx, score, output])
                    # Added extra "" to make csv parsing work
                    writer.writerow([reports[reportIdx], "", ""])

    # Write model parameters to file
    with open("coef.csv", 'w') as fout:
        writer = csv.writer(fout)
        for coefRow in parameters:
            writer.writerow(coefRow)

    print("Model parameters saved to file.")
Ejemplo n.º 7
0
def precisionRecall(testFile):
	"""Plot and tabulate precision/recall of every search model for each test query.

	testFile is the path of a CSV file whose rows hold a search term in the
	first column and the expected report category ("Brains", "CTPA",
	"Plainab" or "Pvab") in the second one.

	For every row a <search term>.csv file and a figure of the same name are
	written to precision_recall/<month_day_hour_minute>/. The CSV holds, per
	model, one (precision, recall) row for each similarity threshold; the
	figure holds one precision-over-recall curve per model.
	"""
	models = ["bow","tfidf","lsi","lda","doc2vec","rnn"]
	# Report categories in the order their reports are indexed; categoryEnds[k]
	# is the exclusive upper report index of categories[k]
	categories = ["Brains","CTPA","Plainab","Pvab"]
	categoryEnds = [preprocess.getNumReports(REPORT_FILES[:1]), preprocess.getNumReports(REPORT_FILES[:2]), preprocess.getNumReports(REPORT_FILES[:3]),preprocess.getNumReports()]

	# Create the output directory
	directory = "precision_recall/" + datetime.datetime.now().strftime('%m_%d_%H_%M') +"/"
	if not os.path.exists(directory):
		os.makedirs(directory)

	# 'rb' is the mode the Python 2 csv module expects for reading
	with open(testFile,'rb') as testHandle:
		tests = list(csv.reader(testHandle))

	thres = [0.01,0.02,0.03,0.04,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
	numResults = preprocess.getNumReports()

	for searchTerm in tests:
		if not searchTerm:
			# blank line in the test file: nothing to evaluate
			continue
		print(searchTerm)
		query = searchTerm[0]
		expected = searchTerm[1]
		plt.figure(query)
		plt.xlabel("Recall")
		plt.ylabel("Precision")
		plt.title(query)

		# relevant = truePositive + falseNegative, i.e. every report of the
		# expected category; it does not depend on the model or the threshold
		relevant = preprocess.getNumReports(["nlp_data/Cleaned" + expected + "Full.csv"])

		with open(directory + query + ".csv",'w') as writeFile:
			writer = csv.writer(writeFile)

			for model in models:
				writer.writerow([model])
				precision = []
				recall = []

				# each result is indexed as (report index, similarity score)
				allReports = search.search(model,numResults,query)
				for threshold in thres:
					similarReports = [report for report in allReports if report[1] > threshold]
					# retrieved = truePositive + falsePositive
					retrieved = len(similarReports)

					truePositive = 0
					for report in similarReports:
						# the first boundary above the report index gives the predicted category
						for category, end in zip(categories, categoryEnds):
							if report[0] < end:
								if category == expected:
									truePositive = truePositive + 1
								break
						else:
							print("error")

					# float() forces true division; under Python 2 int/int would
					# truncate every ratio to 0 or 1
					precision.append((float(truePositive)/retrieved) if retrieved else 0)
					recall.append((float(truePositive)/relevant) if relevant else 0)
					# write the values of the current threshold (the last ones appended)
					writer.writerow([precision[-1],recall[-1]])

				writer.writerow("")

				# plot the curve of this model
				plt.plot(recall,precision,label=model)

		plt.legend(loc='lower right')
		plt.savefig(directory + query)
Ejemplo n.º 8
0
def labelClassificationRNN(learn=True):
	"""Evaluate a linear SVM on the RNN report vectors of every labelled diagnosis.

	For each entry of REPORT_FILES_LABELLED and each candidate C value the
	labelled reports are split at random numFolds times into train,
	cross-validation and test sets. The scores of all splits are pooled into
	one ROC curve per set, and the C value with the largest cross-validation
	AUC is kept for that diagnosis.

	Per-report test predictions are written to
	label_classification/<month_day_hour_minute>/labelClassification.csv and
	the ROC curves of the best C value are saved as a figure per diagnosis in
	the same directory.

	learn -- when true, search a fixed grid of C values and pickle the best
	         value per diagnosis to ./model_files/svm_c_values.pkl; when
	         false, load the C values from that file and evaluate only those.
	"""
	cValuesPath = './model_files/svm_c_values.pkl'
	if learn:
		# One row per candidate; column j is the C value tried for diagnosis j.
		# NOTE(review): an earlier 0.001 candidate was dead code (overwritten
		# before use); add it here if it should be searched as well.
		c_vals = [[c]*4 for c in (0.005,0.01,0.05,0.1,0.5,1)]
		optimal_c = [[0,0,0,0]]
	else:
		# NOTE: pickle.load can execute arbitrary code; only load trusted model files
		with open(cValuesPath, 'rb') as cValuesFile:
			c_vals = pickle.load(cValuesFile)
		optimal_c = c_vals
	reports = preprocess.getReports()
	reportVectors = rnn.loadReportVecs()

	numFolds = 5 # number of random splits evaluated per C value
	directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') +"/"
	if not os.path.exists(directory):
		os.makedirs(directory)
	with open(directory+"labelClassification.csv",'w') as writeFile:
		writer = csv.writer(writeFile)
		writer.writerow(["score","output label","expected label","report"])

		for j in range(len(REPORT_FILES_LABELLED)):
			writer.writerow("")
			writer.writerow("")
			writer.writerow([DIAGNOSES[j]])
			# fetch corpus and labels
			# The labeled data is at the start of the data set
			# Get the ids in the corpus of these first labeled examples for each class
			firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
			lastIdx = firstIdx + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
			labelledReports = [reports[i] for i in range(firstIdx,lastIdx)]
			labelledCorpus = [reportVectors[i][:] for i in range(firstIdx,lastIdx)]
			labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:,2]
			# plain-list copy of the vectors, used to map a test vector back to its report
			corpusList = [list(x) for x in labelledCorpus]
			# All labelled reports are used; no class re-balancing is applied in this variant

			best_area_cv = -1
			for c_value in c_vals:
				# labels and scores pooled over all splits of this C value
				all_test_labels, all_output_scores_test = [], []
				all_cv_labels, all_output_scores_cv = [], []
				all_train_labels, all_output_scores_train = [], []
				for n in range(numFolds):
					# split training and test data
					train_labelledCorpus,test_labelledCorpus,train_labels,test_labels = train_test_split(labelledCorpus,labels,test_size=0.15)
					# Split off the last 20% of the training set for cross validation
					numTrain = int(0.8*len(train_labelledCorpus))
					cv_labelledCorpus = train_labelledCorpus[numTrain:]
					train_labelledCorpus = train_labelledCorpus[:numTrain]
					cv_labels = train_labels[numTrain:]
					train_labels = train_labels[:numTrain]
					# build classifier
					classifier = svm.SVC(C=c_value[j],kernel='linear').fit(train_labelledCorpus,train_labels)
					# compute output label and corresponding score
					output_test = classifier.predict(test_labelledCorpus)
					output_scores_test = classifier.decision_function(test_labelledCorpus)
					output_scores_cv = classifier.decision_function(cv_labelledCorpus)
					output_scores_train = classifier.decision_function(train_labelledCorpus)

					all_test_labels.extend(test_labels)
					all_output_scores_test.extend(output_scores_test)
					all_cv_labels.extend(cv_labels)
					all_output_scores_cv.extend(output_scores_cv)
					all_train_labels.extend(train_labels)
					all_output_scores_train.extend(output_scores_train)

					# save result for fold to file
					for r in range(len(test_labels)):
						reportIdx = corpusList.index(list(test_labelledCorpus[r]))
						# writerow expects a sequence of fields; a bare string would be
						# split into one column per character
						writer.writerow(["With c value: "+str(c_value[j])])
						writer.writerow([output_scores_test[r],output_test[r],test_labels[r]])
						writer.writerow([labelledReports[reportIdx]])
				# generate the roc curve
				fp_test,tp_test,_ = roc_curve(all_test_labels,all_output_scores_test,pos_label="positive")
				fp_cv,tp_cv,_ = roc_curve(all_cv_labels,all_output_scores_cv,pos_label="positive")
				fp_train,tp_train,_ = roc_curve(all_train_labels,all_output_scores_train,pos_label="positive")

				# Calculate the area under the curves
				area_test = auc(fp_test, tp_test)
				area_cv = auc(fp_cv, tp_cv)
				area_train = auc(fp_train, tp_train)
				# Store c value, tps, fps and aucs if cv auc is new best
				if area_cv > best_area_cv:
					optimal_c[0][j] = c_value[j]
					best_fp_test=fp_test
					best_tp_test=tp_test
					best_fp_cv=fp_cv
					best_tp_cv=tp_cv
					best_fp_train=fp_train
					best_tp_train=tp_train
					best_area_test=area_test
					best_area_cv=area_cv
					best_area_train=area_train
			# initialise and plot the pooled ROC curves for optimal c value
			name = DIAGNOSES[j] + " ROC"
			plt.figure(name)
			plt.xlabel("False Positive")
			plt.ylabel("True Positive")
			plt.title(DIAGNOSES[j] + " ROC: c value of "+str(optimal_c[0][j]))
			plt.plot(best_fp_test,best_tp_test,'b',label='test(area = %0.2f)' % best_area_test)
			plt.plot(best_fp_cv,best_tp_cv,'g',label='cv(area = %0.2f)' % best_area_cv)
			plt.plot(best_fp_train,best_tp_train,'r',label='train(area = %0.2f)' % best_area_train)
			plt.legend(loc='lower right')
			plt.savefig(directory+name)
	if learn:
		# binary mode keeps the pickle readable regardless of platform and protocol
		with open(cValuesPath, 'wb') as cValuesFile:
			pickle.dump(optimal_c,cValuesFile)
Ejemplo n.º 9
0
def labelClassificationD2V():
	"""Evaluate a linear SVM on Doc2Vec vectors inferred for the labelled reports.

	For each entry of REPORT_FILES_LABELLED the labelled reports are balanced,
	then split at random numFolds times into train and test sets. Every split
	adds one test and one train ROC curve to the figure of the diagnosis, and
	the per-report test predictions are appended to
	label_classification/<month_day_hour_minute>/labelClassification.csv.
	The figures are saved in the same directory.
	"""
	model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")

	reports = preprocess.getReports()
	processedReports = preprocess.getProcessedReports()

	numFolds = 5 # number of random train/test splits per diagnosis
	directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') +"/"
	if not os.path.exists(directory):
		os.makedirs(directory)
	with open(directory+"labelClassification.csv",'w') as writeFile:
		writer = csv.writer(writeFile)
		writer.writerow(["score","output label","expected label","report"])

		for j in range(len(REPORT_FILES_LABELLED)):
			writer.writerow("")
			writer.writerow("")
			writer.writerow([DIAGNOSES[j]])

			# initialise figure and plot
			name = DIAGNOSES[j] + " ROC"
			plt.figure(name)
			plt.xlabel("False Positive")
			plt.ylabel("True Positive")
			plt.title(DIAGNOSES[j] + " ROC")

			# fetch corpus and labels
			# The labeled data is at the start of the data set
			# Get the ids in the corpus of these first labeled examples for each class
			firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
			lastIdx = firstIdx + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
			labelledReports = [reports[i] for i in range(firstIdx,lastIdx)]
			labelledCorpus = [model.infer_vector(processedReports[i]) for i in range(firstIdx,lastIdx)]
			labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:,2]
			# plain-list copy taken before balancing, so its positions match labelledReports
			corpusList = [list(x) for x in labelledCorpus]

			# TODO: temporary class balancing, to be removed in future.
			# Drop the first surplus negatives so that as many negatives as positives
			# remain (assuming the labels are only "positive"/"negative"). The surplus
			# is clamped at zero: without the clamp every negative would be removed
			# whenever negatives do not outnumber positives, leaving a single class.
			numSurplus = len(labels) - 2*list(labels).count("positive")
			deletes = [x for x in range(len(labels)) if labels[x] == "negative"][:max(numSurplus,0)]
			labelledCorpus = np.delete(labelledCorpus,deletes,axis=0)
			labels = np.delete(labels,deletes)

			for n in range(numFolds):
				# split training and test data
				train_labelledCorpus,test_labelledCorpus,train_labels,test_labels = train_test_split(labelledCorpus,labels,test_size=0.13)

				# build classifier
				classifier = svm.SVC(kernel='linear').fit(train_labelledCorpus,train_labels)

				# compute output label and corresponding score
				output_test = classifier.predict(test_labelledCorpus)
				output_scores_test = classifier.decision_function(test_labelledCorpus)
				output_scores_train = classifier.decision_function(train_labelledCorpus)

				# sort scores and labels in order; keyed on the score alone so that tied
				# entries never fall through to comparing the numpy vectors (ValueError)
				sortList = list(zip(output_scores_test,output_test,test_labels,test_labelledCorpus))
				sortList.sort(key=lambda entry: entry[0])
				output_scores_test,output_test,test_labels,test_labelledCorpus = zip(*sortList)

				# build roc curve and plot
				fp_test,tp_test,_ = roc_curve(test_labels,output_scores_test,pos_label="positive")
				fp_train,tp_train,_ = roc_curve(train_labels,output_scores_train,pos_label="positive")

				# only the first split gets legend entries; each curve is labelled
				# after the data it actually shows
				plt.plot(fp_test,tp_test,'b',label="test" if n == 0 else "")
				plt.plot(fp_train,tp_train,'r',label="train" if n == 0 else "")

				# save result to file
				for r in range(len(test_labels)):
					reportIdx = corpusList.index(list(test_labelledCorpus[r]))
					writer.writerow("")
					writer.writerow([output_scores_test[r],output_test[r],test_labels[r]])
					writer.writerow([labelledReports[reportIdx]])

			# the figure is complete once every split has been plotted
			plt.legend(loc='lower right')
			plt.savefig(directory+name)
Ejemplo n.º 10
0
def labelClassification():
	"""Evaluate a linear SVM on the corpus vectors stored in reports_lsi.mm.

	For each entry of REPORT_FILES_LABELLED the labelled reports are balanced,
	then split at random numFolds times into train and test sets. The scores
	of all splits are pooled into one test and one train ROC curve, which are
	saved as a figure per diagnosis. The per-report test predictions are
	written to
	label_classification/<month_day_hour_minute>/labelClassification.csv.
	"""
	corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
	#convert the corpus to a numpy matrix, take the transpose and convert it to a list
	corpusList = [list(x) for x in zip(*gensim.matutils.corpus2dense(corpus,corpus.num_terms,dtype=np.float64))]
	reports = preprocess.getReports()

	numFolds = 5 # number of random train/test splits per diagnosis
	# Create the output directory
	directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') +"/"
	if not os.path.exists(directory):
		os.makedirs(directory)
	with open(directory+"labelClassification.csv",'w') as writeFile:
		writer = csv.writer(writeFile)
		writer.writerow(["score","output label","expected label","report"])

		for j in range(len(REPORT_FILES_LABELLED)):
			writer.writerow("")
			writer.writerow("")
			writer.writerow([DIAGNOSES[j]])

			# initialise figure and plot
			name = DIAGNOSES[j] + " ROC"
			plt.figure(name)
			plt.xlabel("False Positive")
			plt.ylabel("True Positive")
			plt.title(DIAGNOSES[j] + " ROC")

			# fetch corpus and labels
			# The labeled data is at the start of the data set
			# Get the ids in the corpus of these first labeled examples for each class
			firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
			lastIdx = firstIdx + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
			labelledCorpus = [corpusList[i] for i in range(firstIdx,lastIdx)]
			labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:,2]

			# TODO: temporary class balancing, to be removed in future.
			# Drop the first surplus negatives so that as many negatives as positives
			# remain (assuming the labels are only "positive"/"negative"). The surplus
			# is clamped at zero: without the clamp every negative would be removed
			# whenever negatives do not outnumber positives, leaving a single class.
			numSurplus = len(labels) - 2*list(labels).count("positive")
			deletes = [x for x in range(len(labels)) if labels[x] == "negative"][:max(numSurplus,0)]
			labelledCorpus = np.delete(labelledCorpus,deletes,axis=0)
			labels = np.delete(labels,deletes)

			# labels and scores pooled over all splits
			all_test_labels, all_output_scores_test = [], []
			all_train_labels, all_output_scores_train = [], []
			for n in range(numFolds):
				# split training and test data
				train_labelledCorpus,test_labelledCorpus,train_labels,test_labels = train_test_split(labelledCorpus,labels,test_size=0.13)

				# build classifier
				classifier = svm.SVC(kernel='linear').fit(train_labelledCorpus,train_labels)

				# compute output label and corresponding score
				output_test = classifier.predict(test_labelledCorpus)
				output_scores_test = classifier.decision_function(test_labelledCorpus)
				output_scores_train = classifier.decision_function(train_labelledCorpus)

				# sort scores and labels in order; keyed on the score alone so that tied
				# entries never fall through to comparing the numpy vectors (ValueError)
				sortList = list(zip(output_scores_test,output_test,test_labels,test_labelledCorpus))
				sortList.sort(key=lambda entry: entry[0])
				output_scores_test,output_test,test_labels,test_labelledCorpus = zip(*sortList)

				all_test_labels.extend(test_labels)
				all_output_scores_test.extend(output_scores_test)
				all_train_labels.extend(train_labels)
				all_output_scores_train.extend(output_scores_train)

				# save result to file
				for r in range(len(test_labels)):
					# position of the vector in the full corpus, which indexes reports
					reportIdx = corpusList.index(list(test_labelledCorpus[r]))
					writer.writerow("")
					writer.writerow([output_scores_test[r],output_test[r],test_labels[r]])
					writer.writerow([reports[reportIdx]])
			# generate the roc curve
			fp_test,tp_test,_ = roc_curve(all_test_labels,all_output_scores_test,pos_label="positive")
			fp_train,tp_train,_ = roc_curve(all_train_labels,all_output_scores_train,pos_label="positive")

			# Calculate the area under the curves
			area_test = auc(fp_test, tp_test)
			area_train = auc(fp_train, tp_train)
			# Plot the pooled ROC curves
			plt.plot(fp_test,tp_test,'b',label='test(area = %0.2f)' % area_test)
			plt.plot(fp_train,tp_train,'r',label='train(area = %0.2f)' % area_train)
			plt.legend(loc='lower right')
			plt.savefig(directory+name)
Ejemplo n.º 11
0
def testClassification():
	"""List the unlabelled reports the SVM is least certain about.

	For each entry of REPORT_FILES_LABELLED a linear SVM is trained on the
	(balanced) labelled reports of that diagnosis and applied to the remaining
	reports of the same report file. Every report whose absolute decision
	score is below the threshold is written, together with its corpus index,
	score and predicted label, to
	label_tests/<month_day_hour_minute>/labelClassification.csv.
	"""
	threshold = 0.001 # reports with |score| below this are reported
	corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
	#convert the corpus to a numpy matrix, take the transpose and convert it to a list
	corpusList = [list(x) for x in zip(*gensim.matutils.corpus2dense(corpus,corpus.num_terms,dtype=np.float64))]
	reports = preprocess.getReports()

	# Create the output directory; the timestamp only has minute resolution, so
	# it may already exist when the function is run twice in quick succession
	directory = "label_tests/" + datetime.datetime.now().strftime('%m_%d_%H_%M') +"/"
	if not os.path.exists(directory):
		os.makedirs(directory)
	with open(directory+"labelClassification.csv",'w') as writeFile:
		writer = csv.writer(writeFile)
		# NOTE(review): the rows written below are (report index, score, output label)
		# followed by the report text, which does not match this header - confirm
		# what downstream parsing expects before changing either
		writer.writerow(["score","output label","expected label","report"])

		for j in range(len(REPORT_FILES_LABELLED)):
			writer.writerow("")
			writer.writerow("")
			writer.writerow([DIAGNOSES[j]])

			# fetch corpus and labels
			# The labeled data is at the start of the data set
			# Get the ids in the corpus of these first labeled examples for each class
			firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
			firstUnlabelledIdx = firstIdx + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
			lastIdx = firstIdx + preprocess.getNumReports([REPORT_FILES[j]])
			labelledCorpus = [corpusList[i] for i in range(firstIdx,firstUnlabelledIdx)]
			unlabelledCorpus = [corpusList[i] for i in range(firstUnlabelledIdx,lastIdx)]
			labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:,2]

			# TODO: temporary class balancing, to be removed in future.
			# Drop the first surplus negatives so that as many negatives as positives
			# remain (assuming the labels are only "positive"/"negative"). The surplus
			# is clamped at zero: without the clamp every negative would be removed
			# whenever negatives do not outnumber positives, leaving a single class.
			numSurplus = len(labels) - 2*list(labels).count("positive")
			deletes = [x for x in range(len(labels)) if labels[x] == "negative"][:max(numSurplus,0)]
			labelledCorpus = np.delete(labelledCorpus,deletes,axis=0)
			labels = np.delete(labels,deletes)

			# build classifier
			classifier = svm.SVC(kernel='linear').fit(labelledCorpus,labels)

			# compute output label and corresponding score
			output_test = classifier.predict(unlabelledCorpus)
			output_scores_test = classifier.decision_function(unlabelledCorpus)

			# sort scores and labels in order of ascending score
			sortList = list(zip(output_scores_test,output_test,unlabelledCorpus))
			sortList.sort(key=lambda entry: entry[0])
			output_scores_test,output_test,unlabelledCorpus = zip(*sortList)

			# save result to file
			for r in range(len(unlabelledCorpus)):
				if (abs(output_scores_test[r]) < threshold):
					# position of the vector in the full corpus, which indexes reports
					reportIdx = corpusList.index(list(unlabelledCorpus[r]))
					writer.writerow("")
					writer.writerow([reportIdx,output_scores_test[r],output_test[r]])
					writer.writerow([reports[reportIdx]])