Ejemplo n.º 1
0
def buildDoc2VecModel():
	"""Train a Doc2Vec model on the processed reports and save it to disk.

	Each report returned by ``preprocess.getProcessedReports()`` becomes a
	``TaggedDocument`` whose single tag is the report's position in that
	list. The model is trained in ten passes with a hand-stepped learning
	rate and written to ``./model_files/reports.doc2vec_model``.

	NOTE(review): ``train()`` is called with the corpus only and the
	constructor uses ``size=``; this presumably targets an older gensim
	release -- confirm against the installed version before upgrading.
	"""
	reports = preprocess.getProcessedReports()

	# one tagged document per report, tagged with the report's index
	taggedDocuments = [
		gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[position])
		for position, tokens in enumerate(reports)
	]

	hyperParams = dict(
		size=300,
		min_count=5,
		workers=16,
		dm=1,
		dbow_words=1,
		negative=20,
	)
	model = gensim.models.Doc2Vec(**hyperParams)
	model.build_vocab(taggedDocuments)

	# Learning rate starts at 0.025; after every pass it is lowered by 0.001
	# and min_alpha is set to the same value.
	model.alpha = 0.025
	for passNumber in range(10):
		print(passNumber)
		model.train(taggedDocuments)
		model.alpha -= 0.001
		model.min_alpha = model.alpha

	model.save("./model_files/reports.doc2vec_model")
Ejemplo n.º 2
0
def buildDoc2VecModel():
    """Train a Doc2Vec model on the processed reports and save it to disk.

    Each report returned by ``preprocess.getProcessedReports()`` becomes a
    ``TaggedDocument`` whose single tag is the report's position in that
    list. The model is trained in ten passes with a hand-stepped learning
    rate and written to ``./model_files/reports.doc2vec_model``.

    NOTE(review): ``train()`` is called with the corpus only and the
    constructor uses ``size=``; this presumably targets an older gensim
    release -- confirm against the installed version before upgrading.
    """
    reports = preprocess.getProcessedReports()

    # one tagged document per report, tagged with the report's index
    taggedDocuments = [
        gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[position])
        for position, tokens in enumerate(reports)
    ]

    hyperParams = dict(
        size=300,
        min_count=5,
        workers=16,
        dm=1,
        dbow_words=1,
        negative=20,
    )
    model = gensim.models.Doc2Vec(**hyperParams)
    model.build_vocab(taggedDocuments)

    # Learning rate starts at 0.025; after every pass it is lowered by 0.001
    # and min_alpha is set to the same value.
    model.alpha = 0.025
    for passNumber in range(10):
        print(passNumber)
        model.train(taggedDocuments)
        model.alpha -= 0.001
        model.min_alpha = model.alpha

    model.save("./model_files/reports.doc2vec_model")
Ejemplo n.º 3
0
def buildDictionary(fileType):
	"""Build and persist the gensim dictionary and bag-of-words corpus.

	``fileType`` is passed through unchanged to
	``preprocess.getProcessedReports``. The dictionary is saved to
	``../model_files/reports.dict`` and the corpus is serialised to
	``../model_files/reports.mm``. Progress messages are printed after
	each stage; nothing is returned.
	"""
	tokenisedReports = preprocess.getProcessedReports(fileType)
	print("files loaded")

	# dictionary built over every report
	vocabulary = gensim.corpora.Dictionary(tokenisedReports)
	vocabulary.save('../model_files/reports.dict')
	print(vocabulary)
	print("dictionary finished")

	# one bag-of-words entry per report, in report order
	bowCorpus = list(map(vocabulary.doc2bow, tokenisedReports))
	gensim.corpora.MmCorpus.serialize('../model_files/reports.mm', bowCorpus)
	print("corpus finished")
Ejemplo n.º 4
0
def buildWord2VecModel():
	"""Train a Word2Vec model on the processed reports and smoke-test it.

	The model is built with ``min_count=3``, ``init_sims(replace=True)`` is
	applied, and the result is saved to
	``./model_files/reports.word2vec_model``. The model and four sample
	queries (similarity, raw vector, odd-one-out, nearest words) are then
	printed. Nothing is returned.
	"""
	model = gensim.models.Word2Vec(preprocess.getProcessedReports(), min_count=3)
	model.init_sims(replace=True)
	model.save("./model_files/reports.word2vec_model")
	print(model)

	# (banner, deferred query) pairs; each query runs only after its banner
	# has been printed, matching a plain print/print sequence.
	sanityChecks = (
		("----------------------------------similarity test",
			lambda: model.similarity("head", "brain")),
		("----------------------------------raw numpy vector of word",
			lambda: model["age"]),
		("----------------------------------remove outlier",
			lambda: model.doesnt_match("hours four age".split())),
		("----------------------------------similar words",
			lambda: model.most_similar("haem")),
	)
	for banner, runQuery in sanityChecks:
		print(banner)
		print(runQuery())

	print("script finished")
Ejemplo n.º 5
0
def buildDictionary():
    """Build and persist the gensim dictionary and bag-of-words corpus.

    Reads the reports from ``preprocess.getProcessedReports()``, saves the
    dictionary to ``./model_files/reports.dict`` and serialises the corpus
    to ``./model_files/reports.mm``. Progress messages are printed after
    each stage; nothing is returned.
    """
    tokenisedReports = preprocess.getProcessedReports()
    print("files loaded")

    # dictionary built over every report
    vocabulary = gensim.corpora.Dictionary(tokenisedReports)
    vocabulary.save('./model_files/reports.dict')
    print(vocabulary)
    print("dictionary finished")

    # one bag-of-words entry per report, in report order
    bowCorpus = list(map(vocabulary.doc2bow, tokenisedReports))
    gensim.corpora.MmCorpus.serialize('./model_files/reports.mm', bowCorpus)
    print("corpus finished")
Ejemplo n.º 6
0
def buildWord2VecModel():
    """Train a Word2Vec model on the processed reports and smoke-test it.

    The model is built with ``min_count=3``, ``init_sims(replace=True)`` is
    applied, and the result is saved to
    ``./model_files/reports.word2vec_model``. The model and four sample
    queries (similarity, raw vector, odd-one-out, nearest words) are then
    printed. Nothing is returned.
    """
    model = gensim.models.Word2Vec(preprocess.getProcessedReports(), min_count=3)
    model.init_sims(replace=True)
    model.save("./model_files/reports.word2vec_model")
    print(model)

    # (banner, deferred query) pairs; each query runs only after its banner
    # has been printed, matching a plain print/print sequence.
    sanityChecks = (
        ("----------------------------------similarity test",
         lambda: model.similarity("head", "brain")),
        ("----------------------------------raw numpy vector of word",
         lambda: model["age"]),
        ("----------------------------------remove outlier",
         lambda: model.doesnt_match("hours four age".split())),
        ("----------------------------------similar words",
         lambda: model.most_similar("haem")),
    )
    for banner, runQuery in sanityChecks:
        print(banner)
        print(runQuery())

    print("script finished")
Ejemplo n.º 7
0
def labelClassificationD2V():
    """Evaluate a linear SVM per diagnosis on Doc2Vec report vectors.

    For every entry of ``REPORT_FILES_LABELLED`` the function:

    1. infers a Doc2Vec vector for each labelled report,
    2. drops leading "negative" examples until
       ``len(labels) - 2 * positives`` of them are gone (this equalises the
       classes when only "positive"/"negative" labels occur),
    3. runs ``numFolds`` independent random train/test splits
       (``test_size=0.13``), fitting ``svm.SVC(kernel='linear')`` on each,
    4. draws the train and test ROC curves on one figure per diagnosis, and
    5. appends score, predicted label, expected label and the raw report
       text of every test item to ``labelClassification.csv``.

    All output is written below ``label_classification/<MM_DD_HH_MM>/``.
    Nothing is returned.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")

    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()

    numFolds = 5  # number of random train/test splits per diagnosis
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    # The with-statement closes the file, so no explicit close() follows it.
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j, labelledFile in enumerate(REPORT_FILES_LABELLED):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(name)

            # The labelled data is at the start of each file's section of
            # the data set: skip the reports of all preceding files, then
            # take as many reports as the labelled file contains.
            firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
            lastIdx = firstIdx + preprocess.getNumReports([labelledFile])
            labelledReports = [reports[i] for i in range(firstIdx, lastIdx)]
            labelledCorpus = [
                model.infer_vector(processedReports[i])
                for i in range(firstIdx, lastIdx)
            ]
            labels = np.asarray(preprocess.getData([labelledFile]))[:, 2]
            # Kept from before downsampling so that its indices line up with
            # labelledReports when looking up the report text below.
            corpusList = [list(x) for x in labelledCorpus]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            surplusNegatives = (len(labels) -
                                list(labels).count("positive") * 2)
            deletes = []
            # Only downsample when there really is a surplus; with a target
            # of zero or less an equality check on the running count can be
            # skipped over and every negative example would be removed.
            if surplusNegatives > 0:
                for x, label in enumerate(labels):
                    if label == "negative":
                        deletes.append(x)
                        if len(deletes) == surplusNegatives:
                            break
            labelledCorpus = np.asarray(labelledCorpus)
            if deletes:
                labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
                labels = np.delete(labels, deletes)
            ##################

            for n in range(numFolds):
                # split training and test data (a fresh random split each time)
                (train_labelledCorpus, test_labelledCorpus, train_labels,
                 test_labels) = train_test_split(labelledCorpus,
                                                 labels,
                                                 test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)

                # Sort test items by (score, predicted label, expected label).
                # The vector is deliberately left out of the key: comparing
                # two numpy arrays on a full tie raises ValueError.
                sortList = list(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus))
                sortList.sort(key=lambda row: row[:3])
                (output_scores_test, output_test, test_labels,
                 test_labelledCorpus) = zip(*sortList)

                # build roc curve and plot
                fp_test, tp_test, _ = roc_curve(test_labels,
                                                output_scores_test,
                                                pos_label="positive")
                fp_train, tp_train, _ = roc_curve(train_labels,
                                                  output_scores_train,
                                                  pos_label="positive")

                # Red is the held-out test curve, blue the training curve.
                # Only the first split carries a label so the legend shows
                # each colour once.
                plt.plot(fp_test,
                         tp_test,
                         'r',
                         label="test" if n == 0 else "")
                plt.plot(fp_train,
                         tp_train,
                         'b',
                         label="train" if n == 0 else "")

                # save result to file
                for r in range(len(test_labels)):
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([
                        output_scores_test[r], output_test[r], test_labels[r]
                    ])
                    writer.writerow([labelledReports[reportIdx]])

            # All splits are drawn; write the finished figure once.
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
        # plt.show()
Ejemplo n.º 8
0
def labelClassificationD2V():
	"""Evaluate a linear SVM per diagnosis on Doc2Vec report vectors.

	For every entry of ``REPORT_FILES_LABELLED`` the function:

	1. infers a Doc2Vec vector for each labelled report,
	2. drops leading "negative" examples until
	   ``len(labels) - 2 * positives`` of them are gone (this equalises the
	   classes when only "positive"/"negative" labels occur),
	3. runs ``numFolds`` independent random train/test splits
	   (``test_size=0.13``), fitting ``svm.SVC(kernel='linear')`` on each,
	4. draws the train and test ROC curves on one figure per diagnosis, and
	5. appends score, predicted label, expected label and the raw report
	   text of every test item to ``labelClassification.csv``.

	All output is written below ``label_classification/<MM_DD_HH_MM>/``.
	Nothing is returned.
	"""
	model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")

	reports = preprocess.getReports()
	processedReports = preprocess.getProcessedReports()

	numFolds = 5 # number of random train/test splits per diagnosis
	directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') +"/"
	if not os.path.exists(directory):
		os.makedirs(directory)

	# The with-statement closes the file, so no explicit close() follows it.
	with open(directory+"labelClassification.csv",'w') as writeFile:
		writer = csv.writer(writeFile)
		writer.writerow(["score","output label","expected label","report"])

		for j, labelledFile in enumerate(REPORT_FILES_LABELLED):
			writer.writerow("")
			writer.writerow("")
			writer.writerow([DIAGNOSES[j]])

			# initialise figure and plot
			name = DIAGNOSES[j] + " ROC"
			plt.figure(name)
			plt.xlabel("False Positive")
			plt.ylabel("True Positive")
			plt.title(name)

			# The labelled data is at the start of each file's section of
			# the data set: skip the reports of all preceding files, then
			# take as many reports as the labelled file contains.
			firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
			lastIdx = firstIdx + preprocess.getNumReports([labelledFile])
			labelledReports = [reports[i] for i in range(firstIdx, lastIdx)]
			labelledCorpus = [
				model.infer_vector(processedReports[i])
				for i in range(firstIdx, lastIdx)
			]
			labels = np.asarray(preprocess.getData([labelledFile]))[:,2]
			# Kept from before downsampling so that its indices line up with
			# labelledReports when looking up the report text below.
			corpusList = [list(x) for x in labelledCorpus]

			############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
			surplusNegatives = len(labels) - list(labels).count("positive") * 2
			deletes = []
			# Only downsample when there really is a surplus; with a target
			# of zero or less an equality check on the running count can be
			# skipped over and every negative example would be removed.
			if surplusNegatives > 0:
				for x, label in enumerate(labels):
					if label == "negative":
						deletes.append(x)
						if len(deletes) == surplusNegatives:
							break
			labelledCorpus = np.asarray(labelledCorpus)
			if deletes:
				labelledCorpus = np.delete(labelledCorpus,deletes,axis=0)
				labels = np.delete(labels,deletes)
			##################

			for n in range(numFolds):
				# split training and test data (a fresh random split each time)
				train_labelledCorpus,test_labelledCorpus,train_labels,test_labels = train_test_split(labelledCorpus,labels,test_size=0.13)

				# build classifier
				classifier = svm.SVC(kernel='linear').fit(train_labelledCorpus,train_labels)

				# compute output label and corresponding score
				output_test = classifier.predict(test_labelledCorpus)
				output_scores_test = classifier.decision_function(test_labelledCorpus)
				output_scores_train = classifier.decision_function(train_labelledCorpus)

				# Sort test items by (score, predicted label, expected label).
				# The vector is deliberately left out of the key: comparing
				# two numpy arrays on a full tie raises ValueError.
				sortList = list(zip(output_scores_test,output_test,test_labels,test_labelledCorpus))
				sortList.sort(key=lambda row: row[:3])
				output_scores_test,output_test,test_labels,test_labelledCorpus = zip(*sortList)

				# build roc curve and plot
				fp_test,tp_test,_ = roc_curve(test_labels,output_scores_test,pos_label="positive")
				fp_train,tp_train,_ = roc_curve(train_labels,output_scores_train,pos_label="positive")

				# Red is the held-out test curve, blue the training curve.
				# Only the first split carries a label so the legend shows
				# each colour once.
				plt.plot(fp_test,tp_test,'r',label="test" if n == 0 else "")
				plt.plot(fp_train,tp_train,'b',label="train" if n == 0 else "")

				# save result to file
				for r in range(len(test_labels)):
					reportIdx = corpusList.index(list(test_labelledCorpus[r]))
					writer.writerow("")
					writer.writerow([output_scores_test[r],output_test[r],test_labels[r]])
					writer.writerow([labelledReports[reportIdx]])

			# All splits are drawn; write the finished figure once.
			plt.legend(loc='lower right')
			plt.savefig(directory+name)
		# plt.show()