forked from chop-dbhi/arrc
keyword_analysis.py
__author__ = 'Aaron J. Masino'

import numpy as np
from numpy.random import RandomState
import pandas as pd
import sklearn.metrics

# project-local helper modules (report printing, data partitioning, performance metrics)
from learn import printers, wrangle
from learn.metrics import PerformanceMetrics


def load_report(path):
    """Read a report file and return its full text as a single string."""
    with open(path, 'r') as f:
        return f.read()

if __name__ == '__main__':
    # input/output locations
    keyword_file = './data/input/SDS_PV2_combined/keywords/keywords.txt'
    standard_out_file = './data/output/SDS_PV2_keyword_results.txt'
    report_path = './data/input/SDS_PV2_combined/reports_single_find_impr'
    label_file = './data/input/SDS_PV2_combined/SDS_PV2_class_labels.txt'
    region_keys = ['inner', 'middle', 'outer', 'mastoid']

    # load the test set data - the same set used for the ML tests;
    # seed the numpy random state so the split is reproducible
    seed = 987654321
    rs = RandomState(seed)

    # read the class labels
    label_data = pd.read_csv(label_file)

    # partition the positive and negative cases into train/test splits (80/20)
    pos_cases, neg_cases = wrangle.partion(label_data['doc_norm'] == 1, label_data, ratios=[0.8, 0.2])
    train_mask = np.concatenate((pos_cases[0], neg_cases[0]))
    test_mask = np.concatenate((pos_cases[1], neg_cases[1]))
    rs.shuffle(train_mask)
    rs.shuffle(test_mask)
    train_labels = label_data.iloc[train_mask]
    test_labels = label_data.iloc[test_mask]

    # read in the text reports
    train_reports = [load_report('{0}/{1}_fi.txt'.format(report_path, pid)) for pid in train_labels['pid']]
    test_reports = [load_report('{0}/{1}_fi.txt'.format(report_path, pid)) for pid in test_labels['pid']]
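
    # The parsing below assumes keywords.txt is organized as region headers followed by
    # keyword rows, e.g. (illustrative layout, not the actual file contents):
    #   #inner
    #   <keyword>,<additional columns ignored>
    #   #middle
    #   ...
    # A line beginning with '#' names an ear region; for every other line only the text
    # before the first comma is kept as a keyword for the current region.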
    # import keywords, grouped by ear region
    keywords = {}
    with open(keyword_file, 'r') as f:
        key = ""
        for line in f.readlines():
            if line.startswith("#"):
                key = line[1:].strip('\n')
            else:
                l = keywords.get(key, [])
                # strip whitespace/newlines so substring matching is not affected
                v = line.split(",")[0].strip()
                l.append(v)
                keywords[key] = l

    # create an empty structured patient array to hold the predicted values
    num_patients = len(test_labels)
    patients = np.empty((num_patients,),
                        dtype=[('pid', 'S7'), ('inner', 'i4'), ('middle', 'i4'),
                               ('outer', 'i4'), ('mastoid', 'i4')])
    # initialize the patients array
    for k in region_keys:
        patients[k] = 0
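
    # Prediction rule: a region is marked positive (1) for a patient if any of that
    # region's keywords occurs as a substring of the patient's report text; otherwise
    # the region keeps its initialized value of 0.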
    # scan each test report for region keywords
    cnt = 0
    for _, row in test_labels.iterrows():
        pid = row['pid']
        patients['pid'][cnt] = pid
        report = test_reports[cnt]
        for region in region_keys:
            for keyword in keywords[region]:
                if keyword in report:
                    patients[region][cnt] = 1
        cnt += 1

    # compare predicted and actual labels for each ear region
    for k in region_keys:
        printers.printsf('{0}Analysis for {1} ear region{0}'.format(40 * '-', k), standard_out_file)
        y_pred = patients[k]
        y_act = test_labels[k]
        pm = PerformanceMetrics(y_act, y_pred)
        printers.printsfPerformanceMetrics(pm, standard_out_file)
        cm = sklearn.metrics.confusion_matrix(y_act, y_pred)
        printers.printTwoClassConfusion(cm, standard_out_file)
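
# Usage sketch (assumes the ./data/input and ./data/output directories referenced above
# exist relative to the working directory and that the 'learn' package is importable):
#   python keyword_analysis.py
# Per-region performance metrics and confusion matrices are emitted through the printers
# helpers, with SDS_PV2_keyword_results.txt as the output file argument.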