# analyze.py
#
# Pipeline, in the order of the progress messages printed below:
#   1. interpolate each subject's pain scores and convert them to a SAX string
#   2. z-normalize the intelligent-icon pixels across subjects
#   3. group subjects by a key and average their normalized icons
#   4. render the resulting icon grid through the MATLAB bridge
import intelligenticons as icon
import csv
from scipy.interpolate import interp1d
import jpype
import json
import numpy
import random
from pymatbridge import Matlab
import os
import base64
import math
from scipy.stats import norm
def get_mins_from_hhmm(hhmm):
    """Convert an ``"hh:mm"`` string into a total number of minutes.

    Only the first two colon-separated fields are read, so any further
    field (for example seconds in ``"hh:mm:ss"``) is ignored. The hour
    field is not limited to 0-23.

    Raises:
        IndexError: if ``hhmm`` contains no ``:`` separator.
        ValueError: if either of the first two fields is not an integer.
    """
    parts = hhmm.split(':')
    hours = int(parts[0])
    minutes = int(parts[1])
    return hours * 60 + minutes
def load_from_data_file(input_filename):
    """Return the contents of ``input_filename`` parsed as JSON.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as parsing finishes, including when the JSON is malformed.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if the file does not contain valid JSON.
    """
    with open(input_filename, 'r') as f:
        return json.load(f)
def write_sax_sentence_windowed(series, window_size, paa_length, alphabet_size):
    """Return one SAX word per sliding window over ``series``.

    Every run of ``window_size`` consecutive points is z-normalized, reduced
    to ``paa_length`` PAA coefficients and turned into a SAX word using the
    cut points for ``alphabet_size``.

    Uses the module-level ``TSUtils`` class and ``normal_a`` alphabet, so the
    JVM must already be started when this is called.

    Returns:
        A list of ``jpype.JString`` words in window order. The list is empty
        when ``series`` is shorter than ``window_size``.
    """
    # Neither the Java-side copy of the series nor the alphabet cut points
    # depend on the window position, so build them once up front.
    java_series = jpype.JArray(jpype.JDouble, 1)(series)
    cuts = normal_a.getCuts(jpype.JPackage("java.lang").Integer(alphabet_size))

    sentence = []
    # "+ 1" so the last window (the one ending on the final sample) is
    # included; the previous bound stopped one window early.
    # NOTE(review): assumes TSUtils.subseries(series, start, length) accepts
    # start + length == len(series) -- confirm against the jmotif version in ./lib.
    for window_start_index in range(len(series) - window_size + 1):
        window = TSUtils.subseries(java_series, window_start_index, window_size)
        normalized = TSUtils.zNormalize(window)
        paa = TSUtils.paa(normalized, paa_length)
        sax = str(TSUtils.ts2String(paa, cuts))
        sentence.append(jpype.JString(sax))
    return sentence
def _symbolic_segment_count(length, alphabet_size, n=16):
    """Return how many SAX symbols to produce for a series of ``length`` points.

    The arithmetic is copied from matlab/timeseries2symbol (written by Keogh
    et al): it works out how many symbols fit in the space of ``n`` 64-bit
    PAA coefficients, then rounds down to a power of two.
    """
    # if we represent each symbol as a binary string, calculate how much space
    # (i.e. how many bits) is needed to store one symbol
    num_bits_per_symbol = int(math.floor(math.log(alphabet_size, 2) + 1))
    # 64 bits vs. 3 bits (for alphabet size between 4 and 7) per PAA coef
    paa_symbolic_ratio = 64 // num_bits_per_symbol
    # this is the maximum symbolic segments possible, using the same amount of
    # space as PAA (assuming each PAA segment uses 8 bytes)
    max_segments = paa_symbolic_ratio * n
    if max_segments >= length:
        return length
    # can't just use the maximum # of segments possible; round down to a
    # power of two.
    # NOTE(review): the original comment says the count "has to be divisible
    # by the original length", but nothing here checks that -- confirm.
    sym_seg = 2 ** int(math.floor(math.log(max_segments, 2)))
    if sym_seg > 4 * n:
        sym_seg //= 4
    return sym_seg


def sax_unwindowed(series, alphabet_size):
    """Return the SAX string for the whole of ``series`` (no sliding window).

    The series is z-normalized, reduced with PAA to the number of segments
    chosen by ``_symbolic_segment_count`` and mapped to symbols using the cut
    points for ``alphabet_size``.

    Uses the module-level ``TSUtils`` class and ``normal_a`` alphabet, so the
    JVM must already be started when this is called.

    Returns:
        The SAX word as a Python ``str``.
    """
    # normalize the time series first; hand Java an explicit double[] (the
    # array was previously built but the raw Python list was passed instead)
    java_series = jpype.JArray(jpype.JDouble, 1)(series)
    normalized = TSUtils.zNormalize(java_series)

    sym_seg = _symbolic_segment_count(len(series), alphabet_size)
    paa = TSUtils.paa(normalized, int(sym_seg))
    cuts = normal_a.getCuts(jpype.JPackage("java.lang").Integer(alphabet_size))
    return str(TSUtils.ts2String(paa, cuts))
# Directory containing this script (used later to locate matlab/show_icons.m)
# and the MATLAB executable handed to the bridge.
base_dir = os.path.dirname(os.path.realpath(__file__))
matlab_path = '/usr/local/MATLAB/R2012a/bin/matlab'

# SAX / icon parameters and the state used while parsing the CSV.
alphabet_size = 4
intelligent_icon_pixels = 16
current_subject_id = 0
last_minutes = 0
current_entry = {}
all_entries = []

#initialize java bridge
# NOTE: the ./lib paths below are relative to the current working directory,
# not to base_dir.
jvmargs = [
    "-Djava.class.path=./lib/jmotif.lib-0.97.jar:./lib/hackystatlogger.lib.jar:./lib/hackystatuserhome.lib.jar:./lib/weka.jar",
    "-Djava.library.path=./lib/rjava/jri",
]
jpype.startJVM(jpype.getDefaultJVMPath(), *jvmargs)

# Java classes from jmotif; these lookups need the JVM started above.
_jmotif_timeseries = jpype.JPackage("edu.hawaii.jmotif.timeseries")
NormalAlphabet = jpype.JPackage("edu.hawaii.jmotif.sax.alphabet").NormalAlphabet
SAXFactory = jpype.JPackage("edu.hawaii.jmotif.sax").SAXFactory
TSUtils = _jmotif_timeseries.TSUtils
Timeseries = _jmotif_timeseries.Timeseries
WordBag = jpype.JPackage("edu.hawaii.jmotif.text").WordBag
normal_a = NormalAlphabet()
def _finalize_entry(entry, increment_minutes=10):
    """Fill in the derived fields of one subject's entry, in place.

    Linearly interpolates the recorded pain scores onto a grid of
    ``increment_minutes`` steps, converts the result to a SAX string and
    builds the raw intelligent-icon bitmap from it.

    Returns:
        True when the entry had more than one recorded score and was
        processed; False when it was left untouched and should be dropped.
    """
    minutes = entry["minutes_after_surgery_entries"]
    if len(minutes) <= 1:
        return False
    #linear interpolation of the pain scores to constant minute increments
    interpolation_func = interp1d(minutes, entry["recorded_pain_score_entries"])
    minutes_expanded = list(range(minutes[0], minutes[-1], increment_minutes))
    entry["interpolated_pain_timeseries"] = [
        float(x) for x in interpolation_func(minutes_expanded)
    ]
    entry["sax"] = sax_unwindowed(entry["interpolated_pain_timeseries"], alphabet_size)
    entry["raw_intelligent_icon_bitmap"] = icon.get_raw_sequence_bitmap(
        entry["sax"], 2, alphabet_size, intelligent_icon_pixels
    )
    return True


# Rows belonging to one subject are expected to be contiguous: a new entry is
# started every time the subject id column changes.
current_entry = None
with open("/mnt/labserver/data_correct_newlines.csv", "r") as data_file:
    data_reader = csv.reader(data_file)
    print("interpolating time series, adding sax")
    next(data_reader, None)  # skip the header row
    for row in data_reader:
        pain_score = row[7]
        minutes_after_surgery = get_mins_from_hhmm(row[8])
        subject_id = row[15]
        if current_entry is None or subject_id != current_entry["subject_id"]:
            # The previous subject is complete; keep it only if it could be
            # processed.
            if current_entry is not None and _finalize_entry(current_entry):
                all_entries.append(current_entry)
            current_entry = {
                "subject_id": subject_id,
                "age": row[0],
                "gender": row[1],
                "surgery_type": row[9],
                "age_group": row[11],
                "minutes_after_surgery_entries": [],
                "recorded_pain_score_entries": [],
                "interpolated_pain_timeseries": None,
                "sax": "",
                "raw_intelligent_icon_bitmap": [],
                "raw_intelligent_icon_proportions_bitmap": [],
                "normalized_intelligent_icon_bitmap": []
            }
        if len(pain_score):
            #todo: introduce a unique identifier for sleep
            current_entry["minutes_after_surgery_entries"].append(minutes_after_surgery)
            # Store numbers, not the raw CSV text, so interpolation never has
            # to coerce strings.
            current_entry["recorded_pain_score_entries"].append(float(pain_score))

# The loop only finalizes an entry when the *next* subject starts, so the last
# subject in the file has to be flushed explicitly.
if current_entry is not None and _finalize_entry(current_entry):
    all_entries.append(current_entry)
print("z normalizing intelligent icon pixels for individual patients")
# Per-pixel statistics over all entries: the summed raw value, and the mean and
# standard deviation of each entry's share of that sum.
pixel_totals = icon.make_empty_bitmap(intelligent_icon_pixels)
pixel_means = icon.make_empty_bitmap(intelligent_icon_pixels)
pixel_std_devs = icon.make_empty_bitmap(intelligent_icon_pixels)
row_pixel_count = int(math.sqrt(intelligent_icon_pixels))
for row in range(row_pixel_count):
    for column in range(row_pixel_count):
        raw_values = [entry["raw_intelligent_icon_bitmap"][row][column] for entry in all_entries]
        pixel_total = numpy.sum(raw_values)
        pixel_totals[row][column] = pixel_total
        #dont include 0s in our proportion calculations, they skew the predicted population mean/stddev heavily
        #TODO: ensure these are all normal!!!
        #a couple don't seem to be, eg at least one appears to have two bell curves for some reason (but most seem to be normal)
        #extra TODO: also, this is probably better represented as a beta distribution
        pixel_proportions = [
            float(raw_value) / float(pixel_total)
            for raw_value in raw_values
            if raw_value > 0
        ]
        # numpy.mean / numpy.std of an empty list are NaN (with a warning), and
        # NaN is truthy, so it would slip past the later "is the std non-zero"
        # check. A pixel nobody uses gets mean 0 and std 0 instead.
        sample = pixel_proportions if pixel_proportions else [0.0]
        pixel_means[row][column] = numpy.mean(sample)
        pixel_std_devs[row][column] = numpy.std(sample)
all_entries_grouped = {}
cluster_by_key = "surgery_type"
word_frequencies = {}
print("combining patients into groups by " + cluster_by_key)
for entry in all_entries:
    #calculated normalized bitmap for this entry
    normalized_bitmap = icon.make_empty_bitmap(intelligent_icon_pixels)
    tmp_flattened_bitmap = []
    for row in range(row_pixel_count):
        for column in range(row_pixel_count):
            std_dev = pixel_std_devs[row][column]
            total = pixel_totals[row][column]
            # A z-score is only defined when the std is a real, non-zero
            # number and the pixel total is non-zero; otherwise use 0.
            if std_dev and total and not math.isnan(std_dev):
                pixel_proportion = float(entry["raw_intelligent_icon_bitmap"][row][column]) / float(total)
                normalized_value = float(pixel_proportion - pixel_means[row][column]) / std_dev
            else:
                normalized_value = 0
            normalized_bitmap[row][column] = normalized_value
            # Reuse the stored value for the debug dump instead of repeating
            # the division, which divided by a zero std for constant pixels.
            tmp_flattened_bitmap.append(normalized_value)
    entry["normalized_intelligent_icon_bitmap"] = normalized_bitmap
    print(",".join([str(x) for x in tmp_flattened_bitmap]))
    #add to group
    key_value = entry[cluster_by_key]
    if key_value not in all_entries_grouped:
        all_entries_grouped[key_value] = {
            "intelligent_icon": icon.make_empty_bitmap(intelligent_icon_pixels),
            "entries_in_group": [],
            "group_name": key_value
        }
    all_entries_grouped[key_value]["entries_in_group"].append(entry)
print("calculating normalized group icons")
# One MATLAB struct literal per group; they are joined into a single array
# expression once all groups are processed.
matlab_struct_exprs = []
for group_name in all_entries_grouped:
    group = all_entries_grouped[group_name]
    print(group_name + " - " + str(len(group["entries_in_group"])) + " entries")
    #flatten the bitmap into a vector of pixels (see matlab/show_icons.m)
    flattened_pixels = []
    for row in range(row_pixel_count):
        for column in range(row_pixel_count):
            pixel_z_val = numpy.mean([entry["normalized_intelligent_icon_bitmap"][row][column] for entry in group["entries_in_group"]])
            #pixel_cdf = norm.cdf(pixel_z_val)
            #we could use cdf here for more "significant" results, but lowering our range makes things more colorful!
            pixel_normalized_val = pixel_z_val
            if pixel_normalized_val > 1:
                pixel_normalized_val = 1
            elif pixel_normalized_val < -1:
                pixel_normalized_val = -1
            #color map requires integer from 1-64, so scale our clamped -1..1 value to that range
            scaled_for_color = int(math.ceil(32 * pixel_normalized_val + 32))
            if scaled_for_color == 0:
                scaled_for_color = 1
            group["intelligent_icon"][row][column] = scaled_for_color
            flattened_pixels.append(str(scaled_for_color))
    # A single quote inside a MATLAB quoted string must be doubled, otherwise
    # a group name containing one would break the expression.
    escaped_title = group_name.replace("'", "''")
    matlab_struct_exprs.append(
        "struct('title', '" + escaped_title + "', 'patches', [" + ",".join(flattened_pixels) + "])"
    )
matlab_intelligent_icon_visualization_data_expr = "[" + ",".join(matlab_struct_exprs) + "]"
print("rendering intelligent icon grid")
#initialise matlab bridge
mlab = Matlab(port=4000, matlab=matlab_path)
mlab.start()
# Workaround: the expression is passed base64-encoded because sending it as-is
# trips a bug in the data-parsing part of the (third-party) bridge.
icon_data_expr_raw = matlab_intelligent_icon_visualization_data_expr
# b64encode needs bytes; encode only when we hold text so this works on both
# Python 2 (str is already bytes) and Python 3.
if not isinstance(icon_data_expr_raw, bytes):
    icon_data_expr_raw = icon_data_expr_raw.encode("utf-8")
icon_data_expr_encoded = base64.b64encode(icon_data_expr_raw)
# On Python 3 the result is bytes; turn it back into text so it prints cleanly
# and can be passed as a plain string argument.
if not isinstance(icon_data_expr_encoded, str):
    icon_data_expr_encoded = icon_data_expr_encoded.decode("ascii")
print(icon_data_expr_encoded)
mlab.run(base_dir + "/matlab/show_icons.m", {"encoded": icon_data_expr_encoded})
# NOTE: printed unconditionally; the value returned by mlab.run is not checked.
print("done without errors")