def create_data(nb_of_samples, sequence_len):
    """Build binary bag-of-words sample vectors from one patient's A/P texts.

    Sample i is a vector over the word index in which the positions of all
    words appearing in texts i .. i+sequence_len-1 are set to 1.

    nb_of_samples -- number of samples to generate
    sequence_len  -- number of consecutive texts folded into one sample

    Returns a list of numpy arrays, each of length len(index).
    """
    # Context manager guarantees the index file is closed even if read()
    # raises (the original open/close pair leaked on error).
    with codecs.open("processed_data/word_index.txt", "r", "utf-8") as f:
        index = f.read().split(",")
    p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json")
    # NOTE(review): hard-coded to patient "2", time "0" -- presumably a demo
    # patient; confirm before wider use.
    sample_texts = p_json["2"]["0"]["A/P"]
    samples = []
    for i in xrange(nb_of_samples):
        tmp = np.zeros(len(index))
        for j in xrange(sequence_len):
            # Mark every word of the (i+j)-th text.  Raises IndexError when
            # nb_of_samples + sequence_len - 1 exceeds len(sample_texts).
            tmp[in_out.return_vector(index, sample_texts[i + j])] = 1
        samples.append(tmp)
    return samples
def get_common_words(arr, N, p_json): p_text, p_json = in_out.read_json("output/json_multi_lab_time_series_patient.json") time = len(p_json["%s"%str(0)]) import MeCab tagger = MeCab.Tagger('-Ochasen') counter = Counter() for n in xrange(N): texts = p_json["%s"%arr[n]]["%s"%(time-1)]["A/P"].strip("\n").encode("utf-8") node = tagger.parseToNode(texts) while node: word = node.surface.decode('utf-8') node = node.next if re.match(ur"^[ぁ-んーァ-ンー\u4e00-\u9FFF]+$", word) != None and len(word) != 1: counter[word] += 1
def find_similar_patient_bycontent(arr, N):
    """Report on the N patients whose stored vectors are closest to arr.

    arr -- query feature vector compared against every row of the
           precomputed neighbor matrix
    N   -- number of similar patients to report

    Side effect only: delegates output to get_common_words.
    """
    # Database: precomputed per-patient feature matrix.
    with open("processed_data/Neighbor_mat.npy", "rb") as npy:
        pmat = np.load(npy)
    # Distance from every stored patient to the query vector
    # (comprehension over rows replaces the index-and-append loop).
    distances = [in_out.calc_dist(row, arr) for row in pmat]
    it = np.argsort(distances)  # patient indices, nearest first
    p_text, p_json = in_out.read_json("output/json_multi_lab_time_series_patient.json")
    # Output search result.
    #get_time_value(it, N, p_json)
    get_common_words(it, N, p_json)
def find_similar_patient_bycontent(arr, N):
    """Locate the N patients most similar to feature vector arr and hand
    their indices to get_common_words for reporting."""
    # Database of precomputed patient feature vectors.
    with open("processed_data/Neighbor_mat.npy", "rb") as npy:
        pmat = np.load(npy)
    # Distance of each stored vector to the query.
    dists = []
    idx = 0
    while idx < len(pmat):
        dists.append(in_out.calc_dist(pmat[idx], arr))
        idx += 1
    it = np.argsort(dists)  # sorted index, nearest first
    p_text, p_json = in_out.read_json(
        "output/json_multi_lab_time_series_patient.json")
    # Output search result.
    #get_time_value(it, N, p_json)
    get_common_words(it, N, p_json)
def find_similar_patient_byID(p_id): #This is for patient ID p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json") p_index = return_patient_index(p_id) with open("processed_data/Neighbor_mat.npy", "rb") as npy: tmp_m = np.load(npy) min_value = 10000 min_id = 0 for i in xrange(len(tmp_m)): if i == p_index: continue tmp_min = calc_dist(tmp_m[i], tmp_m[p_index]) if int(min_value) > int(tmp_min): min_value = tmp_min min_id = i print min_id return return_patient_ID(min_id)
def get_common_words(arr, N, p_json): p_text, p_json = in_out.read_json( "output/json_multi_lab_time_series_patient.json") time = len(p_json["%s" % str(0)]) import MeCab tagger = MeCab.Tagger('-Ochasen') counter = Counter() for n in xrange(N): texts = p_json["%s" % arr[n]]["%s" % (time - 1)]["A/P"].strip("\n").encode("utf-8") node = tagger.parseToNode(texts) while node: word = node.surface.decode('utf-8') node = node.next if re.match(ur"^[ぁ-んーァ-ンー\u4e00-\u9FFF]+$", word) != None and len(word) != 1: counter[word] += 1
elif int(m - t) > sec_base: sec_base = m - t t_sec_base = i print t_base print t_sec_base i += 1 maker_json(base, t_base, sec_base, t_sec_base) print p_dict.values() f = open("output/json_wc_processed_data.json", "w") json.dump(p_dict.values(), f, ensure_ascii=False) return p_dict if __name__ == "__main__": p_text, p_json = in_out.read_json() tokens, text = detection.text_to_tokens(p_text) tmp = detection.word_count(tokens) MD_wc_list = [] Triage_wc_list = [] Chief_wc_list = [] ## Only one sample patient for t in xrange(len(p_json["0"])): MD_wc_list.append( detection.word_count(p_json["0"]["%s" % t]["MDcomments"])) Triage_wc_list.append( detection.word_count(p_json["0"]["%s" % t]["TriageAssessment"])) Chief_wc_list.append( detection.word_count(p_json["0"]["%s" % t]["ChiefComplaint"]))
p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json") sample_texts = p_json["2"]["0"]["A/P"] samples = [] for i in xrange(nb_of_samples): tmp = np.zeros(len(index)) for j in xrange(sequence_len): tmp[in_out.return_vector(index, sample_texts[i+j])] = 1 samples.append(tmp) return samples if __name__ == "__main__": #count, index = load_sample() p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json") #Unidentified two spaces num_patients = len(p_json) print "Number of Patient is %s"%num_patients m = MeCab.Tagger ("-Owakati") #m = MeCab.Tagger ("-Ochasen") tmp = [] fp = open("processed_data/patient_id.txt", "w") for i in xrange(num_patients): text = p_json["%s"%i]["0"]["A/P"] fp.write(p_json["%s"%i]["0"]["patient_id"]) fp.write("\n")
import kanji, vectorlize, in_out


def process(text):
    # Extract the kanji characters contained in text; delegates to the
    # project helper kanji.hasKanji (exact return type defined there).
    chrs = kanji.hasKanji(text)
    return chrs


if __name__ == "__main__":
    #count, index = load_sample()
    p_text, p_json = in_out.read_json("output/json_multi_lab_time_series_patient.json")
    #Unidentified two spaces
    num_patients = len(p_json)
    p = 1
    # Number of time steps, taken from patient 1 -- assumes every patient
    # has the same number of entries (TODO confirm).
    Time = len(p_json["%s"%p])
    tmp_chr = []
    for p in xrange(num_patients):
        tmp_array = []
        for t in xrange(Time):
            lab = "WBC"
            tmp_array.append(p_json["%s"%p]["%s"%t][lab])
            tmp_chr.append(process(p_json["%s"%p]["%s"%t]["A/P"]))
            break  # debug: only the first time step is processed
        break  # debug: only the first patient is processed
    print tmp_chr
    count, index = vectorlize.dictionarize_text(tmp_chr)
t_base = i elif int(m-t) > sec_base: sec_base = m-t t_sec_base = i print t_base print t_sec_base i += 1 maker_json(base, t_base, sec_base, t_sec_base) print p_dict.values() f = open("output/json_wc_processed_data.json", "w") json.dump(p_dict.values(), f, ensure_ascii=False) return p_dict if __name__ == "__main__": p_text, p_json = in_out.read_json() tokens, text = detection.text_to_tokens(p_text) tmp = detection.word_count(tokens) MD_wc_list = [] Triage_wc_list = [] Chief_wc_list = [] ## Only one sample patient for t in xrange(len(p_json["0"])): MD_wc_list.append(detection.word_count(p_json["0"]["%s"%t]["MDcomments"])) Triage_wc_list.append(detection.word_count(p_json["0"]["%s"%t]["TriageAssessment"])) Chief_wc_list.append(detection.word_count(p_json["0"]["%s"%t]["ChiefComplaint"])) convert_to_json(MD_wc_list, Triage_wc_list, Chief_wc_list)
import kanji, vectorlize, in_out


def process(text):
    # Extract the kanji characters contained in text; delegates to the
    # project helper kanji.hasKanji (exact return type defined there).
    chrs = kanji.hasKanji(text)
    return chrs


if __name__ == "__main__":
    #count, index = load_sample()
    p_text, p_json = in_out.read_json(
        "output/json_multi_lab_time_series_patient.json")
    #Unidentified two spaces
    num_patients = len(p_json)
    p = 1
    # Number of time steps, taken from patient 1 -- assumes every patient
    # has the same number of entries (TODO confirm).
    Time = len(p_json["%s" % p])
    tmp_chr = []
    for p in xrange(num_patients):
        tmp_array = []
        for t in xrange(Time):
            lab = "WBC"
            tmp_array.append(p_json["%s" % p]["%s" % t][lab])
            tmp_chr.append(process(p_json["%s" % p]["%s" % t]["A/P"]))
            break  # debug: only the first time step is processed
        break  # debug: only the first patient is processed
    print tmp_chr
    count, index = vectorlize.dictionarize_text(tmp_chr)
def return_patient_index(p_id):
    """Map a patient_id string to its numeric index in the patients json.

    Returns None implicitly when no patient matches.
    """
    p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json")
    idx = 0
    total = len(p_json)
    while idx < total:
        record = p_json["%s" % idx]["0"]
        if record["patient_id"] == p_id:
            return idx
        idx += 1
def return_patient_ID(p_index):
    """Look up the patient_id string for the patient at numeric index p_index."""
    p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json")
    first_entry = p_json["%s" % p_index]["0"]
    return first_entry["patient_id"]