Example #1
import codecs
import numpy as np
import in_out

def create_data(nb_of_samples, sequence_len):
	# Load the word-to-index vocabulary produced in an earlier step.
	f = codecs.open("processed_data/word_index.txt", "r", "utf-8")
	index = f.read().split(",")
	f.close()

	p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json")
	sample_texts = p_json["2"]["0"]["A/P"]
	samples = []
	for i in xrange(nb_of_samples):
		# One multi-hot vocabulary vector per window of sequence_len tokens.
		tmp = np.zeros(len(index))
		for j in xrange(sequence_len):
			tmp[in_out.return_vector(index, sample_texts[i + j])] = 1
		samples.append(tmp)
	return samples
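
For context, a minimal usage sketch of create_data; the sample count and window length are illustrative, and in_out is assumed to be the project's own I/O helper module:

import numpy as np

# Hypothetical call: 100 sliding windows of 5 consecutive tokens,
# one multi-hot vocabulary vector per window.
samples = create_data(nb_of_samples=100, sequence_len=5)
print np.asarray(samples).shape  # (100, vocabulary_size)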
Example #2
import numpy as np
import in_out

def find_similar_patient_bycontent(arr, N):
    # N is the number of similar patients to retrieve.
    # Database: the precomputed patient feature matrix.
    with open("processed_data/Neighbor_mat.npy", "rb") as npy:
        pmat = np.load(npy)

    # Distance to every patient, then argsort for the ranking.
    tmp_min_arr = []
    for i in xrange(len(pmat)):
        tmp_min = in_out.calc_dist(pmat[i], arr)
        tmp_min_arr.append(tmp_min)

    it = np.argsort(tmp_min_arr)  #sorted index

    p_text, p_json = in_out.read_json(
        "output/json_multi_lab_time_series_patient.json")
    #Output Search Result
    #get_time_value(it, N, p_json)
    get_common_words(it, N, p_json)
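
A possible call site, assuming the query vector lives in the same feature space as the rows of Neighbor_mat.npy (the first row is reused here purely for illustration):

import numpy as np

# Hypothetical query: reuse an existing patient's feature vector.
query = np.load("processed_data/Neighbor_mat.npy")[0]
# Tally common A/P words among the 10 nearest patients.
find_similar_patient_bycontent(query, 10)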
Example #3
import numpy as np
import in_out

def find_similar_patient_byID(p_id):
	# Map the patient ID to its row index in the feature matrix.
	p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json")
	p_index = return_patient_index(p_id)

	with open("processed_data/Neighbor_mat.npy", "rb") as npy:
		tmp_m = np.load(npy)

	# Linear scan for the nearest patient other than the query itself.
	min_value = float("inf")
	min_id = 0
	for i in xrange(len(tmp_m)):
		if i == p_index:
			continue
		tmp_min = calc_dist(tmp_m[i], tmp_m[p_index])
		if tmp_min < min_value:  # keep the smallest distance seen so far
			min_value = tmp_min
			min_id = i
	print min_id
	return return_patient_ID(min_id)
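
A hedged example call; "12345" is a placeholder that must match a patient_id field in one_json_time_series_patient.json:

nearest_id = find_similar_patient_byID("12345")
print nearest_id  # ID of the closest other patient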
Example #4
import re
from collections import Counter

import MeCab
import in_out

def get_common_words(arr, N, p_json):
    # Note: the JSON is re-read here, shadowing the p_json argument.
    p_text, p_json = in_out.read_json("output/json_multi_lab_time_series_patient.json")
    time = len(p_json["0"])

    tagger = MeCab.Tagger('-Ochasen')

    counter = Counter()
    for n in xrange(N):
        # A/P note at the last time step of the n-th nearest patient.
        texts = p_json["%s" % arr[n]]["%s" % (time - 1)]["A/P"].strip("\n").encode("utf-8")
        node = tagger.parseToNode(texts)
        while node:
            word = node.surface.decode('utf-8')
            node = node.next
            # Keep only kana/kanji words longer than one character.
            if re.match(ur"^[ぁ-んーァ-ンー\u4e00-\u9FFF]+$", word) is not None and len(word) != 1:
                counter[word] += 1
    return counter  # tally of common words across the N neighbours
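
A sketch of how the returned Counter could be consumed, assuming it is the argsort index array from Example #2 and p_json the loaded JSON:

counter = get_common_words(it, 10, p_json)
# Show the ten most frequent A/P words among the neighbours.
for word, freq in counter.most_common(10):
    print word.encode("utf-8"), freq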
Example #5
	p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json")
	sample_texts = p_json["2"]["0"]["A/P"]
	samples = []
	for i in xrange(nb_of_samples):
		tmp = np.zeros(len(index))
		for j in xrange(sequence_len):
			tmp[in_out.return_vector(index, sample_texts[i+j])] = 1
		samples.append(tmp)
	return samples


if __name__ == "__main__":
        #count, index = load_sample()

	p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json")

        #Unidentified two spaces
        num_patients = len(p_json)
	print "Number of Patient is %s"%num_patients

	m = MeCab.Tagger ("-Owakati")
	#m = MeCab.Tagger ("-Ochasen")

	
	tmp = []
	fp = open("processed_data/patient_id.txt", "w")
	for i in xrange(num_patients):
		text = p_json["%s"%i]["0"]["A/P"]
		fp.write(p_json["%s"%i]["0"]["patient_id"])
		fp.write("\n")
Example #6
            t_base = i
        elif int(m - t) > sec_base:
            sec_base = m - t
            t_sec_base = i
        print t_base
        print t_sec_base
        i += 1
        maker_json(base, t_base, sec_base, t_sec_base)

    print p_dict.values()
    f = open("output/json_wc_processed_data.json", "w")
    json.dump(p_dict.values(), f, ensure_ascii=False)
    return p_dict


if __name__ == "__main__":
    p_text, p_json = in_out.read_json()
    tokens, text = detection.text_to_tokens(p_text)
    tmp = detection.word_count(tokens)

    MD_wc_list = []
    Triage_wc_list = []
    Chief_wc_list = []

    ## Only one sample patient
    for t in xrange(len(p_json["0"])):
        MD_wc_list.append(detection.word_count(p_json["0"]["%s" % t]["MDcomments"]))
        Triage_wc_list.append(detection.word_count(p_json["0"]["%s" % t]["TriageAssessment"]))
        Chief_wc_list.append(detection.word_count(p_json["0"]["%s" % t]["ChiefComplaint"]))

    convert_to_json(MD_wc_list, Triage_wc_list, Chief_wc_list)
Example #7
import kanji, vectorlize, in_out


def process(text):
    chrs = kanji.hasKanji(text)
    return chrs


if __name__ == "__main__":
    #count, index = load_sample()

    p_text, p_json = in_out.read_json(
        "output/json_multi_lab_time_series_patient.json")

    num_patients = len(p_json)
    # Number of time steps, probed on one patient (all are assumed equal).
    p = 1
    Time = len(p_json["%s" % p])

    tmp_chr = []
    for p in xrange(num_patients):
        tmp_array = []
        for t in xrange(Time):
            lab = "WBC"
            tmp_array.append(p_json["%s" % p]["%s" % t][lab])
            tmp_chr.append(process(p_json["%s" % p]["%s" % t]["A/P"]))
            break  # sample only the first time step
        break  # ...of the first patient

    print tmp_chr
    count, index = vectorlize.dictionarize_text(tmp_chr)
Example #8
import in_out

def return_patient_index(p_id):
	# Map a patient ID string to its row index (None if not found).
	p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json")
	for i in xrange(len(p_json)):
		if p_id == p_json["%s" % i]["0"]["patient_id"]:
			return i
Example #9
import in_out

def return_patient_ID(p_index):
	# Inverse lookup: row index back to the patient ID string.
	p_text, p_json = in_out.read_json("output/one_json_time_series_patient.json")
	return p_json["%s" % p_index]["0"]["patient_id"]
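
Since these two helpers are inverses over the same JSON, a minimal round-trip check; "12345" is a placeholder ID:

idx = return_patient_index("12345")
assert return_patient_ID(idx) == "12345"  # holds whenever the ID exists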