def patient_admission_duration():
    patient_ids = []
    admission_id = []
    patient_mental_not_suicide_all = read.read_from_tsv(
        os.path.join(output_folder, "mental_patient_all_notes.tsv"))
    row_patient_admission_time = {}
    for row in patient_mental_not_suicide_all:
        # Key format: rowid_patientid_admissionid
        row_patient_admission = row[0] + "_" + row[1] + "_" + row[2]
        add_key_dict(row_patient_admission_time, row_patient_admission, row[3])
    print("patients (mental_not_suicide) time information is collected......")

    patient_mental_not_suicide_admission = read.read_from_tsv(
        os.path.join(output_folder, "mental_patient_admission_notes.tsv"))
    mental_not_suicide_patient_admission_time = {}
    for row in patient_mental_not_suicide_admission:
        if row[1] not in patient_ids:
            patient_ids.append(row[1])
        if row[2] not in admission_id:
            admission_id.append(row[2])
        add_key_dict(mental_not_suicide_patient_admission_time, row[1], row[3])
    print("patients_admission (mental_not_suicide) timelines are collected......")

    mental_not_suicide_patient_all_time_formatted = {}
    for key, value in row_patient_admission_time.items():
        time_items = value[0].split("-")
        datetime_new = date(int(time_items[0]), int(time_items[1]),
                            int(time_items[2]))
        mental_not_suicide_patient_all_time_formatted[key] = datetime_new

    mental_not_suicide_patient_admission_period = get_duration(
        mental_not_suicide_patient_admission_time)
    print("patients_admission (mental_not_suicide) timelines are calculated......")

    patient_admission_before = []
    patient_admission_meanwhile = []
    for patient_id in patient_ids:
        mental_not_suicide_admission_time = \
            mental_not_suicide_patient_admission_period[patient_id]
        mental_not_suicide_patient_all_time = get_patient_for_admission(
            mental_not_suicide_patient_all_time_formatted, patient_id)
        for key, admission_time in mental_not_suicide_patient_all_time.items():
            row_id, _, _ = key.split("_")
            if admission_time < mental_not_suicide_admission_time[0]:
                patient_admission_before.append(row_id)
            elif admission_time > mental_not_suicide_admission_time[1]:
                # Notes written after the admission window are ignored.
                pass
            else:
                patient_admission_meanwhile.append(row_id)

    read.save_in_json(
        os.path.join(cache_folder,
                     "mental_not_suicide_patient_admission_before"),
        patient_admission_before)
    read.save_in_json(
        os.path.join(cache_folder,
                     "mental_not_suicide_patient_admission_meanwhile"),
        patient_admission_meanwhile)
    return patient_admission_before, patient_admission_meanwhile

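# The helpers called above -- add_key_dict, get_duration, and
# get_patient_for_admission -- are defined elsewhere in the repository.
# The versions below are minimal sketches reconstructed from their call
# sites here and in read_all_notes_for_patients_admission_time(); treat
# them as assumptions, not the original implementations. They rely on
# `from datetime import date` and on chart dates formatted "YYYY-MM-DD",
# the same format patient_admission_duration() itself assumes.


def add_key_dict(dictionary, key, value):
    # Append value to the list stored under key, creating it on first use.
    dictionary.setdefault(key, []).append(value)


def get_duration(patient_admission_time):
    # Reduce each patient's chart-date strings to an (earliest, latest)
    # pair of datetime.date objects.
    patient_period = {}
    for patient, times in patient_admission_time.items():
        dates = [date(*map(int, t.split("-")[:3])) for t in times]
        patient_period[patient] = (min(dates), max(dates))
    return patient_period


def get_patient_for_admission(all_time_formatted, patient_id):
    # Keep only entries whose key (rowid_patientid_admissionid) belongs
    # to the given patient.
    return {
        key: value
        for key, value in all_time_formatted.items()
        if key.split("_")[1] == patient_id
    }
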
def get_documents():
    suicide_meanwhile_patient_admission = read.read_from_tsv(
        os.path.join(
            output_folder,
            "suicide_patient_id/suicide_meanwhile_patient_admission.tsv"))
    file_notes_title_meanwhile = suicide_meanwhile_patient_admission[:1]
    file_notes_meanwhile = suicide_meanwhile_patient_admission[1:]
    # Sets make the per-row membership tests below O(1) over the large
    # NOTEEVENTS file; the lookup semantics are unchanged.
    row_meanwhile = {row[0] for row in file_notes_meanwhile}

    suicide_before_patient_admission = read.read_from_tsv(
        os.path.join(
            output_folder,
            "suicide_patient_id/suicide_before_patient_admission.tsv"))
    file_notes_title_before = suicide_before_patient_admission[:1]
    file_notes_before = suicide_before_patient_admission[1:]
    row_before = {row[0] for row in file_notes_before}

    # MIMIC-III NOTEEVENTS.csv layout: ROW_ID, SUBJECT_ID, HADM_ID,
    # CHARTDATE, CHARTTIME, STORETIME, CATEGORY, DESCRIPTION, CGID,
    # ISERROR, TEXT -- so row[6] is CATEGORY and row[-1] is the note text.
    with open("data/NOTEEVENTS.csv", 'r') as mycsvfile:
        files = csv.reader(mycsvfile, delimiter=',')
        for row in files:
            if row[0] in row_before:
                if row[6] == "Discharge summary":
                    file_notes_title_before.append(row[:-1])
                    read.save_in_txt_string(
                        os.path.join(
                            output_folder,
                            "discharge_summaries/suicide_patient_before/" +
                            row[0] + "_" + row[1] + "_" + row[2] + ".txt"),
                        row[-1])
            elif row[0] in row_meanwhile:
                if row[6] == "Discharge summary":
                    file_notes_title_meanwhile.append(row[:-1])
                    read.save_in_txt_string(
                        os.path.join(
                            output_folder,
                            "discharge_summaries/suicide_patient_during/" +
                            row[0] + "_" + row[1] + "_" + row[2] + ".txt"),
                        row[-1])

    read.save_in_tsv(
        os.path.join(output_folder,
                     "discharge_summaries/suicide_patient_before.tsv"),
        file_notes_title_before)
    read.save_in_tsv(
        os.path.join(output_folder,
                     "discharge_summaries/suicide_patient_during.tsv"),
        file_notes_title_meanwhile)

def analyze():
    rxnorm_term = read.read_from_tsv("data/umls/all_rxnorm_suppress.tsv")
    snomed_term = read.read_from_tsv("data/umls/all_snowmed_suppress.tsv")
    rxnorm_term = list(set([item[0] for item in rxnorm_term]))
    snomed_term = list(set([item[0] for item in snomed_term]))
    print(len(rxnorm_term))
    print(len(snomed_term))
    rxnorm_term1 = read.read_from_json("data/umls/rxnorm_dict")
    snomed_term1 = read.read_from_json("data/umls/snomed_dict")
    print(len(rxnorm_term1))
    print(len(snomed_term1))

def suicide_meanwhile_notes(file_name):
    target_description = [
        "Nursing/other", "Nursing", "Physician", "Discharge summary",
        "Social Work", "General", "Nutrition", "Rehab Services",
        "Case Management", "Consult"
    ]
    suicide_meanwhile = read.read_from_json(
        os.path.join(cache_folder, "suicide_patient_id/" + file_name))
    # Read the notes file once; the first row is the header.
    patient_notes = read.read_from_tsv(
        os.path.join(cache_folder, "suicide_patient_notes_all.tsv"))
    title = patient_notes[:1]
    patient_notes_all = patient_notes[1:]
    # Use a name that does not shadow the function itself.
    selected_notes = title
    none_admission_id = []
    admission_id = []
    notes_all = []
    notes_all_subset = []
    # admission_id_new = read.read_from_json(os.path.join(cache_folder, "suicide_patient_id/admission_id"))
    for row in patient_notes_all:
        if row[1] in suicide_meanwhile:
            if row[0] in suicide_meanwhile[row[1]]:
                if row[2] == "":
                    # print(row)
                    none_admission_id.append(row[0])
                elif row[2] not in admission_id:
                    admission_id.append(row[2])
                if row[6] in target_description:
                    notes_all_subset.append(row[0])
                    selected_notes.append(row)
                notes_all.append(row[0])
    print("patients: ", len(suicide_meanwhile))
    print("admission with id: ", len(admission_id))
    print("admission without id: ", len(none_admission_id))
    print("all notes: ", len(notes_all))
    # for admission_id_1 in admission_id:
    #     if admission_id_1 not in admission_id_new:
    #         print(admission_id_1)
    read.save_in_tsv(
        os.path.join(output_folder,
                     "suicide_patient_id/" + file_name + ".tsv"),
        selected_notes)

def process_ontology():
    ontology = read.read_from_tsv("data/ontology.tsv")
    concept_mentions = {}
    for synonym, concept in ontology:
        read.add_dict(concept_mentions, concept, synonym)
    concepts = list(concept_mentions.keys())
    synonyms = []
    concept_mention_idx = {}
    idx = 0
    for concept in concepts:
        concept_synonyms = list(set(concept_mentions[concept]))
        synonyms += concept_synonyms
        end = idx + len(concept_synonyms)
        # Each concept owns the half-open range [idx, end) of synonym rows.
        concept_mention_idx[concept] = (idx, end)
        idx = end
    synonyms = [[item] for item in synonyms]
    read.save_in_tsv("data/ontology/ontology_synonyms.tsv", synonyms)
    read.save_in_json("data/ontology/ontology_concept", concepts)
    read.save_in_json("data/ontology/ontology_concept_synonyms_idx",
                      concept_mention_idx)

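# Worked example (hypothetical data): with ontology rows
#   [["heart attack", "C0027051"], ["myocardial infarction", "C0027051"],
#    ["headache", "C0018681"]]
# the outputs would be
#   ontology_synonyms.tsv          -> one synonym per row, concept-contiguous
#   ontology_concept               -> ["C0027051", "C0018681"]
#   ontology_concept_synonyms_idx  -> {"C0027051": (0, 2), "C0018681": (2, 3)}
# (synonym order within a concept can vary because of set() deduplication).
# dev_evaluator() below relies on exactly this contiguous-range layout to
# map doc ids back to concepts.
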
def add_oov_processed():
    code_cuis = read.read_from_tsv(
        "data/AskAPatient/codes_single_synonyms_tsv.tsv")
    code_cuis_dict = {
        line[0]: line[2]
        for line in code_cuis if len(line[3]) > 2
    }
    cui_synonyms = read.read_from_json("data/AskAPatient/cui_dict")
    cui_st = read.read_from_json("data/AskAPatient/cui_st_dict")
    code_labels = read.read_from_json(
        "data/AskAPatient/label_texts_dict_AskAPatient")
    codes_synonyms_tsv = {}
    codes_st_tsv = []
    for code in ask:
        code_st_tsv = [code, code_labels[code]]
        if code in code_cuis_dict:
            cui = code_cuis_dict[code]
            synonym = list(set(cui_synonyms[cui]))
            code_st_tsv += [cui, " [SEP] ".join(synonym)[:100], cui_st[cui]]
        else:
            # Out-of-vocabulary code: fall back to its label text.
            synonym = code_labels[code]
        codes_synonyms_tsv[code] = synonym
        codes_st_tsv.append(code_st_tsv)
    read.save_in_json("data/AskAPatient/code_dict_complete",
                      codes_synonyms_tsv)
    read.save_in_tsv("data/AskAPatient/codes_st_tsv.tsv", codes_st_tsv)

def add_oov():
    code_cuis = read.read_from_tsv("data/AskAPatient/code_cuis.tsv")
    code_cuis_dict = {
        line[0]: line[1:]
        for line in code_cuis if len(line[:-1]) > 0
    }
    cui_synonyms = read.read_from_json("data/AskAPatient/cui_dict")
    cui_st = read.read_from_json("data/AskAPatient/cui_st_dict")
    code_labels = read.read_from_json(
        "data/AskAPatient/label_texts_dict_AskAPatient")
    codes_synonyms_tsv = []
    codes_st_tsv = []
    for code in ask:
        code_synonyms_tsv = [code, code_labels[code]]
        code_st_tsv = [code, code_labels[code]]
        if code in code_cuis_dict:
            cuis = code_cuis_dict[code]
            for cui in cuis:
                code_synonyms_tsv += [
                    cui, " [SEP] ".join(cui_synonyms[cui])[:100]
                ]
                code_st_tsv += [
                    cui, " [SEP] ".join(cui_synonyms[cui])[:100], cui_st[cui]
                ]
        codes_synonyms_tsv.append(code_synonyms_tsv)
        codes_st_tsv.append(code_st_tsv)
    read.save_in_tsv("data/AskAPatient/codes_synonyms_tsv.tsv",
                     codes_synonyms_tsv)
    read.save_in_tsv("data/AskAPatient/codes_st_tsv.tsv", codes_st_tsv)

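# add_oov() keeps every CUI mapped to a code (one synonym block per CUI),
# while add_oov_processed() keeps a single curated CUI per code and falls
# back to the code's label text when no CUI mapping exists. Both rely on
# the module-level `ask` label list loaded from data/AskAPatient/label.
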
def main(model_path, model_type, sentence_corpus, output_path):
    #### Read sentence corpus. Output: list of sentences ####
    sentences = read.read_from_tsv(os.path.join(sentence_corpus, "input.tsv"))
    sentences = [item for row in sentences for item in row]
    print(sentences[:10])

    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)
        # Apply mean pooling to get one fixed-sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        #### Load a pretrained Sentence-BERT model ####
        embedder = SentenceTransformer(model_path)

    #### Generate and cache the sentence embeddings ####
    sentences_embedding = embedder.encode(sentences)
    read.save_in_pickle(os.path.join(output_path, "embeddings.pkl"),
                        sentences_embedding)

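# Example invocation (hypothetical paths; the original driver code is not
# shown here). Assumes the sentence-transformers v0.x API, where
# models.BERT is still available:
#
#   main("models/biobert", "bert", "data/sentence_corpus", "output")
#
# which writes output/embeddings.pkl for the nearest-neighbor query script
# later in this section.
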
def dev_evaluator():
    ontology = read.read_from_tsv("data/ontology/ontology_synonyms.tsv")
    cui_mention_idx = read.read_from_json(
        "data/ontology/ontology_concept_synonyms_idx")
    corpus = {"doc_" + str(id): item[0] for id, item in enumerate(ontology)}
    read.save_in_json("data/evaluator_path/corpus", corpus)
    doc_id2mesh_all = {}
    mesh2doc_id_all = {}
    for key, item in cui_mention_idx.items():
        doc_id2mesh = {
            "doc_" + str(id): key
            for id in range(item[0], item[1])
        }
        doc_id2mesh_all.update(doc_id2mesh)
        mesh2doc_id = {
            key: ["doc_" + str(id) for id in range(item[0], item[1])]
        }
        mesh2doc_id_all.update(mesh2doc_id)
    dev_input = read.read_from_tsv("data/input_raw/dev.tsv")
    mentions = [item[0] for item in dev_input]
    query = {
        "q_" + str(id): item[0]
        for id, item in enumerate(dev_input) if item[1] != "CUI-less"
    }
    relevant_docs = {
        "q_" + str(id): mesh2doc_id_all[item[1]]
        for id, item in enumerate(dev_input) if item[1] != "CUI-less"
    }
    read.save_in_json("data/evaluator_path/dev_queries", query)
    read.save_in_json("data/evaluator_path/dev_relevant_docs", relevant_docs)
    for qid, item in query.items():
        text = [
            ontology[int(doc_id.split("_")[1])][0]
            for doc_id in relevant_docs[qid]
        ]
        print(item, text)

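# Note (assumption): the corpus / dev_queries / dev_relevant_docs JSON files
# written above mirror the (queries, corpus, relevant_docs) arguments of
# sentence-transformers' InformationRetrievalEvaluator; the training script
# that consumes them is not shown in this section.
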
def read_all_notes_for_patients_admission_time():
    patient_id = []
    admission_id = []
    patient_notes_all = read.read_from_tsv(
        os.path.join(output_folder, "note_events_all.tsv"))[1:]
    row_patient_admission_time = {}
    for row in patient_notes_all:
        row_patient_admission = row[0] + "_" + row[1] + "_" + row[2]
        add_key_dict(row_patient_admission_time, row_patient_admission, row[3])
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/allnotes_row_patient_admission_time"),
        row_patient_admission_time)

    patient_notes_suicide = read.read_from_tsv(
        os.path.join(output_folder, "note_events_suicidal.tsv"))[1:]
    suicide_patient_admission_time = {}
    for row in patient_notes_suicide:
        if row[1] not in patient_id:
            patient_id.append(row[1])
        if row[2] not in admission_id:
            admission_id.append(row[2])
        add_key_dict(suicide_patient_admission_time, row[1], row[3])
    read.save_in_json(
        os.path.join(
            cache_folder,
            "suicide_patient_id/suicidalnotes_patient_admission_time"),
        suicide_patient_admission_time)
    read.save_in_json(
        os.path.join(cache_folder, "suicide_patient_id/patient_id"),
        patient_id)
    print(len(admission_id))
    print(len(patient_id))

def main(model_path, model_type, sentence_corpus, query):
    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)
        # Apply mean pooling to get one fixed-sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        #### Load a pretrained Sentence-BERT model ####
        embedder = SentenceTransformer(model_path)

    corpus_embeddings = read.read_from_pickle(
        os.path.join(sentence_corpus, "embeddings.pkl"))
    corpus = read.read_from_tsv(os.path.join(sentence_corpus, "input.tsv"))
    sentences = [item for row in corpus for item in row]

    query_embedding = embedder.encode([query])

    # Find the closest 5 corpus sentences for the query, ranked by
    # cosine distance.
    closest_n = 5
    distances = scipy.spatial.distance.cdist(query_embedding,
                                             corpus_embeddings, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")
    for idx, distance in results[0:closest_n]:
        print(sentences[idx].strip(), "(Score: %.4f)" % (1 - distance))

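# Example query (hypothetical paths, mirroring the embedding script above):
#
#   main("models/biobert", "bert", "data/sentence_corpus",
#        "patient reports chest pain")
#
# prints the five corpus sentences closest to the query in cosine distance;
# the reported score is 1 - distance, i.e. the cosine similarity.
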
import read_files as read

file_path_synonym = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
file_path_st = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRSTY.RRF"

ask = read.read_from_json("data/AskAPatient/label")
twa = read.read_from_json("data/TwADR-L/label")

twa_cuis_all = read.read_from_tsv("data/TwADR-L/cui_cuis - cui_cuis.tsv")
twa_cuis = [item[1] for item in twa_cuis_all]
twa_cuis_dict = {item[0]: item[1] for item in twa_cuis_all}


def textfile2list_twa():
    # Collect UMLS synonyms for every TwADR-L CUI. MRCONSO.RRF is
    # pipe-delimited; field 0 is the CUI and field 14 is STR, the term string.
    data = read.readfrom_txt(file_path_synonym)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in twa_cuis:
            if line[0] not in txt_list:
                txt_list[line[0]] = [line[14]]
            else:
                txt_list[line[0]] += [line[14]]
    read.save_in_json("data/TwADR-L/cui_dict", txt_list)


# textfile2list_twa()


def textfile2list_twa_st():
    data = read.readfrom_txt(file_path_st)
    txt_list = {}
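    # The source is truncated here. The continuation below is a plausible
    # reconstruction mirroring textfile2list_twa() above, on the assumption
    # that MRSTY.RRF is pipe-delimited with the CUI in field 0 and the
    # semantic-type name (STY) in field 3; the output path is a guess
    # patterned on the synonym dictionary.
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in twa_cuis:
            if line[0] not in txt_list:
                txt_list[line[0]] = [line[3]]
            else:
                txt_list[line[0]] += [line[3]]
    read.save_in_json("data/TwADR-L/cui_st_dict", txt_list)
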
def add_ooc_st():
    code_cuis = read.read_from_tsv(
        "data/AskAPatient/codes_st_tsv_processed.tsv")
    code_cuis_dict = {line[0]: line[4] for line in code_cuis}
    read.save_in_json("data/AskAPatient/code_st_dict_complete",
                      code_cuis_dict)