def get_dataset(data_pickle_path, word_dict_path, predict_dict_path, save=False): all_events = CsvUtility.read_pickle(data_pickle_path, 'r') word_dict = CsvUtility.read_pickle(word_dict_path, 'r') predict_dict = CsvUtility.read_pickle(predict_dict_path, 'r') print all_events[0] print len(word_dict), len(predict_dict), len(all_events) feature_dict = DictDoubleMap(list(word_dict)) pred_dict = DictDoubleMap(list(predict_dict)) feature_matrix = np.zeros((len(all_events), len(word_dict))) result_matrix = np.zeros((len(all_events), len(predict_dict))) for i_iter, event_line in enumerate(all_events): for event_item in event_line[0]: feature_matrix[i_iter][feature_dict.get_index_by_word(event_item)] += 1 for pred_item in event_line[1]: result_matrix[i_iter][pred_dict.get_index_by_word(pred_item)] = 1 if i_iter % 1000 == 0: print 'complete {0} of {1}'.format(i_iter, len(all_events)) if save: CsvUtility.write_dict2csv(feature_dict.get_word2index(), Path+'/data-repository/', 'feature2index.csv') CsvUtility.write_dict2csv(pred_dict.get_word2index(), Path+'/data-repository/', 'predict2index.csv') CsvUtility.write_array2csv(feature_matrix, Path+'/data-repository/', 'feature_matrix.csv') CsvUtility.write_array2csv(result_matrix, Path+'/data-repository/', 'result_matrix.csv') return feature_matrix, result_matrix
def get_revert_labtest(): labtest_df = pd.read_csv(os.path.join(Path, 'MIMICIII_data/D_LABITEMS.csv'), dtype=str) item_df = CsvUtility.read_pickle( Path + '/data-repository/lab_item_over.pkl', 'r') print item_df[:5] print type(list(item_df.index)[0]) print labtest_df.shape print labtest_df[:5] print labtest_df.dtypes print labtest_df.describe() labtest_dict = labtest_df[['ITEMID', 'LABEL']] print labtest_dict.shape labtest_dict = labtest_dict.dropna() print labtest_dict.shape labtest_dict = labtest_dict.drop_duplicates() print labtest_dict.shape print labtest_dict[:5] # labtest_dict.to_csv("../data-repository/labtest_dict.csv", index=None) labtest_list = labtest_dict.values print labtest_list[:5] # print np.array(list(item_df.index), dtype=str) revert_labtest_dict = {} for i in range(len(labtest_list)): if labtest_list[i][0] in np.array(list(item_df.index), dtype=str): temp_str = remove_bracket_from_str(labtest_list[i][1]) temp_str = remove_quotation_from_str(temp_str) temp_str = temp_str.replace(",", " ").strip().lower() revert_labtest_dict[temp_str] = labtest_list[i][0] print revert_labtest_dict print len(revert_labtest_dict) CsvUtility.write_dict2csv(dict(revert_labtest_dict), Path + "/data-repository", "revert_labtest_dict.csv")
new_literature = "" for (key, value) in doc2word_list.entity_count.items(): for i in range(value): new_literature += key + "," new_docs.append(new_literature) i_count += 1 if i_count % 5 == 0: end_time = clock() print('\rFile Completed {0} of {1}... Spend {2} s'.format( i_count, len(file_path_list), (end_time - start_time))) start_time = end_time # print "vocabulary size : ", len(vocabulary_count) print "using entity size : ", len(used_entity_count) print "num of docs having entity : ", len(doc2entity) # CsvUtility.write_dict2csv(raw_dict=vocabulary_count, csv_path=args.output_path, # file_name='literature_vocabulary.csv') CsvUtility.write_dict2csv(raw_dict=used_entity_count, csv_path=args.output_path, file_name='used_entity.csv') CsvUtility.write_dict2csv(raw_dict=doc2entity, csv_path=args.output_path, file_name='doc2entity.csv') CsvUtility.write_list2csv(new_docs, csv_path=args.out_filter_file_path, file_name='new_docs.csv') print '******************************************************************************' #test code #python select_relate_literature.py ../data-repository/literature_doc ../data-repository ../data-repository/new_literature entity_dict.csv
def get_revert_prescription(): prescription_df = pd.read_csv(os.path.join( Path, 'MIMICIII_data/PRESCRIPTIONS.csv'), dtype=str) drug_df = CsvUtility.read_pickle( Path + '/data-repository/prescription_drug_over.pkl', 'r') # print type(list(drug_df.index)[0]) # print np.array(list(drug_df.index), dtype=str) print prescription_df.shape print prescription_df[:5] print prescription_df.dtypes print prescription_df.describe() prescription_dict = prescription_df[[ 'FORMULARY_DRUG_CD', 'DRUG', 'DRUG_NAME_POE', 'DRUG_NAME_GENERIC' ]] print prescription_dict.shape prescription_dict = prescription_dict.dropna() print prescription_dict.shape prescription_dict = prescription_dict.drop_duplicates() print prescription_dict.shape # print prescription_dict[:5] # prescription_dict.to_csv("../data-repository/prescription_dict.csv", index=None) stop_char = ['(', ')', '/', '/"', '-'] stop_str = { "*nf*", "a", "b", "of", "and", "by", "to", "or", "the", "in", "with", "not", "classified", "for", "on", "from", "without", "as", "other", "than", "more", "at", "one", "all", "its", "may", "after", "any", "d", "be", "into", "their", "which", "an", "ec", "c", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "i", "s", "t", "u", "v", "w", "x", "y", "z", "vs.", "mg", "extended-release", "" } revert_prescrip_dict = {} prescrip_list = prescription_dict.values print prescrip_list[:5] for i in range(len(prescrip_list)): if prescrip_list[i][0] in list(drug_df.index): word_list_tmp = [] prescrip_str = remove_bracket_from_str(prescrip_list[i][1]) for stop_c in stop_char: prescrip_str = prescrip_str.replace(stop_c, ' ').strip() for word_tmp in prescrip_str.split(" "): tmp = word_tmp.lower() if len(tmp) > 0 and any(char.isalpha() for char in tmp): if tmp.endswith("mg") and len(tmp) > 2 and is_number( tmp[:-2]): pass elif tmp not in stop_str: word_list_tmp.append(tmp.strip()) words = " ".join(word_list_tmp).strip() if len(words) > 0 and words not in revert_prescrip_dict: revert_prescrip_dict[words] = prescrip_list[i][0] word_list_tmp = [] prescrip_str = remove_bracket_from_str(prescrip_list[i][2]) for stop_c in stop_char: prescrip_str = prescrip_str.replace(stop_c, ' ').strip() for word_tmp in prescrip_str.split(" "): tmp = word_tmp.lower() if len(tmp) > 0 and any(char.isalpha() for char in tmp): if tmp.endswith("mg") and len(tmp) > 2 and is_number( tmp[:-2]): pass elif tmp not in stop_str: word_list_tmp.append(tmp.strip()) words = " ".join(word_list_tmp).strip() if len(words) > 0 and words not in revert_prescrip_dict: revert_prescrip_dict[words] = prescrip_list[i][0] print revert_prescrip_dict print len(revert_prescrip_dict) CsvUtility.write_dict2csv(dict(revert_prescrip_dict), Path + "/data-repository", 'revert_prescription_dict.csv')
def get_revert_diagnoses_procedures(): word_count = {} stop_list = { "of", "and", "by", "to", "or", "the", "in", "with", "not", "classified", "for", "on", "from", "without", "as", "other", "than", "more", "at", "one", "all", "a", "its", "may", "after", "any", "d", "be", "into", "their", "which", "an", "*nf", "nf*", "but", "but", "", "-", "c", "c-c", "w", "e", "o", "b", "m", "g", "s", "h", "t-t", "un", "ve", "k", "u", "j", "t", "n" } diagnoses_df = CsvUtility.read_pickle( Path + '/data-repository/icd_diagnoses_over.pkl', 'r') procedures_df = CsvUtility.read_pickle( Path + '/data-repository/icd_procedures_over.pkl', 'r') data_diagnoses = pd.read_csv(os.path.join( Path, 'MIMICIII_data/D_ICD_DIAGNOSES.csv'), dtype=str)[["ICD9_CODE", "LONG_TITLE"]] data_procedures = pd.read_csv(os.path.join( Path, 'MIMICIII_data/D_ICD_PROCEDURES.csv'), dtype=str)[["ICD9_CODE", "LONG_TITLE"]] data_diagnoses.set_index(["ICD9_CODE"], inplace=True) data_procedures.set_index(["ICD9_CODE"], inplace=True) print diagnoses_df[:5] print diagnoses_df.shape print procedures_df[:5] print procedures_df.shape print data_diagnoses[:5] print data_diagnoses.shape print data_procedures[:5] print data_procedures.shape merge_diagnoses = pd.merge(diagnoses_df, data_diagnoses, how='inner', left_index=True, right_index=True) print merge_diagnoses[:10] print merge_diagnoses.shape merge_procedures = pd.merge(procedures_df, data_procedures, how='inner', left_index=True, right_index=True) print merge_procedures[:10] print merge_procedures.shape #combine the dianoses and procedures dataframe ICD_merge = pd.concat([merge_diagnoses, merge_procedures], axis=0) print ICD_merge[:5] icd_merge_list = np.array(ICD_merge.reset_index(), dtype=str) print icd_merge_list[:5] revert_diagnoses_procedures = {} for i in range(len(icd_merge_list)): wordlist = [ re.sub("[^a-zA-Z-]", "", x.lower()) for x in icd_merge_list[i][2].split(' ') if re.sub("[^a-zA-Z-]", "", x.lower()) not in stop_list ] revert_diagnoses_procedures[" ".join(wordlist)] = icd_merge_list[i][0] for word in wordlist: word_count[ word] = word_count[word] + 1 if word in word_count else 1 CsvUtility.write_dict2csv(revert_diagnoses_procedures, Path + '/data-repository/', 'revert_diagnoses_procedures.csv') # CsvUtility.write_text2csv(word_count, '../data-repository/', 'revert_ICD_word_dict.csv') with open(Path + "/data-repository/revert_ICD_word_dict.csv", 'w') as w: for (key, value) in sorted(word_count.items(), key=lambda s: s[1], reverse=True): w.write(key + "," + str(value) + "\n")
doc_maps = [] for doc in file_contend: doc_maps.extend([[doc[2], doc[1]]]) # print len(doc_maps), len(doc_maps[0]) return doc_maps if __name__ == '__main__': # get_good_docs('../data-repository/result/jack_1.csv', 10, 2) file_list = Directory.folder_process(Path + '/data-repository/result_0.8') merge_dict = dict({}) doc_map = [] for file_path in file_list: dict_tmp = get_good_docs(file_path, 80, 10) print 'this dict len : ', len(dict_tmp) merge_dict.update(dict_tmp) print 'after the merge : ', len(merge_dict) doc_map.extend(get_docs_frequence_kind_map(file_path=file_path)) # draw_pl(x_y=doc_map, type='o') # print merge_dict texts = [[word for word in doc.split(' ')] for doc in merge_dict.values()] # pprint(texts[:5]) dictionary = corpora.Dictionary(texts) dictionary.save(Path + '/data-repository/available_word_in_literature.dict') print dictionary CsvUtility.write_dict2csv(merge_dict, Path + '/data-repository', 'selected_docs4LDA.csv')