def get_events_together():
    columns_name = ['hadm_id', 'subject_id', 'charttime', 'event_type', 'event']

    diag_columns = ['HADM_ID', 'SUBJECT_ID', 'DISCHTIME', 'DIAGNOSIS', 'ICD9_CODE']
    diag_events = get_all_diagnoses_event()
    diag_events = diag_events.ix[:, diag_columns]
    diag_events.columns = columns_name
    print diag_events[:5]

    lab_events = get_lab_event()
    lab_columns = ['HADM_ID', 'SUBJECT_ID', 'CHARTTIME', 'FLAG', 'ITEMID']
    lab_events = lab_events.ix[:, lab_columns]
    lab_events.columns = columns_name
    print lab_events[:5]

    medic_events = get_medication_event()
    medic_columns = ['HADM_ID', 'SUBJECT_ID', 'STARTDATE', 'DRUG_TYPE', 'FORMULARY_DRUG_CD']
    medic_events = medic_events.ix[:, medic_columns]
    medic_events.columns = columns_name
    print medic_events[:5]

    all_events = pd.concat([diag_events, lab_events, medic_events], ignore_index=True)
    print all_events[:5]
    print all_events[-5:]
    print all_events.shape
    CsvUtility.write2pickle(path.join(Path, 'data-repository/allevents.pkl'), all_events, 'w')
def filter_all_event():
    # Note: the path must be relative to Path; a leading '/' would make
    # path.join() discard Path and point at the wrong location.
    all_events_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/allevents.pkl'), 'r')
    all_events_df['icd9_3'] = ''
    print all_events_df[:5]
    print all_events_df.shape

    # diagnoses_events = all_events_df[all_events_df['event_type'] == 'diagnosis']
    # print diagnoses_events[:5]
    # print diagnoses_events.shape
    # diagnoses_set = set(list(pd.read_csv('../data-repository/merge_diagnoses_dict.csv', header=None).index))
    # print len(diagnoses_set)
    # i = 0
    # for index_iter in diagnoses_events.index:
    #     icd_code = diagnoses_events.ix[index_iter, 'event']
    #     assert len(icd_code) >= 3
    #     if len(icd_code) >= 3:
    #         if icd_code[:3] in diagnoses_set:
    #             all_events_df.ix[index_iter, 'icd9_3'] = all_events_df.ix[index_iter, 'event'][:3]
    #         else:
    #             all_events_df.drop(index_iter, axis=0, inplace=True)
    #     sys.stdout.write('\rROW {0} of {1}...'.format(i, diagnoses_events.shape[0]))
    #     i += 1
    # all_events_df.index = np.array(range(all_events_df.shape[0]))

    print all_events_df[:5]
    print all_events_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/all_events_icd9.pkl'), all_events_df, 'w')
def load_corpus(all_path=Path + '/data-repository/', train_perc=0.7):
    x, y = get_dataset(all_path + 'after_instance.pkl',
                       all_path + 'event_instance_dict.pkl',
                       all_path + 'predict_diags_dict.pkl')
    train_size = int(x.shape[0] * train_perc)
    # shuffle before splitting into train/test and keep the permutation for reuse
    idx = np.random.permutation(x.shape[0])
    x_train = x[idx]
    y_train = y[idx]
    CsvUtility.write_array2csv(idx, Path + '/data-repository/', 'random_idx.csv')
    return x_train[:train_size], y_train[:train_size], x_train[train_size:], y_train[train_size:], idx
def reload_corpus(all_path=Path + '/data-repository/', train_perc=0.7, shuffle=False):
    x = CsvUtility.read_array_from_csv(all_path, 'feature_matrix.csv')
    y = CsvUtility.read_array_from_csv(all_path, 'result_matrix.csv')
    train_size = int(x.shape[0] * train_perc)
    # either reshuffle, or reuse the permutation saved by load_corpus()
    if shuffle:
        idx = np.random.permutation(x.shape[0])
        CsvUtility.write_array2csv(idx, Path + '/data-repository/', 'random_idx.csv')
    else:
        idx = CsvUtility.read_array_from_csv(all_path, 'random_idx.csv')
    x_train = x[idx]
    y_train = y[idx]
    return x_train[:train_size], y_train[:train_size], x_train[train_size:], y_train[train_size:], idx
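# A minimal usage sketch, not part of the original pipeline: it assumes the
# feature/result matrices (or the instance pickles) already exist under
# Path + '/data-repository/'. With shuffle=False, reload_corpus() reuses the
# permutation saved in random_idx.csv, so the train/test split is reproducible.
def _corpus_split_example():
    train_x, train_y, test_x, test_y, idx = reload_corpus(train_perc=0.7, shuffle=False)
    print 'train x/y:', train_x.shape, train_y.shape
    print 'test x/y:', test_x.shape, test_y.shape
    print 'permutation length:', len(idx)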
def get_drug_over(file_name, over_num):
    drug_df = pd.read_csv(os.path.join(Path, file_name),
                          dtype=str)[['HADM_ID', 'FORMULARY_DRUG_CD']]
    print drug_df[:5]
    print drug_df.shape
    drug_df.drop_duplicates(inplace=True)
    print drug_df.shape
    drug_count = drug_df['FORMULARY_DRUG_CD'].value_counts()
    drug_df = pd.DataFrame(drug_count[drug_count > over_num])
    drug_df.columns = ['COUNT']
    drug_df.index.name = 'FORMULARY_DRUG_CD'
    print drug_df[:5]
    print 'size:', drug_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/prescription_drug_over.pkl'), drug_df, 'w')
def get_medication_event():
    medication_df = pd.read_csv(
        os.path.join(Path, 'MIMICIII_data/PRESCRIPTIONS.csv'))[[
            'SUBJECT_ID', 'HADM_ID', 'STARTDATE', 'DRUG_TYPE', 'FORMULARY_DRUG_CD'
        ]]
    # print medication_df[:5]
    medication_df['DRUG_TYPE'] = ['prescription'] * medication_df.shape[0]
    # print medication_df[:5]
    # print medication_df.shape
    sub_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), 'r')
    # drug_df = CsvUtility.read_pickle('../data-repository/prescription_drug_over.pkl', 'r')
    prescription_list = np.array(
        pd.read_csv(path.join(Path, 'data-repository/revert_prescription_dict.csv'),
                    index_col=[0], header=None, dtype=str)).flatten()
    medication_df = medication_df[
        medication_df['SUBJECT_ID'].isin(np.array(list(sub_df.index), dtype=str)) &
        medication_df['FORMULARY_DRUG_CD'].isin(prescription_list)]
    # medication_df['icd9_3'] = [''] * medication_df.shape[0]
    print medication_df.shape
    print len(set(list(medication_df['FORMULARY_DRUG_CD'])))
    return medication_df
def get_lab_event():
    labevent_df = pd.read_csv(
        os.path.join(Path, 'MIMICIII_data/LABEVENTS.csv'),
        dtype=str)[['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'ITEMID', 'FLAG']]
    labevent_df = labevent_df[labevent_df['FLAG'] == 'abnormal']
    labevent_df['FLAG'] = ['labevent'] * labevent_df.shape[0]
    # labevent_df['SUBJECT_ID'] = labevent_df['SUBJECT_ID'].astype('str')
    # labevent_df['HADM_ID'] = labevent_df['HADM_ID'].astype('str')
    print labevent_df[-5:]
    print labevent_df.shape
    print labevent_df.dtypes
    sub_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), 'r')
    # item_df = CsvUtility.read_pickle('../data-repository/lab_item_over.pkl', 'r')
    labtest_list = np.array(
        pd.read_csv(path.join(Path, 'data-repository/revert_labtest_dict.csv'),
                    index_col=[0], header=None, dtype=str)).flatten()
    print labtest_list
    print len(labtest_list)
    labevent_df = labevent_df[
        labevent_df['SUBJECT_ID'].isin(np.array(list(sub_df.index), dtype=str)) &
        labevent_df['ITEMID'].isin(labtest_list)]
    # labevent_df['icd9_3'] = [''] * labevent_df.shape[0]
    print labevent_df.shape
    print len(set(list(labevent_df['ITEMID'])))
    return labevent_df
def subject_admission_over(filename, over_num):
    admission_df = pd.read_csv(os.path.join(Path, filename), dtype=str)
    # print admission_df[:5]
    # admission_df.filter(items=['SUBJECT_ID'], like=)
    print admission_df[:5]
    print admission_df.shape
    admission_df.drop_duplicates(inplace=True)
    print admission_df.shape
    sub_vc = admission_df['SUBJECT_ID'].value_counts()
    sub_df = pd.DataFrame(sub_vc[sub_vc > over_num])
    sub_df.columns = ['COUNT']
    sub_df.index.name = 'SUBJECT_ID'
    print sub_df[:5]
    print 'size: ', sub_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), sub_df, 'w')
def icd_procedures_over(filename, over_num):
    procedures_df = pd.read_csv(os.path.join(Path, filename),
                                dtype=str)[['HADM_ID', 'ICD9_CODE']]
    print procedures_df[:5]
    print procedures_df.shape
    procedures_df.drop_duplicates(inplace=True)
    print procedures_df.shape
    procedure_count = procedures_df['ICD9_CODE'].value_counts()
    print procedure_count
    procedure_df = pd.DataFrame(procedure_count[procedure_count > over_num])
    procedure_df.columns = ['COUNT']
    procedure_df.index.name = 'ICD9_CODE'
    print procedure_df[:5]
    print 'size:', procedure_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/icd_procedures_over.pkl'), procedure_df, 'w')
def get_lab_item_over(file_name, over_num):
    labevent_df = pd.read_csv(os.path.join(Path, file_name),
                              dtype=str)[['HADM_ID', 'ITEMID', 'FLAG']]
    print labevent_df[:5]
    print labevent_df.shape
    labevent_df = labevent_df[labevent_df['FLAG'] == 'abnormal']
    print labevent_df.shape
    labevent_df.drop_duplicates(inplace=True)
    print labevent_df.shape
    item_count = labevent_df['ITEMID'].value_counts()
    item_df = pd.DataFrame(item_count[item_count > over_num])
    item_df.columns = ['COUNT']
    item_df.index.name = 'ITEMID'
    print item_df[:5]
    print 'size:', item_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/lab_item_over.pkl'), item_df, 'w')
# method of the document-processing class (class definition not shown in this excerpt)
def doc2sentences(self, rows_text, token):
    sentences_list = CsvUtility.text2sentence(raw_text=rows_text,
                                              token=token,
                                              stop_words=self.stop_word,
                                              stem_word=False)
    # for sentence in sentences_list:
    #     for word in sentence:
    #         self.word_count[word] = self.word_count[word] + 1 if word in self.word_count else 1
    return sentences_list
def get_revert_labtest():
    labtest_df = pd.read_csv(os.path.join(Path, 'MIMICIII_data/D_LABITEMS.csv'),
                             dtype=str)
    item_df = CsvUtility.read_pickle(
        Path + '/data-repository/lab_item_over.pkl', 'r')
    print item_df[:5]
    print type(list(item_df.index)[0])
    print labtest_df.shape
    print labtest_df[:5]
    print labtest_df.dtypes
    print labtest_df.describe()
    labtest_dict = labtest_df[['ITEMID', 'LABEL']]
    print labtest_dict.shape
    labtest_dict = labtest_dict.dropna()
    print labtest_dict.shape
    labtest_dict = labtest_dict.drop_duplicates()
    print labtest_dict.shape
    print labtest_dict[:5]
    # labtest_dict.to_csv("../data-repository/labtest_dict.csv", index=None)
    labtest_list = labtest_dict.values
    print labtest_list[:5]
    # print np.array(list(item_df.index), dtype=str)
    revert_labtest_dict = {}
    for i in range(len(labtest_list)):
        if labtest_list[i][0] in np.array(list(item_df.index), dtype=str):
            temp_str = remove_bracket_from_str(labtest_list[i][1])
            temp_str = remove_quotation_from_str(temp_str)
            temp_str = temp_str.replace(",", " ").strip().lower()
            revert_labtest_dict[temp_str] = labtest_list[i][0]
    print revert_labtest_dict
    print len(revert_labtest_dict)
    CsvUtility.write_dict2csv(dict(revert_labtest_dict), Path + "/data-repository",
                              "revert_labtest_dict.csv")
def get_simple_inference_penalty(net):
    # get loss from gamma with the lda model
    # gamma = get_topicdist_lda(Path + '/data-repository/selected_docs4LDA.csv', 20)
    gamma = CsvUtility.read_array_from_csv(Path + '/data-repository', 'topicdist_result.csv')
    penalty = Variable(torch.FloatTensor([0.0]))
    gammas = Variable(torch.from_numpy(gamma)).float()
    latent_neuron_topics = np.array([])
    for para_iter, para in enumerate(net.parameters()):
        if para_iter == 0:
            latent_neuron_topics = para.abs().mm(gammas)
            # print 'latent_neuron_topics : ', latent_neuron_topics
            latent_neuron_topics = latent_neuron_topics / (latent_neuron_topics.sum(dim=1).view(-1, 1))
            # print 'Norm latent_neuron_topics : ', latent_neuron_topics
            penalty = Variable(torch.FloatTensor([1.0])) / (latent_neuron_topics.max(dim=1)[0].sum())
    return penalty, latent_neuron_topics.data.numpy()
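# A numpy sketch of the penalty above (illustration only; W and gamma are
# hypothetical arguments). W is the first-layer weight matrix
# (hidden_size x input_size) and gamma the word-topic matrix
# (input_size x topic_num). Each neuron's topic distribution is |W|.dot(gamma),
# row-normalised; the penalty 1 / sum(row-wise max) shrinks as every neuron
# concentrates its probability mass on a single topic.
def _penalty_sketch(W, gamma):
    import numpy as np
    topics = np.abs(W).dot(gamma)                       # (hidden_size, topic_num)
    topics = topics / topics.sum(axis=1, keepdims=True)
    return 1.0 / topics.max(axis=1).sum(), topics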
def get_revert_diagnoses_procedures():
    word_count = {}
    stop_list = {
        "of", "and", "by", "to", "or", "the", "in", "with", "not", "classified",
        "for", "on", "from", "without", "as", "other", "than", "more", "at",
        "one", "all", "a", "its", "may", "after", "any", "d", "be", "into",
        "their", "which", "an", "*nf", "nf*", "but", "", "-", "c", "c-c", "w",
        "e", "o", "b", "m", "g", "s", "h", "t-t", "un", "ve", "k", "u", "j",
        "t", "n"
    }
    diagnoses_df = CsvUtility.read_pickle(
        Path + '/data-repository/icd_diagnoses_over.pkl', 'r')
    procedures_df = CsvUtility.read_pickle(
        Path + '/data-repository/icd_procedures_over.pkl', 'r')
    data_diagnoses = pd.read_csv(
        os.path.join(Path, 'MIMICIII_data/D_ICD_DIAGNOSES.csv'),
        dtype=str)[["ICD9_CODE", "LONG_TITLE"]]
    data_procedures = pd.read_csv(
        os.path.join(Path, 'MIMICIII_data/D_ICD_PROCEDURES.csv'),
        dtype=str)[["ICD9_CODE", "LONG_TITLE"]]
    data_diagnoses.set_index(["ICD9_CODE"], inplace=True)
    data_procedures.set_index(["ICD9_CODE"], inplace=True)
    print diagnoses_df[:5]
    print diagnoses_df.shape
    print procedures_df[:5]
    print procedures_df.shape
    print data_diagnoses[:5]
    print data_diagnoses.shape
    print data_procedures[:5]
    print data_procedures.shape

    merge_diagnoses = pd.merge(diagnoses_df, data_diagnoses, how='inner',
                               left_index=True, right_index=True)
    print merge_diagnoses[:10]
    print merge_diagnoses.shape
    merge_procedures = pd.merge(procedures_df, data_procedures, how='inner',
                                left_index=True, right_index=True)
    print merge_procedures[:10]
    print merge_procedures.shape

    # combine the diagnoses and procedures dataframes
    ICD_merge = pd.concat([merge_diagnoses, merge_procedures], axis=0)
    print ICD_merge[:5]
    icd_merge_list = np.array(ICD_merge.reset_index(), dtype=str)
    print icd_merge_list[:5]

    revert_diagnoses_procedures = {}
    for i in range(len(icd_merge_list)):
        wordlist = [
            re.sub("[^a-zA-Z-]", "", x.lower())
            for x in icd_merge_list[i][2].split(' ')
            if re.sub("[^a-zA-Z-]", "", x.lower()) not in stop_list
        ]
        revert_diagnoses_procedures[" ".join(wordlist)] = icd_merge_list[i][0]
        for word in wordlist:
            word_count[word] = word_count[word] + 1 if word in word_count else 1

    CsvUtility.write_dict2csv(revert_diagnoses_procedures,
                              Path + '/data-repository/',
                              'revert_diagnoses_procedures.csv')
    # CsvUtility.write_text2csv(word_count, '../data-repository/', 'revert_ICD_word_dict.csv')
    with open(Path + "/data-repository/revert_ICD_word_dict.csv", 'w') as w:
        for (key, value) in sorted(word_count.items(), key=lambda s: s[1], reverse=True):
            w.write(key + "," + str(value) + "\n")
def mlp_lda(penalty_rate=100):
    # Mimic Dataset
    print 'loading data...'
    train_x, train_y, test_x, test_y, idx = load_corpus()
    print 'loading ready...'
    print 'shape of train x:', train_x.shape
    print 'shape of train y:', train_y.shape
    print 'shape of test x:', test_x.shape
    print 'shape of test y:', test_y.shape

    # Hyper Parameters
    input_size = len(train_x[0])
    hidden_size = 128
    num_classes = 80
    num_epochs = 20
    batchsize = 10
    learning_rate = 0.001

    net = Net(input_size, hidden_size, num_classes)

    # Loss and Optimizer
    criterion = nn.MSELoss(size_average=False)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
    print 'parameter size :'
    for para in net.parameters():
        print para.size()
        # print para

    train_dataset = Data.TensorDataset(data_tensor=torch.from_numpy(train_x),
                                       target_tensor=torch.from_numpy(train_y))
    train_loader = Data.DataLoader(dataset=train_dataset, batch_size=batchsize,
                                   shuffle=True, num_workers=2)
    test_dataset = Data.TensorDataset(data_tensor=torch.from_numpy(test_x),
                                      target_tensor=torch.from_numpy(test_y))
    test_loader = Data.DataLoader(dataset=test_dataset, batch_size=batchsize,
                                  shuffle=False, num_workers=2)

    # Train the Model
    neuron_topics = np.array([])
    for epoch in range(num_epochs):
        running_loss = 0.0
        count_instance = 0
        for i, data_iter in enumerate(train_loader, 0):
            # Convert numpy arrays to torch Variables
            input_train_x, input_train_y = data_iter
            inputs = Variable(input_train_x).float()
            targets = Variable(input_train_y).float()
            # get the penalty from the lda model
            penalty, neuron_topics = get_simple_inference_penalty(net)
            # penalty = 0
            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            # print 'criterion loss : ', loss
            # print 'penalty loss: ', penalty
            loss = loss + penalty_rate * penalty
            # print 'penalty loss : ', (penalty_rate * penalty).data.numpy()
            loss.backward()
            optimizer.step()
            running_loss += loss.data[0] / num_classes
            count_instance += 1
            # print loss.data
            if (i + 1) % 100 == 1:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' %
                      (epoch + 1, num_epochs, i + 1,
                       train_x.shape[0] / batchsize, running_loss / count_instance))
                running_loss = 0.0
                count_instance = 0
        CsvUtility.write_array2csv(neuron_topics, Path + '/data-repository',
                                   'neuron_topics_' + str(epoch) + '.csv')
    print 'finish training'

    # Test the Model
    res = []
    test_loss = 0.0
    test_count = 0
    for data_iter in test_loader:
        input_test_x, input_test_y = data_iter
        outputs = net(Variable(input_test_x).float())
        targets = Variable(input_test_y).float()
        # _, predicted = torch.max(outputs.data, 1)
        predicted = outputs.data
        st_loss = criterion(outputs, targets)
        test_loss += st_loss.data[0] / num_classes
        test_count += 1
        res.extend(list(predicted.numpy()))

    # save the first parameter
    paras = net.parameters()
    for i, para4 in enumerate(paras, 0):
        if i == 0:
            para4save = para4.data.numpy()
            print 'the first parameter: ', para4save.shape
            CsvUtility.write_array2csv(para4save, Path + '/data-repository',
                                       'temp_parameter.csv')

    # get the precision of test data
    print 'result shape:', len(res), len(res[0])
    print 'test loss:', test_loss / test_count
    auc_list, _ = get_auc_list(test_y, res)
    print 'AUC List:'
    print auc_list
def get_final_word_dict():
    MIMIC_word_dict = list(
        CsvUtility.read_pickle(
            Path + '/data-repository/event_instance_dict.pkl', 'r'))
    print MIMIC_word_dict[:10]
    print len(MIMIC_word_dict)
    diag_num = 0
    lab_num = 0
    drug_num = 0
    other_num = 0
    new_MIMIC_dict = {}
    for item in MIMIC_word_dict:
        if item.startswith("d_"):
            diag_num += 1
        elif item.startswith("l_"):
            lab_num += 1
        elif item.startswith("p_"):
            drug_num += 1
        else:
            other_num += 1
            print item
        new_MIMIC_dict[item[2:]] = item
    new_MIMIC_dict_df = pd.DataFrame.from_dict(dict(new_MIMIC_dict), orient='index')
    show_df(new_MIMIC_dict_df, 10)
    print 'diagnoses number :', diag_num, 'labtest number:', lab_num, \
        'drug number:', drug_num, 'other number:', other_num

    revert_diag_proce_df = pd.read_csv(
        Path + '/data-repository/revert_diagnoses_procedures.csv',
        header=None, dtype=str)
    revert_labtest_df = pd.read_csv(
        Path + '/data-repository/revert_labtest_dict.csv',
        header=None, dtype=str)
    revert_prescrip_df = pd.read_csv(
        Path + '/data-repository/revert_prescription_dict.csv',
        header=None, dtype=str)
    show_df(revert_diag_proce_df, 10)
    show_df(revert_labtest_df, 10)
    show_df(revert_prescrip_df, 10)
    concat_dict = pd.concat(
        [revert_diag_proce_df, revert_labtest_df, revert_prescrip_df],
        axis=0, ignore_index=True)
    show_df(concat_dict, 20)
    concat_dict.set_index(keys=[1], inplace=True)
    show_df(concat_dict, 10)
    print len(set(list(concat_dict.index)))

    merge_df = pd.merge(new_MIMIC_dict_df, concat_dict, how='left',
                        left_index=True, right_index=True)
    show_df(merge_df, 10)
    print len(set(list(merge_df.index)))
    print len(merge_df['0_x'].unique())
    print len(merge_df['0_y'].unique())
    # drop_duplicates() returns a copy, so assign the result back
    merge_df = merge_df.drop_duplicates()
    show_df(merge_df)
    merge_df.to_csv(Path + '/data-repository/entity_dict.csv', header=None, index=None)
def get_sequence():
    print 'reading.....'
    all_events = CsvUtility.read_pickle('../data-repository/allevents.pkl', 'r')
    print all_events.shape
    all_events.dropna(axis=0, how='any',
                      subset=['subject_id', 'charttime', 'event', 'hadm_id'],
                      inplace=True)
    print all_events.shape
    print 'changing the order......'
    all_events = all_events.ix[:, [
        'subject_id', 'charttime', 'event_type', 'event', 'icd9_3', 'hadm_id'
    ]]
    print all_events.dtypes
    all_events = all_events.astype({'hadm_id': 'int64'})
    print all_events.dtypes
    print 'sorting ......'
    all_events.sort_values(
        by=['subject_id', 'hadm_id', 'charttime', 'event_type', 'event'],
        inplace=True)
    print all_events[:10]

    rows = np.array(all_events)
    prev_time = None
    prev_subject = None
    prev_hadm_id = None
    # temp diagnoses in each admission
    diags = set()
    # temp event sequence in each admission
    temp_event_seq = []
    event_seq = []
    # event sequence for each person
    all_seq = []
    # map the time to the events in all_seq
    all_days = []
    # whole set of events
    unique_events = set()
    # whole diagnoses count dict
    diag_count = defaultdict(lambda: 0)
    # get the static features of a patient
    p_features = set_p_features()
    # count the length of sequences
    seq_len = 0
    seq_max = 0
    seq_min = 100000

    for i in rows[0]:
        print type(i)
    for i, row in enumerate(rows):
        # print i, row
        if row[2] == "diagnosis":
            event = row[2][:1] + "_" + str(row[4])
            if not row[2].startswith("E"):
                diag_count[event] += 1
        else:
            event = row[2][:1] + "_" + str(row[3])
        if row[0] is None or row[1] is None or row[5] is None:
            print 'delete None:', row
            continue
        elif type(row[1]) != str and math.isnan(row[1]):
            print 'delete nan:', row
            continue
        elif prev_time is None or prev_subject is None:
            print 'first event'
            pass
        elif (row[0] != prev_subject) or (NLP_Utility.strtime2datetime(row[1]) >
                                          prev_time + datetime.timedelta(365)):
            print 'change sequence', row, ' pre: ', prev_subject, prev_time
            if len(diags) > 0 and len(event_seq) > 4:
                # pre, suf = calculate_window(event_seq + temp_event_seq, all_days)
                # all_seq.append([p_features, event_seq, temp_event_seq, diags, pre, suf])
                temp_event_seq = [x for x in temp_event_seq if x not in diags]
                for item in event_seq:
                    unique_events.add(item)
                for item in temp_event_seq:
                    unique_events.add(item)
                all_days.append(len(temp_event_seq))
                all_seq.append([
                    p_features[prev_hadm_id], event_seq, temp_event_seq,
                    all_days, diags
                ])
                print '!!!__!!!', prev_subject
                print len(event_seq) + len(temp_event_seq), len(all_days), sum(all_days)
                seq_len += len(all_days)
                seq_max = seq_max if seq_max > len(all_days) else len(all_days)
                seq_min = seq_min if seq_min < len(all_days) else len(all_days)
            diags = set()
            event_seq = []
            temp_event_seq = []
            all_days = []
        elif prev_hadm_id != row[5]:
            print 'change temp sequence:', row, ' prev: ', prev_hadm_id
            all_days.append(len(temp_event_seq))
            event_seq += temp_event_seq
            temp_event_seq = []
            diags = set()
        elif NLP_Utility.strtime2datetime(row[1]) != prev_time:
            # print 'just change time: ', prev_time, rows[1]
            all_days.append(len(temp_event_seq))
            event_seq += temp_event_seq
            temp_event_seq = []
        # print 'adding ....'
        temp_event_seq.append(event)
        prev_time = datetime.datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S')
        prev_subject = row[0]
        prev_hadm_id = row[5]
        if row[2] == "diagnosis":
            diags.add(event)
        if i % 10000 == 0:
            print 'complete {0} of {1}'.format(i, len(rows))

    # Write down the vocabulary used and the diagnoses that we want to predict
    predicted_diags = [
        y[0] for y in sorted(diag_count.items(), key=lambda x: x[1], reverse=True)[:num_pred_diag]
    ]
    # uniq = open('../data-repository/vocab', 'w')
    # uniq.write(' '.join(unique_events) + '\n')
    # uniq.write(' '.join(predicted_diags))
    # uniq.close()
    print len(all_seq)
    print all_seq[0]
    after_del_sequence = []
    for instance in all_seq:
        fil_diag = [diag for diag in instance[-1] if diag in predicted_diags]
        if len(fil_diag) > 0:
            after_del_sequence.append(instance)
            after_del_sequence[-1][-1] = fil_diag
    print 'num of seq: ', len(after_del_sequence)
    print 'max/min of seq: ', seq_max, seq_min
    print 'mean of seq: ', seq_len / len(after_del_sequence)
    CsvUtility.write2pickle('../data-repository/after_sequence.pickle',
                            after_del_sequence, 'w')
    CsvUtility.write2pickle('../data-repository/event_dict.pickle',
                            unique_events, 'w')
    print '************************************************************'


#######################################################################################################
def get_diag_sequence():
    pass
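# A small read-back sketch (assumes after_sequence.pickle was written as above;
# the helper name is hypothetical). Each record stores
# [patient_features, event_seq, last_admission_events, events_per_day, diagnoses].
def _inspect_after_sequence():
    seqs = CsvUtility.read_pickle('../data-repository/after_sequence.pickle', 'r')
    p_feat, event_seq, temp_seq, days, diags = seqs[0]
    print 'events:', len(event_seq) + len(temp_seq), 'days:', len(days), 'diagnoses:', len(diags)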
# method of the literature-filtering class (class definition not shown in this excerpt)
def write_filtered_file(self):
    return CsvUtility.write_key_value_times(
        self.entity_count, self.out_filter_file_path,
        'F_' + os.path.basename(self.doc_path))
new_literature = "" for (key, value) in doc2word_list.entity_count.items(): for i in range(value): new_literature += key + "," new_docs.append(new_literature) i_count += 1 if i_count % 5 == 0: end_time = clock() print('\rFile Completed {0} of {1}... Spend {2} s'.format( i_count, len(file_path_list), (end_time - start_time))) start_time = end_time # print "vocabulary size : ", len(vocabulary_count) print "using entity size : ", len(used_entity_count) print "num of docs having entity : ", len(doc2entity) # CsvUtility.write_dict2csv(raw_dict=vocabulary_count, csv_path=args.output_path, # file_name='literature_vocabulary.csv') CsvUtility.write_dict2csv(raw_dict=used_entity_count, csv_path=args.output_path, file_name='used_entity.csv') CsvUtility.write_dict2csv(raw_dict=doc2entity, csv_path=args.output_path, file_name='doc2entity.csv') CsvUtility.write_list2csv(new_docs, csv_path=args.out_filter_file_path, file_name='new_docs.csv') print '******************************************************************************' #test code #python select_relate_literature.py ../data-repository/literature_doc ../data-repository ../data-repository/new_literature entity_dict.csv
def get_gamma_lda(docs_path, topic_num):
    selected_docs = pd.read_csv(docs_path, header=None, index_col=[0]).values
    print 'number of docs:', selected_docs.shape
    # print selected_docs[:5]
    texts = [[word for word in doc[0].split(' ')] for doc in selected_docs]
    # pprint(texts[:5])
    dictionary = corpora.Dictionary(texts)
    dictionary.save_as_text(Path + '/data-repository/available_word_in_literature.csv')
    print dictionary
    # print dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # print corpus[:5]
    # print len(corpus)
    lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_num,
                                update_every=1, chunksize=1000, passes=1)
    # lda_model.print_topics(10, 10)
    # gamma = lda_model.get_topics().T
    gamma = lda_model.state.get_lambda()
    gamma = (gamma / gamma.sum(axis=0)).T
    print "shape of gamma :", gamma.shape
    CsvUtility.write_array2csv(gamma, Path + '/data-repository', 'gamma_from_LDA.csv')
    pprint(lda_model.show_topics(10, 10))

    # resize gamma: the literature vocabulary is smaller than the feature set,
    # so insert zero rows to grow gamma to (feature_size, topic_number)
    gamma_id2word = {}
    with open(Path + '/data-repository/available_word_in_literature.csv') as file:
        line_num = file.readline()
        # print line_num
        lines_contend = file.readlines()
        for line_n in lines_contend:
            line = line_n.split("\t")
            # print line
            if len(line) > 1:
                gamma_id2word[int(line[0])] = line[1]
    print 'original gamma size: ', len(gamma_id2word)
    id_list = gamma_id2word.keys()
    # print np.array(id_list).max()

    feature_word2id = {}
    feature_index = pd.read_csv(Path + '/data-repository/feature2index.csv',
                                header=None, index_col=None)
    # print feature_index.shape
    # print feature_index[:5]
    f_i = np.array(feature_index)
    # print f_i.shape, f_i[:, 1].max()
    # np.zeros((feature_index.shape[0], gamma_data.shape[1]))
    for i in range(f_i.shape[0]):
        feature_word2id[f_i[i][0]] = int(f_i[i][1])
    print 'new feature size: ', len(feature_word2id)

    change_index_result = np.zeros((feature_index.shape[0], gamma.shape[1]))
    for i in range(gamma.shape[0]):
        new_index = feature_word2id[gamma_id2word[i]]
        for j in range(gamma.shape[1]):
            change_index_result[new_index][j] += gamma[i][j]
        if i % 1000 == 0:
            print i, 'line'
    print change_index_result[:5]
    print 'after changing the size of result: ', change_index_result.shape
    CsvUtility.write_array2csv(change_index_result, Path + '/data-repository',
                               'gamma_result.csv')
    return change_index_result
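# Usage sketch (the call is illustrative; the paths are the ones the repo already
# uses). The returned matrix is aligned with the rows of feature2index.csv, so it
# can be written out and later consumed by the LDA-based penalty in the MLP training code.
# gamma = get_gamma_lda(Path + '/data-repository/selected_docs4LDA.csv', topic_num=20)
# print gamma.shape   # (number of features in feature2index.csv, 20)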
def get_all_diagnoses_event():
    diagnoses_df = pd.read_csv(path.join(Path, 'MIMICIII_data/DIAGNOSES_ICD.csv'),
                               dtype=str)
    procedures_df = pd.read_csv(path.join(Path, 'MIMICIII_data/PROCEDURES_ICD.csv'),
                                dtype=str)
    print procedures_df[:5]
    print procedures_df.shape
    print diagnoses_df[:5]
    print diagnoses_df.shape
    diagnoses_df = pd.concat([diagnoses_df, procedures_df], axis=0)
    print diagnoses_df[:5]
    print diagnoses_df.shape

    admission_df = pd.read_csv(os.path.join(Path, 'MIMICIII_data/ADMISSIONS.csv'),
                               dtype=str)
    # print admission_df[:5]
    diagnoses_event = pd.merge(
        diagnoses_df[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']],
        admission_df[['HADM_ID', 'DISCHTIME', 'DIAGNOSIS']],
        'left', on='HADM_ID')
    diagnoses_event['DIAGNOSIS'] = ['diagnosis'] * diagnoses_event.shape[0]
    print diagnoses_event[:10]
    print diagnoses_event.shape
    # print diagnoses_event.dtypes
    # print type(diagnoses_event.ix[0, 0])

    # new update:
    # icd_diagnoses_over is not needed here, because revert_diagnoses_dict already applies the "over" limit
    # icd_df = CsvUtility.read_pickle('../data-repository/icd_diagnoses_over.pkl', 'r')
    diagnoses_list = np.array(
        pd.read_csv(path.join(Path, 'data-repository/revert_diagnoses_procedures.csv'),
                    index_col=[0], header=None).values).flatten()
    # print diagnoses_list
    # print len(diagnoses_list)
    sub_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), 'r')
    diagnoses_event = diagnoses_event[
        diagnoses_event['SUBJECT_ID'].isin(np.array(list(sub_df.index), dtype=str)) &
        diagnoses_event['ICD9_CODE'].isin(diagnoses_list)]
    print diagnoses_event.shape
    print diagnoses_event[:10]

    ######################################
    # print 'additional process'
    # np_diagnoses_event = np.array(diagnoses_event)
    # new_diagnoses_event = []
    # for i in range(len(np_diagnoses_event)):
    #     if np_diagnoses_event[i][2] != np.NaN and len(np_diagnoses_event[i][2]) >= 3 \
    #             and np_diagnoses_event[i][2][:3] in diagnoses_set:
    #         new_line = []
    #         new_line.extend(np_diagnoses_event[i])
    #         new_line.append(np_diagnoses_event[i][2][:3])
    #         if re.match('^V.*', np_diagnoses_event[i][2]):
    #             new_line[4] = 'condition'
    #         if re.match('^7[89]\d.*', np_diagnoses_event[i][2]):
    #             new_line[4] = 'symptom'
    #         new_diagnoses_event.append(new_line)
    #     if i % 10000 == 0:
    #         print i
    # new_columns = list(diagnoses_event.columns)
    # new_columns.append('icd9_3')
    # print new_columns
    # print new_diagnoses_event[:5]
    # diagnoses_event = pd.DataFrame(new_diagnoses_event)
    # diagnoses_event.columns = new_columns
    ######################################

    ######################################
    # just add the 'condition' and 'symptom' labels and do not use icd9_3 anymore
    print "new additional processing ..."
    np_diagnosis_events = np.array(diagnoses_event)
    new_diagnosis_events = []
    for i in range(len(np_diagnosis_events)):
        new_diagnosis_events.append(np_diagnosis_events[i])
        if re.match('^V.*', np_diagnosis_events[i][2]):
            new_diagnosis_events[-1][4] = 'condition'
        elif re.match('^7[89]\d.*', np_diagnosis_events[i][2]):  # stray ']' removed from the pattern
            new_diagnosis_events[-1][4] = 'symptom'
        if i % 10000 == 0:
            print "processing the ", i, "line"
    new_columns = list(diagnoses_event.columns)
    print new_columns
    diagnoses_event = pd.DataFrame(new_diagnosis_events, dtype=str)
    diagnoses_event.columns = new_columns
    ######################################

    print diagnoses_event[:10]
    print diagnoses_event.shape
    print len(set(list(diagnoses_event['ICD9_CODE'])))
    return diagnoses_event
def get_instance(time_before_diag=90):
    print 'reading.....'
    all_events = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/allevents.pkl'), 'r')
    print all_events.shape
    all_events.dropna(axis=0, how='any', inplace=True)
    print all_events.shape
    print 'changing the order......'
    all_events = all_events.ix[:, [
        'subject_id', 'charttime', 'event_type', 'event', 'hadm_id'
    ]]
    print all_events.dtypes
    # all_events = all_events.astype({'hadm_id': 'int64'})
    # print all_events.dtypes
    all_events['subject_id'] = all_events['subject_id'].astype('int64')
    for rr in all_events.ix[0, :]:
        print type(rr)
    print 'sorting ......'
    all_events.sort_values(
        by=['subject_id', 'charttime', 'event_type', 'event'], inplace=True)
    print all_events[:10]

    rows = np.array(all_events, dtype=str)
    prev_time = None
    prev_subject = None
    # temp diagnoses at each time
    tem_diags = set()
    # temp event sequence at each time
    temp_event_seq = []
    # event sequence for each person
    event_seq = []
    # map the time for each person
    event_days = []
    # first time for each person
    base_time = None
    # all instances
    all_seq = []
    # whole set of events
    unique_events = set()
    # whole diagnoses count dict
    diag_count = defaultdict(lambda: 0)
    # count the length of instances
    seq_max = 0
    seq_min = 100000

    for i in rows[0]:
        print type(i)
    for i, row in enumerate(rows):
        # print i, row
        # if row[2] == "diagnosis":
        #     event = row[2][:1] + "_" + str(row[4])
        # else:
        #     event = row[2][:1] + "_" + str(row[3])
        event = row[2][:1] + "_" + str(row[3])
        # if type(row[1]) != str and math.isnan(row[1]):
        #     print 'delete nan:', row
        #     continue
        if prev_time is None or prev_subject is None:
            print 'first event'
            base_time = NLP_Utility.strtime2datetime(row[1])
        elif row[0] != prev_subject or NLP_Utility.strtime2datetime(row[1]) != prev_time:
            if len(tem_diags) > 0:
                # why exclude the diagnoses?
                # temp_event_seq = [x for x in temp_event_seq if x not in tem_diags]
                this_days = (prev_time - base_time).days
                find_days = this_days - time_before_diag if this_days >= time_before_diag else 0
                start_position = get_first_index(event_days, find_days)
                t_event_seq = []
                # for i_pos in range(start_position, len(event_days)):
                #     t_event_seq.append(event_seq[i_pos])
                #     unique_events.add(event_seq[i_pos])
                t_event_seq += event_seq[start_position:]
                # print len(event_seq[start_position:])
                # for item in temp_event_seq:
                #     # t_event_seq.append(item)
                #     unique_events.add(item)
                all_seq.append([t_event_seq, list(tem_diags)])
                for iter_diag in tem_diags:
                    diag_count[iter_diag] = diag_count[iter_diag] + 1
                seq_max = seq_max if seq_max > len(t_event_seq) else len(t_event_seq)
                seq_min = seq_min if seq_min < len(t_event_seq) else len(t_event_seq)
            if row[0] != prev_subject:
                # print 'change patient ', row, ' pre: ', prev_subject, row[0]
                event_seq = []
                event_days = []
                base_time = NLP_Utility.strtime2datetime(row[1])
            else:
                # print 'change time ', row, ' pre: ', prev_time, row[1]
                event_seq += temp_event_seq
                # print prev_time
                # print base_time
                # print type((prev_time - base_time).days)
                event_days += [(prev_time - base_time).days] * len(temp_event_seq)
            tem_diags = set()
            temp_event_seq = []
        # print 'adding ....'
        temp_event_seq.append(event)
        prev_time = datetime.datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S')
        prev_subject = row[0]
        if row[2] == "diagnosis":
            tem_diags.add(event)
        if i % 10000 == 0:
            print 'complete {0} of {1}'.format(i, len(rows))

    # Write down the vocabulary used and the diagnoses that we want to predict
    predicted_diags = [
        y[0] for y in sorted(diag_count.items(), key=lambda x: x[1], reverse=True)[:num_pred_diag]
    ]
    print 'num of seq: ', len(all_seq)
    print all_seq[0]
    after_del_sequence = []
    for instance in all_seq:
        fil_diag = [diag for diag in instance[-1] if diag in predicted_diags]
        # if len(fil_diag) > 0:
        for item in instance[0]:
            unique_events.add(item)
        after_del_sequence.append(instance)
        after_del_sequence[-1][-1] = fil_diag
        for diag in fil_diag:
            unique_events.add(diag)
    print 'after limit the predict diagnoses, num of seq: ', len(after_del_sequence)
    print 'max/min of seq: ', seq_max, seq_min
    print 'number of unique items:', len(unique_events)
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/after_instance.pkl'), after_del_sequence, 'w')
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/event_instance_dict.pkl'), unique_events, 'w')
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/predict_diags_dict.pkl'), predicted_diags, 'w')
    print '************************************************************'
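# get_first_index() is defined elsewhere in the repo; this is only a plausible
# sketch of what the call in get_instance() relies on (an assumption, not the
# original implementation): the first position in the non-decreasing day list
# whose value reaches find_days, i.e. the start of the time_before_diag window.
def _first_index_sketch(event_days, find_days):
    for pos, day in enumerate(event_days):
        if day >= find_days:
            return pos
    return len(event_days)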
def get_revert_prescription():
    prescription_df = pd.read_csv(
        os.path.join(Path, 'MIMICIII_data/PRESCRIPTIONS.csv'), dtype=str)
    drug_df = CsvUtility.read_pickle(
        Path + '/data-repository/prescription_drug_over.pkl', 'r')
    # print type(list(drug_df.index)[0])
    # print np.array(list(drug_df.index), dtype=str)
    print prescription_df.shape
    print prescription_df[:5]
    print prescription_df.dtypes
    print prescription_df.describe()
    prescription_dict = prescription_df[[
        'FORMULARY_DRUG_CD', 'DRUG', 'DRUG_NAME_POE', 'DRUG_NAME_GENERIC'
    ]]
    print prescription_dict.shape
    prescription_dict = prescription_dict.dropna()
    print prescription_dict.shape
    prescription_dict = prescription_dict.drop_duplicates()
    print prescription_dict.shape
    # print prescription_dict[:5]
    # prescription_dict.to_csv("../data-repository/prescription_dict.csv", index=None)

    stop_char = ['(', ')', '/', '/"', '-']
    stop_str = {
        "*nf*", "a", "b", "of", "and", "by", "to", "or", "the", "in", "with",
        "not", "classified", "for", "on", "from", "without", "as", "other",
        "than", "more", "at", "one", "all", "its", "may", "after", "any", "d",
        "be", "into", "their", "which", "an", "ec", "c", "e", "f", "g", "h",
        "i", "j", "k", "l", "m", "n", "o", "p", "q", "s", "t", "u", "v",
        "w", "x", "y", "z", "vs.", "mg", "extended-release", ""
    }
    revert_prescrip_dict = {}
    prescrip_list = prescription_dict.values
    print prescrip_list[:5]
    for i in range(len(prescrip_list)):
        if prescrip_list[i][0] in list(drug_df.index):
            # normalise the DRUG name
            word_list_tmp = []
            prescrip_str = remove_bracket_from_str(prescrip_list[i][1])
            for stop_c in stop_char:
                prescrip_str = prescrip_str.replace(stop_c, ' ').strip()
            for word_tmp in prescrip_str.split(" "):
                tmp = word_tmp.lower()
                if len(tmp) > 0 and any(char.isalpha() for char in tmp):
                    if tmp.endswith("mg") and len(tmp) > 2 and is_number(tmp[:-2]):
                        pass
                    elif tmp not in stop_str:
                        word_list_tmp.append(tmp.strip())
            words = " ".join(word_list_tmp).strip()
            if len(words) > 0 and words not in revert_prescrip_dict:
                revert_prescrip_dict[words] = prescrip_list[i][0]

            # normalise the DRUG_NAME_POE name in the same way
            word_list_tmp = []
            prescrip_str = remove_bracket_from_str(prescrip_list[i][2])
            for stop_c in stop_char:
                prescrip_str = prescrip_str.replace(stop_c, ' ').strip()
            for word_tmp in prescrip_str.split(" "):
                tmp = word_tmp.lower()
                if len(tmp) > 0 and any(char.isalpha() for char in tmp):
                    if tmp.endswith("mg") and len(tmp) > 2 and is_number(tmp[:-2]):
                        pass
                    elif tmp not in stop_str:
                        word_list_tmp.append(tmp.strip())
            words = " ".join(word_list_tmp).strip()
            if len(words) > 0 and words not in revert_prescrip_dict:
                revert_prescrip_dict[words] = prescrip_list[i][0]
    print revert_prescrip_dict
    print len(revert_prescrip_dict)
    CsvUtility.write_dict2csv(dict(revert_prescrip_dict), Path + "/data-repository",
                              'revert_prescription_dict.csv')
def get_dataset(data_pickle_path, word_dict_path, predict_dict_path, save=False):
    all_events = CsvUtility.read_pickle(data_pickle_path, 'r')
    word_dict = CsvUtility.read_pickle(word_dict_path, 'r')
    predict_dict = CsvUtility.read_pickle(predict_dict_path, 'r')
    print all_events[0]
    print len(word_dict), len(predict_dict), len(all_events)

    feature_dict = DictDoubleMap(list(word_dict))
    pred_dict = DictDoubleMap(list(predict_dict))
    feature_matrix = np.zeros((len(all_events), len(word_dict)))
    result_matrix = np.zeros((len(all_events), len(predict_dict)))
    for i_iter, event_line in enumerate(all_events):
        for event_item in event_line[0]:
            feature_matrix[i_iter][feature_dict.get_index_by_word(event_item)] += 1
        for pred_item in event_line[1]:
            result_matrix[i_iter][pred_dict.get_index_by_word(pred_item)] = 1
        if i_iter % 1000 == 0:
            print 'complete {0} of {1}'.format(i_iter, len(all_events))

    if save:
        CsvUtility.write_dict2csv(feature_dict.get_word2index(),
                                  Path + '/data-repository/', 'feature2index.csv')
        CsvUtility.write_dict2csv(pred_dict.get_word2index(),
                                  Path + '/data-repository/', 'predict2index.csv')
        CsvUtility.write_array2csv(feature_matrix, Path + '/data-repository/',
                                   'feature_matrix.csv')
        CsvUtility.write_array2csv(result_matrix, Path + '/data-repository/',
                                   'result_matrix.csv')
    return feature_matrix, result_matrix
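# A minimal usage sketch (the paths are those written by get_instance() above;
# the call itself is illustrative, not part of the pipeline script):
# x, y = get_dataset(Path + '/data-repository/after_instance.pkl',
#                    Path + '/data-repository/event_instance_dict.pkl',
#                    Path + '/data-repository/predict_diags_dict.pkl',
#                    save=True)
# print x.shape, y.shape   # (num_instances, len(word_dict)), (num_instances, len(predict_dict))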
from baseline_method.multi_logistic_model import MultiLogistic
from load_data import load_corpus, reload_corpus
from utility.csv_utility import CsvUtility

if __name__ == '__main__':
    print 'loading data...'
    train_x, train_y, test_x, test_y, idx = reload_corpus()
    print 'loading ready...'
    multi_logs = MultiLogistic(len(train_y[0]))
    print 'training...'
    multi_logs.training(training_x=train_x, training_y=train_y)
    print 'testing...'
    re_auc, re_list = multi_logs.testing(testing_x=test_x, testing_y=test_y)
    print re_auc[:-1]
    print re_auc[-1]
    CsvUtility.write2pickle('../data-repository/model_multilogisticCV.pkl',
                            [idx, re_auc, re_list], 'w')
# except Exception:
#     pass

# first step
print "prepare the dicts of subject(patient), diagnosis, medication, labtest by limiting the minimal count number"
subject_admission_over('MIMICIII_data/ADMISSIONS.csv', 1)
print "============================================================================="
icd_diagnoses_over('MIMICIII_data/DIAGNOSES_ICD.csv', 5)
print "============================================================================="
icd_procedures_over('MIMICIII_data/PROCEDURES_ICD.csv', 5)
print "============================================================================="
get_lab_item_over('MIMICIII_data/LABEVENTS.csv', 10)
print "============================================================================="
get_drug_over('MIMICIII_data/PRESCRIPTIONS.csv', 10)
print "============================================================================="

# get_all_diagnoses_event()
# get_lab_event()
# get_medication_event()

# third step
get_events_together()
all_events = CsvUtility.read_pickle(
    path.join(Path, 'data-repository/allevents.pkl'), 'r')
for i in all_events.ix[0, :]:
    print i
    print type(i)
# filter_all_event()
print '******************************************************************************'

# python select_relate_literature.py '../data-repository/BMC_Musuloskelet_Disord' '../data-repository' 'merge_diagnoses_word_dict.csv'
    # tail of the doc-map helper (its def line is above this excerpt)
    doc_maps = []
    for doc in file_contend:
        doc_maps.extend([[doc[2], doc[1]]])
    # print len(doc_maps), len(doc_maps[0])
    return doc_maps


if __name__ == '__main__':
    # get_good_docs('../data-repository/result/jack_1.csv', 10, 2)
    file_list = Directory.folder_process(Path + '/data-repository/result_0.8')
    merge_dict = dict({})
    doc_map = []
    for file_path in file_list:
        dict_tmp = get_good_docs(file_path, 80, 10)
        print 'this dict len : ', len(dict_tmp)
        merge_dict.update(dict_tmp)
        print 'after the merge : ', len(merge_dict)
        doc_map.extend(get_docs_frequence_kind_map(file_path=file_path))
    # draw_pl(x_y=doc_map, type='o')
    # print merge_dict
    texts = [[word for word in doc.split(' ')] for doc in merge_dict.values()]
    # pprint(texts[:5])
    dictionary = corpora.Dictionary(texts)
    dictionary.save(Path + '/data-repository/available_word_in_literature.dict')
    print dictionary
    CsvUtility.write_dict2csv(merge_dict, Path + '/data-repository',
                              'selected_docs4LDA.csv')