def save(file, idx, label):
    """Write per-medicine frequency stats for the rows in idx, then dump the split to disk."""
    file.write("---" + label + "---\n")
    # fraction of notes in this split that mention each medicine
    freq = df.loc[idx, MEDICINE_COL].sum() / len(idx)
    for med in MEDICINE_COL:
        file.write("%s:%.2f(%d)\n" % (med, freq[med], df.loc[idx, med].sum()))
    file.write("total#: %d\n" % len(idx))
    # persist the notes and the medicine labels of this split
    df.loc[idx, NOTES_COL].to_pickle(
        os.path.join(check_sys_path(), "%s_idx.pkl" % label))
    df.loc[idx, MEDICINE_COL].to_csv(
        os.path.join(check_sys_path(), "%s_label.csv" % label), index=False)
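# For context, a minimal sketch of how save() might be called. The 80/20 split
# and the "split_stat.txt" filename are illustrative assumptions, not taken
# from the original script.
perm = np.random.permutation(len(df))
cut = int(0.8 * len(df))
train_idx, val_idx = df.index[perm[:cut]], df.index[perm[cut:]]
with open(os.path.join(check_sys_path(), "split_stat.txt"), "w") as stat_file:
    save(stat_file, train_idx, "train")
    save(stat_file, val_idx, "val")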
def transform(infile, outfile):
    """
    :param infile: npy filename, shape=(num_notes, num_words)
    :param outfile: npy filename, shape=(num_notes, num_words, embedding_dim)
    """
    print("transforming idx data %s to embedding data %s..." % (infile, outfile))
    transform_dataset = IdxData(context_size=-1, file=infile)
    transform_loader = DataLoader(
        transform_dataset,
        batch_size=1,  # notes differ in word count, so batch one note at a time
        num_workers=1,
        shuffle=False)
    with torch.no_grad():
        net.eval()
        embeddings = []
        for note in transform_loader:
            note = note.long().cuda()
            embedding = net.embeddings(note)
            embedding = embedding.cpu().numpy().astype("float32")
            embeddings.append(embedding[0])  # batch size = 1, select the single item
    notes_embedding = np.array(embeddings)
    np.save(os.path.join(check_sys_path(), outfile), notes_embedding)
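# IdxData is defined elsewhere in the repo. A minimal sketch of the interface
# transform() relies on, assuming the .npy file holds one array of word indices
# per note and that context_size=-1 means "yield the whole note at once":
from torch.utils.data import Dataset

class IdxData(Dataset):
    def __init__(self, context_size, file):
        self.context_size = context_size  # -1: return the full note (assumed convention)
        self.notes = np.load(os.path.join(check_sys_path(), file), allow_pickle=True)

    def __len__(self):
        return len(self.notes)

    def __getitem__(self, i):
        return np.asarray(self.notes[i], dtype="int64")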
def __init__(self,
             hidden_size,
             attention_size,
             num_classes,
             word2vec_path=os.path.join(check_sys_path(), "glove.6B.50d.txt")):
    super(HAN, self).__init__()
    # load pre-trained embedding layer
    # pretrained_weight = load_pretrained_embedding(word2vec_path)
    # self.embedding = nn.Embedding.from_pretrained(pretrained_weight)
    # embed_dim = self.embedding.weight.shape[1]
    # TODO: load pre-trained weights
    embed_dim = 128
    self.embedding = nn.Embedding(len(word2idx), embed_dim)
    # word-level attention over word embeddings, then sentence-level attention
    # over the word encoder output (hidden_size * 2 suggests a bidirectional GRU)
    self.word_att = AttGRU(input_size=embed_dim,
                           hidden_size=hidden_size,
                           att_size=attention_size)
    self.sent_att = AttGRU(input_size=hidden_size * 2,
                           hidden_size=hidden_size,
                           att_size=attention_size)
    self.fc = nn.Linear(hidden_size * 2, num_classes)
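# The TODO above concerns the commented-out pretrained path. A sketch of what
# load_pretrained_embedding could look like for a GloVe-format text file,
# assuming the global word2idx mapping used above; the random init for words
# missing from GloVe is an assumption:
def load_pretrained_embedding(word2vec_path):
    vectors = {}
    with open(word2vec_path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")  # GloVe format: word v1 v2 ... vD
            vectors[parts[0]] = np.array(parts[1:], dtype="float32")
    embed_dim = len(next(iter(vectors.values())))
    weight = np.random.normal(scale=0.1, size=(len(word2idx), embed_dim)).astype("float32")
    for word, i in word2idx.items():
        if word in vectors:  # words absent from GloVe keep their random init
            weight[i] = vectors[word]
    return torch.from_numpy(weight)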
transform("train_idx.npy", "train_%dembedding.npy" % args.embedding_dim)
transform("val_idx.npy", "val_%dembedding.npy" % args.embedding_dim)

# find the words closest to each medicine in embedding space, as an evaluation
print("finding most relevant words of medicine (nearest neighbour in embedding space)...")
# TODO: test performance in the T-SNE low-dimensional space
idx_med = dict()
with open(os.path.join(check_sys_path(), "med_idx.txt")) as f:
    for line in f:
        med, idx = line.split(":")
        idx_med[int(idx)] = med
idx_word = dict()
with open(os.path.join(check_sys_path(), "word_idx.txt")) as f:
    for line in f:
        word, idx = line.split(":")
        idx_word[int(idx)] = word

print("calculating embeddings for all words and medicines")
# get the embeddings of all words; medicines are looked up separately,
# so drop their indices from the word index map first
for med in idx_med.keys():
    idx_word.pop(med, None)
words = np.array(list(idx_word.keys()))
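# The nearest-neighbour lookup itself is not shown past this point. A sketch of
# one way to finish it, assuming net.embeddings is the trained embedding layer
# used in transform(); cosine similarity and the top-5 cutoff are assumptions:
with torch.no_grad():
    word_emb = net.embeddings(torch.from_numpy(words).long().cuda()).cpu().numpy()
    meds = np.array(list(idx_med.keys()))
    med_emb = net.embeddings(torch.from_numpy(meds).long().cuda()).cpu().numpy()
# cosine similarity between every medicine and every word
word_norm = word_emb / np.linalg.norm(word_emb, axis=1, keepdims=True)
med_norm = med_emb / np.linalg.norm(med_emb, axis=1, keepdims=True)
sim = med_norm @ word_norm.T  # shape: (num_meds, num_words)
for row, med_idx in enumerate(meds):
    top = np.argsort(-sim[row])[:5]
    print("%s: %s" % (idx_med[med_idx], ", ".join(idx_word[words[j]] for j in top)))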
MEDICINE_COL = [
    "metoprolol", "furosemide", "lisinopril", "amlodipine", "atenolol",
    "hydrochlorothiazide", "diltiazem", "carvedilol"
]

parser = argparse.ArgumentParser(description='embedding model')
parser.add_argument('--min_freq', default=10, type=int,
                    help='minimum document frequency for a word to be kept in the vocabulary')
parser.add_argument('--max_df', default=1, type=float,
                    help='maximum document frequency (as a proportion) for a word to be kept')
args = parser.parse_args()

if __name__ == '__main__':
    # read data
    df = pd.read_csv(
        os.path.join(check_sys_path(), "discharge_notes_with_medication.csv"))
    df = df[df["admission_notes"].notna()]

    print("cleaning word...")
    # discharge_notes = df["discharge_notes"][train_idx].fillna("").tolist()
    admission_notes = df["admission_notes"].dropna().tolist()
    for i in range(len(admission_notes)):
        # normalize numbered order lines, clock times, remaining digits,
        # underscores, and sentence boundaries
        admission_notes[i] = re.sub(r"\d\. ", " ordernum ", admission_notes[i])
        admission_notes[i] = re.sub(r"\d\d:\d\d", " hourtime ", admission_notes[i])
        admission_notes[i] = re.sub(r"\d+", " num ", admission_notes[i])
        admission_notes[i] = re.sub("_", " ", admission_notes[i])
        admission_notes[i] = re.sub(r"\. ", " eos ", admission_notes[i])

    print("training nlp model...")
    vectorizer = CountVectorizer(min_df=args.min_freq,
                                 max_df=args.max_df)
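    # Hypothetical continuation: fit on the cleaned notes and dump the
    # vocabulary in the "word:idx" format that the evaluation script parses
    # back from word_idx.txt; this step is an assumption, not from the original.
    vectorizer.fit(admission_notes)
    with open(os.path.join(check_sys_path(), "word_idx.txt"), "w") as f:
        for word, i in vectorizer.vocabulary_.items():
            f.write("%s:%d\n" % (word, i))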
"past medical history", "past procedure", "social history", "family history", "initial exam", "admission medications", "pertinent results" ] MEDICINE_COL = [ "metoprolol", "furosemide", "lisinopril", "amlodipine", "atenolol", "hydrochlorothiazide", "diltiazem", "carvedilol" ] parser = argparse.ArgumentParser(description='embedding model') args = parser.parse_args() if __name__ == '__main__': # read data df = pd.read_csv( os.path.join(check_sys_path(), "discharge_notes_with_medication.csv")) print("cleaning word...") df["admission_notes"] = "" for col in NOTES_COL: df[col] = df[col].str.lower().replace(r"\d\. ", " ordernum ", regex=True) df[col] = df[col].str.replace(r"\d\d:\d\d", " hourtime ", regex=True) df[col] = df[col].str.replace(r"\d+", " num ", regex=True) df[col] = df[col].str.replace("_", " ", regex=True) df[col] = df[col].str.replace(r"\. ", " <eos> ", regex=True) df["admission_notes"] = df["admission_notes"] + df[col] df = df.dropna(subset=["admission_notes"]) admission_notes = df["admission_notes"].dropna().tolist()