    def save(file, idx, label):
        """Write per-medicine prescription frequencies for the rows in idx,
        then dump the matching notes and labels to disk for the given split."""
        file.write("---" + label + "---\n")
        freq = df.loc[idx, MEDICINE_COL].sum() / len(idx)
        for med in MEDICINE_COL:
            file.write("%s:%.2f(%d)\n" % (med, freq[med], df.loc[idx, med].sum()))
        file.write("total#: %d\n" % len(idx))

        df.loc[idx, NOTES_COL].to_pickle(
            os.path.join(check_sys_path(), "%s_idx.pkl" % label))
        df.loc[idx, MEDICINE_COL].to_csv(os.path.join(check_sys_path(),
                                                      "%s_label.csv" % label),
                                         index=False)
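    # assumed usage, not shown in the original: one call per data split, e.g.
    #   with open(os.path.join(check_sys_path(), "stats.txt"), "w") as f:
    #       save(f, train_idx, "train")
    #       save(f, val_idx, "val")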
    def transform(infile, outfile):
        """
        :param infile: npy filename, shape=(num_notes, num_words)
        :param outfile: npy filename, shape=(num_notes, num_words, embedding_dim)
        """
        print("transforming idx data %s to embedding data %s..." %
              (infile, outfile))
        transform_dataset = IdxData(context_size=-1, file=infile)
        transform_loader = DataLoader(
            transform_dataset,
            batch_size=1,  # notes differ in word count, so process one at a time
            num_workers=1,
            shuffle=False)
        with torch.no_grad():
            net.eval()
            embeddings = []
            for note in transform_loader:
                note = note.long().cuda()
                embedding = net.embeddings(note)

                embedding = embedding.cpu().numpy().astype("float32")
                embeddings.append(embedding[0])  # batch size is 1; take the only item

        notes_embedding = np.array(embeddings)
        np.save(os.path.join(check_sys_path(), outfile), notes_embedding)
    def __init__(self,
                 hidden_size,
                 attention_size,
                 num_classes,
                 word2vec_path=os.path.join(check_sys_path(),
                                            "glove.6B.50d.txt")):
        super(HAN, self).__init__()

        # load pre-trained embedding layer
        # pretrained_weight = load_pretrained_embedding(word2vec_path)
        # self.embedding = nn.Embedding.from_pretrained(pretrained_weight)

        # embed_dim = self.embedding.weight.shape[1]
        # TODO: load pre-trained weights
        embed_dim = 128
        self.embedding = nn.Embedding(len(word2idx), embed_dim)

        self.word_att = AttGRU(input_size=embed_dim,
                               hidden_size=hidden_size,
                               att_size=attention_size)
        self.sent_att = AttGRU(input_size=hidden_size * 2,
                               hidden_size=hidden_size,
                               att_size=attention_size)

        self.fc = nn.Linear(hidden_size * 2, num_classes)
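
    # A minimal sketch of the matching forward pass (not part of the original
    # snippet): it assumes AttGRU takes (batch, seq_len, input_size) and
    # returns an attention-pooled summary of shape (batch, hidden_size * 2).
    def forward(self, docs):
        # docs: (batch, num_sents, num_words) word-index tensor
        batch, num_sents, num_words = docs.shape
        x = self.embedding(docs.view(batch * num_sents, num_words))
        sent_vecs = self.word_att(x)  # one vector per sentence
        sent_vecs = sent_vecs.view(batch, num_sents, -1)
        doc_vec = self.sent_att(sent_vecs)  # one vector per document
        return self.fc(doc_vec)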

    transform("train_idx.npy", "train_%dembedding.npy" % args.embedding_dim)
    transform("val_idx.npy", "val_%dembedding.npy" % args.embedding_dim)

    # find the words closest to each medicine in embedding space, to evaluate
    print("finding the most relevant words for each medicine "
          "(nearest neighbours in embedding space)...")
    # TODO: test performance in t-SNE low-dimensional space

    idx_med = dict()
    with open(os.path.join(check_sys_path(), "med_idx.txt")) as f:
        for line in f:
            med, idx = line.split(":")
            idx_med[int(idx)] = med

    idx_word = dict()
    with open(os.path.join(check_sys_path(), "word_idx.txt")) as f:
        for line in f:
            word, idx = line.split(":")
            idx_word[int(idx)] = word

    print("calculating embeddings for all words and medicines")
    # get embedding of all words
    for med in idx_med.keys():  # drop medicine indices from the word index
        idx_word.pop(med, None)
    words = np.array(list(idx_word.keys()))
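    # The snippet is truncated before the neighbour search itself; a minimal
    # sketch of one way to finish it (cosine similarity and k=10 are
    # assumptions, not taken from the original):
    weight = net.embeddings.weight.detach().cpu().numpy()  # (vocab, embed_dim)
    word_vecs = weight[words]
    word_vecs = word_vecs / np.linalg.norm(word_vecs, axis=1, keepdims=True)
    for med_idx, med in idx_med.items():
        med_vec = weight[med_idx] / np.linalg.norm(weight[med_idx])
        sim = word_vecs @ med_vec  # cosine similarity to every word
        nearest = words[np.argsort(-sim)[:10]]
        print("%s: %s" % (med, ", ".join(idx_word[i] for i in nearest)))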
MEDICINE_COL = [
    "metoprolol", "furosemide", "lisinopril", "amlodipine", "atenolol",
    "hydrochlorothiazide", "diltiazem", "carvedilol"
]

parser = argparse.ArgumentParser(description='embedding model')
parser.add_argument('--min_freq', default=10, type=int,
                    help='minimum document frequency for CountVectorizer')
parser.add_argument('--max_df', default=1, type=float,
                    help='maximum document frequency for CountVectorizer')
args = parser.parse_args()

if __name__ == '__main__':
    # read data
    df = pd.read_csv(
        os.path.join(check_sys_path(), "discharge_notes_with_medication.csv"))
    df = df[df["admission_notes"].notna()]

    print("cleaning word...")
    # discharge_notes = df["discharge_notes"][train_idx].fillna("").tolist()
    admission_notes = df["admission_notes"].dropna().tolist()
    # normalize list markers, clock times, and remaining digits to placeholder
    # tokens, then mark sentence boundaries
    for i in range(len(admission_notes)):
        admission_notes[i] = re.sub(r"\d\. ", " ordernum ", admission_notes[i])
        admission_notes[i] = re.sub(r"\d\d:\d\d", " hourtime ", admission_notes[i])
        admission_notes[i] = re.sub(r"\d+", " num ", admission_notes[i])
        admission_notes[i] = re.sub("_", " ", admission_notes[i])
        admission_notes[i] = re.sub(r"\. ", " eos ", admission_notes[i])

    print("training nlp model...")
    vectorizer = CountVectorizer(min_df=args.min_freq,
                                 max_df=args.max_df)  # call truncated in the original; max_df assumed from the flag above
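    # plausible (assumed) continuation: fit on the cleaned notes to build the
    # bag-of-words matrix and the vocabulary
    bow = vectorizer.fit_transform(admission_notes)
    word2idx = vectorizer.vocabulary_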
    "past medical history", "past procedure", "social history",
    "family history", "initial exam", "admission medications",
    "pertinent results"
]
MEDICINE_COL = [
    "metoprolol", "furosemide", "lisinopril", "amlodipine", "atenolol",
    "hydrochlorothiazide", "diltiazem", "carvedilol"
]

parser = argparse.ArgumentParser(description='embedding model')
args = parser.parse_args()

if __name__ == '__main__':
    # read data
    df = pd.read_csv(
        os.path.join(check_sys_path(), "discharge_notes_with_medication.csv"))

    print("cleaning word...")
    df["admission_notes"] = ""
    for col in NOTES_COL:
        # normalize numbers and times to placeholder tokens in each section
        df[col] = df[col].str.lower().str.replace(r"\d\. ", " ordernum ",
                                                  regex=True)
        df[col] = df[col].str.replace(r"\d\d:\d\d", " hourtime ", regex=True)
        df[col] = df[col].str.replace(r"\d+", " num ", regex=True)
        df[col] = df[col].str.replace("_", " ", regex=True)
        df[col] = df[col].str.replace(r"\. ", " <eos> ", regex=True)
        # concatenate the cleaned sections into one admission note per row
        df["admission_notes"] = df["admission_notes"] + df[col]

    df = df.dropna(subset=["admission_notes"])
    admission_notes = df["admission_notes"].tolist()
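    # the evaluation code above reads word_idx.txt in "token:index" format, so
    # a plausible (assumed) next step is to build and persist that vocabulary
    vocab = {}
    for note in admission_notes:
        for token in note.split():
            vocab.setdefault(token, len(vocab))
    with open(os.path.join(check_sys_path(), "word_idx.txt"), "w") as f:
        for token, i in vocab.items():
            f.write("%s:%d\n" % (token, i))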