def build_inputs(all_sentences):
    """Embed every sentence and collect the embeddings with their lengths.

    Args:
        all_sentences: iterable of sentences; each one is passed unchanged to
            ``ELMO_MIMIC.get_embeddings``.

    Returns:
        A tuple ``(all_x, all_l)`` where ``all_x`` holds the embeddings
        returned for each sentence and ``all_l`` holds ``len()`` of each of
        those embeddings, in the same order as ``all_sentences``.
    """
    all_x = []
    all_l = []

    elmo_models = ELMO_MIMIC()

    # Close the session even when embedding a sentence fails, so the model's
    # session is not leaked on error.
    try:
        for sentence in all_sentences:
            embeddings = elmo_models.get_embeddings(sentence)
            all_x.append(embeddings)
            all_l.append(len(embeddings))
    finally:
        elmo_models.close_session()

    return all_x, all_l
def main():
    """Load the pickled training data and write it out as TF records."""
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this with pickles produced by a trusted preprocessing step.
    with open('data_orig.pickle', "rb") as pickle_file:
        train_c = pickle.load(pickle_file)

    # The embedder is created after the data has been read successfully.
    embedder = ELMO_MIMIC()
    write_tf_records(train_c, embedder)
    def __init__(self, models_path):
        """Prepare the ELMo embedder and the clinical concept graph.

        ``models_path`` must be an existing directory, otherwise
        FileNotFoundError is raised. The path is exported through the
        ``CCE_ASSETS`` environment variable.
        """
        if not os.path.isdir(models_path):
            raise FileNotFoundError
        os.environ['CCE_ASSETS'] = models_path

        # Label vocabulary; predictions index into this list.
        self.all_concept = [
            'O',
            'I-problem',
            'I-treatment',
            'I-test',
            'B-problem',
            'B-treatment',
            'B-test',
        ]

        # Both the ELMo embedder and the clinical concept graph are built
        # once here and reused for every prediction, which avoids paying the
        # construction cost on each call.
        self.elmo_model = ELMO_MIMIC()
        tf.compat.v1.reset_default_graph()
        session = tf.compat.v1.Session(config=config)
        graph_handles = build_clinical_graph(session=session)
        (self.y, self.x_placeHolder, self.l_placeHolder,
         self.clinical_session) = graph_handles
# Example #4
# 0
def main():
    """Convert the preprocessed pickles into TFRecord files.

    The beth and partners data are concatenated into one training set, which
    is written as ten shards named ``train_cv<N>.tfrecords``; the test data
    is written to ``test.tfrecords``. All output goes under ``save_dir``,
    which is created when missing.
    """
    all_concept = [
        '', 'problem', 'treatment', 'test', 'B-problem', 'B-treatment',
        'B-test'
    ]

    save_dir = '../data/preprocessed/tfrecords/'

    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    # Each pickle is read inside a ``with`` block so the file handle is
    # closed as soon as it has been loaded.
    # NOTE(review): pickle.load executes arbitrary code from the file; these
    # inputs are assumed to be trusted preprocessing outputs.
    with open('../data/preprocessed/pkl/beth.pkl', 'rb') as pkl_file:
        beth_t, beth_c = pickle.load(pkl_file)
    with open('../data/preprocessed/pkl/partners.pkl', 'rb') as pkl_file:
        partners_t, partners_c = pickle.load(pkl_file)
    # NOTE(review): the test split is read from 'text.pkl' - confirm that
    # this filename is intentional.
    with open('../data/preprocessed/pkl/text.pkl', 'rb') as pkl_file:
        test_t, test_c = pickle.load(pkl_file)

    train_t = beth_t + partners_t
    train_c = beth_c + partners_c

    elmo_model = ELMO_MIMIC()

    # Not used for cross-validation: only the held-out index set of each of
    # the 10 folds is used, which splits the training data into 10 shards.
    cv = KFold(n_splits=10, random_state=0, shuffle=True)

    for split_num, (_, valid_set) in enumerate(cv.split(train_c)):
        valid_t = [train_t[i] for i in valid_set]
        valid_c = [train_c[i] for i in valid_set]

        output_filename = save_dir + 'train_cv' + str(split_num) + '.tfrecords'

        write_tf_records(valid_t, valid_c, output_filename, all_concept,
                         elmo_model)

    output_filename = save_dir + 'test.tfrecords'

    write_tf_records(test_t, test_c, output_filename, all_concept, elmo_model)
class ClinicalConceptExtraction:
    """Label the tokens of a text with clinical concept tags.

    The ELMo embedder and the clinical concept graph are built once in the
    constructor and reused by every prediction call.
    """

    def __init__(self, models_path):
        """Build the embedder and the clinical concept graph.

        Args:
            models_path: directory holding the model assets; it is exported
                through the ``CCE_ASSETS`` environment variable.

        Raises:
            FileNotFoundError: if ``models_path`` is not an existing
                directory.
        """
        if not os.path.isdir(models_path):
            raise FileNotFoundError(
                'models_path is not an existing directory: {!r}'.format(
                    models_path))
        os.environ['CCE_ASSETS'] = models_path

        # Label vocabulary; decode_prediction indexes into this list.
        self.all_concept = [
            'O', 'I-problem', 'I-treatment', 'I-test', 'B-problem',
            'B-treatment', 'B-test'
        ]

        # Build both the ELMo embedder and the clinical concept graph once
        # here, so they do not have to be rebuilt for each prediction.
        self.elmo_model = ELMO_MIMIC()
        tf.compat.v1.reset_default_graph()
        self.y, self.x_placeHolder, self.l_placeHolder, self.clinical_session = build_clinical_graph(
            session=tf.compat.v1.Session(config=config))

    def decode_prediction(self, all_y, l):
        '''
        Map prediction output to concept labels
        ['O', 'I-problem', 'I-treatment', 'I-test', 'B-problem', 'B-treatment', 'B-test'].

        all_y: one 2-D block of label ids per sentence (rows are positions;
            presumably the columns are the votes of ensemble members - TODO
            confirm).
        l: number of leading rows to keep for each sentence.

        Returns one list of label strings per sentence, holding the most
        frequent label id of each kept row.
        '''
        all_y_ens = []
        for sentence_votes, length in zip(all_y, l):
            # Most frequent id across axis 1 for each kept row.
            best_v, _ = mode(sentence_votes[:length], axis=1)
            ann_ids = best_v.reshape(-1)
            all_y_ens.append([self.all_concept[idx] for idx in ann_ids])

        return all_y_ens

    def predict_concepts_labels(self, tokenized_sentences):
        '''
        Get embeddings for a batch of tokenized sentences and feed them to
        the clinical concept extraction model.

        Returns one list of concept labels per sentence, as produced by
        decode_prediction.
        '''
        embedds, embedds_lengths = self.elmo_model.get_embeddings(
            tokenized_sentences)
        # run() is given a one-element fetch list, so the first [0] selects
        # the value fetched for self.y and the second [0] its first element.
        all_y = [
            self.clinical_session.run([self.y],
                                      feed_dict={
                                          self.x_placeHolder: embedds,
                                          self.l_placeHolder: embedds_lengths
                                      })[0][0]
        ]
        # squeeze(axis=0) removes the one-element list wrapper added above.
        prediction = self.decode_prediction(np.squeeze(all_y, axis=0),
                                            embedds_lengths)
        return prediction

    @staticmethod
    def _batch_bounds(total, batch_size):
        '''
        Yield (start, stop) slice bounds that cover ``total`` items in order,
        ``batch_size`` items at a time; the last pair may be shorter.

        Yields nothing when ``total`` is zero. Raises ValueError when there
        are items to cover but ``batch_size`` is smaller than 1.
        '''
        if total <= 0:
            return
        if batch_size < 1:
            raise ValueError(
                'batch_size must be a positive integer, got {!r}'.format(
                    batch_size))
        for start in range(0, total, batch_size):
            yield start, min(start + batch_size, total)

    def extract_concepts(self, text, batch_size=1, as_one_batch=False):
        '''
        Predict a concept label for every token of ``text``.

        text: sample text; it is split into tokenized sentences and spans by
            parse_text.
        batch_size: number of sentences predicted per model call; it is
            capped at the number of sentences.
        as_one_batch: boolean indicating whether the whole text should be
            predicted as a single batch (overrides batch_size).

        Returns a list of [token, span, label] entries in text order. A text
        with no sentences yields an empty list. Raises ValueError when
        batch_size is smaller than 1 and as_one_batch is not set.
        '''
        start_time = time.time()
        concepts = []
        tokenized_sentences, all_spans, _ = parse_text(text)

        total = len(tokenized_sentences)
        if as_one_batch or batch_size > total:
            batch_size = total

        # A single pass over the slice bounds handles the full batches and
        # the shorter trailing batch alike, and is a no-op for empty input.
        for start, stop in self._batch_bounds(total, batch_size):
            batch_sentences_tokens = tokenized_sentences[start:stop]
            batch_spans = all_spans[start:stop]

            predictions = self.predict_concepts_labels(batch_sentences_tokens)

            for sent_tokens, sent_spans, sent_ann in zip(
                    batch_sentences_tokens, batch_spans, predictions):
                for token, span, annotation in zip(sent_tokens, sent_spans,
                                                   sent_ann):
                    concepts.append([token, span, annotation])

        print("\n\nTook ", time.time() - start_time, " Seconds to predict\n\n")

        # concepts is a list of [[token_0, span_0, label_0], [token_1, span_1, label_1], ..., [token_n, span_n, label_n]]
        return concepts