def build_inputs(all_sentences):
    all_x = []
    all_l = []
    elmo_models = ELMO_MIMIC()
    for sentence in all_sentences:
        embeddings = elmo_models.get_embeddings(sentence)
        all_x.append(embeddings)
        all_l.append(len(embeddings))
    elmo_models.close_session()
    return all_x, all_l
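# A minimal usage sketch for build_inputs (illustrative only, not from the
# original data). It assumes each sentence is already tokenized into a list of
# tokens, which is what the per-sentence call to get_embeddings suggests.
sample_sentences = [
    ['Patient', 'denies', 'chest', 'pain', '.'],
    ['Started', 'on', 'aspirin', '.'],
]
# all_x holds one ELMo embedding array per sentence, all_l the matching lengths
all_x, all_l = build_inputs(sample_sentences)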
def main():
    with open('data_orig.pickle', "rb") as input_file:
        train_c = pickle.load(input_file)

    elmo_model = ELMO_MIMIC()
    write_tf_records(train_c, elmo_model)
def main():
    all_concept = [
        '', 'problem', 'treatment', 'test', 'B-problem', 'B-treatment',
        'B-test'
    ]

    save_dir = '../data/preprocessed/tfrecords/'
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    beth_t, beth_c = pickle.load(
        open('../data/preprocessed/pkl/beth.pkl', 'rb'))
    partners_t, partners_c = pickle.load(
        open('../data/preprocessed/pkl/partners.pkl', 'rb'))
    test_t, test_c = pickle.load(
        open('../data/preprocessed/pkl/text.pkl', 'rb'))

    train_t = beth_t + partners_t
    train_c = beth_c + partners_c

    elmo_model = ELMO_MIMIC()

    # not used for cross-validation, only to split the training data into 10 shards
    cv = KFold(n_splits=10, random_state=0, shuffle=True)

    split_num = 0
    for _, valid_set in cv.split(train_c):
        valid_t = [train_t[i] for i in valid_set]
        valid_c = [train_c[i] for i in valid_set]
        output_filename = save_dir + 'train_cv' + str(split_num) + '.tfrecords'
        write_tf_records(valid_t, valid_c, output_filename, all_concept,
                         elmo_model)
        split_num += 1

    output_filename = save_dir + 'test.tfrecords'
    write_tf_records(test_t, test_c, output_filename, all_concept, elmo_model)
class ClinicalConceptExtraction:
    def __init__(self, models_path):
        if not os.path.isdir(models_path):
            raise FileNotFoundError
        os.environ['CCE_ASSETS'] = models_path
        self.all_concept = [
            'O', 'I-problem', 'I-treatment', 'I-test', 'B-problem',
            'B-treatment', 'B-test'
        ]
        # Build both the ELMo and clinical concept extraction graphs once, so
        # they do not have to be rebuilt for every prediction; this saves time
        # and speeds up the prediction process.
        self.elmo_model = ELMO_MIMIC()
        tf.compat.v1.reset_default_graph()
        self.y, self.x_placeHolder, self.l_placeHolder, self.clinical_session = build_clinical_graph(
            session=tf.compat.v1.Session(config=config))

    def decode_prediction(self, all_y, l):
        '''
        Map prediction ids to the concept labels
        ['O', 'I-problem', 'I-treatment', 'I-test', 'B-problem', 'B-treatment', 'B-test'].
        '''
        all_y_ens = []
        for i in range(len(l)):
            best_v, _ = mode(all_y[i][:l[i]], axis=1)
            ann_ids = best_v.reshape(-1)
            ann = [self.all_concept[i] for i in ann_ids]
            all_y_ens.append(ann)
        return all_y_ens

    def predict_concepts_labels(self, tokenized_sentences):
        '''
        Get embeddings for a batch of tokenized sentences and feed them to the
        clinical concept extraction model.
        '''
        embedds, embedds_lengths = self.elmo_model.get_embeddings(
            tokenized_sentences)
        all_y = [
            self.clinical_session.run(
                [self.y],
                feed_dict={
                    self.x_placeHolder: embedds,
                    self.l_placeHolder: embedds_lengths
                })[0][0]
        ]
        prediction = self.decode_prediction(np.squeeze(all_y, axis=0),
                                            embedds_lengths)
        return prediction

    def extract_concepts(self, text, batch_size=1, as_one_batch=False):
        '''
        text: the input text to annotate
        as_one_batch: boolean indicating whether to predict the whole text as one batch
        '''
        start_time = time.time()
        concepts = []
        tokenized_sentences, all_spans, normalized_text = parse_text(text)

        if (batch_size > len(tokenized_sentences)) or as_one_batch:
            batch_size = len(tokenized_sentences)

        number_of_batches = int(len(tokenized_sentences) / batch_size)
        remaining_batches = len(tokenized_sentences) % batch_size

        for batch_number in range(number_of_batches):
            batch_sentences_tokens = tokenized_sentences[
                batch_number * batch_size:(batch_number * batch_size) + batch_size]
            batch_spans = all_spans[
                batch_number * batch_size:(batch_number * batch_size) + batch_size]
            predictions = self.predict_concepts_labels(batch_sentences_tokens)

            for sent_tokens, sent_spans, sent_ann in zip(
                    batch_sentences_tokens, batch_spans, predictions):
                for token, span, annotation in zip(sent_tokens, sent_spans,
                                                   sent_ann):
                    concepts.append([token, span, annotation])

        # predict the remaining last batch
        if remaining_batches > 0:
            remaining_last_batch = tokenized_sentences[number_of_batches * batch_size:]
            remaining_last_spans = all_spans[number_of_batches * batch_size:]
            predictions = self.predict_concepts_labels(remaining_last_batch)

            for sent_tokens, sent_spans, sent_ann in zip(
                    remaining_last_batch, remaining_last_spans, predictions):
                for token, span, annotation in zip(sent_tokens, sent_spans,
                                                   sent_ann):
                    concepts.append([token, span, annotation])

        print("\n\nTook ", time.time() - start_time, " Seconds to predict\n\n")

        # concepts is a list of [[token_0, span_0, label_0], [token_1, span_1, label_1], ..., [token_n, span_n, label_n]]
        return concepts
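# A minimal usage sketch of ClinicalConceptExtraction (illustrative only). The
# './cce_assets' path and the sample note are hypothetical placeholders; point
# models_path at wherever the trained model assets actually live.
if __name__ == '__main__':
    extractor = ClinicalConceptExtraction(models_path='./cce_assets')

    note = 'The patient was started on aspirin for chest pain.'
    annotations = extractor.extract_concepts(note, batch_size=1)

    # each entry is [token, span, BIO label], as returned by extract_concepts
    for token, span, label in annotations:
        print(token, span, label)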