def get_re_predictions(test_ehr: HealthRecord) -> HealthRecord:
    """
    Get predictions for Relation Extraction.

    Parameters
    ----------
    test_ehr : HealthRecord
        A HealthRecord object with entities set.

    Returns
    -------
    HealthRecord
        The original object with relations set.
    """
    test_dataset = RETestDataset(test_ehr, biobert_ner_tokenizer,
                                 BIOBERT_RE_SEQ_LEN, re_label_list)

    if len(test_dataset) == 0:
        test_ehr.relations = []
        return test_ehr

    re_predictions = biobert_re_trainer.predict(test_dataset=test_dataset).predictions
    re_predictions = np.argmax(re_predictions, axis=1)

    idx = 1
    rel_preds = []
    for relation, pred in zip(test_dataset.relation_list, re_predictions):
        if pred == 1:
            relation.ann_id = "R%d" % idx
            idx += 1
            rel_preds.append(relation)

    test_ehr.relations = rel_preds
    return test_ehr
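# Illustrative end-to-end sketch (not part of the original module): run NER first
# so the record has entities, then add relations with get_re_predictions. It
# assumes the module-level models and tokenizers referenced above are already
# loaded; `ehr_text` and the record id "demo" are hypothetical caller-supplied
# values.
def _example_ner_re_pipeline(ehr_text: str) -> HealthRecord:
    record = get_ner_predictions(ehr_text, model_name="biobert", record_id="demo")
    record = get_re_predictions(record)
    # Relations predicted as positive get sequential ids R1, R2, ...
    for rel in record.relations:
        print(rel.ann_id, rel.get_entities())
    return record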
def get_bilstm_ner_predictions(test_ehr: HealthRecord) -> List[Tuple[str, int, int]]:
    """
    Get predictions for a single EHR record using BiLSTM.

    Parameters
    ----------
    test_ehr : HealthRecord
        The EHR record. This object should have a tokenizer set.

    Returns
    -------
    pred_entities : List[Tuple[str, int, int]]
        List of predicted entities, each in the format
        ("entity", start_idx, end_idx).
    """
    split_points = test_ehr.get_split_points(max_len=BILSTM_NER_SEQ_LEN)
    examples = []

    for idx in range(len(split_points) - 1):
        words = test_ehr.tokens[split_points[idx]:split_points[idx + 1]]
        examples.append(words)

    predictions = bilstm_learn.predict(examples)

    pred_entities = []
    for idx in range(len(split_points) - 1):
        chunk_pred = get_chunks(predictions[idx])
        for ent in chunk_pred:
            # Map token-level chunk boundaries back to character offsets
            pred_entities.append(
                (ent[0],
                 test_ehr.get_char_idx(split_points[idx] + ent[1])[0],
                 test_ehr.get_char_idx(split_points[idx] + ent[2])[1]))

    return pred_entities
def get_ner_predictions(ehr_record: str,
                        model_name: str = "biobert",
                        record_id: str = "1") -> HealthRecord:
    """
    Get predictions for NER using either BioBERT or BiLSTM.

    Parameters
    ----------
    ehr_record : str
        An EHR record in text format.

    model_name : str
        The model to use for prediction. Default is "biobert".

    record_id : str
        The record id of the returned object. Default is "1".

    Returns
    -------
    HealthRecord
        A HealthRecord object with entities set.
    """
    if model_name.lower() == "biobert":
        test_ehr = HealthRecord(record_id=record_id,
                                text=ehr_record,
                                tokenizer=biobert_ner_tokenizer.tokenize,
                                is_bert_tokenizer=True,
                                is_training=False)
        predictions = get_biobert_ner_predictions(test_ehr)

    elif model_name.lower() == "bilstm":
        test_ehr = HealthRecord(record_id=record_id,
                                text=ehr_record,
                                tokenizer=scispacy_plus_tokenizer,
                                is_bert_tokenizer=False,
                                is_training=False)
        predictions = get_bilstm_ner_predictions(test_ehr)

    else:
        raise AttributeError("Accepted model names include 'biobert' "
                             "and 'bilstm'.")

    ent_preds = []
    for i, pred in enumerate(predictions):
        ent = Entity("T%d" % i, label_ent_map[pred[0]], [pred[1], pred[2]])
        ent_text = test_ehr.text[ent[0]:ent[1]]

        # Skip entities whose text contains no alphanumeric characters
        if not any(letter.isalnum() for letter in ent_text):
            continue

        ent.set_text(ent_text)
        ent_preds.append(ent)

    test_ehr.entities = ent_preds
    return test_ehr
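# Illustrative sketch (not in the original code): run both NER back-ends on the
# same hypothetical free-text note and compare how many entities each predicts.
# `note_text` is an assumed input string.
def _example_compare_ner_backends(note_text: str) -> None:
    biobert_record = get_ner_predictions(note_text, model_name="biobert")
    bilstm_record = get_ner_predictions(note_text, model_name="bilstm")
    print("BioBERT entities:", len(biobert_record.entities))
    print("BiLSTM entities:", len(bilstm_record.entities))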
def get_biobert_ner_predictions(test_ehr: HealthRecord) -> List[Tuple[str, int, int]]:
    """
    Get predictions for a single EHR record using BioBERT.

    Parameters
    ----------
    test_ehr : HealthRecord
        The EHR record. This object should have a tokenizer set.

    Returns
    -------
    pred_entities : List[Tuple[str, int, int]]
        List of predicted entities, each in the format
        ("entity", start_idx, end_idx).
    """
    split_points = test_ehr.get_split_points(max_len=BIOBERT_SEQ_LEN - 2)
    examples = []

    for idx in range(len(split_points) - 1):
        words = test_ehr.tokens[split_points[idx]:split_points[idx + 1]]
        examples.append(NerExample(guid=str(split_points[idx]),
                                   words=words,
                                   labels=["O"] * len(words)))

    input_features = convert_examples_to_features(
        examples, biobert_ner_labels,
        max_seq_length=BIOBERT_SEQ_LEN,
        tokenizer=biobert_ner_tokenizer,
        cls_token_at_end=False,
        cls_token=biobert_ner_tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=biobert_ner_tokenizer.sep_token,
        sep_token_extra=False,
        pad_on_left=bool(biobert_ner_tokenizer.padding_side == "left"),
        pad_token=biobert_ner_tokenizer.pad_token_id,
        pad_token_segment_id=biobert_ner_tokenizer.pad_token_type_id,
        pad_token_label_id=nn.CrossEntropyLoss().ignore_index)

    predictions, _, _ = biobert_ner_trainer.predict(input_features)
    predictions = align_predictions(predictions)

    pred_entities = []
    for idx in range(len(split_points) - 1):
        chunk_pred = get_chunks(predictions[idx])
        for ent in chunk_pred:
            pred_entities.append((ent[0],
                                  test_ehr.get_char_idx(split_points[idx] + ent[1] - 1)[0],
                                  test_ehr.get_char_idx(split_points[idx] + ent[2] - 1)[1]))

    return pred_entities
def get_biobert_ner_predictions(test_ehr: HealthRecord) -> List[Tuple[str, int, int]]:
    """
    Get predictions for a single EHR record using BioBERT.

    Parameters
    ----------
    test_ehr : HealthRecord
        The EHR record. This object should have a tokenizer set.

    Returns
    -------
    pred_entities : List[Tuple[str, int, int]]
        List of predicted entities, each in the format
        ("entity", start_idx, end_idx).
    """
    split_points = test_ehr.get_split_points(max_len=BIOBERT_NER_SEQ_LEN - 2)
    examples = []

    for idx in range(len(split_points) - 1):
        words = test_ehr.tokens[split_points[idx]:split_points[idx + 1]]
        examples.append(NerExample(guid=str(split_points[idx]),
                                   words=words,
                                   labels=["O"] * len(words)))

    input_features = convert_examples_to_features(
        examples, biobert_ner_labels,
        max_seq_length=BIOBERT_NER_SEQ_LEN,
        tokenizer=biobert_ner_tokenizer,
        cls_token_at_end=False,
        cls_token=biobert_ner_tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=biobert_ner_tokenizer.sep_token,
        sep_token_extra=False,
        pad_on_left=bool(biobert_ner_tokenizer.padding_side == "left"),
        pad_token=biobert_ner_tokenizer.pad_token_id,
        pad_token_segment_id=biobert_ner_tokenizer.pad_token_type_id,
        pad_token_label_id=nn.CrossEntropyLoss().ignore_index,
        verbose=0)

    test_dataset = NerTestDataset(input_features)

    predictions, label_ids, _ = biobert_ner_trainer.predict(test_dataset)
    predictions = align_predictions(predictions, label_ids)

    # Flatten the prediction list
    predictions = [p for ex in predictions for p in ex]

    input_tokens = test_ehr.get_tokens()
    prev_pred = ""
    final_predictions = []
    idx = 0

    # Expand word-level predictions back to WordPiece tokens: a "##" sub-token
    # inherits the type of the previous prediction as an I- tag, while every
    # other token consumes the next aligned prediction.
    for token in input_tokens:
        if token.startswith("##"):
            if prev_pred == "O":
                final_predictions.append(prev_pred)
            else:
                pred_typ = prev_pred.split("-")[-1]
                final_predictions.append("I-" + pred_typ)
        else:
            prev_pred = predictions[idx]
            final_predictions.append(prev_pred)
            idx += 1

    pred_entities = []
    chunk_pred = get_chunks(final_predictions)
    for ent in chunk_pred:
        pred_entities.append((ent[0],
                              test_ehr.get_char_idx(ent[1])[0],
                              test_ehr.get_char_idx(ent[2])[1]))

    return pred_entities
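# Illustrative sketch showing direct use of this low-level helper, mirroring the
# "biobert" branch of get_ner_predictions above. The record id "demo" and input
# text are hypothetical; the tokenizer is assumed to be the module-level BioBERT
# NER tokenizer.
def _example_biobert_entity_tuples(ehr_text: str) -> List[Tuple[str, int, int]]:
    record = HealthRecord(record_id="demo",
                          text=ehr_text,
                          tokenizer=biobert_ner_tokenizer.tokenize,
                          is_bert_tokenizer=True,
                          is_training=False)
    # Returns ("entity", start_idx, end_idx) tuples over character offsets
    return get_biobert_ner_predictions(record)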
def read_data(data_dir: str = 'data/',
              tokenizer: Callable[[str], List[str]] = None,
              is_bert_tokenizer: bool = True,
              verbose: int = 0) -> Tuple[List[HealthRecord], List[HealthRecord]]:
    """
    Reads train and test data.

    Parameters
    ----------
    data_dir : str, optional
        Directory where the data is located. It should contain
        subdirectories named 'train' and 'test'. The default is 'data/'.

    tokenizer : Callable[[str], List[str]], optional
        The tokenizer function to use. The default is None.

    is_bert_tokenizer : bool
        Whether the tokenizer is a BERT-based WordPiece tokenizer.

    verbose : int, optional
        1 to print reading progress, 0 otherwise. The default is 0.

    Returns
    -------
    Tuple[List[HealthRecord], List[HealthRecord]]
        Train data, Test data.
    """
    train_path = os.path.join(data_dir, "train")
    test_path = os.path.join(data_dir, "test")

    # Get all record IDs for train and test data
    train_ids = list(set(['.'.join(fname.split('.')[:-1])
                          for fname in os.listdir(train_path)
                          if not fname.startswith('.')]))

    test_ids = list(set(['.'.join(fname.split('.')[:-1])
                         for fname in os.listdir(test_path)
                         if not fname.startswith('.')]))

    if verbose == 1:
        print("Train data:")

    train_data = []
    for idx, fid in enumerate(train_ids):
        record = HealthRecord(fid,
                              text_path=os.path.join(train_path, fid + '.txt'),
                              ann_path=os.path.join(train_path, fid + '.ann'),
                              tokenizer=tokenizer,
                              is_bert_tokenizer=is_bert_tokenizer)
        train_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(train_ids))

    if verbose == 1:
        print('\n\nTest Data:')

    test_data = []
    for idx, fid in enumerate(test_ids):
        record = HealthRecord(fid,
                              text_path=os.path.join(test_path, fid + '.txt'),
                              ann_path=os.path.join(test_path, fid + '.ann'),
                              tokenizer=tokenizer,
                              is_bert_tokenizer=is_bert_tokenizer)
        test_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(test_ids))

    return train_data, test_data
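# Illustrative usage sketch (assumptions: the default 'data/train' / 'data/test'
# layout described in the docstring exists on disk, and the module-level BioBERT
# NER tokenizer is the WordPiece tokenizer to use).
def _example_load_corpus() -> Tuple[List[HealthRecord], List[HealthRecord]]:
    train_data, test_data = read_data(data_dir='data/',
                                      tokenizer=biobert_ner_tokenizer.tokenize,
                                      is_bert_tokenizer=True,
                                      verbose=1)
    print("Train records:", len(train_data), "Test records:", len(test_data))
    return train_data, test_data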
def generate_re_test_file(ehr_record: HealthRecord,
                          max_len: int = 128) -> Tuple[List[str], List[Relation]]:
    """
    Generates the test file for Relation Extraction.

    Parameters
    ----------
    ehr_record : HealthRecord
        The EHR record with entities set.

    max_len : int
        The maximum length of a sequence.

    Returns
    -------
    Tuple[List[str], List[Relation]]
        List of sequences with each entity replaced by its tag,
        and a list of Relation objects representing the relations
        in those sequences.
    """
    random.seed(0)

    re_text_list = []
    relation_list = []

    text = ehr_record.text
    entities = ehr_record.get_entities()
    if isinstance(entities, dict):
        entities = list(entities.values())

    # Get the character split points
    char_split_points = get_char_split_points(ehr_record, max_len)

    start = 0
    end = char_split_points[0]

    for i in range(len(char_split_points)):
        # Keep only the entities that fall within the split text
        range_entities = [ent for ent in
                          filter(lambda item: int(item[0]) >= start and int(item[1]) <= end,
                                 entities)]

        # Get all possible relations within the split text
        possible_relations = utils.map_entities(range_entities)

        for rel, label in possible_relations:
            split_text = text[start:end]
            split_offset = start

            ent1 = rel.get_entities()[0]
            ent2 = rel.get_entities()[1]

            # Check that both entities lie within the split text
            if ent1[0] >= start and ent1[1] < end and \
                    ent2[0] >= start and ent2[1] < end:
                modified_text = replace_entity_text(split_text, ent1, ent2, split_offset)

                # Replace newlines and tabs with spaces
                final_text = modified_text.replace('\n', ' ').replace('\t', ' ')

                re_text_list.append(final_text)
                relation_list.append(rel)

        start = end
        if i != len(char_split_points) - 1:
            end = char_split_points[i + 1]
        else:
            end = len(text) + 1

    assert len(re_text_list) == len(relation_list)

    return re_text_list, relation_list
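# Illustrative sketch (assumption: `record` already has entities set, e.g. gold
# annotations loaded by read_data or predictions from the NER step). It shows the
# parallel outputs: one tagged text sequence per candidate relation.
def _example_re_sequences(record: HealthRecord) -> None:
    re_texts, relations = generate_re_test_file(record, max_len=128)
    for seq, rel in zip(re_texts[:3], relations[:3]):
        print(rel.get_entities(), "->", seq[:80])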
def read_data(data_dir: str = 'data/',
              train_ratio: float = 0.8,
              tokenizer: Callable[[str], List[str]] = None,
              verbose: int = 0) -> Tuple[List[HealthRecord], List[HealthRecord]]:
    """
    Reads train and test data.

    Parameters
    ----------
    data_dir : str, optional
        Directory where the data is located. The default is 'data/'.

    train_ratio : float, optional
        Fraction of the data used for training. The default is 0.8.

    tokenizer : Callable[[str], List[str]], optional
        The tokenizer function to use. The default is None.

    verbose : int, optional
        1 to print reading progress, 0 otherwise. The default is 0.

    Returns
    -------
    Tuple[List[HealthRecord], List[HealthRecord]]
        Train data, Test data.
    """
    # Get all the IDs of the data files
    file_ids = sorted(list(set(['.'.join(fname.split('.')[:-1])
                                for fname in os.listdir(data_dir)
                                if not fname.startswith('.')])))

    # Split the IDs into random training and test sets
    random.seed(0)
    random.shuffle(file_ids)

    split_idx = int(train_ratio * len(file_ids))
    train_ids = file_ids[:split_idx]
    test_ids = file_ids[split_idx:]

    if verbose == 1:
        print("Train data:")

    train_data = []
    for idx, fid in enumerate(train_ids):
        record = HealthRecord(fid,
                              text_path=data_dir + fid + '.txt',
                              ann_path=data_dir + fid + '.ann',
                              tokenizer=tokenizer)
        train_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, split_idx)

    if verbose == 1:
        print('\n\nTest Data:')

    test_data = []
    for idx, fid in enumerate(test_ids):
        record = HealthRecord(fid,
                              text_path=data_dir + fid + '.txt',
                              ann_path=data_dir + fid + '.ann',
                              tokenizer=tokenizer)
        test_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(file_ids) - split_idx)

    return train_data, test_data
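# Illustrative sketch for this ratio-based variant (assumptions: all .txt/.ann
# pairs live directly in 'data/' rather than in train/test subdirectories, and
# scispacy_plus_tokenizer is an acceptable word-level tokenizer for it).
def _example_ratio_split() -> Tuple[List[HealthRecord], List[HealthRecord]]:
    # 80% of the shuffled record ids go to training, the rest to testing
    return read_data(data_dir='data/',
                     train_ratio=0.8,
                     tokenizer=scispacy_plus_tokenizer,
                     verbose=0)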