def get_ner_predictions(ehr_record: str, model_name: str = "biobert",
                        record_id: str = "1") -> HealthRecord:
    """
    Get predictions for NER using either BioBERT or BiLSTM

    Parameters
    --------------
    ehr_record : str
        An EHR record in text format.

    model_name : str
        The model to use for prediction. Default is biobert.

    record_id : str
        The record id of the returned object. Default is 1.

    Returns
    -----------
    A HealthRecord object with entities set.
    """
    if model_name.lower() == "biobert":
        test_ehr = HealthRecord(record_id=record_id,
                                text=ehr_record,
                                tokenizer=biobert_ner_tokenizer.tokenize,
                                is_bert_tokenizer=True,
                                is_training=False)
        predictions = get_biobert_ner_predictions(test_ehr)

    elif model_name.lower() == "bilstm":
        test_ehr = HealthRecord(text=ehr_record,
                                tokenizer=scispacy_plus_tokenizer,
                                is_bert_tokenizer=False,
                                is_training=False)
        predictions = get_bilstm_ner_predictions(test_ehr)

    else:
        raise ValueError("Accepted model names include 'biobert' "
                         "and 'bilstm'.")

    ent_preds = []
    for i, pred in enumerate(predictions):
        ent = Entity("T%d" % i, label_ent_map[pred[0]], [pred[1], pred[2]])
        ent_text = test_ehr.text[ent[0]:ent[1]]

        # Skip spans with no alphanumeric characters
        # (stray punctuation or whitespace predictions)
        if not any(letter.isalnum() for letter in ent_text):
            continue

        ent.set_text(ent_text)
        ent_preds.append(ent)

    test_ehr.entities = ent_preds
    return test_ehr
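
# Example usage (a minimal sketch; the sample note is illustrative and the
# module-level models/tokenizers referenced above are assumed to be loaded):
#
#   record = get_ner_predictions("Patient was started on 40 mg "
#                                "atorvastatin daily.", model_name="biobert")
#   for ent in record.entities:
#       print(ent)
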
def read_data(data_dir: str = 'data/',
              tokenizer: Callable[[str], List[str]] = None,
              is_bert_tokenizer: bool = True,
              verbose: int = 0) -> Tuple[List[HealthRecord], List[HealthRecord]]:
    """
    Reads train and test data

    Parameters
    ----------
    data_dir : str, optional
        Directory where the data is located. It should have
        directories named 'train' and 'test'. The default is 'data/'.

    tokenizer : Callable[[str], List[str]], optional
        The tokenizer function to use. The default is None.

    is_bert_tokenizer : bool
        True if the tokenizer is a BERT-based WordPiece tokenizer.

    verbose : int, optional
        1 to print reading progress, 0 otherwise. The default is 0.

    Returns
    -------
    Tuple[List[HealthRecord], List[HealthRecord]]
        Train data, Test data.
    """
    train_path = os.path.join(data_dir, "train")
    test_path = os.path.join(data_dir, "test")

    # Get all IDs for train and test data
    train_ids = list(set(['.'.join(fname.split('.')[:-1])
                          for fname in os.listdir(train_path)
                          if not fname.startswith('.')]))

    test_ids = list(set(['.'.join(fname.split('.')[:-1])
                         for fname in os.listdir(test_path)
                         if not fname.startswith('.')]))

    if verbose == 1:
        print("Train data:")

    train_data = []
    for idx, fid in enumerate(train_ids):
        record = HealthRecord(fid,
                              text_path=os.path.join(train_path, fid + '.txt'),
                              ann_path=os.path.join(train_path, fid + '.ann'),
                              tokenizer=tokenizer,
                              is_bert_tokenizer=is_bert_tokenizer)
        train_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(train_ids))

    if verbose == 1:
        print('\n\nTest Data:')

    test_data = []
    for idx, fid in enumerate(test_ids):
        record = HealthRecord(fid,
                              text_path=os.path.join(test_path, fid + '.txt'),
                              ann_path=os.path.join(test_path, fid + '.ann'),
                              tokenizer=tokenizer,
                              is_bert_tokenizer=is_bert_tokenizer)
        test_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(test_ids))

    return train_data, test_data
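
# Example usage (a minimal sketch, assuming 'data/train' and 'data/test'
# hold matching .txt/.ann file pairs and that biobert_ner_tokenizer from
# above is available):
#
#   train_data, test_data = read_data(data_dir='data/',
#                                     tokenizer=biobert_ner_tokenizer.tokenize,
#                                     is_bert_tokenizer=True,
#                                     verbose=1)
#   print(len(train_data), len(test_data))
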
def read_data(data_dir: str = 'data/',
              train_ratio: float = 0.8,
              tokenizer: Callable[[str], List[str]] = None,
              verbose: int = 0) -> Tuple[List[HealthRecord], List[HealthRecord]]:
    """
    Reads train and test data

    Parameters
    ----------
    data_dir : str, optional
        Directory where the data is located. The default is 'data/'.

    train_ratio : float, optional
        Fraction of the data to use for training. The default is 0.8.

    tokenizer : Callable[[str], List[str]], optional
        The tokenizer function to use. The default is None.

    verbose : int, optional
        1 to print reading progress, 0 otherwise. The default is 0.

    Returns
    -------
    Tuple[List[HealthRecord], List[HealthRecord]]
        Train data, Test data.
    """
    # Get all the IDs of data
    file_ids = sorted(list(set(['.'.join(fname.split('.')[:-1])
                                for fname in os.listdir(data_dir)
                                if not fname.startswith('.')])))

    # Split IDs into random train and test sets, with a fixed
    # seed so the split is reproducible across runs
    random.seed(0)
    random.shuffle(file_ids)

    split_idx = int(train_ratio * len(file_ids))
    train_ids = file_ids[:split_idx]
    test_ids = file_ids[split_idx:]

    if verbose == 1:
        print("Train data:")

    train_data = []
    for idx, fid in enumerate(train_ids):
        record = HealthRecord(fid,
                              text_path=os.path.join(data_dir, fid + '.txt'),
                              ann_path=os.path.join(data_dir, fid + '.ann'),
                              tokenizer=tokenizer)
        train_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, split_idx)

    if verbose == 1:
        print('\n\nTest Data:')

    test_data = []
    for idx, fid in enumerate(test_ids):
        record = HealthRecord(fid,
                              text_path=os.path.join(data_dir, fid + '.txt'),
                              ann_path=os.path.join(data_dir, fid + '.ann'),
                              tokenizer=tokenizer)
        test_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(file_ids) - split_idx)

    return train_data, test_data
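
# Example usage (a minimal sketch, assuming a flat 'data/' directory of
# .txt/.ann pairs). Because the shuffle is seeded, repeated calls return
# the same 80/20 partition:
#
#   train_data, test_data = read_data(data_dir='data/', train_ratio=0.8,
#                                     tokenizer=scispacy_plus_tokenizer)
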