def get_ner_predictions(ehr_record: str,
                        model_name: str = "biobert",
                        record_id: str = "1") -> HealthRecord:
    """
    Get predictions for NER using either BioBERT or BiLSTM

    Parameters
    --------------
    ehr_record : str
        An EHR record in text format.

    model_name : str
        The model to use for prediction, either 'biobert' or 'bilstm'.
        The default is 'biobert'.

    record_id : str
        The record id of the returned object. Default is 1.

    Returns
    -----------
    A HealthRecord object with entities set.
    """
    if model_name.lower() == "biobert":
        test_ehr = HealthRecord(record_id=record_id,
                                text=ehr_record,
                                tokenizer=biobert_ner_tokenizer.tokenize,
                                is_bert_tokenizer=True,
                                is_training=False)

        predictions = get_biobert_ner_predictions(test_ehr)

    elif model_name.lower() == "bilstm":
        test_ehr = HealthRecord(record_id=record_id,
                                text=ehr_record,
                                tokenizer=scispacy_plus_tokenizer,
                                is_bert_tokenizer=False,
                                is_training=False)
        predictions = get_bilstm_ner_predictions(test_ehr)

    else:
        raise ValueError("Accepted model names are 'biobert' "
                         "and 'bilstm'.")

    ent_preds = []
    for i, pred in enumerate(predictions):
        # Each prediction is a (label, start_char, end_char) triple
        ent = Entity("T%d" % i, label_ent_map[pred[0]], [pred[1], pred[2]])
        ent_text = test_ehr.text[ent[0]:ent[1]]

        # Skip spans that contain no alphanumeric characters
        if not any(letter.isalnum() for letter in ent_text):
            continue

        ent.set_text(ent_text)
        ent_preds.append(ent)

    test_ehr.entities = ent_preds
    return test_ehr
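

# Illustrative usage sketch (not part of the original module): it assumes the
# globals referenced above (biobert_ner_tokenizer, label_ent_map and the
# get_biobert_ner_predictions helper) have already been loaded, and the
# sample text below is hypothetical.
if __name__ == "__main__":
    sample_text = "Patient was started on 40 mg lisinopril daily for hypertension."
    record = get_ner_predictions(sample_text, model_name="biobert", record_id="demo")
    for entity in record.entities:
        print(entity)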
def read_data(data_dir: str = 'data/',
              tokenizer: Callable[[str], List[str]] = None,
              is_bert_tokenizer: bool = True,
              verbose: int = 0) -> Tuple[List[HealthRecord], List[HealthRecord]]:
    """
    Reads train and test data

    Parameters
    ----------
    data_dir : str, optional
        Directory where the data is located.
        It should have directories named 'train' and 'test'
        The default is 'data/'.

    tokenizer : Callable[[str], List[str]], optional
        The tokenizer function to use. The default is None.

    is_bert_tokenizer : bool
        If the tokenizer is a BERT-based WordPiece tokenizer

    verbose : int, optional
        1 to print reading progress, 0 otherwise. The default is 0.

    Returns
    -------
    Tuple[List[HealthRecord], List[HealthRecord]]
        Train data, Test data.

    """
    train_path = os.path.join(data_dir, "train")
    test_path = os.path.join(data_dir, "test")

    # Get all IDs for train and test data
    train_ids = list(set(['.'.join(fname.split('.')[:-1])
                          for fname in os.listdir(train_path)
                          if not fname.startswith('.')]))

    test_ids = list(set(['.'.join(fname.split('.')[:-1])
                         for fname in os.listdir(test_path)
                         if not fname.startswith('.')]))

    if verbose == 1:
        print("Train data:")

    train_data = []
    for idx, fid in enumerate(train_ids):
        record = HealthRecord(fid, text_path=os.path.join(train_path, fid + '.txt'),
                              ann_path=os.path.join(train_path, fid + '.ann'),
                              tokenizer=tokenizer,
                              is_bert_tokenizer=is_bert_tokenizer)
        train_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(train_ids))

    if verbose == 1:
        print('\n\nTest Data:')

    test_data = []
    for idx, fid in enumerate(test_ids):
        record = HealthRecord(fid, text_path=os.path.join(test_path, fid + '.txt'),
                              ann_path=os.path.join(test_path, fid + '.ann'),
                              tokenizer=tokenizer,
                              is_bert_tokenizer=is_bert_tokenizer)
        test_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(test_ids))

    return train_data, test_data
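

# Example call (a sketch, not from the source): assumes data/ contains
# 'train' and 'test' subdirectories of paired .txt/.ann files, and reuses
# the BioBERT tokenizer referenced in the snippet above.
if __name__ == "__main__":
    train_data, test_data = read_data(data_dir='data/',
                                      tokenizer=biobert_ner_tokenizer.tokenize,
                                      is_bert_tokenizer=True,
                                      verbose=1)
    print("\nRead %d train and %d test records" % (len(train_data), len(test_data)))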
def read_data(data_dir: str = 'data/',
              train_ratio: float = 0.8,
              tokenizer: Callable[[str], List[str]] = None,
              verbose: int = 0) -> Tuple[List[HealthRecord], List[HealthRecord]]:
    """
    Reads train and test data

    Parameters
    ----------
    data_dir : str, optional
        Directory where the data is located. The default is 'data/'.

    train_ratio : float, optional
        Fraction of the data to use for training. The default is 0.8.

    tokenizer : Callable[[str], List[str]], optional
        The tokenizer function to use. The default is None.

    verbose : int, optional
        1 to print reading progress, 0 otherwise. The default is 0.

    Returns
    -------
    Tuple[List[HealthRecord], List[HealthRecord]]
        Train data, Test data.

    """
    # Get all the IDs of data
    file_ids = sorted(list(set(['.'.join(fname.split('.')[:-1])
                                for fname in os.listdir(data_dir)
                                if not fname.startswith('.')])))

    # Shuffle the IDs with a fixed seed so the train/test split is reproducible
    random.seed(0)
    random.shuffle(file_ids)

    split_idx = int(train_ratio * len(file_ids))
    train_ids = file_ids[:split_idx]
    test_ids = file_ids[split_idx:]

    if verbose == 1:
        print("Train data:")

    train_data = []
    for idx, fid in enumerate(train_ids):
        record = HealthRecord(fid, text_path=os.path.join(data_dir, fid + '.txt'),
                              ann_path=os.path.join(data_dir, fid + '.ann'),
                              tokenizer=tokenizer)
        train_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, split_idx)

    if verbose == 1:
        print('\n\nTest Data:')

    test_data = []
    for idx, fid in enumerate(test_ids):
        record = HealthRecord(fid, text_path=os.path.join(data_dir, fid + '.txt'),
                              ann_path=os.path.join(data_dir, fid + '.ann'),
                              tokenizer=tokenizer)
        test_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(file_ids) - split_idx)

    return train_data, test_data
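

# Example call (a sketch, not from the source): this variant expects all
# .txt/.ann pairs in a single flat directory and, because of the fixed
# random seed above, produces the same 80/20 split on every run. The
# scispacy_plus_tokenizer referenced earlier is assumed to be in scope;
# any Callable[[str], List[str]] works.
if __name__ == "__main__":
    train_data, test_data = read_data(data_dir='data/',
                                      train_ratio=0.8,
                                      tokenizer=scispacy_plus_tokenizer,
                                      verbose=1)
    print("\n%d train / %d test records" % (len(train_data), len(test_data)))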