Ejemplo n.º 1
0
def get_ner_predictions(ehr_record: str,
                        model_name: str = "biobert",
                        record_id: str = "1") -> HealthRecord:
    """
    Get predictions for NER using either BioBERT or BiLSTM

    Parameters
    --------------
    ehr_record : str
        An EHR record in text format.

    model_name : str
        The model to use for prediction, compared case-insensitively.
        Accepted values are "biobert" and "bilstm". Default is biobert.

    record_id : str
        The record id of the returned object. Default is 1.

    Returns
    -----------
    A HealthRecord object with entities set.

    Raises
    -----------
    AttributeError
        If model_name is neither "biobert" nor "bilstm".
    """
    # Normalise once instead of lower-casing in every comparison
    model = model_name.lower()

    if model == "biobert":
        test_ehr = HealthRecord(record_id=record_id,
                                text=ehr_record,
                                tokenizer=biobert_ner_tokenizer.tokenize,
                                is_bert_tokenizer=True,
                                is_training=False)

        predictions = get_biobert_ner_predictions(test_ehr)

    elif model == "bilstm":
        # record_id is forwarded for this model too, so the documented
        # parameter is honoured regardless of which model is selected.
        test_ehr = HealthRecord(record_id=record_id,
                                text=ehr_record,
                                tokenizer=scispacy_plus_tokenizer,
                                is_bert_tokenizer=False,
                                is_training=False)
        predictions = get_bilstm_ner_predictions(test_ehr)

    else:
        raise AttributeError("Accepted model names include 'biobert' "
                             "and 'bilstm'.")

    ent_preds = []
    for i, pred in enumerate(predictions):
        # pred[0] is the key into label_ent_map; pred[1] and pred[2] are
        # passed on as the entity's character range.
        ent = Entity("T%d" % i, label_ent_map[pred[0]], [pred[1], pred[2]])

        # NOTE(review): indexing the Entity presumably yields the range
        # bounds given above - confirm against Entity's definition.
        ent_text = test_ehr.text[ent[0]:ent[1]]

        # Drop spans without a single alphanumeric character. Ids come from
        # the prediction index, so dropped spans leave gaps in the numbering.
        if not any(letter.isalnum() for letter in ent_text):
            continue

        ent.set_text(ent_text)
        ent_preds.append(ent)

    test_ehr.entities = ent_preds
    return test_ehr
Ejemplo n.º 2
0
    def _extract_annotations(path: str) \
            -> Tuple[Dict[str, Entity], Dict[str, Relation]]:
        """
        Internal function that extracts entities and relations
        as a dictionary from an annotation file.

        The file is read as tab-separated lines. Lines starting with '#'
        are skipped. Lines whose id starts with 'T' are parsed as
        entities, lines whose id starts with 'R' as relations, and any
        other non-empty line triggers a warning and is ignored.

        Parameters
        ----------
        path : str
            Path for the ann file.

        Returns
        -------
        Tuple[Dict[str, Entity], Dict[str, Relation]]
            Entities and relations, each keyed by annotation id.

        Raises
        ------
        AssertionError
            If an entity line does not have exactly 3 non-empty fields
            or a relation line does not have exactly 2.
        KeyError
            If a relation refers to an entity id that never appears
            in the file.
        """
        # The context manager closes the file even if reading fails
        with open(path) as f:
            raw_data = f.read().split('\n')

        entities = {}
        relations = {}

        # Relations with entities that haven't been processed yet
        relation_backlog = []

        for line in raw_data:
            if line.startswith('#'):
                continue

            # Tab-separated fields, with empty strings removed
            fields = [field for field in line.split('\t') if field]

            if not fields:
                continue

            ann_id = fields[0]

            if ann_id[0] == 'T':
                assert len(fields) == 3

                # The first word is the entity type, the rest is one or
                # more character ranges separated by ;
                ent_type, _, range_spec = fields[1].partition(' ')
                char_ranges = [r.split() for r in range_spec.split(';')]

                # Create an Entity object
                ent = Entity(entity_id=ann_id, entity_type=ent_type)

                # A multi-part range is collapsed to a single span running
                # from the first start to the last end.
                span = [int(char_ranges[0][0]), int(char_ranges[-1][1])]
                ent.set_range(span)

                ent.set_text(fields[2])
                entities[ann_id] = ent

            elif ann_id[0] == 'R':
                assert len(fields) == 2

                # Relation type followed by two arguments; only the part
                # after the last ':' of each argument is the entity id.
                rel_details = fields[1].split(' ')
                entity1 = rel_details[1].split(':')[-1]
                entity2 = rel_details[2].split(':')[-1]

                if entity1 in entities and entity2 in entities:
                    rel = Relation(relation_id=ann_id,
                                   relation_type=rel_details[0],
                                   arg1=entities[entity1],
                                   arg2=entities[entity2])

                    relations[ann_id] = rel
                else:
                    # If the entities aren't processed yet,
                    # add them to backlog to process later
                    relation_backlog.append(
                        [ann_id, rel_details[0], entity1, entity2])

            else:
                # If the annotation is not a relation or entity, warn user
                warnings.warn("Invalid annotation encountered: " + str(fields))

        # Every entity in the file has been read by now, so deferred
        # relations can be resolved; an unknown id raises KeyError here.
        for rel_id, rel_type, entity1, entity2 in relation_backlog:
            rel = Relation(relation_id=rel_id,
                           relation_type=rel_type,
                           arg1=entities[entity1],
                           arg2=entities[entity2])

            relations[rel_id] = rel

        return entities, relations
Ejemplo n.º 3
0
def process_ade_files(ade_data: List[dict]) -> List[dict]:
    """
    Extracts tokens and creates Entity and Relation objects
    from raw json data.

    Each input record is read through its 'tokens', 'entities' and
    'relations' keys. Entities get ids 'T1', 'T2', ... in input order and
    relations get ids 'R1', 'R2', ... in input order. The input records
    are not modified.

    Parameters
    ----------
    ade_data : List[dict]
        Raw json data.

    Returns
    -------
    List[dict]
        One dict per input record with the keys "tokens", "entities"
        and "relations"; entities and relations are dicts keyed by id.

    """
    ade_records = []

    for ade in ade_data:
        # Tokens
        tokens = ade['tokens']

        # Entities
        entities = {}
        for e_num, ent in enumerate(ade['entities'], start=1):
            ent_id = 'T%d' % e_num

            # The label is renamed in a local variable so that the
            # caller's entity dict is left untouched.
            ent_type = ent['type']
            if ent_type == 'Adverse-Effect':
                ent_type = 'ADE'

            ent_obj = Entity(entity_id=ent_id,
                             entity_type=ent_type)

            # 'start' and 'end' slice the token list below; the stored
            # range uses end - 1 as its upper bound.
            ent_obj.set_range([int(ent['start']), int(ent['end'] - 1)])

            # Every token is followed by a single space, so the text ends
            # with a trailing space (existing behaviour, kept as is).
            text = ''.join(token_ent + ' '
                           for token_ent in tokens[ent['start']:ent['end']])
            ent_obj.set_text(text)

            entities[ent_id] = ent_obj

        # Relations
        relations = {}
        for r_num, relation in enumerate(ade['relations'], start=1):
            rel_id = 'R%d' % r_num

            # 'head' and 'tail' are 0-based positions in the entity list,
            # while entity ids start at 1.
            entity1 = 'T' + str(relation['head'] + 1)
            entity2 = 'T' + str(relation['tail'] + 1)

            # A relation whose endpoints are not known entities is skipped;
            # its number is still consumed, so ids may have gaps.
            if entity1 not in entities or entity2 not in entities:
                continue

            relations[rel_id] = Relation(relation_id=rel_id,
                                         relation_type='ADE-Drug',
                                         arg1=entities[entity1],
                                         arg2=entities[entity2])

        ade_records.append({"tokens": tokens,
                            "entities": entities,
                            "relations": relations})
    return ade_records