Example No. 1
    def __init__(self, nlp: Language):
        """Initialise components"""

        extensions = [
            "_n_sentences",
            "_n_tokens",
            "_n_syllables",
            "token_length",
            "sentence_length",
            "syllables",
            "counts",
        ]
        ext_funs = [
            n_sentences,
            n_tokens,
            n_syllables,
            self.token_length,
            self.sentence_length,
            self.syllables,
            self.counts,
        ]
        for ext, fun in zip(extensions, ext_funs):
            if ext not in ["_n_sentences", "sentence_length", "syllables"]:
                if not Span.has_extension(ext):
                    Span.set_extension(ext, getter=fun)
            if not Doc.has_extension(ext):
                Doc.set_extension(ext, getter=fun)

        if not Doc.has_extension("_filtered_tokens"):
            Doc.set_extension("_filtered_tokens", default=[])
        if not Span.has_extension("_filtered_tokens"):
            Span.set_extension("_filtered_tokens", getter=filtered_tokens)
Example No. 2
def configure_spacy_entity_extension_attributes():
    """Add custom extension attributes to the spaCy Span class."""
    from spacy.tokens import Span

    if not Span.has_extension("score"):
        Span.set_extension("score", default=-1.0)
    if not Span.has_extension("recognizer"):
        Span.set_extension("recognizer", default="")
Example No. 3
 def __init__(self, entities, model, noun_phrases=False):
     self.entities = entities
     self.model = model
     self.noun_phrases = noun_phrases
     if not Span.has_extension('entity_relation_subj'):
         Span.set_extension('entity_relation_subj', default='')
     if not Span.has_extension('entity_relation_root'):
         Span.set_extension('entity_relation_root', default='')
Example No. 4
    def _set_span_conll(self, span: Span, span_idx: int = 1):
        """Sets a span's properties according to the CoNLL-U format.
        :param span: a spaCy Span
        :param span_idx: optional index, corresponding to the n-th sentence
                         in the parent Doc
        """
        span_conll_str = ""
        if self.include_headers:
            # Get metadata from custom extension or create it ourselves
            if not (span.has_extension("conll_metadata")
                    and span._.conll_metadata):
                span._.conll_metadata = f"# sent_id = {span_idx}\n# text = {span.text}\n"

            span_conll_str += span._.conll_metadata

        for token_idx, token in enumerate(span, 1):
            self._set_token_conll(token, token_idx)

        span._.set(self.ext_names["conll"],
                   [t._.get(self.ext_names["conll"]) for t in span])
        span_conll_str += "".join(
            [t._.get(self.ext_names["conll_str"]) for t in span])
        span._.set(self.ext_names["conll_str"], span_conll_str)

        if PD_AVAILABLE and not self.disable_pandas:
            span._.set(
                self.ext_names["conll_pd"],
                pd.DataFrame([t._.get(self.ext_names["conll"]) for t in span]),
            )
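After the formatter component has processed a document, the extensions set above can be read back per sentence or for the whole Doc. A hedged sketch, assuming a pipeline that performs sentence segmentation and already contains this formatter under the default extension names:

# Sketch: reading the extensions the formatter sets (pipeline setup not shown).
doc = nlp("This is a sentence. This is another one.")
for sent in doc.sents:
    print(sent._.conll_str)    # CoNLL-U lines for this sentence, with headers if enabled
print(doc._.conll_pd.head())   # pandas DataFrame, only when pandas is available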
Example No. 5
    def __init__(self, nlp, ent_types=[]):
        if not Span.has_extension("negex"):
            Span.set_extension("negex", default=False, force=True)
        psuedo_negations = [
            "gram negative",
            "no further",
            "not able to be",
            "not certain if",
            "not certain whether",
            "not necessarily",
            "not rule out",
            "not ruled out",
            "not been ruled out",
            "without any further",
            "without difficulty",
            "without further",
        ]
        preceeding_negations = [
            "absence of",
            "declined",
            "denied",
            "denies",
            "denying",
            "did not exhibit",
            "no sign of",
            "no signs of",
            "not",
            "not demonstrate",
            "patient was not",
            "rules out",
            "doubt",
            "negative for",
            "no",
            "no cause of",
            "no complaints of",
            "no evidence of",
            "versus",
            "without",
            "without indication of",
            "without sign of",
            "without signs of",
            "ruled out",
        ]
        following_negations = ["declined", "unlikely"]
        termination = ["but", "however"]

        # efficiently build spaCy matcher patterns
        psuedo_patterns = list(nlp.tokenizer.pipe(psuedo_negations))
        preceeding_patterns = list(nlp.tokenizer.pipe(preceeding_negations))
        following_patterns = list(nlp.tokenizer.pipe(following_negations))
        termination_patterns = list(nlp.tokenizer.pipe(termination))

        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        self.matcher.add("Psuedo", None, *psuedo_patterns)
        self.matcher.add("Preceeding", None, *preceeding_patterns)
        self.matcher.add("Following", None, *following_patterns)
        self.matcher.add("Termination", None, *termination_patterns)
        self.keys = [k for k in self.matcher._docs.keys()]
        self.ent_types = ent_types
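For context, a hedged sketch of how a PhraseMatcher built this way is typically queried inside the component's __call__ (the surrounding negation logic is omitted here):

# Sketch: resolving PhraseMatcher hits back to their rule keys (negation logic omitted).
def find_negation_matches(matcher, nlp, doc):
    results = []
    for match_id, start, end in matcher(doc):
        key = nlp.vocab.strings[match_id]  # "Psuedo", "Preceeding", "Following" or "Termination"
        results.append((key, doc[start:end]))
    return results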
Example No. 6
    def __init__(
            self,
            nlp,
            language="en_clinical",
            ent_types=list(),
            extension_name="negex",
            pseudo_negations=list(),
            preceding_negations=list(),
            following_negations=list(),
            termination=list(),
            chunk_prefix=list(),
    ):
        if language not in LANGUAGES:
            raise KeyError(f"{language} not found in languages termset. "
                           "Ensure this is a supported language or specify "
                           "your own termsets when initializing Negex.")
        termsets = LANGUAGES[language]
        if not Span.has_extension(extension_name):
            Span.set_extension(extension_name, default=False, force=True)

        if not pseudo_negations:
            if not "pseudo_negations" in termsets:
                raise KeyError(
                    "pseudo_negations not specified for this language.")
            pseudo_negations = termsets["pseudo_negations"]

        if not preceding_negations:
            if not "preceding_negations" in termsets:
                raise KeyError(
                    "preceding_negations not specified for this language.")
            preceding_negations = termsets["preceding_negations"]

        if not following_negations:
            if not "following_negations" in termsets:
                raise KeyError(
                    "following_negations not specified for this language.")
            following_negations = termsets["following_negations"]

        if not termination:
            if not "termination" in termsets:
                raise KeyError("termination not specified for this language.")
            termination = termsets["termination"]

        # efficiently build spaCy matcher patterns
        self.pseudo_patterns = list(nlp.tokenizer.pipe(pseudo_negations))
        self.preceding_patterns = list(nlp.tokenizer.pipe(preceding_negations))
        self.following_patterns = list(nlp.tokenizer.pipe(following_negations))
        self.termination_patterns = list(nlp.tokenizer.pipe(termination))

        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        self.matcher.add("pseudo", None, *self.pseudo_patterns)
        self.matcher.add("Preceding", None, *self.preceding_patterns)
        self.matcher.add("Following", None, *self.following_patterns)
        self.matcher.add("Termination", None, *self.termination_patterns)
        self.nlp = nlp
        self.ent_types = ent_types
        self.extension_name = extension_name

        self.chunk_prefix = list(nlp.tokenizer.pipe(chunk_prefix))
Example No. 7
    def __init__(self, nlp: Language, use_pos: bool):
        """Initialise components"""
        self.use_pos = use_pos

        if not Doc.has_extension("pos_proportions"):
            Doc.set_extension("pos_proportions", getter=self.pos_proportions)

        if not Span.has_extension("pos_proportions"):
            Span.set_extension("pos_proportions", getter=self.pos_proportions)
Example No. 8
    def __init__(
            self,
            nlp,
            language="en",
            ent_types=list(),
            pseudo_negations=list(),
            preceding_negations=list(),
            following_negations=list(),
            termination=list(),
            chunk_prefix=list(),
    ):
        if language not in LANGUAGES:
            raise KeyError("Language not found")

        termsets = LANGUAGES[language]
        if not Span.has_extension("negex"):
            Span.set_extension("negex", default=False, force=True)

        if not pseudo_negations:
            if not "pseudo_negations" in termsets:
                raise KeyError(
                    "pseudo_negations not specified for this language.")
            pseudo_negations = termsets["pseudo_negations"]

        if not preceding_negations:
            if not "preceding_negations" in termsets:
                raise KeyError(
                    "preceding_negations not specified for this language.")
            preceding_negations = termsets["preceding_negations"]

        if not following_negations:
            if not "following_negations" in termsets:
                raise KeyError(
                    "following_negations not specified for this language.")
            following_negations = termsets["following_negations"]

        if not termination:
            if not "termination" in termsets:
                raise KeyError("termination not specified for this language.")
            termination = termsets["termination"]

        #  build spaCy matcher patterns
        self.pseudo_patterns = list(nlp.tokenizer.pipe(pseudo_negations))
        self.preceding_patterns = list(nlp.tokenizer.pipe(preceding_negations))
        self.following_patterns = list(nlp.tokenizer.pipe(following_negations))
        self.termination_patterns = list(nlp.tokenizer.pipe(termination))

        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        self.matcher.add("pseudo", None, *self.pseudo_patterns)
        self.matcher.add("Preceding", None, *self.preceding_patterns)
        self.matcher.add("Following", None, *self.following_patterns)
        self.matcher.add("Termination", None, *self.termination_patterns)
        self.nlp = nlp
        self.ent_types = ent_types

        self.chunk_prefix = list(nlp.tokenizer.pipe(chunk_prefix))
Example No. 9
    def __init__(
            self,
            nlp,
            language="en_clinical",
            ent_types=list(),
            extension_name="negex",
            pseudo_negations=list(),
            preceding_negations=list(),
            following_negations=list(),
            termination=list(),
            chunk_prefix=list(),
    ):
        if language not in LANGUAGES:
            raise KeyError(f"{language} not found in languages termset. "
                           "Ensure this is a supported language or specify "
                           "your own termsets when initializing Negex.")
        termsets = LANGUAGES[language]
        if not Span.has_extension(extension_name):
            Span.set_extension(extension_name, default=False, force=True)

        if not pseudo_negations:
            if not "pseudo_negations" in termsets:
                raise KeyError(
                    "pseudo_negations not specified for this language.")
            self.pseudo_negations = termsets["pseudo_negations"]
        else:
            self.pseudo_negations = pseudo_negations

        if not preceding_negations:
            if not "preceding_negations" in termsets:
                raise KeyError(
                    "preceding_negations not specified for this language.")
            self.preceding_negations = termsets["preceding_negations"]
        else:
            self.preceding_negations = preceding_negations

        if not following_negations:
            if not "following_negations" in termsets:
                raise KeyError(
                    "following_negations not specified for this language.")
            self.following_negations = termsets["following_negations"]
        else:
            self.following_negations = following_negations

        if not termination:
            if not "termination" in termsets:
                raise KeyError("termination not specified for this language.")
            self.termination = termsets["termination"]
        else:
            self.termination = termination

        self.nlp = nlp
        self.ent_types = ent_types
        self.extension_name = extension_name
        self.build_patterns()
        self.chunk_prefix = list(nlp.tokenizer.pipe(chunk_prefix))
Example No. 10
    def __init__(self, nlp, label, regexp: Regexp):
        string_store = nlp.vocab.strings
        if label not in string_store:
            string_store.add(label)
        self.label = string_store[label]

        self.regexp = regexp

        if not Span.has_extension('regexp_match'):
            Span.set_extension('regexp_match', default='')
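A hedged sketch of how such a regexp component might apply its pattern in __call__ (the actual method is not shown in this example):

# Sketch: tagging regex hits as entities and recording the matched text (illustrative only).
def __call__(self, doc):
    for match in self.regexp.finditer(doc.text):
        span = doc.char_span(match.start(), match.end(), label=self.label)
        if span is not None:  # char offsets must align with token boundaries
            span._.regexp_match = match.group()
            doc.ents = list(doc.ents) + [span]
    return doc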
Example No. 11
 def __init__(self, nlp: Language):
     """Initialise components"""
     if not Token.has_extension("dependency_distance"):
         Token.set_extension("dependency_distance",
                             getter=self.token_dependency)
     if not Span.has_extension("dependency_distance"):
         Span.set_extension("dependency_distance",
                            getter=self.span_dependency)
     if not Doc.has_extension("dependency_distance"):
         Doc.set_extension("dependency_distance",
                           getter=self.doc_dependency)
Example No. 12
    def __init__(self,
                 nlp,
                 patterns="default",
                 add_attrs=False,
                 max_scope=None):
        self.nlp = nlp
        self.add_attrs = add_attrs
        self.matcher = Matcher(nlp.vocab)
        self.max_scope = max_scope
        self.phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        self.assertion_attributes_mapping = None
        self._patterns = []
        self._section_titles = set()

        if patterns is not None:
            if patterns == "default":
                import os
                if not os.path.exists(DEFAULT_RULES_FILEPATH):
                    raise FileNotFoundError(
                        "The expected location of the default patterns file cannot be found. Please either "
                        "add patterns manually or add a jsonl file to the following location: ",
                        DEFAULT_RULES_FILEPATH)
                self.add(self.load_patterns_from_jsonl(DEFAULT_RULES_FILEPATH))
            # If a list, add each of the patterns in the list
            elif isinstance(patterns, list):
                self.add(patterns)
            elif isinstance(patterns, str):
                import os
                assert os.path.exists(patterns)
                self.add(self.load_patterns_from_jsonl(patterns))

        if add_attrs is False:
            self.add_attrs = False
        elif add_attrs is True:
            self.assertion_attributes_mapping = DEFAULT_ATTRS
            self.register_default_attributes()
        elif isinstance(add_attrs, dict):
            # Check that each of the attributes being added has been set
            for modifier in add_attrs.keys():
                attr_dict = add_attrs[modifier]
                for attr_name, attr_value in attr_dict.items():
                    if not Span.has_extension(attr_name):
                        raise ValueError(
                            "Custom extension {0} has not been set. Call Span.set_extension."
                            .format(attr_name))

            self.add_attrs = True
            self.assertion_attributes_mapping = add_attrs

        else:
            raise ValueError(
                "add_attrs must be either True (default), False, or a dictionary, not {0}"
                .format(add_attrs))
Example No. 13
    def __init__(self,
                 nlp,
                 quickumls_fp,
                 best_match=True,
                 ignore_syntax=False,
                 **kwargs):
        """Instantiate SpacyQuickUMLS object

            This creates a QuickUMLS spaCy component which can be used in modular pipelines.
            This module adds entity Spans to the document where the entity label is the UMLS CUI and the Span's "underscore" object is extended to contain "similarity" and "semtypes" for matched concepts.
            Note that this implementation follows and enforces a known spaCy convention that entity Spans cannot overlap on a single token.

        Args:
            nlp: Existing spaCy pipeline. This is needed to update the vocabulary with UMLS CUI values.
            quickumls_fp (str): Path to QuickUMLS data
            best_match (bool, optional): Whether to return only the top match or all overlapping candidates. Defaults to True.
            ignore_syntax (bool, optional): Whether to use the heuristics introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify. Defaults to False.
            **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py)
        """

        self.quickumls = QuickUMLS(
            quickumls_fp,
            # By default, the QuickUMLS objects creates its own internal spacy pipeline but this is not needed
            # when we're using it as a component in a pipeline
            spacy_component=True,
            **kwargs)

        # save this off so that we can get vocab values of labels later
        self.nlp = nlp

        # keep these for matching
        self.best_match = best_match
        self.ignore_syntax = ignore_syntax

        # let's extend this with some properties that we want
        if not Span.has_extension("similarity"):
            Span.set_extension('similarity', default=-1.0)
        if not Span.has_extension("semtypes"):
            Span.set_extension('semtypes', default=-1.0)
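A usage sketch for the component above, assuming a spaCy v2-style pipeline (the QuickUMLS data path is a placeholder):

import spacy

nlp = spacy.load("en_core_web_sm")
quickumls_component = SpacyQuickUMLS(nlp, "/path/to/quickumls_data")  # placeholder path
nlp.add_pipe(quickumls_component, last=True)

doc = nlp("Patient denies chest pain.")
for ent in doc.ents:
    print(ent.text, ent.label_, ent._.similarity, ent._.semtypes)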
Example No. 14
    def _set_extensions(self):
        """Sets the default extensions if they do not exist yet."""
        for obj in Doc, Span, Token:
            if not obj.has_extension(self.ext_names["conll_str"]):
                obj.set_extension(self.ext_names["conll_str"], default=None)
            if not obj.has_extension(self.ext_names["conll"]):
                obj.set_extension(self.ext_names["conll"], default=None)

            if PD_AVAILABLE and not self.disable_pandas:
                if not obj.has_extension(self.ext_names["conll_pd"]):
                    obj.set_extension(self.ext_names["conll_pd"], default=None)

        # Adds fields from the CoNLL-U format that are not available in spaCy
        # However, ConllParser might set these fields when it has read CoNLL_str->spaCy
        if not Token.has_extension("conll_deps_graphs_field"):
            Token.set_extension("conll_deps_graphs_field", default="_")
        if not Token.has_extension("conll_misc_field"):
            Token.set_extension("conll_misc_field", default="_")
        if not Span.has_extension("conll_metadata"):
            Span.set_extension("conll_metadata", default=None)
Example No. 15
    def __init__(
        self,
        nlp: Language,
        name: str,
        neg_termset: dict,
        ent_types: list,
        extension_name: str,
        chunk_prefix: list,
    ):
        # if not termset_lang in LANGUAGES:
        #     raise KeyError(
        #         f"{termset_lang} not found in languages termset. "
        #         "Ensure this is a supported termset or specify "
        #         "your own termsets when initializing Negex."
        #     )
        # termsets = LANGUAGES[termset_lang]
        if not Span.has_extension(extension_name):
            Span.set_extension(extension_name, default=False, force=True)

        ts = neg_termset
        expected_keys = [
            "pseudo_negations",
            "preceding_negations",
            "following_negations",
            "termination",
        ]
        if set(ts.keys()) != set(expected_keys):
            raise KeyError(
                f"Unexpected or missing keys in 'neg_termset', expected: {expected_keys}, instead got: {list(ts.keys())}"
            )

        self.pseudo_negations = ts["pseudo_negations"]
        self.preceding_negations = ts["preceding_negations"]
        self.following_negations = ts["following_negations"]
        self.termination = ts["termination"]

        self.nlp = nlp
        self.ent_types = ent_types
        self.extension_name = extension_name
        self.build_patterns()
        self.chunk_prefix = list(nlp.tokenizer.pipe(chunk_prefix))
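The neg_termset dict passed to this constructor must supply exactly the four keys validated above. A minimal illustrative termset (the terms are examples, not the full clinical termsets):

neg_termset = {
    "pseudo_negations": ["no increase"],
    "preceding_negations": ["no", "denies", "without"],
    "following_negations": ["unlikely"],
    "termination": ["but", "however"],
}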
Example No. 16
    def __init__(self):

        # register Token attributes if they are not registered already
        from spacy.tokens import Token

        for attr_name in [
                "speaker", "start_time", "end_time", "confidence",
                "entity_linking", "addressee"
        ]:
            if not Token.has_extension(attr_name):
                Token.set_extension(attr_name, default=None)

        # register Span attributes if they are not registered already
        from spacy.tokens import Span

        if not Span.has_extension("speaker"):
            Span.set_extension("speaker", getter=self.span_speaker)

        if not Span.has_extension("start_time"):
            Span.set_extension("start_time", getter=self.span_start_time)

        if not Span.has_extension("end_time"):
            Span.set_extension("end_time", getter=self.span_end_time)

        if not Span.has_extension("confidence"):
            Span.set_extension("confidence",
                               getter=self.span_average_confidence)

        if not Span.has_extension("entity_linking"):
            Span.set_extension("entity_linking",
                               getter=self.span_entity_linking)

        if not Span.has_extension("addressee"):
            Span.set_extension("addressee", getter=self.span_addressee)

        # minimalist spaCy pipeline (used only for its tokenizer)
        self.tokenizer = spacy.load("en_core_web_sm",
                                    disable=["tagger", "parser", "ner"])

        # custom spaCy pipeline (that adds forced alignment attributes and ensures
        # that a new sentence starts at every speaker change)
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp.add_pipe(self.placeholder,
                          name="forced_alignment",
                          first=True)
        self.nlp.add_pipe(self.start_sentence_at_speaker_change,
                          after="forced_alignment")
Example No. 17
    def __init__(self,
                 nlp,
                 keywords,
                 label,
                 tokentag,
                 doctag=None,
                 spantag=None):
        nlp.vocab.strings.add(label)
        self.label = nlp.vocab.strings[label]
        self._label_str = label
        self._token_tag = tokentag
        self._doctag = doctag
        self._spantag = spantag
        self._keywordtag = "is_keyword"
        self._labeltag = "label_"
        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of keywords is long, it's very efficient
        patterns = [nlp(key) for key in keywords]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(self._token_tag, None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension(self._token_tag, default=False)
        if not Token.has_extension(self._keywordtag):
            Token.set_extension(self._keywordtag, default=False)
            Token.set_extension(self._labeltag, default=None)
        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens has the token tag set to True.
        Doc.set_extension(self._doctag,
                          getter=lambda tokens: any(
                              [t._.get(self._token_tag) for t in tokens]))
        Span.set_extension(self._spantag,
                           getter=lambda tokens: any(
                               [t._.get(self._token_tag) for t in tokens]))
        if not Span.has_extension("dep_"):
            Span.set_extension("dep_", default="")
            Span.set_extension("head_", default=None)
Example No. 18
    def __init__(
        self,
        nlp,
        rules="default",
        add_attrs=False,
        max_scope=None,
        include_header=False,
        phrase_matcher_attr="LOWER",
        require_start_line=False,
        require_end_line=False,
        newline_pattern=r"[\n\r]+[\s]*$",
    ):
        """Create a new Sectionizer component. The sectionizer will search for spans in the text which
        match section header rules, such as 'Past Medical History:'. Sections will be represented
        in custom attributes as:
            category (str): A normalized title of the section. Example: 'past_medical_history'
            section_title (Span): The Span of the doc which was matched as a section header.
                Example: 'Past Medical History:'
            section_span (Span): The entire section of the note, starting with section_header and up until the end
                of the section, which will be either the start of the next section header or some pre-specified
                scope. Example: 'Past Medical History: Type II DM'

        Section attributes will be registered for each Doc, Span, and Token in the following attributes:
            Doc._.sections: A list of namedtuples of type Section with 4 elements:
                - section_title
                - section_header
                - section_parent
                - section_span.
            A Doc will also have attributes corresponding to lists of each
                (ie., Doc._.section_titles, Doc._.section_headers, Doc._.section_parents, Doc._.section_list)
            (Span|Token)._.section_title
            (Span|Token)._.section_header
            (Span|Token)._.section_parent
            (Span|Token)._.section_span

        Args:
            nlp: A SpaCy language model object
            rules (str, list, or None): Where to read rules from. Default is "default", which will
                load the default rules provided by medSpaCy, which are derived from MIMIC-II.
                If a list, should be a list of pattern dicts following these conventional spaCy formats:
                    [
                        {"section_title": "past_medical_history", "pattern": "Past Medical History:"},
                        {"section_title": "problem_list", "pattern": [{"TEXT": "PROBLEM"}, {"TEXT": "LIST"}, {"TEXT": ":"}]}
                    ]
                If a string other than "default", should be a path to a jsonl file containing rules.
            max_scope (None or int): Optional argument specifying the maximum number of tokens following a section header
                which can be included in a section. This can be useful if you think your section rules are incomplete
                and want to prevent sections from running too long in the note. Default is None, meaning that the scope
                of a section will be until either the next section header or the end of the document.
            include_header (bool): whether the section header is included in the section text
            phrase_matcher_attr (str): The name of the token attribute which will be used by the PhraseMatcher
                for any rules with a "pattern" value of a string.
            require_start_line (bool): Optionally require a section header to start on a new line. Default False.
            require_end_line (bool): Optionally require a section header to end with a new line. Default False.
            newline_pattern (str): Regular expression to match the new line either preceding or following a header
                if either require_start_line or require_end_line are True.
        """
        self.nlp = nlp
        self.add_attrs = add_attrs
        self.matcher = MedspacyMatcher(nlp,
                                       phrase_matcher_attr=phrase_matcher_attr)
        self.max_scope = max_scope
        self.require_start_line = require_start_line
        self.require_end_line = require_end_line
        self.newline_pattern = re.compile(newline_pattern)
        self.assertion_attributes_mapping = None
        self._parent_sections = {}
        self._parent_required = {}
        self._rule_item_mapping = self.matcher._rule_item_mapping
        self._rules = []
        self._section_categories = set()
        self.include_header = include_header

        if rules is not None:
            if rules == "default":
                import os

                if not os.path.exists(DEFAULT_RULES_FILEPATH):
                    raise FileNotFoundError(
                        "The expected location of the default rules file cannot be found. Please either "
                        "add rules manually or add a jsonl file to the following location: ",
                        DEFAULT_RULES_FILEPATH,
                    )
                self.add(SectionRule.from_json(DEFAULT_RULES_FILEPATH))
            # If a list, add each of the rules in the list
            elif isinstance(rules, list):
                self.add(rules)
            elif isinstance(rules, str):
                assert path.exists(rules)
                self.add(SectionRule.from_json(rules))

        if add_attrs is False:
            self.add_attrs = False
        elif add_attrs is True:
            self.assertion_attributes_mapping = DEFAULT_ATTRS
            self.register_default_attributes()
        elif isinstance(add_attrs, dict):
            # Check that each of the attributes being added has been set
            for modifier in add_attrs.keys():
                attr_dict = add_attrs[modifier]
                for attr_name, attr_value in attr_dict.items():
                    if not Span.has_extension(attr_name):
                        raise ValueError(
                            "Custom extension {0} has not been set. Call Span.set_extension."
                            .format(attr_name))

            self.add_attrs = True
            self.assertion_attributes_mapping = add_attrs

        else:
            raise ValueError(
                "add_attrs must be either True (default), False, or a dictionary, not {0}"
                .format(add_attrs))
Example No. 19
    def __init__(
        self,
        nlp,
        targets="ents",
        add_attrs=True,
        phrase_matcher_attr="LOWER",
        rules="default",
        rule_list=None,
        allowed_types=None,
        excluded_types=None,
        use_context_window=False,
        max_scope=None,
        max_targets=None,
        terminations=None,
        prune=True,
        remove_overlapping_modifiers=False,
    ):

        """Create a new ConTextComponent algorithm.

        This component matches modifiers in a Doc,
        defines their scope, and identifies edges between targets and modifiers.
        Sets two spaCy extensions:
            - Span._.modifiers: a list of TagObject objects which modify a target Span
            - Doc._.context_graph: a ConText graph object which contains the targets,
                modifiers, and edges between them.

        Args:
            nlp: a spaCy NLP model
            targets: the attribute of Doc which contains targets.
                Default is "ents", in which case it will use the standard Doc.ents attribute.
                Otherwise will look for a custom attribute in Doc._.{targets}
            add_attrs: Whether or not to add the additional spaCy Span attributes (ie., Span._.x)
                defining assertion on the targets. By default, these are:
                - is_negated: True if a target is modified by 'NEGATED_EXISTENCE', default False
                - is_uncertain: True if a target is modified by 'POSSIBLE_EXISTENCE', default False
                - is_historical: True if a target is modified by 'HISTORICAL', default False
                - is_hypothetical: True if a target is modified by 'HYPOTHETICAL', default False
                - is_family: True if a target is modified by 'FAMILY', default False
                In the future, these should be made customizable.
            phrase_matcher_attr: The token attribute to be used by the underlying PhraseMatcher.
                If "LOWER", then the matching of modifiers with a "literal" string will be
                case-insensitive. If "TEXT" or "ORTH", it will be case-sensitive.
                Default "LOWER'.
            prune: Whether or not to prune modifiers which are substrings of another modifier.
                For example, if "no history of" and "history of" are both ConTextItems, both will match
                the text "no history of afib", but only "no history of" should modify afib.
                If True, will drop shorter substrings completely.
                Default True.
            remove_overlapping_modifiers: Whether or not to remove any matched modifiers which overlap
                with target entities. If False, any overlapping modifiers will not modify the overlapping
                entity but will still modify any other targets in its scope.
                Default False.
            rules: Which rules to load on initialization. Default is 'default'.
                - 'default': Load the default set of rules provided with cyConText
                - 'other': Load a custom set of rules, please also set rule_list with a file path or list.
                - None: Load no rules.
            rule_list: The location of rules in json format or a list of ContextItems. Default
                is None.
            allowed_types (set or None): A set of target labels to allow a ConTextItem to modify.
                If None, will apply to any type not specifically excluded in excluded_types.
                Only one of allowed_types and excluded_types can be used. An error will be thrown
                if both are not None.
                If this attribute is also defined in the ConTextItem, it will keep that value.
                Otherwise it will inherit this value.
            excluded_types (set or None): A set of target labels which this modifier cannot modify.
                If None, will apply to all target types unless allowed_types is not None.
                If this attribute is also defined in the ConTextItem, it will keep that value.
                Otherwise it will inherit this value.
            max_targets (int or None): The maximum number of targets which a modifier can modify.
                If None, will modify all targets in its scope.
                If this attribute is also defined in the ConTextItem, it will keep that value.
                Otherwise it will inherit this value.
            use_context_window (bool): Whether to use a specified range around a target to check
                for modifiers rather than split sentence boundaries. This can be useful
                for quicker processing by skipping sentence splitting or errors caused by poorly
                defined sentence boundaries. If True, max_scope must be an integer greater than 0.
            max_scope (int or None): A number to explicitly limit the size of the modifier's scope
                If this attribute is also defined in the ConTextItem, it will keep that value.
                Otherwise it will inherit this value.
            terminations (dict or None): Optional mapping between different categories which will
                cause one modifier type to be 'terminated' by another type. For example, if given
                a mapping:
                    {"POSITIVE_EXISTENCE": {"NEGATED_EXISTENCE", "UNCERTAIN"},
                    "NEGATED_EXISTENCE": {"FUTURE"},
                    }
                all modifiers of type "POSITIVE_EXISTENCE" will be terminated by "NEGATED_EXISTENCE" or "UNCERTAIN"
                modifiers, and all "NEGATED_EXISTENCE" modifiers will be terminated by "FUTURE".
                This can also be defined for specific ConTextItems in the `terminated_by` attribute.


        Returns:
            context: a ConTextComponent

        Raises:
            ValueError: if one of the parameters is incorrectly formatted.
        """

        self.nlp = nlp
        if targets != "ents":
            raise NotImplementedError()
        self._target_attr = targets
        self.prune = prune
        self.remove_overlapping_modifiers = remove_overlapping_modifiers

        self._item_data = []
        self._i = 0
        self._categories = set()

        # _modifier_item_mapping: A mapping from spaCy Matcher match_ids to ConTextItem
        # This allows us to use spaCy Matchers while still linking back to the ConTextItem
        # To get the rule and category
        self._modifier_item_mapping = dict()
        self.phrase_matcher = PhraseMatcher(
            nlp.vocab, attr=phrase_matcher_attr, validate=True
        )  # TODO: match on custom attributes
        self.matcher = Matcher(nlp.vocab, validate=True)

        self.register_graph_attributes()
        if add_attrs is False:
            self.add_attrs = False
        elif add_attrs is True:
            self.add_attrs = True
            self.context_attributes_mapping = DEFAULT_ATTRS
            self.register_default_attributes()
        elif isinstance(add_attrs, dict):
            # Check that each of the attributes being added has been set
            for modifier in add_attrs.keys():
                attr_dict = add_attrs[modifier]
                for attr_name, attr_value in attr_dict.items():
                    if not Span.has_extension(attr_name):
                        raise ValueError(
                            "Custom extension {0} has not been set. Call Span.set_extension.".format(
                                attr_name
                            )
                        )

            self.add_attrs = True
            self.context_attributes_mapping = add_attrs

        else:
            raise ValueError(
                "add_attrs must be either True (default), False, or a dictionary, not {0}".format(
                    add_attrs
                )
            )
        if use_context_window is True:
            if not isinstance(max_scope, int) or max_scope < 1:
                raise ValueError(
                    "If 'use_context_window' is True, 'max_scope' must be an integer greater 1, "
                    "not {0}".format(max_scope)
                )
        self.use_context_window = use_context_window
        if max_scope is not None and (
            not isinstance(max_scope, int) or max_scope < 1
        ):
            raise ValueError(
                "'max_scope' must be None or an integer greater 1, "
                "not {0}".format(max_scope)
            )
        self.max_scope = max_scope

        self.allowed_types = allowed_types
        self.excluded_types = excluded_types
        self.max_targets = max_targets

        if terminations is None:
            terminations = dict()
        self.terminations = {k.upper(): v for (k, v) in terminations.items()}

        if rules == "default":

            item_data = ConTextItem.from_json(DEFAULT_RULES_FILEPATH)
            self.add(item_data)

        elif rules == "other":
            # use custom rules
            if isinstance(rule_list, str):
                # if rule_list is a string, then it must be a path to a json or yaml file
                if "yaml" in rule_list or "yml" in rule_list:
                    try:
                        rule_list = ConTextItem.from_yaml(rule_list)
                    except Exception:
                        raise ValueError(
                            "rule list {0} could not be read".format(rule_list)
                        )
                elif path.exists(rule_list):
                    item_data = ConTextItem.from_json(rule_list)
                    self.add(item_data)
                else:
                    raise ValueError(
                        "rule_list must be a valid path. Currently is: {0}".format(
                            rule_list
                        )
                    )

            elif isinstance(rule_list, list):
                # otherwise it is a list of contextitems
                if not rule_list:
                    raise ValueError("rule_list must not be empty.")
                for item in rule_list:
                    # check that all items are contextitems
                    if not isinstance(item, ConTextItem):
                        raise ValueError(
                            "rule_list must contain only ContextItems. Currently contains: {0}".format(
                                type(item)
                            )
                        )
                self.add(rule_list)

            else:
                raise ValueError(
                    "rule_list must be a valid path or list of ContextItems. Currenty is: {0}".format(
                        type(rule_list)
                    )
                )

        elif not rules:
            # otherwise leave the list empty.
            # do nothing
            self._item_data = []

        else:
            # loading from json path or list is possible later
            raise ValueError(
                "rules must either be 'default' (default), 'other' or None."
            )
Example No. 20
    def parse_conll_text_as_spacy(
        self,
        text: str,
        ner_tag_pattern: str = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$",
        ner_map: Dict[str, str] = None,
    ) -> Doc:
        """Parses a given CoNLL-U string into a spaCy doc. Parsed sentence section must be separated by a new line (\n\n).
        Note that we do our best to retain as much information as possible but that not all CoNLL-U fields are
        supported in spaCy. We add a Token._.conll_misc_field extension to save CoNLL-U MISC field, and a
        Token._.conll_deps_graphs_field extension to save CoNLL-U DEPS field. The metadata (lines starting with #)
        is saved in Span._.conll_metadata of sentence Spans.

        This method has been adapted from the work by spaCy.
        See: https://github.com/explosion/spaCy/blob/a1c5b694be117ac92e21f9860309821ad6da06f7/spacy/cli/converters/conllu2json.py#L179

        Multi-word tokens and empty nodes are not supported.

        :param text: CoNLL-U formatted text
        :param ner_tag_pattern: Regex pattern for entity tag in the MISC field
        :param ner_map: Map old NER tag names to new ones, '' maps to O
        :return: a spacy Doc containing all the tokens and sentences from the CoNLL file including
         the custom CoNLL extensions
        """
        if not Token.has_extension("conll_misc_field"):
            Token.set_extension("conll_misc_field", default="_")
        if not Token.has_extension("conll_deps_graphs_field"):
            Token.set_extension("conll_deps_graphs_field", default="_")
        if not Span.has_extension("conll_metadata"):
            Span.set_extension("conll_metadata", default=None)

        docs = []
        for chunk in text.split("\n\n"):
            lines = [
                l for l in chunk.splitlines() if l and not l.startswith("#")
            ]
            words, spaces, tags, poses, morphs, lemmas, miscs = [], [], [], [], [], [], []
            heads, deps, deps_graphs = [], [], []
            for i in range(len(lines)):
                line = lines[i]
                parts = line.split("\t")

                if any(not p for p in parts):
                    raise ValueError(
                        "According to the CoNLL-U Format, fields cannot be empty. See"
                        " https://universaldependencies.org/format.html")

                id_, word, lemma, pos, tag, morph, head, dep, deps_graph, misc = parts

                if any(" " in f
                       for f in (id_, pos, tag, morph, head, dep, deps_graph)):
                    raise ValueError(
                        "According to the CoNLL-U Format, only FORM, LEMMA, and MISC fields can contain"
                        " spaces. See https://universaldependencies.org/format.html"
                    )

                if "." in id_ or "-" in id_:
                    raise NotImplementedError(
                        "Multi-word tokens and empty nodes are not supported in spacy_conll"
                    )

                words.append(word)

                if "SpaceAfter=No" in misc:
                    spaces.append(False)
                else:
                    spaces.append(True)

                id_ = int(id_) - 1
                lemmas.append(lemma)
                poses.append(pos)
                tags.append(pos if tag == "_" else tag)
                morphs.append(morph if morph != "_" else "")
                heads.append((int(head) - 1) if head not in ("0",
                                                             "_") else id_)
                deps.append("ROOT" if dep == "root" else dep)
                deps_graphs.append(deps_graph)
                miscs.append(misc)

            doc = Doc(
                self.nlp.vocab,
                words=words,
                spaces=spaces,
                tags=tags,
                pos=poses,
                morphs=morphs,
                lemmas=lemmas,
                heads=heads,
                deps=deps,
            )

            # Set custom Token extensions
            for i in range(len(doc)):
                doc[i]._.conll_misc_field = miscs[i]
                doc[i]._.conll_deps_graphs_field = deps_graphs[i]

            ents = get_entities(lines, ner_tag_pattern, ner_map)
            doc.ents = spans_from_biluo_tags(doc, ents)

            # The deprel relations ensure that this CoNLL chunk is one sentence
            # DEPREL therefore cannot be empty, or each word is considered a separate sentence
            if len(list(doc.sents)) != 1:
                raise ValueError(
                    "Your data is in an unexpected format. Make sure that it follows the CoNLL-U format"
                    " requirements. See https://universaldependencies.org/format.html. Particularly make"
                    " sure that the DEPREL field is filled in.")

            # Save the metadata in a custom sentence Span attribute so that the formatter can use it
            metadata = "\n".join(
                [l for l in chunk.splitlines() if l.startswith("#")])
            # We really only expect one sentence
            for sent in doc.sents:
                sent._.conll_metadata = f"{metadata}\n" if metadata else ""

            docs.append(doc)

        # Add CoNLL custom extensions
        return self.nlp.get_pipe("conll_formatter")(Doc.from_docs(docs))
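A small usage sketch with a two-token CoNLL-U sentence (assuming a ConllParser-style object named parser that exposes the method above):

# Sketch: parsing a tiny CoNLL-U fragment; fields are tab-separated.
conll_text = (
    "# sent_id = 1\n"
    "# text = Hello world\n"
    "1\tHello\thello\tINTJ\tUH\t_\t2\tdiscourse\t_\t_\n"
    "2\tworld\tworld\tNOUN\tNN\t_\t0\troot\t_\tSpaceAfter=No\n"
)
doc = parser.parse_conll_text_as_spacy(conll_text)
for sent in doc.sents:
    print(sent._.conll_metadata)  # "# sent_id = 1\n# text = Hello world\n"
    for token in sent:
        print(token.text, token._.conll_misc_field)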