def __init__(self, nlp: Language):
    """Initialise components.

    Registers getter-based custom extensions on ``Doc`` (all of them) and on
    ``Span`` (only those that make sense below document level), then registers
    the ``_filtered_tokens`` helper extension used by the getters.
    """
    # Extension names, paired index-for-index with their getter functions below.
    extensions = [
        "_n_sentences",
        "_n_tokens",
        "_n_syllables",
        "token_length",
        "sentence_length",
        "syllables",
        "counts",
    ]
    # First three getters are module-level functions; the rest are bound methods.
    ext_funs = [
        n_sentences,
        n_tokens,
        n_syllables,
        self.token_length,
        self.sentence_length,
        self.syllables,
        self.counts,
    ]
    for ext, fun in zip(extensions, ext_funs):
        # Sentence-level statistics are excluded from Span; presumably they
        # require full-document sentence segmentation — TODO confirm.
        if ext not in ["_n_sentences", "sentence_length", "syllables"]:
            if not Span.has_extension(ext):
                Span.set_extension(ext, getter=fun)
        if not Doc.has_extension(ext):
            Doc.set_extension(ext, getter=fun)
    # NOTE(review): default=[] is a single shared list across all Docs until
    # overwritten per-doc — verify every pipeline run assigns it before use.
    if not Doc.has_extension("_filtered_tokens"):
        Doc.set_extension("_filtered_tokens", default=[])
    if not Span.has_extension("_filtered_tokens"):
        Span.set_extension("_filtered_tokens", getter=filtered_tokens)
def configure_spacy_entity_extension_attributes():
    """Add custom extension attributes to the spaCy Span class."""
    from spacy.tokens import Span

    # Register each attribute at most once; spaCy raises on duplicate registration.
    for attr_name, attr_default in (("score", -1.0), ("recognizer", "")):
        if not Span.has_extension(attr_name):
            Span.set_extension(attr_name, default=attr_default)
def __init__(self, entities, model, noun_phrases=False):
    """Store the component configuration and register relation extensions on Span."""
    self.entities = entities
    self.model = model
    self.noun_phrases = noun_phrases
    # Relation extensions default to the empty string until a relation is found.
    for relation_ext in ('entity_relation_subj', 'entity_relation_root'):
        if not Span.has_extension(relation_ext):
            Span.set_extension(relation_ext, default='')
def _set_span_conll(self, span: Span, span_idx: int = 1):
    """Sets a span's properties according to the CoNLL-U format.

    Fills the configured ``conll``, ``conll_str`` (and optionally ``conll_pd``)
    custom extensions on the span after setting them on each contained token.

    :param span: a spaCy Span
    :param span_idx: optional index, corresponding to the n-th sentence in the parent Doc
    """
    span_conll_str = ""
    if self.include_headers:
        # Get metadata from custom extension or create it ourselves
        if not (span.has_extension("conll_metadata") and span._.conll_metadata):
            span._.conll_metadata = f"# sent_id = {span_idx}\n# text = {span.text}\n"
        span_conll_str += span._.conll_metadata

    # CoNLL-U token IDs are 1-based within each sentence.
    for token_idx, token in enumerate(span, 1):
        self._set_token_conll(token, token_idx)

    # Aggregate the per-token results onto the span itself.
    span._.set(self.ext_names["conll"], [t._.get(self.ext_names["conll"]) for t in span])
    span_conll_str += "".join(
        [t._.get(self.ext_names["conll_str"]) for t in span])
    span._.set(self.ext_names["conll_str"], span_conll_str)

    # Optionally expose the token dicts as a pandas DataFrame (one row per token).
    if PD_AVAILABLE and not self.disable_pandas:
        span._.set(
            self.ext_names["conll_pd"],
            pd.DataFrame([t._.get(self.ext_names["conll"]) for t in span]),
        )
def __init__(self, nlp, ent_types=None):
    """Build the negation PhraseMatcher over hard-coded English termsets.

    Args:
        nlp: spaCy Language object; its tokenizer builds the phrase patterns.
        ent_types: optional list of entity labels negation should apply to.
            Defaults to an empty list (apply to all types).

    Fix vs. original: the mutable default argument ``ent_types=[]`` (shared
    across all instances) is replaced with a ``None`` sentinel.
    """
    if not Span.has_extension("negex"):
        Span.set_extension("negex", default=False, force=True)
    # Phrases that look like negations but should not negate (pseudo-negations).
    psuedo_negations = [
        "gram negative",
        "no further",
        "not able to be",
        "not certain if",
        "not certain whether",
        "not necessarily",
        "not rule out",
        "not ruled out",
        "not been ruled out",
        "without any further",
        "without difficulty",
        "without further",
    ]
    # Negation cues that appear before the entity they negate.
    preceeding_negations = [
        "absence of",
        "declined",
        "denied",
        "denies",
        "denying",
        "did not exhibit",
        "no sign of",
        "no signs of",
        "not",
        "not demonstrate",
        "patient was not",
        "rules out",
        "doubt",
        "negative for",
        "no",
        "no cause of",
        "no complaints of",
        "no evidence of",
        "versus",
        "without",
        "without indication of",
        "without sign of",
        "without signs of",
        "ruled out",
    ]
    # Negation cues that appear after the entity.
    following_negations = ["declined", "unlikely"]
    # Words that terminate a negation's scope within a sentence.
    termination = ["but", "however"]

    # Efficiently build spaCy matcher patterns via the tokenizer pipe.
    psuedo_patterns = list(nlp.tokenizer.pipe(psuedo_negations))
    preceeding_patterns = list(nlp.tokenizer.pipe(preceeding_negations))
    following_patterns = list(nlp.tokenizer.pipe(following_negations))
    termination_patterns = list(nlp.tokenizer.pipe(termination))

    # Case-insensitive phrase matching; match-key strings are part of the
    # component's behavior, so the (misspelled) keys are kept as-is.
    self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    self.matcher.add("Psuedo", None, *psuedo_patterns)
    self.matcher.add("Preceeding", None, *preceeding_patterns)
    self.matcher.add("Following", None, *following_patterns)
    self.matcher.add("Termination", None, *termination_patterns)
    self.keys = [k for k in self.matcher._docs.keys()]
    self.ent_types = ent_types if ent_types is not None else []
def __init__(
    self,
    nlp,
    language="en_clinical",
    ent_types=None,
    extension_name="negex",
    pseudo_negations=None,
    preceding_negations=None,
    following_negations=None,
    termination=None,
    chunk_prefix=None,
):
    """Initialise the Negex component for the given language termset.

    Args:
        nlp: spaCy Language object.
        language: key into the ``LANGUAGES`` termset registry.
        ent_types: optional entity labels to restrict negation to (default: all).
        extension_name: name of the boolean Span extension to register.
        pseudo_negations / preceding_negations / following_negations / termination:
            optional term lists overriding the language termset.
        chunk_prefix: optional phrases used for noun-chunk prefix matching.

    Raises:
        KeyError: if the language or a required termset entry is missing.

    Fixes vs. original: mutable ``list()`` defaults replaced with ``None``
    sentinels; ``not x in y`` rewritten as ``x not in y``; the four
    copy-pasted termset fallbacks are factored into one helper.
    """
    if language not in LANGUAGES:
        raise KeyError(f"{language} not found in languages termset. "
                       "Ensure this is a supported language or specify "
                       "your own termsets when initializing Negex.")
    termsets = LANGUAGES[language]
    if not Span.has_extension(extension_name):
        Span.set_extension(extension_name, default=False, force=True)

    def _resolve(overrides, key):
        # Caller-supplied terms win; otherwise fall back to the language termset.
        if overrides:
            return overrides
        if key not in termsets:
            raise KeyError(f"{key} not specified for this language.")
        return termsets[key]

    pseudo_negations = _resolve(pseudo_negations, "pseudo_negations")
    preceding_negations = _resolve(preceding_negations, "preceding_negations")
    following_negations = _resolve(following_negations, "following_negations")
    termination = _resolve(termination, "termination")

    # efficiently build spaCy matcher patterns
    self.pseudo_patterns = list(nlp.tokenizer.pipe(pseudo_negations))
    self.preceding_patterns = list(nlp.tokenizer.pipe(preceding_negations))
    self.following_patterns = list(nlp.tokenizer.pipe(following_negations))
    self.termination_patterns = list(nlp.tokenizer.pipe(termination))

    self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    self.matcher.add("pseudo", None, *self.pseudo_patterns)
    self.matcher.add("Preceding", None, *self.preceding_patterns)
    self.matcher.add("Following", None, *self.following_patterns)
    self.matcher.add("Termination", None, *self.termination_patterns)
    self.nlp = nlp
    self.ent_types = ent_types if ent_types is not None else []
    self.extension_name = extension_name
    self.chunk_prefix = list(nlp.tokenizer.pipe(chunk_prefix or []))
def __init__(self, nlp: Language, use_pos: bool):
    """Initialise components"""
    self.use_pos = use_pos
    # The same getter serves both document- and span-level proportions.
    for holder in (Doc, Span):
        if not holder.has_extension("pos_proportions"):
            holder.set_extension("pos_proportions", getter=self.pos_proportions)
def __init__(
    self,
    nlp,
    language="en",
    ent_types=None,
    pseudo_negations=None,
    preceding_negations=None,
    following_negations=None,
    termination=None,
    chunk_prefix=None,
):
    """Initialise the Negex component for the given language.

    Args:
        nlp: spaCy Language object.
        language: key into the ``LANGUAGES`` termset registry.
        ent_types: optional entity labels to restrict negation to (default: all).
        pseudo_negations / preceding_negations / following_negations / termination:
            optional term lists overriding the language termset.
        chunk_prefix: optional phrases used for noun-chunk prefix matching.

    Raises:
        KeyError: if the language or a required termset entry is missing.

    Fixes vs. original: mutable ``list()`` defaults replaced with ``None``
    sentinels; ``not x in y`` rewritten as ``x not in y``; the four
    copy-pasted termset fallbacks are factored into one helper.
    """
    if language not in LANGUAGES:
        raise KeyError("Language not found")
    termsets = LANGUAGES[language]
    if not Span.has_extension("negex"):
        Span.set_extension("negex", default=False, force=True)

    def _resolve(overrides, key):
        # Caller-supplied terms win; otherwise fall back to the language termset.
        if overrides:
            return overrides
        if key not in termsets:
            raise KeyError(f"{key} not specified for this language.")
        return termsets[key]

    pseudo_negations = _resolve(pseudo_negations, "pseudo_negations")
    preceding_negations = _resolve(preceding_negations, "preceding_negations")
    following_negations = _resolve(following_negations, "following_negations")
    termination = _resolve(termination, "termination")

    # build spaCy matcher patterns
    self.pseudo_patterns = list(nlp.tokenizer.pipe(pseudo_negations))
    self.preceding_patterns = list(nlp.tokenizer.pipe(preceding_negations))
    self.following_patterns = list(nlp.tokenizer.pipe(following_negations))
    self.termination_patterns = list(nlp.tokenizer.pipe(termination))

    self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    self.matcher.add("pseudo", None, *self.pseudo_patterns)
    self.matcher.add("Preceding", None, *self.preceding_patterns)
    self.matcher.add("Following", None, *self.following_patterns)
    self.matcher.add("Termination", None, *self.termination_patterns)
    self.nlp = nlp
    self.ent_types = ent_types if ent_types is not None else []
    self.chunk_prefix = list(nlp.tokenizer.pipe(chunk_prefix or []))
def __init__(
    self,
    nlp,
    language="en_clinical",
    ent_types=None,
    extension_name="negex",
    pseudo_negations=None,
    preceding_negations=None,
    following_negations=None,
    termination=None,
    chunk_prefix=None,
):
    """Initialise the Negex component, storing resolved termsets on the instance.

    Args:
        nlp: spaCy Language object.
        language: key into the ``LANGUAGES`` termset registry.
        ent_types: optional entity labels to restrict negation to (default: all).
        extension_name: name of the boolean Span extension to register.
        pseudo_negations / preceding_negations / following_negations / termination:
            optional term lists overriding the language termset.
        chunk_prefix: optional phrases used for noun-chunk prefix matching.

    Raises:
        KeyError: if the language or a required termset entry is missing.

    Fixes vs. original: mutable ``list()`` defaults replaced with ``None``
    sentinels; ``not x in y`` rewritten as ``x not in y``; the four
    copy-pasted termset fallbacks are factored into one helper.
    """
    if language not in LANGUAGES:
        raise KeyError(f"{language} not found in languages termset. "
                       "Ensure this is a supported language or specify "
                       "your own termsets when initializing Negex.")
    termsets = LANGUAGES[language]
    if not Span.has_extension(extension_name):
        Span.set_extension(extension_name, default=False, force=True)

    def _resolve(overrides, key):
        # Caller-supplied terms win; otherwise fall back to the language termset.
        if overrides:
            return overrides
        if key not in termsets:
            raise KeyError(f"{key} not specified for this language.")
        return termsets[key]

    self.pseudo_negations = _resolve(pseudo_negations, "pseudo_negations")
    self.preceding_negations = _resolve(preceding_negations, "preceding_negations")
    self.following_negations = _resolve(following_negations, "following_negations")
    self.termination = _resolve(termination, "termination")

    self.nlp = nlp
    self.ent_types = ent_types if ent_types is not None else []
    self.extension_name = extension_name
    # build_patterns() reads the termset attributes assigned above.
    self.build_patterns()
    self.chunk_prefix = list(nlp.tokenizer.pipe(chunk_prefix or []))
def __init__(self, nlp, label, regexp: Regexp):
    """Store the regexp and resolve *label* to its hash in the vocab string store."""
    strings = nlp.vocab.strings
    # Interning the label ensures the hash lookup below succeeds.
    if label not in strings:
        strings.add(label)
    self.regexp = regexp
    self.label = strings[label]
    if not Span.has_extension('regexp_match'):
        Span.set_extension('regexp_match', default='')
def __init__(self, nlp: Language):
    """Initialise components"""
    # Each level of the document hierarchy gets its own getter.
    getters = (
        (Token, self.token_dependency),
        (Span, self.span_dependency),
        (Doc, self.doc_dependency),
    )
    for holder, getter in getters:
        if not holder.has_extension("dependency_distance"):
            holder.set_extension("dependency_distance", getter=getter)
def __init__(self, nlp, patterns="default", add_attrs=False, max_scope=None):
    """Create a Sectionizer component.

    Args:
        nlp: a spaCy Language object.
        patterns: "default" to load the bundled jsonl rules, a list of pattern
            dicts, a path to a jsonl file, or None to add none.
        add_attrs: False (no attributes), True (use DEFAULT_ATTRS), or a dict
            mapping section names to {attribute: value} dicts. Each attribute
            must already be registered on Span.
        max_scope: optional maximum number of tokens in a section.

    Raises:
        FileNotFoundError: if a patterns file cannot be found.
        ValueError: if add_attrs is malformed or references unregistered extensions.

    Fixes vs. original: the "Custom extension {0}..." ValueError was raised
    without calling .format(attr_name), so the placeholder was never filled;
    a bare `assert os.path.exists(...)` (stripped under -O) is replaced with
    an explicit FileNotFoundError.
    """
    self.nlp = nlp
    self.add_attrs = add_attrs
    self.matcher = Matcher(nlp.vocab)
    self.max_scope = max_scope
    self.phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    self.assertion_attributes_mapping = None
    self._patterns = []
    self._section_titles = set()

    if patterns is not None:
        if patterns == "default":
            import os
            if not os.path.exists(DEFAULT_RULES_FILEPATH):
                raise FileNotFoundError(
                    "The expected location of the default patterns file cannot be found. Please either "
                    "add patterns manually or add a jsonl file to the following location: ",
                    DEFAULT_RULES_FILEPATH)
            self.add(self.load_patterns_from_jsonl(DEFAULT_RULES_FILEPATH))
        # If a list, add each of the patterns in the list
        elif isinstance(patterns, list):
            self.add(patterns)
        elif isinstance(patterns, str):
            import os
            if not os.path.exists(patterns):
                raise FileNotFoundError(
                    "Could not find patterns file: {0}".format(patterns))
            self.add(self.load_patterns_from_jsonl(patterns))

    if add_attrs is False:
        self.add_attrs = False
    elif add_attrs is True:
        self.assertion_attributes_mapping = DEFAULT_ATTRS
        self.register_default_attributes()
    elif isinstance(add_attrs, dict):
        # Check that each of the attributes being added has been set
        for modifier in add_attrs.keys():
            attr_dict = add_attrs[modifier]
            for attr_name in attr_dict:
                if not Span.has_extension(attr_name):
                    raise ValueError(
                        "Custom extension {0} has not been set. Call Span.set_extension."
                        .format(attr_name))
        self.add_attrs = True
        self.assertion_attributes_mapping = add_attrs
    else:
        raise ValueError(
            "add_attrs must be either True (default), False, or a dictionary, not {0}"
            .format(add_attrs))
def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kwargs):
    """Instantiate SpacyQuickUMLS object

    This creates a QuickUMLS spaCy component which can be used in modular pipelines.
    This module adds entity Spans to the document where the entity label is the UMLS CUI
    and the Span's "underscore" object is extended to contain "similarity" and "semtypes"
    for matched concepts.  Note that this implementation follows and enforces a known
    spacy convention that entity Spans cannot overlap on a single token.

    Args:
        nlp: Existing spaCy pipeline.  This is needed to update the vocabulary with
            UMLS CUI values
        quickumls_fp (str): Path to QuickUMLS data
        best_match (bool, optional): Whether to return only the top match or all
            overlapping candidates. Defaults to True.
        ignore_syntax (bool, optional): Whether to use the heuristics introduced in the
            paper (Soldaini and Goharian, 2016). Defaults to False
        **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py)
    """
    self.quickumls = QuickUMLS(
        quickumls_fp,
        # By default, the QuickUMLS object creates its own internal spaCy pipeline,
        # but this is not needed when we're using it as a component in a pipeline
        spacy_component=True,
        **kwargs)

    # save this off so that we can get vocab values of labels later
    self.nlp = nlp

    # keep these for matching
    self.best_match = best_match
    self.ignore_syntax = ignore_syntax

    # Register the Span extensions populated per match.
    if not Span.has_extension("similarity"):
        Span.set_extension('similarity', default=-1.0)
    # NOTE(review): default=-1.0 for "semtypes" looks copy-pasted from
    # "similarity" — semtypes presumably holds a collection, not a float;
    # confirm against where the extension is assigned.
    if not Span.has_extension("semtypes"):
        Span.set_extension('semtypes', default=-1.0)
def _set_extensions(self):
    """Register the default CoNLL extensions wherever they are still missing."""
    # Configurable extension names shared by Doc, Span and Token.
    shared_keys = ["conll_str", "conll"]
    if PD_AVAILABLE and not self.disable_pandas:
        shared_keys.append("conll_pd")
    for holder in (Doc, Span, Token):
        for key in shared_keys:
            ext_name = self.ext_names[key]
            if not holder.has_extension(ext_name):
                holder.set_extension(ext_name, default=None)

    # Fields from the CoNLL-U format that have no spaCy equivalent; ConllParser
    # may fill these in when reading CoNLL strings back into spaCy objects.
    for token_field in ("conll_deps_graphs_field", "conll_misc_field"):
        if not Token.has_extension(token_field):
            Token.set_extension(token_field, default="_")
    if not Span.has_extension("conll_metadata"):
        Span.set_extension("conll_metadata", default=None)
def __init__(
    self,
    nlp: Language,
    name: str,
    neg_termset: dict,
    ent_types: list,
    extension_name: str,
    chunk_prefix: list,
):
    """Initialise the Negex component from an explicit termset dictionary.

    ``neg_termset`` must contain exactly the four expected term lists; language
    lookup was removed from this version in favour of caller-supplied termsets.
    """
    if not Span.has_extension(extension_name):
        Span.set_extension(extension_name, default=False, force=True)

    ts = neg_termset
    expected_keys = [
        "pseudo_negations",
        "preceding_negations",
        "following_negations",
        "termination",
    ]
    # The termset must match the expected schema exactly — no extras, no gaps.
    if not set(ts.keys()) == set(expected_keys):
        raise KeyError(
            f"Unexpected or missing keys in 'neg_termset', expected: {expected_keys}, instead got: {list(ts.keys())}"
        )
    # Mirror each termset entry onto an attribute of the same name
    # (self.pseudo_negations, self.preceding_negations, ...).
    for key in expected_keys:
        setattr(self, key, ts[key])

    self.nlp = nlp
    self.ent_types = ent_types
    self.extension_name = extension_name
    self.build_patterns()
    self.chunk_prefix = list(nlp.tokenizer.pipe(chunk_prefix))
def __init__(self):
    """Register transcript-alignment extensions and build the spaCy pipelines."""
    # register Token attributes if they are not registered already
    from spacy.tokens import Token
    for attr_name in [
        "speaker", "start_time", "end_time", "confidence", "entity_linking",
        "addressee"
    ]:
        if not Token.has_extension(attr_name):
            Token.set_extension(attr_name, default=None)

    # register Span attributes if they are not registered already;
    # each is derived from the contained tokens via a getter method.
    from spacy.tokens import Span
    if not Span.has_extension("speaker"):
        Span.set_extension("speaker", getter=self.span_speaker)
    if not Span.has_extension("start_time"):
        Span.set_extension("start_time", getter=self.span_start_time)
    if not Span.has_extension("end_time"):
        Span.set_extension("end_time", getter=self.span_end_time)
    if not Span.has_extension("confidence"):
        Span.set_extension("confidence", getter=self.span_average_confidence)
    if not Span.has_extension("entity_linking"):
        Span.set_extension("entity_linking", getter=self.span_entity_linking)
    if not Span.has_extension("addressee"):
        Span.set_extension("addressee", getter=self.span_addressee)

    # minimalist spaCy pipeline (used only for its tokenizer)
    self.tokenizer = spacy.load("en_core_web_sm",
                                disable=["tagger", "parser", "ner"])

    # custom spaCy pipeline (that adds forced alignment attributes and ensures
    # that a new sentence starts at every speaker change)
    # NOTE(review): passing a callable to add_pipe is spaCy v2 API; v3 expects
    # a registered component name — confirm the pinned spaCy version.
    self.nlp = spacy.load("en_core_web_sm")
    self.nlp.add_pipe(self.placeholder, name="forced_alignment", first=True)
    self.nlp.add_pipe(self.start_sentence_at_speaker_change,
                      after="forced_alignment")
def __init__(self, nlp, keywords, label, tokentag, doctag=None, spantag=None):
    """Phrase-match *keywords* and expose match flags via custom extensions.

    Args:
        nlp: spaCy Language object.
        keywords: iterable of keyword strings to match.
        label: label string to intern in the vocab (hash stored on self.label).
        tokentag: name of the per-token boolean extension and the matcher key.
        doctag / spantag: names of the Doc/Span extensions whose getters report
            whether any contained token matched.

    Fix vs. original: every ``set_extension`` call is now guarded by
    ``has_extension`` — the original guarded only ``is_keyword``, so creating
    a second instance raised "Extension ... already exists" for the others.
    """
    nlp.vocab.strings.add(label)
    self.label = nlp.vocab.strings[label]
    self._label_str = label
    self._token_tag = tokentag
    self._doctag = doctag
    self._spantag = spantag
    self._keywordtag = "is_keyword"
    self._labeltag = "label_"

    # Set up the PhraseMatcher – it can take Doc objects as patterns,
    # so even if the list of keywords is long, it's very efficient.
    patterns = [nlp(key) for key in keywords]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add(self._token_tag, None, *patterns)

    # Register attributes on the Token. We'll be overwriting these based on
    # the matches, so we're only setting default values, not getters.
    if not Token.has_extension(self._token_tag):
        Token.set_extension(self._token_tag, default=False)
    if not Token.has_extension(self._keywordtag):
        Token.set_extension(self._keywordtag, default=False)
    if not Token.has_extension(self._labeltag):
        Token.set_extension(self._labeltag, default=None)

    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens has the token tag set to True.
    if not Doc.has_extension(self._doctag):
        Doc.set_extension(self._doctag, getter=lambda tokens: any(
            t._.get(self._token_tag) for t in tokens))
    if not Span.has_extension(self._spantag):
        Span.set_extension(self._spantag, getter=lambda tokens: any(
            t._.get(self._token_tag) for t in tokens))
    if not Span.has_extension("dep_"):
        Span.set_extension("dep_", default="")
    if not Span.has_extension("head_"):
        Span.set_extension("head_", default=None)
def __init__(
    self,
    nlp,
    rules="default",
    add_attrs=False,
    max_scope=None,
    include_header=False,
    phrase_matcher_attr="LOWER",
    require_start_line=False,
    require_end_line=False,
    newline_pattern=r"[\n\r]+[\s]*$",
):
    """Create a new Sectionizer component.

    The sectionizer will search for spans in the text which match section header rules,
    such as 'Past Medical History:'. Sections will be represented in custom attributes as:
        category (str): A normalized title of the section. Example: 'past_medical_history'
        section_title (Span): The Span of the doc which was matched as a section header.
            Example: 'Past Medical History:'
        section_span (Span): The entire section of the note, starting with section_header
            and up until the end of the section, which will be either the start of the next
            section header of some pre-specified scope.
            Example: 'Past Medical History: Type II DM'

    Section attributes will be registered for each Doc, Span, and Token in the following
    attributes:
        Doc._.sections: A list of namedtuples of type Section with 4 elements:
            - section_title
            - section_header
            - section_parent
            - section_span.
        A Doc will also have attributes corresponding to lists of each
        (ie., Doc._.section_titles, Doc._.section_headers, Doc._.section_parents,
        Doc._.section_list)
        (Span|Token)._.section_title
        (Span|Token)._.section_header
        (Span|Token)._.section_parent
        (Span|Token)._.section_span

    Args:
        nlp: A SpaCy language model object
        rules (str, list, or None): Where to read rules from. Default is "default", which
            will load the default rules provided by medSpaCy, which are derived from
            MIMIC-II. If a list, should be a list of pattern dicts following these
            conventional spaCy formats:
                [
                    {"section_title": "past_medical_history", "pattern": "Past Medical History:"},
                    {"section_title": "problem_list", "pattern": [{"TEXT": "PROBLEM"}, {"TEXT": "LIST"}, {"TEXT": ":"}]}
                ]
            If a string other than "default", should be a path to a jsonl file
            containing rules.
        add_attrs: False (no attributes), True (use DEFAULT_ATTRS), or a dict mapping
            section names to {attribute: value} dicts. Each attribute must already be
            registered on Span.
        max_scope (None or int): Optional argument specifying the maximum number of
            tokens following a section header which can be included in a section.
            This can be useful if you think your section rules are incomplete and
            want to prevent sections from running too long in the note. Default is
            None, meaning that the scope of a section will be until either the next
            section header or the end of the document.
        include_header (bool): whether the section title is included in the section text
        phrase_matcher_attr (str): The name of the token attribute which will be used by
            the PhraseMatcher for any rules with a "pattern" value of a string.
        require_start_line (bool): Optionally require a section header to start on a new
            line. Default False.
        require_end_line (bool): Optionally require a section header to end with a new
            line. Default False.
        newline_pattern (str): Regular expression to match the new line either preceding
            or following a header if either require_start_line or require_end_line are True.

    Fixes vs. original: the no-op statement ``path.exists(rules)`` (result was
    discarded) now raises FileNotFoundError when the file is missing; the
    "Custom extension {0}..." error now actually formats in the attribute name;
    the docstring documented a nonexistent ``include_title`` parameter.
    """
    self.nlp = nlp
    self.add_attrs = add_attrs
    self.matcher = MedspacyMatcher(nlp, phrase_matcher_attr=phrase_matcher_attr)
    self.max_scope = max_scope
    self.require_start_line = require_start_line
    self.require_end_line = require_end_line
    self.newline_pattern = re.compile(newline_pattern)
    self.assertion_attributes_mapping = None
    self._parent_sections = {}
    self._parent_required = {}
    self._rule_item_mapping = self.matcher._rule_item_mapping
    self._rules = []
    self._section_categories = set()
    self.include_header = include_header

    if rules is not None:
        if rules == "default":
            import os
            if not os.path.exists(DEFAULT_RULES_FILEPATH):
                raise FileNotFoundError(
                    "The expected location of the default rules file cannot be found. Please either "
                    "add rules manually or add a jsonl file to the following location: ",
                    DEFAULT_RULES_FILEPATH,
                )
            self.add(SectionRule.from_json(DEFAULT_RULES_FILEPATH))
        # If a list, add each of the rules in the list
        elif isinstance(rules, list):
            self.add(rules)
        elif isinstance(rules, str):
            # BUG FIX: the original evaluated `path.exists(rules)` and threw
            # the result away; fail loudly when the rules file is missing.
            if not path.exists(rules):
                raise FileNotFoundError(
                    "Could not find rules file: {0}".format(rules))
            self.add(SectionRule.from_json(rules))

    if add_attrs is False:
        self.add_attrs = False
    elif add_attrs is True:
        self.assertion_attributes_mapping = DEFAULT_ATTRS
        self.register_default_attributes()
    elif isinstance(add_attrs, dict):
        # Check that each of the attributes being added has been set
        for modifier in add_attrs.keys():
            attr_dict = add_attrs[modifier]
            for attr_name in attr_dict:
                if not Span.has_extension(attr_name):
                    raise ValueError(
                        "Custom extension {0} has not been set. Call Span.set_extension."
                        .format(attr_name))
        self.add_attrs = True
        self.assertion_attributes_mapping = add_attrs
    else:
        raise ValueError(
            "add_attrs must be either True (default), False, or a dictionary, not {0}"
            .format(add_attrs))
def __init__(
    self,
    nlp,
    targets="ents",
    add_attrs=True,
    phrase_matcher_attr="LOWER",
    rules="default",
    rule_list=None,
    allowed_types=None,
    excluded_types=None,
    use_context_window=False,
    max_scope=None,
    max_targets=None,
    terminations=None,
    prune=True,
    remove_overlapping_modifiers=False,
):
    """Create a new ConTextComponent algorithm.

    This component matches modifiers in a Doc, defines their scope, and identifies edges
    between targets and modifiers. Sets two spaCy extensions:
        - Span._.modifiers: a list of TagObject objects which modify a target Span
        - Doc._.context_graph: a ConText graph object which contains the targets,
            modifiers, and edges between them.

    Args:
        nlp: a spaCy NLP model
        targets: the attribute of Doc which contains targets. Default is "ents", in which
            case it will use the standard Doc.ents attribute. Otherwise will look for a
            custom attribute in Doc._.{targets}
        add_attrs: Whether or not to add the additional spaCy Span attributes (ie.,
            Span._.x) defining assertion on the targets. By default, these are:
            - is_negated: True if a target is modified by 'NEGATED_EXISTENCE', default False
            - is_uncertain: True if a target is modified by 'POSSIBLE_EXISTENCE', default False
            - is_historical: True if a target is modified by 'HISTORICAL', default False
            - is_hypothetical: True if a target is modified by 'HYPOTHETICAL', default False
            - is_family: True if a target is modified by 'FAMILY', default False
            In the future, these should be made customizable.
        phrase_matcher_attr: The token attribute to be used by the underlying
            PhraseMatcher. If "LOWER", then the matching of modifiers with a "literal"
            string will be case-insensitive. If "TEXT" or "ORTH", it will be
            case-sensitive. Default "LOWER'.
        prune: Whether or not to prune modifiers which are substrings of another modifier.
            For example, if "no history of" and "history of" are both ConTextItems, both
            will match the text "no history of afib", but only "no history of" should
            modify afib. If True, will drop shorter substrings completely. Default True.
        remove_overlapping_modifiers: Whether or not to remove any matched modifiers
            which overlap with target entities. If False, any overlapping modifiers will
            not modify the overlapping entity but will still modify any other targets in
            its scope. Default False.
        rules: Which rules to load on initialization. Default is 'default'.
            - 'default': Load the default set of rules provided with cyConText
            - 'other': Load a custom set of rules, please also set rule_list with a file
                path or list.
            - None: Load no rules.
        rule_list: The location of rules in json format or a list of ContextItems.
            Default is None.
        allowed_types (set or None): A set of target labels to allow a ConTextItem to
            modify. If None, will apply to any type not specifically excluded in
            excluded_types. Only one of allowed_types and excluded_types can be used. An
            error will be thrown if both or not None. If this attribute is also defined
            in the ConTextItem, it will keep that value. Otherwise it will inherit this
            value.
        excluded_types (set or None): A set of target labels which this modifier cannot
            modify. If None, will apply to all target types unless allowed_types is not
            None. If this attribute is also defined in the ConTextItem, it will keep that
            value. Otherwise it will inherit this value.
        max_targets (int or None): The maximum number of targets which a modifier can
            modify. If None, will modify all targets in its scope. If this attribute is
            also defined in the ConTextItem, it will keep that value. Otherwise it will
            inherit this value.
        use_context_window (bool): Whether to use a specified range around a target to
            check for modifiers rather than split sentence boundaries. This can be useful
            for quicker processing by skipping sentence splitting or errors caused by
            poorly defined sentence boundaries. If True, max_scope must be an integer
            greater than 0.
        max_scope (int or None): A number to explicitly limit the size of the modifier's
            scope. If this attribute is also defined in the ConTextItem, it will keep
            that value. Otherwise it will inherit this value.
        terminations (dict or None): Optional mapping between different categories which
            will cause one modifier type to be 'terminated' by another type. For example,
            if given a mapping:
                {"POSITIVE_EXISTENCE": {"NEGATED_EXISTENCE", "UNCERTAIN"},
                "NEGATED_EXISTENCE": {"FUTURE"},
                }
            all modifiers of type "POSITIVE_EXISTENCE" will be terminated by
            "NEGATED_EXISTENCE" or "UNCERTAIN" modifiers, and all "NEGATED_EXISTENCE"
            modifiers will be terminated by "FUTURE". This can also be defined for
            specific ConTextItems in the `terminated_by` attribute.

    Returns:
        context: a ConTextComponent

    Raises:
        ValueError: if one of the parameters is incorrectly formatted.
    """
    self.nlp = nlp
    if targets != "ents":
        raise NotImplementedError()
    self._target_attr = targets
    self.prune = prune
    self.remove_overlapping_modifiers = remove_overlapping_modifiers

    self._item_data = []
    self._i = 0
    self._categories = set()

    # _modifier_item_mapping: A mapping from spaCy Matcher match_ids to ConTextItem
    # This allows us to use spaCy Matchers while still linking back to the ConTextItem
    # To get the rule and category
    self._modifier_item_mapping = dict()
    self.phrase_matcher = PhraseMatcher(
        nlp.vocab, attr=phrase_matcher_attr, validate=True
    )  # TODO: match on custom attributes
    self.matcher = Matcher(nlp.vocab, validate=True)

    self.register_graph_attributes()
    if add_attrs is False:
        self.add_attrs = False
    elif add_attrs is True:
        self.add_attrs = True
        self.context_attributes_mapping = DEFAULT_ATTRS
        self.register_default_attributes()
    elif isinstance(add_attrs, dict):
        # Check that each of the attributes being added has been set
        for modifier in add_attrs.keys():
            attr_dict = add_attrs[modifier]
            for attr_name, attr_value in attr_dict.items():
                if not Span.has_extension(attr_name):
                    raise ValueError(
                        "Custom extension {0} has not been set. Call Span.set_extension.".format(
                            attr_name
                        )
                    )

        self.add_attrs = True
        self.context_attributes_mapping = add_attrs

    else:
        raise ValueError(
            "add_attrs must be either True (default), False, or a dictionary, not {0}".format(
                add_attrs
            )
        )
    # A context window requires an explicit, positive integer scope.
    if use_context_window is True:
        if not isinstance(max_scope, int) or max_scope < 1:
            raise ValueError(
                "If 'use_context_window' is True, 'max_scope' must be an integer greater 1, "
                "not {0}".format(max_scope)
            )
    self.use_context_window = use_context_window
    if max_scope is not None and (
        not isinstance(max_scope, int) or max_scope < 1
    ):
        raise ValueError(
            "'max_scope' must be None or an integer greater 1, "
            "not {0}".format(max_scope)
        )
    self.max_scope = max_scope
    self.allowed_types = allowed_types
    self.excluded_types = excluded_types
    self.max_targets = max_targets

    if terminations is None:
        terminations = dict()
    # Termination categories are normalized to upper case for lookup.
    self.terminations = {k.upper(): v for (k, v) in terminations.items()}

    if rules == "default":
        item_data = ConTextItem.from_json(DEFAULT_RULES_FILEPATH)
        self.add(item_data)

    elif rules == "other":
        # use custom rules
        if isinstance(rule_list, str):
            # if rules_list is a string, then it must be a path to a json
            if "yaml" in rule_list or "yml" in rule_list:
                # NOTE(review): bare `except:` hides the real failure (even
                # KeyboardInterrupt) — consider `except Exception as e` and
                # chaining with `from e`. Also, the yaml-loaded rules are
                # never passed to self.add() in this branch — verify intent.
                try:
                    rule_list = ConTextItem.from_yaml(rule_list)
                except:
                    raise ValueError(
                        "rule list {0} could not be read".format(rule_list)
                    )
            elif path.exists(rule_list):
                item_data = ConTextItem.from_json(rule_list)
                self.add(item_data)
            else:
                raise ValueError(
                    "rule_list must be a valid path. Currently is: {0}".format(
                        rule_list
                    )
                )

        elif isinstance(rule_list, list):
            # otherwise it is a list of contextitems
            if not rule_list:
                raise ValueError("rule_list must not be empty.")
            for item in rule_list:
                # check that all items are contextitems
                if not isinstance(item, ConTextItem):
                    raise ValueError(
                        "rule_list must contain only ContextItems. Currently contains: {0}".format(
                            type(item)
                        )
                    )
            self.add(rule_list)

        else:
            raise ValueError(
                "rule_list must be a valid path or list of ContextItems. Currenty is: {0}".format(
                    type(rule_list)
                )
            )
    elif not rules:
        # otherwise leave the list empty.
        # do nothing
        self._item_data = []

    else:
        # loading from json path or list is possible later
        raise ValueError(
            "rules must either be 'default' (default), 'other' or None."
        )
def parse_conll_text_as_spacy(
    self,
    text: str,
    ner_tag_pattern: str = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$",
    ner_map: Dict[str, str] = None,
) -> Doc:
    """Parses a given CoNLL-U string into a spaCy doc. Parsed sentence sections must be
    separated by a new line (\n\n). Note that we do our best to retain as much information
    as possible but that not all CoNLL-U fields are supported in spaCy. We add a
    Token._.conll_misc_field extension to save the CoNLL-U MISC field, and a
    Token._.conll_deps_graphs_field extension to save the CoNLL-U DEPS field. The metadata
    (lines starting with #) is saved in Span._.conll_metadata of sentence Spans.

    This method has been adapted from the work by spaCy.
    See: https://github.com/explosion/spaCy/blob/a1c5b694be117ac92e21f9860309821ad6da06f7/spacy/cli/converters/conllu2json.py#L179

    Multi-word tokens and empty nodes are not supported.

    :param text: CoNLL-U formatted text
    :param ner_tag_pattern: Regex pattern for entity tag in the MISC field
    :param ner_map: Map old NER tag names to new ones, '' maps to O
    :return: a spacy Doc containing all the tokens and sentences from the CoNLL file
        including the custom CoNLL extensions
    """
    # Make sure the custom extensions exist even if _set_extensions never ran.
    if not Token.has_extension("conll_misc_field"):
        Token.set_extension("conll_misc_field", default="_")
    if not Token.has_extension("conll_deps_graphs_field"):
        Token.set_extension("conll_deps_graphs_field", default="_")
    if not Span.has_extension("conll_metadata"):
        Span.set_extension("conll_metadata", default=None)

    docs = []
    # Each blank-line-separated chunk is one sentence.
    for chunk in text.split("\n\n"):
        # Token lines only; comment (metadata) lines are collected separately below.
        lines = [
            l for l in chunk.splitlines() if l and not l.startswith("#")
        ]
        words, spaces, tags, poses, morphs, lemmas, miscs = [], [], [], [], [], [], []
        heads, deps, deps_graphs = [], [], []
        for i in range(len(lines)):
            line = lines[i]
            parts = line.split("\t")
            if any(not p for p in parts):
                raise ValueError(
                    "According to the CoNLL-U Format, fields cannot be empty. See"
                    " https://universaldependencies.org/format.html")
            # The ten CoNLL-U columns, in order.
            id_, word, lemma, pos, tag, morph, head, dep, deps_graph, misc = parts
            if any(" " in f for f in (id_, pos, tag, morph, head, dep, deps_graph)):
                raise ValueError(
                    "According to the CoNLL-U Format, only FORM, LEMMA, and MISC fields can contain"
                    " spaces. See https://universaldependencies.org/format.html"
                )
            # IDs like "1-2" (multi-word token) or "1.1" (empty node) are unsupported.
            if "." in id_ or "-" in id_:
                raise NotImplementedError(
                    "Multi-word tokens and empty nodes are not supported in spacy_conll"
                )
            words.append(word)
            if "SpaceAfter=No" in misc:
                spaces.append(False)
            else:
                spaces.append(True)
            # Convert 1-based CoNLL-U index to spaCy's 0-based token index.
            id_ = int(id_) - 1
            lemmas.append(lemma)
            poses.append(pos)
            tags.append(pos if tag == "_" else tag)
            morphs.append(morph if morph != "_" else "")
            # Root/underscore heads point at the token itself (spaCy convention).
            heads.append((int(head) - 1) if head not in ("0", "_") else id_)
            deps.append("ROOT" if dep == "root" else dep)
            deps_graphs.append(deps_graph)
            miscs.append(misc)

        doc = Doc(
            self.nlp.vocab,
            words=words,
            spaces=spaces,
            tags=tags,
            pos=poses,
            morphs=morphs,
            lemmas=lemmas,
            heads=heads,
            deps=deps,
        )

        # Set custom Token extensions
        for i in range(len(doc)):
            doc[i]._.conll_misc_field = miscs[i]
            doc[i]._.conll_deps_graphs_field = deps_graphs[i]

        ents = get_entities(lines, ner_tag_pattern, ner_map)
        doc.ents = spans_from_biluo_tags(doc, ents)

        # The deprel relations should make this CoNLL chunk exactly one sentence;
        # if DEPREL is empty each word becomes its own sentence, which we reject.
        if len(list(doc.sents)) != 1:
            raise ValueError(
                "Your data is in an unexpected format. Make sure that it follows the CoNLL-U format"
                " requirements. See https://universaldependencies.org/format.html. Particularly make"
                " sure that the DEPREL field is filled in.")

        # Save the metadata in a custom sentence Span attribute so that the formatter can use it
        metadata = "\n".join(
            [l for l in chunk.splitlines() if l.startswith("#")])
        # We really only expect one sentence
        for sent in doc.sents:
            sent._.conll_metadata = f"{metadata}\n" if metadata else ""

        docs.append(doc)

    # Run the formatter pipe so the merged Doc carries the CoNLL custom extensions.
    return self.nlp.get_pipe("conll_formatter")(Doc.from_docs(docs))