def test_string_match(self):
    """A rule given only a string literal matches the section title exactly."""
    szr = Sectionizer(nlp, rules=None)
    szr.add(
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:"))
    doc = nlp("Past Medical History: PE")
    szr(doc)
    first = doc._.sections[0]
    assert first.category == "past_medical_history"
    assert first.title_span.text == "Past Medical History:"
    assert first.section_span.text == "Past Medical History: PE"

#
def test_list_pattern_match(self):
    """A rule given a spaCy token pattern matches the section title."""
    szr = Sectionizer(nlp, rules=None)
    title_pattern = [
        {"LOWER": "past"},
        {"LOWER": "medical"},
        {"LOWER": "history"},
        {"LOWER": ":"},
    ]
    szr.add(
        SectionRule(
            category="past_medical_history",
            literal="past medical history:",
            pattern=title_pattern,
        ))
    doc = nlp("Past Medical History: PE")
    szr(doc)
    first = doc._.sections[0]
    assert first.category == "past_medical_history"
    assert first.title_span.text == "Past Medical History:"
    assert first.section_span.text == "Past Medical History: PE"
def test_duplicate_parent_definitions(self):
    # NOTE(review): a test with this exact name is defined again later in
    # this class, so pytest collects only one of the two definitions and
    # this earlier copy appears to be dead — confirm and remove or rename.
    with warnings.catch_warnings(record=True) as w:
        sectionizer = Sectionizer(nlp, rules=None)
        # "s2" is registered twice with different parents; the sectionizer
        # is expected to emit a warning about the duplicate definition.
        sectionizer.add([
            SectionRule(category="s1", literal="section 1:"),
            SectionRule(category="s2", literal="section 2:", parents=["s1"]),
            SectionRule(category="s2", literal="section 2:", parents=["s3"]),
            SectionRule(category="s3", literal="section 3:"),
        ])
        text = "section 1: abc section 2: abc section 3: abc section 2: abc"
        doc = nlp(text)
        sectionizer(doc)
        assert len(doc._.sections) == 4
        s1 = doc._.sections[0]
        s2 = doc._.sections[1]
        s3 = doc._.sections[2]
        s2_2 = doc._.sections[3]
        # NOTE(review): brittle — this fails if any unrelated warning fires
        # during processing; checking for the specific warning message in
        # the buffer would be more robust.
        assert len(w) == 1
        assert issubclass(w[0].category, RuntimeWarning)
        # Each occurrence of "s2" resolves its parent independently: the
        # first finds "s1" in scope, the second finds "s3".
        assert s1.parent is None
        assert s2.parent.category == "s1"
        assert s3.parent is None
        assert s2_2.parent.category == "s3"
def test_duplicate_parent_definitions(self):
    """Registering the same category twice with different parents emits a
    RuntimeWarning, and each match resolves its parent independently."""
    with warnings.catch_warnings(record=True) as w:
        sectionizer = Sectionizer(nlp, rules=None)
        sectionizer.add([
            SectionRule(category="s1", literal="section 1:"),
            SectionRule(category="s2", literal="section 2:", parents=["s1"]),
            SectionRule(category="s2", literal="section 2:", parents=["s3"]),
            SectionRule(category="s3", literal="section 3:"),
        ])
        text = "section 1: abc section 2: abc section 3: abc section 2: abc"
        doc = nlp(text)
        sectionizer(doc)
        assert len(doc._.sections) == 4
        s1, s2, s3, s2_2 = doc._.sections
        # Don't assert on the total warning count — unrelated warnings may
        # occur elsewhere. Instead check that the specific duplicate-title
        # RuntimeWarning is present in the buffer.
        # (Removed a leftover debug print() that cluttered test output.)
        assert any(
            warn.category is RuntimeWarning
            and "Duplicate section title" in str(warn.message)
            for warn in w)
        # The first "s2" occurrence resolves to "s1"; the second, appearing
        # after "s3", resolves to "s3".
        assert s1.parent is None
        assert s2.parent.category == "s1"
        assert s3.parent is None
        assert s2_2.parent.category == "s3"
def test_max_scope_none(self):
    """With max_scope=None the section extends to the end of the doc."""
    szr = Sectionizer(nlp, rules=None, max_scope=None)
    pmh_rule = SectionRule(category="past_medical_history",
                           literal="Past Medical History:")
    szr.add(pmh_rule)
    doc = nlp("Past Medical History: This is the sentence.")
    szr(doc)
    # Even the final token is still attributed to the section.
    assert doc[-1]._.section_category == "past_medical_history"
def test_end_line(self):
    """Exercise require_end_line=True with one newline-terminated title and
    one mid-sentence occurrence of the same literal."""
    szr = Sectionizer(nlp, rules=None, require_end_line=True)
    szr.add(
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:"))
    doc = nlp(
        "\n\n Past Medical History:\n The patient has a Past Medical History: this"
    )
    szr(doc)
    assert len(doc._.sections) == 2
def test_max_scope(self):
    """max_scope limits how many tokens past the title belong to a section."""
    szr = Sectionizer(nlp, rules=None, max_scope=2)
    szr.add(
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:"))
    doc = nlp("Past Medical History: This is the sentence.")
    szr(doc)
    first = doc._.sections[0]
    # The first body token falls within the two-token scope...
    assert first.body_span[0]._.section_category == "past_medical_history"
    # ...but a token past the scope limit is not attributed to the section.
    assert first.body_span[3]._.section_category is None
def test_num_sections(self):
    """Reprocessing a fresh doc must not accumulate extra sections."""
    szr = Sectionizer(nlp, rules=None)
    szr.add(
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:"))
    for _ in range(2):  # second pass verifies the state resets cleanly
        doc = nlp("Past Medical History: PE")
        szr(doc)
        assert len(doc._.sections) == 1
def test_doc_attributes(self):
    """All doc-level section extensions are populated after processing."""
    szr = Sectionizer(nlp, rules=None)
    szr.add(
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:"))
    doc = nlp("Past Medical History: PE")
    szr(doc)
    for attr in ("sections", "section_categories", "section_titles",
                 "section_spans", "section_bodies"):
        assert len(getattr(doc._, attr))
def test_max_scope_rule(self):
    """A per-rule max_scope overrides the sectionizer-wide default."""
    szr = Sectionizer(nlp, rules=None, max_scope=2)
    szr.add(
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:",
                    max_scope=100))
    doc = nlp("Past Medical History: This is the sentence.")
    szr(doc)
    last_section = doc._.sections[-1]
    final_token = doc[-1]
    assert last_section.category == "past_medical_history"
    # The rule's larger scope keeps the whole sentence inside the section.
    assert final_token in last_section.section_span
    assert final_token._.section_category == "past_medical_history"
def test_context_attributes(self):
    """add_attrs applies ConText-style attributes to entities in a section."""
    attr_mapping = {"past_medical_history": {"is_negated": True}}
    szr = Sectionizer(nlp, rules=None, add_attrs=attr_mapping)
    szr.add(
        [SectionRule("Past Medical History:", "past_medical_history")])
    doc = nlp("Past Medical History: Pneumonia")
    from spacy.tokens import Span
    # Manually register "Pneumonia" as an entity inside the section.
    doc.ents = (Span(doc, 4, 5), )
    szr(doc)
    assert doc.ents[0]._.is_negated is True
def test_section(self):
    """The Section object exposes category, spans, parent, and its rule."""
    szr = Sectionizer(nlp, rules=None)
    pmh_rule = SectionRule(category="past_medical_history",
                           literal="Past Medical History:")
    szr.add(pmh_rule)
    doc = nlp("Past Medical History: PE")
    szr(doc)
    sec = doc._.sections[0]
    assert sec.category == "past_medical_history"
    assert sec.section_span == doc[0:]
    assert sec.title_span == doc[0:-1]
    assert sec.body_span == doc[-1:]
    assert sec.parent is None
    # The section keeps a reference to the exact rule that produced it.
    assert sec.rule is pmh_rule
def test_span_attributes(self):
    """Span-level section extensions mirror the Section's fields."""
    szr = Sectionizer(nlp, rules=None)
    pmh_rule = SectionRule(category="past_medical_history",
                           literal="Past Medical History:")
    szr.add(pmh_rule)
    doc = nlp("Past Medical History: PE")
    szr(doc)
    body = doc[-1:]
    assert body._.section is doc._.sections[0]
    assert body._.section_category == "past_medical_history"
    assert body._.section_span == doc[0:]
    assert body._.section_title == doc[0:-1]
    assert body._.section_body == doc[-1:]
    assert body._.section_parent is None
    assert body._.section_rule is pmh_rule
def test_token_attributes(self):
    """Token-level section extensions are populated after sectionizing.

    NOTE(review): renamed from ``test_span_attributes`` — that name
    duplicated the span-attribute test defined just above, so pytest
    collected only one of the two and this test never actually ran.
    """
    sectionizer = Sectionizer(nlp, rules=None)
    sectionizer.add(
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:"))
    doc = nlp("Past Medical History: PE")
    sectionizer(doc)
    token = doc[-1]
    # Each token-level extension should be non-empty for a sectioned token.
    assert len(token._.section)
    assert len(token._.section_category)
    assert len(token._.section_title)
    assert len(token._.section_span)
    assert len(token._.section_body)
    assert len(token._.section_rule)
def test_document_starts_no_header(self):
    """Text before the first header becomes an untitled (category None) section."""
    szr = Sectionizer(nlp, rules=None)
    szr.add(
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:"))
    doc = nlp("This is separate. Past Medical History: PE")
    szr(doc)
    assert len(doc._.sections) == 2
    preamble, pmh = doc._.sections
    # Leading prose: no category, empty title, body is the prose itself.
    assert preamble.category is None
    assert preamble.title_span.text == ""
    assert preamble.body_span.text == "This is separate."
    # The matched header starts the second section.
    assert pmh.category == "past_medical_history"
    assert pmh.title_span.text == "Past Medical History:"
    assert pmh.section_span.text == "Past Medical History: PE"
def test_parent_section_chain(self):
    """Parent attribution follows a chain: s1 <- s2 <- s3."""
    szr = Sectionizer(nlp, rules=None)
    szr.add([
        SectionRule(category="s1", literal="section 1:"),
        SectionRule(category="s2", literal="section 2:", parents=["s1"]),
        SectionRule(category="s3", literal="section 3:", parents=["s2"]),
    ])
    doc = nlp("section 1: abc section 2: abc section 3: abc")
    szr(doc)
    assert len(doc._.sections) == 3
    first, second, third = doc._.sections
    assert first.parent is None
    assert second.parent.category == "s1"
    assert third.parent.category == "s2"
def test_parent_section_multiple_candidates(self):
    """With several candidate parents, the one present in the doc is chosen."""
    szr = Sectionizer(nlp, rules=None)
    szr.add([
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:"),
        SectionRule(category="explanation",
                    literal="Explanation:",
                    parents=["past_medical_history", "allergies"]),
    ])
    doc = nlp(
        "Past Medical History: some other text. Explanation: The patient has one"
    )
    szr(doc)
    assert len(doc._.sections) == 2
    pmh, explanation = doc._.sections
    assert pmh.parent is None
    # "allergies" never occurs, so "past_medical_history" is the parent.
    assert explanation.parent.category == "past_medical_history"
def test_parent_section_parent_required(self):
    """A parent_required rule does not match when no parent is in scope."""
    sectionizer = Sectionizer(nlp, rules=None)
    sectionizer.add([
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:"),
        SectionRule(category="explanation",
                    literal="Explanation:",
                    parents=["past_medical_history"],
                    parent_required=True),
    ])
    text = "other text Explanation: The patient has one"
    doc = nlp(text)
    sectionizer(doc)
    # "Explanation:" cannot be a header without its required parent, so the
    # entire document remains a single untitled section.
    assert len(doc._.sections) == 1
    section = doc._.sections[0]
    # (Removed a stray debug print(section) left over from development.)
    assert section.category is None
    assert section.parent is None
def test_parent_section_no_valid_parent(self):
    """A parent candidate out of scope (separated by another section) is not used."""
    szr = Sectionizer(nlp, rules=None)
    szr.add([
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:"),
        SectionRule(category="allergies", literal="Allergies:"),
        SectionRule(category="explanation",
                    literal="Explanation:",
                    parents=["past_medical_history"]),
    ])
    doc = nlp(
        "Past Medical History: some other text. Allergies: peanuts Explanation: pt cannot eat peanuts"
    )
    szr(doc)
    assert len(doc._.sections) == 3
    pmh, allergies, explanation = doc._.sections
    assert pmh.parent is None
    assert allergies.parent is None
    # "Allergies:" interrupted the chain, so "explanation" is orphaned.
    assert explanation.parent is None
def test_parent_section_chain_backtracking_interrupted(self):
    """A parentless break section interrupts backtracking up the chain."""
    szr = Sectionizer(nlp, rules=None)
    szr.add([
        SectionRule(category="s1", literal="section 1:"),
        SectionRule(category="s2", literal="section 2:", parents=["s1"]),
        SectionRule(category="s3", literal="section 3:", parents=["s2"]),
        SectionRule(category="s4", literal="section 4:", parents=["s1"]),
        SectionRule(category="break", literal="section break:"),
    ])
    doc = nlp(
        "section 1: abc section 2: abc section 3: abc section break: abc section 4: abc"
    )
    szr(doc)
    assert len(doc._.sections) == 5
    # Index 3 is the "break" section; it is not asserted on directly.
    s1, s2, s3, _, s4 = doc._.sections
    assert s1.parent is None
    assert s2.parent.category == "s1"
    assert s3.parent.category == "s2"
    # The break stops s4 from reaching back to s1.
    assert s4.parent is None
def test_parent_section_duplicate_sections_different_parents(self):
    """Repeated occurrences of a category each resolve their own parent."""
    szr = Sectionizer(nlp, rules=None)
    szr.add([
        SectionRule(category="past_medical_history",
                    literal="Past Medical History:"),
        SectionRule(category="allergies", literal="Allergies:"),
        SectionRule(category="explanation",
                    literal="Explanation:",
                    parents=["past_medical_history", "allergies"]),
    ])
    doc = nlp(
        "Past Medical History: some other text. Explanation: The patient has one. Allergies: peanuts Explanation: pt cannot eat peanuts"
    )
    szr(doc)
    assert len(doc._.sections) == 4
    pmh, explanation, allergies, explanation2 = doc._.sections
    assert pmh.parent is None
    # First "Explanation:" follows PMH; the second follows "Allergies:".
    assert explanation.parent.category == "past_medical_history"
    assert allergies.parent is None
    assert explanation2.parent.category == "allergies"
def load(model="default", enable=None, disable=None, load_rules=True, set_attributes=True):
    """Load a spaCy language object with cov_bsv pipeline components.

    By default, the base model will be 'en_core_web_sm' with the 'tagger'
    and 'parser' pipeline components, supplemented with the following
    custom components:
        - preprocessor (set to be nlp.tokenizer): Modifies the preprocessed
            text and returns a tokenized Doc. Preprocess rules are defined
            in cov_bsv.knowledge_base.preprocess_rules
        - concept_tagger: Assigns a semantic tag in a custom attribute
            "token._.concept_tag" to each Token in a Doc, which helps with
            concept extraction and normalization. Concept tag rules are
            defined in cov_bsv.knowledge_base.concept_tag_rules.
        - target_matcher: Extracts spans to doc.ents using extended
            rule-based matching. Target rules are defined in
            cov_bsv.knowledge_base.target_rules.
        - sectionizer: Identifies note section headers in the text and
            assigns section titles to entities and tokens contained in that
            section. Section patterns are defined in
            cov_bsv.knowledge_base.section_patterns.
        - context: Identifies semantic modifiers of entities and asserts
            attributes such as positive status, negation, and other
            experiencier. Context rules are defined in
            cov_bsv.knowledge_base.context_rules.
        - postprocessor: Modifies or removes the entity based on business
            logic. This handles special cases or complex logic using the
            results of earlier entities. Postprocess rules are defined in
            cov_bsv.knowledge_base.postprocess_rules.
        - document_classifier: Assigns a label of "POS", "UNK", or "NEG" to
            the doc._.cov_classification. A document will be classified as
            positive if it has at least one positive, non-excluded entity.

    Args:
        model: The name of the base spaCy model to load. If "default" will
            load the tagger and parser from "en_core_web_sm".
        enable (iterable or None): A list of component names to include in
            the pipeline. If None, will include all pipeline components
            listed above.
        disable (iterable or None): A list of component names to exclude.
            Cannot be set if `enable` is not None.
        load_rules (bool): Whether or not to include default rules for
            custom components. Default True.
        set_attributes (bool): Whether or not to register custom attributes
            to spaCy classes. If load_rules is True, this will
            automatically be set to True because the rules in the knowledge
            base rely on these custom attributes. The following extensions
            are registered (all defaults are False unless specified):
                Span._.is_future
                Span._.is_historical
                Span._.is_positive
                Span._.is_not_relevant
                Span._.is_negated
                Span._.is_uncertain
                Span._.is_screening
                Span._.is_other_experiencer
                Span._.concept_tag (default "")

    Returns:
        nlp: a spaCy Language object
    """
    # `enable` and `disable` are mutually exclusive; whichever is given,
    # the other is derived as its complement over DEFAULT_PIPENAMES.
    if enable is not None and disable is not None:
        raise ValueError("Either `enable` or `disable` must be None.")
    if disable is not None:
        # If there's a single pipe name, nest it in a set
        if isinstance(disable, str):
            disable = {disable}
        else:
            disable = set(disable)
        enable = set(DEFAULT_PIPENAMES).difference(set(disable))
    elif enable is not None:
        if isinstance(enable, str):
            enable = {enable}
        else:
            enable = set(enable)
        disable = set(DEFAULT_PIPENAMES).difference(enable)
    else:
        enable = DEFAULT_PIPENAMES
        disable = set()
    if model == "default":
        model = "en_core_web_sm"
        # The default model's built-in NER is never wanted here; the custom
        # target_matcher supplies entities instead.
        disable.add("ner")
    if set_attributes:
        _set_attributes()
    import spacy
    nlp = spacy.load(model, disable=disable)
    # Components are appended in a fixed order; later components (context,
    # postprocessor, document_classifier) depend on the output of earlier
    # ones (target_matcher, sectionizer).
    if "preprocessor" in enable:
        from medspacy.preprocess import Preprocessor
        # The preprocessor replaces the tokenizer rather than being added
        # as a pipe, so it runs before everything else.
        preprocessor = Preprocessor(nlp.tokenizer)
        if load_rules:
            preprocessor.add(preprocess_rules)
        nlp.tokenizer = preprocessor
    if "concept_tagger" in enable:
        from spacy.tokens import Token
        # force=True allows re-registration when load() is called twice.
        Token.set_extension("concept_tag", default="", force=True)
        from medspacy.ner import ConceptTagger
        concept_tagger = ConceptTagger(nlp)
        if load_rules:
            for (_, rules) in concept_tag_rules.items():
                concept_tagger.add(rules)
        # NOTE(review): passing component instances to add_pipe is the
        # spaCy v2 API; v3 expects registered string names — confirm the
        # spaCy version this package pins.
        nlp.add_pipe(concept_tagger)
    if "target_matcher" in enable:
        from medspacy.ner import TargetMatcher
        target_matcher = TargetMatcher(nlp)
        if load_rules:
            for (_, rules) in target_rules.items():
                target_matcher.add(rules)
        nlp.add_pipe(target_matcher)
    if "sectionizer" in enable:
        from medspacy.section_detection import Sectionizer
        sectionizer = Sectionizer(nlp, rules=None, add_attrs=SECTION_ATTRS)
        if load_rules:
            sectionizer.add(section_rules)
        nlp.add_pipe(sectionizer)
    if "context" in enable:
        from medspacy.context import ConTextComponent
        context = ConTextComponent(
            nlp,
            add_attrs=CONTEXT_MAPPING,
            rules=None,
            remove_overlapping_modifiers=True,
        )
        if load_rules:
            context.add(context_rules)
        nlp.add_pipe(context)
    if "postprocessor" in enable:
        from medspacy.postprocess import Postprocessor
        postprocessor = Postprocessor(debug=False)
        if load_rules:
            postprocessor.add(postprocess_rules)
        nlp.add_pipe(postprocessor)
    if "document_classifier" in enable:
        document_classifier = DocumentClassifier()
        nlp.add_pipe(document_classifier)
    return nlp
from medspacy.section_detection import Sectionizer, SectionRule

# Build a shared pipeline and pre-processed docs used as module-level test
# fixtures. The names defined here (simple_doc, context_doc, ...) are
# referenced elsewhere, so they must not be renamed.
nlp = spacy.load("en_core_web_sm")
nlp.remove_pipe("ner")
matcher = EntityRuler(nlp)
matcher.add_patterns([{"label": "PROBLEM", "pattern": "cough"}])
nlp.add_pipe(matcher)
context = ConTextComponent(nlp)
nlp.add_pipe(context)
sectionizer = Sectionizer(nlp)
sectionizer.add(
    [
        SectionRule("Section 1:", "section1"),
        SectionRule("Section 2:", "section2", parents=["section1"]),
    ]
)
nlp.add_pipe(sectionizer)

# Input texts covering the main scenarios: plain, negated, sectioned,
# and a parent/child section pair.
simple_text = "Patient has a cough."
context_text = "Patient has no cough."
section_text = "Section 1: Patient has a cough"
section_parent_text = """Section 1: comment
Section 2: Patient has a cough"""
# Docs of increasing concept count (0..9 coughs) for scaling checks.
many_concept_texts = ["cough " * i for i in range(10)]

simple_doc = nlp(simple_text)
context_doc = nlp(context_text)
section_doc = nlp(section_text)
section_parent_doc = nlp(section_parent_text)
read_batch_size = 1 ##################################################### ##################################################### # NLP FACTORY # initialize or call a method to produce your custom # NLP pipeline here. # # NOTE: DocConsumer MUST be present at the end. nlp = spacy.load("en_core_web_sm") context = ConTextComponent(nlp) sectionizer = Sectionizer(nlp, patterns=None) sectionizer.add([{ "section_title": "equals", "pattern": [{ "LOWER": "=", "OP": "+" }] }]) consumer = DocConsumer( nlp, context=True, sectionizer=True) # DocConsumer has optional bool context and sectionizer nlp.add_pipe(sectionizer) nlp.add_pipe(context) nlp.add_pipe(consumer) ##################################################### ##################################################### # CREATING DB CONNECTIONS db_read_conn = DbConnect(driver, server, db, user, pwd) db_write_conn = DbConnect(driver, server, db, user, pwd)
def test_section_categories(self):
    """section_categories reflects the categories of the added rules."""
    szr = Sectionizer(nlp, rules=None)
    szr.add(
        [SectionRule("Past Medical History:", "past_medical_history")])
    assert szr.section_categories == ["past_medical_history"]