Python Sectionizer.addの例、medspacy.section_detection.Sectionizer.add Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

    def test_string_match(self):
        """Detect a header from a literal rule, then from a token-pattern rule.

        Each half builds a fresh Sectionizer (constructed with rules=None),
        runs it on the same note text, and expects the first detected section
        to have the same category, title text, and full section text.
        """
        note = "Past Medical History: PE"

        # --- Half 1: rule described only by its literal header string ---
        literal_sectionizer = Sectionizer(nlp, rules=None)
        literal_sectionizer.add(
            SectionRule(literal="Past Medical History:",
                        category="past_medical_history"))
        literal_doc = nlp(note)
        literal_sectionizer(literal_doc)
        found = literal_doc._.sections[0]
        assert found.category == "past_medical_history"
        assert found.title_span.text == "Past Medical History:"
        assert found.section_span.text == "Past Medical History: PE"

        # --- Half 2: rule that additionally supplies a token pattern ---
        # NOTE(review): this half appears to have been its own test
        # (test_list_pattern_match) before its `def` line was commented out;
        # confirm whether it should be split out again.
        pattern_sectionizer = Sectionizer(nlp, rules=None)
        lowercase_tokens = [
            {"LOWER": word} for word in ("past", "medical", "history", ":")
        ]
        pattern_sectionizer.add(
            SectionRule(
                category="past_medical_history",
                literal="past medical history:",
                pattern=lowercase_tokens,
            ))
        pattern_doc = nlp(note)
        pattern_sectionizer(pattern_doc)
        found = pattern_doc._.sections[0]
        assert found.category == "past_medical_history"
        assert found.title_span.text == "Past Medical History:"
        assert found.section_span.text == "Past Medical History: PE"

コード例 #2

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_duplicate_parent_definitions(self):
     """Two rules share category "s2" and literal but name different parents.

     Expects exactly one RuntimeWarning to be recorded and each "s2" section
     to be attached to whichever listed parent precedes it in the text.
     """
     with warnings.catch_warnings(record=True) as w:
         sectionizer = Sectionizer(nlp, rules=None)
         sectionizer.add([
             SectionRule(category="s1", literal="section 1:"),
             SectionRule(category="s2",
                         literal="section 2:",
                         parents=["s1"]),
             SectionRule(category="s2",
                         literal="section 2:",
                         parents=["s3"]),
             SectionRule(category="s3", literal="section 3:"),
         ])
         text = "section 1: abc section 2: abc section 3: abc section 2: abc"
         doc = nlp(text)
         sectionizer(doc)
         assert len(doc._.sections) == 4
         s1 = doc._.sections[0]
         s2 = doc._.sections[1]
         s3 = doc._.sections[2]
         s2_2 = doc._.sections[3]
         # Unrelated warnings (for example deprecations raised by a
         # dependency) can land in the same buffer, so count only the
         # RuntimeWarnings instead of asserting on the raw buffer length.
         runtime_warnings = [
             warn for warn in w if issubclass(warn.category, RuntimeWarning)
         ]
         assert len(runtime_warnings) == 1
         assert s1.parent is None
         assert s2.parent.category == "s1"
         assert s3.parent is None
         assert s2_2.parent.category == "s3"

コード例 #3

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: ppayne12/medspacy

 def test_duplicate_parent_definitions(self):
     """Two rules share category "s2" and literal but name different parents.

     Expects a RuntimeWarning mentioning "Duplicate section title" to be
     recorded and each "s2" section to be attached to whichever listed parent
     precedes it in the text.
     """
     with warnings.catch_warnings(record=True) as w:
         sectionizer = Sectionizer(nlp, rules=None)
         sectionizer.add([
             SectionRule(category="s1", literal="section 1:"),
             SectionRule(category="s2",
                         literal="section 2:",
                         parents=["s1"]),
             SectionRule(category="s2",
                         literal="section 2:",
                         parents=["s3"]),
             SectionRule(category="s3", literal="section 3:"),
         ])
         text = "section 1: abc section 2: abc section 3: abc section 2: abc"
         doc = nlp(text)
         sectionizer(doc)
         assert len(doc._.sections) == 4
         s1 = doc._.sections[0]
         s2 = doc._.sections[1]
         s3 = doc._.sections[2]
         s2_2 = doc._.sections[3]
         # Asserting len(w) == 1 breaks whenever another warning is recorded
         # elsewhere, so look for the specific warning in the buffer instead.
         warning_found = any(
             issubclass(warn.category, RuntimeWarning)
             and "Duplicate section title" in str(warn.message)
             for warn in w)
         assert warning_found
         assert s1.parent is None
         assert s2.parent.category == "s1"
         assert s3.parent is None
         assert s2_2.parent.category == "s3"

コード例 #4

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_max_scope_none(self):
     """With max_scope=None, the document's last token carries the section category."""
     unbounded = Sectionizer(nlp, rules=None, max_scope=None)
     pmh_rule = SectionRule(literal="Past Medical History:",
                            category="past_medical_history")
     unbounded.add(pmh_rule)
     doc = nlp("Past Medical History: This is the sentence.")
     unbounded(doc)
     final_token = doc[-1]
     assert final_token._.section_category == "past_medical_history"

コード例 #5

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_end_line(self):
     """With require_end_line=True, the sample text yields exactly two sections.

     The header string occurs twice: once followed by a newline and once
     mid-sentence. NOTE(review): presumably only the occurrence that ends its
     line is treated as a header -- confirm against Sectionizer.
     """
     line_bound = Sectionizer(nlp, rules=None, require_end_line=True)
     pmh_rule = SectionRule(literal="Past Medical History:",
                            category="past_medical_history")
     line_bound.add(pmh_rule)
     doc = nlp(
         "\n\n Past Medical History:\n The patient has a Past Medical History: this"
     )
     line_bound(doc)
     section_count = len(doc._.sections)
     assert section_count == 2

コード例 #6

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_max_scope(self):
     """With max_scope=2, the first body token is labeled and the fourth is not."""
     limited = Sectionizer(nlp, rules=None, max_scope=2)
     pmh_rule = SectionRule(literal="Past Medical History:",
                            category="past_medical_history")
     limited.add(pmh_rule)
     doc = nlp("Past Medical History: This is the sentence.")
     limited(doc)
     first_section = doc._.sections[0]
     near_token = first_section.body_span[0]
     assert near_token._.section_category == "past_medical_history"
     # Body index 3 is expected to lie beyond the section's scope.
     far_token = first_section.body_span[3]
     assert far_token._.section_category is None

コード例 #7

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_num_sections(self):
     """Processing the same text twice gives one section each time (no carry-over)."""
     sectionizer = Sectionizer(nlp, rules=None)
     pmh_rule = SectionRule(literal="Past Medical History:",
                            category="past_medical_history")
     sectionizer.add(pmh_rule)
     # Second pass re-processes a fresh doc to confirm the count resets.
     for _attempt in range(2):
         doc = nlp("Past Medical History: PE")
         sectionizer(doc)
         assert len(doc._.sections) == 1

コード例 #8

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

    def test_doc_attributes(self):
        """Every section-related Doc extension is non-empty after sectionizing."""
        sectionizer = Sectionizer(nlp, rules=None)
        pmh_rule = SectionRule(literal="Past Medical History:",
                               category="past_medical_history")
        sectionizer.add(pmh_rule)
        doc = nlp("Past Medical History: PE")
        sectionizer(doc)

        doc_extensions = (
            "sections",
            "section_categories",
            "section_titles",
            "section_spans",
            "section_bodies",
        )
        for extension in doc_extensions:
            assert len(getattr(doc._, extension))

コード例 #9

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_max_scope_rule(self):
     """A rule with max_scope=100 on a Sectionizer with max_scope=2 still covers the last token."""
     sectionizer = Sectionizer(nlp, rules=None, max_scope=2)
     wide_rule = SectionRule(literal="Past Medical History:",
                             category="past_medical_history",
                             max_scope=100)
     sectionizer.add(wide_rule)
     doc = nlp("Past Medical History: This is the sentence.")
     sectionizer(doc)
     last_section = doc._.sections[-1]
     last_token = doc[-1]
     assert last_section.category == "past_medical_history"
     assert last_token in last_section.section_span
     assert last_token._.section_category == "past_medical_history"

コード例 #10

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_context_attributes(self):
     """An entity inside a section listed in add_attrs ends up with is_negated set to True."""
     from spacy.tokens import Span

     attrs_by_section = {"past_medical_history": {"is_negated": True}}
     sectionizer = Sectionizer(nlp, rules=None, add_attrs=attrs_by_section)
     sectionizer.add(
         [SectionRule("Past Medical History:", "past_medical_history")])
     doc = nlp("Past Medical History: Pneumonia")
     # Manually register token 4 as the document's only entity.
     pneumonia = Span(doc, 4, 5)
     doc.ents = (pneumonia, )
     sectionizer(doc)
     entity = doc.ents[0]
     assert entity._.is_negated is True

コード例 #11

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

    def test_section(self):
        """Inspect every field of the single section detected in a one-header note."""
        sectionizer = Sectionizer(nlp, rules=None)
        rule = SectionRule(literal="Past Medical History:",
                           category="past_medical_history")
        sectionizer.add(rule)
        doc = nlp("Past Medical History: PE")
        sectionizer(doc)

        only_section = doc._.sections[0]
        assert only_section.category == "past_medical_history"
        # Whole doc is the section; all but the last token form the title.
        assert only_section.section_span == doc[0:]
        assert only_section.title_span == doc[0:-1]
        assert only_section.body_span == doc[-1:]
        assert only_section.parent is None
        # The section must reference the very rule object that was added.
        assert only_section.rule is rule

コード例 #12

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

    def test_span_attributes(self):
        """Section extensions on a Span mirror the section that contains it."""
        sectionizer = Sectionizer(nlp, rules=None)
        rule = SectionRule(literal="Past Medical History:",
                           category="past_medical_history")
        sectionizer.add(rule)
        doc = nlp("Past Medical History: PE")
        sectionizer(doc)

        # One-token span covering the final token of the document.
        body_span = doc[-1:]
        assert body_span._.section is doc._.sections[0]
        assert body_span._.section_category == "past_medical_history"
        assert body_span._.section_span == doc[0:]
        assert body_span._.section_title == doc[0:-1]
        assert body_span._.section_body == doc[-1:]
        assert body_span._.section_parent is None
        assert body_span._.section_rule is rule

コード例 #13

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

    def test_span_attributes(self):
        """Every section-related extension on the last Token is non-empty.

        NOTE(review): despite the name, this exercises a Token rather than a
        Span; if another test_span_attributes lives in the same class, the
        later definition shadows the earlier one -- confirm and rename if so.
        """
        sectionizer = Sectionizer(nlp, rules=None)
        pmh_rule = SectionRule(literal="Past Medical History:",
                               category="past_medical_history")
        sectionizer.add(pmh_rule)
        doc = nlp("Past Medical History: PE")
        sectionizer(doc)

        token = doc[-1]

        token_extensions = (
            "section",
            "section_category",
            "section_title",
            "section_span",
            "section_body",
            "section_rule",
        )
        for extension in token_extensions:
            assert len(getattr(token._, extension))

コード例 #14

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

    def test_document_starts_no_header(self):
        """Text preceding the first header becomes its own category-less section."""
        sectionizer = Sectionizer(nlp, rules=None)
        pmh_rule = SectionRule(literal="Past Medical History:",
                               category="past_medical_history")
        sectionizer.add(pmh_rule)
        doc = nlp("This is separate. Past Medical History: PE")
        sectionizer(doc)
        assert len(doc._.sections) == 2

        # Leading text: no category and an empty title.
        untitled = doc._.sections[0]
        assert untitled.category is None
        assert untitled.title_span.text == ""
        assert untitled.body_span.text == "This is separate."

        # Headed section that follows it.
        headed = doc._.sections[1]
        assert headed.category == "past_medical_history"
        assert headed.title_span.text == "Past Medical History:"
        assert headed.section_span.text == "Past Medical History: PE"

コード例 #15

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_parent_section_chain(self):
     """s1 -> s2 -> s3: each section's parent is the one declared in its rule."""
     chain_rules = [
         SectionRule(category="s1", literal="section 1:"),
         SectionRule(category="s2", literal="section 2:", parents=["s1"]),
         SectionRule(category="s3", literal="section 3:", parents=["s2"]),
     ]
     sectionizer = Sectionizer(nlp, rules=None)
     sectionizer.add(chain_rules)
     doc = nlp("section 1: abc section 2: abc section 3: abc")
     sectionizer(doc)
     assert len(doc._.sections) == 3
     first = doc._.sections[0]
     second = doc._.sections[1]
     third = doc._.sections[2]
     assert first.parent is None
     assert second.parent.category == "s1"
     assert third.parent.category == "s2"

コード例 #16

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_parent_section_multiple_candidates(self):
     """A rule listing two candidate parents attaches to the one present in the text."""
     candidate_rules = [
         SectionRule(literal="Past Medical History:",
                     category="past_medical_history"),
         SectionRule(literal="Explanation:",
                     category="explanation",
                     parents=["past_medical_history", "allergies"]),
     ]
     sectionizer = Sectionizer(nlp, rules=None)
     sectionizer.add(candidate_rules)
     doc = nlp(
         "Past Medical History: some other text. Explanation: The patient has one"
     )
     sectionizer(doc)
     assert len(doc._.sections) == 2
     history_section = doc._.sections[0]
     explanation_section = doc._.sections[1]
     assert history_section.parent is None
     assert explanation_section.parent.category == "past_medical_history"

コード例 #17

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_parent_section_parent_required(self):
     """A parent_required rule whose parent never appears produces no section.

     The text contains the "Explanation:" header but no "Past Medical
     History:" header, so the document is expected to hold one section with
     neither a category nor a parent.
     """
     sectionizer = Sectionizer(nlp, rules=None)
     sectionizer.add([
         SectionRule(category="past_medical_history",
                     literal="Past Medical History:"),
         SectionRule(category="explanation",
                     literal="Explanation:",
                     parents=["past_medical_history"],
                     parent_required=True),
     ])
     text = "other text Explanation: The patient has one"
     doc = nlp(text)
     sectionizer(doc)
     assert len(doc._.sections) == 1
     section = doc._.sections[0]
     assert section.category is None
     assert section.parent is None

コード例 #18

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_parent_section_no_valid_parent(self):
     """With an unrelated section between child and declared parent, the child gets no parent."""
     rules = [
         SectionRule(literal="Past Medical History:",
                     category="past_medical_history"),
         SectionRule(literal="Allergies:", category="allergies"),
         SectionRule(literal="Explanation:",
                     category="explanation",
                     parents=["past_medical_history"]),
     ]
     sectionizer = Sectionizer(nlp, rules=None)
     sectionizer.add(rules)
     doc = nlp(
         "Past Medical History: some other text. Allergies: peanuts Explanation: pt cannot eat peanuts"
     )
     sectionizer(doc)
     assert len(doc._.sections) == 3
     history_section = doc._.sections[0]
     allergy_section = doc._.sections[1]
     explanation_section = doc._.sections[2]
     assert history_section.parent is None
     assert allergy_section.parent is None
     assert explanation_section.parent is None

コード例 #19

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_parent_section_chain_backtracking_interrupted(self):
     """A parentless "break" section before s4 leaves s4 without its declared parent s1."""
     rules = [
         SectionRule(category="s1", literal="section 1:"),
         SectionRule(category="s2", literal="section 2:", parents=["s1"]),
         SectionRule(category="s3", literal="section 3:", parents=["s2"]),
         SectionRule(category="s4", literal="section 4:", parents=["s1"]),
         SectionRule(category="break", literal="section break:"),
     ]
     sectionizer = Sectionizer(nlp, rules=None)
     sectionizer.add(rules)
     doc = nlp(
         "section 1: abc section 2: abc section 3: abc section break: abc section 4: abc"
     )
     sectionizer(doc)
     assert len(doc._.sections) == 5
     first = doc._.sections[0]
     second = doc._.sections[1]
     third = doc._.sections[2]
     # Index 3 is the "break" section; s4 comes after it.
     fourth = doc._.sections[4]
     assert first.parent is None
     assert second.parent.category == "s1"
     assert third.parent.category == "s2"
     assert fourth.parent is None

コード例 #20

0

ファイルを表示

ファイル: test_sectionizer.py プロジェクト: tnguyen-cohere/medspacy

 def test_parent_section_duplicate_sections_different_parents(self):
     """The same child header appearing twice attaches to a different parent each time."""
     rules = [
         SectionRule(literal="Past Medical History:",
                     category="past_medical_history"),
         SectionRule(literal="Allergies:", category="allergies"),
         SectionRule(literal="Explanation:",
                     category="explanation",
                     parents=["past_medical_history", "allergies"]),
     ]
     sectionizer = Sectionizer(nlp, rules=None)
     sectionizer.add(rules)
     doc = nlp(
         "Past Medical History: some other text. Explanation: The patient has one. Allergies: peanuts Explanation: pt cannot eat peanuts"
     )
     sectionizer(doc)
     assert len(doc._.sections) == 4
     history_section = doc._.sections[0]
     first_explanation = doc._.sections[1]
     allergy_section = doc._.sections[2]
     second_explanation = doc._.sections[3]
     assert history_section.parent is None
     assert first_explanation.parent.category == "past_medical_history"
     assert allergy_section.parent is None
     assert second_explanation.parent.category == "allergies"

コード例 #21

0

ファイルを表示

def load(model="default",
         enable=None,
         disable=None,
         load_rules=True,
         set_attributes=True):
    """Build a spaCy Language object carrying the cov_bsv pipeline components.

    With model="default" the base model is 'en_core_web_sm' with its 'ner'
    pipe disabled. The following custom components are then attached, in this
    order, whenever their name is enabled:
        - preprocessor (installed as nlp.tokenizer): Modifies the text before
            tokenization and returns a tokenized Doc. Rules come from
            cov_bsv.knowledge_base.preprocess_rules.
        - concept_tagger: Assigns a semantic tag to "token._.concept_tag" for
            each Token, which helps with concept extraction and normalization.
            Rules come from cov_bsv.knowledge_base.concept_tag_rules.
        - target_matcher: Extracts spans to doc.ents using extended rule-based
            matching. Rules come from cov_bsv.knowledge_base.target_rules.
        - sectionizer: Identifies note section headers and assigns section
            titles to the entities and tokens in each section. Patterns come
            from cov_bsv.knowledge_base.section_patterns.
        - context: Identifies semantic modifiers of entities and asserts
            attributes such as positive status, negation, and other
            experiencer. Rules come from cov_bsv.knowledge_base.context_rules.
        - postprocessor: Modifies or removes entities based on business logic,
            using the results of earlier components. Rules come from
            cov_bsv.knowledge_base.postprocess_rules.
        - document_classifier: Assigns "POS", "UNK", or "NEG" to
            doc._.cov_classification. A document is positive if it has at
            least one positive, non-excluded entity.

    Args:
        model: Name of the base spaCy model to load. "default" is replaced by
            "en_core_web_sm" and additionally disables "ner".
        enable (str, iterable or None): Component name(s) to include in the
            pipeline. If None (and `disable` is None too), all default
            components are included.
        disable (str, iterable or None): Component name(s) to exclude.
            Cannot be set if `enable` is not None.
        load_rules (bool): Whether to add the default knowledge-base rules to
            each custom component. Default True.
        set_attributes (bool): Whether to call _set_attributes() to register
            custom attributes on spaCy classes. Default True.
            NOTE(review): earlier documentation said this is forced to True
            when load_rules is True; the code below only checks
            `set_attributes` itself -- confirm the intended behavior.
            The extensions registered were documented as (all defaults False
            unless specified):
                Span._.is_future
                Span._.is_historical
                Span._.is_positive
                Span._.is_not_relevant
                Span._.is_negated
                Span._.is_uncertain
                Span._.is_screening
                Span._.is_other_experiencer
                Span._.concept_tag (default "")

    Returns:
        nlp: a spaCy Language object

    Raises:
        ValueError: If both `enable` and `disable` are given.
    """
    if enable is not None and disable is not None:
        raise ValueError("Either `enable` or `disable` must be None.")

    # Normalize whichever of enable/disable was given into a set and derive
    # the other one as its complement within DEFAULT_PIPENAMES.
    if disable is not None:
        # A bare string is a single pipe name, not an iterable of characters.
        disable = {disable} if isinstance(disable, str) else set(disable)
        enable = set(DEFAULT_PIPENAMES) - disable
    elif enable is not None:
        enable = {enable} if isinstance(enable, str) else set(enable)
        disable = set(DEFAULT_PIPENAMES).difference(enable)
    else:
        enable = DEFAULT_PIPENAMES
        disable = set()

    if model == "default":
        model = "en_core_web_sm"
        disable.add("ner")

    if set_attributes:
        _set_attributes()

    import spacy
    nlp = spacy.load(model, disable=disable)

    if "preprocessor" in enable:
        from medspacy.preprocess import Preprocessor

        # The preprocessor wraps, then replaces, the model's tokenizer.
        wrapped_tokenizer = Preprocessor(nlp.tokenizer)
        if load_rules:
            wrapped_tokenizer.add(preprocess_rules)
        nlp.tokenizer = wrapped_tokenizer

    if "concept_tagger" in enable:
        from spacy.tokens import Token

        Token.set_extension("concept_tag", default="", force=True)
        from medspacy.ner import ConceptTagger

        tagger_component = ConceptTagger(nlp)
        if load_rules:
            for rule_group in concept_tag_rules.values():
                tagger_component.add(rule_group)
        nlp.add_pipe(tagger_component)

    if "target_matcher" in enable:
        from medspacy.ner import TargetMatcher

        matcher_component = TargetMatcher(nlp)
        if load_rules:
            for rule_group in target_rules.values():
                matcher_component.add(rule_group)
        nlp.add_pipe(matcher_component)

    if "sectionizer" in enable:
        from medspacy.section_detection import Sectionizer
        section_component = Sectionizer(nlp,
                                        rules=None,
                                        add_attrs=SECTION_ATTRS)
        if load_rules:
            section_component.add(section_rules)
        nlp.add_pipe(section_component)

    if "context" in enable:
        from medspacy.context import ConTextComponent

        context_component = ConTextComponent(
            nlp,
            add_attrs=CONTEXT_MAPPING,
            rules=None,
            remove_overlapping_modifiers=True,
        )
        if load_rules:
            context_component.add(context_rules)
        nlp.add_pipe(context_component)

    if "postprocessor" in enable:
        from medspacy.postprocess import Postprocessor

        postprocess_component = Postprocessor(debug=False)
        if load_rules:
            postprocess_component.add(postprocess_rules)
        nlp.add_pipe(postprocess_component)

    if "document_classifier" in enable:
        nlp.add_pipe(DocumentClassifier())

    return nlp

コード例 #22

0

ファイルを表示

ファイル: test_doc_consumer.py プロジェクト: ppayne12/medspacy

from medspacy.section_detection import Sectionizer, SectionRule

# Shared module-level fixtures: one pipeline and a handful of pre-processed
# docs built once at import time.
nlp = spacy.load("en_core_web_sm")
# Drop the model's own NER pipe; entities come from the EntityRuler below.
nlp.remove_pipe("ner")

# Rule-based entity matcher that labels the word "cough" as PROBLEM.
matcher = EntityRuler(nlp)
matcher.add_patterns([{"label": "PROBLEM", "pattern": "cough"}])
nlp.add_pipe(matcher)

# ConText component, added after the matcher.
context = ConTextComponent(nlp)
nlp.add_pipe(context)

# Sectionizer with two rules; "section2" declares "section1" as its parent.
# NOTE(review): constructed without rules=None, so it presumably also loads
# the library's default rules -- confirm.
sectionizer = Sectionizer(nlp)
sectionizer.add(
    [
        SectionRule("Section 1:", "section1"),
        SectionRule("Section 2:", "section2", parents=["section1"]),
    ]
)
nlp.add_pipe(sectionizer)

# Sample texts: plain, negated, single-section, and parent/child sections.
simple_text = "Patient has a cough."
context_text = "Patient has no cough."
section_text = "Section 1: Patient has a cough"
section_parent_text = """Section 1: comment
Section 2: Patient has a cough"""
# Ten texts holding 0..9 repetitions of "cough " (the first is empty).
many_concept_texts = ["cough " * i for i in range(10)]

# Docs produced by running the full pipeline over the sample texts.
simple_doc = nlp(simple_text)
context_doc = nlp(context_text)
section_doc = nlp(section_text)
section_parent_doc = nlp(section_parent_text)

コード例 #23

0

ファイルを表示

read_batch_size = 1
#####################################################

#####################################################
# NLP FACTORY
# initialize or call a method to produce your custom
# NLP pipeline here.
#
# NOTE: DocConsumer MUST be present at the end.
# Base spaCy model; the custom components below are appended to its pipeline.
nlp = spacy.load("en_core_web_sm")
context = ConTextComponent(nlp)
# Sectionizer created with patterns=None, then given a single pattern:
# one or more consecutive "=" tokens, titled "equals".
sectionizer = Sectionizer(nlp, patterns=None)
sectionizer.add([{
    "section_title": "equals",
    "pattern": [{
        "LOWER": "=",
        "OP": "+"
    }]
}])
consumer = DocConsumer(
    nlp, context=True,
    sectionizer=True)  # DocConsumer has optional bool context and sectionizer
# Pipe order: sectionizer, then context, then the consumer last (see the NOTE
# in the section header: DocConsumer must be the final component).
nlp.add_pipe(sectionizer)
nlp.add_pipe(context)
nlp.add_pipe(consumer)
#####################################################

#####################################################
# CREATING DB CONNECTIONS
# Two separate connections built from the same credentials.
# NOTE(review): presumably one is used for reads and one for writes, as the
# names suggest -- confirm against the code that uses them.
db_read_conn = DbConnect(driver, server, db, user, pwd)
db_write_conn = DbConnect(driver, server, db, user, pwd)

コード例 #24

0

ファイルを表示

 def test_section_categories(self):
     """After adding one rule, section_categories lists exactly that rule's category."""
     sectionizer = Sectionizer(nlp, rules=None)
     pmh_rule = SectionRule("Past Medical History:", "past_medical_history")
     sectionizer.add([pmh_rule])
     expected_categories = ["past_medical_history"]
     assert sectionizer.section_categories == expected_categories