def test_load_disable(self):
    nlp = medspacy.load(disable=["context"])
    expected_pipe_names = {
        "medspacy_pyrush",
        "medspacy_target_matcher",
    }
    assert set(nlp.pipe_names) == expected_pipe_names
Example #2
def load():
    from os import path
    import medspacy

    # NOTE: RESOURCES_DIR, CONTEXT_ATTRS, and DocumentClassifier are defined
    # elsewhere in the source project.
    nlp = medspacy.load(enable=["sentencizer", "tokenizer"])

    # Add components (spaCy v2-style add_pipe, which accepts component objects)
    from medspacy.target_matcher import TargetMatcher, TargetRule
    target_matcher = TargetMatcher(nlp)
    target_filepath = path.join(RESOURCES_DIR, "target_rules.json")
    target_rules = TargetRule.from_json(target_filepath)
    target_matcher.add(target_rules)
    nlp.add_pipe(target_matcher)

    # ConText with custom rules loaded from JSON rather than the defaults
    from medspacy.context import ConTextComponent, ConTextRule
    context_filepath = path.join(RESOURCES_DIR, "context_rules.json")
    context = ConTextComponent(nlp, rules=None, add_attrs=CONTEXT_ATTRS)
    context_rules = ConTextRule.from_json(context_filepath)
    context.add(context_rules)
    nlp.add_pipe(context)

    from medspacy.section_detection import Sectionizer
    # TODO: Add radiology section rules
    sectionizer = Sectionizer(nlp)
    nlp.add_pipe(sectionizer)

    clf = DocumentClassifier(nlp)
    nlp.add_pipe(clf)

    return nlp
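
The CONTEXT_ATTRS mapping referenced above is not shown in this example. As a rough sketch of the shape ConTextComponent's add_attrs argument expects (the exact contents here are an assumption, not taken from the original project):

# Hypothetical sketch (assumed, not from the original source): map ConText
# categories to custom extension attributes set on matched targets.
CONTEXT_ATTRS = {
    "NEGATED_EXISTENCE": {"is_negated": True},
    "POSSIBLE_EXISTENCE": {"is_uncertain": True},
    "HISTORICAL": {"is_historical": True},
    "FAMILY": {"is_family": True},
}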
def test_default_load(self):
    nlp = medspacy.load()
    expected_pipe_names = {
        "medspacy_pyrush",
        "medspacy_context",
        "medspacy_target_matcher",
    }
    assert set(nlp.pipe_names) == expected_pipe_names
Example #4
def test_default_load(self):
    nlp = medspacy.load()
    expected_pipe_names = {
        "sentencizer",
        "context",
        "target_matcher",
    }
    assert set(nlp.pipe_names) == expected_pipe_names
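
Note that this example expects unprefixed pipe names ("context", "target_matcher") while other examples on this page expect "medspacy_context" and "medspacy_target_matcher": the unprefixed names come from older medspaCy releases, and newer releases (built against spaCy v3) register components under "medspacy_"-prefixed names.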
Example #5
    def test_load_all_components(self):
        full_pipe_names = [
            "sentencizer", "target_matcher", "context", "sectionizer",
            "postprocessor"
        ]

        nlp = medspacy.load(enable="all")
        assert nlp.pipe_names == full_pipe_names
        assert isinstance(nlp.tokenizer, medspacy.preprocess.Preprocessor)
    def test_disable_medspacy_tokenizer(self):
        default_tokenizer = spacy.blank("en").tokenizer
        custom_tokenizer = medspacy.load(disable=["tokenizer"]).tokenizer

        text = r"Pt c\o n;v;d h\o chf+cp n/v/d"

        default_doc = default_tokenizer(text)
        medspacy_doc = custom_tokenizer(text)

        assert [token.text for token in default_doc] == [token.text for token in medspacy_doc]
def test_load_disable(self):
    nlp = medspacy.load(disable=["tagger", "parser"])
    expected_pipe_names = {
        "sentencizer",
        "target_matcher",
        "context",
        "sectionizer",
        "postprocessor",
    }
    assert set(nlp.pipe_names) == expected_pipe_names
    # nlp_preprocessor is an import alias in the original test module
    assert isinstance(nlp.tokenizer, nlp_preprocessor.Preprocessor)
    def test_load_all_components(self):
        full_pipe_names = [
            "medspacy_pyrush",
            "medspacy_target_matcher",
            "medspacy_context",
            "medspacy_sectionizer",
            "medspacy_postprocessor",
            "medspacy_doc_consumer",
        ]

        nlp = medspacy.load(enable="all")
        assert nlp.pipe_names == full_pipe_names
        assert isinstance(nlp.tokenizer, medspacy.preprocess.Preprocessor)
    def test_medspacy_tokenizer_numerics(self):
        custom_tokenizer = medspacy.load(enable=["medspacy_tokenizer"]).tokenizer

        text = r"1.5 mg"

        medspacy_doc = custom_tokenizer(text)

        tokens = [token.text for token in medspacy_doc]

        assert len(tokens) == 2

        # Check that some expected token boundaries are generated
        joined_tokens = " ".join(tokens)
        assert "1.5" in joined_tokens
        assert "1 . 5" not in joined_tokens
    def test_medspacy_tokenizer(self):
        default_tokenizer = spacy.blank("en").tokenizer
        custom_tokenizer = medspacy.load(enable=["tokenizer"]).tokenizer

        text = r"Pt c\o n;v;d h\o chf+cp n/v/d"

        default_doc = default_tokenizer(text)
        medspacy_doc = custom_tokenizer(text)

        assert [token.text for token in default_doc] != [token.text for token in medspacy_doc]

        # Check that some expected token boundaries are generated
        joined_tokens = " ".join([token.text for token in medspacy_doc])
        assert "c \\ o" in joined_tokens
        assert "n / v / d" in joined_tokens
        assert "chf + cp" in joined_tokens
    def test_medspacy_tokenizer_uppercase(self):
        custom_tokenizer = medspacy.load(enable=["medspacy_tokenizer"]).tokenizer

        # Issue 13: Ensure that uppercase tokens are not tokenized as each character
        # https://github.com/medspacy/medspacy/issues/13
        text = r"DO NOT BREAK ME UP"

        medspacy_doc = custom_tokenizer(text)

        tokens = [token.text for token in medspacy_doc]

        assert len(tokens) == 5

        # Check that some expected token boundaries are generated
        joined_tokens = " ".join(tokens)
        assert "DO NOT BREAK ME UP" in joined_tokens
        assert "B R E A K" not in joined_tokens
Example #12
    def test_quickumls_extractions(self):
        """
        Test that extractions can be performed using the very small (<100 concept) UMLS sample resources
        """

        # make sure that this pipe can be initialized on this platform;
        # currently only macOS and Linux are supported by default...
        if not TestQuickUMLS.can_test_quickumls():
            return

        # allow default QuickUMLS (very small sample data) to be loaded
        nlp = medspacy.load(enable=["quickumls"])
        quickumls = nlp.get_pipe("QuickUMLS matcher")

        # TODO -- Consider moving this and other extraction tests to separate tests from loading
        doc = nlp(
            'Decreased dipalmitoyllecithin content found in lung specimens')

        assert len(doc.ents) == 1

        entity_spans = [ent.text for ent in doc.ents]

        assert 'dipalmitoyllecithin' in entity_spans
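
A rough sketch of inspecting what the QuickUMLS component attached to the entities of a doc like the one above (the similarity and semtypes extension names are assumptions based on common QuickUMLS conventions, not taken from this test):

# Hypothetical follow-up (extension names assumed, not from the test above):
for ent in doc.ents:
    print(ent.text, ent.label_)  # the label is typically the UMLS CUI
    if ent._.has("similarity"):
        print("  similarity:", ent._.similarity)
    if ent._.has("semtypes"):
        print("  semantic types:", ent._.semtypes)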
Example #13
    def test_initialize_pipeline(self):
        """
        Test that a pipeline with a QuickUMLS component can be loaded in medspacy.
        NOTE: Currently this is only available by default on Linux and macOS;
            Windows requires additional manual steps, but this will test the
            capability on Windows if those steps have been followed.
        """

        # make sure that this pipe can be initialized on this platform;
        # currently only macOS and Linux are supported by default...
        if not TestQuickUMLS.can_test_quickumls():
            return

        # allow default QuickUMLS (very small sample data) to be loaded
        nlp = medspacy.load(enable=["quickumls"])
        assert nlp

        quickumls = nlp.get_pipe("QuickUMLS matcher")
        assert quickumls
        # this is a member of the QuickUMLS algorithm inside the component
        assert quickumls.quickumls
        # Check that the simstring database exists
        assert quickumls.quickumls.ss_db
def test_load_lang_model(self):
    nlp = spacy.load("en_core_web_sm", disable={"ner"})
    nlp = medspacy.load(nlp)
    assert {"tagger", "parser"}.intersection(set(nlp.pipe_names))

def test_not_load_rules(self):
    nlp = medspacy.load(load_rules=False)
    context = nlp.get_pipe("medspacy_context")
    assert not context.rules

def test_nlp(self):
    nlp = medspacy.load()
    assert nlp("This is a sentence. So is this.")

def test_load_enable(self):
    nlp = medspacy.load(enable={"medspacy_target_matcher", "medspacy_sectionizer"})
    assert len(nlp.pipeline) == 2
    assert set(nlp.pipe_names) == {"medspacy_target_matcher", "medspacy_sectionizer"}
Example #18
def test_not_load_rules(self):
    nlp = medspacy.load(load_rules=False)
    context = nlp.get_pipe("context")
    assert not context.item_data
    sectionizer = nlp.get_pipe("sectionizer")
    assert not sectionizer.patterns
Example #19
def test_load_de(self):
    assert medspacy.load("de_core_news_sm")
Example #20
def test_load_enable(self):
    nlp = medspacy.load(enable=["target_matcher"])
    assert len(nlp.pipeline) == 1
    assert "target_matcher" in nlp.pipe_names
    assert isinstance(nlp.tokenizer, spacy.tokenizer.Tokenizer)
Example #21

import sys
import pymysql
import medspacy
from gensim import utils  # utils.simple_preprocess below comes from gensim

# MIMICHOST and NUM_REPORTS are defined elsewhere in the source script.


def get_mimic_connection():
    conn = pymysql.connect(host=MIMICHOST,
                           port=3306,
                           user=sys.argv[1],
                           passwd=sys.argv[2],
                           db="mimic2")
    return conn


print("creating db connection")
conn = get_mimic_connection()
cursor = conn.cursor()

print("loading i2b2 language model")

nlp = medspacy.load("en_info_3700_i2b2_2012",
                    disable=[
                        "tagger", "parser", "ner", "target_matcher",
                        "sectionizer", "context", "postprocessor"
                    ])
print(nlp.pipeline)
cursor.execute(
    """SELECT text FROM noteevents WHERE category='RADIOLOGY_REPORT' LIMIT %d"""
    % NUM_REPORTS)
r = [r[0] for r in cursor.fetchall()]
print(len(r))
r = [rr for rr in r if rr]
print(len(r))
docs = nlp.pipe(r, n_process=6, batch_size=64)
print("processed tokenization")
# Span.text replaces the deprecated Span.string from older spaCy versions
sents = [[utils.simple_preprocess(line.text) for line in doc.sents]
         for doc in docs]
Example #22
def test_load_rules(self):
    nlp = medspacy.load(load_rules=True)
    context = nlp.get_pipe("context")
    assert context.item_data
Example #23
import pytest
import os
import tempfile

from medspacy.io.db_connect import DbConnect
import sqlite3

import medspacy
from medspacy.target_matcher import TargetRule
from medspacy.io import DocConsumer

tmpdirname = tempfile.TemporaryDirectory()
db = os.path.join(tmpdirname.name, "test")

nlp = medspacy.load(enable=["sentencizer", "target_matcher", "context", "sectionizer"])
nlp.get_pipe("target_matcher").add(TargetRule("pneumonia", "CONDITION"))
doc = nlp("There is no evidence of pneumonia.")

doc_consumer = DocConsumer(nlp)
doc_consumer(doc)
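
At this point the consumed data can be inspected directly on the doc. A minimal sketch, assuming DocConsumer stores its output under doc._.data as in typical medspaCy usage (not shown in this snippet):

# Hypothetical inspection of DocConsumer output (attribute name assumed):
for dtype, rows in doc._.data.items():
    print(dtype, rows)  # e.g. "ents" mapped to the captured attribute values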



class TestDbWriter:

    def test_init_from_sqlite3_conn_defaults(self):
        """Test writing with default values for ent attributes."""
        sq_conn = sqlite3.connect(db)
        cursor = sq_conn.cursor()
        db_conn = DbConnect(conn=sq_conn)
        from medspacy.io.db_writer import DbWriter
Example #24
import sys
import pymysql
import medspacy
from gensim import utils  # used by MyCorpus below

# MIMICHOST is defined elsewhere in the source script.


def get_mimic_connection():
    conn = pymysql.connect(host=MIMICHOST,
                           port=3306,
                           user=sys.argv[1],
                           passwd=sys.argv[2],
                           db="mimic2")
    return conn


print("creating db connection")
conn = get_mimic_connection()
cursor = conn.cursor()

print("loading i2b2 language model")

nlp = medspacy.load("en_info_3700_i2b2_2012",
                    disable=["tagger", "parser", "ner"])


class MyCorpus(object):
    """An iterator that yields sentences (lists of str)."""
    def __iter__(self):
        cursor.execute("""SELECT text FROM noteevents""")
        while True:
            r = cursor.fetchone()
            if not r:
                return
            r = r[0]

            for line in nlp(r).sents:
                # yield one preprocessed sentence (a list of str) at a time;
                # Span.text replaces the deprecated Span.string
                yield utils.simple_preprocess(line.text)
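
A streaming corpus like this is typically passed straight to gensim for training; a minimal sketch of the likely next step (the Word2Vec call is an assumption about the script's intent, not part of the original):

# Hypothetical next step: train word vectors over the streamed sentences.
from gensim.models import Word2Vec

model = Word2Vec(sentences=MyCorpus())  # gensim iterates the corpus multiple times
print(model.wv.most_similar("pneumonia", topn=5))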