def test_section_data_ent(self):
    """Check that "ent" data mirrors the first entity's section attributes.

    The first row of the ``section_category`` and ``section_parent`` columns
    is compared with the matching extension attributes on ``doc.ents[0]``.
    """
    doc = DocConsumer(nlp)(section_doc)
    first_ent = doc.ents[0]
    ent_data = doc._.get_data("ent")

    assert ent_data["section_category"][0] == first_ent._.section_category
    assert ent_data["section_parent"][0] == first_ent._.section_parent
def write_nlp_db(nlp, df, filepath="./nlp.db"):
    """Process the texts in ``df["text"]`` and save entity data to SQLite.

    Every text is run through ``nlp.pipe``, each resulting doc is passed
    through a ``DocConsumer``, and the docs are then written with a
    ``DbWriter`` to the ``ents`` table of the database at ``filepath``.
    The table is created, dropping any existing one first
    (``create_table=True, drop_existing=True``).

    Args:
        nlp: Pipeline object; must provide ``pipe(texts)`` and is also
            handed to ``DocConsumer``.
        df: Object whose ``"text"`` entry holds the texts to process.
        filepath: Path of the SQLite database file to write.

    Returns:
        None. Progress messages are printed to stdout.
    """
    from contextlib import closing
    from datetime import datetime
    import sqlite3

    from medspacy.io import DbConnect, DbWriter, DocConsumer

    start = datetime.now()
    texts = df["text"]

    # Process all of the texts
    print("Processing {} texts".format(len(texts)))
    docs = list(nlp.pipe(texts))
    # Report a plain number of seconds; formatting the raw timedelta would
    # print something like "0:00:01.500000 seconds".
    elapsed_seconds = (datetime.now() - start).total_seconds()
    print("Processed {} docs in {} seconds".format(len(docs), elapsed_seconds))

    doc_consumer = DocConsumer(nlp)
    for doc in docs:
        doc_consumer(doc)

    # closing() guarantees the connection is released even when creating the
    # writer or writing a doc raises; sqlite3's own ``with conn:`` only
    # manages the transaction and does not close the connection.
    with closing(sqlite3.connect(filepath)) as conn:
        db_conn = DbConnect(conn=conn)
        writer = DbWriter(db_conn, "ents", create_table=True, drop_existing=True)
        for doc in docs:
            writer.write(doc)

    print()
    print("Saved output from {} docs to '{}'".format(len(docs), filepath))
def test_default_cols(self):
    """Check the column names produced for "ent" data with default settings.

    The keys must match both the consumer's own ``dtype_attrs["ent"]`` and
    ``DEFAULT_ENT_ATTRS``.
    """
    default_consumer = DocConsumer(nlp)
    ent_data = default_consumer(simple_doc)._.get_data("ent")
    assert ent_data is not None

    columns = set(ent_data.keys())
    assert columns == set(default_consumer.dtype_attrs["ent"])
    assert columns == set(DEFAULT_ENT_ATTRS)
def test_get_default_attrs(self):
    """Check ``DocConsumer.get_default_attrs()`` against the attr constants.

    The result must have exactly the keys "ent", "context", "section" and
    "doc", each holding the same names as its corresponding constant.
    """
    expected_by_dtype = {
        "ent": DEFAULT_ENT_ATTRS,
        "section": ALLOWED_SECTION_ATTRS,
        "context": ALLOWED_CONTEXT_ATTRS,
        "doc": DEFAULT_DOC_ATTRS,
    }
    defaults = DocConsumer.get_default_attrs()

    assert set(defaults.keys()) == {"ent", "context", "section", "doc"}
    for dtype, expected_names in expected_by_dtype.items():
        assert set(defaults[dtype]) == set(expected_names)
def test_default_data(self):
    """Check that the first "ent" row matches the first entity's attributes.

    Compares ``text``, ``label_``, ``start_char`` and ``end_char``.
    """
    doc = DocConsumer(nlp)(simple_doc)
    ent_data = doc._.get_data("ent")
    first_ent = doc.ents[0]

    for attr_name in ("text", "label_", "start_char", "end_char"):
        assert ent_data[attr_name][0] == getattr(first_ent, attr_name)
def test_context_data(self):
    """Check that "ent" data mirrors the first entity's context flags.

    Each ``is_*`` column's first value is compared with the extension
    attribute of the same name on ``doc.ents[0]``.
    """
    context_flags = (
        "is_family",
        "is_hypothetical",
        "is_historical",
        "is_uncertain",
        "is_negated",
    )
    doc = DocConsumer(nlp)(context_doc)
    ent_data = doc._.get_data("ent")
    first_ent = doc.ents[0]

    for flag in context_flags:
        assert ent_data[flag][0] == getattr(first_ent._, flag)
def test_ten_concepts(self):
    """Check that every "ent" column has one value per entity in each doc.

    All docs in ``many_concept_docs`` are consumed first; each doc's data
    columns are then compared in length with ``len(doc.ents)``. The doc, the
    column name, the entity count and the column values are printed along
    the way.
    """
    ent_consumer = DocConsumer(nlp, dtypes=("ent",))
    consumed_docs = list(map(ent_consumer, many_concept_docs))

    for doc in consumed_docs:
        print(doc)
        ent_count = len(doc.ents)
        ent_data = doc._.get_data("ent")
        for column in ent_data.keys():
            print(column)
            print(ent_count)
            print(ent_data[column])
            assert ent_count == len(ent_data[column])
def test_section_data_section(self):
    """Check that "section" data mirrors the first section of the doc.

    The first row of every section column is compared with the category,
    title span, section span and parent of ``doc._.sections[0]``.
    """
    doc = DocConsumer(nlp, dtypes=("section",))(section_doc)
    section_data = doc._.get_data("section")
    first_section = doc._.sections[0]

    def first(column):
        # First-row value of the given section column.
        return section_data[column][0]

    assert first("section_category") == first_section.category

    # Title span: text and character offsets.
    assert first("section_title_text") == first_section.title_span.text
    assert first("section_title_start_char") == first_section.title_span.start_char
    assert first("section_title_end_char") == first_section.title_span.end_char

    # Section span: text and character offsets.
    assert first("section_text") == first_section.section_span.text
    assert first("section_text_start_char") == first_section.section_span.start_char
    assert first("section_text_end_char") == first_section.section_span.end_char

    assert first("section_parent") == first_section.parent
from contextlib import closing

from medspacy.io.db_connect import DbConnect
import sqlite3
import medspacy
from medspacy.target_matcher import TargetRule
from medspacy.io import DocConsumer

# NOTE(review): `tempfile` and `os` are used below but are not imported in
# this view; presumably they are imported earlier in the file. Confirm.
# Database file lives inside a temporary directory.
tmpdirname = tempfile.TemporaryDirectory()
db = os.path.join(tmpdirname.name, "test")

# Pipeline with a single target rule, plus one processed document that has
# been passed through a DocConsumer so it can be handed to the writer.
nlp = medspacy.load(enable=["sentencizer", "target_matcher", "context", "sectionizer"])
nlp.get_pipe("target_matcher").add(TargetRule("pneumonia", "CONDITION"))
doc = nlp("There is no evidence of pneumonia.")

doc_consumer = DocConsumer(nlp)
doc_consumer(doc)


class TestDbWriter:
    """Tests for writing consumed docs to a database with ``DbWriter``."""

    def test_init_from_sqlite3_conn_defaults(self):
        """Test writing with default values for ent attributes."""
        from medspacy.io.db_writer import DbWriter

        # Close the connection when the test finishes, even on failure, so
        # the handle on the temporary database file is not leaked.
        with closing(sqlite3.connect(db)) as sq_conn:
            db_conn = DbConnect(conn=sq_conn)
            writer = DbWriter(
                db_conn,
                "ents",
                cols=None,
                col_types=None,
                create_table=True,
                drop_existing=False,
            )
            writer.write(doc)
def test_all_dtypes(self):
    """Check that ``dtypes="all"`` yields ``ALLOWED_DATA_TYPES`` as dtypes."""
    all_types_consumer = DocConsumer(nlp, dtypes="all")
    resolved_dtypes = all_types_consumer.dtypes
    assert resolved_dtypes == ALLOWED_DATA_TYPES
def test_section_cols(self):
    """Check that "section" data has exactly the ``ALLOWED_SECTION_ATTRS`` keys."""
    section_consumer = DocConsumer(nlp, dtypes=("section",))
    section_data = section_consumer(context_doc)._.get_data("section")

    assert section_data is not None
    columns = set(section_data.keys())
    assert columns == set(ALLOWED_SECTION_ATTRS)
def test_context_cols(self):
    """Check that "context" data has exactly the ``ALLOWED_CONTEXT_ATTRS`` keys."""
    context_consumer = DocConsumer(nlp, dtypes=("context",))
    context_data = context_consumer(context_doc)._.get_data("context")

    assert context_data is not None
    columns = set(context_data.keys())
    assert columns == set(ALLOWED_CONTEXT_ATTRS)
def test_init_context(self):
    """Check that an explicit ``dtypes`` tuple is kept on the consumer."""
    context_consumer = DocConsumer(nlp, dtypes=("context",))
    stored_dtypes = context_consumer.dtypes
    assert stored_dtypes == ("context",)
def test_init_default(self):
    """Check that a consumer built without ``dtypes`` defaults to ``("ent",)``."""
    default_consumer = DocConsumer(nlp)

    # A second, throwaway instance is only checked for truthiness.
    assert DocConsumer(nlp)

    stored_dtypes = default_consumer.dtypes
    assert stored_dtypes == ("ent",)
def test_get_data_attrs_not_none(self):
    """Check that ``get_data`` returns only the attrs passed via ``attrs``."""
    requested_attrs = ["label_", "is_negated"]
    doc = DocConsumer(nlp)(simple_doc)

    subset = doc._.get_data("ent", attrs=requested_attrs)
    assert set(subset.keys()) == {"label_", "is_negated"}
from medspacy.io import DocConsumer

# Entity attributes requested from the doc consumer; copied at each use so
# every consumer receives its own list object.
_ent_attrs = ("text", "label_", "is_negated", "section_category")

# Database file path inside a temporary directory.
tmpdirname = tempfile.TemporaryDirectory()
db = os.path.join(tmpdirname.name, "test.db")

# Build a small pipeline whose results can be written out.
nlp = medspacy.load(enable=["pyrush", "target_matcher", "context", "sectionizer"])
nlp.get_pipe("medspacy_target_matcher").add(
    [
        TargetRule("pneumonia", "CONDITION"),
        TargetRule("breast ca", "CONDITION"),
    ]
)

# This doc is created before the doc consumer component is added to the
# pipeline below.
doc = nlp("There is no evidence of pneumonia.")

# Standalone consumer restricted to the selected entity attributes.
doc_consumer = DocConsumer(nlp, dtype_attrs={"ent": list(_ent_attrs)})

# The same attribute selection, registered as a pipeline component.
nlp.add_pipe(
    "medspacy_doc_consumer",
    config={"dtype_attrs": {"ent": list(_ent_attrs)}},
)

# SQL column types; presumably one per entity attribute above, in the same
# order. Confirm against the writer that uses them.
db_dtypes = ["varchar(100)", "varchar(100)", "int", "varchar(100)"]