def test_to_seq_data_dict_on_types4(self):
    """Check ``Vectorizer.to_seq_data_dict`` labels with ``sent_window=3``.

    Reads ``doc1.txt`` of ``ehost_test_corpus2`` with overlap support enabled,
    restricts the output to the ``Nonspecific_SSTI`` and ``PreAnnotated``
    types, prints every generated sequence, and then asserts the number of
    sequences as well as the label stored at selected token positions
    (``'O'``, the type name, or ``'[SEP]'``).
    """
    ereader = EhostDocReader(
        nlp=self.nlp,
        schema_file='data/ehost_test_corpus2/config/projectschema.xml',
        support_overlap=True)
    doc = ereader.read('data/ehost_test_corpus2/corpus/doc1.txt')
    print(list(doc.sents))
    res = Vectorizer.to_seq_data_dict(
        doc, type_filter=['Nonspecific_SSTI', 'PreAnnotated'], sent_window=3)
    # Debug aid: print('\n'.join([str(item) for item in res.items()]))
    for i in range(len(res['X'])):
        self.print(res, i)
        print('\n')

    assert len(res['X']) == 2

    # First sequence.
    assert res['PreAnnotated'][0][0] == 'O'
    assert res['PreAnnotated'][0][7] == '[SEP]'
    assert res['PreAnnotated'][0][16] == 'O'
    assert res['PreAnnotated'][0][17] == 'PreAnnotated'
    assert res['PreAnnotated'][0][18] == 'PreAnnotated'
    assert res['PreAnnotated'][0][19] == 'PreAnnotated'
    assert res['Nonspecific_SSTI'][0][25] == '[SEP]'
    assert res['Nonspecific_SSTI'][0][26] == 'Nonspecific_SSTI'
    assert res['Nonspecific_SSTI'][0][27] == 'Nonspecific_SSTI'
    assert res['Nonspecific_SSTI'][0][28] == 'O'

    # Second sequence.
    assert res['PreAnnotated'][1][8] == 'O'
    assert res['PreAnnotated'][1][9] == 'PreAnnotated'
    assert res['PreAnnotated'][1][10] == 'PreAnnotated'
    assert res['PreAnnotated'][1][11] == 'PreAnnotated'
    # A stray token after this assertion used to make the file unparsable.
    assert res['Nonspecific_SSTI'][1][12] == 'O'
    assert res['Nonspecific_SSTI'][1][17] == '[SEP]'
    assert res['Nonspecific_SSTI'][1][18] == 'Nonspecific_SSTI'
    assert res['Nonspecific_SSTI'][1][19] == 'Nonspecific_SSTI'
    assert res['Nonspecific_SSTI'][1][20] == 'O'
def test_to_seq_data_dict_on_types5(self):
    """Check ``to_seq_data_dict`` when ``type_filter`` is a nested mapping.

    The filter maps attribute values of ``Nonspecific_SSTI`` and the whole
    ``PreAnnotated`` type onto custom output labels; the test asserts the
    number of sequences and the label found at selected token positions
    under each custom label key.
    """
    reader = EhostDocReader(
        nlp=self.nlp,
        schema_file='data/ehost_test_corpus2/config/projectschema.xml',
        support_overlap=True)
    doc = reader.read('data/ehost_test_corpus2/corpus/doc1.txt')
    print(list(doc.sents))

    label_mapping = {
        "Nonspecific_SSTI": {
            'status': {'present': 'PRES_NS_SSTI'},
            'test': {'v2': "TEST"},
        },
        "PreAnnotated": "PREANNO",
    }
    res = Vectorizer.to_seq_data_dict(
        doc,
        type_filter=label_mapping,
        sent_window=1,
        data_dict=OrderedDict(),
        output_labels={})
    # Debug aid: print('\n'.join([str(item) for item in res.items()]))
    for seq_idx, _ in enumerate(res['X']):
        self.print(res, seq_idx)
        print('\n')

    assert len(res['X']) == 4

    # (label key, sequence index, token index, expected label)
    expectations = (
        ('PREANNO', 1, 0, 'O'),
        ('PREANNO', 1, 8, 'O'),
        ('PREANNO', 1, 9, 'PREANNO'),
        ('PREANNO', 1, 10, 'PREANNO'),
        ('PREANNO', 1, 11, 'PREANNO'),
        ('PREANNO', 1, 12, 'O'),
        ('PRES_NS_SSTI', 2, 0, 'PRES_NS_SSTI'),
        ('PRES_NS_SSTI', 2, 1, 'PRES_NS_SSTI'),
        ('PRES_NS_SSTI', 2, 2, 'O'),
        ('TEST', 2, 0, 'TEST'),
        ('TEST', 2, 1, 'TEST'),
        ('TEST', 2, 2, 'O'),
    )
    for key, seq_idx, tok_idx, expected in expectations:
        assert res[key][seq_idx][tok_idx] == expected
def test_to_sents_df_on_attr_value2(self):
    """Check ``Vectorizer.to_sents_df`` with an attribute-value filter.

    The filter maps the ``negated`` value of the ``status`` attribute of
    ``Nonspecific_SSTI`` onto ``PRES_Nonspecific_SSTI``. The resulting frame
    is checked for its row count and the ``y`` value of its first row, once
    with the default window and once with ``sent_window=2``.
    """

    def build_filter():
        # A fresh mapping per call, exactly as if written inline each time.
        return {
            "Nonspecific_SSTI": {
                'status': {
                    'negated': 'PRES_Nonspecific_SSTI'
                }
            }
        }

    reader = EhostDocReader(
        nlp=self.nlp,
        schema_file='data/ehost_test_corpus2/config/projectschema.xml',
        support_overlap=True)
    doc = reader.read('data/ehost_test_corpus2/corpus/doc1.txt')

    # Default sentence window.
    frame = Vectorizer.to_sents_df(doc, type_filter=build_filter())
    print(frame.shape)
    print(frame)
    assert frame.shape[0] == 4
    assert frame.iloc[0].y == 'NEG'

    # Two-sentence window.
    frame = Vectorizer.to_sents_df(doc, sent_window=2, type_filter=build_filter())
    print(frame.shape)
    assert frame.shape[0] == 3
    print(frame)
    assert frame.iloc[0].y == 'NEG'
def test_parse_to_dicts(self):
    """Check the sizes of the collections returned by ``parse_to_dicts``.

    Any previously registered ``concepts`` extension on ``Doc`` is removed
    first, then the knowtator annotation file of ``doc1.txt`` is parsed and
    the lengths of the spans, classes and attributes results are asserted.
    """
    extension = "concepts"
    if Doc.has_extension(extension):
        Doc.remove_extension(extension)

    reader = EhostDocReader(nlp=English())
    parsed = reader.parse_to_dicts(
        'data/ehost_test_corpus/saved/doc1.txt.knowtator.xml')
    # Four results are expected; the relations result is not inspected here.
    spans, classes, attributes, _relations = parsed

    assert len(spans) == 7
    assert len(classes) == 7
    assert len(attributes) == 6
def test_read(self):
    """Read ``doc1.txt`` of ``ehost_test_corpus`` and hand it to ``self.eval``.

    Any previously registered ``concepts`` extension on ``Doc`` is removed
    before the reader is created.
    """
    extension = "concepts"
    if Doc.has_extension(extension):
        Doc.remove_extension(extension)
    reader = EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml')
    parsed_doc = reader.read('data/ehost_test_corpus/corpus/doc1.txt')
    self.eval(parsed_doc)


def test_read_doc_name(self):
    """Check ``doc._.doc_name`` for three ``doc_name_depth`` configurations.

    Covers the reader's default, a depth of 1 set on an existing reader, and
    a depth of 2 passed to the constructor.
    """
    schema = 'data/ehost_test_corpus/config/projectschema.xml'

    # Default depth.
    reader = EhostDocReader(nlp=English(), schema_file=schema)
    parsed_doc = reader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert parsed_doc._.doc_name == 'doc1.txt'

    # Depth changed on the existing reader.
    reader.doc_name_depth = 1
    parsed_doc = reader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert parsed_doc._.doc_name == r'corpus/doc1.txt'

    # Depth supplied at construction time.
    reader = EhostDocReader(nlp=English(), schema_file=schema, doc_name_depth=2)
    parsed_doc = reader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert parsed_doc._.doc_name == r'ehost_test_corpus/corpus/doc1.txt'
def test_read_overlap(self):
    """Check ``doc._.concepts`` sizes when reading with ``support_overlap=True``.

    Two documents of ``ehost_test_corpus2`` are read with the same reader; for
    each one the number of entries in ``doc._.concepts`` and the number of
    items stored under selected keys are asserted.
    """
    reader = EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus2/config/projectschema.xml',
        support_overlap=True)

    first_doc = reader.read('data/ehost_test_corpus2/corpus/doc1.txt')
    assert len(first_doc._.concepts) == 3
    assert len(first_doc._.concepts['PreAnnotated']) == 1

    second_doc = reader.read('data/ehost_test_corpus2/corpus/doc2.txt')
    assert len(second_doc._.concepts) == 7
    for concept_key, expected_count in (
            ('Exclusions', 2),
            ('Doc_Level_Purulence_Assessment', 2)):
        assert len(second_doc._.concepts[concept_key]) == expected_count
def test_check_spans2(self):
    """Check stored annotation text against each span read with overlap support.

    For every span in every value of ``doc._.concepts`` the stored
    ``span_txt`` must be contained in the span's string form once newlines
    in both are replaced by spaces.
    """
    reader = EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus2/config/projectschema.xml',
        support_overlap=True,
        store_anno_string=True,
        log_level=logging.DEBUG)
    doc = reader.read('data/ehost_test_corpus2/corpus/doc2.txt')

    every_span = (span
                  for group in doc._.concepts.values()
                  for span in group)
    for span in every_span:
        print(span._.span_txt, '<>', span)
        annotated = span._.span_txt.replace('\n', ' ')
        actual = str(span).replace('\n', ' ')
        assert annotated in actual
def test_to_seq_data_dict_on_types(self):
    """Check ``to_seq_data_dict`` labels for a list-style ``type_filter``.

    Uses the vectorizer's default sentence window, prints each generated
    sequence, and asserts the label found at selected token positions for
    the ``Nonspecific_SSTI`` and ``PreAnnotated`` types.
    """
    reader = EhostDocReader(
        nlp=self.nlp,
        schema_file='data/ehost_test_corpus2/config/projectschema.xml',
        support_overlap=True)
    doc = reader.read('data/ehost_test_corpus2/corpus/doc1.txt')
    print(list(doc.sents))

    wanted_types = ['Nonspecific_SSTI', 'PreAnnotated']
    res = Vectorizer.to_seq_data_dict(doc, type_filter=wanted_types)
    # Debug aid: print('\n'.join([str(item) for item in res.items()]))
    for seq_idx, _ in enumerate(res['X']):
        self.print(res, seq_idx)
        print('\n')

    # (type, sequence index, token index) positions that must carry the type label
    labelled_positions = (
        ('Nonspecific_SSTI', 2, 0),
        ('Nonspecific_SSTI', 2, 1),
        ('PreAnnotated', 1, 9),
        ('PreAnnotated', 1, 10),
        ('PreAnnotated', 1, 11),
    )
    for type_name, seq_idx, tok_idx in labelled_positions:
        assert res[type_name][seq_idx][tok_idx] == type_name
def test_check_spans(self):
    """Check stored annotation text against each entity read without overlap.

    Any previously registered ``concepts`` extension on ``Doc`` is removed
    first. For every span in ``doc.ents`` the stored ``span_txt`` must be
    contained in the span's string form once newlines in both are replaced
    by spaces.
    """
    extension = "concepts"
    if Doc.has_extension(extension):
        Doc.remove_extension(extension)

    reader = EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml',
        support_overlap=False,
        store_anno_string=True,
        encoding='UTF8',
        log_level=logging.DEBUG)
    doc = reader.read('data/ehost_test_corpus/corpus/doc2.txt')

    for entity in doc.ents:
        print(entity._.span_txt, '<>', entity)
        annotated = entity._.span_txt.replace('\n', ' ')
        actual = str(entity).replace('\n', ' ')
        assert annotated in actual
def test_to_sents_nparray(self):
    """Check the shape of ``Vectorizer.to_sents_nparray`` output.

    After confirming the concept counts of the document, the array is built
    with default arguments, with ``sent_window=2``, and with
    ``sent_window=2`` plus ``track_doc_name=True``; the row count (and for
    the last call the column count) is asserted each time.
    """
    reader = EhostDocReader(
        nlp=self.nlp,
        schema_file='data/ehost_test_corpus2/config/projectschema.xml',
        support_overlap=True)
    doc = reader.read('data/ehost_test_corpus2/corpus/doc1.txt')
    print(len(list(doc.sents)))
    assert len(doc._.concepts) == 3
    assert len(doc._.concepts['Nonspecific_SSTI']) == 1

    # Default arguments.
    rows = Vectorizer.to_sents_nparray(doc)
    print(rows)
    assert rows.shape[0] == 4

    # Two-sentence window.
    rows = Vectorizer.to_sents_nparray(doc, sent_window=2)
    print(rows.shape)
    assert rows.shape[0] == 5

    # Two-sentence window with document-name tracking.
    rows = Vectorizer.to_sents_nparray(doc, sent_window=2, track_doc_name=True)
    print(rows.shape)
    n_rows, n_cols = rows.shape[0], rows.shape[1]
    assert n_rows == 5
    assert n_cols == 4
def test_set_attributes(self):
    """Check that a span exposes a ``status`` attribute equal to ``'present'``.

    A reader is constructed with the project schema (its return value is
    discarded), then a document produced by a separate ``English()`` pipeline
    is inspected: the one-token span ``doc[1:2]`` must have ``_.status`` and
    its value must be ``'present'``.
    """
    extension = "concepts"
    if Doc.has_extension(extension):
        Doc.remove_extension(extension)

    # Constructed only for its effects; the instance itself is not used.
    EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml')

    pipeline = English()
    doc = pipeline('test status attribute')
    token_span = doc[1:2]
    assert hasattr(token_span._, 'status')
    assert token_span._.status == 'present'
def test_read_doc_name(self):
    """Check ``doc._.doc_name`` for three ``doc_name_depth`` configurations.

    Covers the reader's default, a depth of 1 set on an existing reader, and
    a depth of 2 passed to the constructor.
    """
    schema = 'data/ehost_test_corpus/config/projectschema.xml'

    # Default depth.
    reader = EhostDocReader(nlp=English(), schema_file=schema)
    parsed_doc = reader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert parsed_doc._.doc_name == 'doc1.txt'

    # Depth changed on the existing reader.
    reader.doc_name_depth = 1
    parsed_doc = reader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert parsed_doc._.doc_name == r'corpus/doc1.txt'

    # Depth supplied at construction time.
    reader = EhostDocReader(nlp=English(), schema_file=schema, doc_name_depth=2)
    parsed_doc = reader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert parsed_doc._.doc_name == r'ehost_test_corpus/corpus/doc1.txt'
def test_read_doc_name(self):
    """Check ``doc._.doc_name`` for three ``doc_name_depth`` configurations.

    Covers the reader's default, a depth of 1 set on an existing reader, and
    a depth of 2 passed to the constructor.

    NOTE(review): this test name is defined more than once in this file. If
    those definitions share a class, only the last one runs, so this
    definition carries the full set of assertions instead of stopping after
    the reader is constructed. Confirm and remove the redundant copies.
    """
    ereader = EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml')
    doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert doc._.doc_name == 'doc1.txt'

    # Changing the depth on an existing reader affects subsequent reads.
    ereader.doc_name_depth = 1
    doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert doc._.doc_name == r'corpus/doc1.txt'

    # The depth can also be supplied at construction time.
    ereader = EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml',
        doc_name_depth=2)
    doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert doc._.doc_name == r'ehost_test_corpus/corpus/doc1.txt'
def test_reader_initail(self):
    """Check that a reader built with only ``nlp`` has a falsy ``use_adjudication``."""
    reader = EhostDocReader(nlp=English())
    assert hasattr(reader, 'use_adjudication')
    adjudication_flag = reader.use_adjudication
    assert not adjudication_flag