Example #1
0
    def test_to_seq_data_dict_on_types4(self):
        """Check type-filtered sequence labels produced with ``sent_window=3``.

        Reads ``doc1.txt`` of ``ehost_test_corpus2`` with overlapping
        annotations enabled, converts it with a list ``type_filter`` of
        ``Nonspecific_SSTI`` and ``PreAnnotated``, and verifies the number of
        generated rows plus the labels at selected token positions, including
        the positions expected to hold ``'[SEP]'``.
        """
        ereader = EhostDocReader(
            nlp=self.nlp,
            schema_file='data/ehost_test_corpus2/config/projectschema.xml',
            support_overlap=True)
        doc = ereader.read('data/ehost_test_corpus2/corpus/doc1.txt')
        print(list(doc.sents))
        res = Vectorizer.to_seq_data_dict(
            doc,
            type_filter=['Nonspecific_SSTI', 'PreAnnotated'],
            sent_window=3)
        # Dump every generated row to make a failing index easy to locate.
        for i in range(len(res['X'])):
            self.print(res, i)
            print('\n')
        assert len(res['X']) == 2

        # First row.
        assert res['PreAnnotated'][0][0] == 'O'
        assert res['PreAnnotated'][0][7] == '[SEP]'
        assert res['PreAnnotated'][0][16] == 'O'
        assert res['PreAnnotated'][0][17] == 'PreAnnotated'
        assert res['PreAnnotated'][0][18] == 'PreAnnotated'
        assert res['PreAnnotated'][0][19] == 'PreAnnotated'
        assert res['Nonspecific_SSTI'][0][25] == '[SEP]'
        assert res['Nonspecific_SSTI'][0][26] == 'Nonspecific_SSTI'
        assert res['Nonspecific_SSTI'][0][27] == 'Nonspecific_SSTI'
        assert res['Nonspecific_SSTI'][0][28] == 'O'

        # Second row.
        assert res['PreAnnotated'][1][8] == 'O'
        assert res['PreAnnotated'][1][9] == 'PreAnnotated'
        assert res['PreAnnotated'][1][10] == 'PreAnnotated'
        assert res['PreAnnotated'][1][11] == 'PreAnnotated'
        assert res['Nonspecific_SSTI'][1][12] == 'O'
        assert res['Nonspecific_SSTI'][1][17] == '[SEP]'
        assert res['Nonspecific_SSTI'][1][18] == 'Nonspecific_SSTI'
        assert res['Nonspecific_SSTI'][1][19] == 'Nonspecific_SSTI'
        assert res['Nonspecific_SSTI'][1][20] == 'O'
Example #2
0
    def test_to_seq_data_dict_on_types5(self):
        """Check sequence labels when ``type_filter`` is a dict of custom labels.

        ``Nonspecific_SSTI`` is mapped through two attribute/value tables
        (``status`` and ``test``) and ``PreAnnotated`` is mapped directly to
        ``PREANNO``; the conversion runs with ``sent_window=1``.
        """
        reader = EhostDocReader(
            nlp=self.nlp,
            schema_file='data/ehost_test_corpus2/config/projectschema.xml',
            support_overlap=True,
        )
        doc = reader.read('data/ehost_test_corpus2/corpus/doc1.txt')
        print(list(doc.sents))

        label_map = {
            "Nonspecific_SSTI": {
                'status': {'present': 'PRES_NS_SSTI'},
                'test': {'v2': "TEST"},
            },
            "PreAnnotated": "PREANNO",
        }
        res = Vectorizer.to_seq_data_dict(
            doc,
            type_filter=label_map,
            sent_window=1,
            data_dict=OrderedDict(),
            output_labels={},
        )

        # Dump every generated row to make a failing index easy to locate.
        for row in range(len(res['X'])):
            self.print(res, row)
            print('\n')

        assert len(res['X']) == 4

        assert res['PREANNO'][1][0] == 'O'
        assert res['PREANNO'][1][8] == 'O'
        assert res['PREANNO'][1][9] == 'PREANNO'
        assert res['PREANNO'][1][10] == 'PREANNO'
        assert res['PREANNO'][1][11] == 'PREANNO'
        assert res['PREANNO'][1][12] == 'O'

        assert res['PRES_NS_SSTI'][2][0] == 'PRES_NS_SSTI'
        assert res['PRES_NS_SSTI'][2][1] == 'PRES_NS_SSTI'
        assert res['PRES_NS_SSTI'][2][2] == 'O'

        assert res['TEST'][2][0] == 'TEST'
        assert res['TEST'][2][1] == 'TEST'
        assert res['TEST'][2][2] == 'O'
Example #3
0
 def test_to_sents_df_on_attr_value2(self):
     """Check ``to_sents_df`` row counts and first label with an attribute filter.

     The filter maps ``Nonspecific_SSTI`` annotations whose ``status`` is
     ``negated`` to the label ``PRES_Nonspecific_SSTI``. The conversion is
     run once without ``sent_window`` and once with ``sent_window=2``.
     """

     def negated_filter():
         # Build a fresh dict per call so each conversion gets its own object,
         # exactly as the two inline literals did.
         return {
             "Nonspecific_SSTI": {
                 'status': {'negated': 'PRES_Nonspecific_SSTI'},
             },
         }

     reader = EhostDocReader(
         nlp=self.nlp,
         schema_file='data/ehost_test_corpus2/config/projectschema.xml',
         support_overlap=True,
     )
     doc = reader.read('data/ehost_test_corpus2/corpus/doc1.txt')

     # Without an explicit sentence window.
     df = Vectorizer.to_sents_df(doc, type_filter=negated_filter())
     print(df.shape)
     print(df)
     assert df.shape[0] == 4
     assert df.iloc[0].y == 'NEG'

     # With a window of two sentences.
     df = Vectorizer.to_sents_df(doc, sent_window=2, type_filter=negated_filter())
     print(df.shape)
     assert df.shape[0] == 3
     print(df)
     assert df.iloc[0].y == 'NEG'
 def test_parse_to_dicts(self):
     """Check the sizes of the collections parsed from a knowtator XML file."""
     # Drop a previously registered "concepts" extension before building the reader.
     if Doc.has_extension("concepts"):
         Doc.remove_extension("concepts")
     reader = EhostDocReader(nlp=English())
     anno_file = 'data/ehost_test_corpus/saved/doc1.txt.knowtator.xml'
     # The fourth element (relations) is unpacked but not checked here.
     spans, classes, attributes, _relations = reader.parse_to_dicts(anno_file)
     assert len(spans) == 7
     assert len(classes) == 7
     assert len(attributes) == 6
    def test_read(self):
        """Read ``doc1.txt`` with the project schema and run the shared ``self.eval`` checks."""
        # Drop a previously registered "concepts" extension before building the reader.
        if Doc.has_extension("concepts"):
            Doc.remove_extension("concepts")
        ereader = EhostDocReader(
            nlp=English(),
            schema_file='data/ehost_test_corpus/config/projectschema.xml')
        doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
        self.eval(doc)

    def test_read_doc_name(self):
        """Check ``doc._.doc_name`` for ``doc_name_depth`` values of default, 1 and 2.

        This was previously nested inside ``test_read`` as a never-called
        inner function, with its assertions running as part of ``test_read``;
        it is now an independent test method.
        """
        ereader = EhostDocReader(
            nlp=English(),
            schema_file='data/ehost_test_corpus/config/projectschema.xml')

        # Default depth: only the file name is kept.
        doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
        assert doc._.doc_name == 'doc1.txt'

        # Depth changed on an existing reader: one parent directory is included.
        ereader.doc_name_depth = 1
        doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
        assert doc._.doc_name == r'corpus/doc1.txt'

        # Depth passed to the constructor: two parent directories are included.
        ereader = EhostDocReader(
            nlp=English(),
            schema_file='data/ehost_test_corpus/config/projectschema.xml',
            doc_name_depth=2)
        doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
        assert doc._.doc_name == r'ehost_test_corpus/corpus/doc1.txt'
 def test_read_overlap(self):
     """Check the ``doc._.concepts`` sizes for two documents read with overlap support."""
     reader = EhostDocReader(
         nlp=English(),
         schema_file='data/ehost_test_corpus2/config/projectschema.xml',
         support_overlap=True,
     )

     first = reader.read('data/ehost_test_corpus2/corpus/doc1.txt')
     assert len(first._.concepts) == 3
     assert len(first._.concepts['PreAnnotated']) == 1

     second = reader.read('data/ehost_test_corpus2/corpus/doc2.txt')
     assert len(second._.concepts) == 7
     assert len(second._.concepts['Exclusions']) == 2
     assert len(second._.concepts['Doc_Level_Purulence_Assessment']) == 2
 def test_check_spans2(self):
     """Check that each concept span's stored annotation text occurs in the span text.

     The reader is built with overlap support and ``store_anno_string=True``;
     newlines are replaced by spaces on both sides before the comparison.
     """
     reader = EhostDocReader(
         nlp=English(),
         schema_file='data/ehost_test_corpus2/config/projectschema.xml',
         support_overlap=True,
         store_anno_string=True,
         log_level=logging.DEBUG,
     )
     doc = reader.read('data/ehost_test_corpus2/corpus/doc2.txt')
     for concept_spans in doc._.concepts.values():
         for concept in concept_spans:
             print(concept._.span_txt, '<>', concept)
             stored_text = concept._.span_txt.replace('\n', ' ')
             covered_text = str(concept).replace('\n', ' ')
             assert stored_text in covered_text
Example #8
0
 def test_to_seq_data_dict_on_types(self):
     """Check type-filtered sequence labels produced without an explicit ``sent_window``."""
     reader = EhostDocReader(
         nlp=self.nlp,
         schema_file='data/ehost_test_corpus2/config/projectschema.xml',
         support_overlap=True,
     )
     doc = reader.read('data/ehost_test_corpus2/corpus/doc1.txt')
     print(list(doc.sents))
     wanted_types = ['Nonspecific_SSTI', 'PreAnnotated']
     res = Vectorizer.to_seq_data_dict(doc, type_filter=wanted_types)

     # Dump every generated row to make a failing index easy to locate.
     for row, _ in enumerate(res['X']):
         self.print(res, row)
         print('\n')

     assert res['Nonspecific_SSTI'][2][0] == 'Nonspecific_SSTI'
     assert res['Nonspecific_SSTI'][2][1] == 'Nonspecific_SSTI'
     assert res['PreAnnotated'][1][9] == 'PreAnnotated'
     assert res['PreAnnotated'][1][10] == 'PreAnnotated'
     assert res['PreAnnotated'][1][11] == 'PreAnnotated'
 def test_check_spans(self):
     """Check that each entity's stored annotation text occurs in the entity text.

     The reader is built without overlap support and with
     ``store_anno_string=True``, so the spans are taken from ``doc.ents``;
     newlines are replaced by spaces on both sides before the comparison.
     """
     # Drop a previously registered "concepts" extension before building the reader.
     if Doc.has_extension("concepts"):
         Doc.remove_extension("concepts")
     reader = EhostDocReader(
         nlp=English(),
         schema_file='data/ehost_test_corpus/config/projectschema.xml',
         support_overlap=False,
         store_anno_string=True,
         encoding='UTF8',
         log_level=logging.DEBUG,
     )
     doc = reader.read('data/ehost_test_corpus/corpus/doc2.txt')
     for ent in doc.ents:
         print(ent._.span_txt, '<>', ent)
         stored_text = ent._.span_txt.replace('\n', ' ')
         covered_text = str(ent).replace('\n', ' ')
         assert stored_text in covered_text
Example #10
0
 def test_to_sents_nparray(self):
     """Check the shape of ``to_sents_nparray`` output under different options.

     Verifies the row count without ``sent_window``, with ``sent_window=2``,
     and the row and column counts when ``track_doc_name=True`` is added.
     """
     reader = EhostDocReader(
         nlp=self.nlp,
         schema_file='data/ehost_test_corpus2/config/projectschema.xml',
         support_overlap=True,
     )
     doc = reader.read('data/ehost_test_corpus2/corpus/doc1.txt')
     print(len(list(doc.sents)))
     assert len(doc._.concepts) == 3
     assert len(doc._.concepts['Nonspecific_SSTI']) == 1

     arr = Vectorizer.to_sents_nparray(doc)
     print(arr)
     assert arr.shape[0] == 4

     arr = Vectorizer.to_sents_nparray(doc, sent_window=2)
     print(arr.shape)
     assert arr.shape[0] == 5

     arr = Vectorizer.to_sents_nparray(doc, sent_window=2, track_doc_name=True)
     print(arr.shape)
     assert arr.shape[0] == 5
     assert arr.shape[1] == 4
 def test_set_attributes(self):
     """Check that spans expose a ``status`` extension whose value is ``'present'``.

     The span is taken from a document built by a separate ``English()``
     pipeline, not by the reader.
     """
     # Drop a previously registered "concepts" extension before building the reader.
     if Doc.has_extension("concepts"):
         Doc.remove_extension("concepts")
     # The reader instance is discarded on purpose; presumably constructing it
     # with the schema is what registers the span attributes - TODO confirm.
     EhostDocReader(
         nlp=English(),
         schema_file='data/ehost_test_corpus/config/projectschema.xml',
     )
     pipeline = English()
     doc = pipeline('test status attribute')
     assert hasattr(doc[1:2]._, 'status')
     assert doc[1:2]._.status == 'present'
 def test_read_doc_name(self):
     """Check ``doc._.doc_name`` for ``doc_name_depth`` values of default, 1 and 2."""
     reader = EhostDocReader(
         nlp=English(),
         schema_file='data/ehost_test_corpus/config/projectschema.xml',
     )

     # Default depth: only the file name is kept.
     parsed = reader.read('data/ehost_test_corpus/corpus/doc1.txt')
     assert parsed._.doc_name == 'doc1.txt'

     # Depth changed on an existing reader: one parent directory is included.
     reader.doc_name_depth = 1
     parsed = reader.read('data/ehost_test_corpus/corpus/doc1.txt')
     assert parsed._.doc_name == r'corpus/doc1.txt'

     # Depth passed to the constructor: two parent directories are included.
     reader = EhostDocReader(
         nlp=English(),
         schema_file='data/ehost_test_corpus/config/projectschema.xml',
         doc_name_depth=2,
     )
     parsed = reader.read('data/ehost_test_corpus/corpus/doc1.txt')
     assert parsed._.doc_name == r'ehost_test_corpus/corpus/doc1.txt'
 def test_read_doc_name(self):
     """Check ``doc._.doc_name`` for ``doc_name_depth`` values of default, 1 and 2.

     This definition previously stopped after constructing the reader, so it
     asserted nothing. The checks are restored here so the test verifies the
     document name for each depth.
     """
     ereader = EhostDocReader(
         nlp=English(),
         schema_file='data/ehost_test_corpus/config/projectschema.xml')

     # Default depth: only the file name is kept.
     doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
     assert doc._.doc_name == 'doc1.txt'

     # Depth changed on an existing reader: one parent directory is included.
     ereader.doc_name_depth = 1
     doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
     assert doc._.doc_name == r'corpus/doc1.txt'

     # Depth passed to the constructor: two parent directories are included.
     ereader = EhostDocReader(
         nlp=English(),
         schema_file='data/ehost_test_corpus/config/projectschema.xml',
         doc_name_depth=2)
     doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
     assert doc._.doc_name == r'ehost_test_corpus/corpus/doc1.txt'
 def test_reader_initail(self):
     """Check that a reader built with only ``nlp`` has a falsy ``use_adjudication``."""
     reader = EhostDocReader(nlp=English())
     assert hasattr(reader, 'use_adjudication')
     assert not reader.use_adjudication