def test_decode(self):
    '''
    Decode a two-channel prediction (GENEPROD / UNTAGGED) without calling
    fuse_adjacent() and compare the resulting concepts with the five
    expected entries: UNTAGGED, GENEPROD, GENEPROD, UNTAGGED, UNTAGGED.
    '''
    text = 'A ge ne or others'
    # One score per character of `text`:
    #               A    ' '  g    e    ' '  n    e    ' '  o    r    ' '  o    t    h    e    r    s
    geneprod_row = [0,   0,   1.0, 1.0, 0,   1.0, 1.0, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0]
    untagged_row = [1.0, 1.0, 0,   0,   1.0, 0,   0,   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    prediction = torch.Tensor([[geneprod_row, untagged_row]])

    # Single semantic group 'test' mapping the two rows to their concepts.
    semantic_groups = OrderedDict()
    semantic_groups['test'] = [Catalogue.GENEPROD, Catalogue.UNTAGGED]

    decoder = Decoder(StringList([text]), prediction, semantic_groups)
    decoder.decode()

    # Debug output of the decoded state for the first (only) example.
    token_texts = [token.text for token in decoder.token_lists[0]]
    print(token_texts)
    print(decoder.concepts[0])
    print(decoder.scores[0])
    print(decoder.char_level_concepts[0])

    # Expected concepts, in order, for: A / ge / ne / or / others
    expected_concepts = OrderedDict()
    expected_concepts['test'] = [
        Catalogue.UNTAGGED,
        Catalogue.GENEPROD,
        Catalogue.GENEPROD,
        Catalogue.UNTAGGED,
        Catalogue.UNTAGGED,
    ]
    self.assertEqual(expected_concepts, decoder.concepts[0])
def test_serializer_1(self):
    '''
    Serialize a decoded prediction to XML with a single semantic group
    ('entities') and check that the two-token span "ge ne" ends up inside
    one sd-tag of type "geneprod".
    '''
    # NOTE(review): this file defines test_serializer_1 twice; if both
    # definitions sit in the same class, only the later one is kept and this
    # test never runs -- confirm and rename one of them.
    text = 'A ge ne or others'
    # One row per concept, one score per character of `text`:
    #    A    ' '  g    e    ' '  n    e    ' '  o    r    ' '  o    t    h    e    r    s
    score_rows = [
        [0,   0,   1.,  1.,  1.,  1.,  1.,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0],    # geneprod
        [1.,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],    # small_molecule
        [0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0.8, 0.8, 0.8, 0.8, 0.8, 0.8],  # cell
        [0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0.2, 0.2, 0.1, 0.1, 0.1, 0.1],  # protein
        [0,   1,   0,   0,   1.,  0,   0,   1,   1,   1,   1,   0,   0,   0.1, 0.1, 0.1, 0.1],  # untagged
    ]
    prediction = torch.Tensor([score_rows])

    concept_names = ['geneprod', 'small_molecule', 'cell', 'protein', 'untagged']
    output_semantics = Catalogue.from_list(concept_names)
    print(output_semantics)

    semantic_groups = OrderedDict()
    semantic_groups['entities'] = output_semantics

    # Here the Decoder receives the raw string (not a StringList).
    decoder = Decoder(text, prediction, semantic_groups)
    decoder.decode()
    print([token.text for token in decoder.token_list])

    xml_serializer = Serializer(tag="sd-tag", format="xml")
    predicted_xml_string = xml_serializer.serialize(decoder)

    # Adjacent literals are concatenated by the parser into a single string.
    expected_xml_string = (
        '<smtag>'
        '<sd-tag type="small_molecule" type_score="100">A</sd-tag>'
        ' '
        '<sd-tag type="geneprod" type_score="100">ge ne</sd-tag>'
        ' or '
        '<sd-tag type="cell" type_score="80">others</sd-tag>'
        '</smtag>'
    )
    print(predicted_xml_string)
    self.assertEqual(predicted_xml_string, expected_xml_string)
def test_serializer_1(self):
    '''
    Tag two words ("gene" and "protein") in a short sentence and compare the
    first serialized XML string with the expected markup.
    '''
    # NOTE(review): an earlier definition named test_serializer_1 exists in
    # this file; if both are in the same class, this definition replaces it
    # -- confirm and rename one of them.
    text = 'A gene or protein.'
    # One row per concept, one score per character of `text`:
    #    A  ' ' g     e     n     e     ' ' o  r  ' ' p     r     o     t     e     i     n     .
    score_rows = [
        [0, 0,  0.99, 0.99, 0.99, 0.99, 0,  0, 0, 0,  0,    0,    0,    0,    0,    0,    0,    0],  # gene
        [0, 0,  0,    0,    0,    0,    0,  0, 0, 0,  0,    0,    0,    0,    0,    0,    0,    0],  # small_molecule
        [0, 0,  0,    0,    0,    0,    0,  0, 0, 0,  0,    0,    0,    0,    0,    0,    0,    0],  # tissue
        [0, 0,  0,    0,    0,    0,    0,  0, 0, 0,  0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0],  # protein
    ]
    prediction = torch.Tensor([score_rows])

    # Decoder arguments: text examples, prediction, output semantics.
    output_semantics = Catalogue.from_list(
        ['gene', 'small_molecule', 'tissue', 'protein'])
    binarizer = Decoder([text], prediction, output_semantics)
    binarizer.binarize_with_token([tokenize(text)])

    xml_serializer = Serializer(tag="sd-tag", format="xml")
    serialized_examples = xml_serializer.serialize(binarizer)
    predicted_xml_string = serialized_examples[0]

    # Adjacent literals are concatenated by the parser into a single string.
    expected_xml_string = (
        '<smtag>A '
        '<sd-tag type="gene" type_score="99">gene</sd-tag>'
        ' or '
        '<sd-tag type="protein" type_score="99">protein</sd-tag>'
        '.</smtag>'
    )
    self.assertEqual(predicted_xml_string, expected_xml_string)
def test_cat(self):
    '''
    Decode and fuse a prediction, then cat_() the decoder with a clone of
    itself and check that the resulting prediction tensor equals the
    original prediction concatenated with itself along dimension 1.
    '''
    text = 'A ge ne or others'
    # One score per character of `text`:
    #               A    ' '  g    e    ' '  n    e    ' '  o    r    ' '  o    t    h    e    r    s
    geneprod_row = [0,   0,   1.0, 1.0, 0,   1.0, 1.0, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0]
    untagged_row = [1.0, 1.0, 0,   0,   1.0, 0,   0,   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    prediction = torch.Tensor([[geneprod_row, untagged_row]])

    semantic_groups = OrderedDict()
    semantic_groups['test'] = [Catalogue.GENEPROD, Catalogue.UNTAGGED]

    decoder = Decoder(StringList([text]), prediction, semantic_groups)
    decoder.decode()
    decoder.fuse_adjacent()

    # Concatenate the decoder with a copy of itself (cat_ is called for its
    # effect on `decoder`; its return value is not used).
    duplicate = decoder.clone()
    decoder.cat_(duplicate)

    # Debug output of the state after concatenation.
    print(decoder.prediction)
    print([token.text for token in decoder.token_lists[0]])
    print(decoder.concepts[0])
    print(decoder.scores[0])
    print(decoder.char_level_concepts[0])

    doubled_prediction = torch.cat((prediction, prediction), dim=1)
    self.assertTensorEqual(doubled_prediction, decoder.prediction)
def test_fuse_adjacent_1(self):
    '''
    Fuse two terms that carry the same label and are separated by a tab:
    after decode() and fuse_adjacent() only one GENEPROD entry is expected
    where "ge" and "ne" were.
    '''
    text = 'A\tge\tne\tor\tothers'
    # One score per character of `text` (\t marks the tab separators). The
    # tab between "ge" and "ne" scores 0.2 for GENEPROD and 0.8 for UNTAGGED.
    #               A    \t   g    e    \t   n    e    \t   o    r    \t   o    t    h    e    r    s
    geneprod_row = [0,   0,   1.0, 1.0, 0.2, 1.0, 1.0, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0]
    untagged_row = [1.0, 1.0, 0,   0,   0.8, 0,   0,   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    prediction = torch.Tensor([[geneprod_row, untagged_row]])

    semantic_groups = OrderedDict()
    semantic_groups['test'] = [Catalogue.GENEPROD, Catalogue.UNTAGGED]

    decoder = Decoder(StringList([text]), prediction, semantic_groups)
    decoder.decode()
    decoder.fuse_adjacent()

    # Debug output of the fused state for the first (only) example.
    print([token.text for token in decoder.token_lists[0]])
    print(decoder.concepts[0])
    print(decoder.scores[0])
    print(decoder.char_level_concepts[0])

    # Expected concepts, in order, for: A / gene / or / others
    expected_concepts = OrderedDict()
    expected_concepts['test'] = [
        Catalogue.UNTAGGED,
        Catalogue.GENEPROD,
        Catalogue.UNTAGGED,
        Catalogue.UNTAGGED,
    ]
    self.assertEqual(expected_concepts, decoder.concepts[0])
def test_serializer_updatexml(self):
    '''
    Update an already tagged XML element in place with updatexml_() and
    check that role="intervention" is added to the tag around "others".
    '''
    # Adjacent literals are concatenated by the parser into a single string.
    pretagged_markup = (
        '<sd-panel>A '
        '<sd-tag type="geneprod">ge ne</sd-tag>'
        ' or '
        '<sd-tag type="protein">others</sd-tag>'
        '</sd-panel>'
    )
    expected_markup = (
        '<sd-panel>A '
        '<sd-tag type="geneprod">ge ne</sd-tag>'
        ' or '
        '<sd-tag role="intervention" type="protein">others</sd-tag>'
        '</sd-panel>'
    )
    xml = fromstring(pretagged_markup)
    # Round-trip the expected markup through the same parser/serializer so
    # both sides of the comparison come out of tostring().
    expected_xml_string = tostring(fromstring(expected_markup))

    text = 'A ge ne or others'
    # One score per character of `text`:
    #                   A    ' '  g    e    ' '  n    e    ' '  o    r    ' '  o     t     h     e     r     s
    intervention_row = [0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0.99, 0.99, 0.99, 0.99, 0.99, 0.99]
    untagged_row =     [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0,    0,    0,    0,    0,    0]
    prediction = torch.Tensor([[intervention_row, untagged_row]])

    semantic_groups = OrderedDict()
    semantic_groups['roles'] = Catalogue.from_list(['intervention', 'untagged'])

    # Here the Decoder receives the raw string (not a StringList).
    decoder = Decoder(text, prediction, semantic_groups)
    decoder.decode()
    print([token.text for token in decoder.token_list])

    # updatexml_ is called for its effect on `xml`; its return value is unused.
    updatexml_(xml, decoder)
    resulting_xml_string = tostring(xml)
    print(resulting_xml_string)
    self.assertEqual(expected_xml_string, resulting_xml_string)
def test_context_predictor_anonymization(self):
    '''
    Anonymize the GENEPROD concept of group 'test' with a ContextualPredictor
    and check that "ge ne" is replaced, character for character, by
    config.marking_char in the first anonymized example.
    '''
    text = 'A ge ne or others'
    # One score per character of `text`; the space between "ge" and "ne"
    # scores 0.2 for GENEPROD and 0.8 for UNTAGGED.
    #               A    ' '  g    e    ' '  n    e    ' '  o    r    ' '  o    t    h    e    r    s
    geneprod_row = [0,   0,   1.0, 1.0, 0.2, 1.0, 1.0, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0]
    untagged_row = [1.0, 1.0, 0,   0,   0.8, 0,   0,   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    prediction = torch.Tensor([[geneprod_row, untagged_row]])

    semantic_groups = OrderedDict()
    semantic_groups['test'] = [Catalogue.GENEPROD, Catalogue.UNTAGGED]

    decoder = Decoder(StringList([text]), prediction, semantic_groups)
    decoder.decode()
    decoder.fuse_adjacent()

    predictor = ContextualPredictor(self.context_model)
    anonymized_encoded = predictor.anonymize(decoder, 'test', Catalogue.GENEPROD)
    anonymized = anonymized_encoded.words[0]

    # The masked span has the same length as the text it replaces.
    masked_span = config.marking_char * len("ge ne")
    expected = "A " + masked_span + " or others"
    self.assertEqual(expected, anonymized)
def test_fuse_adjacent_2(self):
    '''
    Fuse two terms located at the very end of the string: after decode()
    and fuse_adjacent() the expected concepts are UNTAGGED followed by a
    single GENEPROD.
    '''
    text = 'A ge n'
    # One score per character of `text`; the space between "ge" and "n"
    # scores 0.6 for GENEPROD and 0 for UNTAGGED.
    #               A   ' ' g     e     ' '  n
    geneprod_row = [0,  0,  0.99, 0.99, 0.6, 0.99]
    untagged_row = [1., 1., 0,    0,    0,   0]
    prediction = torch.Tensor([[geneprod_row, untagged_row]])

    semantic_groups = OrderedDict()
    semantic_groups['test'] = [Catalogue.GENEPROD, Catalogue.UNTAGGED]

    decoder = Decoder(StringList([text]), prediction, semantic_groups)
    decoder.decode()
    decoder.fuse_adjacent()

    # Debug output of the fused state for the first (only) example.
    print([token.text for token in decoder.token_lists[0]])
    print(decoder.concepts[0])
    print(decoder.scores[0])
    print(decoder.char_level_concepts[0])

    # Expected concepts, in order, for: A / ge n
    expected_concepts = OrderedDict()
    expected_concepts['test'] = [Catalogue.UNTAGGED, Catalogue.GENEPROD]
    self.assertEqual(expected_concepts, decoder.concepts[0])