Ejemplo n.º 1
0
    def test_decode(self):
        '''
        Decode a character-level prediction and check the concept assigned to
        each token. fuse_adjacent() is not called in this test, so 'ge' and
        'ne' are expected to stay two separate tokens.
        '''
        text = 'A ge ne or others'
        # One score per character of `text` (17 characters), one row per
        # concept, listed in the same order as the concepts of the group.
        geneprod_scores = [0, 0, 1.0, 1.0, 0, 1.0, 1.0] + [0] * 10
        untagged_scores = [1.0, 1.0, 0, 0, 1.0, 0, 0] + [1.0] * 10
        prediction = torch.Tensor([[geneprod_scores, untagged_scores]])

        semantic_groups = OrderedDict()
        semantic_groups['test'] = [Catalogue.GENEPROD, Catalogue.UNTAGGED]

        decoder = Decoder(StringList([text]), prediction, semantic_groups)
        decoder.decode()

        # Dump the decoded state of the first example to stdout; handy when
        # the assertion below fails.
        tokens = decoder.token_lists[0]
        print([token.text for token in tokens])
        print(decoder.concepts[0])
        print(decoder.scores[0])
        print(decoder.char_level_concepts[0])

        # Expected concept for each token, in order: A, ge, ne, or, others
        expected_concepts = OrderedDict()
        expected_concepts['test'] = [
            Catalogue.UNTAGGED,
            Catalogue.GENEPROD,
            Catalogue.GENEPROD,
            Catalogue.UNTAGGED,
            Catalogue.UNTAGGED,
        ]
        self.assertEqual(expected_concepts, decoder.concepts[0])
Ejemplo n.º 2
0
    def test_serializer_1(self):
        '''
        Serialize a decoded prediction to XML using a single semantic group
        ('entities'). The multi-token span "ge ne" is expected to be wrapped
        in one tag of type "geneprod".
        '''
        text = 'A ge ne or others'
        # One score per character of `text` (17 characters); each row holds
        # the scores of the concept it is named after.
        geneprod = [0, 0, 1., 1., 1., 1., 1.] + [0] * 10
        small_molecule = [1.] + [0] * 16
        cell = [0] * 11 + [0.8] * 6
        protein = [0] * 11 + [0.2, 0.2, 0.1, 0.1, 0.1, 0.1]
        untagged = [0, 1, 0, 0, 1., 0, 0, 1, 1, 1, 1, 0, 0, 0.1, 0.1, 0.1, 0.1]
        prediction = torch.Tensor(
            [[geneprod, small_molecule, cell, protein, untagged]]
        )

        # Decoder arguments: text examples, prediction, output semantics.
        concept_names = ['geneprod', 'small_molecule', 'cell', 'protein', 'untagged']
        output_semantics = Catalogue.from_list(concept_names)
        print(output_semantics)
        semantic_groups = OrderedDict()
        semantic_groups['entities'] = output_semantics

        decoder = Decoder(text, prediction, semantic_groups)
        decoder.decode()
        print([token.text for token in decoder.token_list])

        xml_serializer = Serializer(tag="sd-tag", format="xml")
        predicted_xml_string = xml_serializer.serialize(decoder)
        print(predicted_xml_string)

        expected_xml_string = '<smtag><sd-tag type="small_molecule" type_score="100">A</sd-tag> <sd-tag type="geneprod" type_score="100">ge ne</sd-tag> or <sd-tag type="cell" type_score="80">others</sd-tag></smtag>'
        self.assertEqual(predicted_xml_string, expected_xml_string)
Ejemplo n.º 3
0
    def test_serializer_1(self):
        '''
        Simple test to tag 2 words: 'gene' and 'protein' are expected to be
        wrapped in sd-tag elements, the rest of the sentence left untagged.
        '''
        text = 'A gene or protein.'
        # One score per character of `text` (18 characters), one row per
        # concept in the order given to Catalogue.from_list below.
        gene = [0] * 2 + [0.99] * 4 + [0] * 12
        small_molecule = [0] * 18
        tissue = [0] * 18
        protein = [0] * 10 + [0.99] * 7 + [0]
        prediction = torch.Tensor([[gene, small_molecule, tissue, protein]])

        # Decoder arguments: text examples, prediction, output semantics.
        output_semantics = Catalogue.from_list(
            ['gene', 'small_molecule', 'tissue', 'protein'])
        binarized = Decoder([text], prediction, output_semantics)
        tokens = tokenize(text)
        binarized.binarize_with_token([tokens])

        xml_serializer = Serializer(tag="sd-tag", format="xml")
        # serialize() is indexed here; only the first entry is compared.
        serialized = xml_serializer.serialize(binarized)
        predicted_xml_string = serialized[0]

        expected_xml_string = '<smtag>A <sd-tag type="gene" type_score="99">gene</sd-tag> or <sd-tag type="protein" type_score="99">protein</sd-tag>.</smtag>'
        self.assertEqual(predicted_xml_string, expected_xml_string)
Ejemplo n.º 4
0
    def test_cat(self):
        '''
        Concatenate a decoded and fused Decoder with its own clone via cat_()
        and check that the resulting prediction tensor equals the original
        prediction concatenated with itself along dimension 1.
        '''
        text = 'A ge ne or others'
        # One score per character of `text` (17 characters), one row per
        # concept, listed in the same order as the concepts of the group.
        geneprod_scores = [0, 0, 1.0, 1.0, 0, 1.0, 1.0] + [0] * 10
        untagged_scores = [1.0, 1.0, 0, 0, 1.0, 0, 0] + [1.0] * 10
        prediction = torch.Tensor([[geneprod_scores, untagged_scores]])

        semantic_groups = OrderedDict()
        semantic_groups['test'] = [Catalogue.GENEPROD, Catalogue.UNTAGGED]

        decoder = Decoder(StringList([text]), prediction, semantic_groups)
        decoder.decode()
        decoder.fuse_adjacent()

        # In-place concatenation of the decoder with a copy of itself.
        duplicate = decoder.clone()
        decoder.cat_(duplicate)

        # Dump the resulting state to stdout; handy when the assertion fails.
        print(decoder.prediction)
        print([token.text for token in decoder.token_lists[0]])
        print(decoder.concepts[0])
        print(decoder.scores[0])
        print(decoder.char_level_concepts[0])

        doubled_prediction = torch.cat((prediction, prediction), 1)
        self.assertTensorEqual(doubled_prediction, decoder.prediction)
Ejemplo n.º 5
0
 def test_fuse_adjacent_1(self):
     '''
     Testing the fusion between two similarly labeled terms separated by a
     tab: after fuse_adjacent(), 'ge' and 'ne' are expected to count as a
     single GENEPROD entry.
     '''
     text = 'A\tge\tne\tor\tothers'
     # One score per character of `text` (17 characters), one row per
     # concept, listed in the same order as the concepts of the group.
     # The separator between 'ge' and 'ne' scores 0.2 / 0.8.
     geneprod_scores = [0, 0, 1.0, 1.0, 0.2, 1.0, 1.0] + [0] * 10
     untagged_scores = [1.0, 1.0, 0, 0, 0.8, 0, 0] + [1.0] * 10
     prediction = torch.Tensor([[geneprod_scores, untagged_scores]])

     semantic_groups = OrderedDict()
     semantic_groups['test'] = [Catalogue.GENEPROD, Catalogue.UNTAGGED]

     decoder = Decoder(StringList([text]), prediction, semantic_groups)
     decoder.decode()
     decoder.fuse_adjacent()

     # Dump the fused state to stdout; handy when the assertion fails.
     print([token.text for token in decoder.token_lists[0]])
     print(decoder.concepts[0])
     print(decoder.scores[0])
     print(decoder.char_level_concepts[0])

     # Expected concept for each entry, in order: A, gene, or, others
     expected_concepts = OrderedDict()
     expected_concepts['test'] = [
         Catalogue.UNTAGGED,
         Catalogue.GENEPROD,
         Catalogue.UNTAGGED,
         Catalogue.UNTAGGED,
     ]
     self.assertEqual(expected_concepts, decoder.concepts[0])
Ejemplo n.º 6
0
    def test_serializer_updatexml(self):
        '''
        Test the update of a pretagged xml object: after updatexml_(), the
        tag around 'others' is expected to carry role="intervention" while
        the tag around 'ge ne' stays unchanged.
        '''
        xml_string = '<sd-panel>A <sd-tag type="geneprod">ge ne</sd-tag> or <sd-tag type="protein">others</sd-tag></sd-panel>'
        xml = fromstring(xml_string)
        # Round-trip the expected markup through the parser so that it is
        # serialized the same way as the updated tree.
        expected_tree = fromstring('<sd-panel>A <sd-tag type="geneprod">ge ne</sd-tag> or <sd-tag role="intervention" type="protein">others</sd-tag></sd-panel>')
        expected_xml_string = tostring(expected_tree)

        text = 'A ge ne or others'
        # One score per character of `text` (17 characters), one row per
        # concept in the order given to Catalogue.from_list below.
        intervention = [0] * 11 + [0.99] * 6
        untagged = [1.0] * 11 + [0] * 6
        prediction = torch.Tensor([[intervention, untagged]])

        output_semantics = Catalogue.from_list(['intervention', 'untagged'])
        semantic_groups = OrderedDict()
        semantic_groups['roles'] = output_semantics

        decoder = Decoder(text, prediction, semantic_groups)
        decoder.decode()
        print([token.text for token in decoder.token_list])

        # The trailing underscore suggests an in-place update; the test reads
        # the result back from `xml` itself.
        updatexml_(xml, decoder)
        resulting_xml_string = tostring(xml)
        print(resulting_xml_string)
        self.assertEqual(expected_xml_string, resulting_xml_string)
Ejemplo n.º 7
0
 def test_context_predictor_anonymization(self):
     '''
     Check that ContextualPredictor.anonymize() replaces the fused GENEPROD
     span 'ge ne' with config.marking_char, one per character, and leaves
     the rest of the text unchanged.
     '''
     text = 'A ge ne or others'
     # One score per character of `text` (17 characters), one row per
     # concept, listed in the same order as the concepts of the group.
     # The space between 'ge' and 'ne' scores 0.2 / 0.8.
     geneprod_scores = [0, 0, 1.0, 1.0, 0.2, 1.0, 1.0] + [0] * 10
     untagged_scores = [1.0, 1.0, 0, 0, 0.8, 0, 0] + [1.0] * 10
     prediction = torch.Tensor([[geneprod_scores, untagged_scores]])

     semantic_groups = OrderedDict()
     semantic_groups['test'] = [Catalogue.GENEPROD, Catalogue.UNTAGGED]

     decoder = Decoder(StringList([text]), prediction, semantic_groups)
     decoder.decode()
     decoder.fuse_adjacent()

     predictor = ContextualPredictor(self.context_model)
     anonymized_encoded = predictor.anonymize(decoder, 'test', Catalogue.GENEPROD)
     anonymized = anonymized_encoded.words[0]

     masked_span = config.marking_char * len("ge ne")
     expected = "A " + masked_span + " or others"
     self.assertEqual(expected, anonymized)
Ejemplo n.º 8
0
 def test_fuse_adjacent_2(self):
     '''
     Testing the fusion of two terms at the end of the string: 'ge' and 'n'
     are expected to count as a single GENEPROD entry after fuse_adjacent().
     '''
     text = 'A ge n'
     # One score per character of `text` (6 characters), one row per
     # concept, listed in the same order as the concepts of the group.
     geneprod_scores = [0, 0, 0.99, 0.99, 0.6, 0.99]
     untagged_scores = [1., 1., 0, 0, 0, 0]
     prediction = torch.Tensor([[geneprod_scores, untagged_scores]])

     semantic_groups = OrderedDict()
     semantic_groups['test'] = [Catalogue.GENEPROD, Catalogue.UNTAGGED]

     decoder = Decoder(StringList([text]), prediction, semantic_groups)
     decoder.decode()
     decoder.fuse_adjacent()

     # Dump the fused state to stdout; handy when the assertion fails.
     print([token.text for token in decoder.token_lists[0]])
     print(decoder.concepts[0])
     print(decoder.scores[0])
     print(decoder.char_level_concepts[0])

     # Expected concept for each entry, in order: A, ge n
     expected_concepts = OrderedDict()
     expected_concepts['test'] = [Catalogue.UNTAGGED, Catalogue.GENEPROD]
     self.assertEqual(expected_concepts, decoder.concepts[0])