# Example 1
class TestLabelers(unittest.TestCase):
    """Exercise the BIO, BIEO and TmVar label schemes on one fixed sample.

    The sample text contains two mutation mentions (a DNA-level deletion and
    a protein-level frameshift), each annotated manually in ``setUp``.
    """

    def setUp(self):
        """Build a split/tokenized dataset and attach the two gold entities."""
        sample = 'some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text'
        self.dataset = StringReader(sample).read()
        NLTKSplitter().split(self.dataset)
        TmVarTokenizer().tokenize(self.dataset)
        first_part = next(iter(self.dataset.parts()))
        for offset, mention in ((15, 'c.2708_2711delTTAG'),
                                (35, 'p.V903GfsX905')):
            first_part.annotations.append(
                Entity(STUB_ENTITY_CLASS_ID, offset, mention))

    def _label_values(self):
        # First original label value of every token, in document order.
        return [tok.original_labels[0].value for tok in self.dataset.tokens()]

    def test_bio_labeler(self):
        """BIO: B- at entity start, I- inside, O outside."""
        BIOLabeler().label(self.dataset)
        expected = (['O'] * 6
                    + ['B-e_x'] + ['I-e_x'] * 6
                    + ['O']
                    + ['B-e_x'] + ['I-e_x'] * 6
                    + ['O'] * 5)
        self.assertEqual(self._label_values(), expected)

    def test_bieo_labeler(self):
        """BIEO: like BIO, but the last token of each entity gets E-."""
        BIEOLabeler().label(self.dataset)
        expected = (['O'] * 6
                    + ['B-e_x'] + ['I-e_x'] * 5 + ['E-e_x']
                    + ['O']
                    + ['B-e_x'] + ['I-e_x'] * 5 + ['E-e_x']
                    + ['O'] * 5)
        self.assertEqual(self._label_values(), expected)

    def test_tmvar_labeler(self):
        """TmVar: fine-grained per-token mutation-component labels."""
        TmVarLabeler(STUB_ENTITY_CLASS_ID).label(self.dataset)
        expected = (['O'] * 6
                    + ['A', 'I', 'P', 'P', 'P', 'T', 'W']
                    + ['O']
                    + ['A', 'I', 'W', 'P', 'I', 'M', 'P']
                    + ['O'] * 5)
        self.assertEqual(self._label_values(), expected)
# Example 2
    def test_generate_patterns_245(self):
        """Dictionary features for 'c.A436C' must fire patterns 2, 4 and 5.

        Patterns 2 and 4 span the whole mention (B … I … E), while pattern 5
        starts at the base letter — presumably the substitution sub-pattern;
        the surrounding plain tokens must carry no non-'O' features.
        """
        dataset = StringReader('token c.A436C token').read()
        NLTKSplitter().split(dataset)
        TmVarTokenizer().tokenize(dataset)
        TmVarDictionaryFeatureGenerator().generate(dataset)

        # Fix: the original used `value is not 'O'`, an identity check against
        # a string literal (SyntaxWarning on CPython >= 3.8, and only correct
        # by accident of string interning); inequality is what was intended.
        token_features = [{key: value for key, value in token.features.items() if value != 'O'}
                          for token in dataset.tokens()]
        self.assertEqual(token_features[0], {})
        self.assertEqual(token_features[1], {'pattern4[0]': 'B', 'pattern2[0]': 'B'})
        self.assertEqual(token_features[2], {'pattern4[0]': 'I', 'pattern2[0]': 'I'})
        self.assertEqual(token_features[3], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'B'})
        self.assertEqual(token_features[4], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'I'})
        self.assertEqual(token_features[5], {'pattern4[0]': 'E', 'pattern2[0]': 'I', 'pattern5[0]': 'E'})
        self.assertEqual(token_features[6], {})