Example #1
0
    def test_case_5(self):
        """iob3bio must emit exactly one BIO tag per row of the tagged corpus."""
        data = get_tagged_texts_as_pd(self.folders,
                                      '../../../data/datasets/gmb-2.2.0')

        data = filtrations(data, with_dots=True)

        bio_ner_tags = iob3bio(data.ner_tag.values.tolist())

        # assertEqual reports both lengths on failure, unlike
        # assertTrue(a == b) which only says "False is not true".
        self.assertEqual(len(bio_ner_tags), len(data))
Example #2
0
 def test_case_6(self):
     """iob3bio rewrites IOB-style tags into the expected BIO sequence."""
     raw_tags = [
         'per-tit', 'per-nam', 'per-nam', 'per-nam', 'O', 'O', 'O', 'O',
         'gpe-tit', 'gpe-nam', 'gpe-nam', 'gpe-tit', 'gpe-tit', 'O', 'O',
         'O', 'O', 'O', 'O'
     ]
     expected = [
         'B-per', 'B-per', 'I-per', 'I-per', 'O', 'O', 'O', 'O', 'B-gpe',
         'B-gpe', 'I-gpe', 'B-gpe', 'I-gpe', 'O', 'O', 'O', 'O', 'O', 'O'
     ]
     self.assertEqual(iob3bio(raw_tags), expected, 'Something is wrong!!!')
Example #3
0
    def setUpClass(cls):
        """Load the GMB Voice-of-America subcorpus once for the whole class.

        Converts NER tags to BIO, derives additional features, strips the
        target and the semantic-annotation columns from the feature list,
        then builds train/test sentence splits without empty sentences.
        """
        folders = filter_by_subcorpus('../../../data/datasets/gmb-2.2.0',
                                      'subcorpus: Voice of America')

        data = get_tagged_texts_as_pd(folders,
                                      '../../../data/datasets/gmb-2.2.0')

        data = filtrations(data, with_dots=True)

        data.ner_tag = iob3bio(data.ner_tag.values)

        data = additional_features(df=data)

        cls.features = data.columns.values.tolist()

        # Drop the target column and the semantic annotation columns that
        # must not leak into the model's feature set.
        for column in ('ner_tag', 'word_net_sense_number', 'verb_net_roles',
                       'semantic_relation', 'animacy_tag', 'super_tag',
                       'lambda_dsr'):
            cls.features.remove(column)

        X, y = SentenceExtractor(features=cls.features,
                                 target='ner_tag').fit_transform(data)

        cls.X_train, cls.X_test, cls.y_train, cls.y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)

        # Keep only non-empty sentences; label sequences are converted to
        # plain lists so features and labels stay aligned pairwise.
        cls.X_train = [sentence for sentence in cls.X_train
                       if len(sentence) > 0]
        cls.y_train = [sentence.tolist() for sentence in cls.y_train
                       if len(sentence) > 0]
        cls.X_test = [sentence for sentence in cls.X_test
                      if len(sentence) > 0]
        cls.y_test = [sentence.tolist() for sentence in cls.y_test
                      if len(sentence) > 0]
Example #4
0
    def setUpClass(cls):
        """Extract (X, y) sentence sequences with token/POS/lemma features."""
        corpus_root = '../../../data/datasets/gmb-2.2.0'

        folders = filter_by_subcorpus(corpus_root, 'subcorpus: Voice of America')
        df = get_tagged_texts_as_pd(folders, corpus_root)
        df = filtrations(df, with_dots=True)
        df.ner_tag = iob3bio(df.ner_tag.values)

        extractor = SentenceExtractor(features=['token', 'pos_tag', 'lemma'],
                                      target='ner_tag')
        cls.X, cls.y = extractor.fit_transform(df)
Example #5
0
    def setUpClass(cls):
        """Build train/test splits using lexical plus context-window features."""
        corpus_root = '../../../data/datasets/gmb-2.2.0'

        folders = filter_by_subcorpus(corpus_root, 'subcorpus: Voice of America')
        df = get_tagged_texts_as_pd(folders, corpus_root)
        df = filtrations(df, with_dots=True)
        df.ner_tag = iob3bio(df.ner_tag.values)
        df = additional_features(df=df)

        # Current-token features, then the same window features for the
        # previous and next tokens (prev_*/next_* columns).
        window = ['pos_tag', 'is_title', 'contains_digits', 'word_len',
                  'suffix', 'prefix']
        cls.features = (['token', 'lemma'] + window
                        + ['prev_' + name for name in window]
                        + ['next_' + name for name in window])

        X, y = SentenceExtractor(features=cls.features,
                                 target='ner_tag').fit_transform(df)

        cls.X_train, cls.X_test, cls.y_train, cls.y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)

        # Drop empty sentences; labels become plain lists.
        cls.X_train = [s for s in cls.X_train if len(s) > 0]
        cls.y_train = [s.tolist() for s in cls.y_train if len(s) > 0]
        cls.X_test = [s for s in cls.X_test if len(s) > 0]
        cls.y_test = [s.tolist() for s in cls.y_test if len(s) > 0]
Example #6
0
    def setUpClass(cls):
        """Prepare a train/test split with only token, POS-tag and lemma features."""
        corpus_root = '../../../data/datasets/gmb-2.2.0'

        folders = filter_by_subcorpus(corpus_root, 'subcorpus: Voice of America')
        df = get_tagged_texts_as_pd(folders, corpus_root)
        df = filtrations(df, with_dots=True)
        df.ner_tag = iob3bio(df.ner_tag.values)

        cls.features = ['token', 'pos_tag', 'lemma']

        X, y = SentenceExtractor(features=cls.features,
                                 target='ner_tag').fit_transform(df)

        cls.X_train, cls.X_test, cls.y_train, cls.y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)