Example #1
0
    def test_positive_tagging_with_bilou_unit(self):
        # A slot of size 1 tagged with the BILOU scheme must produce exactly
        # one tag: the slot name carrying the unit prefix.

        # Given
        name = "animal"

        # When
        actual = positive_tagging(TaggingScheme.BILOU, name, 1)

        # Then
        self.assertListEqual(actual, [UNIT_PREFIX + name])
    def _augment_slots(self, text, tokens, tags, builtin_slots_names):
        """Re-tag tokens with builtin entities and return the resulting slots.

        Builtin entities are parsed from ``text`` once per entity kind mapped
        from ``builtin_slots_names``, filtered against the existing ``tags``,
        disambiguated, and grouped by the start of their match range. Every
        slot assignment yielded by ``_get_slots_permutations`` is written over
        a copy of ``tags`` and scored with ``_get_sequence_probability``; the
        highest-scoring tag sequence is then converted to slots.

        Returns the value of ``_reconciliate_builtin_slots`` applied to those
        slots and the retained builtin entities.
        """

        def _range_start(entity):
            return entity[RES_MATCH_RANGE][START]

        entity_kinds = {self.slot_name_mapping[name]
                        for name in builtin_slots_names}
        found_entities = []
        for kind in entity_kinds:
            found_entities.extend(self.builtin_entity_parser.parse(
                text, scope=[kind], use_cache=True))

        # Drop the builtin entities that conflict with custom slots already
        # extracted by the CRF.
        found_entities = _filter_overlapping_builtins(
            found_entities, tokens, tags, self.config.tagging_scheme)

        # Conflicts between builtin entities are resolved by keeping the
        # longest matches; entities spanning the same range are all kept.
        found_entities = _disambiguate_builtin_entities(found_entities)

        # Group the entities by start position, then order the groups.
        # NOTE(review): groupby only merges adjacent items; presumably
        # _disambiguate_builtin_entities returns entities ordered by start
        # position -- confirm.
        entity_groups = [
            list(members)
            for _, members in groupby(found_entities, key=_range_start)
        ]
        entity_groups.sort(key=lambda members: _range_start(members[0]))

        features = self.compute_features(tokens)
        group_ranges = [members[0][RES_MATCH_RANGE]
                        for members in entity_groups]
        tokens_indexes = _spans_to_tokens_indexes(group_ranges, tokens)

        # Try every slot assignment and keep the tag sequence to which the
        # CRF gives the highest probability.
        best_tags, best_score = tags, -1
        candidates = _get_slots_permutations(entity_groups,
                                             self.slot_name_mapping)
        for assignment in candidates:
            candidate_tags = copy(tags)
            for position, slot_name in enumerate(assignment):
                covered = tokens_indexes[position]
                first, last = covered[0], covered[-1]
                candidate_tags[first:last + 1] = positive_tagging(
                    self.config.tagging_scheme, slot_name, len(covered))
            candidate_score = self._get_sequence_probability(
                features, candidate_tags)
            if candidate_score > best_score:
                best_tags, best_score = candidate_tags, candidate_score

        resolved_slots = tags_to_slots(
            text, tokens, best_tags, self.config.tagging_scheme,
            self.slot_name_mapping)

        return _reconciliate_builtin_slots(
            text, resolved_slots, found_entities)
Example #3
0
    def test_positive_tagging_with_bilou_unit(self):
        # With the BILOU scheme and a slot size of 1, the only expected tag
        # is the unit-prefixed slot name.

        # Given
        scheme, slot, size = TaggingScheme.BILOU, "animal", 1

        # When
        produced = positive_tagging(scheme, slot, size)

        # Then
        wanted = [UNIT_PREFIX + slot]
        self.assertListEqual(produced, wanted)
    def _augment_slots(self, text, tokens, tags, builtin_slots_names):
        """Re-tag tokens with builtin entities and return the resulting slots.

        For each entity kind mapped from ``builtin_slots_names``, builtin
        entities are fetched with ``get_builtin_entities``. They are filtered
        against the existing ``tags``, disambiguated, and grouped by the start
        of their match range. Each slot assignment produced by
        ``_get_slots_permutations`` is applied to a copy of ``tags`` and
        scored with ``_get_sequence_probability``; the best-scoring sequence
        is turned into slots, which are finally passed through
        ``_reconciliate_builtin_slots``.
        """
        kinds = {self.slot_name_mapping[slot_name]
                 for slot_name in builtin_slots_names}
        entities = []
        for kind in kinds:
            entities += get_builtin_entities(text, self.language, [kind])

        # Builtin entities conflicting with custom slots extracted by the CRF
        # are discarded.
        entities = _filter_overlapping_builtins(
            entities, tokens, tags, self.config.tagging_scheme)

        # Among conflicting builtin entities the longest matches win; entities
        # covering the same range are all kept.
        entities = _disambiguate_builtin_entities(entities)

        # Bucket the entities by start position and order the buckets.
        # NOTE(review): groupby only merges adjacent items; presumably the
        # disambiguation step returns entities ordered by start -- confirm.
        buckets = []
        for _, members in groupby(
                entities, key=lambda ent: ent[RES_MATCH_RANGE][START]):
            buckets.append(list(members))
        buckets.sort(key=lambda bucket: bucket[0][RES_MATCH_RANGE][START])

        features = self.compute_features(tokens)
        tokens_indexes = _spans_to_tokens_indexes(
            [bucket[0][RES_MATCH_RANGE] for bucket in buckets], tokens)

        # Score every possible slot assignment with the CRF and keep the most
        # probable tag sequence.
        top_score = -1
        top_tags = tags
        for assignment in _get_slots_permutations(buckets,
                                                  self.slot_name_mapping):
            trial_tags = copy(tags)
            for rank, slot_name in enumerate(assignment):
                span = tokens_indexes[rank]
                replacement = positive_tagging(
                    self.config.tagging_scheme, slot_name, len(span))
                trial_tags[span[0]:span[-1] + 1] = replacement
            trial_score = self._get_sequence_probability(features, trial_tags)
            if trial_score > top_score:
                top_score = trial_score
                top_tags = trial_tags

        final_slots = tags_to_slots(
            text, tokens, top_tags, self.config.tagging_scheme,
            self.slot_name_mapping)
        return _reconciliate_builtin_slots(text, final_slots, entities)
Example #5
0
    def test_positive_tagging_with_io(self):
        # With the IO scheme, each of the 3 positions of the slot must carry
        # the same inside-prefixed tag.

        # Given
        name = "animal"
        size = 3

        # When
        actual = positive_tagging(TaggingScheme.IO, name, size)

        # Then
        inside_tag = INSIDE_PREFIX + name
        self.assertListEqual(actual, [inside_tag, inside_tag, inside_tag])
Example #6
0
    def test_positive_tagging_should_handle_zero_length(self):
        # A slot of size 0 must yield an empty tag list for every member of
        # TaggingScheme.

        # Given
        name = "animal"
        size = 0

        # When
        per_scheme_tags = [positive_tagging(scheme, name, size)
                           for scheme in TaggingScheme]

        # Then
        self.assertEqual(per_scheme_tags,
                         [[] for _ in range(len(TaggingScheme))])
Example #7
0
    def test_positive_tagging_with_bilou(self):
        # A slot of size 3 under BILOU must be tagged beginning / inside /
        # last, in that order.

        # Given
        name = "animal"
        size = 3

        # When
        actual = positive_tagging(TaggingScheme.BILOU, name, size)

        # Then
        prefixes = (BEGINNING_PREFIX, INSIDE_PREFIX, LAST_PREFIX)
        self.assertListEqual(actual, [prefix + name for prefix in prefixes])
Example #8
0
    def test_positive_tagging_with_io(self):
        # The IO scheme is expected to repeat the inside-prefixed slot name
        # once per position of a size-3 slot.

        # Given
        scheme, slot, size = TaggingScheme.IO, "animal", 3

        # When
        produced = positive_tagging(scheme, slot, size)

        # Then
        wanted = [INSIDE_PREFIX + slot] * 3
        self.assertListEqual(produced, wanted)
Example #9
0
    def test_positive_tagging_should_handle_zero_length(self):
        # Whatever the tagging scheme, tagging a slot of size 0 is expected
        # to return an empty list.

        # Given
        slot, size = "animal", 0

        # When
        produced = list(
            positive_tagging(scheme, slot, size) for scheme in TaggingScheme)

        # Then
        wanted = [[]] * len(TaggingScheme)
        self.assertEqual(produced, wanted)
Example #10
0
    def test_positive_tagging_with_bilou(self):
        # For a size-3 slot the BILOU scheme is expected to emit the
        # beginning, inside and last prefixed tags, in that order.

        # Given
        scheme, slot, size = TaggingScheme.BILOU, "animal", 3

        # When
        produced = positive_tagging(scheme, slot, size)

        # Then
        first_tag = BEGINNING_PREFIX + slot
        middle_tag = INSIDE_PREFIX + slot
        final_tag = LAST_PREFIX + slot
        self.assertListEqual(produced, [first_tag, middle_tag, final_tag])
Example #11
0
    def _augment_slots(self, text, tokens, tags, builtin_slots_names):
        """Re-tag tokens with builtin entities, one entity kind at a time.

        Builtin entities are fetched for the entity kinds mapped from
        ``builtin_slots_names`` and filtered against the existing ``tags``.
        For each group of entities sharing an entity kind, the slot
        assignments yielded by ``_generate_slots_permutations`` are applied
        to a copy of the current tags and scored with
        ``_get_sequence_probability``; the best-scoring tags become the
        starting point for the next group. The final tags are converted to
        slots and passed through ``_reconciliate_builtin_slots``.
        """
        kinds = [self.slot_name_mapping[name] for name in builtin_slots_names]
        found_entities = _filter_overlapping_builtins(
            get_builtin_entities(text, self.language, kinds),
            tokens, tags, self.config.tagging_scheme)

        current_tags = tags
        # Features are computed lazily, only once a permutation has to be
        # scored.
        features = None
        # NOTE(review): groupby only merges adjacent items; presumably the
        # entities arrive ordered by entity kind -- confirm.
        for kind, matches in groupby(found_entities,
                                     key=lambda match: match[ENTITY_KIND]):
            match_ranges = [match[RES_MATCH_RANGE] for match in matches]
            tokens_indexes = _spans_to_tokens_indexes(match_ranges, tokens)
            candidate_slots = list({
                name for name in builtin_slots_names
                if self.slot_name_mapping[name] == kind
            })
            winner_tags, winner_score = current_tags, -1

            assignments = _generate_slots_permutations(
                len(match_ranges), candidate_slots,
                self.config.exhaustive_permutations_threshold)
            for assignment in assignments:
                trial_tags = copy(current_tags)
                # zip stops at the shorter input, so slots beyond the last
                # available token span are ignored.
                for covered, slot_name in zip(tokens_indexes, assignment):
                    first, last = covered[0], covered[-1]
                    trial_tags[first:last + 1] = positive_tagging(
                        self.config.tagging_scheme, slot_name, len(covered))
                if features is None:
                    features = self.compute_features(tokens)
                trial_score = self._get_sequence_probability(
                    features, trial_tags)
                if trial_score > winner_score:
                    winner_tags, winner_score = trial_tags, trial_score
            current_tags = winner_tags

        resolved_slots = tags_to_slots(
            text, tokens, current_tags, self.config.tagging_scheme,
            self.slot_name_mapping)
        return _reconciliate_builtin_slots(
            text, resolved_slots, found_entities)