예제 #1
0
    def test_iob1_to_bioul(self):
        """Check that ``span_utils.iob1_to_bioul`` recodes IOB1 tags as BIOUL."""
        cases = [
            # Isolated single-token spans are expected to become unit tags.
            (
                ['I-ORG', 'O', 'I-MISC', 'O'],
                ['U-ORG', 'O', 'U-MISC', 'O'],
            ),
            # Adjacent same-type spans split by a ``B-`` tag: a unit span,
            # then a three-token span (B/I/L), then a trailing unit span.
            (
                ['O', 'I-PER', 'B-PER', 'I-PER', 'I-PER', 'B-PER'],
                ['O', 'U-PER', 'B-PER', 'I-PER', 'L-PER', 'U-PER'],
            ),
        ]
        for iob1_tags, expected_bioul in cases:
            assert span_utils.iob1_to_bioul(iob1_tags) == expected_bioul
예제 #2
0
    def _read(self, file_path: str) -> Iterable[Instance]:
        """Lazily yield one ``Instance`` per sentence found in ``file_path``.

        The file is opened directly and its lines are grouped into
        alternating runs with ``_is_divider``. Each non-divider run is treated
        as one sentence; every line in it is split on whitespace and must
        provide exactly four columns: token, POS tag, chunk tag and NER tag.

        Each yielded instance always carries a ``tokens`` field. Depending on
        ``self.feature_labels``, ``self.tag_label`` and ``self.label_indexer``
        it may additionally carry ``pos_tags`` / ``chunk_tags`` / ``ner_tags``,
        ``tags`` and ``labels``.
        """
        with open(file_path, "r") as data_file:
            logger.info(
                "Reading instances from lines in file at: %s", file_path)

            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Divider runs only separate sentences; nothing to emit.
                if is_divider:
                    continue

                # Transpose the per-line rows into per-column lists
                # (zip yields tuples, the fields below are given lists).
                rows = [line.strip().split() for line in lines]
                tokens, pos_tags, chunk_tags, ner_tags = [
                    list(column) for column in zip(*rows)
                ]

                # Wrap each word in a ``Token``, first replacing every run of
                # digits with NUM_TOKEN when number conversion is enabled.
                word_tokens = [
                    Token(re.sub(r"[0-9]+", NUM_TOKEN, word)
                          if self.convert_numbers else word)
                    for word in tokens
                ]
                sequence = TextField(word_tokens, self._token_indexers)

                instance_fields: Dict[str, Field] = {'tokens': sequence}

                # Recode chunk and NER tags to BIOUL on request; any other
                # coding scheme keeps the tags exactly as read from the file.
                use_bioul = self.coding_scheme == "BIOUL"
                coded_chunks = iob1_to_bioul(chunk_tags) if use_bioul else chunk_tags
                coded_ner = iob1_to_bioul(ner_tags) if use_bioul else ner_tags

                # Optional "feature label" fields; the field name doubles as
                # the label namespace.
                feature_sources = (
                    ('pos', 'pos_tags', pos_tags),
                    ('chunk', 'chunk_tags', coded_chunks),
                    ('ner', 'ner_tags', coded_ner),
                )
                for feature, namespace, labels in feature_sources:
                    if feature in self.feature_labels:
                        instance_fields[namespace] = SequenceLabelField(
                            labels, sequence, namespace)

                # The "tag label" field: the first matching label kind wins,
                # and an unrecognised ``tag_label`` adds no ``tags`` field.
                tag_sources = (
                    ('ner', coded_ner),
                    ('pos', pos_tags),
                    ('chunk', coded_chunks),
                )
                for label_name, labels in tag_sources:
                    if self.tag_label == label_name:
                        instance_fields['tags'] = SequenceLabelField(
                            labels, sequence)
                        break

                # NOTE(review): the label indexer receives the NER tags as
                # read from the file, not the recoded ``coded_ner`` -- confirm
                # this is intended when the coding scheme is BIOUL.
                if self.label_indexer is not None:
                    instance_fields["labels"] = self.label_indexer.index(
                        ner_tags, as_label_field=True)

                yield Instance(instance_fields)