Example #1
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            pos_tags: Optional[List[str]] = None,
            chunk_tags: Optional[List[str]] = None,
            ner_tags: Optional[List[str]] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        instance_fields["metadata"] = MetadataField(
            {"words": [x.text for x in tokens]})

        # Recode the labels if necessary.
        if self.coding_scheme == "BIOUL":
            coded_chunks = iob1_to_bioul(
                chunk_tags) if chunk_tags is not None else None
            coded_ner = iob1_to_bioul(
                ner_tags) if ner_tags is not None else None
        else:
            # the default IOB1
            coded_chunks = chunk_tags
            coded_ner = ner_tags

        # Add "feature labels" to instance
        if 'pos' in self.feature_labels:
            if pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use pos_tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['pos_tags'] = SequenceLabelField(
                pos_tags, sequence, "pos_tags")
        if 'chunk' in self.feature_labels:
            if coded_chunks is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use chunk tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['chunk_tags'] = SequenceLabelField(
                coded_chunks, sequence, "chunk_tags")
        if 'ner' in self.feature_labels:
            if coded_ner is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use NER tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['ner_tags'] = SequenceLabelField(
                coded_ner, sequence, "ner_tags")

        # Add "tag label" to instance
        if self.tag_label == 'ner' and coded_ner is not None:
            instance_fields['tags'] = SequenceLabelField(coded_ner, sequence)
        elif self.tag_label == 'pos' and pos_tags is not None:
            instance_fields['tags'] = SequenceLabelField(pos_tags, sequence)
        elif self.tag_label == 'chunk' and coded_chunks is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_chunks, sequence)

        return Instance(instance_fields)
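Every reader in this collection recodes IOB1 labels with `iob1_to_bioul`. To make the transformation concrete, here is a minimal standalone sketch of what that conversion does (a simplified stand-in, not AllenNLP's implementation, which additionally validates malformed IOB1 sequences):

    from typing import List

    def to_bioul_sketch(tags: List[str]) -> List[str]:
        """Recode IOB1 tags (where "I-X" may open a chunk) into BIOUL."""
        bioul = list(tags)
        n = len(tags)
        for i, tag in enumerate(tags):
            if tag == "O":
                continue
            label = tag[2:]
            # A chunk starts here if the previous tag is "O", has a different
            # type, or this tag is an explicit "B-" boundary.
            starts = (i == 0 or tags[i - 1] == "O"
                      or tags[i - 1][2:] != label or tag.startswith("B-"))
            # A chunk ends here if the next tag is "O", has a different type,
            # or is an explicit "B-" boundary.
            ends = (i == n - 1 or tags[i + 1] == "O"
                    or tags[i + 1][2:] != label or tags[i + 1].startswith("B-"))
            if starts and ends:
                bioul[i] = "U-" + label
            elif starts:
                bioul[i] = "B-" + label
            elif ends:
                bioul[i] = "L-" + label
            else:
                bioul[i] = "I-" + label
        return bioul

    # to_bioul_sketch(["I-PER", "I-PER", "O", "I-LOC"])
    # -> ["B-PER", "L-PER", "O", "U-LOC"]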
Example #2
    def _conll_rows_to_sentence(self, conll_rows: List[str]) -> ACESentence:
        sentence: List[str] = []
        mention_tags: List[str] = []

        span_labels: List[List[str]] = []
        current_span_labels: List[Optional[str]] = []

        # Cluster id -> List of (start_index, end_index) spans.
        clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list)
        # Cluster id -> List of start_indices which are open for this id.
        coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

        for index, row in enumerate(conll_rows):
            conll_components = row.split()

            word = conll_components[1]

            if not span_labels:
                span_labels = [[] for _ in conll_components[2:-1]]
                current_span_labels = [None for _ in conll_components[2:-1]]
            self._process_span_annotations_for_word(
                annotations=conll_components[2:-1],
                span_labels=span_labels,
                current_span_labels=current_span_labels)

            # Process coref
            self._process_coref_span_annotations_for_word(
                conll_components[-1], index, clusters, coref_stacks)

            sentence.append(word)

        mention_tags = iob1_to_bioul(span_labels[0])

        # Process coref clusters
        coref_span_tuples: Set[TypedSpan] = {
            (cluster_id, span)
            for cluster_id, span_list in clusters.items() for span in span_list
        }

        # Reformat the labels to keep only the last token of each head.
        # Cf. the paper: we model relations between the last tokens of heads.
        last_head_token_relations = []
        bioul_relations = []

        for relation_frame in span_labels[1:]:
            bioul_relation_frame = iob1_to_bioul(relation_frame)

            reformatted_frame = []
            for annotation in bioul_relation_frame:
                if annotation[:2] in ["L-", "U-"]:
                    reformatted_frame.append(annotation[2:])
                else:
                    reformatted_frame.append("*")

            last_head_token_relations.append(reformatted_frame)
            bioul_relations.append(bioul_relation_frame)

        return ACESentence(sentence, mention_tags, bioul_relations,
                           last_head_token_relations, coref_span_tuples)
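`_process_coref_span_annotations_for_word` is not shown above. In the CoNLL format, the last column marks coreference spans with entries such as `(0`, `0)`, `(0)`, or `-`, with `|` joining multiple annotations, which explains the `coref_stacks` bookkeeping. A hypothetical, simplified sketch of that method:

    from collections import defaultdict
    from typing import DefaultDict, List, Tuple

    def process_coref_sketch(label: str,
                             word_index: int,
                             clusters: DefaultDict[int, List[Tuple[int, int]]],
                             coref_stacks: DefaultDict[int, List[int]]) -> None:
        """Update open and closed coref spans for one word's annotation."""
        if label == "-":
            return
        for segment in label.split("|"):
            if segment[0] == "(":
                if segment[-1] == ")":
                    # Single-word span, e.g. "(0)".
                    cluster_id = int(segment[1:-1])
                    clusters[cluster_id].append((word_index, word_index))
                else:
                    # A span opens here, e.g. "(0"; remember its start index.
                    cluster_id = int(segment[1:])
                    coref_stacks[cluster_id].append(word_index)
            else:
                # A span closes here, e.g. "0)"; pop the matching start.
                cluster_id = int(segment[:-1])
                start = coref_stacks[cluster_id].pop()
                clusters[cluster_id].append((start, word_index))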
Example #3
    def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)

            # Group lines into alternating divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    tokens, pos_tags, chunk_tags, ner_tags = [
                        list(field) for field in zip(*fields)
                    ]
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens]
                    sequence = TextField(tokens, self._token_indexers)

                    instance_fields: Dict[str, Field] = {'tokens': sequence}

                    # Recode the labels if necessary.
                    if self.coding_scheme == "BIOUL":
                        coded_chunks = iob1_to_bioul(chunk_tags)
                        coded_ner = iob1_to_bioul(ner_tags)
                    else:
                        # the default IOB1
                        coded_chunks = chunk_tags
                        coded_ner = ner_tags

                    # Add "feature labels" to instance
                    if 'pos' in self.feature_labels:
                        instance_fields['pos_tags'] = SequenceLabelField(
                            pos_tags, sequence, "pos_tags")
                    if 'chunk' in self.feature_labels:
                        instance_fields['chunk_tags'] = SequenceLabelField(
                            coded_chunks, sequence, "chunk_tags")
                    if 'ner' in self.feature_labels:
                        instance_fields['ner_tags'] = SequenceLabelField(
                            coded_ner, sequence, "ner_tags")

                    # Add "tag label" to instance
                    if self.tag_label == 'ner':
                        instance_fields['tags'] = SequenceLabelField(
                            coded_ner, sequence)
                    elif self.tag_label == 'pos':
                        instance_fields['tags'] = SequenceLabelField(
                            pos_tags, sequence)
                    elif self.tag_label == 'chunk':
                        instance_fields['tags'] = SequenceLabelField(
                            coded_chunks, sequence)

                    yield Instance(instance_fields)
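The `itertools.groupby` call does the sentence segmentation here: `_is_divider` (not shown) presumably returns True for blank lines and `-DOCSTART-` headers, so consecutive non-divider lines form one sentence. A small self-contained demonstration of the idiom, using made-up CoNLL-style rows:

    import itertools

    def is_divider(line: str) -> bool:
        # Assumed behavior: blank lines and -DOCSTART- rows split sentences.
        stripped = line.strip()
        return stripped == "" or stripped.startswith("-DOCSTART-")

    conll_lines = [
        "-DOCSTART- -X- -X- O\n",
        "\n",
        "EU NNP B-NP B-ORG\n",
        "rejects VBZ B-VP O\n",
        "\n",
        "Peter NNP B-NP B-PER\n",
    ]

    for divider, lines in itertools.groupby(conll_lines, is_divider):
        if not divider:
            fields = [line.strip().split() for line in lines]
            tokens, pos_tags, chunk_tags, ner_tags = (
                list(column) for column in zip(*fields))
            print(tokens, ner_tags)
    # ['EU', 'rejects'] ['B-ORG', 'O']
    # ['Peter'] ['B-PER']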
Example #4
    def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading NER instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.named_entities:
                tags = ["O" for _ in tokens]
            else:
                tags = sentence.named_entities

            if self._coding_scheme == "BIOUL":
                tags = iob1_to_bioul(tags)

            yield self.text_to_instance(tokens, tags)
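`_ontonotes_subset` is not shown here; judging by the logging, it walks the dataset files and keeps only those whose path contains the domain identifier. A plausible sketch, assuming the `Ontonotes` helper exposes `dataset_path_iterator` and `sentence_iterator`:

    from typing import Iterable, Optional

    def ontonotes_subset_sketch(ontonotes_reader,
                                file_path: str,
                                domain_identifier: Optional[str]) -> Iterable:
        """Yield sentences, keeping files whose path mentions the domain."""
        for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
            # OntoNotes paths embed the domain as a directory segment,
            # so filtering on "/<domain>/" selects one subcorpus.
            if (domain_identifier is None
                    or f"/{domain_identifier}/" in conll_file):
                yield from ontonotes_reader.sentence_iterator(conll_file)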