Example #1
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[
                int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            new_sentences = [s.words for s in sentences]
            flattened_sentences = [
                self._normalize_word(word) for sentence in new_sentences
                for word in sentence
            ]

            def tokenizer(s: str):
                return self.token_indexer.wordpiece_tokenizer(s)

            flattened_sentences = tokenizer(" ".join(flattened_sentences))
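            # Note: this wordpiece re-tokenization is assigned but never read below;
            # text_to_instance is called with the raw sentence words instead.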
            yield self.text_to_instance([s.words for s in sentences],
                                        canonical_clusters)
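
All of these readers share the same offset bookkeeping: coreference annotations are per sentence, so each span is shifted by the number of tokens already seen to turn sentence-local indices into document-level ones. A minimal standalone sketch of just that step, using made-up toy sentences and cluster ids in place of the real Ontonotes sentence objects:

import collections
from typing import DefaultDict, List, Tuple

# Toy stand-ins for (sentence.words, sentence.coref_spans).
toy_sentences = [
    (["John", "saw", "Mary", "."], {(0, (0, 0)), (1, (2, 2))}),
    (["He", "waved", "at", "her", "."], {(0, (0, 0)), (1, (3, 3))}),
]

clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
total_tokens = 0
for words, coref_spans in toy_sentences:
    for span_id, (start, end) in coref_spans:
        # Shift sentence-local indices by the tokens already consumed.
        clusters[span_id].append((start + total_tokens, end + total_tokens))
    total_tokens += len(words)

# clusters[0] == [(0, 0), (4, 4)] and clusters[1] == [(2, 2), (7, 7)]
print(dict(clusters))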
Example #2
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        i = 0
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[
                int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)

            percent_user_spans = 0.0
            if self._simulate_user_inputs and i >= self._fully_labelled_threshold:
                percent_user_spans = 1.0

            i += 1

            yield self.text_to_instance([s.words for s in sentences],
                                        sentences[0].document_id,
                                        sentences[0].sentence_id,
                                        canonical_clusters, percent_user_spans)
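
In this variant the document counter i gates percent_user_spans: once self._simulate_user_inputs is enabled and i reaches self._fully_labelled_threshold, the value flips from 0.0 to 1.0 for every later document. A toy sketch of that gating alone, with invented flag and threshold values standing in for the reader's attributes:

simulate_user_inputs = True    # hypothetical stand-in for self._simulate_user_inputs
fully_labelled_threshold = 3   # hypothetical stand-in for self._fully_labelled_threshold

for i in range(5):
    percent_user_spans = 0.0
    if simulate_user_inputs and i >= fully_labelled_threshold:
        percent_user_spans = 1.0
    print(i, percent_user_spans)   # documents 3 and 4 get 1.0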
Example #3
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            speakers = []
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

                speakers.append(sentence.speakers)

            doc_key = sentences[0].document_id
            genre = self.genres[doc_key[:2]]

            speakers = self.flatten(speakers)
            assert total_tokens == len(speakers)

            speaker_dict = {s: i for i, s in enumerate(set(speakers))}
            speaker_ids = np.array([speaker_dict[s] for s in speakers])

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences], canonical_clusters, speaker_ids, genre)
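
Example #3 also derives a genre id from the first two characters of the document id (self.genres[doc_key[:2]]) and maps each speaker name to an integer. A quick sketch of the speaker-id step in isolation, over a hypothetical flattened speaker list; note that set() iteration order is not fixed, so the integer assigned to a given speaker can vary between runs:

import numpy as np

# Hypothetical flattened per-token speaker names.
speakers = ["speaker_1", "speaker_1", "speaker_2", "speaker_1"]

speaker_dict = {s: i for i, s in enumerate(set(speakers))}
speaker_ids = np.array([speaker_dict[s] for s in speakers])

print(speaker_dict)   # e.g. {'speaker_1': 0, 'speaker_2': 1} (assignment order may differ)
print(speaker_ids)    # e.g. [0 0 1 0]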
Example #4
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        ontonotes_reader = Ontonotes()
        for sentences in tqdm(ontonotes_reader.dataset_document_iterator(file_path)):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            instance = self.text_to_instance([s.words for s in sentences], canonical_clusters)
            instances.append(instance)

        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return Dataset(instances)
Example #5
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences], canonical_clusters)
Example #6
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences],
                                        canonical_clusters)
Example #7
    def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
        reader = Ontonotes()
        file_path = 'tests/fixtures/coref/coref.gold_conll'
        documents = list(reader.dataset_document_iterator(file_path))
        assert len(documents) == 2
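
The test exercises dataset_document_iterator, which splits one CoNLL-formatted file into its constituent documents. A usage sketch of the same call outside a test, assuming an AllenNLP version in which Ontonotes is importable from allennlp.data.dataset_readers.dataset_utils; the file path is a placeholder:

from allennlp.data.dataset_readers.dataset_utils import Ontonotes

reader = Ontonotes()
for document in reader.dataset_document_iterator("path/to/some.gold_conll"):
    # `document` is a list of sentence objects exposing fields such as words,
    # coref_spans, speakers, and document_id (all used in the examples above).
    for sentence in document:
        print(sentence.document_id, len(sentence.words), len(sentence.coref_spans))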
Example #8
    def _read_dataset(self,
                      file_path: str,
                      count_only: bool = False,
                      keep_idx: Optional[Set[int]] = None):
        """
        Yield instances from the file_path.

        Parameters
        ----------
        file_path: str, required
            The path to the data file.
        count_only: bool, optional (default=``False``)
            If True, no instances are returned and instead a dummy object is
            returned. This is useful for quickly counting the number of instances
            in the data file, since creating instances is relatively expensive.
        keep_idx: Set[int], optional (default=``None``)
            If not None, only yield instances whose index is in this set.
        """
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        # Reseed for reproducibility
        self._reseed(seed=self._seed)

        index = 0
        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[
                int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)

            text_sentences: List[List[str]] = [s.words for s in sentences]
            flattened_text_sentences: List[str] = [
                self._normalize_word(word) for text_sentence in text_sentences
                for word in text_sentence
            ]
            sentence_arc_indices: List[Tuple[int, int]] = []
            sentence_labels: List[str] = []

            # Filter the clusters to only have single-token entities
            # TODO(nfliu): How do we handle spans here?
            filtered_clusters = filter_clusters(canonical_clusters,
                                                max_span_size=1)

            # Check if there are at least two clusters, each of which has at least 2 different items.
            # If not, then skip creating examples from this passage.
            counter = 0
            all_cluster_words = []
            all_cluster_unique_words = []
            for cluster in filtered_clusters:
                # Get the words that show up in the cluster
                cluster_words = list(
                    tuple(flattened_text_sentences[index]
                          for index in range(item[0], item[1] + 1))
                    for item in cluster)
                all_cluster_words.append(cluster_words)

                cluster_unique_words = set(cluster_words)
                all_cluster_unique_words.append(cluster_unique_words)
                if len(set(cluster_words)) >= 2:
                    counter += 1
            if counter < 2:
                continue

            if keep_idx is not None and index not in keep_idx:
                index += 1
                continue
            if count_only:
                yield 1
                continue

            # Contextualize the tokens if a Contextualizer was provided.
            # TODO (nfliu): How can we make this batched?
            # Would make contextualizers that use the GPU much faster.
            if self._contextualizer:
                token_representations = self._contextualizer(
                    [flattened_text_sentences])[0]
            else:
                token_representations = None

            # For each cluster with 2+ different items, make positive examples between each of the different items
            # that are different strings and make negative examples between each of the different items and a
            # random token from another cluster.
            assert ((len(filtered_clusters) == len(all_cluster_words)) &
                    (len(all_cluster_words) == len(all_cluster_unique_words)))

            for cluster_index, (cluster_spans, cluster_words,
                                cluster_unique_words) in enumerate(
                                    zip(filtered_clusters, all_cluster_words,
                                        all_cluster_unique_words)):
                # Don't make examples from this if there is only 1 unique item.
                if len(cluster_unique_words) < 2:
                    continue
                # Get all combinations of cluster spans (a, b), where a occurs
                # in the text before b.
                all_coreferring_spans = []
                for parent_cluster_span in cluster_spans:
                    for child_cluster_span in cluster_spans:
                        # Skip child_cluster_span if it occurs before the parent_span.
                        # TODO (nfliu): this is single-word specific
                        if child_cluster_span[0] < parent_cluster_span[0]:
                            continue

                        # Skip this (child_cluster_span, parent_cluster_span) pair if the words are identical
                        if (flattened_text_sentences[
                                child_cluster_span[0]:child_cluster_span[1] +
                                1] == flattened_text_sentences[
                                    parent_cluster_span[0]:
                                    parent_cluster_span[1] + 1]):
                            continue
                        # Add to the set of coreference candidates
                        all_coreferring_spans.append(
                            (child_cluster_span, parent_cluster_span))

                # Take the coreference_candidates and generate positive and negative examples
                for (child_span, parent_span) in all_coreferring_spans:
                    # TODO (nfliu): This is single-word specific, will have to change
                    # if we generalize to spans
                    sentence_arc_indices.append(
                        (child_span[0], parent_span[0]))
                    sentence_labels.append("1")

                    # Generate a negative example for the child.
                    other_clusters = [
                        cluster for i, cluster in enumerate(filtered_clusters)
                        if i != cluster_index
                    ]
                    negative_coreferent = self._sample_negative_coreferent(
                        other_clusters, child_span[0])
                    if negative_coreferent:
                        sentence_arc_indices.append(
                            (child_span[0], negative_coreferent[0]))
                        sentence_labels.append("0")
            yield self.text_to_instance(
                tokens=flattened_text_sentences,
                arc_indices=sentence_arc_indices,
                token_representations=token_representations,
                labels=sentence_labels)
            index += 1
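
Before building arcs, this reader skips any passage that lacks at least two clusters with two or more distinct surface forms each. That check can be exercised on its own; the sketch below uses invented single-token clusters over a toy flattened document rather than real OntoNotes output:

# Toy flattened document and single-token clusters ((start, end) indices are inclusive).
flattened = ["John", "said", "he", "saw", "Mary", "because", "she", "waved"]
filtered_clusters = [[(0, 0), (2, 2)], [(4, 4), (6, 6)]]   # {John, he} and {Mary, she}

counter = 0
for cluster in filtered_clusters:
    cluster_words = [tuple(flattened[i] for i in range(start, end + 1))
                     for start, end in cluster]
    if len(set(cluster_words)) >= 2:   # at least two different surface forms
        counter += 1

skip_passage = counter < 2
print(counter, skip_passage)   # 2 False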
Example #10
    return doc_str, spans, span_pairs


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        'convert conll 2012 format into brat format')
    parser.add_argument('--inp', type=str, required=True, help='input dir')
    parser.add_argument('--out', type=str, required=True, help='output dir')
    args = parser.parse_args()

    print('reading coref instances from dataset files at: {}'.format(args.inp))

    avg_cluster_size = []
    ontonotes_reader = Ontonotes()
    for docid, doc in tqdm(
            enumerate(ontonotes_reader.dataset_document_iterator(args.inp))):
        docid += 1
        clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list)

        total_tokens = 0
        for sentence in doc:
            for typed_span in sentence.coref_spans:
                span_id, (start,
                          end) = typed_span  # both start and end are inclusive
                clusters[span_id].append(
                    (start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)

        canonical_clusters = canonicalize_clusters(clusters)
        avg_cluster_size.extend([len(c) for c in canonical_clusters])
        doc_str, spans, span_pairs = cluster_to_brat([s.words for s in doc],
Example #11
    def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
        reader = Ontonotes()
        file_path = self.FIXTURES_ROOT / 'coref' / 'coref.gold_conll'
        documents = list(reader.dataset_document_iterator(file_path))
        assert len(documents) == 2
Example #12
class ConllReader(Reader):
    def __init__(self):
        self.onto_reader = Ontonotes()

    def get_srl_ents(self, srl_frame):
        ents, rels = [], []
        begin = False
        arg = None
        start, end = -1, -1
        for idx, tag in enumerate(srl_frame):
            if tag.startswith("B") and not begin:
                begin = True
                start = idx
                arg = tag[2:]
            elif begin and tag.startswith("I"):
                assert arg == tag[2:], (arg, tag[2:])
                continue
            elif begin:
                if tag == "O":
                    begin = False
                    end = idx
                    ents.append((start, end))
                    rels.append((arg, (start, end)))
                    start = end = -1
                elif tag.startswith("B"):
                    end = idx
                    ents.append((start, end))
                    rels.append((arg, (start, end)))
                    start = idx
                    end = -1
                    arg = tag[2:]
        if len(rels) <= 1:
            return ents, []

        final_rels, verb_span = [], None
        for tag, mention in rels:
            if tag == "V":
                verb_span = mention
                break
        if not verb_span:
            # we did not find a verb
            return ents, final_rels
        for tag, mention in rels:
            if tag != "V":
                final_rels.append((tag, ("head", verb_span), ("dep", mention)))
        return ents, final_rels

    def get_coref_ents(self, cc):
        ents, clusters, rels = [], defaultdict(list), []
        for idx, mention in cc:
            mention = (
                mention[0],
                mention[1] + 1,
            )  # we add one to maintain python slicing
            ents.append(mention)
            clusters[idx].append(mention)
        for cluster in clusters.values():
            if len(cluster) <= 1:
                continue
            ant = cluster[0]
            for mention in cluster[1:]:
                rels.append(("COREF", ("head", ant), ("dep", mention)))
        return ents, rels

    def w2c(self, text, ents):
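        # `nlp` is assumed to be a module-level spaCy pipeline (e.g. nlp = spacy.load(...))
        # defined elsewhere in this file; it is only used here to recover character offsets.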
        doc = nlp(text)
        w2c = dict()
        for word in doc:
            w2c[word.i] = (word.idx, word.idx + len(word))
        cents = {ent: (w2c[ent[0]][0], w2c[ent[1] - 1][1]) for ent in ents}
        return w2c, cents

    def make_ter(self, sent):
        text = " ".join(sent.words)
        # one sentence can have multiple srl annotations
        # depending on how many verbs it has
        to_return = []
        for _, frame in sent.srl_frames:
            ents, rels = [], []
            e_srl, r_srl = self.get_srl_ents(frame)
            e_coref, r_coref = self.get_coref_ents(sent.coref_spans)
            ents.extend(e_srl + e_coref)
            rels.extend(r_srl + r_coref)
            _, cents = self.w2c(text, set(ents))
            ent_dict = {
                k: {
                    "id": "E" + str(n),
                    "cid": v
                }
                for n, (k, v) in enumerate(cents.items())
            }
            ents = [["E" + str(n), "", [list(v)]]
                    for n, (_, v) in enumerate(cents.items())]
            for idx, rel in enumerate(rels):
                (tag, (head, e1), (dep, e2)) = rel
                rels[idx] = [
                    "R" + str(idx),
                    tag,
                    [[head, ent_dict[e1]["id"]], [dep, ent_dict[e2]["id"]]],
                ]
            to_return.append([text, ents, rels])
        return to_return

    def read(self, fpath: str):
        for doc in self.onto_reader.dataset_document_iterator(fpath):
            for sent in doc:
                to_return = self.make_ter(sent)
                for text, ents, rels in to_return:
                    yield text, ents, rels
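
A minimal driver for the ConllReader above, assuming the spaCy pipeline used in w2c has been loaded and with a placeholder CoNLL path:

if __name__ == "__main__":
    reader = ConllReader()
    for text, ents, rels in reader.read("path/to/some.gold_conll"):
        print(text[:60], len(ents), len(rels))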