Beispiel #1
0
    def text_to_instance(self, sentence: List[str],
                         ner_dict: Dict[Tuple[int, int],
                                        str], relation_dict, cluster_dict,
                         trigger_dict, argument_dict, doc_key: str,
                         dataset: str, sentence_num: int, groups: List[str],
                         start_ix: int, end_ix: int, tree: Dict[str, Any],
                         syntax_dict: Dict[Tuple[int, int], str],
                         children_dict: Dict[Tuple[int, int],
                                             List[Tuple[int, int]]],
                         dep_children_dict: Dict[Tuple[int, int],
                                                 List[Tuple[int, int]]],
                         tf_dict: Dict[Tuple[int, int], Any]):
        """
        Convert one annotated sentence into an ``Instance``.

        Every label dictionary is indexed with plain ``d[key]`` for *all*
        candidate keys (every token, every enumerated span, every token pair,
        every span pair), so each one has to return a value for keys that
        carry no annotation -- presumably they are ``defaultdict``s built by
        the caller (TODO confirm).

        Args:
            sentence: Tokens of the sentence. Each token is passed through
                ``self._normalize_word`` before use.
            ner_dict: ``(start, end)`` span -> NER label. The empty string is
                treated as "no entity" when deriving the binary span labels.
            relation_dict: ``(span_1, span_2)`` -> relation label, where each
                span is a ``(start, end)`` tuple. Falsy labels are skipped.
            cluster_dict: ``(start, end)`` span -> coreference label.
            trigger_dict: token index -> trigger label.
            argument_dict: ``(token_index, span)`` -> argument label. Falsy
                labels are skipped.
            doc_key: Stored in the metadata only.
            dataset: Stored in the metadata only.
            sentence_num: Stored in the metadata only.
            groups: Tokens used (verbatim, without normalization) to build the
                ``text`` field of the instance.
            start_ix: Stored in the metadata only.
            end_ix: Stored in the metadata only.
            tree: Stored in the metadata only.
            syntax_dict: ``(start, end)`` span -> syntax label. Must have the
                same number of entries as ``children_dict``.
            children_dict: ``(start, end)`` span -> list of child spans. Child
                spans must be hashable (tuples).
            dep_children_dict: ``(i, j)`` token pair -> label for the
                dependency adjacency field. Falsy labels are skipped.
                NOTE(review): the annotation says "list of spans" but the
                values are used directly as adjacency labels -- confirm.
            tf_dict: ``(i, j)`` token pair -> feature used as the label of the
                ``tf`` adjacency field. Falsy features are skipped.

        Returns:
            An ``Instance`` with the fields ``text``, ``spans``,
            ``ner_labels``, ``coref_labels``, ``trigger_labels``,
            ``argument_labels``, ``relation_labels``, ``metadata``,
            ``span_labels``, ``ner_sequence_labels``, ``syntax_labels``,
            ``span_children``, ``span_tree_labels``, ``dep_span_children``
            and ``tf``.
        """
        sentence = [self._normalize_word(word) for word in sentence]

        # All span- and token-level fields below are anchored on `text_field`
        # (the sentence itself); the instance's `text` entry is built from
        # `groups` instead.
        text_field = TextField([Token(word) for word in sentence],
                               self._token_indexers)
        text_field_with_context = TextField([Token(word) for word in groups],
                                            self._token_indexers)

        # NER as sequence labelling: one label per token.
        ner_sequence_labels = self._generate_ner_label(sentence, ner_dict)
        ner_sequence_label_field = SequenceLabelField(
            ner_sequence_labels,
            text_field,
            label_namespace="ner_sequence_labels")

        # Put together the metadata.
        metadata_field = MetadataField(
            dict(sentence=sentence,
                 ner_dict=ner_dict,
                 relation_dict=relation_dict,
                 cluster_dict=cluster_dict,
                 trigger_dict=trigger_dict,
                 argument_dict=argument_dict,
                 doc_key=doc_key,
                 dataset=dataset,
                 groups=groups,
                 start_ix=start_ix,
                 end_ix=end_ix,
                 sentence_num=sentence_num,
                 seq_dict=ner_sequence_labels,
                 tree=tree,
                 syntax_dict=syntax_dict,
                 children_dict=children_dict,
                 dep_children_dict=dep_children_dict))

        # Trigger labels. One label per token in the input.
        token_trigger_labels = [
            trigger_dict[i] for i in range(len(text_field))
        ]
        trigger_label_field = SequenceLabelField(
            token_trigger_labels, text_field, label_namespace="trigger_labels")

        # Per-span fields and labels, all in span enumeration order.
        spans = []
        raw_spans = []
        # span -> position in `spans`; replaces repeated linear
        # `in` / `.index()` scans over `raw_spans` when resolving children.
        span_to_index = {}
        span_ner_labels = []
        span_labels = []
        span_coref_labels = []
        span_syntax_labels = []
        span_tree_labels = []
        assert len(syntax_dict) == len(children_dict)
        for start, end in enumerate_spans(sentence,
                                          max_span_width=self._max_span_width):
            span_ix = (start, end)

            # Spans are not filtered by the tree; membership is only recorded
            # as a label ('1' when the span is in the tree, '' otherwise).
            in_tree = self._is_span_in_tree(span_ix, syntax_dict,
                                            children_dict)
            span_tree_labels.append('1' if in_tree else '')

            ner_label = ner_dict[span_ix]
            span_ner_labels.append(ner_label)
            # Binary "is this span an entity at all" label.
            span_labels.append('' if ner_label == '' else '1')
            span_coref_labels.append(cluster_dict[span_ix])
            spans.append(SpanField(start, end, text_field))
            span_syntax_labels.append(syntax_dict[span_ix])

            # setdefault keeps the first position, matching list.index().
            span_to_index.setdefault(span_ix, len(raw_spans))
            raw_spans.append(span_ix)

        span_field = ListField(spans)

        # For every span, the indices (into `span_field`) of its children.
        # A child that was not enumerated maps to -1, and a span without any
        # children gets a single -1 placeholder.
        span_children_labels = []
        for span in raw_spans:
            child_indices = [
                span_to_index.get(child_span, -1)
                for child_span in children_dict[span]
            ]
            if not child_indices:
                child_indices = [-1]
            span_children_labels.append(
                ListField([
                    IndexField(child_index, span_field)
                    for child_index in child_indices
                ]))

        # Token-pair features: keep only the pairs with a truthy value.
        n_tokens = len(sentence)
        dep_adjs = []
        dep_adjs_indices = []
        tf_indices = []
        tf_features = []
        for i in range(n_tokens):
            for j in range(n_tokens):
                token_pair = (i, j)

                dep_adj_label = dep_children_dict[token_pair]
                if dep_adj_label:
                    dep_adjs_indices.append(token_pair)
                    dep_adjs.append(dep_adj_label)

                feature = tf_dict[token_pair]
                if feature:
                    tf_indices.append(token_pair)
                    tf_features.append(feature)

        ner_label_field = SequenceLabelField(span_ner_labels,
                                             span_field,
                                             label_namespace="ner_labels")
        coref_label_field = SequenceLabelField(span_coref_labels,
                                               span_field,
                                               label_namespace="coref_labels")
        span_label_field = SequenceLabelField(span_labels,
                                              span_field,
                                              label_namespace="span_labels")

        # Generate labels for relations and arguments. Only store non-null
        # values. For the arguments, by convention the first index specifies
        # the trigger token and the second specifies the argument span.
        span_tuples = [(span.span_start, span.span_end) for span in spans]

        relations = []
        relation_indices = []
        for i, first_span in enumerate(span_tuples):
            for j, second_span in enumerate(span_tuples):
                relation_label = relation_dict[(first_span, second_span)]
                if relation_label:
                    relation_indices.append((i, j))
                    relations.append(relation_label)

        relation_label_field = AdjacencyField(
            indices=relation_indices,
            sequence_field=span_field,
            labels=relations,
            label_namespace="relation_labels")

        arguments = []
        argument_indices = []
        for i in range(n_tokens):
            for j, argument_span in enumerate(span_tuples):
                argument_label = argument_dict[(i, argument_span)]
                if argument_label:
                    argument_indices.append((i, j))
                    arguments.append(argument_label)

        # Rows index tokens, columns index spans.
        argument_label_field = AdjacencyFieldAssym(
            indices=argument_indices,
            row_field=text_field,
            col_field=span_field,
            labels=arguments,
            label_namespace="argument_labels")

        # Syntax
        span_syntax_field = SequenceLabelField(
            span_syntax_labels,
            span_field,
            label_namespace="span_syntax_labels")
        span_children_field = ListField(span_children_labels)
        span_tree_field = SequenceLabelField(
            span_tree_labels, span_field, label_namespace="span_tree_labels")
        dep_span_children_field = AdjacencyField(
            indices=dep_adjs_indices,
            sequence_field=text_field,
            labels=dep_adjs,
            label_namespace="dep_adj_labels")

        tf_field = AdjacencyField(indices=tf_indices,
                                  sequence_field=text_field,
                                  labels=tf_features,
                                  label_namespace="tf_labels")

        # Pull it all together.
        fields = dict(
            text=text_field_with_context,
            spans=span_field,
            ner_labels=ner_label_field,
            coref_labels=coref_label_field,
            trigger_labels=trigger_label_field,
            argument_labels=argument_label_field,
            relation_labels=relation_label_field,
            metadata=metadata_field,
            span_labels=span_label_field,
            ner_sequence_labels=ner_sequence_label_field,
            syntax_labels=span_syntax_field,
            span_children=span_children_field,
            span_tree_labels=span_tree_field,
            dep_span_children=dep_span_children_field,
            tf=tf_field)

        return Instance(fields)
Beispiel #2
0
    def text_to_instance(self, sentence: List[str],
                         ner_dict: Dict[Tuple[int, int],
                                        str], relation_dict, doc_key: str,
                         dataset: str, sentence_num: int, groups: List[str],
                         start_ix: int, end_ix: int, tree: Dict[str, Any],
                         children_dict: Dict[Tuple[int, int],
                                             List[Tuple[int, int]]],
                         dep_children_dict: Dict[Tuple[int, int],
                                                 List[Tuple[int, int]]],
                         tf_dict: Dict[Tuple[int, int], Any]):
        """
        Convert one annotated sentence into an ``Instance``.

        Every label dictionary is indexed with plain ``d[key]`` for *all*
        candidate keys (every enumerated span, every token pair, every span
        pair), so each one has to return a value for keys that carry no
        annotation -- presumably they are ``defaultdict``s built by the
        caller (TODO confirm).

        Args:
            sentence: Tokens of the sentence. Each token is passed through
                ``self._normalize_word`` before use.
            ner_dict: ``(start, end)`` span -> NER label.
            relation_dict: ``(span_1, span_2)`` -> relation label, where each
                span is a ``(start, end)`` tuple. Falsy labels are skipped.
            doc_key: Stored in the metadata only.
            dataset: Stored in the metadata only.
            sentence_num: Stored in the metadata only.
            groups: Tokens used (verbatim, without normalization) to build the
                ``text`` field of the instance.
            start_ix: Stored in the metadata only.
            end_ix: Stored in the metadata only.
            tree: Stored in the metadata only.
            children_dict: ``(start, end)`` span -> list of child spans. Child
                spans must be hashable (tuples).
            dep_children_dict: ``(i, j)`` token pair -> label for the
                dependency adjacency field. Falsy labels are skipped.
                NOTE(review): the annotation says "list of spans" but the
                values are used directly as adjacency labels -- confirm.
            tf_dict: ``(i, j)`` token pair -> feature used as the label of the
                ``tf`` adjacency field. Falsy features are skipped.

        Returns:
            An ``Instance`` with the fields ``text``, ``spans``,
            ``ner_labels``, ``relation_labels``, ``metadata``,
            ``span_children``, ``dep_span_children`` and ``tf``.
        """
        sentence = [self._normalize_word(word) for word in sentence]

        # All span- and token-level fields below are anchored on `text_field`
        # (the sentence itself); the instance's `text` entry is built from
        # `groups` instead.
        text_field = TextField([Token(word) for word in sentence],
                               self._token_indexers)
        text_field_with_context = TextField([Token(word) for word in groups],
                                            self._token_indexers)

        # Put together the metadata.
        metadata_field = MetadataField(
            dict(sentence=sentence,
                 ner_dict=ner_dict,
                 relation_dict=relation_dict,
                 doc_key=doc_key,
                 dataset=dataset,
                 groups=groups,
                 start_ix=start_ix,
                 end_ix=end_ix,
                 sentence_num=sentence_num,
                 tree=tree,
                 children_dict=children_dict,
                 dep_children_dict=dep_children_dict))

        # Span fields and NER labels, in span enumeration order.
        spans = []
        raw_spans = []
        # span -> position in `spans`; replaces repeated linear
        # `in` / `.index()` scans over `raw_spans` when resolving children.
        span_to_index = {}
        span_ner_labels = []
        for start, end in enumerate_spans(sentence,
                                          max_span_width=self._max_span_width):
            span_ix = (start, end)
            span_ner_labels.append(ner_dict[span_ix])
            spans.append(SpanField(start, end, text_field))
            # setdefault keeps the first position, matching list.index().
            span_to_index.setdefault(span_ix, len(raw_spans))
            raw_spans.append(span_ix)

        span_field = ListField(spans)

        # For every span, the indices (into `span_field`) of its children.
        # A child that was not enumerated maps to -1, and a span without any
        # children gets a single -1 placeholder.
        span_children_labels = []
        for span in raw_spans:
            child_indices = [
                span_to_index.get(child_span, -1)
                for child_span in children_dict[span]
            ]
            if not child_indices:
                child_indices = [-1]
            span_children_labels.append(
                ListField([
                    IndexField(child_index, span_field)
                    for child_index in child_indices
                ]))

        # Token-pair features: keep only the pairs with a truthy value.
        n_tokens = len(sentence)
        dep_adjs = []
        dep_adjs_indices = []
        tf_indices = []
        tf_features = []
        for i in range(n_tokens):
            for j in range(n_tokens):
                token_pair = (i, j)

                dep_adj_label = dep_children_dict[token_pair]
                if dep_adj_label:
                    dep_adjs_indices.append(token_pair)
                    dep_adjs.append(dep_adj_label)

                feature = tf_dict[token_pair]
                if feature:
                    tf_indices.append(token_pair)
                    tf_features.append(feature)

        ner_label_field = SequenceLabelField(span_ner_labels,
                                             span_field,
                                             label_namespace="ner_labels")

        # Relations between span pairs. Only non-null labels are stored.
        span_tuples = [(span.span_start, span.span_end) for span in spans]

        relations = []
        relation_indices = []
        for i, first_span in enumerate(span_tuples):
            for j, second_span in enumerate(span_tuples):
                relation_label = relation_dict[(first_span, second_span)]
                if relation_label:
                    relation_indices.append((i, j))
                    relations.append(relation_label)

        relation_label_field = AdjacencyField(
            indices=relation_indices,
            sequence_field=span_field,
            labels=relations,
            label_namespace="relation_labels")

        # Syntax
        span_children_field = ListField(span_children_labels)
        dep_span_children_field = AdjacencyField(
            indices=dep_adjs_indices,
            sequence_field=text_field,
            labels=dep_adjs,
            label_namespace="dep_adj_labels")

        tf_field = AdjacencyField(indices=tf_indices,
                                  sequence_field=text_field,
                                  labels=tf_features,
                                  label_namespace="tf_labels")

        fields = dict(text=text_field_with_context,
                      spans=span_field,
                      ner_labels=ner_label_field,
                      relation_labels=relation_label_field,
                      metadata=metadata_field,
                      span_children=span_children_field,
                      dep_span_children=dep_span_children_field,
                      tf=tf_field)

        return Instance(fields)