indices as fields.
        """
        fields = {}

        tokens = TextField([Token(w) for w in words], self._token_indexers)
        fields[u"words"] = tokens
        fields[u"pos_tags"] = SequenceLabelField(upos_tags,
                                                 tokens,
                                                 label_namespace=u"pos")
        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields[u"head_tags"] = SequenceLabelField(
                [x[0] for x in dependencies],
                tokens,
                label_namespace=u"head_tags")
            fields[u"head_indices"] = SequenceLabelField(
                [int(x[1]) for x in dependencies],
                tokens,
                label_namespace=u"head_index_tags")

        fields[u"metadata"] = MetadataField({
            u"words": words,
            u"pos": upos_tags
        })
        return Instance(fields)


UniversalDependenciesDatasetReader = DatasetReader.register(
    u"universal_dependencies")(UniversalDependenciesDatasetReader)
Example #2
                        span_labels.append(cluster_dict[(start, end)])
                    else:
                        span_labels.append(-1)

                spans.append(SpanField(start, end, text_field))
            sentence_offset += len(sentence)

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields = {
            u"text": text_field,
            u"spans": span_field,
            u"metadata": metadata_field
        }
        if span_labels is not None:
            fields[u"span_labels"] = SequenceLabelField(
                span_labels, span_field)

        return Instance(fields)

    @staticmethod
    def _normalize_word(word):
        if word == u"/." or word == u"/?":
            return word[1:]
        else:
            return word


ConllCorefReader = DatasetReader.register(u"coref")(ConllCorefReader)
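
The truncated loop above fills `span_labels` from a `cluster_dict`; here is a small self-contained sketch of that bookkeeping with made-up mentions (not from the source):

# Gold clusters are lists of (start, end) token spans; each mention maps to its cluster id.
gold_clusters = [[(0, 1), (5, 5)], [(3, 4)]]
cluster_dict = {tuple(mention): cluster_id
                for cluster_id, cluster in enumerate(gold_clusters)
                for mention in cluster}

candidate_spans = [(0, 1), (2, 2), (3, 4)]
span_labels = [cluster_dict.get(span, -1) for span in candidate_spans]   # [0, -1, 1]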
Example #3
        identifier in the file path are yielded.
        """
        for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
            if (domain_identifier is None
                    or f"/{domain_identifier}/" in conll_file) and u"/pt/" not in conll_file:
                yield ontonotes_reader.sentence_iterator(conll_file)

    @overrides
    def text_to_instance(
            self,  # type: ignore
            tokens,
            ner_tags=None):
        u"""
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields = {u'tokens': sequence}
        instance_fields[u"metadata"] = MetadataField(
            {u"words": [x.text for x in tokens]})
        # Add "tag label" to instance
        if ner_tags is not None:
            if self._coding_scheme == u"BIOUL":
                ner_tags = to_bioul(ner_tags, encoding=u"BIO")
            instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
        return Instance(instance_fields)


OntonotesNamedEntityRecognition = DatasetReader.register(u"ontonotes_ner")(
    OntonotesNamedEntityRecognition)
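
The `to_bioul` call above re-encodes BIO tags into the BIOUL scheme; a hand-worked sketch of what that re-encoding looks like (illustrative tags only):

bio_tags = [u"B-PER", u"I-PER", u"O", u"B-ORG"]
# Single-token spans become U-, and the last token of a multi-token span becomes L-:
bioul_tags = [u"B-PER", u"L-PER", u"O", u"U-ORG"]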
Example #4
            **kwargs,
        )

    @classmethod
    def squad2(
        cls,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        passage_length_limit: int = None,
        question_length_limit: int = None,
        skip_impossible_questions: bool = False,
        no_answer_token: str = SQUAD2_NO_ANSWER_TOKEN,
        **kwargs,
    ) -> "SquadReader":
        """
        Gives a `SquadReader` suitable for SQuAD v2.0.
        """
        return cls(
            tokenizer=tokenizer,
            token_indexers=token_indexers,
            passage_length_limit=passage_length_limit,
            question_length_limit=question_length_limit,
            skip_impossible_questions=skip_impossible_questions,
            no_answer_token=no_answer_token,
            **kwargs,
        )


DatasetReader.register("squad1", constructor="squad1")(SquadReader)
DatasetReader.register("squad2", constructor="squad2")(SquadReader)
Example #5
        """
        # pylint: disable=no-self-use,unused-argument
        paragraphs = []
        for evidence_file in evidence_files:
            whole_document = u' '.join(evidence_file)
            tokens = whole_document.split(u' ')
            paragraph = u' '.join(tokens[:400])
            paragraphs.append(paragraph)
        return paragraphs

    @overrides
    def text_to_instance(
            self,  # type: ignore
            question_text,
            passage_text,
            token_spans=None,
            answer_texts=None,
            question_tokens=None,
            passage_tokens=None):
        # pylint: disable=arguments-differ
        if not question_tokens:
            question_tokens = self._tokenizer.tokenize(question_text)
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        return util.make_reading_comprehension_instance(
            question_tokens, passage_tokens, self._token_indexers,
            passage_text, token_spans, answer_texts)


TriviaQaReader = DatasetReader.register(u"triviaqa")(TriviaQaReader)
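
A tiny runnable illustration of the paragraph-truncation step above, with made-up evidence text (each evidence file is a list of text lines):

evidence_files = [[u"TriviaQA evidence sentence one.", u"Sentence two."],
                  [u"A second evidence document."]]
paragraphs = []
for evidence_file in evidence_files:
    tokens = u' '.join(evidence_file).split(u' ')
    paragraphs.append(u' '.join(tokens[:400]))   # keep at most the first 400 whitespace-split tokens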
Example #6
        action_field = ListField(production_rule_fields)
        action_map = dict((action.rule, i)  # type: ignore
                          for i, action in enumerate(action_field.field_list))
        index_fields = []
        world_field = MetadataField(world)
        fields = {
            u'utterance': utterance_field,
            u'actions': action_field,
            u'world': world_field,
            u'linking_scores': ArrayField(world.linking_scores)
        }

        if sql_query:
            if action_sequence:
                for production_rule in action_sequence:
                    index_fields.append(
                        IndexField(action_map[production_rule], action_field))

                action_sequence_field = []
                action_sequence_field.append(ListField(index_fields))
                fields[u'target_action_sequence'] = ListField(
                    action_sequence_field)
            else:
                # If we are given a SQL query, but we are unable to parse it, then we will skip it.
                return None

        return Instance(fields)


AtisDatasetReader = DatasetReader.register(u"atis")(AtisDatasetReader)
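
The `action_map` lookup above turns a sequence of production-rule strings into integer indices for the `IndexField`s; a pure-Python sketch with hypothetical rule strings (the real ATIS grammar rules differ):

actions = [u"statement -> [query, ';']", u"query -> [select_core]"]
action_map = {rule: i for i, rule in enumerate(actions)}

action_sequence = [u"statement -> [query, ';']", u"query -> [select_core]"]
indices = [action_map[rule] for rule in action_sequence]   # [0, 1]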
Example #7
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields = {u"tokens": text_field}
        if sentiment is not None:
            # 0 and 1 are negative sentiment, 2 is neutral, and 3 and 4 are positive sentiment
            # In 5-class, we use labels as is.
            # 3-class reduces the granularity, and only asks the model to predict
            # negative, neutral, or positive.
            # 2-class further reduces the granularity by only asking the model to
            # predict whether an instance is negative or positive.
            if self._granularity == u"3-class":
                if int(sentiment) < 2:
                    sentiment = u"0"
                elif int(sentiment) == 2:
                    sentiment = u"1"
                else:
                    sentiment = u"2"
            elif self._granularity == u"2-class":
                if int(sentiment) < 2:
                    sentiment = u"0"
                elif int(sentiment) == 2:
                    return None
                else:
                    sentiment = u"1"
            fields[u'label'] = LabelField(sentiment)
        return Instance(fields)


StanfordSentimentTreeBankDatasetReader = DatasetReader.register(u"sst_tokens")(
    StanfordSentimentTreeBankDatasetReader)
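
A worked pass of the label-collapsing rules above, mapping the raw 5-class SST labels to the 3-class setting (the 2-class setting additionally drops the neutral items by returning None):

raw_labels = [u"0", u"1", u"2", u"3", u"4"]
three_class = []
for sentiment in raw_labels:
    if int(sentiment) < 2:
        three_class.append(u"0")   # negative
    elif int(sentiment) == 2:
        three_class.append(u"1")   # neutral
    else:
        three_class.append(u"2")   # positive
# three_class == ["0", "0", "1", "2", "2"]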
Example #8
    def text_to_instance(
            self,  # type: ignore
            tokens,
            verb_label,
            tags=None):
        u"""
        We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
        one-hot binary vector, the same length as the tokens, indicating the position of the verb
        to find arguments for.
        """
        # pylint: disable=arguments-differ
        fields = {}
        text_field = TextField(tokens, token_indexers=self._token_indexers)
        fields[u'tokens'] = text_field
        fields[u'verb_indicator'] = SequenceLabelField(verb_label, text_field)
        if tags:
            fields[u'tags'] = SequenceLabelField(tags, text_field)

        if all([x == 0 for x in verb_label]):
            verb = None
        else:
            verb = tokens[verb_label.index(1)].text
        fields[u"metadata"] = MetadataField({
            u"words": [x.text for x in tokens],
            u"verb": verb
        })
        return Instance(fields)


SrlReader = DatasetReader.register(u"srl")(SrlReader)
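
A minimal sketch of the one-hot `verb_label` vector described in the docstring above and how the predicate word is recovered from it (made-up sentence):

words = [u"The", u"dog", u"chased", u"the", u"cat"]
verb_label = [0, 0, 1, 0, 0]                                     # marks "chased" as the predicate
verb = words[verb_label.index(1)] if any(verb_label) else None   # -> "chased"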
Example #9
        We parse such strings and return the parsed information here.  We don't actually use the
        target value right now, because we use a pre-computed set of logical forms.  So we don't
        bother parsing it; we can change that if we ever need to.
        """
        id_piece, rest = lisp_string.split(u') (utterance "')
        example_id = id_piece.split(u'(id ')[1]
        question, rest = rest.split(
            u'") (context (graph tables.TableKnowledgeGraph ')
        table_filename, rest = rest.split(u')) (targetValue (list')
        return {
            u'id': example_id,
            u'question': question,
            u'table_filename': table_filename
        }

    @staticmethod
    def _should_keep_logical_form(logical_form):
        # DPD has funny ideas about long strings of "ors" being reasonable logical forms.  They
        # aren't, and they crash our recursive type inference code.  TODO(mattg): we need to fix
        # the type inference code to not die in those cases, somehow...
        if logical_form.count(u'(or') > 3:
            logger.debug(
                f'Skipping logical form with inordinate number of "ors": {logical_form}'
            )
            return False
        return True


WikiTablesDatasetReader = DatasetReader.register(u"wikitables")(
    WikiTablesDatasetReader)
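
A self-contained sketch of the SEMPRE-style example line the parser above consumes; the id, utterance, and table path here are made up, but real WikiTableQuestions lines have the same shape:

lisp_string = (u'(example (id nt-0) (utterance "what was the last year?") '
               u'(context (graph tables.TableKnowledgeGraph csv/204-csv/590.csv)) '
               u'(targetValue (list (description "2004"))))')

id_piece, rest = lisp_string.split(u') (utterance "')
example_id = id_piece.split(u'(id ')[1]          # -> 'nt-0'
question, rest = rest.split(u'") (context (graph tables.TableKnowledgeGraph ')
table_filename, rest = rest.split(u')) (targetValue (list')
# question -> 'what was the last year?', table_filename -> 'csv/204-csv/590.csv'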
Example #10
            passage_tokens = self._tokenizer.tokenize(passage_text)
        char_spans = char_spans or []

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans = []
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            if error:
                logger.debug(u"Passage: %s", passage_text)
                logger.debug(u"Passage tokens: %s", passage_tokens)
                logger.debug(u"Question text: %s", question_text)
                logger.debug(u"Answer span: (%d, %d)", char_span_start,
                             char_span_end)
                logger.debug(u"Token span: (%d, %d)", span_start, span_end)
                logger.debug(u"Tokens in answer: %s",
                             passage_tokens[span_start:span_end + 1])
                logger.debug(u"Answer: %s",
                             passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))

        return util.make_reading_comprehension_instance(
            self._tokenizer.tokenize(question_text), passage_tokens,
            self._token_indexers, passage_text, token_spans, answer_texts)


SquadReader = DatasetReader.register(u"squad")(SquadReader)
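
A hand-worked illustration (made-up passage) of the character-to-token span conversion the logging block above is guarding:

passage_text = u"The cat sat."
# (start, end) character offsets per token, as built from token.idx above:
passage_offsets = [(0, 3), (4, 7), (8, 11), (11, 12)]   # "The" / "cat" / "sat" / "."
char_span_start, char_span_end = 4, 7                   # the answer string "cat"
# char_span_to_token_span maps this to the inclusive token span (1, 1),
# so passage_tokens[1:1 + 1] covers the answer.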
Example #11
                                                               num_tokens)])
        else:
            tokenized_strings = [
                self._tokenizer.tokenize(s) for s in instance_strings
            ]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1],
                                    self._token_indexers)
            output_field = TextField(tokenized_string[1:],
                                     self._output_indexer)
            yield Instance({
                u'input_tokens': input_field,
                u'output_tokens': output_field
            })

    @overrides
    def text_to_instance(self, sentence):  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_string = self._tokenizer.tokenize(sentence)
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], self._output_indexer)
        return Instance({
            u'input_tokens': input_field,
            u'output_tokens': output_field
        })


LanguageModelingReader = DatasetReader.register(u"language_modeling")(
    LanguageModelingReader)
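
The input/output fields above are just a one-token shift of the same tokenized sentence; a made-up example:

tokenized_string = [u"<S>", u"the", u"cat", u"sat", u"</S>"]
input_tokens = tokenized_string[:-1]    # ["<S>", "the", "cat", "sat"]
output_tokens = tokenized_string[1:]    # ["the", "cat", "sat", "</S>"]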
Example #12
            for line_num, line in enumerate(data_file):
                line = line.strip(u"\n")

                if not line:
                    continue

                line_parts = line.split(u'\t')
                if len(line_parts) != 2:
                    raise ConfigurationError(u"Invalid line format: %s (line number %d)" % (line, line_num + 1))
                source_sequence, target_sequence = line_parts
                yield self.text_to_instance(source_sequence, target_sequence)

    @overrides
    def text_to_instance(self, source_string, target_string=None):  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        if self._source_add_start_token:
            tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)
        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)
            return Instance({u"source_tokens": source_field, u"target_tokens": target_field})
        else:
            return Instance({u'source_tokens': source_field})

Seq2SeqDatasetReader = DatasetReader.register(u"seq2seq")(Seq2SeqDatasetReader)
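
A sketch of the tab-separated line format `_read` above expects and the boundary wrapping applied to each side (the sentence pair is made up; START_SYMBOL and END_SYMBOL are the library's sentence-boundary markers):

line = u"all your base\tare belong to us"
source_sequence, target_sequence = line.split(u'\t')
# After tokenization each side becomes roughly:
#   [START_SYMBOL, "all", "your", "base", END_SYMBOL]        (source)
#   [START_SYMBOL, "are", "belong", "to", "us", END_SYMBOL]  (target)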
Example #13
        cluster_dict = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        spans = []
        span_labels = [] if gold_clusters is not None else None

        for start, end in enumerate_spans(sentence, max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)

            spans.append(SpanField(start, end, text_field))

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields = {u"text": text_field,
                  u"spans": span_field,
                  u"metadata": metadata_field}
        if span_labels is not None:
            fields[u"span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)

WinobiasReader = DatasetReader.register(u"winobias")(WinobiasReader)
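
A pure-Python sketch of the span enumeration the snippet delegates to `enumerate_spans` (inclusive start/end indices, width capped at max_span_width); the sentence is made up:

sentence = [u"The", u"doctor", u"met", u"her"]
max_span_width = 2
spans = [(start, end)
         for start in range(len(sentence))
         for end in range(start, min(start + max_span_width, len(sentence)))]
# [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2), (2, 3), (3, 3)]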
Example #14
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            ccg_categories : ``SequenceLabelField``
                The CCG categories (only if supplied)
            original_pos_tags : ``SequenceLabelField``
                Original POS tag (only if supplied)
            modified_pos_tags : ``SequenceLabelField``
                Modified POS tag (only if supplied)
            predicate_arg_categories : ``SequenceLabelField``
                Predicate-argument categories (only if supplied)
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields = {u"tokens": text_field}

        for field_name, labels in ((u'ccg_categories', ccg_categories),
                                   (u'original_pos_tags', original_pos_tags),
                                   (u'modified_pos_tags', modified_pos_tags),
                                   (u'predicate_arg_categories',
                                    predicate_arg_categories)):
            if labels is not None:
                fields[field_name] = SequenceLabelField(labels, text_field)

        return Instance(fields)


CcgBankDatasetReader = DatasetReader.register(u"ccgbank")(CcgBankDatasetReader)
Example #15
                premise = example[u"sentence1"]
                hypothesis = example[u"sentence2"]

                yield self.text_to_instance(premise, hypothesis, label)

    @overrides
    def text_to_instance(
            self,  # type: ignore
            premise,
            hypothesis,
            label=None):
        # pylint: disable=arguments-differ
        fields = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
        fields[u'premise'] = TextField(premise_tokens, self._token_indexers)
        fields[u'hypothesis'] = TextField(hypothesis_tokens,
                                          self._token_indexers)
        if label:
            fields[u'label'] = LabelField(label)

        metadata = {
            u"premise_tokens": [x.text for x in premise_tokens],
            u"hypothesis_tokens": [x.text for x in hypothesis_tokens]
        }
        fields[u"metadata"] = MetadataField(metadata)
        return Instance(fields)


SnliReader = DatasetReader.register(u"snli")(SnliReader)
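
The loop above reads one JSON object per line; a sketch of the record shape it relies on (values made up; the label is typically taken from the gold_label field earlier in the loop):

example = {u"sentence1": u"A man is playing a guitar.",
           u"sentence2": u"A person is making music.",
           u"gold_label": u"entailment"}
premise = example[u"sentence1"]
hypothesis = example[u"sentence2"]
label = example[u"gold_label"]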
Example #16
        with open(file_path, u"r") as data_file:

            logger.info(u"Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                line = line.strip(u"\n")

                # skip blank lines
                if not line:
                    continue

                tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                                   for pair in line.split(self._token_delimiter)]
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                yield self.text_to_instance(tokens, tags)

    def text_to_instance(self, tokens, tags=None):  # type: ignore
        u"""
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields = {}
        sequence = TextField(tokens, self._token_indexers)
        fields[u"tokens"] = sequence
        fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
        if tags is not None:
            fields[u"tags"] = SequenceLabelField(tags, sequence)
        return Instance(fields)

SequenceTaggingDatasetReader = DatasetReader.register(u"sequence_tagging")(SequenceTaggingDatasetReader)
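
A runnable sketch of the delimited line format `_read` above parses, assuming the conventional "###" word/tag delimiter and whitespace between pairs (both are configurable on the reader):

word_tag_delimiter = u"###"     # assumed default; passed to the reader in practice
line = u"The###DET dog###NN barks###VBZ"
tokens_and_tags = [pair.rsplit(word_tag_delimiter, 1) for pair in line.split()]
tokens = [token for token, tag in tokens_and_tags]   # ["The", "dog", "barks"]
tags = [tag for token, tag in tokens_and_tags]       # ["DET", "NN", "VBZ"]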
Example #17
            # We don't actually want the spans for leaves, because
            # their labels are POS tags. Instead, we just add the length
            # of the word to the end index as we iterate through.
            end = index + len(tree)
        else:
            # otherwise, the tree has children.
            child_start = index
            for child in tree:
                # typed_spans is being updated inplace.
                end = self._get_gold_spans(child, child_start, typed_spans)
                child_start = end
            # Set the end index of the current span to
            # the last appended index - 1, as the span is inclusive.
            span = (index, end - 1)
            current_span_label = typed_spans.get(span)
            if current_span_label is None:
                # This span doesn't have nested labels, just
                # use the current node's label.
                typed_spans[span] = tree.label()
            else:
                # This span has already been added, so prepend
                # this label (as we are traversing the tree from
                # the bottom up).
                typed_spans[span] = tree.label() + u"-" + current_span_label

        return end


PennTreeBankConstituencySpanDatasetReader = DatasetReader.register(
    u"ptb_trees")(PennTreeBankConstituencySpanDatasetReader)
Example #18
        logger.info(u"Reading instances from lines in file at: %s", file_path)
        with open(cached_path(file_path), u"r") as data_file:
            tsv_in = csv.reader(data_file, delimiter=u'\t')
            for row in tsv_in:
                if len(row) == 4:
                    yield self.text_to_instance(premise=row[1],
                                                hypothesis=row[2],
                                                label=row[0])

    @overrides
    def text_to_instance(
            self,  # type: ignore
            premise,
            hypothesis,
            label=None):
        # pylint: disable=arguments-differ
        fields = {}
        tokenized_premise = self._tokenizer.tokenize(premise)
        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)
        fields[u"premise"] = TextField(tokenized_premise, self._token_indexers)
        fields[u"hypothesis"] = TextField(tokenized_hypothesis,
                                          self._token_indexers)
        if label is not None:
            fields[u'label'] = LabelField(label)

        return Instance(fields)


QuoraParaphraseDatasetReader = DatasetReader.register(u"quora_paraphrase")(
    QuoraParaphraseDatasetReader)
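
A made-up row in the four-column TSV layout the `_read` loop above expects; only the first three columns are used (label, first sentence, second sentence), and the remaining column (shown here as a hypothetical pair id) is ignored by the reader:

row = [u"1", u"How do I learn Python?",
       u"What is the best way to learn Python?", u"pair-0001"]
premise, hypothesis, label = row[1], row[2], row[0]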