        indices as fields.
        """
        fields = {}
        tokens = TextField([Token(w) for w in words], self._token_indexers)
        fields[u"words"] = tokens
        fields[u"pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace=u"pos")
        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields[u"head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                      tokens,
                                                      label_namespace=u"head_tags")
            fields[u"head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                         tokens,
                                                         label_namespace=u"head_index_tags")

        fields[u"metadata"] = MetadataField({u"words": words, u"pos": upos_tags})
        return Instance(fields)

UniversalDependenciesDatasetReader = DatasetReader.register(u"universal_dependencies")(UniversalDependenciesDatasetReader)
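# Usage sketch (illustrative, not part of the reader; assumes the default constructor arguments):
# `dependencies` is a sequence of (head_tag, head_index) pairs aligned with `words`, with
# head index 0 reserved for the root.
#
#     reader = UniversalDependenciesDatasetReader()
#     instance = reader.text_to_instance(
#         words=[u"The", u"dog", u"barked"],
#         upos_tags=[u"DET", u"NOUN", u"VERB"],
#         dependencies=[(u"det", 2), (u"nsubj", 3), (u"root", 0)])
#     # -> fields: "words", "pos_tags", "head_tags", "head_indices", "metadata"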
                        span_labels.append(cluster_dict[(start, end)])
                    else:
                        span_labels.append(-1)

                spans.append(SpanField(start, end, text_field))
            sentence_offset += len(sentence)

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields = {u"text": text_field,
                  u"spans": span_field,
                  u"metadata": metadata_field}
        if span_labels is not None:
            fields[u"span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)

    @staticmethod
    def _normalize_word(word):
        if word == u"/." or word == u"/?":
            return word[1:]
        else:
            return word

ConllCorefReader = DatasetReader.register(u"coref")(ConllCorefReader)
        identifier in the file path are yielded.
        """
        for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
            if (domain_identifier is None or
                    u"/{}/".format(domain_identifier) in conll_file) and u"/pt/" not in conll_file:
                yield ontonotes_reader.sentence_iterator(conll_file)

    @overrides
    def text_to_instance(self,  # type: ignore
                         tokens,
                         ner_tags=None):
        u"""
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields = {u'tokens': sequence}
        instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
        # Add "tag label" to instance
        if ner_tags is not None:
            if self._coding_scheme == u"BIOUL":
                ner_tags = to_bioul(ner_tags, encoding=u"BIO")
            instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
        return Instance(instance_fields)

OntonotesNamedEntityRecognition = DatasetReader.register(u"ontonotes_ner")(OntonotesNamedEntityRecognition)
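# Small illustration of the BIOUL re-encoding applied above (example tags are made up):
# with coding_scheme == "BIOUL", BIO tags such as
#     ["B-PER", "I-PER", "O", "B-LOC"]
# are converted by to_bioul(..., encoding="BIO") into
#     ["B-PER", "L-PER", "O", "U-LOC"]
# i.e. the last token of a multi-token entity becomes L-* and single-token entities become U-*.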
            **kwargs,
        )

    @classmethod
    def squad2(
        cls,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        passage_length_limit: int = None,
        question_length_limit: int = None,
        skip_impossible_questions: bool = False,
        no_answer_token: str = SQUAD2_NO_ANSWER_TOKEN,
        **kwargs,
    ) -> "SquadReader":
        """
        Gives a `SquadReader` suitable for SQuAD v2.0.
        """
        return cls(
            tokenizer=tokenizer,
            token_indexers=token_indexers,
            passage_length_limit=passage_length_limit,
            question_length_limit=question_length_limit,
            skip_impossible_questions=skip_impossible_questions,
            no_answer_token=no_answer_token,
            **kwargs,
        )


DatasetReader.register("squad1", constructor="squad1")(SquadReader)
DatasetReader.register("squad2", constructor="squad2")(SquadReader)
""" # pylint: disable=no-self-use,unused-argument paragraphs = [] for evidence_file in evidence_files: whole_document = u' '.join(evidence_file) tokens = whole_document.split(u' ') paragraph = u' '.join(tokens[:400]) paragraphs.append(paragraph) return paragraphs #overrides def text_to_instance( self, # type: ignore question_text, passage_text, token_spans=None, answer_texts=None, question_tokens=None, passage_tokens=None): # pylint: disable=arguments-differ if not question_tokens: question_tokens = self._tokenizer.tokenize(question_text) if not passage_tokens: passage_tokens = self._tokenizer.tokenize(passage_text) return util.make_reading_comprehension_instance( question_tokens, passage_tokens, self._token_indexers, passage_text, token_spans, answer_texts) TriviaQaReader = DatasetReader.register(u"triviaqa")(TriviaQaReader)
        action_field = ListField(production_rule_fields)
        action_map = dict((action.rule, i)  # type: ignore
                          for i, action in enumerate(action_field.field_list))
        index_fields = []
        world_field = MetadataField(world)
        fields = {u'utterance': utterance_field,
                  u'actions': action_field,
                  u'world': world_field,
                  u'linking_scores': ArrayField(world.linking_scores)}

        if sql_query:
            if action_sequence:
                for production_rule in action_sequence:
                    index_fields.append(IndexField(action_map[production_rule], action_field))

                action_sequence_field = []
                action_sequence_field.append(ListField(index_fields))
                fields[u'target_action_sequence'] = ListField(action_sequence_field)
            else:
                # If we are given a SQL query, but we are unable to parse it, then we will skip it.
                return None

        return Instance(fields)

AtisDatasetReader = DatasetReader.register(u"atis")(AtisDatasetReader)
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields = {u"tokens": text_field}
        if sentiment is not None:
            # 0 and 1 are negative sentiment, 2 is neutral, and 3 and 4 are positive sentiment
            # In 5-class, we use labels as is.
            # 3-class reduces the granularity, and only asks the model to predict
            # negative, neutral, or positive.
            # 2-class further reduces the granularity by only asking the model to
            # predict whether an instance is negative or positive.
            if self._granularity == u"3-class":
                if int(sentiment) < 2:
                    sentiment = u"0"
                elif int(sentiment) == 2:
                    sentiment = u"1"
                else:
                    sentiment = u"2"
            elif self._granularity == u"2-class":
                if int(sentiment) < 2:
                    sentiment = u"0"
                elif int(sentiment) == 2:
                    return None
                else:
                    sentiment = u"1"
            fields[u'label'] = LabelField(sentiment)
        return Instance(fields)

StanfordSentimentTreeBankDatasetReader = DatasetReader.register(u"sst_tokens")(StanfordSentimentTreeBankDatasetReader)
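# Worked example of the label collapsing above (illustrative restatement of the code).
# The raw treebank labels are "0".."4":
#
#   5-class: labels are kept as-is.
#   3-class: "0"/"1" -> "0" (negative), "2" -> "1" (neutral), "3"/"4" -> "2" (positive).
#   2-class: "0"/"1" -> "0" (negative), "3"/"4" -> "1" (positive); neutral ("2")
#            instances are dropped, because text_to_instance returns None for them.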
    def text_to_instance(self,  # type: ignore
                         tokens,
                         verb_label,
                         tags=None):
        u"""
        We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
        one-hot binary vector, the same length as the tokens, indicating the position of the verb
        to find arguments for.
        """
        # pylint: disable=arguments-differ
        fields = {}
        text_field = TextField(tokens, token_indexers=self._token_indexers)
        fields[u'tokens'] = text_field
        fields[u'verb_indicator'] = SequenceLabelField(verb_label, text_field)
        if tags:
            fields[u'tags'] = SequenceLabelField(tags, text_field)

        if all([x == 0 for x in verb_label]):
            verb = None
        else:
            verb = tokens[verb_label.index(1)].text
        fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens],
                                             u"verb": verb})
        return Instance(fields)

SrlReader = DatasetReader.register(u"srl")(SrlReader)
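# Minimal usage sketch (illustrative; assumes the reader's default constructor arguments):
#
#     reader = SrlReader()
#     tokens = [Token(w) for w in [u"The", u"cat", u"sat", u"."]]
#     verb_label = [0, 0, 1, 0]   # one-hot vector marking "sat" as the predicate
#     instance = reader.text_to_instance(tokens, verb_label)
#     # fields: "tokens", "verb_indicator", "metadata" (with metadata["verb"] == "sat")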
        We parse such strings and return the parsed information here.  We don't actually use the
        target value right now, because we use a pre-computed set of logical forms.  So we don't
        bother parsing it; we can change that if we ever need to.
        """
        id_piece, rest = lisp_string.split(u') (utterance "')
        example_id = id_piece.split(u'(id ')[1]
        question, rest = rest.split(u'") (context (graph tables.TableKnowledgeGraph ')
        table_filename, rest = rest.split(u')) (targetValue (list')
        return {u'id': example_id, u'question': question, u'table_filename': table_filename}

    @staticmethod
    def _should_keep_logical_form(logical_form):
        # DPD has funny ideas about long strings of "ors" being reasonable logical forms.  They
        # aren't, and they crash our recursive type inference code.  TODO(mattg): we need to fix
        # the type inference code to not die in those cases, somehow...
        if logical_form.count(u'(or') > 3:
            logger.debug(u'Skipping logical form with inordinate number of "ors": %s', logical_form)
            return False
        return True

WikiTablesDatasetReader = DatasetReader.register(u"wikitables")(WikiTablesDatasetReader)
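# For reference, an input string of the shape the parser above expects looks roughly like the
# following (the example is made up, only the delimiters come from the code, and the real
# string is a single line rather than wrapped as here):
#
#     (example (id nt-42) (utterance "what was the last year?")
#              (context (graph tables.TableKnowledgeGraph csv/204-csv/590.csv))
#              (targetValue (list (description "2004"))))
#
# which would parse into
#     {u'id': u'nt-42', u'question': u'what was the last year?',
#      u'table_filename': u'csv/204-csv/590.csv'}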
        passage_tokens = self._tokenizer.tokenize(passage_text)
        char_spans = char_spans or []

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                         (char_span_start, char_span_end))
            if error:
                logger.debug(u"Passage: %s", passage_text)
                logger.debug(u"Passage tokens: %s", passage_tokens)
                logger.debug(u"Question text: %s", question_text)
                logger.debug(u"Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug(u"Token span: (%d, %d)", span_start, span_end)
                logger.debug(u"Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug(u"Answer: %s", passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))

        return util.make_reading_comprehension_instance(self._tokenizer.tokenize(question_text),
                                                        passage_tokens,
                                                        self._token_indexers,
                                                        passage_text,
                                                        token_spans,
                                                        answer_texts)

SquadReader = DatasetReader.register(u"squad")(SquadReader)
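# Illustration of the character-to-token conversion above (made-up passage, not from the data):
# for the passage "Super Bowl 50", the character offsets computed from the tokens are
#     passage_offsets = [(0, 5), (6, 10), (11, 13)]
# and a character span (6, 13) covering "Bowl 50" should map to the inclusive token span (1, 2),
# which is what util.char_span_to_token_span is asked to recover from those offsets.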
                                                           num_tokens)])
        else:
            tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1], self._token_indexers)
            output_field = TextField(tokenized_string[1:], self._output_indexer)
            yield Instance({u'input_tokens': input_field,
                            u'output_tokens': output_field})

    @overrides
    def text_to_instance(self, sentence):  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_string = self._tokenizer.tokenize(sentence)
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], self._output_indexer)
        return Instance({u'input_tokens': input_field,
                         u'output_tokens': output_field})

LanguageModelingReader = DatasetReader.register(u"language_modeling")(LanguageModelingReader)
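# Sketch of the shifting done above (illustrative): for a tokenized sentence
#     [The, cat, sat, .]
# the reader produces
#     input_tokens  = [The, cat, sat]   # tokenized_string[:-1]
#     output_tokens = [cat, sat, .]     # tokenized_string[1:]
# so each position is trained to predict the next token.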
            for line_num, line in enumerate(data_file):
                line = line.strip(u"\n")

                if not line:
                    continue

                line_parts = line.split(u'\t')
                if len(line_parts) != 2:
                    raise ConfigurationError(u"Invalid line format: %s (line number %d)" % (line, line_num + 1))
                source_sequence, target_sequence = line_parts
                yield self.text_to_instance(source_sequence, target_sequence)

    @overrides
    def text_to_instance(self, source_string, target_string=None):  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        if self._source_add_start_token:
            tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)
        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)
            return Instance({u"source_tokens": source_field, u"target_tokens": target_field})
        else:
            return Instance({u'source_tokens': source_field})

Seq2SeqDatasetReader = DatasetReader.register(u"seq2seq")(Seq2SeqDatasetReader)
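# Expected file format and a usage sketch (illustrative; assumes the default constructor
# arguments, and START_SYMBOL / END_SYMBOL are the sentinel tokens imported elsewhere):
#
#     a source sentence<TAB>a target sentence
#
#     reader = Seq2SeqDatasetReader()
#     instance = reader.text_to_instance(u"a source sentence", u"a target sentence")
#     # "source_tokens" and "target_tokens" are TextFields bracketed by START_SYMBOL ... END_SYMBOL
#     # (the source only gets START_SYMBOL when source_add_start_token is enabled).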
        cluster_dict = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        spans = []
        span_labels = [] if gold_clusters is not None else None

        for start, end in enumerate_spans(sentence, max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)

            spans.append(SpanField(start, end, text_field))

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields = {u"text": text_field,
                  u"spans": span_field,
                  u"metadata": metadata_field}
        if span_labels is not None:
            fields[u"span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)

WinobiasReader = DatasetReader.register(u"winobias")(WinobiasReader)
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            ccg_categories : ``SequenceLabelField``
                The CCG categories (only if supplied)
            original_pos_tags : ``SequenceLabelField``
                Original POS tags (only if supplied)
            modified_pos_tags : ``SequenceLabelField``
                Modified POS tags (only if supplied)
            predicate_arg_categories : ``SequenceLabelField``
                Predicate-argument categories (only if supplied)
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields = {u"tokens": text_field}

        for field_name, labels in ((u'ccg_categories', ccg_categories),
                                   (u'original_pos_tags', original_pos_tags),
                                   (u'modified_pos_tags', modified_pos_tags),
                                   (u'predicate_arg_categories', predicate_arg_categories)):
            if labels is not None:
                fields[field_name] = SequenceLabelField(labels, text_field)

        return Instance(fields)

CcgBankDatasetReader = DatasetReader.register(u"ccgbank")(CcgBankDatasetReader)
            premise = example[u"sentence1"]
            hypothesis = example[u"sentence2"]
            yield self.text_to_instance(premise, hypothesis, label)

    @overrides
    def text_to_instance(self,  # type: ignore
                         premise,
                         hypothesis,
                         label=None):
        # pylint: disable=arguments-differ
        fields = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
        fields[u'premise'] = TextField(premise_tokens, self._token_indexers)
        fields[u'hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
        if label:
            fields[u'label'] = LabelField(label)

        metadata = {u"premise_tokens": [x.text for x in premise_tokens],
                    u"hypothesis_tokens": [x.text for x in hypothesis_tokens]}
        fields[u"metadata"] = MetadataField(metadata)
        return Instance(fields)

SnliReader = DatasetReader.register(u"snli")(SnliReader)
        with open(file_path, u"r") as data_file:
            logger.info(u"Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                line = line.strip(u"\n")

                # skip blank lines
                if not line:
                    continue

                tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                                   for pair in line.split(self._token_delimiter)]
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                yield self.text_to_instance(tokens, tags)

    def text_to_instance(self, tokens, tags=None):  # type: ignore
        u"""
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields = {}
        sequence = TextField(tokens, self._token_indexers)
        fields[u"tokens"] = sequence
        fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
        if tags is not None:
            fields[u"tags"] = SequenceLabelField(tags, sequence)
        return Instance(fields)

SequenceTaggingDatasetReader = DatasetReader.register(u"sequence_tagging")(SequenceTaggingDatasetReader)
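# Expected line format (illustrative): one sentence per line, tokens separated by
# self._token_delimiter and each token joined to its tag by self._word_tag_delimiter.
# For example, with "###" as the word-tag delimiter and whitespace as the token delimiter:
#
#     The###DT dog###NN barked###VBD
#
# yields tokens ["The", "dog", "barked"] and tags ["DT", "NN", "VBD"].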
            # We don't actually want the spans for leaves, because
            # their labels are POS tags. Instead, we just add the length
            # of the word to the end index as we iterate through.
            end = index + len(tree)
        else:
            # otherwise, the tree has children.
            child_start = index
            for child in tree:
                # typed_spans is being updated inplace.
                end = self._get_gold_spans(child, child_start, typed_spans)
                child_start = end

            # Set the end index of the current span to
            # the last appended index - 1, as the span is inclusive.
            span = (index, end - 1)
            current_span_label = typed_spans.get(span)
            if current_span_label is None:
                # This span doesn't have nested labels, just
                # use the current node's label.
                typed_spans[span] = tree.label()
            else:
                # This span has already been added, so prepend
                # this label (as we are traversing the tree from
                # the bottom up).
                typed_spans[span] = tree.label() + u"-" + current_span_label

        return end

PennTreeBankConstituencySpanDatasetReader = DatasetReader.register(u"ptb_trees")(PennTreeBankConstituencySpanDatasetReader)
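# Worked example for _get_gold_spans (illustrative): for the tree
#     (S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))
# the recursion fills typed_spans with inclusive word-index spans:
#     {(0, 1): "NP", (3, 4): "NP", (2, 4): "VP", (0, 4): "S"}
# and a unary chain such as (S (VP ...)) over a single span would be recorded as "S-VP",
# with the higher label prepended as the tree is traversed bottom-up.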
        logger.info(u"Reading instances from lines in file at: %s", file_path)
        with open(cached_path(file_path), u"r") as data_file:
            tsv_in = csv.reader(data_file, delimiter=u'\t')
            for row in tsv_in:
                if len(row) == 4:
                    yield self.text_to_instance(premise=row[1], hypothesis=row[2], label=row[0])

    @overrides
    def text_to_instance(self,  # type: ignore
                         premise,
                         hypothesis,
                         label=None):
        # pylint: disable=arguments-differ
        fields = {}
        tokenized_premise = self._tokenizer.tokenize(premise)
        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)
        fields[u"premise"] = TextField(tokenized_premise, self._token_indexers)
        fields[u"hypothesis"] = TextField(tokenized_hypothesis, self._token_indexers)
        if label is not None:
            fields[u'label'] = LabelField(label)

        return Instance(fields)

QuoraParaphraseDatasetReader = DatasetReader.register(u"quora_paraphrase")(QuoraParaphraseDatasetReader)