def setUp(self):
    super().setUp()
    token_indexer = {"tokens": SingleIdTokenIndexer()}
    field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                       token_indexer)
    field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                       token_indexer)
    field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                       token_indexer)
    field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                       token_indexer)
    self.instances = [Instance({"text1": field1, "text2": field2}),
                      Instance({"text1": field3, "text2": field4})]
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     pos_tags: List[str] = None,
                     chunk_tags: List[str] = None,
                     ner_tags: List[str] = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

    # Recode the labels if necessary.
    if self.coding_scheme == "BIOUL":
        coded_chunks = to_bioul(chunk_tags) if chunk_tags is not None else None
        coded_ner = to_bioul(ner_tags) if ner_tags is not None else None
    else:
        # the default IOB1
        coded_chunks = chunk_tags
        coded_ner = ner_tags

    # Add "feature labels" to instance
    if 'pos' in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError("Dataset reader was specified to use pos_tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
    if 'chunk' in self.feature_labels:
        if coded_chunks is None:
            raise ConfigurationError("Dataset reader was specified to use chunk tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
    if 'ner' in self.feature_labels:
        if coded_ner is None:
            raise ConfigurationError("Dataset reader was specified to use NER tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags")

    # Add "tag label" to instance
    if self.tag_label == 'ner' and coded_ner is not None:
        instance_fields['tags'] = SequenceLabelField(coded_ner, sequence, self.label_namespace)
    elif self.tag_label == 'pos' and pos_tags is not None:
        instance_fields['tags'] = SequenceLabelField(pos_tags, sequence, self.label_namespace)
    elif self.tag_label == 'chunk' and coded_chunks is not None:
        instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence, self.label_namespace)

    return Instance(instance_fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     verb_label: List[int],
                     tags: List[str] = None) -> Instance:
    """
    We take `pre-tokenized` input here, along with a verb label. The verb label should be a
    one-hot binary vector, the same length as the tokens, indicating the position of the verb
    to find arguments for.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields['tokens'] = text_field
    fields['verb_indicator'] = SequenceLabelField(verb_label, text_field)
    if tags:
        fields['tags'] = SequenceLabelField(tags, text_field)

    if all([x == 0 for x in verb_label]):
        verb = None
    else:
        verb = tokens[verb_label.index(1)].text
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                        "verb": verb})
    return Instance(fields)
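# Illustrative sketch (not part of the reader above): how a one-hot ``verb_label`` vector
# picks out the predicate whose arguments the SRL instance will label. The sentence and
# helper name are invented for the example.
from typing import List, Optional

def find_verb(tokens: List[str], verb_label: List[int]) -> Optional[str]:
    # The verb sits at the single position whose indicator is 1; all zeros means no verb.
    return None if all(x == 0 for x in verb_label) else tokens[verb_label.index(1)]

assert find_verb(["The", "cat", "chased", "the", "mouse"], [0, 0, 1, 0, 0]) == "chased"
assert find_verb(["No", "verb", "marked"], [0, 0, 0]) is None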
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as text_file:
        instance_strings = text_file.readlines()

    if self._tokens_per_instance is not None:
        all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
        tokenized_text = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance + 1
        tokenized_strings = []
        logger.info("Creating dataset from all text in file: %s", file_path)
        for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
            tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
    else:
        tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

    for tokenized_string in tokenized_strings:
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], self._output_indexer)
        yield Instance({'input_tokens': input_field,
                        'output_tokens': output_field})
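# Illustrative sketch (not from the reader above): the windowing used when
# ``tokens_per_instance`` is set. Each window holds tokens_per_instance + 1 tokens and the
# stride is one token shorter, so the last target token of one window is the first input
# token of the next; tokens at the end of the file that do not fill a window are dropped.
from typing import List

def window_tokens(tokens: List[str], tokens_per_instance: int) -> List[List[str]]:
    num_tokens = tokens_per_instance + 1
    return [tokens[index:(index + num_tokens)]
            for index in range(0, len(tokens) - num_tokens, num_tokens - 1)]

# 8 tokens with tokens_per_instance=3 -> two windows of 4 tokens, sharing the token "d".
assert window_tokens(list("abcdefgh"), 3) == [["a", "b", "c", "d"], ["d", "e", "f", "g"]]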
def text_to_instance(self, sentence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_string = self._tokenizer.tokenize(sentence)
    input_field = TextField(tokenized_string[:-1], self._token_indexers)
    output_field = TextField(tokenized_string[1:], self._output_indexer)
    return Instance({'input_tokens': input_field, 'output_tokens': output_field})
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     ccg_categories: List[str] = None,
                     original_pos_tags: List[str] = None,
                     modified_pos_tags: List[str] = None,
                     predicate_arg_categories: List[str] = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    ccg_categories : ``List[str]``, optional, (default = None).
        The CCG categories for the words in the sentence. (e.g. N/N)
    original_pos_tags : ``List[str]``, optional, (default = None).
        The tag assigned to the word in the Penn Treebank.
    modified_pos_tags : ``List[str]``, optional, (default = None).
        The POS tag might have changed during the translation to CCG.
    predicate_arg_categories : ``List[str]``, optional, (default = None).
        Encodes the word-word dependencies in the underlying predicate-argument structure.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        ccg_categories : ``SequenceLabelField``
            The CCG categories (only if supplied)
        original_pos_tags : ``SequenceLabelField``
            Original POS tag (only if supplied)
        modified_pos_tags : ``SequenceLabelField``
            Modified POS tag (only if supplied)
        predicate_arg_categories : ``SequenceLabelField``
            Predicate-argument categories (only if supplied)
    """
    # pylint: disable=arguments-differ
    text_field = TextField([Token(x) for x in tokens],
                           token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    for field_name, labels in (('ccg_categories', ccg_categories),
                               ('original_pos_tags', original_pos_tags),
                               ('modified_pos_tags', modified_pos_tags),
                               ('predicate_arg_categories', predicate_arg_categories)):
        if labels is not None:
            # end namespace in labels so Vocabulary doesn't add PAD and UNK
            namespace = self._label_namespace_prefix + field_name + '_labels'
            fields[field_name] = SequenceLabelField(labels, text_field, namespace)

    return Instance(fields)
def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
def text_to_instance(self, source_string: str, target_string: str = None) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if self._source_add_start_token:
        tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)
    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        return Instance({"source_tokens": source_field, "target_tokens": target_field})
    else:
        return Instance({'source_tokens': source_field})
def text_to_instance(self, tokens: List[str], sentiment: str = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    sentiment : ``str``, optional, (default = None).
        The sentiment for this sentence.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence or phrase.
        label : ``LabelField``
            The sentiment label of the sentence or phrase.
    """
    # pylint: disable=arguments-differ
    text_field = TextField([Token(x) for x in tokens],
                           token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}
    if sentiment is not None:
        # 0 and 1 are negative sentiment, 2 is neutral, and 3 and 4 are positive sentiment.
        # In 5-class, we use the labels as is. 3-class reduces the granularity, and only asks
        # the model to predict negative, neutral, or positive. 2-class further reduces the
        # granularity by only asking the model to predict whether an instance is negative or
        # positive.
        if self._granularity == "3-class":
            if int(sentiment) < 2:
                sentiment = "0"
            elif int(sentiment) == 2:
                sentiment = "1"
            else:
                sentiment = "2"
        elif self._granularity == "2-class":
            if int(sentiment) < 2:
                sentiment = "0"
            elif int(sentiment) == 2:
                return None
            else:
                sentiment = "1"
        fields['label'] = LabelField(sentiment)
    return Instance(fields)
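# Illustrative sketch (not part of the reader above): the granularity collapse handled in
# text_to_instance, pulled out as a standalone helper. Labels follow the SST convention,
# "0" to "4" from most negative to most positive; under 2-class, neutral phrases are
# dropped (the reader returns None for them). The helper name is made up for the example.
from typing import Optional

def collapse_sentiment(sentiment: str, granularity: str) -> Optional[str]:
    value = int(sentiment)
    if granularity == "3-class":
        return "0" if value < 2 else ("1" if value == 2 else "2")
    if granularity == "2-class":
        if value == 2:
            return None  # neutral phrases are skipped entirely
        return "0" if value < 2 else "1"
    return sentiment  # 5-class keeps the original label

assert collapse_sentiment("4", "3-class") == "2"
assert collapse_sentiment("2", "2-class") is None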
def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypothesis: str,
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    tokenized_premise = self._tokenizer.tokenize(premise)
    tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)
    fields["premise"] = TextField(tokenized_premise, self._token_indexers)
    fields["hypothesis"] = TextField(tokenized_hypothesis, self._token_indexers)
    if label is not None:
        fields['label'] = LabelField(label)
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     words: List[str],
                     upos_tags: List[str],
                     dependencies: List[Tuple[str, int]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    words : ``List[str]``, required.
        The words in the sentence to be encoded.
    upos_tags : ``List[str]``, required.
        The universal dependencies POS tags for each word.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1 indexed,
        meaning an index of 0 corresponds to that word being the root of
        the dependency tree.

    Returns
    -------
    An instance containing words, upos tags, dependency head tags and
    head indices as fields.
    """
    fields: Dict[str, Field] = {}

    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields["words"] = tokens
    fields["pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace="pos")
    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                 tokens,
                                                 label_namespace="head_tags")
        fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                    tokens,
                                                    label_namespace="head_index_tags")

    fields["metadata"] = MetadataField({"words": words, "pos": upos_tags})
    return Instance(fields)
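# Worked example (invented sentence, not from the dataset): how the (head tag, head index)
# tuples above split into the two label sequences. Indices are 1-indexed and 0 marks the root.
words = ["The", "dog", "barks"]
dependencies = [("det", 2), ("nsubj", 3), ("root", 0)]
head_tags = [tag for tag, _ in dependencies]          # ["det", "nsubj", "root"]
head_indices = [index for _, index in dependencies]   # [2, 3, 0]; "barks" heads the tree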
def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypothesis: str,
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
    fields['premise'] = TextField(premise_tokens, self._token_indexers)
    fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields['label'] = LabelField(label)

    metadata = {"premise_tokens": [x.text for x in premise_tokens],
                "hypothesis_tokens": [x.text for x in hypothesis_tokens]}
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     utterances: List[str],
                     sql_query: str = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    utterances: ``List[str]``, required.
        List of utterances in the interaction, the last element is the current utterance.
    sql_query: ``str``, optional
        The SQL query, given as label during training or validation.
    """
    utterance = utterances[-1]
    action_sequence: List[str] = []

    if not utterance:
        return None

    world = AtisWorld(utterances)

    if sql_query:
        try:
            action_sequence = world.get_action_sequence(sql_query)
        except ParseError:
            logger.debug('Parsing error')

    tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
    utterance_field = TextField(tokenized_utterance, self._token_indexers)

    production_rule_fields: List[Field] = []
    for production_rule in world.all_possible_actions():
        lhs, _ = production_rule.split(' ->')
        is_global_rule = lhs not in ['number', 'string']
        # The whitespaces are not semantically meaningful, so we filter them out.
        production_rule = ' '.join([token for token in production_rule.split(' ') if token != 'ws'])
        field = ProductionRuleField(production_rule, is_global_rule)
        production_rule_fields.append(field)

    action_field = ListField(production_rule_fields)
    action_map = {action.rule: i  # type: ignore
                  for i, action in enumerate(action_field.field_list)}
    index_fields: List[Field] = []
    world_field = MetadataField(world)
    fields = {'utterance': utterance_field,
              'actions': action_field,
              'world': world_field,
              'linking_scores': ArrayField(world.linking_scores)}

    if sql_query:
        if action_sequence:
            for production_rule in action_sequence:
                index_fields.append(IndexField(action_map[production_rule], action_field))
            action_sequence_field: List[Field] = []
            action_sequence_field.append(ListField(index_fields))
            fields['target_action_sequence'] = ListField(action_sequence_field)
        else:
            # If we are given a SQL query, but we are unable to parse it, then we will skip it.
            return None

    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     pos_tags: List[str] = None,
                     gold_tree: Tree = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    pos_tags : ``List[str]``, optional, (default = None).
        The POS tags for the words in the sentence.
    gold_tree : ``Tree``, optional (default = None).
        The gold parse tree to create span labels from.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        pos_tags : ``SequenceLabelField``
            The POS tags of the words in the sentence.
            Only returned if ``use_pos_tags`` is ``True``
        spans : ``ListField[SpanField]``
            A ListField containing all possible subspans of the sentence.
        span_labels : ``SequenceLabelField``, optional.
            The constituency tags for each of the possible spans, with respect to a gold
            parse tree. If a span is not contained within the tree, a span will have a
            ``NO-LABEL`` label.
        gold_tree : ``MetadataField(Tree)``
            The gold NLTK parse tree for use in evaluation.
    """
    # pylint: disable=arguments-differ
    text_field = TextField([Token(x) for x in tokens],
                           token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    pos_namespace = self._label_namespace_prefix + self._pos_label_namespace
    if self._use_pos_tags and pos_tags is not None:
        pos_tag_field = SequenceLabelField(pos_tags, text_field, label_namespace=pos_namespace)
        fields["pos_tags"] = pos_tag_field
    elif self._use_pos_tags:
        raise ConfigurationError("use_pos_tags was set to True but no gold pos"
                                 " tags were passed to the dataset reader.")

    spans: List[Field] = []
    gold_labels = []

    if gold_tree is not None:
        gold_spans: Dict[Tuple[int, int], str] = {}
        self._get_gold_spans(gold_tree, 0, gold_spans)
    else:
        gold_spans = None

    for start, end in enumerate_spans(tokens):
        spans.append(SpanField(start, end, text_field))
        if gold_spans is not None:
            if (start, end) in gold_spans.keys():
                gold_labels.append(gold_spans[(start, end)])
            else:
                gold_labels.append("NO-LABEL")

    metadata = {"tokens": tokens}
    if gold_tree:
        metadata["gold_tree"] = gold_tree
    if self._use_pos_tags:
        metadata["pos_tags"] = pos_tags
    fields["metadata"] = MetadataField(metadata)

    span_list_field: ListField = ListField(spans)
    fields["spans"] = span_list_field
    if gold_tree is not None:
        fields["span_labels"] = SequenceLabelField(gold_labels,
                                                   span_list_field,
                                                   label_namespace=self._label_namespace_prefix + "labels")
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     sentences: List[List[str]],
                     gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    sentences : ``List[List[str]]``, required.
        A list of lists representing the tokenised words and sentences in the document.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the document, represented as word spans. Each cluster
        contains some number of spans, which can be nested and overlap, but will never
        exactly match between clusters.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The text of the full document.
        spans : ``ListField[SpanField]``
            A ListField containing the spans represented as ``SpanFields``
            with respect to the document text.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster which each possible span belongs to, or -1 if it does
            not belong to a cluster. As these labels have variable length (it depends on
            how many spans we are considering), we represent this as a
            ``SequenceLabelField`` with respect to the ``spans`` ``ListField``.
    """
    flattened_sentences = [self._normalize_word(word)
                           for sentence in sentences
                           for word in sentence]

    metadata: Dict[str, Any] = {"original_text": flattened_sentences}
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters

    text_field = TextField([Token(word) for word in flattened_sentences],
                           self._token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

    sentence_offset = 0
    for sentence in sentences:
        for start, end in enumerate_spans(sentence,
                                          offset=sentence_offset,
                                          max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)
            spans.append(SpanField(start, end, text_field))
        sentence_offset += len(sentence)

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {"text": text_field,
                                "spans": span_field,
                                "metadata": metadata_field}
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
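# Simplified sketch of the span enumeration used by the two readers above. The library's
# ``enumerate_spans`` also takes a minimum span width and a filter function; this version
# only shows the core behaviour: every inclusive (start, end) token-index pair, shifted by
# ``offset`` and capped at ``max_span_width``. The function name is made up for the sketch.
from typing import Iterator, List, Tuple

def enumerate_spans_sketch(sentence: List[str],
                           offset: int = 0,
                           max_span_width: int = None) -> Iterator[Tuple[int, int]]:
    max_span_width = max_span_width or len(sentence)
    for start in range(len(sentence)):
        for end in range(start, min(start + max_span_width, len(sentence))):
            yield start + offset, end + offset

# A three-word sentence with max_span_width=2 yields only spans of width one and two.
expected = [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2)]
assert list(enumerate_spans_sketch(["a", "b", "c"], max_span_width=2)) == expected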
def _json_blob_to_instance(self, json_obj: JsonDict) -> Instance:
    question_tokens = self._read_tokens_from_json_list(json_obj['question_tokens'])
    question_field = TextField(question_tokens, self._question_token_indexers)
    question_metadata = MetadataField({"question_tokens": [x.text for x in question_tokens]})
    table_knowledge_graph = TableQuestionKnowledgeGraph.read_from_lines(json_obj['table_lines'],
                                                                        question_tokens)
    entity_tokens = [self._read_tokens_from_json_list(token_list)
                     for token_list in json_obj['entity_texts']]
    table_field = KnowledgeGraphField(table_knowledge_graph,
                                      question_tokens,
                                      tokenizer=None,
                                      token_indexers=self._table_token_indexers,
                                      entity_tokens=entity_tokens,
                                      linking_features=json_obj['linking_features'],
                                      include_in_vocab=self._use_table_for_vocab,
                                      max_table_tokens=self._max_table_tokens)
    world = WikiTablesWorld(table_knowledge_graph)
    world_field = MetadataField(world)

    production_rule_fields: List[Field] = []
    for production_rule in world.all_possible_actions():
        _, rule_right_side = production_rule.split(' -> ')
        is_global_rule = not world.is_table_entity(rule_right_side)
        field = ProductionRuleField(production_rule, is_global_rule)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)

    example_string_field = MetadataField(json_obj['example_lisp_string'])

    fields = {'question': question_field,
              'metadata': question_metadata,
              'table': table_field,
              'world': world_field,
              'actions': action_field,
              'example_lisp_string': example_string_field}

    if 'target_action_sequences' in json_obj or 'agenda' in json_obj:
        action_map = {action.rule: i for i, action in enumerate(action_field.field_list)}  # type: ignore
    if 'target_action_sequences' in json_obj:
        action_sequence_fields: List[Field] = []
        for sequence in json_obj['target_action_sequences']:
            index_fields: List[Field] = []
            for production_rule in sequence:
                index_fields.append(IndexField(action_map[production_rule], action_field))
            action_sequence_fields.append(ListField(index_fields))
        fields['target_action_sequences'] = ListField(action_sequence_fields)
    if 'agenda' in json_obj:
        agenda_index_fields: List[Field] = []
        for agenda_action in json_obj['agenda']:
            agenda_index_fields.append(IndexField(action_map[agenda_action], action_field))
        fields['agenda'] = ListField(agenda_index_fields)
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     question: str,
                     table_lines: List[str],
                     example_lisp_string: str = None,
                     dpd_output: List[str] = None,
                     tokenized_question: List[Token] = None) -> Instance:
    """
    Reads text inputs and makes an instance. WikitableQuestions dataset provides tables as TSV
    files, which we use for training.

    Parameters
    ----------
    question : ``str``
        Input question
    table_lines : ``List[str]``
        The table content itself, as a list of rows. See
        ``TableQuestionKnowledgeGraph.read_from_lines`` for the expected format.
    example_lisp_string : ``str``, optional
        The original (lisp-formatted) example string in the WikiTableQuestions dataset. This
        comes directly from the ``.examples`` file provided with the dataset. We pass this to
        SEMPRE for evaluating logical forms during training. It isn't otherwise used for
        anything.
    dpd_output : ``List[str]``, optional
        List of logical forms, produced by dynamic programming on denotations. Not required
        during test.
    tokenized_question : ``List[Token]``, optional
        If you have already tokenized the question, you can pass that in here, so we don't
        duplicate that work. You might, for example, do batch processing on the questions in
        the whole dataset, then pass the result in here.
    """
    # pylint: disable=arguments-differ
    tokenized_question = tokenized_question or self._tokenizer.tokenize(question.lower())
    question_field = TextField(tokenized_question, self._question_token_indexers)
    metadata: Dict[str, Any] = {"question_tokens": [x.text for x in tokenized_question]}
    metadata["original_table"] = "".join(table_lines)
    table_knowledge_graph = TableQuestionKnowledgeGraph.read_from_lines(table_lines, tokenized_question)
    table_metadata = MetadataField(table_lines)
    table_field = KnowledgeGraphField(table_knowledge_graph,
                                      tokenized_question,
                                      self._table_token_indexers,
                                      tokenizer=self._tokenizer,
                                      feature_extractors=self._linking_feature_extractors,
                                      include_in_vocab=self._use_table_for_vocab,
                                      max_table_tokens=self._max_table_tokens)
    world = WikiTablesWorld(table_knowledge_graph)
    world_field = MetadataField(world)

    production_rule_fields: List[Field] = []
    for production_rule in world.all_possible_actions():
        _, rule_right_side = production_rule.split(' -> ')
        is_global_rule = not world.is_table_entity(rule_right_side)
        field = ProductionRuleField(production_rule, is_global_rule)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)

    fields = {'question': question_field,
              'metadata': MetadataField(metadata),
              'table': table_field,
              'world': world_field,
              'actions': action_field}
    if self._include_table_metadata:
        fields['table_metadata'] = table_metadata
    if example_lisp_string:
        fields['example_lisp_string'] = MetadataField(example_lisp_string)

    # We'll make each target action sequence a List[IndexField], where the index is into
    # the action list we made above. We need to ignore the type here because mypy doesn't
    # like `action.rule` - it's hard to tell mypy that the ListField is made up of
    # ProductionRuleFields.
    action_map = {action.rule: i for i, action in enumerate(action_field.field_list)}  # type: ignore
    if dpd_output:
        action_sequence_fields: List[Field] = []
        for logical_form in dpd_output:
            if not self._should_keep_logical_form(logical_form):
                logger.debug(f'Question was: {question}')
                logger.debug(f'Table info was: {table_lines}')
                continue
            try:
                expression = world.parse_logical_form(logical_form)
            except ParsingError as error:
                logger.debug(f'Parsing error: {error.message}, skipping logical form')
                logger.debug(f'Question was: {question}')
                logger.debug(f'Logical form was: {logical_form}')
                logger.debug(f'Table info was: {table_lines}')
                continue
            except:
                logger.error(logical_form)
                raise
            action_sequence = world.get_action_sequence(expression)
            try:
                index_fields: List[Field] = []
                for production_rule in action_sequence:
                    index_fields.append(IndexField(action_map[production_rule], action_field))
                action_sequence_fields.append(ListField(index_fields))
            except KeyError as error:
                logger.debug(f'Missing production rule: {error.args}, skipping logical form')
                logger.debug(f'Question was: {question}')
                logger.debug(f'Table info was: {table_lines}')
                logger.debug(f'Logical form was: {logical_form}')
                continue
            if len(action_sequence_fields) >= self._max_dpd_logical_forms:
                break

        if not action_sequence_fields:
            # This is not great, but we're only doing it when we're passed logical form
            # supervision, so we're expecting labeled logical forms, but we can't actually
            # produce the logical forms. We should skip this instance. Note that this affects
            # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the
            # full test data.
            return None
        fields['target_action_sequences'] = ListField(action_sequence_fields)
    if self._output_agendas:
        agenda_index_fields: List[Field] = []
        for agenda_string in world.get_agenda():
            agenda_index_fields.append(IndexField(action_map[agenda_string], action_field))
        if not agenda_index_fields:
            agenda_index_fields = [IndexField(-1, action_field)]
        fields['agenda'] = ListField(agenda_index_fields)
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     sentence: str,
                     structured_representations: List[List[List[JsonDict]]],
                     labels: List[str] = None,
                     target_sequences: List[List[str]] = None,
                     identifier: str = None) -> Instance:
    """
    Parameters
    ----------
    sentence : ``str``
        The query sentence.
    structured_representations : ``List[List[List[JsonDict]]]``
        A list of Json representations of all the worlds. See expected format in this
        class' docstring.
    labels : ``List[str]`` (optional)
        List of string representations of the labels (true or false) corresponding to the
        ``structured_representations``. Not required while testing.
    target_sequences : ``List[List[str]]`` (optional)
        List of target action sequences for each element which lead to the correct
        denotation in worlds corresponding to the structured representations.
    identifier : ``str`` (optional)
        The identifier from the dataset if available.
    """
    # pylint: disable=arguments-differ
    worlds = [NlvrWorld(data) for data in structured_representations]
    tokenized_sentence = self._tokenizer.tokenize(sentence)
    sentence_field = TextField(tokenized_sentence, self._sentence_token_indexers)
    production_rule_fields: List[Field] = []
    instance_action_ids: Dict[str, int] = {}
    # TODO(pradeep): Assuming that possible actions are the same in all worlds. This may change
    # later.
    for production_rule in worlds[0].all_possible_actions():
        instance_action_ids[production_rule] = len(instance_action_ids)
        field = ProductionRuleField(production_rule, is_global_rule=True)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)
    worlds_field = ListField([MetadataField(world) for world in worlds])
    fields: Dict[str, Field] = {"sentence": sentence_field,
                                "worlds": worlds_field,
                                "actions": action_field}
    if identifier is not None:
        fields["identifier"] = MetadataField(identifier)
    # Depending on the type of supervision used for training the parser, we may want either
    # target action sequences or an agenda in our instance. We check if target sequences are
    # provided, and include them if they are. If not, we'll get an agenda for the sentence, and
    # include that in the instance.
    if target_sequences:
        action_sequence_fields: List[Field] = []
        for target_sequence in target_sequences:
            index_fields = ListField([IndexField(instance_action_ids[action], action_field)
                                      for action in target_sequence])
            action_sequence_fields.append(index_fields)
        # TODO(pradeep): Define a max length for this field.
        fields["target_action_sequences"] = ListField(action_sequence_fields)
    elif self._output_agendas:
        # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is true
        # now, but may change later too.
        agenda = worlds[0].get_agenda_for_sentence(sentence, add_paths_to_agenda=False)
        assert agenda, "No agenda found for sentence: %s" % sentence
        # agenda_field contains indices into actions.
        agenda_field = ListField([IndexField(instance_action_ids[action], action_field)
                                  for action in agenda])
        fields["agenda"] = agenda_field
    if labels:
        labels_field = ListField([LabelField(label, label_namespace='denotations')
                                  for label in labels])
        fields["labels"] = labels_field
    return Instance(fields)
def make_reading_comprehension_instance(question_tokens: List[Token],
                                        passage_tokens: List[Token],
                                        token_indexers: Dict[str, TokenIndexer],
                                        passage_text: str,
                                        token_spans: List[Tuple[int, int]] = None,
                                        answer_texts: List[str] = None,
                                        additional_metadata: Dict[str, Any] = None) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for
    use in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``. Additionally, if both
    ``answer_texts`` and ``char_span_starts`` are given, the ``Instance`` has ``span_start``
    and ``span_end`` fields, which are both ``IndexFields``.

    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors
        that get input to a model. See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text. We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question. This is used in
        official evaluation scripts.
    token_spans : ``List[Tuple[int, int]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training. This
        is a list because there might be several possible correct answer spans in the passage.
        Currently, we just select the most frequent span in this list (i.e., SQuAD has multiple
        annotations on the dev set; this will select the span that the most annotators gave as
        correct).
    answer_texts : ``List[str]``, optional
        All valid answer strings for the given question. In SQuAD, e.g., the training set has
        exactly one answer per question, but the dev and test sets have several. TriviaQA has
        many possible answers, which are the aliases for the known correct entity. This is put
        into the metadata for use with official evaluation scripts, but not used anywhere else.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.
        If you want any other metadata to be associated with each instance, you can pass that
        in here. This dictionary will get added to the ``metadata`` dictionary we already
        construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, token_indexers)
    metadata = {'original_passage': passage_text,
                'token_offsets': passage_offsets,
                'question_tokens': [token.text for token in question_tokens],
                'passage_tokens': [token.text for token in passage_tokens]}
    if answer_texts:
        metadata['answer_texts'] = answer_texts

    if token_spans:
        # There may be multiple answer annotations, so we pick the one that occurs the most.
        # This only matters on the SQuAD dev set, and it means our computed metrics
        # ("start_acc", "end_acc", and "span_acc") aren't quite the same as the official
        # metrics, which look at all of the annotations. This is why we have a separate
        # official SQuAD metric calculation (the "em" and "f1" metrics use the official script).
        candidate_answers: Counter = Counter()
        for span_start, span_end in token_spans:
            candidate_answers[(span_start, span_end)] += 1
        span_start, span_end = candidate_answers.most_common(1)[0][0]

        fields['span_start'] = IndexField(span_start, passage_field)
        fields['span_end'] = IndexField(span_end, passage_field)

    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
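# Small worked example (made-up spans, not from SQuAD): how the Counter above resolves
# multiple annotated answer spans by frequency, keeping only the most common (start, end)
# pair for the span_start / span_end fields.
from collections import Counter

annotated_spans = [(12, 14), (12, 14), (11, 14)]
candidate_answers = Counter(annotated_spans)
span_start, span_end = candidate_answers.most_common(1)[0][0]
assert (span_start, span_end) == (12, 14)  # two of the three annotators agree on this span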
def make_reading_comprehension_instance_quac(question_list_tokens: List[List[Token]],
                                             passage_tokens: List[Token],
                                             token_indexers: Dict[str, TokenIndexer],
                                             passage_text: str,
                                             token_span_lists: List[List[Tuple[int, int]]] = None,
                                             yesno_list: List[int] = None,
                                             followup_list: List[int] = None,
                                             additional_metadata: Dict[str, Any] = None,
                                             num_context_answers: int = 0) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for
    use in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``. Additionally, if both
    ``answer_texts`` and ``char_span_starts`` are given, the ``Instance`` has ``span_start``
    and ``span_end`` fields, which are both ``IndexFields``.

    Parameters
    ----------
    question_list_tokens : ``List[List[Token]]``
        An already-tokenized list of questions. Each dialog has multiple questions.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors
        that get input to a model. See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text. We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question. This is used in
        official evaluation scripts.
    token_span_lists : ``List[List[Tuple[int, int]]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training. This
        is a list of lists, first because there are multiple questions per dialog, and second
        because there might be several possible correct answer spans in the passage. Currently,
        we just select the last span in this list (i.e., QuAC has multiple annotations on the
        dev set; this will select the last span, which was given by the original annotator).
    yesno_list : ``List[int]``
        List of the affirmation bit for each question-answer pair.
    followup_list : ``List[int]``
        List of the continuation bit for each question-answer pair.
    num_context_answers : ``int``, optional
        How many previous answers to encode into the passage.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.
        If you want any other metadata to be associated with each instance, you can pass that
        in here. This dictionary will get added to the ``metadata`` dictionary we already
        construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = ListField([TextField(q_tokens, token_indexers)
                                    for q_tokens in question_list_tokens])
    metadata = {'original_passage': passage_text,
                'token_offsets': passage_offsets,
                'question_tokens': [[token.text for token in question_tokens]
                                    for question_tokens in question_list_tokens],
                'passage_tokens': [token.text for token in passage_tokens]}
    p1_answer_marker_list: List[Field] = []
    p2_answer_marker_list: List[Field] = []
    p3_answer_marker_list: List[Field] = []

    def get_tag(i, i_name):
        # Generate a tag to mark previous answer span in the passage.
        return "<{0:d}_{1:s}>".format(i, i_name)

    def mark_tag(span_start, span_end, passage_tags, prev_answer_distance):
        try:
            assert span_start > 0
            assert span_end > 0
        except:
            raise ValueError("Previous {0:d}th answer span should have been updated!".format(prev_answer_distance))
        # Modify "tags" to mark previous answer span.
        if span_start == span_end:
            passage_tags[prev_answer_distance][span_start] = get_tag(prev_answer_distance, "")
        else:
            passage_tags[prev_answer_distance][span_start] = get_tag(prev_answer_distance, "start")
            passage_tags[prev_answer_distance][span_end] = get_tag(prev_answer_distance, "end")
            for passage_index in range(span_start + 1, span_end):
                passage_tags[prev_answer_distance][passage_index] = get_tag(prev_answer_distance, "in")

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        p1_span_start, p1_span_end, p2_span_start = -1, -1, -1
        p2_span_end, p3_span_start, p3_span_end = -1, -1, -1
        # Looping each <<answers>>.
        for question_index, answer_span_lists in enumerate(token_span_lists):
            span_start, span_end = answer_span_lists[-1]  # Last one is the original answer
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
            prev_answer_marker_lists = [["O"] * len(passage_tokens), ["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens), ["O"] * len(passage_tokens)]
            if question_index > 0 and num_context_answers > 0:
                mark_tag(p1_span_start, p1_span_end, prev_answer_marker_lists, 1)
                if question_index > 1 and num_context_answers > 1:
                    mark_tag(p2_span_start, p2_span_end, prev_answer_marker_lists, 2)
                    if question_index > 2 and num_context_answers > 2:
                        mark_tag(p3_span_start, p3_span_end, prev_answer_marker_lists, 3)
                    p3_span_start = p2_span_start
                    p3_span_end = p2_span_end
                p2_span_start = p1_span_start
                p2_span_end = p1_span_end
            p1_span_start = span_start
            p1_span_end = span_end
            if num_context_answers > 2:
                p3_answer_marker_list.append(SequenceLabelField(prev_answer_marker_lists[3],
                                                                passage_field,
                                                                label_namespace="answer_tags"))
            if num_context_answers > 1:
                p2_answer_marker_list.append(SequenceLabelField(prev_answer_marker_lists[2],
                                                                passage_field,
                                                                label_namespace="answer_tags"))
            if num_context_answers > 0:
                p1_answer_marker_list.append(SequenceLabelField(prev_answer_marker_lists[1],
                                                                passage_field,
                                                                label_namespace="answer_tags"))
        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)
        if num_context_answers > 0:
            fields['p1_answer_marker'] = ListField(p1_answer_marker_list)
            if num_context_answers > 1:
                fields['p2_answer_marker'] = ListField(p2_answer_marker_list)
                if num_context_answers > 2:
                    fields['p3_answer_marker'] = ListField(p3_answer_marker_list)
        fields['yesno_list'] = ListField([LabelField(yesno, label_namespace="yesno_labels")
                                          for yesno in yesno_list])
        fields['followup_list'] = ListField([LabelField(followup, label_namespace="followup_labels")
                                             for followup in followup_list])
    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)