def text_to_instance(self, source_string: str, target_string: str = None) -> Instance:  # type: ignore
    """
    Turn raw source string and target string into an ``Instance``.

    Parameters
    ----------
    source_string : ``str``, required
    target_string : ``str``, optional (default = None)

    Returns
    -------
    Instance
        See the above for a description of the fields that the instance will
        contain.
    """
    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field

        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] + tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)
def text_to_instance(
    self, source_string: str, target_string: str = None
) -> Instance:  # type: ignore
    """
    Turn raw source string and target string into an `Instance`.

    # Parameters

    source_string : `str`, required
    target_string : `str`, optional (default = None)

    # Returns

    Instance
        See the above for a description of the fields that the instance will
        contain.
    """
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if not tokenized_source:
        # If the tokenized source is empty, it will cause issues downstream.
        raise ValueError(f"source tokenizer produced no tokens from source '{source_string}'")

    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source, self._target_namespace)

    meta_fields = {"source_tokens": [x.text for x in tokenized_source]}
    fields_dict = {"source_tokens": source_field, "source_to_target": source_to_target_field}

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field

        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
        source_and_target_token_ids = self._tokens_to_ids(tokenized_source + tokenized_target)

        source_token_ids = source_and_target_token_ids[: len(tokenized_source)]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

        target_token_ids = source_and_target_token_ids[len(tokenized_source) :]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source)
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)
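# Note: the readers above and below rely on a `_tokens_to_ids` helper that maps each
# distinct token text to a small, instance-local id, which is how CopyNet knows which
# source and target positions hold the same word. A minimal sketch of that helper,
# following the way the AllenNLP CopyNet reader implements it (the exact signature in
# any given project may differ):
@staticmethod
def _tokens_to_ids(tokens: List[Token]) -> List[int]:
    ids: Dict[str, int] = {}
    out: List[int] = []
    for token in tokens:
        # Assign a new id on first occurrence, reuse it on every later occurrence.
        out.append(ids.setdefault(token.text, len(ids)))
    return out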
def text_to_instance(
        self,  # type: ignore
        rule_text: str,
        question: str,
        scenario: str,
        history: List[Dict[str, str]],
        utterance_id: str = None,
        tree_id: str = None,
        source_url: str = None,
        answer: str = None,
        evidence: List[Dict[str, str]] = None) -> Optional[Instance]:
    """
    Turn a rule text, question, scenario, and follow-up history into an ``Instance``.

    Parameters
    ----------
    rule_text : ``str``, required
    question : ``str``, required
    scenario : ``str``, required
    history : ``List[Dict[str, str]]``, required
    utterance_id : ``str``, optional (default = None)
    tree_id : ``str``, optional (default = None)
    source_url : ``str``, optional (default = None)
    answer : ``str``, optional (default = None)
    evidence : ``List[Dict[str, str]]``, optional (default = None)

    Returns
    -------
    Optional[Instance]
        See the above for a description of the fields that the instance will
        contain.
    """
    # For the CopyNet model: the source is the rule text plus the follow-up questions.
    source_string = rule_text
    for follow_up_qna in history:
        source_string += ' @@||@@ '
        source_string += follow_up_qna['follow_up_question']
    target_string = answer

    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    # For the BiDAF model: the passage is the rule text; the question is the original
    # question, the scenario, and the follow-up question/answer pairs.
    passage_text = rule_text
    question_text = question + ' @@||@@ ' + scenario
    for follow_up_qna in history:
        question_text += ' @@||@@ '
        question_text += follow_up_qna['follow_up_question']
        question_text += ' @@?@@ '
        question_text += follow_up_qna['follow_up_answer']

    passage_tokens = self._bidaf_tokenizer.tokenize(passage_text)
    question_tokens = self._bidaf_tokenizer.tokenize(question_text)
    fields_dict['passage'] = TextField(passage_tokens, self._bidaf_token_indexers)
    fields_dict['question'] = TextField(question_tokens, self._bidaf_token_indexers)

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field

        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] + tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))

        action = 'More' if answer not in ['Yes', 'No', 'Irrelevant'] else answer
        fields_dict['label'] = LabelField(action)
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    meta_fields['rule_text'] = rule_text
    meta_fields['question'] = question
    meta_fields['scenario'] = scenario
    meta_fields['history'] = history
    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)
def text_to_instance(self, source_key: str, target_key: str = None, line_obj: Dict = {}) -> Instance:
    """
    Turn a json object into an ``Instance``.

    Parameters
    ----------
    source_key : ``str``, required
        Json object key name of the source sequence.
    target_key : ``str``, optional (default = None)
        Json object key name of the target sequence.
    line_obj : ``Dict``, required
        Json object containing the raw instance info.

    Returns
    -------
    Instance
        See the above for a description of the fields that the instance will
        contain.
    """
    # Read source and target
    target_sequence = line_obj.get(target_key, None)
    lang_src_token = line_obj["src_lang"].upper()
    lang_tgt_token = line_obj["tgt_lang"].upper()

    # Read predicate indicator and make array
    verb_label = [0, 0] + [1 if label[-2:] == "-V" else 0 for label in line_obj["BIO"]] + [0]

    # Read language indicator and make array
    lang_src_ix = self._available_languages[lang_src_token]
    lang_tgt_ix = self._available_languages[lang_tgt_token]
    # This array goes to the encoder as a whole
    lang_src_ix_arr = [0, 0] + [lang_src_ix for tok in line_obj[source_key]] + [0]
    # This goes to each one of the decoder steps (just an int for the step decoder dimensionality)
    lang_tgt_ix_arr = lang_tgt_ix

    # Tokenize source (the data comes already tokenized!)
    tokenized_source = list(map(Token, line_obj[source_key]))
    tokenized_source.insert(0, Token(lang_tgt_token))
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    # Process target info during training...
    if target_sequence is not None:
        tokenized_target = list(map(Token, line_obj[target_key]))
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field

        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] + tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    # Add the verb indicator to the fields
    fields_dict['verb_indicator'] = SequenceLabelField(verb_label, source_field)
    if all([x == 0 for x in verb_label]):
        verb = None
    else:
        verb = tokenized_source[verb_label.index(1)].text
    meta_fields["verb"] = verb

    # Add the language indicators to the fields
    meta_fields["src_lang"] = lang_src_token
    meta_fields["tgt_lang"] = lang_tgt_token
    meta_fields["original_BIO"] = line_obj.get("BIO", [])
    meta_fields["original_predicate_senses"] = line_obj.get("pred_sense_origin", [])
    meta_fields["predicate_senses"] = line_obj.get("pred_sense", [])
    meta_fields["original_target"] = line_obj.get("seq_tag_tokens", [])
    fields_dict['language_enc_indicator'] = ArrayField(np.array(lang_src_ix_arr))
    fields_dict['language_dec_indicator'] = ArrayField(np.array(lang_tgt_ix_arr))

    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)
def text_to_instance(self, source: str, target: str = None) -> Instance:
    def prepare_text(text, max_tokens):
        tokens = self.tokenizer.tokenize(text)[0:max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        return tokens

    # Tokenize the source sequence
    source_tokens = prepare_text(source, self.source_max_tokens)
    source_tokens_indexed = TextField(source_tokens, self.source_token_indexers)
    result = {'source_tokens': source_tokens_indexed}
    meta_fields = {}

    # CopyNet fields: drop the start/end symbols from the copy bookkeeping
    if self.save_copy_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens[1:-1], self.target_namespace)
        result['source_to_target'] = source_to_target_field
        meta_fields['source_tokens'] = [x.text for x in source_tokens[1:-1]]

    # Pointer-generator fields: keep the start/end symbols
    if self.save_pgn_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens, self.target_namespace)
        result['source_to_target'] = source_to_target_field
        meta_fields['source_tokens'] = [x.text for x in source_tokens]

    if target:
        target_tokens = prepare_text(target, self.target_max_tokens)
        target_tokens_indexed = TextField(target_tokens, self.target_token_indexers)
        result['target_tokens'] = target_tokens_indexed

        if self.save_copy_fields:
            meta_fields['target_tokens'] = [y.text for y in target_tokens[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(source_tokens[1:-1] + target_tokens)
            source_token_ids = source_and_target_token_ids[:len(source_tokens) - 2]
            result['source_token_ids'] = ArrayField(np.array(source_token_ids, dtype='long'))
            target_token_ids = source_and_target_token_ids[len(source_tokens) - 2:]
            result['target_token_ids'] = ArrayField(np.array(target_token_ids, dtype='long'))

        if self.save_pgn_fields:
            meta_fields['target_tokens'] = [y.text for y in target_tokens]
            source_and_target_token_ids = self._tokens_to_ids(source_tokens + target_tokens)
            source_token_ids = source_and_target_token_ids[:len(source_tokens)]
            result['source_token_ids'] = ArrayField(np.array(source_token_ids, dtype='long'))
            target_token_ids = source_and_target_token_ids[len(source_tokens):]
            result['target_token_ids'] = ArrayField(np.array(target_token_ids, dtype='long'))

    elif self.save_copy_fields:
        source_token_ids = self._tokens_to_ids(source_tokens[1:-1])
        result['source_token_ids'] = ArrayField(np.array(source_token_ids))
    elif self.save_pgn_fields:
        source_token_ids = self._tokens_to_ids(source_tokens)
        result['source_token_ids'] = ArrayField(np.array(source_token_ids))

    if self.save_copy_fields or self.save_pgn_fields:
        result['metadata'] = MetadataField(meta_fields)

    return Instance(result)
def text_to_instance(self, source: str, target: str = None) -> Instance:
    def prepare_text(text, max_tokens):
        text = text.lower() if self._lowercase else text
        tokens = self._tokenizer.tokenize(text)[:max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        return tokens

    source_tokens = prepare_text(source, self._source_max_tokens)
    source_tokens_indexed = TextField(source_tokens, self._source_token_indexers)
    result = {'source_tokens': source_tokens_indexed}
    meta_fields = {}

    if self._save_copy_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens[1:-1], self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens[1:-1]]

    if self._save_pgn_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens, self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens]

    if target:
        target_tokens = prepare_text(target, self._target_max_tokens)
        target_tokens_indexed = TextField(target_tokens, self._target_token_indexers)
        result['target_tokens'] = target_tokens_indexed

        if self._save_pgn_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens]
            source_and_target_token_ids = self._tokens_to_ids(
                source_tokens + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens)]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids, dtype='long'))
            target_token_ids = source_and_target_token_ids[len(source_tokens):]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids, dtype='long'))

        if self._save_copy_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(
                source_tokens[1:-1] + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens) - 2]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(source_tokens) - 2:]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids))

    elif self._save_copy_fields:
        source_token_ids = self._tokens_to_ids(source_tokens[1:-1], self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))
    elif self._save_pgn_fields:
        source_token_ids = self._tokens_to_ids(source_tokens, self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))

    if self._save_copy_fields or self._save_pgn_fields:
        result["metadata"] = MetadataField(meta_fields)

    return Instance(result)
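# The reader above passes `self._lowercase` into `_tokens_to_ids`, so its helper
# presumably normalizes case before assigning instance-local ids. A hedged sketch of
# that variant (an assumption; the project's real helper may differ):
@staticmethod
def _tokens_to_ids(tokens: List[Token], lowercase: bool = False) -> List[int]:
    ids: Dict[str, int] = {}
    out: List[int] = []
    for token in tokens:
        text = token.text.lower() if lowercase else token.text
        out.append(ids.setdefault(text, len(ids)))
    return out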
def text_to_instance(
        self,
        source_string: str,
        target_strings: List[str] = None,
        example_id: str = None,
        validation: bool = False,
        gradients: bool = False,
        confidences: List[float] = None) -> Instance:  # type: ignore
    """
    Turn a raw source string and a list of target strings into an ``Instance``.

    Parameters
    ----------
    source_string : ``str``, required
    target_strings : ``List[str]``, optional (default = None)

    Returns
    -------
    Instance
        See the above for a description of the fields that the instance will
        contain. Returns None when the combined source and target length
        reaches ``self._max_tokens``.
    """
    # pylint: disable=arguments-differ
    if target_strings is not None:
        target_strings += ['EOE']  # End of extractions
        confidences += [1]

    if self._bert:
        source_string = bert_utils.replace_strings(source_string)
        if target_strings is not None:
            rep_target_strings = []
            for target_string in target_strings:
                rep_target_strings.append(bert_utils.replace_strings(target_string))
            target_strings = rep_target_strings

    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

    meta_fields = {
        "source_tokens": [x.text for x in tokenized_source[1:-1]],
        "example_ids": example_id,
        "validation": validation,
        "gradients": gradients,
        "confidences": confidences
    }
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    if target_strings is not None:
        target_fields, tokenized_targets, target_token_idss = [], [], []
        num_target_tokens = 0
        for i in range(len(target_strings)):
            tokenized_target = self._target_tokenizer.tokenize(target_strings[i])
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            tokenized_targets.append(tokenized_target)
            num_target_tokens += len(tokenized_target)

            target_field = TextField(tokenized_target, self._target_token_indexers)
            target_fields.append(target_field)

            source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] + tokenized_target)
            # The ids for the source prefix do not depend on the target, so the value
            # from the last iteration is reused below.
            source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
            target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
            target_token_idss.append(ArrayField(np.array(target_token_ids)))

        fields_dict["target_tokens"] = ListField(target_fields)
        meta_fields["target_tokens"] = [[y.text for y in tokenized_target[1:-1]]
                                        for tokenized_target in tokenized_targets]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        fields_dict["target_token_ids"] = ListField(target_token_idss)

        # confidences = np.array(confidences)
        # confidence_field = ArrayField(confidences)
        # fields_dict['confidences'] = confidence_field
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    fields_dict["metadata"] = MetadataField(meta_fields)

    if (self._max_tokens is not None and target_strings is not None
            and len(tokenized_source) + num_target_tokens >= self._max_tokens):
        return None

    return Instance(fields_dict)
def text_to_instance(self,
                     document: List[str],
                     topics: List[str],
                     context: List[str],
                     cloze: Optional[str] = None) -> Instance:
    """
    Parameters
    ----------
    document:
        The list of document sentences.
    topics:
        The list of topics.
    context:
        The list of context sentences.
    cloze:
        The cloze string.
    """
    fields = {}

    # There is some weirdness that can happen if the document tokens are lowercased
    # but the context/cloze tokens are not (or vice versa). We will deal with that when
    # it's necessary. For now, we don't allow it.
    assert (self.document_token_indexers['tokens'].lowercase_tokens
            == self.cloze_token_indexers['tokens'].lowercase_tokens)
    assert (self.context_token_indexers['tokens'].lowercase_tokens
            == self.cloze_token_indexers['tokens'].lowercase_tokens)
    if self.document_token_indexers['tokens'].lowercase_tokens:
        document = [sentence.lower() for sentence in document]
        context = [sentence.lower() for sentence in context]
        if cloze is not None:
            cloze = cloze.lower()

    # Set up the document field
    tokenized_document = self.document_tokenizer.tokenize(document)
    if self.max_document_length is not None:
        tokenized_document = tokenized_document[:self.max_document_length]
    fields['document'] = TextField(tokenized_document, self.document_token_indexers)

    # Get the document token indices but in the cloze namespace
    fields['document_in_cloze_namespace'] = NamespaceSwappingField(tokenized_document, self.cloze_namespace)

    # Build a map from each token to all of the indices where that token appears
    document_token_to_indices = get_token_to_index_map(tokenized_document)

    # Get a field that, for every document token, has the first index within
    # the document where that token appears
    fields['document_token_first_indices'] = \
        get_first_indices_field(tokenized_document, document_token_to_indices)

    # Set up the topics
    tokenized_topics = [self.topic_tokenizer.tokenize(topic) for topic in topics]
    topic_fields = [TextField(tokenized_topic, self.topic_token_indexers)
                    for tokenized_topic in tokenized_topics]
    fields['topics'] = ListField(topic_fields)

    # Set up the context
    tokenized_context = self.context_tokenizer.tokenize(context)
    if self.max_context_length is not None:
        # We take the last tokens instead of the first because the cloze
        # comes immediately after the context
        tokenized_context = tokenized_context[-self.max_context_length:]
    fields['context'] = TextField(tokenized_context, self.context_token_indexers)

    context_token_document_indices_field, mask_field = \
        get_token_mapping_field(document_token_to_indices, tokenized_context)
    fields['context_token_document_indices'] = context_token_document_indices_field
    fields['context_token_document_indices_mask'] = mask_field

    # Set up the cloze field, if it exists
    if cloze is not None:
        tokenized_cloze = self.cloze_tokenizer.tokenize(cloze)
        if self.max_cloze_length is not None:
            tokenized_cloze = tokenized_cloze[:self.max_cloze_length]
        fields['cloze'] = TextField(tokenized_cloze, self.cloze_token_indexers)

        cloze_token_document_indices_field, mask_field = \
            get_token_mapping_field(document_token_to_indices, tokenized_cloze)
        fields['cloze_token_document_indices'] = cloze_token_document_indices_field
        fields['cloze_token_document_indices_mask'] = mask_field

    # Pass the original data through as metadata
    metadata = {}
    metadata['document'] = document
    metadata['document_tokens'] = [str(token) for token in tokenized_document]
    metadata['topics'] = topics
    metadata['context'] = context
    if cloze is not None:
        metadata['cloze'] = cloze
    fields['metadata'] = MetadataField(metadata)

    return Instance(fields)
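# The cloze reader above calls `get_token_to_index_map`, which is not shown here.
# Judging only from its usage (mapping each document token to every position where it
# occurs), a plausible sketch looks like this; the real helper and its field-building
# companions (`get_first_indices_field`, `get_token_mapping_field`) may differ:
from collections import defaultdict

def get_token_to_index_map(tokens: List[Token]) -> Dict[str, List[int]]:
    token_to_indices: Dict[str, List[int]] = defaultdict(list)
    for index, token in enumerate(tokens):
        token_to_indices[token.text].append(index)
    return dict(token_to_indices)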
def text_to_instance(
        self,
        user: List[str],
        system: List[str],
        domains: List[str],
        usr_value_dict: Dict[int, str],
        sys_value_dict: Dict[int, str],
        # acts: List[List[str]],
        target: List[str] = None) -> Instance:
    fields_dict: Dict[str, Field] = {}

    # Note that it's a non-hierarchical model, so the user/system/target are all TextFields.
    user_string = " ".join(user)
    tokenized_user = self._tokenizer.tokenize(user_string)
    tokenized_user.insert(0, Token(START_SYMBOL))
    tokenized_user.append(Token(END_SYMBOL))
    user_field = TextField(tokenized_user, self._token_indexers)
    fields_dict["user_tokens"] = user_field

    sys_string = " ".join(system)
    tokenized_sys = self._tokenizer.tokenize(sys_string)
    tokenized_sys.insert(0, Token(START_SYMBOL))
    tokenized_sys.append(Token(END_SYMBOL))
    sys_field = TextField(tokenized_sys, self._token_indexers)
    fields_dict["sys_tokens"] = sys_field

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    # p.s. separate matching of user and system; do they need separate namespaces?
    user_to_target_field = NamespaceSwappingField(tokenized_user, self._target_namespace)
    sys_to_target_field = NamespaceSwappingField(tokenized_sys, self._target_namespace)
    fields_dict["user_to_target"] = user_to_target_field
    fields_dict["sys_to_target"] = sys_to_target_field

    meta_fields = {
        "user_tokens": [x.text for x in tokenized_user],
        "sys_tokens": [x.text for x in tokenized_sys],
        "user_values_dict": usr_value_dict,
        "sys_values_dict": sys_value_dict
    }

    # Generate the masks of "delex" slots
    usr_mask = np.zeros(len(user_field))
    for k in usr_value_dict.keys():
        usr_mask[k] = 1
    fields_dict["user_value_mask"] = ArrayField(usr_mask)

    sys_mask = np.zeros(len(sys_field))
    for k in sys_value_dict.keys():
        sys_mask[k] = 1
    fields_dict["sys_value_mask"] = ArrayField(sys_mask)

    if target is not None:
        target_string = " ".join(target)
        tokenized_target = self._tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexer)
        fields_dict["target_tokens"] = target_field
        meta_fields["target_tokens"] = [y.text for y in tokenized_target]

        user_token_ids = self._tokens_to_ids(tokenized_user)
        sys_token_ids = self._tokens_to_ids(tokenized_sys)
        target_token_ids = self._tokens_to_ids(tokenized_target)
        fields_dict["user_token_ids"] = ArrayField(np.array(user_token_ids))
        fields_dict["sys_token_ids"] = ArrayField(np.array(sys_token_ids))
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
    else:
        user_token_ids = self._tokens_to_ids(tokenized_user)
        sys_token_ids = self._tokens_to_ids(tokenized_sys)
        fields_dict["user_token_ids"] = ArrayField(np.array(user_token_ids))
        fields_dict["sys_token_ids"] = ArrayField(np.array(sys_token_ids))

    domain_field = MultiLabelField(domains, label_namespace="domain_labels")
    fields_dict["domain_labels"] = domain_field

    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)
def text_to_instance(
    self,
    source_string: str,
    target_string: str = None,
    weight: float = None,
) -> Instance:  # type: ignore
    """
    Turn raw source string and target string into an `Instance`.

    # Parameters

    source_string : `str`, required

    target_string : `str`, optional (default = `None`)

    weight : `float`, optional (default = `None`)
        An optional weight to assign to this instance when calculating the loss in
        [CopyNetSeq2Seq.forward()](../../models/copynet_seq2seq/#forward.parameters).

    # Returns

    `Instance`
        See the above for a description of the fields that the instance will
        contain.
    """
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if not tokenized_source:
        # If the tokenized source is empty, it will cause issues downstream.
        raise ValueError(f"source tokenizer produced no tokens from source '{source_string}'")

    source_field = TextField(tokenized_source)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source, self._target_namespace)

    meta_fields = {"source_tokens": [x.text for x in tokenized_source]}
    fields_dict = {"source_tokens": source_field, "source_to_target": source_to_target_field}

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target)
        fields_dict["target_tokens"] = target_field

        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
        source_and_target_token_ids = self._tokens_to_ids(tokenized_source + tokenized_target)

        source_token_ids = source_and_target_token_ids[: len(tokenized_source)]
        fields_dict["source_token_ids"] = TensorField(torch.tensor(source_token_ids))

        target_token_ids = source_and_target_token_ids[len(tokenized_source) :]
        fields_dict["target_token_ids"] = TensorField(torch.tensor(target_token_ids))
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source)
        fields_dict["source_token_ids"] = TensorField(torch.tensor(source_token_ids))

    fields_dict["metadata"] = MetadataField(meta_fields)

    if weight is not None:
        fields_dict["weight"] = TensorField(torch.tensor(float(weight), dtype=torch.float))

    return Instance(fields_dict)
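# The reader above builds its TextFields without token indexers; in AllenNLP 2.x the
# indexers are attached later via `apply_token_indexers`, once per instance. A minimal
# sketch of that companion method (assuming the attribute names used by the reader above):
def apply_token_indexers(self, instance: Instance) -> None:
    instance.fields["source_tokens"]._token_indexers = self._source_token_indexers  # type: ignore
    if "target_tokens" in instance.fields:
        instance.fields["target_tokens"]._token_indexers = self._target_token_indexers  # type: ignore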
def text_to_instance(self,
                     rule_text,
                     question,
                     scenario,
                     history,
                     answer=None,
                     evidence=None) -> Instance:  # type: ignore
    """
    Turn a rule text, question, scenario, and follow-up history into an ``Instance``.

    Parameters
    ----------
    rule_text : ``str``, required
    question : ``str``, required
    scenario : ``str``, required
    history : ``List[Dict[str, str]]``, required
    answer : ``str``, optional (default = None)
    evidence : ``List[Dict[str, str]]``, optional (default = None)

    Returns
    -------
    Instance
        See the above for a description of the fields that the instance will
        contain. Returns None for examples that cannot be converted.
    """
    # pylint: disable=arguments-differ
    if answer and answer in ['Yes', 'No', 'Irrelevant']:
        return None

    predicted_span, predicted_label = self.get_prediction(rule_text, question, scenario, history)

    if answer is not None:
        # i.e. during training and validation
        token_span = self.dataset_reader.find_lcs(answer, predicted_span,
                                                  self._source_tokenizer.tokenize,
                                                  fuzzy_matching=False)
        if token_span is None:
            return None
        answer_offsets = [(token.idx, token.idx + len(token.text))
                          for token in self._source_tokenizer.tokenize(answer)]
        try:
            target_string1 = answer[:answer_offsets[token_span[0] - 1][1]]
            target_string2 = answer[answer_offsets[token_span[1] + 1][0]:]
        except IndexError:
            return None
    else:
        target_string1 = None
        target_string2 = None

    if self.add_rule:
        if self.embed_span:
            source_string = self.get_embedded_span(rule_text, predicted_span)
        else:
            source_string = rule_text + ' @pss@ ' + predicted_span + ' @pse@'
    else:
        source_string = predicted_span

    if self.add_question:
        source_string += ' @qs@ ' + question + ' @qe'

    if self.add_followup_ques:
        for follow_up_qna in history:
            source_string += ' @fs@ ' + follow_up_qna['follow_up_question'] + ' @fe'

    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

    meta_fields = {
        "source_tokens": [x.text for x in tokenized_source[1:-1]],
        "predicted_span_tokens": [token.text
                                  for token in self._source_tokenizer.tokenize(predicted_span)]
    }
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    if target_string1 is not None and target_string2 is not None:
        tokenized_target1 = self._target_tokenizer.tokenize(target_string1)
        tokenized_target1.insert(0, Token(START_SYMBOL))
        tokenized_target1.append(Token(END_SYMBOL))
        tokenized_target2 = self._target_tokenizer.tokenize(target_string2)
        tokenized_target2.insert(0, Token(START_SYMBOL))
        tokenized_target2.append(Token(END_SYMBOL))

        target_field1 = TextField(tokenized_target1, self._target_token_indexers)
        target_field2 = TextField(tokenized_target2, self._target_token_indexers)
        fields_dict["target_tokens1"] = target_field1
        fields_dict["target_tokens2"] = target_field2
        meta_fields["target_tokens1"] = [y.text for y in tokenized_target1[1:-1]]
        meta_fields["target_tokens2"] = [y.text for y in tokenized_target2[1:-1]]

        source_and_target_token_ids1 = self._tokens_to_ids(tokenized_source[1:-1] + tokenized_target1)
        source_and_target_token_ids2 = self._tokens_to_ids(tokenized_source[1:-1] + tokenized_target2)

        source_token_ids1 = np.array(source_and_target_token_ids1[:len(tokenized_source) - 2])
        source_token_ids2 = np.array(source_and_target_token_ids2[:len(tokenized_source) - 2])
        assert np.array_equal(source_token_ids1, source_token_ids2)
        fields_dict["source_token_ids"] = ArrayField(source_token_ids1)

        target_token_ids1 = np.array(source_and_target_token_ids1[len(tokenized_source) - 2:])
        target_token_ids2 = np.array(source_and_target_token_ids2[len(tokenized_source) - 2:])
        fields_dict["target_token_ids1"] = ArrayField(target_token_ids1)
        fields_dict["target_token_ids2"] = ArrayField(target_token_ids2)
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    meta_fields['label'] = predicted_label
    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)
def text_to_instance(
        self,  # type: ignore
        rule_text: str,
        question: str,
        scenario: str,
        history: List[Dict[str, str]],
        utterance_id: str = None,
        tree_id: str = None,
        source_url: str = None,
        answer: str = None,
        evidence: List[Dict[str, str]] = None) -> Optional[Instance]:
    """
    Turn a rule text, question, scenario, and follow-up history into an ``Instance``.

    Parameters
    ----------
    rule_text : ``str``, required
    question : ``str``, required
    scenario : ``str``, required
    history : ``List[Dict[str, str]]``, required
    utterance_id : ``str``, optional (default = None)
    tree_id : ``str``, optional (default = None)
    source_url : ``str``, optional (default = None)
    answer : ``str``, optional (default = None)
    evidence : ``List[Dict[str, str]]``, optional (default = None)

    Returns
    -------
    Optional[Instance]
        See the above for a description of the fields that the instance will
        contain.
    """
    # Predict the relevant rule span and mark it in the source string.
    utterance = {'snippet': rule_text, 'question': question,
                 'scenario': scenario, 'history': history}
    span = self.predictor.predict_json(utterance)['best_span_str']
    span_size = len(span)
    span_start_index = rule_text.find(span)
    if span_start_index != -1:
        source_string = ''
        source_string += rule_text[:span_start_index] + '@@**@@ ' + span
        source_string += ' @@**@@ ' + rule_text[span_start_index + span_size:]
    else:
        source_string = rule_text
        print('Can\'t find span.')

    source_string += ' @@||@@ ' + question
    for follow_up_qna in history:
        source_string += ' @@||@@ '
        source_string += follow_up_qna['follow_up_question']
        source_string += ' @@?@@ '
        source_string += follow_up_qna['follow_up_answer']
    target_string = answer

    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field

        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] + tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    meta_fields['rule_text'] = rule_text
    meta_fields['question'] = question
    meta_fields['scenario'] = scenario
    meta_fields['history'] = history
    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)
def text_to_instance(self,
                     rule_text,
                     question,
                     scenario,
                     history,
                     answer=None,
                     evidence=None) -> Instance:  # type: ignore
    """
    Turn a rule text, question, scenario, and follow-up history into an ``Instance``.

    Parameters
    ----------
    rule_text : ``str``, required
    question : ``str``, required
    scenario : ``str``, required
    history : ``List[Dict[str, str]]``, required
    answer : ``str``, optional (default = None)
    evidence : ``List[Dict[str, str]]``, optional (default = None)

    Returns
    -------
    Instance
        See the above for a description of the fields that the instance will
        contain. Returns None for examples that cannot be converted.
    """
    # pylint: disable=arguments-differ
    if answer and answer in ['Yes', 'No', 'Irrelevant']:
        return None

    target_string = answer

    if self.train_using_gold and answer is not None:
        # i.e. during training and validation
        predicted_label = answer if answer in ['Yes', 'No', 'Irrelevant'] else 'More'
        predicted_span_ixs = self.dataset_reader.find_lcs(rule_text, answer,
                                                          self._source_tokenizer.tokenize)
        if predicted_span_ixs is None:
            return None
        else:
            rule_offsets = [(token.idx, token.idx + len(token.text))
                            for token in self._source_tokenizer.tokenize(rule_text)]
            predicted_span = rule_text[rule_offsets[predicted_span_ixs[0]][0]:
                                       rule_offsets[predicted_span_ixs[1]][1]]
    else:
        predicted_span, predicted_label = self.get_prediction(rule_text, question, scenario, history)

    if self.add_rule:
        if self.embed_span:
            source_string = self.get_embedded_span(rule_text, predicted_span)
        else:
            source_string = rule_text + ' @pss@ ' + predicted_span + ' @pse@'
    else:
        source_string = predicted_span

    if self.add_question:
        source_string += ' @qs@ ' + question + ' @qe'

    if self.add_followup_ques:
        for follow_up_qna in history:
            source_string += ' @fs@ ' + follow_up_qna['follow_up_question'] + ' @fe'

    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field

        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] + tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    meta_fields['label'] = predicted_label
    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)
def text_to_instance(
        self,  # type: ignore
        rule_text: str,
        question: str,
        scenario: str,
        history: List[Dict[str, str]],
        utterance_id: str = None,
        tree_id: str = None,
        source_url: str = None,
        answer: str = None,
        evidence: List[Dict[str, str]] = None) -> Optional[Instance]:
    """
    Turn a rule text, question, scenario, and follow-up history into an ``Instance``.

    Parameters
    ----------
    rule_text : ``str``, required
    question : ``str``, required
    scenario : ``str``, required
    history : ``List[Dict[str, str]]``, required
    utterance_id : ``str``, optional (default = None)
    tree_id : ``str``, optional (default = None)
    source_url : ``str``, optional (default = None)
    answer : ``str``, optional (default = None)
    evidence : ``List[Dict[str, str]]``, optional (default = None)

    Returns
    -------
    Optional[Instance]
        See the above for a description of the fields that the instance will
        contain.
    """
    # For the CopyNet model
    source_string = rule_text + ' [SEP]'
    target_string = answer

    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    # tokenized_source.append(Token(END_SYMBOL))  # '[SEP]' acts as the end symbol
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    # For the BERT model
    passage_text1 = rule_text + ' [SEP]'
    question_text1 = question
    passage_text2 = rule_text + ' [SEP]'
    question_text2 = scenario
    bert_input1 = passage_text1 + ' ' + question_text1
    bert_input2 = passage_text2 + ' ' + question_text2

    bert_input_tokens1 = self.get_tokens_with_history_encoding(bert_input1, history)
    bert_input_tokens2 = self._bert_tokenizer.tokenize(bert_input2)
    bert_input_tokens1.insert(0, Token(START_SYMBOL))
    bert_input_tokens2.insert(0, Token(START_SYMBOL))

    fields_dict['bert_input1'] = TextField(bert_input_tokens1, self._bert_token_indexers)
    fields_dict['bert_input2'] = TextField(bert_input_tokens2, self._bert_token_indexers)
    meta_fields['passage_tokens1'] = self._bert_tokenizer.tokenize(passage_text1)
    meta_fields['passage_tokens2'] = self._bert_tokenizer.tokenize(passage_text2)

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field

        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] + tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))

        action = 'More' if answer not in ['Yes', 'No', 'Irrelevant'] else answer
        fields_dict['label'] = LabelField(action)
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    meta_fields['rule_text'] = rule_text
    meta_fields['question'] = question
    meta_fields['scenario'] = scenario
    meta_fields['history'] = history
    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)