def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    if file_path.endswith("zip"):
        archive = zipfile.ZipFile(file_path, "r")
        data_file = archive.open(os.path.basename(file_path)[:-4])
    else:
        data_file = open(file_path, "r")

    logger.info("Reading instances from lines in file at: %s", file_path)
    dialogs = json.load(data_file)

    for dial_name in dialogs:
        dialog = dialogs[dial_name]["log"]
        for turn in dialog:
            tokens = turn["text"].split()
            spans = turn["span_info"]
            tags = []
            domain = "None"
            intent = "None"
            for i in range(len(tokens)):
                for span in spans:
                    if i == span[3]:
                        new_domain, new_intent = span[0].split("-", 1)
                        if domain == "None":
                            domain = new_domain
                        elif domain != new_domain:
                            continue
                        if intent == "None":
                            intent = new_intent
                        elif intent != new_intent:
                            continue
                        tags.append("B-" + span[1])
                        break
                    if span[3] < i <= span[4]:
                        new_domain, new_intent = span[0].split("-", 1)
                        if domain != new_domain:
                            continue
                        if intent != new_intent:
                            continue
                        tags.append("I-" + span[1])
                        break
                else:
                    tags.append("O")

            if domain != "None":
                assert intent != "None", "intent must not be None when domain is not None"
            elif turn["dialog_act"] != {}:
                assert intent == "None", "intent must be None when domain is None"
                # Fall back to the first turn-level dialog act when no span
                # annotation carries a domain.
                di = list(turn["dialog_act"].keys())[0]
                dai = turn["dialog_act"][di][0]
                domain = di.split("-")[0]
                intent = di.split("-", 1)[-1] + "+" + dai[0] + "*" + dai[1]

            # Merge span-level acts with the remaining turn-level dialog acts.
            dialog_act = {}
            for dacts in turn["span_info"]:
                if dacts[0] not in dialog_act:
                    dialog_act[dacts[0]] = []
                dialog_act[dacts[0]].append(
                    [dacts[1], " ".join(tokens[dacts[3]:dacts[4] + 1])])
            for dacts in turn["dialog_act"]:
                for dact in turn["dialog_act"][dacts]:
                    if dacts not in dialog_act:
                        dialog_act[dacts] = turn["dialog_act"][dacts]
                        break
                    elif dact[0] not in [sv[0] for sv in dialog_act[dacts]]:
                        dialog_act[dacts].append(dact)

            tokens = [Token(token) for token in tokens]
            yield self.text_to_instance(tokens, tags, domain, intent, dialog_act)
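# Illustration (not part of the reader above): a minimal, runnable sketch of
# the BIO rule `_read` applies, using a made-up MultiWOZ-style span entry of
# the form [dialog_act, slot, value, start_token, end_token].
tokens = "i want a cheap restaurant".split()
spans = [["Restaurant-Inform", "Price", "cheap", 3, 3]]

tags = []
for i in range(len(tokens)):
    for span in spans:
        if i == span[3]:
            tags.append("B-" + span[1])  # first token of the span
            break
        if span[3] < i <= span[4]:
            tags.append("I-" + span[1])  # inside the span
            break
    else:
        tags.append("O")  # no span covers this token

print(tags)  # ['O', 'O', 'O', 'B-Price', 'O']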
def text_to_instance(  # type: ignore
    self, tokens: List[Token], verb_label: List[int], img, tags: List[str] = None
) -> Instance:
    """
    We take `pre-tokenized` input here, along with a verb label. The verb label
    should be a one-hot binary vector, the same length as the tokens, indicating
    the position of the verb to find arguments for.
    """
    metadata_dict: Dict[str, Any] = {}
    if self.bert_tokenizer is not None:
        wordpieces, offsets, start_offsets = self._wordpiece_tokenize_input(
            [t.text for t in tokens]
        )
        new_verbs = _convert_verb_indices_to_wordpiece_indices(verb_label, offsets)
        metadata_dict["offsets"] = start_offsets
        # In order to override the indexing mechanism, we need to set the `text_id`
        # attribute directly. This causes the indexing to use this id.
        text_field = TextField(
            [Token(t, text_id=self.bert_tokenizer.vocab[t]) for t in wordpieces],
            token_indexers=self._token_indexers,
        )
        verb_indicator = SequenceLabelField(new_verbs, text_field)
    else:
        text_field = TextField(tokens, token_indexers=self._token_indexers)
        verb_indicator = SequenceLabelField(verb_label, text_field)

    img_feats = img['features'].copy()
    img_boxes = img['boxes'].copy()
    obj_num = img['num_boxes']
    assert len(img_feats) == len(img_boxes) == obj_num

    # Normalize the boxes to the range [0, 1].
    img_boxes[:, (0, 2)] /= img['img_w']
    img_boxes[:, (1, 3)] /= img['img_h']
    np.testing.assert_array_less(img_boxes, 1 + 1e-5)
    np.testing.assert_array_less(-img_boxes, 0 + 1e-5)

    # Concatenate the normalized box coordinates to each object's features.
    img_concat = np.hstack((img_feats, img_boxes))
    img_field = ArrayField(img_concat)

    fields: Dict[str, Field] = {}
    fields["tokens"] = text_field
    fields["verb_indicator"] = verb_indicator
    fields["img_emb"] = img_field

    if all(x == 0 for x in verb_label):
        verb = None
        verb_index = None
    else:
        verb_index = verb_label.index(1)
        verb = tokens[verb_index].text

    metadata_dict["words"] = [x.text for x in tokens]
    metadata_dict["verb"] = verb
    metadata_dict["verb_index"] = verb_index

    if tags:
        if self.bert_tokenizer is not None:
            new_tags = _convert_tags_to_wordpiece_tags(tags, offsets)
            fields["tags"] = SequenceLabelField(new_tags, text_field)
        else:
            fields["tags"] = SequenceLabelField(tags, text_field)
        metadata_dict["gold_tags"] = tags

    fields["metadata"] = MetadataField(metadata_dict)
    return Instance(fields)
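# Toy check (illustrative only) of the box normalization performed above:
# pixel boxes in (x1, y1, x2, y2) format divided by image width/height land
# in [0, 1] before being concatenated to the object features.
import numpy as np

img = {"img_w": 200.0, "img_h": 100.0,
       "boxes": np.array([[20.0, 10.0, 180.0, 90.0]])}
boxes = img["boxes"].copy()
boxes[:, (0, 2)] /= img["img_w"]   # normalize x coordinates
boxes[:, (1, 3)] /= img["img_h"]   # normalize y coordinates
print(boxes)  # [[0.1 0.1 0.9 0.9]]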
def text_to_instance(
        self,  # type: ignore
        sentence_tokens: List[str],
        predicates: List[int],
        predicate_index: int,
        constits: List[List[str]] = None,
        parents: List[List[str]] = None) -> Instance:
    """
    We take `pre-tokenized` input here, along with a predicate label. The
    predicate label should be a one-hot binary vector, the same length as the
    tokens, indicating the position of the predicate to find arguments for.
    """
    # pylint: disable=arguments-differ
    text_field = TextField([Token(t) for t in sentence_tokens],
                           token_indexers=self._token_indexers)
    verb_field = SequenceLabelField(predicates, text_field)
    predicate_field = IndexField(predicate_index, text_field)

    # Span-based output fields. Span (j - diff, j) lives at flattened index
    # j * max_span_width + diff; spans that would start before token 0 are
    # masked out and clamped to start at token 0.
    span_starts: List[Field] = []
    span_ends: List[Field] = []
    span_mask: List[int] = [
        1 for _ in range(len(sentence_tokens) * self.max_span_width)
    ]
    span_labels: Optional[List[str]] = [] if constits is not None else None
    parent_labels: Optional[List[str]] = [] if parents is not None else None

    for j in range(len(sentence_tokens)):
        for diff in range(self.max_span_width):
            width = diff
            if j - diff < 0:
                # This is an invalid span.
                span_mask[j * self.max_span_width + diff] = 0
                width = j
            span_starts.append(IndexField(j - width, text_field))
            span_ends.append(IndexField(j, text_field))
            if constits is not None:
                span_labels.append(constits[j][diff])
            if parents is not None:
                parent_labels.append(parents[j][diff])

    start_fields = ListField(span_starts)
    end_fields = ListField(span_ends)
    span_mask_fields = SequenceLabelField(span_mask, start_fields)

    fields: Dict[str, Field] = {
        "tokens": text_field,
        "targets": verb_field,
        "span_starts": start_fields,
        "span_ends": end_fields,
        "span_mask": span_mask_fields,
        "target_index": predicate_field
    }
    if constits:
        fields['tags'] = SequenceLabelField(
            span_labels, start_fields, label_namespace=self.label_namespace)
        fields['parent_tags'] = SequenceLabelField(
            parent_labels, start_fields,
            label_namespace=self.parent_label_namespace)
    return Instance(fields)
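# Illustration (standalone, toy sizes) of the flattened span layout used
# above: span (j - diff, j) lives at index j * max_span_width + diff, and
# spans that would start before token 0 are masked and clamped to start at 0.
max_span_width = 3
num_tokens = 4
for j in range(num_tokens):
    for diff in range(max_span_width):
        width = j if j - diff < 0 else diff
        mask = 0 if j - diff < 0 else 1
        print(f"idx={j * max_span_width + diff} span=({j - width},{j}) mask={mask}")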
def _process_sentence(self,
                      sentence_tokens: List[str],
                      constits: Dict[Tuple[int, int], str],
                      verbal_predicates: List[int],
                      predicate_argument_labels: List[List[str]]) -> List[Instance]:
    """
    Parameters
    ----------
    sentence_tokens : ``List[str]``, required.
        The tokenised sentence.
    constits : ``Dict[Tuple[int, int], str]``, required.
        A mapping from constituent spans to their labels.
    verbal_predicates : ``List[int]``, required.
        The indexes of the verbal predicates in the sentence which have an
        associated annotation.
    predicate_argument_labels : ``List[List[str]]``, required.
        A list of predicate argument labels, one for each verbal predicate.
        The internal lists are of length: len(sentence).

    Returns
    -------
    A list of Instances.
    """
    default = "*"

    def get_new_label(original: str, newer: str):
        return newer if original == default else "{}|{}".format(newer, original)

    constit_matrix = [[default for _ in range(self.max_span_width)]
                      for _ in sentence_tokens]
    for span in constits:
        start, end = span
        diff = end - start
        if diff >= self.max_span_width:
            continue
        constit_matrix[end][diff] = get_new_label(constit_matrix[end][diff],
                                                  constits[span])

    tokens = [Token(t) for t in sentence_tokens]
    if not verbal_predicates:
        # Sentence contains no predicates.
        tags = ["O" for _ in sentence_tokens]
        verb_label = [0 for _ in sentence_tokens]
        srl_args = self._convert_bio_into_matrix(tags)
        dummy_verb_index = 0
        return [
            self.text_to_instance(tokens, verb_label, dummy_verb_index,
                                  constit_matrix, srl_args)
        ]

    instances = []
    for verb_index, tags in zip(verbal_predicates, predicate_argument_labels):
        verb_label = [0 for _ in sentence_tokens]
        verb_label[verb_index] = 1
        srl_args = self._convert_bio_into_matrix(tags)
        instances.append(
            self.text_to_instance(tokens, verb_label, verb_index,
                                  constit_matrix, srl_args))
        self.find_overlap(srl_args, constit_matrix)
    return instances
def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        document_id: str,
        sentence_id: int,
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
        user_threshold: Optional[float] = 0.0) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    sentences : ``List[List[str]]``, required.
        A list of lists representing the tokenised words and sentences in the
        document.
    document_id : ``str``, required.
        A string representing the document ID.
    sentence_id : ``int``, required.
        An int representing the sentence ID.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the document, represented as word spans. Each
        cluster contains some number of spans, which can be nested and overlap,
        but will never exactly match between clusters.
    user_threshold : ``Optional[float]``, optional (default = 0.0)
        Approximate fraction of gold labels to hold out as simulated user
        input, e.g. 0.5, 0.33, 0.25, 0.125.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The text of the full document.
        spans : ``ListField[SpanField]``
            A ListField containing the spans represented as ``SpanFields``
            with respect to the document text.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster which each possible span belongs to, or -1
            if it does not belong to a cluster. As these labels have variable
            length (it depends on how many spans we are considering), we
            represent this as a ``SequenceLabelField`` with respect to the
            ``spans`` ``ListField``.
    """
    flattened_sentences = [
        self._normalize_word(word) for sentence in sentences for word in sentence
    ]

    metadata: Dict[str, Any] = {
        "original_text": flattened_sentences,
        "ID": document_id + ";" + str(sentence_id)
    }
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters
        metadata["num_gold_clusters"] = len(gold_clusters)

    text_field = TextField([Token(word) for word in flattened_sentences],
                           self._token_indexers)

    user_threshold_mod = (int(1 / user_threshold)
                          if self._simulate_user_inputs and user_threshold > 0
                          else 0)
    cluster_dict = {}
    simulated_user_cluster_dict = {}

    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for i in range(len(cluster)):
                # Use modulo to spread the simulated user labels relatively
                # evenly across the length of the document (since clusters
                # are sorted).
                if user_threshold_mod == 0 or i % user_threshold_mod != user_threshold_mod - 1:
                    cluster_dict[tuple(cluster[i])] = cluster_id
                simulated_user_cluster_dict[tuple(cluster[i])] = cluster_id

    # Note: simulated_user_cluster_dict encompasses ALL gold labels, including
    # those in cluster_dict; consequently user_labels encompasses all gold labels.
    spans: List[Field] = []
    if gold_clusters is not None:
        span_labels: Optional[List[int]] = []
        user_labels: Optional[List[int]] = (
            [] if self._simulate_user_inputs and user_threshold > 0 else None)
    else:
        span_labels = user_labels = None

    # Our must-link and cannot-link constraints, derived from user labels,
    # using `gold_clusters is None` as an indicator of whether we're training.
    # TODO: confirm ^^
    must_link: Optional[List[int]] = [] if gold_clusters is not None else None
    cannot_link: Optional[List[int]] = [] if gold_clusters is not None else None

    sentence_offset = 0
    for sentence in sentences:
        for start, end in enumerate_spans(sentence,
                                          offset=sentence_offset,
                                          max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)
                if self._simulate_user_inputs and user_threshold > 0:
                    if (start, end) in simulated_user_cluster_dict:
                        user_labels.append(simulated_user_cluster_dict[(start, end)])
                    else:
                        user_labels.append(-1)
            spans.append(SpanField(start, end, text_field))
        sentence_offset += len(sentence)

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": metadata_field
    }
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)
    if user_labels is not None:
        fields["user_labels"] = SequenceLabelField(user_labels, span_field)
    return Instance(fields)
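# Standalone sketch of the modulo scheme above for holding out roughly
# `user_threshold` of the gold mentions as simulated user input (toy cluster,
# threshold = 0.5, so every second mention per cluster is held out).
user_threshold = 0.5
user_threshold_mod = int(1 / user_threshold)
cluster = [(0, 1), (4, 5), (9, 9), (12, 14)]

kept_for_model = [m for i, m in enumerate(cluster)
                  if i % user_threshold_mod != user_threshold_mod - 1]
held_out_for_user = [m for i, m in enumerate(cluster)
                     if i % user_threshold_mod == user_threshold_mod - 1]
print(kept_for_model)     # [(0, 1), (9, 9)]
print(held_out_for_user)  # [(4, 5), (12, 14)]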
def text_to_instance(
        self,  # type: ignore
        query: List[str],
        derived_cols: List[Tuple[str, str]],
        derived_tables: List[str],
        prelinked_entities: Dict[str, Dict[str, str]] = None,
        sql: List[str] = None,
        alignment: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    tokens = TextField([Token(t) for t in query], self._token_indexers)
    fields["tokens"] = tokens

    # The action-sequence and alignment fields can only be built when gold
    # SQL is available.
    if sql is not None:
        action_sequence, all_actions = self._world.get_action_sequence_and_all_actions(
            query=sql,
            derived_cols=derived_cols,
            derived_tables=derived_tables,
            prelinked_entities=prelinked_entities)
        if action_sequence is None:
            return None
        if alignment is not None:
            # Modify the alignment according to the action sequence.
            alignment = AttnSupGrammarBasedWorld.modify_alignment(
                action_sequence=action_sequence, alignment=alignment)
        else:
            # A list of NO_ALIGN is equivalent to masking the entire alignment.
            alignment = ['NO_ALIGN'] * len(action_sequence)

        index_fields: List[Field] = []
        production_rule_fields: List[Field] = []
        for production_rule in all_actions:
            nonterminal, _ = production_rule.split(' ->')
            field = ProductionRuleField(production_rule,
                                        self._world.is_global_rule(nonterminal),
                                        nonterminal=nonterminal)
            production_rule_fields.append(field)

        valid_actions_field = ListField(production_rule_fields)
        fields["valid_actions"] = valid_actions_field

        action_map = {
            action.rule: i  # type: ignore
            for i, action in enumerate(valid_actions_field.field_list)
        }
        for production_rule in action_sequence:
            index_fields.append(
                IndexField(action_map[production_rule], valid_actions_field))
        if not action_sequence:
            index_fields = [IndexField(-1, valid_actions_field)]

        action_sequence_field = ListField(index_fields)
        fields["action_sequence"] = action_sequence_field

        alignment_index_fields: List[IndexField] = []
        tmp_tokens_as_strings = [t.text for t in tokens]
        for aligned_token in alignment:
            try:
                aligned_token_index = tmp_tokens_as_strings.index(aligned_token)
                alignment_index_fields.append(IndexField(aligned_token_index, tokens))
            except ValueError:
                # A special "no alignment" index.
                alignment_index_fields.append(IndexField(-1, tokens.empty_field()))
        fields["alignment_sequence"] = ListField(alignment_index_fields)

    return Instance(fields)
def _read_tokens_from_json_list(json_list) -> List[Token]:
    return [
        Token(text=json_obj['text'], lemma=json_obj['lemma'])
        for json_obj in json_list
    ]
def text_to_instance(
        self,  # type: ignore
        rule_text: str,
        question: str,
        scenario: str,
        history: List[Dict[str, str]],
        utterance_id: str = None,
        tree_id: str = None,
        source_url: str = None,
        answer: str = None,
        evidence: List[Dict[str, str]] = None) -> Optional[Instance]:
    """
    Turn the raw rule text, question, scenario and history into an ``Instance``.

    Returns
    -------
    Optional[Instance]
        See the class docstring for a description of the fields that the
        instance will contain.
    """
    # pylint: disable=arguments-differ
    # For the CopyNet model.
    source_string = rule_text + ' [SEP]'
    target_string = answer

    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    # No END_SYMBOL is appended: ' [SEP]' acts as the end symbol.
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1],
                                                    self._target_namespace)
    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    # For the BERT model.
    passage_text1 = rule_text + ' [SEP]'
    question_text1 = question
    passage_text2 = rule_text + ' [SEP]'
    question_text2 = scenario
    bert_input1 = passage_text1 + ' ' + question_text1
    bert_input2 = passage_text2 + ' ' + question_text2
    bert_input_tokens1 = self.get_tokens_with_history_encoding(bert_input1, history)
    bert_input_tokens2 = self._bert_tokenizer.tokenize(bert_input2)
    bert_input_tokens1.insert(0, Token(START_SYMBOL))
    bert_input_tokens2.insert(0, Token(START_SYMBOL))
    fields_dict['bert_input1'] = TextField(bert_input_tokens1,
                                           self._bert_token_indexers)
    fields_dict['bert_input2'] = TextField(bert_input_tokens2,
                                           self._bert_token_indexers)
    meta_fields['passage_tokens1'] = self._bert_tokenizer.tokenize(passage_text1)
    meta_fields['passage_tokens2'] = self._bert_tokenizer.tokenize(passage_text2)

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field
        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]

        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] +
                                                          tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))

        action = 'More' if answer not in ['Yes', 'No', 'Irrelevant'] else answer
        fields_dict['label'] = LabelField(action)
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    meta_fields['rule_text'] = rule_text
    meta_fields['question'] = question
    meta_fields['scenario'] = scenario
    meta_fields['history'] = history
    fields_dict["metadata"] = MetadataField(meta_fields)
    return Instance(fields_dict)
def text_to_instance(
    self,  # type: ignore
    tokens: List[str],
    pos_tags: List[str] = None,
    gold_tree: Tree = None,
) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in
    this class.

    # Parameters

    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    pos_tags : ``List[str]``, optional, (default = None).
        The POS tags for the words in the sentence.
    gold_tree : ``Tree``, optional (default = None).
        The gold parse tree to create span labels from.

    # Returns

    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        pos_tags : ``SequenceLabelField``
            The POS tags of the words in the sentence.
            Only returned if ``use_pos_tags`` is ``True``.
        spans : ``ListField[SpanField]``
            A ListField containing all possible subspans of the sentence.
        span_labels : ``SequenceLabelField``, optional.
            The constituency tags for each of the possible spans, with respect
            to a gold parse tree. If a span is not contained within the tree,
            it will have a ``NO-LABEL`` label.
        gold_tree : ``MetadataField(Tree)``
            The gold NLTK parse tree for use in evaluation.
    """
    if self._convert_parentheses:
        tokens = [PTB_PARENTHESES.get(token, token) for token in tokens]
    text_field = TextField([Token(x) for x in tokens],
                           token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    pos_namespace = self._label_namespace_prefix + self._pos_label_namespace
    if self._use_pos_tags and pos_tags is not None:
        pos_tag_field = SequenceLabelField(pos_tags, text_field,
                                           label_namespace=pos_namespace)
        fields["pos_tags"] = pos_tag_field
    elif self._use_pos_tags:
        raise ConfigurationError(
            "use_pos_tags was set to True but no gold pos"
            " tags were passed to the dataset reader."
        )

    spans: List[Field] = []
    gold_labels = []
    if gold_tree is not None:
        gold_spans: Dict[Tuple[int, int], str] = {}
        self._get_gold_spans(gold_tree, 0, gold_spans)
    else:
        gold_spans = None
    for start, end in enumerate_spans(tokens):
        spans.append(SpanField(start, end, text_field))
        if gold_spans is not None:
            gold_labels.append(gold_spans.get((start, end), "NO-LABEL"))

    metadata = {"tokens": tokens}
    if gold_tree:
        metadata["gold_tree"] = gold_tree
    if self._use_pos_tags:
        metadata["pos_tags"] = pos_tags
    fields["metadata"] = MetadataField(metadata)

    span_list_field: ListField = ListField(spans)
    fields["spans"] = span_list_field
    if gold_tree is not None:
        fields["span_labels"] = SequenceLabelField(
            gold_labels,
            span_list_field,
            label_namespace=self._label_namespace_prefix + "labels",
        )
    return Instance(fields)
def text_to_instance(self,
                     source_string: str,
                     target_string: str = None) -> Instance:  # type: ignore
    """
    Turn raw source string and target string into an `Instance`.

    Parameters
    ----------
    source_string : ``str``, required
    target_string : ``str``, optional (default = None)

    Returns
    -------
    Instance
        An Instance containing at least the following fields:

        - `source_tokens`: a `TextField` containing the tokenized source sentence,
          including the `START_SYMBOL` and `END_SYMBOL`. This will result in
          a tensor of shape `(batch_size, source_length)`.

        - `source_token_ids`: an `ArrayField` of size `(batch_size, trimmed_source_length)`
          that contains an ID for each token in the source sentence. Tokens that
          match at the lowercase level will share the same ID. If `target_tokens`
          is passed as well, these IDs will also correspond to the `target_token_ids`
          field, i.e. any tokens that match at the lowercase level in both
          the source and target sentences will share the same ID. Note that these
          IDs have no correlation with the token indices from the corresponding
          vocabulary namespaces.

        - `source_to_target`: a `CopyMapField` that keeps track of the index
          of the target token that matches each token in the source sentence.
          When there is no matching target token, the OOV index is used.
          This will result in a tensor of shape `(batch_size, trimmed_source_length)`.

        - `metadata`: a `MetadataField` which contains the source tokens and
          potentially target tokens as lists of strings.

        When `target_string` is passed, the instance will also contain these fields:

        - `target_tokens`: a `TextField` containing the tokenized target sentence,
          including the `START_SYMBOL` and `END_SYMBOL`. This will result in
          a tensor of shape `(batch_size, target_length)`.

        - `target_token_ids`: an `ArrayField` of size `(batch_size, target_length)`.
          This is calculated in the same way as `source_token_ids`.

    Notes
    -----
    By `source_length` we are referring to the number of tokens in the source
    sentence including the `START_SYMBOL` and `END_SYMBOL`, while
    `trimmed_source_length` refers to the number of tokens in the source
    sentence *excluding* the `START_SYMBOL` and `END_SYMBOL`, i.e.
    `trimmed_source_length = source_length - 2`.

    On the other hand, `target_length` is the number of tokens in the target
    sentence *including* the `START_SYMBOL` and `END_SYMBOL`.

    In the context where there is a `batch_size` dimension, the above refer to
    the maximum of their individual values across the batch.
    """
    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = CopyMapField(tokenized_source[1:-1],
                                          self._target_namespace)

    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field
        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]

        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] +
                                                          tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    fields_dict["metadata"] = MetadataField(meta_fields)
    return Instance(fields_dict)
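# The `_tokens_to_ids` helper is assumed above but not shown. This is a
# plausible, self-contained sketch consistent with the docstring (tokens that
# match at the lowercase level share an id); the reader's actual helper may
# differ.
from typing import List


class _Token:  # stand-in for allennlp.data.tokenizers.Token
    def __init__(self, text: str) -> None:
        self.text = text


def _tokens_to_ids(tokens: List[_Token]) -> List[int]:
    ids: dict = {}
    out: List[int] = []
    for token in tokens:
        # setdefault assigns the next fresh id the first time a word is seen.
        out.append(ids.setdefault(token.text.lower(), len(ids)))
    return out


print(_tokens_to_ids([_Token("The"), _Token("cat"), _Token("the")]))  # [0, 1, 0]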
def text_to_instance(self, text: str,
                     targets: Optional[List[str]] = None,
                     target_sentiments: Optional[List[Union[str, int]]] = None,
                     spans: Optional[List[List[int]]] = None,
                     categories: Optional[List[str]] = None,
                     category_sentiments: Optional[List[Union[str, int]]] = None,
                     **kwargs) -> Instance:
    '''
    The original text, text tokens, as well as the targets and target tokens,
    are stored in the MetadataField.

    :NOTE: At least targets and/or categories must be present.
    :NOTE: The left and right contexts returned in the instance are a List of
           a List of tokens, one list per target.

    :param text: The text that contains the target(s) and/or categories.
    :param targets: The targets that are within the text.
    :param target_sentiments: The sentiment of the targets. To be used when
                              training the classifier.
    :param spans: The spans that represent the character offsets for each of
                  the targets given in the targets list.
    :param categories: The categories that are within the text.
    :param category_sentiments: The sentiment of the categories.
    :returns: An Instance object with all of the above encoded for a
              PyTorch model.
    :raises ValueError: If targets and categories are both None.
    :raises ValueError: If `self._target_sequences` is True and the passed
                        `spans` argument is None.
    :raises ValueError: If `self._left_right_contexts` is True and the passed
                        `spans` argument is None.
    '''
    if targets is None and categories is None:
        raise ValueError('Either targets or categories must be given if you '
                         'want to predict the sentiment of a target '
                         'or a category')
    instance_fields: Dict[str, Field] = {}

    # Metadata field
    metadata_dict = {}

    if targets is not None:
        # TODO: take into account the case where the positions are True but
        # not the target sequences.
        if self._target_sequences or self._position_embeddings or self._position_weights:
            if spans is None:
                raise ValueError('To create target sequences requires `spans`')
            spans = [Span(span[0], span[1]) for span in spans]
            target_text_object = TargetText(text=text, spans=spans,
                                            targets=targets, text_id='anything')
            target_text_object.force_targets()
            text = target_text_object['text']
            allen_tokens = self._tokenizer.tokenize(text)
            tokens = [x.text for x in allen_tokens]
            target_text_object['tokenized_text'] = tokens
            target_text_object.sequence_labels(per_target=True)
            target_sequences = target_text_object['sequence_labels']
            # Add the target sequences to the instances.
            in_label = {'B', 'I'}
            number_targets = len(targets)
            all_target_tokens: List[List[Token]] = [[] for _ in range(number_targets)]
            target_sequence_fields = []
            target_indicators: List[List[int]] = []
            for target_index in range(number_targets):
                one_values = []
                target_ones = [0] * len(allen_tokens)
                for token_index, token in enumerate(allen_tokens):
                    target_sequence_value = target_sequences[target_index][token_index]
                    in_target = 1 if target_sequence_value in in_label else 0
                    if in_target:
                        all_target_tokens[target_index].append(allen_tokens[token_index])
                        one_value_list = [0] * len(allen_tokens)
                        one_value_list[token_index] = 1
                        one_values.append(one_value_list)
                        target_ones[token_index] = 1
                one_values = np.array(one_values)
                target_sequence_fields.append(ArrayField(one_values, dtype=np.int32))
                target_indicators.append(target_ones)
            if self._position_embeddings:
                target_distances = self._target_indicators_to_distances(
                    target_indicators, max_distance=self._max_position_distance,
                    as_string=True)
                target_text_distances = []
                for target_distance in target_distances:
                    token_distances = [Token(distance) for distance in target_distance]
                    token_distances = TextField(token_distances,
                                                self._position_indexers)
                    target_text_distances.append(token_distances)
                instance_fields['position_embeddings'] = ListField(target_text_distances)
            if self._position_weights:
                target_distances = self._target_indicators_to_distances(
                    target_indicators, max_distance=self._max_position_distance,
                    as_string=False)
                target_distances = np.array(target_distances)
                instance_fields['position_weights'] = ArrayField(target_distances,
                                                                 dtype=np.int32)
            if self._target_sequences:
                instance_fields['target_sequences'] = ListField(target_sequence_fields)
            instance_fields['tokens'] = TextField(allen_tokens, self._token_indexers)
            metadata_dict['text words'] = tokens
            metadata_dict['text'] = text
            # Update the targets, as they could have changed due to the
            # force_targets call.
            targets = target_text_object['targets']
        else:
            all_target_tokens = [self._tokenizer.tokenize(target)
                                 for target in targets]
        target_fields = [TextField(target_tokens, self._token_indexers)
                         for target_tokens in all_target_tokens]
        target_fields = ListField(target_fields)
        instance_fields['targets'] = target_fields
        # Add the targets and the tokenised targets to the metadata.
        metadata_dict['targets'] = [target for target in targets]
        metadata_dict['target words'] = [[x.text for x in target_tokens]
                                         for target_tokens in all_target_tokens]
        # Target sentiment if it exists.
        if target_sentiments is not None:
            target_sentiments_field = SequenceLabelField(
                target_sentiments, target_fields,
                label_namespace='target-sentiment-labels')
            instance_fields['target_sentiments'] = target_sentiments_field

    if categories is not None and self._use_categories:
        category_fields = TextField([Token(category) for category in categories],
                                    self._token_indexers)
        instance_fields['categories'] = category_fields
        # Category sentiment if it exists.
        if category_sentiments is not None:
            category_sentiments_field = SequenceLabelField(
                category_sentiments, category_fields,
                label_namespace='category-sentiment-labels')
            instance_fields['category_sentiments'] = category_sentiments_field
        # Add the categories to the metadata.
        metadata_dict['categories'] = [category for category in categories]

    if 'tokens' not in instance_fields:
        tokens = self._tokenizer.tokenize(text)
        instance_fields['tokens'] = TextField(tokens, self._token_indexers)
        metadata_dict['text'] = text
        metadata_dict['text words'] = [x.text for x in tokens]

    # If required, process the left and right contexts.
    left_contexts = None
    right_contexts = None
    if self._left_right_contexts:
        if spans is None:
            raise ValueError('To create left, right, target contexts requires'
                             ' the `spans` of the targets which is None')
        spans = [Span(span[0], span[1]) for span in spans]
        target_text_object = TargetText(text=text, spans=spans,
                                        targets=targets, text_id='anything')
        # Left, right, and target contexts for each target in the text.
        left_right_targets = target_text_object.left_right_target_contexts(
            incl_target=self._incl_target)
        left_contexts: List[str] = []
        right_contexts: List[str] = []
        for left_right_target in left_right_targets:
            left, right, _ = left_right_target
            left_contexts.append(left)
            if self._reverse_right_context:
                right_tokens = self._tokenizer.tokenize(right)
                reversed_right_tokens = []
                for token in reversed(right_tokens):
                    reversed_right_tokens.append(token.text)
                right = ' '.join(reversed_right_tokens)
            right_contexts.append(right)

    if left_contexts is not None:
        left_field = self._add_context_field(left_contexts)
        instance_fields["left_contexts"] = left_field
    if right_contexts is not None:
        right_field = self._add_context_field(right_contexts)
        instance_fields["right_contexts"] = right_field

    instance_fields["metadata"] = MetadataField(metadata_dict)
    return Instance(instance_fields)
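# Hedged sketch of what `_target_indicators_to_distances` plausibly computes
# for the position fields above: each token's distance to the nearest target
# token, capped at max_distance. The real helper may differ in details
# (e.g. string formatting when as_string=True).
def indicators_to_distances(indicator, max_distance):
    target_positions = [i for i, v in enumerate(indicator) if v == 1]
    return [min(min(abs(i - p) for p in target_positions), max_distance)
            for i in range(len(indicator))]


print(indicators_to_distances([0, 0, 1, 1, 0, 0], max_distance=3))
# [2, 1, 0, 0, 1, 2]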
def text_to_instance(self, data: Dict[str, Any]) -> Instance:
    # pylint: disable=arguments-differ
    # Wrap the raw tokens in a TextField.
    tokens = [Token(x) for x in data['tokens']]
    fields = {'tokens': TextField(tokens, self._token_indexers)}
    return Instance(fields)
def text_to_instance(self, data: Dict[str, Any]) -> Instance:
    # pylint: disable=arguments-differ
    # Flatten the tokens, add start/end symbols, and shift by one position to
    # obtain the source and target sequences for language modelling.
    tokens = _flatten(data['tokens'])
    tokens = ['@@START@@', *tokens, '@@END@@']
    source = [Token(x) for x in tokens[:-1]]
    target = [Token(x) for x in tokens[1:]]
    fields = {
        'source': TextField(source, self._token_indexers),
        'target': TextField(target, self._token_indexers)
    }

    # Process annotations
    if 'annotations' in data:
        # We maintain a "shortlist" of observed entities, that is used for
        # baseline models that only select entities from the set that appear
        # in the document (as opposed to the set of all possible entities).
        shortlist = [DEFAULT_PADDING_TOKEN]
        reverse_shortlist = {DEFAULT_PADDING_TOKEN: 0}
        entity_ids = [DEFAULT_PADDING_TOKEN] * len(target)
        shortlist_inds = np.zeros(shape=(len(target),))
        alias_copy_inds = np.zeros(shape=(len(target),))
        alias_tokens = [TextField([], self._token_indexers)] * len(target)
        alias_inds: List[List[int]] = [[]] * len(target)
        max_len = 0

        for annotation in data['annotations']:
            # Obtain the entity identifier for the annotated span.
            entity_id = annotation['id']
            alias = annotation['alias']
            alias_map = {token: i + 1 for i, token in enumerate(set(alias))}

            # If necessary, update the shortlist. Obtain the index of the
            # entity identifier in the shortlist.
            if entity_id not in reverse_shortlist:
                reverse_shortlist[entity_id] = len(reverse_shortlist)
                shortlist.append(entity_id)
            shortlist_ind = reverse_shortlist[entity_id]

            # Update the outputs. Note: +1 offset to account for start token.
            for i in range(*annotation['span']):
                if tokens[i + 1] not in alias_map:
                    continue
                entity_ids[i] = entity_id
                shortlist_inds[i] = shortlist_ind
                alias_copy_inds[i] = alias_map[tokens[i + 1]]
                alias_inds[i] = [alias_map[token] for token in alias]
                alias_tokens[i] = TextField([Token(x) for x in alias],
                                            self._token_indexers)
                max_len = max(max_len, len(alias))

        # Convert alias_inds into a padded numpy array.
        alias_ind_array = np.zeros((len(target), max_len))
        for i, arr in enumerate(alias_inds):
            for j, ind in enumerate(arr):
                alias_ind_array[i, j] = ind

        fields['entity_ids'] = TextField([Token(x) for x in entity_ids],
                                         token_indexers=self._entity_indexers)
        fields['alias_copy_inds'] = SequentialArrayField(alias_copy_inds,
                                                         dtype=np.int64)
        fields['shortlist'] = TextField([Token(x) for x in shortlist],
                                        token_indexers=self._entity_indexers)
        fields['shortlist_inds'] = SequentialArrayField(shortlist_inds,
                                                        dtype=np.int64)
        fields['alias_tokens'] = ListField(alias_tokens)
        fields['alias_inds'] = SequentialArrayField(alias_ind_array,
                                                    dtype=np.int64)

    return Instance(fields)
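# The `_flatten` helper used above is not shown; here is a minimal sketch
# consistent with its use on nested token lists (an assumption, not the
# repository's own implementation).
from typing import Any, Iterable, List


def _flatten(nested: Iterable[Any]) -> List[Any]:
    flat: List[Any] = []
    for item in nested:
        if isinstance(item, (list, tuple)):
            flat.extend(_flatten(item))  # recurse into nested lists
        else:
            flat.append(item)
    return flat


print(_flatten([["a", "b"], ["c"]]))  # ['a', 'b', 'c']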
def _tokenize(iterable: Iterable[str]):
    return [Token(x) for x in iterable]
def _read(self, file_path: str) -> Iterator[Instance]:
    with open(file_path) as f:
        for line in f:
            pairs = line.strip().split()
            sentence, tags = zip(*(pair.split("###") for pair in pairs))
            yield self.text_to_instance([Token(word) for word in sentence], tags)
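# Example (made-up line) of the one-sentence-per-line format `_read` expects:
# each whitespace-separated pair is `word###tag`.
line = "The###DET dog###NOUN barks###VERB"
pairs = line.strip().split()
sentence, tags = zip(*(pair.split("###") for pair in pairs))
print(sentence)  # ('The', 'dog', 'barks')
print(tags)      # ('DET', 'NOUN', 'VERB')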
def tokenizer(self, text):
    # Tokenize with Juman++ and keep at most `max_tokens` surface forms.
    return [
        Token(mrph.midasi) for mrph in self.jumanpp.analysis(text).mrph_list()
    ][:self.max_tokens]
def prepare_text(text, max_tokens):
    tokens = self._tokenizer.tokenize(text)[:max_tokens]
    tokens.insert(0, Token(START_SYMBOL))
    tokens.append(Token(END_SYMBOL))
    return tokens
def text_to_instance(
    self,  # type: ignore
    tokens: List[str],
    ccg_categories: List[str] = None,
    original_pos_tags: List[str] = None,
    modified_pos_tags: List[str] = None,
    predicate_arg_categories: List[str] = None,
) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in
    this class.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    ccg_categories : ``List[str]``, optional, (default = None).
        The CCG categories for the words in the sentence (e.g. N/N).
    original_pos_tags : ``List[str]``, optional, (default = None).
        The tag assigned to the word in the Penn Treebank.
    modified_pos_tags : ``List[str]``, optional, (default = None).
        The POS tag might have changed during the translation to CCG.
    predicate_arg_categories : ``List[str]``, optional, (default = None).
        Encodes the word-word dependencies in the underlying
        predicate-argument structure.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        tags : ``SequenceLabelField``
            The tags corresponding to the ``tag_label`` constructor argument.
        feature_label_tags : ``SequenceLabelField``
            Tags corresponding to each feature_label (if any) specified in the
            ``feature_labels`` constructor argument.
    """
    text_field = TextField([Token(x) for x in tokens],
                           token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    # Add "feature labels" to the instance.
    if "ccg" in self.feature_labels:
        if ccg_categories is None:
            raise ConfigurationError(
                "Dataset reader was specified to use CCG categories as "
                "features. Pass them to text_to_instance.")
        fields["ccg_tags"] = SequenceLabelField(ccg_categories, text_field,
                                                "ccg_tags")
    if "original_pos" in self.feature_labels:
        if original_pos_tags is None:
            raise ConfigurationError(
                "Dataset reader was specified to use original POS tags as "
                "features. Pass them to text_to_instance.")
        fields["original_pos_tags"] = SequenceLabelField(
            original_pos_tags, text_field, "original_pos_tags")
    if "modified_pos" in self.feature_labels:
        if modified_pos_tags is None:
            raise ConfigurationError(
                "Dataset reader was specified to use modified POS tags as "
                "features. Pass them to text_to_instance.")
        fields["modified_pos_tags"] = SequenceLabelField(
            modified_pos_tags, text_field, "modified_pos_tags")
    if "predicate_arg" in self.feature_labels:
        if predicate_arg_categories is None:
            raise ConfigurationError(
                "Dataset reader was specified to use predicate arg tags as "
                "features. Pass them to text_to_instance.")
        fields["predicate_arg_tags"] = SequenceLabelField(
            predicate_arg_categories, text_field, "predicate_arg_tags")

    # Add the "tag label" to the instance.
    if self.tag_label == "ccg" and ccg_categories is not None:
        fields["tags"] = SequenceLabelField(ccg_categories, text_field,
                                            self.label_namespace)
    elif self.tag_label == "original_pos" and original_pos_tags is not None:
        fields["tags"] = SequenceLabelField(original_pos_tags, text_field,
                                            self.label_namespace)
    elif self.tag_label == "modified_pos" and modified_pos_tags is not None:
        fields["tags"] = SequenceLabelField(modified_pos_tags, text_field,
                                            self.label_namespace)
    elif self.tag_label == "predicate_arg" and predicate_arg_categories is not None:
        fields["tags"] = SequenceLabelField(predicate_arg_categories, text_field,
                                            self.label_namespace)
    return Instance(fields)
def text_to_instance(self, rule_text, question, scenario, history,
                     answer=None, evidence=None) -> Optional[Instance]:  # type: ignore
    """
    Turn the raw rule text, question, scenario and history into an ``Instance``.

    Returns
    -------
    Optional[Instance]
        See the above for a description of the fields that the instance will
        contain. Returns None when the answer is a classification label
        ('Yes', 'No' or 'Irrelevant') or no span can be predicted.
    """
    # pylint: disable=arguments-differ
    if answer and answer in ['Yes', 'No', 'Irrelevant']:
        return None
    target_string = answer

    if self.train_using_gold and answer is not None:
        # i.e. during training and validation.
        predicted_label = answer if answer in ['Yes', 'No', 'Irrelevant'] else 'More'
        predicted_span_ixs = self.dataset_reader.find_lcs(
            rule_text, answer, self._source_tokenizer.tokenize)
        if predicted_span_ixs is None:
            return None
        rule_offsets = [(token.idx, token.idx + len(token.text))
                        for token in self._source_tokenizer.tokenize(rule_text)]
        predicted_span = rule_text[rule_offsets[predicted_span_ixs[0]][0]:
                                   rule_offsets[predicted_span_ixs[1]][1]]
    else:
        predicted_span, predicted_label = self.get_prediction(
            rule_text, question, scenario, history)

    if self.add_rule:
        if self.embed_span:
            source_string = self.get_embedded_span(rule_text, predicted_span)
        else:
            source_string = rule_text + ' @pss@ ' + predicted_span + ' @pse@'
    else:
        source_string = predicted_span
    if self.add_question:
        source_string += ' @qs@ ' + question + ' @qe'
    if self.add_followup_ques:
        for follow_up_qna in history:
            source_string += ' @fs@ ' + follow_up_qna['follow_up_question'] + ' @fe'

    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1],
                                                    self._target_namespace)
    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field
        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]

        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] +
                                                          tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    meta_fields['label'] = predicted_label
    fields_dict["metadata"] = MetadataField(meta_fields)
    return Instance(fields_dict)
def attack_from_json(self,
                     inputs: JsonDict = None,
                     input_field_to_attack: str = 'tokens',
                     grad_input_field: str = 'grad_input_1',
                     ignore_tokens: List[str] = None) -> JsonDict:
    """
    Replaces one token at a time from the input until the model's prediction
    changes. ``input_field_to_attack`` is for example ``tokens``; it says what
    the input field is called. ``grad_input_field`` is for example
    ``grad_input_1``, which is a key into a grads dictionary.

    The method computes the gradient w.r.t. the tokens, finds the token with
    the maximum gradient (by L2 norm), and replaces it with another token based
    on the first-order Taylor approximation of the loss. This process is
    iteratively repeated until the prediction changes. Once a token is
    replaced, it is not flipped again.
    """
    if self.token_embedding is None:
        self.initialize()
    ignore_tokens = (["@@NULL@@", '.', ',', ';', '!', '?']
                     if ignore_tokens is None else ignore_tokens)

    original_instances = self.predictor.json_to_labeled_instances(inputs)
    original_text_field: TextField = original_instances[0][input_field_to_attack]  # type: ignore
    original_tokens = deepcopy(original_text_field.tokens)

    final_tokens = []
    for current_instance in original_instances:
        # Gets a list of the fields that we want to check to see if they change.
        fields_to_compare = utils.get_fields_to_compare(inputs, current_instance,
                                                        input_field_to_attack)
        current_text_field: TextField = current_instance[input_field_to_attack]  # type: ignore
        current_tokens = current_text_field.tokens
        grads, outputs = self.predictor.get_gradients([current_instance])

        # Ignore any token in the ignore_tokens list by marking it as already flipped.
        flipped: List[int] = []
        for index, token in enumerate(current_tokens):
            if token.text in ignore_tokens:
                flipped.append(index)

        while True:
            # Compute the L2 norm of all grads.
            grad = grads[grad_input_field]
            grads_magnitude = [g.dot(g) for g in grad]

            # Only flip a token once.
            for index in flipped:
                grads_magnitude[index] = -1

            # We flip the token with the highest gradient norm.
            index_of_token_to_flip = numpy.argmax(grads_magnitude)
            if grads_magnitude[index_of_token_to_flip] == -1:
                # We have already flipped all the tokens once.
                break
            flipped.append(index_of_token_to_flip)

            # Get the new token using the Taylor approximation.
            input_tokens = current_text_field._indexed_tokens["tokens"]
            original_id_of_token_to_flip = input_tokens[index_of_token_to_flip]
            new_id_of_flipped_token = _first_order_taylor(
                grad[index_of_token_to_flip],
                self.token_embedding.weight,  # type: ignore
                original_id_of_token_to_flip)

            # Flip the token.
            new_token = Token(
                self.vocab._index_to_token["tokens"][new_id_of_flipped_token])  # type: ignore
            current_text_field.tokens[index_of_token_to_flip] = new_token
            current_instance.indexed = False

            # Get model predictions on current_instance, and then label the instances.
            grads, outputs = self.predictor.get_gradients([current_instance])  # predictions
            for key, output in outputs.items():
                if isinstance(output, torch.Tensor):
                    outputs[key] = output.detach().cpu().numpy().squeeze()
                elif isinstance(output, list):
                    outputs[key] = output[0]

            # Add labels to current_instances; if the prediction has changed, stop.
            current_instance_labeled = self.predictor.predictions_to_labeled_instances(
                current_instance, outputs)[0]
            if any(current_instance_labeled[field] != fields_to_compare[field]
                   for field in fields_to_compare):
                break

        final_tokens.append(current_tokens)
    return sanitize({"final": final_tokens,
                     "original": original_tokens,
                     "outputs": outputs})
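# Standalone sketch of the first-order Taylor step behind `_first_order_taylor`
# above: choose the replacement whose embedding maximally increases the loss,
# i.e. maximizes grad . (e_new - e_old). Toy 2-d embeddings; the real helper
# operates on the model's full embedding matrix.
import numpy

embedding_matrix = numpy.array([[0.0, 1.0],
                                [1.0, 0.0],
                                [0.5, 0.5]])
grad = numpy.array([1.0, -1.0])  # d(loss)/d(embedding) at the flipped position
old_id = 0
scores = (embedding_matrix - embedding_matrix[old_id]) @ grad
new_id = int(numpy.argmax(scores))
print(new_id)  # 1 -- the token whose embedding most increases the loss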
def text_to_instance(self,  # type: ignore
                     document_id: str,
                     part_number: str,
                     sentences: List[List[str]],
                     gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    document_id : ``str``, required.
        The id of the document.
    part_number : ``str``, required.
        The part number of the document.
    sentences : ``List[List[str]]``, required.
        A list of lists representing the tokenised words and sentences in the
        document.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the document, represented as word spans. Each
        cluster contains some number of spans, which can be nested and overlap,
        but will never exactly match between clusters.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The text of the full document.
        spans : ``ListField[SpanField]``
            A ListField containing the spans represented as ``SpanFields``
            with respect to the document text.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster which each possible span belongs to, or -1
            if it does not belong to a cluster. As these labels have variable
            length (it depends on how many spans we are considering), we
            represent this as a ``SequenceLabelField`` with respect to the
            ``spans`` ``ListField``.
    """
    flattened_sentences = [self._normalize_word(word)
                           for sentence in sentences
                           for word in sentence]

    metadata: Dict[str, Any] = {
        "document_id": document_id,
        "part_number": part_number,
        "original_text": flattened_sentences,
    }
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters

    text_field = TextField([Token(word) for word in flattened_sentences],
                           self._token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

    sentence_offset = 0
    for sentence in sentences:
        for start, end in enumerate_spans(sentence,
                                          offset=sentence_offset,
                                          max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)
            spans.append(SpanField(start, end, text_field))
        sentence_offset += len(sentence)

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {"text": text_field,
                                "spans": span_field,
                                "metadata": metadata_field}
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
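# Pure-Python sketch that mirrors (rather than imports) allennlp's
# `enumerate_spans`, showing how `offset` yields document-level indices
# across sentences; max_span_width = 2 here.
def enumerate_spans(sentence, offset=0, max_span_width=1):
    for start in range(len(sentence)):
        for end in range(start, min(start + max_span_width, len(sentence))):
            yield offset + start, offset + end


doc = [["A", "cat"], ["It", "sat", "down"]]
sentence_offset = 0
for sentence in doc:
    print(list(enumerate_spans(sentence, offset=sentence_offset, max_span_width=2)))
    sentence_offset += len(sentence)
# [(0, 0), (0, 1), (1, 1)]
# [(2, 2), (2, 3), (3, 3), (3, 4), (4, 4)]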
def tokenize(self, text: str) -> List[Token]:
    return [Token(token) for token in self.tokenizer.tokenize(text)]
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    if file_path.endswith("zip"):
        archive = zipfile.ZipFile(file_path, "r")
        data_file = archive.open(os.path.basename(file_path)[:-4])
    else:
        data_file = open(file_path, "r")

    logger.info("Reading instances from lines in file at: %s", file_path)
    dialogs = json.load(data_file)

    for dial_name in dialogs:
        dialog = dialogs[dial_name]["log"]
        context_tokens_list = []
        for i, turn in enumerate(dialog):
            if self._agent and self._agent == "user" and i % 2 != 1:
                continue
            if self._agent and self._agent == "system" and i % 2 != 0:
                continue

            tokens = turn["text"].split()

            # Collect span-level dialog acts: {act: [[slot, value], ...]}.
            dialog_act = {}
            for dacts in turn["span_info"]:
                if dacts[0] not in dialog_act:
                    dialog_act[dacts[0]] = []
                dialog_act[dacts[0]].append(
                    [dacts[1], " ".join(tokens[dacts[3]:dacts[4] + 1])])

            # BIO tags from the span annotations.
            spans = turn["span_info"]
            tags = []
            for j in range(len(tokens)):
                for span in spans:
                    if j == span[3]:
                        tags.append("B-" + span[0] + "+" + span[1])
                        break
                    if span[3] < j <= span[4]:
                        tags.append("I-" + span[0] + "+" + span[1])
                        break
                else:
                    tags.append("O")

            # Turn-level intents for acts that have no token span.
            intents = []
            for dacts in turn["dialog_act"]:
                for dact in turn["dialog_act"][dacts]:
                    if dacts not in dialog_act or dact[0] not in [
                            sv[0] for sv in dialog_act[dacts]
                    ]:
                        if dact[1] in [
                                "none", "?", "yes", "no", "do nt care", "do n't care"
                        ]:
                            intents.append(dacts + "+" + dact[0] + "*" + dact[1])

            # Merge the remaining turn-level acts into dialog_act.
            for dacts in turn["dialog_act"]:
                for dact in turn["dialog_act"][dacts]:
                    if dacts not in dialog_act:
                        dialog_act[dacts] = turn["dialog_act"][dacts]
                        break
                    elif dact[0] not in [sv[0] for sv in dialog_act[dacts]]:
                        dialog_act[dacts].append(dact)

            # Sample how many previous turns to include as context.
            num_context = (random.randint(0, self._context_size)
                           if self._random_context_size else self._context_size)
            if len(context_tokens_list) > 0 and num_context > 0:
                wrapped_context_tokens = [
                    Token(token)
                    for context_tokens in context_tokens_list[-num_context:]
                    for token in context_tokens
                ]
            else:
                wrapped_context_tokens = [Token("SENT_END")]
            wrapped_tokens = [Token(token) for token in tokens]
            context_tokens_list.append(tokens + ["SENT_END"])

            yield self.text_to_instance(wrapped_context_tokens, wrapped_tokens,
                                        tags, intents, dialog_act)
    def text_to_instance(self,
                         question_text: str,
                         passage_text: str,
                         passage_tokens: List[Token],
                         passage_spans: List[Tuple[int, int]],
                         numbers_in_passage: List[Any],
                         number_words: List[str],
                         number_indices: List[int],
                         number_len: List[int],
                         question_id: str = None,
                         passage_id: str = None,
                         answer_annotations: List[Dict] = None,
                         count_gold_spans_text: List[str] = None) -> Union[Instance, None]:
        # Tokenize question and passage
        question_tokens = self.tokenizer.tokenize(question_text)
        qlen = len(question_tokens)
        plen = len(passage_tokens)

        question_passage_tokens = [Token('[CLS]')] + question_tokens + [Token('[SEP]')] + passage_tokens
        if len(question_passage_tokens) > self.max_pieces - 1:
            question_passage_tokens = question_passage_tokens[:self.max_pieces - 1]
            passage_tokens = passage_tokens[:self.max_pieces - qlen - 3]
            plen = len(passage_tokens)
            number_indices, number_len, numbers_in_passage = \
                clipped_passage_num(number_indices, number_len, numbers_in_passage, plen)

        question_passage_tokens += [Token('[SEP]')]
        number_indices = [index + qlen + 2 for index in number_indices] + [-1]
        # Not done in-place so they won't change the numbers saved for the passage
        number_len = number_len + [1]
        numbers_in_passage = numbers_in_passage + [0]
        number_tokens = [Token(str(number)) for number in numbers_in_passage]
        extra_number_tokens = [Token(str(num)) for num in self.extra_numbers]
        mask_indices = [0, qlen + 1, len(question_passage_tokens) - 1]

        fields: Dict[str, Field] = {}

        # Add feature fields
        question_passage_field = TextField(question_passage_tokens, self.token_indexers)
        fields["question_passage"] = question_passage_field

        number_token_indices = \
            [ArrayField(np.arange(start_ind, start_ind + number_len[i]), padding_value=-1)
             for i, start_ind in enumerate(number_indices)]
        fields["number_indices"] = ListField(number_token_indices)
        numbers_in_passage_field = TextField(number_tokens, self.token_indexers)
        extra_numbers_field = TextField(extra_number_tokens, self.token_indexers)
        all_numbers_field = TextField(extra_number_tokens + number_tokens, self.token_indexers)
        mask_index_fields: List[Field] = [IndexField(index, question_passage_field)
                                          for index in mask_indices]
        fields["mask_indices"] = ListField(mask_index_fields)

        # Compile question, passage, answer metadata
        metadata = {"original_passage": passage_text,
                    "original_question": question_text,
                    "original_numbers": numbers_in_passage,
                    "original_number_words": number_words,
                    "extra_numbers": self.extra_numbers,
                    "passage_tokens": passage_tokens,
                    "question_tokens": question_tokens,
                    "question_passage_tokens": question_passage_tokens,
                    "passage_id": passage_id,
                    "question_id": question_id}

        if self.extract_spans:
            metadata["passage_spans"] = passage_spans

        if count_gold_spans_text is not None:
            metadata["count_gold_spans_text"] = count_gold_spans_text

        if answer_annotations:
            for annotation in answer_annotations:
                tokenized_spans = [[token.text for token in self.tokenizer.tokenize(answer)]
                                   for answer in annotation['spans']]
                annotation['spans'] = [tokenlist_to_passage(token_list)
                                       for token_list in tokenized_spans]

            # Get answer type, answer text, tokenize
            answer_type, answer_texts = DropReader.extract_answer_info_from_annotation(answer_annotations[0])
            tokenized_answer_texts = []
            num_spans = min(len(answer_texts), self.max_spans)
            for answer_text in answer_texts:
                answer_tokens = self.tokenizer.tokenize(answer_text)
                tokenized_answer_texts.append(' '.join(token.text for token in answer_tokens))

            metadata["answer_annotations"] = answer_annotations
            metadata["answer_texts"] = answer_texts
            metadata["answer_tokens"] = tokenized_answer_texts

            # Find answer text in question and passage
            valid_question_spans = DropReader.find_valid_spans(question_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_question_spans):
                valid_question_spans[span_ind] = (span[0] + 1, span[1] + 1)
            valid_passage_spans = DropReader.find_valid_spans(passage_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_passage_spans):
                valid_passage_spans[span_ind] = (span[0] + qlen + 2, span[1] + qlen + 2)

            # Get target numbers
            target_numbers = []
            for answer_text in answer_texts:
                number = self.word_to_num(answer_text)
                if number is not None:
                    target_numbers.append(number)

            # Get possible ways to arrive at target numbers with add/sub
            valid_expressions: List[List[int]] = []
            exp_strings = None
            if answer_type in ["number", "date"]:
                if self.exp_search == 'full':
                    expressions = get_full_exp(list(enumerate(self.extra_numbers + numbers_in_passage)),
                                               target_numbers,
                                               self.operations,
                                               self.op_dict,
                                               self.max_depth)
                    zipped = list(zip(*expressions))
                    if zipped:
                        valid_expressions = list(zipped[0])
                        exp_strings = list(zipped[1])
                elif self.exp_search == 'add_sub':
                    valid_expressions = \
                        DropReader.find_valid_add_sub_expressions(self.extra_numbers + numbers_in_passage,
                                                                  target_numbers,
                                                                  self.max_numbers_expression)
                elif self.exp_search == 'template':
                    valid_expressions, exp_strings = \
                        get_template_exp(self.extra_numbers + numbers_in_passage,
                                         target_numbers,
                                         self.templates,
                                         self.template_strings)
                    exp_strings = sum(exp_strings, [])

            # Get possible ways to arrive at target numbers with counting
            valid_counts: List[int] = []
            if answer_type in ["number"]:
                numbers_for_count = list(range(self.max_count + 1))
                valid_counts = DropReader.find_valid_counts(numbers_for_count, target_numbers)

            # Update metadata with answer info
            answer_info = {"answer_passage_spans": valid_passage_spans,
                           "answer_question_spans": valid_question_spans,
                           "num_spans": num_spans,
                           "expressions": valid_expressions,
                           "counts": valid_counts}
            if self.exp_search in ['template', 'full']:
                answer_info['expr_text'] = exp_strings
            metadata["answer_info"] = answer_info

            # Add answer fields
            passage_span_fields: List[Field] = [SpanField(span[0], span[1], question_passage_field)
                                                for span in valid_passage_spans]
            if not passage_span_fields:
                passage_span_fields.append(SpanField(-1, -1, question_passage_field))
            fields["answer_as_passage_spans"] = ListField(passage_span_fields)

            question_span_fields: List[Field] = [SpanField(span[0], span[1], question_passage_field)
                                                 for span in valid_question_spans]
            if not question_span_fields:
                question_span_fields.append(SpanField(-1, -1, question_passage_field))
            fields["answer_as_question_spans"] = ListField(question_span_fields)

            if self.exp_search == 'add_sub':
                add_sub_signs_field: List[Field] = []
                extra_signs_field: List[Field] = []
                for signs_for_one_add_sub_expressions in valid_expressions:
                    extra_signs = signs_for_one_add_sub_expressions[:len(self.extra_numbers)]
                    normal_signs = signs_for_one_add_sub_expressions[len(self.extra_numbers):]
                    add_sub_signs_field.append(SequenceLabelField(normal_signs, numbers_in_passage_field))
                    extra_signs_field.append(SequenceLabelField(extra_signs, extra_numbers_field))
                if not add_sub_signs_field:
                    add_sub_signs_field.append(SequenceLabelField([0] * len(number_tokens),
                                                                  numbers_in_passage_field))
                if not extra_signs_field:
                    extra_signs_field.append(SequenceLabelField([0] * len(self.extra_numbers),
                                                                extra_numbers_field))
                fields["answer_as_expressions"] = ListField(add_sub_signs_field)
                if self.extra_numbers:
                    fields["answer_as_expressions_extra"] = ListField(extra_signs_field)
            elif self.exp_search in ['template', 'full']:
                expression_indices = []
                for expression in valid_expressions:
                    if not expression:
                        expression.append(3 * [-1])
                    expression_indices.append(ArrayField(np.array(expression), padding_value=-1))
                if not expression_indices:
                    expression_indices = \
                        [ArrayField(np.array([3 * [-1]]), padding_value=-1)
                         for _ in range(len(self.templates))]
                fields["answer_as_expressions"] = ListField(expression_indices)

            count_fields: List[Field] = [LabelField(count_label, skip_indexing=True)
                                         for count_label in valid_counts]
            if not count_fields:
                count_fields.append(LabelField(-1, skip_indexing=True))
            fields["answer_as_counts"] = ListField(count_fields)
            fields["num_spans"] = LabelField(num_spans, skip_indexing=True)

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
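# Illustrative sketch (not part of the reader above) of its index bookkeeping:
# with the layout `[CLS] question [SEP] passage [SEP]`, passage token i lands at
# sequence position i + qlen + 2, which is exactly the shift applied to
# `number_indices`. The toy data and names below are hypothetical.
question = ["how", "many", "points"]
passage = ["they", "scored", "10", "points", "and", "7", "more"]
number_indices = [2, 5]                      # positions of "10" and "7" in the passage

qlen = len(question)
sequence = ["[CLS]"] + question + ["[SEP]"] + passage + ["[SEP]"]
shifted = [i + qlen + 2 for i in number_indices] + [-1]   # trailing -1 marks the dummy 0

assert [sequence[i] for i in shifted[:-1]] == ["10", "7"]
mask_indices = [0, qlen + 1, len(sequence) - 1]           # [CLS] and both [SEP] positions
print(shifted, mask_indices)                              # [7, 10, -1] [0, 4, 12]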
    def text_to_instance(self,
                         source_string: str,
                         target_lang: str,
                         target_string: str = None) -> Instance:
        """
        Turn raw source string and target string into an ``Instance``.

        Parameters
        ----------
        source_string : ``str``, required
        target_lang : ``str``, required
        target_string : ``str``, optional (default = None)

        Returns
        -------
        Instance
            See the above for a description of the fields that the instance will contain.
        """
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

        meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
        fields_dict = {"source_tokens": source_field,
                       "source_to_target": source_to_target_field}

        if self._provide_trg_lang:
            lang_id_field = LabelField(target_lang, label_namespace=self._language_id_namespace)
            metadata_trg_lang = MetadataField(target_lang)
            fields_dict["target_lang"] = lang_id_field
            fields_dict["target_language"] = metadata_trg_lang

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)
            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]

            source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] + tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
            fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
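# Hedged sketch of the id bookkeeping that a `_tokens_to_ids` helper typically
# performs for the copy mechanism above: one id per distinct token string, so
# a target token copied from the source shares its source id. This is an
# assumption about the helper's semantics, not a copy of its implementation.
from typing import Dict, List

def tokens_to_ids(tokens: List[str]) -> List[int]:
    ids: Dict[str, int] = {}
    return [ids.setdefault(token, len(ids)) for token in tokens]

source = ["the", "cat", "sat"]
target = ["le", "cat", "sat"]                   # "cat" and "sat" are copyable
joint = tokens_to_ids(source + target)
source_ids, target_ids = joint[:len(source)], joint[len(source):]
print(source_ids, target_ids)                   # [0, 1, 2] [3, 1, 2]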
    def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         question_id: str = None,
                         passage_id: str = None,
                         answer_annotations: List[Dict] = None,
                         passage_tokens: List[Token] = None) -> Union[Instance, None]:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
            passage_tokens = split_tokens_by_hyphen(passage_tokens)
        question_tokens = self._tokenizer.tokenize(question_text)
        question_tokens = split_tokens_by_hyphen(question_tokens)
        if self.passage_length_limit is not None:
            passage_tokens = passage_tokens[:self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[:self.question_length_limit]

        answer_type: str = None
        answer_texts: List[str] = []
        if answer_annotations:
            # We only use the first annotated answer here; this does not affect
            # training, because the train set has only one annotation per question.
            answer_type, answer_texts = self.extract_answer_info_from_annotation(answer_annotations[0])

        # Tokenize the answer texts so that answer spans can be matched on tokens.
        tokenized_answer_texts = []
        for answer_text in answer_texts:
            answer_tokens = self._tokenizer.tokenize(answer_text)
            answer_tokens = split_tokens_by_hyphen(answer_tokens)
            tokenized_answer_texts.append(' '.join(token.text for token in answer_tokens))

        if self.instance_format == "squad":
            valid_passage_spans = \
                self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
            if not valid_passage_spans:
                if "passage_span" in self.skip_when_all_empty:
                    return None
                else:
                    valid_passage_spans.append((len(passage_tokens) - 1, len(passage_tokens) - 1))
            return make_reading_comprehension_instance(
                    question_tokens,
                    passage_tokens,
                    self._token_indexers,
                    passage_text,
                    valid_passage_spans,
                    # this `answer_texts` will not be used for evaluation
                    answer_texts,
                    additional_metadata={"original_passage": passage_text,
                                         "original_question": question_text,
                                         "passage_id": passage_id,
                                         "question_id": question_id,
                                         "valid_passage_spans": valid_passage_spans,
                                         "answer_annotations": answer_annotations})
        elif self.instance_format == "bert":
            question_concat_passage_tokens = question_tokens + [Token("[SEP]")] + passage_tokens
            valid_passage_spans = []
            for span in self.find_valid_spans(passage_tokens, tokenized_answer_texts):
                # This span is for `question + [SEP] + passage`.
                valid_passage_spans.append((span[0] + len(question_tokens) + 1,
                                            span[1] + len(question_tokens) + 1))
            if not valid_passage_spans:
                if "passage_span" in self.skip_when_all_empty:
                    return None
                else:
                    valid_passage_spans.append((len(question_concat_passage_tokens) - 1,
                                                len(question_concat_passage_tokens) - 1))
            answer_info = {"answer_texts": answer_texts,  # not used for evaluation
                           "answer_passage_spans": valid_passage_spans}
            return self.make_bert_drop_instance(question_tokens,
                                                passage_tokens,
                                                question_concat_passage_tokens,
                                                self._token_indexers,
                                                passage_text,
                                                answer_info,
                                                additional_metadata={"original_passage": passage_text,
                                                                     "original_question": question_text,
                                                                     "passage_id": passage_id,
                                                                     "question_id": question_id,
                                                                     "answer_annotations": answer_annotations})
        elif self.instance_format == "drop":
            numbers_in_passage = []
            number_indices = []
            for token_index, token in enumerate(passage_tokens):
                number = self.convert_word_to_number(token.text)
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(token_index)
            # hack to guarantee minimal length of padded numbers
            numbers_in_passage.append(0)
            number_indices.append(-1)
            numbers_as_tokens = [Token(str(number)) for number in numbers_in_passage]

            valid_passage_spans = \
                self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
            valid_question_spans = \
                self.find_valid_spans(question_tokens, tokenized_answer_texts) if tokenized_answer_texts else []

            target_numbers = []
            # `answer_texts` is a list of valid answers.
            for answer_text in answer_texts:
                number = self.convert_word_to_number(answer_text)
                if number is not None:
                    target_numbers.append(number)
            valid_signs_for_add_sub_expressions: List[List[int]] = []
            valid_counts: List[int] = []
            if answer_type in ["number", "date"]:
                valid_signs_for_add_sub_expressions = \
                    self.find_valid_add_sub_expressions(numbers_in_passage, target_numbers)
            if answer_type in ["number"]:
                # Currently we only support count numbers 0 ~ 9
                numbers_for_count = list(range(10))
                valid_counts = self.find_valid_counts(numbers_for_count, target_numbers)

            type_to_answer_map = {"passage_span": valid_passage_spans,
                                  "question_span": valid_question_spans,
                                  "addition_subtraction": valid_signs_for_add_sub_expressions,
                                  "counting": valid_counts}

            if self.skip_when_all_empty \
                    and not any(type_to_answer_map[skip_type] for skip_type in self.skip_when_all_empty):
                return None

            answer_info = {"answer_texts": answer_texts,  # not used for evaluation
                           "answer_passage_spans": valid_passage_spans,
                           "answer_question_spans": valid_question_spans,
                           "signs_for_add_sub_expressions": valid_signs_for_add_sub_expressions,
                           "counts": valid_counts}

            return self.make_marginal_drop_instance(question_tokens,
                                                    passage_tokens,
                                                    numbers_as_tokens,
                                                    number_indices,
                                                    self._token_indexers,
                                                    passage_text,
                                                    answer_info,
                                                    additional_metadata={"original_passage": passage_text,
                                                                         "original_question": question_text,
                                                                         "original_numbers": numbers_in_passage,
                                                                         "passage_id": passage_id,
                                                                         "question_id": question_id,
                                                                         "answer_info": answer_info,
                                                                         "answer_annotations": answer_annotations})
        else:
            raise ValueError(f'Expect the instance format to be "drop", "squad" or "bert", '
                             f'but got {self.instance_format}')
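# Rough sketch (hypothetical names, simplified limits) of the search behind
# `find_valid_add_sub_expressions`: assign each passage number a sign in
# {0, +1, -1} and keep the assignments whose signed sum hits a target number.
from itertools import product
from typing import List

def find_add_sub_signs(numbers: List[float], targets: List[float],
                       max_terms: int = 2) -> List[List[int]]:
    valid = []
    for signs in product((0, 1, -1), repeat=len(numbers)):
        used = sum(1 for sign in signs if sign != 0)
        if 0 < used <= max_terms and sum(s * n for s, n in zip(signs, numbers)) in targets:
            valid.append(list(signs))
    return valid

print(find_add_sub_signs([10, 7, 3], targets=[3]))   # [[0, 0, 1], [1, -1, 0]]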
    def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         pos_tags: List[str] = None,
                         gold_tree: Tree = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        pos_tags : ``List[str]``, optional (default = None).
            The POS tags for the words in the sentence.
        gold_tree : ``Tree``, optional (default = None).
            The gold parse tree to create span labels from.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            pos_tags : ``SequenceLabelField``
                The POS tags of the words in the sentence.
                Only returned if ``use_pos_tags`` is ``True``.
            spans : ``ListField[SpanField]``
                A ListField containing all possible subspans of the sentence.
            span_labels : ``SequenceLabelField``, optional.
                The constituency tags for each of the possible spans, with
                respect to a gold parse tree. If a span is not contained
                within the tree, it will have a ``NO-LABEL`` label.
            gold_tree : ``MetadataField(Tree)``
                The gold NLTK parse tree for use in evaluation.
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens], token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        if self._use_pos_tags and pos_tags is not None:
            pos_tag_field = SequenceLabelField(pos_tags, text_field, "pos_tags")
            fields["pos_tags"] = pos_tag_field
        elif self._use_pos_tags:
            raise ConfigurationError("use_pos_tags was set to True but no gold pos"
                                     " tags were passed to the dataset reader.")
        spans: List[Field] = []
        gold_labels = []

        if gold_tree is not None:
            gold_spans_with_pos_tags: Dict[Tuple[int, int], str] = {}
            self._get_gold_spans(gold_tree, 0, gold_spans_with_pos_tags)
            gold_spans = {span: label for (span, label) in gold_spans_with_pos_tags.items()
                          if "-POS" not in label}
        else:
            gold_spans = None
        for start, end in enumerate_spans(tokens):
            spans.append(SpanField(start, end, text_field))

            if gold_spans is not None:
                if (start, end) in gold_spans.keys():
                    gold_labels.append(gold_spans[(start, end)])
                else:
                    gold_labels.append("NO-LABEL")

        metadata = {"tokens": tokens}
        if gold_tree:
            metadata["gold_tree"] = gold_tree
        fields["metadata"] = MetadataField(metadata)

        span_list_field: ListField = ListField(spans)
        fields["spans"] = span_list_field
        if gold_tree is not None:
            fields["span_labels"] = SequenceLabelField(gold_labels, span_list_field)
        return Instance(fields)
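# Minimal sketch of what `enumerate_spans` produces for the span fields above:
# every (start, end) pair with inclusive indices and start <= end. This is an
# illustrative re-implementation, not the AllenNLP original.
from typing import List, Tuple

def all_spans(tokens: List[str]) -> List[Tuple[int, int]]:
    return [(i, j) for i in range(len(tokens)) for j in range(i, len(tokens))]

spans = all_spans(["the", "dog", "barks"])
print(spans)        # [(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)]
print(len(spans))   # n * (n + 1) // 2 = 6 spans for n = 3 tokens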
    def _read(self, file_path: str):
        file_path = cached_path(file_path)
        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)

        # if self._span_file_path is not None:
        with open(self._span_file_path) as span_fh:
            span_file = json.load(span_fh)

        # Load the span-extraction model; its archive also carries the dataset
        # reader config used to build matching token indexers.
        archive = load_archive(self._extraction_model_path)
        model = archive.model
        p1_dataset_reader = DatasetReader.from_params(archive.config["dataset_reader"])
        p1_token_indexers = p1_dataset_reader._token_indexers
        logger.info("Reading the dataset")
        for data, best_span in zip(dataset, span_file):
            answer = data['answers'][0]
            question = data['query']
            well_formed_answer = data['wellFormedAnswers'][0]
            passages_json = data['passages']
            passages = [passages_json[i]['passage_text'] for i in range(len(passages_json))]
            passages_is_selected = [passages_json[i]['is_selected'] for i in range(len(passages_json))]
            tokenized_passages_list = [self._tokenizer.tokenize(util.normalize_text(p)) for p in passages]
            passages_length = [len(p) for p in tokenized_passages_list]
            cumulative_passages_length = np.cumsum(passages_length)

            normalized_answer = None
            if answer is not None:
                normalized_answer = util.normalize_text(answer)
            normalized_question = util.normalize_text(question)
            tokenized_answer = self._tokenizer.tokenize(normalized_answer)
            tokenized_question = self._tokenizer.tokenize(normalized_question)
            question_field = TextField(tokenized_question, self._token_indexers)
            fields = {'question': question_field}
            start_idx, end_idx, rouge_score, passage_idx = None, None, None, None
            tokenized_answer.insert(0, Token(START_SYMBOL))
            tokenized_answer.append(Token(END_SYMBOL))

            tokenized_passage = [token for sublist in tokenized_passages_list for token in sublist]
            passage_field = TextField(tokenized_passage, self._token_indexers)
            fields['passage'] = passage_field

            # Run the span-extraction model to predict a span over the concatenated passages.
            p1_question_field = TextField(tokenized_question, p1_token_indexers)
            p1_passage_field = TextField(tokenized_passage, p1_token_indexers)
            p1_fields = {'question': p1_question_field, 'passage': p1_passage_field}
            p1_instance = Instance(p1_fields)
            outputs = model.forward_on_instance(p1_instance, -1)
            start_idx = outputs['span_start_idx']
            end_idx = outputs['span_end_idx']

            # Map the global span offsets back into the passage that contains them.
            for idx in range(len(cumulative_passages_length)):
                if start_idx < cumulative_passages_length[idx]:
                    break
            if idx != 0:
                start_idx = start_idx - cumulative_passages_length[idx - 1]
                end_idx = end_idx - cumulative_passages_length[idx - 1]
            assert start_idx <= end_idx, "Span prediction does not make sense!"

            # Yield an instance from the predicted span.
            span_start_field = IndexField(int(start_idx), passage_field)
            span_end_field = IndexField(int(end_idx), passage_field)
            answer_field = TextField(tokenized_answer, self._token_indexers)
            fields['span_start'] = span_start_field
            fields['span_end'] = span_end_field
            fields['answer'] = answer_field
            evidence = self.get_evidence(tokenized_passage, int(start_idx), int(end_idx))
            fields['metadata'] = MetadataField({'evidence': evidence,
                                                'question_text': normalized_question,
                                                'answer_text': normalized_answer})
            yield Instance(fields)

            # Yield instances from the gold spans.
            for item in best_span:
                if item['score'] > 0.5:
                    gold_passage_field = TextField(tokenized_passages_list[item['passage']],
                                                   self._token_indexers)
                    # Build a fresh field dict per instance: mutating the shared
                    # `fields` dict would also mutate the instance yielded above.
                    gold_fields = {
                        'question': question_field,
                        'passage': gold_passage_field,
                        'span_start': IndexField(item['start'], gold_passage_field),
                        'span_end': IndexField(item['end'], gold_passage_field),
                        'answer': TextField(tokenized_answer, self._token_indexers),
                    }
                    # The evidence comes from the gold span offsets, not the predicted ones.
                    evidence = self.get_evidence(tokenized_passages_list[item['passage']],
                                                 int(item['start']), int(item['end']))
                    gold_fields['metadata'] = MetadataField({'evidence': evidence,
                                                             'question_text': normalized_question,
                                                             'answer_text': normalized_answer})
                    yield Instance(gold_fields)
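# Standalone sketch of the cumulative-length arithmetic above, which maps a
# span predicted over the concatenated passages back to a single passage and
# local offsets. The toy lengths are hypothetical; the logic mirrors the loop
# in `_read`.
import numpy as np

passage_lengths = [4, 6, 5]                  # tokens per passage
cumulative = np.cumsum(passage_lengths)      # [4, 10, 15]

start_idx, end_idx = 7, 9                    # global offsets in the concatenation
for idx in range(len(cumulative)):
    if start_idx < cumulative[idx]:
        break
if idx != 0:
    start_idx -= cumulative[idx - 1]
    end_idx -= cumulative[idx - 1]
print(idx, start_idx, end_idx)               # passage 1, local span (3, 5)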