def text_to_instance(self,
                     premise: str,
                     hypothesis: str,
                     label: str = None,
                     tag=None) -> Instance:
    """
    Converts a premise/hypothesis pair into an ``Instance`` of id arrays.

    Both sentences are tokenized, optionally truncated, converted to ids and
    joined into one sequence with the tokenizer's special tokens. When
    ``self.max_seq_length`` is set, the id sequence, the attention mask and
    the token type ids are right-padded with zeros up to that length.

    Parameters
    ----------
    premise : ``str``
        The premise sentence.
    hypothesis : ``str``
        The hypothesis sentence.
    label : ``str``, optional
        Gold label. When given, it is looked up in ``self._label_dict`` and
        stored in a ``label`` field.
    tag : optional
        Opaque value that is only copied into the metadata.

    Returns
    -------
    An ``Instance`` with ``input_ids``, ``token_type_ids``,
    ``attention_mask`` and ``metadata`` fields, plus ``label`` when a label
    was supplied.
    """
    premise_tokens = self._tokenizer.tokenize(premise)
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)

    if self.max_seq_length is not None:
        # The return value is not used, so this presumably truncates both
        # token lists in place -- TODO confirm against the helper.
        self._truncate_seq_pair(premise_tokens, hypothesis_tokens)

    premise_ids = self._tokenizer.convert_tokens_to_ids(premise_tokens)
    hypothesis_ids = self._tokenizer.convert_tokens_to_ids(hypothesis_tokens)

    input_ids = self._tokenizer.add_special_tokens_sentences_pair(
        premise_ids, hypothesis_ids)
    token_type_ids = self.get_token_type_ids(input_ids)
    attention_mask = [1] * len(input_ids)

    # Add padding if max_seq_length is defined. A sequence that is already
    # at least max_seq_length long yields an empty padding list.
    if self.max_seq_length is not None:
        padding = [0] * (self.max_seq_length - len(input_ids))
        input_ids += padding
        attention_mask += padding
        token_type_ids += padding

    metadata = {
        'premise': premise,
        'hypothesis': hypothesis,
        'premise_tokens': premise_tokens,
        'hypothesis_tokens': hypothesis_tokens,
        'label': label,
        'tag': tag
    }

    fields = {
        'input_ids': ArrayField(np.array(input_ids), dtype=np.int64),
        'token_type_ids': ArrayField(np.array(token_type_ids),
                                     dtype=np.int64),
        'attention_mask': ArrayField(np.array(attention_mask),
                                     dtype=np.int64),
        'metadata': MetadataField(metadata)
    }

    if label is not None:
        fields['label'] = ArrayField(np.array(self._label_dict[label]),
                                     dtype=np.int64)

    return Instance(fields)
def text_to_instance(self,
                     features: np.ndarray,
                     premise: str,
                     hypothesis: str,
                     label: str = None) -> Instance:
    """
    Wraps a precomputed feature array and its source sentences in an
    ``Instance``.

    Parameters
    ----------
    features : ``np.ndarray``
        Feature array, stored as-is in an ``ArrayField``.
    premise : ``str``
        The premise sentence; kept in the metadata only.
    hypothesis : ``str``
        The hypothesis sentence; kept in the metadata only.
    label : ``str``, optional
        Gold label. A ``label`` field is added only when this is truthy.

    Returns
    -------
    An ``Instance`` with ``features`` and ``metadata`` fields, plus
    ``label`` when a label was supplied.
    """
    sentence_pair = {'premise': premise, 'hypothesis': hypothesis}
    instance_fields: Dict[str, Field] = {
        'features': ArrayField(features),
        'metadata': MetadataField(sentence_pair),
    }
    if label:
        instance_fields['label'] = LabelField(label)
    return Instance(instance_fields)
def text_to_instance(self,
                     premise: str,
                     hypothesis: str,
                     label: str = None) -> Instance:
    """
    Builds an ``Instance`` of n-gram overlap features for a sentence pair.

    Three features are computed, in this order: skip-trigram, skip-bigram
    and unigram overlap. Each one is the share of the hypothesis' distinct
    n-grams that also occur in the premise, and is ``0.0`` when the
    hypothesis yields no n-grams of that order.

    Parameters
    ----------
    premise : ``str``
        The premise sentence.
    hypothesis : ``str``
        The hypothesis sentence.
    label : ``str``, optional
        Gold label. A ``label`` field is added only when this is truthy.

    Returns
    -------
    An ``Instance`` with ``features`` and ``metadata`` fields, plus
    ``label`` when a label was supplied.
    """

    def overlap(premise_grams: set, hypothesis_grams: set) -> float:
        # An empty hypothesis set (presumably a hypothesis shorter than the
        # n-gram order) must yield 0.0 rather than divide by zero. The
        # previous code left the value unassigned in that case, which
        # crashed later with an UnboundLocalError.
        if not hypothesis_grams:
            return 0.0
        return (len(premise_grams.intersection(hypothesis_grams)) /
                len(hypothesis_grams))

    fields: Dict[str, Field] = {}

    premise_tokens = [x.text for x in self._tokenizer.tokenize(premise)]
    hypothesis_tokens = [
        x.text for x in self._tokenizer.tokenize(hypothesis)
    ]

    # n-grams from the premise
    prem_trigrams = set(skipgrams(premise_tokens, 3, 1))
    prem_bigrams = set(skipgrams(premise_tokens, 2, 1))
    prem_unigrams = set(ngrams(premise_tokens, 1))

    # n-grams from the hypothesis
    hyp_trigrams = set(skipgrams(hypothesis_tokens, 3, 1))
    hyp_bigrams = set(skipgrams(hypothesis_tokens, 2, 1))
    hyp_unigrams = set(ngrams(hypothesis_tokens, 1))

    # overlap proportions
    tri_overlap = overlap(prem_trigrams, hyp_trigrams)
    bi_overlap = overlap(prem_bigrams, hyp_bigrams)
    uni_overlap = overlap(prem_unigrams, hyp_unigrams)

    fields['features'] = FeaturesField(
        [tri_overlap, bi_overlap, uni_overlap])

    metadata = {
        'premise': premise,
        'hypothesis': hypothesis,
        'premise_tokens': premise_tokens,
        'hypothesis_tokens': hypothesis_tokens
    }
    fields['metadata'] = MetadataField(metadata)

    if label:
        fields['label'] = LabelField(label)

    return Instance(fields)
def text_to_instance(self,
                     premise: str,
                     hypothesis: str,
                     hypothesis_structure: str,
                     label: str = None) -> Instance:
    """
    Builds an ``Instance`` from a sentence pair and a hypothesis structure.

    Both sentences are tokenized and sliced with ``[-self._max_tokens:]``,
    i.e. the trailing tokens are the ones kept. The structure string is
    handed to ``self._add_structure_to_fields`` together with the field
    dictionary built so far.

    Parameters
    ----------
    premise : ``str``
        The premise sentence.
    hypothesis : ``str``
        The hypothesis sentence.
    hypothesis_structure : ``str``
        Passed unchanged to ``self._add_structure_to_fields``.
    label : ``str``, optional
        Gold label. A ``label`` field is added only when this is truthy.

    Returns
    -------
    An ``Instance`` with ``premise``, ``hypothesis`` and ``metadata``
    fields, whatever ``self._add_structure_to_fields`` contributes, plus
    ``label`` when a label was supplied.
    """

    def trailing_tokens(text):
        # Tokenize and keep the slice taken from the end of the sequence.
        return self._tokenizer.tokenize(text)[-self._max_tokens:]

    premise_tokens = trailing_tokens(premise)
    hypothesis_tokens = trailing_tokens(hypothesis)

    fields: Dict[str, Field] = {
        'premise': TextField(premise_tokens, self._token_indexers),
        'hypothesis': TextField(hypothesis_tokens, self._token_indexers),
        'metadata': MetadataField({
            'premise': premise,
            'hypothesis': hypothesis,
            'premise_tokens': [token.text for token in premise_tokens],
            'hypothesis_tokens': [token.text for token in hypothesis_tokens]
        }),
    }

    # The helper receives the dictionary itself so it can add its entries.
    self._add_structure_to_fields(hypothesis_structure, fields)

    if label:
        fields['label'] = LabelField(label)

    return Instance(fields)
def make_reading_comprehension_instance(
        self,
        question_tokens: List[Token],
        passage_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        token_spans: List[Tuple[int, int]] = None,
        answer_texts: List[str] = None,
        additional_metadata: Dict[str, Any] = None) -> Instance:
    """
    Converts a tokenized question and passage, plus optional answers, into
    an ``Instance`` for a reading comprehension model.

    The ``Instance`` always has ``question`` and ``passage`` ``TextFields``
    and a ``metadata`` ``MetadataField``. When ``token_spans`` is given it
    also has ``span_start`` and ``span_end`` ``IndexFields`` that point into
    the passage field.

    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage.
    token_indexers : ``Dict[str, TokenIndexer]``
        Indexers used for both the question and the passage ``TextFields``.
    passage_text : ``str``
        The original passage text, stored in the metadata under
        ``original_passage``.
    token_spans : ``List[Tuple[int, int]]``, optional
        Candidate answer spans as indices into ``passage_tokens``. Only the
        first span is used for the span fields, so callers are expected to
        sort the list by preference. The second element of the span is
        reduced by one before use, i.e. it is treated as an exclusive end.
        The full list is stored in the metadata under ``token_spans``.
    answer_texts : ``List[str]``, optional
        Answer strings, stored in the metadata under ``answer_texts`` when
        non-empty and not used anywhere else in this method.
    additional_metadata : ``Dict[str, Any]``, optional
        Extra entries merged into the metadata last, so they override the
        keys built here (``original_passage``, ``question_tokens``,
        ``passage_tokens``, ``answer_texts``, ``token_spans``).

    Returns
    -------
    The constructed ``Instance``, or ``None`` when the first span ends
    beyond the last passage token.
    """
    if not additional_metadata:
        additional_metadata = {}

    # Kept in its own variable because the index fields below refer to it.
    passage_field = TextField(passage_tokens, token_indexers)
    instance_fields: Dict[str, Field] = {
        'passage': passage_field,
        'question': TextField(question_tokens, token_indexers),
    }

    metadata = {
        'original_passage': passage_text,
        'question_tokens': [token.text for token in question_tokens],
        'passage_tokens': [token.text for token in passage_tokens],
    }

    if answer_texts:
        metadata['answer_texts'] = answer_texts

    if token_spans:
        metadata["token_spans"] = token_spans

        # Spans are assumed to be sorted by some criteria; take the first.
        best_span = token_spans[0]
        first_index = best_span[0]
        last_index = best_span[1] - 1
        assert (first_index <= last_index)

        # A span that runs past the passage cannot be indexed; give up.
        if last_index >= len(passage_tokens):
            return None

        instance_fields['span_start'] = IndexField(first_index,
                                                   passage_field)
        instance_fields['span_end'] = IndexField(last_index, passage_field)

    metadata.update(additional_metadata)
    instance_fields['metadata'] = MetadataField(metadata)
    return Instance(instance_fields)
def text_to_instance(self,
                     content: str,
                     candidates: List[str],
                     ground_truths: List[str] = None,
                     real_count: int = 1) -> Iterable[Instance]:
    """
    Yields one cloze ``Instance`` per ``#idiom#`` blank in ``content``.

    For each blank, that blank is replaced by ``[MASK]`` and every other
    blank by ``[UNK]``. A window of tokens around the mask is then kept as
    the model input.

    Parameters
    ----------
    content : ``str``
        Text containing ``real_count`` occurrences of ``#idiom#``.
    candidates : ``List[str]``
        One entry per blank. Each entry is iterated and its items are looked
        up in ``self.idiom_list``.
        NOTE(review): this looks like a list of option lists rather than a
        list of strings -- confirm against the callers.
    ground_truths : ``List[str]``, optional
        The correct option for each blank, in blank order. When given, an
        ``answer`` field holds its position within that blank's candidates.
    real_count : ``int``, optional (default = 1)
        Expected number of blanks; checked against ``content`` and
        ``candidates``.

    Yields
    ------
    An ``Instance`` with ``content``, ``blank_indices``, ``candidates`` and
    ``meta`` fields, plus ``answer`` when ``ground_truths`` is supplied.
    """
    splits = re.split(r'#idiom#', content)
    assert real_count + 1 == len(splits)
    assert real_count == len(candidates)

    split_tokens = [self.tokenizer.tokenize(item) for item in splits]

    # Window sizes do not depend on the blank, so compute them once.
    # The "- 2" presumably leaves room for two special tokens that are added
    # later -- TODO confirm.
    half_length = self.max_seq_length // 2
    window_length = self.max_seq_length - 2

    for index, current_candidates in enumerate(candidates):
        # Everything before the current blank; earlier blanks become [UNK].
        before_part_tokens = [Token(token) for token in split_tokens[0]]
        for before_part in split_tokens[1:index + 1]:
            before_part_tokens += [Token('[UNK]')] + [
                Token(token) for token in before_part
            ]

        # Everything after the current blank; later blanks become [UNK].
        after_part_tokens = [
            Token(token) for token in split_tokens[index + 1]
        ]
        for after_part in split_tokens[index + 2:]:
            after_part_tokens += [Token('[UNK]')] + [
                Token(token) for token in after_part
            ]

        # Mark the current blank with the [MASK] token.
        content_tokens = (before_part_tokens + [Token('[MASK]')] +
                          after_part_tokens)

        # Keep a bounded amount of context on either side of the blank.
        blank_position = len(before_part_tokens)
        total_length = blank_position + 1 + len(after_part_tokens)
        if blank_position < half_length:
            # Short left context: take the window from the start.
            start = 0
            end = min(total_length, window_length)
        elif len(after_part_tokens) < half_length:
            # Short right context: take the window from the end.
            end = total_length
            start = max(0, end - window_length)
        else:
            # Enough context on both sides: window around the blank.
            start = blank_position + 3 - half_length
            end = blank_position + 1 + half_length
        content_tokens = content_tokens[start:end]

        # The cloze text.
        content_field = TextField(content_tokens,
                                  self.content_token_indexer)

        # Position of the blank inside the windowed text.
        blank_index = content_tokens.index(Token("[MASK]"))
        blank_index_field = IndexField(blank_index, content_field)

        # Candidate idioms, as indices into the idiom list. np.int64 is used
        # because the np.long alias was removed in NumPy 1.24.
        candidate_tokens = np.array([
            self.idiom_list.index(option) for option in current_candidates
        ])
        candidate_field = ArrayField(candidate_tokens, dtype=np.int64)

        fields = {
            "content": content_field,
            "blank_indices": blank_index_field,
            "candidates": candidate_field,
        }

        # Meta information; the answer is appended only when it is known.
        meta = {
            "content": '[UNK]'.join(splits[:index + 1]) + "[MASK]" +
                       '[UNK]'.join(splits[index + 1:]),
            "candidates": current_candidates,
        }

        if ground_truths:
            label = current_candidates.index(ground_truths[index])
            fields["answer"] = LabelField(label, skip_indexing=True)
            meta["answer"] = ground_truths[index]

        fields["meta"] = MetadataField(meta)

        yield Instance(fields)
def text_to_instance(self,
                     premise: str,
                     hypothesis: str,
                     label: str = None,
                     tag=None) -> Instance:
    """
    Builds an encoder/decoder ``Instance`` from a premise/hypothesis pair.

    The premise is encoded three times, each time prefixed with a different
    label token of the tokenizer (entail, neutral, contradict, in that
    order). The hypothesis provides the decoder target, and the decoder
    input is that target shifted right by one with the end-of-sequence id in
    front. Sources and targets are right-padded with the pad token id when
    ``self.max_premise_length`` / ``self.max_hypothesis_length`` are set.

    Parameters
    ----------
    premise : ``str``
        The premise sentence; surrounding whitespace is stripped before
        tokenization.
    hypothesis : ``str``
        The hypothesis sentence; surrounding whitespace is stripped before
        tokenization.
    label : ``str``, optional
        Gold label. When given, it is looked up in ``self._label_dict`` and
        stored in a ``label`` field.
    tag : optional
        Opaque value that is only copied into the metadata.

    Returns
    -------
    An ``Instance`` with ``src``, ``src_lengths``, ``prev_output_tokens``,
    ``target``, ``target_lengths`` and ``metadata`` fields, plus ``label``
    when a label was supplied.
    """
    tokenizer = self._tokenizer

    def encode(tokens):
        # Tokens -> ids, wrapped in the tokenizer's special tokens.
        return tokenizer.add_special_tokens_single_sentence(
            tokenizer.convert_tokens_to_ids(tokens))

    # --- Tokenization and truncation ---
    premise_tokens, hypothesis_tokens = self._truncate_input(
        tokenizer.tokenize(premise.strip()),
        tokenizer.tokenize(hypothesis.strip()))

    # --- Encoder inputs: one copy of the premise per label token ---
    label_tokens = (tokenizer.entail_token, tokenizer.neutral_token,
                    tokenizer.contradict_token)
    src = [encode([label_token] + premise_tokens)
           for label_token in label_tokens]
    assert len(src[0]) == len(src[1]) == len(src[2])
    unpadded_src_length = len(src[0])

    # --- Decoder targets and inputs ---
    # Targets of the decoder: [<s> A B C D E <\s>]
    target = encode(hypothesis_tokens)
    # Inputs of the decoder: [<\s> <s> A B C D E]
    decoder_inputs = [tokenizer.eos_token_id] + target[:-1]
    # Lengths are recorded before any padding is appended.
    unpadded_target_length = len(target)

    # --- Padding ---
    if self.max_premise_length:
        missing = self.max_premise_length - unpadded_src_length
        encoder_padding = [tokenizer.pad_token_id] * missing
        src = [ids + encoder_padding for ids in src]

    if self.max_hypothesis_length:
        missing = self.max_hypothesis_length - unpadded_target_length
        decoder_padding = [tokenizer.pad_token_id] * missing
        target.extend(decoder_padding)
        decoder_inputs.extend(decoder_padding)

    # One decoder input row and one source length per encoder input.
    replicated_decoder_inputs = [decoder_inputs] * 3
    src_lengths = [unpadded_src_length] * 3

    # --- Create instance ---
    metadata = {
        'premise': premise,
        'hypothesis': hypothesis,
        'premise_tokens': premise_tokens,
        'hypothesis_tokens': hypothesis_tokens,
        'label': label,
        'tag': tag
    }

    fields = {
        'src': ArrayField(np.array(src), dtype=np.int64),
        'src_lengths': ArrayField(np.array(src_lengths), dtype=np.int64),
        'prev_output_tokens': ArrayField(
            np.array(replicated_decoder_inputs), dtype=np.int64),
        'target': ArrayField(np.array(target), dtype=np.int64),
        'target_lengths': ArrayField(np.array(unpadded_target_length),
                                     dtype=np.int64),
        'metadata': MetadataField(metadata)
    }

    if label is not None:
        fields['label'] = ArrayField(np.array(self._label_dict[label]),
                                     dtype=np.int64)

    return Instance(fields)