def tokenize(self, text): """Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. For example: input = "unaffable" output = ["un", "##aff", "##able"] Args: text: A single token or whitespace separated tokens. This should have already been passed through `BasicTokenizer`. Returns: A list of wordpiece tokens. """ seq_cws_dict = {} output_tokens = [] if self.count % 10000 == 0: logger.info(f"count ={self.count}, processing text: {text}") self.count += 1 for ind, token in enumerate(whitespace_tokenize(text)): seq_cws = jieba.lcut(token) seq_cws_dict.update({x: 1 for x in seq_cws}) for token in whitespace_tokenize(text): chars = list(token) i = 0 while i < len(chars): if len(CH_RE.findall(chars[i])) == 0: # 不是中文的,原文加进去。 output_tokens.append(token) break has_add = False for length in range(5, 0, -1): if i + length > len(chars): continue if ''.join(chars[i:i + length]) in seq_cws_dict: output_tokens.append(chars[i]) for l in range(1, length): output_tokens.append('##' + chars[i + l]) i += length has_add = True break if not has_add: output_tokens.append(chars[i]) i += 1 return output_tokens
def tokenize(self, text, never_split=None):
    """Basic tokenization of a piece of text.

    Splits on whitespace only; for sub-word tokenization, see
    WordPieceTokenizer.

    Args:
        **never_split**: (`optional`) list of str
            Kept for backward compatibility purposes. Now implemented
            directly at the base class level (see
            :func:`PreTrainedTokenizer.tokenize`). List of tokens not to
            split.
    """
    never_split = self.never_split + (never_split if never_split is not None else [])
    text = self._clean_text(text)
    # This was added on November 1st, 2018 for the multilingual and Chinese
    # models. This is also applied to the English models now, but it doesn't
    # matter since the English models were not trained on any Chinese data
    # and generally don't have any Chinese data in them (there are Chinese
    # characters in the vocabulary because Wikipedia does have some Chinese
    # words in the English Wikipedia.).
    if self.tokenize_chinese_chars:
        text = self._tokenize_chinese_chars(text)
    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
        # Pass MASK through unsplit and unlowered.
        if MASK in token:
            split_tokens.append(MASK)
            if token != MASK:
                remaining_chars = token.replace(MASK, "").strip()
                if remaining_chars:
                    split_tokens.append(remaining_chars)
            continue
        if self.do_lower_case and token not in never_split:
            token = token.lower()
            token = self._run_strip_accents(token)
        split_tokens.extend(self._run_split_on_punc(token))

    output_tokens = whitespace_tokenize(" ".join(split_tokens))
    return output_tokens

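# A quick illustration of the MASK passthrough above, assuming
# MASK = "[MASK]" and do_lower_case=True: the mask token survives unsplit and
# unlowered, while characters glued to it are emitted separately.
#
#   tokenizer.tokenize("Paris is the [MASK]. of France")
#   -> ["paris", "is", "the", "[MASK]", ".", "of", "france"]
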
def read_trivia_examples(input_file, is_training=True):
    total_cnt = 0
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    no_answer_cnt = 0
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["qid"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if qa["answers"] == []:
                    no_answer_cnt += 1
                    continue
                if is_training:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    # Word positions of the answer span.
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]
                    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                    # Compare case-insensitively on both sides.
                    cleaned_start = actual_text.lower().find(cleaned_answer_text.lower())
                    if cleaned_start == -1:
                        logger.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                        continue
                    else:
                        # cleaned_answer_text might be lower cased, so the
                        # answer needs to be reconstructed from actual_text.
                        orig_answer_text = actual_text[
                            cleaned_start:cleaned_start + len(cleaned_answer_text)]
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""

                example = TriviaExample(qas_id=qas_id,
                                        question_text=question_text,
                                        doc_tokens=doc_tokens,
                                        orig_answer_text=orig_answer_text,
                                        start_position=start_position,
                                        end_position=end_position)
                examples.append(example)
    print("# of questions without an answer: {}".format(no_answer_cnt))
    return examples

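# The doc_tokens / char_to_word_offset construction above recurs in every
# reader in this file. A minimal standalone demo of the idiom: each character
# of the context is mapped to the index of the whitespace-delimited word it
# falls in (whitespace characters map to the preceding word).
text = "Norwegian  Wood"
doc_tokens, char_to_word_offset = [], []
prev_is_whitespace = True
for c in text:
    if c in " \t\r\n" or ord(c) == 0x202F:
        prev_is_whitespace = True
    else:
        if prev_is_whitespace:
            doc_tokens.append(c)
        else:
            doc_tokens[-1] += c
        prev_is_whitespace = False
    char_to_word_offset.append(len(doc_tokens) - 1)

assert doc_tokens == ["Norwegian", "Wood"]
assert char_to_word_offset[0] == 0 and char_to_word_offset[11] == 1
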
def _create_examples(self, input_data, set_type, language):
    is_training = set_type == "train"
    paragraph_id = 0
    examples = []
    for entry in tqdm(input_data):
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # TODO: sentence_breaks can also be read from the json directly.
            sentence_breaks = list(infer_sentence_breaks(paragraph_text))
            paragraph_id += 1
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if _is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qas in paragraph["qas"]:
                qas_id = qas["id"]
                question_text = qas["question"]
                start_position = None
                end_position = None
                orig_answer_text = None

                # If a question has multiple answers, we only use the first.
                answer = qas["answers"][0]
                orig_answer_text = answer["text"]
                answer_offset = answer["answer_start"]
                answer_length = len(orig_answer_text)

                sentence_text = None
                for start, end in sentence_breaks:
                    if start <= answer_offset < end:
                        sentence_text = paragraph_text[start:end]
                        break
                # A potential problem here is that the sentence might break
                # around the answer fragment. In that case, we skip the example.
                if not sentence_text:
                    continue

                # Only add answers where the text can be exactly recovered
                # from the document. If this CAN'T happen it's likely due to
                # weird Unicode stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is
                # NOT guaranteed to be preserved.
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    continue

                example = RetrievalSquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    answer_text=actual_text,
                    sentence_text=sentence_text,
                    paragraph_text=paragraph_text,
                    paragraph_id=paragraph_id)
                examples.append(example)
    return examples

def read_nq_examples(input_file_or_data, is_training):
    """Read NQ json (or already-loaded data) and yield NQExamples.

    Refer to `nq_to_squad.py` to convert the `simplified-nq-t*.jsonl` files
    to NQ json."""
    if isinstance(input_file_or_data, str):
        with open(input_file_or_data, "r", encoding='utf-8') as f:
            input_data = json.load(f)["data"]
    else:
        input_data = input_file_or_data

    for entry in tqdm(input_data, total=len(input_data)):
        assert len(entry["paragraphs"]) == 1
        paragraph = entry["paragraphs"][0]
        paragraph_text = paragraph["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        assert len(paragraph["qas"]) == 1
        qa = paragraph["qas"][0]
        start_position = None
        end_position = None
        long_position = None
        orig_answer_text = None
        short_is_impossible = False
        long_is_impossible = False
        if is_training:
            short_is_impossible = qa["short_is_impossible"]
            short_answers = qa["short_answers"]
            if len(short_answers) >= 2:
                # Keep only the leftmost short answer.
                short_answers = sorted(short_answers,
                                       key=lambda sa: sa["answer_start"])
                short_answers = short_answers[0:1]
            if not short_is_impossible:
                answer = short_answers[0]
                orig_answer_text = answer["text"]
                answer_offset = answer["answer_start"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
                # Only add answers where the text can be exactly recovered
                # from the document. If this CAN'T happen it's likely due to
                # weird Unicode stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is
                # NOT guaranteed to be preserved.
                actual_text = " ".join(doc_tokens[start_position:end_position + 1])
                cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""

            long_is_impossible = qa["long_is_impossible"]
            long_answers = qa["long_answers"]
            if (len(long_answers) != 1) and not long_is_impossible:
                raise ValueError("For training, each question should have "
                                 "exactly 1 long answer.")
            if not long_is_impossible:
                long_answer = long_answers[0]
                long_answer_offset = long_answer["answer_start"]
                long_position = char_to_word_offset[long_answer_offset]
            else:
                long_position = -1

            if not short_is_impossible and not long_is_impossible:
                assert long_position <= start_position
            if not short_is_impossible and long_is_impossible:
                assert False, "Invalid pair: short answer present without a long answer"

        example = NQExample(
            qas_id=qa["id"],
            question_text=qa["question"],
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            long_position=long_position,
            short_is_impossible=short_is_impossible,
            long_is_impossible=long_is_impossible,
            crop_start=qa["crop_start"])
        yield example

def read_nq_entry(entry, is_training):
    """
    Converts a NQ entry into a list of NqExamples.

    :param entry: dict
    :param is_training: bool
    :return: list[NqExample]
    """

    def is_whitespace(c):
        return c in " \t\r\n" or ord(c) == 0x202F

    examples = []
    contexts_id = entry["id"]
    contexts = entry["contexts"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in contexts:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    questions = []
    for i, question in enumerate(entry["questions"]):
        qas_id = "{}".format(contexts_id)
        question_text = question["input_text"]
        start_position = None
        end_position = None
        answer = None
        if is_training:
            answer_dict = entry["answers"][i]
            answer = make_nq_answer(contexts, answer_dict)

            # For now, only handle extractive, yes, and no.
            if answer is None or answer.offset is None:
                continue
            start_position = char_to_word_offset[answer.offset]
            end_position = char_to_word_offset[answer.offset + len(answer.text) - 1]

            # Only add answers where the text can be exactly recovered from
            # the document. If this CAN'T happen it's likely due to weird
            # Unicode stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
                tokenization.whitespace_tokenize(answer.text))
            if actual_text.find(cleaned_answer_text) == -1:
                logger.warning("Could not find answer: '%s' vs. '%s'",
                               actual_text, cleaned_answer_text)
                continue

        questions.append(question_text)
        example = NqExample(
            example_id=int(contexts_id),
            qas_id=qas_id,
            questions=questions[:],
            doc_tokens=doc_tokens,
            doc_tokens_map=entry.get("contexts_map", None),
            answer=answer,
            start_position=start_position,
            end_position=end_position)
        examples.append(example)
    return examples

def to_feature_list(
    self,
    tokenizer,
    max_seq_length,
    doc_stride,
    max_query_length,
    set_type,
):
    is_training = set_type == PHASE.TRAIN
    features = []
    if is_training and not self.is_impossible:
        # Get start and end position.
        start_position = self.start_position
        end_position = self.end_position

        # If the answer cannot be found in the text, then skip this example.
        actual_text = " ".join(self.doc_tokens[start_position:(end_position + 1)])
        cleaned_answer_text = " ".join(whitespace_tokenize(self.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            logger.warning("Could not find answer: '%s' vs. '%s'",
                           actual_text, cleaned_answer_text)
            return []

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(self.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    if is_training and not self.is_impossible:
        tok_start_position = orig_to_tok_index[self.start_position]
        if self.end_position < len(self.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[self.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1

        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position,
            tokenizer, self.answer_text)

    spans = []

    truncated_query = tokenizer.encode(
        self.question_text,
        add_special_tokens=False,
        truncation=True,
        max_length=max_query_length,
    )
    # RoBERTa/CamemBERT insert an extra separator between the pair, so they
    # account for one more added special token.
    sequence_added_tokens = (
        tokenizer.max_len - tokenizer.max_len_single_sentence + 1
        if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer))
        else tokenizer.max_len - tokenizer.max_len_single_sentence)
    sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

    span_doc_tokens = all_doc_tokens
    while len(spans) * doc_stride < len(all_doc_tokens):
        encoded_dict = tokenizer.encode_plus(  # TODO(thom) update this logic
            truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
            span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
            truncation="only_second" if tokenizer.padding_side == "right" else "only_first",
            pad_to_max_length=True,
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
            return_token_type_ids=True,
        )

        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )

        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            if tokenizer.padding_side == "right":
                non_padded_ids = encoded_dict["input_ids"][
                    :encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
            else:
                last_padding_id_position = (
                    len(encoded_dict["input_ids"]) - 1
                    - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id))
                non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1:]
        else:
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        for i in range(paragraph_len):
            index = (len(truncated_query) + sequence_added_tokens + i
                     if tokenizer.padding_side == "right" else i)
            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["truncated_query_with_special_tokens_length"] = (
            len(truncated_query) + sequence_added_tokens)
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len

        spans.append(encoded_dict)

        if "overflowing_tokens" not in encoded_dict or (
                "overflowing_tokens" in encoded_dict
                and len(encoded_dict["overflowing_tokens"]) == 0):
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]

    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(
                spans, doc_span_index, doc_span_index * doc_stride + j)
            index = (j if tokenizer.padding_side == "left"
                     else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j)
            spans[doc_span_index]["token_is_max_context"][index] = is_max_context

    for span in spans:
        # Identify the position of the CLS token.
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for tokens that cannot be in the answer
        # (0 for tokens which can be in an answer).
        # The original TF implementation also keeps the classification token
        # (set to 0) (not sure why...).
        p_mask = np.ones_like(span["token_type_ids"])
        if tokenizer.padding_side == "right":
            p_mask[len(truncated_query) + sequence_added_tokens:] = 0
        else:
            p_mask[-len(span["tokens"]):-(len(truncated_query) + sequence_added_tokens)] = 0

        # Compare against an array: a plain Python list compared to a scalar
        # would not give elementwise results.
        pad_token_indices = np.where(
            np.asarray(span["input_ids"]) == tokenizer.pad_token_id)
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(
                span["input_ids"], already_has_special_tokens=True)).nonzero()

        p_mask[pad_token_indices] = 1
        p_mask[special_token_indices] = 1

        # Set the cls index to 0: the CLS index can be used for impossible answers.
        p_mask[cls_index] = 0

        span_is_impossible = self.is_impossible
        start_position = 0
        end_position = 0
        if is_training and not span_is_impossible:
            # For training, if our document chunk does not contain an
            # annotation we throw it out, since there is nothing to predict.
            doc_start = span["start"]
            doc_end = span["start"] + span["length"] - 1
            out_of_span = False
            # noinspection PyUnboundLocalVariable
            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                out_of_span = True
            if out_of_span:
                start_position = cls_index
                end_position = cls_index
                # We store "is_impossible" at an example level instead.
                # noinspection PyUnusedLocal
                span_is_impossible = True
            else:
                if tokenizer.padding_side == "left":
                    doc_offset = 0
                else:
                    doc_offset = len(truncated_query) + sequence_added_tokens
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        features.append(
            DataRow(
                unique_id="",
                qas_id=self.qas_id,
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                token_is_max_context=span["token_is_max_context"],
                input_ids=np.array(span["input_ids"]),
                input_mask=np.array(span["attention_mask"]),
                segment_ids=np.array(span["token_type_ids"]),
                cls_index=np.array(cls_index),
                p_mask=np.array(p_mask.tolist()),
                paragraph_len=span["paragraph_len"],
                start_position=start_position,
                end_position=end_position,
                answers=self.answers,
                doc_tokens=self.doc_tokens,
            ))
    return features

def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD-style jsonl file into a list of SquadExample.

    The first line of the file is a header and is skipped; each subsequent
    line holds one json paragraph object."""
    with open(input_file, "r", encoding='utf-8') as reader:
        reader.readline()  # skip the header line
        input_data = [json.loads(line) for line in reader]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        for qa in paragraph["qas"]:
            try:
                qas_id = qa["id"]
            except KeyError:
                qas_id = qa["qid"]
            question_text = qa["question"]
            start_positions = []
            end_positions = []
            orig_answer_texts = []
            is_impossible = False
            if is_training:
                if version_2_with_negative:
                    is_impossible = qa["is_impossible"]
                if not is_impossible:
                    flag = True
                    for answer in qa["detected_answers"]:
                        # Recover the answer text from the detected char span.
                        orig_answer_text = paragraph_text[
                            answer["char_spans"][0][0]:answer["char_spans"][0][1] + 1]
                        answer_offset = answer["char_spans"][0][0]
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer["char_spans"][0][1]]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen
                        # it's likely due to weird Unicode stuff so we will
                        # just skip the example.
                        #
                        # Note that this means for training mode, every
                        # example is NOT guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            flag = False
                            break
                        start_positions.append(start_position)
                        end_positions.append(end_position)
                        orig_answer_texts.append(orig_answer_text)
                    if not flag:
                        continue

            example = SquadExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_texts,
                start_position=start_positions,
                end_position=end_positions,
                is_impossible=is_impossible)
            examples.append(example)
    return examples

def squad_convert_example_to_features(example, max_seq_length, doc_stride,
                                      max_query_length, padding_strategy,
                                      is_training):
    # Note: `tokenizer` is expected to be a module-level global here (in the
    # transformers pipeline it is set by a multiprocessing initializer
    # rather than passed as an argument).
    features = []
    if is_training and not example.is_impossible:
        # Get start and end position.
        start_position = example.start_position
        end_position = example.end_position

        # If the answer cannot be found in the text, then skip this example.
        actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
        cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            logger.warning(
                f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'")
            return []

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        if tokenizer.__class__.__name__ in [
                "RobertaTokenizer",
                "LongformerTokenizer",
                "BartTokenizer",
                "RobertaTokenizerFast",
                "LongformerTokenizerFast",
                "BartTokenizerFast",
        ]:
            sub_tokens = tokenizer.tokenize(token, add_prefix_space=True)
        else:
            sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    if is_training and not example.is_impossible:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1

        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.answer_text)

    spans = []

    truncated_query = tokenizer.encode(example.question_text,
                                       add_special_tokens=False,
                                       truncation=True,
                                       max_length=max_query_length)

    # Tokenizers that insert 2 SEP tokens in-between <context> & <question>
    # need special handling in the way they compute the mask of added tokens.
    tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
    sequence_added_tokens = (
        tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1
        if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
        else tokenizer.model_max_length - tokenizer.max_len_single_sentence)
    sequence_pair_added_tokens = (
        tokenizer.model_max_length - tokenizer.max_len_sentences_pair)

    span_doc_tokens = all_doc_tokens
    while len(spans) * doc_stride < len(all_doc_tokens):
        # Define the side we want to truncate / pad and the text/pair sorting.
        if tokenizer.padding_side == "right":
            texts = truncated_query
            pairs = span_doc_tokens
            truncation = TruncationStrategy.ONLY_SECOND
        else:
            texts = span_doc_tokens
            pairs = truncated_query
            truncation = TruncationStrategy.ONLY_FIRST

        encoded_dict = tokenizer.encode_plus(  # TODO(thom) update this logic
            texts,
            pairs,
            truncation=truncation,
            padding=padding_strategy,
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
            return_token_type_ids=True,
        )

        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )

        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            if tokenizer.padding_side == "right":
                non_padded_ids = encoded_dict["input_ids"][
                    :encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
            else:
                last_padding_id_position = (
                    len(encoded_dict["input_ids"]) - 1
                    - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id))
                non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1:]
        else:
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        for i in range(paragraph_len):
            index = (len(truncated_query) + sequence_added_tokens + i
                     if tokenizer.padding_side == "right" else i)
            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["truncated_query_with_special_tokens_length"] = (
            len(truncated_query) + sequence_added_tokens)
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len

        spans.append(encoded_dict)

        if "overflowing_tokens" not in encoded_dict or (
                "overflowing_tokens" in encoded_dict
                and len(encoded_dict["overflowing_tokens"]) == 0):
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]

    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(
                spans, doc_span_index, doc_span_index * doc_stride + j)
            index = (j if tokenizer.padding_side == "left"
                     else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j)
            spans[doc_span_index]["token_is_max_context"][index] = is_max_context

    for span in spans:
        # Identify the position of the CLS token.
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for tokens that cannot be in the answer
        # (0 for tokens which can be in an answer).
        # The original TF implementation also keeps the classification token
        # (set to 0).
        p_mask = np.ones_like(span["token_type_ids"])
        if tokenizer.padding_side == "right":
            p_mask[len(truncated_query) + sequence_added_tokens:] = 0
        else:
            p_mask[-len(span["tokens"]):-(len(truncated_query) + sequence_added_tokens)] = 0

        # Compare against an array: a plain Python list compared to a scalar
        # would not give elementwise results.
        pad_token_indices = np.where(
            np.asarray(span["input_ids"]) == tokenizer.pad_token_id)
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(
                span["input_ids"], already_has_special_tokens=True)).nonzero()

        p_mask[pad_token_indices] = 1
        p_mask[special_token_indices] = 1

        # Set the cls index to 0: the CLS index can be used for impossible answers.
        p_mask[cls_index] = 0

        span_is_impossible = example.is_impossible
        start_position = 0
        end_position = 0
        if is_training and not span_is_impossible:
            # For training, if our document chunk does not contain an
            # annotation we throw it out, since there is nothing to predict.
            doc_start = span["start"]
            doc_end = span["start"] + span["length"] - 1
            out_of_span = False

            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                out_of_span = True

            if out_of_span:
                start_position = cls_index
                end_position = cls_index
                span_is_impossible = True
            else:
                if tokenizer.padding_side == "left":
                    doc_offset = 0
                else:
                    doc_offset = len(truncated_query) + sequence_added_tokens
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        features.append(
            SquadFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                cls_index,
                p_mask.tolist(),
                # unique_id and example_index cannot be set here; they are
                # filled in after the (multiprocess) conversion step.
                example_index=0,
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position,
                is_impossible=span_is_impossible,
                guid=example.guid,
            ))
    return features

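# _new_check_is_max_context is assumed above. For reference, the helper as it
# appears in the transformers SQuAD pipeline: a token is attributed to the
# doc span in which it has the most surrounding context, scored as
# min(left_context, right_context) + 0.01 * span_length.
def _new_check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span["start"] + doc_span["length"] - 1
        if position < doc_span["start"]:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span["start"]
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index
    return cur_span_index == best_span_index
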
def read_squad_example(example: QASample):

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    paragraph_text = example.context
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    qas_id = example.sample_id
    question_text = example.question
    sup_ids = example.sup_ids
    sup_token_pos_ids = []

    answer = example.answer_dict
    orig_answer_text = answer["text"]
    answer_offset = answer["answer_start"]
    answer_length = len(orig_answer_text)
    start_position = char_to_word_offset[answer_offset]
    end_position = char_to_word_offset[answer_offset + answer_length - 1]

    if sup_ids:
        for sup in sup_ids:
            sup_start_position = char_to_word_offset[sup[0]]
            sup_end_position = char_to_word_offset[sup[1] - 1]
            sup_token_pos_ids.append((sup_start_position, sup_end_position))

    # Check that the answer text can be exactly recovered from the document.
    # If it can't (likely due to weird Unicode stuff), we only log a warning,
    # since this function always returns the example.
    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
    cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
    if actual_text.find(cleaned_answer_text) == -1:
        logger.warning("Could not find answer: '%s' vs. '%s'",
                       actual_text, cleaned_answer_text)

    return SquadExample(
        qas_id=qas_id,
        question_text=question_text,
        doc_tokens=doc_tokens,
        orig_answer_text=orig_answer_text,
        start_position=start_position,
        end_position=end_position,
        sup_ids=sup_token_pos_ids)

def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_positions = []
                end_positions = []
                orig_answer_texts = []
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa.get("is_impossible", False)
                    if not is_impossible:
                        flag = True
                        for answer in qa["answers"]:
                            orig_answer_text = answer["text"]
                            answer_offset = answer["answer_start"]
                            answer_length = len(orig_answer_text)
                            start_position = char_to_word_offset[answer_offset]
                            end_position = char_to_word_offset[answer_offset + answer_length - 1]
                            # Only add answers where the text can be exactly
                            # recovered from the document. If this CAN'T
                            # happen it's likely due to weird Unicode stuff
                            # so we will just skip the example.
                            #
                            # Note that this means for training mode, every
                            # example is NOT guaranteed to be preserved.
                            actual_text = " ".join(
                                doc_tokens[start_position:(end_position + 1)])
                            cleaned_answer_text = " ".join(
                                whitespace_tokenize(orig_answer_text))
                            if actual_text.find(cleaned_answer_text) == -1:
                                logger.warning("Could not find answer: '%s' vs. '%s'",
                                               actual_text, cleaned_answer_text)
                                flag = False
                                break
                            start_positions.append(start_position)
                            end_positions.append(end_position)
                            orig_answer_texts.append(orig_answer_text)
                        if not flag:
                            continue

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_texts,
                                       start_position=start_positions,
                                       end_position=end_positions,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples

def read_coqa_examples(input_file, is_training=True, use_history=False,
                       n_history=-1):
    """Read a CoQA json file into a list of QA examples."""
    total_cnt = 0
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        # Process the story text.
        paragraph_text = entry["story"]
        paragraph_id = entry["id"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            # Each char is mapped to a word position.
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Process the questions.
        question_history_texts = []
        for (question, ans) in zip(entry['questions'], entry['answers']):
            total_cnt += 1
            cur_question_text = question["input_text"]
            question_history_texts.append(cur_question_text)
            question_id = question["turn_id"]
            ans_id = ans["turn_id"]
            start_position = None
            end_position = None
            yes_no_flag = None
            yes_no_ans = None
            orig_answer_text = None
            if question_id != ans_id:
                logger.warning("Question turns are not ordered! "
                               "Mismatched question: %s", cur_question_text)
            if is_training:
                orig_answer_text = ans["text"]
                answer_offset = ans["span_start"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                if answer_offset + answer_length >= len(char_to_word_offset):
                    end_position = char_to_word_offset[-1]
                else:
                    end_position = char_to_word_offset[answer_offset + answer_length]
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                yes_no_flag = int(ans["yes_no_flag"])
                yes_no_ans = int(ans["yes_no_ans"])
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    continue
            if use_history:
                if n_history == -1 or n_history > len(question_history_texts):
                    question_texts = question_history_texts[:]
                else:
                    question_texts = question_history_texts[-1 * n_history:]
            else:
                # Wrap in a list: CoQAExample expects a list of question
                # texts (cf. the parallel QuAC reader below).
                question_texts = [question_history_texts[-1]]
            example = CoQAExample(paragraph_id=paragraph_id,
                                  turn_id=question_id,
                                  question_texts=question_texts,
                                  doc_tokens=doc_tokens,
                                  orig_answer_text=orig_answer_text,
                                  start_position=start_position,
                                  end_position=end_position,
                                  yes_no_flag=yes_no_flag,
                                  yes_no_ans=yes_no_ans)
            examples.append(example)
    logger.info("Total raw examples: {}".format(total_cnt))
    return examples

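# A small, self-contained illustration of the history-window selection above:
# n_history == 2 keeps only the two most recent questions (including the
# current one); n_history == -1 keeps the full history.
history = ["q1", "q2", "q3"]
n_history = 2
window = history[:] if n_history == -1 or n_history > len(history) else history[-n_history:]
assert window == ["q2", "q3"]
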
def read_newsqa_examples(input_file, is_training, version_2_with_negative, group):
    """Read a NewsQA json file into a list of NewsqaExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for paragraph in input_data:
        if paragraph['type'] == group:
            paragraph_text = paragraph["text"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qid, qa in enumerate(paragraph["questions"]):
                qas_id = paragraph["storyId"] + str(qid)
                question_text = qa["q"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if 'consensus' not in qa:
                    is_impossible = True
                elif 's' not in qa["consensus"] or 'e' not in qa["consensus"]:
                    is_impossible = True
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if not is_impossible:
                        answer = qa["consensus"]
                        answer_offset = answer["s"]
                        answer_end_offset = answer["e"]
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_end_offset - 1]
                        # 'e' is an exclusive end offset (cf. the reader
                        # below), so the answer text is the [s, e) slice.
                        orig_answer_text = paragraph_text[answer_offset:answer_end_offset]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen
                        # it's likely due to weird Unicode stuff so we will
                        # just skip the example.
                        #
                        # Note that this means for training mode, every
                        # example is NOT guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = NewsqaExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples

def read_newsqa_examples(input_file, is_training, version_2_with_negative=True):
    """Read a NewsQA json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for story in input_data:
        story_text = story["text"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in story_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        for i, qa in enumerate(story["questions"]):
            qas_id = story["storyId"] + '-' + str(i)
            question_text = qa["q"]
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            if version_2_with_negative:
                if ("noAnswer" in qa["consensus"] or "badQuestion" in qa["consensus"]):
                    is_impossible = True
                else:
                    is_impossible = False
            if (len(qa["consensus"]) != 2) and (not is_impossible):
                raise ValueError(
                    "For training, each question should have exactly 1 answer.", qa)
            if not is_impossible:
                answer = qa["consensus"]
                answer_offset = answer["s"]
                answer_length = answer["e"] - answer["s"]
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
                orig_answer_text = story_text[answer["s"]:answer["e"]]
                # Only add answers where the text can be exactly recovered
                # from the document. If this CAN'T happen it's likely due to
                # weird Unicode stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is
                # NOT guaranteed to be preserved.
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""

            example = SquadExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                is_impossible=is_impossible)
            examples.append(example)
    return examples

def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) > 1) and (not is_impossible):
                        # For comparability with this model implementation,
                        # if more than one answer exists we choose the first
                        # one as the correct gold answer.
                        qa["answers"] = [qa["answers"][0]]
                    elif (len(qa["answers"]) == 0) and (not is_impossible):
                        # In the non-SQuAD datasets, it may very well be
                        # possible that no gold answer has been found for an
                        # example. In these cases we just discard the example
                        # in training.
                        continue
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        if answer_offset + answer_length - 1 >= len(char_to_word_offset):
                            # In some datasets we get this edge case.
                            continue
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen
                        # it's likely due to weird Unicode stuff so we will
                        # just skip the example.
                        #
                        # Note that this means for training mode, every
                        # example is NOT guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)]).lower()
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text)).lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            # Retry with the window shifted one word to the
                            # left (handles off-by-one answer offsets).
                            actual_text_1 = " ".join(
                                doc_tokens[(start_position - 1):end_position]).lower()
                            if actual_text_1.find(cleaned_answer_text) == -1:
                                logger.warning("Could not find answer: '%s' vs. '%s'",
                                               actual_text, cleaned_answer_text)
                                continue
                            else:
                                start_position = start_position - 1
                                end_position = end_position - 1
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples

def create_examples(
    examples: list,
    source: str,
    is_training: bool = True,
    multi_qa_type_class: bool = False,
):
    """
    Args:
        examples (list): list of examples
        is_training (bool): whether we want to create examples for training
            or eval mode
    Return:
        list of examples (each example is an instance)
    """
    sources = ['SQuAD', 'SubjQA']
    if source not in sources:
        raise ValueError('Data source must be one of {}'.format(sources))
    if not isinstance(examples, list):
        raise TypeError("Input should be a list of examples.")

    def is_whitespace(char: str):
        if char == " " or char == "\t" or char == "\r" or char == "\n" or ord(char) == 0x202F:
            return True
        return False

    def preproc_context(context: str):
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in context:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)
        return doc_tokens, char_to_word_offset

    example_instances = []
    for example in examples:
        # TODO: figure out whether we should strip off "ANSWERNOTFOUND" from
        # reviews in SubjQA; if not, then start and end positions should be
        # second to the last index (i.e., sequence[-2]) instead of 0 (i.e.,
        # [CLS]), since "ANSWERNOTFOUND" is the last token in each review text.
        if source == 'SQuAD':
            context = example["context"]
        else:
            context = example["review"]
            # Strip the literal sentinel suffix. Note that
            # str.rstrip('ANSWERNOTFOUND') would strip a *character set*,
            # not the suffix, and could eat extra trailing characters.
            if context.endswith('ANSWERNOTFOUND'):
                context = context[:-len('ANSWERNOTFOUND')]
        doc_tokens, char_to_word_offset = preproc_context(context)

        if source == 'SQuAD':
            for qa in example["qas"]:
                qas_id = qa["id"]
                q_text = qa["question"]
                dataset = 'SQuAD'
                start_position = None
                end_position = None
                orig_answer_text = qa['answers'][0]['text'] if len(qa['answers']) == 1 else ''
                is_impossible = qa['is_impossible']
                q_sbj = 2 if multi_qa_type_class else 0
                a_sbj = 2 if multi_qa_type_class else 0
                domain = 'wikipedia'
                # We don't need start and end positions in eval mode.
                if is_training:
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen
                        # it's likely due to weird Unicode stuff so we will
                        # just skip the example.
                        #
                        # Note that this means for training mode, every
                        # example is NOT guaranteed to be preserved.
                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            # Skip example if the answer cannot be recovered
                            # from the document.
                            continue
                    # If the question is NOT answerable, then the answer is
                    # the empty string and start and end positions are 0.
                    else:
                        # Uncomment the line below to skip unanswerable questions.
                        # continue
                        start_position = 0
                        end_position = 0
                        orig_answer_text = ""
                example_instances.append(InputExample(
                    qas_id=qas_id,
                    q_text=q_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible,
                    q_sbj=q_sbj,
                    a_sbj=a_sbj,
                    domain=domain,
                    dataset=dataset,
                ))
        elif source == 'SubjQA':
            qas_id = example['qa_id']
            q_text = example['question']
            dataset = 'SubjQA'
            start_position = None
            end_position = None
            is_impossible = example['is_impossible']
            q_sbj = example['question_subj']
            a_sbj = example['ans_subj']
            domain = example['domain']
            assert len(example['answer']) == 3, (
                "Each answer must consist of an answer text, a start and an "
                "end index of answer span")
            if not is_impossible:
                orig_answer_text = example['answer']['answer_text']
                answer_offset = example['answer']['answer_start']
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                try:
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]
                except IndexError:
                    # Sometimes the original answer text has more whitespace
                    # between tokens than the same char sequence in the review
                    # text, making answer_length too long; fall back to the
                    # annotated answer_end.
                    orig_answer_text = context[answer_offset:example['answer']['answer_end']]
                    answer_length = len(orig_answer_text)
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    # Skip example if the answer cannot be recovered from the
                    # document.
                    continue
            # If the question is NOT answerable, then the answer is the empty
            # string and start and end positions are 0.
            else:
                # Uncomment the line below to skip unanswerable questions (for now).
                # continue
                start_position = 0
                end_position = 0
                orig_answer_text = ""
            example_instances.append(InputExample(
                qas_id=qas_id,
                q_text=q_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                is_impossible=is_impossible,
                q_sbj=q_sbj,
                a_sbj=a_sbj,
                domain=domain,
                dataset=dataset,
            ))
    return example_instances

def read_quac_examples(input_file, is_training=True, use_history=False,
                       n_history=-1):
    """Read QuAC data into a list of QA examples."""
    with open(input_file, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        para_obj = entry['paragraphs'][0]
        paragraph_id = para_obj['id']
        # Process the context paragraph.
        paragraph_text = para_obj['context']
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            # Each char is mapped to a word position.
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Process the questions.
        question_history_texts = []
        for qa in para_obj['qas']:
            cur_question_text = qa['question']
            question_history_texts.append(cur_question_text)
            example_id = qa['id']
            # Word positions.
            start_position = None
            end_position = None
            yes_no_flag = None
            yes_no_ans = None
            followup = None
            orig_answer_text = None
            if is_training:
                answer = qa['answers'][0]
                orig_answer_text = answer["text"]
                answer_offset = answer["answer_start"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                if answer_offset + answer_length >= len(char_to_word_offset):
                    end_position = char_to_word_offset[-1]
                else:
                    end_position = char_to_word_offset[answer_offset + answer_length]
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    continue
                yes_no_flag = int(qa['yesno'] in ['y', 'n'])
                yes_no_ans = int(qa['yesno'] == 'y')
                followup = followup_vocab.index(qa['followup'])
            if use_history:
                if n_history == -1 or len(question_history_texts) <= n_history:
                    questions = question_history_texts[:]
                else:
                    questions = question_history_texts[-1 * n_history:]
            else:
                questions = [question_history_texts[-1]]
            example = QuACExample(
                example_id=example_id,
                questions=questions,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                yes_no_flag=yes_no_flag,
                yes_no_ans=yes_no_ans,
                followup=followup)
            examples.append(example)
    return examples

def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                # Debug subsampling: keep roughly one in ten training
                # questions, capped at 50 examples.
                if is_training and len(examples) >= 50:
                    break
                if is_training and random.randint(1, 10) != 5:
                    continue
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) < 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have at least 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen
                        # it's likely due to weird Unicode stuff so we will
                        # just skip the example.
                        #
                        # Note that this means for training mode, every
                        # example is NOT guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples

def read_quac_examples(input_file, is_training):
    """Read a QuAC json file into a list of CQAExample."""
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        # An additional "CANNOTANSWER" has been added in QuAC data, so no need to append one.
        entry = entry['paragraphs'][0]
        paragraph_text = entry["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        ############################################################
        # convert the conversational QAs to SQuAD format, with history
        ############################################################

        questions = [(item['question'], item['id'])
                     for item in entry['qas']]  # [(question, question_id), ...]
        answers = [(item['orig_answer']['text'], item['orig_answer']['answer_start'])
                   for item in entry['qas']]
        followups = [item['followup'] for item in entry['qas']]
        yesnos = [item['yesno'] for item in entry['qas']]

        qas = []
        for i, (question, answer, followup, yesno) in enumerate(
                zip(questions, answers, followups, yesnos)):
            metadata = {
                'turn': i + 1,
                'history_turns': [],
                'tok_history_answer_markers': [],
                'followup': followup,
                'yesno': yesno,
                'history_turns_text': []
            }
            end_index = i
            question_with_histories = ''
            history_answer_marker = None
            if True:  # always collect markers (originally gated by FLAGS.use_history_answer_marker)
                # We read all the histories whether RL is used or not; the
                # appropriate selections are made afterwards.
                start_index = 0
                history_answer_marker = []
                for history_turn, (each_answer, each_question) in enumerate(
                        zip(answers[start_index:end_index],
                            questions[start_index:end_index])):
                    # [history_answer_start, history_answer_end, history_answer_text]
                    each_marker = [each_answer[1],
                                   each_answer[1] + len(each_answer[0]),
                                   each_answer[0]]
                    history_answer_marker.append(each_marker)
                    metadata['history_turns'].append(history_turn + start_index + 1)
                    metadata['history_turns_text'].append(
                        (each_question[0], each_answer[0]))  # [(q1, a1), (q2, a2), ...]
            else:
                # prepend historical answers to the current question
                start_index = max(end_index - 6, 0)
                for each_answer in answers[start_index:end_index]:
                    question_with_histories += each_answer[0] + ' '

            # add the current question
            question_with_histories += question[0]
            qas.append({
                'id': question[1],
                'question': question_with_histories,
                'answers': [{'answer_start': answer[1], 'text': answer[0]}],
                'history_answer_marker': history_answer_marker,
                'metadata': metadata
            })

        for qa in qas:
            qas_id = qa["id"]
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None

            # We read in the ground-truth answer both during training and prediction,
            # because we need to compute acc and f1 at prediction time.
            if len(qa["answers"]) != 1:
                raise ValueError(
                    "For training, each question should have exactly 1 answer.")
            answer = qa["answers"][0]
            orig_answer_text = answer["text"]
            answer_offset = answer["answer_start"]
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length - 1]

            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
                tokenization_bert.whitespace_tokenize(orig_answer_text))
            if is_training and actual_text.find(cleaned_answer_text) == -1:
                logger.warning("Could not find answer: '%s' vs. '%s'",
                               actual_text, cleaned_answer_text)
                continue

            # We construct tok_history_answer_marker to store the aggregated history
            # answer markers for a question, and each_tok_history_answer_marker to
            # store a single history answer marker.
            tok_history_answer_marker = [0] * len(doc_tokens)
            for marker_index, marker in enumerate(qa['history_answer_marker']):
                each_tok_history_answer_marker = [0] * len(doc_tokens)
                history_orig_answer_text = marker[2]
                history_answer_offset = marker[0]
                history_answer_length = len(history_orig_answer_text)
                history_start_position = char_to_word_offset[history_answer_offset]
                history_end_position = char_to_word_offset[
                    history_answer_offset + history_answer_length - 1]
                history_actual_text = " ".join(
                    doc_tokens[history_start_position:(history_end_position + 1)])
                history_cleaned_answer_text = " ".join(
                    tokenization_bert.whitespace_tokenize(history_orig_answer_text))
                if history_actual_text.find(history_cleaned_answer_text) != -1:
                    tok_history_answer_marker = (
                        tok_history_answer_marker[:history_start_position] +
                        [1] * (history_end_position - history_start_position + 1) +
                        tok_history_answer_marker[history_end_position + 1:])
                    each_tok_history_answer_marker = (
                        each_tok_history_answer_marker[:history_start_position] +
                        [1] * (history_end_position - history_start_position + 1) +
                        each_tok_history_answer_marker[history_end_position + 1:])
                    assert len(tok_history_answer_marker) == len(doc_tokens)
                    assert len(each_tok_history_answer_marker) == len(doc_tokens)
                    qa['metadata']['tok_history_answer_markers'].append(
                        each_tok_history_answer_marker)

            example = CQAExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                history_answer_marker=tok_history_answer_marker,
                metadata=qa['metadata'])
            examples.append(example)
    return examples
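# Hedged illustration, with invented data, of the history-answer-marker
# aggregation in read_quac_examples: each history answer paints 1s over its
# word span, and markers from several turns are OR-ed into a single vector
# via the same slice-and-concatenate pattern as above.
def _demo_history_marker():
    doc_tokens = ["Alice", "was", "born", "in", "Paris", "in", "1920"]
    history_spans = [(4, 4), (6, 6)]  # word-level (start, end) of two past answers
    marker = [0] * len(doc_tokens)
    for start, end in history_spans:
        marker = marker[:start] + [1] * (end - start + 1) + marker[end + 1:]
    assert marker == [0, 0, 0, 0, 1, 0, 1]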
def read_mlqa_examples(input_file, is_training, version_2_with_negative, input_lang):
    """Read an MLQA json file into a list of MlqaExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            if input_lang == "zh":
                try:
                    if "jieba" not in sys.modules:
                        import jieba
                    else:
                        jieba = sys.modules["jieba"]
                except (AttributeError, ImportError):
                    logger.error("Make sure you install jieba "
                                 "(https://github.com/fxsjy/jieba): pip install jieba")
                    raise
                paragraph_text = " ".join(jieba.cut(paragraph_text))
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        raise ValueError(
                            "The MLQA dataset does not contain impossible questions.")
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = MlqaExample(qas_id=qas_id,
                                      question_text=question_text,
                                      doc_tokens=doc_tokens,
                                      orig_answer_text=orig_answer_text,
                                      start_position=start_position,
                                      end_position=end_position,
                                      is_impossible=is_impossible)
                examples.append(example)
    return examples
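# Sketch of why read_mlqa_examples pre-segments Chinese with jieba: the raw
# context has no spaces, so the whitespace-based offset mapping would treat it
# as one giant token. The segmented string below is roughly what
# " ".join(jieba.cut(...)) would produce (hard-coded here so the snippet runs
# without jieba installed).
def _demo_zh_segmentation():
    raw_zh = "北京是中国的首都"
    segmented = "北京 是 中国 的 首都"
    assert segmented.split() == ["北京", "是", "中国", "的", "首都"]
    assert len(raw_zh.split()) == 1  # without segmentation there is a single "word"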
def read_record_examples(input_file, is_training, version_2_with_negative=False): """Read a ReCoRD json file into a list of ReCoRDExample.""" with open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in input_data: # white space tokenization paragraph_text = entry["passage"]["text"].replace('\xa0', ' ') doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) # load entities in passage passage_entities = [] for entity in entry['passage']['entities']: entity_start_offset = entity['start'] entity_end_offset = entity['end'] # some error labeled entities in record dataset if entity_end_offset < entity_start_offset: continue entity_text = paragraph_text[ entity_start_offset:entity_end_offset + 1] passage_entities.append({ 'orig_text': entity_text, 'start_position': char_to_word_offset[entity_start_offset], 'end_position': char_to_word_offset[entity_end_offset] }) for qa in entry["qas"]: qas_id = qa["id"] question_text = qa["query"].replace('\xa0', ' ') start_position = None end_position = None orig_answer_text = None is_impossible = False if is_training: if version_2_with_negative: is_impossible = qa["is_impossible"] # if (len(qa["answers"]) != 1) and (not is_impossible): # raise ValueError( # "For training, each question should have exactly 1 answer." # ) if not is_impossible: # just chose the first one? answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.info("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue else: start_position = -1 end_position = -1 orig_answer_text = "" example = ReCoRDExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, passage_entities=passage_entities, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, is_impossible=is_impossible) examples.append(example) return examples
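# Note on the entity slicing in read_record_examples: ReCoRD's entity 'end'
# offset is inclusive, hence the `end + 1` in the slice. A minimal check with
# made-up offsets:
def _demo_inclusive_entity_span():
    passage = "CNN reported that Obama spoke."
    start, end = 18, 22  # inclusive char span of "Obama"
    assert passage[start:end + 1] == "Obama"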
def bionumqa_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training): features = [] if is_training and not example.is_impossible: # Get start and end position start_position = example.start_position end_position = example.end_position # If the answer cannot be found in the text, then skip this example. actual_text = " ".join( example.doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize( example.answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) return [] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text) spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) sequence_added_tokens = ( tokenizer.max_len - tokenizer.max_len_single_sentence + 1 if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence) sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair all_doc_nums = example.context_nums question_nums = example.question_nums span_doc_tokens = all_doc_tokens doc_num_indices = [ i for i in range(len(all_doc_tokens)) if all_doc_tokens[i] == "[NUM]" ] while len(spans) * doc_stride < len(all_doc_tokens): encoded_dict = tokenizer.encode_plus( truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", return_token_type_ids=True, ) paragraph_len = min( len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens, ) if tokenizer.pad_token_id in encoded_dict["input_ids"]: if tokenizer.padding_side == "right": non_padded_ids = encoded_dict[ "input_ids"][:encoded_dict["input_ids"]. 
index(tokenizer.pad_token_id)] else: last_padding_id_position = ( len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index( tokenizer.pad_token_id)) non_padded_ids = encoded_dict["input_ids"][ last_padding_id_position + 1:] else: non_padded_ids = encoded_dict["input_ids"] tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} for i in range(paragraph_len): index = len( truncated_query ) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] doc_num_start = np.digitize(len(spans) * doc_stride, doc_num_indices) doc_num_end = np.digitize( len(spans) * doc_stride + paragraph_len, doc_num_indices) doc_nums = all_doc_nums[doc_num_start:doc_num_end] nums = question_nums + doc_nums number_mask = [0.0] * max_seq_length number_indice = [i for i in range(len(tokens)) if tokens[i] == '[NUM]'] for index, num in zip(number_indice, nums): number_mask[index] = num encoded_dict["paragraph_len"] = paragraph_len encoded_dict["tokens"] = tokens encoded_dict["token_to_orig_map"] = token_to_orig_map encoded_dict["truncated_query_with_special_tokens_length"] = len( truncated_query) + sequence_added_tokens encoded_dict["token_is_max_context"] = {} encoded_dict["start"] = len(spans) * doc_stride encoded_dict["length"] = paragraph_len encoded_dict["number"] = number_mask spans.append(encoded_dict) if "overflowing_tokens" not in encoded_dict: break span_doc_tokens = encoded_dict["overflowing_tokens"] for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context( spans, doc_span_index, doc_span_index * doc_stride + j) index = (j if tokenizer.padding_side == "left" else spans[doc_span_index] ["truncated_query_with_special_tokens_length"] + j) spans[doc_span_index]["token_is_max_context"][ index] = is_max_context for span in spans: # Identify the position of the CLS token cls_index = span["input_ids"].index(tokenizer.cls_token_id) # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) # Original TF implem also keep the classification token (set to 0) p_mask = np.ones_like(span["token_type_ids"]) if tokenizer.padding_side == "right": p_mask[len(truncated_query) + sequence_added_tokens:] = 0 else: p_mask[-len(span["tokens"]):-(len(truncated_query) + sequence_added_tokens)] = 0 pad_token_indices = np.where( span["input_ids"] == tokenizer.pad_token_id) special_token_indices = np.asarray( tokenizer.get_special_tokens_mask( span["input_ids"], already_has_special_tokens=True)).nonzero() p_mask[pad_token_indices] = 1 p_mask[special_token_indices] = 1 # Set the cls index to 0: the CLS index can be used for impossible answers p_mask[cls_index] = 0 span_is_impossible = example.is_impossible start_position = 0 end_position = 0 if is_training and not span_is_impossible: # For training, if our document chunk does not contain an annotation # we throw it out, since there is nothing to predict. 
doc_start = span["start"] doc_end = span["start"] + span["length"] - 1 out_of_span = False if not (tok_start_position >= doc_start and tok_end_position <= doc_end): out_of_span = True if out_of_span: start_position = cls_index end_position = cls_index span_is_impossible = True else: if tokenizer.padding_side == "left": doc_offset = 0 else: doc_offset = len(truncated_query) + sequence_added_tokens start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset features.append( BioNumQAFeatures( span["input_ids"], span["attention_mask"], span["token_type_ids"], cls_index, p_mask.tolist(), example_index= 0, # Can not set unique_id and example_index here. They will be set after multiple processing. unique_id=0, paragraph_len=span["paragraph_len"], token_is_max_context=span["token_is_max_context"], tokens=span["tokens"], token_to_orig_map=span["token_to_orig_map"], start_position=start_position, end_position=end_position, is_impossible=span_is_impossible, qas_id=example.qas_id, number=span['number'])) return features
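# Hedged sketch (invented values) of the np.digitize bookkeeping in
# bionumqa_convert_example_to_features: given the sorted token positions of
# the '[NUM]' placeholders, digitize counts how many fall before a window
# boundary, so a slice recovers exactly the numbers visible in one doc_stride
# window.
import numpy as np

def _demo_num_window():
    doc_num_indices = [3, 10, 25]    # token positions of '[NUM]'
    all_doc_nums = [1.5, 42.0, 0.7]  # the numbers they stand for
    window_start, window_len = 8, 12
    lo = np.digitize(window_start, doc_num_indices)               # -> 1
    hi = np.digitize(window_start + window_len, doc_num_indices)  # -> 2
    assert all_doc_nums[lo:hi] == [42.0]  # only the '[NUM]' at position 10 is in view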
def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training): features = [] if is_training and not example.is_impossible: # Get start and end position if example.question_type == 'factoid': start_position = example.start_position end_position = example.end_position # If the answer cannot be found in the text, then skip this example. actual_text = " ".join( example.doc_tokens[start_position:(end_position + 1)]) else: actual_text = "".join([ example.doc_sent[e] for e in example.pointing_answer ]).strip() cleaned_answer_text = " ".join(whitespace_tokenize( example.answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) return [] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] all_sent_positions = [] all_tok_to_sep_idx = {} for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) if token == '|': all_sent_positions.append(len(all_doc_tokens)) sub_tokens = ['[SEP]'] for sub_token in sub_tokens: all_tok_to_sep_idx[len( all_doc_tokens)] = len(all_sent_positions) - 1 tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) if is_training and not example.is_impossible: if example.question_type == "narrative": sent_start_position = example.pointing_answer[0] sent_end_position = example.pointing_answer[-1] tok_start_position = 0 tok_end_position = 0 else: # center = int((example.char_start_position*2 + len(example.answer_text))/2) # start_for_sent = center-50 if center >= 50 else 0 # end_for_sent = center + 50 if center +50 < len(example.char_to_sent_offset) else len(example.char_to_sent_offset) -1 start_for_sent = example.char_start_position end_for_sent = example.char_start_position + len( example.answer_text) - 1 sent_start_position = example.char_to_sent_offset[start_for_sent] sent_end_position = example.char_to_sent_offset[end_for_sent] tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text) spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) sequence_added_tokens = ( tokenizer.max_len - tokenizer.max_len_single_sentence + 1 if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence) sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair span_doc_tokens = all_doc_tokens while len(spans) * doc_stride < len(all_doc_tokens): encoded_dict = tokenizer.encode_plus( truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", return_token_type_ids=True, ) paragraph_len = min( len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens, ) if tokenizer.pad_token_id in 
encoded_dict["input_ids"]: if tokenizer.padding_side == "right": non_padded_ids = encoded_dict[ "input_ids"][:encoded_dict["input_ids"]. index(tokenizer.pad_token_id)] else: last_padding_id_position = ( len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index( tokenizer.pad_token_id)) non_padded_ids = encoded_dict["input_ids"][ last_padding_id_position + 1:] else: non_padded_ids = encoded_dict["input_ids"] tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} token_to_orig_sent_map = {} token_to_cur_sent_map = {} cur_sent_to_orig_sent_map = {} for i in range(paragraph_len): index = len( truncated_query ) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] token_to_orig_sent_map[index] = all_tok_to_sep_idx[len(spans) * doc_stride + i] token_to_cur_sent_map[index] = all_tok_to_sep_idx[ len(spans) * doc_stride + i] - all_tok_to_sep_idx[len(spans) * doc_stride] + 1 cur_sent_to_orig_sent_map = { token_to_cur_sent_map[e]: token_to_orig_sent_map[e] for e in token_to_cur_sent_map.keys() if token_to_cur_sent_map[e] not in cur_sent_to_orig_sent_map.keys() } encoded_dict["paragraph_len"] = paragraph_len encoded_dict["question_mask"] = [ 1 - e for e in encoded_dict["token_type_ids"] ] encoded_dict["tokens"] = tokens encoded_dict["token_to_orig_map"] = token_to_orig_map encoded_dict["token_to_orig_sent_map"] = token_to_orig_sent_map encoded_dict["truncated_query_with_special_tokens_length"] = len( truncated_query) + sequence_added_tokens encoded_dict["token_is_max_context"] = {} encoded_dict["start"] = len(spans) * doc_stride encoded_dict["length"] = paragraph_len encoded_dict["question_type"] = question_type2idx[ example.question_type] if example.question_type else None encoded_dict["answer_type"] = answer_type2idx[ example.answer_type] if example.answer_type else None encoded_dict["sentence_mask"] = [0] * (len(truncated_query) + 2) + [ token_to_cur_sent_map[k] for k in token_to_cur_sent_map.keys() ] encoded_dict["sentence_mask"] += [0] * ( 512 - len(encoded_dict["sentence_mask"])) encoded_dict["cur_sent_to_orig_sent_map"] = cur_sent_to_orig_sent_map spans.append(encoded_dict) if "overflowing_tokens" not in encoded_dict: break span_doc_tokens = encoded_dict["overflowing_tokens"] for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context( spans, doc_span_index, doc_span_index * doc_stride + j) index = (j if tokenizer.padding_side == "left" else spans[doc_span_index] ["truncated_query_with_special_tokens_length"] + j) spans[doc_span_index]["token_is_max_context"][ index] = is_max_context for span in spans: # Identify the position of the CLS token cls_index = span["input_ids"].index(tokenizer.cls_token_id) # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) # Original TF implem also keep the classification token (set to 0) p_mask = np.ones_like(span["token_type_ids"]) if tokenizer.padding_side == "right": p_mask[len(truncated_query) + sequence_added_tokens:] = 0 else: p_mask[-len(span["tokens"]):-(len(truncated_query) + sequence_added_tokens)] = 0 pad_token_indices = np.where( span["input_ids"] == tokenizer.pad_token_id) special_token_indices = np.asarray( tokenizer.get_special_tokens_mask( span["input_ids"], already_has_special_tokens=True)).nonzero() p_mask[pad_token_indices] = 1 p_mask[special_token_indices] = 1 # Set the cls index to 0: 
the CLS index can be used for impossible answers p_mask[cls_index] = 0 span_is_impossible = example.is_impossible t_start_position = 0 t_end_position = 0 s_start_position = 0 s_end_position = 0 if is_training and not span_is_impossible: # For training, if our document chunk does not contain an annotation # we throw it out, since there is nothing to predict. if span["question_type"] == 1: doc_start = span["start"] doc_end = span["start"] + span["length"] - 1 out_of_span = False if not (tok_start_position >= doc_start and tok_end_position <= doc_end): out_of_span = True if out_of_span: t_start_position = cls_index t_end_position = cls_index span_is_impossible = True else: if tokenizer.padding_side == "left": doc_offset = 0 else: doc_offset = len( truncated_query) + sequence_added_tokens t_start_position = tok_start_position - doc_start + doc_offset t_end_position = tok_end_position - doc_start + doc_offset sent_doc_start = all_tok_to_sep_idx[span["start"]] sent_doc_end = all_tok_to_sep_idx[span["start"] + span["length"] - 1] sent_start_position = sent_start_position if sent_start_position >= sent_doc_start else sent_doc_start sent_end_position = sent_end_position if sent_end_position <= sent_doc_end else sent_doc_end if tokenizer.padding_side == "left": doc_offset = 0 else: doc_offset = 1 s_start_position = sent_start_position - sent_doc_start + doc_offset s_end_position = sent_end_position - sent_doc_start + doc_offset else: sent_doc_start = all_tok_to_sep_idx[span["start"]] sent_doc_end = all_tok_to_sep_idx[span["start"] + span["length"] - 1] out_of_span = False if not (sent_start_position >= sent_doc_start and sent_end_position <= sent_doc_end): out_of_span = True if out_of_span: span_is_impossible = True else: if tokenizer.padding_side == "left": doc_offset = 0 else: doc_offset = 1 s_start_position = sent_start_position - sent_doc_start + doc_offset s_end_position = sent_end_position - sent_doc_start + doc_offset if span_is_impossible and random.random() > 0.5 and is_training: continue features.append( SquadFeatures( span["input_ids"], span["attention_mask"], span["token_type_ids"], span["question_mask"], span["sentence_mask"], span["cur_sent_to_orig_sent_map"], cls_index, p_mask.tolist(), example_index= 0, # Can not set unique_id and example_index here. They will be set after multiple processing. unique_id=0, paragraph_len=span["paragraph_len"], token_is_max_context=span["token_is_max_context"], tokens=span["tokens"], token_to_orig_map=span["token_to_orig_map"], token_to_orig_sent_map=span["token_to_orig_sent_map"], sent_start_position=s_start_position, sent_end_position=s_end_position, tok_start_position=t_start_position, tok_end_position=t_end_position, question_type=span["question_type"], answer_type=span["answer_type"], is_impossible=span_is_impossible, qas_id=example.qas_id, )) return features
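# Small illustration (toy values) of the sentence-map composition in the
# function above: token -> original-sentence id and token -> chunk-local
# sentence id are combined into a chunk-local -> original sentence lookup.
def _demo_sentence_maps():
    token_to_orig_sent_map = {5: 7, 6: 7, 7: 8}
    token_to_cur_sent_map = {5: 1, 6: 1, 7: 2}
    cur_sent_to_orig_sent_map = {
        cur: token_to_orig_sent_map[tok]
        for tok, cur in token_to_cur_sent_map.items()
    }
    assert cur_sent_to_orig_sent_map == {1: 7, 2: 8}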
def Get_Date_From_DataSet(input_file):
    examples = []
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]
    for entry in input_data:  # entry is a dict made up of a title and paragraphs
        for paragraph in entry["paragraphs"]:  # iterate over the paragraph dicts
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:  # handles runs of multiple whitespace characters
                if is_whitespace(c):  # c is whitespace: set the preceding-whitespace flag
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:  # previous char was whitespace: start a new token
                        doc_tokens.append(c)
                    else:  # otherwise extend the current token
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)  # record the word index of every char

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                is_impossible = qa["is_impossible"]
                if not is_impossible:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]  # char position where the answer starts
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]  # word index of the answer start
                    end_position = char_to_word_offset[
                        answer_offset + answer_length - 1]  # word index of the answer end
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])  # answer recovered via start/end positions
                    cleaned_answer_text = " ".join(
                        whitespace_tokenize(orig_answer_text))  # answer given in the dataset
                    # Compare the recovered text with the dataset answer; warn and
                    # skip the example if the dataset answer cannot be found in it.
                    if actual_text.find(cleaned_answer_text) == -1:
                        logger.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples
def read_squad_examples_jb(input_file, is_training, version_2_with_negative): """Read a SQuAD json file into a list of SquadExample.""" if isinstance(input_file, str): with open(input_file, "r", encoding="utf-8") as reader: input_data = json.load(reader)["data"] else: input_data = input_file examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if _is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] try: retriever_score = qa["retriever_score"] except KeyError: retriever_score = 0 start_position = None end_position = None orig_answer_text = None is_impossible = False if is_training: if version_2_with_negative: is_impossible = qa["is_impossible"] if (len(qa["answers"]) != 1) and (not is_impossible): raise ValueError( "For training, each question should have exactly 1 answer." ) if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[ answer_offset + answer_length - 1 ] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position : (end_position + 1)] ) cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text) ) if actual_text.find(cleaned_answer_text) == -1: logger.warning( "Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text, ) continue else: start_position = -1 end_position = -1 orig_answer_text = "" examples.append( SquadExampleJB( qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, is_impossible=is_impossible, paragraph=paragraph_text, title=entry["title"], retriever_score=retriever_score, ) ) return examples
def convert_to_example(self,
                       question_text,
                       qas_id=None,
                       paragraph_text=None,
                       char_to_word_offset=None,
                       doc_tokens=None,
                       is_impossible=False,
                       answer=None,
                       answer_offset=None):
    """
    - qas_id: int
    - question_text: string
    - paragraph_text: string. If char_to_word_offset and doc_tokens already
      exist, this can be left as None.
    - char_to_word_offset: list, an intermediate result of preprocessing the paragraph text
    - doc_tokens: list, an intermediate result of preprocessing the paragraph text
    - is_impossible: bool
    - answer: string
    - answer_offset: int, the answer's character offset within the paragraph
    """

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    if char_to_word_offset is None or len(char_to_word_offset) < 1:
        if char_to_word_offset is None:
            doc_tokens = []
            char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

    start_position = None
    end_position = None
    orig_answer_text = None
    if self.is_training:
        if not is_impossible:
            orig_answer_text = answer
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length - 1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                print("Could not find answer: '%s' vs. '%s'" %
                      (actual_text, cleaned_answer_text))
                return None
        else:
            start_position = -1
            end_position = -1
            orig_answer_text = ""
    else:
        is_impossible = False

    example = SquadExample(qas_id=qas_id,
                           question_text=question_text,
                           doc_tokens=doc_tokens,
                           orig_answer_text=orig_answer_text,
                           start_position=start_position,
                           end_position=end_position,
                           is_impossible=is_impossible)
    return example
def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training): features = [] if is_training and not example.is_impossible: # Get start and end position start_position = example.start_position end_position = example.end_position # If the answer cannot be found in the text, then skip this example. actual_text = " ".join(example.doc_tokens[start_position: (end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) if actual_text.find(cleaned_answer_text) == -1: # logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) return [] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text ) spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) # print("truncated query: {}".format(truncated_query)) sequence_added_tokens = ( tokenizer.max_len - tokenizer.max_len_single_sentence + 1 if "roberta" in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence ) sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair span_doc_tokens = all_doc_tokens # print("span doc tokens: {}".format(span_doc_tokens)) while len(spans) * doc_stride < len(all_doc_tokens): encoded_dict = tokenizer.encode_plus( truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", ) paragraph_len = min( len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens, ) if tokenizer.pad_token_id in encoded_dict["input_ids"]: non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] else: non_padded_ids = encoded_dict["input_ids"] tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} for i in range(paragraph_len): index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] encoded_dict["paragraph_len"] = paragraph_len encoded_dict["tokens"] = tokens encoded_dict["token_to_orig_map"] = token_to_orig_map encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens encoded_dict["token_is_max_context"] = {} encoded_dict["start"] = len(spans) * doc_stride encoded_dict["length"] = paragraph_len spans.append(encoded_dict) if "overflowing_tokens" not in encoded_dict: break span_doc_tokens = encoded_dict["overflowing_tokens"] for doc_span_index in 
range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) index = ( j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j ) spans[doc_span_index]["token_is_max_context"][index] = is_max_context for span in spans: # Identify the position of the CLS token cls_index = span["input_ids"].index(tokenizer.cls_token_id) # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) # Original TF implem also keep the classification token (set to 0) (not sure why...) p_mask = np.array(span["token_type_ids"]) p_mask = np.minimum(p_mask, 1) if tokenizer.padding_side == "right": # Limit positive values to one p_mask = 1 - p_mask p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1 # Set the CLS index to '0' p_mask[cls_index] = 0 span_is_impossible = example.is_impossible start_position = 0 end_position = 0 if is_training and not span_is_impossible: # For training, if our document chunk does not contain an annotation # we throw it out, since there is nothing to predict. doc_start = span["start"] doc_end = span["start"] + span["length"] - 1 out_of_span = False if not (tok_start_position >= doc_start and tok_end_position <= doc_end): out_of_span = True if out_of_span: start_position = cls_index end_position = cls_index span_is_impossible = True else: if tokenizer.padding_side == "left": doc_offset = 0 else: doc_offset = len(truncated_query) + sequence_added_tokens start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset features.append( SquadFeatures( span["input_ids"], span["attention_mask"], span["token_type_ids"], cls_index, p_mask.tolist(), example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. unique_id=0, paragraph_len=span["paragraph_len"], token_is_max_context=span["token_is_max_context"], tokens=span["tokens"], token_to_orig_map=span["token_to_orig_map"], start_position=start_position, end_position=end_position, ) ) return features
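# Hedged toy reconstruction of the p_mask logic above for a right-padded
# BERT-style tokenizer (ids 101/102 assumed for [CLS]/[SEP]): token_type_ids
# are 0 for the question and 1 for the context, so 1 - type_id masks the
# question out, SEP positions are re-masked, and CLS is left available for
# impossible answers.
import numpy as np

def _demo_p_mask():
    token_type_ids = np.array([0, 0, 0, 1, 1, 1, 1])  # [CLS] q q [SEP] c c [SEP]
    input_ids = np.array([101, 7, 102, 5, 6, 8, 102])
    p_mask = 1 - np.minimum(token_type_ids, 1)
    p_mask[np.where(input_ids == 102)[0]] = 1  # SEP can never be the answer
    p_mask[0] = 0                              # CLS index stays available
    assert p_mask.tolist() == [0, 1, 1, 0, 0, 0, 1]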
def read_squad_examples(input_file, is_training, do_lower_case=False):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(paragraph_text, do_lower_case)
            doc_tokens = []            # the words, as [word0, word1, ...]
            char_to_word_offset = []   # the word index for each char
            temp_word = ""
            # Split on whitespace (like split()), accumulating chars until they
            # match the next token produced by customize_tokenizer.
            for c in paragraph_text:
                if _is_whitespace(c):
                    char_to_word_offset.append(len(doc_tokens) - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(len(doc_tokens))
                if do_lower_case:
                    temp_word = temp_word.lower()
                if temp_word == raw_doc_tokens[len(doc_tokens)]:
                    doc_tokens.append(temp_word)
                    temp_word = ""

            assert len(doc_tokens) == len(raw_doc_tokens)
            assert doc_tokens == raw_doc_tokens

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_training:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    if len(qa["answers"]) != 1:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if orig_answer_text not in paragraph_text:
                        logger.warning("Could not find answer")
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""
                    else:
                        answer_offset = paragraph_text.index(orig_answer_text)
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]  # start word index
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = "".join(doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(whitespace_tokenize(orig_answer_text))
                        if do_lower_case:
                            cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text,
                                cleaned_answer_text,
                            )
                            continue

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=False,
                )
                examples.append(example)
    return examples
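# Simplified sketch (stand-in tokenizer output, no lower-casing) of the
# alignment loop in the reader above: characters are accumulated until they
# reproduce the next reference token, which keeps char_to_word_offset
# consistent with customize_tokenizer's output.
def _demo_align(paragraph_text, raw_doc_tokens):
    doc_tokens, offsets, temp = [], [], ""
    for c in paragraph_text:
        if c.isspace():
            offsets.append(len(doc_tokens) - 1)
            continue
        temp += c
        offsets.append(len(doc_tokens))
        if temp == raw_doc_tokens[len(doc_tokens)]:
            doc_tokens.append(temp)
            temp = ""
    return doc_tokens, offsets

assert _demo_align("ab cd", ["ab", "cd"]) == (["ab", "cd"], [0, 0, 0, 1, 1])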
def tokenize(self, text): return whitespace_tokenize(text)
def fincausal_convert_example_to_features(example: FinCausalExample, max_seq_length: int, doc_stride: int, is_training: bool) -> List[FinCausalFeatures]: features = [] if is_training: # Get start and end position start_cause_position = example.start_cause_position end_cause_position = example.end_cause_position start_effect_position = example.start_effect_position end_effect_position = example.end_effect_position # If the cause cannot be found in the text, then skip this example. actual_cause_text = " ".join(example.doc_tokens[start_cause_position: (end_cause_position + 1)]) cleaned_cause_text = " ".join(whitespace_tokenize(_run_split_on_punc(example.cause_text))) if actual_cause_text.find(cleaned_cause_text) == -1: logger.warning("Could not find cause: '%s' vs. '%s'", actual_cause_text, cleaned_cause_text) return [] # If the effect cannot be found in the text, then skip this example. actual_effect_text = " ".join(example.doc_tokens[start_effect_position: (end_effect_position + 1)]) cleaned_effect_text = " ".join(whitespace_tokenize(_run_split_on_punc(example.effect_text))) if actual_effect_text.find(cleaned_effect_text) == -1: logger.warning("Could not find effect: '%s' vs. '%s'", actual_effect_text, cleaned_effect_text) return [] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) if is_training: tok_cause_start_position = orig_to_tok_index[example.start_cause_position] if example.end_cause_position < len(example.doc_tokens) - 1: tok_cause_end_position = orig_to_tok_index[example.end_cause_position + 1] - 1 else: tok_cause_end_position = len(all_doc_tokens) - 1 (tok_cause_start_position, tok_cause_end_position) = _improve_answer_span( all_doc_tokens, tok_cause_start_position, tok_cause_end_position, tokenizer, example.cause_text ) tok_effect_start_position = orig_to_tok_index[example.start_effect_position] if example.end_effect_position < len(example.doc_tokens) - 1: tok_effect_end_position = orig_to_tok_index[example.end_effect_position + 1] - 1 else: tok_effect_end_position = len(all_doc_tokens) - 1 (tok_effect_start_position, tok_effect_end_position) = _improve_answer_span( all_doc_tokens, tok_effect_start_position, tok_effect_end_position, tokenizer, example.effect_text ) if example.offset_sentence_2 > 0: tok_sentence_2_offset = orig_to_tok_index[example.offset_sentence_2 + 1] - 1 else: tok_sentence_2_offset = None if example.offset_sentence_3 > 0: tok_sentence_3_offset = orig_to_tok_index[example.offset_sentence_3 + 1] - 1 else: tok_sentence_3_offset = None spans: List[BatchEncoding] = [] sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence span_doc_tokens = all_doc_tokens while len(spans) * doc_stride < len(all_doc_tokens): encoded_dict: BatchEncoding = tokenizer.encode_plus(span_doc_tokens, max_length=max_seq_length, return_overflowing_tokens=True, pad_to_max_length=True, stride=max_seq_length - doc_stride - sequence_added_tokens - 1, truncation_strategy="only_first", truncation=True, return_token_type_ids=True, ) paragraph_len = min( len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - sequence_added_tokens, ) if tokenizer.pad_token_id in encoded_dict["input_ids"]: if tokenizer.padding_side == "right": non_padded_ids = encoded_dict.data["input_ids"][ : 
encoded_dict.data["input_ids"].index(tokenizer.pad_token_id)] else: last_padding_id_position = ( len(encoded_dict.data["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id) ) non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1:] else: non_padded_ids = encoded_dict["input_ids"] tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} for i in range(paragraph_len): index = sequence_added_tokens + i token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] encoded_dict["paragraph_len"] = paragraph_len encoded_dict["tokens"] = tokens encoded_dict["token_to_orig_map"] = token_to_orig_map encoded_dict["token_is_max_context"] = {} encoded_dict["start"] = len(spans) * doc_stride encoded_dict["length"] = paragraph_len spans.append(encoded_dict) if len(encoded_dict.get("overflowing_tokens", [])) == 0: break span_doc_tokens = encoded_dict["overflowing_tokens"] for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index].data["paragraph_len"]): is_max_context = _check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) spans[doc_span_index].data["token_is_max_context"][j] = is_max_context for span in spans: # Identify the position of the CLS token cls_index = span.data["input_ids"].index(tokenizer.cls_token_id) p_mask = np.ones(len(span.data["token_type_ids"])) p_mask[np.where(np.array(span.data["input_ids"]) == tokenizer.sep_token_id)[0]] = 1 # Set the CLS index to '0' p_mask[cls_index] = 0 span_is_impossible = False cause_start_position = 0 cause_end_position = 0 effect_start_position = 0 effect_end_position = 0 doc_start = span.data["start"] doc_end = span.data["start"] + span.data["length"] - 1 out_of_span = False if tokenizer.padding_side == "left": doc_offset = 0 else: doc_offset = sequence_added_tokens if tok_sentence_2_offset is not None: sentence_2_offset = tok_sentence_2_offset - doc_start + doc_offset else: sentence_2_offset = None if tok_sentence_3_offset is not None: sentence_3_offset = tok_sentence_3_offset - doc_start + doc_offset else: sentence_3_offset = None if is_training: # For training, if our document chunk does not contain an annotation # we throw it out, since there is nothing to predict. 
if not (tok_cause_start_position >= doc_start and tok_cause_end_position <= doc_end and tok_effect_start_position >= doc_start and tok_effect_end_position <= doc_end): out_of_span = True if out_of_span: cause_start_position = cls_index cause_end_position = cls_index effect_start_position = cls_index effect_end_position = cls_index span_is_impossible = True else: cause_start_position = tok_cause_start_position - doc_start + doc_offset cause_end_position = tok_cause_end_position - doc_start + doc_offset effect_start_position = tok_effect_start_position - doc_start + doc_offset effect_end_position = tok_effect_end_position - doc_start + doc_offset features.append( FinCausalFeatures( span["input_ids"], span["attention_mask"], span["token_type_ids"], cls_index, p_mask.tolist(), example_orig_index=example.example_id, example_index=0, unique_id=0, paragraph_len=span["paragraph_len"], token_is_max_context=span["token_is_max_context"], tokens=span["tokens"], token_to_orig_map=span["token_to_orig_map"], cause_start_position=cause_start_position, cause_end_position=cause_end_position, effect_start_position=effect_start_position, effect_end_position=effect_end_position, sentence_2_offset=sentence_2_offset, sentence_3_offset=sentence_3_offset, is_impossible=span_is_impossible, ) ) return features
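# Sketch of the doc_stride chunking loop shared by the feature converters in
# this file: window k starts at k * doc_stride, and the loop stops once the
# windows have walked past the end of the document.
def _window_starts(n_tokens, doc_stride):
    starts, k = [], 0
    while k * doc_stride < n_tokens:
        starts.append(k * doc_stride)
        k += 1
    return starts

assert _window_starts(n_tokens=10, doc_stride=4) == [0, 4, 8]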