def test_is_whitespace(self):
  self.assertTrue(tokenization._is_whitespace(u" "))
  self.assertTrue(tokenization._is_whitespace(u"\t"))
  self.assertTrue(tokenization._is_whitespace(u"\r"))
  self.assertTrue(tokenization._is_whitespace(u"\n"))
  self.assertTrue(tokenization._is_whitespace(u"\u00A0"))

  self.assertFalse(tokenization._is_whitespace(u"A"))
  self.assertFalse(tokenization._is_whitespace(u"-"))
def customize_tokenizer(text, do_lower_case=False):
  """Splits text so that every CJK character, punctuation mark, whitespace or
  control character becomes its own whitespace-delimited token."""
  # BasicTokenizer is only needed here for its _is_chinese_char helper.
  tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
  temp_x = ""
  text = tokenization.convert_to_unicode(text)
  for c in text:
    if (tokenizer._is_chinese_char(ord(c)) or
        tokenization._is_punctuation(c) or
        tokenization._is_whitespace(c) or
        tokenization._is_control(c)):
      temp_x += " " + c + " "
    else:
      temp_x += c
  if do_lower_case:
    temp_x = temp_x.lower()
  return temp_x.split()
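# A minimal usage sketch (illustrative only; it assumes BERT's `tokenization`
# module is importable, as it is above). Mixed Chinese/English text is split
# into one token per CJK character or punctuation mark, plus one token per
# whitespace-delimited Latin word; the sample sentence and output are assumptions:
#
#   >>> customize_tokenizer(u"我爱 NLP。", do_lower_case=True)
#   [u'我', u'爱', u'nlp', u'。']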
def _read_json(self, input_file: str, is_training: bool, do_lower_case: bool):
  """Reads a SQuAD-style JSON file into a list of SquadExample objects."""
  with tf.gfile.Open(input_file, "r") as reader:
    input_data = json.load(reader)["data"]

  examples = []
  for entry in tqdm(input_data, desc=input_file):
    for paragraph in entry["paragraphs"]:
      paragraph_text = paragraph["context"]
      raw_doc_tokens = customize_tokenizer(paragraph_text,
                                           do_lower_case=do_lower_case)

      # Rebuild the token list from the raw text so that every character can
      # be mapped to the index of the token it belongs to.
      doc_tokens = []
      char_to_word_offset = []
      k = 0
      temp_word = ""
      for c in paragraph_text:
        if tokenization._is_whitespace(c):
          # Whitespace belongs to no token; map it to the previous token.
          char_to_word_offset.append(k - 1)
          continue
        else:
          temp_word += c
          char_to_word_offset.append(k)
        if do_lower_case:
          temp_word = temp_word.lower()
        if temp_word == raw_doc_tokens[k]:
          doc_tokens.append(temp_word)
          temp_word = ""
          k += 1
      assert k == len(raw_doc_tokens)

      for qa in paragraph["qas"]:
        qas_id = qa["id"]
        question_text = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None

        if is_training:
          answer = qa["answers"][0]
          orig_answer_text = answer["text"]
          if orig_answer_text not in paragraph_text:
            tf.logging.warning("Could not find answer")
          else:
            answer_offset = paragraph_text.index(orig_answer_text)
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length - 1]

            # Skip examples whose answer span cannot be recovered exactly
            # from the tokenized document.
            actual_text = "".join(
                doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = "".join(
                tokenization.whitespace_tokenize(orig_answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
              tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                 actual_text, cleaned_answer_text)
              continue

        example = SquadExample(
            qas_id=qas_id,
            question_text=question_text,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position)
        examples.append(example)
  return examples
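# Hedged usage sketch: _read_json is a method (note `self`), so it is assumed
# to live on a data-reader class defined elsewhere in this repo. A hypothetical
# call site, with a made-up file name, would look like:
#
#   examples = reader._read_json("cmrc2018_train.json",
#                                is_training=True,
#                                do_lower_case=True)
#   # -> list of SquadExample, with start/end positions indexing doc_tokens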