def test_is_whitespace(self):
    self.assertTrue(tokenization._is_whitespace(u" "))
    self.assertTrue(tokenization._is_whitespace(u"\t"))
    self.assertTrue(tokenization._is_whitespace(u"\r"))
    self.assertTrue(tokenization._is_whitespace(u"\n"))
    self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
    self.assertFalse(tokenization._is_whitespace(u"A"))
    self.assertFalse(tokenization._is_whitespace(u"-"))
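# For reference, a minimal sketch of the helper these tests exercise,
# assuming the repo uses the stock BERT tokenization._is_whitespace:
# U+00A0 passes because its Unicode category is "Zs" (space separator).
import unicodedata

def _is_whitespace_sketch(char):
    # \t, \n, and \r are technically control characters, but they are
    # conventionally treated as whitespace.
    if char in (" ", "\t", "\n", "\r"):
        return True
    return unicodedata.category(char) == "Zs"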
# Requires the BERT `tokenization` module; `args_in_use`, `customize_tokenizer`,
# and `SquadExample` come from the surrounding script.
import json

import tensorflow as tf

import tokenization


def read_squad_examples(input_file):
    """Read a SQuAD json file into a list of SquadExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(
                paragraph_text, do_lower_case=args_in_use.do_lower_case)
            doc_tokens = []
            char_to_word_offset = []
            k = 0
            temp_word = ""
            for c in paragraph_text:
                # Whitespace never starts a token; map it back to the
                # preceding token's index.
                if tokenization._is_whitespace(c) or not c.split():
                    char_to_word_offset.append(k - 1)
                    continue
                temp_word += c
                char_to_word_offset.append(k)
                if args_in_use.do_lower_case:
                    temp_word = temp_word.lower()
                # Once the accumulated characters match the k-th token from
                # customize_tokenizer, emit it and move on to the next token.
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1
            if k != len(raw_doc_tokens):
                print(paragraph)
                print(doc_tokens)
                print(raw_doc_tokens)
            assert k == len(raw_doc_tokens)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position)
                examples.append(example)

    tf.logging.info("**********read_squad_examples complete!**********")
    return examples
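# `SquadExample` is assumed to match the container from Google's run_squad.py;
# a minimal sketch for context (not necessarily the repo's own class):
class SquadExample(object):
    """A single training/test example for span-based question answering."""

    def __init__(self, qas_id, question_text, doc_tokens,
                 orig_answer_text=None, start_position=None,
                 end_position=None, is_impossible=False):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible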
def customize_tokenizer(text, do_lower_case=False):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
    temp_x = ""
    text = tokenization.convert_to_unicode(text)
    for c in text:
        # Pad CJK characters, punctuation, whitespace, and control
        # characters with spaces so split() separates them into tokens.
        if (tokenizer._is_chinese_char(ord(c))
                or tokenization._is_punctuation(c)
                or tokenization._is_whitespace(c)
                or tokenization._is_control(c)):
            temp_x += " " + c + " "
        else:
            temp_x += c
    if do_lower_case:
        temp_x = temp_x.lower()
    return temp_x.split()  # so we end up with a list of tokens here
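# Illustrative call; the expected output is an assumption derived from the
# splitting rules above (each CJK character and punctuation mark becomes its
# own token, while Latin runs stay together):
tokens = customize_tokenizer(u"BERT在中文上很强。", do_lower_case=True)
# tokens == ["bert", "在", "中", "文", "上", "很", "强", "。"]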
def customize_tokenizer(text, do_lower_case=True):
    temp_x = ""
    text = tokenization.convert_to_unicode(text)
    for c in text:
        if (_is_chinese_char(ord(c))
                or tokenization._is_punctuation(c)
                or tokenization._is_whitespace(c)
                or tokenization._is_control(c)):
            temp_x += " " + c + " "
        else:
            temp_x += c
    if do_lower_case:
        temp_x = temp_x.lower()
    return temp_x.split()
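# This variant calls a module-level _is_chinese_char; a sketch assuming it
# mirrors BasicTokenizer._is_chinese_char from the BERT tokenization module,
# i.e. a membership test against the CJK Unicode blocks:
def _is_chinese_char(cp):
    """Checks whether cp is the codepoint of a CJK character."""
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or
            (cp >= 0x3400 and cp <= 0x4DBF) or
            (cp >= 0x20000 and cp <= 0x2A6DF) or
            (cp >= 0x2A700 and cp <= 0x2B73F) or
            (cp >= 0x2B740 and cp <= 0x2B81F) or
            (cp >= 0x2B820 and cp <= 0x2CEAF) or
            (cp >= 0xF900 and cp <= 0xFAFF) or
            (cp >= 0x2F800 and cp <= 0x2FA1F)):
        return True
    return False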
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(
                paragraph_text, do_lower_case=squad_params.do_lower_case)
            doc_tokens = []
            char_to_word_offset = []
            k = 0
            temp_word = ""
            for c in paragraph_text:
                if tokenization._is_whitespace(c):
                    char_to_word_offset.append(k - 1)
                    continue
                temp_word += c
                char_to_word_offset.append(k)
                if squad_params.do_lower_case:
                    temp_word = temp_word.lower()
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1
            assert k == len(raw_doc_tokens)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_training:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    if orig_answer_text not in paragraph_text:
                        tf.logging.warning("Could not find answer")
                    else:
                        answer_offset = paragraph_text.index(orig_answer_text)
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[
                            answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen,
                        # it's likely due to weird Unicode stuff, so we just
                        # skip the example.
                        #
                        # Note that this means that in training mode, not
                        # every example is guaranteed to be preserved.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if squad_params.do_lower_case:
                            cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue
                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position)
                examples.append(example)

    tf.logging.info("**********read_squad_examples complete!**********")
    return examples
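# A worked alignment example (illustrative): with
#   paragraph_text = u"你好 world"
# customize_tokenizer yields ["你", "好", "world"], and the loop above builds
#   char_to_word_offset == [0, 1, 1, 2, 2, 2, 2, 2]
# i.e. "你" -> token 0, "好" -> token 1, the space maps back to token 1,
# and each character of "world" points at token index 2. This is what lets
# answer character offsets be converted into token start/end positions.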
def read_squad_examples(input_file, vocab_file):
    """Read a SQuAD json file and report corpus statistics."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    min_context_len = 99999
    max_context_len = 0
    sum_context_len = 0
    num_context = 0
    min_answer_len = 99999
    max_answer_len = 0
    sum_answer_len = 0
    num_answers = 0
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # Tokenize and track document-length statistics.
            raw_doc_tokens = customize_tokenizer(paragraph_text,
                                                 do_lower_case=False)
            sum_context_len += len(raw_doc_tokens)
            min_context_len = min(min_context_len, len(raw_doc_tokens))
            max_context_len = max(max_context_len, len(raw_doc_tokens))
            num_context += 1
            doc_tokens = []
            char_to_word_offset = []
            k = 0
            temp_word = ""
            for c in paragraph_text:
                if tokenization._is_whitespace(c):
                    char_to_word_offset.append(k - 1)
                    continue
                temp_word += c
                char_to_word_offset.append(k)
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1
            assert k == len(raw_doc_tokens)

            # Compare against ChineseFullTokenizer.
            tokenizer = ChineseFullTokenizer(vocab_file=vocab_file,
                                             do_lower_case=False)
            doc_tokens_C = tokenizer.tokenize(paragraph_text)
            print("BasicTokenizer length: %d" % len(doc_tokens))
            print("ChineseFullTokenizer length: %d" % len(doc_tokens_C))
            print(doc_tokens == doc_tokens_C)
            print(doc_tokens)
            print(doc_tokens_C)

            # # Compute answer-length statistics.
            # for qa in paragraph["qas"]:
            #     question_text = qa["question"]
            #     start_position = None
            #     end_position = None
            #     orig_answer_text = None
            #     # Some dev-set questions have more than one answer.
            #     for answer in qa["answers"]:
            #         orig_answer_text = answer["text"]
            #         if orig_answer_text not in paragraph_text:
            #             tf.logging.warning("Could not find answer")
            #         else:
            #             answer_offset = paragraph_text.index(orig_answer_text)
            #             answer_length = len(orig_answer_text)
            #             start_position = char_to_word_offset[answer_offset]
            #             end_position = char_to_word_offset[
            #                 answer_offset + answer_length - 1]
            #             answer_len = end_position - start_position + 1
            #             num_answers += 1
            #             sum_answer_len += answer_len
            #             min_answer_len = min(min_answer_len, answer_len)
            #             max_answer_len = max(max_answer_len, answer_len)

    # print("%d contexts in total; average length %.3f; "
    #       "max length %d; min length %d" % (
    #           num_context, sum_context_len / num_context,
    #           max_context_len, min_context_len))
    # print("%d answers in total; average length %.3f; "
    #       "max length %d; min length %d" % (
    #           num_answers, sum_answer_len / num_answers,
    #           max_answer_len, min_answer_len))
    tf.logging.info("**********preprocess dataset complete!**********")
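# Hedged usage sketch; both file paths are assumptions for illustration only.
if __name__ == "__main__":
    tf.logging.set_verbosity(tf.logging.INFO)
    read_squad_examples("cmrc2018_dev.json", vocab_file="vocab.txt")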
def read_squad_examples(input_file, is_training, do_lower_case):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(paragraph_text,
                                                 do_lower_case=do_lower_case)
            doc_tokens = []
            char_to_word_offset = []
            k = 0
            temp_word = ""
            for c in paragraph_text:
                if tokenization._is_whitespace(c):
                    char_to_word_offset.append(k - 1)
                    continue
                temp_word += c
                char_to_word_offset.append(k)
                if do_lower_case:
                    temp_word = temp_word.lower()
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1
            try:
                assert k == len(raw_doc_tokens)
            except AssertionError:
                # Dump diagnostics around the first mismatching token
                # before re-raising.
                print(len(raw_doc_tokens), len(doc_tokens))
                for i in range(min(len(doc_tokens), len(raw_doc_tokens))):
                    if raw_doc_tokens[i] != doc_tokens[i]:
                        print(raw_doc_tokens[i - 3:i + 3],
                              doc_tokens[i - 3:i + 3])
                        break
                print("".join(doc_tokens[500:]))
                print("----")
                print("".join(raw_doc_tokens[500:]))
                raise

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                is_impossible = False
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_training:
                    is_impossible = len(qa["answers"]) == 0
                    if len(qa["answers"]) > 1:
                        # Tolerated: only the first answer is used below.
                        # raise ValueError("For training, each question "
                        #                  "should have no more than one answer.")
                        pass
                    if len(qa["answers"]) == 0:
                        orig_answer_text = ""
                        start_position = end_position = 0  # use [CLS] for no-answer
                    else:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        if orig_answer_text not in paragraph_text:
                            logger.warning("Could not find answer")
                            continue
                        answer_offset = paragraph_text.index(orig_answer_text)
                        # answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[
                            answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen,
                        # it's likely due to weird Unicode stuff, so we just
                        # skip the example.
                        #
                        # Note that this means that in training mode, not
                        # every example is guaranteed to be preserved.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if do_lower_case:
                            cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples
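# Hedged usage sketch; the dataset path is an assumption for illustration.
examples = read_squad_examples("cmrc2018_train.json", is_training=True,
                               do_lower_case=True)
print(len(examples))
print(examples[0].question_text, examples[0].start_position,
      examples[0].end_position)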
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           wordid_map, num_words=1, num_unk=1):
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            label_ids=[label_map["O"]] * max_seq_length,
            sequence_length=0,
            is_real_example=False,
        )

    input_ids = []
    # Character-based tokenization.
    sequence = list(example.text)
    sequence_length = min(len(sequence), max_seq_length)
    sequence.insert(0, "[CLS]")
    sequence.append("[SEP]")
    for w in sequence:
        num_words += 1
        if w.lower() in wordid_map:
            input_ids.append(wordid_map[w.lower()])
        elif _is_whitespace(w):
            input_ids.append(wordid_map["[unused1]"])
        else:
            num_unk += 1
            input_ids.append(0)
    input_mask = [1] * len(sequence)
    label_id = [label_map[l] for l in example.label]
    # Pad the label sequence to cover [CLS] and [SEP].
    label_id.insert(0, 0)
    label_id.append(0)
    segment_ids = [0] * max_seq_length

    # Zero-pad up to max_seq_length, then truncate.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        label_id.append(0)
    input_ids = input_ids[:max_seq_length]
    label_id = label_id[:max_seq_length]
    input_mask = input_mask[:max_seq_length]

    assert len(input_ids) == max_seq_length
    assert len(label_id) == max_seq_length

    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % example.guid)
        tf.logging.info("tokens: %s" % "".join(sequence))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_id]))
        tf.logging.info("sequence_length: %s" % sequence_length)

    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_id,
        sequence_length=sequence_length,
        is_real_example=True,
    )
    return feature, num_words, num_unk
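# Hedged usage sketch: thread num_words/num_unk through a dataset to compute
# an out-of-vocabulary rate. `train_examples`, `label_list`, and `wordid_map`
# are assumed to come from the surrounding script; 128 is an illustrative
# max_seq_length.
num_words, num_unk = 1, 1
for idx, ex in enumerate(train_examples):
    feature, num_words, num_unk = convert_single_example(
        idx, ex, label_list, 128, wordid_map, num_words, num_unk)
tf.logging.info("UNK rate: %.2f%%" % (100.0 * num_unk / num_words))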