Example 1
  def test_is_whitespace(self):
    self.assertTrue(tokenization._is_whitespace(u" "))
    self.assertTrue(tokenization._is_whitespace(u"\t"))
    self.assertTrue(tokenization._is_whitespace(u"\r"))
    self.assertTrue(tokenization._is_whitespace(u"\n"))
    self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
    self.assertFalse(tokenization._is_whitespace(u"A"))
    self.assertFalse(tokenization._is_whitespace(u"-"))
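The assertions above pin down what `tokenization._is_whitespace` accepts: space, tab, carriage return, newline and the no-break space U+00A0 count as whitespace, while letters and hyphens do not. A minimal sketch consistent with those assertions (assuming BERT-style tokenization helpers; the actual helper lives in the project's tokenization module):

import unicodedata

def _is_whitespace(char):
    """Treat tab, newline, carriage return, space and any Unicode 'Zs' character as whitespace."""
    if char in (" ", "\t", "\n", "\r"):
        return True
    # U+00A0 (no-break space) falls under the 'Zs' (space separator) category.
    return unicodedata.category(char) == "Zs"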
Example 2
def read_squad_examples(input_file):
    """Read a SQuAD json file into a list of SquadExample."""
    # Load the raw SQuAD-style JSON; without this, input_data below is undefined.
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(
                paragraph_text, do_lower_case=args_in_use.do_lower_case)
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True

            # Re-align the raw text against raw_doc_tokens, rebuilding doc_tokens
            # and mapping each character of paragraph_text to its token index.
            k = 0
            temp_word = ""
            for c in paragraph_text:
                # c is whitespace
                if tokenization._is_whitespace(c) or not c.split():
                    char_to_word_offset.append(k - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(k)
                if args_in_use.do_lower_case:
                    temp_word = temp_word.lower()
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1
            if k != len(raw_doc_tokens):
                print(paragraph)
                print(doc_tokens)
                print(raw_doc_tokens)
            assert k == len(raw_doc_tokens)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position)
                examples.append(example)
    tf.logging.info("**********read_squad_examples complete!**********")

    return examples
Example 3
  def test_is_whitespace(self):
    self.assertTrue(tokenization._is_whitespace(u" "))
    self.assertTrue(tokenization._is_whitespace(u"\t"))
    self.assertTrue(tokenization._is_whitespace(u"\r"))
    self.assertTrue(tokenization._is_whitespace(u"\n"))
    self.assertTrue(tokenization._is_whitespace(u"\u00A0"))

    self.assertFalse(tokenization._is_whitespace(u"A"))
    self.assertFalse(tokenization._is_whitespace(u"-"))

def customize_tokenizer(text, do_lower_case=False):
  tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
  temp_x = ""
  text = tokenization.convert_to_unicode(text)
  for c in text:
    if (tokenizer._is_chinese_char(ord(c)) or tokenization._is_punctuation(c)
        or tokenization._is_whitespace(c) or tokenization._is_control(c)):
      temp_x += " " + c + " "
    else:
      temp_x += c
  if do_lower_case:
    temp_x = temp_x.lower()
  return temp_x.split()  # so we end up with a list of tokens here
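A hypothetical call to illustrate the output (the input string is made up; behaviour assumes the standard BERT `_is_chinese_char` / `_is_punctuation` semantics): every CJK character and punctuation mark becomes its own token, while Latin runs stay whole and are lower-cased when do_lower_case is set.

# Hypothetical example: CJK characters and punctuation are space-padded,
# so split() yields one token per CJK character and whole tokens for Latin words.
tokens = customize_tokenizer(u"我爱NLP,真的!", do_lower_case=True)
# -> ["我", "爱", "nlp", ",", "真", "的", "!"]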
Example 5
def customize_tokenizer(text, do_lower_case=True):
    temp_x = ""
    text = tokenization.convert_to_unicode(text)
    for c in text:
        if _is_chinese_char(ord(c)) or tokenization._is_punctuation(
                c) or tokenization._is_whitespace(
                    c) or tokenization._is_control(c):
            temp_x += " " + c + " "
        else:
            temp_x += c
    if do_lower_case:
        temp_x = temp_x.lower()
    return temp_x.split()
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    #
    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(paragraph_text, do_lower_case=squad_params.do_lower_case)
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True

            k = 0
            temp_word = ""
            for c in paragraph_text:
                if tokenization._is_whitespace(c):
                    char_to_word_offset.append(k - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(k)
                if squad_params.do_lower_case:
                    temp_word = temp_word.lower()
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1

            assert k == len(raw_doc_tokens)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None

                if is_training:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]

                    if orig_answer_text not in paragraph_text:
                        tf.logging.warning("Could not find answer")
                    else:
                        answer_offset = paragraph_text.index(orig_answer_text)
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]

                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if squad_params.do_lower_case:
                            cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                            continue

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position)
                examples.append(example)
    tf.logging.info("**********read_squad_examples complete!**********")

    return examples
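The char_to_word_offset list built above is what lets the character-level answer offset from the raw JSON be mapped onto token indices. A toy illustration with made-up values, assuming character-level Chinese tokens:

doc_tokens = ["我", "爱", "北", "京"]           # one token per character
char_to_word_offset = [0, 1, 2, 3]              # char index -> token index
answer_offset, answer_length = 2, 2             # the answer "北京" starts at char 2
start_position = char_to_word_offset[answer_offset]                    # 2
end_position = char_to_word_offset[answer_offset + answer_length - 1]  # 3
assert "".join(doc_tokens[start_position:end_position + 1]) == "北京"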
Example 7
def read_squad_examples(input_file, vocab_file):
    """Read a SQuAD json file into a list of SquadExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    #

    min_context_len = 99999
    max_context_len = 0
    sum_context_len = 0
    num_context = 0

    min_answer_len = 99999
    max_answer_len = 0
    sum_answer_len = 0
    num_answers = 0
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # tokenize and compute the document length
            raw_doc_tokens = customize_tokenizer(paragraph_text,
                                                 do_lower_case=False)
            sum_context_len += len(raw_doc_tokens)
            min_context_len = min(min_context_len, len(raw_doc_tokens))
            max_context_len = max(max_context_len, len(raw_doc_tokens))
            num_context += 1

            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True

            k = 0
            temp_word = ""
            for c in paragraph_text:

                if tokenization._is_whitespace(c):
                    char_to_word_offset.append(k - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(k)
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1

            assert k == len(raw_doc_tokens)

            # ChineseFullTokenizer
            tokenizer = ChineseFullTokenizer(vocab_file=vocab_file,
                                             do_lower_case=False)
            doc_tokens_C = tokenizer.tokenize(paragraph_text)

            print("BasicTokenizer length:%d" % (len(doc_tokens)))
            print("ChineseFullTokenizer length: %d" % (len(doc_tokens_C)))
            print(doc_tokens == doc_tokens_C)
            print(doc_tokens)
            print(doc_tokens_C)

            # # compute answer lengths
            # for qa in paragraph["qas"]:
            #     question_text = qa["question"]
            #     start_position = None
            #     end_position = None
            #     orig_answer_text = None
            #     # some questions in the dev set have more than one answer
            #     for answer in qa["answers"]:
            #
            #         orig_answer_text = answer["text"]
            #
            #         if orig_answer_text not in paragraph_text:
            #             tf.logging.warning("Could not find answer")
            #         else:
            #             answer_offset = paragraph_text.index(orig_answer_text)
            #             answer_length = len(orig_answer_text)
            #             start_position = char_to_word_offset[answer_offset]
            #             end_position = char_to_word_offset[answer_offset + answer_length - 1]
            #             answer_len=(end_position-start_position+1)
            #
            #             num_answers+=1
            #             sum_answer_len+=answer_len
            #             min_answer_len=min(min_answer_len,answer_len)
            #             max_answer_len=max(max_answer_len,answer_len)
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            # actual_text = "".join(
            #     doc_tokens[start_position:(end_position + 1)])

            # print('%d documents in total; average length %.3f; max length %d; min length %d;' % (
            #         num_context,sum_context_len/num_context,max_context_len,min_context_len))
            # print('%d answers in total; average length %.3f; max length %d; min length %d;' % (
            #     num_answers, sum_answer_len / num_answers, max_answer_len, min_answer_len))
    tf.logging.info("**********preprocess dataset complete!**********")
Example 8
def read_squad_examples(input_file, is_training, do_lower_case):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(paragraph_text,
                                                 do_lower_case=do_lower_case)
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True

            k = 0
            temp_word = ""
            for c in paragraph_text:
                if tokenization._is_whitespace(c):
                    char_to_word_offset.append(k - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(k)
                if do_lower_case:
                    temp_word = temp_word.lower()
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1

            try:
                assert k == len(raw_doc_tokens)
            except AssertionError:
                print(len(raw_doc_tokens), len(doc_tokens))
                for i in range(min(len(doc_tokens), len(raw_doc_tokens))):
                    if raw_doc_tokens[i] != doc_tokens[i]:
                        print(raw_doc_tokens[i - 3:i + 3],
                              doc_tokens[i - 3:i + 3])
                        break
                print(''.join(doc_tokens[500:]))
                print("----")
                print(''.join(raw_doc_tokens[500:]))
                raise AssertionError

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                is_impossible = False
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_training:
                    is_impossible = len(qa['answers']) == 0
                    if len(qa["answers"]) > 1:
                        pass
                        #raise ValueError(
                        #    "For training, each question should have less than 1 answer.")
                    if len(qa['answers']) == 0:
                        orig_answer_text = ""
                        start_position = end_position = 0  # use_cls
                    else:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        if orig_answer_text not in paragraph_text:
                            logger.warning("Could not find answer")
                            continue
                        answer_offset = paragraph_text.index(orig_answer_text)
                        #answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if do_lower_case:
                            cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples
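This variant also covers unanswerable questions: when qa["answers"] is empty, the example is flagged is_impossible and the span is set to (0, 0), which downstream feature conversion typically points at the [CLS] token (hence the use_cls comment). A minimal sketch of that convention, with a made-up question entry:

qa = {"id": "q1", "question": "...", "answers": []}   # made-up, unanswerable
is_impossible = len(qa["answers"]) == 0               # True
if is_impossible:
    orig_answer_text = ""
    start_position = end_position = 0                 # "use_cls": span points at [CLS]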
Example 9
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           wordid_map,  num_words=1, num_unk=1):
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i 

    if isinstance(example, PaddingInputExample): 
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            label_ids=[label_map["O"]] * max_seq_length,
            sequence_length=0,
            is_real_example=False, )
    
    input_ids = []
    # char based
    sequence = list(example.text)
    sequence_length = len(sequence) if len(sequence) <= FLAGS.max_seq_length else FLAGS.max_seq_length
    sequence.insert(0, "[CLS]")
    sequence.append("[SEP]")
    for w in sequence:
        num_words += 1
        if w.lower() in wordid_map:
            # index with the lower-cased key so the lookup matches the membership test
            input_ids.append(wordid_map[w.lower()])
        elif _is_whitespace(w):
            input_ids.append(wordid_map["[unused1]"])
        else:
            num_unk += 1
            input_ids.append(0)
    input_mask = [1] * len(sequence)
    label_id = [label_map[l] for l in example.label]
    label_id.insert(0, 0)  # label slot for the prepended [CLS]
    label_id.append(0)     # label slot for the appended [SEP]
    segment_ids = [0] * FLAGS.max_seq_length

    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        label_id.append(0)

    input_ids = input_ids[:max_seq_length]
    label_id = label_id[:max_seq_length]
    input_mask = input_mask[:max_seq_length]

    
    assert len(input_ids) == max_seq_length
    assert len(label_id) == max_seq_length

    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % "".join(sequence))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_id]))
        tf.logging.info("sequence_length: %s" % sequence_length)

    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_id,
        sequence_length=sequence_length,
        is_real_example=True,)
    return feature, num_words, num_unk
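The pad-then-truncate pattern above guarantees fixed-length features regardless of the input length. A standalone sketch with hypothetical values (max_seq_length=8; the ids are illustrative, not from a real vocabulary):

max_seq_length = 8
input_ids = [101, 2769, 4263, 102]        # hypothetical ids for [CLS] 我 爱 [SEP]
input_mask = [1] * len(input_ids)
label_id = [0, 1, 2, 0]
while len(input_ids) < max_seq_length:    # zero-pad short sequences
    input_ids.append(0)
    input_mask.append(0)
    label_id.append(0)
input_ids = input_ids[:max_seq_length]    # truncate long ones
input_mask = input_mask[:max_seq_length]
label_id = label_id[:max_seq_length]
assert len(input_ids) == len(input_mask) == len(label_id) == max_seq_length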