Code Example #1
File: tokenization_test.py  Project: Wanke15/bert
  def test_is_control(self):
    self.assertTrue(tokenization._is_control(u"\u0005"))

    self.assertFalse(tokenization._is_control(u"A"))
    self.assertFalse(tokenization._is_control(u" "))
    self.assertFalse(tokenization._is_control(u"\t"))
    self.assertFalse(tokenization._is_control(u"\r"))
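For reference, the helper being tested treats tab, newline, and carriage return as whitespace rather than control characters, and flags characters in the Unicode "Cc"/"Cf" categories as control. The sketch below is a minimal reimplementation that is consistent with the assertions above; it is an illustration, not necessarily the exact code in the project:

import unicodedata

def _is_control(char):
    # Tab, newline, and carriage return are counted as whitespace, not control.
    if char in ("\t", "\n", "\r"):
        return False
    # Cc (control) and Cf (format) categories are treated as control characters,
    # so u"\u0005" returns True while "A", " ", "\t", and "\r" return False.
    return unicodedata.category(char) in ("Cc", "Cf")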
Code Example #2
def customize_tokenizer(text, do_lower_case=False):
    """Inserts spaces around CJK characters, punctuation, whitespace, and
    control characters, then whitespace-tokenizes the result."""
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
    temp_x = ""
    text = tokenization.convert_to_unicode(text)
    for c in text:
        if (tokenizer._is_chinese_char(ord(c))
                or tokenization._is_punctuation(c)
                or tokenization._is_whitespace(c)
                or tokenization._is_control(c)):
            temp_x += " " + c + " "
        else:
            temp_x += c
    if do_lower_case:
        temp_x = temp_x.lower()
    return temp_x.split()
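As a usage sketch (the sample string and expected output are illustrative, not taken from the project), mixed text comes back as a list of coarse tokens with CJK characters and punctuation split out:

# Hypothetical call; the input string is illustrative only.
tokens = customize_tokenizer(u"Hello, 世界!", do_lower_case=True)
print(tokens)  # ['hello', ',', '世', '界', '!']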
Code Example #3
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    # Build one SquadExample per question, paragraph by paragraph.
    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # Tokenize the paragraph text with the customized tokenizer.
            raw_doc_tokens = customize_tokenizer(paragraph_text,
                                                 do_lower_case=False)
            doc_tokens = []
            char_to_word_offset = []

            k = 0
            temp_word = ""
            for c in paragraph_text:
                # Skip whitespace and control characters; map them to the
                # index of the previous token instead of starting a new one.
                if (tokenization._is_whitespace(c)
                        or tokenization._is_control(c)):
                    char_to_word_offset.append(k - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(k)

                # When the accumulated characters match the next raw token,
                # commit it and advance to the following token.
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1
            print("k:%d;raw_doc_tokens:%d" % (k, len(raw_doc_tokens)))
            print("qas_id:%s" % (paragraph["qas"][0]["id"]))
            if (k != len(raw_doc_tokens)):
                print("doc_tokens:")
                print(doc_tokens)

                print("raw_doc_tokens:")
                print(raw_doc_tokens)
            assert k == len(raw_doc_tokens)
            # Start/end positions of each answer, in basic-token indices.
            start_positions = []
            end_positions = []
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None

                for answer in qa["answers"]:
                    orig_answer_text = answer["text"]

                    if orig_answer_text not in paragraph_text:
                        tf.logging.warning("Could not find answer")
                    else:
                        answer_offset = paragraph_text.index(orig_answer_text)
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]

                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue
                        start_positions.append(start_position)
                        end_positions.append(end_position)

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_positions,
                                       end_position=end_positions)
                examples.append(example)
    tf.logging.info("**********read_squad_examples complete!**********")

    return examples
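A minimal usage sketch follows; the file path is a placeholder and the input is assumed to be a SQuAD-style JSON file with the fields accessed above ("data", "paragraphs", "context", "qas", "answers"):

# Hypothetical call; "train.json" is a placeholder for a SQuAD-format file.
examples = read_squad_examples("train.json", is_training=True)
print("loaded %d examples" % len(examples))
for ex in examples[:3]:
    # start_position / end_position hold lists of answer spans in basic-token indices.
    print(ex.qas_id, ex.start_position, ex.end_position)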