Example #1
    def test_is_whitespace(self):
        self.assertTrue(tokenization._is_whitespace(u" "))
        self.assertTrue(tokenization._is_whitespace(u"\t"))
        self.assertTrue(tokenization._is_whitespace(u"\r"))
        self.assertTrue(tokenization._is_whitespace(u"\n"))
        self.assertTrue(tokenization._is_whitespace(u"\u00A0"))

        self.assertFalse(tokenization._is_whitespace(u"A"))
        self.assertFalse(tokenization._is_whitespace(u"-"))
Example #2
def customize_tokenizer(text, do_lower_case=False):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
    temp_x = ""
    text = tokenization.convert_to_unicode(text)
    for c in text:
        if (tokenizer._is_chinese_char(ord(c))
                or tokenization._is_punctuation(c)
                or tokenization._is_whitespace(c)
                or tokenization._is_control(c)):
            temp_x += " " + c + " "
        else:
            temp_x += c
    if do_lower_case:
        temp_x = temp_x.lower()
    return temp_x.split()
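A quick usage sketch (the input string is illustrative only, and `tokenization` is assumed to be BERT's tokenization.py on the import path):

# Chinese characters, punctuation, whitespace and control characters each
# become their own token; runs of other characters (e.g. Latin) stay whole.
print(customize_tokenizer(u"BERT模型很强大!", do_lower_case=True))
# ['bert', '模', '型', '很', '强', '大', '!']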
Example #3
File: cmrc.py  Project: zofuthan/bert-text
    def _read_json(self, input_file: str,
                   is_training: bool,
                   do_lower_case: bool):
        with tf.gfile.Open(input_file, "r") as reader:
            input_data = json.load(reader)["data"]

        examples = []

        for entry in tqdm(input_data, desc=input_file):
            for paragraph in entry["paragraphs"]:
                paragraph_text = paragraph["context"]
                # Pass do_lower_case through so raw_doc_tokens matches the
                # lower-cased temp_word built in the loop below.
                raw_doc_tokens = customize_tokenizer(paragraph_text, do_lower_case)
                doc_tokens = []
                char_to_word_offset = []

                k = 0
                temp_word = ""
                for c in paragraph_text:
                    if tokenization._is_whitespace(c):
                        # Whitespace belongs to no token; map it to the
                        # previous token's index.
                        char_to_word_offset.append(k - 1)
                        continue
                    else:
                        temp_word += c
                        char_to_word_offset.append(k)
                    if do_lower_case:
                        temp_word = temp_word.lower()
                    # Once the accumulated characters equal the k-th token from
                    # customize_tokenizer, emit it and advance to the next one.
                    if temp_word == raw_doc_tokens[k]:
                        doc_tokens.append(temp_word)
                        temp_word = ""
                        k += 1

                assert k == len(raw_doc_tokens)

                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position = None
                    end_position = None
                    orig_answer_text = None

                    if is_training:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]

                        if orig_answer_text not in paragraph_text:
                            tf.logging.warning("Could not find answer")
                        else:
                            answer_offset = paragraph_text.index(orig_answer_text)
                            answer_length = len(orig_answer_text)
                            start_position = char_to_word_offset[answer_offset]
                            end_position = char_to_word_offset[answer_offset + answer_length - 1]

                            # Skip examples that do not meet the requirements
                            actual_text = "".join(
                                doc_tokens[start_position:(end_position + 1)])
                            cleaned_answer_text = "".join(
                                tokenization.whitespace_tokenize(orig_answer_text))
                            if actual_text.find(cleaned_answer_text) == -1:
                                tf.logging.warning("Could not find answer: '%s' vs. '%s'", actual_text,
                                                   cleaned_answer_text)
                                continue

                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_position,
                        end_position=end_position)
                    examples.append(example)

        return examples
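For intuition, a toy walk-through of the offset bookkeeping above (the paragraph and answer are made up; `customize_tokenizer` is the helper from Example #2):

paragraph_text = u"你好 世界"
raw_doc_tokens = customize_tokenizer(paragraph_text)   # ['你', '好', '世', '界']
# The character loop produces:
#   doc_tokens          == ['你', '好', '世', '界']
#   char_to_word_offset == [0, 1, 1, 2, 3]   # the space maps back to token 1
# For the answer "世界" (character offset 3, length 2):
#   start_position = char_to_word_offset[3]          -> 2
#   end_position   = char_to_word_offset[3 + 2 - 1]  -> 3
# doc_tokens[2:4] joins back to "世界", so this example is kept.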