# Reader-class method; assumes module-level imports of `json`, `collections.namedtuple`,
# and the project-local `tokenization` module.
def _read_json(self, input_file, is_training):
    """Read a SQuAD-style JSON file into a list of Example records."""
    examples = []
    with open(input_file, "r") as f:
        input_data = json.load(f)["data"]

    # Lightweight record describing one question/context pair.
    Example = namedtuple('Example', [
        'qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
        'start_position', 'end_position'
    ])

    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_pos = None
                end_pos = None
                orig_answer_text = None
                if is_training:
                    if len(qa["answers"]) != 1:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    # Split the context into [before answer, answer, after answer],
                    # so the answer span is always the token at index 1.
                    doc_tokens = [
                        paragraph_text[:answer_offset],
                        paragraph_text[answer_offset:answer_offset + answer_length],
                        paragraph_text[answer_offset + answer_length:]
                    ]
                    start_pos = 1
                    end_pos = 1
                    actual_text = " ".join(doc_tokens[start_pos:(end_pos + 1)])
                    if actual_text.find(orig_answer_text) == -1:
                        print("Could not find answer: '%s' vs. '%s'" %
                              (actual_text, orig_answer_text))
                        continue
                else:
                    # No gold answer at inference time; tokenize the whole context
                    # character by character (Chinese text).
                    doc_tokens = tokenization.tokenize_chinese_chars(paragraph_text)

                example = Example(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_pos,
                    end_position=end_pos)
                examples.append(example)
    return examples
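# A minimal, self-contained sketch of the JSON layout that _read_json above expects.
# The field names mirror the lookups in the parser; the concrete context, question,
# and the "tiny_squad.json" output path are illustrative assumptions, not project files.
import json

_TINY_SQUAD = {
    "data": [{
        "paragraphs": [{
            "context": "Beijing is the capital of China.",
            "qas": [{
                "id": "demo-0",
                "question": "What is the capital of China?",
                # answer_start is a character offset into "context"
                "answers": [{"text": "Beijing", "answer_start": 0}],
            }],
        }],
    }],
}

if __name__ == "__main__":
    # Write a one-example file that _read_json(..., is_training=True) can parse.
    with open("tiny_squad.json", "w") as f:
        json.dump(_TINY_SQUAD, f, ensure_ascii=False)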
# Assumes module-level `io`, `json`, the project-local `tokenization` module,
# and the SquadExample class are available.
def read_squad_examples(input_file, is_training, version_2_with_negative=False):
    """Read a SQuAD json file into a list of SquadExample."""
    with io.open(input_file, "r", encoding="utf8") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # Original English whitespace tokenization, kept commented out because
            # Chinese contexts are handled per character below.
            # doc_tokens = []
            # char_to_word_offset = []
            # prev_is_whitespace = True
            # for c in paragraph_text:
            #     if is_whitespace(c):
            #         prev_is_whitespace = True
            #     else:
            #         if prev_is_whitespace:
            #             doc_tokens.append(c)
            #         else:
            #             doc_tokens[-1] += c
            #         prev_is_whitespace = False
            #     char_to_word_offset.append(len(doc_tokens) - 1)
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_pos = None
                end_pos = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        # Split the context into [before answer, answer, after answer],
                        # so the answer span is always the token at index 1.
                        doc_tokens = [
                            paragraph_text[:answer_offset],
                            paragraph_text[answer_offset:answer_offset + answer_length],
                            paragraph_text[answer_offset + answer_length:]
                        ]
                        start_pos = 1
                        end_pos = 1
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(doc_tokens[start_pos:(end_pos + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            print("Could not find answer: '%s' vs. '%s'" %
                                  (actual_text, cleaned_answer_text))
                            continue
                    else:
                        # Unanswerable question: mark the span as absent. The context is
                        # still character-tokenized so doc_tokens is defined here too
                        # (otherwise it would be stale or undefined for this example).
                        start_pos = -1
                        end_pos = -1
                        orig_answer_text = ""
                        doc_tokens = tokenization.tokenize_chinese_chars(paragraph_text)
                else:
                    doc_tokens = tokenization.tokenize_chinese_chars(paragraph_text)

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_pos,
                    end_position=end_pos,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples
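# A minimal usage sketch for read_squad_examples. The path below is illustrative
# (e.g. the tiny file written above); it assumes the project-local `tokenization`
# module and the SquadExample class are importable in this module.
#
#     train_examples = read_squad_examples("tiny_squad.json", is_training=True)
#     print(len(train_examples))
#     print(train_examples[0].question_text, "->", train_examples[0].orig_answer_text)
#
# With is_training=False the gold answers are ignored and doc_tokens holds the
# character-tokenized context instead of the three answer-anchored slices.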
def abc(test_examples=None, init_check=''):
    # Relies on module-level objects set up by the surrounding script
    # (`args`, `exe`, `startup_prog`, `test_pyreader`, `test_prog`,
    # `test_graph_vars`, `evaluate`, `init_checkpoint`, `init_pretraining_params`).
    s_t = str(time.localtime())  # timestamp string, used to build unique example ids
    if init_check != '':
        args.init_checkpoint = init_check
        args.ernie_config_path = 'config/ernie_config1.json'
    if not test_examples:  # default changed from a mutable [] to None; check is equivalent
        # Passing examples in directly used to raise exceptions and could not be used;
        # the abandoned code is kept below for reference and is never executed
        # (`ii`, `que`, `para` were variables of that abandoned version and are undefined here).
        while True:
            break
            Example = namedtuple('Example', [
                'qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
                'start_position', 'end_position'
            ])
            example = Example(
                qas_id=s_t + str(ii),
                question_text=que,
                doc_tokens=tokenization.tokenize_chinese_chars(para),
                orig_answer_text=None,
                start_position=None,
                end_position=None)
            test_examples.append(example)
            break

    reader = task_reader.MRCReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        tokenizer=args.tokenizer,
        is_classify=args.is_classify,
        is_regression=args.is_regression,
        for_cn=args.for_cn,
        task_id=args.task_id,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    test_pyreader.decorate_tensor_provider(
        reader.data_generator(
            args.test_set,
            batch_size=args.batch_size,
            epoch=1,
            dev_count=1,
            shuffle=False,
            phase="test"))
    print(reader.get_examples("test"))
    mrc_result = evaluate(
        exe,
        test_prog,
        test_pyreader,
        test_graph_vars,
        "test",
        examples=reader.get_examples("test"),
        features=reader.get_features("test"),
        # examples=test_examples,
        # features=reader._convert_example_to_feature(
        #     examples=test_examples, max_seq_length=512,
        #     tokenizer=tokenization.FullTokenizer(
        #         vocab_file='config/vocab.txt', do_lower_case=True),
        #     is_training=False),
        args=args)
    print('abc:mrc:return', len(mrc_result[1]))
    return mrc_result
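# A usage sketch for abc (illustrative only). The checkpoint path is an assumption,
# and abc depends on module-level objects (`args`, `exe`, `startup_prog`,
# `test_pyreader`, `test_prog`, `test_graph_vars`, `evaluate`) being initialized
# by the surrounding PaddlePaddle training/inference script before it is called.
#
#     mrc_result = abc(init_check="checkpoints/step_10000")
#     print(len(mrc_result[1]))  # same count that abc itself logs before returning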