def collate(data, tokenizer, input_block_size, output_block_size):
    """Collate a list of examples into tensors, masks and LM labels.

    Every example is read through three attributes: ``question_input``,
    ``question_varible_output`` and ``condition_output``. The input is
    encoded and fitted to ``input_block_size``; each of the two outputs is
    encoded (or, when it is ``None``, replaced by what the tokenizer builds
    for an empty sequence) and fitted to ``output_block_size``.

    Returns a 5-tuple:
        (inputs,
         [variable_outputs, condition_outputs],
         inputs_mask,
         [variable_outputs_mask, condition_outputs_mask],
         [variable_outputs_lm_labels, condition_outputs_lm_labels])
    """
    pad_id = tokenizer.pad_token_id

    def encode_target(text):
        # A missing target still needs a sequence, so fall back to the
        # tokenizer's special-tokens-only encoding of an empty list.
        if text is None:
            return tokenizer.build_inputs_with_special_tokens([])
        return tokenizer.encode(text)

    source_rows, variable_rows, condition_rows = [], [], []
    for example in data:
        source_ids = tokenizer.encode(example.question_input)
        source_rows.append(
            fit_to_block_size(source_ids, input_block_size, pad_id))

        variable_ids = encode_target(example.question_varible_output)
        variable_rows.append(
            fit_to_block_size(variable_ids, output_block_size, pad_id))

        condition_ids = encode_target(example.condition_output)
        condition_rows.append(
            fit_to_block_size(condition_ids, output_block_size, pad_id))

    source_tensor = torch.tensor(source_rows)
    variable_tensor = torch.tensor(variable_rows)
    condition_tensor = torch.tensor(condition_rows)

    source_mask = build_mask(source_tensor, pad_id)
    target_masks = [
        build_mask(variable_tensor, pad_id),
        build_mask(condition_tensor, pad_id),
    ]
    target_lm_labels = [
        build_lm_labels(variable_tensor, pad_id),
        build_lm_labels(condition_tensor, pad_id),
    ]

    return (
        source_tensor,
        [variable_tensor, condition_tensor],
        source_mask,
        target_masks,
        target_lm_labels,
    )
Beispiel #2
0
def collate(data, tokenizer, input_block_size, output_block_size):
    """Collate a list of examples into encoder/decoder tensors.

    Each example is read through ``input_text`` and ``output_text``. The
    input is encoded and fitted to ``input_block_size``. The output is
    encoded as well, unless it is ``None``, in which case the tokenizer's
    special-tokens-only sequence is used; either way it is fitted to
    ``output_block_size``.

    Returns ``(inputs, outputs, encoder_mask, decoder_mask, lm_labels)``.
    """
    pad_id = tokenizer.pad_token_id
    source_rows = []
    target_rows = []
    for example in data:
        source_ids = tokenizer.encode(example.input_text)
        source_rows.append(
            fit_to_block_size(source_ids, input_block_size, pad_id))

        target_ids = (
            tokenizer.encode(example.output_text)
            if example.output_text is not None
            else tokenizer.build_inputs_with_special_tokens([])
        )
        target_rows.append(
            fit_to_block_size(target_ids, output_block_size, pad_id))

    source_tensor = torch.tensor(source_rows)
    target_tensor = torch.tensor(target_rows)

    source_mask = build_mask(source_tensor, pad_id)
    target_mask = build_mask(target_tensor, pad_id)
    target_lm_labels = build_lm_labels(target_tensor, pad_id)

    return (
        source_tensor,
        target_tensor,
        source_mask,
        target_mask,
        target_lm_labels,
    )
def collate(data, tokenizer, block_size):
    """Collate (story, summary) pairs into a summarization batch.

    Pairs in which the story or the summary is empty are dropped. The
    remaining pairs are encoded with ``encode_for_summarization`` and both
    sides are fitted to ``block_size`` using the tokenizer's pad token id.

    Returns ``(stories, summaries, encoder_token_type_ids, encoder_mask,
    decoder_mask, lm_labels)``.
    """
    pad_id = tokenizer.pad_token_id

    # Keep only the pairs where both the story and the summary are non-empty.
    usable_pairs = [
        pair for pair in data if len(pair[0]) != 0 and len(pair[1]) != 0
    ]
    encoded_pairs = [
        encode_for_summarization(story, summary, tokenizer)
        for story, summary in usable_pairs
    ]

    story_rows = []
    summary_rows = []
    for story_ids, summary_ids in encoded_pairs:
        story_rows.append(fit_to_block_size(story_ids, block_size, pad_id))
        summary_rows.append(
            fit_to_block_size(summary_ids, block_size, pad_id))

    stories = torch.tensor(story_rows)
    summaries = torch.tensor(summary_rows)

    encoder_token_type_ids = compute_token_type_ids(stories,
                                                    tokenizer.cls_token_id)
    encoder_mask = build_mask(stories, pad_id)
    decoder_mask = build_mask(summaries, pad_id)
    lm_labels = build_lm_labels(summaries, pad_id)

    return (
        stories,
        summaries,
        encoder_token_type_ids,
        encoder_mask,
        decoder_mask,
        lm_labels,
    )
def gen_batch_data(x, y, batch_size):
    '''
    Batch data generator.

    Shuffles ``x`` and ``y`` once with a shared permutation, then yields
    batches forever: after the last (possibly shorter) batch it wraps back
    to the first one without reshuffling.

    :param x: array of question records; must support ``.shape[0]`` and
        indexing by an index array, and each record is read through
        ``["que_text"]``.
    :param y: array of answer records aligned with ``x``; each record is
        read through ``["ans_text"]``.
    :param batch_size: maximum number of records per yielded batch.
    :return: generator yielding ``(ques, anss, encoder_token_type_ids,
        encoder_mask, decoder_mask, lm_labels)``.
    '''
    tokenizer = AutoTokenizer.from_pretrained(BERT_PATH)
    pad_id = tokenizer.pad_token_id

    indices = np.arange(x.shape[0])
    random.shuffle(indices)
    x = x[indices]
    y = y[indices]
    total = len(indices)

    i = 0
    while True:
        bi = i * batch_size
        ei = min(bi + batch_size, total)
        # Advance the batch counter, wrapping once the end is reached.
        i = 0 if ei == total else i + 1

        # Texts are truncated by characters before encoding; the "- 2"
        # presumably leaves room for special tokens added by encode()
        # -- TODO confirm against the tokenizer in use.
        data_que = [
            tokenizer.encode(que["que_text"][0:max_que_seq_len - 2])
            for que in x[bi:ei]
        ]
        data_ans = [
            tokenizer.encode(ans["ans_text"][0:max_ans_seq_len - 2])
            for ans in y[bi:ei]
        ]

        data_que = padding(data_que, pad_id)
        data_ans = padding(data_ans, pad_id)

        ques = torch.tensor(data_que, dtype=torch.long)
        anss = torch.tensor(data_ans, dtype=torch.long)
        encoder_token_type_ids = compute_token_type_ids(
            ques, tokenizer.sep_token_id)
        encoder_mask = build_mask(ques, pad_id)
        decoder_mask = build_mask(anss, pad_id)
        lm_labels = build_lm_labels(anss, pad_id)

        yield (
            ques,
            anss,
            encoder_token_type_ids,
            encoder_mask,
            decoder_mask,
            lm_labels,
        )
Beispiel #5
0
 def test_build_lm_labels(self):
     """Positions holding the pad token (0) are expected to become -1."""
     pad_token_id = 0
     tokens = torch.tensor([1, 2, 3, 4, 0, 0, 0])
     want = torch.tensor([1, 2, 3, 4, -1, -1, -1])
     got = build_lm_labels(tokens, pad_token_id)
     np.testing.assert_array_equal(got.numpy(), want.numpy())
Beispiel #6
0
 def test_build_lm_labels_no_padding(self):
     """A sequence without the pad token (0) is expected back unchanged."""
     pad_token_id = 0
     tokens = torch.tensor([1, 2, 3, 4])
     got = build_lm_labels(tokens, pad_token_id)
     # The expected value is the input tensor itself.
     np.testing.assert_array_equal(got.numpy(), tokens.numpy())
Beispiel #7
0
def collate(data, encoder_tokenizer, decoder_tokenizer, input_block_size,
            output_block_size):
    """Collate examples into encoder/decoder tensors plus vocab indexes.

    For every example the whitespace-split ``example.input`` is wrapped in
    ``[CLS]`` / ``[SEP]`` and tokenized word by word with the encoder
    tokenizer, recording the word <-> sub-token alignment. The alignment is
    stored on the example itself (``tok_to_orig_index`` and
    ``orig_to_tok_index``) before the output is processed, because the
    example is handed to ``translate_tokenindex_to_subtokenindex``.

    When ``example.output`` is ``None`` the decoder sequence is just the
    id of the ``'start'`` token. ``example.vocab_indexes`` is fitted to
    ``output_block_size`` in both cases.

    NOTE(review): the closing marker used to be the literal ``'SEP'``, which
    is tokenized as an ordinary word; it is now ``'[SEP]'`` to match
    ``'[CLS]'``. Models trained on batches from the old code saw different
    inputs.

    Returns ``(inputs, outputs, vocabs, inputs_mask, outputs_mask,
    vocabs_mask, outputs_mask_lm_labels, vocabs_mask_lm_labels,
    example_buffer)`` where ``example_buffer`` is the list of the (mutated)
    examples in input order.
    """
    encoder_pad_id = encoder_tokenizer.pad_token_id
    decoder_pad_id = decoder_tokenizer.pad_token_id

    inputs = []
    outputs = []
    vocabs = []
    example_buffer = []
    for example in data:
        example_buffer.append(example)

        # Tokenize word by word so each sub-token can be mapped back to the
        # whitespace-level word it came from (and vice versa).
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        words = ['[CLS]'] + example.input.split() + ['[SEP]']
        for word_index, word in enumerate(words):
            orig_to_tok_index.append(len(all_doc_tokens))
            for sub_token in encoder_tokenizer.tokenize(word):
                tok_to_orig_index.append(word_index)
                all_doc_tokens.append(sub_token)

        input_ids = encoder_tokenizer.convert_tokens_to_ids(all_doc_tokens)
        # Must be set before the output is translated: the example carrying
        # these mappings is passed to the translation helper below.
        example.tok_to_orig_index = tok_to_orig_index
        example.orig_to_tok_index = orig_to_tok_index
        inputs.append(
            fit_to_block_size(input_ids, input_block_size, encoder_pad_id))

        if example.output is not None:
            output_tokens = translate_tokenindex_to_subtokenindex(
                example, example.output.split(), example.vocab_indexes,
                example.fsa_states)
            output_ids = decoder_tokenizer.convert_tokens_to_ids(
                output_tokens)
        else:
            # No target: the decoder sequence is only the 'start' token.
            output_ids = decoder_tokenizer.convert_tokens_to_ids(['start'])

        outputs.append(
            fit_to_block_size(output_ids, output_block_size, decoder_pad_id))
        vocabs.append(
            fit_to_block_size(example.vocab_indexes, output_block_size,
                              decoder_pad_id))

    inputs = torch.tensor(inputs)
    outputs = torch.tensor(outputs)
    vocabs = torch.tensor(vocabs)

    inputs_mask = build_mask(inputs, encoder_pad_id)
    outputs_mask = build_mask(outputs, decoder_pad_id)
    vocabs_mask = build_mask(vocabs, decoder_pad_id)

    outputs_mask_lm_labels = build_lm_labels(outputs, decoder_pad_id)
    vocabs_mask_lm_labels = build_lm_labels(vocabs, decoder_pad_id)

    return (
        inputs,
        outputs,
        vocabs,
        inputs_mask,
        outputs_mask,
        vocabs_mask,
        outputs_mask_lm_labels,
        vocabs_mask_lm_labels,
        example_buffer,
    )