'output_keys': [
            'input_1', 'ground_truth_1', 'lan_idx_for_input_1',
            'lan_idx_for_gt_1', 'pos_for_gt_1', 'tokenizer'
        ]
    }]

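    # encode: run the preprocessing pipeline over the parallel ro-en data,
    # yielding token ids (x, y), language indices (lan_x, lan_y), soft
    # positions for the ground truth (soft_pos_y) and the trained tokenizer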
    print('\n------------------- Encoding -------------------------')
    x, y, lan_x, lan_y, soft_pos_y, tokenizer = utils.pipeline(
        preprocess_pipeline=pipeline,
        lan_data_1=origin_ro_data,
        lan_data_2=origin_en_data,
        params={
            **params,
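            # presumably uncomment to reuse an already-trained tokenizer
            # instead of training one inside the pipeline: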
            # 'tokenizer': tokenizer
        })

    print('\n----------------------------------------------')
    print(x.shape)
    print(y.shape)
    print(lan_x.shape)
    print(lan_y.shape)
    print(soft_pos_y.shape)

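    # decode the first two encoded samples back to text as a sanity check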
    print('\n------------------- Decoding -------------------------')
    x = utils.pipeline(decode_pl(''), x[:2], None, {'tokenizer': tokenizer})
    y = utils.pipeline(decode_pl(''), y[:2], None, {'tokenizer': tokenizer})
    print(x[0])
    print(y[0])
    print(soft_pos_y[0])
    print(x[1])
    print(y[1])
    print(soft_pos_y[1])
Example #2
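    # build the zh-en pipeline: segment the Chinese side with jieba, strip
    # noise, train a shared tokenizer, then tokenize, sample, combine and
    # CDLM-encode the sentence pairs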
    pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise + tfds_share_pl.train_tokenizer
    # pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise
    pipeline += pl.sent_2_tokens + sample_pl(2.0) + combine_pl(0.2) + \
        pl.CDLM_encode + [{
            'output_keys': [
                'input_1', 'ground_truth_1', 'lan_idx_for_input_1',
                'lan_idx_for_gt_1', 'pos_for_gt_1', 'tokenizer'
            ]
        }]

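    # encode only the first 1000 sentence pairs from each side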
    print('\n------------------- Encoding -------------------------')
    x, y, lan_x, lan_y, soft_pos_y, tokenizer = utils.pipeline(
        preprocess_pipeline=pipeline,
        lan_data_1=origin_zh_data[:1000],
        lan_data_2=origin_en_data[:1000],
        params={
            **params,
            # 'tokenizer': tokenizer
        })

    print('\n----------------------------------------------')
    print(x.shape)
    print(y.shape)
    print(lan_x.shape)
    print(lan_y.shape)
    print(soft_pos_y.shape)

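    # decode with the 'ner' decode pipeline to inspect the encoded ids as text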
    print('\n------------------- Decoding -------------------------')
    x = utils.pipeline(decode_pl('ner'), x, None, {'tokenizer': tokenizer})
    y = utils.pipeline(decode_pl('ner'), y, None, {'tokenizer': tokenizer})