Example #1
import numpy as np

import mindspore.dataset as ds
import mindspore.dataset.text as text


def compare(in1, in2, length, out1, out2):
    data = ds.NumpySlicesDataset({"s1": [in1], "s2": [in2]})
    data = data.map(operations=text.TruncateSequencePair(length),
                    input_columns=["s1", "s2"])
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        np.testing.assert_array_equal(out1, d["s1"])
        np.testing.assert_array_equal(out2, d["s2"])
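For reference, a minimal call to the helper above (the values are illustrative and not taken from the original test file), assuming TruncateSequencePair follows the BERT-style rule of repeatedly dropping the last token of the longer column until the pair fits:

# Illustrative only: [1, 2, 3] and [4, 5] total 5 tokens, so with length=4
# one token is dropped from the end of the longer (first) column.
compare(in1=[1, 2, 3], in2=[4, 5], length=4, out1=[1, 2], out2=[4, 5])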
Example #2
import numpy as np

import mindspore.dataset.text as text


def test_callable():
    op = text.TruncateSequencePair(3)
    data = [["1", "2", "3"], ["4", "5"]]
    # The op can be applied eagerly, either by unpacking a list holding both columns ...
    result_text = op(*data)
    # ... or by passing the two columns directly.
    column1, column2 = op(["1", "2", "3"], ["4", "5"])
    assert np.array_equal(result_text[0], ['1', '2'])
    assert np.array_equal(result_text[1], ['4'])
    assert np.array_equal(column1, ['1', '2'])
    assert np.array_equal(column2, ['4'])
Example #3

import os

import numpy as np

import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.dataset.transforms.c_transforms as ops


def process_cmnli_clue_dataset(data_dir, label_list, bert_vocab_path,
                               data_usage='train', shuffle_dataset=False, max_seq_len=128, batch_size=64):
    """Process CMNLI dataset"""
    ### Loading CMNLI from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(input_columns=["id"], output_columns=["id", "label_id"],
                              columns_order=["id", "label_id", "sentence1", "sentence2"], operations=ops.Duplicate())
        dataset = dataset.map(input_columns=["label_id"], operations=ops.Fill(0))
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(input_columns="label", output_columns="label_id", operations=label_lookup)
    ### Processing sentence pairs
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    ### Tokenizing sentences and truncating the sequence pair
    dataset = dataset.map(input_columns=["sentence1"], operations=tokenizer)
    dataset = dataset.map(input_columns=["sentence2"], operations=tokenizer)
    dataset = dataset.map(input_columns=["sentence1", "sentence2"],
                          operations=text.TruncateSequencePair(max_seq_len-3))
    ### Adding special tokens
    dataset = dataset.map(input_columns=["sentence1"],
                          operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')))
    dataset = dataset.map(input_columns=["sentence2"],
                          operations=ops.Concatenate(append=np.array(["[SEP]"], dtype='S')))
    ### Generating segment_ids
    dataset = dataset.map(input_columns=["sentence1"], output_columns=["sentence1", "type_sentence1"],
                          columns_order=["sentence1", "type_sentence1", "sentence2", "label_id"],
                          operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["sentence2"], output_columns=["sentence2", "type_sentence2"],
                          columns_order=["sentence1", "type_sentence1", "sentence2", "type_sentence2", "label_id"],
                          operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["type_sentence1"], operations=[lookup, ops.Fill(0)])
    dataset = dataset.map(input_columns=["type_sentence2"], operations=[lookup, ops.Fill(1)])
    dataset = dataset.map(input_columns=["type_sentence1", "type_sentence2"], output_columns=["segment_ids"],
                          columns_order=["sentence1", "sentence2", "segment_ids", "label_id"],
                          operations=ops.Concatenate())
    dataset = dataset.map(input_columns=["segment_ids"], operations=ops.PadEnd([max_seq_len], 0))
    ### Generating text_ids
    dataset = dataset.map(input_columns=["sentence1", "sentence2"], output_columns=["text_ids"],
                          columns_order=["text_ids", "segment_ids", "label_id"],
                          operations=ops.Concatenate())
    dataset = dataset.map(input_columns=["text_ids"], operations=lookup)
    dataset = dataset.map(input_columns=["text_ids"], operations=ops.PadEnd([max_seq_len], 0))
    ### Generating mask_ids
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "mask_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids", "segment_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["mask_ids"], operations=ops.Mask(ops.Relational.NE, 0, mstype.int32))
    dataset = dataset.batch(batch_size)
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
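A sketch of how this preprocessing function might be invoked (the paths, label order, and keyword values below are placeholders, not taken from the original script); each of the four returned lists holds one batched array per batch, ordered as label_id, text_ids, mask_ids, segment_ids:

# Hypothetical call; replace the paths and label order with the real ones.
labels, text_ids, mask_ids, segment_ids = process_cmnli_clue_dataset(
    data_dir="/path/to/cmnli",                              # contains train.json / dev.json / test.json
    label_list=["contradiction", "entailment", "neutral"],  # assumed CMNLI label set and ordering
    bert_vocab_path="/path/to/vocab.txt",
    data_usage="eval",
    max_seq_len=128,
    batch_size=64)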