def process_ner_msra_dataset(data_dir,
                             label_list,
                             bert_vocab_path,
                             max_seq_len=128,
                             class_filter=None,
                             split_begin=None,
                             split_end=None):
    """Process MSRA dataset"""
    ### Loading MSRA with GeneratorDataset
    dataset = ds.GeneratorDataset(process_msra(data_dir, class_filter,
                                               split_begin, split_end),
                                  column_names=['text', 'label'])

    ### Processing label
    label_vocab = text.Vocab.from_list(label_list)
    label_lookup = text.Lookup(label_vocab)
    dataset = dataset.map(operations=label_lookup,
                          input_columns="label",
                          output_columns="label_ids")
    dataset = dataset.map(
        operations=ops.Concatenate(prepend=np.array([0], dtype='i')),
        input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len)),
                          input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["label_ids"])
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    unicode_char_tokenizer = text.UnicodeCharTokenizer()
    dataset = dataset.map(operations=unicode_char_tokenizer,
                          input_columns=["text"],
                          output_columns=["sentence"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len - 2)),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=ops.Concatenate(
        prepend=np.array(["[CLS]"], dtype='S'),
        append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=lookup,
                          input_columns=["sentence"],
                          output_columns=["input_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["input_ids"])
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["input_ids"],
        output_columns=["input_ids", "input_mask"],
        column_order=["input_ids", "input_mask", "label_ids"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0,
                                              mstype.int32),
                          input_columns=["input_mask"])
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["input_ids"],
        output_columns=["input_ids", "segment_ids"],
        column_order=["input_ids", "input_mask", "segment_ids", "label_ids"])
    dataset = dataset.map(operations=ops.Fill(0),
                          input_columns=["segment_ids"])
    return dataset
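A minimal usage sketch for the function above. The paths and BIO label list are placeholders (not from the original code), and process_msra is assumed to be the generator defined elsewhere in the same module:
def build_msra_example():
    """Sketch only: paths and the label list are hypothetical placeholders."""
    labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
    ner_ds = process_ner_msra_dataset("/path/to/msra", labels,
                                      "/path/to/vocab.txt", max_seq_len=128)
    # Each row carries the four columns produced above.
    for row in ner_ds.create_dict_iterator(num_epochs=1, output_numpy=True):
        print(row["input_ids"].shape, row["input_mask"].shape,
              row["segment_ids"].shape, row["label_ids"].shape)
        break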
def test_random_apply():
    """
    Test RandomApply op
    """
    ds.config.set_seed(0)

    def test_config(arr, op_list, prob=0.5):
        try:
            data = ds.NumpySlicesDataset(arr,
                                         column_names="col",
                                         shuffle=False)
            data = data.map(input_columns=["col"],
                            operations=ops.RandomApply(op_list, prob))
            res = []
            for i in data.create_dict_iterator(num_epochs=1,
                                               output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    res1 = test_config([[0, 1]], [ops.Duplicate(), ops.Concatenate()])
    assert res1 in [[[0, 1]], [[0, 1, 0, 1]]]
    # test single nested compose
    assert test_config([[0, 1, 2]], [
        ops.Compose([ops.Duplicate(),
                     ops.Concatenate(),
                     ops.Slice([0, 1, 2])])
    ]) == [[0, 1, 2]]
    # test exception
    assert "is not of type (<class 'list'>" in test_config([1, 0],
                                                           ops.TypeCast(
                                                               mstype.int32))
    assert "Input prob is not within the required interval" in test_config(
        [0, 1], [ops.Slice([0, 1])], 1.1)
    assert "is not of type (<class 'float'>" in test_config(
        [1, 0], [ops.TypeCast(mstype.int32)], None)
    assert "op_list with value None is not of type (<class 'list'>" in test_config(
        [1, 0], None)
def test_random_choice():
    """
    Test RandomChoice op
    """
    ds.config.set_seed(0)

    def test_config(arr, op_list):
        try:
            data = ds.NumpySlicesDataset(arr,
                                         column_names="col",
                                         shuffle=False)
            data = data.map(operations=ops.RandomChoice(op_list),
                            input_columns=["col"])
            res = []
            for i in data.create_dict_iterator(num_epochs=1,
                                               output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    # Test whether an operation would be randomly chosen.
    # In order to prevent random failure, both results need to be checked.
    res1 = test_config([[0, 1, 2]], [ops.PadEnd([4], 0), ops.Slice([0, 2])])
    assert res1 in [[[0, 1, 2, 0]], [[0, 2]]]

    # Test nested structure
    res2 = test_config([[0, 1, 2]], [
        ops.Compose([ops.Duplicate(), ops.Concatenate()]),
        ops.Compose([ops.Slice([0, 1]), ops.OneHot(2)])
    ])
    assert res2 in [[[[1, 0], [0, 1]]], [[0, 1, 2, 0, 1, 2]]]
    # Test RandomChoice where there is only 1 operation
    assert test_config([[4, 3], [2, 1]], [ops.Slice([0])]) == [[4], [2]]
Example 4
def slice_compare(array, indexing, expected_array):
    """Apply Slice with the given indexing to array and compare against expected_array."""
    data = ds.NumpySlicesDataset([array])
    if isinstance(indexing, list) and indexing and not isinstance(indexing[0], int):
        data = data.map(operations=ops.Slice(*indexing))
    else:
        data = data.map(operations=ops.Slice(indexing))
    for d in data.create_dict_iterator(output_numpy=True):
        np.testing.assert_array_equal(expected_array, d['column_0'])
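A couple of illustrative calls to this helper; the values are assumed for demonstration and follow directly from NumPy slicing semantics:
def test_slice_compare_examples():
    """Illustrative calls to slice_compare (assumed values, not from the original suite)."""
    slice_compare([1, 2, 3, 4, 5], slice(0, 3), [1, 2, 3])   # Python slice object
    slice_compare([1, 2, 3, 4, 5], [0, 4], [1, 5])           # explicit index list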
def test_random_select_subpolicy():
    """
    Test RandomSelectSubpolicy op
    """
    ds.config.set_seed(0)

    def test_config(arr, policy):
        try:
            data = ds.NumpySlicesDataset(arr,
                                         column_names="col",
                                         shuffle=False)
            data = data.map(operations=visions.RandomSelectSubpolicy(policy),
                            input_columns=["col"])
            res = []
            for i in data.create_dict_iterator(num_epochs=1,
                                               output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    # 3 possible outcomes
    policy1 = [[(ops.PadEnd([4], 0), 0.5),
                (ops.Compose([ops.Duplicate(),
                              ops.Concatenate()]), 1)],
               [(ops.Slice([0, 1]), 0.5), (ops.Duplicate(), 1),
                (ops.Concatenate(), 1)]]
    res1 = test_config([[1, 2, 3]], policy1)
    assert res1 in [[[1, 2, 1, 2]], [[1, 2, 3, 1, 2, 3]],
                    [[1, 2, 3, 0, 1, 2, 3, 0]]]

    # test exceptions
    assert "policy can not be empty." in test_config([[1, 2, 3]], [])
    assert "policy[0] can not be empty." in test_config([[1, 2, 3]], [[]])
    assert "op of (op, prob) in policy[1][0] is neither a c_transform op (TensorOperation) nor a callable pyfunc" \
           in test_config([[1, 2, 3]], [[(ops.PadEnd([4], 0), 0.5)], [(1, 0.4)]])
    assert "prob of (op, prob) policy[1][0] is not within the required interval of [0, 1]" in test_config(
        [[1]], [[(ops.Duplicate(), 0)], [(ops.Duplicate(), -0.1)]])
def test_eager_slice():
    """
    Test Slice op is callable
    """
    indexing = [[0], [0, 3]]
    slice_op = data_trans.Slice(*indexing)
    expected = np.array([[1, 4]])
    assert np.array_equal(slice_op([[1, 2, 3, 4, 5]]), expected)
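The eager-call pattern above should extend to other c_transforms ops; a hedged sketch assuming the same data_trans alias for mindspore.dataset.transforms.c_transforms:
def test_eager_pad_end():
    """
    Sketch: PadEnd called eagerly on a Python list (assumed to mirror test_eager_slice behavior)
    """
    pad_op = data_trans.PadEnd([4], 0)
    expected = np.array([1, 2, 0, 0])
    assert np.array_equal(pad_op([1, 2]), expected)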
Example 7
def slice_compare(array, indexing):
    """Apply Slice with the given indexing and compare against NumPy's own slicing."""
    data = ds.NumpySlicesDataset([array])
    array = np.array(array)
    data = data.map(operations=ops.Slice(indexing))
    for d in data:
        if indexing is None:
            array = array[:]
        else:
            array = array[indexing]
        np.testing.assert_array_equal(array, d[0])
Example 8
def test_slice_none_and_ellipsis():
    """
    Test passing None and Ellipsis to Slice
    """
    dataset = [[1], [3, 4, 5], [1, 2], [1, 2, 3, 4, 5, 6, 7]]
    exp_dataset = [[1], [3, 4, 5], [1, 2], [1, 2, 3, 4, 5, 6, 7]]

    def gen():
        for row in dataset:
            yield (np.array(row),)

    data = ds.GeneratorDataset(gen, column_names=["col"])
    data = data.map(operations=ops.Slice(None))
    for (d, exp_d) in zip(data.create_dict_iterator(output_numpy=True), exp_dataset):
        np.testing.assert_array_equal(exp_d, d['col'])

    data = ds.GeneratorDataset(gen, column_names=["col"])
    data = data.map(operations=ops.Slice(Ellipsis))
    for (d, exp_d) in zip(data.create_dict_iterator(output_numpy=True), exp_dataset):
        np.testing.assert_array_equal(exp_d, d['col'])
def process_tnews_clue_dataset(data_dir, label_list, bert_vocab_path,
                               data_usage='train', shuffle_dataset=False, max_seq_len=128, batch_size=64):
    """Process TNEWS dataset"""
    ### Loading TNEWS from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(input_columns=["id"], output_columns=["id", "label_id"],
                              columns_order=["id", "label_id", "sentence"], operations=ops.Duplicate())
        dataset = dataset.map(input_columns=["label_id"], operations=ops.Fill(0))
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(input_columns="label_desc", output_columns="label_id", operations=label_lookup)
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    dataset = dataset.map(input_columns=["sentence"], operations=tokenizer)
    dataset = dataset.map(input_columns=["sentence"], operations=ops.Slice(slice(0, max_seq_len)))
    dataset = dataset.map(input_columns=["sentence"],
                          operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')))
    dataset = dataset.map(input_columns=["sentence"], output_columns=["text_ids"], operations=lookup)
    dataset = dataset.map(input_columns=["text_ids"], operations=ops.PadEnd([max_seq_len], 0))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "mask_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["mask_ids"], operations=ops.Mask(ops.Relational.NE, 0, mstype.int32))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "segment_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids", "segment_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["segment_ids"], operations=ops.Fill(0))
    dataset = dataset.batch(batch_size)
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
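A hedged usage sketch for the TNEWS pipeline above; the paths and the (truncated) label_desc list are placeholders, not from the original code:
def build_tnews_example():
    """Sketch only: paths are hypothetical and the label list is a truncated placeholder."""
    tnews_labels = ["news_story", "news_culture", "news_entertainment"]  # use the full TNEWS label_desc list
    label, text_ids, mask_ids, segment_ids = process_tnews_clue_dataset(
        "/path/to/tnews", tnews_labels, "/path/to/vocab.txt",
        data_usage='eval', max_seq_len=128, batch_size=64)
    print(len(label), "batches of eval data")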
Example 10
def create_tinybert_dataset(batch_size=32,
                            device_num=1,
                            rank=0,
                            do_shuffle="true",
                            data_dir=None,
                            data_type='tfrecord',
                            seq_length=128,
                            task_type='classification',
                            drop_remainder=True):
    """create tinybert dataset"""
    if isinstance(data_dir, list):
        data_files = data_dir
    else:
        data_files = [data_dir]

    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]

    shuffle = (do_shuffle == "true")

    if data_type == 'mindrecord':
        ds = de.MindDataset(data_files,
                            columns_list=columns_list,
                            shuffle=shuffle,
                            num_shards=device_num,
                            shard_id=rank)
    else:
        ds = de.TFRecordDataset(data_files,
                                columns_list=columns_list,
                                shuffle=shuffle,
                                num_shards=device_num,
                                shard_id=rank,
                                shard_equal_rows=(device_num == 1))

    if device_num == 1 and shuffle is True:
        ds = ds.shuffle(10000)

    type_cast_op = C.TypeCast(mstype.int32)
    slice_op = C.Slice(slice(0, seq_length, 1))
    label_type = mstype.int32 if task_type == 'classification' else mstype.float32
    ds = ds.map(operations=[type_cast_op, slice_op],
                input_columns=["segment_ids"])
    ds = ds.map(operations=[type_cast_op, slice_op],
                input_columns=["input_mask"])
    ds = ds.map(operations=[type_cast_op, slice_op],
                input_columns=["input_ids"])
    ds = ds.map(operations=[C.TypeCast(label_type), slice_op],
                input_columns=["label_ids"])
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)

    return ds
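A hedged usage sketch for create_tinybert_dataset; the TFRecord path is a placeholder and single-device settings are assumed:
def build_tinybert_example():
    """Sketch only: the data path is a hypothetical placeholder."""
    dataset = create_tinybert_dataset(batch_size=32,
                                      device_num=1,
                                      rank=0,
                                      do_shuffle="true",
                                      data_dir=["/path/to/train.tf_record"],
                                      data_type='tfrecord',
                                      seq_length=128,
                                      task_type='classification')
    print("batches per epoch:", dataset.get_dataset_size())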
Example 11
def test_out_of_bounds_slicing_str():
    """
    Test passing indices outside of the input to the slice objects
    """
    slice_compare([b"1", b"2", b"3", b"4", b"5"], slice(-15, -1), [b"1", b"2", b"3", b"4"])
    slice_compare([b"1", b"2", b"3", b"4", b"5"], slice(-15, 15), [b"1", b"2", b"3", b"4", b"5"])

    indexing = slice(-15, -7)
    expected_array = np.array([], dtype="S")
    data = [b"1", b"2", b"3", b"4", b"5"]
    data = ds.NumpySlicesDataset([data])
    data = data.map(operations=ops.Slice(indexing))
    for d in data.create_dict_iterator(output_numpy=True):
        np.testing.assert_array_equal(expected_array, d['column_0'])
Example 12
def test_slice_multiple_rows():
    """
    Test Slice op applied to multiple rows of varying length
    """
    dataset = [[1, 2], [3, 4, 5], [1], [1, 2, 3, 4, 5, 6, 7]]

    def gen():
        for row in dataset:
            yield (np.array(row), )

    data = ds.GeneratorDataset(gen, column_names=["col"])
    indexing = slice(0, 4)
    data = data.map(operations=ops.Slice(indexing))
    for i, d in enumerate(data):
        array = np.array(dataset[i])
        array = array[indexing]
        np.testing.assert_array_equal(array, d[0])
Example 13
def test_slice_multiple_rows():
    """
    Test passing in multiple rows
    """
    dataset = [[1], [3, 4, 5], [1, 2], [1, 2, 3, 4, 5, 6, 7]]
    exp_dataset = [[], [4, 5], [2], [2, 3, 4]]

    def gen():
        for row in dataset:
            yield (np.array(row),)

    data = ds.GeneratorDataset(gen, column_names=["col"])
    indexing = slice(1, 4)
    data = data.map(operations=ops.Slice(indexing))
    for (d, exp_d) in zip(data.create_dict_iterator(output_numpy=True), exp_dataset):
        np.testing.assert_array_equal(exp_d, d['col'])