# Imports assumed by all snippets below (MindSpore 1.x dataset API layout).
import os

import numpy as np
import pytest

import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.dataset.transforms.c_transforms as ops
import mindspore.dataset.transforms.py_transforms as py_ops
import mindspore.dataset.vision.c_transforms as visions


def test_random_select_subpolicy():
    ds.config.set_seed(0)

    def test_config(arr, policy):
        try:
            data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False)
            data = data.map(operations=visions.RandomSelectSubpolicy(policy), input_columns=["col"])
            res = []
            for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    # 3 possible outcomes
    policy1 = [[(ops.PadEnd([4], 0), 0.5), (ops.Compose([ops.Duplicate(), ops.Concatenate()]), 1)],
               [(ops.Slice([0, 1]), 0.5), (ops.Duplicate(), 1), (ops.Concatenate(), 1)]]
    res1 = test_config([[1, 2, 3]], policy1)
    assert res1 in [[[1, 2, 1, 2]], [[1, 2, 3, 1, 2, 3]], [[1, 2, 3, 0, 1, 2, 3, 0]]]

    # test exceptions
    assert "policy can not be empty." in test_config([[1, 2, 3]], [])
    assert "policy[0] can not be empty." in test_config([[1, 2, 3]], [[]])
    assert "op of (op, prob) in policy[1][0] is neither a c_transform op (TensorOperation) nor a callable pyfunc" \
        in test_config([[1, 2, 3]], [[(ops.PadEnd([4], 0), 0.5)], [(1, 0.4)]])
    assert "prob of (op, prob) policy[1][0] is not within the required interval of [0, 1]" \
        in test_config([[1]], [[(ops.Duplicate(), 0)], [(ops.Duplicate(), -0.1)]])
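
# A minimal sketch of the policy structure RandomSelectSubpolicy expects, based on the
# test above (the semantics stated here are my reading, not guaranteed by this file):
# a policy is a list of subpolicies; one subpolicy is chosen at random per sample, and
# each (op, prob) pair inside it is then applied independently with probability `prob`.
#
#     policy = [
#         [(ops.PadEnd([4], 0), 0.5)],                     # subpolicy 0: pad half the time
#         [(ops.Duplicate(), 1), (ops.Concatenate(), 1)],  # subpolicy 1: always duplicate+concat
#     ]
#     data = data.map(operations=visions.RandomSelectSubpolicy(policy), input_columns=["col"])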

def test_random_apply():
    ds.config.set_seed(0)

    def test_config(arr, op_list, prob=0.5):
        try:
            data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False)
            data = data.map(operations=ops.RandomApply(op_list, prob), input_columns=["col"])
            res = []
            for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    # RandomApply applies the whole list with probability `prob`; both outcomes are valid
    res1 = test_config([[0, 1]], [ops.Duplicate(), ops.Concatenate()])
    assert res1 in [[[0, 1]], [[0, 1, 0, 1]]]

    # test single nested compose
    assert test_config([[0, 1, 2]],
                       [ops.Compose([ops.Duplicate(), ops.Concatenate(), ops.Slice([0, 1, 2])])]) == [[0, 1, 2]]

    # test exceptions
    assert "is not of type (<class 'list'>" in test_config([1, 0], ops.TypeCast(mstype.int32))
    assert "Input prob is not within the required interval" in test_config([0, 1], [ops.Slice([0, 1])], 1.1)
    assert "is not of type (<class 'float'>" in test_config([1, 0], [ops.TypeCast(mstype.int32)], None)
    assert "op_list with value None is not of type (<class 'list'>" in test_config([1, 0], None)

def process_ner_msra_dataset(data_dir, label_list, bert_vocab_path, max_seq_len=128, class_filter=None,
                             split_begin=None, split_end=None):
    """Process MSRA dataset"""
    ### Loading MSRA from CLUEDataset
    # process_msra is an external helper from the surrounding project.
    dataset = ds.GeneratorDataset(process_msra(data_dir, class_filter, split_begin, split_end),
                                  column_names=['text', 'label'])
    ### Processing label
    label_vocab = text.Vocab.from_list(label_list)
    label_lookup = text.Lookup(label_vocab)
    dataset = dataset.map(operations=label_lookup, input_columns="label", output_columns="label_ids")
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array([0], dtype='i')),
                          input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len)), input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["label_ids"])
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    unicode_char_tokenizer = text.UnicodeCharTokenizer()
    dataset = dataset.map(operations=unicode_char_tokenizer, input_columns=["text"], output_columns=["sentence"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len - 2)), input_columns=["sentence"])
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=lookup, input_columns=["sentence"], output_columns=["input_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["input_ids"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["input_ids"],
                          output_columns=["input_ids", "input_mask"],
                          column_order=["input_ids", "input_mask", "label_ids"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0, mstype.int32), input_columns=["input_mask"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["input_ids"],
                          output_columns=["input_ids", "segment_ids"],
                          column_order=["input_ids", "input_mask", "segment_ids", "label_ids"])
    dataset = dataset.map(operations=ops.Fill(0), input_columns=["segment_ids"])
    return dataset
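
# Hedged usage sketch for process_ner_msra_dataset; the directory, vocab file, and
# label set below are illustrative placeholders, not values from the original sources.
def example_ner_msra_usage():
    labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]  # hypothetical label set
    dataset = process_ner_msra_dataset("data/msra", labels, "bert_vocab.txt", max_seq_len=128)
    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        # All three id columns are padded/masked to max_seq_len by the pipeline above.
        print(row["input_ids"].shape, row["input_mask"].shape, row["label_ids"].shape)
        break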

def test_compose():
    """ Test C++ and Python Compose Op """
    ds.config.set_seed(0)

    def test_config(arr, op_list):
        try:
            data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False)
            data = data.map(operations=op_list, input_columns=["col"])
            res = []
            for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    # Test simple compose with only 1 op; this would generate a warning
    assert test_config([[1, 0], [3, 4]], ops.Compose([ops.Fill(2)])) == [[2, 2], [2, 2]]

    # Test 1 column -> 2 columns -> 1 -> 2 -> 1
    assert test_config([[1, 0]],
                       ops.Compose([ops.Duplicate(), ops.Concatenate(), ops.Duplicate(), ops.Concatenate()])) \
        == [[1, 0] * 4]

    # Test one Python transform followed by a C transform; type after OneHot is float (mixed use-case)
    assert test_config([1, 0],
                       ops.Compose([py_ops.OneHotOp(2), ops.TypeCast(mstype.int32)])) == [[[0, 1]], [[1, 0]]]

    # Test exceptions
    with pytest.raises(TypeError) as error_info:
        ops.Compose([1, ops.TypeCast(mstype.int32)])
    assert "op_list[0] is not a c_transform op (TensorOp) nor a callable pyfunc." in str(error_info.value)

    # Test empty op list
    with pytest.raises(ValueError) as error_info:
        test_config([1, 0], ops.Compose([]))
    assert "op_list can not be empty." in str(error_info.value)

    # Test Python Compose op
    assert test_config([1, 0], py_ops.Compose([py_ops.OneHotOp(2)])) == [[[0, 1]], [[1, 0]]]
    assert test_config([1, 0], py_ops.Compose([py_ops.OneHotOp(2), (lambda x: x + x)])) == [[[0, 2]], [[2, 0]]]

    # Test nested Python Compose op
    assert test_config([1, 0], py_ops.Compose([py_ops.Compose([py_ops.OneHotOp(2)]), (lambda x: x + x)])) \
        == [[[0, 2]], [[2, 0]]]

    with pytest.raises(TypeError) as error_info:
        py_ops.Compose([(lambda x: x + x)])()
    assert "Compose was called without an image. Fix invocation (avoid it being invoked as Compose([...])())." \
        in str(error_info.value)

def test_random_choice():
    """ Test RandomChoice op """
    ds.config.set_seed(0)

    def test_config(arr, op_list):
        try:
            data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False)
            data = data.map(operations=ops.RandomChoice(op_list), input_columns=["col"])
            res = []
            for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    # Test whether an operation would be randomly chosen.
    # In order to prevent random failure, both results need to be checked.
    res1 = test_config([[0, 1, 2]], [ops.PadEnd([4], 0), ops.Slice([0, 2])])
    assert res1 in [[[0, 1, 2, 0]], [[0, 2]]]

    # Test nested structure
    res2 = test_config([[0, 1, 2]], [ops.Compose([ops.Duplicate(), ops.Concatenate()]),
                                     ops.Compose([ops.Slice([0, 1]), ops.OneHot(2)])])
    assert res2 in [[[[1, 0], [0, 1]]], [[0, 1, 2, 0, 1, 2]]]

    # Test RandomChoice when there is only 1 operation
    assert test_config([[4, 3], [2, 1]], [ops.Slice([0])]) == [[4], [2]]

def compare(array):
    data = ds.NumpySlicesDataset([array], column_names="x")
    array = np.array(array)
    data = data.map(operations=ops.Duplicate(), input_columns=["x"],
                    output_columns=["x", "y"], column_order=["x", "y"])
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        np.testing.assert_array_equal(array, d["x"])
        np.testing.assert_array_equal(array, d["y"])
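
# Hedged usage sketch: Duplicate() copies column "x" into a new column "y", so compare()
# is expected to pass for any nested-list input; the inputs below are arbitrary examples.
def example_duplicate_usage():
    compare([1, 2, 3])         # 1-D input
    compare([[1, 2], [3, 4]])  # 2-D input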

def process_tnews_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage='train', shuffle_dataset=False,
                               max_seq_len=128, batch_size=64, drop_remainder=True):
    """Process TNEWS dataset"""
    ### Loading TNEWS from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(operations=ops.Duplicate(), input_columns=["id"],
                              output_columns=["id", "label_id"],
                              column_order=["id", "label_id", "sentence"])
        dataset = dataset.map(operations=ops.Fill(0), input_columns=["label_id"])
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(operations=label_lookup, input_columns="label_desc", output_columns="label_id")
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len)), input_columns=["sentence"])
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=lookup, input_columns=["sentence"], output_columns=["text_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["text_ids"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["text_ids"],
                          output_columns=["text_ids", "mask_ids"],
                          column_order=["text_ids", "mask_ids", "label_id"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0, mstype.int32), input_columns=["mask_ids"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["text_ids"],
                          output_columns=["text_ids", "segment_ids"],
                          column_order=["text_ids", "mask_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.Fill(0), input_columns=["segment_ids"])
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset
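
# Hedged usage sketch for process_tnews_clue_dataset; the data directory, vocab path,
# and label list below are placeholders (TNEWS label_desc values are strings such as
# "news_story", so the lookup vocab must be built from those strings).
def example_tnews_usage():
    labels = ["news_story", "news_culture", "news_entertainment"]  # hypothetical, truncated subset
    dataset = process_tnews_clue_dataset("data/tnews", labels, "bert_vocab.txt",
                                         data_usage='train', batch_size=32)
    for batch in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        print(batch["text_ids"].shape, batch["mask_ids"].shape, batch["label_id"].shape)
        break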

# C-transform-only variant of the Compose test above; given a distinct name so it does
# not shadow test_compose when both live in one module.
def test_c_compose():
    ds.config.set_seed(0)

    def test_config(arr, op_list):
        try:
            data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False)
            data = data.map(operations=ops.Compose(op_list), input_columns=["col"])
            res = []
            for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    # test simple compose with only 1 op; this would generate a warning
    assert test_config([[1, 0], [3, 4]], [ops.Fill(2)]) == [[2, 2], [2, 2]]

    # test 1 column -> 2 columns -> 1 -> 2 -> 1
    assert test_config([[1, 0]], [ops.Duplicate(), ops.Concatenate(), ops.Duplicate(), ops.Concatenate()]) \
        == [[1, 0] * 4]

    # test one Python transform followed by a C transform; type after OneHot is float (mixed use-case)
    assert test_config([1, 0], [py_ops.OneHotOp(2), ops.TypeCast(mstype.int32)]) == [[[0, 1]], [[1, 0]]]

    # test exceptions; Compose, RandomApply and RandomChoice use the same validator
    assert "op_list[0] is not a c_transform op" in test_config([1, 0], [1, ops.TypeCast(mstype.int32)])

    # test empty op list
    assert "op_list can not be empty." in test_config([1, 0], [])

def process_cmnli_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage='train', shuffle_dataset=False,
                               max_seq_len=128, batch_size=64):
    """Process CMNLI dataset"""
    ### Loading CMNLI from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(operations=ops.Duplicate(), input_columns=["id"],
                              output_columns=["id", "label_id"],
                              column_order=["id", "label_id", "sentence1", "sentence2"])
        dataset = dataset.map(operations=ops.Fill(0), input_columns=["label_id"])
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(operations=label_lookup, input_columns="label", output_columns="label_id")
    ### Processing sentence pairs
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    ### Tokenizing sentences and truncating the sequence pair
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence1"])
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence2"])
    dataset = dataset.map(operations=text.TruncateSequencePair(max_seq_len - 3),
                          input_columns=["sentence1", "sentence2"])
    ### Adding special tokens
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence1"])
    dataset = dataset.map(operations=ops.Concatenate(append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence2"])
    ### Generating segment_ids
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["sentence1"],
                          output_columns=["sentence1", "type_sentence1"],
                          column_order=["sentence1", "type_sentence1", "sentence2", "label_id"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["sentence2"],
                          output_columns=["sentence2", "type_sentence2"],
                          column_order=["sentence1", "type_sentence1", "sentence2", "type_sentence2", "label_id"])
    dataset = dataset.map(operations=[lookup, ops.Fill(0)], input_columns=["type_sentence1"])
    dataset = dataset.map(operations=[lookup, ops.Fill(1)], input_columns=["type_sentence2"])
    dataset = dataset.map(operations=ops.Concatenate(), input_columns=["type_sentence1", "type_sentence2"],
                          output_columns=["segment_ids"],
                          column_order=["sentence1", "sentence2", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["segment_ids"])
    ### Generating text_ids
    dataset = dataset.map(operations=ops.Concatenate(), input_columns=["sentence1", "sentence2"],
                          output_columns=["text_ids"],
                          column_order=["text_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=lookup, input_columns=["text_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["text_ids"])
    ### Generating mask_ids
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["text_ids"],
                          output_columns=["text_ids", "mask_ids"],
                          column_order=["label_id", "text_ids", "mask_ids", "segment_ids"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0, mstype.int32), input_columns=["mask_ids"])
    dataset = dataset.batch(batch_size)
    # Columns are ordered [label_id, text_ids, mask_ids, segment_ids] by the last map above,
    # so the positional indexing below follows that order.
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
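
# Hedged usage sketch for process_cmnli_clue_dataset; the paths are placeholders. CMNLI is
# a three-way NLI task, so the label list mirrors its standard label strings.
def example_cmnli_usage():
    labels = ["neutral", "entailment", "contradiction"]
    label, text_ids, mask_ids, segment_ids = process_cmnli_clue_dataset(
        "data/cmnli", labels, "bert_vocab.txt", data_usage='train', batch_size=32)
    # Each list holds one tensor per batch; all id tensors share the shape (batch, max_seq_len).
    print(len(label), "batches; first text_ids batch shape:", text_ids[0].shape)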