# Shared imports assumed by the dataset helpers below (module paths follow the older
# MindSpore model zoo layout and may differ between MindSpore versions). Objects such as
# cfg, bert_net_cfg, bert_train_cfg, DATA_DIR, SCHEMA_DIR, DataType and MyDatasetGenerator
# come from each helper's own module or config and are not defined here.
import os

import mindspore.common.dtype as mstype
import mindspore.dataset as de
import mindspore.dataset.transforms.c_transforms as C
from mindspore import log as logger
from mindspore.dataset import GeneratorDataset


def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None,
                         schema_file_path=None, is_training=True, do_shuffle=True):
    """create finetune or evaluation dataset"""
    type_cast_op = C.TypeCast(mstype.int32)
    if is_training:
        ds = de.TFRecordDataset([data_file_path],
                                schema_file_path if schema_file_path != "" else None,
                                columns_list=["input_ids", "input_mask", "segment_ids",
                                              "start_positions", "end_positions",
                                              "unique_ids", "is_impossible"],
                                shuffle=do_shuffle)
        ds = ds.map(operations=type_cast_op, input_columns="start_positions")
        ds = ds.map(operations=type_cast_op, input_columns="end_positions")
    else:
        ds = de.TFRecordDataset([data_file_path],
                                schema_file_path if schema_file_path != "" else None,
                                columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"])
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    ds = ds.repeat(repeat_count)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
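# Hypothetical usage sketch for create_squad_dataset above (file paths are placeholders, not
# from the original source). The training branch additionally casts start_positions and
# end_positions to int32 and shuffles; the evaluation branch reads only the four columns
# needed for inference.
def _example_squad_pipelines():
    train_ds = create_squad_dataset(batch_size=32, repeat_count=1,
                                    data_file_path="squad_train.tf_record",
                                    schema_file_path="", is_training=True, do_shuffle=True)
    eval_ds = create_squad_dataset(batch_size=1, repeat_count=1,
                                   data_file_path="squad_dev.tf_record",
                                   schema_file_path="", is_training=False)
    return train_ds, eval_ds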
def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
                       data_file_path=None, schema_file_path=None, do_shuffle=True):
    """create finetune or evaluation dataset"""
    type_cast_op = C.TypeCast(mstype.int32)
    ds = de.TFRecordDataset([data_file_path],
                            schema_file_path if schema_file_path != "" else None,
                            columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"],
                            shuffle=do_shuffle)
    if assessment_method == "Spearman_correlation":
        type_cast_op_float = C.TypeCast(mstype.float32)
        ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
    else:
        ds = ds.map(operations=type_cast_op, input_columns="label_ids")
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    ds = ds.repeat(repeat_count)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0,
                            do_shuffle="true", data_dir=None, schema_dir=None):
    """create tinybert dataset"""
    files = os.listdir(data_dir)
    data_files = []
    for file_name in files:
        if "record" in file_name:
            data_files.append(os.path.join(data_dir, file_name))
    if task == "td":
        columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    else:
        columns_list = ["input_ids", "input_mask", "segment_ids"]
    ds = de.TFRecordDataset(data_files, schema_dir, columns_list=columns_list,
                            shuffle=(do_shuffle == "true"), num_shards=device_num,
                            shard_id=rank, shard_equal_rows=True)
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
    if task == "td":
        ds = ds.map(input_columns="label_ids", operations=type_cast_op)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None):
    """create train dataset"""
    # apply repeat operations
    files = os.listdir(data_dir)
    data_files = []
    for file_name in files:
        if "tf_record" in file_name:
            data_files.append(os.path.join(data_dir, file_name))
    print(data_files)
    ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
                            columns_list=["input_ids", "input_mask", "segment_ids",
                                          "next_sentence_labels", "masked_lm_positions",
                                          "masked_lm_ids", "masked_lm_weights"],
                            shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
                            num_shards=device_num, shard_id=rank, shard_equal_rows=True)
    ori_dataset_size = ds.get_dataset_size()
    print('origin dataset size: ', ori_dataset_size)
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
    ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
    ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    # apply batch operations
    ds = ds.batch(cfg.batch_size, drop_remainder=True)
    logger.info("data size: {}".format(ds.get_dataset_size()))
    logger.info("repeat count: {}".format(ds.get_repeat_count()))
    return ds
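# Hypothetical distributed usage sketch for create_bert_dataset above (the data path and the
# communication setup are assumptions, not from the original source; cfg.batch_size still
# comes from the model's own config module). With do_shuffle="true" only the file reading
# order is shuffled (de.Shuffle.FILES), not the individual rows.
def _example_distributed_pretrain_loader(data_dir="./pretrain_tfrecords", schema_dir=""):
    from mindspore.communication.management import get_group_size, get_rank
    # assumes mindspore.communication.management.init() was called by the training script
    return create_bert_dataset(device_num=get_group_size(), rank=get_rank(),
                               do_shuffle="true", data_dir=data_dir, schema_dir=schema_dir)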
def me_de_train_dataset(sink_mode=False):
    """test me de train dataset"""
    # apply repeat operations
    repeat_count = 1
    sink_size = -1
    batch_size = 16
    ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR,
                            columns_list=["input_ids", "input_mask", "segment_ids",
                                          "next_sentence_labels", "masked_lm_positions",
                                          "masked_lm_ids", "masked_lm_weights"],
                            shuffle=False)
    type_cast_op = C.TypeCast(mstype.int32)
    new_repeat_count = repeat_count
    if sink_mode:
        sink_size = 100
        new_repeat_count = 3
    ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
    ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
    ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    logger.info("data size: {}".format(ds.get_dataset_size()))
    logger.info("repeat_count: {}".format(ds.get_repeat_count()))
    return ds, new_repeat_count, sink_size
def create_tinybert_dataset(batch_size=32, device_num=1, rank=0, do_shuffle="true",
                            data_dir=None, data_type='tfrecord', seq_length=128,
                            task_type=mstype.int32, drop_remainder=True):
    """create tinybert dataset"""
    if isinstance(data_dir, list):
        data_files = data_dir
    else:
        data_files = [data_dir]
    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    shuffle = (do_shuffle == "true")
    if data_type == 'mindrecord':
        ds = de.MindDataset(data_files, columns_list=columns_list, shuffle=shuffle,
                            num_shards=device_num, shard_id=rank)
    else:
        ds = de.TFRecordDataset(data_files, columns_list=columns_list, shuffle=shuffle,
                                num_shards=device_num, shard_id=rank,
                                shard_equal_rows=(device_num == 1))
    if device_num == 1 and shuffle is True:
        ds = ds.shuffle(10000)
    type_cast_op = C.TypeCast(mstype.int32)
    slice_op = C.Slice(slice(0, seq_length, 1))
    label_type = mstype.int32 if task_type == 'classification' else mstype.float32
    ds = ds.map(operations=[type_cast_op, slice_op], input_columns=["segment_ids"])
    ds = ds.map(operations=[type_cast_op, slice_op], input_columns=["input_mask"])
    ds = ds.map(operations=[type_cast_op, slice_op], input_columns=["input_ids"])
    ds = ds.map(operations=[C.TypeCast(label_type), slice_op], input_columns=["label_ids"])
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    return ds
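# Hypothetical usage sketch for the loader above (file names are placeholders, not from the
# original source): label_ids are cast to int32 only when task_type is the string
# 'classification'; any other value, including the mstype.int32 default, yields float32
# labels, e.g. for regression tasks such as STS-B.
def _example_td_loaders(train_file="train.tf_record", eval_file="eval.tf_record"):
    cls_ds = create_tinybert_dataset(batch_size=32, data_dir=[train_file],
                                     task_type='classification', seq_length=128)
    reg_ds = create_tinybert_dataset(batch_size=32, data_dir=[eval_file],
                                     task_type='regression', seq_length=64,
                                     do_shuffle="false", drop_remainder=False)
    return cls_ds, reg_ds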
def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true",
                        enable_data_sink="true", data_sink_steps=1,
                        data_dir=None, schema_dir=None):
    """create train dataset"""
    # apply repeat operations
    repeat_count = epoch_size
    files = os.listdir(data_dir)
    data_files = []
    for file_name in files:
        if "tfrecord" in file_name:
            data_files.append(os.path.join(data_dir, file_name))
    ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
                            columns_list=["input_ids", "input_mask", "segment_ids",
                                          "next_sentence_labels", "masked_lm_positions",
                                          "masked_lm_ids", "masked_lm_weights"],
                            shuffle=(do_shuffle == "true"), num_shards=device_num,
                            shard_id=rank, shard_equal_rows=True)
    ori_dataset_size = ds.get_dataset_size()
    print('origin dataset size: ', ori_dataset_size)
    new_size = ori_dataset_size
    if enable_data_sink == "true":
        new_size = data_sink_steps * bert_net_cfg.batch_size
    ds.set_dataset_size(new_size)
    new_repeat_count = int(repeat_count * ori_dataset_size // ds.get_dataset_size())
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(input_columns="masked_lm_ids", operations=type_cast_op)
    ds = ds.map(input_columns="masked_lm_positions", operations=type_cast_op)
    ds = ds.map(input_columns="next_sentence_labels", operations=type_cast_op)
    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
    # apply batch operations
    ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True)
    ds = ds.repeat(max(new_repeat_count, repeat_count))
    logger.info("data size: {}".format(ds.get_dataset_size()))
    logger.info("repeat count: {}".format(ds.get_repeat_count()))
    return ds, new_repeat_count
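# Worked example of the data-sink bookkeeping in create_bert_dataset above (illustrative
# numbers, not from the original source): overriding the dataset size to
# data_sink_steps * batch_size samples shortens each "epoch" to one sink call, so the repeat
# count is scaled up to keep the total number of samples seen unchanged.
def _example_sink_repeat_count(epoch_size=1, ori_dataset_size=320000,
                               batch_size=32, data_sink_steps=100):
    new_size = data_sink_steps * batch_size                            # 100 * 32 = 3200
    new_repeat_count = int(epoch_size * ori_dataset_size // new_size)  # 1 * 320000 // 3200 = 100
    return new_size, new_repeat_count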
def me_de_train_dataset():
    """test me de train dataset"""
    # apply repeat operations
    repeat_count = 1
    ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR,
                            columns_list=["input_ids", "input_mask", "segment_ids",
                                          "next_sentence_labels", "masked_lm_positions",
                                          "masked_lm_ids", "masked_lm_weights"],
                            shuffle=False)
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(input_columns="masked_lm_ids", operations=type_cast_op)
    ds = ds.map(input_columns="masked_lm_positions", operations=type_cast_op)
    ds = ds.map(input_columns="next_sentence_labels", operations=type_cast_op)
    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
    # apply batch operations
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    ds = ds.batch(batch_size, drop_remainder=True)
    ds = ds.repeat(repeat_count)
    return ds
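# The next loader's signature references DataType, which is defined alongside it in its own
# module; a minimal sketch consistent with how it is used (two markers selecting between
# TFRecord and MindRecord input) could look like this -- an assumption, not necessarily the
# original definition.
from enum import Enum

class DataType(Enum):
    """Supported input formats for create_tinybert_dataset."""
    TFRECORD = 1
    MINDRECORD = 2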
def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0, do_shuffle="true",
                            data_dir=None, schema_dir=None, data_type=DataType.TFRECORD):
    """create tinybert dataset"""
    files = os.listdir(data_dir)
    data_files = []
    for file_name in files:
        if "record" in file_name and "db" not in file_name:
            data_files.append(os.path.join(data_dir, file_name))
    if task == "td":
        columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    else:
        columns_list = ["input_ids", "input_mask", "segment_ids"]

    shard_equal_rows = True
    shuffle = (do_shuffle == "true")
    if device_num == 1:
        shard_equal_rows = False
        shuffle = False

    if data_type == DataType.MINDRECORD:
        ds = de.MindDataset(data_files, columns_list=columns_list,
                            shuffle=(do_shuffle == "true"),
                            num_shards=device_num, shard_id=rank)
    else:
        ds = de.TFRecordDataset(data_files, schema_dir, columns_list=columns_list,
                                shuffle=shuffle, num_shards=device_num, shard_id=rank,
                                shard_equal_rows=shard_equal_rows)
    if device_num == 1 and shuffle is True:
        ds = ds.shuffle(10000)

    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    if task == "td":
        ds = ds.map(operations=type_cast_op, input_columns="label_ids")
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
def create_train_dataset(batch_size):
    """create train dataset"""
    # apply repeat operations
    repeat_count = bert_train_cfg.epoch_size
    ds = de.TFRecordDataset([bert_train_cfg.DATA_DIR], bert_train_cfg.SCHEMA_DIR,
                            columns_list=["input_ids", "input_mask", "segment_ids",
                                          "next_sentence_labels", "masked_lm_positions",
                                          "masked_lm_ids", "masked_lm_weights"])
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(input_columns="masked_lm_ids", operations=type_cast_op)
    ds = ds.map(input_columns="masked_lm_positions", operations=type_cast_op)
    ds = ds.map(input_columns="next_sentence_labels", operations=type_cast_op)
    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    ds = ds.repeat(repeat_count)
    return ds
def create_classification_dataset(indices, batch_size=1, repeat_count=1, assessment_method="accuracy",
                                  data_file_path=None, schema_file_path=None, do_shuffle=True):
    """create finetune or evaluation dataset"""
    type_cast_op = C.TypeCast(mstype.int32)
    ds = de.TFRecordDataset([data_file_path],
                            schema_file_path if schema_file_path != "" else None,
                            columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"],
                            shuffle=do_shuffle)
    # rebuild the dataset from the selected indices
    data = []
    for d in ds.create_tuple_iterator():
        data.append([d[0].asnumpy(), d[1].asnumpy(), d[2].asnumpy(), d[3].asnumpy()])
    dataset_generator = MyDatasetGenerator(data, indices)
    ds = GeneratorDataset(dataset_generator,
                          ["input_ids", "input_mask", "segment_ids", "label_ids"],
                          shuffle=False)
    if assessment_method == "Spearman_correlation":
        type_cast_op_float = C.TypeCast(mstype.float32)
        ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
    else:
        ds = ds.map(operations=type_cast_op, input_columns="label_ids")
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    ds = ds.repeat(repeat_count)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
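# MyDatasetGenerator is defined elsewhere in the original project; a minimal sketch consistent
# with how it is used above (keep only the rows whose position appears in `indices`, preserving
# the four-column layout) could look like this -- an assumption, not the original implementation.
class MyDatasetGenerator:
    """Yield rows of `data` selected by `indices` as (input_ids, input_mask, segment_ids, label_ids)."""

    def __init__(self, data, indices):
        self._data = [data[i] for i in indices]

    def __getitem__(self, index):
        input_ids, input_mask, segment_ids, label_ids = self._data[index]
        return input_ids, input_mask, segment_ids, label_ids

    def __len__(self):
        return len(self._data)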