def _input(epochs, batch_size, channel, channel_name, hvd=None):
    # If Horovod is used, map each worker to its own copy of the channel.
    if hvd is not None:
        channel_name = '{}_{}'.format(channel_name, hvd.rank() % 4)
    print("The channel name is", channel_name)

    channel_input_dir = args.training_env['channel_input_dirs'][channel_name]
    print("The corresponding input directory is", channel_input_dir)
    mode = args.data_config[channel_name]['TrainingInputMode']

    if mode == 'Pipe':
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name, record_format='TFRecord')
    else:
        filenames = get_filenames(channel_input_dir, hvd)
        print("The corresponding filenames are", filenames)
        dataset = tf.data.TFRecordDataset(filenames)

    if 'test' in channel_name:
        dataset = dataset.map(_dataset_parser_with_slide)
    else:
        dataset = dataset.repeat(epochs)
        dataset = dataset.map(_dataset_parser)

    if 'train' in channel_name:
        # Ensure that the capacity is sufficiently large to provide good random shuffling.
        buffer_size = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 0.4) + 3 * batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up (only for train and valid).
    if 'test' not in channel_name:
        dataset = dataset.batch(batch_size, drop_remainder=True)
        dataset = dataset.prefetch(10)

    return dataset
def _input(epochs, batch_size, channel, channel_name):
    filenames = get_filenames(channel_name, channel)

    # ----- Added section (PipeModeDataset) -----
    # dataset = tf.data.TFRecordDataset(filenames)
    dataset = PipeModeDataset(channel=channel_name, record_format='TFRecord')

    dataset = dataset.repeat(epochs)
    dataset = dataset.prefetch(10)

    # Parse records.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=10)

    # Potentially shuffle records.
    if channel_name == 'train':
        # Ensure that the capacity is sufficiently large to provide good random shuffling.
        buffer_size = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 0.4) + 3 * batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    iterator = dataset.make_one_shot_iterator()
    image_batch, label_batch = iterator.get_next()

    return {INPUT_TENSOR_NAME: image_batch}, label_batch
def _input_fn():
    def _read_and_decode(record):
        features = tf.parse_single_example(
            record,
            features={
                'image_raw': tf.FixedLenFeature([], tf.string),
                'label': tf.FixedLenFeature([], tf.int64),
            })
        image = tf.decode_raw(features['image_raw'], tf.uint8)
        image.set_shape([HEIGHT * WIDTH * DEPTH])
        image = tf.cast(image, tf.float32) * (1. / 255)
        label = tf.cast(features['label'], tf.int32)
        return {INPUT_TENSOR_NAME: image}, label

    ds = PipeModeDataset(channel, record_format='TFRecord')
    ds = ds.repeat()
    ds = ds.prefetch(batch_size)
    ds = ds.map(_read_and_decode, num_parallel_calls=NUM_PARALLEL_BATCHES)
    if channel == 'train':
        ds = ds.shuffle(buffer_size=batch_size)
    ds = ds.batch(batch_size, drop_remainder=True)
    ds = ds.make_one_shot_iterator().get_next()
    return ds
def make_dataset(channel_name):
    ds = PipeModeDataset(channel_name,
                         pipe_dir=channel_dir,
                         state_dir=state_dir,
                         config_dir=channel_dir)
    ds = ds.map(parse, num_parallel_calls=12)
    ds = ds.repeat(count=2)
    ds = ds.prefetch(3)
    ds = ds.batch(10)
    return ds
def input_fn(params):
    """The actual input function."""
    batch_size = params['batch_size']

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    if is_training:
        channel = 'train'
    else:
        channel = 'validation'

    if pipe_mode:
        print('***** Using pipe_mode!!!!')
        ds = PipeModeDataset(channel=channel, record_format='TFRecord')
    else:
        ds = tf.data.TFRecordDataset(input_files)

    if is_training:
        ds = ds.repeat()
        ds = ds.shuffle(buffer_size=100)

    ds = ds.apply(
        tf.contrib.data.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder))
    return ds
def _input_fn(channel): """Returns a Dataset for reading from a SageMaker PipeMode channel.""" features = { 'label': tf.FixedLenFeature([], tf.int64), 'feature': tf.FixedLenFeature([480], tf.int64) } def parse(record): parsed = tf.parse_single_example(record, features) data = tf.reshape(parsed['feature'], [160, 3]) data = tf.cast(data, tf.float32) label = tf.cast(parsed['label'], tf.int32) return ({INPUT_TENSOR_NAME: data}, label) ds = PipeModeDataset(channel=channel, record_format='TFRecord') ds = ds.shuffle(SHUFFLE_BUFFER_SIZE) ds = ds.repeat(MAX_EPOCHS) ds = ds.prefetch(PREFETCH_SIZE) ds = ds.map(parse, num_parallel_calls=NUM_PARALLEL_BATCHES) ds = ds.batch(BATCH_SIZE) return ds
def _input_fn(channel): """Returns a Dataset which reads from a SageMaker PipeMode channel.""" features = { 'image_raw': tf.FixedLenFeature([], tf.string), 'label': tf.FixedLenFeature([], tf.int64), 'height': tf.FixedLenFeature([], tf.int64), 'width': tf.FixedLenFeature([], tf.int64), 'channels': tf.FixedLenFeature([], tf.int64) } def parse(record): parsed = tf.parse_single_example(record, features) image = tf.decode_raw(parsed['image_raw'], tf.uint8) image.set_shape([784]) image = tf.cast(image, tf.float32) * (1. / 255) label = tf.cast(parsed['label'], tf.int32) return ({INPUT_TENSOR_NAME: image}, label) ds = PipeModeDataset(channel=channel, record_format='TFRecord') ds = ds.repeat(MAX_EPOCHS) ds = ds.prefetch(PREFETCH_SIZE) ds = ds.map(parse, num_parallel_calls=NUM_PARALLEL_BATCHES) ds = ds.batch(BATCH_SIZE) return ds
def process_input(epochs, batch_size, channel, channel_name, data_config):
    mode = data_config[channel_name]['TrainingInputMode']
    filenames = _get_filenames(channel_name, channel)
    logging.info("Running {} in {} mode".format(channel_name, mode))

    if mode == 'Pipe':
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name, record_format='TFRecord')
    else:
        dataset = tf.data.TFRecordDataset(filenames)

    # Repeat infinitely.
    dataset = dataset.repeat()
    dataset = dataset.prefetch(10)

    # Parse records.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=10)

    # Potentially shuffle records.
    if channel_name == 'train':
        # Ensure that the capacity is sufficiently large to provide good random shuffling.
        buffer_size = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 0.4) + 3 * batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset
def _input(epochs, batch_size, channel, channel_name): """Uses the tf.data input pipeline for CIFAR-10 dataset.""" mode = args.data_config[channel_name]['TrainingInputMode'] logging.info("Running {} in {} mode".format(channel_name, mode)) if mode == 'Pipe': from sagemaker_tensorflow import PipeModeDataset dataset = PipeModeDataset(channel=channel_name, record_format='TFRecord') else: filenames = _get_filenames(channel_name, channel) dataset = tf.data.TFRecordDataset(filenames) # Repeat infinitely. dataset = dataset.repeat() dataset = dataset.prefetch(10) # Parse records. dataset = dataset.map(_dataset_parser, num_parallel_calls=10) # Potentially shuffle records. if channel_name == 'train': # Ensure that the capacity is sufficiently large to provide good random shuffling. buffer_size = int( NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 0.4) + 3 * batch_size dataset = dataset.shuffle(buffer_size=buffer_size) # Batch it up. dataset = dataset.batch(batch_size, drop_remainder=True) iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) image_batch, label_batch = iterator.get_next() return {INPUT_TENSOR_NAME: image_batch}, label_batch
def input_fn():
    ds = PipeModeDataset(channel_name,
                         pipe_dir=channel_dir,
                         state_dir=state_dir,
                         config_dir=channel_dir)
    ds = ds.map(parse, num_parallel_calls=12)
    ds = ds.repeat(count=2)
    ds = ds.prefetch(3)
    ds = ds.batch(3)
    it = ds.make_one_shot_iterator()
    return it.get_next()
def _input(epochs, batch_size, channel, channel_name, hvd=None):
    """Uses the tf.data input pipeline for CIFAR-10 dataset.

    Args:
        mode: Standard names for model modes (tf.estimators.ModeKeys).
        batch_size: The number of samples per batch of input requested.
    """
    # If Horovod is used, assign the channel name using the Horovod local rank.
    if hvd is not None:
        channel_name = '{}_{}'.format(channel_name, hvd.local_rank())

    channel_input_dir = args.training_env['channel_input_dirs'][channel_name]
    mode = args.data_config[channel_name]['TrainingInputMode']

    if mode == 'Pipe':
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name, record_format='TFRecord')  # , benchmark=True)
    else:
        filenames = get_filenames(channel_input_dir)
        print(f'DEBUG tfrecords : {filenames}')
        dataset = tf.data.TFRecordDataset(filenames)

    if 'train' in channel_name:
        dataset = dataset.repeat(epochs)
    else:
        dataset = dataset.repeat(20)

    # Parse records.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=10)

    # Potentially shuffle records.
    # if hvd is None and 'train' in channel_name:
    if 'train' in channel_name:
        # Ensure that the capacity is sufficiently large to provide good random shuffling.
        buffer_size = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 0.4) + 3 * batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(10)

    return dataset
def file_based_input_dataset_builder(
    channel,
    input_filenames,
    pipe_mode,
    is_training,
    drop_remainder,
    batch_size,
    epochs,
    steps_per_epoch,
    max_seq_length,
):
    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    if pipe_mode:
        print("***** Using pipe_mode with channel {}".format(channel))
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel, record_format="TFRecord")
    else:
        print("***** Using input_filenames {}".format(input_filenames))
        dataset = tf.data.TFRecordDataset(input_filenames)

    dataset = dataset.repeat(epochs * steps_per_epoch * 100)

    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        record = tf.io.parse_single_example(record, name_to_features)
        return record

    dataset = dataset.apply(
        tf.data.experimental.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder,
            num_parallel_calls=tf.data.experimental.AUTOTUNE,
        )
    )

    dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)

    # Print the first few batches for inspection.
    row_count = 0
    print("**************** {} *****************".format(channel))
    for row in dataset.as_numpy_iterator():
        print(row)
        if row_count == 5:
            break
        row_count = row_count + 1

    return dataset
def file_based_input_dataset_builder(channel,
                                     input_filenames,
                                     pipe_mode,
                                     is_training,
                                     drop_remainder,
                                     batch_size,
                                     epochs,
                                     steps_per_epoch,
                                     max_seq_length):
    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    if pipe_mode:
        print('***** Using pipe_mode with channel {}'.format(channel))
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel, record_format='TFRecord')
    else:
        print('***** Using input_filenames {}'.format(input_filenames))
        dataset = tf.data.TFRecordDataset(input_filenames)

    dataset = dataset.repeat(epochs * steps_per_epoch * 100)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        record = tf.io.parse_single_example(record, name_to_features)
        # TODO: wip/bert/bert_attention_head_view/train.py
        # Convert input_ids into input_tokens with DistilBert vocabulary
        # if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
        #     hook._write_raw_tensor_simple("input_tokens", input_tokens)
        return record

    dataset = dataset.apply(
        tf.data.experimental.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder,
            num_parallel_calls=tf.data.experimental.AUTOTUNE))

    # cache() returns a new dataset; assign it so the cache actually takes effect.
    dataset = dataset.cache()

    if is_training:
        dataset = dataset.shuffle(seed=42, buffer_size=100, reshuffle_each_iteration=True)

    return dataset
def _input(epochs, batch_size, channel, channel_name):
    """Uses the tf.data input pipeline for our dataset.

    Args:
        mode: Standard names for model modes (tf.estimators.ModeKeys).
        batch_size: The number of samples per batch of input requested.
    """
    mode = args.data_config[channel_name]['TrainingInputMode']
    logging.info("Running {} in {} mode for {} epochs".format(channel_name, mode, epochs))
    filenames = get_filenames(channel_name, channel)

    if mode == 'Pipe':
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name, record_format='TFRecord')
    else:
        dataset = tf.data.TFRecordDataset(filenames)

    # Repeat for the requested number of epochs.
    dataset = dataset.repeat(epochs)
    dataset = dataset.prefetch(batch_size)

    # Parse records.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=10)

    # Shuffle training records.
    if channel_name == 'train':
        # Ensure that the capacity is sufficiently large to provide good random shuffling.
        buffer_size = args.num_train_samples // args.batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)

    if tf.version.VERSION[0] == '2':
        iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
    else:
        iterator = dataset.make_one_shot_iterator()
    features_batch, label_batch = iterator.get_next()

    if tf.version.VERSION[0] == '2':
        with tf.compat.v1.Session() as sess:
            logging.info('type of features_batch: {}, type of values: {}'.format(type(features_batch), type(features_batch)))
            logging.info('label_batch: {}'.format(label_batch))
            logging.info('type of label_batch: {}'.format(type(label_batch)))
    else:
        with tf.Session() as sess:
            logging.info('type of features_batch: {}, type of values: {}'.format(type(features_batch), type(features_batch)))
            logging.info('label_batch: {}'.format(label_batch))
            logging.info('type of label_batch: {}'.format(type(label_batch)))

    return {INPUT_TENSOR_NAME: features_batch}, label_batch
def file_based_input_dataset_builder(channel,
                                     input_filenames,
                                     pipe_mode,
                                     is_training,
                                     drop_remainder,
                                     batch_size,
                                     epochs,
                                     steps_per_epoch,
                                     max_seq_length):
    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    if pipe_mode:
        print('***** Using pipe_mode with channel {}'.format(channel))
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel, record_format='TFRecord')
    else:
        print('***** Using input_filenames {}'.format(input_filenames))
        dataset = tf.data.TFRecordDataset(input_filenames)

    dataset = dataset.repeat(epochs * steps_per_epoch)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
        # "is_real_example": tf.io.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        return tf.io.parse_single_example(record, name_to_features)

    dataset = dataset.apply(
        tf.data.experimental.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder,
            num_parallel_calls=tf.data.experimental.AUTOTUNE))

    # cache() returns a new dataset; assign it so the cache actually takes effect.
    dataset = dataset.cache()

    if is_training:
        dataset = dataset.shuffle(seed=42, buffer_size=1000, reshuffle_each_iteration=True)

    return dataset
def _input_fn():
    features = {
        'data': tf.FixedLenFeature([], tf.string),
        'labels': tf.FixedLenFeature([], tf.int64),
    }

    def parse(record):
        return tf.parse_single_example(record, features)

    ds = PipeModeDataset(config.channel, benchmark=True)
    if config.epochs > 1:
        ds = ds.repeat(config.epochs)
    if config.prefetch_size > 0:
        ds = ds.prefetch(config.prefetch_size)
    ds = ds.apply(
        map_and_batch(parse,
                      batch_size=config.batch_size,
                      num_parallel_batches=config.parallel_transform_calls))
    return ds
def _input_fn(channel): """Returns a Dataset for reading from a SageMaker PipeMode channel.""" features = { 'data': tf.FixedLenFeature([], tf.string), 'labels': tf.FixedLenFeature([], tf.int64), } def parse(record): parsed = tf.parse_single_example(record, features) return ({ 'data': tf.decode_raw(parsed['data'], tf.float64) }, parsed['labels']) ds = PipeModeDataset(channel) if EPOCHS > 1: ds = ds.repeat(EPOCHS) ds = ds.prefetch(PREFETCH_SIZE) ds = ds.apply(map_and_batch(parse, batch_size=BATCH_SIZE, num_parallel_batches=NUM_PARALLEL_BATCHES)) return ds
def input_fn():
    features = {
        'data': tf.FixedLenFeature([], tf.string),
        'labels': tf.FixedLenFeature([], tf.int64),
    }

    def parse(record):
        parsed = tf.parse_single_example(record, features)
        return ({
            'data': tf.decode_raw(parsed['data'], tf.float64)
        }, parsed['labels'])

    ds = PipeModeDataset(config.channel)
    if config.epochs > 1:
        ds = ds.repeat(config.epochs)
    if config.prefetch_size > 0:
        ds = ds.prefetch(config.prefetch_size)
    ds = ds.map(parse, num_parallel_calls=config.parallel_transform_calls)
    ds = ds.batch(config.batch_size)
    return ds
def read_dataset(epochs, batch_size, channel, channel_name):
    mode = args.data_config[channel_name]["TrainingInputMode"]
    logging.info("Running {} in {} mode".format(channel_name, mode))

    if mode == "Pipe":
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name, record_format="TFRecord")
    else:
        filenames = [os.path.join(channel, channel_name + ".tfrecords")]
        dataset = tf.data.TFRecordDataset(filenames)

    image_feature_description = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }

    def _parse_image_function(example_proto):
        # Parse the input tf.Example proto using the dictionary above.
        features = tf.io.parse_single_example(example_proto, image_feature_description)
        image = tf.io.decode_raw(features["image"], tf.uint8)
        image.set_shape([3 * 32 * 32])
        image = tf.reshape(image, [32, 32, 3])
        label = tf.cast(features["label"], tf.int32)
        label = tf.one_hot(label, 10)
        return image, label

    dataset = dataset.map(_parse_image_function, num_parallel_calls=10)
    dataset = dataset.prefetch(10)
    dataset = dataset.repeat(epochs)
    dataset = dataset.shuffle(buffer_size=10 * batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset
def _input_fn(channel): """Returns a Dataset for reading from a SageMaker PipeMode channel.""" features = { "data": tf.FixedLenFeature([], tf.string), "labels": tf.FixedLenFeature([], tf.int64), } def parse(record): parsed = tf.parse_single_example(record, features) return ({ "data": tf.decode_raw(parsed["data"], tf.float64) }, parsed["labels"]) ds = PipeModeDataset(channel) if EPOCHS > 1: ds = ds.repeat(EPOCHS) ds = ds.prefetch(PREFETCH_SIZE) ds = ds.apply( map_and_batch(parse, batch_size=BATCH_SIZE, num_parallel_batches=NUM_PARALLEL_BATCHES)) return ds
def input_fn(filenames, channel='training', batch_size=32, num_epochs=1, perform_shuffle=False):
    print('Parsing', filenames)

    def decode_libsvm(line):
        # columns = tf.decode_csv(value, record_defaults=CSV_COLUMN_DEFAULTS)
        # features = dict(zip(CSV_COLUMNS, columns))
        # labels = features.pop(LABEL_COLUMN)
        columns = tf.string_split([line], ' ')
        labels = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        # feat_ids = tf.reshape(feat_ids, shape=[-1, FLAGS.field_size])
        # for i in range(splits.dense_shape.eval()[0]):
        #     feat_ids.append(tf.string_to_number(splits.values[2*i], out_type=tf.int32))
        #     feat_vals.append(tf.string_to_number(splits.values[2*i+1]))
        # return tf.reshape(feat_ids, shape=[-1, field_size]), tf.reshape(feat_vals, shape=[-1, field_size]), labels
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels

    # Extract lines from input files using the Dataset API; can pass one filename or a filename list.
    if FLAGS.pipe_mode == 0:
        dataset = tf.data.TextLineDataset(filenames).map(
            decode_libsvm, num_parallel_calls=10).prefetch(500000)  # multi-thread pre-process then prefetch

        # Randomizes input using a window of 256 elements (read into memory).
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)

        # epochs from blending together.
        dataset = dataset.repeat(num_epochs)

        # liangaws: note that for single-machine multi-GPU or multi-CPU training, batch_size should be a
        # multiple of the number of CPUs or GPUs to make full use of the available compute.
        # dataset = dataset.batch(batch_size)  # Batch size to use
        # liangaws: use drop_remainder=True to drop the records that do not fill a full batch.
        dataset = dataset.batch(batch_size, drop_remainder=True)  # Batch size to use

        """
        # return dataset.make_one_shot_iterator()
        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        # return tf.reshape(batch_ids, shape=[-1, field_size]), tf.reshape(batch_vals, shape=[-1, field_size]), batch_labels
        return batch_features, batch_labels
        """
        # liangaws: when the TensorFlow Dataset API is used together with a distribution strategy,
        # input_fn needs to return the dataset itself rather than features and labels.
        return dataset
    else:
        print("-------enter into pipe mode branch!------------")
        dataset = PipeModeDataset(channel, record_format='TextLine')

        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)
        dataset = dataset.prefetch(batch_size * 100)
        dataset = dataset.apply(
            map_and_batch(decode_libsvm, batch_size=batch_size, num_parallel_batches=10))

        return dataset
features = {
    'image': tf.FixedLenFeature([], tf.string),
    'label': tf.FixedLenFeature([], tf.int64),
}

def parse(record):
    parsed = tf.parse_single_example(record, features)
    image = tf.decode_raw(parsed['image'], tf.uint8)
    image.set_shape([DEPTH * HEIGHT * WIDTH])
    image = tf.cast(image, tf.float32) / 255.0
    label = tf.cast(parsed['label'], tf.int32)
    return image, label

ds = PipeModeDataset(channel='train', record_format='TFRecord')

num_epochs = 10
# This yields 40000 (training images) / 64 (batch_size) * 10 (epochs) = 6250 batches (steps).
# The TensorFlow dataset raises tf.errors.OutOfRangeError when all the batches have been fed,
# as described in the training loop.
ds = ds.repeat(num_epochs)
ds = ds.prefetch(10)
ds = ds.map(parse, num_parallel_calls=10)
ds = ds.shuffle(buffer_size=64)  # larger than batch_size
ds = ds.batch(batch_size=64)

iterator = ds.make_one_shot_iterator()
itr_initializer = iterator.make_initializer(ds)
image_batch, label_batch = iterator.get_next()

# Set up the PyTorch neural network and optimizer.
net = MLP().to(device)
if is_distributed and use_cuda:
    # multi-machine multi-gpu case
    net = torch.nn.parallel.DistributedDataParallel(net)
else:
def input_fn(filenames, channel='training', batch_size=32, num_epochs=1, perform_shuffle=False):
    print('Parsing', filenames)

    def decode_libsvm(line):
        # columns = tf.decode_csv(value, record_defaults=CSV_COLUMN_DEFAULTS)
        # features = dict(zip(CSV_COLUMNS, columns))
        # labels = features.pop(LABEL_COLUMN)
        columns = tf.string_split([line], ' ')
        labels = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        # feat_ids = tf.reshape(feat_ids, shape=[-1, FLAGS.field_size])
        # for i in range(splits.dense_shape.eval()[0]):
        #     feat_ids.append(tf.string_to_number(splits.values[2*i], out_type=tf.int32))
        #     feat_vals.append(tf.string_to_number(splits.values[2*i+1]))
        # return tf.reshape(feat_ids, shape=[-1, field_size]), tf.reshape(feat_vals, shape=[-1, field_size]), labels
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels

    if FLAGS.pipe_mode == 0:
        # Extract lines from input files using the Dataset API; can pass one filename or a filename list.
        dataset = tf.data.TextLineDataset(filenames).map(
            decode_libsvm, num_parallel_calls=10).prefetch(500000)  # multi-thread pre-process then prefetch

        # Randomizes input using a window of 256 elements (read into memory).
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)

        # epochs from blending together.
        dataset = dataset.repeat(num_epochs)

        # liangaws: note that for single-machine multi-GPU or multi-CPU training, batch_size should be a
        # multiple of the number of CPUs or GPUs to make full use of the available compute.
        # liangaws: use drop_remainder=True to drop the records that do not fill a full batch.
        dataset = dataset.batch(batch_size, drop_remainder=True)  # Batch size to use

        # return dataset.make_one_shot_iterator()
        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        # return tf.reshape(batch_ids, shape=[-1, field_size]), tf.reshape(batch_vals, shape=[-1, field_size]), batch_labels
        return batch_features, batch_labels
    else:
        print("-------enter into pipe mode branch!------------")
        dataset = PipeModeDataset(channel, record_format='TextLine')

        # liangaws: with SageMaker parameter-server training, each training instance runs exactly one worker
        # and one ps, so the number of hosts equals the number of workers and is used here to shard the
        # training set. The validation set does not need to be sharded.
        if channel == 'training':
            number_host = len(FLAGS.hosts)
            if number_host > 1:
                index = FLAGS.hosts.index(FLAGS.current_host)
                print("index is ", index)
                dataset = dataset.shard(number_host, index)

        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)
        dataset = dataset.prefetch(500000)
        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        dataset = dataset.batch(batch_size, drop_remainder=True)

        return dataset
def input_fn(filenames='', channel='training', batch_size=32, num_epochs=1, perform_shuffle=False):
    print('Parsing', filenames)

    def decode_libsvm(line):
        # columns = tf.decode_csv(value, record_defaults=CSV_COLUMN_DEFAULTS)
        # features = dict(zip(CSV_COLUMNS, columns))
        # labels = features.pop(LABEL_COLUMN)
        columns = tf.string_split([line], ' ')
        labels = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        # feat_ids = tf.reshape(feat_ids, shape=[-1, FLAGS.field_size])
        # for i in range(splits.dense_shape.eval()[0]):
        #     feat_ids.append(tf.string_to_number(splits.values[2*i], out_type=tf.int32))
        #     feat_vals.append(tf.string_to_number(splits.values[2*i+1]))
        # return tf.reshape(feat_ids, shape=[-1, field_size]), tf.reshape(feat_vals, shape=[-1, field_size]), labels
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels

    # Extract lines from input files using the Dataset API; can pass one filename or a filename list.
    print("pipe mode ", FLAGS.pipe_mode)
    if FLAGS.pipe_mode == 0:
        """
        dataset = tf.data.TextLineDataset(filenames).map(decode_libsvm, num_parallel_calls=10).prefetch(500000)  # multi-thread pre-process then prefetch
        # Randomizes input using a window of 256 elements (read into memory)
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)
        # epochs from blending together.
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size, drop_remainder=True)  # Batch size to use
        """
        dataset = tf.data.TextLineDataset(filenames)
        # liangaws: this assumes SageMaker uses FullyReplicated S3 distribution, i.e. each channel's data is
        # copied to every training instance, so the data is sharded here directly by each worker's rank.
        dataset = dataset.shard(hvd.size(), hvd.rank())
        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        dataset = dataset.prefetch(500000)  # multi-thread pre-process then prefetch

        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)

        # epochs from blending together.
        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size, drop_remainder=True)  # Batch size to use

        # return dataset.make_one_shot_iterator()
        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        # return tf.reshape(batch_ids, shape=[-1, field_size]), tf.reshape(batch_vals, shape=[-1, field_size]), batch_labels
        return batch_features, batch_labels
    else:
        print("-------enter into pipe mode branch!------------")
        dataset = PipeModeDataset(channel, record_format='TextLine')

        number_host = len(FLAGS.hosts)
        # liangaws: with Horovod + Pipe mode, if each training instance runs multiple workers, every worker
        # needs its own channel, so the data for each channel should be pre-split. Only when there are
        # multiple training instances, each running multiple worker processes, does the same channel on
        # different instances need to be sharded.
        if number_host > 1 and hvd.size() > number_host:
            # liangaws: with SageMaker Horovod, current_host turned out to be the same in every process.
            # index = FLAGS.hosts.index(FLAGS.current_host)
            index = hvd.rank() // FLAGS.worker_per_host
            dataset = dataset.shard(number_host, index)

        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)
        dataset = dataset.prefetch(500000)
        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        dataset = dataset.batch(batch_size, drop_remainder=True)

        return dataset
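# For context: every snippet above assumes it is running inside a SageMaker training job whose input
# channels were created in Pipe mode. A minimal launcher-side sketch follows, assuming the SageMaker
# Python SDK v2; the entry point name, framework version, instance settings, and S3 URIs are
# placeholders for illustration, not values taken from any snippet above.
import sagemaker
from sagemaker.tensorflow import TensorFlow

# input_mode='Pipe' is what makes the channels readable through PipeModeDataset in the training script.
estimator = TensorFlow(
    entry_point='train.py',                # hypothetical script containing one of the input_fn variants above
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    framework_version='1.15.2',
    py_version='py3',
    input_mode='Pipe',
)

# Channel names ('train', 'validation') must match what the script passes to PipeModeDataset.
estimator.fit({
    'train': 's3://my-bucket/prefix/train',
    'validation': 's3://my-bucket/prefix/validation',
})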