import tensorflow as tf
from sagemaker_tensorflow import PipeModeDataset

# FLAGS (pipe_mode, hosts, current_host, ...) is assumed to be defined
# elsewhere, e.g. via tf.app.flags.
FLAGS = tf.app.flags.FLAGS


def input_fn(filenames,
             channel='training',
             batch_size=32,
             num_epochs=1,
             perform_shuffle=False):
    print('Parsing', filenames)

    def decode_libsvm(line):
        # Each line is in LibSVM format: "<label> <id>:<value> <id>:<value> ...",
        # e.g. "1 3:0.5 7:1.0".
        columns = tf.string_split([line], ' ')
        labels = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        # feat_ids and feat_vals each come out with shape [num_fields, 1].
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels

    if FLAGS.pipe_mode == 0:
        # Extract lines from the input files with the Dataset API; filenames
        # may be a single filename or a list of filenames.
        dataset = tf.data.TextLineDataset(filenames).map(
            decode_libsvm, num_parallel_calls=10).prefetch(
                500000)  # multi-thread pre-process then prefetch
        # Randomizes input using a window of 256 elements (read into memory)
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)

        # Calling repeat after shuffle prevents separate epochs from blending together.
        dataset = dataset.repeat(num_epochs)
        # liangaws: note that on a single machine with multiple GPUs or CPUs,
        # batch_size should be set to a multiple of the number of CPUs or GPUs
        # to make full use of the compute.
        # liangaws: drop_remainder=True discards a final batch that has fewer
        # than batch_size examples.
        dataset = dataset.batch(batch_size,
                                drop_remainder=True)  # Batch size to use
        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        return batch_features, batch_labels

    else:
        print("-------enter into pipe mode branch!------------")
        dataset = PipeModeDataset(channel, record_format='TextLine')
        # liangaws: with SageMaker parameter-server training, each training
        # instance runs exactly one worker and one ps, so the number of hosts
        # equals the number of workers and is used to shard the training set.
        # The validation set does not need to be sharded.
        if channel == 'training':
            number_host = len(FLAGS.hosts)
            if number_host > 1:
                index = FLAGS.hosts.index(FLAGS.current_host)
                print("index is ", index)
                dataset = dataset.shard(number_host, index)
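                # e.g. with number_host == 2, host 0 keeps records 0, 2, 4, ...
                # and host 1 keeps records 1, 3, 5, ...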

        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)

        dataset = dataset.prefetch(500000)
        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        dataset = dataset.batch(batch_size, drop_remainder=True)
        return dataset
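
For File mode (FLAGS.pipe_mode == 0) this input_fn returns tensors rather than a dataset, so the pipeline can be smoke-tested by pulling one batch in a TF 1.x session. A minimal sketch, assuming a local LibSVM file named train.libsvm (the filename is hypothetical):

features, labels = input_fn(['train.libsvm'], batch_size=4)
with tf.Session() as sess:
    batch_features, batch_labels = sess.run([features, labels])
    # batch_features['feat_ids'] has shape (4, num_fields, 1);
    # batch_labels has shape (4,).
    print(batch_labels)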
Example #2

import tensorflow as tf
import horovod.tensorflow as hvd
from sagemaker_tensorflow import PipeModeDataset

# FLAGS (pipe_mode, hosts, worker_per_host, ...) is assumed to be defined
# elsewhere, e.g. via tf.app.flags.
FLAGS = tf.app.flags.FLAGS


def input_fn(filenames='',
             channel='training',
             batch_size=32,
             num_epochs=1,
             perform_shuffle=False):
    print('Parsing', filenames)

    def decode_libsvm(line):
        # Each line is in LibSVM format: "<label> <id>:<value> <id>:<value> ...",
        # e.g. "1 3:0.5 7:1.0".
        columns = tf.string_split([line], ' ')
        labels = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        # feat_ids and feat_vals each come out with shape [num_fields, 1].
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels

    # Extract lines from the input files with the Dataset API; filenames may be
    # a single filename or a list of filenames.
    print("pipe mode ", FLAGS.pipe_mode)
    if FLAGS.pipe_mode == 0:
        """
        dataset = tf.data.TextLineDataset(filenames).map(decode_libsvm, num_parallel_calls=10).prefetch(500000)    # multi-thread pre-process then prefetch
              
        # Randomizes input using a window of 256 elements (read into memory)
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)

        # epochs from blending together.
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size, drop_remainder=True) # Batch size to use
        """

        dataset = tf.data.TextLineDataset(filenames)
        # liangaws: this assumes SageMaker distributes the data as FullyReplicated,
        # i.e. every training instance gets a complete copy of each channel's data,
        # so we shard directly by Horovod worker rank.
        dataset = dataset.shard(hvd.size(), hvd.rank())
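        # Each of the hvd.size() workers reads the replicated files in full but
        # keeps only every hvd.size()-th line, offset by its own hvd.rank().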

        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        dataset = dataset.prefetch(
            500000)  # multi-thread pre-process then prefetch
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)

        # Calling repeat after shuffle prevents separate epochs from blending together.
        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)

        dataset = dataset.batch(batch_size,
                                drop_remainder=True)  # Batch size to use

        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        return batch_features, batch_labels

    else:
        print("-------enter into pipe mode branch!------------")
        dataset = PipeModeDataset(channel, record_format='TextLine')
        number_host = len(FLAGS.hosts)
        # liangaws: with Horovod + Pipe mode, if each training instance runs
        # several workers, each worker needs its own channel, so the data in
        # each channel should be pre-split. Sharding the same channel across
        # instances is only needed when there are multiple training instances
        # and each instance runs multiple worker processes.
        if number_host > 1 and hvd.size() > number_host:
            # liangaws: under SageMaker with Horovod, current_host was observed to
            # be identical on every process, so it cannot be used to compute the
            # shard index:
            #index = FLAGS.hosts.index(FLAGS.current_host)
            index = hvd.rank() // FLAGS.worker_per_host
            dataset = dataset.shard(number_host, index)
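            # e.g. with 2 hosts and 4 workers per host, ranks 0-3 map to shard 0
            # and ranks 4-7 to shard 1, so all workers on one host read the same
            # shard of their shared channel.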

        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)

        dataset = dataset.prefetch(500000)
        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        dataset = dataset.batch(batch_size, drop_remainder=True)

        return dataset
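
To drive Example #2 end to end, hvd.init() must run before input_fn is called so that hvd.size() and hvd.rank() are valid. A minimal Estimator training sketch, assuming model_fn and the train-file/batch-size flags are defined elsewhere (all hypothetical):

hvd.init()  # must precede any hvd.size()/hvd.rank() call inside input_fn
estimator = tf.estimator.Estimator(model_fn=model_fn)
# Broadcast the initial weights from rank 0 so every worker starts identically.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]
estimator.train(
    input_fn=lambda: input_fn(FLAGS.train_files,
                              channel='training',
                              batch_size=FLAGS.batch_size,
                              num_epochs=FLAGS.num_epochs,
                              perform_shuffle=True),
    hooks=hooks)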