Code Example #1
def file_based_input_dataset_builder(channel, input_filenames, pipe_mode,
                                     is_training, drop_remainder, batch_size,
                                     epochs, steps_per_epoch, max_seq_length):

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.

    if pipe_mode:
        print('***** Using pipe_mode with channel {}'.format(channel))
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel, record_format='TFRecord')
    else:
        print('***** Using input_filenames {}'.format(input_filenames))
        dataset = tf.data.TFRecordDataset(input_filenames)

    dataset = dataset.repeat(epochs * steps_per_epoch * 100)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        record = tf.io.parse_single_example(record, name_to_features)
        # TODO:  wip/bert/bert_attention_head_view/train.py
        # Convert input_ids into input_tokens with DistilBert vocabulary
        #  if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
        #    hook._write_raw_tensor_simple("input_tokens", input_tokens)
        return record

    dataset = dataset.apply(
        tf.data.experimental.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder,
            num_parallel_calls=tf.data.experimental.AUTOTUNE))

    dataset = dataset.cache()

    if is_training:
        dataset = dataset.shuffle(seed=42,
                                  buffer_size=100,
                                  reshuffle_each_iteration=True)

    return dataset
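A minimal usage sketch (not part of the original script; the argument values and file name below are assumptions for illustration only), showing how the builder above might be called for a training channel in File mode:

# Hypothetical call; inside a SageMaker Pipe-mode job, pipe_mode would be truthy
# and `channel` would name the configured input channel instead.
train_dataset = file_based_input_dataset_builder(
    channel='train',
    input_filenames=['train.tfrecord'],
    pipe_mode=False,
    is_training=True,
    drop_remainder=True,
    batch_size=64,
    epochs=3,
    steps_per_epoch=1000,
    max_seq_length=128)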
Code Example #2
def test_invalid_data():
    directory = tempfile.mkdtemp()
    filename = "X_0"
    with open(directory + "/" + filename, 'wb') as f:
        f.write(b"adfsafasfd")
    write_config(directory, 'X')
    dataset = PipeModeDataset("X",
                              pipe_dir=directory,
                              state_dir=directory,
                              config_dir=directory)
    with pytest.raises(tf.errors.InternalError):
        with tf.Session() as sess:
            it = dataset.make_one_shot_iterator()
            next = it.get_next()
            sess.run(next)
Code Example #3
def get_dataset(batch_size, channel_name, dataset_bottleneck=False):
    from sagemaker_tensorflow import PipeModeDataset

    dataset = PipeModeDataset(channel_name, record_format="TFRecord").repeat()

    dataset = dataset.map(parse_image_function,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if dataset_bottleneck:
        dataset = dataset.map(data_augmentation,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.batch(batch_size).prefetch(1)

    return dataset
Code Example #4
def test_tf_record():
    channel_dir = tempfile.mkdtemp()
    state_dir = tempfile.mkdtemp()
    epochs = 1
    channel_name = 'testchannel'
    create_fifos(epochs, channel_dir, channel_name, input_file='test.tfrecords')
    write_config(channel_dir, 'testchannel')

    ds = PipeModeDataset(channel_name, pipe_dir=channel_dir, state_dir=state_dir, config_dir=channel_dir,
                         record_format='TFRecord')

    with tf.Session() as sess:
        it = ds.make_one_shot_iterator()
        next = it.get_next()
        for i in range(100):
            assert sess.run(next) == b'hello world'
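A hedged sketch of how the 'test.tfrecords' fixture streamed above could be produced; the helper name is ours and the record count of 100 simply matches the test's assertion loop:

import tensorflow as tf

def make_hello_world_tfrecords(path, count=100):
    # Write `count` identical records so the reader side can assert b'hello world'
    with tf.io.TFRecordWriter(path) as writer:
        for _ in range(count):
            writer.write(b'hello world')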
Code Example #5
def _input(epochs, batch_size, channel, channel_name):
    mode = args.data_config[channel_name]['TrainingInputMode']
    """Uses the tf.data input pipeline for our dataset.
    Args:
        mode: Standard names for model modes (tf.estimators.ModeKeys).
        batch_size: The number of samples per batch of input requested.
    """
    logging.info("Running {} in {} mode for {} epochs".format(
        channel_name, mode, epochs))

    filenames = get_filenames(channel_name, channel)

    # Repeat infinitely.
    if mode == 'Pipe':
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name,
                                  record_format='TFRecord')
    else:
        dataset = tf.data.TFRecordDataset(filenames)

    dataset = dataset.repeat(epochs)
    dataset = dataset.prefetch(batch_size)

    # Parse records.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=10)
    ## TF Dataset question: why does _dataset_parser only get called once per channel??

    # Shuffle training records.
    if channel_name == 'train':
        # Ensure that the capacity is sufficiently large to provide good random
        # shuffling.
        buffer_size = args.num_train_samples // args.batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)

    iterator = dataset.make_one_shot_iterator()
    features_batch, label_batch = iterator.get_next()

    with tf.Session() as sess:
        logging.info('type of features_batch: {}, type of values: {}'.format(
            type(features_batch), type(features_batch)))
        logging.info('label_batch: {}'.format(label_batch))
        logging.info('type of label_batch: {}'.format(type(label_batch)))

    return {INPUT_TENSOR_NAME: features_batch}, label_batch
Code Example #6
def _input(epochs, batch_size, channel, channel_name, hvd=None):

    # If Horovod, assign channel name using the horovod rank
    if hvd is not None:
        channel_name = '{}_{}'.format(channel_name, hvd.local_rank())

    channel_input_dir = args.training_env['channel_input_dirs'][channel_name]

    mode = args.data_config[channel_name]['TrainingInputMode']
    """Uses the tf.data input pipeline for CIFAR-10 dataset.
    Args:
        mode: Standard names for model modes (tf.estimators.ModeKeys).
        batch_size: The number of samples per batch of input requested.
    """

    if mode == 'Pipe':
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name,
                                  record_format='TFRecord')  #, benchmark=True)
    else:
        filenames = get_filenames(channel_input_dir)
        print(f'DEBUG tfrecords : {filenames}')
        dataset = tf.data.TFRecordDataset(filenames)

    if 'train' in channel_name:
        dataset = dataset.repeat(epochs)
    else:
        dataset = dataset.repeat(20)

    # Parse records.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=10)

    # Potentially shuffle records.
    #     if hvd == None and 'train' in channel_name:
    if 'train' in channel_name:
        # Ensure that the capacity is sufficiently large to provide good random
        # shuffling.
        buffer_size = int(
            NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 0.4) + 3 * batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(10)

    return dataset
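The rank-suffixed channel names above imply one SageMaker channel per local Horovod worker. A hypothetical channel layout on the estimator side (bucket paths and channel count are assumptions, not from the original code):

# Each worker process opens the channel named '<channel_name>_<hvd.local_rank()>'
inputs = {
    'train_0': 's3://my-bucket/cifar10/train/shard-0/',
    'train_1': 's3://my-bucket/cifar10/train/shard-1/',
    'eval_0': 's3://my-bucket/cifar10/eval/',
}
# estimator.fit(inputs)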
Code Example #7
def file_based_input_dataset_builder(channel, input_filenames, pipe_mode,
                                     is_training, drop_remainder, batch_size,
                                     epochs, steps_per_epoch, max_seq_length):

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.

    if pipe_mode:
        print('***** Using pipe_mode with channel {}'.format(channel))
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel, record_format='TFRecord')
    else:
        print('***** Using input_filenames {}'.format(input_filenames))
        dataset = tf.data.TFRecordDataset(input_filenames)

    dataset = dataset.repeat(epochs * steps_per_epoch)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
        #      "is_real_example": tf.io.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        return tf.io.parse_single_example(record, name_to_features)

    dataset = dataset.apply(
        tf.data.experimental.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder,
            num_parallel_calls=tf.data.experimental.AUTOTUNE))

    dataset = dataset.cache()

    if is_training:
        dataset = dataset.shuffle(seed=42,
                                  buffer_size=1000,
                                  reshuffle_each_iteration=True)

    return dataset
Code Example #8
def _input(args, channel_name):
    try:
        mode_channel_name = channel_name + 'ing' if channel_name == 'train' else channel_name
        mode = args.data_config[mode_channel_name]['TrainingInputMode']
    except Exception:
        mode = 'File'
    """Uses the tf.data input pipeline for dataset.
    Args:
        mode: Standard names for model modes (tf.estimators.ModeKeys).
        batch_size: The number of samples per batch of input requested.
    """
    filenames = get_filenames(args, channel_name)
    # Repeat infinitely.
    logging.info("Running {} in {} mode".format(channel_name, mode))
    if mode == 'Pipe':
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name,
                                  record_format='TFRecord')
    else:
        dataset = tf.data.TFRecordDataset(filenames)

    # Potentially shuffle records.
    if channel_name == 'train':
        # Ensure that the capacity is sufficiently large to provide good random
        # shuffling.
        dataset = dataset.map(_load_image_train,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
        buffer_size = int(args.train_num_examples * 0.4) + 3 * args.BATCH_SIZE

        dataset = dataset.cache().shuffle(buffer_size=buffer_size).batch(
            args.BATCH_SIZE).repeat()

    elif channel_name == 'test':
        dataset = dataset.map(_load_image_test)

        for image, mask in dataset.take(1):
            sample_image, sample_mask = image, mask

        _img_save('sample_image.jpg', sample_image)
        _img_save('sample_mask.png', sample_mask)

        dataset = dataset.batch(args.BATCH_SIZE)

    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset
Code Example #9
def _input_fn():
    features = {
        'data': tf.FixedLenFeature([], tf.string),
        'labels': tf.FixedLenFeature([], tf.int64),
    }

    def parse(record):
        return tf.parse_single_example(record, features)

    ds = PipeModeDataset(config.channel, benchmark=True)
    if config.epochs > 1:
        ds = ds.repeat(config.epochs)
    if config.prefetch_size > 0:
        ds = ds.prefetch(config.prefetch_size)
    ds = ds.apply(
        map_and_batch(parse,
                      batch_size=config.batch_size,
                      num_parallel_batches=config.parallel_transform_calls))
    return ds
Code Example #10
def input_fn():
    features = {
        'data': tf.FixedLenFeature([], tf.string),
        'labels': tf.FixedLenFeature([], tf.int64),
    }

    def parse(record):
        parsed = tf.parse_single_example(record, features)
        return ({
            'data': tf.decode_raw(parsed['data'], tf.float64)
        }, parsed['labels'])

    ds = PipeModeDataset(config.channel)

    if config.epochs > 1:
        ds = ds.repeat(config.epochs)
    if config.prefetch_size > 0:
        ds = ds.prefetch(config.prefetch_size)
    ds = ds.map(parse, num_parallel_calls=config.parallel_transform_calls)
    ds = ds.batch(config.batch_size)
    return ds
Code Example #11
def read_dataset(epochs, batch_size, channel, channel_name):
    mode = args.data_config[channel_name]["TrainingInputMode"]

    logging.info("Running {} in {} mode".format(channel_name, mode))
    if mode == "Pipe":
        from sagemaker_tensorflow import PipeModeDataset

        dataset = PipeModeDataset(channel=channel_name,
                                  record_format="TFRecord")
    else:
        filenames = [os.path.join(channel, channel_name + ".tfrecords")]
        dataset = tf.data.TFRecordDataset(filenames)

    image_feature_description = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }

    def _parse_image_function(example_proto):
        # Parse the input tf.Example proto using the dictionary above.
        features = tf.io.parse_single_example(example_proto,
                                              image_feature_description)
        image = tf.io.decode_raw(features["image"], tf.uint8)
        image.set_shape([3 * 32 * 32])
        image = tf.reshape(image, [32, 32, 3])

        label = tf.cast(features["label"], tf.int32)
        label = tf.one_hot(label, 10)

        return image, label

    dataset = dataset.map(_parse_image_function, num_parallel_calls=10)
    dataset = dataset.prefetch(10)
    dataset = dataset.repeat(epochs)
    dataset = dataset.shuffle(buffer_size=10 * batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    return dataset
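For reference, a hedged sketch of a writer producing records that match the feature spec above (assumption: images are uint8 numpy arrays of shape (32, 32, 3); the helper name is ours):

import tensorflow as tf

def serialize_cifar_example(image, label):
    # 'image' is stored as raw bytes and 'label' as an int64, mirroring
    # image_feature_description and _parse_image_function above
    feature = {
        "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image.tobytes()])),
        "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[int(label)])),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()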
Code Example #12
def _input_fn(channel):
    """Returns a Dataset for reading from a SageMaker PipeMode channel."""
    features = {
        "data": tf.FixedLenFeature([], tf.string),
        "labels": tf.FixedLenFeature([], tf.int64),
    }

    def parse(record):
        parsed = tf.parse_single_example(record, features)
        return ({
            "data": tf.decode_raw(parsed["data"], tf.float64)
        }, parsed["labels"])

    ds = PipeModeDataset(channel)
    if EPOCHS > 1:
        ds = ds.repeat(EPOCHS)
    ds = ds.prefetch(PREFETCH_SIZE)
    ds = ds.apply(
        map_and_batch(parse,
                      batch_size=BATCH_SIZE,
                      num_parallel_batches=NUM_PARALLEL_BATCHES))

    return ds
Code Example #13
def _input(epochs, batch_size, channel, channel_name):
    mode = args.data_config[channel_name]['TrainingInputMode']
    """Uses the tf.data input pipeline for CIFAR-10 dataset.
    Args:
        mode: Standard names for model modes (tf.estimators.ModeKeys).
        batch_size: The number of samples per batch of input requested.
    """
    filenames = get_filenames(channel_name, channel)
    # Repeat infinitely.
    logging.info("Running {} in {} mode".format(channel_name, mode))
    if mode == 'Pipe':
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name,
                                  record_format='TFRecord')
    else:
        dataset = tf.data.TFRecordDataset(filenames)

    dataset = dataset.repeat(epochs)
    dataset = dataset.prefetch(10)

    # Parse records.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=10)

    # Potentially shuffle records.
    if channel_name == 'train':
        # Ensure that the capacity is sufficiently large to provide good random
        # shuffling.
        buffer_size = int(
            NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 0.4) + 3 * batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    iterator = dataset.make_one_shot_iterator()
    image_batch, label_batch = iterator.get_next()

    return {INPUT_TENSOR_NAME: image_batch}, label_batch
Code Example #14
def test_multiple_iterators():
    channel, directory = write_to_channel("A", [b"bear"])

    dataset = PipeModeDataset(channel, pipe_dir=directory, state_dir=directory, config_dir=directory)
    with tf.Session() as sess:
        it = dataset.make_one_shot_iterator()
        next = it.get_next()
        assert sess.run(next) == b"bear"
        with pytest.raises(tf.errors.OutOfRangeError):
            sess.run(next)

    with open(os.path.join(directory, channel + "_1"), 'wb') as f:
        write_recordio(f, b"bunny")
        write_recordio(f, b"piano")
        write_recordio(f, b"caterpillar")

    with tf.Session() as sess:
        it = dataset.make_one_shot_iterator()
        next = it.get_next()
        assert b"bunny" == sess.run(next)
        assert b"piano" == sess.run(next)
        assert b"caterpillar" == sess.run(next)
        with pytest.raises(tf.errors.OutOfRangeError):
            sess.run(next)
Code Example #15
def input_fn(filenames,
             channel='training',
             batch_size=32,
             num_epochs=1,
             perform_shuffle=False):
    print('Parsing', filenames)

    def decode_libsvm(line):
        #columns = tf.decode_csv(value, record_defaults=CSV_COLUMN_DEFAULTS)
        #features = dict(zip(CSV_COLUMNS, columns))
        #labels = features.pop(LABEL_COLUMN)
        columns = tf.string_split([line], ' ')
        labels = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        #feat_ids = tf.reshape(feat_ids,shape=[-1,FLAGS.field_size])
        #for i in range(splits.dense_shape.eval()[0]):
        #    feat_ids.append(tf.string_to_number(splits.values[2*i], out_type=tf.int32))
        #    feat_vals.append(tf.string_to_number(splits.values[2*i+1]))
        #return tf.reshape(feat_ids,shape=[-1,field_size]), tf.reshape(feat_vals,shape=[-1,field_size]), labels
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels

    # Extract lines from input files using the Dataset API, can pass one filename or filename list
    if FLAGS.pipe_mode == 0:
        dataset = tf.data.TextLineDataset(filenames).map(
            decode_libsvm, num_parallel_calls=10).prefetch(
                500000)  # multi-thread pre-process then prefetch

        # Randomizes input using a window of 256 elements (read into memory)
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)

        # epochs from blending together.
        dataset = dataset.repeat(num_epochs)
        #liangaws: Note that for single-machine multi-GPU or multi-CPU training, batch_size here should be a multiple of the number of GPUs or CPUs to make full use of the compute.
        #dataset = dataset.batch(batch_size) # Batch size to use
        #liangaws: Use drop_remainder=True here to drop records that do not fill a complete batch.
        dataset = dataset.batch(batch_size,
                                drop_remainder=True)  # Batch size to use
        """
        #return dataset.make_one_shot_iterator()
        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        #return tf.reshape(batch_ids,shape=[-1,field_size]), tf.reshape(batch_vals,shape=[-1,field_size]), batch_labels
        return batch_features, batch_labels
        """
        #liangaws: When the TensorFlow Dataset API is used together with a distribute strategy, input_fn must return the dataset rather than features and labels.
        return dataset
    else:
        print("-------enter into pipe mode branch!------------")
        dataset = PipeModeDataset(channel, record_format='TextLine')
        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)
        dataset = dataset.prefetch(batch_size * 100)
        dataset = dataset.apply(
            map_and_batch(decode_libsvm,
                          batch_size=batch_size,
                          num_parallel_batches=10))

        return dataset
Code Example #16
def input_fn(filenames,
             channel='training',
             batch_size=32,
             num_epochs=1,
             perform_shuffle=False):
    print('Parsing', filenames)

    def decode_libsvm(line):
        #columns = tf.decode_csv(value, record_defaults=CSV_COLUMN_DEFAULTS)
        #features = dict(zip(CSV_COLUMNS, columns))
        #labels = features.pop(LABEL_COLUMN)
        columns = tf.string_split([line], ' ')
        labels = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        #feat_ids = tf.reshape(feat_ids,shape=[-1,FLAGS.field_size])
        #for i in range(splits.dense_shape.eval()[0]):
        #    feat_ids.append(tf.string_to_number(splits.values[2*i], out_type=tf.int32))
        #    feat_vals.append(tf.string_to_number(splits.values[2*i+1]))
        #return tf.reshape(feat_ids,shape=[-1,field_size]), tf.reshape(feat_vals,shape=[-1,field_size]), labels
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels

    if FLAGS.pipe_mode == 0:
        # Extract lines from input files using the Dataset API, can pass one filename or filename list
        dataset = tf.data.TextLineDataset(filenames).map(
            decode_libsvm, num_parallel_calls=10).prefetch(
                500000)  # multi-thread pre-process then prefetch
        # Randomizes input using a window of 256 elements (read into memory)
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)

        # epochs from blending together.
        dataset = dataset.repeat(num_epochs)
        #liangaws: Note that for single-machine multi-GPU or multi-CPU training, batch_size here should be a multiple of the number of GPUs or CPUs to make full use of the compute.
        #liangaws: Use drop_remainder=True here to drop records that do not fill a complete batch.
        dataset = dataset.batch(batch_size,
                                drop_remainder=True)  # Batch size to use
        #return dataset.make_one_shot_iterator()
        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        #return tf.reshape(batch_ids,shape=[-1,field_size]), tf.reshape(batch_vals,shape=[-1,field_size]), batch_labels
        return batch_features, batch_labels

    else:
        print("-------enter into pipe mode branch!------------")
        dataset = PipeModeDataset(channel, record_format='TextLine')
        #liangaws: In SageMaker parameter-server training each instance runs one worker and one ps, so the number of hosts (equal to the number of workers) is used here to shard the training set. The validation set does not need to be sharded.
        if channel == 'training':
            number_host = len(FLAGS.hosts)
            if number_host > 1:
                index = FLAGS.hosts.index(FLAGS.current_host)
                print("index is ", index)
                dataset = dataset.shard(number_host, index)

        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)

        dataset = dataset.prefetch(500000)
        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        dataset = dataset.batch(batch_size, drop_remainder=True)
        return dataset
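A worked example of the host-based shard index used above, with hypothetical values (SageMaker typically reports hosts such as 'algo-1', 'algo-2'):

hosts = ['algo-1', 'algo-2']        # hypothetical FLAGS.hosts
current_host = 'algo-2'             # hypothetical FLAGS.current_host
index = hosts.index(current_host)   # -> 1
# dataset.shard(len(hosts), index) then keeps records 1, 3, 5, ... from the pipe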
Code Example #17
def train(args):
    """Train a model and save inference artifacts to args.model_path
    
    Parameters
    ----------
    args : argparse.Namespace
        See config.py for the parameter definitions
    """
    # Clear TF session and seed random number generators:
    clear_session_and_reseed(args.seed)

    anchors = get_anchors("model_data/yolo_anchors.txt")

    pretrain = None
    if (args.darknet):
        tmpmodel = load_darknet_as_keras(args.darknet + ".cfg",
                                         args.darknet + ".weights")
        tmpmodel.save(args.darknet + ".h5")
        pretrain = args.darknet + ".h5"

    train_model, inference_model = create_model(
        (args.data_shape, args.data_shape),
        anchors,
        args.num_classes,
        freeze_body=2,
        load_pretrained=pretrain,
    )

    ## Keras callbacks:
    checkpoint = (ModelCheckpoint(
        args.checkpoint_dir +
        'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
        monitor='val_loss',
        save_weights_only=True,
        save_best_only=True,
        period=args.checkpoint_interval,
    ) if args.checkpoint_interval else None)
    reduce_lr = ReduceLROnPlateau(monitor="val_loss",
                                  factor=0.1,
                                  patience=3,
                                  verbose=1)
    early_stopping = EarlyStopping(monitor="val_loss",
                                   min_delta=0,
                                   patience=10,
                                   verbose=1)
    pretrain_callbacks = [checkpoint] if checkpoint else []
    train_callbacks = ([checkpoint]
                       if checkpoint else []) + [reduce_lr, early_stopping]

    ## Datasets:
    # For more info on this exact batch size requirement, see:
    # https://github.com/aws/sagemaker-tensorflow-extensions/issues/46
    assert args.num_samples_train % args.batch_size == 0, (
        f"Training sample count {args.num_samples_train} is not a multiple of batch size {args.batch_size}, "
        "which can cause deadlocks with sagemaker_tensorflow.PipeModeDataset. Please preprocess to fix."
    )
    # An Augmented Manifest File channel streams one object per attribute of each JSON line. We assume just 2
    # attributes per object: source-ref (the image) and the label (the annotation) - so this pipeline parses
    # batches of 2 records (image, label) before assembling batch_size batches for final pre-processing.
    ds_train = PipeModeDataset(channel="train") \
        .repeat(args.epochs) \
        .batch(2) \
        .map(data.get_tf_parse_mapper((args.data_shape, args.data_shape), randomize=True)) \
        .batch(args.batch_size, drop_remainder=True) \
        .map(data.get_tf_train_batch_mapper(
            args.batch_size,
            (args.data_shape, args.data_shape),
            anchors, args.num_classes
        ))

    assert args.num_samples_validation % args.batch_size == 0, (
        f"Validation sample count {args.num_samples_validation} is not a multiple of batch size {args.batch_size}, "
        "which can cause deadlocks with sagemaker_tensorflow.PipeModeDataset. Please preprocess to fix."
    )
    ds_val = PipeModeDataset(channel="validation") \
        .repeat(args.epochs) \
        .batch(2) \
        .map(data.get_tf_parse_mapper((args.data_shape, args.data_shape), randomize=False)) \
        .batch(args.batch_size, drop_remainder=True) \
        .map(data.get_tf_train_batch_mapper(
            args.batch_size,
            (args.data_shape, args.data_shape),
            anchors, args.num_classes
        ))

    ## Initial stabilization training
    # (If loading in pretrained parameters, train with frozen layers first to get a stable loss)
    if args.epochs_stabilize:
        logger.info(f"Pre-training for {args.epochs_stabilize} epochs...")
        train_model.compile(
            optimizer=Adam(lr=args.lr_pretrain),
            loss={
                # We calculate loss within the training "model" itself, so Keras loss = y_pred:
                "yolo_loss": lambda y_true, y_pred: y_pred
            })

        train_model.fit(
            ds_train,
            epochs=args.epochs_stabilize,
            initial_epoch=0,
            callbacks=pretrain_callbacks,
            shuffle=False,
            steps_per_epoch=args.num_samples_train // args.batch_size,
            validation_data=ds_val,
            validation_steps=args.num_samples_validation // args.batch_size,
            verbose=2,
        )

    ## Main tuning (unfreezing all layers)
    remaining_epochs = args.epochs - args.epochs_stabilize
    if remaining_epochs > 0:
        logger.info("Unfreezing layers for remaining epochs...")
        for i in range(len(train_model.layers)):
            train_model.layers[i].trainable = True
        # Need to re-compile to apply the change:
        train_model.compile(optimizer=Adam(lr=args.lr),
                            loss={
                                "yolo_loss": lambda y_true, y_pred: y_pred
                            })

        train_model.fit(
            ds_train,
            callbacks=train_callbacks,
            epochs=args.epochs,
            initial_epoch=args.epochs_stabilize,
            shuffle=False,
            steps_per_epoch=args.num_samples_train // args.batch_size,
            validation_data=ds_val,
            validation_steps=args.num_samples_validation // args.batch_size,
            verbose=2,
        )

    ## Save the inference model in TFServing format:
    # (In TFv2, TFServing can open Keras models automatically - but in v1 we need to save as TF model)
    #
    # We can't tf.saved_model.simple_save quite yet, because our TF session is full of training nodes like
    # PipeModeDataset that we don't want to store... So we'll create a temporary Keras .h5 file and recreate
    # the model in an empty session first:
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpfilename = os.path.join(tmpdir, "model.h5")
        tf.keras.models.save_model(
            inference_model,
            tmpfilename,
            overwrite=True,
            include_optimizer=False,
        )
        clear_session_and_reseed(args.seed)
        inference_model = tf.keras.models.load_model(tmpfilename)

    # Now we can save the stripped-down TensorFlow graph ready for TFServing:
    sess = K.get_session()
    tf.saved_model.simple_save(
        sess,
        os.path.join(args.model_path, "model/1"),
        inputs={"inputs": inference_model.input},
        outputs={t.name: t
                 for t in inference_model.outputs},
    )

    # Finally, we need to save our inference container code (custom I/O handlers) too.
    # Really we only need inference.py and whatever files + requirements it depends on, but for simplicity
    # we'll just copy the entire contents of this source_dir into the package:
    copy_tree(
        os.path.dirname(os.path.realpath(__file__)),
        os.path.join(args.model_path, "code"),
    )
Code Example #18
    # The return from PipeModeDataset, "ds" in this code, is a TensorFlow Dataset
    # https://www.tensorflow.org/api_docs/python/tf/data/Dataset
    features = {
        'image': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }

    def parse(record):
        parsed = tf.parse_single_example(record, features)
        image = tf.decode_raw(parsed['image'], tf.uint8)
        image.set_shape([DEPTH * HEIGHT * WIDTH])
        image = tf.cast(image,  tf.float32)/255.0
        label = tf.cast(parsed['label'], tf.int32)
        return image, label
    
    ds = PipeModeDataset(channel='train', record_format='TFRecord')
    num_epochs = 10
    # This yields 40000 (training images)/64 (batch_size) * 10 (epoch) = 6250 batches (steps)
    # Tensorflow dataset raises tf.errors.OutOfRangeError when all the batches are fed as described in training-loop
    ds = ds.repeat(num_epochs) 
    ds = ds.prefetch(10)
    ds = ds.map(parse, num_parallel_calls=10)
    ds = ds.shuffle(buffer_size = 64) #larger than batch_size
    ds = ds.batch(batch_size = 64)
 
    iterator = ds.make_one_shot_iterator()
    itr_initializer = iterator.make_initializer(ds)
    image_batch, label_batch = iterator.get_next()
    
    # Set up PyTorch Neural Network and optimizer
    net = MLP().to(device)
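A hedged sketch of the training loop the comments above describe (not from the original notebook): batches are pulled from the TF1 session until tf.errors.OutOfRangeError is raised, converted to PyTorch tensors, and fed to the MLP. The optimizer and criterion names are assumptions.

import numpy as np
import torch

with tf.Session() as sess:
    while True:
        try:
            images, labels = sess.run([image_batch, label_batch])
        except tf.errors.OutOfRangeError:
            break  # raised once all epochs * batches have been consumed
        x = torch.from_numpy(images).to(device)
        y = torch.from_numpy(labels.astype(np.int64)).to(device)
        optimizer.zero_grad()
        loss = criterion(net(x), y)
        loss.backward()
        optimizer.step()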
Code Example #19
File: pipemode.py  Project: johnbensnyder/dockerfiles
import json
import multiprocessing
import os
import tempfile

import tensorflow as tf
from sagemaker_tensorflow import PipeModeDataset

print("Starting estimator script")

ds = PipeModeDataset("elizabeth", benchmark=True)


class BenchmarkConfig(object):
    def __init__(self):
        self.hp = json.load(open('/opt/ml/input/config/hyperparameters.json'))

    @property
    def batch_size(self):
        return int(self.hp.get('batch_size', 5))

    @property
    def prefetch_size(self):
        return int(self.hp.get('prefetch_size', 1000))

    @property
    def channel(self):
        return self.hp.get('channel', 'elizabeth')

    @property
    def dimension(self):
Code Example #20
def test_missing_channel():
    channel, directory = write_to_channel("A", [b"bear", b"bunny", b"truck"])
    with tf.Session() as sess:
        with pytest.raises(PipeModeDatasetException):
            PipeModeDataset("Not A Channel", pipe_dir=directory, state_dir=directory, config_dir=directory)
Code Example #21
def input_fn(filenames='',
             channel='training',
             batch_size=32,
             num_epochs=1,
             perform_shuffle=False):
    print('Parsing', filenames)

    def decode_libsvm(line):
        #columns = tf.decode_csv(value, record_defaults=CSV_COLUMN_DEFAULTS)
        #features = dict(zip(CSV_COLUMNS, columns))
        #labels = features.pop(LABEL_COLUMN)
        columns = tf.string_split([line], ' ')
        labels = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        #feat_ids = tf.reshape(feat_ids,shape=[-1,FLAGS.field_size])
        #for i in range(splits.dense_shape.eval()[0]):
        #    feat_ids.append(tf.string_to_number(splits.values[2*i], out_type=tf.int32))
        #    feat_vals.append(tf.string_to_number(splits.values[2*i+1]))
        #return tf.reshape(feat_ids,shape=[-1,field_size]), tf.reshape(feat_vals,shape=[-1,field_size]), labels
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels

    # Extract lines from input files using the Dataset API, can pass one filename or filename list
    print("pipe mode ", FLAGS.pipe_mode)
    if FLAGS.pipe_mode == 0:
        """
        dataset = tf.data.TextLineDataset(filenames).map(decode_libsvm, num_parallel_calls=10).prefetch(500000)    # multi-thread pre-process then prefetch
              
        # Randomizes input using a window of 256 elements (read into memory)
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)

        # epochs from blending together.
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size, drop_remainder=True) # Batch size to use
        """

        dataset = tf.data.TextLineDataset(filenames)
        #liangaws: This assumes SageMaker uses FullyReplicated S3 distribution, i.e. every training instance receives a full copy of each channel's data, so the shard is taken here directly by each worker's rank.
        dataset = dataset.shard(hvd.size(), hvd.rank())

        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        dataset = dataset.prefetch(
            500000)  # multi-thread pre-process then prefetch
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)

        # epochs from blending together.
        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)

        dataset = dataset.batch(batch_size,
                                drop_remainder=True)  # Batch size to use

        #return dataset.make_one_shot_iterator()
        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        #return tf.reshape(batch_ids,shape=[-1,field_size]), tf.reshape(batch_vals,shape=[-1,field_size]), batch_labels
        return batch_features, batch_labels

    else:
        print("-------enter into pipe mode branch!------------")
        dataset = PipeModeDataset(channel, record_format='TextLine')
        number_host = len(FLAGS.hosts)
        #liangaws: With Horovod + Pipe mode, if each training instance runs multiple workers, each worker needs its own channel, so the dataset in each channel should be pre-split. Sharding the same channel across instances is only needed when there are multiple instances and each instance runs multiple worker processes.
        if number_host > 1 and hvd.size() > number_host:
            #liangaws: Under SageMaker Horovod, current-host turns out to be the same on every process.
            #index = FLAGS.hosts.index(FLAGS.current_host)
            index = hvd.rank() // FLAGS.worker_per_host
            dataset = dataset.shard(number_host, index)

        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)

        dataset = dataset.prefetch(500000)
        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        dataset = dataset.batch(batch_size, drop_remainder=True)

        return dataset
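A worked example of the shard index computed above, using a hypothetical topology of 2 hosts with 4 worker processes each (so hvd.size() == 8):

worker_per_host = 4                       # hypothetical FLAGS.worker_per_host
number_host = 2
for rank in range(number_host * worker_per_host):
    index = rank // worker_per_host
    print(rank, '-> shard index', index)  # ranks 0-3 -> 0, ranks 4-7 -> 1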
Code Example #22
import json
import multiprocessing
import os
import tempfile

import tensorflow as tf
from sagemaker_tensorflow import PipeModeDataset

ds = PipeModeDataset("elizabeth")


class BenchmarkConfig(object):
    def __init__(self):
        self.hp = json.load(open('/opt/ml/input/config/hyperparameters.json'))

    @property
    def batch_size(self):
        return int(self.hp.get('batch_size', 5))

    @property
    def prefetch_size(self):
        return int(self.hp.get('prefetch_size', 1000))

    @property
    def channel(self):
        return self.hp.get('channel', 'elizabeth')

    @property
    def dimension(self):
        return int(self.hp['dimension'])