Example #1
0
def train(create_model, X, y, batch_size, epochs, gpu_count, parameter_server,
          method):
    if gpu_count > 1:
        ps_device = '/gpu:0' if parameter_server == 'gpu' else '/cpu:0'

        with tf.device(ps_device):
            serial_model = create_model()

        if method == 'kuza55':
            from keras_tf_multigpu.kuza55 import make_parallel
            model = make_parallel(serial_model,
                                  gpu_count=gpu_count,
                                  ps_device=ps_device)
        elif method == 'avolkov1':
            from keras_tf_multigpu.avolkov1 import make_parallel, get_available_gpus
            gpus_list = get_available_gpus(gpu_count)
            model = make_parallel(serial_model,
                                  gdev_list=gpus_list,
                                  ps_device=ps_device)
        elif method == 'fchollet':
            # requires Keras (2.0.9?) https://github.com/fchollet/keras/commit/3dd3e8331677e68e7dec6ed4a1cbf16b7ef19f7f
            from keras.utils import multi_gpu_model
            model = multi_gpu_model(serial_model, gpus=gpu_count)
    else:
        model = serial_model = create_model()

    print('Number of parameters:', serial_model.count_params())

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])
    gauge = SamplesPerSec(batch_size)
    model.fit(X, y, batch_size=batch_size, epochs=epochs, callbacks=[gauge])
    gauge.print_results()
Example #2
0
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv.extend(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    gdev_list = get_available_gpus(mgpu or 1)
    ngpus = len(gdev_list)

    batch_size_1gpu = 32
    batch_size = batch_size_1gpu * ngpus
    num_classes = 1000
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test,
                         y_test) = synthesize_imagenet_dataset(num_classes)
    train_samples = x_train.shape[0]
    test_samples = y_test.shape[0]
    steps_per_epoch = train_samples // batch_size
    print('train_samples:', train_samples)
    print('batch_size:', batch_size)
    print('steps_per_epoch:', steps_per_epoch)
    # validations_steps = test_samples // batch_size
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    # min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    # enqueue_many = True

    # Force input pipeline to CPU:0 to avoid data operations ending up on GPU
    # and resulting in a slow down for multigpu case due to comm overhead.
    with tf.device('/cpu:0'):
        # if no augmentation can go directly from numpy arrays
        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[x_train, y_train],
        #     # tensors=[x_train, y_train.astype(np.int32)],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     enqueue_many=enqueue_many,
        #     num_threads=8)

        # NOTE: This bakes the whole dataset into the TF graph and for larger
        # datasets it fails on "ValueError: GraphDef cannot be larger than 2GB".
        # TODO: Load the a large dataset via queue from RAM/disk.

        input_images = tf.constant(x_train.reshape(train_samples, -1))
        print('train_samples', train_samples)
        print('input_images', input_images.shape)
        image, label = tf.train.slice_input_producer([input_images, y_train],
                                                     shuffle=True)
        # If using num_epochs=epochs have to:
        #     sess.run(tf.local_variables_initializer())
        #     and maybe also: sess.run(tf.global_variables_initializer())
        image = tf.reshape(image, x_train.shape[1:])
        print('image', image.shape)

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, y_test], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order their operation.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(distorted_image,
                                                         max_delta=63)
            distorted_image = tf.image.random_contrast(distorted_image,
                                                       lower=0.2,
                                                       upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is faster.
        x_train_batch, y_train_batch = tf.train.batch([image, label],
                                                      batch_size=batch_size,
                                                      capacity=capacity,
                                                      num_threads=8)

        print('x_train_batch:', x_train_batch.shape)

        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[image, label],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     num_threads=8)

        x_test_batch, y_test_batch = tf.train.batch(
            [test_image, test_label],
            # TODO: shouldn't it be: batch_size=batch_size???
            batch_size=train_samples,
            capacity=capacity,
            num_threads=8,
            name='test_batch',
            shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)

    print('x_train_input', x_train_input)

    gauge = SamplesPerSec(batch_size)
    callbacks = [gauge]

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    model_init = make_model(x_train_input, num_classes)
    x_train_out = model_init.output
    # model_init.summary()

    lr = 0.0001 * ngpus
    if ngpus > 1:
        model = make_parallel(model_init, gdev_list)
    else:
        # Must re-instantiate model per API below otherwise doesn't work.
        model_init = Model(inputs=[x_train_input], outputs=[x_train_out])
        model = model_init

    opt = RMSprop(lr=lr, decay=1e-6)
    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])

    print_mgpu_modelsummary(model)  # will print non-mgpu model as well

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='acc',
                                     verbose=1,
                                     save_best_only=True)
        callbacks = [checkpoint]

    # Start the queue runners.
    sess = KB.get_session()
    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])
    tf.train.start_queue_runners(sess=sess)

    # Fit the model using data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    val_in_train = False  # not sure how the validation part works during fit.

    start_time = time.time()
    model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        # validation_steps=validations_steps if val_in_train else None,
        validation_steps=val_in_train,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} ms'.format('TRAINING',
                                          int(elapsed_time * 1000)))
    gauge.print_results()

    weights_file = checkptfile  # './saved_cifar10_wt.h5'
    if not checkpt_flag:  # empty list
        model.save_weights(checkptfile)

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)

    KB.clear_session()

    # Second Session. Demonstrate that the model works
    # test_model = make_model(x_test.shape[1:], num_classes,
    #                         weights_file=weights_file)
    test_model = make_model(x_test.shape[1:], num_classes)
    test_model.load_weights(weights_file)
    test_model.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])

    if data_augmentation:
        x_proccessed = sess.run(x_test_batch)
        y_proccessed = sess.run(y_test_batch)
        loss, acc = test_model.evaluate(x_proccessed, y_proccessed)
    else:
        loss, acc = test_model.evaluate(x_test, y_test)

    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))
def main():
    # user options
    batch_size = 128
    val_in_train = False
    use_model_checkpt = False

    # demo processing
    sess = tf.Session()
    KB.set_session(sess)

    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)
    batch_size = batch_size * ngpus

    data = mnist.load_mnist()
    X_train = data.train.images
    X_test = data.test.images
    train_samples = X_train.shape[0]  # 60000
    test_samples = X_test.shape[0]  # 10000
    height_nrows = 28
    width_ncols = 28
    batch_shape = [batch_size, height_nrows, width_ncols, 1]
    epochs = 5
    steps_per_epoch = train_samples / batch_size
    validations_steps = test_samples / batch_size
    nclasses = 10

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    enqueue_many = True

    x_train_batch, y_train_batch = tf.train.shuffle_batch(
        tensors=[data.train.images, data.train.labels.astype(np.int32)],
        batch_size=batch_size,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue,
        enqueue_many=enqueue_many,
        num_threads=8)

    x_train_batch = tf.cast(x_train_batch, tf.float32)
    x_train_batch = tf.reshape(x_train_batch, shape=batch_shape)

    y_train_batch = tf.cast(y_train_batch, tf.int32)
    y_train_batch = tf.one_hot(y_train_batch, nclasses)

    x_train_input = Input(tensor=x_train_batch)

    x_test_batch, y_test_batch = tf.train.batch(
        tensors=[data.test.images, data.test.labels.astype(np.int32)],
        batch_size=batch_size,
        capacity=capacity,
        enqueue_many=enqueue_many,
        num_threads=8)

    # I like the non-functional definition of model more.
    # model_init = make_model(x_train_input, nclasses)
    # x_train_out = model_init.output
    # train_model = Model(inputs=[x_train_input], outputs=[x_train_out])

    x_train_out = cnn_layers(x_train_input, nclasses)
    train_model = Model(inputs=[x_train_input], outputs=[x_train_out])
    if ngpus > 1:
        train_model = make_parallel(train_model, gdev_list)

    lr = 2e-3 * ngpus
    train_model.compile(optimizer=RMSprop(lr=lr, decay=1e-5),
                        loss='categorical_crossentropy',
                        metrics=['accuracy'],
                        target_tensors=[y_train_batch])

    if ngpus > 1:
        print_mgpu_modelsummary(train_model)
    else:
        train_model.summary()

    # Callbacks
    if use_model_checkpt:
        mon = 'val_acc' if val_in_train else 'acc'
        checkpoint = ModelCheckpoint(
            'saved_wt.h5', monitor=mon, verbose=0,
            save_best_only=True,
            save_weights_only=True)
        checkpoint = [checkpoint]
    else:
        checkpoint = []

    callbacks = checkpoint
    # Training slower with callback. Multigpu slower with callback during
    # training than 1 GPU. Again, mnist is too trivial of a model and dataset
    # to benchmark or stress GPU compute capabilities. I set up this example
    # to illustrate potential for speedup of multigpu case trying to use mnist
    # as a stressor.
    # It's like comparing a 5 ft race between a person and a truck. A truck is
    # obviously faster than a person but in a 5 ft race the person will likely
    # win due to slower startup for the truck.
    # I will re-implement this with Cifar that should be a better benchmark.

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    # Fit the model using data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    start_time = time.time()
    train_model.fit(
        validation_data=(x_test_batch, y_test_batch) if val_in_train else None,
        validation_steps=validations_steps if val_in_train else None,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} ms'.format('TRAINING',
                                          int(elapsed_time * 1000)))

    if not checkpoint:  # empty list
        train_model.save_weights('./saved_wt.h5')

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)

    KB.clear_session()

    # Second Session. Demonstrate that the model works and is independent of
    # the TFRecord pipeline, and to test loading trained model without tensors.
    x_test = np.reshape(data.validation.images,
                        (data.validation.images.shape[0], 28, 28, 1))
    y_test = data.validation.labels
    x_test_inp = KL.Input(shape=(x_test.shape[1:]))
    test_out = cnn_layers(x_test_inp, nclasses)
    test_model = Model(inputs=x_test_inp, outputs=test_out)

    test_model.load_weights('saved_wt.h5')
    test_model.compile(optimizer='rmsprop',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    test_model.summary()

    loss, acc = test_model.evaluate(x_test,
                                    to_categorical(y_test),
                                    nclasses)
    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))
model_classifier = Model([img_input, roi_input], classifier)

# this is a model that holds both the RPN and the classifier, used to
# load/save weights for the models
model_all = Model([img_input, roi_input], rpn[:2] + classifier)

try:
    print('loading weights from {}'.format(C.base_net_weights))
    model_rpn.load_weights(C.base_net_weights, by_name=True)
    model_classifier.load_weights(C.base_net_weights, by_name=True)
except:
    print('Could not load pretrained model weights. Weights can be found in the keras application folder \
        https://github.com/fchollet/keras/tree/master/keras/applications')

# ------------------------------------------------------ User multi GPU support
gpus = get_available_gpus()
ngpus = len(gpus)
print_mgpu_modelsummary(model_rpn)
model_rpn = make_parallel(model_rpn, gpus)
print_mgpu_modelsummary(model_rpn)

optimizer = Adam(lr=1e-5)
optimizer_classifier = Adam(lr=1e-5)
model_rpn.compile(optimizer=optimizer, loss=[losses.rpn_loss_cls(
    num_anchors), losses.rpn_loss_regr(num_anchors)])
model_classifier.compile(optimizer=optimizer_classifier, loss=[losses.class_loss_cls, losses.class_loss_regr(
    len(classes_count) - 1)], metrics={'dense_class_{}'.format(len(classes_count)): 'accuracy'})
model_all.compile(optimizer='sgd', loss='mae')

epoch_length = 1000
num_epochs = int(options.num_epochs)
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv.extend(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    enqueue = args.enqueue
    usenccl = args.nccl
    syncopt = args.syncopt

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = args.batch_size
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    # (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    callbacks = []

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    print(x_train.shape, 'train shape')
    # with tf.device('/cpu:0'):
    model_init = make_model(x_train.shape, num_classes,
                            filepath if checkpt_flag else None)

    # model_init = partial(make_model, x_train.shape, num_classes,
    #                      filepath if checkpt_flag else None)

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        callbacks += [checkpoint]

    lr = 0.0001
    if mgpu > 1 or mgpu == -1:
        gpus_list = get_available_gpus(mgpu)
        ngpus = len(gpus_list)
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        batch_size = batch_size * ngpus  #
        lr = lr * ngpus
        # batch_size = 40000  # split over four devices works fine no grad avg
        # batch_size = 25000  # split over four devices works fine w/ grad avg

        # Data-Parallelize the model via function or class.
        model = make_parallel(model_init,
                              gpus_list,
                              usenccl=usenccl,
                              syncopt=syncopt,
                              enqueue=enqueue)
        # model = ModelMGPU(serial_model=model_init, gdev_list=gpus_list,
        #                   syncopt=syncopt, usenccl=usenccl, enqueue=enqueue)
        print_mgpu_modelsummary(model)
        if not syncopt:
            opt = RMSprop(lr=lr, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=lr, decay=1e-6, gdev_list=gpus_list)

    else:
        model = model_init
        # batch_size = batch_size * 3
        # batch_size = 25000  # exhaust GPU memory. Crashes.
        print(model.summary())

        # initiate RMSprop optimizer
        opt = RMSprop(lr=lr, decay=1e-6)

    gauge = SamplesPerSec(batch_size)
    callbacks += [gauge]

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train,
                  y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)

        # Fit the model on the batches generated by datagen.flow().
        # mygen = mygenerator(nsamples, batch_size, x_train, y_train)
        # model.fit_generator(mygen,
        #                     steps_per_epoch=steps_per_epoch,
        #                     epochs=epochs,
        #                     validation_data=(x_test, y_test),
        #                     callbacks=callbacks)

    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train,
                                         y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    gauge.print_results()