def import_model(path):
    """Import model from given path and assign it to appropriate devices"""
    K.clear_session()
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.allow_growth = True
    tfsess = tf.Session(config=config)
    K.set_session(tfsess)
    model = load_model(path, custom_objects={'FreezePadding':FreezePadding,
                                             'FreezePadding_Non_Negative':FreezePadding_Non_Negative})
    if len(get_available_gpus()) > 1:
        model = make_parallel(model)
    return model
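# For reference, a minimal sketch of the get_available_gpus() helper used
# above, assuming a TF 1.x backend (the repo's own helper may differ):
from tensorflow.python.client import device_lib

def get_available_gpus_sketch(ngpus=-1):
    '''Return local GPU device names, optionally capped at ngpus.'''
    gpus = [d.name for d in device_lib.list_local_devices()
            if d.device_type == 'GPU']
    return gpus if ngpus < 1 else gpus[:ngpus]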
Example #2
            lr=float(parser.get('Training_Parameters', 'learning_rate')))
    elif parser.get('Training_Parameters', 'optimizer') == "Adam":
        optimizer_used = keras.optimizers.Adam(
            lr=float(parser.get('Training_Parameters', 'learning_rate')))
    else:
        print "Optimizer unchoosen or unknown -> default: Adam"
        optimizer_used = keras.optimizers.Adam(
            lr=float(parser.get('Training_Parameters', 'learning_rate')))

    ngpus = args.__dict__['ngpus']
    # print('Using {} GPUs'.format(ngpus))
    if ngpus > 1:
        if backend == 'tensorflow':
            with tf.device('/cpu:0'):
                model_serial = read_NN_weights(args.__dict__, base_model)
            gdev_list = get_available_gpus()
            print('Using GPUs: {}'.format(gdev_list))
            model = make_parallel(model_serial, gdev_list)
        else:
            raise Exception(
                'Multi GPU can only be used with tensorflow as Backend.')
    else:
        model = read_NN_weights(args.__dict__, base_model)

    # Choosing the Loss function
    loss_func = 'mean_squared_error'
    if parser.has_option('Training_Parameters', 'loss_function'):
        loss_func = parser.get('Training_Parameters', 'loss_function')
        if loss_func == "weighted_categorial_crossentropy":
            weights = parser.get('Training_Parameters', 'weights')
            weights = np.array(weights.split(',')).astype(float)
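            # A minimal sketch of how such a weighted loss could look (an
            # assumption; the repo's actual weighted loss is not shown):
            from keras import backend as K

            def weighted_loss_sketch(y_true, y_pred):
                '''Categorical crossentropy scaled per class by weights.'''
                y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon())
                return -K.sum(y_true * K.log(y_pred) * K.constant(weights),
                              axis=-1)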
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__

    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    # mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    # enqueue = args.enqueue
    # usenccl = args.nccl
    # syncopt = args.syncopt

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    # ---------------------------------------------- Distributed setup on SLURM
    scpar = SlurmClusterParser()

    logdevp_flag = True if _DEVPROF or logdevp else False
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(
        log_device_placement=logdevp_flag,  # True,
        allow_soft_placement=True,
        gpu_options=gpu_options)

    cmgr_facade = TFClusterManagerFacade(scpar.num_tasks_per_host,
                                         scpar.hostnames,
                                         scpar.num_parameter_servers,
                                         scpar.my_proc_id)
    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in server below.
    server = cmgr_facade.get_server(config)
    tfsess = cmgr_facade.get_session(server)
    KB.set_session(tfsess)

    # TODO: Try
    #     sv = tf.train.Supervisor(...)
    #     with sv.managed_session(server.target, config=config) ...
    #     sess = sv.prepare_or_wait_for_session(server.target,
    #                                           config=sess_config)
    #     KB.set_session(tfsess)  # based on this managed session.

    #: :type cluster_spec: tf.train.ClusterSpec
    # cluster_spec = cmgr_facade.get_cluster_spec()
    job_type = cmgr_facade.myjobtype
    # task_id = cmgr_facade.mytask_id

    is_chief = cmgr_facade.is_chief

    if job_type == JobType.ps:
        # JOIN PARAMETER SERVERS
        # server.join()
        cmgr_facade.join(server)

    # Once the server is started everything but the chief worker can join
    # the server and wait to process/service graph computations. Chief pushes
    # the compute graph.
    if not is_chief:
        # JOIN WORKERS (PS also) EXCEPT FOR CHIEF
        cmgr_facade.join(server)

    # sleep(2)  # Have the chief wait just in case. Occasionally get errors.

    # The ngpus per host needs to be obtained via MPI or otherwise sync'd.
    # Currently assuming all hosts have the same number of GPUs.
    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)

    #: :type mywgdev: tf.DeviceSpec
    # mywgdev, wgdev_list = cmgr_facade.get_workers_dev_list(ngpus)
    _, wgdev_list = cmgr_facade.get_workers_dev_list(ngpus)
    nworker_devices_total = len(wgdev_list)
    # print('\n\tCLUSTER_SPEC_DICT: {}\n\tWGDEV_LIST: {}\n'
    #       .format(cmgr_facade.clusterspec_dict,
    #               [dev.to_string() for dev in wgdev_list]))  # DEBUG

    # ------------------------------------ Data loading and basic preprocessing
    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    callbacks = None

    print(x_train.shape, 'train shape')

    # --------------------------------------------- Setup model and parallelize
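    # _load_fn below assigns every op a constant cost, so the greedy
    # strategy places variables essentially round-robin across ps tasks.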
    def _load_fn(unused_op):
        return 1

    cspec = cmgr_facade.get_cluster_spec()
    num_ps = cspec.num_tasks(JobType.ps)
    ps_strategy = \
        tf.contrib.training.GreedyLoadBalancingStrategy(num_ps, _load_fn)

    ps_device = tf.DeviceSpec(job=JobType.ps,
                              device_type=DevType.cpu,
                              device_index=0).to_string()

    rdsetter = tf.train.replica_device_setter(
        cluster=cspec,
        ps_strategy=ps_strategy,
        ps_device=ps_device,  # '/job:ps/cpu:0'  # seems to work
    )
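    # Nodes built under this device function keep their variables on the
    # ps job while compute ops stay on the workers, letting all workers
    # share a single copy of the weights.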
    with tf.device(rdsetter):
        model_init = make_model(x_train.shape, num_classes,
                                filepath if checkpt_flag else None)

    # model_init = partial(make_model, x_train.shape, num_classes,
    #                      filepath if checkpt_flag else None)

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        callbacks = [checkpoint]

    print('\n\tCLUSTER_SPEC_DICT: {}\n\tWGDEV_LIST: {}\n'.format(
        cmgr_facade.clusterspec_dict,
        [dev.to_string() for dev in wgdev_list]))  # DEBUG

    batch_size = batch_size * nworker_devices_total
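    # Scaling the global batch by the total worker-device count keeps the
    # per-device batch at its original size once make_parallel splits it.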
    # batch_size = 40000  # split over four devices works fine no grad avg
    # batch_size = 25000  # split over four devices works fine w/ grad avg

    # ps_device = rdsetter
    # ps_device = '/job:ps/cpu:0'
    # ps_device = '/cpu:0'
    # ps_device = tf.train.replica_device_setter(
    #     ps_device="/job:ps/cpu:0",
    #     worker_device=mywgdev.to_string(),
    #     cluster=cmgr_facade.get_cluster_spec())

    # TODO: Use replica_device_setter and do not join workers above. Try using
    # managed session.
    # Need to think about this because what I want is to have the workers
    # load the relevant data on the node they are running from instead of
    # loading on chief rank's node and transferring slices over network.
    # Maybe parameter servers can do this via ZeroMQ.
    # Ref: https://gist.github.com/fchollet/2c9b029f505d94e6b8cd7f8a5e244a4e

    # Data-Parallelize the model via function or class.
    model = make_parallel(model_init, wgdev_list, ps_device=ps_device)
    # model = ModelMGPU(serial_model=model_init, gdev_list=gpus_list,
    #                   syncopt=syncopt, usenccl=usenccl, enqueue=enqueue)
    print_mgpu_modelsummary(model)

    # ------------------------------------------------------------ Run training
    opt = RMSprop(lr=0.0001, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train,
                  y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)

        # Fit the model on the batches generated by datagen.flow().
        # mygen = mygenerator(nsamples, batch_size, x_train, y_train)
        # model.fit_generator(mygen,
        #                     steps_per_epoch=steps_per_epoch,
        #                     epochs=epochs,
        #                     validation_data=(x_test, y_test),
        #                     callbacks=callbacks)

    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train,
                                         y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    # ------------------------------------------------------------- STOP SERVER
    # if not is_chief:
    #     # JOIN WORKERS (PS also) EXCEPT FOR CHIEF
    #     cmgr_facade.join(server)
    cmgr_facade.stop_chief(server)
def model_create(ARGS):
    '''Create and Compile model and assign it to provided devices'''
    def retain(ARGS):
        '''Create the model'''

        #Define the constant used to reshape the context vectors
        reshape_size = ARGS.emb_size + ARGS.numeric_size
        if ARGS.allow_negative:
            embeddings_constraint = FreezePadding()
            beta_activation = 'tanh'
            output_constraint = None
        else:
            embeddings_constraint = FreezePadding_Non_Negative()
            beta_activation = 'sigmoid'
            output_constraint = non_neg()

        #Get available gpus; returns an empty list if none
        glist = get_available_gpus()

        def reshape(data):
            '''Reshape the context vectors to 3D vector'''
            return K.reshape(x=data, shape=(K.shape(data)[0], 1, reshape_size))

        #Code Input
        codes = L.Input((None, None), name='codes_input')
        inputs_list = [codes]
        #Calculate embedding for each code and sum them to a visit level
        codes_embs_total = L.Embedding(
            ARGS.num_codes + 1,
            ARGS.emb_size,
            name='embedding',
            embeddings_constraint=embeddings_constraint)(codes)
        codes_embs = L.Lambda(lambda x: K.sum(x, axis=2))(codes_embs_total)
        #Numeric input if needed
        if ARGS.numeric_size:
            numerics = L.Input((None, ARGS.numeric_size), name='numeric_input')
            inputs_list.append(numerics)
            full_embs = L.concatenate([codes_embs, numerics], name='catInp')
        else:
            full_embs = codes_embs

        #Apply dropout on inputs
        full_embs = L.Dropout(ARGS.dropout_input)(full_embs)

        #Time input if needed
        if ARGS.use_time:
            time = L.Input((None, 1), name='time_input')
            inputs_list.append(time)
            time_embs = L.concatenate([full_embs, time], name='catInp2')
        else:
            time_embs = full_embs

        #Setup Layers
        #This implementation uses Bidirectional LSTM instead of reverse order
        #    (see https://github.com/mp2893/retain/issues/3 for more details)

        #If a GPU is available with the TensorFlow backend, use CuDNNLSTM for much faster training
        if glist:
            alpha = L.Bidirectional(L.CuDNNLSTM(ARGS.recurrent_size,
                                                return_sequences=True),
                                    name='alpha')
            beta = L.Bidirectional(L.CuDNNLSTM(ARGS.recurrent_size,
                                               return_sequences=True),
                                   name='beta')
        else:
            alpha = L.Bidirectional(L.LSTM(ARGS.recurrent_size,
                                           return_sequences=True,
                                           implementation=2),
                                    name='alpha')
            beta = L.Bidirectional(L.LSTM(ARGS.recurrent_size,
                                          return_sequences=True,
                                          implementation=2),
                                   name='beta')

        alpha_dense = L.Dense(1, kernel_regularizer=l2(ARGS.l2))
        beta_dense = L.Dense(ARGS.emb_size + ARGS.numeric_size,
                             activation=beta_activation,
                             kernel_regularizer=l2(ARGS.l2))

        #Compute alpha, visit attention
        alpha_out = alpha(time_embs)
        alpha_out = L.TimeDistributed(alpha_dense,
                                      name='alpha_dense_0')(alpha_out)
        alpha_out = L.Softmax(axis=1)(alpha_out)
        #Compute beta, codes attention
        beta_out = beta(time_embs)
        beta_out = L.TimeDistributed(beta_dense, name='beta_dense_0')(beta_out)
        #Compute context vector based on attentions and embeddings
        c_t = L.Multiply()([alpha_out, beta_out, full_embs])
        c_t = L.Lambda(lambda x: K.sum(x, axis=1))(c_t)
        #Reshape to 3d vector for consistency between Many to Many and Many to One implementations
        contexts = L.Lambda(reshape)(c_t)

        #Make a prediction
        contexts = L.Dropout(ARGS.dropout_context)(contexts)
        output_layer = L.Dense(1,
                               activation='sigmoid',
                               name='dOut',
                               kernel_regularizer=l2(ARGS.l2),
                               kernel_constraint=output_constraint)

        #TimeDistributed is used for consistency
        # between Many to Many and Many to One implementations
        output = L.TimeDistributed(output_layer,
                                   name='time_distributed_out')(contexts)
        #Define the model with appropriate inputs
        model = Model(inputs=inputs_list, outputs=[output])

        return model

    #Set Tensorflow to grow GPU memory consumption instead of grabbing all of it at once
    K.clear_session()
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True
    tfsess = tf.Session(config=config)
    K.set_session(tfsess)
    #If there are multiple GPUs set up a multi-gpu model
    glist = get_available_gpus()
    if len(glist) > 1:
        with tf.device('/cpu:0'):
            model = retain(ARGS)
        model_final = make_parallel(model, glist)
    else:
        model_final = retain(ARGS)

    #Compile the model - adamax has produced best results in our experiments
    model_final.compile(optimizer='adamax',
                        loss='binary_crossentropy',
                        metrics=['accuracy'],
                        sample_weight_mode='temporal')

    return model_final
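# For context, a minimal sketch of what the FreezePadding constraints used
# above might look like (an assumption; the repo's definitions may differ):
# pin the padding row (index 0 here) of the embedding matrix to zero.
from keras.constraints import Constraint

class FreezePaddingSketch(Constraint):
    '''Zero the first (padding) row of the embedding matrix.'''
    def __call__(self, w):
        mask = K.concatenate([K.zeros_like(w[:1]), K.ones_like(w[1:])],
                             axis=0)
        return w * mask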
Example #5
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    mgpu = 1 if getattr(args, 'mgpu', None) is None else args.mgpu

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    batch_size = 128 * ngpus
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0], ) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0], ) + original_img_size)

    print('x_train.shape:', x_train.shape)

    vae_serial, encoder, generator = make_vae_and_codec(
        original_img_size, img_chns, img_rows, img_cols, batch_size, filters,
        num_conv, intermediate_dim, latent_dim, epsilon_std)
    # :  :type vae: Model
    vae = make_parallel(vae_serial, gpus_list)

    lr = 0.001 * ngpus
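    # The lr above follows the linear-scaling heuristic: batch_size grows
    # with ngpus, so the learning rate is scaled by the same factor.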
    opt = RMSprop(lr)  # 'rmsprop'
    # opt = tf.train.RMSPropOptimizer(lr)
    # opt = TFOptimizer(opt)
    vae.compile(optimizer=opt, loss=None)
    # vae.summary()
    print_mgpu_modelsummary(vae)

    callbacks = [BatchTiming(), SamplesPerSec(batch_size)]

    vae.fit(x_train,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks)  # ,
    # validation_data=(x_test, None))  # Not accurate for mgpu. Use vae_val.

    vae_val = vae_serial
    vae_val.compile(optimizer=opt, loss=None)
    loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size // ngpus)
    print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    # plt.show()
    plt.savefig('vae_scatter.ps')
    plt.close()

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # Linearly spaced coordinates on the unit square were transformed
    # through the inverse CDF (ppf) of the Gaussian to produce values of
    # the latent variables z, since the prior of the latent space is
    # Gaussian.
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
            z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
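            # The single latent point is tiled to a full batch because the
            # generator was built for batch_size inputs; only x_decoded[0]
            # is kept below.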
            x_decoded = generator.predict(z_sample, batch_size=batch_size)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size:(i + 1) * digit_size,
                   j * digit_size:(j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    # plt.show()
    plt.savefig('vae_digit.ps')
    plt.close()
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    enqueue = args.enqueue
    usenccl = args.nccl
    syncopt = args.syncopt

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    # (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Squeeze is to deal with to_categorical bug in Keras 2.1.0 which
    # was fixed in Keras 2.1.1
    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes).squeeze()
    y_test = to_categorical(y_test, num_classes).squeeze()

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    callbacks = []

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    print(x_train.shape, 'train shape')
    # with tf.device('/cpu:0'):
    model_init = make_model(x_train.shape, num_classes,
                            filepath if checkpt_flag else None)

    # model_init = partial(make_model, x_train.shape, num_classes,
    #                      filepath if checkpt_flag else None)

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        callbacks = [checkpoint]

    lr = 0.0001
    if mgpu > 1 or mgpu == -1:
        gpus_list = get_available_gpus(mgpu)
        ngpus = len(gpus_list)
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        batch_size = batch_size * ngpus  #
        lr = lr * ngpus
        # batch_size = 40000  # split over four devices works fine no grad avg
        # batch_size = 25000  # split over four devices works fine w/ grad avg

        # Data-Parallelize the model via function or class.
        model = make_parallel(model_init,
                              gpus_list,
                              usenccl=usenccl,
                              syncopt=syncopt,
                              enqueue=enqueue)
        # model = ModelMGPU(serial_model=model_init, gdev_list=gpus_list,
        #                   syncopt=syncopt, usenccl=usenccl, enqueue=enqueue)
        print_mgpu_modelsummary(model)
        if not syncopt:
            opt = RMSprop(lr=lr, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=lr, decay=1e-6, gdev_list=gpus_list)

    else:
        model = model_init
        # batch_size = batch_size * 3
        # batch_size = 25000  # exhaust GPU memory. Crashes.
        print(model.summary())

        # initiate RMSprop optimizer
        opt = RMSprop(lr=lr, decay=1e-6)

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train,
                  y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)

        # Fit the model on the batches generated by datagen.flow().
        # mygen = mygenerator(nsamples, batch_size, x_train, y_train)
        # model.fit_generator(mygen,
        #                     steps_per_epoch=steps_per_epoch,
        #                     epochs=epochs,
        #                     validation_data=(x_test, y_test),
        #                     callbacks=callbacks)

    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train,
                                         y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    model_init.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])
    metrics = model_init.evaluate(x=x_test, y=y_test, batch_size=batch_size)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    enqueue = args.enqueue
    usenccl = args.nccl
    syncopt = args.syncopt

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = 32  # 64
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    callbacks = None

    if _DEVPROF or logdevp:
        import tensorflow as tf

        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    print(x_train.shape, 'train shape')
    print(y_train.shape, 'label shape')
    model_init = make_model(x_train.shape, num_classes,
                            filepath if checkpt_flag else None)

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        callbacks = [checkpoint]

    if mgpu > 1 or mgpu == -1:
        gpus_list = get_available_gpus(mgpu)
        ngpus = len(gpus_list)
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        batch_size = batch_size * ngpus  #
        # batch_size = 40000  # split over four devices works fine no grad avg
        # batch_size = 25000  # split over four devices works fine w/ grad avg

        # Data-Parallelize the model via function or class.
        model = make_parallel(model_init,
                              gpus_list,
                              usenccl=usenccl,
                              syncopt=syncopt,
                              enqueue=enqueue,
                              model_class=ModelMGPU_Dflow)
        # model = ModelMGPU_Dflow(
        #     serial_model=model_init, gdev_list=gpus_list,
        #     syncopt=syncopt, usenccl=usenccl, enqueue=enqueue)
        print_mgpu_modelsummary(model)
        if not syncopt:
            opt = RMSprop(lr=0.0001, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=0.0001, decay=1e-6, gdev_list=gpus_list)

    else:
        model = model_init
        # batch_size = batch_size * 3
        # batch_size = 25000  # exhaust GPU memory. Crashes.
        print(model.summary())

        # initiate RMSprop optimizer
        opt = RMSprop(lr=0.0001, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    # prepare dataset
    dataset_train = get_data('train',
                             num_classes,
                             batch_size=batch_size,
                             aug=data_augmentation,
                             epochs=epochs)
    # dataset_test = get_data('test', cifar_classnum)

    if not data_augmentation:
        print('Not using data augmentation.')

    # Plain ol' fit is faster than the dataflow generator below.
    # model.fit(x_train, y_train,
    #           batch_size=batch_size,
    #           epochs=epochs,
    #           validation_data=(x_test, y_test),
    #           shuffle=True,
    #           callbacks=callbacks)

    # Fit the model on the batches generated by datagen.flow().
    # dataset_train.reset_state()
    # mygen = dataset_train.get_data()
    # for dp in mygen:
    #     print('DP SHAPE: {}'.format(dp[0].shape))
    # model.fit_generator(mygen,
    #                     steps_per_epoch=steps_per_epoch,
    #                     epochs=epochs,
    #                     validation_data=(x_test, y_test),
    #                     callbacks=callbacks)

    # Using fit_dataflow method that's mixed into ModelMGPU class.
    model.fit_dataflow(dataset_train,
                       steps_per_epoch=steps_per_epoch,
                       epochs=epochs,
                       validation_data=(x_test, y_test),
                       callbacks=callbacks)
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    gdev_list = get_available_gpus(mgpu or 1)
    ngpus = len(gdev_list)

    batch_size_1gpu = 32
    batch_size = batch_size_1gpu * ngpus
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    train_samples = x_train.shape[0]
    test_samples = y_test.shape[0]
    steps_per_epoch = train_samples // batch_size
    # validations_steps = test_samples // batch_size
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Squeeze is to deal with to_categorical bug in Keras 2.1.0 which
    # was fixed in Keras 2.1.1
    y_train = to_categorical(y_train, num_classes).astype(np.float32).squeeze()
    y_test = to_categorical(y_test, num_classes).astype(np.float32).squeeze()

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number of elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    # min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    # enqueue_many = True

    # Force input pipeline to CPU:0 to avoid data operations ending up on GPU
    # and resulting in a slow down for multigpu case due to comm overhead.
    with tf.device('/cpu:0'):
        # if no augmentation can go directly from numpy arrays
        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[x_train, y_train],
        #     # tensors=[x_train, y_train.astype(np.int32)],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     enqueue_many=enqueue_many,
        #     num_threads=8)

        input_images = tf.constant(x_train.reshape(train_samples, -1))
        input_labels = tf.constant(y_train)  # already in proper shape

        image, label = tf.train.slice_input_producer(
            [input_images, input_labels], shuffle=True)
        # If using num_epochs=epochs have to:
        #     sess.run(tf.local_variables_initializer())
        #     and maybe also: sess.run(tf.global_variables_initializer())
        image = tf.reshape(image, x_train.shape[1:])
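        # The images were flattened above so tf.constant holds a 2-D
        # tensor; the reshape restores each example's (rows, cols, chns).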

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, y_test], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order of their operation.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(
                distorted_image, max_delta=63)
            distorted_image = tf.image.random_contrast(
                distorted_image, lower=0.2, upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is faster.
        x_train_batch, y_train_batch = tf.train.batch(
            [image, label],
            batch_size=batch_size,
            capacity=capacity,
            num_threads=8)

        # https://stackoverflow.com/a/43613376/3457624
        x_test_batch, y_test_batch = tf.train.batch(
            [test_image, test_label],
            batch_size=test_samples,  # if converting to numpy first
            # batch_size=batch_size, # if using tensors
            capacity=capacity,
            # num_threads=8,
            num_threads=1,  # set to 1 to make deterministic
            name='test_batch',
            shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)

    callbacks = []

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    model_init = make_model(x_train_input, num_classes,
                            filepath if checkpt_flag else None)
    x_train_out = model_init.output
    # model_init.summary()
    model_init = Model(inputs=[x_train_input], outputs=[x_train_out])

    lr = 0.0001 * ngpus
    if ngpus > 1:
        model = make_parallel(model_init, gdev_list)
    else:
        # Must re-instantiate the model per the API below; otherwise it
        # doesn't work.
        model = model_init

    opt = RMSprop(lr=lr, decay=1e-6)
    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])
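    # With target_tensors the labels come straight from the input pipeline,
    # so fit() below takes no x/y arrays, only steps_per_epoch.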

    print_mgpu_modelsummary(model)  # will print non-mgpu model as well

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=1,
                                     save_best_only=True)
        callbacks += [checkpoint]

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    # Start the queue runners.
    sess = KB.get_session()

    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])

    # Fit the model using data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    val_in_train = False  # not sure how the validation part works during fit.
    start_time = time.time()
    model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        # validation_steps=validations_steps if val_in_train else None,
        validation_steps=val_in_train,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    weights_file = filepath if checkpt_flag else './saved_cifar10_wt.h5'
    if not checkpt_flag:  # no checkpoint callback; save weights manually
        model.save_weights(weights_file)

    KB.clear_session()

    # Second Session. Demonstrate that the model works
    # test_model = make_model(x_test.shape[1:], num_classes,
    #                         weights_file=weights_file)
    test_model = make_model(x_test.shape[1:], num_classes)
    test_model.load_weights(weights_file)
    test_model.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])

    if data_augmentation:
        # Need to run x_test through per_image_standardization otherwise
        # results get messed up.
        x_processed, y_processed = sess.run([x_test_batch, y_test_batch])
        # DEBUGGING
        # xdiff = np.abs(x_test - x_processed)
        # print('MAX XDIFF: {}'.format(np.max(xdiff)))
        # ydiff = np.abs(y_test - y_processed)
        # print('y_test: {}'.format(y_test[0:5, :]))
        # print('y_processed: {}'.format(y_processed[0:5, :]))
        # print('ydiff: {}'.format(ydiff[-10:, :]))
        # print('MAX YDIFF: {}'.format(np.max(np.sum(ydiff))))

        loss, acc = test_model.evaluate(x_processed, y_processed)
    else:
        loss, acc = test_model.evaluate(x_test, y_test)

    # # Demonstrate that the model works using TF pipeline directly.
    # # In tf.train.batch for test data change batch_size=batch_size
    # # instead of train_samples. Uncomment below and comment out above.
    # val_samples = x_test.shape[0]
    # steps_per_epoch_val = int(np.ceil(val_samples / float(batch_size)))
    # images_val = KL.Input(tensor=x_test_batch)
    # test_model = make_model(images_val, num_classes,
    #                         weights_file)
    # test_model = Model(inputs=[images_val], outputs=[test_model.output])
    # test_model.compile(
    #     loss='categorical_crossentropy',
    #     optimizer=opt,
    #     metrics=['accuracy'],
    #     target_tensors=[y_test_batch])
    # loss, acc = test_model.evaluate(x=None, y=None,
    #                                 steps=steps_per_epoch_val)

    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)
Example #9
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    gdev_list = get_available_gpus(mgpu or 1)
    ngpus = len(gdev_list)

    batch_size_1gpu = 32
    batch_size = batch_size_1gpu * ngpus
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    train_samples = x_train.shape[0]
    test_samples = y_test.shape[0]
    steps_per_epoch = train_samples // batch_size
    # validations_steps = test_samples // batch_size
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Squeeze is to deal with to_categorical bug in Keras 2.1.0 which
    # was fixed in Keras 2.1.1
    y_train = to_categorical(y_train, num_classes).astype(np.float32).squeeze()
    y_test = to_categorical(y_test, num_classes).astype(np.float32).squeeze()

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number of elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    # min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    # enqueue_many = True

    # Force input pipeline to CPU:0 to avoid data operations ending up on GPU
    # and resulting in a slow down for multigpu case due to comm overhead.
    with tf.device('/cpu:0'):
        # if no augmentation can go directly from numpy arrays
        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[x_train, y_train],
        #     # tensors=[x_train, y_train.astype(np.int32)],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     enqueue_many=enqueue_many,
        #     num_threads=8)

        input_images = tf.constant(x_train.reshape(train_samples, -1))
        input_labels = tf.constant(y_train)  # already in proper shape

        image, label = tf.train.slice_input_producer(
            [input_images, input_labels], shuffle=True)
        # If using num_epochs=epochs have to:
        #     sess.run(tf.local_variables_initializer())
        #     and maybe also: sess.run(tf.global_variables_initializer())
        image = tf.reshape(image, x_train.shape[1:])

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, y_test], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order of their operation.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(distorted_image,
                                                         max_delta=63)
            distorted_image = tf.image.random_contrast(distorted_image,
                                                       lower=0.2,
                                                       upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is faster.
        x_train_batch, y_train_batch = tf.train.batch([image, label],
                                                      batch_size=batch_size,
                                                      capacity=capacity,
                                                      num_threads=8)

        # https://stackoverflow.com/a/43613376/3457624
        x_test_batch, y_test_batch = tf.train.batch(
            [test_image, test_label],
            batch_size=test_samples,  # if converting to numpy first
            # batch_size=batch_size, # if using tensors
            capacity=capacity,
            # num_threads=8,
            num_threads=1,  # set to 1 to make deterministic
            name='test_batch',
            shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)

    callbacks = []

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    model_init = make_model(x_train_input, num_classes,
                            filepath if checkpt_flag else None)
    x_train_out = model_init.output
    # model_init.summary()
    model_init = Model(inputs=[x_train_input], outputs=[x_train_out])

    lr = 0.0001 * ngpus
    if ngpus > 1:
        model = make_parallel(model_init, gdev_list)
    else:
        # Must re-instantiate the model per the API below; otherwise it
        # doesn't work.
        model = model_init

    opt = RMSprop(lr=lr, decay=1e-6)
    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])

    print_mgpu_modelsummary(model)  # will print non-mgpu model as well

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='acc',
                                     verbose=1,
                                     save_best_only=True)
        callbacks += [checkpoint]

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    # Start the queue runners.
    sess = KB.get_session()

    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])

    # Fit the model using data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    val_in_train = False  # not sure how the validation part works during fit.
    start_time = time.time()
    model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        # validation_steps=validations_steps if val_in_train else None,
        validation_steps=val_in_train,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    weights_file = filepath if checkpt_flag else './saved_cifar10_wt.h5'
    if not checkpt_flag:  # no checkpoint callback; save weights manually
        model.save_weights(weights_file)

    KB.clear_session()

    # Second Session. Demonstrate that the model works
    # test_model = make_model(x_test.shape[1:], num_classes,
    #                         weights_file=weights_file)
    test_model = make_model(x_test.shape[1:], num_classes)
    test_model.load_weights(weights_file)
    test_model.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])

    if data_augmentation:
        # Need to run x_test through per_image_standardization otherwise
        # results get messed up.
        x_processed, y_processed = sess.run([x_test_batch, y_test_batch])
        # DEBUGGING
        # xdiff = np.abs(x_test - x_processed)
        # print('MAX XDIFF: {}'.format(np.max(xdiff)))
        # ydiff = np.abs(y_test - y_processed)
        # print('y_test: {}'.format(y_test[0:5, :]))
        # print('y_processed: {}'.format(y_processed[0:5, :]))
        # print('ydiff: {}'.format(ydiff[-10:, :]))
        # print('MAX YDIFF: {}'.format(np.max(np.sum(ydiff))))

        loss, acc = test_model.evaluate(x_processed, y_processed)
    else:
        loss, acc = test_model.evaluate(x_test, y_test)

    # # Demonstrate that the model works using TF pipeline directly.
    # # In tf.train.batch for test data change batch_size=batch_size
    # # instead of train_samples. Uncomment below and comment out above.
    # val_samples = x_test.shape[0]
    # steps_per_epoch_val = int(np.ceil(val_samples / float(batch_size)))
    # images_val = KL.Input(tensor=x_test_batch)
    # test_model = make_model(images_val, num_classes,
    #                         weights_file)
    # test_model = Model(inputs=[images_val], outputs=[test_model.output])
    # test_model.compile(
    #     loss='categorical_crossentropy',
    #     optimizer=opt,
    #     metrics=['accuracy'],
    #     target_tensors=[y_test_batch])
    # loss, acc = test_model.evaluate(x=None, y=None,
    #                                 steps=steps_per_epoch_val)

    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__

    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    # mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    # enqueue = args.enqueue
    # usenccl = args.nccl
    # syncopt = args.syncopt
    # print('RDMA: {}'.format(args.rdma))
    # rdma = getattr(args, 'rdma', None)
    rdma = args.rdma
    network = args.network
    # print('NETWORK: {}'.format(network))

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    # ---------------------------------------------- Distributed setup on SLURM
    # Specifying network necessary for protocol='grpc+gdr'. GDR doesn't find
    # IB addresses automatically like 'grpc+verbs'.
    # The 'ib.cluster' is specific to NVIDIA psgcluster.
    # network = 'ib.cluster' if rdma == 'gdr' else None
    # network = 'ib.cluster'
    # On fast network even without RDMA speed up significant. RDMA still helps.
    scpar = SlurmClusterParser(network=network)
    cmgr_facade = TFClusterManagerFacade(scpar)

    logdevp_flag = True if _DEVPROF or logdevp else False
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(log_device_placement=logdevp_flag,  # True,
                            allow_soft_placement=True,
                            gpu_options=gpu_options)

    print('\n\tCLUSTER_SPEC_DICT: {}\n'.format(cmgr_facade.clusterspec_dict))

    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in server below.
    protocol = ProtocolType.get_server_protocol_str(rdma)
    # print('PROTOCOL: {}'.format(protocol))
    server = cmgr_facade.get_server(
        config,
        protocol=protocol)
    tfsess = cmgr_facade.get_session(server)
    KB.set_session(tfsess)

    #: :type cluster_spec: tf.train.ClusterSpec
    # cluster_spec = cmgr_facade.get_cluster_spec()
    job_type = cmgr_facade.myjobtype
    # task_id = cmgr_facade.mytask_id

    is_chief = cmgr_facade.is_chief

    if job_type == JobType.ps:
        # JOIN PARAMETER SERVERS
        # server.join()
        cmgr_facade.join(server)

    # Once the server is started everything but the chief worker can join
    # the server and wait to process/service graph computations. Chief pushes
    # the compute graph. COMPARE TO: cifar10_cnn_distrib_v2_slurm
    if not is_chief:
        # JOIN WORKERS EXCEPT FOR CHIEF
        cmgr_facade.join(server)

    # sleep(2)  # Have the chief wait just in case. Occasionally get errors.

    # The ngpus per host needs to be obtained via MPI or otherwise sync'd.
    # Currently assuming all hosts have the same number of GPUs.
    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)

    # List of all devices. The devices might be associated to the same worker.
    wgdev_list = cmgr_facade.get_allworkers_devlist(ngpus)
    print('\n\tWGDEV_LIST: {}\n'
          .format([dev.to_string() for dev in wgdev_list]))  # DEBUG
    # If 2 workers ea. w/ 4 devices then nworker_devices_total == 2 * 4 = 8
    # If 4 workers ea. w/ 1 device then nworker_devices_total == 4 * 1 = 4
    nworker_devices_total = len(wgdev_list)
    batch_size = batch_size * nworker_devices_total

    psdev_list = cmgr_facade.get_allps_devlist()
    print('\n\tPSDEV_LIST: {}\n'
          .format([dev.to_string() for dev in psdev_list]))  # DEBUG

    # ------------------------------------ Data loading and basic preprocessing
    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # --------------------------------------------- Setup model and parallelize
    def _load_fn(unused_op):
        # Uniform cost per variable so the greedy strategy balances variables
        # across parameter servers by count.
        return 1

    cspec = cmgr_facade.get_cluster_spec()
    num_ps = cmgr_facade.num_ps
    ps_strategy = \
        tf.contrib.training.GreedyLoadBalancingStrategy(num_ps, _load_fn)

    # ps_device = tf.DeviceSpec(job=JobType.ps, device_type=DevType.cpu,
    #                           device_index=0).to_string()

    rdsetter = tf.train.replica_device_setter(
        cluster=cspec,
        ps_strategy=ps_strategy,
        # ps_device=ps_device,  # '/job:ps/cpu:0'  # seems to work
        # ps_device='/gpu:0'  # for gdr maybe
    )
    with tf.device(rdsetter):
        model_init = make_model(
            x_train.shape, num_classes,
            filepath if checkpt_flag else None
        )

    callbacks = None
    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    # Data-Parallelize the model via function or class.
    model = make_parallel(model_init, wgdev_list)  # , ps_device='/gpu:0'
    print_mgpu_modelsummary(model)

    # ------------------------------------------------------------ Run training
    lr = 0.0001 * nworker_devices_total
    opt = RMSprop(lr=lr, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)

    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train, y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    # Run Validation
    if is_chief:
        model_init.compile(loss='categorical_crossentropy',
                           optimizer=opt,
                           metrics=['accuracy'])
        metrics = model_init.evaluate(
            x=x_test, y=y_test,
            batch_size=batch_size)
        print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))

    # ------------------------------------------------------------- STOP SERVER
    cmgr_facade.stop_chief(server)
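The facade calls above (get_server, join, stop_chief) wrap the standard TF 1.x
in-graph replication control flow. A minimal sketch of that bare pattern
follows, assuming hypothetical host:port addresses; in the example above,
job_name and task_index come from the SLURM environment.

import tensorflow as tf

# Hypothetical two-worker, one-ps cluster; the addresses are placeholders.
cluster = tf.train.ClusterSpec({
    'ps': ['host0:2222'],
    'worker': ['host1:2222', 'host2:2222'],
})
job_name, task_index = 'worker', 0  # would come from the scheduler/env
server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)

if job_name == 'ps':
    server.join()  # parameter servers block and serve variables
elif task_index != 0:
    server.join()  # non-chief workers service the chief's graph computations
else:
    # Only the chief builds and runs the graph.
    with tf.device(tf.train.replica_device_setter(cluster=cluster)):
        hello = tf.constant('Hello from the chief!')
    with tf.Session(server.target) as sess:
        print(sess.run(hello))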
def main():
    # user options
    batch_size = 128
    val_in_train = False  # not sure how the validation part works during fit.
    use_model_checkpt = False

    # demo processing
    sess = tf.Session()
    KB.set_session(sess)

    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)
    batch_size = batch_size * ngpus

    data = mnist.load_mnist()
    X_train = data.train.images
    # X_test = data.test.images
    train_samples = X_train.shape[0]  # 60000
    # test_samples = X_test.shape[0]  # 10000
    height_nrows = 28
    width_ncols = 28
    batch_shape = [batch_size, height_nrows, width_ncols, 1]
    epochs = 5
    steps_per_epoch = train_samples // batch_size
    # validations_steps = test_samples / batch_size
    nclasses = 10

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    enqueue_many = True

    x_train_batch, y_train_batch = tf.train.shuffle_batch(
        tensors=[data.train.images, data.train.labels.astype(np.int32)],
        batch_size=batch_size,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue,
        enqueue_many=enqueue_many,
        num_threads=8)

    x_train_batch = tf.cast(x_train_batch, tf.float32)
    x_train_batch = tf.reshape(x_train_batch, shape=batch_shape)

    y_train_batch = tf.cast(y_train_batch, tf.int32)
    y_train_batch = tf.one_hot(y_train_batch, nclasses)

    x_train_input = Input(tensor=x_train_batch)

    # x_test_batch, y_test_batch = tf.train.batch(
    #     tensors=[data.test.images, data.test.labels.astype(np.int32)],
    #     batch_size=batch_size,
    #     capacity=capacity,
    #     enqueue_many=enqueue_many,
    #     num_threads=8)

    # I like the non-functional definition of model more.
    # model_init = make_model(x_train_input, nclasses)
    # x_train_out = model_init.output
    # train_model = Model(inputs=[x_train_input], outputs=[x_train_out])

    x_train_out = cnn_layers(x_train_input, nclasses)
    train_model = Model(inputs=[x_train_input], outputs=[x_train_out])
    if ngpus > 1:
        train_model = make_parallel(train_model, gdev_list)

    lr = 2e-3 * ngpus
    train_model.compile(optimizer=RMSprop(lr=lr, decay=1e-5),
                        loss='categorical_crossentropy',
                        metrics=['accuracy'],
                        target_tensors=[y_train_batch])

    if ngpus > 1:
        print_mgpu_modelsummary(train_model)
    else:
        train_model.summary()

    # Callbacks
    if use_model_checkpt:
        mon = 'val_acc' if val_in_train else 'acc'
        checkpoint = ModelCheckpoint(
            'saved_wt.h5', monitor=mon, verbose=0,
            save_best_only=True,
            save_weights_only=True)
        checkpoint = [checkpoint]
    else:
        checkpoint = []

    callbacks = checkpoint
    # Training is slower with the callback. Multigpu is slower with the
    # callback during training than 1 GPU. Again, mnist is too trivial a
    # model and dataset to benchmark or stress GPU compute capabilities. I
    # set up this example to illustrate the potential for speedup in the
    # multigpu case, trying to use mnist as a stressor.
    # It's like a 5 ft race between a person and a truck. A truck is
    # obviously faster than a person, but in a 5 ft race the person will
    # likely win due to the slower startup of the truck.
    # I will re-implement this with Cifar, which should be a better benchmark.

    # Start the queue runners (via a coordinator so they can be joined below).
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    # Fit the model using data from the queue-fed data tensors.

    start_time = time.time()
    train_model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None, # validation data is not used???
        # validations_steps if val_in_train else None,
        # validation_steps=val_in_train,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    if not checkpoint:  # empty list
        train_model.save_weights('./saved_wt.h5')

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)

    KB.clear_session()

    # Second Session. Demonstrate that the model works and is independent of
    # the TFRecord pipeline, and to test loading trained model without tensors.
    x_test = np.reshape(data.validation.images,
                        (data.validation.images.shape[0], 28, 28, 1))
    y_test = data.validation.labels
    x_test_inp = KL.Input(shape=(x_test.shape[1:]))
    test_out = cnn_layers(x_test_inp, nclasses)
    test_model = Model(inputs=x_test_inp, outputs=test_out)

    test_model.load_weights('saved_wt.h5')
    test_model.compile(optimizer='rmsprop',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    test_model.summary()

    loss, acc = test_model.evaluate(x_test, to_categorical(y_test))
    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))
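The tf.train.shuffle_batch queue pipeline above predates the tf.data API that
a later example in this collection uses. A minimal tf.data sketch of the same
MNIST input wiring, assuming the same `data` object returned by
mnist.load_mnist():

import numpy as np
import tensorflow as tf

batch_size, nclasses = 128, 10
dataset = tf.data.Dataset.from_tensor_slices(
    (data.train.images.astype(np.float32),
     data.train.labels.astype(np.int32)))
dataset = dataset.shuffle(buffer_size=10000).repeat().batch(batch_size)
x_batch, y_batch = dataset.make_one_shot_iterator().get_next()

x_batch = tf.reshape(x_batch, [-1, 28, 28, 1])
y_batch = tf.one_hot(y_batch, nclasses)
# x_batch and y_batch can then feed Input(tensor=x_batch) and
# target_tensors=[y_batch] exactly as above, with no queue runners or
# Coordinator needed.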
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    if argv is not None:
        sys.argv.extend(argv)  # list.extend mutates in place and returns None
    argv = sys.argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    enqueue = args.enqueue
    usenccl = args.nccl
    syncopt = args.syncopt

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = 32  # 64
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    callbacks = None

    if _DEVPROF or logdevp:
        import tensorflow as tf

        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    print(x_train.shape, 'train shape')
    print(y_train.shape, 'label shape')
    model_init = make_model(x_train.shape, num_classes,
                            filepath if checkpt_flag else None)

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    if mgpu > 1 or mgpu == -1:
        gpus_list = get_available_gpus(mgpu)
        ngpus = len(gpus_list)
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        batch_size = batch_size * ngpus  #
        # batch_size = 40000  # split over four devices works fine no grad avg
        # batch_size = 25000  # split over four devices works fine w/ grad avg

        # Data-Parallelize the model via function or class.
        model = make_parallel(model_init, gpus_list, usenccl=usenccl,
                              syncopt=syncopt, enqueue=enqueue,
                              model_class=ModelMGPU_Dflow)
        # model = ModelMGPU_Dflow(
        #     serial_model=model_init, gdev_list=gpus_list,
        #     syncopt=syncopt, usenccl=usenccl, enqueue=enqueue)
        print_mgpu_modelsummary(model)
        if not syncopt:
            opt = RMSprop(lr=0.0001, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=0.0001, decay=1e-6, gdev_list=gpus_list)

    else:
        model = model_init
        # batch_size = batch_size * 3
        # batch_size = 25000  # exhaust GPU memory. Crashes.
        print(model.summary())

        # initiate RMSprop optimizer
        opt = RMSprop(lr=0.0001, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    # prepare dataset
    dataset_train = get_data('train', num_classes, batch_size=batch_size,
                             aug=data_augmentation, epochs=epochs)
    # dataset_test = get_data('test', cifar_classnum)

    if not data_augmentation:
        print('Not using data augmentation.')

    # Plain ol' fit is faster than the dataflow generator below.
    # model.fit(x_train, y_train,
    #           batch_size=batch_size,
    #           epochs=epochs,
    #           validation_data=(x_test, y_test),
    #           shuffle=True,
    #           callbacks=callbacks)

    # Fit the model on the batches generated by datagen.flow().
    # dataset_train.reset_state()
    # mygen = dataset_train.get_data()
    # for dp in mygen:
    #     print('DP SHAPE: {}'.format(dp[0].shape))
    # model.fit_generator(mygen,
    #                     steps_per_epoch=steps_per_epoch,
    #                     epochs=epochs,
    #                     validation_data=(x_test, y_test),
    #                     callbacks=callbacks)

    # Using fit_dataflow method that's mixed into ModelMGPU class.
    model.fit_dataflow(dataset_train,
                       steps_per_epoch=steps_per_epoch,
                       epochs=epochs,
                       validation_data=(x_test, y_test),
                       callbacks=callbacks)
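fit_dataflow is a custom method mixed into the ModelMGPU class used above. For
reference, a minimal sketch of feeding the same dataflow through the stock
fit_generator API instead, assuming dataset_train follows the tensorpack
DataFlow protocol (reset_state() plus get_data() yielding [x_batch, y_batch]
datapoints, as the commented-out code above suggests):

def dataflow_generator(df):
    # Adapt a tensorpack-style DataFlow to an endless Keras generator.
    df.reset_state()
    while True:
        for dp in df.get_data():
            yield dp[0], dp[1]

# model.fit_generator(dataflow_generator(dataset_train),
#                     steps_per_epoch=steps_per_epoch,
#                     epochs=epochs,
#                     validation_data=(x_test, y_test),
#                     callbacks=callbacks)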
Example #13
def main(argv=None):
    '''Multigpu example using Keras for Cifar10 training.'''
    if argv is not None:
        sys.argv.extend(argv)  # list.extend mutates in place and returns None
    argv = sys.argv
    # CLI parser
    args = parser_(main.__doc__)

    logdevp = args.logdevp

    gpu_options = tf.GPUOptions(allow_growth=True)
    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=True,
            gpu_options=gpu_options)
        # config.gpu_options.allow_growth = True
        KB.set_session(tf.Session(config=config))
    else:
        config = tf.ConfigProto(gpu_options=gpu_options)
        KB.set_session(tf.Session(config=config))

    mgpu = 0 if args.mgpu is None else args.mgpu
    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    syncopt = args.syncopt

    checkpt = args.checkpt
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = args.batch_size * ngpus if ngpus > 1 else args.batch_size
    num_classes = 10
    epochs = args.epochs

    datadir = args.datadir

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    if not args.use_dataset_api:
        traingen = ImageDataGenerator()
        if args.aug:
            print('Using real-time data augmentation.')
            # This will do preprocessing and realtime data augmentation:
            traingen = ImageDataGenerator(
                # set input mean to 0 over the dataset
                featurewise_center=False,
                # set each sample mean to 0
                samplewise_center=False,
                # divide inputs by std of the dataset
                featurewise_std_normalization=False,
                # divide each input by its std
                samplewise_std_normalization=False,
                # apply ZCA whitening
                zca_whitening=False,
                # randomly rotate images in the range (degrees, 0 to 180)
                rotation_range=0,
                # randomly shift images horizontally (fraction of total width)
                width_shift_range=0.1,
                # randomly shift images vertically (fraction of total height)
                height_shift_range=0.1,
                # randomly flip images
                horizontal_flip=True,
                # randomly flip images
                vertical_flip=False)

            # Compute quantities required for feature-wise normalization
            # (std, mean, and principal components if ZCA whitening is applied)
            traingen.fit(x_train)

        # x_train_input = KL.Input(shape=x_train.shape[1:])
        model_init = make_model(
            x_train.shape[1:], num_classes, filepath)
    else:
        print('USING TF DATASET API.')
        dataset = wrap_as_tfdataset(
            x_train, y_train, args.aug, batch_size)
        iterator = dataset.make_one_shot_iterator()

        # Model creation using tensors from the get_next() graph node.
        inputs, targets = iterator.get_next()
        x_train_input = KL.Input(tensor=inputs)

        model_init_ = make_model(x_train_input, num_classes, filepath)
        x_train_out = model_init_.output

        model_init = Model(inputs=[x_train_input], outputs=[x_train_out])

    lr = 0.0001
    if ngpus > 1:
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        lr = lr * ngpus

        # Data-Parallelize the model via function or class.
        if args.mgpu_type == 'kerasmgpu':
            gpus_list_int = get_available_gpus(
                ngpus, list_type=GPUListType.int_id)
            model = ModelKerasMGPU(model_init, gpus_list_int)
        else:
            model = ModelMGPU(
                serial_model=model_init, gdev_list=gpus_list)

        print_mgpu_modelsummary(model)
        if not syncopt:
            opt = RMSprop(lr=lr, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=lr, decay=1e-6, gdev_list=gpus_list)  # @IgnorePep8 pylint: disable=unexpected-keyword-arg

    else:
        model = model_init
        # batch_size = batch_size * 3
        # batch_size = 25000  # exhaust GPU memory. Crashes.
        print(model.summary())

        # initiate RMSprop optimizer
        opt = RMSprop(lr=lr, decay=1e-6)

    model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'],
        target_tensors=None if not args.use_dataset_api else [targets])

    callbacks = []
    if checkpt:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not args.use_dataset_api:
        start_time = time.time()
        # Fit the model on the batches generated by traingen.flow().
        model.fit_generator(
            traingen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=(x_test, y_test),
            callbacks=callbacks)

    else:
        # augmentation incorporated in the Dataset pipeline
        start_time = time.time()
        # Validation during training can be incorporated via callback:
        # noqa ref: https://github.com/keras-team/keras/blob/c8bef99ec7a2032b9bea6e9a1260d05a2b6a80f1/examples/mnist_tfrecord.py#L56
        model.fit(
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            callbacks=callbacks)

    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'
          .format('TRAINING', round(elapsed_time, 3)))

    test_model = model_init
    if args.use_dataset_api:
        # Create a test-model without Dataset pipeline in the model graph.
        test_model = make_model(x_test.shape[1:], num_classes)
        print('SETTING WEIGHTS FOR EVAL WITH DATASET API...')
        test_model.set_weights(model.get_weights())
        print('WEIGHTS SET!!!')

    test_model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'])

    metrics = test_model.evaluate(x_test, y_test)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))

    KB.clear_session()
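wrap_as_tfdataset is imported from elsewhere and not shown here. A plausible
sketch of such a wrapper, assuming it shuffles, optionally augments, batches,
and repeats; the name and signature below mirror the call above but are
otherwise an assumption:

import tensorflow as tf

def wrap_as_tfdataset_sketch(x_train, y_train, aug, batch_size,
                             buffer_size=10000):
    '''Hypothetical stand-in for the wrap_as_tfdataset used above.'''
    ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    ds = ds.shuffle(buffer_size).repeat()
    if aug:
        def _augment(image, label):
            # Random flip plus random shift via pad-and-crop (CIFAR 32x32x3).
            image = tf.image.random_flip_left_right(image)
            image = tf.image.resize_image_with_crop_or_pad(image, 40, 40)
            image = tf.random_crop(image, [32, 32, 3])
            return image, label
        ds = ds.map(_augment, num_parallel_calls=8)
    return ds.batch(batch_size).prefetch(2)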
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    if argv is not None:
        sys.argv.extend(argv)  # list.extend mutates in place and returns None
    argv = sys.argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    mgpu = 1 if getattr(args, 'mgpu', None) is None else args.mgpu

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    batch_size = 128 * ngpus
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0], ) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0], ) + original_img_size)

    print('x_train.shape:', x_train.shape)

    train_samples = x_train.shape[0]
    steps_per_epoch = int(np.ceil(train_samples / float(batch_size)))

    # Create the dataset and its associated one-shot iterator.
    buffer_size = 10000
    dataset = Dataset.from_tensor_slices(x_train)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    x_train_batch = iterator.get_next()

    ldict = make_shared_layers_dict(img_chns, img_rows, img_cols, batch_size,
                                    filters, num_conv, intermediate_dim,
                                    latent_dim, epsilon_std)
    # ldict is a dictionary that holds all layers. Since these layers are
    # instantiated once, they are shared among the vae, encoder, and generator.

    x = Input(tensor=x_train_batch)
    vae_serial = make_vae(ldict, x)
    # :  :type vae: Model
    vae = make_parallel(vae_serial, gpus_list)

    lr = 0.001 * ngpus
    opt = RMSprop(lr)  # 'rmsprop'
    # opt = tf.train.RMSPropOptimizer(lr)
    # opt = TFOptimizer(opt)
    vae.compile(optimizer=opt, loss=None)
    # vae.summary()
    print_mgpu_modelsummary(vae)

    callbacks = [BatchTiming(), SamplesPerSec(batch_size)]

    # Fit the model using data from the TF data tensors.
    vae.fit(steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            callbacks=callbacks)

    x = Input(shape=original_img_size)
    vae_val = make_vae(ldict, x)
    vae_val.compile(optimizer=opt, loss=None)
    loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size // ngpus)
    print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

    x = Input(shape=original_img_size)
    z_mean, _ = get_encoded(ldict, x)
    encoder = Model(x, z_mean)
    # :  :type encoder: Model

    decoder_input = Input(shape=(latent_dim, ))
    x_decoded_mean_squash = get_decoded(ldict, decoder_input)
    generator = Model(decoder_input, x_decoded_mean_squash)
    # :  :type generator: Model

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    # plt.show()
    plt.savefig('vae_scatter.ps')
    plt.close()

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # Linearly spaced coordinates on the unit square are transformed through
    # the inverse CDF (ppf) of the Gaussian to produce values of the latent
    # variable z, since the prior of the latent space is Gaussian.
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
            z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
            x_decoded = generator.predict(z_sample, batch_size=batch_size)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size:(i + 1) * digit_size,
                   j * digit_size:(j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    # plt.show()
    plt.savefig('vae_digit.ps')
    plt.close()
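make_vae and make_shared_layers_dict are defined elsewhere. For context, a
minimal sketch of the reparameterization-trick sampling layer such a VAE
typically contains; the dense encoder below is an illustrative assumption,
not the convolutional one used above:

import keras.backend as K
from keras.layers import Dense, Input, Lambda

latent_dim, epsilon_std = 2, 1.0

def sampling(args):
    # z = mu + sigma * eps, with eps ~ N(0, epsilon_std^2).
    z_mean, z_log_var = args
    eps = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim),
                          mean=0.0, stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2.0) * eps

x = Input(shape=(784,))
h = Dense(128, activation='relu')(x)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])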
Example #15
def test(cluster_parser_spec):
    scpar = cluster_parser_spec
    # Setting config on the server instantiation and then re-using this same
    # config for sessions is very important. This functionality is wrapped
    # in TFClusterManagerFacade.
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(
        log_device_placement=False,  # True,
        allow_soft_placement=True,
        gpu_options=gpu_options)

    cmgr_facade = TFClusterManagerFacade(scpar.num_tasks_per_host,
                                         scpar.hostnames,
                                         scpar.num_parameter_servers,
                                         scpar.my_proc_id)

    #: :type cluster_spec: tf.train.ClusterSpec
    cluster_spec = cmgr_facade.get_cluster_spec()
    # job_type = cmgr_facade.myjobtype
    # task_id = cmgr_facade.mytask_id

    # cspec_dict = cluster_spec.as_dict()
    # print('CLUSTER_SPEC_DICT: {}\n\tJOB_TYPE: {}\n\tTASK_ID: {}'
    #       '\n\tSERVER TARGET: {}\n\tIS_CHIEF: {}'
    #       .format(  # DEBUG
    #           cspec_dict, job_type, task_id, server.target, is_chief))

    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in server below.
    server = cmgr_facade.get_server(config)  # , protocol='grpc+verbs')

    # if job_type == JobType.ps:
    #     # JOIN PARAMETER SERVERS
    #     # server.join()
    #     cmgr_facade.join(server)

    # Otherwise assumed worker
    # if job_type == JobType.worker:

    is_chief = cmgr_facade.is_chief

    # Once the server is started, everything but the chief worker can join
    # the server and wait to process/service graph computations. The chief
    # in this test function pushes the compute graph.
    if not is_chief:
        # JOIN WORKERS (PS also) EXCEPT FOR CHIEF
        # server.join()
        cmgr_facade.join(server)

    # ps_tasks = cluster_spec.num_tasks(JobType.ps)
    # ps_device = '/job:ps/cpu:0'
    # ps_job_name = pydev.DeviceSpec.from_string(ps_device).job
    # ps_tasks = len(cspec_dict[ps_job_name])
    # print('PS_JOB_NAME: {}\nPS_TASKS: {}'.format(ps_job_name, ps_tasks))

    # The number of GPUs per host needs to be obtained via MPI or otherwise
    # synchronized. Currently assuming all hosts have the same number of GPUs.
    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)

    #: :type mywgdev: tf.DeviceSpec
    wgdev_list = cmgr_facade.get_allworkers_devlist(ngpus)
    # print('\n\tCLUSTER_SPEC_DICT: {}\n\tWGDEV_LIST: {}\n'
    #       .format(cmgr_facade.clusterspec_dict,
    #               [dev.to_string() for dev in wgdev_list]))  # DEBUG

    compute_graph = distrib_graph(wgdev_list)
    # config = server.server_def.default_session_config
    # with tf.Session(server.target, config=config) as sess:
    with cmgr_facade.get_session(server) as sess:
        # if not is_chief:
        #     # server.join()
        #     cmgr_facade.join(server, sess)

        sleep(2)  # Have the chief wait just in case. Occasionally get errors.
        # Perhaps implement a READY queue just like DONE queues.

        # ps_device = tf.DeviceSpec(job=JobType.ps,
        #                           device_type=DevType.cpu,
        #                           device_index=0).to_string()
        # ps_device = '/job:ps/cpu:0'
        # print('PS_DEVICE: {}'.format(ps_device))  # DEBUG
        # TO USE REPLICA WITH tf.train.Supervisor DO NOT JOIN WORKERS ABOVE.
        # USING IT BELOW FOR PRINTING "Hello,..." IS NOT NECESSARY.
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            hello_tf = tf.constant("Hello, distributed TensorFlow!")
            result = sess.run(hello_tf)
            print('RESULT:\n{}\n'.format(result))

        while True:
            try:
                c = calcm()
                result = sess.run(c)
                print('RESULT NOT DISTRIBUTED:\n{}\n'.format(result))

                result = sess.run(compute_graph)
                print('RESULT DISTRIBUTED:\n{}\n'.format(result))
                break
            except Exception as err:
                traceback.print_exc()
                print('INHIBITING ERROR: {}'.format(err), file=sys.stderr)
                continue

        # cmgr_facade.stop_chief(server, sess=sess)  # this works too

    cmgr_facade.stop_chief(server)
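distrib_graph is defined elsewhere. A hypothetical sketch of what such a
helper might build: one matmul shard pinned to each worker device, with the
partial results summed on the first device. The device strings in the comment
are placeholders.

import tensorflow as tf

def distrib_graph_sketch(dev_list):
    # dev_list: e.g. ['/job:worker/task:0/gpu:0', '/job:worker/task:1/gpu:0']
    parts = []
    for i, dev in enumerate(dev_list):
        with tf.device(dev):
            a = tf.random_normal([256, 256], seed=i)
            parts.append(tf.matmul(a, a))
    with tf.device(dev_list[0]):
        return tf.add_n(parts)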
Example #17
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__

    if argv is not None:
        sys.argv.extend(argv)  # list.extend mutates in place and returns None
    argv = sys.argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    # mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    # enqueue = args.enqueue
    # usenccl = args.nccl
    # syncopt = args.syncopt
    rdma = args.rdma
    network = args.network

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    # ---------------------------------------------- Distributed setup on SLURM
    scpar = SlurmClusterParser(network=network)
    cmgr_facade = TFClusterManagerFacade(scpar)

    logdevp_flag = True if _DEVPROF or logdevp else False
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(log_device_placement=logdevp_flag,  # True,
                            allow_soft_placement=True,
                            gpu_options=gpu_options)

    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in server below.
    protocol = ProtocolType.get_server_protocol_str(rdma)
    server = cmgr_facade.get_server(
        config,
        protocol=protocol)
    tfsess = cmgr_facade.get_session(server)
    KB.set_session(tfsess)

    #: :type cluster_spec: tf.train.ClusterSpec
    # cluster_spec = cmgr_facade.get_cluster_spec()
    job_type = cmgr_facade.myjobtype
    # task_id = cmgr_facade.mytask_id

    is_chief = cmgr_facade.is_chief

    if job_type == JobType.ps:
        # JOIN PARAMETER SERVERS
        # server.join()
        cmgr_facade.join(server)

    ps_device = cmgr_facade.get_mypsdevice()
    print('MYPS_DEVICE: {}'.format(ps_device))  # DEBUG

    # sleep(2)  # Have the chief wait just in case. Occasionally get errors.

    # The number of GPUs per host needs to be obtained via MPI or otherwise
    # synchronized. Currently assuming all hosts have the same number of GPUs.
    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)

    # List of all devices. The devices might be associated with the same worker.
    wgdev_list = cmgr_facade.get_allworkers_devlist(ngpus)
    # If 2 workers ea. w/ 4 devices then nworker_devices_total == 2 * 4 = 8
    # If 4 workers ea. w/ 1 devices then nworker_devices_total == 4 * 1 = 4
    # nworker_devices_total = len(wgdev_list)

    # Number of workers, not devices. Each worker can have multiple devices.
    num_workers = cmgr_facade.num_workers

    # List of devices associated with current worker/task.
    mydevlist = cmgr_facade.get_mydevlist(ngpus)
    nmydevs = len(mydevlist)
    batch_size = batch_size * nmydevs

    # ------------------------------------ Data loading and basic preprocessing
    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    nsamples = x_train.shape[0]
    steps_per_epoch = (nsamples // num_workers) // batch_size

    # NOTE: Naive dataset split below. With such a naive approach the random
    #     sampling gets screwed up. The convergence rate is slower as a
    #     result (which defeats the purpose of scaling, since more iterations
    #     are required when using more nodes), and when scaling to very many
    #     nodes it might not converge. Instead, use a generator that
    #     randomly chooses the samples for "mypart". Maybe implement a
    #     custom ImageDataGenerator for the distributed case.
    # split train dataset for myrank
    # mytaskid = mypart = cmgr_facade.mytask_id
    # nn = x_train.shape[0] // num_workers
    # i1 = mypart * nn
    # if mypart == num_workers - 1:
    #     x_train = x_train[i1:, ...]
    #     y_train = y_train[i1:, ...]
    # else:
    #     i2 = (mypart + 1) * nn
    #     x_train = x_train[i1:i2, ...]
    #     y_train = y_train[i1:i2, ...]
    # print('TASK {}: train samples {}'.format(mytaskid, x_train.shape[0]))
    # print('TASK {}: test samples {}'.format(mytaskid, x_test.shape[0]))
    # nsamples = x_train.shape[0]
    # steps_per_epoch = nsamples // batch_size

    # --------------------------------------------- Setup model and parallelize
    def _load_fn(unused_op):
        # Uniform cost per variable so the greedy strategy balances variables
        # across parameter servers by count.
        return 1

    cspec = cmgr_facade.get_cluster_spec()
    num_ps = cmgr_facade.num_ps
    ps_strategy = \
        tf.contrib.training.GreedyLoadBalancingStrategy(num_ps, _load_fn)

    rdsetter = tf.train.replica_device_setter(
        cluster=cspec,
        ps_strategy=ps_strategy,
    )
    with tf.device(rdsetter):
        model_init = make_model(
            x_train.shape, num_classes,
            filepath if checkpt_flag else None
        )

    # if using checkpointing callback enable it on chief or use unique
    # filepath for each worker task.
    callbacks = None
    if checkpt_flag and is_chief:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    if is_chief:
        print('\n\tCLUSTER_SPEC_DICT: {}\n\tWGDEV_LIST: {}\n'
              .format(cmgr_facade.clusterspec_dict,
                      [dev.to_string() for dev in wgdev_list]))  # DEBUG

    print('\n\tMYWGDEV_LIST: {}\n'
          .format([dev.to_string() for dev in mydevlist]))  # DEBUG

    # Data-Parallelize the model via function or class.
    model = make_parallel(model_init, mydevlist, ps_device=ps_device)
    print_mgpu_modelsummary(model)

    # ------------------------------------------------------------ Run training
    lr = 0.0001 * nmydevs
    # lr = 0.0001 * nworker_devices_total
    opt = RMSprop(lr=lr, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    if not data_augmentation:
        print('Not using data augmentation.')
        # model.fit(x_train, y_train,
        #           batch_size=batch_size,
        #           epochs=epochs,
        #           validation_data=(x_test, y_test),
        #           shuffle=True,
        #           callbacks=callbacks)  # verbose=is_chief)

        datagen = ImageDataGenerator()
        datagen.fit(x_train)
        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train, y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train, y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    # ------------------------------------------------------------- STOP SERVER
    if not is_chief:
        # JOIN WORKERS EXCEPT FOR CHIEF
        cmgr_facade.join(server)

    cmgr_facade.stop_chief(server)
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    if argv is not None:
        sys.argv.extend(argv)  # list.extend mutates in place and returns None
    argv = sys.argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    gdev_list = get_available_gpus(mgpu or 1)
    ngpus = len(gdev_list)

    batch_size_1gpu = 32
    batch_size = batch_size_1gpu * ngpus
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    train_samples = x_train.shape[0]
    test_samples = y_test.shape[0]
    steps_per_epoch = train_samples // batch_size
    # validations_steps = test_samples // batch_size
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    y_train = to_categorical(y_train, num_classes).astype(np.float32)
    y_test = to_categorical(y_test, num_classes).astype(np.float32)

    x_train_feed = x_train.reshape(train_samples, -1)
    y_train_feed = y_train.reshape(train_samples, -1)

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    # min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    # enqueue_many = True

    # Force the input pipeline to CPU:0 to avoid data operations ending up on
    # the GPU and slowing down the multigpu case due to communication overhead.
    with tf.device('/cpu:0'):
        # ref: https://www.tensorflow.org/api_guides/python/reading_data#Preloaded_data @IgnorePep8
        # Using tf.Variable instead of tf.constant uses less memory, because
        # the constant is stored inline in the graph data structure which may
        # be duplicated a few times. The placeholder/variable either is not
        # duplicated or the duplication will not consume memory since it's a
        # placeholder.
        with tf.name_scope('input'):
            # Input data
            images_initializer = tf.placeholder(dtype=x_train.dtype,
                                                shape=x_train_feed.shape)
            labels_initializer = tf.placeholder(dtype=y_train.dtype,
                                                shape=y_train_feed.shape)
            # Setting trainable=False keeps the variable out of the
            # GraphKeys.TRAINABLE_VARIABLES collection in the graph, so we
            # won't try and update it when training. Setting collections=[]
            # keeps the variable out of the GraphKeys.GLOBAL_VARIABLES
            # collection used for saving and restoring checkpoints
            input_images = tf.Variable(images_initializer,
                                       trainable=False,
                                       collections=[])
            input_labels = tf.Variable(labels_initializer,
                                       trainable=False,
                                       collections=[])

        image, label = tf.train.slice_input_producer(
            [input_images, input_labels], shuffle=True)
        # If using num_epochs=epochs have to:
        #     sess.run(tf.local_variables_initializer())
        #     and maybe also: sess.run(tf.global_variables_initializer())
        image = tf.reshape(image, x_train.shape[1:])

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, y_test], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order of their operation.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect; see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(distorted_image,
                                                         max_delta=63)
            distorted_image = tf.image.random_contrast(distorted_image,
                                                       lower=0.2,
                                                       upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is faster.
        x_train_batch, y_train_batch = tf.train.batch([image, label],
                                                      batch_size=batch_size,
                                                      capacity=capacity,
                                                      num_threads=8)

        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[image, label],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     num_threads=8)

        x_test_batch, y_test_batch = tf.train.batch([test_image, test_label],
                                                    batch_size=test_samples,
                                                    capacity=capacity,
                                                    num_threads=8,
                                                    name='test_batch',
                                                    shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)

    callbacks = None

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    model_init = make_model(x_train_input, num_classes,
                            filepath if checkpt_flag else None)
    x_train_out = model_init.output
    # model_init.summary()

    lr = 0.0001 * ngpus
    if ngpus > 1:
        model = make_parallel(model_init, gdev_list)
    else:
        # Must re-instantiate the model per the API below, otherwise it
        # doesn't work.
        model_init = Model(inputs=[x_train_input], outputs=[x_train_out])
        model = model_init

    opt = RMSprop(lr=lr, decay=1e-6)
    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])

    print_mgpu_modelsummary(model)  # will print non-mgpu model as well

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='acc',
                                     verbose=1,
                                     save_best_only=True)
        callbacks = [checkpoint]

    # Grab the underlying TF session so variables can be initialized and the
    # queue runners started.
    sess = KB.get_session()

    # Create the op for initializing variables.
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # Run the Op to initialize the variables.
    sess.run(init_op)
    sess.run(input_images.initializer,
             feed_dict={images_initializer: x_train_feed})
    sess.run(input_labels.initializer,
             feed_dict={labels_initializer: y_train_feed})

    # Fit the model using data from the queue-fed data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    val_in_train = False  # not sure how the validation part works during fit.
    start_time = time.time()
    model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        # validation_steps=validations_steps if val_in_train else None,
        validation_steps=val_in_train,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} ms'.format('TRAINING',
                                          int(elapsed_time * 1000)))

    weights_file = filepath if checkpt_flag else './saved_cifar10_wt.h5'
    if not checkpt_flag:
        model.save_weights(weights_file)

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)

    KB.clear_session()

    # Second Session. Demonstrate that the model works
    # test_model = make_model(x_test.shape[1:], num_classes,
    #                         weights_file=weights_file)
    test_model = make_model(x_test.shape[1:], num_classes)
    test_model.load_weights(weights_file)
    test_model.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])

    if data_augmentation:
        x_processed = sess.run(x_test_batch)
        y_processed = sess.run(y_test_batch)
        loss, acc = test_model.evaluate(x_processed, y_processed)
    else:
        loss, acc = test_model.evaluate(x_test, y_test)

    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))
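The placeholder-initialized, non-trainable tf.Variable pattern above is the
"preloaded data" idiom from the TF 1.x reading-data guide. A self-contained
minimal sketch of just that pattern, with hypothetical toy data:

import numpy as np
import tensorflow as tf

x = np.random.rand(1000, 4).astype(np.float32)

with tf.name_scope('input'):
    x_init = tf.placeholder(dtype=x.dtype, shape=x.shape)
    # collections=[] keeps the variable out of GLOBAL_VARIABLES so it is not
    # saved/restored; trainable=False keeps it away from the optimizer.
    x_var = tf.Variable(x_init, trainable=False, collections=[])

sample = tf.train.slice_input_producer([x_var], shuffle=True)[0]
batch = tf.train.batch([sample], batch_size=32, capacity=1000, num_threads=4)

sess = tf.Session()
sess.run(x_var.initializer, feed_dict={x_init: x})
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess, coord)
print(sess.run(batch).shape)  # (32, 4)
coord.request_stop()
coord.join(threads)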
def main():
    # user options
    batch_size = 128
    val_in_train = False  # not sure how the validation part works during fit.
    use_model_checkpt = False

    # demo processing
    sess = tf.Session()
    KB.set_session(sess)

    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)
    batch_size = batch_size * ngpus

    data = mnist.load_mnist()
    X_train = data.train.images
    # X_test = data.test.images
    train_samples = X_train.shape[0]  # 60000
    # test_samples = X_test.shape[0]  # 10000
    height_nrows = 28
    width_ncols = 28
    batch_shape = [batch_size, height_nrows, width_ncols, 1]
    epochs = 5
    steps_per_epoch = train_samples // batch_size
    # validations_steps = test_samples / batch_size
    nclasses = 10

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    enqueue_many = True

    x_train_batch, y_train_batch = tf.train.shuffle_batch(
        tensors=[data.train.images,
                 data.train.labels.astype(np.int32)],
        batch_size=batch_size,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue,
        enqueue_many=enqueue_many,
        num_threads=8)

    x_train_batch = tf.cast(x_train_batch, tf.float32)
    x_train_batch = tf.reshape(x_train_batch, shape=batch_shape)

    y_train_batch = tf.cast(y_train_batch, tf.int32)
    y_train_batch = tf.one_hot(y_train_batch, nclasses)

    x_train_input = Input(tensor=x_train_batch)

    # x_test_batch, y_test_batch = tf.train.batch(
    #     tensors=[data.test.images, data.test.labels.astype(np.int32)],
    #     batch_size=batch_size,
    #     capacity=capacity,
    #     enqueue_many=enqueue_many,
    #     num_threads=8)

    # I like the non-functional definition of model more.
    # model_init = make_model(x_train_input, nclasses)
    # x_train_out = model_init.output
    # train_model = Model(inputs=[x_train_input], outputs=[x_train_out])

    x_train_out = cnn_layers(x_train_input, nclasses)
    train_model = Model(inputs=[x_train_input], outputs=[x_train_out])
    if ngpus > 1:
        train_model = make_parallel(train_model, gdev_list)

    lr = 2e-3 * max(ngpus, 1)  # guard the CPU-only (ngpus == 0) case
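    # Linear learning-rate scaling heuristic: the base LR is multiplied by the
    # device count because the effective batch size grows by the same factor.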
    train_model.compile(optimizer=RMSprop(lr=lr, decay=1e-5),
                        loss='categorical_crossentropy',
                        metrics=['accuracy'],
                        target_tensors=[y_train_batch])

    if ngpus > 1:
        print_mgpu_modelsummary(train_model)
    else:
        train_model.summary()

    # Callbacks
    if use_model_checkpt:
        mon = 'val_acc' if val_in_train else 'acc'
        checkpoint = ModelCheckpoint('saved_wt.h5',
                                     monitor=mon,
                                     verbose=0,
                                     save_best_only=True,
                                     save_weights_only=True)
        checkpoint = [checkpoint]
    else:
        checkpoint = []

    callbacks = checkpoint
    # Training is slower with the checkpoint callback, and the multi-GPU case
    # is slower than the 1-GPU case when the callback is enabled. MNIST is too
    # trivial a model and dataset to benchmark or stress GPU compute
    # capabilities; this example is set up to illustrate the potential for
    # multi-GPU speedup, with MNIST as a stand-in stressor.
    # It's like a 5 ft race between a person and a truck: the truck is
    # obviously faster, but over 5 ft the person will likely win because the
    # truck starts more slowly.
    # I will re-implement this with CIFAR, which should be a better benchmark.

    # Start the queue runners once, under a coordinator, so they can be
    # stopped cleanly after training.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    # Fit the model using data from the queue-fed input tensors.

    start_time = time.time()
    train_model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,
        # validation_steps=validations_steps if val_in_train else None,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} ms'.format('TRAINING',
                                          int(elapsed_time * 1000)))

    if not checkpoint:  # empty list
        train_model.save_weights('./saved_wt.h5')

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)

    KB.clear_session()

    # Second Session. Demonstrate that the model works and is independent of
    # the TFRecord pipeline, and to test loading trained model without tensors.
    x_test = np.reshape(data.validation.images,
                        (data.validation.images.shape[0], 28, 28, 1))
    y_test = data.validation.labels
    x_test_inp = KL.Input(shape=x_test.shape[1:])
    test_out = cnn_layers(x_test_inp, nclasses)
    test_model = Model(inputs=x_test_inp, outputs=test_out)

    test_model.load_weights('saved_wt.h5')
    test_model.compile(optimizer='rmsprop',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    test_model.summary()

    loss, acc = test_model.evaluate(x_test, to_categorical(y_test))
    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))
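
# A minimal sketch of the tower-style data-parallel pattern that make_parallel
# is assumed to implement: slice each input batch across the GPUs, run the
# serial model on every slice, and concatenate the outputs on the CPU. The
# repo's actual helper also handles device lists, NCCL, and multi-worker
# setups; make_parallel_sketch below is illustrative only and assumes a
# single-output model and at least two devices.
import tensorflow as tf
from keras.layers import Lambda, concatenate
from keras.models import Model


def make_parallel_sketch(serial_model, gpu_list):
    n = len(gpu_list)
    outputs = []
    for i, dev in enumerate(gpu_list):
        with tf.device(dev):
            # Each GPU gets the i-th 1/n slice of every input batch.
            sliced = [Lambda(
                lambda x, idx=i: x[idx * tf.shape(x)[0] // n:
                                   (idx + 1) * tf.shape(x)[0] // n])(inp)
                      for inp in serial_model.inputs]
            outputs.append(serial_model(sliced))
    with tf.device('/cpu:0'):
        # Reassemble the full batch from the per-GPU partial batches.
        merged = concatenate(outputs, axis=0)
    return Model(inputs=serial_model.inputs, outputs=merged)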
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__

    if argv is not None:
        sys.argv.extend(argv)  # list.extend returns None, so don't rebind argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    # mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    # enqueue = args.enqueue
    # usenccl = args.nccl
    # syncopt = args.syncopt
    rdma = args.rdma

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    # ---------------------------------------------- Distributed setup on SLURM
    scpar = SlurmClusterParser()
    cmgr_facade = TFClusterManagerFacade(scpar.num_tasks_per_host,
                                         scpar.hostnames,
                                         scpar.num_parameter_servers,
                                         scpar.my_proc_id)

    logdevp_flag = bool(_DEVPROF or logdevp)
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(
        log_device_placement=logdevp_flag,  # True,
        allow_soft_placement=True,
        gpu_options=gpu_options)

    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in server below.
    server = cmgr_facade.get_server(config,
                                    protocol='grpc+verbs' if rdma else None)
    tfsess = cmgr_facade.get_session(server)
    KB.set_session(tfsess)

    #: :type cluster_spec: tf.train.ClusterSpec
    # cluster_spec = cmgr_facade.get_cluster_spec()
    job_type = cmgr_facade.myjobtype
    # task_id = cmgr_facade.mytask_id

    is_chief = cmgr_facade.is_chief

    if job_type == JobType.ps:
        # JOIN PARAMETER SERVERS
        # server.join()
        cmgr_facade.join(server)

    # Once the server is started everything but the chief worker can join
    # the server and wait to process/service graph computations. Chief pushes
    # the compute graph. COMPARE TO: cifar10_cnn_distrib_v2_slurm
    if not is_chief:
        # JOIN WORKERS EXCEPT FOR CHIEF
        cmgr_facade.join(server)

    # sleep(2)  # Have the chief wait just in case. Occasionally get errors.

    # The ngpus per host needs to be synchronized via MPI or something similar;
    # currently all hosts are assumed to have the same number of GPUs. A
    # commented sketch of one way to do this follows below.
    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)
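    # A commented sketch (assuming mpi4py, which is not a dependency of this
    # example) of synchronizing the per-host GPU count so every worker
    # parallelizes over the same device count:
    #
    #     from mpi4py import MPI
    #     local_ngpus = len(get_available_gpus())
    #     ngpus = MPI.COMM_WORLD.allreduce(local_ngpus, op=MPI.MIN)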

    # List of all devices. Several devices might be associated with the same
    # worker.
    wgdev_list = cmgr_facade.get_allworkers_devlist(ngpus)
    # If 2 workers ea. w/ 4 devices then nworker_devices_total == 2 * 4 = 8
    # If 4 workers ea. w/ 1 device then nworker_devices_total == 4 * 1 = 4
    nworker_devices_total = len(wgdev_list)
    batch_size = batch_size * nworker_devices_total

    # ------------------------------------ Data loading and basic preprocessing
    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # --------------------------------------------- Setup model and parallelize
    def _load_fn(unused_op):
        return 1

    cspec = cmgr_facade.get_cluster_spec()
    num_ps = cmgr_facade.num_ps
    ps_strategy = \
        tf.contrib.training.GreedyLoadBalancingStrategy(num_ps, _load_fn)
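    # With a constant load function every variable counts equally, so the
    # greedy strategy balances the number of variables per parameter server
    # rather than their byte sizes (cf. tf.contrib.training.byte_size_load_fn).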

    # ps_device = tf.DeviceSpec(job=JobType.ps, device_type=DevType.cpu,
    #                           device_index=0).to_string()

    rdsetter = tf.train.replica_device_setter(
        cluster=cspec,
        ps_strategy=ps_strategy
        # ps_device=ps_device,  # '/job:ps/cpu:0'  # seems to work
    )
    with tf.device(rdsetter):
        model_init = make_model(x_train.shape, num_classes,
                                filepath if checkpt_flag else None)

    callbacks = None
    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        callbacks = [checkpoint]

    print('\n\tCLUSTER_SPEC_DICT: {}\n\tWGDEV_LIST: {}\n'.format(
        cmgr_facade.clusterspec_dict,
        [dev.to_string() for dev in wgdev_list]))  # DEBUG

    # Data-Parallelize the model via function or class.
    model = make_parallel(model_init, wgdev_list)
    print_mgpu_modelsummary(model)

    # ------------------------------------------------------------ Run training
    lr = 0.0001 * nworker_devices_total
    opt = RMSprop(lr=lr, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train,
                  y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)

    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train,
                                         y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    # ------------------------------------------------------------- STOP SERVER
    cmgr_facade.stop_chief(server)
model_classifier = Model([img_input, roi_input], classifier)

# this is a model that holds both the RPN and the classifier, used to
# load/save weights for the models
model_all = Model([img_input, roi_input], rpn[:2] + classifier)

try:
    print('loading weights from {}'.format(C.base_net_weights))
    model_rpn.load_weights(C.base_net_weights, by_name=True)
    model_classifier.load_weights(C.base_net_weights, by_name=True)
except Exception:
    print('Could not load pretrained model weights. Weights can be found in '
          'the keras application folder: '
          'https://github.com/fchollet/keras/tree/master/keras/applications')

# ------------------------------------------------------ User multi GPU support
gpus = get_available_gpus()
ngpus = len(gpus)
model_rpn.summary()  # serial model summary before parallelization
model_rpn = make_parallel(model_rpn, gpus)
print_mgpu_modelsummary(model_rpn)

optimizer = Adam(lr=1e-5)
optimizer_classifier = Adam(lr=1e-5)
model_rpn.compile(optimizer=optimizer, loss=[losses.rpn_loss_cls(
    num_anchors), losses.rpn_loss_regr(num_anchors)])
model_classifier.compile(optimizer=optimizer_classifier, loss=[losses.class_loss_cls, losses.class_loss_regr(
    len(classes_count) - 1)], metrics={'dense_class_{}'.format(len(classes_count)): 'accuracy'})
model_all.compile(optimizer='sgd', loss='mae')
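# model_all is compiled with a placeholder loss only so that a single
# save/load of weights covers both the RPN and the classifier; it is never
# trained directly.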

epoch_length = 1000
num_epochs = int(options.num_epochs)
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    if argv is not None:
        sys.argv.extend(argv)  # list.extend returns None, so don't rebind argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    mgpu = 1 if getattr(args, 'mgpu', None) is None else args.mgpu

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    batch_size = 128 * max(ngpus, 1)  # guard the CPU-only (ngpus == 0) case
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    print('x_train.shape:', x_train.shape)

    vae_serial, encoder, generator = make_vae_and_codec(
        original_img_size, img_chns, img_rows, img_cols, batch_size,
        filters, num_conv, intermediate_dim, latent_dim, epsilon_std)
    #: :type vae: Model
    vae = make_parallel(vae_serial, gpus_list)

    lr = 0.001 * max(ngpus, 1)  # linear LR scaling with the device count
    opt = RMSprop(lr)  # 'rmsprop'
    # opt = tf.train.RMSPropOptimizer(lr)
    # opt = TFOptimizer(opt)
    vae.compile(optimizer=opt, loss=None)
    # vae.summary()
    print_mgpu_modelsummary(vae)

    callbacks = [BatchTiming(), SamplesPerSec(batch_size)]

    vae.fit(x_train,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks)  # ,
    # validation_data=(x_test, None))  # Not accurate for mgpu. Use vae_val.

    vae_val = vae_serial
    vae_val.compile(optimizer=opt, loss=None)
    loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size // ngpus)
    print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    # plt.show()
    plt.savefig('vae_scatter.ps')
    plt.close()

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # Linearly spaced coordinates on the unit square are transformed through
    # the inverse CDF (ppf) of the Gaussian to produce values of the latent
    # variables z, since the prior of the latent space is Gaussian.
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
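            # Tile the single z sample to a full batch: the generator graph was
            # built for batch_size inputs; only the first decoded digit is used.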
            z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
            x_decoded = generator.predict(z_sample, batch_size=batch_size)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size: (i + 1) * digit_size,
                   j * digit_size: (j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    # plt.show()
    plt.savefig('vae_digit.ps')
    plt.close()
    def retain(ARGS):
        '''Create the model'''

        #Define the size constant used to reshape the context vectors
        reshape_size = ARGS.emb_size + ARGS.numeric_size
        if ARGS.allow_negative:
            embeddings_constraint = FreezePadding()
            beta_activation = 'tanh'
            output_constraint = None
        else:
            embeddings_constraint = FreezePadding_Non_Negative()
            beta_activation = 'sigmoid'
            output_constraint = non_neg()

        #Get available GPUs; returns an empty list if there are none
        glist = get_available_gpus()

        def reshape(data):
            '''Reshape the context vectors to 3D vector'''
            return K.reshape(x=data, shape=(K.shape(data)[0], 1, reshape_size))

        #Code Input
        codes = L.Input((None, None), name='codes_input')
        inputs_list = [codes]
        #Calculate embedding for each code and sum them to a visit level
        codes_embs_total = L.Embedding(
            ARGS.num_codes + 1,
            ARGS.emb_size,
            name='embedding',
            embeddings_constraint=embeddings_constraint)(codes)
        codes_embs = L.Lambda(lambda x: K.sum(x, axis=2))(codes_embs_total)
        #Numeric input if needed
        if ARGS.numeric_size:
            numerics = L.Input((None, ARGS.numeric_size), name='numeric_input')
            inputs_list.append(numerics)
            full_embs = L.concatenate([codes_embs, numerics], name='catInp')
        else:
            full_embs = codes_embs

        #Apply dropout on inputs
        full_embs = L.Dropout(ARGS.dropout_input)(full_embs)

        #Time input if needed
        if ARGS.use_time:
            time = L.Input((None, 1), name='time_input')
            inputs_list.append(time)
            time_embs = L.concatenate([full_embs, time], name='catInp2')
        else:
            time_embs = full_embs

        #Setup Layers
        #This implementation uses Bidirectional LSTM instead of reverse order
        #    (see https://github.com/mp2893/retain/issues/3 for more details)

        #If training on GPU and Tensorflow use CuDNNLSTM for much faster training
        if glist:
            alpha = L.Bidirectional(L.CuDNNLSTM(ARGS.recurrent_size,
                                                return_sequences=True),
                                    name='alpha')
            beta = L.Bidirectional(L.CuDNNLSTM(ARGS.recurrent_size,
                                               return_sequences=True),
                                   name='beta')
        else:
            alpha = L.Bidirectional(L.LSTM(ARGS.recurrent_size,
                                           return_sequences=True,
                                           implementation=2),
                                    name='alpha')
            beta = L.Bidirectional(L.LSTM(ARGS.recurrent_size,
                                          return_sequences=True,
                                          implementation=2),
                                   name='beta')

        alpha_dense = L.Dense(1, kernel_regularizer=l2(ARGS.l2))
        beta_dense = L.Dense(ARGS.emb_size + ARGS.numeric_size,
                             activation=beta_activation,
                             kernel_regularizer=l2(ARGS.l2))

        #Compute alpha, visit attention
        alpha_out = alpha(time_embs)
        alpha_out = L.TimeDistributed(alpha_dense,
                                      name='alpha_dense_0')(alpha_out)
        alpha_out = L.Softmax(axis=1)(alpha_out)
        #Compute beta, codes attention
        beta_out = beta(time_embs)
        beta_out = L.TimeDistributed(beta_dense, name='beta_dense_0')(beta_out)
        #Compute context vector based on attentions and embeddings
        c_t = L.Multiply()([alpha_out, beta_out, full_embs])
        c_t = L.Lambda(lambda x: K.sum(x, axis=1))(c_t)
        #Reshape to 3d vector for consistency between Many to Many and Many to One implementations
        contexts = L.Lambda(reshape)(c_t)

        #Make a prediction
        contexts = L.Dropout(ARGS.dropout_context)(contexts)
        output_layer = L.Dense(1,
                               activation='sigmoid',
                               name='dOut',
                               kernel_regularizer=l2(ARGS.l2),
                               kernel_constraint=output_constraint)

        #TimeDistributed is used for consistency
        # between Many to Many and Many to One implementations
        output = L.TimeDistributed(output_layer,
                                   name='time_distributed_out')(contexts)
        #Define the model with appropriate inputs
        model = Model(inputs=inputs_list, outputs=[output])

        return model
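
    # Hypothetical usage sketch (names and values are illustrative, not the
    # repo's defaults); the Namespace fields mirror the attributes retain()
    # reads:
    #
    #     from argparse import Namespace
    #     ARGS = Namespace(emb_size=200, numeric_size=0, allow_negative=False,
    #                      num_codes=942, dropout_input=0.0, dropout_context=0.0,
    #                      use_time=False, recurrent_size=128, l2=0.001)
    #     model = retain(ARGS)
    #     model.compile(optimizer='adam', loss='binary_crossentropy')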
Example #24
def test(cluster_parser_spec):
    scpar = cluster_parser_spec
    # Setting config on the server instantiation and then re-using this same
    # config for sessions is very important. This functionality is wrapped
    # in TFClusterManagerFacade.
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(log_device_placement=False,  # True,
                            allow_soft_placement=True,
                            gpu_options=gpu_options)

    cmgr_facade = TFClusterManagerFacade(scpar)

    #: :type cluster_spec: tf.train.ClusterSpec
    cluster_spec = cmgr_facade.get_cluster_spec()
    # job_type = cmgr_facade.myjobtype
    # task_id = cmgr_facade.mytask_id

    # cspec_dict = cluster_spec.as_dict()
    # print('CLUSTER_SPEC_DICT: {}\n\tJOB_TYPE: {}\n\tTASK_ID: {}'
    #       '\n\tSERVER TARGET: {}\n\tIS_CHIEF: {}'
    #       .format(  # DEBUG
    #           cspec_dict, job_type, task_id, server.target, is_chief))

    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in server below.
    server = cmgr_facade.get_server(config)  # , protocol='grpc+verbs')

    # if job_type == JobType.ps:
    #     # JOIN PARAMETER SERVERS
    #     # server.join()
    #     cmgr_facade.join(server)

    # Otherwise assumed worker
    # if job_type == JobType.worker:

    is_chief = cmgr_facade.is_chief

    # Once the server is started everything but the chief worker can join
    # the server and wait to process/service graph computations. Chief in this
    # test function pushes the compute graph.
    if not is_chief:
        # JOIN WORKERS (PS also) EXCEPT FOR CHIEF
        # server.join()
        cmgr_facade.join(server)

    # ps_tasks = cluster_spec.num_tasks(JobType.ps)
    # ps_device = '/job:ps/cpu:0'
    # ps_job_name = pydev.DeviceSpec.from_string(ps_device).job
    # ps_tasks = len(cspec_dict[ps_job_name])
    # print('PS_JOB_NAME: {}\nPS_TASKS: {}'.format(ps_job_name, ps_tasks))

    # The ngpus per host needs to be synchronized via MPI or something similar;
    # currently all hosts are assumed to have the same number of GPUs.
    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)

    #: :type mywgdev: tf.DeviceSpec
    wgdev_list = cmgr_facade.get_allworkers_devlist(ngpus)
    # print('\n\tCLUSTER_SPEC_DICT: {}\n\tWGDEV_LIST: {}\n'
    #       .format(cmgr_facade.clusterspec_dict,
    #               [dev.to_string() for dev in wgdev_list]))  # DEBUG

    compute_graph = distrib_graph(wgdev_list)
    # config = server.server_def.default_session_config
    # with tf.Session(server.target, config=config) as sess:
    with cmgr_facade.get_session(server) as sess:
        # if not is_chief:
        #     # server.join()
        #     cmgr_facade.join(server, sess)

        sleep(2)  # Have the chief wait just in case. Occasionally get errors.
        # Perhaps implement a READY queue just like DONE queues.

        # ps_device = tf.DeviceSpec(job=JobType.ps,
        #                           device_type=DevType.cpu,
        #                           device_index=0).to_string()
        # ps_device = '/job:ps/cpu:0'
        # print('PS_DEVICE: {}'.format(ps_device))  # DEBUG
        # TO USE REPLICA WITH tf.train.Supervisor DO NOT JOIN WORKERS ABOVE.
        # USING IT BELOW FOR PRINTING "Hello,..." IS NOT NECESSARY.
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            hello_tf = tf.constant("Hello, distributed TensorFlow!")
            result = sess.run(hello_tf)
            print('RESULT:\n{}\n'.format(result))

        while True:
            try:
                c = calcm()
                result = sess.run(c)
                print('RESULT NOT DISTRIBUTED:\n{}\n'.format(result))

                result = sess.run(compute_graph)
                print('RESULT DISTRIBUTED:\n{}\n'.format(result))
                break
            except Exception as err:
                traceback.print_exc()
                print('INHIBITING ERROR: {}'.format(err),
                      file=sys.stderr)
                continue

        # cmgr_facade.stop_chief(server, sess=sess)  # this works too

    cmgr_facade.stop_chief(server)