def import_model(path):
    """Load a saved Keras model from ``path`` inside a fresh TF session.

    The current Keras session is cleared and replaced by a TensorFlow
    session configured for soft device placement and on-demand GPU memory
    growth. The model is then read with ``load_model`` (registering the
    ``FreezePadding`` custom objects) and, when more than one GPU is
    reported by ``get_available_gpus``, wrapped with ``make_parallel``.

    :param path: location of the saved model, handed to ``load_model``.
    :return: the loaded model; the ``make_parallel`` result when several
        GPUs are available.
    """
    K.clear_session()
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    # Grow GPU memory as needed instead of grabbing all of it at once.
    config.gpu_options.allow_growth = True
    tfsess = tf.Session(config=config)
    K.set_session(tfsess)

    model = load_model(
        path,
        custom_objects={
            'FreezePadding': FreezePadding,
            'FreezePadding_Non_Negative': FreezePadding_Non_Negative,
        })

    # Query the devices once and hand the list to make_parallel: every
    # other call site passes the device list as the second argument, and
    # the original call here omitted it.
    # NOTE(review): assumes make_parallel(model, gdev_list) - confirm
    # against the make_parallel definition in use.
    glist = get_available_gpus()
    if len(glist) > 1:
        model = make_parallel(model, glist)
    return model
lr=float(parser.get('Training_Parameters', 'learning_rate'))) elif parser.get('Training_Parameters', 'optimizer') == "Adam": optimizer_used = keras.optimizers.Adam( lr=float(parser.get('Training_Parameters', 'learning_rate'))) else: print "Optimizer unchoosen or unknown -> default: Adam" optimizer_used = keras.optimizers.Adam( lr=float(parser.get('Training_Parameters', 'learning_rate'))) ngpus = args.__dict__['ngpus'] #print'Use {} GPUS'.format(ngpus) if ngpus > 1: if backend == 'tensorflow': with tf.device('/cpu:0'): model_serial = read_NN_weights(args.__dict__, base_model) gdev_list = get_available_gpus() print('Using GPUs: {}'.format(gdev_list)) model = make_parallel(model_serial, gdev_list) else: raise Exception( 'Multi GPU can only be used with tensorflow as Backend.') else: model = read_NN_weights(args.__dict__, base_model) # Choosing the Loss function loss_func = 'mean_squared_error' if parser.has_option('Training_Parameters', 'loss_function'): loss_func = parser.get('Training_Parameters', 'loss_function') if loss_func == "weighted_categorial_crossentropy": weights = parser.get('Training_Parameters', 'weights') weights = np.array(weights.split(',')).astype(np.float)
def main(argv=None):
    '''Train a CIFAR-10 classifier data-parallel on a SLURM-launched TF cluster.

    The cluster layout comes from ``SlurmClusterParser`` and is turned into
    a TensorFlow server and session by ``TFClusterManagerFacade``.
    Parameter servers and all non-chief workers join the server; the rest
    of the function is the chief's work: load CIFAR-10, build the model
    under a replica device setter, parallelise it over the worker devices
    with ``make_parallel``, train with RMSprop and finally stop the server.

    At call time ``main.__doc__`` is overwritten with the module docstring
    so that it can serve as the CLI description.

    :param argv: optional list of extra command-line arguments. They are
        appended to ``sys.argv`` (presumably what ``parser_`` reads).
    '''
    main.__doc__ = __doc__
    # list.extend() returns None, so the original one-liner silently
    # rebound ``argv`` to None. Only the side effect on sys.argv is wanted.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__

    # CLI parser
    args = parser_(desc)

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug
    logdevp = args.logdevp

    # ---------------------------------------------- Distributed setup on SLURM
    scpar = SlurmClusterParser()

    logdevp_flag = bool(_DEVPROF or logdevp)
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(
        log_device_placement=logdevp_flag,
        allow_soft_placement=True,
        gpu_options=gpu_options)

    cmgr_facade = TFClusterManagerFacade(scpar.num_tasks_per_host,
                                         scpar.hostnames,
                                         scpar.num_parameter_servers,
                                         scpar.my_proc_id)

    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in server below.
    server = cmgr_facade.get_server(config)
    tfsess = cmgr_facade.get_session(server)
    KB.set_session(tfsess)
    # TODO: Try tf.train.Supervisor with a managed session and base the
    # Keras session on that managed session.

    job_type = cmgr_facade.myjobtype
    is_chief = cmgr_facade.is_chief

    if job_type == JobType.ps:
        # JOIN PARAMETER SERVERS
        cmgr_facade.join(server)

    # Once the server is started everything but the chief worker can join
    # the server and wait to process/service graph computations. Chief
    # pushes the compute graph.
    if not is_chief:
        # JOIN WORKERS (PS also) EXCEPT FOR CHIEF
        cmgr_facade.join(server)

    # The ngpus per host needs to be done with MPI or somehow sync'd.
    # Currently assuming all hosts have the same number of GPUs.
    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)

    # Only the list of worker devices is needed here; the first element of
    # the returned pair is ignored.
    _, wgdev_list = cmgr_facade.get_workers_dev_list(ngpus)
    nworker_devices_total = len(wgdev_list)

    # ------------------------------------ Data loading and basic preprocessing
    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    callbacks = None

    print(x_train.shape, 'train shape')

    # --------------------------------------------- Setup model and parallelize
    def _load_fn(unused_op):
        # Every op is given the same load of 1 for the greedy strategy.
        return 1

    cspec = cmgr_facade.get_cluster_spec()
    num_ps = cspec.num_tasks(JobType.ps)
    ps_strategy = \
        tf.contrib.training.GreedyLoadBalancingStrategy(num_ps, _load_fn)

    ps_device = tf.DeviceSpec(job=JobType.ps, device_type=DevType.cpu,
                              device_index=0).to_string()

    rdsetter = tf.train.replica_device_setter(
        cluster=cspec,
        ps_strategy=ps_strategy,
        ps_device=ps_device,  # '/job:ps/cpu:0'  # seems to work
    )
    with tf.device(rdsetter):
        model_init = make_model(x_train.shape, num_classes,
                                filepath if checkpt_flag else None)

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    print('\n\tCLUSTER_SPEC_DICT: {}\n\tWGDEV_LIST: {}\n'.format(
        cmgr_facade.clusterspec_dict,
        [dev.to_string() for dev in wgdev_list]))  # DEBUG

    # The global batch is split across all worker devices.
    batch_size = batch_size * nworker_devices_total

    # TODO: Use replica_device_setter and do not join workers above. Try
    # using managed session.
    # Need to think about this because what I want is to have the workers
    # load the relevant data on the node they are running from instead of
    # loading on chief rank's node and transferring slices over network.
    # Maybe parameter servers can do this via ZeroMQ.
    # Ref: https://gist.github.com/fchollet/2c9b029f505d94e6b8cd7f8a5e244a4e

    # Data-Parallelize the model via function or class.
    model = make_parallel(model_init, wgdev_list, ps_device=ps_device)
    print_mgpu_modelsummary(model)

    # ------------------------------------------------------------ Run training
    opt = RMSprop(lr=0.0001, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)
    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train, y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    # ------------------------------------------------------------- STOP SERVER
    cmgr_facade.stop_chief(server)
def model_create(ARGS):
    '''Build the RETAIN network, place it on the available devices, compile it.

    A fresh TensorFlow session with on-demand GPU memory growth is installed
    first. With more than one GPU the serial model is built on the CPU and
    wrapped by ``make_parallel``; otherwise the serial model is used as is.

    :param ARGS: settings object; the attributes read here are ``emb_size``,
        ``numeric_size``, ``allow_negative``, ``num_codes``,
        ``dropout_input``, ``use_time``, ``recurrent_size``, ``l2`` and
        ``dropout_context``.
    :return: the compiled model (adamax, binary cross-entropy, accuracy
        metric, temporal sample weights).
    '''

    def build_retain(ARGS):
        '''Assemble the (uncompiled) RETAIN graph.'''
        # Width of one context vector: code embedding plus numeric features.
        reshape_size = ARGS.emb_size + ARGS.numeric_size

        # Signed weights use tanh and unconstrained kernels; the
        # non-negative variant uses sigmoid and non-negative constraints.
        if not ARGS.allow_negative:
            embeddings_constraint = FreezePadding_Non_Negative()
            beta_activation = 'sigmoid'
            output_constraint = non_neg()
        else:
            embeddings_constraint = FreezePadding()
            beta_activation = 'tanh'
            output_constraint = None

        # Empty list when no GPU is available.
        gpu_devices = get_available_gpus()

        def reshape(data):
            '''Reshape the context vectors to 3D vector'''
            batch_dim = K.shape(data)[0]
            return K.reshape(x=data, shape=(batch_dim, 1, reshape_size))

        def recurrent_layer():
            '''CuDNNLSTM when a GPU is present, plain LSTM otherwise.'''
            if gpu_devices:
                return L.CuDNNLSTM(ARGS.recurrent_size,
                                   return_sequences=True)
            return L.LSTM(ARGS.recurrent_size,
                          return_sequences=True,
                          implementation=2)

        # --- Inputs ---------------------------------------------------
        codes = L.Input((None, None), name='codes_input')
        model_inputs = [codes]

        # Embed every code, then sum the embeddings up to visit level.
        embedding_layer = L.Embedding(
            ARGS.num_codes + 1,
            ARGS.emb_size,
            name='embedding',
            embeddings_constraint=embeddings_constraint)
        per_code_embs = embedding_layer(codes)
        visit_embs = L.Lambda(lambda x: K.sum(x, axis=2))(per_code_embs)

        # Optional numeric features are appended to the visit embedding.
        full_embs = visit_embs
        if ARGS.numeric_size:
            numerics = L.Input((None, ARGS.numeric_size),
                               name='numeric_input')
            model_inputs.append(numerics)
            full_embs = L.concatenate([visit_embs, numerics], name='catInp')

        # Dropout on the inputs.
        full_embs = L.Dropout(ARGS.dropout_input)(full_embs)

        # Optional time feature is appended for the attention RNNs only.
        time_embs = full_embs
        if ARGS.use_time:
            time_input = L.Input((None, 1), name='time_input')
            model_inputs.append(time_input)
            time_embs = L.concatenate([full_embs, time_input],
                                      name='catInp2')

        # --- Layers ---------------------------------------------------
        # This implementation uses Bidirectional LSTM instead of reverse
        # order (see https://github.com/mp2893/retain/issues/3).
        alpha = L.Bidirectional(recurrent_layer(), name='alpha')
        beta = L.Bidirectional(recurrent_layer(), name='beta')

        weight_decay = ARGS.l2
        alpha_dense = L.Dense(1, kernel_regularizer=l2(weight_decay))
        beta_dense = L.Dense(ARGS.emb_size + ARGS.numeric_size,
                             activation=beta_activation,
                             kernel_regularizer=l2(ARGS.l2))

        # Visit-level attention (alpha).
        alpha_out = alpha(time_embs)
        alpha_out = L.TimeDistributed(alpha_dense,
                                      name='alpha_dense_0')(alpha_out)
        alpha_out = L.Softmax(axis=1)(alpha_out)

        # Code-level attention (beta).
        beta_out = beta(time_embs)
        beta_out = L.TimeDistributed(beta_dense,
                                     name='beta_dense_0')(beta_out)

        # Context vector from both attentions and the embeddings.
        weighted = L.Multiply()([alpha_out, beta_out, full_embs])
        context = L.Lambda(lambda x: K.sum(x, axis=1))(weighted)

        # 3D shape keeps Many-to-Many and Many-to-One implementations alike.
        contexts = L.Lambda(reshape)(context)

        # --- Prediction -----------------------------------------------
        contexts = L.Dropout(ARGS.dropout_context)(contexts)
        output_layer = L.Dense(1,
                               activation='sigmoid',
                               name='dOut',
                               kernel_regularizer=l2(ARGS.l2),
                               kernel_constraint=output_constraint)
        # TimeDistributed is used for the same consistency reason.
        output = L.TimeDistributed(output_layer,
                                   name='time_distributed_out')(contexts)

        return Model(inputs=model_inputs, outputs=[output])

    # Let TensorFlow grow GPU memory usage instead of claiming it all.
    K.clear_session()
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    session_config.gpu_options.allow_growth = True
    K.set_session(tf.Session(config=session_config))

    # Multi-GPU: build the serial model on the CPU, then parallelise it.
    devices = get_available_gpus()
    if len(devices) <= 1:
        model_final = build_retain(ARGS)
    else:
        with tf.device('/cpu:0'):
            serial_model = build_retain(ARGS)
        model_final = make_parallel(serial_model, devices)

    # adamax has produced best results in our experiments.
    model_final.compile(optimizer='adamax',
                        loss='binary_crossentropy',
                        metrics=['accuracy'],
                        sample_weight_mode='temporal')
    return model_final
def main(argv=None):
    '''Train a convolutional VAE on MNIST (optionally multi-GPU) and plot it.

    Steps: parse the CLI, load and scale MNIST, build the VAE together with
    its encoder and generator through ``make_vae_and_codec``, parallelise
    the VAE with ``make_parallel``, train it, evaluate the serial model on
    the test set, then save a latent-space scatter plot
    (``vae_scatter.ps``) and a 15x15 digit manifold (``vae_digit.ps``).

    At call time ``main.__doc__`` is overwritten with the module docstring
    so that it can serve as the CLI description.

    :param argv: optional list of extra command-line arguments. They are
        appended to ``sys.argv`` (presumably what ``parser_`` reads).
    '''
    main.__doc__ = __doc__
    # list.extend() returns None, so the original one-liner silently
    # rebound ``argv`` to None. Only the side effect on sys.argv is wanted.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__

    # CLI parser
    args = parser_(desc)
    mgpu = 1 if getattr(args, 'mgpu', None) is None else args.mgpu

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    # Global batch: 128 samples per device.
    batch_size = 128 * ngpus

    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)

    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0], ) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0], ) + original_img_size)

    print('x_train.shape:', x_train.shape)

    vae_serial, encoder, generator = make_vae_and_codec(
        original_img_size, img_chns, img_rows, img_cols, batch_size,
        filters, num_conv, intermediate_dim, latent_dim, epsilon_std)

    vae = make_parallel(vae_serial, gpus_list)

    # The learning rate is scaled with the number of devices.
    lr = 0.001 * ngpus
    opt = RMSprop(lr)

    # loss=None: no loss function is handed to compile here.
    vae.compile(optimizer=opt, loss=None)
    print_mgpu_modelsummary(vae)

    callbacks = [BatchTiming(), SamplesPerSec(batch_size)]

    # Validation inside fit is not accurate for mgpu, so the serial model
    # is evaluated separately below.
    vae.fit(x_train,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks)

    vae_val = vae_serial
    vae_val.compile(optimizer=opt, loss=None)
    # Evaluate with the per-device batch size on the serial model.
    loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size // ngpus)
    print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    plt.savefig('vae_scatter.ps')
    plt.close()

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # Linearly spaced coordinates on the unit square were transformed
    # through the inverse CDF (ppf) of the Gaussian to produce values of
    # the latent variables z, since the prior of the latent space is
    # Gaussian.
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
            # The same latent point is repeated to fill a whole batch;
            # only the first decoded image is kept.
            z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
            x_decoded = generator.predict(z_sample, batch_size=batch_size)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size:(i + 1) * digit_size,
                   j * digit_size:(j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    plt.savefig('vae_digit.ps')
    plt.close()
def main(argv=None):
    '''Train a CIFAR-10 CNN on one or several GPUs and report test metrics.

    Steps: parse the CLI, load CIFAR-10 (from ``datadir`` when given),
    one-hot encode labels, scale images to [0, 1], build the model with
    ``make_model``, optionally data-parallelise it with ``make_parallel``,
    train with RMSprop (with or without real-time augmentation) and finally
    evaluate the serial model on the test set.

    At call time ``main.__doc__`` is overwritten with the module docstring
    so that it can serve as the CLI description.

    :param argv: optional list of extra command-line arguments. They are
        appended to ``sys.argv`` (presumably what ``parser_`` reads).
    '''
    main.__doc__ = __doc__
    # list.extend() returns None, so the original one-liner silently
    # rebound ``argv`` to None. Only the side effect on sys.argv is wanted.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__

    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    enqueue = args.enqueue
    usenccl = args.nccl
    syncopt = args.syncopt

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices.
    # Squeeze is to deal with to_categorical bug in Keras 2.1.0 which
    # was fixed in Keras 2.1.1
    y_train = to_categorical(y_train, num_classes).squeeze()
    y_test = to_categorical(y_test, num_classes).squeeze()

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    callbacks = []

    if _DEVPROF or logdevp:
        # Setup Keras session using Tensorflow, logging device placement.
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    print(x_train.shape, 'train shape')

    model_init = make_model(x_train.shape, num_classes,
                            filepath if checkpt_flag else None)

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    lr = 0.0001
    if mgpu > 1 or mgpu == -1:
        gpus_list = get_available_gpus(mgpu)
        ngpus = len(gpus_list)
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        # The global batch is split across the devices.
        batch_size = batch_size * ngpus

        # Data-Parallelize the model via function or class.
        model = make_parallel(model_init, gpus_list, usenccl=usenccl,
                              syncopt=syncopt, enqueue=enqueue)
        print_mgpu_modelsummary(model)
        if not syncopt:
            opt = RMSprop(lr=lr, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=lr, decay=1e-6, gdev_list=gpus_list)
    else:
        model = model_init
        print(model.summary())

        # initiate RMSprop optimizer
        opt = RMSprop(lr=lr, decay=1e-6)

    # Added after the batch size is final so SamplesPerSec sees the
    # effective (possibly multi-GPU) batch size.
    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)
    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train, y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    # Final metrics are taken from the serial model rather than the
    # parallel wrapper.
    model_init.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])
    metrics = model_init.evaluate(x=x_test, y=y_test, batch_size=batch_size)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))
def main(argv=None):
    '''Train a CIFAR-10 CNN fed by a dataflow object via ``fit_dataflow``.

    Steps: parse the CLI, load CIFAR-10, one-hot encode labels, scale images
    to [0, 1], build the model with ``make_model``, optionally parallelise
    it with ``make_parallel`` (using ``ModelMGPU_Dflow``), compile with
    RMSprop and train from the dataset returned by ``get_data``.

    At call time ``main.__doc__`` is overwritten with the module docstring
    so that it can serve as the CLI description.

    :param argv: optional list of extra command-line arguments. They are
        appended to ``sys.argv`` (presumably what ``parser_`` reads).
    '''
    main.__doc__ = __doc__
    # list.extend() returns None, so the original one-liner silently
    # rebound ``argv`` to None. Only the side effect on sys.argv is wanted.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__

    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    enqueue = args.enqueue
    usenccl = args.nccl
    syncopt = args.syncopt

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    callbacks = None

    if _DEVPROF or logdevp:
        import tensorflow as tf

        # Setup Keras session using Tensorflow, logging device placement.
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    print(x_train.shape, 'train shape')
    print(y_train.shape, 'label shape')

    model_init = make_model(x_train.shape, num_classes,
                            filepath if checkpt_flag else None)

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    if mgpu > 1 or mgpu == -1:
        gpus_list = get_available_gpus(mgpu)
        ngpus = len(gpus_list)
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        # The global batch is split across the devices.
        batch_size = batch_size * ngpus

        # Data-Parallelize the model via function or class.
        model = make_parallel(model_init, gpus_list, usenccl=usenccl,
                              syncopt=syncopt, enqueue=enqueue,
                              model_class=ModelMGPU_Dflow)
        print_mgpu_modelsummary(model)
        if not syncopt:
            opt = RMSprop(lr=0.0001, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=0.0001, decay=1e-6, gdev_list=gpus_list)
    else:
        # NOTE(review): fit_dataflow below is described as mixed into the
        # ModelMGPU class; confirm the serial model also provides it.
        model = model_init
        print(model.summary())

        # initiate RMSprop optimizer
        opt = RMSprop(lr=0.0001, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    # prepare dataset; augmentation is selected through ``aug``.
    dataset_train = get_data('train', num_classes, batch_size=batch_size,
                             aug=data_augmentation, epochs=epochs)

    if not data_augmentation:
        print('Not using data augmentation.')
        # Plain ol'd fit is faster than the dataflow generator below.

    # Using fit_dataflow method that's mixed into ModelMGPU class.
    # NOTE(review): the source's indentation was lost; training is assumed
    # to run for both augmentation settings because ``dataset_train``
    # already carries the ``aug`` choice - confirm.
    model.fit_dataflow(dataset_train,
                       steps_per_epoch=steps_per_epoch,
                       epochs=epochs,
                       validation_data=(x_test, y_test),
                       callbacks=callbacks)
def main(argv=None):
    '''Train a CIFAR-10 CNN from a TF queue input pipeline, then test it.

    Steps: parse the CLI, load and scale CIFAR-10, build a CPU-pinned
    ``slice_input_producer`` / ``tf.train.batch`` pipeline (with optional
    augmentation and per-image standardisation), build the model on the
    batch tensor, optionally parallelise it with ``make_parallel``, train
    with the labels supplied as ``target_tensors``, save the weights, and
    evaluate a freshly built model with those weights on the test data.

    At call time ``main.__doc__`` is overwritten with the module docstring
    so that it can serve as the CLI description.

    :param argv: optional list of extra command-line arguments. They are
        appended to ``sys.argv`` (presumably what ``parser_`` reads).
    '''
    main.__doc__ = __doc__
    # list.extend() returns None, so the original one-liner silently
    # rebound ``argv`` to None. Only the side effect on sys.argv is wanted.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__

    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt

    # At least one device is requested even when mgpu is 0.
    gdev_list = get_available_gpus(mgpu or 1)
    ngpus = len(gdev_list)

    batch_size_1gpu = 32
    batch_size = batch_size_1gpu * ngpus
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    train_samples = x_train.shape[0]
    test_samples = y_test.shape[0]
    steps_per_epoch = train_samples // batch_size
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Squeeze is to deal with to_categorical bug in Keras 2.1.0 which
    # was fixed in Keras 2.1.1
    y_train = to_categorical(y_train, num_classes).astype(np.float32).squeeze()
    y_test = to_categorical(y_test, num_classes).astype(np.float32).squeeze()

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # Force input pipeline to CPU:0 to avoid data operations ending up on
    # GPU and resulting in a slow down for multigpu case due to comm
    # overhead.
    # NOTE(review): the source's indentation was lost; the whole pipeline
    # up to the test batch is assumed to sit inside this device scope.
    with tf.device('/cpu:0'):
        # Images are flattened for the producer and reshaped per sample.
        input_images = tf.constant(x_train.reshape(train_samples, -1))
        input_labels = tf.constant(y_train)  # already in proper shape

        image, label = tf.train.slice_input_producer(
            [input_images, input_labels], shuffle=True)
        # If using num_epochs=epochs have to:
        #     sess.run(tf.local_variables_initializer())
        #     and maybe also: sess.run(tf.global_variables_initializer())
        image = tf.reshape(image, x_train.shape[1:])

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, y_test], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order their operation.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(
                distorted_image, max_delta=63)
            distorted_image = tf.image.random_contrast(
                distorted_image, lower=0.2, upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is
        # faster.
        x_train_batch, y_train_batch = tf.train.batch(
            [image, label],
            batch_size=batch_size,
            capacity=capacity,
            num_threads=8)

        # https://stackoverflow.com/a/43613376/3457624
        # One batch holds the whole test set so it can be pulled out as
        # numpy arrays with a single sess.run below.
        x_test_batch, y_test_batch = tf.train.batch(
            [test_image, test_label],
            batch_size=test_samples,  # if converting to numpy first
            capacity=capacity,
            num_threads=1,  # set to 1 to make deterministic
            name='test_batch',
            shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)

    callbacks = []

    if _DEVPROF or logdevp:
        # Setup Keras session using Tensorflow, logging device placement.
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    model_init = make_model(x_train_input, num_classes,
                            filepath if checkpt_flag else None)
    x_train_out = model_init.output
    model_init = Model(inputs=[x_train_input], outputs=[x_train_out])

    # The learning rate is scaled with the number of devices.
    lr = 0.0001 * ngpus
    if ngpus > 1:
        model = make_parallel(model_init, gdev_list)
    else:
        # Must re-instantiate model per API below otherwise doesn't work.
        model = model_init

    opt = RMSprop(lr=lr, decay=1e-6)

    # Let's train the model using RMSprop; labels come from the queue.
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])

    print_mgpu_modelsummary(model)  # will print non-mgpu model as well

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=1,
                                     save_best_only=True)
        callbacks += [checkpoint]

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    # Start the queue runners.
    sess = KB.get_session()

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    # The queue-runner threads must be stopped even when training or
    # evaluation raises, hence the try/finally around the rest.
    try:
        val_in_train = False  # not sure how validation works during fit.

        start_time = time.time()
        # Fit the model using data from the input-queue tensors.
        model.fit(
            # validation data is not used???
            validation_steps=val_in_train,
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            callbacks=callbacks)
        elapsed_time = time.time() - start_time
        print('[{}] finished in {} s'.format('TRAINING',
                                             round(elapsed_time, 3)))

        # NOTE(review): ``checkptfile`` is not defined in this function;
        # presumably a module-level default path - confirm. When a
        # checkpoint path is given, weights are expected to exist there
        # already (written by ModelCheckpoint to ``filepath``).
        weights_file = checkptfile  # './saved_cifar10_wt.h5'
        if not checkpt_flag:  # empty list
            model.save_weights(checkptfile)

        KB.clear_session()

        # Second Session. Demonstrate that the model works
        test_model = make_model(x_test.shape[1:], num_classes)
        test_model.load_weights(weights_file)
        test_model.compile(loss='categorical_crossentropy',
                           optimizer=opt,
                           metrics=['accuracy'])

        if data_augmentation:
            # Need to run x_test through per_image_standardization
            # otherwise results get messed up. ``sess`` is the training
            # session, which still owns the input queues.
            x_processed, y_processed = sess.run([x_test_batch, y_test_batch])
            loss, acc = test_model.evaluate(x_processed, y_processed)
        else:
            loss, acc = test_model.evaluate(x_test, y_test)

        print('\nTest loss: {0}'.format(loss))
        print('\nTest accuracy: {0}'.format(acc))
    finally:
        # Clean up the TF session.
        coord.request_stop()
        coord.join(threads)
def main(argv=None):
    '''Train a CIFAR-10 CNN fed by a TF queue-runner input pipeline.

    Builds train/test input queues from the in-memory arrays, trains the
    (optionally multi-GPU) Keras model on the batched tensors, stores the
    weights (checkpoint callback or ``save_weights``), then reloads them
    into a fresh model and prints test loss and accuracy.

    :param argv: Optional extra command-line tokens. They are appended to
        ``sys.argv`` before the CLI parser runs.
    '''
    main.__doc__ = __doc__
    # list.extend() mutates in place and returns None, so its result must
    # not be assigned. parser_() receives no argv, so it presumably reads
    # sys.argv itself -- TODO confirm.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    gdev_list = get_available_gpus(mgpu or 1)
    ngpus = len(gdev_list)

    # Global batch size scales with the number of devices.
    batch_size_1gpu = 32
    batch_size = batch_size_1gpu * ngpus
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    train_samples = x_train.shape[0]
    test_samples = y_test.shape[0]
    steps_per_epoch = train_samples // batch_size
    # validations_steps = test_samples // batch_size
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Squeeze is to deal with to_categorical bug in Keras 2.1.0 which
    # was fixed in Keras 2.1.1
    y_train = to_categorical(y_train, num_classes).astype(np.float32).squeeze()
    y_test = to_categorical(y_test, num_classes).astype(np.float32).squeeze()

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    # min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    # enqueue_many = True

    # Force input pipeline to CPU:0 to avoid data operations ending up on GPU
    # and resulting in a slow down for multigpu case due to comm overhead.
    # NOTE(review): the source's indentation was lost; the whole input
    # pipeline (including batching) is placed under this device scope to
    # match the comment above -- confirm against the upstream file.
    with tf.device('/cpu:0'):
        # if no augmentation can go directly from numpy arrays
        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[x_train, y_train],
        #     # tensors=[x_train, y_train.astype(np.int32)],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     enqueue_many=enqueue_many,
        #     num_threads=8)

        # Flatten each image so the slice producer emits 1-D rows; they are
        # reshaped back to image shape right after slicing.
        input_images = tf.constant(x_train.reshape(train_samples, -1))
        input_labels = tf.constant(y_train)  # already in proper shape

        image, label = tf.train.slice_input_producer(
            [input_images, input_labels], shuffle=True)
        # If using num_epochs=epochs have to:
        #     sess.run(tf.local_variables_initializer())
        #     and maybe also: sess.run(tf.global_variables_initializer())
        image = tf.reshape(image, x_train.shape[1:])

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, y_test], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order their operation.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(
                distorted_image, max_delta=63)
            distorted_image = tf.image.random_contrast(
                distorted_image, lower=0.2, upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is faster.
        x_train_batch, y_train_batch = tf.train.batch(
            [image, label],
            batch_size=batch_size,
            capacity=capacity,
            num_threads=8)

        # https://stackoverflow.com/a/43613376/3457624
        x_test_batch, y_test_batch = tf.train.batch(
            [test_image, test_label],
            batch_size=test_samples,  # if converting to numpy first
            # batch_size=batch_size,  # if using tensors
            capacity=capacity,
            # num_threads=8,
            num_threads=1,  # set to 1 to make deterministic
            name='test_batch',
            shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)

    callbacks = []

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    model_init = make_model(x_train_input, num_classes,
                            filepath if checkpt_flag else None)
    x_train_out = model_init.output
    # model_init.summary()

    model_init = Model(inputs=[x_train_input], outputs=[x_train_out])

    # Learning rate is scaled linearly with the device count.
    lr = 0.0001 * ngpus
    if ngpus > 1:
        model = make_parallel(model_init, gdev_list)
    else:
        # Must re-instantiate model per API below otherwise doesn't work.
        model = model_init

    opt = RMSprop(lr=lr, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])

    print_mgpu_modelsummary(model)  # will print non-mgpu model as well

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=1,
                                     save_best_only=True)
        callbacks += [checkpoint]

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    # Start the queue runners.
    sess = KB.get_session()
    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])

    # Fit the model using data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    val_in_train = False  # not sure how the validation part works during fit.

    start_time = time.time()
    model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        # validation_steps=validations_steps if val_in_train else None,
        validation_steps=val_in_train,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    weights_file = checkptfile  # './saved_cifar10_wt.h5'
    if not checkpt_flag:  # empty list
        model.save_weights(checkptfile)

    KB.clear_session()

    # Second Session. Demonstrate that the model works
    # test_model = make_model(x_test.shape[1:], num_classes,
    #                         weights_file=weights_file)
    test_model = make_model(x_test.shape[1:], num_classes)
    test_model.load_weights(weights_file)
    test_model.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])

    if data_augmentation:
        # Need to run x_test through per_image_standardization otherwise
        # results get messed up. `sess` is the training session obtained
        # before KB.clear_session(); it is used to drain the test queue.
        x_processed, y_processed = sess.run([x_test_batch, y_test_batch])
        # DEBUGGING
        # xdiff = np.abs(x_test - x_processed)
        # print('MAX XDIFF: {}'.format(np.max(xdiff)))
        # ydiff = np.abs(y_test - y_processed)
        # print('y_test: {}'.format(y_test[0:5, :]))
        # print('y_processed: {}'.format(y_processed[0:5, :]))
        # print('ydiff: {}'.format(ydiff[-10:, :]))
        # print('MAX YDIFF: {}'.format(np.max(np.sum(ydiff))))

        loss, acc = test_model.evaluate(x_processed, y_processed)
    else:
        loss, acc = test_model.evaluate(x_test, y_test)

    # # Demonstrate that the model works using TF pipeline directly.
    # # In tf.train.batch for test data change batch_size=batch_size
    # # instead of train_samples. Uncomment below and comment out above.
    # val_samples = x_test.shape[0]
    # steps_per_epoch_val = int(np.ceil(val_samples / float(batch_size)))
    # images_val = KL.Input(tensor=x_test_batch)
    # test_model = make_model(images_val, num_classes,
    #                         weights_file)
    # test_model = Model(inputs=[images_val], outputs=[test_model.output])
    # test_model.compile(
    #     loss='categorical_crossentropy',
    #     optimizer=opt,
    #     metrics=['accuracy'],
    #     target_tensors=[y_test_batch])
    # loss, acc = test_model.evaluate(x=None, y=None,
    #                                 steps=steps_per_epoch_val)

    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)
def main(argv=None):
    '''Train a CIFAR-10 CNN data-parallel across SLURM-allocated workers.

    Sets up the TF cluster from the SLURM environment, parks parameter
    servers and non-chief workers on ``join``, and lets the chief build the
    model, replicate it over every worker device, train it, and evaluate
    the serial model on the test set before stopping the server.

    :param argv: Optional extra command-line tokens. They are appended to
        ``sys.argv`` before the CLI parser runs.
    '''
    main.__doc__ = __doc__
    # list.extend() mutates in place and returns None, so its result must
    # not be assigned. parser_() receives no argv, so it presumably reads
    # sys.argv itself -- TODO confirm.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    # mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    # enqueue = args.enqueue
    # usenccl = args.nccl
    # syncopt = args.syncopt
    # print('RDMA: {}'.format(args.rdma))
    # rdma = getattr(args, 'rdma', None)
    rdma = args.rdma

    network = args.network
    # print('NETWORK: {}'.format(network))

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = 32  # per device; scaled by total worker devices below
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    # ---------------------------------------------- Distributed setup on SLURM
    # Specifying network necessary for protocol='grpc+gdr'. GDR doesn't find
    # IB addresses automatically like 'grpc+verbs'.
    # The 'ib.cluster' is specific to NVIDIA psgcluster.
    # network = 'ib.cluster' if rdma == 'gdr' else None
    # network = 'ib.cluster'
    # On fast network even without RDMA speed up significant. RDMA still helps.
    scpar = SlurmClusterParser(network=network)
    cmgr_facade = TFClusterManagerFacade(scpar)

    logdevp_flag = bool(_DEVPROF or logdevp)
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(log_device_placement=logdevp_flag,  # True,
                            allow_soft_placement=True,
                            gpu_options=gpu_options)

    print('\n\tCLUSTER_SPEC_DICT: {}\n'.format(cmgr_facade.clusterspec_dict))

    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in server below.
    protocol = ProtocolType.get_server_protocol_str(rdma)
    # print('PROTOCOL: {}'.format(protocol))
    server = cmgr_facade.get_server(
        config,
        protocol=protocol)
    tfsess = cmgr_facade.get_session(server)
    KB.set_session(tfsess)

    #: :type cluster_spec: tf.train.ClusterSpec
    # cluster_spec = cmgr_facade.get_cluster_spec()
    job_type = cmgr_facade.myjobtype
    # task_id = cmgr_facade.mytask_id

    is_chief = cmgr_facade.is_chief

    if job_type == JobType.ps:
        # JOIN PARAMETER SERVERS
        # server.join()
        cmgr_facade.join(server)

    # Once the server is started everything but the chief worker can join
    # the server and wait to process/service graph computations. Chief pushes
    # the compute graph. COMPARE TO: cifar10_cnn_distrib_v2_slurm
    if not is_chief:
        # JOIN WORKERS EXCEPT FOR CHIEF
        cmgr_facade.join(server)

    # sleep(2)  # Have the chief wait just in case. Occasionally get errors.

    # The ngpus per host needs to be done with MPI or somehow sync'd. Currently
    # assuming all hosts have the same number of GPUs.
    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)

    # List of all devices. The devices might be associated to the same worker.
    wgdev_list = cmgr_facade.get_allworkers_devlist(ngpus)
    print('\n\tWGDEV_LIST: {}\n'
          .format([dev.to_string() for dev in wgdev_list]))  # DEBUG
    # If 2 workers ea. w/ 4 devices then nworker_devices_total == 2 * 4 = 8
    # If 4 workers ea. w/ 1 devices then nworker_devices_total == 4 * 1 = 4
    nworker_devices_total = len(wgdev_list)
    batch_size = batch_size * nworker_devices_total

    psdev_list = cmgr_facade.get_allps_devlist()
    print('\n\tPSDEV_LIST: {}\n'
          .format([dev.to_string() for dev in psdev_list]))  # DEBUG

    # ------------------------------------ Data loading and basic preprocessing
    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # --------------------------------------------- Setup model and parallelize
    def _load_fn(unused_op):
        # Constant load per op for the greedy load-balancing strategy.
        return 1

    cspec = cmgr_facade.get_cluster_spec()
    num_ps = cmgr_facade.num_ps
    ps_strategy = \
        tf.contrib.training.GreedyLoadBalancingStrategy(num_ps, _load_fn)

    # ps_device = tf.DeviceSpec(job=JobType.ps, device_type=DevType.cpu,
    #                           device_index=0).to_string()

    rdsetter = tf.train.replica_device_setter(
        cluster=cspec,
        ps_strategy=ps_strategy,
        # ps_device=ps_device,  # '/job:ps/cpu:0'  # seems to work
        # ps_device='/gpu:0'  # for gdr maybe
    )
    with tf.device(rdsetter):
        model_init = make_model(
            x_train.shape, num_classes,
            filepath if checkpt_flag else None
        )

    callbacks = None
    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    # Data-Parallelize the model via function or class.
    model = make_parallel(model_init, wgdev_list)  # , ps_device='/gpu:0'
    print_mgpu_modelsummary(model)

    # ------------------------------------------------------------ Run training
    # Learning rate is scaled linearly with the total device count.
    lr = 0.0001 * nworker_devices_total
    opt = RMSprop(lr=lr, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)
    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train, y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    # Run Validation
    if is_chief:
        # Evaluate with the serial (non-replicated) model.
        model_init.compile(loss='categorical_crossentropy',
                           optimizer=opt,
                           metrics=['accuracy'])
        metrics = model_init.evaluate(
            x=x_test, y=y_test, batch_size=batch_size)
        print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))

    # ------------------------------------------------------------- STOP SERVER
    cmgr_facade.stop_chief(server)
def main():
    '''Train an MNIST CNN from a ``tf.train.shuffle_batch`` input queue.

    Trains a (optionally multi-GPU) Keras model whose input and target are
    queue-fed tensors, saves the weights, then rebuilds the network on a
    plain placeholder input in a fresh Keras session and evaluates it on
    the validation split.
    '''
    # user options
    batch_size = 128
    val_in_train = False  # not sure how the validation part works during fit.
    use_model_checkpt = False

    # demo processing
    sess = tf.Session()
    KB.set_session(sess)

    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)
    # Global batch size scales with the number of devices.
    batch_size = batch_size * ngpus

    data = mnist.load_mnist()
    X_train = data.train.images
    # X_test = data.test.images
    train_samples = X_train.shape[0]  # 60000
    # test_samples = X_test.shape[0]  # 10000

    height_nrows = 28
    width_ncols = 28
    batch_shape = [batch_size, height_nrows, width_ncols, 1]
    epochs = 5
    steps_per_epoch = train_samples // batch_size
    # validations_steps = test_samples / batch_size
    nclasses = 10

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example.  An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    enqueue_many = True

    x_train_batch, y_train_batch = tf.train.shuffle_batch(
        tensors=[data.train.images, data.train.labels.astype(np.int32)],
        batch_size=batch_size,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue,
        enqueue_many=enqueue_many,
        num_threads=8)

    x_train_batch = tf.cast(x_train_batch, tf.float32)
    x_train_batch = tf.reshape(x_train_batch, shape=batch_shape)

    y_train_batch = tf.cast(y_train_batch, tf.int32)
    y_train_batch = tf.one_hot(y_train_batch, nclasses)

    x_train_input = Input(tensor=x_train_batch)

    # x_test_batch, y_test_batch = tf.train.batch(
    #     tensors=[data.test.images, data.test.labels.astype(np.int32)],
    #     batch_size=batch_size,
    #     capacity=capacity,
    #     enqueue_many=enqueue_many,
    #     num_threads=8)

    # I like the non-functional definition of model more.
    # model_init = make_model(x_train_input, nclasses)
    # x_train_out = model_init.output
    # train_model = Model(inputs=[x_train_input], outputs=[x_train_out])

    x_train_out = cnn_layers(x_train_input, nclasses)
    train_model = Model(inputs=[x_train_input], outputs=[x_train_out])
    if ngpus > 1:
        train_model = make_parallel(train_model, gdev_list)

    # Learning rate is scaled linearly with the device count.
    lr = 2e-3 * ngpus
    train_model.compile(optimizer=RMSprop(lr=lr, decay=1e-5),
                        loss='categorical_crossentropy',
                        metrics=['accuracy'],
                        target_tensors=[y_train_batch])

    if ngpus > 1:
        print_mgpu_modelsummary(train_model)
    else:
        train_model.summary()

    # Callbacks
    if use_model_checkpt:
        mon = 'val_acc' if val_in_train else 'acc'
        checkpoint = ModelCheckpoint(
            'saved_wt.h5', monitor=mon, verbose=0,
            save_best_only=True,
            save_weights_only=True)
        checkpoint = [checkpoint]
    else:
        checkpoint = []

    callbacks = checkpoint
    # Training slower with callback. Multigpu slower with callback during
    # training than 1 GPU. Again, mnist is too trivial of a model and dataset
    # to benchmark or stress GPU compute capabilities. I set up this example
    # to illustrate potential for speedup of multigpu case trying to use mnist
    # as a stressor.
    # It's like comparing a 5 ft race between a person and a truck. A truck is
    # obviously faster than a person but in a 5 ft race the person will likely
    # win due to slower startup for the truck.
    # I will re-implement this with Cifar that should be a better benchmark.

    # Start the queue runners exactly once, under the coordinator, so that
    # coord.request_stop() / coord.join(threads) below actually manage the
    # enqueue threads. Starting them a first time without a coordinator
    # would leave `threads` empty and the runners unmanaged.
    # Fit the model using data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    start_time = time.time()
    train_model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        # validations_steps if val_in_train else None,
        # validation_steps=val_in_train,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    if not checkpoint:  # empty list
        train_model.save_weights('./saved_wt.h5')

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)

    KB.clear_session()

    # Second Session. Demonstrate that the model works and is independent of
    # the TFRecord pipeline, and to test loading trained model without tensors.
    x_test = np.reshape(data.validation.images,
                        (data.validation.images.shape[0], 28, 28, 1))
    y_test = data.validation.labels
    x_test_inp = KL.Input(shape=(x_test.shape[1:]))
    test_out = cnn_layers(x_test_inp, nclasses)
    test_model = Model(inputs=x_test_inp, outputs=test_out)

    test_model.load_weights('saved_wt.h5')
    test_model.compile(optimizer='rmsprop',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    test_model.summary()

    loss, acc = test_model.evaluate(x_test, to_categorical(y_test))
    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))
def main(argv=None):
    '''Multigpu example using Keras for Cifar10 training.'''
    # NOTE: the docstring above is passed to parser_() as the CLI
    # description, so it is runtime text; keep it as is.
    #
    # Flow: configure the Keras/TF session, load and normalise CIFAR-10,
    # build the model either on numpy input (ImageDataGenerator path) or on
    # TF Dataset tensors, optionally wrap it for multi-GPU, train, then
    # evaluate on the test set.
    #
    # argv: optional extra command-line tokens; appended to sys.argv.
    # list.extend() mutates in place and returns None, so its result must
    # not be assigned. parser_() receives no argv, so it presumably reads
    # sys.argv itself -- TODO confirm.
    if argv is not None:
        sys.argv.extend(argv)

    # CLI parser
    args = parser_(main.__doc__)

    logdevp = args.logdevp

    gpu_options = tf.GPUOptions(allow_growth=True)
    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=True,
            gpu_options=gpu_options)
        # config.gpu_options.allow_growth = True
        KB.set_session(tf.Session(config=config))
    else:
        config = tf.ConfigProto(gpu_options=gpu_options)
        KB.set_session(tf.Session(config=config))

    mgpu = 0 if args.mgpu is None else args.mgpu
    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    syncopt = args.syncopt

    checkpt = args.checkpt
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    # Global batch size scales with the number of devices.
    batch_size = args.batch_size * ngpus if ngpus > 1 else args.batch_size
    num_classes = 10
    epochs = args.epochs

    datadir = args.datadir

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    if not args.use_dataset_api:
        traingen = ImageDataGenerator()
        if args.aug:
            print('Using real-time data augmentation.')
            # This will do preprocessing and realtime data augmentation:
            traingen = ImageDataGenerator(
                # set input mean to 0 over the dataset
                featurewise_center=False,
                # set each sample mean to 0
                samplewise_center=False,
                # divide inputs by std of the dataset
                featurewise_std_normalization=False,
                # divide each input by its std
                samplewise_std_normalization=False,
                # apply ZCA whitening
                zca_whitening=False,
                # randomly rotate images in the range (degrees, 0 to 180)
                rotation_range=0,
                # randomly shift images horizontally (fraction of total width)
                width_shift_range=0.1,
                # randomly shift images vertically (fraction of total height)
                height_shift_range=0.1,
                # randomly flip images
                horizontal_flip=True,
                # randomly flip images
                vertical_flip=False)

            # Compute quantities required for feature-wise normalization
            # (std, mean, and principal components if ZCA whitening is applied)
            traingen.fit(x_train)

        # x_train_input = KL.Input(shape=x_train.shape[1:])
        model_init = make_model(
            x_train.shape[1:], num_classes, filepath)
    else:
        print('USING TF DATASET API.')
        dataset = wrap_as_tfdataset(
            x_train, y_train, args.aug, batch_size)
        iterator = dataset.make_one_shot_iterator()

        # Model creation using tensors from the get_next() graph node.
        inputs, targets = iterator.get_next()
        x_train_input = KL.Input(tensor=inputs)

        model_init_ = make_model(x_train_input, num_classes, filepath)
        x_train_out = model_init_.output

        model_init = Model(inputs=[x_train_input], outputs=[x_train_out])

    lr = 0.0001
    if ngpus > 1:
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        # Learning rate is scaled linearly with the device count.
        lr = lr * ngpus

        # Data-Parallelize the model via function or class.
        if args.mgpu_type == 'kerasmgpu':
            gpus_list_int = get_available_gpus(
                ngpus, list_type=GPUListType.int_id)
            model = ModelKerasMGPU(model_init, gpus_list_int)
        else:
            model = ModelMGPU(
                serial_model=model_init, gdev_list=gpus_list)

        print_mgpu_modelsummary(model)
        if not syncopt:
            opt = RMSprop(lr=lr, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=lr, decay=1e-6, gdev_list=gpus_list)  # @IgnorePep8 pylint: disable=unexpected-keyword-arg
    else:
        model = model_init
        # batch_size = batch_size * 3
        # batch_size = 25000  # exhaust GPU memory. Crashes.
        # summary() prints the table itself and returns None, so wrapping
        # it in print() would emit a stray "None" line.
        model.summary()

        # initiate RMSprop optimizer
        opt = RMSprop(lr=lr, decay=1e-6)

    # With the Dataset API the labels come from the iterator tensors, so
    # they are wired in as target tensors instead of being fed at fit time.
    model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'],
        target_tensors=None if not args.use_dataset_api else [targets])

    callbacks = []
    if checkpt:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not args.use_dataset_api:
        start_time = time.time()
        # Fit the model on the batches generated by traingen.flow().
        model.fit_generator(
            traingen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=(x_test, y_test),
            callbacks=callbacks)
    else:
        # augmentation incorporated in the Dataset pipeline
        start_time = time.time()
        # Validation during training can be incorporated via callback:
        # noqa ref: https://github.com/keras-team/keras/blob/c8bef99ec7a2032b9bea6e9a1260d05a2b6a80f1/examples/mnist_tfrecord.py#L56
        model.fit(
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            callbacks=callbacks)

    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'
          .format('TRAINING', round(elapsed_time, 3)))

    test_model = model_init
    if args.use_dataset_api:
        # Create a test-model without Dataset pipeline in the model graph.
        test_model = make_model(x_test.shape[1:], num_classes)
        print('SETTING WEIGHTS FOR EVAL WITH DATASET API...')
        test_model.set_weights(model.get_weights())
        print('WEIGHTS SET!!!')

    test_model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'])

    metrics = test_model.evaluate(x_test, y_test)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))

    KB.clear_session()
def main(argv=None):
    '''Train a convolutional VAE on MNIST from a TF Dataset and plot results.

    Trains the (optionally multi-GPU) VAE on batches produced by a one-shot
    Dataset iterator, evaluates the loss on the test images, then writes
    two figures: a scatter of the test digits in the 2-D latent space
    (``vae_scatter.ps``) and a 15x15 grid of decoded digits
    (``vae_digit.ps``).

    :param argv: Optional extra command-line tokens. They are appended to
        ``sys.argv`` before the CLI parser runs.
    '''
    main.__doc__ = __doc__
    # list.extend() mutates in place and returns None, so its result must
    # not be assigned. parser_() receives no argv, so it presumably reads
    # sys.argv itself -- TODO confirm.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)
    mgpu = 1 if getattr(args, 'mgpu', None) is None else args.mgpu

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    # Global batch size scales with the number of devices.
    batch_size = 128 * ngpus
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0], ) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0], ) + original_img_size)

    print('x_train.shape:', x_train.shape)

    train_samples = x_train.shape[0]
    steps_per_epoch = int(round(float(train_samples) / batch_size + 0.5))

    # Create the dataset and its associated one-shot iterator.
    buffer_size = 10000
    dataset = Dataset.from_tensor_slices(x_train)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    x_train_batch = iterator.get_next()

    ldict = make_shared_layers_dict(img_chns, img_rows, img_cols, batch_size,
                                    filters, num_conv, intermediate_dim,
                                    latent_dim, epsilon_std)
    # ldict is a dictionary that holds all layers. Since these layers are
    # instantiated once, they are shared amongs vae, encoder, and generator.

    x = Input(tensor=x_train_batch)
    vae_serial = make_vae(ldict, x)
    # : :type vae: Model
    vae = make_parallel(vae_serial, gpus_list)

    # Learning rate is scaled linearly with the device count.
    lr = 0.001 * ngpus
    opt = RMSprop(lr)  # 'rmsprop'
    # opt = tf.train.RMSPropOptimizer(lr)
    # opt = TFOptimizer(opt)
    vae.compile(optimizer=opt, loss=None)
    # vae.summary()
    print_mgpu_modelsummary(vae)

    callbacks = [BatchTiming(), SamplesPerSec(batch_size)]

    # Fit the model using data from the TF data tensors.
    vae.fit(steps_per_epoch=steps_per_epoch, epochs=epochs,
            callbacks=callbacks)

    # Rebuild the VAE on a placeholder input (same shared layers) so it can
    # be evaluated on numpy test data.
    x = Input(shape=original_img_size)
    vae_val = make_vae(ldict, x)
    vae_val.compile(optimizer=opt, loss=None)
    loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size // ngpus)
    print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

    x = Input(shape=original_img_size)
    z_mean, _ = get_encoded(ldict, x)
    encoder = Model(x, z_mean)
    # : :type encoder: Model

    decoder_input = Input(shape=(latent_dim, ))
    x_decoded_mean_squash = get_decoded(ldict, decoder_input)
    generator = Model(decoder_input, x_decoded_mean_squash)
    # : :type generator: Model

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    # plt.show()
    plt.savefig('vae_scatter.ps')
    plt.close()

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # Linearly spaced coordinates on the unit square were transformed through
    # the inverse CDF (ppf) of the Gaussian
    # To produce values of the latent variables z, since the prior of the
    # latent space is Gaussian
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
            # Repeat the single latent point to a full batch before
            # predicting; only the first decoded image is kept.
            z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
            x_decoded = generator.predict(z_sample, batch_size=batch_size)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size:(i + 1) * digit_size,
                   j * digit_size:(j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    # plt.show()
    plt.savefig('vae_digit.ps')
    plt.close()
def test(cluster_parser_spec):
    '''Smoke-test a distributed TF setup described by a cluster parser.

    Starts the server for this process, parks every non-chief process on
    ``join``, and lets the chief run a hello-world constant, a local
    computation (``calcm``) and the distributed graph (``distrib_graph``),
    retrying until both computations succeed, before stopping the server.

    :param cluster_parser_spec: Object exposing ``num_tasks_per_host``,
        ``hostnames``, ``num_parameter_servers`` and ``my_proc_id``.
    '''
    # Setting config on ther server instantiation and then re-using this same
    # config for sesssions is very important. This functionality is wrapped
    # in TFClusterManagerFacade.
    session_config = tf.ConfigProto(
        log_device_placement=False,  # True,
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True))

    facade = TFClusterManagerFacade(
        cluster_parser_spec.num_tasks_per_host,
        cluster_parser_spec.hostnames,
        cluster_parser_spec.num_parameter_servers,
        cluster_parser_spec.my_proc_id)

    #: :type cluster_spec: tf.train.ClusterSpec
    cluster_spec = facade.get_cluster_spec()

    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in server below.
    server = facade.get_server(session_config)  # , protocol='grpc+verbs')

    # Once the server is started everything but the chief worker can join
    # the server and wait to process/service graph computations. Chief in this
    # test function pushes the compute graph.
    if not facade.is_chief:
        # JOIN WORKERS (PS also) EXCEPT FOR CHIEF
        facade.join(server)

    # The ngpus per host needs to be done with MPI or somehow sync'd. Currently
    # assuming all hosts have the same number of GPUs.
    ngpus = len(get_available_gpus())

    #: :type mywgdev: tf.DeviceSpec
    worker_devices = facade.get_allworkers_devlist(ngpus)

    compute_graph = distrib_graph(worker_devices)

    with facade.get_session(server) as sess:
        sleep(2)  # Have the chief wait just in case. Occasionally get errors.
        # Perhaps implement a READY queue just like DONE queues.

        # TO USE REPLICA WITH tf.train.Supervisor DO NOT JOIN WORKERS ABOVE.
        # USING IT BELOW FOR PRINTING "Hello,..." IS NOT NECESSARY.
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            hello_tf = tf.constant("Hello, distributed TensorFlow!")
            greeting = sess.run(hello_tf)
            print('RESULT:\n{}\n'.format(greeting))

        # Keep retrying until one full pass (local + distributed) succeeds;
        # failures are reported on stderr and otherwise ignored.
        finished = False
        while not finished:
            try:
                local_result = sess.run(calcm())
                print('RESULT NOT DISTRIBUTED:\n{}\n'.format(local_result))

                distributed_result = sess.run(compute_graph)
                print('RESULT DISTRIBUTED:\n{}\n'.format(distributed_result))
            except Exception as err:
                traceback.print_exc()
                print('INHIBITING ERROR: {}'.format(err), file=sys.stderr)
            else:
                finished = True

        # facade.stop_chief(server, sess=sess)  # this works too
        # NOTE(review): the source's indentation was lost; this call is
        # kept inside the session block -- confirm against upstream.
        facade.stop_chief(server)
def main(argv=None):
    '''Train a multi-GPU convolutional VAE on MNIST fed by a TF Dataset.

    Parses the CLI, streams the training images through a repeating,
    shuffled, batched ``Dataset`` iterator, fits the parallelized VAE,
    evaluates a serial VAE built from the same shared layers on the test
    set, and writes a latent-space scatter plot (``vae_scatter.ps``) and a
    decoded digit manifold (``vae_digit.ps``).

    :param argv: Optional list of extra command-line arguments; when given
        it is appended to ``sys.argv`` before the CLI parser is built.
    '''
    main.__doc__ = __doc__
    # list.extend() mutates in place and returns None, so its result must
    # not be assigned back to argv.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))

    # CLI parser
    args = parser_(desc)
    mgpu = 1 if getattr(args, 'mgpu', None) is None else args.mgpu

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    # The global batch grows with the number of devices.
    batch_size = 128 * ngpus
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)

    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    print('x_train.shape:', x_train.shape)

    train_samples = x_train.shape[0]
    # Ceiling division: add a step only when a partial batch remains. The
    # previous int(round(x + 0.5)) could add a step on exact multiples.
    steps_per_epoch = (train_samples + batch_size - 1) // batch_size

    # Create the dataset and its associated one-shot iterator.
    buffer_size = 10000
    dataset = Dataset.from_tensor_slices(x_train)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    x_train_batch = iterator.get_next()

    # ldict is a dictionary that holds all layers. Since these layers are
    # instantiated once, they are shared amongst vae, encoder and generator.
    ldict = make_shared_layers_dict(
        img_chns, img_rows, img_cols, batch_size, filters, num_conv,
        intermediate_dim, latent_dim, epsilon_std)

    x = Input(tensor=x_train_batch)
    vae_serial = make_vae(ldict, x)
    # : :type vae: Model
    vae = make_parallel(vae_serial, gpus_list)

    lr = 0.001 * ngpus
    opt = RMSprop(lr)  # 'rmsprop'
    # opt = tf.train.RMSPropOptimizer(lr)
    # opt = TFOptimizer(opt)
    vae.compile(optimizer=opt, loss=None)
    # vae.summary()
    print_mgpu_modelsummary(vae)

    callbacks = [BatchTiming(), SamplesPerSec(batch_size)]

    # Fit the model using data from the TF data tensors.
    vae.fit(steps_per_epoch=steps_per_epoch, epochs=epochs,
            callbacks=callbacks)

    # Validate with a serial VAE over the shared layers, fed from numpy.
    x = Input(shape=original_img_size)
    vae_val = make_vae(ldict, x)
    vae_val.compile(optimizer=opt, loss=None)
    loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size // ngpus)
    print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

    x = Input(shape=original_img_size)
    z_mean, _ = get_encoded(ldict, x)
    encoder = Model(x, z_mean)  # : :type encoder: Model

    decoder_input = Input(shape=(latent_dim,))
    x_decoded_mean_squash = get_decoded(ldict, decoder_input)
    generator = Model(decoder_input, x_decoded_mean_squash)
    # : :type generator: Model

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    # plt.show()
    plt.savefig('vae_scatter.ps')
    plt.close()

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # Linearly spaced coordinates on the unit square are transformed through
    # the inverse CDF (ppf) of the Gaussian to produce values of the latent
    # variables z, since the prior of the latent space is Gaussian.
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
            # The same latent point is repeated to fill a whole batch; only
            # the first decoded image is kept.
            z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
            x_decoded = generator.predict(z_sample, batch_size=batch_size)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size: (i + 1) * digit_size,
                   j * digit_size: (j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    # plt.show()
    plt.savefig('vae_digit.ps')
    plt.close()
def main(argv=None):
    '''Train a CIFAR10 CNN data-parallel across the devices of each worker.

    Sets up a distributed TensorFlow cluster from SLURM via
    SlurmClusterParser / TFClusterManagerFacade, joins parameter servers to
    the server, builds the model under a replica device setter, parallelizes
    it over this worker's own devices and trains it with an
    ImageDataGenerator (with or without augmentation).

    :param argv: Optional list of extra command-line arguments; when given
        it is appended to ``sys.argv`` before the CLI parser is built.
    '''
    main.__doc__ = __doc__
    # list.extend() mutates in place and returns None, so its result must
    # not be assigned back to argv.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))

    # CLI parser
    args = parser_(desc)
    rdma = args.rdma
    network = args.network

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug
    logdevp = args.logdevp

    # ------------------------------------------ Distributed setup on SLURM
    scpar = SlurmClusterParser(network=network)
    cmgr_facade = TFClusterManagerFacade(scpar)

    logdevp_flag = bool(_DEVPROF or logdevp)
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(log_device_placement=logdevp_flag,
                            allow_soft_placement=True,
                            gpu_options=gpu_options)

    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in server below.
    protocol = ProtocolType.get_server_protocol_str(rdma)
    server = cmgr_facade.get_server(config, protocol=protocol)
    tfsess = cmgr_facade.get_session(server)
    KB.set_session(tfsess)

    job_type = cmgr_facade.myjobtype
    is_chief = cmgr_facade.is_chief

    if job_type == JobType.ps:
        # JOIN PARAMETER SERVERS
        cmgr_facade.join(server)

    ps_device = cmgr_facade.get_mypsdevice()
    print('MYPS_DEVICE: {}'.format(ps_device))  # DEBUG

    # The ngpus per host needs to be done with MPI or somehow sync'd.
    # Currently assuming all hosts have the same number of GPUs.
    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)

    # List of all devices. The devices might be associated to the same
    # worker.
    wgdev_list = cmgr_facade.get_allworkers_devlist(ngpus)

    # Number of workers, not devices. Each worker can have multiple devices.
    num_workers = cmgr_facade.num_workers

    # List of devices associated with current worker/task.
    mydevlist = cmgr_facade.get_mydevlist(ngpus)
    nmydevs = len(mydevlist)
    batch_size = batch_size * nmydevs

    # -------------------------------- Data loading and basic preprocessing
    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    nsamples = x_train.shape[0]
    # Each worker covers an equal share of an epoch. The dataset itself is
    # not split; every worker samples from the full set through the
    # generator, because a naive static split hurts random sampling and
    # slows convergence.
    steps_per_epoch = (nsamples // num_workers) // batch_size

    # ----------------------------------------- Setup model and parallelize
    def _load_fn(unused_op):
        return 1

    cspec = cmgr_facade.get_cluster_spec()
    num_ps = cmgr_facade.num_ps
    ps_strategy = \
        tf.contrib.training.GreedyLoadBalancingStrategy(num_ps, _load_fn)

    rdsetter = tf.train.replica_device_setter(
        cluster=cspec,
        ps_strategy=ps_strategy,
    )
    with tf.device(rdsetter):
        model_init = make_model(
            x_train.shape, num_classes,
            filepath if checkpt_flag else None
        )

    # If using the checkpointing callback enable it on the chief only, or
    # use a unique filepath for each worker task.
    callbacks = None
    if checkpt_flag and is_chief:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    if is_chief:
        print('\n\tCLUSTER_SPEC_DICT: {}\n\tWGDEV_LIST: {}\n'
              .format(cmgr_facade.clusterspec_dict,
                      [dev.to_string() for dev in wgdev_list]))  # DEBUG

    print('\n\tMYWGDEV_LIST: {}\n'
          .format([dev.to_string() for dev in mydevlist]))  # DEBUG

    # Data-Parallelize the model via function or class.
    model = make_parallel(model_init, mydevlist, ps_device=ps_device)
    print_mgpu_modelsummary(model)

    # -------------------------------------------------------- Run training
    lr = 0.0001 * nmydevs
    opt = RMSprop(lr=lr, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    # Both modes train through a generator; they differ only in how the
    # generator is configured.
    if not data_augmentation:
        print('Not using data augmentation.')
        datagen = ImageDataGenerator()
    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

    # Compute quantities required for feature-wise normalization
    # (std, mean, and principal components if ZCA whitening is applied).
    datagen.fit(x_train)

    # Fit the model on the batches generated by datagen.flow().
    model.fit_generator(datagen.flow(x_train, y_train,
                                     batch_size=batch_size),
                        steps_per_epoch=steps_per_epoch,
                        epochs=epochs,
                        validation_data=(x_test, y_test),
                        callbacks=callbacks)

    # --------------------------------------------------------- STOP SERVER
    if not is_chief:
        # JOIN WORKERS EXCEPT FOR CHIEF
        cmgr_facade.join(server)

    cmgr_facade.stop_chief(server)
def main(argv=None):
    '''Train a CIFAR10 CNN on one or more GPUs fed by TF input queues.

    Preloads the training set into non-trainable TF variables on the CPU,
    slices and batches it through queue runners, trains a (optionally
    multi-GPU) Keras model on those tensors, saves or checkpoints the
    weights, then rebuilds a plain model, loads the weights and evaluates it
    on the test set.

    :param argv: Optional list of extra command-line arguments; when given
        it is appended to ``sys.argv`` before the CLI parser is built.
    '''
    main.__doc__ = __doc__
    # list.extend() mutates in place and returns None, so its result must
    # not be assigned back to argv.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))

    # CLI parser
    args = parser_(desc)
    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt

    gdev_list = get_available_gpus(mgpu or 1)
    ngpus = len(gdev_list)

    batch_size_1gpu = 32
    batch_size = batch_size_1gpu * ngpus
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug
    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()

    train_samples = x_train.shape[0]
    test_samples = y_test.shape[0]
    steps_per_epoch = train_samples // batch_size
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    y_train = to_categorical(y_train, num_classes).astype(np.float32)
    y_test = to_categorical(y_test, num_classes).astype(np.float32)

    # Flattened per-sample copies used to initialize the preloaded variables.
    x_train_feed = x_train.reshape(train_samples, -1)
    y_train_feed = y_train.reshape(train_samples, -1)

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # Force input pipeline to CPU:0 to avoid data operations ending up on
    # GPU and resulting in a slow down for multigpu case due to comm
    # overhead.
    with tf.device('/cpu:0'):
        # ref: https://www.tensorflow.org/api_guides/python/reading_data#Preloaded_data @IgnorePep8
        # Using tf.Variable instead of tf.constant uses less memory, because
        # the constant is stored inline in the graph data structure which
        # may be duplicated a few times.
        with tf.name_scope('input'):
            # Input data
            images_initializer = tf.placeholder(dtype=x_train.dtype,
                                                shape=x_train_feed.shape)
            labels_initializer = tf.placeholder(dtype=y_train.dtype,
                                                shape=y_train_feed.shape)
            # trainable=False keeps the variable out of the
            # GraphKeys.TRAINABLE_VARIABLES collection, so it is not updated
            # when training. collections=[] keeps it out of the
            # GraphKeys.GLOBAL_VARIABLES collection used for saving and
            # restoring checkpoints.
            input_images = tf.Variable(images_initializer,
                                       trainable=False, collections=[])
            input_labels = tf.Variable(labels_initializer,
                                       trainable=False, collections=[])

        image, label = tf.train.slice_input_producer(
            [input_images, input_labels], shuffle=True)
        # If using num_epochs=epochs have to:
        #     sess.run(tf.local_variables_initializer())
        #     and maybe also: sess.run(tf.global_variables_initializer())
        image = tf.reshape(image, x_train.shape[1:])

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, y_test], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order their operation.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(distorted_image,
                                                         max_delta=63)
            distorted_image = tf.image.random_contrast(distorted_image,
                                                       lower=0.2, upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is
        # faster.
        x_train_batch, y_train_batch = tf.train.batch([image, label],
                                                      batch_size=batch_size,
                                                      capacity=capacity,
                                                      num_threads=8)

        # NOTE(review): batch_size here is train_samples, not test_samples,
        # and exceeds capacity - confirm this is intended.
        x_test_batch, y_test_batch = tf.train.batch([test_image, test_label],
                                                    batch_size=train_samples,
                                                    capacity=capacity,
                                                    num_threads=8,
                                                    name='test_batch',
                                                    shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)

    callbacks = None

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    model_init = make_model(x_train_input, num_classes,
                            filepath if checkpt_flag else None)
    x_train_out = model_init.output
    # model_init.summary()

    lr = 0.0001 * ngpus
    if ngpus > 1:
        model = make_parallel(model_init, gdev_list)
    else:
        # Must re-instantiate model per API below otherwise doesn't work.
        model_init = Model(inputs=[x_train_input], outputs=[x_train_out])
        model = model_init

    opt = RMSprop(lr=lr, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])

    print_mgpu_modelsummary(model)  # will print non-mgpu model as well

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=1,
                                     save_best_only=True)
        callbacks = [checkpoint]

    sess = KB.get_session()

    # Create the op for initializing variables.
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # Run the Op to initialize the variables. The preloaded data variables
    # are outside every collection, so they are initialized explicitly.
    sess.run(init_op)
    sess.run(input_images.initializer,
             feed_dict={images_initializer: x_train_feed})
    sess.run(input_labels.initializer,
             feed_dict={labels_initializer: y_train_feed})

    # Start the queue runners under a coordinator.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    val_in_train = False  # not sure how the validation part works during fit.

    start_time = time.time()
    model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        validation_steps=val_in_train,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} ms'.format('TRAINING',
                                          int(elapsed_time * 1000)))

    # NOTE(review): when a checkpoint path is given the checkpoint callback
    # writes to `filepath`, but the weights are reloaded from `checkptfile`
    # - confirm these refer to the same file.
    weights_file = checkptfile  # './saved_cifar10_wt.h5'
    if not checkpt_flag:  # empty list
        model.save_weights(checkptfile)

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)

    KB.clear_session()

    # Second Session. Demonstrate that the model works
    test_model = make_model(x_test.shape[1:], num_classes)
    test_model.load_weights(weights_file)
    test_model.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])

    if data_augmentation:
        # Fetch images and labels in ONE run call: two separate calls
        # dequeue two different batches, so images and labels would not
        # correspond.
        # NOTE(review): the queue runners were already stopped above -
        # confirm this dequeue can still be served.
        x_proccessed, y_proccessed = sess.run([x_test_batch, y_test_batch])
        loss, acc = test_model.evaluate(x_proccessed, y_proccessed)
    else:
        loss, acc = test_model.evaluate(x_test, y_test)

    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))
def main():
    '''Train an MNIST CNN from TF shuffle-batch queues, then evaluate it.

    Feeds the training images/labels through ``tf.train.shuffle_batch``,
    trains a (optionally multi-GPU) Keras model on those tensors, saves the
    weights to ``saved_wt.h5``, then rebuilds the model on a plain Keras
    input, loads the weights and evaluates on the validation split.
    '''
    # user options
    batch_size = 128
    val_in_train = False  # not sure how the validation part works during fit.
    use_model_checkpt = False

    # demo processing
    sess = tf.Session()
    KB.set_session(sess)

    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)
    batch_size = batch_size * ngpus

    data = mnist.load_mnist()
    X_train = data.train.images

    train_samples = X_train.shape[0]  # 60000

    height_nrows = 28
    width_ncols = 28
    batch_shape = [batch_size, height_nrows, width_ncols, 1]
    epochs = 5
    # Floor division keeps the step count an integer on Python 3 as well.
    steps_per_epoch = train_samples // batch_size
    nclasses = 10

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    min_after_dequeue = 3000

    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension. If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    enqueue_many = True

    x_train_batch, y_train_batch = tf.train.shuffle_batch(
        tensors=[data.train.images, data.train.labels.astype(np.int32)],
        batch_size=batch_size,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue,
        enqueue_many=enqueue_many,
        num_threads=8)

    x_train_batch = tf.cast(x_train_batch, tf.float32)
    x_train_batch = tf.reshape(x_train_batch, shape=batch_shape)

    y_train_batch = tf.cast(y_train_batch, tf.int32)
    y_train_batch = tf.one_hot(y_train_batch, nclasses)

    x_train_input = Input(tensor=x_train_batch)

    x_train_out = cnn_layers(x_train_input, nclasses)
    train_model = Model(inputs=[x_train_input], outputs=[x_train_out])
    if ngpus > 1:
        train_model = make_parallel(train_model, gdev_list)

    lr = 2e-3 * ngpus
    train_model.compile(optimizer=RMSprop(lr=lr, decay=1e-5),
                        loss='categorical_crossentropy',
                        metrics=['accuracy'],
                        target_tensors=[y_train_batch])

    if ngpus > 1:
        print_mgpu_modelsummary(train_model)
    else:
        train_model.summary()

    # Callbacks
    if use_model_checkpt:
        mon = 'val_acc' if val_in_train else 'acc'
        checkpoint = ModelCheckpoint(
            'saved_wt.h5', monitor=mon, verbose=0,
            save_best_only=True,
            save_weights_only=True)
        callbacks = [checkpoint]
    else:
        callbacks = []

    # Training is slower with the callback, and MNIST is too trivial a
    # model/dataset to benchmark multi-GPU speedup; this example only
    # illustrates the mechanics.

    # Start the queue runners exactly once, under the coordinator, so the
    # coordinator owns the runner threads it is later asked to stop and
    # join. (An extra uncoordinated start used to precede this one.)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    # Fit the model using data from the queue-fed tensors.
    start_time = time.time()
    train_model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} ms'.format('TRAINING',
                                          int(elapsed_time * 1000)))

    if not callbacks:  # no checkpoint callback wrote the weights
        train_model.save_weights('./saved_wt.h5')

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)

    KB.clear_session()

    # Second Session. Demonstrate that the model works and is independent of
    # the queue pipeline, and test loading a trained model without tensors.
    x_test = np.reshape(data.validation.images,
                        (data.validation.images.shape[0], 28, 28, 1))
    y_test = data.validation.labels
    x_test_inp = KL.Input(shape=(x_test.shape[1:]))
    test_out = cnn_layers(x_test_inp, nclasses)
    test_model = Model(inputs=x_test_inp, outputs=test_out)

    test_model.load_weights('saved_wt.h5')
    test_model.compile(optimizer='rmsprop',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    test_model.summary()

    loss, acc = test_model.evaluate(x_test, to_categorical(y_test))
    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))
def main(argv=None):
    '''Train a CIFAR10 CNN data-parallel over all workers' devices.

    Sets up a distributed TensorFlow cluster from SLURM, joins parameter
    servers and all non-chief workers to the server, and has the chief build
    the model under a replica device setter, parallelize it over every
    worker device and train it (with or without data augmentation).

    :param argv: Optional list of extra command-line arguments; when given
        it is appended to ``sys.argv`` before the CLI parser is built.
    '''
    main.__doc__ = __doc__
    # list.extend() mutates in place and returns None, so its result must
    # not be assigned back to argv.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))

    # CLI parser
    args = parser_(desc)
    rdma = args.rdma

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = checkpt is not None
    filepath = checkpt

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug
    logdevp = args.logdevp

    # ------------------------------------------ Distributed setup on SLURM
    scpar = SlurmClusterParser()
    cmgr_facade = TFClusterManagerFacade(scpar.num_tasks_per_host,
                                         scpar.hostnames,
                                         scpar.num_parameter_servers,
                                         scpar.my_proc_id)

    logdevp_flag = bool(_DEVPROF or logdevp)
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(
        log_device_placement=logdevp_flag,
        allow_soft_placement=True,
        gpu_options=gpu_options)

    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in server below.
    server = cmgr_facade.get_server(config,
                                    protocol='grpc+verbs' if rdma else None)
    tfsess = cmgr_facade.get_session(server)
    KB.set_session(tfsess)

    job_type = cmgr_facade.myjobtype
    is_chief = cmgr_facade.is_chief

    if job_type == JobType.ps:
        # JOIN PARAMETER SERVERS
        cmgr_facade.join(server)

    # Once the server is started everything but the chief worker can join
    # the server and wait to process/service graph computations. Chief
    # pushes the compute graph. COMPARE TO: cifar10_cnn_distrib_v2_slurm
    if not is_chief:
        # JOIN WORKERS EXCEPT FOR CHIEF
        cmgr_facade.join(server)

    # The ngpus per host needs to be done with MPI or somehow sync'd.
    # Currently assuming all hosts have the same number of GPUs.
    gdev_list = get_available_gpus()
    ngpus = len(gdev_list)

    # List of all devices. The devices might be associated to the same
    # worker.
    wgdev_list = cmgr_facade.get_allworkers_devlist(ngpus)
    # If 2 workers ea. w/ 4 devices then nworker_devices_total == 2 * 4 = 8
    # If 4 workers ea. w/ 1 devices then nworker_devices_total == 4 * 1 = 4
    nworker_devices_total = len(wgdev_list)

    batch_size = batch_size * nworker_devices_total

    # -------------------------------- Data loading and basic preprocessing
    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # ----------------------------------------- Setup model and parallelize
    def _load_fn(unused_op):
        return 1

    cspec = cmgr_facade.get_cluster_spec()
    num_ps = cmgr_facade.num_ps
    ps_strategy = \
        tf.contrib.training.GreedyLoadBalancingStrategy(num_ps, _load_fn)

    rdsetter = tf.train.replica_device_setter(
        cluster=cspec,
        ps_strategy=ps_strategy
        # ps_device=ps_device,  # '/job:ps/cpu:0'  # seems to work
    )
    with tf.device(rdsetter):
        model_init = make_model(x_train.shape, num_classes,
                                filepath if checkpt_flag else None)

    callbacks = None
    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    print('\n\tCLUSTER_SPEC_DICT: {}\n\tWGDEV_LIST: {}\n'.format(
        cmgr_facade.clusterspec_dict,
        [dev.to_string() for dev in wgdev_list]))  # DEBUG

    # Data-Parallelize the model via function or class.
    model = make_parallel(model_init, wgdev_list)
    print_mgpu_modelsummary(model)

    # -------------------------------------------------------- Run training
    lr = 0.0001 * nworker_devices_total
    opt = RMSprop(lr=lr, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)
    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train, y_train,
                                         batch_size=batch_size),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            callbacks=callbacks)

    # --------------------------------------------------------- STOP SERVER
    cmgr_facade.stop_chief(server)
# Classifier head over the shared image input and the ROI input.
model_classifier = Model([img_input, roi_input], classifier)

# this is a model that holds both the RPN and the classifier, used to
# load/save weights for the models
model_all = Model([img_input, roi_input], rpn[:2] + classifier)

# Loading pretrained base-network weights is best effort: on failure a hint
# is printed and setup continues. Catch Exception rather than using a bare
# except so KeyboardInterrupt / SystemExit still propagate.
try:
    print('loading weights from {}'.format(C.base_net_weights))
    model_rpn.load_weights(C.base_net_weights, by_name=True)
    model_classifier.load_weights(C.base_net_weights, by_name=True)
except Exception:
    print('Could not load pretrained model weights. Weights can be found in the keras application folder \
https://github.com/fchollet/keras/tree/master/keras/applications')

# ------------------------------------------------------ User multi GPU support
gpus = get_available_gpus()
ngpus = len(gpus)
# Summary is printed before and after parallelization for comparison.
print_mgpu_modelsummary(model_rpn)
model_rpn = make_parallel(model_rpn, gpus)
print_mgpu_modelsummary(model_rpn)

optimizer = Adam(lr=1e-5)
optimizer_classifier = Adam(lr=1e-5)
model_rpn.compile(optimizer=optimizer,
                  loss=[losses.rpn_loss_cls(num_anchors),
                        losses.rpn_loss_regr(num_anchors)])
model_classifier.compile(
    optimizer=optimizer_classifier,
    loss=[losses.class_loss_cls,
          losses.class_loss_regr(len(classes_count) - 1)],
    metrics={'dense_class_{}'.format(len(classes_count)): 'accuracy'})
# model_all is only compiled so its weights can be saved/loaded.
model_all.compile(optimizer='sgd', loss='mae')

epoch_length = 1000
num_epochs = int(options.num_epochs)
def main(argv=None):
    '''Train a convolutional VAE on MNIST and save latent-space plots.

    The module docstring is reused as the command-line description; it is
    assigned to ``main.__doc__`` each time the function runs.

    :param argv: Optional extra command-line arguments.  When given they are
        appended to ``sys.argv`` before ``parser_`` is called.

    Side effects: trains the model, prints the validation loss and writes
    ``vae_scatter.ps`` and ``vae_digit.ps`` (paths relative to the working
    directory).
    '''
    main.__doc__ = __doc__
    if argv is not None:
        # list.extend() returns None, so its result must not be bound to a
        # name; only the in-place extension of sys.argv matters here.
        sys.argv.extend(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))

    # CLI parser
    args = parser_(desc)
    mgpu = getattr(args, 'mgpu', None)
    if mgpu is None:
        mgpu = 1

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    gpus_list = get_available_gpus(mgpu)
    # Scaling factor for batch size and learning rate.  Clamp to 1 so that a
    # run without any visible GPU does not end up with batch_size == 0,
    # lr == 0.0 and a ZeroDivisionError in the validation step below.
    ngpus = max(1, len(gpus_list))
    batch_size = 128 * ngpus

    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)

    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    # Scale pixels to [0, 1] and add the channel axis in the backend's order.
    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    print('x_train.shape:', x_train.shape)

    vae_serial, encoder, generator = make_vae_and_codec(
        original_img_size, img_chns, img_rows, img_cols, batch_size,
        filters, num_conv, intermediate_dim, latent_dim, epsilon_std)
    # : :type vae: Model
    vae = make_parallel(vae_serial, gpus_list)

    # Learning rate is scaled linearly with the number of devices.
    lr = 0.001 * ngpus
    opt = RMSprop(lr)  # 'rmsprop'
    # loss=None: NOTE(review) presumably the VAE loss is attached inside
    # make_vae_and_codec -- confirm.
    vae.compile(optimizer=opt, loss=None)
    print_mgpu_modelsummary(vae)

    callbacks = [BatchTiming(), SamplesPerSec(batch_size)]

    vae.fit(x_train, shuffle=True, epochs=epochs, batch_size=batch_size,
            callbacks=callbacks)
    # validation_data=(x_test, None) is not accurate for mgpu, so the serial
    # model is evaluated separately with the per-device batch size.
    vae_val = vae_serial
    vae_val.compile(optimizer=opt, loss=None)
    loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size // ngpus)
    print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    # plt.show()
    plt.savefig('vae_scatter.ps')
    plt.close()

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # Linearly spaced coordinates on the unit square were transformed through
    # the inverse CDF (ppf) of the Gaussian
    # To produce values of the latent variables z, since the prior of the
    # latent space is Gaussian
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
            # Repeat the single latent point to fill a whole batch; only the
            # first decoded image is used.  NOTE(review): presumably needed
            # because the generator was built for a fixed batch size.
            z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
            x_decoded = generator.predict(z_sample, batch_size=batch_size)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size: (i + 1) * digit_size,
                   j * digit_size: (j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    # plt.show()
    plt.savefig('vae_digit.ps')
    plt.close()
def retain(ARGS):
    '''Assemble the RETAIN network and return it as an uncompiled Keras Model.

    Attributes read from ``ARGS``: ``emb_size``, ``numeric_size``,
    ``allow_negative``, ``num_codes``, ``dropout_input``, ``use_time``,
    ``recurrent_size``, ``l2`` and ``dropout_context``.

    The model always takes the codes input; a numeric input is added when
    ``ARGS.numeric_size`` is truthy and a time input when ``ARGS.use_time``
    is truthy (in that order).
    '''
    # Width of one context vector: embedding size plus numeric feature count.
    context_width = ARGS.emb_size + ARGS.numeric_size

    # Constraints/activation depend on whether negative weights are allowed.
    if ARGS.allow_negative:
        emb_constraint, beta_act, out_constraint = FreezePadding(), 'tanh', None
    else:
        emb_constraint = FreezePadding_Non_Negative()
        beta_act = 'sigmoid'
        out_constraint = non_neg()

    # Empty list when no GPU is available.
    gpu_devices = get_available_gpus()

    def reshape(data):
        '''Turn the 2D context tensor into (batch, 1, context_width).'''
        batch_dim = K.shape(data)[0]
        return K.reshape(x=data, shape=(batch_dim, 1, context_width))

    # --- Inputs -----------------------------------------------------------
    codes = L.Input((None, None), name='codes_input')
    model_inputs = [codes]

    # Embed every code, then sum the embeddings over axis 2 to get one
    # vector per visit.
    code_vectors = L.Embedding(ARGS.num_codes + 1,
                               ARGS.emb_size,
                               name='embedding',
                               embeddings_constraint=emb_constraint)(codes)
    visit_embs = L.Lambda(lambda tensor: K.sum(tensor, axis=2))(code_vectors)

    # Optional numeric features are concatenated onto the visit embeddings.
    full_embs = visit_embs
    if ARGS.numeric_size:
        numerics = L.Input((None, ARGS.numeric_size), name='numeric_input')
        model_inputs.append(numerics)
        full_embs = L.concatenate([visit_embs, numerics], name='catInp')

    # Input dropout
    full_embs = L.Dropout(ARGS.dropout_input)(full_embs)

    # Optional time feature is only fed to the recurrent layers.
    rnn_input = full_embs
    if ARGS.use_time:
        time_input = L.Input((None, 1), name='time_input')
        model_inputs.append(time_input)
        rnn_input = L.concatenate([full_embs, time_input], name='catInp2')

    # --- Recurrent layers -------------------------------------------------
    # Bidirectional LSTMs are used instead of feeding reversed sequences
    # (see https://github.com/mp2893/retain/issues/3 for more details).
    # With a GPU present the CuDNN implementation is picked for speed.
    if gpu_devices:
        rnn_cls, rnn_extra = L.CuDNNLSTM, {}
    else:
        rnn_cls, rnn_extra = L.LSTM, {'implementation': 2}

    def bidirectional(layer_name):
        '''Create one named bidirectional recurrent layer.'''
        inner = rnn_cls(ARGS.recurrent_size, return_sequences=True,
                        **rnn_extra)
        return L.Bidirectional(inner, name=layer_name)

    alpha = bidirectional('alpha')
    beta = bidirectional('beta')

    alpha_dense = L.Dense(1, kernel_regularizer=l2(ARGS.l2))
    beta_dense = L.Dense(context_width,
                         activation=beta_act,
                         kernel_regularizer=l2(ARGS.l2))

    # --- Attention --------------------------------------------------------
    # alpha: visit-level attention, softmax over axis 1
    visit_attention = alpha(rnn_input)
    visit_attention = L.TimeDistributed(
        alpha_dense, name='alpha_dense_0')(visit_attention)
    visit_attention = L.Softmax(axis=1)(visit_attention)

    # beta: code-level attention
    code_attention = beta(rnn_input)
    code_attention = L.TimeDistributed(
        beta_dense, name='beta_dense_0')(code_attention)

    # Context vector: attention-weighted embeddings summed over axis 1.
    weighted = L.Multiply()([visit_attention, code_attention, full_embs])
    context = L.Lambda(lambda tensor: K.sum(tensor, axis=1))(weighted)

    # Back to 3D so Many-to-Many and Many-to-One implementations agree.
    context = L.Lambda(reshape)(context)

    # --- Prediction -------------------------------------------------------
    context = L.Dropout(ARGS.dropout_context)(context)
    output_layer = L.Dense(1,
                           activation='sigmoid',
                           name='dOut',
                           kernel_regularizer=l2(ARGS.l2),
                           kernel_constraint=out_constraint)

    # TimeDistributed keeps the output shape consistent between the
    # Many-to-Many and Many-to-One implementations.
    output = L.TimeDistributed(
        output_layer, name='time_distributed_out')(context)

    return Model(inputs=model_inputs, outputs=[output])
def test(cluster_parser_spec):
    '''Run a "hello world" and a small compute graph on a TF cluster.

    :param cluster_parser_spec: Cluster description handed straight to
        ``TFClusterManagerFacade``.

    Every process builds the server; processes that are not the chief join
    it (per the original notes they then wait to service graph
    computations).  The chief pushes the graphs, prints the results and
    finally calls ``stop_chief``.
    '''
    # The config given to the server at instantiation must be the same one
    # used for sessions later on; TFClusterManagerFacade wraps that.
    session_config = tf.ConfigProto(
        log_device_placement=False,  # True,
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True))

    cmgr_facade = TFClusterManagerFacade(cluster_parser_spec)

    #: :type cluster_spec: tf.train.ClusterSpec
    cluster_spec = cmgr_facade.get_cluster_spec()

    # TF 1.2.x RDMA: specify protocol='grpc+verbs' in get_server below.
    server = cmgr_facade.get_server(session_config)

    # Once the server is up, everything except the chief worker (parameter
    # servers included) joins it.  The chief carries on and pushes the graph.
    if not cmgr_facade.is_chief:
        cmgr_facade.join(server)

    # The GPU count per host is taken from the local host only; all hosts
    # are assumed to have the same number of GPUs (would need MPI or some
    # other sync to do this properly).
    local_gpus = get_available_gpus()
    #: :type worker_devices: list of tf.DeviceSpec
    worker_devices = cmgr_facade.get_allworkers_devlist(len(local_gpus))

    compute_graph = distrib_graph(worker_devices)

    with cmgr_facade.get_session(server) as sess:
        # Have the chief wait just in case; errors show up occasionally
        # otherwise.  A READY queue (like the DONE queues) might be better.
        sleep(2)

        # To use the replica setter with tf.train.Supervisor, do NOT join
        # the workers above.  It is not strictly needed just to print
        # "Hello, ...".
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            hello_tf = tf.constant("Hello, distributed TensorFlow!")
            result = sess.run(hello_tf)
            print('RESULT:\n{}\n'.format(result))

        # Keep retrying until both the local and the distributed run succeed;
        # any Exception is reported and the attempt is repeated.
        finished = False
        while not finished:
            try:
                local_result = sess.run(calcm())
                print('RESULT NOT DISTRIBUTED:\n{}\n'.format(local_result))

                distributed_result = sess.run(compute_graph)
                print('RESULT DISTRIBUTED:\n{}\n'.format(distributed_result))

                finished = True
            except Exception as err:
                traceback.print_exc()
                print('INHIBITING ERROR: {}'.format(err), file=sys.stderr)

        # cmgr_facade.stop_chief(server, sess=sess) here works too.

    cmgr_facade.stop_chief(server)