def after_run(self, run_context, run_values):
    if self.step % self.log_freq == 0:
        current_time = time.time()
        duration = current_time - self.start_time
        self.start_time = current_time

        loss_value = np.asarray([run_values.results[0]], dtype='float32')
        acc_value = np.asarray([run_values.results[1]], dtype='float32')
        #lr = np.asarray([run_values.results[2]], dtype='float32')
        #epoch = np.asarray([run_values.results[3]], dtype='float32')

        examples_per_sec = self.log_freq * self.batch_size / duration
        sec_per_batch = float(duration / self.log_freq)

        # average the per-rank metrics across all workers with the CPE ML Plugin
        mc.average(loss_value)
        mc.average(acc_value)

        format_str = ('%s: step %d, loss = %.3f, acc = %.3f, '
                      '(%.1f examples/sec; %.3f sec/batch)')
        if mc.get_rank() == 0:
            print("available values = ", run_values)
            print(format_str % (datetime.now(), self.step, loss_value, acc_value,
                                examples_per_sec, sec_per_batch))

        self.samps += examples_per_sec
        self.perf += sec_per_batch
        self.sums += 1
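# The hook above assumes a before_run() that requests the loss and accuracy
# tensors and advances the step counter. A minimal sketch of that counterpart
# follows; the attribute names (self.loss_op, self.acc_op) are illustrative
# assumptions, not taken from the original class.
def before_run(self, run_context):
    self.step += 1
    # ask the session to also evaluate loss and accuracy on this run;
    # their values arrive in run_values.results[0] and [1] in after_run()
    return tf.train.SessionRunArgs([self.loss_op, self.acc_op])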
def on_train_begin(self, logs=None):
    sess = K.get_session()

    # Split variables based on type -> float32 vs all else
    test_v = tf.Variable([0], dtype=tf.float32)
    all_vars = tf.trainable_variables()
    float_vars = [v for v in all_vars if v.dtype == test_v.dtype]
    other_vars = [v for v in all_vars if v.dtype != test_v.dtype]

    # Initialize variables and broadcast from head node
    sess.run(tf.variables_initializer(all_vars))
    new_vars = mc.broadcast(float_vars, 0)
    bcast = tf.group(
        *[tf.assign(v, new_vars[k]) for k, v in enumerate(float_vars)])
    sess.run(bcast)

    # Validate broadcast
    if self.validate:
        py_all_vars = [sess.run(v) for v in float_vars]
        var_types = [
            np.array([v]) if type(v) == np.float32 else v
            for v in py_all_vars
        ]
        if mc.get_rank() == 0:
            if mc.check_buffers_match(var_types, 1) != 0:
                tf.logging.error(
                    "Not all processes have the same initial model!")
            else:
                tf.logging.info("Initial model is consistent on all ranks")
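# A minimal sketch of how a broadcast callback like the one above might be
# wired into a Keras run with the CPE ML Plugin. The callback class name
# (BcastCallback) and the model/dataset arguments are placeholders, and the
# plugin buffer size is derived from the trainable weights; only mc.init,
# mc.get_rank and mc.finalize from the surrounding code are assumed here.
import ml_comm as mc
from keras import backend as K

def run_training(model, x_train, y_train, batch_size=64, epochs=10):
    totsize = int(sum(K.count_params(w) for w in model.trainable_weights))
    mc.init(1, 1, totsize, "tensorflow")          # one team, single thread
    callbacks = [BcastCallback(validate=True)]    # broadcasts rank-0 weights
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              verbose=1 if mc.get_rank() == 0 else 0)
    mc.finalize()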
def end(self, session):
    self.loss = self.loss / self.step
    self.acc = self.acc / self.step
    mc.average(self.loss)
    mc.average(self.acc)

    format_str = ('EVAL Session ENDED at %s: step %d, loss = %.3f, '
                  'accuracy = %.3f (%.1f examples/sec; %.3f sec/batch)')
    if mc.get_rank() == 0:
        print(format_str % (datetime.now(), self.step, self.loss, self.acc,
                            self.samps / self.step, self.perf / self.step))
def end(self, session):
    lr = session.run(self.lr)
    #epoch = session.run(self.epoch)
    format_str = ('TRAIN Session ENDED at %s: step %d '
                  '(%.1f examples/sec; %.3f sec/batch), learning rate: %.5f')
    self.epoch_true = tf.train.global_step(session, self.epoch) / (self.step + 1)
    if mc.get_rank() == 0:
        print('Epoch: ', self.epoch_true)
        print('global_step: %s' % tf.train.global_step(session, self.epoch))
        print(format_str % (datetime.now(), self.step, self.samps / self.sums,
                            self.perf / self.sums, lr))
def after_run(self, run_context, run_values):
    current_time = time.time()
    duration = current_time - self.start_time
    self.start_time = current_time

    loss_value = np.asarray([run_values.results[0]], dtype='float32')
    acc_value = np.asarray([run_values.results[1]], dtype='float32')
    examples_per_sec = self.batch_size / duration
    sec_per_batch = duration

    self.samps += examples_per_sec
    self.perf += sec_per_batch
    self.loss += loss_value
    self.acc += acc_value

    if mc.get_rank() == 0:
        print("Eval step {:9d}".format(self.step))
def learning_rate_fn(global_step):
    global_step = tf.cast(global_step, tf.float32)
    #cstep = global_step*num_images/eff_batch_size
    epoch = (d_steps + w_steps) / (global_step + 1)
    total_epochs = decay_epochs + warmup_epochs
    tf.Print(epoch, [epoch], "Epoch: ")
    #current_step = tf.Print(cstep, [cstep], "Current train steps so far: ")

    if mlcomm == 1:
        current_lr = learning_rate_0 * math.pow(
            1.0 - (decay_steps - warmup_steps) / decay_steps, 2)
        if mc.get_rank() == 0:
            print("Using Cray learning_rate_warmup_poly_decay(): ")
            print("  -> effective batch size: ", eff_batch_size)
            print("  -> batches per epoch: ", batches_per_epoch)
            print("  -> initial learning rate: ", learning_rate_0)
            print("  -> learning rate base: ", learning_rate_base)
            print("  -> starting global learning rate at first epoch: ", current_lr)
            print("  -> decay after ", decay_epochs, " epochs")
            print("  -> decay steps: ", decay_steps)
            print("  -> warmup with ", warmup_epochs, " epochs")
            print("  -> warmup steps: ", warmup_steps)
            print("  -> number workers: ", mc.get_nranks())
            #print("  -> Finished Epoch: ", tf.get_session_tensor(epoch), "/", total_epochs)

    #global_step = tf.cast(global_step, tf.float32)
    def lr_warmup():
        # linear ramp from lr_0 to lr_base over the warmup steps
        return lr_0 + global_step * (lr_base - lr_0) / w_steps

    def lr_poly():
        # quadratic polynomial decay from lr_base after warmup
        return lr_base * math_ops.pow(1 - (global_step - w_steps) / d_steps, 2)

    return tf.cond(tf.less(global_step, warmup_steps),
                   lambda: lr_warmup(), lambda: lr_poly())
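# A framework-free sketch of the same warmup + polynomial-decay schedule,
# handy for checking the curve offline before a run. All step counts and
# rates below are placeholder values, not the original configuration.
def warmup_poly_lr(step, w_steps=500, d_steps=10000, lr_0=0.001, lr_base=0.1):
    if step < w_steps:
        # linear warmup from lr_0 to lr_base over w_steps
        return lr_0 + step * (lr_base - lr_0) / w_steps
    # quadratic decay from lr_base over the remaining d_steps
    return lr_base * (1.0 - (step - w_steps) / float(d_steps)) ** 2

# e.g. warmup_poly_lr(0) -> 0.001, warmup_poly_lr(500) -> 0.1,
#      warmup_poly_lr(10500) -> 0.0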
    summary_op=train_summary))

# Add an op to initialize the variables.
init_global_op = tf.global_variables_initializer()
init_local_op = tf.local_variables_initializer()

#saver class:
model_saver = tf.train.Saver()

print("Rank", args["task_index"],
      ": starting training using " + args['optimizer'] + " optimizer")
with tf.train.MonitoredTrainingSession(
        config=sess_config,
        checkpoint_dir=(args['modelpath'] if mc.get_rank() == 0 else None),
        save_checkpoint_secs=300,
        hooks=hooks) as sess:

    #initialize variables
    sess.run([init_global_op, init_local_op])

    #do the training loop
    total_time = time.time()
    train_loop(sess, bcast_hook, train_step, global_step, optlist, args,
               trainset, validationset)
    total_time = time.time() - total_time
    print("FINISHED Training. Total time %g" % total_time)

#clean up comm buffers
mc.finalize()
def resnet_main(flags, model_function, input_function,
                num_train_samps, num_eval_samps, shape=None):
    """Shared main loop for ResNet Models.

    Args:
      flags: FLAGS object that contains the params for running. See
        ResnetArgParser for created flags.
      model_function: the function that instantiates the Model and builds the
        ops for train/eval. This will be passed directly into the estimator.
      input_function: the function that processes the dataset and returns a
        dataset that the estimator can train on. This will be wrapped with
        all the relevant flags for running and passed to estimator.
      num_train_samps: number of samples in the training set.
      num_eval_samps: number of samples in the evaluation set.
      shape: list of ints representing the shape of the images used for
        training. This is only used if flags.export_dir is passed.
    """
    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # There are two steps required if using multi-GPU: (1) wrap the
        # model_fn, and (2) wrap the optimizer. The first happens here, and
        # (2) happens in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function, loss_reduction=tf.losses.Reduction.MEAN)

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    myrank = 0
    numworkers = 1
    if flags.enable_ml_comm == 1:
        # Initialize the Cray PE ML Plugin and configure the thread team
        # (correcting the number of steps for the effective batch size).
        #totsize = sum([reduce(lambda x, y: x*y, v.get_shape().as_list())
        #               for v in tf.trainable_variables()])
        totsize = 25551401  # specific size for ResNet50-v2
        mc.init(2, 1, totsize, "tensorflow")
        myrank = mc.get_rank()
        numworkers = mc.get_nranks()
        if myrank == 0:
            print("ResNet with {:9d} parameters".format(totsize))

        max_steps_train = int(
            math.ceil(flags.train_epochs * (num_train_samps + num_eval_samps) /
                      (mc.get_nranks() * flags.batch_size)))
        # config_team(team, thread, num_steps_before_going_nonblock,
        #             max_steps_train, verbose=1, how_often_to_print=100)
        mc.config_team(0, 0, max_steps_train, max_steps_train, 1, 100)

        # only rank 0 writes checkpoints, benchmark logs and exported models
        flags.model_dir = flags.model_dir if mc.get_rank() == 0 else None
        flags.benchmark_log_dir = (flags.benchmark_log_dir
                                   if mc.get_rank() == 0 else None)
        flags.export_dir = flags.export_dir if mc.get_rank() == 0 else None
    else:
        rank_id = myrank

    session_config = tf.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_steps=500, session_config=session_config)

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags.model_dir,
        config=run_config,
        params={
            'resnet_size': flags.resnet_size,
            'data_format': flags.data_format,
            'batch_size': flags.batch_size,
            'multi_gpu': flags.multi_gpu,
            'train_epochs': flags.train_epochs,
            'version': flags.version,
            'loss_scale': flags.loss_scale,
            'dtype': flags.dtype,
            'mlcomm': flags.enable_ml_comm,
            'log_freq': flags.global_perf_log_freq,
            'weight_decay': flags.weight_decay,
            'init_lr': flags.init_lr,
            'base_lr': flags.base_lr,
            'warmup_epochs': flags.warmup_epochs,
        })

    benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
    benchmark_logger.log_run_info('resnet')

    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)

        if myrank == 0:
            print('Starting a training cycle.')

        def input_fn_train():
            return input_function(True, flags.data_dir, flags.batch_size,
                                  flags.epochs_between_evals,
                                  flags.num_parallel_calls, flags.multi_gpu,
                                  numworkers, myrank)

        tsteps = math.ceil(
            float(flags.epochs_between_evals * num_train_samps) /
            (numworkers * flags.batch_size))
        classifier.train(input_fn=input_fn_train,
                         steps=tsteps,
                         max_steps=flags.max_train_steps)

        if myrank == 0:
            print('Starting to evaluate.')

        # Evaluate the model and print results
        def input_fn_eval():
            return input_function(False, flags.data_dir, flags.batch_size, 3,
                                  flags.num_parallel_calls, flags.multi_gpu,
                                  numworkers, myrank)

        # flags.max_train_steps is generally associated with testing and
        # profiling. As a result it is frequently called with synthetic data,
        # which will iterate forever. Passing an explicit step count allows
        # the eval (which is generally unimportant in those circumstances) to
        # terminate. Note that eval will run for esteps each loop, regardless
        # of the global_step count.
        esteps = math.ceil(
            float(num_eval_samps) / (numworkers * flags.batch_size))
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=esteps)

        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags.export_dir is not None:
        warn_on_multi_gpu_export(flags.multi_gpu)

        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags.batch_size)
        classifier.export_savedmodel(flags.export_dir, input_receiver_fn)

    if flags.enable_ml_comm == 1:
        mc.finalize()
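# A small worked example of the per-rank step counts computed above, with
# illustrative numbers (not the original configuration): 1,281,167 training
# samples, 50,000 eval samples, 8 workers, per-rank batch size 256, and one
# epoch between evaluations.
import math

num_train_samps = 1281167
num_eval_samps = 50000
numworkers = 8
batch_size = 256
epochs_between_evals = 1

tsteps = math.ceil(float(epochs_between_evals * num_train_samps) /
                   (numworkers * batch_size))   # -> 626 train steps per rank
esteps = math.ceil(float(num_eval_samps) /
                   (numworkers * batch_size))   # -> 25 eval steps per rank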
def rank():
    return mc.get_rank()
def build_model(config, args, print_summary=True):
    # feature_size = (3,3,3)
    # Pooling = kl.MaxPooling3D
    image_shape = config['data_handling']['image_shape']
    input_image = kl.Input(shape=tuple([1] + image_shape))
    logger.debug('input image = %s', input_image)

    layer_num = 0
    outputs = []
    for i in range(0, image_shape[0], 4):
        logger.debug('i = %s', i)
        # bind the loop index at definition time so each Lambda slices its own sub-volume
        subimg = kl.Lambda(lambda x, i=i: x[:, :, i:i + 4, :, :])(input_image)

        num_filters = 64
        x = subimg
        x = conv_layer(subimg, num_filters=num_filters)
        # Instantiate the stack of residual units
        for stack in range(2):
            for res_block in range(2):
                strides = (1, 1, 1)
                if stack > 0 and res_block == 0:  # first layer but not first stack
                    strides = (2, 2, 2)  # downsample
                y = conv_layer(inputs=x, num_filters=num_filters, strides=strides)
                y = conv_layer(inputs=y, num_filters=num_filters, activation=None)
                if stack > 0 and res_block == 0:  # first layer but not first stack
                    # linear projection residual shortcut connection to match
                    # changed dims
                    x = conv_layer(x,
                                   num_filters=num_filters,
                                   kernel_size=1,
                                   strides=strides,
                                   activation=None,
                                   batch_normalization=False)
                x = kl.add([x, y])
                x = kl.Activation('relu')(x)
            num_filters *= 2
        outputs.append(x)

    num_filters = int(num_filters / 2)
    logger.debug('filters = %s', num_filters)
    # logger.debug('outputs = %s',outputs)
    x = kl.Concatenate(axis=2)(outputs)
    logger.debug('concat: %s', x)

    # Instantiate the stack of residual units
    for stack in range(2):
        logger.debug('stack: %s', stack)
        for res_block in range(3):
            logger.debug('res_block: %s', res_block)
            strides = (1, 1, 1)
            if stack > 0 and res_block == 0:  # first layer but not first stack
                strides = (2, 2, 2)  # downsample
            logger.debug('x: %s', x)
            y = conv_layer(x, num_filters=num_filters, strides=strides)
            logger.debug('y: %s', y)
            y = conv_layer(y, num_filters=num_filters, activation=None)
            logger.debug('y: %s', y)
            if stack > 0 and res_block == 0:  # first layer but not first stack
                # linear projection residual shortcut connection to match
                # changed dims
                x = conv_layer(x,
                               num_filters=num_filters,
                               kernel_size=1,
                               strides=strides,
                               activation=None,
                               batch_normalization=False)
            x = kl.add([x, y])
            x = kl.Activation('relu')(x)
        num_filters *= 2

    logger.debug('out = %s', x)
    x = kl.AveragePooling3D(pool_size=(1, 1, 2))(x)
    y = kl.Flatten()(x)
    # x = kl.Dense(2048,activation='relu',kernel_initializer='normal',name='dense_{0}'.format(layer_num))(output)
    # output = kl.Activation('relu',name='relu_{0}'.format(layer_num))(output)
    # output = kl.Dropout(0.1,name='dropout_{0}'.format(layer_num))(output)
    # layer_num += 1
    outputs = kl.Dense(len(config['data_handling']['classes']),
                       activation='softmax',
                       kernel_initializer='he_normal')(y)
    # output = kl.Activation('softmax',name='softmax_{0}'.format(layer_num))(output)

    model = Model(input_image, outputs)

    line_length = 150
    positions = [.2, .45, .77, 1.]
    if print_summary:
        if args.horovod:
            import horovod.keras as hvd
            if hvd.rank() == 0:
                model.summary(line_length=line_length, positions=positions)
        elif args.ml_comm:
            import ml_comm as mc
            if mc.get_rank() == 0:
                model.summary(line_length=line_length, positions=positions)
        else:
            model.summary(line_length=line_length, positions=positions)

    return model
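# conv_layer() is not defined in this listing. Below is a plausible sketch
# consistent with the keyword arguments used above (num_filters, kernel_size,
# strides, activation, batch_normalization): a Conv3D followed by optional
# batch normalization and activation. This is an assumption about the helper,
# not its original definition.
def conv_layer(inputs, num_filters=16, kernel_size=3, strides=(1, 1, 1),
               activation='relu', batch_normalization=True):
    x = kl.Conv3D(num_filters,
                  kernel_size=kernel_size,
                  strides=strides,
                  padding='same',
                  kernel_initializer='he_normal')(inputs)
    if batch_normalization:
        x = kl.BatchNormalization()(x)
    if activation is not None:
        x = kl.Activation(activation)(x)
    return x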
def train(self):
    train_step, loss, lossL1Train, train_true, train_predict = self.optimize()
    lossL1Val, val_true, val_predict = self.validation_loss()
    lossL1Test, test_true, test_predict = self.test_loss()

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.4
    ### taking config from the MKL benchmarks
    config.allow_soft_placement = True
    config.intra_op_parallelism_threads = 1  ## default
    config.inter_op_parallelism_threads = 2  ## default

    #used to save the model
    saver = tf.train.Saver()

    global best_validation_accuracy
    global last_improvement
    global total_iterations
    best_validation_accuracy = 1.0  # best validation loss seen so far
    last_improvement = 0            # iteration number of the last improvement
    require_improvement = hp.RUNPARAM['require_improvement']  # stop if no improvement in this many iterations
    total_iterations = 0            # counter for total iterations performed so far

    #initialize the CPE ML Plugin with one team (single thread for now) and the model size
    totsize = sum([reduce(lambda x, y: x * y, v.get_shape().as_list())
                   for v in tf.trainable_variables()])
    mc.init(1, 1, totsize, "tensorflow")
    # each rank processes its share of the batches per epoch (integer division)
    hp.RUNPARAM['batch_per_epoch'] = hp.RUNPARAM['batch_per_epoch'] // mc.get_nranks()
    hp.RUNPARAM['batch_per_epoch_val'] = hp.RUNPARAM['batch_per_epoch_val'] // mc.get_nranks()
    totsteps = hp.RUNPARAM['num_epoch'] * hp.RUNPARAM['batch_per_epoch']
    mc.config_team(0, 0, totsteps, totsteps, 2, 50)

    if mc.get_rank() == 0:
        print("+------------------------------+")
        print("|          CosmoFlow           |")
        print("| # Ranks      = {:5d}         |".format(mc.get_nranks()))
        print("| Global Batch = {:6d}        |".format(mc.get_nranks() * hp.Input['BATCH_SIZE']))
        print("| # Parameters = {:9d}     |".format(totsize))
        print("+------------------------------+")

    #use the CPE ML Plugin to broadcast initial model parameter values
    new_vars = mc.broadcast(tf.trainable_variables(), 0)
    bcast = tf.group(*[tf.assign(v, new_vars[k])
                       for k, v in enumerate(tf.trainable_variables())])

    if self.is_train:
        with tf.Session(config=config) as sess:
            losses_train = []
            losses_val = []
            losses = []
            val_accuracys = []
            data_accuracys = []

            #do all parameter initializations
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            sess.run(bcast)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)
            elapsed_time = 0.
            for epoch in range(hp.RUNPARAM['num_epoch']):
                save_path = os.path.join(hp.Path['Model_path'], 'best_validation')
                total_iterations += 1
                start_time = time.time()
                loss_per_epoch_val = 0
                loss_per_epoch_train = 0

                for i in range(hp.RUNPARAM['batch_per_epoch']):
                    step_start_time = time.time()
                    _, lossTrain, lossL1Train_, train_true_, train_predict_ = sess.run(
                        [train_step, loss, lossL1Train, train_true, train_predict])
                    step_finish_time = time.time()
                    elapsed_time += (step_finish_time - step_start_time)
                    samps_per_sec = mc.get_nranks() * (
                        epoch * hp.RUNPARAM['batch_per_epoch'] * hp.Input['BATCH_SIZE'] +
                        (i + 1) * hp.Input['BATCH_SIZE']) / elapsed_time
                    if mc.get_rank() == 0:
                        print("Train Step: " + str(i) + ", Samples/Sec = " +
                              str(samps_per_sec) + ", Loss = " + str(lossTrain))
                    loss_per_epoch_train += lossL1Train_

                global_loss = np.array([loss_per_epoch_train], dtype=np.float32)
                mc.average(global_loss)
                loss_per_epoch_train = global_loss / hp.RUNPARAM['batch_per_epoch']
                losses.append(loss_per_epoch_train)
                losses_train.append(loss_per_epoch_train)

                for i in range(hp.RUNPARAM['batch_per_epoch_val']):
                    if mc.get_rank() == 0:
                        print("Val Step = " + str(i))
                    loss_, val_true_, val_predict_ = sess.run([lossL1Val, val_true, val_predict])
                    loss_per_epoch_val += loss_

                global_loss = np.array([loss_per_epoch_val], dtype=np.float32)
                mc.average(global_loss)
                loss_per_epoch_val = global_loss / hp.RUNPARAM['batch_per_epoch_val']
                losses_val.append(loss_per_epoch_val)

                if loss_per_epoch_val < best_validation_accuracy:
                    best_validation_accuracy = loss_per_epoch_val
                    last_improvement = total_iterations
                    if mc.get_rank() == 0:
                        saver.save(sess=sess, save_path=save_path)

                if mc.get_rank() == 0:
                    print("Epoch {} took {:.3f}s".format(epoch, time.time() - start_time))
                    print("  training loss:   %.3f" % loss_per_epoch_train)
                    print("  validation loss: %.3f" % loss_per_epoch_val)
                    print("  best loss:       %.3f" % best_validation_accuracy)
                    np.savetxt(os.path.join(hp.Path['train_result'], 'loss_train.txt'), losses_train)
                    np.savetxt(os.path.join(hp.Path['val_result'], 'loss_val.txt'), losses_val)
                    np.savetxt(os.path.join(hp.Path['train_result'], 'losses.txt'), losses)
                    #np.savetxt(os.path.join(hp.Path['train_result'],'train_pred'+str(epoch)+'.txt'),np.c_[train_true_,train_predict_])
                    #np.savetxt(os.path.join(hp.Path['val_result'],'val_pred'+str(epoch)+'.txt'),np.c_[val_true_,val_predict_])

                if total_iterations - last_improvement > require_improvement:
                    if mc.get_rank() == 0:
                        print("No improvement found in a while, stopping optimization.")
                    break

            coord.request_stop()
            coord.join(threads)

    if self.is_test and mc.get_rank() == 0:
        save_path = os.path.join(hp.Path['Model_path'], 'best_validation')
        if self.save_path is not None:
            save_path = self.save_path

        with tf.Session() as sess:
            saver.restore(sess=sess, save_path=save_path)
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            loss_test = []
            for i in range(0, hp.RUNPARAM['iter_test']):
                start_time = time.time()
                lossL1Test_, test_true_, test_predict_ = sess.run(
                    [lossL1Test, test_true, test_predict])
                loss_test.append(lossL1Test_)
                print("Box {} took {:.3f}s".format(i, time.time() - start_time))
                print("  test loss: %.3f" % lossL1Test_)
                np.savetxt(os.path.join(hp.Path['test_result'],
                                        'test_batch_' + str(i) + '.txt'),
                           np.c_[test_true_, test_predict_])
            np.savetxt(os.path.join(hp.Path['test_result'], 'loss_test.txt'), loss_test)

            coord.request_stop()
            coord.join(threads)

    #cleanup the CPE ML Plugin
    mc.finalize()
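# A minimal, self-contained sketch of the allreduce-average pattern used for
# the per-epoch losses above: mc.average() replaces the contents of a float32
# NumPy buffer in place with its mean across all ranks. The function name and
# arguments are illustrative, not part of the original code.
import numpy as np
import ml_comm as mc

def global_mean_loss(local_loss_sum, num_local_batches):
    buf = np.array([local_loss_sum], dtype=np.float32)
    mc.average(buf)                      # in-place average across ranks
    return float(buf[0]) / num_local_batches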