Beispiel #1
0
    def after_run(self, run_context, run_values):
        """Log loss, accuracy and throughput every ``self.log_freq`` steps.

        Steps whose index is not a multiple of ``self.log_freq`` are ignored.
        On a logging step the first two entries of ``run_values.results``
        (loss, accuracy) are handed to ``mc.average``, rank 0 prints a
        report, and every process adds the measured throughput to the
        running totals ``self.samps``, ``self.perf`` and ``self.sums``.

        Args:
            run_context: unused.
            run_values: object whose ``results`` holds the fetched values.
        """
        if self.step % self.log_freq != 0:
            return

        # Wall-clock time since the previous logging step; restart the timer.
        now = time.time()
        elapsed = now - self.start_time
        self.start_time = now

        # Wrap the fetched scalars in one-element float32 arrays before
        # handing them to mc.average.
        fetched = run_values.results
        loss_value = np.asarray([fetched[0]], dtype='float32')
        acc_value = np.asarray([fetched[1]], dtype='float32')

        examples_per_sec = self.log_freq * self.batch_size / elapsed
        sec_per_batch = float(elapsed / self.log_freq)

        # NOTE(review): presumably averages across ranks in place -- confirm
        # against the plugin documentation.
        mc.average(loss_value)
        mc.average(acc_value)

        if mc.get_rank() == 0:
            print("available values = ", run_values)
            report = (
                '%s: step %d, loss = %.3f, acc = %.3f, (%.1f examples/sec; %.3f '
                'sec/batch)')
            print(report % (datetime.now(), self.step, loss_value, acc_value,
                            examples_per_sec, sec_per_batch))

        # Running totals, read elsewhere to report mean throughput.
        self.samps = self.samps + examples_per_sec
        self.perf = self.perf + sec_per_batch
        self.sums = self.sums + 1
Beispiel #2
0
    def on_train_begin(self, logs=None):
        """Initialise the trainable variables and broadcast the float32 ones.

        All trainable variables are initialised in the Keras session, then
        the float32 variables are replaced by the values returned from
        ``mc.broadcast(..., 0)`` (per the original comment, a broadcast from
        the head node). When ``self.validate`` is set, rank 0 additionally
        calls ``mc.check_buffers_match`` and logs whether the initial model
        is consistent.

        Args:
            logs: unused.
        """
        sess = K.get_session()

        # A throw-away float32 variable supplies the dtype to compare against.
        # It is created before tf.trainable_variables() is read, so it is part
        # of all_vars as well (kept as in the original).
        test_v = tf.Variable([0], dtype=tf.float32)
        all_vars = tf.trainable_variables()
        float_vars = [v for v in all_vars if v.dtype == test_v.dtype]

        # Initialize variables and broadcast from head node
        sess.run(tf.variables_initializer(all_vars))
        new_vars = mc.broadcast(float_vars, 0)
        bcast = tf.group(
            *[tf.assign(v, new_vars[k]) for k, v in enumerate(float_vars)])
        sess.run(bcast)

        # Validate Broadcast
        if self.validate:
            py_all_vars = [sess.run(v) for v in float_vars]
            # Scalars come back as np.float32; wrap them so every entry is an
            # array.
            var_types = [
                np.array([v]) if isinstance(v, np.float32) else v
                for v in py_all_vars
            ]
            # Compare by value: "is 0" tests object identity, which is not
            # guaranteed for integers.
            if mc.get_rank() == 0:
                if mc.check_buffers_match(var_types, 1) != 0:
                    tf.logging.error(
                        "Not all processes have the same initial model!")
                else:
                    tf.logging.info("Initial model is consistent on all ranks")
Beispiel #3
0
    def end(self, session):
        """Print the evaluation summary when the session ends.

        Turns the accumulated ``self.loss`` and ``self.acc`` into per-step
        means, hands both to ``mc.average``, and on rank 0 prints them
        together with the per-step mean of ``self.samps`` and ``self.perf``.

        Args:
            session: unused.
        """
        self.loss = self.loss / self.step
        self.acc = self.acc / self.step

        mc.average(self.loss)
        mc.average(self.acc)

        # Only rank 0 reports.
        if mc.get_rank() != 0:
            return

        template = (
            'EVAL Session ENDED at %s: step %d, loss = %.3f, accuracy = %.3f (%.1f examples/sec; %.3f '
            'sec/batch)')
        fields = (datetime.now(), self.step, self.loss, self.acc,
                  self.samps / self.step, self.perf / self.step)
        print(template % fields)
Beispiel #4
0
    def end(self, session):
        """Print the training summary when the session ends.

        Evaluates ``self.lr`` in ``session``, stores
        ``global_step(self.epoch) / (self.step + 1)`` in ``self.epoch_true``,
        and on rank 0 prints that value, the global step, and the mean
        throughput over the ``self.sums`` logged intervals.

        Args:
            session: session used to evaluate ``self.lr`` and the step tensor.
        """
        lr = session.run(self.lr)

        self.epoch_true = (tf.train.global_step(session, self.epoch) /
                           (self.step + 1))

        # Only rank 0 reports.
        if mc.get_rank() != 0:
            return

        print('Epoch: ', self.epoch_true)
        print('global_step: %s' % tf.train.global_step(session, self.epoch))

        template = (
            'TRAIN Session ENDED at %s: step %d (%.1f examples/sec; %.3f '
            'sec/batch), learning rate: %.5f')
        print(template % (datetime.now(), self.step, self.samps / self.sums,
                          self.perf / self.sums, lr))
Beispiel #5
0
    def after_run(self, run_context, run_values):
        """Accumulate per-step evaluation statistics.

        Adds this step's throughput, duration, loss and accuracy (the first
        two entries of ``run_values.results``) to the running totals on
        ``self``; rank 0 prints the current step.

        Args:
            run_context: unused.
            run_values: object whose ``results`` holds the fetched values.
        """
        # Time since the previous call; restart the timer.
        now = time.time()
        elapsed = now - self.start_time
        self.start_time = now

        fetched = run_values.results
        loss_value = np.asarray([fetched[0]], dtype='float32')
        acc_value = np.asarray([fetched[1]], dtype='float32')

        # Explicit "a = a + b" is kept so no accumulator is updated in place.
        self.samps = self.samps + self.batch_size / elapsed
        self.perf = self.perf + elapsed
        self.loss = self.loss + loss_value
        self.acc = self.acc + acc_value

        if mc.get_rank() == 0:
            print("Eval step {:9d}".format(self.step))
Beispiel #6
0
    def learning_rate_fn(global_step):
        """Return the learning-rate tensor for ``global_step``.

        While ``global_step < warmup_steps`` the rate rises linearly from
        ``lr_0`` towards ``lr_base`` over ``w_steps``; afterwards it follows
        ``lr_base * (1 - (global_step - w_steps) / d_steps) ** 2``. All
        schedule constants come from the enclosing scope. When
        ``mlcomm == 1`` rank 0 also prints the schedule configuration.

        Args:
            global_step: step counter; cast to float32 before use.
        """
        global_step = tf.cast(global_step, tf.float32)
        epoch = (d_steps + w_steps) / (global_step + 1)

        total_epochs = decay_epochs + warmup_epochs

        # NOTE(review): the tensor returned by tf.Print is discarded, so this
        # looks like it never prints -- confirm the intent.
        tf.Print(epoch, [epoch], "Epoch: ")

        if mlcomm == 1:
            decayed_fraction = (decay_steps - warmup_steps) / decay_steps
            current_lr = learning_rate_0 * math.pow(1.0 - decayed_fraction, 2)
            if mc.get_rank() == 0:
                print("Using Cray learning_rate_warmup_poly_decay(): ")
                print(" -> effective batch size: ", eff_batch_size)
                print(" -> batches per epoch: ", batches_per_epoch)
                print(" -> initial learning rate: ", learning_rate_0)
                print(" -> learning rate base: ", learning_rate_base)
                print(" -> starting global learning rate at first epoch: ",
                      current_lr)
                print(" -> decay after ", decay_epochs, " epochs")
                print("     -> decay steps: ", decay_steps)
                print(" -> warmup with ", warmup_epochs, " epochs")
                print("     -> warmup steps: ", warmup_steps)
                print(" -> number workers: ", mc.get_nranks())

        def _warmup_rate():
            # Linear ramp from lr_0 towards lr_base.
            return (lr_0 + global_step * (lr_base - lr_0) / w_steps)

        def _poly_decay_rate():
            # Squared polynomial decay counted from the end of w_steps.
            return (lr_base * math_ops.pow(
                (1 - (global_step - w_steps) / d_steps), 2))

        still_warming_up = tf.less(global_step, warmup_steps)
        return tf.cond(still_warming_up, _warmup_rate, _poly_decay_rate)
Beispiel #7
0
                                              summary_op=train_summary))

            # Add an op to initialize the variables.
            init_global_op = tf.global_variables_initializer()
            init_local_op = tf.local_variables_initializer()

            #saver class:
            model_saver = tf.train.Saver()

            print(
                "Rank", args["task_index"], ": starting training using " +
                args['optimizer'] + " optimizer")
            with tf.train.MonitoredTrainingSession(
                    config=sess_config,
                    checkpoint_dir=(args['modelpath']
                                    if +mc.get_rank() == 0 else None),
                    save_checkpoint_secs=300,
                    hooks=hooks) as sess:

                #initialize variables
                sess.run([init_global_op, init_local_op])

                #do the training loop
                total_time = time.time()
                train_loop(sess, bcast_hook, train_step, global_step, optlist,
                           args, trainset, validationset)
                total_time -= time.time()
                print("FINISHED Training. Total time %g" % (total_time))

                #clean up comm buffers
                mc.finalize()
Beispiel #8
0
def resnet_main(flags,
                model_function,
                input_function,
                num_train_samps,
                num_eval_samps,
                shape=None):
    """Shared main loop for ResNet Models.

    Alternates training and evaluation cycles on a ``tf.estimator.Estimator``.
    When ``flags.enable_ml_comm == 1`` the Cray PE ML Plugin (``mc``) is
    initialised first and finalised at the end, and only rank 0 keeps its
    model, benchmark-log and export directories.

    Args:
      flags: FLAGS object that contains the params for running. See
        ResnetArgParser for created flags.
      model_function: the function that instantiates the Model and builds the
        ops for train/eval. This will be passed directly into the estimator.
      input_function: the function that processes the dataset and returns a
        dataset that the estimator can train on. This will be wrapped with
        all the relevant flags for running and passed to estimator.
      num_train_samps: number of training samples; used to size the training
        steps of each cycle.
      num_eval_samps: number of evaluation samples; used to size the
        evaluation steps of each cycle.
      shape: list of ints representing the shape of the images used for
        training. This is only used if flags.export_dir is passed.
    """

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # There are two steps required if using multi-GPU: (1) wrap the model_fn,
        # and (2) wrap the optimizer. The first happens here, and (2) happens
        # in the model_fn itself when the optimizer is defined.
        # The model function must be passed as the first argument; the
        # original call omitted it.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function, loss_reduction=tf.losses.Reduction.MEAN)

    myrank = 0
    numworkers = 1
    if flags.enable_ml_comm == 1:
        # Initialize the Cray PE ML Plugin and configure the thread team.
        totsize = 25551401  # Specific size for resnet50-v2
        mc.init(2, 1, totsize, "tensorflow")
        myrank = mc.get_rank()
        numworkers = mc.get_nranks()
        if myrank == 0:
            print("ResNet with {:9d} parameters".format(totsize))

        # float() keeps the division exact before rounding up, matching the
        # per-cycle step computations below.
        max_steps_train = int(
            math.ceil(
                float(flags.train_epochs *
                      (num_train_samps + num_eval_samps)) /
                (numworkers * flags.batch_size)))
        # (0, 0, num_steps_before_going_nonblock, max_steps_train, verbose=1,
        #  how_often_to_print=100)
        mc.config_team(0, 0, max_steps_train, max_steps_train, 1, 100)

        # Only rank 0 keeps its output directories.
        flags.model_dir = flags.model_dir if myrank == 0 else None
        flags.benchmark_log_dir = (flags.benchmark_log_dir
                                   if myrank == 0 else None)
        flags.export_dir = flags.export_dir if myrank == 0 else None

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_steps=500, session_config=session_config)

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags.model_dir,
        config=run_config,
        params={
            'resnet_size': flags.resnet_size,
            'data_format': flags.data_format,
            'batch_size': flags.batch_size,
            'multi_gpu': flags.multi_gpu,
            'train_epochs': flags.train_epochs,
            'version': flags.version,
            'loss_scale': flags.loss_scale,
            'dtype': flags.dtype,
            'mlcomm': flags.enable_ml_comm,
            'log_freq': flags.global_perf_log_freq,
            'weight_decay': flags.weight_decay,
            'init_lr': flags.init_lr,
            'base_lr': flags.base_lr,
            'warmup_epochs': flags.warmup_epochs,
        })

    benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
    benchmark_logger.log_run_info('resnet')

    def input_fn_train():
        return input_function(True, flags.data_dir, flags.batch_size,
                              flags.epochs_between_evals,
                              flags.num_parallel_calls, flags.multi_gpu,
                              numworkers, myrank)

    def input_fn_eval():
        return input_function(False, flags.data_dir, flags.batch_size, 3,
                              flags.num_parallel_calls, flags.multi_gpu,
                              numworkers, myrank)

    # Steps per training / evaluation cycle for this worker.
    tsteps = int(
        math.ceil(
            float(flags.epochs_between_evals * num_train_samps) /
            (numworkers * flags.batch_size)))
    esteps = int(
        math.ceil(float(num_eval_samps) / (numworkers * flags.batch_size)))

    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        # NOTE(review): the hooks are built but never passed to
        # classifier.train(); the call is kept as in the original -- confirm
        # whether hooks=train_hooks was intended.
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)
        if myrank == 0:
            print('Starting a training cycle.')

        # Estimator.train() rejects steps and max_steps given together, so
        # an explicit flags.max_train_steps takes the place of the per-cycle
        # step count.
        if flags.max_train_steps is None:
            classifier.train(input_fn=input_fn_train, steps=tsteps)
        else:
            classifier.train(input_fn=input_fn_train,
                             max_steps=flags.max_train_steps)

        if myrank == 0:
            print('Starting to evaluate.')

        # Evaluate the model and print results. Evaluation runs for a fixed
        # number of steps so that it terminates even when the input iterates
        # forever.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=esteps)

        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags.export_dir is not None:
        warn_on_multi_gpu_export(flags.multi_gpu)

        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags.batch_size)
        classifier.export_savedmodel(flags.export_dir, input_receiver_fn)

    if flags.enable_ml_comm == 1:
        mc.finalize()
Beispiel #9
0
def rank():
    """Return the value reported by ``mc.get_rank()`` for this process."""
    current_rank = mc.get_rank()
    return current_rank
Beispiel #10
0
def build_model(config,args,print_summary=True):
   """Build the residual 3D classification model.

   The input is cut along axis 2 into slabs of four; each slab goes through
   its own small residual tower, the tower outputs are concatenated along
   axis 2, and the result passes through two more residual stacks, average
   pooling, flattening and a softmax ``Dense`` layer. The model is not
   compiled here.

   Args:
      config: dict; reads ``config['data_handling']['image_shape']`` (a list,
         it is concatenated with ``[1]``) and
         ``config['data_handling']['classes']`` (its length sets the number
         of outputs).
      args: object with ``horovod`` and ``ml_comm`` attributes; only read
         when ``print_summary`` is true, to pick which process prints.
      print_summary: when true, print ``model.summary()`` (on rank 0 only
         when horovod or ml_comm is enabled).

   Returns:
      The constructed ``Model``.
   """
   image_shape = config['data_handling']['image_shape']
   input_image = kl.Input(shape=tuple([1] + image_shape))
   logger.debug('input image = %s',input_image)

   outputs = []
   for i in range(0,image_shape[0],4):

      logger.debug('i = %s',i)
      # Bind the slab offset as a default argument: a plain closure over
      # ``i`` would see the final loop value if the layer's function is ever
      # called again after the loop.
      subimg = kl.Lambda(lambda x,start=i: x[:,:,start:start+4,:,:])(input_image)

      num_filters = 64
      x = conv_layer(subimg,num_filters=num_filters)
      # Instantiate the stack of residual units
      for stack in range(2):
         for res_block in range(2):
            strides = (1,1,1)
            if stack > 0 and res_block == 0:  # first layer but not first stack
               strides = (2,2,2)  # downsample
            y = conv_layer(inputs=x,
                          num_filters=num_filters,
                          strides=strides)
            y = conv_layer(inputs=y,
                          num_filters=num_filters,
                          activation=None)
            if stack > 0 and res_block == 0:  # first layer but not first stack
               # linear projection residual shortcut connection to match
               # changed dims
               x = conv_layer(x,
                              num_filters=num_filters,
                              kernel_size=1,
                              strides=strides,
                              activation=None,
                              batch_normalization=False)
            x = kl.add([x, y])
            x = kl.Activation('relu')(x)
         num_filters *= 2

      outputs.append(x)

   # Undo the last doubling so the merged stacks start at the tower's final
   # filter count.
   num_filters = num_filters // 2
   logger.debug('filters = %s',num_filters)
   x = kl.Concatenate(axis=2)(outputs)
   logger.debug('concat: %s',x)

   # Instantiate the stack of residual units
   for stack in range(2):
      logger.debug('stack: %s',stack)
      for res_block in range(3):
         logger.debug('res_block: %s',res_block)
         strides = (1,1,1)
         if stack > 0 and res_block == 0:  # first layer but not first stack
            strides = (2,2,2)  # downsample
         logger.debug('x: %s',x)
         y = conv_layer(x,
                       num_filters=num_filters,
                       strides=strides)
         logger.debug('y: %s',y)
         y = conv_layer(y,
                       num_filters=num_filters,
                       activation=None)
         logger.debug('y: %s',y)
         if stack > 0 and res_block == 0:  # first layer but not first stack
            # linear projection residual shortcut connection to match
            # changed dims
            x = conv_layer(x,
                           num_filters=num_filters,
                           kernel_size=1,
                           strides=strides,
                           activation=None,
                           batch_normalization=False)
         x = kl.add([x, y])
         x = kl.Activation('relu')(x)
      num_filters *= 2

   logger.debug('out = %s',x)

   x = kl.AveragePooling3D(pool_size=(1,1,2))(x)

   y = kl.Flatten()(x)

   predictions = kl.Dense(len(config['data_handling']['classes']),activation='softmax',kernel_initializer='he_normal')(y)

   model = Model(input_image,predictions)

   if print_summary:
      # Under horovod / ml_comm only rank 0 prints; otherwise always print.
      if args.horovod:
         import horovod.keras as hvd
         should_print = hvd.rank() == 0
      elif args.ml_comm:
         import ml_comm as mc
         should_print = mc.get_rank() == 0
      else:
         should_print = True
      if should_print:
         model.summary(line_length=150,positions=[.2, .45, .77, 1.])

   return model
Beispiel #11
0
    def train(self):
        """Train and/or test the model across all CPE ML Plugin ranks.

        Builds the train/validation/test ops, initialises the plugin with the
        total number of trainable parameters, and prepares a broadcast of the
        initial parameter values from rank 0.

        If ``self.is_train`` is set, runs ``hp.RUNPARAM['num_epoch']`` epochs.
        After each epoch the summed training and validation L1 losses are
        passed through ``mc.average`` and divided by the per-rank batch
        count; rank 0 saves a checkpoint whenever the validation loss
        improves and writes the loss histories with ``np.savetxt``. Training
        stops early once no improvement has been seen for more than
        ``hp.RUNPARAM['require_improvement']`` epochs.

        If ``self.is_test`` is set, rank 0 restores the checkpoint
        (``self.save_path`` if not None, else the best-validation path) and
        writes per-batch predictions and test losses.

        Side effects: ``hp.RUNPARAM['batch_per_epoch']`` and
        ``hp.RUNPARAM['batch_per_epoch_val']`` are overwritten with their
        per-rank values, and the module globals ``best_validation_accuracy``,
        ``last_improvement`` and ``total_iterations`` are rebound.
        """
        train_step, loss, lossL1Train, train_true, train_predict = self.optimize()
        lossL1Val, val_true, val_predict = self.validation_loss()
        lossL1Test, test_true, test_predict = self.test_loss()

        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.4

        # Settings taken from the MKL benchmarks.
        config.allow_soft_placement = True
        config.intra_op_parallelism_threads = 1
        config.inter_op_parallelism_threads = 2

        # Used to save the model.
        saver = tf.train.Saver()
        global best_validation_accuracy
        global last_improvement
        global total_iterations
        # Despite its name this holds the best (lowest) validation loss so far.
        best_validation_accuracy = 1.0
        # Iteration number of the last improvement in validation loss.
        last_improvement = 0
        # Stop optimization if no improvement is found in this many iterations.
        require_improvement = hp.RUNPARAM['require_improvement']
        # Counter for the total number of iterations (epochs) performed so far.
        total_iterations = 0

        # Initialize the CPE ML Plugin with one team (single thread for now)
        # and the model size. np.prod also copes with scalar variables, whose
        # shape list is empty.
        totsize = sum(int(np.prod(v.get_shape().as_list()))
                      for v in tf.trainable_variables())
        mc.init(1, 1, totsize, "tensorflow")
        nranks = mc.get_nranks()
        rank = mc.get_rank()

        # Floor division keeps the per-rank batch counts integral so they can
        # be used with range() on both Python 2 and Python 3.
        hp.RUNPARAM['batch_per_epoch'] = hp.RUNPARAM['batch_per_epoch'] // nranks
        hp.RUNPARAM['batch_per_epoch_val'] = hp.RUNPARAM['batch_per_epoch_val'] // nranks
        batch_per_epoch = hp.RUNPARAM['batch_per_epoch']
        batch_per_epoch_val = hp.RUNPARAM['batch_per_epoch_val']
        totsteps = hp.RUNPARAM['num_epoch'] * batch_per_epoch
        mc.config_team(0, 0, totsteps, totsteps, 2, 50)

        if rank == 0:
            print("+------------------------------+")
            print("| CosmoFlow                    |")
            print("| # Ranks = {:5d}              |".format(nranks))
            print("| Global Batch = {:6d}        |".format(nranks * hp.Input['BATCH_SIZE']))
            print("| # Parameters = {:9d}     |".format(totsize))
            print("+------------------------------+")

        # Use the CPE ML Plugin to broadcast initial model parameter values.
        new_vars = mc.broadcast(tf.trainable_variables(), 0)
        bcast = tf.group(*[tf.assign(v, new_vars[k])
                           for k, v in enumerate(tf.trainable_variables())])

        if self.is_train:
            with tf.Session(config=config) as sess:
                losses_train = []
                losses_val = []
                losses = []

                # Do all parameter initializations.
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                sess.run(bcast)

                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord)

                elapsed_time = 0.
                for epoch in range(hp.RUNPARAM['num_epoch']):
                    save_path = os.path.join(hp.Path['Model_path'], 'best_validation')
                    total_iterations += 1
                    start_time = time.time()
                    loss_per_epoch_val = 0
                    loss_per_epoch_train = 0
                    for i in range(batch_per_epoch):
                        step_start_time = time.time()
                        _, lossTrain, lossL1Train_, train_true_, train_predict_ = sess.run(
                            [train_step, loss, lossL1Train, train_true, train_predict])
                        step_finish_time = time.time()

                        elapsed_time += (step_finish_time - step_start_time)
                        samps_per_sec = nranks * (epoch * batch_per_epoch * hp.Input['BATCH_SIZE'] + (i + 1) * hp.Input['BATCH_SIZE']) / elapsed_time
                        if rank == 0:
                            print("Train Step: " + str(i) + ", Samples/Sec = " + str(samps_per_sec) + ", Loss = " + str(lossTrain))

                        loss_per_epoch_train += lossL1Train_

                    global_loss = np.array([loss_per_epoch_train], dtype=np.float32)
                    mc.average(global_loss)
                    loss_per_epoch_train = global_loss / batch_per_epoch
                    losses.append(loss_per_epoch_train)
                    losses_train.append(loss_per_epoch_train)

                    for i in range(batch_per_epoch_val):
                        if rank == 0:
                            print("Val Step = " + str(i))
                        loss_, val_true_, val_predict_ = sess.run(
                            [lossL1Val, val_true, val_predict])
                        loss_per_epoch_val += loss_

                    global_loss = np.array([loss_per_epoch_val], dtype=np.float32)
                    mc.average(global_loss)
                    loss_per_epoch_val = global_loss / batch_per_epoch_val
                    losses_val.append(loss_per_epoch_val)

                    if loss_per_epoch_val < best_validation_accuracy:
                        best_validation_accuracy = loss_per_epoch_val
                        last_improvement = total_iterations
                        if rank == 0:
                            saver.save(sess=sess, save_path=save_path)

                    if rank == 0:
                        print("Epoch {} took {:.3f}s".format(epoch, time.time() - start_time))
                        print("  training loss: %.3f" % (loss_per_epoch_train))
                        print("  validation loss: %.3f" % (loss_per_epoch_val))
                        print("  best loss: %.3f" % best_validation_accuracy)
                        np.savetxt(os.path.join(hp.Path['train_result'], 'loss_train.txt'), losses_train)
                        np.savetxt(os.path.join(hp.Path['val_result'], 'loss_val.txt'), losses_val)
                        np.savetxt(os.path.join(hp.Path['train_result'], 'losses.txt'), losses)
                    if total_iterations - last_improvement > require_improvement:
                        if rank == 0:
                            print("No improvement found in a while, stopping optimization.")
                        break

                coord.request_stop()
                coord.join(threads)

        if self.is_test and rank == 0:

            save_path = os.path.join(hp.Path['Model_path'], 'best_validation')
            if self.save_path is not None:
                save_path = self.save_path

            with tf.Session() as sess:
                saver.restore(sess=sess, save_path=save_path)
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord)
                loss_test = []
                for i in range(0, hp.RUNPARAM['iter_test']):
                    start_time = time.time()
                    lossL1Test_, test_true_, test_predict_ = sess.run(
                        [lossL1Test, test_true, test_predict])
                    loss_test.append(lossL1Test_)
                    print("Box {} took {:.3f}s".format(i, time.time() - start_time))
                    print("  test loss: %.3f" % lossL1Test_)
                    np.savetxt(os.path.join(hp.Path['test_result'], 'test_batch_' + str(i) + '.txt'), np.c_[test_true_, test_predict_])
                np.savetxt(os.path.join(hp.Path['test_result'], 'loss_test.txt'), loss_test)
                coord.request_stop()
                coord.join(threads)

        # Cleanup the CPE ML Plugin.
        mc.finalize()