Example #1
    def train_step(inputs, labels, first_batch, epoch):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['loc'], losses['landm'], losses['class'] = \
                multi_box_loss(labels, predictions)
            total_loss = tf.add_n([l for l in losses.values()])

        if cfg['distributed']:
            # Horovod: add Horovod Distributed GradientTape.
            tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if cfg['distributed'] and first_batch and epoch:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        return total_loss, losses
Example #2
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(mnist_model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
    # tf.summary.scalar("loss", loss_value, step=step)
    return loss_value
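The step above is typically driven by a loop like those in Examples #19 and #23: initialize Horovod, pin one GPU per local rank, scale the learning rate by hvd.size(), and divide the number of steps by the number of workers. A minimal sketch, assuming mnist_model, loss, and the training_step above are already defined (the optimizer and step count are placeholders):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Pin one GPU per process (skip on CPU-only hosts).
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

# Scale the learning rate by the number of workers.
opt = tf.optimizers.Adam(0.001 * hvd.size())

# Build the dataset as in Example #23.
(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data()
dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
     tf.cast(mnist_labels, tf.int64)))
dataset = dataset.shuffle(1000).batch(32)

# Each worker runs 1/size of the total steps; batch 0 triggers the broadcast.
for batch, (images, labels) in enumerate(dataset.take(10000 // hvd.size())):
    loss_value = training_step(images, labels, batch == 0)
    if batch % 100 == 0 and hvd.rank() == 0:
        print('Step #%d\tLoss: %.6f' % (batch, loss_value))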
Example #3
def train_step(data, model, loss_fn, optimizer, first_batch, compress=True):
    batch, target = data
    with tf.GradientTape() as tape:
        output = model(batch, training=True)
        loss = loss_fn(target, output)

    compression = (hvd.Compression.fp16 if compress else hvd.Compression.none)
    # Horovod: add Horovod Distributed training
    tape = hvd.DistributedGradientTape(tape, compression=compression)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    return loss, output
Example #4
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape, tf.GradientTape() as emb_tape:
            logit = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)

        # Horovod: wrap tf.GradientTape with Horovod DistributedGradientTape
        tape = hvd.DistributedGradientTape(tape)

        # There is no need to wrap the emb_tape because the communication is done by sok
        # emb_tape = hvd.DistributedGradientTape(emb_tape)

        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)

        # type(tape) here is hvd.DistributedGradientTape
        # type(emb_tape) here is tf.GradientTape
        emb_grads = emb_tape.gradient(replica_loss, emb_variable)
        grads = tape.gradient(replica_loss, other_variable)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(
                    zip(emb_grads, emb_variable),
                    experimental_aggregate_gradients=False)
        else:
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
        dense_optimizer.apply_gradients(zip(grads, other_variable))

        # Note: broadcast should be done after the first gradient step to ensure optimizer has been initialized.
        # There is no need to broadcast emb_variable and embedding_optimizer, because the parallel mode inside
        # sok is model parallel and the communication is done by sok itself.
        if first_batch:
            hvd.broadcast_variables(other_variable, root_rank=0)
            hvd.broadcast_variables(dense_optimizer.variables(), root_rank=0)

        return replica_loss
Example #5
def train_step(model, opt, loss_func, images, labels, first_batch, fp32=False):
    with tf.GradientTape() as tape:
        probs = model(images, training=True)
        loss_value = loss_func(labels, probs)
        loss_value += tf.add_n(model.losses)
        if not fp32:
            scaled_loss_value = opt.get_scaled_loss(loss_value)

    tape = hvd.DistributedGradientTape(tape, compression=hvd.Compression.fp16)
    if not fp32:
        grads = tape.gradient(scaled_loss_value, model.trainable_variables)
        grads = opt.get_unscaled_gradients(grads)
    else:
        grads = tape.gradient(loss_value, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
    top_1_pred = tf.squeeze(tf.math.top_k(probs, k=1)[1])
    sparse_labels = tf.cast(tf.math.argmax(labels, axis=1), tf.int32)
    top_1_accuracy = tf.math.reduce_sum(
        tf.cast(tf.equal(top_1_pred, sparse_labels), tf.int32))
    return loss_value, top_1_accuracy
Example #6
    def train_step(self, data):
        """Perform a single training step."""
        x, beta = data
        start = time.time()
        with tf.GradientTape() as tape:
            states, data = self((x, beta), training=True)
            #  states, accept_prob, sumlogdet = self((x, beta), training=True)
            loss = self.calc_losses(states, data.accept_prob)

            if self.aux_weight > 0:
                z = tf.random.normal(x.shape, dtype=x.dtype)
                states_, accept_prob_, _ = self((z, beta), training=True)
                loss_ = self.calc_losses(states_, accept_prob_)
                loss += loss_

        if NUM_RANKS > 1 and HAS_HOROVOD:
            tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

        metrics = AttrDict({
            'dt': time.time() - start,
            'loss': loss,
            'accept_prob': data.accept_prob,
            #  'eps': self.eps,
            'beta': states.init.beta,
            'sumlogdet': data.sumlogdet,
            #  'sumlogdet': data.sumlogdet.out,
        })

        #  if self.optimizer.iterations == 0 and NUM_RANKS > 1 and HAS_HOROVOD:
        if HAS_HOROVOD and NUM_RANKS > 1 and self.optimizer.iterations == 0:
            hvd.broadcast_variables(self.variables, root_rank=0)
            hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)

        return states.out.x, metrics
Example #7
def train_one_step(config,
                   model,
                   optimizer,
                   features,
                   init=False,
                   clip_norm=1.0):
    with tf.GradientTape() as tape:
        total_loss, eval_fn_inputs = model(features, is_training=True)
        unscaled_loss = tf.stop_gradient(total_loss)
        if config.amp:
            total_loss = optimizer.get_scaled_loss(total_loss)

    tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True)

    gradients = tape.gradient(total_loss, model.trainable_variables)
    if config.amp:
        gradients = optimizer.get_unscaled_gradients(gradients)
    (gradients, _) = tf.clip_by_global_norm(gradients, clip_norm=clip_norm)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    if init:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)
    return unscaled_loss, eval_fn_inputs
Example #8
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape, tf.GradientTape() as emb_tape:
            logit = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = embedding_optimizer.get_scaled_loss(replica_loss)
            else:
                _loss = replica_loss

        tape = hvd.DistributedGradientTape(tape)

        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        emb_grads = emb_tape.gradient(_loss, emb_variable)
        grads = tape.gradient(_loss, other_variable)
        if args.mixed_precision:
            emb_grads = embedding_optimizer.get_unscaled_gradients(emb_grads)
            grads = embedding_optimizer.get_unscaled_gradients(grads)

        if 'plugin' not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(
                    zip(emb_grads, emb_variable),
                    experimental_aggregate_gradients=False)
        else:
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
        dense_optimizer.apply_gradients(zip(grads, other_variable))

        # Note: broadcast should be done after the first gradient step to ensure optimizer initialization.
        if first_batch:
            hvd.broadcast_variables(other_variable, root_rank=0)
            hvd.broadcast_variables(dense_optimizer.variables(), root_rank=0)

        return replica_loss
Example #9
def train_one_step(model, opt, x, y, step, EPOCH):

    with tf.GradientTape() as tape:
        logits = model(x)
        loss = compute_loss(y, logits)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape,
                                       device_sparse='/cpu:0',
                                       device_dense='/cpu:0',
                                       compression=compression)
    grads = tape.gradient(loss, model.trainable_variables)

    opt.apply_gradients(zip(grads, model.trainable_variables))

    if step + EPOCH == 0:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    pred = tf.argmax(logits, axis=-1)
    compute_miou(y, pred)
    compute_accuracy(y, pred)

    return loss
Example #10
    def train_step(inputs, first_batch):
        images, labels = inputs

        with tf.GradientTape() as tape:
            predictions = model(images, training=True)
            loss = loss_func(labels, predictions)
            loss += tf.reduce_sum(model.losses)
            loss_copy = loss
            # Scale the losses
            if precision == 'fp16':
                loss = loss * tf.cast(loss_scale, loss.dtype)

        tape = hvd.DistributedGradientTape(tape)

        old_grads = tape.gradient(loss, model.trainable_variables)

        # Unscale the grads
        if precision == 'fp16':
            loss_scale_reciprocal = 1. / loss_scale
            grads = [
                g * tf.cast(loss_scale_reciprocal, g.dtype)
                if g is not None else None for g in old_grads
            ]
        else:
            grads = old_grads

        opt.apply_gradients(zip(grads, model.trainable_variables))

        train_top1.update_state(labels, predictions)
        train_top5.update_state(labels, predictions)

        if hvd.size() > 1 and first_batch:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)

        return loss_copy
Example #11
def benchmark_step(first_batch):
    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: use DistributedGradientTape
    with tf.GradientTape() as tape:
        probs = model(data, training=True)
        loss = tf.losses.categorical_crossentropy(target, probs)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape, compression=compression)

    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
Example #12
def distributed_train_step(self, example: tf.train.Example) -> dict:
    # Unpack data
    image, label = example["image"], example["label"]

    with tf.GradientTape() as tape:
        tape = hvd.DistributedGradientTape(tape)
        # Calculate prediction
        pred = self(image)
        # Calculate loss
        loss = self.loss(label, pred)
        # Compute gradients
    gradients = tape.gradient(loss, self.trainable_variables)
    # Update weights
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    # Update metrics
    self.train_loss_metric(loss)
    self.train_top_1_metric(label, pred)
    self.train_top_5_metric(label, pred)
    return {
        "loss": self.train_loss_metric.result(),
        "accuracy": self.train_top_1_metric.result(),
        "top 5": self.train_top_5_metric.result()
    }
Example #13
def training_step(images, labels, first_batch):
    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)

        import sys
        print("labels:", labels, file=sys.stderr)
        print("probs:", probs, file=sys.stderr)
        loss_value = loss(labels, probs)

        if args.use_amp:
            loss_value = opt.get_scaled_loss(loss_value)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape, compression=compression)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)

    if args.use_amp:
        grads = opt.get_unscaled_gradients(grads)

    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(mnist_model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return loss_value
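The opt.get_scaled_loss / opt.get_unscaled_gradients calls used when args.use_amp is set only exist on a loss-scaling optimizer; Example #14 below wires this up through tf.keras.mixed_precision. A minimal sketch of that setup, with a placeholder base optimizer and learning rate:

import tensorflow as tf

# Compute in float16 while keeping variables in float32, then wrap the
# optimizer so get_scaled_loss()/get_unscaled_gradients() are available.
tf.keras.mixed_precision.set_global_policy('mixed_float16')
opt = tf.keras.mixed_precision.LossScaleOptimizer(tf.keras.optimizers.Adam(1e-3))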
Example #14
def main():
   ''' simple starter program for tensorflow models. '''
   parser = argparse.ArgumentParser(description='')
   parser.add_argument('-c','--config',dest='config_filename',help='configuration filename in json format [default: %s]' % DEFAULT_CONFIG,default=DEFAULT_CONFIG)
   parser.add_argument('--interop',type=int,help='set Tensorflow "inter_op_parallelism_threads" session config variable [default: %s]' % DEFAULT_INTEROP,default=DEFAULT_INTEROP)
   parser.add_argument('--intraop',type=int,help='set Tensorflow "intra_op_parallelism_threads" session config variable [default: %s]' % DEFAULT_INTRAOP,default=DEFAULT_INTRAOP)
   parser.add_argument('-l','--logdir',default=DEFAULT_LOGDIR,help='define location to save log information [default: %s]' % DEFAULT_LOGDIR)

   parser.add_argument('--horovod', default=False, action='store_true', help="Use MPI with horovod")
   parser.add_argument('--profiler',default=False, action='store_true', help='Use TF profiler, needs CUPTI in LD_LIBRARY_PATH for Cuda')
   parser.add_argument('--profrank',default=0,type=int,help='set which rank to profile')

   parser.add_argument('--batch-term',dest='batch_term',type=int,help='if set, terminates training after the specified number of batches',default=0)

   parser.add_argument('--evaluate',help='evaluate a pre-trained model file on the test data set only.')
   parser.add_argument('--train-more',dest='train_more',help='load a pre-trained model file and continue training.')

   parser.add_argument('--debug', dest='debug', default=False, action='store_true', help="Set Logger to DEBUG")
   parser.add_argument('--error', dest='error', default=False, action='store_true', help="Set Logger to ERROR")
   parser.add_argument('--warning', dest='warning', default=False, action='store_true', help="Set Logger to WARNING")
   parser.add_argument('--logfilename',dest='logfilename',default=None,help='if set, logging information will go to file')
   args = parser.parse_args()
   
   hvd = None
   rank = 0
   nranks = 1
   logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:%(name)s:%(message)s'
   logging_datefmt = '%Y-%m-%d %H:%M:%S'
   logging_level = logging.INFO
   if args.horovod:
      print('importing horovod')
      sys.stdout.flush()
      sys.stderr.flush()

      import horovod
      import horovod.tensorflow as hvd
      hvd.init()
      logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:' + (
                 '%05d' % hvd.rank()) + ':%(name)s:%(message)s'
      rank = hvd.rank()
      nranks = hvd.size()
      if rank > 0:
         logging_level = logging.WARNING

   # Setup Logging
   if args.debug and not args.error and not args.warning:
      logging_level = logging.DEBUG
      os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '0'
      os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
   elif not args.debug and args.error and not args.warning:
      logging_level = logging.ERROR
   elif not args.debug and not args.error and args.warning:
      logging_level = logging.WARNING

   logging.basicConfig(level=logging_level,
                       format=logging_format,
                       datefmt=logging_datefmt,
                       filename=args.logfilename)
   
   if hvd:
      logging.warning('host: %s rank: %5d   size: %5d  local rank: %5d  local size: %5d',
                      socket.gethostname(),hvd.rank(), hvd.size(),
                      hvd.local_rank(), hvd.local_size())
   
   tf.config.threading.set_inter_op_parallelism_threads(args.interop)
   tf.config.threading.set_intra_op_parallelism_threads(args.intraop)

   # Setup GPUs
   gpus = tf.config.list_physical_devices('GPU')
   logger.info(   'number of gpus:              %s',len(gpus))
   for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
   if hvd and len(gpus) > 0:
      tf.config.set_visible_devices(gpus[hvd.local_rank() % len(gpus)],'GPU')

   logging.info(   'using tensorflow version:   %s (%s)',tf.__version__,tf.__git_version__)
   logging.info(   'using tensorflow from:      %s',tf.__file__)
   if hvd:
      logging.info('using horovod version:      %s',horovod.__version__)
      logging.info('using horovod from:         %s',horovod.__file__)
   logging.info(   'logdir:                     %s',args.logdir)
   logging.info(   'interop:                    %s',args.interop)
   logging.info(   'intraop:                    %s',args.intraop)
   
   # this must be created after the config settings
   gtape = tf.GradientTape()
   if args.horovod:
      gtape = hvd.DistributedGradientTape(gtape)

   config = json.load(open(args.config_filename))
   # config['device'] = device_str
   
   config['profrank'] = args.profrank
   config['profiler'] = args.profiler
   config['logdir'] = args.logdir
   config['rank'] = rank
   config['nranks'] = nranks
   config['evaluate'] = False
   config['batch_term'] = args.batch_term
   if args.batch_term > 0:
      config['training']['epochs'] = 1
      config['training']['status'] = 1 if args.batch_term < config['training']['status'] else config['training']['status']

   if args.evaluate is not None:
      config['evaluate'] = True
      config['model_file'] = args.evaluate
      config['training']['epochs'] = 1
      logger.info('evaluating model file:      %s',args.evaluate)
   elif args.train_more is not None:
      config['train_more'] = True
      config['model_file'] = args.train_more
      logger.info('continuing model file:      %s',args.train_more)


   # using mixed precision?
   if isinstance(config['model']['mixed_precision'],str):
      logger.info('using mixed precision:      %s',config['model']['mixed_precision'])
      tf.keras.mixed_precision.set_global_policy(config['model']['mixed_precision'])

   logger.info('-=-=-=-=-=-=-=-=-  CONFIG FILE -=-=-=-=-=-=-=-=-')
   logger.info('%s = \n %s',args.config_filename,json.dumps(config,indent=4,sort_keys=True))
   logger.info('-=-=-=-=-=-=-=-=-  CONFIG FILE -=-=-=-=-=-=-=-=-')
   config['hvd'] = hvd

   sys.stdout.flush()
   sys.stderr.flush()

   trainds,testds = data_handler.get_datasets(config)
   
   logger.info('get model')
   net = model.get_model(config)
   loss_func = losses.get_loss(config)
   opt = get_optimizer(config)
   if isinstance(config['model']['mixed_precision'],str):
      opt = tf.keras.mixed_precision.LossScaleOptimizer(opt)

   # initialize and create the model
   # input_shape = [config['data']['batch_size'],config['data']['num_points'],config['data']['num_features']]
   # output = net(tf.random.uniform(input_shape))

   # load previous model weights
   if args.evaluate:
      net.load_weights(args.evaluate)
   elif args.train_more:
      net.load_weights(args.train_more)

   # # synchronize models across ranks
   # if hvd:
   #    hvd.broadcast_variables(net.variables, root_rank=0)
   #    hvd.broadcast_variables(opt.variables(), root_rank=0)

   train_summary_writer = None
   test_summary_writer = None
   test_jet_writer = None
   test_ele_writer = None
   test_bkg_writer = None
   test_mean_writer = None
   if rank == 0:
      train_summary_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'train')
      test_summary_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'test')
      
      test_jet_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'jet_iou')
      test_ele_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'ele_iou')
      test_bkg_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'bkg_iou')
      test_mean_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'mean_iou')

      #tf.keras.utils.plot_model(net, "network_model.png", show_shapes=True)
      
      #with train_summary_writer.as_default():
        #tf.summary.graph(train_step.get_concrete_function().graph)

   batches_per_epoch = 0
   train_mIoU_sum = 0.
   test_mIoU_sum = 0.
   for epoch_num in range(config['training']['epochs']):
      
      logger.info('begin epoch %s',epoch_num)

      if not config['evaluate']:
         train_output = epoch_loop.one_train_epoch(config,trainds,net,
                                                   loss_func,opt,epoch_num,
                                                   train_summary_writer,
                                                   batches_per_epoch,
                                                   gtape)
         batches_per_epoch = train_output['batches_per_epoch']
         train_mIoU_sum += train_output['mIoU']
         logger.info('train mIoU sum: %10.4f',train_mIoU_sum / (epoch_num + 1))

      test_output = epoch_loop.one_eval_epoch(config,testds,net,
                                              loss_func,opt,epoch_num,
                                              test_summary_writer,
                                              batches_per_epoch,
                                              test_jet_writer,
                                              test_ele_writer,
                                              test_bkg_writer,
                                              test_mean_writer)
      test_mIoU_sum += test_output['mIoU']
      logger.info('test mIoU sum: %10.4f',test_mIoU_sum / (epoch_num + 1))

      if rank == 0:
         with test_summary_writer.as_default():
            step = (epoch_num + 1) * batches_per_epoch
            tf.summary.scalar('metrics/mIoU_AOC', test_mIoU_sum / (epoch_num + 1),step=step)
Example #15
  def train_step(self,
                 x,
                 y,
                 s=None,
                 y_gt=None,
                 flag=None,
                 x_test=None,
                 y_test=None,
                 flag_test=None,
                 **kwargs):
    """One training step.

    Args:
      x: [B, T, ...], inputs at each timestep.
      y: [B, T], label at each timestep.
      y_gt: [B, T], groundtruth at each timestep, if different from labels.
      x_test: [B, M, ...], inputs of the query set, optional.
      y_test: [B, M], groundtruth of the query set, optional.

    Returns:
      xent: Cross entropy loss.
    """
    # tf.print('y', y[0], summarize=100)
    if self._distributed:
      import horovod.tensorflow as hvd
    if y_gt is None:
      y_gt = y
    with tf.GradientTape() as tape:
      if x_test is not None:
        # Additional query set (optional).
        assert y_test is not None
        logits, logits_test = self.forward(
            x, y, s=s, x_test=x_test, is_training=tf.constant(True))
        logits_all = tf.concat([logits, logits_test], axis=1)  # [B, T+N, Kmax]
        labels_all = tf.concat([y_gt, y_test], axis=1)  # [B, T+N]
      else:
        logits = self.forward(x, y, s=s, is_training=tf.constant(True))
        logits_all = logits
        labels_all = y_gt

      xent = self.compute_loss(logits_all, labels_all)

      # Cross entropy loss.
      if flag is not None:
        if flag_test is not None:
          flag_all = tf.concat([flag, flag_test], axis=1)
        else:
          flag_all = flag
        flag_ = tf.cast(flag_all, self.dtype)
        valid_sum = tf.reduce_sum(flag_)
        delta = tf.cast(tf.equal(valid_sum, 0.0), self.dtype)
        xent = tf.reduce_sum(xent * flag_) / (valid_sum + delta)
      else:
        xent = tf.reduce_mean(xent)

      # Regularizers.
      reg_loss = self._get_regularizer_loss(*self.regularized_weights())
      loss = xent + reg_loss * self.wd

    # Apply gradients.
    if self._distributed:
      tape = hvd.DistributedGradientTape(tape)

    self.apply_gradients(loss, tape)

    return xent
Example #16
    def train_step(self,
                   x,
                   y,
                   y_gt=None,
                   flag=None,
                   writer=None,
                   first_batch=False,
                   **kwargs):
        """One training step.

    Args:
      x: [B, T, ...], inputs at each timestep.
      y: [B, T], label at each timestep, to be fed as input.
      y_unk: [B, T], binary label indicating unknown, used as groundtruth.
      y_gt: [B, T], groundtruth at each timestep, if different from labels.
      x_test: [B, M, ...], inputs of the query set, optional.
      y_test: [B, M], groundtruth of the query set, optional.

    Returns:
      xent: Cross entropy loss.
    """
        if self._distributed:
            import horovod.tensorflow as hvd
        if y_gt is None:
            y_gt = y
        B = tf.constant(x.shape[0])
        T = tf.constant(x.shape[1])
        with writer.as_default() if writer is not None else dummy_context_mgr(
        ) as gs:
            states = self.memory.get_initial_state(B, 64)
            DT = self.config.optimizer_config.inner_loop_truncate_steps
            # Data parallel training.
            xent_total = 0.0
            xent_unk_total = 0.0
            flag_total = tf.cast(tf.reduce_sum(flag), self.dtype)
            for t_start in range(0, self.config.num_steps, DT):
                t_end = tf.minimum(t_start + DT, T)
                with tf.GradientTape() as tape:
                    loss, metric, states = self.compute_loss(
                        x[:, t_start:t_end], y[:, t_start:t_end],
                        y_gt[:, t_start:t_end], flag[:, t_start:t_end],
                        t_start, DT, *states, **kwargs)

                # Apply gradients.
                if self._distributed:
                    tape = hvd.DistributedGradientTape(tape)
                self.apply_gradients(loss, tape)

                # Sync weights initialization.
                if self._distributed and first_batch and tf.equal(t_start, 0):
                    hvd.broadcast_variables(self.var_to_optimize(),
                                            root_rank=0)
                    hvd.broadcast_variables(self.optimizer.variables(),
                                            root_rank=0)
                    if self.config.set_backbone_lr:
                        hvd.broadcast_variables(self._bb_optimizer.variables(),
                                                root_rank=0)

                flag_total_ = tf.reduce_sum(
                    tf.cast(flag[:, t_start:t_end], self.dtype))
                xent_total += metric['xent'] * flag_total_ / flag_total
                xent_unk_total += metric['xent_unk'] * flag_total_ / flag_total

            write_flag = self._distributed and hvd.rank() == 0
            write_flag = write_flag or (not self._distributed)
            if write_flag and writer is not None:
                if tf.equal(
                        tf.math.floormod(
                            self._step // self._ratio + 1,
                            self.config.train_config.steps_per_log), 0):
                    tf.summary.scalar('xent_unk',
                                      xent_unk_total,
                                      step=self._step + 1)
                    writer.flush()
        return xent_total
Example #17
def train(model, train_db, validation_db, epochs, batch_size, learning_rate,
          model_dir):

    # Process the dataset:
    # pass the `process` preprocessing function to map,
    # shuffle randomizes the data (a larger buffer shuffles more thoroughly),
    # and batch sets the size of each batch.
    train_db = train_db.map(process).shuffle(
        10000, seed=np.random.randint(999)).batch(batch_size)
    validation_db = validation_db.map(process).shuffle(
        10000, seed=np.random.randint(999)).batch(batch_size)

    # Grab a sample batch to inspect the data shapes, used below to define some network parameters.
    train_iter = iter(train_db)
    train_sample = next(train_iter)
    print('train dataset x shape {}, train dataset y shape {}'.format(
        train_sample[0].shape, train_sample[1].shape))

    # Step 3: scale the learning rate with the Horovod size.
    optimazer = optimizers.Adam(lr=learning_rate * hvd.size())

    # Run for the configured number of epochs.
    for epoch in range(epochs):

        for step, (x, y) in enumerate(train_db):
            # Reshape the input to match the model's build input_shape.
            # x = tf.reshape(x, [-1, 28*28]) is no longer needed; the incoming data is already [b, 784].
            # The tape wraps the forward pass to record variables so gradients can be computed.
            with tf.GradientTape() as tape:
                # Feed x directly into the model (this calls the instance's __call__); the output is the softmax prediction.
                softmax = model(x)
                # One-hot encode y: y has shape [b,] while the logits have shape [b, 10], so convert y to [b, 10].
                y_hot = tf.one_hot(y, depth=10)
                # Two losses are computed here, MSE and cross entropy; for classification, cross entropy is recommended.
                # Note: when computing cross entropy from logits, pass from_logits=True; the output here is already softmax, so it is not needed.
                loss_mse = tf.reduce_mean(
                    tf.losses.mean_squared_error(y_hot, softmax))
                loss_ce = tf.reduce_mean(
                    tf.losses.categorical_crossentropy(y_hot, softmax))
                # loss_ce = tf.reduce_mean(tf.losses.categorical_crossentropy(y_hot, logits, from_logits=True))

            # Step 4: wrap the previous tape with the Horovod tape so gradients are allreduced across processes and synchronized back to each process before the variables are updated.
            tape = hvd.DistributedGradientTape(tape)
            # Compute gradients of the cross-entropy loss with respect to the parameters.
            grads = tape.gradient(loss_ce, model.trainable_variables)
            # Apply the gradients; the parameter list comes directly from model.trainable_variables.
            optimazer.apply_gradients(zip(grads, model.trainable_variables))

            # Step 5: broadcast the initialized variables to all processes so that every process starts from the same point.
            if epoch == 0 and step == 0:
                hvd.broadcast_variables(model.variables, root_rank=0)
                hvd.broadcast_variables(optimazer.variables(), root_rank=0)

            # Print progress every 100 steps.
            if step % 100 == 0 and hvd.rank() == 0:
                print('epoch:{}\t step:{}\t loss_mse:{}\t loss_ce:{}\t'.format(
                    epoch, step, float(loss_mse), float(loss_ce)))

        # Compute accuracy once per epoch.
        total_corrects = 0  # running counters
        total_number = 0
        # Evaluate on the validation dataset.
        for x, y in validation_db:

            # Get predictions for the validation data.
            probs = model(x)

            # Take the index of the maximum probability as the prediction.
            preds = tf.cast(tf.argmax(probs, axis=1), dtype=tf.int32)

            # Accumulate the number of correct predictions and the total count.
            corrects = tf.equal(y, preds)
            corrects = tf.reduce_sum(tf.cast(corrects, dtype=tf.int32))
            total_corrects += corrects
            total_number += x.shape[0]

        # Compute accuracy on the validation dataset.
        acc = total_corrects / total_number

        if hvd.rank() == 0:
            print('accuracy={};'.format(acc))

    # Either way of saving the model works; low level: tf.saved_model.save(model, model_dir+'/'+datetime.now().strftime('%Y%m%d%H%M%S'))
    # The directory name used to store the model must be a numeric string.
    # Step 6: save checkpoints or the model only on rank 0.
    if hvd.rank() == 0:
        model.save(model_dir + '/' + datetime.now().strftime('%Y%m%d%H%M%S'))
Example #18
    def train_step(self,
                   x,
                   y,
                   s=None,
                   y_gt=None,
                   flag=None,
                   x_test=None,
                   y_test=None,
                   flag_test=None,
                   writer=None,
                   **kwargs):
        """One training step.

    Args:
      x: [B, T, ...], inputs at each timestep.
      y: [B, T], label at each timestep, to be fed as input.
      y_unk: [B, T], binary label indicating unknown, used as groundtruth.
      y_gt: [B, T], groundtruth at each timestep, if different from labels.
      x_test: [B, M, ...], inputs of the query set, optional.
      y_test: [B, M], groundtruth of the query set, optional.

    Returns:
      xent: Cross entropy loss.
    """
        if self._distributed:
            import horovod.tensorflow as hvd
        if y_gt is None:
            y_gt = y
        with writer.as_default() if writer is not None else dummy_context_mgr(
        ) as gs:
            with tf.GradientTape() as tape:
                loss, metric = self.compute_loss(x,
                                                 y,
                                                 y_gt,
                                                 s=s,
                                                 flag=flag,
                                                 x_test=x_test,
                                                 y_test=y_test,
                                                 flag_test=flag_test,
                                                 **kwargs)

            # Data parallel training.
            if self._distributed:
                xent_sync = tf.reduce_mean(
                    hvd.allgather(tf.zeros([1], dtype=tf.float32) +
                                  metric['xent'],
                                  name='xent'))
                tape = hvd.DistributedGradientTape(tape)
            else:
                xent_sync = metric['xent']

            # Apply gradients.
            # if not tf.math.is_nan(xent_sync):
            self.apply_gradients(loss, tape)

            write_flag = self._distributed and hvd.rank() == 0
            write_flag = write_flag or (not self._distributed)

            if write_flag and writer is not None:
                if tf.equal(
                        tf.math.floormod(
                            self._step + 1,
                            self.config.train_config.steps_per_log), 0):
                    for name, val in metric.items():
                        if name != 'xent':
                            tf.summary.scalar(name, val, step=self._step + 1)

                    if self._ssl_store is not None:
                        tf.summary.scalar('ssl write',
                                          tf.reduce_mean(
                                              tf.cast(self._ssl_store,
                                                      tf.float32)),
                                          step=self._step + 1)
                    writer.flush()
        return xent_sync
Example #19
def main(_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(FLAGS.dataset,
                                                      FLAGS.classes)
    train_dataset = train_dataset.shuffle(buffer_size=1024)  # TODO: not 1024
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks, 80)))
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(FLAGS.val_dataset,
                                                    FLAGS.classes)
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks, 80)))

    if FLAGS.transfer != 'none':
        model.load_weights(FLAGS.weights)
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif FLAGS.transfer == 'frozen':
            # freeze everything
            freeze_all(model)
        else:
            # reset top layers
            if FLAGS.tiny:  # get initial weights
                init_model = YoloV3Tiny(FLAGS.size, training=True)
            else:
                init_model = YoloV3(FLAGS.size, training=True)

            if FLAGS.transfer == 'darknet':
                for l in model.layers:
                    if l.name != 'yolo_darknet' and l.name.startswith('yolo_'):
                        l.set_weights(
                            init_model.get_layer(l.name).get_weights())
                    else:
                        freeze_all(l)
            elif FLAGS.transfer == 'no_output':
                for l in model.layers:
                    if l.name.startswith('yolo_output'):
                        l.set_weights(
                            init_model.get_layer(l.name).get_weights())
                    else:
                        freeze_all(l)

    # Horovod: adjust learning rate based on number of GPUs.
    optimizer = tf.optimizers.Adam(FLAGS.learning_rate * hvd.size())
    # Horovod: add Horovod DistributedOptimizer.

    ###############################################
    loss = [YoloLoss(anchors[mask]) for mask in anchor_masks]

    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(
                    train_dataset.take(5717 // hvd.size())):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss
                # Horovod: add Horovod Distributed GradientTape.
                tape = hvd.DistributedGradientTape(tape)

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads,
                                              model.trainable_variables))
                # Horovod: broadcast initial variable states from rank 0 to all other processes.
                # This is necessary to ensure consistent initialization of all workers when
                # training is started with random weights or restored from a checkpoint.
                #
                # Note: broadcast should be done after the first gradient step to ensure optimizer
                # initialization.
                if batch == 0:
                    hvd.broadcast_variables(model.variables, root_rank=0)
                    hvd.broadcast_variables(optimizer.variables(), root_rank=0)

                #############################
                if hvd.rank() == 0:
                    logging.info("{}_train_{}, {}, {}".format(
                        epoch, batch, total_loss.numpy(),
                        list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                ###########################
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss
                if hvd.rank() == 0:
                    logging.info("{}_val_{}, {}, {}".format(
                        epoch, batch, total_loss.numpy(),
                        list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)
            if hvd.rank() == 0:
                logging.info("{}, train: {}, val: {}".format(
                    epoch,
                    avg_loss.result().numpy(),
                    avg_val_loss.result().numpy()))

            avg_loss.reset_states()
            avg_val_loss.reset_states()
            if hvd.rank() == 0:
                model.save_weights(
                    'checkpoints/horovod_yolov3_train_{}.tf'.format(epoch))
    else:
        model.compile(optimizer=optimizer,
                      loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=3, verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1,
                            save_weights_only=True),
            TensorBoard(log_dir='logs')
        ]

        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
Example #20
def train_step(model,
               inputs,
               loss,
               amp,
               opt,
               init,
               v2=False,
               loss_class=None,
               fp16=False,
               clip_norm=1.0):
    with tf.GradientTape() as tape:
        [
            input_ids, input_mask, segment_ids, start_positions, end_positions,
            cls_index, p_mask, is_impossible
        ] = inputs

        if not v2:
            is_impossible = None

        start_logits, end_logits, cls_logits = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=segment_ids,
            start_positions=start_positions,
            end_positions=end_positions,
            cls_index=cls_index,
            p_mask=p_mask,
            is_impossible=is_impossible,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            training=True,
        )[0:3]

        # If we are on multi-GPU, the position tensors carry an extra trailing dimension; squeeze it out
        if len(start_positions.shape) > 1:
            start_positions = tf.squeeze(start_positions,
                                         axis=-1,
                                         name="squeeze_start_positions")
        if len(end_positions.shape) > 1:
            end_positions = tf.squeeze(end_positions,
                                       axis=-1,
                                       name="squeeze_end_positions")
        if is_impossible is not None and len(
                is_impossible.shape) > 1 and v2 and cls_logits is not None:
            is_impossible = tf.squeeze(is_impossible,
                                       axis=-1,
                                       name="squeeze_is_impossible")

        # sometimes the start/end positions are outside our model inputs, we ignore these terms
        ignored_index = start_logits.shape[1]
        start_positions = tf.clip_by_value(start_positions,
                                           0,
                                           ignored_index,
                                           name="clip_start_positions")
        end_positions = tf.clip_by_value(end_positions,
                                         0,
                                         ignored_index,
                                         name="clip_end_positions")

        start_loss = loss(y_true=start_positions,
                          y_pred=tf.cast(start_logits, tf.float32))
        end_loss = loss(y_true=end_positions,
                        y_pred=tf.cast(end_logits, tf.float32))
        loss_value = (start_loss + end_loss) / 2

        if v2:
            cls_loss_value = loss_class(y_true=is_impossible,
                                        y_pred=tf.cast(cls_logits, tf.float32))
            loss_value += cls_loss_value * 0.5

        unscaled_loss = tf.stop_gradient(loss_value)
        if amp:
            loss_value = opt.get_scaled_loss(loss_value)

    tape = hvd.DistributedGradientTape(
        tape,
        sparse_as_dense=True,
        compression=Compression.fp16 if fp16 else Compression.none)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    if amp:
        gradients = opt.get_unscaled_gradients(gradients)
    (gradients, _) = tf.clip_by_global_norm(gradients, clip_norm=clip_norm)
    opt.apply_gradients(zip(gradients,
                            model.trainable_variables))  # , clip_norm=1.0)

    if init:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return unscaled_loss  # , outputs#, tape.gradient(loss_value, model.trainable_variables)
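The bare Compression name above presumably comes from Horovod's compression module; the snippet's imports are not shown, so the exact import below is an assumption:

# Assumed import for the bare `Compression` name used above;
# hvd.Compression refers to the same class.
from horovod.tensorflow.compression import Compression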
Example #21
def main():
    hvd.init()

    n_epochs = 10
    batch_size = 5
    step = len(im) // batch_size

    params = parse_args(PARSER.parse_args())

    optimizer = tf.keras.optimizers.Adam(learning_rate=params.learning_rate)
    ce_loss = tf.keras.metrics.Mean(name='ce_loss')
    f1_loss = tf.keras.metrics.Mean(name='dice_loss')
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

    pb_i = Progbar(step, stateful_metrics=metrics_names)
    count = 0
    for epoch in range(n_epochs):

        if count >= step:
            count = 0

        features = im[epoch * batch_size:(epoch * batch_size) + batch_size]
        features = np.reshape(features,
                              (len(features), features[0].shape[1],
                               features[0].shape[2], features[0].shape[0]))
        features = features.astype('float32')

        labels = lb[epoch * batch_size:(epoch * batch_size) + batch_size]
        labels = np.reshape(
            labels, (len(labels), labels[0].shape[0], labels[0].shape[1], 1))
        labels = labels.astype('float32')
        print(features.shape, labels.shape)

        print('Epoch {} out of epochs {}'.format(epoch, n_epochs))

        for i, (features_, labels_) in enumerate(zip(features, labels)):

            with tf.GradientTape() as tape:

                output_map = model(features)

                crossentropy_loss, dice_loss = partial_losses(
                    output_map, labels)
                added_losses = tf.add(crossentropy_loss,
                                      dice_loss,
                                      name='total_loss_ref')

                values = [('Xent', crossentropy_loss),
                          ('added_losses', added_losses)]

                pb_i.add(1, values=values)

            # Calculate the gradients using our tape and then update the
            # model weights.
            tape = hvd.DistributedGradientTape(tape)
            gradients = tape.gradient(added_losses, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients,
                                          model.trainable_variables))

            # Calculate something wrong here
            # val_total_loss = 0
            # val_total_acc = 0
            # total_val_num = 0
            # for bIdx, (val_X, val_y) in enumerate(val_batch):
            #     if bIdx >= features.shape[0]:
            #         break
            #     y_pred = model(val_X, training=False)

        print('Xen: ', crossentropy_loss, dice_loss, added_losses)
Example #22
    def train_step(self, x, y, y_gt=None, flag=None, writer=None, **kwargs):
        """One training step, with truncated backpropagation through time.

    Args:
      x: [B, T, ...], inputs at each timestep.
      y: [B, T], label at each timestep, to be fed as input.
      y_unk: [B, T], binary label indicating unknown, used as groundtruth.
      y_gt: [B, T], groundtruth at each timestep, if different from labels.
      x_test: [B, M, ...], inputs of the query set, optional.
      y_test: [B, M], groundtruth of the query set, optional.

    Returns:
      xent: Cross entropy loss.
    """
        if self._distributed:
            import horovod.tensorflow as hvd
        if y_gt is None:
            y_gt = y
        B = tf.constant(x.shape[0])
        T = tf.constant(x.shape[1])
        DT = self.config.oml_config.inner_loop_truncate_steps
        LOGSTEP = self.config.train_config.steps_per_log
        assert DT > 1
        with writer.as_default() if writer is not None else dummy_context_mgr(
        ) as gs:
            states = self.memory.get_initial_state(B)
            states_shape = [s.shape for s in states]
            xent_total = 0.0
            xent_unk_total = 0.0
            flag_total = tf.cast(tf.reduce_sum(flag), self.dtype)
            for t_start in tf.range(0, T, DT):
                # tf.print('t_start', t_start)
                with tf.GradientTape() as tape:
                    # if tf.equal(t_start, 0):
                    #   states = self.memory.get_initial_state(B)
                    #   [s.set_shape(ss) for s, ss in zip(states, states_shape)]
                    t_end = tf.minimum(t_start + DT, T)
                    # tf.print('start', t_start, 'end', t_end)
                    loss, metric, states = self.compute_loss(
                        x[:, t_start:t_end], y[:, t_start:t_end],
                        t_end - t_start, y_gt[:, t_start:t_end],
                        flag[:, t_start:t_end], *states, **kwargs)

                if self._distributed:
                    tape = hvd.DistributedGradientTape(tape)

                # Apply gradients.
                self.apply_gradients(loss, tape, add_step=tf.equal(t_start, 0))

                flag_total_ = tf.reduce_sum(
                    tf.cast(flag[:, t_start:t_end], self.dtype))
                xent_total += metric['xent'] * flag_total_ / flag_total
                xent_unk_total += metric['xent_unk'] * flag_total_ / flag_total
            # tf.print('xent unk total', xent_unk_total)

            # Log xent unk
            if writer is not None:
                NSTEP = len(tf.range(0, T, DT))
                cond = tf.logical_or(
                    tf.equal(tf.math.floormod(self._step, LOGSTEP), 0),
                    tf.equal(self._step, 1))
                if cond:
                    tf.summary.scalar('xent_unk', xent_unk_total, self._step)
                    writer.flush()
        return xent_total
Example #23
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    tf.enable_eager_execution(config=config)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(10)
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Make sure the Fetcher worked
    mnist_filename = 'mnist.npz'
    mnist_path = os.path.join(cache_dir, mnist_filename)
    if not os.path.isfile(mnist_path):
        raise FileNotFoundError("Dataset not found. Looked in " + mnist_path)

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path=mnist_filename)

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
                 tf.float32), tf.cast(mnist_labels, tf.int64)))
    dataset = dataset.shuffle(1000).batch(32)

    # Horovod: adjust number of steps based on number of GPUs.
    for (batch, (images,
                 labels)) in enumerate(dataset.take(20000 // hvd.size())):
        with tf.GradientTape() as tape:
            logits = mnist_model(images, training=True)
            loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        if batch == 0:
            hvd.broadcast_variables(mnist_model.variables, root_rank=0)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, mnist_model.variables)
        opt.apply_gradients(zip(grads, mnist_model.variables),
                            global_step=tf.train.get_or_create_global_step())

        if batch % 50 == 0 and hvd.local_rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))
            emit({"batch": str(batch), "train_loss": "%.6f" % loss_value})
Example #24
def train():
    # Horovod: initialize Horovod.
    hvd.init()
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.enable_eager_execution(config=config)
    # Horovod: adjust number of steps based on number of GPUs.
    images, images_path = get_celebA(FLAGS.output_size, FLAGS.n_epoch // hvd.size(), FLAGS.batch_size)

    G = get_generator([None, FLAGS.z_dim])
    D = get_discriminator([None, FLAGS.output_size, FLAGS.output_size, FLAGS.c_dim])

    G.train()
    D.train()

    d_optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate * hvd.size(), beta1=FLAGS.beta1) # linear scaling rule
    g_optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate * hvd.size(), beta1=FLAGS.beta1)

    step_counter = tf.train.get_or_create_global_step()

    n_step_epoch = int(len(images_path) // FLAGS.batch_size)

    for step, batch_images in enumerate(images):
        step_time = time.time()
        with tf.GradientTape(persistent=True) as tape:
            z = tf.contrib.distributions.Normal(0., 1.).sample([FLAGS.batch_size, FLAGS.z_dim]) #tf.placeholder(tf.float32, [None, z_dim], name='z_noise')
            d_logits = D(G(z))
            d2_logits = D(batch_images)
            # discriminator: real images are labelled as 1
            d_loss_real = tl.cost.sigmoid_cross_entropy(d2_logits, tf.ones_like(d2_logits), name='dreal')
            # discriminator: images from generator (fake) are labelled as 0
            d_loss_fake = tl.cost.sigmoid_cross_entropy(d_logits, tf.zeros_like(d_logits), name='dfake')
            # cost for updating discriminator
            d_loss = d_loss_real + d_loss_fake
            # generator: try to make the fake images look real (1)
            g_loss = tl.cost.sigmoid_cross_entropy(d_logits, tf.ones_like(d_logits), name='gfake')

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        if step == 0:
            hvd.broadcast_variables(G.weights, root_rank=0)
            hvd.broadcast_variables(D.weights, root_rank=0)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)
        grad = tape.gradient(d_loss, D.weights)
        d_optimizer.apply_gradients(zip(grad, D.weights), global_step=tf.train.get_or_create_global_step())
        grad = tape.gradient(g_loss, G.weights)
        g_optimizer.apply_gradients(zip(grad, G.weights), global_step=tf.train.get_or_create_global_step())

        # Horovod: print logging only on worker 0
        if hvd.rank() == 0:
            print("Epoch: [{}/{}] [{}/{}] took: {:3f}, d_loss: {:5f}, g_loss: {:5f}".format(step//n_step_epoch, FLAGS.n_epoch, step, n_step_epoch, time.time()-step_time, d_loss, g_loss))

        # Horovod: save checkpoints only on worker 0
        if hvd.rank() == 0 and np.mod(step, FLAGS.save_step) == 0:
            G.save_weights('{}/G.npz'.format(FLAGS.checkpoint_dir), format='npz')
            D.save_weights('{}/D.npz'.format(FLAGS.checkpoint_dir), format='npz')
            result = G(z)
            tl.visualize.save_images(result.numpy(), [num_tiles, num_tiles], '{}/train_{:02d}_{:04d}.png'.format(FLAGS.sample_dir, step//n_step_epoch, step))
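# The d_loss / g_loss values printed above come from each worker's own batch.
# If a value averaged over all ranks is wanted for logging, it can be reduced
# explicitly; hvd.allreduce averages across ranks by default. A small sketch,
# assuming d_loss is the local scalar loss tensor from the current step:
d_loss_avg = hvd.allreduce(d_loss)
if hvd.rank() == 0:
    print('globally averaged d_loss: %.5f' % d_loss_avg.numpy())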
# Horovod: broadcast the initial values of slope and offset from
# rank 0 to the other ranks
hvd.broadcast_variables([slope, offset], root_rank=0)

print(
    'rank', hvd.rank(),
    'initial slope  = %12.6f\n       initial offset = %12.6f' %
    (slope.numpy(), offset.numpy()))

for xtr, ytr in dataset:
    with tf.GradientTape() as tape:
        yhat = slope * xtr + offset
        loss = tf.losses.mean_squared_error(yhat, ytr)

    # Horovod: replace TensorFlow's GradientTape with Horovod's
    # DistributedGradientTape so that the gradients from all ranks are averaged
    tape = hvd.DistributedGradientTape(tape)

    grads = tape.gradient(loss, [slope, offset])
    opt.apply_gradients(zip(grads, [slope, offset]),
                        global_step=tf.train.get_or_create_global_step())

    history.append([slope.numpy(), offset.numpy(), loss.numpy()])
    # tf.print('loss = %f (rank-%d)' % (loss, hvd.rank()))

# saving arrays for plotting
np.save('slope_hist_%s' % hvd.rank(), np.array(history)[:, 0])
np.save('offset_hist_%s' % hvd.rank(), np.array(history)[:, 1])
if hvd.rank() == 0:
    np.save('x_train', x_train)
    np.save('y_train', y_train)
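# Sanity-check sketch: after hvd.DistributedGradientTape, every rank should
# hold the same averaged gradients. Gathering one gradient from all ranks and
# comparing on rank 0 makes this visible. It assumes `grads` from the loop
# above is still in scope and is purely illustrative, not part of training.
g0 = tf.reshape(grads[0], [1, -1])
all_g0 = hvd.allgather(g0)  # shape: [num_ranks, num_elements]
if hvd.rank() == 0:
    max_diff = tf.reduce_max(tf.abs(all_g0 - all_g0[0]))
    print('max gradient difference across ranks:', max_diff.numpy())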
Example #26
0
    def train_step(self, data):
        """Train step.

    Args:
      data: Tuple of (images, labels). Image tensor with shape [batch_size,
        height, width, 3]. The height and width are fixed and equal. Input labels
        in a dictionary. The labels include class targets and box targets which
        are dense label maps. The labels are generated from get_input_fn
        function in data/dataloader.py.

    Returns:
      A dict recording loss info.
    """
        images, labels = data
        with tf.GradientTape() as tape:
            if len(self.config.heads) == 2:
                cls_outputs, box_outputs, seg_outputs = self(images,
                                                             training=True)
            elif 'object_detection' in self.config.heads:
                cls_outputs, box_outputs = self(images, training=True)
            elif 'segmentation' in self.config.heads:
                seg_outputs, = self(images, training=True)
            total_loss = 0
            loss_vals = {}
            if 'object_detection' in self.config.heads:
                det_loss = self._detection_loss(cls_outputs, box_outputs,
                                                labels, loss_vals)
                total_loss += det_loss
            if 'segmentation' in self.config.heads:
                seg_loss_layer = self.loss['seg_loss']
                seg_loss = seg_loss_layer(labels['image_masks'], seg_outputs)
                total_loss += seg_loss
                loss_vals['seg_loss'] = seg_loss

            reg_l2_loss = self._reg_l2_loss(self.config.weight_decay)
            loss_vals['reg_l2_loss'] = reg_l2_loss
            total_loss += reg_l2_loss
            if isinstance(self.optimizer,
                          tf.keras.mixed_precision.LossScaleOptimizer):
                scaled_loss = self.optimizer.get_scaled_loss(total_loss)
                optimizer = self.optimizer._optimizer
            else:
                scaled_loss = total_loss
                optimizer = self.optimizer
        compress = get_mixed_precision_policy().compute_dtype == 'float16'
        tape = hvd.DistributedGradientTape(
            tape,
            compression=hvd.Compression.fp16 if compress else hvd.Compression.none)
        loss_vals['loss'] = total_loss
        loss_vals['learning_rate'] = optimizer.learning_rate(
            optimizer.iterations)
        trainable_vars = self._freeze_vars()
        scaled_gradients = tape.gradient(scaled_loss, trainable_vars)
        if isinstance(self.optimizer,
                      tf.keras.mixed_precision.LossScaleOptimizer):
            gradients = self.optimizer.get_unscaled_gradients(scaled_gradients)
        else:
            gradients = scaled_gradients
        if self.config.clip_gradients_norm > 0:
            clip_norm = abs(self.config.clip_gradients_norm)
            gradients = [
                tf.clip_by_norm(g, clip_norm) if g is not None else None
                for g in gradients
            ]
            gradients, _ = tf.clip_by_global_norm(gradients, clip_norm)
            loss_vals['gradient_norm'] = tf.linalg.global_norm(gradients)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        return loss_vals
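# Distilled sketch of the mixed-precision pattern used above: scale the loss
# before taking gradients, allreduce the gradients with fp16 compression, then
# unscale them before applying. `model`, `loss_fn`, `x` and `y` are
# placeholders, and a float16 mixed-precision policy is assumed to be active.
import tensorflow as tf
import horovod.tensorflow as hvd

base_opt = tf.keras.optimizers.SGD(0.01)
opt = tf.keras.mixed_precision.LossScaleOptimizer(base_opt)

with tf.GradientTape() as tape:
    loss = loss_fn(y, model(x, training=True))
    scaled_loss = opt.get_scaled_loss(loss)

tape = hvd.DistributedGradientTape(tape, compression=hvd.Compression.fp16)
scaled_grads = tape.gradient(scaled_loss, model.trainable_variables)
grads = opt.get_unscaled_gradients(scaled_grads)
opt.apply_gradients(zip(grads, model.trainable_variables))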
Example #27
0
    def train_step(self, data):
        """Perform a single training step."""
        start = time.time()
        with tf.GradientTape() as tape:
            x, beta = data
            tape.watch(x)
            states, data = self((x, beta), training=True)
            accept_prob = data.get('accept_prob', None)
            ploss, qloss = self.calc_losses(states, accept_prob)
            loss = ploss + qloss
            if self.aux_weight > 0:
                z = tf.random.normal(x.shape, dtype=x.dtype)
                states_, data_ = self((z, beta), training=True)
                accept_prob_ = data_.get('accept_prob', None)
                ploss_, qloss_ = self.calc_losses(states_, accept_prob_)
                loss += ploss_ + qloss_

        if HAS_HOROVOD:
            tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss, self.trainable_variables)

        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

        metrics = AttrDict({
            'lr': self._get_lr(),
            'dt': time.time() - start,
            'loss': loss,
        })
        if self.plaq_weight > 0 and self.charge_weight > 0:
            metrics.update({'ploss': ploss, 'qloss': qloss})
        if self.aux_weight > 0:
            metrics.update({'ploss_aux': ploss_, 'qloss_aux': qloss_})

        metrics.update({
            'accept_prob': accept_prob,
            'eps': self.eps,
            'beta': states.init.beta,
        })

        if self._verbose:
            metrics.update({
                'Hf_start':
                data.forward.energies[0],
                'Hf_mid':
                data.forward.energies[self.config.num_steps // 2],
                'Hf_end':
                data.forward.energies[-1],
                'Hb_start':
                data.backward.energies[0],
                'Hb_mid':
                data.backward.energies[self.config.num_steps // 2],
                'Hb_end':
                data.backward.energies[-1],
                #  'ld_f_start': data.forward.logdets[0],
                'ld_f_mid':
                data.forward.logdets[self.config.num_steps // 2],
                'ld_f_end':
                data.forward.logdets[-1],
                #  'ld_b_start': data.backward.logdets[0],
                'ld_b_mid':
                data.backward.logdets[self.config.num_steps // 2],
                'ld_b_end':
                data.backward.logdets[-1],
                #  'sumlogdet': sumlogdet.out,
            })

        observables = self.calc_observables(states)
        metrics.update(**observables)

        metrics.update({
            'lr': self._get_lr(),
        })

        # Horovod:
        #    Broadcast initial variable states from rank 0 to all other
        #    processes. This is necessary to ensure consistent initialization
        #    of all workers when training is started with random weights or
        #    restored from a checkpoint.
        # NOTE:
        #    Broadcast should be done after the first gradient step to ensure
        #    optimizer initialization.
        if self.optimizer.iterations == 0 and HAS_HOROVOD and NUM_WORKERS > 1:
            hvd.broadcast_variables(self.variables, root_rank=0)
            hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)

        return states.out.x, metrics
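# Illustrative setup for the HAS_HOROVOD / NUM_WORKERS guards used above; the
# names are assumptions about the surrounding module, which is not shown here.
try:
    import horovod.tensorflow as hvd
    hvd.init()
    HAS_HOROVOD = True
    NUM_WORKERS = hvd.size()
except ImportError:
    HAS_HOROVOD = False
    NUM_WORKERS = 1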
Example #28
0
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    tf.enable_eager_execution(config=config)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(10)
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255,
                 tf.float32), tf.cast(mnist_labels, tf.int64)))
    dataset = dataset.shuffle(1000).batch(32)

    checkpoint_dir = './checkpoints'
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=mnist_model,
                                     optimizer=opt,
                                     step_counter=step_counter)

    # Horovod: adjust number of steps based on number of GPUs.
    for (batch, (images,
                 labels)) in enumerate(dataset.take(20000 // hvd.size())):
        with tf.GradientTape() as tape:
            logits = mnist_model(images, training=True)
            loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        if batch == 0:
            hvd.broadcast_variables(mnist_model.variables, root_rank=0)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, mnist_model.variables)
        opt.apply_gradients(zip(grads, mnist_model.variables),
                            global_step=tf.train.get_or_create_global_step())

        if batch % 10 == 0 and hvd.local_rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting it.
    if hvd.rank() == 0:
        checkpoint.save(checkpoint_dir)
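# Sketch of how a later run might resume from a saved checkpoint and keep all
# workers consistent. `mnist_model` and `opt` are assumed to be built as above,
# and the checkpoints are assumed to have been written as <ckpt_dir>/ckpt-N so
# that tf.train.latest_checkpoint can locate them.
ckpt_dir = './checkpoints'
ckpt = tf.train.Checkpoint(model=mnist_model, optimizer=opt)

latest = tf.train.latest_checkpoint(ckpt_dir)    # e.g. './checkpoints/ckpt-3'
if latest is not None:
    ckpt.restore(latest)
hvd.broadcast_variables(mnist_model.variables, root_rank=0)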
def get_distributed_tape(tape):
    return hvd.DistributedGradientTape(tape)
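# Possible usage of the helper above inside a custom training step; `model`,
# `loss_fn`, `optimizer`, `inputs` and `labels` are placeholders.
with tf.GradientTape() as tape:
    loss = loss_fn(labels, model(inputs, training=True))

tape = get_distributed_tape(tape)
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))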