def model_fn(features, labels, mode, params):
  """Builds the transformer graph and returns the EstimatorSpec for `mode`.

  Args:
    features: Model inputs; passed unchanged to the Transformer.
    labels: Targets for the model. None in PREDICT mode.
    mode: A tf.estimator.ModeKeys value selecting train/eval/predict.
    params: Hyperparameter object; `label_smoothing` and `vocab_size` are
      read from it as attributes, and it is forwarded to the Transformer,
      the eval metrics and `get_train_op`.

  Returns:
    A tf.estimator.EstimatorSpec: predictions only for PREDICT, loss plus
    eval metric ops for EVAL, and loss plus train op otherwise.
  """
  mode_keys = tf.estimator.ModeKeys
  is_training = mode == mode_keys.TRAIN

  with tf.variable_scope("model"):
    network = transformer.Transformer(params, is_training)
    output = network(features, labels)

    # Without targets there is no loss to compute; the raw model output is
    # the prediction.
    if mode == mode_keys.PREDICT:
      return tf.estimator.EstimatorSpec(mode_keys.PREDICT, predictions=output)

    # Weighted mean of the per-position cross entropy.
    per_token_xent, token_weights = metrics.padded_cross_entropy_loss(
        output, labels, params.label_smoothing, params.vocab_size)
    weighted_total = tf.reduce_sum(per_token_xent * token_weights)
    loss = weighted_total / tf.reduce_sum(token_weights)

    if mode != mode_keys.EVAL:
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, train_op=get_train_op(loss, params))

    eval_ops = metrics.get_eval_metrics(output, labels, params)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        predictions={"predictions": output},
        eval_metric_ops=eval_ops)
Beispiel #2
0
def model_fn(features, labels, mode, params):
  """Estimator model function for the transformer.

  Args:
    features: Inputs fed to the Transformer.
    labels: Targets fed to the Transformer and the loss. None in PREDICT mode.
    mode: A tf.estimator.ModeKeys value.
    params: Hyperparameter object exposing `label_smoothing` and `vocab_size`
      as attributes.

  Returns:
    The tf.estimator.EstimatorSpec appropriate for `mode`.
  """
  with tf.variable_scope("model"):
    # The second constructor argument tells the model whether it is training.
    training = mode == tf.estimator.ModeKeys.TRAIN
    net = transformer.Transformer(params, training)
    model_output = net(features, labels)

    # Prediction: no targets are available, so return the output directly.
    if mode == tf.estimator.ModeKeys.PREDICT:
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT, predictions=model_output)

    # Loss: weight-normalised sum of the cross entropy terms.
    xent, mask = metrics.padded_cross_entropy_loss(
        model_output, labels, params.label_smoothing, params.vocab_size)
    numerator = tf.reduce_sum(xent * mask)
    denominator = tf.reduce_sum(mask)
    loss = numerator / denominator

    if mode == tf.estimator.ModeKeys.EVAL:
      spec_kwargs = dict(
          mode=mode,
          loss=loss,
          predictions={"predictions": model_output},
          eval_metric_ops=metrics.get_eval_metrics(
              model_output, labels, params))
    else:
      spec_kwargs = dict(
          mode=mode, loss=loss, train_op=get_train_op(loss, params))
    return tf.estimator.EstimatorSpec(**spec_kwargs)
Beispiel #3
0
def model_fn(features, labels, mode, params):
    """Builds the transformer graph and the EstimatorSpec for `mode`.

    Args:
        features: Inputs passed to the Transformer.
        labels: Targets passed to the Transformer and to the loss. None in
            PREDICT mode.
        mode: A tf.estimator.ModeKeys value.
        params: Hyperparameter object; `label_smoothing` and `vocab_size`
            are read as attributes.

    Returns:
        A tf.estimator.EstimatorSpec for prediction, evaluation or training.
    """
    keys = tf.estimator.ModeKeys

    with tf.variable_scope("model"):
        net = transformer.Transformer(params, mode == keys.TRAIN)
        output = net(features, labels)

        # No targets in prediction mode: the model output is the result.
        if mode == keys.PREDICT:
            return tf.estimator.EstimatorSpec(keys.PREDICT,
                                              predictions=output)

        # xentropy holds the cross entropy loss of every nonpadding token in
        # the targets; dividing by the summed weights gives the weighted mean.
        xentropy, weights = metrics.padded_cross_entropy_loss(
            output, labels, params.label_smoothing, params.vocab_size)
        total_xentropy = tf.reduce_sum(xentropy)
        loss = total_xentropy / tf.reduce_sum(weights)

        # Expose the loss under a fixed name so a logging hook can find it.
        tf.identity(loss, "cross_entropy")

        if mode != keys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode=mode, loss=loss, train_op=get_train_op(loss, params))

        eval_ops = metrics.get_eval_metrics(output, labels, params)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            predictions={"predictions": output},
            eval_metric_ops=eval_ops)
def model_fn(features, labels, mode, params):
    """Builds the transformer graph and returns the spec matching `mode`.

    Args:
        features: Inputs passed to the Transformer.
        labels: Targets passed to the Transformer and to the loss. None in
            PREDICT mode.
        mode: A tf.estimator.ModeKeys value.
        params: Dict of hyperparameters; reads "use_tpu", "label_smoothing",
            "vocab_size" and "model_dir".

    Returns:
        A tf.estimator.EstimatorSpec, or a tf.contrib.tpu.TPUEstimatorSpec
        when params["use_tpu"] is set (EVAL and TRAIN only).

    Raises:
        NotImplementedError: If prediction is requested with
            params["use_tpu"] set.
    """
    mode_keys = tf.estimator.ModeKeys

    with tf.variable_scope("model"):
        network = transformer.Transformer(params, mode == mode_keys.TRAIN)
        logits = network(features, labels)

        # Prediction has no targets, so the model output is returned as-is.
        if mode == mode_keys.PREDICT:
            if params["use_tpu"]:
                raise NotImplementedError(
                    "Prediction is not yet supported on TPUs.")
            export = {"translate": tf.estimator.export.PredictOutput(logits)}
            return tf.estimator.EstimatorSpec(mode_keys.PREDICT,
                                              predictions=logits,
                                              export_outputs=export)

        # XLA (TPU) needs a concrete logits shape because the logits travel
        # back to the host CPU for metric evaluation and [?, ?, vocab_size]
        # is too vague. The leading two dimensions are known to equal those
        # of the targets. The loss itself is unaffected, since
        # padded_cross_entropy_loss resolves the shape on the TPU.
        static_shape = labels.shape.as_list() + logits.shape.as_list()[2:]
        logits.set_shape(static_shape)

        # xentropy holds the cross entropy loss of every nonpadding token in
        # the targets.
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, labels, params["label_smoothing"], params["vocab_size"])
        loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

        # Named copy of the loss for the logging hook.
        tf.identity(loss, "cross_entropy")

        if mode == mode_keys.EVAL:
            if not params["use_tpu"]:
                return tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=loss,
                    predictions={"predictions": logits},
                    eval_metric_ops=metrics.get_eval_metrics(
                        logits, labels, params))
            # Host call functions may only take tensors as arguments, so
            # params is captured by the lambda rather than passed in.
            metric_fn = lambda logits, labels: (metrics.get_eval_metrics(
                logits, labels, params=params))
            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={"predictions": logits},
                eval_metrics=(metric_fn, [logits, labels]))

        train_op, scalars = get_train_op_and_metrics(loss, params)

        # Epochs can be long; the per-minibatch loss gives intermediate
        # information in TensorBoard.
        scalars["minibatch_loss"] = loss

        if params["use_tpu"]:
            host_call = tpu_util.construct_scalar_host_call(
                metric_dict=scalars,
                model_dir=params["model_dir"],
                prefix="training/")
            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   loss=loss,
                                                   train_op=train_op,
                                                   host_call=host_call)

        # Non-TPU path: register each metric as a scalar summary and write
        # the merged summaries every 20 steps via an explicit hook.
        for name, tensor in scalars.items():
            tf.summary.scalar(name=name, tensor=tensor)
            tf.logging.info(name)
        summary_hook = tf.train.SummarySaverHook(
            save_steps=20,
            output_dir=params["model_dir"],
            summary_op=tf.summary.merge_all())
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=[summary_hook])
Beispiel #5
0
def model_fn(features, labels, mode, params):
  """Estimator model function for the transformer, with optional TPU specs.

  Args:
    features: Inputs passed to the Transformer.
    labels: Targets passed to the Transformer and to the loss. None in
      PREDICT mode.
    mode: A tf.estimator.ModeKeys value.
    params: Dict of hyperparameters; reads "use_tpu", "label_smoothing",
      "vocab_size" and "model_dir".

  Returns:
    A tf.estimator.EstimatorSpec, or a tf.contrib.tpu.TPUEstimatorSpec when
    params["use_tpu"] is set (EVAL and TRAIN only).

  Raises:
    NotImplementedError: If prediction is requested with params["use_tpu"]
      set.
  """
  keys = tf.estimator.ModeKeys

  with tf.variable_scope("model"):
    net = transformer.Transformer(params, mode == keys.TRAIN)
    logits = net(features, labels)

    # Prediction has no targets, so the model output is returned directly.
    if mode == keys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      exports = {"translate": tf.estimator.export.PredictOutput(logits)}
      return tf.estimator.EstimatorSpec(
          keys.PREDICT, predictions=logits, export_outputs=exports)

    # XLA (TPU) needs a concrete logits shape because the logits are sent
    # back to the host CPU for metric evaluation and [?, ?, vocab_size] is
    # too vague. The first two dimensions are known to match the targets.
    # The loss is unaffected: padded_cross_entropy_loss resolves the shape
    # on the TPU.
    leading_dims = labels.shape.as_list()
    logits.set_shape(leading_dims + logits.shape.as_list()[2:])

    # xentropy holds the cross entropy loss of every nonpadding token in the
    # targets.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, labels, params["label_smoothing"], params["vocab_size"])
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    # Named copy of the loss for the logging hook.
    tf.identity(loss, "cross_entropy")

    if mode == keys.EVAL:
      if not params["use_tpu"]:
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
      # Host call functions may only take tensors as arguments, so params is
      # captured by the lambda instead of being passed in.
      metric_fn = lambda logits, labels: (
          metrics.get_eval_metrics(logits, labels, params=params))
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metrics=(metric_fn, [logits, labels]))

    train_op, scalars = get_train_op_and_metrics(loss, params)

    # Epochs can be long; the minibatch loss gives intermediate information
    # in TensorBoard.
    scalars["minibatch_loss"] = loss

    if params["use_tpu"]:
      host_call = tpu_util.construct_scalar_host_call(
          metric_dict=scalars, model_dir=params["model_dir"],
          prefix="training/")
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode, loss=loss, train_op=train_op, host_call=host_call)

    record_scalars(scalars)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
Beispiel #6
0
    def model_fn(features, labels, mode, params):
        """Builds one transformer tower per GPU and returns the EstimatorSpec.

        For every device a tower is created inside the shared "model"
        variable scope (variables are reused for every tower after the
        first). Per-variable gradients are averaged across towers, the tower
        losses and predictions are averaged, and an EstimatorSpec is built for
        the requested mode.

        Args:
            features: Model inputs; the same tensor is handed to every tower
                (the batch-splitting call is commented out).
            labels: Model targets; also handed unchanged to every tower.
            mode: A tf.estimator.ModeKeys value.
            params: Dict of hyperparameters; reads "learning_rate",
                "hidden_size", "learning_rate_warmup_steps" and the
                "optimizer_adam_*" entries.

        Returns:
            A tf.estimator.EstimatorSpec for PREDICT, TRAIN or EVAL. Falls
            through and returns None for any other mode.

        Note:
            `flags_obj`, `is_chief`, `local_device_setter`, `_tower_fn`,
            `get_learning_rate` and `record_scalars` are not defined in this
            function; they are presumably supplied by the enclosing scope or
            module -- TODO confirm.
        """
        num_devices = flags_core.get_num_gpus(flags_obj)
        consolidation_device = 'gpu:0'
        #    feature_shards, label_shards = replicate_model_fn._split_batch(features, labels, num_devices, device=consolidation_device)

        # One entry per tower, filled in by the loop below.
        tower_losses = []
        tower_gradvars = []
        tower_preds = []
        for i in range(num_devices):
            worker_device = '/{}:{}'.format('gpu', i)
            device_setter = local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_devices, tf.contrib.training.byte_size_load_fn))
            # Tower 0 creates the variables; later towers reuse them.
            with tf.variable_scope('model', reuse=bool(i != 0)):
                with tf.name_scope('tower_%d' % i) as name_scope:
                    with tf.device(device_setter):
                        # Create model and get output logits.
                        model = transformer.Transformer(
                            params, mode == tf.estimator.ModeKeys.TRAIN)
                        #logits = model(features, labels)
                        loss, gradvars, preds = _tower_fn(model,
                                                          features,
                                                          labels,
                                                          params=params)
                        tower_losses.append(loss)
                        tower_gradvars.append(gradvars)
                        tower_preds.append(preds)

        # Compute global loss and gradients
        gradvars = []
        with tf.name_scope('gradient_averaging'):
            # Group the non-None gradients of all towers by variable.
            all_grads = {}
            for grad, var in itertools.chain(*tower_gradvars):
                if grad is not None:
                    all_grads.setdefault(var, []).append(grad)
            for var, grads in six.iteritems(all_grads):
                # Average on the device that holds the variable.
                with tf.device(var.device):
                    if len(grads) == 1:
                        avg_grad = grads[0]
                    else:
                        #            for a in range(len(grads)):
                        #              if len(grads[a]) > 1:
                        #                avg_grad = tf.multiply(tf.add_n(grads[a]), 1. / len(grads[a]))
                        #                gradvars.append((avg_grad, var))
                        avg_grad = tf.multiply(tf.add_n(grads),
                                               1. / len(grads))


#          print("AVG_GRAD: ", avg_grad, "VAR: ", var)
                    gradvars.append((avg_grad, var))

        with tf.device(consolidation_device):
            loss = tf.reduce_mean(tower_losses, name='loss')
            # Named copy of the loss ("cross_entropy").
            tf.identity(loss, "cross_entropy")
            # Predictions are averaged element-wise over towers (axis 0 is the
            # tower axis of the stacked list), not concatenated.
            logits = tf.reduce_mean(tower_preds, axis=0)
            #      logits = tf.concat([l for l in tower_preds], axis=0)
            if mode == tf.estimator.ModeKeys.PREDICT:
                return tf.estimator.EstimatorSpec(
                    tf.estimator.ModeKeys.PREDICT,
                    predictions=logits,
                    export_outputs={
                        "translate": tf.estimator.export.PredictOutput(logits)
                    })

            if mode == tf.estimator.ModeKeys.TRAIN:
                with tf.variable_scope("get_train_op"):
                    print("in get_train_op")
                    learning_rate = get_learning_rate(
                        learning_rate=params["learning_rate"],
                        hidden_size=params["hidden_size"],
                        learning_rate_warmup_steps=params[
                            "learning_rate_warmup_steps"])
                    optimizer = tf.contrib.opt.LazyAdamOptimizer(
                        learning_rate,
                        beta1=params["optimizer_adam_beta1"],
                        beta2=params["optimizer_adam_beta2"],
                        epsilon=params["optimizer_adam_epsilon"])
                    # Wrap the optimizer so updates are synchronised across
                    # num_devices replicas; its session hook is attached to
                    # the returned spec below.
                    optimizer = tf.train.SyncReplicasOptimizer(
                        optimizer, replicas_to_aggregate=num_devices)
                    sync_hook = optimizer.make_session_run_hook(is_chief)
                    #          update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

                    # NOTE(review): the global step is incremented here by an
                    # explicit assign AND passed as global_step to
                    # apply_gradients below, so it may advance twice per
                    # training step -- confirm this is intended.
                    global_step = tf.train.get_global_step()
                    update_ops = tf.assign(global_step,
                                           global_step + 1,
                                           name='update_global_step')
                    minimize_op = optimizer.apply_gradients(
                        gradvars, global_step=tf.train.get_global_step())
                    train_op = tf.group(minimize_op, update_ops)
                    #train_op = [optimizer.apply_gradients(gradvars, global_step=tf.train.get_global_step())]
                    metric_dict = {"learning_rate": learning_rate}
                    metric_dict["minibatch_loss"] = loss
                    record_scalars(metric_dict)
                    return tf.estimator.EstimatorSpec(
                        mode=mode,
                        loss=loss,
                        training_hooks=[sync_hook],
                        train_op=train_op)
            elif mode == tf.estimator.ModeKeys.EVAL:
                return tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=loss,
                    predictions={"predictions": logits},
                    eval_metric_ops=metrics.get_eval_metrics(
                        logits, labels, params))
    def model_fn(features, labels, mode, params):
        """Builds a multi-GPU transformer graph with manual gradient reduction.

        A single Transformer object is shared by all towers. For each GPU a
        tower is built with `create_tower_network`, and that GPU's own
        LazyAdamOptimizer computes the tower gradients. Gradients are split
        into a "sparse" and a "dense" group, averaged across towers (dense
        ones through an NCCL all-sum), and applied per GPU. Tower losses and
        logits are averaged for the returned spec.

        Args:
            features: Model inputs; passed unchanged to every tower (the
                batch-splitting calls are commented out).
            labels: Model targets; passed unchanged to every tower.
            mode: A tf.estimator.ModeKeys value.
            params: Dict of hyperparameters; reads "learning_rate",
                "hidden_size", "learning_rate_warmup_steps", "dtype" and the
                "optimizer_adam_*" entries.

        Returns:
            A tf.estimator.EstimatorSpec for TRAIN, EVAL or PREDICT. Falls
            through and returns None for any other mode.

        Note:
            The gradient/optimizer graph is built for every mode, before the
            mode is inspected. `flags_obj`, `is_chief`, `device_index`,
            `num_devices`, `ops`, `local_device_setter`,
            `create_tower_network` and `get_learning_rate` are not defined in
            this function; they are presumably supplied by the enclosing
            scope or module -- TODO confirm.
        """
        num_gpus = flags_core.get_num_gpus(flags_obj)
        print("num_gpus: ", num_gpus)
        #    num_gpus=params["num_gpus"]

        learning_rate = get_learning_rate(
            learning_rate=params["learning_rate"],
            hidden_size=params["hidden_size"],
            learning_rate_warmup_steps=params["learning_rate_warmup_steps"])
        # One optimizer instance per GPU, all sharing the same learning rate.
        optimizers = [
            tf.contrib.opt.LazyAdamOptimizer(
                learning_rate,
                beta1=params["optimizer_adam_beta1"],
                beta2=params["optimizer_adam_beta2"],
                epsilon=params["optimizer_adam_epsilon"])
            for _ in range(num_gpus)
        ]

        # For fp16 runs, wrap every optimizer with the mixed-precision graph
        # rewrite.
        if params["dtype"] == "fp16":
            optimizers = [
                tf.train.experimental.enable_mixed_precision_graph_rewrite(
                    optimizer) for optimizer in optimizers
            ]

#    feature_shards, label_shards = replicate_model_fn._split_batch(features, labels, num_gpus, device=consolidation_device)
#    feature_shards, label_shards = split_batch(features, labels, num_gpus)

        model = transformer.Transformer(params,
                                        mode == tf.estimator.ModeKeys.TRAIN)
        # Per-tower results: filtered (grad, var) lists, losses and logits.
        grad_list = []
        losses = []
        logits = []
        for gpu_idx in range(num_gpus):
            device_setter = local_device_setter(
                ps_device_type='cpu', worker_device='/gpu:{}'.format(gpu_idx))
            with tf.device(device_setter):
                #      with tf.device(tf.DeviceSpec(device_type="GPU", device_index=gpu_idx)), tf.variable_scope('tower%d'%gpu_idx):
                #with tf.device(tf.compat.v1.train.replica_device_setter(cluster=cluster_spec)):
                logit, loss = create_tower_network(model, params, features,
                                                   labels)
                #        feature_shard, label_shard = next(iterator)
                #        logit, loss = create_tower_network(model, params, features, labels)
                logits.append(logit)
                losses.append(loss)
                # Keep only the pairs whose first element is not None.
                grad_list.append([
                    x for x in optimizers[gpu_idx].compute_gradients(loss)
                    if x[0] is not None
                ])

#    output_train = tf.concat(logits, axis=0)
        # Tower outputs are averaged, not concatenated.
        output_train = tf.reduce_mean(logits, axis=0)
        loss_train = tf.reduce_mean(losses, name='loss')

        #    grads = []
        #    all_vars= []
        # Split every tower's pairs into a sparse group and a dense group.
        sparse_grads = []
        sparse_vars = []
        dense_grads = []
        dense_vars = []
        for tower in grad_list:
            sp_grad = []
            sp_var = []
            dn_grad = []
            dn_var = []
            for x in tower:
                # NOTE(review): the type check is on x[1], while the None
                # filter above is on x[0]. If x is a (gradient, variable)
                # pair, x[1] is the variable and this branch may never be
                # taken -- confirm whether x[0] was intended.
                if isinstance(x[1], ops.IndexedSlices):
                    sp_grad.append(x[0])
                    sp_var.append(x[1])
                else:
                    dn_grad.append(x[0])
                    dn_var.append(x[1])

            if (len(sp_var) > 0):
                sparse_grads.append(sp_grad)
                sparse_vars.append(sp_var)
            if (len(dn_var) > 0):
                dense_grads.append(dn_grad)
                dense_vars.append(dn_var)

        #SPARSE


#    for var, grad in zip(sparse_vars, sparse_grads):
#      if len(grad) == 1:
#        avg_grad = grad
#      else:
#        avg_grad = tf.multiply(tf.add_n(grad), 1. /len(grad))
#      gradvars.append((avg_grad, var))
        # `gradvars` is only bound when at least one sparse entry was found;
        # its only later use is guarded by the same len(sparse_vars) check.
        if len(sparse_vars) > 0:
            if num_gpus == 1:
                reduced_grad = sparse_grads
            else:
                new_all_grads = []
                for grad in sparse_grads:
                    new_grads = []
                    for tower_grad in grad:
                        new_grads.append(tower_grad)
                    summed = tf.add_n(new_grads)
                    grads_for_devices = []
                    for g in summed:
                        with tf.device(g.device):
                            g = tf.multiply(g,
                                            1.0 / num_gpus,
                                            name='allreduce_avg')
                        grads_for_devices.append(g)
                    new_all_grads.append(grads_for_devices)
                reduced_grad = list(zip(*new_all_grads))
            gradvars = [
                list(zip(gs, vs)) for gs, vs in zip(reduced_grad, sparse_vars)
            ]

        #DENSE
        reduced_grad = []
        from tensorflow.python.ops import nccl_ops
        if num_gpus == 1:
            reduced_grad = dense_grads
        else:
            new_all_grads = []
            for grad in dense_grads:
                # Sum the list with NCCL, then scale each result by
                # 1 / num_gpus on its own device to turn the sum into a mean.
                summed = nccl_ops.all_sum(grad)
                grads_for_devices = []
                for g in summed:
                    with tf.device(g.device):
                        g = tf.multiply(g,
                                        1.0 / num_gpus,
                                        name='allreduce_avg')
                    grads_for_devices.append(g)
                new_all_grads.append(grads_for_devices)
            reduced_grad = list(zip(*new_all_grads))

        grads = [list(zip(gs, vs)) for gs, vs in zip(reduced_grad, dense_vars)]

        #apply gradients to each GPU by broadcasting summed gradient
        train_ops = []
        for idx, grad_and_vars in enumerate(grads):
            with tf.name_scope('apply_gradients'), tf.device(
                    tf.DeviceSpec(device_type="GPU", device_index=idx)):
                # A new global-step increment op is created on every
                # iteration; only the last one is used after the loop.
                global_step = tf.train.get_global_step()
                update_ops = tf.assign(global_step,
                                       global_step + 1,
                                       name='update_global_step')
                #update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope='tower%d'%idx)
                #with tf.control_dependencies(update_ops):
                train_ops.append(optimizers[idx].apply_gradients(
                    grad_and_vars, name='apply_grad_{}'.format(idx)))

                #SPARSE
                # NOTE(review): `device_index` is not assigned anywhere in
                # this function (the loop variable is `idx`), and neither are
                # `num_devices` or `is_chief` below -- confirm they exist in
                # an enclosing scope, otherwise this raises NameError.
                if device_index == 0 and len(sparse_vars) > 0:
                    learning_rate = get_learning_rate(
                        learning_rate=params["learning_rate"],
                        hidden_size=params["hidden_size"],
                        learning_rate_warmup_steps=params[
                            "learning_rate_warmup_steps"])
                    optimizer = tf.contrib.opt.LazyAdamOptimizer(
                        learning_rate,
                        beta1=params["optimizer_adam_beta1"],
                        beta2=params["optimizer_adam_beta2"],
                        epsilon=params["optimizer_adam_epsilon"])
                    optimizer = tf.train.SyncReplicasOptimizer(
                        optimizer, replicas_to_aggregate=num_devices)
                    # NOTE(review): sync_hook is created but never passed to
                    # any EstimatorSpec returned below.
                    sync_hook = optimizer.make_session_run_hook(is_chief)

                    minimize_op = optimizer.apply_gradients(
                        gradvars, global_step=tf.train.get_global_step())
                    train_ops.append(minimize_op)

        # `update_ops` is bound inside the loop above, so this line requires
        # `grads` to be non-empty.
        optimize_op = tf.group(update_ops, *train_ops, name='train_op')
        # train_metrics is assigned but not used afterwards in this function.
        train_metrics = {"learning_rate": learning_rate}

        # Named copy of the averaged loss ("cross_entropy").
        tf.identity(loss_train, "cross_entropy")

        if mode == tf.estimator.ModeKeys.TRAIN:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss_train,
                                              train_op=optimize_op)
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss_train,
                predictions={"predictions": output_train},
                eval_metric_ops=metrics.get_eval_metrics(
                    output_train, labels, params))
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=output_train,
                export_outputs={
                    "translate":
                    tf.estimator.export.PredictOutput(output_train)
                })
def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer model.

    This variant uses `transformer3.Transformer`, which returns the logits
    together with a latent sample and the mean / log-variance pairs of a
    prior and a recognition distribution. During training the total loss is
    the prediction loss plus a weighted KL term and, optionally, a
    bag-of-words loss; in evaluation only the prediction loss is used.

    Args:
        features: Inputs passed to the model.
        labels: Targets passed to the model and to the losses. None in
            PREDICT mode.
        mode: A tf.estimator.ModeKeys value.
        params: Dict of hyperparameters; reads "use_tpu", "label_smoothing",
            "vocab_size", "word_avg", "use_std", "kl_weight",
            "full_kl_steps", "use_bow" and "model_dir".

    Returns:
        A tf.estimator.EstimatorSpec, or a tf.contrib.tpu.TPUEstimatorSpec
        when params["use_tpu"] is set (EVAL and TRAIN only).

    Raises:
        NotImplementedError: If prediction is requested with
            params["use_tpu"] set.
    """
    with tf.variable_scope("model"):
        inputs, targets = features, labels

        # Create model and get output logits.
        train = (mode == tf.estimator.ModeKeys.TRAIN)
        model = transformer3.Transformer(params, train)

        (logits, latent_sample, prior_mu, prior_logvar, recog_mu,
         recog_logvar) = model(inputs, targets)

        # When in prediction mode, the labels/targets is None. The model output
        # is the prediction
        if mode == tf.estimator.ModeKeys.PREDICT:
            if params["use_tpu"]:
                raise NotImplementedError(
                    "Prediction is not yet supported on TPUs.")
            return tf.estimator.EstimatorSpec(
                tf.estimator.ModeKeys.PREDICT,
                predictions=logits,
                export_outputs={
                    "translate": tf.estimator.export.PredictOutput(logits)
                })

        # Explicitly set the shape of the logits for XLA (TPU). This is needed
        # because the logits are passed back to the host VM CPU for metric
        # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
        # it is known from Transformer that the first two dimensions of logits
        # are the dimensions of targets. Note that the ambiguous shape of logits is
        # not a problem when computing xentropy, because padded_cross_entropy_loss
        # resolves the shape on the TPU.
        logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

        # Calculate model loss.
        # xentropy contains the cross entropy loss of every nonpadding token in the
        # targets.
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, targets, params["label_smoothing"], params["vocab_size"])
        # size:
        #   xentropy: [batch_size, max(length_logits, length_labels)]
        #   weights:  [batch_size, max(length_logits, length_labels)], 0 or 1

        real_batch_size = tf.to_float(tf.shape(logits)[0])  # get batch_size

        if params["word_avg"]:
            # 1. first average in sentence by word;
            # 2. then average in batch by sample.
            predict_loss_avg_in_sentence = tf.reduce_sum(
                xentropy, axis=1) / tf.reduce_sum(weights, axis=1)
            predict_loss = tf.reduce_sum(
                predict_loss_avg_in_sentence) / real_batch_size
        else:
            # Sum over all tokens, averaged over the batch only.
            predict_loss = tf.reduce_sum(xentropy) / real_batch_size

        if train:  # train mode
            # if use gaussian_kld_v2, the meaning of 'logvar' becomes standard deviation.
            if params["use_std"]:
                kl_loss = gaussian_kld_v2(recog_mu, recog_logvar, prior_mu,
                                          prior_logvar)
            else:
                kl_loss = gaussian_kld(recog_mu, recog_logvar, prior_mu,
                                       prior_logvar)
            kl_loss = tf.reduce_sum(kl_loss) / real_batch_size
            tf.identity(kl_loss, "kl_loss")

            # KL annealing: the weight depends on the global step relative to
            # params["full_kl_steps"].
            if params["kl_weight"] == 'sigmoid':
                scaled_x = (tf.to_float(tf.train.get_or_create_global_step()) /
                            params["full_kl_steps"] -
                            0.5) * 20.0  # sigmoid weight
                kl_loss_weight = 1.0 / (1 + tf.exp(-scaled_x))
            elif params["kl_weight"] == 'linear':
                # Fixed: this branch previously assigned `kl_loss_weights`
                # (typo), leaving `kl_loss_weight` unbound and raising
                # NameError below whenever linear annealing was selected.
                kl_loss_weight = tf.minimum(
                    (tf.to_float(tf.train.get_or_create_global_step()) /
                     params["full_kl_steps"]), 1.0)  # linear weight
            else:
                # Any other setting disables annealing (constant weight).
                kl_loss_weight = 1.0

            weighted_kl_loss = kl_loss * kl_loss_weight
            tf.identity(weighted_kl_loss, "weighted_kl_loss")
            tf.identity(kl_loss_weight, "kl_loss_weight")
            if params["use_bow"]:
                bow_loss = compute_bow_loss(latent_sample, targets, params,
                                            train)
                loss = predict_loss + weighted_kl_loss + bow_loss
                tf.identity(bow_loss, "bow_loss")
            else:
                loss = predict_loss + weighted_kl_loss
        else:  # eval and infer modes
            loss = predict_loss

        # Save loss as named tensor that will be logged with the logging hook.
        tf.identity(predict_loss, "predict_loss")
        tf.identity(loss, "cross_entropy")  # total loss

        if mode == tf.estimator.ModeKeys.EVAL:
            if params["use_tpu"]:
                # host call functions should only have tensors as arguments.
                # functools.partial() pre-populates params so that metric_fn is
                # TPUEstimator compliant.
                metric_fn = functools.partial(metrics.get_eval_metrics,
                                              params=params)
                eval_metrics = (metric_fn, [logits, labels])
                return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    predictions={"predictions": logits},
                    eval_metrics=eval_metrics)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={"predictions": logits},
                eval_metric_ops=metrics.get_eval_metrics(
                    logits, labels, params))
        else:
            # Only TRAIN reaches this branch (PREDICT and EVAL returned
            # above), so the KL tensors defined under `if train:` exist here.
            train_op, metric_dict = get_train_op_and_metrics(loss, params)

            # Epochs can be quite long. This gives some intermediate information
            # in TensorBoard.
            metric_dict["predict_loss"] = predict_loss
            metric_dict["kl_loss"] = kl_loss
            if params["use_bow"]:
                metric_dict["bow_loss"] = bow_loss
            if params["kl_weight"]:
                metric_dict["weighted_kl_loss"] = weighted_kl_loss
                metric_dict["kl_loss_weight"] = kl_loss_weight

            if params["use_tpu"]:
                return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    train_op=train_op,
                    host_call=tpu_util.construct_scalar_host_call(
                        metric_dict=metric_dict,
                        model_dir=params["model_dir"],
                        prefix="training/"))
            record_scalars(metric_dict)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)
Beispiel #9
0
def model_fn(features, labels, mode, params):
  """Builds the Transformer graph and returns the spec for the requested mode.

  Args:
    features: Model inputs; handed to the Transformer unchanged.
    labels: Targets; handed to the Transformer as its second argument and used
      for the loss and the evaluation metrics.
    mode: One of the tf.estimator.ModeKeys values.
    params: Hyperparameter mapping. Keys read here: "concrete_heads",
      "concrete_coef", "use_tpu", "label_smoothing", "vocab_size" and (TPU
      training only) "model_dir".

  Returns:
    A tf.estimator.EstimatorSpec, or a tf.contrib.tpu.TPUEstimatorSpec when
    params["use_tpu"] is truthy in EVAL/TRAIN mode.

  Raises:
    NotImplementedError: If mode is PREDICT and params["use_tpu"] is truthy.
  """
  with tf.variable_scope("model"):
    inputs, targets = features, labels

    # Defaults for the tensors that are exported by name further down; they
    # are only replaced by real values when the concrete-gate term is enabled.
    concrete_loss = tf.constant(0)
    total_loss = tf.constant(0)
    concrete_reg = tf.constant(0)
    sparsity_rate = tf.constant(0)
    gate_values = tf.constant(0)

    print("**** concrete heads has this : {} ****".format(params["concrete_heads"]))
    concrete_coef = params["concrete_coef"]
    gates_enabled = concrete_coef != 0
    if gates_enabled:
      # Empty the collections that are averaged into the gate penalty below
      # before the model gets a chance to add to them.
      graph = tf.get_default_graph()
      graph.clear_collection("CONCRETE")
      graph.clear_collection(tf.GraphKeys.REGULARIZATION_LOSSES)

    # Create model and get output logits.
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
    logits = model(inputs, targets)

    # In prediction mode there are no targets and the model output is indexed
    # by "outputs" and "scores"; it is returned directly as the prediction.
    if mode == tf.estimator.ModeKeys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      print("Logits", logits)
      predictions = {"outputs": logits["outputs"], "scores": logits["scores"]}
      return tf.estimator.EstimatorSpec(
          mode=tf.estimator.ModeKeys.PREDICT, predictions=predictions)

    # Pin the static shape of the logits for XLA (TPU): they travel back to
    # the host CPU for metric evaluation and [?, ?, vocab_size] is too vague.
    # The two leading dimensions are taken from the targets. The loss itself
    # does not need this, since padded_cross_entropy_loss resolves the shape.
    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

    # Summed cross entropy divided by the summed weights returned alongside it.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params["label_smoothing"], params["vocab_size"])
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    # Named copy of the loss so that a logging hook can look it up.
    tf.identity(loss, "cross_entropy")

    if gates_enabled:
      sparsity_rate = tf.reduce_mean(tf.get_collection("CONCRETE"))
      concrete_reg = tf.reduce_mean(
          tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
      concrete_loss = concrete_coef * tf.reduce_mean(concrete_reg)
      total_loss = loss + concrete_loss
      gate_values = tf.get_collection("GATEVALUES")

    # Export the gate diagnostics under fixed names in both configurations, so
    # the same tensor names exist whether or not the penalty is active.
    named_tensors = (
        (concrete_loss, "concrete_loss"),
        (total_loss, "total_loss"),
        (concrete_reg, "concrete_reg"),
        (sparsity_rate, "sparsity_rate"),
        (gate_values, "gate_values"),
    )
    for tensor, tensor_name in named_tensors:
      tf.identity(tensor, tensor_name)

    if gates_enabled:
      # From here on the optimised/reported loss includes the gate penalty.
      loss = total_loss

    if mode == tf.estimator.ModeKeys.EVAL:
      if not params["use_tpu"]:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            predictions={"predictions": logits},
            eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))

      # Host call functions should only have tensors as arguments, so params
      # is captured by closure to keep metric_fn TPUEstimator compliant.
      def metric_fn(logits, labels):
        return metrics.get_eval_metrics(logits, labels, params=params)

      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=loss,
          predictions={"predictions": logits},
          eval_metrics=(metric_fn, [logits, labels]))

    train_op, metric_dict = get_train_op_and_metrics(loss, params)

    # Epochs can be quite long. This gives some intermediate information
    # in TensorBoard.
    metric_dict["minibatch_loss"] = loss

    if params["use_tpu"]:
      host_call = tpu_util.construct_scalar_host_call(
          metric_dict=metric_dict,
          model_dir=params["model_dir"],
          prefix="training/")
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode, loss=loss, train_op=train_op, host_call=host_call)

    record_scalars(metric_dict)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
Beispiel #10
0
def model_fn(features, mode, params):
    """Builds the Transformer graph and returns the EstimatorSpec for `mode`.

    Args:
        features: Dict with a 'question' entry and, for TRAIN/EVAL, an
            'answer' entry. The dict is not modified.
        mode: One of the tf.estimator.ModeKeys values.
        params: Mapping of overrides applied on top of a copy of
            `Transformer_params`. Keys read after merging: "label_smoothing",
            "vocab_size", "learning_rate", "hidden_size",
            "learning_rate_warmup_steps", "optimizer_adam_beta1",
            "optimizer_adam_beta2", "optimizer_adam_epsilon", "model_dir",
            "batch_size" and "use_tpu".

    Returns:
        A tf.estimator.EstimatorSpec. TRAIN carries the loss, train op,
        logging hooks and metric ops; EVAL carries the loss and metric ops;
        any other mode carries the logits as predictions plus a summary
        saver hook.
    """
    # Overlay the caller-supplied values on a copy of the default
    # hyperparameters so that the defaults themselves are never modified.
    merged_params = Transformer_params.copy()
    for key in params:
        merged_params[key] = params[key]
    params = merged_params

    # There is no answer at prediction time. A local is used instead of
    # writing None back into the caller's features dict.
    if mode == tf.estimator.ModeKeys.PREDICT:
        answer = None
    else:
        answer = features['answer']

    # Define the transformer.
    transformer = Transformer(params, (mode == tf.estimator.ModeKeys.TRAIN))
    logits = transformer(features['question'], answer)

    if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:

        # Loss between the network output logits and the actual answer.
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, answer, params["label_smoothing"], params["vocab_size"])
        loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

        # Minimise the loss.
        # NOTE(review): the optimizer and train op are also built in EVAL
        # mode, where they are not used by the returned spec.
        learning_rate = get_learning_rate(params['learning_rate'],
                                          params['hidden_size'],
                                          params['learning_rate_warmup_steps'])
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate,
            beta1=params['optimizer_adam_beta1'],
            beta2=params['optimizer_adam_beta2'],
            epsilon=params['optimizer_adam_epsilon'])
        train_op = optimizer.minimize(loss,
                                      global_step=tf.train.get_global_step())

        # Show the argmax of the first example's logits next to its answer
        # every 100 iterations.
        logging_hook = tf.train.LoggingTensorHook(
            {
                "logitmax": tf.argmax(logits[0], -1),
                "answer": answer[0]
            },
            every_n_iter=100)

        # Compute the evaluation metrics (accuracy, BLEU score, ...) and
        # expose each one both as a summary and as a logged tensor.
        eval_metric_ops = metrics.get_eval_metrics(logits, answer, params)
        tensors_to_log = {}
        for metric_name, metric_ops in eval_metric_ops.items():
            tag = metric_name.split('/')[-1]
            # Index 1 of each metric entry is used for both the logged name
            # and the summary, exactly as before.
            tensors_to_log[tag] = metric_ops[1].name
            tf.summary.scalar(tag, metric_ops[1])

        # Add the learning rate to the metrics collected above. Previously
        # the dict was reassigned here, which silently dropped every metric
        # from the logging hook.
        tensors_to_log['learning_rate'] = learning_rate
        tf.summary.scalar('learning_rate', learning_rate)

        train_hooks = hooks_helper.get_train_hooks(
            ['LoggingTensorHook'],
            model_dir=params['model_dir'],
            tensors_to_log=tensors_to_log,
            batch_size=params['batch_size'],
            use_tpu=params["use_tpu"])

        # Train.
        if mode == tf.estimator.ModeKeys.TRAIN:
            return tf.estimator.EstimatorSpec(mode,
                                              loss=loss,
                                              train_op=train_op,
                                              predictions=logits,
                                              training_hooks=[logging_hook] +
                                              train_hooks,
                                              eval_metric_ops=eval_metric_ops)
        # Evaluate.
        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          predictions=logits,
                                          eval_metric_ops=eval_metric_ops)

    # Predict: summaries are saved at prediction time as well.
    summary_hook = tf.train.SummarySaverHook(
        save_secs=1000,
        output_dir='./output/ckpt/pred',
        scaffold=tf.train.Scaffold(summary_op=tf.summary.merge_all()))

    return tf.estimator.EstimatorSpec(mode,
                                      predictions=logits,
                                      prediction_hooks=[summary_hook])
  def model_fn(features, labels, mode, params):
    """Builds one Transformer tower per GPU and returns the spec for `mode`.

    Each tower computes logits, a loss and its gradients; the per-tower
    gradients are aggregated with NCCL and applied by a per-tower optimizer.

    Args:
      features: Model inputs; every tower receives the same object.
      labels: Targets; every tower receives the same object.
      mode: One of the tf.estimator.ModeKeys values.
      params: Hyperparameter mapping. Keys read here: "learning_rate",
        "hidden_size", "learning_rate_warmup_steps", "optimizer_adam_beta1",
        "optimizer_adam_beta2", "optimizer_adam_epsilon" and "dtype".

    Returns:
      A tf.estimator.EstimatorSpec for TRAIN, EVAL or PREDICT; None for any
      other mode value.
    """
    # NOTE(review): the tower count is hard-coded rather than derived from the
    # enclosing `cluster` -- confirm this matches the deployment.
    num_gpus = 2
    learning_rate = get_learning_rate(
        learning_rate=params["learning_rate"],
        hidden_size=params["hidden_size"],
        learning_rate_warmup_steps=params["learning_rate_warmup_steps"])
    optimizers = [
        tf.contrib.opt.LazyAdamOptimizer(
            learning_rate,
            beta1=params["optimizer_adam_beta1"],
            beta2=params["optimizer_adam_beta2"],
            epsilon=params["optimizer_adam_epsilon"])
        for _ in range(num_gpus)
    ]

    if params["dtype"] == "fp16":
      optimizers = [
          tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
          for optimizer in optimizers
      ]

    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
    grad_list = []
    losses = []
    logits = []
    # NOTE(review): the towers and their gradients are built for every mode,
    # not only TRAIN; create_tower_network is presumably expected to cope with
    # whatever `labels` is in EVAL/PREDICT -- verify against its definition.
    for gpu_idx in range(num_gpus):
      device_setter = local_device_setter(
          cluster, worker_device="gpu:%d" % gpu_idx)
      with tf.device(device_setter):
        logit, loss = create_tower_network(model, params, features, labels)
        logits.append(logit)
        losses.append(loss)
        # Keep only the (gradient, variable) pairs that have a gradient.
        grad_list.append([
            x for x in optimizers[gpu_idx].compute_gradients(loss)
            if x[0] is not None
        ])

    # The per-tower logits and losses are averaged element-wise.
    output_train = tf.reduce_mean(logits, axis=0)
    loss_train = tf.reduce_mean(losses, name='loss')

    from tensorflow.python.distribute import cross_device_utils
    grads = cross_device_utils.aggregate_gradients_using_nccl(grad_list)

    # Build the global-step increment once. It used to be re-created inside
    # the loop below, leaving one unused assign op per extra tower and leaving
    # the name undefined if the loop body never ran.
    global_step = tf.train.get_global_step()
    increment_global_step = tf.assign(
        global_step, global_step + 1, name='update_global_step')

    # Apply the aggregated gradients on each GPU with that tower's optimizer.
    train_ops = []
    for idx, grad_and_vars in enumerate(grads):
      with tf.name_scope('apply_gradients'), tf.device(
          tf.DeviceSpec(device_type="GPU", device_index=idx)):
        train_ops.append(
            optimizers[idx].apply_gradients(
                grad_and_vars, name='apply_grad_{}'.format(idx)))
    optimize_op = tf.group(increment_global_step, *train_ops, name='train_op')

    # Named copy of the averaged loss so that it can be looked up by name.
    tf.identity(loss_train, "cross_entropy")

    if mode == tf.estimator.ModeKeys.TRAIN:
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss_train, train_op=optimize_op)
    if mode == tf.estimator.ModeKeys.EVAL:
      return tf.estimator.EstimatorSpec(
          mode=mode,
          loss=loss_train,
          predictions={"predictions": output_train},
          eval_metric_ops=metrics.get_eval_metrics(
              output_train, labels, params))
    if mode == tf.estimator.ModeKeys.PREDICT:
      return tf.estimator.EstimatorSpec(
          mode=mode,
          predictions=output_train,
          export_outputs={
              "translate": tf.estimator.export.PredictOutput(output_train)
          })