Example #1
0
def create_or_load_hparams(default_hparams, hparams_path):
  """Build the effective hparams from defaults plus optional overrides.

  Args:
    default_hparams: base HParams to start from.
    hparams_path: optional path to a standard-hparams file whose values
      override the defaults (handled by utils.maybe_parse_standard_hparams).

  Returns:
    The extended HParams object; it is also printed for logging.
  """
  merged = utils.maybe_parse_standard_hparams(default_hparams, hparams_path)
  final_hparams = extend_hparams(merged)
  # Log the effective configuration before handing it back.
  utils.print_hparams(final_hparams)
  return final_hparams
Example #2
0
def create_or_load_hparams(out_dir,
                           default_hparams,
                           hparams_path,
                           save_hparams=True):
    """Create hparams from the defaults (never loads from out_dir).

    NOTE(review): the load-from-out_dir branch was disabled by the original
    author; it survived as a statement-level triple-quoted string that was
    evaluated and discarded on every call. That dead code has been removed.

    Args:
        out_dir: directory the resulting hparams are saved to when
            save_hparams is True.
        default_hparams: base HParams to start from.
        hparams_path: optional path to a standard-hparams file whose values
            override the defaults.
        save_hparams: whether to persist the hparams to out_dir and to each
            per-metric best-checkpoint directory.

    Returns:
        The fully extended HParams object.
    """
    print('[new hparams]\n')
    hparams = default_hparams
    hparams = utils.maybe_parse_standard_hparams(hparams, hparams_path)
    hparams = extend_hparams(hparams)

    # Save HParams
    if save_hparams:
        utils.save_hparams(out_dir, hparams)
        for metric in hparams.metrics:
            utils.save_hparams(getattr(hparams, "best_" + metric + "_dir"),
                               hparams)

    # Print HParams
    utils.print_hparams(hparams)
    return hparams
def create_or_load_hparams(load_dir, default_hparams, hparams_path,
                           save_hparams):
    """Load hparams from load_dir when present, else build from defaults.

    Either way, input paths are post-processed via process_input_path. When
    save_hparams is True the result is persisted to default_hparams.out_dir
    and to each per-metric best-checkpoint directory.
    """
    hparams = utils.load_hparams(load_dir)
    if hparams:
        # Reconcile previously saved values with the current defaults.
        hparams = ensure_compatible_hparams(hparams, default_hparams,
                                            hparams_path)
        hparams = process_input_path(hparams)
    else:
        # Fresh run: defaults, then standard overrides, then extension.
        hparams = utils.maybe_parse_standard_hparams(default_hparams,
                                                     hparams_path)
        hparams = process_input_path(hparams)
        hparams = extend_hparams(hparams)

    if save_hparams:
        utils.save_hparams(default_hparams.out_dir, hparams)
        for metric in hparams.metrics:
            best_dir = getattr(hparams, "best_" + metric + "_dir")
            utils.save_hparams(best_dir, hparams)

    # Log the effective configuration.
    utils.print_hparams(hparams)
    return hparams
def create_or_load_hparams(out_dir, default_hparams, flags):
    """Create hparams or load hparams from out_dir.

    All load/save/print verbosity is suppressed when flags.chat is set.
    """
    verbose = not flags.chat
    hparams = utils.load_hparams(out_dir, verbose=verbose)
    if hparams:
        hparams = ensure_compatible_hparams(hparams, default_hparams, flags)
    else:
        # Nothing saved yet: fall back to the command-line defaults plus
        # any standard-hparams overrides, then extend.
        hparams = utils.maybe_parse_standard_hparams(default_hparams,
                                                     flags.hparams_path,
                                                     verbose=verbose)
        hparams = extend_hparams(hparams)

    # Persist to out_dir and to each per-metric best-checkpoint directory.
    utils.save_hparams(out_dir, hparams, verbose=verbose)
    for metric in hparams.metrics:
        utils.save_hparams(getattr(hparams, "best_" + metric + "_dir"),
                           hparams,
                           verbose=verbose)

    if verbose:
        utils.print_hparams(hparams)
    return hparams
Example #5
0
def create_or_load_hparams(out_dir, default_hparams, hparams_path):
    """Create hparams or load hparams from out_dir."""
    hparams = utils.load_hparams(out_dir)
    if hparams:
        hparams = ensure_compatible_hparams(hparams, default_hparams,
                                            hparams_path)
    else:
        hparams = utils.maybe_parse_standard_hparams(default_hparams,
                                                     hparams_path)
        hparams = extend_hparams(hparams)

    if FLAGS.inference_input_file:
        # Inference run: rewrite all data/vocab locations relative to
        # out_dir (table-driven; same attribute/path pairs as before).
        relative_paths = {
            "src_vocab_file": "../data/vocab.cor",
            "tgt_vocab_file": "../data/vocab.man",
            "best_bleu_dir": "best_bleu",
            "train_prefix": "../data/train",
            "dev_prefix": "../data/dev_test",
            "vocab_prefix": "../data/vocab",
            "rc_vocab_file": "../data/vocab.cor",
            "test_prefix": "../data/test",
        }
        for attr, rel in relative_paths.items():
            setattr(hparams, attr, os.path.join(out_dir, rel))
        hparams.out_dir = out_dir

    # Save HParams
    utils.save_hparams(out_dir, hparams)
    for metric in hparams.metrics:
        utils.save_hparams(getattr(hparams, "best_" + metric + "_dir"), hparams)

    # Print HParams
    utils.print_hparams(hparams)
    return hparams
Example #6
0
def create_or_load_hparams(out_dir, default_hparams, save_hparams=True):
    """Return hparams loaded from out_dir, or the extended defaults."""
    loaded = utils.load_hparams(out_dir)
    hparams = loaded if loaded else extend_hparams(default_hparams)

    if save_hparams:
        # Persist so later runs pick up the same configuration.
        utils.save_hparams(out_dir, hparams)

    # Log the effective configuration.
    utils.print_hparams(hparams)
    return hparams
Example #7
0
def create_or_load_hparams(out_dir, default_hparams, hparams_path):
    """Create hparams or load hparams from out_dir.

    Args:
        out_dir: directory to load previously saved hparams from and to
            save the resulting hparams to.
        default_hparams: base HParams used when nothing was saved yet.
        hparams_path: optional path to a standard-hparams file whose values
            override the defaults.

    Returns:
        The resolved HParams object.
    """
    hparams = utils.load_hparams(out_dir)
    if not hparams:
        hparams = default_hparams
        hparams = utils.maybe_parse_standard_hparams(hparams, hparams_path)
        hparams = extend_hparams(hparams)
    else:
        hparams = ensure_compatible_hparams(hparams, default_hparams,
                                            hparams_path)

    # Save HParams
    utils.save_hparams(out_dir, hparams)

    for metric in hparams.metrics:
        # BUG FIX: previously this saved to "best_bleu_dir" on every
        # iteration regardless of the metric; save under each metric's own
        # best-checkpoint directory, matching the sibling variants.
        utils.save_hparams(getattr(hparams, "best_" + metric + "_dir"),
                           hparams)

    # Print HParams
    utils.print_hparams(hparams)
    return hparams
Example #8
0
def create_or_load_hparams(out_dir,
                           default_hparams,
                           hparams_path,
                           save_hparams=True):
    """Resolve hparams, reusing ones previously saved in out_dir if any."""
    saved = utils.load_hparams(out_dir)
    if saved:
        # Reconcile the saved values with the current defaults.
        hparams = ensure_compatible_hparams(saved, default_hparams,
                                            hparams_path)
    else:
        # First run: defaults plus optional standard overrides, extended.
        hparams = utils.maybe_parse_standard_hparams(default_hparams,
                                                     hparams_path)
        hparams = extend_hparams(hparams)

    if save_hparams:
        utils.save_hparams(out_dir, hparams)
        for metric in hparams.metrics:
            best_dir = getattr(hparams, "best_" + metric + "_dir")
            utils.save_hparams(best_dir, hparams)

    utils.print_hparams(hparams)
    return hparams
Example #9
0
def main(unused_argv):
  """Entry point: dispatch to train / infer / translate / train_and_eval.

  Reads the module-level FLAGS, validates mutually exclusive precision
  options (fp16 / AMP / fastmath), sets XLA/Defun environment variables,
  then runs the mode selected by FLAGS.mode. Total wall time is printed
  at the end.
  """
  experiment_start = time.time()

  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.use_fp16 and FLAGS.use_dist_strategy:
    raise ValueError("use_fp16 and use_dist_strategy aren't compatible")

  # The three precision modes are mutually exclusive (bools sum as ints).
  if FLAGS.use_fp16 + FLAGS.use_amp + FLAGS.use_fastmath > 1:
    raise ValueError("Only one of use_fp16, use_amp, use_fastmath can be set")

  if FLAGS.use_amp:
    utils.print_out('Enabling TF-AMP')

    os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

  if FLAGS.use_fastmath:
    utils.print_out('Enabling FastMath')

    # Allow tensor-op (reduced-precision) math for FP32 cuBLAS/cuDNN kernels.
    os.environ["TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32"] = '1'
    os.environ["TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32"] = '1'
    os.environ["TF_ENABLE_CUDNN_RNN_TENSOR_OP_MATH_FP32"] = '1'

  # Set up hacky envvars.
  # Hack that affects Defun in attention_wrapper.py
  active_xla_option_nums = np.sum([FLAGS.use_xla, FLAGS.use_autojit_xla,
                                   FLAGS.xla_compile])
  if active_xla_option_nums > 1:
    raise ValueError(
        "Only one of use_xla, xla_compile, use_autojit_xla can be set")

  os.environ["use_xla"] = str(FLAGS.use_xla).lower()
  if FLAGS.use_xla:
    # XLA requires the Defun-wrapped attention path.
    os.environ["use_defun"] = str(True).lower()
  else:
    os.environ["use_defun"] = str(FLAGS.use_defun).lower()
  utils.print_out("use_defun is %s for attention" % os.environ["use_defun"])

  # TODO(jamesqin): retire this config after Cuda9.1
  os.environ["use_fp32_batch_matmul"] = ("true" if FLAGS.use_fp32_batch_matmul
                                         else "false")
  os.environ["xla_compile"] = "true" if FLAGS.xla_compile else "false"
  os.environ["force_inputs_padding"] = (
      "true" if FLAGS.force_inputs_padding else "false")

  if FLAGS.mode == "train":
    utils.print_out("Running training mode.")
    default_hparams = create_hparams(FLAGS)
    run_main(FLAGS, default_hparams, estimator.train_fn)
  elif FLAGS.mode == "infer" or FLAGS.mode == "translate":
    # "translate" is inference over an explicit input file, with
    # tokenization before and detokenization after.
    if FLAGS.mode == "infer":
        utils.print_out("Running inference mode.")
        translate_mode = False
    else:
        utils.print_out("Running translate mode on file {}.".format(FLAGS.translate_file))
        translate_mode = True

    # Random
    random_seed = FLAGS.random_seed
    if random_seed is not None and random_seed > 0:
      utils.print_out("# Set random seed to %d" % random_seed)
      random.seed(random_seed)
      np.random.seed(random_seed)
      tf.set_random_seed(random_seed)

    # Model output directory
    output_dir = FLAGS.output_dir
    if output_dir and not tf.gfile.Exists(output_dir):
      utils.print_out("# Creating output directory %s ..." % output_dir)
      tf.gfile.MakeDirs(output_dir)

    # Load hparams.
    default_hparams = create_hparams(FLAGS)
    default_hparams.num_buckets = 1
    # The estimator model_fn is written in a way allowing train hparams to be
    # passed in infer mode.
    hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path)
    utils.print_out("infer_hparams:")
    utils.print_hparams(hparams)

    if translate_mode:
      tokenize(hparams, hparams.translate_file, hparams.translate_file + ".tok")

    eval_sentences, eval_src_tokens, _ = iterator_utils.get_effective_epoch_size(hparams, train=False)

    # Run evaluation when there's a new checkpoint
    tf.logging.info("Starting to evaluate...")
    eval_start = time.time()
    _, (eval_speed, eval_latencies), eval_output_tokens = estimator.eval_fn(hparams, hparams.ckpt, only_translate=translate_mode)
    eval_end = time.time()
    eval_delta = eval_end - eval_start
    utils.print_out("eval time for ckpt: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" %
                    (eval_delta / 60., eval_speed, eval_speed * (eval_src_tokens + eval_output_tokens) / eval_sentences), f=sys.stderr)
    for lat in sorted(eval_latencies):
      utils.print_out("eval latency_%s for ckpt: %.2f ms" % (lat, eval_latencies[lat] * 1000))

    if translate_mode:
      detokenize(hparams, hparams.translate_file + ".trans.tok", hparams.translate_file + ".trans")

  else:
    assert FLAGS.mode == "train_and_eval"
    utils.print_out("Running train and eval mode.")

    # Random
    random_seed = FLAGS.random_seed
    if random_seed is not None and random_seed > 0:
      utils.print_out("# Set random seed to %d" % random_seed)
      random.seed(random_seed)
      np.random.seed(random_seed)
      tf.set_random_seed(random_seed)

    # Model output directory
    output_dir = FLAGS.output_dir
    if output_dir and not tf.gfile.Exists(output_dir):
      utils.print_out("# Creating output directory %s ..." % output_dir)
      tf.gfile.MakeDirs(output_dir)

    # Load hparams.
    default_hparams = create_hparams(FLAGS)

    hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path)
    utils.print_out("training hparams:")
    utils.print_hparams(hparams)
    with tf.gfile.GFile(os.path.join(output_dir, "train_hparams.txt"), "w") as f:
      f.write(utils.serialize_hparams(hparams) + "\n")

    # The estimator model_fn is written in a way allowing train hparams to be
    # passed in infer mode.
    infer_hparams = tf.contrib.training.HParams(**hparams.values())
    infer_hparams.num_buckets = 1
    utils.print_out("infer_hparams:")
    utils.print_hparams(infer_hparams)
    with tf.gfile.GFile(os.path.join(output_dir, "infer_hparams.txt"), "w") as f:
      f.write(utils.serialize_hparams(infer_hparams) + "\n")

    epochs = 0
    should_stop = epochs >= FLAGS.max_train_epochs

    # Epoch sizes used to convert sent/sec into tokens/sec below.
    train_sentences, train_src_tokens, train_tgt_tokens = iterator_utils.get_effective_epoch_size(hparams)
    eval_sentences, eval_src_tokens, _ = iterator_utils.get_effective_epoch_size(hparams, train=False)

    while not should_stop:
      utils.print_out("Starting epoch %d" % epochs)
      try:
        train_start = time.time()
        train_speed, _ = estimator.train_fn(hparams)
      except tf.errors.OutOfRangeError:
        # NOTE(review): if this fires on the first epoch, train_speed below
        # is unbound (NameError) — confirm train_fn's exception contract.
        utils.print_out("training hits OutOfRangeError", f=sys.stderr)

      train_end = time.time()
      train_delta = train_end - train_start
      utils.print_out("training time for epoch %d: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" %
                      (epochs + 1, train_delta / 60., train_speed, train_speed * (train_src_tokens + train_tgt_tokens) / train_sentences), f=sys.stderr)

      # This is probably sub-optimal, doing eval per-epoch
      eval_start = time.time()
      bleu_score, (eval_speed, eval_latencies), eval_output_tokens = estimator.eval_fn(infer_hparams)
      eval_end = time.time()
      eval_delta = eval_end - eval_start
      utils.print_out("eval time for epoch %d: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" %
                      (epochs + 1, eval_delta / 60., eval_speed, eval_speed * (eval_src_tokens + eval_output_tokens) / eval_sentences), f=sys.stderr)
      for lat in sorted(eval_latencies):
        utils.print_out("eval latency_%s for epoch %d: %.2f ms" % (lat, epochs + 1, eval_latencies[lat] * 1000))


      # Stop early once the target BLEU is reached (always stop in debug).
      if FLAGS.debug or (FLAGS.target_bleu is not None and bleu_score > FLAGS.target_bleu):
        should_stop = True
        utils.print_out(
            "Stop job since target bleu is reached at epoch %d ." % epochs,
            f=sys.stderr)

      epochs += 1
      if epochs >= FLAGS.max_train_epochs:
        should_stop = True
        utils.print_out("Stop job since max_train_epochs is reached.",
                        f=sys.stderr)

  experiment_end = time.time()
  utils.print_out('Experiment took {} min'.format((experiment_end - experiment_start) / 60))
def main(unused_argv):
  """Entry point: dispatch to train / infer / train_and_eval per FLAGS.mode.

  Sets XLA/Defun environment variables first; the train_and_eval mode also
  emits MLPerf GNMT compliance log entries around the train/eval loop.
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.use_fp16 and FLAGS.use_dist_strategy:
    raise ValueError("use_fp16 and use_dist_strategy aren't compatible")

  # Set up hacky envvars.
  # Hack that affects Defun in attention_wrapper.py
  active_xla_option_nums = np.sum([FLAGS.use_xla, FLAGS.use_autojit_xla,
                                   FLAGS.xla_compile])
  if active_xla_option_nums > 1:
    raise ValueError(
        "Only one of use_xla, xla_compile, use_autojit_xla can be set")

  os.environ["use_xla"] = str(FLAGS.use_xla).lower()
  if FLAGS.use_xla:
    # XLA requires the Defun-wrapped attention path.
    os.environ["use_defun"] = str(True).lower()
  else:
    os.environ["use_defun"] = str(FLAGS.use_defun).lower()
  utils.print_out("use_defun is %s for attention" % os.environ["use_defun"])

  # TODO(jamesqin): retire this config after Cuda9.1
  os.environ["use_fp32_batch_matmul"] = ("true" if FLAGS.use_fp32_batch_matmul
                                         else "false")
  os.environ["xla_compile"] = "true" if FLAGS.xla_compile else "false"
  os.environ["force_inputs_padding"] = (
      "true" if FLAGS.force_inputs_padding else "false")

  if FLAGS.mode == "train":
    utils.print_out("Running training mode.")
    # Training buckets by sequence length; inference below uses 1 bucket.
    FLAGS.num_buckets = 5
    default_hparams = create_hparams(FLAGS)
    run_main(FLAGS, default_hparams, estimator.train_fn)
  elif FLAGS.mode == "infer":
    utils.print_out("Running inference mode.")
    # Random
    random_seed = FLAGS.random_seed
    if random_seed is not None and random_seed > 0:
      utils.print_out("# Set random seed to %d" % random_seed)
      random.seed(random_seed)
      np.random.seed(random_seed)
      tf.set_random_seed(random_seed)

    # Model output directory
    output_dir = FLAGS.output_dir
    if output_dir and not tf.gfile.Exists(output_dir):
      utils.print_out("# Creating output directory %s ..." % output_dir)
      tf.gfile.MakeDirs(output_dir)

    # Load hparams.
    default_hparams = create_hparams(FLAGS)
    default_hparams.num_buckets = 1
    # The estimator model_fn is written in a way allowing train hparams to be
    # passed in infer mode.
    hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path)
    utils.print_out("infer_hparams:")
    utils.print_hparams(hparams)

    # Run evaluation when there's a new checkpoint
    for i, ckpt in enumerate(
        evaluation_utils.get_all_checkpoints(FLAGS.output_dir)):
      tf.logging.info("Starting to evaluate...")
      eval_start = time.time()
      bleu_score = estimator.eval_fn(hparams, ckpt)
      eval_end = time.time()
      utils.print_out("eval time for %d th ckpt: %.2f mins" %
                      (i, (eval_end - eval_start) / 60.), f=sys.stderr)
  else:
    assert FLAGS.mode == "train_and_eval"
    utils.print_out("Running train and eval mode.")

    # Random
    random_seed = FLAGS.random_seed
    if random_seed is not None and random_seed > 0:
      utils.print_out("# Set random seed to %d" % random_seed)
      random.seed(random_seed)
      np.random.seed(random_seed)
      tf.set_random_seed(random_seed)

    # Model output directory
    output_dir = FLAGS.output_dir
    if output_dir and not tf.gfile.Exists(output_dir):
      utils.print_out("# Creating output directory %s ..." % output_dir)
      tf.gfile.MakeDirs(output_dir)

    # Load hparams.
    default_hparams = create_hparams(FLAGS)

    default_hparams.num_buckets = 5
    hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path)
    utils.print_out("training hparams:")
    utils.print_hparams(hparams)
    with tf.gfile.GFile(os.path.join(output_dir, "train_hparams.txt"), "w") as f:
      f.write(utils.serialize_hparams(hparams) + "\n")

    # The estimator model_fn is written in a way allowing train hparams to be
    # passed in infer mode.
    infer_hparams = tf.contrib.training.HParams(**hparams.values())
    infer_hparams.num_buckets = 1
    utils.print_out("infer_hparams:")
    utils.print_hparams(infer_hparams)
    with tf.gfile.GFile(os.path.join(output_dir, "infer_hparams.txt"), "w") as f:
      f.write(utils.serialize_hparams(infer_hparams) + "\n")

    epochs = 0
    should_stop = epochs >= FLAGS.max_train_epochs

    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_LOOP)
    mlperf_log.gnmt_print(key=mlperf_log.EVAL_TARGET, value=hparams.target_bleu)

    while not should_stop:
      utils.print_out("Starting epoch %d" % epochs)
      mlperf_log.gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=epochs)

      mlperf_log.gnmt_print(
          key=mlperf_log.INPUT_SIZE,
          value=iterator_utils.get_effective_train_epoch_size(hparams))
      mlperf_log.gnmt_print(
          key=mlperf_log.TRAIN_CHECKPOINT,
          value=("Under " + hparams.output_dir))
      try:
        train_start = time.time()
        estimator.train_fn(hparams)
      except tf.errors.OutOfRangeError:
        utils.print_out("training hits OutOfRangeError", f=sys.stderr)

      train_end = time.time()
      utils.print_out("training time for epoch %d: %.2f mins" %
                      (epochs, (train_end - train_start) / 60.), f=sys.stderr)

      # This is probably sub-optimal, doing eval per-epoch
      mlperf_log.gnmt_print(key=mlperf_log.EVAL_START)
      eval_start = time.time()
      bleu_score = estimator.eval_fn(infer_hparams)
      eval_end = time.time()
      utils.print_out("eval time for epoch %d: %.2f mins" %
                      (epochs, (eval_end - eval_start) / 60.), f=sys.stderr)
      mlperf_log.gnmt_print(key=mlperf_log.EVAL_ACCURACY,
                            value={"epoch": epochs, "value": bleu_score})
      mlperf_log.gnmt_print(key=mlperf_log.EVAL_STOP, value=epochs)

      # NOTE(review): FLAGS.target_bleu is not checked for None here (other
      # variants do); the comparison would raise on Python 3 if the flag is
      # unset — confirm it always carries a numeric value in this mode.
      if FLAGS.debug or bleu_score > FLAGS.target_bleu:
        should_stop = True
        utils.print_out(
            "Stop job since target bleu is reached at epoch %d ." % epochs,
            f=sys.stderr)
        mlperf_log.gnmt_print(mlperf_log.RUN_STOP, {"success": True})

      # NOTE(review): epochs is incremented after this bound check, so the
      # loop runs one more epoch than a check-after-increment ordering would
      # — confirm intended.
      if epochs >= FLAGS.max_train_epochs:
        should_stop = True
        utils.print_out("Stop job since max_train_epochs is reached.",
                        f=sys.stderr)
        mlperf_log.gnmt_print(mlperf_log.RUN_STOP, {"success": False})
      epochs += 1

  mlperf_log.gnmt_print(key=mlperf_log.RUN_FINAL)
Example #11
0
def main(_):
    """Entry point: build the model hparams and dispatch on FLAGS.mode.

    Modes:
        'train': train on the full labelled data.
        'test':  run inference on the test data and write predictions to
                 preds.csv under the log directory.
        'val':   train on a train split, predict on a validation split and
                 print age / gender accuracy.
    """
    ####################################################################################
    feats = Features()

    # hyper params
    hparam = tf.contrib.training.HParams(
        model=cfg.model,
        norm=True,  # use batch norm
        seed=cfg.seed,
        batch_norm_decay=0.9,
        hidden_size=[1024, 512],
        cross_layer_sizes=[128, 128],
        k=16,  # multi_features embedding dim
        single_k=16,  # single_features embedding dim
        max_length=100,  # hash length
        cross_hash_num=int(5e6),
        single_hash_num=int(5e6),
        multi_hash_num=int(1e6),
        batch_size=1024,
        infer_batch_size=2**14,
        optimizer="adam",
        dropout=0,
        kv_batch_num=20,
        learning_rate=0.00005,
        num_display_steps=100,  # every number of steps to display results
        num_save_steps=1000,  # every number of steps to save model
        num_eval_steps=2000,  # every number of steps to evaluate model
        epoch=10,  # train epoch
        metric='softmax_loss',
        activation=['relu', 'relu', 'relu'],
        init_method='tnormal',
        cross_activation='relu',
        init_value=0.001,
        single_features=None,
        cross_features=None,
        multi_features=feats.multi_features,
        dense_features=feats.dense_features,
        kv_features=None,
        label=feats.label_features,
        label_dim=4,  # output label dim (gender - 1, age - 4, age_all - 10)
        label_name='age',
        model_name=cfg.model,
        checkpoint_dir=os.path.join(cfg.data_path, FLAGS.log_dir))
    utils.print_hparams(hparam)

    ####################################################################################

    if FLAGS.mode == 'train':
        # read data
        train_log = read_all_feature_data(feats, label_name=hparam.label_name)

        # build model
        model = model_utils.build_model(hparam)

        # train model
        model.train(train_log, None)

    ####################################################################################
    elif FLAGS.mode == 'test':
        # read data
        test_log = read_all_feature_data(feats, mode='test')

        # build model
        model = model_utils.build_model(hparam)

        # infer model
        preds = model.infer(test_log)  # shape: [length, 20]

        # Write predictions; the output helper depends on which label the
        # model was trained for. (A stale commented-out K-fold training loop
        # that lived here has been removed.)
        if hparam.label_name == 'age':
            _ = output_labels_v2(test_log,
                                 preds,
                                 pred_path=os.path.join(
                                     cfg.data_path, FLAGS.log_dir,
                                     'preds.csv'))
        elif hparam.label_name == 'gender':
            _ = output_labels_v3(test_log,
                                 preds,
                                 pred_path=os.path.join(
                                     cfg.data_path, FLAGS.log_dir,
                                     'preds.csv'))

    ####################################################################################
    elif FLAGS.mode == 'val':
        # read data
        train_log, val_log = read_all_feature_data(feats, mode='val')

        # build model
        model = model_utils.build_model(hparam)

        # train model
        model.train(train_log, None, is_val=True)

        # infer model
        preds = model.infer(val_log)  # shape: [length, 20]

        if hparam.label_name == 'age':
            val_log = output_labels_v2(val_log,
                                       preds,
                                       pred_path=os.path.join(
                                           cfg.data_path, FLAGS.log_dir,
                                           'val_preds.csv'),
                                       is_train=True)
        elif hparam.label_name == 'gender':
            val_log = output_labels_v3(val_log,
                                       preds,
                                       pred_path=os.path.join(
                                           cfg.data_path, FLAGS.log_dir,
                                           'val_preds.csv'),
                                       is_train=True)

        # print results
        # np.int was a plain alias for the builtin int and is removed in
        # NumPy >= 1.24; using int keeps behavior identical everywhere.
        age_acc = sum((val_log.age == val_log.predicted_age).astype(
            int)) / len(val_log)
        gender_acc = sum((val_log.gender == val_log.predicted_gender).astype(
            int)) / len(val_log)

        print("Final Age Accuracy: %.4f" % age_acc)
        print("Final Gender Accuracy: %.4f" % gender_acc)
Example #12
0
    def build_graph(self, features, labels, mode, params):
        """docstring."""
        del labels, params
        misc_utils.print_out("Running fast mode_fn")

        hparams = self.hparams

        # Create global_step
        tf.train.get_or_create_global_step()

        if mode == tf.contrib.learn.ModeKeys.INFER:
            # Doing inference only on one GPU
            inf_hparams = tf.contrib.training.HParams(**hparams.values())
            inf_hparams.set_hparam("num_gpus", 1)
            # Inference is done in fp32 and in the same way as that of dist_strategy.
            inf_hparams.set_hparam("use_fp16", False)

            misc_utils.print_out("inference hparmas:")
            misc_utils.print_hparams(inf_hparams)

            # Create variable_mgr
            var_mgr = self._get_variable_mgr(inf_hparams)

            with mixed_precision_scope(), tf.device("gpu:0"), tf.name_scope(
                    "tower_0"), var_mgr.create_outer_variable_scope(0):
                model = gnmt_model.GNMTModel(inf_hparams,
                                             mode=mode,
                                             features=features)
                sample_ids = model.sample_id
                reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
                    inf_hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
                sample_words = reverse_target_vocab_table.lookup(
                    tf.to_int64(sample_ids))
                # make sure outputs is of shape [batch_size, time] or [beam_width,
                # batch_size, time] when using beam search.
                if inf_hparams.time_major:
                    sample_words = tf.transpose(sample_words)
                elif sample_words.shape.ndims == 3:
                    # beam search output in [batch_size, time, beam_width] shape.
                    sample_words = tf.transpose(sample_words, [2, 0, 1])
                predictions = {"predictions": sample_words}
                # return loss, vars, grads, predictions, train_op, scaffold
                return None, None, None, predictions, None, None
        elif mode == tf.contrib.learn.ModeKeys.TRAIN:
            num_towers = hparams.num_gpus
            # Shard inputs
            tower_features = self._shard_inputs(features, num_towers)
            # Create loss scale vars if necessary
            loss_scale, loss_scale_normal_steps = self._create_loss_scale_vars(
            )

            # Create variable_mgr
            var_mgr = self._get_variable_mgr(hparams)

            # Build per-tower fprop and bprop
            devices = var_mgr.get_devices()
            tower_gradvars = []
            tower_scopes = []
            var_scopes = []
            train_losses = []
            learning_rates = []
            batch_sizes = []
            opts = []

            def fprop_and_bprop(tid):
                """docstring."""
                model = gnmt_model.GNMTModel(hparams,
                                             mode=mode,
                                             features=tower_features[tid])
                # sync training.
                assert model.learning_rate is not None
                # The following handles shouldn't be built in when doing manual
                assert model.grad_norm is None
                assert model.update is None
                tower_loss = model.train_loss
                # Only check loss numerics if in fp16
                if hparams.use_fp16 and hparams.check_tower_loss_numerics:
                    tower_loss = tf.check_numerics(
                        tower_loss, "tower_%d has Inf/NaN loss" % tid)
                # Cast to fp32, otherwise would easily overflow.
                tower_loss = tf.to_float(tower_loss)
                var_params, grads, opt = self._compute_tower_grads(
                    tower_loss,
                    var_mgr.trainable_variables_on_device(tid, tid),
                    model.learning_rate,
                    use_fp16=hparams.use_fp16,
                    loss_scale=loss_scale,
                    colocate_gradients_with_ops=hparams.
                    colocate_gradients_with_ops)
                self._print_varinfo(var_params, tid)
                res = [model.train_loss, model.learning_rate, model.batch_size]
                res.extend(grads)
                opts.append(opt)
                return res

            def unpack_fprop_and_bprop_output(output):
                train_loss = output[0]
                learning_rate = output[1]
                batch_size = output[2]
                grads = output[3:]
                return train_loss, learning_rate, batch_size, grads

            with mixed_precision_scope():
                for tid in range(num_towers):
                    with tf.device(devices[tid % len(devices)]), tf.name_scope(
                            "tower_%s" % tid) as scope:
                        tower_scopes.append(scope)
                        with var_mgr.create_outer_variable_scope(
                                tid) as var_scope:
                            var_scopes.append(var_scope)

                            outputs = maybe_xla_compile(
                                hparams, fprop_and_bprop, tid)
                            (train_loss, learning_rate, batch_size,
                             grads) = unpack_fprop_and_bprop_output(outputs)
                            train_losses.append(train_loss)
                            learning_rates.append(learning_rate)
                            batch_sizes.append(batch_size)
                            var_params = var_mgr.trainable_variables_on_device(
                                tid, tid)
                            tower_gradvars.append(list(zip(grads, var_params)))

            # Add summaries
            if hparams.show_metrics:
                tf.summary.scalar("learning_rate", learning_rates[0])
                if loss_scale:
                    tf.summary.scalar("loss_scale", loss_scale)
                    if hparams.enable_auto_loss_scale:
                        tf.summary.scalar("loss_scale_normal_steps",
                                          loss_scale_normal_steps)
            misc_utils.print_out("Finish building fprop and per-tower bprop.")
            # Aggregate gradients
            # The following compute the aggregated grads for each tower, stored in
            # opaque grad_states structure.
            apply_grads_devices, grad_states = var_mgr.preprocess_device_grads(
                tower_gradvars)
            master_grads = None
            master_params = None
            update_ops = []
            for i, device in enumerate(apply_grads_devices):
                with tf.device(device), tf.name_scope(tower_scopes[i]):
                    # Get per-tower grads.
                    with tf.name_scope("get_gradients_to_apply"):
                        avg_gradvars = var_mgr.get_gradients_to_apply(
                            i, grad_states)
                    avg_grads = [gv[0] for gv in avg_gradvars]

                    # gradients post-processing
                    with tf.name_scope("clip_gradients"):
                        if hparams.clip_grads:
                            clipped_grads, grad_norm = model_helper.gradient_clip(
                                avg_grads,
                                max_gradient_norm=hparams.max_gradient_norm)
                            # summary the grad on the 1st tower
                            if i == 0 and hparams.show_metrics:
                                tf.summary.scalar("grad_norm", grad_norm)
                                tf.summary.scalar(
                                    "clipped_grad_norm",
                                    tf.global_norm(clipped_grads))
                        else:
                            clipped_grads = avg_grads
                        if i == 0:
                            master_grads = clipped_grads

                    # Build apply-gradients ops
                    clipped_gradvars = list(
                        zip(clipped_grads, [gv[1] for gv in avg_gradvars]))
                    if i == 0:
                        master_params = [gv[1] for gv in avg_gradvars]
                    with tf.name_scope("append_gradient_ops"):
                        loss_scale_params = variable_mgr_util.AutoLossScaleParams(
                            enable_auto_loss_scale=hparams.
                            enable_auto_loss_scale,
                            loss_scale=loss_scale,
                            loss_scale_normal_steps=loss_scale_normal_steps,
                            inc_loss_scale_every_n=hparams.
                            fp16_inc_loss_scale_every_n,
                            is_chief=True)
                        opt = opts[i]
                        var_mgr.append_apply_gradients_ops(
                            grad_states, opt, clipped_gradvars, update_ops,
                            loss_scale_params)
            misc_utils.print_out("Finish building grad aggregation.")

            assert len(update_ops) == num_towers
            train_op = tf.group(update_ops)
            with tf.control_dependencies([train_op]):
                global_step = tf.train.get_global_step()
                train_op = global_step.assign_add(1)

            # Compute loss on the first gpu
            # TODO(jamesqin): optimize it?
            with tf.device("gpu:0"):
                loss = misc_utils.weighted_avg(train_losses, batch_sizes)

            # Create local init_ops
            # TODO(jamesqin): handle resource variables!
            # At present if not using mirror strategy, not using resource vars.
            local_init_ops = []
            local_init_op = tf.local_variables_initializer()
            with tf.control_dependencies([local_init_op]):
                local_init_ops.append(var_mgr.get_post_init_ops())
            local_init_ops.extend([local_init_op, tf.tables_initializer()])

            saveable_vars = var_mgr.savable_variables()
            # Add saveables for cudnn vars in master tower.
            saveable_objects = tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
            saveable_objects = [x for x in saveable_objects if "v0" in x.name]

            misc_utils.print_out("Saveable vars(%d): " % len(saveable_vars))
            for mv in saveable_vars:
                misc_utils.print_out(mv.name)

            misc_utils.print_out("All global trainable vars(%d): " %
                                 len(tf.trainable_variables()))
            for tv in tf.trainable_variables():
                misc_utils.print_out(tv.name)

            misc_utils.print_out("All global vars(%d): " %
                                 len(tf.global_variables()))
            for gv in tf.global_variables():
                misc_utils.print_out(gv.name)

            misc_utils.print_out("master backproped params(%d): " %
                                 len(master_params))
            for mp in master_params:
                misc_utils.print_out(mp.name)

            # Note the cudnn vars are skipped the init check. :(
            scaffold = tf.train.Scaffold(
                ready_op=tf.report_uninitialized_variables(saveable_vars),
                ready_for_local_init_op=tf.report_uninitialized_variables(
                    saveable_vars),
                local_init_op=tf.group(*local_init_ops),
                saver=tf.train.Saver(saveable_vars + saveable_objects,
                                     save_relative_paths=True))

            misc_utils.print_out("Finish building model_fn")
            # return loss, vars, grads, predictions, train_op, scaffold
            return loss, master_params, master_grads, None, train_op, scaffold
def main(_):
    """Script entry point.

    Runs two pipelines back to back:
      1) A feature/hparams-driven model (via model_utils): trained on the
         full data with test-set predictions written out when
         FLAGS.mode == 'train', or trained against a held-out split and
         scored for age/gender accuracy when FLAGS.mode == 'val'.
      2) A BiLstm text-relation model trained inside a raw tf.Session,
         which runs regardless of FLAGS.mode.

    NOTE(review): the two sections look independent (possibly pasted
    together from different scripts) — confirm both are really meant to
    run in a single invocation.
    """
    ####################################################################################
    # Feature definitions (multi/dense/label feature lists) consumed by the
    # hparams below and by read_all_feature_data().
    feats = Features()

    # hyper params for the feature-based model built via model_utils
    hparam = tf.contrib.training.HParams(
        num_classes=1999,  # number of label
        model=cfg.model,
        norm=True,  # use batch norm
        seed=cfg.seed,
        batch_norm_decay=0.9,
        hidden_size=[1024, 512],
        cross_layer_sizes=[128, 128],
        k=16,  # multi_features embedding dim
        single_k=16,  # single_features embedding dim
        sequence_length=100,  # max sentence length
        embed_size=100,  # embedding size
        cross_hash_num=int(5e6),
        single_hash_num=int(5e6),
        multi_hash_num=int(1e6),
        batch_size=1024,
        infer_batch_size=2**14,
        optimizer="adam",
        dropout=0,
        kv_batch_num=20,
        learning_rate=0.01,
        decay_steps=12000,  # how many steps before decay learning rate
        decay_rate=0.9,  # Rate of decay for learning rate
        num_display_steps=1000,  # every number of steps to display results
        num_save_steps=1000,  # every number of steps to save model
        num_eval_steps=1000,  # every number of steps to evaluate model
        epoch=20,  # train epoch (also reused by the BiLstm loop below)
        metric='softmax_loss',
        activation=['relu', 'relu', 'relu'],
        init_method='tnormal',
        cross_activation='relu',
        init_value=0.001,
        l2_lambda=0.0001,
        single_features=None,
        cross_features=None,
        multi_features=feats.multi_features,
        dense_features=feats.dense_features,
        kv_features=None,
        label=feats.label_features,
        label_dim=1,  # output label dim (gender - 1, age - 4, age_all - 10)
        label_name='gender',
        model_name=cfg.model,
        checkpoint_dir=os.path.join(cfg.data_path, FLAGS.log_dir))
    utils.print_hparams(hparam)

    ####################################################################################
    if FLAGS.mode == 'train':
        # read train data
        train_log = read_all_feature_data(feats, label_name=hparam.label_name)

        # build model
        model = model_utils.build_model(hparam)

        # train model (no validation set in this mode)
        model.train(train_log, None)

        # read test data
        test_log = read_all_feature_data(feats,
                                         mode='test',
                                         label_name=hparam.label_name)

        # infer model
        preds = model.infer(test_log)  # shape: [length, 20]

        # Write predictions; the output helper depends on which label is
        # being predicted (v2 for age, v3 for gender).
        if hparam.label_name == 'age':
            _ = output_labels_v2(test_log,
                                 preds,
                                 pred_path=os.path.join(
                                     cfg.data_path, FLAGS.log_dir,
                                     'preds.csv'))
        elif hparam.label_name == 'gender':
            _ = output_labels_v3(test_log,
                                 preds,
                                 pred_path=os.path.join(
                                     cfg.data_path, FLAGS.log_dir,
                                     'preds.csv'))

    ####################################################################################
    elif FLAGS.mode == 'val':
        # read data (train plus held-out validation split)
        train_log, val_log = read_all_feature_data(
            feats, mode='val', label_name=hparam.label_name)

        # build model
        model = model_utils.build_model(hparam)

        # train model
        model.train(train_log, None, is_val=True)

        # infer model
        preds = model.infer(val_log)  # shape: [length, 20]

        # is_train=True: the helper returns val_log augmented with
        # predicted_age / predicted_gender columns used for scoring below.
        if hparam.label_name == 'age':
            val_log = output_labels_v2(val_log,
                                       preds,
                                       pred_path=os.path.join(
                                           cfg.data_path, FLAGS.log_dir,
                                           'val_preds.csv'),
                                       is_train=True)
        elif hparam.label_name == 'gender':
            val_log = output_labels_v3(val_log,
                                       preds,
                                       pred_path=os.path.join(
                                           cfg.data_path, FLAGS.log_dir,
                                           'val_preds.csv'),
                                       is_train=True)

        # print results
        # NOTE(review): np.int was deprecated in NumPy 1.20 and removed in
        # 1.24 — these two statements break on modern NumPy; use int (or
        # np.int64) instead.
        age_acc = sum((val_log.age == val_log.predicted_age).astype(
            np.int)) / len(val_log)
        gender_acc = sum((val_log.gender == val_log.predicted_gender).astype(
            np.int)) / len(val_log)

        print("Final Age Accuracy: %.4f" % age_acc)
        print("Final Gender Accuracy: %.4f" % gender_acc)

    # ####################################################################################
    # 1.load data(X:list of lint,y:int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache file exists on disk,
    #    with open(FLAGS.cache_path, 'r') as data_f:  # load the vocabulary-indexed data
    #        trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f)
    #        vocab_size=len(vocabulary_index2word)
    # else:
    # Always-true stand-in for the commented-out cache check above.
    if 1 == 1:
        # 1.  get vocabulary of X and label.
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary(
            simple='simple',
            word2vec_model_path=FLAGS.word2vec_model_path,
            name_scope="biLstmTextRelation")
        vocab_size = len(vocabulary_word2index)
        print("rnn_model.vocab_size:", vocab_size)
        # Binary labels are hard-coded instead of calling
        # create_voabulary_label (see the commented-out line below).
        # vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label(name_scope="biLstmTextRelation")
        vocabulary_word2index_label = {'1': 1, '0': 0}
        vocabulary_index2word_label = {0: '0', 1: '1'}
        # NOTE(review): 'traning_data_path' is a (misspelled) flag name;
        # renaming it here would require changing the flag definition too.
        train, test, _ = load_data(vocabulary_word2index,
                                   vocabulary_word2index_label,
                                   valid_portion=0.005,
                                   training_data_path=FLAGS.traning_data_path)
        # train, test, _ =  load_data_multilabel_new_twoCNN(vocabulary_word2index, vocabulary_word2index_label,multi_label_flag=False,traning_data_path=FLAGS.traning_data_path) #,traning_data_path=FLAGS.traning_data_path
        # train, test, _ =  load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label,multi_label_flag=False,traning_data_path=FLAGS.traning_data_path) #,traning_data_path=FLAGS.traning_data_path
        trainX, trainY = train
        testX, testY = test
        # 2.Data preprocessing: pad every sequence to the fixed model length.
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length,
                               value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length,
                              value=0.)  # padding to max length
        ###############################################################################################
        # with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly.
        #    pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f)
        ###############################################################################################
        print("trainX[0]:", trainX[0])  # ;print("trainY[0]:", trainY[0])
        # Converting labels to binary vectors
        print("end padding & transform to one hot...")

    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # grow GPU memory on demand
    with tf.Session(config=config) as sess:
        # Instantiate Model
        biLstmTR = BiLstm(FLAGS.num_classes, FLAGS.learning_rate,
                          FLAGS.batch_size, FLAGS.decay_steps,
                          FLAGS.decay_rate, FLAGS.sequence_length, vocab_size,
                          FLAGS.embed_size, FLAGS.is_training)

        # Initialize Saver; restore from an existing checkpoint, otherwise
        # initialize fresh variables (optionally with pre-trained embeddings).
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint for rnn model.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(
                    sess,
                    vocabulary_index2word,
                    vocab_size,
                    biLstmTR,
                    word2vec_model_path=FLAGS.word2vec_model_path)
        # Resume epoch counting from the value persisted in the graph.
        curr_epoch = sess.run(biLstmTR.epoch_step)

        # 3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        # NOTE(review): the epoch count comes from hparam.epoch — the other
        # pipeline's config — not from a BiLstm-specific flag; confirm.
        for epoch in range(curr_epoch, hparam.epoch):
            loss, acc, counter = 0.0, 0.0, 0
            # zip() of the two ranges silently drops the final partial batch.
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    # Dump the very first batch once for debugging.
                    print("trainX[start:end]:", trainX[start:end]
                          )  # ;print("trainY[start:end]:",trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [biLstmTR.loss_val, biLstmTR.accuracy, biLstmTR.train_op],
                    feed_dict={
                        biLstmTR.input_x: trainX[start:end],
                        biLstmTR.input_y: trainY[start:end],
                        biLstmTR.dropout_keep_prob: 1.0
                    }
                )  # dropout disabled here (keep_prob=1.0)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 500 == 0:
                    # Report running (epoch-average) loss and accuracy.
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter))
                    )
            # epoch increment (persisted in the graph for checkpoint resume)
            print("going to increment epoch counter....")
            sess.run(biLstmTR.epoch_increment)

            # 4.validation
            print(epoch, FLAGS.validate_every,
                  (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, biLstmTR, testX, testY,
                                              batch_size,
                                              vocabulary_index2word_label)
                print(
                    "Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                    % (epoch, eval_loss, eval_acc))
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                if not os.path.exists(FLAGS.ckpt_dir):
                    os.mkdir(FLAGS.ckpt_dir)
                saver.save(sess, save_path, global_step=epoch)