def run():
  """Run the model training and return evaluation output."""
  resolver = contrib_cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
  contrib_distribute.initialize_tpu_system(resolver)
  strategy = contrib_distribute.TPUStrategy(resolver)

  model_cls = MODELS[FLAGS.model]
  if FLAGS.use_synthetic_data:
    data = SyntheticDataset(FLAGS.batch_size)
  else:
    data = Cifar10Dataset(FLAGS.batch_size)

  with strategy.scope():
    model = model_cls(weights=None, input_shape=data.input_shape,
                      classes=data.num_classes)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    model.compile(loss="categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"])

    history = model.fit(
        data.train_dataset,
        epochs=FLAGS.epochs,
        steps_per_epoch=data.num_train_images // FLAGS.batch_size,
        validation_data=data.test_dataset,
        validation_steps=data.num_test_images // FLAGS.batch_size)

    return history.history
Example #2
0
 def _create_tpu_strategy():
     resolver = cluster_resolver.TPUClusterResolver("")
     tpu_lib.initialize_tpu_system(resolver)
     strategy = tpu_lib.TPUStrategy(resolver,
                                    steps_per_run=steps_per_run,
                                    **kwargs)
     return strategy
def _get_tpu_estimator():
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=None, project=None)
    tpu_grpc_url = tpu_cluster_resolver.get_master()

    run_config = contrib_tpu_python_tpu_tpu_config.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=FLAGS.work_dir,
        save_checkpoints_steps=max(1000, FLAGS.iterations_per_loop),
        save_summary_steps=FLAGS.summary_steps,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        tpu_config=contrib_tpu_python_tpu_tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=contrib_tpu_python_tpu_tpu_config.
            InputPipelineConfig.PER_HOST_V2))

    return contrib_tpu_python_tpu_tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores,
        eval_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores,
        params=FLAGS.flag_values_dict())
Example #4
0
def run():
    """Run the model training and return evaluation output."""
    resolver = contrib_cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    contrib_distribute.initialize_tpu_system(resolver)
    strategy = contrib_distribute.TPUStrategy(resolver, steps_per_run=100)

    if FLAGS.fake_data:
        print("Using fake data")
        x_train = np.random.random((BATCH_SIZE, IMG_ROWS, IMG_COLS))
        y_train = np.zeros([BATCH_SIZE, 1], dtype=np.int32)
        x_test, y_test = x_train, y_train
    else:
        # the data, split between train and test sets
        print("Using real data")
        (x_train, y_train), (x_test,
                             y_test) = tf.keras.datasets.mnist.load_data()

    x_train = x_train.reshape(x_train.shape[0], IMG_ROWS, IMG_COLS, 1)
    x_test = x_test.reshape(x_test.shape[0], IMG_ROWS, IMG_COLS, 1)
    input_shape = (IMG_ROWS, IMG_COLS, 1)

    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")
    x_train /= 255
    x_test /= 255
    print("x_train shape:", x_train.shape)
    print(x_train.shape[0], "train samples")
    print(x_test.shape[0], "test samples")

    # convert class vectors to binary class matrices
    y_train = tf.keras.utils.to_categorical(y_train, NUM_CLASSES)
    y_test = tf.keras.utils.to_categorical(y_test, NUM_CLASSES)
    with strategy.scope():
        model = mnist_model(input_shape)
        model.compile(
            loss=tf.keras.losses.categorical_crossentropy,
            optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05),
            metrics=["accuracy"],
        )

    callbacks = []
    if FLAGS.model_dir:
        callbacks = [tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir)]

    model.fit(
        x_train,
        y_train,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
        epochs=EPOCHS,
        verbose=1,
        validation_data=(x_test, y_test),
    )
    return model.evaluate(x_test, y_test, batch_size=BATCH_SIZE, verbose=1)
Example #5
0
def main(unused_argv):
  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')

  tpu_cluster_resolver = (
      contrib_cluster_resolver.TPUClusterResolver(
          tpu=[FLAGS.tpu_name], zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
  tpu_grpc_url = tpu_cluster_resolver.get_master()

  config = contrib_tpu.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards))

  kwargs = {}
  if FLAGS.train_batch_size:
    kwargs['batch_size'] = FLAGS.train_batch_size

  train_and_eval_dict = model_lib.create_estimator_and_inputs(
      run_config=config,
      hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
      pipeline_config_path=FLAGS.pipeline_config_path,
      train_steps=FLAGS.num_train_steps,
      sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
      sample_1_of_n_eval_on_train_examples=(
          FLAGS.sample_1_of_n_eval_on_train_examples),
      use_tpu_estimator=True,
      use_tpu=FLAGS.use_tpu,
      num_shards=FLAGS.num_shards,
      save_final_config=FLAGS.mode == 'train',
      **kwargs)
  estimator = train_and_eval_dict['estimator']
  train_input_fn = train_and_eval_dict['train_input_fn']
  eval_input_fns = train_and_eval_dict['eval_input_fns']
  eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
  train_steps = train_and_eval_dict['train_steps']

  if FLAGS.mode == 'train':
    estimator.train(input_fn=train_input_fn, max_steps=train_steps)

  # Continuously evaluating.
  if FLAGS.mode == 'eval':
    if FLAGS.eval_training_data:
      name = 'training_data'
      input_fn = eval_on_train_input_fn
    else:
      name = 'validation_data'
      # Currently only a single eval input is allowed.
      input_fn = eval_input_fns[0]
    model_lib.continuous_eval(estimator, FLAGS.model_dir, input_fn, train_steps,
                              name, FLAGS.max_eval_retries)
Example #6
0
  def _create_tpu_strategy():
    resolver = cluster_resolver.TPUClusterResolver("")
    topology = tpu_lib.initialize_tpu_system(resolver)
    device_assignment = None
    if use_single_core:
      device_assignment = device_assignment_lib.DeviceAssignment(
          topology, core_assignment=device_assignment_lib.
          SINGLE_CORE_ASSIGNMENT)

    strategy = tpu_lib.TPUStrategy(resolver, steps_per_run=steps_per_run,
                                   device_assignment=device_assignment,
                                   **kwargs)
    return strategy
Example #7
0
def create_estimator(model_fn,
                     model_dir,
                     hparams,
                     use_tpu=False,
                     master='',
                     tpu_cluster=None,
                     save_checkpoint_steps=300,
                     save_summary_steps=300,
                     keep_checkpoint_max=None,
                     warm_start_from=None):
    """Creates an estimator."""
    def wrapped_model_fn(features, labels, mode, params, config):
        """Wrap model_fn to restore labels value if present in features."""
        # Workaround for Estimator API that forces 'labels' to be None when in
        # predict mode.
        # https://github.com/tensorflow/tensorflow/issues/17824
        # See also infer_util.labels_to_features_wrapper
        if labels is None and hasattr(features, 'labels'):
            labels = features.labels
        return model_fn(features, labels, mode, params, config)

    if tpu_cluster:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            tpu_cluster)
        master = None
    else:
        tpu_cluster_resolver = None

    config = contrib_tpu.RunConfig(
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=save_checkpoint_steps),
        master=master,
        cluster=tpu_cluster_resolver,
        save_summary_steps=save_summary_steps,
        save_checkpoints_steps=save_checkpoint_steps,
        keep_checkpoint_max=keep_checkpoint_max,
        keep_checkpoint_every_n_hours=1)

    params = copy.deepcopy(hparams)
    params.del_hparam('batch_size')
    return contrib_tpu.TPUEstimator(
        use_tpu=use_tpu,
        model_fn=wrapped_model_fn,
        model_dir=model_dir,
        params=params,
        train_batch_size=hparams.batch_size,
        eval_batch_size=hparams.eval_batch_size,
        predict_batch_size=hparams.predict_batch_size,
        config=config,
        warm_start_from=warm_start_from,
        eval_on_tpu=False)
Example #8
0
def construct_estimator(flags_obj, params, schedule_manager):
    """Construct an estimator from either Estimator or TPUEstimator.

  Args:
    flags_obj: The FLAGS object parsed from command line.
    params: A dict of run specific parameters.
    schedule_manager: A schedule.Manager object containing the run schedule.

  Returns:
    An estimator object to be used for training and eval.
  """
    if not params["use_tpu"]:
        distribution_strategy = distribution_utils.get_distribution_strategy(
            distribution_strategy=flags_obj.distribution_strategy,
            num_gpus=flags_core.get_num_gpus(flags_obj),
            all_reduce_alg=flags_obj.all_reduce_alg)
        return tf.estimator.Estimator(
            model_fn=model_fn,
            model_dir=flags_obj.model_dir,
            params=params,
            config=tf.estimator.RunConfig(
                train_distribute=distribution_strategy))

    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        tpu=flags_obj.tpu,
        zone=flags_obj.tpu_zone,
        project=flags_obj.tpu_gcp_project)

    tpu_config = contrib_tpu.TPUConfig(
        iterations_per_loop=schedule_manager.single_iteration_train_steps,
        num_shards=flags_obj.num_tpu_shards)

    run_config = contrib_tpu.RunConfig(cluster=tpu_cluster_resolver,
                                       model_dir=flags_obj.model_dir,
                                       session_config=tf.ConfigProto(
                                           allow_soft_placement=True,
                                           log_device_placement=True),
                                       tpu_config=tpu_config)

    return contrib_tpu.TPUEstimator(
        model_fn=model_fn,
        use_tpu=params["use_tpu"] and flags_obj.tpu != tpu_util.LOCAL,
        train_batch_size=schedule_manager.batch_size,
        eval_batch_size=schedule_manager.batch_size,
        params={
            # TPUEstimator needs to populate batch_size itself due to sharding.
            key: value
            for key, value in params.items() if key != "batch_size"
        },
        config=run_config)
def main(argv):
    del argv  # Unused
    if FLAGS.use_tpu:
      assert FLAGS.model_dir.startswith("gs://"), ("'model_dir' should be a "
                                                   "GCS bucket path!")

    # Fetch the data
    (train_x, train_y), (test_x, test_y) = load_data()

    # Feature columns describe how to use the input.
    my_feature_columns = []
    for key in train_x.keys():
      my_feature_columns.append(tf.feature_column.numeric_column(key=key))

    # Resolve TPU cluster and runconfig for this.
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(FLAGS.tpu)

    run_config = contrib_tpu.RunConfig(
        model_dir=FLAGS.model_dir,
        cluster=tpu_cluster_resolver,
        session_config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=True),
        tpu_config=contrib_tpu.TPUConfig(FLAGS.iterations),
    )

    # Build 2 hidden layer DNN with 10, 10 units respectively.
    classifier = contrib_tpu.TPUEstimator(
        model_fn=my_model,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        predict_batch_size=FLAGS.batch_size,
        config=run_config,
        params={
            # Name of the feature columns in the input data.
            "feature_columns": my_feature_columns,
            # Two hidden layers of 10 nodes each.
            "hidden_units": [10, 10],
            # The model must choose between 3 classes.
            "n_classes": 3,
            "use_tpu": FLAGS.use_tpu,
        })

    # Train the Model.
    classifier.train(
        input_fn=lambda params: train_input_fn(
            train_x, train_y, params["batch_size"]),
        max_steps=FLAGS.train_steps)
Example #10
0
def freeze_graph_tpu(model_path):
    """Custom freeze_graph implementation for Cloud TPU."""

    assert model_path
    assert FLAGS.tpu_name
    if FLAGS.tpu_name.startswith('grpc://'):
        tpu_grpc_url = FLAGS.tpu_name
    else:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=None, project=None)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
    sess = tf.Session(tpu_grpc_url)

    output_names = []
    with sess.graph.as_default():
        # Replicate the inference function for each TPU core.
        replicated_features = []
        feature_type = tf.bool if FLAGS.bool_features else tf.float32
        for i in range(FLAGS.num_tpu_cores):
            name = 'pos_tensor_%d' % i
            features = tf.placeholder(feature_type, [None], name=name)
            replicated_features.append((features, ))
        outputs = contrib_tpu.replicate(tpu_model_inference_fn,
                                        replicated_features)

        # The replicate op assigns names like output_0_shard_0 to the output
        # names. Give them human readable names.
        for i, (policy_output, value_output, _) in enumerate(outputs):
            policy_name = 'policy_output_%d' % i
            value_name = 'value_output_%d' % i
            output_names.extend([policy_name, value_name])
            tf.identity(policy_output, policy_name)
            tf.identity(value_output, value_name)

        tf.train.Saver().restore(sess, model_path)

    out_graph = tf.graph_util.convert_variables_to_constants(
        sess, sess.graph.as_graph_def(), output_names)

    metadata = make_model_metadata({
        'engine': 'tpu',
        'num_replicas': FLAGS.num_tpu_cores,
    })

    minigo_model.write_graph_def(out_graph, metadata, model_path + '.minigo')
def main(unused_argv):
    assert FLAGS.tpu_name
    if FLAGS.tpu_name.startswith('grpc://'):
        tpu_grpc_url = FLAGS.tpu_name
    else:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=None, project=None)
        tpu_grpc_url = tpu_cluster_resolver.get_master()

    sess = tf.Session(tpu_grpc_url)
    with sess.graph.as_default():
      contrib_tpu.initialize_system()
      contrib_tpu.shutdown_system()

    output_names = ['ConfigureDistributedTPU', 'ShutdownDistributedTPU']
    model_def = tf.graph_util.convert_variables_to_constants(
        sess, sess.graph.as_graph_def(), output_names)
    print(model_def)
Example #12
0
def main(argv):
    del argv  # Unused.
    tf.logging.set_verbosity(tf.logging.INFO)

    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        tpu_config=contrib_tpu.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
    )

    estimator = contrib_tpu.TPUEstimator(model_fn=model_fn,
                                         use_tpu=FLAGS.use_tpu,
                                         train_batch_size=FLAGS.batch_size,
                                         eval_batch_size=FLAGS.batch_size,
                                         predict_batch_size=FLAGS.batch_size,
                                         params={"data_dir": FLAGS.data_dir},
                                         config=run_config)
    # TPUEstimator.train *requires* a max_steps argument.
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)
    # TPUEstimator.evaluate *requires* a steps argument.
    # Note that the number of examples used during evaluation is
    # --eval_steps * --batch_size.
    # So if you change --batch_size then change --eval_steps too.
    if FLAGS.eval_steps:
        estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.eval_steps)

    # Run prediction on top few samples of test data.
    if FLAGS.enable_predict:
        predictions = estimator.predict(input_fn=predict_input_fn)

        for pred_dict in predictions:
            template = ('Prediction is "{}" ({:.1f}%).')

            class_id = pred_dict['class_ids']
            probability = pred_dict['probabilities'][class_id]

            print(template.format(class_id, 100 * probability))
Example #13
0
def main(argv):
  del argv

  tpu_address = FLAGS.tpu
  if not any(pref in FLAGS.tpu for pref in ['http://', 'grpc://']):
    tpu_address = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu).master()
    tpu_address = '{}:{}'.format(tpu_address[:-len(':1234')],
                                 '8470' if FLAGS.grpc else '8473')
  tpu_address = tpu_address[len('abcd://'):]
  tf.logging.info('ModelServer at: {}'.format(tpu_address))

  if FLAGS.grpc:
    grpc_channel = grpc.insecure_channel(tpu_address)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(grpc_channel)
    run_grpc_load_test(FLAGS.num_requests, FLAGS.qps, generate_grpc_request(),
                       stub)
  else:
    payload = generate_rest_payload()
    run_rest_load_test(FLAGS.num_requests, FLAGS.qps, tpu_address, payload)
Example #14
0
def build_run_config():
    """Return RunConfig for TPU estimator."""
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    iterations_per_loop = (eval_steps if FLAGS.mode == 'eval' else
                           FLAGS.iterations_per_loop)
    save_checkpoints_steps = FLAGS.save_checkpoints_steps or iterations_per_loop
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=None,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=FLAGS.num_shards,
            per_host_input_for_training=contrib_tpu.InputPipelineConfig.
            PER_HOST_V2))
    return run_config
Example #15
0
def main(argv):
    del argv  # Unused.

    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=3600,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_shards),
    )

    estimator = contrib_tpu.TPUEstimator(model_fn=model_fn,
                                         use_tpu=FLAGS.use_tpu,
                                         config=run_config,
                                         train_batch_size=FLAGS.batch_size)
    estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
Example #16
0
File: model.py Project: wen8411/tpu
def get_estimator(**kwargs):
    """Construct an estimator."""
    cfg = utils.Config(kwargs)

    if cfg.tpu.get('name'):
        tf.logging.info('Using cluster resolver.')
        cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            cfg.tpu.name, zone=cfg.tpu.zone, project=cfg.tpu.gcp_project)
        master = None
    else:
        cluster_resolver = None
        master = cfg.master

    tf.logging.info('Config:\n %s' % cfg)
    if cfg.tpu.enable:
        if not cfg.steps_per_epoch:
            raise ValueError('steps_per_epoch must be nonzero on TPU.')
        exp = contrib_tpu.TPUEstimator(
            model_fn=model_fn,
            config=contrib_tpu.RunConfig(
                cluster=cluster_resolver,
                master=master,
                model_dir=cfg.model_dir,
                tpu_config=contrib_tpu.TPUConfig(
                    iterations_per_loop=cfg.steps_per_epoch)),
            use_tpu=True,
            eval_on_tpu=False,
            # TPU requires these args, but they are ignored inside the input
            # function, which directly get train_batch_size or eval_batch_size.
            train_batch_size=cfg.dataset.train_batch_size,
            eval_batch_size=cfg.dataset.eval_batch_size,
            params=cfg,
        )
    else:
        exp = tf.estimator.Estimator(model_fn=model_fn,
                                     model_dir=cfg.model_dir,
                                     params=cfg)

    return exp
Example #17
0
    def __init__(self, config, tasks):
        self._config = config
        self._tasks = tasks
        self._preprocessor = preprocessing.Preprocessor(config, self._tasks)

        is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
        tpu_cluster_resolver = None
        if config.use_tpu and config.tpu_name:
            tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
                config.tpu_name,
                zone=config.tpu_zone,
                project=config.gcp_project)
        run_config = contrib_tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=config.checkpoints_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            save_checkpoints_secs=None,
            tpu_config=contrib_tpu.TPUConfig(
                iterations_per_loop=config.iterations_per_loop,
                num_shards=config.num_tpu_cores,
                per_host_input_for_training=is_per_host))

        (self._train_input_fn, self.train_steps,
         sizes) = self._preprocessor.prepare_train()
        task_weights = task_weighting.get_task_weights(config, sizes)

        model_fn = model_fn_builder(config=config,
                                    tasks=self._tasks,
                                    task_weights=task_weights,
                                    num_train_steps=self.train_steps)

        self._estimator = contrib_tpu.TPUEstimator(
            use_tpu=config.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            predict_batch_size=config.predict_batch_size)
Example #18
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if (not FLAGS.do_train and not FLAGS.do_eval_dev and not FLAGS.do_eval_test
            and not FLAGS.do_predict):
        raise ValueError(
            "At least one of `do_train`, `do_eval_dev` or `do_predict` or "
            "`do_eval_test' must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    label_list = ["Yes", "No"]
    if FLAGS.from_three_class_model:
        label_list.append("Neutral")

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = get_train()
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    eval_on = []
    if FLAGS.do_eval_dev:
        eval_on.append((get_dev(), "dev"))
    if FLAGS.do_eval_test:
        eval_on.append((get_test(), "test"))

    for eval_examples, name in eval_on:
        eval_file = os.path.join(FLAGS.output_dir, "%s.tf_record" % name)
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running %s *****" % name)
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            # Eval will be slightly WRONG on the TPU because it will truncate
            # the last batch.
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir,
                                        "%s_eval_results.txt" % name)
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** %s eval results *****" % name)
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = get_custom(FLAGS.predict_input_file)
        predict_file = os.path.join(FLAGS.output_dir,
                                    "predict.tf_record.%s" % FLAGS.exp_name)
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("Num examples = %d", len(predict_examples))

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        output_predictions = []

        counter = 0
        for result in estimator.predict(predict_input_fn,
                                        yield_single_examples=True):

            probs_result = result["probabilities"]
            answer = bool(probs_result[0] >= probs_result[1])
            output_predictions.append(
                json.dumps({
                    "passage":
                    predict_examples[counter].text_a,
                    "question":
                    predict_examples[counter].text_b,
                    "title":
                    "Steal Dataset",
                    "answer":
                    answer,
                    "soft_answer":
                    [float(probs_result[0]),
                     float(probs_result[1])]
                }))
            counter += 1

        tf.logging.info("%d in original file, %d predicted",
                        len(predict_examples), counter)

        with tf.gfile.GFile(FLAGS.predict_output_file, "w") as f:
            f.write("\n".join(output_predictions))
Example #19
0
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file)

  validate_flags_or_throw(albert_config)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  tokenizer = fine_tuning_utils.create_vocab(
      vocab_file=FLAGS.vocab_file,
      do_lower_case=FLAGS.do_lower_case,
      spm_model_file=FLAGS.spm_model_file,
      hub_module=FLAGS.albert_hub_module_handle)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  if FLAGS.do_train:
    iterations_per_loop = int(min(FLAGS.iterations_per_loop,
                                  FLAGS.save_checkpoints_steps))
  else:
    iterations_per_loop = FLAGS.iterations_per_loop
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      keep_checkpoint_max=0,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  train_examples = squad_utils.read_squad_examples(
      input_file=FLAGS.train_file, is_training=True)
  num_train_steps = int(
      len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
  if FLAGS.do_train:
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(train_examples)

  model_fn = squad_utils.v2_model_fn_builder(
      albert_config=albert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu,
      max_seq_length=FLAGS.max_seq_length,
      start_n_top=FLAGS.start_n_top,
      end_n_top=FLAGS.end_n_top,
      dropout_prob=FLAGS.dropout_prob,
      hub_module=FLAGS.albert_hub_module_handle)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    # We write to a temporary file to avoid storing very large constant tensors
    # in memory.

    if not tf.gfile.Exists(FLAGS.train_feature_file):
      train_writer = squad_utils.FeatureWriter(
          filename=os.path.join(FLAGS.train_feature_file), is_training=True)
      squad_utils.convert_examples_to_features(
          examples=train_examples,
          tokenizer=tokenizer,
          max_seq_length=FLAGS.max_seq_length,
          doc_stride=FLAGS.doc_stride,
          max_query_length=FLAGS.max_query_length,
          is_training=True,
          output_fn=train_writer.process_feature,
          do_lower_case=FLAGS.do_lower_case)
      train_writer.close()

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num orig examples = %d", len(train_examples))
    # tf.logging.info("  Num split examples = %d", train_writer.num_features)
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    del train_examples

    train_input_fn = squad_utils.input_fn_builder(
        input_file=FLAGS.train_feature_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.train_batch_size,
        is_v2=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  if FLAGS.do_predict:
    with tf.gfile.Open(FLAGS.predict_file) as predict_file:
      prediction_json = json.load(predict_file)["data"]
    eval_examples = squad_utils.read_squad_examples(
        input_file=FLAGS.predict_file, is_training=False)

    if (tf.gfile.Exists(FLAGS.predict_feature_file) and tf.gfile.Exists(
        FLAGS.predict_feature_left_file)):
      tf.logging.info("Loading eval features from {}".format(
          FLAGS.predict_feature_left_file))
      with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin:
        eval_features = pickle.load(fin)
    else:
      eval_writer = squad_utils.FeatureWriter(
          filename=FLAGS.predict_feature_file, is_training=False)
      eval_features = []

      def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

      squad_utils.convert_examples_to_features(
          examples=eval_examples,
          tokenizer=tokenizer,
          max_seq_length=FLAGS.max_seq_length,
          doc_stride=FLAGS.doc_stride,
          max_query_length=FLAGS.max_query_length,
          is_training=False,
          output_fn=append_feature,
          do_lower_case=FLAGS.do_lower_case)
      eval_writer.close()

      with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout:
        pickle.dump(eval_features, fout)

    tf.logging.info("***** Running predictions *****")
    tf.logging.info("  Num orig examples = %d", len(eval_examples))
    tf.logging.info("  Num split examples = %d", len(eval_features))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_input_fn = squad_utils.input_fn_builder(
        input_file=FLAGS.predict_feature_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.predict_batch_size,
        is_v2=True)

    def get_result(checkpoint):
      """Evaluate the checkpoint on SQuAD v2.0."""
      # If running eval on the TPU, you will need to specify the number of
      # steps.
      reader = tf.train.NewCheckpointReader(checkpoint)
      global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
      all_results = []
      for result in estimator.predict(
          predict_input_fn, yield_single_examples=True,
          checkpoint_path=checkpoint):
        if len(all_results) % 1000 == 0:
          tf.logging.info("Processing example: %d" % (len(all_results)))
        unique_id = int(result["unique_ids"])
        start_top_log_probs = (
            [float(x) for x in result["start_top_log_probs"].flat])
        start_top_index = [int(x) for x in result["start_top_index"].flat]
        end_top_log_probs = (
            [float(x) for x in result["end_top_log_probs"].flat])
        end_top_index = [int(x) for x in result["end_top_index"].flat]

        cls_logits = float(result["cls_logits"].flat[0])
        all_results.append(
            squad_utils.RawResultV2(
                unique_id=unique_id,
                start_top_log_probs=start_top_log_probs,
                start_top_index=start_top_index,
                end_top_log_probs=end_top_log_probs,
                end_top_index=end_top_index,
                cls_logits=cls_logits))

      output_prediction_file = os.path.join(
          FLAGS.output_dir, "predictions.json")
      output_nbest_file = os.path.join(
          FLAGS.output_dir, "nbest_predictions.json")
      output_null_log_odds_file = os.path.join(
          FLAGS.output_dir, "null_odds.json")

      result_dict = {}
      cls_dict = {}
      squad_utils.accumulate_predictions_v2(
          result_dict, cls_dict, eval_examples, eval_features,
          all_results, FLAGS.n_best_size, FLAGS.max_answer_length,
          FLAGS.start_n_top, FLAGS.end_n_top)

      return squad_utils.evaluate_v2(
          result_dict, cls_dict, prediction_json, eval_examples,
          eval_features, all_results, FLAGS.n_best_size,
          FLAGS.max_answer_length, output_prediction_file, output_nbest_file,
          output_null_log_odds_file), int(global_step)

    def _find_valid_cands(curr_step):
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      candidates = []
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          idx = ckpt_name.split("-")[-1]
          if idx != "best" and int(idx) > curr_step:
            candidates.append(filename)
      return candidates

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    key_name = "f1"
    writer = tf.gfile.GFile(output_eval_file, "w")
    if tf.gfile.Exists(checkpoint_path + ".index"):
      result = get_result(checkpoint_path)
      best_perf = result[0][key_name]
      global_step = result[1]
    else:
      global_step = -1
      best_perf = -1
      checkpoint_path = None
    while global_step < num_train_steps:
      steps_and_files = {}
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
          if cur_filename.split("-")[-1] == "best":
            continue
          gstep = int(cur_filename.split("-")[-1])
          if gstep not in steps_and_files:
            tf.logging.info("Add {} to eval list.".format(cur_filename))
            steps_and_files[gstep] = cur_filename
      tf.logging.info("found {} files.".format(len(steps_and_files)))
      if not steps_and_files:
        tf.logging.info("found 0 file, global step: {}. Sleeping."
                        .format(global_step))
        time.sleep(60)
      else:
        for ele in sorted(steps_and_files.items()):
          step, checkpoint_path = ele
          if global_step >= step:
            if len(_find_valid_cands(step)) > 1:
              for ext in ["meta", "data-00000-of-00001", "index"]:
                src_ckpt = checkpoint_path + ".{}".format(ext)
                tf.logging.info("removing {}".format(src_ckpt))
                tf.gfile.Remove(src_ckpt)
            continue
          result, global_step = get_result(checkpoint_path)
          tf.logging.info("***** Eval results *****")
          for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
          if result[key_name] > best_perf:
            best_perf = result[key_name]
            for ext in ["meta", "data-00000-of-00001", "index"]:
              src_ckpt = checkpoint_path + ".{}".format(ext)
              tgt_ckpt = checkpoint_path.rsplit(
                  "-", 1)[0] + "-best.{}".format(ext)
              tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt))
              tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True)
              writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt))
          writer.write("best {} = {}\n".format(key_name, best_perf))
          tf.logging.info("  best {} = {}\n".format(key_name, best_perf))

          if len(_find_valid_cands(global_step)) > 2:
            for ext in ["meta", "data-00000-of-00001", "index"]:
              src_ckpt = checkpoint_path + ".{}".format(ext)
              tf.logging.info("removing {}".format(src_ckpt))
              tf.gfile.Remove(src_ckpt)
          writer.write("=" * 50 + "\n")

    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    result, global_step = get_result(checkpoint_path)
    tf.logging.info("***** Final Eval results *****")
    for key in sorted(result.keys()):
      tf.logging.info("  %s = %s", key, str(result[key]))
      writer.write("%s = %s\n" % (key, str(result[key])))
    writer.write("best perf happened at step: {}".format(global_step))
Example #20
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": classifier_utils.ColaProcessor,
        "mnli": classifier_utils.MnliProcessor,
        "mismnli": classifier_utils.MisMnliProcessor,
        "mrpc": classifier_utils.MrpcProcessor,
        "rte": classifier_utils.RteProcessor,
        "sst-2": classifier_utils.Sst2Processor,
        "sts-b": classifier_utils.StsbProcessor,
        "qqp": classifier_utils.QqpProcessor,
        "qnli": classifier_utils.QnliProcessor,
        "wnli": classifier_utils.WnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    if not FLAGS.albert_config_file and not FLAGS.albert_hub_module_handle:
        raise ValueError("At least one of `--albert_config_file` and "
                         "`--albert_hub_module_handle` must be set")

    if FLAGS.albert_config_file:
        albert_config = modeling.AlbertConfig.from_json_file(
            FLAGS.albert_config_file)
        if FLAGS.max_seq_length > albert_config.max_position_embeddings:
            raise ValueError(
                "Cannot use sequence length %d because the ALBERT model "
                "was only trained up to sequence length %d" %
                (FLAGS.max_seq_length, albert_config.max_position_embeddings))
    else:
        albert_config = None  # Get the config from TF-Hub.

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name](
        use_spm=True if FLAGS.spm_model_file else False,
        do_lower_case=FLAGS.do_lower_case)

    label_list = processor.get_labels()

    tokenizer = fine_tuning_utils.create_vocab(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        spm_model_file=FLAGS.spm_model_file,
        hub_module=FLAGS.albert_hub_module_handle)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    if FLAGS.do_train:
        iterations_per_loop = int(
            min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps))
    else:
        iterations_per_loop = FLAGS.iterations_per_loop
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=int(FLAGS.save_checkpoints_steps),
        keep_checkpoint_max=0,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
    model_fn = classifier_utils.model_fn_builder(
        albert_config=albert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.train_step,
        num_warmup_steps=FLAGS.warmup_step,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        task_name=task_name,
        hub_module=FLAGS.albert_hub_module_handle,
        optimizer=FLAGS.optimizer)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        cached_dir = FLAGS.cached_dir
        if not cached_dir:
            cached_dir = FLAGS.output_dir
        train_file = os.path.join(cached_dir, task_name + "_train.tf_record")
        if not tf.gfile.Exists(train_file):
            classifier_utils.file_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer,
                train_file, task_name)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", FLAGS.train_step)
        train_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.train_batch_size)
        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(classifier_utils.PaddingInputExample())

        cached_dir = FLAGS.cached_dir
        if not cached_dir:
            cached_dir = FLAGS.output_dir
        eval_file = os.path.join(cached_dir, task_name + "_eval.tf_record")
        if not tf.gfile.Exists(eval_file):
            classifier_utils.file_based_convert_examples_to_features(
                eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
                eval_file, task_name)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.eval_batch_size)

        best_trial_info_file = os.path.join(FLAGS.output_dir, "best_trial.txt")

        def _best_trial_info():
            """Returns information about which checkpoints have been evaled so far."""
            if tf.gfile.Exists(best_trial_info_file):
                with tf.gfile.GFile(best_trial_info_file, "r") as best_info:
                    global_step, best_metric_global_step, metric_value = (
                        best_info.read().split(":"))
                    global_step = int(global_step)
                    best_metric_global_step = int(best_metric_global_step)
                    metric_value = float(metric_value)
            else:
                metric_value = -1
                best_metric_global_step = -1
                global_step = -1
            tf.logging.info(
                "Best trial info: Step: %s, Best Value Step: %s, "
                "Best Value: %s", global_step, best_metric_global_step,
                metric_value)
            return global_step, best_metric_global_step, metric_value

        def _remove_checkpoint(checkpoint_path):
            for ext in ["meta", "data-00000-of-00001", "index"]:
                src_ckpt = checkpoint_path + ".{}".format(ext)
                tf.logging.info("removing {}".format(src_ckpt))
                tf.gfile.Remove(src_ckpt)

        def _find_valid_cands(curr_step):
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            candidates = []
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")

        if task_name == "sts-b":
            key_name = "pearson"
        elif task_name == "cola":
            key_name = "matthew_corr"
        else:
            key_name = "eval_accuracy"

        global_step, best_perf_global_step, best_perf = _best_trial_info()
        writer = tf.gfile.GFile(output_eval_file, "w")
        while global_step < FLAGS.train_step:
            steps_and_files = {}
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                    if cur_filename.split("-")[-1] == "best":
                        continue
                    gstep = int(cur_filename.split("-")[-1])
                    if gstep not in steps_and_files:
                        tf.logging.info(
                            "Add {} to eval list.".format(cur_filename))
                        steps_and_files[gstep] = cur_filename
            tf.logging.info("found {} files.".format(len(steps_and_files)))
            if not steps_and_files:
                tf.logging.info(
                    "found 0 file, global step: {}. Sleeping.".format(
                        global_step))
                time.sleep(60)
            else:
                for checkpoint in sorted(steps_and_files.items()):
                    step, checkpoint_path = checkpoint
                    if global_step >= step:
                        if (best_perf_global_step != step
                                and len(_find_valid_cands(step)) > 1):
                            _remove_checkpoint(checkpoint_path)
                        continue
                    result = estimator.evaluate(
                        input_fn=eval_input_fn,
                        steps=eval_steps,
                        checkpoint_path=checkpoint_path)
                    global_step = result["global_step"]
                    tf.logging.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        tf.logging.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write("best = {}\n".format(best_perf))
                    if result[key_name] > best_perf:
                        best_perf = result[key_name]
                        best_perf_global_step = global_step
                    elif len(_find_valid_cands(global_step)) > 1:
                        _remove_checkpoint(checkpoint_path)
                    writer.write("=" * 50 + "\n")
                    writer.flush()
                    with tf.gfile.GFile(best_trial_info_file,
                                        "w") as best_info:
                        best_info.write("{}:{}:{}".format(
                            global_step, best_perf_global_step, best_perf))
        writer.close()

        for ext in ["meta", "data-00000-of-00001", "index"]:
            src_ckpt = "model.ckpt-{}.{}".format(best_perf_global_step, ext)
            tgt_ckpt = "model.ckpt-best.{}".format(ext)
            tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt))
            tf.io.gfile.rename(os.path.join(FLAGS.output_dir, src_ckpt),
                               os.path.join(FLAGS.output_dir, tgt_ckpt),
                               overwrite=True)

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(classifier_utils.PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        classifier_utils.file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file, task_name)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.predict_batch_size)

        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        result = estimator.predict(input_fn=predict_input_fn,
                                   checkpoint_path=checkpoint_path)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        output_submit_file = os.path.join(FLAGS.output_dir,
                                          "submit_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\
            tf.gfile.GFile(output_submit_file, "w") as sub_writer:
            sub_writer.write("index" + "\t" + "prediction\n")
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, (example, prediction)) in\
                enumerate(zip(predict_examples, result)):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                pred_writer.write(output_line)

                if task_name != "sts-b":
                    actual_label = label_list[int(prediction["predictions"])]
                else:
                    actual_label = str(prediction["predictions"])
                sub_writer.write(example.guid + "\t" + actual_label + "\n")
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
Example #21
0
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train`, `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    num_train_steps = int(FLAGS.train_data_size / FLAGS.train_batch_size)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu,
      num_choices=FLAGS.num_choices,
      add_masking=FLAGS.include_mlm)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=FLAGS.train_file,
        is_training=True,
        drop_remainder=True,
        add_masking=FLAGS.include_mlm)
    estimator.train(input_fn=train_input_fn, steps=num_train_steps)

  if FLAGS.do_eval:
    # This tells the estimator to run through the entire set.
    if FLAGS.eval_data_size < 0:
      eval_steps = None
    else:
      eval_steps = int(FLAGS.eval_data_size / FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    # Note that we are masking inputs for eval as well as training and this will
    # decrease eval performance
    eval_input_fn = file_based_input_fn_builder(
        input_file=FLAGS.eval_file,
        is_training=False,
        drop_remainder=eval_drop_remainder,
        add_masking=FLAGS.include_mlm)

    # checkpoints_iterator blocks until a new checkpoint appears.
    for ckpt in contrib_training.checkpoints_iterator(estimator.model_dir):
      try:
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        tf.logging.info("********** Eval results:*******\n")
        for key in sorted(result.keys()):
          tf.logging.info("%s = %s" % (key, str(result[key])))
      except tf.errors.NotFoundError:
        tf.logging.error("Checkpoint path '%s' no longer exists.", ckpt)
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  processors = {"sst-2": SST2Processor, "mnli": MNLIProcessor}

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)

  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name,))

  processor = processors[task_name]()

  label_list = processor.get_labels()

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    file_based_convert_examples_to_features(train_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            train_file)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True,
        num_labels=len(label_list))
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  if FLAGS.do_eval:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    num_actual_eval_examples = len(eval_examples)
    if FLAGS.use_tpu:
      # TPU requires a fixed batch size for all batches, therefore the number
      # of examples must be a multiple of the batch size, or else examples
      # will get dropped. So we pad with fake examples which are ignored
      # later on. These do NOT count towards the metric (all tf.metrics
      # support a per-instance weight, and these get a weight of 0.0).
      while len(eval_examples) % FLAGS.eval_batch_size != 0:
        eval_examples.append(PaddingInputExample())

    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    file_based_convert_examples_to_features(eval_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            eval_file)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    # This tells the estimator to run through the entire set.
    eval_steps = None
    # However, if running eval on the TPU, you will need to specify the
    # number of steps.
    if FLAGS.use_tpu:
      assert len(eval_examples) % FLAGS.eval_batch_size == 0
      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder,
        num_labels=len(label_list))

    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

  if FLAGS.do_predict:
    predict_examples = processor.get_test_examples(FLAGS.predict_input_file)
    num_actual_predict_examples = len(predict_examples)
    if FLAGS.use_tpu:
      # TPU requires a fixed batch size for all batches, therefore the number
      # of examples must be a multiple of the batch size, or else examples
      # will get dropped. So we pad with fake examples which are ignored
      # later on.
      while len(predict_examples) % FLAGS.predict_batch_size != 0:
        predict_examples.append(PaddingInputExample())

    predict_file = os.path.join(FLAGS.output_dir,
                                FLAGS.exp_name + ".predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_drop_remainder = True if FLAGS.use_tpu else False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder,
        num_labels=len(label_list))

    result = estimator.predict(input_fn=predict_input_fn)

    if FLAGS.predict_output_file is None:
      predict_output_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
    else:
      predict_output_file = FLAGS.predict_output_file

    with tf.gfile.GFile(predict_output_file, "w") as writer:
      num_written_lines = 0
      tf.logging.info("***** Predict results *****")
      for (i, prediction) in enumerate(result):
        probabilities = prediction["probabilities"]
        if i >= num_actual_predict_examples:
          break
        output_line = "\t".join(
            str(class_probability)
            for class_probability in probabilities) + "\n"
        writer.write(output_line)
        num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples
def main(argv):
    del argv  # Unused.

    if FLAGS.start_profiler_server:
        # Starts profiler. It will perform profiling when receive profiling request.
        profiler.start_profiler_server(FLAGS.profiler_port_number)

    if FLAGS.use_tpu:
        if FLAGS.distribution_strategy is None:
            tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
                FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
            tpu_grpc_url = tpu_cluster_resolver.get_master()
            tf.Session.reset(tpu_grpc_url)
        else:
            raise RuntimeError(
                'Distribution strategy must be None when --use_tpu is True.')
    else:
        tpu_cluster_resolver = None

    if FLAGS.mode not in ['train', 'eval', 'train_and_eval']:
        raise ValueError('Unrecognize --mode: %s' % FLAGS.mode)

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')
    if FLAGS.mode == 'train_and_eval':
        if FLAGS.distribution_strategy is not None:
            raise RuntimeError('You must use --distribution_strategy=None for '
                               'train_and_eval.')

    # Parse hparams
    hparams = retinanet_model.default_hparams()
    config_file = FLAGS.config_file
    hparams.num_epochs = FLAGS.num_epochs
    if config_file and tf.gfile.Exists(config_file):
        # load params from file.
        with tf.gfile.Open(config_file, 'r') as f:
            values_map = json.load(f)
            hparams.override_from_dict(values_map)
    hparams.parse(FLAGS.hparams)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # In the TPUEstimator context, the meaning of `shard` and `replica` is the
    # same; follwing the API, here has mixed use of both.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array'
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in hparams, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        image_size = hparams.get('image_size')
        for level in range(hparams.get('min_level'),
                           hparams.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = image_size // (2**level)
            if _can_partition(spatial_dim):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None

        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.use_xla and not FLAGS.use_tpu:
        config_proto.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)
    if FLAGS.auto_mixed_precision and FLAGS.distribution_strategy:
        config_proto.graph_options.rewrite_options.auto_mixed_precision = (
            rewriter_config_pb2.RewriterConfig.ON)

    if FLAGS.distribution_strategy is None:
        # Uses TPUEstimator.
        params = dict(
            hparams.values(),
            num_shards=num_shards,
            num_examples_per_epoch=FLAGS.num_examples_per_epoch,
            use_tpu=FLAGS.use_tpu,
            resnet_checkpoint=FLAGS.resnet_checkpoint,
            val_json_file=FLAGS.val_json_file,
            mode=FLAGS.mode,
        )
        tpu_config = contrib_tpu.TPUConfig(
            FLAGS.iterations_per_loop,
            num_shards=num_shards,
            num_cores_per_replica=num_cores_per_replica,
            input_partition_dims=input_partition_dims,
            per_host_input_for_training=contrib_tpu.InputPipelineConfig.
            PER_HOST_V2)

        run_config = contrib_tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            evaluation_master=FLAGS.eval_master,
            model_dir=FLAGS.model_dir,
            log_step_count_steps=FLAGS.iterations_per_loop,
            session_config=config_proto,
            tpu_config=tpu_config,
        )
    else:
        if FLAGS.num_gpus < 0:
            raise ValueError('`num_gpus` cannot be negative.')

        def _per_device_batch_size(batch_size, num_gpus):
            """Calculate per GPU batch for Estimator.

      Args:
        batch_size: Global batch size to be divided among devices.
        num_gpus: How many GPUs are used per worker.
      Returns:
        Batch size per device.
      Raises:
        ValueError: if batch_size is not divisible by number of devices
      """
            if num_gpus <= 1:
                return batch_size

            remainder = batch_size % num_gpus
            if remainder:
                raise ValueError(
                    'Batch size must be a multiple of the number GPUs per worker.'
                )
            return int(batch_size / num_gpus)

        # Uses Estimator.
        params = dict(
            hparams.values(),
            num_examples_per_epoch=FLAGS.num_examples_per_epoch,
            use_tpu=FLAGS.use_tpu,
            resnet_checkpoint=FLAGS.resnet_checkpoint,
            val_json_file=FLAGS.val_json_file,
            mode=FLAGS.mode,
            use_bfloat16=False,
            auto_mixed_precision=FLAGS.auto_mixed_precision,
            dataset_max_intra_op_parallelism=FLAGS.
            dataset_max_intra_op_parallelism,
            dataset_private_threadpool_size=FLAGS.
            dataset_private_threadpool_size,
        )

        if FLAGS.distribution_strategy == 'mirrored':
            params['batch_size'] = _per_device_batch_size(
                FLAGS.train_batch_size, FLAGS.num_gpus)

            if FLAGS.num_gpus == 0:
                devices = ['device:CPU:0']
            else:
                devices = [
                    'device:GPU:{}'.format(i) for i in range(FLAGS.num_gpus)
                ]

            if FLAGS.all_reduce_alg:
                dist_strat = tf.distribute.MirroredStrategy(
                    devices=devices,
                    cross_device_ops=contrib_distribute.
                    AllReduceCrossDeviceOps(FLAGS.all_reduce_alg, num_packs=2))
            else:
                dist_strat = tf.distribute.MirroredStrategy(devices=devices)

            run_config = tf.estimator.RunConfig(session_config=config_proto,
                                                train_distribute=dist_strat,
                                                eval_distribute=dist_strat)

        elif FLAGS.distribution_strategy == 'multi_worker_mirrored':
            local_device_protos = device_lib.list_local_devices()
            params['batch_size'] = _per_device_batch_size(
                FLAGS.train_batch_size,
                sum([1 for d in local_device_protos
                     if d.device_type == 'GPU']))

            if FLAGS.worker_hosts is None:
                tf_config_json = json.loads(os.environ.get('TF_CONFIG', '{}'))
                # Replaces master with chief.
                if tf_config_json:
                    if 'master' in tf_config_json['cluster']:
                        tf_config_json['cluster']['chief'] = tf_config_json[
                            'cluster'].pop('master')
                        if tf_config_json['task']['type'] == 'master':
                            tf_config_json['task']['type'] = 'chief'
                        os.environ['TF_CONFIG'] = json.dumps(tf_config_json)

                tf_config_json = json.loads(os.environ['TF_CONFIG'])
                worker_hosts = tf_config_json['cluster']['worker']
                worker_hosts.extend(tf_config_json['cluster'].get('chief', []))
            else:
                # Set TF_CONFIG environment variable
                worker_hosts = FLAGS.worker_hosts.split(',')
                os.environ['TF_CONFIG'] = json.dumps({
                    'cluster': {
                        'worker': worker_hosts
                    },
                    'task': {
                        'type': 'worker',
                        'index': FLAGS.task_index
                    }
                })

            dist_strat = tf.distribute.experimental.MultiWorkerMirroredStrategy(
                communication=_COLLECTIVE_COMMUNICATION_OPTIONS[
                    FLAGS.all_reduce_alg])
            run_config = tf.estimator.RunConfig(session_config=config_proto,
                                                train_distribute=dist_strat)

        else:
            raise ValueError('Unrecognized distribution strategy.')

    if FLAGS.mode == 'train':
        if FLAGS.model_dir is not None:
            if not tf.gfile.Exists(FLAGS.model_dir):
                tf.gfile.MakeDirs(FLAGS.model_dir)
            with tf.gfile.Open(os.path.join(FLAGS.model_dir, 'hparams.json'),
                               'w') as f:
                json.dump(hparams.values(), f, sort_keys=True, indent=2)
        tf.logging.info(params)
        if FLAGS.distribution_strategy is None:
            total_steps = int(
                (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                FLAGS.train_batch_size)
            train_estimator = contrib_tpu.TPUEstimator(
                model_fn=retinanet_model.tpu_retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                config=run_config,
                params=params)
            train_estimator.train(input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
                                  max_steps=total_steps)

            # Run evaluation after training finishes.
            eval_params = dict(
                params,
                input_rand_hflip=False,
                resnet_checkpoint=None,
                is_training_bn=False,
            )
            eval_estimator = contrib_tpu.TPUEstimator(
                model_fn=retinanet_model.tpu_retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            if FLAGS.eval_after_training:

                if FLAGS.val_json_file is None:
                    raise RuntimeError(
                        'You must specify --val_json_file for evaluation.')

                eval_results = evaluation.evaluate(
                    eval_estimator,
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    num_eval_samples=FLAGS.eval_samples,
                    eval_batch_size=FLAGS.eval_batch_size,
                    validation_json_file=FLAGS.val_json_file)
                tf.logging.info('Eval results: %s' % eval_results)
                output_dir = os.path.join(FLAGS.model_dir, 'train_eval')
                tf.gfile.MakeDirs(output_dir)
                summary_writer = tf.summary.FileWriter(output_dir)

                evaluation.write_summary(eval_results, summary_writer,
                                         total_steps)
        else:
            train_estimator = tf.estimator.Estimator(
                model_fn=retinanet_model.est_retinanet_model_fn,
                model_dir=FLAGS.model_dir,
                config=run_config,
                params=params)
            if FLAGS.distribution_strategy == 'mirrored':
                total_steps = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                tf.logging.info('Starting `MirroredStrategy` training...')
                train_estimator.train(input_fn=dataloader.InputReader(
                    FLAGS.training_file_pattern, is_training=True),
                                      max_steps=total_steps)
            elif FLAGS.distribution_strategy == 'multi_worker_mirrored':
                total_steps = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    (len(worker_hosts) * FLAGS.train_batch_size))
                train_spec = tf.estimator.TrainSpec(
                    input_fn=dataloader.InputReader(
                        FLAGS.training_file_pattern, is_training=True),
                    max_steps=total_steps)
                eval_spec = tf.estimator.EvalSpec(input_fn=tf.data.Dataset)
                tf.logging.info(
                    'Starting `MultiWorkerMirroredStrategy` training...')
                tf.estimator.train_and_evaluate(train_estimator, train_spec,
                                                eval_spec)
            else:
                raise ValueError('Unrecognized distribution strategy.')

    elif FLAGS.mode == 'eval':
        # Eval only runs on CPU or GPU host with batch_size = 1.
        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        # Also, disable use_bfloat16 for eval on CPU/GPU.
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')
        eval_params = dict(
            params,
            input_rand_hflip=False,
            resnet_checkpoint=None,
            is_training_bn=False,
        )
        if FLAGS.distribution_strategy is None:
            # Uses TPUEstimator.
            eval_estimator = contrib_tpu.TPUEstimator(
                model_fn=retinanet_model.tpu_retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
        else:
            # Uses Estimator.
            if FLAGS.distribution_strategy == 'multi_worker_mirrored':
                raise ValueError(
                    '--distribution_strategy=multi_worker_mirrored is not supported '
                    'for eval.')
            elif FLAGS.distribution_strategy == 'mirrored':
                eval_estimator = tf.estimator.Estimator(
                    model_fn=retinanet_model.est_retinanet_model_fn,
                    model_dir=FLAGS.model_dir,
                    config=run_config,
                    params=params)
            else:
                raise ValueError('Unrecognized distribution strategy.')

        def terminate_eval():
            tf.logging.info(
                'Terminating eval after %d seconds of no checkpoints' %
                FLAGS.eval_timeout)
            return True

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        summary_writer = tf.summary.FileWriter(output_dir)
        # Run evaluation when there's a new checkpoint
        for ckpt in contrib_training.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = evaluation.evaluate(
                    eval_estimator,
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    num_eval_samples=FLAGS.eval_samples,
                    eval_batch_size=FLAGS.eval_batch_size,
                    validation_json_file=FLAGS.val_json_file)
                tf.logging.info('Eval results: %s' % eval_results)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                evaluation.write_summary(eval_results, summary_writer,
                                         current_step)
                if current_step >= total_step:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        if FLAGS.distribution_strategy is not None:
            raise ValueError(
                'Distribution strategy is not implemented for --mode=train_and_eval.'
            )
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

        output_dir = os.path.join(FLAGS.model_dir, 'train_and_eval')
        tf.gfile.MakeDirs(output_dir)
        summary_writer = tf.summary.FileWriter(output_dir)
        num_cycles = int(FLAGS.num_epochs * FLAGS.num_examples_per_epoch /
                         FLAGS.num_steps_per_eval)
        for cycle in range(num_cycles):
            tf.logging.info('Starting training cycle, epoch: %d.' % cycle)
            train_estimator = contrib_tpu.TPUEstimator(
                model_fn=retinanet_model.tpu_retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                config=run_config,
                params=params)
            train_estimator.train(input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
                                  steps=FLAGS.num_steps_per_eval)

            tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle)
            # Run evaluation after every epoch.
            eval_params = dict(
                params,
                input_rand_hflip=False,
                resnet_checkpoint=None,
                is_training_bn=False,
            )

            eval_estimator = contrib_tpu.TPUEstimator(
                model_fn=retinanet_model.tpu_retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            eval_results = evaluation.evaluate(
                eval_estimator,
                input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                                is_training=False),
                num_eval_samples=FLAGS.eval_samples,
                eval_batch_size=FLAGS.eval_batch_size,
                validation_json_file=FLAGS.val_json_file)
            tf.logging.info('Evaluation results: %s' % eval_results)
            current_step = int(cycle * FLAGS.num_steps_per_eval)
            evaluation.write_summary(eval_results, summary_writer,
                                     current_step)

    else:
        tf.logging.info('Mode not found.')

    if FLAGS.model_dir:
        tf.logging.info('Exporting saved model.')
        eval_params = dict(
            params,
            use_tpu=True,
            input_rand_hflip=False,
            resnet_checkpoint=None,
            is_training_bn=False,
            use_bfloat16=False,
        )
        eval_estimator = contrib_tpu.TPUEstimator(
            model_fn=retinanet_model.tpu_retinanet_model_fn,
            use_tpu=True,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.inference_batch_size,
            config=run_config,
            params=eval_params)

        export_path = eval_estimator.export_saved_model(
            export_dir_base=FLAGS.model_dir,
            serving_input_receiver_fn=build_serving_input_fn(
                hparams.image_size, FLAGS.inference_batch_size))
        if FLAGS.add_warmup_requests:
            inference_warmup.write_warmup_requests(
                export_path,
                FLAGS.model_name,
                hparams.image_size,
                batch_sizes=[FLAGS.inference_batch_size])
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": classifier_utils.ColaProcessor,
        "mnli": classifier_utils.MnliProcessor,
        "mismnli": classifier_utils.MisMnliProcessor,
        "mrpc": classifier_utils.MrpcProcessor,
        "rte": classifier_utils.RteProcessor,
        "sst-2": classifier_utils.Sst2Processor,
        "sts-b": classifier_utils.StsbProcessor,
        "qqp": classifier_utils.QqpProcessor,
        "qnli": classifier_utils.QnliProcessor,
        "wnli": classifier_utils.WnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.albert_config_file and not FLAGS.albert_hub_module_handle:
        raise ValueError("At least one of `--albert_config_file` and "
                         "`--albert_hub_module_handle` must be set")

    if FLAGS.albert_config_file:
        albert_config = modeling.AlbertConfig.from_json_file(
            FLAGS.albert_config_file)
        if FLAGS.max_seq_length > albert_config.max_position_embeddings:
            raise ValueError(
                "Cannot use sequence length %d because the ALBERT model "
                "was only trained up to sequence length %d" %
                (FLAGS.max_seq_length, albert_config.max_position_embeddings))
    else:
        albert_config = None  # Get the config from TF-Hub.

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name](
        use_spm=True if FLAGS.spm_model_file else False,
        do_lower_case=FLAGS.do_lower_case)

    label_list = processor.get_labels()

    tokenizer = fine_tuning_utils.create_vocab(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        spm_model_file=FLAGS.spm_model_file,
        hub_module=FLAGS.albert_hub_module_handle)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    iterations_per_loop = FLAGS.iterations_per_loop
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=int(FLAGS.save_checkpoints_steps),
        keep_checkpoint_max=0,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = classifier_utils.model_fn_builder(
        albert_config=albert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.train_step,
        num_warmup_steps=FLAGS.warmup_step,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        task_name=task_name,
        hub_module=FLAGS.albert_hub_module_handle,
        optimizer=FLAGS.optimizer)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    # if FLAGS.do_predict:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    num_actual_eval_examples = len(eval_examples)
    if FLAGS.use_tpu:
        # TPU requires a fixed batch size for all batches, therefore the number
        # of examples must be a multiple of the batch size, or else examples
        # will get dropped. So we pad with fake examples which are ignored
        # later on.
        while len(eval_examples) % FLAGS.predict_batch_size != 0:
            eval_examples.append(classifier_utils.PaddingInputExample())

    error_analysis_file = os.path.join(FLAGS.output_dir,
                                       "error_analysis.tf_record")
    classifier_utils.file_based_convert_examples_to_features(
        eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
        error_analysis_file, task_name)

    tf.logging.info("***** Running error analysis on dev set*****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    error_analysis_drop_remainder = True if FLAGS.use_tpu else False
    error_analysis_input_fn = classifier_utils.file_based_input_fn_builder(
        input_file=error_analysis_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=error_analysis_drop_remainder,
        task_name=task_name,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.predict_batch_size)

    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    result = estimator.predict(input_fn=error_analysis_input_fn,
                               checkpoint_path=checkpoint_path)

    output_error_analysis_predict_file = os.path.join(
        FLAGS.output_dir, "error_analysis_test_results.tsv")
    output_error_analysis_submit_file = os.path.join(
        FLAGS.output_dir, "error_analysis_submit_results.tsv")
    with tf.gfile.GFile(output_error_analysis_predict_file, "w") as pred_writer,\
        tf.gfile.GFile(output_error_analysis_submit_file, "w") as sub_writer:
        sub_writer.write("index" + "\t" + "text_a" + "\t" + "text_b" + "\t" +
                         "prediction" + "\t" + "label" + "\n")
        num_written_lines = 0
        tf.logging.info("***** Error analysis results *****")
        for (i, (example, prediction)) in\
            enumerate(zip(eval_examples, result)):
            probabilities = prediction["probabilities"]
            if i >= num_actual_eval_examples:
                break
            output_line = "\t".join(
                str(class_probability)
                for class_probability in probabilities) + "\n"
            pred_writer.write(output_line)

            if task_name != "sts-b":
                actual_label = label_list[int(prediction["predictions"])]
            else:
                actual_label = str(prediction["predictions"])
            sub_writer.write(example.guid + "\t" + str(example.text_a) + "\t" +
                             str(example.text_b) + "\t" + str(actual_label) +
                             "\t" + str(example.label) + "\n")
            num_written_lines += 1
    assert num_written_lines == num_actual_eval_examples
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    label_list = ["Yes", "No"]
    if FLAGS.from_three_class_model:
        label_list.append("Neutral")

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    predict_examples = get_custom(FLAGS.predict_input_file)
    predict_file = os.path.join(FLAGS.init_checkpoint2,
                                "predict.tf_record.%s" % FLAGS.exp_name)
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            predict_file)

    tf.logging.info("***** Running prediction *****")
    tf.logging.info("Num examples = %d", len(predict_examples))

    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    all_model_ans = []

    for output_dir in [FLAGS.init_checkpoint1, FLAGS.init_checkpoint2]:
        all_answers = []
        tpu_cluster_resolver = None
        if FLAGS.use_tpu and FLAGS.tpu_name:
            tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
                FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

        is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
        run_config = contrib_tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=output_dir,
            save_checkpoints_steps=1000,
            tpu_config=contrib_tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host))

        model_fn = model_fn_builder(bert_config=bert_config,
                                    num_labels=len(label_list),
                                    init_checkpoint=FLAGS.init_checkpoint,
                                    learning_rate=None,
                                    num_train_steps=None,
                                    num_warmup_steps=None,
                                    use_tpu=FLAGS.use_tpu,
                                    use_one_hot_embeddings=FLAGS.use_tpu)

        # If TPU is not available, this will fall back to normal Estimator on CPU
        # or GPU.
        estimator = contrib_tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=32,
            eval_batch_size=8,
            predict_batch_size=FLAGS.predict_batch_size)

        for result in estimator.predict(predict_input_fn,
                                        yield_single_examples=True):

            probs_result = result["probabilities"]
            all_answers.append(bool(probs_result[0] >= probs_result[1]))

        all_model_ans.append(all_answers)

    first_ans, second_ans = all_model_ans[0], all_model_ans[1]
    assert len(first_ans) == len(second_ans)

    matching = 0
    counter = 0

    for a1, a2 in zip(first_ans, second_ans):
        if a1 == a2:
            matching += 1
        counter += 1

    tf.logging.info("Agreement = %.4f (%d / %d)", (float(matching) / counter),
                    matching, counter)
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True)
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)

        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #27
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    albert_config = modeling.AlbertConfig.from_json_file(
        FLAGS.albert_config_file)

    validate_flags_or_throw(albert_config)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case,
                                           spm_model_file=FLAGS.spm_model_file)

    # multiple gpus
    NUM_GPUS = FLAGS.num_gpu_cores if FLAGS.strategy_type == 'mirror' else 1
    using_customized_optimizer = None
    if NUM_GPUS > 1 and FLAGS.strategy_type == "mirror":
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            [str(i) for i in list(range(NUM_GPUS))])
        # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263
        strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus=NUM_GPUS,
            cross_device_ops=AllReduceCrossDeviceOps('nccl',
                                                     num_packs=NUM_GPUS),
        )
        using_customized_optimizer = True
        tf.logging.info('Use MirroredStrategy with %d devices.',
                        strategy.num_replicas_in_sync)
    else:
        strategy = tf.distribute.OneDeviceStrategy("GPU:0")
        using_customized_optimizer = False
        tf.logging.info('Single device mode.')

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    if FLAGS.do_train:
        iterations_per_loop = int(
            min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps))
    else:
        iterations_per_loop = FLAGS.iterations_per_loop
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        train_distribute=strategy,
        eval_distribute=strategy,  #get error during evaluation
    )

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    train_examples = squad_utils.read_squad_examples(
        input_file=FLAGS.train_file, is_training=True)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    if FLAGS.do_train:
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large shuffle
        # buffer in in the `input_fn`.
        rng = random.Random(12345)
        rng.shuffle(train_examples)

    model_fn = squad_utils.v2_model_fn_builder(
        albert_config=albert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        max_seq_length=FLAGS.max_seq_length,
        start_n_top=FLAGS.start_n_top,
        end_n_top=FLAGS.end_n_top,
        dropout_prob=FLAGS.dropout_prob,
        customized=using_customized_optimizer,
        optimizer=FLAGS.optimizer)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tf.logging.info("Use TPUEstimator")
        estimator = contrib_tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)
    else:
        tf.logging.info("Use normal Estimator")
        estimator = Estimator(
            model_fn=model_fn,
            params={},
            config=run_config,
        )

    if FLAGS.do_train:
        # We write to a temporary file to avoid storing very large constant tensors
        # in memory.

        if not tf.gfile.Exists(FLAGS.train_feature_file):
            train_writer = squad_utils.FeatureWriter(filename=os.path.join(
                FLAGS.train_feature_file),
                                                     is_training=True)
            squad_utils.convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=FLAGS.max_seq_length,
                doc_stride=FLAGS.doc_stride,
                max_query_length=FLAGS.max_query_length,
                is_training=True,
                output_fn=train_writer.process_feature,
                do_lower_case=FLAGS.do_lower_case)
            train_writer.close()

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num orig examples = %d", len(train_examples))
        # tf.logging.info("  Num split examples = %d", train_writer.num_features)
        tf.logging.info(
            f"  Batch size = {FLAGS.train_batch_size} * {NUM_GPUS}")
        tf.logging.info("  Num steps = %d", num_train_steps)
        del train_examples

        train_input_fn = squad_utils.input_fn_builder(
            input_file=FLAGS.train_feature_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.train_batch_size,
            is_v2=True)

        time_hist = TimeHistory()
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        total_time = sum(time_hist.times)

    if FLAGS.do_predict:
        with tf.gfile.Open(FLAGS.predict_file) as predict_file:
            prediction_json = json.load(predict_file)["data"]
        eval_examples = squad_utils.read_squad_examples(
            input_file=FLAGS.predict_file, is_training=False)

        if (tf.gfile.Exists(FLAGS.predict_feature_file)
                and tf.gfile.Exists(FLAGS.predict_feature_left_file)):
            tf.logging.info("Loading eval features from {}".format(
                FLAGS.predict_feature_left_file))
            with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin:
                eval_features = pickle.load(fin)
        else:
            eval_writer = squad_utils.FeatureWriter(
                filename=FLAGS.predict_feature_file, is_training=False)
            eval_features = []

            def append_feature(feature):
                eval_features.append(feature)
                eval_writer.process_feature(feature)

            squad_utils.convert_examples_to_features(
                examples=eval_examples,
                tokenizer=tokenizer,
                max_seq_length=FLAGS.max_seq_length,
                doc_stride=FLAGS.doc_stride,
                max_query_length=FLAGS.max_query_length,
                is_training=False,
                output_fn=append_feature,
                do_lower_case=FLAGS.do_lower_case)
            eval_writer.close()

            with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout:
                pickle.dump(eval_features, fout)

        tf.logging.info("***** Running predictions *****")
        tf.logging.info("  Num orig examples = %d", len(eval_examples))
        tf.logging.info("  Num split examples = %d", len(eval_features))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = squad_utils.input_fn_builder(
            input_file=FLAGS.predict_feature_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.predict_batch_size,
            is_v2=True)

        def get_result(checkpoint):
            """Evaluate the checkpoint on SQuAD v2.0."""
            # If running eval on the TPU, you will need to specify the number of
            # steps.
            reader = tf.train.NewCheckpointReader(checkpoint)
            global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
            all_results = []
            for result in estimator.predict(predict_input_fn,
                                            yield_single_examples=True,
                                            checkpoint_path=checkpoint):
                if len(all_results) % 1000 == 0:
                    tf.logging.info("Processing example: %d" %
                                    (len(all_results)))
                unique_id = int(result["unique_ids"])
                start_top_log_probs = ([
                    float(x) for x in result["start_top_log_probs"].flat
                ])
                start_top_index = [
                    int(x) for x in result["start_top_index"].flat
                ]
                end_top_log_probs = ([
                    float(x) for x in result["end_top_log_probs"].flat
                ])
                end_top_index = [int(x) for x in result["end_top_index"].flat]

                cls_logits = float(result["cls_logits"].flat[0])
                all_results.append(
                    squad_utils.RawResultV2(
                        unique_id=unique_id,
                        start_top_log_probs=start_top_log_probs,
                        start_top_index=start_top_index,
                        end_top_log_probs=end_top_log_probs,
                        end_top_index=end_top_index,
                        cls_logits=cls_logits))

            output_prediction_file = os.path.join(FLAGS.output_dir,
                                                  "predictions.json")
            output_nbest_file = os.path.join(FLAGS.output_dir,
                                             "nbest_predictions.json")
            output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                                     "null_odds.json")

            result_dict = {}
            cls_dict = {}
            squad_utils.accumulate_predictions_v2(
                result_dict, cls_dict, eval_examples, eval_features,
                all_results, FLAGS.n_best_size, FLAGS.max_answer_length,
                FLAGS.start_n_top, FLAGS.end_n_top)

            return squad_utils.evaluate_v2(
                result_dict, cls_dict, prediction_json, eval_examples,
                eval_features, all_results, FLAGS.n_best_size,
                FLAGS.max_answer_length, output_prediction_file,
                output_nbest_file, output_null_log_odds_file), int(global_step)

        def _find_valid_cands(curr_step):
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            candidates = []
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if idx != "best" and int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        key_name = "f1"
        writer = tf.gfile.GFile(output_eval_file, "w")

        avg_time_per_batch = np.mean(time_hist.times)
        writer.write("===== Hyperparameters =====\n")
        writer.write("Training batch size: {}\n".format(
            FLAGS.train_batch_size))
        writer.write("Max sequence length: {}\n".format(FLAGS.max_seq_length))
        writer.write("Learning rate: {}\n".format(FLAGS.learning_rate))
        writer.write("Num of GPU cores: {}\n".format(NUM_GPUS))
        if FLAGS.do_train:
            avg_time_per_batch = np.mean(time_hist.times)
            writer.write("Total time: {}\n".format(total_time))
            writer.write("Speed: {}\n".format(FLAGS.train_batch_size *
                                              NUM_GPUS / avg_time_per_batch))
        if num_train_steps and num_warmup_steps:
            writer.write("Training steps: {}\n".format(num_train_steps))
            writer.write("Warmup steps: {}\n".format(num_warmup_steps))

        if tf.gfile.Exists(checkpoint_path + ".index"):
            result = get_result(checkpoint_path)
            best_perf = result[0][key_name]
            global_step = result[1]
        else:
            global_step = -1
            best_perf = -1
            checkpoint_path = None
        while global_step < num_train_steps:
            steps_and_files = {}
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                    if cur_filename.split("-")[-1] == "best":
                        continue
                    gstep = int(cur_filename.split("-")[-1])
                    if gstep not in steps_and_files:
                        tf.logging.info(
                            "Add {} to eval list.".format(cur_filename))
                        steps_and_files[gstep] = cur_filename
            tf.logging.info("found {} files.".format(len(steps_and_files)))
            if not steps_and_files:
                tf.logging.info(
                    "found 0 file, global step: {}. Sleeping.".format(
                        global_step))
                time.sleep(1)
            else:
                for ele in sorted(steps_and_files.items()):
                    step, checkpoint_path = ele
                    if global_step >= step:
                        if len(_find_valid_cands(step)) > 1:
                            for ext in [
                                    "meta", "data-00000-of-00001", "index"
                            ]:
                                src_ckpt = checkpoint_path + ".{}".format(ext)
                                tf.logging.info("removing {}".format(src_ckpt))
                                tf.gfile.Remove(src_ckpt)
                        continue
                    result, global_step = get_result(checkpoint_path)
                    tf.logging.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        tf.logging.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    if result[key_name] > best_perf:
                        best_perf = result[key_name]
                        for ext in ["meta", "data-00000-of-00001", "index"]:
                            src_ckpt = checkpoint_path + ".{}".format(ext)
                            tgt_ckpt = checkpoint_path.rsplit(
                                "-", 1)[0] + "-best.{}".format(ext)
                            tf.logging.info("saving {} to {}".format(
                                src_ckpt, tgt_ckpt))
                            tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True)
                            writer.write("saved {} to {}\n".format(
                                src_ckpt, tgt_ckpt))
                    writer.write("best {} = {}\n".format(key_name, best_perf))
                    tf.logging.info("  best {} = {}\n".format(
                        key_name, best_perf))

                    if len(_find_valid_cands(global_step)) > 2:
                        for ext in ["meta", "data-00000-of-00001", "index"]:
                            src_ckpt = checkpoint_path + ".{}".format(ext)
                            tf.logging.info("removing {}".format(src_ckpt))
                            tf.gfile.Remove(src_ckpt)
                    writer.write("=" * 50 + "\n")

        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        result, global_step = get_result(checkpoint_path)
        tf.logging.info("***** Final Eval results *****")
        tf.logging.info(f"num_gpu_cores =  {NUM_GPUS}")
        writer.write("===== Evuations =====\n")
        for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
        writer.write("best perf happened at step: {}".format(global_step))
Example #28
0
def main(unused_argv):
    del unused_argv  # Unused

    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    params = {
        'input_perm': [0, 1, 2, 3],
        'output_perm': [0, 1, 2, 3],
    }

    batch_axis = 0
    if FLAGS.transpose_enabled:
        params['input_perm'] = [3, 0, 1, 2]
        params['output_perm'] = [1, 2, 3, 0]
        batch_axis = 3

    if FLAGS.eval_total_size > 0:
        eval_size = FLAGS.eval_total_size
    else:
        eval_size = _NUM_EVAL_IMAGES
    eval_steps = eval_size // FLAGS.eval_batch_size

    iterations = (eval_steps if FLAGS.mode == 'eval' else FLAGS.iterations)

    eval_batch_size = (None
                       if FLAGS.mode == 'train' else FLAGS.eval_batch_size)

    tpu_config = contrib_tpu.TPUConfig(iterations_per_loop=iterations,
                                       num_shards=FLAGS.num_shards)

    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        save_summary_steps=FLAGS.save_summary_steps,
        session_config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement),
        tpu_config=tpu_config)

    inception_classifier = contrib_tpu.TPUEstimator(
        model_fn=inception_model_fn,
        use_tpu=FLAGS.use_tpu,
        config=run_config,
        params=params,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=eval_batch_size,
        batch_axis=(batch_axis, 0))

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train = InputPipeline(is_training=True, data_dir=FLAGS.data_dir)
    imagenet_eval = InputPipeline(is_training=False, data_dir=FLAGS.data_dir)

    if FLAGS.moving_average:
        eval_hooks = [LoadEMAHook(FLAGS.model_dir)]
    else:
        eval_hooks = []

    if FLAGS.mode == 'eval':
        # Run evaluation when there is a new checkpoint
        for checkpoint in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # Includes compilation time
                eval_results = inception_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    hooks=eval_hooks,
                    checkpoint_path=checkpoint)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(checkpoint).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break
            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    checkpoint)

    elif FLAGS.mode == 'train_and_eval':
        for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
            tf.logging.info('Starting training cycle %d.' % cycle)
            inception_classifier.train(input_fn=imagenet_train.input_fn,
                                       steps=FLAGS.train_steps_per_eval)

            tf.logging.info('Starting evaluation cycle %d .' % cycle)
            eval_results = inception_classifier.evaluate(
                input_fn=imagenet_eval.input_fn,
                steps=eval_steps,
                hooks=eval_hooks)
            tf.logging.info('Evaluation results: %s' % eval_results)

    else:
        tf.logging.info('Starting training ...')
        inception_classifier.train(input_fn=imagenet_train.input_fn,
                                   steps=FLAGS.train_steps)

    if FLAGS.export_dir is not None:
        tf.logging.info('Starting to export model.')
        inception_classifier.export_saved_model(
            export_dir_base=FLAGS.export_dir,
            serving_input_receiver_fn=image_serving_input_fn)
Example #29
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train`, `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        num_train_steps = int(
            FLAGS.train_data_size / FLAGS.train_batch_size) * FLAGS.epochs
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        if not tf.gfile.Exists(FLAGS.train_file):
            tf.logging.info(
                "DANITER:File doesn't exist, creating tfrecord data")
            examples = model_builder.load_hellaswag(FLAGS.train_raw_data)
            tf.logging.info("DANITER:Read raw data as json")
            model_builder.file_based_convert_examples_for_bilinear(
                examples, 512, tokenizer, FLAGS.train_file, do_copa=True)
        train_input_fn = file_based_input_fn_builder(
            input_file=FLAGS.train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, steps=num_train_steps)

    if FLAGS.do_eval:
        # This tells the estimator to run through the entire set.
        if FLAGS.eval_data_size < 0:
            eval_steps = None
        else:
            eval_steps = int(FLAGS.eval_data_size / FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        if not tf.gfile.Exists(FLAGS.eval_file):
            examples = model_builder.load_hellaswag(FLAGS.eval_raw_data)
            model_builder.file_based_convert_examples_for_bilinear(
                examples, 512, tokenizer, FLAGS.eval_file, do_copa=True)
        eval_input_fn = file_based_input_fn_builder(
            input_file=FLAGS.eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        def _find_valid_cands(curr_step):
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            candidates = []
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if idx != "best" and int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        tf.logging.info("Evaling all models in output dir")
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        key_name = "eval_accuracy"
        tf.logging.info("Checkpoint path " + checkpoint_path)
        if tf.gfile.Exists(checkpoint_path + ".index"):
            tf.logging.info("Found a best model... not good")
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps,
                                        checkpoint_path=checkpoint_path)
            best_perf = result[key_name]
            global_step = result["global_step"]
        else:
            tf.logging.info("Setting global step to -1")
            global_step = -1
            best_perf = -1
            checkpoint_path = None
        tf.logging.info("Openning writer " + output_eval_file)
        writer = tf.gfile.GFile(output_eval_file, "w")

        steps_and_files = {}
        filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
        tf.logging.info("Models found " + "\n".join(filenames))
        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                if cur_filename.split("-")[-1] == "best":
                    continue
                gstep = int(cur_filename.split("-")[-1])
                if gstep not in steps_and_files:
                    tf.logging.info(
                        "Add {} to eval list.".format(cur_filename))
                    steps_and_files[gstep] = cur_filename
        tf.logging.info("found {} files.".format(len(steps_and_files)))
        # steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
        if not steps_and_files:
            tf.logging.info(
                "found 0 file, global step: {}. Sleeping.".format(global_step))
        else:
            for ele in sorted(steps_and_files.items()):
                step, checkpoint_path = ele
                if global_step >= step:
                    if len(_find_valid_cands(step)) > 1:
                        for ext in ["meta", "data-00000-of-00001", "index"]:
                            src_ckpt = checkpoint_path + ".{}".format(ext)
                            tf.logging.info("removing {}".format(src_ckpt))
                            # Why should we remove checkpoints?
                            # tf.gfile.Remove(src_ckpt)
                    tf.logging.info("Skipping candidate for some reason")
                    continue
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=checkpoint_path)
                global_step = result["global_step"]
                tf.logging.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write("best = {}\n".format(best_perf))

                if len(_find_valid_cands(global_step)) > 1:
                    for ext in ["meta", "data-00000-of-00001", "index"]:
                        src_ckpt = checkpoint_path + ".{}".format(ext)
                        tf.logging.info("removing {}".format(src_ckpt))
                        # tf.gfile.Remove(src_ckpt)
                writer.write("=" * 50 + "\n")
        writer.close()
Example #30
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    albert_config = modeling.AlbertConfig.from_json_file(
        FLAGS.albert_config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(albert_config=albert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu,
                                optimizer=FLAGS.optimizer,
                                poly_power=FLAGS.poly_power,
                                start_warmup_step=FLAGS.start_warmup_step)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True)
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        global_step = -1
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        writer = tf.gfile.GFile(output_eval_file, "w")
        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)
        best_perf = 0
        key_name = "masked_lm_accuracy"
        while global_step < FLAGS.num_train_steps:
            if estimator.latest_checkpoint() is None:
                tf.logging.info("No checkpoint found yet. Sleeping.")
                time.sleep(1)
            else:
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=FLAGS.max_eval_steps)
                global_step = result["global_step"]
                tf.logging.info("***** Eval results *****")
                checkpoint_path = estimator.latest_checkpoint()
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                    if result[key_name] > best_perf:
                        best_perf = result[key_name]
                        for ext in ["meta", "data-00000-of-00001", "index"]:
                            src_ckpt = checkpoint_path + ".{}".format(ext)
                            tgt_ckpt = checkpoint_path.rsplit(
                                "-", 1)[0] + "-best.{}".format(ext)
                            tf.logging.info("saving {} to {}".format(
                                src_ckpt, tgt_ckpt))
                            tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True)
                            writer.write("saved {} to {}\n".format(
                                src_ckpt, tgt_ckpt))