Example #1
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model under Horovod.

    Exactly one of `config.do_train` / `config.do_eval` must be set. The
    configured batch sizes are treated as global and split evenly across
    Horovod workers; non-zero ranks write checkpoints into per-rank
    subdirectories so only rank 0 publishes to the primary model dir.

    Args:
        config: the pre-training configuration.

    Returns:
        The evaluation metrics dict when `do_eval` is True, otherwise None.

    Raises:
        ValueError: if `do_train` and `do_eval` are both set or both unset.
    """
    # Initialize Horovod before reading any rank/size-dependent settings.
    hvd.init()
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    # Rank 0 keeps the canonical model dir; every other rank gets its own
    # subdirectory to avoid checkpoint collisions between workers.
    config.model_dir = config.model_dir if hvd.rank() == 0 else \
        os.path.join(config.model_dir, str(hvd.rank()))
    config.train_batch_size = config.train_batch_size // hvd.size()
    config.eval_batch_size = config.eval_batch_size // hvd.size()

    is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_cluster_resolver = None
    if config.use_tpu and config.tpu_name:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
    tpu_config = tf.estimator.tpu.TPUConfig(
        iterations_per_loop=config.iterations_per_loop,
        # FIX: the original ternary (`... if config.do_train else ...`)
        # selected config.num_tpu_cores in both branches; use it directly.
        num_shards=config.num_tpu_cores,
        tpu_job_name=config.tpu_job_name,
        per_host_input_for_training=is_per_host)

    # Pin each Horovod process to exactly one visible GPU.
    session_config = tf.ConfigProto()
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        session_config=session_config,
        tpu_config=tpu_config)
    model_fn = model_fn_builder(config=config)
    estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=config.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size)

    if config.do_train:
        utils.heading("Running training")
        # Broadcast rank-0 variables so all workers start from one state.
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True, hvd),
                        max_steps=config.num_train_steps,
                        hooks=hooks)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(input_fn=pretrain_data.get_input_fn(
            config, False, hvd),
                                    steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
Example #2
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model."""
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    model_fn = model_fn_builder(config=config)

    if config.use_tpu:
        # Resolve the TPU cluster when a named TPU was provided.
        resolver = None
        if config.use_tpu and config.tpu_name:
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                config.tpu_name,
                zone=config.tpu_zone,
                project=config.gcp_project)
        estimator = tf.estimator.tpu.TPUEstimator(
            use_tpu=config.use_tpu,
            model_fn=model_fn,
            config=tf.estimator.tpu.RunConfig(
                cluster=resolver,
                model_dir=config.model_dir,
                save_checkpoints_steps=config.save_checkpoints_steps,
                keep_checkpoint_max=config.keep_checkpoint_max,
                tpu_config=tf.estimator.tpu.TPUConfig(
                    iterations_per_loop=config.iterations_per_loop,
                    num_shards=config.num_tpu_cores,
                    tpu_job_name=config.tpu_job_name,
                    per_host_input_for_training=(
                        tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2))),
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size)
    else:
        # CPU/GPU path: replicate the model function across local devices.
        gpu_run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            keep_checkpoint_max=config.keep_checkpoint_max)
        estimator = tf.estimator.Estimator(
            model_fn=tensorflow.contrib.estimator.replicate_model_fn(model_fn),
            config=gpu_run_config,
            params={"batch_size": config.train_batch_size})

    if config.do_train:
        utils.heading("Running training")
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(input_fn=pretrain_data.get_input_fn(
            config, False),
                                    steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
Example #3
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model."""
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    # Only resolve a TPU cluster when one is actually configured.
    resolver = None
    if config.use_tpu and config.tpu_name:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)

    estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=config.use_tpu,
        model_fn=model_fn_builder(config=config),
        config=tf.estimator.tpu.RunConfig(
            cluster=resolver,
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            tpu_config=tf.estimator.tpu.TPUConfig(
                iterations_per_loop=config.iterations_per_loop,
                num_shards=config.num_tpu_cores,
                per_host_input_for_training=(
                    tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2))),
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size)

    if config.do_train:
        utils.heading("Running training")
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(input_fn=pretrain_data.get_input_fn(
            config, False),
                                    steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
Example #4
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
  """Run pre-training or evaluate the pre-trained model.

  Exactly one of `config.do_train` / `config.do_eval` must be set. When
  `config.use_tpu` is set, the TPU system is resolved and initialized
  before the estimator is built.

  Args:
    config: the pre-training configuration.

  Returns:
    The evaluation metrics dict when `do_eval` is True, otherwise None.

  Raises:
    ValueError: if `do_train` and `do_eval` are both set or both unset.
  """
  if config.do_train == config.do_eval:
    raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
  if config.debug and config.do_train:
    utils.rmkdir(config.model_dir)
  utils.heading("Config:")
  utils.log_config(config)

  is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = None
  if config.use_tpu:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        zone=config.tpu_zone, project=config.gcp_project)
    print('Running on TPU ', tpu_cluster_resolver.cluster_spec().as_dict()['worker'])
    # FIX: the resolver was unconditionally assigned just above, so the
    # original `if tpu_cluster_resolver:` guard was redundant (and
    # inconsistently indented); connect and initialize directly.
    tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)

  tpu_config = tf.estimator.tpu.TPUConfig(
      iterations_per_loop=config.iterations_per_loop,
      num_shards=config.num_tpu_cores,
      tpu_job_name=config.tpu_job_name,
      per_host_input_for_training=is_per_host)
  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=config.model_dir,
      save_checkpoints_steps=config.save_checkpoints_steps,
      keep_checkpoint_max=config.keep_checkpoint_max,
      tpu_config=tpu_config)
  model_fn = model_fn_builder(config=config)
  estimator = tf.estimator.tpu.TPUEstimator(
      use_tpu=config.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=config.train_batch_size,
      eval_batch_size=config.eval_batch_size)

  if config.do_train:
    utils.heading("Running training")
    estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                    max_steps=config.num_train_steps)
  if config.do_eval:
    utils.heading("Running evaluation")
    result = estimator.evaluate(
        input_fn=pretrain_data.get_input_fn(config, False),
        steps=config.num_eval_steps)
    for key in sorted(result.keys()):
      utils.log("  {:} = {:}".format(key, str(result[key])))
    return result
Example #5
0
def train_one_step(config: configure_pretraining.PretrainingConfig):
    """Builds an ELECTRA model and trains it for one step; useful for debugging."""
    # Build the training input pipeline and pull a single batch of features.
    train_input_fn = pretrain_data.get_input_fn(config, True)
    features = tf.data.make_one_shot_iterator(
        train_input_fn(dict(batch_size=config.train_batch_size))).get_next()
    model = PretrainingModel(config, features, True)
    # Run one forward pass and log the loss; no optimizer step is applied,
    # so variables keep their freshly initialized values.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        utils.log(sess.run(model.total_loss))
Example #6
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model on local GPUs.

    A single available GPU gets a plain `RunConfig`; multiple GPUs use a
    `MirroredStrategy` with NCCL all-reduce for both training and eval.

    Args:
        config: the pre-training configuration.

    Returns:
        The evaluation metrics dict when `do_eval` is True, otherwise None.

    Raises:
        ValueError: if `do_train` and `do_eval` are both set or both unset.
    """
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    # get_available_gpus() returns a list of device names (len() is used
    # below), so all comparisons must be against its length.
    num_gpus = utils.get_available_gpus()
    utils.log("Found {} gpus".format(len(num_gpus)))

    # BUG FIX: the original compared the list itself to 1
    # (`num_gpus == 1`), which is always False, making the single-GPU
    # branch unreachable; compare the count instead.
    if len(num_gpus) == 1:
        session_config = tf.ConfigProto(
            log_device_placement=True,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True))

        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            session_config=session_config,
            log_step_count_steps=100)
    else:
        # Mirror variables across GPUs; NCCL handles the all-reduce.
        train_distribution_strategy = tf.distribute.MirroredStrategy(
            devices=None,
            cross_device_ops=tensorflow.contrib.distribute.
            AllReduceCrossDeviceOps('nccl', num_packs=len(num_gpus)))
        eval_distribution_strategy = tf.distribute.MirroredStrategy(
            devices=None)

        session_config = tf.ConfigProto(
            inter_op_parallelism_threads=0,
            intra_op_parallelism_threads=0,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True))

        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            train_distribute=train_distribution_strategy,
            eval_distribute=eval_distribution_strategy,
            session_config=session_config,
            log_step_count_steps=100)

    model_fn = model_fn_builder(config=config)
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params={
                                           'train_batch_size':
                                           config.train_batch_size,
                                           'eval_batch_size':
                                           config.eval_batch_size
                                       })

    if config.do_train:
        utils.heading("Running training")
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(input_fn=pretrain_data.get_input_fn(
            config, False),
                                    steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
Example #7
0
def train_or_eval(config: PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model under Horovod."""
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    # One GPU per Horovod process, with on-demand memory growth.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(
        hvd.local_rank())  # one gpu per process

    # Following https://gist.github.com/alsrgv/34a32f30292f4e2c1fa29ec0d65dea26
    # one could pass model_dir=None on non-zero ranks, but a None model_dir
    # makes the Estimator checkpoint into /tmp, which fills the system disk —
    # so every rank keeps the real model_dir and only the chief rank
    # checkpoints/summarizes.
    is_chief = hvd.rank() == 0
    run_config = tf.estimator.RunConfig(
        model_dir=config.model_dir,
        session_config=session_config,
        save_checkpoints_steps=(config.save_checkpoints_steps
                                if is_chief else None),
        save_summary_steps=100 if is_chief else 0,
        keep_checkpoint_max=config.keep_checkpoint_max,
        log_step_count_steps=10000)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn_builder(config=config), config=run_config)

    # Sync initial variables from rank 0 when running on multiple workers.
    training_hooks = []
    if hvd.size() > 1:
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if config.do_train:
        utils.heading("Running training")
        estimator.train(
            input_fn=pretrain_data.get_input_fn(
                pretrain_tfrecords=config.pretrain_tfrecords,
                max_seq_length=config.max_seq_length,
                batch_size=config.train_batch_size,
                is_training=True,
                hvd=hvd,
                num_cpu_threads=8),
            hooks=training_hooks,
            max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(
            input_fn=pretrain_data.get_input_fn(
                pretrain_tfrecords=config.pretrain_tfrecords,
                max_seq_length=config.max_seq_length,
                batch_size=config.eval_batch_size,
                is_training=False,
                hvd=hvd,
                num_cpu_threads=8),
            steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
Example #8
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model."""
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    model_fn = model_fn_builder(config=config)

    if config.use_tpu:
        # Resolve the TPU cluster when a named TPU was configured.
        resolver = None
        if config.use_tpu and config.tpu_name:
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                config.tpu_name,
                zone=config.tpu_zone,
                project=config.gcp_project)

        estimator = tf.estimator.tpu.TPUEstimator(
            use_tpu=config.use_tpu,
            model_fn=model_fn,
            config=tf.estimator.tpu.RunConfig(
                cluster=resolver,
                model_dir=config.model_dir,
                save_checkpoints_steps=config.save_checkpoints_steps,
                tpu_config=tf.estimator.tpu.TPUConfig(
                    iterations_per_loop=config.iterations_per_loop,
                    num_shards=config.num_tpu_cores,
                    tpu_job_name=config.tpu_job_name,
                    per_host_input_for_training=(
                        tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2),
                ),
            ),
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
        )
    else:
        config_proto = tf.ConfigProto()
        config_proto.gpu_options.allow_growth = True

        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            session_config=config_proto,
        )

        # Warm-start from a saved model when one is provided; otherwise
        # build a plain estimator.
        if config.saved_model:
            estimator = tf.estimator.Estimator(
                model_fn=model_fn,
                config=run_config,
                warm_start_from=config.saved_model)
        else:
            estimator = tf.estimator.Estimator(model_fn=model_fn,
                                               config=run_config)

    if config.do_train:
        utils.heading("Running training")
        estimator.train(
            input_fn=pretrain_data.get_input_fn(config, True),
            max_steps=config.num_train_steps,
        )
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(
            input_fn=pretrain_data.get_input_fn(config, False),
            steps=config.num_eval_steps,
        )
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result