Python RunConfig Examples, tensorflow.estimator.RunConfig Python Examples

Example #1

0

Show file

File: extract_feature.py Project: Mywayking/albert_demo

    def get_estimator(self):
        """Create an Estimator whose predictions come from a serialized graph.

        The model function parses the GraphDef stored at ``self.graph_path``,
        imports it with the ``input_ids`` / ``input_mask`` / ``segment_ids``
        features mapped onto the ``<name>:0`` tensors, and exposes the
        ``final_encodes:0`` tensor under the ``'encodes'`` prediction key.

        Returns:
            An ``Estimator`` built with a session config derived from
            ``self.gpu_memory_fraction`` and params carrying
            ``self.batch_size``.
        """
        from tensorflow.estimator import Estimator
        from tensorflow.estimator import RunConfig
        from tensorflow.estimator import EstimatorSpec

        def model_fn(features, labels, mode, params):
            # Read the serialized graph, then parse it into a GraphDef.
            with tf.gfile.GFile(self.graph_path, 'rb') as graph_file:
                serialized = graph_file.read()
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(serialized)

            # Map each feature onto the tensor of the same name in the graph.
            feature_map = {}
            for name in ('input_ids', 'input_mask', 'segment_ids'):
                feature_map[name + ':0'] = features[name]

            imported = tf.import_graph_def(
                graph_def,
                input_map=feature_map,
                return_elements=['final_encodes:0'])

            predictions = {'encodes': imported[0]}
            return EstimatorSpec(mode=mode, predictions=predictions)

        session_config = tf.ConfigProto()
        gpu_options = session_config.gpu_options
        gpu_options.allow_growth = True
        gpu_options.per_process_gpu_memory_fraction = self.gpu_memory_fraction
        session_config.log_device_placement = False
        session_config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)

        run_config = RunConfig(session_config=session_config)
        estimator_params = {'batch_size': self.batch_size}
        return Estimator(model_fn=model_fn,
                         config=run_config,
                         params=estimator_params)

Example #2

0

Show file

File: extract_features_gp_nothread.py Project: JerryRoc/bert_utils_gp

    def __init__(self, args):
        """Set up the BERT config, estimator and tokenizer from ``args``.

        Args:
            args: Object exposing the attributes ``layer_indexes``,
                ``ckpt_name``, ``config_name``, ``vocab_file``,
                ``do_lower_case``, ``batch_size``, ``max_seq_len``,
                ``gpu_memory_fraction`` and ``xla``; each one is copied onto
                the instance under the same name.
        """
        from tensorflow.estimator import RunConfig, Estimator

        # Copy the settings off ``args`` in a fixed order.
        for option in ('layer_indexes', 'ckpt_name', 'config_name',
                       'vocab_file', 'do_lower_case', 'batch_size',
                       'max_seq_len', 'gpu_memory_fraction', 'xla'):
            setattr(self, option, getattr(args, option))

        # BERT configuration and the model function built from it.
        tf.logging.info("load bert config & construct ...")
        self.bert_config = modeling.BertConfig.from_json_file(self.config_name)
        model_fn = model_fn_builder(bert_config=self.bert_config,
                                    init_checkpoint=self.ckpt_name,
                                    layer_indexes=self.layer_indexes)

        # Session configuration and the estimator that uses it.
        tf.logging.info("load estimator ...")
        session_config = tf.ConfigProto()
        gpu_options = session_config.gpu_options
        gpu_options.allow_growth = True
        gpu_options.per_process_gpu_memory_fraction = self.gpu_memory_fraction
        session_config.log_device_placement = False
        if self.xla:
            # The JIT level is only raised when the xla option is truthy.
            session_config.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_1)

        run_config = RunConfig(session_config=session_config)
        self.estimator = Estimator(model_fn=model_fn,
                                   config=run_config,
                                   params={'batch_size': self.batch_size})

        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=self.vocab_file,
            do_lower_case=self.do_lower_case)
        tf.logging.info("initialization done.")

Example #3

0

Show file

File: a.py Project: ArtemSolomatin/severstal

def fit(pattern_path, model_dir):
    """Train and evaluate the U-Net segmentation estimator.

    Image paths are globbed from ``./images/*``. The last 100 globbed paths
    are used for evaluation and all preceding ones for training.

    Args:
        pattern_path: Sliceable sequence of watermark paths. The whole
            sequence is used for both training and evaluation.
        model_dir: Directory passed to the Estimator as ``model_dir``.
    """
    watermark_paths = pattern_path
    image_paths = glob('./images/*')
    number_for_train = -100
    train_paths_images = image_paths[:number_for_train]
    test_paths_images = image_paths[number_for_train:]

    # Training and evaluation share the same watermark paths (copied).
    train_paths_watermark = watermark_paths[:]
    test_paths_watermark = watermark_paths[:]

    model_params = {"num_blocks": 4,
                    "num_filters": 8,
                    "batch_normalization": True,
                    "training": True}

    params = {"model": make_unet,
              "model_params": model_params,
              "IOU_weight": 1,
              "learning_rate": 0.001,
              "lr_decay_steps": 1000,
              "lr_decay_rate": 0.96,
              "create_summary": True}

    strategy = tf.contrib.distribute.OneDeviceStrategy(device='/gpu:0')
    config = RunConfig(save_summary_steps=40,
                       train_distribute=strategy,
                       save_checkpoints_steps=200,
                       keep_checkpoint_max=60,
                       eval_distribute=strategy,
                       )
    tf.logging.set_verbosity(tf.logging.INFO)
    segmentation_model = tf.estimator.Estimator(
        model_fn=make_unet_estimator,
        model_dir=model_dir,
        params=params,
        config=config
    )

    train_params = {'image_paths': train_paths_images,
                    'watermark_paths': train_paths_watermark,
                    'batch_size': 10,
                    'num_epochs': 2
                    }
    val_params = {'image_paths': test_paths_images,
                  'watermark_paths': test_paths_watermark,
                  'batch_size': 3,
                  'num_epochs': 1
                  }

    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(**train_params), max_steps=75000)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(**val_params), throttle_secs=100, steps=200)
    tf.estimator.train_and_evaluate(segmentation_model, train_spec, eval_spec)

    # Final training pass on the training inputs. This call previously
    # expanded ``params`` (the model hyper-parameters) into input_fn; every
    # other input_fn call here receives the image/watermark/batch/epoch
    # dictionary, so ``train_params`` is used instead. The stray trailing
    # comma that wrapped the result in a discarded tuple is removed as well.
    segmentation_model.train(input_fn=lambda: input_fn(**train_params))

Example #4

0

Show file

File: BertModel.py Project: kpi6research/Bert-as-a-Library

    def get_config(self, ckpt_output_dir='./output', save_check_steps=1000):
        """Build the RunConfig used by the estimator.

        When ``self.config`` is falsy, a session config limited to one GPU
        (memory growth enabled, memory fraction 0.5) is created and stored on
        the instance before the RunConfig is built.

        Args:
            ckpt_output_dir: Value passed to RunConfig as ``model_dir``.
            save_check_steps: Value passed as ``save_checkpoints_steps``.

        Returns:
            The constructed ``RunConfig``.
        """
        if not self.config:
            self.config = tf.ConfigProto(device_count={'GPU': 1})
            gpu_options = self.config.gpu_options
            gpu_options.allow_growth = True
            gpu_options.per_process_gpu_memory_fraction = 0.5

        return RunConfig(
            model_dir=ckpt_output_dir,
            session_config=self.config,
            keep_checkpoint_max=self.keep_checkpoint_max,
            save_checkpoints_steps=save_check_steps)

Example #5

0

Show file

File: estimator.py Project: meelement/tacotron-2

def main(args):
    """Train, predict with, or export the estimator according to ``args.mode``.

    Args:
        args: Object exposing ``model_dir``, ``summary_interval``,
            ``checkpoint_interval`` and ``mode``, plus ``data_dir`` (train),
            ``texts`` (predict) or ``export_dir`` (export).

    Raises:
        KeyError: If ``args.mode`` is not 'train', 'predict' or 'export'.
        AssertionError: In predict mode when ``args.texts`` is empty.
    """
    os.makedirs(args.model_dir, exist_ok=True)

    run_config = RunConfig(
        save_summary_steps=args.summary_interval,
        save_checkpoints_steps=args.checkpoint_interval,
        session_config=SESS_CFG,
        keep_checkpoint_max=2)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=args.model_dir,
        params=hparams,
        config=run_config)

    mode = args.mode
    if mode == 'train':
        os.makedirs(args.data_dir, exist_ok=True)
        estimator.train(input_fn=lambda: train_input_fn(args.data_dir))
        return

    if mode == 'predict':
        assert len(args.texts), "No text to predict"
        predictions = estimator.predict(
            input_fn=lambda: predict_input_fn(args.texts))
        # Each prediction is post-processed and written to its own file.
        for index, waveform in enumerate(predictions):
            waveform = inv_preemphasis(waveform)
            save_wav(waveform, 'output_{}.wav'.format(index))
        return

    if mode == 'export':
        os.makedirs(args.export_dir, exist_ok=True)
        serving_features = {
            'inputs': tf.placeholder(
                dtype=tf.int32, shape=(None, None), name='inputs'),
            'lengths': tf.placeholder(
                dtype=tf.int32, shape=(None, ), name='lengths'),
        }
        receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
            serving_features, default_batch_size=None)
        estimator.export_saved_model(args.export_dir, receiver_fn)
        return

    raise KeyError('Unknown Mode <{}>'.format(args.mode))

Example #6

0

Show file

File: tsa_model.py Project: SijanC147/Msc

 def __init__(self, params=None, aux_config=None, run_config=None):
     """Initialise hooks, run configuration and parameters.

     Args:
         params: Optional mapping merged into the result of
             ``self.set_params()``.
         aux_config: Optional mapping; its ``"debug"`` entry selects the
             debug hook (falsy: none, ``"cli"``: local CLI hook, anything
             else: TensorBoard hook at ``localhost:<value>``).
         run_config: Optional mapping of keyword arguments for RunConfig.
     """
     self._comet_experiment = None
     self._estimator = None
     self.aux_config = aux_config or {}

     debug_setting = self.aux_config.get("debug")
     if not debug_setting:
         hooks = []
     elif debug_setting == "cli":
         hooks = [tf_debug.LocalCLIDebugHook()]
     else:
         address = "localhost:{}".format(debug_setting)
         hooks = [tf_debug.TensorBoardDebugHook(address)]
     self._hooks = hooks

     run_config_kwargs = run_config or {}
     self.run_config = RunConfig(**run_config_kwargs)

     self.params = self.set_params()
     if params:
         self.params.update(params)

Example #7

0

Show file

def train_and_evaluate(board_size, options):
    """Build the input functions, specs and estimator, then train and evaluate.

    Args:
        board_size: Passed through to the input-fn, model-fn and serving
            input-fn factories.
        options: Mapping read for the keys ``train_data_pattern``,
            ``train_batch_size``, ``eval_data_pattern``, ``eval_batch_size``,
            ``max_train_steps``, ``eval_steps``, ``throttle_secs``,
            ``distribute``, ``model_dir``, ``save_summary_steps``,
            ``save_checkpoints_steps`` and ``log_step_count_steps``.

    Returns:
        Whatever ``tf.estimator.train_and_evaluate`` returns for this run.
        The result was previously computed and then discarded.
    """
    train_input_fn = make_tfr_input_fn(options['train_data_pattern'],
                                       options['train_batch_size'], board_size,
                                       options)

    eval_input_fn = make_tfr_input_fn(options['eval_data_pattern'],
                                      options['eval_batch_size'], board_size,
                                      options)

    model_fn = make_model_fn(board_size, options)

    serving_input_fn = make_serving_input_fn(board_size)

    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=options['max_train_steps'])

    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      exporters=exporter,
                                      steps=options['eval_steps'],
                                      throttle_secs=options['throttle_secs'],
                                      start_delay_secs=0)

    # A distribution strategy is only used when the caller asks for it.
    strategy = MirroredStrategy() if options['distribute'] else None
    config = RunConfig(
        model_dir=options['model_dir'],
        save_summary_steps=options['save_summary_steps'],
        train_distribute=strategy,
        save_checkpoints_steps=options['save_checkpoints_steps'],
        log_step_count_steps=options['log_step_count_steps'])

    estimator = tf.estimator.Estimator(config=config, model_fn=model_fn)

    ##################################################################
    #   Finally, train and evaluate the model
    ##################################################################
    final_eval = tf.estimator.train_and_evaluate(estimator,
                                                 train_spec=train_spec,
                                                 eval_spec=eval_spec)

    # Hand the result to the caller instead of dropping it.
    return final_eval

Example #8

0

Show file

def get_run_config(strategy):
    """Build the Estimator RunConfig for a distribution strategy.

    Args:
        strategy: Value resolved through ``get_strategy``; the resolved
            strategy is used for both training and evaluation.

    Returns:
        RunConfig: The run configuration.
    """
    distribution = get_strategy(strategy)

    settings = dict(
        model_dir=None,
        tf_random_seed=None,
        save_summary_steps=10,
        save_checkpoints_steps=20,
        session_config=SESS_CONFIG,
        keep_checkpoint_max=5,
        log_step_count_steps=100,
        train_distribute=distribution,
        device_fn=None,
        protocol=None,
        eval_distribute=distribution,
        experimental_distribute=None,
        experimental_max_worker_delay_secs=None,
    )
    return RunConfig(**settings)

Example #9

0

Show file

def main(_):
    """Fine-tune, evaluate and/or run prediction for a BERT classifier.

    Driven entirely by ``FLAGS``: depending on ``do_train``, ``do_eval`` and
    ``do_predict`` it converts the task's examples to TFRecord files under
    ``FLAGS.output_dir``, trains the estimator, writes ``eval_results.txt``
    and writes ``test_results.tsv``.

    Raises:
        ValueError: If none of the three modes is enabled, if
            ``FLAGS.max_seq_length`` exceeds the model's
            ``max_position_embeddings``, or if the task name is unknown.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Maps the lower-cased task name to its data processor class.
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # NOTE(review): within this function tpu_cluster_resolver and is_per_host
    # are only referenced by the commented-out tf.contrib.tpu.RunConfig below.
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # Training is distributed with a MirroredStrategy over FLAGS.n_gpus GPUs
    # using NCCL all-reduce; evaluation is not distributed (see the
    # commented-out eval_distribute below).
    dist_strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=FLAGS.n_gpus,
        cross_device_ops=AllReduceCrossDeviceOps('nccl',
                                                 num_packs=FLAGS.n_gpus),
        # cross_device_ops=AllReduceCrossDeviceOps('hierarchical_copy'),
    )
    log_every_n_steps = 8
    run_config = RunConfig(
        train_distribute=dist_strategy,
        # eval_distribute=dist_strategy,
        log_step_count_steps=log_every_n_steps,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    # run_config = tf.contrib.tpu.RunConfig(
    #     cluster=tpu_cluster_resolver,
    #     master=FLAGS.master,
    #     model_dir=FLAGS.output_dir,
    #     save_checkpoints_steps=FLAGS.save_checkpoints_steps,
    #     tpu_config=tf.contrib.tpu.TPUConfig(
    #         iterations_per_loop=FLAGS.iterations_per_loop,
    #         num_shards=FLAGS.num_tpu_cores,
    #         per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        # Total steps = examples / batch size * epochs, truncated to an int;
        # the warmup step count is a proportion of that total.
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = Estimator(model_fn=model_fn, params={}, config=run_config)
    # estimator = tf.contrib.tpu.TPUEstimator(
    #     use_tpu=FLAGS.use_tpu,
    #     model_fn=model_fn,
    #     config=run_config,
    #     train_batch_size=FLAGS.train_batch_size,
    #     eval_batch_size=FLAGS.eval_batch_size,
    #     predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # Training examples are serialized to a TFRecord file first and then
        # read back through a file-based input function.
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        # Each metric is both logged and written as "key = value", sorted by key.
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        # One tab-separated line of class probabilities per real example;
        # iteration stops once the padding examples are reached.
        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples

Example #10

0

Show file

File: a.py Project: ArtemSolomatin/severstal

                "num_filters": 8,
                "batch_normalization": True,
                "training": True}
# Parameters handed to the Estimator; ``model_params`` is defined above this
# excerpt.
params = {"model": make_unet,
          "model_params": model_params,
          "IOU_weight": 1,
          "learning_rate": 0.001,
          "lr_decay_steps": 1000,
          "lr_decay_rate": 0.96,
          "create_summary": True}


# Training and evaluation both use a single-device strategy pinned to /gpu:0.
strategy = tf.contrib.distribute.OneDeviceStrategy(device='/gpu:0')
config = RunConfig(save_summary_steps=40,
                   train_distribute=strategy,
                   save_checkpoints_steps = 200,
                   keep_checkpoint_max = 60,
                   eval_distribute=strategy,
                   )
tf.logging.set_verbosity(tf.logging.INFO)
# NOTE(review): model_dir is a hard-coded absolute path on one machine.
segmentation_model = tf.estimator.Estimator(
    model_fn=make_unet_estimator,
    model_dir="/home/dokholyan/Projects/new_experiments/milk_blocks_4_fea_8_IOU_01_norm_Tr/",
    params=params,
    config=config
)

# Input settings for training; the two path lists are defined outside this
# excerpt.
train_params = {'image_paths': train_paths_images,
          'watermark_paths': train_paths_watermark,
          'batch_size': 10,
          'num_epochs': 500
          }

Example #11

0

Show file

File: run_pretraining_multi_gpu.py Project: hzrpku/bert-tf-multi-gpu

def main(_):
    """Run BERT pre-training and/or evaluation as selected by ``FLAGS``.

    Input files are collected by globbing every comma-separated pattern in
    ``FLAGS.input_file``; the same files feed both training and evaluation.
    Evaluation metrics are logged and written to ``eval_results.txt`` under
    ``FLAGS.output_dir``.

    Raises:
        ValueError: If neither ``do_train`` nor ``do_eval`` is set.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if not (FLAGS.do_train or FLAGS.do_eval):
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    # Expand every comma-separated glob pattern, keeping pattern order.
    input_files = [
        matched_path
        for pattern in FLAGS.input_file.split(",")
        for matched_path in tf.gfile.Glob(pattern)
    ]

    tf.logging.info("*** Input Files ***")
    for matched_path in input_files:
        tf.logging.info("  %s" % matched_path)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # Training is mirrored across FLAGS.n_gpus GPUs with NCCL all-reduce.
    gpu_count = FLAGS.n_gpus
    all_reduce = AllReduceCrossDeviceOps('nccl', num_packs=FLAGS.n_gpus)
    dist_strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=gpu_count,
        cross_device_ops=all_reduce,
    )
    run_config = RunConfig(
        train_distribute=dist_strategy,
        log_step_count_steps=8,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # A plain Estimator is used here, so this runs on CPU or GPU.
    estimator = Estimator(model_fn=model_fn, params={}, config=run_config)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        training_input = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True)
        estimator.train(input_fn=training_input,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        evaluation_input = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)

        metrics = estimator.evaluate(input_fn=evaluation_input,
                                     steps=FLAGS.max_eval_steps)

        # Log and persist each metric as "key = value", sorted by key.
        results_path = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(results_path, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for metric_name in sorted(metrics.keys()):
                metric_text = str(metrics[metric_name])
                tf.logging.info("  %s = %s", metric_name, metric_text)
                writer.write("%s = %s\n" % (metric_name, metric_text))

Example #12

0

Show file

    Steps to run the training job for. If --num-epochs is not specified,
    this must be. Otherwise the training job will run indefinitely.\
    """,
                        type=int,
                        required=True)
    parser.add_argument(
        '--eval-steps',
        help='Number of steps to run evalution for at each checkpoint',
        default=100,
        type=int)
    parser.add_argument('--trainer-type',
                        help='Which trainer to use (spam or component)',
                        choices=['spam', 'component'],
                        required=True)

    args = parser.parse_args()

    logger = logging.getLogger()
    logger.setLevel(getattr(logging, args.verbosity))

    if not args.num_epochs:
        args.num_epochs = args.train_steps

    # Set C++ Graph Execution level verbosity.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(
        getattr(logging, args.verbosity) / 10)

    # Run the training job.
    train_and_evaluate_model(config=RunConfig(model_dir=args.job_dir),
                             hparams=vars(args))

Example #13

0

Show file

def train_and_evaluate(options):
    """Train and evaluate the model inside an MLflow run.

    Selected options are logged as MLflow parameters, the input functions,
    model function, specs and estimator are built from ``options``, and
    ``tf.estimator.train_and_evaluate`` is run. When the first element of its
    result is not None, its ``loss`` and ``mean_error`` entries are logged as
    MLflow metrics.

    Args:
        options: Mapping of run settings, indexed by string keys.

    Returns:
        The value returned by ``tf.estimator.train_and_evaluate``.
    """
    import tensorflow as tf
    from tensorflow.estimator import RunConfig
    from tensorflow.contrib.distribute import MirroredStrategy
    import mlflow

    from train.make_model_fn import make_model_fn
    from train.make_tft_serving_input_fn import make_tft_serving_input_fn
    from train.create_feature_columns import create_feature_columns
    from train.make_tfr_input_fn import make_tfr_input_fn
    from train.make_hypotheses import make_hypotheses
    from train.make_input_fns import make_input_fns

    with mlflow.start_run():

        # Record the run settings of interest as MLflow parameters.
        for param_name in ('base_dir', 'file_format', 'train_batch_size',
                           'max_train_steps', 'reader_num_threads',
                           'parser_num_threads', 'prefetch_buffer_size'):
            mlflow.log_param(param_name, options[param_name])

        # Input functions, chosen by file format.
        build_input_fn = make_input_fns()[options['file_format']]
        train_input_fn = build_input_fn(options['train_data_pattern'],
                                        options['train_batch_size'],
                                        options)
        eval_input_fn = build_input_fn(options['eval_data_pattern'],
                                       options['eval_batch_size'],
                                       options)

        # Hypothesis and model function.
        hypothesis = make_hypotheses()[options['hypothesis']]
        feature_columns = create_feature_columns()
        model_fn = make_model_fn(feature_columns, options, hypothesis)

        # Train and eval specs, with an exporter for the latest model.
        serving_input_fn = make_tft_serving_input_fn(options['metadata_dir'])
        exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
        train_spec = tf.estimator.TrainSpec(
            input_fn=train_input_fn,
            max_steps=options['max_train_steps'])
        eval_spec = tf.estimator.EvalSpec(
            input_fn=eval_input_fn,
            exporters=exporter,
            steps=options['eval_steps'],
            throttle_secs=options['throttle_secs'],
            start_delay_secs=0)

        # Estimator configuration; mirrored training only on request.
        if options['distribute']:
            strategy = MirroredStrategy()
        else:
            strategy = None
        run_config = RunConfig(
            model_dir=options['model_dir'],
            save_summary_steps=options['save_summary_steps'],
            train_distribute=strategy,
            save_checkpoints_steps=options['save_checkpoints_steps'],
            log_step_count_steps=options['log_step_count_steps'])
        estimator = tf.estimator.Estimator(config=run_config,
                                           model_fn=model_fn)

        # Train and evaluate, then report the final metrics.
        final_eval = tf.estimator.train_and_evaluate(estimator,
                                                     train_spec=train_spec,
                                                     eval_spec=eval_spec)

        if final_eval[0] is not None:
            for metric_name in ('loss', 'mean_error'):
                mlflow.log_metric(metric_name, final_eval[0][metric_name])

        return final_eval