def get_estimator(self):
    """Build an Estimator that predicts from the frozen graph on disk.

    The model function reads the serialized graph at ``self.graph_path``,
    wires the ``input_ids``, ``input_mask`` and ``segment_ids`` features into
    it, and exposes the ``final_encodes:0`` tensor under the ``'encodes'``
    prediction key.

    Returns:
        An ``Estimator`` whose session config uses
        ``self.gpu_memory_fraction`` and whose params carry
        ``self.batch_size``.
    """
    from tensorflow.estimator import Estimator, EstimatorSpec, RunConfig

    def model_fn(features, labels, mode, params):
        # Read the serialized GraphDef and parse it.
        with tf.gfile.GFile(self.graph_path, 'rb') as graph_file:
            serialized_graph = graph_file.read()
        frozen_graph = tf.GraphDef()
        frozen_graph.ParseFromString(serialized_graph)

        # Map each feature onto the graph tensor of the same name.
        feature_map = {}
        for name in ('input_ids', 'input_mask', 'segment_ids'):
            feature_map[name + ':0'] = features[name]

        imported = tf.import_graph_def(
            frozen_graph,
            input_map=feature_map,
            return_elements=['final_encodes:0'],
        )
        return EstimatorSpec(mode=mode, predictions={'encodes': imported[0]})

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.per_process_gpu_memory_fraction = (
        self.gpu_memory_fraction
    )
    session_config.log_device_placement = False
    # The global JIT level is always switched to ON_1 in this method.
    session_config.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1
    )

    return Estimator(
        model_fn=model_fn,
        config=RunConfig(session_config=session_config),
        params={'batch_size': self.batch_size},
    )
def __init__(self, args):
    """Create the BERT estimator and tokenizer from parsed arguments.

    Args:
        args: Object exposing ``layer_indexes``, ``ckpt_name``,
            ``config_name``, ``vocab_file``, ``do_lower_case``,
            ``batch_size``, ``max_seq_len``, ``gpu_memory_fraction`` and
            ``xla`` attributes; each is copied onto the instance.
    """
    from tensorflow.estimator import Estimator, RunConfig

    # Copy the run parameters onto the instance under the same names.
    for option in (
        'layer_indexes',
        'ckpt_name',
        'config_name',
        'vocab_file',
        'do_lower_case',
        'batch_size',
        'max_seq_len',
        'gpu_memory_fraction',
        'xla',
    ):
        setattr(self, option, getattr(args, option))

    # Load the BERT config and build the model function from it.
    tf.logging.info("load bert config & construct ...")
    self.bert_config = modeling.BertConfig.from_json_file(self.config_name)
    bert_model_fn = model_fn_builder(
        bert_config=self.bert_config,
        init_checkpoint=self.ckpt_name,
        layer_indexes=self.layer_indexes,
    )

    # Build the estimator with the requested GPU/session settings.
    tf.logging.info("load estimator ...")
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.per_process_gpu_memory_fraction = (
        self.gpu_memory_fraction
    )
    session_config.log_device_placement = False
    if self.xla:
        # JIT compilation is only turned on when explicitly requested.
        session_config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1
        )

    self.estimator = Estimator(
        model_fn=bert_model_fn,
        config=RunConfig(session_config=session_config),
        params={'batch_size': self.batch_size},
    )
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=self.vocab_file,
        do_lower_case=self.do_lower_case,
    )
    tf.logging.info("initialization done.")
def fit(pattern_path, model_dir):
    """Train and evaluate the U-Net segmentation estimator.

    Images are globbed from ``./images/*``; all but the last 100 are used for
    training and the last 100 for evaluation. The estimator first runs
    ``tf.estimator.train_and_evaluate`` and is then trained once more on the
    training input pipeline.

    Args:
        pattern_path: Sliceable collection of watermark paths; a copy is
            handed to both the training and the evaluation input pipelines
            (presumably a list of file paths; confirm against callers).
        model_dir: Directory given to the Estimator as its ``model_dir``.
    """
    watermark_paths = pattern_path
    image_paths = glob('./images/*')

    # A negative split index holds out the last 100 images for evaluation.
    number_for_train = -100
    train_paths_images = image_paths[:number_for_train]
    test_paths_images = image_paths[number_for_train:]
    train_paths_watermark = watermark_paths[:]
    test_paths_watermark = watermark_paths[:]

    model_params = {
        "num_blocks": 4,
        "num_filters": 8,
        "batch_normalization": True,
        "training": True,
    }
    # Hyper-parameters handed to the Estimator as `params`.
    params = {
        "model": make_unet,
        "model_params": model_params,
        "IOU_weight": 1,
        "learning_rate": 0.001,
        "lr_decay_steps": 1000,
        "lr_decay_rate": 0.96,
        "create_summary": True,
    }

    strategy = tf.contrib.distribute.OneDeviceStrategy(device='/gpu:0')
    config = RunConfig(
        save_summary_steps=40,
        train_distribute=strategy,
        save_checkpoints_steps=200,
        keep_checkpoint_max=60,
        eval_distribute=strategy,
    )

    tf.logging.set_verbosity(tf.logging.INFO)
    segmentation_model = tf.estimator.Estimator(
        model_fn=make_unet_estimator,
        model_dir=model_dir,
        params=params,
        config=config,
    )

    train_params = {
        'image_paths': train_paths_images,
        'watermark_paths': train_paths_watermark,
        'batch_size': 10,
        'num_epochs': 2,
    }
    val_params = {
        'image_paths': test_paths_images,
        'watermark_paths': test_paths_watermark,
        'batch_size': 3,
        'num_epochs': 1,
    }

    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(**train_params),
        max_steps=75000,
    )
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(**val_params),
        throttle_secs=100,
        steps=200,
    )
    tf.estimator.train_and_evaluate(segmentation_model, train_spec, eval_spec)

    # Feed the training input parameters here. `params` holds the model
    # hyper-parameters, not the keyword arguments the other `input_fn`
    # calls in this function pass.
    segmentation_model.train(input_fn=lambda: input_fn(**train_params))
def get_config(self, ckpt_output_dir='./output', save_check_steps=1000):
    """Return a RunConfig, creating the session config on first use.

    When ``self.config`` is falsy, a ``tf.ConfigProto`` limited to one GPU,
    with memory growth enabled and a 0.5 per-process memory fraction, is
    created and stored on the instance.

    Args:
        ckpt_output_dir: Value used as the RunConfig ``model_dir``.
        save_check_steps: Value used as ``save_checkpoints_steps``.

    Returns:
        A ``RunConfig`` built from the arguments, ``self.config`` and
        ``self.keep_checkpoint_max``.
    """
    if not self.config:
        session_config = tf.ConfigProto(device_count={'GPU': 1})
        session_config.gpu_options.allow_growth = True
        session_config.gpu_options.per_process_gpu_memory_fraction = 0.5
        self.config = session_config

    return RunConfig(
        model_dir=ckpt_output_dir,
        session_config=self.config,
        keep_checkpoint_max=self.keep_checkpoint_max,
        save_checkpoints_steps=save_check_steps,
    )
def main(args):
    """Run the estimator in the mode selected by ``args.mode``.

    Supported modes are ``'train'``, ``'predict'`` and ``'export'``. The
    model directory and the estimator are created before the mode is
    inspected.

    Args:
        args: Parsed arguments providing ``model_dir``, ``summary_interval``,
            ``checkpoint_interval`` and ``mode``, plus ``data_dir``,
            ``texts`` or ``export_dir`` depending on the mode.

    Raises:
        AssertionError: in predict mode when ``args.texts`` is empty.
        KeyError: when ``args.mode`` is not one of the supported modes.
    """
    os.makedirs(args.model_dir, exist_ok=True)

    run_config = RunConfig(
        save_summary_steps=args.summary_interval,
        save_checkpoints_steps=args.checkpoint_interval,
        session_config=SESS_CFG,
        keep_checkpoint_max=2,
    )
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=args.model_dir,
        params=hparams,
        config=run_config,
    )

    mode = args.mode

    if mode == 'train':
        os.makedirs(args.data_dir, exist_ok=True)

        def training_inputs():
            return train_input_fn(args.data_dir)

        estimator.train(input_fn=training_inputs)
        return

    if mode == 'predict':
        assert len(args.texts), "No text to predict"

        def prediction_inputs():
            return predict_input_fn(args.texts)

        predictions = estimator.predict(input_fn=prediction_inputs)
        # One output file is written per predicted waveform.
        for index, waveform in enumerate(predictions):
            save_wav(inv_preemphasis(waveform), 'output_{}.wav'.format(index))
        return

    if mode == 'export':
        os.makedirs(args.export_dir, exist_ok=True)
        serving_features = {
            'inputs': tf.placeholder(
                dtype=tf.int32, shape=(None, None), name='inputs'),
            'lengths': tf.placeholder(
                dtype=tf.int32, shape=(None, ), name='lengths'),
        }
        receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
            serving_features, default_batch_size=None)
        estimator.export_saved_model(args.export_dir, receiver_fn)
        return

    raise KeyError('Unknown Mode <{}>'.format(args.mode))
def __init__(self, params=None, aux_config=None, run_config=None):
    """Initialise hooks, run configuration and parameters.

    Args:
        params: Optional mapping merged on top of the parameters returned by
            ``self.set_params()``.
        aux_config: Optional mapping of auxiliary settings. Its ``"debug"``
            entry selects the debug hook: falsy means no hook, ``"cli"``
            means a local CLI debug hook, and any other value is used as the
            port of a TensorBoard debug hook on ``localhost``.
        run_config: Optional mapping of keyword arguments for ``RunConfig``.
    """
    self._comet_experiment = None
    self._estimator = None
    self.aux_config = aux_config or {}

    debug_setting = self.aux_config.get("debug")
    if not debug_setting:
        debug_hooks = []
    elif debug_setting == "cli":
        debug_hooks = [tf_debug.LocalCLIDebugHook()]
    else:
        debug_address = "localhost:{}".format(debug_setting)
        debug_hooks = [tf_debug.TensorBoardDebugHook(debug_address)]
    self._hooks = debug_hooks

    run_config_kwargs = run_config or {}
    self.run_config = RunConfig(**run_config_kwargs)

    # Start from the defaults, then let explicit params override them.
    self.params = self.set_params()
    if params:
        self.params.update(params)
def train_and_evaluate(board_size, options):
    """Build the estimator for ``board_size`` and train/evaluate it.

    Args:
        board_size: Forwarded to the input, model and serving function
            factories.
        options: Mapping read for ``train_data_pattern``,
            ``train_batch_size``, ``eval_data_pattern``, ``eval_batch_size``,
            ``max_train_steps``, ``eval_steps``, ``throttle_secs``,
            ``distribute``, ``model_dir``, ``save_summary_steps``,
            ``save_checkpoints_steps`` and ``log_step_count_steps``.

    Returns:
        The value returned by ``tf.estimator.train_and_evaluate``. It was
        previously computed and discarded; callers that ignore the return
        value are unaffected.
    """
    train_input_fn = make_tfr_input_fn(
        options['train_data_pattern'],
        options['train_batch_size'],
        board_size,
        options)
    eval_input_fn = make_tfr_input_fn(
        options['eval_data_pattern'],
        options['eval_batch_size'],
        board_size,
        options)

    model_fn = make_model_fn(board_size, options)
    serving_input_fn = make_serving_input_fn(board_size)

    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn,
        max_steps=options['max_train_steps'])
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn,
        exporters=exporter,
        steps=options['eval_steps'],
        throttle_secs=options['throttle_secs'],
        start_delay_secs=0)

    # Only distribute training when explicitly requested.
    strategy = MirroredStrategy() if options['distribute'] else None
    config = RunConfig(
        model_dir=options['model_dir'],
        save_summary_steps=options['save_summary_steps'],
        train_distribute=strategy,
        save_checkpoints_steps=options['save_checkpoints_steps'],
        log_step_count_steps=options['log_step_count_steps'])
    estimator = tf.estimator.Estimator(config=config, model_fn=model_fn)

    # Finally, train and evaluate the model and hand the result back.
    final_eval = tf.estimator.train_and_evaluate(
        estimator, train_spec=train_spec, eval_spec=eval_spec)
    return final_eval
def get_run_config(strategy):
    """Build the Estimator run configuration.

    Args:
        strategy: Value passed to ``get_strategy``; the strategy it returns
            is used for both training and evaluation.

    Returns:
        RunConfig: configuration that saves summaries every 10 steps and
        checkpoints every 20 steps, keeps 5 checkpoints, logs the step count
        every 100 steps and uses ``SESS_CONFIG`` as session config.
    """
    resolved_strategy = get_strategy(strategy)

    run_config_kwargs = {
        'model_dir': None,
        'tf_random_seed': None,
        'save_summary_steps': 10,
        'save_checkpoints_steps': 20,
        'session_config': SESS_CONFIG,
        'keep_checkpoint_max': 5,
        'log_step_count_steps': 100,
        'train_distribute': resolved_strategy,
        'device_fn': None,
        'protocol': None,
        'eval_distribute': resolved_strategy,
        'experimental_distribute': None,
        'experimental_max_worker_delay_secs': None,
    }
    return RunConfig(**run_config_kwargs)
def main(_):
    """Fine-tune, evaluate and/or predict with a BERT classifier.

    The phases that run are selected by ``FLAGS.do_train``,
    ``FLAGS.do_eval`` and ``FLAGS.do_predict``. Training is distributed
    with a ``MirroredStrategy`` over ``FLAGS.n_gpus`` GPUs. Each phase
    writes its TFRecord file and its results under ``FLAGS.output_dir``.

    Raises:
        ValueError: if no phase is enabled, if ``FLAGS.max_seq_length``
            exceeds the model's ``max_position_embeddings``, or if
            ``FLAGS.task_name`` is not a known task.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    task_processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not (FLAGS.do_train or FLAGS.do_eval or FLAGS.do_predict):
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in task_processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = task_processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # The TPU resolver and input-pipeline mode are still computed, but the
    # RunConfig below does not consume them.
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # Multi-GPU training with NCCL all-reduce, one pack per GPU.
    dist_strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=FLAGS.n_gpus,
        cross_device_ops=AllReduceCrossDeviceOps('nccl',
                                                 num_packs=FLAGS.n_gpus),
    )
    log_every_n_steps = 8
    run_config = RunConfig(
        train_distribute=dist_strategy,
        log_step_count_steps=log_every_n_steps,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # A plain Estimator is used; no TPUEstimator is constructed here.
    estimator = Estimator(model_fn=model_fn, params={}, config=run_config)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", len(train_examples))
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU needs a fixed batch size for every batch, so the example
            # count is padded up to a multiple of the batch size with fake
            # examples. These do NOT count towards the metric (all
            # tf.metrics support a per-instance weight, and these get a
            # weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        if FLAGS.use_tpu:
            # On TPU the number of eval steps has to be given explicitly.
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
        else:
            # None tells the estimator to run through the entire set.
            eval_steps = None

        eval_drop_remainder = bool(FLAGS.use_tpu)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        eval_metrics = estimator.evaluate(input_fn=eval_input_fn,
                                          steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for metric_name in sorted(eval_metrics.keys()):
                metric_text = str(eval_metrics[metric_name])
                tf.logging.info(" %s = %s", metric_name, metric_text)
                writer.write("%s = %s\n" % (metric_name, metric_text))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # Same fixed-batch-size padding as for evaluation; the fake
            # examples are ignored later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = bool(FLAGS.use_tpu)
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        predictions = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for index, prediction in enumerate(predictions):
                probabilities = prediction["probabilities"]
                # Stop once the real (non-padding) examples are written.
                if index >= num_actual_predict_examples:
                    break
                columns = [str(class_probability)
                           for class_probability in probabilities]
                writer.write("\t".join(columns) + "\n")
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
"num_filters": 8, "batch_normalization": True, "training": True} params = {"model": make_unet, "model_params": model_params, "IOU_weight": 1, "learning_rate": 0.001, "lr_decay_steps": 1000, "lr_decay_rate": 0.96, "create_summary": True} strategy = tf.contrib.distribute.OneDeviceStrategy(device='/gpu:0') config = RunConfig(save_summary_steps=40, train_distribute=strategy, save_checkpoints_steps = 200, keep_checkpoint_max = 60, eval_distribute=strategy, ) tf.logging.set_verbosity(tf.logging.INFO) segmentation_model = tf.estimator.Estimator( model_fn=make_unet_estimator, model_dir="/home/dokholyan/Projects/new_experiments/milk_blocks_4_fea_8_IOU_01_norm_Tr/", params=params, config=config ) train_params = {'image_paths': train_paths_images, 'watermark_paths': train_paths_watermark, 'batch_size': 10, 'num_epochs': 500 }
def main(_):
    """Run BERT pre-training and/or evaluation on the input files.

    Input files are collected by globbing every comma-separated pattern in
    ``FLAGS.input_file``. Training is distributed with a
    ``MirroredStrategy`` over ``FLAGS.n_gpus`` GPUs. Evaluation metrics are
    logged and written to ``eval_results.txt`` under ``FLAGS.output_dir``.

    Raises:
        ValueError: if neither ``FLAGS.do_train`` nor ``FLAGS.do_eval`` is
            set.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if not (FLAGS.do_train or FLAGS.do_eval):
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    # Expand every comma-separated glob pattern into concrete file paths.
    input_files = [
        matched_path
        for input_pattern in FLAGS.input_file.split(",")
        for matched_path in tf.gfile.Glob(input_pattern)
    ]

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info(" %s" % input_file)

    # The TPU resolver and input-pipeline mode are still computed, but the
    # RunConfig below does not consume them.
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # Multi-GPU training with NCCL all-reduce, one pack per GPU.
    dist_strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=FLAGS.n_gpus,
        cross_device_ops=AllReduceCrossDeviceOps('nccl',
                                                 num_packs=FLAGS.n_gpus),
    )
    log_every_n_steps = 8
    run_config = RunConfig(
        train_distribute=dist_strategy,
        log_step_count_steps=log_every_n_steps,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # A plain Estimator is used; no TPUEstimator is constructed here.
    estimator = Estimator(model_fn=model_fn, params={}, config=run_config)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True)
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)

        eval_metrics = estimator.evaluate(input_fn=eval_input_fn,
                                          steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for metric_name in sorted(eval_metrics.keys()):
                metric_text = str(eval_metrics[metric_name])
                tf.logging.info(" %s = %s", metric_name, metric_text)
                writer.write("%s = %s\n" % (metric_name, metric_text))
Steps to run the training job for. If --num-epochs is not specified, this must be. Otherwise the training job will run indefinitely.\ """, type=int, required=True) parser.add_argument( '--eval-steps', help='Number of steps to run evalution for at each checkpoint', default=100, type=int) parser.add_argument('--trainer-type', help='Which trainer to use (spam or component)', choices=['spam', 'component'], required=True) args = parser.parse_args() logger = logging.getLogger() logger.setLevel(getattr(logging, args.verbosity)) if not args.num_epochs: args.num_epochs = args.train_steps # Set C++ Graph Execution level verbosity. os.environ['TF_CPP_MIN_LOG_LEVEL'] = str( getattr(logging, args.verbosity) / 10) # Run the training job. train_and_evaluate_model(config=RunConfig(model_dir=args.job_dir), hparams=vars(args))
def train_and_evaluate(options):
    """Train and evaluate the model inside an MLflow run.

    Selected options are logged as MLflow parameters. After training, the
    final ``loss`` and ``mean_error`` are logged as MLflow metrics when an
    evaluation result is available.

    Args:
        options: Mapping of run options. Besides the logged parameters it is
            read for ``file_format``, ``train_data_pattern``,
            ``eval_data_pattern``, ``eval_batch_size``, ``hypothesis``,
            ``metadata_dir``, ``eval_steps``, ``throttle_secs``,
            ``distribute``, ``model_dir``, ``save_summary_steps``,
            ``save_checkpoints_steps`` and ``log_step_count_steps``.

    Returns:
        The value returned by ``tf.estimator.train_and_evaluate``. This may
        be ``None``, since TensorFlow documents the return value as
        undefined in distributed training mode.
    """
    import tensorflow as tf
    from tensorflow.estimator import RunConfig
    from tensorflow.contrib.distribute import MirroredStrategy
    import mlflow
    from train.make_model_fn import make_model_fn
    from train.make_tft_serving_input_fn import make_tft_serving_input_fn
    from train.create_feature_columns import create_feature_columns
    from train.make_tfr_input_fn import make_tfr_input_fn
    from train.make_hypotheses import make_hypotheses
    from train.make_input_fns import make_input_fns

    with mlflow.start_run():
        log_params = [
            'base_dir',
            'file_format',
            'train_batch_size',
            'max_train_steps',
            'reader_num_threads',
            'parser_num_threads',
            'prefetch_buffer_size',
        ]
        for key in log_params:
            mlflow.log_param(key, options[key])

        # Train and eval input functions, chosen by file format.
        make_input_fn = make_input_fns()[options['file_format']]
        train_input_fn = make_input_fn(options['train_data_pattern'],
                                       options['train_batch_size'],
                                       options)
        eval_input_fn = make_input_fn(options['eval_data_pattern'],
                                      options['eval_batch_size'],
                                      options)

        # Hypothesis and model function.
        hypothesis = make_hypotheses()[options['hypothesis']]
        feature_columns = create_feature_columns()
        model_fn = make_model_fn(feature_columns, options, hypothesis)

        # Train and eval specs.
        serving_input_fn = make_tft_serving_input_fn(options['metadata_dir'])
        exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
        train_spec = tf.estimator.TrainSpec(
            input_fn=train_input_fn,
            max_steps=options['max_train_steps'])
        eval_spec = tf.estimator.EvalSpec(
            input_fn=eval_input_fn,
            exporters=exporter,
            steps=options['eval_steps'],
            throttle_secs=options['throttle_secs'],
            start_delay_secs=0)

        # Estimator configuration; distribution only when requested.
        strategy = MirroredStrategy() if options['distribute'] else None
        config = RunConfig(
            model_dir=options['model_dir'],
            save_summary_steps=options['save_summary_steps'],
            train_distribute=strategy,
            save_checkpoints_steps=options['save_checkpoints_steps'],
            log_step_count_steps=options['log_step_count_steps'])
        estimator = tf.estimator.Estimator(config=config, model_fn=model_fn)

        # Finally, train and evaluate the model.
        final_eval = tf.estimator.train_and_evaluate(estimator,
                                                     train_spec=train_spec,
                                                     eval_spec=eval_spec)

        # Guard against a missing result before indexing into it, so that a
        # None return does not raise TypeError.
        if final_eval is not None and final_eval[0] is not None:
            eval_metrics = final_eval[0]
            mlflow.log_metric('loss', eval_metrics['loss'])
            mlflow.log_metric('mean_error', eval_metrics['mean_error'])

        return final_eval