def get_estimator():
    """Builds a `tf.estimator.Estimator` around a canned DNN ranking Keras model."""
    context_cols, example_cols = _get_feature_columns()
    # To build your own custom ranking network, subclass
    # tfr.keras.network.UnivariateRankingNetwork (or the more generic
    # tfr.keras.network.RankingNetwork); see how the canned
    # DNNRankingNetwork is implemented for reference.
    dnn_network = tfr.keras.canned.DNNRankingNetwork(
        context_feature_columns=context_cols,
        example_feature_columns=example_cols,
        hidden_layer_dims=[int(dim) for dim in FLAGS.hidden_layer_dims],
        activation=tf.nn.relu,
        dropout=FLAGS.dropout_rate,
        use_batch_norm=True,
        batch_norm_moment=0.99,
        name="dnn_ranking_model")
    ranking_loss = tfr.keras.losses.get(
        FLAGS.loss,
        reduction=tf.compat.v2.losses.Reduction.SUM_OVER_BATCH_SIZE)
    ranker = tfr.keras.model.create_keras_model(
        network=dnn_network,
        loss=ranking_loss,
        metrics=tfr.keras.metrics.default_keras_metrics(),
        optimizer=tf.keras.optimizers.Adagrad(learning_rate=FLAGS.learning_rate),
        size_feature_name=_SIZE)
    return tfr.keras.estimator.model_to_estimator(
        model=ranker,
        model_dir=FLAGS.model_dir,
        config=tf_estimator.RunConfig(save_checkpoints_steps=1000),
        weights_feature_name=FLAGS.weights_feature_name)
def get_keras_estimator():
    """Returns a Keras based estimator for GAM ranking model."""
    gam_network = tfr.keras.canned.GAMRankingNetwork(
        context_feature_columns=None,
        example_feature_columns=example_feature_columns(),
        example_hidden_layer_dims=FLAGS.hidden_layer_dims,
        activation=tf.nn.relu,
        dropout=FLAGS.dropout,
        use_batch_norm=True,
        batch_norm_moment=0.999,
        name="gam_ranking_network")
    run_config = tf_estimator.RunConfig(
        model_dir=FLAGS.model_dir,
        keep_checkpoint_max=FLAGS.num_checkpoints,
        save_checkpoints_secs=FLAGS.checkpoint_secs)
    keras_ranker = tfr.keras.model.create_keras_model(
        network=gam_network,
        loss=tfr.keras.losses.get(
            FLAGS.loss,
            reduction=tf.compat.v2.losses.Reduction.SUM_OVER_BATCH_SIZE),
        metrics=tfr.keras.metrics.default_keras_metrics(),
        optimizer=tf.keras.optimizers.Adagrad(learning_rate=FLAGS.learning_rate),
        size_feature_name=_SIZE)
    return tfr.keras.estimator.model_to_estimator(
        model=keras_ranker, model_dir=FLAGS.model_dir, config=run_config)
def multitask_progress():
    """Trains the multi-task patch model on the top-p ranked samples each epoch.

    Every epoch re-selects the top `p` fraction of the training set from the
    (externally maintained) rank list and runs one training pass over it.
    """
    # NOTE: an unused `import libs.model.losses as libms` was removed here;
    # this function never computes a loss itself.
    import libs.common.data_interface as libdi

    model_params = {
        "learning_rate": args.learning_rate,
        'model_dir': args.model_m_dir
    }
    # Session config is currently unused (the `config.replace` line is
    # commented out); kept so GPU memory capping is easy to re-enable.
    session_config = tf.ConfigProto(log_device_placement=True)
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.8
    config = tfe.RunConfig(model_dir=args.model_m_dir, save_summary_steps=10)
    # config = config.replace(session_config=session_config)
    network = tf.estimator.Estimator(model_fn=model_fn.patch_multitask_fn,
                                     model_dir=args.model_m_dir,
                                     config=config,
                                     params=model_params)
    print(args)
    p = 0.7  # fraction of top-ranked samples used for training each epoch
    data, label = data_input.MA_segmention_data()
    # data,label= data_input.MA_segmention_debug_data()
    rank_list = libdi.rank_list(len(label))
    for epoch in range(args.n_epoch):
        print('num of data:', len(label))
        print('batch_size:', args.train_batch_size)
        print('n_epoch:', args.n_epoch)
        # Restrict the dataset views to the current top-p ranked indices.
        new_index = rank_list.get_top(p)
        data.update_index(new_index)
        label.update_index(new_index)
        input_fn = tfe.inputs.numpy_input_fn(x={"images": data},
                                             y=label,
                                             batch_size=args.train_batch_size,
                                             shuffle=False)
        print('==== train:{}/{}=========='.format(epoch, args.n_epoch))
        network.train(input_fn=input_fn,
                      steps=len(label) // args.train_batch_size)
def setUp(self):
    """Builds shared fixtures: feature columns, network, loss, metrics, config,
    and a small ELWC TFRecord data file used by the tests."""
    super(KerasModelToEstimatorTest, self).setUp()
    (context_feature_columns, example_feature_columns,
     custom_objects) = _get_feature_columns()
    self._context_feature_columns = context_feature_columns
    self._example_feature_columns = example_feature_columns
    # Remove label feature from example feature column.
    del self._example_feature_columns[_LABEL_FEATURE]
    self._custom_objects = custom_objects
    self._network = _DummyUnivariateRankingNetwork(
        context_feature_columns=self._context_feature_columns,
        example_feature_columns=self._example_feature_columns)
    self._loss = losses.get(
        losses.RankingLossKey.SOFTMAX_LOSS,
        reduction=tf.compat.v2.losses.Reduction.SUM_OVER_BATCH_SIZE)
    self._eval_metrics = metrics.default_keras_metrics()
    self._optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
    self._config = tf_estimator.RunConfig(keep_checkpoint_max=2,
                                          save_checkpoints_secs=2)
    self._data_file = os.path.join(tf.compat.v1.test.get_temp_dir(),
                                   'test_elwc.tfrecord')
    # Write 20 copies of the canned ELWC proto, replacing any stale file
    # left over from a previous run.
    serialized_elwc_list = [
        _ELWC_PROTO.SerializeToString(),
    ] * 20
    if tf.io.gfile.exists(self._data_file):
        tf.io.gfile.remove(self._data_file)
    with tf.io.TFRecordWriter(self._data_file) as writer:
        for serialized_elwc in serialized_elwc_list:
            writer.write(serialized_elwc)
def make_estimator(self):
    """Returns the built `tf.estimator.Estimator` for the TF-Ranking model."""
    hparams = self._hparams
    run_config = tf_estimator.RunConfig(
        model_dir=hparams.get("model_dir"),
        keep_checkpoint_max=hparams.get("num_checkpoints"),
        save_checkpoints_secs=hparams.get("checkpoint_secs"))
    return tf_estimator.Estimator(model_fn=self._model_fn(), config=run_config)
def main(unused_argv):
    """Trains and periodically evaluates the (optionally DP-SGD) RNN language model."""
    logger = tf.get_logger()
    logger.set_level(logging.INFO)

    # DP-SGD microbatching requires the batch to split evenly.
    if FLAGS.batch_size % FLAGS.microbatches != 0:
        raise ValueError(
            'Number of microbatches should divide evenly batch_size')

    # Load training and test data.
    train_data, test_data = load_data()

    # Instantiate the tf.Estimator.
    conf = tf_estimator.RunConfig(save_summary_steps=1000)
    lm_classifier = tf_estimator.Estimator(model_fn=rnn_model_fn,
                                           model_dir=FLAGS.model_dir,
                                           config=conf)

    # Create tf.Estimator input functions for the training and test data.
    # Truncate both datasets to a whole number of length-SEQ_LEN batches.
    batch_len = FLAGS.batch_size * SEQ_LEN
    train_data_end = len(train_data) - len(train_data) % batch_len
    test_data_end = len(test_data) - len(test_data) % batch_len
    train_input_fn = tf_compat_v1_estimator.inputs.numpy_input_fn(
        x={'x': train_data[:train_data_end]},
        batch_size=batch_len,
        num_epochs=FLAGS.epochs,
        shuffle=False)
    eval_input_fn = tf_compat_v1_estimator.inputs.numpy_input_fn(
        x={'x': test_data[:test_data_end]},
        batch_size=batch_len,
        num_epochs=1,
        shuffle=False)

    # Training loop.
    steps_per_epoch = len(train_data) // batch_len
    for epoch in range(1, FLAGS.epochs + 1):
        print('epoch', epoch)
        # Train the model for one epoch.
        lm_classifier.train(input_fn=train_input_fn, steps=steps_per_epoch)
        # Evaluate on both splits every 5th epoch only (evaluation is slow).
        if epoch % 5 == 0:
            name_input_fn = [('Train', train_input_fn), ('Eval', eval_input_fn)]
            for name, input_fn in name_input_fn:
                # Evaluate the model and print results
                eval_results = lm_classifier.evaluate(input_fn=input_fn)
                result_tuple = (epoch, eval_results['accuracy'],
                                eval_results['loss'])
                print(
                    name,
                    'accuracy after %d epochs is: %.3f (%.4f)' % result_tuple)

        # Compute the privacy budget expended so far.
        if FLAGS.dpsgd:
            eps = compute_epsilon(epoch * steps_per_epoch)
            print('For delta=1e-5, the current epsilon is: %.2f' % eps)
        else:
            print('Trained with vanilla non-private SGD optimizer')
def activate_progress():
    """Self-paced training loop for the patch segmentation model.

    Each epoch: train on the current top-p ranked samples, then predict over
    the full set, accumulate per-sample IoU losses, and re-rank samples for
    the next epoch.
    """
    import libs.common.data_interface as libdi
    import libs.model.losses as libms
    model_params = {
        "learning_rate": args.learning_rate,
        'model_dir': args.model_s_dir
    }
    # session_config = tf.ConfigProto(log_device_placement=True)
    # session_config.gpu_options.per_process_gpu_memory_fraction = 0.9
    config = tfe.RunConfig(model_dir=args.model_s_dir, save_summary_steps=10)
    # config = config.replace(session_config=session_config)
    network = tf.estimator.Estimator(model_fn=model_fn.patch_segmentation_fn,
                                     model_dir=args.model_s_dir,
                                     config=config,
                                     params=model_params)
    print(args)
    p = 1.0  # fraction of top-ranked samples kept for training each epoch
    data, label = data_input.MA_segmention_data()
    # data,label= data_input.MA_segmention_debug_data()
    rank_list = libdi.rank_list(len(label))
    for epoch in range(args.n_epoch):
        print('num of data:', len(label))
        print('batch_size:', args.train_batch_size)
        print('n_epoch:', args.n_epoch)
        # Restrict the dataset views to the current top-p ranked indices.
        new_index = rank_list.get_top(p)
        data.update_index(new_index)
        label.update_index(new_index)
        input_fn = tfe.inputs.numpy_input_fn(x={"images": data},
                                             y=label,
                                             batch_size=args.train_batch_size,
                                             shuffle=False)
        print('==== train:{}/{}=========='.format(epoch, args.n_epoch))
        network.train(input_fn=input_fn,
                      steps=len(label) // args.train_batch_size)
        print('===== eval:{}/{}=========='.format(epoch, args.n_epoch))
        # Reset the views to the full dataset before evaluating/re-ranking.
        data.update_index()
        label.update_index()
        input_fn = tfe.inputs.numpy_input_fn(x={"images": data},
                                             y=label,
                                             batch_size=args.eval_batch_size,
                                             shuffle=False)
        r = network.predict(input_fn=input_fn)
        total_loss = 0
        for i, _r in enumerate(r):
            # Per-sample IoU loss (first channel only) drives the re-ranking.
            l_loss = libms.np_iou_loss(_r['predict'][:, :, 0],
                                       label[i][0][:, :, 0])
            total_loss += l_loss
            rank_list.update(i, l_loss)
        rank_list.sort()
        print('eval_loss:{}'.format(total_loss / len(label)))
        print('==========={}/{}=end======'.format(epoch, args.n_epoch))
def train_and_eval():
    """Train and Evaluate."""
    train_input_fn = make_input_fn(FLAGS.train_path, FLAGS.batch_size)
    eval_input_fn = make_input_fn(FLAGS.eval_path,
                                  FLAGS.batch_size,
                                  randomize_input=False,
                                  num_epochs=1)

    optimizer = tf.compat.v1.train.AdagradOptimizer(
        learning_rate=FLAGS.learning_rate)

    def _train_op_fn(loss):
        """Defines train op used in ranking head."""
        # Group UPDATE_OPS (e.g. batch-norm moving-average updates) with the
        # minimize step so they run on every training step.
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        minimize_op = optimizer.minimize(
            loss=loss, global_step=tf.compat.v1.train.get_global_step())
        train_op = tf.group([minimize_op, update_ops])
        return train_op

    ranking_head = tfr.head.create_ranking_head(
        loss_fn=tfr.losses.make_loss_fn(
            FLAGS.loss, weights_feature_name=FLAGS.weights_feature_name),
        eval_metric_fns=eval_metric_fns(),
        train_op_fn=_train_op_fn)

    estimator = tf_estimator.Estimator(
        model_fn=tfr.model.make_groupwise_ranking_fn(
            group_score_fn=make_score_fn(),
            group_size=FLAGS.group_size,
            transform_fn=make_transform_fn(),
            ranking_head=ranking_head),
        model_dir=FLAGS.model_dir,
        config=tf_estimator.RunConfig(save_checkpoints_steps=1000))

    train_spec = tf_estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=FLAGS.num_train_steps)
    exporters = tf_estimator.LatestExporter(
        "saved_model_exporter",
        serving_input_receiver_fn=make_serving_input_fn())
    eval_spec = tf_estimator.EvalSpec(name="eval",
                                      input_fn=eval_input_fn,
                                      steps=1,
                                      exporters=exporters,
                                      start_delay_secs=0,
                                      throttle_secs=15)

    # Train and validate.
    tf_estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def train_and_maybe_evaluate(hparams):
    """Run the training and evaluate using the high level API.

    Args:
      hparams: Holds hyperparameters used to train the model as name/value
        pairs.

    Returns:
      The estimator that was used for training (and maybe eval)
    """
    schema = taxi.read_schema(hparams.schema_file)
    tf_transform_output = tft.TFTransformOutput(hparams.tf_transform_dir)

    def train_input():
        return model.input_fn(hparams.train_files, tf_transform_output,
                              batch_size=TRAIN_BATCH_SIZE)

    def eval_input():
        return model.input_fn(hparams.eval_files, tf_transform_output,
                              batch_size=EVAL_BATCH_SIZE)

    def serving_receiver_fn():
        return model.example_serving_receiver_fn(tf_transform_output, schema)

    train_spec = tf_estimator.TrainSpec(train_input,
                                        max_steps=hparams.train_steps)
    exporter = tf_estimator.FinalExporter('chicago-taxi', serving_receiver_fn)
    eval_spec = tf_estimator.EvalSpec(eval_input,
                                      steps=hparams.eval_steps,
                                      exporters=[exporter],
                                      name='chicago-taxi-eval')

    serving_model_dir = os.path.join(hparams.output_dir, SERVING_MODEL_DIR)
    run_config = tf_estimator.RunConfig(
        save_checkpoints_steps=999,
        keep_checkpoint_max=1).replace(model_dir=serving_model_dir)

    # Construct layer sizes with exponential decay, floored at 2 units.
    hidden_units = [
        max(2, int(FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**i))
        for i in range(NUM_DNN_LAYERS)
    ]
    estimator = model.build_estimator(tf_transform_output,
                                      hidden_units=hidden_units,
                                      config=run_config)

    tf_estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    return estimator
def es_gan_progress():
    """Builds a TF-GAN patch model and its combined adversarial + L1 pixel loss."""
    tfgan = tf.contrib.gan
    # NOTE(review): the params/configs below are built but never used in this
    # function — presumably intended for estimators that are not created
    # here; verify before relying on them.
    g_model_params = {
        "learning_rate": args.learning_rate,
        'model_dir': args.model_g_dir
    }
    d_model_params = {
        "learning_rate": args.learning_rate,
        'model_dir': args.model_d_dir
    }
    g_config = tfe.RunConfig(model_dir=args.model_g_dir, save_summary_steps=10)
    d_config = tfe.RunConfig(model_dir=args.model_d_dir, save_summary_steps=10)
    # NOTE(review): assumes `real_images` and `distorted_images` are defined
    # at module scope — TODO confirm.
    gan_model = tfgan.gan_model(generator_fn=model_fn.g_patch,
                                discriminator_fn=model_fn.d_patch,
                                real_data=real_images,
                                generator_inputs=distorted_images)
    tfgan.train.add_image_comparison_summaries(gan_model,
                                               num_comparisons=3,
                                               display_diffs=True)
    tfgan.train.add_gan_model_image_summaries(gan_model, grid_size=3)
    with tf.name_scope('losses'):
        gan_loss = tfgan.gan_loss(
            gan_model,
            generator_loss_fn=tfgan.losses.least_squares_generator_loss,
            discriminator_loss_fn=tfgan.losses.least_squares_discriminator_loss
        )
        # L1 distance between real and generated patches, normalized by the
        # patch area so the weight factor is resolution-independent.
        l1_pixel_loss = tf.norm(gan_model.real_data - gan_model.generated_data,
                                ord=1) / FLAGS.patch_size**2
        gan_loss = tfgan.losses.combine_adversarial_loss(
            gan_loss,
            gan_model,
            l1_pixel_loss,
            weight_factor=FLAGS.weight_factor)
def my_gan_progress():
    """Training scaffold for the patch GAN generator/discriminator estimators."""
    g_model_params = {
        "learning_rate": args.learning_rate,
        'model_dir': args.model_g_dir
    }
    d_model_params = {
        "learning_rate": args.learning_rate,
        'model_dir': args.model_d_dir
    }
    g_config = tfe.RunConfig(model_dir=args.model_g_dir, save_summary_steps=10)
    d_config = tfe.RunConfig(model_dir=args.model_d_dir, save_summary_steps=10)
    g_network = tf.estimator.Estimator(model_fn=model_fn.patch_generator_fn,
                                       model_dir=args.model_g_dir,
                                       config=g_config,
                                       params=g_model_params)
    # NOTE(review): `d_network` is built but never trained below — the
    # alternating discriminator step appears to be missing or elsewhere.
    d_network = tf.estimator.Estimator(
        model_fn=model_fn.patch_discriminator_fn,
        model_dir=args.model_d_dir,
        config=d_config,
        params=d_model_params)
    g_input_fn, data, label = data_input.train_input_fn_layer_segmention()
    print('num of data:', len(label))
    print('batch_size:', args.batch_size)
    print('n_epoch:', args.n_epoch)
    print(args)
    for epoch in range(args.n_epoch):
        train_input_fn, data, label = data_input.train_input_fn_layer_segmention(
        )
        print('=========={}/{}=========='.format(epoch, args.n_epoch))
        # NOTE(review): `nn` is not defined in this function — as written
        # this raises NameError unless a module-level `nn` exists; it
        # presumably should be `g_network`. Verify before running.
        nn.train(input_fn=train_input_fn,
                 steps=len(label) // args.batch_size)
def train_and_evaluate(working_dir,
                       num_train_instances=common.NUM_TRAIN_INSTANCES,
                       num_test_instances=common.NUM_TEST_INSTANCES):
    """Train the model on training data and evaluate on test data.

    Args:
      working_dir: Directory to read transformed data and metadata from and to
        write exported model to.
      num_train_instances: Number of instances in train set
      num_test_instances: Number of instances in test set

    Returns:
      The results from the estimator's 'evaluate' method
    """
    tf_transform_output = tft.TFTransformOutput(working_dir)

    run_config = tf_estimator.RunConfig()
    estimator = tf_estimator.LinearClassifier(
        feature_columns=get_feature_columns(tf_transform_output),
        config=run_config,
        loss_reduction=tf.losses.Reduction.SUM)

    # Fit the model using the default optimizer.
    train_input_fn = _make_training_input_fn(
        tf_transform_output,
        os.path.join(working_dir,
                     common.TRANSFORMED_TRAIN_DATA_FILEBASE + '*'),
        batch_size=common.TRAIN_BATCH_SIZE)
    # BUGFIX: use floor division — true division (`/`) produces a float, but
    # `Estimator.train` expects `max_steps` to be an integer.
    estimator.train(input_fn=train_input_fn,
                    max_steps=common.TRAIN_NUM_EPOCHS * num_train_instances //
                    common.TRAIN_BATCH_SIZE)

    # Evaluate model on test dataset.
    eval_input_fn = _make_training_input_fn(
        tf_transform_output,
        os.path.join(working_dir, common.TRANSFORMED_TEST_DATA_FILEBASE + '*'),
        batch_size=1)

    # Export the model.
    serving_input_fn = _make_serving_input_fn(tf_transform_output)
    exported_model_dir = os.path.join(working_dir, common.EXPORTED_MODEL_DIR)
    estimator.export_saved_model(exported_model_dir, serving_input_fn)

    return estimator.evaluate(input_fn=eval_input_fn, steps=num_test_instances)
def default_params(distribute_strategy, linear_feature_columns,
                   embedding_feature_columns):
    """Builds the shared RunConfig and model-params dict for the estimator."""
    run_config = estimator.RunConfig(
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=5,
        log_step_count_steps=FLAGS.log_steps,
        save_summary_steps=200,
        train_distribute=distribute_strategy,
        eval_distribute=distribute_strategy)
    hyper_params = {
        "linear_feature_columns": linear_feature_columns,
        "embedding_feature_columns": embedding_feature_columns,
        "embedding_size": FLAGS.embedding_size,
        "learning_rate": FLAGS.learning_rate,
        "dropout": FLAGS.dropout,
        "deep_layers": FLAGS.deep_layers,
        "cross_layers": FLAGS.cross_layers,
    }
    return run_config, hyper_params
def __init__(self,
             model_dir=None,
             tf_random_seed=None,
             save_summary_steps=100,
             save_checkpoints_steps=_USE_DEFAULT,
             save_checkpoints_secs=_USE_DEFAULT,
             session_config=None,
             keep_checkpoint_max=5,
             keep_checkpoint_every_n_hours=10000,
             log_step_count_steps=100,
             train_distribute=None,
             device_fn=None,
             protocol=None,
             eval_distribute=None,
             experimental_distribute=None):
    """Thin wrapper that builds and stores an `es.RunConfig`.

    All arguments mirror `tf.estimator.RunConfig`'s constructor and are
    forwarded unchanged. ROBUSTNESS: forwards by keyword rather than
    positionally, so the mapping cannot silently break if the RunConfig
    parameter order ever changes. (Parameter names match the documented
    `tf.estimator.RunConfig` keywords.)
    """
    self.config = es.RunConfig(
        model_dir=model_dir,
        tf_random_seed=tf_random_seed,
        save_summary_steps=save_summary_steps,
        save_checkpoints_steps=save_checkpoints_steps,
        save_checkpoints_secs=save_checkpoints_secs,
        session_config=session_config,
        keep_checkpoint_max=keep_checkpoint_max,
        keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
        log_step_count_steps=log_step_count_steps,
        train_distribute=train_distribute,
        device_fn=device_fn,
        protocol=protocol,
        eval_distribute=eval_distribute,
        experimental_distribute=experimental_distribute)
def train_model(model_fn, train_input_fn, validation_input_fn, params):
    """Trains a model.

    Args:
      model_fn: (fn) A tf.Estimator model_fn.
      train_input_fn: (fn) A tf.Estimator input_fn for the training data.
      validation_input_fn: (fn) A tf.Estimator input_fn for the validation
        data.
      params: (dict) Model hyperparameters.
    """
    # keep_checkpoint_max=None retains every checkpoint written.
    run_config = tf_estimator.RunConfig(
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=FLAGS.train_steps_per_eval,
        keep_checkpoint_max=None)
    logging.warn('RUN CONFIG: %r', run_config)
    model = tf_estimator.Estimator(model_fn=model_fn,
                                   params=params,
                                   config=run_config)
    experiment = tf.contrib.learn.Experiment(
        model,
        train_input_fn=train_input_fn,
        eval_input_fn=validation_input_fn,
        train_steps=FLAGS.max_train_steps,
        eval_steps=None,
        eval_delay_secs=FLAGS.eval_throttle_secs,
        train_steps_per_iteration=FLAGS.train_steps_per_eval)
    # WARNING: train_steps_per_iteration should be >= train epoch size,
    # because the train input queue is reset upon each evaluation in the
    # Experiment implementation currently; i.e., you might only ever train on
    # a subset of the training data if you configure
    # train_steps_per_iteration < epoch size.
    #
    # See https://github.com/tensorflow/tensorflow/issues/11013
    precision_early_stopper = train_utils.EarlyStopper(
        num_evals_to_wait=FLAGS.early_stopper_num_evals_to_wait,
        metric_key=FLAGS.eval_metric)
    # Training stops early once the tracked eval metric stops improving.
    experiment.continuous_train_and_eval(continuous_eval_predicate_fn=(
        precision_early_stopper.early_stop_predicate_fn))
def norm_progress():
    """Plain (non-ranked) epoch-by-epoch training loop for the patch segmentation model."""
    estimator_params = {
        "learning_rate": args.learning_rate,
        'model_dir': args.model_s_dir
    }
    # Cap GPU memory usage and log device placement for debugging.
    sess_cfg = tf.ConfigProto(log_device_placement=True)
    sess_cfg.gpu_options.per_process_gpu_memory_fraction = 0.8
    run_cfg = tfe.RunConfig(model_dir=args.model_s_dir, save_summary_steps=10)
    run_cfg = run_cfg.replace(session_config=sess_cfg)
    seg_estimator = tf.estimator.Estimator(
        model_fn=model_fn.patch_segmentation_fn,
        model_dir=args.model_s_dir,
        config=run_cfg,
        params=estimator_params)
    input_fn, data, label = data_input.MA_segmention_input()
    print('num of data:', len(label))
    print('batch_size:', args.train_batch_size)
    print('n_epoch:', args.n_epoch)
    print(args)
    for epoch in range(args.n_epoch):
        # Rebuild the input pipeline each epoch.
        input_fn, data, label = data_input.MA_segmention_input()
        print('=========={}/{}=========='.format(epoch, args.n_epoch))
        seg_estimator.train(input_fn=input_fn,
                            steps=len(label) // args.train_batch_size)
def main(unused_argv):
    """Evaluates a checkpointed model on the test split of the target dataset."""
    config = tf_estimator.RunConfig()
    classifier = tf_estimator.Estimator(get_model_fn(), config=config)

    def _merge_datasets(test_batch):
        # Repackage the TFDS batch dict into the (features, labels)
        # structure the estimator's model_fn expects.
        feature, label = test_batch['image'], test_batch['label'],
        features = {
            'feature': feature,
        }
        labels = {
            'label': label,
        }
        return (features, labels)

    def get_dataset(dataset_split):
        """Returns dataset creation function."""

        def make_input_dataset():
            """Returns input dataset."""
            test_data = tfds.load(name=FLAGS.target_dataset,
                                  split=dataset_split)
            test_data = test_data.batch(FLAGS.train_batch_size)
            dataset = tf.data.Dataset.zip((test_data,))
            dataset = dataset.map(_merge_datasets)
            dataset = dataset.prefetch(
                buffer_size=tf.data.experimental.AUTOTUNE)
            return dataset

        return make_input_dataset

    # Evaluate only whole batches; the remainder is dropped.
    num_eval_images = NUM_EVAL_IMAGES[FLAGS.target_dataset]
    eval_steps = num_eval_images // FLAGS.train_batch_size
    classifier.evaluate(
        input_fn=get_dataset('test'),
        steps=eval_steps,
        checkpoint_path=FLAGS.ckpt_path,
    )
def train_and_eval(params,
                   model_fn,
                   input_fn,
                   keep_checkpoint_every_n_hours=0.5,
                   save_checkpoints_secs=100,
                   eval_steps=0,
                   eval_start_delay_secs=10,
                   eval_throttle_secs=100,
                   save_summary_steps=50):
    """Trains and evaluates our model. Supports local and distributed training.

    Args:
      params: ConfigParams class with model training and network parameters.
      model_fn: A func with prototype model_fn(features, labels, mode, hparams).
      input_fn: A input function for the tf.estimator.Estimator.
      keep_checkpoint_every_n_hours: Number of hours between each checkpoint
        to be saved.
      save_checkpoints_secs: Save checkpoints every this many seconds.
      eval_steps: Number of steps to evaluate model; 0 for one epoch.
      eval_start_delay_secs: Start evaluating after waiting for this many
        seconds.
      eval_throttle_secs: Do not re-evaluate unless the last evaluation was
        started at least this many seconds ago
      save_summary_steps: Save summaries every this many steps.
    """
    mparams = params.model_params
    run_config = tf_estimator.RunConfig(
        keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
        save_checkpoints_secs=save_checkpoints_secs,
        save_summary_steps=save_summary_steps)
    # Prefer the model dir chosen by the run config (e.g. from the
    # distributed-training environment) over the one in params.
    if run_config.model_dir:
        params.model_dir = run_config.model_dir
    print('\nCreating estimator with model dir %s' % params.model_dir)
    estimator = tf_estimator.Estimator(model_fn=model_fn,
                                       model_dir=params.model_dir,
                                       config=run_config,
                                       params=params)
    print('\nCreating train_spec')
    train_spec = tf_estimator.TrainSpec(input_fn=input_fn(params,
                                                          split='train'),
                                        max_steps=params.steps)
    print('\nCreating eval_spec')

    def serving_input_receiver_fn():
        """Serving input_fn that builds features from placeholders.

        Returns:
          A tf.estimator.export.ServingInputReceiver.
        """
        modelx = mparams.modelx
        modely = mparams.modely
        offsets = keras.Input(shape=(3, ), name='offsets', dtype='float32')
        hom = keras.Input(shape=(3, 3), name='hom', dtype='float32')
        to_world = keras.Input(shape=(4, 4), name='to_world_L',
                               dtype='float32')
        img_l = keras.Input(shape=(modely, modelx, 3), name='img_L',
                            dtype='float32')
        img_r = keras.Input(shape=(modely, modelx, 3), name='img_R',
                            dtype='float32')
        features = {
            'img_L': img_l,
            'img_R': img_r,
            'to_world_L': to_world,
            'offsets': offsets,
            'hom': hom
        }
        return tf_estimator.export.build_raw_serving_input_receiver_fn(
            features)

    class SaveModel(tf_estimator.SessionRunHook):
        """Saves a model in SavedModel format."""

        def __init__(self, estimator, output_dir):
            self.output_dir = output_dir
            self.estimator = estimator
            self.save_num = 0  # checkpoint step of the last export

        def begin(self):
            # Export a SavedModel at session start, at most once per 4000
            # checkpointed steps.
            ckpt = self.estimator.latest_checkpoint()
            print('Latest checkpoint in hook:', ckpt)
            ckpt_num_str = ckpt.split('.ckpt-')[1]
            if (int(ckpt_num_str) - self.save_num) > 4000:
                fname = os.path.join(self.output_dir,
                                     'saved_model-' + ckpt_num_str)
                print('**** Saving model in train hook: %s' % fname)
                self.estimator.export_saved_model(fname,
                                                  serving_input_receiver_fn())
                self.save_num = int(ckpt_num_str)

    saver_hook = SaveModel(estimator, params.model_dir)
    if eval_steps == 0:
        eval_steps = None  # None = evaluate until the input is exhausted
    eval_spec = tf_estimator.EvalSpec(input_fn=input_fn(params, split='val'),
                                      steps=eval_steps,
                                      hooks=[saver_hook],
                                      start_delay_secs=eval_start_delay_secs,
                                      throttle_secs=eval_throttle_secs)
    # Only the chief worker writes the resolved params next to the
    # checkpoints.
    if run_config.is_chief:
        outdir = params.model_dir
        if outdir is not None:
            print('Writing params to %s' % outdir)
            os.makedirs(outdir, exist_ok=True)
            params.write_yaml(os.path.join(outdir, 'params.yaml'))
    print('\nRunning estimator')
    tf_estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    print('\nSaving last model')
    ckpt = estimator.latest_checkpoint()
    print('Last checkpoint:', ckpt)
    ckpt_num_str = ckpt.split('.ckpt-')[1]
    fname = os.path.join(params.model_dir, 'saved_model-' + ckpt_num_str)
    print('**** Saving last model: %s' % fname)
    estimator.export_saved_model(fname, serving_input_receiver_fn())
params['nu'] = 0. params['cutoff'] = 0.5 params['add_summary'] = True params['beta_0'] = 1.2 model_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mnist_model') pprint(params) print('placing model artifacts in {}'.format(model_dir)) # define model and data model = MNISTModel() mnist_data = Mnist(params['batch_size']) run_config = estimator.RunConfig( save_checkpoints_steps=params['steps_per_epoch'], save_summary_steps=200, keep_checkpoint_max=10) mnist_estimator = estimator.Estimator(model_dir=model_dir, model_fn=model.model_fn, params=params, config=run_config) # training/evaluation specs for run train_spec = estimator.TrainSpec( input_fn=mnist_data.build_training_data, max_steps=params['total_steps_train'], ) eval_spec = estimator.EvalSpec(input_fn=mnist_data.build_validation_data, steps=None,
def main(argv):
    """Trains and/or evaluates the RetinaNet model according to FLAGS.mode.

    Raises:
      RuntimeError: If a flag required by the selected mode is missing.
    """
    del argv  # Unused.

    # Validate mode-specific flags.
    # BUGFIX: the original compared with `FLAGS.mode is 'train'` /
    # `is 'eval'`; identity comparison against a string literal is
    # implementation-dependent and unreliable — use `==`.
    if FLAGS.mode == 'train' and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode == 'eval':
        if FLAGS.valid_data_dir is None:
            raise RuntimeError(
                'You must specify --valid_data_dir for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

    # Parse hparams and fold in the flag-driven settings.
    hparams = retinanet_model.default_hparams()
    hparams.parse(FLAGS.hparams)
    params = dict(
        hparams.values(),
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        fine_tune_checkpoint=FLAGS.fine_tune_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
    )

    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    run_config = estimator.RunConfig(model_dir=FLAGS.model_dir,
                                     session_config=config_proto)

    if FLAGS.mode == 'train':
        train_estimator = estimator.Estimator(
            model_fn=retinanet_model.retinanet_model_fn,
            config=run_config,
            params=params)
        train_estimator.train(
            input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern,
                FLAGS.train_batch_size,
                is_training=True,
            ),
            max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                          FLAGS.train_batch_size))

        if FLAGS.eval_after_training:
            # Run evaluation after training finishes, with input
            # randomization disabled.
            eval_params = dict(
                params,
                use_tpu=False,
                input_rand_hflip=False,
                skip_crowd=False,
                resnet_checkpoint=None,
                is_training_bn=False,
            )
            eval_estimator = tpu_estimator.TPUEstimator(
                model_fn=retinanet_model.retinanet_model_fn,
                use_tpu=False,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=1,
                config=run_config,
                params=eval_params)
            eval_results = eval_estimator.evaluate(
                input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                                is_training=False),
                steps=FLAGS.eval_steps)
            tf.logging.info('Eval results: %s' % eval_results)
    elif FLAGS.mode == 'eval':
        # Eval only runs on CPU or GPU host with batch_size = 1; disable
        # randomization in the input pipeline.
        eval_params = dict(params,
                           input_rand_hflip=False,
                           skip_crowd=False,
                           resnet_checkpoint=None,
                           is_training_bn=False)
        eval_estimator = estimator.Estimator(
            model_fn=retinanet_model.retinanet_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=eval_params)

        def terminate_eval():
            tf.logging.info(
                'Terminating eval after %d seconds of no checkpoints' %
                FLAGS.eval_timeout)
            return True

        # Run evaluation whenever there's a new checkpoint.
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):
            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, 1, is_training=False),
                    steps=FLAGS.eval_steps)
                tf.logging.info('Eval results: %s' % eval_results)
                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break
            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU
                # worker, sometimes the TPU worker does not finish
                # initializing until long after the CPU job tells it to start
                # evaluating. In this case, the checkpoint file could have
                # been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)
    else:
        tf.logging.info('Mode not found.')
def create_tf_estimator_model(directory, export, training_steps=100,
                              use_v1_estimator=False):
    """Trains a DNNClassifier on the bundled iris CSV and optionally exports it."""
    CSV_COLUMN_NAMES = [
        "SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species"
    ]
    train = pd.read_csv(
        os.path.join(os.path.dirname(__file__), "iris_training.csv"),
        names=CSV_COLUMN_NAMES,
        header=0,
    )
    train_y = train.pop("Species")

    def input_fn(features, labels, training=True, batch_size=256):
        """An input function for training or evaluating"""
        # Convert the inputs to a Dataset; shuffle/repeat only when training.
        dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
        if training:
            dataset = dataset.shuffle(1000).repeat()
        return dataset.batch(batch_size)

    my_feature_columns = [
        tf.feature_column.numeric_column(key=key) for key in train.keys()
    ]
    feature_spec = {
        feature: tf.Variable([], dtype=tf.float64, name=feature)
        for feature in CSV_COLUMN_NAMES
    }
    receiver_fn = tf_estimator.export.build_raw_serving_input_receiver_fn(
        feature_spec)

    run_config = tf_estimator.RunConfig(
        # Emit loss metrics to TensorBoard every step
        save_summary_steps=1,
    )

    # The v1 classifier extends Estimator; the v2 one extends EstimatorV2.
    classifier_cls = (tf.compat.v1.estimator.DNNClassifier
                      if use_v1_estimator else tf_estimator.DNNClassifier)
    classifier = classifier_cls(
        feature_columns=my_feature_columns,
        # Two hidden layers of 30 and 10 nodes.
        hidden_units=[30, 10],
        # The model must choose between 3 classes.
        n_classes=3,
        model_dir=directory,
        config=run_config,
    )
    classifier.train(input_fn=lambda: input_fn(train, train_y, training=True),
                     steps=training_steps)
    if export:
        classifier.export_saved_model(directory, receiver_fn)
def main(argv):
    """Train or evaluate the ResNet model, optionally warm-started for
    gradient-regularized fine-tuning from an existing checkpoint."""
    del argv  # Unused.
    params = imagenet_params  # NOTE(review): mutated in place, not copied.
    params['dataset'] = 'imagenet'
    if FLAGS.use_checkpoint:
        params['train_steps'] = FLAGS.finetune_steps
    # tests workflow with limited number of images
    params['test_small_sample'] = False
    if FLAGS.test_workflow:
        # Tiny batch/step counts so the full pipeline can be smoke-tested.
        params['train_batch_size'] = 2
        params['eval_batch_size'] = 2
        params['batch_size'] = 2
        params['num_train_images'] = 10
        params['num_eval_images'] = 10
        params['num_val_images'] = 10
        params['train_steps'] = 4
        params['test_small_sample'] = True
    # we pass the updated eval and train string to the params dictionary.
    params['use_tpu'] = FLAGS.use_tpu
    params['num_cores'] = FLAGS.num_cores
    params['sloppy_shuffle'] = True
    params['momentum'] = FLAGS.momentum
    params['mode'] = FLAGS.mode
    # NOTE(review): self-assignment — has no effect; likely leftover code.
    params['num_train_images'] = params['num_train_images']
    if FLAGS.mode == 'eval':
        params['batch_size'] = params['eval_batch_size']
    if FLAGS.mode == 'train':
        params['batch_size'] = params['train_batch_size']
    # NOTE(review): another no-op self-assignment.
    params['base_learning_rate'] = params['base_learning_rate']
    params['num_workers'] = FLAGS.num_workers
    params['regularizer'] = FLAGS.regularizer
    params['regularize_gradients'] = FLAGS.regularize_gradients
    params['reg_scale'] = FLAGS.reg_scale
    params['use_checkpoint'] = FLAGS.use_checkpoint
    params['visualize_image'] = False
    # for the constrained explanations, we will want to use a clean checkpoint
    # loaded and constrained for a few steps.
    if FLAGS.use_checkpoint:
        reg_scale = FLAGS.reg_scale
        # create a new save pathway to log the hyperparameters used during
        # regularization stage.
        if FLAGS.regularize_gradients:
            # Output path encodes the regularization hyperparameters.
            params['output_dir'] = os.path.join(FLAGS.output_dir, 'regularizer',
                                                FLAGS.regularizer, FLAGS.noise,
                                                str(reg_scale),
                                                str(FLAGS.multiple_image_std),
                                                str(FLAGS.finetune_steps))
        else:
            params['output_dir'] = os.path.join(FLAGS.output_dir, 'regularizer',
                                                'baseline', str(0.0))
        if not tf.io.gfile.isdir(params['output_dir']):
            tf.io.gfile.makedirs(params['output_dir'])
        # Warm-start from the most recent checkpoint in the given directory.
        warm_start_from = tf.train.latest_checkpoint(FLAGS.ckpt_directory)
    else:
        warm_start_from = None
        params['output_dir'] = FLAGS.output_dir
    # Select batch size and data directory based on the run mode.
    if FLAGS.mode == 'train':
        params['batch_size'] = params['train_batch_size']
        params['data_dir'] = params['train_directory']
    else:
        params['batch_size'] = params['eval_batch_size']
        params['data_dir'] = params['eval_directory']
    run_config = tf_estimator.RunConfig(save_summary_steps=300,
                                        save_checkpoints_steps=1000,
                                        log_step_count_steps=100)
    classifier = tf_estimator.Estimator(model_fn=resnet_model_fn,
                                        config=run_config,
                                        params=params,
                                        warm_start_from=warm_start_from)
    eval_steps = params['num_eval_images'] // params['eval_batch_size']
    if FLAGS.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(params['output_dir']):
            tf.logging.info('Starting to evaluate.')
            try:
                classifier.evaluate(input_fn=data_helper.input_fn,
                                    steps=eval_steps,
                                    checkpoint_path=ckpt)
                # Checkpoint basenames look like "model.ckpt-<step>"; stop
                # once the final training step has been evaluated.
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params['train_steps']:
                    tf.logging.info('Evaluation finished')
                    break
            except tf.errors.NotFoundError:
                # The checkpoint may have been garbage-collected before we
                # got to it; skip and wait for the next one.
                tf.logging.info(
                    'Checkpoint was not found, skipping checkpoint.')
    else:
        if FLAGS.mode == 'train':
            print('start training...')
            classifier.train(input_fn=data_helper.input_fn,
                             max_steps=params['train_steps'])
def main(unused_argv):
    """Train a classifier on a TFDS dataset, optionally warm-starting most
    variables from an existing checkpoint."""
    tf.set_random_seed(FLAGS.random_seed)
    save_checkpoints_steps = 100
    run_config_args = {
        'model_dir': FLAGS.model_dir,
        'save_checkpoints_steps': save_checkpoints_steps,
        'log_step_count_steps': FLAGS.log_step_count_steps,
        'keep_checkpoint_max': 200,
    }
    config = tf_estimator.RunConfig(**run_config_args)

    if FLAGS.warm_start_ckpt_path:
        var_names = []
        checkpoint_path = FLAGS.warm_start_ckpt_path
        reader = tf.train.NewCheckpointReader(checkpoint_path)
        for key in reader.get_variable_to_shape_map():
            # Exclude optimizer slots, step counters and the final dense
            # layer from warm-starting; everything else is restored.
            keep_str = 'Momentum|global_step|finetune_global_step|Adam|final_dense_dst'
            if not re.findall('({})'.format(keep_str,), key):
                var_names.append(key)
        tf.logging.info('Warm-starting tensors: %s', sorted(var_names))
        vars_to_warm_start = var_names
        warm_start_settings = tf_estimator.WarmStartSettings(
            ckpt_to_initialize_from=checkpoint_path,
            vars_to_warm_start=vars_to_warm_start)
    else:
        warm_start_settings = None

    classifier = tf_estimator.Estimator(
        get_model_fn(), config=config, warm_start_from=warm_start_settings)

    def _merge_datasets(train_batch):
        # Repack a TFDS batch dict into the (features, labels) pair the
        # model_fn expects.
        feature, label = train_batch['image'], train_batch['label'],
        features = {
            'feature': feature,
        }
        labels = {
            'label': label,
        }
        return (features, labels)

    def get_dataset(dataset_split):
        """Returns dataset creation function."""

        def make_input_dataset():
            """Returns input dataset."""
            train_data = tfds.load(name=FLAGS.target_dataset,
                                   split=dataset_split)
            train_data = train_data.shuffle(1024).repeat().batch(
                FLAGS.train_batch_size)
            # zip with a 1-tuple so _merge_datasets receives the batch dict.
            dataset = tf.data.Dataset.zip((train_data,))
            dataset = dataset.map(_merge_datasets)
            dataset = dataset.prefetch(
                buffer_size=tf.data.experimental.AUTOTUNE)
            return dataset

        return make_input_dataset

    # Resume from the step recorded in the latest checkpoint, if any.
    # pylint: disable=protected-access
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)
    train_steps = FLAGS.train_steps
    # Train in 500-step slices so a checkpoint is produced at each boundary.
    while current_step < train_steps:
        print('Run {}'.format(current_step))
        next_checkpoint = current_step + 500
        classifier.train(input_fn=get_dataset('train'),
                         max_steps=next_checkpoint)
        current_step = next_checkpoint
def _config_fn(self):
    """Return the estimator RunConfig used for checkpointing."""
    run_settings = {
        'model_dir': './ckpt',
        'save_checkpoints_steps': 500,
        'keep_checkpoint_max': 10,
    }
    return estimator.RunConfig(**run_settings)
def train_and_eval():
    """Train and Evaluate.

    Builds a TF-Ranking groupwise estimator (single- or multi-head depending
    on _use_multi_head()), trains it with train_and_evaluate, then runs a
    final evaluation on the test set.
    """
    features, labels = load_libsvm_data(FLAGS.train_path, FLAGS.list_size)
    train_input_fn, train_hook = get_train_inputs(features, labels,
                                                  FLAGS.train_batch_size)

    features_vali, labels_vali = load_libsvm_data(FLAGS.vali_path,
                                                  FLAGS.list_size)
    vali_input_fn, vali_hook = get_eval_inputs(features_vali, labels_vali)

    features_test, labels_test = load_libsvm_data(FLAGS.test_path,
                                                  FLAGS.list_size)
    test_input_fn, test_hook = get_eval_inputs(features_test, labels_test)

    optimizer = tf.compat.v1.train.AdagradOptimizer(
        learning_rate=FLAGS.learning_rate)

    def _train_op_fn(loss):
        """Defines train op used in ranking head."""
        # Group UPDATE_OPS with the minimize op so e.g. batch-norm style
        # update ops run alongside each training step.
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        minimize_op = optimizer.minimize(
            loss=loss, global_step=tf.compat.v1.train.get_global_step())
        train_op = tf.group([minimize_op, update_ops])
        return train_op

    if _use_multi_head():
        # Two ranking heads trained jointly; the secondary loss is weighted
        # by FLAGS.secondary_loss_weight relative to the primary (1.0).
        primary_head = tfr.head.create_ranking_head(
            loss_fn=tfr.losses.make_loss_fn(FLAGS.loss),
            eval_metric_fns=get_eval_metric_fns(),
            train_op_fn=_train_op_fn,
            name=_PRIMARY_HEAD)
        secondary_head = tfr.head.create_ranking_head(
            loss_fn=tfr.losses.make_loss_fn(FLAGS.secondary_loss),
            eval_metric_fns=get_eval_metric_fns(),
            train_op_fn=_train_op_fn,
            name=_SECONDARY_HEAD)
        ranking_head = tfr.head.create_multi_ranking_head(
            [primary_head, secondary_head], [1.0, FLAGS.secondary_loss_weight])
    else:
        ranking_head = tfr.head.create_ranking_head(
            loss_fn=tfr.losses.make_loss_fn(FLAGS.loss),
            eval_metric_fns=get_eval_metric_fns(),
            train_op_fn=_train_op_fn)

    estimator = tf_estimator.Estimator(
        model_fn=tfr.model.make_groupwise_ranking_fn(
            group_score_fn=make_score_fn(),
            group_size=FLAGS.group_size,
            transform_fn=make_transform_fn(),
            ranking_head=ranking_head),
        config=tf_estimator.RunConfig(FLAGS.output_dir,
                                      save_checkpoints_steps=1000))

    train_spec = tf_estimator.TrainSpec(input_fn=train_input_fn,
                                        hooks=[train_hook],
                                        max_steps=FLAGS.num_train_steps)
    # Export model to accept tf.Example when group_size = 1.
    if FLAGS.group_size == 1:
        vali_spec = tf_estimator.EvalSpec(
            input_fn=vali_input_fn,
            hooks=[vali_hook],
            steps=1,
            exporters=tf_estimator.LatestExporter(
                "latest_exporter",
                serving_input_receiver_fn=make_serving_input_fn()),
            start_delay_secs=0,
            throttle_secs=30)
    else:
        vali_spec = tf_estimator.EvalSpec(input_fn=vali_input_fn,
                                          hooks=[vali_hook],
                                          steps=1,
                                          start_delay_secs=0,
                                          throttle_secs=30)

    # Train and validate
    tf_estimator.train_and_evaluate(estimator, train_spec, vali_spec)

    # Evaluate on the test data.
    estimator.evaluate(input_fn=test_input_fn, hooks=[test_hook])
def main(_):
    """Train, evaluate and/or predict with a BERT emotion classifier.

    Driven by FLAGS: do_train runs train_and_evaluate, calculate_metrics
    evaluates on a labeled dev-format file, do_predict writes per-class
    probabilities and the top-3 predicted emotions to TSV files.
    """
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"

    # Load emotion categories
    with open(FLAGS.emotion_file, "r") as f:
        all_emotions = f.read().splitlines()
    if FLAGS.add_neutral:
        all_emotions = all_emotions + ["neutral"]
    # Index -> emotion-name lookup used when reporting predictions.
    idx2emotion = {i: e for i, e in enumerate(all_emotions)}
    num_labels = len(all_emotions)
    print("%d labels" % num_labels)
    print("Multilabel: %r" % FLAGS.multilabel)

    tf.logging.set_verbosity(tf.logging.INFO)

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict' must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    processor = DataProcessor(num_labels, FLAGS.data_dir)

    # set up preprocessor
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    run_config = tf_estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_examples("train", FLAGS.train_fname)
        eval_examples = processor.get_examples("dev", FLAGS.dev_fname)
        num_eval_examples = len(eval_examples)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        # Persist the run configuration alongside the model outputs.
        params = {
            "num_labels": num_labels,
            "learning_rate": FLAGS.learning_rate,
            "num_train_epochs": FLAGS.num_train_epochs,
            "warmup_proportion": FLAGS.warmup_proportion,
            "batch_size": FLAGS.train_batch_size,
            "num_train_examples": len(train_examples),
            "num_eval_examples": num_eval_examples,
            "data_dir": FLAGS.data_dir,
            "output_dir": FLAGS.output_dir,
            "train_fname": FLAGS.train_fname,
            "dev_fname": FLAGS.dev_fname,
            "test_fname": FLAGS.test_fname
        }
        with open(os.path.join(FLAGS.output_dir, "config.json"), "w") as f:
            json.dump(params, f)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=num_labels,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        multilabel=FLAGS.multilabel,
        idx2emotion=idx2emotion)

    estimator = tf_estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": FLAGS.train_batch_size})

    if FLAGS.do_train:
        # Serialize examples to TFRecord files consumed by the input fns.
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)
        tf.logging.info("***** Running training and evaluation *****")
        tf.logging.info(" Num train examples = %d", len(train_examples))
        tf.logging.info(" Num eval examples = %d", num_eval_examples)
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num training steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            num_labels=num_labels)
        train_spec = tf_estimator.TrainSpec(
            input_fn=train_input_fn, max_steps=num_train_steps)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False,
            num_labels=num_labels)
        eval_spec = tf_estimator.EvalSpec(
            input_fn=eval_input_fn,
            steps=FLAGS.eval_steps,
            start_delay_secs=0,
            throttle_secs=1000)
        tf_estimator.train_and_evaluate(
            estimator, train_spec=train_spec, eval_spec=eval_spec)

    if FLAGS.calculate_metrics:
        # Setting the parameter to "dev" ensures that we get labels for the examples
        eval_examples = processor.get_examples("dev", FLAGS.test_fname)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num eval examples = %d", len(eval_examples))
        eval_file = os.path.join(FLAGS.output_dir,
                                 FLAGS.test_fname + ".tf_record")
        file_based_convert_examples_to_features(eval_examples,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False,
            num_labels=num_labels)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=None)
        output_eval_file = os.path.join(FLAGS.output_dir,
                                        FLAGS.test_fname + ".eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_examples("test", FLAGS.test_fname)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(FLAGS.output_dir,
                                    FLAGS.test_fname + ".tf_record")
        file_based_convert_examples_to_features(predict_examples,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)
        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False,
            num_labels=num_labels)
        result = estimator.predict(input_fn=predict_input_fn)
        # Two outputs: all class probabilities, and a human-readable top-3
        # emotion/probability table per input text.
        output_predict_file = os.path.join(
            FLAGS.output_dir, FLAGS.test_fname + ".predictions.tsv")
        output_labels = os.path.join(
            FLAGS.output_dir, FLAGS.test_fname + ".label_predictions.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            with tf.gfile.GFile(output_labels, "w") as writer2:
                writer.write("\t".join(all_emotions) + "\n")
                writer2.write("\t".join([
                    "text", "emotion_1", "prob_1", "emotion_2", "prob_2",
                    "emotion_3", "prob_3"
                ]) + "\n")
                tf.logging.info("***** Predict results *****")
                num_written_lines = 0
                for (i, prediction) in enumerate(result):
                    probabilities = prediction["probabilities"]
                    if i >= num_actual_predict_examples:
                        break
                    output_line = "\t".join(
                        str(class_probability)
                        for class_probability in probabilities) + "\n"
                    # Top-3 emotions by descending probability.
                    sorted_idx = np.argsort(-probabilities)
                    top_3_emotion = [idx2emotion[idx] for idx in sorted_idx[:3]]
                    top_3_prob = [probabilities[idx] for idx in sorted_idx[:3]]
                    pred_line = []
                    for emotion, prob in zip(top_3_emotion, top_3_prob):
                        # Only report emotions above the cutoff; keep the
                        # column layout by emitting empty fields otherwise.
                        if prob >= FLAGS.pred_cutoff:
                            pred_line.extend([emotion, "%.4f" % prob])
                        else:
                            pred_line.extend(["", ""])
                    writer.write(output_line)
                    writer2.write(predict_examples[i].text + "\t" +
                                  "\t".join(pred_line) + "\n")
                    num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
def main(argv): args = parser.parse_args(argv[1:]) # handling commandline parameters logging.info("Cmdline Input: {}".format(argv)) TRAINING = args.training WITHPLOT = args.plot singleData = args.single FAKE = args.fake numberPrint = args.plotNo hyperParamFile = args.hyperparams saving = args.save loading = args.load augment = args.augment filterBool = args.filter overrideModelPath = args.overrideModel overrideInputPath = args.overrideInput usingCustomEstimator = args.custom displayWeights = args.dispWeights DEBUG = args.debug tensorboardDebugAddress = args.tensorboard_debug_address progressPlot = args.progressPlot maximumLossAnalysis = args.lossAna cancelThreshold = args.target # Commandline parameters sanity checks saveLoc = None if args.save is not None and args.load is not None: raise ValueError( "The --load and --save flags are mutually exclusive.") if args.save is not None and len(args.save) not in (0, 1): parser.error('Either give no values for save, or two, not {}.'.format(len(args.save))) elif args.save is not None: if len(args.save) == 0: # save to default location saveLoc = None elif len(args.save) == 1: # custom save location saveLoc = args.save[0] loadLoc = None if args.load is not None and len(args.load) not in (0, 1): parser.error('Either give no values for load, or one, not {}.'.format(len(args.load))) sys.exit(-1) elif args.load is not None: if len(args.load) == 0: # save to default location loadLoc = None elif len(args.load) == 1: # custom save location loadLoc = args.load[0] if args.separator is not None and FAKE: parser.error('No fake data for separator training (yet)') if args.separator is not None and len(args.separator) not in (0, 2): parser.error('Separator needs 2 Integers representing prediction Close off and separator position: given {}'.format(len(args.separator))) elif args.separator is not None: separator = True if len(args.separator) == 0: separatorPosition = 1550 predictionCutOff = 1300 else: separatorPosition = args.separator[0] 
predictionCutOff = args.separator[1] else: separator = False if cancelThreshold is not None and not TRAINING: logging.warning("target parameter is not useful when not in training") time_stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H.%M.%S') # load hyperparameters from hyperparameter file try: hyper_params = load_params(hyperParamFile) STEPS_PER_EPOCH = hyper_params.train.steps_per_epoch EPOCHS = hyper_params.train.epochs BATCH_SIZE = hyper_params.train.batch_size FEATURE_SIZE = hyper_params.arch.feature_size ACTIVATION = hyper_params.arch.activation # "leaky_relu", "relu", "linear", TODO: "sigmoid", "tanh" dropout = hyper_params.arch.dropout_rate hidden_layers = hyper_params.arch.hidden_layers regularization = hyper_params.arch.regularization if regularization is None or regularization.lower() == "no": l1regularization = False l2regularization = False elif regularization.lower() == "l1": l1regularization = True l2regularization = False elif regularization.lower() == "l2": l1regularization = False l2regularization = True else: raise AttributeError('invalid string in hyper_params.arch.regularization') if FAKE: FAKE_DATA_AMOUNT = hyper_params.data.numberFakeLines if augment: MIDPOINT = hyper_params.data.augmentMidpoint MIRRORRANGE = hyper_params.data.augmentRange testSize = hyper_params.data.testSize limits = hyper_params.data.limits elementsDirection = hyper_params.data.direction if elementsDirection.lower() == "y": elementsDirectionBool = True elif elementsDirection.lower() == "x": elementsDirectionBool = False unitLocDirection = hyper_params.data.unitLoc unitTimeDirection = hyper_params.data.unitTime units = {'loc': unitLocDirection, 'time':unitTimeDirection} optimizer = hyper_params.train.optimizer # "Adam", "Adagrad" learningRate = hyper_params.train.learning_rate decaySteps = hyper_params.train.decay_steps if overrideInputPath is None: dataFolder = hyper_params.problem.data_path else: dataFolder = overrideInputPath baseModelPath = 
hyper_params.problem.modelBasePath baseImagePath = hyper_params.problem.imagePath if args.separator is None: if hyper_params.problem.separator == 1: separator = True separatorPosition = hyper_params.problem.separatorPosition predictionCutOff = hyper_params.problem.predictionCutOff thresholdPoint = hyper_params.problem.thresholdPoint else: separator = False except AttributeError as err: logging.error("Error in Parameters. Maybe mistake in hyperparameter file?") logging.error("AttributeError: {0}".format(err)) sys.exit(1) except Exception as e: logging.error("Some kind of error? not sure: {}".format(e)) sys.exit(1) if loading is None: # Generate feature-label-pairs from given csv track files based on given parameters if not FAKE and not separator: (F_train, L_train), (F_test, L_test), (labelMeans, labelStds) = ld.loadRawMeasNextStep(dataFolder, FEATURE_SIZE, testSize) elif separator: (F_train, L_train), (F_test, L_test), (labelMeans, labelStds) = ld.loadRawMeasSeparation(dataFolder, FEATURE_SIZE, testSize, separatorPosition, predictionCutOff, elementsDirectionBool) if filterBool: F_train = filterDataForIntersection(F_train, thresholdPoint, elementsDirectionBool) F_test = filterDataForIntersection(F_test, thresholdPoint, elementsDirectionBool) L_train = L_train.loc[F_train.index] L_test = L_test.loc[F_test.index] else: (F_train, L_train), (F_test, L_test) = ld.loadFakeDataPandas(FEATURE_SIZE, FAKE_DATA_AMOUNT, testSize) # TODO: ziemlich unschön - das könnte man noch besser machen if singleData: F_train = pd.concat([F_train, F_test]) F_test = F_train L_train = pd.concat([L_train, L_test]) L_test = L_train # ExTODO: find Augmentation MIDPOINT from data or as argument? 
- from Argument # Applying augmentation to feature-label-pairs if augment: logging.info("applying augmentation to Training Set...") if separator: F_train, L_train = augmentData(F_train, L_train, MIDPOINT, MIRRORRANGE, separator, labelMeans, labelStds, direction=elementsDirectionBool) else: F_train, L_train = augmentData(F_train, L_train, MIDPOINT, MIRRORRANGE, separator, labelMeans, labelStds, direction=elementsDirectionBool) state = random.randint(1, 101) F_train = F_train.sample(frac=1, random_state=state) L_train = L_train.sample(frac=1, random_state=state) logging.info("done!") # Network Design # -------------- my_feature_columns = [] columnNames = ld.genColumnNames(FEATURE_SIZE) for key in columnNames: my_feature_columns.append(tf.feature_column.numeric_column(key=key)) if not overrideModelPath: MODEL_PATH = baseModelPath # genModelPath(hyper_params, FAKE, usingCustomEstimator, separator) else: MODEL_PATH = overrideModelPath logging.info("time: {}".format(time_stamp)) logging.info('Saving to %s' % MODEL_PATH) # Preparing the initialisation of the estimator if optimizer == 'Adagrad': opti = tf.train.AdagradOptimizer elif optimizer == 'Adam': opti = tf.train.AdamOptimizer # elif optimizer == 'GradientDescent': # opti = tf.train.GradientDescentOptimizer else: logging.error("No (or wrong) optimizer given in hyperparameter file") sys.exit(-1) if ACTIVATION == 'relu': acti = tf.nn.relu elif ACTIVATION == 'leaky_relu': acti = tf.nn.leaky_relu elif ACTIVATION == 'linear': acti = None else: logging.error("No (or wrong) activation function given in hyperparameter file") sys.exit(-1) # File System preparation: check if right folders exist and create them if they dont if not os.path.exists(MODEL_PATH): os.makedirs(MODEL_PATH) logging.info("model folder {} does not exist. 
Creating folder".format(MODEL_PATH)) elif os.path.exists(MODEL_PATH) and not os.path.isdir(MODEL_PATH): logging.error("There is a file in the place where one would like to save their files..") sys.exit(1) if not os.path.exists(baseImagePath): os.makedirs(baseImagePath) logging.info("image folder: {} does not exist. Creating folder".format(MODEL_PATH)) if not os.path.exists(MODEL_PATH + '/' + os.path.basename(hyperParamFile)): shutil.copy2(hyperParamFile, MODEL_PATH + '/' + os.path.basename(MODEL_PATH + hyperParamFile)) # print("new hyperParam File written") else: shutil.copy2(hyperParamFile, MODEL_PATH + '/' + os.path.basename(hyperParamFile)[:-5] + time_stamp + ".json") # print("added another version of hyper param file") # Saving the generated feature-label-pairs for future use if saving is not None: logging.info("storing data in {}".format(saveLoc)) if saveLoc is None: saveLoc = MODEL_PATH + '/data.h5' with pd.HDFStore(saveLoc) as store: store['xtrain'] = F_train store['ytrain'] = L_train store['xtest'] = F_test store['ytest'] = L_test store['labelMeans'] = labelMeans store['labelStds'] = labelStds # loading a set of pregenerated feature-label-pairs for usage if loading is not None: try: if loadLoc is None: loadLoc = MODEL_PATH + '/data.h5' logging.info("loading data from {}.".format(loadLoc)) with pd.HDFStore(loadLoc) as store: F_train = store['xtrain'] L_train = store['ytrain'] F_test = store['xtest'] L_test = store['ytest'] labelMeans = store['labelMeans'] labelStds = store['labelStds'] except Exception as e: logging.error("Error while loading from stored data: {}".format(e)) sys.exit(1) assert not F_train.index.duplicated().any() assert not L_train.index.duplicated().any() assert not F_test.index.duplicated().any() assert not L_test.index.duplicated().any() # Plot progress Vars - more or less deprecated, but could be updated for current state if progressPlot: pos = [int(i * EPOCHS/10) for i in range(1, 10)] debugVisualizerIndex = random.randint(1, 
F_test.shape[0]) featureVals = F_test.iloc[[debugVisualizerIndex]] labelVals = L_test.iloc[[debugVisualizerIndex]] predictions = [] if not usingCustomEstimator: # Validation and Test Configuration logging.info("using premade Estimator") test_config = estimator.RunConfig(save_checkpoints_steps=50000, save_checkpoints_secs=None, save_summary_steps=100) regressor = estimator.DNNRegressor(feature_columns=my_feature_columns, label_dimension=2, hidden_units=hidden_layers, model_dir=MODEL_PATH, dropout=dropout, activation_fn=acti, config=test_config, optimizer=opti(learning_rate=learningRate) ) else: logging.info("using custom estimator") test_config = estimator.RunConfig(save_checkpoints_steps=100000, save_checkpoints_secs=None, save_summary_steps=500) useRatioScaling = False # Todo: überlegen ob es hierfür noch eine sinnvolle verwendung gibt if separator and useRatioScaling: medianDim1 = L_train.iloc[:,0].median() medianDim2 = L_train.iloc[:,1].median() ratio = medianDim1 / medianDim2 scaleDim1 = 1.0 scaleDim2 = ratio logging.info("scaling loss between different dimensions. 
ScaleDim2-Ratio: {}".format(ratio)) else: scaleDim1 = 1.0 scaleDim2 = 1.0 regressor = estimator.Estimator( model_fn=cE.myCustomEstimator, config=test_config, model_dir=MODEL_PATH, params={ "feature_columns": my_feature_columns, "learning_rate": learningRate, "optimizer": opti, "hidden_units": hidden_layers, "dropout": dropout, "activation": acti, "decaying_learning_rate": True, "decay_steps": decaySteps, "l1regularization": l1regularization, "l2regularization": l2regularization, "scaleDim1": scaleDim1, "scaleDim2": scaleDim2, "regularizationStrength": 5e-08 }) hooks = None # Debug hooks are handled here if DEBUG and tensorboardDebugAddress: raise ValueError( "The --debug and --tensorboard_debug_address flags are mutually " "exclusive.") if DEBUG: hooks = [tf_debug.LocalCLIDebugHook()] # Start tensorboard with debugger port argument: "tensorboard --logdir=./debug2 --debugger_port 6007" elif tensorboardDebugAddress: hooks = [tf_debug.TensorBoardDebugHook(tensorboardDebugAddress)] # hooks = [debug_hook] logging.info("Train: ({}, {})".format(F_train.shape, L_train.shape)) logging.info("Test: ({}, {})".format(F_test.shape, L_test.shape)) logging.info("Means: \n{}".format(labelMeans)) logging.info("Stds: \n{}".format(labelStds)) # Train it if TRAINING: if not os.path.exists(MODEL_PATH + '/meanstd.pkl'): with open(MODEL_PATH + "/meanstd.pkl", 'wb') as f: pickle.dump([labelMeans, labelStds], f) else: with open(MODEL_PATH + "/meanstd.pkl", 'rb') as f: [labelMeansTemp, labelStdsTemp] = pickle.load(f) if not ((labelMeansTemp == labelMeans).all() and (labelStdsTemp == labelStds).all()): # does this work with float? logging.warning("CAREFUL: LabelMeans or LabelStds do not match existing values! 
Training with new values") logging.info('Train the DNN Regressor...\n') # test = tf.train.get_or_create_global_step() # logging.info("test: {}".format(test)) epochInterm = [] startTimeTraining = timer() for epoch in range(EPOCHS): # Fit the DNNRegressor # regressor.train(input_fn=training_input_fn(batch_size=BATCH_SIZE), steps=STEPS_PER_EPOCH) regressor.train(input_fn=lambda: training_input_fn_Slices(F_train, L_train, BATCH_SIZE), steps=STEPS_PER_EPOCH, hooks=hooks) # Start Tensorboard in Terminal: # tensorboard --logdir='./DNNRegressors/' # Now open Browser and visit localhost:6006\ if epoch % 10 == 0: logging.info("Progress: epoch " + str(epoch)) # logging.info("Progress: global step: {}".format(tf.train.get_global_step())) eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(F_test, L_test, BATCH_SIZE)) logging.info("eval: " + str(eval_dict)) avgLoss = eval_dict['average_loss'] epochInterm.append(avgLoss) # optional canceling of training upon hitting a specified loss threshold if cancelThreshold is not None: if avgLoss < cancelThreshold: logging.info("reached cancel Threshold. finishing training") break if progressPlot and epoch in pos: # TODO: adapt or remove because of standardize and normalize debug_pred = regressor.predict(input_fn=lambda: eval_input_fn(featureVals, labels=None, batch_size=BATCH_SIZE)) debug_predicted = [p['predictions'] for p in debug_pred] predictions.append(debug_predicted) eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(F_test, L_test, BATCH_SIZE)) logging.info("Training completed. 
final average loss: {}, best average loss during training: {}".format( eval_dict['average_loss'], min(epochInterm))) endTimeTraining = timer() timeTotal = endTimeTraining - startTimeTraining hours = timeTotal // 3600 timeTotal %= 3600 minutes = timeTotal // 60 timeTotal %= 60 logging.info("Total Training time: {}h {}min {}s".format(int(hours), int(minutes), int(timeTotal))) if progressPlot: if FAKE: savePath = '/home/hornberger/testFake' else: savePath = '/home/hornberger/testReal' plotTrainDataPandas(featureVals, labelVals, predictions, savePath, units) # Evaluation/Prediction else: logging.info('No training today, just prediction') if not os.path.exists(MODEL_PATH + '/meanstd.pkl'): logging.warning("Careful: No prior LabelMeans or LabelStds found!") else: with open(MODEL_PATH + "/meanstd.pkl", 'rb') as f: [labelMeansTemp, labelStdsTemp] = pickle.load(f) if not ((labelMeansTemp == labelMeans).all() and (labelStdsTemp == labelStds).all()): # does this work with float? logging.warning("evaluation on different dataset. 
replacing current labelMeans and labelStds") L_test = L_test * labelStds + labelMeans labelMeans = labelMeansTemp labelStds = labelStdsTemp logging.info("New labelMeans: \n{}".format(labelMeans)) logging.info("New labelStds: \n{}".format(labelStds)) L_test = (L_test - labelMeans) / labelStds try: # Prediction eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(F_test, L_test, BATCH_SIZE)) logging.info('Error on whole Test set:\nMSE (tensorflow): {}'.format(eval_dict['average_loss'])) averageLoss = eval_dict['average_loss'] except ValueError as err: # probably failed to load model logging.error("{}".format(err)) sys.exit(1) except Exception as e: logging.error("Unknown Error while trying to evaluate: {}".format(e)) sys.exit(1) assert numberPrint < L_test.shape[0] sampleIndex = random.randint(0, L_test.shape[0] - numberPrint) # x_pred2 = F_test.iloc[[sampleIndex + i for i in range(numberPrint)]] # y_vals2 = L_test.iloc[[sampleIndex + i for i in range(numberPrint)]] x_pred2 = F_test.sample(n=numberPrint, random_state=sampleIndex) y_vals2 = L_test.sample(n=numberPrint, random_state=sampleIndex) y_vals2Denormalized = y_vals2.copy() for k in L_test.columns: y_vals2Denormalized[k] = y_vals2Denormalized[k] * labelStds[k] + labelMeans[k] print(x_pred2) print(y_vals2 * labelStds + labelMeans) startTime = timer() y_predGen = regressor.predict(input_fn=lambda: eval_input_fn(x_pred2, labels=None, batch_size=BATCH_SIZE)) y_predicted = [p['predictions'] for p in y_predGen] endTime = timer() print("predicted: ") y_predictedCorr = [[x * b + c for x, b, c in zip(x, labelStds, labelMeans)] for x in y_predicted] # Look, ye mighty, and despair! 
for i in y_predictedCorr: print(i) print("time: {:.2f}s".format((endTime - startTime))) eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(x_pred2, y_vals2, batch_size=BATCH_SIZE)) print('MSE (tensorflow): {}'.format(eval_dict['average_loss'])) # Maximum Loss Analysis: display the X worst predictions of the testset if maximumLossAnalysis: if not separator: printDF = prepareMaximumLossAnalysisNextStep(F_test, L_test, numberPrint, regressor, BATCH_SIZE, labelMeans, labelStds) plotDataNextStepPandas(numberPrint, printDF[columnNames], printDF[['LabelX', 'LabelY']], printDF[['PredictionX', 'PredictionY']], baseImagePath, limits, units, os.path.basename(MODEL_PATH) + '_' + 'highestLoss' + '_' + time_stamp + '.pdf') else: printDF = prepareMaximumLossAnalysisSeparator(F_test, L_test, numberPrint, regressor, BATCH_SIZE, labelMeans, labelStds) # printDF['LabelPosBalken'] = printDF['LabelPosBalken'] * labelStds['LabelPosBalken'] + labelMeans['LabelPosBalken'] plotDataSeparatorPandas(numberPrint, printDF[columnNames], printDF[['LabelPosBalken']], separatorPosition, printDF[['PredictionIntersect']], baseImagePath, limits, units, elementsDirectionBool, os.path.basename(MODEL_PATH) + '_' + 'highestLoss' + '_' + time_stamp + '.pdf') # print(printDF) # displaying weights in Net - (a bit redundant after implementation of debugger) if displayWeights: for variable in regressor.get_variable_names(): logging.info("name: \n{}\nvalue: \n{}\n".format(variable, regressor.get_variable_value(variable))) weights = regressor.get_variable_value('dense/kernel') plt.imshow(weights, cmap='coolwarm') plt.show() # # Final Plot if WITHPLOT: L_trainDenormalized = L_train * labelStds + labelMeans L_testDenormalized = L_test * labelStds + labelMeans if not separator: plotDataNextStepPandas(numberPrint, x_pred2, y_vals2Denormalized, y_predictedCorr, baseImagePath, limits, units, os.path.basename(MODEL_PATH) + '_' + time_stamp + '.pdf') totalPredictGen = regressor.predict(input_fn=lambda: 
eval_input_fn(F_test, labels=None, batch_size=BATCH_SIZE)) totalPredictions = [p['predictions'] for p in totalPredictGen] totalPredictionsCorr = [[x * b + c for x, b, c in zip(x, labelStds, labelMeans)] for x in totalPredictions] # Look, ye mighty, and despair! evaluateResultNextStep(F_test, L_testDenormalized, totalPredictionsCorr, units, baseImagePath) else: # y_vals2Denormalized = y_vals2['LabelPosBalken'] * labelStds['LabelPosBalken'] + labelMeans['LabelPosBalken'] # y_predictedCorr = list(map(lambda x: [v * labelStds[k] + labelMeans[k] for k,v in enumerate(x)], y_predicted)) plotDataSeparatorPandas(numberPrint, x_pred2, y_vals2Denormalized['LabelPosBalken'], separatorPosition, y_predictedCorr, baseImagePath, limits, units, elementsDirectionBool, os.path.basename(MODEL_PATH) + '_' + time_stamp + '.pdf') totalPredictGen = regressor.predict(input_fn=lambda: eval_input_fn(F_test, labels=None, batch_size=BATCH_SIZE)) totalPredictions = [p['predictions'] for p in totalPredictGen] totalPredictionsCorr = [[x * b + c for x, b, c in zip(x, labelStds, labelMeans)] for x in totalPredictions] # Look, ye mighty, and despair! filteredFeatures = filterDataForIntersection(F_train, thresholdPoint, elementsDirectionBool) medianAccel = getMedianAccel(filteredFeatures, separator, elementsDirectionBool) optimalAccel = getOptimalAccel(filteredFeatures, L_trainDenormalized.loc[filteredFeatures.index], separatorPosition, elementsDirectionBool) bias = getCVBias(filteredFeatures, L_trainDenormalized.loc[filteredFeatures.index], separatorPosition, elementsDirectionBool) configDict = {'medAc': medianAccel, 'optAc': optimalAccel, 'cvBias': bias} evaluateResultSeparator(F_test, L_testDenormalized, totalPredictionsCorr, separatorPosition, thresholdPoint, configDict, units, baseImagePath, elementsDirectionBool)