def train(self, train: str, valid: str, feats_only: bool = False) -> None:
    """
    Train a new model with this trainer, or if a model already exists in the save path
    for this trainer, resume training from a checkpoint.
    :param train: path to training corpus
    :param valid: path to validation corpus
    :param feats_only: only extract features, don't train
    """
    if not self._feature_extractor:
        self._init_feature_extractor(train_path=train)
    # read and extract features from training/validation data, serialize to disk
    self._extract_and_write(train)
    self._extract_and_write(valid, test=True)
    if feats_only:
        return
    # compute steps per epoch/checkpoint and early stopping steps
    max_steps, patience, checkpoint_steps, steps_per_epoch = self._compute_steps(train, valid)
    self._training_config.max_steps = max_steps  # update config value for learning rate calculation
    self._training_config.steps_per_epoch = steps_per_epoch
    # train and evaluate using Estimator API
    estimator = self._init_estimator(checkpoint_steps)
    train_spec = self._train_spec(
        train, max_steps, self._training_hooks(estimator, patience, checkpoint_steps))
    eval_spec = self._eval_spec(valid)
    train_and_evaluate(estimator, train_spec=train_spec, eval_spec=eval_spec)
def test_unsupported_task_due_to_not_callable(self):
    unsupported_task = 'alloc'
    tf_config = {
        'cluster': {
            run_config_lib.TaskType.CHIEF: ['host0:0'],
            unsupported_task: ['hos1:1'],
        },
        'task': {
            'type': unsupported_task,
            'index': 0
        }
    }
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
        mock_instance = self._mock_executor_instance()
        mock_instance.run_alloc = 123  # not callable
        mock_executor.return_value = mock_instance
        with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TO_RUN):
            training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
def train():
    model_dir = FLAGS.model_dir
    os.makedirs(model_dir, exist_ok=True)
    hparams = get_hparams(
        model_dir,
        DEFAULT_HPARAMS,
        hparams_file=FLAGS.config,
        hparams_str=FLAGS.hparams,
        validate=True)
    run_config = RunConfig(
        model_dir=model_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        save_summary_steps=FLAGS.save_summary_steps)
    train_input_fn, eval_input_fn = make_input_fns()
    # Model
    estimator = Estimator(
        model_fn=model_fn,
        config=run_config,
        params=hparams)
    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn,
        max_steps=FLAGS.max_steps)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn,
        steps=FLAGS.eval_steps,
        throttle_secs=0)
    train_and_evaluate(
        eval_spec=eval_spec,
        train_spec=train_spec,
        estimator=estimator)
def train():
    model_dir = tf.flags.FLAGS.model_dir
    os.makedirs(model_dir, exist_ok=True)
    print("model_dir={}".format(model_dir))
    run_config = RunConfig(
        model_dir=model_dir,
        save_checkpoints_steps=tf.flags.FLAGS.save_checkpoints_steps)
    hparams = get_hparams(model_dir, validate=True)
    vocabs = read_vocablists(path=tf.flags.FLAGS.data_dir)
    train_input_fn, eval_input_fn, test_input_fn = make_input_fns(
        tf.flags.FLAGS.data_dir, batch_size=tf.flags.FLAGS.batch_size)
    # Model
    model_fn = make_model_fn(hparams=hparams, run_config=run_config, vocabs=vocabs)
    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn, max_steps=tf.flags.FLAGS.max_steps)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn, steps=tf.flags.FLAGS.eval_steps, throttle_secs=0)
    estimator = Estimator(model_fn=model_fn, config=run_config, params=hparams)
    train_and_evaluate(eval_spec=eval_spec, train_spec=train_spec, estimator=estimator)
def test_invalid_estimator(self):
    invalid_estimator = object()
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
    with self.assertRaisesRegexp(TypeError, _INVALID_ESTIMATOR_MSG):
        training.train_and_evaluate(invalid_estimator, mock_train_spec, mock_eval_spec)
def test_invalid_task_type(self):
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = test.mock.Mock()
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
    mock_est.config = test.mock.Mock()
    mock_est.config.cluster_spec = {'1': 'dummy'}
    mock_est.config.task_type = ''
    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE):
        training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
def _test_run_task_in_distributed_training(self, run_config):
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = run_config
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
        mock_executor_instance = self._mock_executor_instance()
        mock_executor.return_value = mock_executor_instance
        training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
        mock_executor.assert_called_with(
            estimator=mock_est, train_spec=mock_train_spec, eval_spec=mock_eval_spec)
    return mock_executor_instance
def main():
    sequence_schema_path = f'{input_path}/train/sequence_schema'
    context_schema_path = f'{input_path}/train/context_schema'
    context_schema, sequence_schema = read_schemata(context_schema_path, sequence_schema_path)
    tf_ctx_schema, tf_seq_schema = build_schema(context_schema, sequence_schema)
    train_parts = glob.glob(input_path + '/train' + '/part-*')
    validation_parts = glob.glob(input_path + '/test' + '/part-*')
    run_config = RunConfig(
        log_step_count_steps=10,
        save_checkpoints_steps=100,
        save_summary_steps=200,
        keep_checkpoint_max=32)
    shared_input_fn = partial(input_fn, params, tf_seq_schema, tf_ctx_schema)
    train_input_fn = partial(shared_input_fn, train_parts)
    validation_input_fn = partial(shared_input_fn, validation_parts)
    train_spec = TrainSpec(train_input_fn, max_steps=1000000)
    eval_spec = EvalSpec(
        validation_input_fn,
        steps=200,
        name='validation',
        start_delay_secs=30,
        throttle_secs=1)
    estimator = Estimator(
        model_fn=model.model_fn,
        model_dir=model_dir,
        params=params,
        config=run_config)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO)
    logging.getLogger('tensorflow').propagate = False
    train_and_evaluate(estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
    prediction = list(
        estimator.predict(
            input_fn=partial(predict_input_fn, {'epochs': 1, 'batch_size': 10}, grid)))
    scores = [p.tolist() for p in prediction]
    pairwise_prob = pairwise_probability(scores)
    zero = pairwise_prob[0]
    A_zero = build_diags(zero)
    print(optimize(A_zero).x)
def test_run_local(self):
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = run_config_lib.RunConfig()
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
        mock_executor_instance = self._mock_executor_instance()
        mock_executor.return_value = mock_executor_instance
        training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
        self.assertEqual(1, mock_executor_instance.call_task['local'])
        mock_executor.assert_called_with(
            estimator=mock_est, train_spec=mock_train_spec, eval_spec=mock_eval_spec)
def test_invalid_local_task(self):
    tf_config = {
        'cluster': {
            run_config_lib.TaskType.CHIEF: ['host0:0'],
            'local': ['hos1:1'],
        },
        'task': {
            'type': 'local',
            'index': 0
        }
    }
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
    with self.assertRaisesRegexp(ValueError, _INVALID_LOCAL_TASK_WITH_CLUSTER):
        training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
def _complete_flow(self, train_distribute, eval_distribute, remote_cluster=None):
    estimator = self._get_estimator(train_distribute, eval_distribute, remote_cluster)
    input_dimension = LABEL_DIMENSION
    train_input_fn = self.dataset_input_fn(
        x={"x": DATA},
        y=DATA,
        batch_size=BATCH_SIZE // len(train_distribute.worker_devices),
        shuffle=True)
    if eval_distribute:
        eval_batch_size = BATCH_SIZE // len(eval_distribute.worker_devices)
    else:
        eval_batch_size = BATCH_SIZE
    eval_input_fn = self.dataset_input_fn(
        x={"x": DATA}, y=DATA, batch_size=eval_batch_size, shuffle=False)
    linear_feature_columns = [
        feature_column.numeric_column("x", shape=(input_dimension,))
    ]
    dnn_feature_columns = [
        feature_column.numeric_column("x", shape=(input_dimension,))
    ]
    feature_columns = linear_feature_columns + dnn_feature_columns
    estimator_training.train_and_evaluate(
        estimator,
        estimator_training.TrainSpec(train_input_fn, max_steps=MAX_STEPS),
        estimator_training.EvalSpec(
            name=EVAL_NAME,
            input_fn=eval_input_fn,
            steps=None,
            exporters=self._get_exporter(EXPORTER_NAME, feature_columns),
            start_delay_secs=0,
            throttle_secs=1))
    return estimator
def test_run_local(self):
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = run_config_lib.RunConfig()
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
        mock_executor.return_value = self._mock_executor_instance()
        return_value = training.train_and_evaluate(
            mock_est, mock_train_spec, mock_eval_spec)
        self.assertEqual('local', return_value)
        mock_executor.assert_called_with(
            estimator=mock_est, train_spec=mock_train_spec, eval_spec=mock_eval_spec)
def _test_run_task_in_distributed_training(self, run_config):
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = run_config
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
        mock_executor.return_value = self._mock_executor_instance()
        return_value = training.train_and_evaluate(
            mock_est, mock_train_spec, mock_eval_spec)
        self.assertEqual(mock_est.config.task_type, return_value)
        mock_executor.assert_called_with(
            estimator=mock_est, train_spec=mock_train_spec, eval_spec=mock_eval_spec)
def _complete_flow(self, train_distribute, eval_distribute, remote_cluster=None,
                   use_train_and_evaluate=True):
    estimator = self._get_estimator(train_distribute, eval_distribute, remote_cluster)
    input_dimension = LABEL_DIMENSION
    train_input_fn = self.dataset_input_fn(
        x={"x": DATA},
        y=DATA,
        batch_size=BATCH_SIZE // train_distribute.num_replicas_in_sync,
        shuffle=True)
    if eval_distribute:
        eval_batch_size = BATCH_SIZE // eval_distribute.num_replicas_in_sync
    else:
        eval_batch_size = BATCH_SIZE
    eval_input_fn = self.dataset_input_fn(
        x={"x": DATA}, y=DATA, batch_size=eval_batch_size, shuffle=False)
    linear_feature_columns = [
        feature_column.numeric_column("x", shape=(input_dimension,))
    ]
    dnn_feature_columns = [
        feature_column.numeric_column("x", shape=(input_dimension,))
    ]
    feature_columns = linear_feature_columns + dnn_feature_columns
    eval_spec = estimator_training.EvalSpec(
        name=EVAL_NAME,
        input_fn=eval_input_fn,
        steps=None,
        exporters=self._get_exporter(EXPORTER_NAME, feature_columns),
        start_delay_secs=0,
        throttle_secs=1)
    if use_train_and_evaluate:
        estimator_training.train_and_evaluate(
            estimator,
            estimator_training.TrainSpec(train_input_fn, max_steps=MAX_STEPS),
            eval_spec)
    else:
        estimator.train(train_input_fn, max_steps=MAX_STEPS)
        latest_ckpt_path = estimator.latest_checkpoint()
        metrics = estimator.evaluate(
            eval_input_fn, checkpoint_path=latest_ckpt_path, name=EVAL_NAME)
        # Export the eval result to files.
        eval_result = estimator_training._EvalResult(
            status=estimator_training._EvalStatus.EVALUATED,
            metrics=metrics,
            checkpoint_path=latest_ckpt_path)
        evaluator = estimator_training._TrainingExecutor._Evaluator(
            estimator, eval_spec, None)
        evaluator._export_eval_result(eval_result, True)
    return estimator
def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
    label_dimension = 2
    input_dimension = label_dimension
    batch_size = 10
    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
    data = data.reshape(batch_size, label_dimension)
    train_input_fn = self.dataset_input_fn(
        x={'x': data},
        y=data,
        batch_size=batch_size // distribution.num_replicas_in_sync,
        shuffle=True)
    eval_input_fn = self.dataset_input_fn(
        x={'x': data},
        y=data,
        batch_size=batch_size // distribution.num_replicas_in_sync,
        shuffle=False)
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'x': data}, batch_size=batch_size, shuffle=False)
    linear_feature_columns = [
        feature_column.numeric_column('x', shape=(input_dimension,))
    ]
    dnn_feature_columns = [
        feature_column.numeric_column('x', shape=(input_dimension,))
    ]
    feature_columns = linear_feature_columns + dnn_feature_columns
    estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
        linear_feature_columns=linear_feature_columns,
        dnn_hidden_units=(2, 2),
        dnn_feature_columns=dnn_feature_columns,
        label_dimension=label_dimension,
        model_dir=self._model_dir,
        # TODO(isaprykin): Work around the colocate_with error.
        dnn_optimizer=adagrad.AdagradOptimizer(0.001),
        linear_optimizer=adagrad.AdagradOptimizer(0.001),
        config=run_config.RunConfig(
            train_distribute=distribution, eval_distribute=distribution))
    num_steps = 10
    if use_train_and_evaluate:
        scores, _ = training.train_and_evaluate(
            estimator,
            training.TrainSpec(train_input_fn, max_steps=num_steps),
            training.EvalSpec(eval_input_fn))
    else:
        estimator.train(train_input_fn, steps=num_steps)
        scores = estimator.evaluate(eval_input_fn)
    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
    self.assertIn('loss', scores)
    predictions = np.array([
        x[prediction_keys.PredictionKeys.PREDICTIONS]
        for x in estimator.predict(predict_input_fn)
    ])
    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
    feature_spec = feature_column.make_parse_example_spec(feature_columns)
    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    export_dir = estimator.export_saved_model(tempfile.mkdtemp(),
                                              serving_input_receiver_fn)
    self.assertTrue(gfile.Exists(export_dir))
def main(mname, model_dir, batch_size, epochs, eval_steps, eps_log_steps):
    global model_dir_hdfs
    if model_dir.startswith('hdfs'):
        model_dir_hdfs = True
    tf.logging.set_verbosity(tf.logging.DEBUG)  # get TF logger
    log.setLevel(logging.DEBUG)
    # create formatter and add it to the handlers
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    # create file handler which logs even debug messages
    if model_dir_hdfs is False:
        if os.path.exists(model_dir) is False:
            os.makedirs(model_dir)
        log_dir = model_dir
    else:
        model_dir = os.path.join(
            model_dir,
            "job_cifar10_" + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M'))
        log_dir = '.'
    # clear old log files
    with open(log_dir + '/tensorflow.log', 'w'):
        pass
    with open(log_dir + '/gpu.csv', 'w'):
        pass
    with open(log_dir + '/cpu.csv', 'w'):
        pass
    fh = logging.FileHandler(log_dir + '/tensorflow.log')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    log.addHandler(fh)
    log.info("TF version: %s", tf.__version__)
    log.info("Model directory: %s", model_dir)
    log.info("Batch size: %s", batch_size)
    log.info("Prefetch data all to memory: %s", True)
    log.info("Train epochs: %s", epochs)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    config.log_device_placement = True  # log device placement (on which device the operation ran)
    sess = tf.Session(config=config)
    ktf.set_session(sess)  # set this TensorFlow session as the default session for Keras
    steps_per_epoch = cifar10_data.train_len() // batch_size
    log.info("Steps per epoch: %s", steps_per_epoch)
    if eval_steps is None:
        eval_steps = steps_per_epoch
    log.info("Evaluating each %i steps", eval_steps)
    if mname == "cnn":
        model = cifar10_model_cnn.cifar_model()
    else:
        model = cifar10_model_resnet.cifar_model()
    global input_name
    input_name = 'input_1'
    model.summary()

    def train_input_fn():
        dataset = tf.data.Dataset.from_generator(
            generator=cifar10_data.generator_train,
            output_types=(tf.float32, tf.float32),
            output_shapes=shapes)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(buffer_size=batch_size)
        # dataset = dataset.repeat(20)
        iterator = dataset.make_one_shot_iterator()
        features_tensors, labels = iterator.get_next()
        features = {input_name: features_tensors}
        return features, labels

    def eval_input_fn():
        dataset = tf.data.Dataset.from_generator(
            generator=cifar10_data.generator_test,
            output_types=(tf.float32, tf.float32),
            output_shapes=shapes)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(buffer_size=batch_size)
        iterator = dataset.make_one_shot_iterator()
        features_tensors, labels = iterator.get_next()
        features = {input_name: features_tensors}
        return features, labels

    my_config = RunConfig(
        save_checkpoints_steps=eval_steps  # Save checkpoints every n steps and run the evaluation.
        # keep_checkpoint_max=5  # Retain the n most recent checkpoints (default 5).
    )
    estimator = tf.keras.estimator.model_to_estimator(model, config=my_config, model_dir=model_dir)
    examples_sec_hook = ExamplesPerSecondHook(batch_size, every_n_steps=eps_log_steps)
    # stopping_hook = early_stopping.stop_if_higher_hook(estimator, "accuracy", 0.5)
    train_hooks = [examples_sec_hook]
    # use integer division so step counts are ints, not floats
    train_spec = TrainSpec(
        input_fn=train_input_fn,
        hooks=train_hooks,
        max_steps=cifar10_data.train_len() // batch_size * epochs)
    eval_spec = EvalSpec(
        input_fn=eval_input_fn,
        steps=cifar10_data.val_len() // batch_size,
        throttle_secs=5)  # default 100 steps
    global is_training
    is_training = True
    threading.Thread(target=lambda: collect_stats(log_dir)).start()
    start = time.time()
    train_and_evaluate(estimator, train_spec, eval_spec)
    elapsed = time.time() - start
    is_training = False
    log.info("total time taken (seconds): %s ", elapsed)
    if model_dir_hdfs:
        parse_res = parse.urlsplit(model_dir)
        netloc = parse_res[1]
        path = parse_res[2]
        webhdfs_model_dir = 'http://' + netloc + ':50070/webhdfs/v1' + path
        username = getpass.getuser()
        component_name = estimator.config.task_type + str(estimator.config.task_id)
        log.info("Uploading log files for %s as %s to HDFS path: %s",
                 component_name, username, webhdfs_model_dir)
        logging.shutdown()
        os.system('curl -L -i -T tensorflow.log "' + webhdfs_model_dir + '/tensorflow-' +
                  component_name + '.log?op=CREATE&overwrite=false&user.name=' + username + '"')
        os.system('curl -L -i -T cpu.csv "' + webhdfs_model_dir + '/cpu-' + component_name +
                  '.csv?op=CREATE&overwrite=false&user.name=' + username + '"')
        os.system('curl -L -i -T gpu.csv "' + webhdfs_model_dir + '/gpu-' + component_name +
                  '.csv?op=CREATE&overwrite=false&user.name=' + username + '"')
    else:
        log.info("Creating zip archive of job results")
        logging.shutdown()
        shutil.make_archive(model_dir, 'zip', model_dir)
def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
    label_dimension = 2
    input_dimension = label_dimension
    batch_size = 10
    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
    data = data.reshape(batch_size, label_dimension)
    train_input_fn = self.dataset_input_fn(
        x={'x': data},
        y=data,
        batch_size=batch_size // len(distribution.worker_devices))
    eval_input_fn = self.dataset_input_fn(
        x={'x': data},
        y=data,
        batch_size=batch_size // len(distribution.worker_devices))
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'x': data}, batch_size=batch_size, shuffle=False)
    linear_feature_columns = [
        feature_column.numeric_column('x', shape=(input_dimension,))
    ]
    dnn_feature_columns = [
        feature_column.numeric_column('x', shape=(input_dimension,))
    ]
    feature_columns = linear_feature_columns + dnn_feature_columns
    session_config = config_pb2.ConfigProto(
        log_device_placement=True, allow_soft_placement=True)
    estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
        linear_feature_columns=linear_feature_columns,
        dnn_hidden_units=(2, 2),
        dnn_feature_columns=dnn_feature_columns,
        label_dimension=label_dimension,
        model_dir=self._model_dir,
        dnn_optimizer=adam.Adam(0.001),
        linear_optimizer=adam.Adam(0.001),
        config=run_config.RunConfig(
            train_distribute=distribution,
            eval_distribute=distribution,
            session_config=session_config))
    num_steps = 2
    if use_train_and_evaluate:
        scores, _ = training.train_and_evaluate(
            estimator,
            training.TrainSpec(train_input_fn, max_steps=num_steps),
            training.EvalSpec(eval_input_fn))
    else:
        estimator.train(train_input_fn, steps=num_steps)
        scores = estimator.evaluate(eval_input_fn)
    self.assertIn('loss', six.iterkeys(scores))
    predictions = np.array([
        x[prediction_keys.PredictionKeys.PREDICTIONS]
        for x in estimator.predict(predict_input_fn)
    ])
    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
    feature_spec = feature_column.make_parse_example_spec(feature_columns)
    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
                                             serving_input_receiver_fn)
    self.assertTrue(gfile.Exists(export_dir))
    session_config=config,
    save_checkpoints_steps=1000)
estimator2 = DNNClassifier(
    hidden_units=[512],
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=len(line_type),
    config=run_config,
    label_vocabulary=line_type,
    dropout=0.1)

# Training/Evaluating:
tf.logging.set_verbosity(tf.logging.INFO)

input_fn2 = lambda fp: (tf.data.TextLineDataset(fp)
                        .apply(tf.contrib.data.shuffle_and_repeat(buffer_size=10000))
                        .batch(batch_size)
                        .map(lambda x: tf.py_func(get_encodes2, [x], [tf.float32, tf.string],
                                                  name='bert_client'),
                             num_parallel_calls=num_parallel_calls)
                        .map(lambda x, y: ({'feature': x}, y))
                        .prefetch(20))

train_spec2 = TrainSpec(input_fn=lambda: input_fn2(train_fp))
eval_spec2 = EvalSpec(input_fn=lambda: input_fn2(eval_fp), throttle_secs=0)

train_and_evaluate(estimator2, train_spec2, eval_spec2)
config.gpu_options.allow_growth = True
run_config = RunConfig(model_dir='./model_label/',
                       session_config=config,
                       save_checkpoints_steps=1000)
estimator1 = DNNClassifier(
    hidden_units=[512],
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=len(line_labels),
    config=run_config,
    label_vocabulary=line_labels,
    dropout=0.1)

# Training/Evaluating:
tf.logging.set_verbosity(tf.logging.INFO)

input_fn1 = lambda fp: (tf.data.TextLineDataset(fp)
                        .apply(tf.contrib.data.shuffle_and_repeat(buffer_size=10000))
                        .batch(batch_size)
                        .map(lambda x: tf.py_func(get_encodes1, [x], [tf.float32, tf.string],
                                                  name='bert_client'),
                             num_parallel_calls=num_parallel_calls)
                        .map(lambda x, y: ({'feature': x}, y))
                        .prefetch(20))

train_spec1 = TrainSpec(input_fn=lambda: input_fn1(train_fp))
eval_spec1 = EvalSpec(input_fn=lambda: input_fn1(eval_fp), throttle_secs=0)

train_and_evaluate(estimator1, train_spec1, eval_spec1)
    return features, labels

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
run_config = RunConfig(model_dir='/data/cips/save/%s' % MODEL_ID,
                       session_config=config,
                       save_checkpoints_steps=2000)
estimator = DNNClassifier(
    hidden_units=[512],
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=len(laws),
    config=run_config,
    label_vocabulary=laws_str,
    dropout=0.1)

input_fn = lambda fp: (tf.data.TextLineDataset(fp)
                       .apply(tf.contrib.data.shuffle_and_repeat(buffer_size=10000))
                       .batch(batch_size)
                       .map(lambda x: tf.py_func(get_encodes, [x], [tf.float32, tf.string],
                                                 name='bert_client'))
                       .map(lambda x, y: ({'feature': x}, y))
                       .prefetch(20))

train_spec = TrainSpec(input_fn=lambda: input_fn(train_fp))
eval_spec = EvalSpec(input_fn=lambda: input_fn(eval_fp), throttle_secs=0)

train_and_evaluate(estimator, train_spec, eval_spec)