Example 1
    def train(self, train: str, valid: str, feats_only: bool = False) -> None:
        """
        Train a new model with this trainer, or if a model already exists in the save path for this trainer,
        resume training from a checkpoint.
        :param train: path to training corpus
        :param valid: path to validation corpus
        :param feats_only: only extract features, don't train
        """
        if not self._feature_extractor:
            self._init_feature_extractor(train_path=train)

        # read and extract features from training/validation data, serialize to disk
        self._extract_and_write(train)
        self._extract_and_write(valid, test=True)

        if feats_only:
            return

        # compute steps per epoch/checkpoint and early stopping steps
        max_steps, patience, checkpoint_steps, steps_per_epoch = self._compute_steps(
            train, valid)
        self._training_config.max_steps = max_steps  # update config value for learning rate calculation
        self._training_config.steps_per_epoch = steps_per_epoch

        # train and evaluate using Estimator API
        estimator = self._init_estimator(checkpoint_steps)

        train_spec = self._train_spec(
            train, max_steps,
            self._training_hooks(estimator, patience, checkpoint_steps))
        eval_spec = self._eval_spec(valid)

        train_and_evaluate(estimator,
                           train_spec=train_spec,
                           eval_spec=eval_spec)
Example 2
    def test_unsupported_task_due_to_not_callable(self):
        unsupported_task = 'alloc'
        tf_config = {
            'cluster': {
                run_config_lib.TaskType.CHIEF: ['host0:0'],
                unsupported_task: ['hos1:1'],
            },
            'task': {
                'type': unsupported_task,
                'index': 0
            }
        }
        mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
        mock_est.config = _create_run_config_with_cluster_spec(tf_config)
        mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
        mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)

        with test.mock.patch.object(training,
                                    '_TrainingExecutor') as mock_executor:
            mock_instance = self._mock_executor_instance()
            mock_instance.run_alloc = 123  # not callable
            mock_executor.return_value = mock_instance
            with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TO_RUN):
                training.train_and_evaluate(mock_est, mock_train_spec,
                                            mock_eval_spec)
Example 3
def train():
    model_dir = FLAGS.model_dir
    os.makedirs(model_dir, exist_ok=True)

    hparams = get_hparams(
        model_dir,
        DEFAULT_HPARAMS,
        hparams_file=FLAGS.config,
        hparams_str=FLAGS.hparams,
        validate=True
    )
    run_config = RunConfig(
        model_dir=model_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        save_summary_steps=FLAGS.save_summary_steps)

    train_input_fn, eval_input_fn = make_input_fns()

    # Model
    estimator = Estimator(
        model_fn=model_fn,
        config=run_config,
        params=hparams)
    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn,
        max_steps=FLAGS.max_steps)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn,
        steps=FLAGS.eval_steps,
        throttle_secs=0)
    train_and_evaluate(
        eval_spec=eval_spec,
        train_spec=train_spec,
        estimator=estimator
    )
Example 4
def train():
    model_dir = tf.flags.FLAGS.model_dir
    os.makedirs(model_dir, exist_ok=True)
    print("model_dir={}".format(model_dir))
    run_config = RunConfig(
        model_dir=model_dir,
        save_checkpoints_steps=tf.flags.FLAGS.save_checkpoints_steps)
    hparams = get_hparams(model_dir, validate=True)
    vocabs = read_vocablists(path=tf.flags.FLAGS.data_dir)
    train_input_fn, eval_input_fn, test_input_fn = make_input_fns(
        tf.flags.FLAGS.data_dir, batch_size=tf.flags.FLAGS.batch_size)

    # Model
    model_fn = make_model_fn(hparams=hparams,
                             run_config=run_config,
                             vocabs=vocabs)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=tf.flags.FLAGS.max_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      steps=tf.flags.FLAGS.eval_steps,
                                      throttle_secs=0)
    estimator = Estimator(model_fn=model_fn, config=run_config, params=hparams)
    train_and_evaluate(eval_spec=eval_spec,
                       train_spec=train_spec,
                       estimator=estimator)
Example 5
    def test_invalid_estimator(self):
        invalid_estimator = object()
        mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
        mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)

        with self.assertRaisesRegexp(TypeError, _INVALID_ESTIMATOR_MSG):
            training.train_and_evaluate(invalid_estimator, mock_train_spec,
                                        mock_eval_spec)
Example 6
  def test_invalid_estimator(self):
    invalid_estimator = object()
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)

    with self.assertRaisesRegexp(TypeError, _INVALID_ESTIMATOR_MSG):
      training.train_and_evaluate(invalid_estimator, mock_train_spec,
                                  mock_eval_spec)
Example 7
  def test_invalid_task_type(self):
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = test.mock.Mock()
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)

    mock_est.config = test.mock.Mock()
    mock_est.config.cluster_spec = {'1': 'dummy'}
    mock_est.config.task_type = ''

    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE):
      training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
Example 8
  def _test_run_task_in_distributed_training(self, run_config):
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = run_config
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)

    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
      mock_executor_instance = self._mock_executor_instance()
      mock_executor.return_value = mock_executor_instance
      training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
      mock_executor.assert_called_with(estimator=mock_est,
                                       train_spec=mock_train_spec,
                                       eval_spec=mock_eval_spec)
      return mock_executor_instance
Example 9
def main():
    sequence_schema_path = f'{input_path}/train/sequence_schema'
    context_schema_path = f'{input_path}/train/context_schema'

    context_schema, sequence_schema = read_schemata(context_schema_path, sequence_schema_path)

    tf_ctx_schema, tf_seq_schema = build_schema(context_schema, sequence_schema)

    train_parts = glob.glob(input_path + '/train' + '/part-*')
    validation_parts = glob.glob(input_path + '/test' + '/part-*')

    run_config = RunConfig(log_step_count_steps=10,
                           save_checkpoints_steps=100,
                           save_summary_steps=200,
                           keep_checkpoint_max=32)

    # functools.partial pre-binds the shared arguments; the train and
    # validation input functions below differ only in the file list each receives.
    shared_input_fn = partial(input_fn, params, tf_seq_schema, tf_ctx_schema)

    train_input_fn = partial(shared_input_fn, train_parts)

    validation_input_fn = partial(shared_input_fn, validation_parts)

    train_spec = TrainSpec(train_input_fn, max_steps=1000000)

    eval_spec = EvalSpec(validation_input_fn, steps=200, name='validation', start_delay_secs=30, throttle_secs=1)

    estimator = Estimator(model_fn=model.model_fn,
                          model_dir=model_dir,
                          params=params,
                          config=run_config)

    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        level=logging.INFO)
    logging.getLogger('tensorflow').propagate = False

    train_and_evaluate(estimator=estimator,
                       train_spec=train_spec,
                       eval_spec=eval_spec)

    prediction = list(estimator.predict(input_fn=partial(predict_input_fn, {'epochs': 1, 'batch_size': 10}, grid)))

    scores = [p.tolist() for p in prediction]

    pairwise_prob = pairwise_probability(scores)

    zero = pairwise_prob[0]

    A_zero = build_diags(zero)

    print(optimize(A_zero).x)
Example 10
  def test_run_local(self):
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = run_config_lib.RunConfig()
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)

    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
      mock_executor_instance = self._mock_executor_instance()
      mock_executor.return_value = mock_executor_instance
      training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
      self.assertEqual(1, mock_executor_instance.call_task['local'])

      mock_executor.assert_called_with(estimator=mock_est,
                                       train_spec=mock_train_spec,
                                       eval_spec=mock_eval_spec)
Example 11
  def test_invalid_local_task(self):
    tf_config = {
        'cluster': {
            run_config_lib.TaskType.CHIEF: ['host0:0'],
            'local': ['hos1:1'],
        },
        'task': {
            'type': 'local',
            'index': 0
        }
    }
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)

    with self.assertRaisesRegexp(ValueError, _INVALID_LOCAL_TASK_WITH_CLUSTER):
      training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
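Example 12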
    def _complete_flow(self,
                       train_distribute,
                       eval_distribute,
                       remote_cluster=None):
        estimator = self._get_estimator(train_distribute, eval_distribute,
                                        remote_cluster)

        input_dimension = LABEL_DIMENSION
        train_input_fn = self.dataset_input_fn(
            x={"x": DATA},
            y=DATA,
            batch_size=BATCH_SIZE // len(train_distribute.worker_devices),
            shuffle=True)
        if eval_distribute:
            eval_batch_size = BATCH_SIZE // len(eval_distribute.worker_devices)
        else:
            eval_batch_size = BATCH_SIZE
        eval_input_fn = self.dataset_input_fn(x={"x": DATA},
                                              y=DATA,
                                              batch_size=eval_batch_size,
                                              shuffle=False)

        linear_feature_columns = [
            feature_column.numeric_column("x", shape=(input_dimension, ))
        ]
        dnn_feature_columns = [
            feature_column.numeric_column("x", shape=(input_dimension, ))
        ]
        feature_columns = linear_feature_columns + dnn_feature_columns

        estimator_training.train_and_evaluate(
            estimator,
            estimator_training.TrainSpec(train_input_fn, max_steps=MAX_STEPS),
            estimator_training.EvalSpec(name=EVAL_NAME,
                                        input_fn=eval_input_fn,
                                        steps=None,
                                        exporters=self._get_exporter(
                                            EXPORTER_NAME, feature_columns),
                                        start_delay_secs=0,
                                        throttle_secs=1))
        return estimator
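Example 13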
  def _complete_flow(self,
                     train_distribute,
                     eval_distribute,
                     remote_cluster=None):
    estimator = self._get_estimator(train_distribute, eval_distribute,
                                    remote_cluster)

    input_dimension = LABEL_DIMENSION
    train_input_fn = self.dataset_input_fn(
        x={"x": DATA},
        y=DATA,
        batch_size=BATCH_SIZE // len(train_distribute.worker_devices),
        shuffle=True)
    if eval_distribute:
      eval_batch_size = BATCH_SIZE // len(eval_distribute.worker_devices)
    else:
      eval_batch_size = BATCH_SIZE
    eval_input_fn = self.dataset_input_fn(
        x={"x": DATA}, y=DATA, batch_size=eval_batch_size, shuffle=False)

    linear_feature_columns = [
        feature_column.numeric_column("x", shape=(input_dimension,))
    ]
    dnn_feature_columns = [
        feature_column.numeric_column("x", shape=(input_dimension,))
    ]
    feature_columns = linear_feature_columns + dnn_feature_columns

    estimator_training.train_and_evaluate(
        estimator,
        estimator_training.TrainSpec(train_input_fn, max_steps=MAX_STEPS),
        estimator_training.EvalSpec(
            name=EVAL_NAME,
            input_fn=eval_input_fn,
            steps=None,
            exporters=self._get_exporter(EXPORTER_NAME, feature_columns),
            start_delay_secs=0,
            throttle_secs=1))
    return estimator
Example 14
  def test_unsupported_task_due_to_not_callable(self):
    unsupported_task = 'alloc'
    tf_config = {
        'cluster': {
            run_config_lib.TaskType.CHIEF: ['host0:0'],
            unsupported_task: ['hos1:1'],
        },
        'task': {
            'type': unsupported_task,
            'index': 0
        }
    }
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)

    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
      mock_instance = self._mock_executor_instance()
      mock_instance.run_alloc = 123  # not callable
      mock_executor.return_value = mock_instance
      with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TO_RUN):
        training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
Example 15
  def test_run_local(self):
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = run_config_lib.RunConfig()
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)

    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
      mock_executor.return_value = self._mock_executor_instance()
      return_value = training.train_and_evaluate(
          mock_est, mock_train_spec, mock_eval_spec)

      self.assertEqual('local', return_value)
      mock_executor.assert_called_with(estimator=mock_est,
                                       train_spec=mock_train_spec,
                                       eval_spec=mock_eval_spec)
Example 16
  def _test_run_task_in_distributed_training(self, run_config):
    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
    mock_est.config = run_config
    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)

    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
      mock_executor.return_value = self._mock_executor_instance()
      return_value = training.train_and_evaluate(
          mock_est, mock_train_spec, mock_eval_spec)

      self.assertEqual(mock_est.config.task_type, return_value)
      mock_executor.assert_called_with(estimator=mock_est,
                                       train_spec=mock_train_spec,
                                       eval_spec=mock_eval_spec)
Example 17
  def _complete_flow(self,
                     train_distribute,
                     eval_distribute,
                     remote_cluster=None,
                     use_train_and_evaluate=True):
    estimator = self._get_estimator(train_distribute, eval_distribute,
                                    remote_cluster)

    input_dimension = LABEL_DIMENSION
    train_input_fn = self.dataset_input_fn(
        x={"x": DATA},
        y=DATA,
        batch_size=BATCH_SIZE // train_distribute.num_replicas_in_sync,
        shuffle=True)
    if eval_distribute:
      eval_batch_size = BATCH_SIZE // eval_distribute.num_replicas_in_sync
    else:
      eval_batch_size = BATCH_SIZE
    eval_input_fn = self.dataset_input_fn(
        x={"x": DATA}, y=DATA, batch_size=eval_batch_size, shuffle=False)

    linear_feature_columns = [
        feature_column.numeric_column("x", shape=(input_dimension,))
    ]
    dnn_feature_columns = [
        feature_column.numeric_column("x", shape=(input_dimension,))
    ]
    feature_columns = linear_feature_columns + dnn_feature_columns

    eval_spec = estimator_training.EvalSpec(
        name=EVAL_NAME,
        input_fn=eval_input_fn,
        steps=None,
        exporters=self._get_exporter(EXPORTER_NAME, feature_columns),
        start_delay_secs=0,
        throttle_secs=1)

    if use_train_and_evaluate:
      estimator_training.train_and_evaluate(
          estimator,
          estimator_training.TrainSpec(train_input_fn, max_steps=MAX_STEPS),
          eval_spec)
    else:
      estimator.train(train_input_fn, max_steps=MAX_STEPS)

      latest_ckpt_path = estimator.latest_checkpoint()
      metrics = estimator.evaluate(eval_input_fn,
                                   checkpoint_path=latest_ckpt_path,
                                   name=EVAL_NAME)

      # Export the eval result to files.
      eval_result = estimator_training._EvalResult(
          status=estimator_training._EvalStatus.EVALUATED,
          metrics=metrics,
          checkpoint_path=latest_ckpt_path)
      evaluator = estimator_training._TrainingExecutor._Evaluator(estimator,
                                                                  eval_spec,
                                                                  None)
      evaluator._export_eval_result(eval_result, True)

    return estimator
Example 18
    def test_complete_flow_with_mode(self, distribution,
                                     use_train_and_evaluate):
        label_dimension = 2
        input_dimension = label_dimension
        batch_size = 10
        data = np.linspace(0.,
                           2.,
                           batch_size * label_dimension,
                           dtype=np.float32)
        data = data.reshape(batch_size, label_dimension)
        train_input_fn = self.dataset_input_fn(
            x={'x': data},
            y=data,
            batch_size=batch_size // distribution.num_replicas_in_sync,
            shuffle=True)
        eval_input_fn = self.dataset_input_fn(
            x={'x': data},
            y=data,
            batch_size=batch_size // distribution.num_replicas_in_sync,
            shuffle=False)
        predict_input_fn = numpy_io.numpy_input_fn(x={'x': data},
                                                   batch_size=batch_size,
                                                   shuffle=False)

        linear_feature_columns = [
            feature_column.numeric_column('x', shape=(input_dimension, ))
        ]
        dnn_feature_columns = [
            feature_column.numeric_column('x', shape=(input_dimension, ))
        ]
        feature_columns = linear_feature_columns + dnn_feature_columns
        estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
            linear_feature_columns=linear_feature_columns,
            dnn_hidden_units=(2, 2),
            dnn_feature_columns=dnn_feature_columns,
            label_dimension=label_dimension,
            model_dir=self._model_dir,
            # TODO(isaprykin): Work around the colocate_with error.
            dnn_optimizer=adagrad.AdagradOptimizer(0.001),
            linear_optimizer=adagrad.AdagradOptimizer(0.001),
            config=run_config.RunConfig(train_distribute=distribution,
                                        eval_distribute=distribution))

        num_steps = 10
        if use_train_and_evaluate:
            scores, _ = training.train_and_evaluate(
                estimator,
                training.TrainSpec(train_input_fn, max_steps=num_steps),
                training.EvalSpec(eval_input_fn))
        else:
            estimator.train(train_input_fn, steps=num_steps)
            scores = estimator.evaluate(eval_input_fn)

        self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
        self.assertIn('loss', scores)

        predictions = np.array([
            x[prediction_keys.PredictionKeys.PREDICTIONS]
            for x in estimator.predict(predict_input_fn)
        ])
        self.assertAllEqual((batch_size, label_dimension), predictions.shape)

        feature_spec = feature_column.make_parse_example_spec(feature_columns)
        serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
            feature_spec)
        export_dir = estimator.export_saved_model(tempfile.mkdtemp(),
                                                  serving_input_receiver_fn)
        self.assertTrue(gfile.Exists(export_dir))
Example 19
def main(mname, model_dir, batch_size, epochs, eval_steps, eps_log_steps):
    global model_dir_hdfs
    if model_dir.startswith('hdfs'):
        model_dir_hdfs = True

    tf.logging.set_verbosity(tf.logging.DEBUG)
    # get TF logger
    log.setLevel(logging.DEBUG)
    # create formatter and add it to the handlers
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    # create file handler which logs even debug messages
    if model_dir_hdfs is False:
        if os.path.exists(model_dir) is False:
            os.makedirs(model_dir)
        log_dir = model_dir
    else:
        model_dir = os.path.join(
            model_dir, "job_cifar10_" +
            datetime.datetime.now().strftime('%Y-%m-%d_%H-%M'))
        log_dir = '.'

    # clear old log files
    with open(log_dir + '/tensorflow.log', 'w'):
        pass
    with open(log_dir + '/gpu.csv', 'w'):
        pass
    with open(log_dir + '/cpu.csv', 'w'):
        pass

    fh = logging.FileHandler(log_dir + '/tensorflow.log')

    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    log.addHandler(fh)

    log.info("TF version: %s", tf.__version__)
    log.info("Model directory: %s", model_dir)
    log.info("Batch size: %s", batch_size)
    log.info("Prefetch data all to memory: %s", True)
    log.info("Train epochs: %s", epochs)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    config.log_device_placement = True  # to log device placement (on which device the operation ran)
    sess = tf.Session(config=config)
    ktf.set_session(
        sess)  # set this TensorFlow session as the default session for Keras

    steps_per_epoch = cifar10_data.train_len() / batch_size
    log.info("Steps per epoch: %s", steps_per_epoch)
    if eval_steps is None:
        eval_steps = steps_per_epoch
    log.info("Evaluating every %i steps", eval_steps)

    if mname == "cnn":
        model = cifar10_model_cnn.cifar_model()
    else:
        model = cifar10_model_resnet.cifar_model()
        global input_name
        input_name = 'input_1'

    model.summary()

    def train_input_fn():
        dataset = tf.data.Dataset.from_generator(
            generator=cifar10_data.generator_train,
            output_types=(tf.float32, tf.float32),
            output_shapes=shapes)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(buffer_size=batch_size)
        # dataset = dataset.repeat(20)
        iterator = dataset.make_one_shot_iterator()
        features_tensors, labels = iterator.get_next()
        features = {input_name: features_tensors}
        return features, labels

    def eval_input_fn():
        dataset = tf.data.Dataset.from_generator(
            generator=cifar10_data.generator_test,
            output_types=(tf.float32, tf.float32),
            output_shapes=shapes)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(buffer_size=batch_size)
        iterator = dataset.make_one_shot_iterator()
        features_tensors, labels = iterator.get_next()
        features = {input_name: features_tensors}
        return features, labels

    my_config = RunConfig(
        save_checkpoints_steps=eval_steps  # Save checkpoints every n steps and run the evaluation.
        # keep_checkpoint_max=5  # Retain the n most recent checkpoints (default 5).
    )
    estimator = tf.keras.estimator.model_to_estimator(model,
                                                      config=my_config,
                                                      model_dir=model_dir)

    examples_sec_hook = ExamplesPerSecondHook(batch_size,
                                              every_n_steps=eps_log_steps)
    # stopping_hook = early_stopping.stop_if_higher_hook(estimator, "accuracy", 0.5)

    train_hooks = [examples_sec_hook]

    train_spec = TrainSpec(input_fn=train_input_fn,
                           hooks=train_hooks,
                           max_steps=cifar10_data.train_len() / batch_size *
                           epochs)
    eval_spec = EvalSpec(input_fn=eval_input_fn,
                         steps=cifar10_data.val_len() / batch_size,
                         throttle_secs=5)  # default 100 steps

    global is_training
    is_training = True
    threading.Thread(target=lambda: collect_stats(log_dir)).start()
    start = time.time()

    train_and_evaluate(estimator, train_spec, eval_spec)

    elapsed = time.time() - start
    is_training = False
    log.info("total time taken (seconds): %s ", elapsed)
    if model_dir_hdfs:
        parse_res = parse.urlsplit(model_dir)
        netloc = parse_res[1]
        path = parse_res[2]
        webhdfs_model_dir = 'http://' + netloc + ':50070/webhdfs/v1' + path
        username = getpass.getuser()
        component_name = estimator.config.task_type + str(
            estimator.config.task_id)
        log.info("Uploading log files for %s as %s to HDFS path: %s",
                 component_name, username, webhdfs_model_dir)
        logging.shutdown()
        os.system('curl -L -i -T tensorflow.log "' + webhdfs_model_dir +
                  '/tensorflow-' + component_name +
                  '.log?op=CREATE&overwrite=false&user.name=' + username + '"')
        os.system('curl -L -i -T cpu.csv "' + webhdfs_model_dir + '/cpu-' +
                  component_name +
                  '.csv?op=CREATE&overwrite=false&user.name=' + username + '"')
        os.system('curl -L -i -T gpu.csv "' + webhdfs_model_dir + '/gpu-' +
                  component_name +
                  '.csv?op=CREATE&overwrite=false&user.name=' + username + '"')
    else:
        log.info("Creating zip archive of job results")
        logging.shutdown()
        shutil.make_archive(model_dir, 'zip', model_dir)
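Example 20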
    def test_complete_flow_with_mode(self, distribution,
                                     use_train_and_evaluate):
        label_dimension = 2
        input_dimension = label_dimension
        batch_size = 10
        data = np.linspace(0.,
                           2.,
                           batch_size * label_dimension,
                           dtype=np.float32)
        data = data.reshape(batch_size, label_dimension)
        train_input_fn = self.dataset_input_fn(
            x={'x': data},
            y=data,
            batch_size=batch_size // len(distribution.worker_devices))
        eval_input_fn = self.dataset_input_fn(x={'x': data},
                                              y=data,
                                              batch_size=batch_size //
                                              len(distribution.worker_devices))
        predict_input_fn = numpy_io.numpy_input_fn(x={'x': data},
                                                   batch_size=batch_size,
                                                   shuffle=False)

        linear_feature_columns = [
            feature_column.numeric_column('x', shape=(input_dimension, ))
        ]
        dnn_feature_columns = [
            feature_column.numeric_column('x', shape=(input_dimension, ))
        ]
        feature_columns = linear_feature_columns + dnn_feature_columns
        session_config = config_pb2.ConfigProto(log_device_placement=True,
                                                allow_soft_placement=True)
        estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
            linear_feature_columns=linear_feature_columns,
            dnn_hidden_units=(2, 2),
            dnn_feature_columns=dnn_feature_columns,
            label_dimension=label_dimension,
            model_dir=self._model_dir,
            dnn_optimizer=adam.Adam(0.001),
            linear_optimizer=adam.Adam(0.001),
            config=run_config.RunConfig(train_distribute=distribution,
                                        eval_distribute=distribution,
                                        session_config=session_config))

        num_steps = 2
        if use_train_and_evaluate:
            scores, _ = training.train_and_evaluate(
                estimator,
                training.TrainSpec(train_input_fn, max_steps=num_steps),
                training.EvalSpec(eval_input_fn))
        else:
            estimator.train(train_input_fn, steps=num_steps)
            scores = estimator.evaluate(eval_input_fn)

        self.assertIn('loss', six.iterkeys(scores))

        predictions = np.array([
            x[prediction_keys.PredictionKeys.PREDICTIONS]
            for x in estimator.predict(predict_input_fn)
        ])
        self.assertAllEqual((batch_size, label_dimension), predictions.shape)

        feature_spec = feature_column.make_parse_example_spec(feature_columns)
        serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
            feature_spec)
        export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
                                                 serving_input_receiver_fn)
        self.assertTrue(gfile.Exists(export_dir))
Example 21
                       session_config=config,
                       save_checkpoints_steps=1000)


estimator2 = DNNClassifier(
                          hidden_units=[512],
                          feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
                          n_classes=len(line_type),
                          config=run_config,
                          label_vocabulary=line_type,
                          dropout=0.1)

# Training/Evaluating:
tf.logging.set_verbosity(tf.logging.INFO)


input_fn2 = lambda fp: (tf.data.TextLineDataset(fp)
                        .apply(tf.contrib.data.shuffle_and_repeat(buffer_size=10000))
                        .batch(batch_size)
                        .map(lambda x: tf.py_func(get_encodes2, [x], [tf.float32, tf.string], name='bert_client'),
                             num_parallel_calls=num_parallel_calls)
                        .map(lambda x, y: ({'feature': x}, y))
                        .prefetch(20))


train_spec2 = TrainSpec(input_fn=lambda: input_fn2(train_fp))
eval_spec2 = EvalSpec(input_fn=lambda: input_fn2(eval_fp), throttle_secs=0)


train_and_evaluate(estimator2, train_spec2, eval_spec2)
Example 22
config.gpu_options.allow_growth = True
run_config = RunConfig(model_dir='./model_label/',
                       session_config=config,
                       save_checkpoints_steps=1000)

estimator1 = DNNClassifier(
                          hidden_units=[512],
                          feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
                          n_classes=len(line_labels),
                          config=run_config,
                          label_vocabulary=line_labels,
                          dropout=0.1)


# Training/Evaluating:
tf.logging.set_verbosity(tf.logging.INFO)

input_fn1 = lambda fp: (tf.data.TextLineDataset(fp)
                        .apply(tf.contrib.data.shuffle_and_repeat(buffer_size=10000))
                        .batch(batch_size)
                        .map(lambda x: tf.py_func(get_encodes1, [x], [tf.float32, tf.string], name='bert_client'),
                             num_parallel_calls=num_parallel_calls)
                        .map(lambda x, y: ({'feature': x}, y))
                        .prefetch(20))

train_spec1 = TrainSpec(input_fn=lambda: input_fn1(train_fp))
eval_spec1 = EvalSpec(input_fn=lambda: input_fn1(eval_fp), throttle_secs=0)


train_and_evaluate(estimator1, train_spec1, eval_spec1)
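Example 23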
  def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
    label_dimension = 2
    input_dimension = label_dimension
    batch_size = 10
    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
    data = data.reshape(batch_size, label_dimension)
    train_input_fn = self.dataset_input_fn(
        x={'x': data},
        y=data,
        batch_size=batch_size // len(distribution.worker_devices))
    eval_input_fn = self.dataset_input_fn(
        x={'x': data},
        y=data,
        batch_size=batch_size // len(distribution.worker_devices))
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'x': data}, batch_size=batch_size, shuffle=False)

    linear_feature_columns = [
        feature_column.numeric_column('x', shape=(input_dimension,))
    ]
    dnn_feature_columns = [
        feature_column.numeric_column('x', shape=(input_dimension,))
    ]
    feature_columns = linear_feature_columns + dnn_feature_columns
    session_config = config_pb2.ConfigProto(
        log_device_placement=True, allow_soft_placement=True)
    estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
        linear_feature_columns=linear_feature_columns,
        dnn_hidden_units=(2, 2),
        dnn_feature_columns=dnn_feature_columns,
        label_dimension=label_dimension,
        model_dir=self._model_dir,
        dnn_optimizer=adam.Adam(0.001),
        linear_optimizer=adam.Adam(0.001),
        config=run_config.RunConfig(
            train_distribute=distribution,
            eval_distribute=distribution,
            session_config=session_config))

    num_steps = 2
    if use_train_and_evaluate:
      scores, _ = training.train_and_evaluate(
          estimator, training.TrainSpec(train_input_fn, max_steps=num_steps),
          training.EvalSpec(eval_input_fn))
    else:
      estimator.train(train_input_fn, steps=num_steps)
      scores = estimator.evaluate(eval_input_fn)

    self.assertIn('loss', six.iterkeys(scores))

    predictions = np.array([
        x[prediction_keys.PredictionKeys.PREDICTIONS]
        for x in estimator.predict(predict_input_fn)
    ])
    self.assertAllEqual((batch_size, label_dimension), predictions.shape)

    feature_spec = feature_column.make_parse_example_spec(feature_columns)
    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
                                             serving_input_receiver_fn)
    self.assertTrue(gfile.Exists(export_dir))
Example 24
    return features, labels


config = tf.ConfigProto()
config.gpu_options.allow_growth = True
run_config = RunConfig(model_dir='/data/cips/save/%s' % MODEL_ID,
                       session_config=config,
                       save_checkpoints_steps=2000)

estimator = DNNClassifier(hidden_units=[512],
                          feature_columns=[
                              tf.feature_column.numeric_column('feature',
                                                               shape=(768, ))
                          ],
                          n_classes=len(laws),
                          config=run_config,
                          label_vocabulary=laws_str,
                          dropout=0.1)

input_fn = lambda fp: (tf.data.TextLineDataset(fp)
                       .apply(tf.contrib.data.shuffle_and_repeat(buffer_size=10000))
                       .batch(batch_size)
                       .map(lambda x: tf.py_func(get_encodes, [x], [tf.float32, tf.string],
                                                 name='bert_client'))
                       .map(lambda x, y: ({'feature': x}, y))
                       .prefetch(20))

train_spec = TrainSpec(input_fn=lambda: input_fn(train_fp))
eval_spec = EvalSpec(input_fn=lambda: input_fn(eval_fp), throttle_secs=0)
train_and_evaluate(estimator, train_spec, eval_spec)
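Example 25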
  def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
    label_dimension = 2
    input_dimension = label_dimension
    batch_size = 10
    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
    data = data.reshape(batch_size, label_dimension)
    train_input_fn = self.dataset_input_fn(
        x={'x': data},
        y=data,
        batch_size=batch_size // distribution.num_replicas_in_sync,
        shuffle=True)
    eval_input_fn = self.dataset_input_fn(
        x={'x': data},
        y=data,
        batch_size=batch_size // distribution.num_replicas_in_sync,
        shuffle=False)
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'x': data}, batch_size=batch_size, shuffle=False)

    linear_feature_columns = [
        feature_column.numeric_column('x', shape=(input_dimension,))
    ]
    dnn_feature_columns = [
        feature_column.numeric_column('x', shape=(input_dimension,))
    ]
    feature_columns = linear_feature_columns + dnn_feature_columns
    estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
        linear_feature_columns=linear_feature_columns,
        dnn_hidden_units=(2, 2),
        dnn_feature_columns=dnn_feature_columns,
        label_dimension=label_dimension,
        model_dir=self._model_dir,
        # TODO(isaprykin): Work around the colocate_with error.
        dnn_optimizer=adagrad.AdagradOptimizer(0.001),
        linear_optimizer=adagrad.AdagradOptimizer(0.001),
        config=run_config.RunConfig(
            train_distribute=distribution, eval_distribute=distribution))

    num_steps = 10
    if use_train_and_evaluate:
      scores, _ = training.train_and_evaluate(
          estimator,
          training.TrainSpec(train_input_fn, max_steps=num_steps),
          training.EvalSpec(eval_input_fn))
    else:
      estimator.train(train_input_fn, steps=num_steps)
      scores = estimator.evaluate(eval_input_fn)

    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
    self.assertIn('loss', scores)

    predictions = np.array([
        x[prediction_keys.PredictionKeys.PREDICTIONS]
        for x in estimator.predict(predict_input_fn)
    ])
    self.assertAllEqual((batch_size, label_dimension), predictions.shape)

    feature_spec = feature_column.make_parse_example_spec(feature_columns)
    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    export_dir = estimator.export_saved_model(tempfile.mkdtemp(),
                                              serving_input_receiver_fn)
    self.assertTrue(gfile.Exists(export_dir))