Example #1
    def train_and_evaluate(output_dir):
        review_column = feature_column.sparse_column_with_integerized_feature(
            const.REVIEW_COLUMN, bucket_size=vocab_size + 1, combiner='sum')
        weighted_reviews = feature_column.weighted_sparse_column(
            review_column, const.REVIEW_WEIGHT)

        estimator = learn.LinearClassifier(
            feature_columns=[weighted_reviews],
            n_classes=2,
            model_dir=output_dir,
            config=tf.contrib.learn.RunConfig(save_checkpoints_secs=30))

        transformed_metadata = metadata_io.read_metadata(
            transformed_metadata_dir)
        raw_metadata = metadata_io.read_metadata(raw_metadata_dir)

        train_input_fn = input_fn_maker.build_training_input_fn(
            transformed_metadata,
            transformed_train_file_pattern,
            training_batch_size=train_batch_size,
            label_keys=[const.LABEL_COLUMN])

        eval_input_fn = input_fn_maker.build_training_input_fn(
            transformed_metadata,
            transformed_test_file_pattern,
            training_batch_size=1,
            label_keys=[const.LABEL_COLUMN])

        serving_input_fn = input_fn_maker.build_default_transforming_serving_input_fn(
            raw_metadata=raw_metadata,
            transform_savedmodel_dir=output_dir + '/transform_fn',
            raw_label_keys=[],
            raw_feature_keys=[const.REVIEW_COLUMN])

        export_strategy = saved_model_export_utils.make_export_strategy(
            serving_input_fn,
            exports_to_keep=5,
            default_output_alternative_key=None)

        return tf.contrib.learn.Experiment(estimator=estimator,
                                           train_steps=train_num_epochs *
                                           num_train_instances /
                                           train_batch_size,
                                           eval_steps=num_test_instances,
                                           train_input_fn=train_input_fn,
                                           eval_input_fn=eval_input_fn,
                                           export_strategies=export_strategy,
                                           min_eval_frequency=500)
Example #2
 def transformed_metadata(self):
   """A DatasetMetadata."""
   if self._transformed_metadata is None:
     self._transformed_metadata = metadata_io.read_metadata(
         os.path.join(self._transform_output_dir,
                      self.TRANSFORMED_METADATA_DIR))
   return self._transformed_metadata
Example #3
def make_input_function(working_dir,
                        filebase,
                        num_epochs=None,
                        shuffle=True,
                        batch_size=200):
    transformed_metadata = metadata_io.read_metadata(
        os.path.join(working_dir, transform_fn_io.TRANSFORMED_METADATA_DIR))
    transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

    def parse_tf_record(example_proto):
        parsed_features = tf.parse_single_example(example_proto,
                                                  transformed_feature_spec)
        return parsed_features

    def input_func():
        file_pattern = os.path.join(working_dir, filebase + '-*')
        file_names = tf.data.TFRecordDataset.list_files(file_pattern)
        dataset = file_names.flat_map(
            lambda x: tf.data.TFRecordDataset(x)).map(parse_tf_record)

        if shuffle:
            dataset = dataset.shuffle(buffer_size=batch_size * 10)

        dataset = dataset.repeat(num_epochs).batch(batch_size)
        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()
        return features, features.pop(LABEL_KEY)

    return input_func
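
A minimal usage sketch for the input function above, assuming TensorFlow 1.x imported as `tf` as in the surrounding examples; the working directory, file prefix, and feature column name are hypothetical placeholders, not taken from the original example.

# Hypothetical usage sketch; paths, prefix and column name are assumptions.
train_input_fn = make_input_function(
    working_dir='/tmp/tft_output',    # assumed transform output directory
    filebase='train_transformed',     # assumed TFRecord file prefix
    num_epochs=10,
    shuffle=True,
    batch_size=128)

estimator = tf.estimator.LinearClassifier(
    feature_columns=[tf.feature_column.numeric_column('example_feature')])
estimator.train(input_fn=train_input_fn)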
Example #4
def is_classification(transformed_data_dir, target):
    """Whether the scenario is classification (vs regression).

  Returns:
    The number of classes if the target represents a classification
    problem, or None if it does not.
  """
    transformed_metadata = metadata_io.read_metadata(
        os.path.join(transformed_data_dir,
                     transform_fn_io.TRANSFORMED_METADATA_DIR))
    transformed_feature_spec = transformed_metadata.schema.as_feature_spec()
    if target not in transformed_feature_spec:
        raise ValueError('Cannot find target "%s" in transformed data.' %
                         target)

    feature = transformed_feature_spec[target]
    if (not isinstance(feature, tf.FixedLenFeature) or feature.shape != []
            or feature.dtype not in TARGET_TYPES):
        raise ValueError('target "%s" is of invalid type.' % target)

    if feature.dtype in CLASSIFICATION_TARGET_TYPES:
        if feature.dtype == tf.bool:
            return 2
        return get_vocab_size(transformed_data_dir, target)

    return None
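
A hedged sketch of how the return value of is_classification might drive estimator selection; get_estimator, the hidden-unit sizes, and the feature_columns argument are illustrative assumptions, not part of the original example.

# Hypothetical helper built on is_classification(); the other names are assumed.
def get_estimator(transformed_data_dir, target, feature_columns, model_dir):
    n_classes = is_classification(transformed_data_dir, target)
    if n_classes:
        # Classification: n_classes is 2 for boolean targets, otherwise the
        # vocabulary size of the target column.
        return tf.estimator.DNNClassifier(
            hidden_units=[64, 32],
            feature_columns=feature_columns,
            n_classes=n_classes,
            model_dir=model_dir)
    # None means the target is continuous, so fall back to regression.
    return tf.estimator.DNNRegressor(
        hidden_units=[64, 32],
        feature_columns=feature_columns,
        model_dir=model_dir)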
Example #5
    def testWriteTransformFn(self):
        path = os.path.join(self.get_temp_dir(), 'output')

        with beam.Pipeline() as pipeline:
            # Create an empty directory for the source saved model dir.
            saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
            file_io.recursive_create_dir(saved_model_dir)
            saved_model_dir_pcoll = (
                pipeline
                | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
            metadata = beam_metadata_io.BeamDatasetMetadata(
                _TEST_METADATA_WITH_FUTURES, {
                    'a': pipeline | 'CreateA' >> beam.Create([3]),
                })

            _ = ((saved_model_dir_pcoll, metadata)
                 | transform_fn_io.WriteTransformFn(path))

        transformed_metadata_dir = os.path.join(
            path, transform_fn_io.TRANSFORMED_METADATA_DIR)
        metadata = metadata_io.read_metadata(transformed_metadata_dir)
        self.assertEqual(metadata, _TEST_METADATA)

        transform_fn_dir = os.path.join(path, transform_fn_io.TRANSFORM_FN_DIR)
        self.assertTrue(file_io.file_exists(transform_fn_dir))
        self.assertTrue(file_io.is_directory(transform_fn_dir))
Example #6
def input_fn(filenames, tf_transform_dir, batch_size=200):
    """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of TFRecord files of transformed examples to read
      data from.
    tf_transform_dir: directory in which the tf-transform model was written
      during the preprocessing step.
    batch_size: int, first dimension size of the Tensors returned by input_fn.
  Returns:
    A (features, indices) tuple where features is a dictionary of
      Tensors, and indices is a single Tensor of label indices.
  """
    metadata_dir = os.path.join(tf_transform_dir,
                                transform_fn_io.TRANSFORMED_METADATA_DIR)
    transformed_metadata = metadata_io.read_metadata(metadata_dir)
    transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

    transformed_features = tf.contrib.learn.io.read_batch_features(
        filenames,
        batch_size,
        transformed_feature_spec,
        reader=_gzip_reader_fn)

    # We pop the label because we do not want to use it as a feature while we're
    # training.
    return transformed_features, transformed_features.pop(taxi.LABEL_KEY)
Example #7
def _input_fn(filenames, transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of TFRecord files of transformed examples to read
      data from.
    transform_output: directory in which the tf-transform model was written
      during the preprocessing step.
    batch_size: int, first dimension size of the Tensors returned by input_fn.

  Returns:
    A (features, indices) tuple where features is a dictionary of
      Tensors, and indices is a single Tensor of label indices.
  """
  metadata_dir = os.path.join(transform_output,
                              transform_fn_io.TRANSFORMED_METADATA_DIR)
  transformed_metadata = metadata_io.read_metadata(metadata_dir)
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  transformed_features = tf.contrib.learn.io.read_batch_features(
      filenames,
      batch_size,
      transformed_feature_spec,
      reader=_gzip_reader_fn)

  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(
      _transformed_name(_LABEL_KEY))
Example #8
def _make_training_input_fn(working_dir, filebase, batch_size):
    """Creates an input function reading from transformed data.

  Args:
    working_dir: Directory to read transformed data and metadata from and to
        write exported model to.
    filebase: Base filename (relative to `working_dir`) of examples.
    batch_size: Batch size.

  Returns:
    The input function for training or eval.
  """
    transformed_metadata = metadata_io.read_metadata(
        os.path.join(working_dir, tft.TRANSFORMED_METADATA_DIR))
    transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

    def input_fn():
        """Input function for training and eval."""
        transformed_features = tf.contrib.learn.io.read_batch_features(
            os.path.join(working_dir, filebase + '*'), batch_size,
            transformed_feature_spec, tf.TFRecordReader)

        # Extract features and label from the transformed tensors.
        transformed_labels = transformed_features.pop(LABEL_KEY)

        return transformed_features, transformed_labels

    return input_fn
Example #9
def make_serving_input_fn_for_base64_json(args):
    raw_metadata = metadata_io.read_metadata(
        os.path.join(args['metadata_path'], 'rawdata_metadata'))
    transform_savedmodel_dir = (os.path.join(args['metadata_path'],
                                             'transform_fn'))
    return input_fn_maker.build_parsing_transforming_serving_input_receiver_fn(
        raw_metadata, transform_savedmodel_dir, exclude_raw_keys=[LABEL_COL])
Example #10
def make_training_input_fn(transformed_data_dir,
                           mode,
                           batch_size,
                           target_name,
                           num_epochs=None):
    """Creates an input function reading from transformed data.
  Args:
    transformed_data_dir: Directory to read transformed data and metadata from.
    mode: 'train' or 'eval'.
    batch_size: Batch size.
    target_name: name of the target column.
    num_epochs: number of training data epochs.
  Returns:
    The input function for training or eval.
  """
    transformed_metadata = metadata_io.read_metadata(
        os.path.join(transformed_data_dir,
                     transform_fn_io.TRANSFORMED_METADATA_DIR))
    transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

    def _input_fn():
        """Input function for training and eval."""
        epochs = 1 if mode == 'eval' else num_epochs
        transformed_features = tf.contrib.learn.io.read_batch_features(
            os.path.join(transformed_data_dir, mode + '-*'),
            batch_size,
            transformed_feature_spec,
            tf.TFRecordReader,
            num_epochs=epochs)

        # Extract features and label from the transformed tensors.
        transformed_labels = transformed_features.pop(target_name)
        return transformed_features, transformed_labels

    return _input_fn
Example #11
 def test_read_features(self):
     # TODO(b/123241798): use TEST_TMPDIR
     basedir = tempfile.mkdtemp()
     schema_no_sparse_features = """
 {
   "feature": [{
     "name": "my_key",
     "fixedShape": {
       "axis": [{
         "size": 2
       }]
     },
     "type": "INT",
     "domain": {
       "ints": {}
     },
     "parsingOptions": {
       "tfOptions": {
         "fixedLenFeature": {}
       }
     }
   }]
 }
 """
     self._write_schema_to_disk(basedir, schema_no_sparse_features)
     _ = metadata_io.read_metadata(basedir)
Example #12
def run(args):
    #config = tf.estimator.RunConfig(save_checkpoints_steps=10)
    feature_spec = metadata_io.read_metadata(
        posixpath.join(
            args.input_dir,
            constants.TRANSFORMED_METADATA_DIR)).schema.as_feature_spec()
    train_input_fn = get_input_fn("{}*".format(
        posixpath.join(args.input_dir,
                       constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX)),
                                  feature_spec,
                                  num_epochs=args.num_epochs,
                                  batch_size=args.batch_size)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=args.train_steps)
    eval_input_fn = get_input_fn(posixpath.join(
        args.input_dir, constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
                                 feature_spec,
                                 num_epochs=1,
                                 batch_size=args.batch_size)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn
        #exporters=tf.estimator.FinalExporter(
        #  name='export',
        #  serving_input_receiver_fn=get_serving_input_fn(args.input_dir)
        #)
    )
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=_get_feature_columns(), model_dir=args.model_dir)
    tf.estimator.train_and_evaluate(linear_regressor, train_spec, eval_spec)
Example #13
    def testWriteTransformFn(self):
        path = os.path.join(self.get_temp_dir(), 'output')

        with beam.Pipeline() as pipeline:
            # Create an empty directory for the source saved model dir.
            saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
            file_io.recursive_create_dir(saved_model_dir)
            saved_model_dir_pcoll = (
                pipeline
                | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
            metadata = _TEST_METADATA
            deferred_metadata = (
                pipeline
                | 'CreateEmptyProperties' >> beam.Create([_FUTURES_DICT]))

            _ = ((saved_model_dir_pcoll, (metadata, deferred_metadata))
                 | transform_fn_io.WriteTransformFn(path))

        transformed_metadata_dir = os.path.join(path, 'transformed_metadata')
        metadata = metadata_io.read_metadata(transformed_metadata_dir)
        self.assertEqual(metadata, _TEST_METADATA)

        transform_fn_dir = os.path.join(path, 'transform_fn')
        self.assertTrue(file_io.file_exists(transform_fn_dir))
        self.assertTrue(file_io.is_directory(transform_fn_dir))
Example #14
    def testWriteMetadataIsRetryable(self):
        tft_test_case.skip_if_external_environment(
            'Retries are currently not available on this environment.')
        original_write_metadata = beam_metadata_io.metadata_io.write_metadata
        write_metadata_called_list = []

        def mock_write_metadata(metadata, path):
            """Mocks metadata_io.write_metadata to fail the first time it is called by this test, thus forcing a retry which should succeed."""
            if not write_metadata_called_list:
                write_metadata_called_list.append(True)
                original_write_metadata(metadata, path)
                raise ArithmeticError('Some error')
            return original_write_metadata(metadata, path)

        # Write metadata to disk using WriteMetadata PTransform.
        with mock.patch(
                'tensorflow_transform.tf_metadata.metadata_io.write_metadata',
                mock_write_metadata):
            with self._makeTestPipeline() as pipeline:
                path = self.get_temp_dir()
                _ = (test_metadata.COMPLETE_METADATA
                     | beam_metadata_io.WriteMetadata(path, pipeline))

            # Load from disk and check that it is as expected.
            metadata = metadata_io.read_metadata(path)
            self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
Example #15
  def make_experiment(output_dir):
    """Function that creates an experiment http://goo.gl/HcKHlT.

    Args:
      output_dir: The directory where the training output should be written.
    Returns:
      A `tf.contrib.learn.Experiment`.
    """

    estimator = tf.contrib.learn.Estimator(
        model_fn=model_builder(hparams=args),
        model_dir=output_dir)

    train_input_fn = make_input_fn(
        mode=tf.contrib.learn.ModeKeys.TRAIN,
        eval_type=args.eval_type,
        data_file_pattern=args.train_data_paths,
        randomize_input=args.randomize_input,
        batch_size=args.batch_size,
        queue_capacity=4 * args.batch_size)

    eval_input_fn = make_input_fn(
        mode=tf.contrib.learn.ModeKeys.EVAL,
        eval_type=args.eval_type,
        data_file_pattern=args.eval_data_paths,
        batch_size=args.eval_batch_size,
        queue_capacity=4 * args.eval_batch_size)

    raw_metadata = metadata_io.read_metadata(args.raw_metadata_path)
    # Neither ratings nor candidate features are needed for serving.
    raw_label_keys = [LABEL_RATING_SCORE]
    # For serving, we only need query features.
    raw_feature_keys = [QUERY_RATED_MOVIE_IDS,
                        QUERY_RATED_MOVIE_SCORES,
                        QUERY_RATED_GENRE_IDS,
                        QUERY_RATED_GENRE_FREQS,
                        QUERY_RATED_GENRE_AVG_SCORES]
    serving_input_fn = (
        input_fn_maker.build_parsing_transforming_serving_input_fn(
            raw_metadata,
            args.transform_savedmodel,
            raw_label_keys=raw_label_keys,
            raw_feature_keys=raw_feature_keys))

    export_strategy = tf.contrib.learn.utils.make_export_strategy(
        serving_input_fn,
        default_output_alternative_key=DEFAULT_OUTPUT_ALTERNATIVE)

    return tf.contrib.learn.Experiment(
        estimator=estimator,
        train_steps=(args.train_steps or
                     args.num_epochs * args.train_set_size // args.batch_size),
        eval_steps=args.eval_steps,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        eval_metrics=create_evaluation_metrics(args.eval_type),
        export_strategies=[export_strategy],
        # Do not remove; this is needed until b/36498507 is fixed.
        min_eval_frequency=1000)
Example #16
def make_serving_input_fn_for_base64_json(args):
    raw_metadata = metadata_io.read_metadata(
        os.path.join(args['metadata_path'], 'rawdata_metadata'))
    transform_savedmodel_dir = (
        os.path.join(args['metadata_path'], 'transform_fn'))
    return input_fn_maker.build_parsing_transforming_serving_input_receiver_fn(
        raw_metadata,
        transform_savedmodel_dir,
        exclude_raw_keys=[LABEL_COLUMN])
Example #17
 def testWriteMetadataNonDeferred(self):
     # Write properties as metadata to disk.
     with beam.Pipeline() as pipeline:
         path = self.get_temp_dir()
         _ = (_TEST_METADATA_COMPLETE
              | beam_metadata_io.WriteMetadata(path, pipeline))
     # Load from disk and check that it is as expected.
     metadata = metadata_io.read_metadata(path)
     self.assertMetadataEqual(metadata, _TEST_METADATA_COMPLETE)
Example #18
def parse_tf_example(tf_example):
    transformed_metadata = metadata_io.read_metadata(
        os.path.join(params.Params.TRANSFORM_ARTIFACTS_DIR,
                     "transformed_metadata"))
    transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

    parsed_features = tf.parse_example(serialized=tf_example,
                                       features=transformed_feature_spec)
    target = parsed_features.pop(metadata.TARGET_FEATURE_NAME)

    return parsed_features, target
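
A small sketch of applying parse_tf_example over TFRecord data with tf.data; the file pattern and batch size are assumptions. Because tf.parse_example expects a batch of serialized protos, the dataset is batched before the map.

# Hypothetical usage; file pattern and batch size are assumptions.
def make_dataset(file_pattern="transformed_examples-*", batch_size=64):
    dataset = tf.data.TFRecordDataset(tf.gfile.Glob(file_pattern))
    # tf.parse_example parses a batch of serialized Examples, so batch first.
    dataset = dataset.batch(batch_size).map(parse_tf_example)
    return dataset  # yields (features_dict, target) tuples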
Example #19
def _build_estimator(transform_output,
                     config,
                     hidden_units=None,
                     warm_start_from=None):
  """Build an estimator for predicting the tipping behavior of taxi riders.

  Args:
    transform_output: directory in which the tf-transform model was written
      during the preprocessing step.
    config: tf.contrib.learn.RunConfig defining the runtime environment for the
      estimator (including model_dir).
    hidden_units: [int], the layer sizes of the DNN (input layer first)
    warm_start_from: Optional directory to warm start from.

  Returns:
    The DNNLinearCombinedClassifier that will be used for training and eval.
  """
  metadata_dir = os.path.join(transform_output,
                              transform_fn_io.TRANSFORMED_METADATA_DIR)
  transformed_metadata = metadata_io.read_metadata(metadata_dir)
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  transformed_feature_spec.pop(_transformed_name(_LABEL_KEY))

  real_valued_columns = [
      tf.feature_column.numeric_column(key, shape=())
      for key in _transformed_names(_DENSE_FLOAT_FEATURE_KEYS)
  ]
  categorical_columns = [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=_VOCAB_SIZE + _OOV_SIZE, default_value=0)
      for key in _transformed_names(_VOCAB_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=_FEATURE_BUCKET_COUNT, default_value=0)
      for key in _transformed_names(_BUCKET_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(  # pylint: disable=g-complex-comprehension
          key,
          num_buckets=num_buckets,
          default_value=0) for key, num_buckets in zip(
              _transformed_names(_CATEGORICAL_FEATURE_KEYS),
              _MAX_CATEGORICAL_FEATURE_VALUES)
  ]
  return tf.estimator.DNNLinearCombinedClassifier(
      config=config,
      linear_feature_columns=categorical_columns,
      dnn_feature_columns=real_valued_columns,
      dnn_hidden_units=hidden_units or [100, 70, 50, 25],
      warm_start_from=warm_start_from)
Example #20
    def make_experiment(output_dir):
        """Function that creates an experiment http://goo.gl/HcKHlT.

    Args:
      output_dir: The directory where the training output should be written.
    Returns:
      A `tf.contrib.learn.Experiment`.
    """

        estimator = tf.contrib.learn.Estimator(
            model_fn=model_builder(hparams=args), model_dir=output_dir)

        train_input_fn = make_input_fn(mode=tf.contrib.learn.ModeKeys.TRAIN,
                                       eval_type=args.eval_type,
                                       data_file_pattern=args.train_data_paths,
                                       randomize_input=args.randomize_input,
                                       batch_size=args.batch_size,
                                       queue_capacity=4 * args.batch_size)

        eval_input_fn = make_input_fn(mode=tf.contrib.learn.ModeKeys.EVAL,
                                      eval_type=args.eval_type,
                                      data_file_pattern=args.eval_data_paths,
                                      batch_size=args.eval_batch_size,
                                      queue_capacity=4 * args.eval_batch_size)

        raw_metadata = metadata_io.read_metadata(args.raw_metadata_path)
        # Neither ratings nor candidate features are needed for serving.
        raw_label_keys = [LABEL_RATING_SCORE]
        # For serving, we only need query features.
        raw_feature_keys = [
            QUERY_RATED_MOVIE_IDS, QUERY_RATED_MOVIE_SCORES,
            QUERY_RATED_GENRE_IDS, QUERY_RATED_GENRE_FREQS,
            QUERY_RATED_GENRE_AVG_SCORES
        ]
        serving_input_fn = (
            input_fn_maker.build_parsing_transforming_serving_input_fn(
                raw_metadata,
                args.transform_savedmodel,
                raw_label_keys=raw_label_keys,
                raw_feature_keys=raw_feature_keys))

        export_strategy = tf.contrib.learn.utils.make_export_strategy(
            serving_input_fn,
            default_output_alternative_key=DEFAULT_OUTPUT_ALTERNATIVE)

        return tf.contrib.learn.Experiment(
            estimator=estimator,
            train_steps=(args.train_steps or args.num_epochs *
                         args.train_set_size // args.batch_size),
            eval_steps=args.eval_steps,
            train_input_fn=train_input_fn,
            eval_input_fn=eval_input_fn,
            eval_metrics=create_evaluation_metrics(args.eval_type),
            export_strategies=[export_strategy],
            # Do not remove; this is needed until b/36498507 is fixed.
            min_eval_frequency=1000)
Example #21
def _build_estimator(transform_output,
                     config,
                     hidden_units=None,
                     warm_start_from=None):
  """Build an estimator for predicting the tipping behavior of taxi riders.

  Args:
    transform_output: directory in which the tf-transform model was written
      during the preprocessing step.
    config: tf.contrib.learn.RunConfig defining the runtime environment for the
      estimator (including model_dir).
    hidden_units: [int], the layer sizes of the DNN (input layer first)
    warm_start_from: Optional directory to warm start from.

  Returns:
    The DNNLinearCombinedClassifier that will be used for training and eval.
  """
  metadata_dir = os.path.join(transform_output,
                              transform_fn_io.TRANSFORMED_METADATA_DIR)
  transformed_metadata = metadata_io.read_metadata(metadata_dir)
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  transformed_feature_spec.pop(_transformed_name(_LABEL_KEY))

  real_valued_columns = [
      tf.feature_column.numeric_column(key, shape=())
      for key in _transformed_names(_DENSE_FLOAT_FEATURE_KEYS)
  ]
  categorical_columns = [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=_VOCAB_SIZE + _OOV_SIZE, default_value=0)
      for key in _transformed_names(_VOCAB_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=_FEATURE_BUCKET_COUNT, default_value=0)
      for key in _transformed_names(_BUCKET_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(  # pylint: disable=g-complex-comprehension
          key,
          num_buckets=num_buckets,
          default_value=0) for key, num_buckets in zip(
              _transformed_names(_CATEGORICAL_FEATURE_KEYS),
              _MAX_CATEGORICAL_FEATURE_VALUES)
  ]
  return tf.estimator.DNNLinearCombinedClassifier(
      config=config,
      linear_feature_columns=categorical_columns,
      dnn_feature_columns=real_valued_columns,
      dnn_hidden_units=hidden_units or [100, 70, 50, 25],
      warm_start_from=warm_start_from)
Example #22
    def testWriteMetadataNonDeferred(self):
        # Write metadata to disk using WriteMetadata PTransform.
        with beam.Pipeline() as pipeline:
            path = self.get_temp_dir()
            _ = (test_metadata.COMPLETE_METADATA
                 | beam_metadata_io.WriteMetadata(path, pipeline))

        # Load from disk and check that it is as expected.
        metadata = metadata_io.read_metadata(path)
        self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
Example #23
    def test_write_and_read(self):
        # TODO(b/123241798): use TEST_TMPDIR
        basedir = tempfile.mkdtemp()
        original = dataset_metadata.DatasetMetadata(
            schema=test_common.get_test_schema())

        metadata_io.write_metadata(original, basedir)
        reloaded = metadata_io.read_metadata(basedir)

        self.assertEqual(original, reloaded)
Example #24
 def testWriteMetadataDeferredProperties(self):
     # Write deferred properties as metadata to disk.
     with beam.Pipeline() as pipeline:
         path = self.get_temp_dir()
         deferred_metadata = pipeline | beam.Create([_FUTURES_DICT])
         _ = ((_TEST_METADATA_WITH_FUTURES, deferred_metadata)
              | beam_metadata_io.WriteMetadata(path, pipeline))
     # Load from disk and check that it is as expected.
     metadata = metadata_io.read_metadata(path)
     self.assertMetadataEqual(metadata, _TEST_METADATA)
Example #25
 def testWriteMetadataNonDeferredEmptyDict(self):
     # Write properties as metadata to disk.
     with beam.Pipeline() as pipeline:
         path = self.get_temp_dir()
         property_pcoll = pipeline | beam.Create([{}])
         _ = ((_TEST_METADATA, property_pcoll)
              | beam_metadata_io.WriteMetadata(path, pipeline))
     # Load from disk and check that it is as expected.
     metadata = metadata_io.read_metadata(path)
     self.assertMetadataEqual(metadata, _TEST_METADATA)
Example #26
 def expand(self, pvalue):
     # Read metadata in non-deferred manner.  Note that since this reads the
     # whole metadata in a non-deferred manner, typically the roundtrip
     #
     # done = metadata | WriteMetadata(path)
     # metadata = p | ReadMetadata(path).must_follow(done)
     #
     # will fail as the metadata on disk will not be complete when the read is
     # done.
     return metadata_io.read_metadata(self._path)
Example #27
    def test_write_and_read(self):
        basedir = tempfile.mkdtemp()
        original = dataset_metadata.DatasetMetadata(
            schema=test_common.get_test_schema())

        metadata_io.write_metadata(original, basedir, versions=_test_versions)
        reloaded = metadata_io.read_metadata(basedir, versions=_test_versions)

        generated_feature_spec = reloaded.schema.as_feature_spec()
        self.assertEqual(test_common.test_feature_spec, generated_feature_spec)
Example #28
    def expand(self, pvalue):
        transform_fn_path = os.path.join(self._path, TRANSFORM_FN_DIR)
        saved_model_dir_pcoll = (
            pvalue.pipeline
            | 'CreateTransformFnPath' >> beam.Create([transform_fn_path]))

        metadata = metadata_io.read_metadata(
            os.path.join(self._path, TRANSFORMED_METADATA_DIR))

        return saved_model_dir_pcoll, metadata
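
A hedged sketch of applying a PTransform built around the expand() above inside a Beam pipeline; the class name ReadTransformFn and the output path are assumptions inferred from the snippet, not confirmed by it.

# Hypothetical usage; ReadTransformFn is assumed to be the PTransform that
# contains the expand() above, and the path is a placeholder.
with beam.Pipeline() as pipeline:
    # The pipe returns what expand() returns here: a (saved_model_dir
    # PCollection, DatasetMetadata) pair.
    transform_fn = (
        pipeline | 'ReadTransformFn' >> ReadTransformFn('/tmp/tft_output'))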
Example #29
    def test_write_and_read(self):
        # TODO(b/123241798): use TEST_TMPDIR
        basedir = tempfile.mkdtemp()
        original = dataset_metadata.DatasetMetadata(
            schema=test_common.get_test_schema())

        metadata_io.write_metadata(original, basedir)
        reloaded = metadata_io.read_metadata(basedir)

        generated_feature_spec = reloaded.schema.as_feature_spec()
        self.assertEqual(test_common.test_feature_spec, generated_feature_spec)
Example #30
  def test_write_and_read(self):
    basedir = tempfile.mkdtemp()
    original_schema = schema_io_vtest.TestSchema(
        {'test_feature_1': 'bogus 1', 'test_feature_2': 'bogus 2'})
    original = dataset_metadata.DatasetMetadata(schema=original_schema)

    metadata_io.write_metadata(original, basedir, versions=_test_versions)
    reloaded = metadata_io.read_metadata(basedir, versions=_test_versions)

    self.assertTrue('test_feature_1' in reloaded.schema.column_schemas)
    self.assertTrue('test_feature_2' in reloaded.schema.column_schemas)
    self.assertEqual(2, len(reloaded.schema.column_schemas))
Example #31
    def test_read_with_invalid_keys(self):
        basedir = tempfile.mkdtemp()
        version_basedir = os.path.join(basedir, 'v1-json')

        # Write a proto by hand to disk
        file_io.recursive_create_dir(version_basedir)
        file_io.write_string_to_file(
            os.path.join(version_basedir, 'schema.json'),
            _SCHEMA_WITH_INVALID_KEYS)

        with self.assertRaisesRegexp(
                ValueError, 'Keys of dense and sparse features overlapped.*'):
            _ = metadata_io.read_metadata(basedir, versions=_test_versions)
Example #32
    def expand(self, pvalue):
        transform_fn_path = os.path.join(self._path, 'transform_fn')
        saved_model_dir_pcoll = (
            pvalue.pipeline
            | 'CreateTransformFnPath' >> beam.Create([transform_fn_path]))

        metadata = metadata_io.read_metadata(
            os.path.join(self._path, 'transformed_metadata'))
        deferred_metadata = (
            pvalue.pipeline
            | 'CreateEmptyDeferredMetadata' >> beam.Create([{}]))

        return saved_model_dir_pcoll, (metadata, deferred_metadata)
Example #33
def train_and_evaluate(transformed_train_filepattern,
                       transformed_test_filepattern, transformed_metadata_dir,
                       num_train_instances=NUM_TRAIN_INSTANCES,
                       num_test_instances=NUM_TEST_INSTANCES):
  """Train the model on training data and evaluate on test data.

  Args:
    transformed_train_filepattern: File pattern for transformed training data
        shards
    transformed_test_filepattern: File pattern for transformed test data shards
    transformed_metadata_dir: Directory containing transformed data metadata
    num_train_instances: Number of instances in train set
    num_test_instances: Number of instances in test set

  Returns:
    The results from the estimator's 'evaluate' method
  """

  # Wrap scalars as real valued columns.
  real_valued_columns = [feature_column.real_valued_column(key)
                         for key in NUMERIC_COLUMNS]

  # Wrap categorical columns.  Note the combiner is irrelevant since the input
  # only has one value set per feature per instance.
  one_hot_columns = [
      feature_column.sparse_column_with_integerized_feature(
          key, bucket_size=bucket_size, combiner='sum')
      for key, bucket_size in zip(CATEGORICAL_COLUMNS, BUCKET_SIZES)]

  estimator = learn.LinearClassifier(real_valued_columns + one_hot_columns)

  transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
  train_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_train_filepattern,
      training_batch_size=TRAIN_BATCH_SIZE,
      label_keys=[LABEL_COLUMN])

  # Estimate the model using the default optimizer.
  estimator.fit(
      input_fn=train_input_fn,
      max_steps=TRAIN_NUM_EPOCHS * num_train_instances / TRAIN_BATCH_SIZE)

  # Evaluate model on test dataset.
  eval_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_test_filepattern,
      training_batch_size=1,
      label_keys=[LABEL_COLUMN])

  return estimator.evaluate(input_fn=eval_input_fn, steps=num_test_instances)
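
A one-call usage sketch for train_and_evaluate above; the file patterns and metadata directory are placeholder paths, not from the original example.

# Hypothetical invocation; the paths below are placeholders.
results = train_and_evaluate(
    transformed_train_filepattern='/tmp/tft_output/train_transformed-*',
    transformed_test_filepattern='/tmp/tft_output/test_transformed-*',
    transformed_metadata_dir='/tmp/tft_output/transformed_metadata')
print(results)  # metrics dict returned by the estimator's evaluate() call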
Example #34
  def raw_metadata(self):
    """A DatasetMetadata.

    Note: raw_metadata is not guaranteed to exist in the output of tf.transform,
    so reading it could fail if raw_metadata is not present in the
    TFTransformOutput.

    Returns:
      A DatasetMetadata
    """
    if self._raw_metadata is None:
      self._raw_metadata = metadata_io.read_metadata(
          os.path.join(self._transform_output_dir, self.RAW_METADATA_DIR))
    return self._raw_metadata
Example #35
def build_estimator(tf_transform_dir, config, hidden_units=None):
  """Build an estimator for predicting the tipping behavior of taxi riders.

  Args:
    tf_transform_dir: directory in which the tf-transform model was written
      during the preprocessing step.
    config: tf.contrib.learn.RunConfig defining the runtime environment for the
      estimator (including model_dir).
    hidden_units: [int], the layer sizes of the DNN (input layer first)

  Returns:
    Resulting DNNLinearCombinedClassifier.
  """
  metadata_dir = os.path.join(tf_transform_dir,
                              transform_fn_io.TRANSFORMED_METADATA_DIR)
  transformed_metadata = metadata_io.read_metadata(metadata_dir)
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  transformed_feature_spec.pop(taxi.transformed_name(taxi.LABEL_KEY))

  real_valued_columns = [
      tf.feature_column.numeric_column(key, shape=())
      for key in taxi.transformed_names(taxi.DENSE_FLOAT_FEATURE_KEYS)
  ]
  categorical_columns = [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=taxi.VOCAB_SIZE + taxi.OOV_SIZE, default_value=0)
      for key in taxi.transformed_names(taxi.VOCAB_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=taxi.FEATURE_BUCKET_COUNT, default_value=0)
      for key in taxi.transformed_names(taxi.BUCKET_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=num_buckets, default_value=0)
      for key, num_buckets in zip(
          taxi.transformed_names(taxi.CATEGORICAL_FEATURE_KEYS),  #
          taxi.MAX_CATEGORICAL_FEATURE_VALUES)
  ]
  return tf.estimator.DNNLinearCombinedClassifier(
      config=config,
      linear_feature_columns=categorical_columns,
      dnn_feature_columns=real_valued_columns,
      dnn_hidden_units=hidden_units or [100, 70, 50, 25])
Example #36
def read_dataset(args, mode):
    batch_size = args['train_batch_size']
    if mode == tf.estimator.ModeKeys.TRAIN:
        input_paths = args['train_data_paths']
    else:
        input_paths = args['eval_data_paths']

    transformed_metadata = metadata_io.read_metadata(
        os.path.join(args['metadata_path'], 'transformed_metadata'))

    return input_fn_maker.build_training_input_fn(
        metadata=transformed_metadata,
        file_pattern=(
            input_paths[0] if len(input_paths) == 1 else input_paths),
        training_batch_size=batch_size,
        label_keys=[LABEL_COLUMN],
        reader=gzip_reader_fn,
        key_feature_name=KEY_FEATURE_COLUMN,
        randomize_input=(mode != tf.estimator.ModeKeys.EVAL),
        num_epochs=(1 if mode == tf.estimator.ModeKeys.EVAL else None))
Example #37
  def get_experiment(output_dir):
    """Function that creates an experiment http://goo.gl/HcKHlT.

    Args:
      output_dir: The directory where the training output should be written.
    Returns:
      A `tf.contrib.learn.Experiment`.
    """

    columns = feature_columns(args.model_type, vocab_sizes, use_crosses)

    runconfig = tf.contrib.learn.RunConfig()
    cluster = runconfig.cluster_spec
    num_table_shards = max(1, runconfig.num_ps_replicas * 3)
    num_partitions = max(1, 1 + cluster.num_tasks('worker') if cluster and
                         'worker' in cluster.jobs else 0)

    model_dir = os.path.join(output_dir, MODEL_DIR)
    if args.model_type == LINEAR:
      estimator = tf.contrib.learn.LinearRegressor(
          model_dir=model_dir,
          feature_columns=columns,
          optimizer=tf.contrib.linear_optimizer.SDCAOptimizer(
              example_id_column=KEY_FEATURE_COLUMN,
              symmetric_l2_regularization=args.l2_regularization,
              num_loss_partitions=num_partitions,  # workers
              num_table_shards=num_table_shards))  # ps
    elif args.model_type == DEEP:
      estimator = tf.contrib.learn.DNNRegressor(
          hidden_units=args.hidden_units,
          feature_columns=columns,
          model_dir=model_dir)

    transformed_metadata = metadata_io.read_metadata(
        args.transformed_metadata_path)
    raw_metadata = metadata_io.read_metadata(args.raw_metadata_path)
    serving_input_fn = (
        input_fn_maker.build_parsing_transforming_serving_input_fn(
            raw_metadata,
            args.transform_savedmodel,
            raw_label_keys=[TARGET_FEATURE_COLUMN]))
    export_strategy = tf.contrib.learn.utils.make_export_strategy(
        serving_input_fn, exports_to_keep=5,
        default_output_alternative_key=None)

    train_input_fn = get_transformed_reader_input_fn(
        transformed_metadata, args.train_data_paths, args.batch_size,
        tf.contrib.learn.ModeKeys.TRAIN)

    eval_input_fn = get_transformed_reader_input_fn(
        transformed_metadata, args.eval_data_paths, args.batch_size,
        tf.contrib.learn.ModeKeys.EVAL)

    return tf.contrib.learn.Experiment(
        estimator=estimator,
        train_steps=(args.train_steps or
                     args.num_epochs * args.train_set_size // args.batch_size),
        eval_steps=args.eval_steps,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        export_strategies=export_strategy,
        min_eval_frequency=500)