Example #1
def train_and_evaluate(transformed_train_filepattern,
                       transformed_test_filepattern, transformed_metadata_dir,
                       num_train_instances=NUM_TRAIN_INSTANCES,
                       num_test_instances=NUM_TEST_INSTANCES):
  """Train the model on training data and evaluate on test data.

  Args:
    transformed_train_filepattern: File pattern for transformed training data
        shards
    transformed_test_filepattern: File pattern for transformed test data shards
    transformed_metadata_dir: Directory containing transformed data metadata
    num_train_instances: Number of instances in train set
    num_test_instances: Number of instances in test set

  Returns:
    The results from the estimator's 'evaluate' method
  """

  # Wrap scalars as real valued columns.
  real_valued_columns = [feature_column.real_valued_column(key)
                         for key in NUMERIC_COLUMNS]

  # Wrap categorical columns.  Note the combiner is irrelevant since the input
  # only has one value set per feature per instance.
  one_hot_columns = [
      feature_column.sparse_column_with_integerized_feature(
          key, bucket_size=bucket_size, combiner='sum')
      for key, bucket_size in zip(CATEGORICAL_COLUMNS, BUCKET_SIZES)]

  estimator = learn.LinearClassifier(real_valued_columns + one_hot_columns)

  transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
  train_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_train_filepattern,
      training_batch_size=TRAIN_BATCH_SIZE,
      label_keys=[LABEL_COLUMN])

  # Estimate the model using the default optimizer.
  estimator.fit(
      input_fn=train_input_fn,
      max_steps=TRAIN_NUM_EPOCHS * num_train_instances / TRAIN_BATCH_SIZE)

  # Evaluate model on test dataset.
  eval_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_test_filepattern,
      training_batch_size=1,
      label_keys=[LABEL_COLUMN])

  return estimator.evaluate(input_fn=eval_input_fn, steps=num_test_instances)
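The max_steps passed to fit above works out to epochs * training instances / batch size; on Python 3 the bare `/` yields a float, so integer division is the safer spelling of the same arithmetic. A short sketch with purely illustrative values (the sample defines its own constants):

TRAIN_NUM_EPOCHS = 1          # illustrative placeholder
NUM_TRAIN_INSTANCES = 32561   # illustrative placeholder
TRAIN_BATCH_SIZE = 128        # illustrative placeholder

max_steps = TRAIN_NUM_EPOCHS * NUM_TRAIN_INSTANCES // TRAIN_BATCH_SIZE
print(max_steps)  # 254 whole training batches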
Example #2
def train_and_evaluate(transformed_train_filepattern,
                       transformed_test_filepattern,
                       transformed_metadata_dir,
                       num_train_instances=NUM_TRAIN_INSTANCES,
                       num_test_instances=NUM_TEST_INSTANCES):
    """Train the model on training data and evaluate on evaluation data.

  Args:
    transformed_train_filepattern: Base filename for transformed training data
        shards
    transformed_test_filepattern: Base filename for transformed evaluation data
        shards
    transformed_metadata_dir: Directory containing transformed data metadata
    num_train_instances: Number of instances in train set
    num_test_instances: Number of instances in test set

  Returns:
    The results from the estimator's 'evaluate' method
  """
    # Unrecognized tokens are represented by -1, but
    # sparse_column_with_integerized_feature uses the mod operator to map integers
    # to the range [0, bucket_size).  By choosing bucket_size=VOCAB_SIZE + 1, we
    # represent unrecognized tokens as VOCAB_SIZE.
    review_column = feature_column.sparse_column_with_integerized_feature(
        REVIEW_COLUMN, bucket_size=VOCAB_SIZE + 1, combiner='sum')
    weighted_reviews = feature_column.weighted_sparse_column(
        review_column, REVIEW_WEIGHT)

    estimator = learn.LinearClassifier([weighted_reviews])

    transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
    train_input_fn = input_fn_maker.build_training_input_fn(
        transformed_metadata,
        transformed_train_filepattern,
        training_batch_size=TRAIN_BATCH_SIZE,
        label_keys=[LABEL_COLUMN])

    # Estimate the model using the default optimizer.
    estimator.fit(input_fn=train_input_fn,
                  max_steps=TRAIN_NUM_EPOCHS * num_train_instances /
                  TRAIN_BATCH_SIZE)

    # Evaluate model on eval dataset.
    eval_input_fn = input_fn_maker.build_training_input_fn(
        transformed_metadata,
        transformed_test_filepattern,
        training_batch_size=1,
        label_keys=[LABEL_COLUMN])

    return estimator.evaluate(input_fn=eval_input_fn, steps=num_test_instances)
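The comment in this example explains the out-of-vocabulary trick: sparse_column_with_integerized_feature maps each id to id % bucket_size, so with bucket_size = VOCAB_SIZE + 1 the conventional unknown id of -1 lands on the extra bucket VOCAB_SIZE while in-vocabulary ids are unchanged. A plain-Python sketch of that arithmetic (VOCAB_SIZE here is an illustrative placeholder):

VOCAB_SIZE = 10                 # placeholder; the sample uses its own vocabulary size
bucket_size = VOCAB_SIZE + 1

assert -1 % bucket_size == VOCAB_SIZE                        # unknown token -> last bucket
assert all(i % bucket_size == i for i in range(VOCAB_SIZE))  # known ids map to themselves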
Example #3
def train_and_evaluate(transformed_train_data_base, transformed_eval_data_base,
                       transformed_metadata_dir):
    """Train the model on training data and evaluate on evaluation data.

  Args:
    transformed_train_data_base: Base filename for transformed training data
        shards
    transformed_eval_data_base: Base filename for cleaned evaluation data
        shards
    transformed_metadata_dir: Directory containing transformed data metadata.

  Returns:
    The results from the estimator's 'evaluate' method.
  """

    # Wrap scalars as real valued columns.
    real_valued_columns = [
        feature_column.real_valued_column(key) for key in NUMERIC_COLUMNS
    ]

    # Wrap categorical columns.
    one_hot_columns = [
        feature_column.sparse_column_with_integerized_feature(
            key, bucket_size=bucket_size)
        for key, bucket_size in zip(CATEGORICAL_COLUMNS, BUCKET_SIZES)
    ]

    estimator = learn.LinearClassifier(real_valued_columns + one_hot_columns)

    transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
    train_input_fn = input_fn_maker.build_training_input_fn(
        transformed_metadata,
        transformed_train_data_base + '*',
        training_batch_size=TRAIN_BATCH_SIZE,
        label_keys=['label'])

    # Estimate the model using the default optimizer.
    estimator.fit(input_fn=train_input_fn,
                  max_steps=TRAIN_NUM_EPOCHS * NUM_TRAIN_INSTANCES /
                  TRAIN_BATCH_SIZE)

    # Evaluate model on eval dataset.
    eval_input_fn = input_fn_maker.build_training_input_fn(
        transformed_metadata,
        transformed_eval_data_base + '*',
        training_batch_size=1,
        label_keys=['label'])

    return estimator.evaluate(input_fn=eval_input_fn, steps=NUM_EVAL_INSTANCES)
Example #4
    def train_and_evaluate(output_dir):
        review_column = feature_column.sparse_column_with_integerized_feature(
            const.REVIEW_COLUMN, bucket_size=vocab_size + 1, combiner='sum')
        weighted_reviews = feature_column.weighted_sparse_column(
            review_column, const.REVIEW_WEIGHT)

        estimator = learn.LinearClassifier(
            feature_columns=[weighted_reviews],
            n_classes=2,
            model_dir=output_dir,
            config=tf.contrib.learn.RunConfig(save_checkpoints_secs=30))

        transformed_metadata = metadata_io.read_metadata(
            transformed_metadata_dir)
        raw_metadata = metadata_io.read_metadata(raw_metadata_dir)

        train_input_fn = input_fn_maker.build_training_input_fn(
            transformed_metadata,
            transformed_train_file_pattern,
            training_batch_size=train_batch_size,
            label_keys=[const.LABEL_COLUMN])

        eval_input_fn = input_fn_maker.build_training_input_fn(
            transformed_metadata,
            transformed_test_file_pattern,
            training_batch_size=1,
            label_keys=[const.LABEL_COLUMN])

        serving_input_fn = input_fn_maker.build_default_transforming_serving_input_fn(
            raw_metadata=raw_metadata,
            transform_savedmodel_dir=output_dir + '/transform_fn',
            raw_label_keys=[],
            raw_feature_keys=[const.REVIEW_COLUMN])

        export_strategy = saved_model_export_utils.make_export_strategy(
            serving_input_fn,
            exports_to_keep=5,
            default_output_alternative_key=None)

        return tf.contrib.learn.Experiment(estimator=estimator,
                                           train_steps=train_num_epochs *
                                           num_train_instances /
                                           train_batch_size,
                                           eval_steps=num_test_instances,
                                           train_input_fn=train_input_fn,
                                           eval_input_fn=eval_input_fn,
                                           export_strategies=export_strategy,
                                           min_eval_frequency=500)
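train_and_evaluate here is a closure over output_dir that returns a tf.contrib.learn.Experiment, which is the shape the old learn_runner helper consumed. A hedged usage sketch, assuming the early learn_runner API that passed output_dir straight to the experiment function (the output directory is a placeholder, not from the original sample):

from tensorflow.contrib.learn.python.learn import learn_runner

# Builds the Experiment via train_and_evaluate(output_dir), then runs training and evaluation.
learn_runner.run(experiment_fn=train_and_evaluate, output_dir='/tmp/sentiment_model')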
Example #5
    def test_build_training_input_fn(self):
        # TODO(b/123241798): use TEST_TMPDIR
        basedir = tempfile.mkdtemp()

        # the transformed schema should be vectorized already.
        metadata = dataset_metadata.DatasetMetadata(
            schema=_make_transformed_schema([1]))
        data_file = os.path.join(basedir, 'data')
        examples = [
            _create_serialized_example(d) for d in [{
                'transformed_a': 15,
                'transformed_b': 6,
                'transformed_label': 77
            }, {
                'transformed_a': 12,
                'transformed_b': 17,
                'transformed_label': 44
            }]
        ]
        _write_tfrecord(data_file, examples)

        training_input_fn = (input_fn_maker.build_training_input_fn(
            metadata=metadata,
            file_pattern=[data_file],
            training_batch_size=128,
            label_keys=['transformed_label'],
            randomize_input=False))

        with tf.Graph().as_default():
            features, labels = training_input_fn()

            with tf.Session().as_default() as session:
                session.run(tf.initialize_all_variables())
                tf.train.start_queue_runners()
                transformed_a, transformed_b, transformed_label = session.run([
                    features['transformed_a'], features['transformed_b'],
                    labels
                ])

        self.assertEqual((128, 1), tuple(transformed_a.shape))
        self.assertEqual((128, 1), tuple(transformed_b.dense_shape))
        self.assertEqual((128, 1), tuple(transformed_label.shape))
        transformed_b_dict = dict(
            zip([tuple(x) for x in transformed_b.indices.tolist()],
                transformed_b.values.tolist()))

        self.assertEqual(15, transformed_a[0][0])
        self.assertEqual(6, transformed_b_dict[(0, 0)])
        self.assertEqual(77, transformed_label[0][0])
        self.assertEqual(12, transformed_a[1][0])
        self.assertEqual(17, transformed_b_dict[(1, 0)])
        self.assertEqual(44, transformed_label[1][0])
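The test above depends on a _create_serialized_example helper that is not shown. A minimal sketch of what such a helper plausibly looks like, assuming it packs a dict of ints into a serialized tf.train.Example (an assumption, not the original code):

import tensorflow as tf

def _create_serialized_example(feature_dict):
    # Assumed sketch: wrap each integer value in an Int64List feature and serialize.
    return tf.train.Example(features=tf.train.Features(feature={
        name: tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
        for name, value in feature_dict.items()
    })).SerializeToString()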
Example #6
def get_transformed_reader_input_fn(transformed_metadata,
                                    transformed_data_paths, batch_size, mode):
    """Wrap the get input features function to provide the runtime arguments."""
    return input_fn_maker.build_training_input_fn(
        metadata=transformed_metadata,
        file_pattern=(transformed_data_paths[0] if len(transformed_data_paths)
                      == 1 else transformed_data_paths),
        training_batch_size=batch_size,
        label_keys=[TARGET_FEATURE_COLUMN],
        reader=gzip_reader_fn,
        key_feature_name=KEY_FEATURE_COLUMN,
        reader_num_threads=4,
        queue_capacity=batch_size * 2,
        randomize_input=(mode != tf.contrib.learn.ModeKeys.EVAL),
        num_epochs=(1 if mode == tf.contrib.learn.ModeKeys.EVAL else None))
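Several of these snippets pass a gzip_reader_fn helper that is defined elsewhere in their repositories. Under the TF 1.x queue-based reader API that build_training_input_fn uses, such a helper is typically just a TFRecordReader configured for GZIP; a minimal sketch under that assumption (not the original code):

import tensorflow as tf  # TF 1.x APIs

def gzip_reader_fn():
    """Return a TFRecordReader that reads GZIP-compressed TFRecord shards."""
    return tf.TFRecordReader(
        options=tf.python_io.TFRecordOptions(
            compression_type=tf.python_io.TFRecordCompressionType.GZIP))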
Example #7
def get_transformed_reader_input_fn(transformed_metadata,
                                    transformed_data_paths,
                                    batch_size,
                                    mode):
  """Wrap the get input features function to provide the runtime arguments."""
  return input_fn_maker.build_training_input_fn(
      metadata=transformed_metadata,
      file_pattern=(
          transformed_data_paths[0] if len(transformed_data_paths) == 1
          else transformed_data_paths),
      training_batch_size=batch_size,
      label_keys=[TARGET_FEATURE_COLUMN],
      reader=gzip_reader_fn,
      key_feature_name=KEY_FEATURE_COLUMN,
      reader_num_threads=4,
      queue_capacity=batch_size * 2,
      randomize_input=(mode != tf.contrib.learn.ModeKeys.EVAL),
      num_epochs=(1 if mode == tf.contrib.learn.ModeKeys.EVAL else None))
Example #8
    def test_build_training_input_fn(self):
        basedir = tempfile.mkdtemp()

        metadata = dataset_metadata.DatasetMetadata(
            schema=_make_transformed_schema())
        data_file = os.path.join(basedir, 'data')
        examples = [
            _create_serialized_example(d) for d in [{
                'transformed_a': 15,
                'transformed_b': 5,
                'transformed_label': 77
            }, {
                'transformed_a': 12,
                'transformed_b': 17,
                'transformed_label': 44
            }]
        ]
        _write_tfrecord(data_file, examples)

        training_input_fn = (input_fn_maker.build_training_input_fn(
            metadata=metadata,
            file_pattern=[data_file],
            training_batch_size=128,
            label_keys=['transformed_label'],
            randomize_input=False))

        with tf.Graph().as_default():
            features, labels = training_input_fn()

            with tf.Session().as_default() as session:
                session.run(tf.initialize_all_variables())
                tf.train.start_queue_runners()
                transformed_a, transformed_b, transformed_label = session.run([
                    features['transformed_a'], features['transformed_b'],
                    labels
                ])

        self.assertEqual(15, transformed_a[0][0])
        self.assertEqual(5, transformed_b[0][0])
        self.assertEqual(77, transformed_label[0][0])
        self.assertEqual(12, transformed_a[1][0])
        self.assertEqual(17, transformed_b[1][0])
        self.assertEqual(44, transformed_label[1][0])
Example #9
def read_dataset(args, mode):
    batch_size = args['train_batch_size']
    if mode == tf.estimator.ModeKeys.TRAIN:
        input_paths = args['train_data_paths']
    else:
        input_paths = args['eval_data_paths']

    transformed_metadata = metadata_io.read_metadata(
        os.path.join(args['metadata_path'], 'transformed_metadata'))

    return input_fn_maker.build_training_input_fn(
        metadata=transformed_metadata,
        file_pattern=(input_paths[0]
                      if len(input_paths) == 1 else input_paths),
        training_batch_size=batch_size,
        label_keys=[LABEL_COLUMN],
        reader=gzip_reader_fn,
        key_feature_name=KEY_FEATURE_COLUMN,
        randomize_input=(mode != tf.estimator.ModeKeys.EVAL),
        num_epochs=(1 if mode == tf.estimator.ModeKeys.EVAL else None))
Example #10
def read_dataset(args, mode):
    batch_size = args['train_batch_size']
    if mode == tf.estimator.ModeKeys.TRAIN:
        input_paths = args['train_data_paths']
    else:
        input_paths = args['eval_data_paths']

    transformed_metadata = metadata_io.read_metadata(
        os.path.join(args['metadata_path'], 'transformed_metadata'))

    return input_fn_maker.build_training_input_fn(
        metadata=transformed_metadata,
        file_pattern=(input_paths[0]
                      if len(input_paths) == 1 else input_paths),
        training_batch_size=batch_size,
        label_keys=[LABEL_COLUMN],
        reader=gzip_reader_fn,
        key_feature_name=KEY_FEATURE_COLUMN,
        randomize_input=(mode != tf.estimator.ModeKeys.EVAL),
        num_epochs=(1 if mode == tf.estimator.ModeKeys.EVAL else None))
Example #11
def read_dataset(args, mode):
    tfrecord_options = tf.python_io.TFRecordOptions(
        compression_type=tf.python_io.TFRecordCompressionType.GZIP)
    batch_size = args['train_batch_size']
    if mode == tf.estimator.ModeKeys.TRAIN:
        input_paths = args['train_data_paths']
    elif mode == tf.estimator.ModeKeys.EVAL:
        input_paths = args['eval_data_paths']
    else:
        input_paths = args['test_data_paths']

    transformed_metadata = metadata_io.read_metadata(
        os.path.join(args['metadata_path'], 'transformed_metadata'))

    return input_fn_maker.build_training_input_fn(
        metadata=transformed_metadata,
        file_pattern=(input_paths[0]
                      if len(input_paths) == 1 else input_paths),
        training_batch_size=batch_size,
        label_keys=[LABEL_COL],
        reader=gzip_reader_fn,
        randomize_input=(mode == tf.estimator.ModeKeys.TRAIN),
        num_epochs=(None if mode == tf.estimator.ModeKeys.TRAIN else 1))
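Because read_dataset returns an input_fn closure, a caller typically builds one per mode and hands it to an estimator. A hedged usage sketch; the args values and the estimator are placeholders, not taken from the original sample:

import tensorflow as tf

args = {  # placeholder configuration for illustration only
    'train_batch_size': 64,
    'train_data_paths': ['gs://my-bucket/train/part-*'],
    'eval_data_paths': ['gs://my-bucket/eval/part-*'],
    'test_data_paths': ['gs://my-bucket/test/part-*'],
    'metadata_path': 'gs://my-bucket/metadata',
}

train_input_fn = read_dataset(args, tf.estimator.ModeKeys.TRAIN)
eval_input_fn = read_dataset(args, tf.estimator.ModeKeys.EVAL)
# These closures can then be passed to estimator.train(input_fn=train_input_fn, ...)
# and estimator.evaluate(input_fn=eval_input_fn) on a tf.estimator.Estimator.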
Example #12
def train_and_evaluate(transformed_train_filepattern,
                       transformed_test_filepattern, transformed_metadata_dir,
                       serving_graph_dir):
    """Train the model on training data and evaluate on test data.

  Args:
    transformed_train_filepattern: File pattern for transformed training data
        shards
    transformed_test_filepattern: File pattern for transformed test data shards
    transformed_metadata_dir: Directory containing transformed data metadata
    serving_graph_dir: Directory to save the serving graph

  Returns:
    The results from the estimator's 'evaluate' method
  """

    # Wrap scalars as real valued columns.
    real_valued_columns = [
        feature_column.real_valued_column(key) for key in NUMERIC_COLUMNS
    ]

    # Wrap categorical columns.  Note the combiner is irrelevant since the input
    # only has one value set per feature per instance.
    one_hot_columns = [
        feature_column.sparse_column_with_integerized_feature(
            key, bucket_size=bucket_size, combiner='sum')
        for key, bucket_size in zip(CATEGORICAL_COLUMNS, BUCKET_SIZES)
    ]

    estimator = learn.LinearClassifier(real_valued_columns + one_hot_columns)

    transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
    train_input_fn = input_fn_maker.build_training_input_fn(
        transformed_metadata,
        transformed_train_filepattern,
        training_batch_size=TRAIN_BATCH_SIZE,
        label_keys=[LABEL_COLUMN])

    # Estimate the model using the default optimizer.
    estimator.fit(input_fn=train_input_fn,
                  max_steps=TRAIN_NUM_EPOCHS * NUM_TRAIN_INSTANCES /
                  TRAIN_BATCH_SIZE)

    # Write the serving graph to disk for use in tf.serving
    in_columns = [
        'age', 'workclass', 'education', 'education-num', 'marital-status',
        'occupation', 'relationship', 'race', 'sex', 'capital-gain',
        'capital-loss', 'hours-per-week', 'native-country'
    ]

    if serving_graph_dir is not None:
        serving_input_fn = input_fn_maker.build_default_transforming_serving_input_fn(
            raw_metadata=raw_data_metadata,
            transform_savedmodel_dir=serving_graph_dir + '/transform_fn',
            raw_label_keys=[],
            raw_feature_keys=in_columns)
        estimator.export_savedmodel(serving_graph_dir, serving_input_fn)

    # Evaluate model on test dataset.
    eval_input_fn = input_fn_maker.build_training_input_fn(
        transformed_metadata,
        transformed_test_filepattern,
        training_batch_size=1,
        label_keys=[LABEL_COLUMN])

    return estimator.evaluate(input_fn=eval_input_fn, steps=NUM_TEST_INSTANCES)
Example #13
    def get_experiment(output_dir):
        # Read schema, input features, and transforms.
        schema = read_json_file(
            os.path.join(args.analysis_output_dir, SCHEMA_FILE))
        features = read_json_file(
            os.path.join(args.analysis_output_dir, FEATURES_FILE))
        stats = read_json_file(
            os.path.join(args.analysis_output_dir, STATS_FILE))

        target_column_name = get_target_name(features)
        header_names = [col['name'] for col in schema]
        if not target_column_name:
            raise ValueError('target missing from features file.')

        # Get the model to train.
        target_vocab = read_vocab(args, target_column_name)
        estimator = get_estimator(args, output_dir, features, stats,
                                  len(target_vocab))

        # Make list of files to save with the trained model.
        additional_assets = {
            FEATURES_FILE: os.path.join(args.analysis_output_dir,
                                        FEATURES_FILE),
            SCHEMA_FILE: os.path.join(args.analysis_output_dir, SCHEMA_FILE)
        }

        export_strategy_csv_notarget = make_export_strategy(
            args=args,
            keep_target=False,
            assets_extra=additional_assets,
            features=features,
            schema=schema)
        export_strategy_csv_target = make_export_strategy(
            args=args,
            keep_target=True,
            assets_extra=additional_assets,
            features=features,
            schema=schema)

        # Build readers for training.
        if args.run_transforms:
            raw_metadata = metadata_io.read_metadata(
                os.path.join(args.analysis_output_dir, RAW_METADATA_DIR))

            input_reader_for_train = build_csv_transforming_training_input_fn(
                raw_metadata=raw_metadata,
                transform_savedmodel_dir=os.path.join(args.analysis_output_dir,
                                                      TRANSFORM_FN_DIR),
                raw_data_file_pattern=args.train_data_paths,
                training_batch_size=args.train_batch_size,
                raw_keys=header_names,
                transformed_label_keys=[target_column_name],
                convert_scalars_to_vectors=True,
                num_epochs=args.num_epochs,
                randomize_input=True,
                min_after_dequeue=10,
                reader_num_threads=multiprocessing.cpu_count())
            input_reader_for_eval = build_csv_transforming_training_input_fn(
                raw_metadata=raw_metadata,
                transform_savedmodel_dir=os.path.join(args.analysis_output_dir,
                                                      TRANSFORM_FN_DIR),
                raw_data_file_pattern=args.eval_data_paths,
                training_batch_size=args.eval_batch_size,
                raw_keys=header_names,
                transformed_label_keys=[target_column_name],
                convert_scalars_to_vectors=True,
                num_epochs=1,
                randomize_input=False,
                reader_num_threads=multiprocessing.cpu_count())
        else:
            transformed_metadata = metadata_io.read_metadata(
                os.path.join(args.analysis_output_dir,
                             TRANSFORMED_METADATA_DIR))

            input_reader_for_train = input_fn_maker.build_training_input_fn(
                metadata=transformed_metadata,
                file_pattern=args.train_data_paths,
                training_batch_size=args.train_batch_size,
                reader=gzip_reader_fn,
                label_keys=[target_column_name],
                feature_keys=None,  # extract all features
                key_feature_name=None,  # None as we take care of the key column.
                reader_num_threads=multiprocessing.cpu_count(),
                queue_capacity=args.train_batch_size *
                multiprocessing.cpu_count() + 10,
                randomize_input=True,
                num_epochs=args.num_epochs,
            )
            input_reader_for_eval = input_fn_maker.build_training_input_fn(
                metadata=transformed_metadata,
                file_pattern=args.eval_data_paths,
                training_batch_size=args.eval_batch_size,
                reader=gzip_reader_fn,
                label_keys=[target_column_name],
                feature_keys=None,  # extract all features
                key_feature_name=None,  # None as we take care of the key column.
                reader_num_threads=multiprocessing.cpu_count(),
                queue_capacity=args.train_batch_size *
                multiprocessing.cpu_count() + 10,
                randomize_input=False,
                num_epochs=1,
            )

        return tf.contrib.learn.Experiment(
            estimator=estimator,
            train_input_fn=input_reader_for_train,
            eval_input_fn=input_reader_for_eval,
            train_steps=args.max_steps,
            export_strategies=[
                export_strategy_csv_notarget, export_strategy_csv_target
            ],
            min_eval_frequency=args.min_eval_frequency,
            eval_steps=None,
        )