def train_and_evaluate(transformed_train_filepattern,
                       transformed_test_filepattern,
                       transformed_metadata_dir,
                       num_train_instances=NUM_TRAIN_INSTANCES,
                       num_test_instances=NUM_TEST_INSTANCES):
  """Train the model on training data and evaluate on test data.

  Args:
    transformed_train_filepattern: File pattern for transformed training data
        shards
    transformed_test_filepattern: File pattern for transformed test data shards
    transformed_metadata_dir: Directory containing transformed data metadata
    num_train_instances: Number of instances in train set
    num_test_instances: Number of instances in test set

  Returns:
    The results from the estimator's 'evaluate' method
  """
  # Wrap scalars as real valued columns.
  real_valued_columns = [feature_column.real_valued_column(key)
                         for key in NUMERIC_COLUMNS]

  # Wrap categorical columns. Note the combiner is irrelevant since the input
  # only has one value set per feature per instance.
  one_hot_columns = [
      feature_column.sparse_column_with_integerized_feature(
          key, bucket_size=bucket_size, combiner='sum')
      for key, bucket_size in zip(CATEGORICAL_COLUMNS, BUCKET_SIZES)]

  estimator = learn.LinearClassifier(real_valued_columns + one_hot_columns)

  transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
  train_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_train_filepattern,
      training_batch_size=TRAIN_BATCH_SIZE,
      label_keys=[LABEL_COLUMN])

  # Estimate the model using the default optimizer.
  estimator.fit(
      input_fn=train_input_fn,
      max_steps=TRAIN_NUM_EPOCHS * num_train_instances // TRAIN_BATCH_SIZE)

  # Evaluate model on test dataset.
  eval_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_test_filepattern,
      training_batch_size=1,
      label_keys=[LABEL_COLUMN])

  return estimator.evaluate(input_fn=eval_input_fn, steps=num_test_instances)
def train_and_evaluate(transformed_train_filepattern,
                       transformed_test_filepattern,
                       transformed_metadata_dir,
                       num_train_instances=NUM_TRAIN_INSTANCES,
                       num_test_instances=NUM_TEST_INSTANCES):
  """Train the model on training data and evaluate on evaluation data.

  Args:
    transformed_train_filepattern: File pattern for transformed training data
        shards
    transformed_test_filepattern: File pattern for transformed evaluation data
        shards
    transformed_metadata_dir: Directory containing transformed data metadata
    num_train_instances: Number of instances in train set
    num_test_instances: Number of instances in test set

  Returns:
    The results from the estimator's 'evaluate' method
  """
  # Unrecognized tokens are represented by -1, but
  # sparse_column_with_integerized_feature uses the mod operator to map
  # integers to the range [0, bucket_size). By choosing
  # bucket_size=VOCAB_SIZE + 1, we represent unrecognized tokens as VOCAB_SIZE.
  review_column = feature_column.sparse_column_with_integerized_feature(
      REVIEW_COLUMN, bucket_size=VOCAB_SIZE + 1, combiner='sum')
  weighted_reviews = feature_column.weighted_sparse_column(
      review_column, REVIEW_WEIGHT)

  estimator = learn.LinearClassifier([weighted_reviews])

  transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
  train_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_train_filepattern,
      training_batch_size=TRAIN_BATCH_SIZE,
      label_keys=[LABEL_COLUMN])

  # Estimate the model using the default optimizer.
  estimator.fit(
      input_fn=train_input_fn,
      max_steps=TRAIN_NUM_EPOCHS * num_train_instances // TRAIN_BATCH_SIZE)

  # Evaluate model on eval dataset.
  eval_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_test_filepattern,
      training_batch_size=1,
      label_keys=[LABEL_COLUMN])

  return estimator.evaluate(input_fn=eval_input_fn, steps=num_test_instances)
def train_and_evaluate(transformed_train_data_base, transformed_eval_data_base,
                       transformed_metadata_dir):
  """Train the model on training data and evaluate on evaluation data.

  Args:
    transformed_train_data_base: Base filename for transformed training data
        shards
    transformed_eval_data_base: Base filename for transformed evaluation data
        shards
    transformed_metadata_dir: Directory containing transformed data metadata.

  Returns:
    The results from the estimator's 'evaluate' method.
  """
  # Wrap scalars as real valued columns.
  real_valued_columns = [
      feature_column.real_valued_column(key) for key in NUMERIC_COLUMNS
  ]

  # Wrap categorical columns.
  one_hot_columns = [
      feature_column.sparse_column_with_integerized_feature(
          key, bucket_size=bucket_size)
      for key, bucket_size in zip(CATEGORICAL_COLUMNS, BUCKET_SIZES)
  ]

  estimator = learn.LinearClassifier(real_valued_columns + one_hot_columns)

  transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
  train_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_train_data_base + '*',
      training_batch_size=TRAIN_BATCH_SIZE,
      label_keys=['label'])

  # Estimate the model using the default optimizer.
  estimator.fit(
      input_fn=train_input_fn,
      max_steps=TRAIN_NUM_EPOCHS * NUM_TRAIN_INSTANCES // TRAIN_BATCH_SIZE)

  # Evaluate model on eval dataset.
  eval_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_eval_data_base + '*',
      training_batch_size=1,
      label_keys=['label'])

  return estimator.evaluate(input_fn=eval_input_fn, steps=NUM_EVAL_INSTANCES)
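For context, a hypothetical invocation of the function above, assuming the transformed shards and metadata were produced by a preceding tf.Transform preprocessing step; working_dir and the file basenames are illustrative, not part of the original snippet:

import os

# working_dir is a hypothetical path holding the preprocessing outputs.
results = train_and_evaluate(
    transformed_train_data_base=os.path.join(working_dir, 'train_transformed'),
    transformed_eval_data_base=os.path.join(working_dir, 'eval_transformed'),
    transformed_metadata_dir=os.path.join(working_dir, 'transformed_metadata'))
print(results)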
def train_and_evaluate(output_dir):
  """Build a tf.contrib.learn Experiment for training and evaluation."""
  review_column = feature_column.sparse_column_with_integerized_feature(
      const.REVIEW_COLUMN, bucket_size=vocab_size + 1, combiner='sum')
  weighted_reviews = feature_column.weighted_sparse_column(
      review_column, const.REVIEW_WEIGHT)

  estimator = learn.LinearClassifier(
      feature_columns=[weighted_reviews],
      n_classes=2,
      model_dir=output_dir,
      config=tf.contrib.learn.RunConfig(save_checkpoints_secs=30))

  transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
  raw_metadata = metadata_io.read_metadata(raw_metadata_dir)

  train_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_train_file_pattern,
      training_batch_size=train_batch_size,
      label_keys=[const.LABEL_COLUMN])

  eval_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_test_file_pattern,
      training_batch_size=1,
      label_keys=[const.LABEL_COLUMN])

  serving_input_fn = (
      input_fn_maker.build_default_transforming_serving_input_fn(
          raw_metadata=raw_metadata,
          transform_savedmodel_dir=output_dir + '/transform_fn',
          raw_label_keys=[],
          raw_feature_keys=[const.REVIEW_COLUMN]))

  export_strategy = saved_model_export_utils.make_export_strategy(
      serving_input_fn,
      exports_to_keep=5,
      default_output_alternative_key=None)

  return tf.contrib.learn.Experiment(
      estimator=estimator,
      train_steps=train_num_epochs * num_train_instances // train_batch_size,
      eval_steps=num_test_instances,
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      export_strategies=export_strategy,
      min_eval_frequency=500)
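The function above returns an Experiment rather than running training directly. A minimal sketch of how such a function was typically launched, assuming the contrib learn_runner utility that accompanied Experiment-based code; job_dir is a hypothetical output path:

from tensorflow.contrib.learn.python.learn import learn_runner

# learn_runner calls train_and_evaluate(output_dir) to build the Experiment
# and then drives its train-and-evaluate schedule.
learn_runner.run(experiment_fn=train_and_evaluate, output_dir=job_dir)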
def test_build_training_input_fn(self):
  # TODO(b/123241798): use TEST_TMPDIR
  basedir = tempfile.mkdtemp()

  # the transformed schema should be vectorized already.
  metadata = dataset_metadata.DatasetMetadata(
      schema=_make_transformed_schema([1]))
  data_file = os.path.join(basedir, 'data')
  examples = [
      _create_serialized_example(d)
      for d in [{
          'transformed_a': 15,
          'transformed_b': 6,
          'transformed_label': 77
      }, {
          'transformed_a': 12,
          'transformed_b': 17,
          'transformed_label': 44
      }]
  ]
  _write_tfrecord(data_file, examples)

  training_input_fn = (
      input_fn_maker.build_training_input_fn(
          metadata=metadata,
          file_pattern=[data_file],
          training_batch_size=128,
          label_keys=['transformed_label'],
          randomize_input=False))

  with tf.Graph().as_default():
    features, labels = training_input_fn()

    with tf.Session().as_default() as session:
      session.run(tf.initialize_all_variables())
      tf.train.start_queue_runners()
      transformed_a, transformed_b, transformed_label = session.run(
          [features['transformed_a'], features['transformed_b'], labels])

      self.assertEqual((128, 1), tuple(transformed_a.shape))
      self.assertEqual((128, 1), tuple(transformed_b.dense_shape))
      self.assertEqual((128, 1), tuple(transformed_label.shape))
      transformed_b_dict = dict(
          zip([tuple(x) for x in transformed_b.indices.tolist()],
              transformed_b.values.tolist()))
      self.assertEqual(15, transformed_a[0][0])
      self.assertEqual(6, transformed_b_dict[(0, 0)])
      self.assertEqual(77, transformed_label[0][0])
      self.assertEqual(12, transformed_a[1][0])
      self.assertEqual(17, transformed_b_dict[(1, 0)])
      self.assertEqual(44, transformed_label[1][0])
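The helpers _create_serialized_example and _write_tfrecord are not shown in this snippet (or the similar test below). A minimal sketch of what they might look like, assuming every value is a single int64 feature; the implementations here are illustrative, not the library's own:

def _create_serialized_example(feature_dict):
  # Wrap each integer value as an int64 feature and serialize the Example.
  example = tf.train.Example(features=tf.train.Features(feature={
      key: tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
      for key, value in feature_dict.items()
  }))
  return example.SerializeToString()


def _write_tfrecord(path, serialized_examples):
  # Write the serialized Examples to a single TFRecord file.
  with tf.python_io.TFRecordWriter(path) as writer:
    for serialized in serialized_examples:
      writer.write(serialized)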
def get_transformed_reader_input_fn(transformed_metadata,
                                    transformed_data_paths,
                                    batch_size,
                                    mode):
  """Wrap the get input features function to provide the runtime arguments."""
  return input_fn_maker.build_training_input_fn(
      metadata=transformed_metadata,
      file_pattern=(transformed_data_paths[0]
                    if len(transformed_data_paths) == 1
                    else transformed_data_paths),
      training_batch_size=batch_size,
      label_keys=[TARGET_FEATURE_COLUMN],
      reader=gzip_reader_fn,
      key_feature_name=KEY_FEATURE_COLUMN,
      reader_num_threads=4,
      queue_capacity=batch_size * 2,
      randomize_input=(mode != tf.contrib.learn.ModeKeys.EVAL),
      num_epochs=(1 if mode == tf.contrib.learn.ModeKeys.EVAL else None))
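gzip_reader_fn is referenced here and in several snippets below but never defined. A common definition, assuming the transformed TFRecord shards were written with GZIP compression, is a TFRecordReader configured with GZIP options; this is a plausible sketch rather than the snippet author's verbatim code:

def gzip_reader_fn():
  # Reader for TFRecord files written with GZIP compression.
  return tf.TFRecordReader(
      options=tf.python_io.TFRecordOptions(
          compression_type=tf.python_io.TFRecordCompressionType.GZIP))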
def test_build_training_input_fn(self):
  basedir = tempfile.mkdtemp()

  metadata = dataset_metadata.DatasetMetadata(
      schema=_make_transformed_schema())
  data_file = os.path.join(basedir, 'data')
  examples = [
      _create_serialized_example(d)
      for d in [{
          'transformed_a': 15,
          'transformed_b': 5,
          'transformed_label': 77
      }, {
          'transformed_a': 12,
          'transformed_b': 17,
          'transformed_label': 44
      }]
  ]
  _write_tfrecord(data_file, examples)

  training_input_fn = (
      input_fn_maker.build_training_input_fn(
          metadata=metadata,
          file_pattern=[data_file],
          training_batch_size=128,
          label_keys=['transformed_label'],
          randomize_input=False))

  with tf.Graph().as_default():
    features, labels = training_input_fn()

    with tf.Session().as_default() as session:
      session.run(tf.initialize_all_variables())
      tf.train.start_queue_runners()
      transformed_a, transformed_b, transformed_label = session.run(
          [features['transformed_a'], features['transformed_b'], labels])

      self.assertEqual(15, transformed_a[0][0])
      self.assertEqual(5, transformed_b[0][0])
      self.assertEqual(77, transformed_label[0][0])
      self.assertEqual(12, transformed_a[1][0])
      self.assertEqual(17, transformed_b[1][0])
      self.assertEqual(44, transformed_label[1][0])
def read_dataset(args, mode):
  batch_size = args['train_batch_size']
  if mode == tf.estimator.ModeKeys.TRAIN:
    input_paths = args['train_data_paths']
  else:
    input_paths = args['eval_data_paths']

  transformed_metadata = metadata_io.read_metadata(
      os.path.join(args['metadata_path'], 'transformed_metadata'))

  return input_fn_maker.build_training_input_fn(
      metadata=transformed_metadata,
      file_pattern=(
          input_paths[0] if len(input_paths) == 1 else input_paths),
      training_batch_size=batch_size,
      label_keys=[LABEL_COLUMN],
      reader=gzip_reader_fn,
      key_feature_name=KEY_FEATURE_COLUMN,
      randomize_input=(mode != tf.estimator.ModeKeys.EVAL),
      num_epochs=(1 if mode == tf.estimator.ModeKeys.EVAL else None))
def read_dataset(args, mode):
  tfrecord_options = tf.python_io.TFRecordOptions(
      compression_type=tf.python_io.TFRecordCompressionType.GZIP)
  batch_size = args['train_batch_size']
  if mode == tf.estimator.ModeKeys.TRAIN:
    input_paths = args['train_data_paths']
  elif mode == tf.estimator.ModeKeys.EVAL:
    input_paths = args['eval_data_paths']
  else:
    input_paths = args['test_data_paths']

  transformed_metadata = metadata_io.read_metadata(
      os.path.join(args['metadata_path'], 'transformed_metadata'))

  return input_fn_maker.build_training_input_fn(
      metadata=transformed_metadata,
      file_pattern=(
          input_paths[0] if len(input_paths) == 1 else input_paths),
      training_batch_size=batch_size,
      label_keys=[LABEL_COL],
      reader=gzip_reader_fn,
      randomize_input=(mode == tf.estimator.ModeKeys.TRAIN),
      num_epochs=(None if mode == tf.estimator.ModeKeys.TRAIN else 1))
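A hypothetical way to wire read_dataset into a tf.estimator loop, assuming an already-constructed estimator and the args dictionary used above; the 'train_steps' key is illustrative and not part of the original snippet:

# Build an input function per mode, then train and evaluate.
train_input_fn = read_dataset(args, tf.estimator.ModeKeys.TRAIN)
eval_input_fn = read_dataset(args, tf.estimator.ModeKeys.EVAL)
estimator.train(input_fn=train_input_fn, max_steps=args['train_steps'])
metrics = estimator.evaluate(input_fn=eval_input_fn)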
def train_and_evaluate(transformed_train_filepattern,
                       transformed_test_filepattern,
                       transformed_metadata_dir,
                       serving_graph_dir):
  """Train the model on training data and evaluate on test data.

  Args:
    transformed_train_filepattern: File pattern for transformed training data
        shards
    transformed_test_filepattern: File pattern for transformed test data shards
    transformed_metadata_dir: Directory containing transformed data metadata
    serving_graph_dir: Directory to save the serving graph

  Returns:
    The results from the estimator's 'evaluate' method
  """
  # Wrap scalars as real valued columns.
  real_valued_columns = [
      feature_column.real_valued_column(key) for key in NUMERIC_COLUMNS
  ]

  # Wrap categorical columns. Note the combiner is irrelevant since the input
  # only has one value set per feature per instance.
  one_hot_columns = [
      feature_column.sparse_column_with_integerized_feature(
          key, bucket_size=bucket_size, combiner='sum')
      for key, bucket_size in zip(CATEGORICAL_COLUMNS, BUCKET_SIZES)
  ]

  estimator = learn.LinearClassifier(real_valued_columns + one_hot_columns)

  transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
  train_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_train_filepattern,
      training_batch_size=TRAIN_BATCH_SIZE,
      label_keys=[LABEL_COLUMN])

  # Estimate the model using the default optimizer.
  estimator.fit(
      input_fn=train_input_fn,
      max_steps=TRAIN_NUM_EPOCHS * NUM_TRAIN_INSTANCES // TRAIN_BATCH_SIZE)

  # Write the serving graph to disk for use in tf.serving.
  in_columns = [
      'age', 'workclass', 'education', 'education-num', 'marital-status',
      'occupation', 'relationship', 'race', 'sex', 'capital-gain',
      'capital-loss', 'hours-per-week', 'native-country'
  ]
  if serving_graph_dir is not None:
    serving_input_fn = (
        input_fn_maker.build_default_transforming_serving_input_fn(
            raw_metadata=raw_data_metadata,
            transform_savedmodel_dir=serving_graph_dir + '/transform_fn',
            raw_label_keys=[],
            raw_feature_keys=in_columns))
    estimator.export_savedmodel(serving_graph_dir, serving_input_fn)

  # Evaluate model on test dataset.
  eval_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_test_filepattern,
      training_batch_size=1,
      label_keys=[LABEL_COLUMN])

  return estimator.evaluate(input_fn=eval_input_fn, steps=NUM_TEST_INSTANCES)
def get_experiment(output_dir):
  # Read schema, input features, and transforms.
  schema = read_json_file(os.path.join(args.analysis_output_dir, SCHEMA_FILE))
  features = read_json_file(
      os.path.join(args.analysis_output_dir, FEATURES_FILE))
  stats = read_json_file(os.path.join(args.analysis_output_dir, STATS_FILE))

  target_column_name = get_target_name(features)
  header_names = [col['name'] for col in schema]
  if not target_column_name:
    raise ValueError('target missing from features file.')

  # Get the model to train.
  target_vocab = read_vocab(args, target_column_name)
  estimator = get_estimator(args, output_dir, features, stats,
                            len(target_vocab))

  # Make list of files to save with the trained model.
  additional_assets = {
      FEATURES_FILE: os.path.join(args.analysis_output_dir, FEATURES_FILE),
      SCHEMA_FILE: os.path.join(args.analysis_output_dir, SCHEMA_FILE)
  }

  export_strategy_csv_notarget = make_export_strategy(
      args=args,
      keep_target=False,
      assets_extra=additional_assets,
      features=features,
      schema=schema)
  export_strategy_csv_target = make_export_strategy(
      args=args,
      keep_target=True,
      assets_extra=additional_assets,
      features=features,
      schema=schema)

  # Build readers for training.
  if args.run_transforms:
    raw_metadata = metadata_io.read_metadata(
        os.path.join(args.analysis_output_dir, RAW_METADATA_DIR))

    input_reader_for_train = build_csv_transforming_training_input_fn(
        raw_metadata=raw_metadata,
        transform_savedmodel_dir=os.path.join(args.analysis_output_dir,
                                              TRANSFORM_FN_DIR),
        raw_data_file_pattern=args.train_data_paths,
        training_batch_size=args.train_batch_size,
        raw_keys=header_names,
        transformed_label_keys=[target_column_name],
        convert_scalars_to_vectors=True,
        num_epochs=args.num_epochs,
        randomize_input=True,
        min_after_dequeue=10,
        reader_num_threads=multiprocessing.cpu_count())
    input_reader_for_eval = build_csv_transforming_training_input_fn(
        raw_metadata=raw_metadata,
        transform_savedmodel_dir=os.path.join(args.analysis_output_dir,
                                              TRANSFORM_FN_DIR),
        raw_data_file_pattern=args.eval_data_paths,
        training_batch_size=args.eval_batch_size,
        raw_keys=header_names,
        transformed_label_keys=[target_column_name],
        convert_scalars_to_vectors=True,
        num_epochs=1,
        randomize_input=False,
        reader_num_threads=multiprocessing.cpu_count())
  else:
    transformed_metadata = metadata_io.read_metadata(
        os.path.join(args.analysis_output_dir, TRANSFORMED_METADATA_DIR))

    input_reader_for_train = input_fn_maker.build_training_input_fn(
        metadata=transformed_metadata,
        file_pattern=args.train_data_paths,
        training_batch_size=args.train_batch_size,
        reader=gzip_reader_fn,
        label_keys=[target_column_name],
        feature_keys=None,  # extract all features
        key_feature_name=None,  # None as we take care of the key column.
        reader_num_threads=multiprocessing.cpu_count(),
        queue_capacity=args.train_batch_size * multiprocessing.cpu_count() + 10,
        randomize_input=True,
        num_epochs=args.num_epochs,
    )
    input_reader_for_eval = input_fn_maker.build_training_input_fn(
        metadata=transformed_metadata,
        file_pattern=args.eval_data_paths,
        training_batch_size=args.eval_batch_size,
        reader=gzip_reader_fn,
        label_keys=[target_column_name],
        feature_keys=None,  # extract all features
        key_feature_name=None,  # None as we take care of the key column.
        reader_num_threads=multiprocessing.cpu_count(),
        queue_capacity=args.train_batch_size * multiprocessing.cpu_count() + 10,
        randomize_input=False,
        num_epochs=1,
    )

  return tf.contrib.learn.Experiment(
      estimator=estimator,
      train_input_fn=input_reader_for_train,
      eval_input_fn=input_reader_for_eval,
      train_steps=args.max_steps,
      export_strategies=[
          export_strategy_csv_notarget, export_strategy_csv_target
      ],
      min_eval_frequency=args.min_eval_frequency,
      eval_steps=None,
  )