def train(_):
    training_dir = pjoin(FLAGS.training_set, FLAGS.train_subdir)

    # Read the Featran feature context and the transformed training set.
    feature_context = Datasets.get_context(training_dir)
    (feature_names, label_names) = feature_context.multispec_feature_groups
    training_dataset = Datasets.dict.read_dataset(training_dir)
    (feature_train_data, labels_train_data) = transform_dataset(feature_context,
                                                                training_dataset)

    params = {
        'objective': 'multi:softprob',
        'verbose': False,
        'num_class': len(label_names),
        'max_depth': 6,
        'nthread': 4,
        'silent': 1
    }

    xg_train = xgb.DMatrix(feature_train_data, label=labels_train_data)
    xg_model = xgb.train(params, xg_train, FLAGS.rounds)

    # Save the model locally, then copy it next to the training set via TensorFlow's file_io.
    model_path = pjoin(FLAGS.local_dir, "iterator.model")
    xg_model.save_model(model_path)
    output_path = pjoin(FLAGS.training_set, "xgboost/iterator.model")
    file_io.copy(model_path, output_path, overwrite=True)
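# A minimal follow-on sketch (not part of the original example): how the model saved above
# could be loaded back for prediction with the standard XGBoost API. "features_to_score" is
# a hypothetical stand-in for data transformed the same way as feature_train_data.
def predict_sketch(features_to_score):
    import xgboost as xgb
    from os.path import join as pjoin

    booster = xgb.Booster()
    booster.load_model(pjoin(FLAGS.local_dir, "iterator.model"))
    # 'multi:softprob' yields an array of shape (n_rows, num_class) with class probabilities.
    return booster.predict(xgb.DMatrix(features_to_score))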
def test_trainer_shouldnt_crash(self):
    label_key = "label"
    feature_spec, _ = Datasets.parse_schema(self.schema_path)
    all_features = {
        name: tf.feature_column.numeric_column(name, default_value=.0)
        for name in feature_spec.keys()
    }
    feature_columns = all_features.copy()
    feature_columns.pop(label_key)

    config = tf.estimator.RunConfig(tempfile.mkdtemp())
    estimator = tf.estimator.LinearClassifier(feature_columns=feature_columns.values(),
                                              config=config)

    def split_features_label_fn(parsed_features):
        label = parsed_features.pop(label_key)
        return parsed_features, label

    def get_in_fn(data):
        raw_feature_spec = tf.feature_column.make_parse_example_spec(all_features.values())

        def in_fn():
            dataset = Datasets.examples_via_feature_spec(data, raw_feature_spec)
            return dataset.map(split_features_label_fn)

        return in_fn

    estimator.train(get_in_fn(self.train_data)).evaluate(get_in_fn(self.eval_data))
def get_input(args):
    with tf.name_scope("input"):
        dataset, c = Datasets.get_featran_example_dataset(args.input,
                                                          gen_spec=["label"])
        iterator = dataset.make_initializable_iterator()
        (label,), features = iterator.get_next()
        label = tf.reshape(label, [-1, 1])
        features = tf.reshape(features, [-1, c.num_features])
        return iterator, label, features
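# A minimal usage sketch (an assumption, not from the original source): get_input returns an
# initializable iterator, so a consumer must run its initializer before pulling batches.
# "args" is a hypothetical namespace with an "input" attribute, as in get_input above.
def consume_input_sketch(args):
    iterator, label, features = get_input(args)
    with tf.Session() as sess:
        sess.run(iterator.initializer)
        label_batch, features_batch = sess.run([label, features])
        return label_batch, features_batch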
def test_get_example_dataset(self):
    dataset = Datasets.examples_via_schema(self.train_data,
                                           self.schema_path,
                                           batch_size=16)
    batch_it = dataset.make_one_shot_iterator().get_next()
    with tf.Session() as sess:
        batch = sess.run(batch_it)
        self.assertEqual(len(batch), self.N_FEATURES)
        self.assertEqual(len(batch["f1"]), 16)
def test_mk_iter(self):
    it, context = Datasets.mk_iter(self.test_resources_dir)
    batch_it = it.get_next()
    with tf.Session() as sess:
        batch = sess.run(batch_it)
        self.assertEqual(len(batch), self.N_FEATURES)
        first_feature = list(context.features.keys())[0]
        self.assertEqual(len(batch[first_feature]),
                         tf.flags.FLAGS["batch-size"].value)
def test_simple_get_example_dataset(self):
    data, schema_path = SquareTest._write_test_data()
    with self.test_session() as sess:
        dataset = Datasets.examples_via_schema(data, schema_path)
        iterator = dataset.make_one_shot_iterator()
        r = iterator.get_next()
        f1, f2 = r["f1"], r["f2"]
        self.assertAllEqual([[1], [2]], sess.run([f1, f2]))
        with self.assertRaises(tf.errors.OutOfRangeError):
            f1.eval()
def test_get_featran_example_dataset(self):
    d, _, _ = DataUtil.write_featran_test_data()
    with self.test_session() as sess:
        dataset, c = Datasets._get_featran_example_dataset(d)
        self.assertEqual(len(c.features), 2)
        iterator = dataset.make_one_shot_iterator()
        r = iterator.get_next()
        f1, f2 = r["f1"], r["f2"]
        self.assertAllEqual([1, 2], sess.run([f1, f2]))
        with self.assertRaises(tf.errors.OutOfRangeError):
            f1.eval()
def train(_):
    import tempfile
    config = tf.estimator.RunConfig(tempfile.mkdtemp())

    train_data_dir = get_data_dir("train")
    schema_path = os.path.join(train_data_dir, "_inferred_schema.pb")
    feature_spec, _ = Datasets.parse_schema(schema_path)

    # Use an OrderedDict with sorted keys so the feature order is deterministic.
    all_features = OrderedDict([
        (name, tf.feature_column.numeric_column(name, default_value=.0))
        for name in sorted(feature_spec.keys())
    ])
    feature_columns = all_features.copy()

    label_keys = sorted(
        [l for l in set(feature_columns.keys()) if l.startswith("class_name")])
    for l in label_keys:
        feature_columns.pop(l)

    def split_features_label_fn(spec):
        # The canned LinearClassifier expects a single integer label, but Featran emits a
        # one-hot encoding for the class, so convert the one-hot encoding back to an index.
        labels = tf.concat([[spec.pop(l)] for l in label_keys], axis=0)
        label = tf.argmax(labels, axis=0)
        # Whatever remains in the spec is the feature dict.
        return spec, label

    def get_in_fn(data):
        raw_feature_spec = tf.feature_column.make_parse_example_spec(all_features.values())

        def in_fn():
            dataset = Datasets.examples_via_feature_spec(data, raw_feature_spec)
            return dataset.map(split_features_label_fn)

        return in_fn

    classifier = tf.estimator.LinearClassifier(feature_columns=feature_columns.values(),
                                               n_classes=3,
                                               config=config)
    train_data = os.path.join(train_data_dir, "part-*")
    eval_data = os.path.join(get_data_dir("eval"), "part-*")
    classifier.train(get_in_fn(train_data)).evaluate(get_in_fn(eval_data))
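# A small standalone sketch (illustrative only) of the one-hot-to-index conversion used in
# split_features_label_fn above: stack the per-class indicator columns and take argmax over
# the class axis to recover a single integer label per example.
def one_hot_to_index_sketch():
    class_0 = tf.constant([0.0, 1.0, 0.0])   # indicator for class 0 over 3 examples
    class_1 = tf.constant([1.0, 0.0, 0.0])   # indicator for class 1 over 3 examples
    class_2 = tf.constant([0.0, 0.0, 1.0])   # indicator for class 2 over 3 examples
    labels = tf.concat([[class_0], [class_1], [class_2]], axis=0)  # shape [3 classes, 3 examples]
    label = tf.argmax(labels, axis=0)                              # integer label per example
    with tf.Session() as sess:
        return sess.run(label)  # array([1, 0, 2])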
def main(_):
    from examples_utils import get_data_dir
    import tempfile

    config = Trainer.get_default_run_config(tempfile.mkdtemp())

    feature_context = Datasets.get_context(get_data_dir("train"))
    (feature_names, label_names) = feature_context.multispec_feature_groups
    features = [tf.feature_column.numeric_column(x) for x in feature_names]

    def split_features_label_fn(spec):
        label = spec.pop(label_names[0])
        return spec, label

    classifier = tf.estimator.LinearClassifier(features, config=config)

    Trainer.run(estimator=classifier,
                training_data_dir=get_data_dir("train"),
                eval_data_dir=get_data_dir("eval"),
                split_features_label_fn=split_features_label_fn,
                run_config=config)
def test_trainer_shouldnt_crash(self):
    context = Datasets.get_context(self.test_resources_dir)
    (feature_names, label_names) = context.multispec_feature_groups
    feature_columns = [tf.feature_column.numeric_column(name) for name in feature_names]

    config = Trainer.get_default_run_config(job_dir=tempfile.mkdtemp())
    estimator = tf.estimator.LinearClassifier(feature_columns=feature_columns,
                                              config=config)

    def split_features_label_fn(parsed_features):
        self.assertEqual(len(label_names), 1)
        label = parsed_features.pop(label_names[0])
        return parsed_features, label

    Trainer.run(estimator,
                training_data_dir=self.test_resources_dir,
                eval_data_dir=self.test_resources_dir,
                split_features_label_fn=split_features_label_fn,
                run_config=config)
def in_fn():
    dataset = Datasets.examples_via_feature_spec(data, raw_feature_spec)
    return dataset.map(split_features_label_fn)
def test_parse_schema_from_stats(self):
    feature_spec, schema = Datasets.parse_schema_from_stats(self.stats_path)
    self.assertEqual(len(feature_spec), self.N_FEATURES)
def test_feature_order_multispec(self):
    expected_features = ["f3", "f1", "f2_EVEN", "f2_ODD"]
    _, context = Datasets.mk_iter(self.test_resources_dir)
    feature_names, _ = context.multispec_feature_groups
    self.assertEqual(feature_names, expected_features)