Example #1
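# Trains a multi-class XGBoost model on features read via the Datasets API,
# then copies the saved model next to the training set.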
def train(_):
    training_dir = pjoin(FLAGS.training_set, FLAGS.train_subdir)
    feature_context = Datasets.get_context(training_dir)

    (feature_names, label_names) = feature_context.multispec_feature_groups

    training_dataset = Datasets.dict.read_dataset(training_dir)
    (feature_train_data,
     labels_train_data) = transform_dataset(feature_context, training_dataset)

    params = {
        'objective': 'multi:softprob',
        'verbose': False,
        'num_class': len(label_names),
        'max_depth': 6,
        'nthread': 4,
        'silent': 1
    }

    xg_train = xgb.DMatrix(feature_train_data, label=labels_train_data)
    xg_model = xgb.train(params, xg_train, FLAGS.rounds)

    model_path = pjoin(FLAGS.local_dir, "iterator.model")
    xg_model.save_model(model_path)

    output_path = pjoin(FLAGS.training_set, "xgboost/iterator.model")
    file_io.copy(model_path, output_path, overwrite=True)
Example #2
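    # Smoke test: builds numeric feature columns from a parsed schema and checks
    # that training and evaluating a LinearClassifier does not crash.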
    def test_trainer_shouldnt_crash(self):
        label_key = "label"
        feature_spec, _ = Datasets.parse_schema(self.schema_path)
        all_features = {
            name: tf.feature_column.numeric_column(name, default_value=.0)
            for name in feature_spec.keys()
        }
        feature_columns = all_features.copy()
        feature_columns.pop(label_key)

        config = tf.estimator.RunConfig(tempfile.mkdtemp())

        estimator = tf.estimator.LinearClassifier(
            feature_columns=feature_columns.values(), config=config)

        def split_features_label_fn(parsed_features):
            label = parsed_features.pop(label_key)
            return parsed_features, label

        def get_in_fn(data):
            raw_feature_spec = tf.feature_column.make_parse_example_spec(
                all_features.values())

            def in_fn():
                dataset = Datasets.examples_via_feature_spec(
                    data, raw_feature_spec)
                return dataset.map(split_features_label_fn)

            return in_fn

        estimator.train(get_in_fn(self.train_data)).evaluate(
            get_in_fn(self.eval_data))
Example #3
def get_input(args):
    with tf.name_scope("input"):
        dataset, c = Datasets.get_featran_example_dataset(args.input, gen_spec=["label"])
        iterator = dataset.make_initializable_iterator()
        (label,), features = iterator.get_next()
        label = tf.reshape(label, [-1, 1])
        features = tf.reshape(features, [-1, c.num_features])
        return iterator, label, features
Example #4
    def test_get_example_dataset(self):
        dataset = Datasets.examples_via_schema(self.train_data, self.schema_path, batch_size=16)
        batch_it = dataset.make_one_shot_iterator().get_next()

        with tf.Session() as sess:
            batch = sess.run(batch_it)
            self.assertEqual(len(batch), self.N_FEATURES)
            self.assertEqual(len(batch["f1"]), 16)
Example #5
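    # Verifies that Datasets.mk_iter yields batches with the expected number of
    # features and the configured batch size.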
    def test_mk_iter(self):
        it, context = Datasets.mk_iter(self.test_resources_dir)
        batch_it = it.get_next()

        with tf.Session() as sess:
            batch = sess.run(batch_it)
            self.assertEqual(len(batch), self.N_FEATURES)

            first_feature = list(context.features.keys())[0]
            self.assertEqual(len(batch[first_feature]), tf.flags.FLAGS["batch-size"].value)
Example #6
    def test_simple_get_example_dataset(self):
        data, schema_path = SquareTest._write_test_data()
        with self.test_session() as sess:
            dataset = Datasets.examples_via_schema(data, schema_path)  # noqa: E501
            iterator = dataset.make_one_shot_iterator()
            r = iterator.get_next()
            f1, f2 = r["f1"], r["f2"]
            self.assertAllEqual([[1], [2]], sess.run([f1, f2]))
            with self.assertRaises(tf.errors.OutOfRangeError):
                f1.eval()
Example #7
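    # Verifies that a Featran example dataset exposes the expected features and
    # raises OutOfRangeError once the dataset is exhausted.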
    def test_get_featran_example_dataset(self):
        d, _, _ = DataUtil.write_featran_test_data()
        with self.test_session() as sess:
            dataset, c = Datasets._get_featran_example_dataset(d)
            self.assertEqual(len(c.features), 2)
            iterator = dataset.make_one_shot_iterator()
            r = iterator.get_next()
            f1, f2 = r["f1"], r["f2"]
            self.assertAllEqual([1, 2], sess.run([f1, f2]))
            with self.assertRaises(tf.errors.OutOfRangeError):
                f1.eval()
Example #8
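# Trains a LinearClassifier end to end, converting Featran's one-hot class
# columns back into a single integer label.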
def train(_):
    import tempfile

    config = tf.estimator.RunConfig(tempfile.mkdtemp())

    train_data_dir = get_data_dir("train")
    schema_path = os.path.join(train_data_dir, "_inferred_schema.pb")

    feature_spec, _ = Datasets.parse_schema(schema_path)
    # we use OrderedDict and sorted keys for features for determinism
    all_features = OrderedDict([
        (name, tf.feature_column.numeric_column(name, default_value=.0))
        for name in sorted(feature_spec.keys())
    ])
    feature_columns = all_features.copy()
    label_keys = sorted(
        [l for l in set(feature_columns.keys()) if l.startswith("class_name")])
    for l in label_keys:
        feature_columns.pop(l)

    def split_features_label_fn(spec):
        # The canned TF LinearClassifier requires the label to be a single integer, but Featran
        # gives us a one-hot encoding for the class, so we convert the one-hot vector back to an integer
        labels = tf.concat([[spec.pop(l)] for l in label_keys], axis=0)
        label = tf.argmax(labels, axis=0)
        # Get the rest of the features out of the spec
        return spec, label

    def get_in_fn(data):
        raw_feature_spec = tf.feature_column.make_parse_example_spec(
            all_features.values())

        def in_fn():
            dataset = Datasets.examples_via_feature_spec(
                data, raw_feature_spec)
            return dataset.map(split_features_label_fn)

        return in_fn

    classifier = tf.estimator.LinearClassifier(
        feature_columns=feature_columns.values(), n_classes=3, config=config)

    train_data = os.path.join(train_data_dir, "part-*")
    eval_data = os.path.join(get_data_dir("eval"), "part-*")
    classifier.train(get_in_fn(train_data)).evaluate(get_in_fn(eval_data))
Example #9
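# Minimal Trainer.run example: builds numeric feature columns from the dataset
# context and splits the single label column off the parsed features.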
def main(_):
    from examples_utils import get_data_dir
    import tempfile

    config = Trainer.get_default_run_config(tempfile.mkdtemp())

    feature_context = Datasets.get_context(get_data_dir("train"))
    (feature_names, label_names) = feature_context.multispec_feature_groups
    features = [tf.feature_column.numeric_column(x) for x in feature_names]

    def split_features_label_fn(spec):
        label = spec.pop(label_names[0])
        return spec, label

    classifier = tf.estimator.LinearClassifier(features, config=config)
    Trainer.run(estimator=classifier,
                training_data_dir=get_data_dir("train"),
                eval_data_dir=get_data_dir("eval"),
                split_features_label_fn=split_features_label_fn,
                run_config=config)
Example #10
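    # Smoke test for Trainer.run with a LinearClassifier built from the
    # context's multispec feature groups.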
    def test_trainer_shouldnt_crash(self):
        context = Datasets.get_context(self.test_resources_dir)
        (feature_names, label_names) = context.multispec_feature_groups
        feature_columns = [tf.feature_column.numeric_column(name) for name in feature_names]

        config = Trainer.get_default_run_config(job_dir=tempfile.mkdtemp())

        estimator = tf.estimator.LinearClassifier(feature_columns=feature_columns,
                                                  config=config)

        def split_features_label_fn(parsed_features):
            self.assertEqual(len(label_names), 1)
            label = parsed_features.pop(label_names[0])
            return parsed_features, label

        Trainer.run(estimator,
                    training_data_dir=self.test_resources_dir,
                    eval_data_dir=self.test_resources_dir,
                    split_features_label_fn=split_features_label_fn,
                    run_config=config)
Example #11
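# Input function that reads TF examples with a raw feature spec and splits each
# record into features and label.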
def in_fn():
    dataset = Datasets.examples_via_feature_spec(
        data, raw_feature_spec)
    return dataset.map(split_features_label_fn)
Example #12
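    # Checks that parse_schema_from_stats returns a feature spec with the
    # expected number of features.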
    def test_parse_schema_from_stats(self):
        feature_spec, schema = Datasets.parse_schema_from_stats(
            self.stats_path)
        self.assertEqual(len(feature_spec), self.N_FEATURES)
Example #13
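    # Checks that multispec_feature_groups preserves the expected feature ordering.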
    def test_feature_order_multispec(self):
        expected_features = ["f3", "f1", "f2_EVEN", "f2_ODD"]
        _, context = Datasets.mk_iter(self.test_resources_dir)
        feature_names, _ = context.multispec_feature_groups
        self.assertEqual(feature_names, expected_features)