def __init__(self, params, device_assigner=None, model_dir=None,
             graph_builder_class=tensor_forest.RandomForestGraphs,
             config=None, weights_name=None, keys_name=None,
             feature_engineering_fn=None, early_stopping_rounds=100,
             num_trainers=1, trainer_id=0):
  """Initializes a TensorForestEstimator instance.

  Args:
    params: ForestHParams object that holds random forest hyperparameters.
      These parameters will be passed into `model_fn`.
    device_assigner: An `object` instance that controls how trees get
      assigned to devices. If `None`, will use
      `tensor_forest.RandomForestDeviceAssigner`.
    model_dir: Directory to save model parameters, graph, etc. To continue
      training a previously saved model, load checkpoints saved to this
      directory into an estimator.
    graph_builder_class: An `object` instance that defines how TF graphs for
      random forest training and inference are built. By default will use
      `tensor_forest.RandomForestGraphs`.
    config: `RunConfig` object to configure the runtime settings.
    weights_name: A string defining feature column name representing
      weights. Will be multiplied by the loss of the example. Used to
      downweight or boost examples during training.
    keys_name: A string defining feature column name representing example
      keys. Used by `predict_with_keys` method.
    feature_engineering_fn: Feature engineering function. Takes features and
      labels which are the output of `input_fn` and returns features and
      labels which will be fed into the model.
    early_stopping_rounds: Allows training to terminate early if the forest
      is no longer growing. 100 by default.
    num_trainers: Number of training jobs, which will partition trees among
      them.
    trainer_id: Which trainer this instance is.

  Returns:
    A `TensorForestEstimator` instance.
  """
  self.params = params.fill()
  self.graph_builder_class = graph_builder_class
  self.early_stopping_rounds = early_stopping_rounds
  self.weights_name = weights_name
  self._estimator = estimator.Estimator(
      model_fn=get_model_fn(params, graph_builder_class, device_assigner,
                            weights_name=weights_name, keys_name=keys_name,
                            num_trainers=num_trainers, trainer_id=trainer_id),
      model_dir=model_dir,
      config=config,
      feature_engineering_fn=feature_engineering_fn)
  self._skcompat = estimator.SKCompat(self._estimator)
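# A minimal usage sketch for the constructor documented above, assuming the
# standard TF 1.x contrib import paths. The hyperparameter values, model_dir
# and synthetic data below are illustrative assumptions, not taken from the
# original sources.
import numpy as np
from tensorflow.contrib.learn.python.learn.estimators import estimator
from tensorflow.contrib.tensor_forest.client import random_forest
from tensorflow.contrib.tensor_forest.python import tensor_forest

params = tensor_forest.ForestHParams(
    num_classes=2, num_features=4, num_trees=10, max_nodes=100)
forest = random_forest.TensorForestEstimator(
    params, model_dir="/tmp/forest_demo")
clf = estimator.SKCompat(forest)  # numpy-array fit/score/predict interface

x = np.random.rand(64, 4).astype(np.float32)
y = np.random.randint(0, 2, size=64).astype(np.int32)
clf.fit(x=x, y=y, batch_size=32, steps=20)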
def testUntrained(self):
  boston = base.load_boston()
  est = estimator.SKCompat(estimator.Estimator(model_fn=linear_model_fn))
  with self.assertRaises(learn.NotFittedError):
    _ = est.score(x=boston.data, y=boston.target.astype(np.float64))
  with self.assertRaises(learn.NotFittedError):
    est.predict(x=boston.data)
def build_lr_estimator(model_dir, feature_count):
  return estimator.SKCompat(
      learn.LinearClassifier(
          feature_columns=[
              tf.contrib.layers.real_valued_column("", dimension=feature_count)
          ],
          n_classes=2,
          model_dir=model_dir))
def main(unused_argv):
  # Load datasets.
  training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
  test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32)

  validation_metrics = {
      "accuracy":
          tf.contrib.learn.MetricSpec(
              metric_fn=tf.contrib.metrics.streaming_accuracy,
              prediction_key=tf.contrib.learn.PredictionKey.CLASSES),
      "precision":
          tf.contrib.learn.MetricSpec(
              metric_fn=tf.contrib.metrics.streaming_precision,
              prediction_key=tf.contrib.learn.PredictionKey.CLASSES),
      "recall":
          tf.contrib.learn.MetricSpec(
              metric_fn=tf.contrib.metrics.streaming_recall,
              prediction_key=tf.contrib.learn.PredictionKey.CLASSES)
  }

  # Create a ValidationMonitor.
  validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
      test_set.data,
      test_set.target,
      every_n_steps=50,
      metrics=validation_metrics,
      early_stopping_metric="loss",
      early_stopping_metric_minimize=True,
      early_stopping_rounds=200)

  # Specify that all features have real-value data.
  feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]

  # Build 3 layer DNN with 10, 20, 10 units respectively.
  classifier = tf.contrib.learn.DNNClassifier(
      feature_columns=feature_columns,
      hidden_units=[10, 20, 10],
      n_classes=3,
      model_dir="/tmp/iris_model",
      config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
  clf = estimator.SKCompat(classifier)

  # Fit model.
  clf.fit(x=training_set.data, y=training_set.target, steps=2000)
  # monitors=[validation_monitor])

  # Evaluate accuracy.
  accuracy_score = clf.score(x=test_set.data, y=test_set.target)["accuracy"]
  print("Accuracy: {0:f}".format(accuracy_score))

  # Classify two new flower samples. SKCompat.predict returns a dict of
  # arrays, so take the predicted classes explicitly.
  new_samples = np.array(
      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float)
  y = list(clf.predict(new_samples)['classes'])
  print('Predictions: {}'.format(str(y)))
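# A brief sketch, assuming the same names as in the snippet above: the
# ValidationMonitor built earlier can be forwarded through SKCompat's fit(),
# which accepts a `monitors` list (the same mechanism the logging-hook example
# further below relies on).
clf.fit(x=training_set.data,
        y=training_set.target,
        steps=2000,
        monitors=[validation_monitor])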
def build_rf_estimator(model_dir, feature_count):
  params = tensor_forest.ForestHParams(
      num_classes=2,
      num_features=feature_count,
      num_trees=100,
      max_nodes=1000,
      min_split_samples=10)
  graph_builder_class = tensor_forest.RandomForestGraphs
  return estimator.SKCompat(
      random_forest.TensorForestEstimator(
          params,
          graph_builder_class=graph_builder_class,
          model_dir=model_dir))
def nonconv(training_data, validation_data, test_data):
  feature_columns = learn.infer_real_valued_columns_from_input(
      training_data.images)
  classifier = learn.DNNClassifier(
      [100],
      feature_columns,
      model_dir=None,
      n_classes=10,
      optimizer=tf.train.FtrlOptimizer(0.3, l2_regularization_strength=0.1),
      activation_fn=nn.sigmoid,
      dropout=0.2)
  estimator.SKCompat(classifier).fit(
      training_data.images,
      training_data.labels.astype(np.int32),
      batch_size=10,
      steps=200000)
  mytuple = (test_data.labels, list(classifier.predict(test_data.images)))
  score = metrics.accuracy_score(*mytuple)
  print('Accuracy: {0:f}'.format(score))
def build_estimator(model_dir): """Build an estimator.""" params = tensor_forest.ForestHParams( num_classes=10, num_features=784, num_trees=FLAGS.num_trees, max_nodes=FLAGS.max_nodes) graph_builder_class = tensor_forest.RandomForestGraphs if FLAGS.use_training_loss: graph_builder_class = tensor_forest.TrainingLossForest # Use the SKCompat wrapper, which gives us a convenient way to split # in-memory data like MNIST into batches. return estimator.SKCompat(random_forest.TensorForestEstimator( params, graph_builder_class=graph_builder_class, model_dir=model_dir))
def testBostonAll(self):
  boston = base.load_boston()
  est = estimator.SKCompat(estimator.Estimator(model_fn=linear_model_fn))
  float64_labels = boston.target.astype(np.float64)
  est.fit(x=boston.data, y=float64_labels, steps=100)
  scores = est.score(
      x=boston.data,
      y=float64_labels,
      metrics={'MSE': metric_ops.streaming_mean_squared_error})
  predictions = np.array(list(est.predict(x=boston.data)))
  other_score = _sklearn.mean_squared_error(predictions, boston.target)
  self.assertAllClose(scores['MSE'], other_score)
  self.assertTrue('global_step' in scores)
  self.assertEqual(100, scores['global_step'])
def loadModel():
    global classifier
    classifier = estimator.SKCompat(estimator.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_DIR))
    df = pd.read_csv(CSV_FILE, header=None)
    train_df = df[0:1]
    x_train = train_df[1]
    x_train = np.array(list(vocab_processor.transform(x_train)), dtype=int)
    y_train = np.array(train_df[0], dtype=int)
    classifier.score(x_train, y_train)
    print('Model updated')
def build_estimator(model_dir):
  params = tensor_forest.ForestHParams(
      num_classes=config.num_classes,
      num_features=config.num_features,
      num_trees=config.num_trees,
      max_nodes=config.max_nodes,
      bagging_fraction=config.bagging_fraction,
      feature_bagging_fraction=config.feature_bagging_fraction)
  graph_builder_class = tensor_forest.RandomForestGraphs
  if config.use_training_loss:
    graph_builder_class = tensor_forest.TrainingLossForest
  # Use the SKCompat wrapper, which gives us a convenient way to split
  # in-memory data like MNIST into batches.
  return estimator.SKCompat(
      random_forest.TensorForestEstimator(
          params, graph_builder_class=graph_builder_class,
          model_dir=model_dir))
def main(unused_argv):
  # global n_words
  # Prepare training and testing data.
  dbpedia = learn.datasets.load_dataset(
      'dbpedia', test_with_fake_data=False)  # FLAGS.test_with_fake_data
  x_train = pandas.DataFrame(dbpedia.train.data)[1]
  y_train = pandas.Series(dbpedia.train.target)
  x_test = pandas.DataFrame(dbpedia.test.data)[1]
  y_test = pandas.Series(dbpedia.test.target)

  if FLAGS.embeddings:
    model_, vocabulary_, x_transform_train, x_transform_test = process_emb(
        x_train, x_test)
  else:
    model_, vocabulary_, x_transform_train, x_transform_test = process_cat(
        x_train, x_test)

  x_train = np.array(list(x_transform_train))
  x_test = np.array(list(x_transform_test))

  setting.n_words = len(vocabulary_)
  print('Total words: %d' % setting.n_words)
  print('x_train shape: ' + str(x_train.shape))
  print('x_test shape: ' + str(x_test.shape))

  # Build model.
  # Switch between rnn_model and bag_of_words_model to test different models.
  model_fn = rnn_model
  if FLAGS.bow_model:
    model_fn = model_
  else:
    model_fn = rnn_model
  classifier = estimator.Estimator(model_fn=model_fn)

  # Train and predict.
  estimator.SKCompat(classifier).fit(x_train, y_train, steps=100)
  y_predicted = [
      p['class'] for p in classifier.predict(x_test, as_iterable=True)
  ]
  score = metrics.accuracy_score(y_test, y_predicted)
  print('Accuracy: {0:f}'.format(score))
def testCheckInputs(self):
  est = estimator.SKCompat(estimator.Estimator(model_fn=linear_model_fn))
  # Lambdas so we have two different objects to compare.
  right_features = lambda: np.ones(shape=[7, 8], dtype=np.float32)
  right_labels = lambda: np.ones(shape=[7, 10], dtype=np.int32)
  est.fit(right_features(), right_labels(), steps=1)
  # TODO(wicke): This does not fail for np.int32 because of data_feeder magic.
  wrong_type_features = np.ones(shape=[7, 8], dtype=np.int64)
  wrong_size_features = np.ones(shape=[7, 10])
  wrong_type_labels = np.ones(shape=[7, 10], dtype=np.float32)
  wrong_size_labels = np.ones(shape=[7, 11])
  est.fit(x=right_features(), y=right_labels(), steps=1)
  with self.assertRaises(ValueError):
    est.fit(x=wrong_type_features, y=right_labels(), steps=1)
  with self.assertRaises(ValueError):
    est.fit(x=wrong_size_features, y=right_labels(), steps=1)
  with self.assertRaises(ValueError):
    est.fit(x=right_features(), y=wrong_type_labels, steps=1)
  with self.assertRaises(ValueError):
    est.fit(x=right_features(), y=wrong_size_labels, steps=1)
def testIrisAll(self):
  iris = base.load_iris()
  est = estimator.SKCompat(
      estimator.Estimator(model_fn=logistic_model_no_mode_fn))
  est.fit(iris.data, iris.target, steps=100)
  scores = est.score(
      x=iris.data,
      y=iris.target,
      metrics={('accuracy', 'class'): metric_ops.streaming_accuracy})
  predictions = est.predict(x=iris.data)
  predictions_class = est.predict(x=iris.data, outputs=['class'])['class']
  self.assertEqual(predictions['prob'].shape[0], iris.target.shape[0])
  self.assertAllClose(predictions['class'], predictions_class)
  self.assertAllClose(predictions['class'],
                      np.argmax(predictions['prob'], axis=1))
  other_score = _sklearn.accuracy_score(iris.target, predictions['class'])
  self.assertAllClose(scores['accuracy'], other_score)
  self.assertTrue('global_step' in scores)
  self.assertEqual(100, scores['global_step'])
def build_model(self, global_step, is_chief, sync, num_replicas):
  # Load datasets.
  self.training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
  self.test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32)

  # Specify that all features have real-value data.
  feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]

  # Build 3 layer DNN with 10, 20, 10 units respectively.
  dnnClassifier = tf.contrib.learn.DNNClassifier(
      feature_columns=feature_columns,
      hidden_units=[10, 20, 10],
      n_classes=3,
      model_dir="/tmp/iris_model")
  self.classifier = estimator.SKCompat(dnnClassifier)
  return None
def train(self, data: np.ndarray, labels: np.ndarray):
    """Trains the decision forest classifier.

    Args:
        data (np.ndarray): The training data.
        labels (np.ndarray): The labels of the training data.
    """
    # Build the estimator.
    if self.use_training_loss:
        graph_builder_class = tensor_forest.TrainingLossForest
    else:
        graph_builder_class = tensor_forest.RandomForestGraphs

    self.estimator = estimator.SKCompat(
        random_forest.TensorForestEstimator(
            self.parameters,
            graph_builder_class=graph_builder_class,
            model_dir=self.model_dir,
            report_feature_importances=self.report_feature_importances))

    self.estimator.fit(x=data, y=labels, batch_size=self.batch_size)
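# A hedged companion sketch for the class above (not part of the original
# code): once train() has run, the SKCompat-wrapped forest can classify new
# samples. The prediction dict key 'probabilities' is an assumption and may
# differ between TF versions.
def predict_proba(self, data: np.ndarray) -> np.ndarray:
    """Returns class probabilities for the given data (illustrative only)."""
    results = self.estimator.predict(x=data, batch_size=self.batch_size)
    return results['probabilities']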
def main(unused_argv):
  # Load datasets.
  training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
  test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32)

  # Specify that all features have real-value data.
  feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]

  # Build 3 layer DNN with 10, 20, 10 units respectively.
  dnnClassifier = tf.contrib.learn.DNNClassifier(
      feature_columns=feature_columns,
      hidden_units=[10, 20, 10],
      n_classes=3,
      model_dir="/tmp/iris_model",
      config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
  classifier = estimator.SKCompat(dnnClassifier)
  # var_names = dnnClassifier.get_variable_names()
  # print("Variable names: {}".format(var_names))

  # Fit model.
  classifier.fit(x=training_set.data, y=training_set.target, max_steps=2000)

  # Evaluate accuracy.
  scores = classifier.score(x=test_set.data, y=test_set.target)
  print("Accuracy: {0:f}".format(scores["accuracy"]))
  print("global_step: {0}".format(scores["global_step"]))
  print("auc: {0}".format(scores["auc"]))
  print("loss: {0}".format(scores["loss"]))

  # Classify two new flower samples.
  new_samples = np.array(
      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float)
  y_predicted = classifier.predict(new_samples)
  print('Predictions: {}'.format(str(y_predicted)))
test, train = train[test_idx, :], \
              train[training_idx, :]
test_labels, train_labels = labels[test_idx], \
                            labels[training_idx]

train = np.array(train, dtype=np.float32)
test = np.array(test, dtype=np.float32)
train_labels = np.array(train_labels, dtype=np.int32)
test_labels = np.array(test_labels, dtype=np.int32)

# Convert features to learn style.
feature_columns = learn.infer_real_valued_columns_from_input(
    train.reshape([-1, 36 * 36]))

# Logistic Regression.
classifier = estimator.SKCompat(
    learn.LinearClassifier(feature_columns=feature_columns, n_classes=5))

# One line training.
# steps is the number of total batches;
# steps * batch_size / len(train) = num_epochs
classifier.fit(train.reshape([-1, 36 * 36]),
               train_labels,
               steps=1024,
               batch_size=32)

# sklearn compatible accuracy.
test_probs = classifier.predict(test.reshape([-1, 36 * 36]))
sklearn.metrics.accuracy_score(test_labels, test_probs['classes'])

# Dense neural net.
classifier = estimator.SKCompat(
                      strides=[1, 2, 2, 1], padding='VALID')

  # Need to flatten conv output for use in dense layer.
  p1_size = np.product([s.value for s in p1.get_shape()[1:]])
  p1f = tf.reshape(p1, [-1, p1_size])

  # Densely connected layer with 32 neurons and dropout.
  h_fc1 = layers.fully_connected(p1f, 5, activation_fn=tf.nn.relu)
  drop = layers.dropout(h_fc1, keep_prob=0.5,
                        is_training=(mode == tf.contrib.learn.ModeKeys.TRAIN))
  logits = layers.fully_connected(drop, 5, activation_fn=None)
  loss = tf.losses.softmax_cross_entropy(y, logits)

  # Setup the training function manually.
  train_op = layers.optimize_loss(
      loss,
      tf.contrib.framework.get_global_step(),
      optimizer='Adam',
      learning_rate=0.01)
  return tf.argmax(logits, 1), loss, train_op


# Use generic estimator with our function.
classifier = estimator.SKCompat(learn.Estimator(model_fn=conv_learn))
classifier.fit(train, train_labels, steps=1024, batch_size=32)

# Simple accuracy.
metrics.accuracy_score(test_labels, classifier.predict(test))
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(accuracy_score(lr.predict(X_test), y_test))

# Linear classifier.
random.seed(42)
tflr = learn.LinearClassifier(
    n_classes=2,
    feature_columns=learn.infer_real_valued_columns_from_input(X_train),
    optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
# tflr.fit(X_train, y_train, batch_size=128, steps=500)
# print(accuracy_score(tflr.predict(X_test), y_test))
est = estimator.SKCompat(tflr)
est.fit(X_train, y_train, batch_size=128, steps=500)
print(accuracy_score(est.predict(X_test)["classes"], y_test))

# 3 layer neural network with rectified linear activation.
random.seed(42)
classifier = learn.DNNClassifier(
    hidden_units=[10, 20, 10],
    n_classes=2,
    feature_columns=learn.infer_real_valued_columns_from_input(X_train),
    optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
# classifier.fit(X_train, y_train, batch_size=128, steps=500)
# print(accuracy_score(classifier.predict(X_test), y_test))

# Use SKCompat.
est = estimator.SKCompat(classifier)
def testEstimatorParams(self):
  boston = base.load_boston()
  est = estimator.SKCompat(
      estimator.Estimator(
          model_fn=linear_model_params_fn,
          params={'learning_rate': 0.01}))
  est.fit(x=boston.data, y=boston.target, steps=100)
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model.
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:3300]
    test_df = df.drop(train_df.index)

    # x - news title, y - class
    x_train = train_df[1]
    x_train = x_train.str.replace('[^\x00-\x7F]', '')
    #####################################
    '''
    x_train = train_df[2]
    x_train = x_train.str.replace('[^\x00-\x7F]', '')
    tokenizer = RegexpTokenizer(r"\w+")
    stemmer = PorterStemmer()
    # wnl = WordNetLemmatizer()
    for i in xrange(0, 3000):
        x_train[i] = str(x_train[i])
        x_train[i] = tokenizer.tokenize(x_train[i])
        x_train[i] = list(word for word in x_train[i]
                          if word not in stopwords.words('english'))
        x_train[i] = [stemmer.stem(word) for word in x_train[i]]
        # x_train[i] = [wnl.lemmatize(word) for word in x_train[i]]
        x_train[i] = " ".join(str(word) for word in x_train[i])
    '''
    ###########################################################
    y_train = np.array(train_df[0], dtype=int)
    x_test = test_df[1]
    y_test = np.array(test_df[0], dtype=int)

    # Process vocabulary.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))
    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor (binary mode for pickle).
    with open(VARS_FILE, 'wb') as f:
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model.
    classifier = estimator.SKCompat(
        estimator.Estimator(
            model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
            model_dir=MODEL_OUTPUT_DIR,
            config=learn.RunConfig(save_checkpoints_secs=10,
                                   save_summary_steps=10)))

    # Set up logging for predictions.
    tensors_to_log = {"prob": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=100)

    # Train and predict.
    classifier.fit(x_train, y_train, batch_size=BATCH, steps=STEPS,
                   monitors=[logging_hook])

    # Configure the accuracy metric.
    metrics = {
        "accuracy":
            learn.MetricSpec(metric_fn=tf.metrics.accuracy,
                             prediction_key="class")
    }

    # Evaluate the model.
    eval_results = classifier.score(x=x_test, y=y_test, metrics=metrics)
test_idx, training_idx = indices[:valid_cnt], \
                         indices[valid_cnt:]
test, train = train[test_idx, :], \
              train[training_idx, :]
test_labels, train_labels = labels[test_idx], \
                            labels[training_idx]

train = np.array(train, dtype=np.float32)
test = np.array(test, dtype=np.float32)
train_labels = np.array(train_labels, dtype=np.int32)
test_labels = np.array(test_labels, dtype=np.int32)

# Convert features to learn style.
feature_columns = learn.infer_real_valued_columns_from_input(
    train.reshape([-1, 36 * 36]))

# Logistic Regression.
classifier = estimator.SKCompat(
    learn.LinearClassifier(feature_columns=feature_columns, n_classes=5))

# One line training.
# steps is the number of total batches;
# steps * batch_size / len(train) = num_epochs
classifier.fit(train.reshape([-1, 36 * 36]),
               train_labels,
               steps=1024,
               batch_size=32)

# sklearn compatible accuracy.
test_probs = classifier.predict(test.reshape([-1, 36 * 36]))
sklearn.metrics.accuracy_score(test_labels, test_probs['classes'])
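# A worked example of the steps/batch_size/epochs relationship noted in the
# comment above, assuming a hypothetical training set of 8,000 rows:
steps, batch_size, num_rows = 1024, 32, 8000  # num_rows is an assumed size
num_epochs = steps * batch_size / float(num_rows)  # 1024 * 32 / 8000 = 4.096
print('Approximate epochs: {:.1f}'.format(num_epochs))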