def _test_pandas_input_fn_helper(self, fc_impl, fn_to_run):
    """Run `fn_to_run` with input functions built from pandas objects.

    The features are a one-column DataFrame (`'x'`) of ten evenly spaced
    float32 values from 0 to 2, and the labels are a Series of the same
    values. Returns immediately when `HAS_PANDAS` is falsy.

    Args:
      fc_impl: Passed through unchanged to `fn_to_run` as `fc_impl`.
      fn_to_run: Callable invoked with the train/eval/predict input
        functions plus `input_dimension`, `label_dimension`, `batch_size`
        and `fc_impl` keyword arguments.
    """
    if not HAS_PANDAS:
        return

    label_dimension = 1
    batch_size = 10
    values = np.linspace(0., 2., batch_size, dtype=np.float32)
    frame = pd.DataFrame({'x': values})
    labels = pd.Series(values)

    # Training repeats indefinitely and shuffles; eval/predict make one
    # ordered pass (predict has no labels).
    flow_kwargs = {
        'train_input_fn': pandas_io.pandas_input_fn(
            x=frame, y=labels, batch_size=batch_size, num_epochs=None,
            shuffle=True),
        'eval_input_fn': pandas_io.pandas_input_fn(
            x=frame, y=labels, batch_size=batch_size, shuffle=False),
        'predict_input_fn': pandas_io.pandas_input_fn(
            x=frame, batch_size=batch_size, shuffle=False),
        'input_dimension': label_dimension,
        'label_dimension': label_dimension,
        'batch_size': batch_size,
        'fc_impl': fc_impl,
    }
    fn_to_run(**flow_kwargs)
def test_pandas_input_fn(self):
    """Exercise `_test_complete_flow` with pandas-backed input functions.

    Features are ten evenly spaced float32 values spanning
    [0, n_classes - 1]; labels are those values passed through
    `self._as_label`. Returns immediately when `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return

    input_dimension = 1
    n_classes = 3
    batch_size = 10

    values = np.linspace(0., n_classes - 1., batch_size, dtype=np.float32)
    frame = pd.DataFrame({'x': values})
    labels = pd.Series(self._as_label(values))

    # Training repeats indefinitely with shuffling.
    train_fn = pandas_io.pandas_input_fn(
        x=frame, y=labels, batch_size=batch_size, num_epochs=None,
        shuffle=True)
    # Evaluation and prediction read the data in order.
    eval_fn = pandas_io.pandas_input_fn(
        x=frame, y=labels, batch_size=batch_size, shuffle=False)
    predict_fn = pandas_io.pandas_input_fn(
        x=frame, batch_size=batch_size, shuffle=False)

    self._test_complete_flow(
        train_input_fn=train_fn,
        eval_input_fn=eval_fn,
        predict_input_fn=predict_fn,
        input_dimension=input_dimension,
        n_classes=n_classes,
        batch_size=batch_size)
def testPandasInputFn_NonBoolShuffle(self):
    """Check that leaving `shuffle` unset raises a ValueError.

    Returns immediately when `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return
    x, _ = self.makeTestDataFrame()
    y_noindex = pd.Series(np.arange(-32, -28))
    # Use assertRaisesRegex: the assertRaisesRegexp alias is deprecated and
    # was removed from unittest.TestCase in Python 3.12.
    with self.assertRaisesRegex(
            ValueError,
            'shuffle must be provided and explicitly set as boolean'):
        # Default shuffle is None
        pandas_io.pandas_input_fn(x, y_noindex)
def testPandasInputFn_IndexMismatch(self):
    """Expect a ValueError when `y` is a Series built without `x`'s index.

    Returns immediately when `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return
    features, _ = self.makeTestDataFrame()
    # Series created with the default index rather than the frame's index.
    targets_default_index = pd.Series(np.arange(-32, -28))
    with self.assertRaises(ValueError):
        pandas_io.pandas_input_fn(
            features,
            targets_default_index,
            batch_size=2,
            shuffle=False,
            num_epochs=1)
def testPandasInputFn_RaisesWhenTargetColumnIsAList(self):
    """Check that a list-valued `target_column` raises a TypeError.

    Returns immediately when `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return
    x, y = self.makeTestDataFrame()
    # Use assertRaisesRegex: the assertRaisesRegexp alias is deprecated and
    # was removed from unittest.TestCase in Python 3.12.
    with self.assertRaisesRegex(TypeError,
                                'target_column must be a string type'):
        pandas_io.pandas_input_fn(
            x,
            y,
            batch_size=2,
            shuffle=False,
            num_epochs=1,
            target_column=['one', 'two'])
def trainingInputFunction(self):
    """Build the input function over the training data.

    Features come from `self.trainX` and labels from the `'prognosis'`
    column of `self.trainY`. The input function is configured for one
    shuffled epoch with batches of 128 and a single reader thread.

    Returns:
      The value returned by `pandas_input_fn`.
    """
    features = self.trainX
    labels = self.trainY['prognosis']
    return pandas_input_fn(
        features,
        labels,
        batch_size=128,
        num_epochs=1,
        shuffle=True,
        num_threads=1)
def testPandasInputFn_Idempotent(self):
    """Build and invoke the input function twice for each shuffle setting.

    Runs the unshuffled configuration twice, then the shuffled one twice.
    Returns immediately when `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return
    features, targets = self.makeTestDataFrame()
    for shuffle in (False, True):
        for _ in range(2):
            input_fn = pandas_io.pandas_input_fn(
                features, targets, batch_size=2, shuffle=shuffle,
                num_epochs=1)
            input_fn()
def testingInputFunction(self):
    """Build the input function over the test data.

    Features come from `self.testX` and labels from the `'prognosis'`
    column of `self.testY`. The input function is configured for one
    unshuffled epoch with batches of 128 and a single reader thread.

    Returns:
      The value returned by `pandas_input_fn`.
    """
    features = self.testX
    labels = self.testY['prognosis']
    return pandas_input_fn(
        features,
        labels,
        batch_size=128,
        num_epochs=1,
        shuffle=False,
        num_threads=1)
def testPandasInputFn_ProducesOutputsForLargeBatchAndMultipleEpochs(self):
    """Check output when the batch size exceeds the data across two epochs.

    With two rows, `batch_size=128` and `num_epochs=2`, the first run is
    expected to return all four rows (both epochs) and the second run is
    expected to raise `OutOfRangeError`. Returns immediately when
    `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return
    with self.cached_session() as session:
        index = np.arange(100, 102)
        a = np.arange(2)
        b = np.arange(32, 34)
        x = pd.DataFrame({'a': a, 'b': b}, index=index)
        y = pd.Series(np.arange(-32, -30), index=index)
        input_fn = pandas_io.pandas_input_fn(
            x, y, batch_size=128, shuffle=False, num_epochs=2)

        results = input_fn()

        coord = coordinator.Coordinator()
        threads = queue_runner_impl.start_queue_runners(session, coord=coord)
        # Stop and join the queue-runner threads even if a run or an
        # assertion below fails, so they are not left running.
        try:
            features, target = session.run(results)
            self.assertAllEqual(features['a'], [0, 1, 0, 1])
            self.assertAllEqual(features['b'], [32, 33, 32, 33])
            self.assertAllEqual(target, [-32, -31, -32, -31])

            with self.assertRaises(errors.OutOfRangeError):
                session.run(results)
        finally:
            coord.request_stop()
            coord.join(threads)
def func():
    """Train and evaluate a binary LinearClassifier on Social_Network_Ads.csv.

    Columns 1-3 of each split are used as features and column 4 as the
    label. The function plots the training features, trains for 1000
    steps, prints each test label next to its predicted class, prints the
    evaluation metrics sorted by key, and displays a confusion matrix.
    """
    train_df, test_df = LoadUtil.load_data_df_sk('Social_Network_Ads.csv')
    x_train_df, y_train_df = train_df.iloc[:, 1:4], train_df.iloc[:, 4]
    x_test_df, y_test_df = test_df.iloc[:, 1:4], test_df.iloc[:, 4]

    PlotUtil.pairplot(x_train_df, hue='Gender')

    gender_column = fc.categorical_column_with_vocabulary_list(
        'Gender', vocabulary_list=['Male', 'Female'])
    # Numeric inputs are scaled down by fixed divisors before use.
    age_column = fc.numeric_column(
        'Age', dtype=tf.float32,
        normalizer_fn=lambda x: (x / np.float32(100)))
    salary_column = fc.numeric_column(
        'EstimatedSalary', dtype=tf.float32,
        normalizer_fn=lambda x: (x / np.float32(100000)))
    feature_columns = [gender_column, age_column, salary_column]

    classifier = tf.estimator.LinearClassifier(
        feature_columns=feature_columns, n_classes=2, model_dir='logs/')

    train_input_fn = pandas_input_fn(
        x=x_train_df, y=y_train_df, num_epochs=None, shuffle=True)
    classifier.train(train_input_fn, steps=1000)

    test_input_fn = pandas_input_fn(
        x=x_test_df, y=y_test_df, num_epochs=1, shuffle=False)

    y_pred = []
    probabilities = classifier.predict(input_fn=test_input_fn)
    # Pull exactly one prediction per test label.
    for actual in y_test_df.values:
        outcome = next(probabilities)
        classes = outcome['classes']
        y_pred.append(int(classes))
        print('test: {} predict: {}'.format(actual, classes))

    eval_results = classifier.evaluate(input_fn=test_input_fn)
    for key, value in sorted(eval_results.items()):
        print('%s: %s' % (key, value))

    PlotUtil.display_confusion_matrix(y_test_df.values, y_pred)
def func():
    """Train a LinearRegressor on 50_Startups.csv and plot its predictions.

    The first four columns of each split are used as features and column 4
    as the label. The function plots the training features, trains for 100
    steps, collects one prediction per test row, and displays them against
    the test labels.
    """
    train_df, test_df = LoadUtil.load_data_df_sk('50_Startups.csv')
    x_train_df, y_train_df = train_df.iloc[:, :4], train_df.iloc[:, 4]
    x_test_df, y_test_df = test_df.iloc[:, :4], test_df.iloc[:, 4]

    PlotUtil.pairplot(x_train_df)

    numeric_names = ('RnDSpend', 'Administration', 'MarketingSpend')
    feature_columns = [
        fc.numeric_column(name, dtype=tf.float32) for name in numeric_names
    ]
    feature_columns.append(
        fc.categorical_column_with_vocabulary_list(
            'State', vocabulary_list=['New York', 'California', 'Florida']))

    train_input_fn = pandas_input_fn(
        x=x_train_df, y=y_train_df, num_epochs=None, shuffle=True)
    linear_est = tf.estimator.LinearRegressor(
        feature_columns=feature_columns, model_dir='logs/')
    linear_est.train(train_input_fn, steps=100)

    test_input_fn = pandas_input_fn(
        x=x_test_df, y=y_test_df, num_epochs=1, shuffle=False)
    predictions = linear_est.predict(test_input_fn)

    # Pull exactly one prediction per test label.
    y_pred = [
        next(predictions)['predictions']
        for _ in range(len(y_test_df.values))
    ]

    PlotUtil.display_multiple_linear_result(
        y_test_df.values, y_pred, x_label='#', y_label='Profit')
def testPandasInputFn_RespectsEpoch_WithShuffle(self):
    """Check a shuffled, one-epoch input function via assertInputsCallableNTimes.

    Uses `batch_size=4` and an expected count of 1. Returns immediately
    when `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return
    with self.cached_session() as session:
        features, targets = self.makeTestDataFrame()
        shuffled_fn = pandas_io.pandas_input_fn(
            features, targets, batch_size=4, shuffle=True, num_epochs=1)
        self.assertInputsCallableNTimes(shuffled_fn, session, 1)
def testPandasInputFn_RespectsEpochUnevenBatches(self):
    """Check a one-epoch input function with `batch_size=3`.

    Passes an expected count of 2 to `assertInputsCallableNTimes`.
    Returns immediately when `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return
    features, targets = self.makeTestDataFrame()
    with self.cached_session() as session:
        uneven_fn = pandas_io.pandas_input_fn(
            features, targets, batch_size=3, shuffle=False, num_epochs=1)
        # Before the last batch, only one element of the epoch should
        # remain.
        self.assertInputsCallableNTimes(uneven_fn, session, 2)
def testPandasInputFn_ExcludesIndex(self):
    """Check that the returned features contain no `'index'` entry.

    Returns immediately when `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return
    with self.cached_session() as session:
        frame, targets = self.makeTestDataFrame()
        input_fn = pandas_io.pandas_input_fn(
            frame, targets, batch_size=2, shuffle=False, num_epochs=1)

        features, _ = self.callInputFnOnce(input_fn, session)

        has_index_key = 'index' in features
        self.assertFalse(has_index_key)
def testPandasInputFn_OnlyX(self):
    """Check the first batch of features when `y=None` is passed.

    With no targets, the result of `callInputFnOnce` is used directly as
    the features. Returns immediately when `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return
    with self.cached_session() as session:
        frame, _ = self.makeTestDataFrame()
        features_only_fn = pandas_io.pandas_input_fn(
            frame, y=None, batch_size=2, shuffle=False, num_epochs=1)

        features = self.callInputFnOnce(features_only_fn, session)

        self.assertAllEqual(features['a'], [0, 1])
        self.assertAllEqual(features['b'], [32, 33])
def predictDisease(self, symptomList):
    """Predict a disease for one row of symptom values.

    Args:
      symptomList: One row of values, wrapped into a single-row DataFrame
        whose columns are `self.featureList`.

    Returns:
      The entry of `self.diseases` at the argmax of the first prediction's
      `'logits'`.
    """
    frame = pd.DataFrame([symptomList], columns=self.featureList)
    single_row_fn = pandas_input_fn(
        frame, None, batch_size=1, num_epochs=1, shuffle=False,
        num_threads=1)
    prediction = self.tree.predict(
        single_row_fn,
        predict_keys=None,
        hooks=None,
        checkpoint_path=None,
        yield_single_examples=True)
    # Materialize all predictions, then use the first one.
    first_result = list(prediction)[0]
    disease = numpy.argmax(first_result['logits'])
    return self.diseases[int(disease)]
def testPandasInputFn_ProducesExpectedOutputs(self):
    """Check the first unshuffled batch of two rows for features and target.

    Returns immediately when `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return
    with self.cached_session() as session:
        frame, labels = self.makeTestDataFrame()
        ordered_fn = pandas_io.pandas_input_fn(
            frame, labels, batch_size=2, shuffle=False, num_epochs=1)

        features, target = self.callInputFnOnce(ordered_fn, session)

        self.assertAllEqual(features['a'], [0, 1])
        self.assertAllEqual(features['b'], [32, 33])
        self.assertAllEqual(target, [-32, -31])
def testPandasInputFnYIsDataFrame_HandlesOverlappingColumnsInTargets(self):
    """Check features and targets when a target column shares a feature name.

    The target columns are renamed to `'a'` and `'a_n'`, so `'a'` appears
    in both `x` and `y`. The first batch is expected to keep the feature
    and target values separate. Returns immediately when `HAS_PANDAS` is
    falsy.
    """
    if not HAS_PANDAS:
        return
    with self.cached_session() as session:
        frame, target_frame = self.makeTestDataFrameWithYAsDataFrame()
        renamed_targets = target_frame.rename(
            columns={'a_target': 'a', 'b_target': 'a_n'})
        input_fn = pandas_io.pandas_input_fn(
            frame, renamed_targets, batch_size=2, shuffle=False,
            num_epochs=1)

        features, targets = self.callInputFnOnce(input_fn, session)

        self.assertAllEqual(features['a'], [0, 1])
        self.assertAllEqual(features['b'], [32, 33])
        self.assertAllEqual(targets['a'], [10, 11])
        self.assertAllEqual(targets['a_n'], [50, 51])
def testPandasInputFn_ProducesOutputsWhenDataSizeNotDividedByBatchSize(
        self):
    """Check batches when the row count is not a multiple of the batch size.

    With five rows and `batch_size=2` over one epoch, the expected batches
    have sizes 2, 2 and 1, after which a further run is expected to raise
    `OutOfRangeError`. Returns immediately when `HAS_PANDAS` is falsy.
    """
    if not HAS_PANDAS:
        return
    with self.cached_session() as session:
        index = np.arange(100, 105)
        a = np.arange(5)
        b = np.arange(32, 37)
        x = pd.DataFrame({'a': a, 'b': b}, index=index)
        y = pd.Series(np.arange(-32, -27), index=index)

        input_fn = pandas_io.pandas_input_fn(
            x, y, batch_size=2, shuffle=False, num_epochs=1)

        results = input_fn()

        coord = coordinator.Coordinator()
        threads = queue_runner_impl.start_queue_runners(session, coord=coord)
        # Stop and join the queue-runner threads even if a run or an
        # assertion below fails, so they are not left running.
        try:
            features, target = session.run(results)
            self.assertAllEqual(features['a'], [0, 1])
            self.assertAllEqual(features['b'], [32, 33])
            self.assertAllEqual(target, [-32, -31])

            features, target = session.run(results)
            self.assertAllEqual(features['a'], [2, 3])
            self.assertAllEqual(features['b'], [34, 35])
            self.assertAllEqual(target, [-30, -29])

            # Final, partial batch holds the single remaining row.
            features, target = session.run(results)
            self.assertAllEqual(features['a'], [4])
            self.assertAllEqual(features['b'], [36])
            self.assertAllEqual(target, [-28])

            with self.assertRaises(errors.OutOfRangeError):
                session.run(results)
        finally:
            coord.request_stop()
            coord.join(threads)