def test_make_query_for_querio_view(self):
    """Run an object query through an Interface built on 'querio_view'.

    The test only executes when the module-level ``is_travis`` flag is
    truthy; otherwise it is reported as skipped.  It builds a
    ``QueryObject`` for 'height' with two conditions and checks that
    ``Interface.object_query`` returns something other than ``None``.
    """
    if not is_travis:
        # A bare ``return`` would be counted as a pass; skipTest makes it
        # visible in the report that nothing was actually exercised.
        self.skipTest('is_travis is not set')

    i = Interface(self.db_uri, 'querio_view')
    object1 = QueryObject('height')
    object1.add((Feature('profession_name') == 'programmer'))
    object1.add(Feature('stars') > 20)

    result1 = i.object_query(object1)
    self.assertIsNotNone(result1)
def test_loading_by_chunk(self):
    """Train a Model from '1000.csv' read in chunks of 100 rows.

    After one query has been run, the model is expected to hold 10 trees
    and 990 rows of plot data.
    """
    csv_path = os.path.join(os.path.dirname(__file__), '1000.csv')
    chunks = pd.read_csv(csv_path, chunksize=100)
    model = Model(chunks, 'table', "", ['age', 'height'], 'income', '')

    # The query result itself is not inspected; the call only has to succeed.
    _prediction = model.query(Feature('age') > 20)

    self.assertEqual(len(model.trees), 10)
    self.assertEqual(len(model.plot_data), 990)
def visualize_decision(self, feature, axis, expression=None,
                       prediction_style='b-', actual_style='r.',
                       query_points=100, param_dict=None):
    """Plot the prediction with some real data points.

    Plots query(Feature(feature) == x) for evenly spaced points between
    the recorded minimum and maximum of the feature, and on the same
    axis plots the actual data points stored in ``self.plot_data``.

    By default the prediction is plotted with a blue line and the actual
    points are plotted as red dots. It's recommended to plot the actual
    points with points, as their order in the dataset is arbitrary.

    Parameters:
    feature: str
        The feature to plot.
    axis: matplotlib.axes.Axes object
        The axis object the plot is made to.
    expression: Expression
        An expression limiting the query range. Only query points and
        actual points that match the expression are plotted. If the
        expression is very restrictive compared to the range of the
        plotted feature, ensure that query_points is set high enough to
        get an appropriate number of points in the plot.
    prediction_style: str
        The style of the prediction. Default blue line (b-)
    actual_style: str
        The style of the actual points. Default red dot (r.)
    query_points: int
        The number of points the model is queried at. Default 100
    param_dict: dict or None
        Extra keyword arguments passed to both ``axis.plot`` calls.
        ``None`` (the default) means no extra arguments.

    Example:
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        model.visualize_decision('age', ax)
        fig.show()
    """
    # A fresh dict per call instead of a shared mutable default argument.
    plot_kwargs = {} if param_dict is None else param_dict

    bounds = self.feature_min_max_count[feature]
    xs = np.linspace(bounds['min'], bounds['max'], query_points)

    if expression is not None:
        xs = [x for x in xs if expression.match(feature, x)]
        # Only the plotted feature's column is needed to decide a match, so
        # build the mask from that column.  astype(bool) keeps the mask a
        # boolean indexer even when plot_data has no rows.
        mask = self.plot_data[feature].apply(
            lambda value: expression.match(feature, value)).astype(bool)
        matching_rows = self.plot_data[mask]
    else:
        matching_rows = self.plot_data

    predictions = [self.query(Feature(feature) == x).result for x in xs]
    axis.plot(xs, predictions, prediction_style, **plot_kwargs)
    axis.plot(matching_rows[feature],
              matching_rows[self.output_name],
              actual_style,
              **plot_kwargs)
class ModelTest(unittest.TestCase):
    """Tests for Model built on a small in-memory DataFrame."""

    def setUp(self):
        """Build an 8-row DataFrame and one Model per feature combination."""
        ages = [22, 44, 36, 64, 32, 86, 11, 45]
        incomes = [age * 301 for age in ages]
        heights = [age * 50 for age in ages]
        github_stars = [age * 20 + 10 for age in ages]
        professions = [
            'accountant', 'janitor', 'president', 'janitor',
            'accountant', 'programmer', 'janitor', 'account_manager'
        ]
        is_client = [True, True, False, True, False, False, False, True]
        self.data = pd.DataFrame({
            'age': ages,
            'income': incomes,
            'height': heights,
            'github_stars': github_stars,
            'profession': professions,
            'is_client': is_client,
            'prof_with_underscore': professions
        })
        # Every model predicts 'income'; only the feature set differs.
        self.models = {
            'One feature':
                Model(self.data, 'table', "", 'age', 'income', ''),
            'One feature with boolean':
                Model(self.data, 'table', "", 'is_client', 'income', ''),
            'One feature with categorical':
                Model(self.data, 'table', "", 'profession', 'income', ''),
            'One feature with categorical_underscore':
                Model(self.data, 'table', "", 'prof_with_underscore',
                      'income', ''),
            'Two features with categorical':
                Model(self.data, 'table', "", ['age', 'profession'],
                      'income', ''),
            'Two features':
                Model(self.data, 'table', "", ['age', 'height'],
                      'income', ''),
            'Two features reverse':
                Model(self.data, 'table', "", ['height', 'age'],
                      'income', ''),
            'Three features':
                Model(self.data, 'table', "",
                      ['age', 'height', 'github_stars'], 'income', '')
        }

    @parameterized.expand([
        ('One feature', Cond('age', Op.eq, 35)),
        ('One feature with boolean', Cond('is_client', Op.eq, True)),
        ('One feature with boolean', Feature('is_client') == 1),
        ('One feature with categorical',
         Cond('profession', Op.eq, 'janitor')),
        ('One feature with categorical_underscore',
         (Feature('prof_with_underscore') == 'janitor')),
        ('One feature with categorical_underscore',
         (Feature('prof_with_underscore') == 'account_manager')),
        ('One feature with categorical',
         Feature('profession') == 'janitor'),
        ('Two features',
         ExpressionTreeNode(Cond('age', Op.eq, 35),
                            BoolOp.and_,
                            Cond('height', Op.eq, 700))),
        ('Two features',
         (Feature('age') == 20) & (Feature('height') == 800)),
        ('Two features',
         (Feature('age') == 20) | (Feature('height') == 800)),
        ('Three features',
         (((Feature('age') == 20) | (Feature('height') == 800))
          & (Feature('github_stars') == 300))),
    ])
    def test_query_gives_value_in_correct_range(self, name, test_conditions):
        """The predicted income lies within the data range, variance >= 0."""
        prediction = self.models[name].query(test_conditions)
        self.assertGreaterEqual(prediction.result, self.data['income'].min())
        self.assertLessEqual(prediction.result, self.data['income'].max())
        self.assertGreaterEqual(prediction.variance, 0)

    @parameterized.expand([('One feature', Cond('age', Op.eq, 40)),
                           ('Two features',
                            ExpressionTreeNode(Cond('age', Op.eq, 41),
                                               BoolOp.and_,
                                               Cond('height', Op.eq, 134)))])
    # unittest.skip is documented as taking a reason; the original used it
    # bare, which leaves the reason empty in the test report.
    @unittest.skip('Disabled in the suite; no reason was recorded.')
    def test_query_same_mean_as_sklearn_predict(self, name, test_condition):
        """Compare the query result with the tree's own ``predict`` output."""
        model = self.models[name]
        prediction = model.query(test_condition)
        self.assertAlmostEqual(
            model.tree.predict([[cond.threshold
                                 for cond in test_condition]])[0],
            prediction.result)

    @parameterized.expand([
        ('Too old', Feature('age') > 100),
        ('Contradiction',
         ((Feature('height') > 1000) & (Feature('height') < 900))),
        ('Too few github stars', Feature('github_stars') == 100)
    ])
    def test_query_raises_NoMatch_when_no_rows_match(self, name,
                                                     test_condition):
        """A condition that matches no rows makes ``query`` raise NoMatch."""
        model = self.models['Three features']
        # Only the call under test sits inside the assertRaises block.
        with self.assertRaises(NoMatch):
            model.query(test_condition)

    def test_query_raises_ValueError_with_bad_feature_names(self):
        """Querying a feature the model was not built with raises ValueError."""
        with self.assertRaises(ValueError):
            self.models['Two features'].query(Cond('github_stars', Op.eq, 0))

    def test_query_raises_ValueError_with_bad_categorical_feature_values(self):
        """Querying a category value absent from the data raises ValueError."""
        with self.assertRaises(ValueError):
            self.models['One feature with categorical'].query(
                Cond('profession', Op.eq, 'firefighter'))

    @unittest.expectedFailure
    def test_reversing_features_doesnt_change_prediction(self):
        """Marked as an expected failure; queries are passed as plain lists."""
        pred = self.models['Two features'].query([25, 130])
        pred_reverse = self.models['Two features reverse'].query([130, 25])
        self.assertAlmostEqual(pred.result, pred_reverse.result)

    @parameterized.expand([
        ('One feature',),
        ('Two features',),
        ('Three features',),
    ])
    def test_train_score_is_sensible(self, name):
        """The training score never exceeds 1."""
        score = self.models[name].get_score_for_train()
        self.assertLessEqual(score, 1)

    @parameterized.expand([
        ('One feature',),
        ('Two features',),
        ('Three features',),
    ])
    def test_test_score_is_sensible(self, name):
        """The test score never exceeds 1."""
        score = self.models[name].get_score_for_test()
        self.assertLessEqual(score, 1)

    def test_loading_by_chunk(self):
        """A Model fed '1000.csv' in chunks of 100 holds 10 trees."""
        data = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                        '1000.csv'),
                           chunksize=100)
        model = Model(data, 'table', "", ['age', 'height'], 'income', '')
        # The result is not inspected; the query only has to succeed.
        model.query(Feature('age') > 20)
        self.assertEqual(len(model.trees), 10)
        self.assertEqual(len(model.plot_data), 990)

    @parameterized.expand([('Two features with categorical', 'profession'),
                           ('One feature with categorical_underscore',
                            'prof_with_underscore')])
    def test_get_categories_for_feature_returns_categories(self, model, feat):
        """The categories of a categorical feature match the fixture data."""
        model = self.models[model]
        categories = model.get_categories_for_feature(feat)
        true_categories = [
            'accountant', 'janitor', 'president', 'programmer',
            'account_manager'
        ]
        # Order is not part of the contract being tested, so compare sorted.
        self.assertEqual(sorted(categories), sorted(true_categories))

    @parameterized.expand([('age',), ('income',), ('not_feature',)])
    def test_get_categories_for_feature_raises_with_not_categorical(
            self, feature):
        """Asking for categories of these names raises ValueError."""
        model = self.models['Two features with categorical']
        with self.assertRaises(ValueError):
            model.get_categories_for_feature(feature)

    def __render_graph(self, model, name):
        """Debug helper: render ``model.export_graphviz()`` to file *name*."""
        import graphviz
        graph = graphviz.Source(model.export_graphviz())
        graph.render(name)
class TestExpressionNode(unittest.TestCase):
    """Checks ``match`` on single conditions and and/or combinations."""

    @parameterized.expand([
        ('Simple true', Feature('age') > 30, 'age', 40, True),
        ('Simple false', Feature('age') < 30, 'age', 40, False),
        ('Simple equal', Feature('age') == 30, 'age', 30, True),
        ('Simple not equal', Feature('age') == 30, 'age', 40, False),
        ('Simple limit', Feature('age') < 30, 'age', 30, False),
        ('And true',
         ((Feature('age') > 30) & (Feature('age') < 50)),
         'age', 40, True),
        ('And false',
         ((Feature('age') > 30) & (Feature('age') < 50)),
         'age', 20, False),
        ('Or true',
         ((Feature('age') > 30) | (Feature('age') < 20)),
         'age', 40, True),
        ('Or false',
         ((Feature('age') > 30) | (Feature('age') < 20)),
         'age', 25, False),
    ])
    def test_match(self, name, expression, feature, value, is_match):
        """``expression.match(feature, value)`` equals the expected flag."""
        actual = expression.match(feature, value)
        self.assertEqual(actual, is_match)
class TreeTraversalTest(unittest.TestCase):
    """Checks ``query_one_tree`` against pre-calculated values."""

    def setUp(self):
        """Fit a seeded DecisionTreeRegressor on the training split."""
        ages = [22, 44, 36, 64, 32, 86, 11, 45]
        incomes = [age * 301 for age in ages]
        heights = [age * 50 for age in ages]
        github_stars = [age * 20 + 10 for age in ages]
        self.data = pd.DataFrame({
            'age': ages,
            'income': incomes,
            'height': heights,
            'github_stars': github_stars
        })
        # NOTE(review): newer scikit-learn releases renamed 'mse' to
        # 'squared_error' -- confirm against the pinned version before
        # upgrading.  Left unchanged because the expected values below
        # depend on the fitted tree.
        self.tree = sklearn.tree.DecisionTreeRegressor(
            criterion='mse',
            random_state=42,
        )
        self.feature_names = ['age', 'height', 'github_stars']
        feature_columns = {
            'age': ages,
            'height': heights,
            'github_stars': github_stars,
        }
        self.feature_min_maxes = {
            feature: {'min': min(values), 'max': max(values)}
            for feature, values in feature_columns.items()
        }
        # Only the training part of the split is used for fitting.
        train, _ = sklearn.model_selection.train_test_split(
            self.data, random_state=42)
        self.tree.fit(train[self.feature_names], train['income'])

    @parameterized.expand([
        ('0', Cond('age', Op.eq, 42), 10535),
        ('1',
         ExpressionTreeNode(Cond('age', Op.eq, 33),
                            BoolOp.and_,
                            Cond('height', Op.eq, 600)),
         5191.320987654321),
        ('2',
         ExpressionTreeNode(Cond('height', Op.eq, 700),
                            BoolOp.and_,
                            Cond('github_stars', Op.eq, 350)),
         3311),
        ('3', Cond('github_stars', Op.eq, 420), 6977.576923076923),
        ('4', Feature('age') == 42, 10535),
        ('5', Feature('height') > 2500, 17872.891891891893),
        ('6', Feature('github_stars') > 700, 14071.329608938544),
        ('7', Feature('age') > 40, 10535),
        ('8',
         ExpressionTreeNode(Feature('height') < 1000,
                            BoolOp.and_,
                            Feature('github_stars') > 700),
         10836),
        ('9',
         ((Feature('height') == 1000) | (Feature('github_stars') == 250)),
         7399.2865474884),
        ('10',
         (Feature('github_stars') == 100)
         | ((Feature('github_stars') == 700) & (Feature('height') == 1500)),
         10836),
        ('11',
         (Feature('github_stars') < 250)
         | ((Feature('github_stars') == 700) & (Feature('height') == 1500)),
         6977.5935071991),
    ])
    def test_query_same_value_as_pre_calculated(self, name, test_conditions,
                                                true_result):
        """The first element of the query result matches the stored value."""
        prediction = query_one_tree(self.tree, test_conditions,
                                    self.feature_names,
                                    self.feature_min_maxes)
        self.assertAlmostEqual(true_result, prediction[0])

    def __render_graph(self, tree, name):
        """Debug helper: export *tree* as graphviz and render it to *name*.

        The original ignored *tree* and always exported ``self.tree``;
        the argument is now honoured, with ``self.tree`` as the fallback
        when ``None`` is passed.
        """
        import graphviz
        if tree is None:
            tree = self.tree
        dot_source = sklearn.tree.export_graphviz(
            tree,
            out_file=None,
            feature_names=self.feature_names,
            filled=True,
            rounded=True,
            special_characters=True)
        graph = graphviz.Source(dot_source)
        graph.render(name)
def test_feature_eq_op(self, feature, threshold):
    """``Feature(feature) == threshold`` yields the matching condition.

    The resulting object must carry the feature name, the threshold and
    ``Op.eq`` as its operator.
    """
    condition = Feature(feature) == threshold

    self.assertEqual(feature, condition.feature)
    self.assertAlmostEqual(threshold, condition.threshold)
    self.assertEqual(Op.eq, condition.op)