Beispiel #1
0
    def test_make_query_for_querio_view(self):
        """Run an object query against the ``querio_view`` view.

        Builds a ``QueryObject`` for ``height`` restricted by profession
        name and star count, passes it to ``Interface.object_query`` and
        checks that the call returns something other than ``None``.

        The test only runs when ``is_travis`` is truthy; otherwise it is
        reported as skipped.
        """
        if not is_travis:
            # Report a skip instead of returning early: a bare ``return``
            # would make the test show up as passed although nothing ran.
            self.skipTest('is_travis is falsy; querio_view query not run')

        i = Interface(self.db_uri, 'querio_view')

        object1 = QueryObject('height')
        object1.add((Feature('profession_name') == 'programmer'))
        object1.add(Feature('stars') > 20)

        result1 = i.object_query(object1)

        # assertIsNotNone gives a clearer failure message than
        # assertTrue(... is not None).
        self.assertIsNotNone(result1)
Beispiel #2
0
 def test_loading_by_chunk(self):
     """Build a Model from ``1000.csv`` read in chunks of 100 rows.

     After one query, the model is expected to hold 10 trees and 990
     rows of plot data.
     """
     csv_path = os.path.join(os.path.dirname(__file__), '1000.csv')
     chunks = pd.read_csv(csv_path, chunksize=100)
     model = Model(chunks, 'table', "", ['age', 'height'], 'income', '')
     # The query result itself is not inspected; only the call is made.
     model.query(Feature('age') > 20)
     self.assertEqual(len(model.trees), 10)
     self.assertEqual(len(model.plot_data), 990)
Beispiel #3
0
    def visualize_decision(self,
                           feature,
                           axis,
                           expression=None,
                           prediction_style='b-',
                           actual_style='r.',
                           query_points=100,
                           param_dict=None):
        """Plot the prediction with some real data points.

        Plots query(Feature(feature) == x) for points in the range of the
        feature, followed by the rows stored in ``self.plot_data``. By
        default the prediction is plotted with a blue line and the actual
        points are plotted as red dots. It's recommended to plot the actual
        points with points, as their order in the dataset is arbitrary.

        Parameters:
        feature: str
            The feature to plot.
        axis: matplotlib.axes.Axes object
            The axis object the plot is made to.
        expression: Expression
            An expression limiting the query range. Only query points and
            actual points that match the expression are plotted. If the
            expression is very restrictive compared to the range of the
            plotted feature, ensure that query_points is set high enough to
            get an appropriate number of points in the plot.
        prediction_style: str
            The style of the prediction. Default blue line (b-)
        actual_style: str
            The style of the actual points. Default red dot (r.)
        query_points: int
            The number of points the model is queried at. Default 100
        param_dict: dict or None
            Extra keyword arguments passed to both ``axis.plot`` calls.
            ``None`` (the default) means no extra arguments.

        Example:
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        model.visualize_decision('age', ax)
        fig.show()
        """
        # A fresh dict per call avoids sharing one mutable default
        # between invocations.
        if param_dict is None:
            param_dict = {}

        # Named lower/upper so the builtins min() and max() stay usable.
        bounds = self.feature_min_max_count[feature]
        lower = bounds['min']
        upper = bounds['max']
        xs = np.linspace(lower, upper, query_points)

        if expression is not None:
            # Keep only the query points and data rows the expression
            # accepts for this feature.
            xs = [x for x in xs if expression.match(feature, x)]
            matching_rows = self.plot_data[self.plot_data.apply(
                lambda row: expression.match(feature, row[feature]), axis=1)]
        else:
            matching_rows = self.plot_data

        predictions = [self.query(Feature(feature) == x).result for x in xs]
        axis.plot(xs, predictions, prediction_style, **param_dict)
        axis.plot(matching_rows[feature], matching_rows[self.output_name],
                  actual_style, **param_dict)
Beispiel #4
0
class ModelTest(unittest.TestCase):
    """Tests for ``Model`` built on a small in-memory DataFrame.

    ``setUp`` creates an eight-row frame and a dictionary of models keyed
    by a descriptive name; the parameterized tests look models up by that
    name.
    """

    def setUp(self):
        """Create the fixture DataFrame and the named models."""
        ages = [22, 44, 36, 64, 32, 86, 11, 45]
        # Every numeric column is a fixed function of age.
        incomes = [age * 301 for age in ages]
        heights = [age * 50 for age in ages]
        github_stars = [age * 20 + 10 for age in ages]
        professions = [
            'accountant', 'janitor', 'president', 'janitor', 'accountant',
            'programmer', 'janitor', 'account_manager'
        ]
        is_client = [True, True, False, True, False, False, False, True]
        self.data = pd.DataFrame({
            'age': ages,
            'income': incomes,
            'height': heights,
            'github_stars': github_stars,
            'profession': professions,
            'is_client': is_client,
            # Same values as 'profession', under a column name that
            # itself contains underscores.
            'prof_with_underscore': professions
        })
        self.models = {
            'One feature':
            Model(self.data, 'table', "", 'age', 'income', ''),
            'One feature with boolean':
            Model(self.data, 'table', "", 'is_client', 'income', ''),
            'One feature with categorical':
            Model(self.data, 'table', "", 'profession', 'income', ''),
            'One feature with categorical_underscore':
            Model(self.data, 'table', "", 'prof_with_underscore', 'income',
                  ''),
            'Two features with categorical':
            Model(self.data, 'table', "", ['age', 'profession'], 'income', ''),
            'Two features':
            Model(self.data, 'table', "", ['age', 'height'], 'income', ''),
            'Two features reverse':
            Model(self.data, 'table', "", ['height', 'age'], 'income', ''),
            'Three features':
            Model(self.data, 'table', "", ['age', 'height', 'github_stars'],
                  'income', '')
        }

    @parameterized.expand([
        ('One feature', Cond('age', Op.eq, 35)),
        ('One feature with boolean', Cond('is_client', Op.eq, True)),
        ('One feature with boolean', Feature('is_client') == 1),
        ('One feature with categorical', Cond('profession', Op.eq, 'janitor')),
        ('One feature with categorical_underscore',
         (Feature('prof_with_underscore') == 'janitor')),
        ('One feature with categorical_underscore',
         (Feature('prof_with_underscore') == 'account_manager')),
        ('One feature with categorical', Feature('profession') == 'janitor'),
        ('Two features',
         ExpressionTreeNode(Cond('age', Op.eq, 35), BoolOp.and_,
                            Cond('height', Op.eq, 700))),
        ('Two features', (Feature('age') == 20) & (Feature('height') == 800)),
        ('Two features', (Feature('age') == 20) | (Feature('height') == 800)),
        ('Three features',
         (((Feature('age') == 20) | (Feature('height') == 800))
          & (Feature('github_stars') == 300))),
    ])
    def test_query_gives_value_in_correct_range(self, name, test_conditions):
        """The prediction lies within the income range; variance is >= 0."""
        prediction = self.models[name].query(test_conditions)
        self.assertGreaterEqual(prediction.result, self.data['income'].min())
        self.assertLessEqual(prediction.result, self.data['income'].max())
        self.assertGreaterEqual(prediction.variance, 0)

    @parameterized.expand([('One feature', Cond('age', Op.eq, 40)),
                           ('Two features',
                            ExpressionTreeNode(Cond('age', Op.eq, 41),
                                               BoolOp.and_,
                                               Cond('height', Op.eq, 134)))])
    # Skipped unconditionally; the file records no reason for the skip.
    @unittest.skip
    def test_query_same_mean_as_sklearn_predict(self, name, test_condition):
        """Compare ``query`` with the tree's own ``predict`` (skipped)."""
        model = self.models[name]
        prediction = model.query(test_condition)
        self.assertAlmostEqual(
            model.tree.predict([[cond.threshold
                                 for cond in test_condition]])[0],
            prediction.result)

    @parameterized.expand([
        ('Too old', Feature('age') > 100),
        ('Contradiction',
         ((Feature('height') > 1000) & (Feature('height') < 900))),
        ('Too few github stars', Feature('github_stars') == 100)
    ])
    def test_query_raises_NoMatch_when_no_rows_match(self, name,
                                                     test_condition):
        """``query`` raises ``NoMatch`` for conditions no row satisfies."""
        # Look the model up outside the assertRaises block so that only
        # the query call itself can satisfy the expectation.
        model = self.models['Three features']
        with self.assertRaises(NoMatch):
            model.query(test_condition)

    def test_query_raises_ValueError_with_bad_feature_names(self):
        """Querying a feature the model was not given raises ValueError."""
        model = self.models['Two features']
        with self.assertRaises(ValueError):
            model.query(Cond('github_stars', Op.eq, 0))

    def test_query_raises_ValueError_with_bad_categorical_feature_values(self):
        """Querying a category absent from the data raises ValueError."""
        model = self.models['One feature with categorical']
        with self.assertRaises(ValueError):
            model.query(Cond('profession', Op.eq, 'firefighter'))

    @unittest.expectedFailure
    def test_reversing_features_doesnt_change_prediction(self):
        """Feature order should not change the result (expected to fail)."""
        pred = self.models['Two features'].query([25, 130])
        pred_reverse = self.models['Two features reverse'].query([130, 25])
        self.assertAlmostEqual(pred.result, pred_reverse.result)

    # Single-argument cases are written as explicit 1-tuples; a bare
    # parenthesised string is just a string, not a tuple.
    @parameterized.expand([
        ('One feature',),
        ('Two features',),
        ('Three features',),
    ])
    def test_train_score_is_sensible(self, name):
        """The training score never exceeds 1."""
        score = self.models[name].get_score_for_train()
        self.assertLessEqual(score, 1)

    @parameterized.expand([
        ('One feature',),
        ('Two features',),
        ('Three features',),
    ])
    def test_test_score_is_sensible(self, name):
        """The test score never exceeds 1."""
        score = self.models[name].get_score_for_test()
        self.assertLessEqual(score, 1)

    def test_loading_by_chunk(self):
        """A Model built from ``1000.csv`` in chunks of 100 rows.

        After one query, the model is expected to hold 10 trees and 990
        rows of plot data.
        """
        data = pd.read_csv(os.path.join(os.path.dirname(__file__), '1000.csv'),
                           chunksize=100)
        model = Model(data, 'table', "", ['age', 'height'], 'income', '')
        # The query result is not inspected, so it is not bound to a name.
        model.query(Feature('age') > 20)
        self.assertEqual(len(model.trees), 10)
        self.assertEqual(len(model.plot_data), 990)

    @parameterized.expand([('Two features with categorical', 'profession'),
                           ('One feature with categorical_underscore',
                            'prof_with_underscore')])
    def test_get_categories_for_feature_returns_categories(self, model, feat):
        """The reported categories equal the distinct fixture professions."""
        target = self.models[model]
        categories = target.get_categories_for_feature(feat)
        true_categories = [
            'accountant', 'janitor', 'president', 'programmer',
            'account_manager'
        ]
        # sorted() compares order-independently without reordering the
        # list object handed back by the model.
        self.assertEqual(sorted(categories), sorted(true_categories))

    @parameterized.expand([('age',), ('income',), ('not_feature',)])
    def test_get_categories_for_feature_raises_with_not_categorical(
            self, feature):
        """Asking for categories of these names raises ValueError."""
        model = self.models['Two features with categorical']
        with self.assertRaises(ValueError):
            model.get_categories_for_feature(feature)

    def __render_graph(self, model, name):
        """Render the model's graphviz export to ``name`` (manual aid).

        No test in this class calls it; graphviz is imported lazily so
        the suite does not need it installed.
        """
        import graphviz
        graph = graphviz.Source(model.export_graphviz())
        graph.render(name)
Beispiel #5
0
class TestExpressionNode(unittest.TestCase):
    """Checks ``match`` on single conditions and on and/or combinations."""

    @parameterized.expand([
        ('Simple true', Feature('age') > 30, 'age', 40, True),
        ('Simple false', Feature('age') < 30, 'age', 40, False),
        ('Simple equal', Feature('age') == 30, 'age', 30, True),
        ('Simple not equal', Feature('age') == 30, 'age', 40, False),
        # A value equal to the bound of a strict '<' must not match.
        ('Simple limit', Feature('age') < 30, 'age', 30, False),
        ('And true',
         (Feature('age') > 30) & (Feature('age') < 50),
         'age', 40, True),
        ('And false',
         (Feature('age') > 30) & (Feature('age') < 50),
         'age', 20, False),
        ('Or true',
         (Feature('age') > 30) | (Feature('age') < 20),
         'age', 40, True),
        ('Or false',
         (Feature('age') > 30) | (Feature('age') < 20),
         'age', 25, False),
    ])
    def test_match(self, name, expression, feature, value, is_match):
        """``expression.match(feature, value)`` equals ``is_match``."""
        outcome = expression.match(feature, value)
        self.assertEqual(outcome, is_match)
Beispiel #6
0
class TreeTraversalTest(unittest.TestCase):
    """Checks ``query_one_tree`` against pre-calculated values.

    ``setUp`` fits one ``DecisionTreeRegressor`` with fixed random states,
    so the expected numbers in the parameter table are tied to that exact
    configuration.
    """

    def setUp(self):
        """Build the fixture frame, the feature ranges and the fitted tree."""
        ages = [22, 44, 36, 64, 32, 86, 11, 45]
        incomes = [age * 301 for age in ages]
        heights = [age * 50 for age in ages]
        github_stars = [age * 20 + 10 for age in ages]
        self.data = pd.DataFrame({
            'age': ages,
            'income': incomes,
            'height': heights,
            'github_stars': github_stars
        })
        # NOTE(review): recent scikit-learn releases name this criterion
        # 'squared_error'; confirm against the pinned version before
        # upgrading. Left as-is because the expected values depend on it.
        self.tree = sklearn.tree.DecisionTreeRegressor(
            criterion='mse',
            random_state=42,
        )
        self.feature_names = ['age', 'height', 'github_stars']
        # Min/max of every feature column, derived in one place instead
        # of repeating the same dict literal per feature.
        feature_columns = {
            'age': ages,
            'height': heights,
            'github_stars': github_stars,
        }
        self.feature_min_maxes = {
            feature: {'min': min(values), 'max': max(values)}
            for feature, values in feature_columns.items()
        }
        # Only the training split is used; the held-out part is discarded.
        train, _ = sklearn.model_selection.train_test_split(self.data,
                                                            random_state=42)
        self.tree.fit(train[self.feature_names], train['income'])

    @parameterized.expand([
        ('0', Cond('age', Op.eq, 42), 10535),
        ('1',
         ExpressionTreeNode(Cond('age', Op.eq, 33), BoolOp.and_,
                            Cond('height', Op.eq, 600)), 5191.320987654321),
        ('2',
         ExpressionTreeNode(Cond('height', Op.eq, 700), BoolOp.and_,
                            Cond('github_stars', Op.eq, 350)), 3311),
        ('3', Cond('github_stars', Op.eq, 420), 6977.576923076923),
        ('4', Feature('age') == 42, 10535),
        ('5', Feature('height') > 2500, 17872.891891891893),
        ('6', Feature('github_stars') > 700, 14071.329608938544),
        ('7', Feature('age') > 40, 10535),
        ('8',
         ExpressionTreeNode(
             Feature('height') < 1000, BoolOp.and_,
             Feature('github_stars') > 700), 10836),
        ('9', ((Feature('height') == 1000) | (Feature('github_stars') == 250)),
         7399.2865474884),
        ('10', (Feature('github_stars') == 100) |
         ((Feature('github_stars') == 700) & (Feature('height') == 1500)),
         10836),
        ('11', (Feature('github_stars') < 250) |
         ((Feature('github_stars') == 700) & (Feature('height') == 1500)),
         6977.5935071991),
    ])
    def test_query_same_value_as_pre_calculated(self, name, test_conditions,
                                                true_result):
        """The first element of the query result matches ``true_result``."""
        prediction = query_one_tree(self.tree, test_conditions,
                                    self.feature_names, self.feature_min_maxes)
        self.assertAlmostEqual(true_result, prediction[0])

    def __render_graph(self, tree, name):
        """Render ``tree`` to ``name`` with graphviz (manual aid).

        No test in this class calls it; graphviz is imported lazily so
        the suite does not need it installed.
        """
        import graphviz
        # Export the tree that was passed in; previously the argument was
        # ignored and self.tree was always exported.
        dot_source = sklearn.tree.export_graphviz(
            tree,
            out_file=None,
            feature_names=self.feature_names,
            filled=True,
            rounded=True,
            special_characters=True)
        graph = graphviz.Source(dot_source)
        graph.render(name)
Beispiel #7
0
 def test_feature_eq_op(self, feature, threshold):
     """``Feature(feature) == threshold`` yields a matching condition.

     The result must carry the same feature name, an (almost) equal
     threshold and the ``Op.eq`` operator.
     """
     condition = Feature(feature) == threshold
     self.assertEqual(feature, condition.feature)
     self.assertAlmostEqual(threshold, condition.threshold)
     self.assertEqual(Op.eq, condition.op)