def test_model_def_pickle(self):
        # A definition built from a named builtin (len) must survive a
        # pickle round trip with an unchanged repr.
        picklable_def = ModelDefinition(
            features=['a', F('a'), Map('a', len)],
            estimator=linear_model.LogisticRegression())
        restored_def = pickle.loads(pickle.dumps(picklable_def))
        self.assertEqual(repr(picklable_def), repr(restored_def))

        # A lambda inside a feature is not picklable, so dumping the
        # definition is expected to raise PicklingError.
        lambda_features = ['a', F('a'), Map('a', lambda x: len(x))]
        unpicklable_def = ModelDefinition(
            features=lambda_features,
            estimator=linear_model.LogisticRegression())
        self.assertRaises(pickle.PicklingError, pickle.dumps, unpicklable_def)
Esempio n. 2
0
 def test_discard_incomplete(self):
     # With discard_incomplete=False the training rows must match the
     # input data row for row.
     keep_all_def = ModelDefinition(features=[F('a'), Map('b', np.abs)],
                                    target='y',
                                    discard_incomplete=False)
     x_all, _y, _ff, _ft = generate_train(keep_all_def, self.data)
     self.assertEqual(len(x_all), len(self.data))

     # Blank out one value in each of three distinct rows to create
     # incomplete cases.
     for column, row in (('a', 10), ('b', 11), ('b', 12)):
         self.data[column][row] = None

     # With discard_incomplete=True exactly those three rows are
     # expected to be missing from the training rows.
     drop_incomplete_def = ModelDefinition(
         features=[F('a'), Map('b', np.abs)],
         target='y',
         discard_incomplete=True)
     x_kept, _y, _ff, _ft = generate_train(drop_incomplete_def, self.data)
     self.assertEqual(len(x_kept), len(self.data) - 3)
Esempio n. 3
0
def cross_validate(data=None,
                   folds=5,
                   repeat=1,
                   metrics=None,
                   reporters=None,
                   model_def=None,
                   **kwargs):
    """Cross-validate one model configuration and collect its reports.

    When `model_def` is not given, every keyword argument whose name is
    listed in `ModelDefinition.params` is removed from `kwargs` and used
    to build the ModelDefinition. Whatever remains in `kwargs` is
    forwarded to `modeling.cross_validate` together with `data`, `folds`
    and `repeat`.

    Each entry of `metrics` is wrapped in a MetricReporter. Every
    reporter and metric reporter then receives the results through
    `process_results`.

    Returns a CVResult built from the results, the reporters and the
    metric reporters.
    """
    if model_def is None:
        # Only ModelDefinition arguments are consumed here; the rest stay
        # in kwargs for modeling.cross_validate.
        definition_args = {name: kwargs.pop(name)
                           for name in ModelDefinition.params
                           if name in kwargs}
        model_def = ModelDefinition(**definition_args)

    reporters = [] if reporters is None else reporters
    metric_reporters = [MetricReporter(metric)
                        for metric in ([] if metrics is None else metrics)]

    results = modeling.cross_validate(
        model_def, data, folds, repeat=repeat, **kwargs)

    for listener in reporters + metric_reporters:
        listener.process_results(results)
    return CVResult(results, reporters, metric_reporters)
Esempio n. 4
0
    def make_model_def_basic(self):
        """Return a ModelDefinition with two features, a target and a dummy estimator."""
        feature_list = [F(10), F('a')]
        target_feature = F('b')
        return ModelDefinition(features=feature_list,
                               estimator=DummyEstimator(),
                               target=target_feature)
Esempio n. 5
0
def cv_factory(data=None,
               folds=5,
               repeat=1,
               reporters=None,
               metrics=None,
               cv_runner=None,
               **kwargs):
    """Shortcut to iterate and cross-validate models.

    All ModelDefinition kwargs should be iterables that can be
    passed to model_definition_factory.

    Parameters
    ----------

    data:
        Raw DataFrame

    folds:
        If an int, then basic k-fold cross-validation will be done.
        Otherwise must be an iterable of tuples of pandas Indexes
        [(train_index, test_index), ...]

    repeat:
        How many times to repeat each cross-validation run of each model. Only
        makes sense if cross-validation folds are randomized.

    reporters:
        Iterable of reporter objects exposing `copy()`. Every model gets
        its own fresh copies of these originals. Defaults to no reporters.

    metrics:
        Passed through unchanged to `cv_runner` for every model.

    cv_runner:
        Callable invoked once per model definition with the keyword
        arguments `model_def`, `data`, `folds`, `repeat`, `reporters`,
        `metrics` plus any leftover `kwargs`. Defaults to `cross_validate`.

    kwargs:
        Can be any keyword accepted by `ModelDefinition`.
        Values should be iterables. Keywords not listed in
        `ModelDefinition.params` are forwarded to `cv_runner`.

    Returns
    -------
    A CVComparisonResult built from the list of generated model
    definitions and the list of per-model `cv_runner` results.
    """
    cv_runner = cv_runner or cross_validate
    # None replaces the former shared mutable default list; snapshot the
    # caller's reporters so they can be copied afresh for every model.
    reporter_templates = [] if reporters is None else list(reporters)

    md_kwargs = {}
    for arg in ModelDefinition.params:
        if arg in kwargs:
            md_kwargs[arg] = kwargs.pop(arg)
    model_def_fact = model_definition_factory(ModelDefinition(), **md_kwargs)
    results = []
    model_defs = list(model_def_fact)
    for model_def in model_defs:
        # Always copy from the originals. Rebinding `reporters` to the
        # copies would make later models copy reporters that a previous
        # run has already processed.
        run_reporters = [reporter.copy() for reporter in reporter_templates]
        cvr = cv_runner(model_def=model_def,
                        data=data,
                        folds=folds,
                        repeat=repeat,
                        reporters=run_reporters,
                        metrics=metrics,
                        **kwargs)
        results.append(cvr)

    return CVComparisonResult(model_defs, results)
    def test_categorical_indicators(self):
        # Compares the number of generated columns with
        # categorical_indicators switched off and on.

        # Build the column as a real list: on Python 3 a bare map() is a
        # lazy iterator, while on Python 2 this is identical to map().
        self.data['categorical'] = [str(i) for i in range(10)]
        model_def = ModelDefinition(
            features=[Map('categorical', str),
                      F('a'),
                      Map('b', np.abs)],
            target='y',
            categorical_indicators=False)
        x, ff = build_featureset_safe(model_def.features, self.data)
        # Indicators off: exactly one column per feature.
        self.assertEqual(len(x.columns), len(model_def.features))

        self.data['categorical'] = [str(i) for i in range(10)]
        model_def = ModelDefinition(
            features=[Map('categorical', str),
                      F('a'),
                      Map('b', np.abs)],
            target='y',
            categorical_indicators=True)
        x, ff = build_featureset_safe(model_def.features, self.data)
        # Indicators on: nine extra columns are expected for the ten
        # distinct string values (presumably one indicator per value
        # replacing the single categorical column -- confirm).
        self.assertEqual(len(x.columns), len(model_def.features) + 9)
 def test_model_def_factory(self):
     # Three alternative feature lists and two alternative estimators
     # are supplied; six definitions are expected back (presumably one
     # per combination).
     base_def = ModelDefinition(features=['a'],
                                estimator=estimators.Estimator('dummy'),
                                target='y')
     feature_options = [
         ['a', 'b'],
         ['a', 'b', 'c'],
         ['a', 'b', 'c', 'y'],
     ]
     estimator_options = [
         estimators.Estimator('dummy'),
         estimators.Estimator('dummy2'),
     ]
     generated = list(model_definition_factory(base_def,
                                               features=feature_options,
                                               estimator=estimator_options))
     self.assertEqual(len(generated), 6)