def test_model_def_pickle(self):
    # A definition built only from named callables must survive a pickle
    # round trip; equality is checked through repr().
    picklable = ModelDefinition(
        features=['a', F('a'), Map('a', len)],
        estimator=linear_model.LogisticRegression())
    restored = pickle.loads(pickle.dumps(picklable))
    self.assertEqual(repr(picklable), repr(restored))

    # lambdas are not picklable, so dumping this definition should fail
    unpicklable = ModelDefinition(
        features=['a', F('a'), Map('a', lambda x: len(x))],
        estimator=linear_model.LogisticRegression())
    with self.assertRaises(pickle.PicklingError):
        pickle.dumps(unpicklable)
def test_discard_incomplete(self):
    # With discard_incomplete=False the training set has as many rows as
    # the input data.
    keep_all_def = ModelDefinition(
        features=[F('a'), Map('b', np.abs)],
        target='y',
        discard_incomplete=False)
    train_x, _train_y, _fitted_features, _fitted_target = generate_train(
        keep_all_def, self.data)
    self.assertEqual(len(train_x), len(self.data))

    # create incomplete cases: one missing 'a' and two missing 'b' values,
    # at three distinct positions
    for column, position in (('a', 10), ('b', 11), ('b', 12)):
        self.data[column][position] = None

    # With discard_incomplete=True those three rows are expected to be gone.
    discarding_def = ModelDefinition(
        features=[F('a'), Map('b', np.abs)],
        target='y',
        discard_incomplete=True)
    train_x, _train_y, _fitted_features, _fitted_target = generate_train(
        discarding_def, self.data)
    self.assertEqual(len(train_x), len(self.data) - 3)
def cross_validate(data=None, folds=5, repeat=1, metrics=None,
                   reporters=None, model_def=None, **kwargs):
    """Cross-validate one model configuration and wrap the outcome.

    When ``model_def`` is not supplied, any keyword arguments whose names
    appear in ``ModelDefinition.params`` are removed from ``kwargs`` and used
    to construct the ``ModelDefinition``. Whatever remains in ``kwargs`` is
    forwarded to ``modeling.cross_validate``.

    Each entry of ``metrics`` is wrapped in a ``MetricReporter``. All
    reporters (the given ones first, then the metric reporters) are fed the
    cross-validation results before a ``CVResult`` is returned.
    """
    if model_def is None:
        # Pull the ModelDefinition arguments out of kwargs so they are not
        # forwarded to the cross-validation call below.
        definition_args = {
            name: kwargs.pop(name)
            for name in ModelDefinition.params
            if name in kwargs
        }
        model_def = ModelDefinition(**definition_args)

    reporters = [] if reporters is None else reporters
    metric_reporters = [
        MetricReporter(metric)
        for metric in ([] if metrics is None else metrics)
    ]

    results = modeling.cross_validate(model_def, data, folds,
                                      repeat=repeat, **kwargs)

    for reporter in reporters + metric_reporters:
        reporter.process_results(results)

    return CVResult(results, reporters, metric_reporters)
def make_model_def_basic(self):
    """Return a ModelDefinition with two features, target F('b') and a DummyEstimator."""
    feature_list = [F(10), F('a')]
    target_feature = F('b')
    return ModelDefinition(features=feature_list,
                           estimator=DummyEstimator(),
                           target=target_feature)
def cv_factory(data=None, folds=5, repeat=1, reporters=None, metrics=None,
               cv_runner=None, **kwargs):
    """Shortcut to iterate and cross-validate models.

    All ModelDefinition kwargs should be iterables that can be
    passed to model_definition_factory.

    Parameters
    ----------
    data: Raw DataFrame
    folds: If an int, then basic k-fold cross-validation will be done.
        Otherwise must be an iterable of tuples of pandas Indexes
        [(train_index, test_index), ...]
    repeat: How many times to repeat each cross-validation run of each model.
        Only makes sense if cross-validation folds are randomized.
    reporters: Iterable of reporter objects. Every model definition gets its
        own copies, made with ``reporter.copy()`` from the reporters passed
        in here. Defaults to no reporters.
    metrics: Forwarded unchanged to the cross-validation runner.
    cv_runner: Callable that cross-validates one model definition. Defaults
        to `cross_validate`.
    kwargs: Can be any keyword accepted by `ModelDefinition`.
        Values should be iterables. Keywords that are not ModelDefinition
        params are forwarded to the cross-validation runner.

    Returns
    -------
    A CVComparisonResult built from the list of model definitions and the
    list of per-model results (in the same order).
    """
    # Materialise once: avoids a shared mutable default and lets a one-shot
    # iterable be copied for every model definition.
    reporters = [] if reporters is None else list(reporters)
    cv_runner = cv_runner or cross_validate

    # Split the ModelDefinition arguments from the runner arguments.
    md_kwargs = {}
    for arg in ModelDefinition.params:
        if arg in kwargs:
            md_kwargs[arg] = kwargs.pop(arg)

    model_defs = list(model_definition_factory(ModelDefinition(), **md_kwargs))

    results = []
    for model_def in model_defs:
        # Always copy the caller's reporters, never the copies handed to a
        # previous run, so state from one model cannot leak into the next.
        model_reporters = [reporter.copy() for reporter in reporters]
        cvr = cv_runner(model_def=model_def,
                        data=data,
                        folds=folds,
                        repeat=repeat,
                        reporters=model_reporters,
                        metrics=metrics,
                        **kwargs)
        results.append(cvr)
    return CVComparisonResult(model_defs, results)
def test_categorical_indicators(self):
    # Ten distinct string categories. A list (not map()) is used so the
    # column is a concrete sequence on Python 3 as well as Python 2.
    self.data['categorical'] = [str(i) for i in range(10)]
    model_def = ModelDefinition(
        features=[Map('categorical', str), F('a'), Map('b', np.abs)],
        target='y',
        categorical_indicators=False)
    x, ff = build_featureset_safe(model_def.features, self.data)
    # Without indicators: one output column per declared feature.
    self.assertEqual(len(x.columns), len(model_def.features))

    self.data['categorical'] = [str(i) for i in range(10)]
    model_def = ModelDefinition(
        features=[Map('categorical', str), F('a'), Map('b', np.abs)],
        target='y',
        categorical_indicators=True)
    # Debug output; the single-argument parenthesised form prints the same
    # text under Python 2 and Python 3.
    print(model_def.features)
    x, ff = build_featureset_safe(model_def.features, self.data)
    print(x)
    for f in ff:
        print(f.feature)
    # With indicators the test expects nine additional columns for the ten
    # category values.
    self.assertEqual(len(x.columns), len(model_def.features) + 9)
def test_model_def_factory(self):
    base = ModelDefinition(features=['a'],
                           estimator=estimators.Estimator('dummy'),
                           target='y')
    # Three feature lists and two estimators are offered; the test expects
    # the factory to produce 3 x 2 = 6 definitions.
    feature_options = [
        ['a', 'b'],
        ['a', 'b', 'c'],
        ['a', 'b', 'c', 'y'],
    ]
    estimator_options = [
        estimators.Estimator('dummy'),
        estimators.Estimator('dummy2'),
    ]
    definitions = list(model_definition_factory(base,
                                                features=feature_options,
                                                estimator=estimator_options))
    self.assertEqual(len(definitions), 6)