def linear_metric_explore(df, metrics): """Explore different combinations of metrics with a linear classifier. """ print df.describe() context = ramp.DataContext(data=df) config = ramp.Configuration(target="target", metrics=[ramp.metrics.AUC()]) models = [sklearn.svm.SVC(kernel="linear", C=100.0)] for sub_metrics in [metrics] + list(_get_metrics_groups(metrics)): print "==>", sub_metrics factory = ramp.ConfigFactory( config, model=models, features=[[ramp.BaseFeature(x) for x in sub_metrics]]) for x in factory: ramp.models.cv(x, context, folds=5, repeat=2, print_results=True)
def explore_ml_decisiontree(df, metrics, depth, out_decision):
    """Fit a depth-limited decision tree and render it to Graphviz/PDF.

    df -- data frame holding the "target" column and the metric columns.
    metrics -- feature column names; also used as node labels in the plot.
    depth -- maximum depth of the fitted decision tree.
    out_decision -- path of the Graphviz .dot file to write; a PDF with
    the same basename is produced next to it.

    Requires the `dot` executable (Graphviz) on PATH.
    """
    context = ramp.DataContext(data=df)
    config = ramp.Configuration(target="target", metrics=[ramp.metrics.AUC()])
    factory = ramp.ConfigFactory(
        config,
        features=[[ramp.BaseFeature(x) for x in metrics]],
        model=[
            sklearn.tree.DecisionTreeClassifier(max_depth=depth,
                                                criterion="entropy")
        ])
    # The factory yields one configuration per model; with a single model
    # the loop runs once. Each iteration overwrites out_decision.
    for x in factory:
        ramp.models.fit(x, context)
        # NOTE(review): assumes an old scikit-learn where export_graphviz
        # returns the open file handle when given a filename -- in modern
        # versions it returns None and this .close() would fail; confirm
        # the pinned sklearn version.
        out_file = sklearn.tree.export_graphviz(x.model,
                                                out_file=out_decision,
                                                feature_names=metrics)
        out_file.close()
        out_pdf = "%s.pdf" % os.path.splitext(out_decision)[0]
        subprocess.check_call(["dot", "-T", "pdf", "-o", out_pdf,
                               out_decision])
def _find_rf_params(df, metrics): """Perform a grid search to find best parameters for random forest. """ context = ramp.DataContext(data=df) config = ramp.Configuration( target="target", features=[ramp.BaseFeature(x) for x in metrics]) x, y = ramp.models.get_xy(config, context) n = len(metrics) param_grid = dict(max_features=range(int(math.ceil(math.sqrt(n))), n + 1, 3), n_estimators=range(20, 101, 20)) grid = sklearn.grid_search.GridSearchCV( sklearn.ensemble.RandomForestClassifier(), param_grid=param_grid, cv=sklearn.cross_validation.StratifiedKFold(y=y, k=3)) grid.fit(x, y) print grid.best_estimator_ out = {} for attr in param_grid.keys(): out[attr] = getattr(grid.best_estimator_, attr) return out
def ml_param_explore(df, metrics, test_all=False): """Explore classification approaches and parameters, leveraging ramp to compare multiple models. """ print df.describe() context = ramp.DataContext(data=df) config = ramp.Configuration(target="target", metrics=[ ramp.metrics.AUC(), ramp.metrics.F1(), ramp.metrics.HingeLoss() ]) #rf_params = _find_rf_params(df, metrics) models = [ sklearn.ensemble.RandomForestClassifier( n_estimators=50, max_features=int(math.ceil(math.sqrt(len(metrics))))), sklearn.linear_model.LogisticRegression() ] if test_all: svm_tester = "linear" if svm_tester == "linear": linear_params = _find_svm_rbf_params(df, metrics, "linear") models.append( sklearn.svm.SVC(kernel="linear", C=linear_params["C"])) else: rbf_params = _find_svm_rbf_params(df, metrics, "rbf") models.append( sklearn.svm.SVC(kernel="rbf", C=rbf_params["C"], gamma=rbf_params["gamma"])) factory = ramp.ConfigFactory( config, features=[[ramp.BaseFeature(x) for x in metrics]], model=models) for x in factory: ramp.models.cv(x, context, folds=5, repeat=2, print_results=True)
def _find_svm_rbf_params(df, metrics, kernel): """Perform a grid search to find best parameters for a SVM RBF kernel. """ context = ramp.DataContext(data=df) config = ramp.Configuration( target="target", features=[ramp.BaseFeature(x) for x in metrics]) x, y = ramp.models.get_xy(config, context) if kernel == "linear": param_grid = dict(C=10.0**np.arange(-2, 5)) else: param_grid = dict(gamma=10.0**np.arange(-5, 4), C=10.0**np.arange(-2, 9)) grid = sklearn.grid_search.GridSearchCV( sklearn.svm.SVC(kernel=kernel), param_grid=param_grid, cv=sklearn.cross_validation.StratifiedKFold(y=y, k=3), verbose=True) grid.fit(x, y) print grid.best_estimator_ out = {} for attr in param_grid.keys(): out[attr] = getattr(grid.best_estimator_, attr) return out