コード例 #1
0
def linear_metric_explore(df, metrics):
    """Cross-validate a linear SVM over several groupings of the metrics.

    Prints summary statistics of ``df``, then evaluates the complete metric
    list followed by every group produced by ``_get_metrics_groups(metrics)``.
    Each grouping is scored by AUC through ramp's cross validation (5 folds,
    repeated twice) with results printed as they are computed.
    """
    print(df.describe())
    data_context = ramp.DataContext(data=df)
    base_config = ramp.Configuration(target="target",
                                     metrics=[ramp.metrics.AUC()])
    classifiers = [sklearn.svm.SVC(kernel="linear", C=100.0)]
    # The full metric list goes first, followed by each sub-group.
    candidate_groups = [metrics]
    candidate_groups.extend(_get_metrics_groups(metrics))
    for group in candidate_groups:
        print("==> %s" % (group,))
        feature_set = [ramp.BaseFeature(name) for name in group]
        configs = ramp.ConfigFactory(base_config,
                                     model=classifiers,
                                     features=[feature_set])
        for cfg in configs:
            ramp.models.cv(cfg, data_context, folds=5, repeat=2,
                           print_results=True)
コード例 #2
0
def explore_ml_decisiontree(df, metrics, depth, out_decision):
    """Fit a decision tree on the metrics and render it to PDF with Graphviz.

    Args:
        df: data handed to ``ramp.DataContext``; the configuration uses
            "target" as its target.
        metrics: names used both as ramp base features and as the feature
            names written into the exported tree.
        depth: ``max_depth`` of the entropy-based decision tree.
        out_decision: path of the Graphviz dot file to write.  The PDF is
            written next to it, with the extension replaced by ``.pdf``.

    Raises:
        subprocess.CalledProcessError: if the ``dot`` command exits with a
            non-zero status.
    """
    context = ramp.DataContext(data=df)
    config = ramp.Configuration(target="target", metrics=[ramp.metrics.AUC()])
    tree = sklearn.tree.DecisionTreeClassifier(max_depth=depth,
                                               criterion="entropy")
    factory = ramp.ConfigFactory(
        config,
        features=[[ramp.BaseFeature(x) for x in metrics]],
        model=[tree])
    for x in factory:
        ramp.models.fit(x, context)
        handle = sklearn.tree.export_graphviz(x.model,
                                              out_file=out_decision,
                                              feature_names=metrics)
        # Some scikit-learn releases hand back the still-open file object,
        # others close the file themselves and return None; only close when
        # there is actually a handle, so both behaviours work.
        if handle is not None:
            handle.close()
    out_pdf = "%s.pdf" % os.path.splitext(out_decision)[0]
    subprocess.check_call(["dot", "-T", "pdf", "-o", out_pdf, out_decision])
コード例 #3
0
def _find_rf_params(df, metrics):
    """Grid-search random forest settings and return the best values found.

    Searches ``max_features`` (from ceil(sqrt(number of metrics)) up to the
    number of metrics, in steps of 3) and ``n_estimators`` (20 through 100 in
    steps of 20) using 3-fold stratified cross validation.  The best estimator
    is printed, and a dict mapping each searched parameter name to its value
    on that estimator is returned.
    """
    data_context = ramp.DataContext(data=df)
    features = [ramp.BaseFeature(name) for name in metrics]
    ramp_config = ramp.Configuration(target="target", features=features)
    x, y = ramp.models.get_xy(ramp_config, data_context)

    num_metrics = len(metrics)
    min_features = int(math.ceil(math.sqrt(num_metrics)))
    param_grid = dict(max_features=range(min_features, num_metrics + 1, 3),
                      n_estimators=range(20, 101, 20))
    search = sklearn.grid_search.GridSearchCV(
        sklearn.ensemble.RandomForestClassifier(),
        param_grid=param_grid,
        cv=sklearn.cross_validation.StratifiedKFold(y=y, k=3))
    search.fit(x, y)
    best = search.best_estimator_
    print(best)
    # Report only the parameters that were part of the search.
    return dict((name, getattr(best, name)) for name in param_grid)
コード例 #4
0
def ml_param_explore(df, metrics, test_all=False, svm_kernel="linear"):
    """Compare several classifiers on the metrics using ramp cross validation.

    Prints summary statistics of ``df`` and then cross-validates (5 folds,
    repeated twice) a random forest and a logistic regression, reporting AUC,
    F1 and hinge loss.  With ``test_all`` an SVM, whose parameters come from
    ``_find_svm_rbf_params``, is evaluated as well.

    Args:
        df: data handed to ``ramp.DataContext``; the configuration uses
            "target" as its target.
        metrics: names used as ramp base features.
        test_all: when true, also grid-search and evaluate an SVM.
        svm_kernel: kernel of the SVM tested with ``test_all``; either
            "linear" (the default, matching the previously hard-coded choice)
            or "rbf".

    Raises:
        ValueError: if ``test_all`` is set and ``svm_kernel`` is not one of
            the supported kernels.
    """
    if test_all and svm_kernel not in ("linear", "rbf"):
        raise ValueError("Unsupported SVM kernel: %s" % (svm_kernel,))
    print(df.describe())
    context = ramp.DataContext(data=df)
    config = ramp.Configuration(target="target",
                                metrics=[
                                    ramp.metrics.AUC(),
                                    ramp.metrics.F1(),
                                    ramp.metrics.HingeLoss()
                                ])
    # NOTE: the random forest uses fixed parameters here rather than values
    # tuned through a grid search.
    models = [
        sklearn.ensemble.RandomForestClassifier(
            n_estimators=50,
            max_features=int(math.ceil(math.sqrt(len(metrics))))),
        sklearn.linear_model.LogisticRegression()
    ]
    if test_all:
        svm_params = _find_svm_rbf_params(df, metrics, svm_kernel)
        if svm_kernel == "linear":
            # The linear kernel only has the C penalty to tune.
            models.append(
                sklearn.svm.SVC(kernel="linear", C=svm_params["C"]))
        else:
            models.append(
                sklearn.svm.SVC(kernel="rbf",
                                C=svm_params["C"],
                                gamma=svm_params["gamma"]))

    factory = ramp.ConfigFactory(
        config,
        features=[[ramp.BaseFeature(x) for x in metrics]],
        model=models)
    for x in factory:
        ramp.models.cv(x, context, folds=5, repeat=2, print_results=True)
コード例 #5
0
def _find_svm_rbf_params(df, metrics, kernel):
    """Grid-search SVM parameters for the requested kernel.

    For the "linear" kernel only ``C`` is searched (powers of ten from 1e-2
    to 1e4).  Any other kernel searches ``gamma`` (1e-5 to 1e3) together with
    a wider ``C`` range (1e-2 to 1e8).  The search uses 3-fold stratified
    cross validation; the best estimator is printed, and a dict mapping each
    searched parameter name to its value on that estimator is returned.
    """
    data_context = ramp.DataContext(data=df)
    features = [ramp.BaseFeature(name) for name in metrics]
    ramp_config = ramp.Configuration(target="target", features=features)
    x, y = ramp.models.get_xy(ramp_config, data_context)

    if kernel != "linear":
        param_grid = dict(gamma=10.0**np.arange(-5, 4),
                          C=10.0**np.arange(-2, 9))
    else:
        param_grid = dict(C=10.0**np.arange(-2, 5))
    search = sklearn.grid_search.GridSearchCV(
        sklearn.svm.SVC(kernel=kernel),
        param_grid=param_grid,
        cv=sklearn.cross_validation.StratifiedKFold(y=y, k=3),
        verbose=True)
    search.fit(x, y)
    best = search.best_estimator_
    print(best)
    # Report only the parameters that were part of the search.
    return dict((name, getattr(best, name)) for name in param_grid)