Beispiel #1
0
def rforest_grid_tuned(train, target):
    """Grid-search a random forest on resampled training data and score `target`.

    The last column of both frames is treated as the class label; every
    other column is used as a feature.

    Args:
        train: Training data. It is passed to ``list2dataframe``; if that
            raises ``IOError`` the value is used as-is (presumably it is
            already a DataFrame -- confirm against callers).
        target: DataFrame to score. Its last column is ignored.

    Returns:
        A ``(preds, distr)`` tuple: the labels predicted for each row of
        `target`, and column 1 of ``predict_proba`` for the same rows.
    """
    clf = RandomForestClassifier(n_estimators=800,
                                 max_depth=6,
                                 min_samples_leaf=6,
                                 max_features=0.33)
    try:
        source = list2dataframe(train)
    except IOError:
        source = train

    source = SMOTE(source)

    # Use a full grid over all tuned parameters.
    param_grid = {"max_depth": [3, None],
                  "max_features": [1, 3, 10],
                  # An integer min_samples_split must be >= 2 in scikit-learn;
                  # the former value of 1 made those candidates fail to fit.
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    label = source.columns[-1]
    # Rewrite label value 0 as False before the labels are extracted.
    source.loc[source[label] == 0, label] = False
    features = source.columns[:-1]
    klass = list(source[label])
    clf = GridSearchCV(clf, param_grid).fit(source[features], klass)

    # Select the target features once and reuse them for both calls.
    target_features = target[target.columns[:-1]]
    preds = clf.predict(target_features)
    distr = clf.predict_proba(target_features)[:, 1]

    return preds, distr
Beispiel #2
0
def xgboost_grid_tuned(train, target):
    """Grid-search a gradient boosting classifier on resampled data and score `target`.

    The last column of both frames is treated as the class label; every
    other column is used as a feature.

    Args:
        train: Training data. It is passed to ``list2dataframe``; if that
            raises ``IOError`` the value is used as-is (presumably it is
            already a DataFrame -- confirm against callers).
        target: DataFrame to score. Its last column is ignored.

    Returns:
        A ``(preds, distr)`` tuple: the labels predicted for each row of
        `target`, and column 1 of ``predict_proba`` for the same rows.
    """
    try:
        frame = list2dataframe(train)
    except IOError:
        frame = train

    frame = SMOTE(frame)

    # The search space is currently a single point; no other
    # hyper-parameters are searched.
    search_space = {
        "n_estimators": [80],
        "learning_rate": [0.1],
    }

    label_col = frame.columns[-1]
    feature_cols = frame.columns[:-1]

    # Rewrite label value 0 as False before the labels are extracted.
    is_zero = frame[label_col] == 0
    frame.loc[is_zero, label_col] = False
    labels = list(frame[label_col])

    search = GridSearchCV(GradientBoostingClassifier(), search_space)
    clf = search.fit(frame[feature_cols], labels)

    target_features = target[target.columns[:-1]]
    preds = clf.predict(target_features)
    distr = clf.predict_proba(target_features)[:, 1]

    return preds, distr
Beispiel #3
0
def rforest(train, target):
    """Fit a 100-tree random forest on `train` and score `target`.

    The last column of both frames is treated as the class label; every
    other column is used as a feature. No SMOTE resampling is applied in
    this variant.

    Args:
        train: Training data. It is passed to ``list2dataframe``; if that
            raises ``IOError`` the value is used as-is (presumably it is
            already a DataFrame -- confirm against callers). It is not
            modified by this function.
        target: DataFrame to score. Its last column is ignored.

    Returns:
        A ``(preds, distr)`` tuple: the labels predicted for each row of
        `target`, and column 1 of ``predict_proba`` for the same rows.
    """
    clf = RandomForestClassifier(n_estimators=100, random_state=1)
    try:
        source = list2dataframe(train)
    except IOError:
        source = train

    label = source.columns[-1]
    features = source.columns[:-1]

    # Map label value 0 to False while building the label list rather than
    # writing into `source`: when the fallback above is taken, `source` is
    # the caller's own object and an in-place write would modify it.
    klass = [False if value == 0 else value for value in source[label]]
    clf.fit(source[features], klass)

    # Select the target features once and reuse them for both calls.
    target_features = target[target.columns[:-1]]
    preds = clf.predict(target_features)
    distr = clf.predict_proba(target_features)[:, 1]

    return preds, distr
Beispiel #4
0
def xgboost(train, target, threshold=0.77):
    """Fit a gradient boosting classifier on `train` and score `target`.

    The last column of both frames is treated as the class label; every
    other column is used as a feature. No SMOTE resampling is applied in
    this variant. Predictions are obtained by thresholding column 1 of
    ``predict_proba`` rather than by calling ``predict``.

    Args:
        train: Training data. It is passed to ``list2dataframe``; if that
            raises ``IOError`` the value is used as-is (presumably it is
            already a DataFrame -- confirm against callers). It is not
            modified by this function.
        target: DataFrame to score. Its last column is ignored.
        threshold: A row is predicted as 1 when its score is strictly
            greater than this value, otherwise 0. Defaults to 0.77.

    Returns:
        A ``(preds, distr)`` tuple: a list of 0/1 predictions for each row
        of `target`, and the scores they were derived from.
    """
    try:
        source = list2dataframe(train)
    except IOError:
        source = train

    clf = GradientBoostingClassifier(n_estimators=80,
                                     max_depth=6,
                                     min_samples_leaf=6,
                                     learning_rate=0.085,
                                     # subsample is a fraction; 1.0 is the
                                     # numeric value the former `True` had.
                                     subsample=1.0,
                                     max_features=0.33)

    label = source.columns[-1]
    features = source.columns[:-1]

    # Map label value 0 to False while building the label list rather than
    # writing into `source`: when the fallback above is taken, `source` is
    # the caller's own object and an in-place write would modify it.
    klass = [False if value == 0 else value for value in source[label]]
    clf.fit(source[features], klass)

    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    preds = [1 if val > threshold else 0 for val in distr]
    return preds, distr
Beispiel #5
0
    featureTot = 0
    information_gain = []
    for i in range(0, len(nz[0])):
        if (i != 0 and nz[0][i] != pre):
            for notappear in range(pre + 1, nz[0][i]):
                information_gain.append(0)
            ig = _calIg()
            information_gain.append(ig)
            pre = nz[0][i]
            classCnt = {}
            featureTot = 0
        featureTot = featureTot + 1
        yclass = y[nz[1][i]]
        if yclass not in classCnt:
            classCnt[yclass] = 1
        else:
            classCnt[yclass] = classCnt[yclass] + 1
    ig = _calIg()
    information_gain.append(ig)

    return np.asarray(information_gain)


if __name__ == "__main__":
    # Ad-hoc driver: run information_gain on the "ant" data set of the
    # "Apache" project group.
    projects = DefectData.get_all_projects()
    data = projects["Apache"]
    test_data = list2dataframe(data["ant"].data)

    # Every column but the last is an independent variable; the last
    # column is the dependent variable.
    columns = test_data.columns
    indep_var = test_data[columns[:-1]]
    depen_var = test_data[columns[-1]]

    information_gain(indep_var.values, depen_var.values)

    # set_trace() is called last (presumably a debugger breakpoint for
    # inspecting the variables above -- confirm which debugger is imported).
    set_trace()