def rforest_grid_tuned(train, target):
    clf = RandomForestClassifier(n_estimators=800, max_depth=6,
                                 min_samples_leaf=6, max_features=0.33)
    try:
        source = list2dataframe(train)
    except IOError:
        source = train
    source = SMOTE(source)
    # Use a full grid over all parameters.
    param_grid = {"max_depth": [3, None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [2, 3, 10],  # sklearn requires min_samples_split >= 2
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    # Map a 0 in the class column to the negative label.
    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf = GridSearchCV(clf, param_grid).fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    return preds, distr
def xgboost_grid_tuned(train, target):
    try:
        source = list2dataframe(train)
    except IOError:
        source = train
    source = SMOTE(source)
    # Tune with grid search. Note: despite the name, this learner is sklearn's
    # GradientBoostingClassifier, not the xgboost package.
    param_grid = {
        "n_estimators": [80],  # , 40, 20],
        "learning_rate": [0.1],
        # "max_depth": [4, 6],
        # "min_samples_leaf": [3, 5, 9, 17],
        # "max_features": [1.0, 0.3, 0.1]
    }
    clf = GradientBoostingClassifier()
    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf = GridSearchCV(clf, param_grid).fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    return preds, distr
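# Illustrative sketch (not part of the original pipeline): the tune-then-predict
# flow the two *_grid_tuned learners above share, shown self-contained on
# synthetic data. The dataset shape and the reduced grid are assumptions made
# for demonstration only; SMOTE/list2dataframe are deliberately left out.
def _demo_grid_tuned():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(n_samples=200, n_features=10, random_state=1)
    X_train, y_train, X_test = X[:150], y[:150], X[150:]

    # Same pattern as above: wrap the base learner in GridSearchCV, fit, then
    # take class-1 probabilities from the refit best estimator.
    grid = {"max_depth": [3, None], "criterion": ["gini", "entropy"]}
    clf = GridSearchCV(RandomForestClassifier(n_estimators=50), grid)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    distr = clf.predict_proba(X_test)[:, 1]
    return preds, distr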
def rforest(train, target):
    clf = RandomForestClassifier(n_estimators=100, random_state=1)
    try:
        source = list2dataframe(train)
    except IOError:
        source = train
    # source = SMOTE(source)
    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    return preds, distr
def xgboost(train, target):
    try:
        source = list2dataframe(train)
    except IOError:
        source = train
    # source = SMOTE(source)
    clf = GradientBoostingClassifier(n_estimators=80, max_depth=6,
                                     min_samples_leaf=6, learning_rate=0.085,
                                     subsample=1.0,  # subsample expects a float in (0, 1]; was True, which coerces to 1
                                     max_features=0.33)
    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf.fit(source[features], klass)
    # Binarize the positive-class probability at a fixed cut-off instead of
    # clf.predict()'s default 0.5.
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    preds = [1 if val > 0.77 else 0 for val in distr]
    return preds, distr
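# Illustrative sketch (an assumption, not the project's code): the shared
# fit/predict pattern of rforest() and xgboost() above, self-contained on
# synthetic data; the 0.77 cut-off mirrors the hard-coded threshold in
# xgboost().
def _demo_threshold_predict():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier

    X, y = make_classification(n_samples=200, n_features=10, random_state=1)
    clf = GradientBoostingClassifier(n_estimators=80)
    clf.fit(X[:150], y[:150])
    distr = clf.predict_proba(X[150:])[:, 1]
    # Binarize the positive-class probability at a fixed threshold rather than
    # relying on predict()'s default 0.5 cut-off.
    preds = [1 if val > 0.77 else 0 for val in distr]
    return preds, distr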
def information_gain(X, y):
    """Information gain of each feature column of X w.r.t. the labels y.
    A feature counts as "set" for a sample when its entry is nonzero."""

    def _calIg():
        # Reconstructed helper (the excerpt had lost it): label entropy among
        # rows where the feature is set vs. not set, combined into an IG score.
        entropy_x_set = 0.0
        entropy_x_not_set = 0.0
        rest = tot - featureTot
        for c in classCnt:
            probs = classCnt[c] / float(featureTot)
            entropy_x_set -= probs * np.log(probs)
        if rest:
            for c in classTotCnt:
                probs = (classTotCnt[c] - classCnt.get(c, 0)) / float(rest)
                if probs > 0:
                    entropy_x_not_set -= probs * np.log(probs)
        return entropy_before - ((featureTot / float(tot)) * entropy_x_set
                                 + (rest / float(tot)) * entropy_x_not_set)

    # Entropy of the full label set.
    tot = X.shape[0]
    classTotCnt = {}
    for label in y:
        classTotCnt[label] = classTotCnt.get(label, 0) + 1
    entropy_before = 0.0
    for c in classTotCnt:
        probs = classTotCnt[c] / float(tot)
        entropy_before -= probs * np.log(probs)

    # Walk the nonzero entries feature by feature; features with no nonzero
    # entries get an IG of 0.
    nz = X.T.nonzero()
    pre = 0
    classCnt = {}
    featureTot = 0
    information_gain = []
    for i in range(0, len(nz[0])):
        if i != 0 and nz[0][i] != pre:
            for _ in range(pre + 1, nz[0][i]):
                information_gain.append(0)
            information_gain.append(_calIg())
            pre = nz[0][i]
            classCnt = {}
            featureTot = 0
        featureTot = featureTot + 1
        yclass = y[nz[1][i]]
        classCnt[yclass] = classCnt.get(yclass, 0) + 1
    information_gain.append(_calIg())
    return np.asarray(information_gain)


if __name__ == "__main__":
    data = DefectData.get_all_projects()["Apache"]
    test_data = list2dataframe(data["ant"].data)
    indep_var = test_data[test_data.columns[:-1]]
    depen_var = test_data[test_data.columns[-1]]
    information_gain(indep_var.values, depen_var.values)
    set_trace()  # drop into the debugger to inspect the result
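# Illustrative sketch (an assumption): exercising information_gain() on a tiny
# dense matrix. The values are made up, and the expected scores assume the
# reconstructed _calIg helper above.
def _demo_information_gain():
    import numpy as np
    X = np.array([[1, 0, 1],
                  [0, 1, 1],
                  [1, 0, 0],
                  [0, 1, 0]])
    y = np.array([1, 0, 1, 0])
    # Columns 0 and 1 separate the classes perfectly (IG = ln 2), while
    # column 2 carries no information (IG = 0).
    return information_gain(X, y)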