# Shared imports for the snippets below. SMOTE() and list2dataframe() are
# project-local helpers (not defined here); note that the snippets use more
# than one SMOTE variant, as flagged in the comments inside each function.
import random

import numpy as np
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor


def rforest_grid_tuned(train, target):
    """Random forest tuned with an exhaustive grid search."""
    clf = RandomForestClassifier(n_estimators=800, max_depth=6,
                                 min_samples_leaf=6, max_features=0.33)
    try:
        source = list2dataframe(train)
    except IOError:
        source = train
    source = SMOTE(source)
    # Use a full grid over all parameters.
    # min_samples_split must be >= 2 in scikit-learn, so 1 is replaced by 2.
    param_grid = {
        "max_depth": [3, None],
        "max_features": [1, 3, 10],
        "min_samples_split": [2, 3, 10],
        "min_samples_leaf": [1, 3, 10],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
    }
    # Binarize the class column: a count of 0 becomes False.
    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf = GridSearchCV(clf, param_grid).fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    return preds, distr
def xgboost_grid_tuned(train, target):
    """Gradient boosting tuned with a grid search (despite the name, this
    uses sklearn's GradientBoostingClassifier, not XGBoost)."""
    try:
        source = list2dataframe(train)
    except IOError:
        source = train
    source = SMOTE(source)
    # Tune with grid search; the wider grid is kept commented out.
    param_grid = {
        "n_estimators": [80],  # , 40, 20],
        "learning_rate": [0.1],
        # "max_depth": [4, 6],
        # "min_samples_leaf": [3, 5, 9, 17],
        # "max_features": [1.0, 0.3, 0.1]
    }
    clf = GradientBoostingClassifier()
    # Binarize the class column: a count of 0 becomes False.
    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf = GridSearchCV(clf, param_grid).fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    return preds, distr
def CART(self):
    """CART: apply a decision tree regressor to predict the number of bugs."""
    # This SMOTE variant takes atleast/atmost/resample keyword arguments.
    if self.smoteit:
        self.train = SMOTE(self.train, atleast=50, atmost=101,
                           resample=self.duplicate)
    if not self.tuning:
        clf = DecisionTreeRegressor(random_state=1)
    else:
        # 'entropy' is a classification criterion; regression trees use 'mse'.
        # The float() cast must come before the division, or Python 2 style
        # integer division truncates max_features to 0.
        clf = DecisionTreeRegressor(max_depth=int(self.tunings[0]),
                                    min_samples_split=int(self.tunings[1]),
                                    min_samples_leaf=int(self.tunings[2]),
                                    max_features=float(self.tunings[3]) / 100,
                                    max_leaf_nodes=int(self.tunings[4]),
                                    criterion='mse', random_state=1)
    features = self.train.columns[:-2]
    klass = self.train[self.train.columns[-2]]
    clf.fit(self.train[features].astype('float32'), klass.astype('float32'))
    preds = clf.predict(
        self.test[self.test.columns[:-2]].astype('float32')).tolist()
    return preds
def rf_model(source, target):
    clf = RandomForestClassifier(n_estimators=100, random_state=1)
    # Binarize source
    # source.loc[source[source.columns[-1]] > 0, source.columns[-1]] = 1
    source = SMOTE(source)
    features = source.columns[:-1]
    klass = source[source.columns[-1]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    return preds
def nbayes(source, target):
    """Naive Bayes classifier."""
    source = SMOTE(source)
    clf = GaussianNB()
    features = source.columns[:-1]
    klass = source[source.columns[-1]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])
    return preds, distr
def logistic_model(source, target):
    clf = LogisticRegression()
    # Binarize source: any positive count becomes 1, everything else 0.
    source.loc[source[source.columns[-1]] > 0, source.columns[-1]] = 1
    source.loc[source[source.columns[-1]] < 1, source.columns[-1]] = 0
    source = SMOTE(source, k=1)
    features = source.columns[:-1]
    klass = [1 if k > 0 else 0 for k in source[source.columns[-1]]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])
    # Return only the probability of the positive class.
    return preds, distr[:, 1]
def round_smote(Xall, y1, k=5, h=1.0):
    """Balance a binary dataset by SMOTEing whichever class is smaller.

    Assumes the classic percentage-style SMOTE(T, N, k, h) interface: T is
    the minority-class sample matrix, N the oversampling percentage, and the
    return value an array of synthetic samples.
    """
    p_zeros = [i for i, e in enumerate(y1) if e == 0]
    p_ones = [i for i, e in enumerate(y1) if e > 0]
    delta = len(p_zeros) - len(p_ones)
    if delta > 0:
        # Zeros outnumber ones: synthesize ones, then keep delta of them.
        N = (int(len(p_zeros) / len(p_ones)) + 1) * 100
        T = Xall[p_ones, :]
        S = SMOTE(T, N, k, h)
        sel = random.sample(range(S.shape[0]), delta)
        X1 = np.vstack([Xall, S[sel, :]])
        z1 = np.hstack([y1, np.ones(delta)])
    elif delta < 0:
        # Ones outnumber zeros: synthesize zeros instead.
        delta = -delta
        N = (int(len(p_ones) / len(p_zeros)) + 1) * 100
        T = Xall[p_zeros, :]
        S = SMOTE(T, N, k, h)
        sel = random.sample(range(S.shape[0]), delta)
        X1 = np.vstack([Xall, S[sel, :]])
        z1 = np.hstack([y1, np.zeros(delta)])
    else:
        return Xall, y1
    return X1, z1
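# A minimal usage sketch for round_smote (illustrative, not from the original
# source): it assumes the percentage-style SMOTE(T, N, k, h) helper is in
# scope, and the data and the _demo_round_smote name are made up here.
def _demo_round_smote():
    rng = np.random.RandomState(1)
    X = rng.rand(120, 4)
    y = np.hstack([np.zeros(100), np.ones(20)])  # 100 zeros vs. 20 ones
    Xb, yb = round_smote(X, y, k=5, h=1.0)
    # Both classes should now have 100 samples each.
    print(Xb.shape, int(yb.sum()), len(yb) - int(yb.sum()))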
def rforest(train, target):
    """Random forest with fixed settings (no tuning)."""
    clf = RandomForestClassifier(n_estimators=100, random_state=1)
    try:
        source = list2dataframe(train)
    except IOError:
        source = train
    source = SMOTE(source)
    # Binarize the class column: a count of 0 becomes False.
    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    return preds, distr
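# A minimal sketch of how the DataFrame-based learners above are called
# (illustrative only): train/test frames whose last column is the class
# label. The project-local SMOTE() helper must be in scope, and this toy
# frame is far smaller than anything SMOTE would see in practice.
def _demo_rforest():
    import pandas as pd
    cols = ['loc', 'cbo', 'wmc', 'bug']
    train = pd.DataFrame([[10, 1, 2, 0], [200, 9, 7, 1],
                          [35, 2, 3, 0], [150, 8, 6, 1]], columns=cols)
    test = pd.DataFrame([[20, 1, 2, 0], [180, 7, 5, 1]], columns=cols)
    preds, distr = rforest(train, test)
    print(list(preds), list(distr))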
from imblearn.over_sampling import SVMSMOTE
from sklearn.model_selection import LeaveOneOut, StratifiedKFold


def cross_validation(X, y, clf, option='5', smote=False, ml='svc'):
    """Cross-validate a classifier on the training data.

    Args:
        X: feature vectors of the samples
        y: labels of the samples
        option: cross-validation scheme: '5'-fold, '10'-fold, or 'j' (jackknife)
        smote: whether to apply SMOTE inside each training fold
        ml: classifier type, e.g. linear SVC ('lsvc') or kernel SVC ('svc')
    """
    if option == '5' or option == '10':
        option = int(option)
        skf = StratifiedKFold(n_splits=option, shuffle=True)
        cv_split = list(skf.split(X, y))
    elif option == 'j':
        loo = LeaveOneOut()
        cv_split = list(loo.split(X, y))
    else:
        print('invalid cv option!')
        return -1
    y_score_all = [0.0 for i in range(len(y))]
    y_pred_all = [0.0 for i in range(len(y))]
    for i, (train_index, test_index) in enumerate(cv_split):
        X_train = X[train_index]
        y_train = y[train_index]
        if smote:
            # SVM-SMOTE on the training fold only; older imblearn spelled
            # this SMOTE(kind='svm').fit_sample(X_train, y_train).
            X_train, y_train = SVMSMOTE().fit_resample(X_train, y_train)
        X_test = X[test_index]
        y_test = y[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # SVCs expose a decision function; other models expose probabilities.
        if ml == 'lsvc' or ml == 'svc':
            y_score = clf.decision_function(X_test)
        else:
            y_score = clf.predict_proba(X_test)[:, 1]
        for j in range(len(test_index)):
            y_pred_all[test_index[j]] = y_pred[j]
            y_score_all[test_index[j]] = y_score[j]
    return y_pred_all, y_score_all
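# A minimal usage sketch for cross_validation (illustrative only): 5-fold CV
# of a linear SVC with in-fold SMOTE on a synthetic imbalanced dataset.
def _demo_cross_validation():
    from sklearn import svm
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=200, weights=[0.8, 0.2],
                               random_state=1)
    clf = svm.SVC(kernel='linear', class_weight='balanced')
    y_pred, y_score = cross_validation(X, y, clf, option='5',
                                       smote=True, ml='lsvc')
    acc = sum(int(p == t) for p, t in zip(y_pred, y)) / float(len(y))
    print('accuracy: %.3f' % acc)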
def ovoSmoteClassifier(trainSet, testSet, n_class, n_attr):
    """One-vs-one decision trees, SMOTEing each imbalanced class pair."""
    tra_ovo_class = decomposeOVO(trainSet, n_attr + 1, n_class)
    x_tst, y_tst = np.split(testSet, (n_attr,), axis=1)
    # Connect each pair of single classes into one binary problem.
    binary_class_list = []
    x_train_ovo = []
    y_train_ovo = []
    for i in range(len(tra_ovo_class)):
        for j in range(len(tra_ovo_class)):
            if j > i:
                # Cap k at the smaller class size so SMOTE has enough neighbours.
                k_neigh = 5
                ciSize = float(len(tra_ovo_class[i]))
                cjSize = float(len(tra_ovo_class[j]))
                if ciSize < k_neigh:
                    k_neigh = int(ciSize)
                if cjSize < k_neigh:
                    k_neigh = int(cjSize)
                syntheticSamples = []
                print(ciSize, ' ', cjSize)
                # Oversample the smaller class when the imbalance ratio (IR)
                # exceeds 1.5. The percentage must be int((IR - 1) * 100);
                # the original int(IR - 1) * 100 truncates to 0 for IR < 2.
                if ciSize > cjSize:
                    binary_class_IR = ciSize / cjSize
                    if binary_class_IR > 1.5:
                        print(int((binary_class_IR - 1) * 100))
                        syntheticSamples = SMOTE(
                            tra_ovo_class[j],
                            int((binary_class_IR - 1) * 100), k_neigh)
                else:
                    binary_class_IR = cjSize / ciSize
                    if binary_class_IR > 1.5:
                        print(int((binary_class_IR - 1) * 100))
                        syntheticSamples = SMOTE(
                            tra_ovo_class[i],
                            int((binary_class_IR - 1) * 100), k_neigh)
                temp = np.append(tra_ovo_class[i], tra_ovo_class[j], axis=0)
                if len(syntheticSamples) > 0:
                    temp = np.append(temp, syntheticSamples, axis=0)
                binary_class_list.append(temp)
    for i in range(len(binary_class_list)):
        x, y = np.split(binary_class_list[i], (n_attr,), axis=1)
        x_train_ovo.append(x)
        y_train_ovo.append(y)
    clf_ovo = []
    y_pred_tst = []
    for i in range(len(binary_class_list)):
        clf = tree.DecisionTreeClassifier()
        clf.fit(x_train_ovo[i], y_train_ovo[i])
        y_pred_tst.append(clf.predict(x_tst))
        clf_ovo.append(clf)
    # Transpose per-classifier predictions into per-sample votes.
    y_pred_temp = [([0] * len(y_pred_tst)) for i in range(len(y_pred_tst[0]))]
    for i in range(len(y_pred_tst)):
        for j in range(len(y_pred_tst[0])):
            y_pred_temp[j][i] = y_pred_tst[i][j]
    # Majority vote; bincount needs integer labels.
    y_pred_final = []
    for i in y_pred_temp:
        count = np.bincount(np.asarray(i, dtype=int))
        y_pred_final.append(count.argmax())
    y_test = []
    for i in range(len(y_tst)):
        y_test.append(int(y_tst[i][0]))
    print(y_pred_final)
    print(y_test)
    mauc = statAUC(n_class, y_test, y_pred_final)
    return mauc