Example No. 1
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


def rforest_grid_tuned(train, target):
    # SMOTE and list2dataframe are project-local helpers (not shown here).
    clf = RandomForestClassifier(n_estimators=800,
                                 max_depth=6,
                                 min_samples_leaf=6,
                                 max_features=0.33)
    try:
        source = list2dataframe(train)
    except IOError:
        source = train

    source = SMOTE(source)

    # use a full grid over all parameters
    param_grid = {
        "max_depth": [3, None],
        "max_features": [1, 3, 10],
        "min_samples_split": [1, 3, 10],
        "min_samples_leaf": [1, 3, 10],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }

    # Binarize the class column: map label 0 to False.
    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf = GridSearchCV(clf, param_grid).fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]

    return preds, distr
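
For comparison, the same pattern (oversample, then grid-search a random forest) can be reproduced with the imbalanced-learn package in place of the project-local SMOTE helper. A minimal sketch with made-up data, not the author's code:

import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rng = np.random.RandomState(0)
X = rng.rand(200, 4)                     # made-up features
y = (rng.rand(200) > 0.8).astype(int)    # made-up, imbalanced labels

# Oversample the minority class, then tune on the balanced data.
X_res, y_res = SMOTE(random_state=0).fit_resample(X, y)

param_grid = {"max_depth": [3, None], "criterion": ["gini", "entropy"]}
search = GridSearchCV(RandomForestClassifier(n_estimators=100), param_grid, cv=3)
search.fit(X_res, y_res)
print(search.best_params_)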
Example No. 2
# Note: despite its name, this tunes sklearn's GradientBoostingClassifier,
# not XGBoost.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV


def xgboost_grid_tuned(train, target):
    try:
        source = list2dataframe(train)
    except IOError:
        source = train

    source = SMOTE(source)

    # Tune with grid search

    param_grid = {
        "n_estimators": [80],  #, 40, 20],
        "learning_rate": [0.1],
        # "max_depth": [4, 6],
        # "min_samples_leaf": [3, 5, 9, 17],
        # "max_features": [1.0, 0.3, 0.1]
    }

    clf = GradientBoostingClassifier()
    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf = GridSearchCV(clf, param_grid).fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]

    return preds, distr
Example No. 3
    def CART(self):
        "CART"
        # Assumes: from sklearn.tree import DecisionTreeRegressor
        # Apply a decision-tree regressor to predict the number of bugs.
        if self.smoteit:
            self.train = SMOTE(self.train,
                               atleast=50,
                               atmost=101,
                               resample=self.duplicate)

        if not self.tuning:
            clf = DecisionTreeRegressor(random_state=1)
        else:
            clf = DecisionTreeRegressor(max_depth=int(self.tunings[0]),
                                        min_samples_split=int(self.tunings[1]),
                                        min_samples_leaf=int(self.tunings[2]),
                                        # Cast before dividing so integer
                                        # tunings don't truncate under Python 2.
                                        max_features=float(self.tunings[3]) / 100,
                                        max_leaf_nodes=int(self.tunings[4]),
                                        # 'entropy' is classifier-only; regressors
                                        # use 'squared_error' ('mse' before 1.0).
                                        criterion='squared_error',
                                        random_state=1)
        features = self.train.columns[:-2]
        klass = self.train[self.train.columns[-2]]
        # set_trace()
        clf.fit(self.train[features].astype('float32'),
                klass.astype('float32'))
        preds = clf.predict(
            self.test[self.test.columns[:-2]].astype('float32')).tolist()
        return preds
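
Outside the class, the untuned branch reduces to a plain DecisionTreeRegressor fit; a minimal standalone sketch with made-up data:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(1)
X = rng.rand(50, 3).astype('float32')            # made-up features
y = rng.poisson(2, size=50).astype('float32')    # made-up bug counts

clf = DecisionTreeRegressor(random_state=1)
clf.fit(X, y)
preds = clf.predict(X[:5]).tolist()
print(preds)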
Example No. 4
from sklearn.ensemble import RandomForestClassifier


def rf_model(source, target):
    clf = RandomForestClassifier(n_estimators=100, random_state=1)
    # Binarize source
    # source.loc[source[source.columns[-1]] > 0, source.columns[-1]] = 1
    source = SMOTE(source)
    features = source.columns[:-1]
    klass = source[source.columns[-1]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    return preds
Example No. 5
from sklearn.naive_bayes import GaussianNB


def nbayes(source, target):
    """ Naive Bayes Classifier
    """
    source = SMOTE(source)
    clf = GaussianNB()
    features = source.columns[:-1]
    klass = source[source.columns[-1]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])
    return preds, distr
Example No. 6
from sklearn.linear_model import LogisticRegression


def logistic_model(source, target):
    clf = LogisticRegression()
    # Binarize the class column: any positive count becomes 1.
    source.loc[source[source.columns[-1]] > 0, source.columns[-1]] = 1
    source.loc[source[source.columns[-1]] < 1, source.columns[-1]] = 0
    source = SMOTE(source, k=1)
    features = source.columns[:-1]
    klass = [1 if k > 0 else 0 for k in source[source.columns[-1]]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])
    return preds, distr[:, 1]
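
The k=1 passed to the project's SMOTE helper presumably shrinks the neighbourhood for very small minority classes; with the imbalanced-learn package the analogous knob is k_neighbors. A minimal sketch under that assumption, with made-up data:

import numpy as np
from imblearn.over_sampling import SMOTE

rng = np.random.RandomState(0)
X = rng.rand(40, 3)                  # made-up features
y = np.array([1, 1] + [0] * 38)      # only two minority samples

# k_neighbors must be smaller than the minority-class size, hence 1 here.
X_res, y_res = SMOTE(k_neighbors=1, random_state=0).fit_resample(X, y)
print(np.bincount(y_res))            # classes are now balanced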
Example No. 7
import random

import numpy as np


def round_smote(Xall, y1, k=5, h=1.0):
    # Balance a binary problem by oversampling whichever class is smaller,
    # appending `delta` synthetic rows so the two classes end up equal in size.
    # SMOTE here is a Chawla-style helper (see the sketch after this function).
    p_zeros = [i for i, e in enumerate(y1) if e == 0]
    p_ones = [i for i, e in enumerate(y1) if e > 0]
    delta = len(p_zeros) - len(p_ones)
    if delta > 0:
        N = (int(len(p_zeros) / len(p_ones)) + 1) * 100
        T = Xall[p_ones, :]
        S = SMOTE(T, N, k, h)
        sel = random.sample(range(S.shape[0]), delta)
        X1 = np.vstack([Xall, S[sel, :]])
        z1 = np.hstack([y1, np.ones(delta)])
    elif delta < 0:
        delta = -delta
        N = (int(len(p_ones) / len(p_zeros)) + 1) * 100
        T = Xall[p_zeros, :]
        S = SMOTE(T, N, k, h)
        sel = random.sample(range(S.shape[0]), delta)
        X1 = np.vstack([Xall, S[sel, :]])
        z1 = np.hstack([y1, np.zeros(delta)])
    else:
        return Xall, y1
    #print "round smote:","X1:",X1.shape,"z1:",z1.shape
    return X1, z1
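
The SMOTE(T, N, k, h) helper used above is not shown in this example. Assuming it follows Chawla et al.'s original algorithm, where N is an oversampling percentage in multiples of 100, a minimal sketch with that signature could look like the following (the name smote_sketch and the fixed seed are mine):

import numpy as np
from sklearn.neighbors import NearestNeighbors


def smote_sketch(T, N, k=5, h=1.0):
    # T: minority-class samples; N: oversampling percentage (multiple of 100);
    # k: neighbours to interpolate between; h: cap on the interpolation factor.
    T = np.asarray(T, dtype=float)
    n_synth = (N // 100) * T.shape[0]
    nn = NearestNeighbors(n_neighbors=k + 1).fit(T)  # +1: nearest hit is the point itself
    _, idx = nn.kneighbors(T)
    rng = np.random.RandomState(0)
    out = np.empty((n_synth, T.shape[1]))
    for s in range(n_synth):
        i = s % T.shape[0]                  # cycle through the minority points
        j = idx[i][rng.randint(1, k + 1)]   # pick a random neighbour, skipping self
        gap = rng.uniform(0, h)             # interpolation factor in [0, h)
        out[s] = T[i] + gap * (T[j] - T[i])
    return out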
Example No. 8
from sklearn.ensemble import RandomForestClassifier


def rforest(train, target):
    clf = RandomForestClassifier(n_estimators=100, random_state=1)
    try:
        source = list2dataframe(train)
    except IOError:
        source = train

    source = SMOTE(source)

    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]

    return preds, distr
Example No. 9
from sklearn.model_selection import StratifiedKFold, LeaveOneOut


def cross_validation(X, y, clf, option='5', smote=False, ml='svc'):
    '''
    Cross-validate a classifier on the training data.
    Args:
        X: feature vectors of the samples
        y: labels of the samples
        option: cross-validation scheme: 5-fold, 10-fold, or jackknife (leave-one-out)
        smote: whether to apply SMOTE within each cross-validation fold
        ml: classifier type, e.g. linear SVC or kernel SVC
    '''
    if option == '5' or option == '10':
        option = int(option)
        skf = StratifiedKFold(n_splits=option, shuffle=True)  
        cv_split = list(skf.split(X, y))
    elif option == 'j':
        loo = LeaveOneOut()
        cv_split = list(loo.split(X, y))
    else:
        print('error cv option!')
        return -1
    y_score_all = [0.0 for i in range(len(y))]
    y_pred_all = [0.0 for i in range(len(y))]
    for i, (train_index, test_index) in enumerate(cv_split):
        X_train = X[train_index]
        y_train = y[train_index]
        if smote:
            # estimator = svm.SVC(class_weight='balanced', random_state=check_random_state(None), kernel='linear')
            # estimator = svm.SVC(class_weight='balanced', random_state=check_random_state(None))
            # Old imbalanced-learn API; newer releases spell this
            # SVMSMOTE().fit_resample(X_train, y_train).
            X_train, y_train = SMOTE(kind='svm').fit_sample(X_train, y_train)
            # X_train, y_train = Smote(sampling_rate=2).fit_sample(X_train, y_train)
        X_test = X[test_index]
        y_test = y[test_index]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # y_score = clf.decision_function(X_test)
        if ml == 'lsvc' or ml == 'svc':
            y_score = clf.decision_function(X_test)
        else:
            y_score = clf.predict_proba(X_test)[:, 1]
        
        for j in range(len(test_index)):
            y_pred_all[test_index[j]] = y_pred[j]
            y_score_all[test_index[j]] = y_score[j]
    return y_pred_all, y_score_all
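
A minimal usage sketch for this helper, assuming scikit-learn and made-up toy data (smote=False, so no oversampler is needed):

import numpy as np
from sklearn import svm

rng = np.random.RandomState(0)
X = rng.rand(100, 5)                     # made-up features
y = (rng.rand(100) > 0.7).astype(int)    # made-up, imbalanced labels

clf = svm.SVC(kernel='linear')
y_pred, y_score = cross_validation(X, y, clf, option='5', smote=False, ml='lsvc')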
Example No. 10
import numpy as np
from sklearn import tree


def ovoSmoteClassifier(trainSet, testSet, n_class, n_attr):
    # decomposeOVO, SMOTE and statAUC are project-local helpers (not shown here).
    # data = np.loadtxt('dataset/contraceptive-5-5tra.dat', dtype=float, delimiter=', ')
    # testData = np.loadtxt('dataset/contraceptive-5-5tst.dat', dtype=float, delimiter=', ')
    tra_ovo_class = decomposeOVO(trainSet, n_attr + 1, n_class)

    # for i in tra_ovo_class:
    #     print len(i)
    x_tst, y_tst = np.split(testSet, (n_attr, ), axis=1)

    # connect each single class by two as binary
    binary_class_list = []
    # binary_class_IR=[]
    x_train_ovo = []
    y_train_ovo = []
    for i in range(len(tra_ovo_class)):
        for j in range(len(tra_ovo_class)):
            k_neigh = 5
            if (j > i):
                ciSize = float(len(tra_ovo_class[i]))
                cjSize = float(len(tra_ovo_class[j]))
                if ciSize < k_neigh:
                    k_neigh = int(ciSize)
                if cjSize < k_neigh:
                    k_neigh = int(cjSize)
                syntheticSamples = []
                print(ciSize, ' ', cjSize)
                binary_class_IR = 0
                if ciSize > cjSize:
                    binary_class_IR = ciSize / cjSize
                    if binary_class_IR > 1.5:
                        # Multiply before truncating; int(IR - 1) * 100 would
                        # give an oversampling rate of 0 for ratios below 2.
                        print(int((binary_class_IR - 1) * 100))
                        syntheticSamples = SMOTE(
                            tra_ovo_class[j],
                            int((binary_class_IR - 1) * 100), k_neigh)
                else:
                    binary_class_IR = cjSize / ciSize
                    if binary_class_IR > 1.5:
                        print(int((binary_class_IR - 1) * 100))
                        syntheticSamples = SMOTE(
                            tra_ovo_class[i],
                            int((binary_class_IR - 1) * 100), k_neigh)
                # Stack the two classes (plus any synthetic rows) into one
                # binary training set.
                temp = np.append(tra_ovo_class[i], tra_ovo_class[j], axis=0)
                if len(syntheticSamples) > 0:
                    temp = np.append(temp, syntheticSamples, axis=0)
                binary_class_list.append(temp)

    for i in range(len(binary_class_list)):
        # print len(binary_class_list[i])
        x, y = np.split(binary_class_list[i], (n_attr, ), axis=1)
        x_train_ovo.append(x)
        y_train_ovo.append(y)

    clf_ovo = []
    y_pred_tst = []
    for i in range(len(binary_class_list)):
        clf = tree.DecisionTreeClassifier()
        clf.fit(x_train_ovo[i], y_train_ovo[i])
        y_pred_tst.append(clf.predict(x_tst))
        # print clf.score(x_train_ovo[i], y_train_ovo[i])
        clf_ovo.append(clf)
    # Transpose so each row holds one test instance's votes from all
    # pairwise classifiers.
    y_pred_temp = [([0] * len(y_pred_tst)) for i in range(len(y_pred_tst[0]))]
    for i in range(len(y_pred_tst)):
        for j in range(len(y_pred_tst[0])):
            y_pred_temp[j][i] = y_pred_tst[i][j]

    # Majority vote over the pairwise predictions (cast to int for bincount).
    y_pred_final = []
    for i in y_pred_temp:
        count = np.bincount(np.asarray(i, dtype=int))
        y_pred_final.append(count.argmax())

    # print np.bincount(y_pred_final)

    y_test = []
    for i in range(len(y_tst)):
        y_test.append(int(y_tst[i][0]))
    # print statAUC(3, y_test, y_pred_final)
    print(y_pred_final)
    print(y_test)
    mauc = statAUC(n_class, y_test, y_pred_final)
    return mauc
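
For reference, scikit-learn's OneVsOneClassifier implements the same pairwise decomposition with majority voting, minus the per-pair SMOTE step; a minimal sketch on a stock dataset:

from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

# Pairwise decomposition with majority voting, as above, but without
# the per-pair oversampling.
clf = OneVsOneClassifier(tree.DecisionTreeClassifier(random_state=0))
print(clf.fit(X_tr, y_tr).score(X_te, y_te))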