def test_classification_toy():
    """Check classification on a toy dataset."""
    # Random forest
    clf = RandomForestClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf))

    clf = RandomForestClassifier(n_estimators=10, max_features=1,
                                 random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf))

    # also test apply
    leaf_indices = clf.apply(X)
    assert_equal(leaf_indices.shape, (len(X), clf.n_estimators))

    # Extra-trees
    clf = ExtraTreesClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf))

    clf = ExtraTreesClassifier(n_estimators=10, max_features=1,
                               random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf))

    # also test apply
    leaf_indices = clf.apply(X)
    assert_equal(leaf_indices.shape, (len(X), clf.n_estimators))
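For reference, a minimal sketch of the toy fixtures this test assumes; the names X, y, T, true_result and assert_equal mirror scikit-learn's own forest-test fixtures, and the exact values below are illustrative rather than taken from the source.

# Hypothetical toy fixtures for test_classification_toy (illustrative values).
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]  # training points
y = [-1, -1, -1, 1, 1, 1]                                   # class labels
T = [[-1, -1], [2, 2], [3, 2]]                              # query points
true_result = [-1, 1, 1]                                    # expected predictions

def assert_equal(a, b):
    # stand-in for the nose-style helper used in the test above
    assert a == b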
Example #2
def modelselect(input_filename, num_test_examples, block_size, n_estimators=100):
    # Perform some model selection to determine good parameters
    # Load data
    X_train, y_train, X_test, y_test, scaler = loaddata(input_filename, num_test_examples, block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_train = encoder.transform(forest.apply(X_train))
    learner = SGDClassifier(
        loss="hinge",
        penalty="l2",
        learning_rate="invscaling",
        alpha=0.001,
        average=10 ** 4,
        eta0=0.5,
        class_weight="balanced",
    )

    metric = "f1"
    losses = ["log", "hinge", "modified_huber", "squared_hinge", "perceptron"]
    penalties = ["l2", "l1", "elasticnet"]
    alphas = 10.0 ** numpy.arange(-5, 0)
    learning_rates = ["constant", "optimal", "invscaling"]
    param_grid = [{"alpha": alphas, "loss": losses, "penalty": penalties, "learning_rate": learning_rates}]
    grid_search = GridSearchCV(learner, param_grid, n_jobs=-1, verbose=2, scoring=metric, refit=True)

    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_, grid_search.best_score_)
    return grid_search
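As a self-contained illustration of the leaf-encoding step used above (synthetic data, illustrative sizes): forest.apply returns one leaf index per tree and sample, and OneHotEncoder turns that matrix into a sparse indicator feature space for the downstream linear model.

# Standalone sketch of random-forest leaf encoding (synthetic data).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
forest_demo = RandomForestClassifier(n_estimators=20, random_state=0).fit(X_demo, y_demo)

leaves = forest_demo.apply(X_demo)          # shape (n_samples, n_estimators): leaf id per tree
encoder_demo = OneHotEncoder().fit(leaves)
X_leaf = encoder_demo.transform(leaves)     # sparse matrix, one indicator column per (tree, leaf)
print(leaves.shape, X_leaf.shape)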
Example #4
    def fit(self, **kwargs) -> Model:
        feature_list = kwargs.get('feature_list', None)
        if not feature_list:
            self.name = self.name + '(-irt)'
        self.train_x = self.select_features(self.feature.features_train,
                                            feature_list)
        self.train_y = self.feature.label_train.values
        self.feature_names = self.train_x.columns

        self.train_x, self.train_y = self.tf_sample(self.train_x, self.train_y)

        rf = RandomForestClassifier(**self.param)
        rf_enc = OneHotEncoder()
        rf_lm = LogisticRegression(penalty='l2', C=1, solver='lbfgs')
        rf.fit(self.train_x, self.train_y)
        rf_enc.fit(rf.apply(self.train_x))
        rf_lm.fit(rf_enc.transform(rf.apply(self.train_x)), self.train_y)
        self.rf = rf
        self.rf_enc = rf_enc
        self.model = rf_lm

        # Evaluate performance on the training set
        self.train_y_pred = self.predict(self.train_x)
        self.train_y = np.array(self.train_y)
        self.train_y_pred = np.array(self.train_y_pred)
        self.train_ev = self.evaluation.evaluate(y_true=self.train_y,
                                                 y_pred=self.train_y_pred,
                                                 threshold=0.5)

        return self
Example #5
def runRfStack(inputfile,outputfile):
    '''
    Input file path and output file path
    '''
    df_all = pd.read_csv(inputfile)
    df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:]))

    # Values are filled with 0 by default; explicitly trying a negative value is another option
    df_all.replace([np.inf, -np.inf], np.nan,inplace=True)
    df_all = df_all.fillna(0)

    # Values are filled with 0 by default; explicitly trying a negative value is another option
    features = df_all.columns[0:]
    features = list(features)
    features.remove('EID')
    label = 'TARGET'
    
    clf = RandomForestClassifier(
        n_estimators=50,  # 50 trees
        max_depth=7,
        n_jobs=4,
        random_state=101)

    df_all_prov11,df_all_prov12 = split_data_with_prov(df_all)

    ###################### prov == 11
    df_train11,df_test11 = xtrain_and_test(df_all_prov11)

    X_train11 = df_train11[features]
    Y_label11 = df_train11[label]

    X_test11 = df_test11[features]

    clf.fit(X_train11,Y_label11)
    column = ['STACKFEATURE'+str(i) for i in range(50)]
    df_new_feature11 = pd.DataFrame(clf.apply(df_all_prov11[features]),columns=column)
    df_all_prov11[column] = df_new_feature11

    ###################### prov == 12
    df_train12,df_test12 = xtrain_and_test(df_all_prov12)

    X_train12 = df_train12[features]
    Y_label12 = df_train12[label]

    X_test12 = df_test12[features]

    clf.fit(X_train12,Y_label12)
    column = ['STACKFEATURE'+str(i) for i in range(50)]
    df_new_feature12 = pd.DataFrame(clf.apply(df_all_prov12[features]),columns=column)
    df_all_prov12[column] = df_new_feature12

    # Concatenate the two province subsets
    df_all = pd.concat([df_all_prov11, df_all_prov12])

    df_all.to_csv(outputfile,index=False,index_label=False)
    del df_all_prov11,df_all_prov12,df_all
    return outputfile
Example #6
def RandomForestLR():
    RF = RandomForestClassifier(n_estimators=100, max_depth=4)
    RF.fit(X_train, Y_train)
    OHE = OneHotEncoder()
    OHE.fit(RF.apply(X_train))
    LR = LogisticRegression()
    LR.fit(OHE.transform(RF.apply(X_train_lr)), Y_train_lr)
    Y_pred = LR.predict_proba(OHE.transform(RF.apply(X_test)))[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, Y_pred)
    auc = roc_auc_score(Y_test, Y_pred)
    print("RandomForest+LogisticRegression:", auc)
    return fpr, tpr
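RandomForestLR above reads module-level splits; a hedged sketch of how X_train, X_train_lr, Y_train, Y_train_lr, X_test and Y_test might be prepared (synthetic data, names inferred from the function body):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_all, Y_all = make_classification(n_samples=5000, n_features=20, random_state=0)
# hold out a test set, then split the rest between the forest stage and the LR stage
X_fit, X_test, Y_fit, Y_test = train_test_split(X_all, Y_all, test_size=0.2, random_state=0)
X_train, X_train_lr, Y_train, Y_train_lr = train_test_split(X_fit, Y_fit, test_size=0.5, random_state=0)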
Example #7
def rf_lr_model():
    """
    RandomForest + LR
    """
    rf = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth)
    rf_enc = OneHotEncoder()
    rf_lm = LogisticRegression()
    rf.fit(X_train, y_train)
    rf_enc.fit(rf.apply(X_train))
    rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
    y_pred = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    print('RandomForest+LR AUC: {0}'.format(auc(fpr, tpr)))
Example #8
def RandomForestLR(X_train, y_train, X_test, y_test, X_train_lr, y_train_lr):
    rf = RandomForestClassifier(max_depth=3, n_estimators=50)
    rf_enc = OneHotEncoder()
    rf_lr = LogisticRegression()
    rf.fit(X_train, y_train)
    rf_enc.fit(rf.apply(X_train))
    rf_lr.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
    y_pred_rf_lr = rf_lr.predict_proba(rf_enc.transform(rf.apply(X_test)))[:,
                                                                           1]
    fpr_rf_lr, tpr_rf_lr, _ = roc_curve(y_test, y_pred_rf_lr)
    auc = roc_auc_score(y_test, y_pred_rf_lr)
    print("RF+LR:", auc)
    return fpr_rf_lr, tpr_rf_lr
def RF_Logit(X_train, y_train, X_test):
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                                y_train,
                                                                test_size=0.5)
    grd = RandomForestClassifier(max_depth=10, max_features=9)
    grd_enc = OneHotEncoder()
    grd_lm = LogisticRegression()
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train))
    grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)), y_train_lr)
    y_hat_RF_log = grd_lm.predict_proba(grd_enc.transform(
        grd.apply(X_test)))[:, 1]
    return y_hat_RF_log
def main():
    # initialize sklearn objects
    rf = RandomForestClassifier(n_estimators=300,
                                max_depth=3,
                                verbose=1,
                                random_state=SEED)
    logitsgd = SGDClassifier(loss='log', n_jobs=-1, verbose=1)
    encoder = OneHotEncoder()

    train, click = load_train_data(train_loc)

    # rf feature transformation
    rf.fit(train, click)
    train_rf = rf.apply(train)
    train = None

    # encode rf features for logit
    print('fitting encoder ... ')
    encoder.fit(train_rf)
    print('transforming ...')
    embedded = encoder.transform(train_rf)

    train_rf = None

    # train model
    logitsgd.fit(X=embedded, y=click)
    embedded = None

    # load testing data
    test = load_test_data(test_loc)

    # rf transform test
    test_rf = rf.apply(test)
    test = None

    # encode test
    print('transforming ...')
    embedded = encoder.transform(test_rf)

    test_rf = None

    # make predictions
    prediction = logitsgd.predict_proba(embedded)

    # save predictions
    prediction = np.array(prediction)
    np.savetxt("predictions.csv", prediction, delimiter=",")
Example #11
def smts(aTrainY, aTrainX, aTestX, aR, aJins, aNTree):

    tTrainOut = pd.DataFrame()
    tTestOut = pd.DataFrame()

    tTrainX = aTrainX.iloc[:, 1:len(aTrainX.columns)]
    # list of sids in the training set
    tTrSids = pd.DataFrame(aTrainX[aTrainX.columns[0]],
                           columns=[aTrainX.columns[0]])
    tTestX = aTestX.iloc[:, 1:len(aTestX.columns)]
    # list of sids in the test set
    tTeSids = pd.DataFrame(aTestX[aTestX.columns[0]],
                           columns=[aTestX.columns[0]])

    # sids after de-duplication
    sid_tr = pd.DataFrame()
    sid_te = pd.DataFrame()

    # create aJins random-forest learners (RFins)
    for i in range(aJins):

        clf = RandomForestClassifier(n_estimators=aNTree,
                                     max_leaf_nodes=int((aR + 2) / 2))

        # fit on the training data
        clf.fit(tTrainX, aTrainY)

        # compute leaf nodes for both train and test
        tIdxTr = clf.apply(tTrainX)
        tIdxTe = clf.apply(tTestX)

        # conversion complete (one tree): [N x R]
        sid_tr, tTrX = H_jX(tIdxTr, tTrSids, aR)
        sid_te, tTeX = H_jX(tIdxTe, tTeSids, aR)

        # append to the outputs
        tTrainOut = pd.concat([tTrainOut, tTrX], axis=1, ignore_index=True)
        tTestOut = pd.concat([tTestOut, tTeX], axis=1, ignore_index=True)

        print("Done " + str(i + 1) + "th RF")

    # TrainOut is [sid] + [N x RJins]

    # TestOut is [sid] + [N x RJins]

    # return both as DataFrames
    return pd.concat([sid_tr, tTrainOut],
                     axis=1), pd.concat([sid_te, tTestOut], axis=1)
Example #13
    def _most_informative(self, X, clusterer, neighborhoods):
        n = X.shape[0]
        l = len(neighborhoods)

        neighborhoods_union = set()
        for neighborhood in neighborhoods:
            for i in neighborhood:
                neighborhoods_union.add(i)

        unqueried_indices = set(range(n)) - neighborhoods_union

        # If there is only one neighborhood then choose the point randomly
        if l <= 1:
            return np.random.choice(list(unqueried_indices)), [1]

        # Learn a random forest classifier
        n_estimators = 50
        rf = RandomForestClassifier(n_estimators=n_estimators)
        rf.fit(X, clusterer.labels_)

        # Compute the similarity matrix
        leaf_indices = rf.apply(X)
        S = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                S[i, j] = (leaf_indices[i, ] == leaf_indices[j, ]).sum()
        S = S / n_estimators

        p = np.empty((n, l))
        uncertainties = np.zeros(n)
        expected_costs = np.ones(n)

        # For each point that is not in any neighborhood...
        for x_i in range(n):
            if x_i not in neighborhoods_union:
                for n_i in range(l):
                    p[x_i, n_i] = (S[x_i, neighborhoods[n_i]].sum() /
                                   len(neighborhoods[n_i]))

                # If the point is not similar to any neighborhood set equal probabilities of belonging to each neighborhood
                if np.all(p[x_i, ] == 0):
                    p[x_i, ] = np.ones(l)

                p[x_i, ] = p[x_i, ] / p[x_i, ].sum()

                if not np.any(p[x_i, ] == 1):
                    positive_p_i = p[x_i, p[x_i, ] > 0]
                    uncertainties[x_i] = -(positive_p_i *
                                           np.log2(positive_p_i)).sum()
                    expected_costs[x_i] = (positive_p_i *
                                           range(1,
                                                 len(positive_p_i) + 1)).sum()
                else:
                    uncertainties[x_i] = 0
                    expected_costs[x_i] = 1  # ?

        normalized_uncertainties = uncertainties / expected_costs

        most_informative_i = np.argmax(normalized_uncertainties)
        return most_informative_i, p[most_informative_i]
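The nested loop that fills S above is O(n²) Python-level work; a behavior-equivalent sketch using NumPy broadcasting, assuming the boolean (n, n, n_estimators) intermediate fits in memory:

import numpy as np

def leaf_similarity(leaf_indices):
    # leaf_indices: (n_samples, n_estimators) array returned by rf.apply(X)
    # S[i, j] = fraction of trees in which samples i and j fall into the same leaf
    same_leaf = leaf_indices[:, None, :] == leaf_indices[None, :, :]
    return same_leaf.mean(axis=2)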
def kfingerprinting(X_train,X_test,y_train,y_test):
    # logger.info('training...')
    model = RandomForestClassifier(n_jobs=-1, n_estimators=1000, oob_score=True)
    model.fit(X_train, y_train)
#    M = model.predict(X_test)
    # for i in range(0,len(M)):
    #     x = M[i]
    #     label = str(Y_test[i][0])+'-'+str(Y_test[i][1])
    #     logger.info('%s: %s'%(str(label), str(x)))
    acc = model.score(X_test, y_test)
    #logger.info('Accuracy = %.4f'%acc)
    train_leaf = model.apply(X_train)
    test_leaf = model.apply(X_test)
    # print(model.feature_importances_)
#    joblib.dump(model, 'dirty-trained-kf.pkl')
    return train_leaf, test_leaf
class Forest():
    def __init__(self, n_estimators=10, categorical_features=[]):
        self.encoder = OneHotEncoder(categorical_features=categorical_features)
        self.forest = RandomForestClassifier(n_estimators=n_estimators)

    def fit(self, X, y):
        self.encoder.fit(X)
        self.forest.fit(self.encoder.transform(X), y)
        return self

    def predict(self, X):
        return self.forest.predict(self.encoder.transform(X))

    def votes(self, X):
        # FIXME There is probably a more clever way of doing this
        X_enc = self.encoder.transform(X)
        predictions = [t.predict(X_enc) for t in self.forest.estimators_]
        votes = pd.DataFrame.from_dict(
            dict(zip(range(len(self.forest.estimators_)), predictions)))
        return votes.transpose().sum()

    def score(self, X, y):
        return self.forest.score(self.encoder.transform(X), y)

    def apply(self, X):
        return self.forest.apply(self.encoder.transform(X))
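A short usage sketch of the Forest wrapper above; the data is synthetic, and note that the categorical_features argument of OneHotEncoder used in __init__ belongs to older scikit-learn releases.

import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randint(0, 5, size=(200, 4))   # integer-coded features; column 0 treated as categorical
y_demo = rng.randint(0, 2, size=200)

model = Forest(n_estimators=20, categorical_features=[0]).fit(X_demo, y_demo)
print(model.score(X_demo, y_demo))
print(model.votes(X_demo[:5]))              # summed per-tree votes for the first five rows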
Example #16
def rf_dis(n_trees, X, Y, train_indices, test_indices, seed):
    clf = RandomForestClassifier(n_estimators=n_trees,
                                 random_state=seed,
                                 oob_score=True,
                                 n_jobs=-1)
    clf = clf.fit(X[train_indices], Y[train_indices])
    pred = clf.predict(X[test_indices])
    weight = clf.score(X[test_indices], Y[test_indices])
    #print(1 - clf.oob_score_)
    n_samples = X.shape[0]
    dis = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        dis[i][i] = 1
    res = clf.apply(X)
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            a = np.ravel(res[i])
            b = np.ravel(res[j])
            score = a == b
            d = float(score.sum()) / n_trees
            dis[i][j] = dis[j][i] = d
    X_features1 = np.transpose(dis)
    X_features2 = X_features1[train_indices]
    X_features3 = np.transpose(X_features2)
    return X_features3[train_indices], X_features3[test_indices], weight, pred
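A hedged usage sketch of rf_dis on synthetic data; the tree count, split and seed are illustrative:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, Y_demo = make_classification(n_samples=120, n_features=8, random_state=0)
idx = np.arange(len(X_demo))
train_idx, test_idx = train_test_split(idx, test_size=0.3, random_state=0)
D_train, D_test, weight, pred = rf_dis(500, X_demo, Y_demo, train_idx, test_idx, seed=0)
print(D_train.shape, D_test.shape, weight)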
Example #17
def rf_dis(n_trees, X, Y, train_indices, test_indices, seed):
    clf = RandomForestClassifier(n_estimators=n_trees,
                                 random_state=seed,
                                 oob_score=True,
                                 n_jobs=1)
    clf = clf.fit(X[train_indices], Y[train_indices])
    pred = clf.predict(X[test_indices])
    prediction = clf.predict(X)
    prob = clf.predict_proba(X[test_indices])
    weight = clf.oob_score_  #clf.score(X[test_indices], Y[test_indices])
    print(clf.score(X[train_indices], Y[train_indices]))
    #print(1 - clf.oob_score_)
    n_samples = X.shape[0]
    trees = clf.estimators_
    dis = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        dis[i][i] = 1
    res = clf.apply(X)
    www = 0.9
    pre = np.zeros((n_trees, n_samples))
    prepro = np.zeros((n_trees, n_samples))
    for i in range(n_trees):
        pre[i] = trees[i].predict(X)
        sss = trees[i].predict_proba(X)
        ss = []
        for j in range(n_samples):
            ss.append(max(sss[j]))
        prepro[i] = ss

    pre = pre.transpose()
    prepro = prepro.transpose()
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            a = np.ravel(res[i])
            b = np.ravel(res[j])
            c = np.ravel(pre[i])
            d = np.ravel(pre[j])
            e = np.ravel(prepro[i])
            f = np.ravel(prepro[j])
            score = 0
            for k in range(n_trees):
                if a[k] == b[k]:
                    s1 = 1
                else:
                    s1 = 0
                if c[k] == d[k]:
                    s2 = min(e[k], f[k])
                else:
                    s2 = 0
                s = s1 * www + s2 * (1 - www)
                score = score + s
            dis[i][j] = dis[j][i] = score / n_trees
    X_features1 = np.transpose(dis)
    X_features2 = X_features1[train_indices]
    X_features3 = np.transpose(X_features2)

    return X_features3[train_indices], X_features3[
        test_indices], weight, pred, prob, clf
def test_drf_classifier_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.RandomForestClassifier

    # Run the h2o4gpu version of Random Forest classification
    drf = Solver(backend=backend, random_state=1234, oob_score=True)
    print("h2o4gpu fit()")
    drf.fit(X, y)

    # Run the scikit-learn version of Random Forest classification
    from sklearn.ensemble import RandomForestClassifier
    drf_sk = RandomForestClassifier(random_state=1234, oob_score=True, max_depth=3)
    print("Scikit fit()")
    drf_sk.fit(X, y)

    if backend == "sklearn":
        assert (drf.predict(X) == drf_sk.predict(X)).all() == True
        assert (drf.predict_log_proba(X) == drf_sk.predict_log_proba(X)).all() == True
        assert (drf.predict_proba(X) == drf_sk.predict_proba(X)).all() == True
        assert (drf.score(X, y) == drf_sk.score(X, y)).all() == True
        assert (drf.decision_path(X)[1] == drf_sk.decision_path(X)[1]).all() == True
        assert (drf.apply(X) == drf_sk.apply(X)).all() == True

        print("Estimators")
        print(drf.estimators_)
        print(drf_sk.estimators_)

        print("n_features")
        print(drf.n_features_)
        print(drf_sk.n_features_)
        assert drf.n_features_ == drf_sk.n_features_

        print("n_classes_")
        print(drf.n_classes_)
        print(drf_sk.n_classes_)
        assert drf.n_classes_ == drf_sk.n_classes_

        print("n_features")
        print(drf.classes_)
        print(drf_sk.classes_)
        assert (drf.classes_ == drf_sk.classes_).all() == True

        print("n_outputs")
        print(drf.n_outputs_)
        print(drf_sk.n_outputs_)
        assert drf.n_outputs_ == drf_sk.n_outputs_

        print("Feature importance")
        print(drf.feature_importances_)
        print(drf_sk.feature_importances_)
        assert (drf.feature_importances_ == drf_sk.feature_importances_).all() == True

        print("oob_score")
        print(drf.oob_score_)
        print(drf_sk.oob_score_)
        assert drf.oob_score_ == drf_sk.oob_score_
Example #19
def train(input_filename, num_train_examples, num_test_examples, block_size):
    # Load initial training data and test data
    X_train, y_train, X_test, y_test, scaler = loaddata(
        input_filename, num_test_examples, block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=150, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_test = encoder.transform(forest.apply(X_test))
    # Make sure that classes are weighted inversely to their frequencies
    weights = float(y_train.shape[0]) / (2 * numpy.bincount(y_train))
    class_weights = {0: weights[0], 1: weights[1]}
    learner = SGDClassifier(loss="hinge",
                            penalty="l2",
                            learning_rate="invscaling",
                            alpha=0.0001,
                            average=10**4,
                            eta0=1.0,
                            class_weight=class_weights)

    num_passes = 3
    aucs = []

    for j in range(num_passes):
        for i in range(0, num_train_examples, block_size):
            df = pandas.read_csv(input_filename,
                                 header=None,
                                 skiprows=i,
                                 nrows=block_size)
            X_train = df.values[:, 1:]
            X_train = scaler.transform(X_train)
            X_train = encoder.transform(forest.apply(X_train))
            y_train = numpy.array(df.values[:, 0], dtype=int)
            del df

            learner.partial_fit(X_train, y_train, classes=numpy.array([0, 1]))
            y_pred_prob = learner.decision_function(X_test)
            auc = roc_auc_score(y_test, y_pred_prob)
            aucs.append([i + num_train_examples * j, auc])
            print(aucs[-1])

    df = pandas.DataFrame(aucs, columns=["Iterations", "AUC"])
    df = df.set_index("Iterations")
    return df
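A hedged invocation sketch of train; the CSV path, example counts and block size are placeholders, and loaddata plus the imports used inside the function are assumed to be available.

# Hypothetical call; "clicks.csv" and the sizes are placeholders.
auc_history = train("clicks.csv", num_train_examples=100000,
                    num_test_examples=20000, block_size=10000)
print(auc_history.tail())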
Example #20
def kfingerprinting(X_train, X_test, y_train, y_test):
    logger.info('training...')
    model = RandomForestClassifier(n_jobs=-1,
                                   n_estimators=1000,
                                   oob_score=True)
    model.fit(X_train, y_train)
    #    M = model.predict(X_test)
    # for i in range(0,len(M)):
    #     x = M[i]
    #     label = str(Y_test[i][0])+'-'+str(Y_test[i][1])
    #     logger.info('%s: %s'%(str(label), str(x)))
    acc = model.score(X_test, y_test)
    logger.info('Accuracy = %.4f' % acc)
    train_leaf = zip(model.apply(X_train), y_train)
    test_leaf = zip(model.apply(X_test), y_test)
    joblib.dump(model, 'ranpad2_0610_2057_norm.pkl')
    return train_leaf, test_leaf
Example #21
    def _fit(self, dataset, **options):
        # self.param = param
        # print('model GBDT_LR fit begin:')
        # Random forest model used as the leaf encoder
        rf = RandomForestClassifier(**options)
        rf.fit(dataset.x, dataset.y)
        #
        enc = OneHotEncoder()
        enc.fit(rf.apply(dataset.x))

        lm = LogisticRegression(**self.model_params)
        x = enc.transform(rf.apply(dataset.x))
        lm.fit(x, dataset.y)

        self.tree = rf
        self.enc = enc
        self.m = lm
Example #24
class Train(object):
    """docstring for TrainModel"""
    def preprocess_model(self):
        '''This allows preprocessing using logistic regression'''
        X_train, X_train_lr, y_train, y_train_lr = train_test_split(
            self.train, self.predictors, test_size=0.5)
        encode = OneHotEncoder()
        logistic = LogisticRegression()
        self.clf = RandomForestClassifier(n_estimators=512,
                                          oob_score=True,
                                          n_jobs=-1)
        self.clf.fit(X_train, y_train)
        encode.fit(self.clf.apply(X_train))
        self.predmodel = logistic.fit(
            encode.transform(self.clf.apply(X_train_lr)), y_train_lr)

    def train_model(self):
        '''This is standard model training'''
        '''For RandomForestClassifier to work there must be no NaN values; one
        way of handling this is to use the --impute option. This uses mean
        imputation, which is the least informative imputer; imputation is done
        per feature.
        '''
        if np.any(np.isnan(self.train)):
            warnings.warn('RandomForestClassifier requires no missing data,\
                           features being imputed by mean')
            X = self.train
            imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
            imp.fit(X)
            self.train = imp.transform(X)
        self.clf = RandomForestClassifier(n_estimators=512,
                                          oob_score=True,
                                          n_jobs=-1)
        self.predmodel = self.clf.fit(X=self.train,
                                      y=self.predictors,
                                      sample_weight=self.weights)

    def __init__(self, train):
        self.train = train.train
        self.predictors = train.predictors
        self.features = train.feature_names
        self.weights = train.weights
Example #25
def RF_openworld(mon_type, path_to_dict = dic_of_feature_data):
    '''Produces leaf vectors used for classification.'''

    mon_training, mon_test = mon_train_test_references(mon_type, path_to_dict)
    unmon_training, unmon_test = unmon_train_test_references(path_to_dict)

    training = mon_training + unmon_training
    test = mon_test + unmon_test

    tr_data, tr_label1 = zip(*training)
    tr_label = list(zip(*tr_label1))[0]
    te_data, te_label1 = zip(*test)
    te_label = list(zip(*te_label1))[0]

    print("Training ...")
    model = RandomForestClassifier(n_jobs=-1, n_estimators=num_Trees, oob_score=True)
    model.fit(tr_data, tr_label)

    train_leaf = list(zip(model.apply(tr_data), tr_label))
    test_leaf = list(zip(model.apply(te_data), te_label))
    return train_leaf, test_leaf
def rf_lr(X_train, X_test, y_train, y_test):
    """
    RF + LR

    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :return:
    """
    # Supervised transformation based on random forests
    rf = RandomForestClassifier(n_estimators=n_estimator, max_depth=3)
    rf.fit(X_train, y_train)
    # Fit the one-hot encoding of the leaf indices
    rf_enc = OneHotEncoder(categories='auto')
    rf_enc.fit(rf.apply(X_train))
    # Train LR using the one-hot encoded leaves as features
    rf_lr = LogisticRegression(solver='lbfgs', max_iter=1000)
    rf_lr.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
    # Predict with LR
    y_pred_rf_lr = rf_lr.predict_proba(rf_enc.transform(rf.apply(X_test)))[:,
                                                                           1]
    fpr_rf_lr, tpr_rf_lr, _ = roc_curve(y_test, y_pred_rf_lr)
    return fpr_rf_lr, tpr_rf_lr
class EntityEmbeddingTree(BaseEstimator, TransformerMixin):
    def __init__(self, *, numeric_columns, categorical_columns):
        self.__numeric_columns = numeric_columns
        self.__categorical_columns = categorical_columns
        self.__target_encoder, self.__one_hot_encoder = [
            None for _ in range(2)
        ]
        self.__max_target, self.__max_param = [None for _ in range(2)]
        self.__clf = None

    def fit(self, X, y):
        X = X.copy(deep=True)
        y = y.copy(deep=True)

        self.__target_encoder = TargetEncoder()
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.fit_transform(
            X[self.__categorical_columns], y)

        self.__max_target, self.__max_param = optimize_rf(X, y)
        self.__clf = RandomForestClassifier(
            min_samples_leaf=max(
                min(self.__max_param["min_samples_leaf"], 1.0), 0),
            n_estimators=max(int(round(self.__max_param["n_estimators"])), 1))

        self.__clf.fit(X, y)
        gc.collect()

        return self

    def transform(self, X):
        X = X.copy(deep=True)

        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.transform(
            X[self.__categorical_columns])
        gc.collect()

        return pd.DataFrame(self.__clf.apply(X)).astype(str)

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X=X, y=y)

        return self.transform(X)
    def detect(self, X, y):
        X, y = self._check_everything(X, y)

        forest = RandomForestClassifier(n_estimators=self.n_estimators,
                                        max_leaf_nodes=self.max_leaf_nodes,
                                        n_jobs=self.n_jobs,
                                        random_state=self.random_state).fit(
                                            X, y)

        Xs = forest.apply(X)
        knn = KNeighborsClassifier(n_neighbors=self.n_neighbors,
                                   metric='hamming',
                                   algorithm='brute',
                                   weights=self.weight,
                                   n_jobs=self.n_jobs).fit(Xs, y)

        return self._get_kdn(knn, y)
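For context, a self-contained sketch of the idea in detect: describe each sample by its per-tree leaf indices and run a brute-force k-NN with Hamming distance on that representation (synthetic data; the surrounding class attributes are not reproduced here):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

X_demo, y_demo = make_classification(n_samples=300, n_features=10, random_state=0)
forest_demo = RandomForestClassifier(n_estimators=50, max_leaf_nodes=16,
                                     random_state=0).fit(X_demo, y_demo)
leaves = forest_demo.apply(X_demo)          # (n_samples, n_trees) leaf indices

knn = KNeighborsClassifier(n_neighbors=5, metric='hamming', algorithm='brute',
                           weights='uniform').fit(leaves, y_demo)
neighbor_idx = knn.kneighbors(leaves, return_distance=False)
print(neighbor_idx.shape)                   # per row: indices of the 5 nearest samples in leaf space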
def runRfStack(inputfile, outputfile):
    '''
    Input file path and output file path
    '''
    df_all = pd.read_csv(inputfile)
    df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:]))

    # Values are filled with 0 by default; explicitly trying a negative value is another option
    df_all.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_all = df_all.fillna(0)

    # Values are filled with 0 by default; explicitly trying a negative value is another option
    features = df_all.columns[0:]
    features = list(features)
    features.remove('EID')
    label = 'TARGET'

    df_train, df_test = xtrain_and_test(df_all)

    clf = RandomForestClassifier(
        n_estimators=50,  # 50 trees
        max_depth=7,
        n_jobs=4,
        random_state=101)

    X_train = df_train[features]
    Y_label = df_train[label]

    X_test = df_test[features]

    clf.fit(X_train, Y_label)
    column = ['STACKFEATURE' + str(i) for i in range(50)]
    df_new_feature = pd.DataFrame(clf.apply(df_all[features]), columns=column)
    df_all[column] = df_new_feature

    df_all.to_csv(outputfile, index=False, index_label=False)
    del df_train, df_test, df_all
    return outputfile
Example #32
def rf_embed(feature, labels, n_estimators=100, max_depth=3):
    """construct an embedding using a random forest.

	Args:
		feature (np.ndarray): a matrix of shape (len(labels),D) with D>0.
		labels (list): a list of integers on the range [0,1]
		n_estimators (int): the number of trees in the random forest
		max_depth (int): the maximum depth of each tree

	Returns:
		np.ndarray: a matrix containing all pairwise similarities for a single categorical distribution (feature).

	"""
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 bootstrap=True,
                                 criterion='entropy',
                                 class_weight='balanced')
    clf.fit(feature, labels)
    leaves = clf.apply(feature)
    embedded = np.array(
        OneHotEncoder(categories='auto').fit_transform(leaves).todense())
    return 1. - cosine_distances(embedded)
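A hedged usage sketch of rf_embed on synthetic data, assuming the imports used inside rf_embed (RandomForestClassifier, OneHotEncoder, cosine_distances) are in scope:

import numpy as np

rng = np.random.RandomState(0)
feature_demo = rng.randn(100, 5)                 # (len(labels), D) feature matrix
labels_demo = list(rng.randint(0, 2, size=100))  # integer labels in {0, 1}
S = rf_embed(feature_demo, labels_demo, n_estimators=50, max_depth=3)
print(S.shape)                                   # (100, 100) pairwise similarity matrix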
Example #33
# After fitting the random forest, one-hot encode the leaf indices (.apply) and fit a logistic regression on them
rf = RandomForestClassifier(n_estimators=20,
                            min_samples_split=10,
                            min_samples_leaf=5,
                            max_features=4,
                            max_depth=3,
                            bootstrap=True)
onehot = OneHotEncoder()
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
"""
Your Code Here (Hint : 隨機森林的葉編碼(.apply)不需要加上[:, :, 0], 直接用rf.apply()調用即可, 本作業其餘寫法相同)
"""

rf.fit(train_X, train_Y)
onehot.fit(rf.apply(train_X))
lr.fit(onehot.transform(rf.apply(val_X)), val_Y)

# Output the results of random forest + leaf encoding + logistic regression
pred_rf_lr = lr.predict_proba(onehot.transform(rf.apply(test_X)))[:, 1]
fpr_rf_lr, tpr_rf_lr, _ = roc_curve(test_Y, pred_rf_lr)
# Output the plain random forest results
pred_rf = rf.predict_proba(test_X)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(test_Y, pred_rf)

import matplotlib.pyplot as plt
# Plot the results
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf, tpr_rf, label='RandomForest')
plt.plot(fpr_rf_lr, tpr_rf_lr, label='RandomForest + LR')
plt.xlabel('False positive rate')
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

wine = load_wine()

x_train, x_test, y_train, y_test = train_test_split(wine.data,
                                                    wine.target,
                                                    test_size=0.3)
dtc = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(random_state=0, oob_score=True)
dtc.fit(x_train, y_train)
rfc.fit(x_train, y_train)

print("袋外数据", rfc.oob_score_)

rfc = RandomForestClassifier(n_estimators=25)
rfc.fit(x_train, y_train)
score = rfc.score(x_test, y_test)
fi = rfc.feature_importances_
result_apply = rfc.apply(x_test)
result_predict = rfc.predict(x_test)
result_proba = rfc.predict_proba(x_test)
Example #35
# y_pred_rf = model.predict_proba(enc.transform(model.apply(X_test)))[:, 1]

# model_prob = model.predict_proba(X_test)
# score=log_loss(Y_test,model_prob)
# score_mean=mean_squared_error(Y_test,model.predict(X_test))
# print("Score:",score)
# print("MSE:",score_mean)
# print("model_prob",model_prob)

# model_prob=model_prob.reshape(1,-1)
#
rf = RandomForestClassifier(max_depth=3)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegressionCV(cv=5)
rf.fit(X_train, Y_train.ravel())
rf_enc.fit(rf.apply(X_train))
model_used = rf_lm.fit(rf_enc.transform(rf.apply(X_test)), Y_test.ravel())
preds = rf.predict(X_test)
print(type(model_used))

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(Y_test, y_pred_rf_lm, pos_label='pos')
roc_auc = auc(fpr_rf_lm, tpr_rf_lm)

plt.plot(fpr_rf_lm, tpr_rf_lm, label=str(roc_auc))
plt.title('ROC curve-Test Set (With Imbalance removal using SMOTE), AUC: ' +
          str(roc_auc))
plt.xlabel('False positive rate')
plt.plot([0, 1], [0, 1], 'k--')
# plt.legend('AUC:'str(round(roc_auc)))
plt.ylabel('True positive rate')
Example #36
class Classifier:
    def __init__(self,
                 max_depth,
                 directory,
                 tree_file_name='tree.txt',
                 title='Embedded Values',
                 already_classified=False,
                 n_neighbors=100,
                 train=True,
                 n_estimators=10,
                 file_included_in_directory=False,
                 file_name='Untitled.csv',
                 reduce=False,
                 path='auto',
                 dimension=3,
                 mock=False,
                 index='gene_callers_id',
                 create_folder=False,
                 folder_name='folder',
                 separator=None,
                 norm=True,
                 _filter=epsilon,
                 rows=100,
                 columns=100):

        self.algorithm_filename = 'metagenome-centric_classifier_algorithm.sav'
        self.train = train
        self.n_estimators = n_estimators
        self.max_depth = max_depth

        self.embedded = Embedding(n_neighbors,
                                  directory,
                                  path=path,
                                  reduced_dimension=dimension,
                                  train=train,
                                  mock=mock,
                                  file_name=file_name,
                                  index=index,
                                  create_folder=create_folder,
                                  folder_name=folder_name,
                                  separator=separator,
                                  norm=norm,
                                  _filter=_filter,
                                  rows=rows,
                                  columns=columns,
                                  tree_file='tree.txt')

        self.directory = self.embedded.directory
        self.dataframe = self.embedded.embedded_dataframe
        self.X = self.embedded.coverage_values

        if train:
            self.train_data(tree_file_name, title)

        self.fit_data()

    def train_data(self, tree_file_name, title):
        '''
        Allows user to manually train the data
        '''
        directory = self.embedded.directory
        coverage_values_file = self.embedded.embedded_coverage_values_file
        classified_values_file = self.embedded.embedded_classified_values_file

        training_data = training.Train(
            directory=directory,
            coverage_values_file=coverage_values_file,
            classified_values_file=classified_values_file,
            tree_file=tree_file_name,
            title=title)

        self.X = training_data.coverage_values
        self.y = training_data.classified_values

    def save_model(self):
        '''
        Saves model to a binary file using pickle
        '''
        with open(self.algorithm_filename, 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self):
        '''
        Loads model from a binary file using pickle
        '''
        with open(self.algorithm_filename, 'rb') as f:
            return pickle.load(f)

    def fit_data(self):
        '''
        Performs the random forest classifier
        '''

        # checks if random forest classifier has already been created
        try:
            self.model = self.load_model()
        except (FileNotFoundError, EOFError):
            self.model = RandomForestClassifier(n_estimators=self.n_estimators,
                                                max_depth=self.max_depth)
        if self.train:
            self.model.fit(self.X, self.y)
        else:
            self.model.apply(self.X)

        self.save_model()
Example #37
# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                          random_state=0)

rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
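This block follows the scikit-learn feature-transformation-with-ensembles recipe and assumes n_estimator plus three data splits; a hedged sketch of that setup on synthetic data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

n_estimator = 10
X_all, y_all = make_classification(n_samples=80000, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.5, random_state=10)
# split the training half again: one part trains the ensembles, the other the logistic regressions
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train,
                                                            test_size=0.5, random_state=10)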
Example #38
def show_roc(classifier, with_probas):
    cv = StratifiedKFold(labels[:-1], n_folds=5)

    for i, (train, test) in enumerate(cv):

        vectorizer = CountVectorizer(vocabulary=vocab)
        features = vectorizer.fit_transform(data[train])
        #transformer = TfidfTransformer()
        #tfidf_features = transformer.fit(features).transform(features)
        #X = np.array(tfidf_features.todense())

        #X = preprocess(features.toarray())
        X = features.toarray()
        y = labels[train]

        X, X1, y, y1 = train_test_split(X, y, test_size=0.5)
        clf1 = RandomForestClassifier(n_estimators=20)
        enc = OneHotEncoder()
        clf2 = RandomForestClassifier(n_estimators=10)
        clf1.fit(X, y)
        enc.fit(clf1.apply(X))
        clf2.fit(enc.transform(clf1.apply(X1)), y1)

      
        #clf = classifier.fit(X, y)

        X_test = vectorizer.transform(data[test])
        #t_f = preprocess(t_features.toarray())
        y_test = labels[test]
        #res = clf.predict(t_f)
        
        res = clf2.predict(enc.transform(clf1.apply(X_test)))  

        if with_probas:
            res_p = clf2.predict_proba(enc.transform(clf1.apply(X_test)))
            #res_p = clf.predict_proba(t_features)
            fpr, tpr, _ = roc_curve(y_test, res_p[:,1])
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        check = zip(y_test, res)
        tp, tn, fp, fn = 0, 0, 0, 0
        for value, prediction in check:
            if (prediction and value):
                tp += 1
            if (prediction and not value):
                fp += 1
            if (not prediction and value):
                fn += 1
            if (not prediction and not value):
                tn += 1
        print ('TP: {0}, TN: {1}, FP: {2}, FN: {3}'.format(tp, tn, fp, fn))
        print ("Precision Score : %f" % metrics.precision_score(y_test, res))
        print ("Recall Score : %f" % metrics.recall_score(y_test, res))
        print ("Accuracy : %.4g" % metrics.accuracy_score(y_test, res))
        print ("AUC Score (Train): %f" % metrics.roc_auc_score(y_test, res))

    if with_probas:
        plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic example')
        plt.legend(loc="lower right")
        plt.show()