Example #1
    def tlearn(self,dev,test,val,bad_ind,featureA,featureB,featureC,drop_rate):  
        """ 
            dev 训练集 源域 
            test 测试集 辅助域 
            val 验证集 
            bad_ind 标签 
            featureA 特征组A 
            featureB 特征组B 
            featureC 特征组C 
        """  
        print(len(featureA),len(featureB),len(featureC))  
        result = pd.DataFrame()  
        temp_test = test  
        features = list(set(featureA+featureB+featureC))  
        turn = 1  
        while( turn <= self.max_turns):  
            new = pd.DataFrame()  
              
            """ 
                模型A对特征组featureA训练, 
                并预测得到dev和test和val的概率 
                以及test上的分类结果(分数分布在0.8*(min+max)两侧)
            """  
            self.clfA.fit(dev[featureA],dev[bad_ind])  
            predA= self.clfA.predict_proba(dev[featureA])[:,1]   
            probA = self.clfA.predict_proba(test[featureA])[:,1]  
            preA = (probA > (np.max(probA)+np.min(probA))*0.8)  
            valid_a = self.clfA.predict_proba(val[featureA])[:,1]   
            """ 
                模型B对特征组featureB训练, 
                并预测得到dev和test和val的概率 
                以及test上的分类结果(分数分布在0.8*(min+max)两侧)
            """  
            self.clfB.fit(dev[featureB],dev[bad_ind])  
            predB = self.clfB.predict_proba(dev[featureB])[:,1]  
            probB = self.clfB.predict_proba(test[featureB])[:,1]  
            preB = (probB > (np.max(probB)+np.min(probB))*0.8)
            valid_b = self.clfB.predict_proba(val[featureB])[:,1]  
            """ 
                模型C对特征组featureC训练, 
                并预测得到dev和test和val的概率 
                以及test上的分类结果(分数分布在0.8*(min+max)两侧) 
            """              
            self.clfC.fit(dev[featureC],dev[bad_ind])  
            predC= self.clfC.predict_proba(dev[featureC])[:,1]  
            probC = self.clfC.predict_proba(test[featureC])[:,1]  
            preC = (probC > (np.max(probC)+np.min(probC))*0.8)  
            valid_c = self.clfC.predict_proba(val[featureC])[:,1]  
            """ 
                分别计算三个模型在val上的AUC 
                模型加权融合的策略:以单模型的AUC作为权重
            """  
            valid_scoreA = AUC(val[bad_ind],valid_a)  
            valid_scoreB = AUC(val[bad_ind],valid_b)  
            valid_scoreC = AUC(val[bad_ind],valid_c)  
            valid_score = AUC(val[bad_ind], valid_a*valid_scoreA
                              + valid_b*valid_scoreB + valid_c*valid_scoreC)
              
            """ 
                index1 三个模型在test上的预测概率相同的样本 
                sum_va 三个模型AUC之和为分母做归一化 
                prob 测试集分类结果融合, 
                index1(分类结果)*AUC(权重)/sum_va(归一化分母) 
                index2 分类结果升序排列,取出两端的test样本 
                new 筛选后样本集 
            """  
            index1 = (preA==preB) & (preA==preC)  
            sum_va = valid_scoreA+valid_scoreB+valid_scoreC  
            prob = (probC[index1]*valid_scoreC + probA[index1]*valid_scoreA
                    + probB[index1]*valid_scoreB)/sum_va
            Ap_low = np.sort(prob)[int(len(prob)*turn/2.0/self.max_turns)] - 0.01
            Ap_high = np.sort(prob)[int(len(prob)*(1-turn/2.0/self.max_turns))] + 0.01
            index2 = ((prob>Ap_high) | (prob<Ap_low))    
            new['no'] = test['no'][index1][index2]      
            new['pred'] = prob[index2]  
            result = result.append(new)  
            """ 
                rightSamples 同时满足index1和index2条件的预测概率 
                score_sim 三个模型在test上的预测结果差异和 
            """  
            rightSamples = test[index1][index2]  
            rightSamples[bad_ind] = preA[index1][index2]  
  
            score_sim = np.sum(abs(probA-probB) + abs(probA-probC)
                               + abs(probB-probC) + 0.1)/len(probA)
            """ 
                从数据集dev中取出step之后的部分样本并计算AUC 
                valid_score 前文三模型加权融合的AUC 
                得到drop 
            """  
            true_y = dev.iloc[self.step:][bad_ind]
            dev_prob = (predA[self.step:]*valid_scoreA + predB[self.step:]*valid_scoreB
                        + predC[self.step:]*valid_scoreC)

            dev_score = AUC(true_y, dev_prob)

            drop = self.max_turns/(1 + drop_rate*np.exp(-self.max_turns)*valid_score)
            """ 
                使用Traddaboost相同的权重调整方法, 
                挑选权重大于阈值的样本。 
            """  
            loss_bias = 0  
            if(self.step>0):  
                true_y = dev.iloc[0:self.step][bad_ind]  
                temp = (predA[0:self.step]*valid_scoreA
                        + predB[0:self.step]*valid_scoreB
                        + predC[0:self.step]*valid_scoreC)
                temp = (temp+0.1)/(max(temp)+0.2)  # normalize
                temp = (true_y-1)*np.log(1-temp)-true_y*np.log(temp)  # per-sample weights (log loss)
                loc = int(min(self.step, len(rightSamples)*drop+2)
                          * np.random.rand())  # how many samples to drop
                loss_bias = np.sort(temp)[-loc]
                temp = np.append(temp, np.zeros(len(dev)-self.step)-99)
                remain_index = (temp <= loss_bias)  
                self.step = self.step-sum(1-remain_index)  
            else:  
                remain_index = []  
                  
            """ 
                得到新的test 
            """  
            dev = dev[remain_index].append(rightSamples[features+[bad_ind,'no']])  
            test = test[~test.index.isin(rightSamples.index)]  
            turn += 1  
        """ 
            计算原始test上的AUC 
        """  
        probA = self.clfA.predict_proba(test[featureA])[:,1]  
        pA = self.clfA.predict_proba(temp_test[featureA])[:,1]  
        valid_a = self.clfA.predict_proba(val[featureA])[:,1]  
  
        probB = self.clfB.predict_proba(test[featureB])[:,1]  
        valid_b = self.clfB.predict_proba(val[featureB])[:,1]  
        pB = self.clfB.predict_proba(temp_test[featureB])[:,1]  
  
        probC = self.clfC.predict_proba(test[featureC])[:,1]
        valid_c = self.clfC.predict_proba(val[featureC])[:,1]
        pC = self.clfC.predict_proba(temp_test[featureC])[:,1]
  
        self.scoreA = AUC(val[bad_ind],valid_a)  
        self.scoreB = AUC(val[bad_ind],valid_b)  
        self.scoreC = AUC(val[bad_ind],valid_c)  

        return pA,pB,pC  
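A usage sketch (hedged: the enclosing class is not shown in the snippet, so the constructor name and its arguments are assumptions):

# tl = TriTrainingTransfer(clfA=clf1, clfB=clf2, clfC=clf3)  # must also define max_turns and step
# pA, pB, pC = tl.tlearn(dev, test, val, 'bad_ind', featsA, featsB, featsC, drop_rate=0.5)
# p_fused = pA*tl.scoreA + pB*tl.scoreB + pC*tl.scoreC  # AUC-weighted fusion, as inside the loop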
Example #2
def run_all_classifiers_cv(cl_name, example_index, param_index):
    ''' get results for all the classifiers '''

    data_dir = os.getcwd() + '/'

    all_datasets = ['iris_bintarget', 'ilpd', 'adult_data_bintarget', 'credit_card_clients', \
                'ionosphere', 'iris_bintarget', 'parkinsons', 'pima-indians-diabetes', \
                'TomsHardware_bintarget', 'transfusion', 'Twitter_bintarget_small', 'wdbc']
    datasets = []
    if (example_index == -1):
        datasets = all_datasets
    else:
        datasets.append(all_datasets[example_index])

    num_exp = 10
    for exp_name in datasets:
        results = []
        timeTakenData = []
        ### loop over the cv-fold (to be consistent with SAT results these are saved in files)
        for e_num in range(num_exp):

            fname_tr = data_dir + 'Train/' + '%s_%d_train.csv' % (exp_name,
                                                                  e_num)
            fname_tst = data_dir + 'Test/' + '%s_%d_test.csv' % (exp_name,
                                                                 e_num)

            X_tr, X_tst, y_tr, y_tst, feat_names, target_name = load_check_data(
                fname_tr, fname_tst)

            if cl_name == 'rf':
                gen_cl = gen_rf
                cv_param = 'min_samp'
            elif cl_name == 'logreg':
                gen_cl = gen_logreg
                cv_param = 'C'
            elif cl_name == 'nn':
                gen_cl = lambda: gen_nn(max_nn=len(y_tr))
                cv_param = 'num_nbs'
            elif cl_name == 'svc':
                gen_cl = gen_svc
                cv_param = 'C'
            else:
                assert False, "No such classifier %s" % cl_name
            print type(gen_cl())
            #cl, min_samp= gen_cl()[0]

            ### generator over the classifiers over a predefined range of parameters
            #for i in range(0,1):
            param_wait_index = 0
            for cl, min_samp in gen_cl():
                if (not (param_wait_index == param_index)):
                    param_wait_index += 1
                    continue
                param_wait_index += 1
                startTime = time.time()
                cl.fit(X_tr, y_tr)
                endTime = time.time()
                timeTaken = endTime - startTime
                y_hat = cl.predict(X_tst)
                p = cl.predict_proba(X_tst)

                acc = accuracy_score(y_tst, y_hat)
                auc = AUC(y_tst, p[:, 1])

                results.append([e_num, min_samp, acc, auc])
                timeTakenData.append([e_num, min_samp, timeTaken])
        print np.array(results)

        results_pd = pd.DataFrame(results,
                                  index=None,
                                  columns=['exp_num', cv_param, 'acc', 'auc'])
        timeTaken_pd = pd.DataFrame(timeTakenData,
                                    index=None,
                                    columns=['exp_num', cv_param, 'Time'])
        #print results_pd

        av_cv = results_pd.groupby(cv_param).mean()
        av_time = timeTaken_pd.groupby(cv_param).mean()
        del av_cv['exp_num']
        del av_time['exp_num']
        fname_out = 'Results/%s_%s_%d_results.csv' % (exp_name, cl_name,
                                                      param_index)
        av_cv.to_csv(fname_out)
        fname_out = 'Results/time_%s_%s_%d_results.csv' % (exp_name, cl_name,
                                                           param_index)
        av_time.to_csv(fname_out)
print("{0}{1}".format(st, fraction_correct(svc_pred, te[1])))

#### Evaluate Classifier Performance

##### ROC

# In[148]:

# because we have a binary classification problem,
# we can use ROC to evaluate the quality of these models

#logistic regression
pred_prob_lr = lr.predict_proba(te[0])
false_pos_rate_lr, true_pos_rate_lr, thresholds_lr = ROC(
    te[1], pred_prob_lr[:, 1])
roc_auc_lr = AUC(false_pos_rate_lr, true_pos_rate_lr)
print(
    "Logistic Regression, area under the curve: {0:>9.3f}".format(roc_auc_lr))

# svm
pred_prob_svm = svc.predict_proba(te[0])
false_pos_rate_svm, true_pos_rate_svm, thresholds_svm = ROC(
    te[1], pred_prob_svm[:, 1])
roc_auc_svm = AUC(false_pos_rate_svm, true_pos_rate_svm)
print("SVM, area under the curve: {0:>25.3f}".format(roc_auc_svm))

# In[170]:

# plot the ROC curves for each classifier

fpr_lr, tpr_lr = false_pos_rate_lr, true_pos_rate_lr
Example #4
# num_boost_round = 1500
# # watchlist = [(xg_train, 'train'), (xg_test, 'eval')]
# num_round=15
# bst = xgb.train(param, xg_train, num_round)
# preds = bst.predict(xg_test)
# auc = AUC(Y_test , preds)
# print(auc)

# filter features by importance
# score = bst.get_score(importance_type='gain')
# print(score)
xgbc = XGBC(max_depth=200, seed=999, n_estimators=100, scale_pos_weight=13)
xgbc.fit(X_train, Y_train)
Y_test_proba = xgbc.predict_proba(X_test)
print("训练a的数据,得分auc:")
print(AUC(Y_test, Y_test_proba[:, 1]))

# the default is information gain: importance_type="gain"
feature_importances_ = xgbc.feature_importances_
feature_importances_series = pd.Series(feature_importances_)
feature_importances_series.index = effective_columns
effective_columns_before = effective_columns
# dropping the weakly discriminative features actually made results worse
feature_importances_series = feature_importances_series[
    feature_importances_series.values > 0]

print("根据属性重要性,提取属性大小")
print(feature_importances_series.shape)
effective_columns = feature_importances_series.index.values
# 展示列的重要性排序
sort_values = feature_importances_series.sort_values(ascending=False)
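A minimal follow-up sketch for the importance filter above (hedged: it assumes X_train/X_test are pandas DataFrames so that selecting effective_columns works; as the comment in the snippet notes, this filtering actually hurt performance):

xgbc_reduced = XGBC(max_depth=200, seed=999, n_estimators=100, scale_pos_weight=13)
xgbc_reduced.fit(X_train[effective_columns], Y_train)  # retrain on the reduced feature set
proba_reduced = xgbc_reduced.predict_proba(X_test[effective_columns])
print(AUC(Y_test, proba_reduced[:, 1]))  # compare against the unfiltered score above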
Example #5
cv = CV.StratifiedKFold(y, n_folds=5, shuffle=True, random_state=20180512)

for f, (train_i, test_i) in enumerate(cv):

    print("# fold {}, {}".format(f + 1, ctime()))

    x_train = x.iloc[train_i]
    x_test = x.iloc[test_i]
    y_train = y.iloc[train_i]
    y_test = y.iloc[test_i]

    clf.fit(x_train, y_train)

    p = clf.predict_proba(x_test)[:, 1]

    auc = AUC(y_test, p)
    print("# AUC: {:.2%}\n".format(auc))

    predictions[test_i] = p

# fold 1, Mon Jun 25 19:39:24 2018
# AUC: 82.71%

# fold 2, Mon Jun 25 19:39:48 2018
# AUC: 82.99%

# fold 3, Mon Jun 25 19:40:10 2018
# AUC: 82.20%

# fold 4, Mon Jun 25 19:40:31 2018
# AUC: 83.54%
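A natural one-line follow-up (not part of the original snippet): once every fold has filled its slice of predictions, the overall out-of-fold AUC is

print("# overall OOF AUC: {:.2%}".format(AUC(y, predictions)))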
Example #6
 def fallback_auc(y_true, y_pred):
     try:
         return AUC(y_true, y_pred)
     except ValueError:  # roc_auc_score fails when y_true has a single class
         return 0.5
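Why the fallback exists (a note, not from the original source): sklearn's roc_auc_score raises ValueError when y_true contains only one class, e.g. in a small validation batch, so returning a neutral 0.5 keeps the training loop alive.

# fallback_auc([1, 1, 1], [0.2, 0.6, 0.9])  # AUC undefined for a single class -> 0.5
# fallback_auc([0, 1, 1], [0.2, 0.6, 0.9])  # perfect ranking -> 1.0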
Example #7
def train_predict_lr_forward(train_file,
                             test_file,
                             predict_valid_file,
                             predict_test_file,
                             C,
                             n_fold=5):

    feature_name = os.path.basename(train_file)[:-8]
    algo_name = 'lr_forward_{}'.format(C)
    model_name = '{}_{}'.format(algo_name, feature_name)
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info("Loading training and test data...")
    X_trn, y_trn = load_data(train_file, dense=True)
    X_tst, _ = load_data(test_file, dense=True)

    logging.info('Normalizing data')
    scaler = StandardScaler()
    X_trn = scaler.fit_transform(X_trn)
    X_tst = scaler.transform(X_tst)

    cv = StratifiedKFold(y_trn,
                         n_folds=n_fold,
                         shuffle=True,
                         random_state=2015)

    selected_features = []
    features_to_test = [
        x for x in range(X_trn.shape[1]) if x not in selected_features
    ]

    auc_cv_old = .5
    is_improving = True
    while is_improving:
        auc_cvs = []
        for feature in features_to_test:
            logging.info('{}'.format(selected_features + [feature]))
            X = X_trn[:, selected_features + [feature]]

            p_val = np.zeros_like(y_trn)
            for i, (i_trn, i_val) in enumerate(cv, start=1):
                clf = LR(C=C, class_weight='auto', random_state=2014)
                clf.fit(X[i_trn], y_trn[i_trn])
                p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

            auc_cv = AUC(y_trn, p_val)
            logging.info('AUC CV: {:.6f}'.format(auc_cv))
            auc_cvs.append(auc_cv)

        auc_cv_new = max(auc_cvs)
        if auc_cv_new > auc_cv_old:
            auc_cv_old = auc_cv_new
            feature = features_to_test.pop(auc_cvs.index(auc_cv_new))
            selected_features.append(feature)
            logging.info('selected features: {}'.format(selected_features))
        else:
            is_improving = False
            logging.info(
                'final selected features: {}'.format(selected_features))

    logging.info('saving selected features as a file')
    with open('{}_selected.txt'.format(model_name), 'w') as f:
        f.write('{}\n'.format(selected_features))

    X = X_trn[:, selected_features]
    logging.debug('feature matrix: {}x{}'.format(X.shape[0], X.shape[1]))

    p_val = np.zeros_like(y_trn)
    for i, (i_trn, i_val) in enumerate(cv, start=1):
        logging.info('Training CV #{}'.format(i))
        clf = LR(C=C, class_weight='auto', random_state=2015)
        clf.fit(X[i_trn], y_trn[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

    auc_cv = AUC(y_trn, p_val)
    logging.info('AUC CV: {:.6f}'.format(auc_cv))
    logging.info("Writing test predictions to file")
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    logging.info('Retraining with 100% data...')
    clf.fit(X, y_trn)
    p_tst = clf.predict_proba(X_tst[:, selected_features])[:, 1]
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #8
print "loading y..."

y = np.loadtxt(y_file,
               usecols=[0, 2],
               converters={2: lambda x: x.split('|')[0]})
y_true = y[:, 0]
pos_ids = y[y[:, 0] == 1][:, 1].astype(int)

# calculate precision

print "calculating AP@{}...".format(k)

# this snippet is from Kaggle code
countRelevants = 0
listOfPrecisions = list()
for i, p_id in enumerate(p_ids):
    currentk = i + 1.0
    if p_id in pos_ids:
        countRelevants += 1
    precisionAtK = countRelevants / currentk
    listOfPrecisions.append(precisionAtK)

print sum(listOfPrecisions) / k

print
print "confusion matrix:"
print confusion_matrix(y_true, y_predicted)

print
print "AUC:", AUC(y_true, p[:, 0])
Example #9
#!/usr/bin/env python
from sklearn.metrics import roc_auc_score as AUC
import argparse
import json
import numpy as np
import os

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--target-file',
                        '-t',
                        required=True,
                        dest='target_file')
    parser.add_argument('--predict-file',
                        '-p',
                        required=True,
                        dest='predict_file')

    args = parser.parse_args()

    # p contains the predictions across all K folds in cross validation
    p = np.loadtxt(args.predict_file, delimiter=',')
    y = np.loadtxt(args.target_file, delimiter=',')

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(args.predict_file))[0])[0]
    print('{}\t{:.6f}'.format(model_name, AUC(y, p)))
Example #10
y_file = sys.argv[1]
p_file = sys.argv[2]

print "loading p..."

p = np.loadtxt( p_file )

y_predicted = np.ones(( p.shape[0] ))
y_predicted[p < 0] = -1

print "loading y..."

y = np.loadtxt( y_file, usecols= [0] )

print "accuracy:", accuracy( y, y_predicted )
print "AUC:", AUC( y, p )

print
print "confusion matrix:"
print confusion_matrix( y, y_predicted )


"""
run score.py data/test_v.txt vw/p_v_logistic.txt
accuracy: 0.994675826535
confusion matrix:
[[27444   136]
 [  236 42054]]
AUC: 0.998418419401
"""
Example #11
for fold, (train_idx, test_idx) in enumerate(skf.split(x, y)):

    X_train, X_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    m.fit(X_train, y_train)
    probs = m.predict_proba(X_test)[:, 1]
    predictions[test_idx] = probs

# ### Results

# We'll use the ROC-AUC of this classifier as an estimate of how much covariate shift the data has. The AUC comes out very close to .5, meaning the classifier cannot tell whether a row comes from train or test; most observations therefore lie in a feature space common to both sets.

# In[25]:

print('ROC-AUC for X and Z distributions:', AUC(y, predictions))

# In the *predictions* array, we just computed the probability of a sample in the full dataset being sample taken from the training distribution ($train$). We'll call this $p(train|Data)$. Next we'll use the relationship that $p(train|Data) = 1 - p(test|Data)$ to estimate $\beta$ for our training samples,
#
# $\beta_i = \frac{p_i(test|Data)}{p_i(train|Data)} = \frac{1 - p_i(train|Data)}{p_i(train|Data)} = \frac{1}{p(train|Data)} - 1$.
#
# So we now have a method to convert the probability of each point belonging to the training distribution into our sample weights $\beta$. Let's see the distribution of these weights for the training samples

# In[27]:

plt.figure(figsize=(20, 10))
predictions_train = predictions[len(tst):]  # keep only the actual training rows
weights = (1. / predictions_train) - 1.
weights /= np.mean(weights)  # normalize the weights
plt.hist(weights, bins=50)  # the plotting call was cut off in the original; a histogram matches the labels below
plt.xlabel('Computed sample weight')
plt.ylabel('# Samples')
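A sketch of the typical next step (hedged: clf_task, X_train_rows, and y_task are hypothetical names; the original notebook stops at plotting the weights):

from sklearn.linear_model import LogisticRegression

clf_task = LogisticRegression()  # any estimator that accepts sample_weight
clf_task.fit(X_train_rows, y_task, sample_weight=weights)  # emphasize test-like rows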
Example #12
for i in tqdm(range(len(pIndexTrain))):
    start, stop, length = pIndexTrain[i, :]
    sampleWeights[start:stop] = 1 / length

##############################################################################################

from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import roc_auc_score as AUC
from Helper.utilities import showCoef

model = LR(class_weight='balanced', C=1e-1)
model.fit(Xtrain, Ytrain, sample_weight=sampleWeights)
showCoef(model.coef_[0], baseVars + timeVarNames)

P = model.predict_proba(Xtest)[:, 1]
print(AUC(Ytest, P))
## performance of 0.781 for base + allVars (1week)
## performance of 0.775 for base + creatVars (1week) ## DB 0.640
## performance of 0.749 for base
## performance of 0.683 for AKI stage 1

##################################################################################################
import xgboost as xgb

dtrain = xgb.DMatrix(Xtrain,
                     label=Ytrain,
                     weight=sampleWeights,
                     feature_names=baseVars + timeVarNames)
dtest = xgb.DMatrix(Xtest, label=Ytest, feature_names=baseVars + timeVarNames)

param = {
Example #13
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  model_file,
                  n_iter=100,
                  dim=4,
                  lrate=.1,
                  n_tree=30,
                  depth=4,
                  eta=0.05,
                  n_fold=5):

    # XGB parameters and configuration
    param = {
        'max_depth': depth,
        'eta': eta,
        'objective': 'binary:logistic',
        'colsample_bytree': .5,
        'subsample': .5,
        'eval_metric': 'auc',
        'seed': 2015
    }

    dir_feature = os.path.dirname(train_file)
    dir_model = os.path.dirname(model_file)
    dir_val = os.path.dirname(predict_valid_file)

    feature_name = os.path.basename(train_file)[:-8]
    xg_feature_name = 'xg_{}_{}_{}_{}'.format(n_tree, depth, eta, feature_name)
    algo_name = 'ffm_{}_{}_{}'.format(n_iter, dim, lrate)
    model_name = '{}_{}'.format(algo_name, feature_name)

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training data')
    X, y = load_svmlight_file(train_file)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        valid_train_file = os.path.join(
            dir_feature, '{}.trn{}.ffm'.format(xg_feature_name, i))
        valid_test_file = os.path.join(
            dir_feature, '{}.val{}.ffm'.format(xg_feature_name, i))
        valid_model_file = os.path.join(dir_model,
                                        '{}.trn{}.mdl'.format(model_name, i))
        valid_predict_file = os.path.join(dir_val,
                                          '{}.val{}.yht'.format(model_name, i))
        valid_train_svm_file = os.path.join(
            dir_feature, '{}.trn{}.sps'.format(feature_name, i))
        valid_test_svm_file = os.path.join(
            dir_feature, '{}.val{}.sps'.format(feature_name, i))

        # if there are no libsvm files, generate them first.
        if (not os.path.isfile(valid_train_file)) or \
           (not os.path.isfile(valid_test_file)):

            # if there are no libsvm files, generate them first.
            if not os.path.isfile(valid_train_svm_file):
                dump_svmlight_file(X[i_trn],
                                   y[i_trn],
                                   valid_train_svm_file,
                                   zero_based=False)

            if not os.path.isfile(valid_test_svm_file):
                dump_svmlight_file(X[i_val],
                                   y[i_val],
                                   valid_test_svm_file,
                                   zero_based=False)

            # generate XGB features
            dtrain = xgb.DMatrix(valid_train_svm_file)
            dtest = xgb.DMatrix(valid_test_svm_file)
            watchlist = [(dtest, 'eval'), (dtrain, 'train')]

            logging.info('Generating XGB features')
            xg = xgb.train(param, dtrain, n_tree, watchlist)
            xg_trn_feature = xg.predict(dtrain, pred_leaf=True)
            xg_tst_feature = xg.predict(dtest, pred_leaf=True)

            # save XGB features as the libffm format
            np_to_ffm(xg_trn_feature, dtrain.get_label(), valid_train_file)
            np_to_ffm(xg_tst_feature, dtest.get_label(), valid_test_file)

        subprocess.call([
            "ffm-train", '-k', '{}'.format(dim), '-r',
            str(lrate), '-t',
            str(n_iter), '-p', valid_test_file, valid_train_file,
            valid_model_file
        ])

        subprocess.call([
            "ffm-predict", valid_test_file, valid_model_file,
            valid_predict_file
        ])

        p[i_val] = np.loadtxt(valid_predict_file)

    logging.info('AUC = {:.6f}'.format(AUC(y, p)))
    np.savetxt(predict_valid_file, p, fmt='%.6f')

    ffm_train_file = os.path.join(dir_feature,
                                  '{}.trn.ffm'.format(xg_feature_name))
    ffm_test_file = os.path.join(dir_feature,
                                 '{}.tst.ffm'.format(xg_feature_name))
    if (not os.path.isfile(ffm_train_file)) or \
       (not os.path.isfile(ffm_test_file)):

        # generate XGB features
        dtrain = xgb.DMatrix(train_file)
        dtest = xgb.DMatrix(test_file)
        watchlist = [(dtrain, 'train')]

        logging.info('Generating XGB features')
        xg = xgb.train(param, dtrain, n_tree, watchlist)
        xg_trn_feature = xg.predict(dtrain, pred_leaf=True)
        xg_tst_feature = xg.predict(dtest, pred_leaf=True)

        # save XGB features as the libffm format
        np_to_ffm(xg_trn_feature, dtrain.get_label(), ffm_train_file)
        np_to_ffm(xg_tst_feature, dtest.get_label(), ffm_test_file)

    logging.info('Retraining with 100% data...')
    subprocess.call([
        "ffm-train", '-k', '{}'.format(dim), '-r',
        str(lrate), '-t',
        str(n_iter), '-p', ffm_test_file, ffm_train_file, model_file
    ])

    subprocess.call(
        ["ffm-predict", ffm_test_file, model_file, predict_test_file])
Example #14
def train_predict(train_file,
                  test_file,
                  train_svm_file,
                  predict_valid_file,
                  predict_test_file,
                  model_file,
                  n_iter=100,
                  lrate=.1,
                  n_fold=5):

    dir_feature = os.path.dirname(train_file)
    dir_model = os.path.dirname(model_file)
    dir_val = os.path.dirname(predict_valid_file)

    feature_name = os.path.basename(train_file)[:-7]
    algo_name = 'vw_{}_{}'.format(n_iter, lrate)
    model_name = '{}_{}'.format(algo_name, feature_name)

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training data')
    X, y = load_data(train_svm_file)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        valid_train_file = os.path.join(dir_feature,
                                        '{}.trn{}.vw'.format(feature_name, i))
        valid_test_file = os.path.join(dir_feature,
                                       '{}.val{}.vw'.format(feature_name, i))
        valid_model_file = os.path.join(dir_model,
                                        '{}.trn{}.mdl'.format(model_name, i))
        valid_predict_file = os.path.join(dir_val,
                                          '{}.val{}.yht'.format(model_name, i))

        # if there is no CV training or validation file, then generate them
        # first.
        if (not os.path.isfile(valid_train_file)
                or not os.path.isfile(valid_test_file)):
            # generate libsvm files
            valid_train_svm_file = os.path.join(
                dir_feature, '{}.trn{}.sps'.format(feature_name, i))
            valid_test_svm_file = os.path.join(
                dir_feature, '{}.val{}.sps'.format(feature_name, i))

            if not os.path.isfile(valid_train_svm_file):
                dump_svmlight_file(X[i_trn],
                                   y[i_trn],
                                   valid_train_svm_file,
                                   zero_based=False)

            if not os.path.isfile(valid_test_svm_file):
                dump_svmlight_file(X[i_val],
                                   y[i_val],
                                   valid_test_svm_file,
                                   zero_based=False)

            # then convert libsvm files into libffm formats
            svm_to_vw(valid_train_svm_file, valid_train_file, feature_name)
            svm_to_vw(valid_test_svm_file, valid_test_file, feature_name)

        subprocess.call([
            "vw", '--loss_function', 'logistic', '-d', valid_train_file, '-f',
            valid_model_file, '-l',
            str(lrate), '-b', '24', '-q', '::', '--passes',
            str(n_iter), '--cache_file', valid_train_file + '.tmp'
        ])

        subprocess.call([
            "vw", '-t', '-d', valid_test_file, '-i', valid_model_file, '-p',
            valid_predict_file
        ])

        p_val[i_val] = np.loadtxt(valid_predict_file)
        os.remove(valid_predict_file)

    logging.info('AUC = {:.4f}'.format(AUC(y, p_val)))
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')

    logging.info('Retraining with 100% data...')
    subprocess.call([
        "vw", '--loss_function', 'logistic', '-d', train_file, '-f',
        model_file, '-l',
        str(lrate), '-b', '24', '-q', '::', '--passes',
        str(n_iter), '--cache_file', train_file + '.tmp'
    ])

    subprocess.call([
        "vw", '-t', '-d', test_file, '-i', model_file, '-p', predict_test_file
    ])
Example #15
def train_and_eval_auc( model, train_x, train_y, test_x, test_y ):
    model.fit( train_x, train_y )
    p = model.predict_proba(test_x )
    auc = AUC( test_y, p[:,1] )
    return auc
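Usage sketch for the helper above (hedged: the model and data names are placeholders):

from sklearn.linear_model import LogisticRegression
print(train_and_eval_auc(LogisticRegression(), x_train, y_train, x_test, y_test))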
Example #16
data = pd.get_dummies(data=data, columns=cat_features)
# randomize before splitting them up into train and test sets
data = data.iloc[np.random.permutation(len(data))]
data.reset_index(drop = True, inplace = True)

x = data.drop(['Target'], axis = 1)
y = data.Target

train_examples = 100000

x_train = x[:train_examples]
x_test = x[train_examples:]
y_train = y[:train_examples]
y_test = y[train_examples:]
# Logistic Regression:
clf = LogisticRegression()
clf.fit(x_train, y_train)
pred = clf.predict_proba(x_test)[:,1]
auc = AUC(y_test, pred)
print("Logistic Regression AUC: ",auc)

# Random Forest, a simple model (100 trees) trained in parallel
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(x_train, y_train)
pred = clf.predict_proba(x_test)[:,1]
auc = AUC(y_test, pred)
print ("Random Forest AUC: ",auc)

# Finally, CV our results (a very simple 2-fold CV):
scores = cross_val_score(LogisticRegression(), x, y, scoring='roc_auc', cv=2)
print("Mean AUC: {:.2%}, std: {:.2%}".format(scores.mean(), scores.std()))
Example #17
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  feature_map,
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X_trn, y_trn = load_data(train_file)
    X_tst, _ = load_data(test_file)

    feature_map = pd.read_table(feature_map,
                                index_col=0,
                                header=None,
                                names=['feature_names', 'feature_type'])
    features = feature_map['feature_names'].values
    train_df = pd.DataFrame(data=X_trn.toarray(),
                            columns=feature_map['feature_names'])
    test_df = pd.DataFrame(data=X_tst.toarray(),
                           columns=feature_map['feature_names'])
    train_test = train_df.append(test_df)

    test_data = [
        test_df.loc[:, features].values[:, k]
        for k in range(test_df.loc[:, features].values.shape[1])
    ]

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=50, shuffle=True, random_state=SEED)

    vld_preds = np.zeros_like(y_trn)
    tst_preds = np.zeros((X_tst.shape[0], ))
    for cv_idx, (i_trn, i_vld) in enumerate(cv.split(X_trn, y_trn), 1):

        X_trn_cv = train_df.iloc[i_trn, :].reset_index(drop=True)
        X_vld_cv = train_df.iloc[i_vld, :].reset_index(drop=True)
        y_trn_cv = y_trn[i_trn]
        y_vld_cv = y_trn[i_vld]

        logging.info('Training model #{}'.format(cv_idx))

        clf = create_keras_embedding_model(train_test, features)
        clf.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=[auc])

        X_trn_cv = [
            X_trn_cv.loc[:, features].values[:, k]
            for k in range(X_trn_cv.loc[:, features].values.shape[1])
        ]
        X_vld_cv = [
            X_vld_cv.loc[:, features].values[:, k]
            for k in range(X_vld_cv.loc[:, features].values.shape[1])
        ]

        es = callbacks.EarlyStopping(monitor='val_auc',
                                     min_delta=0.001,
                                     patience=5,
                                     verbose=1,
                                     mode='max',
                                     baseline=None,
                                     restore_best_weights=True)

        rlr = callbacks.ReduceLROnPlateau(monitor='val_auc',
                                          factor=0.5,
                                          patience=3,
                                          min_lr=1e-6,
                                          mode='max',
                                          verbose=1)

        clf.fit(X_trn_cv,
                utils.to_categorical(y_trn_cv),
                validation_data=(X_vld_cv, utils.to_categorical(y_vld_cv)),
                verbose=0,
                batch_size=1024,
                callbacks=[es, rlr],
                epochs=50)

        vld_preds[i_vld] = clf.predict(X_vld_cv)[:, 1]

        logging.info('CV #{}: {:.4f}'.format(
            cv_idx, auc(y_trn[i_vld], vld_preds[i_vld])))

        if not retrain:
            tst_preds += (clf.predict(test_data)[:, 1] / N_FOLDS).ravel()

    logging.info('Saving validation predictions...')
    logging.info('CV: {:.4f}'.format(AUC(y_trn, vld_preds)))
    np.savetxt(predict_valid_file, vld_preds, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')

        clf = create_keras_embedding_model(train_test, features)
        clf.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=[auc])

        X_trn_all = [
            train_df.loc[:, features].values[:, k]
            for k in range(train_df.loc[:, features].values.shape[1])
        ]

        clf.fit(X_trn_all,
                utils.to_categorical(y_trn),
                validation_data=(X_trn_all, utils.to_categorical(y_trn)),
                verbose=0,
                batch_size=1024,
                callbacks=[es, rlr],
                epochs=50)

        tst_preds = (clf.predict(test_data)[:, 1]).ravel()

    logging.info('Saving normalized test predictions...')
    np.savetxt(predict_test_file, tst_preds, fmt='%.6f', delimiter=',')
Example #18
def train_predict(train_file,
                  test_file,
                  train_svm_file,
                  predict_valid_file,
                  predict_test_file,
                  model_file,
                  n_iter=100,
                  dim=4,
                  lrate=.1,
                  n_fold=5):

    dir_feature = os.path.dirname(train_file)
    dir_model = os.path.dirname(model_file)
    dir_val = os.path.dirname(predict_valid_file)

    feature_name = os.path.basename(train_file)[:-8]
    algo_name = 'ffm_{}_{}_{}'.format(n_iter, dim, lrate)
    model_name = '{}_{}'.format(algo_name, feature_name)
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training data')
    X, y = load_svmlight_file(train_svm_file)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        valid_train_file = os.path.join(dir_feature,
                                        '{}.trn{}.ffm'.format(feature_name, i))
        valid_test_file = os.path.join(dir_feature,
                                       '{}.val{}.ffm'.format(feature_name, i))
        valid_model_file = os.path.join(dir_model,
                                        '{}.trn{}.mdl'.format(model_name, i))
        valid_predict_file = os.path.join(dir_val,
                                          '{}.val{}.yht'.format(model_name, i))

        # if there is no CV training or validation file, then generate them
        # first.
        if (not os.path.isfile(valid_train_file)
                or not os.path.isfile(valid_test_file)):
            # generate libsvm files
            valid_train_svm_file = os.path.join(
                dir_feature, '{}.trn{}.sps'.format(feature_name, i))
            valid_test_svm_file = os.path.join(
                dir_feature, '{}.val{}.sps'.format(feature_name, i))

            if not os.path.isfile(valid_train_svm_file):
                dump_svmlight_file(X[i_trn],
                                   y[i_trn],
                                   valid_train_svm_file,
                                   zero_based=False)

            if not os.path.isfile(valid_test_svm_file):
                dump_svmlight_file(X[i_val],
                                   y[i_val],
                                   valid_test_svm_file,
                                   zero_based=False)

            # then convert libsvm files into libffm formats
            svm_to_ffm(valid_train_svm_file, valid_train_file, feature_name)
            svm_to_ffm(valid_test_svm_file, valid_test_file, feature_name)

        subprocess.call([
            "ffm-train", '-k', '{}'.format(dim), '-r',
            str(lrate), '-t',
            str(n_iter), '-p', valid_test_file, valid_train_file,
            valid_model_file
        ])

        subprocess.call([
            "ffm-predict", valid_test_file, valid_model_file,
            valid_predict_file
        ])

        p[i_val] = np.loadtxt(valid_predict_file)

    logging.info('AUC = {:.6f}'.format(AUC(y, p)))
    np.savetxt(predict_valid_file, p, fmt='%.6f')

    logging.info('Retraining with 100% data...')
    subprocess.call([
        "ffm-train", '-k', '{}'.format(dim), '-r',
        str(lrate), '-t',
        str(n_iter), '-p', test_file, train_file, model_file
    ])

    subprocess.call(["ffm-predict", test_file, model_file, predict_test_file])
Example #19
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import log_loss

import os

input_file = os.getenv('TRAINING')

#

d = pd.read_csv(input_file, header=0)
features = [f for f in list(d) if 'feature' in f]
train, val = train_test_split( d, test_size = 5000 )

# train, predict, evaluate

n_trees = 100

rf = RF( n_estimators = n_trees, verbose = True )
rf.fit(train[features], train.target)

p = rf.predict_proba(val[features])

ll = log_loss(val.target.values, p[:,1])
auc = AUC( val.target.values, p[:,1] )
print("AUC: {:.2%}, log loss: {:.2%}".format(auc, ll))
Example #20
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est=100,
                  n_leaf=200,
                  lrate=.1,
                  n_min=8,
                  subcol=.3,
                  subrow=.8,
                  subrow_freq=100,
                  n_stop=100,
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename=f'{model_name}.log')

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    params = {
        'random_state': SEED,
        'n_jobs': -1,
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': lrate,
        'num_leaves': n_leaf,
        'feature_fraction': subcol,
        'bagging_fraction': subrow,
        'bagging_freq': subrow_freq,
        'verbosity': -1,
        'min_child_samples': n_min,
        'metric': 'auc'
    }

    p = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    n_bests = []
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info(f'Training model #{i}')
        trn_lgb = lgb.Dataset(X[i_trn], label=y[i_trn])
        val_lgb = lgb.Dataset(X[i_val], label=y[i_val])

        logging.info('Training with early stopping')
        clf = lgb.train(params,
                        trn_lgb,
                        n_est,
                        val_lgb,
                        early_stopping_rounds=n_stop,
                        verbose_eval=100)
        n_best = clf.best_iteration
        n_bests.append(n_best)
        logging.info(f'best iteration={n_best}')

        p[i_val] = clf.predict(X[i_val])
        logging.info(f'CV #{i}: {AUC(y[i_val], p[i_val]):.4f}')

        if not retrain:
            p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        n_best = sum(n_bests) // N_FOLD
        clf = lgb.LGBMRegressor(n_estimators=n_best,
                                num_leaves=n_leaf,
                                learning_rate=lrate,
                                min_child_samples=n_min,
                                subsample=subrow,
                                subsample_freq=subrow_freq,
                                colsample_bytree=subcol,
                                seed=SEED)

        clf = clf.fit(X, y)
        p_tst = clf.predict(X_tst)

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #21
Y = data['future48_AKI2_overlap'].values
Ytrain, Ytest = Y[:cutoff], Y[cutoff:]
IDtrain = data['PAT_ENC_CSN_ID'].values[:cutoff]

selectTrain, selectTest = np.isfinite(Ytrain), np.isfinite(Ytest)
Xtrain, Ytrain, IDtrain = Xtrain[
    selectTrain, :], Ytrain[selectTrain], IDtrain[selectTrain]
Xtest, Ytest = Xtest[selectTest, :], Ytest[selectTest]

pIndexSub = getPatientIndices(IDtrain)
sampleWeights = np.zeros(len(Xtrain))
for i in tqdm(range(len(pIndexSub))):
    start, stop, length = pIndexSub[i, :]
    sampleWeights[start:stop] = 1 / length

#X = np.concatenate((valueFeatures, timeFeatures, data2[['los']].values,
#                    data2[['creatinine']].values), axis = 1)

##############################################################################

from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import roc_auc_score as AUC
from Helper.utilities import showCoef
model = LR(class_weight='balanced', C=1e-1)

model.fit(Xtrain, Ytrain)  #,sample_weight = sampleWeights)
P = model.predict_proba(Xtest)[:, 1]
model.coef_
print(AUC(Ytest, P))  #, sample_weight = sampleWeights))

## performance is around 0.83 currently
Example #22
 def on_epoch_end(self, epoch, logs={}):
     if epoch % self.interval == 0:
         y_pred = self.model.predict_proba(self.X_val, verbose=0)
         score = AUC(self.y_val, y_pred)
         #logging.info("interval evaluation - epoch: {:d} - score: {:.6f}".format(epoch, score))
         print( "interval evaluation - epoch: {:d} - score: {:.6f}".format(epoch, score))
Example #23
p_train = p[0:train_end]
p_test = p[test_start:]

###

lr = LR()  # default param values
lr.fit(p_train.reshape(-1, 1), y_train)  # LR needs X to be 2-dimensional
p_calibrated = lr.predict_proba(p_test.reshape(-1, 1))[:, 1]

###

acc = accuracy_score(y_test, np.round(p_test))
acc_calibrated = accuracy_score(y_test, np.round(p_calibrated))

auc = AUC(y_test, p_test)
auc_calibrated = AUC(y_test, p_calibrated)

ll = log_loss(y_test, p_test)
ll_calibrated = log_loss(y_test, p_calibrated)

print "accuracy - before/after:", acc, "/", acc_calibrated
print "AUC - before/after:     ", auc, "/", auc_calibrated
print "log loss - before/after:", ll, "/", ll_calibrated
"""
accuracy - before/after: 0.847788697789 / 0.846805896806
AUC - before/after:      0.878139845077 / 0.878139845077
log loss - before/after: 0.630525772871 / 0.364873617584
"""

###
Example #24
            'data/output/temp/v17_stage1_ver2_all_fold.csv',
            'data/output/temp/v18_stage1_ver2_all_fold.csv',
            'data/output/temp/v19_stage1_ver2_all_fold.csv',
            'data/output/temp/v20_stage1_ver2_all_fold.csv',
    ]:
        x = pd.read_csv(i)
        a = pd.concat([a, x], axis=1)

        cv_index = {}
        set_name = 'set{}'.format(set_index)
        for i in xrange(5):
            train_cv = set_data.loc[(set_data[set_name] != i).values,
                                    set_name].index
            test_cv = set_data.loc[(set_data[set_name] == i).values,
                                   set_name].index
            cv_index[i] = {}
            cv_index[i]['train'] = train_cv.values
            cv_index[i]['test'] = test_cv.values

        skf = pd.DataFrame(cv_index).stack().T
        auc = []
        for i in xrange(5):
            #print AUC(y.ix[skf['test'][i]].values, x.ix[skf['test'][i]].values)
            auc.append(
                AUC(y.ix[skf['test'][i]].values, x.ix[skf['test'][i]].values))

        set_index += 1
    print 'Per model, mean: {} std: {}'.format(np.mean(auc), np.std(auc))
    print 'Averaging AUC:{}'.format(AUC(y.values, a.mean(1).values))
    #AUC:0.842626778562
Example #25
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  cid_train_file, cid_test_file):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='xg_cid_{}.log'.format(feature_name))

    logging.info('Loading course IDs for training and test data')
    cid_trn = np.loadtxt(cid_train_file, dtype=int)
    cid_tst = np.loadtxt(cid_test_file, dtype=int)

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, y_tst = load_data(test_file)

    cv = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=2015)

    p = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        X_trn = X[i_trn]
        y_trn = y[i_trn]
        X_val = X[i_val]
        y_val = y[i_val]
        cid_valtrn = cid_trn[i_trn]
        cid_valtst = cid_trn[i_val]

        p_trn = np.zeros_like(y_trn)
        p_val = np.zeros_like(y_val)
        for j in range(39):
            idx_trn = np.where(cid_valtrn == j)[0]
            idx_val = np.where(cid_valtst == j)[0]

            clf = xgb.XGBClassifier(max_depth=PARAM[j][2],
                                    learning_rate=PARAM[j][1],
                                    n_estimators=PARAM[j][0],
                                    colsample_bytree=1,
                                    subsample=.4,
                                    nthread=6)

            clf.fit(X_trn[idx_trn], y_trn[idx_trn])
            p_trn[idx_trn] = clf.predict_proba(X_trn[idx_trn])[:, 1]
            p_val[idx_val] = clf.predict_proba(X_val[idx_val])[:, 1]
            logging.info('CID #{}: {:.4f}, {:.4f}'.format(
                j, AUC(y_trn[idx_trn], p_trn[idx_trn]),
                AUC(y_val[idx_val], p_val[idx_val])))

        logging.info('AUC TRN = {:.6f}'.format(AUC(y_trn, p_trn)))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y_val, p_val)))
        p[i_val] = p_val

    logging.info('AUC = {:.6f}'.format(AUC(y, p)))
    logging.info('Saving CV predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f')

    logging.info('Retraining with 100% data...')
    p_tst = np.zeros_like(y_tst)
    n_tst = len(p_tst)
    for j in range(39):
        idx_trn = np.where(cid_trn == j)[0]
        idx_tst = np.where(cid_tst == j)[0]
        logging.info('CID #{}: {:.2f}%'.format(j, len(idx_tst) / n_tst * 100))
        clf.fit(X[idx_trn], y[idx_trn])
        p_tst[idx_tst] = clf.predict_proba(X_tst[idx_tst])[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #26
def train_predict(train_feature_file,
                  test_feature_file,
                  predict_valid_file,
                  predict_test_file,
                  C=1.0,
                  class_weight='balanced',
                  max_iter=1000,
                  solver='lbfgs',
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_feature_file)
    X_tst, _ = load_data(test_feature_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True,
                         random_state=SEED).split(X, y)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}'.format(i))

        logging.info('Training Logistic Regression')
        clf = LogisticRegression(C=C,
                                 class_weight=class_weight,
                                 max_iter=max_iter,
                                 solver=solver,
                                 random_state=SEED)

        clf = clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p_val[i_val])))

        if not retrain:
            p_tst += clf.predict_proba(X_tst)[:, 1] / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p_val)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = LogisticRegression(C=C,
                                 class_weight=class_weight,
                                 max_iter=max_iter,
                                 solver=solver,
                                 random_state=SEED)

        clf = clf.fit(X, y)
        p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #27
    def sort_training_set(self, classifier='RF'):

        print "loading..."

        train = pd.read_csv(resource_filename('numerai.data', self.train_file_name))
        test = pd.read_csv(resource_filename('numerai.data', self.test_file_name))

        test.drop('t_id', axis=1, inplace=True)
        test['target'] = 0  # dummy for preserving column order when concatenating

        train['is_test'] = 0
        test['is_test'] = 1

        orig_train = train.copy()
        assert (np.all(orig_train.columns == test.columns))

        train = pd.concat((orig_train, test))
        train.reset_index(inplace=True, drop=True)

        x = train.drop(['is_test', 'target'], axis=1)
        y = train.is_test

        print "cross-validating..."

        n_estimators = 100
        if classifier == 'RF':
            clf = RF(bootstrap=True,
                     min_samples_leaf=3,
                     n_estimators=n_estimators,
                     max_features=20,
                     criterion='gini',
                     min_samples_split=20,
                     max_depth=None,
                     n_jobs=6)
        else:
            clf = LR(n_jobs=6)

        predictions = np.zeros(y.shape)

        cv = CV.StratifiedKFold(y, n_folds=5, shuffle=True, random_state=5678)

        for f, (train_i, test_i) in enumerate(cv):
            print "# fold {}, {}".format(f + 1, ctime())

            x_train = x.iloc[train_i]
            x_test = x.iloc[test_i]
            y_train = y.iloc[train_i]
            y_test = y.iloc[test_i]

            clf.fit(x_train, y_train)

            p = clf.predict_proba(x_test)[:, 1]

            auc = AUC(y_test, p)
            print "# AUC: {:.2%}\n".format(auc)

            predictions[test_i] = p

        train['p'] = predictions

        i = predictions.argsort()
        train_sorted = train.iloc[i]

        train_sorted = train_sorted.loc[train_sorted.is_test == 0]
        assert (train_sorted.target.sum() == orig_train.target.sum())

        train_sorted.drop('is_test', axis=1, inplace=True)
        train_sorted.to_csv(resource_filename('numerai.data', self.sorted_file_name), index=False)
Example #28
x_test = np.hstack((x_num_test, vec_x_cat_test))

if __name__ == "__main__":

    print "training..."

    n_trees = 100
    max_features = int(round(sqrt(x_train.shape[1]) *
                             2))  # try more features at each split
    max_features = 'auto'  # overrides the doubled value above; comment out to try max_features * 2
    verbose = 1
    n_jobs = 1

    rf = RF(n_estimators=n_trees,
            max_features=max_features,
            verbose=verbose,
            n_jobs=n_jobs)
    rf.fit(x_train, y_train)

    p = rf.predict_proba(x_test)

    auc = AUC(y_test, p[:, 1])
    print "AUC", auc

    # AUC 0.701579086548
    # AUC 0.676126704696

    # max_features * 2
    # AUC 0.710060065732
    # AUC 0.706282346719
Example #29
# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

###

print "Training the random forest (this may take a while)..."

forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1)
forest = forest.fit(train_data_features, train["sentiment"])

print "Predicting test labels...\n"
rf_p = forest.predict_proba(test_data_features)

auc = AUC(test['sentiment'].values, rf_p[:, 1])
print "random forest AUC:", auc

# a random score from a _random_ forest
# AUC: 0.919056767104

# let's define a helper function


def train_and_eval_auc(model, train_x, train_y, test_x, test_y):
    model.fit(train_x, train_y)
    p = model.predict_proba(test_x)
    auc = AUC(test_y, p[:, 1])
    return auc

Example #30
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est=100,
                  batch_size=1024,
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    dims = X.shape[1]
    logging.info('{} dims'.format(dims))

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros_like(y)
    p_tst = np.zeros((X_tst.shape[0], ))
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        clf = nn_model(dims)
        clf.fit_generator(generator=batch_generator(X[i_trn], y[i_trn],
                                                    batch_size, True),
                          nb_epoch=n_est,
                          samples_per_epoch=X[i_trn].shape[0],
                          verbose=1)

        p[i_val] = clf.predict_generator(generator=batch_generatorp(
            X[i_val], batch_size, False),
                                         val_samples=X[i_val].shape[0])[:, 0]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        if not retrain:
            p_tst += clf.predict_generator(
                generator=batch_generatorp(X_tst, batch_size, False),
                val_samples=X_tst.shape[0])[:, 0] / N_FOLD

    logging.info('Saving validation predictions...')
    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = nn_model(dims)
        clf.fit_generator(generator=batch_generator(X, y, batch_size, True),
                          nb_epoch=n_est)
        p_tst = clf.predict_generator(generator=batch_generatorp(
            X_tst, batch_size, False),
                                      val_samples=X_tst.shape[0])[:, 0]

    logging.info('Saving normalized test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')