def task_5fold(fold):
    """Train the boosted, pruned tree ensemble and return its test ROC AUC.

    Reads the train and test CSV files located under the module-level
    ``url_path``, fits ``boost_modify.AdaBoostClassifier`` on the
    ``feature_NAME`` columns against the ``FLAG`` column, writes the test
    frame (with an added ``LABEL_PREDICTION`` column) to
    ``./output/predicts.csv``, prints the AUC and returns it.

    Args:
        fold: Fold index. It is only used to label the printed AUC line;
            every call reads the same two CSV files.

    Returns:
        The area under the ROC curve on the test set, computed from the
        second column of ``predict_proba`` with ``pos_label=1``.
    """
    # NOTE: this function must not contain function-scope imports of
    # ``boost_modify`` / ``tree_prune``. Such an import (even when it is
    # unreachable) makes the name local to the whole function, and the first
    # use below would then raise UnboundLocalError.
    train_path = url_path + '/series_train_k_DBSTAR.csv'
    test_path = url_path + '/series_test_k_DBSTAR.csv'
    new_path = './output/predicts.csv'

    train_df = pd.read_csv(train_path, header=0)
    test_df = pd.read_csv(test_path, header=0)

    train_data = train_df.loc[:, feature_NAME]
    train_target = train_df.loc[:, 'FLAG']
    test_data = test_df.loc[:, feature_NAME]
    test_target = test_df.loc[:, 'FLAG']

    # Boost-pruning
    boost_pruning = boost_modify.AdaBoostClassifier(
        learning_rate=0.5,
        n_estimators=450,
        base_estimator=tree_prune.DecisionTreeClassifier(max_depth=15),
        algorithm='SAMME')
    boost_pruning.fit(train_data, train_target, v_Folds=5)

    # Persist the hard predictions next to the original test rows.
    test_df['LABEL_PREDICTION'] = boost_pruning.predict(test_data)
    test_df.to_csv(new_path)

    # Score with the predicted probability of the positive class (column 1).
    bp_prob = boost_pruning.predict_proba(test_data)
    fpr, tpr, _ = roc_curve(test_target, bp_prob[:, 1], pos_label=1)
    roc_auc = auc(fpr, tpr)
    print('Fold %d AUC:%f' % (fold, roc_auc))
    return roc_auc

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import boost_modify
from sklearn.tree import tree_prune
import multiprocessing


def task_5fold(fold):
Beispiel #3
0
 def fit(self, X, y):
     '''
     Fit a decision tree on (X, y) and derive tree-size parameters from it.

     A classifier is used when y has at most 10 distinct values, otherwise
     a regressor. How the parameters are obtained depends on self.method:

         'none':  measured on the unconstrained fitted tree
         'cal':   measured after refitting with the minimum sample counts
                  returned by self.get_min_samples(len(y))
         'tune':  whatever self.tune_paras returns
         'prune': fit a tree (depth capped at 25, 80% of the calculated
                  minimum sample counts), prune it to the leaf count chosen
                  by self.get_optimal_leaves, then measure the pruned tree
         'both':  like 'prune', but the tree is first configured from the
                  result of self.tune_paras
         other:   a message is printed and fallback values are used

     input:
         X: numpy array
         y: numpy array

     return:
         self, with clf, max_depth, min_samples_split, min_samples_leaf
         and max_leaf_nodes set
     '''
     # Few distinct target values -> classification, otherwise regression.
     if len(np.unique(y)) <= 10:
         clf = tree.DecisionTreeClassifier()
         clf_prune = tree_prune.DecisionTreeClassifier()
         scoring = 'roc_auc'
     else:
         clf = tree.DecisionTreeRegressor()
         clf_prune = tree_prune.DecisionTreeRegressor()
         scoring = 'r2'
     clf.random_state = self.random_state
     clf_prune.random_state = self.random_state

     # Baseline: an unconstrained tree and its measured size statistics.
     clf.fit(X, y)
     depth = tree_prune.get_max_depth(clf)
     leaf_samples, node_samples = tree_prune.get_min_sample_leaf_split(clf)
     leaf_nodes = tree_prune.get_n_leaves(clf)

     if self.method == 'none':
         paras = {
             'max_depth': depth,
             'min_samples_split': node_samples,
             'min_samples_leaf': leaf_samples,
             'max_leaf_nodes': leaf_nodes,
         }

     elif self.method == 'cal':
         min_samples_split, min_samples_leaf = self.get_min_samples(len(y))
         clf.min_samples_split = min_samples_split
         clf.min_samples_leaf = min_samples_leaf
         clf.fit(X, y)
         depth = tree_prune.get_max_depth(clf)
         leaf_samples, node_samples = tree_prune.get_min_sample_leaf_split(clf)
         paras = {
             'max_depth': depth,
             'min_samples_split': node_samples,
             'min_samples_leaf': leaf_samples,
             'max_leaf_nodes': tree_prune.get_n_leaves(clf),
         }

     elif self.method == 'tune':
         # NOTE(review): the attribute assignments at the end of this method
         # read 'max_leaf_nodes' from this dict -- confirm that tune_paras
         # always returns that key.
         paras = self.tune_paras(clf, X, y,
                                 random_state=self.random_state,
                                 cv=self.cv,
                                 scoring=scoring,
                                 n_jobs=self.n_jobs)

     elif self.method in ('prune', 'both'):
         # Step 1: configure the tree that will be pruned.
         if self.method == 'prune':
             # Cap the depth at 25 and relax the calculated minimum sample
             # counts to 80% before growing the tree.
             clf_prune.max_depth = min(depth, 25)
             min_samples_split, min_samples_leaf = self.get_min_samples(len(y))
             clf_prune.min_samples_split = int(0.8 * min_samples_split)
             clf_prune.min_samples_leaf = int(0.8 * min_samples_leaf)
             alpha = self.prune_alpha
             banner = 'get optimal n_leaves,max leaves of the tree is %d......\n'
         else:
             print('tuning,please wait...\n')
             tune_dict = self.tune_paras(clf, X, y,
                                         random_state=self.random_state,
                                         cv=self.cv,
                                         scoring=scoring,
                                         n_jobs=self.n_jobs)
             clf_prune.max_depth = tune_dict['max_depth']
             clf_prune.min_samples_split = tune_dict['min_samples_split']
             clf_prune.min_samples_leaf = tune_dict['min_samples_leaf']
             alpha = self.both_alpha
             banner = 'get optimal n_leaves...,max leaves of the tree is %d\n'

         clf_prune.fit(X, y)

         # Step 2: search for the best leaf count on a copy, so that
         # clf_prune itself is only pruned once, below.
         clf_copy = deepcopy(clf_prune)
         print(banner % tree_prune.get_n_leaves(clf_copy))
         n_leaves, means, stds, x = self.get_optimal_leaves(
             clf_copy,
             X,
             y,
             n_iterations=self.n_iter,
             random_state=self.random_state,
             alpha=alpha,
             max_leaves=self.max_leaves)

         print('optimal n_leaves is: %d, begin pruning tree...\n' % n_leaves)
         clf_prune.prune(n_leaves)
         print('pruning is finished')

         # Step 3: read the parameters off the pruned tree.
         max_depth = tree_prune.get_max_depth(clf_prune)
         min_sample_leaf, min_sample_split = tree_prune.get_min_sample_leaf_split(clf_prune)
         paras = {
             'max_depth': max_depth,
             'min_samples_split': min_sample_split,
             'min_samples_leaf': min_sample_leaf,
             'max_leaf_nodes': n_leaves,
         }
         self.plot_prune(x, means, stds)

     else:
         print("get empty parameters dict, only 'none', 'cal', 'tune', 'prune' and 'both' are supported in the method currently")
         # Fall back to scikit-learn's defaults (split=2, leaf=1). The two
         # values used to be swapped, and 1 is not a valid integer
         # min_samples_split.
         paras = {
             'max_depth': None,
             'min_samples_split': 2,
             'min_samples_leaf': 1,
             'max_leaf_nodes': None,
         }

     # Only 'prune' and 'both' fit clf_prune; every other method must expose
     # the fitted plain tree. (The previous test, "== 'both' or 'prune'", was
     # always true and exposed a never-fitted clf_prune for the other methods.)
     if self.method in ('both', 'prune'):
         self.clf = clf_prune
     else:
         self.clf = clf

     self.max_depth = paras['max_depth']
     self.min_samples_split = paras['min_samples_split']
     self.min_samples_leaf = paras['min_samples_leaf']
     self.max_leaf_nodes = paras['max_leaf_nodes']
     return self