def feature_selection(self,features_importance_print=False):
    """Rank the currently used features with an ExtraTreesClassifier.

    Fits a fresh ``ExtraTreesClassifier`` on the columns returned by
    ``self.used_features()`` of ``self.training_data`` against its
    ``"label"`` column, then records the result in
    ``self.features_importance_dict`` as ``{importance: feature_name}``.

    Because the dictionary is keyed by the importance value, features that
    share exactly the same importance collapse into one entry; the feature
    that comes first in ``self.used_features()`` is the one kept.

    :param features_importance_print: when true, print one
        ``"<feature> <importance>"`` line per entry of
        ``self.features_importance_dict``, highest importance first.
    """
    clf = ExtraTreesClassifier()
    features_available = self.used_features()
    # Let every split consider all of the available features.
    clf.max_features = len(features_available)
    clf = clf.fit(self.training_data[features_available].values, self.training_data["label"])

    # Pair each importance with its feature directly instead of searching the
    # importance array again for every value.  Walking the pairs in reverse
    # means that, on ties, the earliest feature is written last and wins.
    ranked_pairs = list(zip(clf.feature_importances_, features_available))
    for importance, feature in reversed(ranked_pairs):
        self.features_importance_dict[importance] = feature

    if features_importance_print:
        for feature_score in sorted(self.features_importance_dict, reverse=True):
            # Single formatted string: prints the same on Python 2 and 3.
            print("%s %s" % (self.features_importance_dict[feature_score], feature_score))
Exemple #2
0
    # Fragment of a larger routine (its header is not visible here).
    # Build every combination of the candidate values: each grid entry is a
    # tuple (n_estimators, min_samples_leaf, min_samples_split, max_features),
    # matching how the entries are unpacked in the loop below.
    param_grid = tuple([n_vals, n_minleaf, n_minsplit, n_maxfeat])
    param_grid = list(product(*param_grid))

    # storage structure for forecasts
    # One column per parameter combination; rows follow xtrain / xtest.
    # NOTE(review): mfull is allocated but never written in the visible code.
    mvalid = np.zeros((xtrain.shape[0], len(param_grid)))
    mfull = np.zeros((xtest.shape[0], len(param_grid)))

    ## build 2nd level forecasts
    for i in range(len(param_grid)):
        print "processing parameter combo:", i
        # configure model with i-th combo of parameters
        x = param_grid[i]
        model.n_estimators = x[0]
        model.min_samples_leaf = x[1]
        model.min_samples_split = x[2]
        model.max_features = x[3]

        # loop over folds
        for j in range(0, n_folds):
            # idx0: rows outside fold j (training part); idx1: rows in fold j (held out).
            # Assumes fold_index is a NumPy array so the comparison is elementwise -- TODO confirm.
            idx0 = np.where(fold_index != j)
            idx1 = np.where(fold_index == j)
            # np.where returns a tuple, so indexing with it adds a leading axis
            # of length 1; the trailing [0] removes that axis again.
            x0 = np.array(xtrain)[idx0, :][0]
            x1 = np.array(xtrain)[idx1, :][0]
            y0 = np.array(y_train)[idx0]
            y1 = np.array(y_train)[idx1]

            # fit the model on the rows outside fold j, then store the
            # predicted probabilities (column 1 of predict_proba) for the
            # held-out rows in column i of mvalid
            model.fit(x0, y0)
            mvalid[idx1, i] = model.predict_proba(x1)[:, 1]
            # NOTE(review): predict_proba is evaluated a second time on the same
            # data here; the result above could be reused.
            y_pre = model.predict_proba(x1)[:, 1]
            # NOTE(review): scores is overwritten on every fold and is not read
            # anywhere in the visible code.
            scores = roc_auc_score(y1, y_pre)
    # Fragment of a larger routine (its header is not visible here).
    # Record the run configuration in the already-open file f1, then close it.
    f1.write('\nmodel type:'); f1.write(str(model_type))
    f1.write('\nseed value: '); f1.write(str(seed_value))    
    f1.write('\nparameter grid \n'); f1.write(str(param_grid)    )
    f1.close()
    
    # storage structure for forecasts
    # One column per parameter combination; rows follow xtrain / xtest.
    # NOTE(review): neither array is written in the visible part of this fragment.
    mvalid = np.zeros((xtrain.shape[0],len(param_grid)))
    mfull = np.zeros((xtest.shape[0],len(param_grid)))
    
    ## build 2nd level forecasts
    for i in range(len(param_grid)):        
            print "processing parameter combo:", i
            # configure model with i-th combo of parameters
            x = param_grid[i]
            model.max_depth = int(x[0])
            # NOTE(review): max_features is assigned twice in a row, so the
            # value taken from x[1] is immediately replaced by x[2]. One of
            # these lines was presumably meant for a different parameter --
            # confirm against the definition of param_grid.
            model.max_features = int(x[1])
            model.max_features = int(x[2])
            model.min_samples_leaf = int(x[3])
            model.min_weight_fraction_leaf = x[4]
            model.n_estimators = int(x[5])
            
            # loop over folds
            for j in range(0,n_folds):
                # idx0: rows outside fold j (training part); idx1: rows in fold j (held out).
                # Assumes fold_index is a NumPy array so the comparison is elementwise -- TODO confirm.
                idx0 = np.where(fold_index != j)
                idx1 = np.where(fold_index == j)
                # np.where returns a tuple, so indexing with it adds a leading
                # axis of length 1; the trailing [0] removes that axis again.
                x0 = np.array(xtrain)[idx0,:][0];
                x1 = np.array(xtrain)[idx1,:][0]
                y0 = np.array(y)[idx0];
                y1 = np.array(y)[idx1]

                # fit on the rows outside fold j
                model.fit(x0, y0)
Exemple #4
0
    def init_hyperparameters(self, trial, X, y):
        """Draw this selector's hyperparameters from ``trial``.

        Stores the results on the instance: ``name``, ``k_fraction``,
        ``sparse`` (always ``False``) and ``score_func``.  ``score_func`` is
        either a function from ``sklearn.feature_selection`` or
        ``model_score`` bound to a freshly configured estimator.  ``X`` and
        ``y`` are part of the signature but are not read here.
        """
        self.name = id_name('SelectKBest')
        self.k_fraction = trial.suggest_uniform(self.name + 'k_fraction', 0.0, 1.0)
        self.sparse = False

        choice = trial.suggest_categorical(
            self.name + 'score_func',
            ['chi2', 'f_classif', 'mutual_info', 'ExtraTreesClassifier', 'LinearSVC'])

        # Option label -> name of the scoring function in sklearn.feature_selection.
        univariate = {
            'chi2': 'chi2',
            'f_classif': 'f_classif',
            'mutual_info': 'mutual_info_classif',
        }
        if choice in univariate:
            self.score_func = getattr(sklearn.feature_selection, univariate[choice])
            return

        # Model-based scorers: create the estimator, collect its settings
        # (sampling from the trial in a fixed order), then apply them.
        if choice == 'ExtraTreesClassifier':
            prefix = self.name + '_' + choice + '_'
            estimator = ExtraTreesClassifier()
            settings = [
                ("n_estimators", 100),
                ("criterion", trial.suggest_categorical(
                    prefix + "criterion", ["gini", "entropy"])),
                ("max_features", trial.suggest_uniform(
                    prefix + "max_features", 0, 1)),
                ("max_depth", None),
                ("max_leaf_nodes", None),
                ("min_samples_split", trial.suggest_int(
                    prefix + "min_samples_split", 2, 20, log=False)),
                ("min_samples_leaf", trial.suggest_int(
                    prefix + "min_samples_leaf", 1, 20, log=False)),
                ("min_weight_fraction_leaf", 0.),
                ("min_impurity_decrease", 0.),
                ("bootstrap", trial.suggest_categorical(
                    prefix + "bootstrap", [True, False])),
            ]
        elif choice == 'LinearSVC':
            prefix = self.name + '_' + choice + '_'
            estimator = sklearn.svm.LinearSVC()
            settings = [
                ("penalty", "l1"),
                ("loss", "squared_hinge"),
                ("dual", False),
                ("tol", trial.suggest_loguniform(prefix + "tol", 1e-5, 1e-1)),
                ("C", trial.suggest_loguniform(prefix + "C", 0.03125, 32768)),
                ("multi_class", "ovr"),
                ("fit_intercept", True),
                ("intercept_scaling", 1),
            ]
        else:
            # Unrecognised label: leave score_func untouched.
            return

        for attribute, value in settings:
            setattr(estimator, attribute, value)

        self.score_func = functools.partial(model_score, estimator=estimator)
Exemple #5
0
    # Fragment of a larger routine (its header is not visible here).
    # Finish recording the run configuration in the already-open file f1,
    # then close it.
    f1.write(str(seed_value))
    f1.write('\nparameter grid \n')
    f1.write(str(param_grid))
    f1.close()

    # storage structure for forecasts
    # One column per parameter combination; rows follow xtrain / xtest.
    # NOTE(review): neither array is written in the visible part of this fragment.
    mvalid = np.zeros((xtrain.shape[0], len(param_grid)))
    mfull = np.zeros((xtest.shape[0], len(param_grid)))

    ## build 2nd level forecasts
    for i in range(len(param_grid)):
        print "processing parameter combo:", i
        # configure model with i-th combo of parameters
        x = param_grid[i]
        model.max_depth = int(x[0])
        # NOTE(review): max_features is assigned twice in a row, so the value
        # taken from x[1] is immediately replaced by x[2]. One of these lines
        # was presumably meant for a different parameter -- confirm against
        # the definition of param_grid.
        model.max_features = int(x[1])
        model.max_features = int(x[2])
        model.min_samples_leaf = int(x[3])
        model.min_weight_fraction_leaf = x[4]
        model.n_estimators = int(x[5])

        # loop over folds
        for j in range(0, n_folds):
            # idx0: rows outside fold j (training part); idx1: rows in fold j (held out).
            # Assumes fold_index is a NumPy array so the comparison is elementwise -- TODO confirm.
            idx0 = np.where(fold_index != j)
            idx1 = np.where(fold_index == j)
            # np.where returns a tuple, so indexing with it adds a leading axis
            # of length 1; the trailing [0] removes that axis again.
            x0 = np.array(xtrain)[idx0, :][0]
            x1 = np.array(xtrain)[idx1, :][0]
            y0 = np.array(y)[idx0]
            y1 = np.array(y)[idx1]

            # fit on the rows outside fold j
            model.fit(x0, y0)
    # Fragment of a larger routine (its header is not visible here).
    # Build every combination of the candidate values: each grid entry is a
    # tuple (n_estimators, min_samples_leaf, min_samples_split, max_features),
    # matching how the entries are unpacked in the loop below.
    param_grid = tuple([n_vals, n_minleaf, n_minsplit, n_maxfeat])
    param_grid = list(product(*param_grid))

    # storage structure for forecasts
    # One column per parameter combination; rows follow xtrain / xtest.
    # NOTE(review): mfull is allocated but never written in the visible code.
    mvalid = np.zeros((xtrain.shape[0],len(param_grid)))
    mfull = np.zeros((xtest.shape[0],len(param_grid)))
    
    ## build 2nd level forecasts
    for i in range(len(param_grid)):        
            print "processing parameter combo:", i
            # configure model with i-th combo of parameters
            x = param_grid[i]
            model.n_estimators = x[0]
            model.min_samples_leaf = x[1]     
            model.min_samples_split = x[2]
            model.max_features = x[3]
            
            # loop over folds
            for j in range(0,n_folds):
                # idx0: rows outside fold j (training part); idx1: rows in fold j (held out).
                # Assumes fold_index is a NumPy array so the comparison is elementwise -- TODO confirm.
                idx0 = np.where(fold_index != j)
                idx1 = np.where(fold_index == j)
                # np.where returns a tuple, so indexing with it adds a leading
                # axis of length 1; the trailing [0] removes that axis again.
                x0 = np.array(xtrain)[idx0,:][0];
                x1 = np.array(xtrain)[idx1,:][0]
                y0 = np.array(y_train)[idx0];
                y1 = np.array(y_train)[idx1]

                # fit the model on the rows outside fold j, then store the
                # predicted probabilities (column 1 of predict_proba) for the
                # held-out rows in column i of mvalid
                model.fit(x0, y0)
                mvalid[idx1,i] = model.predict_proba(x1)[:,1]
                # NOTE(review): predict_proba is evaluated a second time on the
                # same data here; the result above could be reused.
                y_pre = model.predict_proba(x1)[:,1]
                # NOTE(review): scores is overwritten on every fold and is not
                # read anywhere in the visible code.
                scores = roc_auc_score(y1,y_pre)