Beispiel #1
0
def extratrees(df: pd.DataFrame, target: pd.DataFrame, test: pd.DataFrame,
               parameters: Dict):
    """Build stacked ExtraTrees features with 5-fold cross-validation.

    For every fold an ``ExtraTreesClassifier`` is fitted on the training part;
    its class probabilities for the held-out part fill the out-of-fold rows,
    and its class probabilities for ``test`` are averaged over the folds.

    Args:
        df: Training features, one row per sample.
        target: Training labels, aligned positionally with ``df``.
        test: Test features with the same columns as ``df``.
        parameters: Unused by this function; kept for interface compatibility.

    Returns:
        Array of shape ``(len(test), 9)`` with the fold-averaged test
        probabilities.

    Side effects:
        Prints the per-fold log loss and some shapes, and writes the combined
        train/test probability table to ``data/09_oof/extra_n3.csv``.
    """
    n_splits = 5
    n_classes = 9  # width of the probability matrix the classifier must return
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Rows [0, len(target)) hold out-of-fold train predictions, the remaining
    # rows hold the averaged test predictions.
    oof = np.zeros((df.shape[0] + test.shape[0], n_classes))

    for trn_idx, val_idx in folds.split(df, target):
        train_x = df.iloc[trn_idx, :].values
        val_x = df.iloc[val_idx, :].values
        # KFold yields positional indices, so select labels positionally;
        # plain ``target[idx]`` would be a label (Series) or column
        # (DataFrame) lookup.
        train_y = target.iloc[trn_idx].values
        val_y = target.iloc[val_idx].values

        classifier = ExtraTreesClassifier(n_jobs=14,
                                          n_estimators=100,
                                          max_depth=12)
        classifier.fit(train_x, train_y)

        y_hat = classifier.predict_proba(val_x)

        print(log_loss(val_y, y_hat))
        print(oof.shape, y_hat.shape)
        oof[val_idx] = y_hat
        pred = classifier.predict_proba(test.values)

        oof[len(target):, :] += pred / n_splits

    print(oof.shape)
    oof = pd.DataFrame(oof)
    oof.to_csv("data/09_oof/extra_{}.csv".format("n3"))
    return oof.iloc[len(target):].values
Beispiel #2
0
def train_ensemble(train_X, train_y, test_X):
    """Fit an ExtraTrees model on TF-IDF features and save stacking predictions.

    The TF-IDF weights are learned from the training matrix only and then
    applied to the test matrix, so both sets are expressed in the same feature
    space the model was trained on.

    Args:
        train_X: Training count matrix.
        train_y: Training labels.
        test_X: Test count matrix with the same columns as ``train_X``.

    Side effects:
        Writes class probabilities for the test set to
        ``./test/extratrees.stack.csv`` and for the training set to
        ``./train/extratrees.stack.csv`` via ``save_prediction``.
    """
    tfidf = TfidfTransformer()
    # ``astype`` already returns a new array, so the inputs are not modified.
    train_set_X = tfidf.fit_transform(train_X.astype(np.float32)).toarray()
    # Reuse the IDF weights fitted on the training data; refitting on the
    # test set would give the model features on a different scale.
    test_set_X = tfidf.transform(test_X.astype(np.float32)).toarray()

    model = ExtraTreesClassifier(n_estimators=300,
                                 criterion="entropy",
                                 max_features=30,
                                 max_depth=25)

    model.fit(train_set_X, train_y)

    pred_y = model.predict_proba(test_set_X)
    save_prediction("./test/extratrees.stack.csv", pred_y)

    # In-sample probabilities for the training rows (not out-of-fold).
    pred_y = model.predict_proba(train_set_X)
    save_prediction("./train/extratrees.stack.csv", pred_y)
Beispiel #3
0
def runET(train_X, train_y, test_X, test_y=None, test_X2=None, rounds=100,
          depth=20, leaf=10, feat=0.2, min_data_split_val=2, seed_val=0,
          job=-1):
    """Fit an ExtraTreesClassifier and return positive-class probabilities.

    Args:
        train_X, train_y: Training features and labels.
        test_X: Features to score.
        test_y: Optional labels for ``test_X``; when given, train and test
            ROC AUC are computed and printed.
        test_X2: Optional second feature set to score.
        rounds, depth, leaf, feat, min_data_split_val, seed_val, job:
            Forwarded to the classifier as ``n_estimators``, ``max_depth``,
            ``min_samples_leaf``, ``max_features``, ``min_samples_split``,
            ``random_state`` and ``n_jobs``.

    Returns:
        ``(test_preds, test_loss, test_preds2, model)``. ``test_loss`` is the
        test ROC AUC, or 0 when ``test_y`` is None; ``test_preds2`` is 0 when
        ``test_X2`` is None.
    """
    settings = {
        "n_estimators": rounds,
        "max_depth": depth,
        "min_samples_split": min_data_split_val,
        "min_samples_leaf": leaf,
        "max_features": feat,
        "n_jobs": job,
        "random_state": seed_val,
    }
    forest = ExtraTreesClassifier(**settings)
    forest.fit(train_X, train_y)

    # Column 1 is the probability of the second class.
    train_scores = forest.predict_proba(train_X)[:, 1]
    test_scores = forest.predict_proba(test_X)[:, 1]

    if test_X2 is not None:
        extra_scores = forest.predict_proba(test_X2)[:, 1]
    else:
        extra_scores = 0

    if test_y is None:
        return test_scores, 0, extra_scores, forest

    train_auc = metrics.roc_auc_score(train_y, train_scores)
    test_auc = metrics.roc_auc_score(test_y, test_scores)
    print("Depth, leaf, feat : ", depth, leaf, feat)
    print("Train and Test loss : ", train_auc, test_auc)
    return test_scores, test_auc, extra_scores, forest
Beispiel #4
0
def try_params(n_iterations, params):
    """Fit an ``XT`` classifier for a given budget and report train/test metrics.

    The number of trees is ``n_iterations`` scaled by the module-level
    ``trees_per_iteration``. Training and evaluation data are read from the
    module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

    NOTE(review): ``XT`` is presumably an alias for ExtraTreesClassifier;
    confirm against the file's imports.

    Returns a dict holding the test-set log loss (under both 'loss' and
    'log_loss') and the test-set AUC.
    """

    n_estimators = int(round(n_iterations * trees_per_iteration))
    print "n_estimators:", n_estimators
    pprint(params)

    clf = XT(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)
    clf.fit(x_train, y_train)

    # Training-set metrics; column 1 of predict_proba is used as the score.
    p = clf.predict_proba(x_train)[:, 1]

    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))

    print "\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc)

    # Test-set metrics; ll/auc/acc are overwritten, so the returned values
    # are the test-set ones.

    p = clf.predict_proba(x_test)[:, 1]

    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))

    print "# testing  | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc)

    return {'loss': ll, 'log_loss': ll, 'auc': auc}
Beispiel #5
0
def predict_et():
    """Train an ExtraTrees model on the CSV data and write test probabilities.

    Reads ``data/X_train.csv``, ``data/y_train.csv``, ``data/X_test.csv`` and
    ``data/y_test.csv``, using the ``fault_severity`` column as the label.
    Prints the train and test accuracy and the multiclass log loss on the test
    set, then writes the three class-probability columns to ``et_output.csv``
    indexed by the test ``id`` column.
    """
    X = pd.read_csv('data/X_train.csv', header=0)
    y = pd.read_csv('data/y_train.csv', header=0)['fault_severity']

    testX = pd.read_csv('data/X_test.csv', header=0)
    testY = pd.read_csv('data/y_test.csv', header=0)['fault_severity']

    et = ExtraTreesClassifier(n_estimators=440, random_state=1)
    et.fit(X, y)
    print(et.score(X, y))
    print(et.score(testX, testY))

    # Predict once and reuse the result for both the report and the file.
    test_proba = et.predict_proba(testX)

    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    submission = pd.DataFrame(test_proba,
                              index=testX.id,
                              columns=pred_cols)
    print(multiclass_log_loss(testY.values, submission.values))

    submission.to_csv('et_output.csv', index_label='id')
Beispiel #6
0
def stack_extra_trees_layer2(features, labels, test_feature):
    """Train one ExtraTrees model per stacking fold and collect its scores.

    Args:
        features: Path of a NumPy file with the training features.
        labels: Training labels, passed on to ``stack_split``.
        test_feature: Path of a NumPy file with the test features.

    Returns:
        ``(fold_score, test_score)``: ``fold_score`` is a flat list with the
        second-class probability of every held-out sample, concatenated fold
        by fold; ``test_score`` is a list with one list of second-class test
        probabilities per fold model.

    Side effects:
        Prints progress and dumps every fitted model with ``joblib`` to
        ``model_path + "ET_layer_2_model_<k>.pkl"``.
    """
    features = np.load(features)
    test = np.load(test_feature)
    fold_split, feature_split, label_split = stack_split(features, labels, 5)
    fold_score = []
    test_score = []
    print("\nInitiate stack extra_trees")
    for fold_no in range(1, len(fold_split) + 1):
        print("\nProcessing random forest model number:{}".format(fold_no))
        extra_trees = ExtraTreesClassifier(n_estimators=450,
                                           max_depth=4,
                                           criterion='entropy')
        extra_trees.fit(feature_split["feature_{}".format(fold_no)],
                        label_split["label_{}".format(fold_no)])
        print("Training complete")
        stack_score = extra_trees.predict_proba(
            fold_split["fold_{}".format(fold_no)])
        print("fold score predicted")
        # Score the loaded test array; ``test_feature`` is only its file path.
        test_prediction = extra_trees.predict_proba(test)
        print("test score predicted")
        test_score.append(test_prediction[:, 1].tolist())
        fold_score += stack_score[:, 1].tolist()
        joblib.dump(extra_trees,
                    model_path + "ET_layer_2_model_{}.pkl".format(fold_no))
        print("ET model nubmer:{}".format(fold_no) + " complete")
    return fold_score, test_score
Beispiel #7
0
    def objective( self, args ):
        """Hyperopt objective: fit ExtraTrees and score a tuned decision threshold.

        ``args`` is converted with ``self.input_converter`` into keyword
        arguments for ``ExtraTreesClassifier``. The model is fitted on the
        training split, a probability threshold is chosen on the validation
        split with ``self.find_best_threshold`` and then applied to the test
        split.

        Side effects: appends ``(untuned F1, tuned F1, threshold)`` to
        ``self.improved``, prints the scores and parameters, and plots the
        test confusion matrix.

        Returns a hyperopt result dict whose loss is the negated tuned F1.
        """
        args_ = self.input_converter( args )
        print(args_)
        try:
            CLASSIFIER = ExtraTreesClassifier( random_state = 42, **args_ )
        except TypeError:
            # ``args_`` already carries its own random_state (duplicate keyword).
            CLASSIFIER = ExtraTreesClassifier( **args_ )

        CLASSIFIER.fit( self.X_train, self.y_train )

        # Baseline: F1 of the default (untuned) class predictions.
        nt_preds = CLASSIFIER.predict( self.X_test )
        nt_score = f1_score( self.y_test, nt_preds )

        # Only the threshold is used; the second return value is discarded.
        pred_ = CLASSIFIER.predict_proba( self.X_val )[ :, 1 ]
        best, _ = self.find_best_threshold( pred_ )

        preds = CLASSIFIER.predict_proba( self.X_test )[ :, 1 ]
        test_preds = [ 1 if p > best else 0 for p in preds ]

        score = f1_score( self.y_test, test_preds )
        self.improved.append( ( nt_score, score, best ) )
        print( '\n ============================ \n {} \n ============================ \n'.format( nt_score ) )
        print( '\n ============================ \n {} \n ============================ \n'.format( score ) )
        print( '\n ============================ \n {} \n ============================ \n'.format( best ) )
        print( '\n {} \n'.format( args_ ) )
        cm = np.array( confusion_matrix( self.y_test, test_preds ) )
        plot_confusion_matrix( cm = cm, target_names = [ 'nothing', 'spike' ] )

        return {'loss': -score, 'status': STATUS_OK}
Beispiel #8
0
class MyExtraTree(MyClassifier):
    """Wrapper around ``ExtraTreesClassifier`` with feature-importance helpers."""

    def __init__(self, params=None):
        """Create the classifier from a dict of constructor keyword arguments."""
        # Copy the parameters: a shared default dict or the caller's own dict
        # must not be modified later by update_params().
        self._params = dict(params) if params else {}
        self._extree = ExtraTreesClassifier(**self._params)

    def update_params(self, updates):
        """Merge ``updates`` into the parameters and rebuild an unfitted model."""
        self._params.update(updates)
        self._extree = ExtraTreesClassifier(**self._params)

    def fit(self, Xtrain, ytrain):
        """Fit the underlying classifier."""
        self._extree.fit(Xtrain, ytrain)

    def predict_proba(self, Xtest, option=None):
        """Return the second-class probability column; ``option`` is unused."""
        return self._extree.predict_proba(Xtest)[:, 1]

    def predict_proba_multi(self, Xtest, option=None):
        """Return the full class-probability matrix; ``option`` is unused."""
        return self._extree.predict_proba(Xtest)

    def plt_feature_importance(self, fname_list, f_range=None):
        """Plot feature importances as a horizontal bar chart.

        Args:
            fname_list: Feature names, in feature-column order.
            f_range: Ranks (0 = most important) to show; all when empty.
        """
        importances = self._extree.feature_importances_

        # The spread of the importance over the individual trees is drawn as
        # the error bar.
        std = np.std([tree.feature_importances_
                      for tree in self._extree.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        fname_array = np.array(fname_list)

        if not f_range:
            f_range = range(indices.shape[0])

        n_f = len(f_range)
        shown = indices[f_range]

        plt.figure()
        plt.title("Extra Tree Feature importances")
        plt.barh(range(n_f), importances[shown],
                 color="b", xerr=std[shown], ecolor='k', align="center")
        plt.yticks(range(n_f), fname_array[shown])
        plt.ylim([-1, n_f])
        plt.show()

    def list_feature_importance(self, fname_list, f_range=None,
                                return_list=False):
        """Print the feature ranking and optionally return the feature indices.

        Args:
            fname_list: Feature names, in feature-column order.
            f_range: Ranks (0 = most important) to list; all when empty.
            return_list: When true, return the listed feature indices.

        Returns:
            The feature indices in listed order if ``return_list`` is true,
            otherwise None.
        """
        importances = self._extree.feature_importances_
        indices = np.argsort(importances)[::-1]

        # Single-argument print calls behave the same on Python 2 and 3.
        print('Extra tree feature ranking:')

        if not f_range:
            f_range = range(indices.shape[0])

        for f in f_range:
            print('{0:d}. feature[{1:d}]  {2:s}  ({3:f})'.format(
                f + 1, indices[f], fname_list[indices[f]],
                importances[indices[f]]))

        if return_list:
            return [indices[f] for f in f_range]
def ERFC_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    """Fit an ExtraTrees model, report CV metrics and return probabilities.

    Args:
        X_train, Y_train: Training features and encoded labels.
        X_cv, Y_cv: Validation features and encoded labels.
        X_test: Test features.
        Y_test: Unused; kept for interface compatibility.
        Actual_DS: Features of the data set to produce final predictions for.

    Returns:
        Three DataFrames with the class probabilities for ``X_cv``, ``X_test``
        and ``Actual_DS``.

    Side effects:
        Prints the validation accuracy, a confusion table (labels decoded with
        the module-level ``label_enc``), the validation log loss and the
        elapsed time.
    """
    print("***************Starting Extreme Random Forest Classifier***************")
    t0 = time()
    clf = ExtraTreesClassifier(n_estimators=100,n_jobs=-1)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv,Y_cv)

    print("Extreme Random Forest Classifier - {0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                      rownames=['actual'], colnames=['preds'])
    # Largest share of one predicted class within each actual class.
    # axis=0 matches the row sums to the rows; axis=1 would match them to the
    # prediction columns and divide by the wrong totals.
    Summary['pct'] = Summary.divide(Summary.sum(axis=1), axis=0).max(axis=1)*100
    print(Summary)

    # Check with the log loss function
    epsilon = 1e-15
    preds2 = clf.predict_proba(X_cv)
    ll_output2= log_loss(Y_cv, preds2, eps=epsilon, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    preds4 = clf.predict_proba(Actual_DS)

    print("***************Ending Extreme Random Forest Classifier***************")
    return pd.DataFrame(preds2) , pd.DataFrame(preds3),pd.DataFrame(preds4)
def eval_param(params):
    """Evaluate one set of ExtraTreesClassifier parameters for hyperopt.

    Runs a 3-fold stratified cross-validation on the module-level ``train`` /
    ``target`` arrays, averages the negated log loss over the folds and
    averages the class probabilities predicted for the module-level ``test``.

    Side effects:
        Appends a row (score plus parameter values) to the module-level
        ``df_results``, saves the averaged test probabilities under
        ``hyperopt_preds/`` and rewrites ``hyperopt_results_sgd.csv``.

    Returns:
        A hyperopt result dict whose loss is the mean log loss.
    """
    global df_results, train, target, test
    print ("Training with params : ")
    print (params)

    random_state = 42
    avg_score = 0.
    n_folds = 3
    # Scalar start value: predict_proba returns an (n_samples, n_classes)
    # matrix, which a 1-D accumulator could not hold.
    predict = 0.
    skf = StratifiedKFold(target, n_folds=n_folds, random_state=random_state)
    for train_index, cv_index in skf:
        # train
        x_train, x_cv = train[train_index], train[cv_index]
        y_train, y_cv = target[train_index], target[cv_index]
        clf = ExtraTreesClassifier(**params).fit(x_train, y_train)
        # test / score -- predict_proba takes the features only
        predict_cv = clf.predict_proba(x_cv)
        avg_score += -log_loss(y_cv, predict_cv)
        predict = predict + clf.predict_proba(test)
    predict /= n_folds
    avg_score /= n_folds
    # store
    new_row = pd.DataFrame([np.append([avg_score], list(params.values()))],
                                 columns=np.append(['score'], list(params.keys())))
    df_results = df_results.append(new_row, ignore_index=True)
    np.savetxt('hyperopt_preds/pred' + str(df_results.index.max()) + '.txt', predict, fmt='%s')
    df_results.to_csv('hyperopt_results_sgd.csv')
    print ("\tScore {0}\n\n".format(avg_score))
    return {'loss': - avg_score, 'status': STATUS_OK}
Beispiel #11
0
def extra_forest(train_data, var_count, y, validate, test_data):
    """Fit a fixed-configuration ExtraTrees model and score two data sets.

    ``var_count`` is accepted but not used. Returns the class-probability
    matrices for ``validate`` and ``test_data``, in that order.
    """
    settings = {
        "n_estimators": 350,
        "max_depth": 10,
        "min_samples_leaf": 10,
        "random_state": 1234,
        "max_features": 0.75,
    }
    forest = ExtraTreesClassifier(**settings)
    # Flatten the labels to 1-D before fitting.
    forest.fit(train_data, np.ravel(y))
    return forest.predict_proba(validate), forest.predict_proba(test_data)
Beispiel #12
0
def et(train_data,train_label,val_data,val_label,test_data,name="extratrees_submission.csv"):
	"""Train a 10-tree ExtraTreesClassifier, report validation loss, save test output.

	The class probabilities for ``val_data`` are scored with
	``preprocess.evaluation`` and printed; the class probabilities for
	``test_data`` are handed to ``preprocess.saveResult`` with ``name`` as the
	file name. Nothing is returned.
	"""
	print "start training ExtraTrees..."
	etClf = ExtraTreesClassifier(n_estimators=10)
	etClf.fit(train_data,train_label)
	# Evaluate on the validation set. Despite its name, val_pred_label holds
	# class probabilities (predict_proba), not labels.
	val_pred_label = etClf.predict_proba(val_data)
	logloss = preprocess.evaluation(val_label,val_pred_label)
	print "logloss of validation set:",logloss

	print "Start classify test set..."
	# test_label likewise holds class probabilities.
	test_label = etClf.predict_proba(test_data)
	preprocess.saveResult(test_label,filename = name)
Beispiel #13
0
def et(series, n_folds, clfparams, featureparams, aggregateparams, include,
       exclude, save_test_predictions, save_oob_predictions,
       skip_cross_validation, _run):
    """Cross-validate an ``ET`` classifier and optionally save predictions.

    Unless ``skip_cross_validation`` is set, a shuffled stratified K-fold is
    run. Per-fold train and validation losses, the feature names and the
    fold-averaged feature importances are stored in ``_run.info``; the
    out-of-fold predictions may be written to ``<series>_<timestamp>.csv``.
    With ``save_test_predictions`` a model is fitted on the full training
    data and its test probabilities go to ``<series>_test_<timestamp>.csv``.

    Returns:
        The log loss over all out-of-fold predictions, or 999. when
        cross-validation is skipped.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
    pred_cols = ['predict_{}'.format(k) for k in range(3)]

    if not skip_cross_validation:
        y = data.get_y()
        splitter = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        oof_pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        feature_importances_ = 0
        for fold_no, (itrain, itest) in enumerate(splitter, start=1):
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)

            clf = ET(**clfparams)
            clf.fit(Xtr, ytr)
            oof_pred.iloc[itest, :] = clf.predict_proba(Xte)

            trainloss = multiclass_log_loss(ytr, clf.predict_proba(Xtr))
            _run.info['trainloss'].append(trainloss)
            loss = multiclass_log_loss(yte, oof_pred.iloc[itest].values)
            _run.info['loss'].append(loss)

            # Each fold contributes 1/n_folds of its importances.
            fold_share = clf.feature_importances_ / n_folds
            if fold_no == 1:
                feature_importances_ = fold_share
            else:
                feature_importances_ += fold_share

        # Overall loss on the complete out-of-fold prediction table.
        loss = multiclass_log_loss(y, oof_pred.values)

        _run.info['features'] = list(Xtr.columns)
        _run.info['feature_importances'] = list(feature_importances_)
        # Optionally save the out-of-fold predictions
        if save_oob_predictions:
            oof_pred.to_csv('{}_{}.csv'.format(series, stamp),
                            index_label='id')
    else:
        loss = 999.

    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, stamp)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = ET(**clfparams)
        clf.fit(Xtr, ytr)
        test_frame = pd.DataFrame(clf.predict_proba(Xte),
                                  index=yte.index,
                                  columns=pred_cols)
        test_frame.to_csv(filename, index_label='id')
    return loss
Beispiel #14
0
def et(series, n_folds, clfparams, featureparams, aggregateparams, include, exclude,
        save_test_predictions, save_oob_predictions, skip_cross_validation, _run):
    """Run the ``ET`` experiment: cross-validation and/or test predictions.

    Cross-validation (skipped when ``skip_cross_validation`` is true) uses a
    shuffled stratified K-fold, records per-fold losses, feature names and
    averaged feature importances in ``_run.info``, and can dump the
    out-of-fold predictions to ``<series>_<timestamp>.csv``. Test predictions
    from a model fitted on all training data can be written to
    ``<series>_test_<timestamp>.csv``.

    Returns:
        The out-of-fold log loss, or 999. if cross-validation was skipped.
    """
    data = TelstraData(include = include, exclude = exclude, **featureparams)
    run_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
    pred_cols = ['predict_{}'.format(cls) for cls in range(3)]

    def _fit_classifier(features, labels):
        # Build and fit a fresh classifier from the configured parameters.
        model = ET(**clfparams)
        model.fit(features, labels)
        return model

    if skip_cross_validation:
        loss = 999.
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index = y.index, columns = pred_cols)
        is_first_fold = True
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        feature_importances_ = 0
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(itrain, itest, **aggregateparams)

            clf = _fit_classifier(Xtr, ytr)
            pred.iloc[itest, :] = clf.predict_proba(Xte)
            trainloss = multiclass_log_loss(ytr, clf.predict_proba(Xtr))
            _run.info['trainloss'].append(trainloss)
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            _run.info['loss'].append(loss)
            # Accumulate the mean of the per-fold importances.
            if is_first_fold:
                feature_importances_ = clf.feature_importances_/n_folds
                is_first_fold = False
            else:
                feature_importances_ += clf.feature_importances_/n_folds
        # Loss over the whole out-of-fold table replaces the last fold loss.
        loss = multiclass_log_loss(y, pred.values)

        _run.info['features'] = list(Xtr.columns)
        _run.info['feature_importances'] = list(feature_importances_)
        # Optionally save the out-of-fold predictions
        if save_oob_predictions:
            filename = '{}_{}.csv'.format(series, run_stamp)
            pred.to_csv(filename, index_label='id')

    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, run_stamp)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = _fit_classifier(Xtr, ytr)
        predtest = pd.DataFrame(clf.predict_proba(Xte),
                                index = yte.index, columns = pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def process_sylvine(Xtrain, ytrain, Xval, Xtest, global_params):
    """Select features, add Isomap components, fit ExtraTrees, return labels.

    Six columns of the 20-entry ``goods`` mask are kept, three Isomap
    components fitted on the training rows are appended, and a 10000-tree
    ExtraTreesClassifier is trained. ``global_params['n_jobs']`` sets the
    number of parallel jobs.

    Returns ``(yvalfinal, ytestfinal)``: the rounded second-class
    probabilities for the validation and test sets.
    """
    print 'ITS A SYLVINE TIME'
    print

    t0 = time.time()

    # Boolean column mask; True at positions 6, 8, 9, 14, 15 and 19.
    goods = np.array([
        False, False, False, False, False, False, True, False, True, True,
        False, False, False, False, True, True, False, False, False, True
    ])

    Xnewtrain = np.array(Xtrain[:, goods])
    Xnewtest = np.array(Xtest[:, goods])
    Xnewval = np.array(Xval[:, goods])

    t0 = time.time()

    # The mask keeps six columns, so [:, :6] covers all selected features.
    iso = Isomap(n_neighbors=20, n_components=3).fit(Xnewtrain[:, :6])

    print 'ISOSTAS !!!'
    # Python 2 print statement: prints the elapsed time in minutes.
    print(time.time() - t0) / 60.

    t0 = time.time()

    Xisotrain = iso.transform(Xnewtrain[:, :6])
    Xisotest = iso.transform(Xnewtest[:, :6])
    Xisoval = iso.transform(Xnewval[:, :6])

    print 'ISOSTAS RETURNED !!!'
    print(time.time() - t0) / 60.

    # Append the Isomap components to the selected features.
    Xnewtrain = np.hstack((Xnewtrain, Xisotrain))
    Xnewtest = np.hstack((Xnewtest, Xisotest))
    Xnewval = np.hstack((Xnewval, Xisoval))

    modelrf = ExtraTreesClassifier(n_estimators=10000,
                                   n_jobs=global_params['n_jobs'])
    modelrf.fit(Xnewtrain, ytrain)

    # t0 was last reset before the transforms, so this covers transform + fit.
    print(time.time() - t0) / 60.

    ytestrf = modelrf.predict_proba(Xnewtest)[:, 1]
    yvalrf = modelrf.predict_proba(Xnewval)[:, 1]

    # Round the probabilities to hard 0/1 values.
    ytestfinal = np.round(ytestrf)
    yvalfinal = np.round(yvalrf)

    return yvalfinal, ytestfinal
def predictSingle():
    """Train ExtraTrees on ``newTrain.csv`` and write test class probabilities.

    The model is fitted once to rank the features, the less important ones are
    dropped via ``removeUnimporantFeat``, and the model is refitted. The
    10-fold cross-validated log loss is printed, and the class probabilities
    for ``newTest.csv`` are written to ``output_et.csv`` together with the
    test ``ID`` column.
    """
    train = pd.read_csv('newTrain.csv')
    train = train.drop(['AnimalID'], axis=1)
    label = train['OutcomeType']
    train = train.drop(['OutcomeType'], axis=1)
    test = pd.read_csv('newTest.csv')
    test_ids = test.ID  # not named ``id`` to avoid shadowing the builtin
    test = test.drop(['ID'], axis=1)

    et = ExtraTreesClassifier(n_estimators=800,
                              max_features='sqrt',
                              max_depth=10,
                              min_samples_leaf=2,
                              random_state=seed)
    et.fit(train, label)
    plotFeatureImportance(et, train)
    train, test = removeUnimporantFeat(train, test, et)
    et.fit(train, label)
    # Cross-validation of the extra-trees model. The folds are not shuffled,
    # so no random_state is passed: it has no effect without shuffle=True and
    # newer scikit-learn versions reject that combination.
    kfold = KFold(n_splits=10)
    score = cross_val_score(et, train, label, scoring='neg_log_loss', cv=kfold)
    print(-score.mean())
    columns = et.classes_
    predictions = et.predict_proba(test)
    output_et = pd.DataFrame(predictions, columns=columns)
    output_et = pd.concat([test_ids, output_et], axis=1)
    output_et.to_csv('output_et.csv', index=False)
Beispiel #17
0
    def _cascade_layer(self, X, y=None, layer=0):
        """Train or apply one cascade layer of paired forests.

        With ``y`` given, ``self.n_cascadeRF`` pairs (one random forest, one
        extra-trees forest) are fitted on ``(X, y)`` and stored on the
        instance as ``_casprf<n_layer>_<i>`` / ``_cascrf<n_layer>_<i>``. With
        ``y`` None, the pairs previously stored for ``layer`` are applied
        to ``X``.

        Returns:
            A list with one array per pair: the sum of the two forests'
            out-of-bag decision functions (training) or of their predicted
            class probabilities (prediction).
        """
        n_cascadeRF = getattr(self, 'n_cascadeRF')

        prf_pred = []
        if y is not None:
            for irf in range(n_cascadeRF):
                # Build fresh estimators for every pair: refitting one shared
                # object would make all stored attributes point at the same
                # (last fitted) forest.
                prf = RandomForestClassifier(
                    n_estimators=100, max_features=8,
                    bootstrap=True, criterion="entropy", min_samples_split=20,
                    max_depth=None, class_weight='balanced', oob_score=True)
                crf = ExtraTreesClassifier(
                    n_estimators=100, max_depth=None,
                    bootstrap=True, oob_score=True)
                prf.fit(X, y)
                crf.fit(X, y)
                setattr(self, '_casprf{}_{}'.format(self.n_layer, irf), prf)
                setattr(self, '_cascrf{}_{}'.format(self.n_layer, irf), crf)
                # Add out of place so the stored estimator's
                # oob_decision_function_ is left untouched.
                probas = prf.oob_decision_function_ + crf.oob_decision_function_
                prf_pred.append(probas)
        else:
            for irf in range(n_cascadeRF):
                prf = getattr(self, '_casprf{}_{}'.format(layer, irf))
                crf = getattr(self, '_cascrf{}_{}'.format(layer, irf))
                probas = prf.predict_proba(X) + crf.predict_proba(X)
                prf_pred.append(probas)

        return prf_pred
Beispiel #18
0
class ExtraTreesClassifierMetaPrim(primitive):
    """Ensemble primitive that turns extra-trees class probabilities into features."""

    def __init__(self, random_state=0):
        """Register the primitive's metadata and create the wrapped classifier."""
        super(ExtraTreesClassifierMetaPrim, self).__init__(name='ExtraTreesMetaClassifier')
        self.id = 61
        self.hyperparams = []
        self.type = 'ensemble'
        self.description = (
            "An extra-trees classifier. This class implements a meta estimator "
            "that fits a number of randomized decision trees (a.k.a. extra-trees) "
            "on various sub-samples of the dataset and uses averaging to improve "
            "the predictive accuracy and control over-fitting."
        )
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = ExtraTreesClassifier(random_state=random_state, n_jobs=5)
        self.accept_type = 'c'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_c`` for classification."""
        return self.can_accept_c(data, 'Classification')

    def is_needed(self, data):
        """Always report the primitive as applicable."""
        return True

    def fit(self, data):
        """Fit the classifier on the 'X' and 'Y' entries of the handled data."""
        prepared = handle_data(data)
        self.model.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Predict on ``data`` and replace its features by class probabilities.

        Returns ``{0: output}`` where ``output`` carries 'predictions',
        'proba_predictions' (columns named by class) and a new 'X' whose
        columns are named ``<class>_<primitive name>Pred``.
        """
        output = handle_data(data)
        features = output['X']
        output['predictions'] = self.model.predict(features)
        proba = self.model.predict_proba(features)
        classes = list(self.model.classes_)
        feature_names = ["{}_{}Pred".format(label, self.name) for label in classes]
        output['X'] = pd.DataFrame(proba, columns=feature_names)
        output['proba_predictions'] = pd.DataFrame(proba, columns=classes)
        # Self-assignment kept from the original: it requires 'Y' to be present.
        output['Y'] = output['Y']
        return {0: output}
Beispiel #19
0
class ExtraTreeModel(BaseModel):
    """ExtraTreesClassifier wrapper that trains from a single DataFrame."""

    def __init__(self, model_params):
        """Create the classifier from ``model_params`` (constructor kwargs)."""
        # NOTE(review): ``super(BaseModel, self)`` skips BaseModel.__init__;
        # ``super(ExtraTreeModel, self)`` may have been intended -- confirm
        # against BaseModel before changing it.
        super(BaseModel, self).__init__()
        self.model = ExtraTreesClassifier(**model_params)
        # Set by fit(); defined here so predict() before fit() does not fail
        # with an AttributeError on this attribute.
        self.dep_var_name = None

    def fit(self, data, dep_var_name=None):
        """Fit on ``data``, using column ``dep_var_name`` as the label.

        Exits the process via ``sys.exit`` when ``dep_var_name`` is missing.
        ``data`` itself is not modified.
        """
        if dep_var_name is None:
            sys.exit('dep_var_name is needed for fit function.')
        self.dep_var_name = dep_var_name

        labels = data[self.dep_var_name].values
        # drop() returns a new frame, so the caller's data stays intact.
        features = data.drop(self.dep_var_name, axis=1)
        self.model.fit(features, labels)

    def predict(self, data):
        """Return the second-class probability per row as a Series.

        The label column is dropped first if it is present in ``data``; the
        result is indexed like ``data``.
        """
        if self.dep_var_name in data.columns:
            features = data.drop(self.dep_var_name, axis=1)
        else:
            features = data

        scores = self.model.predict_proba(features)
        # scores is a plain numpy array, so re-attach the row index.
        return pd.Series(scores[:, 1], index=features.index)
Beispiel #20
0
def ExtraTree_prediction(feature_data, result_data):
    """Cross-validate an ExtraTrees classifier and print evaluation metrics.

    Out-of-fold predictions from a 5-fold stratified split are collected for
    every sample, then accuracy, specificity, sensitivity, MCC, AUC and
    macro F1 are printed. Nothing is returned.
    """
    n_splits = 5
    # Stratified sampling keeps the class proportions of the original data
    # set in both the training and test parts; it needs the target data.
    kf = StratifiedKFold(n_splits=n_splits)
    n_samples = feature_data.shape[0]
    all_pred = np.zeros(n_samples)
    all_proba = np.zeros(n_samples)

    for train_index, test_index in kf.split(feature_data, result_data):
        feature_train = feature_data[train_index]
        feature_test = feature_data[test_index]
        result_train = result_data[train_index]
        result_test = result_data[test_index]

        clf = ExtraTreesClassifier(random_state=random_state,
                                   class_weight={0: 1, 1: 1})
        clf.fit(feature_train, result_train.ravel())

        all_pred[test_index] = clf.predict(feature_test)
        all_proba[test_index] = clf.predict_proba(feature_test)[:, 1]

    confmat = confusion_matrix(result_data, all_pred)
    # Sensitivity from the second row, specificity from the first row.
    sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
    sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])

    print('1. The acc score of the model {}\n'.format(
        accuracy_score(result_data, all_pred)))
    print('2. The sp score of the model {}\n'.format(sp))
    print('3. The sn score of the model {}\n'.format(sn))
    print('4. The mcc score of the model {}\n'.format(
        matthews_corrcoef(result_data, all_pred)))
    print('9. The auc score of the model {}\n'.format(
        roc_auc_score(result_data, all_proba, average='macro')))
    print('5. The F-1 score of the model {}\n'.format(
        f1_score(result_data, all_pred, average='macro')))
Beispiel #21
0
def train_classifier(prefix='atx', nside=32, ds=4, color_thresh=30, test_size=0.5):
    """Train an ExtraTrees classifier on labeled image tiles and plot its ROC.

    Args:
        prefix: Data set prefix; only 'atx' has a color set defined here.
        nside: Passed to ``load_labeled``.
        ds: Passed to ``get_features``.
        color_thresh: Passed to ``get_features`` as ``thresh``.
        test_size: Fraction of the samples held out for evaluation.

    Raises:
        ValueError: If no color set is known for ``prefix``.

    Side effects:
        Prints progress and, per ROC threshold, the threshold, the true
        positive rate and the TPR/FPR ratio; draws the ROC curve with ``pl``.
    """
    if prefix == 'atx':
        color_name = 'pool'
    else:
        # Previously this fell through to an unbound-variable error below.
        raise ValueError("no color set is defined for prefix %r" % (prefix,))
    X_img, y = load_labeled(prefix=prefix, nside=nside, quick=False)
    colors = get_colors(name=color_name, quick=True)
    # Single-argument print calls behave the same on Python 2 and 3.
    print('...getting features...')
    X = get_features(X_img, colors, ds=ds, thresh=color_thresh)
    print('...done getting features...')
    from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
    from sklearn.cross_validation import train_test_split
    from sklearn import metrics

    rf = ExtraTreesClassifier(n_estimators=200, n_jobs=6, max_features=0.02)
    # Honor the caller's test_size (it used to be hard-coded to 0.5).
    X_train, X_test, y_train, y_test, img_train, img_test = train_test_split(
        X, y, X_img, test_size=test_size)
    print('...fitting...')
    rf.fit(X_train, y_train)
    y_proba = rf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba)
    auc = metrics.auc(fpr, tpr)

    pl.clf()
    pl.plot(fpr, tpr, 'b-o')
    pl.plot(fpr, fpr / np.mean(y), 'r--')
    pl.ylim(0, 1)
    pl.xlim(0, 1)
    pl.title('AUC: %0.3f' % auc)

    for i, th in enumerate(thresholds):
        print('%s %s %s' % (th, tpr[i], tpr[i] / fpr[i]))
    prob_thresh = 0.6
    # Positive test samples scored below / above the probability threshold.
    # NOTE(review): neither index array is used or returned in the visible
    # code; the function may continue beyond this excerpt.
    wh_missed = np.where((y_proba < prob_thresh) & (y_test == 1))[0]
    wh_ok = np.where((y_proba > prob_thresh) & (y_test == 1))[0]
Beispiel #22
0
def load_train_data(r_seed):
    """Fit three tree ensembles on one split and build stacked features.

    The module-level ``train`` frame is split with ``r_seed``. Three models
    (extra trees, random forest, gradient boosting) are fitted on the
    ``X_train`` part, and their averaged class probabilities for ``X_valid``
    are appended to ``X_valid`` as columns ``new_feat_1`` .. ``new_feat_8``.

    Returns ``(all_data, Y_valid, rbm1, rbm2, rbm3)``.
    """
    # test_size=0.8: 80% of the rows end up in X_valid / Y_valid.
    X_train, X_valid, Y_train, Y_valid = train_test_split(
        train[train_features],
        train['Response'],
        test_size=0.8,
        random_state=r_seed)
    # NOTE(review): the models are seeded with ``jj``, which is not defined in
    # this function (presumably a module-level variable); ``r_seed`` only
    # seeds the split -- confirm this is intended.
    rbm1 = ExtraTreesClassifier(n_estimators=500,
                                max_features=0.4,
                                n_jobs=32,
                                random_state=jj,
                                verbose=1).fit(X_train, Y_train)
    rbm2 = RandomForestClassifier(n_estimators=300,
                                  max_features=0.28,
                                  n_jobs=32,
                                  verbose=1,
                                  random_state=jj).fit(X_train, Y_train)
    rbm3 = GradientBoostingClassifier(n_estimators=48,
                                      max_depth=11,
                                      subsample=0.8,
                                      min_samples_leaf=5,
                                      verbose=1,
                                      random_state=jj).fit(X_train, Y_train)
    # Unweighted mean of the three models' class probabilities.
    res_mean = rbm1.predict_proba(X_valid) + rbm2.predict_proba(
        X_valid) + rbm3.predict_proba(X_valid)
    res_mean = res_mean / 3.0
    # Eight column names: the probability matrix must have eight columns.
    feats = ['new_feat_%d' % (i) for i in range(1, 9)]
    new_data = pd.DataFrame(res_mean, columns=feats)
    # Re-use X_valid's index so the column-wise concat lines up row by row.
    new_data.index = X_valid.index
    all_data = pd.concat([X_valid, new_data], axis=1)
    print all_data.shape
    return all_data, Y_valid, rbm1, rbm2, rbm3
Beispiel #23
0
def learn(x, y, test_x):
    """Fit an ExtraTreesClassifier configured from ``variables`` and predict.

    All hyperparameters are read from the ``variables`` module and echoed to
    stdout after fitting.

    Returns ``(prediction_list, prediction_list_prob)``: the predicted labels
    and the class probabilities for ``test_x``.
    """
    # NOTE(review): ``cw`` is built but never used; the classifier receives
    # ``variables.cw_et`` as class_weight instead -- confirm which is intended.
    cw = {
        "0": variables.weight_0_rf,
        "1000": variables.weight_1000_rf,
        "1500": variables.weight_1500_rf,
        "2000": variables.weight_2000_rf
    }
    clf = ExtraTreesClassifier(
        n_jobs=-1,
        n_estimators=variables.n_estimators_et,
        max_depth=variables.max_depth_et,
        random_state=0,
        min_samples_split=variables.min_samples_split_et,
        min_samples_leaf=variables.min_samples_leaf_et,
        max_features=variables.max_feature_et,
        max_leaf_nodes=variables.max_leaf_nodes_et,
        criterion=variables.criterion_et,
        min_impurity_split=variables.min_impurity_split_et,
        class_weight=variables.cw_et).fit(x, y)

    # Python 2 print statements: the trailing commas keep all settings on
    # one output line.
    print "n_estimators=", variables.n_estimators_et,
    print "max_depth=", variables.max_depth_et,
    print "min_samples_split=", variables.min_samples_split_et,
    print "min_samples_leaf=", variables.min_samples_leaf_et,
    print "max_features=", variables.max_feature_et,
    print "max_leaf_nodes=", variables.max_leaf_nodes_et,
    print "criterion=", variables.criterion_et,
    print "min_impurity_split=", variables.min_impurity_split_et,
    print "class_weight=", variables.cw_et

    prediction_list = clf.predict(test_x)
    prediction_list_prob = clf.predict_proba(test_x)
    return prediction_list, prediction_list_prob
def movement_interval(train_on=['training1','training2', 'training3', 'training4'],
        predict_on=['validation1_lab', 'validation2_lab', 'validation3_lab']):
    """Classify sliding windows and flag those predicted as movement.

    An ExtraTreesClassifier is trained on aggregated skeleton features of the
    ``train_on`` files and applied to windowed features of the ``predict_on``
    files.

    Returns a DataFrame with ``sample_id``, ``frame``, one probability column
    per class and a 0/1 ``movement`` column.
    """

    window_shift = 5
    window_length = 40

    print 'aggregated_skeletion_win'
    X_win = aggregated_skeletion_win(predict_on,
        agg_functions=['median', 'var', 'min', 'max'], 
        window_shift=window_shift, window_length=window_length)
    X_win= X_win.fillna(0)

    print 'train rf model'
    X, y = aggregated_skeletion(file_names=train_on,
            agg_functions=['median', 'var', 'min', 'max'])
    X = X.fillna(0)
    # Map gesture names to integer ids via the module-level gesture_to_id.
    y = np.array([gesture_to_id[gest] for gest in y])

    clf = ExtraTreesClassifier(n_estimators=1500, random_state=0,
        n_jobs=-1)
    clf.fit(X, y)
    # The training data is no longer needed once the model is fitted.
    del X
    del y

    print 'rf predict'
    y_pred = clf.predict_proba(X_win)

    # The index entries of X_win are unpacked into sample_id / frame columns.
    df_out = pd.concat([DataFrame.from_records(X_win.index.values.tolist(),
        columns=['sample_id', 'frame']), DataFrame(y_pred)], axis=1)
    # A window counts as movement when the most probable class is not the
    # first probability column.
    df_out['movement'] = np.array(np.argmax(y_pred, axis=1) != 0,
                                                                dtype=int)
    # adjust for sliding window size (20 is half of window_length)
    df_out.frame = df_out.frame + 20
    return df_out
Beispiel #25
0
def test_multioutput():
    """Check estimators on multi-output problems."""
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-2, 1], [-1, 1], [-1, 2], [2, -1], [1, -1], [1, -2]]

    y = [[-1, 0], [-1, 0], [-1, 0], [1, 1], [1, 1], [1, 1], [-1, 2], [-1, 2], [-1, 2], [1, 3], [1, 3], [1, 3]]

    T = [[-1, -1], [1, 1], [-1, 1], [1, -1]]
    y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]]

    # np.errstate restores the previous floating-point error settings even
    # when one of the assertions below fails; a bare seterr()/seterr() pair
    # would leak divide="ignore" into every test that runs afterwards.
    with np.errstate(divide="ignore"):
        # toy classification problem
        clf = ExtraTreesClassifier(random_state=0)
        y_hat = clf.fit(X, y).predict(T)
        assert_array_equal(y_hat, y_true)
        assert_equal(y_hat.shape, (4, 2))

        # one probability array per output
        proba = clf.predict_proba(T)
        assert_equal(len(proba), 2)
        assert_equal(proba[0].shape, (4, 2))
        assert_equal(proba[1].shape, (4, 4))

        log_proba = clf.predict_log_proba(T)
        assert_equal(len(log_proba), 2)
        assert_equal(log_proba[0].shape, (4, 2))
        assert_equal(log_proba[1].shape, (4, 4))

        # toy regression problem
        clf = ExtraTreesRegressor(random_state=5)
        y_hat = clf.fit(X, y).predict(T)
        assert_almost_equal(y_hat, y_true)
        assert_equal(y_hat.shape, (4, 2))
def main():
    """Train an ExtraTrees model on ``train_file`` and write a submission.

    Reads the training data, extracts ``feature_names``, fits the
    classifier, predicts class probabilities for ``test_file`` and, when
    ``update_posteriors`` is set, re-weights them from the priors of
    ``train_file`` to those of ``full_train_file`` before saving to
    ``submission_file``.
    """
    start = time.time()
    print("Reading the data from " + train_file)
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    # NOTE(review): compute_importances and min_samples_split=1 are only
    # accepted by old scikit-learn releases - confirm the pinned version.
    clf = ExtraTreesClassifier(n_estimators=trees_count, max_features=len(feature_names), max_depth=None, min_samples_split=1, compute_importances=True, bootstrap=False, random_state=0, n_jobs=-1, verbose=2)
    clf.fit(fea, data["OpenStatus"])

    # single-argument print(...) behaves the same on Python 2 and 3
    print("Listing feature importances:")
    cu.list_feature_importance(clf, feature_names)

    print("Reading test file and making predictions: " + test_file)
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = clf.predict_proba(test_features)

    if update_posteriors:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)

    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
Beispiel #27
0
def test_multioutput():
    """Multi-output targets are supported by the extra-trees estimators."""
    train_points = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
                    [-2, 1], [-1, 1], [-1, 2], [2, -1], [1, -1], [1, -2]]
    train_targets = [[-1, 0], [-1, 0], [-1, 0], [1, 1], [1, 1], [1, 1],
                     [-1, 2], [-1, 2], [-1, 2], [1, 3], [1, 3], [1, 3]]

    queries = [[-1, -1], [1, 1], [-1, 1], [1, -1]]
    expected = [[-1, 0], [1, 1], [-1, 2], [1, 3]]

    # classification on the toy data
    classifier = ExtraTreesClassifier(random_state=0)
    predicted = classifier.fit(train_points, train_targets).predict(queries)
    assert_array_equal(predicted, expected)
    assert_equal(predicted.shape, (4, 2))

    # both probability flavours return one array per output: the first
    # output has 2 classes, the second has 4
    for scorer in (classifier.predict_proba, classifier.predict_log_proba):
        per_output = scorer(queries)
        assert_equal(len(per_output), 2)
        assert_equal(per_output[0].shape, (4, 2))
        assert_equal(per_output[1].shape, (4, 4))

    # regression on the toy data
    regressor = ExtraTreesRegressor(random_state=5)
    predicted = regressor.fit(train_points, train_targets).predict(queries)
    assert_almost_equal(predicted, expected)
    assert_equal(predicted.shape, (4, 2))
def bagged_set(X_ts, y_cs, seed, estimators, xt, yt=None):
    """Average ExtraTrees probabilities over several independently fitted bags.

    Parameters:
        X_ts, y_cs: training features and labels passed to ``fit``.
        seed: base random seed; bag ``n`` is fitted with
            ``random_state=seed + n`` so runs are reproducible while the
            bags still differ. ``None`` keeps the unseeded behaviour.
        estimators: number of bags to fit and average.
        xt: features to predict on.
        yt: unused; kept for call compatibility.

    Returns:
        1-D array with, for each row of ``xt``, the mean over all bags of
        the second ``predict_proba`` column.
    """
    # accumulator for the summed predictions
    baggedpred = np.zeros(xt.shape[0])

    for n in range(estimators):
        # The seed argument used to be ignored, so results changed from
        # run to run; derive a distinct, deterministic seed for every bag.
        bag_seed = None if seed is None else seed + n
        model = ExtraTreesClassifier(n_estimators=1000,
                                     criterion="entropy",
                                     max_depth=12,
                                     min_samples_leaf=4,
                                     max_features=0.5,
                                     n_jobs=20,
                                     random_state=bag_seed)
        model.fit(X_ts, y_cs)
        baggedpred += model.predict_proba(xt)[:, 1]

        print("completed: " + str(n))

    # divide by the number of bags to obtain the average estimate
    baggedpred /= estimators

    return baggedpred
def eval_seq_model(out_file='eval_model.csv',window_shift=1, retrain=False):
    """Score sliding windows with the cached gesture model and dump the result.

    Parameters:
        out_file: path handed to ``dump_predictions``.
        window_shift: step between consecutive windows.
        retrain: when true, refit the ExtraTrees model and overwrite the
            cached file; otherwise load the cached model.

    Returns:
        ``(df_pred, to_dump)``: the per-window class probabilities indexed
        by the first element of each window key, and the per-group
        post-processed frame that was written to ``out_file``.
    """
    filename = 'cache/joblib/rf_eval_model.joblib.pkl'
    file_names=['training1', 'training3', 'training4',
                    'validation1_lab', 'validation3_lab']

    if retrain:
        X, y = aggregated_skeletion(file_names=file_names,
                agg_functions=['median', 'var', 'min', 'max'])
        X = X.fillna(0)
        y = np.array([gesture_to_id[gest] for gest in y])

        clf = ExtraTreesClassifier(n_estimators=500, random_state=0,
            n_jobs=-1)
        clf.fit(X, y)
        _ = joblib.dump(clf, filename, compress=9)
    else:
        clf = joblib.load(filename)

    X_win = aggregated_skeletion_win(['validation2_lab', 'training2'],
            agg_functions=['median', 'var', 'min', 'max'],
            window_shift=window_shift)
    # Treat missing aggregates exactly like the training data (filled with
    # 0); the forest rejects NaN input otherwise.
    X_win = X_win.fillna(0)

    y_pred = clf.predict_proba(X_win)
    df_pred = DataFrame(y_pred, index=[s for (s, _) in X_win.index])

    to_dump = df_pred.groupby(level=0).apply(postprocess)
    dump_predictions(to_dump, out_path=out_file)
    return df_pred, to_dump
def extra_tree():
    """Cross-validate an ExtraTrees model, write a submission and report AUC.

    Features come from ``load_features()``; missing values are replaced by
    0. The second ``predict_proba`` column is written next to ``bidder_id``
    to ``data/submission_extra_tree.csv``.
    """
    train_features, test_features = load_features()
    train_features = train_features.fillna(value=0)
    test_features = test_features.fillna(value=0)

    labels = train_features["outcome"]
    train_matrix = train_features.drop(["bidder_id", "outcome"], axis=1)
    test_matrix = test_features.drop(["bidder_id"], axis=1)

    print("Training extra_tree model")
    extraTree = ExtraTreesClassifier(n_estimators=3000, max_features=10)
    # NOTE: this message is emitted before any fitting has taken place
    print("Model trained")

    print("Cross validation score (extra_tree) : ")
    fold_scores = cross_val_score(extraTree, train_matrix, labels, cv=5,
                                  scoring='roc_auc')
    cv_score = np.mean(fold_scores)
    print(cv_score)

    print("Generating submission file")
    extraTree.fit(train_matrix, labels)
    test_features['prediction'] = extraTree.predict_proba(test_matrix)[:, 1]
    submission = test_features[['bidder_id', 'prediction']]
    submission.to_csv('data/submission_extra_tree.csv', index=False)
    print("Output file successfully created")

    print("Generating auc curve and auc score")
    auc = roc_auc(train_features, extraTree)
    print("AUC score : " + str(auc))
def objective_etree(space):
    """Hyperopt objective: mean 10-fold log loss of an ExtraTrees model.

    Parameters:
        space: mapping with the keys ``n_estimators``, ``max_depth``,
            ``max_features``, ``criterion`` and ``min_impurity_split``.

    Returns:
        dict with ``loss`` (log loss averaged over the folds) and
        ``status`` (``STATUS_OK``).
    """
    numfolds = 10
    total = 0
    kf2 = StratifiedKFold(n_splits=numfolds, shuffle=True, random_state=13)

    for train_index, test_index in kf2.split(X_train_cl, y_train_cl.IS_IT_GAMER):
        xtrain, xtest = X_train_cl.iloc[train_index], X_train_cl.iloc[test_index]
        ytrain, ytest = y_train_cl.iloc[train_index], y_train_cl.iloc[test_index]

        # A fresh, cold-started forest per fold. The previous code shared
        # one estimator built with warm_start=True: with an unchanged
        # n_estimators, every fit after the first adds no trees, so folds
        # 2..N were scored with the model trained on fold 1 (which had
        # seen their validation rows).
        etree = ExtraTreesClassifier(n_estimators=space['n_estimators'],
                                     max_depth=space['max_depth'],
                                     max_features=space['max_features'],
                                     criterion=space['criterion'],
                                     min_impurity_split=space['min_impurity_split'],
                                     random_state=13,
                                     n_jobs=-1)
        etree.fit(xtrain, ytrain.values.ravel())
        pred = etree.predict_proba(xtest)[:, 1]

        total += log_loss(ytest, pred)

    total = total / numfolds
    print(total)
    return {'loss': total, 'status': STATUS_OK}
Beispiel #32
0
def real_submodel(tag, by):
    """Fit an ExtraTrees sub-model and return the probability of label 1.

    Parameters:
        tag, by: forwarded to ``load_n_clean_data`` and used in the
            progress message.

    Returns:
        The ``Predicted`` column added to ``X_test``: for every test row,
        the predicted probability of the class whose label equals 1.

    Raises:
        ValueError: if label 1 is not among the fitted classes.
    """
    # single-argument print(...) behaves the same on Python 2 and 3
    print("Classify submodel_by_{}_{} ...".format(by, tag))
    X, y, X_test = load_n_clean_data(tag, by, load=False, cv=False)
    print("Build model ...")
    clf = ExtraTreesClassifier(n_jobs=-1,
                               n_estimators=200,
                               min_samples_leaf=9,
                               max_depth=30,
                               verbose=4)
    clf.fit(X, y)
    pred = clf.predict_proba(X_test)
    print(pred)

    # NOTE: the on-disk dump that used to follow this message is disabled
    # (the fitted classifier took too much disk space); only the message
    # remains.
    print("Dump precious stuff in case of crash ...")

    # predict_proba columns follow clf.classes_; locate the one for label 1
    label1_idx = clf.classes_.tolist().index(1)
    X_test['Predicted'] = pred[:, label1_idx]
    return X_test['Predicted']
def eval_gesture_model(retrain=False, window_shift=1, window_length=40,
        train_on=['training1', 'training3', 'training4',
                    'validation1_lab', 'validation3_lab'],
        predict_on=['validation2_lab', 'training2']):
    """Return gesture class probabilities and true ids for ``predict_on``.

    The model is cached per ``window_length``; with ``retrain`` it is
    refitted on ``train_on`` and the cache file is overwritten, otherwise
    the cached model is loaded. ``window_shift`` is accepted but not used
    here.

    Returns ``(y_pred, y_test)``: the ``predict_proba`` output and the
    gesture ids of the evaluation samples.
    """
    cache_path = 'cache/joblib/rf_eval_model' + str(window_length) + '.joblib.pkl'

    def _load_xy(names):
        # aggregated features with NaN replaced by 0, labels mapped to ids
        frame, gestures = aggregated_skeletion(
            names,
            agg_functions=['median', 'var', 'min', 'max'],
            window_length=window_length)
        frame = frame.fillna(0)
        ids = np.array([gesture_to_id[gest] for gest in gestures])
        return frame, ids

    if not retrain:
        clf = joblib.load(cache_path)
    else:
        train_x, train_ids = _load_xy(train_on)
        clf = ExtraTreesClassifier(n_estimators=500, random_state=0,
                                   n_jobs=-1)
        clf.fit(train_x, train_ids)
        _ = joblib.dump(clf, cache_path, compress=9)

    X_test, y_test = _load_xy(predict_on)
    y_pred = clf.predict_proba(X_test)
    return y_pred, y_test
Beispiel #34
0
class ETClassifier(BaseClassifier):
    """ExtraTrees-backed classifier configured from an options mapping."""

    def __init__(self, opt):
        """Build the underlying forest; keys missing from ``opt`` fall back
        to the defaults listed below."""
        super().__init__(opt)
        self.clf_name = 'ETClassifier'
        fallbacks = (
            ('n_estimators', 200),
            ('max_depth', 7),
            ('min_samples_leaf', 10),
            ('max_leaf_nodes', 63),
            ('min_samples_split', 2),
            ('bootstrap', True),
            ('class_weight', {0: 1, 1: 10}),
            ('random_state', 18520),
            ('n_jobs', 2),
        )
        settings = {}
        for key, fallback in fallbacks:
            settings[key] = opt.get(key, fallback)
        self.clf = ExtraTreesClassifier(**settings)

    def fit(self, train_set, valid_set=None):
        """Fit on ``train_set`` given as ``(features, labels)``;
        ``valid_set`` is ignored."""
        features, labels = train_set[0], train_set[1]
        self.clf.fit(features, labels)

    def predict_proba(self, x):
        """Return the second probability column for every row of ``x``."""
        proba = self.clf.predict_proba(x)
        return proba[:, 1]

    def get_feat_imp(self):
        """Return the fitted forest's feature importances."""
        return self.clf.feature_importances_
Beispiel #35
0
def ef_predictedValue():
    """Fit ExtraTrees on ``train_df`` and return the second probability
    column predicted for ``test_df``."""
    print('----------ExtraForest----------')
    forest = ExtraTreesClassifier(n_estimators = NoOfEstimators, n_jobs = NoJobs)
    forest.fit(train_df[features], train_df['SeriousDlqin2yrs'])
    proba = forest.predict_proba(test_df[features])
    print('Feature Importance = %s' % forest.feature_importances_)
    return proba[:,1]
Beispiel #36
0
def extratrees():
    """Fit ExtraTrees on the cached sparse features and print the hold-out
    log loss (30% of the training rows are held out)."""

    def _unpickle(path):
        # read one cached object from disk
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    print('Load the featured Train/Test data..')
    train = _unpickle('../cache/sparse_train_xgb.p')
    # test and device_id are loaded but not used further down
    test = _unpickle('../cache/sparse_test_xgb.p')
    Y = _unpickle('../cache/y.p')
    device_id = _unpickle('../cache/device.p')

    # Group Labels
    lable_group = LabelEncoder()
    Y = lable_group.fit_transform(Y)

    X_train, X_val, y_train, y_val = train_test_split(train, Y, test_size=.30)

    ##################
    #     ExtraTrees
    ##################
    model = ExtraTreesClassifier(n_estimators=100,
                                 max_depth=None,
                                 min_samples_split=2,
                                 random_state=0,
                                 criterion='entropy',
                                 n_jobs=32,
                                 verbose=20)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)
    score = log_loss(y_val.tolist(), val_proba)
    print("ExtraTrees - Score : " + str(score))
Beispiel #37
0
def kfold_cv(X_train, y_train,idx,k):
    """Fit ExtraTrees on the first stratified fold and score its hold-out part.

    Only the first of the ``k`` folds is evaluated (the loop breaks at the
    end of its first iteration).

    Returns ``(ypred, yreal, idx)``: the second class-probability column
    for the hold-out rows, their true labels, and ``idx`` restricted to
    the hold-out rows.
    """

    # NOTE(review): labels are passed to the constructor and the object is
    # iterated directly - presumably the legacy sklearn.cross_validation
    # API; confirm the pinned scikit-learn version.
    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]  # per-fold scores returned by llfun
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        # m is 0, so the xgb loop below never executes
        m=0
         
        for j in range(m):
            clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.5,subsample=0.7,depth=5,num_round=200,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
        #y_pred/=m;
        clf=ExtraTreesClassifier(n_estimators=700,max_features= 50,criterion= 'entropy',min_samples_split= 3,
                            max_depth= 60, min_samples_leaf= 4,verbose=1,n_jobs=-1)
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv,(y_train_cv))
        # overwrites the zero-initialised y_pred with the second probability column
        y_pred=clf.predict_proba(X_test_cv).T[1]
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        # keep only the ids of the hold-out rows
        idx=idx[test_index]
        print xx[-1]#,y_pred.shape
        # stop after the first fold
        break

    print xx,'average:',np.mean(xx),'std',np.std(xx)
    # NOTE(review): ypred and yreal are only bound inside the loop, so this
    # presumably fails with a NameError if kf yields no folds.
    return ypred,yreal,idx#np.mean(xx)
def calc_prob(df_features_driver, df_features_other):
    """Score a driver's trips with an ExtraTrees model.

    The driver's rows and the reference rows of other drivers are stacked
    (driver rows first), a classifier is fitted on the ``Driver`` column,
    and the first 200 stacked rows are scored.

    Returns:
        DataFrame with ``driver_trip`` (from ``create_first_column``) and
        ``prob``, the second ``predict_proba`` column.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat builds the
    # same stacked frame on old and new pandas alike.
    df_train = pd.concat([df_features_driver, df_features_other])
    df_train.reset_index(inplace=True)
    df_train.Driver = df_train.Driver.astype(int)

    # 'sqrt' is what 'auto' meant for classifiers; the 'auto' spelling is
    # rejected by scikit-learn >= 1.3.
    model = ExtraTreesClassifier(n_estimators=100, max_features='sqrt',
                                 random_state=0, n_jobs=2,
                                 criterion='entropy', bootstrap=True)

    # features start at the fifth column of the stacked frame
    feature_columns = df_train.iloc[:, 4:]

    # Train the classifier
    model.fit(feature_columns, df_train.Driver)

    df_submission = pd.DataFrame()
    df_submission['driver_trip'] = create_first_column(df_features_driver)

    # NOTE(review): 200 is hard-coded; presumably the number of trips per
    # driver - confirm against the caller.
    probs_array = model.predict_proba(feature_columns[:200])
    df_submission['prob'] = np.array(probs_array[:, 1])

    return df_submission
def plot_confusion_matrix(model, relevant_features_new, y_new,
                          threshold_classification):
    """Plot confusion matrices for an ExtraTrees model on one hold-out fold.

    The first fold of an unshuffled 3-fold stratified split is used as the
    hold-out set. Two matrices and accuracies are produced: one limited to
    the rows that ``get_indexes_with_valid_predictions`` accepts for
    ``threshold_classification`` and one for the whole fold.
    """
    extra_trees = ExtraTreesClassifier(n_estimators=1000, random_state=0)
    base_classification = Base_Classification(model, extra_trees)

    # random_state has no effect without shuffling, and recent scikit-learn
    # raises ValueError when it is combined with shuffle=False; the splits
    # are the same without it.
    sss = StratifiedKFold(n_splits=3, shuffle=False)
    # only the first fold is used
    train_index, test_index = next(sss.split(relevant_features_new, y_new))
    x_train = relevant_features_new.iloc[train_index, :]
    x_test = relevant_features_new.iloc[test_index, :]
    y_train = y_new.iloc[train_index, :]
    y_test = y_new.iloc[test_index, :]

    extra_trees.fit(x_train, y_train)
    pred = extra_trees.predict_proba(x_test)
    pred = pd.DataFrame(pred, columns=extra_trees.classes_)
    valid_indexes = base_classification.get_accuracy.get_indexes_with_valid_predictions(
        pred, threshold_classification)

    x_test_valid = x_test.iloc[valid_indexes, :]
    y_test_valid = y_test.iloc[valid_indexes, :]

    base_classification.get_accuracy.plot_confusion_matrix(
        x_test_valid, y_test_valid, extra_trees)
    print("Accuracy => {}".format(extra_trees.score(x_test_valid,
                                                    y_test_valid)))
    base_classification.get_accuracy.plot_confusion_matrix(
        x_test, y_test, extra_trees)
    print("Accuracy => {}".format(extra_trees.score(x_test, y_test)))
Beispiel #40
0
def eval_seq_model(out_file='eval_model.csv', window_shift=1, retrain=False):
    """Score sliding windows with the cached gesture model and dump the result.

    Parameters:
        out_file: path handed to ``dump_predictions``.
        window_shift: step between consecutive windows.
        retrain: when true, refit the ExtraTrees model and overwrite the
            cached file; otherwise load the cached model.

    Returns:
        ``(df_pred, to_dump)``: the per-window class probabilities indexed
        by the first element of each window key, and the per-group
        post-processed frame that was written to ``out_file``.
    """
    filename = 'cache/joblib/rf_eval_model.joblib.pkl'
    file_names = [
        'training1', 'training3', 'training4', 'validation1_lab',
        'validation3_lab'
    ]

    if retrain:
        X, y = aggregated_skeletion(
            file_names=file_names,
            agg_functions=['median', 'var', 'min', 'max'])
        X = X.fillna(0)
        y = np.array([gesture_to_id[gest] for gest in y])

        clf = ExtraTreesClassifier(n_estimators=500, random_state=0, n_jobs=-1)
        clf.fit(X, y)
        _ = joblib.dump(clf, filename, compress=9)
    else:
        clf = joblib.load(filename)

    X_win = aggregated_skeletion_win(
        ['validation2_lab', 'training2'],
        agg_functions=['median', 'var', 'min', 'max'],
        window_shift=window_shift)
    # Treat missing aggregates exactly like the training data (filled with
    # 0); the forest rejects NaN input otherwise.
    X_win = X_win.fillna(0)

    y_pred = clf.predict_proba(X_win)
    df_pred = DataFrame(y_pred, index=[s for (s, _) in X_win.index])

    to_dump = df_pred.groupby(level=0).apply(postprocess)
    dump_predictions(to_dump, out_path=out_file)
    return df_pred, to_dump
Beispiel #41
0
def ef_predictedValue():
    """Train an ExtraTrees model on ``train_df`` and score ``test_df``.

    Prints the feature importances and returns the second column of the
    predicted class probabilities.
    """
    print('----------ExtraForest----------')
    targets = train_df['SeriousDlqin2yrs']
    ef_clf = ExtraTreesClassifier(n_estimators=NoOfEstimators, n_jobs=NoJobs)
    ef_clf.fit(train_df[features], targets)
    scores = ef_clf.predict_proba(test_df[features])
    print('Feature Importance = %s' % ef_clf.feature_importances_)
    return scores[:, 1]
def et_model(X_train, y_train, X_test, y_test=None):
    """Fit ExtraTrees and return min-max scaled scores for ``X_test``.

    Parameters:
        X_train, y_train: training features and labels.
        X_test: features to score.
        y_test: unused; kept for call compatibility.

    Returns:
        1-D array with the second ``predict_proba`` column rescaled to
        [0, 1]. When every score is identical the scaling is undefined
        and an all-zero array is returned.
    """
    model = ExtraTreesClassifier(max_features='log2', n_estimators=1000,
                                 n_jobs=1).fit(X_train, y_train)
    predict = model.predict_proba(X_test)[:, 1]
    lowest = predict.min()
    span = predict.max() - lowest
    if span == 0:
        # constant scores: avoid 0/0 (which used to yield an all-NaN result)
        return np.zeros_like(predict)
    # plain array arithmetic instead of a per-element np.vectorize lambda
    return (predict - lowest) / span
Beispiel #43
0
def extra(X, y, test):
    """Fit ExtraTrees on ``(X, y)`` and return the second probability
    column predicted for ``test``."""
    settings = {
        'n_estimators': 250,
        'max_depth': 9,
        'min_samples_split': 6,
    }
    forest = ExtraTreesClassifier(**settings)
    forest.fit(X, y)
    proba = forest.predict_proba(test)
    return proba[:, 1]
def et_prob(train, label, test):
    """Fit ExtraTrees on ``(train, label)`` and return the class
    probabilities predicted for ``test``.

    Returns:
        The full ``predict_proba`` output (one column per class).
    """
    # 'sqrt' is what 'auto' meant for classifiers; the 'auto' spelling is
    # rejected by scikit-learn >= 1.3.
    extratree = ExtraTreesClassifier(n_estimators=300, max_depth=None,
                                     max_features="sqrt", n_jobs=-1,
                                     random_state=2017, verbose=0)
    extratree.fit(train, label)
    # the fitted model is a local and is released when the function returns
    return extratree.predict_proba(test)
    def extra_tree(self):
        """Train an Extra Trees classifier, persist it with its predictions
        and return the negated test accuracy as a hyperopt-style result.

        Raises:
            Exception: if ``'et'`` is missing from the configured model list.
        """
        model_cfg = self.params['model']
        if 'et' not in model_cfg['model_list']:
            raise Exception('Extra Tree Classifier not listed in the model list parameter')

        # Estimator settings: the tree count and impurity threshold come
        # from the rf_params section, everything else from dt_params.
        rf_cfg = model_cfg['rf_params']
        settings = {'n_estimators': rf_cfg['n_estimators']}
        dt_cfg = model_cfg['dt_params']
        for key in ('max_depth', 'min_samples_split', 'min_samples_leaf',
                    'min_weight_fraction_leaf', 'max_features',
                    'max_leaf_nodes'):
            settings[key] = dt_cfg[key]
        settings['min_impurity_decrease'] = rf_cfg['min_impurity_decrease']
        settings['class_weight'] = dt_cfg['class_weight']
        clf = ExtraTreesClassifier(**settings)

        if self.params['cross_validation']['time_based_test_split']:
            splits = self.split_data_into_train_test_time_based()
        else:
            splits = self.split_data_into_train_test()
        X_train, X_test, y_train, y_test = splits

        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)

        # second probability column for the test rows, then the train rows
        test_prob = [row[1] for row in clf.predict_proba(X_test)]
        predictions = pd.DataFrame({'id': self.id_vals_test, 'date': self.date_vals_test, 'prob': test_prob, 'status': y_test})

        train_prob = [row[1] for row in clf.predict_proba(X_train)]
        predictions_train = pd.DataFrame({'id': self.id_vals_train, 'date': self.date_vals_train, 'prob': train_prob, 'status': y_train})

        base_dir = self.params['info']['base_dir']
        models_dir = base_dir + 'models/'
        predictions_dir = base_dir + 'predictions/'
        for folder in (models_dir, predictions_dir):
            if not os.path.exists(folder):
                os.mkdir(folder)

        model_path = models_dir + 'trained_model_et.pkl'
        test_path = predictions_dir + 'predictions_et_test.csv'
        train_path = predictions_dir + 'predictions_et_train.csv'

        joblib.dump(clf, model_path)
        predictions.to_csv(test_path, index = False)
        predictions_train.to_csv(train_path, index = False)

        print('Saved trained model (Extra Trees): {}'.format(model_path))
        print('Written test predictions (Extra Trees): {}'.format(test_path))
        print('Written train predictions (Extra Trees): {}'.format(train_path))

        accuracy = accuracy_score(y_test, pred)
        return {'loss': -accuracy, 'status': STATUS_OK}
Beispiel #46
0
def et(train_data,
       train_label,
       val_data,
       val_label,
       test_data,
       name="extratrees_submission.csv"):
    """Fit a 10-tree ExtraTrees model, report the validation log loss and
    save the test-set class probabilities under ``name``."""
    print("start training ExtraTrees...")
    forest = ExtraTreesClassifier(n_estimators=10)
    forest.fit(train_data, train_label)

    # evaluate on the validation set
    val_proba = forest.predict_proba(val_data)
    logloss = preprocess.evaluation(val_label, val_proba)
    print("logloss of validation set: " + str(logloss))

    print("Start classify test set...")
    test_proba = forest.predict_proba(test_data)
    preprocess.saveResult(test_proba, filename=name)
def load_train_data(train, train_y, ttf):
    """Build stacked training features from four first-level models.

    The first-level models are fitted on the leading rows of ``train``;
    the rows ``offset:LINES`` are then extended with the class
    probabilities of the ExtraTrees and k-NN models and with the geometric
    mean of the random-forest and XGBoost probabilities.

    Parameters:
        train, train_y: full feature matrix and labels.
        ttf: unused; kept for call compatibility.

    Returns:
        ``(X, y, rbm0, rbm1, rbm2, rbm3)``: the stacked float32 feature
        matrix, the int32 labels of the stacked rows and the four fitted
        models.
    """
    X = train.copy()
    y = np.array(train_y[offset:LINES].copy(), dtype=np.int32)

    # NOTE(review): the slice stops at offset-1, so row offset-1 is used
    # for neither fitting nor stacking - presumably unintended; confirm.
    fit_X = X[0:offset - 1, :]
    fit_y = train_y[0:offset - 1]
    stack_X = X[offset:LINES, :]

    rbm0 = ExtraTreesClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=5, n_jobs=8).fit(fit_X, fit_y)
    rbm1 = KNeighborsClassifier(n_neighbors=5).fit(fit_X, fit_y)
    # 'sqrt' is what 'auto' meant for classifiers; the 'auto' spelling is
    # rejected by scikit-learn >= 1.3.
    rbm2 = RandomForestClassifier(n_estimators=100, criterion='entropy', max_features='sqrt', bootstrap=False, oob_score=False, n_jobs=8, verbose=1).fit(fit_X, fit_y)
    rbm3 = xgb.XGBClassifier(n_estimators=300, max_depth=8, subsample=0.8, min_child_weight=4, nthread=8).fit(fit_X, fit_y)

    # geometric mean of the two strongest models' probabilities
    blended = np.power(rbm2.predict_proba(stack_X) * rbm3.predict_proba(stack_X), (1 / 2.0))
    X = np.hstack([stack_X, rbm0.predict_proba(stack_X), rbm1.predict_proba(stack_X), blended])
    return np.array(X, dtype=np.float32), y, rbm0, rbm1, rbm2, rbm3
def extratree_cla(train_data, train_id, test_data, seed = None):
    """Fit a 1000-tree ExtraTrees model and return ``(classes, probabilities)``
    predicted for ``test_data``; ``seed`` is used as ``random_state``."""
    forest = ExtraTreesClassifier(n_estimators=1000, n_jobs=4,
                                  random_state=seed)
    forest.fit(train_data, train_id)
    return forest.predict(test_data), forest.predict_proba(test_data)
def build_XT1(X_train,y_train,X_cal,y_cal,X_test):
    """Average the test probabilities of three isotonic-calibrated
    ExtraTrees models, each fitted with a freshly drawn random seed.

    ``X_cal`` and ``y_cal`` are not used. Returns ``(cal_prob, test_prob)``
    where ``cal_prob`` is always 0.
    """
    cal_prob = 0
    summed = 0
    for i in range(3):
        print("--Building and Training model %s" % i)
        seed = randrange(1,10000)
        forest = ExtraTreesClassifier(n_estimators=500,
                                      criterion="entropy",
                                      min_samples_split=1,
                                      random_state=seed,
                                      n_jobs=-1)
        calibrated = CalibratedClassifierCV(base_estimator=forest,
                                            method='isotonic',
                                            cv=5)
        calibrated = calibrated.fit(X_train,y_train)
        print("Model %s training complete." % i)
        summed += calibrated.predict_proba(X_test)
    test_prob = summed/3.
    return(cal_prob,test_prob)
def extraTree(X, y, train, valid):
    """Fit ExtraTrees on the ``train`` rows, report metrics on the
    ``valid`` rows, save the validation scores to ``y_extratree.csv`` and
    return them (second probability column)."""
    forest = ExtraTreesClassifier(n_jobs=-1, n_estimators=300, verbose=2,
                                  random_state=1, max_depth=10,
                                  bootstrap=True)
    forest.fit(X[train], y[train])

    valid_X = X[valid]
    valid_y = y[valid]
    yhat = forest.predict(valid_X)
    yhat_prob = forest.predict_proba(valid_X)[:, 1]

    print("extra tree randomForest" + str(accuracy_score(valid_y, yhat)))
    print(classification_report(valid_y, yhat))

    print("extra tree randomForest roc_accuracy" + str(roc_auc_score(valid_y, yhat_prob)))
    np.savetxt("y_extratree.csv", yhat_prob)
    return yhat_prob
Beispiel #51
0
def train_predict(X_train, X_test, y_train, y_test, model_name, param):
    """Fit the model selected by ``model_name`` and predict on ``X_test``.

    Parameters:
        X_train, y_train: training features and labels.
        X_test: features to predict on.
        y_test: unused; kept for call compatibility.
        model_name: one of ``'clf_skl_lr'``, ``'clf_skl_etr'``,
            ``'clf_skl_rf'`` or ``'clf_skl_gbm'``.
        param: mapping with the hyper-parameters read by the chosen branch.

    Returns:
        Class labels for ``'clf_skl_lr'``; for every other model the
        second ``predict_proba`` column.

    Raises:
        NotImplementedError: for ``'clf_xgb_tree'``, whose implementation
            is disabled.
        ValueError: for any other unsupported ``model_name``.
    """
    if model_name == 'clf_xgb_tree':
        # The xgboost branch is disabled. It used to fall through to the
        # return statement and die with an UnboundLocalError; fail with a
        # clear message instead.
        print('no xgboost')
        raise NotImplementedError(
            "model 'clf_xgb_tree' is disabled: xgboost support is not available")

    if model_name == "clf_skl_lr":
        lr = LogisticRegression(penalty="l2", dual=True, tol=1e-5,
                                C=param['C'], fit_intercept=True,
                                intercept_scaling=1.0,
                                random_state=param['random_state'])
        lr.fit(X_train, y_train)
        return lr.predict(X_test)

    if model_name == 'clf_skl_etr':
        etr = ExtraTreesClassifier(n_estimators=int(param['n_estimators']),
                                   max_features=param['max_features'],
                                   n_jobs=param['n_jobs'],
                                   random_state=param['random_state'])
        etr.fit(X_train, y_train)
        return etr.predict_proba(X_test)[:, 1]

    if model_name == 'clf_skl_rf':
        # int(): search spaces may hand over n_estimators as a float; the
        # ExtraTrees branch already casts it
        rf = RandomForestClassifier(n_estimators=int(param['n_estimators']),
                                    max_features=param['max_features'],
                                    n_jobs=param['n_jobs'],
                                    random_state=param['random_state'])
        rf.fit(X_train, y_train)
        return rf.predict_proba(X_test)[:, 1]

    if model_name == 'clf_skl_gbm':
        gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                         max_features=param['max_features'],
                                         learning_rate=param['learning_rate'],
                                         max_depth=param['max_depth'],
                                         subsample=param['subsample'],
                                         random_state=param['random_state'])
        gbm.fit(X_train, y_train)
        return gbm.predict_proba(X_test)[:, 1]

    raise ValueError("unknown model_name: %r" % (model_name,))
def load_train_data(r_seed):
    """Build second-level features from three averaged first-level models.

    ``train`` is split with ``r_seed`` (80% goes to the validation part);
    the three models are fitted on the smaller part and their averaged
    class probabilities are appended to the validation features as
    ``new_feat_1`` .. ``new_feat_8``.

    Returns ``(all_data, Y_valid, rbm1, rbm2, rbm3)``.
    """
    X_train, X_valid, Y_train, Y_valid = train_test_split(
        train[train_features], train['Response'],
        test_size=0.8, random_state=r_seed)

    rbm1 = ExtraTreesClassifier(n_estimators=500, max_features=0.4,
                                n_jobs=32, random_state=jj, verbose=1)
    rbm1 = rbm1.fit(X_train, Y_train)
    rbm2 = RandomForestClassifier(n_estimators=300, max_features=0.28,
                                  n_jobs=32, verbose=1, random_state=jj)
    rbm2 = rbm2.fit(X_train, Y_train)
    rbm3 = GradientBoostingClassifier(n_estimators=48, max_depth=11,
                                      subsample=0.8, min_samples_leaf=5,
                                      verbose=1, random_state=jj)
    rbm3 = rbm3.fit(X_train, Y_train)

    # plain average of the three probability matrices
    res_mean = (rbm1.predict_proba(X_valid)
                + rbm2.predict_proba(X_valid)
                + rbm3.predict_proba(X_valid)) / 3.0

    feats = []
    for i in range(1, 9):
        feats.append('new_feat_%d' % (i))
    new_data = pd.DataFrame(res_mean, columns=feats, index=X_valid.index)

    all_data = pd.concat([X_valid, new_data], axis=1)
    print(all_data.shape)
    return all_data, Y_valid, rbm1, rbm2, rbm3
class Model_ETC:
  """ExtraTrees model that is fitted on construction."""

  def __init__(self, trainX, trainY, seed):
    """Fit a 500-tree ExtraTrees classifier on ``(trainX, trainY)``.

    CSR/COO sparse input is converted to a dense array first; ``seed`` is
    used as ``random_state``.
    """
    self.model = ExtraTreesClassifier(
        n_estimators=500,
        random_state=seed
    )
    self.model.fit(self._densify(trainX), trainY)

  @staticmethod
  def _densify(matrix):
    """Return CSR/COO sparse matrices as dense arrays, anything else as is."""
    # Public class names instead of the deprecated private
    # scipy.sparse.csr / scipy.sparse.coo namespaces, and isinstance
    # instead of an exact type comparison.
    if isinstance(matrix, (scipy.sparse.csr_matrix, scipy.sparse.coo_matrix)):
      return matrix.toarray()
    return matrix

  def predict(self, testX):
    """Return the second ``predict_proba`` column for every row of ``testX``."""
    predictions = self.model.predict_proba(self._densify(testX))[:,1]
    return predictions
Beispiel #54
0
def etc_level2(train_x, train_y, test_x, seed):
    """Fit the second-level ExtraTrees model and return its class
    probabilities for ``test_x`` as a float32 array."""
    settings = dict(
        n_estimators=1000,
        max_features=50,
        criterion='entropy',
        min_samples_split=4,
        max_depth=35,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=seed,
        verbose=2,
    )
    forest = ExtraTreesClassifier(**settings)
    forest.fit(train_x, train_y)
    proba = forest.predict_proba(test_x)
    return proba.astype(np.float32)
Beispiel #55
0
def cross_val(clf_name, X, y, n_folds=5, proba=False, score=accuracy_score, *params, **kwargs):
    """Cross-validate one of a fixed set of named classifiers.

    Prints the score of every fold and the score over all out-of-fold
    predictions, then returns those predictions.

    :param clf_name: key selecting the classifier ("extra", "grad", "cgrad",
        "cmulti", "multi", "bag", "bern", "gauss", "random", "lda",
        "logistic", "svm", "knn", "near", "ridge" or "sgd").
    :param X: feature matrix, indexed as ``X[rows, :]``.
    :param y: label array, indexed as ``y[rows]``.
    :param n_folds: number of stratified folds.
    :param proba: if true, collect ``predict_proba`` output instead of
        ``predict`` output.
    :param score: callable ``score(y_true, y_pred)`` used for reporting.
    :param params: extra positional arguments forwarded to the classifier
        constructor (not used by "sgd").
    :param kwargs: extra keyword arguments forwarded to the classifier
        constructor (not used by "sgd").
    :return: out-of-fold predictions; shaped like ``y`` for class predictions,
        one row per sample for probabilities.
    :raises ValueError: if ``clf_name`` is not one of the known keys.
    """
    # NOTE(review): StratifiedKFold(y, n_folds=...) looks like the legacy
    # sklearn.cross_validation signature -- confirm against the file's imports.
    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=41)
    if clf_name == "extra":
        c = ExtraTreesClassifier(12, max_depth=23, max_features=10, n_jobs=-1, *params, **kwargs)
    elif clf_name == "grad":
        c = GradientBoostingClassifier(n_estimators=40, learning_rate=0.1, *params, **kwargs)
    elif clf_name == "cgrad":
        c = CalibratedClassifierCV(base_estimator=GradientBoostingClassifier(n_estimators=20, learning_rate=0.1, *params, **kwargs), method='isotonic', cv=10)
    elif clf_name == "cmulti":
        # `alpha_multi` is read from module scope; it is not defined in this function.
        c = CalibratedClassifierCV(base_estimator=MultinomialNB(alpha=alpha_multi, *params, **kwargs), method='isotonic', cv=10)
    elif clf_name == "multi":
        c = MultinomialNB(*params, **kwargs)
    elif clf_name == "bag":
        c = BaggingClassifier(base_estimator=MultinomialNB(alpha=0.5, *params, **kwargs), n_estimators=100, n_jobs=-1)
    elif clf_name == "bern":
        c = BernoulliNB(alpha=0.00000000001, *params, **kwargs)
    elif clf_name == "gauss":
        c = GaussianNB(*params, **kwargs)
    elif clf_name == "random":
        c = RandomForestClassifier(1200, max_depth=23, max_features=10, n_jobs=-1, *params, **kwargs)
    elif clf_name == "lda":
        c = LinearDiscriminantAnalysis(*params, **kwargs)
    elif clf_name == "logistic":
        c = LogisticRegression(C=1, *params, **kwargs)
    elif clf_name == "svm":
        c = LinearSVC(C=100, *params, **kwargs)
    elif clf_name == "knn":
        c = KNeighborsClassifier(n_neighbors=20, *params, **kwargs)
    elif clf_name == "near":
        c = NearestCentroid(*params, **kwargs)
    elif clf_name == "ridge":
        c = OneVsOneClassifier(RidgeClassifier(alpha=0.1, *params, **kwargs))
    elif clf_name == "sgd":
        c = SGDClassifier(loss="hinge", penalty="l2", n_iter=50, alpha=0.000001, fit_intercept=True, average=True)
    else:
        # Fail with a clear message instead of an unbound-local error at fit time.
        raise ValueError("Unknown clf_name: %r" % (clf_name,))

    # Class predictions fit a buffer shaped like y. Probability predictions
    # have extra columns, so that buffer is sized from the first fold's output.
    y_pred = None if proba else np.zeros(y.shape)
    score_list = []
    for i, (train, test) in enumerate(cv):
        c.fit(X[train, :], y[train])
        if proba:
            fold_pred = c.predict_proba(X[test, :])
            if y_pred is None:
                y_pred = np.zeros((y.shape[0],) + fold_pred.shape[1:])
        else:
            fold_pred = c.predict(X[test, :])
        y_pred[test] = fold_pred
        score_list.append(score(y[test], y_pred[test]))
        print(score_list[i])
    print("Final score", score(y, y_pred))
    return y_pred
Beispiel #56
0
class MyExtraTree(MyClassifier):
    """Thin wrapper that builds an ExtraTreesClassifier from a params dict."""

    def __init__(self, params=None):
        """Create the wrapped classifier.

        :param params: keyword arguments for ``ExtraTreesClassifier``;
            ``None`` means no arguments.
        """
        # Keep a private copy: a shared default dict (or the caller's dict)
        # must not be changed by a later update_params() call.
        self._params = dict(params) if params is not None else {}
        self._extree = ExtraTreesClassifier(**(self._params))

    def update_params(self, updates):
        """Merge ``updates`` into the stored parameters and rebuild the classifier.

        The rebuilt classifier is a new, unfitted instance.

        :param updates: dict of parameters to add or override.
        """
        self._params.update(updates)
        self._extree = ExtraTreesClassifier(**(self._params))

    def fit(self, Xtrain, ytrain):
        """Fit the wrapped classifier on ``Xtrain`` / ``ytrain``."""
        self._extree.fit(Xtrain, ytrain)

    # def predict(self, Xtest, option = None):
    #   return self._extree.predict(Xtest)

    def predict_proba(self, Xtest, option=None):
        """Return column 1 of the wrapped classifier's ``predict_proba``.

        :param Xtest: features to score.
        :param option: accepted but not used by this implementation.
        """
        return self._extree.predict_proba(Xtest)[:, 1]
Beispiel #57
0
def etclassifier(training_samples, eval_samples, do_grid_search=True):
    """Fit an extra-trees classifier and predict on the evaluation features.

    With ``do_grid_search`` the classifier is wrapped in a 5-fold
    ``GridSearchCV`` over ``max_features``, ``min_samples_split`` and
    ``min_samples_leaf``, and the search results are printed. Otherwise the
    plain classifier is fitted and its 5-fold cross-validation scores are
    printed.

    :param training_samples: pair ``(X_train, Y_train)``.
    :param eval_samples: pair ``(X_eval, Y_eval)``; only ``X_eval`` is used.
    :param do_grid_search: whether to tune hyper-parameters by grid search.
    :return: tuple ``(predicted_labels, predicted_probabilities)`` for
        ``X_eval``.
    """
    X_train, Y_train = training_samples
    # The evaluation labels are not needed: only predictions are returned.
    X_eval, _ = eval_samples

    clf = ExtraTreesClassifier(max_depth=None, n_estimators=1000,
                               min_weight_fraction_leaf=0.0, max_features=None,
                               min_samples_split=16, criterion='gini',
                               min_samples_leaf=2, max_leaf_nodes=None,
                               oob_score=False, bootstrap=True,
                               n_jobs=10, random_state=None, verbose=0,
                               warm_start=False, class_weight=None)
    to_be_tuned_parameters = {
        #'n_estimators':[500, 2000, 4000],
        'max_features': ['log2', 'auto', None],
        'min_samples_split': [2, 8, 16],
        'min_samples_leaf': [1, 2],
    }
    # NOTE(review): scoring='log_loss', grid_scores_ and the cross_validation
    # module look like legacy scikit-learn names -- confirm the installed
    # version still provides them.
    if do_grid_search:
        clf = GridSearchCV(clf, to_be_tuned_parameters, cv=5, n_jobs=5, scoring='log_loss')
    #Best parameters set found on development set:
    #()
    #{'max_features': None, 'min_samples_split': 10, 'n_estimators': 1000, 'min_samples_leaf': 2}

    print(clf)
    clf.fit(X_train, Y_train)
    if do_grid_search:
        print("Best parameters set found on development set:")
        print()

        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        for params, mean_score, scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean_score, scores.std() * 2, params))

    else:
        scores = cross_validation.cross_val_score(clf, X_train, Y_train, cv=5, n_jobs=5, scoring='log_loss')
        # Single formatted string: same space-separated output as the former
        # print statement, and valid syntax on both Python 2 and Python 3.
        print("%s %s %s" % (scores, np.mean(scores), np.median(scores)))
    Y_pred = clf.predict(X_eval)
    Y_prob = clf.predict_proba(X_eval)
    return Y_pred, Y_prob
Beispiel #58
0
def testET(X_train, X_test, y_train, y_test):
    """
    Fit an extra-trees classifier, print its test ROC AUC and return the
    probabilities that were scored.

    :param X_train: training data
    :param X_test: test data
    :param y_train: training labels
    :param y_test: test labels, used only to compute the printed AUC

    :return: column 1 of ``predict_proba`` for the test data
    """
    model = ExtraTreesClassifier(n_estimators=1000, max_features=5, n_jobs=-1, verbose=False)
    model.fit(X_train, y_train)

    class_probabilities = model.predict_proba(X_test)
    positive_scores = class_probabilities[:, 1]

    test_auc = metrics.roc_auc_score(y_test, positive_scores)
    print('ET AUC:', test_auc)
    return positive_scores
Beispiel #59
0
def main():
    """Train extra-trees on the Walmart training CSV and write test probabilities.

    Reads the training file, fits a 500-tree ``ExtraTreesClassifier`` on all
    columns except ``TripType`` (the target) and ``VisitNumber``, predicts
    class probabilities for the test file and writes them to CSV, indexed by
    ``VisitNumber`` with one ``TripType_<n>`` column per class.
    """
    ## read training dataset
    traindataset = pd.read_csv('/usr3/graduate/xysun/walmart/traindata.csv')
    target = traindataset['TripType']
    # axis is passed by keyword; the positional form is rejected by current pandas.
    traindata_feature = traindataset.drop(['TripType', 'VisitNumber'], axis=1)

    etc = ExtraTreesClassifier(n_estimators=500, bootstrap=False, n_jobs=-1)
    etc = etc.fit(traindata_feature, target)

    ## test models
    testdataset = pd.read_csv('/usr3/graduate/xysun/walmart/testdata.csv')
    testdata = testdataset.set_index('VisitNumber')
    result = etc.predict_proba(testdata)

    # Assumes the classifier's class order is exactly these 38 trip types in
    # this order -- TODO confirm against etc.classes_.
    trip_types = [3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 18, 19,
                  20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
                  30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
                  40, 41, 42, 43, 44, 999]
    etc_csv = pd.DataFrame(result, index=testdata.index)
    etc_csv.index.name = "VisitNumber"
    etc_csv.columns = ["TripType_%d" % trip_type for trip_type in trip_types]

    # NOTE(review): the output directory is 'xysunn' while both inputs are
    # read from 'xysun' -- possibly a typo; confirm before relying on it.
    # `sep` is the DataFrame.to_csv keyword for the field delimiter.
    etc_csv.to_csv('/usr3/graduate/xysunn/walmart/etc_csv.csv', header=True, index=True, sep=',')
def cross_val(training_df, frac):
    """Score an equal-weight random-forest / extra-trees blend on one split.

    ``shuffle_and_sample(training_df, frac)`` supplies the train and test
    parts. Each model is fitted on the module-level ``features`` columns
    against ``"sponsored0"``, and the two column-1 probability vectors are
    averaged.

    :param training_df: frame handed to ``shuffle_and_sample``.
    :param frac: fraction handed to ``shuffle_and_sample``.
    :return: ROC AUC of the averaged probabilities on the test part.
    """
    train_cv, test_cv = shuffle_and_sample(training_df, frac)

    fold_probs = []
    # Random forest first, then extra trees; each model is dropped as soon
    # as its predictions have been taken.
    for estimator_cls in (RandomForestClassifier, ExtraTreesClassifier):
        model = estimator_cls(n_estimators=100, random_state=1, n_jobs=-1)
        model.fit(train_cv[features], train_cv["sponsored0"])
        fold_probs.append(model.predict_proba(test_cv[features])[:, 1])
        del model

    rf_pred, et_pred = fold_probs
    blended = (rf_pred + et_pred) / 2
    return roc_auc_score(test_cv["sponsored0"].values, blended)