Example No. 1
 def fit(self, X, Y, W):
     self.imputer = Imputer()
     self.imputer.fit(X)
     X = replace_nan(X, self.imputer)
     rf_model = RandomForest(**self.params)
     rf_model.fit(X, Y.ravel())
     return RandomForestClassifier(rf_model, self.imputer)
Example No. 2
def calculateFeatureAuto(trainX, trainY, testX, testY):
    first_rs2 = np.array([])
    for i in range(1, 151, 1):
        reg1 = RandomForest(n_estimators=i, max_depth=7, max_features="auto")
        reg1.fit(trainX, trainY)
        predict = reg1.predict(testX)
        rss = calculateRSS(testY, predict)
        first_rs2 = np.append(first_rs2, rss)
    return first_rs2
Example No. 3
def calculateFeatureSqrt(trainX, trainY, testX, testY):
    second_rs2 = np.array([])
    for i in range(1, 151, 1):
        reg2 = RandomForest(n_estimators=i, max_depth=7, max_features="sqrt")
        reg2.fit(trainX, trainY)
        predict = reg2.predict(testX)
        rss = calculateRSS(testY, predict)
        second_rs2 = np.append(second_rs2, rss)
    return second_rs2
Example No. 4
def calculateFeature4(trainX, trainY, testX, testY):
    forth_rs2 = np.array([])
    for i in range(1, 151, 1):
        reg3 = RandomForest(n_estimators=i, max_depth=7, max_features=4)
        reg3.fit(trainX, trainY)
        predict = reg3.predict(testX)
        rss = calculateRSS(testY, predict)
        forth_rs2 = np.append(forth_rs2, rss)
    return forth_rs2
Example No. 5
 def __init__(self, n=10, **kwargs):
     # keyword arguments are passed on to scikit-learn's random forest implementation
     # see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
     # relevant kwargs (* indicates default):
     #     n (int): 10* (number of trees in the random forest)
     #     n_jobs (int): 1* or more (cores used to parallelize fitting and prediction across the trees)
     super(RandomForestClassifier, self).__init__("Random Forest",
                                                  n=n,
                                                  **kwargs)
     self._forest = RandomForest(n_estimators=n, **kwargs)
Example No. 6
 def getModel(self, _params):
   return RandomForest(
     n_estimators= int(_params['n_estimators']),
     # criterion= _params['criterion'],
     max_depth= _params['max_depth'],
     # min_samples_split= _params['min_samples_split'],
     # min_samples_leaf= _params['min_samples_leaf'],
     min_weight_fraction_leaf= _params['min_weight_fraction_leaf'],
     max_features= _params['max_features'],
     # max_leaf_nodes= int(_params['max_leaf_nodes']),
     # min_impurity_decrease= _params['min_impurity_decrease'],
     # bootstrap= _params['bootstrap'],
     oob_score= _params['oob_score'],
     # ccp_alpha= _params['ccp_alpha'],
     n_jobs= definitions.getNumberOfCore(),
   )
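
For context, a minimal, hypothetical sketch of the kind of parameter dictionary getModel reads above; the values are arbitrary placeholders, and n_jobs=4 stands in for definitions.getNumberOfCore():

from sklearn.ensemble import RandomForestClassifier as RandomForest

# Hypothetical parameter values; the keys mirror those read in getModel above
_params = {
    'n_estimators': 200,
    'max_depth': 12,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 'sqrt',
    'oob_score': True,
}
model = RandomForest(
    n_estimators=int(_params['n_estimators']),
    max_depth=_params['max_depth'],
    min_weight_fraction_leaf=_params['min_weight_fraction_leaf'],
    max_features=_params['max_features'],
    oob_score=_params['oob_score'],
    n_jobs=4,  # stand-in for definitions.getNumberOfCore()
)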
Example No. 7
def main(args):
    start = time.time()

    if not args.all_feats:
        data = pickle.load(open(args.pruned_ds, 'rb'))
    else:
        data = pickle.load(open(args.full_ds, 'rb'))
        data = np.array([feats[1] for feats in data])

    X = data[:, 1:]
    y = data[:,0]

    if args.num_folds > 0:
        print(f'Performing {args.num_folds}-fold validation')
        f_scores = kfold_validation(X, y, algorithm=args.algorithm, num_folds=args.num_folds)
        accs = kfold_scores(f_scores)
        print(f_scores)
        print(f'Average accuracy of {args.num_folds}-folds: {100*accs[0]:.2f}%')
        print(f'Best accuracy of {args.num_folds}-folds: {100*accs[1]:.2f}%')
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=args.seed)
        print(f'Train data: {X_train.shape}, train labels: {y_train.shape}')
        print(f'Test data: {X_test.shape}, test labels: {y_test.shape}')

        if args.algorithm == 'NB':
            model = BernoulliNB()
            model.fit(X_train, y_train)

        if args.algorithm == 'RF':
            model = RandomForest(n_estimators=100, max_depth=10, n_jobs=os.cpu_count(), verbose=2)
            model.fit(X_train, y_train)

        if args.algorithm == 'XGB':
            model = XGB(verbosity=1, n_estimators=1000, max_depth=8, reg_lambda=1e-2, reg_alpha=4)
            model.fit(X_train, y_train, eval_set=[(X_test,y_test)], eval_metric='logloss', verbose=True, early_stopping_rounds=20)

        # test model
        test_model(model, X_test, y_test)

    print(f'Script completed in {time.time()-start:.2f} secs')

    return 0
Example No. 8
def get_classifier(cls, param):
    if cls == "LR":
        return LR(C=param, random_state=123)
    elif cls == "KNN":
        return KNN(n_neighbors=param)
    elif cls == "RForest":
        return RandomForest(n_estimators=75, max_depth=param, random_state=123)
    elif cls == "BagTree":
        return Bagging(base_estimator=DecisionTree(max_depth=param,
                                                   random_state=123),
                       random_state=123)
    elif cls == "Perceptron":
        return Perceptron(eta0=param, random_state=123)
    elif cls == "MLP":
        return MLP(hidden_layer_sizes=(20, ),
                   alpha=param,
                   max_iter=40,
                   solver='lbfgs')  #too slow
    else:
        pass
Example No. 9
def kfold_validation(features, labels, algorithm='XGB', num_folds=2):

    kf = KFold(n_splits=num_folds)
    kf.get_n_splits(features)

    fold_scores = {'train':[], 'val':[]} 

    fold_num = 0
    for train_idx, val_idx in kf.split(features):
        fold_num += 1
        print(f'Training on fold {fold_num}')
        X_train, y_train = features[train_idx], labels[train_idx]
        X_val, y_val = features[val_idx], labels[val_idx]

        if algorithm == 'NB':
            model = BernoulliNB()
            model.fit(X_train, y_train)

        if algorithm == 'RF':
            model = RandomForest(n_estimators=100, max_depth=10, n_jobs=os.cpu_count(), verbose=2)
            model.fit(X_train, y_train)

        if algorithm == 'XGB':
            model = XGB(verbosity=1, n_estimators=1000, max_depth=3, reg_lambda=1, reg_alpha=1e-4)
            model.fit(X_train, y_train, eval_set=[(X_val,y_val)], eval_metric='logloss', verbose=True, early_stopping_rounds=20)

        train_score = model.score(X_train, y_train)
        fold_scores['train'].append(train_score)

        val_score = model.score(X_val, y_val)
        fold_scores['val'].append(val_score)

        print(f'Fold {fold_num}: training score = {train_score}, validation score = {val_score}')

        with open('fold_accs_random_forest.npy', 'wb') as outfile:
            pickle.dump(fold_scores, outfile)

    return fold_scores
Example No. 10
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn import cross_validation
from utils import *

# Read data
X, y = get_train_data("../data/train.csv")

# Parameters to test
parameter_space = [[320], [340], [360], [380], [400]]

# Cross validation
parameter_scores = []
for parameter in parameter_space:
    clf = RandomForest(n_estimators=parameter[0], n_jobs=2)  # criterion='entropy'
    scores = cross_validation.cross_val_score(clf, X, y, cv=4, scoring='log_loss', verbose=3)
    parameter_scores.append(np.mean(scores * -1))

# Show results
print "Logloss: " + str(parameter_scores)
Example No. 11
def reproducing_sakar():
    classifiers = {
        "Naive Bayes": NaiveBayes(),
        "Logistic Regression": LogisticRegression(),
        "k-NN": KNN(p=1, n_neighbors=1),
        "Multilayer Perceptron": MLP(),
        "Random Forest": RandomForest(n_estimators=100),
        "SVM (Linear)": SVM(kernel="linear", gamma="auto"),
        "SVM (RBF)": SVM(kernel="rbf", gamma="auto")
    }

    scores = {
        "subject": [],
        "Naive Bayes": [],
        "Logistic Regression": [],
        "k-NN": [],
        "Multilayer Perceptron": [],
        "Random Forest": [],
        "SVM (Linear)": [],
        "SVM (RBF)": []
    }

    f1s = {
        "subject": [],
        "Naive Bayes": [],
        "Logistic Regression": [],
        "k-NN": [],
        "Multilayer Perceptron": [],
        "Random Forest": [],
        "SVM (Linear)": [],
        "SVM (RBF)": []
    }

    mccs = {
        "subject": [],
        "Naive Bayes": [],
        "Logistic Regression": [],
        "k-NN": [],
        "Multilayer Perceptron": [],
        "Random Forest": [],
        "SVM (Linear)": [],
        "SVM (RBF)": []
    }

    voting = {"subject": [], "voted": [], "true": []}

    df = pd.read_csv("parkinsons.csv")
    df = df.drop(["gender"], axis=1)

    for i in range(252):
        print("SUBJECT {}".format(i))

        scores["subject"].append(i)
        f1s["subject"].append(i)
        mccs["subject"].append(i)

        train_set = df.loc[df["id"] != i].drop(["id"], axis=1)
        test_set = df.loc[df["id"] == i].drop(["id"], axis=1)

        X_train = train_set.drop(["class"], axis=1)
        y_train = train_set["class"]
        X_test = test_set.drop(["class"], axis=1)
        y_test = test_set["class"]

        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        #pca = PCA(n_components=50)
        pca = LDA()
        pca.fit(X_train, y_train)
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)

        print(X_train_pca.shape)

        predictions = []
        for name, classifier in classifiers.items():
            classifier.fit(X_train_pca, y_train)
            pred = classifier.predict(X_test_pca)

            score = round(accuracy_score(y_test, pred), 2)
            f1 = round(f1_score(y_test, pred), 2)
            mcc = round(matthews_corrcoef(y_test, pred), 2)

            scores[name].append(score)
            f1s[name].append(f1)
            mccs[name].append(mcc)

            predictions.extend(list(pred))

            print("{:<25}{} {} {}".format(name, score, f1, mcc))

        voted_label = mode(predictions)
        true_label = list(y_test)[0]

        voting["subject"].append(i)
        voting["voted"].append(voted_label)
        voting["true"].append(true_label)

        print("Voted/True: {}/{}".format(voted_label, true_label))
        print()

    scores = pd.DataFrame(scores)
    scores.to_csv("results/scores.csv", index=None)

    f1s = pd.DataFrame(f1s)
    f1s.to_csv("results/f1s.csv", index=None)

    mccs = pd.DataFrame(mccs)
    mccs.to_csv("results/mccs.csv", index=None)

    voting = pd.DataFrame(voting)
    voting.to_csv("results/voting.csv", index=None)

    print(scores)
    print(f1s)
    print(mccs)
    print(voting)
Example No. 12
X, y = get_train_data('../features_all.csv', '../trainLabels.csv')

# Parameters space creation
params_space = [[200]]

# Grid search
grid_errors = []
for params in params_space:

    # Cross validation
    skf = StratifiedKFold(y, n_folds=8)

    errors = []
    for train, test in skf:

        clf = RandomForest(n_estimators=params[0], n_jobs=2)
        clf.fit(X[train], y[train])
        predictions = clf.predict(X[test])

        kappa_score = kappa(y[test], predictions, weights='quadratic')
        print "Kappa: %f" % kappa_score
        print "Confusion matrix:"
        print confusion_matrix(y[test], predictions)
        print "Classification report:"
        print classification_report(y[test], predictions)

        errors.append(kappa_score)

    grid_errors.append(np.mean(errors))

# Show results
Example No. 13
the predictor is picked based on whether it contributes more to the model's
accuracy - essentially choosing the predictor that most reduces the prediction
error.

Now, a Random Forest is a collection of different decision trees, and the
forest's prediction is the average of the individual trees' predictions (for
regression, a plain mean). Randomness is introduced at each split of each tree,
so that only a random subset of the predictors is even considered for the
split.

Below we fit the model and use it to predict our response on the training and
test sets.
'''

# Create the random forest with 1000 trees
forest = RandomForest(n_estimators=1000,
                               criterion='mse',
                               random_state=1,
                               n_jobs=-1)

# Fit training data to model
forest.fit(x_train2, y_train)
# Use the fitted model to predict on the training and test data
rfy_train_pred = forest.predict(x_train2)
rfy_test_pred = forest.predict(x_test2)

# Look at the performance measures in terms of RMSE and R-Squared
print('RF RMSE train: %.3f, test: %.3f' % (
        MSE(y_train, rfy_train_pred)**(1/2),
        MSE(y_test, rfy_test_pred)**(1/2)))
print('RF R^2 train: %.3f, test: %.3f' % (
        R2(y_train, rfy_train_pred),
        R2(y_test, rfy_test_pred)))
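
As a sanity check on the averaging described in the comment block above, the following sketch (assuming the fitted forest and x_test2 from this example are still in scope) compares forest.predict with the plain mean of the individual trees' predictions; in scikit-learn's RandomForestRegressor the forest output is the unweighted mean over trees.

import numpy as np

# Each fitted tree is exposed in forest.estimators_; averaging their
# predictions should reproduce forest.predict() up to float32 rounding.
tree_preds = np.stack([tree.predict(x_test2) for tree in forest.estimators_])
manual_mean = tree_preds.mean(axis=0)
print('Max |manual mean - forest.predict|: %.6f'
      % np.max(np.abs(manual_mean - forest.predict(x_test2))))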
Example No. 14
(trainData, testData, trainLabels, testLabels) = train_test_split(np.array(features),
                                                                  np.array(labels),
                                                                  test_size=vali_size,
                                                                  random_state=seed)

print("[INFO] splitted train and test data...")
print("[INFO] train data  : {}".format(trainData.shape))
print("[INFO] test data   : {}".format(testData.shape))
print("[INFO] train labels: {}".format(trainLabels.shape))
print("[INFO] test labels : {}".format(testLabels.shape))

# use logistic regression as the model
print("[INFO] creating model...")

model_LR = LogisticRegression(random_state=seed)
model_RF = RandomForest(n_estimators=100, random_state=seed)
model_SVC = SVC(probability=True,random_state = seed)

model_LR.fit(trainData, trainLabels)
#model_RF.fit(trainData, trainLabels)
#model_SVC.fit(trainData, trainLabels)

testLabels_pre = model_LR.predict(testData)
score_LR = accuracy_score(testLabels, testLabels_pre)
#testLabels_pre = model_RF.predict(testData)
#score_RF = accuracy_score(testLabels, testLabels_pre)
#testLabels_pre = model_SVC.predict(testData)
#score_SVC = accuracy_score(testLabels, testLabels_pre)
#testLabels_pre = model_vote.predict(testData)
#score_vote = accuracy_score(testLabels, testLabels_pre)
#print('Vote accuracy:',score_vote)
Example No. 15
            open('svm_rbf_grid_results.p', 'wb'))

### Single Decision Tree
DecisionTree_Model = OneVsRestClassifier(
    tree.DecisionTreeClassifier(criterion='gini'))
grid = GridSearchCV(DecisionTree_Model,
                    param_grid={'estimator__max_depth': range(1, 10)},
                    scoring=hamming_scorer)
grid.fit(X_data_std, y_data)
grid.cv_results_['mean_test_score']
grid.best_score_

### Random Forest: Classical Random Forest
# Tune: max_depth, min_samples_leaf

RandomForest_Model = OneVsRestClassifier(RandomForest())
rf_grid = GridSearchCV(RandomForest_Model,
                       param_grid={'estimator__max_depth': [10, 20, 30]})

rf_grid.fit(X_data_std, y_data)
rf_grid.cv_results_['mean_test_score']
rf_grid.best_score_
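
The comment above also mentions tuning min_samples_leaf; a possible extension of the same grid might look like the sketch below (the parameter values are arbitrary, and hamming_scorer is the scorer used earlier in this snippet).

rf_grid2 = GridSearchCV(OneVsRestClassifier(RandomForest()),
                        param_grid={'estimator__max_depth': [10, 20, 30],
                                    'estimator__min_samples_leaf': [1, 5, 10]},
                        scoring=hamming_scorer)
rf_grid2.fit(X_data_std, y_data)
rf_grid2.best_params_
rf_grid2.best_score_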

### AdaBoost (boosted decision trees): 100 estimators
# NOTE: Need to tune learning rate
Ada_Model = OneVsRestClassifier(AdaBoostClassifier(n_estimators=100))
Ada_Model.fit(X_data_std, y_data)
scores = cross_val_score(Ada_Model, X_data_std, y_data, scoring=hamming_scorer)

### Gradient boosting (GBRT)
GBRT_Model = OneVsRestClassifier(GradientBoostingClassifier(n_estimators=100))
Example No. 16
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.model_selection import cross_val_score

from text_tokenizer import TextTokenizer

from utils_files import read_lines
from utils_files import load_object


algorithms = {
	"logistic-regression": LogisticRegression(),
	"naive-bayes": BernoulliNB(),
	"linear-svc": LinearSVC(),
	"random-forest": RandomForest(n_estimators = 100, n_jobs = -1)
}




class NodeClassif(object):

	""" Represents a hierarchical node classifier
	Attributes:
	----------
		model:
			type: sklearn.estimator
			info: trained classifier model
		selector:
			type: SelectPercentile
Example No. 17
#print(y_train)
'''
for c in [0.01, 0.05, 0.25, 0.5, 0.6, 0.75, 1]:
    pclf = Pipeline([
        ('vect', CountVectorizer(binary=True)),
#        ('tfidf', TfidfTransformer()),
        ('norm', Normalizer()),
        ('clf', LogisticRegression(C=c)),
    ])

    pclf.fit(X_train, y_train)
    y_pred = pclf.predict(X_test)

    print("C = %s"%(c))
    print(metrics.classification_report(y_test, y_pred))
'''

for value in [1]:
    pclf = Pipeline([
        ('vect', CountVectorizer()),
        #        ('tfidf', TfidfTransformer()),
        #        ('norm', Normalizer()),
        ('clf', RandomForest(n_estimators=100, max_depth=500)),
    ])

    pclf.fit(X_train, y_train)
    y_pred = pclf.predict(X_test)

    print("C = %s" % (value))
    print(metrics.classification_report(y_test, y_pred))
Example No. 18
# Convert format: encode 'Sex' as numeric
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})

# Fill in missing values
data['Age'] = data['Age'].fillna(data['Age'].median())

# Drop unused columns
data = data.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

# Split into the features (passenger ID and survival label removed) and the survival labels
X = data.drop(['Survived', 'PassengerId'], axis=1).values
y = data['Survived'].values

# Training
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = RandomForest(n_estimators=100).fit(X_train, y_train)

# Test
target = pd.read_csv('../input/titanic/test.csv')

target['Sex'] = target['Sex'].map({'female': 0, 'male': 1})
target['Age'] = target['Age'].fillna(target['Age'].median())
target['Fare'] = target['Fare'].fillna(target['Fare'].median())
target = target.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

# Drop the passenger ID to get the feature matrix
X_target = target.drop(['PassengerId'], axis=1).values
result = model.predict(X_target)

target['Survived'] = result
target[["PassengerId","Survived"]].to_csv("submission2.csv",index=False)
Example No. 19
def calculateFeatureSqrt(trainX, trainY, testX, testY):
    second_rs2 = np.array([])
    for i in range(1, 151, 1):
        reg2 = RandomForest(n_estimators=i, max_depth=7, max_features="sqrt")
        reg2.fit(trainX, trainY)
        predict = reg2.predict(testX)
        rss = calculateRSS(testY, predict)
        second_rs2 = np.append(second_rs2, rss)
    return second_rs2


auto = calculateFeatureAuto(train_x, train_y, test_X, test_Y)
sqrt = calculateFeatureSqrt(train_x, train_y, test_X, test_Y)
forth = calculateFeature4(train_x, train_y, test_X, test_Y)

y_reg1 = RandomForest(n_estimators=150, max_depth=7, max_features=4)
y_reg1.fit(train_x, train_y)
pred = y_reg1.predict(test_X)

y_reg2 = RandomForest(n_estimators=150, max_depth=1, max_features=4)
y_reg2.fit(train_x, train_y)
pred2 = y_reg2.predict(test_X)

plt.ylabel('R^2 Score')
plt.xlabel('Number of Estimators (decision trees)')

arrArange = np.arange(1, 151, 1)  # one x value per n_estimators setting

plt.plot(arrArange, auto, "r", label='Auto')
plt.plot(arrArange, sqrt, "b", label='Sqrt')
plt.plot(arrArange, forth, "g", label='Four')
Example No. 20
            "logistic_ucb",
            "logistic_egreedy",
    ]:
        kwargs["epsilon"] = 0.01
    policy = counterfactual_policy_dict[counterfactual_policy](**kwargs)
    # compared OPE estimators
    ope_estimators = [
        DirectMethod(),
        InverseProbabilityWeighting(),
        SelfNormalizedInverseProbabilityWeighting(),
        DoublyRobust(),
        SelfNormalizedDoublyRobust(),
        SwitchDoublyRobust(),
    ]
    # a base ML model for regression model used in Direct Method and Doubly Robust
    base_model = CalibratedClassifierCV(RandomForest(**hyperparams))

    evaluation_of_ope_results = {
        est.estimator_name: np.zeros(n_runs)
        for est in ope_estimators
    }
    for i in np.arange(n_runs):
        # sample a new set of logged bandit feedback
        bandit_feedback = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # run a counterfactual bandit algorithm on logged bandit feedback data
        selected_actions = run_bandit_simulation(
            bandit_feedback=bandit_feedback, policy=policy)
        # estimate the ground-truth policy values of the counterfactual policy
        # using the full expected reward contained in the bandit feedback dictionary
        ground_truth_policy_value = bandit_feedback["expected_reward"][
Example No. 21
def f(id, filtered_df, pos_stem, name):
    result = []
    for n in range(1, 4):
        for pca_n in [0, 2]:
            print "Worker " + str(id) + " : " + str(n) + "/3 and " + str(
                pca_n + 1) + "/2"
            vectorized_1gram, vectorized_1gram_names = vectorize(pos_stem, n)

            if pca_n != 0:
                # project to the data onto the two axes
                bag_to_use = make_pca(vectorized_1gram, pca_n)
            else:
                bag_to_use = vectorized_1gram

            y = filtered_df['y'].values

            x_train, x_test, y_train, y_test = train_test_split(
                bag_to_use, y, test_size=0.4, random_state=42)

            # to keep track of the best model
            best_avg = 0
            best_trees_avg = None
            best_depth_avg = None
            best_nodes_avg = None

            ### RF CV
            # parameters for tuning
            n_trees = np.arange(10, 200, 20)
            depths = np.arange(2, 10)
            leaf_nodes = np.arange(2, 10)
            num_folds = 4

            # iterate through trees and depths
            for nodes in leaf_nodes:
                for trees in n_trees:
                    for depth in depths:
                        # cross validation for every experiment
                        k_folds = KFold(x_train.shape[0],
                                        n_folds=num_folds,
                                        shuffle=True)
                        scores = []

                        # for each fold
                        for train_indices, validation_indices in k_folds:
                            # generate training data
                            x_train_cv = x_train[train_indices]
                            y_train_cv = y_train[train_indices]
                            # generate validation data
                            x_validate = x_train[validation_indices]
                            y_validate = y_train[validation_indices]

                            # fit random forest on training data
                            rf = RandomForest(n_estimators=trees,
                                              max_depth=depth,
                                              max_leaf_nodes=nodes,
                                              class_weight='balanced')
                            rf.fit(x_train_cv, y_train_cv)
                            print "HITTING"
                            # score on validation data
                            scores += [simulate(x_validate, rf, filtered_df)]

                        # record and report accuracy
                        average_score = np.mean(scores, axis=0)

                        # update our record of the best parameters seen so far
                        if np.mean(average_score) >= best_avg:
                            best_avg = np.mean(average_score)
                            best_trees_avg = trees
                            best_depth_avg = depth
                            best_nodes_avg = nodes

            result += [(name, n, pca_n, best_trees_avg, best_depth_avg,
                        best_nodes_avg, best_avg)]
    return result
Example No. 22
from sklearn.ensemble import RandomForestClassifier as RandomForest
from utils import *

# Read data
X_train, y_train = get_train_data("../data/train.csv")
X_test = get_test_data("../data/test.csv")

# Fit model and make predictions
clf = RandomForest(n_estimators=360, n_jobs=2, verbose=1)
clf.fit(X_train, y_train)
predictions = clf.predict_proba(X_test)

# Save submission to file
save_submission(predictions, 'rf.csv')
Example No. 23
bc_dataset = datasets.load_breast_cancer()

# Initialize label encoder
label_encoder = preprocessing.LabelEncoder()

# define target (what we want to predict) and train (data used to predict)
target = bc_dataset.target
train = bc_dataset.data

# Split: train 60% test 40%
from sklearn.cross_validation import train_test_split
split = train_test_split(train, target, test_size=0.4, random_state=42)
data_train, data_test, target_train, target_test = split

## train with RandomForest
model = RandomForest(n_estimators=10)
model.fit(data_train, target_train)
print 'perf: '
print model.score(data_test, target_test)

# which variables have the most impact?
features = [
    'mean radius', 'mean texture', 'mean perimeter', 'mean area',
    'mean smoothness', 'mean compactness', 'mean concavity',
    'mean concave points', 'mean symmetry', 'mean fractal dimension',
    'radius error', 'texture error', 'perimeter error', 'area error',
    'smoothness error', 'compactness error', 'concavity error',
    'concave points error', 'symmetry error', 'fractal dimension error',
    'worst radius', 'worst texture', 'worst perimeter', 'worst area',
    'worst smoothness', 'worst compactness', 'worst concavity',
    'worst concave points', 'worst symmetry', 'worst fractal dimension'
Example No. 24
	def instanciate(self, params):
		n_estimators, = params
		self.model = RandomForest(n_estimators=n_estimators, n_jobs=CORES)
Example No. 25
def select_feature(data, y, tol=0, min_bin=0.05, regress=True):
    """
    Use random forest to select features from data set with both numerical and
    categorical features.
    input:  data    2D array of the independent variables
            y       1D array of the response variable
            tol     the smallest feature importance to keep, default 0
            min_bin the minimum percentage of one class in categorical feature,
                    default 0.05
            regress indicate whether the problem is regression or classification
    """
    logger = logging.getLogger(__name__)
    logger.info(
        "------ select_feature(data, y, tol=0, min_bin=0.05, regress=True) ------"
    )
    from time import time
    begin = time()
    cat_cols = data.columns[data.dtypes == 'object']
    num_cols = data.columns[data.dtypes != 'object']
    logger.info(
        "{} numerical and {} categorical features out of {} features".format(
            len(num_cols), len(cat_cols), data.shape[1]))
    results = []  # score, feature name, feature importance

    # for numerical variables
    beg = time()
    if regress:
        from sklearn.ensemble import RandomForestRegressor as RandomForest
    else:
        from sklearn.ensemble import RandomForestClassifier as RandomForest
    rf = RandomForest(n_estimators=500,
                      max_depth=int(np.log(len(num_cols)) / np.log(2)) + 1,
                      max_features=0.33)
    logger.info("max_depth", int(np.log(len(num_cols)) / np.log(2)) + 1)
    rf = rf.fit(data[num_cols], y)
    print("Model with numerical variables fitted, time used {:.2f} min".format(
        (time() - beg) / 60))
    beg = time()
    score = rf.score(data[num_cols], y)
    results.append([score, num_cols, rf.feature_importances_])
    selected = num_cols[rf.feature_importances_ > tol]
    print("Model score {:.2f}, time used {:.2f} min".format(
        score, (time() - beg) / 60))

    # for categorical variables
    dummies = pd.DataFrame()
    print("-" * len(cat_cols), end='\r', flush=True)
    for i, col in enumerate(cat_cols):
        beg = time()
        c, d = np.unique(data[col], return_counts=True)
        d = d / sum(d)
        for cls in c[d > min_bin]:
            dummies[str(col) + '.' + str(cls)] = (data[col] == cls).astype(int)
        rf = RandomForest(
            n_estimators=500,
            max_depth=int(
                np.log(len(selected) + dummies.shape[1]) / np.log(2)) + 1,
            max_features=0.33)
        temp = pd.concat([data[selected], dummies], axis=1)
        logger.info(
            "max_depth: %s, n_features: %s",
            int(np.log(len(selected) + dummies.shape[1]) / np.log(2)) + 1,
            temp.shape[1])
        rf = rf.fit(temp, y)
        score = rf.score(temp, y)
        results.append([score, temp.columns, rf.feature_importances_])
        n_selected = len(selected)
        selected = selected[rf.feature_importances_[:n_selected] > tol]
        dummies.drop(
            dummies.columns[rf.feature_importances_[n_selected:] < tol],
            axis=1,
            inplace=True)
        print("*" * (i + 1) + "-" * (len(cat_cols) - i - 1) +
              " score {:.2f}, time {:.2f} min".format(score,
                                                      (time() - beg) / 60),
              end='\n',
              flush=True)
    print("\ntotal time used: {:.2f} min".format((time() - begin) / 60))
    return results
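
A hedged usage sketch of select_feature on a toy frame (the column names and values below are made up; the function above also assumes numpy as np, pandas as pd and logging are importable at module level):

import logging
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
# Toy data: two numerical columns and one categorical column
df = pd.DataFrame({
    'num1': rng.rand(200),
    'num2': rng.rand(200),
    'cat1': rng.choice(['a', 'b', 'c'], size=200),
})
y = df['num1'] * 2 + rng.rand(200)          # toy regression target
results = select_feature(df, y, tol=0.01, min_bin=0.05, regress=True)
for score, cols, importances in results:    # one entry per fitted forest
    print(round(score, 3), list(cols))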