Example #1
        classification_binary(
            light_clf.SGDClassifier(random_state=RANDOM_SEED)),

        # Decision trees
        regression(tree.DecisionTreeRegressor(**TREE_PARAMS)),
        regression(tree.ExtraTreeRegressor(**TREE_PARAMS)),
        classification(tree.DecisionTreeClassifier(**TREE_PARAMS)),
        classification(tree.ExtraTreeClassifier(**TREE_PARAMS)),
        classification_binary(tree.DecisionTreeClassifier(**TREE_PARAMS)),
        classification_binary(tree.ExtraTreeClassifier(**TREE_PARAMS)),

        # Random forest
        regression(ensemble.ExtraTreesRegressor(**FOREST_PARAMS)),
        regression(ensemble.RandomForestRegressor(**FOREST_PARAMS)),
        classification(ensemble.ExtraTreesClassifier(**FOREST_PARAMS)),
        classification(ensemble.RandomForestClassifier(**FOREST_PARAMS)),
        classification_binary(ensemble.ExtraTreesClassifier(**FOREST_PARAMS)),
        classification_binary(
            ensemble.RandomForestClassifier(**FOREST_PARAMS)),
    ],

    # Following is the list of extra tests for languages/models which are
    # not fully supported yet.

    # <empty>
)
def test_e2e(estimator, executor_cls, model_trainer, is_fast, global_tmp_dir):
    sys.setrecursionlimit(RECURSION_LIMIT)

    X_test, y_pred_true, fitted_estimator = model_trainer(estimator)
    executor = executor_cls(fitted_estimator)
Example #2
             color='r',
             label='Seven Clusters')
plt.xlabel('Unsupervised method')
plt.ylabel('Silhouette score')
plt.title('Silhouette Score on Various Clustering Methods and Cluster Sizes')
plt.xticks(index + bar_width, ('GMM', 'Kmeans', 'HAC'))
plt.legend()
plt.savefig(sys.argv[2] + '/silhouette_barplot.png')
###END PLOT SNIPPET###

###SUPERVISED PORTION

from sklearn import ensemble
from sklearn import model_selection
## For each of random forest, naive Bayes, and logistic regression, run 10-fold CV
## and take the mean accuracy (the logistic regression step is sketched after this example)
randfor = ensemble.RandomForestClassifier(100)
randforscores = model_selection.cross_val_score(randfor,
                                                PCA_data,
                                                class_labels,
                                                cv=10)

avgrandforscore = numpy.mean(randforscores)

from sklearn import naive_bayes
nbayes = naive_bayes.GaussianNB()
nbayesscores = model_selection.cross_val_score(nbayes,
                                               PCA_data,
                                               class_labels,
                                               cv=10)

avgnbayesscores = numpy.mean(nbayesscores)
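# The comment above lists logistic regression as the third classifier, but the
# excerpt stops after naive Bayes. A minimal, hypothetical completion of that
# step on the same PCA_data / class_labels arrays:
from sklearn import linear_model

logreg = linear_model.LogisticRegression(max_iter=1000)
logregscores = model_selection.cross_val_score(logreg,
                                               PCA_data,
                                               class_labels,
                                               cv=10)

avglogregscore = numpy.mean(logregscores)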
Example #3
    :param classifier: Model Name
    :param feature_vector_train: Training input data
    :param label: Training output label
    :param feature_vector_valid: Testing input data
    :return: Accuracy score
    """
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    # note: relies on the global y_test holding the validation labels
    return metrics.accuracy_score(y_test, predictions)


accuracy = train_model(svm.SVC(kernel='linear'), xtrain_tfidf, y_train,
                       xvalid_tfidf)
print "SVM, WordLevel TF-IDF: Accuracy:", accuracy * 100
print("\n")
accuracy = train_model(svm.SVC(kernel='linear'), xtrain_count, y_train,
                       xvalid_count)
print "SVM, CountVector: Accuracy:", accuracy * 100
print("\n")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100),
                       xtrain_tfidf, y_train, xvalid_tfidf)
print "RF, Wordlevel TF-IDF: Accuracy:", accuracy * 100
print("\n")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100),
                       xtrain_count, y_train, xvalid_count)
print "RF, CountVector: Accuracy:", accuracy * 100
print("\n")
Example #4
# -*- coding: utf-8 -*-
from sklearn import ensemble

MODELS = {
    "randomforest":
    ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, verbose=2),
    "extratrees":
    ensemble.ExtraTreesClassifier(n_estimators=200, n_jobs=-1, verbose=2)
}
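# A minimal usage sketch (not part of the original; iris data is used purely
# as a stand-in) showing how such a model registry is typically looked up:
from sklearn import datasets

X_demo, y_demo = datasets.load_iris(return_X_y=True)
model = MODELS["randomforest"]
model.fit(X_demo, y_demo)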
Example #5

if __name__ == '__main__':
    now = datetime.now()

    unigrams = loadVocabulary()
    unigrams_flag = 'unigrams'

    print('load train set')
    is_train_set = 1
    labels = loadLabels(is_train_set)
    features = loadFeatures(labels, unigrams, is_train_set)
    train_set = {'labels': labels, 'features': features}

    print('load test set')
    is_train_set = 0
    labels = loadLabels(is_train_set)
    features = loadFeatures(labels, unigrams, is_train_set)
    test_set = {'labels': labels, 'features': features}

    model = ensemble.RandomForestClassifier(n_estimators=20, random_state=512)
    model_name = 'random forest'
    trainModel(unigrams_flag, train_set, test_set, model, model_name)

    model = ensemble.RandomForestClassifier(n_estimators=50, random_state=512)
    trainModel(unigrams_flag, train_set, test_set, model, model_name)

    topK = 100
    getTopFeatures(model, topK, unigrams)
    print('running time is', datetime.now() - now)
Example #6
def getModel(dataset, model):
    if model == Models.RandomForest:
        if dataset == "IMDB":
            return ensemble.RandomForestClassifier(
                criterion='gini',
                max_depth=600,
                max_features=0.8,
                max_leaf_nodes=100,
                min_impurity_decrease=0.0001,
                n_estimators=50)
        else:
            return ensemble.RandomForestClassifier(
                min_impurity_decrease=0.0001,
                random_state=30,
                criterion='gini',
                ccp_alpha=0.0002,
                max_depth=200,
                max_features=0.4,
                n_estimators=90)

    elif model == Models.DecisionTree:
        if dataset == "IMDB":
            return tree.DecisionTreeClassifier(max_depth=600,
                                               min_impurity_decrease=0.0001,
                                               max_leaf_nodes=100,
                                               max_features=0.8,
                                               splitter="random",
                                               ccp_alpha=0.00025,
                                               criterion='gini')
        else:
            return tree.DecisionTreeClassifier(max_depth=450,
                                               min_impurity_decrease=0.0001,
                                               max_leaf_nodes=600,
                                               random_state=30,
                                               max_features=0.4,
                                               criterion='gini',
                                               splitter="best",
                                               ccp_alpha=0.00055)
    elif model == Models.AdaBoost:
        if dataset == "IMDB":
            return ensemble.AdaBoostClassifier(n_estimators=300,
                                               learning_rate=0.7,
                                               random_state=0)
        else:
            return ensemble.AdaBoostClassifier(n_estimators=125,
                                               learning_rate=0.5,
                                               random_state=0)

    elif model == Models.KNN:
        if dataset == "IMDB":
            return neighbors.KNeighborsClassifier(n_neighbors=525,
                                                  weights='uniform',
                                                  p=2)
        else:
            return neighbors.KNeighborsClassifier(n_neighbors=600,
                                                  weights='uniform',
                                                  p=2)

    elif model == Models.LogisticRegression:
        if dataset == "IMDB":
            return linear_model.LogisticRegression(C=1.0,
                                                   dual=False,
                                                   max_iter=1000,
                                                   penalty='l1',
                                                   solver='liblinear',
                                                   tol=0.1)
        else:
            return linear_model.LogisticRegression(C=1.0,
                                                   dual=False,
                                                   max_iter=100,
                                                   penalty='l2',
                                                   solver='saga',
                                                   tol=0.01)

    elif model == Models.SVM:
        if dataset == "IMDB":
            return svm.LinearSVC(C=0.1,
                                 dual=False,
                                 loss='squared_hinge',
                                 max_iter=1000,
                                 penalty='l2',
                                 tol=0.1)
        else:
            return svm.LinearSVC(C=1.0,
                                 dual=True,
                                 fit_intercept=True,
                                 loss='squared_hinge',
                                 max_iter=5000,
                                 penalty='l2',
                                 tol=0.01)
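# Hypothetical usage of getModel (the Models enum and the X_train / y_train
# splits are defined elsewhere in the original project, so this is only a sketch):
# clf = getModel("IMDB", Models.RandomForest)
# clf.fit(X_train, y_train)
# print(clf.score(X_test, y_test))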
Example #7
import pandas as pd
from sklearn import ensemble

if __name__ == "__main__":
    loc_train = "fullTrainM4.csv"
    #loc_test = "fullTestM4.csv"
    loc_test = "fullTrainM4.csv"
    #loc_submission = "kaggle.forest.submission.csv"
    loc_submission = "trainScore.csv"

    df_train = pd.read_csv(loc_train)
    df_test = pd.read_csv(loc_test)

    feature_cols = [
        col for col in df_train.columns if col not in ['repeater', 'id']
    ]

    X_train = df_train[feature_cols]
    X_test = df_test[feature_cols]
    y = df_train['repeater']
    #test_ids = df_test['id']
    test_ids = df_test['id']
    print "running RF ..."
    clf = ensemble.RandomForestClassifier(n_estimators=500, n_jobs=-1)

    clf.fit(X_train, y)
    print "scoring ..."
    with open(loc_submission, "wb") as outfile:
        outfile.write("id,repeatProbability\n")
        for e, val in enumerate(list(clf.predict_proba(X_test))):
            outfile.write("%s,%s\n" % (test_ids[e], val[1]))
Example #8
from sklearn import ensemble
from sklearn.utils import shuffle
import numpy as np
from sklearn import model_selection

#load data from file
data = np.loadtxt('acceldata.txt', delimiter=',')
DT = data.transpose()

#split data into arrays
X = np.array(DT[0:-1]).transpose()
Y = np.array(DT[-1]).transpose()

#create training and test sets, 70% in training 30% in testing set
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.30, random_state=0)

#construct random forest with 10 trees, creating bootstrap samples
rf = ensemble.RandomForestClassifier(n_estimators=10,
                                     random_state=0,
                                     bootstrap=True)

### fit and score the model
rf.fit(X_train, y_train)
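# The comment above says "fit and score", but the snippet is cut off after the
# fit call. A minimal completion (assumption) of the scoring step:
print("test accuracy:", rf.score(X_test, y_test))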
Example #9
                                                    data_y,
                                                    test_size=0.3,
                                                    random_state=4)

print('----------- DTREE WITH GINI IMPURITY CRITERION ------------------')
dtree_gini_mod = tree.DecisionTreeClassifier(criterion='gini')
dtree_gini_mod.fit(x_train, y_train)
preds_gini = dtree_gini_mod.predict(x_test)
print_multiclass_classif_error_report(y_test, preds_gini)

n_est = [100]
depth = [None]
for n in n_est:
    for dp in depth:
        # Create model and fit.
        mod = ensemble.RandomForestClassifier(n_estimators=n, max_depth=dp)
        mod.fit(x_train, y_train)

        # Make predictions - both class labels and predicted probabilities.
        preds = mod.predict(x_test)
        print('---------- EVALUATING MODEL: n_estimators = ' + str(n) +
              ', depth =' + str(dp) + ' -------------------')
        # Look at results.
        print_multiclass_classif_error_report(y_test, preds)
#END-------------- Fatality or Injury ----------------------

#START------------- SPEED LIMIT -----------------------------
features2 = list(data)
features2.remove('SPEED_LIMIT')

data_x = data[features2]
Example #10
bag = {
    'bid': bid_cluster_classifier_bagging,
    'ask': ask_cluster_classifier_bagging
}
with open('../run_models/clusterAndClassify_Bagging.model', 'wb') as output:
    pickle.dump(bag, output, -1)

bid_cluster_classifier_rfc = multiclass.OneVsOneClassifier(
    estimator=ensemble.RandomForestClassifier(n_estimators=30,
                                              criterion='gini',
                                              max_depth=None,
                                              min_samples_split=2,
                                              min_samples_leaf=1,
                                              min_weight_fraction_leaf=0.0,
                                              max_features='sqrt',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              bootstrap=True,
                                              oob_score=False,
                                              n_jobs=1,
                                              random_state=None,
                                              verbose=0,
                                              warm_start=False,
                                              class_weight=None),
    n_jobs=-1)
bid_cluster_classifier_rfc.fit(trainFeatures, all_bid_labels)
print "Bid accuracy with Random Forest: ", bid_cluster_classifier_rfc.score(
    trainFeatures, all_bid_labels)

ask_cluster_classifier_rfc = multiclass.OneVsOneClassifier(
    estimator=ensemble.RandomForestClassifier(n_estimators=30,
                                              criterion='gini',
Example #11
X_test = np.array(df.drop(['application_key'], axis=1))
X_test = preprocessing.scale(X_test)#

# if pca applied
X_test = np.array(pd.DataFrame(data = pca.transform(X_test)))
X_test = preprocessing.scale(X_test)#





# different classifiers tested for the price pred dataset 


clf1 = svm.SVC()
clf2 = ske.RandomForestClassifier(n_estimators=100)
clf3 = neighbors.KNeighborsClassifier(n_neighbors=5)
clf4 = MLPClassifier(solver='lbfgs',hidden_layer_sizes=(150, 100, 50, 10, 3))
clf5 = DecisionTreeClassifier(criterion = "entropy", random_state = 10,max_depth=100, min_samples_leaf=5)
clf6 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=100), n_estimators=600, learning_rate=1)



clfs = [clf2 ]
# train model

i = 1
for clf in clfs:
	epochs = 10
	for epoch in range(epochs):
		X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)
Example #12
pca_df.plot(x=0, y=1, kind='scatter')

variance_df = pandas.DataFrame({
    'variance': pca.explained_variance_,
    'principal component': pca_df.columns.tolist()
})

# adding one to principal components (since there is no 0th component)
variance_df['principal component'] = variance_df['principal component'] + 1
variance_df.plot(x='principal component', y='variance')
#  looks like variance stops getting explained after first two components

pca_df_small = pca_df.iloc[:, 0:2]  # first two principal components

# getting a cross val score of transformed data
rf = ensemble.RandomForestClassifier(n_estimators=500)
roc_scores_rf_pca = cross_val_score(rf,
                                    pca_df_small,
                                    response_series,
                                    cv=10,
                                    scoring='roc_auc')

print(roc_scores_rf_pca.mean())
# ~0.74 mean ROC AUC

roc_scores_rf = cross_val_score(rf,
                                explanatory_df,
                                response_series,
                                cv=10,
                                scoring='roc_auc')
print(roc_scores_rf.mean())
Example #13
# In[ ]:

from sklearn import linear_model
from sklearn import tree
from sklearn import neighbors
from sklearn import ensemble
from sklearn import svm
from sklearn import gaussian_process
from sklearn import naive_bayes
from sklearn import neural_network
from sklearn.model_selection import cross_val_score
clfs = {}

clfs['lr'] = {'clf': linear_model.LogisticRegression(), 'name':'LogisticRegression'}
clfs['rf'] = {'clf': ensemble.RandomForestClassifier(n_estimators=750, n_jobs=-1), 'name':'RandomForest'}
clfs['tr'] = {'clf': tree.DecisionTreeClassifier(), 'name':'DecisionTree'}
clfs['knn'] = {'clf': neighbors.KNeighborsClassifier(n_neighbors=4), 'name':'kNearestNeighbors'}
clfs['svc'] = {'clf': svm.SVC(kernel="linear"), 'name': 'SupportVectorClassifier'}
clfs['nusvc'] = {'clf': svm.NuSVC(), 'name': 'NuSVC'}
clfs['linearsvc'] = {'clf': svm.LinearSVC(), 'name': 'LinearSVC'}
clfs['SGD'] = {'clf': linear_model.SGDClassifier(), 'name': 'SGDClassifier'}
clfs['GPC'] = {'clf': gaussian_process.GaussianProcessClassifier(), 'name': 'GaussianProcess'}
clfs['nb'] = {'clf': naive_bayes.GaussianNB(), 'name':'GaussianNaiveBayes'}
clfs['bag'] = {'clf': ensemble.BaggingClassifier(neighbors.KNeighborsClassifier(), max_samples=0.5, max_features=0.5), 'name': "BaggingClassifier"}
clfs['gbc'] = {'clf': ensemble.GradientBoostingClassifier(), 'name': 'GradientBoostingClassifier'}
clfs['mlp'] = {'clf': neural_network.MLPClassifier(hidden_layer_sizes=(10,8,3), alpha=1e-5, solver='lbfgs'), 'name': 'MultilayerPerceptron'}


# In[ ]:
##grid_dt_search.fit(x_train, y_train)
##print(grid_dt_search.best_params_)
##tree_model.set_params(criterion="gini", max_depth=4)
##tree_model.fit(x_train, y_train)
##feature_importance = np.array(list(zip(data.columns.values, tree_model.feature_importances_)), 
##                              dtype=[('feature', 'S10'), ('importance', 'float')])
##most_important = np.sort(feature_importance, order="importance")[::-1]
##for i in most_important[0:5]:
##    print(i)

grid_para_forest = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1, 31),
    'n_estimators': range(10, 110, 10)
}

from sklearn import ensemble
forest_model = ensemble.RandomForestClassifier()

grid_rf_search = ms.GridSearchCV(forest_model, grid_para_forest, cv=3, n_jobs=1)
grid_rf_search.fit(x_train, y_train)
print(grid_rf_search.best_params_)
forest_model.set_params(criterion='gini', max_depth=3, n_estimators=80)
forest_model.fit(x_train, y_train)
print(forest_model.score(x_test, y_test))
feature_importance = np.array(list(zip(data.columns.values, forest_model.feature_importances_)), 
                              dtype=[('feature', 'S10'), ('importance', 'float')])
most_important = np.sort(feature_importance, order="importance")[::-1]
for i in most_important[0:5]:
    print(i)
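# Note (not from the original): rather than re-typing the values reported by
# best_params_, the tuned parameters or the refit model can be reused directly:
# forest_model.set_params(**grid_rf_search.best_params_)
# or: forest_model = grid_rf_search.best_estimator_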
Example #15
time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())

log_files = path.join(LOGS_PATH, 'log_benchmark_time.txt')
logging.basicConfig(filename=log_files + str(time_stamp),
                    level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('This is a log message.')

models = [
    # #alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
    #    eta0=0.0, fit_intercept=True, max_iter=1000, tol=None,l1_ratio=0.15,
    #    learning_rate='optimal', loss='hinge'
    (LogisticRegression(C=2), {
        'C': [1, 10, 100]
    }, 'Logistic_reg_scale'),
    (ensemble.RandomForestClassifier(n_estimators=100), {
        "max_depth": [3, None],
        "max_features": ['auto', 3, 10],
        "min_samples_split": [2, 3, 10],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }, 'RandomForest'),
    (ensemble.GradientBoostingClassifier(), {}, 'Gradient boosting')
    #(LateFusion(model2=svm.SVC(kernel='linear', C=1, probability=True),standardscaler=preprocessing.Normalizer()), {}, 'late_fusion(GB,SVM) fusion5:5')
    # (LateFusion(model2=LogisticRegression(C=2, class_weight={0:0.1, 1:0.9})),{},'late_fusion(GB,LR(0.1,0.9)'),
    # (LateFusion(model2=ensemble.GradientBoostingClassifier()),{},'late_fusion(GB, GB')
]
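# How these (estimator, param_grid, name) triples are consumed is not shown in
# this excerpt; a minimal sketch (assumption, with hypothetical X / y arrays):
# from sklearn.model_selection import GridSearchCV
# for est, grid, name in models:
#     search = GridSearchCV(est, grid, cv=5, scoring='roc_auc')
#     search.fit(X, y)
#     logging.debug('%s best params: %s', name, search.best_params_)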


def run_nested_cv_fold(data_type='benchmark', n_split=5, cv_fold=5):
    """
Example #16
def test_classification_tasks():
    # ------------------------
    # Load latest feature sets
    # ------------------------
    latest_feature_set_time = time.strftime('0')
    for index, file in enumerate(os.listdir(OUT_PATH)):
        if file.startswith("comb_dataset_") and file.endswith(".csv"):
            file_name = os.path.splitext(file)[0].split('_')
            timestamp = file_name[2] + '_' + file_name[3]
            if timestamp > latest_feature_set_time:
                latest_feature_set_time = timestamp

    print(latest_feature_set_time)
    features_path = OUT_PATH + 'comb_dataset_' + latest_feature_set_time + ".csv"
    df = pd.read_csv(features_path, sep=',')
    # df = pd.read_csv(OUT_PATH + 'dataset_20190513_115505.csv', sep=',')

    # feature_set = ['structural', 'temporal', 'social']
    # label_set = ['true', 'false', 'unverified', 'non-rumor']

    # print(df.dtypes)
    df = df.fillna(0)

    # ----------------
    #   DROP COLUMNS
    # ----------------
    # if 'temporal___longest_length' in df:
    #     df = df.drop(columns=['temporal___longest_length'])
    # if 'temp_longest_length' in df:
    #     df = df.drop(columns=['temp_longest_length'])

    # ---------------------
    #   DROP ROWS (label)
    # ---------------------
    # df = df[df.label != 'unverified']
    # df = df[df.label != 'non-rumor']

    X = df.drop(columns=['tweet_id', 'label'])
    y = df['label']

    # print(df.shape, df['label'].value_counts().to_dict())
    # print(df.info())

    classifiers = {'RF': ensemble.RandomForestClassifier(),
                   'XGB': XGBClassifier(),
                   'ADAB': ensemble.AdaBoostClassifier(),
                   'GRADB': ensemble.GradientBoostingClassifier()}

    for classifier_name in list(classifiers.keys()):

        accuracy_results = []
        f1_macro_results = []
        f1_micro_results = []

        max_accuracy = 0

        for i in range(100):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)  # repeated random 80/20 hold-out (not k-fold CV)

            clf = classifiers[classifier_name]
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)  # predictions

            accuracy = metrics.accuracy_score(y_test, y_pred)
            f1_macro = f1_score(y_test, y_pred, average='macro')
            f1_micro = f1_score(y_test, y_pred, average='micro')

            # print("#{}: Accuracy={} F1-macro={} F1-micro={}".format(i, accuracy, round(f1_macro, 4), round(f1_micro, 4)))
            accuracy_results.append(accuracy)
            f1_macro_results.append(f1_macro)
            f1_micro_results.append(f1_micro)

            if accuracy > max_accuracy:
                max_accuracy = accuracy
                feature_importances = pd.DataFrame(clf.feature_importances_, index=X_train.columns,
                                                   columns=['importance']).sort_values('importance', ascending=False)

        print('\n\n' + classifier_name)
        print('MEAN    \t STD')
        print('ACC     \t', round(st.mean(accuracy_results), 4), '+-', round(st.pstdev(accuracy_results), 4))
        print('F1-macro\t', round(st.mean(f1_macro_results), 4), '+-', round(st.pstdev(f1_macro_results), 4))
        print('F1-micro\t', round(st.mean(f1_micro_results), 4), '+-', round(st.pstdev(f1_micro_results), 4))

        print(max_accuracy)
        print(feature_importances)
        print("==========================="*3)
Example #17
  
    X_train,X_test,Y_train,Y_test= train_test_split(train,a,test_size=0.3)
    
    print("START TRAINNING")
    #KNN TRAINNING
    knn = KNeighborsClassifier()
    knn.fit(X_train, Y_train)
    y_predict = knn.predict(X_test)

    print(y_predict)   
    print("KNN accuracy:",accuracy_score(Y_test,y_predict))
    print("KNN precision:",precision_score(Y_test,y_predict, average=None))
   
    
    # build the random forest model
    forest = ensemble.RandomForestClassifier(n_estimators = 40)
    forest_fit = forest.fit(X_train, Y_train)
    # predict
    test_y_predicted = forest.predict(X_test)
    
    print(test_y_predicted)    
    print("RF(40) accuracy:",accuracy_score(Y_test,test_y_predicted))
    print("RF(40) precision:",precision_score(Y_test,test_y_predicted, average=None))
    
    for i in range(len(Y_test)):
       if Y_test[i]!=test_y_predicted[i]:
           print(i,Y_test[i],test_y_predicted[i])
    
    print("----------------------------------------------------------------------------------")
    print("TESTING")
Example #18
y_train = y_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
'''IMPUTE MISSING VALUES FOR TRAIN AND TEST SEPARATELY'''
x_train = missingValueImpute(x_train)
x_test = missingValueImpute(x_test)
'''CLASSIFICATION OF THE PRICE BINS'''
#Most of the time people have an idea of the price range in which their rental will fall.
#For users who have no idea about the price range, we can first classify which price bucket their rental falls in
#and then do bucket-specific regression (a sketch of that second stage follows after this example)

#RandomForest Classifier to predict the price bins
randomForestClassifier = ensemble.RandomForestClassifier(
    n_estimators=200,
    max_features='sqrt',
    max_depth=15,
    min_samples_leaf=7,
    random_state=25,
    class_weight='balanced')
randomForestClassifier.fit(scale(x_train), y_train['price_bins'])
print(randomForestClassifier.score(scale(x_train), y_train['price_bins']))
print(randomForestClassifier.score(scale(x_test), y_test['price_bins']))
y_pred = randomForestClassifier.predict(scale(x_test))
report = metrics.classification_report(y_test['price_bins'], y_pred)
print(report)

#Logistic Regression
logistic = linear_model.LogisticRegression(random_state=23,
                                           class_weight='balanced')
logistic.fit(scale(x_train), y_train['price_bins'])
print(logistic.score(scale(x_train), y_train['price_bins']))
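# The bucket-specific regression stage described in the comments above is not
# shown in this excerpt. A minimal sketch (assumption; the 'price' target
# column is a hypothetical name) of fitting one regressor per price bin:
# bin_regressors = {}
# for b in y_train['price_bins'].unique():
#     mask = y_train['price_bins'] == b
#     reg = ensemble.RandomForestRegressor(n_estimators=200, random_state=25)
#     reg.fit(scale(x_train[mask]), y_train.loc[mask, 'price'])
#     bin_regressors[b] = reg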
Example #19
               kaggle_format,
               delimiter=",",
               fmt='%d,%d',
               header='Id,Category',
               comments='')

    ############# BUILT-IN FUNCTION #############
    scoreBuffer = []
    print(50 * '=')
    print("CROSS VALIDATION USING SCIKIT-LEARN")
    print(50 * '=')

    for depth in depths:
        print "DEPTH:", depth
        clf = ensemble.RandomForestClassifier(n_estimators=5,
                                              criterion='entropy',
                                              max_depth=depth)
        scores = computeCV_Score(clf, crossValidation_Data,
                                 crossValidation_Labels, k)
        scoreBuffer.append((scores).mean())
        print "Depth:", depth, "Accuracy: %0.2f%% (+/- %0.2f)" % (
            (scores).mean(), np.array(scores).std() / 2)
        print 50 * '-'

    maxScore = np.max(scoreBuffer)
    maxScore_Index = scoreBuffer.index(maxScore)
    print "Best Depth Value:", depths[
        maxScore_Index], "Accuracy for that Depth:", np.around(maxScore, 3)
    print 50 * '-'

    print 20 * "*", "The End", 20 * "*"
Example #20
np_features = df.to_numpy()
np_keys = keys.to_numpy()

features_train, features_test, keys_train, keys_test = train_test_split(np_features, np_keys, test_size=0.33, random_state=42)

print "type(features_train)=", type(features_train), "features_train.shape=", features_train.shape
print "type(features_test)=", type(features_test), "features_test.shape=", features_test.shape
print "type(keys_train)=", type(keys_train), "keys_train.shape=", keys_train.shape
print "type(keys_test)=", type(keys_test), "keys_test.shape=", keys_test.shape

del df
del np_features
del np_keys

start_time = time.time()
clf = ensemble.RandomForestClassifier(n_estimators=200,n_jobs=-1,random_state=0)
print("--- time to execute  ensemble.RandomForestClassifier %s seconds ---" % (time.time() - start_time))
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#n_jobs : integer, optional (default=1)
# The number of jobs to run in parallel for both fit and predict.
# If -1, then the number of jobs is set to the number of cores.
# my I7-4790 has 4 cores.

print "fitting clf.fit"
start_time = time.time()
clf.fit(features_train, keys_train)
print("--- time to execute  clf.fit %s seconds ---" % (time.time() - start_time))

print "predicting"
start_time = time.time()
keys_test_predicted = clf.predict(features_test)
Example #21
pca = doPCA(x_train)
x_train = pca.transform(x_train)
x_test = pca.transform(x_test)
'''

clf = ensemble.ExtraTreesClassifier(n_estimators=800, min_samples_leaf=5)
clf = clf.fit(x_train, y_train)
model = feature_selection.SelectFromModel(clf, prefit=True)
x_train = model.transform(x_train)
x_test = model.transform(x_test)
print(np.shape(x_train))

# Train random forest classifier with gini-impurity
print("Begin training random forest with gini...")
clf = ensemble.RandomForestClassifier(n_estimators=800, min_samples_leaf=5)
clf.fit(x_train, y_train)
trng_acc = clf.score(x_train, y_train)
val_acc = clf.score(x_test, y_test)
rf1_pred = clf.predict(x_test)
print("Training accuracy: %f" %trng_acc)
print("Validation accuracy: %f" %val_acc)
joblib.dump(clf, "rf_gini.pkl")

# Train random forest classifier with entropy-impurity
print("Begin training random forest with entropy...")
clf = ensemble.RandomForestClassifier(n_estimators=800, min_samples_leaf=5, criterion='entropy')
clf.fit(x_train, y_train)
trng_acc = clf.score(x_train, y_train)
val_acc = clf.score(x_test, y_test)
rf2_pred = clf.predict(x_test)
Example #22
    markersClassifier = None
    if learnAlgo == 'LogisticRegression':
        #markersClassifier = linear_model.LogisticRegression(C=nbMarkers, penalty='l1', class_weight=classWeights)
        markersClassifier = linear_model.LogisticRegression(max_iter=10000)
        markerFeaturesSet = markerFeaturesSet.tocsr()
    elif learnAlgo == 'SVM':
        markersClassifier = svm.SVC(probability=True,
                                    C=nbMarkers,
                                    class_weight=classWeights)
        markerFeaturesSet = markerFeaturesSet.tocsr()
    elif learnAlgo == 'DecisionTreeClassifier':
        markersClassifier = tree.DecisionTreeClassifier(min_samples_split=10)
        markerFeaturesSet = markerFeaturesSet.toarray()
    elif learnAlgo == 'RandomForestClassifier':
        markersClassifier = ensemble.RandomForestClassifier(
            min_samples_split=10)
        markerFeaturesSet = markerFeaturesSet.toarray()
    elif learnAlgo == 'ExtraTreesClassifier':
        markersClassifier = ensemble.ExtraTreesClassifier(min_samples_split=10)
        markerFeaturesSet = markerFeaturesSet.toarray()
    print(' - fit dataset')
    markersClassifier.fit(markerFeaturesSet, markerTargetsSet)
    print(' - save model to file')
    joblib.dump(markersClassifier, corpusModel + '/model_markers.txt')


# Compute permutation cost as edit distance adapted to sequences
def getSequenceDistance(s1, s2):
    s1Len = len(s1)
    s2Len = len(s2)
Example #23
#Positive and Negative state of reviews
#+ve is 1, -ve is 2
for f in range(len(state)):
    if state[f] == "positive":
        state[f] = 1
    elif state[f] == "negative":
        state[f] = 2

#Splitting data into training and testing datasets
train_x, test_x, train_y, test_y = train_test_split(termdoc,
                                                    state,
                                                    test_size=0.3)

#Using Random Forest Classification Algorithm
rfc = ensemble.RandomForestClassifier()
rfc_scores = cross_val_score(rfc, termdoc, state, cv=10)
print('Random forest mean accuracy : %.2f' % (rfc_scores.mean()))
print('Random forest std : %.2f' % (rfc_scores.std()))

rfc.fit(train_x, train_y)
predict_train_y = rfc.predict(train_x)
predict_test_y = rfc.predict(test_x)
print(confusion_matrix(test_y, predict_test_y))

print('Precision score is : %.2f' % (precision_score(test_y, predict_test_y)))
print('Recall Score is : %.2f' % (recall_score(test_y, predict_test_y)))
print('F score is : %.2f' % (f1_score(test_y, predict_test_y)))

#Predicting the review states
predicted_all = rfc.predict(termdoc)
Example #24
    def create_model(self, trial):
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        model = ensemble.RandomForestClassifier(
            max_depth=rf_max_depth, n_estimators=10
        )
        return model
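# A sketch (assumption, not taken from the original project) of how such a
# create_model hook is typically driven by Optuna:
# import optuna
# from sklearn.model_selection import cross_val_score
# def objective(trial):
#     model = create_model(trial)  # or self.create_model(trial) inside the class
#     return cross_val_score(model, X, y, cv=3).mean()
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=20)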
Example #25
with open('../saves/exp_test_trim_1.pickle', 'rb') as f:
    test_df = pickle.load(f)

train_data = train_df
test_data = test_df

scaler = MinMaxScaler()
scaler.partial_fit(train_data[cols])
scaler.partial_fit(test_data[cols])

train_input = scaler.transform(train_data[cols])
test_input = scaler.transform(test_data[cols])

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
}

# cv_generator = GroupKFold(n_splits=3)
for i in range(50):
    CV_rfc = GridSearchCV(ensemble.RandomForestClassifier(),
                          param_grid=param_grid,
                          cv=10)
    CV_rfc.fit(train_input, train_data['true_class'])
    save_string = "../saves/cv_forest_model_aws" + str(i) + ".pickle"
    with open(save_string, 'wb') as f:
        pickle.dump(CV_rfc, f)
    print(CV_rfc.best_params_)

print('finished')
Example #26
import time
import pickle
filename = "ApneaData.pkl"
testPercent=20
features = []
classes = []
t = time.time()
f = open(filename,'rb')
data = pickle.load(f)
f.close()
np.random.shuffle(data)
for row in data:
    features.append(row[:-1])
    classes.append(row[-1])
inputLength = len(features)
testLength = int(inputLength*testPercent/100)
train_features, train_classes=features[:-testLength], classes[:-testLength]
test_features,test_classes = features[-testLength:],classes[-testLength:]
print("preprocessing time:",(time.time()-t))
t=time.time()
clf=ensemble.RandomForestClassifier(n_estimators=30)
clf.fit(train_features,train_classes)
print("fitting time:",(time.time()-t))
t=time.time()
# predict the whole test set in one call (equivalent to per-sample predict, but faster)
pred_classes = clf.predict(test_features)
score = accuracy_score(test_classes, pred_classes)*100
print("predicting time:",(time.time()-t))
print("Accuracy:",score)
Example #27
from sklearn import ensemble, feature_extraction, preprocessing

# import data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sample = pd.read_csv('../input/sampleSubmission.csv')

# drop ids and get labels
labels = train.target.values
train = train.drop('id', axis=1)
train = train.drop('target', axis=1)
test = test.drop('id', axis=1)

# encode labels
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# train a random forest classifier
clf = ensemble.RandomForestClassifier(n_jobs=-1,
                                      n_estimators=100,
                                      max_features=50,
                                      verbose=2)
clf.fit(train, labels)

# predict on test set
preds = clf.predict_proba(test)

# create submission file
preds = pd.DataFrame(preds, index=sample.id.values, columns=sample.columns[1:])
preds.to_csv('benchmark.csv', index_label='id')
Example #28
from sklearn import tree
from sklearn import ensemble

models = {
    "decision_tree_gini": tree.DecisionTreeClassifier(
        criterion="gini"
    ),
    "decision_tree_entropy": tree.DecisionTreeClassifier(
        criterion='entropy'
    ),
    "rf": ensemble.RandomForestClassifier(),
}
Example #29
        base_endrow = base_endrow + ppd
        print(base_endrow)
        test_startrow = base_endrow + 1
        print(test_startrow)
        test_endrow = test_startrow + ppd
        print(test_endrow)
        day = day + 1
        print(day)

    return(calendar)

# scramble one
one = one.sample(frac=1)

#print(system_data_stream(one, p, t))
rf_model = skens.RandomForestClassifier(n_estimators=10,oob_score=True, criterion='entropy')

calendar = []

base_startrow = 1
base_endrow = t
test_startrow = t + 1
test_endrow = test_startrow + p
day = 1

print(base_startrow)
print(base_endrow)
print(test_startrow)
print(test_endrow)
print(day)
Example #30
def train_model_random_forest(train, labels):
    # train a random forest classifier
    model = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000)
    model.fit(train, labels)
    joblib.dump(model, 'rf_model2.model')
    return model
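# A minimal sketch (assumption) of reloading the persisted model later
# (test_features is a hypothetical name for new data to score):
# model = joblib.load('rf_model2.model')
# probs = model.predict_proba(test_features)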