Example #1
def get_tree_results(tree, Xtest):
    """
    Runs data through a quantized DecisionTreeClassifier
    :param tree: DTC function handle
    :param Xtest: data to test
    :returns: predicted results
    """
    results = [tree(X) for X in Xtest]
    return np.array([results], ndmin=1).T
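
# --- Hedged usage sketch (not part of the original example): get_tree_results expects
# `tree` to be a callable that maps a single sample to a single prediction. One assumed
# way to build such a handle from a fitted scikit-learn classifier:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier().fit(X, y)

# Callable taking one sample at a time, matching the per-row call tree(X) above.
tree_handle = lambda sample: clf.predict(sample.reshape(1, -1))[0]

print(get_tree_results(tree_handle, X[:5]))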
Example #2
def select_classify():
    return [
        naive(),
        tree(criterion="entropy"),
        knn(n_neighbors=8, weights='uniform', metric="manhattan"),
        mlp(hidden_layer_sizes=(128, ),
            alpha=0.01,
            activation='tanh',
            solver='sgd',
            max_iter=300,
            learning_rate='constant',
            learning_rate_init=0.001)
    ]
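
# --- Hedged usage sketch (not in the original): `naive`, `tree`, `knn` and `mlp` are
# presumably import aliases; the mapping below is an assumption.
from sklearn.naive_bayes import GaussianNB as naive
from sklearn.tree import DecisionTreeClassifier as tree
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.neural_network import MLPClassifier as mlp
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit and score every classifier returned by select_classify().
for clf in select_classify():
    clf.fit(X_train, y_train)
    print(type(clf).__name__, clf.score(X_test, y_test))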
Example #3
print('CM:', confusion_matrix(y_test, y_pred))
print('AC:', ac(y_test, y_pred))
print('F1 scores:', f1(y_test, y_pred))
print('PR:', prfs(y_test, y_pred))

from sklearn.naive_bayes import GaussianNB

model5 = GaussianNB().fit(X_train, y_train)
y_pred = model5.predict(X_test)
print('CM:', confusion_matrix(y_test, y_pred))
print('AC:', ac(y_test, y_pred))
print('F1 scores:', f1(y_test, y_pred))
print('PR:', prfs(y_test, y_pred))

from sklearn.tree import DecisionTreeClassifier as tree

model6 = tree(criterion='entropy').fit(X_train, y_train)
y_pred = model6.predict(X_test)
print('CM:', confusion_matrix(y_test, y_pred))
print('AC:', ac(y_test, y_pred))
print('F1 scores:', f1(y_test, y_pred))
print('PR:', prfs(y_test, y_pred))

from sklearn.ensemble import RandomForestClassifier as forest

model7 = forest(max_depth=5).fit(X_train, y_train)
y_pred = model7.predict(X_test)
print('CM:', confusion_matrix(y_test, y_pred))
print('AC:', ac(y_test, y_pred))
print('F1 scores:', f1(y_test, y_pred))
print('PR:', prfs(y_test, y_pred))
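
# --- Hedged refactor (not in the original): the four metric prints above repeat for every
# model; a small helper like this could collapse the repetition. `ac`, `f1` and `prfs` are
# presumably accuracy_score, f1_score and precision_recall_fscore_support, as used above.
def report(y_true, y_pred):
    print('CM:', confusion_matrix(y_true, y_pred))
    print('AC:', ac(y_true, y_pred))
    print('F1 scores:', f1(y_true, y_pred))
    print('PR:', prfs(y_true, y_pred))

report(y_test, model7.predict(X_test))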
Example #4
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()


### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train   = labels_train[:150]



### your code goes here

print(len(labels_train))
print(len(features_train))



clf = tree(min_samples_split=40)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
accuracy = accuracy_score(pred,labels_test)

print(accuracy)  # after removing 2 sig words 0.816837315131
print('Importance of the most important feature:')  # 0.764705882353
print(max(clf.feature_importances_))
print('Number of most important feature:')  # 33614
print(list(clf.feature_importances_).index(max(clf.feature_importances_)))
print(list(clf.feature_importances_)[-1])  # sshacklensf cgermannsf
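
# --- Hedged follow-up (not in the original): map the most important feature index back to
# the word it represents via the vectorizer fitted above. get_feature_names_out() exists in
# scikit-learn >= 1.0; older versions use get_feature_names().
import numpy as np

top_idx = int(np.argmax(clf.feature_importances_))
words = vectorizer.get_feature_names_out()
print('Most important word:', words[top_idx],
      '(importance:', clf.feature_importances_[top_idx], ')')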
Example #5
# Note: the datasets, KNeighborsClassifier and GaussianNB imports below are assumptions
# added to make the snippet runnable; the original only imported tree, svm and forest.
from sklearn import datasets as ds
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB as naive_bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier as tree
from sklearn.svm import SVC as svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

data_set = ds.load_digits()

x = data_set.data
y = data_set.target

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

neighbors_model = KNeighborsClassifier(n_neighbors=3)
bayes_model = naive_bayes()
tree_model = tree()
svm_model = svm()
forest_model = RandomForestClassifier()

neighbors_model.fit(X_train, y_train)
bayes_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
forest_model.fit(X_train, y_train)

y_actual_neighbors = neighbors_model.predict(X_test)
y_actual_bayes = bayes_model.predict(X_test)
y_actual_tree = tree_model.predict(X_test)
y_actual_svm = svm_model.predict(X_test)
y_actual_forest = forest_model.predict(X_test)
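
# --- Hedged continuation (not in the original): the snippet imports `metrics` but stops at
# the predictions; one way to compare the fitted models on the held-out digits:
for name, y_hat in [('knn', y_actual_neighbors),
                    ('naive_bayes', y_actual_bayes),
                    ('tree', y_actual_tree),
                    ('svm', y_actual_svm),
                    ('forest', y_actual_forest)]:
    print(name, metrics.accuracy_score(y_test, y_hat))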
Example #6
    def classifierTrainTest(score, diagn, real_art, cvPartition, classifier,
                            subjIndex, preAccMatrix, preInstOrder):
        x = 0
        iteration = 0
        idx = 0
        PCNo = len(score[0])
        subAccMatrix = 0
        # FIX: what is test->matlab function within cvpartition class
        #idx = numpy.random.rand(cvPartition, iteration)
        #idx_test = numpy.where(idx == 1)
        #idx_train = numpy.where(idx != 1)
        print("cvPartition:")
        print(cvPartition)

        # QUESTION: how does cvPartition work when it is not a scalar?
        # the number of iterations must be at least 2
        for idx_train, idx_test in cvPartition:
            #change idx to boolean array
            idx = numpy.zeros((len(score), 1), dtype=bool)
            for index in idx_test:
                idx[index] = True

            #for testing purposes
            #idx = numpy.zeros((len(score), 1), dtype=bool)
            #idx[47] = True

            #idx is all training in MATLAB implementation?
            cvTEST = numpy.zeros((sum(idx), PCNo))
            diagnTEST = numpy.zeros((sum(idx), 1))
            real_artTEST = numpy.zeros((sum(idx), 1))
            instIndexTEST = numpy.zeros((sum(idx), 1))

            cvTRAIN = numpy.zeros((len(idx) - sum(idx), PCNo))
            diagnTRAIN = numpy.zeros((len(idx) - sum(idx), 1))
            real_artTRAIN = numpy.zeros((len(idx) - sum(idx), 1))

            k = 0
            m = 0

            for j in range(len(idx)):
                if idx[j] == 1:
                    cvTEST[k, :] = score[j, :]
                    diagnTEST[k] = diagn[j]
                    real_artTEST[k] = real_art[j]
                    instIndexTEST[k] = subjIndex[j]
                    k = k + 1
                else:
                    cvTRAIN[m, :] = score[j, :]
                    diagnTRAIN[m] = diagn[j]
                    real_artTRAIN[m] = real_art[j]
                    m = m + 1

            # FIX: use scikit-learn for classifiers and predictions
            if classifier == "lda":
                #ldaModel = LDA()
                priorsArrays = numpy.array((.5, .5))
                ldaModel = LDA(solver='eigen',
                               priors=priorsArrays,
                               shrinkage=1.00)
                #ldaModel = LDA()
                ldaModel.fit(cvTRAIN, diagnTRAIN)
                label = ldaModel.predict(cvTEST)
            elif classifier == 'qda':
                # training a quadratic discriminant classifier on the data
                # (unlike LDA, QDA accepts no solver or shrinkage arguments)
                priorsArrays = numpy.array((.5, .5))
                qdaModel = QDA(priors=priorsArrays)
                qdaModel.fit(cvTRAIN, diagnTRAIN)
                label = qdaModel.predict(cvTEST)
            elif classifier == 'tree':
                # training a decision tree to the data
                treeModel = tree()
                treeModel.fit(cvTRAIN, diagnTRAIN)
                label = treeModel.predict(cvTEST)
            elif classifier == 'svm':
                # training a support vector machine to the data
                svmModel = SVC()
                svmModel.fit(cvTRAIN, diagnTRAIN)
                label = svmModel.predict(cvTEST)

            trueClassLabel = diagnTEST
            predictedClassLabel = label

            #from former loop

            subAccMatrix = numpy.column_stack(
                (trueClassLabel, predictedClassLabel, real_artTEST))
            preAccMatrix[x:x + len(subAccMatrix[:, 0]), :] = subAccMatrix
            preInstOrder[x:x + len(instIndexTEST[:, 0])] = instIndexTEST

            x = x + len(subAccMatrix[:, 0])

            #for testing purposes
            #break
        # create dictionary for return values
        return {
            'cvTEST': cvTEST,
            'diagnTEST': diagnTEST,
            'real_artTEST': real_artTEST,
            'instIndexTEST': instIndexTEST,
            'cvTRAIN': cvTRAIN,
            'diagnTRAIN': diagnTRAIN,
            'real_artTRAIN': real_artTRAIN,
            'trueClassLabel': trueClassLabel,
            'predictedClassLabel': predictedClassLabel,
            'idx': idx,
            'subAccMatrix': subAccMatrix,
            'preAccMatrix': preAccMatrix,
            'preInstOrder': preInstOrder
        }
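
# --- Hedged sketch (not in the original): the comments above ask how MATLAB's cvpartition
# maps to Python. The loop only needs an iterable of (idx_train, idx_test) index pairs, so
# a scikit-learn splitter is one assumed way to build cvPartition (the data below is made up).
import numpy
from sklearn.model_selection import KFold

score = numpy.random.rand(60, 5)        # illustrative PCA scores
diagn = numpy.random.randint(0, 2, 60)  # illustrative class labels
cvPartition = list(KFold(n_splits=5, shuffle=True, random_state=0).split(score))
for idx_train, idx_test in cvPartition:
    print(len(idx_train), 'train /', len(idx_test), 'test samples')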
Example #7
x = data_set.data
y = data_set.target

cv_kfold = KFold(n_splits=30)

neighbors_classifiers = []
bayes_classifiers = []
tree_classifiers = []
svm_classifiers = []
forest_classifiers = []

for train_index, test_index in cv_kfold.split(y):
    neighbors_model = KNeighborsClassifier(n_neighbors=3)
    bayes_model = naive_bayes()
    tree_model = tree()
    svm_model = svm()
    forest_model = RandomForestClassifier()
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    neighbors_model.fit(X_train, y_train)
    bayes_model.fit(X_train, y_train)
    tree_model.fit(X_train, y_train)
    svm_model.fit(X_train, y_train)
    forest_model.fit(X_train, y_train)

    neighbors_classifiers.append(neighbors_model)
    bayes_classifiers.append(bayes_model)
    tree_classifiers.append(tree_model)
    svm_classifiers.append(svm_model)
Example #8
def mean_error(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = tree(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    prediction = model.predict(val_X)
    return mean_absolute_error(val_y, prediction)
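
# --- Hedged usage sketch (not in the original): sweep candidate max_leaf_nodes values
# through mean_error and keep the best one. The dataset and split below are assumptions;
# `tree` and mean_absolute_error are imported here as used in the surrounding examples.
from sklearn.datasets import load_iris
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as tree

X, y = load_iris(return_X_y=True)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

errors = {n: mean_error(n, train_X, val_X, train_y, val_y) for n in (5, 25, 50, 100)}
print(errors, 'best max_leaf_nodes:', min(errors, key=errors.get))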
Example #9
    clf.fit(X_train, y_train)

    y_predicted = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_predicted)
    f1 = f1_score(y_test, y_predicted, average="weighted")

    return accuracy, f1
    
      
#%%
from sklearn.tree import DecisionTreeClassifier as tree
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

classifiers = {
    'DecisionTree':tree(splitter='best', min_samples_split=10, min_samples_leaf= 6, max_features=100, 
                        max_depth= 30, criterion= 'gini'),
                        
    'Bagging':BaggingClassifier(n_estimators= 300, max_samples =0.7999999999999999,max_features=40),
    
    'RandomForest':RandomForestClassifier(n_estimators= 230, min_samples_split= 2, min_samples_leaf=2, 
                                          max_features=70, max_depth=45),
    
    'AdaBoost':AdaBoostClassifier(n_estimators=260, learning_rate=0.001, 
                                  base_estimator=tree(class_weight=None, criterion='gini', max_depth=10,
                                                      max_features=30, max_leaf_nodes=None,
                                                      min_impurity_decrease=0.0, min_impurity_split=None,
                                                      min_samples_leaf=5, min_samples_split=5,
                                                      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
                                                      splitter='best')),
                                                                                             
    'GradientBoostingTree':GradientBoostingClassifier(n_estimators=175,min_samples_split=2,
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") )

### add more features to features_list!
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)



### your code goes here 


# sklearn.cross_validation has been removed; model_selection provides train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)

clf = tree()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
accuracy = accuracy_score(pred,labels_test)

print(accuracy)  # 0.724137931034
print(np.array(labels_test))
print("number of POIs in the test set:")
num_pois_test = len([x for x in labels_test if x == 1.0])
print(num_pois_test)
print("total people in the test set:")
total_ppl_test = len(labels_test)
print(total_ppl_test)
print("If your identifier predicted 0. (not POI) for everyone in the test set, what would its accuracy be?")
acc = 1.0 - float(num_pois_test) / total_ppl_test
print(acc)
params_gs = {  # opening of the grid-search parameter dict (inferred from its use below)
    'max_depth': np.arange(1, 6),
    'min_samples_split': np.arange(3, 8),
    'min_samples_leaf': np.arange(1, 5)
}

params_rs = {
    'criterion': ('entropy', 'gini'),
    'splitter': ('best', 'random'),
    'max_depth': randint(1, 6),
    'min_samples_split': randint(3, 8),
    'min_samples_leaf': randint(1, 5)
}
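
# --- Hedged sketch (not in the original): params_rs is defined above but unused in the
# visible snippet; a randomized-search counterpart of the grid search below might look
# like this (x_tr / y_tr are the training arrays used by the grid-search cell).
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(tree(), param_distributions=params_rs, n_iter=50, cv=10,
                        scoring='accuracy', random_state=0, n_jobs=-1)
rs.fit(x_tr, y_tr)
print(rs.best_params_, rs.best_score_)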

# In[88]:

model = tree()
gs = GridSearchCV(tree(), cv=10, param_grid=params_gs, scoring='accuracy')
gs.fit(x_tr, y_tr)

# In[89]:

cv_score_gs = []
final_score_gs = []

for i in range(0, 100):
    print('Iteration: ' + str(i))
    gs = GridSearchCV(tree(),
                      cv=10,
                      param_grid=params_gs,
                      scoring='accuracy',
                      n_jobs=-1)