import numpy as np


def get_tree_results(tree, Xtest):
    """Runs data through a quantized DecisionTreeClassifier.

    :param tree: DTC function handle
    :param Xtest: data to test
    :returns: predicted results as a column vector
    """
    results = [tree(X) for X in Xtest]
    return np.array([results], ndmin=1).T
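# A minimal usage sketch, assuming `quantized_tree` is a hypothetical callable
# that predicts one sample at a time (e.g. a wrapper around a fitted sklearn
# DecisionTreeClassifier); it is not part of the original snippet.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
fitted = DecisionTreeClassifier().fit(X, y)

def quantized_tree(sample):
    # DecisionTreeClassifier.predict expects a 2-D array, so wrap the sample
    return fitted.predict(sample.reshape(1, -1))[0]

predictions = get_tree_results(quantized_tree, X)  # shape (n_samples, 1)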
# Aliases implied by the constructor arguments used below.
from sklearn.naive_bayes import GaussianNB as naive
from sklearn.tree import DecisionTreeClassifier as tree
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.neural_network import MLPClassifier as mlp


def select_classify():
    return [
        naive(),
        tree(criterion="entropy"),
        knn(n_neighbors=8, weights='uniform', metric="manhattan"),
        mlp(hidden_layer_sizes=(128,), alpha=0.01, activation='tanh',
            solver='sgd', max_iter=300, learning_rate='constant',
            learning_rate_init=0.001),
    ]
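# A minimal sketch of how the selected classifiers might be evaluated
# (assumption: X_train, X_test, y_train, y_test come from an earlier split;
# they are not defined in this snippet).
for clf in select_classify():
    clf.fit(X_train, y_train)
    print(type(clf).__name__, clf.score(X_test, y_test))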
# The first block scores a model fit earlier in the script. Metric aliases
# assumed from that earlier code: ac = accuracy_score, f1 = f1_score,
# prfs = precision_recall_fscore_support.
print('CM:', confusion_matrix(y_test, y_pred))
print('AC:', ac(y_test, y_pred))
print('F1 scores:', f1(y_test, y_pred))
print('PR:', prfs(y_test, y_pred))

from sklearn.naive_bayes import GaussianNB

model5 = GaussianNB().fit(X_train, y_train)
y_pred = model5.predict(X_test)
print('CM:', confusion_matrix(y_test, y_pred))
print('AC:', ac(y_test, y_pred))
print('F1 scores:', f1(y_test, y_pred))
print('PR:', prfs(y_test, y_pred))

from sklearn.tree import DecisionTreeClassifier as tree

model6 = tree(criterion='entropy').fit(X_train, y_train)
y_pred = model6.predict(X_test)
print('CM:', confusion_matrix(y_test, y_pred))
print('AC:', ac(y_test, y_pred))
print('F1 scores:', f1(y_test, y_pred))
print('PR:', prfs(y_test, y_pred))

from sklearn.ensemble import RandomForestClassifier as forest

model7 = forest(max_depth=5).fit(X_train, y_train)
y_pred = model7.predict(X_test)
print('CM:', confusion_matrix(y_test, y_pred))
print('AC:', ac(y_test, y_pred))
print('F1 scores:', f1(y_test, y_pred))
print('PR:', prfs(y_test, y_pred))
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
print(len(labels_train))
print(len(features_train))

clf = tree(min_samples_split=40)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
accuracy = accuracy_score(pred, labels_test)
print(accuracy)  # after removing 2 signature words: 0.816837315131

print('Importance of the most important feature:')  # 0.764705882353
print(max(clf.feature_importances_))
print('Index of the most important feature:')  # 33614
print(list(clf.feature_importances_).index(max(clf.feature_importances_)))
print(list(clf.feature_importances_)[-1])  # sshacklensf cgermannsf
from sklearn import datasets as ds
from sklearn.tree import DecisionTreeClassifier as tree
from sklearn.svm import SVC as svm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB as naive_bayes  # alias assumed from usage below
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

data_set = ds.load_digits()
x = data_set.data
y = data_set.target

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                     random_state=0)

neighbors_model = KNeighborsClassifier(n_neighbors=3)
bayes_model = naive_bayes()
tree_model = tree()
svm_model = svm()
forest_model = RandomForestClassifier()

neighbors_model.fit(X_train, y_train)
bayes_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
forest_model.fit(X_train, y_train)

# each of these holds a model's predictions on X_test
y_actual_neighbors = neighbors_model.predict(X_test)
y_actual_bayes = bayes_model.predict(X_test)
y_actual_tree = tree_model.predict(X_test)
y_actual_svm = svm_model.predict(X_test)
y_actual_forest = forest_model.predict(X_test)
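# A minimal sketch of how the predictions above could be scored with the
# imported `metrics` module (assumption: plain accuracy is the metric wanted).
for name, y_hat in [('knn', y_actual_neighbors), ('bayes', y_actual_bayes),
                    ('tree', y_actual_tree), ('svm', y_actual_svm),
                    ('forest', y_actual_forest)]:
    print(name, metrics.accuracy_score(y_test, y_hat))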
import numpy
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.tree import DecisionTreeClassifier as tree
from sklearn.svm import SVC


def classifierTrainTest(score, diagn, real_art, cvPartition, classifier,
                        subjIndex, preAccMatrix, preInstOrder):
    x = 0
    iteration = 0
    idx = 0
    PCNo = len(score[0])
    subAccMatrix = 0

    # FIX: what is test -> MATLAB function within the cvpartition class
    #idx = numpy.random.rand(cvPartition, iteration)
    #idx_test = numpy.where(idx == 1)
    #idx_train = numpy.where(idx != 1)
    print("cvPartition:")
    print(cvPartition)

    # QUESTION: cvPartition is not a scalar; how does it work?
    # iteration must be at least 2
    for idx_train, idx_test in cvPartition:
        # change idx to a boolean array
        idx = numpy.zeros((len(score), 1), dtype=bool)
        for index in idx_test:
            idx[index] = True
        n_test = int(idx.sum())          # number of test instances in this fold
        n_train = len(idx) - n_test

        # for testing purposes
        #idx = numpy.zeros((len(score), 1), dtype=bool)
        #idx[47] = True

        # idx is all training in the MATLAB implementation?
        cvTEST = numpy.zeros((n_test, PCNo))
        diagnTEST = numpy.zeros((n_test, 1))
        real_artTEST = numpy.zeros((n_test, 1))
        instIndexTEST = numpy.zeros((n_test, 1))
        cvTRAIN = numpy.zeros((n_train, PCNo))
        diagnTRAIN = numpy.zeros((n_train, 1))
        real_artTRAIN = numpy.zeros((n_train, 1))

        k = 0
        m = 0
        for j in range(len(idx)):
            if idx[j] == 1:
                cvTEST[k, :] = score[j, :]
                diagnTEST[k] = diagn[j]
                real_artTEST[k] = real_art[j]
                instIndexTEST[k] = subjIndex[j]
                k = k + 1
            else:
                cvTRAIN[m, :] = score[j, :]
                diagnTRAIN[m] = diagn[j]
                real_artTRAIN[m] = real_art[j]
                m = m + 1

        # FIX: use scikit-learn for classifiers and predictions
        if classifier == "lda":
            priorsArrays = numpy.array((.5, .5))
            ldaModel = LDA(solver='eigen', priors=priorsArrays, shrinkage=1.00)
            ldaModel.fit(cvTRAIN, diagnTRAIN.ravel())
            label = ldaModel.predict(cvTEST)
        elif classifier == 'qda':
            # training a quadratic discriminant classifier on the data
            # (unlike LDA, QDA accepts no solver/shrinkage arguments)
            priorsArrays = numpy.array((.5, .5))
            qdaModel = QDA(priors=priorsArrays)
            qdaModel.fit(cvTRAIN, diagnTRAIN.ravel())
            label = qdaModel.predict(cvTEST)
        elif classifier == 'tree':
            # training a decision tree on the data
            treeModel = tree()
            treeModel.fit(cvTRAIN, diagnTRAIN.ravel())
            label = treeModel.predict(cvTEST)
        elif classifier == 'svm':
            # training a support vector machine on the data
            svmModel = SVC()
            svmModel.fit(cvTRAIN, diagnTRAIN.ravel())
            label = svmModel.predict(cvTEST)

        trueClassLabel = diagnTEST
        predictedClassLabel = label

        # from the previous loop
        subAccMatrix = numpy.column_stack(
            (trueClassLabel, predictedClassLabel, real_artTEST))
        preAccMatrix[x:x + len(subAccMatrix[:, 0]), :] = subAccMatrix
        preInstOrder[x:x + len(instIndexTEST[:, 0])] = instIndexTEST
        x = x + len(subAccMatrix[:, 0])

        # for testing purposes
        #break

    # create a dictionary for the return values
    return {
        'cvTEST': cvTEST,
        'diagnTEST': diagnTEST,
        'real_artTEST': real_artTEST,
        'instIndexTEST': instIndexTEST,
        'cvTRAIN': cvTRAIN,
        'diagnTRAIN': diagnTRAIN,
        'real_artTRAIN': real_artTRAIN,
        'trueClassLabel': trueClassLabel,
        'predictedClassLabel': predictedClassLabel,
        'idx': idx,
        'subAccMatrix': subAccMatrix,
        'preAccMatrix': preAccMatrix,
        'preInstOrder': preInstOrder
    }
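# A minimal usage sketch for classifierTrainTest, under these assumptions:
# `score` is an (n_samples, n_PCs) array and diagn, real_art, subjIndex are
# length-n vectors defined elsewhere; cvPartition can be any iterable of
# (train_indices, test_indices) pairs, e.g. a KFold split generator.
from sklearn.model_selection import KFold

n = len(score)
cvPartition = KFold(n_splits=10, shuffle=True).split(score)
preAccMatrix = numpy.zeros((n, 3))    # true label, predicted label, real_art
preInstOrder = numpy.zeros((n, 1))    # order in which instances were tested
result = classifierTrainTest(score, diagn, real_art, cvPartition, 'lda',
                             subjIndex, preAccMatrix, preInstOrder)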
# Continues the digits example above; data_set and the aliases tree, svm,
# naive_bayes are assumed from that snippet.
from sklearn.model_selection import KFold

x = data_set.data
y = data_set.target

cv_kfold = KFold(n_splits=30)

neighbors_classifiers = []
bayes_classifiers = []
tree_classifiers = []
svm_classifiers = []
forest_classifiers = []

for train_index, test_index in cv_kfold.split(x):
    neighbors_model = KNeighborsClassifier(n_neighbors=3)
    bayes_model = naive_bayes()
    tree_model = tree()
    svm_model = svm()
    forest_model = RandomForestClassifier()

    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    neighbors_model.fit(X_train, y_train)
    bayes_model.fit(X_train, y_train)
    tree_model.fit(X_train, y_train)
    svm_model.fit(X_train, y_train)
    forest_model.fit(X_train, y_train)

    neighbors_classifiers.append(neighbors_model)
    bayes_classifiers.append(bayes_model)
    tree_classifiers.append(tree_model)
    svm_classifiers.append(svm_model)
    forest_classifiers.append(forest_model)
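# A minimal sketch of an alternative to the per-fold loop above:
# cross_val_score handles the split/fit/score cycle internally
# (assumption: mean accuracy is the quantity of interest).
from sklearn.model_selection import cross_val_score

for name, estimator in [('knn', KNeighborsClassifier(n_neighbors=3)),
                        ('tree', tree()),
                        ('forest', RandomForestClassifier())]:
    scores = cross_val_score(estimator, x, y, cv=cv_kfold)
    print(name, scores.mean())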
from sklearn.metrics import mean_absolute_error


def mean_error(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a tree with the given max_leaf_nodes and return its validation MAE."""
    model = tree(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    prediction = model.predict(val_X)
    return mean_absolute_error(val_y, prediction)
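# A minimal usage sketch: sweep a few candidate tree sizes and keep the one
# with the lowest validation error (assumption: train_X, val_X, train_y, val_y
# come from an earlier train/validation split; they are not defined here).
candidate_sizes = [5, 25, 50, 100, 250, 500]
errors = {n: mean_error(n, train_X, val_X, train_y, val_y)
          for n in candidate_sizes}
best_size = min(errors, key=errors.get)
print(errors, best_size)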
    # tail of an evaluation helper; its def line is not part of this excerpt
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_predicted)
    f1 = f1_score(y_test, y_predicted, average="weighted")
    return accuracy, f1

#%%
from sklearn.tree import DecisionTreeClassifier as tree
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier)
from xgboost import XGBClassifier

classifiers = {
    'DecisionTree': tree(splitter='best', min_samples_split=10,
                         min_samples_leaf=6, max_features=100,
                         max_depth=30, criterion='gini'),
    'Bagging': BaggingClassifier(n_estimators=300,
                                 max_samples=0.7999999999999999,
                                 max_features=40),
    'RandomForest': RandomForestClassifier(n_estimators=230,
                                           min_samples_split=2,
                                           min_samples_leaf=2,
                                           max_features=70, max_depth=45),
    'AdaBoost': AdaBoostClassifier(
        n_estimators=260, learning_rate=0.001,
        base_estimator=tree(class_weight=None, criterion='gini', max_depth=10,
                            max_features=30, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=5, min_samples_split=5,
                            min_weight_fraction_leaf=0.0, presort=False,
                            random_state=None, splitter='best')),
    'GradientBoostingTree': GradientBoostingClassifier(n_estimators=175,
                                                       min_samples_split=2,
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))

### add more features to features_list!
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

### your code goes here
# note: in newer scikit-learn versions train_test_split lives in
# sklearn.model_selection rather than sklearn.cross_validation
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(features, labels, test_size=0.3,
                                      random_state=42)

clf = tree()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
accuracy = accuracy_score(pred, labels_test)
print(accuracy)  # 0.724137931034

print(np.array(labels_test))

print("number of POIs in the test set:")
num_pois_test = len([x for x in labels_test if x == 1.0])
print(num_pois_test)

print("total people in the test set:")
total_ppl_test = len(labels_test)
print(total_ppl_test)

print("If your identifier predicted 0. (not POI) for everyone in the test set, "
      "what would its accuracy be?")
acc = 1.0 - float(num_pois_test) / total_ppl_test
print(acc)
    # tail of the params_gs grid used by GridSearchCV below; the opening of
    # the dict is not part of this excerpt
    'max_depth': np.arange(1, 6),
    'min_samples_split': np.arange(3, 8),
    'min_samples_leaf': np.arange(1, 5)
}

params_rs = {
    'criterion': ('entropy', 'gini'),
    'splitter': ('best', 'random'),
    'max_depth': randint(1, 6),
    'min_samples_split': randint(3, 8),
    'min_samples_leaf': randint(1, 5)
}

# In[88]:

model = tree()
gs = GridSearchCV(tree(), cv=10, param_grid=params_gs, scoring='accuracy')
gs.fit(x_tr, y_tr)

# In[89]:

cv_score_gs = []
final_score_gs = []
for i in range(0, 100):
    print('Iteration: ' + str(i))
    gs = GridSearchCV(tree(), cv=10, param_grid=params_gs, scoring='accuracy',
                      n_jobs=-1)
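# A minimal sketch of how params_rs (with its randint distributions) could be
# used with RandomizedSearchCV, mirroring the grid search above (assumption:
# x_tr and y_tr are the same training arrays passed to gs.fit).
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(tree(), param_distributions=params_rs, n_iter=50,
                        cv=10, scoring='accuracy', n_jobs=-1)
rs.fit(x_tr, y_tr)
print(rs.best_params_, rs.best_score_)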