def drawDecisionTree(data, name):
    data['change next day class'] = data['change next day'].apply(classify)
    X_train, X_test, y_train, y_test = train_test_split(
        data[['rate of increase', 'increase length',
              'rate of decrease', 'decrease length']],
        data['change next day class'],
        test_size=0.2,
        random_state=42)
    tree = DecisionTreeClassifier(max_depth=6, random_state=0)
    tree.fit(X_train, y_train)
    print('Train score: {:.3f}'.format(tree.score(X_train, y_train)))
    print('Test score: {:.3f}'.format(tree.score(X_test, y_test)))
    # Export the tree to a .dot file for visualization
    export_graphviz(tree,
                    out_file="tree.dot",
                    feature_names=['rate of increase', 'increase length',
                                   'rate of decrease', 'decrease length'],
                    impurity=False,
                    filled=True)
    # Render the visualization as a PDF
    graph = pydotplus.graph_from_dot_file('tree.dot')
    graph.write_pdf(name + '.pdf')
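# `classify` is referenced above but not defined in this snippet. A minimal
# sketch, assuming it buckets the next-day change into up/down classes
# (the threshold of 0 is an illustrative guess, not taken from the original):
def classify(change):
    """Hypothetical labeler: 1 if the next-day change is positive, else 0."""
    return 1 if change > 0 else 0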
def get_best_tree(model, X, keep_scores=False):
    """
    Given an ensemble of trees with an `estimators_` attribute, finds the
    single tree whose predictions most closely resemble the ensemble's
    overall prediction on `X`.

    Parameters
    ----------
    model : fitted ensemble with an `estimators_` attribute
    X : array-like of shape (n_samples, n_features)
    keep_scores : bool, optional
        If True, also return the per-tree similarity scores.

    Returns
    -------
    best_tree_number, nearest_tree[, scores]
    """
    overall_prediction = model.predict(X)
    predictions = dict()
    scores = dict()
    best_score, best_tree_number = -999, -999
    for tree_num, tree in enumerate(model.estimators_):
        predictions[tree_num] = tree.predict(X)
        # Score each tree against the ensemble's prediction, not the true labels
        new_score = tree.score(X, overall_prediction)
        scores[tree_num] = new_score
        if new_score > best_score:
            best_score = new_score
            best_tree_number = tree_num
    nearest_tree = model.estimators_[best_tree_number]
    if keep_scores:
        return best_tree_number, nearest_tree, scores
    return best_tree_number, nearest_tree
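# A minimal usage sketch for `get_best_tree`, using a random forest on
# synthetic data (all names below are illustrative, not from the original):
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X_demo, y_demo = make_regression(n_samples=200, n_features=5, random_state=0)
rf = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_demo, y_demo)
best_num, best_tree = get_best_tree(rf, X_demo)
print('Tree #{} best mirrors the forest'.format(best_num))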
def evaluate_regressor(tree, X, Y):
    """
    Evaluates a tree with the data values passed, returning the R^2 and MSE.
    """
    r2 = tree.score(X, Y)
    predictions = tree.predict(X)
    mse = np.average(np.power(predictions - Y.values, 2))
    return r2, mse
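# The hand-rolled MSE above is equivalent to sklearn's helper; a short
# sketch of the same evaluation using the library function:
from sklearn.metrics import mean_squared_error

def evaluate_regressor_sklearn(tree, X, Y):
    return tree.score(X, Y), mean_squared_error(Y, tree.predict(X))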
def process_chunk(self, chunk):
    def update_min_max_count_dict(key, dict1, dict2):
        return {
            'max': max(dict1[key]['max'], dict2[key]['max']),
            # Running minimum: min(), not max() as the original had
            'min': min(dict1[key]['min'], dict2[key]['min']),
            'count': dict1[key]['count'] + dict2[key]['count'],
        }

    new_min_max_count = get_feature_min_max_count(chunk, self.feature_names)
    if self.feature_min_max_count is None:
        self.feature_min_max_count = new_min_max_count
    else:
        self.feature_min_max_count = {
            key: update_min_max_count_dict(key, self.feature_min_max_count,
                                           new_min_max_count)
            for key in self.feature_names
        }
    chunk = self.__preprocess_data(chunk, self.feature_names)
    # Note: criterion='mse' was renamed to 'squared_error' in scikit-learn 1.0
    tree = sklearn.tree.DecisionTreeRegressor(criterion='mse',
                                              random_state=42,
                                              **self.model_params)
    train, test = sklearn.model_selection.train_test_split(chunk,
                                                           random_state=42)
    # Keep a small sample of each chunk (at most 100 rows) for later plotting
    plot, _ = sklearn.model_selection.train_test_split(
        chunk,
        random_state=42,
        train_size=min(100, len(chunk) - 1),
        test_size=0)
    self.plot_data_frames.append(plot)
    tree.fit(train[self.model_feature_names], train[self.output_name])
    self.test_scores.append(
        tree.score(test[self.model_feature_names], test[self.output_name]))
    self.train_scores.append(
        tree.score(train[self.model_feature_names], train[self.output_name]))
    self.trees.append(tree)
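# `get_feature_min_max_count` is called above but not shown. A minimal sketch,
# assuming `chunk` is a pandas DataFrame and the function returns per-feature
# min/max/count in the shape the merge logic above expects:
def get_feature_min_max_count(chunk, feature_names):
    return {
        name: {
            'max': chunk[name].max(),
            'min': chunk[name].min(),
            'count': chunk[name].count(),
        }
        for name in feature_names
    }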
def cluster_then_forest(xs, ys, in_sample_size):
    isi, in_sample, osi, out_sample = create_in_out_samples(xs, in_sample_size)
    clf = cluster.KMeans(n_clusters=4)
    clf.fit(in_sample)
    oos_clusterid = clf.predict(out_sample)
    ins_clusterid = clf.predict(in_sample)
    for cluster_id in numpy.unique(oos_clusterid):
        print("Now working on Cluster " + str(cluster_id))
        oos_ind = oos_clusterid == cluster_id
        ins_ind = ins_clusterid == cluster_id
        # Fit a separate forest per cluster
        tree = ensemble.RandomForestRegressor(50)
        tree.fit(in_sample[ins_ind], ys[isi][ins_ind])
        print("Score for in-sample")
        print(str(tree.score(in_sample[ins_ind], ys[isi][ins_ind])))
        print("Score for out-of-sample")
        print(str(tree.score(out_sample[oos_ind], ys[osi][oos_ind])))
    return None
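# `create_in_out_samples` is not defined in this snippet. A minimal sketch,
# assuming `xs` is a numpy array and the function returns (in-sample indices,
# in-sample rows, out-of-sample indices, out-of-sample rows) via a random
# permutation:
def create_in_out_samples(xs, in_sample_size):
    indices = numpy.random.permutation(len(xs))
    isi, osi = indices[:in_sample_size], indices[in_sample_size:]
    return isi, xs[isi], osi, xs[osi]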
def cross_val(data, classifiers):
    averages = []
    data = np.array(data)
    classifiers = np.array(classifiers)
    for i in range(5):
        bayes_estimates = []
        tree_estimates = []
        # sklearn.cross_validation was removed; use model_selection.KFold
        # instead (assumes `from sklearn import model_selection`)
        folds = model_selection.KFold(n_splits=10, shuffle=True)
        for train, test in folds.split(data):
            bayes = naive_bayes(data[train], classifiers[train])
            bayes_estimates.append(bayes.score(data[test], classifiers[test]))
            tree = decision_tree(data[train], classifiers[train])
            tree_estimates.append(tree.score(data[test], classifiers[test]))
        averages.append([np.mean(bayes_estimates), np.mean(tree_estimates)])
    visualize(averages)
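# `naive_bayes` and `decision_tree` are helper factories not shown here. A
# minimal sketch of what they plausibly do (fit and return a model):
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

def naive_bayes(X, y):
    return GaussianNB().fit(X, y)

def decision_tree(X, y):
    return DecisionTreeClassifier().fit(X, y)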
def classify(keyword2int):
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.misc
    from PIL import Image
    import pprint
    pp = pprint.PrettyPrinter(indent=4)

    # In[18]:
    import sklearn
    from sklearn import datasets
    import skimage.io

    # ## Datasets preparation

    # In[19]:
    def png2vec(filename):
        # Load an image as a grayscale numpy array
        img = Image.open(filename).convert('L')
        arr = np.array(img)
        return arr

    # In[20]:
    filesetNames = ["%01d" % x for x in range(0, 2)]

    # In[21]:
    import os
    images = []
    tgt = []
    count = 0
    img_test = Image.open("./img/0/1.jpg")
    for curFileset in filesetNames:
        curPath = "./img/" + curFileset + "/"
        for file in os.listdir(curPath):
            curImageVector = png2vec(curPath + file)
            images.append(curImageVector)
            tgt.append(curFileset)
            count += 1
    print(len(images))

    # In[22]:

    # In[24]:
    from sklearn.model_selection import train_test_split
    from sklearn import model_selection, metrics
    images_np = np.array(images)
    img = images_np.reshape(images_np.shape[0], -1)
    xM, xT, yM, yT = train_test_split(img, tgt, test_size=0.01)
    print(yT)
    # Pad the tiny test split with the first 20 training samples
    xT = list(xT)
    xT.extend(xM[0:20])
    yT.extend(yM[0:20])
    print(yT)
    xT = np.array(xT)
    print(yT)

    # # Naive Bayes

    # In[25]:
    # keyword2int must be passed in from the search step,
    # e.g. keyword2int = {'dog': 0, 'cat': 1}
    from sklearn import metrics
    from sklearn import naive_bayes
    cls = naive_bayes.GaussianNB()
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(type(xT[0:1]))
    print(len(xT), len(xT[0]), 'result', type(xT))
    print(res)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          "GaussianNB")
    # plt.title('This image is ' + str(list(keyword2int.keys())[0]) +
    #           ', the prediction is ' + str(list(keyword2int.keys())[int(res_test[0])]))
    # plt.imshow(img_test)
    # plt.show()

    # In[26]:
    cls = naive_bayes.BernoulliNB(binarize=0.9)
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          "BernoulliNB")

    # In[27]:
    cls = naive_bayes.MultinomialNB(alpha=0.1, fit_prior=True, class_prior=None)
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          "naive_bayes.MultinomialNB")

    # # KNN

    # In[28]:
    from sklearn.neighbors import KNeighborsClassifier
    cls = KNeighborsClassifier(n_neighbors=1)
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          ' knn')

    # # Decision Tree

    # In[33]:
    from sklearn import tree
    cls = tree.DecisionTreeClassifier()
    cls = cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          "Decision Tree")

    # Show one test image (assumed 100x100 pixels) with its true and predicted labels
    import random
    index = random.randint(0, 10)
    x_test = xT[index]
    y_tag = yT[index]
    DT_res = cls.predict([x_test])
    x_test_image_list = list(x_test)
    x_temp = []
    x_image = []
    for j in range(100):
        for i in range(100):
            x_temp.append(x_test[i + j * 100])
        x_image.append(x_temp)
        x_temp = []
    np.array(x_image)
    plt.title('the image is ' + str(list(keyword2int.keys())[int(y_tag)]) +
              ' result is ' + str(list(keyword2int.keys())[int(DT_res[0])]))
    print(DT_res)
    print(list(keyword2int.keys())[int(y_tag)],
          list(keyword2int.keys())[int(DT_res[0])])
    plt.imshow(x_image)
    plt.show()
    import sys
    sys.exit()

    # In[40]:
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(random_state=0, min_samples_split=2)
    tree.fit(xT, yT)
    print(tree.score(xT, yT), tree.score(xM, yM), "DecisionTreeClassifier")

    # In[37]:
    DecisionTreeClassifier()

    # # random forest

    # In[56]:
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators=150, random_state=0)
    forest.fit(xT, yT)
    forest.score(xT, yT), forest.score(xM, yM)

    # In[44]:
    forest

    # In[74]:
    scoreListUniform = []
    nListUniform = []
    stepCount = 10
    stepDist = 200
    start = 200
    end = 2001
    for curCount in range(start, end, stepDist):
        forest = RandomForestClassifier(n_estimators=curCount, random_state=0)
        forest.fit(xT, yT)
        nListUniform.append(curCount)
        scoreListUniform.append(forest.score(xM, yM))
    scoreListUniform

    # ## SVM

    # In[76]:
    from sklearn import svm
    cls = svm.SVC()
    cls = cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          " svm")

    from sklearn import svm
    import matplotlib.pyplot as plt
    import numpy
    # `digits` was undefined below; assuming sklearn's digits dataset
    from sklearn.datasets import load_digits
    digits = load_digits()

    n_trials = 3
    train_percentages = range(5, 95, 5)
    test_accuracies = numpy.zeros(len(train_percentages))
    for (i, tp) in enumerate(train_percentages):
        test_accuracy = numpy.zeros(n_trials)
        for n in range(n_trials):
            xM, xT, yM, yT = train_test_split(digits.data,
                                              digits.target,
                                              train_size=tp / 100.0)
            cls = svm.LinearSVC().fit(xM, yM)
            res = cls.predict(xT)
            test_accuracy[n] = metrics.accuracy_score(yT, res)
        test_accuracies[i] = test_accuracy.mean()
        print(i, tp, test_accuracies[i])
    print(train_percentages)
    fig = plt.figure()
    plt.plot(train_percentages, test_accuracies)
    plt.xlabel('Percentage of Data Used for Training')
    plt.ylabel('Accuracy on Test Set')
    plt.show()
# Fill null values with the column mean
new_data_train['Age'].fillna(new_data_train['Age'].mean(), inplace=True)
new_data_test['Age'].fillna(new_data_test['Age'].mean(), inplace=True)

# Check for null values, in descending order, showing the top 10
new_data_test.isnull().sum().sort_values(ascending=False).head(10)

# Fill the null value in the Fare column with the mean
new_data_test['Fare'].fillna(new_data_test['Fare'].mean(), inplace=True)

# Separate the features for building the model
X = new_data_train.drop("Survived", axis=1)  # everything except the target column
y = new_data_train["Survived"]  # only the target column

from sklearn.tree import DecisionTreeClassifier

# Constrain the size of the decision tree
tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(X, y)

# Evaluate the model (note: this scores on the training data itself)
tree.score(X, y)

# Build the prediction for the Kaggle submission
previsao = pd.DataFrame()
previsao["PassengerId"] = new_data_test["PassengerId"]
previsao["Survived"] = tree.predict(new_data_test)

# Export to CSV
previsao.to_csv('previsao.csv', index=False)
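# tree.score(X, y) above measures accuracy on the very data the tree was fit
# on, which is optimistic. A minimal sketch of a less biased estimate via
# cross-validation (same X, y as above):
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(tree, X, y, cv=5)
print("5-fold CV accuracy: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))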
regiao = 'Grande Florianópolis'
localizacao = 'URBANA'
serie = '8º ano'

regiao_enc = encoders['regiao'].transform([regiao])[0]
localizacao_enc = encoders['localizacao'].transform([localizacao])[0]
serie_enc = encoders['serie'].transform([serie])[0]

prediction_enc = tree.predict([[regiao_enc, localizacao_enc, serie_enc]])
# inverse_transform expects an array-like in current scikit-learn versions
prediction = encoders['status'].inverse_transform([int(prediction_enc[0])])[0]
proba = tree.predict_proba([[regiao_enc, localizacao_enc, serie_enc]])[0]

# Output
print('\nClassifier accuracy: ' + str(tree.score(data, target)))
print('\nThe prediction returned: ' + prediction)
if prediction == 'Suficiente':
    print('The student meets the prerequisites of the Bolsa Família program.')
else:
    print('The student does not meet the prerequisites of the Bolsa Família '
          'program and must be removed from it.')
print('\nWeight of each variable:')
for i in range(len(default_csv[0]) - 1):
    print('\t' + default_csv[0][i] + ': ' + str(tree.feature_importances_[i]))
print('\nDegree of certainty of each possible answer:')
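# The snippet ends after announcing the per-class certainties without printing
# them. A minimal sketch of the missing loop, assuming `tree.classes_` holds
# the encoded class labels and `encoders['status']` can decode them:
for class_enc, p in zip(tree.classes_, proba):
    label = encoders['status'].inverse_transform([int(class_enc)])[0]
    print('\t' + label + ': ' + str(p))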
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split

X = data.values[:, :-1]
y = data.values[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# Bind the models to names of their own instead of shadowing the
# `svm` and `tree` modules, as the original did
svm_clf = SVC(probability=True)
logreg = LogisticRegression()
tree_clf = DecisionTreeClassifier()

svm_clf.fit(X_train, y_train)
logreg.fit(X_train, y_train)
tree_clf.fit(X_train, y_train)

print("SVM Accuracy: %.2f%%" % (svm_clf.score(X_test, y_test) * 100))
print("LogReg Accuracy: %.2f%%" % (logreg.score(X_test, y_test) * 100))
print("Tree Accuracy: %.2f%%" % (tree_clf.score(X_test, y_test) * 100))

# Equal weights; with voting='hard' the weights are applied to class votes
w = [1, 1, 1]
ensemble = VotingClassifier(
    estimators=[('svm', svm_clf), ('logreg', logreg), ('tree', tree_clf)],
    voting='hard',
    weights=w)
ensemble.fit(X_train, y_train)
print("Ensemble Accuracy: %.2f%%" % (ensemble.score(X_test, y_test) * 100))
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()
# Accuracy
print(kNN.score(x_test, y_test))

'''Build the decision tree model'''
# Note: this rebinds the name `tree`, shadowing the sklearn `tree` module
tree = tree.DecisionTreeClassifier()
tree.fit(x_train, y_train)
# Plot
draw(tree)
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()
# Accuracy
print(tree.score(x_test, y_test))

'''Next, use bagging ensemble learning with kNN'''
# 100 samples without replacement, i.e. train 100 kNN classifiers
bagging_kNN = BaggingClassifier(kNN, n_estimators=100)
bagging_kNN.fit(x_train, y_train)
# Plot
draw(bagging_kNN)
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()
# Accuracy
print(bagging_kNN.score(x_test, y_test))

'''Bagging ensemble with decision trees'''
bagging_tree = BaggingClassifier(tree, n_estimators=100)
bagging_tree.fit(x_train, y_train)
targets_test = test.loc[:, 'over_50k']

# The decision tree tends to overfit the training data at the expense of
# lower accuracy in testing; this can be mitigated by specifying a maximum
# tree depth. Use the validation data to pick a tree depth.
validation_accs = []  # list of all validation accuracies
for i in range(1, 24):
    # Decision tree model
    tree = DecisionTreeClassifier(max_depth=i, random_state=0)  # specify model type
    tree.fit(data_train, targets_train)  # fit the model
    # Add the accuracy for this depth to validation_accs
    validation_accs.append(100 * tree.score(data_validation, targets_validation))

# The tree depth that produced the greatest validation accuracy --
# in general this tends to be 7 or 8
depth = validation_accs.index(max(validation_accs)) + 1
print('\nSelecting a maximum tree depth of ' + str(depth))

# Decision tree model
model = DecisionTreeClassifier(max_depth=depth, random_state=0)  # specify model type
model.fit(data_train, targets_train)  # fit the model
# Score the final model, not the last tree from the loop above
print('Testing accuracy: ' +
      str('%.2f' % (100 * model.score(data_test, targets_test))) +
      '%')  # tends to be roughly 82%
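# An optional sketch to visualize the depth sweep above, assuming matplotlib
# is available as plt:
import matplotlib.pyplot as plt

plt.plot(range(1, 24), validation_accs)
plt.xlabel('Maximum tree depth')
plt.ylabel('Validation accuracy (%)')
plt.show()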
print(All_X.shape, train_X.shape, val_X.shape, train_y.shape, val_y.shape,
      test_X.shape)

# In[ ]:
# Feature importance
tree = DecisionTreeClassifier(random_state=99)
tree.fit(train_X, train_y)
imp = pd.DataFrame(tree.feature_importances_,
                   columns=['Importance'],
                   index=train_X.columns)
imp = imp.sort_values(['Importance'], ascending=True)
imp[:10].plot(kind='barh')
print(tree.score(train_X, train_y))

# In[ ]:
# Modeling
MLA = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
info = [0.66, 0.8, 0.9, 0.95]
depths = len(info) * [None]
nleaves = len(info) * [None]
accuracies_train = len(info) * [None]
accuracies_test = len(info) * [None]

for index in range(0, len(info)):
    elem = info[index]
    # Keep enough principal components to retain `elem` of the variance
    pca = PCA(elem)
    reduced_d2_X_train = pca.fit_transform(d2_X_train)
    reduced_d2_X_test = pca.transform(d2_X_test)
    tree = DecisionTreeClassifier()
    model = tree.fit(reduced_d2_X_train, y_train)
    depths[index] = tree.get_depth()
    nleaves[index] = tree.get_n_leaves()
    accuracies_train[index] = tree.score(reduced_d2_X_train, y_train)
    accuracies_test[index] = tree.score(reduced_d2_X_test, y_test)

t = PrettyTable()
t.add_column("% information", info)
t.add_column("Tree depth", depths)
t.add_column("number of leaves", nleaves)
t.add_column("Accuracy on train set", accuracies_train)
t.add_column("Accuracy on test set", accuracies_test)
print(t)

res_model.append("Decision tree")
res_param.append("Unrestricted max depth \n66% of information")
res_train_acc.append(accuracies_train[0])
res_valid_acc.append("-")
res_test_acc.append(accuracies_test[0])
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split

X = data.values[:, :-1]
y = data.values[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# Avoid rebinding the `svm` and `tree` module names, as the original did
svm_clf = SVC(probability=True)  # probability=True is required for soft voting
logreg = LogisticRegression()
tree_clf = DecisionTreeClassifier()

svm_clf.fit(X_train, y_train)
logreg.fit(X_train, y_train)
tree_clf.fit(X_train, y_train)

svm_acc = svm_clf.score(X_test, y_test)
logreg_acc = logreg.score(X_test, y_test)
tree_acc = tree_clf.score(X_test, y_test)
print("SVM Accuracy: %.2f%%" % (svm_acc * 100))
print("LogReg Accuracy: %.2f%%" % (logreg_acc * 100))
print("Tree Accuracy: %.2f%%" % (tree_acc * 100))

# Weight each model's vote by its test accuracy
w = [svm_acc, logreg_acc, tree_acc]
ensemble = VotingClassifier(estimators=[('svm', svm_clf), ('logreg', logreg),
                                        ('tree', tree_clf)],
                            voting='soft',
                            weights=w)
ensemble.fit(X_train, y_train)
print("Ensemble Accuracy: %.2f%%" % (ensemble.score(X_test, y_test) * 100))
def evaluate_classifier(tree, X, Y):
    """
    Evaluates a classifier on the data passed, returning its mean accuracy
    (for classifiers, `score` returns accuracy, not R^2 or MSE).
    """
    accuracy = tree.score(X, Y)
    return accuracy
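# If a miss rate (as the original variable name `miss_rate` suggested) is
# actually wanted, it is the complement of the accuracy; a minimal sketch:
from sklearn.metrics import zero_one_loss

def evaluate_classifier_miss_rate(tree, X, Y):
    # Equivalent to 1 - tree.score(X, Y)
    return zero_one_loss(Y, tree.predict(X))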
def decision_tree():
    # Decision Tree
    # Importing the class directly avoids the UnboundLocalError that
    # `tree = tree.DecisionTreeClassifier()` raises inside a function
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    tree_pred = tree.predict(X_test)
    print("Tree test set score: {:.5f}".format(tree.score(X_test, y_test)))
import io
# scipy.misc.imread was removed in SciPy 1.2; read the PNG with matplotlib instead
# from sklearn.metrics import accuracy_score


def show_tree(tree, features, path):
    f = io.StringIO()
    export_graphviz(tree,
                    out_file=f,
                    class_names=['malignant', 'benign'],
                    feature_names=features,
                    impurity=False)
    pydotplus.graph_from_dot_data(f.getvalue()).write_png(path)
    img = plt.imread(path)
    plt.rcParams['figure.figsize'] = (20, 20)
    plt.imshow(img)


cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data,
                                                    cancer.target,
                                                    stratify=cancer.target,
                                                    random_state=42)
tree = DecisionTreeClassifier(max_depth=6, min_samples_split=10, random_state=0)
tree.fit(X_train, y_train)
show_tree(tree, cancer.feature_names, "BreastCancer.png")
accuracy = tree.score(X_test, y_test)
print('Accuracy :', accuracy)
    edges[edge.get_source()].append(int(edge.get_destination()))

for edge in edges:
    edges[edge].sort()
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])

graph.write_png('tree.png')
"""

#########################################################################

prediction = tree.predict(test_features)
print("The prediction accuracy is: ",
      tree.score(test_features, test_targets) * 100, "%")

# x axis values
x = ['sun', 'mon', 'fri', 'sat', 'tue', 'wed', 'thu']
# y axis values
y = [5, 6.7, 4, 6, 2, 4.9, 1.8]

# plotting strip plot with seaborn
ax = sns.stripplot(x, y)
# giving labels to x-axis and y-axis
ax.set(xlabel='x', ylabel='y')
# giving title to the plot
plt.title('My first graph')
    4,  # number of epochs, without a reduction in the loss, that the monitor waits before acting
    verbose=1,
)

# Use the monitors created above
history = model.fit(X_train,
                    y_train_cat,
                    batch_size=batch_size,
                    epochs=epochs,
                    callbacks=[reduce_lr, early_stopping],
                    verbose=1,
                    validation_data=(X_valid, y_valid_cat))

# Validation error rates (1 - accuracy) for each model
print("1-NN: ", 1 - neigh.score(X_valid, y_valid))
print("SVM: ", 1 - svm.score(X_valid, y_valid))
print('Tree: ', 1 - tree.score(X_valid, y_valid))
print("MLP:", 1 - model.evaluate(X_valid, y_valid_cat, verbose=0)[1])

plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
print("Acuracia de teste NAIVE BAYES: %0.3f" % te1) #precision NAIVE BAYES y_pred1 = gnb.predict(X_test) micro_precision1 = precision_score(Y_test, y_pred1, average='macro') print("Precision NAIVE BAYES: %0.3f" % micro_precision1) #recall NAIVE BAYES recall1 = recall_score(Y_test, y_pred1, average='macro') print("Recall NAIVE BAYES: %0.3f" % recall1) #criando e treinando arvore de decisao tree = tree.DecisionTreeClassifier(criterion='entropy', random_state=1) t_tree = time() tree = tree.fit(X_train, Y_train) tf_tree = round(time() - t_tree, 3) print("Training time Arvore de Decisao:", tf_tree, "s") t2 = round(tree.score(X_train, Y_train), 3) te2 = round(tree.score(X_test, Y_test), 3) #acuracia treino e test print("Acuracia de treinamento ARVORE DE DECISAO: %0.3f" % t2) print("Acuracia de teste ARVORE DE DECISAO: %0.3f" % te2) #precision arvore de decisao y_pred2 = tree.predict(X_test) micro_precision2 = precision_score(Y_test, y_pred2, average='macro') print("Precision ARVORE DE DECISAO: %0.3f" % micro_precision2) #recall ARVORE DE DECISAO recall2 = recall_score(Y_test, y_pred2, average='macro') print("Recall ARVORE DE DECISAO: %0.3f" % recall2) #criando e treinando logistic regression lr = LogisticRegression(random_state=1) t_lr = time()
# max_depth = 3
list_min_leaves = [i for i in range(1, 10)]
for criterion in insert_criterion:
    for depth in range(2, 7):
        for min_leaf in list_min_leaves:
            # tree = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
            #                               min_samples_leaf=min_leaf, random_state=42)
            tree = DecisionTreeClassifier(criterion=criterion,
                                          max_depth=depth,
                                          min_samples_leaf=min_leaf,
                                          random_state=42)
            tree.fit(x_train, y_train)
            train_result.append(tree.score(x_train, y_train))
            test_result.append(tree.score(x_test, y_test))
            criterions.append(criterion)
            # max_depths.append(max_depth)
            max_depths.append(depth)
            min_leaves.append(min_leaf)

result = pd.DataFrame()
result["Criterion"] = criterions
result["Depth"] = max_depths
result["MinLeafSize"] = min_leaves
result["Train_Acc"] = train_result
result["Test_Acc"] = test_result
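# A short follow-up sketch: pick the configuration with the best test
# accuracy from the `result` frame built above:
best = result.sort_values("Test_Acc", ascending=False).head(1)
print(best)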
forest = sklearn.ensemble.RandomForestRegressor(n_estimators=100,
                                                max_depth=10,
                                                random_state=4324)
# forest_pipe = sklearn.pipeline.Pipeline([("Scale", scaler), ("Model", forest)])
forest.fit(X_train, y_train)

tree = sklearn.tree.DecisionTreeRegressor(max_depth=3, random_state=575767)
tree.fit(X_train, y_train)

# Print LaTeX table rows of train/test R^2 scores
print("Linear Regression & {:.3f} & {:.3f} \\\\".format(
    lm.score(X_train, y_train), lm.score(X_test, y_test)))
print("Random Forest & {:.3f} & {:.3f} \\\\".format(
    forest.score(X_train, y_train), forest.score(X_test, y_test)))
print("Decision Tree & {:.3f} & {:.3f} \\\\".format(
    tree.score(X_train, y_train), tree.score(X_test, y_test)))

# fig, ax = plt.subplots(figsize=(15, 5))
# sklearn.tree.plot_tree(tree, filled=True, impurity=False, fontsize=10, ax=ax)
# plt.show()

vaasa_iter = [
    get_data_week_and_two_previous(col, "Vaasa", algae) +
    get_data_week_and_two_previous(col, "Vaasa", temps) +
    get_data_week_and_two_previous(col, "Vaasa", rains) +
    get_data_week_and_two_previous(col, "Vaasa", aqs)
    # + [population.loc["Vaasa"][str(col[0])]]
    + [visits.loc["Vaasa"][col] / population.loc["Vaasa"][str(col[0])]]
    for col in visits.columns
]
vaasa_data = np.stack(vaasa_iter)
                           max_iter=500,
                           random_state=17,
                           n_jobs=4,
                           multi_class='multinomial')
logit_pipe = Pipeline([('scaler', StandardScaler()), ('logit', logit)])
logit_pipe.fit(X_train, y_train)
y_pred_lf = logit_pipe.predict(X_test)
score_lf = accuracy_score(y_test, y_pred_lf)
##### 0.41

##### Decision Tree
# (note: this rebinds `tree`, shadowing the sklearn `tree` module)
tree = tree.DecisionTreeClassifier()
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)
score_tree = tree.score(X_test, y_test)
##### 0.38

##### Random Forest
forest = RandomForestClassifier(n_estimators=100, random_state=17, n_jobs=4)
forest.fit(X_train, y_train)
y_pred_forest = forest.predict(X_test)
S = accuracy_score(y_test, y_pred_forest)
##### 0.43

##### SVM
### Linear
from sklearn.svm import SVC
svc = OneVsRestClassifier(SVC()).fit(X_train, y_train)
loans_info['orig_fico'] = loans_orig_info['nb_original_fico']
loans_info['nb_realized_loss'] = np.log(loans_info['nb_realized_loss'])
loans_info['orig_balance'] = np.log(loans_info['orig_balance'])

vals = 0
runs = 100
x1 = 0
x2 = 0
x3 = 0
x4 = 0
for x in range(0, runs):
    clf = DecisionTreeClassifier()
    data = loans_info.dropna(axis=0, how='any')
    X = data.drop(columns={'nb_loan_number', 'nb_realized_loss',
                           'caused_loss', 'age'})
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        data['caused_loss'],
                                                        test_size=.7,
                                                        train_size=.3)
    tree = clf.fit(X_train, Y_train)
    vals = vals + tree.score(X_test, Y_test)
    # print(tree.feature_importances_)
    x1 += tree.feature_importances_[0]
    x2 += tree.feature_importances_[1]
    x3 += tree.feature_importances_[2]
    x4 += tree.feature_importances_[3]

print(vals / runs)
print('Average Importance of Loan Term, Loan Amount, FICO Score, and Loan to Value Ratio')
print(x1 / runs)
print(x2 / runs)
print(x3 / runs)
print(x4 / runs)
def decision_tree_regressor(x_train, y_train, x_test, y_test):
    # Import the class directly rather than rebinding the `tree` module name
    from sklearn.tree import DecisionTreeRegressor
    tree = DecisionTreeRegressor()
    tree.fit(x_train, y_train)
    # For regressors, score() returns the R^2 on the test data
    value = tree.score(x_test, y_test)
    return "{0:.2f}".format(value)
def getTreeImportanceAndScore(tree, features, target):
    importance = tree.feature_importances_
    score = tree.score(features, target)
    return importance, score
########################################################################################################################
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# some documents still include the cross-validation option, but it no longer exists in version 18.0
import pylab as plt

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data,
                                                    cancer.target,
                                                    stratify=cancer.target,
                                                    random_state=42)
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

for i in range(2, 10):
    tree = DecisionTreeClassifier(max_depth=i, random_state=0)
    tree.fit(X_train, y_train)
    export_graphviz(tree,
                    out_file="tree" + str(i) + ".dot",
                    class_names=["malignant", "benign"],
                    feature_names=cancer.feature_names,
                    impurity=False,
                    filled=True)
    print("Accuracy on training set: {:.3f}".format(
        tree.score(X_train, y_train)))
    print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
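# The loop above only writes .dot files; a minimal sketch for rendering them
# to PNGs, assuming pydotplus and Graphviz are installed:
import pydotplus

for i in range(2, 10):
    graph = pydotplus.graph_from_dot_file("tree" + str(i) + ".dot")
    graph.write_png("tree" + str(i) + ".png")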
yList = [predY10, predY30, predY50, predY70, usedTrainY]
yListName = [
    '10% labeled', '30% labeled', '50% labeled', '70% labeled', 'All data'
]
semiRecord = []
for yIdx, y in enumerate(yList):
    CV = cross_val_score(BaggingClassifier(cartTree,
                                           max_samples=0.7,
                                           max_features=1.0),
                         usedTrainX,
                         y,
                         cv=5).mean()
    # CV = cross_val_score(ridge, usedTrainX, y, cv=5).mean()
    # fit() returns the fitted estimator, so a second fit call is not needed
    tree = cartTree_bagging.fit(usedTrainX, y)
    testScore = tree.score(usedTestX, usedTestY)
    print(yListName[yIdx])
    print('CV:\t{}'.format(CV))
    print('Test:\t{}'.format(testScore))
    semiRecord.append({'name': yListName[yIdx], 'CV': CV, 'test': testScore})

#%% 4. Predict (batch data)
# sklearn.cross_validation was removed; import from model_selection instead
from sklearn.model_selection import train_test_split

x_batch_10 = usedTrainX[y_10 != -1]
y_batch_10 = usedTrainY[y_10 != -1]
x_batch_30 = usedTrainX[y_30 != -1]
y_batch_30 = usedTrainY[y_30 != -1]
x_batch_50 = usedTrainX[y_50 != -1]
y_batch_50 = usedTrainY[y_50 != -1]
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

income_data = pd.read_csv('income.csv', header=0, delimiter=', ')
print(income_data.iloc[0])
labels = income_data[["income"]]
income_data['sex-int'] = income_data['sex'].apply(
    lambda x: 0 if x == 'Male' else 1)
# The dataset spells the value 'United-States'; the original compared against
# 'United-State', which never matches
income_data['country-int'] = income_data['native-country'].apply(
    lambda y: 0 if y == 'United-States' else 1)
data = income_data[['age', 'capital-gain', 'capital-loss', 'hours-per-week',
                    'sex-int', 'country-int']]
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, random_state=1)

forest = RandomForestClassifier(random_state=1)
forest.fit(train_data, train_labels)
# print(forest.feature_importances_)
print(forest.score(test_data, test_labels))

# Comparison with a single decision tree
tree = DecisionTreeClassifier(random_state=1)
tree.fit(train_data, train_labels)
print(tree.score(test_data, test_labels))
# print(income_data['native-country'].value_counts())
    lambda occ: 0 if occ in [
        "Adm-clerical", "Armed-Forces", "Priv-house-serv", "Handlers-cleaners"
    ] else 1)

labels = income_data[["income"]]
data = income_data[[
    "capital-gain", "capital-loss", "education-num", "occupation-int"
]]
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, random_state=1)

forest = RandomForestClassifier(n_estimators=150, random_state=2)
forest.fit(train_data, train_labels)

# Show importances of features
print("importances: ", forest.feature_importances_)
print(forest.score(test_data, test_labels))  # 84.45% correct

# Sweep the tree depth and keep the best test score
prediction = {}
for i in range(1, 20):
    tree = DecisionTreeClassifier(random_state=2, max_depth=i)
    tree.fit(train_data, train_labels)
    prediction.update({i: tree.score(test_data, test_labels)})
print("Decision Tree best: ",
      max(prediction.items(), key=operator.itemgetter(1)))  # 84.47% correct

me = np.array([0, 0, 13, 1]).reshape(1, -1)
print("Random Forest Prediction of me: ", forest.predict(me))