# Generate evaluation metrics.
# Predictions are computed once per split and shared by the three reports
# instead of calling ``clf.predict`` again for every metric.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print("Train - Accuracy :", metrics.accuracy_score(y_train, y_train_pred))
print("Train - Confusion matrix :",
      metrics.confusion_matrix(y_train, y_train_pred))
print("Train - classification report :",
      metrics.classification_report(y_train, y_train_pred))
print("Test - Accuracy :", metrics.accuracy_score(y_test, y_test_pred))
print("Test - Confusion matrix :",
      metrics.confusion_matrix(y_test, y_test_pred))
print("Test - classification report :",
      metrics.classification_report(y_test, y_test_pred))

# Plain DOT dump of the fitted tree.
tree.export_graphviz(clf, out_file='tree.dot')

# ``sklearn.externals.six`` is no longer shipped with scikit-learn; the
# standard-library text buffer is a drop-in replacement here.
from io import StringIO

import pydot

out_data = StringIO()
tree.export_graphviz(
    clf,
    out_file=out_data,
    feature_names=iris.feature_names,
    # Class labels are rendered as integer strings.
    class_names=clf.classes_.astype(int).astype(str),
    filled=True,
    rounded=True,
    special_characters=True,
    node_ids=True,  # documented as a bool; the original passed 1
)
# The result is indexed, i.e. treated as a sequence of graphs.
graph = pydot.graph_from_dot_data(out_data.getvalue())
graph[0].write_pdf("iris.pdf")  # save to pdf
def buildTree(self): k = 0 #Until now I considered as threshold for test reward: # 1- Without considering FER: TEST_REWARD = 76.0 # 2 - Without considering SER: TEST_REWARD = 76.0 # 4 - Without considering ES: TEST_REWARD = 76.0 TEST_REWARD = 76.0 # I want to know the average reward of all features (in this case three features) after 25 run avgRewAllFeatures = [] # I want to know the average error of all features (in this case three features) after 25 run total_error = np.array([]) # confidenceIntervalFeatures is a dictionary containing the Monte Carlo error considered for the self.num_trees_optimal policy trees for each run of the algorithm without considering the i-th feature (in this case the i-th key) confidenceIntervalFeatures = dict() for elem in self.features: confidenceIntervalFeatures[elem] = np.array([]) # Average reward after self.n_runs for trees without a feature averageRewardFeatures = dict() for elem in self.features: averageRewardFeatures[elem] = np.array([]) total_importance = [ 0.0, 0.0, 0.0 ] # it considers the importance of all runs of the algorithm while (k < self.num_run): print("RUN " + str(k)) # first tree based on the randomly generated buffer self.initializeBuffer() # Fit regression model self.current_tree.fit(self.X, self.y) # i is used to count the number of updates a tree has done, j is used to update the value of epsilon # lastRun is used to count how many trees you want to build considering the optimal policy (epsilon = 0) after reaching the stopping criteria for training i = j = flag = lastRun = 0 scores = [] slidingWindowReward = [] while (lastRun < self.num_trees_optimal_policy): print("Tree number: " + str(j)) total_reward = 0 epsilon = self.get_epsilon(j) current_state = self.env.reset() if (len(slidingWindowReward) == self.slidingWindow): if (sum(slidingWindowReward) / len(slidingWindowReward) > TEST_REWARD): total_importance += self.current_tree.feature_importances_ print("Total importance: " + str(total_importance)) lastRun += 
1 epsilon = 0 # in order to consider the optimal policy we set epsilon = 0 else: slidingWindowReward = [ ] # we clear the slidingWindow if the samples considered don't match our threshold (TEST_REWARD) while (i < self.update): if ("Speech Emotion Recognition" not in self.features): listx = list(current_state) listx.remove(current_state[1]) tuplex = tuple(listx) current_state = tuplex if ("Object State" not in self.features): listx = list(current_state) listx.remove(current_state[2]) tuplex = tuple(listx) current_state = tuplex if (("Speech Emotion Recognition" in self.features) and ("Object State" in self.features)): current_state = current_state[self.initialIndex:self. finalIndex] action = self.choose_action(current_state, epsilon) obs, reward, done, _ = self.env.step(action) temp = obs if ("Speech Emotion Recognition" not in self.features): listx = list(obs) listx.remove(obs[1]) tuplex = tuple(listx) obs = tuplex if ("Object State" not in self.features): listx = list(obs) listx.remove(obs[2]) tuplex = tuple(listx) obs = tuplex if (("Speech Emotion Recognition" in self.features) and ("Object State" in self.features)): obs = obs[self.initialIndex:self.finalIndex] total_reward += reward q_current = self.current_tree.predict([current_state]) q_new = self.current_tree.predict([obs]) q_current[0][action] = reward + self.gamma * np.max( q_new[0]) self.buffer.append( [current_state, action, obs, q_current[0]]) self.X.append(current_state) self.y.append(q_current[0]) if ( not lastRun == 0 ): # it means until we did not build self.num_trees_optimal_policy self.X_final.append(current_state) self.y_final.append(q_current[0]) current_state = temp i += 1 if done: current_state = self.env.reset() if (not flag): scores.append(total_reward) flag = 1 if (lastRun == 0): slidingWindowReward.append(self.test_reward()) i = 0 if (not flag): scores.append(total_reward) if (lastRun == 0): slidingWindowReward.append(total_reward) self.current_tree = DecisionTreeRegressor() 
self.current_tree.fit(self.X, self.y) j += 1 flag = 0 # Testing considering all the variables but training the model on samples generated by last self.num_trees_optimal_policy with epsilon = 0 print("Testing the tree considering all variables") self.current_tree = DecisionTreeRegressor() self.current_tree.fit(self.X_final, self.y_final) # I want to know the average reward after 25 run considering all variables avgRewAllFeatures.append(self.test_reward()) # Visualize data if (self.MAX_LEN > 2 and self.update > 2): dot_data = StringIO() export_graphviz(self.current_tree, out_file=dot_data, filled=True, rounded=True, special_characters=True, feature_names=self.features) graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) Image(graph.create_png()) graph.write_png("final_tree.png") # Now we build four trees (each one without considering one of the four features), we generate a test-set for each tree (so we generate some episodes) # From this episodes we compute the Monte-Carlo error that is (r + gamma * G_{t+1} - q^predicted(x,a))^{2} e we sum all these differences to compute the total error # Higher is this error, more important is the removed feature (so the feature we didn't consider) """for i in range(0, len(self.features)): print("Testing the final tree without considering " + str(self.features[i])) self.current_tree = DecisionTreeRegressor() X_feature = self.dataFilter(i) self.current_tree.fit(X_feature, self.y_final) r, e = self.getMonteCarloError(i) averageRewardFeatures[self.features[i]] = np.append(averageRewardFeatures[self.features[i]], r) confidenceIntervalFeatures[self.features[i]] = np.append(confidenceIntervalFeatures[self.features[i]], e)""" self.current_tree = DecisionTreeRegressor() self.current_tree.fit(self.X_final, self.y_final) r, e = self.getMonteCarloError(len(self.features)) total_error = np.append(total_error, e) k += 1 """"# This is the final average reward after 25 run considering all features
# Training features: every iris sample except the held-out rows.
train_data = np.delete(iris["data"], test_index, axis=0)
print(train_data)

# Held-out samples (targets and features at the same row indices).
test_target, test_data = (
    iris[field][test_index] for field in ("target", "data")
)
for shown in (test_target, test_data, iris.feature_names, iris.target_names):
    print(shown)

# Fit on the training rows, then predict the held-out rows.
clf = tree.DecisionTreeClassifier().fit(train_data, train_target)
print(clf.predict(test_data))

# Visualisation: render the fitted tree to DOT in memory, then to a PNG file.
dot_date = StringIO()
graphviz_options = dict(
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
tree.export_graphviz(clf, out_file=dot_date, **graphviz_options)
graph = pydotplus.graph_from_dot_data(dot_date.getvalue())
graph.write_png("iris.png")
# NOTE(review): this append presumably closes one iteration of a loop over
# CandidatesOfDTDepth whose header is outside this excerpt - confirm.
# RMSE between the observed Y and the cross-validated prediction.
RMSECvAll.append(
    math.sqrt(sum((Originaly - PredictedYcv) ** 2) / OriginalX.shape[0]))

# Plot cross-validated RMSE against the candidate tree depths.
plt.figure()
plt.plot(CandidatesOfDTDepth, RMSECvAll, 'k', linewidth=2)
plt.xlabel("Depth of tree for DT")
plt.ylabel("RMSE in CV for DT")
plt.show()

# Depth with the smallest cross-validated RMSE (first one on ties).
# ``np.argmin`` replaces the hand-rolled ``np.where(x == np.min(x))[0][0]``.
OptimalMaxDepthDT = CandidatesOfDTDepth[np.argmin(RMSECvAll)]

DTResult = tree.DecisionTreeRegressor(max_depth=OptimalMaxDepthDT,
                                      min_samples_leaf=MinSamplesLeafDT)
DTResult.fit(OriginalX, Originaly)
CalculatedYAll[:, 7] = DTResult.predict(OriginalX)

# Fix the NumPy seed around the cross-validated prediction, then re-seed
# without an argument afterwards.
np.random.seed(10000)
PredictedYcvAll[:, 7] = model_selection.cross_val_predict(
    DTResult, OriginalX, Originaly, cv=FoldNumber)
np.random.seed()

# Check rules of DT
datapdDT = pd.read_csv("data.csv", encoding='SHIFT-JIS', index_col=0)
with contextlib.closing(StringIO()) as DTfile:
    tree.export_graphviz(DTResult,
                         out_file=DTfile,
                         feature_names=datapdDT.columns[1:],
                         class_names=datapdDT.columns[0])
    output = DTfile.getvalue().splitlines()
# Insert a node font directive as the second line of the DOT text.
output.insert(1, 'node[fontname="meiryo"];')
with open('DTResult.dot', 'w') as f:
    f.write('\n'.join(output))

# Estimate Y for new samples based on DT in 1. and 2.
PredictedY1All[:, 7] = DTResult.predict(OriginalX_prediction1)
PredictedY2All[:, 7] = DTResult.predict(OriginalX_prediction2)

# 9. Random Forest (RF)
NumberOfTreesRF = 500  # 1. Number of decision trees
# Candidates of the ratio of the number of explanatory variables (X) for
# decision trees: 0.1, 0.2, ..., 0.9.
CandidatesOfXvariablesRateRF = np.arange(1, 10, dtype=float) / 10
# Run RFR for every candidate of X-ratio and estimate values of objective variable (Y) for Out Of Bag (OOB) samples
# Remaining Gini p-value report lines, each followed by a blank line.
print("")
for caption, p_value in (
        ("Data 1's Upper p value with Gini -> ", gini_upper_p_data1),
        ("Data 2's Lower p value with Gini -> ", gini_lower_p_data2),
        ("Data 2's Upper p value with Gini -> ", gini_upper_p_data2),
):
    print(caption, p_value)
    print("")

# Part i
# Feature names used to label the exported trees.
data_1_col_names = [
    "age", "job", "marital", "education",
    "balance", "housing", "duration", "poutcome",
]
data_2_col_names = ["job", "marital", "education", "housing"]

# One in-memory DOT buffer per (data set, criterion) pair; only the
# data-1 / entropy buffer is consumed in this excerpt.
dot_data1_entropy, dot_data2_entropy, dot_data1_gini, dot_data2_gini = (
    StringIO() for _ in range(4)
)

export_graphviz(
    entropy_data_1,
    out_file=dot_data1_entropy,
    filled=True,
    rounded=True,
    special_characters=True,
    feature_names=data_1_col_names,
    class_names=["0", "1"],
)
data_1_entropy_dot = dot_data1_entropy.getvalue()
data_1_entropy_graph = pydotplus.graph_from_dot_data(data_1_entropy_dot)
data_1_entropy_graph.write_png('data_1_entropy.png')
Image(data_1_entropy_graph.create_png())
def train():
    """Train the questionnaire decision tree and publish its artefacts.

    Steps, in order:

    1. Load the data via ``DecisionTreeQuestionnaire.get_csv_file_data()``,
       turn blank cells into the string ``'nan'`` and stringify every cell.
    2. One-hot encode the features through
       ``DecisionTreeQuestionnaire.encode_onehot`` and drop the encoded
       columns whose value part (text after the first ``=``) is ``'nan'``.
    3. Label-encode the target, hold out 10 % of the rows, fit an
       entropy-based ``DecisionTreeClassifier`` and print its accuracy.
    4. Append the first 56 feature names to
       ``DecisionTreeQuestionnaire.feature_names``, write the tree to
       ``decisiontree.png``, call ``tree_to_code`` / ``tree_to_code2`` and
       set ``DecisionTreeQuestionnaire.isModelTrained``.
    5. Print every key/value pair of ``DecisionTreeQuestionnaire.paths``.

    Returns:
        None.
    """
    balance_data_excel = DecisionTreeQuestionnaire.get_csv_file_data()

    # Clean the data: empty / whitespace-only cells become the text 'nan',
    # then every cell is converted to ``str``.
    nan_text = str(np.nan)
    balance_data_excel = balance_data_excel.replace(
        r'^\s*$', nan_text, regex=True).replace('', nan_text)
    balance_data_excel = balance_data_excel.applymap(str)
    print("Dataset Length:: ", len(balance_data_excel))
    print("Dataset Shape:: ", balance_data_excel.shape)

    X = balance_data_excel.iloc[:, :-1]
    # NOTE(review): the target is addressed by the fixed position 22 while
    # the features are "everything but the last column"; presumably the
    # frame has exactly 23 columns - confirm against the CSV.
    y = balance_data_excel.iloc[:, 22]

    # ``Index.get_values()`` no longer exists in pandas; ``tolist()`` on the
    # index yields the same list of column labels.
    X = DecisionTreeQuestionnaire.encode_onehot(X, X.columns.tolist())

    le_y = LabelEncoder()
    y = le_y.fit_transform(y)

    # Collect the encoded columns whose value part is 'nan' and drop them in
    # one call. A column name without '=' is kept (the original raised
    # IndexError on it).
    nan_columns = []
    for column in X.columns:
        parts = column.split('=')
        if len(parts) > 1 and parts[1] == 'nan':
            nan_columns.append(column)
    if nan_columns:
        X.drop(nan_columns, axis=1, inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=100)
    clf_entropy = DecisionTreeClassifier(criterion="entropy",
                                         random_state=100,
                                         max_depth=100,
                                         min_samples_leaf=5,
                                         min_samples_split=8)
    abc = clf_entropy.fit(X_train, y_train)
    y_pred = clf_entropy.predict(X_test)
    print("Accuracy is ", accuracy_score(y_test, y_pred) * 100)

    # Convert target classes to {encoded index: original label}.
    print("le2")
    class_names = dict(enumerate(le_y.classes_))
    print(class_names)

    # Only the first 56 encoded columns are published as feature names.
    # NOTE(review): 56 is hard-coded; presumably the encoded column count
    # for the expected questionnaire - confirm.
    feature_columns = list(X.columns[0:56])
    # NOTE(review): this class-level list grows on every call to train().
    DecisionTreeQuestionnaire.feature_names.extend(feature_columns)

    dot_data = StringIO()
    # Plain sequences instead of the original int-keyed dicts: the lookup by
    # index is the same, and sequences are what export_graphviz documents.
    export_graphviz(clf_entropy,
                    out_file=dot_data,
                    filled=True,
                    rounded=True,
                    special_characters=True,
                    feature_names=feature_columns,
                    class_names=[class_names[idx]
                                 for idx in range(len(class_names))])
    # Current pydot returns a list of graphs; accept a bare graph as well.
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_png('decisiontree.png')

    DecisionTreeQuestionnaire.tree_to_code(abc)
    DecisionTreeQuestionnaire.tree_to_code2(abc, class_names)
    DecisionTreeQuestionnaire.isModelTrained = True

    for path in DecisionTreeQuestionnaire.paths:
        for k, v in path.items():
            print(k + ' : ' + v)
        print("-----------------------------------")
    # NOTE(review): the nesting of this final message could not be recovered
    # from the flattened source; it is kept at function level - confirm.
    print('Your inputs are not defined')
def decision_tree_training(
        self,
        output_dir='/home/hts/posture_classification_based_pose/decision_tree'):
    """Fit a Gini decision tree on ``self.feature`` / ``self.label``.

    The data is split 75 % / 25 % into train and test sets. The splits, the
    true and predicted test labels and the fitted model are written under
    ``output_dir``; the tree is rendered to ``decision_tree.pdf`` in the
    current working directory; a classification report is printed.

    Side effects on ``self``: sets ``self.target_names`` and
    ``self.feature_names``.

    Args:
        output_dir: Directory receiving the ``*.txt`` dumps and the model
            file. Defaults to the path that was previously hard-coded.

    Returns:
        None.
    """
    import os

    self.target_names = ['lying', 'lie on the side', 'sitting', 'standing']
    # Feature labels '1' .. '33'.
    self.feature_names = [str(number) for number in range(1, 34)]

    print('---start training decision tree---')

    # Hold out 25 % of the samples for testing.
    X_train, X_test, Y_train, Y_test = train_test_split(self.feature,
                                                        self.label,
                                                        test_size=0.25,
                                                        random_state=0)
    for file_name, values, fmt in (
            ('X_train.txt', X_train, '%f'),
            ('X_test.txt', X_test, '%f'),
            ('Y_train.txt', Y_train, '%d'),
            ('Y_test.txt', Y_test, '%d'),
    ):
        np.savetxt(os.path.join(output_dir, file_name), values, fmt=fmt)
    print('---split data done!---')
    print()

    # Uses the CART algorithm by default.
    clf = DecisionTreeClassifier(criterion='gini', random_state=0)
    print(np.shape(X_train), np.shape(Y_train.ravel()))
    clf.fit(X_train, Y_train.ravel())

    # Visualisation.
    dot_data = StringIO()
    tree.export_graphviz(clf,
                         out_file=dot_data,
                         feature_names=self.feature_names,
                         class_names=self.target_names,
                         filled=True,
                         rounded=True,
                         impurity=False)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("decision_tree.pdf")

    # Evaluate on the test set.
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    Y_true, Y_pred = Y_test.ravel(), clf.predict(X_test)
    np.savetxt(os.path.join(output_dir, 'Y_true.txt'), Y_true, fmt='%d')
    np.savetxt(os.path.join(output_dir, 'Y_pred.txt'), Y_pred, fmt='%d')
    print(
        classification_report(Y_true, Y_pred,
                              target_names=self.target_names))
    print()

    print('Decision Tree model saving ......')
    model_save_path = os.path.join(output_dir, 'train_decision_tree_model.m')
    joblib.dump(clf, model_save_path)
    return
kyphosis_train = kyphosis[kyphosis.is_train] kyphosis_test = kyphosis[kyphosis.is_train == False] # Train model kyphosis_features = kyphosis.columns[1:] kyphosis_dt_clf = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1) kyphosis_dt_clf = kyphosis_dt_clf.fit(kyphosis_train[kyphosis_features], kyphosis_train['Kyphosis']) # Print a string representation of the tree. # If you have graphviz (www.graphviz.org) installed, you can write a pdf # visualization using graph.write_pdf(filename) kyphosis_dt_data = StringIO() tree.export_graphviz(kyphosis_dt_clf, out_file=kyphosis_dt_data) kyphosis_dt_graph = pydotplus.parser.parse_dot_data( kyphosis_dt_data.getvalue()) print(kyphosis_dt_graph.to_string()) # Predict classes of test set and evaluate kyphosis_dt_pred = kyphosis_dt_clf.predict(kyphosis_test[kyphosis_features]) kyphosis_dt_cm = metrics.confusion_matrix(kyphosis_test['Kyphosis'], kyphosis_dt_pred, labels=['absent', 'present']) print(kyphosis_dt_cm) kyphosis_dt_acc = metrics.accuracy_score(kyphosis_test['Kyphosis'], kyphosis_dt_pred) kyphosis_dt_prec = metrics.precision_score(kyphosis_test['Kyphosis'],
# ``sklearn.externals.six`` is no longer shipped with scikit-learn; the
# standard-library ``io.StringIO`` provides the same in-memory text buffer.
from io import StringIO

import pydotplus
from sklearn import tree

iris = load_iris()

# Hold out one sample at each of these row positions for a sanity check.
test_idx = [0, 50, 100]

train_target = np.delete(iris.target, test_idx)
train_data = np.delete(iris.data, test_idx, axis=0)

test_target = iris.target[test_idx]
test_data = iris.data[test_idx]

clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)

# Expected labels followed by the model's predictions for the held-out rows.
print(test_target)
print(clf.predict(test_data))

# Render the fitted tree to DOT in memory, then to a PDF file.
dotdata = StringIO()
tree.export_graphviz(
    clf,
    out_file=dotdata,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    impurity=False,
)
graph = pydotplus.graph_from_dot_data(dotdata.getvalue())
graph.write_pdf("target.pdf")
def customer_segment(customerslist, productname):
    """Fit and render a decision tree of which customers bought a product.

    For every customer id in ``customerslist`` the function queries the graph
    database through the module-level ``session``, assembles one feature row
    (age, price sensitivity, top category, total spent, number of categories
    bought), fits a ``DecisionTreeClassifier`` against a 0/1 "bought
    ``productname``" target and writes the tree to
    ``<base_dir>/search/static/stuff/segment.png``.

    Args:
        customerslist: Customer ids; each is concatenated into Cypher text
            and into keys of ``globals()``, so they must be strings.
        productname: Product description matched in the "bought" query.

    Returns:
        None.

    NOTE(review): the block structure below was reconstructed from a
    whitespace-flattened source; the nesting of the ``else`` in the
    per-category loop and of ``df.append`` should be confirmed.
    NOTE(review): ``cat``, ``starts``, ``ends``, ``bucketsizes``, ``target``,
    ``category``, ``totalspent``, ``countryofcustomer``, ``pricesensi``,
    ``most_buyed_item_cost``, ``actualcategoryspresent`` and ``ages`` are not
    defined in this function. They are only appended to / indexed from 0
    here, so presumably the caller resets them between calls - confirm.
    NOTE(review): the Cypher text is built by string concatenation from the
    arguments; if those can come from users, switch to query parameters.
    """
    # Per-customer vectors are stored as dynamically named module globals.
    g = globals()
    xx='match(c:customerid)-[r:Bought_this]->(s:stockcode) ' \
       'set r.Actual_Price=toFloat(r.Actual_Price)' \
       'return s.Category,collect(DISTINCT r.Actual_Price) as pricelist ' \
       'order by s.Category, pricelist'
    pricebucket = session.run(xx)
    # One price range per category: [min, max] of the distinct prices, or
    # price +/- 1 when the category has a single price; a bucket is one
    # tenth of that range.
    for i in pricebucket:
        actualcategoryspresent.append(i['s.Category'])
        if len(i['pricelist']) == 1:
            start = (i['pricelist'][0]) - 1
            end = (i['pricelist'][0]) + 1
        else:
            start = min(i['pricelist'])
            end = max(i['pricelist'])
        bucketsize = (end - start) / 10
        starts.append(start)
        ends.append(end)
        bucketsizes.append(bucketsize)
        cat.append(i['s.Category'])
    # Target: 1 when the customer has a Bought_this relationship to the
    # product (Quantity not null), otherwise 0.
    for i in customerslist:
        x = 'optional MATCH(c:customerid{CustomerID: "' + i + '"})-[r:Bought_this]->(s:stockcode{Description: "' + productname + '"}) ' \
            'return distinct ' \
            'case ' \
            'when r.Quantity IS NULL THEN 0 ' \
            'when r.Quantity IS NOT NULL THEN 1 ' \
            'else r.Quantity END AS Quantity '
        for ii in session.run(x):
            target.append(ii[0])
    # Country of each customer (first matching row only).
    for i in customerslist:
        x = 'match(c1:customerid{CustomerID: "' + i + '"})-[r1:Bought_this]->(s1:stockcode) return c1.Country limit 1'
        result = session.run(x)
        for ii in result:
            countryofcustomer.append(ii[0])
    for i in customerslist:
        # Zero-initialised vectors: one slot per category, and four slots
        # for the price-sensitivity bucket.
        g[i + ' vector'] = [0] * len(cat)
        g[i + ' category-wise purchase vector'] = [0] * len(cat)
        g[i + ' bucket_vector'] = [0] * 4
        x='match(c1:customerid{CustomerID: "'+i+'"})-[r1:Bought_this]->(s1:stockcode) ' \
          'set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price)' \
          'return c1.CustomerID as CustomerID, s1.Category as Category,reduce(sum=0, i in collect(r1.Quantity * r1.Price) | sum + i) as totalspent , reduce(sum=0, i in collect(r1.Quantity) | sum + i) as Quantity ' \
          'order by totalspent desc ' \

        result = session.run(x)
        count = 0
        # NOTE: the loop variable below rebinds ``i`` (the customer id) to a
        # result record; the id is read back from the record itself.
        for i in result:
            count = count + 1
            if count == 1:
                # Rows are ordered by spend, so the first row is the
                # customer's top-spend category.
                totalspent.append(i['totalspent'])
                category.append(i['Category'])
                if i['Category'] in cat:
                    vv = cat.index(i['Category'])
                    g[i['CustomerID'] + ' category-wise purchase vector'][vv] = i['Quantity']
            else:
                if i['Category'] in cat:
                    vv = cat.index(i['Category'])
                    g[i['CustomerID'] + ' category-wise purchase vector'][vv] = i['Quantity']
    # Most frequent price each customer paid inside their top category
    # (as returned by the helper ``Most_Common``).
    for i in range(0, len(customerslist)):
        x='match(c1:customerid{CustomerID:"'+customerslist[i]+'"})-[r1:Bought_this]->(s1:stockcode{Category:"'+category[i]+'"}) ' \
          'set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price) ' \
          'return c1.CustomerID as CustomerID,collect(r1.Actual_Price) as prices_in_category'
        result = session.run(x)
        for yy in result:
            most_buyed_item_cost.append(float(Most_Common(yy[1])))
    # Binary "has bought from this category" vector per customer.
    for i in customerslist:
        dd = 'match(c1:customerid{CustomerID: "' + i + '"})-[r1:Bought_this]->(s1:stockcode) set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price) return c1.CustomerID as CustomerID, collect( distinct s1.Category) as Categoryss'
        result = session.run(dd)
        for uu in result:
            for hh in uu['Categoryss']:
                if hh in cat:
                    vv = cat.index(hh)
                    g[i + ' vector'][vv] = 1
    # Price-sensitivity label from where the most-bought price falls within
    # the top category's price range (computed by ``drop_in_bucket``).
    for t in range(0, len(customerslist)):
        if category[t] in cat:
            vv = cat.index(category[t])
            l = drop_in_bucket(starts[vv], ends[vv], bucketsizes[vv], most_buyed_item_cost[t])
            pricesensi.append(l)
    # Presumably fills the module-level ``ages`` list read below - confirm.
    ageassign(pricesensi)
    df = pd.DataFrame(columns=[
        'age', 'p_s', 'category', 'totalspent', 'total_cat_bought', 'country'
    ])
    for t in range(0, len(customerslist)):
        if category[t] in cat:
            vv = cat.index(category[t])
            # One-hot encode the sensitivity label into the bucket vector.
            if pricesensi[t] == 'High':
                g[customerslist[t] + ' bucket_vector'][0] = 1
            if pricesensi[t] == 'Medium High':
                g[customerslist[t] + ' bucket_vector'][1] = 1
            if pricesensi[t] == 'Medium Low':
                g[customerslist[t] + ' bucket_vector'][2] = 1
            if pricesensi[t] == 'Low':
                g[customerslist[t] + ' bucket_vector'][3] = 1
            # NOTE(review): ``DataFrame.append`` is not available in
            # pandas >= 2.0.
            df = df.append(
                {
                    'age': ages[t],
                    'p_s': pricesensi[t],
                    'category': category[t],
                    'totalspent': totalspent[t],
                    'total_cat_bought': int(
                        sum(g[customerslist[t] + ' vector'])),
                    'country': countryofcustomer[t]
                },
                ignore_index=True)
    # Country is collected but excluded from the model; price sensitivity
    # and category are one-hot encoded.
    df = df.drop('country', axis=1)
    hot_ps = pd.get_dummies(df.p_s)
    df = df.join(hot_ps)
    df = df.drop('p_s', axis=1)
    hot_category = pd.get_dummies(df.category)
    df = df.join(hot_category)
    df = df.drop('category', axis=1)
    data = df.values
    train_target = target
    train_data = data
    x = tree.DecisionTreeClassifier()
    print("##################____________________productname", productname)
    print(train_data)
    print(train_target)
    x.fit(train_data, train_target)
    dot_data = StringIO()
    tree.export_graphviz(x,
                         out_file=dot_data,
                         feature_names=df.columns.tolist(),
                         class_names=['No', 'Yes'],
                         filled=True,
                         rounded=True,
                         impurity=False)
    # NOTE(review): the buffer has just been written to, so iteration starts
    # at its end; this loop presumably prints nothing - confirm intent.
    for i in dot_data:
        print(i)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(
        os.path.join(base_dir, 'search/static/stuff/segment.png'))
    print(productname, "productname727")
from IPython.display import Image import os import sys def conda_fix(graph): path = os.path.join(sys.base_exec_prefix, "Library", "bin", "graphviz") paths = ("dot", "twopi", "neato", "circo", "fdp") paths = {p: os.path.join(path, "{}.exe".format(p)) for p in paths} graph.set_graphviz_executables(paths) from sklearn import tree buffer = StringIO() tree.export_graphviz(dt, out_file=buffer, feature_names=X.columns, class_names=X.columns, filled=True, rounded=True, special_characters=True) graph = pydotplus.graph_from_dot_data(buffer.getvalue()) conda_fix(graph) graph.write_pdf("loan_tree.pdf") Image(graph.create_png()) #ada-booster #boosting #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html from sklearn.ensemble import AdaBoostClassifier ada = AdaBoostClassifier(base_estimator=dt,
def product_segment(productslist, customerid):
    """Fit and render a decision tree of which products a customer bought.

    For every stock code in ``productslist`` the function queries the graph
    database through the module-level ``session`` for the product's
    description, price and category, labels it 1/0 depending on whether
    ``customerid`` bought it, one-hot encodes category and price bucket,
    fits a ``DecisionTreeClassifier`` and writes the tree to
    ``D:\\project\\search\\static\\stuff/segment.png``.

    Args:
        productslist: Stock codes (strings; concatenated into Cypher text).
        customerid: Customer id (string; concatenated into Cypher text).

    Returns:
        None.

    NOTE(review): the block structure was reconstructed from a
    whitespace-flattened source. ``# if categoryofproduct[i] in cat:`` is
    read as commented-out code, so every product contributes a row - confirm.
    NOTE(review): ``starts``, ``ends``, ``bucketsizes`` and
    ``actualcategoryspresent`` are module-level lists that are only appended
    to here, while ``cat`` is local; presumably the caller resets them
    between calls - confirm.
    NOTE(review): the Cypher text is built by string concatenation from the
    arguments; if those can come from users, switch to query parameters.
    """
    descriptions = []
    mrp = []
    categoryofproduct = []
    yes_no = []
    cat = []

    xx = 'match(c:customerid)-[r:Bought_this]->(s:stockcode) ' \
         'set r.Actual_Price=toFloat(r.Actual_Price)' \
         'return s.Category,collect(DISTINCT r.Actual_Price) as pricelist ' \
         'order by s.Category, pricelist'
    pricebucket = session.run(xx)
    # One price range per category: [min, max] of the distinct prices, or
    # price +/- 1 when the category has a single price; a bucket is one
    # tenth of that range.
    for record in pricebucket:
        prices = record['pricelist']
        actualcategoryspresent.append(record['s.Category'])
        if len(prices) == 1:
            start = prices[0] - 1
            end = prices[0] + 1
        else:
            start = min(prices)
            end = max(prices)
        bucketsize = (end - start) / 10
        starts.append(start)
        ends.append(end)
        bucketsizes.append(bucketsize)
        cat.append(record['s.Category'])

    # Description, price and category of every requested product. The
    # result loop uses its own variable; the original reused ``i`` and
    # shadowed the stock code.
    for stock_code in productslist:
        x = 'match() - [r:Bought_this]->(s:stockcode{StockCode:"' + stock_code + '"}) ' \
            'return distinct s.Description as description, s.Category as category, r.Actual_Price as MRP ' \
            'limit 1'
        result = session.run(x)
        for row in result:
            descriptions.append(row['description'])
            mrp.append(row['MRP'])
            categoryofproduct.append(row['category'])

    # Target: 1 when the customer has a Bought_this relationship to the
    # product (Quantity not null), otherwise 0.
    for stock_code in productslist:
        x = 'optional MATCH(c:customerid{CustomerID: "' + customerid + '"})-[r:Bought_this]->(s:stockcode{StockCode: "' + stock_code + '"}) ' \
            'return distinct ' \
            'case ' \
            'when r.Quantity IS NULL THEN 0 ' \
            'when r.Quantity IS NOT NULL THEN 1 ' \
            'else r.Quantity END AS Quantity '
        for ii in session.run(x):
            yes_no.append(ii[0])

    # Build all rows first and create the frame once: ``DataFrame.append``
    # was removed in pandas 2.0 and re-copies the frame on every call.
    rows = []
    for idx in range(len(productslist)):
        vv = cat.index(categoryofproduct[idx])
        z = price_bucket_of_product(starts[vv], ends[vv], bucketsizes[vv],
                                    mrp[idx])
        rows.append({
            'Description': descriptions[idx],
            'MRP': mrp[idx],
            'category': categoryofproduct[idx],
            'PriceBucket': z,
        })
    df1 = pd.DataFrame(
        rows, columns=['Description', 'MRP', 'category', 'PriceBucket'])

    # The description is not a model feature; category and price bucket are
    # one-hot encoded.
    df1 = df1.drop('Description', axis=1)
    hot_category = pd.get_dummies(df1.category)
    df1 = df1.join(hot_category)
    df1 = df1.drop('category', axis=1)
    hot_PriceBucket = pd.get_dummies(df1.PriceBucket)
    df1 = df1.join(hot_PriceBucket)
    df1 = df1.drop('PriceBucket', axis=1)

    train_data = df1.values
    train_target = yes_no

    x = tree.DecisionTreeClassifier()
    x.fit(train_data, train_target)

    dot_data = StringIO()
    tree.export_graphviz(x,
                         out_file=dot_data,
                         feature_names=df1.columns.tolist(),
                         class_names=['No', 'Yes'],
                         filled=True,
                         rounded=True,
                         impurity=False)
    # The original iterated ``dot_data`` here to print it, but the buffer
    # position is at the end after the export, so nothing was ever printed;
    # the no-op loop is dropped.
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    # Raw string: same path value as before, without the invalid ``\p`` /
    # ``\s`` escape sequences.
    graph.write_png(r'D:\project\search\static\stuff/segment.png')
# NOTE(review): ``depth``, ``split`` and ``leaf`` presumably come from
# enclosing hyper-parameter loops that are outside this excerpt - confirm.
clf_tree = DecisionTreeClassifier(max_depth=depth,
                                  min_samples_split=split,
                                  min_samples_leaf=leaf)

# Per-fold accuracies for this parameter combination. The original
# initialised ``temp`` but appended to ``tmp``, and averaged only the last
# fold's score.
temp = []
for tr, te in sk_fold.split(X, y):
    clf_tree.fit(X[tr], y[tr])
    acc = clf_tree.score(X[te], y[te])
    temp.append(acc)
avg_acc = np.mean(temp)

# DT modeling (max_depth=10, min_samples_split=50, min_samples_leaf=50)
clf_tree = DecisionTreeClassifier(max_depth=10,
                                  min_samples_split=50,
                                  min_samples_leaf=50)
clf_tree.fit(X_tr, Y_tr)
DT_pred = clf_tree.predict(X_te)
# Bare expressions: their values are only shown when run interactively.
confusion_matrix(Y_te, DT_pred)
clf_tree.score(X_te, Y_te)
print(classification_report(Y_te, DT_pred))

# DT plot
dot_file = StringIO()
# Two exports into the same buffer: a plain tree followed by a filled one.
# ``graph`` is indexed below, i.e. the parser is expected to hand back one
# graph per export; index -1 and index 1 both select the filled tree.
tree.export_graphviz(clf_tree, out_file=dot_file)
tree.export_graphviz(clf_tree, out_file=dot_file, filled=True)
dot_file.getvalue()
graph = pydotplus.graph_from_dot_data(dot_file.getvalue())
graph[-1].write_pdf(r'C:\Users\User\Desktop\moviedata\tree.pdf')
Image(graph[1].create_png())
def _decision_tree_regression_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='mse',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Fit a ``DecisionTreeRegressor`` and build its model dict and report.

    Args:
        table: Data indexed as ``table[feature_cols]`` / ``table[label_col]``.
        feature_cols: Names of the feature columns.
        label_col: Name of the target column.
        criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
        min_weight_fraction_leaf, max_features, random_state, max_leaf_nodes,
        min_impurity_decrease, min_impurity_split, presort: Forwarded to the
            ``DecisionTreeRegressor`` constructor under the same names.
        sample_weight, check_input, X_idx_sorted: Forwarded to
            ``DecisionTreeRegressor.fit`` under the same names.

    Returns:
        ``{'model': model}`` where ``model`` holds the fitted regressor, its
        parameters, feature importances, the underlying tree and the
        rendered report under ``'_repr_brtc_'``.
    """
    # Keyword arguments make the call independent of the constructor's and
    # fit()'s positional order (the original relied on it).
    regressor = DecisionTreeRegressor(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        presort=presort)
    regressor.fit(table[feature_cols],
                  table[label_col],
                  sample_weight=sample_weight,
                  check_input=check_input,
                  X_idx_sorted=X_idx_sorted)

    # Tree picture: best effort. Any failure (missing Graphviz binary,
    # missing pydotplus, ...) falls back to an explanatory message.
    try:
        # io.StringIO replaces ``sklearn.externals.six.StringIO``, which no
        # longer exists in scikit-learn and would otherwise trigger the
        # fallback for the wrong reason.
        from io import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(regressor,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:  # not a bare except: let KeyboardInterrupt/SystemExit through
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report: horizontal bar chart of importances, ascending, with the
    # value printed next to each bar.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def draw_tree(model, name):
    """Render ``model`` as a Graphviz tree and save it as ``<name>.pdf``.

    Args:
        model: Fitted tree estimator accepted by ``export_graphviz``.
        name: Output path without the ``.pdf`` suffix.
    """
    pdf_path = name + ".pdf"
    # Export the DOT description into memory rather than a temporary file.
    buffer = StringIO()
    _tree.export_graphviz(model, out_file=buffer)
    dot_source = buffer.getvalue()
    pydotplus.graph_from_dot_data(dot_source).write_pdf(pdf_path)
def train_holdout(base_dir, classifier_name, classifier):
    """Train ``classifier`` on a stratified hold-out split and report the results.

    Reads ``{base_dir}/features.csv`` (first column: image folder, remaining
    columns: features) and ``{base_dir}/Y.csv`` (class labels), trains the
    classifier, prints the misclassified test samples and the test accuracy,
    shows a confusion-matrix heatmap and, for a ``DecisionTreeClassifier``,
    writes the tree to ``{classifier_name}.pdf``.

    Args:
        base_dir: Directory containing ``features.csv`` and ``Y.csv``.
        classifier_name: Label used in printed messages, plot titles and the
            exported PDF file name.
        classifier: Estimator to train. When it exposes ``partial_fit`` it is
            trained incrementally with early stopping, otherwise with ``fit``.

    Returns:
        Tuple ``(classifier, X_test, y_test)``.
    """
    base = pd.read_csv(f'{base_dir}/features.csv', sep=';', header=None)
    # fetch folder from first index of base
    images_dirs = base.iloc[:, 0]
    # Separate X to a new DataFrame
    X = base.iloc[:, 1:]
    # Load classes names
    y = pd.read_csv(f'{base_dir}/Y.csv', sep=';', header=None)

    # HOLDOUT
    # separate the base in 70% to train and 30% to test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, random_state=42, stratify=y)
    all_classes = np.unique(y.to_numpy())

    def overfitting_prevent_train(X_train, y_train):
        """Train with ``partial_fit`` chunk by chunk, stopping early on a plateau."""
        # separate the train base in 80% to train and 20% to validation
        X_train, X_validation, y_train, y_validation = train_test_split(
            X_train, y_train, test_size=.2, random_state=42, stratify=y_train)
        chunk_size = 10
        graphic_data = []
        current_epoch = 0
        alpha = 0.8  # weight of the previous smoothed value
        smooth_error_training = []
        smooth_error_validation = []
        overfitting_count = 0

        for chunk in chunk_generator(pd.concat([X_train, y_train], axis=1),
                                     chunk_size=chunk_size):
            classifier.partial_fit(chunk[0], np.ravel(chunk[1]), classes=all_classes)
            current_error_on_training = 1 - classifier.score(X_train, np.ravel(y_train))
            current_error_on_validation = 1 - classifier.score(X_validation,
                                                               np.ravel(y_validation))
            graphic_data.append(
                GraphicData(x=current_epoch,
                            error_on_val=current_error_on_validation,
                            error_on_train=current_error_on_training)
            )

            if current_epoch == 0:
                smooth_error_training.append(current_error_on_training)
                smooth_error_validation.append(current_error_on_validation)
            else:
                # Exponential smoothing; note it blends in the raw error of the
                # PREVIOUS epoch, exactly as the original code did.
                previous = graphic_data[current_epoch - 1]
                smooth_error_training.append(
                    alpha * smooth_error_training[current_epoch - 1]
                    + (1 - alpha) * previous.error_on_train)
                smooth_error_validation.append(
                    alpha * smooth_error_validation[current_epoch - 1]
                    + (1 - alpha) * previous.error_on_val)

            if current_epoch >= 1:
                # Count consecutive epochs whose smoothed validation error
                # improved by less than 1e-3; stop after five in a row.
                if smooth_error_validation[-2] - smooth_error_validation[-1] < 1e-03:
                    overfitting_count += 1
                    if overfitting_count >= 5:
                        print(f'Overfitting Detected on {classifier_name}, epoch: {current_epoch}')
                        break
                else:
                    overfitting_count = 0
            current_epoch += 1

        print(f'Acurácia {classifier_name} Validação: {classifier.score(X_validation, np.ravel(y_validation))}')

        epochs = [data.x for data in graphic_data]
        plt.title(f'Error Compare --> {classifier_name} --> {base_dir}')
        plt.plot(epochs, smooth_error_training, label='Error on Train Smooth')
        plt.plot(epochs, smooth_error_validation, label='Error on Validation Smooth')
        plt.ylabel('Error')
        plt.xlabel('Epoch')
        plt.legend()
        plt.show()

    def normal_fit():
        """Train in one shot for classifiers without ``partial_fit``."""
        classifier.fit(X_train, y_train)

    def visualizate_data():
        """Show a 2-D t-SNE projection of the features coloured by class."""
        x_tsne = TSNE(n_components=2).fit_transform(X)
        y_aux = pd.DataFrame()
        y_aux['classes'] = y[0]
        tsne_df = pd.DataFrame()
        tsne_df['tsne-x'] = x_tsne[:, 0]
        tsne_df['tsne-y'] = x_tsne[:, 1]
        tsne_df = pd.concat([tsne_df, y_aux], axis=1)
        plt.figure(figsize=(16, 10))
        # x/y are passed by keyword: newer seaborn rejects them positionally.
        sns.scatterplot(x='tsne-x', y='tsne-y', hue="classes", legend='full',
                        palette=sns.color_palette("hls", 10), alpha=0.3,
                        data=tsne_df)
        plt.show()

    if hasattr(classifier, 'partial_fit'):
        overfitting_prevent_train(X_train, y_train)
    else:
        normal_fit()

    predicated_rows = classifier.predict(X_test)
    predicated_proba = classifier.predict_proba(X_test)

    # ``position`` is the row of ``predicated_proba`` matching the current
    # test sample; ``row_label`` is the sample's index label in the base.
    for position, (predicated, (row_label, row)) in enumerate(
            zip(predicated_rows, y_test.iterrows())):
        if row[0] != predicated:
            img_dir = images_dirs[row_label]
            percent = predicated_proba[position][np.where(all_classes == predicated)]
            print(f'Confundiu {img_dir} com {predicated}, proba: {percent}')

    print(f'Acurácia {classifier_name} Teste : {classifier.score(X_test, y_test)}')

    cm = confusion_matrix(y_test, predicated_rows)
    df_cm = pd.DataFrame(cm, index=all_classes, columns=all_classes)
    sns.heatmap(df_cm, annot=True)
    plt.title(f'Confusion Matrix --> {classifier_name} --> {base_dir}')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    # Optional: call visualizate_data() here to inspect the t-SNE projection.

    if isinstance(classifier, DecisionTreeClassifier):
        # ``io.StringIO`` replaces ``sklearn.externals.six.StringIO``, which is
        # no longer shipped with newer scikit-learn releases.
        from io import StringIO
        from sklearn.tree import export_graphviz
        import pydot

        dot_data = StringIO()
        export_graphviz(classifier, out_file=dot_data, rounded=True, filled=True)
        graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
        graph.write_pdf(f'{classifier_name}.pdf')

    return classifier, X_test, y_test
def wlasne_drzewo():
    """Export the module-level ``clf`` tree to Graphviz and save it as ``my-tree.pdf``."""
    dot_buffer = StringIO()
    tree.export_graphviz(clf, out_file=dot_buffer)
    dot_source = dot_buffer.getvalue()
    # Build the graph from the DOT text and render it straight to the PDF file.
    pydotplus.graph_from_dot_data(dot_source).write_pdf("my-tree.pdf")
# Fit the tree and report train / test accuracy.
tree.fit(x_train, y_train)
print(tree.score(x_train, y_train))
print(tree.score(x_test, y_test))

# Export the fitted tree as a DOT file on disk.
from sklearn.tree import export_graphviz
export_graphviz(tree, out_file='cancertree.dot', class_names=['m', 'b'],
                feature_names=cancer.feature_names, filled=True)

# Render the same tree to a PNG through an in-memory DOT buffer.
import pydot
import graphviz
# ``io.StringIO`` replaces ``sklearn.externals.six.StringIO``, which is no
# longer shipped with newer scikit-learn releases.
from io import StringIO
dotfile = StringIO()
export_graphviz(tree, out_file=dotfile, class_names=['m', 'b'],
                feature_names=cancer.feature_names, filled=True)
# Newer pydot returns a list of graphs, older releases a single graph.
graphs = pydot.graph_from_dot_data(dotfile.getvalue())
graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
graph.write_png("dtree2.png")

# Display the PNG that was just written (same file name as above).
import matplotlib.image as mpimg
img = mpimg.imread('dtree2.png')
plt.imshow(img)
plt.show()

from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
# ``io.StringIO`` replaces ``sklearn.externals.six.StringIO``, which is no
# longer shipped with newer scikit-learn releases.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Load the dataset; the file is read without a header row and the column
# names are supplied explicitly.
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
pima = pd.read_csv("C:/Users/Chaitali/Desktop/SIMLP 2019/diabetes.csv", header=None, names=col_names)
print(pima.head())

# Split the columns into features and target.
feature_cols = ['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp', 'pedigree']
X = pima[feature_cols]
y = pima.label

# 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train Decision Tree Classifier
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Export the fitted tree to DOT text, then render it as a PNG.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data, filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('diabetes.png')
Image(graph.create_png())
# Target vector taken from the 'severity' column.
array_severity = df['severity'].values

# Standardise the feature matrix with a scaler fitted on the same data.
scaler = preprocessing.StandardScaler()
scaler.fit(array_asmd)
rescaled_asmd = scaler.transform(array_asmd)

# Hold out 25% of the samples for testing (fixed seed for the split).
(rescaled_asmd_train,
 rescaled_asmd_test,
 severity_train,
 severity_test) = train_test_split(rescaled_asmd,
                                   array_severity,
                                   test_size=0.25,
                                   random_state=1)

# Decision tree with default settings, fitted on the training split.
clf_gini = DecisionTreeClassifier()
clf_gini.fit(rescaled_asmd_train, severity_train)

# Render the fitted tree through an in-memory DOT buffer.
out = StringIO()
tree.export_graphviz(
    clf_gini,
    out_file=out,
    feature_names=['age', 'shape', 'margin', 'density'],
)
graph = graph_from_dot_data(out.getvalue())
Image(graph.create_png())

# 10-fold cross-validation of the decision tree on the full rescaled data
# (the original notes record a mean score of 0.73).
kcv_score = cross_val_score(clf_gini, rescaled_asmd, array_severity, cv=10)

# Random forest with default settings, evaluated the same way
# (the original notes record a mean score of 0.769).
clf_rf = RandomForestClassifier()
clf_rf.fit(rescaled_asmd_train, severity_train)
rf_score = cross_val_score(clf_rf, rescaled_asmd, array_severity, cv=10)