Exemple #1
0
def decision_tree(train_features, train_labels, test_features, test_labels, feature_names):
    """Fit a decision-tree regressor, print its errors and render it to ``tree.pdf``.

    Args:
        train_features: Feature matrix used to fit the regressor.
        train_labels: Targets matching ``train_features``.
        test_features: Feature matrix of the evaluation set.
        test_labels: Targets matching ``test_features``.
        feature_names: Feature names shown in the rendered tree.

    Returns:
        Tuple ``(test_results, train_results)``: the predictions for both
        sets after being passed through ``cap_results``.
    """
    regressor = tree.DecisionTreeRegressor()
    regressor.fit(train_features, train_labels)

    test_results = cap_results(regressor.predict(test_features))
    train_results = cap_results(regressor.predict(train_features))

    # A single pre-formatted argument prints identically whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print("test result %s" % (metrics.mean_squared_error(test_labels, test_results),))
    print("test r2 %s" % (metrics.r2_score(test_labels, test_results),))
    print("train result %s" % (metrics.mean_squared_error(train_labels, train_results),))
    print("train r2 %s" % (metrics.r2_score(train_labels, train_results),))

    # Render the fitted tree. A regressor has no ``classes_`` attribute, so no
    # ``class_names`` are passed (reading it raised AttributeError).
    dot_data = StringIO()
    tree.export_graphviz(regressor, out_file=dot_data,
                         special_characters=True,
                         impurity=False,
                         feature_names=feature_names)

    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_pdf("tree.pdf")

    return (test_results, train_results)
Exemple #2
0
def dt_graph(treeest, cv, scores, features, labels, featnames, outfile):
    """Refit the tree on the best cross-validation fold and write it as a PDF.

    Args:
        treeest: Tree estimator to refit.
        cv: Iterable yielding ``(train_indices, test_indices)`` pairs.
        scores: Per-fold scores; the fold with the highest score is used.
        features: Indexable feature container (indexed with the train indices).
        labels: Indexable label container (indexed with the train indices).
        featnames: Feature names shown in the rendered tree.
        outfile: Path of the PDF file to write.

    Returns:
        None. A diagnostic line is printed when ``cv`` yields fewer folds than
        the index of the best score.
    """
    bestfold = np.argmax(scores)
    for fold_index, (train, _) in enumerate(cv):
        # Skip until the training indices of the best fold show up.
        if fold_index != bestfold:
            continue

        treeest.fit(features[train], labels[train])

        # Export the fitted tree as dot source and render it.
        dot_data = StringIO()
        tree.export_graphviz(treeest, out_file=dot_data,
                             feature_names=featnames)
        graphs = pydot.graph_from_dot_data(dot_data.getvalue())
        # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
        graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
        graph.write_pdf(outfile)
        return

    print("You should never see this text from dt_graph!")
    return
Exemple #3
0
def visualize_tree(clf, outname, headers):
    """Render a fitted tree to ``outname`` as a PDF.

    Args:
        clf: Fitted tree estimator accepted by ``tree.export_graphviz``.
        outname: Path of the PDF file to write.
        headers: Iterable of feature names shown in the tree.
    """
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        # sklearn.externals.six was removed in scikit-learn 0.23.
        from io import StringIO
    import pydot

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data, feature_names=list(headers))
    dot_source = dot_data.getvalue()
    if isinstance(dot_source, bytes):
        # Python 2 byte string: keep the original latin-1 -> UTF-8 re-encoding.
        # A Python 3 ``str`` has no ``decode`` and is passed through unchanged.
        dot_source = dot_source.decode('latin1').encode('utf8')
    graphs = pydot.graph_from_dot_data(dot_source)
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_pdf(outname)
def main():
    """Train a depth-limited decision tree and write it to ``partition.png``.

    The training-set path is read from ``sys.argv[1]`` and parsed with
    ``ParseTraining``. When no argument is given, a usage line is printed and
    the function returns without doing anything else.
    """
    if len(sys.argv) < 2:
        print("One Argument Required; Training Set")
        return
    X_train, Y_train = ParseTraining(sys.argv[1])

    clf = tree.DecisionTreeClassifier(max_depth=6)
    clf = clf.fit(X_train, Y_train)

    # Labels used when rendering the tree.
    feature_names = ["partConf", "recAvg", "latency", "ReadRate", "homeconf"]
    class_names = ["Partition", "No Partition"]

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=feature_names,
                         class_names=class_names,
                         filled=True, rounded=True,
                         special_characters=True)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_png("partition.png")
def tree3():
    """Fit a decision tree on the columns selected in the posted form.

    On a POST request the form value stored under ``'DV'`` names the target
    column of the module-level ``df``; every other form value except the
    literal ``"Build Tree"`` is used as a feature column. The dot source of
    the fitted tree plus the first 15 rows of ``df`` are assembled into
    ``final_html``, which is stored globally and returned.

    Returns:
        The generated HTML string for POST requests, ``'helloo'`` otherwise.
    """
    global final_html
    global df, df_train, df_test, test_train_created, origin_df
    init_style_string = template.style_string
    if request.method != 'POST':
        return 'helloo'

    form = MultiDict(request.form)
    DV_tree = form.get('DV')
    # ``!=`` replaces the Python 2-only ``<>`` operator.
    chi_key = [value for value in form.values()
               if value != "Build Tree" and value != DV_tree]

    feature_values = df.loc[:, chi_key].values
    Y = df[DV_tree]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(feature_values, Y.values)

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    k = dot_data.getvalue()

    left_px = 600
    width_px = 150
    top_px = 50
    height_px = 309
    # NOTE(review): the result of build_tree_html is not used below -- the raw
    # dot text ``k`` is embedded instead. The call is kept in case it has side
    # effects; confirm which of the two was meant to be shown.
    s = build_tree_html(k, init_style_string, left_px, width_px, top_px, height_px)

    temp_df = df[0:15]
    t = """</div><div style="width:600px; height:700px; position: absolute; top: 20px; left:500px;"><br> Decision Tree result <br>"""
    final_html = template.s1 + t + k + "<br><br></div>" + temp_df.to_html()
    return final_html
Exemple #6
0
def mainTree():
    """Run the load / PCA / decision-tree pipeline and write ``marriage.pdf``.

    Reads the male and female data files, attaches the female ``class`` column
    to the male records through the id matches, scales the data, prepends 17
    PCA components, trains a decision model on a 0.9 split, renders the tree
    and prints the rate returned by ``test``.
    """
    header = ('id|gender|age|height|edu|salary|nation|car|house|body|face|hair|'
              'smoke|drink|child|parent|bmi|where0|where1|'
              'marriage0|marriage1|look0|look1|where2').split('|')
    MaleData = pd.read_csv('/home/idanan/jiayuan/code/resources/transed_M.txt', names=header, sep='|')
    FemaleData = pd.read_csv('/home/idanan/jiayuan/code/resources/cluster_female.txt', names=header + ['class'], sep='|')
    matches = matchDict('/home/idanan/jiayuan/code/resources/lovers_ids.txt')
    FemaleData['id'] = FemaleData['id'].map(partial(match, matches=matches))
    FemaleClass = FemaleData[['id', 'class']]
    newMaleData = concatData(MaleData, FemaleClass)
    MaleArrays = scaleData(newMaleData, ['id', 'gender'])

    # PCA is fitted on every column except the last one.
    pca = factors(MaleArrays[:, :-1], 17)
    # Pre-formatted single argument: same output under Python 2 and 3.
    print('PCA explained variance: %s' % (sum(pca.explained_variance_ratio_),))
    pcaMaleArray = pca.transform(MaleArrays[:, :-1])
    MaleArrays = np.c_[pcaMaleArray, MaleArrays]

    trainData, testData = departData(MaleArrays, 0.9)
    trainModel = decisionModel(trainData)

    dot_data = StringIO()
    tree.export_graphviz(trainModel, out_file=dot_data)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_pdf("/home/idanan/jiayuan/code/resources/marriage.pdf")

    rate = test(trainModel, testData)
    print('Decision Model true rate %s' % (rate,))
Exemple #7
0
def run_DT_model_2(df, criteria_col):
    """Train a decision tree predicting ``criteria_col`` and render it.

    The label column plus a fixed set of behavioural columns is selected from
    ``df``, rows with missing values are dropped, and the data is split 80/20
    with stratification on the label. The test accuracy and the confusion
    matrix are printed.

    Args:
        df: DataFrame containing ``criteria_col`` and the feature columns
            listed in ``tree_col``.
        criteria_col: Name of the label column (e.g. a 0/1 "high value" flag).

    Returns:
        Tuple ``(image, tree_train, tree_test)``: an IPython ``Image`` of the
        tree and the train/test frames that were used.
    """
    from sklearn.metrics import confusion_matrix
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        # scikit-learn < 0.18 only ships the old module name.
        from sklearn.cross_validation import train_test_split
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        # sklearn.externals.six was removed in scikit-learn 0.23.
        from io import StringIO
    from IPython.display import Image
    import pydotplus
    print ('criteria_col  =  ', criteria_col)
    # The label comes first so that ``iloc[:, 1:]`` below selects the features.
    tree_col = [criteria_col,'Frequency', 'LTV', 'period_no_use','AverageTimeToOrder',
          'late_by_collection', 'late_by_delivery', 'tickets', 'recleaned_orders',
         'cancalled_orders', 'voucher_used']
    tree_data = df[tree_col].dropna()
    tree_train, tree_test = train_test_split(tree_data,
                                           test_size=0.2,
                                           random_state=200,
                                           stratify=tree_data[criteria_col])
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(tree_train.iloc[:,1:], tree_train[criteria_col])
    print (clf.score(tree_test.iloc[:,1:], tree_test[criteria_col]))
    # Confusion matrix on the held-out part.
    print (confusion_matrix(tree_test[criteria_col], clf.predict(tree_test.iloc[:,1:])))
    # Visualize the tree.
    dot_data = StringIO()
    tree.export_graphviz(clf,
                       out_file=dot_data,
                       feature_names=tree_col[1:],
                       filled=True,
                       rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png()), tree_train, tree_test
Exemple #8
0
def decisionTree():
    """Fit a decision tree on the iris data and write it to ``iris.pdf``."""
    iris = load_iris()
    clf = tree.DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        class_weight=None
    )
    clf = clf.fit(iris.data, iris.target)

    dot_data = StringIO()
    tree.export_graphviz(
        clf,
        out_file=dot_data,
        feature_names=iris.feature_names,
        class_names=iris.target_names,
        filled=False,
        rounded=True,
        special_characters=True
    )
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_pdf("iris.pdf")
Exemple #9
0
def classifyTree(Xtr, ytr, Xte, yte, splitCriterion="gini", maxDepth=0, visualizeTree=False):
    """Classify data using a CART decision tree.

    Args:
        Xtr: Training feature vectors.
        ytr: Training labels.
        Xte: Test feature vectors.
        yte: Ground-truth labels of the test vectors.
        splitCriterion: Split criterion passed to the classifier.
        maxDepth: Maximum tree depth. A non-positive value (the default ``0``)
            means "no limit"; it is mapped to ``None`` because the classifier
            rejects a depth of zero.
        visualizeTree: When true, the learned tree is saved as a PDF.

    Returns:
        Tuple ``(accuracyRate, timing, probabilities, predicted)``. If an
        error occurs it is reported through ``prettyPrint`` and whatever was
        computed up to that point is returned (initially ``0.0``, ``0.0``,
        ``[]`` and ``[]``).
    """
    # Initialised before the ``try`` so the final ``return`` can never hit an
    # unbound name when an exception interrupts the computation.
    accuracyRate, timing, probabilities, predicted = 0.0, 0.0, [], []
    try:
        depthLimit = maxDepth if maxDepth and maxDepth > 0 else None
        # Perform classification
        cartClassifier = tree.DecisionTreeClassifier(criterion=splitCriterion, max_depth=depthLimit)
        startTime = time.time()
        prettyPrint("Training a CART tree for classification using \"%s\" and maximum depth of %s" % (splitCriterion, maxDepth), "debug")
        cartClassifier.fit(numpy.array(Xtr), numpy.array(ytr))
        prettyPrint("Submitting the test samples", "debug")
        predicted = cartClassifier.predict(Xte)
        endTime = time.time()
        # Compare the ground truth with the predictions
        accuracyRate = round(metrics.accuracy_score(yte, predicted), 2)
        # Also keep the probability estimates
        probabilities.append(cartClassifier.predict_proba(Xte))
        timing = endTime - startTime  # Keep track of performance
        if visualizeTree:
            dot_data = StringIO()
            tree.export_graphviz(cartClassifier, out_file=dot_data)
            graphs = pydot.graph_from_dot_data(dot_data.getvalue())
            # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
            graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
            # Build the name once so the logged name is the file actually written.
            pdfName = "tree_%s.pdf" % getTimestamp()
            prettyPrint("Saving learned CART to \"%s\"" % pdfName, "debug")
            graph.write_pdf(pdfName)

    except Exception as e:
        prettyPrint("Error encountered in \"classifyTree\": %s" % e, "error")

    return accuracyRate, timing, probabilities, predicted
def tree2():
    """Fit a decision tree on the columns selected in the posted form.

    On a POST request the form value stored under ``'DV'`` names the target
    column of the module-level ``df``; every other form value except the
    literal ``"Build Tree"`` is used as a feature column. The dot source of
    the fitted tree plus the first 15 rows of ``df`` are assembled into
    ``final_html``, which is stored globally and returned.

    Returns:
        The generated HTML string for POST requests, ``'helloo'`` otherwise.
    """
    global final_html
    global df, origin_df
    init_style_string = """<p style="position: absolute; font-size: 12px; top: <top>px; width: <width>px;  height: <height>px; left:<left>px; text-align: center;">tree_text_here</p>"""
    if request.method != 'POST':
        return 'helloo'

    form = MultiDict(request.form)
    DV_tree = form.get('DV')
    # ``!=`` replaces the Python 2-only ``<>`` operator.
    chi_key = [value for value in form.values()
               if value != "Build Tree" and value != DV_tree]

    feature_values = df.loc[:, chi_key].values
    Y = df[DV_tree]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(feature_values, Y.values)

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    k = dot_data.getvalue()

    left_px = 600
    width_px = 150
    top_px = 50
    height_px = 309
    # NOTE(review): the result of build_tree_html is not used below -- the raw
    # dot text ``k`` is embedded instead. The call is kept in case it has side
    # effects; confirm which of the two was meant to be shown.
    s = build_tree_html(k, init_style_string, left_px, width_px, top_px, height_px)

    temp_df = df[0:15]
    t = """</div><div style="float:right;"><br> Decision Tree result <br>"""
    final_html = template.s1 + t + k + "</div><br><br><br>" + temp_df.to_html()
    return final_html
def generate_plot(clf):
    """Render the fitted tree ``clf`` to ``weather_forecast.pdf``."""
    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print("\nGenerating plot...")
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_pdf("weather_forecast.pdf")
    print("Plot generated!")
def visualize_tree(dtree):
    """Display the fitted tree ``dtree`` inline as a PNG image."""
    dot_data = StringIO()
    tree.export_graphviz(dtree, out_file=dot_data,
                         filled=True, rounded=True,
                         special_characters=True)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    display(Image(graph.create_png()))
    def train_network(self):
        """Fit the decision tree on a training batch and export it to ``hitter_tree.pdf``.

        Pregame hitter entries are split 80/20 into training and evaluation
        data, the tree is fitted on one batch of ``SIZE_TRAINING_BATCH``
        training samples and rendered as a PDF. The evaluation entries are
        then paired with their postgame results; entries without a postgame
        row are reported and skipped. The database session is closed when the
        method finishes, also when an error interrupts it.
        """
        try:
            db_query = self._database_session.query(PregameHitterGameEntry)
            mlb_training_data, mlb_evaluation_data = self.get_train_eval_data(db_query, 0.8)
            X_train, Y_train = self.get_stochastic_batch(mlb_training_data, self.SIZE_TRAINING_BATCH)
            self._decision_tree.fit(X_train, Y_train)

            dot_data = StringIO()
            tree.export_graphviz(self._decision_tree, out_file=dot_data,
                                 feature_names=PregameHitterGameEntry.get_input_vector_labels())
            graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
            graph.write_pdf("hitter_tree.pdf")

            # NOTE(review): these evaluation lists are filled but not used
            # anywhere in this method.
            x_test_actual = list()
            y_test_actual = list()
            for data in mlb_evaluation_data:
                try:
                    postgame_entry = self._database_session.query(PostgameHitterGameEntry).filter(
                        PostgameHitterGameEntry.rotowire_id == data.rotowire_id,
                        PostgameHitterGameEntry.game_date == data.game_date).one()
                    y_test_actual.append([postgame_entry.actual_draftkings_points])
                    x_test_actual.append(data.to_input_vector())
                except NoResultFound:
                    # Pre-formatted single argument: valid on Python 2 and 3.
                    print("Ignoring hitter %s since his postgame stats were not found." % data.rotowire_id)
                    continue
        finally:
            self._database_session.close()
Exemple #14
0
def create_tree(X, Y):
    """Fit an entropy-based decision tree on ``(X, Y)`` and save it as ``Tree.pdf``.

    Returns:
        The fitted classifier.
    """
    clf = tree.DecisionTreeClassifier(criterion="entropy").fit(X, Y)

    from IPython.display import Image
    import pydotplus

    # Labels for the rendered graph: fixed feature columns and one "Ad #k"
    # name for each k in 1..len(Y).
    feature_names = ["Gender", "0-5", "6-12", "13-19", "20-27", "28-35", "36-50", "55+"]
    target_names = ["Ad #" + str(number) for number in range(1, len(Y) + 1)]

    dot_buffer = StringIO()
    tree.export_graphviz(
        clf,
        out_file=dot_buffer,
        feature_names=feature_names,
        class_names=target_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )

    pydotplus.graph_from_dot_data(dot_buffer.getvalue()).write_pdf("Tree.pdf")

    return clf
def applyDecisionTree(trainData, trainTargets, testData, testTargets, featureNames):
    """Train a decision tree, export its dot description and report performance.

    Args:
        trainData: Training feature vectors.
        trainTargets: Training labels.
        testData: Test feature vectors.
        testTargets: Ground-truth labels of the test vectors.
        featureNames: Feature names used in the exported tree.

    The tree (limited to depth 5 in the export) is written to
    ``DecisionTree.dot``; the predictions for ``testData`` are passed to
    ``showPerformance``.
    """
    decisionTree = DecisionTreeClassifier()
    model = decisionTree.fit(trainData, trainTargets)

    # Create graph description of the Decision Tree
    dot_data = StringIO()
    print("Feature names:", featureNames)
    export_graphviz(model, out_file=dot_data, feature_names=featureNames,
                    max_depth=5)
    export_graphviz(model, out_file="DecisionTree.dot", feature_names=featureNames,
                    max_depth=5)
    # The in-memory graph is parsed but not written anywhere; PDF output is
    # currently disabled.
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    #graph.write_pdf("DecisionTree.pdf")

    # Predict the whole test set in one call: ``predict`` expects a 2-D array
    # and rejects single 1-D samples on current scikit-learn releases.
    classification = list(model.predict(testData))

    print("\nUsing a Decision Tree:")
    showPerformance(testTargets, classification)
Exemple #16
0
def drawDecisionTree(dt, filename, featureNames, classNames):
    """Render the fitted tree ``dt`` to the PNG file ``filename``.

    Args:
        dt: Fitted tree estimator.
        filename: Path of the PNG file to write.
        featureNames: Feature names shown in the tree (also printed).
        classNames: Class names shown in the tree (also printed).
    """
    dot_data = StringIO()
    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print(featureNames)
    print(classNames)
    tree.export_graphviz(dt, out_file=dot_data, feature_names=featureNames,
                         class_names=classNames, rounded=True,
                         special_characters=True, filled=True)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_png(filename)
	def printTreePDF(self, path = './tree.pdf'):
		"""Write the fitted tree ``self.clf`` to ``path`` as a PDF.

		Raises:
			NameError: If ``self.clf`` is ``None`` (no tree was created).
		"""
		# Identity check: ``== None`` would invoke a custom ``__eq__``.
		if self.clf is None:
			raise NameError('Tree was not created!')

		dot_data = StringIO()
		tree.export_graphviz(self.clf, out_file=dot_data)
		graphs = pydot.graph_from_dot_data(dot_data.getvalue())
		# pydot >= 1.2 returns a list of graphs; older releases return one Dot.
		graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
		graph.write_pdf(path)
Exemple #18
0
def createGraph(clf):
    """Export the fitted tree ``clf`` to ``portScan.dot`` and ``portScan.pdf``."""
    # The export result is not assigned: rebinding the handle inside the
    # ``with`` block served no purpose.
    with open("portScan.dot", 'w') as dot_file:
        tree.export_graphviz(clf, out_file=dot_file)

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_pdf("portScan.pdf")
def printPdf(clf, dataTrain):
    """Write the fitted tree ``clf`` to ``sentiment.pdf`` and print the first training item.

    Args:
        clf: Fitted tree estimator.
        dataTrain: Object whose ``data`` attribute is indexable; ``data[0]``
            is printed.
    """
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        # sklearn.externals.six was removed in scikit-learn 0.23.
        from io import StringIO
    import pydot

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_pdf('sentiment.pdf')
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(dataTrain.data[0])
Exemple #20
0
def export_tree(clf, filename, feature_names=None, max_depth=None):
    """Write the fitted tree ``clf`` to ``filename`` as a PDF.

    Args:
        clf: Fitted tree estimator.
        filename: Path of the PDF file to write.
        feature_names: Optional feature names shown in the tree.
        max_depth: Optional depth limit of the exported representation.
    """
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        # sklearn.externals.six was removed in scikit-learn 0.23.
        from io import StringIO
    import pydot

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data, feature_names=feature_names, max_depth=max_depth)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_pdf(filename)
Exemple #21
0
def drawDecisionTree(classIndex):
    """Fit a tree for target ``y[classIndex]`` and save it as ``decisionTree_<classIndex>.pdf``.

    The features come from the module-level ``preference`` and the feature
    names from the module-level ``fname``.
    """
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(preference, y[classIndex])
    dot_data = StringIO()
    # TODO (from the original author): pass class_names=cnames[classIndex].
    tree.export_graphviz(clf, out_file=dot_data, feature_names=fname,
                         filled=True, rounded=True, special_characters=True)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    filename = "decisionTree_" + str(classIndex) + ".pdf"
    graph.write_pdf(filename)
Exemple #22
0
def tree_vis(clf):
    """Render the fitted tree ``clf`` to ``data/trees/tree.png``.

    Returns:
        An ``Image`` created from the written file.
    """
    fn = 'data/trees/{0}.png'.format('tree')
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_png(fn)
    return Image(filename=fn)
Exemple #23
0
def view(classifier):
    """ Renders a graph representation of classifier, and
        saves it to "MyTree.pdf" (a relative path, resolved
        against the current working directory).
    """
    tree_dot = StringIO()
    tree.export_graphviz(classifier, out_file=tree_dot)
    graphs = pydot.graph_from_dot_data(tree_dot.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_pdf("MyTree.pdf")
Exemple #24
0
def __plotTree(clf, name):
    """Render the fitted tree ``clf`` to ``outputdir + name + '.pdf'``.

    A dot file is written to ``outputdir + name`` first and deleted again
    afterwards, including when rendering the PDF fails.
    """
    dot_path = outputdir + name
    tree.export_graphviz(clf, out_file=dot_path)
    try:
        dot_data = StringIO()
        tree.export_graphviz(clf, out_file=dot_data)
        graphs = pydot.graph_from_dot_data(dot_data.getvalue())
        # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
        graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
        graph.write_pdf(dot_path + '.pdf')
    finally:
        # Do not leave the intermediate dot file behind on errors.
        os.remove(dot_path)

#plot utilities
def train_decision_tree_elite_status_classifier():
    """Trains and validates a decision tree model for predicting users' Elite status.

    The tree of the returned model is also written to
    ``analysis/analysis_results/decision_tree.pdf``.
    """
    model = train_and_validate_elite_status_classifier(DecisionTreeClassifier, DECISION_TREE_USER_ATTRIBUTES)

    # Output tree representation showing decision rules
    dot_data = StringIO()
    tree.export_graphviz(model, out_file=dot_data, class_names=True, filled=True)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_pdf('analysis/analysis_results/decision_tree.pdf')
Exemple #26
0
	def save_tree_png(self, store):
		"""Write the fitted tree ``self.clf`` to ``<store.dataset_path>/tree.png``.

		Args:
			store: Object whose ``dataset_path`` attribute names the target directory.
		"""
		import pydot
		try:
			from sklearn.externals.six import StringIO
		except ImportError:
			# sklearn.externals.six was removed in scikit-learn 0.23.
			from io import StringIO
		dot_data = StringIO()
		tree.export_graphviz(self.clf, out_file=dot_data,
							   feature_names=self.feature_names)
		# The first element of the value returned by pydot is the graph to render.
		graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
		with open(store.dataset_path + '/tree.png','wb') as f:
			f.write(graph.create_png())
Exemple #27
0
 def export(self, fpath):
     """
     Export the decision tree as a PDF file
     :param fpath: path of the PDF file to write
     :return: None
     """
     dot_data = StringIO()
     tree.export_graphviz(self.model, out_file=dot_data)
     graphs = pydot.graph_from_dot_data(dot_data.getvalue())
     # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
     graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
     graph.write_pdf(fpath)
def drawTree(X, y, names, depth, outFile, writePickle, pickleFile):
    """Fit a depth-limited tree, write it as a PDF and optionally pickle it.

    Args:
        X: Training feature vectors.
        y: Training labels.
        names: Feature names shown in the tree.
        depth: Maximum depth of the tree.
        outFile: Path of the PDF file to write.
        writePickle: When true, the fitted classifier is pickled.
        pickleFile: Path of the pickle file (used only if ``writePickle``).

    Returns:
        The fitted classifier.
    """
    clf = tree.DecisionTreeClassifier(max_depth=depth)  # criterion="entropy"
    clf = clf.fit(X, y)
    dot_data = StringIO()
    tree.export_graphviz(clf, feature_names=names, out_file=dot_data)
    graphs = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
    graph.write_pdf(outFile)
    if writePickle:
        # Context manager so the file handle is flushed and closed.
        with open(pickleFile, "wb") as pickle_handle:
            pickle.dump(clf, pickle_handle)
    return clf
Exemple #29
0
 def createTreePdf(self):
     """Write the classifier's tree to ``DT<class names joined by '-'>.pdf``.

     Does nothing when ``pydot`` cannot be imported.
     """
     try:
         import pydot
     except ImportError:
         # Rendering is optional; only a missing pydot is tolerated here.
         return
     dot_data = StringIO()
     tree.export_graphviz(self.getClf(),
             out_file = dot_data, feature_names = self.featureNames)
     graphs = pydot.graph_from_dot_data(dot_data.getvalue())
     # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
     graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
     graph.write_pdf("DT" + "-".join(self.classNames) + ".pdf")
    def printTree(self, export_pdf=True, pdf_name="Decision_Tree.pdf"):
        """Build the graph of ``self.alg`` and optionally write it as a PDF.

        Args:
            export_pdf: When true, the graph is written to ``pdf_name``.
            pdf_name: Path of the PDF file to write.

        Returns:
            The pydot graph object of the tree.
        """
        dot_data = StringIO()
        export_graphviz(self.alg, out_file=dot_data, feature_names=self.predictors,
                        filled=True, rounded=True, special_characters=True)
        graphs = pydot.graph_from_dot_data(dot_data.getvalue())
        # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
        graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs

        if export_pdf:
            graph.write_pdf(pdf_name)

        return graph
Exemple #31
0
# Select the test rows from the dataset
test_target = iris.target[test_id_test]
test_data = iris.data[test_id_test]

# Fit the classifier on the training data
classifier = tree.DecisionTreeClassifier()
classifier.fit(train_data, train_target)

# Show the real labels next to the predicted ones
print(f'Test target: {test_target}')
print(f'Predict target: {classifier.predict(test_data)}')

# Decision-tree visualization
try:
    from sklearn.externals.six import StringIO
except ImportError:
    # sklearn.externals.six was removed in scikit-learn 0.23.
    from io import StringIO
import pydot

dot_data = StringIO()

tree.export_graphviz(classifier,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True,
                     rounded=True,
                     impurity=False)

# The graphviz package renders the dot source to "iris" and opens the result
import graphviz as gp
graph = gp.Source(dot_data.getvalue())
graph.render("iris", view=True)
def train_holdout(base_dir, classifier_name, classifier):
    """Train ``classifier`` with a hold-out split and report its test results.

    ``<base_dir>/features.csv`` (first column: image folder, remaining
    columns: features) and ``<base_dir>/Y.csv`` (labels) are loaded and split
    70/30 into train and test parts. Classifiers exposing ``partial_fit`` are
    trained chunk by chunk with early stopping on a validation split; all
    others are fitted once. Misclassified test rows, the test accuracy and a
    confusion-matrix heat map are shown, and a ``DecisionTreeClassifier`` is
    additionally rendered to ``<classifier_name>.pdf``.

    Args:
        base_dir: Directory containing ``features.csv`` and ``Y.csv``.
        classifier_name: Name used in messages, plot titles and the PDF name.
        classifier: Estimator to train.

    Returns:
        Tuple ``(classifier, X_test, y_test)``.
    """
    base = pd.read_csv(f'{base_dir}/features.csv', sep=';', header=None)
    # The first column holds the image folder of every row
    images_dirs = base.iloc[:, 0]
    # The remaining columns are the features
    X = base.iloc[:, 1:]

    # Load the class labels
    y = pd.read_csv(f'{base_dir}/Y.csv', sep=';', header=None)
    # HOLDOUT: 70% of the base for training, 30% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                        random_state=42, stratify=y)

    all_classes = np.unique(y.to_numpy())

    def overfitting_prevent_train(X_train, y_train):
        """Train incrementally and stop once the validation error stalls."""
        # Split the training part again: 80% training, 20% validation
        X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=.2,
                                                                        random_state=42, stratify=y_train)

        chunk_size = 10

        graphic_data = []
        current_epoch = 0

        alpha = 0.8
        smooth_error_training = []
        smooth_error_validation = []
        overfitting_count = 0

        for chunk in chunk_generator(pd.concat([X_train, y_train], axis=1), chunk_size=chunk_size):
            classifier.partial_fit(chunk[0], np.ravel(chunk[1]), classes=all_classes)

            current_error_on_training = 1 - classifier.score(X_train, np.ravel(y_train))
            current_error_on_validation = 1 - classifier.score(X_validation, np.ravel(y_validation))

            graphic_data.append(
                GraphicData(x=current_epoch,
                            error_on_val=current_error_on_validation,
                            error_on_train=current_error_on_training)
            )

            if current_epoch == 0:
                smooth_error_training.append(current_error_on_training)
                smooth_error_validation.append(current_error_on_validation)
            else:
                # Exponential smoothing over the previous smoothed value and
                # the previous epoch's raw error.
                smooth_error_training.append(
                    alpha * smooth_error_training[current_epoch - 1] + (1 - alpha) * graphic_data[
                        current_epoch - 1].error_on_train)
                smooth_error_validation.append(
                    alpha * smooth_error_validation[current_epoch - 1] + (1 - alpha) * graphic_data[
                        current_epoch - 1].error_on_val)

            if current_epoch >= 1:
                if smooth_error_validation[-2] - smooth_error_validation[-1] < 1e-03:
                    # The validation error did not improve noticeably
                    overfitting_count += 1
                    if overfitting_count >= 5:
                        print(f'Overfitting Detected on {classifier_name}, epoch: {current_epoch}')
                        break
                else:
                    # Reset only after a real improvement. The reset used to
                    # be attached to the inner ``if``, which cleared the
                    # counter right after every increment, so the stop
                    # condition above could never be reached.
                    overfitting_count = 0

            current_epoch += 1

        print(f'Acurácia {classifier_name} Validação: {classifier.score(X_validation, np.ravel(y_validation))}')

        plt.title(f'Error Compare --> {classifier_name} --> {base_dir}')
        plt.plot([data.x for data in graphic_data],
                 smooth_error_training, label='Error on Train Smooth')
        plt.plot([data.x for data in graphic_data],
                 smooth_error_validation, label='Error on Validation Smooth')

        plt.ylabel('Error')
        plt.xlabel('Epoch')
        plt.legend()
        plt.show()

    def normal_fit():
        """Fit the classifier once on the whole training part."""
        classifier.fit(X_train, y_train)

    def visualizate_data():
        """Show a 2-D t-SNE projection of the features coloured by class."""
        x_tsne = TSNE(n_components=2).fit_transform(X)

        y_aux = pd.DataFrame()
        y_aux['classes'] = y[0]

        tsne_df = pd.DataFrame()
        tsne_df['tsne-x'] = x_tsne[:, 0]
        tsne_df['tsne-y'] = x_tsne[:, 1]
        tsne_df = pd.concat([tsne_df, y_aux], axis=1)

        plt.figure(figsize=(16, 10))
        # Keyword arguments: newer seaborn releases reject positional x / y.
        sns.scatterplot(x='tsne-x', y='tsne-y',
                        hue="classes",
                        legend='full',
                        palette=sns.color_palette("hls", 10),
                        alpha=0.3,
                        data=tsne_df)
        plt.show()

    if hasattr(classifier, 'partial_fit'):
        overfitting_prevent_train(X_train, y_train)
    else:
        normal_fit()

    predicated_rows = classifier.predict(X_test)
    predicated_proba = classifier.predict_proba(X_test)

    # ``row_pos`` is the position of the test row inside ``predicated_proba``
    for row_pos, (predicated, expected) in enumerate(zip(predicated_rows, y_test.iterrows())):
        if expected[1][0] != predicated:
            img_dir = images_dirs[expected[0]]
            percent = predicated_proba[row_pos][np.where(all_classes == predicated)]
            print(f'Confundiu {img_dir} com {predicated}, proba: {percent}')

    print(f'Acurácia {classifier_name} Teste : {classifier.score(X_test, y_test)}')

    cm = confusion_matrix(y_test, predicated_rows)
    df_cm = pd.DataFrame(cm, index=all_classes, columns=all_classes)
    sns.heatmap(df_cm, annot=True)
    plt.title(f'Confusion Matrix --> {classifier_name} --> {base_dir}')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    # visualizate_data()

    if isinstance(classifier, DecisionTreeClassifier):
        try:
            from sklearn.externals.six import StringIO
        except ImportError:
            # sklearn.externals.six was removed in scikit-learn 0.23.
            from io import StringIO
        from sklearn.tree import export_graphviz
        import pydot

        dot_data = StringIO()
        export_graphviz(classifier, out_file=dot_data, rounded=True,
                        filled=True)
        graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
        graph.write_pdf(f'{classifier_name}.pdf')

    return classifier, X_test, y_test
Exemple #33
0
tree.fit(x_train, y_train)
print(tree.score(x_train, y_train))
print(tree.score(x_test, y_test))


# Export the fitted tree as a dot file
from sklearn.tree import export_graphviz

export_graphviz(tree, out_file='cancertree.dot', class_names=['m','b'], feature_names=cancer.feature_names, filled=True)


import pydot
import graphviz
try:
    from sklearn.externals.six import StringIO
except ImportError:
    # sklearn.externals.six was removed in scikit-learn 0.23.
    from io import StringIO

dotfile = StringIO()
export_graphviz(tree, out_file=dotfile, class_names=['m','b'], feature_names=cancer.feature_names, filled=True)
graphs = pydot.graph_from_dot_data(dotfile.getvalue())
# pydot >= 1.2 returns a list of graphs; older releases return one Dot.
graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
graph.write_png("dtree2.png")

import matplotlib.image as mpimg

# Read back the file written above (the old name 'dtree2.png.png' never existed).
img = mpimg.imread('dtree2.png')
plt.imshow(img)
plt.show()

from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
def plot(seed):
    """Cross-validate a decision tree on ``cryo.csv`` and build its graph.

    The tab-separated file is shuffled, the first six columns are used as
    features and the last column as label. A ``DecisionTreeClassifier``
    configured by ``createHyperParameters(seed)`` is evaluated with 5-fold
    cross-validation; the estimator left after the last fold is rendered,
    scored on the whole data set and its ROC curve is shown.

    Args:
        seed: Seed for ``random``, ``numpy.random`` and the hyper-parameters.

    Returns:
        Tuple ``(ACCVALMIN, ACCVALMAX, ACCVALMEAN, ROCAREA, params, graph)``:
        minimum, maximum and mean fold accuracy, the ROC AUC on the whole
        data set, the hyper-parameters and the pydotplus graph.
    """
    filename = 'cryo.csv'
    random.seed(seed)
    np.random.seed(seed)
    data = pd.read_csv(filename, sep='\t')
    data = data.sample(frac=1).reset_index(drop=True)
    X = data.values[:, 0:6]
    y = data.values[:, -1]
    names = list(data)[0:6]

    params = createHyperParameters(seed)

    # No ``random_state``: it has no effect without shuffling, and current
    # scikit-learn raises ValueError when it is combined with shuffle=False.
    kf = KFold(n_splits=5, shuffle=False)
    acc_history = []

    reg = DecisionTreeClassifier()
    reg.set_params(**params)
    for (train_indices, val_indices) in kf.split(X, y):
        xtrain, xval = X[train_indices], X[val_indices]
        ytrain, yval = y[train_indices], y[val_indices]
        reg.fit(xtrain, ytrain)
        ypred = reg.predict(xval)
        acc_history.append(accuracy_score(yval, ypred))

    ACCVALMIN = np.min(acc_history)
    ACCVALMAX = np.max(acc_history)
    ACCVALMEAN = np.mean(acc_history)

    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        # sklearn.externals.six was removed in scikit-learn 0.23.
        from io import StringIO
    from sklearn.tree import export_graphviz
    import pydotplus
    import collections

    dot_data = StringIO()
    export_graphviz(reg,
                    out_file=dot_data,
                    feature_names=names,
                    filled=True,
                    rounded=True,
                    special_characters=True)
    colors = ('turquoise', 'orange')
    edges = collections.defaultdict(list)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

    # Collect the children of every node ...
    for edge in graph.get_edge_list():
        edges[edge.get_source()].append(int(edge.get_destination()))

    # ... and colour the lower-numbered child turquoise, the other orange.
    for children in edges.values():
        children.sort()
        for color, child in zip(colors, children):
            dest = graph.get_node(str(child))[0]
            dest.set_fillcolor(color)

    predictions = reg.predict_proba(X)
    ROCAREA = roc_auc_score(y, predictions[:, 1])
    ypr = reg.predict(X)
    print("Global Set Accuracy Score : %.4f" % (accuracy_score(y, ypr)))
    print("Area under ROC Curve : %.4f" % (ROCAREA))

    fpr, tpr, _ = roc_curve(y, predictions[:, 1])
    plt.clf()
    plt.plot(fpr, tpr)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC curve')
    plt.show()

    return (ACCVALMIN, ACCVALMAX, ACCVALMEAN, ROCAREA, params, graph)
Exemple #35
0
ac1
# Accuracy of the decision tree, in percent
ac2=100*accuracy_score(y_test,pred2)
ac2
# Accuracy of Naive Bayes, in percent
ac3=100*accuracy_score(y_test,pred3)
ac3

try:
    from sklearn.externals.six import StringIO
except ImportError:
    # sklearn.externals.six was removed in scikit-learn 0.23.
    from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

import pydot

# Render the decision tree (clf2) as a PNG image
dotfile = StringIO()
export_graphviz(clf2, out_file=dotfile,  filled=True, rounded=True, special_characters=True)
(graph,) = pydot.graph_from_dot_data(dotfile.getvalue())
Image(graph.create_png())

# Bar chart comparing the three classifier accuracies
objects = ('NaiveBayes','Decision Tree', 'RandomForest',)
y_pos = np.arange(len(objects))
performance = [ac3,ac2,ac1]

plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
# The bars show accuracy percentages, not usage.
plt.ylabel('Accuracy (%)')
plt.title('Classifier Accuracy')


plt.show()
    def customer_segment(customerslist, productname):
        """Fit and render a decision tree predicting who bought ``productname``.

        For each customer id the function runs Cypher queries through
        ``session`` to build one feature row (age, price-sensitivity bucket,
        top category, total spent, number of categories bought) and a 0/1
        target saying whether the customer bought ``productname``. It then
        one-hot encodes the categorical columns, fits a
        ``DecisionTreeClassifier`` and writes the tree to
        ``search/static/stuff/segment.png`` under ``base_dir``.

        Args:
            customerslist: sequence of customer id strings (they are
                concatenated directly into the Cypher text).
            productname: product description string matched against
                ``stockcode.Description``.

        Returns:
            None. Results are side effects: the PNG file, printed output,
            entries written into ``globals()`` and appends to lists that are
            defined outside this function (``starts``, ``ends``,
            ``bucketsizes``, ``cat``, ``target``, ``countryofcustomer``,
            ``totalspent``, ``category``, ``most_buyed_item_cost``,
            ``pricesensi``, ``actualcategoryspresent``).

        NOTE(review): those outer lists are only appended to here, never
        cleared, so repeated calls presumably accumulate stale rows — confirm
        that the caller resets them.
        NOTE(review): every query is built by string concatenation; if the
        ids or ``productname`` can come from user input this is open to
        Cypher injection — consider query parameters.
        """
        # Per-customer working vectors are stored in the module globals under
        # keys such as "<id> vector".
        g = globals()



        # Collect the distinct prices per category to derive each category's
        # price range. The query also rewrites Actual_Price as a float.
        xx='match(c:customerid)-[r:Bought_this]->(s:stockcode) ' \
           'set r.Actual_Price=toFloat(r.Actual_Price)' \
           'return s.Category,collect(DISTINCT r.Actual_Price) as pricelist ' \
           'order by s.Category, pricelist'
        pricebucket = session.run(xx)

        for i in pricebucket:
            # print(i)
            actualcategoryspresent.append(i['s.Category'])
            if len(i['pricelist']) == 1:
                # A single price would give a zero-width range; widen it by 1
                # on each side.
                start = (i['pricelist'][0]) - 1
                end = (i['pricelist'][0]) + 1
            else:
                start = min(i['pricelist'])
                end = max(i['pricelist'])

            # The range is cut into ten equal steps.
            bucketsize = (end - start) / 10
            starts.append(start)
            ends.append(end)
            bucketsizes.append(bucketsize)
            cat.append(i['s.Category'])
            # print('start      :',start)
            # print('end        :',end)
            # print('bucket_size:',bucketsize)
        # Target label: 1 if the customer has a Bought_this relationship to
        # the product, otherwise 0 (the optional MATCH yields a NULL quantity).
        for i in customerslist:
            x = 'optional MATCH(c:customerid{CustomerID: "' + i + '"})-[r:Bought_this]->(s:stockcode{Description: "' + productname + '"}) ' \
                 'return distinct ' \
                 'case ' \
                 'when r.Quantity IS NULL THEN 0 ' \
                 'when r.Quantity IS NOT NULL THEN 1 ' \
                 'else r.Quantity END AS Quantity '
            for ii in session.run(x):
                target.append(ii[0])
        # Country of each customer (collected here, dropped again before the
        # model is fitted).
        for i in customerslist:
            x = 'match(c1:customerid{CustomerID: "' + i + '"})-[r1:Bought_this]->(s1:stockcode) return c1.Country limit 1'
            result = session.run(x)
            for ii in result:
                countryofcustomer.append(ii[0])
        for i in customerslist:
            # Fresh zeroed vectors for this customer, one slot per category
            # (and four slots for the price-sensitivity bucket).
            g[i + ' vector'] = [0] * len(cat)
            g[i + ' category-wise purchase vector'] = [0] * len(cat)
            g[i + ' bucket_vector'] = [0] * 4


            # Per-category spend and quantity, highest spend first.
            x='match(c1:customerid{CustomerID: "'+i+'"})-[r1:Bought_this]->(s1:stockcode) ' \
              'set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price)' \
              'return c1.CustomerID as CustomerID, s1.Category as Category,reduce(sum=0, i in collect(r1.Quantity * r1.Price) | sum + i) as totalspent , reduce(sum=0, i in collect(r1.Quantity) | sum + i) as Quantity ' \
              'order by totalspent desc ' \

            result = session.run(x)
            count = 0
            # NOTE: this loop variable shadows the outer `i` (the customer id);
            # the outer loop rebinds it on its next iteration.
            for i in result:
                count = count + 1
                if count == 1:

                    # The first row is the category with the highest spend; it
                    # becomes the customer's top category.
                    # NOTE(review): a customer with no rows appends nothing,
                    # which would misalign totalspent/category with
                    # customerslist — confirm this cannot happen.
                    totalspent.append(i['totalspent'])
                    category.append(i['Category'])
                    #print(i['totalspent'])
                    if i['Category'] in cat:
                        vv = cat.index(i['Category'])
                        g[i['CustomerID'] +
                          ' category-wise purchase vector'][vv] = i['Quantity']

                else:
                    if i['Category'] in cat:
                        vv = cat.index(i['Category'])
                        g[i['CustomerID'] +
                          ' category-wise purchase vector'][vv] = i['Quantity']

        # Most frequent price the customer paid inside their top category.
        for i in range(0, len(customerslist)):
            x='match(c1:customerid{CustomerID:"'+customerslist[i]+'"})-[r1:Bought_this]->(s1:stockcode{Category:"'+category[i]+'"}) ' \
              'set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price) ' \
              'return c1.CustomerID as CustomerID,collect(r1.Actual_Price) as prices_in_category'
            result = session.run(x)

            for yy in result:
                # print(yy[1])
                most_buyed_item_cost.append(float(Most_Common(yy[1])))

        # Binary vector marking which categories each customer has bought.
        for i in customerslist:
            dd = 'match(c1:customerid{CustomerID: "' + i + '"})-[r1:Bought_this]->(s1:stockcode) set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price) return c1.CustomerID as CustomerID, collect( distinct s1.Category) as Categoryss'
            result = session.run(dd)
            for uu in result:
                for hh in uu['Categoryss']:
                    #print(hh)

                    if hh in cat:
                        vv = cat.index(hh)
                        g[i + ' vector'][vv] = 1

        # Price-sensitivity label from where the most-bought price falls in
        # the top category's price range.
        for t in range(0, len(customerslist)):

            # NOTE(review): when category[t] is not in cat, `vv` keeps its
            # value from an earlier iteration/loop — confirm that is intended.
            if category[t] in cat:
                vv = cat.index(category[t])
            l = drop_in_bucket(starts[vv], ends[vv], bucketsizes[vv],
                               most_buyed_item_cost[t])
            pricesensi.append(l)
        # presumably fills the outer `ages` list read below — verify against
        # ageassign.
        ageassign(pricesensi)
        df = pd.DataFrame(columns=[
            'age', 'p_s', 'category', 'totalspent', 'total_cat_bought',
            'country'
        ])
        for t in range(0, len(customerslist)):
            #print(t)
            if category[t] in cat:
                vv = cat.index(category[t])
            # print('------------------------------------------------------------------------------------------------------------------')
            # print('customer                                         :',customerslist[t])
            # print('customer age                                     :',ages[t])
            # print('country                                          :',countryofcustomer[t])
            # print('total spent                                      :',totalspent[t])
            # print('category spent                                   :',category[t])
            # print('customer category vector                         :',g[customerslist[t]+' vector'])
            # One-hot encode the price-sensitivity label into bucket_vector.
            if pricesensi[t] == 'High':
                g[customerslist[t] + ' bucket_vector'][0] = 1
            if pricesensi[t] == 'Medium High':
                g[customerslist[t] + ' bucket_vector'][1] = 1
            if pricesensi[t] == 'Medium Low':
                g[customerslist[t] + ' bucket_vector'][2] = 1
            if pricesensi[t] == 'Low':
                g[customerslist[t] + ' bucket_vector'][3] = 1
            # print('customer category-wise purchase vector           :', g[customerslist[t] + ' category-wise purchase vector'])
            # print('customer bucket vector                           :',g[customerslist[t] +' bucket_vector'])
            # print('starting price of that category                  :',starts[vv])
            # print('ending price of that category                    :',ends[vv])
            # print('bucket size of category                          :',bucketsizes[vv])
            # print('most buyed item cost                             :',most_buyed_item_cost[t])
            # print('price sensitivity                                :',pricesensi[t])
            # print('total catogories bought                          :',sum(g[customerslist[t]+' vector']))
            # print('Yes/No                                           :',target[t])
            # NOTE(review): DataFrame.append is deprecated in recent pandas
            # releases — check the pinned pandas version before upgrading.
            df = df.append(
                {
                    'age': ages[t],
                    'p_s': pricesensi[t],
                    'category': category[t],
                    'totalspent': totalspent[t],
                    'total_cat_bought': int(
                        sum(g[customerslist[t] + ' vector'])),
                    'country': countryofcustomer[t]
                },
                ignore_index=True)

        # print(df)
        # Country is collected above but excluded from the model features.
        df = df.drop('country', axis=1)

        # Replace the categorical columns with one-hot indicator columns.
        hot_ps = pd.get_dummies(df.p_s)
        df = df.join(hot_ps)
        df = df.drop('p_s', axis=1)

        hot_category = pd.get_dummies(df.category)
        df = df.join(hot_category)
        df = df.drop('category', axis=1)

        # hot_country = pd.get_dummies(df.country)
        # df=df.join(hot_country)
        # df=df.drop('country',axis=1)

        data = df.values
        train_target = target
        train_data = data
        # print(train_target)
        x = tree.DecisionTreeClassifier()
        print("##################____________________productname", productname)
        print(train_data)
        print(train_target)
        x.fit(train_data, train_target)
        dot_data = StringIO()
        tree.export_graphviz(x,
                             out_file=dot_data,
                             feature_names=df.columns.tolist(),
                             class_names=['No', 'Yes'],
                             filled=True,
                             rounded=True,
                             impurity=False)
        # tree.export_graphviz(x,out_file='tree.dot')
        # NOTE(review): the buffer was just written to, so its read position
        # is presumably at the end and this loop likely prints nothing.
        for i in dot_data:
            print(i)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        # print(graph)
        #os.remove("D:\E-commerce_data_visualization\search\static\stuff/segment.png")
        # os.remove(file) for file in os.listdir('path/to/directory') if file.endswith('.png')
        # os.system(rm ,"D:\E-commerce_data_visualization\search\static\stuff/segment.png")
        graph.write_png(
            os.path.join(base_dir, 'search/static/stuff/segment.png'))
        print(productname, "productname727")
Exemple #37
0
    def buildTree(self):
        """Train regression trees by interacting with ``self.env``.

        Repeats the following ``self.num_run`` times: initialise the buffer,
        fit ``self.current_tree`` on ``self.X``/``self.y``, then keep
        collecting ``self.update`` environment steps per iteration and
        refitting a fresh ``DecisionTreeRegressor`` until
        ``self.num_trees_optimal_policy`` iterations have passed the
        ``TEST_REWARD`` sliding-window check. Afterwards a tree is fitted on
        the samples gathered during those final iterations
        (``self.X_final``/``self.y_final``), optionally rendered to
        ``final_tree.png``, and its test reward and Monte Carlo error are
        recorded.

        Only part of this method is visible here; what it does with the
        collected statistics and what it returns is not shown.
        """

        k = 0

        #Until now I considered as threshold for test reward:
        # 1- Without considering FER: TEST_REWARD = 76.0
        # 2 - Without considering SER: TEST_REWARD = 76.0
        # 4 - Without considering ES: TEST_REWARD = 76.0
        TEST_REWARD = 76.0

        # I want to know the average reward of all features (in this case three features) after 25 run
        avgRewAllFeatures = []
        # I want to know the average error of all features (in this case three features) after 25 run
        total_error = np.array([])

        #  confidenceIntervalFeatures is a dictionary containing the Monte Carlo error considered for the self.num_trees_optimal policy trees for each run of the algorithm without considering the i-th feature (in this case the i-th key)
        confidenceIntervalFeatures = dict()
        for elem in self.features:
            confidenceIntervalFeatures[elem] = np.array([])

        # Average reward after self.n_runs for trees without a feature
        averageRewardFeatures = dict()
        for elem in self.features:
            averageRewardFeatures[elem] = np.array([])

        # NOTE(review): this is a plain list, so the `+=` with
        # feature_importances_ further down extends it with new elements
        # rather than adding element-wise — confirm whether a numpy array
        # was intended. The length 3 is also hard-coded.
        total_importance = [
            0.0, 0.0, 0.0
        ]  # it considers the importance of all runs of the algorithm

        while (k < self.num_run):
            print("RUN " + str(k))

            # first tree based on the randomly generated buffer
            self.initializeBuffer()

            # Fit regression model
            self.current_tree.fit(self.X, self.y)

            # i is used to count the number of updates a tree has done, j is used to update the value of epsilon
            # lastRun is used to count how many trees you want to build considering the optimal policy (epsilon = 0) after reaching the stopping criteria for training
            i = j = flag = lastRun = 0

            scores = []
            slidingWindowReward = []

            while (lastRun < self.num_trees_optimal_policy):

                print("Tree number: " + str(j))
                total_reward = 0
                epsilon = self.get_epsilon(j)
                current_state = self.env.reset()

                # Once the window is full, either count this iteration as an
                # optimal-policy tree (mean reward above the threshold) or
                # start a new window.
                if (len(slidingWindowReward) == self.slidingWindow):
                    if (sum(slidingWindowReward) / len(slidingWindowReward) >
                            TEST_REWARD):
                        total_importance += self.current_tree.feature_importances_
                        print("Total importance: " + str(total_importance))
                        lastRun += 1
                        epsilon = 0  # in order to consider the optimal policy we set epsilon = 0
                    else:
                        slidingWindowReward = [
                        ]  # we clear the slidingWindow if the samples considered don't match our threshold (TEST_REWARD)

                while (i < self.update):

                    # Drop the state components of features that are not in
                    # use. NOTE(review): list.remove() deletes the first
                    # element *equal to* the given value, which is not
                    # necessarily the one at that index — confirm the state
                    # values cannot collide.
                    if ("Speech Emotion Recognition" not in self.features):
                        listx = list(current_state)
                        listx.remove(current_state[1])
                        tuplex = tuple(listx)
                        current_state = tuplex

                    if ("Object State" not in self.features):
                        listx = list(current_state)
                        listx.remove(current_state[2])
                        tuplex = tuple(listx)
                        current_state = tuplex

                    if (("Speech Emotion Recognition" in self.features)
                            and ("Object State" in self.features)):
                        current_state = current_state[self.initialIndex:self.
                                                      finalIndex]

                    action = self.choose_action(current_state, epsilon)
                    obs, reward, done, _ = self.env.step(action)
                    # Keep the unfiltered observation: it becomes the next
                    # current_state and is filtered again on the next step.
                    temp = obs

                    if ("Speech Emotion Recognition" not in self.features):
                        listx = list(obs)
                        listx.remove(obs[1])
                        tuplex = tuple(listx)
                        obs = tuplex
                    if ("Object State" not in self.features):
                        listx = list(obs)
                        listx.remove(obs[2])
                        tuplex = tuple(listx)
                        obs = tuplex
                    if (("Speech Emotion Recognition" in self.features)
                            and ("Object State" in self.features)):
                        obs = obs[self.initialIndex:self.finalIndex]

                    total_reward += reward

                    # Overwrite the predicted value of the taken action with
                    # the one-step target: reward + gamma * max predicted
                    # value of the next state.
                    q_current = self.current_tree.predict([current_state])
                    q_new = self.current_tree.predict([obs])
                    q_current[0][action] = reward + self.gamma * np.max(
                        q_new[0])
                    self.buffer.append(
                        [current_state, action, obs, q_current[0]])
                    self.X.append(current_state)
                    self.y.append(q_current[0])

                    if (
                            not lastRun == 0
                    ):  # it means until we did not build self.num_trees_optimal_policy
                        self.X_final.append(current_state)
                        self.y_final.append(q_current[0])
                    current_state = temp
                    i += 1
                    if done:
                        current_state = self.env.reset()
                        # Only the first finished episode of this iteration is
                        # scored; `flag` blocks later ones.
                        if (not flag):
                            scores.append(total_reward)
                            flag = 1
                            if (lastRun == 0):
                                slidingWindowReward.append(self.test_reward())
                i = 0
                # No episode finished within this iteration: fall back to the
                # reward accumulated so far.
                if (not flag):
                    scores.append(total_reward)
                    if (lastRun == 0):
                        slidingWindowReward.append(total_reward)

                self.current_tree = DecisionTreeRegressor()
                self.current_tree.fit(self.X, self.y)
                j += 1
                flag = 0

            # Testing considering all the variables but training the model on samples generated by last self.num_trees_optimal_policy with epsilon = 0
            print("Testing the tree considering all variables")
            self.current_tree = DecisionTreeRegressor()
            self.current_tree.fit(self.X_final, self.y_final)
            # I want to know the average reward after 25 run considering all variables
            avgRewAllFeatures.append(self.test_reward())

            # Visualize data
            if (self.MAX_LEN > 2 and self.update > 2):
                dot_data = StringIO()
                export_graphviz(self.current_tree,
                                out_file=dot_data,
                                filled=True,
                                rounded=True,
                                special_characters=True,
                                feature_names=self.features)
                graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
                # The Image object is discarded here; the PNG is saved by the
                # write_png call below.
                Image(graph.create_png())
                graph.write_png("final_tree.png")

            # Now we build four trees (each one without considering one of the four features), we generate a test-set for each tree (so we generate some episodes)
            # From this episodes we compute the Monte-Carlo error that is (r + gamma * G_{t+1} - q^predicted(x,a))^{2} e we sum all these differences to compute the total error
            # Higher is this error, more important is the removed feature (so the feature we didn't consider)
            """for i in range(0, len(self.features)):
                print("Testing the final tree without considering " + str(self.features[i]))
                self.current_tree = DecisionTreeRegressor()
                X_feature = self.dataFilter(i)
                self.current_tree.fit(X_feature, self.y_final)
                r, e = self.getMonteCarloError(i)

                averageRewardFeatures[self.features[i]] = np.append(averageRewardFeatures[self.features[i]], r)
                confidenceIntervalFeatures[self.features[i]] = np.append(confidenceIntervalFeatures[self.features[i]], e)"""

            # Refit on the final samples and record the Monte Carlo error with
            # all features present.
            self.current_tree = DecisionTreeRegressor()
            self.current_tree.fit(self.X_final, self.y_final)
            r, e = self.getMonteCarloError(len(self.features))
            total_error = np.append(total_error, e)

            k += 1
        """"# This is the final average reward after 25 run considering all features
# Script fragment: train a depth-limited decision tree classifier, report its
# test accuracy and render the fitted tree as a PNG.
# NOTE(review): X, y and featureCols are not defined in this fragment —
# presumably they are set up earlier in the original script.

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% testing

# Create decision tree classifier object
clf = DecisionTreeClassifier(max_depth=5) # note: arguments are optional (used for pruning)

# Train decision tree classifier
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Evaluate model by printing the model accuracy (how often the classifier is correct)
print("\nAccuracy:", metrics.accuracy_score(y_test, y_pred))
print()

# Visualize the fitted tree: export it as DOT text into an in-memory buffer
print("Beginning Visualization...")
dotData = StringIO()

print("\nExporting Graphviz...")
export_graphviz(clf, out_file=dotData, filled=True, rounded=True, special_characters=True, feature_names = featureCols, class_names=['0','1'])

# Parse the DOT text and write the rendered tree to disk
print("\nGraphing using pydotplus library...")
graph = pydotplus.graph_from_dot_data(dotData.getvalue())
graph.write_png('BigTheta.png')

# As the last expression, the Image is displayed inline when run in a notebook
print("Printing now...")
Image(graph.create_png())
Exemple #39
0
from sklearn.datasets import load_iris
from sklearn.externals.six import StringIO
from sklearn import tree

# Fit an unconstrained decision tree on the full iris dataset.
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)
# Write the tree as DOT text to a file. The original rebinding of `f` to the
# call's return value has been dropped: it did nothing useful.
with open("iris.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f)

import os

# The DOT file was only written for demonstration; remove it again.
os.unlink('iris.dot')
import pydot

# Same export, this time into an in-memory buffer that is rendered to a PDF.
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
# pydot >= 1.2 returns a list of graphs while older releases returned a single
# graph object; accept both so write_pdf/create_png are called on a graph.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
if isinstance(graph, (list, tuple)):
    graph = graph[0]
graph.write_pdf("iris.pdf")
from IPython.display import Image

# Styled export with readable feature/class names, displayed inline as a PNG.
dot_data = StringIO()
tree.export_graphviz(clf,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True,
                     rounded=True,
                     special_characters=True)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
if isinstance(graph, (list, tuple)):
    graph = graph[0]
Image(graph.create_png())
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from matplotlib import pyplot as plt
from pydot import graph_from_dot_data
import pandas as pd
import numpy as np

# Script: fit a decision tree on iris with one-hot encoded targets, show the
# rendered tree and compute a confusion matrix on the held-out split.
# NOTE: load_iris is not part of the import group directly above; this relies
# on it having been imported earlier.
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
# Turn the integer codes into named categories, then into one indicator
# column per species.
y = pd.Categorical.from_codes(iris.target, iris.target_names)
# Only has a visible effect as the last expression of a notebook cell.
X.head()
y = pd.get_dummies(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Render the tree: DOT text -> PNG file -> displayed with matplotlib.
dot_data = StringIO()
export_graphviz(dt, out_file=dot_data, feature_names=iris.feature_names)
# This unpacking expects graph_from_dot_data to return a one-element sequence.
(graph, ) = graph_from_dot_data(dot_data.getvalue())
graph.write_png("tmp.png")
plt.imshow(plt.imread("tmp.png"))
plt.show()

# Collapse the one-hot rows back to class indices before comparing.
y_pred = dt.predict(X_test)
species = np.array(y_test).argmax(axis=1)
predictions = np.array(y_pred).argmax(axis=1)
confusion_matrix(species, predictions)
Exemple #41
0
def plotTree(treeName, tree, featureNames):
	"""Render a fitted decision tree to ``<treeName>.png``.

	The tree is exported as Graphviz DOT text (filled, rounded nodes labelled
	with ``featureNames``), parsed with pydotplus and written to disk.
	Returns None.
	"""
	dot_buffer = StringIO()
	export_graphviz(
		tree,
		out_file=dot_buffer,
		feature_names=featureNames,
		filled=True,
		rounded=True,
	)
	dot_source = dot_buffer.getvalue()
	rendered = pydotplus.graph_from_dot_data(dot_source)
	# The Image object is built from the PNG bytes but not returned.
	png_bytes = rendered.create_png()
	Image(png_bytes)
	output_path = treeName + '.png'
	rendered.write_png(output_path)
Exemple #42
0
def draw_tree(model, name):
    """Export ``model`` as Graphviz DOT text and save it as ``<name>.pdf``."""
    dot_buffer = StringIO()
    _tree.export_graphviz(model, out_file=dot_buffer)
    dot_source = dot_buffer.getvalue()
    pdf_path = name + ".pdf"
    pydotplus.graph_from_dot_data(dot_source).write_pdf(pdf_path)
Exemple #43
0
# Script fragment: train an entropy-based decision tree on the kyphosis data
# and evaluate it on the rows not flagged for training.
# `kyphosis` is a DataFrame defined before this fragment; its boolean
# `is_train` column selects the training rows.
kyphosis_train = kyphosis[kyphosis.is_train]
kyphosis_test = kyphosis[kyphosis.is_train == False]

# Train model
# Every column after the first is used as a feature.
# NOTE(review): if `is_train` is stored as a regular column after the first
# one, it is included among the features here — confirm that is intended.
kyphosis_features = kyphosis.columns[1:]
kyphosis_dt_clf = DecisionTreeClassifier(criterion='entropy',
                                         max_depth=None,
                                         min_samples_split=2,
                                         min_samples_leaf=1)
kyphosis_dt_clf = kyphosis_dt_clf.fit(kyphosis_train[kyphosis_features],
                                      kyphosis_train['Kyphosis'])

# Print a string representation of the tree.
# If you have graphviz (www.graphviz.org) installed, you can write a pdf
# visualization using graph.write_pdf(filename)
kyphosis_dt_data = StringIO()
tree.export_graphviz(kyphosis_dt_clf, out_file=kyphosis_dt_data)
kyphosis_dt_graph = pydotplus.parser.parse_dot_data(
    kyphosis_dt_data.getvalue())
print(kyphosis_dt_graph.to_string())

# Predict classes of test set and evaluate
kyphosis_dt_pred = kyphosis_dt_clf.predict(kyphosis_test[kyphosis_features])

# Confusion matrix with a fixed label order: 'absent' first, then 'present'.
kyphosis_dt_cm = metrics.confusion_matrix(kyphosis_test['Kyphosis'],
                                          kyphosis_dt_pred,
                                          labels=['absent', 'present'])
print(kyphosis_dt_cm)
kyphosis_dt_acc = metrics.accuracy_score(kyphosis_test['Kyphosis'],
                                         kyphosis_dt_pred)
kyphosis_dt_prec = metrics.precision_score(kyphosis_test['Kyphosis'],
#!/usr/bin/env python
'''Read and write a string as a file-like object.'''

from sklearn.externals.six import StringIO

# Create an in-memory text buffer and write one line into it.
mysample = StringIO()
mysample.write('My first testing line.')
print(
    mysample)  # prints the object's repr, not the text written to it

# Retrieve the buffer's contents using getvalue().
content = mysample.getvalue()
print(content)

# Close the buffer once the contents have been read.
mysample.close()
Exemple #45
0
from IPython.display import Image

import os
import sys


def conda_fix(graph):
    """Tell ``graph`` where to find the Graphviz executables.

    Builds ``<sys.base_exec_prefix>/Library/bin/graphviz/<tool>.exe`` for each
    of dot, twopi, neato, circo and fdp, and passes the resulting mapping to
    ``graph.set_graphviz_executables``.
    """
    graphviz_dir = os.path.join(sys.base_exec_prefix, "Library", "bin", "graphviz")
    executables = {}
    for tool in ("dot", "twopi", "neato", "circo", "fdp"):
        executables[tool] = os.path.join(graphviz_dir, "{}.exe".format(tool))
    graph.set_graphviz_executables(executables)


from sklearn import tree

# Export the fitted tree `dt` as DOT text, render it with the Graphviz
# binaries configured by conda_fix, save it as a PDF and display it as a PNG.
# `dt` and `X` are defined before this fragment.
buffer = StringIO()
# NOTE(review): class_names is given the feature column names (X.columns),
# the same value as feature_names — class labels would normally come from the
# target; confirm.
tree.export_graphviz(dt,
                     out_file=buffer,
                     feature_names=X.columns,
                     class_names=X.columns,
                     filled=True,
                     rounded=True,
                     special_characters=True)
graph = pydotplus.graph_from_dot_data(buffer.getvalue())
# Must run before any rendering call so the executables can be located.
conda_fix(graph)
graph.write_pdf("loan_tree.pdf")
Image(graph.create_png())
#ada-booster #boosting
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(base_estimator=dt,
def test_graphviz_toy():
    """Check the exact DOT text produced by export_graphviz on toy data.

    Fits small trees on data defined outside this function (X, y, y2, w) and
    compares the exported text character for character against hand-written
    expectations for: default options, feature_names, class_names, plot
    options, max_depth truncation, multi-output with sample weights, and a
    regressor with layout options.
    """
    # Check correctness of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3,
                                 min_samples_split=2,
                                 criterion="gini",
                                 random_state=2)
    clf.fit(X, y)

    # Test export code
    out = StringIO()
    export_graphviz(clf, out_file=out)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test with feature_names
    out = StringIO()
    export_graphviz(clf, out_file=out, feature_names=["feature0", "feature1"])
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="feature0 <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test with class_names
    out = StringIO()
    export_graphviz(clf, out_file=out, class_names=["yes", "no"])
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]\\nclass = yes"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n' \
                'class = yes"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n' \
                'class = no"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test plot_options
    out = StringIO()
    export_graphviz(clf,
                    out_file=out,
                    filled=True,
                    impurity=False,
                    proportion=True,
                    special_characters=True,
                    rounded=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled, rounded", color="black", ' \
                'fontname=helvetica] ;\n' \
                'edge [fontname=helvetica] ;\n' \
                '0 [label=<X<SUB>0</SUB> &le; 0.0<br/>samples = 100.0%<br/>' \
                'value = [0.5, 0.5]>, fillcolor="#e5813900"] ;\n' \
                '1 [label=<samples = 50.0%<br/>value = [1.0, 0.0]>, ' \
                'fillcolor="#e58139ff"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label=<samples = 50.0%<br/>value = [0.0, 1.0]>, ' \
                'fillcolor="#399de5ff"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test max_depth
    # With max_depth=0 only the root is expanded; children appear as "(...)".
    out = StringIO()
    export_graphviz(clf, out_file=out, max_depth=0, class_names=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]\\nclass = y[0]"] ;\n' \
                '1 [label="(...)"] ;\n' \
                '0 -> 1 ;\n' \
                '2 [label="(...)"] ;\n' \
                '0 -> 2 ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test max_depth with plot_options
    out = StringIO()
    export_graphviz(clf, out_file=out, max_depth=0, filled=True, node_ids=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled", color="black"] ;\n' \
                '0 [label="node #0\\nX[0] <= 0.0\\ngini = 0.5\\n' \
                'samples = 6\\nvalue = [3, 3]", fillcolor="#e5813900"] ;\n' \
                '1 [label="(...)", fillcolor="#C0C0C0"] ;\n' \
                '0 -> 1 ;\n' \
                '2 [label="(...)", fillcolor="#C0C0C0"] ;\n' \
                '0 -> 2 ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test multi-output with weighted samples
    clf = DecisionTreeClassifier(max_depth=2,
                                 min_samples_split=2,
                                 criterion="gini",
                                 random_state=2)
    clf = clf.fit(X, y2, sample_weight=w)

    out = StringIO()
    export_graphviz(clf, out_file=out, filled=True, impurity=False)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled", color="black"] ;\n' \
                '0 [label="X[0] <= 0.0\\nsamples = 6\\n' \
                'value = [[3.0, 1.5, 0.0]\\n' \
                '[3.0, 1.0, 0.5]]", fillcolor="#e5813900"] ;\n' \
                '1 [label="samples = 3\\nvalue = [[3, 0, 0]\\n' \
                '[3, 0, 0]]", fillcolor="#e58139ff"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="X[0] <= 1.5\\nsamples = 3\\n' \
                'value = [[0.0, 1.5, 0.0]\\n' \
                '[0.0, 1.0, 0.5]]", fillcolor="#e5813986"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '3 [label="samples = 2\\nvalue = [[0, 1, 0]\\n' \
                '[0, 1, 0]]", fillcolor="#e58139ff"] ;\n' \
                '2 -> 3 ;\n' \
                '4 [label="samples = 1\\nvalue = [[0.0, 0.5, 0.0]\\n' \
                '[0.0, 0.0, 0.5]]", fillcolor="#e58139ff"] ;\n' \
                '2 -> 4 ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test regression output with plot_options
    clf = DecisionTreeRegressor(max_depth=3,
                                min_samples_split=2,
                                criterion="mse",
                                random_state=2)
    clf.fit(X, y)

    out = StringIO()
    export_graphviz(clf,
                    out_file=out,
                    filled=True,
                    leaves_parallel=True,
                    rotate=True,
                    rounded=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled, rounded", color="black", ' \
                'fontname=helvetica] ;\n' \
                'graph [ranksep=equally, splines=polyline] ;\n' \
                'edge [fontname=helvetica] ;\n' \
                'rankdir=LR ;\n' \
                '0 [label="X[0] <= 0.0\\nmse = 1.0\\nsamples = 6\\n' \
                'value = 0.0", fillcolor="#e5813980"] ;\n' \
                '1 [label="mse = 0.0\\nsamples = 3\\nvalue = -1.0", ' \
                'fillcolor="#e5813900"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="mse = 0.0\\nsamples = 3\\nvalue = 1.0", ' \
                'fillcolor="#e58139ff"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="False"] ;\n' \
                '{rank=same ; 0} ;\n' \
                '{rank=same ; 1; 2} ;\n' \
                '}'

    assert_equal(contents1, contents2)
    classifier = DecisionTreeClassifier(random_state=0)
    params = {"max_depth": range(1, 11)}
    scoring_fnc = make_scorer(performance_metric)
    grid = GridSearchCV(classifier, param_grid=params, scoring=scoring_fnc, cv=cv_sets)
    grid = grid.fit(X, y)
    print(pd.DataFrame(grid.cv_results_))
    return grid.best_estimator_

# Script: evaluate the selected classifier on the test split, export its tree
# and print the features ranked by importance.
reg = fit_model(X_train, y_train)
# NOTE(review): presumably fit_model already returns a fitted estimator, which
# would make this second fit redundant — confirm before removing.
reg.fit(X_train, y_train)

Z = reg.predict(X_test)
# Serialized copy of the fitted model; not used further in this fragment.
s = pickle.dumps(reg)

print(metrics.confusion_matrix(y_test, Z))
print(metrics.classification_report(y_test, Z))

# Only needed by the commented-out in-memory export below.
dot_data = StringIO()

# Feature names are every column of `data` except the first.
export_graphviz(reg, out_file="dot.dot", feature_names=list(data)[1:], class_names=["edible", "poisonous"])
#export_graphviz(reg, out_file=dot_data, feature_names=list(data)[1:])
# graph_ = pydot.graph_from_dot_data(dot_data.getvalue())
# graph_.write_pdf("tree.pdf")

feature_importances = reg.feature_importances_

# `fi` is kept as before for anything that still reads it, but note that it is
# keyed by importance value: features sharing the same importance (for example
# several at 0.0) overwrite each other in it.
fi = dict(zip(feature_importances, list(data)[1:]))
# Rank the (importance, feature) pairs directly so that no feature is lost to
# a key collision; the sort is stable, so ties keep their column order.
fi_S = sorted(zip(feature_importances, list(data)[1:]), key=operator.itemgetter(0), reverse=True)

print(fi_S)
def _decision_tree_regression_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]), 
        criterion='mse',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Fit a ``DecisionTreeRegressor`` and build its model dict and report.

    Args:
        table: Tabular data indexed as ``table[feature_cols]`` and
            ``table[label_col]``.
        feature_cols: Names of the feature columns.
        label_col: Name of the target column.
        criterion .. presort: Forwarded unchanged to the
            ``DecisionTreeRegressor`` constructor.
        sample_weight, check_input, X_idx_sorted: Forwarded unchanged to
            ``DecisionTreeRegressor.fit``.

    Returns:
        ``{'model': model}`` where ``model`` holds the fitted regressor, its
        parameters, feature importances and a ``_repr_brtc_`` report.
    """
    # Keyword arguments keep the call independent of the positional order of
    # the estimator's signature.
    regressor = DecisionTreeRegressor(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        presort=presort)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight=sample_weight,
                  check_input=check_input,
                  X_idx_sorted=X_idx_sorted)

    # Resolve StringIO outside the rendering ``try`` so that a missing
    # ``sklearn.externals.six`` is not misreported as a missing Graphviz.
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        from io import StringIO

    try:
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(regressor,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        # Best effort: the tree picture is optional, so any rendering failure
        # falls back to a hint. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate.
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report: horizontal bar chart of importances, smallest at the bottom.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    # Annotate every bar with its importance value.
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    # Add tree plot

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
# Show the first five predictions next to the first five true test labels.
print(predTree[0:5])
print(y_testset[0:5])

# Evaluation: accuracy of the predictions against the test labels.
from sklearn import metrics
import matplotlib.pyplot as plt

print("Decision Tree Accuracy: ", metrics.accuracy_score(y_testset, predTree))

# Tree visualization: export the fitted tree to DOT, render it to a PNG and
# load that image back.
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree

dot_data = StringIO()
filename = "DrugTree.png"
# The first five columns of my_data are used as the feature names.
feature_names = my_data.columns[0:5]
# NOTE(review): targetNames is not used below; class names come from
# np.unique(y_trainset) instead.
targetNames = my_data["Drug"].unique().tolist()
# NOTE(review): `out` is not used below; the DOT text is read from dot_data.
out = tree.export_graphviz(drugTree,
                           feature_names=feature_names,
                           out_file=dot_data,
                           class_names=np.unique(y_trainset),
                           filled=True,
                           special_characters=True,
                           rotate=False)
graph = pydotplus.graph_from_dot_data(
    dot_data.getvalue()
)  # build a graph from the DOT text collected in dot_data
graph.write_png(filename)
# Read the rendered PNG back as an image array.
img = mpimg.imread(filename)
# The fifth column of `data` holds the class labels.
labelSet = data.iloc[:, 4]
dataConvertList = {}
labelConvertList = {}

# Convert string attribute values to integers. The counter restarts at 1 for
# every feature column, while the mapping itself is shared by all columns.
# The loop variables are deliberately NOT called `data`, so the `data`
# DataFrame above is not overwritten by a cell value.
for feat in np.array(dataSet).transpose():
    i = 1
    for value in feat:
        if value not in dataConvertList:
            dataConvertList[value] = i
            i = i + 1
# Same encoding for the labels: codes are assigned in order of first sight.
i = 1
for label in labelSet:
    if label not in labelConvertList:
        labelConvertList[label] = i
        i = i + 1
for key in dataConvertList:
    dataSet = dataSet.replace(key, dataConvertList[key])
for key in labelConvertList:
    labelSet = labelSet.replace(key, labelConvertList[key])

# Train a model
model = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
s = model.fit(dataSet, labelSet)

# Plot the decision tree with pydotplus
tree_file = StringIO()
tree.export_graphviz(model, out_file=tree_file)
graph = pydotplus.graph_from_dot_data(tree_file.getvalue())
graph.write_pdf("tree.pdf")
    def product_segment(productslist, customerid):
        """Train and render a tree on whether a customer bought each product.

        For every stock code in ``productslist`` the product's price and
        category are fetched through ``session``, the price is mapped to a
        bucket with ``price_bucket_of_product`` and the product is labelled
        1 when ``customerid`` bought it, else 0. A ``DecisionTreeClassifier``
        is fitted on the one-hot encoded features and written to a PNG.

        Per-category price ranges are appended to ``actualcategoryspresent``,
        ``starts``, ``ends`` and ``bucketsizes``, which come from the
        enclosing scope.

        Args:
            productslist: Stock codes (strings) of the products to use.
            customerid: Customer id (string) whose purchases are the target.
        """
        descriptions = []
        mrp = []
        categoryofproduct = []
        yes_no = []
        cat = []
        # A space separates the `set` and `return` clauses.
        xx = ('match(c:customerid)-[r:Bought_this]->(s:stockcode) '
              'set r.Actual_Price=toFloat(r.Actual_Price) '
              'return s.Category,collect(DISTINCT r.Actual_Price) as pricelist '
              'order by s.Category, pricelist')
        pricebucket = session.run(xx)

        for record in pricebucket:
            prices = record['pricelist']
            actualcategoryspresent.append(record['s.Category'])
            if len(prices) == 1:
                # A single price would give a zero-width range; widen it.
                start = prices[0] - 1
                end = prices[0] + 1
            else:
                start = min(prices)
                end = max(prices)

            starts.append(start)
            ends.append(end)
            bucketsizes.append((end - start) / 10)
            cat.append(record['s.Category'])

        # Values are passed as query parameters instead of being spliced into
        # the Cypher text, so quotes in a stock code or customer id cannot
        # alter the query.
        product_query = (
            'match() - [r:Bought_this]->(s:stockcode{StockCode: $stockcode}) '
            'return distinct s.Description as description, s.Category as category, r.Actual_Price as MRP '
            'limit 1')
        for stockcode in productslist:
            for record in session.run(product_query, stockcode=stockcode):
                descriptions.append(record['description'])
                mrp.append(record['MRP'])
                categoryofproduct.append(record['category'])

        purchase_query = (
            'optional MATCH(c:customerid{CustomerID: $customerid})-[r:Bought_this]->(s:stockcode{StockCode: $stockcode}) '
            'return distinct '
            'case '
            'when r.Quantity IS NULL THEN 0 '
            'when r.Quantity IS NOT NULL THEN 1 '
            'else r.Quantity END AS Quantity ')
        for stockcode in productslist:
            for record in session.run(purchase_query,
                                      customerid=customerid,
                                      stockcode=stockcode):
                yes_no.append(record[0])

        # Collect the rows first and build the frame once; DataFrame.append
        # copies the whole frame on every call and no longer exists in
        # pandas 2.x.
        rows = []
        for idx in range(len(productslist)):
            vv = cat.index(categoryofproduct[idx])
            bucket = price_bucket_of_product(starts[vv], ends[vv],
                                             bucketsizes[vv], mrp[idx])
            rows.append({
                'Description': descriptions[idx],
                'MRP': mrp[idx],
                'category': categoryofproduct[idx],
                'PriceBucket': bucket
            })
        df1 = pd.DataFrame(
            rows, columns=['Description', 'MRP', 'category', 'PriceBucket'])

        # The description is not used as a feature; category and price bucket
        # are one-hot encoded.
        df1 = df1.drop('Description', axis=1)

        hot_category = pd.get_dummies(df1.category)
        df1 = df1.join(hot_category)
        df1 = df1.drop('category', axis=1)

        hot_PriceBucket = pd.get_dummies(df1.PriceBucket)
        df1 = df1.join(hot_PriceBucket)
        df1 = df1.drop('PriceBucket', axis=1)

        train_data = df1.values
        train_target = yes_no

        classifier = tree.DecisionTreeClassifier()
        classifier.fit(train_data, train_target)

        dot_data = StringIO()
        tree.export_graphviz(classifier,
                             out_file=dot_data,
                             feature_names=df1.columns.tolist(),
                             class_names=['No', 'Yes'],
                             filled=True,
                             rounded=True,
                             impurity=False)

        # The former `for i in dot_data: print(i)` loop was dropped: the
        # buffer position is at the end after writing, so it never printed.
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

        # Raw string: same path value, without invalid escape sequences.
        graph.write_png(r'D:\project\search\static\stuff/segment.png')
# Report the remaining Gini-based p values, separated by empty lines.
print("")
print("Data 1's Upper p value with Gini -> ", gini_upper_p_data1)
print("")
print("Data 2's Lower p value with Gini -> ", gini_lower_p_data2)
print("")
print("Data 2's Upper p value with Gini -> ", gini_upper_p_data2)
print("")

#Part i

# Feature names used when exporting the trees for data set 1 and data set 2.
data_1_col_names = [
    "age", "job", "marital", "education", "balance", "housing", "duration",
    "poutcome"
]
data_2_col_names = ["job", "marital", "education", "housing"]
# One in-memory DOT buffer per (data set, criterion) combination; only the
# data-1 entropy buffer is filled in this part of the script.
dot_data1_entropy = StringIO()
dot_data2_entropy = StringIO()
dot_data1_gini = StringIO()
dot_data2_gini = StringIO()
# Export the tree `entropy_data_1` as DOT text with the two classes "0"/"1".
export_graphviz(entropy_data_1,
                out_file=dot_data1_entropy,
                filled=True,
                rounded=True,
                special_characters=True,
                feature_names=data_1_col_names,
                class_names=["0", "1"])
# Build the graph from the DOT text and save it as a PNG file.
data_1_entropy_graph = pydotplus.graph_from_dot_data(
    dot_data1_entropy.getvalue())
data_1_entropy_graph.write_png('data_1_entropy.png')
# NOTE(review): bare expression, presumably meant for inline display in a
# notebook; as a script statement the result is discarded.
Image(data_1_entropy_graph.create_png())
Exemple #53
0
def wlasne_drzewo():
    """Render the module-level ``clf`` tree to ``my-tree.pdf``."""
    buffer = StringIO()
    tree.export_graphviz(clf, out_file=buffer)
    dot_source = buffer.getvalue()
    pydotplus.graph_from_dot_data(dot_source).write_pdf("my-tree.pdf")
Exemple #54
0
    RMSECvAll.append(math.sqrt( sum( (Originaly-PredictedYcv)**2 )/ OriginalX.shape[0]))
# Plot the cross-validation RMSE against every candidate tree depth.
plt.figure()
plt.plot(CandidatesOfDTDepth, RMSECvAll, 'k', linewidth=2)
plt.xlabel("Depth of tree for DT")
plt.ylabel("RMSE in CV for DT")
plt.show()
# Pick the first candidate depth whose CV RMSE equals the minimum.
OptimalMaxDepthDT = CandidatesOfDTDepth[np.where( RMSECvAll == np.min(RMSECvAll) )[0][0] ]
# Refit on all samples with the selected depth; the results are stored in
# column 7 of the shared result arrays.
DTResult = tree.DecisionTreeRegressor(max_depth=OptimalMaxDepthDT, min_samples_leaf=MinSamplesLeafDT)
DTResult.fit( OriginalX, Originaly )
CalculatedYAll[:,7] = DTResult.predict(OriginalX)
# Fix the NumPy seed around the cross-validated prediction, then re-seed.
np.random.seed(10000)
PredictedYcvAll[:,7] = model_selection.cross_val_predict(DTResult, OriginalX, Originaly, cv=FoldNumber)
np.random.seed()
# Check rules of DT
# data.csv is read again to obtain column names; its first column is used as
# the index, columns[0] is passed as class_names and columns[1:] as features.
datapdDT = pd.read_csv("data.csv", encoding='SHIFT-JIS', index_col=0)
with contextlib.closing(StringIO()) as DTfile:
    tree.export_graphviz(DTResult, out_file=DTfile,
                         feature_names=datapdDT.columns[1:],
                         class_names=datapdDT.columns[0])
    output = DTfile.getvalue().splitlines()
# Insert a node font setting as the second line of the DOT text.
output.insert(1, 'node[fontname="meiryo"];')
with open('DTResult.dot', 'w') as f:
    f.write('\n'.join(output))
# Estimate Y for new samples based on DT in 1. and 2.
PredictedY1All[:,7] = DTResult.predict(OriginalX_prediction1)
PredictedY2All[:,7] = DTResult.predict(OriginalX_prediction2)

# 9. Random Forest (RF)
NumberOfTreesRF = 500 # 1. Number of decision trees
CandidatesOfXvariablesRateRF = np.arange( 1, 10, dtype=float)/10 #candidates of the ratio of the number of explanatory variables (X) for decision trees
# Run RFR for every candidate of X-ratio and estimate values of objective variable (Y) for Out Of Bag (OOB) samples
Exemple #55
0
def plot_tree(clf, file_name, **kwargs):
    """Render a fitted decision tree to a PDF file.

    Args:
        clf: Fitted tree estimator passed to ``tree.export_graphviz``.
        file_name: Path of the PDF file to write.
        **kwargs: Extra options forwarded to ``tree.export_graphviz``.
    """
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data, **kwargs)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # Newer pydot releases return a list of graphs while older ones return a
    # single graph object; accept both.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf(file_name)
Exemple #56
0
def _said_yes(answer):
    """Return True when *answer* contains the letter ``y`` in any case."""
    return 'y' in answer.lower()


def _print_gap():
    """Print the two blank separator lines used between sections."""
    print(" ")
    print(" ")


def main(data):
    """Run the interactive decision-tree walkthrough on an iris-style frame.

    The user can inspect the data, choose the test-set size and the split
    criterion, sees the evaluation and the rendered tree (``dtree.png``),
    and can then classify hand-entered measurements.

    Args:
        data: DataFrame whose first four columns are the features and which
            has a ``Species`` column. The ``Species`` column is replaced in
            place by its integer label encoding.

    Returns:
        ``None`` normally; an empty tuple when the criterion answer is
        neither Gini nor Entropy.
    """
    print(" ")
    print("Before going into our task do you want to check out the data ? ")
    print(" ")
    print("y- Yes")
    print("n-No")
    if _said_yes(input("Select your choice : ")):
        _print_gap()
        print("Ok ! Let's explore the data")
        print('\nOur dataset looks like : \n', data.head())
        print(" ")
        print('\nThe shape of the data is: ', data.shape)
        print(" ")
        print("\nWhat about the datatypes: \n", data.dtypes)
        print(" ")
        print("\nThe whole data can be described as : \n", data.describe())
        _print_gap()

    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import export_graphviz
    import pydotplus
    # sklearn.externals.six is gone from newer scikit-learn releases; on
    # Python 3 it was only an alias for io.StringIO.
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        from io import StringIO

    _print_gap()
    print("Ok, now let's train the model and make predictions using it")
    _print_gap()
    print("\nDivide the data into attributes(inputs) and labels(outputs)")
    x = data.iloc[:, [0, 1, 2, 3]].values
    le = LabelEncoder()
    # The caller's frame is updated in place, as before.
    data['Species'] = le.fit_transform(data['Species'])
    y = data['Species'].values
    print("\nAttributes:\n", x)
    print("\nLables :", y)
    print("Next step is to split this data into training and test sets.")

    _print_gap()
    print("\nTrain-Test-Split : ")
    print("The test size in default is 20. Would you like to change it ?")
    print("y- Yes")
    print("n- No")
    # This answer is now case-insensitive like the other prompts.
    custom_size = _said_yes(input("\nYour choice : "))
    if custom_size:
        # The size is entered as a percentage.
        tsize = int(input("Specify the test size you want : ")) / 100
    else:
        tsize = 0.2
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=tsize,
                                                        random_state=0)
    if custom_size:
        print("Splitted with test size ", tsize)
    else:
        print("Splitted with default test size")
    _print_gap()
    print("\nLets explore the splitted data : ")
    print("X_Train data  : ", x_train.shape)
    print("X_Test data   : ", x_test.shape)
    print("Y_Train data  : ", y_train.shape)
    print("Y_Test data   : ", y_test.shape)
    print(y_test)

    _print_gap()
    print("Training the Algorithm")
    print("We are going to train the model")
    print("Which training method do you want ?")
    print("g - Gini")
    print("e - Entropy")
    method = input("\nYour choice : ").lower()
    # "g" is checked first, exactly as before.
    if 'g' in method:
        criterion = "gini"
    elif 'e' in method:
        criterion = 'entropy'
    else:
        print("Wrong Choice")
        return ()
    dtmodel = DecisionTreeClassifier(criterion=criterion,
                                     random_state=0,
                                     max_depth=3,
                                     min_samples_leaf=5)
    dtmodel.fit(x_train, y_train)

    _print_gap()
    print("Testing the Algorithm")
    y_pred = dtmodel.predict(x_test)
    print("Predicted values:")
    print(y_pred)
    print("Completed")
    print("Accuracy:", accuracy_score(y_test, y_pred) * 100)
    print("Report:", classification_report(y_test, y_pred))
    print("Confusion Matrix : ")
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(9, 9))
    sns.heatmap(cm,
                annot=True,
                fmt=".3f",
                linewidths=.5,
                square=True,
                cmap='Blues')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    all_sample_title = 'Accuracy Score: {0}'.format(
        dtmodel.score(x_test, y_test))
    plt.title(all_sample_title, size=15)
    plt.show()

    _print_gap()
    print("Now, let's visualize the Decision Tree to understand it better.")
    # Feature names are all columns except the target.
    df = data.copy()
    df = df.drop('Species', axis=1)
    dot_data = StringIO()
    export_graphviz(dtmodel,
                    out_file=dot_data,
                    feature_names=df.columns,
                    filled=True,
                    rounded=True,
                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png("dtree.png")
    im = Image.open(r"dtree.png")
    im.show()
    _print_gap()

    # Classify user-supplied measurements until the user declines.
    while True:
        print("\nWould you like to give try another input ?")
        print("y - Yes")
        print("n - No")
        if not _said_yes(input("Your answer : ")):
            break
        spe = [
            float(input("Give Sepal Length in cm : ")),
            float(input("Give Sepal width in cm : ")),
            float(input("Give Petal Length in cm : ")),
            float(input("Give Petal width in cm : ")),
        ]
        y_pred = dtmodel.predict([spe])
        _print_gap()
        print("Species according to encoding : ")
        print("0 - Iris-setosa")
        print("1 - Iris-versicolor")
        print("2 - Iris-virginica")
        _print_gap()
        print("The predicted species is ", y_pred)
        print("Were you expecting the same ?")
        print("OK")
        print("Do you want to try again ?")
        print("y - Yes")
        print("n - No")
        # This answer is now case-insensitive like the other prompts.
        if not _said_yes(input("Your choice : ")):
            break

    _print_gap()
    print("Yippee... We learned to use Decision Tree")
    print("I really had fun")
    print("Hope you enjoyed it too")
    print("Bye")
iris = load_iris()
# One frame holding the four measurements plus the numeric target column.
df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                     columns=iris['feature_names'] + ['target'])
# df['label'] = df.target.replace(dict(enumerate(df.target_names)))
print(df.head())  # first rows of the combined frame
print(iris.feature_names)
print(iris.target_names)
print(df.describe())  # summary statistics, e.g. minimum and maximum values
x = iris['data']
y = iris['target']

iris_df = pd.DataFrame(x, columns=iris['feature_names'])
# Call head(); printing the bare attribute shows the bound method instead of
# the first rows.
print(iris_df.head())
x, y = shuffle(x, y, random_state=0)  # random shuffle
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
# Entropy-based tree limited to depth 3.
classifier=DecisionTreeClassifier(criterion="entropy", max_depth=3)
clf = classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))  # accuracy on the test split
dot_data = StringIO()
tree.export_graphviz(classifier,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True, rounded=True,
                     impurity=False,
                     proportion=True)
graph=pydot.graph_from_dot_data(dot_data.getvalue())  # parse the DOT text
graph[0].write_pdf("iris3.pdf")  # the first parsed graph is written as PDF


Exemple #58
0
    def train():
        """Train the questionnaire decision tree and derive its rule paths.

        Loads the CSV data through ``DecisionTreeQuestionnaire``, one-hot
        encodes the features, fits an entropy-based
        ``DecisionTreeClassifier``, writes ``decisiontree.png``, calls
        ``tree_to_code`` / ``tree_to_code2`` and sets
        ``DecisionTreeQuestionnaire.isModelTrained``. The first 56 feature
        names are appended to ``DecisionTreeQuestionnaire.feature_names``.
        """
        balance_data_excel = DecisionTreeQuestionnaire.get_csv_file_data()
        # Clean the data: empty or whitespace-only cells become the string
        # 'nan', then every cell is converted to str.
        balance_data_excel = balance_data_excel.replace(r'^\s*$',
                                                        str(np.nan),
                                                        regex=True).replace(
                                                            '', str(np.nan))
        balance_data_excel = balance_data_excel.applymap(str)

        print("Dataset Length:: ", len(balance_data_excel))
        print("Dataset Shape:: ", balance_data_excel.shape)

        X = balance_data_excel.iloc[:, :-1]
        # NOTE(review): the target is read from fixed column 22 while X uses
        # "all but the last column" - confirm the two always agree.
        y = balance_data_excel.iloc[:, 22]

        # Index.tolist() replaces get_values().tolist(); get_values() was
        # removed in pandas 1.0 and both give the same list of labels.
        X = DecisionTreeQuestionnaire.encode_onehot(X, X.columns.tolist())

        le_y = LabelEncoder()
        y = le_y.fit_transform(y)

        # Drop the one-hot columns that encode the placeholder value 'nan'
        # (the part after the first '=' in the column name).
        for column in list(X.columns):
            if column.split('=')[1] == 'nan':
                X.drop(column, axis=1, inplace=True)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.1,
                                                            random_state=100)

        clf_entropy = DecisionTreeClassifier(criterion="entropy",
                                             random_state=100,
                                             max_depth=100,
                                             min_samples_leaf=5,
                                             min_samples_split=8)
        abc = clf_entropy.fit(X_train, y_train)

        y_pred = clf_entropy.predict(X_test)

        print("Accuracy is ", accuracy_score(y_test, y_pred) * 100)
        # Convert the target classes to {encoded index: class label}.
        print("le2")
        class_names = dict(enumerate(le_y.classes_))
        print(class_names)

        # Same index -> name mapping for (at most) the first 56 features.
        leading_columns = X.columns[0:56]
        features = dict(enumerate(leading_columns))
        DecisionTreeQuestionnaire.feature_names.extend(leading_columns)

        dot_data = StringIO()
        export_graphviz(clf_entropy,
                        out_file=dot_data,
                        filled=True,
                        rounded=True,
                        special_characters=True,
                        feature_names=features,
                        class_names=class_names)

        # graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        graph = pydot.graph_from_dot_data(dot_data.getvalue())
        # Newer pydot releases return a list of graphs while older ones
        # return a single graph object; accept both.
        if isinstance(graph, (list, tuple)):
            graph = graph[0]

        graph.write_png('decisiontree.png')

        DecisionTreeQuestionnaire.tree_to_code(abc)

        DecisionTreeQuestionnaire.tree_to_code2(abc, class_names)

        DecisionTreeQuestionnaire.isModelTrained = True

        for path in DecisionTreeQuestionnaire.paths:
            for k, v in path.items():
                print(k + ' : ' + v)
            print("-----------------------------------")

            # NOTE(review): this message is printed after every path; it
            # looks like a leftover from other code - confirm it is intended.
            print('Your inputs are not defined')
Exemple #59
0
def printTree(clf):
    """Render the fitted tree ``clf`` to ``tree.png``.

    Returns whatever ``write_png`` returns.
    """
    buffer = StringIO()
    export_options = dict(filled=True, rounded=True, special_characters=True)
    tree.export_graphviz(clf, out_file=buffer, **export_options)
    rendered = pydotplus.graph_from_dot_data(buffer.getvalue())
    return rendered.write_png('tree.png')
Exemple #60
0
    def decision_tree_training(self):
        """Train, visualise, evaluate and persist the posture decision tree.

        Reads ``self.feature`` and ``self.label``, sets ``self.target_names``
        and ``self.feature_names``, writes the train/test splits, the true
        and predicted test labels and the fitted model below the
        ``decision_tree`` output directory, and renders the tree to
        ``decision_tree.pdf``.
        """
        self.target_names = ['lying', 'lie on the side', 'sitting', 'standing']
        # Feature names are the strings '1' .. '33'.
        self.feature_names = [str(number) for number in range(1, 34)]
        print('---start training decision tree---')

        # Hold out 25% of the samples for testing.
        X_train, X_test, Y_train, Y_test = train_test_split(
            self.feature, self.label, test_size=0.25, random_state=0)

        out_dir = '/home/hts/posture_classification_based_pose/decision_tree/'
        split_dumps = (
            ('X_train.txt', X_train, '%f'),
            ('X_test.txt', X_test, '%f'),
            ('Y_train.txt', Y_train, '%d'),
            ('Y_test.txt', Y_test, '%d'),
        )
        for file_name, values, number_format in split_dumps:
            np.savetxt(out_dir + file_name, values, fmt=number_format)
        print('---split data done!---')
        print()

        # Gini criterion, i.e. the CART algorithm, is used by default.
        clf = DecisionTreeClassifier(criterion='gini', random_state=0)
        flat_train_labels = Y_train.ravel()
        print(np.shape(X_train), np.shape(flat_train_labels))
        clf.fit(X_train, flat_train_labels)

        # Visualisation: export to DOT and render a PDF.
        dot_buffer = StringIO()
        tree.export_graphviz(clf,
                             out_file=dot_buffer,
                             feature_names=self.feature_names,
                             class_names=self.target_names,
                             filled=True,
                             rounded=True,
                             impurity=False)
        pydotplus.graph_from_dot_data(
            dot_buffer.getvalue()).write_pdf("decision_tree.pdf")

        # Validate on the test set.
        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        Y_true = Y_test.ravel()
        Y_pred = clf.predict(X_test)
        np.savetxt(out_dir + 'Y_true.txt', Y_true, fmt='%d')
        np.savetxt(out_dir + 'Y_pred.txt', Y_pred, fmt='%d')
        report = classification_report(Y_true,
                                       Y_pred,
                                       target_names=self.target_names)
        print(report)
        print()

        print('Decision Tree model saving ......')
        model_save_path = out_dir + 'train_decision_tree_model.m'
        joblib.dump(clf, model_save_path)

        return None