Ejemplo n.º 1
0
def test_graphviz_toy():
    """Verify the DOT text emitted by export_graphviz for the toy dataset.

    Three exports are compared against hand-written DOT: the default
    export, one with explicit feature names, and one truncated with
    ``max_depth=0``.
    """
    clf = DecisionTreeClassifier(criterion="gini",
                                 max_depth=3,
                                 min_samples_split=1,
                                 random_state=2)
    clf.fit(X, y)

    def render(**options):
        # Export the fitted tree into a fresh in-memory buffer and return the text.
        buf = StringIO()
        export_graphviz(clf, out_file=buf, **options)
        return buf.getvalue()

    def full_tree(split_name):
        # Expected DOT for the complete tree; only the root's split label varies.
        return ("digraph Tree {\n"
                "0 [label=\"" + split_name + " <= 0.0000\\ngini = 0.5\\n"
                "samples = 6\", shape=\"box\"] ;\n"
                "1 [label=\"gini = 0.0000\\nsamples = 3\\n"
                "value = [ 3.  0.]\", shape=\"box\"] ;\n"
                "0 -> 1 ;\n"
                "2 [label=\"gini = 0.0000\\nsamples = 3\\n"
                "value = [ 0.  3.]\", shape=\"box\"] ;\n"
                "0 -> 2 ;\n"
                "}")

    # Default export: features are labelled X[i].
    assert_equal(render(), full_tree("X[0]"))

    # Export with caller-supplied feature names.
    assert_equal(render(feature_names=["feature0", "feature1"]),
                 full_tree("feature0"))

    # Export truncated at the root: children collapse to "(...)" placeholders.
    truncated = ("digraph Tree {\n"
                 "0 [label=\"X[0] <= 0.0000\\ngini = 0.5\\n"
                 "samples = 6\", shape=\"box\"] ;\n"
                 "1 [label=\"(...)\", shape=\"box\"] ;\n"
                 "0 -> 1 ;\n"
                 "2 [label=\"(...)\", shape=\"box\"] ;\n"
                 "0 -> 2 ;\n"
                 "}")
    assert_equal(render(max_depth=0), truncated)
Ejemplo n.º 2
0
def test_graphviz_toy():
    """Check correctness of graphviz output on a toy dataset.

    Covers the default export, an export with explicit feature names, and
    the error raised when ``feature_names`` is too short for the tree.
    """
    clf = tree.DecisionTreeClassifier(max_depth=3, min_samples_split=1)
    clf.fit(X, y)

    def expected_dot(split_name):
        # Expected DOT text; only the root's split label differs between cases.
        return ("digraph Tree {\n"
                "0 [label=\"" + split_name + " <= 0.0000\\nerror = 0.5"
                "\\nsamples = 6\\nvalue = [ 3.  3.]\", shape=\"box\"] ;\n"
                "1 [label=\"error = 0.0000\\nsamples = 3\\n"
                "value = [ 3.  0.]\", shape=\"box\"] ;\n"
                "0 -> 1 ;\n"
                "2 [label=\"error = 0.0000\\nsamples = 3\\n"
                "value = [ 0.  3.]\", shape=\"box\"] ;\n"
                "0 -> 2 ;\n"
                "}")

    # test export code
    out = StringIO()
    tree.export_graphviz(clf, out_file=out)
    contents1 = out.getvalue()
    contents2 = expected_dot("X[0]")

    assert contents1 == contents2, \
        "graphviz output test failed\n: %s != %s" % (contents1, contents2)

    # test with feature_names
    out = StringIO()
    # Do not rebind ``out`` to the return value: export_graphviz only returns
    # the DOT text when out_file is None, so the buffer itself must be read.
    tree.export_graphviz(clf, out_file=out,
                         feature_names=["feature1", ""])
    contents1 = out.getvalue()
    contents2 = expected_dot("feature1")

    assert contents1 == contents2, \
        "graphviz output test failed\n: %s != %s" % (contents1, contents2)

    # test improperly formed feature_names
    out = StringIO()
    assert_raises(IndexError, tree.export_graphviz,
                  clf, out, feature_names=[])
Ejemplo n.º 3
0
def visualize_tree(clf, outname, headers):
    """Export the fitted tree ``clf`` to a PDF file at ``outname``.

    ``headers`` supplies the feature names shown in the split labels.
    """
    from sklearn.externals.six import StringIO
    import pydot

    buf = StringIO()
    feature_labels = list(headers)
    tree.export_graphviz(clf, out_file=buf, feature_names=feature_labels)

    # Re-encode the DOT text from latin1 to UTF-8 before handing it to pydot.
    # NOTE(review): str.decode only exists on Python 2 byte strings.
    dot_text = buf.getvalue()
    dot_utf8 = dot_text.decode('latin1').encode('utf8')

    pydot.graph_from_dot_data(dot_utf8).write_pdf(outname)
def applyDecisionTree(trainData, trainTargets, testData, testTargets, featureNames):
    """Train a decision tree, dump it to ``DecisionTree.dot`` and report test performance.

    Args:
        trainData: training samples passed to ``fit``.
        trainTargets: training labels passed to ``fit``.
        testData: samples to classify (one row per sample).
        testTargets: expected labels, forwarded to ``showPerformance``.
        featureNames: feature names used for the split labels in the DOT file.
    """
    model = DecisionTreeClassifier().fit(trainData, trainTargets)

    # Write a graph description of the tree (truncated at depth 5) to disk.
    print("Feature names:", featureNames)
    export_graphviz(model, out_file="DecisionTree.dot",
                    feature_names=featureNames, max_depth=5)

    # Classify all samples in one call: predict() expects a 2-D batch, and
    # feeding it one 1-D sample at a time is rejected by newer scikit-learn.
    classification = list(model.predict(testData))

    print("\nUsing a Decision Tree:")
    showPerformance(testTargets, classification)
Ejemplo n.º 5
0
def decision_tree(train_features, train_labels, test_features, test_labels, feature_names):
    """Fit a decision-tree regressor, print its errors and write the tree to ``tree.pdf``.

    Args:
        train_features: training samples.
        train_labels: training targets.
        test_features: held-out samples.
        test_labels: held-out targets.
        feature_names: feature names used for the split labels in the plot.

    Returns:
        Tuple ``(test_results, train_results)`` holding the predictions for
        the test and training sets after they have passed through
        ``cap_results``.
    """
    regressor = tree.DecisionTreeRegressor()
    regressor.fit(train_features, train_labels)

    test_results = cap_results(regressor.predict(test_features))
    train_results = cap_results(regressor.predict(train_features))

    # %-formatting yields the same text as the Python 2 print statement while
    # also being valid Python 3.
    print("test result %s" % (metrics.mean_squared_error(test_labels, test_results),))
    print("test r2 %s" % (metrics.r2_score(test_labels, test_results),))
    print("train result %s" % (metrics.mean_squared_error(train_labels, train_results),))
    print("train r2 %s" % (metrics.r2_score(train_labels, train_results),))

    # Graph export. A regressor has no ``classes_`` attribute, so no
    # class_names are passed (reading regressor.classes_ raised AttributeError).
    dot_data = StringIO()
    tree.export_graphviz(regressor, out_file=dot_data,
                         special_characters=True,
                         impurity=False,
                         feature_names=feature_names)

    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("tree.pdf")

    return (test_results, train_results)
def tree3():
    """Build a decision tree from the posted form fields and return the result page.

    On a POST request, the form value under ``'DV'`` names the target column
    of the global ``df``; every other form value except ``"Build Tree"`` is
    used as a feature column. The tree's DOT text is embedded in the returned
    HTML together with the first 15 rows of ``df``. Any other request method
    returns the placeholder string ``'helloo'``.
    """
    global final_html
    global df, df_train, df_test, test_train_created, origin_df
    init_style_string = template.style_string
    if request.method != 'POST':
        return 'helloo'

    form = MultiDict(request.form)
    DV_tree = form.get('DV')
    # Feature columns: every submitted value except the submit button label
    # and the chosen target column.
    chi_key = [value for value in form.values()
               if value != "Build Tree" and value != DV_tree]

    features = df.loc[:, chi_key].values
    Y = df[DV_tree]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(features, Y.values)

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    k = dot_data.getvalue()

    left_px = 600
    width_px = 150
    top_px = 50
    height_px = 309
    # NOTE(review): the HTML built here is not used below; the page embeds the
    # raw DOT text ``k`` instead. Confirm which one was intended.
    build_tree_html(k, init_style_string, left_px, width_px, top_px, height_px)

    temp_df = df[0:15]
    t = """</div><div style="width:600px; height:700px; position: absolute; top: 20px; left:500px;"><br> Decision Tree result <br>"""
    final_html = template.s1 + t + k + "<br><br></div>" + temp_df.to_html()
    return final_html
Ejemplo n.º 7
0
def drawDecisionTree(dt, filename, featureNames, classNames):
    """Print the feature and class names, then render tree ``dt`` to a PNG at ``filename``."""
    buf = StringIO()
    print(featureNames)
    print(classNames)
    tree.export_graphviz(
        dt,
        out_file=buf,
        feature_names=featureNames,
        class_names=classNames,
        rounded=True,
        special_characters=True,
        filled=True)
    pydot.graph_from_dot_data(buf.getvalue()).write_png(filename)
Ejemplo n.º 8
0
def run_DT_model_2(df, criteria_col):
    """Fit a decision tree predicting ``criteria_col`` and render it as an image.

    Rows with missing values in the selected columns are dropped, the data is
    split 80/20 (stratified on ``criteria_col``), and the held-out accuracy
    and confusion matrix are printed.

    Args:
        df: DataFrame containing ``criteria_col`` and the feature columns
            listed in ``tree_col`` below.
        criteria_col: name of the label column (e.g. a 0/1 "high value" flag).

    Returns:
        Tuple ``(image, tree_train, tree_test)``: the rendered tree as an
        IPython ``Image`` plus the training and test DataFrames.
    """
    from sklearn.metrics import confusion_matrix
    # sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
    # replacement and fall back only on very old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split
    # sklearn.externals.six was removed in scikit-learn 0.23.
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        from io import StringIO
    from IPython.display import Image
    import pydotplus

    print ('criteria_col  =  ', criteria_col)
    # The label column comes first so that features are simply columns 1:.
    tree_col = [criteria_col,'Frequency', 'LTV', 'period_no_use','AverageTimeToOrder',
          'late_by_collection', 'late_by_delivery', 'tickets', 'recleaned_orders',
         'cancalled_orders', 'voucher_used']
    tree_data = df[tree_col].dropna()
    tree_train, tree_test = train_test_split(tree_data,
                                             test_size=0.2,
                                             random_state=200,
                                             stratify=tree_data[criteria_col])

    train_features = tree_train.iloc[:, 1:]
    test_features = tree_test.iloc[:, 1:]

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_features, tree_train[criteria_col])
    print (clf.score(test_features, tree_test[criteria_col]))
    # confusion matrix
    print (confusion_matrix(tree_test[criteria_col], clf.predict(test_features)))

    # visualize the tree
    dot_data = StringIO()
    tree.export_graphviz(clf,
                         out_file=dot_data,
                         feature_names=tree_col[1:],
                         filled=True,
                         rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png()), tree_train, tree_test
Ejemplo n.º 9
0
def main():
    """Train a depth-6 decision tree on the training set named on the command line.

    The fitted tree is rendered to ``partition.png``. When no training-set
    argument is given, a usage message is printed and nothing else happens.
    """
    if len(sys.argv) < 2:
        print("One Argument Required; Training Set")
        return

    training_features, training_labels = ParseTraining(sys.argv[1])

    classifier = tree.DecisionTreeClassifier(max_depth=6)
    classifier = classifier.fit(training_features, training_labels)

    # Labels shown in the rendered tree.
    feature_labels = ["partConf", "recAvg", "latency", "ReadRate", "homeconf"]
    class_labels = ["Partition", "No Partition"]

    buf = StringIO()
    tree.export_graphviz(
        classifier,
        out_file=buf,
        feature_names=feature_labels,
        class_names=class_labels,
        filled=True,
        rounded=True,
        special_characters=True)
    pydot.graph_from_dot_data(buf.getvalue()).write_png("partition.png")
Ejemplo n.º 10
0
def classifyTree(Xtr, ytr, Xte, yte, splitCriterion="gini", maxDepth=0, visualizeTree=False):
    """Classify data using CART.

    Args:
        Xtr: training samples.
        ytr: training labels.
        Xte: test samples.
        yte: ground-truth labels for ``Xte``.
        splitCriterion: split quality criterion handed to the classifier.
        maxDepth: maximum tree depth; ``0`` (the default) means unlimited.
        visualizeTree: when true, also save the learned tree as a PDF.

    Returns:
        Tuple ``(accuracyRate, timing, probabilities, predicted)``. If any
        step fails, the error is logged and the values gathered so far are
        returned (``0.0, 0.0, [], []`` when nothing succeeded).
    """
    # Initialise every returned name up front so the error path cannot hit an
    # unbound ``predicted``.
    accuracyRate, timing, probabilities, predicted = 0.0, 0.0, [], []
    try:
        # scikit-learn rejects max_depth <= 0, so the default of 0 is mapped
        # to None (grow until the leaves are pure).
        depth = maxDepth if maxDepth else None
        # Perform classification
        cartClassifier = tree.DecisionTreeClassifier(criterion=splitCriterion, max_depth=depth)
        startTime = time.time()
        prettyPrint("Training a CART tree for classification using \"%s\" and maximum depth of %s" % (splitCriterion, maxDepth), "debug")
        cartClassifier.fit(numpy.array(Xtr), numpy.array(ytr))
        prettyPrint("Submitting the test samples", "debug")
        predicted = cartClassifier.predict(Xte)
        endTime = time.time()
        # Compare the predicted and ground truth labels
        accuracyRate = round(metrics.accuracy_score(yte, predicted), 2)
        # Also append the probability estimates
        probabilities.append(cartClassifier.predict_proba(Xte))
        timing = endTime - startTime  # Keep track of performance
        if visualizeTree:
            # Visualize the tree
            dot_data = StringIO()
            tree.export_graphviz(cartClassifier, out_file=dot_data)
            graph = pydot.graph_from_dot_data(dot_data.getvalue())
            # Build the file name once so the log names the file actually written.
            pdfName = "tree_%s.pdf" % getTimestamp()
            prettyPrint("Saving learned CART to \"%s\"" % pdfName, "debug")
            graph.write_pdf(pdfName)

    except Exception as e:
        prettyPrint("Error encountered in \"classifyTree\": %s" % e, "error")

    return accuracyRate, timing, probabilities, predicted
Ejemplo n.º 11
0
def mainTree():
    """Train and evaluate a decision model on the male data joined with female classes.

    Loads both data files, attaches each matched female's class to the male
    rows, scales the data, prepends 17 PCA components, trains on a 0.9 split,
    writes the tree to ``marriage.pdf`` and prints the test result.
    """
    # Spaces and tabs (including those introduced by the line continuations)
    # are stripped before the column list is split on '|'.
    column_names = re.sub(' |\t', '', 'id|gender|age|height|edu|salary|nation|car|house|body|face|hair|\
	smoke|drink|child|parent|bmi|where0|where1|\
	marriage0|marriage1|look0|look1|where2').split('|')

    male_frame = pd.read_csv('/home/idanan/jiayuan/code/resources/transed_M.txt',
                             names=column_names, sep='|')
    female_frame = pd.read_csv('/home/idanan/jiayuan/code/resources/cluster_female.txt',
                               names=column_names + ['class'], sep='|')

    # Map each female id through the lovers file before joining.
    pairings = matchDict('/home/idanan/jiayuan/code/resources/lovers_ids.txt')
    female_frame['id'] = female_frame['id'].map(partial(match, matches=pairings))
    female_classes = female_frame[['id', 'class']]

    joined = concatData(male_frame, female_classes)
    scaled = scaleData(joined, ['id', 'gender'])

    # PCA is fitted on every column except the last one.
    pca = factors(scaled[:, :-1], 17)
    print('PCA explained variance: %s' % (sum(pca.explained_variance_ratio_),))
    components = pca.transform(scaled[:, :-1])
    combined = np.c_[components, scaled]

    train_part, test_part = departData(combined, 0.9)
    model = decisionModel(train_part)

    buf = StringIO()
    tree.export_graphviz(model, out_file=buf)
    pydot.graph_from_dot_data(buf.getvalue()).write_pdf(
        "/home/idanan/jiayuan/code/resources/marriage.pdf")

    rate = test(model, test_part)
    print('Decision Model true rate %s' % (rate,))
Ejemplo n.º 12
0
def tree2():
    """Build a decision tree from the posted form fields and return the result page.

    On a POST request, the form value under ``'DV'`` names the target column
    of the global ``df``; every other form value except ``"Build Tree"`` is
    used as a feature column. The tree's DOT text is embedded in the returned
    HTML together with the first 15 rows of ``df``. Any other request method
    returns the placeholder string ``'helloo'``.
    """
    global final_html
    global df, origin_df
    init_style_string = """<p style="position: absolute; font-size: 12px; top: <top>px; width: <width>px;  height: <height>px; left:<left>px; text-align: center;">tree_text_here</p>"""
    if request.method != 'POST':
        return 'helloo'

    form = MultiDict(request.form)
    DV_tree = form.get('DV')
    # Feature columns: every submitted value except the submit button label
    # and the chosen target column.
    chi_key = [value for value in form.values()
               if value != "Build Tree" and value != DV_tree]

    features = df.loc[:, chi_key].values
    Y = df[DV_tree]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(features, Y.values)

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    k = dot_data.getvalue()

    left_px = 600
    width_px = 150
    top_px = 50
    height_px = 309
    # NOTE(review): the HTML built here is not used below; the page embeds the
    # raw DOT text ``k`` instead. Confirm which one was intended.
    build_tree_html(k, init_style_string, left_px, width_px, top_px, height_px)

    temp_df = df[0:15]
    t = """</div><div style="float:right;"><br> Decision Tree result <br>"""
    final_html = template.s1 + t + k + "</div><br><br><br>" + temp_df.to_html()
    return final_html
def generate_plot(clf):
    """Render the fitted tree ``clf`` to ``weather_forecast.pdf``, printing progress messages."""
    print("\nGenerating plot...")
    buf = StringIO()
    tree.export_graphviz(clf, out_file=buf)
    pydot.graph_from_dot_data(buf.getvalue()).write_pdf("weather_forecast.pdf")
    print("Plot generated!")
Ejemplo n.º 14
0
def visualize_tree(dtree):
    """Render the fitted tree ``dtree`` as a PNG and pass it to ``display``."""
    buf = StringIO()
    tree.export_graphviz(
        dtree,
        out_file=buf,
        filled=True,
        rounded=True,
        special_characters=True)
    png_bytes = pydot.graph_from_dot_data(buf.getvalue()).create_png()
    display(Image(png_bytes))
    def train_network(self):
        """Fit the decision tree on a sampled training batch and export it to a PDF.

        After fitting, the tree is written to ``hitter_tree.pdf``. The
        evaluation entries are then matched against their postgame records to
        collect input vectors and actual DraftKings points; entries without a
        postgame record are reported and skipped. The database session is
        closed at the end.
        """
        pregame_query = self._database_session.query(PregameHitterGameEntry)
        training_rows, evaluation_rows = self.get_train_eval_data(pregame_query, 0.8)
        features, targets = self.get_stochastic_batch(training_rows, self.SIZE_TRAINING_BATCH)
        self._decision_tree.fit(features, targets)

        # Export the fitted tree, labelling splits with the input vector labels.
        dot_buffer = StringIO()
        tree.export_graphviz(
            self._decision_tree,
            out_file=dot_buffer,
            feature_names=PregameHitterGameEntry.get_input_vector_labels())
        pydotplus.graph_from_dot_data(dot_buffer.getvalue()).write_pdf("hitter_tree.pdf")

        x_test_actual, y_test_actual = [], []
        for entry in evaluation_rows:
            try:
                # Look up the postgame record for the same player and date.
                postgame = (self._database_session
                            .query(PostgameHitterGameEntry)
                            .filter(PostgameHitterGameEntry.rotowire_id == entry.rotowire_id,
                                    PostgameHitterGameEntry.game_date == entry.game_date)
                            .one())
                y_test_actual.append([postgame.actual_draftkings_points])
                x_test_actual.append(entry.to_input_vector())
            except NoResultFound:
                print("Ignoring hitter %s since his postgame stats were not found." % entry.rotowire_id)
                continue

        self._database_session.close()
Ejemplo n.º 16
0
def create_tree(X, Y):
    """Fit an entropy-based decision tree on ``X``/``Y`` and write it to ``Tree.pdf``.

    Returns the fitted classifier.
    """
    clf = tree.DecisionTreeClassifier(criterion="entropy").fit(X, Y)

    from IPython.display import Image
    import pydotplus

    # Column labels: gender followed by the age buckets.
    feature_names = ["Gender", "0-5", "6-12", "13-19", "20-27", "28-35", "36-50", "55+"]
    # One "Ad #k" label is generated per entry of Y.
    target_names = ["Ad #" + str(number) for number in range(1, len(Y) + 1)]

    buf = StringIO()
    tree.export_graphviz(
        clf,
        out_file=buf,
        feature_names=feature_names,
        class_names=target_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    pydotplus.graph_from_dot_data(buf.getvalue()).write_pdf("Tree.pdf")

    return clf
Ejemplo n.º 17
0
def dt_graph(treeest, cv, scores, features, labels, featnames, outfile):
    '''Refit the tree estimator on the best-scoring fold and save its graph.

    The fold whose position matches the highest entry in ``scores`` provides
    the training indices; the refitted estimator is written to ``outfile``
    as a PDF. If ``cv`` yields no fold at that position, a warning line is
    printed instead.'''
    best_index = np.argmax(scores)

    # Walk the folds until the best one is reached; test indices are not needed.
    for fold_index, (train_idx, _unused) in enumerate(cv):
        if fold_index != best_index:
            continue

        # Fit on the best fold's training rows
        treeest.fit(features[train_idx], labels[train_idx])

        # Produce the dot description and convert it to a PDF
        buf = StringIO()
        tree.export_graphviz(treeest, out_file=buf, feature_names=featnames)
        pydot.graph_from_dot_data(buf.getvalue()).write_pdf(outfile)
        return

    print("You should never see this text from dt_graph!")
    return
Ejemplo n.º 18
0
def decisionTree():
    """Fit a decision tree on the iris dataset and write it to ``iris.pdf``."""
    iris = load_iris()

    # Every hyper-parameter is spelled out explicitly.
    hyper_params = dict(
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        class_weight=None,
    )
    clf = tree.DecisionTreeClassifier(**hyper_params)
    clf = clf.fit(iris.data, iris.target)

    buf = StringIO()
    tree.export_graphviz(
        clf,
        out_file=buf,
        feature_names=iris.feature_names,
        class_names=iris.target_names,
        filled=False,
        rounded=True,
        special_characters=True)
    pydot.graph_from_dot_data(buf.getvalue()).write_pdf("iris.pdf")
Ejemplo n.º 19
0
def export_tree(clf, filename, feature_names=None, max_depth=None):
    """Write the fitted tree ``clf`` to a PDF at ``filename``.

    ``feature_names`` and ``max_depth`` are forwarded to ``export_graphviz``.
    """
    from sklearn.externals.six import StringIO
    import pydot

    buf = StringIO()
    tree.export_graphviz(
        clf,
        out_file=buf,
        feature_names=feature_names,
        max_depth=max_depth)
    pydot.graph_from_dot_data(buf.getvalue()).write_pdf(filename)
Ejemplo n.º 20
0
def createGraph(clf):
    """Write the fitted tree ``clf`` to ``portScan.dot`` and ``portScan.pdf``.

    The tree is exported once; the same DOT text feeds both output files.
    """
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    dot_text = dot_data.getvalue()

    # The open file handle is no longer rebound to export_graphviz's return
    # value; the DOT text is written through it directly.
    with open("portScan.dot", 'w') as f:
        f.write(dot_text)

    graph = pydot.graph_from_dot_data(dot_text)
    graph.write_pdf("portScan.pdf")
def printPdf(clf, dataTrain):
    """Write the fitted tree ``clf`` to ``sentiment.pdf`` and print the first training sample."""
    from sklearn.externals.six import StringIO
    import pydot

    buf = StringIO()
    tree.export_graphviz(clf, out_file=buf)
    pydot.graph_from_dot_data(buf.getvalue()).write_pdf('sentiment.pdf')
    print(dataTrain.data[0])
	def printTreePDF(self, path = './tree.pdf'):
		"""Write the fitted tree ``self.clf`` to a PDF file.

		Args:
			path: destination of the PDF file.

		Raises:
			NameError: if no tree has been created yet (``self.clf`` is None).
		"""
		# Identity check: None is a singleton, and ``==`` could be overridden.
		if self.clf is None:
			raise NameError('Tree was not created!')
		dot_data = StringIO()
		tree.export_graphviz(self.clf, out_file=dot_data)
		graph = pydot.graph_from_dot_data(dot_data.getvalue())
		graph.write_pdf(path)
Ejemplo n.º 23
0
def view(classifier):
    """Save a graph rendering of ``classifier`` as "MyTree.pdf".

    The file is written with a relative name, i.e. into the current
    working directory.
    """
    dot_buffer = StringIO()
    tree.export_graphviz(classifier, out_file=dot_buffer)
    dot_text = dot_buffer.getvalue()
    pydot.graph_from_dot_data(dot_text).write_pdf("MyTree.pdf")
Ejemplo n.º 24
0
	def save_tree_png(self, store):
		"""Render ``self.clf`` as a PNG and save it as ``tree.png`` under ``store.dataset_path``."""
		import pydot
		from sklearn.externals.six import StringIO
		buf = StringIO()
		tree.export_graphviz(
			self.clf,
			out_file=buf,
			feature_names=self.feature_names)
		# graph_from_dot_data returns a sequence here; the first graph is the tree.
		first_graph = pydot.graph_from_dot_data(buf.getvalue())[0]
		png_path = store.dataset_path + '/tree.png'
		with open(png_path, 'wb') as png_file:
			png_file.write(first_graph.create_png())
Ejemplo n.º 25
0
Archivo: dt.py Proyecto: mkdmkk/infaas
 def export(self, fpath):
     """
     Write the decision tree held in ``self.model`` to a PDF file.
     :param fpath: destination path of the PDF
     :return: None
     """
     buf = StringIO()
     tree.export_graphviz(self.model, out_file=buf)
     dot_text = buf.getvalue()
     pydot.graph_from_dot_data(dot_text).write_pdf(fpath)
Ejemplo n.º 26
0
Archivo: utils.py Proyecto: rhouck/re
def tree_vis(clf):
    """Render the fitted tree ``clf`` to ``data/trees/tree.png`` and return it as an ``Image``."""
    # The file name is fixed, so each call overwrites the previous image.
    png_path = 'data/trees/{0}.png'.format('tree')
    buf = StringIO()
    tree.export_graphviz(clf, out_file=buf)
    pydot.graph_from_dot_data(buf.getvalue()).write_png(png_path)
    return Image(filename=png_path)
def train_decision_tree_elite_status_classifier():
    """Train and validate a decision tree for users' Elite status, then save its diagram.

    The decision rules are written to
    ``analysis/analysis_results/decision_tree.pdf``.
    """
    model = train_and_validate_elite_status_classifier(
        DecisionTreeClassifier, DECISION_TREE_USER_ATTRIBUTES)

    # Render the decision rules of the fitted model
    buf = StringIO()
    tree.export_graphviz(model, out_file=buf, class_names=True, filled=True)
    pydot.graph_from_dot_data(buf.getvalue()).write_pdf(
        'analysis/analysis_results/decision_tree.pdf')
Ejemplo n.º 28
0
def drawDecisionTree(classIndex):
    """Fit a tree on ``preference`` against ``y[classIndex]`` and save it as a PDF.

    The output file is named ``decisionTree_<classIndex>.pdf``.
    """
    clf = tree.DecisionTreeClassifier().fit(preference, y[classIndex])

    buf = StringIO()
    # TODO: pass class_names = cnames[classIndex]
    tree.export_graphviz(
        clf,
        out_file=buf,
        feature_names=fname,
        filled=True,
        rounded=True,
        special_characters=True)

    pdf_name = "decisionTree_" + str(classIndex) + ".pdf"
    pydot.graph_from_dot_data(buf.getvalue()).write_pdf(pdf_name)
Ejemplo n.º 29
0
def __plotTree(clf,name):
    """Write the fitted tree ``clf`` to ``<outputdir><name>.pdf``.

    A DOT file is first written to ``<outputdir><name>`` and is removed
    again once the PDF exists.
    """
    dot_path = outputdir + name
    tree.export_graphviz(clf, out_file=dot_path)

    buf = StringIO()
    tree.export_graphviz(clf, out_file=buf)
    pydot.graph_from_dot_data(buf.getvalue()).write_pdf(dot_path + '.pdf')

    os.remove(dot_path)

#plot utilities
Ejemplo n.º 30
0
 def createTreePdf(self):
     """Write the classifier's tree to ``DT<class names joined by '-'>.pdf``.

     Does nothing when the optional ``pydot`` package is not installed.
     """
     try:
         import pydot
     except ImportError:
         # pydot is optional. Only a missing package is tolerated; a bare
         # ``except`` would also swallow unrelated errors such as
         # KeyboardInterrupt.
         return
     dot_data = StringIO()
     tree.export_graphviz(self.getClf(),
             out_file = dot_data, feature_names = self.featureNames)
     graph = pydot.graph_from_dot_data(dot_data.getvalue())
     graph.write_pdf("DT" + "-".join(self.classNames) + ".pdf")
Ejemplo n.º 31
0
                csRoot, depth, criteria)

            # cria o diretório caso não exista
            for dir in list([csRoot, dotPath, graphPath, csvPredictionPath]):
                dirName = os.path.dirname(dir)
                if not os.path.exists(dirName):
                    os.makedirs(dirName)

            #cria o csv com as predições
            dfPredictions = dfCs.copy()
            dfPredictions = dfPredictions.iloc[y_test.index, ]
            dfPredictions[classLabel] = y_pred
            dfPredictions.to_csv(csvPredictionPath)

            dotFile = pydotplus.graph_from_dot_data(
                dot_data.getvalue()).to_string()
            #graphPath = ('{0}_ga.png' if applyPreProcessingWGA else "{0}.png")

            ## expressões regulares que excluem elementos indesejados da decision tree e faz aprimoramentos na visualização
            import re
            dotFile = re.sub('(style="rounded")',
                             ' style="filled, rounded", fillcolor="#FFFFFF"',
                             dotFile)
            dotFile = re.sub('(samples = [0-9]+<br\/>)', '', dotFile)
            dotFile = re.sub('(value = \[[0-9]+, [0-9]+\]<br\/>)', '', dotFile)
            dotFile = re.sub('(<br\/>class = [0-9])', '', dotFile)
            dotFile = re.sub('(<class = 1>)',
                             '<<b>smelly code</b>>, fillcolor="#e68743"',
                             dotFile)
            dotFile = re.sub('(class = 0)', 'not smelly code', dotFile)
            ## RE para modificar o tamanho do nó
Ejemplo n.º 32
0
def _export_tree_png(clf, png_path):
    """Render the fitted tree ``clf`` with Graphviz and write it to ``png_path``."""
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data,
                    filled=True, rounded=True,
                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(png_path)


def _predict_known_instances(clf):
    """Print the predictions of ``clf`` for four fixed instances next to their known labels."""
    known_instances = (
        ([1, 1, 1, 1], "B"),
        ([1, 3, 2, 3], "R"),
        ([5, 4, 5, 1], "L"),
        ([1, 4, 1, 4], "B"),
    )
    for number, (instance, actual_label) in enumerate(known_instances, start=1):
        print("Test instance %d:  " % number, instance)
        print("Predicted label: ", clf.predict([instance]))
        print("Actual label: %s" % actual_label)
        print('\n')


def main():
    """Train gini- and entropy-based trees, save their diagrams and report results.

    For each classifier the tree is written to a PNG file, the test set is
    predicted, four fixed instances are predicted and printed, and the
    accuracy report is produced.
    """
    # Building Phase
    data = importdata()
    X, Y, X_train, X_test, y_train, y_test = splitdataset(data)
    clf_gini = train_using_gini(X_train, X_test, y_train)
    clf_entropy = tarin_using_entropy(X_train, X_test, y_train)

    # Visualizing tree using Gini Index
    _export_tree_png(clf_gini, 'gini_graph.png')

    print('\n')
    # Operational Phase
    print("Results Using Gini Index:")
    print("\n")

    # Prediction using gini
    y_pred_gini = prediction(X_test, clf_gini)
    print("\n")
    _predict_known_instances(clf_gini)
    cal_accuracy(y_test, y_pred_gini)

    # Visualizing tree using Entropy
    _export_tree_png(clf_entropy, 'entropy_graph.png')

    print("Results Using Entropy:")
    print('\n')

    # Prediction using entropy
    y_pred_entropy = prediction(X_test, clf_entropy)
    print('\n')
    # The fixed instances are predicted with the entropy model here; the
    # previous code reused clf_gini in this section by mistake.
    _predict_known_instances(clf_entropy)
    cal_accuracy(y_test, y_pred_entropy)
Ejemplo n.º 33
0
def _decision_tree_regression_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='mse',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Train a decision-tree regressor and build its model dictionary and report.

    Args:
        table: data table indexed by column name (``table[feature_cols]`` and
            ``table[label_col]`` are used for fitting).
        feature_cols: names of the feature columns.
        label_col: name of the target column.
        criterion … presort: hyper-parameters forwarded to
            ``DecisionTreeRegressor``.
        sample_weight, check_input, X_idx_sorted: forwarded to ``fit``.

    Returns:
        ``{'model': model}`` where ``model`` holds the fitted regressor, its
        attributes and parameters, and a markdown report containing the tree
        plot, the feature-importance chart and the parameter list.
    """
    # Keyword arguments keep the call independent of the constructor's
    # parameter order and work with keyword-only scikit-learn signatures.
    regressor = DecisionTreeRegressor(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        presort=presort)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight=sample_weight,
                  check_input=check_input,
                  X_idx_sorted=X_idx_sorted)

    # Render the fitted tree to a PNG embedded as markdown.
    from sklearn.externals.six import StringIO
    from sklearn.tree import export_graphviz
    import pydotplus
    dot_data = StringIO()
    export_graphviz(regressor,
                    out_file=dot_data,
                    feature_names=feature_cols,
                    filled=True,
                    rounded=True,
                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

    from brightics.common.report import png2MD
    fig_tree = png2MD(graph.create_png())

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report: horizontal bar chart of importances, sorted ascending so the
    # most important feature ends up at the top of the chart.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]
    sorted_importance = feature_importance[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             sorted_importance,
             color='b',
             align='center')
    for i, v in enumerate(sorted_importance):
        # Annotate each bar with its value.
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['report'] = rb.get()

    return {'model': model}
Ejemplo n.º 34
0
# Confusion matrix of the decision tree on the validation labels Y2.
validation_confusion_tree = confusion_matrix(Y2, tree_val_predictions)
print("Confusion Matrix: Decision Tree")
print(validation_confusion_tree)
print("")

# Confusion matrix of the KNN model on the same validation labels.
validation_confusion_knn = confusion_matrix(Y2, knn_val_predictions)
print("Confusion Matrix: KNN")
print(validation_confusion_knn)
print("")

#Here we can get a visual of the Decision Tree by copying the output and
#pasting into https://dreampuf.github.io/GraphvizOnline/

from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz
import pydot

plot = StringIO()
export_graphviz(tree,
                out_file=plot,
                filled=True,
                rounded=True,
                special_characters=True)

print plot.getvalue()

#Save the model using Pickle
import pickle
with open('knn_pickle', 'wb') as knn_model:
    pickle.dump(knn, knn_model)
Ejemplo n.º 35
0
def plotTree(treeName, tree, featureNames):
    """Render a fitted decision tree to ``<treeName>.png`` and return the image.

    Parameters
    ----------
    treeName : str
        Output path without extension; ``'.png'`` is appended.
    tree : fitted decision tree estimator
        Passed as-is to ``export_graphviz``.
    featureNames : list of str
        Feature names shown in the split nodes.

    Returns
    -------
    IPython.display.Image
        The rendered PNG. Returning it (instead of building and discarding it,
        as before) lets a notebook display the tree when the call is the last
        expression of a cell.
    """
    treePic_dot = StringIO()
    export_graphviz(tree, out_file=treePic_dot, feature_names=featureNames,
                    filled=True, rounded=True)
    graph = pydotplus.graph_from_dot_data(treePic_dot.getvalue())
    # Render once and reuse the bytes for both the file and the returned image.
    png_bytes = graph.create_png()
    with open(treeName + '.png', 'wb') as png_file:
        png_file.write(png_bytes)
    return Image(png_bytes)
Ejemplo n.º 36
0
def _decision_tree_classification_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        class_weight=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Train a decision-tree classifier on ``table`` and build its report.

    The feature columns are resolved through ``check_col_type``, a
    ``DecisionTreeClassifier`` is fitted against ``table[label_col]``, and the
    fitted estimator, its main attributes, a feature-importance table and a
    Markdown report are stored in a model dictionary.

    Parameters
    ----------
    table :
        Input data; indexed as ``table[label_col]`` (presumably a pandas
        DataFrame -- confirm against callers).
    feature_cols : list
        Names of the feature columns.
    label_col :
        Name of the label column.
    class_weight : sequence or None
        One weight per class, given in the sorted order of the distinct
        labels. It is converted to a ``{label: weight}`` mapping.
    criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
    min_weight_fraction_leaf, max_features, random_state, max_leaf_nodes,
    min_impurity_decrease, min_impurity_split, presort :
        Forwarded unchanged to ``DecisionTreeClassifier``.
    sample_weight, check_input, X_idx_sorted :
        Forwarded unchanged to ``DecisionTreeClassifier.fit``.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` is the populated model dictionary.

    Raises
    ------
    ValueError
        If ``class_weight`` does not have one entry per distinct label.

    ``raise_error('0718', 'label_col')`` is invoked when the label column is
    of type ``'continuous'``.
    """
    feature_names, features = check_col_type(table, feature_cols)
    y_train = table[label_col]

    if sklearn_utils.multiclass.type_of_target(y_train) == 'continuous':
        raise_error('0718', 'label_col')

    class_labels = sorted(set(y_train))
    if class_weight is not None:
        if len(class_weight) != len(class_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        class_weight = {
            label: class_weight[i]
            for i, label in enumerate(class_labels)
        }

    # Keyword arguments: recent scikit-learn releases reject positional
    # estimator parameters, and keywords are equivalent on older releases.
    classifier = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        class_weight=class_weight,
        presort=presort)
    classifier.fit(features,
                   y_train,
                   sample_weight=sample_weight,
                   check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    # Drawing the tree is best effort: it needs pydotplus and the Graphviz
    # binaries. Only ordinary errors are turned into the fallback message, so
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier,
                        out_file=dot_data,
                        feature_names=feature_names,
                        # ``np.str`` was removed from NumPy; the builtin
                        # ``str`` is the same type on older releases.
                        class_names=classifier.classes_.astype(str),
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report: horizontal bar chart of importances, least important at the
    # bottom (argsort is ascending).
    indices = np.argsort(feature_importance)
    sorted_importance = feature_importance[indices]
    sorted_feature_cols = np.array(feature_names)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             sorted_importance,
             color='b',
             align='center')
    for i, v in enumerate(sorted_importance):
        # Print the value next to the end of each bar.
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()
    # NOTE(review): the table is keyed by ``feature_cols`` while the chart
    # above uses ``feature_names`` from ``check_col_type``; confirm both
    # always have the same length and order.
    feature_importance_table = pd.DataFrame(
        [[name, feature_importance[i]]
         for i, name in enumerate(feature_cols)],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
        lenses_list = []
    # print(lenses_dict)														#打印字典信息
    lenses_pd = pd.DataFrame(lenses_dict)  #生成pandas.DataFrame
    # print(lenses_pd)														#打印pandas.DataFrame
    le = LabelEncoder()  #创建LabelEncoder()对象,用于序列化
    for col in lenses_pd.columns:  #序列化
        lenses_pd[col] = le.fit_transform(lenses_pd[col])
    # print(lenses_pd)														#打印编码信息

    clf = tree.DecisionTreeClassifier(
        max_depth=6)  #创建DecisionTreeClassifier()类
    clf = clf.fit(lenses_pd.values.tolist(), lenses_target)  #使用数据,构建决策树

    dot_data = StringIO()
    tree.export_graphviz(
        clf,
        out_file=dot_data,  #绘制决策树
        feature_names=lenses_pd.keys(),
        class_names=clf.classes_,
        filled=True,
        rounded=True,
        special_characters=True)
    # graph = pydotplus.graph_from_dot_data(dot_data.getvalue());
    #下面这列解决中文乱码
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue().replace(
        'helvetica', '"Microsoft YaHei"'))
    print(dot_data.getvalue())
    graph.write_pdf("tree.pdf")  #保存绘制好的决策树,以PDF的形式存储。

    print(clf.predict([[1, 1, 1, 0]]))  #预测
Ejemplo n.º 38
0
import numpy as np
import pandas as pd
from sklearn import tree

try:
    from sklearn.externals.six import StringIO
except ImportError:
    # ``sklearn.externals.six`` was removed in scikit-learn 0.23; the
    # standard-library buffer is what it resolved to on Python 3.
    from io import StringIO

# Load the training sheet; the last column is the target, all others are
# features.
train = pd.read_excel("py_tree_learn.xls", "Sheet1")
# print(train)
# Replace missing values with a sentinel so the tree can be fitted.
train.fillna(9999, inplace=True)
# print(train)
train_data = train.iloc[:, :-1]
train_target = train.iloc[:, -1]
# print(train_data)
train_data_1 = train_data.values
train_target_1 = train_target.values
# print(train_data_1)

# Fit an entropy-based tree limited to depth 6 and print its DOT source.
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=6)
clf = clf.fit(train_data_1, train_target_1)
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
print(dot_data.getvalue())

# new_train=(train['PENSION_FUND_STATUS'])
# # print(new_train)
# # print(new_train.value_counts())
# print(pd.crosstab(train.PENSION_FUND_STATUS,train.target_new,margins=True))
Ejemplo n.º 39
0
# Graphing trees
#
# Dependencies. Install them once from a shell (or with ``%pip install`` in a
# notebook); a bare ``pip install ...`` line is not valid Python:
#   pip install graphviz
#   pip install pydotplus
#   pip install pyparsing
try:
    from sklearn.externals.six import StringIO
except ImportError:
    # ``sklearn.externals.six`` was removed in scikit-learn 0.23.
    from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz

import pydotplus as pypl

# Graphing 5 leaves node
dot_data5 = StringIO()
export_graphviz(classifier5, out_file=dot_data5,
                filled=True, rounded=True,
                special_characters=True, class_names=['0', '1'])
graph = pypl.graph_from_dot_data(dot_data5.getvalue())
Image(graph.create_png())
graph.write_png('5leavesNode.png')

# Graphing 15 leaves node
dot_data15 = StringIO()
export_graphviz(classifier15, out_file=dot_data15,
                filled=True, rounded=True,
                special_characters=True, class_names=['0', '1'])
graph = pypl.graph_from_dot_data(dot_data15.getvalue())
Image(graph.create_png())
graph.write_png('15leavesNode.png')


# Graphing 25 leaves node
dot_data25 = StringIO()
Ejemplo n.º 40
0
clf_gini.fit(X_train, y_train)

# Decision Tree with Information Entropy

clf_entropy = DecisionTreeClassifier(criterion="entropy",
                                     random_state=100,
                                     max_depth=7,
                                     min_samples_leaf=5)
clf_entropy.fit(X_train, y_train)

# evaluate algorithm

y_pred = clf_gini.predict(X_test)
y_pred_en = clf_entropy.predict(X_test)
# Accuracy

print("Accuracy for gini", accuracy_score(y_test, y_pred) * 100)
print("Accuracy for entropy", accuracy_score(y_test, y_pred_en) * 100)
import graphviz
list(X)
tree.export_graphviz(clf_gini, out_file='tree.dot')
try:
    from sklearn.externals.six import StringIO
except ImportError:
    # ``sklearn.externals.six`` was removed in scikit-learn 0.23.
    from io import StringIO
import pydot

# Gini tree -> Elastomer.pdf
dot_data = StringIO()
tree.export_graphviz(clf_gini, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("Elastomer.pdf")

# Entropy tree -> Elastomer_Entropy.pdf
# A fresh buffer is required: reusing the previous one appends the second
# graph after the first, so ``graph[0]`` would still be the gini tree.
dot_data = StringIO()
tree.export_graphviz(clf_entropy, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("Elastomer_Entropy.pdf")
Ejemplo n.º 41
0
training_data = np.array(training_data)
training_class = np.array(training_class)
test_data = np.array(test_data)
test_class = np.array(test_class)

# building the classifier (the option random_state=RandomState(130) makes the
# algorithm deterministic)
clf = tree.DecisionTreeClassifier(criterion='gini',
                                  random_state=RandomState(130))
clf = clf.fit(training_data, training_class)

# print the decision tree in a pdf file
try:
    from sklearn.externals.six import StringIO
except ImportError:
    # ``sklearn.externals.six`` was removed in scikit-learn 0.23.
    from io import StringIO

dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("iris.pdf")

# The following code evaluates the decision tree on the test set so that a
# confidence interval for the accuracy can be computed. It builds a list a,
# where a[i] = 1 if the i-th record test_data[i] has been classified
# correctly and 0 otherwise.
a = []
pre_class = clf.predict(test_data)
for i in range(0, len(test_data)):
    if test_class[i] == pre_class[i]:
        a.append(1)
    else:
        a.append(0)
# fill properly this missing part

# The following code computes a confidence interval for the accuracy. The first argument is the confidence,
def show_tree(clf):
    """Export the fitted tree ``clf`` as DOT text and save it to ``titanic_tree.pdf``."""
    dot_buffer = StringIO()
    export_graphviz(clf, out_file=dot_buffer)
    dot_source = dot_buffer.getvalue()
    rendered = pydotplus.graph_from_dot_data(dot_source)
    rendered.write_pdf("titanic_tree.pdf")
Ejemplo n.º 43
0
    confusion_matrix(y_validacao, y_predicao_validacao)))
print("Matriz de confusão da teste :\n {}".format(
    confusion_matrix(y_teste, y_predicao_teste)))

# I have not been able to generate a *.png
arquivo_dot = StringIO()

tree.export_graphviz(modeloAD,
                     out_file=arquivo_dot,
                     node_ids=True,
                     feature_names=['Sangue', 'Da a luz', 'Pode voar',
                                    'Mora na agua'],
                     class_names=['SIM', 'NAO'],
                     filled=True)

arvore = pdp.graph_from_dot_data(arquivo_dot.getvalue())

# Names of every node that is the source of at least one edge.
lista_edge = [edge.get_source() for edge in arvore.get_edge_list()]

# Recolor the nodes: node '0' gets its own color, nodes that start an edge
# get a second one, and every remaining node gets a third.
nodes = arvore.get_node_list()
for node in nodes:
    nome = node.get_name()
    if nome == '0':
        cor = '#F19C99'
    elif nome in lista_edge:
        cor = '#D5E8D4'
    else:
        cor = '#E1D5E7'
    node.set_fillcolor(cor)

arvore.write_png("arvore_mamifero.png")
Ejemplo n.º 44
0
def printTree(clf):
    """Render the fitted tree ``clf`` to ``tree.png``.

    Returns whatever ``write_png`` returns for the rendered graph.
    """
    buffer = StringIO()
    export_options = dict(filled=True, rounded=True, special_characters=True)
    tree.export_graphviz(clf, out_file=buffer, **export_options)
    rendered = pydotplus.graph_from_dot_data(buffer.getvalue())
    outcome = rendered.write_png('tree.png')
    return outcome
Ejemplo n.º 45
0
# Train model
# Every column except the first is used as a feature; the 'Kyphosis' column
# is the label.
kyphosis_features = kyphosis.columns[1:]
kyphosis_dt_clf = DecisionTreeClassifier(criterion='entropy',
                                         max_depth=None,
                                         min_samples_split=2,
                                         min_samples_leaf=1)
kyphosis_dt_clf = kyphosis_dt_clf.fit(kyphosis_train[kyphosis_features],
                                      kyphosis_train['Kyphosis'])

# Print a string representation of the tree.
# If you have graphviz (www.graphviz.org) installed, you can write a pdf
# visualization using graph.write_pdf(filename)
kyphosis_dt_data = StringIO()
tree.export_graphviz(kyphosis_dt_clf, out_file=kyphosis_dt_data)
# Parse the DOT text into a graph object, then print it back as a string.
kyphosis_dt_graph = pydotplus.parser.parse_dot_data(
    kyphosis_dt_data.getvalue())
print(kyphosis_dt_graph.to_string())

# Predict classes of test set and evaluate
kyphosis_dt_pred = kyphosis_dt_clf.predict(kyphosis_test[kyphosis_features])

# Confusion matrix with a fixed label order: 'absent' first, then 'present'.
kyphosis_dt_cm = metrics.confusion_matrix(kyphosis_test['Kyphosis'],
                                          kyphosis_dt_pred,
                                          labels=['absent', 'present'])
print(kyphosis_dt_cm)
kyphosis_dt_acc = metrics.accuracy_score(kyphosis_test['Kyphosis'],
                                         kyphosis_dt_pred)
# Precision is computed with 'absent' treated as the positive class.
kyphosis_dt_prec = metrics.precision_score(kyphosis_test['Kyphosis'],
                                           kyphosis_dt_pred,
                                           pos_label='absent')
kyphosis_dt_rec = metrics.recall_score(kyphosis_test['Kyphosis'],
    tree_mod = DecisionTreeRegressor(max_depth=5)
    tree_mod.fit(x_train[final_columns], y_train)

    y_1_pred = tree_mod.predict(x_test[final_columns])

    x1 = np.asanyarray(y_1_pred)
    x2 = np.asanyarray(y_test)
    this_rmse = np.sqrt(np.mean(np.square(x1 - x2)))

    from sklearn import tree
    tree.export_graphviz(tree_mod, out_file='tree.dot')  #produces dot file

    import pydot
    dotfile = StringIO('tree.dot')
    tree.export_graphviz(tree_mod, out_file=dotfile)
    pydot.graph_from_dot_data(dotfile.getvalue()).write_png("dtree2.png")

    ############## Random Forest modelling ###########################

    fr_regr = RandomForestRegressor(max_depth=2,
                                    random_state=0,
                                    n_estimators=100)
    fr_regr.fit(x_train[final_columns], y_train)

    y_1_pred = fr_regr.predict(x_test[final_columns])

    x1 = np.asanyarray(y_1_pred)
    x2 = np.asanyarray(y_test)
    this_rmse_rf = np.sqrt(np.mean(np.square(x1 - x2)))

    ##########################################################################################
Ejemplo n.º 47
0
X = pd.DataFrame(data, columns=feature_names)
y = pd.Categorical.from_codes(target, target_names)
X.head()
# One-hot encode the target so the tree is trained on indicator columns.
y = pd.get_dummies(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,
                                                    test_size=0.3,
                                                    random_state=1)

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

dot_data = StringIO()
export_graphviz(dt, out_file=dot_data, feature_names=feature_names,
                class_names=target_names)
(graph, ) = graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
#graph.write_png("crack.png")

y_pred = dt.predict(X_test)
# Collapse the one-hot rows back to class indices before comparing.
sets = np.array(y_test).argmax(axis=1)
predictions = np.array(y_pred).argmax(axis=1)
matrix = confusion_matrix(sets, predictions)
print(matrix)

# One-vs-rest counts for class 0 (rows are true classes, columns predicted).
tp1 = matrix[0][0]
fp1 = matrix[1][0] + matrix[2][0]
fn1 = matrix[0][1] + matrix[0][2]
# True negatives are every cell outside row 0 and column 0, including the
# off-diagonal cells [1][2] and [2][1] that were previously left out.
tn1 = matrix[1:, 1:].sum()
# float() keeps these true divisions even under Python 2 integer semantics.
prec1 = tp1 / float(tp1 + fp1)
recall1 = tp1 / float(tp1 + fn1)
f11 = 2 * ((prec1 * recall1) / (prec1 + recall1))
Ejemplo n.º 48
0
def fit_population_cv(population,
                      target_column_name,
                      identifier_column_name,
                      table_name,
                      folds=3,
                      parameters=None):
    """
    The internal wrapper of GridSearchCV fit algorithm for DecisionTree of scikit-learn.

    Fits a grid search over ``parameters``, exports the best tree to dot
    format, stores the cross-validation results as an IKATS table named
    ``table_name`` and returns the best estimator with its parameters.

    :param population: the population data whose functional type is 'table'
    :type population: dict

    :param target_column_name: the name of the attribute providing the class label of the observed subject.
       Must match one of the available population attributes.
    :type target_column_name: str

    :param identifier_column_name: the name of the attribute identifying each observed subject.
       Must match one of the available population attributes.
    :type identifier_column_name: str

    :param table_name: name of the table to create; only letters, digits,
       '-' and '_' are accepted
    :type table_name:  str

    :param folds: number of folds used for the cross validation
    :type folds: int

    :param parameters: the grid to search: a dictionary mapping each parameter
       name to the list of values to be tested. Defaults to a single
       combination (``max_depth=None``, ``class_weight=None``).
    :type parameters: dict

    :return: tuple (best estimator, dot source of the best tree,
       best parameters as a JSON string, table name)

    :raises ValueError: the table name is missing or contains invalid characters.
    :raises IkatsException: error occurred.
    """

    if table_name is None or re.match('^[a-zA-Z0-9-_]+$', table_name) is None:
        raise ValueError("Error in table name")

    LOGGER.info("Starting Decision Tree CV Fit with scikit-learn")
    # To avoid having a dict as default arg of a function.
    # GridSearchCV requires every grid entry to be a sequence of candidate
    # values, so the defaults are one-element lists rather than scalars.
    if parameters is None:
        parameters = {'max_depth': [None], 'class_weight': [None]}

    try:
        desc_population = population.get('table_desc', None)

        LOGGER.info("with Population table_desc= %s", desc_population)

        # 1/ prepare the learning set
        #
        feature_vectors, target, class_names, column_names = split_population(
            population, target_column_name, identifier_column_name)

        # 2/ prepare the DecisionTree and CrossValidation procedure
        #
        mdl = tree.DecisionTreeClassifier()
        gcv = GridSearchCV(mdl, param_grid=parameters, cv=folds)
        gcv.fit(X=feature_vectors, y=target)

        LOGGER.info("   ... finished  fitting the Decision Tree CV to data")
        LOGGER.info(" - Exporting Decision Tree CV to dot format")
        dot_io = StringIO()
        tree.export_graphviz(gcv.best_estimator_,
                             out_file=dot_io,
                             feature_names=column_names,
                             class_names=class_names,
                             filled=True,
                             label='all')
        dot = dot_io.getvalue()
        LOGGER.info(
            "  ... finished exporting the Decision Tree CV to dot format")

        # Formatting the result dictionary to an IKATS table
        formatted_results = _fill_table_cv_results(gcv.cv_results_)
        # Work on a copy so the estimator's ``best_params_`` is left intact,
        # and tolerate grids that do not search 'class_weight' / 'max_depth'.
        best_params = dict(gcv.best_params_)
        best_class_weight = best_params.pop('class_weight', None)
        best_max_depth = best_params.get('max_depth')
        # Unlimited depth (None) is reported as 0.
        best_params['max_depth'] = 0 if best_max_depth is None else best_max_depth
        # 'balancing' is a flag: True whenever a class weighting was selected.
        best_params['balancing'] = best_class_weight is not None
        formatted_best_params = json.dumps(best_params)
        LOGGER.info("... ended  Decision Tree CV Fit with scikit-learn")

        # Save the table
        description = "Result of Decision Tree Cross Validation operator"

        formatted_results['table_desc']['name'] = table_name
        formatted_results['table_desc']['desc'] = description
        IkatsApi.table.create(data=formatted_results)

        return gcv.best_estimator_, dot, formatted_best_params, table_name
    except IkatsException:
        raise
    except Exception:
        # Any other failure is reported to callers as an IkatsException.
        msg = "Unexpected error: fit_population(..., {}, {}, {})"
        raise IkatsException(
            msg.format(target_column_name, identifier_column_name, parameters))
Ejemplo n.º 49
0
def run_decision_tree(training_features,
                      training_labels,
                      test_features,
                      test_labels,
                      passed_parameters=None,
                      headings=None):
    """
    Classifies the data using sklearn's decision tree
    Does not natively support pruning so max_depth is being used

    A grid search selects the best parameters, validation and learning
    curves are saved as PNG files, the top levels of the tree are saved as a
    PDF, and timing, classification report and confusion matrix are printed.

    Parameters
    ----------
        training_features: features used to train the classifier
        training_labels: labels of the training rows
        test_features: features used to test the classifier
        test_labels: labels of the test rows
        passed_parameters: grid for GridSearchCV, mapping each parameter name
            to the list of values to try; defaults to {'max_depth': [None]}
        headings: feature names shown in the exported tree

    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: score of the tuned classifier on the test data
    """

    time_1 = time.time()

    estimator = tree.DecisionTreeClassifier()

    #set up parameters for the classifier
    # Grid values must be lists: they are indexed below and iterated by
    # GridSearchCV.
    if passed_parameters is None:
        parameters = {'max_depth': [None]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        first_value = parameters[param][0]
        # Only numeric parameters can be drawn on a validation curve; None
        # (e.g. unlimited depth) is skipped explicitly.
        if first_value is not None and is_number(first_value):
            title = 'Validation Curves \n(Decision Tree)'
            save_name = "Validation Curves - Decision Tree - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    best_depth = classifier.best_estimator_.max_depth
    estimator = tree.DecisionTreeClassifier(
        max_depth=best_depth,
        criterion=classifier.best_estimator_.criterion)
    estimator.fit(training_features, training_labels)

    #plot the learning curve
    # %s instead of %i: the best max_depth may be None (unlimited depth).
    title = 'Learning Curves \n(Decision Tree, max depth=%s)' % best_depth
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    pylab.savefig(
        os.path.join(results_location, 'Learning Curves - Decision Tree.png'))
    #plt.show()

    #save the visualization of the decision tree only use the top 5 levels for now
    tree_data = StringIO()
    tree.export_graphviz(estimator,
                         out_file=tree_data,
                         max_depth=5,
                         feature_names=headings)
    graph = pydot.graph_from_dot_data(tree_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one graph.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf(os.path.join(results_location, "Decision Tree Model.pdf"))

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
Ejemplo n.º 50
0
def parse_tree(file_path, outputfile):
    """
    Load a pickled tree model, relabel and recolor its graphviz rendering,
    and write the result as a PNG.

    :param file_path: Filepath of the pickled tree model !! Use the same python
        version as for training and saving
    :param outputfile: Path of the PNG file to write
    :return: None
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(file_path, 'rb') as tree_file:
        tree_model = pickle.load(tree_file)
    feature_names = tree_model.extract_features_names()
    decision_model = tree_model.classifier

    dot_data = StringIO()
    export_graphviz(decision_model,
                    out_file=dot_data,
                    filled=True,
                    rounded=True,
                    special_characters=True,
                    label='all',
                    class_names=decision_model.classes_.astype(str),
                    impurity=False,
                    feature_names=feature_names)

    max_change = tree_model.max_quality_change

    def _direction_labels(prefix):
        # Largest decrease first, then "same", then increases in growing order.
        decreasing = [
            prefix + ''.join([low] * step)
            for step in range(max_change, 0, -1)
        ]
        increasing = [
            prefix + ''.join([up] * step)
            for step in range(1, max_change + 1)
        ]
        return decreasing + [prefix + same] + increasing

    # 'class = <k>' text emitted by export_graphviz -> action description.
    class_keys = ['class = %d' % k for k in range(max_change * 2 + 1)]
    class_label_mapper = dict(zip(class_keys, _direction_labels('Action : ')))

    # '_switch_<k>' feature-name fragments -> switch description (kept as an
    # ordered list of pairs).
    switch_keys = [
        '_switch_%d' % k for k in range(-max_change, max_change + 1)
    ]
    switch_mapper = list(zip(switch_keys, _direction_labels('Switch : ')))

    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    for node in graph.get_nodes():
        label = node.get_label()
        if not label:
            continue
        # Drop the enclosing delimiters and split the label into its rows:
        # three rows without a leading feature test, four rows with one.
        rows = label[1:-1].split('<br/>')
        if len(rows) == 3:
            sample_size, _values, _class_label = rows
        else:
            _feature_name, sample_size, _values, _class_label = rows
        n_samples = float(sample_size.split('=')[-1].strip())
        certainty = min(n_samples / MAX_N_SAMPLES_CONFIDENCE, 1.0)

        node.set_label(
            parse_node_label(label, class_label_mapper, switch_mapper))

        # The last color channel carries the certainty; every channel is then
        # squared before conversion to a hex color.
        rgba = np.array(reference_colormap(1.0))
        rgba[-1] = certainty
        rgba *= rgba
        node.set_fillcolor(convert_rgba2hex((rgba * 255).astype(int)))

    png_binary = graph.create_png()
    with open(outputfile, 'wb') as outpng:
        outpng.write(png_binary)
Ejemplo n.º 51
0
def test_graphviz_toy():
    """Check the exact DOT text produced by ``export_graphviz`` on toy data.

    Each section fits or reuses a small tree, exports it into an in-memory
    buffer with a given set of options and compares the output with a
    hard-coded expected string. ``X``, ``y``, ``y2`` and ``w`` are not defined
    in this function and are expected to come from the enclosing module.
    """
    # Check correctness of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3,
                                 min_samples_split=1,
                                 criterion="gini",
                                 random_state=2)
    clf.fit(X, y)

    # Test export code
    out = StringIO()
    export_graphviz(clf, out_file=out)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test with feature_names
    out = StringIO()
    export_graphviz(clf, out_file=out, feature_names=["feature0", "feature1"])
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="feature0 <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test with class_names
    out = StringIO()
    export_graphviz(clf, out_file=out, class_names=["yes", "no"])
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]\\nclass = yes"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n' \
                'class = yes"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n' \
                'class = no"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test plot_options
    out = StringIO()
    export_graphviz(clf, out_file=out, filled=True, impurity=False,
                    proportion=True, special_characters=True, rounded=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled, rounded", color="black", ' \
                'fontname=helvetica] ;\n' \
                'edge [fontname=helvetica] ;\n' \
                '0 [label=<X<SUB>0</SUB> &le; 0.0<br/>samples = 100.0%<br/>' \
                'value = [0.5, 0.5]>, fillcolor="#e5813900"] ;\n' \
                '1 [label=<samples = 50.0%<br/>value = [1.0, 0.0]>, ' \
                'fillcolor="#e58139ff"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label=<samples = 50.0%<br/>value = [0.0, 1.0]>, ' \
                'fillcolor="#399de5ff"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test max_depth
    # With max_depth=0 only the root is detailed; children are "(...)" nodes.
    out = StringIO()
    export_graphviz(clf, out_file=out, max_depth=0, class_names=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]\\nclass = y[0]"] ;\n' \
                '1 [label="(...)"] ;\n' \
                '0 -> 1 ;\n' \
                '2 [label="(...)"] ;\n' \
                '0 -> 2 ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test max_depth with plot_options
    out = StringIO()
    export_graphviz(clf, out_file=out, max_depth=0, filled=True,
                    node_ids=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled", color="black"] ;\n' \
                '0 [label="node #0\\nX[0] <= 0.0\\ngini = 0.5\\n' \
                'samples = 6\\nvalue = [3, 3]", fillcolor="#e5813900"] ;\n' \
                '1 [label="(...)", fillcolor="#C0C0C0"] ;\n' \
                '0 -> 1 ;\n' \
                '2 [label="(...)", fillcolor="#C0C0C0"] ;\n' \
                '0 -> 2 ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test multi-output with weighted samples
    # A new, shallower classifier is fitted on y2 with sample weights w.
    clf = DecisionTreeClassifier(max_depth=2,
                                 min_samples_split=1,
                                 criterion="gini",
                                 random_state=2)
    clf = clf.fit(X, y2, sample_weight=w)

    out = StringIO()
    export_graphviz(clf, out_file=out, filled=True, impurity=False)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled", color="black"] ;\n' \
                '0 [label="X[0] <= 0.0\\nsamples = 6\\n' \
                'value = [[3.0, 1.5, 0.0]\\n' \
                '[3.0, 1.0, 0.5]]", fillcolor="#e5813900"] ;\n' \
                '1 [label="samples = 3\\nvalue = [[3, 0, 0]\\n' \
                '[3, 0, 0]]", fillcolor="#e58139ff"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="X[0] <= 1.5\\nsamples = 3\\n' \
                'value = [[0.0, 1.5, 0.0]\\n' \
                '[0.0, 1.0, 0.5]]", fillcolor="#e5813986"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '3 [label="samples = 2\\nvalue = [[0, 1, 0]\\n' \
                '[0, 1, 0]]", fillcolor="#e58139ff"] ;\n' \
                '2 -> 3 ;\n' \
                '4 [label="samples = 1\\nvalue = [[0.0, 0.5, 0.0]\\n' \
                '[0.0, 0.0, 0.5]]", fillcolor="#e58139ff"] ;\n' \
                '2 -> 4 ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test regression output with plot_options
    clf = DecisionTreeRegressor(max_depth=3,
                                min_samples_split=1,
                                criterion="mse",
                                random_state=2)
    clf.fit(X, y)

    out = StringIO()
    export_graphviz(clf, out_file=out, filled=True, leaves_parallel=True,
                    rotate=True, rounded=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled, rounded", color="black", ' \
                'fontname=helvetica] ;\n' \
                'graph [ranksep=equally, splines=polyline] ;\n' \
                'edge [fontname=helvetica] ;\n' \
                'rankdir=LR ;\n' \
                '0 [label="X[0] <= 0.0\\nmse = 1.0\\nsamples = 6\\n' \
                'value = 0.0", fillcolor="#e5813980"] ;\n' \
                '1 [label="mse = 0.0\\nsamples = 3\\nvalue = -1.0", ' \
                'fillcolor="#e5813900"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="mse = 0.0\\nsamples = 3\\nvalue = 1.0", ' \
                'fillcolor="#e58139ff"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="False"] ;\n' \
                '{rank=same ; 0} ;\n' \
                '{rank=same ; 1; 2} ;\n' \
                '}'

    assert_equal(contents1, contents2)
Ejemplo n.º 52
0
def writeTree(treeModel, namesList, filename):
    """Plot a decision tree and save it to a PDF file.

    :param treeModel: tree estimator handed to ``export_graphviz``
    :param namesList: feature names used to label the split nodes
    :param filename: path of the PDF file to write
    """
    dot_data = StringIO()
    export_graphviz(treeModel, out_file=dot_data, feature_names=namesList)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs, older releases a single graph;
    # accept both so the function works regardless of the installed version.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf(filename)
Ejemplo n.º 53
0
def decision_tree(df_sub,df_train,df_test):
    """Fit decision trees on the given frames and save diagnostic plots.

    Steps, in order:

    1. Fit one tree (``max_depth=5``, ``min_samples_leaf=10``) on the training
       frame and print train/test accuracy plus a confusion table.
    2. Pool the train and test rows and plot test vs. train accuracy for a
       range of ``test_size`` fractions.
    3. Plot test accuracy and fit time for a range of ``max_depth`` values.
    4. Export the tree fitted last in step 3 to a PNG via graphviz.

    :param df_sub: frame whose ``Survived`` column supplies the test labels
    :param df_train: training frame, including the ``Survived`` column
    :param df_test: test frame; the identifier/text columns are dropped
    :return: None. Figures are written below ``Titanic/`` and shown.
    """
    non_feature_cols=['Name','Sex','Ticket','Cabin','PassengerId','Embarked']

    X_train=df_train.drop(['Survived']+non_feature_cols,axis=1)
    X_test=df_test.drop(non_feature_cols,axis=1)
    y_train=df_train['Survived']
    y_test=df_sub['Survived']

    print(X_train.head())
    print(X_test.head())

    #Create Decision Tree classifier object
    clf = DecisionTreeClassifier(max_depth=5,min_samples_leaf=10)

    #Train Decision Tree Classifier
    clf = clf.fit(X_train, y_train)

    y_pred = clf.predict(X_train)
    print('Training Accuracy: ',metrics.accuracy_score(y_train,y_pred))

    y_pred = clf.predict(X_test)

    #Model Accuracy, how often is the classifier correct?
    print("Test Accuracy:",metrics.accuracy_score(y_test,y_pred))

    print(pd.crosstab(y_test,y_pred,rownames=['True'],colnames=['Predicted'],margins=True))

    #PREP DATA FOR LOOPING: pool train and test rows so they can be re-split.
    #pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    X=pd.concat([pd.DataFrame(X_train),pd.DataFrame(X_test)])
    y=pd.concat([pd.DataFrame(y_train),pd.DataFrame(y_test)])

    #LEARNING CURVE: LOOP FOR DIFFERENT TEST SPLIT FRACTIONS
    n_range=[0.01,0.02,0.1,0.25,0.4,0.5,0.6,0.75,0.9,0.98,0.99]
    scores_list=[]
    train_scores_list=[]
    for n_size in n_range:
        print('n_range',n_size)
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=n_size, random_state=10)
        clf.fit(X_train,y_train)
        y_pred=clf.predict(X_test)
        scores_list.append(metrics.accuracy_score(y_test,y_pred))
        #Train scores, for learning curves
        y_pred_train=clf.predict(X_train)
        train_scores_list.append(metrics.accuracy_score(y_train,y_pred_train))

    print("TRAINING SIZE")
    print('scores_lis',scores_list)
    #String labels make matplotlib space the ticks evenly (categorical axis).
    split_labels=[str(n_size) for n_size in n_range]
    plt.plot(split_labels,scores_list,split_labels,train_scores_list)
    plt.title('Test Accuracy v. Train Accuracy, Decision Trees (Titanic)')
    plt.legend(['Test','Train'])
    plt.xlabel('Test Split')
    plt.ylabel('Accuracy')
    plt.ylim((0,1.0))
    #NOTE(review): the 'mlp_' prefix looks copied from another model's script;
    #left unchanged because other tooling may expect this file name.
    plt.savefig('Titanic/mlp_Titanic_testSize.png')
    plt.show()

    #LEARNING CURVE: LOOP FOR DIFFERENT Max Depths
    l_range=[1,5,10,15,20]
    learning_list=[]
    time_list=[]
    #The split uses a fixed seed and size, so it is the same for every depth:
    #compute it once instead of once per iteration.
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3, random_state=10)
    for l_rate in l_range:
        print('l_rate',l_rate)
        clf = DecisionTreeClassifier(max_depth=l_rate, min_samples_leaf=10)
        start_time=time.time()
        clf.fit(X_train,y_train)
        time_list.append(time.time()-start_time)
        y_pred=clf.predict(X_test)
        learning_list.append(metrics.accuracy_score(y_test,y_pred))

    print('LEARNING RATE')
    print('scores_lis',learning_list)
    depth_labels=[str(depth) for depth in l_range]
    plt.plot(depth_labels,learning_list)
    plt.title('Decision Tree Accuracy at Varying Max Depths (Titanic)')
    plt.xlabel('Max Depth')
    plt.ylabel('Testing Accuracy')
    plt.ylim((0,1.0))
    plt.savefig('Titanic/dt_Titanic_learningRate.png')
    plt.show()

    plt.plot(depth_labels,time_list)
    plt.xlabel('Max Depth')
    plt.ylabel('Training Time (sec)')
    plt.title('Decision Tree Training Time at Varying Max Depths (Titanic)')
    plt.savefig('Titanic/dt_Titanic_trainingTime.png')
    plt.show()

    #Export the tree fitted last in the loop above (the deepest setting).
    #io.StringIO replaces sklearn.externals.six.StringIO, which newer
    #scikit-learn releases no longer ship.
    from io import StringIO
    import pydotplus

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data, filled=True, rounded=True,
                         special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png('Titanic/decisionTree_Titanic.png')

    print('after graphviz')
Ejemplo n.º 54
0
def classify(**args):
    """
    Prepare the datasets, fit a decision tree, predict on the test set and report.

    :param args: keyword arguments passed from cli parser
    :return: balanced accuracy score of the test-set predictions
    """
    # print-outs are suppressed as soon as the run is repeated
    allow_print = args['repetitions'] == 1
    # resolve what is being classified; needed to build the datasets
    cls_target, cls_str = set_classification_targets(args['cls_choice'])

    train_set = prepare_dataset(0,  # any synthetic
                                cls_target,
                                args['batch_size'],
                                train_shuffle_repeat=False,
                                categorical_labels=False)

    task_banner = '\n\tTask: Classify «{}» using «{}» with DecisionTreeClassifier\n'
    print(task_banner.format(cls_str, train_set['data_str']))
    print_dataset_info(train_set)

    model = DecisionTreeClassifier(class_weight='balanced')

    # drain the training generator into one flat list of samples (RAM heavy)
    train_data = []
    for batch in tqdm(train_set['train_data'],
                      total=train_set['train_steps'],
                      desc='prep_train'):
        train_data.extend(batch[0])
    model.fit(train_data, train_set['train_labels'])
    del train_data

    # second dataset provides the test split used for the diagnosis below
    test_set = prepare_dataset(2,  # any handheld
                               cls_target,
                               args['batch_size'],
                               train_shuffle_repeat=False,
                               categorical_labels=False)
    test_data = []
    for batch in tqdm(test_set['test_data'],
                      total=test_set['test_steps'],
                      desc='prep_test'):
        test_data.extend(batch[0])
    print_dataset_info(test_set)
    pred = model.predict(test_data)
    del test_data

    if allow_print:
        # visualise decision tree, from datacamp.com/community/tutorials/decision-tree-classification-python
        dot_buffer = StringIO()
        export_graphviz(model,
                        out_file=dot_buffer,
                        filled=True,
                        rounded=True,
                        special_characters=True)
        pydotplus.graph_from_dot_data(
            dot_buffer.getvalue()).write_pdf('img/decision_tree.pdf')

        diagnose_output(test_set['test_labels'], pred,
                        test_set['classes_trans'])

    return balanced_accuracy_score(test_set['test_labels'], pred)
Ejemplo n.º 55
0
# Hold-out rows selected by test_idx, used to sanity-check the classifier.
test_target = iris.target[test_idx]
##
test_data = iris.data[test_idx]

# 2. Train a classifier
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)

# 3. Predict label for new flower
print(test_target)  # [0,1,2]
print(clf.predict(test_data))  # expected to print the same labels [0,1,2]

# 4. Visualize the tree
##
# io.StringIO replaces sklearn.externals.six.StringIO, which newer
# scikit-learn releases no longer ship.
from io import StringIO
import pydot
dot_data = StringIO()
tree.export_graphviz(clf,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True,
                     rounded=True,
                     impurity=False)

print("should export the tree.dot")

# Render the DOT source with the graphviz package and open the result.
import graphviz as gp
graph = gp.Source(dot_data.getvalue())
graph.render("iris", view=True)
Ejemplo n.º 56
0
# Append the Graphviz bin directory to PATH for this session.
os.environ["PATH"] += os.pathsep + 'c:/Program Files (x86)/Graphviz2.38/bin/'
#%%
# Filled tree with plain '0'/'1' class labels, displayed as SVG.
graph1 = Source(
    tree.export_graphviz(clf, out_file=None, class_names=['0', '1'],
                         filled=True)
)
display(SVG(graph1.pipe(format='svg')))
# Same tree, now with feature names and readable class labels.
graph2 = Source(
    tree.export_graphviz(
        clf,
        out_file=None,
        feature_names=X.columns,
        filled=True,
        class_names=['NoDiabetis', 'Diabetis'],
    )
)
graph2
# Limit the drawing depth (try max_depth 1 to 4) with the display options set.
Source(
    tree.export_graphviz(
        clf,
        out_file=None,
        max_depth=1,
        feature_names=X.columns,
        class_names=['NonDB', 'DB'],
        label='all',
        filled=True,
        leaves_parallel=True,
        impurity=True,
        node_ids=True,
        proportion=True,
        rotate=True,
        rounded=True,
        special_characters=False,
        precision=1,
    )
)
# https://stackoverflow.com/questions/27817994/visualizing-decision-tree-in-scikit-learn
# Saving the image to the file system:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
import pydotplus
dotfile = StringIO()
tree.export_graphviz(
    clf,
    out_file=dotfile,
    filled=True,
    feature_names=X.columns,
    class_names=['NoDiabetis', 'Diabetis'],
)
pydotplus.graph_from_dot_data(dotfile.getvalue()).write_png("E:/graphs/dtree2.png")
# True should be returned; open the location above to see the file.

#%%%  Create Decision Tree classifier object
# max_depth can be changed here, at creation time.
# criterion: entropy or gini
# Build and train the classifier in one step (fit returns the estimator).
clf3 = DecisionTreeClassifier(criterion="entropy", max_depth=3).fit(X_train, y_train)
# Visualise
Source(
    tree.export_graphviz(
        clf3,
        out_file=None,
        class_names=['0', '1'],
        filled=True,
        feature_names=X.columns,
        node_ids=True,
    )
)
#display(SVG(graph3b.pipe(format='svg')))
X_train[0:1]
# Class 1: glucose > 127, glucose < 158, bmi, age,
# Predict the response for the test dataset
y_pred3 = clf3.predict(X_test)
Ejemplo n.º 57
0
    def show_tree(self):
        '''Return the tree rendered as PNG image data.

        The tree in ``self.clf`` is exported to DOT with
        ``self.feature_names`` as node labels and rendered through
        pydotplus, or pydot when pydotplus is not installed.

        Raises
        ------
        AssertionError
            if ``self.clf`` is falsy
        '''
        assert self.clf
        try:
            import pydotplus as pydot
        except ImportError:
            import pydot # dirty hack for read the docs

        dot_data = StringIO()
        tree.export_graphviz(self.clf, out_file=dot_data,
                             feature_names=self.feature_names)
        graph = pydot.graph_from_dot_data(dot_data.getvalue())
        # pydot >= 1.2 returns a list of graphs, whereas pydotplus returns a
        # single graph; indexing unconditionally breaks the pydotplus path.
        if isinstance(graph, (list, tuple)):
            graph = graph[0]
        return graph.create_png()

       
# if __name__ == '__main__':
#     from test import test_utilities
#     import matplotlib.pyplot as plt
# 
#     ema_logging.log_to_stderr(ema_logging.INFO)
# 
#     def scarcity_classify(outcomes):
#         outcome = outcomes['relative market price']
#         change = np.abs(outcome[:, 1::]-outcome[:, 0:-1])
#         
#         neg_change = np.min(change, axis=1)
#         pos_change = np.max(change, axis=1)
#         
#         logical = (neg_change > -0.6) & (pos_change > 0.6)
#         
#         classes = np.zeros(outcome.shape[0])
#         classes[logical] = 1
#         
#         return classes
#  
#     results = test_utilities.load_scarcity_data()
#     
#     cart = setup_cart(results, scarcity_classify)
#     cart.build_tree()
#     
#     print(cart.boxes_to_dataframe())
#     print(cart.stats_to_dataframe())
#     cart.display_boxes(together=True)
#     
#     img = cart.show_tree()
#      
#     import matplotlib.pyplot as plt
#     import matplotlib.image as mpimg
#   
#     # treat the dot output string as an image file
#     sio = StringIO()
#     sio.write(img)
#     sio.seek(0)
#     img = mpimg.imread(sio)
#       
#     # plot the image
#     imgplot = plt.imshow(img, aspect='equal')
#       
#     plt.show()
def draw_tree(model, name):
    """Export ``model`` to DOT and write the rendered graph to ``name + ".pdf"``."""
    buffer = StringIO()
    _tree.export_graphviz(model, out_file=buffer)
    dot_source = buffer.getvalue()
    pydotplus.graph_from_dot_data(dot_source).write_pdf(name + ".pdf")
Ejemplo n.º 59
0
def run_model(df, vectorizer, classifier):
    """Vectorize the text, tune and fit the requested classifier, report metrics.

    :param df: frame with a ``Cleaned`` column (input text) and a ``Class``
        column (labels)
    :param vectorizer: ``"count"`` or ``"tfidf"``; any other value is used
        directly as the vectorizer object
    :param classifier: one of ``"naive_bayes"``, ``"decision_tree"``,
        ``"random_forest"``, ``"logistic_regression"``, ``"linear_svm"``,
        ``"nonlinear_svm"``, ``"knn"`` or ``"mlp"``; any other value is used
        directly as the estimator object
    :return: tuple ``(vectorizer, classifier, cm)`` holding the fitted
        vectorizer, the fitted estimator and the test-set confusion matrix
    """
    # ``classifier`` is rebound to an estimator object below. Keep the
    # requested name so the model-specific reporting after training can still
    # tell which model was asked for.
    classifier_name = classifier if isinstance(classifier, str) else None

    # load data
    x = df['Cleaned'].values
    y = df['Class'].values

    # split dataset into training and test sets, with 80:20 split
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1000,
                                                        stratify=y)

    if vectorizer == "count":
        vectorizer = CountVectorizer()
    elif vectorizer == "tfidf":
        vectorizer = TfidfVectorizer()

    vectorizer.fit(x_train)

    X_train = vectorizer.transform(x_train)
    X_test = vectorizer.transform(x_test)

    if classifier_name == "naive_bayes":
        classifier = MultinomialNB()

    elif classifier_name == "decision_tree":
        # manual search tried, but default hyperparameters were best
        classifier = DecisionTreeClassifier()

    elif classifier_name == "random_forest":
        # define random search space based on decision tree depth
        hyp = {
            "n_estimators": [50, 100, 150,
                             200],  # number of trees in the forest
            "max_depth": [40, 50, None],  # max depth of tree
            "max_features": [10, 20, 'sqrt', None],
            # an integer min_samples_split must be at least 2, so the range
            # starts at 2 (upper bound 11 kept from the original search space)
            "min_samples_split": randint(2, 11),
            "bootstrap": [True, False],  # to use bagging or not
            "criterion": ["gini", "entropy"]  # gini impurity or information gain
        }

        # random search over 5-fold cross validation (stratified k-fold by default)
        random_search = RandomizedSearchCV(RandomForestClassifier(),
                                           hyp,
                                           random_state=1,
                                           n_iter=100,
                                           cv=5,
                                           verbose=1,
                                           n_jobs=-1)
        best = random_search.fit(X_train, y_train).best_params_

        print("Random search results: ")
        print("Best n_estimators: ", best['n_estimators'])
        print("Best max_depth: ", best['max_depth'])
        print("Best max_features:", best['max_features'])
        print("Best min_samples_split:", best['min_samples_split'])
        print("Best bootstrap:", best['bootstrap'])
        print("Best criterion:", best['criterion'])

        # set the classifier to the one with best hyperparameters from random search
        classifier = RandomForestClassifier(**best)

    elif classifier_name == "logistic_regression":
        # by a manual search the lbfgs solver showed best results
        # number of max iterations is increased to allow lbfgs solver to converge
        # compare loss functions over 5-fold cross validation
        ovr_clf = LogisticRegression(multi_class='ovr',
                                     solver='lbfgs',
                                     max_iter=1000)
        ovr_score = cross_val_score(ovr_clf, X_train, y_train, cv=5).mean()

        mce_clf = LogisticRegression(multi_class='multinomial',
                                     solver='lbfgs',
                                     max_iter=1000)
        mce_score = cross_val_score(mce_clf, X_train, y_train, cv=5).mean()

        # choose the better performing hyperparameters; cross_val_score works
        # on clones, so both candidates are still unfitted here
        classifier = ovr_clf if ovr_score > mce_score else mce_clf

    elif classifier_name == "linear_svm":
        hyp = {
            "loss": ['hinge', 'squared_hinge'],
            "multi_class": ['ovr', 'crammer_singer']
        }

        random_search = RandomizedSearchCV(svm.LinearSVC(max_iter=1000),
                                           hyp,
                                           random_state=1,
                                           n_iter=20,
                                           cv=5,
                                           verbose=1,
                                           n_jobs=-1)
        best = random_search.fit(X_train, y_train).best_params_

        print("Best loss: ", best['loss'])
        print("Best multi_class:", best['multi_class'])

        classifier = svm.LinearSVC(loss=best['loss'],
                                   multi_class=best['multi_class'],
                                   max_iter=1000)

    elif classifier_name == "nonlinear_svm":
        hyp = {
            "gamma": ['auto', 'scale'],
            "kernel": ['poly', 'rbf', 'sigmoid']
        }

        random_search = RandomizedSearchCV(svm.SVC(),
                                           hyp,
                                           random_state=1,
                                           n_iter=20,
                                           cv=5,
                                           verbose=1,
                                           n_jobs=-1)
        best = random_search.fit(X_train, y_train).best_params_

        print("Best gamma: ", best['gamma'])
        print("Best kernel:", best['kernel'])

        classifier = svm.SVC(gamma=best['gamma'], kernel=best['kernel'])

    elif classifier_name == "knn":
        classifier = KNeighborsClassifier(
            n_neighbors=5)  # change k-value as needed

    elif classifier_name == "mlp":
        hyp = {
            "hidden_layer_sizes": [(64, ), (64, 64), (64, 64, 64), (128, ),
                                   (128, 128), (128, 128, 128),
                                   (256, 256, 256), (512, 512, 512)]
        }

        grid_search = GridSearchCV(MLPClassifier(), hyp, cv=5)
        best = grid_search.fit(X_train, y_train).best_params_

        print("Best hidden layer size:", best['hidden_layer_sizes'])

        classifier = MLPClassifier(
            hidden_layer_sizes=best['hidden_layer_sizes'],
            verbose=True)  # uses reLU, adam by default

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # print metrics
    print("\nClassification report summary:")
    print(
        classification_report(y_test,
                              y_pred,
                              labels=list(range(1, 21)),
                              digits=3))

    print("Accuracy:", classifier.score(X_test, y_test))
    print("Macro-F1:", f1_score(y_test, y_pred, average='macro'))

    # if decision tree or random forest, generates plot of tree
    if classifier_name in ("decision_tree", "random_forest"):

        # print 5 most important tokens:
        swapped_vocab = {
            index: token for token, index in vectorizer.vocabulary_.items()
        }
        print("5 most important tokens: ")
        for i in np.argsort(classifier.feature_importances_)[-5:][::-1]:
            print(swapped_vocab[i])

        import random
        from io import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus

        if classifier_name == "decision_tree":
            plotted_tree = classifier
            pdf_name = "decision_tree.pdf"
        else:
            # plot one randomly chosen tree; choosing from the fitted list
            # stays in range whatever n_estimators the search selected
            plotted_tree = random.choice(classifier.estimators_)
            pdf_name = "random_forest.pdf"

        dot_data = StringIO()
        export_graphviz(plotted_tree,
                        out_file=dot_data,
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        graph.write_pdf(pdf_name)

    # if logistic regression, plot most important terms
    if classifier_name == "logistic_regression":
        plot_lr_coef(classifier, vectorizer)

    # get confusion matrix for plot
    cm = confusion_matrix(y_test, y_pred)

    return vectorizer, classifier, cm
# Load the iris data and put features plus target into one frame.
iris = load_iris()
df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                     columns=iris['feature_names'] + ['target'])
# df['label'] = df.target.replace(dict(enumerate(df.target_names)))
print(df.head()) # to check the top results
print(iris.feature_names)
print(iris.target_names)
print(df.describe()) # to check the difference between minimum and maximum values
x = iris['data']
y = iris['target']

iris_df = pd.DataFrame(x, columns=iris['feature_names'])
# head must be called; printing the bare attribute shows the bound method
print(iris_df.head())
x, y = shuffle(x, y, random_state=0)  # random shuffle
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
classifier=DecisionTreeClassifier(criterion="entropy", max_depth=3) # entropy criterion, depth limited to 3
clf = classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) # show the accuracy in the console
dot_data = StringIO()
tree.export_graphviz(classifier,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True, rounded=True,
                     impurity=False,
                     proportion=True)
graph=pydot.graph_from_dot_data(dot_data.getvalue()) # parse the DOT text; indexed as a list below
graph[0].write_pdf("iris3.pdf") # write the first graph to a PDF file