def applyDecisionTree(trainData, trainTargets, testData, testTargets, featureNames):
    """Train a decision tree, export it as Graphviz dot, and report test performance.

    The fitted tree (truncated to depth 5 in the export) is written to
    ``DecisionTree.dot`` in the current working directory. Predictions for
    ``testData`` are then handed to ``showPerformance`` with ``testTargets``.

    Args:
        trainData: Training samples, one row per sample.
        trainTargets: Class label for each training sample.
        testData: Test samples, one row per sample.
        testTargets: True class label for each test sample.
        featureNames: Names used to label the split nodes in the export.
    """
    model = DecisionTreeClassifier().fit(trainData, trainTargets)

    print("Feature names:", featureNames)
    # Only the on-disk dot file is kept; the former in-memory export and pydot
    # parse produced a graph object that was never used.
    export_graphviz(model, out_file="DecisionTree.dot",
                    feature_names=featureNames, max_depth=5)

    # Predict the whole test set in one call: predict() expects a 2-D array
    # (one row per sample), so feeding it single 1-D samples is not portable.
    # An empty test set yields an empty prediction list, as before.
    classification = list(model.predict(testData)) if len(testData) else []

    print("\nUsing a Decision Tree:")
    showPerformance(testTargets, classification)
def export_tree(forest):
    """Write every estimator of a fitted ensemble to ``trees/tree_<index>.dot``.

    Node labels come from ``utils.heatmap_feature_names`` with its last entry
    left out.
    """
    for index, estimator in enumerate(forest.estimators_):
        dot_path = 'trees/tree_{}.dot'.format(index)
        with open(dot_path, 'w') as dot_file:
            all_names = utils.heatmap_feature_names
            tree.export_graphviz(estimator, out_file=dot_file,
                                 feature_names=all_names[:-1])
Example #3
0
def visualize_tree(clf, outname, headers):
    """Render a fitted decision tree to a PDF file.

    Args:
        clf: Fitted tree estimator handed to ``tree.export_graphviz``.
        outname: Path of the PDF to write.
        headers: Iterable of feature names; converted to a list for the export.
    """
    from sklearn.externals.six import StringIO
    import pydot
    buf = StringIO()
    tree.export_graphviz(clf, out_file=buf, feature_names=list(headers))
    # Re-encode the dot source from Latin-1 to UTF-8 before handing it to pydot.
    dot_source = buf.getvalue()
    utf8_source = dot_source.decode('latin1').encode('utf8')
    pydot.graph_from_dot_data(utf8_source).write_pdf(outname)
Example #4
0
def create_tree(args):
    """Fit an entropy-based decision tree on a CSV file and optionally render it.

    Args:
        args: Mapping of command-line options. Keys read here:
            ``<input>`` (CSV path), ``<class>`` (target column name),
            ``--verbose`` (flag), ``--output`` (basename for the ``.dot`` and
            ``.png`` files; falsy to skip the export), ``--except``
            (comma-separated column names to exclude; falsy for none) and
            ``--samples`` (converted to int for ``min_samples_split``).

    Exits the process with status 1 when the class column is missing from the
    CSV or when the Graphviz ``dot`` command cannot be run successfully.
    """
    # load
    df = pd.read_csv(args['<input>'])
    class_attr = args['<class>']

    # check
    if class_attr not in df.columns:
        print('Class attribute "{}" not in dataset!'.format(class_attr))
        sys.exit(1)

    # flags & options
    verbose = bool(args['--verbose'])
    output = args['--output'] or None
    exceptions = args['--except'].split(',') if args['--except'] else None
    samples = int(args['--samples'])

    # get numeric columns and drop class and exceptions (our features)
    features = df._get_numeric_data().columns.difference([class_attr])
    if exceptions:
        if verbose:
            print('Removing the following: {}'.format(', '.join(exceptions)))
        features = features.difference(exceptions)

    # verbose detail
    if verbose:
        print('Using the following features: {}'.format(', '.join(features)))

    # create tree
    dt = DecisionTreeClassifier(
            min_samples_split=samples,
            criterion='entropy',
            splitter='best',
            random_state=99)
    dt.fit(df[features], df[class_attr])

    # export graph of tree if graph output specified
    if output:
        import subprocess
        if verbose:
            print('Saving as {0}.dot and {0}.png'.format(output))

        dot_path = output + '.dot'
        png_path = output + '.png'

        # save dot
        with open(dot_path, 'w') as f:
            export_graphviz(dt, out_file=f, feature_names=features)

        # Pass the arguments as a list so an output path containing spaces is
        # not broken apart the way a formatted-then-split string would be.
        command = ['dot', '-Tpng', dot_path, '-o', png_path]
        try:
            subprocess.check_call(command)
        except (subprocess.CalledProcessError, OSError):
            # CalledProcessError: dot ran but failed; OSError: dot not found.
            # A bare except would also swallow KeyboardInterrupt/SystemExit.
            print('Problem creating graphviz image, is graphviz installed?')
            sys.exit(1)
Example #5
0
def main():
    """Train an entropy decision tree on iris, print its metrics, export it.

    Prints the number of samples, the test accuracy and the confusion matrix,
    then writes the fitted tree to ``./data/tree.dot``.
    """
    # Pre-processing: load the iris data set.
    from sklearn.datasets import load_iris
    iris = load_iris()
    print(len(iris['data']))

    # sklearn.cross_validation was removed from scikit-learn; prefer the
    # model_selection module and fall back for old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    # test_size=0.2 holds out 20% of the samples; random_state=1 makes the
    # random split reproducible.
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)

    # Model
    from sklearn import tree
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf.fit(train_data, train_target)
    y_pred = clf.predict(test_data)

    # Verify: accuracy and confusion matrix.
    from sklearn import metrics
    print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
    # NOTE(review): per the scikit-learn docs, rows are true classes and
    # columns are predicted classes; confirm before reading off-diagonals.
    print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))

    with open('./data/tree.dot', 'w') as fw:
        tree.export_graphviz(clf, out_file=fw)
Example #6
0
def drawDecisionTree(dt, filename, featureNames, classNames):
    """Render a fitted decision tree to a PNG file.

    Args:
        dt: Fitted tree estimator handed to ``tree.export_graphviz``.
        filename: Path of the PNG to write.
        featureNames: Feature labels for the split nodes (also printed).
        classNames: Class labels for the nodes (also printed).
    """
    dot_data = StringIO()
    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the print statement is a syntax error on Python 3.
    print(featureNames)
    print(classNames)
    tree.export_graphviz(dt, out_file=dot_data, feature_names=featureNames,
                         class_names=classNames, rounded=True,
                         special_characters=True, filled=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(filename)
Example #7
0
def create_tree(X, Y):
    """Fit an entropy decision tree, save it as ``Tree.pdf``, and return it.

    Args:
        X: Training samples; the export labels the columns with the eight
            hard-coded feature names below.
        Y: Target label for each sample.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    # The IPython import that used to live here was never used; dropping it
    # removes a needless hard dependency for callers without IPython.
    import pydotplus

    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf = clf.fit(X, Y)

    feature_names = ["Gender", "0-5", "6-12", "13-19", "20-27", "28-35", "36-50", "55+"]
    # One "Ad #i" label per entry of Y (numbered from 1).
    target_names = ["Ad #" + str(i) for i in range(1, len(Y) + 1)]

    dot_data = StringIO()
    tree.export_graphviz(
        clf,
        out_file=dot_data,
        feature_names=feature_names,
        class_names=target_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )

    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("Tree.pdf")

    return clf
Example #8
0
def main():
    """Fit a tree on simulated game data and check its profit predictions.

    Trains on the first two fields of each ``run_game`` row (labelled
    ``coin`` and ``bet`` in the export) against the third field, writes the
    tree to ``tree.dot``, then prints 100 repeated predictions and their
    accuracy for each of three fixed inputs.
    """
    data = run_game()

    clf = DecisionTreeClassifier(criterion='entropy')

    game_data = [[i[0], i[1]] for i in data]
    profits = [i[2] for i in data]

    clf.fit(game_data, profits)

    with open('tree.dot', 'w') as dotfile:
        export_graphviz(
            clf,
            dotfile,
            feature_names=['coin', 'bet']
        )

    # predict() expects a 2-D array (one row per sample), so each fixed input
    # is wrapped in a list; range() works on both Python 2 and 3.
    predictions_lose1 = [clf.predict([[0, 0]]) for _ in range(100)]
    predictions_lose2 = [clf.predict([[0, 1]]) for _ in range(100)]
    predictions_win = [clf.predict([[1, 1]]) for _ in range(100)]

    # The %-formatted print(...) calls emit the same text as the former
    # Python 2 print statements and also run on Python 3.
    print('All these profit predictions should be zero:')
    print(predictions_lose1)
    print('Accuracy was %s' % (calculate_accuracy(predictions_lose1, np.array([0])),))

    print('All these profit predictions should be zero:')
    print(predictions_lose2)
    print('Accuracy was %s' % (calculate_accuracy(predictions_lose2, np.array([0])),))

    print('All these profit predictions should be two:')
    print(predictions_win)
    print('Accuracy was %s' % (calculate_accuracy(predictions_win, np.array([2])),))
Example #9
0
def tree(labels,X,df,i):
  """Fit a depth-4 tree, export it to dot, and return its feature importances.

  Args:
    labels: Target values passed to ``fit``.
    X: Training samples passed to ``fit``.
    df: Object whose ``columns`` label the features in the export.
    i: Value stringified into the output name ``<OUTPUT_DIRECTORY><i>_tree.dot``.

  Returns:
    The fitted model's ``feature_importances_``.
  """
  # Use a distinct local name: the old local ``tree`` shadowed this function's
  # own name, and the unused ``get_params()`` result has been dropped.
  model = DT(max_depth = 4)
  model.fit(X,labels)
  export_graphviz(model, out_file = OUTPUT_DIRECTORY+str(i)+"_tree.dot", feature_names = df.columns)
  return model.feature_importances_
Example #10
0
def draw_tree(clf):
    """Export a fitted tree to ``tree.pdf`` through an in-memory dot buffer."""
    import pydot
    import StringIO
    buf = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=buf)
    dot_source = buf.getvalue()
    pydot.graph_from_dot_data(dot_source).write_pdf('tree.pdf')
def visualize_tree(dt, fv_columns, exclude_attrs, create_file=True):
    """Export a decision tree to ``dt_.dot`` and render ``dt_.png`` with Graphviz.

    Args:
        dt: A ``DTMatcher`` (its ``clf`` attribute is exported) or an object
            passed to ``export_graphviz`` directly.
        fv_columns: Feature-vector column labels; indexed with a boolean mask
            when ``exclude_attrs`` is given.
        exclude_attrs: Column names to leave out of the feature labels, or
            ``None`` to use ``fv_columns`` unchanged.
        create_file: Not read by this function; kept for interface
            compatibility.
    """
    clf = dt.clf if isinstance(dt, DTMatcher) else dt

    if exclude_attrs is None:
        feature_names = fv_columns
    else:
        keep = [c not in exclude_attrs for c in fv_columns]
        feature_names = fv_columns[keep]

    with open("dt_.dot", 'w') as f:
        export_graphviz(clf, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt_.dot", "-o", "dt_.png"]
    try:
        subprocess.check_call(command)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: dot ran but failed; OSError: dot not found.
        # A bare except would also swallow KeyboardInterrupt/SystemExit.
        logger.error("Could not run dot, ie graphviz, to "
             "produce visualization")
        return
    print("Execute the following command in IPython command prompt:")
    print("")
    print("from IPython.display import Image")
    print("Image(filename='dt_.png') ")
    def train_network(self):
        """Fit the decision tree on a training batch and write ``hitter_tree.pdf``.

        Queries ``PregameHitterGameEntry`` rows, splits them with
        ``get_train_eval_data`` (argument 0.8), fits ``self._decision_tree`` on
        a batch of ``SIZE_TRAINING_BATCH`` samples and renders the fitted tree
        to PDF. It then collects, for every evaluation entry, its input vector
        and the matching postgame DraftKings points, skipping entries without
        a postgame row, and finally closes the database session.

        NOTE(review): the collected evaluation lists are not used within the
        visible code - confirm whether the evaluation step is missing.
        """
        db_query = self._database_session.query(PregameHitterGameEntry)
        mlb_training_data, mlb_evaluation_data = self.get_train_eval_data(db_query, 0.8)
        X_train, Y_train = self.get_stochastic_batch(mlb_training_data, self.SIZE_TRAINING_BATCH)
        self._decision_tree.fit(X_train, Y_train)
        dot_data = StringIO()
        tree.export_graphviz(self._decision_tree, out_file=dot_data,
                             feature_names=PregameHitterGameEntry.get_input_vector_labels())
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        graph.write_pdf("hitter_tree.pdf")
        x_test_actual = list()
        y_test_actual = list()
        for data in mlb_evaluation_data:
            try:
                postgame_entry = self._database_session.query(PostgameHitterGameEntry).filter(PostgameHitterGameEntry.rotowire_id == data.rotowire_id,
                                                                                              PostgameHitterGameEntry.game_date == data.game_date).one()
                y_test_actual.append([postgame_entry.actual_draftkings_points])
                x_test_actual.append(data.to_input_vector())
            except NoResultFound:
                # print(...) with one pre-formatted string is valid on both
                # Python 2 and Python 3.
                print("Ignoring hitter %s since his postgame stats were not found." % data.rotowire_id)

        self._database_session.close()
Example #13
0
def learn_dtree(data, csvfile):
    """Summarise coverage per operator, plot it, and fit a depth-4 decision tree.

    Args:
        data: DataFrame with at least the columns ``operator``, ``method``,
            ``class``, ``testId`` and ``isCovered``. It is modified in place:
            the factorized columns ``op``, ``m`` and ``c`` are added.
        csvfile: Destination for the per-operator aggregate table.

    Returns:
        The Graphviz dot source of the fitted tree as a string.
    """
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)

    # Per-operator statistics of the 'isCovered' column, saved to csvfile.
    k = data.groupby(['operator'])
    f = k['isCovered'].agg({'mean_kill': np.mean, 'number of mutants': len, 'number of killed':np.sum})
    f.to_csv(csvfile)

    plt.figure()
    # The aggregate columns are named by the keys of the mapping above, so the
    # former lookups f['mean'] / f['sum'] did not exist.
    # NOTE(review): 'sum' is mapped to the np.sum column ('number of killed');
    # the y-label says 'mutant_size' - confirm which column was intended.
    plt.scatter(standardize(f['mean_kill']), f['number of killed'])
    plt.ylabel('mutant_size')
    plt.xlabel('expected_kill (standatdize)')
    plt.show()

    # Integer-encode the categorical columns used as tree features.
    data['op'] = pd.factorize(data['operator'])[0]
    data['m'] = pd.factorize(data['method'])[0]
    # Was assigned to the undefined name ``HLdata``; the feature matrix below
    # reads column 'c' from ``data``.
    data['c'] = pd.factorize(data['class'])[0]

    plt.close()
    x = data[['op', 'c', 'testId']].values
    y = data['isCovered'].values
    clf.fit(x,y)
    dot_data = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    return dot_data.getvalue()
def visualize_tree(dtree):
    """Show a fitted decision tree as a PNG via ``display(Image(...))``."""
    buf = StringIO()
    style = dict(filled=True, rounded=True, special_characters=True)
    tree.export_graphviz(dtree, out_file=buf, **style)
    png = pydot.graph_from_dot_data(buf.getvalue()).create_png()
    display(Image(png))
def generate_plot(clf):
    """Export a fitted tree to ``weather_forecast.pdf``, printing progress."""
    # Single-string print(...) calls give identical output on Python 2 and
    # also run on Python 3, unlike the print statement.
    print("\nGenerating plot...")
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("weather_forecast.pdf")
    print("Plot generated!")
def tree2():
    """Build a decision tree from the posted form and return an HTML page.

    On POST, every form value other than ``"Build Tree"`` and the value of the
    ``DV`` field is used as a feature column of the global ``df``; the ``DV``
    column is the target. The page is stored in the global ``final_html`` and
    returned. Any other request method returns ``'helloo'``.
    """
    global final_html
    global df,origin_df
    init_style_string = """<p style="position: absolute; font-size: 12px; top: <top>px; width: <width>px;  height: <height>px; left:<left>px; text-align: center;">tree_text_here</p>"""
    if request.method != 'POST':
        return 'helloo'

    Listkey1 = list(MultiDict(request.form).values())
    Listkey2 = MultiDict(request.form)
    DV_tree = Listkey2.get('DV')
    # ``!=`` replaces the Python 2-only ``<>`` operator (same semantics).
    chi_key = [key1 for key1 in Listkey1
               if key1 != "Build Tree" and key1 != DV_tree]
    df1 = df.loc[:,chi_key]
    df2 = df1.values
    Y = df[DV_tree]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(df2,Y.values)
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    k = dot_data.getvalue()
    left_px = 600
    width_px = 150
    top_px = 50
    height_px = 309
    # NOTE(review): the result of build_tree_html is not used; the page below
    # embeds the raw dot text ``k`` - confirm whether ``s`` was intended.
    s = build_tree_html(k,init_style_string,left_px,width_px,top_px,height_px)
    temp_df = df[0:15]
    t = """</div><div style="float:right;"><br> Decision Tree result <br>"""
    final_html = template.s1 + t + k + "</div><br><br><br>" + temp_df.to_html()
    return final_html
 def set_yaml_infos(self, data):
     """Build a training set from the received objects and fit ``self.clf``.

     Reads ``data.objects`` and ``data.task_name``. For every object its
     surrounding cuboid is stored in ``self.cubeized_objects`` together with
     its color, randomized samples are generated with ``randomize_objects``,
     and ``self.max_height`` is raised to the largest cuboid value seen.
     When the task name contains ``"task 3"``, 1000 random obstacles are
     added to the training data. With ``self.logging >= 3`` the fitted tree
     is also exported to ``tree.dot``.
     """
     if self.logging >= 3: print("data: %s"%data)
     if self.logging >= 2: print("data.task_name: %s"%data.task_name)
     objects = data.objects
     task_name = data.task_name
     self.original_objects = objects
     all_data = {}
     if self.logging >= 1: print(">>>> Receiving Objects from YAML")
     if self.logging >= 2: print("Objects Received from YAML: \r\n %s"%objects)
     # ``obj`` instead of ``object`` so the builtin is not shadowed.
     for obj in objects:
         name = obj.name
         cub_res = self.get_surrounding_cuboid(obj)
         self.cubeized_objects[name] = {'color': obj.color, 'dimensions': cub_res}
         # randomize_objects receives every object cubeized so far, not only
         # the current one.
         all_data[name] = self.randomize_objects(self.cubeized_objects, self.number_of_data,
                                                 self.size_tresh_perc,
                                                 self.color_tresh)
         self.max_height = max(self.max_height, max(cub_res))
     # Separate names keep the ``data`` parameter from being rebound.
     samples, labels = self.convert_to_dataset(all_data)
     if "task 3" in task_name:
         rnd_data, rnd_labels = self.create_random_obstacles(number=1000)
     else:
         rnd_data, rnd_labels = [],[]
     if self.logging >= 3: print("All Data + Labels \r\n %s  \r\n %s"
                                 %( (labels + rnd_labels), (samples+rnd_data)))
     self.clf.fit(samples+rnd_data, labels+rnd_labels)
     if self.logging >=3:
         tree.export_graphviz(self.clf, out_file='tree.dot', feature_names=['h', 's', 'v', 'site'])
         print("max height: %s\n"%self.max_height)
Example #18
0
def classifyTree(Xtr, ytr, Xte, yte, splitCriterion="gini", maxDepth=0, visualizeTree=False):
    """Classify data using a CART decision tree.

    Args:
        Xtr: Training samples.
        ytr: Training labels.
        Xte: Test samples.
        yte: Ground-truth labels for the test samples.
        splitCriterion: Split criterion passed to the classifier.
        maxDepth: Maximum tree depth; a falsy value (the default 0) means no
            depth limit.
        visualizeTree: When true, also save the learned tree as a PDF.

    Returns:
        Tuple ``(accuracyRate, timing, probabilities, predicted)``. If an
        error occurs it is logged and the values computed so far are returned
        (initially ``0.0``, ``0.0``, ``[]`` and ``[]``).
    """
    # Initialise every returned name before the try block so the error path
    # can no longer fail with an unbound ``predicted``.
    accuracyRate, probabilities, timing, predicted = 0.0, [], 0.0, []
    try:
        # scikit-learn rejects max_depth=0, which made the default unusable;
        # map it to None (unlimited depth).
        depth = maxDepth if maxDepth else None
        # Perform classification
        cartClassifier = tree.DecisionTreeClassifier(criterion=splitCriterion, max_depth=depth)
        startTime = time.time()
        prettyPrint("Training a CART tree for classification using \"%s\" and maximum depth of %s" % (splitCriterion, maxDepth), "debug")
        cartClassifier.fit(numpy.array(Xtr), numpy.array(ytr))
        prettyPrint("Submitting the test samples", "debug")
        predicted = cartClassifier.predict(Xte)
        endTime = time.time()
        # Compare the ground truth with the predictions
        accuracyRate = round(metrics.accuracy_score(yte, predicted), 2)
        # Also append the probability estimates
        probabilities.append(cartClassifier.predict_proba(Xte))
        timing = endTime-startTime # Keep track of performance
        if visualizeTree:
            # Visualize the tree
            dot_data = StringIO()
            tree.export_graphviz(cartClassifier, out_file=dot_data)
            graph = pydot.graph_from_dot_data(dot_data.getvalue())
            # Build the name once so the logged name is the file actually
            # written (the old message named a different prefix, and two
            # timestamp calls could disagree).
            pdfName = "tree_%s.pdf" % getTimestamp()
            prettyPrint("Saving learned CART to \"%s\"" % pdfName, "debug")
            graph.write_pdf(pdfName)

    except Exception as e:
        prettyPrint("Error encountered in \"classifyTree\": %s" % e, "error")

    return accuracyRate, timing, probabilities, predicted
Example #19
0
def classfyWithScipy(dataSet,labels,dataToClassfy):
    """Fit an entropy tree, save it as ``entropy.pdf``, and classify new data.

    Returns the predictions of the fitted tree for ``dataToClassfy``.
    """
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf = clf.fit(dataSet, labels)
    buf = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=buf)
    pydot.graph_from_dot_data(buf.getvalue()).write_pdf("entropy.pdf")
    return clf.predict(dataToClassfy)
Example #20
0
def build_tree_image(model,X):
    """Export ``model`` to ``tree.dot`` and render ``tree.png`` with Graphviz.

    Args:
        model: Fitted tree estimator handed to ``export_graphviz``.
        X: Object whose ``columns`` label the features in the export.
    """
    # The with-block closes the dot file even when the export raises, and
    # guarantees it is flushed before ``dot`` reads it.
    with open("tree.dot", 'w') as dotfile:
        export_graphviz(model,
                        out_file=dotfile,
                        feature_names=X.columns)
    system("dot -Tpng tree.dot -o tree.png")
def main():
    """Train a depth-6 decision tree on a training set and save ``partition.png``.

    The training-set path is taken from the first command-line argument; when
    it is missing a usage message is printed and the function returns.
    """
    if len(sys.argv) < 2:
        print("One Argument Required; Training Set")
        return

    X_train, Y_train = ParseTraining(sys.argv[1])
    clf = tree.DecisionTreeClassifier(max_depth=6).fit(X_train, Y_train)

    # Labels and styling for the exported graph.
    export_options = dict(
        feature_names=["partConf", "recAvg", "latency", "ReadRate", "homeconf"],
        class_names=["Partition", "No Partition"],
        filled=True,
        rounded=True,
        special_characters=True,
    )
    buf = StringIO()
    tree.export_graphviz(clf, out_file=buf, **export_options)
    pydot.graph_from_dot_data(buf.getvalue()).write_png("partition.png")
Example #22
0
def mainTree():
    """Train and evaluate a decision model on the male data set, saving its tree.

    Loads the male and female tables, maps female ids through the match
    dictionary, joins the female class onto the male data, scales it, prepends
    17 PCA components, splits the rows with ``departData`` (argument 0.9),
    fits ``decisionModel`` and writes the tree to ``marriage.pdf``. Prints the
    PCA explained variance and the value returned by ``test``.
    """
    # Adjacent literals replace the backslash-continued string whose embedded
    # tabs had to be stripped with re.sub; the resulting names are identical.
    header = ('id|gender|age|height|edu|salary|nation|car|house|body|face|hair|'
              'smoke|drink|child|parent|bmi|where0|where1|'
              'marriage0|marriage1|look0|look1|where2').split('|')
    MaleData = pd.read_csv('/home/idanan/jiayuan/code/resources/transed_M.txt', names=header, sep='|')
    FemaleData = pd.read_csv('/home/idanan/jiayuan/code/resources/cluster_female.txt', names=header+['class'], sep='|')
    matches = matchDict('/home/idanan/jiayuan/code/resources/lovers_ids.txt')
    FemaleData['id'] = FemaleData['id'].map(partial(match, matches=matches))
    FemaleClass = FemaleData[['id','class']]
    newMaleData = concatData(MaleData, FemaleClass)
    MaleArrays = scaleData(newMaleData, ['id','gender'])
    pca = factors(MaleArrays[:,:-1], 17)
    # %-formatted print(...) emits the same text as the former Python 2 print
    # statement and also runs on Python 3.
    print('PCA explained variance: %s' % (sum(pca.explained_variance_ratio_),))
    pcaMaleArray = pca.transform(MaleArrays[:,:-1])
    MaleArrays = np.c_[pcaMaleArray, MaleArrays]

    trainData, testData = departData(MaleArrays, 0.9)
    trainModel = decisionModel(trainData)

    dot_data = StringIO()
    tree.export_graphviz(trainModel, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("/home/idanan/jiayuan/code/resources/marriage.pdf")

    rate = test(trainModel, testData)
    print('Decision Model true rate %s' % (rate,))
def drawDecTree(decTree, X, y, outdir, label=None, featNames=None):
    """Fit ``decTree`` on ``(X, y)`` and save its graph as a PNG in ``outdir``.

    Args:
        decTree: Unfitted or refittable tree estimator; fitted in place.
        X: Training samples.
        y: Training targets.
        outdir: Directory for the output file ``<label>_graph.png``.
        label: Prefix of the output file name. When omitted, ``randint(100)``
            is drawn on every call.
        featNames: Optional feature labels for the export.
    """
    # A ``label=randint(100)`` default is evaluated once at definition time, so
    # every call without a label reused the same number and overwrote the
    # same file. Draw it per call instead.
    if label is None:
        label = randint(100)
    decTree.fit(X, y)
    dot_data = StringIO.StringIO()
    tree.export_graphviz(decTree, out_file=dot_data, feature_names=featNames)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(outdir +  "/" + str(label) + "_graph" + ".png")
def create_pdf(clf):
    """Save dec tree graph as pdf (written to ``NvD5.pdf``)."""
    # The docstring now comes first so it is a real docstring, and the
    # single-string print(...) runs on both Python 2 and Python 3.
    print('Drawing tree...')
    dot_data = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf('NvD5.pdf')
def fit_decision_tree(train_X, train_y, test_X, test_y):
    """Fit a decision tree, print its classification report and accuracy.

    The fitted tree is also written to ``tre.dot``. The printed output is
    expected to look like::

        Classification Report:
                 precision    recall  f1-score   support

            0.0       0.80      0.89      0.85      4932
            1.0       0.75      0.60      0.67      2676

        avg / total       0.78      0.79      0.78      7608

        Accuracy: 0.788512092534

    Args:
        train_X: Training samples.
        train_y: Training labels; its ``flat`` view is passed to ``fit``.
        test_X: Test samples.
        test_y: True labels for the test samples.
    """
    dtc = tree.DecisionTreeClassifier()
    dtc = dtc.fit(train_X,train_y.flat)
    pred_y = dtc.predict(test_X)

    # Single-argument print(...) is equivalent on Python 2 and valid on 3.
    print(classification_report(test_y, pred_y))
    print(accuracy_score(test_y,pred_y))

    # Create the graph: only the dot file is produced here. The with-block
    # closes the file even if the export raises. (The unused local StringIO
    # import was dropped.)
    with open('tre.dot','w') as f:
        tree.export_graphviz(dtc, out_file=f)
Example #26
0
def dt_graph(treeest, cv, scores, features, labels, featnames, outfile):
    """Refit the tree on the best-scoring fold and write its graph to a PDF.

    The best fold is the index of the highest value in ``scores``; ``cv`` is
    iterated until that fold's training indices are reached. If ``cv`` yields
    fewer folds than that index, a diagnostic message is printed instead.
    """
    best_index = np.argmax(scores)

    for fold_index, (train_idx, _) in enumerate(cv):
        # Skip folds until the best one comes up.
        if fold_index != best_index:
            continue

        treeest.fit(features[train_idx], labels[train_idx])

        # Dot source -> graph -> PDF.
        buf = StringIO()
        tree.export_graphviz(treeest, out_file=buf, feature_names=featnames)
        pydot.graph_from_dot_data(buf.getvalue()).write_pdf(outfile)
        return

    print("You should never see this text from dt_graph!")
    return
Example #27
0
def main(features_fpath, classes_fpath):
    """Cross-validate an extra-trees forest and print its feature ranking.

    Args:
        features_fpath: Whitespace-delimited feature table. The tokens after
            the first one on a line containing ``#`` are used as feature
            names (the last such line wins); the first data column is dropped.
        classes_fpath: File with one class label per sample.
    """
    # ``names`` stays None when the file has no '#' header line; the ranking
    # then falls back to the column index instead of hitting an unbound name.
    names = None
    with open(features_fpath) as features_file:
        for line in features_file:
            if '#' in line:
                names = line.split()[1:]

    X = scale(np.genfromtxt(features_fpath)[:,1:].copy())
    y = np.loadtxt(classes_fpath)

    forest = ExtraTreesClassifier(max_depth=4,
                                  criterion="entropy",
                                  compute_importances=True)

    scores = cross_val_score(forest, X, y, score_func=f1_score, cv=5)
    print(scores)

    forest.fit(X, y)

    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking, most important first (enumerate replaces the
    # Python 2-only xrange index loop).
    print("Feature ranking:")
    for rank, idx in enumerate(indices, 1):
        label = names[idx] if names is not None else str(idx)
        print("%d. feature %s (%f)" % (rank, label, importances[idx]))

    # NOTE(review): export_graphviz is documented for a single decision tree;
    # passing the whole ensemble may need forest.estimators_[i] - confirm.
    export_graphviz(forest, 'bala.dot')
def make_tree_test():
    """Fit a tree on the loaded data, save and display it, and return it.

    The tree is fitted with ``min_samples_split=3000``, written to
    ``tree_toy.png`` and shown via ``display``.

    Returns:
        The fitted classifier.
    """
    from sklearn import tree
    import StringIO
    import pydot
    from IPython.display import display, Image

    x, y, dates, movies = load_data()
    test_x, train_x, test_y, train_y = create_test_train_set(x, y)

    fit = tree.DecisionTreeClassifier(min_samples_split=3000).fit(train_x, train_y)

    buf = StringIO.StringIO()
    tree.export_graphviz(fit,
                         feature_names=train_x.columns,
                         class_names=[str(n) for n in range(1, 6)],
                         out_file=buf)

    # graph_from_dot_data returns a sequence here; the first graph is used.
    first_graph = pydot.graph_from_dot_data(buf.getvalue())[0]
    first_graph.write_png("tree_toy.png")
    display(Image(first_graph.create_png()))

    return fit
Example #29
0
def arbolesRegresion(caract):
    """Cross-validate a regression tree on selected Boston features.

    Runs 10-fold cross-validation over the module-level ``boston_X`` /
    ``boston_Y``, using only the columns listed in ``caract``. Prints the mean
    absolute error, mean squared error and R^2 averaged over the folds, plots
    the averaged relative feature importances, and writes the tree fitted on
    the last fold to ``bostonTree.pdf``.

    Args:
        caract: Sequence of column indices of ``boston_X`` to train on.
    """
    clf = DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=15, max_depth=13, compute_importances=True)

    importancias = [0,0,0,0,0,0,0,0,0,0,0,0,0]
    mae=mse=r2=0

    columnas = list(caract)
    kf = KFold(len(boston_Y), n_folds=10, indices=True)
    # Distinct names for the fold indices: the old loop variables train/test
    # were overwritten by the feature matrices inside the loop body.
    for train_idx, test_idx in kf:
        trainX, testX = boston_X[train_idx], boston_X[test_idx]
        trainY, testY = boston_Y[train_idx], boston_Y[test_idx]

        # Column selection in one indexing step instead of element-wise
        # Python loops; the result is a float matrix as before.
        train = np.asarray(trainX[:, columnas], dtype=float)
        test = np.asarray(testX[:, columnas], dtype=float)

        trainYNuevo = np.reshape(trainY, (len(trainY), -1))

        clf.fit(train, trainYNuevo)
        prediccion = clf.predict(test)

        mae += metrics.mean_absolute_error(testY, prediccion)
        mse += metrics.mean_squared_error(testY, prediccion)
        r2 += metrics.r2_score(testY, prediccion)

        feature_importance = clf.feature_importances_
        feature_importance = 100.0 * (feature_importance / feature_importance.max())
        # NOTE(review): this indexes 13 importances, so it presumably expects
        # ``caract`` to select all 13 columns - confirm for shorter selections.
        for i in range(13):
            importancias[i] = importancias[i] + feature_importance[i]

    # Same text as the former Python 2 print statement, valid on Python 3.
    print('Error abs:  %s Error cuadratico:  %s R cuadrado:  %s'
          % (mae/len(kf), mse/len(kf), r2/len(kf)))

    for i in range(13):
        importancias[i] = importancias[i]/10

    sorted_idx = np.argsort(importancias)
    pos = np.arange(sorted_idx.shape[0]) + .5
    importancias = np.reshape(importancias, (len(importancias), -1))

    boston = datasets.load_boston()
    pl.barh(pos, importancias[sorted_idx], align='center')
    pl.yticks(pos, boston.feature_names[sorted_idx])
    pl.xlabel('Importancia relativa')
    pl.show()

    import StringIO, pydot
    dot_data = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("bostonTree.pdf")
def tree3():
    """Build a decision tree from the posted form and return an HTML page.

    On POST, every form value other than ``"Build Tree"`` and the value of the
    ``DV`` field is used as a feature column of the global ``df``; the ``DV``
    column is the target. The page is stored in the global ``final_html`` and
    returned. Any other request method returns ``'helloo'``.
    """
    global final_html
    global df,df_train,df_test,test_train_created,origin_df
    init_style_string = template.style_string
    if request.method != 'POST':
        return 'helloo'

    Listkey1 = list(MultiDict(request.form).values())
    Listkey2 = MultiDict(request.form)
    DV_tree = Listkey2.get('DV')
    # ``!=`` replaces the Python 2-only ``<>`` operator (same semantics).
    chi_key = [key1 for key1 in Listkey1
               if key1 != "Build Tree" and key1 != DV_tree]
    df1 = df.loc[:,chi_key]
    df2 = df1.values
    Y = df[DV_tree]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(df2,Y.values)
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    k = dot_data.getvalue()
    left_px = 600
    width_px = 150
    top_px = 50
    height_px = 309
    # NOTE(review): the result of build_tree_html is not used; the page below
    # embeds the raw dot text ``k`` - confirm whether ``s`` was intended.
    s = build_tree_html(k,init_style_string,left_px,width_px,top_px,height_px)
    temp_df = df[0:15]
    t = """</div><div style="width:600px; height:700px; position: absolute; top: 20px; left:500px;"><br> Decision Tree result <br>"""
    final_html = template.s1 + t + k + "<br><br></div>" + temp_df.to_html()
    return final_html
Example #31
0
#
# TODO: Create an DT classifier. No need to set any parameters
#
from sklearn import tree
dtc = tree.DecisionTreeClassifier()

#
# TODO: train the classifier on the training data / labels:
# TODO: score the classifier on the testing data / labels:
#

dtc.fit(X_train, y_train)

score = dtc.score(X_test, y_test)

# Same text as the former Python 2 print statement (including the double
# space after the colon), and valid on Python 3.
print("High-Dimensionality Score:  %s" % (round((score * 100), 3),))

#
# TODO: Use the code on the course's SciKit-Learn page to output a .DOT file
# Then render the .DOT to .PNGs. Ensure you have graphviz installed.
# If not, `brew install graphviz`. If you can't, use: http://webgraphviz.com/.
# On Windows 10, graphviz installs via a msi installer that you can download from
# the graphviz website. Also, a graph editor, gvedit.exe can be used to view the
# tree directly from the exported tree.dot file without having to issue a call.
#
# Pass the fitted estimator itself: export_graphviz is documented to take the
# decision tree estimator, not its low-level ``tree_`` attribute.
tree.export_graphviz(dtc, out_file='tree.dot', feature_names=X.columns)

from subprocess import call
call(['dot', '-T', 'png', 'tree.dot', '-o', 'tree.png'])
Example #32
0
# Report the grid search outcome: best score, then the per-candidate mean
# test/train scores and the parameter sets they belong to.
print(dt_grid_estimator.best_score_)
final_estimator = dt_grid_estimator.best_estimator_
results = dt_grid_estimator.cv_results_
print(results.get("mean_test_score"))
print(results.get("mean_train_score"))
print(results.get("params"))

# Print the low-level tree object learned by the best estimator.
# Issue: this representation is not human-readable.
print(final_estimator.tree_)

# Export the best estimator as Graphviz dot source into an in-memory buffer
# and render it to a PDF to get a readable view of the decision tree.
dot_data = io.StringIO()
tree.export_graphviz(final_estimator,
                     out_file=dot_data,
                     feature_names=X_train.columns)
# graph_from_dot_data returns a sequence here; the first graph is used.
graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
graph.write_pdf("C:/Users/Algorithmica/Downloads/tree.pdf")

# Read the test data.
titanic_test = pd.read_csv(
    "C:\\Users\\Algorithmica\\Downloads\\titanic_test.csv")
print(titanic_test.info())

# Apply the already-created imputers and label encoders to the test data via
# transform() only (no fit call here).
titanic_test[imputable_cont_features] = cont_imputer.transform(
    titanic_test[imputable_cont_features])
titanic_test['Embarked'] = cat_imputer.transform(titanic_test['Embarked'])
titanic_test['Embarked'] = le_embarked.transform(titanic_test['Embarked'])
titanic_test['Sex'] = le_sex.transform(titanic_test['Sex'])
Example #33
0
df2, targets, job_num = tar_encode(df, "y")

# Drop the categorical columns in a single call instead of eleven separate
# in-place drops.
categorical_columns = [
    'y', 'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]
df2.drop(categorical_columns, axis=1, inplace=True)

# Correlation matrix of the remaining columns. Single-argument print(...) is
# equivalent on Python 2 and valid on Python 3.
print(df2.corr())

# Fit a depth-4 tree on the first 20 remaining columns against "target".
features = list(df2.columns[0:20])
y = df2["target"]
X = df2[features]
dt = DecisionTreeClassifier(max_depth=4)
dt.fit(X, y)

# Export the tree to dot, then render the dot file to PNG.
tree.export_graphviz(dt,
                     out_file='C:/MIS680/tree.dot',
                     feature_names=X.columns)
(graph, ) = pydot.graph_from_dot_file('C:/MIS680/tree.dot')
graph.write_png('C:/MIS680/tree.png')
Example #34
0
# export model params
Estimators = clf.estimators_
Importances = clf.feature_importances_

numberClasses = clf.n_classes_
numberInputs = len(clf.feature_importances_)
numberTrees = len(clf.estimators_)

# Write the model description: four fixed header lines, then the input count
# with one line per input, the class count with one line per class, and the
# number of trees. The with-block closes the file even if a write fails.
with open("RandomForestModel.txt", "w") as fo:
    fo.write("RandomForestClassifier\n")
    fo.write("IrisRandomForestModel\n")
    fo.write("classification\n")
    fo.write("binarySplit\n")

    fo.write(str(numberInputs) + "\n")
    for num in range(0, numberInputs):
        fo.write(shuxingname[num] + ", double,continuous,NA,NA,asMissing\n")
        print(shuxingname[num])

    fo.write(str(numberClasses) + "\n")
    for num in range(0, numberClasses):
        fo.write(classname[num] + "\n")
        print(classname[num])

    fo.write(str(numberTrees) + "\n")

# Export every tree of the forest to its own dot file.
for num in range(0, numberTrees):
    fileName = "irsRF_" + str(num) + ".dot"
    with open(fileName, 'w') as f:
        # Do not rebind ``f`` to the export's return value inside the block.
        tree.export_graphviz(Estimators[num], out_file=f)
Example #35
0
    for each_label in lensesLabels:  # collect each label's column of values into a dict
        for each in lenses:
            lenses_list.append(each[lensesLabels.index(each_label)])
        lenses_dict[each_label] = lenses_list
        lenses_list = []
    # print(lenses_dict)  # debug: show the dict
    lenses_pd = pd.DataFrame(lenses_dict)  # build a pandas.DataFrame from the dict
    # print(lenses_pd)  # debug: show the DataFrame
    le = LabelEncoder()  # encoder used to turn each column's values into integer codes
    for col in lenses_pd.columns:  # encode every column in place
        lenses_pd[col] = le.fit_transform(lenses_pd[col])
    # print(lenses_pd)  # debug: show the encoded data

    clf = tree.DecisionTreeClassifier(
        max_depth=4)  # create the classifier, limited to depth 4
    clf = clf.fit(lenses_pd.values.tolist(), lenses_target)  # fit the decision tree on the encoded rows

    dot_data = StringIO()
    tree.export_graphviz(
        clf,
        out_file=dot_data,  # write the dot source of the tree into the buffer
        feature_names=lenses_pd.keys(),
        class_names=clf.classes_,
        filled=True,
        rounded=True,
        special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("tree.pdf")  # save the rendered decision tree as a PDF

    print(clf.predict([[1, 1, 1, 0]]))  # predict the class of one encoded sample
Example #36
0
# -*- coding:utf-8 -*-
# Classification with an entropy-based decision tree (the original author
# describes this as using the ID3 algorithm).
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as DTC, export_graphviz

data = pd.read_csv('../data/titanic_data.csv', encoding='utf-8')
data.drop(['PassengerId'], axis=1, inplace=True)  # drop the ID column; it is not a useful feature

# 'Sex' holds category labels; convert them to numbers: 1 for male, 0 for female.
data.loc[data['Sex'] == 'male', 'Sex'] = 1
data.loc[data['Sex'] == 'female', 'Sex'] = 0
# Fill every missing value in the frame with the integer mean of 'Age'.
data.fillna(int(data.Age.mean()), inplace=True)
print(data.head(5))  # inspect the data

X = data.iloc[:, 1:3]  # original note: age (last column) is left out to keep the tree easy to display
y = data.iloc[:, 0]

dtc = DTC(criterion='entropy')  # decision tree using information entropy as the split criterion
dtc.fit(X, y)  # train the model
print('输出准确率:', dtc.score(X, y))

# Export the tree as a dot file; Graphviz must be installed to convert it to
# .pdf or .png.
with open('../tmp/tree.dot', 'w') as f:
    # Do not rebind ``f`` to the export's return value inside the block.
    export_graphviz(dtc, feature_names=X.columns, out_file=f)
Example #37
0
# Hold out three samples (row indices 0, 50 and 100) as the test set.
test_idx = [0,50,100]

# Training data: everything except the held-out rows.
train_target = np.delete(iris.target,test_idx)
train_data = np.delete(iris.data,test_idx,axis=0)

print ("Data", train_data)
print ("Teste", test_idx)

# Test data: only the held-out rows.
test_target = iris.target[test_idx]
test_data = iris.data[test_idx]

clf = tree.DecisionTreeClassifier()
clf.fit(train_data,train_target)

# Predictions for the held-out rows.
print ("Resposta:",clf.predict(test_data))

# Manual prediction on hand-written samples (disabled):
# print "Resposta:",clf.predict([[5.5,2.4,3.7,1.0],[5.8, 2.7, 5.1, 1.9]])

# Another way to create the iris graph (disabled, no feature/class labels):
# dot_data = tree.export_graphviz(clf, out_file=None)
# graph = graphviz.Source(dot_data)
# graph.render("iris")

# Export the fitted tree as DOT source (out_file=None returns it as a string),
# labelled with the iris feature and class names.
dot_data = tree.export_graphviz(clf, out_file=None,
                         feature_names=iris.feature_names,
                         class_names=iris.target_names,
                         filled=True, rounded=True,
                         special_characters=True)
graph = graphviz.Source(dot_data)

# Render the graph under the base name 'iris'.
graph.render('iris')
Example #38
0
mpl.rc('ytick', labelsize=12)

#%% decision trees
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Fit a depth-limited tree on two iris features and export it as a .dot file.
iris = load_iris()
X = iris.data[:, 2:] # petal length and width
y = iris.target

# random_state is fixed so the fitted tree is reproducible.
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_clf.fit(X, y)

# feature_names is sliced the same way as X so labels match the columns used.
export_graphviz(
        tree_clf, out_file="iris_tree.dot",
        feature_names=iris.feature_names[2:],
        class_names=iris.target_names,
        rounded=True, filled=True)

#%% Plot decision boundaries
from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=(0, 7.5, 0, 3), iris=True,
                           legend=False, plot_training=True):
    """Draw the decision regions of a two-feature classifier as filled contours.

    Args:
        clf: Fitted classifier; ``clf.predict`` is called on a 100 x 100 grid
            of two-column samples.
        X, y: Accepted for API compatibility; not used by the visible body.
        axes: ``(x1_min, x1_max, x2_min, x2_max)`` bounds of the grid. The
            default is an immutable tuple so it cannot be modified across
            calls; any indexable sequence of four numbers is accepted.
        iris, legend, plot_training: Accepted for API compatibility; not used
            by the visible body.
    """
    # Evaluation grid spanning the requested axes.
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    # One row per grid point, one column per feature.
    X_new = np.c_[x1.ravel(), x2.ravel()]
    # Predicted class per grid point, reshaped back to the grid layout.
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
Example #39
0
features = ['Age', 'Experience', 'Rank', 'Nationality']

X = df[features]
y = df['Go']

print(X)
print(y)

Now we can create the actual decision tree, fit it to our data, and save it as a .png file on the computer:

Example
Create a Decision Tree, save it as an image, and show the image:

# Fit the tree on the selected feature columns and the target column.
dtree = DecisionTreeClassifier()
dtree = dtree.fit(X, y)
# out_file=None makes export_graphviz return the DOT source as a string.
data = tree.export_graphviz(dtree, out_file=None, feature_names=features)
graph = pydotplus.graph_from_dot_data(data)
graph.write_png('mydecisiontree.png')

# Load the image that was just written and display it.
img=pltimg.imread('mydecisiontree.png')
imgplot = plt.imshow(img)
plt.show()

Result Explained
The decision tree uses your earlier decisions to calculate the odds of you wanting to go see a comedian or not.

Let us read the different aspects of the decision tree:



Rank
def ejecutarModeloyGuardarlo(nombreModelo, modelo, pathModelo, ds_train_f, ds_train_t, ds_test_f, ds_test_t, feature_names, modeloEsGrid,
                             modoDebug, dir_subgrupo_img):
    """Train a model with early stopping, plot its metric curves and pickle it.

    Args:
        nombreModelo: Model name used in log messages and output file names.
        modelo: Estimator whose ``fit`` accepts ``eval_metric``,
            ``early_stopping_rounds``, ``eval_set`` and ``verbose`` and which
            exposes ``evals_result()`` afterwards.
        pathModelo: Destination path of the pickled model; ``'.dot'`` is
            appended to it for the optional tree export.
        ds_train_f: Training features.
        ds_train_t: Training targets.
        ds_test_f: Test features.
        ds_test_t: Test targets.
        feature_names: Feature names used for the importance report and the
            tree export.
        modeloEsGrid: When true, ``modelo`` is treated as a grid search: its
            ``best_estimator_`` is pickled and ``best_params_`` is printed.
        modoDebug: When true (and the model is the ``"rf_grid"`` grid), the
            feature importances are printed and one tree is exported as DOT.
        dir_subgrupo_img: Path prefix (concatenated as-is) for the metric plot.

    Returns:
        The fitted model, as returned by ``modelo.fit``.
    """
    print((datetime.datetime.now()).strftime("%Y%m%d_%H%M%S") + " Ejecutando " + nombreModelo + " ...")

    # early_stopping_rounds: number of iterations without improvement of the
    # evaluation metric after which training stops (limits overfitting).
    param_parada_iteraciones = 10
    eval_set = [(ds_train_f, ds_train_t), (ds_test_f, ds_test_t)]

    # -------- Plot the overfitting error --------
    # https://machinelearningmastery.com/avoid-overfitting-by-early-stopping-with-xgboost-in-python/
    # https://xgboost.readthedocs.io/en/latest/parameter.html
    # map: mean average precision. aucpr (area under the PR curve) gave worse
    # precision results.
    METODO_EVALUACION="map"

    # Training, with the parameters needed to observe overfitting.
    modelo = modelo.fit(ds_train_f, ds_train_t, eval_metric=[METODO_EVALUACION], early_stopping_rounds=param_parada_iteraciones, eval_set=eval_set, verbose=False)

    # -------- Draw the train-vs-test metric curves --------
    y_pred = modelo.predict(ds_test_f)
    y_pred = y_pred.astype(float)
    predictions = [round(value) for value in y_pred]
    precision_para_medir_overfitting = precision_score(ds_test_t, predictions)
    print("Accuracy (PRECISION) para medir el overfitting: %.2f%%" % (precision_para_medir_overfitting * 100.0))
    # NOTE(review): evals_result() is called on the object returned by fit();
    # confirm that grid-search wrappers passed here provide it.
    results = modelo.evals_result()

    # validation_0 / validation_1 follow the order of `eval_set` (train, test).
    epochs = len(results['validation_0'][METODO_EVALUACION])
    x_axis = range(0, epochs)
    fig, ax = pyplot.subplots()
    ax.plot(x_axis, results['validation_0'][METODO_EVALUACION], label='Train')
    ax.plot(x_axis, results['validation_1'][METODO_EVALUACION], label='Test')
    ax.legend()
    pyplot.xlabel("Numero de epochs")
    pyplot.ylabel(METODO_EVALUACION)
    pyplot.title("Modelo: " + nombreModelo + " - Metodo de evaluacion: " + METODO_EVALUACION)
    path_img_metricas_modelo_ovft = dir_subgrupo_img + nombreModelo + "_" + METODO_EVALUACION + ".png"
    print("Pintando IMG de metricas del modelo overfitting (train vs test). Path: " + path_img_metricas_modelo_ovft)
    plt.savefig(path_img_metricas_modelo_ovft, bbox_inches='tight')
    # Reset the figure state so later plots start clean.
    plt.clf()
    plt.cla()
    plt.close()

    # -------- Persist the model --------
    if modeloEsGrid:
        # The context manager closes the file; the original left the handle
        # returned by open() unclosed.
        with open(pathModelo, 'wb') as model_file:
            pickle.dump(modelo.best_estimator_, model_file)
        print("Modelo GRID tipo " + nombreModelo + " Los mejores parametros probados son: " + str(modelo.best_params_))

        if modoDebug and nombreModelo == "rf_grid":
            feature_imp = pd.Series(modelo.best_estimator_.feature_importances_, index=feature_names).sort_values(
                ascending=False)
            print("Importancia de las features en el modelo " + nombreModelo + " ha sido:")
            print(feature_imp.to_string())

            print("Generando dibujo de un árbol de decision (elegido al azar de los que haya)...")
            print(feature_names)
            print("Guardando dibujo DOT en: " + pathModelo + '.dot' + " Convertirlo ONLINE en: http://viz-js.com/")
            # NOTE(review): the log message says "chosen at random" but the
            # tree at index 1 is always exported; list('TARGET') yields the
            # single letters ['T', 'A', 'R', 'G', 'E', 'T'] as class names -
            # confirm both are intended.
            export_graphviz(modelo.best_estimator_.estimators_[1], out_file=pathModelo + '.dot',
                            feature_names=feature_names, class_names=list('TARGET'), rounded=True, proportion=False,
                            precision=2, filled=True)

            # Online DOT viewers:
            #   http://www.webgraphviz.com/
            #   http://sandbox.kidstrythisathome.com/erdos/
            #   http://viz-js.com/
            # Local DOT -> PNG conversion (requires Graphviz to be installed):
            #   dot -Tpng <pathModelo>.dot -o <pathModelo>.png -Gdpi=600
    else:
        with open(pathModelo, 'wb') as model_file:
            pickle.dump(modelo, model_file)
    return modelo
Example #41
0
import numpy as np
print("Passing: %d out %d (%.2f%%)" %
      (np.sum(cianjur_pass), len(cianjur_pass),
       100 * float(np.sum(cianjur_pass)) / len(cianjur_pass)))
#%% 5.fit a decision tree
from sklearn import tree
bogor = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)
bogor = bogor.fit(cianjur_train_att, cianjur_train_pass)

#%% 6.visualize tree
import graphviz
yogyakarta = tree.export_graphviz(bogor,
                                  out_file=None,
                                  label="all",
                                  impurity=False,
                                  proportion=True,
                                  feature_names=list(cianjur_train_att),
                                  class_names=["fail", "pass"],
                                  filled=True,
                                  rounded=True)
malang = graphviz.Source(yogyakarta)
malang

#%% 7.save tree
tree.export_graphviz(yogyakarta,
                     out_file="student-performance.dot",
                     label="all",
                     impurity=False,
                     proportion=True,
                     feature_names=list(cianjur_train_att),
                     class_names=["fail", "pass"],
                    overfit_values[f][d],
                    ax=axs[i // 4, i % 4],
                    title='Overfitting for max_depth = %d with %s criteria' %
                    (d, f),
                    xlabel='min_impurity_decrease',
                    ylabel='accuracy',
                    percentage=True)
                i += 1
            i += 1
        plt.suptitle('QOT Overfitting - Decision Trees')
        plt.savefig(subDir + 'QOT Overfitting - Decision Trees')

        dot_data = export_graphviz(best_tree,
                                   out_file=(subDir + 'QOT - ' + key +
                                             ' - dtree.dot'),
                                   filled=True,
                                   rounded=True,
                                   special_characters=True,
                                   class_names=['negative', 'positive'])
        # Convert to png
        call([
            'dot', '-Tpng', (subDir + 'QOT - ' + key + ' - dtree.dot'), '-o',
            (subDir + 'QOT Decision Trees - ' + key +
             ' - tree representation.png'), '-Gdpi=600'
        ])

        prd_trn = best_tree.predict(trnX)
        prd_tst = best_tree.predict(tstX)
        ds.plot_evaluation_results(["negative", "positive"], trnY, prd_trn,
                                   tstY, prd_tst)
        plt.suptitle('QOT Decision Trees - ' + key +
Example #43
0
def runDecisionTreeI(plotTree=False,
                     trainSize=0.3,
                     pruneTree=False,
                     pruningThreshold=0,
                     maxDepth=50):
    """Train, optionally prune, evaluate and optionally plot a decision tree.

    The dataset is read from the CSV file named by ``sys.argv[1]``; the column
    names come from ``../datasets/network-intrusions/pcap-features-all.txt``.
    Results are printed and appended to the module-level lists
    ``train_sizes``, ``total_nodes``, ``accuracy_1_test`` and
    ``accuracy_1_train``.

    Args:
        plotTree: When true, render the tree with graphviz as ``test-v2``.
        trainSize: Fraction of the data used for training; the remainder is
            the test set.
        pruneTree: When true, prune the fitted tree in place before scoring.
        pruningThreshold: A node becomes a leaf when the smallest entry of its
            ``value`` array is below this threshold.
        maxDepth: ``max_depth`` passed to the classifier.
    """
    global train_sizes, accuracy_1_test, accuracy_1_train, total_nodes, mean_error_1

    ## PRUNING FUNCTION
    def prune_index(inner_tree, index, threshold):
        """Recursively turn nodes below `threshold` into leaves, in place."""
        if inner_tree.value[index].min() < threshold:
            # turn node into a leaf by "unlinking" its children
            inner_tree.children_left[index] = TREE_LEAF
            inner_tree.children_right[index] = TREE_LEAF
        # if there are children, visit them as well
        if inner_tree.children_left[index] != TREE_LEAF:
            prune_index(inner_tree, inner_tree.children_left[index], threshold)
            prune_index(inner_tree, inner_tree.children_right[index],
                        threshold)

    ## LOAD FEATURE NAMES FROM FILE (the handle is closed by the `with`)
    with open('../datasets/network-intrusions/pcap-features-all.txt',
              "r") as features_file:
        all_features = [line.rstrip() for line in features_file]
    train_features = all_features[0:39]

    ## LOAD DATASET INTO DATAFRAME
    df = pd.read_csv(sys.argv[1], header=None)
    df.columns = all_features

    ## CHANGE CATEGORICAL DATA TO INTEGER TYPE LABELS USING LabelEncoder
    le = preprocessing.LabelEncoder()
    categorical_cols = [1, 2, 3]
    for col in categorical_cols:
        le.fit(df.iloc[:, col])
        df.iloc[:, col] = le.transform(df.iloc[:, col])

    ## FEATURES AND LABEL SELECTION
    X = df.iloc[:, 0:39].values
    y = df.iloc[:, 42].values

    ## TRAIN/TEST SPLIT
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=(1 -
                                                                   trainSize),
                                                        train_size=trainSize,
                                                        shuffle=True)
    ## FIT DATA TO MODEL
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=maxDepth)
    clf = clf.fit(X_train, y_train)

    ## PRUNE NODES WHOSE MINIMUM CLASS VALUE IS BELOW pruningThreshold
    if pruneTree:
        prune_index(clf.tree_, 0, pruningThreshold)

    ## COUNT NODES BELOW THE PRUNING THRESHOLD
    count_prunes = 0
    for node in range(len(clf.tree_.value)):
        if clf.tree_.value[node].min() < pruningThreshold:
            count_prunes = count_prunes + 1
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(count_prunes)

    ## PREDICT THE VALUES OF THE TESTING SET
    predictions = clf.predict(X_test)

    ## COUNT MISPREDICTIONS
    count = 0
    for expected, predicted in zip(y_test, predictions):
        if expected != predicted:
            count = count + 1

    print('RESULTS FOR DATASET I')
    print('----------------------------------------')
    print('Test Size:')
    print(len(X_test))
    print('Train Size:')
    print(len(X_train))
    print('Accuracy on Test Data:')
    print(clf.score(X_test, y_test))
    print('Accuracy on Train Data:')
    print(clf.score(X_train, y_train))
    print('Mis-Classified:')
    print(str(count) + ' Out of ' + str(len(y_test)))
    print('Number of Nodes:')
    print(len(clf.tree_.value))

    if pruneTree:
        print('Number of Nodes After Pruning:')
        print(len(clf.tree_.value) - count_prunes)
    print('----------------------------------------')
    ## DRAW A DECISION TREE GRAPH
    if plotTree:
        dot_data = tree.export_graphviz(clf,
                                        feature_names=train_features,
                                        class_names=['normal', 'attack'],
                                        filled=True,
                                        rounded=True,
                                        out_file=None)
        graph = graphviz.Source(dot_data)
        graph.render('test-v2')

    ## APPEND RESULTS TO GLOBALS
    train_sizes.append(trainSize)
    total_nodes.append(len(clf.tree_.value) - count_prunes)
    accuracy_1_test.append(clf.score(X_test, y_test))
    accuracy_1_train.append(clf.score(X_train, y_train))
Example #44
0
# Plot the error rate against tree depth to illustrate overfitting.
plt.figure(facecolor='w')
plt.plot(depth, err_list, 'ro-', lw=3)
plt.xlabel('决策树深度', fontsize=16)
plt.ylabel('错误率', fontsize=16)
plt.grid(True)
plt.title('决策树深度太多,导致的过拟合问题', fontsize=18)
plt.show()

from skimage import io
from IPython.display import Image
import pydotplus
# Labelled export: out_file=None returns the DOT source as a string.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=['PCA1', 'PCA2'],
    class_names=['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'],
    filled=True,
    rounded=True,
    special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

# Plain export to a .dot file. The return value is not needed when writing to
# a handle, so the handle is no longer rebound inside the `with` block.
with open('iris.dot', 'w') as f:
    tree.export_graphviz(model, out_file=f)

# Plain export rendered to PDF.
import pydotplus
dot_data = tree.export_graphviz(model, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf('iris3.pdf')
def run_prediction(df, input_features=None, clf=None, num_iters=10):
    """Repeatedly train and evaluate a classifier for every prediction target.

    For each target column in the global ``PREDICTIONS`` and each of
    ``num_iters`` repetitions, the rows are split by ``session_num`` (via
    ``train_test_split_list(..., 0.8)``), optionally per user when the global
    ``PERSONALIZED`` is set, the features are standardized, a model is fitted
    and the predictions of all users are pooled into one accuracy value.

    Side effects: prints progress, increments the globals ``RUN_CT`` and
    ``DCT_COUNT`` and, when ``clf == 'dct'``, writes a Graphviz ``.dot`` file
    per fit to ``/Users/Matt/Desktop/output/``.

    Args:
        df: DataFrame with the feature columns, the target columns, a
            ``session_num`` column and optionally a ``userID`` column.
        input_features: Feature column names. ``None``, ``['dummy1']`` and
            ``['dummy2']`` select every column not listed in
            ``PREDICTIONS + METADATA``; the two dummy values additionally
            select a ``DummyClassifier`` baseline (``stratified`` /
            ``most_frequent``).
        clf: Key into the classifier table built below (``'knn'``, ``'gnb'``,
            ``'svm'``, ``'rfc'``, ``'ada'``, ``'qda'``, ``'svc'``, ``'mlp'``,
            ``'ovr'``, ``'ovo'``, ``'gsc'``, ``'dct'``). ``None`` or an unknown
            key falls back to Gaussian naive Bayes.
        num_iters: Number of repetitions per target.

    Returns:
        A DataFrame with one row per (target, repetition) holding
        ``prediction``, ``accuracy``, ``y_true``, ``y_pred`` and ``run_ct``.
    """
    global FEATURES
    global RUN_CT
    global PREDICTIONS
    global METADATA
    global DCT_COUNT
    features = None

    print(input_features)
    if input_features is None or input_features == [
            'dummy1'
    ] or input_features == ['dummy2']:
        features = [
            c for c in list(df.columns.values)
            if c not in PREDICTIONS + METADATA
        ]
    else:
        features = input_features

    le = preprocessing.LabelEncoder()
    scaler = StandardScaler()
    results = []

    df_copy = df.copy()

    userIDs = [1]
    if 'userID' in df_copy.columns.values:
        userIDs = set(df_copy['userID'].tolist())

    if not PERSONALIZED:
        userIDs = [1]

    for facet in PREDICTIONS:
        for _ in range(num_iters):
            print(DCT_COUNT, facet)
            RUN_CT += 1
            res = {'prediction': facet}

            # Labels and predictions pooled over all users of this repetition.
            y_tests_all = []
            y_preds_all = []
            for u in userIDs:
                data_okay = True
                res = {'prediction': facet}
                # NOTE(review): `input_features` is a list (or None), so this
                # membership test against two strings is always true for list
                # input - confirm whether the dummy runs were meant to be
                # excluded from the per-user filtering.
                if input_features not in [
                        'dummy1', 'dummy2'
                ] and facet != 'userID' and PERSONALIZED:
                    df = df_copy[df_copy['userID'] == u]
                # Retry with a fresh split until fitting succeeds.
                # NOTE(review): a ValueError that persists across splits makes
                # this loop run forever.
                while True:

                    try:
                        (session_nums_train,
                         session_nums_test) = train_test_split_list(
                             list(set(df['session_num'].tolist())), 0.8)
                        # `.values` replaces DataFrame.as_matrix(), which was
                        # deprecated in pandas 0.23 and removed in pandas 1.0.
                        y = df[facet].values
                        le.fit(y)

                        df_train = df[df['session_num'].isin(
                            session_nums_train)]
                        df_test = df[df['session_num'].isin(session_nums_test)]
                        X_train = df_train[features].values
                        X_test = df_test[features].values
                        assert (len(df_train) + len(df_test)) == len(df.index)
                        y_train = df_train[facet].tolist()

                        y_train = le.transform(y_train)
                        y_test = df_test[facet].tolist()
                        y_test = le.transform(y_test)
                        # Skip this user when the training labels hold a
                        # single class or the test split is empty.
                        if len(set(y_train)) == 1 or len(set(y_test)) == 0:
                            data_okay = False
                            break

                        scaler = StandardScaler()

                        if input_features == ['dummy1']:
                            lr_model = DummyClassifier(
                                strategy='stratified',
                                random_state=random.randint(1, 4294967294))
                        elif input_features == ['dummy2']:
                            lr_model = DummyClassifier(
                                strategy='most_frequent',
                                random_state=random.randint(1, 4294967294))
                        else:
                            # Kept from the original: scale the full data set
                            # once; the scaler is refitted on the training
                            # split below before it is actually used.
                            X_tofit = df[features].values
                            y_tofit = df[facet].values
                            y_tofit = le.transform(y_tofit)
                            scaler.fit(X_tofit)
                            X_tofit = scaler.transform(X_tofit)

                            # Every estimator is built in the original order,
                            # including the unused ones, so the sequence of
                            # random.randint() draws stays unchanged.
                            imap = Isomap(n_components=10)
                            pca = PCA(n_components=10)
                            rfe = RFE(
                                LinearSVC(random_state=random.randint(
                                    1, 4294967294)))
                            sfm = SelectFromModel(
                                LinearSVC(random_state=random.randint(
                                    1, 4294967294)))

                            knn = KNeighborsClassifier(n_neighbors=10)
                            gnb = GaussianNB()
                            mlp = MLPClassifier(alpha=1)
                            gsc = GridSearchCV(LinearSVC(), {
                                'dual': [True, False],
                                'C': [0.1, 1, 10]
                            })
                            svm = SVC(probability=True,
                                      gamma=0.7,
                                      C=1,
                                      random_state=random.randint(
                                          1, 4294967294))
                            dct = DecisionTreeClassifier(max_depth=8)
                            rfc = RandomForestClassifier(
                                random_state=random.randint(1, 4294967294))
                            ada = AdaBoostClassifier(
                                random_state=random.randint(1, 4294967294))
                            qda = QuadraticDiscriminantAnalysis()
                            anovakbest_filter = SelectKBest(
                                f_classif, k=min([20, len(features)]))
                            ovr = OneVsRestClassifier(
                                LinearSVC(random_state=random.randint(
                                    1, 4294967294)))
                            ovo = OneVsOneClassifier(
                                LinearSVC(random_state=random.randint(
                                    1, 4294967294)))

                            clf_map = {
                                'knn': knn,
                                'gnb': gnb,
                                'svm': svm,
                                'rfc': rfc,
                                'ada': ada,
                                'qda': qda,
                                'svc': svm,
                                'mlp': mlp,
                                'ovr': ovr,
                                'ovo': ovo,
                                'gsc': gsc,
                                'dct': dct
                            }
                            if clf is not None and clf in clf_map:
                                # ANOVA feature selection followed by the
                                # requested classifier.
                                classifier = clf_map[clf]
                                lr_model = Pipeline([('anova',
                                                      anovakbest_filter),
                                                     ('clf', classifier)])
                            else:
                                # Unknown or missing key: Gaussian naive Bayes.
                                lr_model = Pipeline([('anova',
                                                      anovakbest_filter),
                                                     ('clf', gnb)])

                        scaler.fit(X_train)
                        lr_model.fit(scaler.transform(X_train), y_train)
                        if clf == 'dct':
                            DCT_COUNT += 1
                            # NOTE(review): `classifier` is only bound in the
                            # non-dummy branch above; a dummy run with
                            # clf == 'dct' reuses a stale value or raises
                            # NameError. `features` also names all input
                            # columns, not only those kept by the ANOVA filter
                            # - confirm both.
                            export_graphviz(
                                classifier,
                                out_file='/Users/Matt/Desktop/output/out%d.dot'
                                % DCT_COUNT,
                                feature_names=features)
                        break
                    except ValueError as e:
                        print(e)
                        print("fail")

                if not data_okay:
                    print("NOT OKAY!")
                    continue
                X_test = scaler.transform(X_test)
                y_pred = lr_model.predict(X_test)

                y_tests_all += list(y_test)
                y_preds_all += list(y_pred)
            res["accuracy"] = metrics.accuracy_score(y_tests_all, y_preds_all)
            # Map the encoded labels back to their original values.
            res['y_true'] = le.inverse_transform(y_tests_all)
            res['y_pred'] = le.inverse_transform(y_preds_all)
            res['run_ct'] = RUN_CT

            results += [res]
    return pd.DataFrame(results)
# Install the plotting dependencies from within the notebook.
get_ipython().system('conda install -c conda-forge pydotplus -y')
get_ipython().system('conda install -c conda-forge python-graphviz -y')
# sklearn.externals.six was removed from recent scikit-learn releases; fall
# back to the standard-library StringIO when it is unavailable.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
get_ipython().run_line_magic('matplotlib', 'inline')
# In-memory buffer that receives the DOT source of the tree.
dot_data = StringIO()
filename = "loan.png"
featureNames = df.columns[0:8]
targetNames = df['loan_status'].unique().tolist()
out = tree.export_graphviz(Tree,
                           feature_names=featureNames,
                           out_file=dot_data,
                           class_names=np.unique(y_trainset),
                           filled=True,
                           special_characters=True,
                           rotate=False)
# Render the DOT source to a PNG, then load and display the image.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img, interpolation='nearest')

# # Support Vector Machine

# In[32]:

df.dtypes
# Keep only rows whose 'education' value parses as a number.
df = df[pd.to_numeric(df['education'], errors='coerce').notnull()]
Example #47
0
                        ha='right',
                        fontsize=20)
# Finish styling the heat map (x tick labels, axis labels, title) and show it.
hm.xaxis.set_ticklabels(hm.xaxis.get_ticklabels(),
                        rotation=0,
                        ha='right',
                        fontsize=20)
plt.ylabel('True label', fontsize=20)
plt.xlabel('Predicted label', fontsize=20)
plt.title("Decision Tree - Entropy")
plt.tight_layout()
plt.show()

# display decision tree
# out_file=None makes export_graphviz return the DOT source as a string.
# NOTE(review): class_names is a plain string here, not a list of class
# labels - confirm this is intended.
dot_data = tree.export_graphviz(clf_gini,
                                filled=True,
                                rounded=True,
                                class_names='survived',
                                feature_names=tt.iloc[:, 0:].columns,
                                out_file=None)

# Write the gini tree to a PDF and open it in the default viewer.
graph = graph_from_dot_data(dot_data)
graph.write_pdf("decision_tree_gini.pdf")
webbrowser.open_new(r'decision_tree_gini.pdf')

# Same export for the entropy-based classifier.
dot_data = tree.export_graphviz(clf_entropy,
                                filled=True,
                                rounded=True,
                                class_names='survived',
                                feature_names=tt.iloc[:, 0:].columns,
                                out_file=None)

# NOTE(review): this graph is built but not written anywhere in the visible code.
graph = graph_from_dot_data(dot_data)
# Load iris and build a DataFrame holding the features plus the target column.
iris = load_iris()
df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                     columns=iris['feature_names'] + ['target'])
print(df.head()) # check the first rows
print(iris.feature_names)
print(iris.target_names)
print(df.describe()) # check the spread between minimum and maximum values
x = iris['data']
y = iris['target']

iris_df = pd.DataFrame(x, columns=iris['feature_names'])
# Call head(); the original printed the bound method object instead of rows.
print(iris_df.head())
x, y = shuffle(x, y, random_state=0)  # random shuffle
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
# Entropy-based tree limited to depth 3, scored on the held-out split.
classifier=DecisionTreeClassifier(criterion="entropy", max_depth=3)
clf = classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) # print the accuracy to the console
# Export the tree as DOT source into an in-memory buffer.
dot_data = StringIO()
tree.export_graphviz(classifier,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True, rounded=True,
                     impurity=False,
                     proportion=True)
graph=pydot.graph_from_dot_data(dot_data.getvalue()) # build the graph(s) from the DOT source
graph[0].write_pdf("iris3.pdf") # write the first graph to a PDF

#Starting implementation correr en jupyter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
#La siguiente instruccion permite incorporar las graficas en este documento
%matplotlib inline
from sklearn import tree
df = pd.read_csv("iris_df.csv")
df.columns = ["X1", "X2", "X3","X4", "Y"]
df.head()

#implementation
from sklearn.cross_validation import train_test_split
decision = tree.DecisionTreeClassifier(criterion="gini")
X = df.values[:, 0:4]
Y = df.values[:, 4]
trainX, testX, trainY, testY = train_test_split( X, Y, test_size = 0.3)
decision.fit(trainX, trainY)
print("Accuracy: \n", decision.score(testX, testY))

#Visualisation
from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus as pydot
dot_data = StringIO()
tree.export_graphviz(decision, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig('tree_confusion_matrix.png',dpi=500,bbox_inches='tight')

# Load the wine data: columns '0'..'12' are the features, '13' is the label.
data=pd.read_csv('/content/drive/My Drive/ML/wine.csv')
X=data[['0','1','2','3','4','5','6','7','8','9','10','11','12']].values
Y=data['13']
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,Y,test_size=0.2,random_state=4)
clf = DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(Xtrain,Ytrain)
score = clf.score(Xtest,Ytest)  # accuracy of the predictions on the test split
print(score)
feature_name = ['酒精','苹果酸','灰','灰的碱性','镁','总酚','类黄酮','非黄烷类酚类','花青素','颜色强度','色调','OD280/OD315稀释葡萄酒','脯氨酸']

# out_file=None makes export_graphviz return the DOT source as a string.
dot_data = tree.export_graphviz(clf,out_file = None,feature_names= feature_name,class_names=["琴酒","雪莉","贝尔摩德"],filled=True,rounded=True)
graph = graphviz.Source(dot_data)
graph
# Alternative ways of rendering (disabled):
# graph.format = 'png'
# graph.render("test",view=True)
#graph.view()
# system("dot -Tpng dtree2.png")

# Second model: random splitter with a fixed seed.
clf = tree.DecisionTreeClassifier(criterion="entropy",random_state=30 ,splitter="random")
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)
score

import graphviz
# out_file=None is passed explicitly: the default of this parameter differs
# between scikit-learn versions (older releases wrote "tree.dot" and returned
# None, which graphviz.Source cannot render).
dot_data = tree.export_graphviz(clf,out_file=None,feature_names= feature_name,class_names=["琴酒","雪莉","贝尔摩德"],filled=True,rounded=True )
graph = graphviz.Source(dot_data)
# Baseline: fit an SVC with default parameters and report its test accuracy.
model = SVC()
model.fit(train_x, train_y)
predictions = model.predict(test_x)

accuracy = accuracy_score(test_y, predictions) * 100
print("Accuracy %.2f%%"% accuracy)

from sklearn.tree import DecisionTreeClassifier

# Seed NumPy's global random generator so repeated runs give the same result.
SEED = 50
np.random.seed(SEED)

# Shallow decision tree fitted on the raw_* feature sets.
model = DecisionTreeClassifier(max_depth = 2)
model.fit(raw_train_x, train_y)
predictions = model.predict(raw_test_x)

accuracy = accuracy_score(test_y, predictions) * 100
print("Accuracy %.2f%%"% accuracy)

from sklearn.tree import export_graphviz
import graphviz

# Export the fitted tree as DOT source (out_file=None returns a string) and
# wrap it in a graphviz.Source; the bare `graph` expression displays it in a
# notebook.
features = x.columns
dot_data = export_graphviz(model, out_file=None,
                           filled = True, rounded = True,
                            feature_names = features, class_names=["no","yes"])
graph = graphviz.Source(dot_data)
graph

Example #52
0
                               criterion='gini')
# Fit the ensemble and score it on the final hold-out set.
final.fit(X_train, Y_train)
y_pred = final.predict(X_final)

print('Accuracy:', metrics.accuracy_score(Y_final, y_pred))
#.74--- WOO

# Pick a single member of the ensemble to inspect.
estimator = final.estimators_[5]

#Visualizing the features in my decision tree. One of the perks of a decision tree is its relative interpretability compared to other ML algorithms.

from sklearn.tree import export_graphviz
# Export as dot file. out_file is named explicitly: without it, current
# scikit-learn returns the DOT text as a string, which was being discarded,
# so no file was written.
export_graphviz(estimator,
                out_file='tree.dot',
                rounded=True,
                proportion=False,
                precision=2,
                filled=True)

import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree

# Draw the fitted ``classifier`` with scikit-learn's matplotlib tree plotter.
tree.plot_tree(classifier)

#PCA
# NOTE(review): only min-max scaling is visible under this heading; the PCA
# step itself is not part of this excerpt.

from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
# Assign through .loc[:] so the scaled values are written back into the
# existing ``music_feature`` object rather than rebinding the name.
music_feature.loc[:] = min_max_scaler.fit_transform(music_feature.loc[:])
Example #53
0
# Target is the ``Category`` column; features are every column from index 1 on.
y = FeaturePicker.Category.to_frame()
X_train, X_test, y_train, y_test = train_test_split(FeaturePicker.iloc[:, 1:],
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

#Sklearn will generate a decision tree for your dataset using an optimized version of the CART algorithm when you run the following code
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

#metrics
import sklearn.metrics as met
y_pred = dtree.predict(X_test)
print(met.classification_report(y_test, y_pred))

#Graph the Tree
# ``sklearn.externals.six`` no longer exists in current scikit-learn; the
# standard-library StringIO is the same in-memory text buffer on Python 3.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(dtree,
                out_file=dot_data,
                filled=True,
                rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Calculate and display accuracy as 100 minus the mean of ``mape``
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')


# Pull out one tree from the forest (the statement was previously duplicated).
# NOTE(review): this rebinds the name ``tree``; if ``sklearn.tree`` was
# imported under that name earlier in the file it is shadowed from here on.
tree = model.estimators_[5]
# Saving feature names
feature_list = list(train.columns)
# Export the tree to a dot file
export_graphviz(tree, out_file='Big_tree.dot', feature_names=feature_list, rounded=True, precision=1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('Big_tree.dot')
# Write graph to a png file
graph.write_png('Big_tree.png')
# Predict on the test frame and hand the predictions to the submission writer.
print(test)
predicted_aqhi = model.predict(test)
make_submission(predicted_aqhi, 'Submission(RF)')
print("\n\nThe forest include only 10 trees and each tree included only the 3 level\n")
# Limit depth of tree to 3 levels


# Overlay actual and predicted values, each plotted against its sample index.
import matplotlib.pyplot as plt
_, ax = plt.subplots()
ax.scatter(
    x=range(len(aqiArray1)),
    y=aqiArray1,
    c='blue',
    label='Actual',
    alpha=0.3,
)
ax.scatter(
    x=range(predicted_aqhi.size),
    y=predicted_aqhi,
    c='red',
    label='Predicted',
    alpha=0.3,
)
    return y_hat-y

# Gradient of the loss at the current prediction ``y_hat``.
grad = loss_gradient(y, y_hat)
#%% 6. Pseudo Residuals - basically the negative of the gradient
pseudo_residuals = -loss_gradient(y, y_hat)

#%% 7 train first tree
from sklearn.tree import DecisionTreeRegressor
# A depth-1 tree (a single split) is fitted to the pseudo residuals.
regressor = DecisionTreeRegressor(max_depth=1)
#%%
ft = regressor.fit(x, pseudo_residuals)
print(ft)
#%%
from sklearn.tree import export_graphviz
import graphviz
# out_file=None is explicit so the DOT source always comes back as a string;
# relying on the default can yield None on older scikit-learn.
tree = export_graphviz(regressor, out_file=None, impurity=False, filled=True)
# NOTE(review): the file receives DOT text, not image data, despite its ".jpg"
# name; the name is kept unchanged for compatibility.
# The context manager closes the handle, which the bare open().write() did not.
with open("boston.jpg", "w") as dot_file:
    dot_file.write(tree)
graph = graphviz.Source(tree)
print(graph)
#%%
# Updated prediction: previous prediction plus the tree's fitted correction.
y1 = y_hat + regressor.predict(x)
plt.figure(figsize=(12, 8))
plt.plot(y)
plt.plot(y_hat)
plt.plot(y1)
plt.legend(['real values', 'mean', 'newest prediction'])
#%% check the new loss
cl = compute_loss(y, y1).mean()  #around 23
#%% Second iteration of the tree
pseudo_residuals = -loss_gradient(y, y1)  # not y_hat, but y1
regressor.fit(x, pseudo_residuals)
Example #56
0
            Y_test[i] = 1
            #print ("Hello")
        elif (Y_test[i] == '-'):
            Y_test[i] = 0
    #this part is for tra
    # for i in range(len(model_pred_a)):
    #     if (model_pred_a[i]== '+') :
    #         model_pred_a[i]=1
    #         #print ("Hello")
    #     elif (model_pred_a[i]=='-'):
    #         model_pred_a[i]=0

    for i in range(len(model_pred)):
        if (model_pred[i] == '+'):
            model_pred[i] = 1
            # print ("Hello")
        elif (model_pred[i] == '-'):
            model_pred[i] = 0
    model_pred = model_pred.astype(np.int)
    #model_pred_a=model_pred_a.astype(np.int)
    #print(model_pred_a)
    Y_test = Y_test.astype(np.int)
    #model_pred = np.array(model_pred)
    accuracy = np.equal(model_pred, Y_test).sum() / len(Y_test)
    #this part is for tra
    #accuracy_a=np.equal(model_pred_a,Y_test_1).sum()/len(Y_test_1)
    print(accuracy)

# Write the fitted ``model`` to '2e.dot' in Graphviz DOT format.
# NOTE(review): with an out_file given, export_graphviz presumably returns
# None, so ``part2`` would carry no data - confirm before using it.
part2 = tree.export_graphviz(model, out_file='2e.dot')
# print (accuracy)
Example #57
0
# One-hot encode the four categorical attribute columns.
d = pd.get_dummies(d, columns=['HC', 'DI', 'GA', 'RND'])
print(d)

# Encode the textual risk label as an integer class:
# 'alto' -> 0, 'moderado' -> 1, anything else -> 2.
d['RC'] = d.apply(lambda row: 0 if (row['Risco']) == 'alto'
                  else 1 if (row['Risco']) == 'moderado'
                  else 2, axis=1)
print()

print(d.head())
print()

#d['teste']=d.apply(lambda row:10 if (row['DI_baixa'])==1 and (row['RND_0 a 15'])==1 else 15,axis = 1)
#print(d.head())
#print()

# Shuffle the rows; the same shuffled frame serves as both train and test set.
d = d.sample(frac=1)
d_train = d
d_test = d
# Drop the encoded target AND the raw 'Risco' text column it was derived from.
# Leaving 'Risco' in the features would feed strings to the classifier and
# leak the label; only the one-hot columns remain as inputs.
d_train_att = d_train.drop(['RC', 'Risco'], axis=1)
d_train_pass = d_train['RC']

from sklearn import tree
t = tree.DecisionTreeClassifier(criterion="entropy")
t = t.fit(d_train_att, d_train_pass)

# class_names follow the ascending RC encoding (0, 1, 2) defined above.
tree.export_graphviz(t, out_file="risco.dot", label="all", impurity=False, proportion=True,
                     feature_names=list(d_train_att), class_names=['alto', 'moderado', 'baixo'],
                     filled=True, rounded=True)

# Classify one hand-written one-hot row (10 values, one per feature column).
t.predict([[0, 1, 0, 0, 1, 0, 1, 0, 0, 1]])
# VISUALIZATION #

# Ask whether to render the fitted classifier; only "y"/"Y" triggers the export.
if input("Make model tree? (y/N): ").lower() == 'y':
	print("\n\tSTARTING GRAPH")
	# Feature labels: summary statistics, direction maxima, location bins,
	# then one bin per hour of the day (0..23).
	FEATURES = [
		'Average_Position_X', 'Average_Position_Y',
		'Total_Distance', 'Average_Distance',
		'Total_Duration', 'Average_Duration',
		'Longest_Dist',
		'WMax', 'EMax', 'NMax', 'SMax',
		'Church-bin', 'NView-bin', 'Wallace-bin', 'Home-bin',
	] + ['%d-HourBin' % hour for hour in range(24)]

	DAYS = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
	ABBREVIATED_DAYS = ["Sunday", "Mon-Thu", "Wednesday", "Friday", "Saturday"]

	# Export the tree as DOT text into an in-memory buffer.
	dot_data = StringIO()
	tree.export_graphviz(
		clf,
		out_file=dot_data,
		feature_names=FEATURES,
		class_names=ABBREVIATED_DAYS,
		filled=True,
		rounded=True,
		impurity=True,
		rotate=True)

	# The parser returns a list of graphs; the first one is written as PDF.
	graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
	graph.write_pdf("tree_model.pdf")

	print("\n\t~~~FINISHED GRAPH~~~")
	print("\tSaved as tree_model.pdf")
# No-skill baseline for the precision-recall plot: the fraction of labels
# equal to 1 in ``labels_train_test``.
ns_pr_rf = len(
    labels_train_test[labels_train_test == 1]) / len(labels_train_test)

# Horizontal dashed line at the baseline, then the model's PR curve on top.
plt.plot([0, 1], [ns_pr_rf, ns_pr_rf], linestyle='--')
plt.plot(rf_recall, rf_precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(['No Skill', 'Random Forest'])
plt.savefig('rf_PR.png')
plt.show()

# 5.4 A decision tree
# Export one member of the fitted forest ``rf`` to DOT, then render it to PNG.
tree = rf.estimators_[5]
export_graphviz(tree,
                out_file='tree.dot',
                feature_names=features_list_train,
                rounded=True,
                precision=1)
(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')

# 5.5 A smaller tree
# Train a second forest capped at depth 3 and export one of its members.
rf_small = RandomForestClassifier(n_estimators=1660, max_depth=3)
rf_small.fit(features_train_train, labels_train_train)
tree_small = rf_small.estimators_[5]
export_graphviz(tree_small,
                out_file='small_tree.dot',
                feature_names=features_list_train,
                rounded=True,
                precision=1)
# NOTE(review): the graph is loaded here but no write_png call follows in this
# excerpt - the snippet appears to be cut off.
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
Example #60
0
import numpy as np
import pandas as pd
import os
from sklearn import tree
from sklearn import preprocessing
from IPython.display import Image

# NOTE(review): machine-specific working directory; "106.csv" and "tree3.dot"
# below are resolved relative to it.
mypath = 'C:\\Users\\ellen\\Desktop'
os.chdir(mypath)

train = pd.read_csv("106.csv")
features = ["time", "water", "age"]
# Select the feature columns directly instead of rebuilding a frame from
# individual Series and transposing it.
trainer = train[features]
tree_model = tree.DecisionTreeClassifier(max_depth=3)

tree_model.fit(X=trainer, y=train["survive"])

# Accuracy on the training data itself (no hold-out set is used here).
tree_model.score(X=trainer, y=train["survive"])

# Write the tree as DOT text. The return value is deliberately not assigned:
# the original rebound ``f`` to it, clobbering the open file handle.
with open("tree3.dot", 'w') as f:
    tree.export_graphviz(tree_model, feature_names=features, out_file=f)