stop =N else: stop = start +len_fold test = data.ix[rows[start:stop]] train = data.ix[rows[:start]+rows[stop:]] if resample: train_len=start+N-stop no_resamples = N-train_len train_rows = list(train.index) random_extra_rows =[random.choice(train_rows) for row in range(no_resamples)] train_rows = train_rows+random_extra_rows train=train.ix[train_rows] yield {'test':test, 'train':train} start=stop df=ct.cleaneddf(no_bins=10)[0] df2=ct.cleaneddf(no_bins=10)[1] df=df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']] data_type_dict={'Survived':'nominal', 'Pclass':'ordinal', 'Sex':'nominal', 'Age':'ordinal', 'SibSp':'ordinal', 'Parch':'ordinal', 'Fare':'ordinal', 'Embarked':'nominal'} def tree_train(data_type_dict, train_data,test_data, response, no_folds, min_node_size, max_depth, no_iter): parameters={'min_node_size':min_node_size, 'max_node_depth':max_depth, 'threshold':0, 'metric_kind':'Gini', 'alpha':0, 'response':response} model=ticart.ClassificationTree()
else: stop = start +len_fold test = data.ix[rows[start:stop]] train = data.ix[rows[:start]+rows[stop:]] if resample: train_len=start+N-stop no_resamples = N-train_len train_rows = list(train.index) random_extra_rows =[random.choice(train_rows) for row in range(no_resamples)] train_rows = train_rows+random_extra_rows train=train.ix[train_rows] yield {'test':test, 'train':train} start=stop df = ct.cleaneddf(no_bins=10)[0] df2=ct.cleaneddf(no_bins=10)[1] df=df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']] data_type_dict={'Survived':'nominal', 'Pclass':'ordinal', 'Sex':'nominal', 'Age':'ordinal', 'SibSp':'ordinal', 'Parch':'ordinal', 'Fare':'ordinal', 'Embarked':'nominal'} def tree_train(data_type_dict, train_data,test_data, response, no_folds, min_node_size, max_depth, no_iter): parameters={'min_node_size':min_node_size, 'max_node_depth':max_depth, 'threshold':0, 'metric_kind':'Gini', 'alpha':0, 'response':response} model=ticart.ClassificationTree()
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import cleantitanic as ct
import matplotlib.pylab as plt

# ct.cleaneddf() is indexed as [0] -> training frame, [1] -> test frame.
data = ct.cleaneddf()
traindf, testdf = data[0], data[1]


def proportionSurvived(discreteVar):
    """Return, per value of *discreteVar*, the share of each 'Survived' value.

    Rows of ``traindf`` are counted by (discreteVar, 'Survived'), pivoted so
    that 'Survived' values become columns, and every row is divided by its own
    total.

    discreteVar -- name of a column of the module-level ``traindf``.
    """
    counts = traindf.groupby([discreteVar, 'Survived']).size().unstack()
    # sum(axis=1) is the per-row total; axis=0 aligns the division on the index.
    return counts.div(counts.sum(axis=1), axis=0)


# One stacked horizontal bar chart per discrete variable, stacked vertically.
discreteVarList = ['Sex', 'Pclass', 'Embarked']
# Derive the subplot count from the list instead of repeating a literal 3.
fig1, axes1 = plt.subplots(len(discreteVarList), 1)
for i, var in enumerate(discreteVarList):
    table = proportionSurvived(var)
    table.plot(kind='barh', stacked=True, ax=axes1[i])
fig1.show()

# NOTE(review): the snippet ends here; fig2/axes2, genders and classes are
# presumably consumed by code beyond this view.
fig2, axes2 = plt.subplots(2, 3)
genders = traindf.Sex.unique()
classes = traindf.Pclass.unique()
root_list = [] root_list.append(root_index) layout = g.layout_reingold_tilford(root=root_list) ig.plot(g, layout=layout, margin=margin) def predict(self, data_point, class_probs=False): if class_probs: return self.vertices[0].get_data_leaf(data_point).prediction else: return self.vertices[0].get_data_leaf(data_point).predicted_class # https://triangleinequality.wordpress.com/2013/09/01/decision-trees-part-3-pruning-your-tree/ if __name__ == '__main__': import cleantitanic as ct df = ct.cleaneddf()[0] df = df[[ 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked' ]] data_type_dict = { 'survived': 'nominal', 'pclass': 'ordinal', 'sex': 'nominal', 'age': 'ordinal', 'sibsp': 'ordinal', 'parch': 'ordinal', 'fare': 'ordinal', 'embarked': 'nominal' } g = ClassificationTree()
root_list = [] root_list.append(root_index) layout = g.layout_reingold_tilford(root=root_list) ig.plot(g, layout=layout, margin=margin) def predict(self, data_point, class_probs=False): #预测函数 if class_probs: return self.vertices[0].get_data_leaf(data_point).prediction else: return self.vertices[0].get_data_leaf(data_point).predicted_class # https://triangleinequality.wordpress.com/2013/09/01/decision-trees-part-3-pruning-your-tree/ if __name__ == '__main__': import cleantitanic as ct df = ct.cleaneddf()[0] #这个是在进行数据的预处理 df = df[[ 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked' ]] #select relevant datas data_type_dict = { 'survived': 'nominal', 'pclass': 'ordinal', 'sex': 'nominal', #离散特征 'age': 'ordinal', #连续特征 'sibsp': 'ordinal', 'parch': 'ordinal', 'fare': 'ordinal', 'embarked': 'nominal' } g = ClassificationTree()
else: if type(self.pivot) ==set: if datapoint[self.split_attribute] in self.pivot: return self.left else: return self.right else: if datapoint[self.split_attribute] <=self.pivot: return self.left else: return self.right #Test Code Here t=PivotDecisionTree() t.create_vertex() t.set_root(t.vertices[0]) root = t.get_root() t.leaves.add(root) t.split_vertex(vertex=t.get_root(), split_attribute='sex', pivot=set(['female'])) import cleantitanic as ct data = ct.cleaneddf()[0] t.response='survived' root = t.get_root() root.local_data=root.local_filter(data) for child in root.children: child.local_data=child.local_filter(root.local_data) t.set_predictions() for leaf in t.leaves: print leaf.prediction