stop =N
        else:
            stop = start +len_fold
        test = data.ix[rows[start:stop]]
        train = data.ix[rows[:start]+rows[stop:]]
        if resample:
            train_len=start+N-stop
            no_resamples = N-train_len
            train_rows = list(train.index)
            random_extra_rows =[random.choice(train_rows) for row in range(no_resamples)]
            train_rows = train_rows+random_extra_rows
            train=train.ix[train_rows]
        yield {'test':test, 'train':train}
        start=stop

# Clean the data once and reuse the result. The original called
# ct.cleaneddf(no_bins=10) twice, repeating the whole cleaning pass just to
# pick a different element of the same return value.
_cleaned = ct.cleaneddf(no_bins=10)
df = _cleaned[0]
df2 = _cleaned[1]

# Restrict df to the columns described in data_type_dict below.
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
         'Fare', 'Embarked']]

# Column name -> measurement scale ('nominal' or 'ordinal').
# Presumably consumed by the tree code to choose how each column is split;
# confirm against the ClassificationTree implementation.
data_type_dict = {
    'Survived': 'nominal',
    'Pclass': 'ordinal',
    'Sex': 'nominal',
    'Age': 'ordinal',
    'SibSp': 'ordinal',
    'Parch': 'ordinal',
    'Fare': 'ordinal',
    'Embarked': 'nominal',
}

       
def tree_train(data_type_dict, train_data,test_data, response, no_folds,
                   min_node_size, max_depth, no_iter):
        parameters={'min_node_size':min_node_size, 'max_node_depth':max_depth, 
                    'threshold':0, 'metric_kind':'Gini', 'alpha':0,
                    'response':response}
        model=ticart.ClassificationTree()
        else:
            stop = start +len_fold
        test = data.ix[rows[start:stop]]
        train = data.ix[rows[:start]+rows[stop:]]
        if resample:
            train_len=start+N-stop
            no_resamples = N-train_len
            train_rows = list(train.index)
            random_extra_rows =[random.choice(train_rows) for row in range(no_resamples)]
            train_rows = train_rows+random_extra_rows
            train=train.ix[train_rows]
        yield {'test':test, 'train':train}
        start=stop


# Clean the data once and reuse the result. The original called
# ct.cleaneddf(no_bins=10) twice, repeating the whole cleaning pass just to
# pick a different element of the same return value.
_cleaned = ct.cleaneddf(no_bins=10)
df = _cleaned[0]
df2 = _cleaned[1]

# Restrict df to the columns described in data_type_dict below.
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
         'Fare', 'Embarked']]

# Column name -> measurement scale ('nominal' or 'ordinal').
# Presumably consumed by the tree code to choose how each column is split;
# confirm against the ClassificationTree implementation.
data_type_dict = {
    'Survived': 'nominal',
    'Pclass': 'ordinal',
    'Sex': 'nominal',
    'Age': 'ordinal',
    'SibSp': 'ordinal',
    'Parch': 'ordinal',
    'Fare': 'ordinal',
    'Embarked': 'nominal',
}

       
def tree_train(data_type_dict, train_data,test_data, response, no_folds,
                   min_node_size, max_depth, no_iter):
        parameters={'min_node_size':min_node_size, 'max_node_depth':max_depth, 
                    'threshold':0, 'metric_kind':'Gini', 'alpha':0,
                    'response':response}
        model=ticart.ClassificationTree()
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import cleantitanic as ct
import matplotlib.pylab as plt

# Load the cleaned data once; element 0 is bound as the training frame and
# element 1 as the test frame.
data = ct.cleaneddf()
traindf = data[0]
testdf = data[1]


def proportionSurvived(discreteVar, df=None):
    """Return the share of each ``Survived`` class within every level of a column.

    Args:
        discreteVar: Name of the column to group by.
        df: Frame holding ``discreteVar`` and a ``'Survived'`` column.
            Defaults to the module-level ``traindf``, which is what the
            original one-argument form always used.

    Returns:
        A DataFrame indexed by the values of ``discreteVar`` with one column
        per ``Survived`` value. Each row is the class counts divided by the
        row total, so the observed entries of a row sum to 1. A combination
        that never occurs in the data is NaN.
    """
    if df is None:
        df = traindf
    # Count rows per (level, Survived) pair, then pivot Survived into columns.
    counts = df.groupby([discreteVar, 'Survived']).size().unstack()
    # Normalise row-wise: divide every row by its own total.
    return counts.div(counts.sum(axis=1), axis=0)

discreteVarList = ['Sex', 'Pclass', 'Embarked']

# One panel per variable, stacked vertically. The panel count follows the
# list instead of a hard-coded 3, so adding a variable needs no other change.
# NOTE: with a single variable, subplots() returns a bare Axes rather than an
# array, and the zip below would need adjusting.
fig1, axes1 = plt.subplots(len(discreteVarList), 1)

# Pair each variable with its own axes and draw the class proportions as
# horizontal stacked bars.
for var, ax in zip(discreteVarList, axes1):
    table = proportionSurvived(var)
    table.plot(kind='barh', stacked=True, ax=ax)
fig1.show()

# Second figure: a 2x3 grid of axes.
fig2, axes2 = plt.subplots(2,3)
# Distinct values found in the 'Sex' and 'Pclass' columns of traindf.
genders=traindf.Sex.unique()
classes=traindf.Pclass.unique()
Esempio n. 4
0
        root_list = []
        root_list.append(root_index)
        layout = g.layout_reingold_tilford(root=root_list)
        ig.plot(g, layout=layout, margin=margin)

    def predict(self, data_point, class_probs=False):
        """Predict for one data point using the leaf it falls into.

        The leaf is found by calling ``get_data_leaf(data_point)`` on the
        first vertex in ``self.vertices``.

        Args:
            data_point: The record to classify.
            class_probs: When true, return the leaf's ``prediction``
                attribute; otherwise return its ``predicted_class``.
        """
        leaf = self.vertices[0].get_data_leaf(data_point)
        return leaf.prediction if class_probs else leaf.predicted_class


# https://triangleinequality.wordpress.com/2013/09/01/decision-trees-part-3-pruning-your-tree/
if __name__ == '__main__':
    # Script entry point: load the data, describe its columns, and create a
    # ClassificationTree instance.
    import cleantitanic as ct
    # Element 0 of cleaneddf()'s result (presumably the training frame;
    # confirm in cleantitanic).
    df = ct.cleaneddf()[0]
    # Restrict df to the columns described in data_type_dict below.
    df = df[[
        'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
        'embarked'
    ]]
    # Column name -> measurement scale ('nominal' or 'ordinal').
    data_type_dict = {
        'survived': 'nominal',
        'pclass': 'ordinal',
        'sex': 'nominal',
        'age': 'ordinal',
        'sibsp': 'ordinal',
        'parch': 'ordinal',
        'fare': 'ordinal',
        'embarked': 'nominal'
    }
    g = ClassificationTree()
        root_list = []
        root_list.append(root_index)
        layout = g.layout_reingold_tilford(root=root_list)
        ig.plot(g, layout=layout, margin=margin)

    def predict(self, data_point, class_probs=False):
        """Prediction function: predict from the leaf a data point falls into.

        The leaf is found by calling ``get_data_leaf(data_point)`` on the
        first vertex in ``self.vertices``.

        Args:
            data_point: The record to classify.
            class_probs: When true, return the leaf's ``prediction``
                attribute; otherwise return its ``predicted_class``.
        """
        leaf = self.vertices[0].get_data_leaf(data_point)
        return leaf.prediction if class_probs else leaf.predicted_class


# https://triangleinequality.wordpress.com/2013/09/01/decision-trees-part-3-pruning-your-tree/
if __name__ == '__main__':
    # Script entry point: load the data, describe its columns, and create a
    # ClassificationTree instance.
    import cleantitanic as ct
    df = ct.cleaneddf()[0]  # this step performs the data preprocessing
    df = df[[
        'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
        'embarked'
    ]]  # select the relevant columns
    # Column name -> measurement scale ('nominal' or 'ordinal').
    data_type_dict = {
        'survived': 'nominal',
        'pclass': 'ordinal',
        'sex': 'nominal',  # discrete feature
        'age': 'ordinal',  # continuous feature
        'sibsp': 'ordinal',
        'parch': 'ordinal',
        'fare': 'ordinal',
        'embarked': 'nominal'
    }
    g = ClassificationTree()
Esempio n. 6
0
        else:
            if type(self.pivot) ==set:
                if datapoint[self.split_attribute] in self.pivot:
                    return self.left
                else:
                    return self.right
            else:
                if datapoint[self.split_attribute] <=self.pivot:
                    return self.left
                else:
                    return self.right



#Test Code Here
# Smoke test: hand-build a tree with a single split on 'sex', push the data
# through it, and print the prediction stored on each leaf.
t = PivotDecisionTree()
t.create_vertex()
t.set_root(t.vertices[0])
root = t.get_root()
t.leaves.add(root)
# A set-valued pivot: the child-selection code checks `type(pivot) == set`
# and sends members of the set to the left child.
t.split_vertex(vertex=t.get_root(), split_attribute='sex', pivot={'female'})
import cleantitanic as ct
data = ct.cleaneddf()[0]
t.response = 'survived'
root = t.get_root()
# Populate local_data top-down: the root filters the full data set, then each
# child filters what its parent kept.
root.local_data = root.local_filter(data)
for child in root.children:
    child.local_data = child.local_filter(root.local_data)
# Presumably computes each leaf's `prediction` from its local_data; confirm
# in PivotDecisionTree.set_predictions.
t.set_predictions()
for leaf in t.leaves:
    # Function-call form works on both Python 2 and 3; the original
    # `print leaf.prediction` statement is a SyntaxError on Python 3.
    print(leaf.prediction)