This repository has been archived by the owner on May 2, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_tree.py
86 lines (69 loc) · 2.4 KB
/
create_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""Create a DecisionTree from a training set. Be sure to specify a Graphviz
output or else this will all be for naught.
Usage:
create_tree.py <input> <class> [options]
Options:
-h --help Show this screen.
-v --verbose Verbose mode.
-s SAMPLES --samples SAMPLES Minimum samples for a grouping [default: 20].
-e EXCEPT --except EXCEPT Columns to exclude (comma delimited).
-o OUT --output OUT Filename for graphviz output (no extension).
"""
from docopt import docopt
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import sys
def create_tree(args):
    """Fit a decision tree on a CSV dataset and optionally render it.

    Args:
        args: Mapping of parsed command-line arguments (docopt style).
            Keys read: ``<input>`` (path of the CSV file), ``<class>``
            (name of the target column), ``--verbose``, ``--samples``,
            ``--except`` (comma-delimited column names to exclude) and
            ``--output`` (base filename for the Graphviz files).

    Side effects:
        When ``--output`` is set, writes ``<output>.dot`` and invokes the
        Graphviz ``dot`` executable to produce ``<output>.png``.

    Exits:
        Calls ``sys.exit(1)`` when the class column is not present in the
        dataset, or when running ``dot`` fails.
    """
    # load
    df = pd.read_csv(args['<input>'])
    class_attr = args['<class>']

    # check
    if class_attr not in df.columns:
        print('Class attribute "{}" not in dataset!'.format(class_attr))
        sys.exit(1)

    # flags & options
    verbose = bool(args['--verbose'])
    output = args['--output'] if args['--output'] else None
    exceptions = args['--except'].split(',') if args['--except'] else None
    samples = int(args['--samples'])

    # get numeric columns and drop class and exceptions (our features)
    features = df._get_numeric_data().columns.difference([class_attr])
    if exceptions:
        if verbose:
            print('Removing the following: {}'.format(', '.join(exceptions)))
        features = features.difference(exceptions)

    # verbose detail
    if verbose:
        print('Using the following features: {}'.format(', '.join(features)))

    # create tree
    dt = DecisionTreeClassifier(
        min_samples_split=samples,
        criterion='entropy',
        splitter='best',
        random_state=99)
    y = df[class_attr]
    X = df[features]
    dt.fit(X, y)

    # export graph of tree if graph output specified
    if output:
        import subprocess

        dot_path = output + '.dot'
        png_path = output + '.png'
        if verbose:
            print('Saving as {0}.dot and {0}.png'.format(output))

        # save dot; pass a plain list because a pandas Index is not accepted
        # as feature_names by every scikit-learn release
        with open(dot_path, 'w') as f:
            export_graphviz(dt, out_file=f, feature_names=list(features))

        # Build argv explicitly: formatting a string and splitting it on
        # whitespace would break output paths that contain spaces.
        command = ['dot', '-Tpng', dot_path, '-o', png_path]
        try:
            subprocess.check_call(command)
        except (OSError, subprocess.CalledProcessError):
            # OSError: the dot executable could not be started;
            # CalledProcessError: dot ran but returned a non-zero status.
            print('Problem creating graphviz image, is graphviz installed?')
            sys.exit(1)
if __name__ == '__main__':
    # Parse the command line against the module usage text, then build the tree.
    create_tree(docopt(__doc__))