Beispiel #1
0
def Choosing_the_Best_Split():
    """Print impurity figures for the sample data before and after one split.

    Reloads ``treepredict``, prints ``giniimpurity`` and ``entropy`` of
    ``treepredict.my_data``, divides the rows on column 2 against the value
    'yes', then prints ``entropy`` followed by ``giniimpurity`` of the first
    subset returned by ``divideset``.
    """
    print('>>Choosing the Best Split')
    reload(treepredict)

    # Bound after the reload so the freshly loaded rows are the ones scored.
    rows = treepredict.my_data

    whole_gini = treepredict.giniimpurity(rows)
    print(whole_gini)
    whole_entropy = treepredict.entropy(rows)
    print(whole_entropy)

    # Only the first subset is reported; the second is unpacked and ignored.
    first_subset, _second_subset = treepredict.divideset(rows, 2, 'yes')

    subset_entropy = treepredict.entropy(first_subset)
    print(subset_entropy)
    subset_gini = treepredict.giniimpurity(first_subset)
    print(subset_gini)
results = treepredict.divideset(treepredict.my_data, 2, "yes")
# results holds the subsets produced by the split; the unpacking further down
# relies on there being exactly two of them.

# See if records are divided according to the Read FAQ field (column 2) ...

print("\nDivision on Read FAQ field...\n")
# The outer loop variable is named `subset` rather than `list`: at module
# level a variable called `list` would shadow the builtin for everything that
# runs after this loop.
for subset in results:
    for item in subset:
        # The format string expects five fields per row, the fourth numeric.
        print("%15s %15s %5s %10d %15s" % tuple(item))


# Let's see the difference between gini- and entropy-based impurities
# of the current data (no splitting)
print("\nParent node...\n")
gini = treepredict.giniimpurity(treepredict.my_data)
entr = treepredict.entropy(treepredict.my_data)
print("Gini: %8f    Entropy: %8f" % (gini, entr))

# Let's now split on the Read FAQ field and assess impurity
# (only the first subset, node1, is scored here).
node1, node2 = treepredict.divideset(treepredict.my_data, 2, "yes")
print("\nRead FAQ =  Yes leaf node...\n")
gini = treepredict.giniimpurity(node1)
entr = treepredict.entropy(node1)
print("Gini: %8f    Entropy: %8f" % (gini, entr))

# Build the DT recursively using the buildtree function; assumes
# last column/field is the classification attribute.

tree = treepredict.buildtree(treepredict.my_data)

# Let's see what it looks like...
Beispiel #3
0
#! /usr/bin/env python2

import treepredict

print "Gini impurity\n"
print treepredict.giniimpurity(treepredict.my_data)

print "\n"

print "treepredict.entropy\n"
print treepredict.entropy(treepredict.my_data)

print "\n"

set1, set2 = treepredict.divideset(treepredict.my_data, 2, 'yes')

print "Gini impurity\n"
print treepredict.giniimpurity(set1)
print "treepredict.entropy\n"
print treepredict.entropy(set1)

print '\n'
tree = treepredict.buildtree(treepredict.my_data)
print 'tree: ', tree

print '\n'

print 'classify: ', treepredict.classify(['(direct)', 'USA', 'yes', 5], tree)
Beispiel #4
0
# Split the sample rows on column 2 against 'yes' and pretty-print the
# result; the output recorded at the time follows as a comment.
data = treepredict.my_data
halves = treepredict.divideset(data, 2, 'yes')
pprint(halves)
# ([['slashdot', 'USA', 'yes', 18, 'None'],
#   ['google', 'France', 'yes', 23, 'Premium'],
#   ['digg', 'USA', 'yes', 24, 'Basic'],
#   ['kiwitobes', 'France', 'yes', 23, 'Basic'],
#   ['slashdot', 'France', 'yes', 19, 'None'],
#   ['digg', 'New Zealand', 'yes', 12, 'Basic'],
#   ['google', 'UK', 'yes', 18, 'Basic'],
#   ['kiwitobes', 'France', 'yes', 19, 'Basic']],
#  [['google', 'UK', 'no', 21, 'Premium'],
#   ['(direct)', 'New Zealand', 'no', 12, 'None'],
#   ['(direct)', 'UK', 'no', 21, 'Basic'],
#   ['google', 'USA', 'no', 24, 'Premium'],
#   ['digg', 'USA', 'no', 18, 'None'],
#   ['google', 'UK', 'no', 18, 'None'],
#   ['kiwitobes', 'UK', 'no', 19, 'None'],
#   ['slashdot', 'UK', 'no', 21, 'None']])

# Impurity of the unsplit data, first Gini and then entropy.
gini_all = treepredict.giniimpurity(data)
print(gini_all)
# 0.6328125

entropy_all = treepredict.entropy(data)
print(entropy_all)
# 1.50524081494

# Split again and report the entropy of each subset in turn.
set1, set2 = treepredict.divideset(data, 2, 'yes')
entropy_set1 = treepredict.entropy(set1)
print(entropy_set1)
# 1.2987949407

entropy_set2 = treepredict.entropy(set2)
print(entropy_set2)
# 1.2987949407
  def testBasics(self):
    # Checks treepredict.entropy on the module's test data, and again on the
    # first subset obtained by dividing that data on column 2 against 'yes'.
    d = treepredict.testdata()
    # assertAlmostEqual is the supported spelling; the assertAlmostEquals
    # alias is deprecated and no longer exists in recent unittest versions.
    self.assertAlmostEqual(1.5052408, treepredict.entropy(d))

    # Only the first subset is checked; the second is discarded.
    s1, _ = treepredict.divideset(d, 2, 'yes')
    self.assertAlmostEqual(1.2987949, treepredict.entropy(s1))
    def testBasics(self):
        # Checks treepredict.entropy on the module's test data, and again on
        # the first subset obtained by dividing that data on column 2
        # against 'yes'.
        d = treepredict.testdata()
        # assertAlmostEqual is the supported spelling; the assertAlmostEquals
        # alias is deprecated and no longer exists in recent unittest
        # versions.
        self.assertAlmostEqual(1.5052408, treepredict.entropy(d))

        # Only the first subset is checked; the second is discarded.
        s1, _ = treepredict.divideset(d, 2, 'yes')
        self.assertAlmostEqual(1.2987949, treepredict.entropy(s1))