Ejemplo n.º 1
0
Archivo: run.py Proyecto: wz125/courses
def Dealing_with_Missing_Data():
  print '>>Dealing with Missing Data'
  reload(treepredict)
  tree=treepredict.buildtree(treepredict.my_data)
  print '------------------'
  print treepredict.mdclassify(['google',None,'yes',None],tree)
  print treepredict.mdclassify(['google','France',None,None],tree)
Ejemplo n.º 2
0
def train_random_trees(train_data, origin_attribute_list, label_list,
                       sample_copy_count, attribute_count_per_tree):
    """Train decision trees on random samples and random attribute subsets.

    For each of ``sample_copy_count`` rounds a sample is produced by
    ``generate_random_sample(train_data)`` and a collection of
    attribute-index lists is obtained from ``choose_attributes_lists``;
    one tree is built per attribute-index list.

    Returns:
        A list of dicts. Each dict has the key ``'tree'`` (the value
        returned by ``treepredict.buildtree``) and the key
        ``'attributes_index'`` (the attribute indices used for that tree).
    """
    forest = []
    for _ in range(sample_copy_count):
        sample = generate_random_sample(train_data)
        # Attribute-index lists (randomly chosen), one list per tree.
        index_groups = choose_attributes_lists(
            len(origin_attribute_list), attribute_count_per_tree)
        for index_group in index_groups:
            # Rebuild the training set so that only the attributes used by
            # the current tree remain.
            reduced = organize_sample_with_selected_attributes(
                sample, index_group)
            # Map attribute indices to the corresponding attribute entries.
            selected_attributes = [
                origin_attribute_list[position] for position in index_group
            ]
            built = treepredict.buildtree(reduced, selected_attributes,
                                          label_list)
            forest.append({
                'tree': built,
                'attributes_index': index_group
            })
    return forest
Ejemplo n.º 3
0
def testing_gain_increments(increments=[]):
    classresults = {}

    for increment in increments:
        tree = treepredict.buildtree(train_data,
                                     gain_increment=increment,
                                     gain_threshold=0,
                                     instance_minimum=1)

        trainConfMat, crTrain = treepredict.testTree(train_data, tree)
        print 'Training set confusion matrix (Classification rate:', crTrain, '):'
        for row in trainConfMat:
            print '\t'.join(map(lambda x: str(x), row))

        print ''

        testConfMat, crTest = treepredict.testTree(test_data, tree)
        print 'Test set confusion matrix (Classification rate:', crTest, '):'
        for row in testConfMat:
            print '\t'.join(map(lambda x: str(x), row))

        print ''

        classresults[increment] = [crTest]

    return classresults
Ejemplo n.º 4
0
Archivo: run.py Proyecto: wz125/courses
def Modeling_Home_Prices(): 
  print '>>Modeling Home Prices'
  import zillow
  if os.path.exists('housedata.txt'):
    f=open('housedata.txt','r')
    lines=f.readlines()
    housedata=[]
    for line in lines:
      fields=line.split('\t')
      l1=[fields[0],fields[1],fields[2],fields[3],fields[4],fields[5],fields[6]]
      housedata.append(l1)
    f.close();
  else:
    housedata=zillow.getpricelist( )
    f=open('housedata.txt','w')
    for l in housedata:
      if l is None:
        continue
      print l
      for k in l:
        f.write('%s\t' % (k))
      f.write('\n')
    f.close
  reload(treepredict)
  housetree=treepredict.buildtree(housedata,scoref=treepredict.variance)
  treepredict.drawtree(housetree,'housetree.jpg')
Ejemplo n.º 5
0
def main():
    from treepredict import buildtree, entropy, drawtree
    # house_data = getpricelist()
    # print house_data
    print 'build tree'
    t = buildtree(house_data, scoref=entropy)
    print 'draw tree'
    drawtree(t, 'house_price_tree.jpeg')
Ejemplo n.º 6
0
Archivo: run.py Proyecto: wz125/courses
def Pruning_the_Tree():
  print '>>Pruning the Tree'
  reload(treepredict)
  tree=treepredict.buildtree(treepredict.my_data)
  print '------------------'
  treepredict.prune(tree,0.1)
  treepredict.printtree(tree)
  treepredict.prune(tree,1.0)
  treepredict.printtree(tree)
def main(rows):
    """Build a decision tree from ``rows``, print it and draw it.

    The image is written to 'treeview.jpg'.
    """
    decision_tree = treepredict.buildtree(rows)

    # Text rendering of the decision tree.
    treepredict.printtree(decision_tree)
    # Image rendering of the same tree.
    treepredict.drawtree(decision_tree, jpeg='treeview.jpg')
def testing_gain_increments(increments=[]):
  classresults={}
  
  for increment in increments:
    tree=treepredict.buildtree(train_data,gain_increment=increment,gain_threshold=0,instance_minimum=1)

    trainConfMat, crTrain = treepredict.testTree(train_data, tree)
    print 'Training set confusion matrix (Classification rate:', crTrain,'):'
    for row in trainConfMat:
      print '\t'.join(map(lambda x:str(x), row))

    print ''
  
    testConfMat, crTest  = treepredict.testTree(test_data,  tree) 
    print 'Test set confusion matrix (Classification rate:', crTest,'):'
    for row in testConfMat:
      print '\t'.join(map(lambda x:str(x), row))

    print ''

    
    classresults[increment]=[crTest]

  return classresults
Ejemplo n.º 9
0
            #print doc.toxml()
            gender = doc.getElementsByTagName('gender')[0].firstChild.data
            age = doc.getElementsByTagName('age')[0].firstChild.data
            loc = doc.getElementsByTagName('location')[0].firstChild.data

            region = None
            for r, s in stateregions.iteritems():
                if loc[0:2] in s: region = r

            if region:
                result.append((gender, int(age), region, rating))
        except:
            pass
    return result


if __name__ == '__main__':
    # Script entry point: fetch 50 random ratings, derive people data from
    # them, build a variance-scored tree, prune it and draw it to hottree.png.
    d = getrandomratings(50)

    # Original author's open question: all results always seem to be of the
    # same gender?
    pdata = getpeopledata(d)
    print pdata

    import drawtree
    import treepredict

    tree = treepredict.buildtree(pdata, treepredict.variance)
    # Prune with a threshold of 0.5 before drawing.
    treepredict.prune(tree, 0.5)
    drawtree.drawtree(tree, 'hottree.png')
    print 'Wrote hottree.png'
Ejemplo n.º 10
0
def train_simple_tree(training_data):
    print "Training Simple Tree"
    tree = treepredict.buildtree(training_data)
    return tree
Ejemplo n.º 11
0
	def decision_tree1(self, evt):
		"""Train and draw a decision tree over PEK->PVG flight price data.

		For every flight in self.fl_lines[('PEK','PVG')].set_of_flights a row
		(weekday, time flag, avg_price, result_price) is built, where both
		prices are expressed as int(price / full_price * 10) with
		full_price = 1130. Flights with at most one usable price point are
		skipped. The tree is built with treepredict.giniimpurity and drawn to
		'flighttree_entropy.jpg'.

		The evt argument is not used in this method.
		"""
		import treepredict
		reload(treepredict)
		full_price = 1130
		flights = self.fl_lines[('PEK','PVG')].set_of_flights

		data = []
		for deptid in flights.keys():
			flight = flights[deptid]
			ftype = flight['ftype']
			deptdate = flight['date']
			deptdate = deptdate.split('/')

			# Get the day of the week
			weekday = datetime.datetime(int(deptdate[0]),int(deptdate[1]),int(deptdate[2])).weekday()
			weekday = int(weekday)
			#print weekday
			time = flight['time']

			# If the departure time is very early or very late, use 1, otherwise 0
			if int(time[0:2]) < 9 or int(time[0:2]) > 20:
				time = 1
			else:
				time = 0

			# Process the price
			dd = flight['date']

			# str-->date
			dd = dd.split('/')
			deptdate = datetime.date(int(dd[0]), int(dd[1]), int(dd[2]))
			price = flight['price']

			# Collect (days between fetch date and departure date, price) pairs
			points = []
			for ftdate in price.keys():
				ff = ftdate.split('/')
				fetchdate = datetime.date(int(ff[0]), int(ff[1]), int(ff[2]))
				days = (deptdate-fetchdate).days
				points.append((days, price[ftdate]))

			# NOTE(review): self.pre is not visible here; its result is used as
			# a mapping below -- presumably keyed by days before departure.
			points = self.pre(points)
			p = []
			for i in points.keys():
				if i >= 6:
					p.append(points[i])

			#print p
			if len(p) <= 1:
				continue
			# The first collected value is the target; the rest are averaged.
			# NOTE(review): which value comes first depends on the iteration
			# order of points.keys() -- confirm it is deterministic.
			result_price = p[0]
			p.pop(0)
			avg_price = sum(p)/len(p)

			result_price = int(float(result_price)/float(full_price)*10)
			avg_price = int(float(avg_price)/float(full_price)*10)

			data.append((weekday, time, avg_price, result_price))
##		fout = open('task.txt', 'w')
##		lines = ['%s %s %s %s\n' %v for v in data]
##		fout.writelines(lines)
##		fout.close()
		# NOTE(review): the output file name says "entropy" but the score
		# function passed is giniimpurity.
		flighttree = treepredict.buildtree(data, scoref = treepredict.giniimpurity)
		treepredict.drawtree(flighttree,'flighttree_entropy.jpg')
 def testBasics(self):
   """Classifying a known observation against the testdata() tree gives {'Basic': 4}."""
   t = treepredict.buildtree(treepredict.testdata())
   # assertEqual replaces the deprecated assertEquals alias.
   self.assertEqual(treepredict.classify(['(direct)', 'USA', 'yes', 5], t),
       {'Basic': 4})
  try:
    zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data
    use = doc.getElementsByTagName('useCode')[0].firstChild.data
    year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data
    bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data
    bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data
    #rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data
    price = doc.getElementsByTagName('amount')[0].firstChild.data
  except Exception, e:
    #print e
    return None

  #return zipcode, use, int(year), float(bath), int(bed), int(rooms), price
  return zipcode, use, int(year), float(bath), int(bed), price


def getpricelist():
  """Return address data for every line of 'addresslist.txt'.

  Each stripped line is passed to getaddressdata together with the city
  'Cambridge,MA'; falsy results (such as None) are dropped.

  Returns:
    A list of the truthy getaddressdata results, in file order.
  """
  # The with-block closes the file; the original left the handle open.
  with open('addresslist.txt') as address_file:
    records = [getaddressdata(line.strip(), 'Cambridge,MA')
               for line in address_file]
  return [record for record in records if record]


if __name__ == '__main__':
  # Script entry point: fetch the price list, print it, build a tree scored
  # with treepredict.variance and draw it to zillow.png.
  import drawtree
  import treepredict

  housedata = getpricelist()
  print housedata
  tree = treepredict.buildtree(housedata, scorefun=treepredict.variance)
  drawtree.drawtree(tree, 'zillow.png')
  print "Wrote zillow.png"
import treepredict as tr

# Build a tree from tr.my_data and print it.
tree = tr.buildtree(tr.my_data)
tr.printtree(tree)
# Classify two observations whose missing fields are given as None.
print tr.mdclassify(['google',None,'yes',None],tree)
print tr.mdclassify(['google','France',None,None],tree)
Ejemplo n.º 15
0
def train_simple_tree(training_data):
    print "Training Simple Tree"
    tree = treepredict.buildtree(training_data)
    return tree
Ejemplo n.º 16
0
import treepredict
# Fruits with their sizes and colors; the last column is the fruit name.
fruits = [
    [4, 'red', 'apple'],
    [4, 'green', 'apple'],
    [1, 'red', 'cherry'],
    [1, 'green', 'grape'],
    [5, 'red', 'apple'],
]
tree = treepredict.buildtree(fruits)
# Classify three two-field observations; the results are not used.
for _observation in ([2, 'red'], [5, 'red'], [1, 'green']):
    treepredict.classify(_observation, tree)
treepredict.printtree(tree)
#treepredict.drawtree(tree, jpeg='treeview.jpg')
Ejemplo n.º 17
0
orig_date = datetime.date(2014, 2, 1)
for i in range(60):
    timedelta = datetime.timedelta(i)
    cur_date = orig_date + timedelta
    str_date = cur_date.strftime('%y/%m/%d')
    str_date = "20" + str_date
    new_flights = fl_lines.get_flights_by_dpdate(str_date)
    train_flights.update(new_flights)

#train_flights = fl_lines.get_flights_by_dpdate('2014/05/25')
#print train_flights
print '训练数据读入完成,用时', time.clock()

train_data = get_data(train_flights)
#writeintxt('train_data.txt',train_data)
flighttree = treepredict.buildtree(train_data, scoref=treepredict.giniimpurity)
#treepredict.drawtree(flighttree, 'test.jpg')
print '树训练完成,用时%s,数据%s条' % (time.clock(), len(train_data))

test_flights = {}
orig_date = datetime.date(2014, 5, 2)
for i in range(10):
    timedelta = datetime.timedelta(i)
    cur_date = orig_date + timedelta
    str_date = cur_date.strftime('%y/%m/%d')
    str_date = "20" + str_date
    new_flights = fl_lines.get_flights_by_dpdate(str_date)
    test_flights.update(new_flights)

test_data = get_data(test_flights)
#writeintxt('test_data.txt', test_data)
Ejemplo n.º 18
0
Archivo: lc.py Proyecto: mattc58/lc
 def make_tree(self, data):
     '''
     Build a decision tree from the supplied data and return it
     (delegates to treepredict.buildtree).
     '''
     tree = treepredict.buildtree(data)
     return tree
Ejemplo n.º 19
0
Archivo: run.py Proyecto: wz125/courses
def Classifying_New_Observations():
  reload(treepredict)
  tree=treepredict.buildtree(treepredict.my_data)
  print '>>Classifying New Observations'
  print treepredict.classify(['(direct)','USA','yes',5],tree)
Ejemplo n.º 20
0
Archivo: run.py Proyecto: wz125/courses
def Recursive_Tree_Building():
  print '>>Recursive Tree Building'
  reload(treepredict)
  tree=treepredict.buildtree(treepredict.my_data)
  treepredict.printtree(tree)
increments=[0]

for i in xrange(1,10):
  x=10**(-i)
  increments.append(x)

print 'Increments to be tested and passed to gain_increments',increments
accuracyTest=testing_gain_increments(increments)
#print accuracyTest
values=accuracyTest.keys()
values.sort(cmp=lambda a,b:cmp(accuracyTest[a],accuracyTest[b]))
print 'Increment value with best classification rate was ',values[-1]

# Let's see what it looks like...
#print "\nFinal tree...\n"
treepredict.printtree(treepredict.buildtree(train_data,gain_increment=values[-1],gain_threshold=0,instance_minimum=1))

# Produce a png of the tree
treepredict.drawtree(tree,jpeg="sample_tree.jpg")
#print "\npng of tree generated using PIL (Python Imaging Library) modules.\n"

# Let's classify an incoming record of '(direct), USA, yes, 5' ...
#incoming = ['(direct)','USA','yes',5]
#print "Prediction of new record: ",treepredict.classify(incoming,tree)

# Finally, what does pruning do with say a mingain = 0.9 ?
#print "\nPruned tree...\n"
#treepredict.prune(tree,0.9)
#treepredict.printtree(tree)

# For group homework, modify "buildtree" function so that it stops
	code = doc.getElementsByTagName('code')[0].firstChild.data

	if code != '0': 
		print 'Code Error!'
		return None
	try:
		zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data
		use = doc.getElementsByTagName('useCode')[0].firstChild.data
		year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data
		bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data
		bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data
		rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data
		price = doc.getElementsByTagName('amount')[0].firstChild.data
	except:
		print 'Error!'
		return None

	return (str(zipcode), str(use), int(year), float(bath), int(bed), int(rooms), int(price))

def getPriceList(filename):
	"""Return address data for every line of ``filename``.

	Each stripped line is passed to getAddressData together with the city
	'Cambridge,MA'; results that are None are left out.

	Returns:
		A list of the non-None getAddressData results, in file order.
	"""
	pricelist = []
	# The with-block closes the file; the original left the handle open.
	with open(filename) as addressfile:
		for line in addressfile:
			data = getAddressData(line.strip(), 'Cambridge,MA')
			if data is not None:
				pricelist.append(data)
	return pricelist

if __name__ == '__main__':
	# Script entry point: load house data from addresslist.txt, build a tree
	# scored with treepredict.variance and draw it to housetree.jpg.
	housedata = getPriceList('addresslist.txt')
	housetree = treepredict.buildtree(housedata, scoreFunction=treepredict.variance)
	treepredict.drawTree(housetree, 'housetree.jpg')
Ejemplo n.º 23
0
        if best_gain > 0:
            trueBranch = buildtree(best_sets[0])
            falseBranch = buildtree(best_sets[1])
            return decisionnode(col_num=best_criteria[0],
                                test=best_criteria[1],
                                tb=trueBranch,
                                fb=falseBranch)
        else:
            return decisionnode(results=uniquecounts(data))


# Split `data` on column 2 with the predicate "value == 2017" and print the
# entropy of each resulting set on its own line.
set0, set1 = dividedata(data, 2, lambda answer: answer == 2017)
print(entropy(set0), entropy(set1), sep='\n')

tree = treepredict.buildtree(treepredict.data)

# Dump col_num / test / results for the root node ...
print(tree.col_num)
print(tree.test)
print(tree.results)
print("")
# ... for its true branch ...
print(tree.tb.col_num)
print(tree.tb.test)
print(tree.tb.results)
print("")
# ... for the true branch of the true branch ...
print(tree.tb.tb.col_num)
print(tree.tb.tb.test)
print(tree.tb.tb.results)
print("")
# ... and for the false branch of the true branch.
print(tree.tb.fb.col_num)
print(tree.tb.fb.test)
Ejemplo n.º 24
0
# Script to demonstrate the CART-like DT classifier from
# Chapter 7 of "Programming Collective Intelligence" by
# T. Segaran, O'Reilly, (c) 2007
#
import treepredict
import fileinput
import Image
import ImageDraw


# If the last parameter is set to 0, then all attributes other than 'age' and 'war' would be used.

# NOTE(review): `fileinput` must be a project-local module here; the standard
# library fileinput has no loadDataset.
train_data, test_data = fileinput.loadDataset(5, ['age','gender','occupation','fantasy','film-noir', 'drama', 'western'], 1)

# Build the tree with instance_minimum=100 and no gain increment/threshold.
tree=treepredict.buildtree(train_data,gain_increment=0,gain_threshold=0,instance_minimum=100)

# Let's see what it looks like...
print "\nFinal tree...\n"
treepredict.printtree(tree)

# Evaluate on the training set and print the confusion matrix, one
# tab-separated row per line.
trainConfMat, crTrain = treepredict.testTree(train_data, tree)
print 'Training set confusion matrix (Classification rate:', crTrain,'):'
for row in trainConfMat:
    print '\t'.join(map(lambda x:str(x), row))

print ''
  

# Evaluate the same tree on the test set.
testConfMat, crTest  = treepredict.testTree(test_data,  tree) 
print 'Test set confusion matrix (Classification rate:', crTest,'):'
 def testBasics(self):
     """Classifying a known observation against the testdata() tree gives {'Basic': 4}."""
     t = treepredict.buildtree(treepredict.testdata())
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(
         treepredict.classify(['(direct)', 'USA', 'yes', 5], t),
         {'Basic': 4})
    if code != '0':
        return None

    # extract the info about this property
    try:
        zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data
        use = doc.getElementsByTagName('useCode')[0].firstChild.data
        year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data
        bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data
        bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data
        rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data
        price = doc.getElementsByTagName('amount')[0].firstChild.data
    except:
        return None

    return zipcode, use, int(year), float(bath), int(bed), int(rooms), price

def getpricelist():
    """Return getaddressdata results for every line of 'addresslist.txt'.

    Each stripped line is looked up with the city 'Cambridge+MA'. Results are
    returned unfiltered, so the list may contain whatever getaddressdata
    returns for a failed lookup.

    Returns:
        A list with one entry per line of the file, in file order.
    """
    # open() inside a with-block replaces the bare file() call, whose handle
    # was never closed.
    with open('addresslist.txt') as address_file:
        return [getaddressdata(line.strip(), 'Cambridge+MA')
                for line in address_file]


if __name__ == "__main__":
    # Script entry point: build a variance-scored tree from the price list
    # and draw it to housetree.jpg.
    housedata = getpricelist()
    # Drop None entries before building the tree; `is not None` is the
    # idiomatic identity test and cannot be affected by a custom __ne__.
    housedata = [data for data in housedata if data is not None]
    housetree = treepredict.buildtree(housedata, treepredict.variance)
    treepredict.drawtree(housetree, 'housetree.jpg')
Ejemplo n.º 27
0
                        sort=False)
dataset = ads_df_final[['Adjective', 'Adverb', 'Noun', 'Verb', 'Sentiment']]

############# Splitting the Dataset into Testing and Training Sets ##############
final_acc = 0.0

for i in range(no_of_trials):
    splitRatio = 0.7
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    #print(trainingSet)
    #    print(type(trainingSet))
    print('Split {0} rows into train = {1} and test = {2} rows'.format(
        len(dataset), len(trainingSet), len(testSet)))

    ############# Model Building ##############
    b = dt.buildtree(trainingSet)
    dt.drawtree(b, jpeg='treeview.jpg')

    #print("original_testset=",testSet)
    ############# Preparing Testing DataSet ##############
    testlabels = []
    for i in range(len(testSet)):
        label = testSet[i].pop(-1)
        testlabels.append(label)

    #print("testSet=",testSet)
    #print("testlabels=",testlabels)
    ############# Classification of Test Records ##############
    number = 0
    for i in range(len(testSet)):
        #print("\ntest_data",testSet[i])
Ejemplo n.º 28
0
            gender = doc2.getElementsByTagName('gender')[0].firstChild.data
            age = doc2.getElementsByTagName('age')[0].firstChild.data
            loc = doc2.getElementsByTagName('location')[0].firstChild.data[0:2]

            # 将州转换成地区
            for r, s in stateregions.items():
                if loc in s: region = r

            if region != None:
                result.append((gender, int(age), region, rating))
        except:
            pass
    return result


# Fetch 500 random ratings and derive the people data rows from them.
l1 = getrandomratings(500)
print len(l1)
pdata = getpeopledata(l1)
print pdata[0]

import treepredict

# Build a variance-scored tree, prune it with threshold 0.5 and draw it.
hottree = treepredict.buildtree(pdata, scoref=treepredict.variance)
treepredict.prune(hottree, 0.5)
treepredict.drawtree(hottree, 'hottree.jpg')

# Classify observations where only the third field (region) is given.
south = treepredict.mdclassify((None, None, 'south'), hottree)
midat = treepredict.mdclassify((None, None, 'Mid Atlantic'), hottree)
# Share of the result stored under key 10 relative to the sum of all values.
# NOTE(review): key 10 is presumably the rating value 10, and under Python 2
# this is integer division if the values are ints -- confirm both.
print south[10] / sum(south.values())
print midat[10] / sum(midat.values())
Ejemplo n.º 29
0
import treepredict
# Fruits with their sizes and colors; the last column is the fruit name.
fruits = [
    [4, 'red', 'apple'],
    [4, 'green', 'apple'],
    [1, 'red', 'cherry'],
    [1, 'green', 'grape'],
    [5, 'red', 'apple'],
]
tree = treepredict.buildtree(fruits)
# Classify three two-field observations; the results are not used.
for _observation in ([2, 'red'], [5, 'red'], [1, 'green']):
    treepredict.classify(_observation, tree)
treepredict.printtree(tree)
#treepredict.drawtree(tree, jpeg='treeview.jpg')
Ejemplo n.º 30
0
orig_date = datetime.date(2014, 2, 1)
for i in range(60):
	timedelta = datetime.timedelta(i)
	cur_date = orig_date + timedelta
	str_date = cur_date.strftime('%y/%m/%d')
	str_date = "20"+str_date
	new_flights = fl_lines.get_flights_by_dpdate(str_date)
	train_flights.update(new_flights)

#train_flights = fl_lines.get_flights_by_dpdate('2014/05/25')
#print train_flights
print '训练数据读入完成,用时', time.clock()

train_data = get_data(train_flights)
#writeintxt('train_data.txt',train_data)
flighttree = treepredict.buildtree(train_data, scoref=treepredict.giniimpurity)
#treepredict.drawtree(flighttree, 'test.jpg')
print '树训练完成,用时%s,数据%s条' %(time.clock(), len(train_data))

test_flights = {}
orig_date = datetime.date(2014, 5, 2)
for i in range(10):
	timedelta = datetime.timedelta(i)
	cur_date = orig_date + timedelta
	str_date = cur_date.strftime('%y/%m/%d')
	str_date = "20"+str_date
	new_flights = fl_lines.get_flights_by_dpdate(str_date)
	test_flights.update(new_flights)

test_data = get_data(test_flights)
#writeintxt('test_data.txt', test_data)
Ejemplo n.º 31
0
for i in xrange(1, 10):
    x = 10**(-i)
    increments.append(x)

print 'Increments to be tested and passed to gain_increments', increments
accuracyTest = testing_gain_increments(increments)
#print accuracyTest
values = accuracyTest.keys()
values.sort(cmp=lambda a, b: cmp(accuracyTest[a], accuracyTest[b]))
print 'Increment value with best classification rate was ', values[-1]

# Let's see what it looks like...
#print "\nFinal tree...\n"
treepredict.printtree(
    treepredict.buildtree(train_data,
                          gain_increment=values[-1],
                          gain_threshold=0,
                          instance_minimum=1))

# Produce a png of the tree
treepredict.drawtree(tree, jpeg="sample_tree.jpg")
#print "\npng of tree generated using PIL (Python Imaging Library) modules.\n"

# Let's classify an incoming record of '(direct), USA, yes, 5' ...
#incoming = ['(direct)','USA','yes',5]
#print "Prediction of new record: ",treepredict.classify(incoming,tree)

# Finally, what does pruning do with say a mingain = 0.9 ?
#print "\nPruned tree...\n"
#treepredict.prune(tree,0.9)
#treepredict.printtree(tree)
Ejemplo n.º 32
0
def do_simpletree_kcross_validation(fin,finy,kfolds):
    print "Starting k=" + str(kfolds)+" validation for Simple tree"
    #there is 2500 tracks
    labels = dt.get_lines(finy,int)
    pb = ProgBar()
    lines = dt.get_lines(fin,float," ", callback = pb.callback)
    del pb
    #normalize features
    
    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)


    block_size = len(lines)/kfolds
    print "chunk size = " + str(block_size)
    example_chunks = list(dt.chunks(data, block_size))
    #labels_chunks = list(dt.chunks(labels, block_size))

   
    print "number of chunks = " +str(len(example_chunks))

    #holds avg accuracy for one forest
    accuracy_results = []

    for i in range(0,len(example_chunks)):

        #we leave set in index i out of train
        print "prepare validation set"
        validationdata = example_chunks[i]

        #extract validation chunk
        print "leaving out block " + str(i) + " for validation"
        leaveout = i
        validationdata = [ exampleentry(validationdata[i][0:len(validationdata[i])-1],validationdata[i][-1]) for i in range(0,len(validationdata)) ]
        
        trainingdata = []

        print("merging blocks "),
        for j in range(0,len(example_chunks)):
            if(j != leaveout):
                #print "j="+str(j) + " i="+ str(leaveout)
                print(str(j) + ","),
                trainingdata = trainingdata + example_chunks[j]

        print "\nprepare training set"

        print "training on " + str(len(trainingdata))
        print "each track has " + str(len(trainingdata[0])) + " features"

        tree = treepredict.buildtree(trainingdata)

        print "testing on " + str(len(validationdata))
        corrects = 0
        #classify a set of entries
        for example in validationdata:
            #print example.features
            result = treepredict.classify(example.features,tree)
            #print 'expected : ' + str(example.label) + ' result : '+ str(result)
            if(result == example.label):
                corrects = corrects + 1
        #calculate the % of accuracy
        accuracy_percentage = (corrects*100)/len(validationdata)
        print "accuracy = " + str(accuracy_percentage) + "%"
        accuracy_results.append(accuracy_percentage)
    avgcc = dt.average(accuracy_results)
    print "average accuracy ="+  str(avgcc) + "%"
def do_simpletree_kcross_validation(fin, finy, kfolds):
    """Run a leave-one-block-out validation of a single decision tree.

    Features are read from ``fin`` and labels from ``finy`` via ``dt``
    helpers, the labelled rows are cut into blocks of
    ``len(lines) / kfolds`` rows, and for every block a tree is trained on
    all other blocks and tested on the held-out one. Per-block and average
    accuracy percentages are printed; nothing is returned.
    """
    print "Starting k=" + str(kfolds) + " validation for Simple tree"
    #there are 2500 tracks
    labels = dt.get_lines(finy, int)
    pb = ProgBar()
    lines = dt.get_lines(fin, float, " ", callback=pb.callback)
    del pb
    #normalize features

    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)

    # NOTE(review): under Python 2 this is integer division when both
    # operands are ints.
    block_size = len(lines) / kfolds
    print "chunk size = " + str(block_size)
    example_chunks = list(dt.chunks(data, block_size))
    #labels_chunks = list(dt.chunks(labels, block_size))

    print "number of chunks = " + str(len(example_chunks))

    #holds the accuracy percentage of every validation round
    accuracy_results = []

    for i in range(0, len(example_chunks)):

        #we leave set in index i out of train
        print "prepare validation set"
        validationdata = example_chunks[i]

        #extract validation chunk
        print "leaving out block " + str(i) + " for validation"
        # Saved because the comprehension below reuses the name `i`
        # (in Python 2 a list comprehension rebinds it in this scope).
        leaveout = i
        # Split each row into features (all but the last item) and label
        # (the last item).
        validationdata = [
            exampleentry(validationdata[i][0:len(validationdata[i]) - 1],
                         validationdata[i][-1])
            for i in range(0, len(validationdata))
        ]

        trainingdata = []

        print("merging blocks "),
        for j in range(0, len(example_chunks)):
            if (j != leaveout):
                #print "j="+str(j) + " i="+ str(leaveout)
                print(str(j) + ","),
                trainingdata = trainingdata + example_chunks[j]

        print "\nprepare training set"

        print "training on " + str(len(trainingdata))
        print "each track has " + str(len(trainingdata[0])) + " features"

        tree = treepredict.buildtree(trainingdata)

        print "testing on " + str(len(validationdata))
        corrects = 0
        #classify a set of entries
        for example in validationdata:
            #print example.features
            result = treepredict.classify(example.features, tree)
            #print 'expected : ' + str(example.label) + ' result : '+ str(result)
            if (result == example.label):
                corrects = corrects + 1
        #calculate the % of accuracy
        accuracy_percentage = (corrects * 100) / len(validationdata)
        print "accuracy = " + str(accuracy_percentage) + "%"
        accuracy_results.append(accuracy_percentage)
    avgcc = dt.average(accuracy_results)
    print "average accuracy =" + str(avgcc) + "%"
Ejemplo n.º 34
0
import treepredict

# main function
# print('<----DivideSet---->')
# for item in treepredict.divideset(treepredict.my_data, 2, 'yes'):
#     print(item)
#
# Build a tree from treepredict.my_data and print it as text.
print('\n<----Build and Display the Tree---->')
tree = treepredict.buildtree(treepredict.my_data)
treepredict.printtree(tree)
#
# print('\n<----Graphical Display---->')
# path = 'output/treeview.jpg'
# treepredict.drawtree(tree, jpeg=path)
# print("picture has been saved in " + path)
#
# print('\n<----Classify and prune---->')
# test = ['(direct)', 'USA', 'yes', 5]
# print(test)
# print(treepredict.classify(test, tree), '\n')
#
# print('Before pune:')
# treepredict.printtree(tree)
# treepredict.prune(tree, 1.0)
# print('\nAfter pune:')
# treepredict.printtree(tree)

# print('<----Zillow API---->')
# import zillow
# # housedata = zillow.getpricelist()
# # print('house data saved!')
# Impurity of the full data set, measured with both score functions.
print "\nParent node...\n"
gini = treepredict.giniimpurity(treepredict.my_data)
entr = treepredict.entropy(treepredict.my_data)
print "Gini: %8f    Entropy: %8f" % (gini, entr)

# Let's now split on the Read FAQ field (column 2, value "yes") and assess
# the impurity of the first resulting set.
node1, node2 = treepredict.divideset(treepredict.my_data, 2, "yes")
print "\nRead FAQ =  Yes leaf node...\n"
gini = treepredict.giniimpurity(node1)
entr = treepredict.entropy(node1)
print "Gini: %8f    Entropy: %8f" % (gini, entr)

# Build the DT recursively using the buildtree function; assumes
# last column/field is the classification attribute.

tree = treepredict.buildtree(treepredict.my_data)

# Let's see what it looks like...
print "\nFinal tree...\n"
treepredict.printtree(tree)

# Produce an image of the tree (the file written is sample_tree.jpg)
treepredict.drawtree(tree, jpeg="sample_tree.jpg")
print "\npng of tree generated using PIL (Python Imaging Library) modules.\n"

# Let's classify an incoming record of '(direct), USA, yes, 5' ...
incoming = ["(direct)", "USA", "yes", 5]
print "Prediction of new record: ", treepredict.classify(incoming, tree)

# Let's see how the missing data classification via
# the "mdclassify" function performs on our sample data.
Ejemplo n.º 36
0
    ["t", "China", "no", 17, "None"],
]

my_data2 = [
    ["a", "USA", "yes", "18", "None"],
    ["b", "France", "yes", "23", "Premium"],
    ["c", "USA", "yes", "24", "Basic"],
    ["d", "France", "yes", "23", "Basic"],
]


train_flowers = data.read_filedata("..//data//train_data.txt", "ALL", ",", [0, 1, 2, 3])
test_flowers = data.read_filedata("..//data//test_data.txt", "ALL", ",", [0, 1, 2, 3])

tree = DecisionTree(train_flowers)
treepredict.buildtree(tree)
tree.printTree()

right = 0
wrong = 0
for flower in test_flowers:
    result = treepredict.predic(tree, flower)
    if flower[-1] in result:
        if right == 49:
            pass
        right += 1
    else:
        wrong += 1

print "正确预测:" + str(right) + "个"
print "错误预测:" + str(wrong) + "个"
Ejemplo n.º 37
0
import treepredict as tp

if __name__ == '__main__':
    # Script entry point: split the iris data into train/test sets, print
    # both, build and draw a tree from the training set, then run
    # tp.fperformance on the full data.

    print("ESERCIZIO SU IRIS DATASET\n")

    mydata = tp.aprifile("iris.txt")

    train_data, test_data = tp.createdataset2(mydata, 60, [])

    print("TRAIN DATA : \n")

    print(train_data, "\n")

    print("TEST DATA: \n")

    # Print the test split; the original printed train_data a second time.
    print(test_data)

    iris_tree = tp.buildtree(train_data)

    tp.drawtree(iris_tree, "iris_tree.jpeg")

    tp.fperformance(mydata)
Ejemplo n.º 38
0
import advancedclassify as ad
import treepredict as tr

agesonly = ad.loadmatch('agesonly.csv', allnum=True)
matchmaker = ad.loadmatch('matchmaker.csv')

# ad.plotagematches(agesonly)

age = []
for line in file('agesonly.csv'):
    l = []
    for w in line.split(','):
        l.append(int(w))
    age.append(l)
tree = tr.buildtree(age)
tr.printtree(tree)
tr.drawtree(tree)

print tr.classify(tree, [65, 63])

avgs = ad.lineartrain(agesonly)
print avgs

print ad.dpclassify([30, 25], avgs.values())
print ad.dpclassify([25, 40], avgs.values())
print ad.dpclassify([48, 20], avgs.values())

print tr.classify(tree, [30, 25])
print tr.classify(tree, [25, 40])
print tr.classify(tree, [48, 20])
Ejemplo n.º 39
0
def tree_view():
    """Build a tree from 'data/agesonly.csv' and draw it to 'treeview.png'.

    Every line of the CSV file becomes one row of floats.
    """
    from treepredict import buildtree, drawtree
    # The with-block closes the file (the original never closed it). Rows are
    # real lists, so they stay indexable on Python 3, where map() is lazy.
    with open('data/agesonly.csv') as csv_file:
        my_data = [[float(value) for value in line.split(',')]
                   for line in csv_file]
    tree = buildtree(my_data)
    drawtree(tree, 'treeview.png')
Ejemplo n.º 40
0
import treepredict
import preprocessor
import postprocessor
import arff
import copy

label_count = 6
train_data_file = '.\\scene\\scene-train-tiny.arff'
test_data_file = '.\\scene\\scene-test-tiny.arff'
method = input('1 单标签;2 多个二类分类')
if method == '1':
    #读取训练集,建树(多标签转换成单标签)
    (attributes_list, label_value_list,train_data) = preprocessor.read_data(train_data_file, label_count, arff.DENSE)
    train_data = preprocessor.translate_label_multiclass(train_data, label_count)
    tree = treepredict.buildtree(train_data, attributes_list, label_value_list)
    treepredict.printtree(tree)

    #读取测试集,验证效果
    (test_attributes_list, test_label_value_list, test_data) = preprocessor.read_data(test_data_file, label_count, arff.DENSE)
    test_data_copy = copy.deepcopy(test_data)
    predicted_labels_list = []
    for row in test_data:
        result = treepredict.classify(row, tree, test_attributes_list)
        post_result = treepredict.post_classify(result)
        decoded_result = preprocessor.label_decoding(post_result)
        predicted_labels_list.append(decoded_result)

    hamming_loss = postprocessor.hamming_loss(test_data_copy, predicted_labels_list)
    print('hamming loss of merging labels:', hamming_loss)
else :
    #当做多个二类分类问题处理
Ejemplo n.º 41
0
# Script to demonstrate the CART-like DT classifier from
# Chapter 7 of "Programming Collective Intelligence" by
# T. Segaran, O'Reilly, (c) 2007
#
import treepredict
import fileinput
import Image
import ImageDraw


# If the last parameter is set to 0, then all attributes other than 'age' and 'war' would be used.

# NOTE(review): `fileinput` must be a project-local module here; the standard
# library fileinput has no loadDataset.
train_data, test_data = fileinput.loadDataset(2, ['age', 'gender','occupation','unknown genre','film-noir', 'horror', 'western'], 1)

# Build the tree with instance_minimum=0 and no gain increment/threshold.
tree=treepredict.buildtree(train_data,gain_increment=0,gain_threshold=0,instance_minimum=0)

# Let's see what it looks like...
print "\nFinal tree...\n"
treepredict.printtree(tree)

# Evaluate on the training set and print the confusion matrix, one
# tab-separated row per line.
trainConfMat, crTrain = treepredict.testTree(train_data, tree)
print 'Training set confusion matrix (Classification rate:', crTrain,'):'
for row in trainConfMat:
    print '\t'.join(map(lambda x:str(x), row))

print ''
  

# Evaluate the same tree on the test set.
testConfMat, crTest  = treepredict.testTree(test_data,  tree) 
print 'Test set confusion matrix (Classification rate:', crTest,'):'
Ejemplo n.º 42
0
    # 状态码为0代表操作成功, 否则代表有错误发生
    if code != '0': return None

    # 提取有关该房产的信息
    try:
        zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data
        use = doc.getElementsByTagName('useCode')[0].firstChild.data
        year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data
        bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data
        bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data
        rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data
        price = doc.getElementsByTagName('amount')[0].firstChild.data
    except:
        return None

    return (zipcode, use, int(year), float(bath), int(bed), int(rooms), price)


def getpricelist():
    """Read '../data/addresslist.txt' and build a list of address data.

    Each stripped line is looked up with getaddressdata and the city
    'Cambridge, MA'. Results are returned unfiltered, so the list may contain
    whatever getaddressdata returns for a failed lookup.

    Returns:
        A list with one entry per line of the file, in file order.
    """
    # open() inside a with-block replaces the bare file() call, whose handle
    # was never closed.
    with open('../data/addresslist.txt') as address_file:
        return [getaddressdata(line.strip(), 'Cambridge, MA')
                for line in address_file]

import treepredict
# getpricelist() appends every lookup result unfiltered, and the lookup code
# above returns None on failure; drop those entries so buildtree only
# receives real rows.
housedata = [row for row in getpricelist() if row is not None]
housetree = treepredict.buildtree(housedata, scoref=treepredict.variance)
treepredict.drawtree(housetree, 'housetree.jpg')
Ejemplo n.º 43
0
import treepredict as tp

if __name__ == '__main__':
    # Script entry point: split the mushrooms data into train/test sets,
    # print both, build and draw a tree from the training set, then run
    # tp.fperformance on the full data.

    print("ESERCIZIO SU MUSHROOMS DATASET\n")

    print("ALL DATASET:\n")
    mydata = tp.aprifile("mushrooms_final.txt")

    train_data, test_data = tp.createdataset2(mydata, 3250, [])

    print("TRAIN DATA : \n")

    print(train_data, "\n")

    print("TEST DATA: \n")

    # Print the test split; the original printed train_data a second time.
    print(test_data)

    mushrooms_tree = tp.buildtree(train_data)

    tp.drawtree(mushrooms_tree, "mushrooms_tree.jpeg")

    tp.fperformance(mydata)