def Modeling_Home_Prices():
    """Build and draw a decision tree that models home prices.

    Rows are read from the tab-separated cache file ``housedata.txt`` when it
    exists.  Otherwise they are fetched with ``zillow.getpricelist()`` and
    written to that file.  The tree is built with ``treepredict.variance`` as
    the score function and drawn to ``housetree.jpg``.
    """
    print('>>Modeling Home Prices')
    import zillow
    if os.path.exists('housedata.txt'):
        housedata = []
        # The context manager closes the cache file even if parsing fails.
        with open('housedata.txt', 'r') as f:
            for line in f:
                fields = line.split('\t')
                # Skip blank or truncated lines instead of raising IndexError.
                if len(fields) < 7:
                    continue
                # NOTE(review): cached values come back as strings, while
                # freshly fetched rows may hold numbers - confirm buildtree
                # treats both the same way.
                housedata.append(fields[:7])
    else:
        # Drop failed lookups (None) up front so they are neither written to
        # the cache nor handed to buildtree.
        housedata = [row for row in zillow.getpricelist() if row is not None]
        # The original wrote ``f.close`` without calling it, so the file was
        # never explicitly closed; ``with`` closes it reliably.
        with open('housedata.txt', 'w') as f:
            for row in housedata:
                print(row)
                for value in row:
                    f.write('%s\t' % (value,))
                f.write('\n')
    reload(treepredict)
    housetree = treepredict.buildtree(housedata, scoref=treepredict.variance)
    treepredict.drawtree(housetree, 'housetree.jpg')
def main():
    """Build a tree from ``house_data`` using entropy and draw it as a JPEG."""
    from treepredict import buildtree, drawtree, entropy

    # NOTE(review): ``house_data`` is not assigned in this function (the
    # ``getpricelist()`` call that produced it was commented out), so it must
    # exist as a module-level name - confirm.
    print('build tree')
    price_tree = buildtree(house_data, scoref=entropy)

    print('draw tree')
    drawtree(price_tree, 'house_price_tree.jpeg')
def main(rows):
    """Build a decision tree from ``rows``, print it, and draw it.

    ``rows`` is passed straight to ``treepredict.buildtree``; the original
    comment describes the rows as fruits with their colors and size.  The
    image is written to ``treeview.jpg``.
    """
    decision_tree = treepredict.buildtree(rows)

    # Example classifications, kept disabled as in the original:
    # print(treepredict.classify([2, 'red'], decision_tree))
    # print(treepredict.classify([5, 'red'], decision_tree))
    # print(treepredict.classify([1, 'green'], decision_tree))

    # Show the decision tree as text, then as an image.
    treepredict.printtree(decision_tree)
    treepredict.drawtree(decision_tree, jpeg='treeview.jpg')
if code != '0': return None # extract the info about this property try: zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data use = doc.getElementsByTagName('useCode')[0].firstChild.data year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data price = doc.getElementsByTagName('amount')[0].firstChild.data except: return None return zipcode, use, int(year), float(bath), int(bed), int(rooms), price def getpricelist(): l1 = [] for line in file('addresslist.txt'): data = getaddressdata(line.strip(), 'Cambridge+MA') l1.append(data) return l1 if __name__ == "__main__": housedata = getpricelist() housedata = [data for data in housedata if data != None] housetree = treepredict.buildtree(housedata, treepredict.variance) treepredict.drawtree(housetree, 'housetree.jpg')
treepredict.printtree(tree)

# Score the tree on the training data and print its confusion matrix,
# one tab-separated row per line.
trainConfMat, crTrain = treepredict.testTree(train_data, tree)
print 'Training set confusion matrix (Classification rate:', crTrain,'):'
for row in trainConfMat:
    print '\t'.join(map(lambda x:str(x), row))
print ''

# Same report for the test data.
testConfMat, crTest = treepredict.testTree(test_data, tree)
print 'Test set confusion matrix (Classification rate:', crTest,'):'
for row in testConfMat:
    print '\t'.join(map(lambda x:str(x), row))
print ''

# Produce an image of the tree (written to sample_tree.jpg)
print '\nPrinting tree image...'
treepredict.drawtree(tree,jpeg="sample_tree.jpg")

# For group homework, modify the "buildtree" function so that it stops
# when a threshold value on entropy is no longer satisfied. It should
# accept a minimum gain parameter and stop dividing the branch if
# this condition is not met. Pruning the tree will not be used in
# this case.
rooms=doc.getElementsByTagName('totalRooms')[0].firstChild.data price=doc.getElementsByTagName('amount')[0].firstChild.data except: return None print zipcode,use,year,bath,bed,rooms,price return (zipcode,use,int(year),float(bath),int(bed),int(rooms),price) def getpricelist(): ll=[] for line in file('addresslist.txt'): data=getaddressdata(line.strip(),'Cambrige,MA') #这里需要加个判断语句,因为list会蛋疼地将None加入,使求方差时出错 if data!=None: ll.append(data) return ll # # # # #全局测试代码一 if 1: housedata=getpricelist() import treepredict housetree=treepredict.buildtree(housedata,scoref=treepredict.variance) treepredict.drawtree(housetree)
def tree_view():
    """Build a tree from ``data/agesonly.csv`` and draw it to ``treeview.png``.

    Every line of the CSV file is split on commas and each field is converted
    to ``float``; the resulting rows are passed to ``buildtree``.
    """
    from treepredict import buildtree, drawtree

    # ``with`` closes the file; the original opened it inside the
    # comprehension and never closed it.
    with open('data/agesonly.csv') as csv_file:
        # Build real lists: ``map`` would give lazy iterators on Python 3.
        my_data = [[float(value) for value in line.split(',')]
                   for line in csv_file]

    tree = buildtree(my_data)
    drawtree(tree, 'treeview.png')
def decision_tree1(self, evt):
    """Build and draw a decision tree over PEK->PVG flight prices.

    For every flight in ``self.fl_lines[('PEK','PVG')].set_of_flights`` a row
    ``(weekday, time, avg_price, result_price)`` is built and the rows are
    passed to ``treepredict.buildtree`` with ``giniimpurity`` as the score
    function.  ``evt`` is not used in the body.
    """
    import treepredict
    reload(treepredict)
    # Prices are scaled against this full fare into integer tenths.
    full_price = 1130
    flights = self.fl_lines[('PEK','PVG')].set_of_flights
    data = []
    for deptid in flights.keys():
        flight = flights[deptid]
        ftype = flight['ftype']  # read but not used below
        deptdate = flight['date']
        deptdate = deptdate.split('/')
        # Get the day of the week
        weekday = datetime.datetime(int(deptdate[0]),int(deptdate[1]),int(deptdate[2])).weekday()
        weekday = int(weekday)
        #print weekday
        time = flight['time']
        # If the departure time is very early or very late use 1, otherwise 0
        if int(time[0:2]) < 9 or int(time[0:2]) > 20:
            time = 1
        else:
            time = 0
        # Process the prices
        dd = flight['date']
        # str-->date
        dd = dd.split('/')
        deptdate = datetime.date(int(dd[0]), int(dd[1]), int(dd[2]))
        price = flight['price']
        points = []
        # One (days before departure, price) point per fetch date.
        for ftdate in price.keys():
            ff = ftdate.split('/')
            fetchdate = datetime.date(int(ff[0]), int(ff[1]), int(ff[2]))
            days = (deptdate-fetchdate).days
            points.append((days, price[ftdate]))
        # NOTE(review): self.pre is defined elsewhere; the loop below treats
        # its result as a mapping keyed by day count - confirm.
        points = self.pre(points)
        p = []
        # Keep only the prices whose key is 6 or more.
        for i in points.keys():
            if i >= 6:
                p.append(points[i])
        #print p
        # At least two kept prices are needed: one result plus one to average.
        if len(p) <= 1:
            continue
        # NOTE(review): which price ends up first depends on the iteration
        # order of points.keys() - confirm that order is intended.
        result_price = p[0]
        p.pop(0)
        avg_price = sum(p)/len(p)
        result_price = int(float(result_price)/float(full_price)*10)
        avg_price = int(float(avg_price)/float(full_price)*10)
        data.append((weekday, time, avg_price, result_price))
    ## fout = open('task.txt', 'w')
    ## lines = ['%s %s %s %s\n' %v for v in data]
    ## fout.writelines(lines)
    ## fout.close()
    # NOTE(review): the output name says "entropy" but the score function
    # used is giniimpurity - confirm which is intended.
    flighttree = treepredict.buildtree(data, scoref = treepredict.giniimpurity)
    treepredict.drawtree(flighttree,'flighttree_entropy.jpg')
dataset = ads_df_final[['Adjective', 'Adverb', 'Noun', 'Verb', 'Sentiment']] ############# Splitting the Dataset into Testing and Training Sets ############## final_acc = 0.0 for i in range(no_of_trials): splitRatio = 0.7 trainingSet, testSet = splitDataset(dataset, splitRatio) #print(trainingSet) # print(type(trainingSet)) print('Split {0} rows into train = {1} and test = {2} rows'.format( len(dataset), len(trainingSet), len(testSet))) ############# Model Building ############## b = dt.buildtree(trainingSet) dt.drawtree(b, jpeg='treeview.jpg') #print("original_testset=",testSet) ############# Preparing Testing DataSet ############## testlabels = [] for i in range(len(testSet)): label = testSet[i].pop(-1) testlabels.append(label) #print("testSet=",testSet) #print("testlabels=",testlabels) ############# Classification of Test Records ############## number = 0 for i in range(len(testSet)): #print("\ntest_data",testSet[i]) a = dt.classify(testSet[i], b)
gain_threshold=0, instance_minimum=1)

# Score the tree on the training data and print its confusion matrix,
# one tab-separated row per line.
trainConfMat, crTrain = treepredict.testTree(train_data, tree)
print 'Training set confusion matrix (Classification rate:', crTrain, '):'
for row in trainConfMat:
    print '\t'.join(map(lambda x: str(x), row))
print ''

# Same report for the test data.
testConfMat, crTest = treepredict.testTree(test_data, tree)
print 'Test set confusion matrix (Classification rate:', crTest, '):'
for row in testConfMat:
    print '\t'.join(map(lambda x: str(x), row))
print ''

# Let's see what it looks like...
#print "\nFinal tree...\n"
#treepredict.printtree(tree)

# Produce an image of the tree (written to sample_tree.jpg)
print '\nPrinting tree image...'
treepredict.drawtree(tree, jpeg="sample_tree.jpg")

# For group homework, modify the "buildtree" function so that it stops
# when a threshold value on entropy is no longer satisfied. It should
# accept a minimum gain parameter and stop dividing the branch if
# this condition is not met. Pruning the tree will not be used in
# this case.
url += "&method=MeetMe.getProfile&emid=%s&get_keywords=true" % emid # 得到所有关于此人的详细信息 try: rating = int(float(rating) + 0.5) doc2 = xml.dom.minidom.parseString( urllib.request.urlopen(url).read()) gender = doc2.getElementsByTagName('gender')[0].firstChild.data age = doc2.getElementsByTagName('age')[0].firstChild.data loc = doc2.getElementsByTagName('location')[0].firstChild.data[0:2] # 将州转换为地区 for r, s in stateregions.items(): if loc in s: region = r if region != None: result.append((gender, int(age), region, rating)) except: pass return result if __name__ == '__main__': #只有在执行当前模块时才会运行此函数 l1 = getrandomratings(500) print(len(l1)) pdata = getpeopledata(l1) print(pdata) tree = treepredict.buildtree(pdata, scoref=treepredict.variance) #创建决策树 treepredict.prune(tree, 0.5) #剪支 treepredict.drawtree(tree, 'hot.jpg')
def Graphical_Display():
    """Print the section banner and draw the module-level ``tree``.

    The image is written to ``treeview.jpg``.
    """
    banner = '>>Graphical Display'
    print(banner)
    treepredict.drawtree(tree, jpeg='treeview.jpg')
try: zipcode=doc.getElementsByTagName('zipcode')[0].firstChild.data use=doc.getElementsByTagName('useCode')[0].firstChild.data year=doc.getElementsByTagName('yearBuilt')[0].firstChild.data sqft=doc.getElementsByTagName('finishedSqFt')[0].firstChild.data bath=doc.getElementsByTagName('bathrooms')[0].firstChild.data bed=doc.getElementsByTagName('bedrooms')[0].firstChild.data rooms=1 #doc.getElementsByTagName('totalRooms')[0].firstChild.data price=doc.getElementsByTagName('amount')[0].firstChild.data except: return None return (zipcode,use,int(year),float(bath),int(bed),int(rooms),price) # 读取文件构造数据集 def getpricelist(): l1=[] for line in open('addresslist.txt'): data=getaddressdata(line.strip(),'Cambridge,MA') print(data) l1.append(data) return l1 if __name__=='__main__': #只有在执行当前模块时才会运行此函数 housedata = getpricelist() print(housedata) tree = treepredict.buildtree(housedata,scoref=treepredict.variance) #创建决策树 treepredict.drawtree(tree,'house.jpg')
code = doc.getElementsByTagName('code')[0].firstChild.data if code != '0': return None try: zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data use = doc.getElementsByTagName('useCode')[0].firstChild.data year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data price = doc.getElementsByTagName('amount')[0].firstChild.data except: return None return (zipcode, use, int(year), float(bath), int(bed), int(rooms), price) def getpricelist(): l1 = [] for line in file('addresslist.txt'): data = getaddressdata(line.strip(), 'Cambridge,MA') if data is not None: l1.append(data) return l1 if __name__ == "__main__": housedata = getpricelist() housetree = treepredict.buildtree(housedata, scoref=treepredict.variance) treepredict.drawtree(housetree, "housetree.jpg")
# NOTE(review): the statements up to ``return result`` are the tail of a
# function (starting inside its try block) whose header lies outside this
# view; their nesting is inferred.
            gender = doc2.getElementsByTagName('gender')[0].firstChild.data
            age = doc2.getElementsByTagName('age')[0].firstChild.data
            loc = doc2.getElementsByTagName('location')[0].firstChild.data[0:2]
            # Convert the state to a region
            for r, s in stateregions.items():
                if loc in s:
                    region = r
            if region != None:
                result.append((gender, int(age), region, rating))
        except:
            # Any failure for this entry is silently ignored.
            pass
    return result


# Fetch ratings, attach profile data, then build, prune and draw the tree.
l1 = getrandomratings(500)
print len(l1)
pdata = getpeopledata(l1)
print pdata[0]
import treepredict
hottree = treepredict.buildtree(pdata, scoref=treepredict.variance)
treepredict.prune(hottree, 0.5)
treepredict.drawtree(hottree, 'hottree.jpg')

# Classify with only the region column supplied, once per region.
# NOTE(review): 'south' is lower-case while 'Mid Atlantic' is capitalised;
# confirm both match the keys of ``stateregions``.
south = treepredict.mdclassify((None, None, 'south'), hottree)
midat = treepredict.mdclassify((None, None, 'Mid Atlantic'), hottree)
# Print the value stored under key 10 divided by the sum of all values.
print south[10] / sum(south.values())
print midat[10] / sum(midat.values())
price = doc.getElementsByTagName('amount')[0].firstChild.data except: return None print zipcode, use, year, bath, bed, rooms, price return (zipcode, use, int(year), float(bath), int(bed), int(rooms), price) def getpricelist(): ll = [] for line in file('addresslist.txt'): data = getaddressdata(line.strip(), 'Cambrige,MA') #这里需要加个判断语句,因为list会蛋疼地将None加入,使求方差时出错 if data != None: ll.append(data) return ll # # # # #全局测试代码一 if 1: housedata = getpricelist() import treepredict housetree = treepredict.buildtree(housedata, scoref=treepredict.variance) treepredict.drawtree(housetree)
import treepredict
import zillow

# Earlier experiments on treepredict.my_data, kept disabled:
#tree = treepredict.buildtree(treepredict.my_data)
#treepredict.printtree(tree)
#treepredict.drawtree(tree, jpeg = 'treeview.jpg')
#print treepredict.classify(['(direct)', 'USA', 'yes', 5], tree)
#treepredict.prune(tree, 1)
#treepredict.printtree(tree)
#print treepredict.mdclassify(['google', 'None', 'yes', None], tree)

# Fetch the price rows, build a tree scored by variance, and draw it to
# housetree.jpg.
housedata = zillow.getpricelist()
housetree = treepredict.buildtree(
    housedata,
    scoref=treepredict.variance,
)
treepredict.drawtree(housetree, jpeg='housetree.jpg')
import treepredict as tp

if __name__ == '__main__':
    # Exercise on the iris data set: load the file, split it, print both
    # splits, build and draw a tree from the training split, then run
    # tp.fperformance on the full data set.
    print("ESERCIZIO SU IRIS DATASET\n")
    mydata = tp.aprifile("iris.txt")
    train_data, test_data = tp.createdataset2(mydata, 60, [])
    print("TRAIN DATA : \n")
    print(train_data, "\n")
    print("TEST DATA: \n")
    # The original printed train_data again under the "TEST DATA" heading.
    print(test_data)
    iris_tree = tp.buildtree(train_data)
    tp.drawtree(iris_tree, "iris_tree.jpeg")
    tp.fperformance(mydata)
import treepredict as tr

agesonly = ad.loadmatch('agesonly.csv', allnum=True)
matchmaker = ad.loadmatch('matchmaker.csv')
# ad.plotagematches(agesonly)

# Parse agesonly.csv into rows of ints for the decision tree.
# ``with`` closes the file; the original ``file(...)`` handle was never closed.
with open('agesonly.csv') as ages_file:
    age = [[int(w) for w in line.split(',')] for line in ages_file]

tree = tr.buildtree(age)
tr.printtree(tree)
tr.drawtree(tree)
# classify is called as (observation, tree) everywhere else it appears, so
# the swapped arguments of the original calls are corrected here.
# NOTE(review): confirm against the signature of treepredict.classify.
print(tr.classify([65, 63], tree))

avgs = ad.lineartrain(agesonly)
print(avgs)
print(ad.dpclassify([30, 25], avgs.values()))
print(ad.dpclassify([25, 40], avgs.values()))
print(ad.dpclassify([48, 20], avgs.values()))

# The same three points through the decision tree, for comparison.
print(tr.classify([30, 25], tree))
print(tr.classify([25, 40], tree))
print(tr.classify([48, 20], tree))

numericalset = ad.loadnumerical()
# 状态码为0代表操作成功, 否则代表有错误发生 if code != '0': return None # 提取有关该房产的信息 try: zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data use = doc.getElementsByTagName('useCode')[0].firstChild.data year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data price = doc.getElementsByTagName('amount')[0].firstChild.data except: return None return (zipcode, use, int(year), float(bath), int(bed), int(rooms), price) '''读取addresslist.txt文件并构造一个数据列表''' def getpricelist(): l1 = [] for line in file('../data/addresslist.txt'): data = getaddressdata(line.strip(), 'Cambridge, MA') l1.append(data) return l1 import treepredict housedata = getpricelist() housetree = treepredict.buildtree(housedata, scoref=treepredict.variance) treepredict.drawtree(housetree, 'housetree.jpg')
import treepredict as tp

if __name__ == '__main__':
    # Exercise on the mushrooms data set: load the file, split it, print both
    # splits, build and draw a tree from the training split, then run
    # tp.fperformance on the full data set.
    print("ESERCIZIO SU MUSHROOMS DATASET\n")
    print("ALL DATASET:\n")
    mydata = tp.aprifile("mushrooms_final.txt")
    train_data, test_data = tp.createdataset2(mydata, 3250, [])
    print("TRAIN DATA : \n")
    print(train_data, "\n")
    print("TEST DATA: \n")
    # The original printed train_data again under the "TEST DATA" heading.
    print(test_data)
    mushrooms_tree = tp.buildtree(train_data)
    tp.drawtree(mushrooms_tree, "mushrooms_tree.jpeg")
    tp.fperformance(mydata)