コード例 #1
0
ファイル: run.py プロジェクト: wz125/courses
def Modeling_Home_Prices():
  """Build and draw a variance-scored decision tree of home prices.

  Rows come from the tab-separated cache file ``housedata.txt`` when it
  exists.  Otherwise they are fetched with ``zillow.getpricelist()`` and
  written to that file for the next run.  The tree is built with
  ``treepredict.variance`` as the score function and drawn to
  ``housetree.jpg``.
  """
  print('>>Modeling Home Prices')
  import zillow
  cache_path = 'housedata.txt'
  if os.path.exists(cache_path):
    housedata = []
    with open(cache_path, 'r') as f:
      for line in f:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 7:
          # Blank or truncated line: not a complete record.
          continue
        # Everything read back from the cache is a string.  Restore the
        # numeric columns so a cached run sees the same value types as a
        # fresh one.
        # NOTE(review): assumes the column order zipcode, use, year, bath,
        # bed, rooms, price -- confirm against zillow.getaddressdata.
        housedata.append([fields[0], fields[1], int(fields[2]),
                          float(fields[3]), int(fields[4]), int(fields[5]),
                          fields[6]])
  else:
    # Drop None entries up front so they reach neither the cache file nor
    # buildtree (the original skipped them only while writing).
    housedata = [row for row in zillow.getpricelist() if row is not None]
    # The original ended with a bare ``f.close`` (no call), so the file was
    # never closed explicitly; ``with`` guarantees flush and close.
    with open(cache_path, 'w') as f:
      for row in housedata:
        print(row)
        for value in row:
          f.write('%s\t' % (value,))
        f.write('\n')
  reload(treepredict)
  housetree = treepredict.buildtree(housedata, scoref=treepredict.variance)
  treepredict.drawtree(housetree, 'housetree.jpg')
コード例 #2
0
def main():
    """Build an entropy-scored decision tree from ``house_data`` and draw it.

    The tree is written to ``house_price_tree.jpeg``.

    NOTE(review): ``house_data`` is not assigned inside this function (the
    ``getpricelist()`` call that would produce it is disabled) -- presumably
    it is a module-level name; confirm.
    """
    from treepredict import buildtree, entropy, drawtree
    # Disabled: house_data = getpricelist(); print house_data

    print('build tree')
    price_tree = buildtree(house_data, scoref=entropy)

    print('draw tree')
    drawtree(price_tree, 'house_price_tree.jpeg')
コード例 #3
0
def main(rows):
    """Build a decision tree from ``rows``, print it and save it as an image.

    ``rows`` describes fruits with their colors and size.  The tree is
    printed with ``treepredict.printtree`` and drawn to ``treeview.jpg``.
    """
    decision_tree = treepredict.buildtree(rows)

    # Example queries, currently disabled:
    #   treepredict.classify([2, 'red'], decision_tree)
    #   treepredict.classify([5, 'red'], decision_tree)
    #   treepredict.classify([1, 'green'], decision_tree)

    # Show the decision tree as text, then as a picture.
    treepredict.printtree(decision_tree)
    treepredict.drawtree(decision_tree, jpeg='treeview.jpg')
コード例 #4
0
    if code != '0':
        return None

    # extract the info about this property
    try:
        zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data
        use = doc.getElementsByTagName('useCode')[0].firstChild.data
        year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data
        bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data
        bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data
        rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data
        price = doc.getElementsByTagName('amount')[0].firstChild.data
    except:
        return None

    return zipcode, use, int(year), float(bath), int(bed), int(rooms), price

def getpricelist():
    """Look up every address listed in ``addresslist.txt``.

    Each line of the file is stripped and passed to ``getaddressdata``
    together with the city/state string ``'Cambridge+MA'``.

    Returns:
        A list with one entry per line, in file order.  Entries are exactly
        what ``getaddressdata`` returned, so the list may contain ``None``.
    """
    # ``with`` closes the file even if a lookup raises; the original left
    # the handle to be released by garbage collection.
    with open('addresslist.txt') as address_file:
        return [getaddressdata(line.strip(), 'Cambridge+MA')
                for line in address_file]


if __name__ == "__main__":
    # Script entry point: drop the None entries, then fit a variance-scored
    # tree and draw it to housetree.jpg.
    housedata = [row for row in getpricelist() if row is not None]
    housetree = treepredict.buildtree(housedata, treepredict.variance)
    treepredict.drawtree(housetree, 'housetree.jpg')
コード例 #5
0
# Dump the fitted tree as text.
treepredict.printtree(tree)

# Training-set report: testTree returns a confusion matrix and a
# classification rate.
trainConfMat, crTrain = treepredict.testTree(train_data, tree)
print('Training set confusion matrix (Classification rate: %s ):' % (crTrain,))
for row in trainConfMat:
    print('\t'.join(map(str, row)))

print('')

# Same report for the test set.
testConfMat, crTest = treepredict.testTree(test_data, tree)
print('Test set confusion matrix (Classification rate: %s ):' % (crTest,))
for row in testConfMat:
    print('\t'.join(map(str, row)))

print('')

# Render the tree to an image file.
print('\nPrinting tree image...')
treepredict.drawtree(tree, jpeg="sample_tree.jpg")


# For group homework, modify "buildtree" function so that it stops
# when a threshold value on entropy is no longer satisfied. It should
# accept a minimum gain parameter and stop dividing the branch if
# this condition is not met.  Pruning the tree will not be used in
# this case.
コード例 #6
0
ファイル: zillow.py プロジェクト: Windriver/codelab
        rooms=doc.getElementsByTagName('totalRooms')[0].firstChild.data
        price=doc.getElementsByTagName('amount')[0].firstChild.data
    except:
        return None
    print zipcode,use,year,bath,bed,rooms,price

    return (zipcode,use,int(year),float(bath),int(bed),int(rooms),price)

def getpricelist():
    """Return the property records for every address in ``addresslist.txt``.

    Each stripped line is passed to ``getaddressdata`` together with the
    city/state string.  Lookups that return ``None`` are left out, because a
    ``None`` entry in the list makes the later variance computation fail.

    Returns:
        A list of the non-``None`` results, in file order.
    """
    ll=[]
    # ``with`` closes the file even if a lookup raises.
    with open('addresslist.txt') as address_file:
        for line in address_file:
            # Fixed the misspelled city name ('Cambrige' -> 'Cambridge').
            data=getaddressdata(line.strip(),'Cambridge,MA')
            if data is not None:
                ll.append(data)
    return ll


#
#

#
#

# Module-level test code #1.  The guard is a constant, so this always runs
# when the module is loaded.
if True:
    housedata = getpricelist()
    import treepredict
    housetree = treepredict.buildtree(
        housedata,
        scoref=treepredict.variance,
    )
    treepredict.drawtree(housetree)
コード例 #7
0
ファイル: advclassify.py プロジェクト: mgduarte/homework
def tree_view():
    """Build a decision tree from ``data/agesonly.csv`` and draw it.

    Every non-blank line of the CSV is split on commas and each field is
    converted to ``float``.  The tree is written to ``treeview.png``.
    """
    from treepredict import buildtree, drawtree
    # Close the file deterministically and build real lists instead of
    # ``map`` objects, so the rows stay indexable on Python 3 as well.
    # Blank lines are skipped rather than failing in float('').
    with open('data/agesonly.csv') as csv_file:
        my_data = [[float(field) for field in line.split(',')]
                   for line in csv_file if line.strip()]
    tree = buildtree(my_data)
    drawtree(tree, 'treeview.png')
コード例 #8
0
	def decision_tree1(self, evt):
		"""Build a decision tree from the PEK-PVG flights and draw it.

		For every flight in ``self.fl_lines[('PEK','PVG')].set_of_flights``
		one row ``(weekday, time, avg_price, result_price)`` is collected.
		The rows are passed to ``treepredict.buildtree`` with
		``treepredict.giniimpurity`` as the score function, and the tree is
		drawn to ``flighttree_entropy.jpg``.

		``evt`` is not used in the body.
		"""
		import treepredict
		reload(treepredict)
		# Denominator used below to scale prices to an integer number of
		# tenths of this value.
		full_price = 1130
		flights = self.fl_lines[('PEK','PVG')].set_of_flights

		data = []
		for deptid in flights.keys():
			flight = flights[deptid]
			# NOTE(review): ftype is read but not used again in this method.
			ftype = flight['ftype']
			deptdate = flight['date']
			deptdate = deptdate.split('/')

			# Get the day of the week (datetime.weekday() of the flight date).
			weekday = datetime.datetime(int(deptdate[0]),int(deptdate[1]),int(deptdate[2])).weekday()
			weekday = int(weekday)
			#print weekday
			time = flight['time']

			# If the departure time is very early or very late (the first two
			# characters, read as an int, are < 9 or > 20) use 1, otherwise 0.
			if int(time[0:2]) < 9 or int(time[0:2]) > 20:
				time = 1
			else:
				time = 0

			# Process the prices.
			dd = flight['date']

			# str --> date
			dd = dd.split('/')
			deptdate = datetime.date(int(dd[0]), int(dd[1]), int(dd[2]))
			price = flight['price']

			# One (days between fetch date and departure date, price) pair
			# per key of the flight's price mapping.
			points = []
			for ftdate in price.keys():
				ff = ftdate.split('/')
				fetchdate = datetime.date(int(ff[0]), int(ff[1]), int(ff[2]))
				days = (deptdate-fetchdate).days
				points.append((days, price[ftdate]))

			# NOTE(review): self.pre is defined elsewhere; its result is used
			# as a mapping below (keys() and item lookup) -- presumably keyed
			# by the day count; confirm.
			points = self.pre(points)
			# Keep only the values whose key is >= 6.
			p = []
			for i in points.keys():
				if i >= 6:
					p.append(points[i])

			#print p
			# Need one value for the result plus at least one for the average.
			if len(p) <= 1:
				continue
			# NOTE(review): which value ends up first depends on the iteration
			# order of points.keys().
			result_price = p[0]
			p.pop(0)
			# NOTE(review): under Python 2 this is integer division when the
			# values are ints.
			avg_price = sum(p)/len(p)

			# Scale both prices to an int: price / full_price * 10, truncated.
			result_price = int(float(result_price)/float(full_price)*10)
			avg_price = int(float(avg_price)/float(full_price)*10)

			data.append((weekday, time, avg_price, result_price))
##		fout = open('task.txt', 'w')
##		lines = ['%s %s %s %s\n' %v for v in data]
##		fout.writelines(lines)
##		fout.close()
		# NOTE(review): the output file name says "entropy" but the score
		# function passed here is giniimpurity.
		flighttree = treepredict.buildtree(data, scoref = treepredict.giniimpurity)
		treepredict.drawtree(flighttree,'flighttree_entropy.jpg')
コード例 #9
0
# Keep the four part-of-speech columns plus the 'Sentiment' column.
dataset = ads_df_final[['Adjective', 'Adverb', 'Noun', 'Verb', 'Sentiment']]

############# Splitting the Dataset into Testing and Training Sets ##############
final_acc = 0.0

# One split / build / classify pass per trial.
# NOTE(review): the inner loops below reuse the name ``i``.  The outer ``for``
# rebinds it from its own range on every pass, so the trial count is not
# affected, but ``i`` no longer holds the trial index after the inner loops.
for i in range(no_of_trials):
    splitRatio = 0.7
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    #print(trainingSet)
    #    print(type(trainingSet))
    print('Split {0} rows into train = {1} and test = {2} rows'.format(
        len(dataset), len(trainingSet), len(testSet)))

    ############# Model Building ##############
    # Build the tree from the training rows and draw it to treeview.jpg
    # (the image is rewritten on every trial).
    b = dt.buildtree(trainingSet)
    dt.drawtree(b, jpeg='treeview.jpg')

    #print("original_testset=",testSet)
    ############# Preparing Testing DataSet ##############
    # Remove the last element of every test row in place and keep it as that
    # row's label.
    testlabels = []
    for i in range(len(testSet)):
        label = testSet[i].pop(-1)
        testlabels.append(label)

    #print("testSet=",testSet)
    #print("testlabels=",testlabels)
    ############# Classification of Test Records ##############
    # NOTE(review): ``number`` is presumably a counter updated further down,
    # past the visible part of this loop -- confirm.
    number = 0
    for i in range(len(testSet)):
        #print("\ntest_data",testSet[i])
        # Classify the label-stripped test row with the tree built above.
        a = dt.classify(testSet[i], b)
コード例 #10
0
                             gain_threshold=0,
                             instance_minimum=1)

# Training-set report: testTree returns a confusion matrix and a
# classification rate.
trainConfMat, crTrain = treepredict.testTree(train_data, tree)
print('Training set confusion matrix (Classification rate: %s ):' % (crTrain,))
for row in trainConfMat:
    print('\t'.join(map(str, row)))

print('')

# Same report for the test set.
testConfMat, crTest = treepredict.testTree(test_data, tree)
print('Test set confusion matrix (Classification rate: %s ):' % (crTest,))
for row in testConfMat:
    print('\t'.join(map(str, row)))

print('')

# Text dump of the final tree, currently disabled:
#   print "\nFinal tree...\n"
#   treepredict.printtree(tree)

# Render the tree to an image file.
print('\nPrinting tree image...')
treepredict.drawtree(tree, jpeg="sample_tree.jpg")

# For group homework, modify "buildtree" function so that it stops
# when a threshold value on entropy is no longer satisfied. It should
# accept a minimum gain parameter and stop dividing the branch if
# this condition is not met.  Pruning the tree will not be used in
# this case.
コード例 #11
0
ファイル: hotornot.py プロジェクト: simmbaa/data-mining-1
        url += "&method=MeetMe.getProfile&emid=%s&get_keywords=true" % emid

        # 得到所有关于此人的详细信息
        try:
            rating = int(float(rating) + 0.5)
            doc2 = xml.dom.minidom.parseString(
                urllib.request.urlopen(url).read())
            gender = doc2.getElementsByTagName('gender')[0].firstChild.data
            age = doc2.getElementsByTagName('age')[0].firstChild.data
            loc = doc2.getElementsByTagName('location')[0].firstChild.data[0:2]

            # 将州转换为地区
            for r, s in stateregions.items():
                if loc in s: region = r

            if region != None:
                result.append((gender, int(age), region, rating))
        except:
            pass
    return result


if __name__ == '__main__':
    # Runs only when this module is executed directly.
    ratings = getrandomratings(500)
    print(len(ratings))

    people = getpeopledata(ratings)
    print(people)

    # Build the decision tree, prune it, then draw it.
    hot_tree = treepredict.buildtree(people, scoref=treepredict.variance)
    treepredict.prune(hot_tree, 0.5)
    treepredict.drawtree(hot_tree, 'hot.jpg')
コード例 #12
0
ファイル: run.py プロジェクト: wz125/courses
def Graphical_Display():
  """Print the step banner and draw ``tree`` to ``treeview.jpg``.

  ``tree`` is not defined in this function; it is read from the enclosing
  (module) scope.
  """
  print('>>Graphical Display')
  treepredict.drawtree(tree, jpeg='treeview.jpg')
コード例 #13
0
    try:
        zipcode=doc.getElementsByTagName('zipcode')[0].firstChild.data
        use=doc.getElementsByTagName('useCode')[0].firstChild.data
        year=doc.getElementsByTagName('yearBuilt')[0].firstChild.data
        sqft=doc.getElementsByTagName('finishedSqFt')[0].firstChild.data
        bath=doc.getElementsByTagName('bathrooms')[0].firstChild.data
        bed=doc.getElementsByTagName('bedrooms')[0].firstChild.data
        rooms=1         #doc.getElementsByTagName('totalRooms')[0].firstChild.data
        price=doc.getElementsByTagName('amount')[0].firstChild.data
    except:
        return None

    return (zipcode,use,int(year),float(bath),int(bed),int(rooms),price)

def getpricelist():
    """Read ``addresslist.txt`` and build the data set, one entry per line.

    Each stripped line is passed to ``getaddressdata`` together with
    ``'Cambridge,MA'``; the result is printed and appended.

    Returns:
        A list with one entry per line, in file order.  Entries are exactly
        what ``getaddressdata`` returned, so the list may contain ``None``.
    """
    l1=[]
    # ``with`` closes the file even if a lookup raises; the original never
    # closed the handle explicitly.
    with open('addresslist.txt') as address_file:
        for line in address_file:
            data=getaddressdata(line.strip(),'Cambridge,MA')
            print(data)
            l1.append(data)
    return l1



if __name__=='__main__':
    # Runs only when this module is executed directly.
    housedata = getpricelist()
    print(housedata)
    # getpricelist() keeps the None that getaddressdata returns for a failed
    # lookup.  A None row has no columns to score, so drop those entries
    # before building the tree.
    housedata = [row for row in housedata if row is not None]
    # Build the decision tree and draw it.
    tree = treepredict.buildtree(housedata,scoref=treepredict.variance)
    treepredict.drawtree(tree,'house.jpg')
コード例 #14
0
    code = doc.getElementsByTagName('code')[0].firstChild.data
    if code != '0':
        return None
    try:
        zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data
        use = doc.getElementsByTagName('useCode')[0].firstChild.data
        year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data
        bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data
        bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data
        rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data
        price = doc.getElementsByTagName('amount')[0].firstChild.data
    except:
        return None

    return (zipcode, use, int(year), float(bath), int(bed), int(rooms), price)


def getpricelist():
    """Return the non-``None`` lookups for every line of ``addresslist.txt``.

    Each stripped line is passed to ``getaddressdata`` together with
    ``'Cambridge,MA'``; results equal to ``None`` are left out.
    """
    lookups = (getaddressdata(raw.strip(), 'Cambridge,MA')
               for raw in file('addresslist.txt'))
    return [record for record in lookups if record is not None]


if __name__ == "__main__":
    # Script entry point: fetch the records, fit a variance-scored tree and
    # draw it to housetree.jpg.
    records = getpricelist()
    price_tree = treepredict.buildtree(
        records,
        scoref=treepredict.variance,
    )
    treepredict.drawtree(price_tree, "housetree.jpg")
コード例 #15
0
            gender = doc2.getElementsByTagName('gender')[0].firstChild.data
            age = doc2.getElementsByTagName('age')[0].firstChild.data
            loc = doc2.getElementsByTagName('location')[0].firstChild.data[0:2]

            # 将州转换成地区
            for r, s in stateregions.items():
                if loc in s: region = r

            if region != None:
                result.append((gender, int(age), region, rating))
        except:
            pass
    return result


# Fetch 500 random ratings, then the profile data for those people.
l1 = getrandomratings(500)
print(len(l1))
pdata = getpeopledata(l1)
print(pdata[0])

import treepredict

# Fit a variance-scored tree, prune it and draw it to hottree.jpg.
hottree = treepredict.buildtree(pdata, scoref=treepredict.variance)
treepredict.prune(hottree, 0.5)
treepredict.drawtree(hottree, 'hottree.jpg')

# Classify two observations in which only the third column is given.
south = treepredict.mdclassify((None, None, 'south'), hottree)
midat = treepredict.mdclassify((None, None, 'Mid Atlantic'), hottree)

# Share of each result that falls on rating 10.
# - float(): the original ``/`` truncated to an int on Python 2 whenever the
#   counts were ints.
# - .get(10, 0): the original raised KeyError when rating 10 was absent from
#   the result.
print(south.get(10, 0) / float(sum(south.values())))
print(midat.get(10, 0) / float(sum(midat.values())))
コード例 #16
0
ファイル: zillow.py プロジェクト: ugiwgh/codelab
        price = doc.getElementsByTagName('amount')[0].firstChild.data
    except:
        return None
    print zipcode, use, year, bath, bed, rooms, price

    return (zipcode, use, int(year), float(bath), int(bed), int(rooms), price)


def getpricelist():
    """Return the property records for every address in ``addresslist.txt``.

    Each stripped line is passed to ``getaddressdata`` together with the
    city/state string.  Lookups that return ``None`` are left out, because a
    ``None`` entry in the list makes the later variance computation fail.

    Returns:
        A list of the non-``None`` results, in file order.
    """
    ll = []
    # ``with`` closes the file even if a lookup raises.
    with open('addresslist.txt') as address_file:
        for line in address_file:
            # Fixed the misspelled city name ('Cambrige' -> 'Cambridge').
            data = getaddressdata(line.strip(), 'Cambridge,MA')
            if data is not None:
                ll.append(data)
    return ll


#
#

#
#

# Module-level test code #1.  The guard is a constant, so this always runs
# when the module is loaded.
if True:
    housedata = getpricelist()
    import treepredict
    housetree = treepredict.buildtree(
        housedata,
        scoref=treepredict.variance,
    )
    treepredict.drawtree(housetree)
コード例 #17
0
import treepredict
import zillow
#tree = treepredict.buildtree(treepredict.my_data)
#treepredict.printtree(tree)
#treepredict.drawtree(tree, jpeg = 'treeview.jpg')
#print treepredict.classify(['(direct)', 'USA', 'yes', 5], tree)
#treepredict.prune(tree, 1)
#treepredict.printtree(tree)
#print treepredict.mdclassify(['google', 'None', 'yes', None], tree)
# Fetch the price rows and drop any None entry before fitting.
# NOTE(review): whether zillow.getpricelist() can return None entries depends
# on which version of that module is used; the filter is a no-op if it
# cannot.
housedata = [row for row in zillow.getpricelist() if row is not None]
housetree = treepredict.buildtree(housedata,scoref = treepredict.variance)
treepredict.drawtree(housetree, jpeg = 'housetree.jpg')
コード例 #18
0
import treepredict as tp

if __name__ == '__main__':
    # Iris exercise: load the data, split it, print both parts, build and
    # draw a tree from the training part, then run tp.fperformance.

    print("ESERCIZIO SU IRIS DATASET\n")

    mydata = tp.aprifile("iris.txt")

    # The former ``train_data = []`` / ``test_data = []`` initialisers were
    # overwritten here before any use, so they are gone.
    train_data, test_data = tp.createdataset2(mydata, 60, [])

    print("TRAIN DATA : \n")

    print(train_data, "\n")

    print("TEST DATA: \n")

    # Fixed: this printed train_data a second time under the test heading.
    print(test_data)

    iris_tree = tp.buildtree(train_data)

    tp.drawtree(iris_tree, "iris_tree.jpeg")

    tp.fperformance(mydata)
コード例 #19
0
import treepredict as tr

# Load the two match data sets through the ``ad`` module.
agesonly = ad.loadmatch('agesonly.csv', allnum=True)
matchmaker = ad.loadmatch('matchmaker.csv')

# ad.plotagematches(agesonly)

# Parse the CSV into rows of ints, then build, print and draw a tree.
age = [[int(field) for field in line.split(',')]
       for line in file('agesonly.csv')]
tree = tr.buildtree(age)
tr.printtree(tree)
tr.drawtree(tree)

print(tr.classify(tree, [65, 63]))

avgs = ad.lineartrain(agesonly)
print(avgs)

# The same three observations go through both classifiers.  A fresh list is
# built for every call, as in the original.
samples = ((30, 25), (25, 40), (48, 20))
for first, second in samples:
    print(ad.dpclassify([first, second], avgs.values()))

for first, second in samples:
    print(tr.classify(tree, [first, second]))

numericalset = ad.loadnumerical()
コード例 #20
0
    # 状态码为0代表操作成功, 否则代表有错误发生
    if code != '0': return None

    # 提取有关该房产的信息
    try:
        zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data
        use = doc.getElementsByTagName('useCode')[0].firstChild.data
        year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data
        bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data
        bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data
        rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data
        price = doc.getElementsByTagName('amount')[0].firstChild.data
    except:
        return None

    return (zipcode, use, int(year), float(bath), int(bed), int(rooms), price)


def getpricelist():
    """Read ``../data/addresslist.txt`` and build a list of data records.

    Each stripped line is passed to ``getaddressdata`` together with
    ``'Cambridge, MA'``.

    Returns:
        A list with one entry per line, in file order.  Entries are exactly
        what ``getaddressdata`` returned, so the list may contain ``None``.
    """
    # The description used to be a bare string above the ``def``, which
    # Python does not treat as the function's docstring.
    # ``with`` closes the file even if a lookup raises.
    with open('../data/addresslist.txt') as address_file:
        return [getaddressdata(line.strip(), 'Cambridge, MA')
                for line in address_file]

import treepredict
# getpricelist() keeps a None entry for every failed lookup.  A None row has
# no columns to score, so drop those entries before building the tree.
housedata = [row for row in getpricelist() if row is not None]
housetree = treepredict.buildtree(housedata, scoref=treepredict.variance)
treepredict.drawtree(housetree, 'housetree.jpg')
コード例 #21
0
import treepredict as tp

if __name__ == '__main__':
    # Mushrooms exercise: load the data, split it, print both parts, build
    # and draw a tree from the training part, then run tp.fperformance.

    print("ESERCIZIO SU MUSHROOMS DATASET\n")

    print("ALL DATASET:\n")
    mydata = tp.aprifile("mushrooms_final.txt")

    # The former ``train_data = []`` / ``test_data = []`` initialisers were
    # overwritten here before any use, so they are gone.
    train_data, test_data = tp.createdataset2(mydata, 3250, [])

    print("TRAIN DATA : \n")

    print(train_data, "\n")

    print("TEST DATA: \n")

    # Fixed: this printed train_data a second time under the test heading.
    print(test_data)

    mushrooms_tree = tp.buildtree(train_data)

    tp.drawtree(mushrooms_tree, "mushrooms_tree.jpeg")

    tp.fperformance(mydata)