/
houseParser.py
118 lines (104 loc) · 3.5 KB
/
houseParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# -*- coding=utf8 -*-
import sys
import urllib2
import time
import datetime
import json
from bs4 import BeautifulSoup
import xlwt
import xlrd
from config import *
from house import House
class HouseParser:
    """
    Crawl the paginated listing under RootUrl and store every house in an Excel sheet.

    Every listing row becomes a House record carrying a duplicate flag
    (attribute ``flage``):
        0 -- first time this houseId is seen in this run
        1 -- houseId seen before with the same address, price and area
        2 -- houseId seen before but with different data
    """

    def __init__(self):
        # houseId -> first House record seen with that id; used to detect duplicates.
        self.success_houseIds = {}
        # Records parsed from the current page that are not yet written to the sheet.
        self.houses = []
        # Workbook that accumulates all pages and is saved after each one.
        self.book = xlwt.Workbook(encoding='utf-8')
        self.worksheet = self.book.add_sheet('houseinfo')
        self.writeExcelHead()
        # Number of data rows already written (row 0 holds the header).
        self.excelIndex = 0
        # Timestamp of this run; not used by the current fixed output file name.
        self.timestamp = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))

    def getHouseList(self, pageCount):
        """
        Fetch one listing page, process every house on it and save the workbook.

        pageCount -- page number; appended to RootUrl to build the page URL.

        A page without the expected listing container is reported and treated
        as containing no houses instead of aborting the whole crawl.
        """
        response = urllib2.urlopen(urllib2.Request(RootUrl + str(pageCount)))
        try:
            content = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        # Decode as GB2312 (undecodable bytes are dropped) and re-encode as UTF-8.
        originContent = content.decode('gb2312', 'ignore').encode('utf-8')
        # NOTE(review): no parser is named, so bs4 chooses one from what is
        # installed -- confirm whether a specific parser should be pinned.
        soup = BeautifulSoup(originContent)
        body = soup.body
        container = None
        if body is not None:
            container = body.find('div', id='searchmain_c_1')
        if container is None:
            print("page %s: listing container not found, no houses parsed" % pageCount)
            houseList = []
        else:
            houseList = container.find_all(
                'div',
                attrs={"class": "inventory_list_house inventory_out _houselist"})
        for houseStr in houseList:
            self.processHouse(houseStr)
        # Persist after every page, then start a fresh buffer for the next one.
        self.saveToFile()
        self.houses = []

    @staticmethod
    def _houseIdFromHref(href, prefix):
        """
        Return the house id embedded in a detail-page link.

        Drops the first len(prefix) characters and the final character of href
        (presumably a trailing '/' -- verify against the site's link format).
        """
        return href[len(prefix):][:-1]

    def processHouse(self, houseStr):
        """
        Turn one listing element into a House record and queue it for saving.

        houseStr -- the element of a single listing entry, as returned by find_all.

        Every scraped entry gets its own House record holding the scraped
        values, so flagging a duplicate never alters the record (or the pending
        row) of the first occurrence.
        """
        links = houseStr.find('div', attrs={"class": "inventory_list_r_tit_list"}).find_all('a')
        houseId = self._houseIdFromHref(links[0]['href'], PrefixURL)
        address = houseStr.find('div', attrs={"class": "inventory_list_r_name_ad"}).text
        details = houseStr.find('div', attrs={"class": "inventory_list_r_details_r"}).find_all('span')
        price = details[2].text
        area = details[1].text

        isDuplicate = houseId in self.success_houseIds
        if isDuplicate:
            print("id:%s exist" % houseId)
        print("%s %s %s %s" % (houseId, address, price, area))

        if not isDuplicate:
            flag = 0
        else:
            first = self.success_houseIds[houseId]
            if address == first.address and price == first.price and area == first.area:
                flag = 1
            else:
                flag = 2
                print("The same houseId have different data")

        house = House()
        house.houseId = houseId
        house.address = address
        house.price = price
        house.area = area
        house.flage = flag
        self.houses.append(house)
        if not isDuplicate:
            # Only the first occurrence serves as the reference for later comparisons.
            self.success_houseIds[houseId] = house

    def writeExcelHead(self):
        """
        Write the header row (row 0) of the worksheet.

        Columns: id, address, area per unit, unit price, duplicate flag.
        """
        headers = ('编号', '地址', '每套面积', '单价', '重复标记')
        for column, title in enumerate(headers):
            self.worksheet.write(0, column, label=title)

    def saveToFile(self):
        """
        Append the buffered houses to the worksheet and save the workbook.

        Rows are written below those from earlier calls; excelIndex is then
        advanced by the number of houses written.
        """
        for offset, house in enumerate(self.houses):
            # +1 skips the header row.
            row = self.excelIndex + offset + 1
            values = (house.houseId, house.address, house.area, house.price, house.flage)
            for column, value in enumerate(values):
                self.worksheet.write(row, column, label=value)
        self.excelIndex += len(self.houses)
        self.book.save('houseInfo_origin.xls')
if __name__ == '__main__':
    # Crawl listing pages 1..TotalPage in order, pausing PagePerTime seconds
    # after each page.
    houseParser = HouseParser()
    for pageNumber in range(1, TotalPage + 1):
        print("第%d页\n" % pageNumber)
        houseParser.getHouseList(pageNumber)
        time.sleep(PagePerTime)