import urllib import urllib2 from urlparse import urlparse from bs4 import BeautifulSoup import time import SouFunLogger __author__ = 'wpbird' default_url = 'http://esf.soufun.com' default_dir = 'f:\\soufun' default_sep = os.path.sep default_time = time.strftime("%Y-%m-%d", time.localtime()) default_encode = 'gbk' default_imgDir = 'f:\\soufun\\imgs' LOG = SouFunLogger.initLog() #获取图片 def getImage(url, name, imgs): result = urlparse(url) imgpath = default_imgDir + default_sep + unicode(name) + default_sep + result.path.split('/')[-1] + default_sep if not os.path.exists(imgpath): os.makedirs(imgpath) for img in imgs: print u'%s 写入' % img urllib.urlretrieve(img, unicode(imgpath) + ''.join(urlparse(img).path.split('/')[-2:])) #抓取指定url内容 def getUrlInfo(url): html = '' try:
# (label keyword, SouFunDetail attribute) pairs for the ".base_info" section.
# Order matters: the first keyword contained in a label wins, exactly like
# the if/elif chain this table replaces.
_BASE_INFO_FIELDS = (
    (u'户型', 'roomStyle'),       # layout
    (u'使用面积', 'useArea'),     # usable area
    (u'建筑面积', 'area'),        # floor area
    (u'年代', 'year'),            # year built
    (u'朝向', 'face'),            # orientation
    (u'楼层', 'floor'),           # floor
    (u'结构', 'structure'),       # structure
    (u'装修', 'decoration'),      # decoration
    (u'住宅类别', 'type'),        # residence category
    (u'建筑类别', 'build_type'),  # building category
    (u'产权性质', 'right'),       # property right
    (u'看房时间', 'meetTime'),    # viewing time
    (u'楼盘名称', 'build_name'),  # estate name
    (u'配套设施', 'support'),     # facilities
)

# (label keyword, SouFunDetail attribute) pairs for the community summary.
# The embedded spaces are part of the page labels and must be kept.
_COMMUNITY_FIELDS = (
    (u'物业类型', 'wuyeType'),    # property type
    (u'绿 化 率', 'lvhua'),       # greening rate
    (u'物 业 费', 'wuyeFee'),     # property fee
    (u'物业公司', 'wuyeComp'),    # property management company
    (u'开 发 商', 'developer'),   # developer
)


def _assign_first_match(detail, label, value, field_table):
    """Set *value* on the attribute of the first keyword found in *label*."""
    for keyword, attr in field_table:
        if keyword in label:
            setattr(detail, attr, value)
            return


def parse(page, url):
    """Parse one listing detail page into a ``SouFunDetail`` object.

    Args:
        page: Markup of the detail page; passed to ``BeautifulSoup`` with
            ``from_encoding="utf-8"`` and also stored as ``detail.content``.
        url: Address the page was fetched from; stored as
            ``detail.sourceUrl``.

    Returns:
        The populated ``SouFunDetail`` instance.  Optional "label:value"
        fields that are not found on the page are left as ``''``.

    Raises:
        AttributeError: If the title, publish-time or phone element is not
            found (the lookup yields ``None``).
        IndexError: If the listing number, description or address/traffic
            elements are missing.
    """
    # Local import: the code below needs the ``bs4`` package name itself
    # (for ``bs4.element.Tag``), not only ``BeautifulSoup``.
    import bs4.element

    LOG = SouFunLogger.initLog()
    detail = SouFunDetail()
    # URL of the parsed page
    detail.sourceUrl = url
    # Raw content of the parsed page
    detail.content = page
    # Parse the content
    content = BeautifulSoup(page, from_encoding="utf-8")

    # Title
    title_tag = content.find('h1', attrs={"class": "icon_tag20120517"})
    detail.title = ''.join(title_tag.stripped_strings)
    # Listing number: drop the 5-character prefix of the second span's text
    detail.no = content.select(".gray6 span")[1].string[5:].strip()
    # Publish time: fifth child node, minus a 5-character prefix and the
    # final character
    publish_time = content.find('p', class_="gray6").contents[4]
    detail.publish_time = publish_time[5:-1]

    # Basic information: one text item per tag child of every <dl>
    items = []
    for dl in content.select('.base_info dl'):
        items.extend(''.join(tag.stripped_strings)
                     for tag in dl.contents
                     if isinstance(tag, bs4.element.Tag))

    detail.total = ''
    for _, attr in _BASE_INFO_FIELDS:
        setattr(detail, attr, '')

    for item in items:
        parts = [piece.strip() for piece in item.split(u':')]
        if len(parts) != 2:
            continue
        label, value = parts
        # The total price is only accepted when the value contains '(' at
        # an index greater than zero; otherwise the remaining labels are
        # still tried.
        if u'总价' in label and value.find('(') > 0:
            detail.total = value
        else:
            _assign_first_match(detail, label, value, _BASE_INFO_FIELDS)

    # Phone
    detail.phone = content.find("span", attrs={"id": "mobilecode"}).string
    LOG.debug(
        'total:%s roomstyle: %s area:%s useArea:%s' % (detail.total, detail.roomStyle, detail.area, detail.useArea))
    LOG.debug(
        'year:%s face:%s floor:%s structure:%s decoration:%s type:%s build_type:%s right:%s meetTime:%s build_name:%s support:%s' % (
            detail.year, detail.face, detail.floor, detail.structure, detail.decoration, detail.type,
            detail.build_type, detail.right, detail.meetTime, detail.build_name, detail.support))

    # Listing description
    detail.describe = ''.join(content.select('.describe.mt10 div')[0].stripped_strings)

    # Floor-plan image: run the selector once and reuse the result
    size_imgs = content.select('#esfbjxq_117 img')
    detail.sizeImg = size_imgs[0]['src'] if size_imgs else ''
    LOG.debug('sizeImg:%s' % detail.sizeImg)

    # Indoor images; has_attr() replaces the deprecated Tag.has_key()
    room_imgs = [img['src']
                 for img in content.select('#esfbjxq_116 img')
                 if img.has_attr('src')]
    detail.indoorImgs = ','.join(room_imgs)
    LOG.debug('indoorImgs:%s' % detail.indoorImgs)

    # Exterior images are written by JavaScript on the page and would need a
    # parser that can execute the page.

    # Map / traffic: paragraph 0 is the address, paragraph 1 the traffic info
    addr_and_traffic = [''.join(p.stripped_strings)
                        for p in content.select('#esfbjxq_121 p')]
    detail.address = addr_and_traffic[0][3:]
    # Remove all whitespace from the traffic text after its 5-char prefix
    detail.traffic = ''.join(addr_and_traffic[1][5:].split())
    LOG.debug('address:%s traffic:%s' % (detail.address, detail.traffic))

    # Community summary (property type, greening rate, property fee,
    # property company, developer)
    for _, attr in _COMMUNITY_FIELDS:
        setattr(detail, attr, '')
    for dd in content.select('dl.mt10 dd'):
        pieces = ''.join(dd.stripped_strings).strip().split(u':')
        if len(pieces) == 2:
            _assign_first_match(detail, pieces[0], pieces[1], _COMMUNITY_FIELDS)
    LOG.debug('wuyeType:%s lvhua:%s wuyefee:%s wuyeCom:%s developer:%s' % (
        detail.wuyeType, detail.lvhua, detail.wuyeFee, detail.wuyeComp, detail.developer))
    return detail