def get_house(location="binjiang", current_id=1):
    """Collect listing links from the first result pages of one district.

    Debug variant: for each page it prints the matched anchors and stores
    only each listing's URL through ``update_info``.

    NOTE: this file later defines another ``get_house``; that later
    definition rebinds the name, so this one is not what callers get.

    :param location: district slug appended to the lianjia URL.
    :param current_id: row id given to the first listing found.
    :return: whatever ``get_row()`` returns once the pages are processed.
    """
    current_page = 1
    total_page = 2  # hard-coded: only the first two pages are visited
    link_attrs = {"target": "_blank", 'class': 'title', 'data-el': "ershoufang"}

    while current_page <= total_page:
        url = ('http://hz.lianjia.com/ershoufang/' + location +
               '/pg' + str(current_page) + '/')
        page = urllib2.urlopen(url)
        try:
            soup = BeautifulSoup(page, "html.parser")
        finally:
            # The response is fully consumed by the parser; release the socket.
            page.close()

        # Query once and reuse the result for both the debug dump and the loop.
        links = soup.find_all("a", attrs=link_attrs)
        print(links)

        ID_num = current_id
        for i, link in enumerate(links, 1):
            url_text = link.get('href')  # listing detail-page link
            print(str(ID_num) + " " + str(i) + " " + url_text)
            update_info("url", url_text, ID_num)
            ID_num += 1

        # The next page continues numbering where this one stopped.
        current_id = ID_num
        # Advance to the next page; without this the loop never terminates.
        current_page += 1

    return get_row()
# District slug used in lianjia URLs -> Chinese district name stored in the DB.
_DISTRICT_NAMES = {
    'binjiang': u'滨江',
    'xihu': u'西湖',
    'xiacheng': u'下城',
    'jianggan': u'江干',
    'gongshu': u'拱墅',
    'shangcheng': u'上城',
    'yuhang': u'余杭',
    'xiaoshan': u'萧山',
}

# Page size assumed by the page-count arithmetic.
_LISTINGS_PER_PAGE = 30


def _fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup's html.parser."""
    page = urllib2.urlopen(url)
    try:
        return BeautifulSoup(page, "html.parser")
    finally:
        # The parser has consumed the body; release the connection.
        page.close()


def _parse_house_info(context):
    """Split one ``houseInfo`` text on ``|`` into its listing fields.

    Returns ``(village, house_type, square, orientation, decorate)`` where
    ``decorate`` is ``None`` when the field is absent, or ``None`` altogether
    when the text has too few fields to be parsed.
    """
    fields = context.split('|')
    # Villa listings carry a villa marker in the second field, which pushes
    # every later field one position to the right.
    shift = 1 if len(fields) > 1 and u'别墅' in fields[1] else 0
    if len(fields) < 4 + shift:
        return None
    village = fields[0]
    house_type = fields[1 + shift]
    # Strip the trailing area-unit suffix (three characters).
    square = fields[2 + shift][:-3]
    orientation = fields[3 + shift]
    # The decoration field shifts with the others for villas.
    decorate = fields[4 + shift] if len(fields) > 4 + shift else None
    return village, house_type, square, orientation, decorate


def get_house(location="binjiang", current_id=1):
    """Scrape all second-hand listings of one Hangzhou district into the DB.

    The district landing page is fetched first to detect the anti-bot page
    and to read the total listing count; every result page is then fetched
    and its total price, house details, unit price and link are written via
    ``insert_info`` / ``update_info``.

    :param location: district slug used in the lianjia URL (e.g. 'binjiang').
    :param current_id: row id assigned to the first listing scraped.
    :return: whatever ``get_row()`` returns after scraping (the caller treats
        it as the number of rows stored so far).
    """
    global location_chinese

    soup = _fetch_soup('http://hz.lianjia.com/ershoufang/' + location)

    if location in _DISTRICT_NAMES:
        location_chinese = _DISTRICT_NAMES[location]
    else:
        # NOTE(review): scraping still proceeds and the global keeps whatever
        # value an earlier call left in it - confirm this is intended.
        print('wrong location')

    # The site answers with a traffic-verification page once the IP is blocked.
    title_tag = soup.title
    if title_tag is not None and title_tag.text == u"验证异常流量-链家网":
        print(u'ip被封 请尝试更换代理')
        return get_row()

    total_page = 0  # number of result pages in this district
    for summary in soup.find_all('div', 'resultDes clear'):
        numbers = re.findall(r"\d+\.?\d*", summary.get_text())
        if not numbers:
            continue  # summary block without a count; leave total_page as is
        total_house = numbers[0]  # total number of listings in the district
        print(location + u'一共有' + total_house + u'套房子')
        # Ceiling division: an exact multiple of the page size must not
        # produce an extra, non-existent page.
        total_page = ((int(total_house) + _LISTINGS_PER_PAGE - 1)
                      // _LISTINGS_PER_PAGE)

    # Walk every result page of the district.
    for current_page in range(1, total_page + 1):
        soup = _fetch_soup('http://hz.lianjia.com/ershoufang/' + location +
                           '/pg' + str(current_page) + '/')

        # Total price: this pass creates the rows that later passes fill in.
        price_tags = soup.find_all('div', 'totalPrice')
        for house_id, tag in enumerate(price_tags, current_id):
            insert_info("Id", house_id)
            # Drop the trailing unit character from the price text.
            update_info('money', tag.get_text()[:-1], house_id)
            # current_data is a module-level name defined outside this block.
            update_info('current_data', current_data, house_id)

        # House details (villa listings use a shifted field layout).
        info_tags = soup.find_all('div', 'houseInfo')
        for house_id, tag in enumerate(info_tags, current_id):
            parsed = _parse_house_info(tag.get_text())
            if parsed is None:
                # Malformed entry: skip its details but keep ids aligned.
                continue
            village, house_type, square, orientation, decorate = parsed
            update_info("village", village, house_id)
            update_info("house_type", house_type, house_id)
            update_info("square", square, house_id)
            update_info("orientation", orientation, house_id)
            update_info("location", location_chinese, house_id)
            if decorate is not None:
                update_info("decorate", decorate, house_id)

        # Unit price.
        unit_tags = soup.find_all('div', 'unitPrice')
        for house_id, tag in enumerate(unit_tags, current_id):
            numbers = re.findall(r"\d+\.?\d*", tag.get_text())
            if numbers:
                update_info("per_square", numbers[0], house_id)
            update_info("page", current_page, house_id)

        # Link to the listing's detail page.
        link_tags = soup.find_all("a", attrs={"target": "_blank",
                                              'class': "title"})
        for house_id, tag in enumerate(link_tags, current_id):
            update_info("url", tag.get('href'), house_id)

        # Continue numbering after the rows actually inserted on this page,
        # not after however many anchors happened to match.
        current_id += len(price_tags)

    return get_row()
if __name__ == '__main__':
    # Script entry point: scrape every district in turn, chaining row ids so
    # each district starts right after the rows already stored.
    create_table()
    row = get_row()  # number of rows currently in the database
    row = get_house('binjiang', row + 1)
    print(u'当前时间为:*************************************************')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    # Report the running total (and the time.clock() reading) before each of
    # the remaining districts; nothing is printed after the last one.
    for district in ('jianggan', 'gongshu', 'shangcheng',
                     'yuhang', 'xiaoshan', 'xihu'):
        print(u'总计已采集数据量为' + str(row) + ' ' + str(time.clock()))
        row = get_house(district, row + 1)