def ho_parse(self, bid, co_id):
    """Fetch the house grid for building *bid* and insert one House row per unit.

    bid   -- building id, spliced into the XML-RPC style payload
    co_id -- community id copied onto every House record
    """
    payload = '<?xml version="1.0" encoding="utf-8" standalone="yes"?><param funname="SouthDigital.CMS.CBuildTableEx.GetBuildHTMLEx"><item>'\
        + bid + '</item><item>1</item><item>1</item><item>100</item><item>1000</item><item>g_oBuildTable</item><item> 1=1</item><item>1</item></param>'
    payload = parse.quote(payload)
    try:
        res = requests.post(
            'http://www.hbsfdc.com/Common/Agents/ExeFunCommon.aspx',
            data=payload, headers=self.headers)
    except Exception as e:
        log.error("{}楼栋请求失败".format(bid))
        # BUG FIX: the original fell through after a failed request and
        # crashed on the undefined `res` below.
        return
    con = res.content.decode()
    ho_list = re.findall("title='(.*?)'>", con, re.S | re.M)
    for ho in ho_list:
        # One malformed title must not abort the remaining houses.
        try:
            house = House(co_index)
            house.co_id = co_id
            house.bu_id = bid
            house.ho_name = re.search('房号:(.*)', ho).group(1)
            house.ho_type = re.search('用途:(.*)', ho).group(1)
            house.ho_room_type = re.search('户型:(.*)', ho).group(1)
            house.ho_build_size = re.search('总面积:(.*)', ho).group(1)
            # 售价 is optional on this site; search once instead of twice.
            price_match = re.search('售价:(.*)', ho)
            house.ho_price = price_match.group(1) if price_match else None
            house.insert_db()
        except Exception as e:
            log.error("{}房屋解析失败{}".format(bid, e))
def house_info(self, ho_url, co_id, bu_id):
    """Open a building page, follow each house-detail link and store the house.

    ho_url -- building page path relative to the site root
    co_id  -- community id copied onto every House record
    bu_id  -- building id copied onto every House record
    """
    url = "http://222.77.178.63:7002/" + ho_url
    # BUG FIX: str.rstrip returns a new string; the original discarded the
    # result, so the trailing '=' was never actually removed.
    url = url.rstrip('=')
    res = requests.get(url, headers=self.headers)
    res.encoding = 'gbk'
    html = etree.HTML(res.text)
    house_detail_list = html.xpath("//td/a[@target]/@href")
    for house_detail in house_detail_list:
        try:
            detail_url = "http://222.77.178.63:7002/" + house_detail
            detail_res = requests.get(detail_url, headers=self.headers)
            detail_res.encoding = 'gbk'
            con = detail_res.text
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = re.search('室号.*?">(.*?)<', con, re.S | re.M).group(1)
            ho.ho_floor = re.search('实际层.*?">(.*?)<', con, re.S | re.M).group(1)
            ho.ho_type = re.search('房屋类型.*?">(.*?)<', con, re.S | re.M).group(1)
            ho.ho_build_size = re.search('预测建筑面积.*?">(.*?)<', con, re.S | re.M).group(1)
            ho.ho_true_size = re.search('预测套内面积.*?">(.*?)<', con, re.S | re.M).group(1)
            ho.ho_share_size = re.search('预测分摊面积.*?">(.*?)<', con, re.S | re.M).group(1)
            ho.ho_price = re.search('总价.*?">(.*?)<', con, re.S | re.M).group(1)
            ho.insert_db()
        except Exception as e:
            print('房屋信息错误{}'.format(e))
def ho_info(self, url, co_id, bu_id):
    """Crawl every room of one building page (via random proxy) and store it."""
    ho_url = 'http://www.aqhouse.net/' + url
    # Retry through random proxies until the building page is fetched.
    # NOTE(review): this retries forever on a permanently dead page — confirm
    # that is intended before bounding it.
    while True:
        try:
            proxy = self.proxies[random.randint(0, 9)]
            ho_res = requests.get(ho_url, headers=self.headers, proxies=proxy)
            break
        except Exception as e:
            print(e)
    ho_html = etree.HTML(ho_res.text)
    room_list = ho_html.xpath("//td[@nowrap]/a/..")
    for room in room_list:
        try:
            room_info = room.xpath("./@title")[0]
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = room.xpath("./a/text()")[0]
            ho.ho_build_size = re.search('建筑面积:(.*?)平方米', room_info).group(1)
            ho.ho_true_size = re.search('套内面积:(.*?)平方米', room_info).group(1)
            ho.ho_share_size = re.search('分摊面积:(.*?)平方米', room_info).group(1)
            ho.ho_room_type = re.search('套型:(.*)', room_info).group(1)
            ho.ho_price = re.search('价格.*?:(.*?)元/平方米', room_info).group(1)
            ho.insert_db()
        # BUG FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        except Exception:
            print('房屋解析失败')
def house_parse(self, bu_id, co_id, sid, propertyid):
    """Page through a building's price-search results and insert every house.

    The price figures are rendered as CSS sprite classes, decoded by
    self.number_replace.
    """
    data = {
        'propertyid': propertyid,
        'sid': sid,
        'buildingid': bu_id,
        'tid': 'price',
        'page': 1,
    }
    res = requests.post('http://tmsf.qzfdcgl.com/newhouse/property_pricesearch.htm',
                        data=data, headers=self.headers)
    # BUG FIX: the original called .group(1) directly and crashed with
    # AttributeError whenever the page-count marker was missing.
    page_match = re.search(r'页数.*?/(\d+)', res.text)
    if page_match is None:
        return
    page = page_match.group(1)
    for i in range(1, int(page) + 1):
        data['page'] = i
        ho_res = requests.post('http://tmsf.qzfdcgl.com/newhouse/property_pricesearch.htm',
                               data=data, headers=self.headers)
        con = ho_res.text
        ho_html = etree.HTML(con)
        house_list = ho_html.xpath("//tr[@onmouseout]")
        for house in house_list:
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = house.xpath("./td[3]/a/div/text()")[0]
            ho.unit = house.xpath("./td[2]/a/div/text()")[0]
            buildsize = house.xpath("./td[4]/a/div/span/@class")
            truesize = house.xpath("./td[5]/a/div/span/@class")
            price = house.xpath("./td[9]/a/div/span/@class")
            ho.ho_build_size = self.number_replace(buildsize)
            ho.ho_true_size = self.number_replace(truesize)
            ho.ho_price = self.number_replace(price)
            ho.insert_db()
def house_parse(self, house_url, co_id, bu_id):
    """Parse the house listing page, follow each detail link for the floor,
    and insert one House row per unit."""
    url = "http://spf.tlfdc.cn/" + house_url
    res = requests.get(url, headers=self.headers)
    con = res.text
    ho_name = re.findall('室号:(.*?)套', con, re.S | re.M)
    ho_room_type = re.findall('套型:(.*?)建', con, re.S | re.M)
    ho_build_size = re.findall('建筑面积:(.*?)参', con, re.S | re.M)
    ho_price = re.findall('价格:(.*?)元', con, re.S | re.M)
    ho_detail = re.findall(r'href="(show.*?\?id=\d+&id2=\d+&prjid=\d+)"', con, re.S | re.M)
    for index in range(0, len(ho_name)):
        try:
            # BUG FIX: a single House instance was previously shared across
            # all iterations; build a fresh record per unit.
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = ho_name[index]
            ho.ho_room_type = ho_room_type[index]
            ho.ho_build_size = ho_build_size[index]
            ho.ho_price = ho_price[index]
            ho_detail_url = "http://spf.tlfdc.cn/" + ho_detail[index]
            detail_res = requests.get(ho_detail_url, headers=self.headers)
            detail_con = detail_res.content.decode('gb2312')
            ho.ho_floor = re.findall('楼层.*?">(.*?)</td>', detail_con, re.S | re.M)[0].strip()
            ho.insert_db()
        # BUG FIX: the bare `except:` printed `e`, which was never bound,
        # raising NameError inside the handler and killing the loop.
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, url), e)
            continue
def house_parse(self, ho_url, co_id, bu_id):
    """Walk the house links on one building page and insert each house."""
    listing_url = "http://61.143.241.154/" + ho_url
    listing_res = requests.get(listing_url, headers=headers)
    tree = etree.HTML(listing_res.content.decode('gbk'))
    # (attribute, row label) pairs; every value sits in the same td layout.
    field_labels = (
        ('ho_name', '房屋号'),
        ('ho_true_size', '套内面积'),
        ('ho_build_size', '建筑面积'),
        ('orientation', '房屋朝向'),
        ('ho_type', '用途'),
        ('ho_price', '申报总价'),
    )
    for detail in tree.xpath("//td[@height='80']/a/@href"):
        try:
            page = requests.get('http://61.143.241.154/' + detail, headers=headers)
            text = page.content.decode('gbk')
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            for attr, label in field_labels:
                value = re.search(label + '.*?">(.*?)</td', text, re.S | re.M).group(1)
                setattr(ho, attr, value)
            ho.insert_db()
        except Exception as e:
            log.error("{}房屋请求解析失败{}".format(detail, e))
def get_build_info(self, url, response, co_id, bu_id):
    """Insert one House per entry of the JSON house list in *response*.

    url      -- unused here; kept for interface compatibility with callers
    response -- requests response whose body is a JSON array of house dicts
    """
    json_html = json.loads(response.text)
    for i in json_html:
        # BUG FIX: the original reused one House instance for every row,
        # so fields set in a previous iteration could leak into later rows.
        house = House(co_index)
        house.co_id = co_id
        house.bu_id = bu_id
        house.ho_name = i['roomno']          # 房号
        house.ho_type = i['ghyt']            # 用途
        house.ho_true_size = i['tnmj']       # 预测套内面积
        house.ho_floor = i['floorindex']     # 楼层
        house.ho_build_size = i['jzmj']      # 建筑面积
        house_code = i["fwcode"]
        house_detail_url = "http://fsfc.fsjw.gov.cn/hpms_project/roomview.jhtml?id=" + str(house_code)
        try:
            res = requests.get(house_detail_url, headers=self.headers)
            house.ho_share_size = re.search('实测分摊面积.*?<td>(.*?)</td>', res.text, re.S | re.M).group(1)
            house.ho_price = re.search('总价.*?<td>(.*?)</td>', res.text, re.S | re.M).group(1)
        except Exception as e:
            print("co_index={},房屋详情页{}请求失败!".format(co_index, house_detail_url))
            print(e)
            continue
        house.insert_db()
def get_house_info(self, co_id, bu_id, id):
    """Follow every house pop-up link on the contract page and insert the house."""
    house_list_url = "http://xx.yyfdcw.com/hetong/fdc_xxdxx.asp?id=" + str(id)
    res = requests.get(house_list_url, headers=self.headers)
    con = res.content.decode('gbk')
    house_list = re.findall(r"onClick=.*?open\('(.*?)',", con, re.S | re.M)
    for house_ in house_list:
        # BUG FIX: the try/except previously guarded only the infallible URL
        # concatenation, while the request and regex parsing — the parts that
        # can actually fail — ran unprotected and crashed the whole loop.
        try:
            house_url = "http://xx.yyfdcw.com/hetong/" + house_
            ho_res = requests.get(house_url, headers=self.headers)
            ho_con = ho_res.content.decode('gbk')
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = re.search('室号.*?fafa>(.*?)</TD', ho_con, re.S | re.M).group(1)
            ho.ho_floor = re.search('实际层.*?fafa>(.*?)</TD', ho_con, re.S | re.M).group(1)
            ho.ho_build_size = re.search('建筑面积.*?fafa>(.*?)</TD', ho_con, re.S | re.M).group(1)
            ho.ho_true_size = re.search('套内面积.*?fafa>(.*?)</TD', ho_con, re.S | re.M).group(1)
            ho.ho_share_size = re.search('分摊面积.*?fafa>(.*?)</TD', ho_con, re.S | re.M).group(1)
            ho.ho_price = re.search('价格.*?fafa>(.*?)</TD', ho_con, re.S | re.M).group(1)
            ho.ho_type = re.search('用途.*?fafa>(.*?)</TD', ho_con, re.S | re.M).group(1)
            ho.insert_db()
        except Exception as e:
            print("co_index={},房屋信息错误".format(co_index), e)
            continue
def house_parse(self, bu_id, co_id):
    """房屋信息解析 — POST the building form and insert every house found.

    The page exposes parallel regex-extracted lists; rows are correlated by
    index, keyed off the getHouseBaseInfo ids.
    """
    house_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/probld/NBView.do?"
    formdata = {"nid": bu_id, "projectid": co_id}
    try:
        res = requests.post(house_url, data=formdata, headers=self.headers)
    except Exception as e:
        print("co_index={},房屋详情页无法访问".format(co_index), e)
        # BUG FIX: the original fell through and crashed on undefined `res`.
        return
    con = res.text
    ho_name = re.findall('\'\);">(.*?) ', con, re.S | re.M)
    ho_build_size = re.findall('<span.*?建筑面积:(.*?)㎡', con, re.S | re.M)
    ho_true_size = re.findall('<span.*?套内面积:(.*?)分', con, re.S | re.M)
    ho_share_size = re.findall('<span.*?分摊面积:(.*?)㎡', con, re.S | re.M)
    ho_type = re.findall('<span.*?用途:(.*?)房', con, re.S | re.M)
    ho_price = re.findall('<span.*?单价:(.*?)"', con, re.S | re.M)
    ho_id = re.findall(r"getHouseBaseInfo\('(.*?)'\)", con, re.S | re.M)
    for index in range(0, len(ho_id)):
        # BUG FIX: one House instance was previously mutated and re-inserted
        # for every row; build a fresh record per house.
        ho = House(co_index)
        ho.co_id = co_id
        ho.bu_id = bu_id
        ho.ho_name = ho_name[index]
        ho.ho_build_size = ho_build_size[index]
        ho.ho_type = ho_type[index]
        ho.ho_share_size = ho_share_size[index]
        ho.ho_price = ho_price[index]
        ho.ho_true_size = ho_true_size[index]
        ho.ho_num = ho_id[index]
        ho.insert_db()
def house_info(self, co_id, bu_id, house_url_list):
    """Fetch each house detail page through the proxy helper and insert it."""
    # (attribute, extraction pattern) pairs for the gbk-decoded detail page.
    extractors = (
        ('ho_name', '房号.*?;">(.*?)</td'),
        ('ho_price', '价格.*?<td>(.*?)元'),
        ('ho_floor', '楼层.*?;">(.*?)</td'),
        ('ho_build_size', '建筑面积.*?<td>(.*?)m'),
        ('ho_true_size', '套内面积.*?<td>(.*?)m'),
        ('ho_share_size', '分摊面积.*?<td>(.*?)m'),
        ('ho_type', '房屋类型.*?<td>(.*?)</td'),
    )
    for path in house_url_list:
        page_url = "http://www.njhouse.com.cn/2016/spf/" + path
        try:
            proxy_req = Proxy_contact(app_name="nanjing", method='get',
                                      url=page_url, headers=self.headers)
            markup = proxy_req.contact().decode('gbk')
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            for attr, pattern in extractors:
                setattr(ho, attr, re.search(pattern, markup, re.S | re.M).group(1))
        except Exception as e:
            log.error("房屋详情页错误{}".format(e))
            continue
        ho.insert_db()
def get_house_info(self, build_num, sid):
    """Configure a regex-driven ProducerListUrl crawl of one building's houses.

    The House attributes hold regex *rules* (not values); ProducerListUrl
    applies them to the fetched page via house.to_dict().
    """
    try:
        house_url = ('http://www.tmsf.com/newhouse/NewPropertyHz_showbox.jspx'
                     '?buildingid=' + build_num + '&sid=' + sid)
        house = House(co_index)
        # Each attribute is the capture rule for the matching JSON field.
        rules = (
            ('bu_id', 'buildingid":(.*?),'),
            ('co_build_size', 'builtuparea":(.*?),'),
            ('ho_price', 'declarationofroughprice":(.*?),'),
            ('ho_name', 'houseno":(.*?),'),
            ('ho_true_size', 'setinsidefloorarea":(.*?),'),
            ('ho_share_size', 'poolconstructionarea":(.*?),'),
            ('ho_type', 'houseusage":(.*?),'),
        )
        for attr, rule in rules:
            setattr(house, attr, rule)
        producer = ProducerListUrl(page_url=house_url,
                                   request_type='get', encode='utf-8',
                                   analyzer_rules_dict=house.to_dict(),
                                   analyzer_type='regex',
                                   headers=self.headers)
        producer.get_details()
    except Exception as e:
        print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, bu_id):
    """POST the building-table XML request and insert one House per title entry."""
    house_url = 'http://www.ytfcjy.com/Common/Agents/ExeFunCommon.aspx'
    payload = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\"?>\r\n<param funname=\"SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx\">\r\n<item>" + \
        bu_id + "</item>\r\n<item>1</item>\r\n<item>1</item>\r\n<item>80</item>\r\n<item>720</item>\r\n<item>g_oBuildTable</item>\r\n<item> 1=1</item>\r\n</param>\r\n"
    headers = {
        'Content-Type': "text/xml",
    }
    response = requests.request("POST", house_url, data=payload, headers=headers)
    # Every house is encoded in a title='...' attribute of the returned grid.
    for info in re.findall("title='(.*?)'", response.text, re.S | re.M):
        house = House(co_index)
        # (attribute, pattern) pairs pulled out of the title text.
        for attr, pattern in (('ho_name', '房号:(.*?)单元'),
                              ('ho_build_size', '总面积:(.*?) 平方米'),
                              ('ho_type', '用途:(.*?)户'),
                              ('ho_price', '价格:(.*?) 元')):
            setattr(house, attr, re.search(pattern, info, re.S | re.M).group(1))
        house.bu_id = bu_id
        house.info = info
        house.insert_db()
def ho_info(self, bu_url_list, co_id):
    """Scan each building URL's tjCor4 tiles and insert one House per tile."""
    # (attribute, pattern) pairs applied to each tile's title text.
    title_rules = (
        ('ho_name', '房号:(.*?)<br'),
        ('ho_room_type', '户型:(.*?)<br'),
        ('ho_build_size', '建筑面积:(.*?)平方米'),
        ('ho_price', '单价:(.*?)元'),
        ('ho_type', '用途:(.*?)<br'),
    )
    for bu_url in bu_url_list:
        try:
            page = requests.get(bu_url, headers=self.headers)
            tree = etree.HTML(page.text)
            # The building id is carried in the dbh query parameter.
            building_id = re.search('dbh=(\d+)', bu_url).group(1)
            for tile in tree.xpath("//li[@class='tjCor4']"):
                title_text = tile.xpath("./@title")[0]
                ho = House(co_index)
                ho.co_id = co_id
                ho.bu_id = building_id
                for attr, pattern in title_rules:
                    setattr(ho, attr, re.search(pattern, title_text).group(1))
                ho.insert_db()
        except Exception as e:
            log.error('房号信息错误', e)
def get_house_info(self, house_url_list):
    """Read each centered table row of the listing pages and insert a House."""
    for url in house_url_list:
        response = requests.get(url)
        rows = etree.HTML(response.text).xpath("//tr[@align='center']")
        for row in rows:
            try:
                # One xpath call; positional cells of the row.
                cells = row.xpath("./td/text()")
                house = House(co_index)
                house.ho_floor = cells[0]
                house.ho_name = cells[1]
                house.ho_room_type = cells[2]
                house.ho_build_size = cells[3]
                house.ho_true_size = cells[4]
                house.ho_share_size = cells[5]
                house.orientation = cells[-2]
                house.ho_price = cells[-1]
                # Building id comes from the page URL's ID parameter.
                house.bu_id = re.search(r'ID=(\d+)', url).group(1)
                house.insert_db()
            except Exception as e:
                print('房号错误,co_index={},url={}'.format(co_index, url), e)
def ho_parse(self, co_id, bu_id, ho_list):
    """Follow each house link element in *ho_list* and insert the house.

    ho_list -- lxml anchor elements whose @href points at a detail page
    """
    # (attribute, pattern) pairs for the detail page.
    patterns = (
        ('ho_name', '房 号.*?<td>(.*?)</td'),
        ('ho_build_size', '建筑面积.*?<td>(.*?)</td'),
        ('ho_true_size', '套内面积.*?<td>(.*?)</td'),
        ('ho_share_size', '分摊面积.*?<td>(.*?)</td'),
        ('ho_floor', '所 在 层.*?<td>(.*?)</td'),
        ('ho_price', '申报单价.*?">(.*?)</td'),
        ('ho_type', '房屋用途.*?<td>(.*?)</td'),
    )
    for ho in ho_list:
        house_url = "http://110.89.45.7:8082" + ho.xpath("./@href")[0]
        try:
            ho_res = requests.get(house_url, headers=self.headers)
        # BUG FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt;
        # dead commented-out proxy-retry code removed.
        except Exception:
            continue
        con = ho_res.text
        house = House(co_index)
        house.co_id = co_id
        house.bu_id = bu_id
        for attr, pattern in patterns:
            setattr(house, attr, re.search(pattern, con, re.S | re.M).group(1))
        house.insert_db()
        # Polite crawl: brief random pause between detail requests.
        time.sleep(random.randint(0, 3))
def house_info(self, co_id, bu_id, dong_url):
    """Open one building (dong) page and insert every linked house record."""
    building_url = self.start_url + "/" + dong_url
    building_res = requests.get(building_url, headers=self.headers)
    building_res.encoding = 'gbk'
    links = re.findall('房屋号.*?<a href="(.*?)"', building_res.text, re.S | re.M)
    # (attribute, pattern) pairs for the gbk-decoded detail page.
    field_rules = (
        ('ho_name', '房屋号.*?">(.*?)</'),
        ('ho_true_size', '套内面积.*?">(.*?)m'),
        ('ho_build_size', '建筑面积.*?">(.*?)m'),
        ('ho_type', '房屋用途.*?">(.*?)<'),
        ('ho_price', '申报总价.*?">(.*?)<'),
        ('orientation', '朝向.*?">(.*?)<'),
    )
    for link in links:
        house_url = self.start_url + "/" + link
        # Fetch through the proxy helper; it returns False on failure.
        connect = Proxy_contact(app_name='maoming', method='get',
                                url=house_url, headers=self.headers)
        content = connect.contact()
        if content is False:
            continue
        ho_con = content.decode('gbk')
        try:
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            for attr, pattern in field_rules:
                setattr(ho, attr, re.search(pattern, ho_con, re.S | re.M).group(1))
            ho.insert_db()
        except Exception as e:
            print("房屋解析失败", e)
def comm_crawler(self, comm_url, co_develops, co_pre_sale, co_name, co_pre_sale_date):
    """Crawl one community page: store community info, its buildings, and
    every house of every building (paged price tables).

    comm_url         -- community page URL
    co_develops, co_pre_sale, co_name, co_pre_sale_date -- passed through to
                        self.comm_info for the community record
    """
    comm_res = requests.get(comm_url, headers=self.headers)
    comm_html = etree.HTML(comm_res.text)
    value = comm_html.xpath("//input[@id='propertyid']/@value")[0]  # property/community id
    sid = comm_html.xpath("//input[@id='sid']/@value")[0]
    bu = Building(co_index)
    bu_num = comm_html.xpath("//div[@id='building_dd']//a")[1:]  # first anchor is "all"
    self.comm_info(co_develops, co_pre_sale, co_name, co_pre_sale_date, value)
    for bu_ in bu_num:
        bu.bu_num = bu_.xpath("./text()")[0]
        bu_id = bu_.xpath("./@id")[0]
        bu.bu_id = re.search(r'\d+', bu_id).group(0)
        bu.co_id = value
        bu.insert_db()
        base_url = ("http://hu.tmsf.com/newhouse/property_" + str(sid) + "_"
                    + str(value) + "_price.htm?buildingid=" + str(bu.bu_id))
        page_html = requests.get(base_url, headers=self.headers)
        page = re.search('页数 \d+/(\d+)', page_html.text).group(1)
        for i in range(1, int(page) + 1):
            # BUG FIX: the original did `detail_url = detail_url + "?page=" + i`,
            # which (a) used '?' on a URL that already has a query string and
            # (b) accumulated a new page parameter on every iteration.
            detail_res = requests.get(base_url + "&page=" + str(i), headers=self.headers)
            house_tree = etree.HTML(detail_res.text)
            house_url_list = house_tree.xpath("//td[@width='100']/a/@href")
            house_bu_num = house_tree.xpath("//td[@width='100']/a/text()")
            house_name = house_tree.xpath("//td[@width='101'][1]/a/div/text()")
            # BUG FIX: was range(1, len+1) — skipped the first house and
            # raised IndexError (silently swallowed) on the last index.
            for index in range(len(house_url_list)):
                try:
                    # Fresh record per house; the original mutated one shared
                    # House instance for the entire crawl.
                    ho = House(co_index)
                    ho.bu_num = house_bu_num[index]  # 楼号/栋号
                    house_url = "http://hu.tmsf.com" + house_url_list[index]
                    house_res = requests.get(house_url, headers=self.headers)
                    house_html = house_res.text
                    ho.bu_id = bu.bu_id
                    ho.co_id = re.search('楼盘主页.*?_\d+_(\d+)_info', house_html).group(1)  # 小区id
                    ho.ho_name = house_name[index]  # 房号, e.g. 3单元403
                    ho.ho_type = re.search('房屋用途:.*?>(.*?)<', house_html).group(1)  # 普通住宅 / 车库仓库
                    ho.ho_floor = re.search('第(.*?)层', house_html).group(1)
                    # Sizes and price are rendered as CSS sprite classes,
                    # decoded by self.number.
                    build_text = re.search('建筑面积:(.*?)平方米', house_html).group(1)
                    ho.ho_build_size = self.number(re.findall('class="(.*?)"', build_text))
                    size_text = re.search('套内面积:(.*?)平方米', house_html).group(1)
                    ho.ho_true_size = self.number(re.findall('class="(.*?)"', size_text))
                    price_text = re.search('总 价:(.*?)万元', house_html).group(1)
                    ho.ho_price = self.number(re.findall('class="(.*?)"', price_text))
                    ho.insert_db()
                except Exception:
                    continue