def get_house_info(self, house_url_list, bu_id, co_id):
    """Scrape every house detail page on www.fjnpfdc.com into a House record.

    Args:
        house_url_list: path fragments appended to the site's ``/House/``
            prefix, one per house.
        bu_id: building id copied onto every record.
        co_id: community id copied onto every record.

    A page that cannot be downloaded, decoded as GBK or parsed is reported
    with ``print`` and skipped; the remaining pages are still processed.
    """
    flags = re.S | re.M
    # (House attribute, pattern) pairs; group(1) of each match is stored.
    field_patterns = (
        ('bu_num', '幢 号:.*?<td>(.*?)<'),
        ('ho_name', '房 号:.*?<td>(.*?)<'),
        ('co_name', '项目名称:.*?<td>(.*?)<'),
        ('ho_build_size', '建筑面积:.*?<td>(.*?)<'),
        ('ho_true_size', '套内面积:.*?<td>(.*?)<'),
        ('ho_share_size', '分摊面积:.*?<td>(.*?)<'),
        ('ho_floor', '所 在 层:.*?<td>(.*?)<'),
    )
    for i in house_url_list:
        try:
            house = House(co_index)
            house_url = 'http://www.fjnpfdc.com/House/' + i
            house_res = requests.get(house_url, headers=self.headers)
            house_con = house_res.content.decode('gbk')
            house.bu_id = bu_id
            house.co_id = co_id
            for attr_name, pattern in field_patterns:
                # A missing field raises AttributeError on .group and
                # abandons this house before it is inserted.
                matched = re.search(pattern, house_con, flags)
                setattr(house, attr_name, matched.group(1))
            house.insert_db()
        except Exception as e:
            print("co_index={},房屋{}错误".format(co_index, i), e)
def get_house_detail(self, house_url_list):
    """Parse each building page and store one House per ``<Room>`` entry.

    Args:
        house_url_list: absolute URLs of the building pages to download.

    Every page is decoded as GBK. The building number and project name are
    read once per page; each ``<Room>`` element then yields one House whose
    display name is looked up by room id. A room that cannot be stored
    (for example because its id has no matching name) is reported with
    ``print`` and skipped. Download or page-level parse failures are not
    caught here and propagate to the caller.
    """
    flags = re.S | re.M
    for i in house_url_list:
        res = requests.get(i)
        html = res.content.decode('gbk')
        bu_name = re.search('楼号:.*?HouseNum">(.*?)</span>', html, flags).group(1)
        co_name = re.search('项目名称.*?PrjName">(.*?)</span>', html, flags).group(1)
        # Room id -> displayed room name. Raw string: "\?" is not a valid
        # escape in a normal string literal.
        ho_id_dict = dict(
            re.findall(r"aspx\?Room=(.*?)'.*?<b>(.*?)</b>", html, flags))
        house_info = re.findall(
            "<Room><Cell RoomID='(.*?)'.*?BArea='(.*?)'.*?HouseUse='(.*?)'.*?</Room>",
            html, flags)
        for room_id, true_size, house_use in house_info:
            try:
                h = House(self.co_index)
                h.ho_name = ho_id_dict[room_id]
                h.ho_true_size = true_size
                h.ho_type = house_use
                h.co_name = co_name
                h.bu_num = bu_name
                h.insert_db()
            except Exception as e:
                # Report with the same co_index the record was built with.
                print('房屋错误,co_index={},url={}'.format(self.co_index, i), e)
def get_house_info(self, house_url_list, co_name, bu_num):
    """Scrape every house page under www.sxczfdc.com/pubinfo/ into a House.

    Args:
        house_url_list: path fragments appended to the ``/pubinfo/`` prefix.
        co_name: community name copied onto every record.
        bu_num: building number copied onto every record.

    Any failure while fetching or parsing a page is printed and that house
    is skipped.
    """
    flags = re.S | re.M
    # (House attribute, pattern) pairs; the first findall hit is stored.
    label_patterns = (
        ('ho_floor', 'HouseInfo1_lblFwlc">(.*?)<'),
        ('ho_name', 'HouseInfo1_lblFwfh">(.*?)<'),
        ('ho_type', 'HouseInfo1_lblFwlx">(.*?)<'),
        ('ho_room_type', 'HouseInfo1_lblFwhx">(.*?)<'),
        ('ho_build_size', 'HouseInfo1_lblycfwjzmj">(.*?)<'),
        ('ho_true_size', 'HouseInfo1_lblycfwtnmj">(.*?)<'),
        ('ho_share_size', 'HouseInfo1_lblycfwftmj">(.*?)<'),
        ('orientation', 'HouseInfo1_lblCx">(.*?)<'),
    )
    for i in house_url_list:
        try:
            house = House(co_index)
            house.co_name = co_name
            house.bu_num = bu_num
            house_url = 'http://www.sxczfdc.com/pubinfo/' + i
            html = requests.get(house_url, headers=self.headers).text
            for attr_name, pattern in label_patterns:
                # An absent label makes the [0] lookup raise IndexError,
                # which drops this house.
                hits = re.findall(pattern, html, flags)
                setattr(house, attr_name, hits[0])
            house.insert_db()
        except Exception as e:
            print(e)
def get_house_detail(self, house_detail_url_list, co_id, bu_id):
    """Scrape every house detail page on www.yzfdc.cn into a House record.

    Args:
        house_detail_url_list: path fragments appended to the site root.
        co_id: community id copied onto every record.
        bu_id: building id copied onto every record.

    Pages are fetched through the session ``self.s`` after a three second
    pause. A page that fails to download or parse is reported with
    ``print`` and skipped.
    """
    flags = re.S | re.M
    # (House attribute, pattern) pairs; group(1) of each match is stored.
    label_patterns = (
        ('co_name', 'lblxmmc.*?>(.*?)<'),
        ('bu_num', 'lbldh.*?>(.*?)<'),
        ('ho_name', 'lblfh.*?>(.*?)<'),
        ('ho_build_size', 'lbljzmj.*?>(.*?)<'),
        ('ho_true_size', 'lbltnmj.*?>(.*?)<'),
        ('ho_share_size', 'lblftmj.*?>(.*?)<'),
        ('ho_type', 'lblfwxz.*?>(.*?)<'),
        ('ho_room_type', 'lblhuxin.*?>(.*?)<'),
    )
    for i in house_detail_url_list:
        detail_url = 'http://www.yzfdc.cn/' + i
        try:
            house = House(co_index)
            # Fixed pause before every request.
            time.sleep(3)
            html = self.s.get(detail_url, headers=self.headers).text
            for attr_name, pattern in label_patterns:
                matched = re.search(pattern, html, flags)
                setattr(house, attr_name, matched.group(1))
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, detail_url), e)
def get_house_info(self, house_url_list):
    """Hand each room page on zjjg.0557fdc.com to ProducerListUrl.

    Args:
        house_url_list: strings containing ``dongid=...&`` and
            ``roomid=...&`` query parameters.

    For every entry the canonical ``roominfo.aspx`` URL is rebuilt from the
    two ids. The House object is used as a rule container: each attribute
    holds a regex, and ``house.to_dict()`` is passed to ProducerListUrl as
    ``analyzer_rules_dict``. Failures are reported with ``print`` and the
    entry is skipped.
    """
    for i in house_url_list:
        # Fall back to the raw entry so the error message always has a URL,
        # even when the ids cannot be extracted.
        house_url = i
        try:
            dongid = re.search('dongid=(.*?)&', i).group(1)
            roomid = re.search('roomid=(.*?)&', i).group(1)
            house_url = ('http://zjjg.0557fdc.com:9555/xiaoqu/roominfo.aspx?dongid='
                         + dongid + '&roomid=' + roomid)
            house = House(co_index)
            # The values below are extraction patterns, not scraped data.
            house.co_name = 'Labelxqmc">(.*?)<'
            house.area = 'Labelxzq">(.*?)<'
            house.bu_num = 'Labeldongmc">(.*?)<'
            house.ho_type = 'Labelyxyongtu">(.*?)<'
            house.ho_name = '<span id="Labelroommc".*?>(.*?)</span>'
            house.ho_build_size = 'Labeljzmianji">(.*?)<'
            house.ho_true_size = 'Labeltaonei">(.*?)<'
            house.ho_share_size = 'Labelgongtan">(.*?)<'
            house.ho_room_type = 'Labelhuxing">(.*?)<'
            house.bu_id = 'dongid=(.*?)&'
            p = ProducerListUrl(page_url=house_url,
                                request_type='get', encode='utf-8',
                                analyzer_rules_dict=house.to_dict(),
                                analyzer_type='regex',
                                headers=self.headers)
            p.get_details()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_build_info(self, comm_url_list):
    """Store one Building per entry, then every House listed for it.

    Args:
        comm_url_list: strings containing two ``+<digits>+`` groups; the
            first is used as ``sid`` and the second as ``pid``.

    The building page on www.jjzzfdc.com.cn is scraped into a Building.
    The house list is then requested with ``sid`` and each ``<Result>``
    element becomes a House linked to that building. Building and house
    failures are reported with ``print``. When the ids cannot be parsed or
    no Building object was created for this entry, the houses are skipped
    instead of being attached to a previous entry's building.
    """
    flags = re.S | re.M
    for i in comm_url_list:
        # Reset per entry so a failure never reuses values left over from
        # the previous iteration (or hits an unbound name).
        build_url = i
        sid = None
        build = None
        try:
            ids = re.findall(r'\+(\d+)\+', i)
            sid, pid = ids[0], ids[1]
            build_url = ('http://www.jjzzfdc.com.cn/WebClient/ClientService/bldg_query.aspx?pid='
                         + pid + '&sid=' + sid)
            html = requests.get(build_url).text
            build = Building(co_index)
            build.bu_id = pid
            # The same cell supplies both the building number and address.
            location = re.search('楼栋座落.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_num = location
            build.bu_address = location
            build.bu_pre_sale = re.search('预售证号.*?">(.*?) ', html, flags).group(1)
            build.bu_pre_sale_date = re.search('时间.*?">(.*?) ', html, flags).group(1)
            build.bu_all_house = re.search('dM.*?">(.*?) ', html, flags).group(1)
            build.insert_db()
        except Exception as e:
            print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url), e)
        if sid is None or build is None:
            continue
        # The query string separates key and params with '&'.
        house_url = ('http://www.jjzzfdc.com.cn/WebClient/ClientService/proxp.aspx?key=WWW_LPB_001&params='
                     + sid)
        html_ = requests.get(house_url).text
        for house_info in re.findall('<Result.*?</Result>', html_, flags):
            try:
                house = House(co_index)
                house.bu_id = build.bu_id
                house.bu_num = build.bu_num
                house.ho_name = re.search('<ONAME>(.*?)</ONAME>', house_info, flags).group(1)
                house.ho_num = re.search('<OSEQ>(.*?)</OSEQ>', house_info, flags).group(1)
                house.ho_build_size = re.search('<BAREA>(.*?)</BAREA>', house_info, flags).group(1)
                house.ho_floor = re.search('<FORC>(.*?)</FORC>', house_info, flags).group(1)
                house.ho_true_size = re.search('<PAREA>(.*?)</PAREA>', house_info, flags).group(1)
                house.insert_db()
            except Exception as e:
                print('co_index={}, 房号错误'.format(co_index), e)
def get_house_info(self, zu_house_url, bu_num, co_id):
    """Collect the postback targets of a house-table page.

    Args:
        zu_house_url: URL of the page listing the houses.
        bu_num: building number stored on the House template.
        co_id: community id stored on the House template.

    The page is fetched through the session ``self.s``. The first
    ``ItemName`` text is stored (stripped) as ``house.info``. The two
    arguments of every ``__doPostBack(...)`` click handler are collected
    and passed with the House to ``self.get_house_detail``. Any failure is
    printed and the method returns without raising.
    """
    try:
        house = House(co_index)
        house.bu_num = bu_num
        house.co_id = co_id
        result = self.s.get(zu_house_url, headers=self.headers).text
        house.info = re.search('ItemName.*?>(.*?)<', result).group(1).strip()
        # Raw strings: "\(" and "\)" are regex escapes, not string escapes.
        ho_code_list = re.findall(
            r"OnClick=.__doPostBack\(.*?,'(.*?)'\)", result, re.S | re.M)
        ho_msg_list = re.findall(
            r"OnClick=.__doPostBack\('(.*?)'", result, re.S | re.M)
        self.get_house_detail(zu_house_url, ho_msg_list, ho_code_list, house)
    except Exception as e:
        print(e)
def house_info(self, house_list, bu_id, co_id):
    """Scrape every house page under ris.szpl.gov.cn/bol/ into a House.

    Args:
        house_list: path fragments appended to the ``/bol/`` prefix; each
            must contain ``id=<digits>``.
        bu_id: building id copied onto every record.
        co_id: community id copied onto every record.

    Raises:
        AttributeError: when any expected field is missing from a page
            (``re.search`` returned ``None``). Nothing is caught here, so
            the first bad page stops the loop.
    """
    for house_url in house_list:
        # One fresh record per page so no state is shared between inserts.
        ho = House(co_index)
        url = "http://ris.szpl.gov.cn/bol/" + house_url
        res = requests.get(url, headers=self.headers)
        # Provisional value from the URL; replaced below by the room number
        # printed on the page. Kept so a URL without an id still fails fast.
        ho.ho_num = re.search(r'id=(\d+)', house_url).group(1)
        con = res.text
        ho.bu_num = re.search('情况.*?">(.*?)&', con).group(1)
        ho.bu_id = bu_id
        ho.co_id = co_id
        ho.ho_floor = re.search(r'楼层.*?">(\d+)&', con).group(1)
        ho.ho_num = re.search(r'房号.*?">(\d+)&', con).group(1)
        ho.ho_type = re.search(r'用途.*?">(\d+)&', con).group(1)
        ho.ho_room_type = re.search(r'户型.*?">(\d+)&', con).group(1)
        # NOTE(review): the '.' between the digit groups is unescaped and
        # matches any character; left unchanged so the same pages match.
        ho.ho_build_size = re.search(r'建筑面积<.*?">(\d+.\d+)平方米', con).group(1)
        ho.ho_true_size = re.search(r'户内面积<.*?">(\d+.\d+)平方米', con).group(1)
        ho.insert_db()
def get_build_info(self, build_url_list, bu_pre_sale_list, co_name, co_id):
    """Store each building and all houses linked from its page.

    Args:
        build_url_list: path fragments (containing ``lh=<digits>``)
            appended to ``http://221.2.144.162:8090/``.
        bu_pre_sale_list: pre-sale values, matched to ``build_url_list``
            by position.
        co_name: community name copied onto buildings and houses.
        co_id: community id copied onto buildings and houses.

    Pages are decoded as GBK. A building that fails is reported and
    skipped together with its houses; a house that fails is reported and
    skipped on its own.
    """
    flags = re.S | re.M
    base_url = 'http://221.2.144.162:8090/'
    for bu_idx, build_path in enumerate(build_url_list):
        try:
            build = Building(co_index)
            build.co_id = co_id
            build.co_name = co_name
            # Indexed inside the try so a shorter pre-sale list is
            # reported as a building error rather than crashing.
            build.bu_pre_sale = bu_pre_sale_list[bu_idx]
            build.bu_id = re.search(r'lh=(\d+)', build_path).group(1)
            build_url = base_url + build_path
            response = requests.get(build_url, headers=self.headers)
            html = response.content.decode('gbk')
            build.bu_num = re.findall('<font color=white.*?><b>(.*?)<', html, flags)[0]
            build.bu_address = re.findall('坐落位置:</b>(.*?)<', html, flags)[0]
            build.insert_db()
            ho_url_list = re.findall('background-.*?href=(.*?) ', html, flags)
            ho_name_list = re.findall('background-color.*?<a.*?>(.*?)<', html, flags)
            # Separate index name: the inner loop must not shadow bu_idx.
            for ho_idx, ho_path in enumerate(ho_url_list):
                try:
                    house = House(co_index)
                    house_url = base_url + ho_path
                    result = requests.get(
                        house_url, headers=self.headers).content.decode('gbk')
                    house.bu_id = build.bu_id
                    house.co_id = co_id
                    house.ho_type = re.findall(
                        '用 途:.*?<td.*?>(.*?)<', result, flags)[0]
                    house.ho_build_size = re.findall(
                        '建筑面积:.*?<td>(.*?)<', result, flags)[0]
                    house.bu_num = build.bu_num
                    house.co_name = co_name
                    house.ho_name = ho_name_list[ho_idx]
                    house.insert_db()
                except Exception as e:
                    print("co_index={},房屋信息错误".format(co_index), e)
        except Exception as e:
            print("co_index={},楼栋信息错误".format(co_index), e)
def get_house_info(self, house_url_list):
    """Hand each house page on www.ndjsj.gov.cn to ProducerListUrl.

    Args:
        house_url_list: path fragments appended to the ``/House/`` prefix.

    The House object serves as a rule container: every attribute is set to
    a regex, and ``house.to_dict()`` is given to ProducerListUrl as
    ``analyzer_rules_dict``. Failures are reported with ``print`` and the
    entry is skipped.
    """
    # (House attribute, extraction pattern), in assignment order.
    rule_table = (
        ('bu_num', '幢 号:.*?<td.*?>(.*?)<'),
        ('ho_name', '房 号:.*?<td.*?>(.*?)<'),
        ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
        ('ho_build_size', '建筑面积:.*?<td.*?>(.*?)<'),
        ('ho_true_size', '套内面积:.*?<td.*?>(.*?)<'),
        ('ho_share_size', '分摊面积:.*?<td.*?>(.*?)<'),
        ('ho_type', '房屋用途:.*?<td.*?>(.*?)<'),
        ('ho_floor', '所 在 层:.*?<td.*?>(.*?)<'),
        ('ho_room_type', '房屋户型:.*?<td.*?>(.*?)<'),
    )
    for i in house_url_list:
        try:
            house = House(co_index)
            house_url = 'http://www.ndjsj.gov.cn/House/' + i
            for attr_name, rule in rule_table:
                setattr(house, attr_name, rule)
            producer = ProducerListUrl(
                page_url=house_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=house.to_dict(),
                analyzer_type='regex',
                headers=self.headers,
            )
            producer.get_details()
        except Exception as e:
            print('宁德房号错误,url={}'.format(house_url), e)
def get_house_info(self, code, co_name):
    """Request a building table from house.bffdc.gov.cn and store its houses.

    Args:
        code: indexable pair; ``code[0]`` is sent as the first ``<item>``
            of the XML request and ``code[1]`` is stored as the building
            number of every house.
        co_name: community name copied onto every record.

    The XML payload is POSTed with a ``text/xml`` content type. Every
    ``title='...'`` attribute in the response is parsed as one house; a
    title that lacks any expected field is printed as an error and
    skipped.
    """
    house_url = 'http://house.bffdc.gov.cn/Common/Agents/ExeFunCommon.aspx?'
    payload_head = (
        '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\r\n'
        '<param funname="SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx">\r\n'
        '<item>'
    )
    payload_tail = (
        '</item>\r\n'
        '<item>1</item>\r\n'
        '<item>1</item>\r\n'
        '<item>55</item>\r\n'
        '<item>840</item>\r\n'
        '<item>g_oBuildTable</item>\r\n'
        '<item>false</item>\r\n'
        '<item> 1=1</item>\r\n'
        '</param>\r\n'
    )
    payload = payload_head + code[0] + payload_tail
    headers = {'Content-Type': "text/xml"}
    html = requests.post(url=house_url, data=payload, headers=headers).text
    # (House attribute, pattern) pairs; each value ends at a CRLF.
    title_patterns = (
        ('ho_name', '房号:(.*?)\r\n'),
        ('ho_type', '用途:(.*?)\r\n'),
        ('ho_room_type', '户型:(.*?)\r\n'),
        ('ho_build_size', '总面积:(.*?)\r\n'),
    )
    for title in re.findall("title='(.*?)'", html, re.S | re.M):
        try:
            house = House(co_index)
            house.bu_num = code[1]
            for attr_name, pattern in title_patterns:
                setattr(house, attr_name, re.search(pattern, title).group(1))
            house.co_name = co_name
            house.insert_db()
        except Exception as e:
            print(e)
def get_house_info(self, bu_con):
    """Store one House for every house anchor on a building page.

    Args:
        bu_con: HTML text of the building page.

    The building id is read from the page once. For every anchor matched
    by the XPath, the detail page named by its ``value`` attribute is
    downloaded and parsed. If the download fails the house is skipped. If
    anything else fails (missing ``value``, missing field), a fallback
    record is stored using the anchor ``id`` as the house name.

    Raises:
        AttributeError: when the building id is not found in ``bu_con``.
        IndexError: when the fallback path finds no ``id`` attribute.
    """
    flags = re.S | re.M
    bu_html = etree.HTML(bu_con)
    ho = bu_html.xpath("//tr[@height='30']//span/a")
    bu_id = re.search(r'编号.*?>(\d+)<', bu_con, flags).group(1)
    for ho_info in ho:
        # Fresh record per anchor: a shared instance would carry the
        # previous house's fields into a fallback record.
        house = House(co_index)
        try:
            ho_detail = "http://www.hcsfcglj.com/Templets/BoZhou/aspx/" + ho_info.xpath(
                "./@value")[0]
            try:
                ho_res = requests.get(ho_detail, headers=self.headers)
                ho_con = ho_res.text
            except Exception as e:
                print("co_index={},房屋详情页{}请求失败".format(
                    co_index, ho_detail))
                print(e)
                continue
            house.ho_name = re.search('房号.*?<td>(.*?)<', ho_con, flags).group(1)
            house.ho_floor = re.search('所在层.*?<td>(.*?)<', ho_con, flags).group(1)
            house.ho_share_size = re.search('分摊共有面积.*?<td>(.*?)<', ho_con, flags).group(1)
            house.ho_build_size = re.search('建筑面积.*?<td>(.*?)<', ho_con, flags).group(1)
            house.ho_true_size = re.search('套内面积.*?<td>(.*?)<', ho_con, flags).group(1)
            house.ho_type = re.search('房屋用途.*?<td>(.*?)<', ho_con, flags).group(1)
            house.bu_num = re.search('幢号.*?<td>(.*?)<', ho_con, flags).group(1)
            house.bu_id = bu_id
        except Exception as e:
            # Best-effort fallback: keep the house with whatever was parsed
            # and name it after the anchor id.
            print("co_index={}, house detail parse failed, using anchor id".format(
                co_index), e)
            house.ho_name = ho_info.xpath("./@id")[0]
            house.bu_id = bu_id
        house.insert_db()
def get_build_url_list(self, url_list):
    """Crawl project list pages down to individual houses.

    Args:
        url_list: absolute URLs of the project list pages.

    For every list page (decoded as GBK):
      1. each project block becomes a Comm record and the module-level
         ``count`` is incremented and printed;
      2. the project page linked from the block is fetched and each row of
         its building table becomes a Building record;
      3. the building page's iframe URL is rewritten from ``Default`` to
         ``GetData`` to obtain the logic building id, which is used to
         request the room list; each ``<ROOM_NUMBER>`` becomes a House.

    A failure at any level is reported with ``print`` and only the item at
    that level is skipped.
    """
    global count
    flags = re.S | re.M
    for i in url_list:
        try:
            list_html = requests.get(i).content.decode('gbk')
            for k in re.findall('项目名称.*?</dl>', list_html, flags):
                try:
                    c = Comm(self.co_index)
                    # Reused for the buildings and houses of this project.
                    co_name = re.search('html">(.*?)</a>', k, flags).group(1)
                    c.co_name = co_name
                    c.co_address = re.search('class="address"(.*?)</dd>', k, flags).group(1)
                    c.area = re.search('"city">(.*?)</dd>', k, flags).group(1)
                    c.co_develops = re.search('"average">(.*?)</dd>', k, flags).group(1)
                    c.insert_db()
                    count += 1
                    print(count)
                    comm_path = re.search('a href="(.*?)">', k, flags).group(1)
                    comm_html = requests.get(
                        self.url_source + comm_path).content.decode('gbk')
                    build_info_str = re.search(
                        '楼盘表</td>(.*?)合 计', comm_html, flags).group(1)
                    for j in re.findall('<tr.*?</tr>', build_info_str, flags):
                        try:
                            b = Building(self.co_index)
                            b.co_name = co_name
                            b.bu_all_house = re.search(
                                'absmiddle" />(.*?)</a>', j, flags).group(1)
                            # Reused for every house of this building.
                            bu_num = re.search(
                                '="absmiddle" />(.*?)</a></strong></', j, flags).group(1)
                            b.bu_num = bu_num
                            b.bu_build_size = re.search(
                                'td class="t_c">.*?td class="t_c">(.*?㎡)</td>',
                                j, flags).group(1)
                            b.insert_db()
                            build_path = re.search('a href="(.*?)"', j, flags).group(1)
                            build_html = requests.get(
                                self.url_source + build_path).content.decode('gbk')
                            # The house table lives in an iframe; its data
                            # endpoint swaps 'Default' for 'GetData'.
                            house_url = self.url_source + re.search(
                                '<iframe.*?"(.*?)"', build_html, flags).group(1)
                            logic_house_url = house_url.replace('Default', 'GetData')
                            logic_house_html = requests.get(
                                url=logic_house_url).content.decode()
                            logic_id = re.search(
                                '<LOGICBUILDING_ID>(.*?)<', logic_house_html,
                                flags).group(1)
                            final_url = ('http://www.yingtanfdc.com/website/presale/home/HouseTableControl/GetData.aspx?LogicBuilding_ID='
                                         + logic_id)
                            final_html = requests.get(
                                url=final_url).content.decode('gbk')
                            for room_number in re.findall(
                                    '<ROOM_NUMBER>(.*?)</ROOM_NUMBER>',
                                    final_html, flags):
                                try:
                                    h = House(self.co_index)
                                    h.info = final_html
                                    h.ho_name = room_number
                                    h.co_name = co_name
                                    h.bu_num = bu_num
                                    h.insert_db()
                                except Exception as e:
                                    print('co_index={}, house error, room={}'.format(
                                        self.co_index, room_number), e)
                        except Exception as e:
                            print('co_index={}, building error'.format(
                                self.co_index), e)
                except Exception as e:
                    print('co_index={}, community error'.format(
                        self.co_index), e)
        except Exception as e:
            print('co_index={}, list page error, url={}'.format(
                self.co_index, i), e)
def comm_crawler(self, comm_url, co_develops, co_pre_sale, co_name, co_pre_sale_date):
    """Crawl one community on hu.tmsf.com: its buildings and their houses.

    Args:
        comm_url: URL of the community page.
        co_develops, co_pre_sale, co_name, co_pre_sale_date: forwarded
            unchanged to ``self.comm_info`` with the page's property id.

    The property id and ``sid`` are read from hidden inputs. Every anchor
    in ``#building_dd`` except the first becomes a Building. For each
    building the paginated price list is walked, and every house linked
    from it is downloaded and stored. A house that fails to download or
    parse is reported with ``print`` and skipped.

    Raises:
        IndexError: when the hidden inputs or anchor attributes are
            missing.
        AttributeError: when a building id or the page counter cannot be
            found.
    """
    comm_res = requests.get(comm_url, headers=self.headers)
    comm_html = etree.HTML(comm_res.text)
    value = comm_html.xpath("//input[@id='propertyid']/@value")[0]
    sid = comm_html.xpath("//input[@id='sid']/@value")[0]
    # The first anchor is skipped (presumably a non-building entry;
    # TODO confirm against the live page).
    building_links = comm_html.xpath("//div[@id='building_dd']//a")[1:]
    self.comm_info(co_develops, co_pre_sale, co_name, co_pre_sale_date, value)
    for bu_ in building_links:
        # Fresh record per building; nothing is shared between inserts.
        bu = Building(co_index)
        bu.bu_num = bu_.xpath("./text()")[0]
        bu_id = bu_.xpath("./@id")[0]
        bu.bu_id = re.search(r'\d+', bu_id).group(0)
        bu.co_id = value
        bu.insert_db()
        detail_url = ("http://hu.tmsf.com/newhouse/property_" + str(sid)
                      + "_" + str(value) + "_price.htm?buildingid="
                      + str(bu.bu_id))
        page_html = requests.get(detail_url, headers=self.headers)
        page = re.search(r'页数 \d+/(\d+)', page_html.text).group(1)
        for i in range(1, int(page) + 1):
            # Build each page URL from the base URL. The base already has
            # a query string, so the page number is joined with '&'.
            page_url = detail_url + "&page=" + str(i)
            detail_res = requests.get(page_url, headers=self.headers)
            list_html = etree.HTML(detail_res.text)
            house_url_list = list_html.xpath("//td[@width='100']/a/@href")
            house_bu_num = list_html.xpath("//td[@width='100']/a/text()")
            house_name = list_html.xpath(
                "//td[@width='101'][1]/a/div/text()")
            # Iterate over every listed house, first and last included.
            for index, house_path in enumerate(house_url_list):
                try:
                    ho = House(co_index)
                    ho.bu_num = house_bu_num[index]
                    house_url = "http://hu.tmsf.com" + house_path
                    house_res = requests.get(house_url, headers=self.headers)
                    house_html = house_res.text
                    ho.bu_id = bu.bu_id
                    ho.co_id = re.search(
                        r'楼盘主页.*?_\d+_(\d+)_info', house_html).group(1)
                    ho.ho_name = house_name[index]
                    ho.ho_type = re.search(
                        '房屋用途:.*?>(.*?)<', house_html).group(1)
                    ho.ho_floor = re.search('第(.*?)层', house_html).group(1)
                    # Numeric values are taken from the class attributes
                    # inside each span and handed to self.number
                    # (presumably it decodes them into digits; confirm).
                    build_text = re.search(
                        '建筑面积:(.*?)平方米', house_html).group(1)
                    build_num = re.findall('class="(.*?)"', build_text)
                    ho.ho_build_size = self.number(build_num)
                    size_text = re.search(
                        '套内面积:(.*?)平方米', house_html).group(1)
                    size_num = re.findall('class="(.*?)"', size_text)
                    ho.ho_true_size = self.number(size_num)
                    price_text = re.search(
                        '总 价:(.*?)万元', house_html).group(1)
                    price_num = re.findall('class="(.*?)"', price_text)
                    ho.ho_price = self.number(price_num)
                    ho.insert_db()
                except Exception as e:
                    print('co_index={}, house error, url={}'.format(
                        co_index, house_path), e)
                    continue