def comm_info(self, comm_url_list):
    """Crawl each community detail page on as.gzfcxx.cn and persist a Comm.

    comm_url_list: relative URLs, each carrying a 'yszh' id in its query
    string.  For every page that parses successfully, the first a.a3 link
    is followed to crawl the building info.
    """
    for comm_url in comm_url_list:
        try:
            url = "http://as.gzfcxx.cn" + comm_url
            res = requests.get(url, headers=self.headers)
            co = Comm(co_index)  # co_index: module-level crawler id — confirm
            co.co_name = re.search('项目名称.*?ck">(.*?)<', res.text, re.S | re.M).group(1)
            co.co_id = re.search('yszh=(\d+)', comm_url).group(1)
            co.co_develops = re.search('开发商.*?ck">(.*?)<', res.text, re.S | re.M).group(1)
            co.co_address = re.search('坐落.*?ck">(.*?)<', res.text, re.S | re.M).group(1)
            co.co_pre_sale = re.search('许可证.*?ck">(.*?)<', res.text, re.S | re.M).group(1)
            co.co_handed_time = re.search('交房时间.*?ck">(.*?)<', res.text, re.S | re.M).group(1)
            co.insert_db()
            html = etree.HTML(res.text)
            build_detail = html.xpath("//a[@class='a3']/@href")[0]
        except Exception as e:
            # BUG FIX: the exception was passed as a stray positional arg
            # to log.error; format it into the message like the rest of
            # this file does.
            log.error('小区信息错误{}'.format(e))
            continue
        self.build_info(build_detail, co.co_id)
def start_crawler(self):
    """Crawl the community list on fxfdcw.com and hand each community's
    building links to self.bu_info()."""
    data = {
        # NOTE(review): the real form value could not be recovered from the
        # extracted source dump — confirm the actual "Submit" value before
        # running this crawler.
        "Submit": "(unable to decode value)"
    }
    res = requests.post(self.start_url, data=data, headers=self.headers)
    html = etree.HTML(res.content.decode('gbk'))  # site serves GBK
    comm_url_list = html.xpath("//tr//span[@style='width:270px; color:#006']//a/@href")
    for comm_url in comm_url_list:
        try:
            url = 'http://www.fxfdcw.com/' + comm_url
            com_res = requests.get(url, headers=self.headers)
            con = com_res.content.decode('gbk')
            co = Comm(co_index)  # co_index presumably a module-level id — confirm
            co.co_id = re.search('xmid=(\d+)', comm_url).group(1)
            co.co_name = re.search('项目名称.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_develops = re.search('开发企业:(.*?)  ', con, re.S | re.M).group(1)
            co.co_address = re.search('项目地址.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_build_size = re.search('建筑面积.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_all_house = re.search('总套数.*?">(.*?)</', con, re.S | re.M).group(1)
            co.insert_db()
            # Building links open via window.open('...') in the page script.
            bu_list = re.findall("window.open\('(.*?)'\)", con, re.S | re.M)
        except Exception as e:
            # log.error("小区信息错误{}".format(e))
            print("小区信息错误{}".format(e))
            continue
        self.bu_info(bu_list, co.co_id)
def baiyin_start(self):
    """Walk every listing page and crawl each community row on it."""
    page = self.get_all_page()
    print(page)
    for page_no in range(1, int(page) + 1):
        res = requests.get(self.url + '?page=' + str(page_no), headers=self.headers)
        tree = etree.HTML(res.content.decode('gbk'))
        rows = tree.xpath('//tr[@align="center"]')
        # rows[0] is the table header; skip it.
        for row in rows[1:]:
            try:
                comm = Comm(self.CO_INDEX)
                href = row.xpath('td/a/@href')
                area_cells = row.xpath('td[1]/text()')
                comm.area = area_cells[0] if area_cells else None
                self.get_comm_detail(href[0], comm)
            except Exception as e:
                fallback = row.xpath('td/a/@href')
                if not fallback:
                    continue
                comm_url = self.URL_FRONT + fallback[0]
                print('小区错误:', comm_url)
                print(e)
def start_crawler(self):
    """Walk the paginated certification list on newhouse.ntfdc.net and
    insert one Comm record per table row."""
    # NOTE(review): `url` is not defined in this method or visible here —
    # presumably a module-level constant for the first listing page; confirm.
    res = requests.get(url, headers=self.headers)
    content = res.text
    # Total page count, rendered like "页数:1/12 ".
    page = re.search('页数:1/(.*?) ', content, re.S | re.M).group(1)
    for i in range(1, int(page) + 1):
        page_url = 'http://newhouse.ntfdc.net/house_certification.aspx?p=' + str(
            i)
        response = requests.get(page_url, headers=self.headers)
        html = response.text
        comm_html = re.search('class="layer-bd tb-style1">.*?</table>', html, re.S | re.M).group()
        # Drop the header row.
        comm_info_list = re.findall('<tr>.*?</tr>', comm_html, re.S | re.M)[1:]
        for info in comm_info_list:
            try:
                comm = Comm(co_index)
                # Columns are located positionally: each extra '<td.*?'
                # repetition skips one more cell.
                comm.co_pre_sale = re.search('<td.*?>(.*?)<', info, re.S | re.M).group(1)
                comm.co_name = re.search('<td.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                comm.co_all_size = re.search('<td.*?<td.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                comm.co_type = re.search(
                    '<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                comm.co_pre_sale_date = re.search(
                    '<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                comm.co_develops = re.search(
                    '<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                comm.insert_db()
            except Exception as e:
                print('小区错误,co_index={},url={}'.format(co_index, page_url), e)
def start_crawler(self):
    """Crawl all community links on ggsfcw.com, store each Comm, then
    crawl its building list."""
    res = requests.get(self.start_url, headers=self.headers)
    html = etree.HTML(res.text)
    for comm_url in html.xpath("//div[@class='post']//a/@href"):
        try:
            comm_res = requests.get('http://www.ggsfcw.com/' + comm_url, headers=self.headers)
            page_text = comm_res.text
            com_html = etree.HTML(page_text)
            comm = Comm(co_index)
            comm.co_name = re.search('<h3.*?">(.*?)</', page_text).group(1)
            comm.co_id = re.search('n=(\d+)', page_text).group(1)
            comm.co_address = re.search('地址.*?">(.*?)</', page_text).group(1)
            comm.area = re.search('区县.*?">(.*?)</', page_text).group(1)
            comm.co_develops = re.search('开发商.*?">(.*?)</', page_text).group(1)
            comm.co_use = re.search('规划用途.*?">(.*?)</', page_text).group(1)
            comm.insert_db()
        except Exception as e:
            log.error("小区信息错误", e)
            continue
        bu_list = com_html.xpath("//div[@id='MainContent_divResult']/a")
        self.build_info(bu_list, comm.co_id)
def get_comm_info(self, comm_html_list):
    """Parse one Comm per <tr> fragment and follow its detail link.

    Each item is the HTML of a table row; fields are located positionally
    with chained '<td.*?' regexes.
    """
    for i in comm_html_list:
        try:
            comm = Comm(co_index)
            comm.co_name = re.search('<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
            comm.co_develops = re.search('<td.*?><a.*?>(.*?)<', i, re.S | re.M).group(1)
            comm.co_address = re.search('<td.*?<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
            detail_url = re.search('href="(.*?)"', i, re.S | re.M).group(1)
            self.get_comm_detail(detail_url, comm)
        except Exception as e:
            # ROBUSTNESS FIX: previously a single malformed row aborted the
            # whole batch with an uncaught exception; log and continue,
            # matching the other row parsers in this file.
            print('小区错误,co_index={},html_str={}'.format(co_index, i), e)
def get_comm_detail(self, comm_list):
    """Fetch each project page, persist its Comm, then crawl its buildings."""
    global count
    base = 'http://house.bffdc.gov.cn/public/project/'
    flags = re.S | re.M
    for item in comm_list:
        try:
            comm = Comm(co_index)
            comm_url = base + item
            html = requests.get(comm_url).text
            comm.co_name = re.search('PROJECT_XMMC">(.*?)<', html, flags).group(1)
            comm.co_develops = re.search('PROJECT_KFQY_NAME">(.*?)<', html, flags).group(1)
            comm.co_address = re.search('PROJECT_XMDZ">(.*?)<', html, flags).group(1)
            comm.area = re.search('PROJECT_SZQY">(.*?)<', html, flags).group(1)
            comm.co_pre_sale = re.search('YSXKZH">(.*?)<', html, flags).group(1)
            comm.insert_db()
            build_info = re.search('id="buildInfo".*?value="(.*?)"', html, flags).group(1)
            # Building URLs are packed into one hidden field, ';;'-separated.
            self.get_build_info(build_info.split(';;'), comm.co_name)
            count += 1  # crude module-level progress counter
            print(count)
        except Exception as e:
            print(e)
def start_crawler(self):
    """POST a captured, URL-encoded XML query to fetch the ProjectIntroduce
    grid, then parse one Comm per result row and crawl its buildings."""
    querystring = {"_method": "GetDataToDynamicInXml", "_session": "rw"}
    # Opaque percent-encoded XML blob captured from the site: page 1,
    # 15 rows, ordered by Name, query code "ProjectIntroduce".  Do not
    # hand-edit; re-capture from the browser if the site changes.
    payload = "xmlInfo=%263Croot%2620QueryCode%263D%2622ProjectIntroduce%2622%2620PageIndex%263D%26221%2622%2620PageSize%263D%262215%2622%2620SortField%263D%2622%2620ORDER%2620BY%2620Name%2622%2620QueryString%263D%2622QueryCode%263DProjectIntroduce%2626amp%263BShowModeCode%263Ddefault%2622%2620BeginDate%263D%2622%262000%263A00%263A00%2622%2620EndDate%263D%2622%262023%263A59%263A59%2622%2620Flag%263D%2622TitleBody%2622%2620TitlesWidthInfo%263D%2622EnterPriseName%267C0%2624Name%267C0%2624Location%267C0%2624SoilUse%267C0%2622%2620IsUseOCache%263D%26220%2622%2620IsUserID%263D%26220%2622%2620SiteId%263D%26228907bd13-1d14-4f9e-8c01-e482d9590d10%2622%2620LockedColumn%263D%26220%2622%2620IsLocked%263D%26220%2622%2620ClientWidth%263D%26221601%2622%2620ShowModeCode%263D%2622default%2622%2620Language%263D%2622chinese%2622/%263E"
    # NOTE(review): `url` is not defined in this method — presumably a
    # module-level constant for the endpoint; confirm before running.
    response = requests.request("POST", url, data=payload, params=querystring)
    html = response.text
    comm_info_list = re.findall('class="tdctfield tdctwidthset ".*?</tr>', html, re.S | re.M)
    for i in comm_info_list:
        comm = Comm(co_index)
        # Fields are located positionally by repeating 'class="spanctfield"'.
        comm.co_develops = re.search('class="spanctfield".*?>(.*?)<', i, re.S | re.M).group(1)
        comm.co_name = re.search(
            'class="spanctfield".*?class="spanctfield".*?<a.*?>(.*?)<', i, re.S | re.M).group(1)
        comm.co_address = re.search(
            'class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?>(.*?)<', i,
            re.S | re.M).group(1)
        comm.co_type = re.search(
            'class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?>(.*?)<',
            i, re.S | re.M).group(1)
        comm.co_id = re.search('EnterPriseName_(.*?)"', i, re.S | re.M).group(1)
        comm.insert_db()
        self.get_build_info(comm.co_id)
def co_parse(self, url_list):
    """Crawl each property's info page, store the Comm, then walk its
    building list and crawl each building's houses.

    url_list: lxml <a> elements whose href looks like
    '...property_<sid>_<propertyid>_info...'.
    """
    for url in url_list:
        try:
            co_url = url.xpath("./@href")[0]
            new_url = "http://tmsf.qzfdcgl.com" + co_url
            co_res = requests.get(new_url, headers=self.headers)
            con = co_res.text
            co = Comm(co_index)
            co.co_id = re.search('property_(.*?)_info', co_url).group(1)
            co.co_name = re.search('楼盘名称:</span>(.*)', con).group(1)
            co.co_develops = re.search('项目公司:</span>(.*)', con).group(1)
            co.co_address = re.search('物业地址:</span>(.*?)</p', con, re.S | re.M).group(1)
            co.area = re.search('所属城区:</span>(.*)', con).group(1)
            co.insert_db()
            sid = re.search('property_(\d+)_', co_url).group(1)
            propertyid = re.search('(\d+)_info', co_url).group(1)
            # The price page of the same property lists its buildings.
            bu_url = new_url.replace('info', 'price')
            res = requests.get(bu_url, headers=self.headers)
            bu_html = etree.HTML(res.text)
            bu_idlist = bu_html.xpath("//dd[@id='building_dd']/a")
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit / KeyboardInterrupt; narrowed to Exception.
            continue
        for bu_node in bu_idlist[1:]:
            raw_id = bu_node.xpath("./@id")[0]  # renamed: don't shadow builtin `id`
            bu_id = re.search('.*?(\d+)', raw_id).group(1)
            bu = Building(co_index)
            bu.bu_id = bu_id
            bu.co_id = co.co_id
            bu.bu_num = bu_node.xpath("./text()")[0]
            bu.insert_db()
            self.house_parse(bu_id, co.co_id, sid, propertyid)
def comm_info(self, co_develops, co_pre_sale, co_name, co_pre_sale_date, sid):
    """Build a Comm record from pre-extracted field values and persist it."""
    record = Comm(co_index)
    record.co_id = sid
    record.co_name = co_name
    record.co_pre_sale = co_pre_sale
    record.co_pre_sale_date = co_pre_sale_date
    record.co_develops = co_develops
    record.insert_db()
def start_crawler(self):
    """Page through the community list on bdfdc.net, persist each Comm,
    then crawl its building links."""
    b = AllListUrl(first_page_url=self.start_url,
                   request_method='get',
                   analyzer_type='regex',
                   encode='utf-8',
                   page_count_rule='共(\d+)页',
                   )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        # BUG FIX: was `'?pageIndex=2' + str(page)`, which requested the
        # same bogus page (e.g. "pageIndex=27") on every iteration instead
        # of the current page i.
        url = self.start_url + '?pageIndex=' + str(i)
        page_res = requests.get(url, headers=self.headers)
        html = etree.HTML(page_res.text)
        comm_info_list = html.xpath("//ul/li/div")
        for comm_info in comm_info_list:
            try:
                co = Comm(co_index)
                co.co_name = comm_info.xpath("./p/a/text()")[0]
                deve = comm_info.xpath("./p[2]/text()")[0]
                addr = comm_info.xpath("./p[3]/text()")[0]
                co.co_develops = re.search('开发商:(.*)', deve).group(1)
                co.co_address = re.search('楼盘地址.*?:(.*)', addr).group(1)
                comm_url = comm_info.xpath("./p/a/@href")[0]
                co.co_id = re.search('projectId=(\d+)', comm_url).group(1)
                co.insert_db()
                co_url = 'http://www.bdfdc.net' + comm_url
                co_res = requests.get(co_url, headers=self.headers)
                time.sleep(5)  # throttle: be polite to the server
                bu_html = etree.HTML(co_res.text)
                bu_url_list = bu_html.xpath("//div[@style]/a")[1:]
            except Exception as e:
                # log.error("小区信息错误{}".format(e))
                print("小区信息错误{}".format(e))
                continue
            self.bu_info(bu_url_list, co.co_id)
def get_data_obj(self, analyzer, co_index):
    """Factory: map an analyzer name to its data object.

    Returns None for unrecognized names, matching the original if/elif
    chain falling through.
    """
    factories = {'comm': Comm, 'build': Building, 'house': House}
    cls = factories.get(analyzer)
    return cls(co_index) if cls is not None else None
def comm(self, tag):
    """Extract one community from a table row <tr>, persist it, and
    return (building-list URL, co_id)."""
    global count
    co = Comm(co_index)
    co.co_name = tag.xpath("./td[@width='143']/a/text()")[0]
    co.area = tag.xpath("./td[@width='184']/text()")[0]
    co.co_develops = tag.xpath("./td[@width='192']/text()")[0]
    detail_href = tag.xpath("./td/a/@href")[0]
    co.co_id = re.search('mmcid=(\d+)&', detail_href).group(1)
    co.co_open_time = tag.xpath("./td[@width='95']/text()")[0]
    buid_all_url = "http://www.syfc.com.cn" + detail_href
    co.insert_db()
    count += 1  # crude module-level progress counter
    print(count)
    return buid_all_url, co.co_id
def start(self):
    """Crawl every community link in the news list (co_index fixed at 8)."""
    response = requests.get(self.url, headers=self.headers)
    tree = etree.HTML(response.text)
    for href in tree.xpath('//ul[@class="NewsList"]/li/a/@href'):
        comm = Comm(8)
        comm_url = 'http://www.cxsfdcglzx.com/touming/' + href
        print(comm_url)
        self.get_comm_info(comm_url, comm)
def start_crawler(self):
    """Scrape a fixed set of project pages (self.url is a list of URLs)."""
    flags = re.S | re.M
    for page_url in self.url:
        html = requests.get(url=page_url).content.decode()
        c = Comm(self.co_index)
        c.co_name = re.search('楼盘名称:</h5></td><td><span>(.*?)<', html, flags).group(1)
        c.co_develops = re.search(
            '开发建设单位:</h5></td><td><span>(.*?)</span>', html, flags).group(1)
        c.co_address = re.search('项目位置:</h5></td><td><span>(.*?)</span>', html, flags).group(1)
        c.co_build_size = re.search(
            '建筑面积:</h5></td><td><span>(.*?)</span>', html, flags).group(1)
        print(c.to_dict())
        c.insert_db()
def start(self):
    """Page through fzfgj.cn listings, pairing each community link with
    its district text (co_index fixed at 11)."""
    pager = AllListUrl(first_page_url=self.url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='gbk',
                       page_count_rule='共(.*?)页',
                       )
    page = pager.get_page_count()
    for page_no in range(1, int(page) + 1):
        response = requests.get(url=self.url + '&page=' + str(page_no),
                                headers=self.headers)
        tree = etree.HTML(response.text)
        comm_url_list = tree.xpath('//dt[@class="name"]/a/@href')
        area_list = tree.xpath('//dl[@class="houseList_n"]/dd[3]/text()')
        # Indexed access kept on purpose: a short area_list raises
        # IndexError per-item, which the except below reports.
        for idx, href in enumerate(comm_url_list):
            url = 'http://www.fzfgj.cn/' + href
            try:
                comm = Comm(11)
                comm.area = area_list[idx].replace('所属区域:', '')
                self.get_comm_info(url, comm)
            except BaseException as e:
                print('小区错误,co_index={},url={}'.format(co_index, url), e)
def start_crawler(self):
    """Walk all listing pages, parse each community row, and persist it."""
    for page_url in self.get_all_page_url():
        res = requests.get(page_url, headers=self.headers)
        html = res.content.decode('gb2312')
        # Slice out the rows between the column header and the pager.
        info_list = re.search('可售套数(.*?)<!--进行翻页显示和处理-->', html, re.S | re.M).group(1)
        for info in re.findall('<tr.*?</tr>', info_list, re.S | re.M):
            try:
                comm = Comm(1)
                detail_href = re.search('<a href="(.*?)">', info, re.S | re.M).group(1)
                comm.area = re.findall('<td align="center">(.*?)</td>', info, re.S | re.M)[1]
                full_url = 'http://www.bsfcj.com/PubInfo/' + detail_href
                comm = self.get_comm_detail(full_url, comm)
                comm.insert_db()
            except Exception as e:
                print('小区列表页解析有错,co_index={},'.format(self.co_index), e)
def get_comm_info(self, comm_info_list):
    """Parse name / total-house-count / total-size from each row HTML and
    persist one Comm per row."""
    flags = re.S | re.M
    for row in comm_info_list:
        try:
            comm = Comm(co_index)
            comm.co_name = re.search('<td>(.*?)</td>', row, flags).group(1)
            comm.co_all_house = re.search('<td.*?<td>(.*?)</td>', row, flags).group(1)
            comm.co_all_size = re.search('<td.*?<td.*?<td>(.*?)</td>', row, flags).group(1)
            comm.insert_db()
        except Exception as e:
            print('小区错误,co_index={},html_str={}'.format(co_index, row), e)
def get_comm_info(self, comm_url_list):
    """Crawl fjnpfdc.com community pages via ProducerListUrl.

    Unusual pattern: the Comm attributes below are assigned regex
    PATTERNS, not values.  comm.to_dict() is handed to ProducerListUrl as
    its analyzer rules; that helper fetches the page, applies each
    pattern, and also extracts the BuildingInfo links.
    """
    for i in comm_url_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://www.fjnpfdc.com/House/' + i
            # Extraction rules — applied by ProducerListUrl, not here.
            comm.co_develops = '公司名称:.*?<td.*?>(.*?)<'
            comm.co_pre_sale = '预售许可证:.*?<td.*?>(.*?)<'
            comm.co_name = '项目名称:.*?<td.*?>(.*?)<'
            comm.co_address = '项目坐落:.*?<td.*?>(.*?)<'
            comm.co_use = '规划用途:.*?<td.*?>(.*?)<'
            comm.co_build_size = '建筑面积:.*?<td.*?>(.*?)<'
            comm.co_id = 'ProjectId=(.*?)&'
            p = ProducerListUrl(
                page_url=comm_url,
                request_type='get',
                encode='gbk',
                analyzer_rules_dict=comm.to_dict(),
                current_url_rule="<a href='(BuildingInfo.*?)'",
                analyzer_type='regex',
                headers=self.headers)
            build_url_list = p.get_details()
            self.get_build_info(build_url_list)
        except Exception as e:
            print("co_index={},小区{}错误".format(co_index, i), e)
def get_comm_info(self, html):
    """Pull the pre-sale residential project table out of `html` and crawl
    each project's detail page (co_index fixed at 2)."""
    # Table section following the "预售商品房住宅项目公示" heading.
    html_info = re.search('预售商品房住宅项目公示(.*?)</table>', html).group(1)
    # NOTE(review): 'ahref' (no space) looks like markup mangled during
    # extraction — verify this pattern against the live page.
    comm_list = re.findall(
        '<td(.*?)ahref="(.*?)">(.*?)</a(.*?)<ahref="(.*?)">(.*?)</a></td><td(.*?)>(.*?)</td></tr>',
        html_info)
    for i in comm_list:
        try:
            comm = Comm(2)
            url = 'http://www.bjjs.gov.cn/' + i[1]  # capture group 2: project link
            self.get_comm_detail(url, comm)
            global count  # crude module-level progress counter
            count += 1
            print(count)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, url), e)
def get_comm_info(self, comm_detail_url_list):
    """Crawl ndjsj.gov.cn (Ningde) community pages via ProducerListUrl.

    Same pattern as the Nanping crawler: the Comm attributes hold regex
    PATTERNS, and comm.to_dict() is handed to ProducerListUrl as its
    analyzer rules; that helper fetches the page and applies them.
    """
    for i in comm_detail_url_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://www.ndjsj.gov.cn' + i
            # Extraction rules — applied by ProducerListUrl, not here.
            comm.co_develops = '公司名称:.*?<td.*?>(.*?)<'
            comm.co_name = '项目名称:.*?<td.*?>(.*?)<'
            comm.co_pre_sale = '预售许可证:.*?<td.*?>(.*?)<'
            comm.co_address = '项目坐落:.*?<td.*?>(.*?)<'
            comm.co_use = '规划用途:.*?<td.*?>(.*?)<'
            comm.co_size = '占地面积:.*?<td.*?>(.*?)<'
            comm.co_build_size = '建筑面积:.*?<td.*?>(.*?)<'
            p = ProducerListUrl(
                page_url=comm_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=comm.to_dict(),
                current_url_rule="(BuildingInfo\?BuildingId=.*?)'",
                analyzer_type='regex',
                headers=self.headers)
            build_url_list = p.get_details()
            self.get_build_info(build_url_list)
        except Exception as e:
            # NOTE(review): if Comm(co_index) itself raises, comm_url is
            # unbound here and this print raises NameError — confirm.
            print('宁德小区错误,url={}'.format(comm_url), e)
def get_comm_detail(self, comm_detail_url):
    """Fill pre-sale fields on a fresh Comm from the detail page.

    Always returns the Comm — fully populated on success, partially (or
    empty) if the fetch/parse failed.
    """
    comm = Comm(co_index)
    try:
        html = requests.get(comm_detail_url, headers=self.headers).text
        flags = re.S | re.M
        comm.co_pre_sale = re.search('预售许可证号:.*?<td.*?>(.*?)<', html, flags).group(1)
        comm.co_land_use = re.search('土地使用权证号及用途:.*?<td.*?>(.*?)</td', html, flags).group(1)
        comm.co_build_size = re.search('本期预售总建筑面积:.*?<td.*?>(.*?)</td', html, flags).group(1)
        comm.co_all_house = re.search('本期总单元套数:.*?<td.*?>(.*?)</td', html, flags).group(1)
        comm.co_pre_sale_date = re.search('发证日期:.*?<td.*?>(.*?)</td', html, flags).group(1)
    except Exception as e:
        print('小区详情错误,co_index={},url={}'.format(co_index, comm_detail_url), e)
    return comm
def get_comm_info(self, comm_url_list):
    """Fetch each community URL and scrape its pre-sale fields.

    BUG FIX: the loop variable was `i` but the request used an undefined
    name `comm_url`, so every iteration raised NameError.  Each item is
    now also wrapped in try/except so one bad page cannot abort the batch.

    NOTE(review): nothing here calls comm.insert_db(), so the parsed
    records are never persisted — confirm whether that is intentional.
    """
    for comm_url in comm_url_list:
        try:
            comm = Comm(co_index)
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_pre_sale = re.search('预销售许可证号.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_develops = re.search('开发建设单位.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_handed_time = re.search('发证日期.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_name = re.search('项 目 名 称.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_address = re.search('项 目 座 落.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
        except Exception as e:
            print('小区错误,url={}'.format(comm_url), e)
def start(self):
    """Page through czhome.com.cn communities (co_index fixed at 7)."""
    page = self.get_all_page()
    for page_no in range(1, int(page) + 1):
        url = 'http://www.czhome.com.cn/complexPro.asp?page=' + str(
            page_no
        ) + '&districtID=0&projectAdr=&projectName=&buildingType=0&houseArea=0&averagePrice=0&selState=-1'
        response = requests.get(url, headers=self.headers)
        html = response.content.decode('gbk')
        tree = etree.HTML(html)
        comm_url_list = tree.xpath('//*[@id="Table8"]/tr/td[2]/a/@href')
        # Renamed the inner loop variable: it used to shadow the page
        # counter `i`.
        for href in comm_url_list:
            # BUG FIX: build comm_url before anything that can raise, so
            # the except handler below can never hit an unbound name.
            comm_url = 'http://www.czhome.com.cn/' + href
            try:
                comm = Comm(7)
                self.get_comm_info(comm_url, comm)
            except Exception as e:
                print("co_index={},小区:{}无法提取".format(co_index, comm_url))
                print(e)
def start_crawler(self):
    """For every district id, POST to fetch that district's paging URLs,
    then parse one Comm per row on every page of results."""
    for i in self.area_list:
        data = {'districtID': i}
        res = requests.post(url='http://www.fangdi.com.cn/complexPro.asp', data=data)
        html_str = res.content.decode('gbk')
        # From the response, collect the per-district pagination URLs.
        url_list = re.findall('value="(/complexpro.*?)"', html_str, re.S | re.M)
        for k in url_list:
            response = requests.get('http://www.fangdi.com.cn' + k, headers=self.headers)
            html = response.content.decode('gbk')
            # Slice the results table: from the 位置 column header to the pager.
            comm_html = re.search('位置<.*?页/共', html, re.S | re.M).group()
            comm_info_list = re.findall('<tr valign=.*?</tr>', comm_html, re.S | re.M)
            for info in comm_info_list:
                try:
                    comm = Comm(co_index)
                    comm_url = re.search('<a href=(.*?)>', info, re.S | re.M).group(1)
                    comm.co_name = re.search('<a.*?>(.*?)<', info, re.S | re.M).group(1)
                    # Columns located positionally by chaining '<td.*?'.
                    comm.co_address = re.search('<a.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                    comm.co_all_house = re.search(
                        '<a.*?<td.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                    comm.co_all_size = re.search(
                        '<a.*?<td.*?<td.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                    comm.area = re.search(
                        '<a.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                    comm.co_id = re.search('projectID=(.*?)==', info, re.S | re.M).group(1)
                    self.get_comm_info(comm_url, comm)
                except Exception as e:
                    print(
                        '小区错误,co_index={},url={}'.format(
                            co_index, 'http://www.fangdi.com.cn' + k), e)
def get_comm_info(self, all_html_list):
    """Parse community rows from each HTML page, persist each Comm, then
    crawl its buildings.

    Columns are located positionally by chaining 'align="center".*?'.
    """
    for html in all_html_list:
        # First findall row is the table header; skip it.
        comm_info_paper_list = re.findall('<tr>.*?</tr>', html, re.S | re.M)
        for i in comm_info_paper_list[1:]:
            try:
                comm = Comm(co_index)
                comm.area = re.search('align="center">(.*?)<', i, re.S | re.M).group(1)
                comm.co_name = re.search(
                    'align="center".*?align="center".*?>(.*?)<', i, re.S | re.M).group(1)
                comm.co_address = re.search(
                    'align="center".*?align="center".*?align="center".*?title="(.*?)"', i,
                    re.S | re.M).group(1)
                comm.co_all_house = re.search(
                    'align="center".*?align="center".*?align="center".*?align="center".*?>(.*?)<',
                    i, re.S | re.M).group(1)
                comm.co_id = re.search('projectID=(.*?)&', i, re.S | re.M).group(1)
                comm.insert_db()
                self.get_build_info(comm.co_id)
            except Exception as e:
                # ROBUSTNESS FIX: the try/except used to wrap the whole row
                # loop, so one malformed row skipped every remaining row on
                # the page; it now guards each row individually.
                print('解析错误,co_index={},方法:get_comm_info'.format(co_index), e)
def start(self):
    """Page through funi.com listings and crawl each community's index
    and detail pages."""
    page = self.get_all_page()
    for page_no in range(1, int(page) + 1):
        response = self.request_proxy('http://www.funi.com/loupan/region_0_0_0_0_' + str(page_no))
        tree = etree.HTML(response.text)
        for href in tree.xpath('//dt[@class="clearfix"]/h2/a/@href'):
            comm = Comm(co_index)
            # Keep only the path before any ';jsessionid=...' suffix.
            link = href.split(';')[0]
            detail_url = 'http://www.funi.com/' + link + '/detail.htm'
            comm_index_url = 'http://www.funi.com/' + link
            try:
                comm = self.get_comm_info(comm_index_url, comm)
                self.get_comm_detail(detail_url, comm)
            except Exception as e:
                print('小区错误:co_index={},url={}'.format(co_index, detail_url), e)
def start(self):
    """Page through czfdc.gov.cn pre-sale listings (co_index fixed at 6)."""
    page = self.get_all_page()
    count = 0
    for page_no in range(1, int(page) + 1):
        url = 'http://www.czfdc.gov.cn/spf/gs.php?pageid=' + str(page_no)
        try:
            response = requests.get(url, headers=self.headers)
            tree = etree.HTML(response.content.decode('gbk'))
            for href in tree.xpath('//td[@align="left"]/a/@href'):
                count += 1
                print(count)
                comm = Comm(6)
                comm_url = 'http://www.czfdc.gov.cn/spf/' + href
                self.get_comm_info(comm_url, comm)
        except Exception as e:
            print('co_index={},翻页有问题,url={}'.format(self.co_index, url), e)
            continue
def start(self):
    """Page through fsfc.fsjw.gov.cn search results and crawl each
    community's room view page."""
    pager = AllListUrl(first_page_url=self.url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule='pageTotal = (.*?);',
                       )
    page = pager.get_page_count()
    for page_no in range(1, int(page) + 1):
        listing = requests.get('http://fsfc.fsjw.gov.cn/search/index.do?p=' + str(page_no),
                               headers=self.headers)
        tree = etree.HTML(listing.text)
        for comm_id in tree.xpath('//*[@id="content"]/div[2]/div[1]/dl/dd/h3/a/@value'):
            comm = Comm(co_index)
            url = 'http://fsfc.fsjw.gov.cn/hpms_project/roomView.jhtml?id=' + comm_id
            try:
                response = requests.get(url, headers=self.headers)
            except Exception as e:
                print(e)
                print("co_index={},小区详情页{}请求失败".format(co_index, url))
                continue
            self.get_comm_info(url, response, comm)
def get_comm_info(self, comm_url_list):
    """Crawl each community page on old.newhouse.cnnbfdc.com: persist the
    Comm, then persist every Building row found on the page, and finally
    crawl the house pages."""
    for i in comm_url_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://old.newhouse.cnnbfdc.com/' + i
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_name = re.findall('项目名称:.*?<span.*?>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_address = re.findall('项目地址:.*?<td.*?>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_develops = re.findall('开发公司:.*?<td.*?>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_pre_sale = re.findall('预\(现\)售证名称:.*?<td.*?>(.*?)<', html, re.S | re.M)[0].strip()
            # Strip the unit suffixes the site renders ("m²" / "套").
            comm.co_build_size = re.findall('纳入网上可售面积:.*?<img.*?>(.*?)<', html, re.S | re.M)[0].replace(
                'm²', '').strip()
            comm.co_all_house = re.findall('纳入网上可售套数:.*?<img.*?>(.*?)<', html, re.S | re.M)[0].replace(
                '套', '').strip()
            comm.area = re.findall('所在区县:.*?<td.*?>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_id = re.findall('mobanshow.aspx\?projectid=(.*?)"', html, re.S | re.M)[0].strip()
            comm.insert_db()
            global count  # crude module-level progress counter
            count += 1
            print(count)
            # Four parallel lists, one entry per building row; they are
            # assumed to line up index-for-index (TODO confirm — a missing
            # cell on one row would shift them out of sync).
            build_url_list = re.findall("window.open\('(.*?)'", html, re.S | re.M)
            bu_name_list = re.findall("window.open.*?<font.*?>(.*?)<", html, re.S | re.M)
            bu_all_house_list = re.findall("window.open.*?<td.*?>(.*?)<", html, re.S | re.M)
            qrykey = re.findall("qrykey=(.*?)&", html, re.S | re.M)
            for index in range(len(build_url_list)):
                try:
                    build = Building(co_index)
                    build.bu_name = bu_name_list[index].strip()
                    build.bu_all_house = bu_all_house_list[index].strip()
                    build.co_id = comm.co_id
                    build.bu_id = qrykey[index].strip()
                    build.insert_db()
                except Exception as e:
                    print(e)
            self.get_house_info(build_url_list)
        except Exception as e:
            print(e)