def get_comm_detail(self, comm_list):
    """Fetch each community (小区) detail page on yzfdc.cn and persist it.

    comm_list: iterable of relative URL paths.  Each page is fetched with
    the shared session ``self.s``, the Comm fields are regex-scraped,
    stored via ``insert_db``, and the building-publicity URL is handed to
    ``self.get_build_info``.  Failures are printed per item and skipped.
    """
    for i in comm_list:
        comm_url = 'http://www.yzfdc.cn/' + i
        try:
            # co_index: module-level region identifier defined elsewhere.
            comm = Comm(co_index)
            content = self.s.get(comm_url, headers=self.headers)
            html = content.text
            comm.co_name = re.search('class="zxlp_08".*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_id = re.search(
                'class="zxlp_08" href=.*?ProjectId=(.*?)"', html,
                re.S | re.M).group(1)
            comm.co_develops = re.search('开 发 商:.*?<span.*?>(.*?)<', html,
                                         re.S | re.M).group(1)
            comm.co_type = re.search('项目类型:.*?<span.*?>(.*?)<', html,
                                     re.S | re.M).group(1)
            comm.area = re.search('所属区位:.*?<span.*?>(.*?)<', html,
                                  re.S | re.M).group(1)
            comm.co_build_size = re.search('建筑面积:.*?<span.*?>(.*?)<', html,
                                           re.S | re.M).group(1)
            comm.co_open_time = re.search('开盘日期:.*?<span.*?>(.*?)<', html,
                                          re.S | re.M).group(1)
            comm.co_handed_time = re.search('交付日期:.*?<span.*?>(.*?)<', html,
                                            re.S | re.M).group(1)
            comm.co_address = re.search('项目具体地址:.*?<span.*?>(.*?)<', html,
                                        re.S | re.M).group(1)
            comm.insert_db()
            # Link to the building publicity page, crawled next.
            build_url = re.search(
                '(/BuildingDish_Publicity.aspx\?Projectid=.*?)"', html,
                re.S | re.M).group(1)
            self.get_build_info(build_url, comm.co_id)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_detail(self, comm_url):
    """Scrape a single community page on tz.tmsf.com and store it.

    ``comm_url`` is the relative detail-page path.  After the Comm record
    is written, the building-page URL and the site ``sid`` token are
    forwarded to ``self.get_build_info``.
    """
    page = requests.get('http://tz.tmsf.com' + comm_url,
                        headers=self.headers).content.decode('utf-8')

    def grab(pattern):
        # First capture group of *pattern* against the fetched page.
        return re.search(pattern, page, re.S | re.M).group(1)

    comm = Comm(co_index)
    comm.co_name = grab('<span class="buidname colordg">(.*?)<')
    comm.co_address = grab('楼盘地址:.*?<span.*?>(.*?)<')
    if '[' in comm.co_address:
        # District name is embedded in the address as "[district]".
        comm.area = re.search('\[(.*?)\]', comm.co_address,
                              re.S | re.M).group(1)
    comm.co_type = grab('物业类型:.*?<span title="(.*?)"')
    comm.co_open_time = grab('最新开盘:</strong>(.*?)<')
    comm.co_develops = grab('项目公司:</strong>(.*?)<')
    comm.co_build_type = grab('建筑形式:</strong>(.*?)<')
    comm.co_id = grab('id="propertyid".*?value="(.*?)"')
    comm.insert_db()
    sid = grab('id="sid" name="sid" value="(.*?)"')
    build_url = grab('id="index_bar">楼盘主页.*?href="(.*?)"')
    self.get_build_info(build_url, comm.co_id, sid)
def get_comm_info(self, comm_url_list):
    """Declare regex extraction rules per community and run ProducerListUrl.

    Unlike the direct-scrape methods, each Comm attribute here holds a
    regex *pattern*, not a value; ``ProducerListUrl`` fetches the page
    (GBK), applies the rules, and returns the "more buildings" URL which
    is then handed to ``self.get_build_info``.
    """
    for i in comm_url_list:
        try:
            comm = Comm(co_index)
            # Attribute values below are patterns resolved by ProducerListUrl.
            comm.co_id = '楼盘首页.*?aid-(.*?)/'
            comm.co_name = 'class="ls">(.*?)<'
            comm.co_type = '物业类型</em>(.*?)<'
            comm.area = '区域所属:</em>(.*?)<'
            comm.co_green = '绿 化 率:</em>(.*?)<'
            comm.co_volumetric = '容 积 率:</em>(.*?)<'
            comm.co_build_type = '楼 层:</em>(.*?)<'
            comm.co_size = '占地面积:</em>(.*?)<'
            comm.co_build_size = '建筑面积:</em>(.*?)<'
            comm.co_develops = '开 发 商:</em><.*?target="_blank">(.*?)<'
            comm.co_address = '项目地址:</em>(.*?)<'
            data_list = comm.to_dict()
            p = ProducerListUrl(
                page_url=i,
                request_type='get',
                encode='gbk',
                analyzer_rules_dict=data_list,
                current_url_rule=
                'colspan="3" align="right"><a href="(.*?)"',
                analyzer_type='regex',
                headers=self.headers)
            more_build_url = p.get_details()
            self.get_build_info(more_build_url)
        except Exception as e:
            print(e)
def co_parse(self, url_list):
    """Parse community anchors, store each Comm, then its buildings.

    url_list: lxml anchor elements whose ``href`` points at a
    ``property_<sid>_<propertyid>_info`` page on tmsf.qzfdcgl.com.
    Each building on the matching price page becomes a Building row and
    is forwarded to ``self.house_parse``.
    """
    for url in url_list:
        try:
            co_url = url.xpath("./@href")[0]
            new_url = "http://tmsf.qzfdcgl.com" + co_url
            co_res = requests.get(new_url, headers=self.headers)
            con = co_res.text
            co = Comm(co_index)
            co.co_id = re.search('property_(.*?)_info', co_url).group(1)
            co.co_name = re.search('楼盘名称:</span>(.*)', con).group(1)
            co.co_develops = re.search('项目公司:</span>(.*)', con).group(1)
            co.co_address = re.search('物业地址:</span>(.*?)</p', con,
                                      re.S | re.M).group(1)
            co.area = re.search('所属城区:</span>(.*)', con).group(1)
            co.insert_db()
            # sid / propertyid are both encoded in the detail URL and are
            # needed later by the per-house price queries.
            sid = re.search('property_(\d+)_', co_url).group(1)
            propertyid = re.search('(\d+)_info', co_url).group(1)
            bu_url = new_url.replace('info', 'price')
            res = requests.get(bu_url, headers=self.headers)
            bu_html = etree.HTML(res.text)
            bu_idlist = bu_html.xpath("//dd[@id='building_dd']/a")
        except Exception as e:
            # BUG FIX: was a bare ``except: continue`` which also swallowed
            # SystemExit/KeyboardInterrupt and hid every failure silently.
            print('小区信息错误,co_index={}'.format(co_index), e)
            continue
        # First anchor is the "all buildings" tab; skip it.
        for bu_ in bu_idlist[1:]:
            id = bu_.xpath("./@id")[0]
            bu_id = re.search('.*?(\d+)', id).group(1)
            bu = Building(co_index)
            bu.bu_id = bu_id
            bu.co_id = co.co_id
            bu.bu_num = bu_.xpath("./text()")[0]
            bu.insert_db()
            self.house_parse(bu_id, co.co_id, sid, propertyid)
def comm_info(self, comm_url_list):
    """Scrape community pages on 222.77.178.63:7002 and their presell lists.

    For each relative ``comm_url`` the detail page is fetched (GBK), the
    Comm record is stored, and the project's presell page is scanned for
    building links which are passed to ``self.build_info``.
    """
    for comm_url in comm_url_list:
        try:
            co_url = 'http://222.77.178.63:7002/' + comm_url
            co_res = requests.get(co_url, headers=self.headers)
            con = co_res.content.decode('gbk')
            co = Comm(co_index)
            co.co_id = re.search('projectID=(.*)', comm_url).group(1)
            co.co_name = re.search('项目名称:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.area = re.search('所在区县:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_address = re.search('项目地址:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_develops = re.search('企业名称:.*?blank">(.*?)</', con, re.S | re.M).group(1)
            co.co_all_house = re.search('>总套数.*?">(\d+)<', con, re.S | re.M).group(1)
            co.co_all_size = re.search('>总面积.*?">(.*?)<', con, re.S | re.M).group(1)
            # URL-encode the decoded name for the presell query string.
            project_name = parse.quote(co.co_name)
            co.insert_db()
        except Exception as e:
            # log.error('小区信息错误{}'.format(e))
            print('小区信息错误{}'.format(e))
            # BUG FIX: without this ``continue`` a failed iteration fell
            # through to the code below with an undefined (first failure:
            # NameError) or stale ``co``/``project_name`` from the previous
            # iteration, re-scraping the wrong project.
            continue
        sale_url = ("http://222.77.178.63:7002/Presell.asp?projectID=" +
                    co.co_id + "&projectname=" + project_name)
        res = requests.get(sale_url, headers=self.headers)
        html = etree.HTML(res.content.decode('gbk'))
        temp_url_list = html.xpath("//a/@href")
        self.build_info(co.co_id, temp_url_list)
def start_crawler(self):
    """POST the search form on fxfdcw.com and scrape every community hit."""
    data = {
        # The original form value could not be decoded when this source was
        # captured; presumably the server only checks for the key's presence.
        "Submit":"(unable to decode value)"
    }
    res = requests.post(self.start_url, data=data, headers=self.headers)
    html = etree.HTML(res.content.decode('gbk'))
    comm_url_list = html.xpath("//tr//span[@style='width:270px; color:#006']//a/@href")
    for comm_url in comm_url_list:
        try:
            url = 'http://www.fxfdcw.com/' + comm_url
            com_res = requests.get(url, headers=self.headers)
            con = com_res.content.decode('gbk')
            co = Comm(co_index)
            co.co_id = re.search('xmid=(\d+)', comm_url).group(1)
            co.co_name = re.search('项目名称.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_develops = re.search('开发企业:(.*?)  ', con, re.S | re.M).group(1)
            co.co_address = re.search('项目地址.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_build_size = re.search('建筑面积.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_all_house = re.search('总套数.*?">(.*?)</', con, re.S | re.M).group(1)
            co.insert_db()
            # Building pages are opened via window.open(...) in inline JS.
            bu_list = re.findall("window.open\('(.*?)'\)", con, re.S | re.M)
        except Exception as e:
            # log.error("小区信息错误{}".format(e))
            print("小区信息错误{}".format(e))
            continue
        self.bu_info(bu_list, co.co_id)
def comm_info(self, comm_url_list):
    """Scrape presell community pages, then their buildings and houses.

    Each ``comm_url`` is an absolute detail URL containing ``bh=<id>``.
    On success the Comm row is stored, the building rows go to
    ``self.build_info`` and the per-house pages to ``self.ho_info``;
    failures are logged and skipped.
    """
    for comm_url in comm_url_list:
        try:
            co_res = requests.get(comm_url, headers=self.headers)
            co = Comm(co_index)
            co.co_id = re.search('bh=(\d+)', comm_url).group(1)
            co.co_name = re.search('项目名称.*?td>(.*?)</', co_res.text, re.S | re.M).group(1)
            co.co_develops = re.search('公司名称.*?strong>(.*?)</s', co_res.text, re.S | re.M).group(1)
            co.co_address = re.search('项目坐落.*?">(.*?)</', co_res.text, re.S | re.M).group(1)
            co.co_pre_sale = re.search('预售证号.*?td>(.*?)</', co_res.text, re.S | re.M).group(1)
            co.co_pre_sale_date = re.search('批准时间.*?td>(.*?)</', co_res.text, re.S | re.M).group(1)
            co.co_build_size = re.search('预售面积.*?">(.*?)</', co_res.text, re.S | re.M).group(1)
            co.insert_db()
            html = etree.HTML(co_res.text)
            bu_info_list = html.xpath("//tr[@style]")
        except Exception as e:
            # BUG FIX: ``log.error('小区信息错误', e)`` passed the exception as a
            # %-format argument with no placeholder in the message, which
            # breaks log-record formatting; embed it in the message instead
            # (matches the ``.format(e)`` style used by the sibling methods).
            log.error('小区信息错误{}'.format(e))
            continue
        self.build_info(bu_info_list, co.co_id)
        bu_url_list = re.findall("window.open\('(.*?)'\)", co_res.text, re.S | re.M)
        self.ho_info(bu_url_list, co.co_id)
def comm_info(self, comm_url_list):  # community info
    """Scrape Shenzhen (ris.szpl.gov.cn) community pages.

    Two pages are read per community: the licence-detail page (Comm
    fields) and the project-detail page (building links).  All building
    links are accumulated and finally passed to ``self.build_info``.
    """
    build_url_list = []
    for comm_url in comm_url_list:
        try:
            # BUG FIX: a single Comm instance used to be created once before
            # the loop and shared by every iteration (risking stale field
            # carry-over), and the co_id regex ran outside the try, so one
            # malformed URL aborted the whole batch.  Create a fresh record
            # per community and keep all parsing inside the try.
            co = Comm(co_index)
            co.co_id = re.search('id=(\d+)', comm_url).group(1)
            detail_url = "http://ris.szpl.gov.cn/bol/" + comm_url.lstrip(".")
            url = "http://ris.szpl.gov.cn/bolprojectdetail.aspx?id=" + str(
                co.co_id)
            res = requests.get(detail_url, headers=self.headers)
            con = res.text
            co.co_pre_sale = re.search('许可证号.*?">(.*?)&', con).group(1)
            co.co_name = re.search('项目名称.*?">(.*?)&', con).group(1)
            co.co_address = re.search('所在位置.*?">(.*?)&', con).group(1)
            co.co_develops = re.search('发展商.*?">(.*?)&', con).group(1)
            # One pattern captures both the residential area and unit count.
            co_type = re.search('住宅.*?面积.*?">(.*?)平方米.*?套数.*?">(.*?)&', con)
            co.co_build_size = co_type.group(1)
            co.co_all_house = co_type.group(2)
            co.insert_db()
            response = requests.get(url, headers=self.headers)
            content = etree.HTML(response.text)
            build_url = content.xpath("//td/a/@href")
            build_url_list.extend(build_url)
        except Exception:
            continue
    self.build_info(build_url_list)
def get_comm_info(self, comm_url_list):
    """Scrape project pages exposing PROJECT_* spans; store and recurse.

    On success the module-global ``count`` is bumped and the hidden
    ``buildInfo`` value plus the page URL are forwarded to
    ``self.get_build_info``.  Failures are printed per URL.
    """
    for i in comm_url_list:
        try:
            response = requests.get(i, headers=self.headers)
            html = response.text
            comm = Comm(co_index)
            comm.co_name = re.findall('PROJECT_XMMC">(.*?)<', html, re.S | re.M)[0]
            comm.co_develops = re.findall('PROJECT_KFQY_NAME">(.*?)<', html, re.S | re.M)[0]
            comm.co_address = re.findall('PROJECT_XMDZ">(.*?)<', html, re.S | re.M)[0]
            comm.area = re.findall('PROJECT_SZQY">(.*?)<', html, re.S | re.M)[0]
            comm.co_volumetric = re.findall('PROJECT_RJL">(.*?)<', html, re.S | re.M)[0]
            comm.co_build_size = re.findall('PROJECT_GHZJZMJ">(.*?)<', html, re.S | re.M)[0]
            comm.co_pre_sale = re.findall('YSXKZH">(.*?)<', html, re.S | re.M)[0]
            comm.co_id = re.findall('PROJECT_XMBH">(.*?)<', html, re.S | re.M)[0]
            comm.insert_db()
            global count  # progress counter shared across scraper methods
            count += 1
            print(count)
            # Hidden form field holding the building list payload.
            bu_info = re.search('id="buildInfo".*?value="(.*?)"', html,
                                re.S | re.M).group(1)
            self.get_build_info(bu_info, comm.co_id, i)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, i), e)
def get_comm_info(self, comm_url_list):
    """Declare regex rules for each listing's *detail* page and delegate.

    Each 'view' URL is rewritten to its 'detail' variant; every Comm
    attribute holds a regex pattern which ``ProducerListUrl`` resolves
    against the (GBK) page.  Persistence happens inside ``get_details``.
    """
    for i in comm_url_list:
        try:
            comm = Comm(co_index)
            comm_url = i.replace('view', 'detail')
            # Patterns, not values — resolved by ProducerListUrl below.
            comm.co_type = '物业类型:.*?<dd>(.*?)<'
            comm.area = '区域所属:.*?<dd>(.*?)<'
            comm.co_build_size = '建筑面积:.*?<dd>(.*?)<'
            comm.co_size = '占地面积:.*?<dd>(.*?)<'
            comm.co_green = '绿化率:.*?<dd><.*?>(.*?)<'
            comm.co_build_type = '楼 层:.*?<dd>(.*?)<'
            comm.co_volumetric = '容积率:.*?<dd><.*?>(.*?)<'
            comm.co_id = '楼盘首页.*?newhouse/.*?/(.*?)/'
            comm.co_name = '<h1 class="title">(.*?)<'
            comm.co_address = '楼盘地址:.*?<dd>(.*?)<'
            # NOTE(review): pattern below lacks '>' after '<dd' — it may
            # capture attribute text rather than the cell body; verify
            # against a live page before relying on co_develops.
            comm.co_develops = '开发商:.*?<dd(.*?)<'
            p = ProducerListUrl(page_url=comm_url,
                                request_type='get',
                                encode='gbk',
                                analyzer_rules_dict=comm.to_dict(),
                                analyzer_type='regex',
                                headers=self.headers)
            p.get_details()
        except Exception as e:
            print(e)
def start_crawler(self):
    """Walk every result page of the Baoding (bdfdc.net) listing.

    Determines the page count from the first page, then iterates pages,
    storing each community and delegating its buildings to
    ``self.bu_info``.
    """
    b = AllListUrl(first_page_url=self.start_url,
                   request_method='get',
                   analyzer_type='regex',
                   encode='utf-8',
                   page_count_rule='共(\d+)页',
                   )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        # BUG FIX: the URL was built as ``'?pageIndex=2' + str(page)``,
        # which requested the same (nonexistent) page on every iteration
        # and never used the loop index; use the current page number.
        url = self.start_url + '?pageIndex=' + str(i)
        page_res = requests.get(url, headers=self.headers)
        html = etree.HTML(page_res.text)
        comm_info_list = html.xpath("//ul/li/div")
        for comm_info in comm_info_list:
            try:
                co = Comm(co_index)
                co.co_name = comm_info.xpath("./p/a/text()")[0]
                deve = comm_info.xpath("./p[2]/text()")[0]
                addr = comm_info.xpath("./p[3]/text()")[0]
                co.co_develops = re.search('开发商:(.*)', deve).group(1)
                co.co_address = re.search('楼盘地址.*?:(.*)', addr).group(1)
                comm_url = comm_info.xpath("./p/a/@href")[0]
                co.co_id = re.search('projectId=(\d+)', comm_url).group(1)
                co.insert_db()
                co_url = 'http://www.bdfdc.net' + comm_url
                co_res = requests.get(co_url, headers=self.headers)
                time.sleep(5)  # throttle: the site is rate-sensitive
                bu_html = etree.HTML(co_res.text)
                # First anchor is a header/"all" link; skip it.
                bu_url_list = bu_html.xpath("//div[@style]/a")[1:]
            except Exception as e:
                # log.error("小区信息错误{}".format(e))
                print("小区信息错误{}".format(e))
                continue
            self.bu_info(bu_url_list, co.co_id)
def get_comm_info(self, comm_url, area):
    """Scrape one community page reached from an area listing.

    comm_url: page URL whose leading '..' is stripped before fetching
    with the shared session ``self.s``.
    area: district name taken from the listing page, stored as-is.
    On success stores the Comm, bumps the global counter and forwards
    the house-list links to ``self.get_build_url``.
    """
    try:
        comm = Comm(co_index)
        comm.area = area.strip()
        comm_url = comm_url.replace('..', '')
        response = self.s.get(comm_url)  # self.s: shared requests session
        html = response.text
        comm.co_name = re.findall('项目名称:.*?<TD.*?><FONT.*?>(.*?)<', html,
                                  re.S | re.M)[0].strip()
        comm.co_address = re.findall('项目地址:.*?<TD.*?>(.*?)<', html,
                                     re.S | re.M)[0].strip()
        comm.co_develops = re.findall('开发公司:.*?<TD.*?>(.*?)<', html,
                                      re.S | re.M)[0].strip()
        comm.co_pre_sale = re.findall('预售证名称:.*?<TD.*?>(.*?)<', html,
                                      re.S | re.M)[0].strip()
        comm.co_build_size = re.findall('纳入网上可售面积:.*?<TD.*?>(.*?)<', html,
                                        re.S | re.M)[0].strip()
        # Everything after '?' serves as this site's unique project key.
        comm.co_id = re.search('\?(.*?)$', comm_url).group(1)
        comm.insert_db()
        global count  # progress counter shared across scraper methods
        count += 1
        print(count)
        build_url_list = re.findall("(HouseList/HouseInfo.aspx\?.*?)'", html,
                                    re.S | re.M)
        self.get_build_url(build_url_list, comm.co_id)
    except Exception as e:
        print(e)
def get_comm_detail(self, comm_list):
    """Scrape bffdc.gov.cn project pages listed by id and store them.

    After inserting the Comm row, the hidden ``buildInfo`` field is split
    on ';;' and forwarded to ``self.get_build_info`` together with the
    community name.  A global counter reports progress.
    """
    global count
    field_patterns = (
        ('co_name', 'PROJECT_XMMC">(.*?)<'),
        ('co_develops', 'PROJECT_KFQY_NAME">(.*?)<'),
        ('co_address', 'PROJECT_XMDZ">(.*?)<'),
        ('area', 'PROJECT_SZQY">(.*?)<'),
        ('co_pre_sale', 'YSXKZH">(.*?)<'),
    )
    for project_id in comm_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://house.bffdc.gov.cn/public/project/' + project_id
            page = requests.get(comm_url).text
            for attr, pattern in field_patterns:
                setattr(comm, attr,
                        re.search(pattern, page, re.S | re.M).group(1))
            comm.insert_db()
            # Hidden form field holds the building list, ';;'-separated.
            build_info = re.search('id="buildInfo".*?value="(.*?)"', page,
                                   re.S | re.M).group(1)
            self.get_build_info(build_info.split(';;'), comm.co_name)
            count += 1
            print(count)
        except Exception as e:
            print(e)
def start_crawler(self):
    """Page through the Xiamen (xmtfj.gov.cn) JSON listing endpoint.

    POSTs page numbers 1..9999 to /home/Getzslp and stores every project
    record found in the double-encoded JSON payload; a page past the end
    is expected to fail parsing and just be reported.
    """
    for page_no in range(1, 10000):
        formdata = {
            "currentpage": page_no,
            "pagesize": 20,
        }
        try:
            res = requests.post(
                "http://fdc.xmtfj.gov.cn:8001/home/Getzslp",
                data=formdata,
                headers=self.headers)
            con = json.loads(res.text)
            # 'Body' is itself a JSON string -> decode twice.
            body = con['Body']
            info_dict = json.loads(body)['bodylist']
            # Renamed from ``i``: the original shadowed the outer loop index.
            for item in info_dict:
                comm = Comm(co_index)
                comm.co_name = item['XMMC']
                comm.co_id = item['TRANSACTION_ID']
                comm.co_address = item['XMDZ']
                comm.co_pre_sale = item['YSXKZH']
                comm.co_all_house = item['PZTS']
                comm.co_build_size = item['PZMJ']
                comm.co_area = item['XMDQ']
                comm.co_pre_date = item['GETDATE']
                comm.insert_db()
        except Exception as e:
            # BUG FIX: the message had no placeholder for ``formdata``, so the
            # failing page payload was silently dropped from the output.
            print(
                '小区错误,co_index={},url={},data={}'.format(
                    co_index,
                    'http://fdc.xmtfj.gov.cn:8001/home/Getzslp',
                    formdata), e)
def get_comm_info(self, comm_url_list):
    """Scrape community pages on 221.2.144.162:8090 (GBK encoded).

    Besides the Comm fields, pairs of building links and presale numbers
    are harvested from the ``height=20`` table rows and forwarded
    together to ``self.get_build_info``.
    """
    for i in comm_url_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://221.2.144.162:8090/' + i
            response = requests.get(comm_url, headers=self.headers)
            html = response.content.decode('gbk')
            comm.co_id = re.search('id=(\d+)', i).group(1)
            comm.co_name = re.findall('项目名称:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_develops = re.findall('开 发 商:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.area = re.findall(
                '城 区:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_type = re.findall('物业类型:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_address = re.findall('物业位置:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_build_size = re.findall('建筑面积:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.insert_db()
            # Parallel lists: link and presale number come from the same rows.
            build_url_list = re.findall("height=20.*?<a href=(.*?) ", html,
                                        re.S | re.M)
            bu_pre_sale_list = re.findall("height=20.*?<Td>(.*?)<", html,
                                          re.S | re.M)
            self.get_build_info(build_url_list, bu_pre_sale_list,
                                comm.co_name, comm.co_id)
        except Exception as e:
            print("co_index={},小区信息错误".format(co_index), e)
def comm_info(self, comm_url_list):
    """Scrape Anshun (as.gzfcxx.cn) community pages and their buildings.

    Each relative ``comm_url`` carries ``yszh=<id>``; on success the Comm
    row is stored and the first ``a.a3`` link is handed to
    ``self.build_info``.  Failures are logged and skipped.
    """
    for comm_url in comm_url_list:
        try:
            url = "http://as.gzfcxx.cn" + comm_url
            res = requests.get(url, headers=self.headers)
            co = Comm(co_index)
            co.co_name = re.search('项目名称.*?ck">(.*?)<', res.text, re.S | re.M).group(1)
            co.co_id = re.search('yszh=(\d+)', comm_url).group(1)
            co.co_develops = re.search('开发商.*?ck">(.*?)<', res.text, re.S | re.M).group(1)
            co.co_address = re.search('坐落.*?ck">(.*?)<', res.text, re.S | re.M).group(1)
            co.co_pre_sale = re.search('许可证.*?ck">(.*?)<', res.text, re.S | re.M).group(1)
            co.co_handed_time = re.search('交房时间.*?ck">(.*?)<', res.text, re.S | re.M).group(1)
            co.insert_db()
            html = etree.HTML(res.text)
            build_detail = html.xpath("//a[@class='a3']/@href")[0]
        except Exception as e:
            # BUG FIX: ``log.error('小区信息错误', e)`` supplied a %-format
            # argument with no placeholder, corrupting the log record;
            # format the message explicitly instead.
            log.error('小区信息错误{}'.format(e))
            continue
        self.build_info(build_detail, co.co_id)
def start_crawler(self):
    """Iterate every region, page through its listing, store each Comm.

    ``self.region`` maps region code -> display name; the code forms the
    listing URL and the name is stored as ``comm.area``.
    NOTE(review): there is no try/except here — one malformed listing
    entry aborts the whole crawl; confirm that is intended.
    """
    for region in self.region.items():
        region_code = region[0]
        region_name = region[1]
        url = self.start_url + region_code + '.html'
        b = AllListUrl(
            first_page_url=url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='共(\d+)页>',
        )
        page = b.get_page_count()
        for i in range(1, int(page) + 1):
            new_url = url + "?page=" + str(i)
            res = requests.get(new_url, headers=self.headers)
            html = etree.HTML(res.text)
            co_list = html.xpath("//dl[@class='spf_lp_searchlist bg1']")
            for co in co_list:
                comm = Comm(co_index)
                co_url = co.xpath("./dt/h4/a/@href")[0]
                comm.co_name = co.xpath("./dt/h4/a/text()")[0]
                comm.co_address = co.xpath(".//address/text()")[0]
                # First digit run in the href is the project id.
                comm.co_id = re.search('\d+', co_url).group(0)
                comm.co_develops = co.xpath(
                    "./dd[@class='dev']/a/text()")[0]
                comm.co_plan_pro = co.xpath("./dt/h4/span/text()")[0]
                comm.co_type = co.xpath(".//p/span[2]/text()")[0]
                comm.area = region_name
                comm.insert_db()
                detail_url = "http://www.zstmsf.com" + co_url
                self.bu_parse(detail_url, comm.co_id)
def get_comm_info(self, comm_url_list):
    """Build tmsf.com property-info URLs from 'sid,propertyid' pairs.

    Each list entry is a comma-joined id pair.  Comm attributes are set
    to regex *patterns*; ``ProducerListUrl`` applies them and also yields
    the one-price-per-house page URL, forwarded to
    ``self.get_build_info``.  A global counter reports progress.
    NOTE(review): if splitting/indexing fails before ``comm_url`` is
    assigned, the except handler itself raises NameError — verify inputs.
    """
    for i in comm_url_list:
        try:
            code = i.split(',')
            comm_url = 'http://www.tmsf.com/newhouse/property_' + code[
                0] + '_' + code[1] + '_info.htm'
            comm = Comm(co_index)
            # Patterns, not values — resolved later by ProducerListUrl.
            comm.co_name = 'buidname.*?>(.*?)<'
            comm.co_address = '--位置行--.*?<span.*?title="(.*?)"'
            comm.co_build_type = '建筑形式:<.*?>(.*?)<'
            comm.co_develops = '项目公司:<.*?>(.*?)<'
            comm.co_volumetric = '容 积 率:</span>(.*?)<'
            comm.co_green = '绿 化 率:</span>(.*?)<'
            comm.co_size = '占地面积:</span>(.*?)<'
            comm.co_build_size = '总建筑面积:</span>(.*?)<'
            comm.co_all_house = '总户数:</span>(.*?)<'
            comm.co_id = 'info" href="/newhouse/property_(.*?)_info'
            p = ProducerListUrl(page_url=comm_url,
                                request_type='get',
                                encode='utf-8',
                                analyzer_rules_dict=comm.to_dict(),
                                current_url_rule='一房一价<.*?href="(.*?)"',
                                analyzer_type='regex',
                                headers=self.headers)
            build_all_url = p.get_details()
            global count  # progress counter shared across scraper methods
            count += 1
            print('comm:', count)
            self.get_build_info(build_all_url)
        except Exception as e:
            print('小区页面,co_index={},url={}'.format(co_index, comm_url), e)
def start_crawler(self):
    """Crawl the Guigang (ggsfcw.com) index: one request per community.

    Community links come from the index page; each detail page is scraped
    into a Comm row and its building anchors are passed to
    ``self.build_info``.  Failures are logged and skipped.
    """
    res = requests.get(self.start_url, headers=self.headers)
    html = etree.HTML(res.text)
    comm_url_list = html.xpath("//div[@class='post']//a/@href")
    for comm_url in comm_url_list:
        try:
            url = 'http://www.ggsfcw.com/' + comm_url
            comm_res = requests.get(url, headers=self.headers)
            com_html = etree.HTML(comm_res.text)
            comm = Comm(co_index)
            comm.co_name = re.search('<h3.*?">(.*?)</', comm_res.text).group(1)
            comm.co_id = re.search('n=(\d+)', comm_res.text).group(1)
            comm.co_address = re.search('地址.*?">(.*?)</', comm_res.text).group(1)
            comm.area = re.search('区县.*?">(.*?)</', comm_res.text).group(1)
            comm.co_develops = re.search('开发商.*?">(.*?)</', comm_res.text).group(1)
            comm.co_use = re.search('规划用途.*?">(.*?)</', comm_res.text).group(1)
            comm.insert_db()
        except Exception as e:
            # BUG FIX: ``log.error("小区信息错误", e)`` passed the exception as a
            # stray %-argument with no placeholder, which breaks log-record
            # formatting; embed it in the message instead.
            log.error("小区信息错误{}".format(e))
            continue
        bu_list = com_html.xpath("//div[@id='MainContent_divResult']/a")
        self.build_info(bu_list, comm.co_id)
def comm_parse(self, url_list, region):
    """Scrape community pages under 110.89.45.7:8082 for one region.

    url_list: relative detail-page paths containing ``ProjectId=``.
    region: district name stored as ``co.area``.
    NOTE(review): no try/except here — a single page whose markup breaks
    any regex aborts the whole region; confirm that is intended.
    """
    for co_url in url_list:
        comm_url = "http://110.89.45.7:8082" + co_url
        comm_res = requests.get(comm_url, headers=self.headers)
        con = comm_res.text
        co = Comm(co_index)
        co.co_id = re.search('ProjectId=(.*)', co_url).group(1)
        co.co_name = re.search('项目名称.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.co_develops = re.search('公司名称.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.co_address = re.search('项目坐落.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.co_use = re.search('规划用途.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.co_build_size = re.search('建筑面积.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.area = region
        # The "批准销售" value sits one cell after the label, hence the
        # extra ``</td`` skipped before the capturing group.
        co.co_residential_size = re.search(
            '批准销售.*?">.*?</td.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.co_pre_sale = re.search('预售许可证.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.insert_db()
        co_html = etree.HTML(comm_res.text)
        bu_urllist = co_html.xpath("//span/a/@href")
        self.bu_parse(co.co_id, bu_urllist)
def get_comm_detail(self, comm_detail_url):
    """Scrape one Kunming (kmhouse.org) community detail page.

    ``comm_detail_url`` is a relative link carrying ``Preid=<id>``.  The
    page is GBK-decoded, the Comm record is filled and stored, and a
    global progress counter is bumped.  Errors are printed and swallowed.
    """
    global count
    comm_url = 'http://www.kmhouse.org' + comm_detail_url
    try:
        comm = Comm(co_index)
        page = requests.get(comm_url,
                            headers=self.headers).content.decode('gbk')

        def grab(pattern):
            # First capture group of *pattern* in the decoded page.
            return re.search(pattern, page, re.S | re.M).group(1)

        comm.co_id = re.search('Preid=(.*?)&', comm_detail_url).group(1)
        comm.co_name = grab('楼盘名称.*?<td.*?>(.*?)<')
        comm.area = grab('所在地区.*?<td.*?>(.*?)<')
        comm.co_address = grab('楼盘地址.*?<td.*?>(.*?)<')
        comm.co_pre_sale = grab('预售证号.*?<td.*?>(.*?)<')
        comm.co_volumetric = grab('容 积 率.*?<td.*?>(.*?)<')
        comm.co_green = grab('绿 化 率.*?<td.*?>(.*?)<')
        comm.co_build_start_time = grab('开工时间.*?<td.*?>(.*?)<')
        comm.insert_db()
        count += 1
        print('count:', count)
    except Exception as e:
        print('小区详情错误,co_index={},url={}'.format(co_index, comm_url), e)
def start_crawler(self):
    """POST the ProjectIntroduce grid query and store each returned row.

    The payload is a pre-encoded XML query (percent-escaped '&'/'<'/'"')
    captured from the site's grid widget: PageSize 15, sorted by Name.
    NOTE(review): ``url`` is not defined in this method — presumably a
    module-level constant; confirm before reuse.
    """
    querystring = {"_method": "GetDataToDynamicInXml", "_session": "rw"}
    payload = "xmlInfo=%263Croot%2620QueryCode%263D%2622ProjectIntroduce%2622%2620PageIndex%263D%26221%2622%2620PageSize%263D%262215%2622%2620SortField%263D%2622%2620ORDER%2620BY%2620Name%2622%2620QueryString%263D%2622QueryCode%263DProjectIntroduce%2626amp%263BShowModeCode%263Ddefault%2622%2620BeginDate%263D%2622%262000%263A00%263A00%2622%2620EndDate%263D%2622%262023%263A59%263A59%2622%2620Flag%263D%2622TitleBody%2622%2620TitlesWidthInfo%263D%2622EnterPriseName%267C0%2624Name%267C0%2624Location%267C0%2624SoilUse%267C0%2622%2620IsUseOCache%263D%26220%2622%2620IsUserID%263D%26220%2622%2620SiteId%263D%26228907bd13-1d14-4f9e-8c01-e482d9590d10%2622%2620LockedColumn%263D%26220%2622%2620IsLocked%263D%26220%2622%2620ClientWidth%263D%26221601%2622%2620ShowModeCode%263D%2622default%2622%2620Language%263D%2622chinese%2622/%263E"
    response = requests.request("POST", url, data=payload, params=querystring)
    html = response.text
    # Each grid row renders as a run of "spanctfield" cells inside a <tr>.
    comm_info_list = re.findall('class="tdctfield tdctwidthset ".*?</tr>', html,
                                re.S | re.M)
    for i in comm_info_list:
        comm = Comm(co_index)
        # Cell order within a row: developer, name, address, type; each
        # pattern skips one more "spanctfield" occurrence than the last.
        comm.co_develops = re.search('class="spanctfield".*?>(.*?)<', i,
                                     re.S | re.M).group(1)
        comm.co_name = re.search(
            'class="spanctfield".*?class="spanctfield".*?<a.*?>(.*?)<', i,
            re.S | re.M).group(1)
        comm.co_address = re.search(
            'class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?>(.*?)<',
            i, re.S | re.M).group(1)
        comm.co_type = re.search(
            'class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?>(.*?)<',
            i, re.S | re.M).group(1)
        comm.co_id = re.search('EnterPriseName_(.*?)"', i, re.S | re.M).group(1)
        comm.insert_db()
        self.get_build_info(comm.co_id)
def get_comm_info(self, comm_list):
    """Scrape Yantai (ytfcjy.com) project pages identified by path ids.

    Besides the basic Comm fields, the hidden ghxkz/sgxkz/tdz inputs
    supply the planning / construction / land certificates, and the
    hidden ``buildInfo`` inputs are forwarded to ``self.get_build_info``.
    """
    for i in comm_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://www.ytfcjy.com/public/project/' + i
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_name = re.findall('PROJECT_XMMC">(.*?)<', html, re.S | re.M)[0]
            comm.co_id = re.findall('ProjectInfo.aspx\?code=(.*?)&', html, re.S | re.M)[0]
            comm.co_address = re.findall('PROJECT_XMDZ">(.*?)<', html, re.S | re.M)[0]
            comm.co_develops = re.findall('PROJECT_KFQY_NAME">(.*?)<', html, re.S | re.M)[0]
            comm.area = re.findall('PROJECT_SZQY">(.*?)<', html, re.S | re.M)[0]
            comm.co_volumetric = re.findall('PROJECT_RJL">(.*?)<', html, re.S | re.M)[0]
            comm.co_build_size = re.findall('PROJECT_GHZJZMJ">(.*?)<', html, re.S | re.M)[0]
            comm.co_pre_sale = re.findall('YSXKZH">(.*?)<', html, re.S | re.M)[0]
            comm.co_all_house = re.findall('YSZTS">(.*?)<', html, re.S | re.M)[0]
            # Hidden inputs pack several fields ',,'-separated; the third
            # segment is the certificate number.
            comm.co_plan_pro = re.findall('id="ghxkzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
            comm.co_work_pro = re.findall('id="sgxkzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
            comm.co_land_use = re.findall('id="tdzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
            comm.insert_db()
            global count  # progress counter shared across scraper methods
            count += 1
            print(count)
            build_url_list = re.findall('id="buildInfo" value="(.*?)"', html,
                                        re.S | re.M)
            self.get_build_info(build_url_list, comm.co_id)
        except Exception as e:
            print(e)
def get_comm_info(self, comm_url_list):
    """Scrape communities whose fields sit in id-suffixed <span> elements.

    Request failures are reported and the item skipped; note that parse
    errors after a successful request propagate (the try only wraps the
    HTTP call).
    """
    for comm_url in comm_url_list:
        url = self.url + comm_url
        try:
            res = requests.get(url, headers=self.headers)
        except Exception as e:
            print("co_index={},小区信息错误".format(co_index), e)
            continue
        con = res.text
        co = Comm(co_index)
        co.co_id = re.search('Id=(\d+)', comm_url).group(1)
        co.co_name = re.search('项目名称.*?Name">(.*?)</span', con, re.S | re.M).group(1)
        co.co_develops = re.search('开发商.*?Name">(.*?)</span', con, re.S | re.M).group(1)
        co.co_address = re.search('地址.*?Address">(.*?)</span', con, re.S | re.M).group(1)
        co.co_build_size = re.search('建筑面积.*?jzmj">(.*?)</span', con, re.S | re.M).group(1)
        co.co_type = re.search('项目类型.*?Type">(.*?)</span', con, re.S | re.M).group(1)
        co.co_size = re.search('占地面积.*?mzgm">(.*?)</span', con, re.S | re.M).group(1)
        co.co_green = re.search('绿化率.*?Jdl">(.*?)</span', con, re.S | re.M).group(1)
        co.co_volumetric = re.search('容积率.*?Rjl">(.*?)</span', con, re.S | re.M).group(1)
        co.co_build_start_time = re.search('开工日期.*?kgrq">(.*?)</span', con, re.S | re.M).group(1)
        co.co_build_end_time = re.search('竣工日期.*?syrq">(.*?)</span', con, re.S | re.M).group(1)
        co.insert_db()
        # Presell links are wrapped in 【...】 brackets on the page.
        presell_url_list = re.findall('【<a href="(.*?)" target="_self"', con,
                                      re.S | re.M)
        self.get_build_info(presell_url_list, co.co_id)
def comm_info(self, url):
    """Scrape one community page (kfsid link) and its presell buildings.

    ``url`` is a relative path carrying ``kfsid=<id>``.  Fields that are
    only present on some pages are scraped in a second ``try`` block and
    default to ``None`` when missing.
    """
    comm_url = self.start_url + "/" + url
    res = requests.get(comm_url, headers=self.headers)
    res.encoding = 'gbk'
    con = res.text
    co = Comm(co_index)
    co.co_id = re.search('kfsid=(\d+)', url).group(1)
    co.co_name = re.search('itemname.*?">(.*?)</font', con).group(1)
    co.co_develops = re.search('开发商名称:.*?px;">(.*?)</a', con, re.S | re.M).group(1)
    co.co_all_house = re.search('总套数:.*?">(.*?) ', con, re.S | re.M).group(1)
    co.co_all_size = re.search('总面积:.*?">(.*?) ', con, re.S | re.M).group(1)
    co.co_residential_size = re.search('>住宅面积:.*?">(.*?) ', con, re.S | re.M).group(1)
    co.co_address = re.search('项目座落.*?;">(.*?)</', con, re.S | re.M).group(1)
    co.area = re.search('所在地区.*?">(.*?)</td', con, re.S | re.M).group(1)
    try:
        # Optional block: certificates are absent on some pages.
        co.co_build_size = re.search('建筑面积.*?">(.*?) ', con, re.S | re.M).group(1)
        co.co_plan_project = re.search('建设工程规划许可证号.*?">(.*?)<br', con, re.S | re.M).group(1)
        co.co_land_use = re.search('土地证号.*?">(.*?)<br', con, re.S | re.M).group(1)
        co.co_work_pro = re.search('建筑工程施工许可证号.*?">(.*?)<br', con, re.S | re.M).group(1)
        co.co_use = re.search('用途.*?">(.*?)<br', con, re.S | re.M).group(1)
    except Exception:
        co.co_build_size = None
        co.co_plan_project = None
        co.co_land_use = None
        co.co_work_pro = None
        # BUG FIX: this fallback used to assign ``co.co_us`` (a typo), so
        # ``co_use`` was left unset/partially-stale whenever the optional
        # block failed part-way through.
        co.co_use = None
    co.insert_db()
    co_html = etree.HTML(con)
    bu_list = co_html.xpath("//table[@id='preselltable1']/tr[@bgcolor='white']")
    self.build_info(bu_list, co.co_id)
def get_comm_info(self, all_url_list):
    """Declare detail-page regex rules and delegate to ProducerListUrl.

    all_url_list: listing URL(s) handed straight to ProducerListUrl as
    ``page_url``.  Comm attributes hold regex *patterns*; persistence
    happens inside ``p.get_details()``.  A global counter tracks calls.
    """
    try:
        c = Comm(co_index)
        # Patterns, not values — resolved by ProducerListUrl below.
        c.co_name = "class='newtopleft font-k'>(.*?)</li>"
        c.co_id = 'form1" method="post" action="house_base\.aspx\?id=(.*?)"'
        c.co_address = "项目位置:</li><li class='DetaimidR font-f'>(.*?)</li></ul>"
        c.area = "地区/商圈:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_develops = "开发商:</li><li class='DetaimidR font-f'>(.*?)</li>"
        c.co_volumetric = "容积率:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_green = "绿化率:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_all_house = "总户数:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_open_time = "开盘时间:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_land_use = "国土使用证:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_plan_pro = "规划许可证:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_build_size = "建筑面积:</li><li class='DetaimidR font-f'>(.*?)<"
        data_list = c.to_dict()
        p = ProducerListUrl(page_url=all_url_list,
                            request_type='get',
                            encode='utf-8',
                            analyzer_rules_dict=data_list,
                            analyzer_type='regex',
                            headers=self.headers)
        p.get_details()
        global count  # progress counter shared across scraper methods
        count += 1
        print(count)
    except Exception as e:
        print('小区错误,co_index={},url={}'.format(co_index, all_url_list), e)
def comm_info(
        self,
        con,
):  # community and its buildings
    """Extract one community plus its building rows from a parsed page.

    con: lxml element tree of an ASP.NET detail page.  Stores the Comm,
    one Building per styled table row, and returns the per-room URLs
    found in those rows for the caller to crawl next.

    NOTE(review): a single Building instance is reused across rows; every
    field is reassigned before each ``insert_db`` so no stale data leaks,
    but a fresh instance per row would be safer.
    """
    comm = Comm(co_index)
    comm.co_name = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_web_item_retail1_lb_item_name']/text()"
    )[0]  # community name
    co_id_str = con.xpath("//form[@id='aspnetForm']/@action")[0]  # community id
    comm.co_id = re.search(r"\d+", co_id_str).group(0)
    comm.co_address = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_seat']/text()")[
        0]  # community address
    comm.co_develops = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_enter_name']/text()")[
        0]  # developer
    comm.co_size = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_area']/text()")[0]  # total area
    comm.co_build_size = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_area']/text()")[
        0]  # floor area
    comm.co_build_end_time = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_ew_date']/text()")[
        0]  # completion date
    comm.co_plan_pro = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_program_pcode']/text()")[
        0]  # land-use planning permit
    comm.co_work_pro = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_jg']/text()")[0]  # construction permit
    comm.co_green = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_green_rate']/text()"
    )[0]  # green-space percentage
    comm.co_land_use = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_td']/text()")[0]  # land-use certificate
    comm.insert_db()
    build = Building(co_index)
    build_table = con.xpath("//tr[@style='color:#000066;']")
    room_list = []
    for build_list in build_table:
        build.co_id = comm.co_id
        build.co_name = comm.co_name
        # Positional cells: id, number, unit count, size, floors, presale.
        build_info = build_list.xpath("./td/text()")
        build.bu_id = build_info[0]
        build.bu_num = build_info[1]
        build.bu_all_house = build_info[2]
        build.size = build_info[3]
        build.bu_floor = build_info[4]
        build.bu_pre_sale = build_info[5]
        build.insert_db()
        room_url = build_list.xpath("./td/a/@href")[0]
        room_list.append(room_url)
    return room_list
def get_comm_info(self, comm_html_list):
    """Pre-fill a Comm from each row's HTML, then follow its detail link.

    comm_html_list: raw row-HTML fragments.  Name, developer and address
    come from positional <td> cells; the partially-filled Comm and the
    row's href are handed to ``self.get_comm_detail`` for completion.
    """
    flags = re.S | re.M
    for fragment in comm_html_list:
        record = Comm(co_index)
        record.co_name = re.search('<td.*?<td.*?>(.*?)<', fragment,
                                   flags).group(1)
        record.co_develops = re.search('<td.*?><a.*?>(.*?)<', fragment,
                                       flags).group(1)
        record.co_address = re.search('<td.*?<td.*?<td.*?>(.*?)<', fragment,
                                      flags).group(1)
        detail_url = re.search('href="(.*?)"', fragment, flags).group(1)
        self.get_comm_detail(detail_url, record)
def get_comm_info(self, comm_id_list):
    """Scrape Xinxiang (web.xxfdc.gov.cn) projects, buildings and houses.

    comm_id_list: project ids (xmId).  For each project the Comm row is
    stored, every row of the detail table becomes a Building, and the
    per-house pages are forwarded to ``self.get_house_info``.
    """
    for i in comm_id_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://web.xxfdc.gov.cn/onlineQuery/projectInformation.do?xmId=' + i
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_address = re.search('项目地址:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_develops = re.search('开发商:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_all_house = re.search('已售总套数:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_size = re.search('已售总面积:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.area = re.search('行政区别:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_volumetric = re.search('容积率:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_id = i
            comm.insert_db()
            bu_html = re.search(
                '<table class="table table-bordered itemInfoDetail.*?</table>',
                html, re.S | re.M).group()
            # Drop the header row.
            build_info_list = re.findall('<tr>.*?</tr>', bu_html,
                                         re.S | re.M)[1:]
            # NOTE(review): this inner loop variable shadows the outer ``i``;
            # harmless today (outer ``i`` is not used afterwards) but fragile.
            for i in build_info_list:
                try:
                    build = Building(co_index)
                    build.bu_num = re.search('<td>(.*?)<', i, re.S | re.M).group(1)
                    build.bu_all_house = re.search(
                        '<td>.*?<td>.*?<td>(.*?)<', i, re.S | re.M).group(1)
                    build.bu_id = re.search('buildId=(.*?)&', i,
                                            re.S | re.M).group(1)
                    build.co_id = comm.co_id
                    build.insert_db()
                    # NOTE(review): this searches the whole table (bu_html),
                    # not the current row, so every iteration fetches the
                    # first row's link — confirm whether ``i`` was intended.
                    house_url = re.search('<a href="(.*?)"', bu_html,
                                          re.S | re.M).group(1)
                    response = requests.get(house_url, headers=self.headers)
                    html = response.text
                    house_url_list = re.findall(
                        '<td width="110">.*?<a.*?href="(.*?)"', html,
                        re.S | re.M)
                    self.get_house_info(house_url_list, build.bu_id,
                                        comm.co_id)
                except Exception as e:
                    print(
                        '楼栋错误,co_index={},url={}'.format(
                            co_index, house_url), e)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
def comm_info(self, comm_url_list):
    """Scrape Nanjing (njhouse.com.cn) communities through a proxy pool.

    Pages are fetched via ``Proxy_contact`` (returns GBK bytes).  After
    the Comm row is stored, the sales page is fetched inside an unbounded
    retry loop (NOTE(review): this spins forever if the proxy pool keeps
    failing) and the building links go to ``self.build_info``.
    """
    for temp in comm_url_list:
        comm_url = "http://www.njhouse.com.cn/2016/spf/" + temp
        try:
            co = Proxy_contact(app_name="nanjing",
                               method='get',
                               url=comm_url,
                               headers=self.headers)
            co_res = co.contact()
        except Exception as e:
            log.error("小区页面访问失败{}".format(e))
            continue
        con = co_res.decode('gbk')
        comm = Comm(co_index)
        comm.co_id = re.search('prjid=(\d+)" ta', con).group(1)
        comm.co_name = re.search('<h2>(.*?)<em>', con).group(1)
        comm.area = re.search("\[.*?'>(.*?)</a>\]", con).group(1)
        comm.co_develops = re.search('开发企业</td>.*?">(.*?)</a', con,
                                     re.S | re.M).group(1)
        comm.co_address = re.search('项目地址.*?<td>(.*?)</td', con,
                                    re.S | re.M).group(1)
        comm.co_open_time = re.search('开盘时间.*?<td>(.*?)</td', con,
                                      re.S | re.M).group(1)
        comm.co_use = re.search('用途.*?<td>(.*?)</td', con,
                                re.S | re.M).group(1)
        # findall here: co_pre_sale holds a *list* of licence numbers.
        comm.co_pre_sale = re.findall("'_blank'>(\d+)</a>", con)
        # comm.co_land_use = re.search('土地使用.*?span>(.*?)</span',con,re.S|re.M).group(1)
        comm.co_plan_project = re.search('工程规划.*?span>(.*?)</span', con,
                                         re.S | re.M).group(1)
        comm.co_plan_useland = re.search('用地规划.*?span>(.*?)</span', con,
                                         re.S | re.M).group(1)
        comm.co_work_pro = re.search('施工.*?span>(.*?)</span', con,
                                     re.S | re.M).group(1)
        comm.co_all_house = re.search('入网总套数.*?">(.*?)</td', con,
                                      re.S | re.M).group(1)
        comm.co_all_size = re.search('入网总面积.*?td>(.*?)m', con,
                                     re.S | re.M).group(1)
        comm.insert_db()
        build_temp = "http://www.njhouse.com.cn/2016/spf/sales.php?prjid=" + str(
            comm.co_id)
        while True:
            try:
                build_proxy = Proxy_contact(app_name="nanjing",
                                            method='get',
                                            url=build_temp,
                                            headers=self.headers)
                build_temp_con = build_proxy.contact()
                build_temp_con = build_temp_con.decode('gbk')
                html = etree.HTML(build_temp_con)
                break
            except:
                continue
        build_url_list = html.xpath("//div[@class='fdxs_left']/a/@href")
        self.build_info(build_url_list, comm.co_id)