class hefeiLandSupplySpider(CrawlSpider):
    """Spider for land-supply (土地供应) announcements on ggzy.hefei.gov.cn.

    Scrapes the paginated listing, follows each detail page and appends one
    CSV row per page to ``data/合肥土地市场_土地供应_合肥.csv``.
    """

    name = 'hefeiLandSupply'
    # Timestamp used elsewhere to decide when cookies should be rotated.
    COOKIES_SWITCH_TIME = datetime.datetime.now()

    def __new__(cls):
        # Singleton: open the page log and the output CSV exactly once per process.
        if not hasattr(cls, "instance"):
            cls.instance = super(hefeiLandSupplySpider, cls).__new__(cls)
            pathPage = os.path.join(
                os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))),
                'Logs/hefeiLandSupplyPage.txt')
            cls.pathDetail = os.path.join(
                os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))),
                'data/合肥土地市场_土地供应_合肥.csv')
            cls.filePage = open(pathPage, 'w+')
            # BUG FIX: both branches opened the file identically; check existence
            # *before* 'a+' creates the file, and write the header only once.
            isNewFile = not os.path.exists(cls.pathDetail)
            cls.fileDetail = open(cls.pathDetail, 'a+')
            if isNewFile:
                with open(cls.pathDetail, 'a+') as fp:
                    # BUG FIX: header now matches the row layout written by
                    # parse_detail (爬取时间 column was missing).
                    fp.write("""爬取时间,爬取地址url,唯一标识,\n""")
        return cls.instance

    def __init__(self):
        super(hefeiLandSupplySpider, self).__init__()
        # Close files cleanly when Scrapy signals shutdown.
        dispatcher.connect(self.CloseSpider, signals.spider_closed)
        self.redisClient = RedisClient('hefei', 'hefeiLandSupply')
        self.duplicateUrl = 0  # count of consecutive already-seen pages
        self.targetUrl = 'http://ggzy.hefei.gov.cn/hftd/tdgy/?Paging={}'
        self.header = {'User-Agent': random.choice(agent_list)}
        # Character class shared by the field-extraction regexes.
        self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'

    def CloseSpider(self):
        '''
        关闭spider
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.INFO)

    def CloseExceptionSpider(self):
        '''
        关闭spider
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('响应超时爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.ERROR)

    def start_requests(self):
        '''Yield one request per listing page (pages 1-7).'''
        try:
            for page in range(1, 8):
                yield Request(self.targetUrl.format(page),
                              method='GET',
                              headers=self.header,
                              callback=self.parse_index,
                              meta={'page': page},
                              dont_filter=True)
        except Exception as e:
            self.log(f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
            raise IntegrationException('爬取阻塞,请重启')

    def parse_index(self, response):
        '''Parse one listing page and schedule a request per detail link.'''
        page = None
        try:
            page = response.meta.get('page')
            datas = Selector(text=response.body.decode('utf-8'))
            dataItems = datas.xpath('//div[@class="ewb-r-bd-con"]/ul/li')
            for dataItem in dataItems:
                title = dataItem.xpath('div/a/text()').extract_first()
                # href starts with '.'; strip it and prepend the site root.
                url = 'http://ggzy.hefei.gov.cn' + dataItem.xpath('div/a/@href').extract_first()[1:]
                msgTime = dataItem.xpath('span/text()').extract_first()
                yield Request(url,
                              method='GET',
                              callback=self.parse_detail,
                              meta={'page': page, 'title': title, 'msgTime': msgTime},
                              dont_filter=True)
        except Exception as e:
            self.log(f'列表页解析失败{page}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)

    def parse_detail(self, response):
        '''Record one detail page as a CSV row (url + fingerprint).'''
        try:
            data = Selector(text=response.body.decode('utf-8'))
            # Full page text; kept for future field extraction (unused today).
            items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')
            # BUG FIX: crawlingTime / url / md5Mark were referenced but never
            # assigned (NameError on every call); the stray reFunction(..., LY)
            # call on an undefined LY is removed for the same reason.
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            url = response.url
            # Fingerprint on the URL alone — TODO confirm against other spiders
            # which also mix in title/date.
            md5Mark = encrypt_md5(url)
            if self.name in DUPLICATE_SWITCH_LIST:
                if self.redisClient.isExist(md5Mark):
                    # Already seen: count it towards the stop threshold.
                    self.duplicateUrl += 1
            if self.duplicateUrl < 50:
                # BUG FIX: dropped the gate on undefined TDYT_37.
                csvFile = [
                    crawlingTime,
                    url,
                    md5Mark,
                ]
                results = ''
                for _ in csvFile:
                    try:
                        if _ and _ != '|' * len(_):
                            results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace(
                                '\r', '').replace(r'\xa0', '').replace('\xa0', '') + ','
                        else:
                            results += ','
                    except Exception as e:
                        results += ','
                        self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                 level=logging.ERROR)
                with open(self.pathDetail, 'a+') as fp:
                    fp.write(results)
                    fp.write('\n')
                self.log(f'数据获取成功', level=logging.INFO)
                yield
            else:
                # Too many consecutive duplicates: stop the crawl.
                self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url)
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
class shanxiTransformResultSpider(CrawlSpider):
    """Spider for land-transfer results (出让结果) on zrzyt.shanxi.gov.cn.

    NOTE(review): renamed from ``shanxiTransformNoticeSpider`` — the module
    defines another class of that exact name later (the 出让公告 spider), which
    shadowed this one and made it unreachable. The Scrapy ``name`` attribute
    ('shanxiTransformResult') is unchanged, so crawl invocations still work.
    """

    name = 'shanxiTransformResult'
    # Timestamp used elsewhere to decide when cookies should be rotated.
    COOKIES_SWITCH_TIME = datetime.datetime.now()

    def __new__(cls):
        # Singleton: open the page log and the output CSV exactly once per process.
        if not hasattr(cls, "instance"):
            cls.instance = super(shanxiTransformResultSpider, cls).__new__(cls)
            pathPage = os.path.join(
                os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))),
                'Logs/shanxiTransformResultPage.txt')
            cls.pathDetail = os.path.join(
                os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))),
                'data/山西省自然资源厅_出让结果_山西.csv')
            cls.filePage = open(pathPage, 'w+')
            # BUG FIX: both branches opened the file identically; check existence
            # *before* 'a+' creates the file, and write the header only once.
            isNewFile = not os.path.exists(cls.pathDetail)
            cls.fileDetail = open(cls.pathDetail, 'a+')
            if isNewFile:
                with open(cls.pathDetail, 'a+') as fp:
                    # BUG FIX: added the 爬取时间 column so the header matches
                    # the 21-field rows written by parse_detail.
                    fp.write(
                        """文件标题,时间,来源,文件编号,宗地编号,编号,地块位置,土地位置,土地面积(亩),土地面积(平方米),土地用途,成交价(万元),竞得人,公示期,联系单位,单位地址,邮政编码,联系电话,爬取时间,爬取地址url,唯一标识,\n"""
                    )
        return cls.instance

    def __init__(self):
        super(shanxiTransformResultSpider, self).__init__()
        # Close files cleanly when Scrapy signals shutdown.
        dispatcher.connect(self.CloseSpider, signals.spider_closed)
        self.redisClient = RedisClient('shanxi', 'shanxiTransformResult')
        self.duplicateUrl = 0  # count of consecutive already-seen pages
        self.targetUrl = 'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/crjg/index_{}.shtml'
        self.header = {'User-Agent': random.choice(agent_list)}
        # Character class shared by the field-extraction regexes.
        self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'

    def CloseSpider(self):
        '''
        关闭spider
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.INFO)

    def CloseExceptionSpider(self):
        '''
        关闭spider
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('响应超时爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.ERROR)

    def start_requests(self):
        '''Yield one request per paginated listing page, then the index page.'''
        try:
            for page in range(1, 14):
                yield Request(self.targetUrl.format(page),
                              method='GET',
                              headers=self.header,
                              callback=self.parse_index,
                              meta={'page': page},
                              dont_filter=True)
        except Exception as e:
            self.log(f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
            raise IntegrationException('爬取阻塞,请重启')
        else:
            # First (unnumbered) index page, requested after the numbered ones.
            yield Request('http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/crjg/index.shtml',
                          method='GET',
                          headers=self.header,
                          callback=self.parse_index,
                          meta={'page': 'index', 'priority': 1},
                          dont_filter=True)

    def parse_index(self, response):
        '''Parse one listing page and schedule a request per detail link.'''
        page = None
        try:
            page = response.meta.get('page')
            datas = Selector(text=response.body.decode('utf-8'))
            dataItems = datas.xpath('//ul[@class="zwgk_right_content"]/li')
            for dataItem in dataItems:
                title = dataItem.xpath('a/text()').extract_first()
                # href starts with '.'; strip it and prepend the section root.
                url = 'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/crjg' + \
                      dataItem.xpath('a/@href').extract_first()[1:]
                yield Request(url,
                              method='GET',
                              callback=self.parse_detail,
                              meta={'page': page, 'title': title},
                              dont_filter=True)
        except Exception as e:
            self.log(f'列表页解析失败{page}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)

    def _writeRow(self, csvFile):
        """Serialise one record (list of field strings) and append it to the CSV.

        Commas/newlines inside fields are flattened so the row stays one line;
        a field equal to a run of '|' is treated as empty.
        """
        results = ''
        for _ in csvFile:
            try:
                if _ and _ != '|' * len(_):
                    results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace(
                        '\r', '').replace(r'\xa0', '').replace('\xa0', '') + ','
                else:
                    results += ','
            except Exception as e:
                results += ','
                self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                         level=logging.ERROR)
        with open(self.pathDetail, 'a+') as fp:
            fp.write(results)
            fp.write('\n')
        self.log(f'数据获取成功', level=logging.INFO)

    def parse_detail(self, response):
        '''Extract the result fields from a detail page and append CSV rows.

        The page layout varies; four strategies are tried in order:
        irregular table ('竣工时间'), regular table (no '转让方'),
        regex over plain text ('地块基本情况'), regex for transfer pages ('转让方').
        '''
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')
            # Field slots (empty string = not found on this page).
            WJBT_27 = SJ_28 = LY_29 = WJBT_30 = ZDBH_31 = BH_32 = ''
            DKWZ_33 = TDWZ_34 = TDMJM_35 = TDMJPFM_36 = TDYT_37 = CJJ_38 = ''
            JDR_39 = GSQ_40 = LXDW_41 = DWDZ_42 = YZBM_43 = LXDH_44 = ''
            # BUG FIX: removed the stray `reFunction(f'时间:...', LY)` call —
            # LY was undefined here, so every call raised NameError and the
            # outer except swallowed it: no data was ever written.
            # 文件标题 (document title, from the listing page)
            WJBT_27 = response.meta.get('title')
            # 时间 (publish date)
            SJ_28 = data.xpath('//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()').extract_first()
            # 来源 (source)
            LY_29 = data.xpath('//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()').extract_first()
            # 文件编号 (document number)
            WJBT_30 = data.xpath('//div[@class="ztzx_frame_content"]/div[1]/text()').extract_first()
            # 公示期 (publicity period)
            GSQ_40 = reFunction(r'公示期:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)。', items)
            # 联系单位 (contact unit)
            LXDW_41 = reFunction(r'联系单位:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 单位地址 (unit address)
            DWDZ_42 = reFunction(r'单位地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 邮政编码 (postal code)
            YZBM_43 = reFunction(r'邮政编码:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 联系电话 (contact phone)
            LXDH_44 = reFunction(r'联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Crawl metadata.
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            url = response.url
            md5Mark = encrypt_md5(url + WJBT_27 + SJ_28)

            # 'thead' is rewritten to 'tbody' so header rows are parsed as data.
            soup = BeautifulSoup(response.body.decode('utf-8').replace('thead', 'tbody'))
            table = soup.find('table')
            htmlTable = htmlTableTransformer()
            if table:
                if '竣工时间' in items:
                    try:
                        tdData = htmlTable.tableTrTdUNregulationToList(table)
                        for _ in range(len(list(tdData.values())[0])):
                            # 宗地编号
                            ZDBH_31 = tdData.get('地块编号')[_] if tdData.get('地块编号') else ''
                            # 地块位置 / 土地位置 (same source column)
                            DKWZ_33 = tdData.get('位置')[_] if tdData.get('位置') else ''
                            TDWZ_34 = tdData.get('位置')[_] if tdData.get('位置') else ''
                            # 土地面积(亩)
                            TDMJM_35 = tdData.get('出让面积平方米/亩')[_] if tdData.get('出让面积平方米/亩') else ''
                            # 土地面积(平方米) — 8th column, whatever its header says.
                            eighthKey = list(tdData.keys())[7]
                            TDMJPFM_36 = tdData.get(eighthKey)[_] if tdData.get(eighthKey) else ''
                            # 土地用途
                            TDYT_37 = tdData.get('用途')[_] if tdData.get('用途') else ''
                            # 成交价(万元) — full-width key first, half-width fallback.
                            CJJ_38 = tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else \
                                tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else ''
                            # 竞得人
                            JDR_39 = tdData.get('受让人')[_] if tdData.get('受让人') else ''
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(md5Mark):
                                    self.duplicateUrl += 1
                            if self.duplicateUrl < 50:
                                if TDYT_37:
                                    self._writeRow([
                                        WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33,
                                        TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39,
                                        GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44,
                                        crawlingTime, url, md5Mark,
                                    ])
                                    yield
                    except Exception:
                        # BUG FIX: was a bare `except:`. Irregular table:
                        # fall back to fixed column positions.
                        for row in table.find_all('tr')[2:]:
                            ZDBH_31 = row.find_all('td')[4].string.strip()
                            DKWZ_33 = row.find_all('td')[5].string.strip()
                            TDWZ_34 = row.find_all('td')[5].string.strip()
                            TDMJM_35 = row.find_all('td')[6].string.strip()
                            TDMJPFM_36 = row.find_all('td')[7].string.strip()
                            TDYT_37 = row.find_all('td')[8].string.strip()
                            CJJ_38 = row.find_all('td')[9].string.strip()
                            JDR_39 = row.find_all('td')[3].string.strip()
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(md5Mark):
                                    self.duplicateUrl += 1
                            if self.duplicateUrl < 50:
                                if TDYT_37:
                                    self._writeRow([
                                        WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33,
                                        TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39,
                                        GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44,
                                        crawlingTime, url, md5Mark,
                                    ])
                                    yield
                elif '转让方' not in items:
                    # Regular table; drop a malformed second row / trailing header cell.
                    if len(table.find_all('tr')[1].find_all('td')) < 5:
                        table.find_all('tr')[1].extract()
                        table.find_all('tr')[0].find_all('td')[-1].extract()
                    tdData = htmlTable.tableTrTdRegulationToList(table)
                    for _ in range(len(list(tdData.values())[0])):
                        ZDBH_31 = tdData.get('宗地编号')[_] if tdData.get('宗地编号') else ''
                        BH_32 = tdData.get('编号')[_] if tdData.get('编号') else ''
                        DKWZ_33 = tdData.get('地块位置')[_] if tdData.get('地块位置') else ''
                        TDWZ_34 = tdData.get('土地位置')[_] if tdData.get('土地位置') else ''
                        TDMJM_35 = tdData.get('土地面积(亩)')[_] if tdData.get('土地面积(亩)') else ''
                        TDMJPFM_36 = tdData.get('土地面积(平方米)')[_] if tdData.get('土地面积(平方米)') else ''
                        TDYT_37 = tdData.get('土地用途')[_] if tdData.get('土地用途') else ''
                        CJJ_38 = tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else \
                            tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else ''
                        JDR_39 = tdData.get('竞得人')[_] if tdData.get('竞得人') else ''
                        if self.name in DUPLICATE_SWITCH_LIST:
                            if self.redisClient.isExist(md5Mark):
                                self.duplicateUrl += 1
                        if self.duplicateUrl < 50:
                            if TDYT_37:
                                self._writeRow([
                                    WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33,
                                    TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39,
                                    GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44,
                                    crawlingTime, url, md5Mark,
                                ])
                                yield
                elif '地块基本情况' in items:
                    # Plain-text page: extract fields by regex.
                    ZDBH_31 = reFunction(r'宗地编号\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    DKWZ_33 = reFunction(r'地块位置\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    TDMJM_35 = reFunction(r'土地面积\(公顷\)\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    TDYT_37 = reFunction(r'土地用途\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    CJJ_38 = reFunction(r'成交价\(万元\)\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    JDR_39 = reFunction(r'受让单位\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):
                            self.duplicateUrl += 1
                    if self.duplicateUrl < 50:
                        if TDYT_37:
                            self._writeRow([
                                WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33,
                                TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39,
                                GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44,
                                crawlingTime, url, md5Mark,
                            ])
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' % response.url)
                elif '转让方' in items:
                    # Transfer (转让) page: extract fields by regex.
                    BH_32 = reFunction(r'不动产权登记证号:([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    DKWZ_33 = reFunction(r'宗地位置:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    TDMJPFM_36 = reFunction(r'面\s*积:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    TDYT_37 = reFunction(r'土地用途:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    # 成交价(万元) not present on transfer pages.
                    JDR_39 = reFunction(r'受让方:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):
                            self.duplicateUrl += 1
                    if self.duplicateUrl < 50:
                        if TDYT_37:
                            self._writeRow([
                                WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33,
                                TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39,
                                GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44,
                                crawlingTime, url, md5Mark,
                            ])
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' % response.url)
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
class zhengzhouLandTransformNoticeSpider(CrawlSpider):
    """Spider for negotiated-transfer notices (土地协议出让公告) on zzland.zhengzhou.gov.cn.

    Each detail page holds one table row of plot data; one CSV row is written
    per page to ``data/郑州市自然资源和规划局_土地协议出让公告_郑州.csv``.
    """

    name = 'zhengzhouLandTransformNotice'
    # Timestamp used elsewhere to decide when cookies should be rotated.
    COOKIES_SWITCH_TIME = datetime.datetime.now()

    def __new__(cls):
        # Singleton: open the page log and the output CSV exactly once per process.
        if not hasattr(cls, "instance"):
            cls.instance = super(zhengzhouLandTransformNoticeSpider, cls).__new__(cls)
            pathPage = os.path.join(
                os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))),
                'Logs/zhengzhouLandTransformNoticePage.txt')
            cls.pathDetail = os.path.join(
                os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))),
                'data/郑州市自然资源和规划局_土地协议出让公告_郑州.csv')
            cls.filePage = open(pathPage, 'w+')
            # BUG FIX: both branches opened the file identically; check existence
            # *before* 'a+' creates the file, and write the header only once.
            isNewFile = not os.path.exists(cls.pathDetail)
            cls.fileDetail = open(cls.pathDetail, 'a+')
            if isNewFile:
                with open(cls.pathDetail, 'a+') as fp:
                    # BUG FIX: added the 爬取时间 column so the header matches
                    # the 11-field rows written by parse_detail.
                    fp.write(
                        """标题,来源,时间,编号,土地位置,使用权面积,规划用地性质,出让年限,爬取时间,爬取地址url,唯一标识,\n"""
                    )
        return cls.instance

    def __init__(self):
        super(zhengzhouLandTransformNoticeSpider, self).__init__()
        # Close files cleanly when Scrapy signals shutdown.
        dispatcher.connect(self.CloseSpider, signals.spider_closed)
        self.redisClient = RedisClient('zhengzhou', 'zhengzhouLandTransformNotice')
        self.duplicateUrl = 0  # count of consecutive already-seen pages
        self.targetUrl = 'http://zzland.zhengzhou.gov.cn/xycrgg/index_{}.jhtml'
        self.header = {'User-Agent': random.choice(agent_list)}
        # Character class shared by the field-extraction regexes.
        self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'

    def CloseSpider(self):
        '''
        关闭spider
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.INFO)

    def CloseExceptionSpider(self):
        '''
        关闭spider
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('响应超时爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.ERROR)

    def start_requests(self):
        '''Request the numbered listing pages (higher priority first), then the index page.'''
        try:
            for page in range(1, 3):
                priority = 4 - int(page)
                yield Request(self.targetUrl.format(page),
                              method='GET',
                              headers=self.header,
                              priority=priority,
                              callback=self.parse_index,
                              meta={'page': page, 'priority': priority},
                              dont_filter=True)
            yield Request('http://zzland.zhengzhou.gov.cn/xycrgg/index.jhtml',
                          method='GET',
                          headers=self.header,
                          callback=self.parse_index,
                          meta={'page': 1, 'priority': 1},
                          dont_filter=True)
        except Exception as e:
            self.log(f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
            raise IntegrationException('爬取阻塞,请重启')

    def parse_index(self, response):
        '''Parse one listing page and schedule a request per detail link.'''
        page = None
        try:
            page = response.meta.get('page')
            datas = Selector(text=response.body.decode('utf-8'))
            dataItems = datas.xpath('//div[@class="box-content"]/ul/li')
            for dataItem in dataItems:
                title = dataItem.xpath('a/h1/text()').extract_first()
                url = dataItem.xpath('a/@href').extract_first()
                yield Request(url,
                              method='GET',
                              callback=self.parse_detail,
                              meta={'page': page, 'title': title},
                              dont_filter=True)
        except Exception as e:
            self.log(f'列表页解析失败{page}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)

    def parse_detail(self, response):
        '''Extract the notice fields from a detail page and append one CSV row.'''
        try:
            data = Selector(text=response.body.decode('utf-8'))
            # Full page text, normalised; kept for parity with sibling spiders.
            items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')
            # 标题 (title, from the listing page)
            BT_10 = response.meta.get('title')
            # Source/date line under the title.
            LY = data.xpath('//div[@class="content-small-title"]/text()').extract_first()
            # 来源 (source)
            LY_11 = reFunction(f'来源:\s*([{self.reStr}]*)\s', LY)
            # 时间 (publish date)
            SJ_12 = reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # First data row of the first table: 编号 / 土地位置 / 使用权面积 /
            # 规划用地性质 / 出让年限.
            BH_13 = ''.join(data.xpath("string(//table[1]/tbody/tr[2]/td[1])").extract())
            TDWZ_14 = ''.join(data.xpath("string(//table[1]/tbody/tr[2]/td[2])").extract())
            SYQMJ_15 = ''.join(data.xpath("string(//table[1]/tbody/tr[2]/td[3])").extract())
            GHYDXZ_16 = ''.join(data.xpath("string(//table[1]/tbody/tr[2]/td[4])").extract())
            CRNX_17 = ''.join(data.xpath("string(//table[1]/tbody/tr[2]/td[5])").extract())
            # Crawl metadata.
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            url = response.url
            md5Mark = encrypt_md5(url + BT_10 + SJ_12)
            if DUPLICATE_SWITCH:
                if self.redisClient.isExist(md5Mark):
                    # Already seen: count it towards the stop threshold.
                    self.duplicateUrl += 1
            if self.duplicateUrl < 50:
                csvFile = [
                    BT_10, LY_11, SJ_12, BH_13, TDWZ_14, SYQMJ_15, GHYDXZ_16, CRNX_17,
                    crawlingTime, url, md5Mark,
                ]
                results = ''
                for _ in csvFile:
                    try:
                        if _ and _ != '|' * len(_):
                            results += _.replace(',', ' ').replace('\n', '').replace(
                                '\r', '').replace(r'\xa0', '').replace('\xa0', '') + ','
                        else:
                            results += ','
                    except Exception as e:
                        results += ','
                        self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                 level=logging.ERROR)
                with open(self.pathDetail, 'a+') as fp:
                    fp.write(results)
                    fp.write('\n')
                self.log(f'数据获取成功', level=logging.INFO)
                yield
            else:
                # Too many consecutive duplicates: stop the crawl.
                self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url)
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
class zhengzhouAppropriateResultSpider(CrawlSpider):
    """Spider for allocated-land results (划拨供地结果) on zzland.zhengzhou.gov.cn.

    Detail pages usually carry a table (parsed via htmlTableTransformer);
    pages mentioning 宗地编号 are parsed by regex instead. Rows are appended to
    ``data/郑州市自然资源和规划局_划拨供地结果_郑州.csv``.
    """

    name = 'zhengzhouAppropriateResult'
    # Timestamp used elsewhere to decide when cookies should be rotated.
    COOKIES_SWITCH_TIME = datetime.datetime.now()

    def __new__(cls):
        # Singleton: open the page log and the output CSV exactly once per process.
        if not hasattr(cls, "instance"):
            cls.instance = super(zhengzhouAppropriateResultSpider, cls).__new__(cls)
            pathPage = os.path.join(
                os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))),
                'Logs/zhengzhouAppropriateResultPage.txt')
            cls.pathDetail = os.path.join(
                os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))),
                'data/郑州市自然资源和规划局_划拨供地结果_郑州.csv')
            cls.filePage = open(pathPage, 'w+')
            # BUG FIX: both branches opened the file identically; check existence
            # *before* 'a+' creates the file, and write the header only once.
            isNewFile = not os.path.exists(cls.pathDetail)
            cls.fileDetail = open(cls.pathDetail, 'a+')
            if isNewFile:
                with open(cls.pathDetail, 'a+') as fp:
                    # BUG FIX: added the 爬取时间 column so the header matches
                    # the 16-field rows written by parse_detail.
                    fp.write(
                        """标题,来源,时间,序号,批准文号,用地单位,供地方式,批准时间,位置,用途,面积,容积率,供应方案文号,爬取时间,爬取地址url,唯一标识,\n"""
                    )
        return cls.instance

    def __init__(self):
        super(zhengzhouAppropriateResultSpider, self).__init__()
        # Close files cleanly when Scrapy signals shutdown.
        dispatcher.connect(self.CloseSpider, signals.spider_closed)
        self.redisClient = RedisClient('zhengzhou', 'zhengzhouAppropriateResult')
        self.duplicateUrl = 0  # count of consecutive already-seen pages
        self.targetUrl = 'http://zzland.zhengzhou.gov.cn/hbgd/index_{}.jhtml'
        self.header = {'User-Agent': random.choice(agent_list)}
        # Character class shared by the field-extraction regexes.
        self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,,、\.﹪㎡'

    def CloseSpider(self):
        '''
        关闭spider
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.INFO)

    def CloseExceptionSpider(self):
        '''
        关闭spider
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('响应超时爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.ERROR)

    def start_requests(self):
        '''Request the numbered listing pages (higher priority first), then the index page.'''
        try:
            for page in range(1, 3):
                priority = 4 - int(page)
                yield Request(self.targetUrl.format(page),
                              method='GET',
                              headers=self.header,
                              priority=priority,
                              callback=self.parse_index,
                              meta={'page': page, 'priority': priority},
                              dont_filter=True)
            yield Request('http://zzland.zhengzhou.gov.cn/hbgd/index.jhtml',
                          method='GET',
                          headers=self.header,
                          callback=self.parse_index,
                          meta={'page': 1, 'priority': 1},
                          dont_filter=True)
        except Exception as e:
            self.log(f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
            raise IntegrationException('爬取阻塞,请重启')

    def parse_index(self, response):
        '''Parse one listing page and schedule a request per detail link.'''
        page = None
        try:
            page = response.meta.get('page')
            datas = Selector(text=response.body.decode('utf-8'))
            dataItems = datas.xpath('//div[@class="box-content"]/ul/li')
            for dataItem in dataItems:
                title = dataItem.xpath('a/h1/text()').extract_first()
                url = dataItem.xpath('a/@href').extract_first()
                yield Request(url,
                              method='GET',
                              callback=self.parse_detail,
                              meta={'page': page, 'title': title},
                              dont_filter=True)
        except Exception as e:
            self.log(f'列表页解析失败{page}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)

    def _writeRow(self, csvFile):
        """Serialise one record (list of field strings) and append it to the CSV."""
        results = ''
        for _ in csvFile:
            try:
                if _ and _ != '|' * len(_):
                    results += _.replace(',', ' ').replace('\n', '').replace(
                        '\r', '').replace(r'\xa0', '').replace('\xa0', '') + ','
                else:
                    results += ','
            except Exception as e:
                results += ','
                self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                         level=logging.ERROR)
        with open(self.pathDetail, 'a+') as fp:
            fp.write(results)
            fp.write('\n')
        self.log(f'数据获取成功', level=logging.INFO)

    def parse_detail(self, response):
        '''Extract allocation-result fields (table layout, or regex when the
        page mentions 宗地编号) and append CSV rows.'''
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')
            # Field slots (empty string = not found on this page).
            BT_47 = LY_55 = LYSJ_48 = XH_49 = PZWH_50 = YDDW_51 = GDFS_52 = ''
            PZSJ_53 = WZ_54 = YT_55 = MJ_56 = RJL_57 = GYWAFA_58 = ''
            # 标题 (title, from the listing page)
            BT_47 = response.meta.get('title')
            # Source/date line under the title.
            LY = data.xpath('//div[@class="content-small-title"]/text()').extract_first()
            # 来源 (source)
            LY_55 = reFunction(f'来源:\s*([{self.reStr}]*)\s', LY)
            # 时间 (publish date)
            LYSJ_48 = reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            htmlTable = htmlTableTransformer()
            if '宗地编号' not in items:
                # Table layout. Header names vary between pages, so each field
                # probes several candidate column names.
                try:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find_all('table')[0]
                    # Drop a decorative first row that carries no header text.
                    if not table.tbody.find_all('tr')[0].find_all(text=re.compile("序号|受让人")):
                        table.tbody.find_all('tr')[0].extract()
                    tdsData = htmlTable.tableTrTdRegulationToList(table)
                    for _ in range(len(list(tdsData.values())[0])):
                        # 序号
                        XH_49 = tdsData.get('序号')[_] if tdsData.get('序号') else ''
                        # 批准文号
                        PZWH_50 = tdsData.get('批准文号')[_] if tdsData.get('批准文号') else ''
                        # 用地单位 — three candidate headers.
                        YDDW_51_ = tdsData.get('用地单位(受让人)')[_] if tdsData.get('用地单位(受让人)') else \
                            tdsData.get('受让人')[_] if tdsData.get('受让人') else ''
                        # BUG FIX: guarded the '单位' fallback — it crashed with
                        # TypeError (None[_]) whenever that column was absent.
                        YDDW_51 = YDDW_51_ if YDDW_51_ else (
                            tdsData.get('单位')[_] if tdsData.get('单位') else '')
                        # 供地方式
                        GDFS_52 = tdsData.get('供地方式')[_] if tdsData.get('供地方式') else \
                            tdsData.get('供应方式')[_] if tdsData.get('供应方式') else ''
                        # 批准时间
                        PZSJ_53 = tdsData.get('批准时间')[_] if tdsData.get('批准时间') else \
                            tdsData.get('签订日期')[_] if tdsData.get('签订日期') else ''
                        # 位置 — first matching candidate column.
                        WZ_54_ = list(filter(None, [tdsData.get('土地位置'), tdsData.get('土地座落'),
                                                    tdsData.get('宗地位置'), tdsData.get('位置')]))
                        WZ_54 = WZ_54_[0][_] if WZ_54_ else ''
                        # 用途
                        YT_55_ = list(filter(None, [tdsData.get('用途'), tdsData.get('土地用途'),
                                                    tdsData.get('用途明细')]))
                        YT_55 = YT_55_[0][_] if YT_55_ else ''
                        # 面积
                        MJ_56_ = list(filter(None, [tdsData.get('面积(平方米)'), tdsData.get('划拨面积'),
                                                    tdsData.get('出让/划拨面积'), tdsData.get('面积(公顷)')]))
                        MJ_56 = MJ_56_[0][_] if MJ_56_ else ''
                        # 容积率
                        RJL_57 = tdsData.get('容积率')[_] if tdsData.get('容积率') else ''
                        # 供应方案文号
                        GYWAFA_58 = tdsData.get('供应方案文号')[_] if tdsData.get('供应方案文号') else ''
                        # Crawl metadata.
                        crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
                        url = response.url
                        md5Mark = encrypt_md5(url + BT_47 + LYSJ_48)
                        if DUPLICATE_SWITCH:
                            if self.redisClient.isExist(md5Mark):
                                self.duplicateUrl += 1
                        if self.duplicateUrl < 50:
                            self._writeRow([
                                BT_47, LY_55, LYSJ_48, XH_49, PZWH_50, YDDW_51, GDFS_52,
                                PZSJ_53, WZ_54, YT_55, MJ_56, RJL_57, GYWAFA_58,
                                crawlingTime, url, md5Mark,
                            ])
                            yield
                        else:
                            self.crawler.engine.close_spider(
                                self, 'response msg info %s, job duplicated!' % response.url)
                except Exception as e:
                    # BUG FIX: was a silent `pass` — keep best-effort behaviour
                    # but record why the table could not be parsed.
                    self.log(f'详情页表格解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                             level=logging.ERROR)
            else:
                # Plain-text page: extract fields by regex.
                XH_49 = reFunction(f'宗地编号([{self.reStr}]*)地块位置', items)
                YDDW_51 = reFunction(f'受让单位([{self.reStr}]*)备注:', items)
                WZ_54 = reFunction(f'地块位置([{self.reStr}]*)土地用途', items)
                YT_55 = reFunction(f'土地用途([{self.reStr}]*)土地面积', items)
                MJ_56 = reFunction(f'土地面积\(公顷\)([{self.reStr}]*)项目名称', items)
                # Crawl metadata.
                crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
                url = response.url
                md5Mark = encrypt_md5(url + BT_47 + LYSJ_48)
                if DUPLICATE_SWITCH:
                    if self.redisClient.isExist(md5Mark):
                        self.duplicateUrl += 1
                if self.duplicateUrl < 50:
                    self._writeRow([
                        BT_47, LY_55, LYSJ_48, XH_49, PZWH_50, YDDW_51, GDFS_52,
                        PZSJ_53, WZ_54, YT_55, MJ_56, RJL_57, GYWAFA_58,
                        crawlingTime, url, md5Mark,
                    ])
                    yield
                else:
                    self.crawler.engine.close_spider(
                        self, 'response msg info %s, job duplicated!' % response.url)
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
class shanxiTransformNoticeSpider(CrawlSpider):
    """Spider for transfer notices (出让公告) on the Shanxi Provincial
    Department of Natural Resources site (zrzyt.shanxi.gov.cn).

    Crawls the paginated index, follows each notice to a detail page, extracts
    parcel fields via regex/table parsing and appends rows to a CSV file.
    """
    # TODO
    name = 'shanxiTransformNotice'
    # Timestamp used to decide when cookies should be rotated.
    COOKIES_SWITCH_TIME = datetime.datetime.now()

    def __new__(cls):
        # Singleton: first instantiation also prepares the log/CSV file paths
        # and writes the CSV header when the output file does not yet exist.
        if not hasattr(cls, "instance"):
            cls.instance = super(shanxiTransformNoticeSpider, cls).__new__(cls)
            # TODO
            pathPage = os.path.join(
                os.path.abspath(
                    os.path.dirname(os.path.dirname(
                        os.path.dirname(__file__)))),
                'Logs/shanxiTransformNoticePage.txt')
            # TODO
            cls.pathDetail = os.path.join(
                os.path.abspath(
                    os.path.dirname(
                        os.path.dirname(
                            os.path.dirname(os.path.dirname(__file__))))),
                'data/山西省自然资源厅_出让公告_山西.csv')
            cls.filePage = open(pathPage, 'w+')
            # NOTE(review): both branches open the detail file identically;
            # only the header write differs for a fresh file.
            if os.path.exists(cls.pathDetail):
                cls.fileDetail = open(cls.pathDetail, 'a+')
            else:
                cls.fileDetail = open(cls.pathDetail, 'a+')
                with open(cls.pathDetail, 'a+') as fp:
                    # TODO  -- CSV header row (column names, Chinese)
                    fp.write(
                        """公告类型,文件标题,时间,来源,正文标题,宗地编号,土地位置,出让面积(m2),绿化用地,道路用地,土地用途,岀让年限,容积率,建筑密度,绿地率,建筑空间,起始价(万元),保证金(万元),竞价幅度(万元),报名时间起止日期,挂牌开始时间,挂牌截止时间,保证金到账截止时间,联系地址,联系人,联系电话,爬取地址url,唯一标识,\n"""
                    )
        return cls.instance

    def __init__(self):
        """Wire the close signal, the Redis de-dup client and request config."""
        super(shanxiTransformNoticeSpider, self).__init__()
        dispatcher.connect(self.CloseSpider, signals.spider_closed)
        # TODO
        self.redisClient = RedisClient('shanxi', 'shanxiTransformNotice')
        # Counter of already-seen detail pages; spider closes once it passes 50.
        self.duplicateUrl = 0
        self.targetUrl = 'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/crgg/index_{}.shtml'
        self.header = {'User-Agent': random.choice(agent_list)}
        # Character class reused inside the extraction regexes below.
        self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'

    def CloseSpider(self):
        '''
        Close the spider: release both file handles and log normal shutdown.
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('爬虫正常关闭,At{}'.format(datetime.datetime.now()),
                 level=logging.INFO)

    def CloseExceptionSpider(self):
        '''
        Close the spider after a response timeout; logged at ERROR level.
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('响应超时爬虫正常关闭,At{}'.format(datetime.datetime.now()),
                 level=logging.ERROR)

    def start_requests(self):
        '''
        Emit one GET per index page (1..32); the unnumbered first index page
        is requested in the try/else once the loop finished without error.
        '''
        try:
            for page in range(1, 33):
                yield Request(
                    self.targetUrl.format(page),
                    method='GET',
                    headers=self.header,
                    callback=self.parse_index,
                    meta={'page': page},
                    # headers={'Content-Type': 'application/json'},
                    dont_filter=True)
        except Exception as e:
            self.log(
                f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
            raise IntegrationException('爬取阻塞,请重启')
        else:
            yield Request(
                'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/crgg/index.shtml',
                method='GET',
                headers=self.header,
                callback=self.parse_index,
                meta={
                    'page': 'index',
                    'priority': 1
                },
                # headers={'Content-Type': 'application/json'},
                dont_filter=True)

    def parse_index(self, response):
        '''
        Parse one index page: collect every notice link and schedule its
        detail request (title forwarded through meta).
        :param response:
        :return:
        '''
        try:
            page = response.meta.get('page')
            datas = Selector(text=response.body.decode('utf-8'))
            dataItems = datas.xpath('//ul[@class="zwgk_right_content"]/li')
            for dataItem in dataItems:
                title = dataItem.xpath('a/text()').extract_first()
                # href starts with '.'; strip it before joining to the base URL.
                url = 'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/crgg' + dataItem.xpath(
                    'a/@href').extract_first()[1:]
                yield Request(
                    url,
                    method='GET',
                    callback=self.parse_detail,
                    meta={
                        'page': page,
                        'title': title,
                    },
                    # body=requests_data, headers={'Content-Type': 'application/json'}
                    dont_filter=True,
                )
        except Exception as e:
            self.log(f'列表页解析失败{page}, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)

    def parse_detail(self, response):
        # TODO 主动关闭爬虫问题 (issue: spider closes itself on duplicates)
        #
        # Extracts parcel fields from a notice detail page. Four page layouts
        # are handled: a plain <table>, and three free-text formats matched by
        # marker phrases; each produces one CSV row per parcel.
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            # Field slots (empty-string defaults so the CSV row is stable).
            GGLX_1 = ''
            WJBT_2 = ''
            SJ_3 = ''
            LY_4 = ''
            ZWBT_5 = ''
            ZDBH_6 = ''
            TDWZ_7 = ''
            CRMJ_8 = ''
            LHYD_9 = ''
            DLYD_10 = ''
            TDYT_11 = ''
            CRNX_12 = ''
            RJL_13 = ''
            JZMD_14 = ''
            LDL_15 = ''
            JZKJ_16 = ''
            QSJ_17 = ''
            BZJ_18 = ''
            JJFD_19 = ''
            BMRQ_20 = ''
            GPRQ_21 = ''
            GPJZSJ_22 = ''
            BZJDZSJ_23 = ''
            LXDZ_24 = ''
            LXR_25 = ''
            LXDH_26 = ''
            # TODO 共有字段 reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # NOTE(review): the line above looks like leftover commented-out
            # template code; `LY` is undefined in this scope, so it cannot be
            # live code — confirm against the original repository.
            # Notice type (constant for this spider)
            GGLX_1 = '出让公告'
            # File title (from the index page)
            WJBT_2 = response.meta.get('title')
            # Publication time
            SJ_3 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()'
            ).extract_first()
            # Source
            LY_4 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()'
            ).extract_first()
            # Body title
            ZWBT_5 = data.xpath(
                '//div[@class="ztzx_frame_content"]/div[1]/text()'
            ).extract_first()
            # Crawl date
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # Crawled URL
            url = response.url
            # Unique row key (md5 of url + title + time)
            md5Mark = encrypt_md5(url + WJBT_2 + SJ_3)
            # Application period — chain of regex fallbacks, first match wins.
            BMRQ_20 = reFunction(f'报名申请时间:\s*([\w]*);', items) if reFunction(
                f'报名申请时间:\s*([\w]*);', items
            ) else reFunction(f'申请人可于(\w*),向我局提交书面申请', items) if reFunction(
                f'申请人可于(\w*),向我局提交书面申请', items
            ) else reFunction(f'申请时间为:(\w*)', items) if reFunction(
                f'申请时间为:(\w*)', items) else reFunction(f'申请人可于(\w*)到', items)
            GPTime = reFunction(f'网上挂牌(报价)时间:\s*([\w]*)', items) if reFunction(
                f'网上挂牌(报价)时间:\s*([\w]*)', items) else reFunction(
                    f'挂牌时间为:\s*([\w]*)', items)
            try:
                if GPTime:
                    # Listing start time
                    GPRQ_21 = GPTime.split('至')[0]
                    # Listing end time
                    GPJZSJ_22 = GPTime.split('至')[1]
                else:
                    GPRQ_21 = reFunction(f'挂牌时间为:\s*([\s\S]*)',
                                         reFunction('六、([\s\S]*)七、', items))
                    GPJZSJ_22 = reFunction(f'挂牌时间为:\s*([\s\S]*)',
                                           reFunction('六、([\s\S]*)七、', items))
            except Exception as e:
                self.log(f'详情页数据挂牌时间解析失败, 请求:{response.url}, 信息: {e}',
                         level=logging.DEBUG)
                GPRQ_21 = ''
                GPJZSJ_22 = ''
            # Deposit-arrival deadline (three regex variants)
            BZJDZSJ_23 = reFunction(
                f'保证金到账截止时间为:\s*([\w]*)', items) if reFunction(
                    f'保证金到账截止时间为:\s*([\w]*)', items) else reFunction(
                        f'保证金交纳截止时间:\s*([\w]*)', items) if reFunction(
                            f'保证金交纳截止时间:\s*([\w]*)', items) else reFunction(
                                f'保证金的截止时间为\s*([\w]*)', items)
            # Contact address
            LXDZ_24 = reFunction(
                '联系地址:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                items) if reFunction(
                    f'联系地址:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                    items) else reFunction(
                        '单位地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)', items)
            # Contact person
            LXR_25 = reFunction(
                f'联\s系\s人:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', items)
            # Contact phone
            LXDH_26 = reFunction(
                f'联系电话:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', items)
            if '挂牌出让宗地的基本情况和规划指标等要求' not in items and '宗地编号' not in items:
                # Layout 1: data is in an HTML <table>. The table may carry a
                # two-row header (a colspan cell over sub-headers); the code
                # below flattens it into a single header row before parsing.
                soup = BeautifulSoup(response.body.decode('utf-8'))
                table = soup.find('table')
                try:
                    tdReplace = table.tbody.find_all('tr')[0].find(
                        'td', colspan='4') if table.tbody.find_all('tr')[0].find(
                            'td', colspan='4') else table.tbody.find_all(
                                'tr')[0].find('td', colspan="2")
                    number = table.tbody.find_all('tr')[0].index(tdReplace)
                    tdList = table.tbody.find_all('tr')[1].find_all('td')
                    for _ in range(1, len(tdList) + 1):
                        table.tbody.find_all('tr')[0].insert(
                            number + _, tdList[_ - 1])
                    tdReplace.extract()
                    table.tbody.find_all('tr')[1].extract()
                except:
                    # Fallback: same flattening but the header lives in <thead>.
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find('table')
                    tdReplace = table.thead.find_all('tr')[0].find(
                        'td', colspan='4') if table.thead.find_all('tr')[0].find(
                            'td', colspan='4') else table.thead.find_all(
                                'tr')[0].find('td', colspan="2")
                    number = table.thead.find_all('tr')[0].index(tdReplace)
                    tdList = table.thead.find_all('tr')[1].find_all('td')
                    for _ in range(1, len(tdList) + 1):
                        table.thead.find_all('tr')[0].insert(
                            number + _, tdList[_ - 1])
                    tdReplace.extract()
                    table.thead.find_all('tr')[1].extract()
                    table.tbody.insert(
                        0, table.thead.find_all('tr')[0])  # move thead content into tbody
                    table.thead.extract()
                htmlTable = htmlTableTransformer()
                try:
                    tdData = htmlTable.tableTrTdRegulationToList(table)
                    if not tdData and 'thead' in items:
                        # Nothing extracted — retry assuming a <thead> header.
                        soup = BeautifulSoup(response.body.decode('utf-8'))
                        table = soup.find('table')
                        tdReplace = table.thead.find_all('tr')[0].find(
                            'td', colspan='4') if table.thead.find_all('tr')[0].find(
                                'td', colspan='4') else table.thead.find_all(
                                    'tr')[0].find('td', colspan="2")
                        number = table.thead.find_all('tr')[0].index(tdReplace)
                        tdList = table.thead.find_all('tr')[1].find_all('td')
                        for _ in range(1, len(tdList) + 1):
                            table.thead.find_all('tr')[0].insert(
                                number + _, tdList[_ - 1])
                        tdReplace.extract()
                        table.thead.find_all('tr')[1].extract()
                        table.tbody.insert(
                            0, table.thead.find_all('tr')[0])  # move thead content into tbody
                        table.thead.extract()
                        htmlTable = htmlTableTransformer()
                except:
                    tdData = {}
                # One CSV row per table row; NOTE(review): the loop variable
                # `_` is re-used by the inner serialization loop below — safe
                # only because `for` re-assigns it each iteration.
                for _ in range(len(list(tdData.values())[0])):
                    # Parcel number
                    ZDBH_6 = tdData.get('编号')[_] if tdData.get('编号') else ''
                    # Land location
                    TDWZ_7 = tdData.get('土地位置')[_] if tdData.get(
                        '土地位置') else ''
                    # Transfer area (m2): two possible column names
                    CRMJ_8_0 = tdData.get('土地面积')
                    CRMJ_8_1 = tdData.get('土地面积(平方米)')
                    CRMJ_8_ = list(filter(None, [CRMJ_8_0, CRMJ_8_1]))
                    CRMJ_8 = CRMJ_8_[0][_] if CRMJ_8_ else ''
                    # Land use
                    TDYT_11 = tdData.get('土地用途')[_] if tdData.get(
                        '土地用途') else ''
                    # Transfer term (years)
                    CRNX_12 = tdData.get('出让年限(年)')[_] if tdData.get(
                        '出让年限(年)') else ''
                    # Plot ratio
                    RJL_13 = tdData.get('容积率')[_] if tdData.get(
                        '容积率') else tdData.get('容 积 率')[_] if tdData.get(
                            '容 积 率') else ''
                    # Building density
                    # JZMD_14
                    # Green-space ratio
                    LDL_15 = tdData.get('绿化率')[_] if tdData.get('绿化率') else ''
                    # Building height limit
                    JZKJ_16 = tdData.get('控制高度(m)')[_] if tdData.get(
                        '控制高度(m)') else tdData.get('建筑限高(m)')[_] if tdData.get(
                            '建筑限高(m)') else ''
                    # Starting price (10k CNY)
                    QSJ_17 = tdData.get('挂牌起始价(万元)')[_] if tdData.get(
                        '挂牌起始价(万元)') else ''
                    # Deposit (10k CNY)
                    BZJ_18 = tdData.get('竞买保证金(万元)')[_] if tdData.get(
                        '竞买保证金(万元)') else tdData.get(
                            '竞买保证金(万元)')[_] if tdData.get('竞买保证金(万元)') else ''
                    # Bid increment (10k CNY)
                    JJFD_19 = tdData.get('増价幅度(万元/次)')[_] if tdData.get(
                        '増价幅度(万元/次)') else ''
                    # De-dup check against Redis
                    if DUPLICATE_SWITCH:
                        if self.redisClient.isExist(md5Mark):
                            # seen before — count it
                            self.duplicateUrl += 1
                    if self.duplicateUrl < 50:
                        if ZDBH_6 and TDYT_11:
                            # passed the duplicate check — store the row
                            csvFile = [
                                GGLX_1, WJBT_2, SJ_3, LY_4, ZWBT_5, ZDBH_6,
                                TDWZ_7, CRMJ_8, LHYD_9, DLYD_10, TDYT_11,
                                CRNX_12, RJL_13, JZMD_14, LDL_15, JZKJ_16,
                                QSJ_17, BZJ_18, JJFD_19, BMRQ_20, GPRQ_21,
                                GPJZSJ_22, BZJDZSJ_23, LXDZ_24, LXR_25,
                                LXDH_26, crawlingTime, url, md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n', '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            # NOTE(review): bare yield emits None (ignored by
                            # Scrapy); it only marks this callback a generator.
                            yield
                    else:
                        # Too many duplicates — stop the whole spider.
                        self.crawler.engine.close_spider(
                            self,
                            'response msg info %s, job duplicated!' % response.url)
            # TODO 判断 (layout dispatch)
            elif '挂牌出让宗地的基本情况和规划指标等要求' in items:
                # Layout 2: free text, one numbered paragraph per parcel.
                for item in re.split(
                        '\d、',
                        reFunction('一、挂牌出让宗地的基本情况和规划指标等要求:([\s\S]*)二、', items)):
                    # TODO
                    if not item.strip():
                        continue
                    # Parcel number
                    ZDBH_6 = reFunction(
                        f'^([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)宗地位于', item)
                    # Land location
                    TDWZ_7 = reFunction(
                        f'宗地位于([()\w\.:: \(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # Transfer area (m2)
                    CRMJ_8 = reFunction(
                        f'土地出让面积([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # Land use
                    TDYT_11 = reFunction(
                        f'宗地规划用途为([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # Transfer term
                    CRNX_12 = reFunction(
                        f'宗地土地出让年期([()\w\.:: —\(\),〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)。',
                        item)
                    # Plot ratio
                    RJL_13 = reFunction(
                        f'容积率([()\w\.:: \(\)%〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # Building density
                    JZMD_14 = reFunction(
                        f'建筑密度([()\w\.:: \(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # Green-space ratio
                    LDL_15 = reFunction(
                        f'绿地率([()\w\.:: \(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # Building envelope
                    JZKJ_16 = reFunction(
                        f'建筑空间([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # Starting price (10k CNY)
                    QSJ_17 = reFunction(
                        f'本宗地起始价([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # Deposit (10k CNY)
                    BZJ_18 = reFunction(
                        f'竞买保证金([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)', item)
                    # Bid increment (10k CNY)
                    JJFD_19 = reFunction(
                        f'增价幅度([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)', item)
                    # De-dup check against Redis
                    if DUPLICATE_SWITCH:
                        if self.redisClient.isExist(md5Mark):
                            # seen before — count it
                            self.duplicateUrl += 1
                    if self.duplicateUrl < 50:
                        if ZDBH_6 and TDYT_11:
                            # passed the duplicate check — store the row
                            csvFile = [
                                GGLX_1, WJBT_2, SJ_3, LY_4, ZWBT_5, ZDBH_6,
                                TDWZ_7, CRMJ_8, LHYD_9, DLYD_10, TDYT_11,
                                CRNX_12, RJL_13, JZMD_14, LDL_15, JZKJ_16,
                                QSJ_17, BZJ_18, JJFD_19, BMRQ_20, GPRQ_21,
                                GPJZSJ_22, BZJDZSJ_23, LXDZ_24, LXR_25,
                                LXDH_26, crawlingTime, url, md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n', '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self,
                            'response msg info %s, job duplicated!' % response.url)
            elif '挂牌出让地块基本情况' in items and '宗地编号' in items:
                # Layout 3: free text split on the '宗地编号' marker.
                for item in [
                        '宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items)
                        [0].split('宗地编号')[1:]
                ]:
                    # Parcel number
                    ZDBH_6 = reFunction(
                        f'宗地编号为([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # Land location
                    TDWZ_7 = reFunction(
                        f'该地块([()\w\.:: —\(\)〔〕%㎡≤≥《》,\-\/\%;、\.﹪]*)。出让面积',
                        item)
                    # Transfer area (m2)
                    CRMJ_8 = reFunction(
                        f'出让面积:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # Green land
                    LHYD_9 = reFunction(
                        f'绿化用地:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # Road land
                    DLYD_10 = reFunction(
                        f'道路用地:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # Land use
                    TDYT_11 = reFunction(
                        f'用途:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # Transfer term
                    CRNX_12 = reFunction(
                        f'出让年限:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # Plot ratio
                    RJL_13 = reFunction(
                        f'容积率:*([()\w\.:: ,—\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # Building density
                    JZMD_14 = reFunction(
                        f'建筑密度:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # Green-space ratio (two phrasings)
                    LDL_15 = reFunction(
                        f'绿地率:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item
                    ) if reFunction(
                        f'绿地率:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);',
                        item) else reFunction(
                            f'绿地率(%)([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);',
                            item)
                    # Starting price (10k CNY)
                    QSJ_17 = reFunction(
                        f'起始价为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*),', item)
                    # Deposit (10k CNY)
                    BZJ_18 = reFunction(
                        f'竞买保证金为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*),', item)
                    # Bid increment (10k CNY)
                    JJFD_19 = reFunction(
                        f'竞价幅度为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*)。', item)
                    # De-dup check against Redis
                    if DUPLICATE_SWITCH:
                        if self.redisClient.isExist(md5Mark):
                            # seen before — count it
                            self.duplicateUrl += 1
                    if self.duplicateUrl < 50:
                        if ZDBH_6 and TDYT_11:
                            # passed the duplicate check — store the row
                            csvFile = [
                                GGLX_1, WJBT_2, SJ_3, LY_4, ZWBT_5, ZDBH_6,
                                TDWZ_7, CRMJ_8, LHYD_9, DLYD_10, TDYT_11,
                                CRNX_12, RJL_13, JZMD_14, LDL_15, JZKJ_16,
                                QSJ_17, BZJ_18, JJFD_19, BMRQ_20, GPRQ_21,
                                GPJZSJ_22, BZJDZSJ_23, LXDZ_24, LXR_25,
                                LXDH_26, crawlingTime, url, md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n', '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self,
                            'response msg info %s, job duplicated!' % response.url)
            else:
                if '宗地编号' in items and '地块基本情况' not in items:
                    # Layout 4a: key:value free text, one chunk per parcel.
                    for item in [
                            '宗地编号' + _ for _ in re.findall(
                                '一([\s\S]*)二、', items)[0].split('宗地编号')[1:]
                    ]:
                        # Parcel number
                        ZDBH_6 = reFunction(
                            f'宗地编号:*\s*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Land location
                        TDWZ_7 = reFunction(
                            f'宗地坐落:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Transfer area (m2)
                        CRMJ_8 = reFunction(
                            f'宗地\s*总*面积:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Land use
                        TDYT_11 = reFunction(
                            f'土地用途[明细]*:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Transfer term
                        CRNX_12 = reFunction(
                            f'出让年限:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Plot ratio
                        RJL_13 = reFunction(
                            f'容积率:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Building density
                        JZMD_14 = reFunction(
                            f'建筑密度\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Green-space ratio (two phrasings)
                        LDL_15 = reFunction(
                            f'绿[地化]率\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item
                        ) if reFunction(
                            f'绿[地化]率\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item
                        ) else reFunction(
                            f'绿地率(%)\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Building height limit
                        JZKJ_16 = reFunction(
                            f'建筑限高\(米\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Starting price (10k CNY)
                        QSJ_17 = reFunction(
                            f'起始价:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Deposit (10k CNY)
                        BZJ_18 = reFunction(
                            f'保证金:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Bid increment (10k CNY)
                        JJFD_19 = reFunction(
                            f'加价幅度:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Listing start time
                        GPRQ_21 = reFunction(
                            f'挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Listing end time
                        GPJZSJ_22 = reFunction(
                            f'挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # Contact address
                        LXDZ_24 = reFunction(
                            f'联系地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items).split('联')[0] if reFunction(
                                f'联系地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                                items) else ''
                        # Contact person
                        LXR_25 = reFunction(
                            f'联\s系\s人:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items
                        ).split('联')[0] if reFunction(
                            f'联\s系\s人:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items) else ''
                        # Contact phone
                        LXDH_26 = reFunction(
                            f'联系电话:\s*([()\d\.:: \(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items)
                        # De-dup check against Redis
                        if DUPLICATE_SWITCH:
                            if self.redisClient.isExist(md5Mark):
                                # seen before — count it
                                self.duplicateUrl += 1
                        if self.duplicateUrl < 50:
                            if ZDBH_6 and TDYT_11:
                                # passed the duplicate check — store the row
                                csvFile = [
                                    GGLX_1, WJBT_2, SJ_3, LY_4, ZWBT_5, ZDBH_6,
                                    TDWZ_7, CRMJ_8, LHYD_9, DLYD_10, TDYT_11,
                                    CRNX_12, RJL_13, JZMD_14, LDL_15, JZKJ_16,
                                    QSJ_17, BZJ_18, JJFD_19, BMRQ_20, GPRQ_21,
                                    GPJZSJ_22, BZJDZSJ_23, LXDZ_24, LXR_25,
                                    LXDH_26, crawlingTime, url, md5Mark,
                                ]
                                results = ''
                                for _ in csvFile:
                                    try:
                                        if _ and _ != '|' * len(_):
                                            results += _.replace(
                                                ',', ' ').replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                        else:
                                            results += ','
                                    except Exception as e:
                                        results += ','
                                        self.log(
                                            f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                            level=logging.ERROR)
                                with open(self.pathDetail, 'a+') as fp:
                                    fp.write(results)
                                    fp.write('\n')
                                self.log(f'数据获取成功', level=logging.INFO)
                                yield
                        else:
                            self.crawler.engine.close_spider(
                                self,
                                'response msg info %s, job duplicated!'
                                % response.url)
                elif '地块基本情况' in items:
                    # todo
                    # Layout 4b: simple table, no header surgery needed.
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find('table')
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.tableTrTdRegulationToList(table)
                    for _ in range(len(list(tdData.values())[0])):
                        # Parcel number
                        ZDBH_6 = tdData.get('编号')[_] if tdData.get(
                            '编号') else ''
                        # Land location
                        TDWZ_7 = tdData.get('地块位置')[_] if tdData.get(
                            '地块位置') else ''
                        # Transfer area (mu)
                        CRMJ_8 = tdData.get('土地面积(亩)')[_] if tdData.get(
                            '土地面积(亩)') else ''
                        # Land use
                        TDYT_11 = tdData.get('土地用途')[_] if tdData.get(
                            '土地用途') else ''
                        # De-dup check against Redis
                        if DUPLICATE_SWITCH:
                            if self.redisClient.isExist(md5Mark):
                                # seen before — count it
                                self.duplicateUrl += 1
                        if self.duplicateUrl < 50:
                            if ZDBH_6 and TDYT_11:
                                # passed the duplicate check — store the row
                                csvFile = [
                                    GGLX_1, WJBT_2, SJ_3, LY_4, ZWBT_5, ZDBH_6,
                                    TDWZ_7, CRMJ_8, LHYD_9, DLYD_10, TDYT_11,
                                    CRNX_12, RJL_13, JZMD_14, LDL_15, JZKJ_16,
                                    QSJ_17, BZJ_18, JJFD_19, BMRQ_20, GPRQ_21,
                                    GPJZSJ_22, BZJDZSJ_23, LXDZ_24, LXR_25,
                                    LXDH_26, crawlingTime, url, md5Mark,
                                ]
                                results = ''
                                for _ in csvFile:
                                    try:
                                        if _ and _ != '|' * len(_):
                                            results += _.replace(
                                                ',', ' ').replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                        else:
                                            results += ','
                                    except Exception as e:
                                        results += ','
                                        self.log(
                                            f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                            level=logging.ERROR)
                                with open(self.pathDetail, 'a+') as fp:
                                    fp.write(results)
                                    fp.write('\n')
                                self.log(f'数据获取成功', level=logging.INFO)
                                yield
                        else:
                            self.crawler.engine.close_spider(
                                self,
                                'response msg info %s, job duplicated!'
                                % response.url)
        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
class nanjingLandDetailSpider(CrawlSpider):
    """Spider for parcel detail pages on the Nanjing state-owned land
    online trading system (jy.landnj.cn).

    Crawls the paginated default listing, follows each parcel link and writes
    one CSV row per detail page.
    """
    name = 'nanjingLandDetail'
    # Timestamp used to decide when cookies should be rotated.
    COOKIES_SWITCH_TIME = datetime.datetime.now()

    def __new__(cls):
        # Singleton: first instantiation also prepares the log/CSV file paths
        # and writes the CSV header when the output file does not yet exist.
        if not hasattr(cls, "instance"):
            cls.instance = super(nanjingLandDetailSpider, cls).__new__(cls)
            pathPage = os.path.join(os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))), 'Logs/nanjingLandDetailPage.txt')
            cls.pathDetail = os.path.join(os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))), 'data/南京市国有建设用地使用权公开出让网上交易系统_地块详情_南京.csv')
            cls.filePage = open(pathPage, 'w+')
            # NOTE(review): both branches open the detail file identically;
            # only the header write differs for a fresh file.
            if os.path.exists(cls.pathDetail):
                cls.fileDetail = open(cls.pathDetail, 'a+')
            else:
                cls.fileDetail = open(cls.pathDetail, 'a+')
                with open(cls.pathDetail, 'a+') as fp:
                    # CSV header row (column names, Chinese)
                    fp.write("""标题,公告编号,地块编号,地块名称,容积率,用地性质,规划面积,实际岀让面积,公告发布时间,保证金金额,挂牌起始价,竟争保障房建设资金起始价,最高限价,加价幅度,报名开始时时间,报名截至时间,报价截至时间,保证金截至时间,限时竟价开始时间,最新报价,最近报价时间,竟得者,竟得价,报价轮次,报价人,金额报价,单位地价,报价时间,爬取地址url,唯一标识,\n""")
        return cls.instance

    def __init__(self):
        """Wire the close signal, the Redis de-dup client and request config."""
        super(nanjingLandDetailSpider, self).__init__()
        dispatcher.connect(self.CloseSpider, signals.spider_closed)
        self.redisClient = RedisClient('nanjing', 'LandDetail')
        # Counter of already-seen detail pages; spider closes once it passes 50.
        self.duplicateUrl = 0
        self.targetUrl = 'https://jy.landnj.cn/default.aspx?page={}'
        self.header = {'User-Agent': random.choice(agent_list)}
        # Character class reused inside the extraction regexes below.
        self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'

    def CloseSpider(self):
        '''
        Close the spider: release both file handles and log normal shutdown.
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.INFO)

    def CloseExceptionSpider(self):
        '''
        Close the spider after a response timeout; logged at ERROR level.
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('响应超时爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.ERROR)

    def start_requests(self):
        '''
        Request listing pages 1..4 with descending priority.
        (Original note: fetch total page count first, crawl by priority, and
        rotate the session ID every ten requests — rotation not visible here.)
        '''
        try:
            for page in range(1, 5):
                # Lower page number -> higher Scrapy priority.
                priority = 5 - int(page)
                yield Request(self.targetUrl.format(page),
                              method='GET',
                              headers=self.header,
                              priority=priority,
                              callback=self.parse_index,
                              meta={'page': page, 'priority': priority},
                              # headers={'Content-Type': 'application/json'},
                              # dont_filter=True
                              )
        except Exception as e:
            self.log(f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
            raise IntegrationException('爬取阻塞,请重启')

    def parse_index(self, response):
        '''
        Parse one listing page: rows 2..11 of the GridView table, one detail
        request per parcel link.
        :param response:
        :return:
        '''
        try:
            page = response.meta.get('page')
            datas = Selector(text=response.body.decode('utf-8'))
            dataItems = datas.xpath('//*[@id="ctl00_ContentPlaceHolder1_GridView1"]/tr[position()>1 and position()<12]')
            for dataItem in dataItems:
                # title = dataItem.xpath('a/text()').extract_first()
                url = 'https://jy.landnj.cn' + dataItem.xpath('td[1]/a/@href').extract_first()
                yield Request(url,
                              method='GET',
                              callback=self.parse_detail,
                              meta={
                                  'page': page,
                                  # 'title': title,
                              },
                              # body=requests_data, headers={'Content-Type': 'application/json'}
                              dont_filter=True,
                              )
        except Exception as e:
            self.log(f'列表页解析失败{page}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)

    def parse_detail(self, response):
        # Extract all 28 parcel fields from one detail page and append a CSV
        # row, unless the Redis duplicate counter says we are done.
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')
            # TODO shared fields
            # Title
            BT_1 = ''.join(data.xpath('//*[@id="ctl00_ContentPlaceHolder1_UpdatePanel2"]/div/span/text()').extract())
            # Notice number
            GGBH_2 = reFunction(f'公告编号:\s*([{self.reStr}]*)\s',items)
            # Parcel number
            DKBH_3 = reFunction(f'地块编号:\s*([{self.reStr}]*)\s',items)
            # Parcel name
            DKMC_4 = reFunction(f'地块名称:\s*([{self.reStr}]*)\s',items)
            # Plot ratio
            RJL_5 = reFunction(f'容积率:\s*([{self.reStr}]*)\s',items)
            # Land-use nature
            YDXZ_6 = reFunction(f'用地性质:\s*([{self.reStr}]*)\s',items)
            # Planned area
            GHMJ_7 = reFunction(f'规划面积:\s*([{self.reStr}]*)\s',items)
            # Actual transfer area
            SJCRMJ_8 = reFunction(f'实际出让面积:\s*([{self.reStr}]*)\s',items)
            # Notice publication time
            GGFBSJ_9 = reFunction(f'公告发布时间:\s*([{self.reStr}]*)\s',items)
            # Deposit amount
            BZJJE_10 = reFunction(f'保证金金额:\s*([{self.reStr}]*)\s',items)
            # Listing starting price
            GPQSJ_11 = reFunction(f'挂牌起始价:\s*([{self.reStr}]*)\s',items)
            # Affordable-housing fund starting price
            JZBZ_12 = reFunction(f'竞争保障房建设资金起始价:\s*([{self.reStr}]*)\s',items)
            # Price cap
            ZGXJ_13 = reFunction(f'最高限价:\s*([{self.reStr}]*)\s',items)
            # Bid increment
            JJFD_14 = reFunction(f'加价幅度:\s*([{self.reStr}]*)\s',items)
            # Registration start time
            BMKS_15 = reFunction(f'报名开始时间:\s*([{self.reStr}]*)\s',items)
            # Registration deadline
            BMJZ_16 = reFunction(f'报名截至时间:\s*([{self.reStr}]*)\s',items)
            # Bidding deadline
            BJJZ_17 = reFunction(f'报价截至时间:\s*([{self.reStr}]*)\s',items)
            # Deposit deadline
            BZJJZ_18 = reFunction(f'保证金截至时间:\s*([{self.reStr}]*)\s',items)
            # Timed-bidding start time
            ZSJJKS_19 = reFunction(f'限时竞价开始时间:\s*([{self.reStr}]*)\s',items)
            # Latest bid
            ZXBJ_20 = reFunction(f'最新报价:\s*([{self.reStr}]*)\s',items)
            # Latest bid time
            # NOTE(review): variable name collides with the prefix of ZXBJ_20
            # by convention only — the two names are distinct.
            ZXBJ_21 = reFunction(f'最新报价时间:\s*([{self.reStr}]*)\s',items)
            # Winning bidder
            JDZ_22 = reFunction(f'竞得者:\s*([{self.reStr}]*)\s',items)
            # Winning price
            ZDJ_23 = reFunction(f'竞得价:\s*([{self.reStr}]*)\s',items)
            # Bid round
            BJLC_24 = data.xpath('//*[@id="ctl00_ContentPlaceHolder1_GVLandPrice"]/tr[2]/td[1]/text()').extract_first()
            # Bidder
            BJR_25 = data.xpath('//*[@id="ctl00_ContentPlaceHolder1_GVLandPrice"]/tr[2]/td[2]/span/text()').extract_first()
            # Bid amount
            JEBJ_26 = data.xpath('//*[@id="ctl00_ContentPlaceHolder1_GVLandPrice"]/tr[2]/td[3]/span/text()').extract_first()
            # Unit land price
            DWDJ_27 = data.xpath('//*[@id="ctl00_ContentPlaceHolder1_GVLandPrice"]/tr[2]/td[4]/span/text()').extract_first()
            # Bid time
            BJSJ_28 = data.xpath('//*[@id="ctl00_ContentPlaceHolder1_GVLandPrice"]/tr[2]/td[5]/text()').extract_first()
            # Crawl date
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # Crawled URL
            url = response.url
            # Unique row key (md5 of url + title + notice number)
            md5Mark = encrypt_md5(url + BT_1 + GGBH_2)
            # De-dup check against Redis
            if DUPLICATE_SWITCH:
                if self.redisClient.isExist(md5Mark):
                    # seen before — count it
                    self.duplicateUrl += 1
            if self.duplicateUrl < 50:
                # passed the duplicate check — store the row
                csvFile = [
                    BT_1,
                    GGBH_2,
                    DKBH_3,
                    DKMC_4,
                    RJL_5,
                    YDXZ_6,
                    GHMJ_7,
                    SJCRMJ_8,
                    GGFBSJ_9,
                    BZJJE_10,
                    GPQSJ_11,
                    JZBZ_12,
                    ZGXJ_13,
                    JJFD_14,
                    BMKS_15,
                    BMJZ_16,
                    BJJZ_17,
                    BZJJZ_18,
                    ZSJJKS_19,
                    ZXBJ_20,
                    ZXBJ_21,
                    JDZ_22,
                    ZDJ_23,
                    BJLC_24,
                    BJR_25,
                    JEBJ_26,
                    DWDJ_27,
                    BJSJ_28,
                    crawlingTime,
                    url,
                    md5Mark,
                ]
                results = ''
                for _ in csvFile:
                    try:
                        if _ and _ != '|' * len(_):
                            # sanitize: commas become spaces, strip CR/LF/nbsp
                            results += _.replace(',', ' ').replace('\n', '').replace('\r', '').replace(r'\xa0', '').replace('\xa0', '') + ','
                        else:
                            results += ','
                    except Exception as e:
                        results += ','
                        self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
                with open(self.pathDetail, 'a+') as fp:
                    fp.write(results)
                    fp.write('\n')
                self.log(f'数据获取成功', level=logging.INFO)
                # NOTE(review): bare yield emits None (ignored by Scrapy); it
                # only marks this callback a generator.
                yield
            else:
                # Too many duplicates — stop the whole spider.
                self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url)
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
class longyanTransformNoticeSpider(CrawlSpider): # TODO name = 'longyanTransformNotice' # 配置更换cookies时间 COOKIES_SWITCH_TIME = datetime.datetime.now() def __new__(cls): if not hasattr(cls, "instance"): cls.instance = super(longyanTransformNoticeSpider, cls).__new__(cls) # TODO pathPage = os.path.join(os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))),'Logs/longyanTransformNoticePage.txt') # TODO cls.pathDetail = os.path.join(os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))), 'data/龙岩市公共资源交易中心_出让公告_龙岩.csv') cls.filePage = open(pathPage, 'w+') if os.path.exists(cls.pathDetail): cls.fileDetail = open(cls.pathDetail, 'a+') else: cls.fileDetail = open(cls.pathDetail, 'a+') with open(cls.pathDetail, 'a+') as fp: # TODO fp.write("""文件标题,信息时间,正文标题,公告编号,出让时间,公告类型,宗地编号,地块编号,宗地位置,宗地坐落,土地用途,规划土地用途,出让年限,使用年限,批准机关及文号,规划用地面积〔m2),规划面积(m2),出让面积(m2),出让用地面积(m2),宗地出让面积,建筑密度,容积率,绿地率,绿地率(%),建筑控制高度(m),建筑控制高度(米),建筑系数(%),投资强度(万元/公顷),土地估价备案号,是否省重点,现状土地条件,竞买保证金,竟买保证金(万元),起叫价,出让起始价(万元),加价幅度,是否设置保留价,挂牌开始时间,挂牌截止时间,获取出让文件时间,提交竞买申请时间,保证金截止时间,确认竞买资格时间,联系地址,联系电话,联系人,保证金账户开户单位/户名,保证金账户账号,保证金账户开户行,出让金账户开户单位/户名,出让金账户开户行,出让金账户账号,爬取地址url,唯一标识,\n""") return cls.instance def __init__(self): super(longyanTransformNoticeSpider, self).__init__() dispatcher.connect(self.CloseSpider, signals.spider_closed) # TODO self.redisClient = RedisClient('longyan', 'longyanTransformNotice') self.duplicateUrl = 0 # TODO self.targetUrl = 'https://www.lyggzy.com.cn/lyztb/tdky/084002/?pageing={}' self.header = {'User-Agent': random.choice(agent_list)} self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡' def CloseSpider(self): ''' 关闭spider :return: ''' self.filePage.close() self.fileDetail.close() self.log('爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.INFO) def CloseExceptionSpider(self): ''' 关闭spider :return: ''' self.filePage.close() self.fileDetail.close() self.log('响应超时爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.ERROR) def 
start_requests(self): ''' ''' try: for page in range(1, 10): yield Request(self.targetUrl.format(page), method='GET', headers=self.header, callback=self.parse_index, meta={'page': page}, # headers={'Content-Type': 'application/json'}, dont_filter=True ) except Exception as e: self.log(f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) raise IntegrationException('爬取阻塞,请重启') def parse_index(self, response): ''' 拿到总页数, :param response: :return: ''' try: page = response.meta.get('page') datas = Selector(text=response.body.decode('utf-8')) dataItems = datas.xpath('//div[@class="r-bd"]/ul[1]/li') for dataItem in dataItems: title = dataItem.xpath('a/text()').extract_first() url = 'https://www.lyggzy.com.cn/' + dataItem.xpath('a/@href').extract_first()[1:] msgTime = dataItem.xpath('span/text()').extract_first() yield Request(url, method='GET', callback=self.parse_detail, meta={ 'page': page, 'title': title, 'msgTime': msgTime, }, # body=requests_data, headers={'Content-Type': 'application/json'} dont_filter=True, ) except Exception as e: self.log(f'列表页解析失败{page}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) def parse_detail(self, response): # TODO 主动关闭爬虫问题 try: data = Selector(text=response.body.decode('utf-8')) items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '') htmlTable = htmlTableTransformer() WJBT_1 = '' XXSJ_2 = '' WBT_3 = '' GGBH_4 = '' CRSJ_5 = '' GGNX_6 = '' ZDBH_7 = '' DKWZ_8 = '' ZDWZ_9 = '' ZDZL_10 = '' TDYT_11 = '' GHTDYT_12 = '' CRNX_13 = '' SYNX_14 = '' PZJGJWH_15 = '' GHYDMJ_16 = '' GHMJ_17 = '' CRMJ_18 = '' CRYDMJ_19 = '' ZDCRMJ_20 = '' JZMD_21 = '' RJL_22 = '' LDL_23 = '' LDL_24 = '' JZKZGD_25 = '' JZKZZGD_26 = '' JZXS_27 = '' TZQD_28 = '' TDGJBAH_29 = '' SFSZD_30 = '' TDXZTJ_31 = '' JMBZJ_32 = '' JMBZJ_72 = '' QJJ_33 = '' CRQSJ_34 = '' JJFD_35 = '' SFSZBLJ_36 = '' GPKSSJ_37 = '' GPJZSJ_38 = '' HQCRWJSJ_39 = '' TJJMSQSJ_40 = '' BZJJZSJ_41 = '' QRJMZGSJ_42 = '' 
LXDZ_43 = '' LXDH_44 = '' LXR_45 = '' BZJZH_86 = '' BZJZH_87 = '' BZJZH_88 = '' CRJZH_97 = '' CRJZH_98 = '' CRJZH_99 = '' # TODO 共有字段 reFunction(f'时间:\s*([{self.reStr}]*)\s', LY) # 文件标题 WJBT_1 = response.meta.get('title').strip() # 信息时间 XXSJ_2 = reFunction('[\d\-]*', data.xpath('//p[@class="sub-cp"]/text()').extract_first()) # 正文标题 WBT_3 = WJBT_1 # 公告编号 GGBH_4 = ''.join(data.xpath('//div[@class="substance"]/p[position() <5]/.//*[contains(text(),"号")]/ancestor::p/.//text()').extract()) # 出让时间 CRSJ_5 = reFunction('定于\s*([()【】\w\.—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*)[,;,、在]', items) # 公告类型 GGNX_6 = '出让公告' # 爬取时间 crawlingTime = time.strftime("%Y-%m-%d", time.localtime()) # 爬取地址url url = response.url # 唯一标识 md5Mark = encrypt_md5(url + WJBT_1 + XXSJ_2) GPSJ_0 = reFunction('挂牌交易期限:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)[\s。]', items) GPSJ_1 = reFunction('申请人可于:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)到', items) GPSJ = GPSJ_0 if GPSJ_0 else GPSJ_1 # 挂牌开始时间、 GPKSSJ_37 = reFunction('挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 挂牌截止时间、 GPJZSJ_38 = reFunction('挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) if GPSJ: try: GPKSSJ_37 = GPSJ.split('至')[0] GPJZSJ_38 = GPSJ.split('至')[1] except: pass # 获取出让文件时间、 HQCRWJSJ_39 = GPSJ_1 # 提交竞买申请时间、 TJJMSQSJ_40 = reFunction('(\d{4}年\d{1,3}月\d{1,3}日(?:[()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)至 \d{4}年\d{1,3}月\d{1,3}日)', items) # 保证金截止时间、 BZJJZSJ_41 = reFunction('(\d{4}年\d{1,3}月\d{1,3}日(?:[()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)至 \d{4}年\d{1,3}月\d{1,3}日)', items) # 确认竞买资格时间 QRJMZGSJ_42 = reFunction('(\d{4}年\d{1,3}月\d{1,3}日(?:[()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)至 \d{4}年\d{1,3}月\d{1,3}日)', items) # 联系地址、 LXDZ_43 = reFunction('联系地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 联系电话、 LXDH_44 = reFunction('联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 联系人、 LXR_45 = reFunction('联\s*系\s*人:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) ZH_0 = reFunction('以下账户:*\s*([\w\.:: 
—\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪\s]*)[一二三四五六七八九123456789]*', items) ZH_1 = reFunction('保证金帐户:*\s*([\w\.:: —\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s*', items) try: if ZH_0: if ZH_0[:2] == '户名': result = re.split('[①②③④]*', ZH_0) # 保证金账户开户单位 / 户名 BZJZH_86 = result[0].replace('户名:','') if result[0] else '' # 保证金账户账号 BZJZH_87 = '|'.join([re.split(',|,', _)[0] for _ in result[1:]]) # 保证金账户开户行 BZJZH_88 = '|'.join([re.split(',|,', _)[-1] for _ in result[1:]]) else: result = re.split('[①②③④]*', ZH_0) # 保证金账户开户单位 / 户名 BZJZH_86 = '|'.join([re.findall('开\s*户\s*行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _)[0] if re.findall('开 户 行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _) else '' for _ in result]) # 保证金账户账号 BZJZH_87 = '|'.join([re.findall('户\s*名:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _)[0] if re.findall('开 户 行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _) else '' for _ in result]) # 保证金账户开户行 BZJZH_88 = '|'.join([re.findall('账\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _)[0] if re.findall('账\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _) else '' for _ in result]) elif ZH_1: # 保证金账户开户单位 / 户名 BZJZH_86 = '|'.join(re.findall('开户[单位名称]*:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', ZH_1)).replace(';') # 保证金账户账号 BZJZH_87 = '|'.join(re.findall('开\s*户\s*行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', ZH_1)).replace(';') # 保证金账户开户行 BZJZH_88 = '|'.join(re.findall('帐\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', ZH_1)).replace(';') except: pass CR = reFunction('出让金帐户:*\s*([\w\.:: —\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s*', items) try: # 出让金账户开户单位 / 户名 CRJZH_97 = '|'.join(re.findall('开户[单位名称]*:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', CR)).replace(';') # 出让金账户开户行 CRJZH_98 = '|'.join(re.findall('开\s*户\s*行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', CR)).replace(';') # 出让金账户账号 CRJZH_99 = '|'.join(re.findall('帐\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', CR)).replace(';') except: pass if '拍卖出让地块的基本情况和规划指标要求' not in items and '备注' not 
in items and '挂牌出让地块的基本情况和规划指标要求' not in items: try: soup = BeautifulSoup(response.body.decode('utf-8')) tables = soup.find_all('table') if '规划用途及主要指标' in items: # 处理费标准的表格 soup = BeautifulSoup(response.body.decode('utf-8')) table = soup.find('table') tdReplace = table.tbody.find_all('tr')[0].find('td', colspan='4') number = table.tbody.find_all('tr')[0].index(tdReplace) tdList = table.tbody.find_all('tr')[1].find_all('td') for _ in range(1, len(tdList) + 1): table.tbody.find_all('tr')[0].insert(number + _, tdList[_ - 1]) tdReplace.extract() [_.extract() for _ in table.tbody.find_all('tr')[1].find_all('td')] table.tbody.find_all('tr')[1].extract() tdData = htmlTable.tableTrTdChangeToList(table) for _ in range(len(list(tdData.values())[0])): # 宗地编号 ZDBH_7 = tdData.get('宗地编号')[_] if tdData.get('宗地编号') else '' # 出让面积(m2) CRMJ_18 = tdData.get('土地面积(㎡)')[_] if tdData.get('土地面积(㎡)') else '' # 容积率 RJL_22 = tdData.get('容积率')[_] if tdData.get('容积率') else '' # 绿地率( %) LDL_24 = tdData.get('绿地率(%)')[_] if tdData.get('绿地率(%)') else '' # 建筑系数( %) JZXS_27 = tdData.get('建筑系数(%)')[_] if tdData.get('建筑系数(%)') else '' # 竟买保证金(万元) JMBZJ_72 = tdData.get('竞买保证金(万元)')[_] if tdData.get('竞买保证金(万元)') else '' # 出让起始价(万元) CRQSJ_34 = tdData.get('挂牌出让起始价(万元)')[_] if tdData.get('挂牌出让起始价(万元)') else '' # 加价幅度、 JJFD_35 = tdData.get('加价幅度')[_] if tdData.get('加价幅度') else '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if ZDBH_7: # 重复效验通过, 存储数据 csvFile = [ WJBT_1, XXSJ_2, WBT_3, GGBH_4, CRSJ_5, GGNX_6, ZDBH_7, DKWZ_8, ZDWZ_9, ZDZL_10, TDYT_11, GHTDYT_12, CRNX_13, SYNX_14, PZJGJWH_15, GHYDMJ_16, GHMJ_17, CRMJ_18, CRYDMJ_19, ZDCRMJ_20, JZMD_21, RJL_22, LDL_23, LDL_24, JZKZGD_25, JZKZZGD_26, JZXS_27, TZQD_28, TDGJBAH_29, SFSZD_30, TDXZTJ_31, JMBZJ_32, JMBZJ_72, QJJ_33, CRQSJ_34, JJFD_35, SFSZBLJ_36, GPKSSJ_37, GPJZSJ_38, HQCRWJSJ_39, TJJMSQSJ_40, BZJJZSJ_41, QRJMZGSJ_42, LXDZ_43, LXDH_44, LXR_45, BZJZH_86, 
BZJZH_87, BZJZH_88, CRJZH_97, CRJZH_98, CRJZH_99, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace( '\r', '').replace(r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url) elif len(tables) <= 3: tdsList = {} for table in tables: td = htmlTable.tableTrTdRegulationToList(table) tdsList.update(td) for _ in range(len(list(tdsList.values())[0])): # 宗地编号 ZDBH_7 = tdsList.get('宗地编号')[_] if tdsList.get('宗地编号') else '' # 地块编号 地块名称 DKWZ_8 = tdsList.get('地块编号')[_] if tdsList.get('地块编号') else tdsList.get('地块编号')[_] if tdsList.get('地块编号') else '' # 宗地位置 ZDWZ_9 = tdsList.get('宗地位置')[_] if tdsList.get('宗地位置') else '' # 宗地坐落 ZDZL_10 = tdsList.get('宗地坐落')[_] if tdsList.get('宗地坐落') else '' # 土地用途 TDYT_11 = tdsList.get('土地用途')[_] if tdsList.get('土地用途') else '' # 规划土地用途 GHTDYT_12 = tdsList.get('规划土地用途')[_] if tdsList.get('规划土地用途') else '' # 出让年限 CRNX_13 = tdsList.get('出让年限')[_] if tdsList.get('出让年限') else '' # 使用年限 SYNX_14 = tdsList.get('使用年限')[_] if tdsList.get('使用年限') else '' # 批准机关及文号 PZJGJWH_15 = tdsList.get('批准机关及文号')[_] if tdsList.get('批准机关及文号') else tdsList.get('批准文号')[_] if tdsList.get('批准文号') else '' # 规划用地面积〔m2) GHYDMJ_16 = tdsList.get('规划用地面积(m2)')[_] if tdsList.get('规划用地面积(m2)') else tdsList.get('用地面积(㎡)')[_] if tdsList.get('用地面积(㎡)') else tdsList.get('规划用地面积(㎡)')[_] if tdsList.get('规划用地面积(㎡)') else '' # 出让面积(m2) CRMJ_18 = tdsList.get('出让面积(㎡)')[_] if tdsList.get('出让面积(㎡)') else '' # 规划面积(m2) GHMJ_17 = tdsList.get('规划面积(㎡)')[_] if tdsList.get('规划面积(㎡)') else '' # 出让用地面积(m2) CRYDMJ_19 = 
tdsList.get('出让用地面积(m2)')[_] if tdsList.get('出让用地面积(m2)') else '' # 宗地出让面积 ZDCRMJ_20 = tdsList.get('宗地出让面积')[_] if tdsList.get('宗地出让面积') else '' # 建筑密度 JZMD_21 = tdsList.get('建筑密度(%)')[_] if tdsList.get('建筑密度(%)') else tdsList.get('建筑密度(%)')[_] if tdsList.get('建筑密度(%)') else '' # 容积率 RJL_22 = tdsList.get('容积率')[_] if tdsList.get('容积率') else '' # 绿地率 LDL_23 = tdsList.get('宗地坐落')[_] if tdsList.get('宗地坐落') else '' # 绿地率( %) LDL_24 = tdsList.get('绿地率')[_] if tdsList.get('绿地率') else tdsList.get('绿地率(%)')[_] if tdsList.get('绿地率(%)') else tdsList.get('绿地率(%)')[_] if tdsList.get('绿地率(%)') else '' # 建筑控制高度(m) JZKZGD_25 = tdsList.get('建筑控制高度(m)')[_] if tdsList.get('建筑控制高度(m)') else '' # 建筑控制高度(米) JZKZZGD_26 = tdsList.get('建筑控制高度(米)')[_] if tdsList.get('建筑控制高度(米)') else '' # 投资强度(万元 / 公顷) TZQD_28 = tdsList.get('投资强度(万元/公顷)')[_] if tdsList.get('投资强度(万元/公顷)') else '' # 竞买保证金 JMBZJ_32 = tdsList.get('竞买保证金')[_] if tdsList.get('竞买保证金') else '' # 出让起始价(万元) CRQSJ_34 = tdsList.get('出让起始价')[_] if tdsList.get('出让起始价') else '' # 竟买保证金(万元) JMBZJ_72 = tdsList.get('竞买保证金(万元)')[_] if tdsList.get('竞买保证金(万元)') else '' # 起叫价 QJJ_33 = tdsList.get('起始价')[_] if tdsList.get('起始价') else tdsList.get('出让起始价')[_] if tdsList.get('出让起始价') else '' # 加价幅度 JJFD_35 = tdsList.get('加价幅度')[_] if tdsList.get('加价幅度') else '' if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if ZDBH_7: # 重复效验通过, 存储数据 csvFile = [ WJBT_1, XXSJ_2, WBT_3, GGBH_4, CRSJ_5, GGNX_6, ZDBH_7, DKWZ_8, ZDWZ_9, ZDZL_10, TDYT_11, GHTDYT_12, CRNX_13, SYNX_14, PZJGJWH_15, GHYDMJ_16, GHMJ_17, CRMJ_18, CRYDMJ_19, ZDCRMJ_20, JZMD_21, RJL_22, LDL_23, LDL_24, JZKZGD_25, JZKZZGD_26, JZXS_27, TZQD_28, TDGJBAH_29, SFSZD_30, TDXZTJ_31, JMBZJ_32, JMBZJ_72, QJJ_33, CRQSJ_34, JJFD_35, SFSZBLJ_36, GPKSSJ_37, GPJZSJ_38, HQCRWJSJ_39, TJJMSQSJ_40, BZJJZSJ_41, QRJMZGSJ_42, LXDZ_43, LXDH_44, LXR_45, BZJZH_86, BZJZH_87, BZJZH_88, CRJZH_97, CRJZH_98, CRJZH_99, crawlingTime, url, 
md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace( '\r', '').replace(r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url) elif len(tables) == 6: # TODO pass except: for item in ['宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items)[0].split('宗地编号')[1:]]: # 宗地编号 ZDBH_7 = reFunction('宗地编号:*\s*([\w\-]*)\s', item).replace('宗地位置', '').replace('地块名称', '') # 宗地坐落 ZDZL_10 = reFunction('宗地坐落:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地用途 TDYT_11 = reFunction('土地用途:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 出让年限 CRNX_13 = reFunction('出让年限:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 宗地出让面积 ZDCRMJ_20 = reFunction('宗地\s*面积:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 建筑密度 JZMD_21 = reFunction('建筑密度\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 容积率 RJL_22 = reFunction('容积率:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 绿地率( %) LDL_24 = reFunction('绿化率\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 建筑控制高度(米) JZKZZGD_26 = reFunction('建筑限高\(米\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 投资强度(万元 / 公顷) TZQD_28 = reFunction('投资强度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 土地估价备案号 TDGJBAH_29 = reFunction('土地估价备案号:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 现状土地条件 TDXZTJ_31 = reFunction('土地现状:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 竞买保证金 JMBZJ_32 = reFunction('保证金:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 起叫价 QJJ_33 = 
reFunction('起始价:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 加价幅度 JJFD_35 = reFunction('加价幅度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 挂牌开始时间、 GPKSSJ_37 = reFunction('挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 挂牌截止时间、 GPJZSJ_38 = reFunction('挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if ZDBH_7: # 重复效验通过, 存储数据 csvFile = [ WJBT_1, XXSJ_2, WBT_3, GGBH_4, CRSJ_5, GGNX_6, ZDBH_7, DKWZ_8, ZDWZ_9, ZDZL_10, TDYT_11, GHTDYT_12, CRNX_13, SYNX_14, PZJGJWH_15, GHYDMJ_16, GHMJ_17, CRMJ_18, CRYDMJ_19, ZDCRMJ_20, JZMD_21, RJL_22, LDL_23, LDL_24, JZKZGD_25, JZKZZGD_26, JZXS_27, TZQD_28, TDGJBAH_29, SFSZD_30, TDXZTJ_31, JMBZJ_32, JMBZJ_72, QJJ_33, CRQSJ_34, JJFD_35, SFSZBLJ_36, GPKSSJ_37, GPJZSJ_38, HQCRWJSJ_39, TJJMSQSJ_40, BZJJZSJ_41, QRJMZGSJ_42, LXDZ_43, LXDH_44, LXR_45, BZJZH_86, BZJZH_87, BZJZH_88, CRJZH_97, CRJZH_98, CRJZH_99, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace( '\r', '').replace( r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' 
% response.url) else: for item in ['宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items)[0].split('宗地编号')[1:]]: # 宗地编号 ZDBH_7 = reFunction('宗地编号:*\s*([\w\-]*)\s', item).replace('宗地位置', '').replace('地块名称', '') # 宗地坐落 ZDZL_10 = reFunction('宗地坐落:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地用途 TDYT_11 = reFunction('土地用途:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 出让年限 CRNX_13 = reFunction('出让年限:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 宗地出让面积 ZDCRMJ_20 = reFunction('宗地\s*面积:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 建筑密度 JZMD_21 = reFunction('建筑密度\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 容积率 RJL_22 = reFunction('容积率:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 绿地率( %) LDL_24 = reFunction('绿化率\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 建筑控制高度(米) JZKZZGD_26 = reFunction('建筑限高\(米\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 投资强度(万元 / 公顷) TZQD_28 = reFunction('投资强度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 土地估价备案号 TDGJBAH_29 = reFunction('土地估价备案号:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 现状土地条件 TDXZTJ_31 = reFunction('土地现状:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 竞买保证金 JMBZJ_32 = reFunction('保证金:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 起叫价 QJJ_33 = reFunction('起始价:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 加价幅度 JJFD_35 = reFunction('加价幅度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item) # 挂牌开始时间、 GPKSSJ_37 = reFunction('挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 挂牌截止时间、 GPJZSJ_38 = reFunction('挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if ZDBH_7: # 重复效验通过, 存储数据 csvFile = [ WJBT_1, XXSJ_2, WBT_3, GGBH_4, CRSJ_5, GGNX_6, ZDBH_7, DKWZ_8, ZDWZ_9, ZDZL_10, TDYT_11, GHTDYT_12, CRNX_13, SYNX_14, 
PZJGJWH_15, GHYDMJ_16, GHMJ_17, CRMJ_18, CRYDMJ_19, ZDCRMJ_20, JZMD_21, RJL_22, LDL_23, LDL_24, JZKZGD_25, JZKZZGD_26, JZXS_27, TZQD_28, TDGJBAH_29, SFSZD_30, TDXZTJ_31, JMBZJ_32, JMBZJ_72, QJJ_33, CRQSJ_34, JJFD_35, SFSZBLJ_36, GPKSSJ_37, GPJZSJ_38, HQCRWJSJ_39, TJJMSQSJ_40, BZJJZSJ_41, QRJMZGSJ_42, LXDZ_43, LXDH_44, LXR_45, BZJZH_86, BZJZH_87, BZJZH_88, CRJZH_97, CRJZH_98, CRJZH_99, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace('\r', '').replace( r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url) except Exception as e: print(response.url) self.log(f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
class longyanTransformResultSpider(CrawlSpider):
    """Spider for the Longyan public-resource trading centre (lyggzy.com.cn).

    Crawls land-transfer *result* announcements ("出让结果") from the site's
    listing pages, parses each detail page (table-based or free-text layouts),
    and appends one comma-separated row per land plot to a CSV file.

    NOTE(review): depends on project helpers not visible in this file —
    ``reFunction``, ``htmlTableTransformer``, ``RedisClient``, ``encrypt_md5``,
    ``getVariableName``, ``DUPLICATE_SWITCH_LIST``, ``agent_list``,
    ``IntegrationException``. Their exact semantics are assumed, not verified
    here.
    """
    # TODO
    name = 'longyanTransformResult'
    # Timestamp recorded at class-creation time, used for cookie-rotation
    # bookkeeping elsewhere in the project (never read in this block).
    COOKIES_SWITCH_TIME = datetime.datetime.now()

    def __new__(cls):
        """Singleton constructor: open log/CSV handles once, on the class.

        On first instantiation, computes output paths relative to this
        source file, opens the page-log (truncating) and CSV (appending)
        handles as class attributes, and writes the CSV header only when
        the CSV file did not already exist.
        """
        if not hasattr(cls, "instance"):
            cls.instance = super(longyanTransformResultSpider,
                                 cls).__new__(cls)
            # TODO
            # Page-log path: three directory levels above this file, under Logs/.
            pathPage = os.path.join(
                os.path.abspath(
                    os.path.dirname(os.path.dirname(
                        os.path.dirname(__file__)))),
                'Logs/longyanTransformResultPage.txt')
            # TODO
            # CSV output path: four directory levels above this file, under data/.
            cls.pathDetail = os.path.join(
                os.path.abspath(
                    os.path.dirname(
                        os.path.dirname(
                            os.path.dirname(os.path.dirname(__file__))))),
                'data/龙岩市公共资源交易中心_出让结果_龙岩.csv')
            # 'w+' truncates the page log on every fresh start.
            cls.filePage = open(pathPage, 'w+')
            if os.path.exists(cls.pathDetail):
                # CSV already exists: append without re-writing the header.
                cls.fileDetail = open(cls.pathDetail, 'a+')
            else:
                # Fresh CSV: open for append and write the column-header row.
                cls.fileDetail = open(cls.pathDetail, 'a+')
                with open(cls.pathDetail, 'a+') as fp:
                    # TODO
                    fp.write(
                        """文件标题,信息时间,正文标题,公告编号,出让时间,公告类型,地块编号,地块位置,土地用途,土地面积(公顷),出让年限,成交价(万元),受让单位,土地现状,土地使用条件,备注,公示期,联系方式,单位地址,邮政编码,联系电话,联系人,联系单位,电子邮件,爬取地址url,唯一标识,\n"""
                    )
        return cls.instance

    def __init__(self):
        """Wire shutdown handling and per-run crawl state."""
        super(longyanTransformResultSpider, self).__init__()
        # Close file handles cleanly when Scrapy fires spider_closed.
        dispatcher.connect(self.CloseSpider, signals.spider_closed)
        # TODO
        # Redis-backed de-duplication store, keyed per city/spider.
        self.redisClient = RedisClient('longyan', 'longyanTransformResult')
        # Counter of already-seen detail pages; crawl is aborted once it
        # reaches 50 (see parse_detail).
        self.duplicateUrl = 0
        # TODO
        # Listing-page URL template; {} is the 1-based page number.
        # NOTE(review): query parameter is spelled 'pageing' — presumably
        # intentional (matches the site), verify against the live site.
        self.targetUrl = 'https://www.lyggzy.com.cn/lyztb/tdky/084001/?pageing={}'
        # Random desktop User-Agent chosen once per spider instance.
        self.header = {'User-Agent': random.choice(agent_list)}
        # Character class shared by the field-extraction regexes
        # (CJK punctuation, area signs, etc.).
        self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'

    def CloseSpider(self):
        '''
        Normal-shutdown hook: close the page-log and CSV handles and log it.
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('爬虫正常关闭,At{}'.format(datetime.datetime.now()),
                 level=logging.INFO)

    def CloseExceptionSpider(self):
        '''
        Timeout-shutdown hook: close the handles and log at ERROR level.
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('响应超时爬虫正常关闭,At{}'.format(datetime.datetime.now()),
                 level=logging.ERROR)

    def start_requests(self):
        '''
        Yield one GET request per listing page (pages 1-6, hard-coded).
        '''
        try:
            for page in range(1, 7):
                yield Request(
                    self.targetUrl.format(page),
                    method='GET',
                    headers=self.header,
                    callback=self.parse_index,
                    meta={'page': page},
                    # headers={'Content-Type': 'application/json'},
                    dont_filter=True)
        except Exception as e:
            self.log(
                f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
            raise IntegrationException('爬取阻塞,请重启')

    def parse_index(self, response):
        '''
        Parse one listing page: extract each announcement's title, detail
        URL and publish date, and yield a request to parse_detail for it.
        :param response: listing-page response
        :return: generator of scrapy Requests
        '''
        try:
            page = response.meta.get('page')
            datas = Selector(text=response.body.decode('utf-8'))
            dataItems = datas.xpath('//div[@class="r-bd"]/ul[1]/li')
            for dataItem in dataItems:
                title = dataItem.xpath('a/text()').extract_first()
                # href starts with '.'; [1:] strips it before joining to the host.
                url = 'https://www.lyggzy.com.cn/' + dataItem.xpath(
                    'a/@href').extract_first()[1:]
                msgTime = dataItem.xpath('span/text()').extract_first()
                yield Request(
                    url,
                    method='GET',
                    callback=self.parse_detail,
                    meta={
                        'page': page,
                        'title': title,
                        'msgTime': msgTime,
                    },
                    # body=requests_data, headers={'Content-Type': 'application/json'}
                    dont_filter=True,
                )
        except Exception as e:
            self.log(f'列表页解析失败{page}, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)

    def parse_detail(self, response):
        # TODO: issue around actively closing the spider
        #
        # Parse one announcement detail page. Three layouts are handled:
        #   1. an HTML table keyed by '宗地编号'/'土地位置' column headers;
        #   2. free text split into per-plot sections on '地块编号';
        #   3. a free-text fallback parsed with per-field regexes.
        # Each parsed plot becomes one CSV row; fields that fail to parse
        # stay ''. Rows for pages already recorded in Redis bump a duplicate
        # counter, and once 50 duplicates are seen the engine is stopped.
        try:
            data = Selector(text=response.body.decode('utf-8'))
            # Full visible text of the page, with NBSP / ideographic spaces removed.
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            htmlTable = htmlTableTransformer()
            # Pre-initialise every CSV field so un-parsed fields serialise as ''.
            WJBT_48 = ''
            XXSJ_49 = ''
            ZWBT_50 = ''
            GGBH_51 = ''
            CRSJ_52 = ''
            GGNX_53 = ''
            DKBH_54 = ''
            DKWZ_55 = ''
            TDYT_56 = ''
            TDMJ_57 = ''
            CRNX_58 = ''
            CJJ_59 = ''
            SRDW_60 = ''
            TDXZTJ_61 = ''
            TDSYTJ_62 = ''
            BZ_63 = ''
            GSQ_64 = ''
            LXFS_65 = ''
            DWDZ_66 = ''
            YZBM_67 = ''
            LXDH_68 = ''
            LXR_69 = ''
            LXDW_77 = ''
            DZYJ_70 = ''
            # TODO common fields: reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # File title (from the listing page).
            WJBT_48 = response.meta.get('title').strip()
            # Announcement date.
            # NOTE(review): the pattern '[\d\-]*' is unanchored and can match
            # the empty string at position 0 — TODO confirm against reFunction.
            XXSJ_49 = reFunction(
                '[\d\-]*',
                data.xpath('//p[@class="sub-cp"]/text()').extract_first())
            # Body title: duplicated from the file title.
            ZWBT_50 = WJBT_48
            # Announcement number: text of early <p> elements containing "号".
            GGBH_51 = ''.join(
                data.xpath(
                    '//div[@class="substance"]/p[position() <5]/.//*[contains(text(),"号")]/ancestor::p/.//text()'
                ).extract())
            # Transfer time.
            CRSJ_52 = reFunction(
                '定于\s*([()【】\w\.—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*)[,;,、在]', items)
            # Announcement type (constant for this spider).
            GGNX_53 = '出让结果'
            # Crawl date.
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # Crawled URL.
            url = response.url
            # Unique row id: md5 of URL + title + date.
            md5Mark = encrypt_md5(url + WJBT_48 + XXSJ_49)
            # Publicity period.
            GSQ_64 = reFunction(
                '公示期:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)[\s。]',
                items)
            # Contact info (LXFS_65 is never extracted and stays '').
            # LXFS_65
            # Contact organisation.
            LXDW_77 = reFunction(
                '联系单位:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Organisation address.
            DWDZ_66 = reFunction(
                '单位地址:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Postal code.
            YZBM_67 = reFunction(
                '邮政编码:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Contact phone.
            LXDH_68 = reFunction(
                '联系电话:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Contact person.
            LXR_69 = reFunction(
                '联\s*系\s*人:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                items)
            # Contact e-mail.
            DZYJ_70 = reFunction(
                '电子邮件:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》@\-\/\%,;,、\.﹪]*)\s', items)
            if '宗地编号' in items or '土地位置' in items:
                # Layout 1: a result table; transform it to a dict of
                # column-header -> list-of-cell-values and iterate rows.
                soup = BeautifulSoup(response.body.decode('utf-8'))
                table = soup.find('table')
                tdData = htmlTable.tableTrTdRegulationToList(table)
                for _ in range(len(list(tdData.values())[0])):
                    # Plot number.
                    DKBH_54 = tdData.get('宗地编号')[_] if tdData.get(
                        '宗地编号') else ''
                    # Plot location (two possible column headers).
                    DKWZ_55 = tdData.get('宗地位置')[_] if tdData.get(
                        '宗地位置') else tdData.get('土地位置')[_] if tdData.get(
                            '土地位置') else ''
                    # Land use (two possible column headers).
                    TDYT_56 = tdData.get('土地用途')[_] if tdData.get(
                        '土地用途') else tdData.get('规划土地用途')[_] if tdData.get(
                            '规划土地用途') else ''
                    # Land area (hectares; two possible column headers).
                    TDMJ_57 = tdData.get('土地面积(m2)')[_] if tdData.get(
                        '土地面积(m2)') else tdData.get(
                            '出让土地面积(㎡)')[_] if tdData.get('出让土地面积(㎡)') else ''
                    # Transfer term (two possible column headers).
                    CRNX_58 = tdData.get('使用年限')[_] if tdData.get(
                        '使用年限') else tdData.get('出让年限')[_] if tdData.get(
                            '出让年限') else ''
                    # Final price (10k yuan; two possible column headers).
                    CJJ_59 = tdData.get('成交价(万元)')[_] if tdData.get(
                        '成交价(万元)') else tdData.get(
                            '成交价(人民币)')[_] if tdData.get('成交价(人民币)') else ''
                    # Transferee (two possible column headers).
                    SRDW_60 = tdData.get('受让单位')[_] if tdData.get(
                        '受让单位') else tdData.get('竞买人(单位)')[_] if tdData.get(
                            '竞买人(单位)') else ''
                    # Land-use conditions.
                    TDSYTJ_62 = tdData.get('土地使用条件')[_] if tdData.get(
                        '土地使用条件') else ''
                    # Write this row out (with Redis-based dedupe counting).
                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):
                            # Already seen: count the duplicate.
                            self.duplicateUrl += 1
                    if self.duplicateUrl < 50:
                        if DKWZ_55:
                            # Dedupe check passed: serialise the row.
                            csvFile = [
                                WJBT_48,
                                XXSJ_49,
                                ZWBT_50,
                                GGBH_51,
                                CRSJ_52,
                                GGNX_53,
                                DKBH_54,
                                DKWZ_55,
                                TDYT_56,
                                TDMJ_57,
                                CRNX_58,
                                CJJ_59,
                                SRDW_60,
                                TDXZTJ_61,
                                TDSYTJ_62,
                                BZ_63,
                                GSQ_64,
                                LXFS_65,
                                DWDZ_66,
                                YZBM_67,
                                LXDH_68,
                                LXR_69,
                                LXDW_77,
                                DZYJ_70,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            # Sanitise each field for CSV: commas become spaces,
                            # control chars and NBSPs are dropped; fields that
                            # are empty or all-'|' serialise as an empty cell.
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n', '').replace('\t', '').replace(
                                                '\r', '').replace(
                                                    r'\xa0', '').replace(
                                                        '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            # Bare yield: emits None so the callback stays a generator.
                            yield
                    else:
                        # Too many duplicates: stop the whole crawl.
                        self.crawler.engine.close_spider(
                            self,
                            'response msg info %s, job duplicated!' %
                            response.url)
            elif '地块编号' in items:
                # Layout 2: free text between '一' and '二、', split into one
                # section per plot on the '地块编号' marker.
                for item in [
                        '地块编号' + _ for _ in re.findall('一([\s\S]*)二、', items)
                        [0].split('地块编号')[1:]
                ]:
                    # Plot number.
                    DKBH_54 = reFunction('地块编号:*\s*([\w\-]*)\s', item)
                    # Plot location.
                    DKWZ_55 = reFunction(
                        '地块位置:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Land use.
                    TDYT_56 = reFunction(
                        '土地用途:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Land area (hectares).
                    TDMJ_57 = reFunction(
                        '土地面积\(公顷\):*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Transfer term.
                    CRNX_58 = reFunction(
                        '出让年限:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Final price (10k yuan).
                    CJJ_59 = reFunction(
                        '成交价\(万元\):*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Transferee.
                    SRDW_60 = reFunction(
                        '受让单位:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Current land state.
                    TDXZTJ_61 = reFunction(
                        '土地现状:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Land-use conditions.
                    TDSYTJ_62 = reFunction(
                        '土地使用条件:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Remarks.
                    BZ_63 = reFunction(
                        '备注:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Write this row out (same dedupe/serialise logic as above).
                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):
                            # Already seen: count the duplicate.
                            self.duplicateUrl += 1
                    if self.duplicateUrl < 50:
                        if DKWZ_55:
                            # Dedupe check passed: serialise the row.
                            csvFile = [
                                WJBT_48,
                                XXSJ_49,
                                ZWBT_50,
                                GGBH_51,
                                CRSJ_52,
                                GGNX_53,
                                DKBH_54,
                                DKWZ_55,
                                TDYT_56,
                                TDMJ_57,
                                CRNX_58,
                                CJJ_59,
                                SRDW_60,
                                TDXZTJ_61,
                                TDSYTJ_62,
                                BZ_63,
                                GSQ_64,
                                LXFS_65,
                                DWDZ_66,
                                YZBM_67,
                                LXDH_68,
                                LXR_69,
                                LXDW_77,
                                DZYJ_70,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n', '').replace('\t', '').replace(
                                                '\r', '').replace(
                                                    r'\xa0', '').replace(
                                                        '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self,
                            'response msg info %s, job duplicated!' %
                            response.url)
            else:
                # Layout 3: free-text fallback — single plot, parsed from the
                # whole page text with per-field regexes.
                # Plot location.
                DKWZ_55 = reFunction(
                    '地理位置:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # Transfer term.
                CRNX_58 = reFunction(
                    '出让年限:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # Final price (10k yuan).
                CJJ_59 = reFunction(
                    '成交价格(人民币):*\s*([()\w\.::—\¥ (\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # Transferee.
                SRDW_60 = reFunction(
                    '竞得人名称:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # Current land state.
                TDXZTJ_61 = reFunction(
                    '土地现状:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # Write the single row out (same dedupe/serialise logic).
                if self.name in DUPLICATE_SWITCH_LIST:
                    if self.redisClient.isExist(md5Mark):
                        # Already seen: count the duplicate.
                        self.duplicateUrl += 1
                if self.duplicateUrl < 50:
                    if DKWZ_55:
                        # Dedupe check passed: serialise the row.
                        csvFile = [
                            WJBT_48,
                            XXSJ_49,
                            ZWBT_50,
                            GGBH_51,
                            CRSJ_52,
                            GGNX_53,
                            DKBH_54,
                            DKWZ_55,
                            TDYT_56,
                            TDMJ_57,
                            CRNX_58,
                            CJJ_59,
                            SRDW_60,
                            TDXZTJ_61,
                            TDSYTJ_62,
                            BZ_63,
                            GSQ_64,
                            LXFS_65,
                            DWDZ_66,
                            YZBM_67,
                            LXDH_68,
                            LXR_69,
                            LXDW_77,
                            DZYJ_70,
                            crawlingTime,
                            url,
                            md5Mark,
                        ]
                        results = ''
                        for _ in csvFile:
                            try:
                                if _ and _ != '|' * len(_):
                                    results += _.replace(',', ' ').replace(
                                        '\n', '').replace('\t', '').replace(
                                            '\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                else:
                                    results += ','
                            except Exception as e:
                                results += ','
                                self.log(
                                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                    level=logging.ERROR)
                        with open(self.pathDetail, 'a+') as fp:
                            fp.write(results)
                            fp.write('\n')
                        self.log(f'数据获取成功', level=logging.INFO)
                        yield
                else:
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)
        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
class shanxiResultNoticeSpider(CrawlSpider): # TODO name = 'shanxiResultNotice' # 配置更换cookies时间 COOKIES_SWITCH_TIME = datetime.datetime.now() def __new__(cls): if not hasattr(cls, "instance"): cls.instance = super(shanxiResultNoticeSpider, cls).__new__(cls) # TODO pathPage = os.path.join( os.path.abspath( os.path.dirname(os.path.dirname( os.path.dirname(__file__)))), 'Logs/shanxiResultNoticePage.txt') # TODO cls.pathDetail = os.path.join( os.path.abspath( os.path.dirname( os.path.dirname( os.path.dirname(os.path.dirname(__file__))))), 'data/山西省自然资源厅_结果公示_山西.csv') cls.filePage = open(pathPage, 'w+') if os.path.exists(cls.pathDetail): cls.fileDetail = open(cls.pathDetail, 'a+') else: cls.fileDetail = open(cls.pathDetail, 'a+') with open(cls.pathDetail, 'a+') as fp: # TODO fp.write( """文件标题,时间,来源,正文标题,地块编号,宗地编号,拍卖结果,公开转让方式,挂牌时间,转让人,转让方,受让人,受让方,受让单位,位置,地块位置,出让面积(平方米),用途,成交价(万元),不动产权登记证号,出让合同编号,出让合同,变更协议编号,土地用途,使用年限,面积,土地面积(公顷),转让价格(单价总价),出让年限,土地使用条件,备注,公示期,联系单位,单位地址,邮政编码,联系电话,联系人,电子邮件,爬取地址url,唯一标识,\n""" ) return cls.instance def __init__(self): super(shanxiResultNoticeSpider, self).__init__() dispatcher.connect(self.CloseSpider, signals.spider_closed) # TODO self.redisClient = RedisClient('shanxi', 'shanxiResultNotice') self.duplicateUrl = 0 self.targetUrl = 'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/jggs/index_{}.shtml' self.header = {'User-Agent': random.choice(agent_list)} self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡' def CloseSpider(self): ''' 关闭spider :return: ''' self.filePage.close() self.fileDetail.close() self.log('爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.INFO) def CloseExceptionSpider(self): ''' 关闭spider :return: ''' self.filePage.close() self.fileDetail.close() self.log('响应超时爬虫正常关闭,At{}'.format(datetime.datetime.now()), level=logging.ERROR) def start_requests(self): ''' ''' try: for page in range(1, 14): yield Request( self.targetUrl.format(page), method='GET', headers=self.header, callback=self.parse_index, meta={'page': 
page}, # headers={'Content-Type': 'application/json'}, dont_filter=True) except Exception as e: self.log( f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) raise IntegrationException('爬取阻塞,请重启') else: yield Request( 'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/jggs/index.shtml', method='GET', headers=self.header, callback=self.parse_index, meta={ 'page': 'index', 'priority': 1 }, # headers={'Content-Type': 'application/json'}, dont_filter=True) def parse_index(self, response): ''' 拿到总页数, :param response: :return: ''' try: page = response.meta.get('page') datas = Selector(text=response.body.decode('utf-8')) dataItems = datas.xpath('//ul[@class="zwgk_right_content"]/li') for dataItem in dataItems: title = dataItem.xpath('a/text()').extract_first() url = 'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/jggs' + dataItem.xpath( 'a/@href').extract_first()[1:] yield Request( url, method='GET', callback=self.parse_detail, meta={ 'page': page, 'title': title, }, # body=requests_data, headers={'Content-Type': 'application/json'} dont_filter=True, ) except Exception as e: self.log(f'列表页解析失败{page}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) def parse_detail(self, response): # TODO 主动关闭爬虫问题 try: data = Selector(text=response.body.decode('utf-8')) items = str(data.xpath('string(.)').extract()[0]).replace( '\xa0', '').replace('\u3000', '') WJBT_45 = '' SJ_46 = '' LY_47 = '' ZWBT_48 = '' DKBH_49 = '' ZDBH_50 = '' PMJG_51 = '' GGZRFS_52 = '' GPSJ_53 = '' ZRR_54 = '' ZRF_55 = '' SRR_56 = '' SRF_57 = '' SRDW_58 = '' WZ_59 = '' DKWZ_60 = '' CRMJ_61 = '' YT_62 = '' CJJ_63 = '' BDCQDJH_64 = '' CRHTBH_65 = '' CRHT_66 = '' BGXYBH_67 = '' TDYT_68 = '' SYNX_69 = '' MJ_70 = '' TDMJ_71 = '' ZRJG_72 = '' CRNX_73 = '' TDSYNX_74 = '' BZ_75 = '' GSQ_76 = '' LXDW_77 = '' DWDZ_78 = '' YZBM_79 = '' LXDH_80 = '' LXR_81 = '' DZYJ_82 = '' # TODO 共有字段 reFunction(f'时间:\s*([{self.reStr}]*)\s', LY) # 文件标题 WJBT_45 = 
response.meta.get('title') # 时间 SJ_46 = data.xpath( '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()' ).extract_first() # 来源 LY_47 = data.xpath( '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()' ).extract_first() # 正文标题 ZWBT_48 = data.xpath( '//div[@class="ztzx_frame_content"]/div[1]/text()' ).extract_first() # 公示期 GSQ_76 = reFunction( f'公示期:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)[。\s]', items) # 联系单位 LXDW_77 = reFunction( '联系单位:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 单位地址 DWDZ_78 = reFunction( '单位地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 邮政编码 YZBM_79 = reFunction( '邮政编码:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 联系电话 LXDH_80 = reFunction( '联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 联系人 LXR_81 = reFunction( '联\s*系\s*人:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 电子邮件 DZYJ_82 = reFunction( '电子邮件:([()\w\.:: —\(\)@〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 爬取时间 crawlingTime = time.strftime("%Y-%m-%d", time.localtime()) # 爬取地址url url = response.url # 唯一标识 md5Mark = encrypt_md5(url + WJBT_45 + SJ_46) soup = BeautifulSoup( response.body.decode('utf-8').replace('thead', 'tbody')) table = soup.find('table') htmlTable = htmlTableTransformer() if '国有划拨土地使用权结果公示' in items: table.find_all('tr')[1].extract() tdData = htmlTable.tableTrTdRegulationToList(table) for _ in range(len(list(tdData.values())[0])): # 地块编号 DKBH_49 = tdData.get('地块编号')[_] if tdData.get( '地块编号') else '' # 公开转让方式 GGZRFS_52 = tdData.get('公开转让方式')[_] if tdData.get( '公开转让方式') else '' # 挂牌时间 GPSJ_53 = tdData.get('挂牌')[_] if tdData.get('挂牌') else '' # 受让人 SRR_56 = tdData.get('受让人')[_] if tdData.get('受让人') else '' # 位置 WZ_59 = tdData.get('位置')[_] if tdData.get('位置') else '' # 出让面积(平方米) CRMJ_61 = tdData.get('出让面积')[_] if tdData.get( '出让面积') else '' # 用途 YT_62 = tdData.get('用途')[_] if tdData.get('用途') else '' # 成交价(万元) CJJ_63 = tdData.get('成交价')[_] if tdData.get('成交价') else '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if 
self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) elif '不动产权登记证号' in items: # 转让方 ZRF_55 = reFunction( '转让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 受让方 SRF_57 = reFunction( '受让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 位置 WZ_59 = reFunction( '宗地位置:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 不动产权登记证号 BDCQDJH_64 = reFunction( '不动产权登记证号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 出让合同编号 CRHTBH_65 = reFunction( '出让合同编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 变更协议编号 BGXYBH_67 = reFunction( '出让合同变更协议编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地用途 TDYT_68 = reFunction( '土地用途:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 使用年限 SYNX_69 = reFunction( '使用年限:\s*([()【】\w\.::—\(\)〔〕\s㎡≤≥《》\-\/\%,;,、\.﹪]*)面\s*积', items) # 面积 MJ_70 = reFunction( '面\s*积:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 转让价格(单价总价) ZRJG_72 = reFunction( '转让价格:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、。\.﹪]*)\s', items) # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: 
self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' % response.url) elif '挂牌出让地块的基本情况和规划指标要求' in items: # 宗地编号 ZDBH_50 = reFunction( '宗地编号:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 挂牌时间 GPSJ_53 = reFunction( '挂牌时间为:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s', items).replace('。', '') # 转让人 ZRR_54 = reFunction( '转让人为:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*),', items) # 位置 WZ_59 = reFunction( '宗地坐落:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地用途 TDYT_68 = reFunction( '土地用途:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 面积 MJ_70 = reFunction( '宗地面积:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 出让年限 CRNX_73 = reFunction( '出让年限:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 备注 BZ_75 = reFunction( '备注:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s*二', items) # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) elif '地块基本情况' in items: try: if '备注' not in items: tdData = htmlTable.tableTrTdRegulationToList(table) for _ in range(len(list(tdData.values())[0])): # 宗地编号 ZDBH_50 = tdData.get('宗地编号')[_] if tdData.get( '宗地编号') else '' # 受让单位 SRDW_58 = tdData.get('受让单位')[_] if tdData.get( '受让单位') else '' # 受让人 SRR_56 = tdData.get('竞得人')[_] if tdData.get( '竞得人') else '' # 地块位置 DKWZ_60 = tdData.get('地块位置')[_] if tdData.get( '地块位置') else '' # 土地用途 TDYT_68 = tdData.get('土地用途')[_] if tdData.get( '土地用途') else '' # 成交价(万元) CJJ_63 = tdData.get('成交价(万元)')[_] if tdData.get( '成交价(万元)') else '' # 土地面积(公顷) TDMJ_71 = tdData.get('土地面积(亩)')[_] if tdData.get( '土地面积(亩)') else '' # 出让年限 CRNX_73 = tdData.get('出让年限')[_] if tdData.get( '出让年限') else '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist( md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace( ',', ' ' ).replace('\n', '').replace( '\t', '' ).replace('\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) else: if '竞得人' not in items: for item in [ '宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items) [0].split('宗地编号')[1:] ]: # 宗地编号 ZDBH_50 = reFunction('编号\s*([\w\-]*)\s', item) # 受让单位 SRDW_58 = reFunction( '受让单位\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 地块位置 DKWZ_60 = reFunction( '地块位置\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 成交价(万元) CJJ_63 = reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) if reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) else reFunction( '成交价(万元)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地用途 TDYT_68 = reFunction( '土地用途\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地面积(公顷) TDMJ_71 = reFunction( '土地\s*面积\s*\(公顷\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 出让年限 CRNX_73 = reFunction( '出让年限\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 备注 BZ_75 = reFunction( '备注:\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', item) if '二' in BZ_75: BZ_75 = '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist( md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace( ',', ' ' ).replace('\n', '').replace( '\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') 
self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' % response.url) except Exception as e: if '竞得人' not in items: for item in [ '宗地编号' + _ for _ in re.findall( '一([\s\S]*)二、', items)[0].split('宗地编号')[1:] ]: # 宗地编号 ZDBH_50 = reFunction('编号\s*([\w\-]*)\s', item) # 受让单位 SRDW_58 = reFunction( '受让单位\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 地块位置 DKWZ_60 = reFunction( '地块位置\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 成交价(万元) CJJ_63 = reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) if reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) else reFunction( '成交价(万元)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地用途 TDYT_68 = reFunction( '土地用途\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地面积(公顷) TDMJ_71 = reFunction( '土地\s*面积\s*\(公顷\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 出让年限 CRNX_73 = reFunction( '出让年限\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 备注 BZ_75 = reFunction( '备注:\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', item) if '二' in BZ_75: BZ_75 = '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist( md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace( ',', ' ' ).replace('\n', '').replace( '\t', '' ).replace('\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( 
f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' % response.url) except Exception as e: print(response.url) self.log( f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
class zhengzhouLandAgreementTransformResultSpider(CrawlSpider):
    """郑州市自然资源和规划局 - 土地协议出让结果 spider.

    Crawls the list pages under http://zzland.zhengzhou.gov.cn/xycrjg/,
    follows each detail link and extracts the contract fields — via an HTML
    table parser when the page carries a data table, falling back to regex
    extraction over the page text otherwise — appending one CSV row per
    record to ``data/郑州市自然资源和规划局_土地协议出让结果_郑州.csv``.
    """
    name = 'zhengzhouLandAgreementTransformResult'
    # 配置更换cookies时间 (cookie-switch timestamp; recorded at import time)
    COOKIES_SWITCH_TIME = datetime.datetime.now()

    def __new__(cls):
        # Singleton construction: the page log and the CSV output file are
        # opened exactly once, on first instantiation.
        if not hasattr(cls, "instance"):
            cls.instance = super(zhengzhouLandAgreementTransformResultSpider,
                                 cls).__new__(cls)
            pathPage = os.path.join(
                os.path.abspath(
                    os.path.dirname(os.path.dirname(
                        os.path.dirname(__file__)))),
                'Logs/zhengzhouLandAgreementTransformResultPage.txt')
            cls.pathDetail = os.path.join(
                os.path.abspath(
                    os.path.dirname(
                        os.path.dirname(
                            os.path.dirname(os.path.dirname(__file__))))),
                'data/郑州市自然资源和规划局_土地协议出让结果_郑州.csv')
            cls.filePage = open(pathPage, 'w+')
            # The original code had an if/else whose two branches were
            # identical; the existence check only matters for deciding
            # whether to write the CSV header (new file only). Check before
            # opening with 'a+', which would create the file.
            isNewFile = not os.path.exists(cls.pathDetail)
            cls.fileDetail = open(cls.pathDetail, 'a+')
            if isNewFile:
                with open(cls.pathDetail, 'a+') as fp:
                    fp.write(
                        """标题,来源,时间,行政区,电子监管号,项目名称,项目位置,面积,土地来源,土地用途,供地方式,土地使用年限,行业分类,土地级别,成交价格,分期支付约定—支付期号,分期支付约定—约定支付日期,分期支付约定—约定支付金额,分期支付约定—备注,土地使用权人,约定容积率——下限,约定容积率——上限,约定交地时间,约定开工时间,约定竣工时间,实际开工时间,实际竣工时间,批准单位,合同签订日期,爬取地址url,唯一标识,\n"""
                    )
        return cls.instance

    def __init__(self):
        super(zhengzhouLandAgreementTransformResultSpider, self).__init__()
        # Close open file handles when scrapy signals spider shutdown.
        dispatcher.connect(self.CloseSpider, signals.spider_closed)
        self.redisClient = RedisClient(
            'zhengzhou', 'zhengzhouLandAgreementTransformResult')
        # Consecutive-duplicate counter; reaching 50 aborts the crawl.
        self.duplicateUrl = 0
        self.targetUrl = 'http://zzland.zhengzhou.gov.cn/xycrjg/index_{}.jhtml'
        self.header = {'User-Agent': random.choice(agent_list)}
        # Character class used inside the field-extraction regexes below.
        self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,,、\.﹪㎡'

    def CloseSpider(self):
        '''
        Close the spider's file handles on normal shutdown.
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('爬虫正常关闭,At{}'.format(datetime.datetime.now()),
                 level=logging.INFO)

    def CloseExceptionSpider(self):
        '''
        Close the spider's file handles after a response timeout.
        :return:
        '''
        self.filePage.close()
        self.fileDetail.close()
        self.log('响应超时爬虫正常关闭,At{}'.format(datetime.datetime.now()),
                 level=logging.ERROR)

    def start_requests(self):
        '''
        先拿到总页数, 按照优先级爬取, 并每十个请求换一个sessionID
        Yields one request per numbered list page (higher priority for lower
        page numbers) plus one request for the unnumbered front page.
        '''
        try:
            for page in range(1, 2):
                priority = 4 - int(page)
                yield Request(
                    self.targetUrl.format(page),
                    method='GET',
                    headers=self.header,
                    priority=priority,
                    callback=self.parse_index,
                    meta={
                        'page': page,
                        'priority': priority
                    },
                    dont_filter=True)
            # The front page has no page-number suffix; fetch it separately.
            yield Request(
                'http://zzland.zhengzhou.gov.cn/xycrjg/index.jhtml',
                method='GET',
                headers=self.header,
                callback=self.parse_index,
                meta={
                    'page': 1,
                    'priority': 1
                },
                dont_filter=True)
        except Exception as e:
            self.log(
                f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
            raise IntegrationException('爬取阻塞,请重启')

    def parse_index(self, response):
        '''
        Parse a list page and yield one detail-page request per entry.
        :param response:
        :return:
        '''
        # Read the page number before the try block: meta.get cannot raise,
        # and the except handler below interpolates `page` — previously a
        # decode failure would raise NameError inside the handler and mask
        # the real error.
        page = response.meta.get('page')
        try:
            datas = Selector(text=response.body.decode('utf-8'))
            dataItems = datas.xpath('//div[@class="box-content"]/ul/li')
            for dataItem in dataItems:
                title = dataItem.xpath('a/h1/text()').extract_first()
                url = dataItem.xpath('a/@href').extract_first()
                yield Request(
                    url,
                    method='GET',
                    callback=self.parse_detail,
                    meta={
                        'page': page,
                        'title': title,
                    },
                    dont_filter=True,
                )
        except Exception as e:
            self.log(f'列表页解析失败{page}, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)

    def _save_row(self, csvFile):
        '''
        Clean each field of one record and append the row to the CSV file.
        Fields containing commas/newlines/NBSPs are sanitised; a field that
        fails cleaning is written empty and logged, never aborting the row.
        :param csvFile: ordered list of field values (strings or None)
        '''
        results = ''
        for _ in csvFile:
            try:
                if _ and _ != '|' * len(_):
                    results += _.replace(',', ' ').replace(
                        '\n', '').replace('\r', '').replace(
                            r'\xa0', '').replace('\xa0', '') + ','
                else:
                    results += ','
            except Exception as e:
                results += ','
                self.log(
                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                    level=logging.ERROR)
        with open(self.pathDetail, 'a+') as fp:
            fp.write(results)
            fp.write('\n')
        self.log(f'数据获取成功', level=logging.INFO)

    def parse_detail(self, response):
        '''
        Parse one detail page. Pages without the '宗地编号'/'行政区' markers are
        parsed through their HTML table; otherwise fields are extracted with
        regexes over the flattened page text.
        :param response:
        :return:
        '''
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            # Initialise every output column so the CSV row stays aligned
            # even when a branch fills only a subset of the fields.
            BT_18 = LY_19 = SJ_20 = XZQ_21 = DZJGH_22 = XMMC_23 = ''
            XMWZ_24 = MJ_25 = TDLY_26 = TSYT_27 = GDFS_28 = TDSYNX_29 = ''
            HYFL_30 = TDJB_31 = CJJG_32 = ZFQH_33 = YDZFRQ_34 = YDZFJE_35 = ''
            BZ_36 = TDSTQR_37 = SX_38 = XX_39 = YDJDSJ_40 = YDKGSJ_41 = ''
            YDJGSJ_42 = SJKGSJ_43 = SJJGSJ_44 = PZDW_45 = HTQDRQ_46 = ''
            # 共有字段 (fields present on every page)
            # 标题
            BT_18 = response.meta.get('title')
            LY = data.xpath(
                '//div[@class="content-small-title"]/text()').extract_first()
            # 来源
            LY_19 = reFunction(f'来源:\s*([{self.reStr}]*)\s', LY)
            # 时间
            SJ_20 = reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # 解析 table 若出错 使用正则
            htmlTable = htmlTableTransformer()
            if '宗地编号' not in items and '行政区' not in items:
                try:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find_all('table')[0]
                    # Drop the first row when it is not the real header
                    # (header rows contain 用地单位 or 受让人).
                    if not table.tbody.find_all('tr')[0].find_all(
                            text=re.compile("用地单位|受让人")):
                        table.tbody.find_all('tr')[0].extract()
                    tdsData = htmlTable.tableTrTdRegulationToList(table)
                    for rowIdx in range(len(list(tdsData.values())[0])):
                        # 项目位置
                        XMWZ_24 = tdsData.get('土地座落')[rowIdx] if tdsData.get(
                            '土地座落') else tdsData.get('宗地位置')[rowIdx] if tdsData.get(
                                '宗地位置') else ''
                        # 面积 — several possible column headers; first non-empty wins
                        MJ_25_ = list(
                            filter(None, [
                                tdsData.get('出让面积(公顷)'),
                                tdsData.get('出让面积'),
                                tdsData.get('出让/划拨面积')
                            ]))
                        MJ_25 = MJ_25_[0][rowIdx] if MJ_25_ else ''
                        # 土地用途
                        TSYT_27 = tdsData.get('土地用途')[rowIdx] if tdsData.get(
                            '土地用途') else tdsData.get('用途明细')[rowIdx] if tdsData.get(
                                '用途明细') else ''
                        # 供地方式
                        GDFS_28 = tdsData.get('供应方式')[rowIdx] if tdsData.get(
                            '供应方式') else ''
                        # 土地级别
                        TDJB_31 = tdsData.get('土地级别')[rowIdx] if tdsData.get(
                            '土地级别') else ''
                        # 成交价格 — several possible column headers
                        CJJG_32_ = list(
                            filter(None, [
                                tdsData.get('出让价款'),
                                tdsData.get('出让价款(万元)'),
                                tdsData.get('出让/划拨价歀')
                            ]))
                        CJJG_32 = CJJG_32_[0][rowIdx] if CJJG_32_ else ''
                        # 土地使用权人
                        TDSTQR_37 = tdsData.get('用地单位')[rowIdx] if tdsData.get(
                            '用地单位') else tdsData.get('受让人')[rowIdx] if tdsData.get(
                                '受让人') else ''
                        # 合同签订日期
                        HTQDRQ_46 = tdsData.get('签订日期')[rowIdx] if tdsData.get(
                            '签订日期') else ''
                        # 爬取时间
                        crawlingTime = time.strftime("%Y-%m-%d",
                                                     time.localtime())
                        # 爬取地址url
                        url = response.url
                        # 唯一标识
                        md5Mark = encrypt_md5(url + LY_19 + SJ_20)
                        # 是否需要判断重复 请求
                        if DUPLICATE_SWITCH:
                            if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                self.duplicateUrl += 1
                        if self.duplicateUrl < 50:
                            # 重复效验通过, 存储数据
                            self._save_row([
                                BT_18, LY_19, SJ_20, XZQ_21, DZJGH_22,
                                XMMC_23, XMWZ_24, MJ_25, TDLY_26, TSYT_27,
                                GDFS_28, TDSYNX_29, HYFL_30, TDJB_31,
                                CJJG_32, ZFQH_33, YDZFRQ_34, YDZFJE_35,
                                BZ_36, TDSTQR_37, SX_38, XX_39, YDJDSJ_40,
                                YDKGSJ_41, YDJGSJ_42, SJKGSJ_43, SJJGSJ_44,
                                PZDW_45, HTQDRQ_46, crawlingTime, url,
                                md5Mark
                            ])
                            yield
                        else:
                            self.crawler.engine.close_spider(
                                self, 'response msg info %s, job duplicated!'
                                % response.url)
                except Exception as e:
                    # Best-effort table parsing, as before — but log the
                    # failure instead of swallowing it silently.
                    self.log(
                        f'详情页表格解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            else:
                # 进行正则匹配
                # 行政区
                XZQ_21 = reFunction(f'行政区:([{self.reStr}]*)电子监管号', items)
                # 电子监管号
                DZJGH_22 = reFunction(f'电子监管号:([{self.reStr}]*)项目名称', items)
                # 项目名称
                XMMC_23_ = reFunction(f'项目名称:([{self.reStr}]*)项目位置', items)
                XMMC_23 = XMMC_23_ if XMMC_23_ else reFunction(
                    f'宗地编号([{self.reStr}]*)地块位置', items)
                # 项目位置
                XMWZ_24_ = reFunction(f'项目位置:([{self.reStr}]*)面积(公顷): ', items)
                XMWZ_24 = XMWZ_24_ if XMWZ_24_ else reFunction(
                    f'地块位置([{self.reStr}]*)土地用途', items)
                # 面积
                MJ_25_ = reFunction(f'面积\(公顷\):([{self.reStr}]*)土地来源', items)
                MJ_25 = MJ_25_ if MJ_25_ else reFunction(
                    f'土地面积\(公顷\)([{self.reStr}]*)出让年限', items)
                # 土地来源
                TDLY_26 = reFunction(f'土地来源:([{self.reStr}]*)土地用途', items)
                # 土地用途
                TSYT_27_ = reFunction(f'土地用途:([{self.reStr}]*)供地方式', items)
                TSYT_27 = TSYT_27_ if TSYT_27_ else data.xpath(
                    'string(//table/tbody/tr[5]/td[1])').extract_first()
                # 供地方式
                GDFS_28 = reFunction(f'供地方式:([{self.reStr}]*)土地使用年限', items)
                # 土地使用年限
                TDSYNX_29_ = reFunction(f'土地使用年限:([{self.reStr}]*)行业分类', items)
                TDSYNX_29 = TDSYNX_29_ if TDSYNX_29_ else reFunction(
                    f'出让年限([{self.reStr}]*)成交价\(万元\)', items)
                # 行业分类
                HYFL_30 = reFunction(f'行业分类:([{self.reStr}]*)土地级别', items)
                # 土地级别
                TDJB_31 = reFunction(f'土地级别:([{self.reStr}]*)成交价格\(万元\)',
                                     items)
                # 成交价格
                CJJG_32_ = reFunction(f'成交价格\(万元\):([{self.reStr}]*)分期支付约定',
                                      items)
                CJJG_32 = CJJG_32_ if CJJG_32_ else reFunction(
                    f'成交价格\(万元\)([{self.reStr}]*)明细用途', items)
                # 分期支付约定—支付期号
                ZFQH_33 = data.xpath(
                    '//table/tbody/tr[10]/td[1]/text()').extract_first()
                # 分期支付约定—约定支付日期
                YDZFRQ_34 = data.xpath(
                    '//table/tbody/tr[10]/td[2]/text()').extract_first()
                # 分期支付约定—约定支付金额
                YDZFJE_35 = data.xpath(
                    '//table/tbody/tr[10]/td[3]/text()').extract_first()
                # 分期支付约定—备注
                BZ_36 = data.xpath(
                    'string(//table/tbody/tr[10]/td[4])').extract_first()
                # 土地使用权人
                TDSTQR_37_ = reFunction(f'土地使用权人:([{self.reStr}]*)约定容积率', items)
                TDSTQR_37 = TDSTQR_37_ if TDSTQR_37_ else reFunction(
                    f'受让单位([{self.reStr}]*)备注', items)
                # 约定容积率——下限
                SX_38 = reFunction(f'下限:([{self.reStr}]*)上限', items)
                # 约定容积率——上限
                XX_39 = reFunction(f'上限:([{self.reStr}]*)约定交地时间', items)
                # 约定交地时间
                YDJDSJ_40 = reFunction(f'约定交地时间:([{self.reStr}]*)约定开工时间', items)
                # 约定开工时间
                YDKGSJ_41 = reFunction(f'约定开工时间:([{self.reStr}]*)约定竣工时间', items)
                # 约定竣工时间
                YDJGSJ_42 = reFunction(f'约定竣工时间:([{self.reStr}]*)实际开工时间', items)
                # 实际开工时间
                SJKGSJ_43 = reFunction(f'实际开工时间:([{self.reStr}]*)实际竣工时间', items)
                # 实际竣工时间
                SJJGSJ_44 = reFunction(f'实际竣工时间:([{self.reStr}]*)批准单位', items)
                # 批准单位
                PZDW_45 = reFunction(f'批准单位:([{self.reStr}]*)合同签订日期', items)
                # 合同签订日期
                HTQDRQ_46 = reFunction(f'合同签订日期:([{self.reStr}]*)\s', items)
                # 爬取时间
                crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
                # 爬取地址url
                url = response.url
                # 唯一标识
                md5Mark = encrypt_md5(url + LY_19 + SJ_20)
                # 是否需要判断重复 请求
                if DUPLICATE_SWITCH:
                    if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                        self.duplicateUrl += 1
                if self.duplicateUrl < 50:
                    # 重复效验通过, 存储数据
                    self._save_row([
                        BT_18, LY_19, SJ_20, XZQ_21, DZJGH_22, XMMC_23,
                        XMWZ_24, MJ_25, TDLY_26, TSYT_27, GDFS_28, TDSYNX_29,
                        HYFL_30, TDJB_31, CJJG_32, ZFQH_33, YDZFRQ_34,
                        YDZFJE_35, BZ_36, TDSTQR_37, SX_38, XX_39, YDJDSJ_40,
                        YDKGSJ_41, YDJGSJ_42, SJKGSJ_43, SJJGSJ_44, PZDW_45,
                        HTQDRQ_46, crawlingTime, url, md5Mark
                    ])
                    yield
                else:
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)
        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)