def start_requests(self):
    """Yield one request per report page, earliest pages at highest priority.

    The first report page is fetched synchronously to read the total page
    count (``reportTotalPage``). A request is then generated for every
    page; a new session ID is obtained from ``getSesion`` each time the
    rotation counter reaches 10. Every generated page number is written to
    ``self.filePage``.

    Yields:
        Request: GET request for a single page, handled by
            ``self.parse_index``; ``meta`` carries ``page`` and ``priority``.

    Raises:
        IntegrationException: if preparing the requests fails for any reason.
    """
    # Page count used when the first response does not expose a total.
    default_pages = 100
    try:
        sesionID = getSesion(self.targetUrl)
        now_time = int(time.time() * 1000)
        res = requests.get(self.origin_url.format(now_time, sesionID),
                           headers=self.header,
                           allow_redirects=False,
                           timeout=300)
        counts = re.findall(r'reportTotalPage=(\d*);',
                            res.content.decode('gbk'))
        # The match with the most digits is taken as the real total; the
        # pattern can also capture an empty string, which is ignored here.
        longest = max(counts, key=len, default='')
        if longest:
            pages = int(longest)
        else:
            self.log(f'未解析到总页数, 使用默认页数{default_pages}',
                     level=logging.WARNING)
            pages = default_pages

        sumPage = 0
        for page in range(1, pages + 1):
            # Rotate the session once the counter reaches 10.
            if sumPage >= 10:
                sesionID = getSesion(self.targetUrl)
                now_time = int(time.time() * 1000)
                sumPage = 0
            # NOTE(review): this overwrites the URL template that was
            # formatted above; kept because other code may read
            # self.origin_url - confirm and switch to a local if not.
            self.origin_url = f'http://jcjg.nr.gd.gov.cn:8088/GisqReport7.0/ReportServer?_={now_time}&__boxModel__=true&op=page_content&sessionID={sesionID}&pn={page}'
            self.log('当前爬取页数{}'.format(page), level=logging.INFO)
            priority = pages + 1 - page
            # NOTE(review): page numbers are written back to back with no
            # separator - confirm how this file is read.
            self.filePage.write(str(page))
            yield Request(
                self.origin_url,
                method='GET',
                priority=priority,
                callback=self.parse_index,
                meta={'page': page, 'priority': priority},
                dont_filter=True,
            )
            sumPage += 1
    except Exception as e:
        self.log(f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}', level=logging.ERROR)
        raise IntegrationException('爬取阻塞,请重启') from e
def start_requests(self):
    """Schedule listing pages 1-12 and then the site's index listing.

    Each paginated URL is built from ``self.targetUrl`` formatted with the
    page number; lower page numbers get a higher priority. After the
    paginated requests, one request for the fixed ``index.html`` listing is
    yielded with ``page`` set to 0.

    Yields:
        Request: GET requests handled by ``self.parse_index``.

    Raises:
        IntegrationException: if building a request fails.
    """
    last_page = 12
    index_url = 'http://zrzy.guizhou.gov.cn/zfxxgk/zfxxgkml/zdlyxxgkml/tdcrzrgg/index.html'
    try:
        for page_no in range(1, last_page + 1):
            rank = last_page + 1 - page_no
            page_meta = {'page': page_no, 'priority': rank}
            yield Request(
                self.targetUrl.format(page_no),
                method='GET',
                headers=self.header,
                priority=rank,
                callback=self.parse_index,
                meta=page_meta,
            )
        yield Request(
            index_url,
            method='GET',
            headers=self.header,
            callback=self.parse_index,
            meta={'page': 0},
        )
    except Exception as err:
        self.log(
            f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {err}\n{traceback.format_exc()}',
            level=logging.ERROR)
        raise IntegrationException('爬取阻塞,请重启')
def start_requests(self):
    """Schedule listing pages 1-13, then the index listing.

    Paginated URLs come from ``self.targetUrl`` formatted with the page
    number. The index request is only produced when the paginated loop
    finished without an error, and it is not covered by the error handler.

    Yields:
        Request: unfiltered GET requests handled by ``self.parse_index``.

    Raises:
        IntegrationException: if building a paginated request fails.
    """
    index_url = 'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/crjg/index.shtml'
    try:
        for page_no in range(1, 14):
            yield Request(
                self.targetUrl.format(page_no),
                method='GET',
                headers=self.header,
                callback=self.parse_index,
                meta={'page': page_no},
                dont_filter=True)
    except Exception as err:
        self.log(
            f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {err}\n{traceback.format_exc()}',
            level=logging.ERROR)
        raise IntegrationException('爬取阻塞,请重启')

    # Reached only on success: the handler above always re-raises.
    index_meta = {'page': 'index', 'priority': 1}
    yield Request(
        index_url,
        method='GET',
        headers=self.header,
        callback=self.parse_index,
        meta=index_meta,
        dont_filter=True)
def start_requests(self):
    """Schedule listing pages 1-2 and then the site's index listing.

    Paginated URLs come from ``self.targetUrl`` formatted with the page
    number, with priority ``4 - page``. The fixed ``index.jhtml`` listing is
    requested afterwards with ``page`` and ``priority`` both 1 in its meta.

    Yields:
        Request: unfiltered GET requests handled by ``self.parse_index``.

    Raises:
        IntegrationException: if building a request fails.
    """
    index_url = 'http://zzland.zhengzhou.gov.cn/hbgd/index.jhtml'
    try:
        for page_no in (1, 2):
            rank = 4 - page_no
            page_meta = {'page': page_no, 'priority': rank}
            yield Request(
                self.targetUrl.format(page_no),
                method='GET',
                headers=self.header,
                priority=rank,
                callback=self.parse_index,
                meta=page_meta,
                dont_filter=True)
        index_meta = {'page': 1, 'priority': 1}
        yield Request(
            index_url,
            method='GET',
            headers=self.header,
            callback=self.parse_index,
            meta=index_meta,
            dont_filter=True)
    except Exception as err:
        self.log(
            f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {err}\n{traceback.format_exc()}',
            level=logging.ERROR)
        raise IntegrationException('爬取阻塞,请重启')
def start_requests(self):
    """Yield POST requests for the remaining listing pages, resuming if possible.

    The starting page is read from ``self.filePage``; when it is empty or
    unreadable the crawl starts at page 0. For each page up to (but not
    including) 105 the offset ``pn = page * 18`` is stored in ``self.data``,
    the page number is written to ``self.pathPage`` and a JSON-bodied POST
    is yielded.

    Yields:
        Request: POST request handled by ``self.parse_index``; ``meta``
            carries ``page`` and ``priority``.

    Raises:
        IntegrationException: if preparing a request fails.
    """
    total_pages = 105
    try:
        try:
            # Read once: a second read() on the same handle returns '' and
            # int('') would raise, which silently disabled resuming.
            saved = self.filePage.read()
            pageStart = int(saved) if saved else 0
        except Exception:
            pageStart = 0
            self.log(f'获取历史页错误: {traceback.format_exc()}', level=logging.ERROR)

        # An exhausted crawl (pageStart >= total_pages) yields nothing.
        for page in range(pageStart, total_pages):
            self.data['pn'] = page * 18
            requests_data = json.dumps(self.data)
            priority = 89 - page
            # Persist progress so a restart can pick up from this page.
            with open(self.pathPage, 'w+') as fp:
                fp.write(str(page))
            yield Request(
                self.targetUrl,
                method='POST',
                headers=self.header,
                priority=priority,
                callback=self.parse_index,
                meta={'page': page, 'priority': priority},
                body=requests_data,
            )
    except Exception as e:
        self.log(f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
        raise IntegrationException('爬取阻塞,请重启') from e
def start_requests(self):
    """Schedule listing pages 1-7.

    Each URL is ``self.targetUrl`` formatted with the page number.

    Yields:
        Request: unfiltered GET requests handled by ``self.parse_index``.

    Raises:
        IntegrationException: if building a request fails.
    """
    first_page, last_page = 1, 7
    try:
        for page_no in range(first_page, last_page + 1):
            page_url = self.targetUrl.format(page_no)
            yield Request(
                page_url,
                method='GET',
                headers=self.header,
                callback=self.parse_index,
                meta={'page': page_no},
                dont_filter=True,
            )
    except Exception as err:
        self.log(f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {err}\n{traceback.format_exc()}', level=logging.ERROR)
        raise IntegrationException('爬取阻塞,请重启')
def start_requests(self):
    """Yield one form POST per listing page, earliest pages at highest priority.

    Pages 1 through 110 are requested from ``self.targetUrl``. Every
    generated page number is logged and written to ``self.filePage``.

    Yields:
        FormRequest: unfiltered POST handled by ``self.parse_index``;
            ``meta`` carries ``page`` and ``priority``.

    Raises:
        IntegrationException: if preparing a request fails.
    """
    pages = 110
    try:
        for page in range(1, pages + 1):
            self.log('当前爬取页数{}'.format(page), level=logging.INFO)
            priority = pages + 1 - page
            self.filePage.write(str(page))
            # Field names (including 'tatol') are sent to the server as-is.
            data = {
                'total_page': str(pages),
                'tatol': '1312',
                'currentPage': f'{page}',
                'pageSize': '12',
                'code': '0015-0001',
                'type': '0,1,4,5,6,7,9,11,99',
                'name': '',
                'area': '',
                'status': '',
                'currentSelectTime': '',
                'stopstatus': '',
                'suspendstatus': '',
            }
            yield FormRequest(
                self.targetUrl,
                method='POST',
                formdata=data,
                priority=priority,
                callback=self.parse_index,
                meta={'page': page, 'priority': priority},
                dont_filter=True,
            )
    except Exception as e:
        self.log(f'当前爬取失败页数{page}, {datetime.datetime.now()}, 错误: {e}', level=logging.ERROR)
        raise IntegrationException('爬取阻塞,请重启') from e
def start_requests(self):
    """Schedule ASP.NET pager postbacks for listing pages 1 and 2.

    Every request posts the same captured ``__VIEWSTATE`` /
    ``__EVENTVALIDATION`` values and differs only in the pager fields
    (``__EVENTARGUMENT`` and ``AspNetPager1_input``), which carry the page
    number.

    Yields:
        FormRequest: POST requests handled by ``self.parse_index``.

    Raises:
        IntegrationException: if building a request fails.
    """
    # Captured page state; reproduced exactly as stored in this file.
    view_state = '''/wEPDwULLTE0NDY1NDA3MTQPZBYCAgMPZBYKAgUPPCsACQEADxYEHghEYXRhS2V5cxYAHgtfIUl0ZW1Db3VudAIJZBYSZg9kFgwCAQ8PFgIeBFRleHQFBkNLNTMwMWRkAgMPDxYCHwIFDW5ld3NsaXN0LmFzcHhkZAIFDw8WAh4LTmF2aWdhdGVVcmwFF25ld3NsaXN0LmFzcHg/aWQ9Q0s1MzAxZBYCZg8VARLlnJ/lnLDliKnnlKjop4TliJJkAgcPD2QWAh4Fc3R5bGUFDWRpc3BsYXk6bm9uZTtkAggPFQIMZGlzcGxheTpub25lBGRpdjFkAgkPPCsACQEADxYEHwAWAB8BAv////8PZGQCAQ9kFgoCAQ8PFgIfAgUGQ0s1MzAzZGQCAw8PFgIfAgUMdHdvcGFnZS5hc3B4ZGQCBQ8PZBYEHgdvbmNsaWNrBQ1zaG93bmxpc3QoMik7HwQFC2N1cnNvcjpoYW5kFgJmDxUBFeWcn+WcsOS+m+W6lOWSjOWHuuiuqWQCCA8VAg1kaXNwbGF5OmJsb2NrBGRpdjJkAgkPPCsACQEADxYEHwAWAB8BAgRkFghmD2QWAmYPFQQNbmV3c2xpc3QuYXNweAhDSzUzMDMwMQRtYWluEuWcn+WcsOS+m+W6lOiuoeWIkmQCAQ9kFgJmDxUEDW5ld3NsaXN0LmFzcHgIQ0s1MzAzMDIEbWFpbiTlnJ/lnLDmi5vmoIfmi43ljZbmjILniYzlh7rorqnlhazlkYpkAgIPZBYCZg8VBA1uZXdzbGlzdC5hc3B4CENLNTMwMzAzBG1haW4S5Zyf5Zyw5Ye66K6p57uT5p6cZAIDD2QWAmYPFQQNbmV3c2xpc3QuYXNweAhDSzUzMDMwNARtYWluGOW7uuiuvueUqOWcsOaJueWHhuaWh+S7tmQCAg9kFgwCAQ8PFgIfAgUGQ0s1MzA5ZGQCAw8PFgIfAgUNbmV3c2xpc3QuYXNweGRkAgUPDxYCHwMFF25ld3NsaXN0LmFzcHg/aWQ9Q0s1MzA5ZBYCZg8VASflvoHlnLDnrqHnkIbmlL/nrZblkozkv6Hmga/lhazlvIDlubPlj7BkAgcPD2QWAh8EBQ1kaXNwbGF5Om5vbmU7ZAIIDxUCDGRpc3BsYXk6bm9uZQRkaXYzZAIJDzwrAAkBAA8WBB8AFgAfAQL/////D2RkAgMPZBYMAgEPDxYCHwIFBkNLNTMxMmRkAgMPDxYCHwIFDW5ld3NsaXN0LmFzcHhkZAIFDw8WAh8DBRduZXdzbGlzdC5hc3B4P2lkPUNLNTMxMmQWAmYPFQEe5b6B5Zyw5ZGK55+l5Lmm5ZKM5om55ZCO5YWs5ZGKZAIHDw9kFgIfBAUNZGlzcGxheTpub25lO2QCCA8VAgxkaXNwbGF5Om5vbmUEZGl2NGQCCQ88KwAJAQAPFgQfABYAHwEC/////w9kZAIED2QWDAIBDw8WAh8CBQZDSzUzMDVkZAIDDw8WAh8CBQ1uZXdzbGlzdC5hc3B4ZGQCBQ8PFgIfAwUXbmV3c2xpc3QuYXNweD9pZD1DSzUzMDVkFgJmDxUBJ+WcsOS7t+WKqOaAgeebkea1i+aVsOaNruWSjOWfuuWHhuWcsOS7t2QCBw8PZBYCHwQFDWRpc3BsYXk6bm9uZTtkAggPFQIMZGlzcGxheTpub25lBGRpdjVkAgkPPCsACQEADxYEHwAWAB8BAv////8PZGQCBQ9kFgwCAQ8PFgIfAgUGQ0s1MzA2ZGQCAw8PFgIfAgUNbmV3c2xpc3QuYXNweGRkAgUPDxYCHwMFF25ld3NsaXN0LmFzcHg/aWQ9Q0s1MzA2ZBYCZg8VA
RLpl7Lnva7lnJ/lnLDlpITnva5kAgcPD2QWAh8EBQ1kaXNwbGF5Om5vbmU7ZAIIDxUCDGRpc3BsYXk6bm9uZQRkaXY2ZAIJDzwrAAkBAA8WBB8AFgAfAQL/////D2RkAgYPZBYKAgEPDxYCHwIFBkNLNTMwNGRkAgMPDxYCHwIFDHR3b3BhZ2UuYXNweGRkAgUPD2QWBB8FBQ1zaG93bmxpc3QoNyk7HwQFC2N1cnNvcjpoYW5kFgJmDxUBEuW+geWcsOWJjeacn+WHhuWkh2QCCA8VAgxkaXNwbGF5Om5vbmUEZGl2N2QCCQ88KwAJAQAPFgQfABYAHwECAmQWBGYPZBYCZg8VBA1uZXdzbGlzdC5hc3B4CENLNTMwNDAxBG1haW4e5ouf5b6B5pS25Zyf5Zyw5ZGK55+l5ZKM5ZCs6K+BZAIBD2QWAmYPFQQNbmV3c2xpc3QuYXNweAhDSzUzMDQwMgRtYWluG+aLn+W+geaUtuWcn+WcsOeOsOeKtuiwg+afpWQCBw9kFgoCAQ8PFgIfAgUGQ0s1MzA3ZGQCAw8PFgIfAgUMdHdvcGFnZS5hc3B4ZGQCBQ8PZBYEHwUFDXNob3dubGlzdCg4KTsfBAULY3Vyc29yOmhhbmQWAmYPFQES5b6B5Zyw5a6h5p+l5oql5om5ZAIIDxUCDGRpc3BsYXk6bm9uZQRkaXY4ZAIJDzwrAAkBAA8WBB8AFgAfAQICZBYEZg9kFgJmDxUEDW5ld3NsaXN0LmFzcHgIQ0s1MzA3MDEEbWFpbiflhpznlKjlnLDovaznlKjlnJ/lnLDlvoHmlLbmibnlh4bmlofku7ZkAgEPZBYCZg8VBA1uZXdzbGlzdC5hc3B4CENLNTMwNzAyBG1haW4S5b6B5pS25oql5om55p2Q5paZZAIID2QWCgIBDw8WAh8CBQZDSzUzMDJkZAIDDw8WAh8CBQx0d29wYWdlLmFzcHhkZAIFDw9kFgQfBQUNc2hvd25saXN0KDkpOx8EBQtjdXJzb3I6aGFuZBYCZg8VARLlvoHlnLDnu4Tnu4flrp7mlr1kAggPFQIMZGlzcGxheTpub25lBGRpdjlkAgkPPCsACQEADxYEHwAWAB8BAgJkFgRmD2QWAmYPFQQNbmV3c2xpc3QuYXNweAhDSzUzMDIwMQRtYWluEuW+geaUtuWcn+WcsOWFrOWRimQCAQ9kFgJmDxUEDW5ld3NsaXN0LmFzcHgIQ0s1MzAyMDIEbWFpbh7lvoHlnLDooaXlgb/lronnva7mlrnmoYjlhazlkYpkAgYPZBYCAgEPEA8WBh4NRGF0YVRleHRGaWVsZAUFY25hbWUeDkRhdGFWYWx1ZUZpZWxkBQRwa2lkHgtfIURhdGFCb3VuZGdkEBUYDOWKnuS6i+Wkp+WOhQzop4TliJLorrLloIIR5Yy65Y6/5bGAKOWIhuWxgCkM572R56uZ5L+h5oGvDOWfuuWxguWNleS9jR7op4TliJLpmaLnrKzlm5vmrKHkuqTpgJrosIPmn6UM5LqM57qn5py65p6ECeWbvueJh+WxlQznvZHkuIrkupLliqgS5bu66K6u5o+Q5qGI5Yqe55CGFeWkqea0peW4guagh+WHhuWcsOWbvgzmlL/liqHlhazlvIAM5bel5L2c5Yqo5oCBDOWcn+WcsOeuoeeQhgznn7/kuqfnrqHnkIYM5rW35rSL566h55CGDeael+S4mueuoeeQhiAM5pS/562W5rOV6KeEDOWfjuS5oeinhOWIkhjmiavpu5HpmaTmgbbkuJPpobnmlpfkuokt4oCc5LiN5b+Y5Yid5b+D44CB54mi6K6w5L2/5ZG94oCd5Li76aKY5pWZ6IKyD+S4jeWKqOS6p+eZu+iusA/np5HmioDkuI7mlofljJYG5YWo6YOoFRgEQ0swNgRDSzExBENLMDcEQ0sxNgRDSzEwBENLMTcEQ0syOARDSzI5BENLMDUEQ0swOQRDSzEyB
ENLNTAEQ0s1MQRDSzUzBENLNTQEQ0s1NQRDSzU2BENLNTgEQ0s1MgRDSzk5BENLOTgEQ0s1OQRDSzEzAkNLFCsDGGdnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2RkAgcPPCsACQEADxYEHwAWAB8BAgNkFgZmD2QWAmYPFQMMdHdvcGFnZS5hc3B4BENLNTMM5Zyf5Zyw566h55CGZAIBD2QWAmYPFQMMdHdvcGFnZS5hc3B4BkNLNTMwMxXlnJ/lnLDkvpvlupTlkozlh7rorqlkAgIPZBYCZg8VAw1uZXdzbGlzdC5hc3B4CENLNTMwMzAxEuWcn+WcsOS+m+W6lOiuoeWIkmQCCA88KwAJAQAPFgQfABYAHwECI2QWRmYPZBYCZg8VBBRuZXdzLmFzcHg/aWQ9MTAyMzUyNwAu5aSp5rSl5biCMjAyMOW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkgoyMDIwLTA0LTA3ZAIBD2QWAmYPFQQUbmV3cy5hc3B4P2lkPTEwMjE4MTcAMeays+S4nOWMujIwMjDlubTluqblm73mnInlu7rorr7nlKjlnLDkvpvlupTorqHliJIKMjAyMC0wMS0yMmQCAg9kFgJmDxUEFG5ld3MuYXNweD9pZD0xMDIxODE0ADTmtKXljZfljLoyMDIw5bm05Zu95pyJ5bu66K6+55So5Zyw5L6b5bqU6K6h5YiS5YWs56S6CjIwMjAtMDEtMjJkAgMPZBYCZg8VBBRuZXdzLmFzcHg/aWQ9MTAxNzI1MQA8MjAxOeW5tDHigJQ55pyI5Lu95YWo5Zu95oi/5Zyw5Lqn5byA5Y+R5oqV6LWE5ZKM6ZSA5ZSu5oOF5Ya1CjIwMTktMTAtMThkAgQPZBYCZg8VBBRuZXdzLmFzcHg/aWQ9MTAxMDAwOQAu5aSp5rSl5biCMjAxOeW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkgoyMDE5LTA1LTE1ZAIFD2QWAmYPFQQUbmV3cy5hc3B4P2lkPTEwMDc1ODEAPuWFs+S6juino+mZpOa0peiTn++8iOaMgu+8iTIwMTQtMDIy5Y+35Zyf5Zyw5Ye66K6p5ZCI5ZCM5YWs5ZGKCjIwMTktMDMtMTlkAgYPZBYCZg8VBBRuZXdzLmFzcHg/aWQ9MTAwNTY3NgAi5Lic5Li95Yy6MjAxOeW5tOWcn+WcsOS+m+W6lOiuoeWIkgoyMDE5LTAzLTAxZAIHD2QWAmYPFQQUbmV3cy5hc3B4P2lkPTEwMDAwMzYAJOWFqOWbveWFtuS7luWfjuW4guWcn+WcsOS+m+W6lOiuoeWIkgoyMDE5LTAyLTI0ZAIID2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMyMzc5NwBn5aSp5rSl5biCMjAxOOW5tOW6puS9j+WuheeUqOWcsOS+m+W6lOiuoeWIkuWPiuS4ieW5tOa7muWKqOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE4LTA1LTA4ZAIJD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMyMzUxMgBV5aSp5rSl5biCMjAxOOW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE4LTAzLTIyZAIKD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ3MABY5aSp5rSl5biCMjAxN+W5tOW6puWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE3LTExLTEzZAILD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2OQBe5aSp5rSl5biCMjAxN+W5tOWJjeS4ieWto+W6puaIv
+WcsOS6p+eUqOWcsOS+m+W6lOaDheWGteOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE3LTEwLTEyZAIMD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2OABb5aSp5rSl5biCMjAxN+W5tOS4iuWNiuW5tOaIv+WcsOS6p+eUqOWcsOS+m+W6lOaDheWGteOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE3LTA3LTEyZAIND2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2NwBZ5aSp5rSl5biCMjAxN+W5tDHlraPluqbmiL/lnLDkuqfnlKjlnLDkvpvlupTmg4XlhrXjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxNy0wNS0xN2QCDg9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NjYAVeWkqea0peW4gjIwMTflubTlm73mnInlu7rorr7nlKjlnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxNy0wMy0yN2QCDw9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NjUAWOWkqea0peW4gjIwMTblubTluqblm73mnInlu7rorr7nlKjlnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxNi0xMS0yOWQCEA9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NjQAVzIwMTctMjAxOeW5tOS9j+WuheeUqOWcsOS+m+W6lOS4ieW5tOa7muWKqOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE2LTEwLTI4ZAIRD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2MwBV5aSp5rSl5biCMjAxNuW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE2LTAzLTMwZAISD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2MgBe5aSp5rSl5biCMjAxNeW5tOW6puWFqOW4guWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE2LTAxLTA2ZAITD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2MQBV5aSp5rSl5biCMjAxNeW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE1LTAzLTMxZAIUD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2MABM5aSp5rSl5biCMjAxNOW5tOW6pueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE0LTEyLTIyZAIVD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ1OQBV5aSp5rSl5biCMjAxNOW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE0LTA0LTAzZAIWD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ1MwBM5aSp5rSl5biCMjAxMuW5tOW6puWcn+WcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE0LTAxLTE3Z
AIXD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ1NQBM5aSp5rSl5biCMjAxM+W5tOW6puWcn+WcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE0LTAxLTE3ZAIYD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ1OABn5Z2a5Yaz6LSv5b275oi/5Zyw5Lqn6LCD5o6n5pS/562WIOWIh+WunuiQveWunuS9j+aIv+eUqOWcsOS+m+W6lOOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDEzLTEwLTI1ZAIZD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ1NwB357un57ut5rex5YWl6LSv5b275Zu95Yqe5Y+RMTflj7fmlofku7bnsr7npZ7liIflrp7okL3lrp7ku4rlubTkvY/miL/nlKjlnLDkvpvlupTjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxMy0wNy0wNGQCGg9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NTYATDIwMTPlubTlhajlm73kvY/miL/nlKjlnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxMy0wNC0xNmQCGw9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NTQAmQHotK/lvbvmnY7lhYvlvLrlia/mgLvnkIblnKjlhajlm73kv53pmpzmgKflronlsYXlt6XnqIvlt6XkvZzkvJrorq7kuIrorrLor53nmoTnsr7npZ7vvIzokL3lrp7ku4rlubTkvY/miL/nlKjlnLDkvpvlupTjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZEuLi4KMjAxMi0wNy0wNmQCHA9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NTIATOWkqea0peW4gjIwMTHlubTluqblnJ/lnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxMi0wMS0zMWQCHQ9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NTAAYeWkqea0peW4gjIwMDnlubTnu4/okKXmgKfmiL/lnLDkuqflvIDlj5HnlKjlnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxMC0xMi0zMGQCHg9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NTEAYeWkqea0peW4gjIwMTDlubTnu4/okKXmgKfmiL/lnLDkuqflvIDlj5HnlKjlnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxMC0xMi0zMGQCHw9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NDgAZTIwMDnlubTop4TliJLnjq/lpJbnjq/lhoXorqHliJLkvpvlupTnu4/okKXmgKflnLDlnZfmmI7nu4booagg44CQ6L2s6Ieq5Y6f5biC5Zu95Zyf5oi/566h5bGA572R56uZ44CRCjIwMDktMTItMjhkAiAPZBYCZg8VBBNuZXdzLmFzcHg/aWQ9MzAwNDQ5AFsyMDA55bm057uP6JCl5oCn5oi/5Zyw5Lqn5byA5Y+R55So5Zyw6K6h5YiS5a6J5o6S6KGo44CQ6L2s6Ieq5Y6f5biC5Zu95Zyf5oi/566h5bGA572R56uZ44CRCjIwMDktMTItMjhkAiEPZBYCZg8VBBNuZXdzLmFzcHg/aWQ9MzAwNDQ3AEkyMDA55bm05bel5Lia55So5Zyw6K6h5YiS5a6J5o6S6KGo44CQ6
L2s6Ieq5Y6f5biC5Zu95Zyf5oi/566h5bGA572R56uZ44CRCjIwMDktMTItMjhkAiIPZBYCZg8VBBNuZXdzLmFzcHg/aWQ9MzAwNDQ2AFIyMDA55bm05L+d6Zqc5oCn5L2P5oi/55So5Zyw6K6h5YiS5a6J5o6S6KGo44CQ6L2s6Ieq5Y6f5biC5Zu95Zyf5oi/566h5bGA572R56uZ44CRCjIwMDktMTItMjhkAgkPDxYEHhBDdXJyZW50UGFnZUluZGV4AgEeC1JlY29yZGNvdW50AidkZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WAgUMTGVmdDEkc2VhcmNoBQtMZWZ0MSRyZXNldNXI3qGhLoxzvr8eO1j2HeTugfIJ'''
    event_validation = '/wEWIwLhw+20CAKdlKkkAuWJhPELAvCUxuwDAvK/saAHAoijlK8BAu2VlowLAoijqMQJAu2V+rQPAu2VglcC7ZWO6QcC0ozARQLSjNT6CwKIo4CKDgKIo5DABwLtlaqhAgKBsYu8CAKBsZ9RAoGxx6oCAoGx288KAoGx7+QNAoGxg5kEAoGx64UCAoGxs/YLAsXD4bsPAsXDzYYEAoGx/7oFAu2VvsYKAqLig7gGAuyjuaoGAujImYgNArXSuJUHAuOzj+oDApHMqaIMAvGOgsgIUNOB7crAAeirbo/qpKOPxUWSV5M='
    try:
        for page_no in (1, 2):
            page_text = str(page_no)
            # Field order is kept exactly as in the captured postback.
            form_fields = {
                '__VIEWSTATE': view_state,
                '__VIEWSTATEGENERATOR': '14DD91A0',
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': page_text,
                '__EVENTVALIDATION': event_validation,
                'pkid': 'CK530301',
                'pkid2': '9',
                'newskindid': 'CK530301',
                'HiddenFieldPageFinished': '1',
                'Left1$ddl_cname': 'CK',
                'Left1$tb_search': '',
                'Left1$rbl_site': 'title',
                'AspNetPager1_input': page_text,
            }
            yield FormRequest(
                self.targetUrl,
                method='POST',
                headers=self.header,
                callback=self.parse_index,
                meta={'page': page_no},
                formdata=form_fields,
            )
    except Exception as err:
        self.log(
            f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {err}\n{traceback.format_exc()}',
            level=logging.ERROR)
        raise IntegrationException('爬取阻塞,请重启')
def start_requests(self):
    """Schedule form POSTs for listing pages 1-16.

    Each request posts the same category/search fields to
    ``self.targetUrl`` with ``pageNum`` set to the page number; lower page
    numbers get a higher priority (``17 - page``).

    Yields:
        FormRequest: POST requests handled by ``self.parse_index``.

    Raises:
        IntegrationException: if building a request fails.
    """
    page_limit = 17
    try:
        for page_no in range(1, page_limit):
            # NOTE(review): 'area' is already percent-encoded here -
            # confirm the server expects it in that form.
            form_fields = {
                'categoryId': '732',
                'typeId': '0',
                'pageNum': str(page_no),
                'pageSize': '10',
                'search': 'false',
                'Title': '',
                'StartTime': '',
                'EndTime': '',
                'area': '%E8%AF%B7%E9%80%89%E6%8B%A9',
            }
            rank = page_limit - page_no
            page_meta = {'page': page_no, 'priority': rank}
            yield FormRequest(
                self.targetUrl,
                method='POST',
                headers=self.header,
                priority=rank,
                callback=self.parse_index,
                meta=page_meta,
                formdata=form_fields,
            )
    except Exception as err:
        self.log(
            f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {err}\n{traceback.format_exc()}',
            level=logging.ERROR)
        raise IntegrationException('爬取阻塞,请重启')
def parse_detail(self, response):
    """Parse one land-parcel detail page and append a CSV row to ``self.fileDetail``.

    The row combines four sources:

    * values forwarded by the listing callback in ``response.meta``;
    * the announcement document referenced by the page's iframe;
    * the detail page itself (basic info, target details, bid record);
    * the JSON result notice fetched by ``targetId``.

    Attachments linked on the page are downloaded into ``self.dirName``.

    Args:
        response: Scrapy response of the detail page. ``meta`` is read for
            ``JYSJ``, ``JYZT``, ``ZDH``, ``TDWZ``, ``QSJ``, ``TDYT``,
            ``TDMJ`` and ``id``.

    Returns:
        None. Any exception is logged at ERROR level and swallowed so that
        one bad page does not abort the crawl.
    """
    site_root = 'https://www.sz68.com'
    try:
        data = Selector(text=response.body.decode('utf-8'))
        # The announcement is embedded via iframe externalframe1, or
        # externalframe0 when the former is absent.
        frame_src = (
            data.xpath('//iframe[@id="externalframe1"]/@src').extract_first()
            or data.xpath('//iframe[@id="externalframe0"]/@src').extract_first())
        noticeDetail = site_root + frame_src

        # Values handed over from the listing page.
        # (meta also carries JYFS; the row uses the page value JYFS_A.)
        meta = response.meta
        JYSJ = meta.get('JYSJ')
        JYZT = meta.get('JYZT')
        ZDH = meta.get('ZDH')
        TDWZ = meta.get('TDWZ')
        QSJ = meta.get('QSJ')
        TDYT = meta.get('TDYT')
        TDMJ = meta.get('TDMJ')
        target_id = meta.get('id')

        # Only filled by the prose (no-table) announcement layout.
        TDFZXZ = ''
        RJL = ''

        # ---- Announcement document ----
        detailData = requests.get(noticeDetail,
                                  headers=self.header,
                                  allow_redirects=False,
                                  timeout=60,
                                  verify=False)
        if detailData.status_code != 200:
            raise IntegrationException(f'获取公告详情失败, url: {noticeDetail}')

        detail = Selector(text=detailData.content.decode('utf-8'))
        # Whole document as one whitespace-free string for regex matching.
        items = str(detail.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '').replace('\n', '').replace(' ', '')
        # Body title
        ZWBT = ''.join(
            detail.xpath(
                '/html/body/div/p[2]/span//text() | /html/body/p[2]/span//text()|/html/body/p[1]/span//text()'
            ).extract())
        # Announcement period
        GGQ = reFunction(r'公告期自([\w \-\s]*)[止]?,', items)
        # Listing start time
        GPKSSJ = reFunction(
            r'挂牌期自(\d{4}年\d{1,2}月\d{1,2}日)[起]?至(?:\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时)止',
            items)
        # Listing end time
        GPJSSJ = reFunction(
            r'挂牌期自(?:\d{4}年\d{1,2}月\d{1,2}日)[起]?至(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时)止',
            items)

        # Table layout: read the parcel table.
        soup = BeautifulSoup(detailData.text)
        table = (soup.find('body').find('div').find('table')
                 or soup.find('table'))
        # Project helper; presumably maps header text to cell text - confirm.
        tdData = htmlTableTransformer().table_tr_td(table)
        # Parcel code / parcel number
        ZDDM_DKZDBH = tdData.get('宗地编号') or tdData.get('地块宗地编号')
        # Parcel number
        ZDH_A = tdData.get('宗地号')
        # Land location
        DKWZ = tdData.get('土地位置')
        # Land use
        DKYT = tdData.get('土地用途')
        # Admitted industry category
        ZRHYLB = tdData.get('准入行业类别')
        # Land area (square metres)
        TDMJ_A = tdData.get('土地面积(平方米)') or tdData.get('土地面积')
        # Building area (square metres) / total building area
        JZMJ = tdData.get('建筑面积(平方米)') or tdData.get('总建筑面积')
        # Listing start price (RMB, 10k yuan)
        GPQSJ = tdData.get('挂牌起始价(人民币、万元)')
        # Bid deposit (RMB, 10k yuan)
        JMBZJ = tdData.get('竞买(投标)保证金(人民币、万元)')
        # Land-use term (years)
        TDSYNX = tdData.get('土地使用年期')

        # Prose layout: no table in the document, so fall back to regexes.
        if not detail.xpath('//table').extract():
            ZDDM_DKZDBH = reFunction(r'宗地编号([\w \-\s]*),', items)
            TDSYNX = reFunction(r'土地使用年[\s期限]*[为]?(\d*年)', items)
            # Current state of land development
            TDFZXZ = reFunction(r'土地的发展建设现状:([\S\s]*。)', items)
            # Plot ratio, e.g. "容积率不大于1.518。"
            RJL = reFunction(r'容积率[\D]*([\.\d]*)。', items)
            DKWZ = reFunction(r'宗地位于([\w \s]*),', items)
            DKYT = reFunction(r'土地用途为([\w \s]*),', items)

        # NOTE(review): the three fields below are extracted for both
        # layouts - confirm they were not meant to be limited to the
        # prose layout.
        # Deposit deadline
        ZBJJZSJ = reFunction(
            r'保证金的到账截止时间为(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时\d{1,2}分)',
            items)
        # Address (the class also admits common Chinese punctuation)
        DZ = '|'.join(
            re.findall(r'地址:([\w \.\-\s\/\%,\(\)。 \? \! 、:]*);咨询电话', items))
        # Telephone
        DH = '|'.join(
            re.findall(r'咨询电话:([\w \.\-\s\/\%,\(\)。 \? \! 、]*)[;。]', items))

        # ---- Basic info on the detail page ----
        # Transaction method
        JYFS_A = data.xpath(
            '//div[@class="content_case1"]/div[1]/ul/li[2]/span/text()'
        ).extract_first()
        # Transaction type
        JYLX = data.xpath(
            '//div[@class="content_case1"]/div[1]/ul/li[1]/span/text()'
        ).extract_first()
        # Parcel
        ZD = data.xpath(
            '//div[@class="content_case1"]/div[1]/div/text()').extract_first()
        # Publish time
        FBSJ = data.xpath(
            '//div[@class="content_case1"]/div[2]/span[2]/text()'
        ).extract_first()
        # Transaction status
        JYZT_A = data.xpath(
            '//div[@class="content_case1"]/div[2]/span[3]/text()'
        ).extract_first()
        # Winning bidder
        ZBR_24 = data.xpath(
            '//div[@class="right_first"]/div[1]/div[2]/text()'
        ).extract_first()
        # Deal price (yuan)
        CJJ_25 = data.xpath(
            '//div[@class="right_first"]/div[2]/div[2]/text()'
        ).extract_first()
        # Deposit (yuan)
        BZJ_26 = data.xpath(
            '//div[@class="right_first twin"][1]/div[1]/div[2]/text()'
        ).extract_first()
        # Start price (yuan)
        QSJ_A = data.xpath(
            '//div[@class="right_first twin"][1]/div[2]/div[2]/text()'
        ).extract_first()
        # Bid increment (yuan)
        JJJT_28 = data.xpath(
            '//div[@class="right_first twin"][2]/div[1]/div[2]/text()'
        ).extract_first()
        # Price cap (yuan)
        FDJ_29 = data.xpath(
            '//div[@class="right_first twin"][2]/div[2]/div[2]/text()'
        ).extract_first()
        # Bid application deadline
        JMSQJZSJ_30 = data.xpath(
            '//div[@class="right_first twin"][3]/div[1]/div[2]/text()'
        ).extract_first()
        # Number of bidders
        JMRS_31 = data.xpath(
            '//div[@class="right_first twin"][3]/div[2]/div[2]/text()'
        ).extract_first()

        # ---- Target details (twelve spans, fixed order) ----
        BDdetail = data.xpath(
            '//li[@class="weather_info_ul_item"]/div[2]/span')
        ZDH_B = BDdetail[0].xpath('text()').extract_first()   # parcel number
        TDMJ_B = BDdetail[1].xpath('text()').extract_first()  # land area
        JZMJ_A = BDdetail[2].xpath('text()').extract_first()  # building area
        RJL_A = BDdetail[3].xpath('text()').extract_first()   # plot ratio
        JZFGL = BDdetail[4].xpath('text()').extract_first()   # building coverage
        JZGD = BDdetail[5].xpath('text()').extract_first()    # building height
        YT = BDdetail[6].xpath('text()').extract_first()      # use
        SYNX = BDdetail[7].xpath('text()').extract_first()    # term of use
        QY = BDdetail[8].xpath('text()').extract_first()      # district
        WZ_B = BDdetail[9].xpath('text()').extract_first()    # location
        LDL = BDdetail[10].xpath('text()').extract_first()    # green-space ratio
        JZLC = BDdetail[11].xpath('text()').extract_first()   # storeys

        # ---- Bid record (first data row) ----
        # Bidder
        JMR = data.xpath(
            '//div[@class="conomy"][1]/table/tr[2]/td[2]/text()'
        ).extract_first()
        # Bid amount (yuan)
        JMSJ = data.xpath(
            '//div[@class="conomy"][1]/table/tr[2]/td[3]/text()'
        ).extract_first()
        # Bid time
        CJSJ = data.xpath(
            '//div[@class="conomy"][1]/table/tr[2]/td[4]/text()'
        ).extract_first()
        # Status
        ZT = data.xpath(
            '//div[@class="conomy"][1]/table/tr[2]/td[5]/text()'
        ).extract_first()

        # ---- Result notice ----
        # Defaults keep the row writable when the notice is unavailable;
        # previously a non-200 reply left these names unbound and the row
        # was lost to a NameError.
        ZWBT_A = FBRQ = ZDH_C = JDR = ZBR_A = WZ_C = TDYT_A = ''
        TDMJ_C = JZMJ_B = QSJ_D = CJJ_A = YJL = ZHLMDJ = ''
        results = requests.post(
            'https://www.sz68.com/tiaim/web/resultdetailbytargetId',
            headers=self.header,
            data={'targetId': target_id},
            allow_redirects=False,
            timeout=60,
            verify=False)
        if results.status_code == 200:
            resultsData = results.json()
            notice = resultsData.get('notice') or {}
            # Markup string that the labelled values are matched in.
            ext = resultsData.get('fileExtName')
            ZWBT_A = notice.get('NAME')          # body title
            FBRQ = notice.get('PUBLISH_TIME')    # publish date
            ZDH_C = notice.get('DTL_REF_NO')     # parcel number
            JDR = reFunction(r'竞得人:([\w \.\-\s\/\%,]*)<', ext)
            ZBR_A = reFunction(r'中标人:([\w \.\-\s\/\%,]*)<', ext)
            WZ_C = reFunction(r'位置:([\w \.\-\s\/\%,、]*)<', ext)
            TDYT_A = reFunction(r'土地用途:([\w \.\-\s\/\%,、]*)<', ext)
            TDMJ_C = reFunction(r'土地面积:([\w \.\-\s\/\%,、]*)<', ext)
            JZMJ_B = reFunction(r'建筑面积:([\w \.\-\s\/\%,、]*)<', ext)
            QSJ_D = reFunction(r'起始价:([\w \.\-\s\/\%,、]*)<', ext)
            CJJ_A = reFunction(r'成交价:([\w \.\-\s\/\%,、]*)<', ext)
            YJL = reFunction(r'溢价率:([\w \.\-\s\/\%,、]*)<', ext)
            ZHLMDJ = reFunction(r'综合楼面单价:([\w \.\-\s\/\%,、]*)<', ext)

        # ---- Attachments ----
        accessory = '土地模块|'
        for link in data.xpath('//div[@class="accessory_link"]/a'):
            raw_name = link.xpath(
                'text()[position()=((position() mod 2)=0)]').extract_first()
            fileName = raw_name.strip() if raw_name else '未知名称'
            href = link.xpath('@href').extract_first()
            try:
                linkPath = self.dirName + f'土地模块_{ZDH}' + fileName
                # Own name on purpose: rebinding `response` here made the
                # row's url and md5 come from the last attachment.
                download = requests.get(href, headers=self.header, timeout=200)
                with open(linkPath, 'wb') as fp:
                    fp.write(download.content)
            except Exception as err:
                # Best effort: a failed attachment is reported, not fatal.
                self.log(f'附件下载失败, url: {href}, 错误: {err}',
                         level=logging.WARNING)
            else:
                accessory += fileName + '|'

        # Crawl time
        crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # URL of the detail page
        url = response.url
        # Missing parts count as empty text so a None value cannot drop the row.
        md5Mark = encrypt_md5(''.join(
            str(part or '') for part in (ZDH, WZ_C, ZWBT, url)))

        # Column order is the file's contract; do not reorder.
        csvFile = [
            JYSJ, JYZT, ZDH, TDWZ, QSJ, TDYT, TDMJ, JYFS_A, JYLX, ZD,
            FBSJ, JYZT_A, ZBR_24, CJJ_25, BZJ_26, QSJ_A, JJJT_28, FDJ_29,
            JMSQJZSJ_30, JMRS_31, ZWBT, GGQ, GPKSSJ, GPJSSJ, ZDDM_DKZDBH,
            ZDH_A, DKWZ, DKYT, ZRHYLB, TDMJ_A, JZMJ, TDSYNX, TDFZXZ, RJL,
            GPQSJ, JMBZJ, TDSYNX, ZBJJZSJ, DZ, DH,
            # Target details: own building area and location values.
            ZDH_B, TDMJ_B, JZMJ_A, RJL_A, JZFGL, JZGD, YT, SYNX, QY, WZ_B,
            LDL, JZLC, JMR, JMSJ, CJSJ, ZT,
            # Result notice.
            ZWBT_A, FBRQ, ZDH_C, JDR, ZBR_A, WZ_C, TDYT_A, TDMJ_C, JZMJ_B,
            QSJ_D, CJJ_A, YJL, ZHLMDJ,
            crawlingTime, url, md5Mark, accessory,
        ]
        fileData = []
        for value in csvFile:
            try:
                # Keep every field on one line and free of the separator.
                fileData.append(
                    value.replace(',', ' ').replace('\n', '').replace('\r', ''))
            except (AttributeError, TypeError):
                # Non-string values (e.g. None) are written via str().
                fileData.append(str(value))
        self.fileDetail.write(','.join(fileData))
        self.fileDetail.write('\n')
    except Exception as e:
        self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)