コード例 #1
0
 def start_requests(self):
     '''
     Schedule one request per report page, renewing the session every ten requests.

     A session ID is obtained with ``getSesion(self.targetUrl)`` and used for
     a blocking ``requests.get`` on ``self.origin_url``; the GBK-decoded body
     is searched for ``reportTotalPage=<digits>;`` to learn the page count.
     Pages ``1..count`` are then yielded with descending priority, so page 1
     has the highest one. After ten requests have been issued with one
     session, a new session ID and timestamp are fetched.

     Side effects: ``self.origin_url`` is overwritten with the URL of the
     page being scheduled, and each page number is written to
     ``self.filePage``.

     Yields:
         Request: GET request handled by ``self.parse_index``; ``meta``
         carries ``page`` and ``priority``.

     Raises:
         IntegrationException: if anything fails while preparing or
         yielding the requests. The original error is logged and chained.
     '''
     try:
         sesionID = getSesion(self.targetUrl)
         now_time = int(time.time() * 1000)
         res = requests.get(self.origin_url.format(now_time, sesionID), headers=self.header,
                            allow_redirects=False, timeout=300)
         # Raw string: '\d' inside a plain literal is an invalid escape sequence.
         candidates = re.findall(r'reportTotalPage=(\d*);', res.content.decode('gbk'))
         # Same pick as before: the longest digit run, the first one on ties.
         longest = max(candidates, key=len, default='')
         if not longest:
             # Fail with a readable reason instead of an opaque IndexError /
             # int('') ValueError; the handler below logs it and re-raises.
             raise ValueError('reportTotalPage not found in the first response')
         pages = int(longest)
         sumPage = 0
         for page in range(1, pages + 1):
             if sumPage >= 10:
                 # Ten requests were issued with the current session: renew it.
                 sesionID = getSesion(self.targetUrl)
                 now_time = int(time.time() * 1000)
                 sumPage = 0
             # NOTE(review): this replaces the template that was formatted
             # above, so a second call of this method would format a URL
             # without placeholders - confirm the method only runs once.
             self.origin_url = f'http://jcjg.nr.gd.gov.cn:8088/GisqReport7.0/ReportServer?_={now_time}&__boxModel__=true&op=page_content&sessionID={sesionID}&pn={page}'
             self.log('当前爬取页数{}'.format(page), level=logging.INFO)
             priority = pages + 1 - page
             self.filePage.write(str(page))
             yield Request(self.origin_url, method='GET', priority=priority, callback=self.parse_index,
                           meta={'page': page, 'priority': priority},
                           dont_filter=True,
                           )
             sumPage += 1
     except Exception as e:
         self.log(f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}', level=logging.ERROR)
         raise IntegrationException('爬取阻塞,请重启') from e
コード例 #2
0
 def start_requests(self):
     '''
     Yield the listing-page requests followed by the index-page request.

     Pages 1 through 12 are requested from ``self.targetUrl`` formatted with
     the page number; priorities run from 12 (page 1) down to 1 (page 12).
     The fixed index URL is requested last with ``page`` set to 0 in its
     meta. Every response is handled by ``self.parse_index``.

     Raises:
         IntegrationException: after the error and its traceback are
         logged, if building or yielding a request fails.
     '''
     index_url = 'http://zrzy.guizhou.gov.cn/zfxxgk/zfxxgkml/zdlyxxgkml/tdcrzrgg/index.html'
     page_numbers = range(1, 13)
     try:
         # Pair each page with its priority: (1, 12), (2, 11), ..., (12, 1).
         for page, priority in zip(page_numbers, reversed(page_numbers)):
             page_meta = {'page': page, 'priority': priority}
             yield Request(
                 self.targetUrl.format(page),
                 method='GET',
                 headers=self.header,
                 priority=priority,
                 callback=self.parse_index,
                 meta=page_meta,
             )
         yield Request(index_url, method='GET', headers=self.header,
                       callback=self.parse_index, meta={'page': 0})
     except Exception as e:
         failure = f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}'
         self.log(failure, level=logging.ERROR)
         raise IntegrationException('爬取阻塞,请重启')
コード例 #3
0
 def start_requests(self):
     '''
     Yield GET requests for listing pages 1 through 13, then the index page.

     The paged requests use ``self.targetUrl`` formatted with the page
     number. If they were all produced without error, one more request for
     the fixed index URL is yielded with meta ``page`` = 'index' and
     ``priority`` = 1. All requests set ``dont_filter=True`` and are handled
     by ``self.parse_index``.

     Raises:
         IntegrationException: after the error and its traceback are
         logged, if producing a paged request fails.
     '''
     try:
         for page in range(1, 14):
             page_url = self.targetUrl.format(page)
             yield Request(page_url, method='GET', headers=self.header,
                           callback=self.parse_index, meta={'page': page},
                           dont_filter=True)
     except Exception as e:
         failure = f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}'
         self.log(failure, level=logging.ERROR)
         raise IntegrationException('爬取阻塞,请重启')

     # Reached only when the loop above finished cleanly (the handler always
     # raises), and deliberately outside the try so failures here are not
     # converted to IntegrationException.
     index_meta = {'page': 'index', 'priority': 1}
     yield Request(
         'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/crjg/index.shtml',
         method='GET',
         headers=self.header,
         callback=self.parse_index,
         meta=index_meta,
         dont_filter=True)
コード例 #4
0
 def start_requests(self):
     '''
     Yield the paged listing requests followed by the index-page request.

     Pages 1 and 2 of ``self.targetUrl`` are requested with priorities 3
     and 2 respectively. The fixed index URL is requested last without an
     explicit request priority; its meta carries ``page`` = 1 and
     ``priority`` = 1. All requests set ``dont_filter=True`` and are handled
     by ``self.parse_index``.

     Raises:
         IntegrationException: after the error and its traceback are
         logged, if building or yielding a request fails.
     '''
     index_url = 'http://zzland.zhengzhou.gov.cn/hbgd/index.jhtml'
     try:
         for page in range(1, 3):
             page_meta = dict(page=page, priority=4 - page)
             yield Request(
                 self.targetUrl.format(page),
                 method='GET',
                 headers=self.header,
                 priority=page_meta['priority'],
                 callback=self.parse_index,
                 meta=page_meta,
                 dont_filter=True)
         yield Request(index_url, method='GET', headers=self.header,
                       callback=self.parse_index,
                       meta={'page': 1, 'priority': 1},
                       dont_filter=True)
     except Exception as e:
         failure = f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}'
         self.log(failure, level=logging.ERROR)
         raise IntegrationException('爬取阻塞,请重启')
コード例 #5
0
 def start_requests(self):
     '''
     Resume the paged POST crawl from the page recorded in ``self.filePage``.

     The start page is read from ``self.filePage``; an empty or unreadable
     record falls back to page 0 (an unreadable one is logged). For every
     page from the start page up to, but not including, 105, the method sets
     ``self.data['pn']`` to ``page * 18``, serialises ``self.data`` as the
     JSON request body, rewrites ``self.pathPage`` with the page number and
     yields a POST request to ``self.targetUrl``.

     Yields:
         Request: POST request handled by ``self.parse_index``; ``meta``
         carries ``page`` and ``priority``.

     Raises:
         IntegrationException: if anything fails while preparing or
         yielding the requests. The original error is logged and chained.
     '''
     try:
         try:
             # Read exactly once. Calling read() a second time on the same
             # handle returns '' at EOF, which made int() fail and silently
             # reset the resume point to 0 on every run.
             recorded = self.filePage.read().strip()
             pageStart = int(recorded) if recorded else 0
         except Exception:
             pageStart = 0
             self.log(f'获取历史页错误: {traceback.format_exc()}', level=logging.ERROR)
         # range() is empty when pageStart >= 105, so no extra guard is needed.
         for page in range(pageStart, 105):
             self.data['pn'] = page * 18
             requests_data = json.dumps(self.data)
             # NOTE(review): 89 does not match the 105 upper bound, so pages
             # above 89 get a negative priority - confirm this is intended.
             priority = 89 - page
             # Persist the page being scheduled so a later run can resume.
             with open(self.pathPage, 'w+') as fp:
                 fp.write(str(page))
             yield Request(self.targetUrl, method='POST', headers=self.header, priority=priority, callback=self.parse_index,
                           meta={'page': page, 'priority': priority},
                           body=requests_data,
                           )
     except Exception as e:
         self.log(f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
         raise IntegrationException('爬取阻塞,请重启') from e
コード例 #6
0
 def start_requests(self):
     '''
     Yield a GET request for each of listing pages 1 through 7.

     Each URL is ``self.targetUrl`` formatted with the page number. The
     requests set ``dont_filter=True``, carry the page number in ``meta``
     and are handled by ``self.parse_index``.

     Raises:
         IntegrationException: after the error and its traceback are
         logged, if building or yielding a request fails.
     '''
     try:
         for page in range(1, 8):
             page_url = self.targetUrl.format(page)
             yield Request(
                 page_url,
                 method='GET',
                 headers=self.header,
                 callback=self.parse_index,
                 meta={'page': page},
                 dont_filter=True,
             )
     except Exception as e:
         failure = f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}'
         self.log(failure, level=logging.ERROR)
         raise IntegrationException('爬取阻塞,请重启')
コード例 #7
0
 def start_requests(self):
     '''
     Yield one form POST per listing page, earlier pages first by priority.

     Pages 1 through 110 are posted to ``self.targetUrl``. For each page the
     method logs the page number, writes it to ``self.filePage`` and yields
     a ``FormRequest`` whose ``currentPage`` field holds the page number;
     priority runs from 110 (page 1) down to 1 (page 110).

     Raises:
         IntegrationException: after the failing page and error are
         logged, if building or yielding a request fails.
     '''
     pages = 110
     # Fields sent with every page. 'currentPage' is a placeholder that is
     # filled in per request; it is listed here to keep the field order.
     form_fields = {
         'total_page': '110',
         'tatol': '1312',
         'currentPage': '',
         'pageSize': '12',
         'code': '0015-0001',
         'type': '0,1,4,5,6,7,9,11,99',
         'name': '',
         'area': '',
         'status': '',
         'currentSelectTime': '',
         'stopstatus': '',
         'suspendstatus': '',
     }
     try:
         for page in range(1, pages + 1):
             self.log('当前爬取页数{}'.format(page), level=logging.INFO)
             priority = pages + 1 - page
             self.filePage.write(str(page))
             yield FormRequest(
                 self.targetUrl,
                 method='POST',
                 formdata=dict(form_fields, currentPage=f'{page}'),
                 priority=priority,
                 callback=self.parse_index,
                 meta={'page': page, 'priority': priority},
                 dont_filter=True,
             )
     except Exception as e:
         self.log(f'当前爬取失败页数{page}, {datetime.datetime.now()}, 错误: {e}',
                  level=logging.ERROR)
         raise IntegrationException('爬取阻塞,请重启')
コード例 #8
0
 def start_requests(self):
     '''
     Yield a form POST to ``self.targetUrl`` for listing pages 1 and 2.

     The form carries hard-coded ``__VIEWSTATE`` / ``__EVENTVALIDATION``
     values, names ``AspNetPager1`` as the event target and passes the page
     number both as ``__EVENTARGUMENT`` and ``AspNetPager1_input``. Every
     response is handled by ``self.parse_index`` with the page number in
     ``meta``.

     NOTE(review): the view-state and validation blobs were presumably
     captured from a live page and may go stale - confirm they are still
     accepted by the server.

     Raises:
         IntegrationException: after the error and its traceback are
         logged, if building or yielding a request fails.
     '''
     try:
         for page in range(1, 3):
             # ``data`` is only an alias here; the rest of this method reads
             # ``requests_data``.
             requests_data = data = {
                 # Opaque hard-coded blob; it spans several physical lines,
                 # so the value contains embedded newlines.
                 '__VIEWSTATE':
                 '''/wEPDwULLTE0NDY1NDA3MTQPZBYCAgMPZBYKAgUPPCsACQEADxYEHghEYXRhS2V5cxYAHgtfIUl0ZW1Db3VudAIJZBYSZg9kFgwCAQ8PFgIeBFRleHQFBkNLNTMwMWRkAgMPDxYCHwIFDW5ld3NsaXN0LmFzcHhkZAIFDw8WAh4LTmF2aWdhdGVVcmwFF25ld3NsaXN0LmFzcHg/aWQ9Q0s1MzAxZBYCZg8VARLlnJ/lnLDliKnnlKjop4TliJJkAgcPD2QWAh4Fc3R5bGUFDWRpc3BsYXk6bm9uZTtkAggPFQIMZGlzcGxheTpub25lBGRpdjFkAgkPPCsACQEADxYEHwAWAB8BAv////8PZGQCAQ9kFgoCAQ8PFgIfAgUGQ0s1MzAzZGQCAw8PFgIfAgUMdHdvcGFnZS5hc3B4ZGQCBQ8PZBYEHgdvbmNsaWNrBQ1zaG93bmxpc3QoMik7HwQFC2N1cnNvcjpoYW5kFgJmDxUBFeWcn+WcsOS+m+W6lOWSjOWHuuiuqWQCCA8VAg1kaXNwbGF5OmJsb2NrBGRpdjJkAgkPPCsACQEADxYEHwAWAB8BAgRkFghmD2QWAmYPFQQNbmV3c2xpc3QuYXNweAhDSzUzMDMwMQRtYWluEuWcn+WcsOS+m+W6lOiuoeWIkmQCAQ9kFgJmDxUEDW5ld3NsaXN0LmFzcHgIQ0s1MzAzMDIEbWFpbiTlnJ/lnLDmi5vmoIfmi43ljZbmjILniYzlh7rorqnlhazlkYpkAgIPZBYCZg8VBA1uZXdzbGlzdC5hc3B4CENLNTMwMzAzBG1haW4S5Zyf5Zyw5Ye66K6p57uT5p6cZAIDD2QWAmYPFQQNbmV3c2xpc3QuYXNweAhDSzUzMDMwNARtYWluGOW7uuiuvueUqOWcsOaJueWHhuaWh+S7tmQCAg9kFgwCAQ8PFgIfAgUGQ0s1MzA5ZGQCAw8PFgIfAgUNbmV3c2xpc3QuYXNweGRkAgUPDxYCHwMFF25ld3NsaXN0LmFzcHg/aWQ9Q0s1MzA5ZBYCZg8VASflvoHlnLDnrqHnkIbmlL/nrZblkozkv6Hmga/lhazlvIDlubPlj7BkAgcPD2QWAh8EBQ1kaXNwbGF5Om5vbmU7ZAIIDxUCDGRpc3BsYXk6bm9uZQRkaXYzZAIJDzwrAAkBAA8WBB8AFgAfAQL/////D2RkAgMPZBYMAgEPDxYCHwIFBkNLNTMxMmRkAgMPDxYCHwIFDW5ld3NsaXN0LmFzcHhkZAIFDw8WAh8DBRduZXdzbGlzdC5hc3B4P2lkPUNLNTMxMmQWAmYPFQEe5b6B5Zyw5ZGK55+l5Lmm5ZKM5om55ZCO5YWs5ZGKZAIHDw9kFgIfBAUNZGlzcGxheTpub25lO2QCCA8VAgxkaXNwbGF5Om5vbmUEZGl2NGQCCQ88KwAJAQAPFgQfABYAHwEC/////w9kZAIED2QWDAIBDw8WAh8CBQZDSzUzMDVkZAIDDw8WAh8CBQ1uZXdzbGlzdC5hc3B4ZGQCBQ8PFgIfAwUXbmV3c2xpc3QuYXNweD9pZD1DSzUzMDVkFgJmDxUBJ+WcsOS7t+WKqOaAgeebkea1i+aVsOaNruWSjOWfuuWHhuWcsOS7t2QCBw8PZBYCHwQFDWRpc3BsYXk6bm9uZTtkAggPFQIMZGlzcGxheTpub25lBGRpdjVkAgkPPCsACQEADxYEHwAWAB8BAv////8PZGQCBQ9kFgwCAQ8PFgIfAgUGQ0s1MzA2ZGQCAw8PFgIfAgUNbmV3c2xpc3QuYXNweGRkAgUPDxYCHwMFF25ld3NsaXN0LmFzcHg/aWQ9Q0s1MzA2ZBYCZg8VARLpl7Lnva7lnJ/lnLDlpITnva5kAgcPD2QWAh8EBQ1kaXNwbGF5Om5vbmU7ZAIIDxUCDGRpc3BsYXk6bm9uZQRkaXY2ZAIJDzwrAAkBAA8WBB8AFgAfAQL/////
D2RkAgYPZBYKAgEPDxYCHwIFBkNLNTMwNGRkAgMPDxYCHwIFDHR3b3BhZ2UuYXNweGRkAgUPD2QWBB8FBQ1zaG93bmxpc3QoNyk7HwQFC2N1cnNvcjpoYW5kFgJmDxUBEuW+geWcsOWJjeacn+WHhuWkh2QCCA8VAgxkaXNwbGF5Om5vbmUEZGl2N2QCCQ88KwAJAQAPFgQfABYAHwECAmQWBGYPZBYCZg8VBA1uZXdzbGlzdC5hc3B4CENLNTMwNDAxBG1haW4e5ouf5b6B5pS25Zyf5Zyw5ZGK55+l5ZKM5ZCs6K+BZAIBD2QWAmYPFQQNbmV3c2xpc3QuYXNweAhDSzUzMDQwMgRtYWluG+aLn+W+geaUtuWcn+WcsOeOsOeKtuiwg+afpWQCBw9kFgoCAQ8PFgIfAgUGQ0s1MzA3ZGQCAw8PFgIfAgUMdHdvcGFnZS5hc3B4ZGQCBQ8PZBYEHwUFDXNob3dubGlzdCg4KTsfBAULY3Vyc29yOmhhbmQWAmYPFQES5b6B5Zyw5a6h5p+l5oql5om5ZAIIDxUCDGRpc3BsYXk6bm9uZQRkaXY4ZAIJDzwrAAkBAA8WBB8AFgAfAQICZBYEZg9kFgJmDxUEDW5ld3NsaXN0LmFzcHgIQ0s1MzA3MDEEbWFpbiflhpznlKjlnLDovaznlKjlnJ/lnLDlvoHmlLbmibnlh4bmlofku7ZkAgEPZBYCZg8VBA1uZXdzbGlzdC5hc3B4CENLNTMwNzAyBG1haW4S5b6B5pS25oql5om55p2Q5paZZAIID2QWCgIBDw8WAh8CBQZDSzUzMDJkZAIDDw8WAh8CBQx0d29wYWdlLmFzcHhkZAIFDw9kFgQfBQUNc2hvd25saXN0KDkpOx8EBQtjdXJzb3I6aGFuZBYCZg8VARLlvoHlnLDnu4Tnu4flrp7mlr1kAggPFQIMZGlzcGxheTpub25lBGRpdjlkAgkPPCsACQEADxYEHwAWAB8BAgJkFgRmD2QWAmYPFQQNbmV3c2xpc3QuYXNweAhDSzUzMDIwMQRtYWluEuW+geaUtuWcn+WcsOWFrOWRimQCAQ9kFgJmDxUEDW5ld3NsaXN0LmFzcHgIQ0s1MzAyMDIEbWFpbh7lvoHlnLDooaXlgb/lronnva7mlrnmoYjlhazlkYpkAgYPZBYCAgEPEA8WBh4NRGF0YVRleHRGaWVsZAUFY25hbWUeDkRhdGFWYWx1ZUZpZWxkBQRwa2lkHgtfIURhdGFCb3VuZGdkEBUYDOWKnuS6i+Wkp+WOhQzop4TliJLorrLloIIR5Yy65Y6/5bGAKOWIhuWxgCkM572R56uZ5L+h5oGvDOWfuuWxguWNleS9jR7op4TliJLpmaLnrKzlm5vmrKHkuqTpgJrosIPmn6UM5LqM57qn5py65p6ECeWbvueJh+WxlQznvZHkuIrkupLliqgS5bu66K6u5o+Q5qGI5Yqe55CGFeWkqea0peW4guagh+WHhuWcsOWbvgzmlL/liqHlhazlvIAM5bel5L2c5Yqo5oCBDOWcn+WcsOeuoeeQhgznn7/kuqfnrqHnkIYM5rW35rSL566h55CGDeael+S4mueuoeeQhiAM5pS/562W5rOV6KeEDOWfjuS5oeinhOWIkhjmiavpu5HpmaTmgbbkuJPpobnmlpfkuokt4oCc5LiN5b+Y5Yid5b+D44CB54mi6K6w5L2/5ZG94oCd5Li76aKY5pWZ6IKyD+S4jeWKqOS6p+eZu+iusA/np5HmioDkuI7mlofljJYG5YWo6YOoFRgEQ0swNgRDSzExBENLMDcEQ0sxNgRDSzEwBENLMTcEQ0syOARDSzI5BENLMDUEQ0swOQRDSzEyBENLNTAEQ0s1MQRDSzUzBENLNTQEQ0s1NQRDSzU2BENLNTgEQ0s1MgRDSzk5BENLOTgEQ0s1OQRDSzEzAkNLFCsDGGdnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2Rk
AgcPPCsACQEADxYEHwAWAB8BAgNkFgZmD2QWAmYPFQMMdHdvcGFnZS5hc3B4BENLNTMM5Zyf5Zyw566h55CGZAIBD2QWAmYPFQMMdHdvcGFnZS5hc3B4BkNLNTMwMxXlnJ/lnLDkvpvlupTlkozlh7rorqlkAgIPZBYCZg8VAw1uZXdzbGlzdC5hc3B4CENLNTMwMzAxEuWcn+WcsOS+m+W6lOiuoeWIkmQCCA88KwAJAQAPFgQfABYAHwECI2QWRmYPZBYCZg8VBBRuZXdzLmFzcHg/aWQ9MTAyMzUyNwAu5aSp5rSl5biCMjAyMOW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkgoyMDIwLTA0LTA3ZAIBD2QWAmYPFQQUbmV3cy5hc3B4P2lkPTEwMjE4MTcAMeays+S4nOWMujIwMjDlubTluqblm73mnInlu7rorr7nlKjlnLDkvpvlupTorqHliJIKMjAyMC0wMS0yMmQCAg9kFgJmDxUEFG5ld3MuYXNweD9pZD0xMDIxODE0ADTmtKXljZfljLoyMDIw5bm05Zu95pyJ5bu66K6+55So5Zyw5L6b5bqU6K6h5YiS5YWs56S6CjIwMjAtMDEtMjJkAgMPZBYCZg8VBBRuZXdzLmFzcHg/aWQ9MTAxNzI1MQA8MjAxOeW5tDHigJQ55pyI5Lu95YWo5Zu95oi/5Zyw5Lqn5byA5Y+R5oqV6LWE5ZKM6ZSA5ZSu5oOF5Ya1CjIwMTktMTAtMThkAgQPZBYCZg8VBBRuZXdzLmFzcHg/aWQ9MTAxMDAwOQAu5aSp5rSl5biCMjAxOeW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkgoyMDE5LTA1LTE1ZAIFD2QWAmYPFQQUbmV3cy5hc3B4P2lkPTEwMDc1ODEAPuWFs+S6juino+mZpOa0peiTn++8iOaMgu+8iTIwMTQtMDIy5Y+35Zyf5Zyw5Ye66K6p5ZCI5ZCM5YWs5ZGKCjIwMTktMDMtMTlkAgYPZBYCZg8VBBRuZXdzLmFzcHg/aWQ9MTAwNTY3NgAi5Lic5Li95Yy6MjAxOeW5tOWcn+WcsOS+m+W6lOiuoeWIkgoyMDE5LTAzLTAxZAIHD2QWAmYPFQQUbmV3cy5hc3B4P2lkPTEwMDAwMzYAJOWFqOWbveWFtuS7luWfjuW4guWcn+WcsOS+m+W6lOiuoeWIkgoyMDE5LTAyLTI0ZAIID2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMyMzc5NwBn5aSp5rSl5biCMjAxOOW5tOW6puS9j+WuheeUqOWcsOS+m+W6lOiuoeWIkuWPiuS4ieW5tOa7muWKqOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE4LTA1LTA4ZAIJD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMyMzUxMgBV5aSp5rSl5biCMjAxOOW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE4LTAzLTIyZAIKD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ3MABY5aSp5rSl5biCMjAxN+W5tOW6puWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE3LTExLTEzZAILD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2OQBe5aSp5rSl5biCMjAxN+W5tOWJjeS4ieWto+W6puaIv+WcsOS6p+eUqOWcsOS+m+W6lOaDheWGteOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE3LTEwLTEyZAIMD2QWAmYPFQQTbmV3cy5h
c3B4P2lkPTMwMDQ2OABb5aSp5rSl5biCMjAxN+W5tOS4iuWNiuW5tOaIv+WcsOS6p+eUqOWcsOS+m+W6lOaDheWGteOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE3LTA3LTEyZAIND2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2NwBZ5aSp5rSl5biCMjAxN+W5tDHlraPluqbmiL/lnLDkuqfnlKjlnLDkvpvlupTmg4XlhrXjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxNy0wNS0xN2QCDg9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NjYAVeWkqea0peW4gjIwMTflubTlm73mnInlu7rorr7nlKjlnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxNy0wMy0yN2QCDw9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NjUAWOWkqea0peW4gjIwMTblubTluqblm73mnInlu7rorr7nlKjlnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxNi0xMS0yOWQCEA9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NjQAVzIwMTctMjAxOeW5tOS9j+WuheeUqOWcsOS+m+W6lOS4ieW5tOa7muWKqOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE2LTEwLTI4ZAIRD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2MwBV5aSp5rSl5biCMjAxNuW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE2LTAzLTMwZAISD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2MgBe5aSp5rSl5biCMjAxNeW5tOW6puWFqOW4guWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE2LTAxLTA2ZAITD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2MQBV5aSp5rSl5biCMjAxNeW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE1LTAzLTMxZAIUD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ2MABM5aSp5rSl5biCMjAxNOW5tOW6pueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE0LTEyLTIyZAIVD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ1OQBV5aSp5rSl5biCMjAxNOW5tOWbveacieW7uuiuvueUqOWcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE0LTA0LTAzZAIWD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ1MwBM5aSp5rSl5biCMjAxMuW5tOW6puWcn+WcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDE0LTAxLTE3ZAIXD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ1NQBM5aSp5rSl5biCMjAxM+W5tOW6puWcn+WcsOS+m+W6lOiuoeWIkuOAkOi9rOiHquWOn+W4guWbveWcn+aI
v+euoeWxgOe9keermeOAkQoyMDE0LTAxLTE3ZAIYD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ1OABn5Z2a5Yaz6LSv5b275oi/5Zyw5Lqn6LCD5o6n5pS/562WIOWIh+WunuiQveWunuS9j+aIv+eUqOWcsOS+m+W6lOOAkOi9rOiHquWOn+W4guWbveWcn+aIv+euoeWxgOe9keermeOAkQoyMDEzLTEwLTI1ZAIZD2QWAmYPFQQTbmV3cy5hc3B4P2lkPTMwMDQ1NwB357un57ut5rex5YWl6LSv5b275Zu95Yqe5Y+RMTflj7fmlofku7bnsr7npZ7liIflrp7okL3lrp7ku4rlubTkvY/miL/nlKjlnLDkvpvlupTjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxMy0wNy0wNGQCGg9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NTYATDIwMTPlubTlhajlm73kvY/miL/nlKjlnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxMy0wNC0xNmQCGw9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NTQAmQHotK/lvbvmnY7lhYvlvLrlia/mgLvnkIblnKjlhajlm73kv53pmpzmgKflronlsYXlt6XnqIvlt6XkvZzkvJrorq7kuIrorrLor53nmoTnsr7npZ7vvIzokL3lrp7ku4rlubTkvY/miL/nlKjlnLDkvpvlupTjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZEuLi4KMjAxMi0wNy0wNmQCHA9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NTIATOWkqea0peW4gjIwMTHlubTluqblnJ/lnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxMi0wMS0zMWQCHQ9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NTAAYeWkqea0peW4gjIwMDnlubTnu4/okKXmgKfmiL/lnLDkuqflvIDlj5HnlKjlnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxMC0xMi0zMGQCHg9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NTEAYeWkqea0peW4gjIwMTDlubTnu4/okKXmgKfmiL/lnLDkuqflvIDlj5HnlKjlnLDkvpvlupTorqHliJLjgJDovazoh6rljp/luILlm73lnJ/miL/nrqHlsYDnvZHnq5njgJEKMjAxMC0xMi0zMGQCHw9kFgJmDxUEE25ld3MuYXNweD9pZD0zMDA0NDgAZTIwMDnlubTop4TliJLnjq/lpJbnjq/lhoXorqHliJLkvpvlupTnu4/okKXmgKflnLDlnZfmmI7nu4booagg44CQ6L2s6Ieq5Y6f5biC5Zu95Zyf5oi/566h5bGA572R56uZ44CRCjIwMDktMTItMjhkAiAPZBYCZg8VBBNuZXdzLmFzcHg/aWQ9MzAwNDQ5AFsyMDA55bm057uP6JCl5oCn5oi/5Zyw5Lqn5byA5Y+R55So5Zyw6K6h5YiS5a6J5o6S6KGo44CQ6L2s6Ieq5Y6f5biC5Zu95Zyf5oi/566h5bGA572R56uZ44CRCjIwMDktMTItMjhkAiEPZBYCZg8VBBNuZXdzLmFzcHg/aWQ9MzAwNDQ3AEkyMDA55bm05bel5Lia55So5Zyw6K6h5YiS5a6J5o6S6KGo44CQ6L2s6Ieq5Y6f5biC5Zu95Zyf5oi/566h5bGA572R56uZ44CRCjIwMDktMTItMjhkAiIPZBYCZg8VBBNuZXdzLmFzcHg/aWQ9MzAwNDQ2AFIyMDA55bm05L+d6Zqc
5oCn5L2P5oi/55So5Zyw6K6h5YiS5a6J5o6S6KGo44CQ6L2s6Ieq5Y6f5biC5Zu95Zyf5oi/566h5bGA572R56uZ44CRCjIwMDktMTItMjhkAgkPDxYEHhBDdXJyZW50UGFnZUluZGV4AgEeC1JlY29yZGNvdW50AidkZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WAgUMTGVmdDEkc2VhcmNoBQtMZWZ0MSRyZXNldNXI3qGhLoxzvr8eO1j2HeTugfIJ''',
                 '__VIEWSTATEGENERATOR': '14DD91A0',
                 # The pager control is the event target and the page number
                 # is its argument.
                 '__EVENTTARGET': 'AspNetPager1',
                 '__EVENTARGUMENT': str(page),
                 '__EVENTVALIDATION':
                 '/wEWIwLhw+20CAKdlKkkAuWJhPELAvCUxuwDAvK/saAHAoijlK8BAu2VlowLAoijqMQJAu2V+rQPAu2VglcC7ZWO6QcC0ozARQLSjNT6CwKIo4CKDgKIo5DABwLtlaqhAgKBsYu8CAKBsZ9RAoGxx6oCAoGx288KAoGx7+QNAoGxg5kEAoGx64UCAoGxs/YLAsXD4bsPAsXDzYYEAoGx/7oFAu2VvsYKAqLig7gGAuyjuaoGAujImYgNArXSuJUHAuOzj+oDApHMqaIMAvGOgsgIUNOB7crAAeirbo/qpKOPxUWSV5M=',
                 'pkid': 'CK530301',
                 'pkid2': '9',
                 'newskindid': 'CK530301',
                 'HiddenFieldPageFinished': '1',
                 'Left1$ddl_cname': 'CK',
                 'Left1$tb_search': '',
                 'Left1$rbl_site': 'title',
                 'AspNetPager1_input': str(page),
             }
             yield FormRequest(
                 self.targetUrl,
                 method='POST',
                 headers=self.header,
                 # priority=priority,
                 callback=self.parse_index,
                 meta={
                     'page': page,
                     # 'priority': priority
                 },
                 formdata=requests_data,
                 # headers={'Content-Type': 'application/json'},
                 # dont_filter=True
             )
     except Exception as e:
         # Log the error with its traceback, then raise the project exception.
         self.log(
             f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}',
             level=logging.ERROR)
         raise IntegrationException('爬取阻塞,请重启')
コード例 #9
0
 def start_requests(self):
     '''
     Yield one form POST per listing page (pages 1 through 16).

     Every request posts the same search form to ``self.targetUrl`` with
     only ``pageNum`` changing. Priority runs from 16 (page 1) down to 1
     (page 16); responses are handled by ``self.parse_index`` with ``page``
     and ``priority`` in ``meta``.

     Raises:
         IntegrationException: after the error and its traceback are
         logged, if building or yielding a request fails.
     '''
     # Fields shared by all pages. 'pageNum' is a placeholder that is filled
     # in per request; it is listed here to keep the field order.
     form_template = {
         'categoryId': '732',
         'typeId': '0',
         'pageNum': '',
         'pageSize': '10',
         'search': 'false',
         'Title': '',
         'StartTime': '',
         'EndTime': '',
         'area': '%E8%AF%B7%E9%80%89%E6%8B%A9',
     }
     try:
         for page in range(1, 17):
             priority = 17 - page
             yield FormRequest(
                 self.targetUrl,
                 method='POST',
                 headers=self.header,
                 priority=priority,
                 callback=self.parse_index,
                 meta={'page': page, 'priority': priority},
                 formdata=dict(form_template, pageNum=str(page)),
             )
     except Exception as e:
         failure = f'当前爬取页数失败, {datetime.datetime.now()}, 错误: {e}\n{traceback.format_exc()}'
         self.log(failure, level=logging.ERROR)
         raise IntegrationException('爬取阻塞,请重启')
コード例 #10
0
    def parse_detail(self, response):
        try:
            data = Selector(text=response.body.decode('utf-8'))
            noticeDetail = 'https://www.sz68.com' + data.xpath(
                '//iframe[@id="externalframe1"]/@src').extract_first(
                ) if data.xpath(
                    '//iframe[@id="externalframe1"]/@src').extract_first(
                    ) else 'https://www.sz68.com' + data.xpath(
                        '//iframe[@id="externalframe0"]/@src').extract_first()

            ZWBT = ''
            GGQ = ''
            GPKSSJ = ''
            GPJSSJ = ''
            ZDDM_DKZDBH = ''
            ZDH = ''
            DKWZ = ''
            DKYT = ''
            ZRHYLB = ''
            TDMJ = ''
            JZMJ = ''
            TDSYNX = ''
            TDFZXZ = ''
            RJL = ''
            GPQSJ = ''
            JMBZJ = ''
            TDSYNX = ''
            ZBJJZSJ = ''
            BMSJ = ''
            BMDD = ''
            DZ = ''
            DH = ''
            JYSJ = response.meta.get('JYSJ')
            JYZT = response.meta.get('JYZT')
            ZDH = response.meta.get('ZDH')
            TDWZ = response.meta.get('TDWZ')
            QSJ = response.meta.get('QSJ')
            TDYT = response.meta.get('TDYT')
            TDMJ = response.meta.get('TDMJ')
            JYFS = response.meta.get('JYFS')
            id = response.meta.get('id')
            # 公告详情
            detailData = requests.get(noticeDetail,
                                      headers=self.header,
                                      allow_redirects=False,
                                      timeout=60,
                                      verify=False)

            if detailData.status_code == 200:
                detail = Selector(text=detailData.content.decode('utf-8'))
                items = str(detail.xpath('string(.)').extract()[0]).replace(
                    '\xa0', '').replace('\u3000',
                                        '').replace('\n', '').replace(' ', '')
                # 正文标题
                ZWBT = ''.join(
                    detail.xpath(
                        '/html/body/div/p[2]/span//text() | /html/body/p[2]/span//text()|/html/body/p[1]/span//text()'
                    ).extract())
                # 公告期
                GGQ = reFunction('公告期自([\w \-\s]*)[止]?,', items)
                # 挂牌开始时间
                GPKSSJ = reFunction(
                    '挂牌期自(\d{4}年\d{1,2}月\d{1,2}日)[起]?至(?:\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时)止',
                    items)
                # 挂牌结束时间
                GPJSSJ = reFunction(
                    '挂牌期自(?:\d{4}年\d{1,2}月\d{1,2}日)[起]?至(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时)止',
                    items)
                # TODO 解析页面表格
                soup = BeautifulSoup(detailData.text)
                table = soup.find('body').find('div').find(
                    'table') if soup.find('body').find('div').find(
                        'table') else soup.find('table')

                htmlTable = htmlTableTransformer()
                tdData = htmlTable.table_tr_td(table)
                # 宗地代码 / 地块宗地编号
                ZDDM_DKZDBH = tdData.get('宗地编号') if tdData.get(
                    '宗地编号') else tdData.get('地块宗地编号')
                # 宗地号
                ZDH_A = tdData.get('宗地号')
                # 土地位置
                DKWZ = tdData.get('土地位置')
                # 土地用途
                DKYT = tdData.get('土地用途')
                # 准入行业类别
                ZRHYLB = tdData.get('准入行业类别')
                # 土地面积 / 土地面积(平方米)
                TDMJ_A = tdData.get('土地面积(平方米)') if tdData.get(
                    '土地面积(平方米)') else tdData.get('土地面积')
                # 建筑面积(平方米) / 总建筑面积
                JZMJ = tdData.get('建筑面积(平方米)') if tdData.get(
                    '建筑面积(平方米)') else tdData.get('总建筑面积')
                # 挂牌起始价(人民币万元)
                GPQSJ = tdData.get('挂牌起始价(人民币、万元)')
                # 竞买(投标)保证金(人民币万元)
                JMBZJ = tdData.get('竞买(投标)保证金(人民币、万元)')
                # 土地使用年限(年)
                TDSYNX = tdData.get('土地使用年期')

                if not detail.xpath('//table').extract():
                    # 宗地代码 / 地块宗地编号
                    ZDDM_DKZDBH = reFunction('宗地编号([\w \-\s]*),', items)
                    # 土地使用年期 / 土地使用年限  情况2 中的 土地使用年期
                    TDSYNX = reFunction('土地使用年[\s期限]*[为]?(\d*年)', items)
                    # 土地发展建设现状
                    TDFZXZ = reFunction('土地的发展建设现状:([\S\s]*。)', items)
                    # 容积率  容积率不大于1.518。
                    RJL = reFunction('容积率[\D]*([\.\d]*)。', items)
                    # 土地位置  宗地位于龙岗 中心城14号地,
                    DKWZ = reFunction('宗地位于([\w \s]*),', items)
                    # 土地用途
                    DKYT = reFunction('土地用途为([\w \s]*),', items)
                    # TODO 是否需要在解析一种页面  http://localhost:63342/IntegrationSpider/Logs/dwsw.html?_ijt=rfnsd28r0fb132e6i5qkd3db6f
                # 保证金截止时间
                ZBJJZSJ = reFunction(
                    '保证金的到账截止时间为(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时\d{1,2}分)',
                    items)
                # 地址  //匹配这些中文标点符号 。 ? ! , 、 ; :

                DZ = '|'.join(
                    re.findall('地址:([\w \.\-\s\/\%,\(\)。 \? \!  、:]*);咨询电话',
                               items))
                # 电话
                DH = '|'.join(
                    re.findall('咨询电话:([\w \.\-\s\/\%,\(\)。 \? \!  、]*)[;。]',
                               items))
            else:
                raise IntegrationException(f'获取公告详情失败, url: {noticeDetail}')

            # TODO 基本信息  完成
            itemsData = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            # 交易方式
            JYFS_A = data.xpath(
                '//div[@class="content_case1"]/div[1]/ul/li[2]/span/text()'
            ).extract_first()
            # 交易类型
            JYLX = data.xpath(
                '//div[@class="content_case1"]/div[1]/ul/li[1]/span/text()'
            ).extract_first()
            # 宗地
            ZD = data.xpath('//div[@class="content_case1"]/div[1]/div/text()'
                            ).extract_first()
            # 发布时间
            FBSJ = data.xpath(
                '//div[@class="content_case1"]/div[2]/span[2]/text()'
            ).extract_first()
            # 交易状态
            JYZT_A = data.xpath(
                '//div[@class="content_case1"]/div[2]/span[3]/text()'
            ).extract_first()
            # 中标人 / 竞得人
            ZBR_24 = data.xpath(
                '//div[@class="right_first"]/div[1]/div[2]/text()'
            ).extract_first()
            # 成交价(元)
            CJJ_25 = data.xpath(
                '//div[@class="right_first"]/div[2]/div[2]/text()'
            ).extract_first()
            # 保证金(元)
            BZJ_26 = data.xpath(
                '//div[@class="right_first twin"][1]/div[1]/div[2]/text()'
            ).extract_first()
            # 起始价(元)
            QSJ_A = data.xpath(
                '//div[@class="right_first twin"][1]/div[2]/div[2]/text()'
            ).extract_first()
            # 竞价阶梯(元)
            JJJT_28 = data.xpath(
                '//div[@class="right_first twin"][2]/div[1]/div[2]/text()'
            ).extract_first()
            # 封顶价(元)
            FDJ_29 = data.xpath(
                '//div[@class="right_first twin"][2]/div[2]/div[2]/text()'
            ).extract_first()
            # 竞买申请截止时间
            JMSQJZSJ_30 = data.xpath(
                '//div[@class="right_first twin"][3]/div[1]/div[2]/text()'
            ).extract_first()
            # 竞买人数
            JMRS_31 = data.xpath(
                '//div[@class="right_first twin"][3]/div[2]/div[2]/text()'
            ).extract_first()

            # TODO 标的详情  完成
            BDdetail = data.xpath(
                '//li[@class="weather_info_ul_item"]/div[2]/span')
            # 宗地号
            ZDH_B = BDdetail[0].xpath('text()').extract_first()
            # 土地面积
            TDMJ_B = BDdetail[1].xpath('text()').extract_first()
            # 建筑面积
            JZMJ_A = BDdetail[2].xpath('text()').extract_first()
            # 容积率
            RJL_A = BDdetail[3].xpath('text()').extract_first()
            # 建筑覆盖率
            JZFGL = BDdetail[4].xpath('text()').extract_first()
            # 建筑高度
            JZGD = BDdetail[5].xpath('text()').extract_first()
            # 用途
            YT = BDdetail[6].xpath('text()').extract_first()
            # 使用年限
            SYNX = BDdetail[7].xpath('text()').extract_first()
            # 区域
            QY = BDdetail[8].xpath('text()').extract_first()
            # 位置
            WZ = BDdetail[9].xpath('text()').extract_first()
            # 绿地率
            LDL = BDdetail[10].xpath('text()').extract_first()
            # 建筑楼层
            JZLC = BDdetail[11].xpath('text()').extract_first()

            # TODO 竞价记录 完成
            # 竞买人
            JMR = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[2]/text()'
            ).extract_first()
            # 竞买出价(元)
            JMSJ = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[3]/text()'
            ).extract_first()
            # 竞价时间
            CJSJ = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[4]/text()'
            ).extract_first()
            # 状态
            ZT = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[5]/text()'
            ).extract_first()

            # TODO 结果公示 完成
            results = requests.post(
                'https://www.sz68.com/tiaim/web/resultdetailbytargetId',
                headers=self.header,
                data={'targetId': id},
                allow_redirects=False,
                timeout=60,
                verify=False)
            if results.status_code == 200:
                resultsData = results.json()
                # 正文标题
                ZWBT_A = resultsData.get('notice').get('NAME')
                # 发布日期
                FBRQ = resultsData.get('notice').get('PUBLISH_TIME')
                # 宗地号
                ZDH_C = resultsData.get('notice').get('DTL_REF_NO')
                # 竞得人
                JDR = reFunction('竞得人:([\w \.\-\s\/\%,]*)<',
                                 resultsData.get('fileExtName'))
                # 中标人
                ZBR_A = reFunction('中标人:([\w \.\-\s\/\%,]*)<',
                                   resultsData.get('fileExtName'))
                # 位置
                WZ = reFunction('位置:([\w \.\-\s\/\%,、]*)<',
                                resultsData.get('fileExtName'))
                # 土地用途
                TDYT_A = reFunction('土地用途:([\w \.\-\s\/\%,、]*)<',
                                    resultsData.get('fileExtName'))
                # 土地面积
                TDMJ_C = reFunction('土地面积:([\w \.\-\s\/\%,、]*)<',
                                    resultsData.get('fileExtName'))
                # 建筑面积
                JZMJ_B = reFunction('建筑面积:([\w \.\-\s\/\%,、]*)<',
                                    resultsData.get('fileExtName'))
                # 起始价
                QSJ_D = reFunction('起始价:([\w \.\-\s\/\%,、]*)<',
                                   resultsData.get('fileExtName'))
                # 成交价
                CJJ_A = reFunction('成交价:([\w \.\-\s\/\%,、]*)<',
                                   resultsData.get('fileExtName'))
                # 溢价率
                YJL = reFunction('溢价率:([\w \.\-\s\/\%,、]*)<',
                                 resultsData.get('fileExtName'))
                # 综合楼面单价
                ZHLMDJ = reFunction('综合楼面单价:([\w \.\-\s\/\%,、]*)<',
                                    resultsData.get('fileExtName'))

            # TODO  附件  解析出让合同  完成
            accessory = '土地模块|'
            links = data.xpath('//div[@class="accessory_link"]/a')
            for link in links:
                fileName = link.xpath(
                    'text()[position()=((position() mod 2)=0)]'
                ).extract_first().strip() if link.xpath(
                    'text()[position()=((position() mod 2)=0)]').extract_first(
                    ) else '未知名称'
                try:
                    href = link.xpath('@href').extract_first()
                    linkPath = self.dirName + f'土地模块_{ZDH}' + fileName
                    response = requests.get(href,
                                            headers=self.header,
                                            timeout=200)

                    with open(linkPath, 'wb') as fp:
                        fp.write(response.content)
                except:
                    pass
                else:
                    accessory += fileName + '|'
            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # 爬取地址url
            url = response.url
            md5Mark = encrypt_md5(ZDH + WZ + ZWBT + url)
            csvFile = [
                JYSJ,
                JYZT,
                ZDH,
                TDWZ,
                QSJ,
                TDYT,
                TDMJ,
                JYFS_A,
                JYLX,
                ZD,
                FBSJ,
                JYZT_A,
                ZBR_24,
                CJJ_25,
                BZJ_26,
                QSJ_A,
                JJJT_28,
                FDJ_29,
                JMSQJZSJ_30,
                JMRS_31,
                ZWBT,
                GGQ,
                GPKSSJ,
                GPJSSJ,
                ZDDM_DKZDBH,
                ZDH_A,
                DKWZ,
                DKYT,
                ZRHYLB,
                TDMJ_A,
                JZMJ,
                TDSYNX,
                TDFZXZ,
                RJL,
                GPQSJ,
                JMBZJ,
                TDSYNX,
                ZBJJZSJ,
                DZ,
                DH,
                ZDH_B,
                TDMJ_B,
                JZMJ_B,
                RJL_A,
                JZFGL,
                JZGD,
                YT,
                SYNX,
                QY,
                WZ,
                LDL,
                JZLC,
                JMR,
                JMSJ,
                CJSJ,
                ZT,
                ZWBT_A,
                FBRQ,
                ZDH_C,
                JDR,
                ZBR_A,
                WZ,
                TDYT_A,
                TDMJ_C,
                JZMJ_B,
                QSJ_D,
                CJJ_A,
                YJL,
                ZHLMDJ,
                crawlingTime,
                url,
                md5Mark,
                accessory,
            ]
            fileData = []
            for _ in csvFile:
                try:
                    fileData.append(
                        _.replace(',', ' ').replace('\n',
                                                    '').replace('\r', ''))
                except:
                    fileData.append(str(_))
            self.fileDetail.write(','.join(fileData))
            self.fileDetail.write('\n')
        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)