def extract_bidding_infos_from_page(self, response):
    """Parse one list page of ccgp-jiangsu.gov.cn search results and yield a
    detail-page Request for each announcement within the time limit.

    Parameters
    ----------
    response : scrapy.http.Response
        List-page response; ``response.meta`` must carry ``key`` (search
        keyword) and ``page`` (page number).

    Yields
    ------
    scrapy.Request
        One request per announcement, handled by ``self.get_content``.
    """
    key = response.meta['key']
    page = response.meta['page']
    self.tml_logger.info("【bidding_jszfcg】关键词{}在第{}页,开始抽取列表页内容".format(key, page))
    datas = []
    for ul in response.xpath('//*[@id="newsList"]/ul'):
        for li in ul.xpath('./li'):
            title = li.xpath('./a//text()').extract_first()
            if title is None:
                # Fix: a <li> without an <a> used to crash on None.strip();
                # skip such rows instead.
                continue
            datas.append({
                'title': title.strip(),
                'href': li.xpath('./a/@href').extract_first(),
                # second text node of the <li> carries the publish date
                'create_time': li.xpath('text()').extract()[1].strip(),
            })
    for data in datas:
        url_time = data['create_time']
        url_title = data['title']
        # Rebuild an absolute URL from the site's relative href forms
        # ('./xxx' or '../xxx'); replacement chain kept exactly as before.
        detail_url = ('http://www.ccgp-jiangsu.gov.cn/'
                      + data['href'].replace('./', 'ggxx/gkzbgg/').replace('../', ''))
        self.tml_logger.info("【bidding_jszfcg】关键词{}在第{}页,请求网页详情url:{}".format(key, page, detail_url))
        url_time = match_data(url_time)
        # limit_days == 0 means "no age limit"; otherwise only announcements
        # not older than limit_days are requested. (Original duplicated the
        # yield in both branches; merged — short-circuit keeps compare()
        # uncalled when limit_days == 0, as before.)
        if self.limit_days == 0 or compare(url_time) <= self.limit_days:
            yield scrapy.Request(url=detail_url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': url_time,
                                     'url': detail_url,
                                     'title': url_title,
                                 })
def extract_bidding_infos_from_page(self, response):
    """Extract announcement rows from one ggzy.gov.cn JSON list page and
    yield a detail-page Request for every row that passes the configured
    time limit.

    ``response.meta`` must carry ``key`` (search keyword) and ``page``
    (page number); each yielded request is handled by ``self.get_content``.
    """
    key = response.meta['key']
    page = response.meta['page']
    rows = json.loads(response.text).get('data')
    self.tml_logger.info("【bidding_qgggzy】关键词{}在第{}页,开始抽取列表页内容".format(key, page))
    for row in rows:
        url_time = row['timeShow']
        url_title = row['title']
        source_url = row['url']
        # Rebuild the detail URL from path segments 7+ of the source URL;
        # every segment except the 5th keeps a trailing slash (unchanged
        # from the original index-counting loop).
        detail_url = 'http://www.ggzy.gov.cn/information/html/b/'
        for pos, segment in enumerate(source_url.split('/')[6:], start=1):
            detail_url += segment if pos == 5 else segment + '/'
        self.tml_logger.info("【bidding_qgggzy】关键词{}在第{}页,请求网页详情url:{}".format(key, page, detail_url))
        url_time = match_data(url_time)
        # limit_days == 0 disables the age filter; otherwise only rows not
        # older than limit_days are requested.
        if self.limit_days == 0 or compare(url_time) <= self.limit_days:
            yield scrapy.Request(url=detail_url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': url_time,
                                     'url': detail_url,
                                     'title': url_title,
                                 })
def extract_bidding_infos_from_page(self, response):
    """Parse one jiangsuzb JSON list page and yield a detail-page Request
    for each bulletin within the time limit.

    ``response.meta`` must carry ``key`` (one of the search keywords) and
    ``page``. The detail/true URL templates are chosen by keyword.

    Yields
    ------
    scrapy.Request
        One request per bulletin, handled by ``self.get_content``.
    """
    key = response.meta['key']
    page = response.meta['page']
    result = json.loads(response.body)
    datas = result['data']['data']
    self.tml_logger.info("【bidding_jiangsuzb】关键词{}在第{}页,开始抽取列表页内容".format(key, page))
    for data in datas:
        url_time = data['create_time']
        url_title = data['bulletinName']
        bulletin_id = data['id']  # renamed from `id` to avoid shadowing the builtin
        if key == '招标公告':
            detail_url = self.bidding_detail_url.format(bulletin_id)
            url = self.bidding_true_url.format(bulletin_id)
        elif key == '资格预审公告':
            detail_url = self.check_detail_url.format(bulletin_id)
            url = self.check_true_url.format(bulletin_id)
        else:
            # Fix: an unrecognized keyword previously left detail_url/url
            # unbound (NameError on the first row) or silently reused the
            # previous iteration's URLs. Skip such rows explicitly.
            continue
        self.tml_logger.info("【bidding_jiangsuzb】关键词{}在第{}页,请求网页详情url:{}".format(key, page, detail_url))
        url_time = match_data(url_time)
        # limit_days == 0 means "no age limit" (yield branches were
        # duplicated in the original; merged with a short-circuit).
        if self.limit_days == 0 or compare(url_time) <= self.limit_days:
            yield scrapy.Request(url=detail_url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': url_time,
                                     'url': url,
                                     'title': url_title,
                                 })
def extract_bidding_infos_from_page(self, response):
    """Parse one hndzzbtb (hndrc.gov.cn) JSON list page and yield a
    detail-page Request for each announcement within the time limit.

    The payload nests a JSON string under ``return``, which is decoded
    again to reach the ``Table`` row list. ``response.meta`` must carry
    ``key`` and ``page``.
    """
    key = response.meta['key']
    page = response.meta['page']
    result = json.loads(response.body)
    returns = result['return']
    result = json.loads(returns)  # 'return' holds a JSON-encoded string
    datas = result.get('Table')
    # Restored start-of-page log (was commented out) for consistency with
    # the sibling spiders.
    self.tml_logger.info("【bidding_hzdzzb】关键词{}在第{}页,开始抽取列表页内容".format(key, page))
    for data in datas:
        create_time = data['infodate']
        bulletinName = data['title']
        url = 'http://hndzzbtb.hndrc.gov.cn/' + data['href']
        self.tml_logger.info(
            "【bidding_hzdzzb】关键词{}在第{}页,请求网页详情url:{}".format(
                key, page, url))
        # Fix: the branches were inverted — a non-zero limit_days yielded
        # everything unfiltered, while limit_days == 0 filtered against
        # `<= 0`. Now consistent with the sibling spiders: limit_days == 0
        # means "no age limit", otherwise filter by age.
        # NOTE(review): siblings normalize the date via match_data() first;
        # this spider never did — presumably infodate is already clean.
        # Confirm before adding it.
        if self.limit_days == 0 or compare(create_time) <= self.limit_days:
            yield scrapy.Request(url=url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': create_time,
                                     'url': url,
                                     'title': bulletinName,
                                 })
def extract_bidding_infos_from_page(self, response):
    """Parse one lnwlzb.com JSON list page and yield a detail-page Request
    for each announcement within the time limit.

    The payload nests a JSON string under ``custom``, which is decoded
    again to reach the ``Table`` row list. ``response.meta`` must carry
    ``key`` and ``page``.
    """
    key = response.meta['key']
    page = response.meta['page']
    result = json.loads(response.body)
    custom = result['custom']
    result = json.loads(custom)  # 'custom' holds a JSON-encoded string
    datas = result.get('Table')
    self.tml_logger.info("【bidding_ccgp】关键词{}在第{}页,开始抽取列表页内容".format(key, page))
    for data in datas:
        create_time = data['date']
        bulletinName = data['title']
        url = data['infourl']
        # infourl may be site-relative; prefix the host when needed.
        if 'http' not in url:
            url = 'http://www.lnwlzb.com' + url
        self.tml_logger.info("【bidding_ccgp】关键词{}在第{}页,请求网页详情url:{}".format(key, page, url))
        # Fix: the branches were inverted — a non-zero limit_days yielded
        # everything unfiltered, while limit_days == 0 filtered against
        # `< 0`. Also normalized `<` to `<=` to match the sibling spiders:
        # limit_days == 0 means "no age limit", otherwise filter by age.
        if self.limit_days == 0 or compare(create_time) <= self.limit_days:
            yield scrapy.Request(url=url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': create_time,
                                     'url': url,
                                     'title': bulletinName,
                                 })
def extract_bidding_infos_from_page(self, response):
    """Parse one ztb.guizhou.gov.cn JSON list page and yield an API
    detail Request for each announcement within the time limit.

    The human-readable bulletin URL goes into ``meta['url']`` while the
    request itself targets the JSON API endpoint for the same id.
    ``response.meta`` must carry ``key`` and ``page``.
    """
    key = response.meta['key']
    page = response.meta['page']
    result = json.loads(response.body)
    datas = result.get('data')
    # Added start-of-page log for consistency with the sibling spiders.
    self.tml_logger.info("【bidding_gzzb】关键词{}在第{}页,开始抽取列表页内容".format(key, page))
    for data in datas:
        create_time = data['PubDate']
        bulletinName = data['Title']
        bulletin_id = data['Id']  # renamed from `id` to avoid shadowing the builtin
        url = 'http://ztb.guizhou.gov.cn/trade/bulletin/?id={}'.format(bulletin_id)
        self.tml_logger.info(
            "【bidding_gzzb】关键词{}在第{}页,请求网页详情url:{}".format(key, page, url))
        detail_url = 'http://ztb.guizhou.gov.cn/api/trade/{}'.format(bulletin_id)
        # Fix: the branches were inverted — a non-zero limit_days yielded
        # everything unfiltered, while limit_days == 0 filtered against
        # `<= 0`. Now consistent with the sibling spiders: limit_days == 0
        # means "no age limit", otherwise filter by age.
        if self.limit_days == 0 or compare(create_time) <= self.limit_days:
            yield scrapy.Request(url=detail_url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': create_time,
                                     'url': url,
                                     'title': bulletinName,
                                 })
def extract_bidding_infos_from_page(self, response):
    """Parse the gxzbtb HTML result table (one row per announcement) and
    yield a detail-page Request for each complete row within the time limit.

    Column 0 holds the title link (href wrapped in ``javascript:urlOpen('…')``),
    column 4 the publish date. ``response.meta`` must carry ``key`` and
    ``page``.
    """
    key = response.meta['key']
    page = response.meta['page']
    datas = []
    for tr in response.xpath('//tr'):
        data = {}
        for index, td in enumerate(tr.xpath('./td')):
            if index == 0:
                href = td.xpath('./a/@href').extract_first()
                title = td.xpath('./a/text()').extract_first()
                # Guard: cells without an <a> used to crash on None.strip().
                if href is not None:
                    data['href'] = href.replace(
                        "javascript:urlOpen('", '').replace("')", '')
                if title is not None:
                    data['title'] = title.strip()
            elif index == 4:
                created = td.xpath('./text()').extract_first()
                if created is not None:
                    data['create_time'] = created.strip()
        # Fix: original appended a dict for EVERY row, including empty and
        # partial ones; keep only fully-populated rows.
        if {'href', 'title', 'create_time'} <= data.keys():
            datas.append(data)
    self.tml_logger.info("【bidding_gxzbtb】关键词{}在第{}页,开始抽取列表页内容".format(
        key, page))
    for data in datas:
        # Fix: the original `if not bool(data):` was inverted — it processed
        # ONLY empty dicts (then raised KeyError) and skipped every real row;
        # it also read data['bulletinName'] although the parser stores 'title'.
        url_time = match_data(data['create_time'])
        url_title = data['title']
        detail_url = data['href']
        self.tml_logger.info(
            "【bidding_gxzbtb】关键词{}在第{}页,请求网页详情url:{}".format(
                key, page, detail_url))
        # limit_days == 0 means "no age limit"; otherwise filter by age.
        if self.limit_days == 0 or compare(url_time) <= self.limit_days:
            yield scrapy.Request(url=detail_url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': url_time,
                                     'url': detail_url,
                                     'title': url_title,
                                 })