Beispiel #1
0
    def extract_bidding_infos_from_page(self, response):
        """Parse the jszfcg list page and yield detail-page requests.

        Walks every ``li`` under ``#newsList`` ``ul`` elements, collecting
        title / href / publish time per entry, then emits one
        ``scrapy.Request`` per entry. When ``self.limit_days`` is non-zero,
        entries older than the limit are skipped; zero means no age limit.
        """
        key = response.meta['key']
        page = response.meta['page']
        self.tml_logger.info("【bidding_jszfcg】关键词{}在第{}页,开始抽取列表页内容".format(key, page))

        # Collect one record per <li> in every #newsList <ul>.
        records = []
        for item_list in response.xpath('//*[@id="newsList"]/ul'):
            for entry in item_list.xpath('./li'):
                records.append({
                    'title': entry.xpath('./a//text()').extract_first().strip(),
                    'href': entry.xpath('./a/@href').extract_first(),
                    # The second bare text node of the <li> holds the date.
                    'create_time': entry.xpath('text()').extract()[1].strip(),
                })

        for record in records:
            url_title = record['title']
            # Turn the relative href into an absolute detail-page URL.
            relative_path = record['href'].replace('./', 'ggxx/gkzbgg/').replace('../', '')
            detail_url = 'http://www.ccgp-jiangsu.gov.cn/' + relative_path
            self.tml_logger.info("【bidding_jszfcg】关键词{}在第{}页,请求网页详情url:{}".format(key, page, detail_url))
            # Normalize the raw date string before comparing against the limit.
            url_time = match_data(record['create_time'])
            # limit_days == 0 means "no age limit": yield everything.
            if self.limit_days != 0 and compare(url_time) > self.limit_days:
                continue
            yield scrapy.Request(url=detail_url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': url_time,
                                     'url': detail_url,
                                     'title': url_title,
                                 })
Beispiel #2
0
    def extract_bidding_infos_from_page(self, response):
        """Parse the qgggzy JSON list response and yield detail requests.

        Rebuilds each detail URL from the API-provided path segments and
        emits one ``scrapy.Request`` per entry. When ``self.limit_days`` is
        non-zero, entries older than the limit are skipped; zero means no
        age limit.
        """
        key = response.meta['key']
        page = response.meta['page']
        entries = json.loads(response.text).get('data')
        self.tml_logger.info("【bidding_qgggzy】关键词{}在第{}页,开始抽取列表页内容".format(key, page))
        for entry in entries:
            url_title = entry['title']
            # Rebuild the detail URL from the 7th path segment onward; only
            # the 5th rebuilt segment is appended without a trailing '/'.
            detail_url = 'http://www.ggzy.gov.cn/information/html/b/'
            for position, segment in enumerate(entry['url'].split('/')[6:], start=1):
                if position == 5:
                    detail_url += segment
                else:
                    detail_url += segment + '/'
            self.tml_logger.info("【bidding_qgggzy】关键词{}在第{}页,请求网页详情url:{}".format(key, page, detail_url))
            # Normalize the raw date string before comparing against the limit.
            url_time = match_data(entry['timeShow'])
            # limit_days == 0 means "no age limit": yield everything.
            if self.limit_days != 0 and compare(url_time) > self.limit_days:
                continue
            yield scrapy.Request(url=detail_url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': url_time,
                                     'url': detail_url,
                                     'title': url_title,
                                 })
Beispiel #3
0
    def extract_bidding_infos_from_page(self, response):
        """Parse the jiangsuzb JSON list response and yield detail requests.

        Selects the detail/true URL templates by announcement type (``key``)
        and emits one ``scrapy.Request`` per entry. When ``self.limit_days``
        is non-zero, entries older than the limit are skipped; zero means
        no age limit.
        """
        key = response.meta['key']
        page = response.meta['page']
        result = json.loads(response.body)
        datas = result['data']['data']
        self.tml_logger.info("【bidding_jiangsuzb】关键词{}在第{}页,开始抽取列表页内容".format(key, page))
        for data in datas:
            url_time = data['create_time']
            url_title = data['bulletinName']
            bulletin_id = data['id']
            if key == '招标公告':
                detail_url = self.bidding_detail_url.format(bulletin_id)
                url = self.bidding_true_url.format(bulletin_id)
            elif key == '资格预审公告':
                detail_url = self.check_detail_url.format(bulletin_id)
                url = self.check_true_url.format(bulletin_id)
            else:
                # Bug fix: an unrecognized key previously left detail_url/url
                # unbound (NameError on the first entry) or silently reused
                # the previous entry's URLs. Skip such entries instead.
                continue
            self.tml_logger.info("【bidding_jiangsuzb】关键词{}在第{}页,请求网页详情url:{}".format(key, page, detail_url))
            # Normalize the raw date string before comparing against the limit.
            url_time = match_data(url_time)
            # limit_days == 0 means "no age limit": yield everything.
            if self.limit_days != 0 and compare(url_time) > self.limit_days:
                continue
            yield scrapy.Request(url=detail_url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': url_time,
                                     'url': url,
                                     'title': url_title,
                                 })
Beispiel #4
0
    def extract_bidding_infos_from_page(self, response):
        """Parse the hzdzzb JSON list response and yield detail requests.

        The API wraps a JSON string inside the ``return`` field, so the
        payload is decoded twice. One ``scrapy.Request`` is emitted per
        entry; when ``self.limit_days`` is non-zero, entries older than the
        limit are skipped (zero means no age limit).
        """
        key = response.meta['key']
        page = response.meta['page']
        result = json.loads(response.body)
        # 'return' holds a JSON string that must be decoded a second time.
        inner = json.loads(result['return'])
        datas = inner.get('Table')
        for data in datas:
            create_time = data['infodate']
            bulletin_name = data['title']
            url = 'http://hndzzbtb.hndrc.gov.cn/' + data['href']
            self.tml_logger.info(
                "【bidding_hzdzzb】关键词{}在第{}页,请求网页详情url:{}".format(
                    key, page, url))
            # Bug fix: the branches were inverted — a non-zero limit_days
            # yielded everything while limit_days == 0 filtered against 0.
            # Now limit_days == 0 means "no age limit", matching the other
            # spiders' extract_bidding_infos_from_page implementations.
            if self.limit_days != 0 and compare(create_time) > self.limit_days:
                continue
            yield scrapy.Request(url=url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': create_time,
                                     'url': url,
                                     'title': bulletin_name,
                                 })
Beispiel #5
0
    def extract_bidding_infos_from_page(self, response):
        """Parse the ccgp/lnwlzb JSON list response and yield detail requests.

        The API wraps a JSON string inside the ``custom`` field, so the
        payload is decoded twice. One ``scrapy.Request`` is emitted per
        entry; when ``self.limit_days`` is non-zero, entries older than the
        limit are skipped (zero means no age limit).
        """
        key = response.meta['key']
        page = response.meta['page']
        result = json.loads(response.body)
        # 'custom' holds a JSON string that must be decoded a second time.
        inner = json.loads(result['custom'])
        datas = inner.get('Table')
        self.tml_logger.info("【bidding_ccgp】关键词{}在第{}页,开始抽取列表页内容".format(key, page))
        for data in datas:
            create_time = data['date']
            bulletin_name = data['title']
            url = data['infourl']
            # Relative hrefs need the site prefix.
            if 'http' not in url:
                url = 'http://www.lnwlzb.com' + url
            self.tml_logger.info("【bidding_ccgp】关键词{}在第{}页,请求网页详情url:{}".format(key, page, url))
            # Bug fix: the branches were inverted — a non-zero limit_days
            # yielded everything while limit_days == 0 filtered against 0
            # (and used '<' where the sibling spiders use '<='). Now
            # limit_days == 0 means "no age limit", matching the others.
            if self.limit_days != 0 and compare(create_time) > self.limit_days:
                continue
            yield scrapy.Request(url=url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': create_time,
                                     'url': url,
                                     'title': bulletin_name,
                                 })
Beispiel #6
0
    def extract_bidding_infos_from_page(self, response):
        """Parse the gzzb JSON list response and yield detail requests.

        Builds both the user-facing bulletin URL (stored in meta) and the
        API detail URL (actually requested) from the entry id. When
        ``self.limit_days`` is non-zero, entries older than the limit are
        skipped (zero means no age limit).
        """
        key = response.meta['key']
        page = response.meta['page']
        result = json.loads(response.body)
        datas = result.get('data')
        for data in datas:
            create_time = data['PubDate']
            bulletin_name = data['Title']
            bulletin_id = data['Id']
            # Human-readable page URL (kept in meta) vs. the JSON API URL
            # that is actually fetched for the content.
            url = 'http://ztb.guizhou.gov.cn/trade/bulletin/?id={}'.format(bulletin_id)
            self.tml_logger.info(
                "【bidding_gzzb】关键词{}在第{}页,请求网页详情url:{}".format(key, page, url))
            detail_url = 'http://ztb.guizhou.gov.cn/api/trade/{}'.format(bulletin_id)
            # Bug fix: the branches were inverted — a non-zero limit_days
            # yielded everything while limit_days == 0 filtered against 0.
            # Now limit_days == 0 means "no age limit", matching the other
            # spiders' extract_bidding_infos_from_page implementations.
            if self.limit_days != 0 and compare(create_time) > self.limit_days:
                continue
            yield scrapy.Request(url=detail_url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': create_time,
                                     'url': url,
                                     'title': bulletin_name,
                                 })
Beispiel #7
0
    def extract_bidding_infos_from_page(self, response):
        """Parse the gxzbtb HTML table rows and yield detail requests.

        For each ``<tr>``, the first ``<td>`` supplies the detail href and
        title and the fifth supplies the publish date. One
        ``scrapy.Request`` is emitted per complete row; when
        ``self.limit_days`` is non-zero, entries older than the limit are
        skipped (zero means no age limit).
        """
        key = response.meta['key']
        page = response.meta['page']
        datas = []
        for trs in response.xpath('//tr'):
            data = {}
            for index, td in enumerate(trs.xpath('./td')):
                if index == 0:
                    # The href is wrapped in a javascript call; strip it.
                    data['href'] = td.xpath(
                        './a/@href').extract_first().replace(
                            "javascript:urlOpen('", '').replace("')", '')
                    data['title'] = td.xpath(
                        './a/text()').extract_first().strip()
                if index == 4:
                    data['create_time'] = td.xpath(
                        './text()').extract_first().strip()
            datas.append(data)

        self.tml_logger.info("【bidding_gxzbtb】关键词{}在第{}页,开始抽取列表页内容".format(
            key, page))
        for data in datas:
            # Bug fix: the emptiness test was inverted ("if not bool(data)"),
            # so populated rows were skipped and empty ones raised KeyError.
            # Require both fields so header/partial rows are ignored.
            if 'href' not in data or 'create_time' not in data:
                continue
            url_time = data['create_time']
            # Bug fix: the collector stores the title under 'title';
            # 'bulletinName' was never set and always raised KeyError.
            url_title = data['title']
            detail_url = data['href']
            self.tml_logger.info(
                "【bidding_gxzbtb】关键词{}在第{}页,请求网页详情url:{}".format(
                    key, page, detail_url))
            # Normalize the raw date string before comparing against the limit.
            url_time = match_data(url_time)
            # limit_days == 0 means "no age limit": yield everything.
            if self.limit_days != 0 and compare(url_time) > self.limit_days:
                continue
            yield scrapy.Request(url=detail_url,
                                 callback=self.get_content,
                                 headers=self.headers,
                                 dont_filter=True,
                                 meta={
                                     'key': key,
                                     'page': page,
                                     'url_time': url_time,
                                     'url': detail_url,
                                     'title': url_title,
                                 })