Example #1
    def parse(self, response):
        items = response.meta['items']
        # The last tr is the pagination bar, so drop it
        all_trs = response.xpath('//div[@class="ewb-comp-bd"]//table//tr')[:-1]

        for each_tr in all_trs:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = each_tr.xpath('./td[1]/a/@title').extract_first()
            except:
                pass

            try:
                items['url'] = self.govPurchase_baseUrl + each_tr.xpath('./td[1]/a/@href').extract_first()
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    # send the "spider paused" alert before exiting; nothing after quit() would run
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                items['web_time'] = each_tr.xpath('./td[2]/text()').extract_first().strip()
            except:
                pass

            yield scrapy.Request(url = items['url'], callback = self.parse_article, meta = {'items' : deepcopy(items)}, headers = self.headers)
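
All of these examples call a project-level send_mail_when_error helper that is not part of the listing. A minimal sketch of what such a helper might look like, using only the standard library; the SMTP host, sender, and recipient below are placeholders, not values from the original project:

import smtplib
from email.mime.text import MIMEText

def send_mail_when_error(msg):
    # Best-effort alert mail: build a plain-text message and send it,
    # swallowing any SMTP failure so the spider itself keeps running.
    mail = MIMEText(msg, 'plain', 'utf-8')
    mail['Subject'] = 'spider error alert'
    mail['From'] = 'spider@example.com'   # placeholder sender
    mail['To'] = 'ops@example.com'        # placeholder recipient
    try:
        with smtplib.SMTP('smtp.example.com', 25, timeout=10) as server:
            server.send_message(mail)
    except Exception:
        pass
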
Example #2
    def parse(self):
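        # Unlike the other examples, this parse() takes no response: it walks a
        # pre-fetched self.infos selector list and yields the shared self.items
        # dict directly (note: without the deepcopy used elsewhere).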
        for each_li in self.infos:
            self.items['title'] = ''
            self.items['url'] = ''
            self.items['web_time'] = ''
            self.items['intro'] = ''
            self.items['addr_id'] = ''

            try:
                self.items['title'] = each_li.xpath(
                    self.title_rule).extract_first().strip()
            except:
                pass

            try:
                self.items['url'] = self.base_url + each_li.xpath(
                    self.url_rule).extract_first()
            except:
                msg = self.spider_name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.spider_name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                self.items['web_time'] = each_li.xpath(
                    self.web_time_rule).extract_first().strip()
            except:
                pass

            yield self.items
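
Once error_count passes 3, all of these snippets stop the whole interpreter with quit(). In a Scrapy callback the more conventional way to abort only the crawl is to raise CloseSpider; a rough sketch of the same check written that way (handle_url_error is a hypothetical helper, not part of the original code):

from scrapy.exceptions import CloseSpider

def handle_url_error(spider, msg):
    # Mail the alert, count the failure, and close the spider (instead of quit())
    # once too many detail-page URLs could not be extracted.
    send_mail_when_error(msg)
    spider.error_count += 1
    if spider.error_count > 3:
        send_mail_when_error(spider.name + ', 该爬虫因详情页获取失败被暂停')
        raise CloseSpider('too many failed detail-page URLs')
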
Example #3
    def parse(self, response):
        items = response.meta['items']

        infos = response.xpath('//div[@class="List2"]/ul/li')

        for each_li in infos:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = each_li.xpath(
                    self.xpath_rule['title_rule']).extract_first().strip()
            except:
                pass

            try:
                items['url'] = self.baseUrl + each_li.xpath(
                    self.xpath_rule['url_rule']).extract_first()
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                items['web_time'] = each_li.xpath(
                    self.xpath_rule['web_time_rule']).extract_first().strip()
            except:
                pass

            # The year, month and article id are needed to build the real article URL
            year, month = items['web_time'].split('-')[:2]
            article_id = re.search(r'infoId=(.*?)&', items['url'],
                                   re.S).group(1)

            if items['type_id'] == '38255':
                article_url = self.bidNotice_url.format(
                    year, month, article_id)
            elif items['type_id'] == '38257':
                article_url = self.bidResult_url.format(
                    year, month, article_id)
            else:
                article_url = self.modifyResult_url.format(
                    year, month, article_id)

            yield scrapy.Request(article_url,
                                 callback=self.parse_article,
                                 headers=self.headers,
                                 meta={'items': deepcopy(items)})
Example #4
    def parse(self, response):
        VIEWSTATE = re.search(r'value="(.*?)"', response.text, re.S).group(1)
        # print(VIEWSTATE)

        infos = response.xpath('//table[@class="wb-data-item"]//tr')

        for each_li in infos:
            items = {}
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = each_li.xpath(self.xpath_rule['title_rule']).extract_first().strip()
            except:
                pass

            try:
                items['url'] = self.baseUrl + each_li.xpath(self.xpath_rule['url_rule']).extract_first()
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                items['web_time'] = each_li.xpath(self.xpath_rule['web_time_rule']).extract_first().strip()
            except:
                pass
            # print(items['title'])

            yield scrapy.Request(url = items['url'], callback = self.parse_article, headers = self.headers, meta = {'items' : deepcopy(items)})

        # 1443 pages in total (only the first few are requested here)
        if self.count < 3:
            self.count += 1

            form_data = {
                '__VIEWSTATE' : VIEWSTATE,
                '__VIEWSTATEGENERATOR' : 'D38D4441',
                '__EVENTTARGET' : 'JyxxSearch1$Pager',
                '__EVENTARGUMENT' : str(self.count),
                'JyxxSearch1$Pager_input' : '1',
                '__VIEWSTATEENCRYPTED' : ''
            }
            yield scrapy.FormRequest(url = self.gov_bidNotice_url, callback = self.parse, dont_filter = True, formdata = form_data, headers = self.headers)
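
The regex at the top of this example simply grabs the first value="..." attribute on the page, which happens to be the __VIEWSTATE hidden field on this ASP.NET site. A slightly more explicit sketch would pick the hidden inputs by name (the same names used in form_data above):

        viewstate = response.xpath('//input[@name="__VIEWSTATE"]/@value').extract_first()
        generator = response.xpath('//input[@name="__VIEWSTATEGENERATOR"]/@value').extract_first()
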
Example #5
    def parse(self, response):
        items = response.meta['items']

        infos = response.xpath(self.xpath_rule['list_page'])

        for each_li in infos[1:]:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            # The cases are split this finely because every tr layout differs for construction projects (jsgc*), while government procurement (zfcg) uses a single layout
            try:
                if 'jsgcZbgg' in response.url or 'zfcg' in response.url:
                    items['title'] = ''.join(each_li.xpath(self.xpath_rule['title_rule']).extract()).strip()
                elif 'jsgcBgtz' in response.url:
                    items['title'] = ''.join(each_li.xpath('./td[4]/a/@title').extract()).strip()
                else:
                    items['title'] = ''.join(each_li.xpath('./td[3]/@title').extract()).strip()
            except:
                pass

            try:
                if 'jsgcZbgg' in response.url or 'jsgcZbjggs' in response.url or 'zfcg' in response.url:
                    items['url'] = self.baseUrl + each_li.xpath(self.xpath_rule['url_rule']).extract_first()
                else:
                    items['url'] = self.baseUrl + ''.join(each_li.xpath('./td[4]/a/@href').extract()).strip()
                if items['url'] is None:
                    raise Exception
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                if 'jsgcZbgg' in response.url or 'jsgcZbjggs' in response.url or 'zfcg' in response.url:
                    items['web_time'] = each_li.xpath(self.xpath_rule['web_time_rule']).extract_first().strip()
                else:
                    items['web_time'] = ''.join(each_li.xpath('./td[5]/text()').extract()).strip()
            except:
                pass
            yield scrapy.Request(items['url'], callback = self.parse_article, headers = self.headers, meta = {'items' : deepcopy(items)})
Example #6
    def parse(self, response):
        items = response.meta['items']
        # Get all li tags containing the bid notices
        all_lis = response.xpath('//form[@id="moderate"]/li')

        for each_li in all_lis:

            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = each_li.xpath(
                    './a/span/text()').extract_first().strip()
            except:
                pass

            try:
                items['url'] = self.article_url + each_li.xpath(
                    './a/@href').extract_first()
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                items['web_time'] = each_li.xpath(
                    './/em/text()').extract_first().split(' ')[0].strip()
            except:
                pass

            for city in self.city_dict:
                if city in items['title']:
                    items['addr_id'] = self.city_dict[city]
                    break

            yield scrapy.Request(url=items['url'],
                                 callback=self.article_parse,
                                 meta={'items': deepcopy(items)})
Example #7
    def parse(self, response):
        items = response.meta['items']

        # Get all li tags containing the bid notices
        all_lis = response.xpath(self.xpath_rule['list_page'])

        for each_li in all_lis:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = each_li.xpath(
                    self.xpath_rule['title_rule']).extract_first()
            except:
                pass

            try:
                if items['type_id'] == '38255':
                    items['url'] = self.bidNotice_baseUrl + each_li.xpath(
                        self.xpath_rule['url_rule']).extract_first()[1:]
                else:
                    items['url'] = self.resultNotice_baseUrl + each_li.xpath(
                        self.xpath_rule['url_rule']).extract_first()[1:]
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                items['web_time'] = each_li.xpath(
                    self.xpath_rule['web_time_rule']).extract_first()
            except:
                pass

            yield scrapy.Request(items['url'],
                                 callback=self.parse_article,
                                 headers=self.headers,
                                 meta={'items': deepcopy(items)})
Example #8
    def parse(self, response):
        items = response.meta['items']
        # Get all li tags containing the notices
        all_lis = response.xpath('//div[@class="infor-con2 on"]//li')

        for each_li in all_lis:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = each_li.xpath('./a/@title').extract_first()
            except:
                pass

            try:
                # Some extracted URLs already carry the scheme and some do not, so check before prepending the base URL
                items['url'] = each_li.xpath('./a/@href').extract_first()
                if 'http' not in items['url']:
                    items['url'] = self.govPurchase_baseUrl + items['url']
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                items['web_time'] = each_li.xpath(
                    './span/text()').extract_first()
            except:
                pass

            # skip direct links to document/image attachments
            if not any(ext in items['url'] for ext in ('.doc', '.docx', '.rar', '.jpg')):
                yield scrapy.Request(items['url'],
                                     callback=self.parse_article,
                                     meta={'items': deepcopy(items)},
                                     headers=self.headers)
Example #9
    def parse(self, response):
        items = response.meta['items']

        infos = response.xpath(self.xpath_rule['list_page'])

        for each_li in infos:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = ''.join(
                    each_li.xpath(
                        self.xpath_rule['title_rule']).extract()).strip()
            except:
                pass

            try:
                items['url'] = self.baseUrl + each_li.xpath(
                    self.xpath_rule['url_rule']).extract_first()
                if items['url'] is None:
                    raise Exception
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                page_date = each_li.xpath(
                    self.xpath_rule['web_time_rule']).extract_first().strip()
                items['web_time'] = self.switch_date(page_date)
            except:
                pass
            yield scrapy.Request(items['url'],
                                 callback=self.parse_article,
                                 headers=self.headers,
                                 meta={'items': deepcopy(items)})
Example #10
    def parse(self, response):
        items = response.meta['items']

        infos = response.xpath(self.xpath_rule['list_page'])

        for each_li in infos:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = ''.join(each_li.xpath(self.xpath_rule['title_rule']).extract()).strip()
            except:
                pass

            try:
                get_url = self.baseUrl + each_li.xpath(self.xpath_rule['url_rule']).extract_first()
                get_prefix = re.search(r'(http://ggzy.xzsp.tj.gov.cn:80/jyxx.*?/)', get_url, re.S).group(1)
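                # aes.url_encrypt is a project-specific helper (not shown in this listing); presumably it rewrites the URL into the encrypted form this site expects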
                dirty_url = aes.url_encrypt(get_url)
                suffix_url = re.search(r'http://ggzy.xzsp.tj.gov.cn:80/jyxx.*?/(.*)', dirty_url, re.S).group(1)
                if '/' in suffix_url:
                    items['url'] = get_prefix + suffix_url.replace('/', '%5E')
                else:
                    items['url'] = dirty_url

                if items['url'] is None:
                    raise Exception
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                items['web_time'] = each_li.xpath(self.xpath_rule['web_time_rule']).extract_first().strip()
            except:
                pass
            yield scrapy.Request(items['url'], callback = self.parse_article, headers = self.headers, meta = {'items' : deepcopy(items)})
Example #11
    def parse(self, response):
        items = response.meta['items']

        if '071008' not in response.url:
            infos = response.xpath('//table[@class="ewb-trade-tb"]//tr')[1:]
        else:
            infos = response.xpath('//ul[@class="wb-data-item"]/li')
            self.xpath_rule = self.health_xpath

        for each_li in infos:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = each_li.xpath(self.xpath_rule['title_rule']).extract_first().strip()
            except:
                pass

            try:
                items['url'] = self.baseUrl + each_li.xpath(self.xpath_rule['url_rule']).extract_first()
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                items['web_time'] = each_li.xpath(self.xpath_rule['web_time_rule']).extract_first().strip()
            except:
                pass
            yield scrapy.Request(items['url'], callback = self.parse_article, headers = self.headers, meta = {'items' : deepcopy(items)})
Example #12
    def parse(self, response):

        items = response.meta['items']
        # Get all li tags containing the bid notices
        all_lis = response.xpath('//div[@class="serviceMsg"]//ul/li')

        for each_li in all_lis:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = each_li.xpath('./a/text()').extract_first()
            except:
                pass

            try:
                items['url'] = self.base_url + each_li.xpath(
                    './a/@href').extract_first()
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                # Date comes in the form (2017-05-31); strip the parentheses
                dirty_time = each_li.xpath('.//span/text()').extract_first()
                items['web_time'] = re.sub(r'\(|\)', '', dirty_time).strip()
            except:
                pass

            yield scrapy.Request(items['url'],
                                 callback=self.parse_article,
                                 meta={'items': deepcopy(items)},
                                 headers=self.headers)
Example #13
    def parse(self, response):
        items = response.meta['items']

        infos = response.xpath('//div[@id="jt"]/ul/li')

        for each_li in infos:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = each_li.xpath(
                    self.xpath_rule['title_rule']).extract_first().strip()
            except:
                pass

            try:
                items['url'] = self.baseUrl + each_li.xpath(
                    self.xpath_rule['url_rule']).extract_first()
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                items['web_time'] = each_li.xpath(
                    self.xpath_rule['web_time_rule']).extract_first().strip()
            except:
                pass

            yield scrapy.Request(items['url'],
                                 callback=self.parse_article,
                                 headers=self.headers,
                                 meta={'items': deepcopy(items)})
Example #14
    def parse(self, response):
        items = response.meta['items']
        infos = re.findall('<a href=(.*?)</ul>', response.text, re.S)

        for each_li in infos:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = re.search(r'>(.*?)</a>', each_li,
                                           re.S).group(1)
            except:
                pass

            try:
                items['url'] = self.baseUrl + re.search(
                    r'"(.*?)"', each_li, re.S).group(1)
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                items['web_time'] = re.search(r'\[(.*?)\]', each_li,
                                              re.S).group(1)
            except:
                pass
            # print(items)
            yield scrapy.Request(items['url'],
                                 callback=self.parse_article,
                                 headers=self.headers,
                                 meta={'items': deepcopy(items)})
Example #15
    def parse(self, response):
        items = response.meta['items']
        # The first tr is not a real record, so skip it
        infos = response.xpath('//div[@class="news"]//tr')[1:]

        for each_li in infos:
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                if 'jsgcBgtz' not in response.url and 'jsgcZbjggs' not in response.url:
                    items['title'] = each_li.xpath(
                        self.xpath_rule['title_rule']).extract_first().strip()
                elif 'jsgcZbjggs' not in response.url:
                    items['title'] = each_li.xpath(
                        self.xpath_rule['modify_title_rule']).extract_first(
                        ).strip()
                else:
                    items['title'] = each_li.xpath(
                        self.xpath_rule['result_title_rule']).extract_first(
                        ).strip()
            except:
                pass

            try:
                if 'jsgcBgtz' not in response.url:
                    items['url'] = self.baseUrl + each_li.xpath(
                        self.xpath_rule['url_rule']).extract_first()
                else:
                    items['url'] = self.baseUrl + each_li.xpath(
                        self.xpath_rule['modify_url_rule']).extract_first()
            except:
                print(items)
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                if 'jsgcBgtz' not in response.url:
                    items['web_time'] = each_li.xpath(
                        self.xpath_rule['web_time_rule']).extract_first(
                        ).strip()
                else:
                    items['web_time'] = each_li.xpath(
                        self.xpath_rule['modify_web_time_rule']).extract_first(
                        ).strip()
            except:
                pass
            # print(items)
            yield scrapy.Request(items['url'],
                                 callback=self.parse_article,
                                 headers=self.headers,
                                 meta={'items': deepcopy(items)})
Example #16
    def parse(self, response):
        items = response.meta['items']

        # Get all tr tags containing the records
        all_trs = response.xpath('//table[@class="listInfoTable"]//tr')[1:]

        for each_tr in all_trs:

            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items['intro'] = ''
            items['addr_id'] = ''

            try:
                items['title'] = each_tr.xpath(
                    './td/@title').extract_first().replace('\r', '')
            except:
                pass

            try:
                items['url'] = self.article_url + each_tr.xpath(
                    './td/a/@href').extract_first()
            except:
                msg = self.name + ', 该爬虫详情页获取url失败'
                send_mail_when_error(msg)
                self.error_count += 1
                if self.error_count > 3:
                    msg = self.name + ', 该爬虫因详情页获取失败被暂停'
                    send_mail_when_error(msg)
                    quit()

            try:
                items['web_time'] = each_tr.xpath(
                    './td/following-sibling::td[4]/@title').extract_first(
                    ).strip()
            except:
                pass

            for city in self.city_dict:
                if city in items['title']:
                    items['addr_id'] = self.city_dict[city]
                    break

            # The city name cannot always be found in the title, so fall back to searching the article body
            if items['addr_id'] == '':
                for each_pattern in self.pattern_list:
                    try:
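                        # NOTE: dirty_article (the article body text) is not defined anywhere in this snippet, so this search raises NameError and the bare except simply skips to the next pattern.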
                        search_text = re.search(each_pattern, dirty_article,
                                                re.S).group(1)
                    except:
                        continue
                    else:
                        for city_name in self.city_dict:
                            if city_name in search_text:
                                items['addr_id'] = self.city_dict[city_name]
                                break
                    break

            yield scrapy.Request(url=items['url'],
                                 callback=self.parse_article,
                                 meta={'items': deepcopy(items)},
                                 headers=self.headers)