def parse(self, response):
    items = response.meta['items']
    # The last tr is the pagination bar, so drop it
    all_trs = response.xpath('//div[@class="ewb-comp-bd"]//table//tr')[:-1]
    for each_tr in all_trs:
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        try:
            items['title'] = each_tr.xpath('./td[1]/a/@title').extract_first()
        except:
            pass
        try:
            items['url'] = self.govPurchase_baseUrl + each_tr.xpath(
                './td[1]/a/@href').extract_first()
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            items['web_time'] = each_tr.xpath('./td[2]/text()').extract_first().strip()
        except:
            pass
        yield scrapy.Request(url=items['url'], callback=self.parse_article,
                             meta={'items': deepcopy(items)}, headers=self.headers)
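# Every parse() method in this listing assumes the same surroundings: `import re`,
# `import scrapy`, `from copy import deepcopy`, and a project-internal
# `send_mail_when_error` helper whose implementation is not shown here. The sketch
# below is only a guess at what such a helper could look like; the SMTP settings
# and recipient list are placeholders, not names from the original project.

import smtplib
from email.mime.text import MIMEText

SMTP_HOST = 'smtp.example.com'          # placeholder, assumed setting
SMTP_USER = 'alerts@example.com'        # placeholder, assumed setting
SMTP_PASS = '...'                       # placeholder, assumed setting
ALERT_RECIPIENTS = ['ops@example.com']  # placeholder, assumed setting

def send_mail_when_error(msg):
    """Hypothetical sketch: mail an error message to the maintainers."""
    mail = MIMEText(msg, 'plain', 'utf-8')
    mail['Subject'] = 'spider error'
    mail['From'] = SMTP_USER
    mail['To'] = ', '.join(ALERT_RECIPIENTS)
    with smtplib.SMTP(SMTP_HOST) as server:
        server.login(SMTP_USER, SMTP_PASS)
        server.sendmail(SMTP_USER, ALERT_RECIPIENTS, mail.as_string())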
def parse(self):
    for each_li in self.infos:
        self.items['title'] = ''
        self.items['url'] = ''
        self.items['web_time'] = ''
        self.items['intro'] = ''
        self.items['addr_id'] = ''
        try:
            self.items['title'] = each_li.xpath(
                self.title_rule).extract_first().strip()
        except:
            pass
        try:
            self.items['url'] = self.base_url + each_li.xpath(
                self.url_rule).extract_first()
        except:
            msg = self.spider_name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.spider_name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            self.items['web_time'] = each_li.xpath(
                self.web_time_rule).extract_first().strip()
        except:
            pass
        yield self.items
def parse(self, response): items = response.meta['items'] infos = response.xpath('//div[@class="List2"]/ul/li') for each_li in infos: items['title'] = '' items['url'] = '' items['web_time'] = '' items['intro'] = '' items['addr_id'] = '' try: items['title'] = each_li.xpath( self.xpath_rule['title_rule']).extract_first().strip() except: pass try: items['url'] = self.baseUrl + each_li.xpath( self.xpath_rule['url_rule']).extract_first() except: msg = self.name + ', 该爬虫详情页获取url失败' send_mail_when_error(msg) self.error_count += 1 if self.error_count > 3: quit() msg = self.name + ', 该爬虫因详情页获取失败被暂停' send_mail_when_error(msg) pass try: items['web_time'] = each_li.xpath( self.xpath_rule['web_time_rule']).extract_first().strip() except: pass # 分别需要使用年份月份id来构造真实文章的url链接 year, month = items['web_time'].split( '-')[0], items['web_time'].split('-')[1] article_id = re.search(r'infoId=(.*?)&', items['url'], re.S).group(1) if items['type_id'] == '38255': article_url = self.bidNotice_url.format( year, month, article_id) elif items['type_id'] == '38257': article_url = self.bidResult_url.format( year, month, article_id) else: article_url = self.modifyResult_url.format( year, month, article_id) yield scrapy.Request(article_url, callback=self.parse_article, headers=self.headers, meta={'items': deepcopy(items)})
def parse(self, response):
    # The first value="..." attribute on the page is the ASP.NET __VIEWSTATE token
    VIEWSTATE = re.search(r'value="(.*?)"', response.text, re.S).group(1)
    infos = response.xpath('//table[@class="wb-data-item"]//tr')
    for each_li in infos:
        items = {}
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        try:
            items['title'] = each_li.xpath(
                self.xpath_rule['title_rule']).extract_first().strip()
        except:
            pass
        try:
            items['url'] = self.baseUrl + each_li.xpath(
                self.xpath_rule['url_rule']).extract_first()
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            items['web_time'] = each_li.xpath(
                self.xpath_rule['web_time_rule']).extract_first().strip()
        except:
            pass
        yield scrapy.Request(url=items['url'], callback=self.parse_article,
                             headers=self.headers, meta={'items': deepcopy(items)})
    # 1,443 pages in total; page through them with the ASP.NET postback form
    if self.count < 3:
        self.count += 1
        form_data = {
            '__VIEWSTATE': VIEWSTATE,
            '__VIEWSTATEGENERATOR': 'D38D4441',
            '__EVENTTARGET': 'JyxxSearch1$Pager',
            '__EVENTARGUMENT': str(self.count),
            'JyxxSearch1$Pager_input': '1',
            '__VIEWSTATEENCRYPTED': ''
        }
        yield scrapy.FormRequest(url=self.gov_bidNotice_url, callback=self.parse,
                                 dont_filter=True, formdata=form_data,
                                 headers=self.headers)
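# The VIEWSTATE regex above captures the first value="..." attribute on the page
# and so relies on __VIEWSTATE being the very first input element, which ASP.NET
# pages usually, but not always, guarantee. A sketch of a more targeted extraction
# that selects the hidden field by its id and keeps the regex as a fallback:

import re

def extract_viewstate(response):
    """Sketch: pull the __VIEWSTATE token out of an ASP.NET page by input id."""
    viewstate = response.xpath('//input[@id="__VIEWSTATE"]/@value').extract_first()
    if viewstate is None:
        # fall back to the positional regex used above
        viewstate = re.search(r'value="(.*?)"', response.text, re.S).group(1)
    return viewstate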
def parse(self, response):
    items = response.meta['items']
    infos = response.xpath(self.xpath_rule['list_page'])
    for each_li in infos[1:]:
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        # The rules are split this finely because every tr differs on the
        # construction-project pages, while the government-procurement pages
        # all share one layout
        try:
            if 'jsgcZbgg' in response.url or 'zfcg' in response.url:
                items['title'] = ''.join(each_li.xpath(
                    self.xpath_rule['title_rule']).extract()).strip()
            elif 'jsgcBgtz' in response.url:
                items['title'] = ''.join(
                    each_li.xpath('./td[4]/a/@title').extract()).strip()
            else:
                items['title'] = ''.join(
                    each_li.xpath('./td[3]/@title').extract()).strip()
        except:
            pass
        try:
            if 'jsgcZbgg' in response.url or 'jsgcZbjggs' in response.url \
                    or 'zfcg' in response.url:
                items['url'] = self.baseUrl + each_li.xpath(
                    self.xpath_rule['url_rule']).extract_first()
            else:
                items['url'] = self.baseUrl + ''.join(
                    each_li.xpath('./td[4]/a/@href').extract()).strip()
            if items['url'] is None:
                raise Exception
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            if 'jsgcZbgg' in response.url or 'jsgcZbjggs' in response.url \
                    or 'zfcg' in response.url:
                items['web_time'] = each_li.xpath(
                    self.xpath_rule['web_time_rule']).extract_first().strip()
            else:
                items['web_time'] = ''.join(
                    each_li.xpath('./td[5]/text()').extract()).strip()
        except:
            pass
        yield scrapy.Request(items['url'], callback=self.parse_article,
                             headers=self.headers, meta={'items': deepcopy(items)})
def parse(self, response):
    items = response.meta['items']
    # Each li under the form holds one tender notice
    all_lis = response.xpath('//form[@id="moderate"]/li')
    for each_li in all_lis:
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        try:
            items['title'] = each_li.xpath(
                './a/span/text()').extract_first().strip()
        except:
            pass
        try:
            items['url'] = self.article_url + each_li.xpath(
                './a/@href').extract_first()
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            items['web_time'] = each_li.xpath(
                './/em/text()').extract_first().split(' ')[0].strip()
        except:
            pass
        # Map a city name found in the title to its addr_id
        for city in self.city_dict:
            if city in items['title']:
                items['addr_id'] = self.city_dict[city]
                break
        yield scrapy.Request(url=items['url'], callback=self.article_parse,
                             meta={'items': deepcopy(items)})
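# The city lookup above scans city_dict key by key for every item, and which city
# wins depends on dict iteration order when a title mentions several. A single
# precompiled alternation returns the leftmost city named in the title instead;
# a sketch, assuming city_dict maps city names to addr_id values as above:

import re

def make_city_matcher(city_dict):
    """Sketch: build a matcher mapping the first city named in a text to its id."""
    pattern = re.compile('|'.join(map(re.escape, city_dict)))

    def lookup(text):
        match = pattern.search(text)
        return city_dict[match.group(0)] if match else ''

    return lookup

# usage: self.city_lookup = make_city_matcher(self.city_dict)
#        items['addr_id'] = self.city_lookup(items['title'])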
def parse(self, response):
    items = response.meta['items']
    # Each li holds one tender notice
    all_lis = response.xpath(self.xpath_rule['list_page'])
    for each_li in all_lis:
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        try:
            items['title'] = each_li.xpath(
                self.xpath_rule['title_rule']).extract_first()
        except:
            pass
        try:
            # [1:] drops the leading character of the relative href before joining
            if items['type_id'] == '38255':
                items['url'] = self.bidNotice_baseUrl + each_li.xpath(
                    self.xpath_rule['url_rule']).extract_first()[1:]
            else:
                items['url'] = self.resultNotice_baseUrl + each_li.xpath(
                    self.xpath_rule['url_rule']).extract_first()[1:]
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            items['web_time'] = each_li.xpath(
                self.xpath_rule['web_time_rule']).extract_first()
        except:
            pass
        yield scrapy.Request(items['url'], callback=self.parse_article,
                             headers=self.headers, meta={'items': deepcopy(items)})
def parse(self, response):
    items = response.meta['items']
    all_lis = response.xpath('//div[@class="infor-con2 on"]//li')
    for each_li in all_lis:
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        try:
            items['title'] = each_li.xpath('./a/@title').extract_first()
        except:
            pass
        try:
            # Some hrefs already carry the scheme and some do not, hence the check
            items['url'] = each_li.xpath('./a/@href').extract_first()
            if 'http' not in items['url']:
                items['url'] = self.govPurchase_baseUrl + items['url']
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            items['web_time'] = each_li.xpath('./span/text()').extract_first()
        except:
            pass
        # Skip links that point at downloadable files instead of article pages
        if '.doc' not in items['url'] and '.rar' not in items['url'] \
                and '.jpg' not in items['url'] and '.docx' not in items['url']:
            yield scrapy.Request(items['url'], callback=self.parse_article,
                                 meta={'items': deepcopy(items)}, headers=self.headers)
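# The chained 'not in' checks above match the extension anywhere in the url (a
# query string containing '.doc' would also be skipped, and '.docx' is already
# covered by the '.doc' test). A sketch that inspects the actual path extension:

import os
from urllib.parse import urlparse

SKIPPED_EXTENSIONS = {'.doc', '.docx', '.rar', '.jpg'}

def is_article_link(url):
    """Sketch: keep only links whose path does not end in a skipped extension."""
    ext = os.path.splitext(urlparse(url).path)[1].lower()
    return ext not in SKIPPED_EXTENSIONS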
def parse(self, response):
    items = response.meta['items']
    infos = response.xpath(self.xpath_rule['list_page'])
    for each_li in infos:
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        try:
            items['title'] = ''.join(each_li.xpath(
                self.xpath_rule['title_rule']).extract()).strip()
        except:
            pass
        try:
            items['url'] = self.baseUrl + each_li.xpath(
                self.xpath_rule['url_rule']).extract_first()
            if items['url'] is None:
                raise Exception
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            # Normalize the scraped date before storing it
            page_date = each_li.xpath(
                self.xpath_rule['web_time_rule']).extract_first().strip()
            items['web_time'] = self.switch_date(page_date)
        except:
            pass
        yield scrapy.Request(items['url'], callback=self.parse_article,
                             headers=self.headers, meta={'items': deepcopy(items)})
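# switch_date is a project-internal normalizer that this listing does not show.
# A minimal sketch of what it might do, assuming (this is a guess) the list page
# renders dates in a form such as '2018年05月31日' that must become '2018-05-31':

import re

def switch_date(page_date):
    """Hypothetical sketch: normalize a scraped date string to YYYY-MM-DD."""
    match = re.search(r'(\d{4})\D+(\d{1,2})\D+(\d{1,2})', page_date)
    if not match:
        return page_date  # leave unrecognized formats untouched
    year, month, day = match.groups()
    return '{}-{:0>2}-{:0>2}'.format(year, month, day)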
def parse(self, response):
    items = response.meta['items']
    infos = response.xpath(self.xpath_rule['list_page'])
    for each_li in infos:
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        try:
            items['title'] = ''.join(each_li.xpath(
                self.xpath_rule['title_rule']).extract()).strip()
        except:
            pass
        try:
            # The site AES-encrypts part of the article path; any '/' inside the
            # encrypted suffix is re-encoded as '%5E' to keep the url routable
            get_url = self.baseUrl + each_li.xpath(
                self.xpath_rule['url_rule']).extract_first()
            get_prefix = re.search(r'(http://ggzy.xzsp.tj.gov.cn:80/jyxx.*?/)',
                                   get_url, re.S).group(1)
            dirty_url = aes.url_encrypt(get_url)
            suffix_url = re.search(r'http://ggzy.xzsp.tj.gov.cn:80/jyxx.*?/(.*)',
                                   dirty_url, re.S).group(1)
            if '/' in suffix_url:
                items['url'] = get_prefix + suffix_url.replace('/', '%5E')
            else:
                items['url'] = dirty_url
            if items['url'] is None:
                raise Exception
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            items['web_time'] = each_li.xpath(
                self.xpath_rule['web_time_rule']).extract_first().strip()
        except:
            pass
        yield scrapy.Request(items['url'], callback=self.parse_article,
                             headers=self.headers, meta={'items': deepcopy(items)})
def parse(self, response):
    items = response.meta['items']
    # The health channel ('071008') uses a different list layout and rule set;
    # pick both in one branch and keep the choice local to this response
    if '071008' not in response.url:
        infos = response.xpath('//table[@class="ewb-trade-tb"]//tr')[1:]
        xpath_rule = self.xpath_rule
    else:
        infos = response.xpath('//ul[@class="wb-data-item"]/li')
        xpath_rule = self.health_xpath
    for each_li in infos:
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        try:
            items['title'] = each_li.xpath(
                xpath_rule['title_rule']).extract_first().strip()
        except:
            pass
        try:
            items['url'] = self.baseUrl + each_li.xpath(
                xpath_rule['url_rule']).extract_first()
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            items['web_time'] = each_li.xpath(
                xpath_rule['web_time_rule']).extract_first().strip()
        except:
            pass
        yield scrapy.Request(items['url'], callback=self.parse_article,
                             headers=self.headers, meta={'items': deepcopy(items)})
def parse(self, response): items = response.meta['items'] # 获取所有招标信息的li标签 all_lis = response.xpath('//div[@class="serviceMsg"]//ul/li') for each_li in all_lis: items['title'] = '' items['url'] = '' items['web_time'] = '' items['intro'] = '' items['addr_id'] = '' try: items['title'] = each_li.xpath('./a/text()').extract_first() except: pass try: items['url'] = self.base_url + each_li.xpath( './a/@href').extract_first() except: msg = self.name + ', 该爬虫详情页获取url失败' send_mail_when_error(msg) self.error_count += 1 if self.error_count > 3: quit() msg = self.name + ', 该爬虫因详情页获取失败被暂停' send_mail_when_error(msg) pass try: # (2017-05-31)日期格式 dirty_time = each_li.xpath('.//span/text()').extract_first() items['web_time'] = re.sub('\(|\)', '', dirty_time).strip() except: pass yield scrapy.Request(items['url'], callback=self.parse_article, meta={'items': deepcopy(items)}, headers=self.headers)
def parse(self, response): items = response.meta['items'] infos = response.xpath('//div[@id="jt"]/ul/li') for each_li in infos: items['title'] = '' items['url'] = '' items['web_time'] = '' items['intro'] = '' items['addr_id'] = '' try: items['title'] = each_li.xpath( self.xpath_rule['title_rule']).extract_first().strip() except: pass try: items['url'] = self.baseUrl + each_li.xpath( self.xpath_rule['url_rule']).extract_first() except: msg = self.name + ', 该爬虫详情页获取url失败' send_mail_when_error(msg) self.error_count += 1 if self.error_count > 3: quit() msg = self.name + ', 该爬虫因详情页获取失败被暂停' send_mail_when_error(msg) pass try: items['web_time'] = each_li.xpath( self.xpath_rule['web_time_rule']).extract_first().strip() except: pass yield scrapy.Request(items['url'], callback=self.parse_article, headers=self.headers, meta={'items': deepcopy(items)})
def parse(self, response):
    items = response.meta['items']
    # Pull the list items out of the raw HTML with regular expressions
    infos = re.findall('<a href=(.*?)</ul>', response.text, re.S)
    for each_li in infos:
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        try:
            items['title'] = re.search(r'>(.*?)</a>', each_li, re.S).group(1)
        except:
            pass
        try:
            items['url'] = self.baseUrl + re.search(
                r'"(.*?)"', each_li, re.S).group(1)
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            items['web_time'] = re.search(r'\[(.*?)\]', each_li, re.S).group(1)
        except:
            pass
        yield scrapy.Request(items['url'], callback=self.parse_article,
                             headers=self.headers, meta={'items': deepcopy(items)})
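# The three re.search calls above rescan the same chunk once per field. A single
# pattern with named groups can pull the href, title and date in one pass; a
# sketch, assuming (not confirmed by the source) that each list item looks like
# `<a href="/path" ...>title</a> ... [2018-05-31]`:

import re

ITEM_PATTERN = re.compile(
    r'<a\s+href="(?P<href>[^"]+)"[^>]*>(?P<title>.*?)</a>.*?\[(?P<date>.*?)\]',
    re.S)

def parse_list_chunk(chunk):
    """Sketch: extract (title, href, date) from one list-item chunk of HTML."""
    match = ITEM_PATTERN.search(chunk)
    if match is None:
        return None
    return match.group('title'), match.group('href'), match.group('date')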
def parse(self, response):
    items = response.meta['items']
    # The first tr carries no usable data, so skip it
    infos = response.xpath('//div[@class="news"]//tr')[1:]
    for each_li in infos:
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        try:
            if 'jsgcBgtz' not in response.url and 'jsgcZbjggs' not in response.url:
                items['title'] = each_li.xpath(
                    self.xpath_rule['title_rule']).extract_first().strip()
            elif 'jsgcZbjggs' not in response.url:
                # only jsgcBgtz (modification notice) urls reach this branch
                items['title'] = each_li.xpath(
                    self.xpath_rule['modify_title_rule']).extract_first().strip()
            else:
                items['title'] = each_li.xpath(
                    self.xpath_rule['result_title_rule']).extract_first().strip()
        except:
            pass
        try:
            if 'jsgcBgtz' not in response.url:
                items['url'] = self.baseUrl + each_li.xpath(
                    self.xpath_rule['url_rule']).extract_first()
            else:
                items['url'] = self.baseUrl + each_li.xpath(
                    self.xpath_rule['modify_url_rule']).extract_first()
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            if 'jsgcBgtz' not in response.url:
                items['web_time'] = each_li.xpath(
                    self.xpath_rule['web_time_rule']).extract_first().strip()
            else:
                items['web_time'] = each_li.xpath(
                    self.xpath_rule['modify_web_time_rule']).extract_first().strip()
        except:
            pass
        yield scrapy.Request(items['url'], callback=self.parse_article,
                             headers=self.headers, meta={'items': deepcopy(items)})
def parse(self, response):
    items = response.meta['items']
    # Every tr after the header row holds one notice
    all_trs = response.xpath('//table[@class="listInfoTable"]//tr')[1:]
    for each_tr in all_trs:
        items['title'] = ''
        items['url'] = ''
        items['web_time'] = ''
        items['intro'] = ''
        items['addr_id'] = ''
        try:
            items['title'] = each_tr.xpath(
                './td/@title').extract_first().replace('\r', '')
        except:
            pass
        try:
            items['url'] = self.article_url + each_tr.xpath(
                './td/a/@href').extract_first()
        except:
            msg = self.name + ', failed to extract the detail-page url'
            send_mail_when_error(msg)
            self.error_count += 1
            if self.error_count > 3:
                # too many consecutive failures: notify and stop the crawl
                msg = self.name + ', spider paused after repeated detail-page url failures'
                send_mail_when_error(msg)
                quit()
        try:
            items['web_time'] = each_tr.xpath(
                './td/following-sibling::td[4]/@title').extract_first().strip()
        except:
            pass
        for city in self.city_dict:
            if city in items['title']:
                items['addr_id'] = self.city_dict[city]
                break
        # The title does not always name a city, so fall back to searching the
        # article body. NOTE: dirty_article is never assigned in this method and
        # the bare except hides the resulting NameError, so this fallback only
        # takes effect if moved into parse_article, where the article text exists.
        if items['addr_id'] == '':
            for each_pattern in self.pattern_list:
                try:
                    search_text = re.search(each_pattern, dirty_article, re.S).group(1)
                except:
                    continue
                else:
                    for city_name in self.city_dict:
                        if city_name in search_text:
                            items['addr_id'] = self.city_dict[city_name]
                            break
                    break
        yield scrapy.Request(url=items['url'], callback=self.parse_article,
                             meta={'items': deepcopy(items)}, headers=self.headers)
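# The try/except/mail/quit block around the url extraction recurs verbatim in
# every spider in this listing. A sketch of a mixin that would centralize it,
# assuming each spider keeps its name and error_count attributes and the same
# send_mail_when_error helper; max_url_errors mirrors the hard-coded 3 above:

class DetailUrlErrorMixin:
    """Sketch: shared handling for detail-page url extraction failures."""
    max_url_errors = 3

    def record_url_error(self):
        msg = self.name + ', failed to extract the detail-page url'
        send_mail_when_error(msg)
        self.error_count += 1
        if self.error_count > self.max_url_errors:
            # too many consecutive failures: notify and stop the crawl
            msg = self.name + ', spider paused after repeated detail-page url failures'
            send_mail_when_error(msg)
            quit()

# usage inside any parse() above:
#     try:
#         items['url'] = self.baseUrl + each_li.xpath(...).extract_first()
#     except:
#         self.record_url_error()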