class BrookingsExpertsSpider(scrapy.Spider):
    """Spider for Brookings expert profile pages."""
    urls_data = start_item.get_url('brookings_experts')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """
        Parse the home page.

        :param response: home page; yields one request for the experts
            navigation page.
        """
        experts_navi = response.xpath(
            '//*[@id="menu-item-20631"]/a/@href').extract_first()
        # urljoin tolerates a None/relative href from extract_first().
        experts_navi_url = response.urljoin(experts_navi)
        yield scrapy.Request(experts_navi_url, callback=self.parse_expert)

    def parse_expert(self, response):
        """
        Parse an experts listing page.

        :param response: listing page; yields a request per expert detail
            page, then a request for the next listing page.
        """
        experts_urls = response.xpath(
            '//div[@class="list-content"]/article/div[@class="expert-image"]/a/@href'
        ).extract()
        if experts_urls:
            for experts_url in experts_urls:
                yield scrapy.Request(experts_url,
                                     callback=self.parse_expert_detail)
            # Only keep paginating while the current page produced experts;
            # otherwise the spider would walk empty pages forever.
            page = response.meta.get('page', 1)
            base_url = 'https://www.brookings.edu/experts/page/{}/'
            yield scrapy.Request(base_url.format(page),
                                 callback=self.parse_expert,
                                 meta={'page': page + 1})

    def parse_expert_detail(self, response):
        """Parse an expert detail page into a ThinkTankItem."""
        content_by_xpath = parse_item.parse_response(self.urls_data['site'],
                                                     response)
        # Fill in the fields that are not obtained by xpath parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
class HeritageSpider(scrapy.Spider):
    """Spider for heritage.org search-result listings."""
    urls_data = start_item.get_url('heritage')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """
        Parse the first search page: emit its article links and schedule
        every remaining pager page.

        :return: requests for article detail pages and listing pages.
        """
        # Pagination base link.
        base_url = ('https://www.heritage.org/search?contains=&type=All'
                    '&date_offset=&range_start=&range_end=&page={}')
        # Site base link (hrefs in the listing are site-relative).
        site_base_url = 'https://www.heritage.org'
        page_urls = response.xpath(
            '//div[@class="views-row"]/section/div/a/@href').extract()
        for page_url in page_urls:
            yield scrapy.Request(site_base_url + page_url,
                                 callback=self.parse_page_detail)
        # Guard: extract_first() returns None when the pager is missing,
        # which previously raised AttributeError on .split().
        last_page_href = response.xpath(
            '//li[contains(@class,"pager__item--last")]/a/@href'
        ).extract_first()
        if last_page_href:
            total_page = int(last_page_href.split('=')[-1])
            for page in range(1, total_page + 1):
                yield scrapy.Request(base_url.format(page),
                                     callback=self.parse_all_urls,
                                     meta={'site_base_url': site_base_url})

    def parse_all_urls(self, response):
        """
        Parse one listing page.

        :return: requests for the article links on the current page.
        """
        site_base_url = response.meta.get('site_base_url')
        page_urls = response.xpath(
            '//div[@class="views-row"]/section/div/a/@href').extract()
        for page_url in page_urls:
            yield scrapy.Request(site_base_url + page_url,
                                 callback=self.parse_page_detail)

    def parse_page_detail(self, response):
        """Parse an article detail page into a ThinkTankItem."""
        content_by_xpath = parse_item.parse_response(self.urls_data['tag'],
                                                     response)
        # Fill in the fields that are not obtained by xpath parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
class BrookingsAboutSpider(scrapy.Spider):
    """Spider for the Brookings "about" page."""
    urls_data = start_item.get_url('brookings_about')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """Parse the page and emit a single ThinkTankItem."""
        site = self.urls_data['site']
        content_by_xpath = parse_item.parse_response(site, response)
        # Fill in the fields that are not obtained by xpath parsing.
        parse_item.processing_data(content_by_xpath)
        item = ThinkTankItem()
        item['data'] = parse_item.parse_common_field(response,
                                                     content_by_xpath, site)
        item['site'] = site
        item['tag'] = self.urls_data['tag']
        yield item
class CarnegieendowmentSpider(scrapy.Spider):
    """Spider for carnegieendowment.org search results."""
    urls_data = start_item.get_url('carnegieendowment')
    name = urls_data['tag']
    # allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """Kick off crawling from the site search page."""
        base_url = 'https://carnegieendowment.org/search/?qry=&center='
        yield scrapy.Request(base_url, callback=self.parse_all_urls)

    def parse_all_urls(self, response):
        """
        Parse one results page: emit each result link plus the next page.
        Pagination stops naturally when the "next" link disappears.
        """
        page_content_urls = response.xpath(
            '//div[contains(@class,"foreground")]//ul//li[@class="clearfix"]/h4/a/@href'
        ).extract()
        for page_content_url in page_content_urls:
            yield scrapy.Request(url=response.urljoin(page_content_url),
                                 callback=self.parse_page_detail)
        next_page = response.xpath(
            '//a[contains(@class,"page-links__next")]/@href').extract_first()
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.parse_all_urls)

    def parse_page_detail(self, response):
        """Parse a result detail page into a ThinkTankItem."""
        content_by_xpath = parse_item.parse_response(self.urls_data['tag'],
                                                     response)
        # Removed dead code: `comment = content_by_xpath['comment_author']`
        # followed by `if comment: pass` had no effect.
        # Fill in the fields that are not obtained by xpath parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
class BruegelSpdier(scrapy.Spider):
    """Spider for bruegel.org paged content listings."""
    urls_data = start_item.get_url('bruegel')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """
        Parse a listing page: emit each content link, then the next page.
        Pagination continues only while the page yielded content links.
        """
        page_content_urls = response.xpath(
            '//div[@class="mdl-submenu"]/a[1]/@href').extract()
        if page_content_urls:
            for page_content_url in page_content_urls:
                yield scrapy.Request(page_content_url,
                                     callback=self.parse_page_detail)
            base_url = 'http://bruegel.org/?basefilter=all&s=&paged={}'
            # Simplified from an if/else on meta.get('page').
            page = response.meta.get('page', 1)
            yield scrapy.Request(url=base_url.format(page),
                                 callback=self.parse,
                                 meta={'page': page + 1})

    def parse_page_detail(self, response):
        """Parse a content detail page into a ThinkTankItem."""
        content_by_xpath = parse_item.parse_response(self.urls_data['tag'],
                                                     response)
        # Fill in the fields that are not obtained by xpath parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
class ChaThamHouseSpider(scrapy.Spider):
    """Spider for chathamhouse.org category listings and their tab views."""
    urls_data = start_item.get_url('chathamhouse')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']
    item = ThinkTankItem()

    def parse(self, response):
        """Request the first listing page (the pager starts at page=0)."""
        get_second_navi = response.url + '?page=0'
        yield scrapy.Request(get_second_navi, callback=self.parse_second_navi)

    def parse_second_navi(self, response):
        """
        Parse the second-level navigation (category listing).

        :param response: listing page; yields a request per category detail
            page plus the next pager page, when present.
        """
        classify_urls = response.xpath(
            '//div[@class="view-content"]//a/@href').extract()
        for classify_url in classify_urls:
            classify_detail_url = response.urljoin(classify_url)
            yield scrapy.Request(classify_detail_url,
                                 callback=self.parse_latest__detail)
        total_page = response.xpath(
            '//li[contains(@class,"pager-next")]/a/@href').extract_first()
        if total_page:
            next_page = response.urljoin(total_page)
            yield scrapy.Request(next_page, callback=self.parse_second_navi)

    def parse_latest__detail(self, response):
        """Dispatch to the tab ("fragment") parsers present on the page."""
        # Latest events tab.
        fragment0 = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing-default")]'
        ).extract()
        if fragment0:
            fragment0_url = response.url + '#fragment-0'
            yield scrapy.Request(fragment0_url,
                                 callback=self.parse_fragment0,
                                 dont_filter=True)
        # Past events tab.
        # NOTE(review): this probe matches "...listing-block_2" but
        # parse_fragment3 then selects "...listing-block_3" — confirm which
        # class the site actually uses.
        fragment3 = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing-block_2")]'
        ).extract()
        if fragment3:
            fragment3_url = response.url + '#fragment-3'
            yield scrapy.Request(fragment3_url,
                                 callback=self.parse_fragment3,
                                 dont_filter=True)
        # Audio & video tab.
        fragment4 = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing_audio_and_video-default")]'
        ).extract()
        if fragment4:
            fragment4_url = response.url + '#fragment-4'
            yield scrapy.Request(fragment4_url,
                                 callback=self.parse_fragment4,
                                 dont_filter=True)

    def parse_fragment0(self, response):
        """
        Parse the latest-events tab: item links plus its own pager.

        Fix: the ``def`` line was commented out in the source even though
        parse_latest__detail registers this method as a callback.
        """
        base_classify_urls = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing-default")]'
        )
        classify_latest_urls = base_classify_urls.xpath(
            './div/a//@href').extract()
        if classify_latest_urls:
            for classify_latest_url in classify_latest_urls:
                classify_latest = response.urljoin(classify_latest_url)
                yield scrapy.Request(classify_latest,
                                     callback=self.parse_page_detail,
                                     dont_filter=True)
        next_pager = base_classify_urls.xpath(
            './/li[contains(@class,"pager-next")]/a/@href').extract_first()
        if next_pager:
            next_page = response.urljoin(next_pager)
            yield scrapy.Request(next_page, callback=self.parse_fragment0)

    def parse_fragment3(self, response):
        """Parse the past-events tab: item links plus its own pager."""
        base_fragment3_url = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing-block_3")]'
        )
        classify_past_urls = base_fragment3_url.xpath(
            './div/a//@href').extract()
        if classify_past_urls:
            for classify_past_url in classify_past_urls:
                classify_past = response.urljoin(classify_past_url)
                yield scrapy.Request(classify_past,
                                     callback=self.parse_page_detail,
                                     dont_filter=True)
        next_pager = base_fragment3_url.xpath(
            './/li[contains(@class,"pager-next")]/a/@href').extract_first()
        if next_pager:
            next_page = response.urljoin(next_pager)
            yield scrapy.Request(next_page, callback=self.parse_fragment3)

    def parse_fragment4(self, response):
        """Parse the audio & video tab: item links plus its own pager."""
        base_fragment4_url = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing_audio_and_video-default")]'
        )
        classify_past_urls = base_fragment4_url.xpath(
            './div/a/@href').extract()
        if classify_past_urls:
            for classify_past_url in classify_past_urls:
                classify_past = response.urljoin(classify_past_url)
                yield scrapy.Request(classify_past,
                                     callback=self.parse_page_detail,
                                     dont_filter=True)
        next_pager = response.xpath(
            '//div[@id="fragment-4"]//li[contains(@class,"pager-next")]/a/@href'
        ).extract_first()
        if next_pager:
            next_page = response.urljoin(next_pager)
            yield scrapy.Request(next_page, callback=self.parse_fragment4)

    def parse_page_detail(self, response):
        """Parse a content detail page into a ThinkTankItem."""
        content_by_xpath = parse_item.parse_response(self.urls_data['tag'],
                                                     response)
        # Fill in the fields that are not obtained by xpath parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
class RandOrgSpider(scrapy.Spider):
    """Spider for rand.org topic listings."""
    urls_data = start_item.get_url('rand')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """
        Collect every topic navigation link.

        :param response: the topic index page.
        """
        base_link = 'https://www.rand.org'
        topic_hrefs = response.xpath(
            '//ul[@class="topic-list"]/li/ul/li/a/@href').extract()
        for href in topic_hrefs:
            yield scrapy.Request(url=base_link + href,
                                 callback=self.parse_calssify)

    def parse_calssify(self, response):
        """Start paging a topic from page 1."""
        topic_url = response.url
        first_page = f'{topic_url}?page=1'
        yield scrapy.Request(url=first_page,
                             callback=self.parse_all_url,
                             meta={'page': 1, 'url': topic_url})

    def parse_all_url(self, response):
        """
        Parse one topic page; keep paging while it still lists teasers.

        :param response: a topic listing page.
        """
        teaser_hrefs = response.xpath(
            '//ul[@class="teasers list organic"]/li/div[2]/h3/a/@href'
        ).extract()
        if teaser_hrefs:
            for detail_url in teaser_hrefs:
                yield scrapy.Request(url=detail_url,
                                     callback=self.parse_page_detail)
            next_page = response.meta.get('page') + 1
            topic_url = response.meta.get('url')
            yield scrapy.Request(url=f'{topic_url}?page={next_page}',
                                 callback=self.parse_all_url,
                                 meta={'page': next_page, 'url': topic_url})

    def parse_page_detail(self, response):
        """
        Parse a detail page.

        :return: a populated ThinkTankItem.
        """
        content_by_xpath = parse_item.parse_response(self.urls_data['tag'],
                                                     response)
        # Fill in the fields that are not obtained by xpath parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        data['expertDV'] = parse_item.parse_expert_DV(response,
                                                      data['expertDV'])
        parse_item.parse_check_data(data)
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
class BrookingsSpider(scrapy.Spider):
    """Spider for Brookings topic listings and articles."""
    urls_data = start_item.get_url('brookings')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """
        Parse the main page.

        :param response: yields requests for second-level navigation links.
        """
        second_navi_urls = response.xpath(
            '//div[@class="post-linear-list term-list topic-list-wrapper"][1]//ul/li/a/@href'
        ).extract()
        for second_navi_url in second_navi_urls:
            yield scrapy.Request(second_navi_url,
                                 callback=self.parse_second_navi,
                                 meta={'base_url': second_navi_url})

    def parse_second_navi(self, response):
        """
        Parse second-level navigation: jump straight to page 2 of the topic
        (page 1 content is the navigation page itself).
        """
        base_url = response.meta.get('base_url')
        classify_url = base_url + 'page/{}/'.format(2)
        yield scrapy.Request(classify_url,
                             callback=self.parse_topic_page,
                             meta={'page': 2, 'url': base_url})

    def parse_topic_page(self, response):
        """
        Parse one topic page; keep paging while articles are listed.

        :param response: a topic listing page.
        """
        classify_page_urls = response.xpath(
            '//div[@class="list-content"]/article/a/@href | //div[@class="list-content"]/article/div/h4/a/@href'
        ).extract()
        if classify_page_urls:
            for page_url in classify_page_urls:
                yield scrapy.Request(page_url,
                                     callback=self.parse_page_detail,
                                     meta={'get_image': True})
            page = response.meta.get('page') + 1
            meta_url = response.meta.get('url')
            yield scrapy.Request(meta_url + 'page/{}/'.format(page),
                                 callback=self.parse_topic_page,
                                 meta={'page': page, 'url': meta_url})

    def parse_page_detail(self, response):
        """Parse an article detail page into a ThinkTankItem."""
        # Fields are resolved via xpaths stored in the database.
        content_by_xpath = parse_item.parse_response(self.urls_data['site'],
                                                     response)
        content_by_xpath['svg_data'] = []
        if content_by_xpath['svg_url']:
            content_by_xpath['svg_data'].append(
                parse_item.parse_svg_url(content_by_xpath['svg_url']))
        # Fill in the fields that are not obtained by xpath parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        # Fix: was mutating a single class-level ThinkTankItem shared by all
        # concurrent responses, so items could clobber each other before
        # being exported. Build a fresh item per page, like the sibling
        # spiders do.
        item = ThinkTankItem()
        item['data'] = data
        item['tag'] = self.urls_data['tag']
        item['site'] = self.urls_data['site']
        yield item