Example #1
import scrapy

# The project-level helpers used throughout these examples (start_item,
# parse_item, ThinkTankItem) are assumed to be importable from the
# surrounding project; the same imports apply to the remaining examples.

class BrookingsExpertsSpider(scrapy.Spider):
    urls_data = start_item.get_url('brookings_experts')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """
        主页解析
        :param response:返回专家导航链接
        """
        experts_navi = response.xpath(
            '//*[@id="menu-item-20631"]/a/@href').extract_first()
        experts_navi_url = response.urljoin(experts_navi)
        yield scrapy.Request(experts_navi_url, callback=self.parse_expert)

    def parse_expert(self, response):
        """
        专家页面解析
        :param response: 专家详情链接
        """

        experts_urls = response.xpath(
            '//div[@class="list-content"]/article/div[@class="expert-image"]/a/@href'
        ).extract()
        if experts_urls:
            for experts_url in experts_urls:
                yield scrapy.Request(experts_url,
                                     callback=self.parse_expert_detail)

            # Pagination stops when a listing page yields no expert links.
            page = response.meta.get('page', 1)
            base_url = 'https://www.brookings.edu/experts/page/{}/'
            next_page = base_url.format(page)
            yield scrapy.Request(next_page,
                                 callback=self.parse_expert,
                                 meta={'page': page + 1})

    def parse_expert_detail(self, response):
        """
        解析专家详情
        """
        content_by_xpath = parse_item.parse_response(self.urls_data['site'],
                                                     response)
        # Populate fields that are not obtained by parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
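
All of the spiders in this collection bootstrap themselves from start_item.get_url(...), which evidently returns a dict with 'tag', 'site', and 'url' keys ('url' must be a list, since it is assigned to start_urls). A minimal sketch of what that helper might return, inferred purely from usage; the key names come from the code above, everything else is assumption:

    def get_url(key):
        # Hypothetical registry keyed by spider identifier; the real project
        # presumably loads this from a database or settings file.
        registry = {
            'brookings_experts': {
                'tag': 'brookings_experts',             # becomes the spider name
                'site': 'www.brookings.edu',            # used for allowed_domains
                'url': ['https://www.brookings.edu/'],  # start_urls expects a list
            },
        }
        return registry[key]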
Example #2
class HeritageSpider(scrapy.Spider):
    urls_data = start_item.get_url('heritage')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """
        获取首页文章链接,解析所有页面链接
        :return:页面链接
        """
        # Base URL for pagination
        base_url = 'https://www.heritage.org/search?contains=&type=All&date_offset=&range_start=&range_end=&page={}'
        # Site base URL
        site_base_url = 'https://www.heritage.org'
        # Article links on the current page
        page_urls = response.xpath('//div[@class="views-row"]/section/div/a/@href').extract()
        for page_url in page_urls:
            yield scrapy.Request(site_base_url + page_url, callback=self.parse_page_detail)

        total_page = response.xpath('//li[contains(@class,"pager__item--last")]/a/@href').extract_first().split('=')[-1]
        for page in range(1, int(total_page) + 1):
            yield scrapy.Request(base_url.format(page), callback=self.parse_all_urls,
                                 meta={'site_base_url': site_base_url})

    def parse_all_urls(self, response):
        """
        获取所有页面下链接
        :return: 返回当前页面下链接
        """
        site_base_url = response.meta.get('site_base_url')
        page_urls = response.xpath('//div[@class="views-row"]/section/div/a/@href').extract()
        for page_url in page_urls:
            yield scrapy.Request(site_base_url + page_url, callback=self.parse_page_detail)

    def parse_page_detail(self, response):
        """
        解析页面详情
        """
        content_by_xpath = parse_item.parse_response(self.urls_data['tag'], response)
        # Populate fields that are not obtained by parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath, self.urls_data['site'])
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
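
One fragile spot above: the total-page computation chains .extract_first().split('=')[-1], which raises AttributeError whenever the last-page link is absent (e.g. a single page of results). A defensive rewrite of that step, as a sketch:

    # Sketch: defensive total-page extraction for HeritageSpider.parse.
    last_page_href = response.xpath(
        '//li[contains(@class,"pager__item--last")]/a/@href').extract_first()
    # Heritage's pager hrefs end in "page=N"; assume no extra pages if absent.
    total_page = int(last_page_href.split('=')[-1]) if last_page_href else 0
    for page in range(1, total_page + 1):
        yield scrapy.Request(base_url.format(page), callback=self.parse_all_urls,
                             meta={'site_base_url': site_base_url})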
Example #3
class BrookingsAboutSpider(scrapy.Spider):
    urls_data = start_item.get_url('brookings_about')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """
        解析页面信息
        """
        content_by_xpath = parse_item.parse_response(self.urls_data['site'],
                                                     response)
        # Populate fields that are not obtained by parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
Example #4
class CarnegieendowmentSpider(scrapy.Spider):
    urls_data = start_item.get_url('carnegieendowment')
    name = urls_data['tag']
    # allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        # Start the crawl from the full-site search listing.
        base_url = 'https://carnegieendowment.org/search/?qry=&center='
        yield scrapy.Request(base_url, callback=self.parse_all_urls)

    def parse_all_urls(self, response):
        page_content_urls = response.xpath(
            '//div[contains(@class,"foreground")]//ul//li[@class="clearfix"]/h4/a/@href'
        ).extract()
        for page_content_url in page_content_urls:
            yield scrapy.Request(url=response.urljoin(page_content_url),
                                 callback=self.parse_page_detail)
        next_page = response.xpath(
            '//a[contains(@class,"page-links__next")]/@href').extract_first()
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.parse_all_urls)

    def parse_page_detail(self, response):
        content_by_xpath = parse_item.parse_response(self.urls_data['tag'],
                                                     response)
        # Populate fields that are not obtained by parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
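
The next-page handling here (extract the href, urljoin it, yield a new Request) is the classic Scrapy pagination idiom. On Scrapy 1.4+ the same step can be written more compactly with response.follow, which resolves relative hrefs itself; a sketch of the equivalent:

    # Equivalent pagination using response.follow (Scrapy 1.4+); the
    # explicit response.urljoin call becomes unnecessary.
    next_page = response.xpath(
        '//a[contains(@class,"page-links__next")]/@href').extract_first()
    if next_page:
        yield response.follow(next_page, callback=self.parse_all_urls)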
Example #5
class BruegelSpider(scrapy.Spider):
    urls_data = start_item.get_url('bruegel')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        page_content_urls = response.xpath(
            '//div[@class="mdl-submenu"]/a[1]/@href').extract()
        if page_content_urls:
            for page_content_url in page_content_urls:
                yield scrapy.Request(page_content_url,
                                     callback=self.parse_page_detail)

            # Pagination stops when a listing page yields no article links.
            base_url = 'http://bruegel.org/?basefilter=all&s=&paged={}'
            page = response.meta.get('page', 1)
            yield scrapy.Request(url=base_url.format(page),
                                 callback=self.parse,
                                 meta={'page': page + 1})

    def parse_page_detail(self, response):
        """Parse the article detail page."""
        content_by_xpath = parse_item.parse_response(self.urls_data['tag'],
                                                     response)
        # Populate fields that are not obtained by parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
Example #6
class ChathamHouseSpider(scrapy.Spider):
    urls_data = start_item.get_url('chathamhouse')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        get_second_navi = response.url + '?page=0'
        yield scrapy.Request(get_second_navi, callback=self.parse_second_navi)

    def parse_second_navi(self, response):
        """
        二级导航类型
        :param response:分类导航详情
        """
        classify_urls = response.xpath(
            '//div[@class="view-content"]//a/@href').extract()
        for classify_url in classify_urls:
            classify_detail_url = response.urljoin(classify_url)

            yield scrapy.Request(classify_detail_url,
                                 callback=self.parse_latest_detail)
        next_pager = response.xpath(
            '//li[contains(@class,"pager-next")]/a/@href').extract_first()
        if next_pager:
            next_page = response.urljoin(next_pager)
            yield scrapy.Request(next_page, callback=self.parse_second_navi)

    def parse_latest_detail(self, response):
        # Latest events
        fragment0 = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing-default")]'
        ).extract()
        if fragment0:
            fragment0_url = response.url + '#fragment-0'
            yield scrapy.Request(fragment0_url,
                                 callback=self.parse_fragment0,
                                 dont_filter=True)
        # Past events
        fragment3 = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing-block_2")]'
        ).extract()
        if fragment3:
            fragment3_url = response.url + '#fragment-3'
            yield scrapy.Request(fragment3_url,
                                 callback=self.parse_fragment3,
                                 dont_filter=True)
        # Audio and video
        fragment4 = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing_audio_and_video-default")]'
        ).extract()
        if fragment4:
            fragment4_url = response.url + '#fragment-4'
            yield scrapy.Request(fragment4_url,
                                 callback=self.parse_fragment4,
                                 dont_filter=True)

    def parse_fragment0(self, response):
        """
        Parse the latest events (first column of the third-level navigation).
        :param response: page link
        """
        base_classify_urls = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing-default")]'
        )
        classify_latest_urls = base_classify_urls.xpath(
            './div/a//@href').extract()
        if classify_latest_urls:
            for classify_latest_url in classify_latest_urls:
                classify_latest = response.urljoin(classify_latest_url)
                yield scrapy.Request(classify_latest,
                                     callback=self.parse_page_detail,
                                     dont_filter=True)

        next_pager = base_classify_urls.xpath(
            './/li[contains(@class,"pager-next")]/a/@href').extract_first()
        if next_pager:
            next_page = response.urljoin(next_pager)
            yield scrapy.Request(next_page, callback=self.parse_fragment0)

    def parse_fragment3(self, response):
        base_fragment3_url = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing-block_3")]'
        )
        classify_past_urls = base_fragment3_url.xpath(
            './div/a//@href').extract()
        if classify_past_urls:
            for classify_past_url in classify_past_urls:
                classify_past = response.urljoin(classify_past_url)
                yield scrapy.Request(classify_past,
                                     callback=self.parse_page_detail,
                                     dont_filter=True)

        next_pager = base_fragment3_url.xpath(
            './/li[contains(@class,"pager-next")]/a/@href').extract_first()
        if next_pager:
            next_page = response.urljoin(next_pager)
            yield scrapy.Request(next_page, callback=self.parse_fragment3)

    def parse_fragment4(self, response):
        base_fragment4_url = response.xpath(
            '//div[contains(@class,"view-section_index_auto_content_listing_audio_and_video-default")]'
        )
        classify_past_urls = base_fragment4_url.xpath(
            './div/a/@href').extract()
        if classify_past_urls:
            for classify_past_url in classify_past_urls:
                classify_past = response.urljoin(classify_past_url)
                yield scrapy.Request(classify_past,
                                     callback=self.parse_page_detail,
                                     dont_filter=True)

        next_pager = response.xpath(
            '//div[@id="fragment-4"]//li[contains(@class,"pager-next")]/a/@href'
        ).extract_first()
        if next_pager:
            next_page = response.urljoin(next_pager)
            yield scrapy.Request(next_page, callback=self.parse_fragment4)

    def parse_page_detail(self, response):

        content_by_xpath = parse_item.parse_response(self.urls_data['tag'],
                                                     response)
        # Populate fields that are not obtained by parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
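
A note on the dont_filter=True flags above: Scrapy's duplicate filter fingerprints requests on a canonicalized URL, and canonicalization drops the #fragment, so the fragment URLs built in parse_latest_detail would otherwise all collapse into the already-visited page URL and be silently discarded. The behavior comes from w3lib, which Scrapy's request fingerprinting builds on (the URL below is illustrative, not from the crawl):

    from w3lib.url import canonicalize_url

    # canonicalize_url strips fragments by default, so these two URLs look
    # identical to the dupefilter unless dont_filter=True is set.
    print(canonicalize_url('https://www.chathamhouse.org/events#fragment-0'))
    print(canonicalize_url('https://www.chathamhouse.org/events#fragment-3'))
    # Both print: https://www.chathamhouse.org/events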
Example #7
class RandOrgSpider(scrapy.Spider):
    urls_data = start_item.get_url('rand')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """
        获取所有导航链接
        :param response: 链接
        """
        base_link = 'https://www.rand.org'
        result = response.xpath(
            '//ul[@class="topic-list"]/li/ul/li/a/@href').extract()
        for url in result:
            classify_url = base_link + url
            yield scrapy.Request(url=classify_url,
                                 callback=self.parse_classify)

    def parse_classify(self, response):
        base_url = response.url
        page_url = base_url + '?page={}'.format(1)
        yield scrapy.Request(url=page_url,
                             callback=self.parse_all_url,
                             meta={
                                 'page': 1,
                                 'url': base_url
                             })

    def parse_all_url(self, response):
        """
        获取每页信息
        :param respones: 返回页面链接
        """

        res = response.xpath(
            '//ul[@class="teasers list organic"]/li/div[2]/h3/a/@href'
        ).extract()
        if res:
            for detail_url in res:
                yield scrapy.Request(url=detail_url,
                                     callback=self.parse_page_detail)

            page = response.meta.get('page') + 1
            meta_url = response.meta.get('url')
            url = meta_url + '?page={}'.format(page)
            yield scrapy.Request(url=url,
                                 callback=self.parse_all_url,
                                 meta={
                                     'page': page,
                                     'url': meta_url
                                 })

    def parse_page_detail(self, response):
        """
        解析页面详情
        :return: item
        """
        content_by_xpath = parse_item.parse_response(self.urls_data['tag'],
                                                     response)
        # Populate fields that are not obtained by parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath,
                                             self.urls_data['site'])
        data['expertDV'] = parse_item.parse_expert_DV(response,
                                                      data['expertDV'])
        parse_item.parse_check_data(data)
        item = ThinkTankItem()
        item['data'] = data
        item['site'] = self.urls_data['site']
        item['tag'] = self.urls_data['tag']
        yield item
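
Building page URLs with base_url + '?page={}' works only because Rand's topic URLs carry no query string of their own. A more general helper would merge the parameter with urllib.parse; a sketch under that assumption (with_page is hypothetical, not part of the project):

    from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

    def with_page(url, page):
        # Add or replace a "page" query parameter, whether or not the
        # base URL already has a query string.
        parts = urlparse(url)
        query = parse_qs(parts.query)
        query['page'] = [str(page)]
        return urlunparse(parts._replace(query=urlencode(query, doseq=True)))

    # with_page('https://www.rand.org/topics/health.html', 2)
    # -> 'https://www.rand.org/topics/health.html?page=2'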
Example #8
class BrookingsSpider(scrapy.Spider):
    urls_data = start_item.get_url('brookings')
    name = urls_data['tag']
    allowed_domains = [urls_data['site']]
    start_urls = urls_data['url']

    def parse(self, response):
        """
        解析主页面
        :param response: 二级导航链接
        """
        second_navi_urls = response.xpath(
            '//div[@class="post-linear-list term-list topic-list-wrapper"][1]//ul/li/a/@href').extract()
        for second_navi_url in second_navi_urls:
            yield scrapy.Request(second_navi_url, callback=self.parse_second_navi, meta={'base_url': second_navi_url})

    def parse_second_navi(self, response):
        """
        解析二级导航
        :param response: 返回二级导航链接
        """
        base_url = response.meta.get('base_url')
        classify_urls = base_url + 'page/{}/'.format(2)
        yield scrapy.Request(classify_urls, callback=self.parse_topic_page, meta={'page': 2, 'url': base_url})

    def parse_topic_page(self, response):
        """
        解析主题
        :param response: 返回分类下每页链接
        """
        classify_page_urls = response.xpath(
            '//div[@class="list-content"]/article/a/@href | //div[@class="list-content"]/article/div/h4/a/@href'
        ).extract()
        if classify_page_urls:
            for page_url in classify_page_urls:
                yield scrapy.Request(page_url, callback=self.parse_page_detail, meta={'get_image': True})
            # Only paginate while the listing still yields links; unconditional
            # pagination would keep requesting next pages forever.
            page = response.meta.get('page') + 1
            meta_url = response.meta.get('url')
            page_next = meta_url + 'page/{}/'.format(page)
            yield scrapy.Request(page_next, callback=self.parse_topic_page,
                                 meta={'page': page, 'url': meta_url})

    def parse_page_detail(self, response):
        """
        解析页面详情
        """
        # 通过获取数据库对应xpath解析对应字段

        content_by_xpath = parse_item.parse_response(self.urls_data['site'], response)
        content_by_xpath['svg_data'] = []
        if content_by_xpath['svg_url']:
            content_by_xpath['svg_data'].append(
                parse_item.parse_svg_url(content_by_xpath['svg_url']))
        # Populate fields that are not obtained by parsing.
        parse_item.processing_data(content_by_xpath)
        data = parse_item.parse_common_field(response, content_by_xpath, self.urls_data['site'])
        # Create a fresh item per response; the shared class-level item that
        # was used here would be mutated concurrently by parallel callbacks.
        item = ThinkTankItem()
        item['data'] = data
        item['tag'] = self.urls_data['tag']
        item['site'] = self.urls_data['site']
        yield item
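
Every spider in this collection fills the same three fields on ThinkTankItem, so the item class presumably looks roughly like the sketch below; the field names are taken from the code, the definition itself is an assumption:

    import scrapy

    class ThinkTankItem(scrapy.Item):
        data = scrapy.Field()  # dict produced by parse_item.parse_common_field
        site = scrapy.Field()  # source site, e.g. urls_data['site']
        tag = scrapy.Field()   # spider tag; also doubles as the spider name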