Exemple #1
0
    def parse_content(self, response):
        """Build a ``GovCnItem`` from an article page and yield it.

        ``datePublished``, ``headline`` and ``Bigtype`` are taken from
        ``response.meta`` (keys ``time``, ``title`` and ``Bigtype``).  The
        article text and image URLs are read from ``td.b12c``; when that
        yields no text, ``div#UCAP-CONTENT`` is used instead.  The copyright
        holder is the text after the first ``:`` in the first
        ``span.font`` text node, or ``''`` when it cannot be determined.

        :param response: the downloaded page, with the meta values above
            set by the requesting callback.
        :return: a generator yielding one populated ``GovCnItem``.
        """
        item = GovCnItem()
        # 'time' may be missing from meta; fall back to an empty string
        # rather than failing on None.strip().
        item['datePublished'] = (response.meta.get('time') or '').strip()
        item['headline'] = response.meta.get('title')
        item['Bigtype'] = response.meta.get('Bigtype')

        # Try each known content container in order and stop at the first one
        # that produces body text.  If none does, the values from the last
        # container are kept.
        for container in ('td.b12c', 'div#UCAP-CONTENT'):
            item['articleBody'] = response.css(container).xpath(
                'string(.)').extract_first()
            item['annex'] = [
                urlparse.urljoin(response.url, each_pic)
                for each_pic in response.css(
                    container + ' img::attr(src)').extract()
            ]
            if item['articleBody']:
                break

        # extract_first() returns None when nothing matches, and the text may
        # not contain a separator; both cases give an empty copyright holder.
        source_text = response.css('span.font::text').extract_first()
        source_parts = source_text.split(u':') if source_text else []
        if len(source_parts) > 1:
            item['copyrightHolder'] = source_parts[1].strip()
        else:
            item['copyrightHolder'] = ''
        item['url'] = response.url

        yield item
Exemple #2
0
    def parse_content(self, response):
        """Build a ``GovCnItem`` from a ``div#UCAP-CONTENT`` page and yield it.

        ``headline`` and ``Bigtype`` come from ``response.meta``; the article
        text and absolute image URLs come from ``div#UCAP-CONTENT``.
        ``datePublished`` and ``copyrightHolder`` are always set to ``''``.

        :param response: the downloaded page.
        :return: a generator yielding one populated ``GovCnItem``.
        """
        item = GovCnItem()
        meta = response.meta
        page_url = response.url

        item['datePublished'] = ''
        item['headline'] = meta.get('title')
        item['Bigtype'] = meta.get('Bigtype')

        # Flatten the whole content container into a single text string.
        body_nodes = response.css('div#UCAP-CONTENT').xpath('string(.)')
        item['articleBody'] = body_nodes.extract_first()

        # Resolve every image source against the page URL.
        image_links = []
        for src in response.css('div#UCAP-CONTENT img::attr(src)').extract():
            image_links.append(urlparse.urljoin(page_url, src))
        item['annex'] = image_links

        item['copyrightHolder'] = ''
        item['url'] = page_url

        yield item