def parse_content(self, response):
    """Build one ``GovCnItem`` from an article page and yield it.

    The article text and its image URLs are read from ``td.b12c``; when
    that container yields no text, ``div#UCAP-CONTENT`` is used instead
    for both fields.

    Args:
        response: Page response. ``response.meta`` is read for the keys
            ``'time'``, ``'title'`` and ``'Bigtype'`` (presumably set by
            the callback that scheduled this request -- not visible here).

    Yields:
        A single ``GovCnItem`` with ``datePublished``, ``headline``,
        ``Bigtype``, ``articleBody``, ``annex``, ``copyrightHolder`` and
        ``url`` filled in. ``articleBody`` is ``None`` when neither
        container matches; ``datePublished`` and ``copyrightHolder`` fall
        back to ``''`` when their source data is missing.
    """
    item = GovCnItem()
    # A missing 'time' entry must not abort the callback and drop the item.
    item['datePublished'] = (response.meta.get('time') or '').strip()
    item['headline'] = response.meta.get('title')
    item['Bigtype'] = response.meta.get('Bigtype')

    # Try each candidate container in order; stop at the first one that
    # yields text. If none does, the values from the last candidate are kept.
    body_text = None
    image_urls = []
    for container_css in ('td.b12c', 'div#UCAP-CONTENT'):
        body_text = response.css(container_css).xpath(
            'string(.)').extract_first()
        image_urls = [
            urlparse.urljoin(response.url, image_src)
            for image_src in response.css(
                container_css + ' img::attr(src)').extract()
        ]
        if body_text:
            break
    item['articleBody'] = body_text
    item['annex'] = image_urls

    # The holder is whatever follows the first colon in the span's text.
    # NOTE(review): this splits on an ASCII colon -- confirm the pages do not
    # use a full-width colon, in which case this would always fall back to ''.
    source_text = response.css('span.font::text').extract_first()
    try:
        item['copyrightHolder'] = source_text.split(u':')[1].strip()
    except (AttributeError, IndexError):
        # AttributeError: no matching span (source_text is None).
        # IndexError: the text contains no colon.
        item['copyrightHolder'] = ''

    item['url'] = response.url
    yield item
def parse_content(self, response):
    """Yield a single ``GovCnItem`` scraped from ``div#UCAP-CONTENT``.

    ``headline`` and ``Bigtype`` are copied from ``response.meta`` (keys
    ``'title'`` and ``'Bigtype'``). ``datePublished`` and
    ``copyrightHolder`` are always set to the empty string here.
    ``articleBody`` is the string value of the first matching container
    (``None`` if there is none) and ``annex`` lists the container's image
    ``src`` values resolved against the page URL.
    """
    page_url = response.url
    request_meta = response.meta

    content_text = response.css('div#UCAP-CONTENT').xpath(
        'string(.)').extract_first()

    # Resolve every image source against the page URL.
    attachments = []
    for image_src in response.css(
            'div#UCAP-CONTENT img::attr(src)').extract():
        attachments.append(urlparse.urljoin(page_url, image_src))

    item = GovCnItem()
    item['datePublished'] = ''
    item['headline'] = request_meta.get('title')
    item['Bigtype'] = request_meta.get('Bigtype')
    item['articleBody'] = content_text
    item['annex'] = attachments
    item['copyrightHolder'] = ''
    item['url'] = page_url
    yield item