def parse_article(self, response):
    """Build an Article from a San Francisco Chronicle article page.

    The body text is collected from every text node under
    ``div.article-body`` and the author from every text node under
    ``p.byline``. Title, URL and publication date are read from
    ``response.meta`` (keys ``title``, ``url`` and ``date``).

    Args:
        response: The page response; must provide ``css()`` and ``meta``.

    Yields:
        A single ``Article`` populated from the page and the request meta.
    """
    # Strip each text node first and only then drop the empty ones:
    # whitespace-only nodes would otherwise survive the filter and leave
    # runs of spaces in the joined string.
    body_nodes = response.css('div.article-body *::text').extract()
    body_parts = [node.strip() for node in body_nodes if node]
    text = ' '.join(part for part in body_parts if part)

    byline_nodes = response.css('p.byline *::text').extract()
    byline_parts = [node.strip() for node in byline_nodes if node]
    author = ' '.join(part for part in byline_parts if part)

    yield Article(
        title=response.meta.get('title', ''),
        url=response.meta.get('url', ''),
        # Falls back to the time of parsing when no date was passed along.
        published=response.meta.get('date', datetime.now()),
        content=text,
        author=author,
        source='San Francisco Chronicle',
    )
def parse_article(self, response):
    """Build an Article from an LA Times article page.

    The body text is collected from every text node under
    ``div.pb-f-article-body div[data-type=text]`` and the author from every
    text node under ``span.byline``. Title, URL and publication date are
    read from ``response.meta`` (keys ``title``, ``url`` and ``date``).

    Args:
        response: The page response; must provide ``css()`` and ``meta``.

    Yields:
        A single ``Article`` populated from the page and the request meta.
    """
    # Strip each text node first and only then drop the empty ones:
    # whitespace-only nodes would otherwise survive the filter and leave
    # runs of spaces in the joined string.
    body_nodes = response.css(
        'div.pb-f-article-body div[data-type=text] *::text').extract()
    body_parts = [node.strip() for node in body_nodes if node]
    text = ' '.join(part for part in body_parts if part)

    byline_nodes = response.css('span.byline *::text').extract()
    byline_parts = [node.strip() for node in byline_nodes if node]
    author = ' '.join(part for part in byline_parts if part)

    yield Article(
        title=response.meta.get('title', ''),
        url=response.meta.get('url', ''),
        # Falls back to the time of parsing when no date was passed along.
        published=response.meta.get('date', datetime.now()),
        content=text,
        author=author,
        source='LA Times',
    )
def parse_article(self, response):
    """Build an Article from a San Diego Union Tribune article page.

    The body text is collected from every text node under
    ``div[itemprop=articleBody]`` and the author from every text node under
    ``div.trb_ar_by``. Title, URL and publication date are read from
    ``response.meta`` (keys ``title``, ``url`` and ``date``).

    Args:
        response: The page response; must provide ``css()`` and ``meta``.

    Yields:
        A single ``Article`` populated from the page and the request meta.
    """
    # Strip each text node first and only then drop the empty ones:
    # whitespace-only nodes would otherwise survive the filter and leave
    # runs of spaces in the joined string.
    body_nodes = response.css('div[itemprop=articleBody] *::text').extract()
    body_parts = [node.strip() for node in body_nodes if node]
    text = ' '.join(part for part in body_parts if part)

    byline_nodes = response.css('div.trb_ar_by *::text').extract()
    byline_parts = [node.strip() for node in byline_nodes if node]
    author = ' '.join(part for part in byline_parts if part)

    yield Article(
        title=response.meta.get('title', ''),
        url=response.meta.get('url', ''),
        # Falls back to the time of parsing when no date was passed along.
        published=response.meta.get('date', datetime.now()),
        content=text,
        author=author,
        source='San Diego Union Tribune',
    )
def parse_article(self, response):
    """Build an Article from a Sacramento Bee article page.

    The body text is collected from every text node under
    ``div#content-body`` and the author from every text node under
    ``div.byline``. Title, URL and publication date are read from
    ``response.meta`` (keys ``title``, ``url`` and ``date``).

    Args:
        response: The page response; must provide ``css()`` and ``meta``.

    Yields:
        A single ``Article`` populated from the page and the request meta.
    """
    # Strip each text node first and only then drop the empty ones:
    # whitespace-only nodes would otherwise survive the filter and leave
    # runs of spaces in the joined string.
    body_nodes = response.css('div#content-body *::text').extract()
    body_parts = [node.strip() for node in body_nodes if node]
    text = ' '.join(part for part in body_parts if part)

    byline_nodes = response.css('div.byline *::text').extract()
    byline_parts = [node.strip() for node in byline_nodes if node]
    author = ' '.join(part for part in byline_parts if part)

    yield Article(
        title=response.meta.get('title', ''),
        url=response.meta.get('url', ''),
        # Falls back to the time of parsing when no date was passed along.
        published=response.meta.get('date', datetime.now()),
        content=text,
        author=author,
        source='Sacramento Bee',
    )