def parse_article(self, response):
    """Parse an article page, yielding the news item and a follow-up
    request that fetches the article's comments.

    :param response: scrapy Response for a single article page
    """
    # get title
    title = response.css('h1.gp-coluna.col-8.c-titulo ::text').extract_first()
    # get sub_title
    sub_title = response.css('h2.c-sumario ::text').extract_first()
    # get author
    author = response.css('li.c-autor span::text').extract_first()
    # get date (site's textual date -> timestamp via helper)
    date = self.format_date(
        response.css('li.data-publicacao time::text').extract_first())
    # get section
    section = response.css('a.c-nome-editoria span::text').extract_first()
    # get text: join all paragraph fragments in one pass
    # (the original += loop is quadratic on long articles)
    text = "".join(
        response.css(
            'div.gp-coluna.col-6.texto-materia.paywall-google p::text'
        ).extract())
    # get comments: the page embeds a token required by the comments
    # webservice; it is also kept on self for later use
    self.token = response.css(
        'div.sociabilizacao-load-area ::attr(data-token)').extract_first()
    link_comments = ('https://live.gazetadopovo.com.br/webservice/comentario/'
                     'abaComentarios?comentario=&token=' + self.token)
    yield response.follow(link_comments, self.parse_comments)
    article = CrawlerNewsItem(title=title, sub_title=sub_title, author=author,
                              date=date, text=text, section=section,
                              _id=response.request.url)
    yield article
def parse_article(self, response):
    """Parse an article page, yielding the news item and a follow-up
    request for the article's comments JSON endpoint.

    :param response: scrapy Response for a single article page
    """
    # get title
    title = response.css('h1.article__title::text').extract_first()
    # get sub_title
    sub_title = response.css('h2.article__subtitle::text').extract_first()
    # get article's date; str() keeps the original guard where a missing
    # date becomes the string 'None' instead of raising inside format_date
    date = self.format_date(
        str(response.css('div.article__date::text').extract_first()))
    # get author
    author = response.css('div.article__author::text').extract_first()
    # get text: join all paragraph fragments in one pass
    # (the original += loop is quadratic on long articles)
    text = "".join(
        response.css(
            'div.article__content-container.protected-content p::text'
        ).extract())
    # get section
    section = response.css(
        'div.site-header__section-name a::text').extract_first()
    # the article id is the last dash-separated segment of the URL;
    # compute it once instead of re-indexing with len(...)-1
    article_id = response.request.url.split('-')[-1]
    news = CrawlerNewsItem(_id=article_id, title=title, sub_title=sub_title,
                           date=date, author=author, text=text,
                           section=section, url=response.request.url)
    yield news
    # get comments by json
    yield response.follow(
        'https://oglobo.globo.com/ajax/comentario/buscar/' + article_id +
        '/1.json', self.parse_comments)
def parse_article(self, response):
    """Parse an article page and yield a CrawlerNewsItem.

    :param response: scrapy Response for a single article page
    """
    # get title
    title = response.css('h1.c-content-head__title::text').extract_first()
    # get sub_title
    sub_title = response.css(
        'h2.c-content-head__subtitle::text').extract_first()
    # get article's date: transform from ISO date to a Unix timestamp.
    # NOTE(review): strftime('%s') is a platform-specific (glibc)
    # extension, not portable — confirm the deployment target supports it
    date = dateutil.parser.parse(
        response.css('time.c-more-options__published-date::attr(datetime)'
                     ).extract_first()).strftime('%s')
    # get author
    author = response.css('strong.c-signature__author::text').extract_first()
    # get text: join all paragraph fragments in one pass
    # (the original += loop is quadratic on long articles)
    text = "".join(
        response.xpath("//div[@class='c-news__body']/p//text()").extract())
    # get section
    section = response.css(
        'li.c-site-nav__item.c-site-nav__item--section a::text'
    ).extract_first()
    article = CrawlerNewsItem(_id=response.request.url, title=title,
                              sub_title=sub_title, date=date, author=author,
                              text=text, section=section,
                              url=response.request.url)
    yield article
def parse_article(self, response):
    """Parse an article page and yield a CrawlerNewsItem.

    :param response: scrapy Response for a single article page
    """
    # get title
    title = response.css('h1.eltdf-title-text ::text').extract_first()
    # get sub_title
    sub_title = response.css('div.wpb_wrapper h3::text').extract_first()
    # get article's date (site's textual date -> timestamp via helper)
    date = self.format_date(
        response.css('div.eltdf-post-info-date.entry-date.updated a::text'
                     ).extract_first())
    # get author
    author = response.css(
        'a.eltdf-post-info-author-link ::text').extract_first()
    # get text: join all paragraph fragments in one pass
    # (the original += loop is quadratic on long articles)
    text = "".join(response.css('div.eltdf-post-text p::text').extract())
    # get section
    section = response.css(
        'div.eltdf-post-info-category a::text').extract_first()
    news = CrawlerNewsItem(title=title, sub_title=sub_title, date=date,
                           author=author, text=text, section=section,
                           _id=response.request.url)
    yield news
def parse_article(self, response):
    """Parse an article page and yield a CrawlerNewsItem.

    :param response: scrapy Response for a single article page
    """
    # get title
    title = response.css('h1.n--noticia__title::text').extract_first()
    # get sub_title
    sub_title = response.css(
        'h2.n--noticia__subtitle::text').extract_first()
    # get article's date and transform it to a timestamp via the helper
    dt_article = response.css(
        'div.n--noticia__state-desc p::text').extract_first()
    dt_article = self.format_date(dt_article)
    # get article's section
    section = response.css(
        'div.header-current-page.cor-e a::text').extract_first()
    # get author
    # NOTE(review): author is extracted but never stored on the item —
    # confirm whether CrawlerNewsItem should receive it here
    author = response.css(
        'div.n--noticia__state-title::text').extract_first()
    # get text: join all paragraph fragments in one pass
    # (the original += loop is quadratic on long articles)
    text_article = "".join(
        response.css('div.n--noticia__content.content p::text').extract())
    article = CrawlerNewsItem(_id=response.request.url, title=title,
                              sub_title=sub_title, date=dt_article,
                              text=text_article, section=section,
                              url=response.request.url)
    yield article
def parse_article(self, response):
    """Parse an article page and yield a CrawlerNewsItem.

    :param response: scrapy Response for a single article page
    """
    # get title
    title = response.css('h1.articulo-titulo ::text').extract_first()
    # get sub_title
    sub_title = response.css(
        'h2.articulo-subtitulo ::text').extract_first()
    # get article's date: transform from ISO date to a Unix timestamp.
    # NOTE(review): strftime('%s') is a platform-specific (glibc)
    # extension, not portable — confirm the deployment target supports it
    date = dateutil.parser.parse(
        response.css('time.articulo-actualizado ::attr(datetime)'
                     ).extract_first()).strftime('%s')
    # get author
    author = response.css('span.autor-nombre a::text').extract_first()
    # get text: join all paragraph fragments in one pass
    # (the original += loop is quadratic on long articles)
    text = "".join(response.css('div.articulo-cuerpo p::text').extract())
    # get section
    section = response.css('a.enlace span::text').extract_first()
    news = CrawlerNewsItem(title=title, sub_title=sub_title, date=date,
                           author=author, text=text, section=section,
                           _id=response.request.url)
    yield news
def parse_news(self, response):
    """Build a CrawlerNewsItem for a news page by delegating each field
    to its dedicated ``_parse_*`` helper.

    :param response: scrapy Response for a news page
    :returns: the populated CrawlerNewsItem
    """
    item = CrawlerNewsItem()
    # dict literals evaluate in source order, so the helpers run in the
    # same sequence as before
    fields = {
        'url': response.url,
        'article_from': self.name,
        'article_type': 'news',
        'title': self._parse_title(response),
        'publish_date': self._parse_publish_date(response),
        'authors': self._parse_authors(response),
        'tags': self._parse_tags(response),
        'text': self._parse_text(response),
        'text_html': self._parse_text_html(response),
        'images': self._parse_images(response),
        'video': self._parse_video(response),
        'links': self._parse_links(response),
    }
    for key, value in fields.items():
        item[key] = value
    return item
def parse_article(self, response):
    """Parse an article page, yielding the news item followed by one
    CrawlerNewsCommentItem per comment found on the page.

    :param response: scrapy Response for a single article page
    """
    # get title
    title = response.css('h1.article-title::text').extract_first()
    # get sub_title
    sub_title = response.css('h2.article-subtitle::text').extract_first()
    # get article's date (site's textual date -> timestamp via helper)
    date = self.format_date(
        response.css('div.article-date span::text').extract_first())
    # get author
    author = response.css('div.article-author span::text').extract_first()
    # get text: join all paragraph fragments in one pass
    # (the original += loop is quadratic on long articles)
    text = "".join(
        response.xpath(
            "//section[@class='article-content']/p//text()").extract())
    # get section
    section = response.css('div.article-category a::text').extract_first()
    news = CrawlerNewsItem(_id=response.request.url, title=title,
                           sub_title=sub_title, date=date, author=author,
                           text=text, section=section,
                           url=response.request.url)
    yield news
    # get comments: zip pairs each comment's text, date and author;
    # extra entries in any one selector are silently dropped by zip
    for (text_comment, dt_comment, author_comment) in zip(
            response.css('div.comment-text p::text'),
            response.css('span.comment-meta.comment-metadata a::text'),
            response.css('div.comment-author.vcard cite::text')):
        comment = CrawlerNewsCommentItem(
            # transform the comment's date to a timestamp via the helper
            date=self.format_date(dt_comment.extract()),
            author=author_comment.extract(),
            text=text_comment.extract(),
            id_article=response.request.url)
        yield comment
def parse_article(self, response):
    """Parse an article page, yielding the article item followed by one
    CrawlerNewsCommentItem per comment found on the page.

    :param response: scrapy Response for a single article page
    """
    # get title
    title = response.css('h1::text').extract_first()
    # get article's date: transform from ISO date to a Unix timestamp.
    # NOTE(review): strftime('%s') is a platform-specific (glibc)
    # extension, not portable — confirm the deployment target supports it
    dt_article = response.css(
        'time.entry-date.published::attr(datetime)').extract_first()
    dt_article = dateutil.parser.parse(dt_article).strftime('%s')
    # get article's section
    section = response.css('span.categoria a::text').extract_first()
    # get text: join all paragraph fragments in one pass
    # (the original += loop is quadratic on long articles)
    text_article = "".join(
        response.xpath("//div[@class='entry-content']/p//text()").extract())
    article = CrawlerNewsItem(_id=response.request.url, title=title,
                              date=dt_article, text=text_article,
                              section=section)
    yield article
    # get comments: zip pairs each comment's text, date and author;
    # extra entries in any one selector are silently dropped by zip
    for (text_comment, dt_comment, author_comment) in zip(
            response.css('div.comment-content p::text'),
            response.css('div.comment-metadata time::attr(datetime)'),
            response.css('div.comment-author.vcard b::text')):
        comment = CrawlerNewsCommentItem(
            # transform the comment's ISO date to a timestamp
            date=dateutil.parser.parse(
                dt_comment.extract()).strftime('%s'),
            author=author_comment.extract(),
            text=text_comment.extract(),
            id_article=response.request.url)
        yield comment