def get_details(self, response):
    """Given a response from a news page, return an item containing
    all the data requested in the lab."""
    item = RiLab01Item()
    title = response.css('h1::text').get(default='').strip()
    paragraphs = response.css('p').xpath('string()').getall()
    sub_title = paragraphs[2]
    try:
        author = response.css('strong').xpath('string()').get().replace("-", "").strip()
    except AttributeError:
        # No <strong> byline found; fall back to the byline paragraph.
        author = paragraphs[5].split("-")[0].strip()
    date = paragraphs[0]
    hour = response.css('p.meta::text').get().strip().split()[-1]
    section = response.css('body::attr(id)').get().split("-")[-1]
    text = paragraphs[5:]
    text.pop()  # drop the last paragraph (footer boilerplate)
    text = self.text_formater(text)

    item['title'] = title
    item['sub_title'] = sub_title
    item['author'] = author
    item['date'] = self.date_formater(date, hour)
    item['section'] = section
    item['text'] = text
    item['url'] = response.url
    yield item
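# Every callback in this file fills a RiLab01Item. The item class itself is
# defined elsewhere in the project (its items.py); a minimal sketch of what it
# presumably looks like, assuming plain scrapy.Field() declarations for each
# key assigned in the callbacks below, is:
#
import scrapy


class RiLab01Item(scrapy.Item):
    # One Field per key used by the parse callbacks of this lab.
    _id = scrapy.Field()
    title = scrapy.Field()
    sub_title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    section = scrapy.Field()
    text = scrapy.Field()
    url = scrapy.Field()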
def parse(self, response):
    if response.url in self.start_urls:
        # NAV: follow every article link on the listing page.
        links = response.css('h3.td-module-title a::attr(href)').getall()
        for link in links:
            yield response.follow(link, callback=self.parse)
    else:
        # PARSE: extract the article fields.
        item = RiLab01Item()
        item['_id'] = self.id
        item['author'] = response.css('div.td-post-author-name a::text').get()
        item['date'] = response.css('span.td-post-date time::attr(datetime)').get()
        item['title'] = response.css('title::text').get()
        item['sub_title'] = 'N/A'
        item['section'] = self.getSection(response)
        item['text'] = self.getText(response)
        item['url'] = response.url
        self.id = self.id + 1
        yield item

        if self.id < 200:
            # NAV: move on to the next/previous article, if there is one.
            link = response.css('div.td-post-next-prev-content a::attr(href)').get()
            if link:
                yield response.follow(link, callback=self.parse)
def create_news_item(self, response):
    date = response.css('span.td-post-date time::attr(datetime)').get()
    news_date = self.get_datetime(date)
    threshold_date = datetime.datetime(2018, 1, 1)
    # Check whether the news item is recent enough.
    if news_date > threshold_date:
        url = response.url
        news_date = self.format_date(news_date)
        title = response.css('header h1::text').get()
        author = response.css('div.td-post-author-name a::text').get()
        category = response.meta.get('category')
        text = response.css('div.td-post-content span.s1::text').getall()
        # Try to get the news text from a different tag.
        if not text:
            text = response.css('div.td-post-content p::text').getall()
        donation_paragraph = response.css('p.donation_paragraph::text').get()
        if text and text[-1] == donation_paragraph:
            # Remove the donation paragraph.
            del text[-1]
        # NOTE: we don't have the sub_title information in the news.
        return RiLab01Item(title=title, author=author, date=news_date,
                           text=text, url=url, section=category)
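# get_datetime and format_date are used above but not defined in this file.
# A hypothetical sketch of the pair, assuming the <time datetime="..."> value
# is ISO-8601 and the lab output wants dd/mm/yyyy hh:mm (the real helpers may
# differ); they would be methods of the spider, shown standalone here:
#
import datetime


def get_datetime(self, date):
    # Assumption: the attribute looks like "2019-03-14T10:30:00-03:00";
    # keep only the first 19 characters and parse them.
    return datetime.datetime.strptime(date[:19], '%Y-%m-%dT%H:%M:%S')


def format_date(self, news_date):
    # Assumption: render the parsed datetime as "14/03/2019 10:30".
    return news_date.strftime('%d/%m/%Y %H:%M')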
def parse_per_page(self, response):
    subtitle = response.css("div.c-overhead ::text").extract_first()
    title = response.css("h1.c-title ::text").extract_first()
    date = response.css("div.c-credits li:nth-child(3) ::text").extract_first()
    author = response.css("div.c-credits li:nth-child(1) ::text").extract_first()
    section = response.css("li.c-title-content a ::text").extract_first()
    url = response.url

    # Some pages keep the date in the second credit entry instead of the third.
    if date is None or date[0] != "[":
        date = response.css("div.c-credits li:nth-child(2) ::text").extract_first()
    date = datetime.strptime(str(date[1:11]), '%d/%m/%Y')

    all_p = response.css("div.paywall-google p ::text")
    text = ""
    for p in all_p:
        text += p.extract()
    text = text.replace(',', '')

    itemLab = RiLab01Item(title=title, author=author, url=url,
                          sub_title=subtitle.capitalize(), date=date,
                          section=section, text=text)
    yield itemLab
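# The slice date[1:11] above assumes the credit line starts with the date in
# square brackets, e.g. "[14/03/2019 10h30] ...". A quick check of that parsing
# step (the sample string is illustrative, not taken from the site):
#
from datetime import datetime

sample = "[14/03/2019 10h30] Atualizado"      # illustrative credit line
parsed = datetime.strptime(sample[1:11], '%d/%m/%Y')
assert parsed == datetime(2019, 3, 14)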
def parse_news_page(self, response):
    """Parse the news page and extract text, title, subtitle, author,
    section (the section is part of the url), date and url data using a
    loader object; see the documentation for details
    (https://docs.scrapy.org/en/latest/topics/loaders.html).

    :param response: the html response downloaded by scrapy
    :return scrapy.Item: the item with the extracted fields
    """
    loader = RiLab01Loader(item=RiLab01Item(), response=response)
    url = response.url
    loader.add_value('_id', response.meta.get('page_count'))
    loader.add_css('title', '.c-titulo::text')
    loader.add_css('title', '.c-title::text')
    loader.add_css('sub_title', '.c-sobretitulo span::text')
    loader.add_css('sub_title', '.c-overhead span::text')
    loader.add_css('author', '[class*="autor"] span::text')
    loader.add_css('author', '.item-agency::text')
    loader.add_css('author', '.item-name span::text')
    loader.add_css('date', '.data-publicacao time::text')
    loader.add_value('section', url.split('/')[3])
    loader.add_css('text', '.paywall-google > p::text')
    loader.add_value('url', url)
    write_in_frontier(loader)
    return loader.load_item()
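# RiLab01Loader itself is not shown in this file. A minimal sketch of what it
# might look like, assuming Scrapy's built-in processors; the processor choices
# and the Join separator are illustrative, not the project's actual definition.
# With TakeFirst as the output processor, calling add_css twice for the same
# field (as done above) acts as a fallback chain: the first selector that
# matches wins.
#
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst


class RiLab01Loader(ItemLoader):
    # Strip whitespace on the way in, keep only the first match on the way out.
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()
    # The article body arrives as a list of <p> fragments; join them.
    text_out = Join('\n')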
def parse_detalhe_materia(self, response):
    item = RiLab01Item()
    item['author'] = self.formata_autor(
        response.css('section p strong::text, strong a::text').get())
    item['title'] = response.css('h1::text').get()
    item['sub_title'] = response.xpath(
        '//p[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]/text()').get()
    item['date'] = self.formata_data(
        response.xpath(
            '//*[contains(concat( " ", @class, " " ), concat( " ", "meta", " " ))]/text()').get())
    item['section'] = response.url.split('/')[5]
    item['text'] = self.formata_texto(
        response.css(
            '.entry p::text, p span::text, p a::text, entry span::text, strong::text').getall(),
        response.css('section p strong::text, strong a::text').get())
    item['url'] = response.url
    yield item
def extract_data(self, response):
    output = RiLab01Item()
    self.id += 1
    self.log('\n\n\n\nID: %s\n' % self.id)
    output['_id'] = str(self.id)
    output['title'] = self.get_tile(response)
    output['sub_title'] = self.get_desc(response)
    output['author'] = self.get_author(response)
    output['date'] = self.get_date(response)
    output['section'] = self.get_section(response)
    output['text'] = self.get_text(response)
    output['url'] = response.url
    yield output
def _br_247_callback(self, response):
    output = RiLab01Item()
    output['_id'] = self.current_id
    output['title'] = response.css('h1::text').get().replace('\n', '')
    output['sub_title'] = response.xpath(
        '//p[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]/text()'
    ).get().replace('\n', '')
    output['author'] = self._get_author(
        response.css('section p strong::text, strong a::text').get())
    output['date'] = response.xpath(
        '//*[contains(concat( " ", @class, " " ), concat( " ", "meta", " " ))]/text()'
    ).get()
    output['section'] = response.url.split('/')[5]
    # getall() returns a list, so strip the newlines from each fragment.
    output['text'] = [
        fragment.replace('\n', '')
        for fragment in response.css(
            '.entry p::text, p span::text, p a::text, entry span::text, strong::text'
        ).getall()
    ]
    output['url'] = response.url
    self.current_id += 1
    yield output
def parseNoticia(self, response):
    if self.permitsCrawl(response.url):
        # Links already 'inside' the news page.
        links = response.xpath('//a/@href').getall()
        titulo_noticia = response.xpath('//h1/text()').get()

        with open('frontier/diariodocentrodomundo.json', 'r') as frontier:
            # Read to generate the news id -> the order in which it was added
            # to the frontier.
            frontier_data = json.load(frontier)

        noticia_loader = ItemLoader(item=RiLab01Item(), response=response)
        noticia_loader.add_value('_id', len(frontier_data) + 1)
        noticia_loader.add_xpath('title', '//h1/text()')
        noticia_loader.add_value(
            'sub_title',
            'Noticias não tem subtitle no Diario do Centro do Mundo')
        noticia_loader.add_xpath(
            'author', '//div[@class="td-post-author-name"]/a/text()')
        date = response.xpath('//time/@datetime').get()
        noticia_loader.add_value('date', date)
        noticia_loader.add_value('section', 'Not specified on page!')
        noticia_loader.add_xpath(
            'text', '//div[@class="td-post-content td-pb-padding-side"]/p/text()')
        noticia_loader.add_value('url', response.url)
        item = noticia_loader.load_item()
        yield item

        # Add the news link to the frontier.
        with open('frontier/diariodocentrodomundo.json', 'r') as frontier:
            frontier_data = json.load(frontier)
        frontier_data[titulo_noticia] = response.url
        with open('frontier/diariodocentrodomundo.json', 'w') as frontier:
            json.dump(frontier_data, frontier)

        for link in links:
            if self.permitsCrawl(link):
                # Recursively request each allowed link with this same callback.
                yield scrapy.Request(link, self.parseNoticia)
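# permitsCrawl is referenced above but not defined in this file. One possible
# sketch, assuming it simply keeps the crawl on the Diario do Centro do Mundo
# domain; the actual filter may also check path patterns or the frontier:
#
def permitsCrawl(self, url):
    # Hypothetical rule: only follow absolute links on the target domain.
    return isinstance(url, str) and 'diariodocentrodomundo.com.br' in url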
def parse_article_detail(self, response):
    """
    Crawls an article page and extracts information from it.

    :param response: HTML response of the article page
    :return: Item to include in the CSV
    """
    item = RiLab01Item()
    item['title'] = response.css('h1::text').get()
    item['sub_title'] = response.xpath(
        '//p[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]/text()'
    ).get()
    formatted_author = self.format_author(
        response.css('section p strong::text, strong a::text').get())
    item['author'] = formatted_author
    formatted_date = self.format_date(
        response.xpath(
            '//*[contains(concat( " ", @class, " " ), concat( " ", "meta", " " ))]/text()'
        ).get())
    item['date'] = formatted_date
    item['section'] = response.url.split('/')[5]
    formatted_text = self.format_text(
        response.css(
            '.entry p::text, p span::text, p a::text, entry span::text, strong::text'
        ).getall())
    item['text'] = formatted_text
    item['url'] = response.url
    yield item