def parse_article_grouped(self, selector, response):
    """Build a DiariosItem from one grouped-article card selector."""
    item_loader = ItemLoader(DiariosItem(), selector=selector)
    field_paths = (
        ('title', './h2/text()'),
        ('author', './span[@class="author-name"]/text()'),
        # The link wrapping the card lives on an ancestor <div>.
        ('url', './ancestor::div/a/@href'),
    )
    for field, xpath in field_paths:
        item_loader.add_xpath(field, xpath)
    return item_loader.load_item()
def parse_article(self, selector, response):
    """Load title/author/url/site from one ld-card listing entry.

    NOTE(review): 'author' and 'site' reuse the title link's text()
    xpath — presumably the card exposes only the title text; confirm
    against the live page markup.
    """
    card_link = ("./div[contains(@class,'ld-card__body')]"
                 "/h3[contains(@class,'ld-card__title')]/a")
    loader = ItemLoader(DiariosItem(), selector=selector)
    loader.add_xpath('title', card_link + "/text()")
    loader.add_xpath('author', card_link + "/text()")
    loader.add_xpath('url', card_link + "/@href")
    loader.add_xpath('site', card_link + "/text()")
    return loader.load_item()
def parse_article(self, selector, response):
    """Extract author, title and url from one article card selector."""
    loader = ItemLoader(DiariosItem(), selector=selector)
    for field_name, path in (
            # Author sits in the second <small> under a <p>.
            ('author', './/p//small[2]//a//text()'),
            # Skip whitespace-only text nodes in the heading.
            ('title', './/h4//text()[normalize-space()]'),
            ('url', './/h4//@href')):
        loader.add_xpath(field_name, path)
    return loader.load_item()
def parse_article(self, selector, response):
    """Populate a DiariosItem from an <article> card selector."""
    loader = ItemLoader(DiariosItem(), selector=selector)
    title_xpath = './article/a/h2/text()'
    # Author name is the text node immediately after the <b> label.
    author_xpath = './article/p/b/following-sibling::text()[1]'
    url_xpath = './article/a/@href'
    loader.add_xpath('title', title_xpath)
    loader.add_xpath('author', author_xpath)
    loader.add_xpath('url', url_xpath)
    return loader.load_item()
def parse_sunday(self, selector, response):
    """Extract a Sunday-edition entry (lead paragraph + by-line)."""
    loader = ItemLoader(DiariosItem(), selector=selector)
    for field, path in (
            ('title', './/p[@class="lead"]/a/text()'),
            ('author', './/p[@class="by-line"]/a/text()'),
            ('url', './/p[@class="lead"]/a/@href')):
        loader.add_xpath(field, path)
    return loader.load_item()
def parse_body(self, selector, response):
    """Collect headline link text, author and href from a body card."""
    loader = ItemLoader(DiariosItem(), selector=selector)
    # Title text and URL both hang off the same anchor.
    title_link = './/a[@class="title"]'
    loader.add_xpath('title', title_link + '/text()')
    loader.add_xpath('author', './/span[@class="article-author"]/a/text()')
    loader.add_xpath('url', title_link + '/@href')
    return loader.load_item()
def parse_article(self, selector, response):
    """Pull title and link from the card's <h2>; this layout loads no author."""
    loader = ItemLoader(DiariosItem(), selector=selector)
    heading = './/h2'
    loader.add_xpath('title', heading + '//a//text()')
    loader.add_xpath('url', heading + '//@href')
    return loader.load_item()
def parse_article(self, selector, response):
    """Build a DiariosItem from one article card.

    Bug fix: the `author` assignment was commented out while
    `add_value('author', author)` remained, raising NameError on every
    call. The extraction line is restored.
    """
    loader = ItemLoader(DiariosItem(), selector=selector)
    # First matching text node reads "Por <name>"; drop the 4-char prefix.
    author = selector.xpath('.//p//text()').extract()[0][4:]
    loader.add_value('author', author)
    loader.add_xpath('title', './/h4//text()')
    loader.add_xpath('url', './/h4//@href')
    return loader.load_item()
def parse_article(self, response): selector = response.xpath('//*[@id="article-content"]') loader = ItemLoader(DiariosItem(), selector=selector) #capitalizar el titulo y quitar los 4 primeros caracteres que es el "Por " loader.add_value('author', response.xpath('//strong//text()').extract_first().title()[4:]) loader.add_value('title', response.xpath('//*[@class="headline huge normal-style "]/a/text()').extract_first()) loader.add_value('url', response.request.url) return loader.load_item()
def parse_article(self, article):
    """Map a pre-parsed article dict onto a DiariosItem."""
    loader = ItemLoader(item=DiariosItem())
    # Only the first listed author is kept.
    first_author = article['author'][0]
    for field, value in (
            ('title', article['title']),
            ('author', first_author),
            ('url', article['url'])):
        loader.add_value(field, value)
    return loader.load_item()
def parse_article(self, selector, response):
    """Read title/url from the class-less anchor; author from its own link."""
    loader = ItemLoader(DiariosItem(), selector=selector)
    # The headline anchor is the <a> without any class attribute.
    plain_anchor = './a[not(@class)]'
    loader.add_xpath('title', plain_anchor + '/h3/text()')
    loader.add_xpath('author', './a[@class="author-name"]/text()')
    loader.add_xpath('url', plain_anchor + '/@href')
    return loader.load_item()
def parse_article(self, selector, response):
    """Extract a cleaned author name plus title/url from one card."""
    import re
    loader = ItemLoader(DiariosItem(), selector=selector)
    raw_name = selector.xpath('.//span//a//text()').extract_first()
    # Title-case, then keep only letters (incl. Spanish accents) and spaces.
    cleaned = re.sub('[^a-zA-ZñÑáéíóúÁÉÍÓÚ ]', '', raw_name.title())
    loader.add_value('author', cleaned)
    loader.add_xpath('title', './/h3//a//text()')
    loader.add_xpath('url', './/h3//@href')
    return loader.load_item()
def parse_item(self, response):
    """Yield DiariosItems for PDF links; recurse into every other link."""
    anchors = Selector(response).xpath('//*[@id="conteudo-interna"]/*/a')
    for anchor in anchors:
        hrefs = anchor.xpath('@href').extract()
        target = hrefs[0]
        if target.endswith(".pdf"):
            # Item keeps the full extracted lists, matching the field types.
            yield DiariosItem(title=anchor.xpath('text()').extract(), url=hrefs)
        else:
            # Non-PDF links point at further listing pages.
            yield Request(target, callback=self.parse_item)
def parse_article(self, selector, response):
    """Build a DiariosItem from one article card.

    Bug fix: the `author` extraction was commented out while
    `print(author)` and `add_value('author', author)` remained,
    raising NameError on every call; the extraction is restored and
    the leftover debug prints ('ghola', author) are removed.
    """
    loader = ItemLoader(DiariosItem(), selector=selector)
    # Profile block text starts with a 4-char "Por " prefix; slice it off.
    author = selector.xpath(
        './/div[@class="contenedor-nombre-perfil"]//text()').extract()[0][4:]
    loader.add_value('author', author)
    loader.add_xpath('title', './/h3//text()')
    loader.add_xpath('url', './/@href[1]')
    return loader.load_item()
def parse_article_body(self, selector, response):
    """Parse an article body block; skip entries with no author line.

    Returns None when the data-txt ancestor holds no <p> node.
    Idiom fix: emptiness is tested with `not ...` instead of comparing
    the SelectorList against a literal `[]`.
    """
    if not selector.xpath('./ancestor::div[@class="data-txt"]/p'):
        return
    loader = ItemLoader(DiariosItem(), selector=selector)
    loader.add_xpath('title', './ancestor::article/div[@class="mt"]/a/h2/text()')
    loader.add_xpath('url', './ancestor::article/div[@class="mt"]/a/@href')
    loader.add_xpath('author', './ancestor::div[@class="data-txt"]/p/text()')
    return loader.load_item()
def parse_article(self, item):
    """Build a DiariosItem from an ARC-style article payload dict."""
    loader = ItemLoader(DiariosItem())
    for credit in item['credits']['by']:
        # Only credits explicitly typed as authors are kept.
        if credit['type'] != 'author':
            continue
        name = credit['name'].title().strip()
        comma_at = name.find(',')
        if comma_at > -1:
            # Keep only the part before the first comma.
            name = name[:comma_at]
        # Strip everything except letters (incl. Spanish accents) and spaces.
        name = re.sub('[^a-zA-ZñÑáéíóúÁÉÍÓÚ ]', '', name).strip()
        loader.add_value('author', name)
    loader.add_value('title', item['headlines']['basic'])
    loader.add_value('url', 'http://www.abc.com.py' + item['website_url'])
    return loader.load_item()
def parse_article(self, selector, response):
    """Extract a cleaned author name plus title/url from one card.

    Fixes: the original called `.strip()` on the title XPATH STRING
    LITERAL (a no-op on the literal, likely a misplaced call) — removed;
    `extract_first()` may return None, which crashed `.title()` — now
    guarded so a missing by-line yields an empty author instead.
    """
    import re
    loader = ItemLoader(DiariosItem(), selector=selector)
    raw = selector.xpath('.//div[@class="person-name"]/a/text()').extract_first()
    # Title-case, trim, then keep only letters (incl. accents) and spaces.
    autor = re.sub('[^a-zA-ZñÑáéíóúÁÉÍÓÚ ]', '', (raw or '').title().strip())
    # The by-line starts with "Por "; drop that prefix.
    if autor.startswith("Por "):
        autor = autor[4:]
    loader.add_value('author', autor)
    loader.add_xpath('title', './/h3//text()')
    loader.add_xpath('url', './/h3//@href')
    return loader.load_item()
def parse_article(self, selector, response):
    """Extract symbol-stripped title/author and url from a figure card."""
    import re
    # Keep only letters (incl. Spanish accents) and spaces.
    letters_only = '[^a-zA-ZñÑáéíóúÁÉÍÓÚ ]'
    loader = ItemLoader(item=DiariosItem(), selector=selector)
    raw_title = selector.xpath('./article//figure/a/h4/text()').extract_first()
    loader.add_value('title', re.sub(letters_only, '', raw_title))
    raw_author = selector.xpath(
        './article//figure//div[@class="byline"]/span[@class="author"]//text()'
    ).extract_first()
    loader.add_value('author', re.sub(letters_only, '', raw_author))
    loader.add_xpath('url', './article//figure/a/@href')
    return loader.load_item()
def parse_article(self, response):
    """Parse a full article page: cleaned author, headline and request URL."""
    import re
    content = response.xpath('//*[@class="article-content"]')
    loader = ItemLoader(DiariosItem(), selector=content)
    # Author: title-case, trim, keep letters/accents/spaces only.
    raw = response.xpath('//*[@class="name"]//text()').extract_first().title().strip()
    author = re.sub('[^a-zA-ZñÑáéíóúÁÉÍÓÚ ]', '', raw)
    # The by-line starts with "Por "; drop that prefix.
    if author[:4] == "Por ":
        author = author[4:]
    loader.add_value('author', author)
    headline = response.xpath('//*[@class="headline"]//text()').extract_first()
    loader.add_value('title', headline.strip())
    loader.add_value('url', response.request.url)
    return loader.load_item()
def parse_article(self, selector, response):
    """Load title/author/url; the author comes from the link's @title attribute."""
    loader = ItemLoader(DiariosItem(), selector=selector)
    field_paths = (
        ('title', './/div/h2/a/text()'),
        # string(...) collapses the attribute node to its text value.
        ('author', 'string(.//div/a/@title)'),
        ('url', 'string(.//div/h2/a/@href)'),
    )
    for field, xp in field_paths:
        loader.add_xpath(field, xp)
    return loader.load_item()
def parse_article(self, selector, response):
    """Flow-layout card: title and author carry flow-* classes."""
    loader = ItemLoader(DiariosItem(), selector=selector)
    loader.add_xpath('url', './/h2//@href')
    loader.add_xpath('author', './/span[@class="flow-author"]//text()')
    loader.add_xpath('title', './/h2[@class="flow-title"]//text()')
    return loader.load_item()