Beispiel #1
0
    def parse_article_grouped(self, selector, response):
        """Load a ``DiariosItem`` from one grouped article node.

        ``selector`` is the node the relative XPaths are evaluated against.
        ``response`` is accepted but not used here.
        """
        field_xpaths = (
            ('title', './h2/text()'),
            ('author', './span[@class="author-name"]/text()'),
            ('url', './ancestor::div/a/@href'),
        )
        loader = ItemLoader(DiariosItem(), selector=selector)
        for field, xpath in field_xpaths:
            loader.add_xpath(field, xpath)
        return loader.load_item()
Beispiel #2
0
    def parse_article(self, selector, response):
        """Load a ``DiariosItem`` from a card node.

        Every field is read from the anchor inside the card's title heading.
        ``response`` is accepted but not used here.
        """
        # Text and href of the anchor inside the card title heading.
        link_text = "./div[contains(@class,'ld-card__body')]/h3[contains(@class,'ld-card__title')]/a/text()"
        link_href = "./div[contains(@class,'ld-card__body')]/h3[contains(@class,'ld-card__title')]/a/@href"

        loader = ItemLoader(DiariosItem(), selector=selector)
        loader.add_xpath('title', link_text)
        # NOTE(review): 'author' and 'site' are filled from the same anchor
        # text as 'title' -- confirm this is intended.
        loader.add_xpath('author', link_text)
        loader.add_xpath('url', link_href)
        loader.add_xpath('site', link_text)
        return loader.load_item()
Beispiel #3
0
    def parse_article(self, selector, response):
        """Load a ``DiariosItem`` (author, title, url) from an article node.

        ``response`` is accepted but not used here.
        """
        author_xpath = './/p//small[2]//a//text()'
        # Only text nodes that are not pure whitespace.
        title_xpath = './/h4//text()[normalize-space()]'
        url_xpath = './/h4//@href'

        loader = ItemLoader(item=DiariosItem(), selector=selector)
        loader.add_xpath('author', author_xpath)
        loader.add_xpath('title', title_xpath)
        loader.add_xpath('url', url_xpath)
        return loader.load_item()
Beispiel #4
0
 def parse_article(self, selector, response):
     """Load a ``DiariosItem`` from a node wrapping an ``<article>``.

     ``response`` is accepted but not used here.
     """
     field_xpaths = (
         ('title', './article/a/h2/text()'),
         # First text node that follows the <b> inside the paragraph.
         ('author', './article/p/b/following-sibling::text()[1]'),
         ('url', './article/a/@href'),
     )
     loader = ItemLoader(DiariosItem(), selector=selector)
     for field, xpath in field_xpaths:
         loader.add_xpath(field, xpath)
     return loader.load_item()
Beispiel #5
0
    def parse_sunday(self, selector, response):
        """Load a ``DiariosItem`` from a node with ``lead``/``by-line`` paragraphs.

        ``response`` is accepted but not used here.
        """
        title_xpath = './/p[@class="lead"]/a/text()'
        author_xpath = './/p[@class="by-line"]/a/text()'
        url_xpath = './/p[@class="lead"]/a/@href'

        loader = ItemLoader(item=DiariosItem(), selector=selector)
        loader.add_xpath('title', title_xpath)
        loader.add_xpath('author', author_xpath)
        loader.add_xpath('url', url_xpath)
        return loader.load_item()
Beispiel #6
0
    def parse_body(self, selector, response):
        """Load a ``DiariosItem`` from a node containing an ``a.title`` link.

        ``response`` is accepted but not used here.
        """
        loader = ItemLoader(DiariosItem(), selector=selector)
        for field, xpath in (
            ('title', './/a[@class="title"]/text()'),
            ('author', './/span[@class="article-author"]/a/text()'),
            ('url', './/a[@class="title"]/@href'),
        ):
            loader.add_xpath(field, xpath)
        return loader.load_item()
Beispiel #7
0
 def parse_article(self, selector, response):
     """Load a ``DiariosItem``: author from ``<h3>``, title/url from ``<h2>``.

     ``response`` is accepted but not used here.
     """
     author_xpath = './/h3//text()'
     title_xpath = './/h2//a//text()'
     url_xpath = './/h2//@href'

     loader = ItemLoader(item=DiariosItem(), selector=selector)
     loader.add_xpath('author', author_xpath)
     loader.add_xpath('title', title_xpath)
     loader.add_xpath('url', url_xpath)
     return loader.load_item()
Beispiel #8
0
 def parse_article(self, selector, response):
     """Load a ``DiariosItem`` from an article node.

     The author is the first text node found under any ``<p>`` with its
     first four characters dropped; title and URL come from the ``<h4>``.
     ``response`` is accepted but not used here.
     """
     loader = ItemLoader(DiariosItem(), selector=selector)
     # extract_first() yields None instead of raising IndexError when the
     # node has no <p> text, so such articles are loaded without an author.
     byline = selector.xpath('.//p//text()').extract_first()
     if byline is not None:
         # presumably the byline starts with a 4-character prefix such as
         # "Por " -- TODO confirm against the site markup.
         loader.add_value('author', byline[4:])
     loader.add_xpath('title', './/h4//text()')
     loader.add_xpath('url', './/h4//@href')
     return loader.load_item()
Beispiel #9
0
 def parse_article(self, response):
     """Load a ``DiariosItem`` from a full article page.

     The author comes from the first ``<strong>`` text on the page, the
     title from the headline link, and the URL from the request that
     produced ``response``.
     """
     selector = response.xpath('//*[@id="article-content"]')
     loader = ItemLoader(DiariosItem(), selector=selector)
     # extract_first() returns None when the page has no <strong> text;
     # skip the author in that case instead of crashing on None.title().
     byline = response.xpath('//strong//text()').extract_first()
     if byline is not None:
         # Title-case the byline and drop its first four characters,
         # which are the leading "Por ".
         loader.add_value('author', byline.title()[4:])
     loader.add_value('title', response.xpath('//*[@class="headline huge normal-style "]/a/text()').extract_first())
     loader.add_value('url', response.request.url)
     return loader.load_item()
Beispiel #10
0
    def parse_article(self, article):
        """Load a ``DiariosItem`` from an already-parsed article mapping.

        Reads ``article['title']``, the first element of
        ``article['author']`` and ``article['url']``.
        """
        values = (
            ('title', article['title']),
            ('author', article['author'][0]),
            ('url', article['url']),
        )
        loader = ItemLoader(item=DiariosItem())
        for field, value in values:
            loader.add_value(field, value)
        return loader.load_item()
Beispiel #11
0
    def parse_article(self, selector, response):
        """Load a ``DiariosItem`` from a node whose child links hold the data.

        The class-less ``<a>`` carries title and URL; the ``author-name``
        ``<a>`` carries the author. ``response`` is accepted but not used.
        """
        field_xpaths = (
            ('title', './a[not(@class)]/h3/text()'),
            ('author', './a[@class="author-name"]/text()'),
            ('url', './a[not(@class)]/@href'),
        )
        loader = ItemLoader(DiariosItem(), selector=selector)
        for field, xpath in field_xpaths:
            loader.add_xpath(field, xpath)
        return loader.load_item()
Beispiel #12
0
 def parse_article(self, selector, response):
     """Load a ``DiariosItem`` from an article node.

     The author is the first ``span//a`` text, title-cased and reduced to
     letters and spaces; title and URL come from the ``<h3>``.
     ``response`` is accepted but not used here.
     """
     import re
     loader = ItemLoader(DiariosItem(), selector=selector)
     # extract_first() returns None when there is no author link; skip the
     # author then instead of crashing on None.title().
     raw_author = selector.xpath('.//span//a//text()').extract_first()
     if raw_author is not None:
         # Drop everything that is not a letter or a space.
         autor = re.sub('[^a-zA-ZñÑáéíóúÁÉÍÓÚ ]', '', raw_author.title())
         loader.add_value('author', autor)
     loader.add_xpath('title', './/h3//a//text()')
     loader.add_xpath('url', './/h3//@href')
     return loader.load_item()
Beispiel #13
0
    def parse_item(self, response):
        """Walk the links under ``#conteudo-interna``, yielding PDF items.

        Yields a ``DiariosItem`` (``title`` and ``url`` as extracted lists)
        for every link whose href ends in ``.pdf``, and a ``Request`` handled
        by this same callback for every other link.
        """
        sel = Selector(response)
        links = sel.xpath('//*[@id="conteudo-interna"]/*/a')

        for link in links:
            url = link.xpath('@href').extract()
            if not url:
                # <a> without an href: nothing to download or follow.
                continue
            if url[0].endswith(".pdf"):
                yield DiariosItem(title=link.xpath('text()').extract(),
                                  url=url)
            else:
                # Request rejects URLs without a scheme, so resolve relative
                # hrefs against the page URL before following them.
                yield Request(response.urljoin(url[0]),
                              callback=self.parse_item)
Beispiel #14
0
 def parse_article(self, selector, response):
     """Load a ``DiariosItem`` from an article node.

     The author is the first text node inside the
     ``contenedor-nombre-perfil`` div with its first four characters
     dropped; the title comes from the ``<h3>``.
     ``response`` is accepted but not used here.
     """
     loader = ItemLoader(DiariosItem(), selector=selector)
     # extract_first() yields None instead of raising IndexError when the
     # profile container is missing; such articles get no author.
     byline = selector.xpath(
         './/div[@class="contenedor-nombre-perfil"]//text()').extract_first()
     if byline is not None:
         # presumably the byline starts with a 4-character prefix such as
         # "Por " -- TODO confirm against the site markup.
         loader.add_value('author', byline[4:])
     loader.add_xpath('title', './/h3//text()')
     # NOTE(review): './/@href[1]' matches the href of every descendant
     # element, not only the first link -- confirm this is intended.
     loader.add_xpath('url', './/@href[1]')
     return loader.load_item()
Beispiel #15
0
    def parse_article_body(self, selector, response):
        """Load a ``DiariosItem`` from a node inside an article body.

        Returns ``None`` when the enclosing ``data-txt`` div has no ``<p>``
        (the article has no author). ``response`` is accepted but not used.
        """
        author_paragraphs = selector.xpath(
            './ancestor::div[@class="data-txt"]/p')
        if not author_paragraphs:
            # No author paragraph: skip this article.
            return None

        loader = ItemLoader(item=DiariosItem(), selector=selector)
        for field, xpath in (
            ('title', './ancestor::article/div[@class="mt"]/a/h2/text()'),
            ('url', './ancestor::article/div[@class="mt"]/a/@href'),
            ('author', './ancestor::div[@class="data-txt"]/p/text()'),
        ):
            loader.add_xpath(field, xpath)
        return loader.load_item()
Beispiel #16
0
 def parse_article(self, item):
     """Load a ``DiariosItem`` from an article mapping.

     Reads the credits under ``item['credits']['by']``, the headline under
     ``item['headlines']['basic']`` and the path under
     ``item['website_url']``. When several credits have type ``'author'``
     the last one wins; when none does, the item is loaded without an
     author.
     """
     loader = ItemLoader(DiariosItem())
     # Stays None when no credit is an author, which previously left the
     # name unbound and raised UnboundLocalError below.
     autor = None
     for cred in item['credits']['by']:
         if cred['type'] != 'author':
             continue
         # Title-case the name and keep only what precedes the first comma.
         nombre = cred['name'].title().strip().partition(',')[0]
         # Drop everything that is not a letter or a space.
         autor = re.sub('[^a-zA-ZñÑáéíóúÁÉÍÓÚ ]', '', nombre).strip()
     if autor is not None:
         loader.add_value('author', autor)
     loader.add_value('title', item['headlines']['basic'])
     loader.add_value('url', 'http://www.abc.com.py' + item['website_url'])
     return loader.load_item()
Beispiel #17
0
 def parse_article(self, selector, response):
     """Load a ``DiariosItem`` from an article node.

     The author is taken from the ``person-name`` link, title-cased,
     reduced to letters and spaces, and stripped of a leading "Por ".
     Title and URL come from the ``<h3>``.
     ``response`` is accepted but not used here.
     """
     import re
     loader = ItemLoader(DiariosItem(), selector=selector)
     # extract_first() returns None when there is no author link; skip the
     # author then instead of crashing on None.title().
     raw_author = selector.xpath(
         './/div[@class="person-name"]/a/text()').extract_first()
     if raw_author is not None:
         # Title-case, trim, then drop everything but letters and spaces.
         autor = re.sub('[^a-zA-ZñÑáéíóúÁÉÍÓÚ ]', '',
                        raw_author.title().strip())
         # The byline may start with "Por "; remove it.
         if autor.startswith("Por "):
             autor = autor[4:]
         loader.add_value('author', autor)
     # The original called .strip() on this XPath literal, which was a
     # no-op; the expression itself is unchanged.
     loader.add_xpath('title', './/h3//text()')
     loader.add_xpath('url', './/h3//@href')
     return loader.load_item()
Beispiel #18
0
    def parse_article(self, selector, response):
        """Load a ``DiariosItem`` from a node wrapping an ``<article>``.

        Title and author are reduced to letters and spaces before being
        stored; either one is skipped when its node is missing.
        ``response`` is accepted but not used here.
        """
        import re

        # Matches every character that is not a letter or a space.
        not_letter = '[^a-zA-ZñÑáéíóúÁÉÍÓÚ ]'

        loader = ItemLoader(item=DiariosItem(), selector=selector)

        # extract_first() returns None when nothing matches, and re.sub
        # raises TypeError on None, so each value is cleaned only if present.
        titulo = selector.xpath(
            './article//figure/a/h4/text()').extract_first()
        if titulo is not None:
            loader.add_value('title', re.sub(not_letter, '', titulo))

        autor = selector.xpath(
            './article//figure//div[@class="byline"]/span[@class="author"]//text()'
        ).extract_first()
        if autor is not None:
            loader.add_value('author', re.sub(not_letter, '', autor))

        loader.add_xpath('url', './article//figure/a/@href')

        return loader.load_item()
Beispiel #19
0
 def parse_article(self, response):
     """Load a ``DiariosItem`` from a full article page.

     The author comes from the first ``name``-classed element, title-cased,
     reduced to letters and spaces, and stripped of a leading "Por ". The
     title comes from the first ``headline`` text and the URL from the
     request that produced ``response``. Author and title are skipped when
     their node is missing.
     """
     import re
     selector = response.xpath('//*[@class="article-content"]')
     loader = ItemLoader(DiariosItem(), selector=selector)
     # extract_first() returns None when nothing matches; guard before
     # calling string methods on the result.
     raw_author = response.xpath('//*[@class="name"]//text()').extract_first()
     if raw_author is not None:
         # Title-case, trim, then drop everything but letters and spaces.
         autor = re.sub('[^a-zA-ZñÑáéíóúÁÉÍÓÚ ]', '',
                        raw_author.title().strip())
         # The byline may start with "Por "; remove it.
         if autor.startswith("Por "):
             autor = autor[4:]
         loader.add_value('author', autor)
     headline = response.xpath(
         '//*[@class="headline"]//text()').extract_first()
     if headline is not None:
         loader.add_value('title', headline.strip())
     loader.add_value('url', response.request.url)
     return loader.load_item()
Beispiel #20
0
 def parse_article(self, selector, response):
     """Load a ``DiariosItem`` from an article node.

     Author and URL use XPath ``string()`` on attribute nodes.
     ``response`` is accepted but not used here.
     """
     title_xpath = './/div/h2/a/text()'
     author_xpath = 'string(.//div/a/@title)'
     url_xpath = 'string(.//div/h2/a/@href)'

     loader = ItemLoader(item=DiariosItem(), selector=selector)
     loader.add_xpath('title', title_xpath)
     loader.add_xpath('author', author_xpath)
     loader.add_xpath('url', url_xpath)
     return loader.load_item()
Beispiel #21
0
 def parse_article(self, selector, response):
     """Load a ``DiariosItem`` from a node with ``flow-*`` classed elements.

     ``response`` is accepted but not used here.
     """
     field_xpaths = (
         ('title', './/h2[@class="flow-title"]//text()'),
         ('author', './/span[@class="flow-author"]//text()'),
         ('url', './/h2//@href'),
     )
     loader = ItemLoader(DiariosItem(), selector=selector)
     for field, xpath in field_xpaths:
         loader.add_xpath(field, xpath)
     return loader.load_item()