Esempio n. 1
0
def parse_article(response):
    """Build an ``Article`` item from a single article page.

    The title and description are read from the text of their heading
    nodes; the body is taken from the ``div.art-text`` node. The page URL
    is stored under ``source``.

    Args:
        response: The downloaded page; must expose ``css()`` and ``url``.

    Returns:
        The item produced by the loader's ``load_item()``.
    """
    def first_match(selector):
        # First result of the CSS query, or '' when nothing matches,
        # with surrounding whitespace removed.
        found = response.css(selector).get(default='')
        return found.strip()

    loader = ItemLoader(item=Article())
    css_fields = (
        ('title', 'h1.art-titre::text'),
        ('description', 'h2.art-chapeau::text'),
        ('full_article', 'div.art-text'),
    )
    for field_name, selector in css_fields:
        loader.add_value(field_name, first_match(selector))
    loader.add_value('source', response.url)
    return loader.load_item()
Esempio n. 2
0
def parse_article(response):
    """Build an ``Article`` item from a document-style page.

    The title comes from the ``h1`` text inside ``div.row.doc-title`` and
    the body from the ``div.row.doc-content`` node. This page layout has
    no description selector, so an empty string is supplied for it.

    Args:
        response: The downloaded page; must expose ``css()`` and ``url``.

    Returns:
        The item produced by the loader's ``load_item()``.
    """
    loader = ItemLoader(item=Article())

    # Each lookup takes the first match ('' if none) and strips it.
    title_text = response.css('div.row.doc-title h1::text').get(default='')
    loader.add_value('title', title_text.strip())

    loader.add_value('description', '')

    body_markup = response.css('div.row.doc-content').get(default='')
    loader.add_value('full_article', body_markup.strip())

    loader.add_value('source', response.url)
    return loader.load_item()
Esempio n. 3
0
def parse_article(response):
    """Build an ``Article`` item from an ``article__*``-styled page.

    Title and description are read from the text of their nodes; the body
    is taken from the ``article.article__content.old__article-content-single``
    node. The page URL is stored under ``source``.

    Args:
        response: The downloaded page; must expose ``css()`` and ``url``.

    Returns:
        The item produced by the loader's ``load_item()``.
    """
    def first_match(selector):
        # First result of the CSS query, or '' when nothing matches,
        # with surrounding whitespace removed.
        found = response.css(selector).get(default='')
        return found.strip()

    loader = ItemLoader(item=Article())
    css_fields = (
        ('title', 'h1.article__title::text'),
        ('description', 'p.article__desc::text'),
        ('full_article',
         'article.article__content.old__article-content-single'),
    )
    for field_name, selector in css_fields:
        loader.add_value(field_name, first_match(selector))
    loader.add_value('source', response.url)
    return loader.load_item()
def parse_article(response):
    """Build an ``Article`` item from an ``entry-title`` page.

    The title is the first ``h1.entry-title`` match (the selector has no
    ``::text`` suffix, so the matched node itself is used), the
    description is left empty, and every ``p`` under
    ``section.cb-entry-content`` is added to ``full_article`` one at a
    time, in document order.

    Args:
        response: The downloaded page; must expose ``css()`` and ``url``.

    Returns:
        The item produced by the loader's ``load_item()``.
    """
    loader = ItemLoader(item=Article())

    # First match or '' when the heading is absent, then stripped.
    heading = response.css('h1.entry-title').get(default='')
    loader.add_value('title', heading.strip())
    loader.add_value('description', '')

    # One add_value call per paragraph, mirroring the page order.
    for fragment in response.css("section.cb-entry-content p").getall():
        loader.add_value('full_article', fragment)

    loader.add_value('source', response.url)
    return loader.load_item()
Esempio n. 5
0
def parse_article(response):
    """Build an ``Article`` item from a ``col-middle`` page layout.

    The title is the text of the first ``h1``, the description the text
    of the first ``p.chapo``, and every ``p`` under ``div#col-middle`` is
    added to ``full_article`` one at a time, in document order.

    Args:
        response: The downloaded page; must expose ``css()`` and ``url``.

    Returns:
        The item produced by the loader's ``load_item()``.
    """
    loader = ItemLoader(item=Article())

    # Single-valued fields: first match ('' if none), stripped.
    for field_name, selector in (
        ('title', 'h1::text'),
        ('description', 'p.chapo::text'),
    ):
        raw = response.css(selector).get(default='')
        loader.add_value(field_name, raw.strip())

    # One add_value call per paragraph, mirroring the page order.
    for fragment in response.css("div#col-middle p").getall():
        loader.add_value('full_article', fragment)

    loader.add_value('source', response.url)
    return loader.load_item()
def parse_article(response):
    """Build an ``Article`` item from a ``py0p5``-paragraph page layout.

    The title is the text of the first
    ``h1.alpha.color-white.text-shadow``, the description the text of the
    first ``div.chapo p``, and every ``p.py0p5`` node is added to
    ``full_article`` one at a time, in document order.

    Args:
        response: The downloaded page; must expose ``css()`` and ``url``.

    Returns:
        The item produced by the loader's ``load_item()``.
    """
    loader = ItemLoader(item=Article())

    # Single-valued fields: first match ('' if none), stripped.
    for field_name, selector in (
        ('title', 'h1.alpha.color-white.text-shadow::text'),
        ('description', 'div.chapo p::text'),
    ):
        raw = response.css(selector).get(default='')
        loader.add_value(field_name, raw.strip())

    # One add_value call per paragraph, mirroring the page order.
    for fragment in response.css("p.py0p5").getall():
        loader.add_value('full_article', fragment)

    loader.add_value('source', response.url)
    return loader.load_item()
Esempio n. 7
0
def parse_article(response):
    """Build an ``Article`` item from a ``post-title`` page layout.

    The title is the first ``h1.post-title.entry-title`` match (the
    selector has no ``::text`` suffix, so the matched node itself is
    used). Both the description (``div#Chapo p``) and the body
    (``div#bsf_rt_marker p``) are multi-valued: every matching paragraph
    is added to its field one at a time, in document order.

    Args:
        response: The downloaded page; must expose ``css()`` and ``url``.

    Returns:
        The item produced by the loader's ``load_item()``.
    """
    loader = ItemLoader(item=Article())

    # First match or '' when the heading is absent, then stripped.
    heading = response.css('h1.post-title.entry-title').get(default='')
    loader.add_value('title', heading.strip())

    # Description paragraphs are added before body paragraphs.
    multi_valued = (
        ('description', 'div#Chapo p'),
        ('full_article', "div#bsf_rt_marker p"),
    )
    for field_name, selector in multi_valued:
        for fragment in response.css(selector).getall():
            loader.add_value(field_name, fragment)

    loader.add_value('source', response.url)
    return loader.load_item()
Esempio n. 8
0
    def parse(self, response):
        """Parse one page into an ``Article`` and record the outcome.

        The request URL is converted to an id and a category. Pages whose
        id is already in ``self.parsed_urls`` are skipped. The page is then
        handed to ``parse_article`` or ``parse_bulletin`` depending on which
        kind of ``<article>`` element is found; pages matching neither are
        logged and dropped.

        Args:
            response: The downloaded page; must expose ``request.url`` and
                ``xpath()``.

        Returns:
            The items returned by the chosen sub-parser when it reports
            that text was found; otherwise ``None``.
        """
        url = response.request.url
        article_id = articleurl_to_id(url)
        # NOTE(review): the duplicate check reads ``self.parsed_urls`` while
        # successes below are appended to ``self.parsed_ids`` (as strings) —
        # confirm these are meant to be two different collections.
        if article_id in self.parsed_urls:
            return

        article = Article()
        article['item_id'] = article_id
        article['category'] = articleurl_to_category(url)

        # Both queries keep only the first matching <article> element.
        article_nodes = response.xpath(
            "(//article[contains(@class, 'article') and contains(@role, 'main')])[1]"
        )
        bulletin_nodes = response.xpath(
            "(//article[contains(@class, 'bulletin')])[1]")

        # Counted for every page that reaches this point, including pages
        # whose type turns out to be unknown.
        self.counter += 1

        if len(article_nodes) > 0:
            handler, nodes = self.parse_article, article_nodes
        elif len(bulletin_nodes) > 0:
            handler, nodes = self.parse_bulletin, bulletin_nodes
        else:
            self.log("Type of url not found: " + url)
            return
        items, has_text = handler(nodes, article)

        # Persist the ids once more than ``write_limit`` pages were counted
        # since the last save.
        if self.counter > self.write_limit:
            self.save_ids()
            self.counter = 0

        if not has_text:
            # Keep a trail of pages that yielded no text, one id per line.
            with open('failed_ids.csv', 'a+') as f:
                f.write(str(article_id) + '\n')
            return None

        self.parsed_ids.append(str(article_id))
        return items