def parse_article(response):
    def extract_with_css(query):
        # Return the first CSS match, or an empty string if nothing matches.
        return response.css(query).get(default='').strip()

    article = ItemLoader(item=Article())
    article.add_value('title', extract_with_css('h1.art-titre::text'))
    article.add_value('description', extract_with_css('h2.art-chapeau::text'))
    article.add_value('full_article', extract_with_css('div.art-text'))
    article.add_value('source', response.url)
    return article.load_item()
def parse_article(response):
    def extract_with_css(query):
        return response.css(query).get(default='').strip()

    article = ItemLoader(item=Article())
    article.add_value('title', extract_with_css('div.row.doc-title h1::text'))
    article.add_value('description', '')
    article.add_value('full_article', extract_with_css('div.row.doc-content'))
    article.add_value('source', response.url)
    return article.load_item()
def parse_article(response):
    def extract_with_css(query):
        return response.css(query).get(default='').strip()

    article = ItemLoader(item=Article())
    article.add_value('title', extract_with_css('h1.article__title::text'))
    article.add_value('description', extract_with_css('p.article__desc::text'))
    article.add_value(
        'full_article',
        extract_with_css('article.article__content.old__article-content-single'))
    article.add_value('source', response.url)
    return article.load_item()
def parse_article(response):
    def extract_with_css(query):
        return response.css(query).get(default='').strip()

    def extract_all_with_css(query):
        return response.css(query).getall()

    article = ItemLoader(item=Article())
    article.add_value('title', extract_with_css('h1.entry-title'))
    article.add_value('description', '')
    full_article = extract_all_with_css("section.cb-entry-content p")
    for paragraph in full_article:
        article.add_value('full_article', paragraph)
    article.add_value('source', response.url)
    return article.load_item()
def parse_article(response):
    def extract_with_css(query):
        return response.css(query).get(default='').strip()

    def extract_all_with_css(query):
        return response.css(query).getall()

    article = ItemLoader(item=Article())
    article.add_value('title', extract_with_css('h1::text'))
    article.add_value('description', extract_with_css('p.chapo::text'))
    full_article = extract_all_with_css("div#col-middle p")
    for paragraph in full_article:
        article.add_value('full_article', paragraph)
    article.add_value('source', response.url)
    return article.load_item()
def parse_article(response):
    def extract_with_css(query):
        return response.css(query).get(default='').strip()

    def extract_all_with_css(query):
        return response.css(query).getall()

    article = ItemLoader(item=Article())
    article.add_value('title',
                      extract_with_css('h1.alpha.color-white.text-shadow::text'))
    article.add_value('description', extract_with_css('div.chapo p::text'))
    full_article = extract_all_with_css("p.py0p5")
    for paragraph in full_article:
        article.add_value('full_article', paragraph)
    article.add_value('source', response.url)
    return article.load_item()
def parse_article(response):
    def extract_with_css(query):
        return response.css(query).get(default='').strip()

    def extract_all_with_css(query):
        return response.css(query).getall()

    article = ItemLoader(item=Article())
    article.add_value('title', extract_with_css('h1.post-title.entry-title'))
    description = extract_all_with_css('div#Chapo p')
    for p in description:
        article.add_value('description', p)
    full_article = extract_all_with_css("div#bsf_rt_marker p")
    for paragraph in full_article:
        article.add_value('full_article', paragraph)
    article.add_value('source', response.url)
    return article.load_item()
def parse(self, response):
    url = response.request.url
    id = articleurl_to_id(url)
    if id in self.parsed_urls:
        return

    article = Article()
    article['item_id'] = id
    article['category'] = articleurl_to_category(url)

    # Pages come in two layouts: full articles and short bulletins.
    article_selectors = response.xpath(
        "(//article[contains(@class, 'article') and contains(@role, 'main')])[1]")
    bulletin_selectors = response.xpath(
        "(//article[contains(@class, 'bulletin')])[1]")

    self.counter += 1
    if len(article_selectors) > 0:
        items, hasText = self.parse_article(article_selectors, article)
    elif len(bulletin_selectors) > 0:
        items, hasText = self.parse_bulletin(bulletin_selectors, article)
    else:
        self.log("Type of url not found: " + url)
        return

    # Periodically persist the ids parsed so far.
    if self.counter > self.write_limit:
        self.save_ids()
        self.counter = 0

    if hasText:
        self.parsed_ids.append(str(id))
        return items
    else:
        # Record ids whose pages yielded no extractable text.
        with open('failed_ids.csv', 'a+') as f:
            f.write(str(id))
            f.write('\n')
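# The loaders above assume a scrapy.Item exposing the fields passed to
# add_value and accessed by the dispatcher. A minimal sketch of that shared
# setup is given below; the field set and module layout are assumptions, not
# the project's actual definition.
import scrapy
from scrapy.loader import ItemLoader


class Article(scrapy.Item):
    # Fields populated by the parse_article helpers.
    title = scrapy.Field()
    description = scrapy.Field()
    full_article = scrapy.Field()
    source = scrapy.Field()
    # Fields set directly by the dispatching parse() method.
    item_id = scrapy.Field()
    category = scrapy.Field()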