def parse_document(self, response): url = response.url base_edition = urlsplit(self.start_urls[0])[1] edition = urlsplit(url)[1] l = ItemLoader(item=Document(), response=response) l.add_value("url", url) l.add_value("edition", "-" if edition == base_edition else edition) l.add_xpath("title", self.config.title_path) l.add_xpath("date", self.config.date_path) l.add_xpath("text", self.config.text_path) l.add_xpath("topics", self.config.topics_path) l.add_xpath("authors", self.config.authors_path) l.add_xpath("reposts_fb", self.config.reposts_fb_path) l.add_xpath("reposts_vk", self.config.reposts_vk_path) l.add_xpath("reposts_ok", self.config.reposts_ok_path) l.add_xpath("reposts_twi", self.config.reposts_twi_path) l.add_xpath("reposts_lj", self.config.reposts_lj_path) l.add_xpath("reposts_tg", self.config.reposts_tg_path) l.add_xpath("likes", self.config.likes_path) l.add_xpath("views", self.config.views_path) l.add_xpath("comm_count", self.config.comm_count_path) yield l.load_item()
def parse_document(self, response):
    """Parse one article page into a Document item via config-driven XPaths.

    Yields a single ``Document``. ``edition`` collapses to ``'-'`` for pages
    hosted on the same netloc as ``start_urls[0]``.
    """
    doc_url = response.url
    main_host = urlsplit(self.start_urls[0])[1]
    this_host = urlsplit(doc_url)[1]

    loader = ItemLoader(item=Document(), response=response)
    loader.add_value('url', doc_url)
    if this_host == main_host:
        loader.add_value('edition', '-')
    else:
        loader.add_value('edition', this_host)

    # The config object exposes one ``<field>_path`` XPath per extracted field.
    fields = (
        'title', 'date', 'text', 'topics', 'authors',
        'reposts_fb', 'reposts_vk', 'reposts_ok', 'reposts_twi',
        'reposts_lj', 'reposts_tg', 'likes', 'views', 'comm_count',
    )
    for field in fields:
        loader.add_xpath(field, getattr(self.config, field + '_path'))

    yield loader.load_item()
def parse_document(self, response):
    """Parse a Meduza JSON API response into a Document item.

    The payload's ``root`` object carries the article; text is assembled from
    its content blocks, non-breaking spaces are normalized, and the result is
    yielded as one ``Document``.
    """
    # FIX: Response.body_as_unicode() is deprecated (and removed in recent
    # Scrapy releases); response.text is the documented drop-in replacement.
    news_item = json.loads(response.text)['root']
    url = 'https://meduza.io/{}'.format(news_item['url'])

    # Taking all blocks from response with information
    blocks = self._get_text_blocks(news_item)
    # Extract text paragraphs from every block of the article
    text_paragraphs = self._extract_text_from_blocks(blocks)

    base_edition = urlsplit(self.start_urls[0])[1]
    edition = urlsplit(url)[1]

    # Replace every non-breaking space (\xa0) with a regular space.
    text_paragraphs = [text.replace('\xa0', ' ') for text in text_paragraphs]
    title = news_item['title'].replace('\xa0', ' ')

    # Constructing the resulting item.
    # NOTE(review): utcfromtimestamp() is deprecated in Python 3.12; the aware
    # replacement datetime.fromtimestamp(ts, tz=timezone.utc) is only
    # output-identical when date_format has no %z/%Z — confirm before switching.
    l = ItemLoader(item=Document(), response=response)
    l.add_value('url', url)
    l.add_value('edition', '-' if edition == base_edition else edition)
    l.add_value('title', title)
    l.add_value('topics', '')
    l.add_value('date', datetime.utcfromtimestamp(news_item['datetime']).strftime(self.config.date_format))
    # Fall back to a single empty string so the field is never left unset.
    l.add_value('text', text_paragraphs if text_paragraphs else [''])
    l.add_value('authors', news_item['source']['name'] if 'source' in news_item else [''])
    yield l.load_item()
def parse_document(self, response):
    """Extract a Document (title/date/text/topics) from the page via config XPaths.

    Yields one ``Document``; ``edition`` is ``'-'`` when the page netloc equals
    that of the first start URL.
    """
    current_url = response.url
    root_host = urlsplit(self.start_urls[0])[1]
    page_host = urlsplit(current_url)[1]
    edition_value = '-' if page_host == root_host else page_host

    loader = ItemLoader(item=Document(), response=response)
    loader.add_value('url', current_url)
    loader.add_value('edition', edition_value)
    loader.add_xpath('title', self.config.title_path)
    loader.add_xpath('date', self.config.date_path)
    loader.add_xpath('text', self.config.text_path)
    loader.add_xpath('topics', self.config.topics_path)
    yield loader.load_item()
def parse_document(self, response): news_item = response.meta["news_item"] url = response.url base_edition = urlsplit(self.start_urls[0])[1] edition = urlsplit(url)[1] l = ItemLoader(item=Document(), response=response) l.add_value("url", url) l.add_value("edition", "-" if edition == base_edition else edition) l.add_value("title", news_item["title"]) l.add_value("topics", "") l.add_value("date", datetime.fromtimestamp(news_item["date"]).strftime(self.config.date_format)) l.add_css("text", self.config.text_path) yield l.load_item()