Exemple #1
0
    def parse_document(self, response):
        """Yield one ``Document`` item extracted from an article page.

        The item's ``url`` is the response URL. Its ``edition`` is the
        response host, or ``"-"`` when that host equals the host of the
        spider's first start URL. Every other field is extracted with the
        XPath stored on ``self.config`` under ``<field>_path``.
        """
        page_url = response.url
        home_host = urlsplit(self.start_urls[0]).netloc
        page_host = urlsplit(page_url).netloc

        # Pages served from the spider's own host get the "-" placeholder.
        if page_host == home_host:
            edition_label = "-"
        else:
            edition_label = page_host

        loader = ItemLoader(item=Document(), response=response)
        loader.add_value("url", page_url)
        loader.add_value("edition", edition_label)

        # Article content first, then the social/engagement counters; each
        # field name maps to the ``self.config.<field>_path`` XPath.
        xpath_fields = (
            "title",
            "date",
            "text",
            "topics",
            "authors",
            "reposts_fb",
            "reposts_vk",
            "reposts_ok",
            "reposts_twi",
            "reposts_lj",
            "reposts_tg",
            "likes",
            "views",
            "comm_count",
        )
        for field in xpath_fields:
            loader.add_xpath(field, getattr(self.config, field + "_path"))

        yield loader.load_item()
Exemple #2
0
    def parse_document(self, response):
        """Yield a ``Document`` populated from the page via configured XPaths.

        ``url`` is taken from the response; ``edition`` is ``'-'`` when the
        response host matches the host of ``self.start_urls[0]`` and the
        response host otherwise. Remaining fields are read with the XPath
        expressions held in ``self.config`` (``<field>_path`` attributes).
        """
        document_url = response.url
        spider_host = urlsplit(self.start_urls[0]).netloc
        document_host = urlsplit(document_url).netloc

        if document_host == spider_host:
            edition_value = '-'
        else:
            edition_value = document_host

        item_loader = ItemLoader(item=Document(), response=response)
        item_loader.add_value('url', document_url)
        item_loader.add_value('edition', edition_value)

        # Field groups are processed in order: content, then counters.
        content_fields = ('title', 'date', 'text', 'topics', 'authors')
        counter_fields = (
            'reposts_fb',
            'reposts_vk',
            'reposts_ok',
            'reposts_twi',
            'reposts_lj',
            'reposts_tg',
            'likes',
            'views',
            'comm_count',
        )
        for group in (content_fields, counter_fields):
            for name in group:
                item_loader.add_xpath(name, getattr(self.config, name + '_path'))

        yield item_loader.load_item()
Exemple #3
0
    def parse_document(self, response):
        """Yield a ``Document`` item built from a JSON article response.

        The response body is parsed as JSON and the article is read from its
        ``root`` key. The article URL is rebuilt from the ``url`` value under
        the ``https://meduza.io/`` prefix, the text comes from the spider's
        block helpers, and non-breaking spaces in the title and paragraphs
        are replaced with regular spaces.

        Raises:
            KeyError: if ``root``, ``url``, ``title`` or ``datetime`` is
                missing from the decoded payload.
        """
        # ``response.text`` is the supported accessor for the decoded body;
        # ``body_as_unicode()`` is deprecated in Scrapy.
        news_item = json.loads(response.text)['root']
        url = 'https://meduza.io/{}'.format(news_item['url'])

        # Taking all blocks from response with information
        blocks = self._get_text_blocks(news_item)

        # Extract text paragraphs from every block of the article
        text_paragraphs = self._extract_text_from_blocks(blocks)

        base_edition = urlsplit(self.start_urls[0])[1]
        edition = urlsplit(url)[1]

        # Replace every \xa0 with space
        text_paragraphs = [text.replace('\xa0', ' ') for text in text_paragraphs]
        title = news_item['title'].replace('\xa0', ' ')

        # ``datetime`` is interpreted as a POSIX timestamp and rendered in UTC.
        published = datetime.utcfromtimestamp(news_item['datetime'])

        # Constructing the resulting item
        loader = ItemLoader(item=Document(), response=response)
        loader.add_value('url', url)
        loader.add_value('edition', '-' if edition == base_edition else edition)
        loader.add_value('title', title)
        loader.add_value('topics', '')
        loader.add_value('date', published.strftime(self.config.date_format))
        # Fall back to a single empty string when nothing was extracted.
        loader.add_value('text', text_paragraphs if text_paragraphs else [''])
        loader.add_value('authors', news_item['source']['name'] if 'source' in news_item else [''])

        yield loader.load_item()
Exemple #4
0
    def parse_document(self, response):
        """Yield a ``Document`` with url, edition, title, date, text and topics.

        ``edition`` is ``'-'`` when the response host equals the host of the
        spider's first start URL; otherwise it is the response host. The
        other fields are extracted with XPaths taken from ``self.config``.
        """
        page_url = response.url
        own_host = urlsplit(self.start_urls[0]).netloc
        page_host = urlsplit(page_url).netloc

        if page_host == own_host:
            edition_value = '-'
        else:
            edition_value = page_host

        loader = ItemLoader(item=Document(), response=response)
        loader.add_value('url', page_url)
        loader.add_value('edition', edition_value)

        config = self.config
        loader.add_xpath('title', config.title_path)
        loader.add_xpath('date', config.date_path)
        loader.add_xpath('text', config.text_path)
        loader.add_xpath('topics', config.topics_path)

        yield loader.load_item()
Exemple #5
0
    def parse_document(self, response):
        """Yield a ``Document`` combining request metadata with the page text.

        Title and date come from the ``news_item`` mapping stored in
        ``response.meta``; the text is extracted from the page with the CSS
        selector in ``self.config.text_path``. ``edition`` is ``"-"`` when
        the response host equals the host of the spider's first start URL.
        """
        metadata = response.meta["news_item"]
        page_url = response.url
        own_host = urlsplit(self.start_urls[0]).netloc
        page_host = urlsplit(page_url).netloc

        if page_host == own_host:
            edition_value = "-"
        else:
            edition_value = page_host

        loader = ItemLoader(item=Document(), response=response)
        loader.add_value("url", page_url)
        loader.add_value("edition", edition_value)
        loader.add_value("title", metadata["title"])
        loader.add_value("topics", "")

        # ``fromtimestamp`` without a tz argument converts to local time.
        published = datetime.fromtimestamp(metadata["date"])
        loader.add_value("date", published.strftime(self.config.date_format))

        loader.add_css("text", self.config.text_path)
        yield loader.load_item()