Exemple #1
0
    def parse_news(self, response):
        """Build a ``News`` item from an article page.

        Reads url, title, author, article HTML and publication date from
        ``response`` and returns the loaded item.

        Raises:
            IndexError: if the title, author or content selector matches
                nothing (unchanged from the previous behaviour).
            CloseSpider: if the date element is missing or its text does not
                match ``'%d %b %Y, %H:%M'``.
        """
        self.logger.info('parse_news: %s' % response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)
        title = response.css('div.detail_area > h1.jdl::text').extract()[0]
        loader.add_value('title', title)
        author_name = response.css('div.author > strong::text').extract()[0]
        loader.add_value('author_name', author_name)
        # The first match is already a single HTML string. Joining it with
        # ' ' would iterate over its characters and insert a space between
        # every one of them, so it is stored as-is.
        raw_content = response.css('article > div.text_detail').extract()[0]
        loader.add_value('raw_content', raw_content)

        # Parse date information
        try:
            # Example: Kamis 15 Sep 2016, 18:33 WIB
            date_str = response.css(
                'div.detail_area > div.date::text').extract()[0]
            # Example: '15 Sep 2016, 18:33'
            date_str = ' '.join(date_str.split(' ')[1:5])
            self.logger.info('parse_date: parse_news: date_str: %s', date_str)
            published_at = datetime.strptime(date_str, '%d %b %Y, %H:%M')
            loader.add_value('published_at', published_at)
        except (IndexError, ValueError) as e:
            # IndexError: no date element; ValueError: unexpected date format
            raise CloseSpider('cannot_parse_date: %s' % e)

        # Move scraped news to pipeline
        return loader.load_item()
Exemple #2
0
    def parse_news(self, response):
        """Build a ``News`` item from an article page.

        Returns ``None`` (nothing is emitted) when the title, author or
        content is missing; otherwise returns the loaded item.

        Raises:
            CloseSpider: if anything goes wrong while extracting or parsing
                the publication date.
        """
        # extract_first() returns None when the selector matches nothing, so
        # the guard below is actually reached; extract()[0] raised IndexError
        # before the guard could run.
        title = response.css('h1[itemprop="headline"]::text').extract_first()
        author_name = response.css(
            'a[rel="author"] > span::text').extract_first()
        raw_content = response.css('.content').extract_first()

        if not (title and author_name and raw_content):
            return

        self.logger.info('parse_news: %s' % response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('title', title)
        loader.add_value('author_name', author_name)
        loader.add_value('raw_content', raw_content)

        # Parse date information
        try:
            # Example: Selasa,  6 Oktober 2015 - 05:23 WIB
            date_time_str = response.css('article > div.time::text').extract()[0]
            # Keep what follows the weekday and cut the trailing ' WIB'
            date_time_str = date_time_str.split(',')[1].strip()[:-4]
            # NOTE(review): `_` presumably maps Indonesian month names to the
            # names strptime expects -- confirm against its definition.
            date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
            self.logger.info('parse_date: parse_news: date_str: %s', date_time_str)
            published_at = wib_to_utc(datetime.strptime(date_time_str, '%d %B %Y - %H:%M'))
            loader.add_value('published_at', published_at)
        except Exception as e:
            # Deliberately broad: any date failure stops the spider
            raise CloseSpider('cannot_parse_date: %s' % e)

        # Move scraped news to pipeline
        return loader.load_item()
Exemple #3
0
    def parse_news(self, response):
        """Build a ``News`` item from a JSON response.

        The body is expected to be a JSON array whose first element carries
        the keys ``url``, ``title``, ``content``, ``published`` and
        ``author``. Whenever a required part is missing or unparsable the
        partially filled item is returned early (per the original comments,
        such items are dropped by the item pipeline).
        """
        self.logger.info('parse_news: %s' % response)
        # Decode the body directly, as the other JSON callback in this file
        # does. str(body) is a no-op on Python 2 but yields "b'...'" on
        # Python 3, which is not valid JSON.
        parsed_news = json.loads(response.body)[0]

        # Initialize item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', parsed_news['url'])

        if not parsed_news['title']:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('title', parsed_news['title'])

        # Convert HTML text to a scrapy response
        html_response = HtmlResponse(url=parsed_news['url'],
                body=parsed_news['content'].encode('utf-8', 'ignore'))
        # Top-level body nodes that neither are nor contain comments, styles,
        # scripts, divs, spans, images or iframes
        xpath_query = '''
            //body/node()
                [not(descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::image|
                    descendant-or-self::img|
                    descendant-or-self::iframe
                )]
        '''
        raw_content_selectors = html_response.xpath(xpath_query)
        if not raw_content_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        loader.add_value('raw_content', raw_content)

        if not parsed_news['published']:
            # Will be dropped on the item pipeline
            return loader.load_item()

        # Parse date information
        # Example: 12 Oct 2016 - 05:25
        try:
            # Text after the first comma, minus its last four characters.
            # The split lives inside the try so a value without a comma is
            # dropped like any other unparsable date instead of crashing.
            date_time_str = parsed_news['published'].split(',')[1].strip()[:-4]
            date_time_str = ' '.join(
                [_(w) for w in date_time_str.split(' ')])
            published_at_wib = datetime.strptime(date_time_str,
                    '%d %b %Y - %H:%M')
        except (IndexError, ValueError):
            # Will be dropped on the item pipeline
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # A missing author is recorded as an empty string
        if not parsed_news['author']:
            loader.add_value('author_name', '')
        else:
            loader.add_value('author_name', parsed_news['author'])

        # Move scraped news to pipeline
        return loader.load_item()
Exemple #4
0
    def parse_news(self, article, sub_article):
        """Turn an article / sub-article dict pair into a ``News`` item.

        Returns ``None`` when any of the URL, title, reporter, description
        or publish date is empty. Raises ``CloseSpider`` when the publish
        date cannot be processed.
        """
        # Same checks, same order, still short-circuiting on the first miss
        if (not sub_article['news_url'] or not article['news_title']
                or not article['news_reporter']
                or not sub_article['news_description']
                or not article['news_date_publish']):
            return

        self.logger.info('parse_news: %s' % article)

        # Example: https://m.merdeka.com/tag/p/pilgub-dki/politik/nachrowi-pastikan-agus-sylvi-tak-cuma-incar-suara-santri-ulama.html
        full_url = 'https://www.merdeka.com' + sub_article['news_url']

        # Fill url, title, author and content straight from the dicts
        item_loader = ItemLoader(item=News())
        item_loader.add_value('url', full_url)
        item_loader.add_value('title', article['news_title'])
        item_loader.add_value('author_name', article['news_reporter'])
        item_loader.add_value('raw_content', sub_article['news_description'])

        # Publish date, e.g. 2016-10-12 15:16:04
        try:
            publish_stamp = article['news_date_publish']
            self.logger.info('parse_date: parse_news: date_str: %s',
                             publish_stamp)
            parsed_stamp = datetime.strptime(publish_stamp,
                                             '%Y-%m-%d %H:%M:%S')
            item_loader.add_value('published_at', wib_to_utc(parsed_stamp))
        except Exception as err:
            raise CloseSpider('cannot_parse_date: %s' % err)

        # Hand the item over to the pipeline
        return item_loader.load_item()
Exemple #5
0
    def parse_news(self, response):
        """Build a ``News`` item from a NewsML-style JSON response.

        Fields are read from nested dictionaries under
        ``NewsML -> NewsItem``. As soon as a required key is missing, a
        required value is empty, or the creation timestamp cannot be parsed,
        the item is returned with whatever has been loaded so far.
        """
        self.logger.info('parse_news: %s' % response)
        loader = ItemLoader(item=News(), response=response)
        json_response = json.loads(response.body)

        # Walk the shared prefix once; the later lookups reuse these nodes
        try:
            news_item = json_response['NewsML']['NewsItem']
            outer_component = news_item['NewsComponent']['NewsComponent']
            inner_component = outer_component['NewsComponent']
            news_lines = inner_component['NewsLines']
            url = news_lines['MoreLink']
        except KeyError:
            return loader.load_item()
        loader.add_value('url', url)

        try:
            title = news_lines['HeadLine']
        except KeyError:
            return loader.load_item()
        if not title:
            return loader.load_item()
        loader.add_value('title', title)

        try:
            raw_content = inner_component['ContentItem']['DataContent'][
                'nitf']['body']['body.content']['p']
        except KeyError:
            return loader.load_item()
        if not raw_content:
            return loader.load_item()
        loader.add_value('raw_content', raw_content)

        try:
            author_name = outer_component['Author']
        except KeyError:
            return loader.load_item()
        # An empty author is stored as an empty string
        if not author_name:
            loader.add_value('author_name', '')
        else:
            loader.add_value('author_name', author_name)

        try:
            date_time_str = news_item['NewsManagement']['FirstCreated']
        except KeyError:
            return loader.load_item()
        if not date_time_str:
            return loader.load_item()

        try:
            # Expected shape is '<YYYYMMDD>T<HMS>'. Unpacking raises
            # ValueError when the 'T' separator is missing or repeated, so
            # such values are dropped instead of raising IndexError.
            date_part, time_part = date_time_str.split('T')
            # The time part may have lost leading zeros; pad to 6 digits
            published_at_wib = datetime.strptime(
                '%s %s' % (date_part, time_part.zfill(6)), '%Y%m%d %H%M%S')
        except ValueError:
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        return loader.load_item()
Exemple #6
0
    def parse_news(self, response):
        """Build a ``News`` item from an article page.

        Loads url, media/election ids, title, publication date, author and
        raw HTML content. Returns the partially filled item when the title,
        date or content element is missing.

        Raises:
            CloseSpider: if the date text does not match
                ``'%d %B %Y %H:%M'``.
        """
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        loader.add_value('media_id', self.media_id)
        loader.add_value('election_id', self.election_id)

        # parse title
        title_selectors = response.css(
            'div.detail > article > div.detail_area > h1::text')
        if not title_selectors:
            return loader.load_item()
        title = title_selectors.extract_first()
        loader.add_value('title', title)

        # parse date
        date_selectors = response.css(
            "div.detail > article > div.detail_area > div.date::text")
        if not date_selectors:
            return loader.load_item()
        # Selasa 10 Oktober 2017, 13:40 WIB
        date_str = date_selectors.extract()[0]

        # Tokens 1..4 are day, month, year and time. A list comprehension is
        # used because filter() returns a non-subscriptable iterator on
        # Python 3.
        date_tokens = [s for s in re.split(r'[\s,]', date_str) if s][1:5]
        # NOTE(review): `_` presumably normalises month names for strptime
        # -- confirm against its definition.
        info_time = ' '.join([_(s) for s in date_tokens])

        # parse date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)

        # convert to utc+0
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # TODO check the published_at, if it is smaller than the last time
        # we crawl, just drop the data.

        # parse author name; 'N/A' when the byline is absent
        author_name_selectors = response.css(
            "div.detail > article > div.detail_area > div.author > strong::text"
        )
        if not author_name_selectors:
            loader.add_value('author_name', 'N/A')
        else:
            author_name = author_name_selectors.extract_first()
            loader.add_value('author_name', author_name)

        # parse raw content
        raw_content_selectors = response.css(
            "div.detail > article > div.text_detail.detail_area")
        if not raw_content_selectors:
            return loader.load_item()
        raw_content = raw_content_selectors.extract_first()
        loader.add_value('raw_content', raw_content)

        return loader.load_item()
Exemple #7
0
    def parse_news(self, response):
        """Extract a ``News`` item (title, content, date, author, url).

        Title, raw content and publication date are required: if any of
        them is missing or the date does not parse, the item is returned
        as-is so that the item pipeline can drop it.
        """
        self.logger.info('parse_news: %s' % response)

        item_loader = ItemLoader(item=News(), response=response)
        item_loader.add_value('url', response.url)

        headline_nodes = response.css('div.detail_area > h1.jdl::text')
        if not headline_nodes:
            # Incomplete item; left for the pipeline to drop
            return item_loader.load_item()
        item_loader.add_value('title', headline_nodes.extract()[0])

        # Body is selected with XPath (taken from the browser's "copy XPath"
        # tool) rather than CSS: every child node of the article container
        # except comments, scripts and divs.
        xpath_query = """
            //div[@class="text_detail detail_area"]/node()
                [not(self::comment()|self::script|self::div)]
        """
        body_nodes = response.xpath(xpath_query)
        if not body_nodes:
            # Incomplete item; left for the pipeline to drop
            return item_loader.load_item()
        item_loader.add_value('raw_content',
                              ' '.join(body_nodes.extract()).strip())

        # Date text looks like: Kamis 15 Sep 2016, 18:33 WIB
        date_nodes = response.css('div.detail_area > div.date::text')
        if not date_nodes:
            # Incomplete item; left for the pipeline to drop
            return item_loader.load_item()

        # Keep words 1..4 -> '15 Sep 2016, 18:33'
        date_words = date_nodes.extract()[0].split(' ')
        try:
            published_at_wib = datetime.strptime(' '.join(date_words[1:5]),
                                                 '%d %b %Y, %H:%M')
        except ValueError:
            # Incomplete item; left for the pipeline to drop
            return item_loader.load_item()
        item_loader.add_value('published_at', wib_to_utc(published_at_wib))

        # Author is optional and defaults to an empty string
        byline_nodes = response.css('div.author > strong::text')
        byline = byline_nodes.extract()[0] if byline_nodes else ''
        item_loader.add_value('author_name', byline)

        # Move scraped news to pipeline
        return item_loader.load_item()
    def parse_news(self, response):
        """Build a ``News`` item from an article page.

        Loads url, media/election ids, title, publication date, author and
        raw HTML content. Returns the partially filled item when the title,
        date or content element is missing or the date cannot be parsed.
        """
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        loader.add_value('media_id', self.media_id)
        loader.add_value('election_id', self.election_id)

        # parse title
        title_selectors = response.css(
            'div.main-container > div > section.main-content > h1.page-header::text'
        )
        if not title_selectors:
            return loader.load_item()
        title = title_selectors.extract_first()
        loader.add_value('title', title)

        # parse date
        date_selectors = response.css(
            'div.post-meta > div > div > div.submitted > span::text')
        if not date_selectors:
            return loader.load_item()
        date_str = date_selectors.extract_first()
        # Remove commas and dashes, then collapse runs of whitespace
        info_time = re.sub(r'[,-]', '', date_str)
        info_time = re.sub(r'\s+', ' ', info_time)
        # Keep the first four non-empty tokens. A list comprehension is used
        # because filter() returns a non-subscriptable iterator on Python 3.
        time_arr = [s for s in re.split(r'[\s,|-]', info_time) if s][:4]
        # NOTE(review): `_` presumably normalises month names for strptime
        # -- confirm against its definition.
        info_time = ' '.join([_(s) for s in time_arr])

        # parse date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
        except ValueError:
            return loader.load_item()

        # convert to utc+0
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # parse author name; 'N/A' when the byline is absent
        author_name = response.css(
            'div.post-meta > div > div > div.items-penulis > span > a::text'
        ).extract_first()
        if not author_name:
            loader.add_value('author_name', 'N/A')
        else:
            loader.add_value('author_name', author_name)

        # parse raw content
        raw_content_selectors = response.css(
            'div.region.region-content > section > article > div.field.field-name-body.field-type-text-with-summary > div.field-items > div.field-item.even'
        )
        if not raw_content_selectors:
            return loader.load_item()
        raw_content = raw_content_selectors.extract_first()
        loader.add_value('raw_content', raw_content)

        return loader.load_item()
    def parse_news(self, response):
        """Build a ``News`` item from an article page.

        The date and the reporter both come from the text of the
        ``div.mdk-date-reporter > span`` elements. Returns the partially
        filled item when the title, those spans or the content are missing.

        Raises:
            CloseSpider: if the date span has no ``|`` separator or its date
                part does not match ``'%d %B %Y %H:%M'``.
        """
        self.logger.info('parse_news: %s' % response)

        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        loader.add_value('media_id', self.media_id)
        loader.add_value('election_id', self.election_id)

        # parse title
        title_selectors = response.css('div#mdk-news-title::text')
        if not title_selectors:
            return loader.load_item()
        title = title_selectors.extract_first()
        loader.add_value('title', title)

        # parse date
        date_selectors = response.css("div.mdk-date-reporter > span::text")
        if not date_selectors:
            return loader.load_item()
        span_texts = date_selectors.extract()
        # The selector sometimes yields 3 and sometimes 2 elements; the date
        # is taken from the second-to-last one.
        date_str = span_texts[len(span_texts) - 2]

        # parse date information
        try:
            # eg: 8 September 2017 21:02
            date_str = date_str.split("|")[1].strip()
            time_arr = [s for s in re.split(r'[\s,|]', date_str) if s]
            # NOTE(review): `_` presumably normalises month names for
            # strptime -- confirm against its definition.
            info_time = ' '.join([_(s) for s in time_arr])
            published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
        except (IndexError, ValueError) as e:
            # IndexError: no '|' in the span; ValueError: unexpected format
            raise CloseSpider('cannot_parse_date: %s' % e)

        # convert to utc+0
        published_at = wib_to_utc(published_at_wib)

        loader.add_value('published_at', published_at)

        # parse author name from the second span ("<label>: <name>").
        # The spans were already extracted above, so re-checking the selector
        # for emptiness could never trigger the 'N/A' fallback; it now covers
        # a missing second span or a missing ':' instead.
        try:
            author_name = span_texts[1].split(":")[1].strip()
        except IndexError:
            author_name = 'N/A'
        loader.add_value('author_name', author_name)

        # parse raw content
        raw_content_selectors = response.css("div.mdk-body-paragraph")
        if not raw_content_selectors:
            return loader.load_item()
        raw_content = raw_content_selectors.extract_first()
        loader.add_value('raw_content', raw_content)

        return loader.load_item()
    def parse_news(self, response):
        """Build a ``News`` item from an article page.

        Loads url, media/election ids, title, publication date, author and
        the article paragraphs. Returns the partially filled item when the
        title, date or paragraphs are missing, or when the article is not
        newer than ``self.media['last_crawl_at']``.

        Raises:
            CloseSpider: if the date text does not match
                ``'%d %B %Y %H:%M'``.
        """
        self.logger.info('parse_news: %s' % response)

        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        loader.add_value('media_id', self.media_id)
        loader.add_value('election_id', self.election_id)

        # parse title
        title_selectors = response.css('div.artikel > h1.artikel::text')
        if not title_selectors:
            return loader.load_item()
        title = title_selectors.extract_first()
        loader.add_value('title', title)

        # parse date
        date_selectors = response.css('div.artikel > div.tanggal::text')
        if not date_selectors:
            return loader.load_item()
        date_str = date_selectors.extract_first()

        # eg: Tuesday, 12 September 2017 | 20:21 WIB
        # Drop the first token (weekday) and the last one (timezone). A list
        # comprehension is used because filter() returns a non-subscriptable
        # iterator on Python 3.
        time_arr = [s for s in re.split(r'[\s,|]', date_str) if s][1:-1]
        # NOTE(review): `_` presumably normalises month names for strptime
        # -- confirm against its definition.
        info_time = ' '.join([_(s) for s in time_arr])

        # parse date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)

        # convert to utc+0
        published_at = wib_to_utc(published_at_wib)

        # Nothing new since the last crawl: return without a published_at.
        # NOTE(review): the previous code also assigned a local
        # `is_no_update = True` here that was never read; if a spider-level
        # flag was intended it has to be set on `self` -- confirm.
        if self.media['last_crawl_at'] >= published_at:
            return loader.load_item()

        loader.add_value('published_at', published_at)

        # parse author name: the last <strong> text, or 'N/A' when there is
        # none
        author_name_selectors = response.css(
            'div.artikel > div > p > strong::text')
        if not author_name_selectors:
            loader.add_value('author_name', 'N/A')
        else:
            author_name = author_name_selectors.extract()[-1].strip()
            loader.add_value('author_name', author_name)

        # parse raw content; the list of paragraph HTML strings is passed to
        # the loader as-is
        raw_content_selectors = response.css('div.artikel > div > p')
        if not raw_content_selectors:
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        loader.add_value('raw_content', raw_content)

        return loader.load_item()
Exemple #11
0
    def parse_news(self, response):
        """Build a ``News`` item from an article page.

        Title, raw content and publication date are required; when one of
        them is missing or the date does not parse, the partially filled
        item is returned (per the original comments it is then dropped by
        the item pipeline). Multi-page articles are delegated to
        ``self.parse_indices``.
        """
        self.logger.info('parse_news: %s' % response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        # Required: title, raw_content, published_at
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('h1.detailtitle::text')
        if not title_selectors:
            # If error, drop from the item pipeline
            return loader.load_item()
        title = title_selectors.extract_first().strip()
        loader.add_value('title', title)

        # Parse date information
        date_time = response.css(
            'body > div > div.container > div.page-header > div::text'
        ).extract_first()
        if not date_time:
            # extract_first() returns None when the element is absent;
            # treat it like the other missing required fields instead of
            # failing with AttributeError on .strip().
            return loader.load_item()
        # Keep only what follows the last comma
        date_time = date_time.strip().split(',')[-1].strip()
        # NOTE(review): `_` presumably normalises month names for strptime
        # -- confirm against its definition.
        date_time = ' '.join([_(w) for w in date_time.split(' ')])
        try:
            published_at_wib = datetime.strptime(date_time, '%d %B %Y %H:%M')
        except ValueError:
            # If error, drop from the item pipeline
            return loader.load_item()

        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # If multipage
        multipage_selectors = response.css('.newsPagingWrap > a')
        if multipage_selectors:
            return self.parse_indices(multipage_selectors, loader)

        # Else if not multipage

        # Author is the last <strong> text inside the content paragraphs;
        # empty string when there is none
        author_name_selectors = response.css('.newsContent > p > strong::text')
        if not author_name_selectors:
            loader.add_value('author_name', '')
        else:
            author_name = author_name_selectors.extract()[-1].strip()
            loader.add_value('author_name', author_name)

        # Extract the news content
        raw_content_selectors = response.css('.newsContent > p')
        if not raw_content_selectors:
            # Drop from the item pipeline
            return loader.load_item()

        raw_content = ' '.join(raw_content_selectors.extract())
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        # Move scraped news to pipeline
        return loader.load_item()
Exemple #12
0
    def parse_news(self, response):
        """Build a ``News`` item from an article page.

        Loads url, media/election ids, title, publication date, author and
        raw HTML content. Returns the partially filled item when the title,
        date or content element is missing.

        Raises:
            CloseSpider: if the date text has no comma or its date part does
                not match ``'%d %b %Y %H:%M'``.
        """
        self.logger.info('parse_news: %s' % response)

        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        loader.add_value('media_id', self.media_id)
        loader.add_value('election_id', self.election_id)

        # parse title
        title_selectors = response.css('h1::text')
        if not title_selectors:
            return loader.load_item()
        title = title_selectors.extract_first()
        loader.add_value('title', title)

        # parse date
        date_selectors = response.css("div.date > span::text")
        if not date_selectors:
            return loader.load_item()
        date_str = date_selectors.extract()[0]

        # parse date information
        try:
            # eg: Selasa, 12 Sep 2017 20:08
            date_str = date_str.split(",")[1].strip()
            time_arr = [s for s in re.split(r'[\s,|]', date_str) if s]
            # NOTE(review): `_` presumably normalises month names for
            # strptime -- confirm against its definition.
            info_time = ' '.join([_(s) for s in time_arr])
            published_at_wib = datetime.strptime(info_time, '%d %b %Y %H:%M')
        except (IndexError, ValueError) as e:
            # IndexError: no ',' in the text; ValueError: unexpected format
            raise CloseSpider('cannot_parse_date: %s' % e)

        # convert to utc+0
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # TODO check the published_at, if it is smaller than the last time
        # we crawl, just drop the data.

        # parse author name from the second span under div.date. Pages with
        # fewer than two spans fall back to 'N/A' instead of raising
        # IndexError.
        date_spans = response.css("div.date > span")
        if len(date_spans) > 1:
            author_name_selectors = date_spans[1].css("span > span::text")
        else:
            author_name_selectors = None
        if not author_name_selectors:
            loader.add_value('author_name', 'N/A')
        else:
            author_name = author_name_selectors.extract_first()
            loader.add_value('author_name', author_name)

        # parse raw content
        raw_content_selectors = response.css("div.contentdetail")
        if not raw_content_selectors:
            return loader.load_item()
        raw_content = raw_content_selectors.extract_first()
        loader.add_value('raw_content', raw_content)

        return loader.load_item()
Exemple #13
0
    def parse_news(self, response):
        """Build a ``News`` item from an article page.

        Loads url, media/election ids, title, publication date, author and
        raw HTML content. Returns the partially filled item when the title,
        date or content element is missing.

        Raises:
            CloseSpider: if the date text does not match
                ``'%d %B %Y %H:%M'``.
        """
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        loader.add_value('media_id', self.media_id)
        loader.add_value('election_id', self.election_id)

        # parse title
        title_selectors = response.css('div.wrap-head > h2 > a::text')
        if not title_selectors:
            return loader.load_item()
        title = title_selectors.extract_first()
        loader.add_value('title', title)

        # parse date
        date_selectors = response.css('div.wrap-head > span::text')
        if not date_selectors:
            return loader.load_item()
        date_str = date_selectors.extract()[0]

        # eg: Tuesday, 12 September 2017 | 20:21 WIB
        # Drop the first token (weekday) and the last one (timezone). A list
        # comprehension is used because filter() returns a non-subscriptable
        # iterator on Python 3.
        time_arr = [s for s in re.split(r'[\s,|]', date_str) if s][1:-1]
        # NOTE(review): `_` presumably normalises month names for strptime
        # -- confirm against its definition.
        info_time = ' '.join([_(s) for s in time_arr])

        # parse date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)

        # convert to utc+0
        published_at = wib_to_utc(published_at_wib)

        loader.add_value('published_at', published_at)

        # parse author name; strip the 'Rep: ' / 'Red: ' labels, or use 'N/A'
        # when the element is absent
        author_name_selectors = response.css('div.red::text')
        if not author_name_selectors:
            loader.add_value('author_name', 'N/A')
        else:
            author_name = author_name_selectors.extract()[0].strip()
            author_name = author_name.replace('Rep: ',
                                              '').replace('Red: ', '').strip()
            loader.add_value('author_name', author_name)

        # parse raw content
        raw_content_selectors = response.css('div.content-detail')
        if not raw_content_selectors:
            return loader.load_item()
        raw_content = raw_content_selectors.extract_first()
        loader.add_value('raw_content', raw_content)

        return loader.load_item()
Exemple #14
0
    def parse_news_metro(self, response):
        """Build a ``News`` item from a "metro"-layout article page.

        Pages without the metro date block are handed to
        ``self.parse_news_pilkada``. When a "next" pagination link exists, a
        ``Request`` for it is returned with ``parse_next_page_metro`` as
        callback, carrying the loader and the content gathered so far.
        Otherwise the loaded item is returned; it is returned partially
        filled when the date cannot be parsed or the title/content is
        missing.

        Raises:
            CloseSpider: with reason ``'finished'`` when the article is not
                newer than ``self.media['last_scraped_at']``.
        """
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        date_selector = response.css('.artikel > div.block-tanggal::text')
        if not date_selector:
            return self.parse_news_pilkada(loader, response)
        try:
            # Text after the first comma, minus its last four characters
            date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
            # NOTE(review): `_` presumably normalises month names for
            # strptime -- confirm against its definition.
            date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
            published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
        except Exception:
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        # NOTE(review): the previous code also assigned a local
        # `is_no_update = True` here that was never read; if a spider-level
        # flag was intended it has to be set on `self` -- confirm.
        if self.media['last_scraped_at'] >= published_at:
            self.logger.info('Media have no update')
            raise CloseSpider('finished')
        loader.add_value('published_at', published_at)

        title_selector = response.css('.artikel > h1::text')
        if not title_selector:
            return loader.load_item()
        loader.add_value('title', title_selector.extract()[0])

        # Select all p which don't have iframe inside it
        raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
        if not raw_content_selector:
            return loader.load_item()
        # Concatenate the paragraphs' HTML without a separator
        raw_content = ''.join(raw_content_selector.extract())

        # Go to next page while there is next page button.
        # The XPath is relative to the .pagination-nb element(s): an absolute
        # '//a' would search the whole document and ignore the CSS scope.
        # descendant-or-self also covers the case where .pagination-nb is the
        # anchor itself.
        next_page_selector = response.css('.pagination-nb').xpath(
            'descendant-or-self::a[text()="next"]/@href')
        if next_page_selector:
            return Request(next_page_selector.extract()[0], callback=lambda x, loader=loader, raw_content=raw_content: self.parse_next_page_metro(x, loader, raw_content))

        loader.add_value('raw_content', raw_content)

        # The author usually put inside <strong> tag, however, some news is not using <strong> tag.
        # NOTE: this block of code may need revision in the future
        # Scan paragraphs from the end and take the last <strong> text made
        # up only of upper-case letters, whitespace, '.' and '|'.
        author_name = ''
        for author_name_selector in reversed(raw_content_selector):
            author_name_selector = author_name_selector.css('strong::text')
            for tmp in reversed(author_name_selector.extract()):
                tmp = tmp.strip()
                if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                    author_name = tmp
                    break
            if author_name:
                break
        # Several names separated by ' | ' become a comma-separated list
        author_name = ','.join(author_name.split(' | '))
        loader.add_value('author_name', author_name)
        return loader.load_item()
Exemple #15
0
    def parse_news(self, response):
        """Build a ``News`` item from an article page.

        Title, raw content and publication date are required; when one of
        them is missing or the date does not parse, the partially filled
        item is returned (per the original comments it is then dropped by
        the item pipeline).
        """
        self.logger.info('parse_news: {}'.format(response))

        # Init item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('div.content-detail > h4::text')
        if not title_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        title = title_selectors.extract()[0]
        loader.add_value('title', title)

        # Extract raw html, not the text
        raw_content_selectors = response.css('div.content-body')
        if not raw_content_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        # Example: Selasa, 11 Oktober 2016 | 10:48
        date_selectors = response.css('div.date::text')
        if not date_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        date_str = date_selectors.extract()[0]
        # Example: 11 October 2016 10:48
        # Remove empty tokens BEFORE skipping the weekday: with leading
        # whitespace in the text node the first token is '' and the old
        # `[1:]` skipped that instead of the weekday, so parsing always
        # failed.
        date_tokens = [s for s in re.split(r'[\s,|-]', date_str) if s]
        # NOTE(review): `_` presumably normalises month names for strptime
        # -- confirm against its definition.
        date_str = ' '.join([_(s) for s in date_tokens[1:]])

        # Parse date information
        try:
            published_at_wib = datetime.strptime(date_str, '%d %B %Y %H:%M')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # Author is the part of the first paragraph text before any '/';
        # empty string when the paragraph is absent
        author_selectors = response.css('div.content-detail > p::text')
        if not author_selectors:
            loader.add_value('author_name', '')
        else:
            author_name = author_selectors.extract()[0]
            author_name = author_name.split('/')[0]
            loader.add_value('author_name', author_name)

        # Move scraped news to pipeline
        return loader.load_item()
Exemple #16
0
    def parse_news(self, response):
        """Build a ``News`` item from an article page.

        Loads url, media/election ids, title, publication date, a fixed
        ``'N/A'`` author and the raw HTML content. Returns the partially
        filled item when the title, date or content element is missing or
        the date cannot be parsed.
        """
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        loader.add_value('media_id', self.media_id)
        loader.add_value('election_id', self.election_id)

        # parse title
        title_selectors = response.css(
            'div.pa15.bgwhite > h1.f32.fno.crimson::text')
        if not title_selectors:
            return loader.load_item()
        title = title_selectors.extract_first()
        loader.add_value('title', title)

        # parse date
        date_selectors = response.css(
            'div.pa15.bgwhite > div.mt10.mb10 > time.grey.f13.dip::text')
        if not date_selectors:
            return loader.load_item()
        date_str = date_selectors.extract_first()
        # eg: Kompas.com - 10/10/2017, 13:37 WIB
        # For that documented shape the date sits BEFORE the comma, so the
        # previous "text after the comma" logic was left with '13:37 WIB'
        # and could never match its format. Try the numeric layout first.
        # NOTE(review): assumes day/month/year order -- confirm.
        numeric_date = re.search(
            r'(\d{1,2}/\d{1,2}/\d{4}),\s*(\d{1,2}:\d{2})', date_str)

        # parse date information
        try:
            if numeric_date:
                published_at_wib = datetime.strptime(
                    ' '.join(numeric_date.groups()), '%d/%m/%Y %H:%M')
            else:
                # Fallback, as before: '<weekday>, <day> <month> <year> <time>'
                info_time = date_str.split(',')[1].strip()
                time_arr = [
                    s for s in re.split(r'[\s,|]', info_time) if s][:4]
                # NOTE(review): `_` presumably normalises month names for
                # strptime -- confirm against its definition.
                info_time = ' '.join([_(s) for s in time_arr])
                published_at_wib = datetime.strptime(
                    info_time, '%d %B %Y %H:%M')
        except (IndexError, ValueError):
            # IndexError: no ',' in the text; ValueError: unexpected format
            return loader.load_item()

        # convert to utc+0
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # Author extraction (div.read__author > a) was commented out in the
        # previous code; the placeholder is always stored.
        loader.add_value('author_name', 'N/A')

        # parse raw content
        raw_content_selectors = response.css(
            'div.ptb15 > div.txt-article.mb20')
        if not raw_content_selectors:
            return loader.load_item()
        raw_content = raw_content_selectors.extract_first()
        loader.add_value('raw_content', raw_content)

        return loader.load_item()
Exemple #17
0
    def parse_news(self, response):
        """Load url, ids, title, date, author and content into a News item.

        A missing title, date node or content node ends processing early and
        returns whatever has been loaded up to that point.  A date that
        cannot be parsed raises CloseSpider.
        """
        self.logger.info('parse_news: %s' % response)

        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('media_id', self.media_id)
        loader.add_value('election_id', self.election_id)

        # Title
        heading = response.css('section.main-content > h1::text')
        if not heading:
            return loader.load_item()
        loader.add_value('title', heading.extract_first())

        # Date, e.g. "5 September, 2017 - 18:54"
        submitted = response.css("div.submitted > span::text")
        if not submitted:
            return loader.load_item()
        stamp = submitted.extract_first()

        # Split on whitespace, commas and dashes, discard empty pieces and
        # pass each remaining piece through _()
        tokens = [_(piece) for piece in re.split('[\s,-]', stamp) if piece]
        normalized = ' '.join(tokens)

        try:
            local_dt = datetime.strptime(normalized, '%d %B %Y %H:%M')
        except ValueError as err:
            raise CloseSpider('cannot_parse_date: %s' % err)

        # Convert to utc+0
        loader.add_value('published_at', wib_to_utc(local_dt))

        # Author, with a placeholder when the byline link is absent
        author_links = response.css(
            "div.items-penulis > span > a::text")
        if author_links:
            loader.add_value('author_name',
                             author_links.extract()[0].strip())
        else:
            loader.add_value('author_name', 'N/A')

        # Raw content: every matching node is handed to the loader
        body_nodes = response.css("div.field-item.even")
        if body_nodes:
            loader.add_value('raw_content', body_nodes.extract())

        return loader.load_item()
    def parse_news(self, response):
        """Build a News item (url, title, author, content, date) from a page.

        Title, raw content and publication date are required: when one of
        them is missing or the date cannot be parsed, the item loaded so far
        is returned (it will be dropped on the item pipeline).  A missing
        author is stored as an empty string.

        Args:
            response: response object for the article page.

        Returns:
            The result of ``loader.load_item()``.
        """
        self.logger.info('parse_news: %s' % response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('h1[itemprop="headline"]::text')
        if not title_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        title = title_selectors.extract()[0]
        loader.add_value('title', title)

        author_name_selectors = response.css('a[rel="author"] > span::text')
        if not author_name_selectors:
            loader.add_value('author_name', '')
        else:
            author_name = author_name_selectors.extract()[0]
            loader.add_value('author_name', author_name)

        raw_content_selectors = response.css('.content')
        if not raw_content_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        # Join the HTML of every matching node into one string
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        date_time_str_selectors = response.css('article > div.time::text')
        if not date_time_str_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()

        # Parse date information
        # Example: Selasa,  6 Oktober 2015 - 05:23 WIB
        date_time_str = date_time_str_selectors.extract()[0]
        date_parts = date_time_str.split(',')
        if len(date_parts) < 2:
            # No comma after the day name: treat as an unparseable date
            # instead of raising IndexError.
            # Will be dropped on the item pipeline
            return loader.load_item()
        # Keep the part after the day name and cut the last four characters
        # (' WIB' in the example above)
        date_time_str = date_parts[1].strip()[:-4]
        # _() is applied to every word (presumably translating month names
        # to English -- confirm against its definition)
        date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
        try:
            published_at_wib = datetime.strptime(date_time_str,
                                                 '%d %B %Y - %H:%M')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # Move scraped news to pipeline
        return loader.load_item()
Exemple #19
0
    def parse_news(self, response):
        """Turn an article page into a News item.

        Title, raw HTML content and publication date are mandatory: when one
        of them is missing, or the date does not parse, whatever has been
        loaded so far is returned (such items will be dropped on the item
        pipeline).  A missing author is stored as ''.
        """
        self.logger.info('parse_news: {}'.format(response))

        # Set up the loader and record where the article came from
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        headline = response.css('h1.title-big-detail::text')
        if not headline:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('title', headline.extract()[0].strip())

        # Keep the raw HTML of the article body, not just its text
        body = response.css('div.detail-content')
        if not body:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('raw_content', body.extract()[0])

        stamp_nodes = response.css(
            'span.meta-author > span:nth-child(3)::text')
        if not stamp_nodes:
            # Will be dropped on the item pipeline
            return loader.load_item()
        # Example: Sabtu, 1 Oktober 2016, 15:47 WIB
        raw_stamp = stamp_nodes.extract()[0].strip()
        # Remove commas, then discard the first and last words (day name and
        # 'WIB' in the example); each remaining word goes through _()
        # Example: 1 October 2016 15:47
        words = raw_stamp.replace(',', '').split(' ')[1:-1]
        normalized = ' '.join([_(word) for word in words])
        try:
            local_dt = datetime.strptime(normalized, '%d %B %Y %H:%M')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('published_at', wib_to_utc(local_dt))

        byline = response.css('span.meta-author > span > b::text')
        loader.add_value('author_name',
                         byline.extract()[0] if byline else '')

        # Hand the item over to the pipeline
        return loader.load_item()
    def parse_news(self, response):
        """Build a News item from an article page.

        Values are added in this order: url, media_id, election_id, title,
        published_at, author_name, raw_content.  If the title, the date or
        the content cannot be extracted, the item loaded so far is returned
        immediately.  A missing author is stored as 'N/A'.

        Args:
            response: response object for the article page.

        Returns:
            The result of ``loader.load_item()``; it may be only partially
            filled (NOTE(review): presumably rejected later by the item
            pipeline -- confirm).
        """
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        loader.add_value('media_id', self.media_id)
        loader.add_value('election_id', self.election_id)

        # parse title
        title_selectors = response.css('h1.read__title::text')
        if not title_selectors:
            return loader.load_item()
        title = title_selectors.extract_first()
        loader.add_value('title', title)

        # parse date
        date_selectors = response.css('div.read__time::text')
        if not date_selectors:
            return loader.load_item()
        date_str = date_selectors.extract_first()
        # eg: Kompas.com - 10/10/2017, 13:37 WIB
        # Tokens 1 and 2 are the date and the time in that example.  A list
        # comprehension (instead of filter()) keeps the result sliceable on
        # Python 3, where filter() returns an iterator.
        time_arr = [s for s in re.split(r'[\s,-]', date_str) if s][1:3]
        info_time = ' '.join([_(s) for s in time_arr])

        # parse date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d/%m/%Y %H:%M')
        except ValueError:
            return loader.load_item()

        # convert to utc+0
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # parse author name, falling back to a placeholder
        author_name = response.css(
            'div.read__author > a::text').extract_first()
        loader.add_value('author_name', author_name or 'N/A')

        # parse raw content
        raw_content_selectors = response.css('div.read__content')
        if not raw_content_selectors:
            return loader.load_item()
        raw_content = raw_content_selectors.extract_first()
        loader.add_value('raw_content', raw_content)

        return loader.load_item()
Exemple #21
0
    def parse_news(self, response):
        """Build a News item from an article page.

        Values are added in this order: url, title, published_at,
        author_name, raw_content.  If the title, the date node or the
        content is missing, the item loaded so far is returned immediately.
        A missing author is stored as 'N/A'.

        Args:
            response: response object for the article page.

        Returns:
            The result of ``loader.load_item()``.

        Raises:
            CloseSpider: when the date text does not match
                '%d %B %Y %H:%M'.
        """
        self.logger.info('parse_news: %s' % response)

        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        # parse title
        title_selectors = response.css('h1.read__title::text')
        if not title_selectors:
            return loader.load_item()
        title = title_selectors.extract_first()
        loader.add_value('title', title)

        # parse date
        date_selectors = response.css('div.read__date::text')
        if not date_selectors:
            return loader.load_item()
        date_str = date_selectors.extract()[0]

        # eg: Tuesday, 12 September 2017 | 20:21 WIB
        # Drop the first and last tokens (day name and 'WIB' in the example).
        # A list comprehension (instead of filter()) keeps the result
        # sliceable on Python 3, where filter() returns an iterator.
        time_arr = [s for s in re.split(r'[\s,|]', date_str) if s][1:-1]
        info_time = ' '.join([_(s) for s in time_arr])

        # parse date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)

        # convert to utc+0
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # parse author name, falling back to a placeholder
        author_name = response.css(
            'div.contentArticle.box-shadow-new > h6::text').extract_first()
        loader.add_value('author_name', author_name or 'N/A')

        # parse raw content: every matching node is handed to the loader
        raw_content = response.css(
            'div.contentArticle.box-shadow-new').extract()
        if not raw_content:
            return loader.load_item()
        loader.add_value('raw_content', raw_content)

        return loader.load_item()
Exemple #22
0
    def parse_news(self, response):
        """Turn an article page into a News item.

        Required fields are title, raw_content and published_at; if any of
        them cannot be obtained the partially loaded item is returned (it
        will be dropped on the item pipeline).  A missing author is stored
        as ''.
        """
        self.logger.info('parse_news: %s' % response)

        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        headline = response.css('div.detail_area > h1.jdl::text')
        if not headline:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('title', headline.extract()[0])

        body = response.css('article > div.text_detail')
        if not body:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('raw_content', body.extract()[0])

        # Date text, e.g. "Kamis 15 Sep 2016, 18:33 WIB"
        stamp_nodes = response.css('div.detail_area > div.date::text')
        if not stamp_nodes:
            # Will be dropped on the item pipeline
            return loader.load_item()

        # Keep words 1..4, e.g. '15 Sep 2016, 18:33'
        words = stamp_nodes.extract()[0].split(' ')
        trimmed = ' '.join(words[1:5])
        try:
            local_dt = datetime.strptime(trimmed, '%d %b %Y, %H:%M')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('published_at', wib_to_utc(local_dt))

        byline = response.css('div.author > strong::text')
        loader.add_value('author_name',
                         byline.extract()[0] if byline else '')

        # Hand the item over to the pipeline
        return loader.load_item()
Exemple #23
0
    def parse_news(self, response):
        """Build a News item (url, title, content, date, author) from a page.

        Title, raw content and publication date are required: when one of
        them is missing or the date cannot be parsed, the item loaded so far
        is returned (it will be dropped on the item pipeline).  A missing
        author is stored as an empty string.

        Args:
            response: response object for the article page.

        Returns:
            The result of ``loader.load_item()``.
        """
        self.logger.info('parse_news: {}'.format(response))

        # Init item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('div.detail_text > h1::text')
        if not title_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        title = title_selectors.extract()[0]
        loader.add_value('title', title)

        # Extract raw html, not the text
        raw_content_selectors = response.css('div.detail_text')
        if not raw_content_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        raw_content = raw_content_selectors.extract()[0]
        loader.add_value('raw_content', raw_content)

        # Example: Senin, 10/10/2016 05:12
        date_selectors = response.css('div.date::text')
        if not date_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        date_parts = date_selectors.extract()[0].split(',')
        if len(date_parts) < 2:
            # No comma after the day name: treat as an unparseable date
            # instead of raising IndexError.
            # Will be dropped on the item pipeline
            return loader.load_item()
        # Example: 10/10/2016 05:12
        date_str = date_parts[1].strip()
        # Parse date information
        try:
            published_at_wib = datetime.strptime(date_str, '%d/%m/%Y %H:%M')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        author_name_selectors = response.css('div.author > strong::text')
        if not author_name_selectors:
            loader.add_value('author_name', '')
        else:
            author_name = author_name_selectors.extract()[0]
            loader.add_value('author_name', author_name)

        # Move scraped news to pipeline
        return loader.load_item()
Exemple #24
0
    def parse_news(self, response):
        """Build a News item from a JSON article response.

        The body is parsed as JSON and its first element is used; the keys
        read from it are 'title', 'content', 'published' and 'author'.
        Title, content and publication date are required: when one of them
        is empty or unusable, the item loaded so far is returned (it will be
        dropped on the item pipeline).  An empty author is stored as ''.

        Args:
            response: response object whose body is the JSON document.

        Returns:
            The result of ``loader.load_item()``.
        """
        self.logger.info('parse_news: %s' % response)
        # On Python 3 the body is bytes and str() would produce "b'...'",
        # which is not valid JSON, so decode it instead
        # (assumes UTF-8 -- TODO confirm against the API).
        body = response.body
        if not isinstance(body, str):
            body = body.decode('utf-8')
        parsed_news = json.loads(body)
        parsed_news = parsed_news[0]

        # Initialize item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        if not parsed_news['title']:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('title', parsed_news['title'])

        if not parsed_news['content']:
            # Will be dropped on the item pipeline
            return loader.load_item()
        # Keep only what is inside <body>...</body>
        body_match = re.search(r'<body>(.*)</body>', parsed_news['content'],
                               re.S | re.I)
        if body_match is None:
            # No <body> wrapper: there is no usable content, so return the
            # partial item instead of failing on None.group().
            # Will be dropped on the item pipeline
            return loader.load_item()
        # Strip <img> tags from the extracted HTML
        content = re.sub(r'<img[^>]+\>', '', body_match.group(1))
        loader.add_value('raw_content', content)

        if not parsed_news['published']:
            # Will be dropped on the item pipeline
            return loader.load_item()

        # Parse date information
        published_parts = parsed_news['published'].split(',')
        if len(published_parts) < 2:
            # No comma-separated date part: treat as an unparseable date
            # instead of raising IndexError.
            # Will be dropped on the item pipeline
            return loader.load_item()
        # Take the part after the first comma, cut its last four characters
        # and pass every word through _()
        # Example: 12 Oct 2016 - 05:25
        date_time_str = ' '.join(
            [_(w) for w in published_parts[1].strip()[:-4].split(' ')])
        try:
            published_at_wib = datetime.strptime(date_time_str,
                                                 '%d %b %Y - %H:%M')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        loader.add_value('author_name', parsed_news['author'] or '')

        # Move scraped news to pipeline
        return loader.load_item()
Exemple #25
0
    def parse_news(self, response):
        """Turn a non-video article page into a News item.

        Returns None for pages whose first breadcrumb link reads 'VIDEO'.
        Any failure while normalising or parsing the date raises
        CloseSpider.
        """
        self.logger.info('parse_news: {}'.format(response))

        # Video pages carry no author, so they are skipped entirely
        first_crumb = response.css(
            'ul.breadcrumb > li > a::text').extract()[0]
        if first_crumb == 'VIDEO':
            return

        # Set up the loader and record where the article came from
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title = response.css('div.part.lead.pr > h1::text').extract()[0]

        # The info line holds "<author> - <date>"
        info_fields = response.css(
            'div.part.lead.pr > span::text').extract()[0].split('-')
        author_name = info_fields[0].strip()
        # Example: 10 Oktober 2016 21:10 wib
        raw_stamp = info_fields[1].strip()

        # Raw HTML of every article part, joined into one string
        raw_content = ' '.join(response.css('div.part.article').extract())

        try:
            # Cut the last four characters, then pass each word through _()
            # Example: 10 October 2016 21:10
            normalized = ' '.join(
                [_(word) for word in raw_stamp[:-4].split(' ')])
            self.logger.info(
                'parse_date: parse_news: date_str: {}'.format(normalized))
            local_dt = datetime.strptime(normalized, '%d %B %Y %H:%M')
            loader.add_value('published_at', wib_to_utc(local_dt))
        except Exception as err:
            raise CloseSpider('cannot_parse_date: {}'.format(err))

        loader.add_value('title', title)
        loader.add_value('author_name', author_name)
        loader.add_value('raw_content', raw_content)

        # Hand the item over to the pipeline
        return loader.load_item()
Exemple #26
0
    def parse_news(self, response):
        """Turn an article page into a News item.

        Title, first content paragraph and publication date are mandatory;
        when one is missing, or self.convert_date() fails, the partially
        loaded item is returned (it will be dropped on the item pipeline).
        All author text nodes are joined with ', '; none yields ''.
        """
        self.logger.info('parse_news: %s' % response)
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        headline = response.css("div.kcm-read-top > h2::text")
        if not headline:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('title', headline.extract()[0])

        paragraphs = response.css("div.kcm-read-text > p")
        if not paragraphs:
            # Will be dropped on the item pipeline
            return loader.load_item()
        # Only the first matching paragraph is stored
        loader.add_value('raw_content', paragraphs.extract()[0])

        stamp_nodes = response.css("div.kcm-date::text")
        if not stamp_nodes:
            # Will be dropped on the item pipeline
            return loader.load_item()
        raw_stamp = stamp_nodes.extract()[0]
        try:
            published_at = self.convert_date(raw_stamp)
        except Exception:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('published_at', published_at)

        bylines = response.css("span.pb_10::text")
        author_name = ', '.join(bylines.extract()) if bylines else ''
        loader.add_value('author_name', author_name)

        return loader.load_item()
Exemple #27
0
    def parse_news(self, response):
        """Turn an article page into a News item.

        Title, author, date and content are read without existence checks.
        Any failure while normalising or parsing the date raises
        CloseSpider.
        """
        self.logger.info('parse_news: {}'.format(response))

        # Set up the loader and record where the article came from
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title = response.css('h1.title-big-detail::text').extract()[0].strip()

        # First span text is the author, last one the date
        meta_texts = response.css('span.meta-author span::text').extract()
        author_name = meta_texts[0].strip()
        # Example: Sabtu, 1 Oktober 2016, 15:47 WIB
        raw_stamp = meta_texts[-1].strip()

        # Raw HTML of the article body, not just its text
        raw_content = response.css('div.detail-content').extract()[0]

        try:
            # Remove commas, drop the first and last words, then pass each
            # remaining word through _()
            # Example: 1 October 2016 15:47
            words = raw_stamp.replace(',', '').split(' ')[1:-1]
            normalized = ' '.join([_(word) for word in words])
            self.logger.info(
                'parse_date: parse_news: date_str: {}'.format(normalized))

            local_dt = datetime.strptime(normalized, '%d %B %Y %H:%M')
            loader.add_value('published_at', wib_to_utc(local_dt))
        except Exception as err:
            raise CloseSpider('cannot_parse_date: {}'.format(err))

        loader.add_value('title', title)
        loader.add_value('author_name', author_name)
        loader.add_value('raw_content', raw_content)

        # Hand the item over to the pipeline
        return loader.load_item()
Exemple #28
0
    def parse_news(self, response):
        """Turn an article page into a News item.

        Required fields are title, raw_content and published_at; if any of
        them cannot be obtained the partially loaded item is returned (it
        will be dropped on the item pipeline).  A missing author is stored
        as ''.
        """
        self.logger.info('parse_news: %s' % response)

        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        headline = response.css('h1.article-header__title::text')
        if not headline:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('title', headline.extract()[0])

        # XPath rather than CSS: take the child nodes of the content div
        # except comments, <script> and <div> nodes
        content_xpath = """
            //div[@class="article-raw-content"]/node()
                [not(
                    self::comment()|
                    self::script|
                    self::div)]
        """
        content_nodes = response.xpath(content_xpath)
        if not content_nodes:
            # Will be dropped on the item pipeline
            return loader.load_item()
        fragments = [chunk.strip() for chunk in content_nodes.extract()]
        loader.add_value('raw_content', ' '.join(fragments).strip())

        # Date text, e.g. ' pada 18 Okt 2016, 08:33 WIB'
        stamp_nodes = response.css('span.article-header__datetime::text')
        if not stamp_nodes:
            # Will be dropped on the item pipeline
            return loader.load_item()

        # Drop the first and last words and pass the rest through _()
        # Example: '18 Oct 2016, 08:33'
        words = stamp_nodes.extract()[0].strip().split(' ')[1:-1]
        normalized = ' '.join([_(word) for word in words])
        try:
            local_dt = datetime.strptime(normalized, '%d %b %Y, %H:%M')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('published_at', wib_to_utc(local_dt))

        byline = response.css(
            'a.article-header__author-link::text')
        loader.add_value('author_name',
                         byline.extract()[0] if byline else '')

        # Hand the item over to the pipeline
        return loader.load_item()
    def parse_news(self, response):
        """Build a News item (url, title, date, author, content) from a page.

        Title, publication date and raw content are required: when one of
        them is missing or the date cannot be parsed, the item loaded so far
        is returned (it will be dropped on the item pipeline).  The author
        is always stored as an empty string.

        Args:
            response: response object for the article page.

        Returns:
            The result of ``loader.load_item()``.
        """
        self.logger.info('parse_news: %s' % response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        # Required: title, raw_content, published_at
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('div.NewsTitle > h1::text')
        if not title_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        title = title_selectors.extract()[0]
        loader.add_value('title', title.strip())

        # Parse date information
        # Example: 27 Oct 2016, 18:33:36 WIB
        date_selectors = response.css('div.NewsDate::text')
        if not date_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()

        # The example above has a comma after the year while the format
        # below has none; removing commas and surrounding whitespace lets
        # both spellings parse instead of rejecting the documented form.
        date_str = date_selectors.extract()[0].replace(',', '').strip()
        try:
            published_at_wib = datetime.strptime(date_str,
                                                 '%d %b %Y %H:%M:%S WIB')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()

        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # no author
        loader.add_value('author_name', '')

        # Extract the content using XPath instead of CSS selector
        # We get the XPath from chrome developer tools (copy XPath)
        # or equivalent tools from other browser
        xpath_query = """
            //div[@class="pad10"]/p/node()
                [not(
                    descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::img|
                    descendant-or-self::table|
                    descendant-or-self::iframe
                )]
        """
        raw_content_selectors = response.xpath(xpath_query)
        if not raw_content_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        # Join the stripped HTML of every selected node into one string
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        # Move scraped news to pipeline
        return loader.load_item()
Exemple #30
0
    def parse_news(self, response):
        """Turn an article page into a News item.

        Title, filtered raw content and publication date are mandatory: when
        one of them is missing, or the date does not parse, whatever has
        been loaded so far is returned (such items will be dropped on the
        item pipeline).  A missing author is stored as ''.
        """
        self.logger.info('parse_news: {}'.format(response))

        # Set up the loader and record where the article came from
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        headline = response.css('h1.title-big-detail::text')
        if not headline:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('title', headline.extract()[0].strip())

        # Raw HTML of the content nodes, leaving out the noise: comments,
        # styles, scripts, divs, spans, images, tables, iframes and the
        # share button link
        content_xpath = '''
            //div[@class="detail-content"]/node()
                [not(
                    descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::img|
                    descendant-or-self::table|
                    descendant-or-self::iframe|
                    descendant-or-self::a[@class="share-btn-right shared"]
                )]
        '''
        content_nodes = response.xpath(content_xpath)
        if not content_nodes:
            # Will be dropped on the item pipeline
            return loader.load_item()
        fragments = [chunk.strip() for chunk in content_nodes.extract()]
        loader.add_value('raw_content', ' '.join(fragments))

        stamp_nodes = response.css(
            'span.meta-author > span:nth-child(3)::text')
        if not stamp_nodes:
            # Will be dropped on the item pipeline
            return loader.load_item()
        # Example: Sabtu, 1 Oktober 2016, 15:47 WIB
        raw_stamp = stamp_nodes.extract()[0].strip()
        # Remove commas, then discard the first and last words (day name and
        # 'WIB' in the example); each remaining word goes through _()
        # Example: 1 October 2016 15:47
        words = raw_stamp.replace(',', '').split(' ')[1:-1]
        normalized = ' '.join([_(word) for word in words])
        try:
            local_dt = datetime.strptime(normalized, '%d %B %Y %H:%M')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('published_at', wib_to_utc(local_dt))

        byline = response.css('span.meta-author > span > b::text')
        loader.add_value('author_name',
                         byline.extract()[0] if byline else '')

        # Hand the item over to the pipeline
        return loader.load_item()