def parse_news(self, response):
    """Parse a Detik-style article page into a News item.

    Extracts url, title, author_name, raw_content and published_at.
    Missing title/content returns the partial item so the pipeline can
    drop it; an unparseable date raises CloseSpider.
    """
    self.logger.info('parse_news: %s' % response)

    # Initialize item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_selectors = response.css('div.detail_area > h1.jdl::text')
    if not title_selectors:
        # BUG FIX: extract()[0] on an empty selector raised IndexError;
        # return the partial item and let the pipeline drop it instead.
        return loader.load_item()
    loader.add_value('title', title_selectors.extract()[0])

    author_name_selectors = response.css('div.author > strong::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', author_name_selectors.extract()[0])

    raw_content_selectors = response.css('article > div.text_detail')
    if not raw_content_selectors:
        return loader.load_item()
    # BUG FIX: extract()[0] is already a single string; the previous
    # ' '.join(raw_content) joined its *characters* with spaces.
    raw_content = raw_content_selectors.extract()[0]
    loader.add_value('raw_content', raw_content)

    # Parse date information
    try:
        # Example: Kamis 15 Sep 2016, 18:33 WIB
        date_str = response.css(
            'div.detail_area > div.date::text').extract()[0]
        # Example: '15 Sep 2016, 18:33'
        date_str = ' '.join(date_str.split(' ')[1:5])
        self.logger.info('parse_date: parse_news: date_str: %s', date_str)
        # Site reports WIB (UTC+7); store UTC for consistency with the
        # other spiders in this project.
        published_at = wib_to_utc(
            datetime.strptime(date_str, '%d %b %Y, %H:%M'))
        loader.add_value('published_at', published_at)
    except Exception as e:
        raise CloseSpider('cannot_parse_date: %s' % e)

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Parse an article page into a News item.

    Returns None when any required field is missing; raises CloseSpider
    when the publish date cannot be parsed.
    """
    self.logger.info('parse_news: %s' % response)

    # BUG FIX: the original called extract()[0] *before* the emptiness
    # check, so a missing node raised IndexError and the guard below
    # never ran. extract_first() returns None instead.
    title = response.css('h1[itemprop="headline"]::text').extract_first()
    author_name = response.css('a[rel="author"] > span::text').extract_first()
    raw_content = response.css('.content').extract_first()
    if not (title and author_name and raw_content):
        return

    # Initialize item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('title', title)
    loader.add_value('author_name', author_name)
    loader.add_value('raw_content', raw_content)

    # Parse date information
    try:
        # Example: Selasa, 6 Oktober 2015 - 05:23 WIB
        date_time_str = response.css('article > div.time::text').extract()[0]
        # Drop the day name and the trailing ' WIB', translate months
        date_time_str = date_time_str.split(',')[1].strip()[:-4]
        date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
        self.logger.info('parse_date: parse_news: date_str: %s',
                         date_time_str)
        published_at = wib_to_utc(
            datetime.strptime(date_time_str, '%d %B %Y - %H:%M'))
        loader.add_value('published_at', published_at)
    except Exception as e:
        raise CloseSpider('cannot_parse_date: %s' % e)

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Parse one article from a JSON API response into a News item.

    The response body is a JSON array; only the first element is used.
    Any missing required field (title, raw_content, published_at)
    returns a partial item, which the item pipeline drops.
    """
    self.logger.info('parse_news: %s' % response)
    # NOTE(review): str(response.body) only round-trips to valid JSON on
    # Python 2 (bytes == str); on Python 3 it would prepend "b'" -- confirm
    # before porting.
    parsed_news = json.loads(str(response.body))[0]

    # Initialize item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', parsed_news['url'])

    if not parsed_news['title']:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', parsed_news['title'])

    # Convert HTML text to a scrapy response so XPath can be applied
    html_response = HtmlResponse(
        url=parsed_news['url'],
        body=parsed_news['content'].encode('utf-8', 'ignore'))
    # Keep only top-level body nodes that contain no markup considered
    # noise (comments, styles, scripts, layout divs/spans, media, iframes)
    xpath_query = '''
        //body/node()
        [not(descendant-or-self::comment()|
        descendant-or-self::style|
        descendant-or-self::script|
        descendant-or-self::div|
        descendant-or-self::span|
        descendant-or-self::image|
        descendant-or-self::img|
        descendant-or-self::iframe
        )]
    '''
    raw_content_selectors = html_response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    loader.add_value('raw_content', raw_content)

    if not parsed_news['published']:
        # Will be dropped on the item pipeline
        return loader.load_item()

    # Parse date information
    # Example: 12 Oct 2016 - 05:25
    # Drop the day name and trailing ' WIB', translating month names via _()
    date_time_str = ' '.join(
        [_(w) for w in
         parsed_news['published'].split(',')[1].strip()[:-4].split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time_str,
                                             '%d %b %Y - %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    # Convert WIB (UTC+7) to UTC via the project helper
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    # Author is optional; default to an empty string
    if not parsed_news['author']:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', parsed_news['author'])

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, article, sub_article):
    """Assemble a News item from pre-parsed article dictionaries.

    Returns None when any required field is missing; raises CloseSpider
    when the publish date cannot be parsed.
    """
    required = (sub_article['news_url'], article['news_title'],
                article['news_reporter'], sub_article['news_description'],
                article['news_date_publish'])
    if not all(required):
        return

    self.logger.info('parse_news: %s' % article)
    # Example: https://m.merdeka.com/tag/p/pilgub-dki/politik/nachrowi-pastikan-agus-sylvi-tak-cuma-incar-suara-santri-ulama.html
    url = 'https://www.merdeka.com' + sub_article['news_url']

    # Initialize item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News())
    loader.add_value('url', url)
    loader.add_value('title', article['news_title'])
    loader.add_value('author_name', article['news_reporter'])
    loader.add_value('raw_content', sub_article['news_description'])

    # Parse date information
    try:
        # Example: 2016-10-12 15:16:04
        date_time_str = article['news_date_publish']
        self.logger.info('parse_date: parse_news: date_str: %s',
                         date_time_str)
        published_at = wib_to_utc(
            datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S'))
        loader.add_value('published_at', published_at)
    except Exception as e:
        raise CloseSpider('cannot_parse_date: %s' % e)

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Parse a NewsML JSON payload into a News item.

    Each field is pulled from the deeply nested NewsML structure; any
    missing key (or empty required value) returns the partial item so
    the pipeline can drop it.
    """
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    json_response = json.loads(response.body)

    # Canonical article URL
    try:
        url = json_response['NewsML']['NewsItem']['NewsComponent'][
            'NewsComponent']['NewsComponent']['NewsLines']['MoreLink']
    except KeyError:
        return loader.load_item()
    loader.add_value('url', url)

    # Headline (required)
    try:
        title = json_response['NewsML']['NewsItem']['NewsComponent'][
            'NewsComponent']['NewsComponent']['NewsLines']['HeadLine']
    except KeyError:
        return loader.load_item()
    if not title:
        return loader.load_item()
    loader.add_value('title', title)

    # Body paragraphs (required)
    try:
        raw_content = json_response['NewsML']['NewsItem']['NewsComponent'][
            'NewsComponent']['NewsComponent']['ContentItem'][
            'DataContent']['nitf']['body']['body.content']['p']
    except KeyError:
        return loader.load_item()
    if not raw_content:
        return loader.load_item()
    loader.add_value('raw_content', raw_content)

    # Author (optional; empty string when falsy)
    try:
        author_name = json_response['NewsML']['NewsItem']['NewsComponent'][
            'NewsComponent']['Author']
    except KeyError:
        return loader.load_item()
    if not author_name:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', author_name)

    # Creation timestamp; split date and time on the ISO-style 'T'
    try:
        date_time_str = json_response['NewsML']['NewsItem'][
            'NewsManagement']['FirstCreated']
    except KeyError:
        return loader.load_item()
    if not date_time_str:
        return loader.load_item()
    date_time_str = date_time_str.split('T')
    # Left-pad the time part to 6 digits (HHMMSS) in case leading zeros
    # were stripped from the feed value
    date_time_str[1] = '0' * (6 - len(date_time_str[1])) + date_time_str[1]
    try:
        published_at_wib = datetime.strptime(' '.join(date_time_str),
                                             '%Y%m%d %H%M%S')
    except Exception:
        return loader.load_item()
    # Convert WIB (UTC+7) to UTC via the project helper
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    return loader.load_item()
def parse_news(self, response):
    """Extract a News item (url, title, published_at, author, content).

    Incomplete items are returned as-is and dropped by the pipeline;
    an unparseable date raises CloseSpider.
    """
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('media_id', self.media_id)
    loader.add_value('election_id', self.election_id)

    # parse title
    title_selectors = response.css(
        'div.detail > article > div.detail_area > h1::text')
    if not title_selectors:
        return loader.load_item()
    loader.add_value('title', title_selectors.extract_first())

    # parse date
    date_selectors = response.css(
        "div.detail > article > div.detail_area > div.date::text")
    if not date_selectors:
        return loader.load_item()
    # Selasa 10 Oktober 2017, 13:40 WIB
    date_str = date_selectors.extract()[0]
    # BUG FIX: filter() returns a non-subscriptable iterator on Python 3;
    # wrap it in list() so the slice works on both 2 and 3.
    date_str = list(filter(None, re.split('[\s,]', date_str)))[1:5]
    info_time = ' '.join([_(s) for s in date_str if s])

    # parse date information
    try:
        published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
    except ValueError as e:
        raise CloseSpider('cannot_parse_date: %s' % e)

    # convert to utc+0
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    # TODO check the published_at, if it is smaller than the last time
    # we crawl, just drop the data.

    # parse author name
    author_name_selectors = response.css(
        "div.detail > article > div.detail_area > div.author > strong::text")
    if not author_name_selectors:
        loader.add_value('author_name', 'N/A')
    else:
        loader.add_value('author_name', author_name_selectors.extract_first())

    # parse raw content
    raw_content_selectors = response.css(
        "div.detail > article > div.text_detail.detail_area")
    if not raw_content_selectors:
        return loader.load_item()
    loader.add_value('raw_content', raw_content_selectors.extract_first())
    return loader.load_item()
def parse_news(self, response):
    """Scrape one article page into a News item.

    Required fields are title, raw_content and published_at; if any is
    missing the partially-filled item is returned so the item pipeline
    can drop it. The author name is optional.
    """
    self.logger.info('parse_news: %s' % response)

    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    # Title is mandatory
    title = response.css('div.detail_area > h1.jdl::text').extract_first()
    if title is None:
        return loader.load_item()
    loader.add_value('title', title)

    # Grab the body as raw HTML, skipping comments, scripts and nested
    # divs (XPath taken from browser devtools)
    xpath_query = """
        //div[@class="text_detail detail_area"]/node()
        [not(self::comment()|self::script|self::div)]
    """
    content_nodes = response.xpath(xpath_query)
    if not content_nodes:
        return loader.load_item()
    loader.add_value('raw_content',
                     ' '.join(content_nodes.extract()).strip())

    # Example date text: 'Kamis 15 Sep 2016, 18:33 WIB'
    date_texts = response.css('div.detail_area > div.date::text')
    if not date_texts:
        return loader.load_item()
    # Keep only '15 Sep 2016, 18:33'
    date_str = ' '.join(date_texts.extract()[0].split(' ')[1:5])
    try:
        published_at_wib = datetime.strptime(date_str, '%d %b %Y, %H:%M')
    except ValueError:
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(published_at_wib))

    # Author is optional; default to an empty string
    author = response.css('div.author > strong::text').extract_first()
    loader.add_value('author_name', author if author is not None else '')

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Parse an article page into a News item.

    Incomplete items (missing title, date or body) are returned as-is
    and dropped by the item pipeline.
    """
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('media_id', self.media_id)
    loader.add_value('election_id', self.election_id)

    # parse title
    title_selectors = response.css(
        'div.main-container > div > section.main-content > h1.page-header::text')
    if not title_selectors:
        return loader.load_item()
    loader.add_value('title', title_selectors.extract_first())

    # parse date
    date_selectors = response.css(
        'div.post-meta > div > div > div.submitted > span::text')
    if not date_selectors:
        return loader.load_item()
    date_str = date_selectors.extract_first()
    # Normalize separators and whitespace, then keep the first 4 tokens
    info_time = re.sub(r'[,-]', '', date_str)
    info_time = re.sub(r'\s+', ' ', info_time)
    # BUG FIX: filter() returns a non-subscriptable iterator on Python 3;
    # wrap it in list() before slicing.
    time_arr = list(filter(None, re.split('[\s,|-]', info_time)))[:4]
    info_time = ' '.join([_(s) for s in time_arr if s])

    # parse date information
    try:
        published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
    except ValueError:
        return loader.load_item()

    # convert to utc+0
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    # parse author name (optional)
    author_name = response.css(
        'div.post-meta > div > div > div.items-penulis > span > a::text'
    ).extract_first()
    loader.add_value('author_name', author_name if author_name else 'N/A')

    # parse raw content
    raw_content_selectors = response.css(
        'div.region.region-content > section > article > '
        'div.field.field-name-body.field-type-text-with-summary > '
        'div.field-items > div.field-item.even')
    if not raw_content_selectors:
        return loader.load_item()
    loader.add_value('raw_content', raw_content_selectors.extract_first())
    return loader.load_item()
def parse_news(self, response):
    """Parse an article page into a News item (title, date, author,
    raw content). Incomplete items are dropped by the pipeline."""
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('media_id', self.media_id)
    loader.add_value('election_id', self.election_id)

    # parse title
    title_selectors = response.css('div#mdk-news-title::text')
    if not title_selectors:
        return loader.load_item()
    title = title_selectors.extract_first()
    loader.add_value('title', title)

    # parse date
    date_selectors = response.css("div.mdk-date-reporter > span::text")
    # We need to do this because sometimes selector can contains 3 or 2 elements.
    # The date is taken from the second-to-last span.
    pos = len(date_selectors) - 2
    if not date_selectors:
        return loader.load_item()
    date_str = date_selectors.extract()[pos]
    # eg: 8 September 2017 21:02
    # NOTE(review): assumes the span text contains a '|' separator;
    # otherwise the [1] index raises IndexError -- confirm page layout.
    date_str = date_str.split("|")[1].strip()
    time_arr = filter(None, re.split('[\s,|]', date_str))
    info_time = ' '.join([_(s) for s in time_arr if s])

    # parse date information
    try:
        published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
    except ValueError as e:
        raise CloseSpider('cannot_parse_date: %s' % e)

    # convert to utc+0
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    # parse author name
    author_name_selectors = response.css(
        "div.mdk-date-reporter > span::text")
    if not author_name_selectors:
        loader.add_value('author_name', 'N/A')
    else:
        # NOTE(review): assumes at least two spans and a ':' inside the
        # second one (e.g. 'Reporter : Name'); an IndexError is raised
        # otherwise -- verify against the live page.
        author_name = author_name_selectors.extract()[1]
        author_name = author_name.split(":")[1].strip()
        loader.add_value('author_name', author_name)

    # parse raw content
    raw_content_selectors = response.css("div.mdk-body-paragraph")
    if not raw_content_selectors:
        return loader.load_item()
    raw_content = raw_content_selectors.extract_first()
    loader.add_value('raw_content', raw_content)
    return loader.load_item()
def parse_news(self, response):
    """Parse an article page into a News item.

    Articles not newer than media['last_crawl_at'] are returned without
    published_at so the pipeline drops them.
    """
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('media_id', self.media_id)
    loader.add_value('election_id', self.election_id)

    # parse title
    title_selectors = response.css('div.artikel > h1.artikel::text')
    if not title_selectors:
        return loader.load_item()
    loader.add_value('title', title_selectors.extract_first())

    # parse date
    date_selectors = response.css('div.artikel > div.tanggal::text')
    if not date_selectors:
        return loader.load_item()
    date_str = date_selectors.extract_first()
    # eg: Tuesday, 12 September 2017 | 20:21 WIB
    # BUG FIX: filter() returns a non-subscriptable iterator on Python 3;
    # wrap it in list() before slicing.
    time_arr = list(filter(None, re.split('[\s,|]', date_str)))[1:-1]
    info_time = ' '.join([_(s) for s in time_arr if s])

    # parse date information
    try:
        published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
    except ValueError as e:
        raise CloseSpider('cannot_parse_date: %s' % e)

    # convert to utc+0
    published_at = wib_to_utc(published_at_wib)
    # Already-crawled article: return the partial item (no published_at)
    # so the pipeline drops it. (Dead local `is_no_update` removed.)
    if self.media['last_crawl_at'] >= published_at:
        return loader.load_item()
    loader.add_value('published_at', published_at)

    # parse author name
    author_name_selectors = response.css(
        'div.artikel > div > p > strong::text')
    if not author_name_selectors:
        loader.add_value('author_name', 'N/A')
    else:
        loader.add_value('author_name',
                         author_name_selectors.extract()[-1].strip())

    # parse raw content
    raw_content_selectors = response.css('div.artikel > div > p')
    if not raw_content_selectors:
        return loader.load_item()
    loader.add_value('raw_content', raw_content_selectors.extract())
    return loader.load_item()
def parse_news(self, response):
    """Parse a news article page into a News item.

    Title, raw_content and published_at are required; author_name is
    optional. Multi-page articles are delegated to parse_indices().
    """
    self.logger.info('parse_news: %s' % response)

    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_sel = response.css('h1.detailtitle::text')
    if not title_sel:
        # If error, drop from the item pipeline
        return loader.load_item()
    loader.add_value('title', title_sel.extract_first().strip())

    # Parse date information from the page header
    raw_date = response.css(
        'body > div > div.container > div.page-header > div::text'
    ).extract_first().strip()
    raw_date = raw_date.split(',')[-1].strip()
    # October => Oktober
    raw_date = ' '.join(_(w) for w in raw_date.split(' '))
    try:
        published_at_wib = datetime.strptime(raw_date, '%d %B %Y %H:%M')
    except ValueError:
        # If error, drop from the item pipeline
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(published_at_wib))

    # Multi-page article: hand the loader over to the index parser
    multipage_selectors = response.css('.newsPagingWrap > a')
    if multipage_selectors:
        return self.parse_indices(multipage_selectors, loader)

    # Single page: the author usually sits in the last <strong>
    author_sel = response.css('.newsContent > p > strong::text')
    if author_sel:
        loader.add_value('author_name', author_sel.extract()[-1].strip())
    else:
        loader.add_value('author_name', '')

    # Extract the news content
    paragraphs = response.css('.newsContent > p')
    if not paragraphs:
        return loader.load_item()
    loader.add_value('raw_content', ' '.join(paragraphs.extract()).strip())

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Parse an article page into a News item."""
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('media_id', self.media_id)
    loader.add_value('election_id', self.election_id)

    # parse title
    title_selectors = response.css('h1::text')
    if not title_selectors:
        return loader.load_item()
    loader.add_value('title', title_selectors.extract_first())

    # parse date
    date_selectors = response.css("div.date > span::text")
    if not date_selectors:
        return loader.load_item()
    date_str = date_selectors.extract()[0]
    # eg: Selasa, 12 Sep 2017 20:08 -- drop the day name
    date_str = date_str.split(",")[1].strip()
    time_arr = filter(None, re.split('[\s,|]', date_str))
    info_time = ' '.join([_(s) for s in time_arr if s])

    # parse date information
    try:
        published_at_wib = datetime.strptime(info_time, '%d %b %Y %H:%M')
    except ValueError as e:
        raise CloseSpider('cannot_parse_date: %s' % e)

    # convert to utc+0
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    # TODO check the published_at, if it is smaller than the last time
    # we crawl, just drop the data.

    # parse author name
    # BUG FIX: indexing [1] raised IndexError when the page had fewer
    # than two spans, defeating the intended 'N/A' fallback below.
    date_spans = response.css("div.date > span")
    author_name_selectors = (date_spans[1].css("span > span::text")
                             if len(date_spans) > 1 else None)
    if not author_name_selectors:
        loader.add_value('author_name', 'N/A')
    else:
        loader.add_value('author_name', author_name_selectors.extract_first())

    # parse raw content
    raw_content_selectors = response.css("div.contentdetail")
    if not raw_content_selectors:
        return loader.load_item()
    loader.add_value('raw_content', raw_content_selectors.extract_first())
    return loader.load_item()
def parse_news(self, response):
    """Parse an article page into a News item (title, date, author,
    raw content)."""
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('media_id', self.media_id)
    loader.add_value('election_id', self.election_id)

    # parse title
    title_selectors = response.css('div.wrap-head > h2 > a::text')
    if not title_selectors:
        return loader.load_item()
    loader.add_value('title', title_selectors.extract_first())

    # parse date
    date_selectors = response.css('div.wrap-head > span::text')
    if not date_selectors:
        return loader.load_item()
    date_str = date_selectors.extract()[0]
    # eg: Tuesday, 12 September 2017 | 20:21 WIB
    # BUG FIX: filter() returns a non-subscriptable iterator on Python 3;
    # wrap it in list() before slicing.
    time_arr = list(filter(None, re.split('[\s,|]', date_str)))[1:-1]
    info_time = ' '.join([_(s) for s in time_arr if s])

    # parse date information
    try:
        published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
    except ValueError as e:
        raise CloseSpider('cannot_parse_date: %s' % e)

    # convert to utc+0
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    # parse author name, stripping the 'Rep:'/'Red:' role prefix
    author_name_selectors = response.css('div.red::text')
    if not author_name_selectors:
        loader.add_value('author_name', 'N/A')
    else:
        author_name = author_name_selectors.extract()[0].strip()
        author_name = author_name.replace('Rep: ', '').replace(
            'Red: ', '').strip()
        loader.add_value('author_name', author_name)

    # parse raw content
    raw_content_selectors = response.css('div.content-detail')
    if not raw_content_selectors:
        return loader.load_item()
    loader.add_value('raw_content', raw_content_selectors.extract_first())
    return loader.load_item()
def parse_news_metro(self, response):
    """Parse a 'metro' article page into a News item.

    Falls back to parse_news_pilkada() when the metro date block is
    absent. Stops the whole crawl (CloseSpider) once an article older
    than media['last_scraped_at'] is seen. Multi-page articles are
    continued in parse_next_page_metro() via a chained Request.
    """
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    date_selector = response.css('.artikel > div.block-tanggal::text')
    if not date_selector:
        # Not a metro layout; try the pilkada page layout instead
        return self.parse_news_pilkada(loader, response)
    try:
        # Drop the day name and trailing ' WIB', translate month names
        date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
        date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
        published_at_wib = datetime.strptime(date_time_str,
                                             '%d %B %Y | %H:%M')
    except Exception:
        # Unparseable date: return the partial item for the pipeline to drop
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)

    if (self.media['last_scraped_at'] >= published_at):
        is_no_update = True
        self.logger.info('Media have no update')
        raise CloseSpider('finished')
    loader.add_value('published_at', published_at)

    title_selector = response.css('.artikel > h1::text')
    if not title_selector:
        return loader.load_item()
    loader.add_value('title', title_selector.extract()[0])

    # Select all p which don't have iframe inside it
    raw_content_selector = response.xpath(
        '//div[@class="artikel"]//p[not(iframe)]')
    if not raw_content_selector:
        return loader.load_item()
    raw_content = ''
    for raw_content_selector_one in raw_content_selector:
        raw_content = raw_content + raw_content_selector_one.extract()

    # Go to next page while there is next page button
    next_page_selector = response.css('.pagination-nb').xpath(
        '//a[text()="next"]/@href')
    if next_page_selector:
        # Defaults in the lambda bind the current loader/raw_content so
        # the accumulated state survives into the next-page callback
        return Request(next_page_selector.extract()[0],
                       callback=lambda x, loader=loader,
                       raw_content=raw_content:
                       self.parse_next_page_metro(x, loader, raw_content))

    loader.add_value('raw_content', raw_content)

    # The author usually put inside <strong> tag, however, some news is
    # not using <strong> tag.
    # NOTE: this block of code may need revision in the future
    # Heuristic: scan paragraphs from the end for an all-uppercase
    # <strong> text (allowing spaces, '.' and '|') and treat it as the
    # byline.
    author_name = ''
    for author_name_selector in reversed(raw_content_selector):
        author_name_selector = author_name_selector.css('strong::text')
        for tmp in reversed(author_name_selector.extract()):
            tmp = tmp.strip()
            if tmp and all((x.isalpha() and x.isupper()) or x.isspace()
                           or x == '.' or x == '|' for x in tmp):
                author_name = tmp
                break
        if author_name:
            break
    # Multiple authors separated by ' | ' become comma-separated
    author_name = ','.join(author_name.split(' | '))
    loader.add_value('author_name', author_name)
    return loader.load_item()
def parse_news(self, response):
    """Turn an article page into a News item (url, title, raw_content,
    published_at, author_name). Items missing required data are
    returned partial and dropped by the pipeline."""
    self.logger.info('parse_news: {}'.format(response))

    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title = response.css('div.content-detail > h4::text').extract_first()
    if title is None:
        return loader.load_item()
    loader.add_value('title', title)

    # Keep the raw HTML of the body, not just its text
    body_sel = response.css('div.content-body')
    if not body_sel:
        return loader.load_item()
    raw_content = ' '.join(part.strip() for part in body_sel.extract())
    loader.add_value('raw_content', raw_content.strip())

    # Example: Selasa, 11 Oktober 2016 | 10:48
    date_texts = response.css('div.date::text')
    if not date_texts:
        return loader.load_item()
    tokens = re.split('[\s,|-]', date_texts.extract()[0])
    # Example: 11 October 2016 10:48
    date_str = ' '.join(_(tok) for tok in tokens[1:] if tok)

    # Parse date information, then normalize WIB to UTC
    try:
        published_at_wib = datetime.strptime(date_str, '%d %B %Y %H:%M')
    except ValueError:
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(published_at_wib))

    # Author is optional; keep only the name before any '/' suffix
    author_texts = response.css('div.content-detail > p::text')
    if author_texts:
        loader.add_value('author_name',
                         author_texts.extract()[0].split('/')[0])
    else:
        loader.add_value('author_name', '')

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Parse an article page into a News item.

    The byline looks like 'Kompas.com - 10/10/2017, 13:37 WIB'.
    """
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('media_id', self.media_id)
    loader.add_value('election_id', self.election_id)

    # parse title
    title_selectors = response.css(
        'div.pa15.bgwhite > h1.f32.fno.crimson::text')
    if not title_selectors:
        return loader.load_item()
    loader.add_value('title', title_selectors.extract_first())

    # parse date
    date_selectors = response.css(
        'div.pa15.bgwhite > div.mt10.mb10 > time.grey.f13.dip::text')
    if not date_selectors:
        return loader.load_item()
    date_str = date_selectors.extract_first()
    # eg: Kompas.com - 10/10/2017, 13:37 WIB
    # BUG FIX: the old code split on ',' (discarding the date entirely)
    # and then parsed '13:37 WIB' with '%d %B %Y %H:%M', so strptime
    # always failed and every item was dropped. Tokenize the full string
    # and keep the date+time fields, matching the sibling spider; also
    # list()-wrap filter() for Python 3 compatibility.
    time_arr = list(filter(None, re.split('[\s,|-]', date_str)))[1:3]
    info_time = ' '.join([_(s) for s in time_arr if s])

    # parse date information
    try:
        published_at_wib = datetime.strptime(info_time, '%d/%m/%Y %H:%M')
    except ValueError:
        return loader.load_item()

    # convert to utc+0
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    # parse author name
    # author_name selector is disabled for this layout; default to N/A
    loader.add_value('author_name', 'N/A')

    # parse raw content
    raw_content_selectors = response.css(
        'div.ptb15 > div.txt-article.mb20')
    if not raw_content_selectors:
        return loader.load_item()
    loader.add_value('raw_content', raw_content_selectors.extract_first())
    return loader.load_item()
def parse_news(self, response):
    """Extract a News item from an article page; incomplete items are
    returned partial so the pipeline drops them."""
    self.logger.info('parse_news: %s' % response)

    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('media_id', self.media_id)
    loader.add_value('election_id', self.election_id)

    # Title
    title = response.css('section.main-content > h1::text').extract_first()
    if title is None:
        return loader.load_item()
    loader.add_value('title', title)

    # Date, e.g. '5 September, 2017 - 18:54'
    date_sel = response.css("div.submitted > span::text")
    if not date_sel:
        return loader.load_item()
    tokens = [t for t in re.split('[\s,-]', date_sel.extract_first()) if t]
    info_time = ' '.join(_(t) for t in tokens)
    try:
        published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
    except ValueError as e:
        raise CloseSpider('cannot_parse_date: %s' % e)
    # Normalize WIB (UTC+7) to UTC
    loader.add_value('published_at', wib_to_utc(published_at_wib))

    # Author (optional)
    author_sel = response.css("div.items-penulis > span > a::text")
    if author_sel:
        loader.add_value('author_name', author_sel.extract()[0].strip())
    else:
        loader.add_value('author_name', 'N/A')

    # Body: keep the raw HTML of every matching node
    body_sel = response.css("div.field-item.even")
    if not body_sel:
        return loader.load_item()
    loader.add_value('raw_content', body_sel.extract())

    return loader.load_item()
def parse_news(self, response):
    """Scrape title, author, content and publication time from an
    article page. Missing required fields yield a partial item that
    the pipeline drops."""
    self.logger.info('parse_news: %s' % response)

    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title = response.css('h1[itemprop="headline"]::text').extract_first()
    if title is None:
        return loader.load_item()
    loader.add_value('title', title)

    # Author is optional; default to an empty string
    author = response.css('a[rel="author"] > span::text').extract_first()
    loader.add_value('author_name', author if author is not None else '')

    content_sel = response.css('.content')
    if not content_sel:
        return loader.load_item()
    raw_content = ' '.join(part.strip() for part in content_sel.extract())
    loader.add_value('raw_content', raw_content.strip())

    time_sel = response.css('article > div.time::text')
    if not time_sel:
        return loader.load_item()
    # Example: Selasa, 6 Oktober 2015 - 05:23 WIB
    # Drop the day name and trailing ' WIB', translate month names
    stamp = time_sel.extract()[0].split(',')[1].strip()[:-4]
    stamp = ' '.join(_(word) for word in stamp.split(' '))
    try:
        published_at_wib = datetime.strptime(stamp, '%d %B %Y - %H:%M')
    except ValueError:
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(published_at_wib))

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Build a News item from a detail page; published_at is converted
    from WIB to UTC. Incomplete items are dropped by the pipeline."""
    self.logger.info('parse_news: {}'.format(response))

    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_sel = response.css('h1.title-big-detail::text')
    if not title_sel:
        return loader.load_item()
    loader.add_value('title', title_sel.extract()[0].strip())

    # Keep the raw HTML of the article body, not just the text
    content_sel = response.css('div.detail-content')
    if not content_sel:
        return loader.load_item()
    loader.add_value('raw_content', content_sel.extract()[0])

    date_sel = response.css('span.meta-author > span:nth-child(3)::text')
    if not date_sel:
        return loader.load_item()
    # Example: Sabtu, 1 Oktober 2016, 15:47 WIB
    words = date_sel.extract()[0].strip().replace(',', '').split(' ')[1:-1]
    # Example: 1 October 2016 15:47
    date_str = ' '.join(_(w) for w in words)
    try:
        published_at_wib = datetime.strptime(date_str, '%d %B %Y %H:%M')
    except ValueError:
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(published_at_wib))

    # Author is optional; default to an empty string
    author_sel = response.css('span.meta-author > span > b::text')
    loader.add_value('author_name',
                     author_sel.extract()[0] if author_sel else '')

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Parse an article page into a News item.

    Byline example: 'Kompas.com - 10/10/2017, 13:37 WIB'.
    """
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('media_id', self.media_id)
    loader.add_value('election_id', self.election_id)

    # parse title
    title_selectors = response.css('h1.read__title::text')
    if not title_selectors:
        return loader.load_item()
    loader.add_value('title', title_selectors.extract_first())

    # parse date
    date_selectors = response.css('div.read__time::text')
    if not date_selectors:
        return loader.load_item()
    date_str = date_selectors.extract_first()
    # eg: Kompas.com - 10/10/2017, 13:37 WIB
    # BUG FIX: filter() returns a non-subscriptable iterator on Python 3;
    # wrap it in list() before slicing.
    time_arr = list(filter(None, re.split('[\s,-]', date_str)))[1:3]
    info_time = ' '.join([_(s) for s in time_arr if s])

    # parse date information
    try:
        published_at_wib = datetime.strptime(info_time, '%d/%m/%Y %H:%M')
    except ValueError:
        return loader.load_item()

    # convert to utc+0
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    # parse author name (optional)
    author_name = response.css('div.read__author > a::text').extract_first()
    loader.add_value('author_name', author_name if author_name else 'N/A')

    # parse raw content
    raw_content_selectors = response.css('div.read__content')
    if not raw_content_selectors:
        return loader.load_item()
    loader.add_value('raw_content', raw_content_selectors.extract_first())
    return loader.load_item()
def parse_news(self, response):
    """Parse one article page into a News item.

    Raises CloseSpider when the publication date cannot be parsed
    (site layout change); other missing fields produce an incomplete
    item that the pipeline drops.

    Fixes: on Python 3, ``filter()`` returns a non-subscriptable
    iterator, so the original ``filter(...)[1:-1]`` raised TypeError;
    the regex is also now a raw string to avoid an invalid-escape
    warning.
    """
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    # Parse title
    title_selectors = response.css('h1.read__title::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', title_selectors.extract_first())

    # Parse date
    date_selectors = response.css('div.read__date::text')
    if not date_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    date_str = date_selectors.extract()[0]
    # eg: Tuesday, 12 September 2017 | 20:21 WIB
    # list() is required: Python 3's filter() yields an iterator,
    # which does not support slicing.
    time_arr = list(filter(None, re.split(r'[\s,|]', date_str)))[1:-1]
    info_time = ' '.join([_(s) for s in time_arr if s])

    # Parse date information
    try:
        published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
    except ValueError as e:
        raise CloseSpider('cannot_parse_date: %s' % e)
    # Convert to UTC+0
    loader.add_value('published_at', wib_to_utc(published_at_wib))

    # Parse author name; optional, default to 'N/A'
    author_name = response.css(
        'div.contentArticle.box-shadow-new > h6::text').extract_first()
    loader.add_value('author_name', author_name if author_name else 'N/A')

    # Parse raw content
    raw_content = response.css('div.contentArticle.box-shadow-new').extract()
    if not raw_content:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('raw_content', raw_content)

    return loader.load_item()
def parse_news(self, response):
    """Scrape title, raw content, date and author from one detik page.

    Required fields: title, raw_content, published_at; an incomplete
    item is returned early and dropped by the item pipeline.
    """
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_sel = response.css('div.detail_area > h1.jdl::text')
    if not title_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', title_sel.extract()[0])

    content_sel = response.css('article > div.text_detail')
    if not content_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('raw_content', content_sel.extract()[0])

    # Example: Kamis 15 Sep 2016, 18:33 WIB
    date_sel = response.css('div.detail_area > div.date::text')
    if not date_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    # Keep only the middle tokens: '15 Sep 2016, 18:33'
    date_str = ' '.join(date_sel.extract()[0].split(' ')[1:5])
    try:
        wib_time = datetime.strptime(date_str, '%d %b %Y, %H:%M')
    except ValueError:
        # Unparseable date; will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(wib_time))

    # Author is optional; default to an empty string.
    author_sel = response.css('div.author > strong::text')
    loader.add_value(
        'author_name', author_sel.extract()[0] if author_sel else '')

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Extract url, title, raw HTML content, date and author for one page.

    Incomplete items are returned early and dropped by the pipeline.
    """
    self.logger.info('parse_news: {}'.format(response))
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_sel = response.css('div.detail_text > h1::text')
    if not title_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', title_sel.extract()[0])

    # Keep the raw HTML, not just the text
    content_sel = response.css('div.detail_text')
    if not content_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('raw_content', content_sel.extract()[0])

    # Example: Senin, 10/10/2016 05:12
    date_sel = response.css('div.date::text')
    if not date_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    # Drop the day name -> '10/10/2016 05:12'
    date_str = date_sel.extract()[0].split(',')[1].strip()
    try:
        wib_time = datetime.strptime(date_str, '%d/%m/%Y %H:%M')
    except ValueError:
        # Unparseable date; will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(wib_time))

    # Author is optional; default to an empty string.
    author_sel = response.css('div.author > strong::text')
    loader.add_value(
        'author_name', author_sel.extract()[0] if author_sel else '')

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Parse a JSON API response containing one article into a News item.

    Fixes: ``json.loads(str(response.body))`` breaks on Python 3, where
    ``str()`` of a bytes body yields ``"b'...'"``; ``response.text``
    decodes with the declared response encoding instead. Also guards the
    ``<body>`` regex so a non-matching content field drops the item
    instead of crashing with AttributeError on ``.group()``.
    """
    self.logger.info('parse_news: %s' % response)
    # The API returns a JSON array; the article is its first element.
    parsed_news = json.loads(response.text)[0]

    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    if not parsed_news['title']:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', parsed_news['title'])

    if not parsed_news['content']:
        # Will be dropped on the item pipeline
        return loader.load_item()
    # Keep only the <body> payload and strip <img> tags.
    body_match = re.search(r'<body>(.*)</body>', parsed_news['content'],
                           re.S | re.I)
    if not body_match:
        # Unexpected content markup; will be dropped on the item pipeline
        return loader.load_item()
    content = re.sub(r'<img[^>]+>', '', body_match.group(1))
    loader.add_value('raw_content', content)

    if not parsed_news['published']:
        # Will be dropped on the item pipeline
        return loader.load_item()
    # Parse date information.
    # Example: 12 Oct 2016 - 05:25 (after dropping day name and ' WIB',
    # with month names translated via _())
    date_time_str = ' '.join(
        _(w)
        for w in parsed_news['published'].split(',')[1].strip()[:-4].split(' '))
    try:
        published_at_wib = datetime.strptime(date_time_str,
                                             '%d %b %Y - %H:%M')
    except ValueError:
        # Unparseable date; will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(published_at_wib))

    # Author is optional; default to an empty string.
    loader.add_value(
        'author_name',
        parsed_news['author'] if parsed_news['author'] else '')

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Scrape one article; video pages are skipped (they carry no author).

    Raises CloseSpider when the publication date cannot be parsed.
    """
    self.logger.info('parse_news: {}'.format(response))

    # Skip video pages, since there is no author there
    category = response.css('ul.breadcrumb > li > a::text').extract()[0]
    if category == 'VIDEO':
        return

    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title = response.css('div.part.lead.pr > h1::text').extract()[0]
    info = response.css('div.part.lead.pr > span::text').extract()[0]
    # The info line is '<author> - <date>'
    author_name = info.split('-')[0].strip()
    # Example: 10 Oktober 2016 21:10 wib
    date_str = info.split('-')[1].strip()

    # Keep the raw article HTML, not just the text
    raw_content = ' '.join(response.css('div.part.article').extract())

    try:
        # Drop the trailing ' wib' and translate month names via _()
        # Example: 10 October 2016 21:10
        date_str = ' '.join([_(w) for w in date_str[:-4].split(' ')])
        self.logger.info(
            'parse_date: parse_news: date_str: {}'.format(date_str))
        published_at = wib_to_utc(
            datetime.strptime(date_str, '%d %B %Y %H:%M'))
        loader.add_value('published_at', published_at)
    except Exception as e:
        raise CloseSpider('cannot_parse_date: {}'.format(e))

    loader.add_value('title', title)
    loader.add_value('author_name', author_name)
    loader.add_value('raw_content', raw_content)

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Build a News item from a kompas read page.

    Date conversion is delegated to self.convert_date; any failure
    there drops the item via the pipeline.
    """
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_sel = response.css("div.kcm-read-top > h2::text")
    if not title_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', title_sel.extract()[0])

    content_sel = response.css("div.kcm-read-text > p")
    if not content_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('raw_content', content_sel.extract()[0])

    date_sel = response.css("div.kcm-date::text")
    if not date_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    try:
        published_at = self.convert_date(date_sel.extract()[0])
    except Exception:
        # Unparseable date; will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('published_at', published_at)

    # There may be several author credits; join them with ', '
    author_sel = response.css("span.pb_10::text")
    if author_sel:
        loader.add_value('author_name', ', '.join(author_sel.extract()))
    else:
        loader.add_value('author_name', '')

    return loader.load_item()
def parse_news(self, response):
    """Extract title, author, date and raw HTML content for one article.

    Raises CloseSpider when the publication date cannot be parsed.
    """
    self.logger.info('parse_news: {}'.format(response))
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title = response.css('h1.title-big-detail::text').extract()[0].strip()
    meta = response.css('span.meta-author span::text').extract()
    author_name = meta[0].strip()
    # Example: Sabtu, 1 Oktober 2016, 15:47 WIB
    date_str = meta[-1].strip()

    # Keep the raw HTML of the article, not just the text
    raw_content = response.css('div.detail-content').extract()[0]

    try:
        # Drop the day name and trailing 'WIB', translate months via _()
        # Example: 1 October 2016 15:47
        tokens = date_str.replace(',', '').split(' ')[1:-1]
        date_str = ' '.join([_(s) for s in tokens])
        self.logger.info(
            'parse_date: parse_news: date_str: {}'.format(date_str))
        published_at_wib = datetime.strptime(date_str, '%d %B %Y %H:%M')
        loader.add_value('published_at', wib_to_utc(published_at_wib))
    except Exception as e:
        raise CloseSpider('cannot_parse_date: {}'.format(e))

    loader.add_value('title', title)
    loader.add_value('author_name', author_name)
    loader.add_value('raw_content', raw_content)

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Scrape one article page.

    Required fields: title, raw_content, published_at; incomplete items
    are returned early and dropped by the item pipeline.
    """
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_sel = response.css('h1.article-header__title::text')
    if not title_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', title_sel.extract()[0])

    # XPath (not CSS) so comment/script/div nodes can be filtered out
    xpath_query = """
        //div[@class="article-raw-content"]/node()
            [not(
                self::comment()|
                self::script|
                self::div)]
    """
    content_sel = response.xpath(xpath_query)
    if not content_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    fragments = [frag.strip() for frag in content_sel.extract()]
    loader.add_value('raw_content', ' '.join(fragments).strip())

    # Example: ' pada 18 Okt 2016, 08:33 WIB'
    date_sel = response.css('span.article-header__datetime::text')
    if not date_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_date = date_sel.extract()[0].strip()
    # Drop 'pada' and 'WIB', translate months -> '18 Oct 2016, 08:33'
    date_str = ' '.join(_(w) for w in raw_date.split(' ')[1:-1])
    try:
        wib_time = datetime.strptime(date_str, '%d %b %Y, %H:%M')
    except ValueError:
        # Unparseable date; will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(wib_time))

    # Author is optional; default to an empty string.
    author_sel = response.css('a.article-header__author-link::text')
    loader.add_value(
        'author_name', author_sel.extract()[0] if author_sel else '')

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Scrape one article page (site has no author byline).

    Required fields: title, raw_content, published_at; incomplete items
    are returned early and dropped by the item pipeline.
    """
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_sel = response.css('div.NewsTitle > h1::text')
    if not title_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', title_sel.extract()[0].strip())

    # Example: 27 Oct 2016, 18:33:36 WIB
    date_sel = response.css('div.NewsDate::text')
    if not date_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    try:
        wib_time = datetime.strptime(
            date_sel.extract()[0], '%d %b %Y %H:%M:%S WIB')
    except ValueError:
        # Unparseable date; will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(wib_time))

    # No author information on this site
    loader.add_value('author_name', '')

    # XPath copied from browser dev tools; it filters out markup noise
    # (comments, styles, scripts, embeds) around the paragraph text.
    xpath_query = """
        //div[@class="pad10"]/p/node()
            [not(
                descendant-or-self::comment()|
                descendant-or-self::style|
                descendant-or-self::script|
                descendant-or-self::div|
                descendant-or-self::span|
                descendant-or-self::img|
                descendant-or-self::table|
                descendant-or-self::iframe
            )]
    """
    content_sel = response.xpath(xpath_query)
    if not content_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    fragments = [frag.strip() for frag in content_sel.extract()]
    loader.add_value('raw_content', ' '.join(fragments).strip())

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    """Extract one article (url, title, filtered raw HTML, date, author).

    Incomplete items are returned early and dropped by the pipeline.
    """
    self.logger.info('parse_news: {}'.format(response))
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_sel = response.css('h1.title-big-detail::text')
    if not title_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', title_sel.extract()[0].strip())

    # Keep raw HTML but filter out noise: comments, scripts, styles,
    # embeds, and the share button
    xpath_query = '''
        //div[@class="detail-content"]/node()
            [not(
                descendant-or-self::comment()|
                descendant-or-self::style|
                descendant-or-self::script|
                descendant-or-self::div|
                descendant-or-self::span|
                descendant-or-self::img|
                descendant-or-self::table|
                descendant-or-self::iframe|
                descendant-or-self::a[@class="share-btn-right shared"]
            )]
    '''
    content_sel = response.xpath(xpath_query)
    if not content_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value(
        'raw_content',
        ' '.join(frag.strip() for frag in content_sel.extract()))

    date_sel = response.css('span.meta-author > span:nth-child(3)::text')
    if not date_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    # Example: Sabtu, 1 Oktober 2016, 15:47 WIB
    raw_date = date_sel.extract()[0].strip()
    # Drop the day name and trailing 'WIB', translate months via _()
    # Example: 1 October 2016 15:47
    tokens = raw_date.replace(',', '').split(' ')[1:-1]
    date_str = ' '.join(_(s) for s in tokens)
    try:
        wib_time = datetime.strptime(date_str, '%d %B %Y %H:%M')
    except ValueError:
        # Unparseable date; will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(wib_time))

    # Author is optional; default to an empty string.
    author_sel = response.css('span.meta-author > span > b::text')
    loader.add_value(
        'author_name', author_sel.extract()[0] if author_sel else '')

    # Move scraped news to pipeline
    return loader.load_item()