def parse_mihaaru_links(self, response):
    """
    Does not use an RSS Feed. Scrapes site for links to articles.
    """
    links = response.xpath('//div[@class="main_news_size_2"]/a')
    for link in links:
        item = NewsbyteItem()
        item['source'] = response.url
        url = link.xpath('@href').extract()
        item['link'] = url[0]
        title = link.xpath('text()').extract()
        item['title'] = title[0]
        item['country'] = '#' if response.meta['country'] is None else response.meta['country']
        item['item_id'] = str(uuid4())
        item['language'] = '#' if response.meta['language'] is None else response.meta['language']
        item['region'] = self.region
        # If there is no pubdate, the time the article was scraped is used.
        pubdate = time.localtime()
        item['pubdate'] = time.mktime(pubdate)
        request = Request(
            item['link'],
            callback=getattr(self, response.meta['method']),
            dont_filter=response.meta.get('dont_filter', False)
        )
        request.meta['item'] = item
        yield request
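# Every link parser in this spider fills the same source/item_id/country/
# language/region fields. A minimal sketch of how that boilerplate could be
# factored out; _new_item is a hypothetical helper, not part of the original
# spider:
def _new_item(self, response):
    """Build a NewsbyteItem pre-filled with the fields every parser sets identically."""
    item = NewsbyteItem()
    item['source'] = response.url
    item['item_id'] = str(uuid4())
    item['country'] = '#' if response.meta['country'] is None else response.meta['country']
    item['language'] = '#' if response.meta['language'] is None else response.meta['language']
    item['region'] = self.region
    return item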
def parse_vienlinks(self, response):
    """
    Does not use an RSS Feed. There is an unresolved issue with the
    published date, so the scrape time is used instead.
    """
    for link in response.xpath('//div[@class="list-view"]//a/@href'):
        full_link = response.urljoin(link.extract())
        item = NewsbyteItem()
        item['source'] = response.url
        item['link'] = full_link
        item['country'] = '#' if response.meta['country'] is None else response.meta['country']
        item['item_id'] = str(uuid4())
        item['language'] = '#' if response.meta['language'] is None else response.meta['language']
        item['region'] = self.region
        pubdate = time.localtime()
        item['pubdate'] = time.mktime(pubdate)
        request = Request(
            full_link,
            callback=getattr(self, response.meta['method']),
            dont_filter=response.meta.get('dont_filter', False)
        )
        request.meta['item'] = item
        yield request
def parse_fortunelinks(self, response):
    """
    Does not use an RSS Feed.
    """
    for link in response.xpath('//div[@class="span6"]/h3/a'):
        new_link = ''.join(link.xpath('./@href').extract())
        title = ''.join(link.xpath('./text()').extract())
        item = NewsbyteItem()
        item['source'] = response.url
        item['link'] = new_link
        item['title'] = title
        item['country'] = '#' if response.meta['country'] is None else response.meta['country']
        item['item_id'] = str(uuid4())
        item['language'] = '#' if response.meta['language'] is None else response.meta['language']
        item['region'] = self.region
        request = Request(
            new_link,
            callback=getattr(self, response.meta['method']),
            dont_filter=response.meta.get('dont_filter', False)
        )
        request.meta['item'] = item
        yield request
def parse_dailystarlinks(self, response):
    """
    Does not use an RSS Feed.
    """
    links = response.xpath(
        '//div[@class="three-33"]/ul[@class="list-border besides"]/li/a/@href'
    ).extract()
    frontpage_links = [link for link in links if 'frontpage' in link]
    for link in frontpage_links:
        item = NewsbyteItem()
        item['source'] = response.url
        item['link'] = response.urljoin(link)
        item['country'] = '#' if response.meta['country'] is None else response.meta['country']
        item['item_id'] = str(uuid4())
        item['language'] = '#' if response.meta['language'] is None else response.meta['language']
        item['region'] = self.region
        request = Request(
            item['link'],
            callback=getattr(self, response.meta['method']),
            dont_filter=response.meta.get('dont_filter', False)
        )
        request.meta['item'] = item
        request.meta['xpath'] = response.meta['xpath']
        yield request
def parse_raiamwema_links(self, response):
    """
    Does not use an RSS Feed. Scrapes site for links to articles.
    """
    # Gets latest posts
    links = response.xpath('//div[@class="item-details"]')
    for link in links:
        try:
            item = NewsbyteItem()
            item['source'] = response.url
            url = link.xpath('h3/a/@href').extract()
            item['link'] = url[0]
            title = link.xpath('h3/a/text()').extract()
            item['title'] = title[0]
            item['country'] = '#' if response.meta['country'] is None else response.meta['country']
            item['item_id'] = str(uuid4())
            item['language'] = '#' if response.meta['language'] is None else response.meta['language']
            item['region'] = self.region
            # If there is no pubdate, the time the article was scraped is used.
            pubdate = time.localtime()
            item['pubdate'] = time.mktime(pubdate)
            description = link.xpath('div[@class="td-excerpt"]/text()').extract()
            item['description'] = description[0]
            request = Request(
                item['link'],
                callback=getattr(self, response.meta['method']),
                dont_filter=response.meta.get('dont_filter', False)
            )
            request.meta['item'] = item
            yield request
        except Exception as e:
            print e
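# The try/except above mostly guards against IndexError when an XPath matches
# nothing and url[0]/title[0]/description[0] fail. A small standalone sketch of
# extract_first(), which returns a default instead of raising; it uses parsel,
# the selector library underlying Scrapy, and the HTML snippet is made up:
from parsel import Selector

_demo = Selector(text='<div class="item-details"><h3><a href="/a1">Habari</a></h3></div>')
_link = _demo.xpath('//div[@class="item-details"]')[0]
print _link.xpath('h3/a/@href').extract_first(default='')   # '/a1'
print _link.xpath('h3/a/text()').extract_first(default='')  # 'Habari'
print _link.xpath('div[@class="td-excerpt"]/text()').extract_first(default='')  # '' (no IndexError)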
def parse_dominican_today_links(self, response):
    """
    Does not use an RSS Feed. Scrapes site for links to articles.
    Only gets news categorized under "Local".
    """
    links = response.xpath('//div[contains(@class, "noticias2-category")]/h2/a')
    for link in links:
        try:
            item = NewsbyteItem()
            item['source'] = response.url
            url = link.xpath('@href').extract()
            item['link'] = url[0]
            title = link.xpath('text()').extract()
            item['title'] = title[0]
            item['country'] = '#' if response.meta['country'] is None else response.meta['country']
            item['item_id'] = str(uuid4())
            item['language'] = '#' if response.meta['language'] is None else response.meta['language']
            item['region'] = self.region
            request = Request(
                item['link'],
                callback=getattr(self, response.meta['method']),
                dont_filter=response.meta.get('dont_filter', False)
            )
            request.meta['item'] = item
            yield request
        except Exception as e:
            print e
def parse_al_ayyam(self, response):
    """
    Uses feedparser to walk the RSS feed, extracting the XML nodes needed
    to present a news article (e.g. title, link, publication date) and
    populating the scrapy.Fields of the NewsbyteItem with that info.
    """
    feed = feedparser.parse(response.body)
    for entry in feed.entries:
        try:
            item = NewsbyteItem()
            item['source'] = response.url
            item['title'] = lxml.html.fromstring(entry.title).text
            # Prefer the pre-parsed struct_time when feedparser provides it;
            # entry.published itself is always a string.
            pubdate = entry.get('published_parsed') or entry.published
            if not isinstance(pubdate, unicode):  # struct_time from feedparser
                if pubdate.tm_year < 2000:  # fix obviously wrong dates
                    pubdate = time.localtime()
                elif time.localtime().tm_yday - pubdate.tm_yday > 7:  # skip articles older than 7 days
                    continue
                item['pubdate'] = time.mktime(pubdate)
            else:
                pubdate = parse(pubdate, fuzzy=True, dayfirst=True)
                item['pubdate'] = time.mktime(pubdate.timetuple())
            item['link'] = 'http://www.al-ayyam.ps/' + entry.link
            item['country'] = '#' if response.meta['country'] is None else response.meta['country']
            item['language'] = '#' if response.meta['language'] is None else response.meta['language']
            item['description'] = entry.description
            item['item_id'] = str(uuid4())
            item['region'] = self.region
            request = Request(
                item['link'],
                callback=getattr(self, response.meta['method']),
                dont_filter=response.meta.get('dont_filter', False)
            )
            request.meta['item'] = item
            request.meta['entry'] = entry
            request.meta['xpath'] = response.meta['xpath']
            if 'thumb_xpath' in response.meta:
                request.meta['thumb_xpath'] = response.meta['thumb_xpath']
            else:
                item['thumbnail'] = ''
            yield request
        except Exception as e:
            print '%s: %s' % (type(e), e)
            print entry
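# parse_al_ayyam parses ambiguous string dates with dayfirst=True, while a few
# feeds handled in parse_common below need dayfirst=False. A tiny standalone
# illustration of why the flag matters (the date string is made up):
from dateutil.parser import parse as _dparse

_ambiguous = '03/04/2018 10:00'
print _dparse(_ambiguous, dayfirst=True)   # 2018-04-03 10:00:00 (3 April)
print _dparse(_ambiguous, dayfirst=False)  # 2018-03-04 10:00:00 (4 March)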
def parse_abante_links(self, response):
    """
    Does not use an RSS Feed. Scrapes site for links to articles.
    """
    # Gets links
    links = response.xpath('//div[contains(@class,"td-animation")]')
    for link in links:
        try:
            item = NewsbyteItem()
            item['source'] = response.url
            url = link.xpath('div[@class="item-details"]/h3/a/@href').extract()
            item['link'] = url[0]
            title = link.xpath('div[@class="item-details"]/h3/a/@title').extract()
            item['title'] = title[0]
            item['country'] = '#' if response.meta['country'] is None else response.meta['country']
            item['item_id'] = str(uuid4())
            item['language'] = '#' if response.meta['language'] is None else response.meta['language']
            item['region'] = self.region
            pubdate = link.xpath('div[@class="item-details"]/div/span/time/@datetime').extract()
            pubdate = parse(pubdate[0], fuzzy=True, dayfirst=False)
            item['pubdate'] = time.mktime(pubdate.timetuple())
            description = link.xpath('div[@class="item-details"]/div[@class="td-excerpt"]').extract()
            item['description'] = description[0]
            thumbnail = link.xpath('div[@class="td-module-thumb"]/a/img/@src').extract()
            item['thumbnail'] = thumbnail[0]
            request = Request(
                item['link'],
                callback=getattr(self, response.meta['method']),
                dont_filter=response.meta.get('dont_filter', False)
            )
            request.meta['item'] = item
            yield request
        except Exception as e:
            print e
def parse_megi_links(self, response):
    """
    Does not use an RSS Feed. Scrapes site for links to articles.
    """
    # Gets latest posts
    latest = response.xpath('//div[@class="latest_widget_blurb_wrapper"]')
    # Gets the header of the lifestyle news section
    lifestyle = response.xpath('//div[@class="homepage_article_widget_section_title" and text()="Lifestyle"]')
    # Gets front page posts that appear before the lifestyle news
    news = lifestyle.xpath('preceding::div[@class="special_smallstory_white_wrappers"]')
    # Combines latest news and other news
    links = latest + news
    for link in links:
        try:
            item = NewsbyteItem()
            item['source'] = response.url
            url = link.xpath('div/a/@href').extract()
            item['link'] = response.urljoin(url[0])
            title = link.xpath('div/a/text()').extract()
            item['title'] = title[0]
            item['country'] = '#' if response.meta['country'] is None else response.meta['country']
            item['item_id'] = str(uuid4())
            item['language'] = '#' if response.meta['language'] is None else response.meta['language']
            item['region'] = self.region
            # If there is no pubdate, the time the article was scraped is used.
            pubdate = time.localtime()
            item['pubdate'] = time.mktime(pubdate)
            description = link.xpath('div/text()').extract()
            item['description'] = description[1]
            request = Request(
                item['link'],
                callback=getattr(self, response.meta['method']),
                dont_filter=response.meta.get('dont_filter', False)
            )
            request.meta['item'] = item
            yield request
        except Exception as e:
            print e
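# parse_megi_links leans on the XPath preceding:: axis, which selects matching
# nodes anywhere earlier in the document, not just earlier siblings. A small
# standalone illustration with parsel and made-up HTML:
from parsel import Selector

_page = Selector(text='<div class="s">one</div><h2 id="cut">Lifestyle</h2><div class="s">two</div>')
_cut = _page.xpath('//h2[@id="cut"]')
print _cut.xpath('preceding::div[@class="s"]/text()').extract()  # ['one'] -- "two" follows the header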
def parse_common(self, response):
    """
    Uses feedparser to walk the RSS feed, extracting the XML nodes needed
    to present a news article (e.g. title, link, publication date) and
    populating the scrapy.Fields of the NewsbyteItem with that info.
    """
    feed = feedparser.parse(response.body)
    for entry in feed.entries:
        try:
            item = NewsbyteItem()
            # Get the published date of the article, falling back through the
            # fields feedparser may provide.
            pubdate = entry.get('published_parsed')
            if pubdate is None:
                pubdate = entry.get('updated_parsed')
            if pubdate is None:
                pubdate = entry.get('published')
            if pubdate is None:
                # If there is no pubdate, the time the article was scraped is used.
                pubdate = time.localtime()
            if not isinstance(pubdate, unicode):  # struct_time from feedparser
                # Fix Sri Lanka native dates
                if response.url == 'http://www.lankadeepa.lk/rss/latest_news/1':
                    split_date = entry.published.split()
                    # Create new pubdate from published_parsed and published dates
                    pubdate = time.strptime(
                        str(pubdate.tm_year) + " " + str(pubdate.tm_mon) + " " +
                        split_date[3] + " " + split_date[5],
                        "%Y %m %d %H:%M")
                if pubdate.tm_year < 2000:  # guard against bogus pre-2000 dates
                    pubdate = time.localtime()
                elif time.localtime().tm_yday - pubdate.tm_yday > 7:  # skip articles older than 7 days
                    print "outdated"
                    continue
                item['pubdate'] = time.mktime(pubdate)
            else:
                # Certain feeds use mm/dd/yyyy format instead of dd/mm/yyyy.
                if response.url in ('http://mubasher.aljazeera.net/rss.xml',
                                    'http://www.almadapaper.net/rss/'):
                    pubdate = parse(pubdate, fuzzy=True, dayfirst=False)
                else:
                    pubdate = parse(pubdate, fuzzy=True, dayfirst=True)
                # Skip articles older than 7 days.
                if time.localtime().tm_yday - pubdate.timetuple().tm_yday > 7:
                    continue
                item['pubdate'] = time.mktime(pubdate.timetuple())
            item['item_id'] = str(uuid4())
            item['source'] = response.url
            item['link'] = entry.link
            item['country'] = '#' if response.meta['country'] is None else response.meta['country']
            item['language'] = '#' if response.meta['language'] is None else response.meta['language']
            item['title'] = lxml.html.fromstring(entry.title).text
            item['description'] = entry.description
            item['region'] = self.region
            request = Request(
                entry.link,
                callback=getattr(self, response.meta['method']),
                dont_filter=response.meta.get('dont_filter', False)
            )
            request.meta['item'] = item
            request.meta['entry'] = entry
            request.meta['xpath'] = response.meta['xpath']
            if 'thumb_xpath' in response.meta:
                request.meta['thumb_xpath'] = response.meta['thumb_xpath']
            else:
                item['thumbnail'] = ''
            yield request
        except Exception as e:
            print '%s: %s' % (type(e), e)
            print entry
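# The tm_yday subtraction used above misbehaves across a year boundary: an
# article from 30 December yields a negative difference once January starts,
# so it is never treated as stale. A minimal sketch of a year-safe check built
# on epoch seconds; this helper is an assumption, not part of the original
# spider:
WEEK_SECONDS = 7 * 24 * 60 * 60

def _older_than_a_week(pubdate_struct):
    # Compare epoch seconds rather than day-of-year ordinals.
    return time.time() - time.mktime(pubdate_struct) > WEEK_SECONDS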