def test_scrape_pub_date(self):
    """Check that the NYT scraper only returns articles published in the past.

    Scrapes the NYT Baseball RSS feed and asserts that every scraped
    article's ``pub_date`` is earlier than the current time in UTC.
    Note that this scrapes a live feed URL.
    """
    print("Testing that NYT Scraper only returns articles from the past")
    scraper = NYTScraper("http://rss.nytimes.com/services/xml/rss/nyt/Baseball.xml")
    scraper.scrape_all()

    # Take a genuinely UTC-aware "now". Localizing a naive local-time
    # datetime.now() as UTC skews the bound by the machine's UTC offset.
    # Captured after scraping so it is later than anything just fetched.
    now_utc = datetime.datetime.now(pytz.utc)

    for item in scraper.articles:
        # NOTE(review): presumably pub_date is timezone-aware; comparing a
        # naive value against an aware one raises TypeError - confirm.
        self.assertLess(item["pub_date"], now_utc)
def get_latest_articles(self):
    """Scrape this feed and save every article that is not already stored.

    A scraper is chosen from ``self.feed_url``, run via ``scrape_all()``,
    and each entry in ``scraper.articles`` is turned into an ``Article``
    attached to this feed. Entries whose title already exists in the
    ``Article`` table are skipped, so their article text is not fetched
    again.

    Raises:
        ValueError: if no scraper is known for ``self.feed_url``.
    """
    self.log.debug("Getting latest articles for feed {}".format(self.feed_url))

    # HACK: the right scraper is guessed by substring-matching the feed URL.
    if "nytimes.com" in self.feed_url:
        scraper = NYTScraper(self.feed_url)
    elif "tsn.ca" in self.feed_url:
        scraper = TSNScraper(self.feed_url)
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # the first use of `scraper` below.
        raise ValueError(
            "No scraper available for feed {}".format(self.feed_url)
        )

    scraper.scrape_all()

    for raw_article in scraper.articles:
        title = raw_article["article_title"]

        # Titles act as the de-duplication key. exists() asks the database
        # whether a match is present without loading the matching rows.
        # NOTE(review): assumes Article.objects is a Django manager - confirm.
        if Article.objects.filter(article_title=title).exists():
            print("[CACHED] '{}'".format(title))
            continue

        print("[SCRAPING] '{}'".format(title))
        article = Article()
        article.article_url = raw_article["article_url"]
        article.article_title = title
        # Fall back to the current time when the feed entry has no date.
        article.pub_date = raw_article["pub_date"] or timezone.now()
        article.parent_feed = self
        # Fetching the article body is the expensive step, so it happens
        # only for articles that are about to be saved.
        article.article_text = scraper.get_article_text(article.article_url)
        article.save()