Example #1
0
 def test_scrape_pub_date(self):
     """Check that no article scraped from the NYT feed is dated in the future.

     Builds an ``NYTScraper`` for the NYT Baseball RSS URL, runs
     ``scrape_all()`` and compares each article's ``pub_date`` with the
     current UTC time.  NOTE(review): presumably this hits the live feed and
     therefore needs network access -- confirm.
     """
     print("Testing that NYT Scraper only returns articles from the past")
     scraper = NYTScraper("http://rss.nytimes.com/services/xml/rss/nyt/Baseball.xml")
     scraper.scrape_all()
     # Take an aware UTC "now" directly.  Localizing a naive local-time now()
     # as UTC would shift the cutoff by the machine's UTC offset.
     now_utc = datetime.datetime.now(pytz.utc)
     for item in scraper.articles:
         self.assertLess(item["pub_date"], now_utc)
Example #2
0
    def get_latest_articles(self):
        """Scrape this feed and save every article whose title is not stored yet.

        The scraper class is chosen by looking for a known domain inside
        ``self.feed_url``.  Each scraped entry is looked up by title among the
        stored ``Article`` rows; only unseen titles have their text fetched
        and are saved with this feed as ``parent_feed``.

        Raises:
            ValueError: if no scraper is known for ``self.feed_url``.
        """
        self.log.debug("Getting latest articles for feed {}".format(self.feed_url))

        # NOTE: the scraper is picked by a plain substring match on the URL.
        if "nytimes.com" in self.feed_url:
            scraper = NYTScraper(self.feed_url)
        elif "tsn.ca" in self.feed_url:
            scraper = TSNScraper(self.feed_url)
        else:
            # Fail with a clear message instead of hitting an unbound
            # ``scraper`` variable a few lines further down.
            raise ValueError(
                "No scraper available for feed {}".format(self.feed_url)
            )

        scraper.scrape_all()
        for raw_article in scraper.articles:
            title = raw_article["article_title"]

            # exists() only asks the database whether a match is present,
            # rather than loading every matching row just to count them.
            if Article.objects.filter(article_title=title).exists():
                print("[CACHED] '{}'".format(title))
                continue

            print("[SCRAPING] '{}'".format(title))
            article = Article()
            article.article_url = raw_article["article_url"]
            article.article_title = title
            # Fall back to the current time when the entry has no usable date.
            article.pub_date = raw_article["pub_date"] or timezone.now()
            article.parent_feed = self
            article.article_text = scraper.get_article_text(article.article_url)
            article.save()