Code example #1
File: unit_tests.py  Project: zhoubug/newspaper
    def test_cache_categories(self):
        """
        Builds two identical Source objects in a row and examines the speed of both.
        """
        s = Source('http://yahoo.com')
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
        s.categories = []  # reset and try again with caching
        s.set_categories()
        assert sorted(s.category_urls()) == sorted(saved_urls)
Code example #2
File: unit_tests.py  Project: WheresWardy/newspaper
    def test_cache_categories(self):
        """
        Builds two identical Source objects in a row and examines the speed of both.
        """
        s = Source("http://yahoo.com")
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
        s.categories = []  # reset and try again with caching
        s.set_categories()
        assert sorted(s.category_urls()) == sorted(saved_urls)
Code example #3
    def test_cache_categories(self):
        """Builds two same source objects in a row examines speeds of both
        """
        url = 'http://uk.yahoo.com'
        mock_response_with(url, 'yahoo_main_site')
        s = Source(url)
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
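        # reset and try again with caching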
        s.categories = []
        s.set_categories()
        assert sorted(s.category_urls()) == sorted(saved_urls)
Code example #4
    def test_cache_categories(self):
        """Builds two same source objects in a row examines speeds of both
        """
        url = 'http://uk.yahoo.com'
        html = mock_resource_with('yahoo_main_site', 'html')
        s = Source(url)
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
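        # reset and try again with caching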
        s.categories = []
        s.set_categories()
        self.assertCountEqual(saved_urls, s.category_urls())
Code example #5
File: unit_tests.py  Project: erezbil/newspaper
    def test_cache_categories(self):
        """Builds two same source objects in a row examines speeds of both
        """
        url = 'http://uk.yahoo.com'
        mock_response_with(url, 'yahoo_main_site')
        s = Source(url)
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
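        # reset and try again with caching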
        s.categories = []
        s.set_categories()
        assert sorted(s.category_urls()) == sorted(saved_urls)
Code example #6
File: unit_tests.py  Project: Newspad/newspaper
    def test_cache_categories(self):
        """Builds two same source objects in a row examines speeds of both
        """
        url = 'http://uk.yahoo.com'
        html = mock_resource_with('yahoo_main_site', 'html')
        s = Source(url)
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
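        # reset and try again with caching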
        s.categories = []
        s.set_categories()
        self.assertCountEqual(saved_urls, s.category_urls())
Code example #7
    def test_feed_extraction(self):
        """Test that feeds are matched properly
        """
        url = 'http://theatlantic.com'
        html = mock_resource_with('theatlantic.com1', 'html')
        s = Source(url, memoize_articles=False)
        s.html = html
        s.parse()
        # mock in categories containing only homepage
        # s.set_categories()
        category = Category(url=url)
        category.html = html
        category.doc = s.doc
        s.categories = [category]
        # s.parse_categories()
        s.set_feeds()
        self.assertEqual(len(s.feeds), 3)
Code example #8
def main():
    source="The Huffington Post"
    delivery_time="6:00"
    #config = Config()
    #config.memoize_articles = False
    hpost = Source("http://huffingtonpost.com/theworldpost", memoize_articles=False)
    hpost.download()
    hpost.parse()

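    # discover categories, then keep a single one pointed at the TheWorldPost section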
    hpost.set_categories()
    
    hpost.categories = [hpost.categories[0]]
    hpost.categories[0].url = "http://huffingtonpost.com/theworldpost"
    hpost.download_categories()
    hpost.parse_categories()

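    # locate the category's feeds, download them, and build the article list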
    hpost.set_feeds()
    hpost.download_feeds()

    hpost.generate_articles()
    
    #for c in hpost.categories:
    #    print(c)
    #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False)
    #news_pool.set([guardian], threads_per_source=2)
    #news_pool.join()

    #print(hpost.size())

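    # only process article URLs whose path contains "html" and "world"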
    for article in [x for x in hpost.articles if re.match(".*html.*world.*", x.url) is not None]:
        url = article.url
        a = Article(url, language='en')
        a.download()
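        # retry the download a few times until it reports success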
        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()
        
        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        text = a.text
        #print(html)
        #print(text)
        #print(summary)
        #print(keywords)
        #print(title)
        #print(a.publish_date)
        if source in title:
            title = None
        #print(title)
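        # pull the ISO-style timestamp out of the "Posted ... <time datetime=...>"
        # markup and rearrange it into the MM/DD/YYYY HH:MM form strptime expects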
        findtime = re.search(r'Posted.*<time datetime="(.*?)">', html)
        if findtime is None:
            date = None
            time = None
        else:
            date,time = findtime.group(1).split("T")
            date = date.split("-")
            date[0], date[1], date[2] = date[1], date[2], date[0]
            date = "/".join(date)
            
            time = ":".join(time.split("-")[0].split(":")[0:2])
        if date is None or time is None:
            continue  # no usable timestamp; avoid strptime("None None") below
        date_time = date + " " + time
        #print(title)
        #print(date_time)
        date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M')
        #print(date_obj.strftime('%Y/%m/%d %I:%M %p'))

        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article('Huffington Post', article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to the following error:')
            print(ex)
Code example #9
def main():
    source="The Washington Post"
    delivery_time="6:00"
    #config = Config()
    #config.memoize_articles = False
    wpost = Source("http://washingtonpost.com/world", memoize_articles=False)
    wpost.download()
    wpost.parse()

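    # discover categories, then keep a single one pointed at the world section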
    wpost.set_categories()
    
    wpost.categories = [wpost.categories[0]]
    wpost.categories[0].url = "http://washingtonpost.com/world"
    wpost.download_categories()
    wpost.parse_categories()

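    # locate the category's feeds, download them, and build the article list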
    wpost.set_feeds()
    wpost.download_feeds()

    wpost.generate_articles()
    
    #for c in wpost.categories:
    #    print(c)
    #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False)
    #news_pool.set([guardian], threads_per_source=2)
    #news_pool.join()

    #print(wpost.size())

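    # only process world-section article URLs, skipping photo galleries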
    for article in [x for x in wpost.articles if re.match(".*com/world/.*", x.url) is not None and re.match(".*gallery.html", x.url) is None]:
        url = article.url
        a = Article(url, language='en')
        a.download()

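        # retry the download a few times until it reports success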
        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()

        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        text = a.text
        #print(html)
        #print(text)
        #print(summary)
        #print(keywords)
        #print(title)
        #print(a.publish_date)
        if source in title:
            title = None
        #print(title)
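        # take the date from publish_date (YYYY-MM-DD rearranged to MM/DD/YYYY) and
        # the time from the pb-timestamp span, falling back to the delivery time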
        if a.publish_date is not None:
            date = str(a.publish_date).split()[0].split("-")
            #print(date)
            date[0], date[1], date[2] = date[1], date[2], date[0]
            date = "/".join(date)
        else:
            date = None
        time = re.search(r'<span class="pb-timestamp">(.*?)</span>', html)
        if time is None:
            print(url)
            date = None
        else:
            time = time.group(1)
            if ":" not in time:
                time = delivery_time
            else:
                time = time.split(" at ")[1]
                time = datetime.datetime.strptime(time,'%I:%M %p').strftime('%H:%M')
        if date is None or time is None:
            continue  # no usable timestamp; avoid strptime("None None") below
        date_time = date + " " + time
        #print(date_time)
        date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M')
        #print(date_obj.strftime('%Y/%m/%d %I:%M %p'))
        #print(text)
        #print(date_time)
        #TODO: Add stuff to the DB

        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article(source, article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to the following error:')
            print(ex)
Code example #10
def main():
    source = "The Washington Post"
    delivery_time = "6:00"
    #config = Config()
    #config.memoize_articles = False
    wpost = Source("http://washingtonpost.com/world", memoize_articles=False)
    wpost.download()
    wpost.parse()

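    # discover categories, then keep a single one pointed at the world section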
    wpost.set_categories()

    wpost.categories = [wpost.categories[0]]
    wpost.categories[0].url = "http://washingtonpost.com/world"
    wpost.download_categories()
    wpost.parse_categories()

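    # locate the category's feeds, download them, and build the article list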
    wpost.set_feeds()
    wpost.download_feeds()

    wpost.generate_articles()

    #for c in wpost.categories:
    #    print(c)
    #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False)
    #news_pool.set([guardian], threads_per_source=2)
    #news_pool.join()

    #print(wpost.size())

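    # only process world-section article URLs, skipping photo galleries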
    for article in [
            x for x in wpost.articles
            if re.match(".*com/world/.*", x.url) is not None
            and re.match(".*gallery.html", x.url) is None
    ]:
        url = article.url
        a = Article(url, language='en')
        a.download()

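        # retry the download a few times until it reports success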
        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()

        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        text = a.text
        #print(html)
        #print(text)
        #print(summary)
        #print(keywords)
        #print(title)
        #print(a.publish_date)
        if source in title:
            title = None
        #print(title)
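        # take the date from publish_date (YYYY-MM-DD rearranged to MM/DD/YYYY) and
        # the time from the pb-timestamp span, falling back to the delivery time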
        if a.publish_date is not None:
            date = str(a.publish_date).split()[0].split("-")
            #print(date)
            date[0], date[1], date[2] = date[1], date[2], date[0]
            date = "/".join(date)
        else:
            date = None
        time = re.search(r'<span class="pb-timestamp">(.*?)</span>', html)
        if time is None:
            print(url)
            date = None
        else:
            time = time.group(1)
            if ":" not in time:
                time = delivery_time
            else:
                time = time.split(" at ")[1]
                time = datetime.datetime.strptime(time,
                                                  '%I:%M %p').strftime('%H:%M')
        if date is None or time is None:
            continue  # no usable timestamp; avoid strptime("None None") below
        date_time = date + " " + time
        #print(date_time)
        date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M')
        #print(date_obj.strftime('%Y/%m/%d %I:%M %p'))
        #print(text)
        #print(date_time)
        #TODO: Add stuff to the DB

        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article(source, article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to the following error:')
            print(ex)