Example #1
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = "cnn"

        config = Configuration()
        config.verbose = False
        s = Source("http://cnn.com", config=config)
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC

        # For this test case and a few more, I don't believe you can actually
        # assert two values to equal each other because some values are ever changing.

        # Instead, I'm just going to print some stuff out so it is just as easy to take
        # a glance and see if it looks OK.

        print "\t\tWe have %d articles currently!" % s.size()
        print
        print "\t\t%s categories are: %s" % (s.url, str(s.category_urls()))
Example #2
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = 'cnn'

        config = Configuration()
        config.verbose = False
        s = Source('http://cnn.com', config=config)
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC

        # For this test case and a few more, I don't believe you can actually
        # assert two values to equal each other because some values are ever changing.

        # Instead, I'm just going to print some stuff out so it is just as easy to take
        # a glance and see if it looks OK.

        print('\t\tWe have %d articles currently!' % s.size())
        print()
        print('\t\t%s categories are: %s' % (s.url, str(s.category_urls())))
Example #3
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = ('CNN.com International delivers breaking news from across '
                'the globe and information on the latest top stories, '
                'business, sports and entertainment headlines. Follow the '
                'news as it happens through: special reports, videos, '
                'audio, photo galleries plus interactive maps and timelines.')
        CATEGORY_URLS = [
            u'http://cnn.com/ASIA', u'http://connecttheworld.blogs.cnn.com',
            u'http://cnn.com/HLN', u'http://cnn.com/MIDDLEEAST',
            u'http://cnn.com', u'http://ireport.cnn.com',
            u'http://cnn.com/video', u'http://transcripts.cnn.com',
            u'http://cnn.com/espanol', u'http://partners.cnn.com',
            u'http://www.cnn.com', u'http://cnn.com/US',
            u'http://cnn.com/EUROPE', u'http://cnn.com/TRAVEL',
            u'http://cnn.com/cnni', u'http://cnn.com/SPORT',
            u'http://cnn.com/mostpopular', u'http://arabic.cnn.com',
            u'http://cnn.com/WORLD', u'http://cnn.com/LATINAMERICA',
            u'http://us.cnn.com', u'http://travel.cnn.com',
            u'http://mexico.cnn.com', u'http://cnn.com/SHOWBIZ',
            u'http://edition.cnn.com', u'http://amanpour.blogs.cnn.com',
            u'http://money.cnn.com', u'http://cnn.com/tools/index.html',
            u'http://cnnespanol.cnn.com', u'http://cnn.com/CNNI',
            u'http://business.blogs.cnn.com', u'http://cnn.com/AFRICA',
            u'http://cnn.com/TECH', u'http://cnn.com/BUSINESS'
        ]
        FEEDS = [u'http://rss.cnn.com/rss/edition.rss']
        BRAND = 'cnn'

        s = Source('http://cnn.com', verbose=False, memoize_articles=False)
        url_re = re.compile(r".*cnn\.com")  # raw string: '\.' is not a valid str escape
        mock_response_with(url_re, 'cnn_main_site')
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC
        assert s.size() == 241
        assert s.category_urls() == CATEGORY_URLS
        # TODO: A lot of the feed extraction is NOT being tested because feeds
        # are primarily extracted from the HTML of category URLs. We lose this
        # effect by just mocking CNN's main page HTML. Warning: tedious fix.
        assert s.feed_urls() == FEEDS
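Example #3 stays deterministic because it stubs the network: mock_response_with() serves a saved copy of CNN's front page for every matching URL. A minimal sketch of what such a helper could look like, assuming fixtures are local HTML files and that pages are fetched through requests.get (both assumptions; the real test plumbing may differ):

from unittest import mock

import requests

def mock_response_with(url_re, fixture_name):
    """Hypothetical helper: patch requests.get so any URL matching
    url_re returns a canned HTML fixture instead of hitting the network."""
    with open('fixtures/%s.html' % fixture_name, 'rb') as f:
        body = f.read()
    real_get = requests.get

    def fake_get(url, *args, **kwargs):
        if url_re.match(url):
            resp = requests.models.Response()
            resp.status_code = 200
            resp._content = body  # bytes body served from the fixture
            resp.url = url
            return resp
        return real_get(url, *args, **kwargs)  # pass other URLs through

    patcher = mock.patch('requests.get', side_effect=fake_get)
    patcher.start()
    return patcher  # caller can .stop() once the test is done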
Example #4
    def test_source_build(self):
        """builds a source object, validates it has no errors, prints out
        all valid categories and feed urls"""

        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = 'cnn'

        config = Configuration()
        config.verbose = False
        s = Source('http://cnn.com', config=config)
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC

        print('\t\tWe have %d articles currently!' % s.size())
Example #5
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = ('CNN.com International delivers breaking news from across '
                'the globe and information on the latest top stories, '
                'business, sports and entertainment headlines. Follow the '
                'news as it happens through: special reports, videos, '
                'audio, photo galleries plus interactive maps and timelines.')
        CATEGORY_URLS = [
            u'http://cnn.com/ASIA', u'http://connecttheworld.blogs.cnn.com',
            u'http://cnn.com/HLN', u'http://cnn.com/MIDDLEEAST',
            u'http://cnn.com', u'http://ireport.cnn.com',
            u'http://cnn.com/video', u'http://transcripts.cnn.com',
            u'http://cnn.com/espanol',
            u'http://partners.cnn.com', u'http://www.cnn.com',
            u'http://cnn.com/US', u'http://cnn.com/EUROPE',
            u'http://cnn.com/TRAVEL', u'http://cnn.com/cnni',
            u'http://cnn.com/SPORT', u'http://cnn.com/mostpopular',
            u'http://arabic.cnn.com', u'http://cnn.com/WORLD',
            u'http://cnn.com/LATINAMERICA', u'http://us.cnn.com',
            u'http://travel.cnn.com', u'http://mexico.cnn.com',
            u'http://cnn.com/SHOWBIZ', u'http://edition.cnn.com',
            u'http://amanpour.blogs.cnn.com', u'http://money.cnn.com',
            u'http://cnn.com/tools/index.html', u'http://cnnespanol.cnn.com',
            u'http://cnn.com/CNNI', u'http://business.blogs.cnn.com',
            u'http://cnn.com/AFRICA', u'http://cnn.com/TECH',
            u'http://cnn.com/BUSINESS']
        FEEDS = [u'http://rss.cnn.com/rss/edition.rss']
        BRAND = 'cnn'

        s = Source('http://cnn.com', verbose=False, memoize_articles=False)
        url_re = re.compile(r".*cnn\.com")  # raw string: '\.' is not a valid str escape
        mock_response_with(url_re, 'cnn_main_site')
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC
        assert s.size() == 241
        assert s.category_urls() == CATEGORY_URLS
        # TODO: A lot of the feed extraction is NOT being tested because feeds
        # are primarily extracted from the HTML of category URLs. We lose this
        # effect by just mocking CNN's main page HTML. Warning: tedious fix.
        assert s.feed_urls() == FEEDS
class ExtractArticles:
    """Thin wrapper around the newspaper API for bulk article extraction."""
    def __init__(self):
        self.sources = []
        self.papers = []
        self.pool = []
        self.categories = []
        self.category = None
        self.paper = None
        self.articles = []
        self.article = None
        self.newspaper = newspaper
        self.news_pool = news_pool

    def build_sources(self, domains):
        """Build sources using the newspaper API to scrape the selected domains."""
        try:
            for domain in domains:
                source = 'http://%s' % domain
                self.sources.append(source)
            for source in self.sources:
                # newspaper.build() already returns a configured Source;
                # the extra Source(source) instantiation was dead code.
                self.paper = self.newspaper.build(source,
                                                  memoize_articles=True,
                                                  keep_article_html=True,
                                                  verbose=True)
                print('Source: {} - Size: {}'.format(source,
                                                     self.paper.size()))
                self.papers.append(self.paper)
            # Download every queued article concurrently, two threads per source.
            self.news_pool.set(self.papers, threads_per_source=2)
            self.news_pool.join()
            return self.papers
        except Exception:
            # Re-raise the original error instead of masking it with a bare Exception.
            raise

    def parse_article(self, paper, order=0):
        """Download and parse one article from a paper; return it as a dict."""
        self.paper = paper
        try:
            self.article = paper.articles[order]
            article = self.article
            article.download()
            article.parse()
            result = {
                'paper': paper.brand,
                'article_url': article.url,
                'title': article.title,
                'text': article.text,
                'content': article.article_html,
                'video': article.movies,
                'images': article.images,
                'publish_time': article.publish_date
            }
            return result
        except Exception:
            # Preserve the original traceback rather than raising bare Exception.
            raise

    def parse_articles(self, pool):
        """Parse every article of every paper in the pool."""
        try:
            for paper in pool:
                size = paper.size()
                brand = paper.brand
                # Reset the counter per paper; the original shared index was
                # never reset, so every paper after the first was skipped.
                for index in range(size):
                    article = self.parse_article(paper, index)
                    self.articles.append(article)
                print('Paper [{}] has [{}] new articles'.format(brand, size))
            return self.articles
        except Exception:
            raise

    def remove_invalid_articles(self, pool):
        """Remove scraped articles with duplicated or missing titles."""
        try:
            title_list = []
            article_list = []
            print('Original articles: {}'.format(len(pool)))
            for article in pool:
                title = article['title']
                # Skip invalid titles instead of removing items from the list
                # being iterated, which silently skips the following element.
                if title is None or title == "":
                    continue
                if title not in title_list:
                    title_list.append(title)
                    article_list.append(article)
            print('Unique articles: {}'.format(len(article_list)))
            return article_list
        except Exception:
            raise
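Putting the class together, a minimal end-to-end sketch of the intended pipeline; the domain is a placeholder and live network access is assumed:

# Hypothetical end-to-end run: build sources, parse everything, de-duplicate.
extractor = ExtractArticles()
papers = extractor.build_sources(['cnn.com'])         # placeholder domain
articles = extractor.parse_articles(papers)           # list of result dicts
unique = extractor.remove_invalid_articles(articles)  # de-duplicated results
print('Kept {} of {} parsed articles'.format(len(unique), len(articles)))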
import newspaper
from newspaper import Source

url = 'http://www.prothomalo.com/'
bangla_paper = Source(url, memoize_articles=False, number_threads=20)
bangla_paper.build()
print(bangla_paper.size())

for article in bangla_paper.articles:
    try:
        article.download()
        article.parse()
        print(article.url)
        print('Title :\n' + str(article.title) + '\n')
        print('Content :\n' + str(article.text) + '\n')

        if article.tags:
            print('Tags :\n' + str(article.tags) + '\n')
        else:
            print('Tags :\n{}\n')

    except Exception as e:
        # Print the actual error, not the Exception class itself.
        print(e)
'''
#print (newspaper.languages())
url = 'http://www.kalerkantho.com/online/Islamic-lifestylie/2017/12/29/583269';
#url = 'https://bdnews24.com/neighbours/2017/12/29/indian-state-of-assam-tense-ahead-of-citizens-list-targeting-illegal-bangladeshis'
article = Article(url, language='bn')
'''
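The commented-out block above gestures at per-article, language-aware extraction. A minimal sketch of that usage with newspaper's Article class, reusing the URL from the comment and assuming it is still reachable:

from newspaper import Article

# Bengali ('bn') extraction for a single article; URL taken from the
# commented-out experiment above.
url = 'http://www.kalerkantho.com/online/Islamic-lifestylie/2017/12/29/583269'
article = Article(url, language='bn')
article.download()
article.parse()
print(article.title)
print(article.text)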