def test_source_build(self):
    """Build a cnn.com Source and sanity-check its brand and description.

    Article count and category URLs change over time, so they are only
    printed for manual inspection rather than asserted exactly.
    """
    DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
    BRAND = "cnn"
    config = Configuration()
    config.verbose = False
    s = Source("http://cnn.com", config=config)
    s.clean_memo_cache()
    s.build()
    assert s.brand == BRAND
    assert s.description == DESC
    # For this test case and a few more, I don't believe you can actually
    # assert two values to equal each other because some values are ever
    # changing. Instead, print some stuff out so it is just as easy to take
    # a glance and see if it looks OK.
    # NOTE: converted to print() calls — the Python 2 print statements were
    # inconsistent with the print() usage elsewhere in this file.
    print("\t\tWe have %d articles currently!" % s.size())
    print()
    print("\t\t%s categories are: %s" % (s.url, str(s.category_urls())))
def test_source_build(self):
    """Build a cnn.com Source and validate its detected brand/description.

    Values that drift over time (article count, category URLs) are printed
    for a quick visual check instead of being asserted.
    """
    DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
    BRAND = 'cnn'
    config = Configuration()
    config.verbose = False
    s = Source('http://cnn.com', config=config)
    s.clean_memo_cache()
    s.build()
    assert s.brand == BRAND
    assert s.description == DESC
    # For this test case and a few more, I don't believe you can actually
    # assert two values to equal each other because some values are ever
    # changing. Instead, print some stuff out so it is just as easy to take
    # a glance and see if it looks OK.
    # NOTE: converted to print() calls — the Python 2 print statements were
    # inconsistent with the print() usage elsewhere in this file.
    print('\t\tWe have %d articles currently!' % s.size())
    print()
    print('\t\t%s categories are: %s' % (s.url, str(s.category_urls())))
def test_source_build(self):
    """Build a Source against mocked cnn.com HTML and assert its metadata.

    Because the main-site response is mocked, brand, description, article
    count, and category URLs are deterministic and asserted exactly.
    """
    DESC = ('CNN.com International delivers breaking news from across '
            'the globe and information on the latest top stories, '
            'business, sports and entertainment headlines. Follow the '
            'news as it happens through: special reports, videos, '
            'audio, photo galleries plus interactive maps and timelines.')
    CATEGORY_URLS = [
        u'http://cnn.com/ASIA', u'http://connecttheworld.blogs.cnn.com',
        u'http://cnn.com/HLN', u'http://cnn.com/MIDDLEEAST',
        u'http://cnn.com', u'http://ireport.cnn.com',
        u'http://cnn.com/video', u'http://transcripts.cnn.com',
        u'http://cnn.com/espanol', u'http://partners.cnn.com',
        u'http://www.cnn.com', u'http://cnn.com/US',
        u'http://cnn.com/EUROPE', u'http://cnn.com/TRAVEL',
        u'http://cnn.com/cnni', u'http://cnn.com/SPORT',
        u'http://cnn.com/mostpopular', u'http://arabic.cnn.com',
        u'http://cnn.com/WORLD', u'http://cnn.com/LATINAMERICA',
        u'http://us.cnn.com', u'http://travel.cnn.com',
        u'http://mexico.cnn.com', u'http://cnn.com/SHOWBIZ',
        u'http://edition.cnn.com', u'http://amanpour.blogs.cnn.com',
        u'http://money.cnn.com', u'http://cnn.com/tools/index.html',
        u'http://cnnespanol.cnn.com', u'http://cnn.com/CNNI',
        u'http://business.blogs.cnn.com', u'http://cnn.com/AFRICA',
        u'http://cnn.com/TECH', u'http://cnn.com/BUSINESS'
    ]
    FEEDS = [u'http://rss.cnn.com/rss/edition.rss']
    BRAND = 'cnn'

    s = Source('http://cnn.com', verbose=False, memoize_articles=False)
    # Raw string: "\." in a non-raw string is an invalid escape sequence
    # (SyntaxWarning, and an error on newer Pythons).
    url_re = re.compile(r".*cnn\.com")
    mock_response_with(url_re, 'cnn_main_site')
    s.clean_memo_cache()
    s.build()

    assert s.brand == BRAND
    assert s.description == DESC
    assert s.size() == 241
    assert s.category_urls() == CATEGORY_URLS
    # TODO: A lot of the feed extraction is NOT being tested because feeds
    # are primarily extracted from the HTML of category URLs. We lose this
    # effect by just mocking CNN's main page HTML. Warning: tedious fix.
    assert s.feed_urls() == FEEDS
def test_source_build(self):
    """Build a cnn.com Source and validate its brand and description.

    Prints the current article count for manual inspection (it changes
    over time and cannot be asserted exactly).
    """
    DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
    BRAND = 'cnn'
    config = Configuration()
    config.verbose = False
    # FIX: was `Source('http://cnn.com', configs=configs)` — the misspelled
    # keyword meant the Configuration was silently ignored; sibling tests in
    # this file pass `config=`.
    s = Source('http://cnn.com', config=config)
    s.clean_memo_cache()
    s.build()
    assert s.brand == BRAND
    assert s.description == DESC
    # NOTE: converted to print() — the Python 2 print statement was
    # inconsistent with the print() usage elsewhere in this file.
    print('\t\tWe have %d articles currently!' % s.size())
def test_source_build(self):
    """Build a Source against mocked cnn.com HTML and assert its metadata.

    Because the main-site response is mocked, brand, description, article
    count, and category URLs are deterministic and asserted exactly.
    """
    DESC = ('CNN.com International delivers breaking news from across '
            'the globe and information on the latest top stories, '
            'business, sports and entertainment headlines. Follow the '
            'news as it happens through: special reports, videos, '
            'audio, photo galleries plus interactive maps and timelines.')
    CATEGORY_URLS = [
        u'http://cnn.com/ASIA', u'http://connecttheworld.blogs.cnn.com',
        u'http://cnn.com/HLN', u'http://cnn.com/MIDDLEEAST',
        u'http://cnn.com', u'http://ireport.cnn.com',
        u'http://cnn.com/video', u'http://transcripts.cnn.com',
        u'http://cnn.com/espanol', u'http://partners.cnn.com',
        u'http://www.cnn.com', u'http://cnn.com/US',
        u'http://cnn.com/EUROPE', u'http://cnn.com/TRAVEL',
        u'http://cnn.com/cnni', u'http://cnn.com/SPORT',
        u'http://cnn.com/mostpopular', u'http://arabic.cnn.com',
        u'http://cnn.com/WORLD', u'http://cnn.com/LATINAMERICA',
        u'http://us.cnn.com', u'http://travel.cnn.com',
        u'http://mexico.cnn.com', u'http://cnn.com/SHOWBIZ',
        u'http://edition.cnn.com', u'http://amanpour.blogs.cnn.com',
        u'http://money.cnn.com', u'http://cnn.com/tools/index.html',
        u'http://cnnespanol.cnn.com', u'http://cnn.com/CNNI',
        u'http://business.blogs.cnn.com', u'http://cnn.com/AFRICA',
        u'http://cnn.com/TECH', u'http://cnn.com/BUSINESS']
    FEEDS = [u'http://rss.cnn.com/rss/edition.rss']
    BRAND = 'cnn'

    s = Source('http://cnn.com', verbose=False, memoize_articles=False)
    # Raw string: "\." in a non-raw string is an invalid escape sequence
    # (SyntaxWarning, and an error on newer Pythons).
    url_re = re.compile(r".*cnn\.com")
    mock_response_with(url_re, 'cnn_main_site')
    s.clean_memo_cache()
    s.build()

    assert s.brand == BRAND
    assert s.description == DESC
    assert s.size() == 241
    assert s.category_urls() == CATEGORY_URLS
    # TODO: A lot of the feed extraction is NOT being tested because feeds
    # are primarily extracted from the HTML of category URLs. We lose this
    # effect by just mocking CNN's main page HTML. Warning: tedious fix.
    assert s.feed_urls() == FEEDS
class ExtractArticles(object):
    """Thin wrapper over the newspaper API.

    Builds Source objects for a list of domains, downloads/parses their
    articles, and filters out articles with missing or duplicate titles.
    """

    def __init__(self):
        self.sources = []       # 'http://<domain>' strings
        self.papers = []        # built newspaper Source objects
        self.pool = []
        self.categories = []
        self.category = None
        self.paper = None       # last paper worked on
        self.articles = []      # accumulated parse_article() dicts
        self.article = None     # last article worked on
        self.newspaper = newspaper
        self.news_pool = news_pool

    def build_sources(self, domains):
        """Build sources using newspaper API to scrape from selected domains.

        Returns the list of built papers. Exceptions propagate unchanged —
        the original `except: raise Exception` discarded the real error and
        its traceback.
        """
        for domain in domains:
            self.sources.append('http://%s' % domain)
        for source in self.sources:
            # newspaper.build() returns the fully configured Source; the
            # bare Source(source) previously assigned here was dead code,
            # immediately overwritten.
            self.paper = self.newspaper.build(source,
                                              memoize_articles=True,
                                              keep_article_html=True,
                                              verbose=True)
            print('Source: {} - Size: {}'.format(source, self.paper.size()))
            self.papers.append(self.paper)
        self.news_pool.set(self.papers, threads_per_source=2)
        self.news_pool.join()
        return self.papers

    def parse_article(self, paper, order=0):
        """Download and parse article number *order* of *paper*.

        Returns a dict of the article's fields (brand, url, title, text,
        html, media, publish date).
        """
        self.paper = paper
        self.article = paper.articles[order]
        article = self.article
        article.download()
        article.parse()
        return {
            'paper': paper.brand,
            'article_url': article.url,
            'title': article.title,
            'text': article.text,
            'content': article.article_html,
            'video': article.movies,
            'images': article.images,
            'publish_time': article.publish_date,
        }

    def parse_articles(self, pool):
        """Parse every article of every paper in *pool*.

        Appends each parsed dict to self.articles and returns the list.
        """
        for paper in pool:
            size = paper.size()
            # FIX: the index must restart for each paper — the original kept
            # a single counter across papers, so every paper after the first
            # had its leading articles skipped.
            for order in range(size):
                self.articles.append(self.parse_article(paper, order))
            print('Paper [{}] has new [{}] articles'.format(paper.brand, size))
        return self.articles

    def remove_invalid_articles(self, pool):
        """Remove scraped articles with duplicated or None titles.

        Returns a new list; *pool* itself is not modified. The original
        implementation called pool.remove() while iterating pool (which
        skips the following element) and could still admit a None-titled
        article into the result.
        """
        seen_titles = set()
        valid_articles = []
        print('Original articles: {}'.format(len(pool)))
        for article in pool:
            title = article['title']
            # Drop missing/empty titles entirely.
            if not title:
                continue
            if title not in seen_titles:
                seen_titles.add(title)
                valid_articles.append(article)
        print('Unique articles: {}'.format(len(valid_articles)))
        return valid_articles
import newspaper
from newspaper import Source

# Build the Prothom Alo source; memoization disabled so every article is
# fetched fresh, with 20 download threads.
url = 'http://www.prothomalo.com/'
bangla_paper = Source(url, memoize_articles=False, number_threads=20)
bangla_paper.build()
print(bangla_paper.size())

for article in bangla_paper.articles:
    try:
        article.download()
        article.parse()
        print(article.url)
        print('Title :\n' + str(article.title) + '\n')
        print('Content :\n' + str(article.text) + '\n')
        if article.tags:
            print('Tags :\n' + str(article.tags) + '\n')
        else:
            print('Tags :\n{}\n')
    except Exception as err:
        # FIX: the original printed the Exception *class* object, not the
        # error that was actually raised.
        print(err)

'''
#print (newspaper.languages())
url = 'http://www.kalerkantho.com/online/Islamic-lifestylie/2017/12/29/583269';
#url = 'https://bdnews24.com/neighbours/2017/12/29/indian-state-of-assam-tense-ahead-of-citizens-list-targeting-illegal-bangladeshis'
article = Article(url, language='bn')
'''