def test_config_build(self): """Test if our **kwargs to config building setup actually works. NOTE: No need to mock responses as we are just initializing the objects, not actually calling download(..) """ a = Article(url='http://www.cnn.com/2013/11/27/' 'travel/weather-thanksgiving/index.html') assert a.config.language == 'en' assert a.config.memoize_articles is True assert a.config.use_meta_language is True a = Article(url='http://www.cnn.com/2013/11/27/travel/' 'weather-thanksgiving/index.html', language='zh', memoize_articles=False) assert a.config.language == 'zh' assert a.config.memoize_articles is False assert a.config.use_meta_language is False s = Source(url='http://cnn.com') assert s.config.language == 'en' assert s.config.MAX_FILE_MEMO == 20000 assert s.config.memoize_articles is True assert s.config.use_meta_language is True s = Source(url="http://cnn.com", memoize_articles=False, MAX_FILE_MEMO=10000, language='en') assert s.config.memoize_articles is False assert s.config.MAX_FILE_MEMO == 10000 assert s.config.language == 'en' assert s.config.use_meta_language is False
def main(): source="The Guardian" #config = Config() #config.memoize_articles = False guardian = Source("http://www.theguardian.com/world", memoize_articles=False) guardian.build() #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False) #news_pool.set([guardian], threads_per_source=2) #news_pool.join() #print(guardian.size()) for article in [x for x in guardian.articles if re.match(".*/world/.*", x.url) is not None]: url = article.url a = Article(url, language='en') a.download() for i in range(10): if a.is_downloaded: break else: a.download() try: a.parse() a.nlp() except: print("Error: Not parsed/downloaded correctly.") continue html = a.html summary = a.summary keywords = a.keywords title = a.title text = a.text date = str(a.publish_date).split()[0].split("-") date[0], date[1], date[2] = date[1], date[2], date[0] date = "/".join(date) delta = re.search(r'<span class="content__dateline-time">(.*)</span>' , html).group(1).replace(".",":").split()[0] time = datetime.now() + timedelta(hours=delta ) date_time = date + " " + time #print(title) #print(date_time) date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M') #print(date_obj.strftime('%Y/%m/%d %I:%M %p')) #TODO: Add stuff to the DB try: article = { 'headline': title, 'url': url, 'text': text, 'date': date_obj } newspaper_article('The Guardian', article, keywords=keywords) except Exception as ex: print 'Article could not be created due to following error' print ex
class News:
    articles = []

    def __init__(self, url):
        self.newspaper = Source(url)
        self.newspaper.clean_memo_cache()
        self.newspaper.build()
        self.articles = self.newspaper.articles

    def get_news(self, num_of_articles):
        return self.newspaper.articles[:num_of_articles]
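A minimal usage sketch of the News wrapper above. The CNN URL and the five-article limit are illustrative placeholders; the download()/parse() calls rely on newspaper's standard Article API, since Source.articles holds Article objects.

# Hypothetical usage of the News wrapper defined above.
news = News('http://cnn.com')
for article in news.get_news(5):
    article.download()
    article.parse()
    print(article.url, article.title)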
def test_config_build(self): """ Test if our **kwargs to config building setup actually works. """ a = Article( url= 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html' ) assert a.config.language == 'en' assert a.config.memoize_articles == True assert a.config.use_meta_language == True a = Article( url= 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html', language='zh', memoize_articles=False) assert a.config.language == 'zh' assert a.config.memoize_articles == False assert a.config.use_meta_language == False s = Source(url='http://cnn.com') assert s.config.language == 'en' assert s.config.MAX_FILE_MEMO == 20000 assert s.config.memoize_articles == True assert s.config.use_meta_language == True s = Source(url="http://cnn.com", memoize_articles=False, MAX_FILE_MEMO=10000, language='en') assert s.config.memoize_articles == False assert s.config.MAX_FILE_MEMO == 10000 assert s.config.language == 'en' assert s.config.use_meta_language == False s = newspaper.build('http://cnn.com', dry=True) assert s.config.language == 'en' assert s.config.MAX_FILE_MEMO == 20000 assert s.config.memoize_articles == True assert s.config.use_meta_language == True s = newspaper.build('http://cnn.com', dry=True, memoize_articles=False, MAX_FILE_MEMO=10000, language='zh') assert s.config.language == 'zh' assert s.config.MAX_FILE_MEMO == 10000 assert s.config.memoize_articles == False assert s.config.use_meta_language == False
def test_source_build(self): """ builds a source object, validates it has no errors, prints out all valid categories and feed urls """ DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides.""" BRAND = 'cnn' config = Configuration() config.verbose = False s = Source('http://cnn.com', config=config) s.clean_memo_cache() s.build() assert s.brand == BRAND assert s.description == DESC # For this test case and a few more, I don't believe you can actually # assert two values to equal eachother because some values are ever changing. # Insead, i'm just going to print some stuff out so it is just as easy to take # a glance and see if it looks OK. print '\t\tWe have %d articles currently!' % s.size() print print '\t\t%s categories are: %s' % (s.url, str(s.category_urls()))
def test_source_custom_params(self):
    s = Source(url="http://cnn.com", memoize_articles=False,
               MAX_FILE_MEMO=10000, language='en')
    self.assertFalse(s.config.memoize_articles)
    self.assertEqual(10000, s.config.MAX_FILE_MEMO)
    self.assertEqual('en', s.config.language)
    self.assertFalse(s.config.use_meta_language)
def crawl_sites(self, parse=True, download=True, nlp=True):
    self.do_parse = parse
    self.do_nlp = nlp
    assert not (self.do_parse ^ self.do_nlp), \
        """if nlp is set to true, parse must be set to true"""
    article_futures = []
    newspaper_config = self.crawler_config.crawl_option
    sources = {
        s.name: Source(s.url, newspaper_config)
        for s in self.crawler_config.sites
    }
    for s_name, source in sources.items():
        source.build()
        logger.info("Number of articles in newspaper %s is %d" %
                    (s_name, source.size()))
    logger.info("Built the sources for the newspapers")
    if not download:
        return sources
    logger.info("downloading the article data from the newspapers")
    for s_name, source in sources.items():
        article_futures.extend([
            self.news_pools[s_name].submit(self.download_article, article)
            for article in source.articles
        ])
    # download the actual content and parse
    for future_obj in as_completed(article_futures):
        self.article_callback(future_obj)
    return sources
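crawl_sites() above calls self.download_article and self.article_callback, which are not part of this snippet. The following is a rough sketch of what such helpers might look like, under the assumption that the futures carry newspaper Article objects and that parsing/NLP happen in the callback; it is illustrative, not the project's actual implementation.

# Hypothetical helpers assumed by crawl_sites() above (same class).
def download_article(self, article):
    # Runs in a worker thread of the per-source pool.
    article.download()
    return article

def article_callback(self, future_obj):
    article = future_obj.result()
    if self.do_parse:
        article.parse()
    if self.do_nlp:
        article.nlp()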
def test_source_build(self): """ builds a source object, validates it has no errors, prints out all valid categories and feed urls """ DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides.""" BRAND = "cnn" config = Configuration() config.verbose = False s = Source("http://cnn.com", config=config) s.clean_memo_cache() s.build() assert s.brand == BRAND assert s.description == DESC # For this test case and a few more, I don't believe you can actually # assert two values to equal eachother because some values are ever changing. # Insead, i'm just going to print some stuff out so it is just as easy to take # a glance and see if it looks OK. print "\t\tWe have %d articles currently!" % s.size() print print "\t\t%s categories are: %s" % (s.url, str(s.category_urls()))
def build_sources(self, domains):
    """Build sources using newspaper API to scrape from selected domains."""
    try:
        for domain in domains:
            source = 'http://%s' % domain
            self.sources.append(source)
        for source in self.sources:
            self.paper = Source(source)
            self.paper = self.newspaper.build(source,
                                              memoize_articles=True,
                                              keep_article_html=True,
                                              verbose=True)
            print('Source: {} - Size: {}'.format(source, self.paper.size()))
            self.papers.append(self.paper)
        self.news_pool.set(self.papers, threads_per_source=2)
        self.news_pool.join()
        return self.papers
    except:
        raise Exception
def test_source_build(self): """ builds a source object, validates it has no errors, prints out all valid categories and feed urls """ DESC = ('CNN.com International delivers breaking news from across ' 'the globe and information on the latest top stories, ' 'business, sports and entertainment headlines. Follow the ' 'news as it happens through: special reports, videos, ' 'audio, photo galleries plus interactive maps and timelines.') CATEGORY_URLS = [ u'http://cnn.com/ASIA', u'http://connecttheworld.blogs.cnn.com', u'http://cnn.com/HLN', u'http://cnn.com/MIDDLEEAST', u'http://cnn.com', u'http://ireport.cnn.com', u'http://cnn.com/video', u'http://transcripts.cnn.com', u'http://cnn.com/espanol', u'http://partners.cnn.com', u'http://www.cnn.com', u'http://cnn.com/US', u'http://cnn.com/EUROPE', u'http://cnn.com/TRAVEL', u'http://cnn.com/cnni', u'http://cnn.com/SPORT', u'http://cnn.com/mostpopular', u'http://arabic.cnn.com', u'http://cnn.com/WORLD', u'http://cnn.com/LATINAMERICA', u'http://us.cnn.com', u'http://travel.cnn.com', u'http://mexico.cnn.com', u'http://cnn.com/SHOWBIZ', u'http://edition.cnn.com', u'http://amanpour.blogs.cnn.com', u'http://money.cnn.com', u'http://cnn.com/tools/index.html', u'http://cnnespanol.cnn.com', u'http://cnn.com/CNNI', u'http://business.blogs.cnn.com', u'http://cnn.com/AFRICA', u'http://cnn.com/TECH', u'http://cnn.com/BUSINESS' ] FEEDS = [u'http://rss.cnn.com/rss/edition.rss'] BRAND = 'cnn' s = Source('http://cnn.com', verbose=False, memoize_articles=False) url_re = re.compile(".*cnn\.com") mock_response_with(url_re, 'cnn_main_site') s.clean_memo_cache() s.build() assert s.brand == BRAND assert s.description == DESC assert s.size() == 241 assert s.category_urls() == CATEGORY_URLS # TODO: A lot of the feed extraction is NOT being tested because feeds # are primarly extracted from the HTML of category URLs. We lose this # effect by just mocking CNN's main page HTML. Warning: tedious fix. assert s.feed_urls() == FEEDS
def test_source_build(self): """ builds a source object, validates it has no errors, prints out all valid categories and feed urls """ DESC = ('CNN.com International delivers breaking news from across ' 'the globe and information on the latest top stories, ' 'business, sports and entertainment headlines. Follow the ' 'news as it happens through: special reports, videos, ' 'audio, photo galleries plus interactive maps and timelines.') CATEGORY_URLS = [ u'http://cnn.com/ASIA', u'http://connecttheworld.blogs.cnn.com', u'http://cnn.com/HLN', u'http://cnn.com/MIDDLEEAST', u'http://cnn.com', u'http://ireport.cnn.com', u'http://cnn.com/video', u'http://transcripts.cnn.com', u'http://cnn.com/espanol', u'http://partners.cnn.com', u'http://www.cnn.com', u'http://cnn.com/US', u'http://cnn.com/EUROPE', u'http://cnn.com/TRAVEL', u'http://cnn.com/cnni', u'http://cnn.com/SPORT', u'http://cnn.com/mostpopular', u'http://arabic.cnn.com', u'http://cnn.com/WORLD', u'http://cnn.com/LATINAMERICA', u'http://us.cnn.com', u'http://travel.cnn.com', u'http://mexico.cnn.com', u'http://cnn.com/SHOWBIZ', u'http://edition.cnn.com', u'http://amanpour.blogs.cnn.com', u'http://money.cnn.com', u'http://cnn.com/tools/index.html', u'http://cnnespanol.cnn.com', u'http://cnn.com/CNNI', u'http://business.blogs.cnn.com', u'http://cnn.com/AFRICA', u'http://cnn.com/TECH', u'http://cnn.com/BUSINESS'] FEEDS = [u'http://rss.cnn.com/rss/edition.rss'] BRAND = 'cnn' s = Source('http://cnn.com', verbose=False, memoize_articles=False) url_re = re.compile(".*cnn\.com") mock_response_with(url_re, 'cnn_main_site') s.clean_memo_cache() s.build() assert s.brand == BRAND assert s.description == DESC assert s.size() == 241 assert s.category_urls() == CATEGORY_URLS # TODO: A lot of the feed extraction is NOT being tested because feeds # are primarly extracted from the HTML of category URLs. We lose this # effect by just mocking CNN's main page HTML. Warning: tedious fix. assert s.feed_urls() == FEEDS
def test_source_build(self): """ builds a source object, validates it has no errors, prints out all valid categories and feed urls """ DESC = ('CNN.com International delivers breaking news from across ' 'the globe and information on the latest top stories, ' 'business, sports and entertainment headlines. Follow the ' 'news as it happens through: special reports, videos, ' 'audio, photo galleries plus interactive maps and timelines.') CATEGORY_URLS = [ 'http://cnn.com/ASIA', 'http://connecttheworld.blogs.cnn.com', 'http://cnn.com/HLN', 'http://cnn.com/MIDDLEEAST', 'http://cnn.com', 'http://ireport.cnn.com', 'http://cnn.com/video', 'http://transcripts.cnn.com', 'http://cnn.com/espanol', 'http://partners.cnn.com', 'http://www.cnn.com', 'http://cnn.com/US', 'http://cnn.com/EUROPE', 'http://cnn.com/TRAVEL', 'http://cnn.com/cnni', 'http://cnn.com/SPORT', 'http://cnn.com/mostpopular', 'http://arabic.cnn.com', 'http://cnn.com/WORLD', 'http://cnn.com/LATINAMERICA', 'http://us.cnn.com', 'http://travel.cnn.com', 'http://mexico.cnn.com', 'http://cnn.com/SHOWBIZ', 'http://edition.cnn.com', 'http://amanpour.blogs.cnn.com', 'http://money.cnn.com', 'http://cnn.com/tools/index.html', 'http://cnnespanol.cnn.com', 'http://cnn.com/CNNI', 'http://business.blogs.cnn.com', 'http://cnn.com/AFRICA', 'http://cnn.com/TECH', 'http://cnn.com/BUSINESS'] FEEDS = ['http://rss.cnn.com/rss/edition.rss'] BRAND = 'cnn' s = Source('http://cnn.com', verbose=False, memoize_articles=False) # resp = mock_response_with('http://cnn.com', 'cnn_main_site') s.clean_memo_cache() s.build()
def extract_from_source(self, source):
    news = NpSource(source, verbose=True)
    news.clean_memo_cache()
    news.build()
    logging.info('...build done!')

    for url in news.article_urls():
        if self.is_available_url(url):
            article = self._extract_articles(url)
            if self.is_available_article(article):
                self._store_article(article)
def test_feed_extraction(self):
    """Test that feeds are matched properly
    """
    url = 'http://theatlantic.com'
    html = mock_resource_with('theatlantic.com1', 'html')
    s = Source(url, memoize_articles=False)
    s.html = html
    s.parse()

    # mock in categories containing only homepage
    #s.set_categories()
    category = Category(url=url)
    category.html = html
    category.doc = s.doc
    s.categories = [category, ]
    #s.parse_categories()
    s.set_feeds()
    self.assertEqual(len(s.feeds), 3)
def test_source_build(self): """builds a source object, validates it has no errors, prints out all valid categories and feed urls""" DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides.""" BRAND = 'cnn' configs = Configuration() configs.verbose = False s = Source('http://cnn.com', configs=configs) s.clean_memo_cache() s.build() assert s.brand == BRAND assert s.description == DESC print '\t\tWe have %d articles currently!' % s.size()
def test_source_build(self): """ builds a source object, validates it has no errors, prints out all valid categories and feed urls """ DESC = ('CNN.com International delivers breaking news from across ' 'the globe and information on the latest top stories, ' 'business, sports and entertainment headlines. Follow the ' 'news as it happens through: special reports, videos, ' 'audio, photo galleries plus interactive maps and timelines.') CATEGORY_URLS = [ 'http://cnn.com/ASIA', 'http://connecttheworld.blogs.cnn.com', 'http://cnn.com/HLN', 'http://cnn.com/MIDDLEEAST', 'http://cnn.com', 'http://ireport.cnn.com', 'http://cnn.com/video', 'http://transcripts.cnn.com', 'http://cnn.com/espanol', 'http://partners.cnn.com', 'http://www.cnn.com', 'http://cnn.com/US', 'http://cnn.com/EUROPE', 'http://cnn.com/TRAVEL', 'http://cnn.com/cnni', 'http://cnn.com/SPORT', 'http://cnn.com/mostpopular', 'http://arabic.cnn.com', 'http://cnn.com/WORLD', 'http://cnn.com/LATINAMERICA', 'http://us.cnn.com', 'http://travel.cnn.com', 'http://mexico.cnn.com', 'http://cnn.com/SHOWBIZ', 'http://edition.cnn.com', 'http://amanpour.blogs.cnn.com', 'http://money.cnn.com', 'http://cnn.com/tools/index.html', 'http://cnnespanol.cnn.com', 'http://cnn.com/CNNI', 'http://business.blogs.cnn.com', 'http://cnn.com/AFRICA', 'http://cnn.com/TECH', 'http://cnn.com/BUSINESS' ] FEEDS = ['http://rss.cnn.com/rss/edition.rss'] BRAND = 'cnn' s = Source('http://cnn.com', verbose=False, memoize_articles=False) # html = mock_resource_with('http://cnn.com', 'cnn_main_site') s.clean_memo_cache() s.build()
def test_source_url_input_none(self):
    with self.assertRaises(Exception):
        Source(url=None)
def main(): source = "Al Jazeera" aj = Source( "http://america.aljazeera.com/topics/topic/categories/international.html", memoize_articles=False) fetch_data(aj)
import newspaper
from newspaper import Source

url = 'http://www.prothomalo.com/'
bangla_paper = Source(url, memoize_articles=False, number_threads=20)
bangla_paper.build()
print(bangla_paper.size())

for article in bangla_paper.articles:
    try:
        article.download()
        article.parse()
        print(article.url)
        print('Title :\n' + str(article.title) + '\n')
        print('Content :\n' + str(article.text) + '\n')
        if len(article.tags) > 0:
            print('Tags :\n' + str(article.tags) + '\n')
        else:
            print('Tags :\n{}\n')
    except Exception as e:
        # print the actual error, not the Exception class
        print(e)

'''
#print (newspaper.languages())
url = 'http://www.kalerkantho.com/online/Islamic-lifestylie/2017/12/29/583269';
#url = 'https://bdnews24.com/neighbours/2017/12/29/indian-state-of-assam-tense-ahead-of-citizens-list-targeting-illegal-bangladeshis'
article = Article(url, language='bn')
'''
def test_cache_categories(self): """Builds two same source objects in a row examines speeds of both """ url = 'http://uk.yahoo.com' mock_response_with(url, 'yahoo_main_site') s = Source(url) s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] s.set_categories() assert sorted(s.category_urls()) == sorted(saved_urls)
def __init__(self, url):
    self.newspaper = Source(url)
    self.newspaper.clean_memo_cache()
    self.newspaper.build()
    self.articles = self.newspaper.articles
def test_cache_categories(self): """ builds two same source objects in a row examines speeds of both """ s = Source("http://yahoo.com") s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] # reset and try again with caching s.set_categories() assert sorted(s.category_urls()) == sorted(saved_urls)
class ExtractArticles():
    def __init__(self):
        self.sources = []
        self.papers = []
        self.pool = []
        self.categories = []
        self.category = None
        self.paper = None
        self.articles = []
        self.article = None
        self.newspaper = newspaper
        self.news_pool = news_pool

    def build_sources(self, domains):
        """Build sources using newspaper API to scrape from selected domains."""
        try:
            for domain in domains:
                source = 'http://%s' % domain
                self.sources.append(source)
            for source in self.sources:
                self.paper = Source(source)
                self.paper = self.newspaper.build(source,
                                                  memoize_articles=True,
                                                  keep_article_html=True,
                                                  verbose=True)
                print('Source: {} - Size: {}'.format(source, self.paper.size()))
                self.papers.append(self.paper)
            self.news_pool.set(self.papers, threads_per_source=2)
            self.news_pool.join()
            return self.papers
        except:
            raise Exception

    def parse_article(self, paper, order=0):
        self.paper = paper
        try:
            self.article = paper.articles[order]
            article = self.article
            article.download()
            article.parse()
            brand = paper.brand
            url = article.url
            text = article.text
            html = article.article_html
            title = article.title
            images = article.images
            video = article.movies
            date = article.publish_date
            result = {
                'paper': brand,
                'article_url': url,
                'title': title,
                'text': text,
                'content': html,
                'video': video,
                'images': images,
                'publish_time': date
            }
            return result
        except:
            raise Exception

    def parse_articles(self, pool):
        index = 0
        try:
            for paper in pool:
                size = paper.size()
                brand = paper.brand
                while index < size:
                    article = self.parse_article(paper, index)
                    self.articles.append(article)
                    index += 1
                if size == 0:
                    pass
                print('Paper [{}] has new [{}] articles'.format(brand, size))
            return self.articles
        except:
            raise Exception

    def remove_invalid_articles(self, pool):
        """Remove scraped articles with duplicated or None titles."""
        try:
            title_list = []
            article_list = []
            print('Original articles: {}'.format(len(pool)))
            for article in pool:
                title = article['title']
                if title is None or title == "":
                    # Skip articles without a usable title instead of
                    # mutating the list while iterating over it.
                    continue
                if title not in title_list:
                    title_list.append(title)
                    article_list.append(article)
            print('Unique articles: {}'.format(len(article_list)))
            return article_list
        except:
            raise Exception
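A brief, hypothetical driver for the ExtractArticles pipeline above; the domain names are placeholders, and the call order simply chains the methods defined in the class.

# Hypothetical end-to-end use of ExtractArticles.
extractor = ExtractArticles()
papers = extractor.build_sources(['example.com', 'example.org'])  # placeholder domains
articles = extractor.parse_articles(papers)
unique = extractor.remove_invalid_articles(articles)
print('Kept {} unique articles'.format(len(unique)))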
def test_cache_categories(self): """ builds two same source objects in a row examines speeds of both """ s = Source('http://yahoo.com') s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] # reset and try again with caching s.set_categories() assert sorted(s.category_urls()) == sorted(saved_urls)
def failfunc():
    Source(url=None)
def test_cache_categories(self): """Builds two same source objects in a row examines speeds of both """ url = 'http://uk.yahoo.com' html = mock_resource_with('yahoo_main_site', 'html') s = Source(url) s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] s.set_categories() self.assertCountEqual(saved_urls, s.category_urls())
fname = "data.csv" df_origin = pd.read_csv(fname, header=None) data = df_origin.as_matrix() print data.shape # create list of Article objects urls = data[1:, 0].tolist() # for each line in csv articles = [] for i in range(len(urls)): # print "iteration:{} {} ".format(i,urls[i]) articles.append(Article(url=urls[i])) # create a source of aricltes news_source = Source("https://www.dummyurl.com") news_source.articles = articles # create a news_pool for threading purposes news_pool.set([news_source], threads_per_source=2) news_pool.join() # iterate through article list to create a column for the csv print "Parsing articles..." article_list = [] labels = ['title', 'authors', 'text', 'keywords', 'summary', 'tags'] for article in articles: print "Parsing article {}".format(article.url) article.parse() article_list.append({ labels[0]: article.title,
def failfunc():
    __ = Source(url=None)
def paper(self):
    '''
    get newspaper articles, default source is `herald` newspaper
    defaults to articles of this month and year

        import newspaperzw
        news = newspaperzw.news()
    '''
    if self.summary and self.nlp == False:
        # raise exception. `nltk` module missing
        raise Exception(self.error_msg)

    news_source = Providers().getUrl(self.provider).strip()
    name = Source(news_source, self.config)
    name.build()
    name.download()
    name.parse()
    name.download_articles()
    # do logging
    logging.debug(f"News Source build and downloaded. url: {news_source}")

    news_data = {}
    news_article = []
    counter = 0

    for article in name.article_urls():
        images = ""
        keywords = ""
        try:
            name.articles[counter].download()
            name.articles[counter].parse()
            # log
            logging.debug(
                f"Article #{counter} downloaded and parsed successfully")
        except:
            counter += 1
            # log
            logging.error(
                f"Error download and parsing article #{counter}. continue..")
            continue

        # get in data
        title = name.articles[counter].title
        date_pub = name.articles[counter].publish_date
        top_image = name.articles[counter].top_image
        link = name.articles[counter].url
        text = name.articles[counter].text

        if self.nlp:
            # do nlp stuff
            name.articles[counter].nlp()
            summary = name.articles[counter].summary
            for words in name.articles[counter].keywords:
                keywords += str(words) + ','
            # log
            logging.debug(
                f"summary flag enabled. NLP summary obtained successfully")

        # add to news pool, only add news of this year and month
        # data_pub format = 10-04-2018 21:28:09
        data = {}
        if self.nlp:
            data.update({
                "article_id": randint(555, 999),
                "title": title,
                "published": date_pub,
                "image": top_image,
                "news": text,
                "summary": summary,
                "keywords": keywords.rstrip(','),
                "url": link
            })
            # log
            logging.debug("article data with summary saved to news pool!")
        else:
            data.update({
                "article_id": randint(555, 999),
                "title": title,
                "published": date_pub,
                "image": top_image,
                "news": text,
                "url": link
            })
            # log
            logging.debug("article data added to news pool")

        news_article.append(data)
        data = {}

        # increment to next articles
        counter += 1

    # build main news storage
    news_data.update({
        'source': name.brand,
        'domain': name.domain,
        'news': news_article
    })
    # log
    logging.debug("News main data pool created on success")

    return news_data
def test_source_default_params(self):
    s = Source(url='http://cnn.com')
    self.assertEqual('en', s.config.language)
    self.assertEqual(20000, s.config.MAX_FILE_MEMO)
    self.assertTrue(s.config.memoize_articles)
    self.assertTrue(s.config.use_meta_language)
def main(): source="The Huffington Post" delivery_time="6:00" #config = Config() #config.memoize_articles = False hpost = Source("http://huffingtonpost.com/theworldpost", memoize_articles=False) hpost.download() hpost.parse() hpost.set_categories() hpost.categories = [hpost.categories[0]] hpost.categories[0].url = "http://huffingtonpost.com/theworldpost" hpost.download_categories() hpost.parse_categories() hpost.set_feeds() hpost.download_feeds() hpost.generate_articles() #for c in hpost.categories: # print(c) #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False) #news_pool.set([guardian], threads_per_source=2) #news_pool.join() #print(hpost.size()) for article in [x for x in hpost.articles if re.match(".*html.*world.*", x.url) is not None]: url = article.url a = Article(url, language='en') a.download() for i in range(10): if a.is_downloaded: break else: a.download() try: a.parse() a.nlp() except: print("Error: Not parsed/downloaded correctly.") continue html = a.html summary = a.summary keywords = a.keywords title = a.title text = a.text #print(html) #print(text) #print(summary) #print(keywords) #print(title) #print(a.publish_date) if source in title: title = None #print(title) findtime = re.search(r'Posted.*<time datetime="(.*?)">', html) if findtime is None: date=None time=None else: date,time = findtime.group(1).split("T") date = date.split("-") date[0], date[1], date[2] = date[1], date[2], date[0] date = "/".join(date) time = ":".join(time.split("-")[0].split(":")[0:2]) date_time = str(date) + " " + str(time) #print(title) #print(date_time) date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M') #print(date_obj.strftime('%Y/%m/%d %I:%M %p')) try: article = { 'headline': title, 'url': url, 'text': text, 'date': date_obj } newspaper_article('Huffington Post', article, keywords=keywords) except Exception as ex: print 'Article could not be created due to following error' print ex
def main(): source = "The Washington Post" delivery_time = "6:00" #config = Config() #config.memoize_articles = False wpost = Source("http://washingtonpost.com/world", memoize_articles=False) wpost.download() wpost.parse() wpost.set_categories() wpost.categories = [wpost.categories[0]] wpost.categories[0].url = "http://washingtonpost.com/world" wpost.download_categories() wpost.parse_categories() wpost.set_feeds() wpost.download_feeds() wpost.generate_articles() #for c in wpost.categories: # print(c) #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False) #news_pool.set([guardian], threads_per_source=2) #news_pool.join() #print(wpost.size()) for article in [ x for x in wpost.articles if re.match(".*com/world/.*", x.url) is not None and re.match(".*gallery.html", x.url) is None ]: url = article.url a = Article(url, language='en') a.download() for i in range(10): if a.is_downloaded: break else: a.download() try: a.parse() a.nlp() except: print("Error: Not parsed/downloaded correctly.") continue html = a.html summary = a.summary keywords = a.keywords title = a.title text = a.text #print(html) #print(text) #print(summary) #print(keywords) #print(title) #print(a.publish_date) if source in title: title = None #print(title) if a.publish_date is not None: date = str(a.publish_date).split()[0].split("-") #print(date) date[0], date[1], date[2] = date[1], date[2], date[0] date = "/".join(date) else: date = None time = re.search(r'<span class="pb-timestamp">(.*?)</span>', html) if time is None: print(url) date = None else: time = time.group(1) if ":" not in time: time = delivery_time else: time = time.split(" at ")[1] time = datetime.datetime.strptime(time, '%I:%M %p').strftime('%H:%M') date_time = str(date) + " " + str(time) #print(date_time) date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M') #print(date_obj.strftime('%Y/%m/%d %I:%M %p')) #print(text) #print(date_time) #TODO: Add stuff to the DB try: article = { 'headline': title, 'url': url, 'text': text, 'date': date_obj } newspaper_article(source, article, keywords=keywords) except Exception as ex: print 'Article could not be created due to following error' print ex
def main(): source="The Washington Post" delivery_time="6:00" #config = Config() #config.memoize_articles = False wpost = Source("http://washingtonpost.com/world", memoize_articles=False) wpost.download() wpost.parse() wpost.set_categories() wpost.categories = [wpost.categories[0]] wpost.categories[0].url = "http://washingtonpost.com/world" wpost.download_categories() wpost.parse_categories() wpost.set_feeds() wpost.download_feeds() wpost.generate_articles() #for c in wpost.categories: # print(c) #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False) #news_pool.set([guardian], threads_per_source=2) #news_pool.join() #print(wpost.size()) for article in [x for x in wpost.articles if re.match(".*com/world/.*", x.url) is not None and re.match(".*gallery.html", x.url) is None]: url = article.url a = Article(url, language='en') a.download() for i in range(10): if a.is_downloaded: break else: a.download() try: a.parse() a.nlp() except: print("Error: Not parsed/downloaded correctly.") continue html = a.html summary = a.summary keywords = a.keywords title = a.title text = a.text #print(html) #print(text) #print(summary) #print(keywords) #print(title) #print(a.publish_date) if source in title: title = None #print(title) if a.publish_date is not None: date = str(a.publish_date).split()[0].split("-") #print(date) date[0], date[1], date[2] = date[1], date[2], date[0] date = "/".join(date) else: date = None time = re.search(r'<span class="pb-timestamp">(.*?)</span>' , html) if time is None: print(url) date = None else: time = time.group(1) if ":" not in time: time = delivery_time else: time = time.split(" at ")[1] time = datetime.datetime.strptime(time,'%I:%M %p').strftime('%H:%M') date_time = str(date) + " " + str(time) #print(date_time) date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M') #print(date_obj.strftime('%Y/%m/%d %I:%M %p')) #print(text) #print(date_time) #TODO: Add stuff to the DB try: article = { 'headline': title, 'url': url, 'text': text, 'date': date_obj } newspaper_article(source, article, keywords=keywords) except Exception as ex: print 'Article could not be created due to following error' print ex
def main(): source = "The Guardian" #config = Config() #config.memoize_articles = False guardian = Source("http://www.theguardian.com/world", memoize_articles=False) guardian.build() #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False) #news_pool.set([guardian], threads_per_source=2) #news_pool.join() #print(guardian.size()) for article in [ x for x in guardian.articles if re.match(".*/world/.*", x.url) is not None ]: url = article.url a = Article(url, language='en') a.download() for i in range(10): if a.is_downloaded: break else: a.download() try: a.parse() a.nlp() except: print("Error: Not parsed/downloaded correctly.") continue html = a.html summary = a.summary keywords = a.keywords title = a.title text = a.text date = str(a.publish_date).split()[0].split("-") date[0], date[1], date[2] = date[1], date[2], date[0] date = "/".join(date) delta = re.search(r'<span class="content__dateline-time">(.*)</span>', html).group(1).replace(".", ":").split()[0] time = datetime.now() + timedelta(hours=delta) date_time = date + " " + time #print(title) #print(date_time) date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M') #print(date_obj.strftime('%Y/%m/%d %I:%M %p')) #TODO: Add stuff to the DB try: article = { 'headline': title, 'url': url, 'text': text, 'date': date_obj } newspaper_article('The Guardian', article, keywords=keywords) except Exception as ex: print 'Article could not be created due to following error' print ex
def main(): source = "BBC" bbc = Source("http://www.bbc.com/news", memoize_articles=False) fetch_data(bbc)