def url_path(self):
    if os.path.getsize(self.file_path) == 0:
        with open(self.file_path, "w+", encoding="utf-8") as uf:
            popular_urls_list = newspaper.popular_urls()
            for popular_url in popular_urls_list:
                uf.write(popular_url)
                uf.write("\n")
        print("popular url list has been written")
def get_trending():
    url = ('https://newsapi.org/v2/top-headlines?'
           'country=us&'
           'apiKey=bcaaf0d008994d818672b2c1141be98b')
    response = requests.get(url)
    popular_articles = {
        'articles': json.loads(response.text).get('articles')
    }
    popular_url_topics = {
        'popular_urls': newspaper.popular_urls(),
        'hot_topics': newspaper.hot()
    }
    return {**popular_articles, **popular_url_topics}
def trending():
    trending_terms = newspaper.hot()
    trending_urls = newspaper.popular_urls()[:10]
    return trending_terms, trending_urls
def test_popular_urls(self):
    """Just make sure this runs"""
    newspaper.popular_urls()
def get_popular():
    global pop
    pop = newspaper.popular_urls()
def get_popular_urls(self, *args, **kwargs):
    return newspaper.popular_urls()
def test_popular_urls(self):
    """Just make sure this method runs"""
    newspaper.popular_urls()
def newspaper_stories(words, search_type='or', search_level=0, urls=None,
                      display=True, memorize=False, language='en'):
    config = newspaper.Config()
    config.memoize_articles = memorize
    config.language = language
    config.fetch_images = False
    config.request_timeout = 20
    config.MIN_WORD_COUNT = 300
    config.MIN_SENT_COUNT = 10
    if urls is None or urls == 'top_news':
        news_urls = {
            'huffington': 'http://huffingtonpost.com',
            'reuters': 'http://www.reuters.com',
            'cbs-news': 'http://www.cbsnews.com',
            'usa-today': 'http://usatoday.com',
            'cnn': 'http://cnn.com',
            'npr': 'http://www.npr.org',
            'abc-news': 'http://abcnews.com',
            'us-news': 'http://www.usnews.com',
            'msn': 'http://msn.com',
            'pbs': 'http://www.pbs.org',
            'nbc-news': 'http://www.nbcnews.com',
            'msnbc': 'http://www.msnbc.com',
            'fox': 'http://www.foxnews.com'
        }
    elif urls == 'all_us_news':
        news_urls = {
            'abc-news': 'https://abcnews.go.com',
            'al-jazeera-english': 'http://www.aljazeera.com',
            'ars-technica': 'http://arstechnica.com',
            'associated-press': 'https://apnews.com/',
            'axios': 'https://www.axios.com',
            'bleacher-report': 'http://www.bleacherreport.com',
            'bloomberg': 'http://www.bloomberg.com',
            'breitbart-news': 'http://www.breitbart.com',
            'business-insider': 'http://www.businessinsider.com',
            'buzzfeed': 'https://www.buzzfeed.com',
            'cbs-news': 'http://www.cbsnews.com',
            'cnbc': 'http://www.cnbc.com',
            'cnn': 'http://us.cnn.com',
            'crypto-coins-news': 'https://www.ccn.com',
            'engadget': 'https://www.engadget.com',
            'entertainment-weekly': 'http://www.ew.com',
            'espn': 'http://espn.go.com',
            'espn-cric-info': 'http://www.espncricinfo.com/',
            'fortune': 'http://fortune.com',
            'fox-news': 'http://www.foxnews.com',
            'fox-sports': 'http://www.foxsports.com',
            'google-news': 'https://news.google.com',
            'hacker-news': 'https://news.ycombinator.com',
            'ign': 'http://www.ign.com',
            'mashable': 'http://mashable.com',
            'medical-news-today': 'http://www.medicalnewstoday.com',
            'msnbc': 'http://www.msnbc.com',
            'mtv-news': 'http://www.mtv.com/news',
            'national-geographic': 'http://news.nationalgeographic.com',
            'national-review': 'https://www.nationalreview.com/',
            'nbc-news': 'http://www.nbcnews.com',
            'new-scientist': 'https://www.newscientist.com/section/news',
            'newsweek': 'http://www.newsweek.com',
            'new-york-magazine': 'http://nymag.com',
            'next-big-future': 'https://www.nextbigfuture.com',
            'nfl-news': 'http://www.nfl.com/news',
            'nhl-news': 'https://www.nhl.com/news',
            'politico': 'https://www.politico.com',
            'polygon': 'http://www.polygon.com',
            'recode': 'http://www.recode.net',
            'reddit-r-all': 'https://www.reddit.com/r/all',
            'reuters': 'http://www.reuters.com',
            'techcrunch': 'https://techcrunch.com',
            'techradar': 'http://www.techradar.com',
            'american-conservative': 'http://www.theamericanconservative.com/',
            'hill': 'http://thehill.com',
            'huffington-post': 'http://www.huffingtonpost.com',
            'next-web': 'http://thenextweb.com',
            'verge': 'http://www.theverge.com',
            'wall-street-journal': 'http://www.wsj.com',
            'washington-post': 'https://www.washingtonpost.com',
            'washington-times': 'https://www.washingtontimes.com/',
            'time': 'http://time.com',
            'usa-today': 'http://www.usatoday.com/news',
            'vice-news': 'https://news.vice.com',
            'wired': 'https://www.wired.com'
        }
    elif urls == "texas_universities":
        news_urls = {
            'A&M': 'http://www.tamu.edu',
            'A&M-Commerce': 'http://www.tamuc.edu',
            'A&M-Corpus': 'http://www.tamucc.edu',
            'A&M-Kingsville': 'http://www.tamuk.edu',
            'A&M-Galveston': 'http://www.tamug.edu',
            'A&M-PrairieView': 'http://www.pvamu.edu',
            'A&M-International': 'http://www.tamiu.edu',
            'A&M-WestTexas': 'http://www.wtamu.edu',
            'Baylor': 'http://www.baylor.edu',
            'Rice': 'http://www.rice.edu',
            'SFAustin': 'http://www.sfasu.edu',
            'SMU': 'http://www.smu.edu',
            'SulRoss': 'http://www.sulross.edu',
            'TexasState': 'http://www.txstate.edu',
            'Texas_Tech': 'http://www.ttu.edu',
            'UDallas': 'http://www.udallas.edu',
            'UHouston': 'http://www.uh.edu',
            'UTexas': 'http://www.utexas.edu',
            'UT_Dallas': 'http://www.utdallas.edu',
            'UT_ElPaso': 'http://www.utep.edu',
            'UT_Houston': 'http://www.uth.edu',
            'UT_NorthTexas': 'http://www.unt.edu',
            'UT_SanAntonio': 'http://www.utsa.edu'
        }
    elif urls == 'popular':
        news_urls = {}
        agency_urls = newspaper.popular_urls()
        for i in range(len(agency_urls)):
            val = agency_urls[i]
            url = agency_urls[i].replace("http://", "")
            url = url.replace("www.", "")
            url = url.replace("blog.", "")
            url = url.replace("blogs.", "")
            url = url.replace(".com", "")
            url = url.replace(".net", "")
            url = url.replace(".au", "")
            url = url.replace(".org", "")
            url = url.replace(".co.uk", "")
            url = url.replace("the", "")
            url = url.replace(".", "-")
            url = url.replace('usa', 'usa-')
            if url == 'berkeley-edu':
                continue
            if url == 'beta-na-leagueoflegends':
                continue
            if url == 'bottomline-as-ucsb-edu':
                continue
            news_urls[url] = val
    else:
        news_urls = urls

    print("\nSearch Level {:<d}:".format(search_level), end="")
    if search_level == 0:
        print(" Screening URLs for search words")
        print(" URLs must contain one or more of:", end="")
    else:
        print(" No URL Screening")
        print(" Deep Search for Articles containing: ", end="")
    i = 0
    for word in words:
        i += 1
        if i < len(words):
            if search_type == 'or':
                print(word + " or ", end="")
            else:
                print(word + " & ", end="")
        else:
            print(word)

    df_articles = pd.DataFrame(columns=[
        'agency', 'url', 'length', 'keywords', 'title', 'summary', 'text'
    ])
    n_articles = {}
    today = str(date.today())
    for agency, url in news_urls.items():
        paper = newspaper.build(url, config=config)
        if display:
            print("\n{:>6d} Articles available from {:<s} on {:<10s}:".format(
                paper.size(), agency.upper(), today))
        article_collection = []
        for article in paper.articles:
            url_lower = article.url.lower()
            # Exclude articles that are in a language other than English,
            # or that consist mostly of video or pictures.
            # search_level 0 only downloads articles with at least one of
            # the key words in the URL.
            # search_level 1 downloads all articles that appear to be in
            # English and are not mainly photos or videos.
            # With either search level, a downloaded article is scanned to
            # see if it contains the search words, and it is compared to
            # other articles to verify that it is not a duplicate.
            # Special filters for some agencies
            if agency == 'cbs-news':
                if url_lower.find('.com') >= 0:
                    # secure-fly articles are duplicates of http articles
                    if article.url.find('secure-fly') >= 0:
                        continue
            if agency == 'usa-today':
                if url_lower.find('tunein.com') >= 0:
                    continue
            if agency == 'huffington':
                # Ignore huffington if it's not .com
                if url_lower.find('.com') < 0:
                    continue
            # Filter articles that are primarily video, film or not English
            if url_lower.find('.video/') >= 0 or \
               url_lower.find('/video') >= 0 or \
               url_lower.find('/picture') >= 0 or \
               url_lower.find('.pictures/') >= 0 or \
               url_lower.find('/photo') >= 0 or \
               url_lower.find('.photos/') >= 0 or \
               url_lower.find('espanol') >= 0 or \
               url_lower.find('.mx/') >= 0 or \
               url_lower.find('/mx.') >= 0 or \
               url_lower.find('.fr/') >= 0 or \
               url_lower.find('/fr.') >= 0 or \
               url_lower.find('.de/') >= 0 or \
               url_lower.find('/de.') >= 0 or \
               url_lower.find('.it/') >= 0 or \
               url_lower.find('/it.') >= 0 or \
               url_lower.find('.gr/') >= 0 or \
               url_lower.find('/gr.') >= 0 or \
               url_lower.find('.se/') >= 0 or \
               url_lower.find('/se.') >= 0 or \
               url_lower.find('.es/') >= 0 or \
               url_lower.find('/es.') >= 0 or \
               url_lower.find('?button') >= 0 or \
               url_lower.find('calendar.') >= 0 or \
               url_lower.find('calendar/') >= 0 or \
               url_lower.find('/event/') >= 0 or \
               url_lower.find('engr.utexas') >= 0 or \
               url_lower.find('sites.smu.') >= 0:
                continue
            # Filter if search_level == 0, URL quick search
            if search_level == 0:
                # Verify the URL contains at least one of the key words
                found_it = False
                for word in words:
                    j = url_lower.find(word)
                    if j >= 0:
                        found_it = True
                        break
                if found_it:
                    # Article contains words and passes filters
                    # Save this article for full review
                    article_collection.append(article.url)
            else:
                # No URL screening, save for full review
                article_collection.append(article.url)
        n_to_review = len(article_collection)
        if display:
            print("{:>6d} Selected for download".format(n_to_review))
        for article_url in article_collection:
            article = Article(article_url, config=config)
            try:
                article.download()
            except:
                if display:
                    print("Cannot download:", article_url[0:79])
                continue
            n = 0          # Limit download failures
            stop_sec = 1   # Initial max wait time in seconds
            while n < 2:
                try:
                    article.parse()
                    n = 99
                except:
                    n += 1
                    # Initiate download again before new parse attempt
                    article.download()
                    # Busy-wait up to stop_sec seconds for the download
                    t0 = time()
                    tlapse = 0
                    while tlapse < stop_sec:
                        tlapse = time() - t0
                    # Increase wait time if needed for next exception
                    stop_sec = stop_sec + 1
            if n != 99:
                if display:
                    print("Cannot download:", article_url[0:79])
                n_to_review -= 1
                continue
            article.nlp()
            keywords = article.keywords
            title = article.title
            summary = article.summary
            text = article.text
            text_lower_case = text.lower()
            if search_type == 'or':
                found_it = False
                # Verify the text contains at least one of the key words
                for word in words:
                    j = text_lower_case.find(word)
                    if j >= 0:
                        found_it = True
                        break
            else:
                # search type 'and': the text must contain every key word
                found_it = True
                for word in words:
                    j = text_lower_case.find(word)
                    if j < 0:
                        found_it = False
                        break
            if found_it:
                # Article contains words and passes filters
                # Save this article for later full review
                length = len(text)
                df_story = pd.DataFrame([[
                    agency, article_url, length, keywords, title, summary,
                    text
                ]], columns=[
                    'agency', 'url', 'length', 'keywords', 'title',
                    'summary', 'text'
                ])
                # Check for an identical story already in the file
                if df_articles.shape[0] == 0:
                    df_articles = df_articles.append(df_story)
                else:
                    # Verify this story is not already in df_articles
                    same_story = False
                    for i in range(df_articles.shape[0]):
                        if text == df_articles['text'].iloc[i]:
                            same_story = True
                            n_to_review -= 1
                            continue
                    if not same_story:
                        df_articles = df_articles.append(df_story)
            else:
                n_to_review -= 1
            print("=", end='')
        n_articles[agency] = [n_to_review, len(article_collection)]

    if display:
        print("\n\nArticles Selected by Agency:")
        for agency in news_urls:
            ratio = str(n_articles[agency][0]) + "/" + \
                    str(n_articles[agency][1])
            print("{:>10s} Articles from {:<s}".format(ratio, agency.upper()))
        print("\nArticles Collected on " + today + ":",
              df_articles.shape[0], 'from',
              df_articles['agency'].nunique(), "Agencies.")
        print("\nSize Agency Title")
        print("*{:->78s}*".format("-"))
        for i in range(df_articles.shape[0]):
            k = len(df_articles['title'].iloc[i])
            if k > 63:
                # Break the title at the last space before column 63
                for j in range(25):
                    k = 63 - j
                    if df_articles['title'].iloc[i][k] == " ":
                        break
                print("{:>5d} {:<10s} {:<63s}".format(
                    df_articles['length'].iloc[i],
                    df_articles['agency'].iloc[i],
                    df_articles['title'].iloc[i][0:k]))
                if len(df_articles['title'].iloc[i]) > 63:
                    print(" {:<60s}".format(
                        df_articles['title'].iloc[i][k:120]))
            else:
                print("{:>5d} {:<10s} {:<s}".format(
                    df_articles['length'].iloc[i],
                    df_articles['agency'].iloc[i],
                    df_articles['title'].iloc[i]))
        print("")
        print("*{:->78s}*".format("-"))
    return df_articles
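# A minimal usage sketch for newspaper_stories() above. The search words,
# settings, and output filename below are illustrative assumptions, not part
# of the original example.
if __name__ == '__main__':
    stories = newspaper_stories(['economy', 'inflation'],
                                search_type='or',
                                search_level=0,
                                urls='top_news',
                                display=True)
    stories.to_csv('collected_stories.csv', index=False)  # hypothetical output file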
aws_access_key_id="enter key", aws_secret_access_key="enter secret") # To create a new table, we have to set up the schema # The schema requires a primary hash key and allows a secondary range key my_schema = conn.create_schema(hash_key_name='id', hash_key_proto_value=int) # Create the table by providing a name, schema and the computing power required by AWS table = conn.create_table(name="newspaper", schema=my_schema, read_units=10, write_units=5) # To fill our table, we are going to parse through websites that publish news # The newspaper package happens to contain a list of over 200 popular news websites sources = newspaper.popular_urls() hashkey = 1 # primary key integer to be used for DynamoDB table source_iterator = 0 # iterator to be increased at end of loop to move to next source for source in sources: parser = sources[source_iterator] try: collect = newspaper.build( parser) # Build method returns an object with each sub-site url except: source_iterator += 1 # If the Build method fails, move on to next source # Store articles as dict type in a list to iterate through for upload to DynamoDB collection = [] for article in collect.articles:
def popular_urls():
    return newspaper.popular_urls()
import newspaper as nw

text = []
papers = []
popular_urls = nw.popular_urls()

for url in popular_urls:
    print 'building: ', url
    paper = nw.build(language='en', url=url, memoize_articles=False)
    papers.append(paper)
print 'done..'

#nw.news_pool.set(papers, threads_per_source=2)  # (3*2) = 6 threads total
#nw.news_pool.join()

with open('dataset.txt', 'w+') as f:
    for paper in papers:
        print len(paper.articles)
        for article in paper.articles:
            article.download()
            article.parse()
            text.append(article.title)
            try:
                if article.title != 'Something\'s Gone Terribly Wrong':
                    f.write((article.title.decode('utf-8').encode('cp1250')) + '\n')
            except:
                pass
def PopularUrl(self):
    return newspaper.popular_urls()
import boto.dynamodb
import newspaper

# Note you will need to enter your own private keys from AWS
conn = boto.dynamodb.connect_to_region(
    'us-west-2',
    aws_access_key_id="enter key",
    aws_secret_access_key="enter secret")

# To create a new table, we have to set up the schema
# The schema requires a primary hash key and allows a secondary range key
my_schema = conn.create_schema(hash_key_name='id', hash_key_proto_value=int)

# Create the table by providing a name, schema and the computing power required by AWS
table = conn.create_table(name="newspaper", schema=my_schema,
                          read_units=10, write_units=5)

# To fill our table, we are going to parse through websites that publish news
# The newspaper package happens to contain a list of over 200 popular news websites
sources = newspaper.popular_urls()

hashkey = 1          # primary key integer to be used for DynamoDB table
source_iterator = 0  # iterator to be increased at end of loop to move to next source

for source in sources:
    parser = sources[source_iterator]
    try:
        collect = newspaper.build(parser)  # Build method returns an object with each sub-site url
    except:
        source_iterator += 1  # If the Build method fails, move on to next source

    # Store articles as dict type in a list to iterate through for upload to DynamoDB
    collection = []
    for article in collect.articles:
        try:
import newspaper

# hot() returns a list of the top trending terms on Google using a public api
print(newspaper.hot())

# popular_urls() returns a list of popular news source urls
print(newspaper.popular_urls())

newspaper.languages()
import newspaper

volkskrant = newspaper.build('https://www.volkskrant.nl/', language='nl')
print(volkskrant.size())

for category in volkskrant.category_urls():
    print(category)

for article in volkskrant.articles:
    print(article.url)
print('\n')

print(newspaper.hot(), end='\n\n')
print(newspaper.popular_urls(), end='\n\n')
print(newspaper.languages())

# url = 'https://www.volkskrant.nl/nieuws-achtergrond/eerste-dode-in-nederland-maar-wat-is-eigenlijk-de-kans-om-aan-corona-te-overlijden~bf716564/'
# article = newspaper.Article(url)
# article.download()
# article.parse()
# print(article.authors)
# article.publish_date
# article.text
# article.top_image
# article.nlp()
# article.keywords
import newspaper
from newspaper import Article
import pandas as pd

data = pd.read_csv('../data/uci-news-aggregator.csv', sep=',')

#while(1):
#print(newspaper.popular_urls())
urls = newspaper.popular_urls()
#urls = ['https://www.wsj.com/news/world/middle-east','http://wsj.com','http://nytimes.com','http://www.bbc.co.uk','http://www.npr.org','http://www.reuters.com','http://www.economist.com','http://www.pbs.org','http://bigstory.ap.org','http://cnn.com','http://www.ted.com','http://www.washingtonpost.com','http://www.newyorker.com','http://www.cbs.com']
#print(data.head(5))

url_test = "https://www.nytimes.com/2020/08/08/business/economy/lost-unemployment-benefits.html?action=click&module=Top%20Stories&pgtype=Homepage"
article = Article(url_test)
article.download()
article.parse()
print(article.text)
exit()

for _, row in data.iterrows():
    try:
        first_article = Article(url=str(row['URL']), language='en')
        print(first_article)
        #article = newspaper.build(str(row['URL']), memoize_articles=False, language='en')
        #with open('sample'+str(row['ID']), 'w') as f:
    except:
        print("Error")

'''print("$$$$ New url: ", url)
cnn_paper = newspaper.build(str(url), memoize_articles=False, language='en')
print("$$$$ Number of articles: ", len(cnn_paper.articles))
j = 0
for article in cnn_paper.articles:
    try:
        j += 1
def popular_paper():
    popular_url = newspaper.popular_urls()
    return popular_url