# Shared imports for the scrapers below; search_news comes from the `google`
# package. The helpers rando, stop_limit, wordlist, and the per-site
# get_title_* / get_authors_* / get_content_* / get_date_* functions are
# assumed to be defined elsewhere in the project.
import json
import random

import requests
from google import search_news


def nyt(query_subject):
    # Search Google News for the subject, restricted to nytimes.com.
    links = []
    for url in search_news(query_subject + ' site:https://www.nytimes.com',
                           pause=rando(), stop=stop_limit):
        links.append(url)
    # Fetch each result, skipping links that fail to connect.
    target_article = []
    for link in links:
        try:
            target_article.append(requests.get(link))
        except requests.exceptions.ConnectionError:
            print('Connection failed for ' + link)
    # Parse every response into a uniform article dict.
    article_list = []
    for request in target_article:
        article = {
            'Title': get_title_nyt(request),
            'Authors': get_authors_nyt(request),
            'Text': wordlist(get_content_nyt(request)),
            'Date': '',  # no NYT date helper in this snippet
            'Publication': 'New York Times',
        }
        article_list.append(article)
    return article_list
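# Usage sketch for nyt(); the query below is only an example, and the demo is
# wrapped in a function so nothing runs at import time.
def demo_nyt():
    for a in nyt('debate night'):
        print(a['Title'] + ' - ' + ', '.join(a['Authors']))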
def hp(query_subject):
    # Same pipeline as nyt(), pointed at huffingtonpost.com; hp() also
    # fills in the publication date via get_date_hp.
    links = []
    for url in search_news(query_subject + ' site:http://www.huffingtonpost.com',
                           pause=rando(), stop=20):
        links.append(url)
    target_article = []
    for link in links:
        try:
            target_article.append(requests.get(link))
        except requests.exceptions.ConnectionError:
            print('Connection failed for ' + link)
    article_list = []
    for request in target_article:
        article = {
            'Title': get_title_hp(request),
            'Authors': get_authors_hp(request),
            'Text': wordlist(get_content_hp(request)),
            'Date': get_date_hp(request),
            'Publication': 'Huffington Post',
        }
        article_list.append(article)
    return article_list
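# Matching sketch for hp(); unlike nyt(), each dict here carries a Date.
def demo_hp():
    for a in hp('debate night'):
        print(a['Date'] + ' | ' + a['Title'])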
def main():
    subject = input('What do you want to look up? ')
    links = []
    for url in search_news(subject + ' site:https://www.nytimes.com', stop=5):
        links.append(url)
    target_article = []
    for link in links:
        try:
            target_article.append(requests.get(link))
        except requests.exceptions.ConnectionError:
            print('Connection failed for ' + link)
    articles = []
    for request in target_article:
        articles.append({
            'Title': get_title_nyt(request),
            'Authors': get_authors_nyt(request),
            'Text': get_content_nyt(request),
            'Date': '',
            'Publication': 'New York Times',
        })
    # Dump the whole list at once so the file in which the articles are
    # stored holds a single valid JSON document; dumping each dict
    # separately would concatenate objects and break json.load.
    with open('webscrapertwo.json', 'w') as article_content:
        json.dump(articles, article_content, indent=4)
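# Sketch for reading the dump back; assumes main() has already run and
# written webscrapertwo.json as a single JSON list, as above.
def load_articles(path='webscrapertwo.json'):
    with open(path) as f:
        return json.load(f)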
def fetch_news(query):
    print('Getting result...')
    article_urls = list(search_news(query, num=4, stop=4))
    print('-> Got result!')
    if article_urls:
        # Return one of the results at random.
        return random.choice(article_urls)
    return 'Could not find a news article. Try typing a different query.'
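# fetch_news() returns either a URL or the fallback message, so a caller can
# only tell them apart by prefix; a small sketch of that check.
def demo_fetch_news():
    result = fetch_news('debate night')
    if result.startswith('http'):
        print('Read more at: ' + result)
    else:
        print(result)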
from urllib.request import urlopen

from bs4 import BeautifulSoup
from google import search_news

text_f = open('/home/suryansh/Desktop/debatenight', 'w')

# Specific results to skip (unrelated hits that kept appearing).
SKIP_URLS = {
    'http://www.hamhigh.co.uk/news/hampstead_schoolgirls_hold_yoga_day_for_indian_village_1_4763832',
    'https://www.rt.com/sport/365009-rio-worker-wages-threaten-suit/',
    'http://www.bbc.co.uk/news/uk-england-manchester-37631537',
    'http://www.bbc.co.uk/newsround/37675611',
}

for url in search_news('debate night', stop=30):
    # Skip BBC pages and the known-bad results above.
    if url.startswith('http://www.bbc.co.uk/') or url in SKIP_URLS:
        continue
    print('** ' + url + '\n')
    html = urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
    # Kill all script and style elements; rip them out.
    for script in soup(['script', 'style']):
        script.extract()
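    # The original snippet ends here; a minimal sketch of a plausible
    # continuation, assuming the goal was to write each page's visible
    # text to text_f (this completion is an assumption, not part of the
    # original):
    text = soup.get_text(separator='\n')
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    text_f.write('\n'.join(lines) + '\n\n')

text_f.close()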