from nytimesarticle import articleAPI
import time

def get_headlines_and_wordcount(date):
    '''
    Accepts a year in string format (e.g. '1980') and returns a tuple of
    (average word count, list of headlines) for that year.
    '''
    # please don't use my key :)
    api = articleAPI('fad6d61d6d69a16df4ef1e0f38ec9c00:10:73444277')
    headlines = []
    wordcounts = []
    for i in range(0, 100):
        articles = api.search(fq={'source': ['The New York Times']},
                              begin_date=date + '0101',
                              end_date=date + '1231',
                              sort='oldest',
                              page=str(i))
        headlines += parse_headlines(articles)
        wordcounts += parse_wordcount(articles)
        time.sleep(1)  # throttle to stay under the API rate limit (10 items per page)

    # find the average word count for this year
    num_wordcounts = len(wordcounts)
    total = 0
    for wc in wordcounts:
        if wc is not None:
            try:
                total += int(wc)
            except ValueError:
                # the original printed the bad value but still counted it,
                # skewing the average; exclude it from the denominator
                print(wc)
                num_wordcounts -= 1
        else:
            num_wordcounts -= 1
    avg_wordcount = total / num_wordcounts
    return (avg_wordcount, headlines)
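# NOTE: the function above calls parse_headlines and parse_wordcount, which
# are not shown in this snippet. A minimal sketch of what they might look
# like, assuming the standard Article Search v2 response shape
# (response.docs, with headline.main and word_count per doc):

def parse_headlines(articles):
    """Pull the main headline out of each doc in an Article Search response."""
    docs = articles.get('response', {}).get('docs', [])
    return [d['headline']['main'] for d in docs if d.get('headline')]

def parse_wordcount(articles):
    """Pull the word_count field out of each doc; entries may be None."""
    docs = articles.get('response', {}).get('docs', [])
    return [d.get('word_count') for d in docs]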
def api_query(topics):
    """Hits the NYT Article Search API once per topic and collects the results."""
    api = articleAPI(API_KEY)
    results = []
    for x in topics:
        # the original passed x[1] positionally and returned only the last
        # response; pass it as the query and accumulate instead
        results.append(api.search(q=x[1]))
        time.sleep(2)  # throttle between calls
    return results
def scrape_nyt_politics():
    api = articleAPI(nyt_key)
    bush_dict = api.search(q="Immigration", begin_date=20010120, end_date=20010430)
    obama_dict = api.search(q="Immigration", begin_date=20080120, end_date=20080429)
    trump_dict = api.search(q="Immigration", begin_date=20160120, end_date=20160429)

    # getting into docs
    bush = bush_dict["response"]["docs"][0]
    obama = obama_dict["response"]["docs"][0]
    trump = trump_dict["response"]["docs"][0]
    print("BUSH", bush, "OBAMA", obama, "TRUMP", trump)

    # write each first doc to its own JSON file; the original passed the doc
    # dicts themselves to open() and missed a close() call, so the output
    # filenames here are assumed
    for name, doc in (("bush", bush), ("obama", obama), ("trump", trump)):
        with open(name + ".json", "w") as f:
            f.write(json.dumps(doc))
def get_article(query):
    api = articleAPI("522e4e6f593d44baaf69a87cdff70548")
    today = date.today()
    prev_date = date(2014, 1, 1)
    results = api.search(q=query,
                         begin_date=prev_date.strftime('%Y%m%d'),
                         end_date=today.strftime('%Y%m%d'),
                         sort='newest',
                         fl=['web_url', 'snippet', 'headline', 'pub_date'])
    articles = results['response']['docs'][0:3]
    adapter = []
    for article in articles:
        # the original looped over article.items() only to assign the same
        # keys repeatedly; one mapping per article is equivalent
        adapter.append({
            'Source': 'Times',
            'URL': article['web_url'],
            'Title': article['headline']['main'],
            'Summary': article['snippet'],
            'Published on': article['pub_date'],
        })
    return adapter
def newyorktimes(search):
    # needs to be recoded, doesn't work
    NYTapi = articleAPI('New York Times API CODE GOES HERE')
    client = nyt.Client(NYTapi)
    res = client.query(search, sentences=4)
    summary = next(res.results).text
    print(summary)
def article_inform(keywords, date_series, file_name='data.txt', api_key=_api):
    """
    :param keywords: list of keywords
    :param date_series: list of dates
    :param file_name: output file name
    :return: list of article records from The New York Times
    """
    content = []
    # my personal API key; please get your own through the website below.
    # For usage: http://developer.nytimes.com/article_search_v2.json#/Documentation/GET/articlesearch.json
    api = articleAPI(api_key)
    # Download the articles for each keyword over each date interval;
    # each response is a dict parsed from JSON
    for keyword in keywords:
        for i in range(0, len(date_series) - 1):
            try:
                # the Article Search API expects begin_date, not start_date
                articles = api.search(q=keyword,
                                      begin_date=date_series[i],
                                      end_date=date_series[i + 1])
                time.sleep(1)
                for text in articles['response']['docs']:
                    content.append(text)
            except Exception:
                continue
    # save file
    with open(file_name, 'w') as f:
        for item in content:
            f.write("{}\n".format(item))
    return content
def retrieveArticlesByNewsDesk(query, newsDesk, begin, end, maxRequests=10):
    t = time()
    articlesToKeep = []
    api = articleAPI('2aeeb020218af0f30993f12ff451d821:10:62878084')
    page = 0
    while True:
        try:
            responseObject = api.search(
                q=query,
                fq={'source': ['Reuters', 'AP', 'The New York Times'],
                    'news_desk': [newsDesk]},
                begin_date=begin,
                end_date=end,
                page=page)
        except Exception:
            print("search failed")
            print(query, newsDesk, begin, end, maxRequests)
            return (articlesToKeep, page + 1)
        if 'response' not in responseObject:
            print("No response")
            print(query, newsDesk, begin, end, maxRequests)
            return (articlesToKeep, page + 1)
        articles = responseObject['response']['docs']
        for article in articles:
            if worthKeeping(article):
                condensed = condenseArticle(article)
                articlesToKeep.append(condensed)
        if len(articles) < 10 or page == maxRequests - 1:
            break
        page += 1
    print("")
    print("Retrieved " + str(len(articlesToKeep)) + " " + newsDesk + " articles.")
    print("Made " + str(page + 1) + " requests.")
    print(time() - t)
    return (articlesToKeep, page + 1)
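# NOTE: retrieveArticlesByNewsDesk relies on worthKeeping and condenseArticle,
# defined elsewhere in that project. A hedged sketch of plausible
# implementations, assuming docs follow the Article Search v2 shape; the
# filtering criteria here are illustrative only:

def worthKeeping(article):
    """Illustrative filter: keep full articles that have a headline and body text."""
    return (article.get('document_type') == 'article'
            and article.get('headline', {}).get('main')
            and article.get('lead_paragraph'))

def condenseArticle(article):
    """Keep only the fields a caller is likely to use downstream."""
    return {
        'headline': article['headline']['main'],
        'pub_date': article['pub_date'],
        'url': article['web_url'],
        'snippet': article.get('snippet'),
    }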
def connect(self) -> None:
    """Creates a connection to NYT."""
    if self.keys is None:
        self.keys = self._get_api_keys(self.path)
    # be sure to generate from list
    self.connection = articleAPI(self.keys[0])
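# NOTE: connect() assumes a _get_api_keys helper not shown here. A minimal
# sketch, assuming one key per line in a text file at self.path:

def _get_api_keys(self, path: str) -> list:
    """Read API keys from a text file, one key per line."""
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]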
def nyDataFunc():
    api = articleAPI("wL9jacwKcc7zrn4UrgRtD59ikr8cHe5s")
    with open('nyTimes_Rugby_Data1.csv', 'a', newline='') as newFile:
        newFileWriter = csv.writer(newFile)
        newFileWriter.writerow(['nyTimes_Rugby_Data'])
        for i in range(10):
            # query fixed from the original's 'Rubgy' typo
            articles = api.search(q='Rugby', begin_date=20190101, page=i)
            data = articles['response']['docs']
            for doc in data:
                newFileWriter.writerow([doc['web_url']])
def getArticles(stocks):
    api = articleAPI(private.NYT)
    articles = {}
    for key, value in stocks.items():
        stockRequest = api.search(fq={'headline': value}, begin_date=20150905)
        try:
            doc = stockRequest['response']['docs'][0]
            headline = doc['headline']['main']
            url = doc['web_url']
            abstract = doc['abstract'] if doc['abstract'] is not None else "None"
        except Exception:
            # no result for this stock; skip it instead of reusing the
            # previous iteration's (possibly unbound) values
            continue
        articles[key] = {'headline': headline, 'url': url, 'abstract': abstract}
    return articles
def get_articles(self, date, news_desk, key):
    api = articleAPI(key)
    start_months = ["0101", "0201", "0301", "0401", "0501", "0601",
                    "0701", "0801", "0901", "1001", "1101", "1201"]
    # month-end days corrected (March, May and October have 31 days;
    # February 29 in leap years is still not handled)
    end_months = ["0131", "0228", "0331", "0430", "0531", "0630",
                  "0731", "0831", "0930", "1031", "1130", "1231"]
    for j in range(len(start_months)):
        for i in range(0, 200):
            try:
                # news_desk is a filter-query field, not a top-level
                # parameter, so it belongs inside fq
                articles = api.search(fq={'source': ['The New York Times'],
                                          'news_desk': [news_desk]},
                                      begin_date=date + start_months[j],
                                      end_date=date + end_months[j],
                                      sort='oldest',
                                      page=str(i))
                self.write_to_file(articles)
                if self.temp > 20:
                    break
            except Exception as e:
                print('Error while extracting from API: ' + str(e))
                continue
def nytimes_api():
    """Return a handle to the New York Times article API.

    Load an API key from a local file not included in the git repository,
    then use it to acquire a handle to the New York Times article API.

    Returns:
        api (nytimesarticle.articleAPI): A New York Times article API handle.
    """
    # File containing the New York Times API key
    key_file = 'api-keys/nytimes.txt'
    # Read in the key string, removing the trailing newline
    with open(key_file) as f:
        key = f.read()[:-1]
    return nyt.articleAPI(key)
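# NOTE: for context, a typical call site for nytimes_api() might look like
# the following; the query shown is illustrative, not from the source:

api = nytimes_api()
articles = api.search(q='climate change', begin_date=20150101)
print(articles['response']['meta']['hits'])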
def main():
    txt = open("keywords.txt", "w")
    api = articleAPI('45862958eff543bb9555201274493184')
    sections = which_sections()
    article_list = get_articles(api, sections)
    proper_list = get_proper_nouns(article_list)
    while True:
        pref = input("Which articles sound interesting? ")
        if pref == '-':
            break
        pref_num = int(pref) - 1
        for i in proper_list[pref_num]:
            txt.write(i + '\n')
    txt.close()
def generate_url_file(start_page_number=0, to_extract=400):
    urls_file_path = get_file_path('data/urls.txt')
    if os.path.exists(urls_file_path):
        print(urls_file_path + ' already exists. Not overwriting [INFO]')
        return
    # Equivalent raw request:
    # https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=ffd404f8bd874e2b881367f933dd423b
    #   &fq=news_desk:("Politics") AND type_of_material:("News")&page=0&fl=web_url
    ny_api = articleAPI(API_KEY)
    filters = {
        'news_desk': 'Politics',
        'type_of_material': 'News',
    }
    with open(urls_file_path, 'w') as to_write_file:
        count = 0
        page_number = start_page_number
        while count < to_extract:
            previous_count = count
            json_content = ny_api.search(fq=filters, page=page_number)
            num_hits = json_content[u'response'][u'meta'][u'hits']
            print('Number of hits is: ' + str(num_hits) +
                  '. Current count is: ' + str(count))
            if num_hits < to_extract:
                print('Number of hits is less than to extract. Resizing. [WARNING]!!')
                to_extract = num_hits
            results = json_content[u'response'][u'docs']
            for result in results:
                to_write_file.write(result[u'web_url'])
                to_write_file.write('\n')
                count = count + 1
            page_number += 1
            time.sleep(1)
            if previous_count == count:
                print('no more result')
                break
def __init__(self, key='', cache=None):
    '''
    Constructor: set up the API key here and reference the data store
    as necessary.
    key - the API key used to access the NY Times article API
    cache - instance of the Cache class to use
    '''
    # need a key
    if not key:
        raise Exception('No key passed in, please provide one')
    self.__nytimes_api = articleAPI(key)
    # need the cache
    if not cache:
        raise Exception('Missing cache, cannot continue')
    self.__cache = cache
def GetArchivesNYT(self, company, entities):
    api = articleAPI(constants.NYT_KEY)
    articles = api.search(
        q=entities,
        fq={'headline': company,
            'source': ['Reuters', 'AP', 'The New York Times',
                       'RETRO REPORT', 'Technology', ' Amazon - Technology']},
        begin_date=20160901)
    news = []
    try:
        for i in articles['response']['docs']:
            dic = {}
            dic['date'] = i['pub_date'][0:10]  # cutting time of day
            dic['url'] = i['web_url']
            if dic['date'] is not None:
                news.append(dic)
    except Exception:
        print('Could not retrieve articles for', company)
    return news
def main():
    if os.getenv('NYTIMES_KEY') is None:
        print("Usage: \nSet 1 environment variable for NY Times authentication: ")
        print("export NYTIMES_KEY=\"your key\"")
        sys.exit()

    # auth to NY Times
    api = articleAPI(os.environ['NYTIMES_KEY'])

    # parse and print headlines
    sys.stdout.flush()
    headlines = parse_headlines(keyword, api)
    print_headlines(headlines)

    while True:
        for i in range(10, 0, -1):
            sys.stdout.flush()
            sys.stdout.write('Refresh in %d\r' % i)
            time.sleep(1)
        sys.stdout.flush()
        headlines = parse_headlines(keyword, api)
        print_headlines(headlines)
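# NOTE: main() depends on parse_headlines and print_headlines, which are not
# shown, and assumes a module-level `keyword` query string. One plausible
# shape for those helpers, assuming the standard response layout:

def parse_headlines(keyword, api):
    """Search for keyword and return the main headlines of the newest results."""
    articles = api.search(q=keyword, sort='newest')
    return [doc['headline']['main'] for doc in articles['response']['docs']]

def print_headlines(headlines):
    for h in headlines:
        print('* ' + h)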
class NytimesapiSpider(scrapy.Spider):
    name = "nytimes"
    allowed_domains = ["nytimes.com"]
    start_urls = ['http://nytimes.com/']

    # note: this block runs at class-definition (import) time
    api = articleAPI("038ee5e586674d6aa8c9102e6177a6ca")
    articles = api.search(q='Trump',
                          fq={'headline': 'Trump',
                              'source': ['Reuters', 'AP', 'The New York Times']},
                          begin_date=20170101,
                          facet_field=['source', 'day_of_week'],
                          facet_filter=True)
    article_list = []
    article_dict = {}
    for key, value in articles.items():
        if key == 'response':  # the original's 'is' compared identity, not equality
            article_dict[key] = value
            article_list.append(article_dict)

    def parse(self, response):
        print(response)
def _get_articles(keyword, from_date, end_date, page_number, article_type):
    api = articleAPI(nytimes_api_key)
    if article_type == 'blog':
        query_filter = {
            'section_name.contains': ['World', 'U.S.', 'Opinion'],
            'type_of_material.contains': ['Blog']
        }
    else:
        query_filter = {
            'section_name.contains': ['World', 'U.S.'],
            'type_of_material.contains': ['News', 'Brief']
        }
    articles = api.search(q=keyword,
                          fq=query_filter,
                          begin_date=from_date,
                          end_date=end_date,
                          page=page_number)
    return articles
def get_NYT(start, end):
    with open('nyt_keys.json') as key:
        nyt_creds = json.load(key)
    api = nytimesarticle.articleAPI(nyt_creds['NYT_ARTICLE'])
    start_dt = pd.to_datetime(start).date()
    end_dt = pd.to_datetime(end).date()
    num_days = (end_dt - start_dt).days
    articles = []
    for i in range(num_days + 1):
        dt = start_dt + datetime.timedelta(i)
        dt_start = dt.strftime('%Y%m%d')
        result = api.search(
            q='brexit',
            fq={'source': ['Reuters', 'AP', 'The New York Times'],
                'news_desk': ["Foreign", "Business", "Financial",
                              "Market Place", "World"],
                'document_type': "article"},
            begin_date=dt_start,
            end_date=dt_start,  # one-day window
            sort="newest",
            fl="web_url,snippet,lead_paragraph,abstract,source,headline,"
               "keywords,pub_date,document_type,news_desk,type_of_material",
            facet_field="source,section_name,document_type")
        for doc in result.get('response', {}).get('docs', []):
            articles.append(doc)
    dates = []
    corpus = []
    for art in articles:
        r = requests.get(art['web_url'])
        tree = html.fromstring(r.content)
        contents = tree.xpath(
            '//p[@class="story-body-text story-content"]/text()')
        body = []
        for t in contents:
            # the original's `t.strip <> ''` never called strip() and used
            # the removed <> operator
            if t.strip() != '':
                body.append(re.sub(r'[^\w]', ' ', t))
        corpus.append(''.join(body))
        dates.append(art['pub_date'][0:10])
    return dates, corpus
def get_articles(date, query):
    '''
    This function accepts a year in string format (e.g. '1980') and a query
    (e.g. 'Amnesty International') and returns a list of parsed articles
    (as dictionaries) for that year.
    '''
    all_articles = []
    api = articleAPI(api_key)
    # NYT limits the pager to the first 100 pages. But rarely will you find
    # over 100 pages of results anyway.
    for i in range(0, 100):
        articles = api.search(q=query,
                              fq={'source': ['Reuters', 'AP', 'The New York Times']},
                              begin_date=date + '0101',
                              end_date=date + '1231',
                              sort='oldest',
                              page=str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
    return all_articles
def searchTopic(topic):
    # read the API key out of the 'secret' file
    with open('secret', 'r') as f:
        for line in f:
            if line.find('NYTIMES_ARTICLE') != -1:
                a, key = line.split('=')
                key = key.replace('\n', '')
    api = articleAPI(key)
    headline = topic
    sources = ['Reuters', 'AP', 'The New York Times']
    beg_date = 20140101
    pages = 4
    articles_content = []
    for page in range(pages):
        articles = api.search(q=topic,
                              fq={'headline': headline, 'source': sources},
                              fl=['web_url'],
                              begin_date=beg_date,
                              page=str(page + 1))
        for url in articles['response']['docs']:
            if url['web_url'].find('video') == -1:
                # the original reused the name 'page' here, clobbering the
                # loop variable; renamed to resp
                resp = requests.get(url['web_url'])
                doc = lh.fromstring(resp.content)
                text = doc.xpath('//p[@itemprop="articleBody"]')
                article = str()
                for par in text:
                    article += par.text_content()
                articles_content.append(article)
    for count, art in enumerate(articles_content):
        regex = re.compile('[^a-zA-Z]')
        articles_content[count] = regex.sub(' ', art)
    # with open('%s data.csv' % topic, 'wb') as output_file:
    #     wr = csv.writer(output_file, quoting=csv.QUOTE_ALL)
    #     wr.writerow(articles_content)
    print(articles_content)
def main():
    client = MongoClient('localhost', 27017)
    politician_db = client['politician_db']
    api = articleAPI('API KEY')
    df = pd.read_pickle('pol_df.pkl')
    already_loaded = pickle.load(open('pols_in_mongo.pkl', 'rb'))
    pol_list = df.Name
    for politician in pol_list:
        time.sleep(1)
        df_entry = df[df.Name == politician]
        # create collection for politician
        politician_col = politician_db[df_entry['collectionname'][df_entry.index[0]]]
        if df_entry['collectionname'][df_entry.index[0]] in already_loaded:
            print(politician, 'is already in the database')
            continue
        # set up query dates
        beg_d = df_entry['Entered Office'][df_entry.index[0]]
        end_d = df_entry['Exited Office'][df_entry.index[0]]
        try:
            beg_d_in = date_query(beg_d)
        except Exception:
            print(politician, end=' ')
            print('Failed at the beginning date step')
        if type(end_d) == str:
            # still in office: query from entry date onward
            try:
                done = False
                page = 1
                while not done:
                    time.sleep(1)
                    search = api.search(q=politician, begin_date=beg_d_in, page=page)
                    politician_col.insert_many(search['response']['docs'])
                    print(politician, page)
                    page += 1
                    if len(search['response']['docs']) % 10 != 0 or len(search['response']['docs']) == 0:
                        done = True
                print(f'Successful query for {politician};', end=' ')
            except Exception:
                print(politician, end=' ')
                print('failed their attempt to get into our database :(')
        else:
            try:
                end_d_in = date_query(end_d)
                done = False
                page = 1
                while not done:
                    time.sleep(1)
                    search = api.search(q=politician, begin_date=beg_d_in,
                                        end_date=end_d_in, page=page)
                    politician_col.insert_many(search['response']['docs'])
                    print(politician, page)
                    page += 1
                    if len(search['response']['docs']) % 10 != 0 or len(search['response']['docs']) == 0:
                        done = True
                print(f'Successful query for {politician};')
            except Exception:
                print(politician, end=' ')
                print('failed their attempt to get into our database :(')
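# NOTE: the loop above converts office dates with a date_query helper defined
# elsewhere in that project. A minimal sketch, assuming it turns an arbitrary
# date string into the YYYYMMDD integer the Article Search API expects:

from dateutil.parser import parse as parse_date

def date_query(d):
    """Convert a date string (or datetime) into a YYYYMMDD int."""
    if isinstance(d, str):
        d = parse_date(d)
    return int(d.strftime('%Y%m%d'))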
from nytimesarticle import articleAPI
from bs4 import BeautifulSoup
import requests

api = articleAPI("458e6335355c4f2fb83770f41baa4309")
f = open('nytArticles.txt', 'w')
links = []
try:
    for a in range(0, 20):
        articles = api.search(q="shooting", begin_date=20180406,
                              end_date=20180407, page=a)
        for i in range(0, len(articles['response']['docs'])):
            url = articles['response']['docs'][i]['web_url']
            data = requests.get(url)
            soup = BeautifulSoup(data.content, 'html.parser')
            # write every paragraph except the trailing boilerplate ones
            for j in range(len(soup.find_all('p')) - 3):
                f.write(soup.find_all('p')[j].get_text())
            links.append(url)
    f.close()
except:
    print("We got only", len(links), "articles for selected period")
import csv
from nytimesarticle import articleAPI
import time

api = articleAPI('ec4da91764da4217983fadb8c85f1dca')

all_articles = []
# NYT limits the pager to the first 100 pages. But rarely will you find over
# 100 pages of results anyway.
for j in range(0, 101):
    # API allows roughly one call every 5 seconds; make it 6 so it doesn't
    # get angry
    time.sleep(6)
    articles = api.search(q="Trump",
                          fq={'headline': 'Trump'},
                          begin_date='20160707',
                          end_date='20161107',
                          fl='headline,pub_date,word_count',
                          sort='newest',
                          page=str(j))
    print(j)
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['date'] = i['pub_date'][0:10]  # cutting time of day
        dic['word_count'] = i['word_count']
        news.append(dic)
    # accumulate; the original overwrote `articles` each page and never
    # used all_articles
    all_articles += news
from nytimesarticle import articleAPI
import json
from secret import APIKey

api = articleAPI(APIKey)
# append each page of results as JSON text; the original's 'ab' (binary
# append) mode would require bytes, not the str that json.dumps returns
testfile = open("foo.txt", "a")
for i in range(1, 101):
    articles = api.search(q='ebola', page=i)
    testfile.write(json.dumps(articles))
testfile.close()
from nytimesarticle import articleAPI
import requests
from bs4 import BeautifulSoup
import csv
import urllib

APIKey = "5GDjPzkz28HHyX3LuEoWeAMiBUZGqnHw"
api = articleAPI(APIKey)

def read():
    # Read csv and return list of urls
    url = []
    try:
        with open("url.csv", mode="r") as csv_file:
            readCSV = csv.reader(csv_file, delimiter=',')
            header = True
            for row in readCSV:
                if header:
                    header = False
                    continue
                try:
                    url.append(row[0])
                except IndexError:
                    break
    except FileNotFoundError:
        # create an empty url.csv, then re-read it (returns an empty list)
        with open("url.csv", mode='w') as csv_file:
            writer = csv.writer(csv_file, delimiter=',')
        url = read()
    return url
""" NYT Download articles """ from nytimesarticle import articleAPI import pandas as pd import time import re api = articleAPI("<PASTE_YOUR_API_KEY>") df = pd.DataFrame() for i in range(1, 10): articles = api.search(q="Artificial Intelligence", begin_date=int("20180" + str(i) + "01"), end_date=int("20180" + str(i) + "28")) df = df.append(pd.DataFrame.from_dict(articles)) time.sleep(60) for i in range(10, 13): articles = api.search(q="Artificial Intelligence", begin_date=int("2018" + str(i) + "01"), end_date=int("2018" + str(i) + "28")) df = df.append(pd.DataFrame.from_dict(articles)) time.sleep(60) for i in range(1, 4): articles = api.search(q="Artificial Intelligence", begin_date=int("20190" + str(i) + "01"), end_date=int("20190" + str(i) + "28"))
def __init__(self):
    self.api = articleAPI("9ed601a914bd7b99e85df32fcb1b1a8b:9:75048351")
    self.news = []
# nytimes scraping
from nytimesarticle import articleAPI
import datetime
import json
import pandas as pd
import time

api = articleAPI('47948d7109eda66d76ccb67753997d53:15:28718339')

page_start = 24
page_end = 30
pages = range(page_start, page_end)
query_set = 'David Brooks'

mainDF = pd.DataFrame(columns=['headline', 'kicker', 'pub_date', 'page'])
for page in pages:
    articles = api.search(q=query_set,
                          fq={'kicker': query_set},
                          fl=['headline', 'byline', 'pub_date'],
                          page=page)
    article_info = []
    for article in articles['response']['docs']:
        article_info.append({'kicker': article['headline']['kicker'],
                             'headline': article['headline']['main'],
                             'pub_date': article['pub_date'],
                             'page': page})
        # locations
        locations = []
        for x in range(0, len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        # subject
        subjects = []
        for x in range(0, len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects
        news.append(dic)
    return news

api = articleAPI('fde3a426c74d6a1591bcdf1ffe1847d3:7:75025733')

# Function to get the data
def get_articles(date, query):
    '''
    This function accepts a year in string format (e.g. '1980') and a query
    (e.g. 'Amnesty International') and returns a list of parsed articles
    (as dictionaries) for that year.
    '''
    all_articles = []
    # NYT limits the pager to the first 100 pages. But rarely will you find
    # over 100 pages of results anyway.
    for i in range(0, 100):
        articles = api.search(q=query,
                              fq={'source': ['Reuters', 'AP', 'The New York Times']},
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from database_setup_newspaper import Base, Subject, Article  # import classes from databaseSetup file

engine = create_engine('sqlite:///newspaper.db')  # lets program know which db engine to connect with
Base.metadata.bind = engine  # makes connections between classes and corresponding tables in db
DBSession = sessionmaker(bind=engine)  # creates a link between our code and the engine we created
session = DBSession()

##################### WEB SCRAPING COMPONENT ############################
# Info from http://dlab.berkeley.edu/blog/scraping-new-york-times-articles-python-tutorial
from nytimesarticle import articleAPI

api = articleAPI('dad4ec569b4924e931a381a9d9d1edb5:16:75117993')

obamaArticles = api.search(q='Obama',
                           fq={'headline': 'Obama',
                               'source': ['The New York Times']},
                           begin_date=20141231)
clintonArticles = api.search(q='Hillary Clinton',
                             fq={'headline': 'Hillary Clinton',
                                 'source': ['The New York Times']},
                             begin_date=20141231)

def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses the articles
    into a list of dictionaries
    '''
    news = []
#! python3
# Scraping NYT articles for a specific vocabulary word, returning the full
# sentence if found.

import json
from nytimesarticle import articleAPI

api = articleAPI('417fe849ada646a28b95d6185b70777c')

articles = api.search(q='precocious', begin_date='20050101')
url = 'https://api.nytimes.com/svc/search/v2/'

# TODO
# Get more articles from earlier dates; make sure to get only articles, not movies
# Option for user to get another sentence
# How to search for a certain word (re?)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

def create_save_dir(file_type, section):
    root_dirpath = os.environ["data_dir"] + "nyt_corpus/"
    # str.join takes a single iterable; os.path.join is what was intended
    save_dirpath = os.path.join(root_dirpath, file_type, section)
    mkdir_p(save_dirpath)
    return save_dirpath

if __name__ == "__main__":
    # export the NYT api key as the environment variable nyt_api_key
    api = articleAPI(os.environ["nyt_api_key"])
    nytd_section = ['Arts', 'Business', 'Obituaries', 'Sports', 'World']
    for section in nytd_section:
        num_pages = 101  # nyt max paginate limit = 101
        for i in range(0, num_pages):
            print("scraping {0} section, page {1}/{2}".format(section, i + 1, num_pages))
            articles = api.search(sort='newest',
                                  fq={'source': ['The New York Times'],
                                      'document_type': ['article'],
                                      'section_name': [section]},
                                  page=i)
            news = parse_articles(articles)
            body_text = []
            for j in range(10):
                r = requests.get(news[j]['url'])
                data = r.text
                soup = BeautifulSoup(data)
                g_text = soup.find_all("p", {"class": ["story-body-text story-content",
                                                       "story-body-text"]})
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 3 20:05:40 2016

@author: katherinemckenna
"""
from nytimesarticle import articleAPI
import csv
import os

key_word = "Obamacare"
os.chdir('/Users/katherinemckenna/Documents/CSE6242/Project/Articles Pulling')
api = articleAPI('0358c9b864c6cad5603a6a32420c60be:7:74784865')

def parse_articles(articles):
    news = []
    for i in articles['response']['docs']:
        dic = {}
        # dic['id'] = i['_id']
        dic['desk' + "_" + key_word] = i['news_desk']
        dic['date' + "_" + key_word] = i['pub_date'][0:10]  # cutting time of day
        dic['section' + "_" + key_word] = i['section_name']
        # dic['type'] = i['type_of_material']
        dic['url' + "_" + key_word] = i['web_url']
        # dic['word_count'] = i['word_count']
        news.append(dic)
    return news

def get_articles(date, query):
def __init__(self, key):
    self.api = articleAPI(key)
    self.secret_keyword = "climate change and "
    self.num_days_trending = 3
from nytimesarticle import articleAPI api = articleAPI("391f118fad27467d8b193a6686e1b3d1") articles = api.search(q='Trump', fq={ 'headline': 'Trump', 'source': ['The New York Times'] }, begin_date=20180618) print articles #prints the url to the trending article and the details of the whole article
'''
https://github.com/evansherlock/nytimesarticle/blob/master/README.txt

# Steps
1) Install nytimesarticle
   python -m easy_install nytimesarticle
   pip install nytimesarticle
2) Run
   python nyt_v1.py
'''

# Import package
from nytimesarticle import articleAPI

# Set API Key
api = articleAPI('your_api_key')

# Query 'budget deficits'
articles = api.search(q='federal budget deficits')
# articles = api.search(q='budget deficits', begin_date=20111231, page=3)
# articles = api.search(q='budget deficits', fq={'headline': 'Obama'})

# Total hits
response = articles['response']['meta']['hits']
print(response)

# Get publication date of the articles
response = articles['response']['docs']
for item in response:
    print(item['pub_date'])
from nytimesarticle import articleAPI
import datetime
import time

api = articleAPI('ae2b43b298972426c8e43a96afcb9aaa:0:70254909')

def clean_entry(art):
    clean = {}
    if art['abstract']:
        clean['snippet'] = art['abstract'].encode('ascii', 'ignore')
    else:
        clean['snippet'] = art['snippet'].encode('ascii', 'ignore')
    clean['url'] = art['web_url']
    clean['multimedia'] = "http://www.nytimes.com/" + str(art['multimedia'][1]['url'])
    clean['keywords'] = []
    for word in art['keywords']:
        clean['keywords'].append(word['value'].encode('ascii', 'ignore'))
    clean['headline'] = art['headline']['main'].encode('ascii', 'ignore')
    return clean

def get_articles(cb, so_far=[], topic=None, page=0, limit=5):
    def respond_get_articles(payload):
        if not payload:
            cb(None)
            return  # the original fell through here after calling cb(None)
        good_articles = []
        for art in payload['response']['docs']:
            if (art['snippet'] or art['abstract']) and art['multimedia'] \
                    and art['type_of_material'] in ["News", "Editorial"]:
                good_articles.append(clean_entry(art))
        good_articles += so_far
        if len(good_articles) >= limit:
from nytimesarticle import articleAPI
import collections
import math
import datetime
import re
import pickle

# api keys
prasmuss = '7b4597b0dc6845688a8f90c00f3e60b6'
peter_gray_rasmussen = '67391c8a5c6c2d8926eb3d9c5d136c59:7:72273330'
proton = 'f8c34c7cda7848f997a9c273815d28a9'
api = articleAPI(proton)

def convert(data):
    '''
    Recursively encode a dictionary of unicode entries as utf8 strings; from
    http://stackoverflow.com/questions/1254454/fastest-way-to-convert-a-dicts-keys-values-from-unicode-to-str
    '''
    if isinstance(data, basestring):
        return str(data)
    elif isinstance(data, collections.Mapping):
        return dict(map(convert, data.iteritems()))
    elif isinstance(data, collections.Iterable):
        return type(data)(map(convert, data))
    else:
        return data

def get_nyt_article_stats(articles_and_meta):
    '''
import requests, json
from bs4 import BeautifulSoup as bs
from nytimesarticle import articleAPI
import random, time

#api = articleAPI("1365a8efa2ee4cd9bb685612a03904b1")
#api = articleAPI("aeff1d6a957f40c19bbda08b9714c4a4")
#api = articleAPI("336886d72ea14a71a9a5c34800f9a761")
api = articleAPI("8b80d00bbc10401aa74b05e4be7fd065")
#api = articleAPI("4009c64cd2cf4a6ea25592535e9c83e3")
#api = articleAPI("a91daaccbde54672921fcd3742a1a85d")
#api = articleAPI("8b3df0ca99ed46d6bbf103c934c94ce9")
#api = articleAPI("c159e3dfabdc45c993619e1bb6425002")
#api = articleAPI("98ebc312f98d4257b1c27445152e6435")
#api = articleAPI("9555d8d40c0044659546fdeeda4d290e")
#api = articleAPI("3f94c1e436964be6909319b6d0989ec9")
#api = articleAPI("7b3f3b398aec414aae09501f8c430155")
#api = articleAPI("c1083f7480f941159de8f70d60975715")
#api = articleAPI("bf53fb1b8c144020b41bbdf99b676157")
#api = articleAPI("ade519c504884db097f6ce8fe9aecc2b")
#api = articleAPI("4e3304e9ba354531ba8f35935a21188b")

month = 9
outfile = open("NYTstories_" + str(month) + ".dat", "a")

## Algorithm
# Randomly sample 1000 articles per month from the monthly data
import json
import retinasdk
import urllib
from nytimesarticle import articleAPI
from flask import Flask, request, jsonify
import giphypop
import requests

g = giphypop.Giphy()
api = articleAPI('499fdeaf3f57e86e6e07db5f763ed2b1:4:74404554')
liteClient = retinasdk.LiteClient("b24cfff0-d29c-11e5-8378-4dad29be0fab")
q = []
app = Flask(__name__)

def getMultimedia(mm):
    for m in mm:
        if 'thumbnail' in m['legacy']:
            return 'http://www.nytimes.com/' + m['legacy']['thumbnail']

def getEntity(text):
    url = ("http://gateway-a.watsonplatform.net/calls/text/TextGetRankedNamedEntities"
           "?apikey=bab3a3fa37b3724f29c56caee74e314b585d993d&outputMode=json&text=") + text
    u = urllib.urlopen(url)  # u is a file-like object
    data = u.read()
    resp = json.loads(data)
    for entity in resp['entities']:
        if entity['type'] in ['Company', 'Facility', 'GeographicFeature'
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk, tree
from nltk.tag.stanford import StanfordNERTagger
from nytimesarticle import articleAPI
from pprint import pprint
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
plt.rcdefaults()
import matplotlib as mpl
import numpy as np
import matplotlib.lines as mlines
import sys
import operator
import re  # needed for the compiled patterns below

api_key = "01e07bd2b8034b81bc9bef8d3e35df3a"
api_interface = articleAPI(api_key)

warc_type_regex = re.compile(r'((WARC-Type:).*)')
warc_record_id_regex = re.compile(r'((WARC-Record-ID:) <.*>)')
html_regex = re.compile(r'<html\s*(((?!<html|<\/html>).)+)\s*<\/html>', re.DOTALL)

def visualize(main_entity_name, article_name, article_entities, output_name, bar_amount):
    # Initialize values
    dictionary = dict()
    entities = list()
    values = list()
    for element in article_entities:
        if element == []:
# final project
# course: computation for public policy
# project topic: from a western perspective: China and the Environment

# import the NYT articles API
from nytimesarticle import articleAPI

api = articleAPI('22bcd777b40f8d77e6ccf6469e7f9f16:11:67754634')

# test api searches
# put the more important keyword in q; still need to check how the word
# 'environment' is actually used in the results
search = api.search(q='environment',
                    fq={'headline': 'China',
                        # 'subline': 'environment',
                        'body': ['China'],
                        'source': ['Reuters', 'AP', 'The New York Times']},
                    sort='oldest',
                    begin_date=19900101,
                    end_date=20151231)

# defining a function to parse the result returned from the API
def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses the articles
    into a list of dictionaries
    '''
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
# coding: utf-8

# In[1]:
from nytimesarticle import articleAPI
from bs4 import BeautifulSoup
import requests
import re
import io

# In[2]:
api = articleAPI("7b9a5b632d244ba281e306111824d31a")

# In[26]:
search_keyword = 'stormy daniels trump'
articles = api.search(q=search_keyword, begin_date="20161001")

# In[27]:
for doc in articles['response']['docs']:
    arti_url = doc['web_url']
    print(arti_url)
from nytimesarticle import articleAPI
import time

API_KEY = 'd0ca6e84f73f40a2b9cc4e7b24dc5674'
api = articleAPI(API_KEY)

articles = api.search(q='Trump',
                      fq={'headline': 'Trump',
                          'source': ['Reuters', 'AP', 'The New York Times']})

def parse_articles(articles):
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
        if i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode('utf8')
        dic['headline'] = i['headline']['main'].encode('utf8')
        dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10]  # cutting time of day
        dic['section'] = i['section_name']
        if i['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode('utf8')
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = i['word_count']
        # locations
        locations = []
        for x in range(0, len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
# -*- coding: utf-8 -*-
import os
import csv
import time
from nytimesarticle import articleAPI

api = articleAPI('pE4obr7qszOl3BziHDhS47RQv8ssf7pC')

# use os.path.join rather than a literal backslash path
sp_dir = os.path.join('articles_scrapped', 'sp_articles')
if not os.path.exists(sp_dir):
    os.makedirs(sp_dir)

def parse_articles(articles):
    news = []
    if (articles.get("response") is not None) and \
            (articles.get("response").get("docs") is not None):
        for i in articles['response']['docs']:
            dic = {}
            dic['id'] = i['_id']
            if i['abstract'] is not None:
                dic['abstract'] = i['abstract']
            dic['headline'] = i['headline']['main']
            dic['desk'] = i['news_desk']
            dic['date'] = i['pub_date'][0:10]  # cutting time of day
            dic['section'] = i['section_name']
            if i['snippet'] is not None:
                dic['snippet'] = i['snippet']
            if i['lead_paragraph'] is not None:
                dic['lead_paragraph'] = i['lead_paragraph']
            dic['source'] = i['source']
import json
from nytimesarticle import articleAPI

with open('api_keys.json') as f:
    api_key = json.load(f)['nyt_api_key']
api = articleAPI(api_key)

date_list = [('20150101', '20150131'), ('20150201', '20150228'),
             ('20150301', '20150331'), ('20150401', '20150430'),
             ('20150501', '20150531'), ('20150601', '20150630'),
             ('20150701', '20150731'), ('20150801', '20150831')]

articles_out = []
for date_tuple in date_list:
    for i in range(0, 100):
        articles = api.search(q='election',
                              fq={'source': ['The New York Times']},
                              begin_date=date_tuple[0],
                              end_date=date_tuple[1],
                              page=str(i))
        articles_out += articles['response']['docs']

# json.dump writes text, so open in text mode rather than 'wb'
with open('nyt_results_election_2015.txt', 'w') as f:
    json.dump(articles_out, f)
#### total_articles.py written by Hieronimus Loho and Adam Chekroud
'''
This script pulls the total number of articles in the NYT per quarter per
year, just as a reference point. It is very similar to the benchmarker.py
script, just without the rotating keys and without any search terms.
'''
from nytimesarticle import articleAPI
import csv
import time
import os

# Insert API key
api_file = open(
    "/Users/hieronimusloho/Box Sync/Research Stuff/NYTLocal/NYT_keys.txt", "r")
api_list = api_file.read().split('\n')
api = articleAPI(api_list[6])

mental_health = []

# Start and end year
begin = 1900
end = 2016

# Returns the hits response of the metadata
def parse_articles(articles, year, quarter):
    shell = []
    dic = {}
    dic['year'] = year
    dic['hits'] = articles['response']['meta']['hits']
    dic['quarter'] = quarter
    shell.append(dic)
    return shell
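# NOTE: the quarterly loop that feeds parse_articles is not shown in this
# snippet. A sketch of what it plausibly looks like, given begin/end above
# and the (year, hits, quarter) record the parser builds; the quarter
# boundaries and sleep interval are assumptions:

quarters = [('0101', '0331'), ('0401', '0630'), ('0701', '0930'), ('1001', '1231')]
for year in range(begin, end + 1):
    for q, (q_start, q_end) in enumerate(quarters, start=1):
        articles = api.search(begin_date=str(year) + q_start,
                              end_date=str(year) + q_end)
        mental_health += parse_articles(articles, year, q)
        time.sleep(6)  # stay under the rate limit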
import json
import urllib, urllib2
from nytimesarticle import articleAPI

api = articleAPI('00cc3abde644a36b3e10a27189ae1a45%3A11%3A70234304')

def run_query(search_terms):
    # Specify the base
    root_url = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
    source = 'Web'

    # Specify how many results we wish to be returned per page.
    # Offset specifies where in the results list to start from.
    # With results_per_page = 10 and offset = 11, this would start from page 2.
    results_per_page = 10
    offset = 0

    # Wrap quotes around the query terms and URL-encode them.
    query = "'{0}'".format(search_terms)
    query = urllib.quote(query)

    # Construct the latter part of our request's URL,
    # setting the response format and other properties.
    search_url = "{0}q={1}&api-key=00cc3abde644a36b3e10a27189ae1a45%3A11%3A70234304".format(
        root_url, query)

    # Leftover from the Bing-search tutorial this was adapted from:
    # the username MUST be a blank string, and put in your API key!
    username = ''
    bing_api_key = 'kdiorLyLAkjJpJOhWxPWI55D9fan9H1bHXn6kxWxlJw'
from nytimesarticle import articleAPI api = articleAPI("b42bf8a015d7be77596925a04577efa5:9:72455510") def parse_articles(articles): ''' This function takes in a response to the NYT api and parses the articles into a list of dictionaries ''' news = [] for i in articles['response']['docs']: dic = {} ## dic['id'] = i['_id'] if i['abstract'] is not None: dic['abstract'] = i['abstract'].encode("utf8") dic['headline'] = i['headline']['main'].encode("utf8") ## dic['desk'] = i['news_desk'] dic['date'] = i['pub_date'][0:10] # cutting time of day. dic['section'] = i['section_name'] if i['snippet'] is not None: dic['snippet'] = i['snippet'].encode("utf8") ## dic['source'] = i['source'] ## dic['type'] = i['type_of_material'] ## dic['url'] = i['web_url'] ## dic['word_count'] = i['word_count'] # locations ## locations = [] ## for x in range(0,len(i['keywords'])): ## if 'glocations' in i['keywords'][x]['name']: ## locations.append(i['keywords'][x]['value']) ## dic['locations'] = locations # subject subjects = []
import sys
import json
import requests
import time
import logging
import traceback
import pprint
from nytimesarticle import articleAPI
from pygeocoder import Geocoder

# System arguments ####################################################
date = sys.argv[1]
debugging = False

# Variables ###########################################################
api = articleAPI("af0ead0b339871714bd8718ac007283b:11:73169680")
submitted = 0
duplicates = 0
coordCount = 0
multimediaCount = 0

log = logging.getLogger('nyt_worker')
log.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s :: %(message)s')
fileHandler = logging.FileHandler('logs/nyt_log.log')
fileHandler.setLevel(logging.INFO)
fileHandler.setFormatter(formatter)
log.addHandler(fileHandler)
#!/usr/bin/env python3
import os
import json
import urllib.request
import urllib.response
import urllib.error
import random
from nytimesarticle import articleAPI

api = articleAPI("93fc659744f5238f6f95d464865562b8:16:74068039")

def parse_articles():
    '''
    This function takes in a response to the NYT api and parses the articles
    into a list of dictionaries
    '''
    news = {"children": []}
    path = "C:\\Users\\Bin\\Desktop\\capstone\\json_data\\"
    d = {}
    date = input("Enter a year: ")
    top_node = {"section_name": "Articles", "size": 500, "children": []}
    file_write = date + ".json"
    temp_file = open(os.path.join(path, file_write), 'w')
    for i in range(0, 5):
        articles = api.search(
import unicodecsv as csv
from nytimesarticle import articleAPI

# api = articleAPI('f507f70d25c67d529f25cbd715cc5de9:5:67901820')
api = articleAPI('6f3db26878b18ab77f5cb78e82dd4496:3:67969482')

# articles = api.search(q='Obama',
#                       fq={'headline': 'Obama',
#                           'source': ['Reuters', 'AP', 'The New York Times']},
#                       begin_date=20111231)

def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses the articles
    into a list of dictionaries
    '''
    news = []
    for i in articles['response']['docs']:
        dic = {}
        # dic['id'] = i['_id']
        # if i['abstract'] is not None:
        #     dic['abstract'] = i['abstract'].encode("utf8")
        # dic['headline'] = i['headline']['main'].encode("utf8")
        if type(i['headline']) is dict:
            dic['headline'] = i['headline']['main'].encode("utf8")
        # dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10]  # cutting time of day
        # dic['section'] = i['section_name']
        # if i['snippet'] is not None:
        #     dic['snippet'] = i['snippet'].encode("utf8")
        # dic['source'] = i['source']
        # dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
from nytimesarticle import articleAPI from bs4 import BeautifulSoup import requests api = articleAPI("MqGpINkbKmy15GOA36p7DYm09JTXSAaO") f = open('nytarticlesday.txt', 'w') links = [] articles = api.search(q='cold', begin_date=20190101, page=100) #f=open('sports.txt','w') for i in range(0, 10): url = articles['response']['docs'][i]['web_url'] data = requests.get(url) soup = BeautifulSoup(data.content, 'html.parser') soup.prettify() for i in range((len(soup.find_all('p'))) - 3): f.write(soup.find_all('p')[i].get_text()) print(url) links.append(url) f.close() import os path = os.path.join(os.path.expanduser('~'), 'Desktop', 'file.txt') print(path)
from nytimesarticle import articleAPI
from time import time
from datetime import datetime
import pickle

####################################
# WARNING SPAGHETTI CODE KEEP AWAY #
#                                  #
#   srsface don't touch anything   #
####################################

api = articleAPI('2aeeb020218af0f30993f12ff451d821:10:62878084')

'''
page = 0
responseObject = api.search(
    q='*',
    fq={'source': ['Reuters', 'AP', 'The New York Times'], 'news_desk': ['travel']},
    begin_date=20140409,
    page=page)
articles = responseObject['response']['docs']
for article in articles:
    print article['headline']['main']
    print article['news_desk']
'''

def retrieveAll():
    page = 0
    while True:
        responseObject = api.search(
            q='NHL',
            fq={'source': ['Reuters', 'AP', 'The New York Times'],
                'news_desk': ['sports']},
            begin_date=20111231,
            page=page)
        articles = responseObject['response']['docs']
        for article in articles:
            print(article['headline']['main'])
            print("NEWS DESK: ", article['news_desk'])
            print("ABSTRACT: ", article['abstract'])
            print("SNIPPET: ", article['snippet'])
            if article['snippet'] != article['lead_paragraph']:
search_term = cl.classify(search_tweet)
if search_term == "food":
    search_term = "hunger"
elif search_term == "water":
    search_term = "clean water"
elif search_term == "hygiene":
    search_term = "health"
elif search_term == "technology":
    search_term = "access to technology"

## GET NEWS ARTICLE
in_file = open("ny_times_key.txt")
key = in_file.read()
in_file.close()
api = articleAPI(key)

developingCountries = [
    "Mali", "Egypt", "China", "Cuba", "South Africa", "Algeria", "Angola",
    "Bolivia", "Brunei", "Burma", "Chile", "Congo", "Ecuador",
import pandas as pd  # needed for the DataFrame below
import matplotlib.pyplot as plt
from nytimesarticle import articleAPI
from datetime import datetime, timedelta
from dateutil.parser import parse
from pprint import pprint
from pyrb import arange, format_axes, largefonts, save_pngs, open_figure
from time import sleep
from ipdb import set_trace
from tqdm import tqdm

# api key
# api = articleAPI(r'5d33d48934ef42f0a7b715cbf7f1007f')
# api = articleAPI(r'81e82e91402945198c0d0216c1b5181d')
# api = articleAPI(r'0ecfeab1d0ef4c25b78ecddf0cf23bee')
api = articleAPI(r'89616f68abff4f1691131533addc64d3')

# define article search parameters
kws = {'q': 'enron'}
time0 = parse('1999 Jan 1')
time1 = parse('2010 Feb 1')

# initialize df, scan over weekly segments from time0 to time1
df = pd.DataFrame()
time = arange(time0, time1, timedelta(days=7))
page_limit = 200
for a, b in tqdm(zip(time[:-1], time[1:]), total=time.size - 1, desc='running api'):
    # scan pages until no results returned
    page = 0
    while True: