from nytimesarticle import articleAPI
import time

def get_headlines_and_wordcount(date):
    '''
    Accepts a year in string format (e.g. '1980') and returns a tuple of
    (average word count, list of headlines) for that year.
    '''
    # please don't use my key :)
    api = articleAPI('fad6d61d6d69a16df4ef1e0f38ec9c00:10:73444277')
    headlines = []
    wordcounts = []
    for i in range(0, 100):
        articles = api.search(fq={'source': ['The New York Times']},
                              begin_date=date + '0101',
                              end_date=date + '1231',
                              sort='oldest',
                              page=str(i))
        headlines += parse_headlines(articles)
        wordcounts += parse_wordcount(articles)
        time.sleep(1)  # throttle to stay under the API rate limit (10 items per page)

    # find the average word count for this year
    num_wordcounts = len(wordcounts)
    total = 0
    for wc in wordcounts:
        if wc is not None:
            try:
                total += int(wc)
            except ValueError:
                # the original printed the bad value but still counted it,
                # skewing the average; exclude it from the denominator
                print(wc)
                num_wordcounts -= 1
        else:
            num_wordcounts -= 1
    avg_wordcount = total / num_wordcounts
    return (avg_wordcount, headlines)
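# NOTE: the function above calls parse_headlines and parse_wordcount, which
# are not shown in this snippet. A minimal sketch of what they might look
# like, assuming the standard Article Search v2 response shape
# (response.docs, with headline.main and word_count per doc):

def parse_headlines(articles):
    """Pull the main headline out of each doc in an Article Search response."""
    docs = articles.get('response', {}).get('docs', [])
    return [d['headline']['main'] for d in docs if d.get('headline')]

def parse_wordcount(articles):
    """Pull the word_count field out of each doc; entries may be None."""
    docs = articles.get('response', {}).get('docs', [])
    return [d.get('word_count') for d in docs]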
def api_query(topics):
    """Hits the NYT Article Search API once per topic and collects the results."""
    api = articleAPI(API_KEY)
    results = []
    for x in topics:
        # the original passed x[1] positionally and returned only the last
        # response; pass it as the query and accumulate instead
        results.append(api.search(q=x[1]))
        time.sleep(2)  # throttle between calls
    return results
def scrape_nyt_politics():
    api = articleAPI(nyt_key)
    bush_dict = api.search(q="Immigration", begin_date=20010120, end_date=20010430)
    obama_dict = api.search(q="Immigration", begin_date=20080120, end_date=20080429)
    trump_dict = api.search(q="Immigration", begin_date=20160120, end_date=20160429)

    # getting into docs
    bush = bush_dict["response"]["docs"][0]
    obama = obama_dict["response"]["docs"][0]
    trump = trump_dict["response"]["docs"][0]
    print("BUSH", bush, "OBAMA", obama, "TRUMP", trump)

    # write each first doc to its own JSON file; the original passed the doc
    # dicts themselves to open() and missed a close() call, so the output
    # filenames here are assumed
    for name, doc in (("bush", bush), ("obama", obama), ("trump", trump)):
        with open(name + ".json", "w") as f:
            f.write(json.dumps(doc))
def get_article(query):
    api = articleAPI("522e4e6f593d44baaf69a87cdff70548")
    today = date.today()
    prev_date = date(2014, 1, 1)
    results = api.search(q=query,
                         begin_date=prev_date.strftime('%Y%m%d'),
                         end_date=today.strftime('%Y%m%d'),
                         sort='newest',
                         fl=['web_url', 'snippet', 'headline', 'pub_date'])
    articles = results['response']['docs'][0:3]
    adapter = []
    for article in articles:
        # the original looped over article.items() only to assign the same
        # keys repeatedly; one mapping per article is equivalent
        adapter.append({
            'Source': 'Times',
            'URL': article['web_url'],
            'Title': article['headline']['main'],
            'Summary': article['snippet'],
            'Published on': article['pub_date'],
        })
    return adapter
def newyorktimes(search):
    # needs to be recoded, doesn't work
    NYTapi = articleAPI('New York Times API CODE GOES HERE')
    client = nyt.Client(NYTapi)
    res = client.query(search, sentences=4)
    summary = next(res.results).text
    print(summary)
def article_inform(keywords, date_series, file_name='data.txt', api_key=_api):
    """
    :param keywords: list of keywords
    :param date_series: list of dates
    :param file_name: output file name
    :return: list of article records from The New York Times
    """
    content = []
    # my personal API key; please get your own through the website below.
    # For usage: http://developer.nytimes.com/article_search_v2.json#/Documentation/GET/articlesearch.json
    api = articleAPI(api_key)
    # Download the articles for each keyword over each date interval;
    # each response is a dict parsed from JSON
    for keyword in keywords:
        for i in range(0, len(date_series) - 1):
            try:
                # the Article Search API expects begin_date, not start_date
                articles = api.search(q=keyword,
                                      begin_date=date_series[i],
                                      end_date=date_series[i + 1])
                time.sleep(1)
                for text in articles['response']['docs']:
                    content.append(text)
            except Exception:
                continue
    # save file
    with open(file_name, 'w') as f:
        for item in content:
            f.write("{}\n".format(item))
    return content
def retrieveArticlesByNewsDesk(query, newsDesk, begin, end, maxRequests=10):
    t = time()
    articlesToKeep = []
    api = articleAPI('2aeeb020218af0f30993f12ff451d821:10:62878084')
    page = 0
    while True:
        try:
            responseObject = api.search(
                q=query,
                fq={'source': ['Reuters', 'AP', 'The New York Times'],
                    'news_desk': [newsDesk]},
                begin_date=begin,
                end_date=end,
                page=page)
        except Exception:
            print("search failed")
            print(query, newsDesk, begin, end, maxRequests)
            return (articlesToKeep, page + 1)
        if 'response' not in responseObject:
            print("No response")
            print(query, newsDesk, begin, end, maxRequests)
            return (articlesToKeep, page + 1)
        articles = responseObject['response']['docs']
        for article in articles:
            if worthKeeping(article):
                condensed = condenseArticle(article)
                articlesToKeep.append(condensed)
        if len(articles) < 10 or page == maxRequests - 1:
            break
        page += 1
    print("")
    print("Retrieved " + str(len(articlesToKeep)) + " " + newsDesk + " articles.")
    print("Made " + str(page + 1) + " requests.")
    print(time() - t)
    return (articlesToKeep, page + 1)
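# NOTE: retrieveArticlesByNewsDesk relies on worthKeeping and condenseArticle,
# defined elsewhere in that project. A hedged sketch of plausible
# implementations, assuming docs follow the Article Search v2 shape; the
# filtering criteria here are illustrative only:

def worthKeeping(article):
    """Illustrative filter: keep full articles that have a headline and body text."""
    return (article.get('document_type') == 'article'
            and article.get('headline', {}).get('main')
            and article.get('lead_paragraph'))

def condenseArticle(article):
    """Keep only the fields a caller is likely to use downstream."""
    return {
        'headline': article['headline']['main'],
        'pub_date': article['pub_date'],
        'url': article['web_url'],
        'snippet': article.get('snippet'),
    }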
def connect(self) -> None:
    """Creates a connection to NYT."""
    if self.keys is None:
        self.keys = self._get_api_keys(self.path)
    # be sure to generate from list
    self.connection = articleAPI(self.keys[0])
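# NOTE: connect() assumes a _get_api_keys helper not shown here. A minimal
# sketch, assuming one key per line in a text file at self.path:

def _get_api_keys(self, path: str) -> list:
    """Read API keys from a text file, one key per line."""
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]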
def nyDataFunc():
    api = articleAPI("wL9jacwKcc7zrn4UrgRtD59ikr8cHe5s")
    with open('nyTimes_Rugby_Data1.csv', 'a', newline='') as newFile:
        newFileWriter = csv.writer(newFile)
        newFileWriter.writerow(['nyTimes_Rugby_Data'])
        for i in range(10):
            # query fixed from the original's 'Rubgy' typo
            articles = api.search(q='Rugby', begin_date=20190101, page=i)
            data = articles['response']['docs']
            for doc in data:
                newFileWriter.writerow([doc['web_url']])
def getArticles(stocks):
    api = articleAPI(private.NYT)
    articles = {}
    for key, value in stocks.items():
        stockRequest = api.search(fq={'headline': value}, begin_date=20150905)
        try:
            doc = stockRequest['response']['docs'][0]
            headline = doc['headline']['main']
            url = doc['web_url']
            abstract = doc['abstract'] if doc['abstract'] is not None else "None"
        except Exception:
            # no result for this stock; skip it instead of reusing the
            # previous iteration's (possibly unbound) values
            continue
        articles[key] = {'headline': headline, 'url': url, 'abstract': abstract}
    return articles
def get_articles(self, date, news_desk, key):
    api = articleAPI(key)
    start_months = ["0101", "0201", "0301", "0401", "0501", "0601",
                    "0701", "0801", "0901", "1001", "1101", "1201"]
    # month-end days corrected (March, May and October have 31 days;
    # February 29 in leap years is still not handled)
    end_months = ["0131", "0228", "0331", "0430", "0531", "0630",
                  "0731", "0831", "0930", "1031", "1130", "1231"]
    for j in range(len(start_months)):
        for i in range(0, 200):
            try:
                # news_desk is a filter-query field, not a top-level
                # parameter, so it belongs inside fq
                articles = api.search(fq={'source': ['The New York Times'],
                                          'news_desk': [news_desk]},
                                      begin_date=date + start_months[j],
                                      end_date=date + end_months[j],
                                      sort='oldest',
                                      page=str(i))
                self.write_to_file(articles)
                if self.temp > 20:
                    break
            except Exception as e:
                print('Error while extracting from API: ' + str(e))
                continue
def nytimes_api():
    """Return a handle to the New York Times article API.

    Load an API key from a local file not included in the git repository,
    then use it to acquire a handle to the New York Times article API.

    Returns:
        api (nytimesarticle.articleAPI): A New York Times article API handle.
    """
    # File containing the New York Times API key
    key_file = 'api-keys/nytimes.txt'
    # Read in the key string, removing the trailing newline
    with open(key_file) as f:
        key = f.read()[:-1]
    return nyt.articleAPI(key)
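# NOTE: for context, a typical call site for nytimes_api() might look like
# the following; the query shown is illustrative, not from the source:

api = nytimes_api()
articles = api.search(q='climate change', begin_date=20150101)
print(articles['response']['meta']['hits'])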
def main():
    txt = open("keywords.txt", "w")
    api = articleAPI('45862958eff543bb9555201274493184')
    sections = which_sections()
    article_list = get_articles(api, sections)
    proper_list = get_proper_nouns(article_list)
    while True:
        pref = input("Which articles sound interesting? ")
        if pref == '-':
            break
        pref_num = int(pref) - 1
        for i in proper_list[pref_num]:
            txt.write(i + '\n')
    txt.close()
def generate_url_file(start_page_number=0, to_extract=400):
    urls_file_path = get_file_path('data/urls.txt')
    if os.path.exists(urls_file_path):
        print(urls_file_path + ' already exists. Not overwriting [INFO]')
        return
    # Equivalent raw request:
    # https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=ffd404f8bd874e2b881367f933dd423b
    #   &fq=news_desk:("Politics") AND type_of_material:("News")&page=0&fl=web_url
    ny_api = articleAPI(API_KEY)
    filters = {
        'news_desk': 'Politics',
        'type_of_material': 'News',
    }
    with open(urls_file_path, 'w') as to_write_file:
        count = 0
        page_number = start_page_number
        while count < to_extract:
            previous_count = count
            json_content = ny_api.search(fq=filters, page=page_number)
            num_hits = json_content[u'response'][u'meta'][u'hits']
            print('Number of hits is: ' + str(num_hits) +
                  '. Current count is: ' + str(count))
            if num_hits < to_extract:
                print('Number of hits is less than to extract. Resizing. [WARNING]!!')
                to_extract = num_hits
            results = json_content[u'response'][u'docs']
            for result in results:
                to_write_file.write(result[u'web_url'])
                to_write_file.write('\n')
                count = count + 1
            page_number += 1
            time.sleep(1)
            if previous_count == count:
                print('no more result')
                break
def __init__(self, key='', cache=None):
    '''
    Constructor: set up the API key here and reference the data store
    as necessary.
    key - the API key used to access the NY Times article API
    cache - instance of the Cache class to use
    '''
    # need a key
    if not key:
        raise Exception('No key passed in, please provide one')
    self.__nytimes_api = articleAPI(key)
    # need the cache
    if not cache:
        raise Exception('Missing cache, cannot continue')
    self.__cache = cache
def GetArchivesNYT(self, company, entities):
    api = articleAPI(constants.NYT_KEY)
    articles = api.search(
        q=entities,
        fq={'headline': company,
            'source': ['Reuters', 'AP', 'The New York Times',
                       'RETRO REPORT', 'Technology', ' Amazon - Technology']},
        begin_date=20160901)
    news = []
    try:
        for i in articles['response']['docs']:
            dic = {}
            dic['date'] = i['pub_date'][0:10]  # cutting time of day
            dic['url'] = i['web_url']
            if dic['date'] is not None:
                news.append(dic)
    except Exception:
        print('Could not retrieve articles for', company)
    return news
def main():
    if os.getenv('NYTIMES_KEY') is None:
        print("Usage: \nSet 1 environment variable for NY Times authentication: ")
        print("export NYTIMES_KEY=\"your key\"")
        sys.exit()

    # auth to NY Times
    api = articleAPI(os.environ['NYTIMES_KEY'])

    # parse and print headlines
    sys.stdout.flush()
    headlines = parse_headlines(keyword, api)
    print_headlines(headlines)

    while True:
        for i in range(10, 0, -1):
            sys.stdout.flush()
            sys.stdout.write('Refresh in %d\r' % i)
            time.sleep(1)
        sys.stdout.flush()
        headlines = parse_headlines(keyword, api)
        print_headlines(headlines)
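# NOTE: main() depends on parse_headlines and print_headlines, which are not
# shown, and assumes a module-level `keyword` query string. One plausible
# shape for those helpers, assuming the standard response layout:

def parse_headlines(keyword, api):
    """Search for keyword and return the main headlines of the newest results."""
    articles = api.search(q=keyword, sort='newest')
    return [doc['headline']['main'] for doc in articles['response']['docs']]

def print_headlines(headlines):
    for h in headlines:
        print('* ' + h)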
class NytimesapiSpider(scrapy.Spider):
    name = "nytimes"
    allowed_domains = ["nytimes.com"]
    start_urls = ['http://nytimes.com/']

    # note: this block runs at class-definition (import) time
    api = articleAPI("038ee5e586674d6aa8c9102e6177a6ca")
    articles = api.search(q='Trump',
                          fq={'headline': 'Trump',
                              'source': ['Reuters', 'AP', 'The New York Times']},
                          begin_date=20170101,
                          facet_field=['source', 'day_of_week'],
                          facet_filter=True)
    article_list = []
    article_dict = {}
    for key, value in articles.items():
        if key == 'response':  # the original's 'is' compared identity, not equality
            article_dict[key] = value
            article_list.append(article_dict)

    def parse(self, response):
        print(response)
def _get_articles(keyword, from_date, end_date, page_number, article_type):
    api = articleAPI(nytimes_api_key)
    if article_type == 'blog':
        query_filter = {
            'section_name.contains': ['World', 'U.S.', 'Opinion'],
            'type_of_material.contains': ['Blog']
        }
    else:
        query_filter = {
            'section_name.contains': ['World', 'U.S.'],
            'type_of_material.contains': ['News', 'Brief']
        }
    articles = api.search(q=keyword,
                          fq=query_filter,
                          begin_date=from_date,
                          end_date=end_date,
                          page=page_number)
    return articles
def get_NYT(start, end):
    with open('nyt_keys.json') as key:
        nyt_creds = json.load(key)
    api = nytimesarticle.articleAPI(nyt_creds['NYT_ARTICLE'])
    start_dt = pd.to_datetime(start).date()
    end_dt = pd.to_datetime(end).date()
    num_days = (end_dt - start_dt).days
    articles = []
    for i in range(num_days + 1):
        dt = start_dt + datetime.timedelta(i)
        dt_start = dt.strftime('%Y%m%d')
        result = api.search(
            q='brexit',
            fq={'source': ['Reuters', 'AP', 'The New York Times'],
                'news_desk': ["Foreign", "Business", "Financial",
                              "Market Place", "World"],
                'document_type': "article"},
            begin_date=dt_start,
            end_date=dt_start,  # one-day window
            sort="newest",
            fl="web_url,snippet,lead_paragraph,abstract,source,headline,"
               "keywords,pub_date,document_type,news_desk,type_of_material",
            facet_field="source,section_name,document_type")
        for doc in result.get('response', {}).get('docs', []):
            articles.append(doc)
    dates = []
    corpus = []
    for art in articles:
        r = requests.get(art['web_url'])
        tree = html.fromstring(r.content)
        contents = tree.xpath(
            '//p[@class="story-body-text story-content"]/text()')
        body = []
        for t in contents:
            # the original's `t.strip <> ''` never called strip() and used
            # the removed <> operator
            if t.strip() != '':
                body.append(re.sub(r'[^\w]', ' ', t))
        corpus.append(''.join(body))
        dates.append(art['pub_date'][0:10])
    return dates, corpus
def get_articles(date, query):
    '''
    This function accepts a year in string format (e.g. '1980') and a query
    (e.g. 'Amnesty International') and returns a list of parsed articles
    (as dictionaries) for that year.
    '''
    all_articles = []
    api = articleAPI(api_key)
    # NYT limits the pager to the first 100 pages. But rarely will you find
    # over 100 pages of results anyway.
    for i in range(0, 100):
        articles = api.search(q=query,
                              fq={'source': ['Reuters', 'AP', 'The New York Times']},
                              begin_date=date + '0101',
                              end_date=date + '1231',
                              sort='oldest',
                              page=str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
    return all_articles
def searchTopic(topic):
    # read the API key out of the 'secret' file
    with open('secret', 'r') as f:
        for line in f:
            if line.find('NYTIMES_ARTICLE') != -1:
                a, key = line.split('=')
                key = key.replace('\n', '')
    api = articleAPI(key)
    headline = topic
    sources = ['Reuters', 'AP', 'The New York Times']
    beg_date = 20140101
    pages = 4
    articles_content = []
    for page in range(pages):
        articles = api.search(q=topic,
                              fq={'headline': headline, 'source': sources},
                              fl=['web_url'],
                              begin_date=beg_date,
                              page=str(page + 1))
        for url in articles['response']['docs']:
            if url['web_url'].find('video') == -1:
                # the original reused the name 'page' here, clobbering the
                # loop variable; renamed to resp
                resp = requests.get(url['web_url'])
                doc = lh.fromstring(resp.content)
                text = doc.xpath('//p[@itemprop="articleBody"]')
                article = str()
                for par in text:
                    article += par.text_content()
                articles_content.append(article)
    for count, art in enumerate(articles_content):
        regex = re.compile('[^a-zA-Z]')
        articles_content[count] = regex.sub(' ', art)
    # with open('%s data.csv' % topic, 'wb') as output_file:
    #     wr = csv.writer(output_file, quoting=csv.QUOTE_ALL)
    #     wr.writerow(articles_content)
    print(articles_content)
def main():
    client = MongoClient('localhost', 27017)
    politician_db = client['politician_db']
    api = articleAPI('API KEY')
    df = pd.read_pickle('pol_df.pkl')
    already_loaded = pickle.load(open('pols_in_mongo.pkl', 'rb'))
    pol_list = df.Name
    for politician in pol_list:
        time.sleep(1)
        df_entry = df[df.Name == politician]
        # create collection for politician
        politician_col = politician_db[df_entry['collectionname'][df_entry.index[0]]]
        if df_entry['collectionname'][df_entry.index[0]] in already_loaded:
            print(politician, 'is already in the database')
            continue
        # set up query dates
        beg_d = df_entry['Entered Office'][df_entry.index[0]]
        end_d = df_entry['Exited Office'][df_entry.index[0]]
        try:
            beg_d_in = date_query(beg_d)
        except Exception:
            print(politician, end=' ')
            print('Failed at the beginning date step')
        if type(end_d) == str:
            # still in office: query from entry date onward
            try:
                done = False
                page = 1
                while not done:
                    time.sleep(1)
                    search = api.search(q=politician, begin_date=beg_d_in, page=page)
                    politician_col.insert_many(search['response']['docs'])
                    print(politician, page)
                    page += 1
                    if len(search['response']['docs']) % 10 != 0 or len(search['response']['docs']) == 0:
                        done = True
                print(f'Successful query for {politician};', end=' ')
            except Exception:
                print(politician, end=' ')
                print('failed their attempt to get into our database :(')
        else:
            try:
                end_d_in = date_query(end_d)
                done = False
                page = 1
                while not done:
                    time.sleep(1)
                    search = api.search(q=politician, begin_date=beg_d_in,
                                        end_date=end_d_in, page=page)
                    politician_col.insert_many(search['response']['docs'])
                    print(politician, page)
                    page += 1
                    if len(search['response']['docs']) % 10 != 0 or len(search['response']['docs']) == 0:
                        done = True
                print(f'Successful query for {politician};')
            except Exception:
                print(politician, end=' ')
                print('failed their attempt to get into our database :(')
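# NOTE: the loop above converts office dates with a date_query helper defined
# elsewhere in that project. A minimal sketch, assuming it turns an arbitrary
# date string into the YYYYMMDD integer the Article Search API expects:

from dateutil.parser import parse as parse_date

def date_query(d):
    """Convert a date string (or datetime) into a YYYYMMDD int."""
    if isinstance(d, str):
        d = parse_date(d)
    return int(d.strftime('%Y%m%d'))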
from nytimesarticle import articleAPI
from bs4 import BeautifulSoup
import requests

api = articleAPI("458e6335355c4f2fb83770f41baa4309")
f = open('nytArticles.txt', 'w')
links = []
try:
    for a in range(0, 20):
        articles = api.search(q="shooting", begin_date=20180406,
                              end_date=20180407, page=a)
        for i in range(0, len(articles['response']['docs'])):
            url = articles['response']['docs'][i]['web_url']
            data = requests.get(url)
            soup = BeautifulSoup(data.content, 'html.parser')
            # write every paragraph except the trailing boilerplate ones
            for j in range(len(soup.find_all('p')) - 3):
                f.write(soup.find_all('p')[j].get_text())
            links.append(url)
    f.close()
except:
    print("We got only", len(links), "articles for selected period")
import csv
from nytimesarticle import articleAPI
import time

api = articleAPI('ec4da91764da4217983fadb8c85f1dca')

all_articles = []
# NYT limits the pager to the first 100 pages. But rarely will you find over
# 100 pages of results anyway.
for j in range(0, 101):
    # API allows roughly one call every 5 seconds; make it 6 so it doesn't
    # get angry
    time.sleep(6)
    articles = api.search(q="Trump",
                          fq={'headline': 'Trump'},
                          begin_date='20160707',
                          end_date='20161107',
                          fl='headline,pub_date,word_count',
                          sort='newest',
                          page=str(j))
    print(j)
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['date'] = i['pub_date'][0:10]  # cutting time of day
        dic['word_count'] = i['word_count']
        news.append(dic)
    # accumulate; the original overwrote `articles` each page and never
    # used all_articles
    all_articles += news
from nytimesarticle import articleAPI
import json
from secret import APIKey

api = articleAPI(APIKey)
# append each page of results as JSON text; the original's 'ab' (binary
# append) mode would require bytes, not the str that json.dumps returns
testfile = open("foo.txt", "a")
for i in range(1, 101):
    articles = api.search(q='ebola', page=i)
    testfile.write(json.dumps(articles))
testfile.close()
from nytimesarticle import articleAPI
import requests
from bs4 import BeautifulSoup
import csv
import urllib

APIKey = "5GDjPzkz28HHyX3LuEoWeAMiBUZGqnHw"
api = articleAPI(APIKey)

def read():
    # Read csv and return list of urls
    url = []
    try:
        with open("url.csv", mode="r") as csv_file:
            readCSV = csv.reader(csv_file, delimiter=',')
            header = True
            for row in readCSV:
                if header:
                    header = False
                    continue
                try:
                    url.append(row[0])
                except IndexError:
                    break
    except FileNotFoundError:
        # create an empty url.csv, then re-read it (returns an empty list)
        with open("url.csv", mode='w') as csv_file:
            writer = csv.writer(csv_file, delimiter=',')
        url = read()
    return url
""" NYT Download articles """ from nytimesarticle import articleAPI import pandas as pd import time import re api = articleAPI("<PASTE_YOUR_API_KEY>") df = pd.DataFrame() for i in range(1, 10): articles = api.search(q="Artificial Intelligence", begin_date=int("20180" + str(i) + "01"), end_date=int("20180" + str(i) + "28")) df = df.append(pd.DataFrame.from_dict(articles)) time.sleep(60) for i in range(10, 13): articles = api.search(q="Artificial Intelligence", begin_date=int("2018" + str(i) + "01"), end_date=int("2018" + str(i) + "28")) df = df.append(pd.DataFrame.from_dict(articles)) time.sleep(60) for i in range(1, 4): articles = api.search(q="Artificial Intelligence", begin_date=int("20190" + str(i) + "01"), end_date=int("20190" + str(i) + "28"))
def __init__(self):
    self.api = articleAPI("9ed601a914bd7b99e85df32fcb1b1a8b:9:75048351")
    self.news = []
# nytimes scraping
from nytimesarticle import articleAPI
import datetime
import json
import pandas as pd
import time

api = articleAPI('47948d7109eda66d76ccb67753997d53:15:28718339')

page_start = 24
page_end = 30
pages = range(page_start, page_end)
query_set = 'David Brooks'

mainDF = pd.DataFrame(columns=['headline', 'kicker', 'pub_date', 'page'])
for page in pages:
    articles = api.search(q=query_set,
                          fq={'kicker': query_set},
                          fl=['headline', 'byline', 'pub_date'],
                          page=page)
    article_info = []
    for article in articles['response']['docs']:
        article_info.append({'kicker': article['headline']['kicker'],
                             'headline': article['headline']['main'],
                             'pub_date': article['pub_date'],
                             'page': page})
        # locations
        locations = []
        for x in range(0, len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        # subject
        subjects = []
        for x in range(0, len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects
        news.append(dic)
    return news

api = articleAPI('fde3a426c74d6a1591bcdf1ffe1847d3:7:75025733')

# Function to get the data
def get_articles(date, query):
    '''
    This function accepts a year in string format (e.g. '1980') and a query
    (e.g. 'Amnesty International') and returns a list of parsed articles
    (as dictionaries) for that year.
    '''
    all_articles = []
    # NYT limits the pager to the first 100 pages. But rarely will you find
    # over 100 pages of results anyway.
    for i in range(0, 100):
        articles = api.search(q=query,
                              fq={'source': ['Reuters', 'AP', 'The New York Times']},
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from database_setup_newspaper import Base, Subject, Article  # import classes from databaseSetup file

engine = create_engine('sqlite:///newspaper.db')  # lets program know which db engine to connect with
Base.metadata.bind = engine  # makes connections between classes and corresponding tables in db
DBSession = sessionmaker(bind=engine)  # creates a link between our code and the engine we created
session = DBSession()

##################### WEB SCRAPING COMPONENT ############################
# Info from http://dlab.berkeley.edu/blog/scraping-new-york-times-articles-python-tutorial
from nytimesarticle import articleAPI

api = articleAPI('dad4ec569b4924e931a381a9d9d1edb5:16:75117993')

obamaArticles = api.search(q='Obama',
                           fq={'headline': 'Obama',
                               'source': ['The New York Times']},
                           begin_date=20141231)
clintonArticles = api.search(q='Hillary Clinton',
                             fq={'headline': 'Hillary Clinton',
                                 'source': ['The New York Times']},
                             begin_date=20141231)

def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses the articles
    into a list of dictionaries
    '''
    news = []
#! python3
# Scraping NYT articles for a specific vocabulary word, returning the full
# sentence if found.

import json
from nytimesarticle import articleAPI

api = articleAPI('417fe849ada646a28b95d6185b70777c')

articles = api.search(q='precocious', begin_date='20050101')
url = 'https://api.nytimes.com/svc/search/v2/'

# TODO
# Get more articles from earlier dates; make sure to get only articles, not movies
# Option for user to get another sentence
# How to search for a certain word (re?)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

def create_save_dir(file_type, section):
    root_dirpath = os.environ["data_dir"] + "nyt_corpus/"
    # str.join takes a single iterable; os.path.join is what was intended
    save_dirpath = os.path.join(root_dirpath, file_type, section)
    mkdir_p(save_dirpath)
    return save_dirpath

if __name__ == "__main__":
    # export the NYT api key as the environment variable nyt_api_key
    api = articleAPI(os.environ["nyt_api_key"])
    nytd_section = ['Arts', 'Business', 'Obituaries', 'Sports', 'World']
    for section in nytd_section:
        num_pages = 101  # nyt max paginate limit = 101
        for i in range(0, num_pages):
            print("scraping {0} section, page {1}/{2}".format(section, i + 1, num_pages))
            articles = api.search(sort='newest',
                                  fq={'source': ['The New York Times'],
                                      'document_type': ['article'],
                                      'section_name': [section]},
                                  page=i)
            news = parse_articles(articles)
            body_text = []
            for j in range(10):
                r = requests.get(news[j]['url'])
                data = r.text
                soup = BeautifulSoup(data)
                g_text = soup.find_all("p", {"class": ["story-body-text story-content",
                                                       "story-body-text"]})
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 3 20:05:40 2016

@author: katherinemckenna
"""
from nytimesarticle import articleAPI
import csv
import os

key_word = "Obamacare"
os.chdir('/Users/katherinemckenna/Documents/CSE6242/Project/Articles Pulling')
api = articleAPI('0358c9b864c6cad5603a6a32420c60be:7:74784865')

def parse_articles(articles):
    news = []
    for i in articles['response']['docs']:
        dic = {}
        # dic['id'] = i['_id']
        dic['desk' + "_" + key_word] = i['news_desk']
        dic['date' + "_" + key_word] = i['pub_date'][0:10]  # cutting time of day
        dic['section' + "_" + key_word] = i['section_name']
        # dic['type'] = i['type_of_material']
        dic['url' + "_" + key_word] = i['web_url']
        # dic['word_count'] = i['word_count']
        news.append(dic)
    return news

def get_articles(date, query):
def __init__(self, key):
    self.api = articleAPI(key)
    self.secret_keyword = "climate change and "
    self.num_days_trending = 3
from nytimesarticle import articleAPI api = articleAPI("391f118fad27467d8b193a6686e1b3d1") articles = api.search(q='Trump', fq={ 'headline': 'Trump', 'source': ['The New York Times'] }, begin_date=20180618) print articles #prints the url to the trending article and the details of the whole article
'''
https://github.com/evansherlock/nytimesarticle/blob/master/README.txt

# Steps
1) Install nytimesarticle
   python -m easy_install nytimesarticle
   pip install nytimesarticle
2) Run
   python nyt_v1.py
'''

# Import package
from nytimesarticle import articleAPI

# Set API Key
api = articleAPI('your_api_key')

# Query 'budget deficits'
articles = api.search(q='federal budget deficits')
# articles = api.search(q='budget deficits', begin_date=20111231, page=3)
# articles = api.search(q='budget deficits', fq={'headline': 'Obama'})

# Total hits
response = articles['response']['meta']['hits']
print(response)

# Get publication date of the articles
response = articles['response']['docs']
for item in response:
    print(item['pub_date'])
from nytimesarticle import articleAPI
import datetime
import time

api = articleAPI('ae2b43b298972426c8e43a96afcb9aaa:0:70254909')

def clean_entry(art):
    clean = {}
    if art['abstract']:
        clean['snippet'] = art['abstract'].encode('ascii', 'ignore')
    else:
        clean['snippet'] = art['snippet'].encode('ascii', 'ignore')
    clean['url'] = art['web_url']
    clean['multimedia'] = "http://www.nytimes.com/" + str(art['multimedia'][1]['url'])
    clean['keywords'] = []
    for word in art['keywords']:
        clean['keywords'].append(word['value'].encode('ascii', 'ignore'))
    clean['headline'] = art['headline']['main'].encode('ascii', 'ignore')
    return clean

def get_articles(cb, so_far=[], topic=None, page=0, limit=5):
    def respond_get_articles(payload):
        if not payload:
            cb(None)
            return  # the original fell through here after calling cb(None)
        good_articles = []
        for art in payload['response']['docs']:
            if (art['snippet'] or art['abstract']) and art['multimedia'] \
                    and art['type_of_material'] in ["News", "Editorial"]:
                good_articles.append(clean_entry(art))
        good_articles += so_far
        if len(good_articles) >= limit:
from nytimesarticle import articleAPI
import collections
import math
import datetime
import re
import pickle

# api keys
prasmuss = '7b4597b0dc6845688a8f90c00f3e60b6'
peter_gray_rasmussen = '67391c8a5c6c2d8926eb3d9c5d136c59:7:72273330'
proton = 'f8c34c7cda7848f997a9c273815d28a9'
api = articleAPI(proton)

def convert(data):
    '''
    Recursively encode a dictionary of unicode entries as utf8 strings; from
    http://stackoverflow.com/questions/1254454/fastest-way-to-convert-a-dicts-keys-values-from-unicode-to-str
    '''
    if isinstance(data, basestring):
        return str(data)
    elif isinstance(data, collections.Mapping):
        return dict(map(convert, data.iteritems()))
    elif isinstance(data, collections.Iterable):
        return type(data)(map(convert, data))
    else:
        return data

def get_nyt_article_stats(articles_and_meta):
    '''
import requests, json
from bs4 import BeautifulSoup as bs
from nytimesarticle import articleAPI
import random, time

#api = articleAPI("1365a8efa2ee4cd9bb685612a03904b1")
#api = articleAPI("aeff1d6a957f40c19bbda08b9714c4a4")
#api = articleAPI("336886d72ea14a71a9a5c34800f9a761")
api = articleAPI("8b80d00bbc10401aa74b05e4be7fd065")
#api = articleAPI("4009c64cd2cf4a6ea25592535e9c83e3")
#api = articleAPI("a91daaccbde54672921fcd3742a1a85d")
#api = articleAPI("8b3df0ca99ed46d6bbf103c934c94ce9")
#api = articleAPI("c159e3dfabdc45c993619e1bb6425002")
#api = articleAPI("98ebc312f98d4257b1c27445152e6435")
#api = articleAPI("9555d8d40c0044659546fdeeda4d290e")
#api = articleAPI("3f94c1e436964be6909319b6d0989ec9")
#api = articleAPI("7b3f3b398aec414aae09501f8c430155")
#api = articleAPI("c1083f7480f941159de8f70d60975715")
#api = articleAPI("bf53fb1b8c144020b41bbdf99b676157")
#api = articleAPI("ade519c504884db097f6ce8fe9aecc2b")
#api = articleAPI("4e3304e9ba354531ba8f35935a21188b")

month = 9
outfile = open("NYTstories_" + str(month) + ".dat", "a")

## Algorithm
# Randomly sample 1000 articles per month from the monthly data
import json
import retinasdk
import urllib
from nytimesarticle import articleAPI
from flask import Flask, request, jsonify
import giphypop
import requests

g = giphypop.Giphy()
api = articleAPI('499fdeaf3f57e86e6e07db5f763ed2b1:4:74404554')
liteClient = retinasdk.LiteClient("b24cfff0-d29c-11e5-8378-4dad29be0fab")
q = []
app = Flask(__name__)

def getMultimedia(mm):
    for m in mm:
        if 'thumbnail' in m['legacy']:
            return 'http://www.nytimes.com/' + m['legacy']['thumbnail']

def getEntity(text):
    url = ("http://gateway-a.watsonplatform.net/calls/text/TextGetRankedNamedEntities"
           "?apikey=bab3a3fa37b3724f29c56caee74e314b585d993d&outputMode=json&text=") + text
    u = urllib.urlopen(url)  # u is a file-like object
    data = u.read()
    resp = json.loads(data)
    for entity in resp['entities']:
        if entity['type'] in ['Company', 'Facility', 'GeographicFeature'
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk, tree
from nltk.tag.stanford import StanfordNERTagger
from nytimesarticle import articleAPI
from pprint import pprint
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
plt.rcdefaults()
import matplotlib as mpl
import numpy as np
import matplotlib.lines as mlines
import sys
import operator
import re  # needed for the compiled patterns below

api_key = "01e07bd2b8034b81bc9bef8d3e35df3a"
api_interface = articleAPI(api_key)

warc_type_regex = re.compile(r'((WARC-Type:).*)')
warc_record_id_regex = re.compile(r'((WARC-Record-ID:) <.*>)')
html_regex = re.compile(r'<html\s*(((?!<html|<\/html>).)+)\s*<\/html>', re.DOTALL)

def visualize(main_entity_name, article_name, article_entities, output_name, bar_amount):
    # Initialize values
    dictionary = dict()
    entities = list()
    values = list()
    for element in article_entities:
        if element == []:
# final project
# course: computation for public policy
# project topic: from a western perspective: China and the Environment

# import the NYT articles API
from nytimesarticle import articleAPI

api = articleAPI('22bcd777b40f8d77e6ccf6469e7f9f16:11:67754634')

# test api searches
# put the more important keyword in q; still need to check how the word
# 'environment' is actually used in the results
search = api.search(q='environment',
                    fq={'headline': 'China',
                        # 'subline': 'environment',
                        'body': ['China'],
                        'source': ['Reuters', 'AP', 'The New York Times']},
                    sort='oldest',
                    begin_date=19900101,
                    end_date=20151231)

# defining a function to parse the result returned from the API
def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses the articles
    into a list of dictionaries
    '''
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
# coding: utf-8

# In[1]:
from nytimesarticle import articleAPI
from bs4 import BeautifulSoup
import requests
import re
import io

# In[2]:
api = articleAPI("7b9a5b632d244ba281e306111824d31a")

# In[26]:
search_keyword = 'stormy daniels trump'
articles = api.search(q=search_keyword, begin_date="20161001")

# In[27]:
for doc in articles['response']['docs']:
    arti_url = doc['web_url']
    print(arti_url)
from nytimesarticle import articleAPI
import time

API_KEY = 'd0ca6e84f73f40a2b9cc4e7b24dc5674'
api = articleAPI(API_KEY)

articles = api.search(q='Trump',
                      fq={'headline': 'Trump',
                          'source': ['Reuters', 'AP', 'The New York Times']})

def parse_articles(articles):
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
        if i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode('utf8')
        dic['headline'] = i['headline']['main'].encode('utf8')
        dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10]  # cutting time of day
        dic['section'] = i['section_name']
        if i['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode('utf8')
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = i['word_count']
        # locations
        locations = []
        for x in range(0, len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
# -*- coding: utf-8 -*-
import os
import csv
import time
from nytimesarticle import articleAPI

api = articleAPI('pE4obr7qszOl3BziHDhS47RQv8ssf7pC')

# use os.path.join rather than a literal backslash path
sp_dir = os.path.join('articles_scrapped', 'sp_articles')
if not os.path.exists(sp_dir):
    os.makedirs(sp_dir)

def parse_articles(articles):
    news = []
    if (articles.get("response") is not None) and \
            (articles.get("response").get("docs") is not None):
        for i in articles['response']['docs']:
            dic = {}
            dic['id'] = i['_id']
            if i['abstract'] is not None:
                dic['abstract'] = i['abstract']
            dic['headline'] = i['headline']['main']
            dic['desk'] = i['news_desk']
            dic['date'] = i['pub_date'][0:10]  # cutting time of day
            dic['section'] = i['section_name']
            if i['snippet'] is not None:
                dic['snippet'] = i['snippet']
            if i['lead_paragraph'] is not None:
                dic['lead_paragraph'] = i['lead_paragraph']
            dic['source'] = i['source']
import json
from nytimesarticle import articleAPI

with open('api_keys.json') as f:
    api_key = json.load(f)['nyt_api_key']
api = articleAPI(api_key)

date_list = [('20150101', '20150131'), ('20150201', '20150228'),
             ('20150301', '20150331'), ('20150401', '20150430'),
             ('20150501', '20150531'), ('20150601', '20150630'),
             ('20150701', '20150731'), ('20150801', '20150831')]

articles_out = []
for date_tuple in date_list:
    for i in range(0, 100):
        articles = api.search(q='election',
                              fq={'source': ['The New York Times']},
                              begin_date=date_tuple[0],
                              end_date=date_tuple[1],
                              page=str(i))
        articles_out += articles['response']['docs']

# json.dump writes text, so open in text mode rather than 'wb'
with open('nyt_results_election_2015.txt', 'w') as f:
    json.dump(articles_out, f)
#### total_articles.py written by Hieronimus Loho and Adam Chekroud
'''
This script pulls the total number of articles in the NYT per quarter per
year, just as a reference point. It is very similar to the benchmarker.py
script, just without the rotating keys and without any search terms.
'''
from nytimesarticle import articleAPI
import csv
import time
import os

# Insert API key
api_file = open(
    "/Users/hieronimusloho/Box Sync/Research Stuff/NYTLocal/NYT_keys.txt", "r")
api_list = api_file.read().split('\n')
api = articleAPI(api_list[6])

mental_health = []

# Start and end year
begin = 1900
end = 2016

# Returns the hits response of the metadata
def parse_articles(articles, year, quarter):
    shell = []
    dic = {}
    dic['year'] = year
    dic['hits'] = articles['response']['meta']['hits']
    dic['quarter'] = quarter
    shell.append(dic)
    return shell
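# NOTE: the quarterly loop that feeds parse_articles is not shown in this
# snippet. A sketch of what it plausibly looks like, given begin/end above
# and the (year, hits, quarter) record the parser builds; the quarter
# boundaries and sleep interval are assumptions:

quarters = [('0101', '0331'), ('0401', '0630'), ('0701', '0930'), ('1001', '1231')]
for year in range(begin, end + 1):
    for q, (q_start, q_end) in enumerate(quarters, start=1):
        articles = api.search(begin_date=str(year) + q_start,
                              end_date=str(year) + q_end)
        mental_health += parse_articles(articles, year, q)
        time.sleep(6)  # stay under the rate limit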
import json
import urllib, urllib2
from nytimesarticle import articleAPI

api = articleAPI('00cc3abde644a36b3e10a27189ae1a45%3A11%3A70234304')

def run_query(search_terms):
    # Specify the base
    root_url = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
    source = 'Web'

    # Specify how many results we wish to be returned per page.
    # Offset specifies where in the results list to start from.
    # With results_per_page = 10 and offset = 11, this would start from page 2.
    results_per_page = 10
    offset = 0

    # Wrap quotes around the query terms and URL-encode them.
    query = "'{0}'".format(search_terms)
    query = urllib.quote(query)

    # Construct the latter part of our request's URL,
    # setting the response format and other properties.
    search_url = "{0}q={1}&api-key=00cc3abde644a36b3e10a27189ae1a45%3A11%3A70234304".format(
        root_url, query)

    # Leftover from the Bing-search tutorial this was adapted from:
    # the username MUST be a blank string, and put in your API key!
    username = ''
    bing_api_key = 'kdiorLyLAkjJpJOhWxPWI55D9fan9H1bHXn6kxWxlJw'
from nytimesarticle import articleAPI api = articleAPI("b42bf8a015d7be77596925a04577efa5:9:72455510") def parse_articles(articles): ''' This function takes in a response to the NYT api and parses the articles into a list of dictionaries ''' news = [] for i in articles['response']['docs']: dic = {} ## dic['id'] = i['_id'] if i['abstract'] is not None: dic['abstract'] = i['abstract'].encode("utf8") dic['headline'] = i['headline']['main'].encode("utf8") ## dic['desk'] = i['news_desk'] dic['date'] = i['pub_date'][0:10] # cutting time of day. dic['section'] = i['section_name'] if i['snippet'] is not None: dic['snippet'] = i['snippet'].encode("utf8") ## dic['source'] = i['source'] ## dic['type'] = i['type_of_material'] ## dic['url'] = i['web_url'] ## dic['word_count'] = i['word_count'] # locations ## locations = [] ## for x in range(0,len(i['keywords'])): ## if 'glocations' in i['keywords'][x]['name']: ## locations.append(i['keywords'][x]['value']) ## dic['locations'] = locations # subject subjects = []
import sys
import json
import requests
import time
import logging
import traceback
import pprint
from nytimesarticle import articleAPI
from pygeocoder import Geocoder

# System arguments ####################################################
date = sys.argv[1]
debugging = False

# Variables ###########################################################
api = articleAPI("af0ead0b339871714bd8718ac007283b:11:73169680")
submitted = 0
duplicates = 0
coordCount = 0
multimediaCount = 0

log = logging.getLogger('nyt_worker')
log.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s :: %(message)s')
fileHandler = logging.FileHandler('logs/nyt_log.log')
fileHandler.setLevel(logging.INFO)
fileHandler.setFormatter(formatter)
log.addHandler(fileHandler)
#!/usr/bin/env python3
import os
import json
import urllib.request
import urllib.response
import urllib.error
import random
from nytimesarticle import articleAPI

api = articleAPI("93fc659744f5238f6f95d464865562b8:16:74068039")

def parse_articles():
    '''
    This function takes in a response to the NYT api and parses the articles
    into a list of dictionaries
    '''
    news = {"children": []}
    path = "C:\\Users\\Bin\\Desktop\\capstone\\json_data\\"
    d = {}
    date = input("Enter a year: ")
    top_node = {"section_name": "Articles", "size": 500, "children": []}
    file_write = date + ".json"
    temp_file = open(os.path.join(path, file_write), 'w')
    for i in range(0, 5):
        articles = api.search(
import unicodecsv as csv
from nytimesarticle import articleAPI

# api = articleAPI('f507f70d25c67d529f25cbd715cc5de9:5:67901820')
api = articleAPI('6f3db26878b18ab77f5cb78e82dd4496:3:67969482')

# articles = api.search(q='Obama',
#                       fq={'headline': 'Obama',
#                           'source': ['Reuters', 'AP', 'The New York Times']},
#                       begin_date=20111231)

def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses the articles
    into a list of dictionaries
    '''
    news = []
    for i in articles['response']['docs']:
        dic = {}
        # dic['id'] = i['_id']
        # if i['abstract'] is not None:
        #     dic['abstract'] = i['abstract'].encode("utf8")
        # dic['headline'] = i['headline']['main'].encode("utf8")
        if type(i['headline']) is dict:
            dic['headline'] = i['headline']['main'].encode("utf8")
        # dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10]  # cutting time of day
        # dic['section'] = i['section_name']
        # if i['snippet'] is not None:
        #     dic['snippet'] = i['snippet'].encode("utf8")
        # dic['source'] = i['source']
        # dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
from nytimesarticle import articleAPI from bs4 import BeautifulSoup import requests api = articleAPI("MqGpINkbKmy15GOA36p7DYm09JTXSAaO") f = open('nytarticlesday.txt', 'w') links = [] articles = api.search(q='cold', begin_date=20190101, page=100) #f=open('sports.txt','w') for i in range(0, 10): url = articles['response']['docs'][i]['web_url'] data = requests.get(url) soup = BeautifulSoup(data.content, 'html.parser') soup.prettify() for i in range((len(soup.find_all('p'))) - 3): f.write(soup.find_all('p')[i].get_text()) print(url) links.append(url) f.close() import os path = os.path.join(os.path.expanduser('~'), 'Desktop', 'file.txt') print(path)
from nytimesarticle import articleAPI
from time import time
from datetime import datetime
import pickle

####################################
# WARNING SPAGHETTI CODE KEEP AWAY #
#                                  #
#   srsface don't touch anything   #
####################################

api = articleAPI('2aeeb020218af0f30993f12ff451d821:10:62878084')

'''
page = 0
responseObject = api.search(
    q='*',
    fq={'source': ['Reuters', 'AP', 'The New York Times'], 'news_desk': ['travel']},
    begin_date=20140409,
    page=page)
articles = responseObject['response']['docs']
for article in articles:
    print article['headline']['main']
    print article['news_desk']
'''

def retrieveAll():
    page = 0
    while True:
        responseObject = api.search(
            q='NHL',
            fq={'source': ['Reuters', 'AP', 'The New York Times'],
                'news_desk': ['sports']},
            begin_date=20111231,
            page=page)
        articles = responseObject['response']['docs']
        for article in articles:
            print(article['headline']['main'])
            print("NEWS DESK: ", article['news_desk'])
            print("ABSTRACT: ", article['abstract'])
            print("SNIPPET: ", article['snippet'])
            if article['snippet'] != article['lead_paragraph']:
search_term = cl.classify(search_tweet)
if search_term == "food":
    search_term = "hunger"
elif search_term == "water":
    search_term = "clean water"
elif search_term == "hygiene":
    search_term = "health"
elif search_term == "technology":
    search_term = "access to technology"

## GET NEWS ARTICLE
in_file = open("ny_times_key.txt")
key = in_file.read()
in_file.close()
api = articleAPI(key)

developingCountries = [
    "Mali", "Egypt", "China", "Cuba", "South Africa", "Algeria", "Angola",
    "Bolivia", "Brunei", "Burma", "Chile", "Congo", "Ecuador",
import pandas as pd  # needed for the DataFrame below
import matplotlib.pyplot as plt
from nytimesarticle import articleAPI
from datetime import datetime, timedelta
from dateutil.parser import parse
from pprint import pprint
from pyrb import arange, format_axes, largefonts, save_pngs, open_figure
from time import sleep
from ipdb import set_trace
from tqdm import tqdm

# api key
# api = articleAPI(r'5d33d48934ef42f0a7b715cbf7f1007f')
# api = articleAPI(r'81e82e91402945198c0d0216c1b5181d')
# api = articleAPI(r'0ecfeab1d0ef4c25b78ecddf0cf23bee')
api = articleAPI(r'89616f68abff4f1691131533addc64d3')

# define article search parameters
kws = {'q': 'enron'}
time0 = parse('1999 Jan 1')
time1 = parse('2010 Feb 1')

# initialize df, scan over weekly segments from time0 to time1
df = pd.DataFrame()
time = arange(time0, time1, timedelta(days=7))
page_limit = 200
for a, b in tqdm(zip(time[:-1], time[1:]), total=time.size - 1, desc='running api'):
    # scan pages until no results returned
    page = 0
    while True: