Example #1
0
    def search(self):
        """Query the NYT article-search API for Covid articles within the
        configured date range.

        Returns up to 10 relevance-sorted "News" articles from the major
        wire sources.
        """
        client = NYTAPI("wbWOIDwmGPWGQALhXbfC3BDK3EMtFBMA")

        # Expand the date-only bounds to cover the full first and last days.
        begin = datetime.datetime.strptime(
            str(self.startDate) + " 00:00:00", '%Y-%m-%d %H:%M:%S')
        end = datetime.datetime.strptime(
            str(self.endDate) + " 23:59:59", '%Y-%m-%d %H:%M:%S')

        return client.article_search(
            query="Covid",
            results=10,
            dates={"begin": begin, "end": end},
            options={
                "sort": "relevance",
                "sources": [
                    "New York Times", "AP", "Reuters",
                    "International Herald Tribune"
                ],
                "type_of_material": ["News"]
            })
Example #2
0
 def search(self):
     """Fetch up to 10 relevance-sorted Covid "News" articles between the
     configured start and end dates, newest first, with each article's
     pub_date rewritten as e.g. "24-Jun-2020".
     """
     client = NYTAPI("wbWOIDwmGPWGQALhXbfC3BDK3EMtFBMA")
     fmt = '%Y-%m-%d %H:%M:%S'
     # Expand the date-only bounds to cover the full first and last days.
     begin = datetime.strptime(str(self.startDate) + " 00:00:00", fmt)
     end = datetime.strptime(str(self.endDate) + " 23:59:59", fmt)
     articles = client.article_search(
         query="Covid",
         results=10,
         dates={"begin": begin, "end": end},
         options={
             "sort": "relevance",
             "sources": [
                 "New York Times", "AP", "Reuters",
                 "International Herald Tribune"
             ],
             "type_of_material": ["News"]
         })
     # Newest first, keyed on the date portion of pub_date.
     ordered = sorted(articles,
                      key=lambda a: datetime.strptime(a['pub_date'][:10],
                                                      '%Y-%m-%d'),
                      reverse=True)
     # Reformat pub_date in place for display.
     for article in ordered:
         article['pub_date'] = datetime.strptime(
             article['pub_date'][:10], '%Y-%m-%d').strftime('%d-%b-%Y')
     return ordered
Example #3
0
    def test_parse_dates_disabled(self):
        """created_date stays a raw ISO-8601 string when parse_dates is off."""
        plain_client = NYTAPI(API_KEY)
        metadata = plain_client.article_metadata(
            "https://www.nytimes.com/live/2021/02/10/us/impeachment-trial/prosecutors-begin-arguments-against-trump-saying-he-became-the-inciter-in-chief-of-a-dangerous-insurrection"
        )
        self.assertEqual("2021-02-10T11:04:08-05:00",
                         metadata[0]["created_date"])
Example #4
0
 def test_empty_api_key(self):
     """Constructing a client without an API key must raise ValueError."""
     with self.assertRaises(ValueError):
         NYTAPI()
Example #5
0
 def setUp(self):
     # Shared client for each test; parse_dates=True makes the wrapper
     # return datetime/date objects instead of raw date strings.
     self.nyt = NYTAPI(API_KEY, parse_dates=True)
Example #6
0
class TestNewYorkTimes(unittest.TestCase):
    """Integration tests for the pynytimes ``NYTAPI`` wrapper.

    Every test talks to the live NYT developer API through the shared
    ``self.nyt`` client, so a valid ``API_KEY`` and network access are
    required for this suite to pass.
    """

    def setUp(self):
        # parse_dates=True makes the wrapper return datetime objects.
        self.nyt = NYTAPI(API_KEY, parse_dates=True)

    def tearDown(self):
        # Release the client's underlying HTTP session after each test.
        self.nyt.close()

    def test_empty_api_key(self):
        """Constructing a client without an API key raises ValueError."""
        with self.assertRaises(ValueError):
            NYTAPI()

    def test_top_stories(self):
        """top_stories returns dicts with parsed datetime fields."""
        top_stories = self.nyt.top_stories()
        self.assertIsInstance(top_stories, list)
        self.assertGreater(len(top_stories), 0)

        for top_story in top_stories:
            self.assertIsInstance(top_story, dict)
            self.assertIsInstance(top_story["created_date"], datetime.datetime)
            self.assertIsInstance(top_story["published_date"],
                                  datetime.datetime)

    def test_top_stories_section(self):
        """top_stories accepts a valid section name."""
        section = "world"
        top_stories_section = self.nyt.top_stories(section=section)
        self.assertIsInstance(top_stories_section, list)
        self.assertGreater(len(top_stories_section), 0)

        for top_story in top_stories_section:
            self.assertIsInstance(top_story, dict)

    def test_top_stories_wrong_section(self):
        """Invalid section values raise ValueError; wrong types TypeError."""
        with self.assertRaises(ValueError):
            self.nyt.top_stories("abcdfsda")

        with self.assertRaises(TypeError):
            self.nyt.top_stories(section=123)

    def test_most_viewed(self):
        """most_viewed returns dicts whose media field is a list."""
        most_viewed = self.nyt.most_viewed()
        self.assertIsInstance(most_viewed, list)
        self.assertGreater(len(most_viewed), 0)

        for most in most_viewed:
            self.assertIsInstance(most, dict)
            self.assertIsInstance(most["media"], list)

    def test_most_viewed_invalid_days(self):
        """days must be one of the allowed ints (1, 7, 30) and an int."""
        with self.assertRaises(ValueError):
            self.nyt.most_viewed(2)

        with self.assertRaises(TypeError):
            self.nyt.most_viewed(days="1")

    def test_most_shared(self):
        """most_shared parses published_date/updated into date objects."""
        most_shared = self.nyt.most_shared()
        self.assertIsInstance(most_shared, list)
        self.assertGreater(len(most_shared), 0)

        for most in most_shared:
            self.assertIsInstance(most, dict)
            self.assertIsInstance(most["published_date"], datetime.date)
            self.assertIsInstance(most["updated"], datetime.datetime)
            self.assertIsInstance(most["media"], list)

    def test_most_shared_invalid(self):
        """Invalid method/days values are rejected with the right errors."""
        with self.assertRaises(ValueError):
            self.nyt.most_shared(method="twitter")

        with self.assertRaises(ValueError):
            self.nyt.most_shared(days=2)

        with self.assertRaises(TypeError):
            self.nyt.most_shared(days="2")

    def test_book_reviews(self):
        """book_reviews filters results by the requested author."""
        author = "Barack Obama"
        book_reviews = self.nyt.book_reviews(author=author)
        self.assertIsInstance(book_reviews, list)
        self.assertGreater(len(book_reviews), 0)

        for book_review in book_reviews:
            self.assertIsInstance(book_review, dict)
            self.assertEqual(book_review["book_author"], author)

    def test_book_reviews_invalid(self):
        """Exactly one selector (author/isbn/title) must be given, valid."""
        with self.assertRaises(ValueError):
            self.nyt.book_reviews()

        with self.assertRaises(ValueError):
            self.nyt.book_reviews(isbn=213789, author="author")

        with self.assertRaises(ValueError):
            self.nyt.book_reviews(isbn=213789)

    def test_best_sellers_lists(self):
        """best_sellers_lists returns a non-empty list of list names."""
        best_sellers_lists = self.nyt.best_sellers_lists()
        self.assertIsInstance(best_sellers_lists, list)
        self.assertGreater(len(best_sellers_lists), 0)

    def test_best_seller_list(self):
        """A dated best-sellers query pins a known #1 ISBN."""
        best_seller_list = self.nyt.best_sellers_list(date=datetime.datetime(
            2019, 1, 1),
                                                      name="hardcover-fiction")
        self.assertIsInstance(best_seller_list, list)
        self.assertEqual(best_seller_list[0]["primary_isbn13"],
                         "9780385544153")

    def test_best_seller_list_invalid(self):
        """Unknown list names raise ValueError; non-datetime date TypeError."""
        with self.assertRaises(ValueError):
            self.nyt.best_sellers_list(name="not a name")

        with self.assertRaises(TypeError):
            self.nyt.best_sellers_list(date="123")

    def test_movie_reviews(self):
        """movie_reviews returns a non-empty list of dicts."""
        movie_reviews = self.nyt.movie_reviews()
        self.assertIsInstance(movie_reviews, list)
        self.assertGreater(len(movie_reviews), 0)

        for movie_review in movie_reviews:
            self.assertIsInstance(movie_review, dict)

    def test_movie_reviews_invalid(self):
        """keyword must be a string."""
        with self.assertRaises(TypeError):
            self.nyt.movie_reviews(keyword=123)

    def test_article_metadata(self):
        """article_metadata parses created_date into an aware datetime."""
        article_metadata = self.nyt.article_metadata(
            "https://www.nytimes.com/live/2021/02/10/us/impeachment-trial/prosecutors-begin-arguments-against-trump-saying-he-became-the-inciter-in-chief-of-a-dangerous-insurrection"
        )
        self.assertIsInstance(article_metadata, list)

        for article in article_metadata:
            self.assertIsInstance(article, dict)

        title = "Prosecutors argue that Trump ‘became the inciter in chief’ and retell riot with explicit video."
        # Expected timezone is UTC-5 expressed as -1 day + 68400 seconds.
        creation_datetime = datetime.datetime(
            2021,
            2,
            10,
            11,
            4,
            8,
            tzinfo=datetime.timezone(datetime.timedelta(days=-1,
                                                        seconds=68400)),
        )
        self.assertEqual(article_metadata[0]["title"], title)
        self.assertEqual(
            article_metadata[0]["created_date"],
            creation_datetime,
        )

    def test_article_metadata_invalid(self):
        """Missing, non-string, or non-URL arguments are rejected."""
        with self.assertRaises(TypeError):
            self.nyt.article_metadata()

        with self.assertRaises(TypeError):
            self.nyt.article_metadata(123)

        with self.assertRaises(ValueError):
            self.nyt.article_metadata("text")

    def test_archive_metadata(self):
        """This month's archive only contains articles from this month."""
        archive_metadata = self.nyt.archive_metadata(
            date=datetime.date.today())
        self.assertIsInstance(archive_metadata, list)
        self.assertGreater(len(archive_metadata), 0)

        for metadata in archive_metadata:
            self.assertIsInstance(metadata, dict)
            # Every pub_date must fall on or after the first of the month.
            self.assertGreaterEqual(
                metadata["pub_date"],
                datetime.datetime.now(tz=datetime.timezone.utc).replace(
                    day=1, hour=0, minute=0, second=0, microsecond=0),
            )

    def test_archive_metadata_invalid(self):
        """archive_metadata requires a datetime/date argument."""
        with self.assertRaises(TypeError):
            self.nyt.archive_metadata("string")

        with self.assertRaises(TypeError):
            self.nyt.archive_metadata(123)

    def test_article_search(self):
        """article_search honors the requested result count."""
        search = self.nyt.article_search("Joe Biden", results=80)
        self.assertIsInstance(search, list)
        self.assertEqual(80, len(search))
        for article in search:
            self.assertIsInstance(article, dict)

    def test_article_search_invalid(self):
        """Non-string queries and positional date args are rejected."""
        with self.assertRaises(TypeError):
            self.nyt.article_search(123)

        with self.assertRaises(TypeError):
            self.nyt.article_search("query", datetime.date.today())

    def test_section_list(self):
        """section_list returns a non-empty list of dicts."""
        section_list = self.nyt.section_list()
        self.assertIsInstance(section_list, list)
        self.assertGreater(len(section_list), 0)

        for section in section_list:
            self.assertIsInstance(section, dict)

    def test_latest_articles(self):
        """latest_articles returns a list of dicts."""
        latest_articles = self.nyt.latest_articles()
        self.assertIsInstance(latest_articles, list)

        for article in latest_articles:
            self.assertIsInstance(article, dict)

    def test_latest_articles_invalid(self):
        """source must be a string."""
        with self.assertRaises(TypeError):
            self.nyt.latest_articles(source=123)

    def test_tag_query(self):
        """tag_query honors max_results."""
        tags = self.nyt.tag_query("Obama", max_results=2)
        self.assertIsInstance(tags, list)
        # assertEqual, not assertIs: identity of small ints is a CPython
        # implementation detail — equality is what this test means.
        self.assertEqual(2, len(tags))

    def test_tag_query_invalid(self):
        """Non-string queries and non-int max_results are rejected."""
        with self.assertRaises(TypeError):
            self.nyt.tag_query(123)

        with self.assertRaises(TypeError):
            self.nyt.tag_query("Obama", max_results="2")

    def test_parse_dates_disabled(self):
        """created_date stays a raw ISO-8601 string when parse_dates is off."""
        local_nyt = NYTAPI(API_KEY)
        data = local_nyt.article_metadata(
            "https://www.nytimes.com/live/2021/02/10/us/impeachment-trial/prosecutors-begin-arguments-against-trump-saying-he-became-the-inciter-in-chief-of-a-dangerous-insurrection"
        )

        self.assertEqual(data[0]["created_date"], "2021-02-10T11:04:08-05:00")
Example #7
0
from pynytimes import NYTAPI

nyt = NYTAPI("Type_Your_Key")

reviews = nyt.movie_reviews(keyword="Batman")

# Fields printed for each review, one tab-separated line per review.
FIELDS = ['display_title', 'mpaa_rating', 'headline', 'summary_short']
for review in reviews:
    # Each field is followed by a tab (trailing tab kept, matching the
    # original accumulate-and-print output exactly).
    print("".join(review[field] + "\t" for field in FIELDS))

# print(reviews)
Example #8
0
from pynytimes import NYTAPI

# Make sure to set parse dates to True so that the dates
# are parsed into datetime.datetime or datetime.date objects
nyt = NYTAPI(
    key="Your API Key",  # Get your API Key at https://developer.nytimes.com
    parse_dates=True,
)

# Get most shared articles of today
most_shared = nyt.most_shared()

# Optionally you can also define the timeframe in days.
# Valid options are 1, 7, 30
most_shared_last_week = nyt.most_shared(days=7)
most_shared_last_month = nyt.most_shared(days=30)

# You can also define the method of sharing.
# Options are: email (default) or facebook.
most_shared_email = nyt.most_shared(method="email")
most_shared_facebook = nyt.most_shared(method="facebook")

# These options can also be mixed and matched
# So the most shared articles of last month on facebook are
most_shared_last_month_facebook = nyt.most_shared(days=30, method="facebook")
Example #9
0
key = 'EE4r1tU8dgaQej94KTnlJxWPglKKaz4e'
secret = 'AbqACp2UngzDCOWu'

import requests
import json

# Interpolate the API key into the query string.  The previous literal
# "...api-key={key:AbqACp2UngzDCOWu}" was sent verbatim (never substituted),
# so the raw request could not authenticate.
url = f'https://api.nytimes.com/svc/archive/v1/2020/06.json?api-key={key}'

r = requests.get(url)
json_data = r.json()
from pynytimes import NYTAPI

nyt = NYTAPI("EE4r1tU8dgaQej94KTnlJxWPglKKaz4e")

import datetime

years = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

# Collect (headline, pub_date, source, 0) tuples for June of each year.
for YEAR in years:

    data = nyt.archive_metadata(date=datetime.datetime(YEAR, 6, 1))

    headlines_nyt = []
    # Cap at 1001 entries, but never run past the end of the month's data
    # (the previous hard-coded range(1001) raised IndexError for months
    # with fewer than 1001 articles).
    for i in range(min(len(data), 1001)):

        print(i)
        headlines_nyt.append(
            (data[i]['headline']['main'], data[i]['pub_date'], 'NY_TIMES', 0))

    import pandas as pd
    return story


@st.cache(suppress_st_warning=True)
def summarizeArticle(toSummarize, minLength, maxLength):
    """Summarize *toSummarize* with the module-level HuggingFace pipeline,
    bounded by the given min/max summary lengths."""
    result = summarizer(toSummarize,
                        min_length=minLength,
                        max_length=maxLength)
    return result[0]["summary_text"]


# NY Times API

# Fail fast if the API key is not present in the environment.
NYTimesAPIkey = environ.get("NYTimesAPIkey")
if NYTimesAPIkey is None:
    raise KeyError("'NYTimesAPIkey' not an environment variable name.")

nyt = NYTAPI(NYTimesAPIkey)

# Time the (slow) summarizer initialization for later display.
t0 = perf_counter()
summarizer = initializeSummarizer()
t1 = perf_counter()
Δt01 = t1 - t0

# Now for the Streamlit interface:

st.sidebar.title("About")

st.sidebar.info(
    "This streamlit app uses the default HuggingFace summarization "
    "pipeline (Facebook's BART model) to summarize text from selected "
    "NY Times articles.\n\n"
    "The actual summarization time takes a few seconds, although"
Example #11
0
from PopulateDB import addArticlesDB
from pynytimes import NYTAPI
from datetime import datetime, date, timezone
from pymongo import MongoClient, errors, ASCENDING, DESCENDING
from bson import regex
import sys
import json
import requests
import random
import string

# Module-level NYT client (hard-coded API key) used by the functions below.
nyt = NYTAPI("qsPCmSV09wV4AbCCaJmXFPxo3nCwGtbU")
LIMIT = 10  # output is limited to 10 documents


def start():
    """Connect to the local MongoDB instance and bind module-level handles.

    Sets the globals ``myclient`` (the connection), ``db`` (the "nyt"
    database) and ``article`` (the "article" collection) used elsewhere
    in this module.
    """
    global myclient
    global db
    global article
    try:
        # NOTE(review): 27018 is not Mongo's default port (27017) —
        # confirm this matches the deployment.
        myclient = MongoClient("mongodb://localhost:27018/",
                               w=1,
                               readPreference="primaryPreferred")
        print(myclient)
        print("Connection successful!")
    except errors.ConnectionFailure as e:
        print("Connection failed!")
        print(e)

    db = myclient["nyt"]  # connect to database
    article = db["article"]  # connect to collection
from pynytimes import NYTAPI
from datetime import datetime
import re
from textblob import TextBlob
import numpy
import pandas as pd

nyt = NYTAPI("WxQXsVSaIIlTgEfG0VnrlP7JhOVYYL0j")

# Query and date window for the article search below.
search="US Embassy move to Jerusalem"
start_date = datetime(2015, 1, 1)
end_date = datetime(2019, 12, 31)

# Fetch up to 50 relevance-sorted NYT articles in the window.
articles = nyt.article_search(
    query = search,
    results = 50,
    dates = {
        "begin": start_date,
        "end": end_date
    },
    options = {
        "sort": "relevance",
        "sources": [
            "New York Times",
        ],

    }
)

def clean(text):
    """Strip @mentions, URLs and other non-alphanumeric characters from
    *text*, collapsing the result to single-space-separated words.

    Bug fix: the original pattern had literal spaces around the ``|``
    alternations (``... | ...``), so punctuation was only removed when
    followed by a space and URLs only when preceded by one.  The spaces
    belong outside the alternatives, so they have been removed.
    """
    pattern = r"(@[A-Za-z0-9]+)|(\w+:\/\/\S+)|([^0-9A-Za-z \t])"
    return ' '.join(re.sub(pattern, " ", text).split())
Example #13
0
from pynytimes import NYTAPI

# Make sure to turn parse dates on so that the dates
# are parsed into datetime.datetime or datetime.date objects
nyt = NYTAPI("API Key", parse_dates=True)

# Get the most viewed articles of today
most_viewed = nyt.most_viewed()

# Optionally you can also define the time period (in days) of the most
# viewed articles
most_viewed_last_week = nyt.most_viewed(days=7)  # Valid options are 1, 7 or 30
most_viewed_last_month = nyt.most_viewed(days=30)
Example #14
0
from .basisFuncs import *
from .part3funcs import normalize_headline

import datetime
from pynytimes import NYTAPI
import time as time
import pytz
# Best-effort client construction: the rest of the module can run without
# the NYT API.  Narrowed from a bare ``except`` so that KeyboardInterrupt
# and SystemExit still propagate instead of being swallowed.
try:
    nyt = NYTAPI(os.getenv("nytimesPythonApiKey"))
except Exception:
    print("no nyt api key oh well")


def populateNewsRatioColumn():
    """Recompute the ``newsRatio`` column for every pure-Pres tweet.

    Resets the column to 0, then scores each non-empty tweet with
    determineWhetherTweetWasInfluencedByNewsRatio.  NOTE(review): the
    collected ``tuples`` look destined for a bulk update via
    ``updateFormula`` — confirm against the rest of the function body.
    """

    # Zero out the column before rescoring.
    mycursor.execute("update " + mainTable + " set newsRatio = 0")
    mydb.commit()
    updateFormula = "UPDATE " + mainTable + " SET newsRatio = %s WHERE id = %s"

    allTweets = getTweetsFromDB(
        returnParams=["favCount", "cleanedText", "id", "publishTime"],
        purePres=True)

    # Skip tweets whose cleaned text is effectively empty.
    tweets = [t for t in allTweets if len(t[1]) > 1]
    ratios = []
    tuples = []  # (ratio, tweet id) pairs for the SQL update
    favCounts = [t[0] for t in tweets]
    for i, t in enumerate(tweets):
        ratio = determineWhetherTweetWasInfluencedByNewsRatio(t)
        tuples.append((str(ratio), t[2]))
        ratios.append(ratio)
Example #15
0
from pynytimes import NYTAPI
nyt = NYTAPI("YOUR_API_KEY")
articles = nyt.article_search(
    "https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=indigo&api-key=YOUR_API_KEY"
)
#print(articles)
# One single-key dict per article (shape kept for compatibility with the
# original script), then flatten the values into a plain URL list.
news = [{'url': article['web_url']} for article in articles]
#print(news)
urls = [url for entry in news for url in entry.values()]
for url in urls:
    print(url)

# article :indigo
"""
https://www.nytimes.com/1889/07/14/archives/no-robbery.html
https://www.nytimes.com/1858/10/29/archives/central-america-crops-and-contracts-in-costa-ricathe-proposed.html
https://www.nytimes.com/1898/08/20/archives/reviews-of-books-dialect-tales-justly-praised.html
https://www.nytimes.com/1890/10/19/archives/anne-bissell.html
https://www.nytimes.com/1859/10/27/archives/european-news-the-jason-at-st-johns-further-by-the-persia-the-great.html
https://www.nytimes.com/1859/06/27/archives/from-the-pacific-coast-nicaragua-rejects-the-american-ultimatum.html
"""
Example #16
0
from pynytimes import NYTAPI

# Make sure to turn parse dates on so that the dates
# are parsed into datetime.datetime or datetime.date objects
nyt = NYTAPI("API Key", parse_dates=True)

# Get top stories (all sections)
top_stories = nyt.top_stories()

# Optionally you can also define a section
# Valid options for sections can be found in README
top_stories_science = nyt.top_stories(section="science")

@st.cache(suppress_st_warning=True)
def summarizeArticle(toSummarize, minLength, maxLength):
    """Return the summary text for *toSummarize* produced by the global
    ``summarizer`` pipeline, bounded by the given lengths."""
    summaries = summarizer(toSummarize,
                           min_length=minLength,
                           max_length=maxLength)
    return summaries[0]["summary_text"]


# NY Times API

# Previous environment-variable lookup, kept for reference; the key now
# comes from Streamlit's secrets store instead.
# NYTimesAPIkey = environ.get("NYTimesAPIkey")
# if NYTimesAPIkey is None:
#     raise KeyError("'NYTimesAPIkey' not an environment variable name.")

# nyt = NYTAPI(NYTimesAPIkey)
nyt = NYTAPI(st.secrets["NYTimesAPIkey"])

# Time the (slow) summarizer initialization and report it.
t0 = perf_counter()
summarizer = initializeSummarizer()
t1 = perf_counter()
Δt01 = t1 - t0
print(f"Δt to initialize summarizer: {Δt01:5.2f}s", flush=True)

# Now for the Streamlit interface:

st.sidebar.title("About")

st.sidebar.info(
    "This streamlit app uses the default HuggingFace summarization "
    "pipeline (Facebook's BART model) to summarize text from selected "
    "NY Times articles.\n\n"
Example #18
0
from pynytimes import NYTAPI

import datetime
import random
import time
import os

# Stagger start-up by up to a minute — presumably to avoid rate-limit
# collisions between concurrently scheduled runs (TODO confirm).
random_wait = random.randint(0, 60)
time.sleep(random_wait)

begin = datetime.datetime.now()

API_KEY = os.environ["NewYorkTimesAPIKey"]
nyt = NYTAPI(API_KEY)

# Smoke-test every endpoint of the wrapper, sleeping between batches to
# stay under the NYT API rate limits.
nyt.top_stories(section="science")
nyt.most_viewed(days=30)
time.sleep(5)
nyt.most_shared(days=30, method="email")
nyt.book_reviews(author="Michelle Obama")
time.sleep(5)
nyt.best_sellers_lists()
nyt.best_sellers_list(date=datetime.datetime(2019, 1, 1),
                      name="hardcover-fiction")
time.sleep(5)
nyt.movie_reviews(keyword="FBI", options={"order": "by-opening-date"})
nyt.article_metadata(
    url=
    "https://www.nytimes.com/2019/10/20/world/middleeast/erdogan-turkey-nuclear-weapons-trump.html"
)
time.sleep(5)
import os
from pynytimes import NYTAPI
import pandas as pd
import datetime
import time
from pprint import pprint

# import api key from the environment
key = os.getenv("api-key")

# set up wrapper for API calls
nyt = NYTAPI(key)

# create list of dates for each month from 2015 - 2017
start_date = "2015-01-01"
end_date = "2017-12-01"

# "MS" = month-start frequency, one entry per month.
date_list = pd.date_range(start_date, end_date, freq="MS")

# convert to python datetime for API calls
dates = list(date_list.to_pydatetime())

# iterate over list of dates, append to list, convert to a dataframe
article_list = []

for date in dates:

    print(f"Processing Date: {date}")

    # NOTE(review): `results` is presumably appended to `article_list`
    # further down — confirm against the full script.
    results = nyt.archive_metadata(date = date)
Example #20
0
def get_news():
    """Return (headline, mood) for a randomly chosen NYT top story.

    Bug fix: the original ``top_stories[random.randint(1, 11)]`` skipped
    index 0 and raised IndexError whenever fewer than 12 stories were
    returned; ``random.choice`` covers the whole list safely.
    """
    key = "aJDq9vqaMll0JjrRpRDRWwQnwQwPKtzZ"
    nyt = NYTAPI(key)
    top_stories = nyt.top_stories()
    return random.choice(top_stories)["title"], "excited"
Example #21
0
from cli import parse_args
from pynytimes import NYTAPI
# Parse CLI arguments: scrape mode, topic URL, article count, driver path.
mode, topic_url, articles_to_scrape, driver_path = parse_args()

# For testing: If True only one article is scraped
DEMO = False
DEMO_TOPIC = 'https://www.nytimes.com/section/world/africa'
DEMO_ARTICLE_SCRAP = 1

# Database details
HOST = 'localhost'
DATABASE = 'nytimes'

# your variables
USER = '******'
PASSWORD = '******'

# Settings for the logger
LOG = 'log_file.log'
NYTname = 'scrapper'

# API key for the NYT API (https=False forces plain-HTTP requests)
nyt = NYTAPI("qeOAmrE6yGzowmzGiIpoK0ZBHOnyJ8BG", https=False)
def getAPI():
    """Build a date-parsing NYTAPI client from the ``NYT`` env var."""
    api_key = os.getenv("NYT")
    # str() preserves the original behavior even when the variable is unset
    # (the literal string "None" is passed through).
    return NYTAPI(str(api_key), parse_dates=True)
from datetime import date, datetime
from pynytimes import NYTAPI

# Make sure to set parse dates to True so that the dates
# are parsed into datetime.datetime or datetime.date objects
nyt = NYTAPI(
    key="Your API Key",  # Get your API Key at https://developer.nytimes.com
    parse_dates=True,
)

# Search articles about President Biden
biden = nyt.article_search("biden")

# You can optionally define the dates between which you want the articles to be.
# NOTE: pynytimes expects the keys "begin"/"end" in the dates dict — the
# previous "start" key was silently ignored, so no lower bound was applied.
biden_january = nyt.article_search(query="biden",
                                   dates={
                                       "begin": date(2021, 1, 1),
                                       "end": date(2021, 1, 31)
                                   })

# Optionally you can also define
biden = nyt.article_search("biden")