Example #1
 def url_path(self):
     if os.path.getsize(self.file_path) == 0:
         with open(self.file_path, "w+", encoding="utf-8") as uf:
             popular_urls_list = newspaper.popular_urls()
             for popular_url in popular_urls_list:
                 uf.write(popular_url)
                 uf.write("\n")
         print("popular url list has been written")
Example #2
 def get_trending():
     url = ('https://newsapi.org/v2/top-headlines?'
            'country=us&'
            'apiKey=bcaaf0d008994d818672b2c1141be98b')
     response = requests.get(url)
     popular_articles = {
         'articles': json.loads(response.text).get('articles')
     }
     popular_url_topics = {
         'popular_urls': newspaper.popular_urls(),
         'hot_topics': newspaper.hot()
     }
     return {**popular_articles, **popular_url_topics}
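A minimal usage sketch for get_trending() (a hypothetical call, not part of the original snippet); the key names simply echo the dict built above:

trending = get_trending()
print(len(trending['articles']))      # top US headlines fetched from NewsAPI
print(trending['popular_urls'][:5])   # a few popular news source URLs
print(trending['hot_topics'])         # trending search terms from newspaper.hot()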
Example #3
def trending():
    trending_terms = newspaper.hot()
    trending_urls = newspaper.popular_urls()[:10]
    return trending_terms, trending_urls
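A short, hypothetical call to trending(); the unpacking order follows the return statement above:

terms, urls = trending()
print(terms[:3])   # first few trending terms from newspaper.hot()
print(urls)        # the ten popular source URLs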
Example #4
 def test_popular_urls(self):
     """
     just make sure this runs
     """
     newspaper.popular_urls()
Example #5
 def test_popular_urls(self):
     """
     just make sure this runs
     """
     newspaper.popular_urls()
Example #6
def get_popular():
    global pop
    pop = newspaper.popular_urls()
Example #7
def get_popular_urls(self, *args, **kwargs):
    return newspaper.popular_urls()
Example #8
 def test_popular_urls(self):
     """Just make sure this method runs
     """
     newspaper.popular_urls()
Example #9
    def newspaper_stories(words,
                          search_type='or',
                          search_level=0,
                          urls=None,
                          display=True,
                          memorize=False,
                          language='en'):
        config = newspaper.Config()
        config.memoize_articles = memorize
        config.language = language
        config.fetch_images = False
        config.request_timeout = 20
        config.MIN_WORD_COUNT = 300
        config.MIN_SENT_COUNT = 10
        if urls is None or urls == 'top_news':
            news_urls = {
                'huffington': 'http://huffingtonpost.com',
                'reuters': 'http://www.reuters.com',
                'cbs-news': 'http://www.cbsnews.com',
                'usa-today': 'http://usatoday.com',
                'cnn': 'http://cnn.com',
                'npr': 'http://www.npr.org',
                'abc-news': 'http://abcnews.com',
                'us-news': 'http://www.usnews.com',
                'msn': 'http://msn.com',
                'pbs': 'http://www.pbs.org',
                'nbc-news': 'http://www.nbcnews.com',
                'msnbc': 'http://www.msnbc.com',
                'fox': 'http://www.foxnews.com'
            }
        elif urls == 'all_us_news':
            news_urls = {
                'abc-news': 'https://abcnews.go.com',
                'al-jazeera-english': 'http://www.aljazeera.com',
                'ars-technica': 'http://arstechnica.com',
                'associated-press': 'https://apnews.com/',
                'axios': 'https://www.axios.com',
                'bleacher-report': 'http://www.bleacherreport.com',
                'bloomberg': 'http://www.bloomberg.com',
                'breitbart-news': 'http://www.breitbart.com',
                'business-insider': 'http://www.businessinsider.com',
                'buzzfeed': 'https://www.buzzfeed.com',
                'cbs-news': 'http://www.cbsnews.com',
                'cnbc': 'http://www.cnbc.com',
                'cnn': 'http://us.cnn.com',
                'crypto-coins-news': 'https://www.ccn.com',
                'engadget': 'https://www.engadget.com',
                'entertainment-weekly': 'http://www.ew.com',
                'espn': 'http://espn.go.com',
                'espn-cric-info': 'http://www.espncricinfo.com/',
                'fortune': 'http://fortune.com',
                'fox-news': 'http://www.foxnews.com',
                'fox-sports': 'http://www.foxsports.com',
                'google-news': 'https://news.google.com',
                'hacker-news': 'https://news.ycombinator.com',
                'ign': 'http://www.ign.com',
                'mashable': 'http://mashable.com',
                'medical-news-today': 'http://www.medicalnewstoday.com',
                'msnbc': 'http://www.msnbc.com',
                'mtv-news': 'http://www.mtv.com/news',
                'national-geographic': 'http://news.nationalgeographic.com',
                'national-review': 'https://www.nationalreview.com/',
                'nbc-news': 'http://www.nbcnews.com',
                'new-scientist': 'https://www.newscientist.com/section/news',
                'newsweek': 'http://www.newsweek.com',
                'new-york-magazine': 'http://nymag.com',
                'next-big-future': 'https://www.nextbigfuture.com',
                'nfl-news': 'http://www.nfl.com/news',
                'nhl-news': 'https://www.nhl.com/news',
                'politico': 'https://www.politico.com',
                'polygon': 'http://www.polygon.com',
                'recode': 'http://www.recode.net',
                'reddit-r-all': 'https://www.reddit.com/r/all',
                'reuters': 'http://www.reuters.com',
                'techcrunch': 'https://techcrunch.com',
                'techradar': 'http://www.techradar.com',
                'american-conservative':
                'http://www.theamericanconservative.com/',
                'hill': 'http://thehill.com',
                'huffington-post': 'http://www.huffingtonpost.com',
                'next-web': 'http://thenextweb.com',
                'verge': 'http://www.theverge.com',
                'wall-street-journal': 'http://www.wsj.com',
                'washington-post': 'https://www.washingtonpost.com',
                'washington-times': 'https://www.washingtontimes.com/',
                'time': 'http://time.com',
                'usa-today': 'http://www.usatoday.com/news',
                'vice-news': 'https://news.vice.com',
                'wired': 'https://www.wired.com'
            }
        elif urls == "texas_universities":
            news_urls = {
                'A&M': 'http://www.tamu.edu',
                'A&M-Commerce': 'http://www.tamuc.edu',
                'A&M-Corpus': 'http://www.tamucc.edu',
                'A&M-Kingsville': 'http://www.tamuk.edu',
                'A&M-Galveston': 'http://www.tamug.edu',
                'A&M-PrairieView': 'http://www.pvamu.edu',
                'A&M-International': 'http://www.tamiu.edu',
                'A&M-WestTexas': 'http://www.wtamu.edu',
                'Baylor': 'http://www.baylor.edu',
                'Rice': 'http://www.rice.edu',
                'SFAustin': 'http://www.sfasu.edu',
                'SMU': 'http://www.smu.edu',
                'SulRoss': 'http://www.sulross.edu',
                'TexasState': 'http://www.txstate.edu',
                'Texas_Tech': 'http://www.ttu.edu',
                'UDallas': 'http://www.udallas.edu',
                'UHouston': 'http://www.uh.edu',
                'UTexas': 'http://www.utexas.edu',
                'UT_Dallas': 'http://www.utdallas.edu',
                'UT_ElPaso': 'http://www.utep.edu',
                'UT_Houston': 'http://www.uth.edu',
                'UT_NorthTexas': 'http://www.unt.edu',
                'UT_SanAntonio': 'http://www.utsa.edu'
            }
        elif urls == 'popular':
            news_urls = {}
            agency_urls = newspaper.popular_urls()
            for val in agency_urls:
                url = val.replace("http://", "")
                url = url.replace("www.", "")
                url = url.replace("blog.", "")
                url = url.replace("blogs.", "")
                url = url.replace(".com", "")
                url = url.replace(".net", "")
                url = url.replace(".au", "")
                url = url.replace(".org", "")
                url = url.replace(".co.uk", "")
                url = url.replace("the", "")
                url = url.replace(".", "-")
                url = url.replace('usa', 'usa-')
                if url == 'berkeley-edu':
                    continue
                if url == 'beta-na-leagueoflegends':
                    continue
                if url == 'bottomline-as-ucsb-edu':
                    continue
                news_urls[url] = val
        else:
            news_urls = urls

        print("\nSearch Level {:<d}:".format(search_level), end="")
        if search_level == 0:
            print(" Screening URLs for search words")
            print("   URLs must contain one or more of:", end="")
        else:
            print(" No URL Screening")
            print("   Deep Search for Articles containing: ", end="")
        i = 0
        for word in words:
            i += 1
            if i < len(words):
                if search_type == 'or':
                    print(word + " or ", end="")
                else:
                    print(word + " & ", end="")
            else:
                print(word)

        df_articles = pd.DataFrame(columns=[
            'agency', 'url', 'length', 'keywords', 'title', 'summary', 'text'
        ])
        n_articles = {}
        today = str(date.today())
        for agency, url in news_urls.items():
            paper = newspaper.build(url, config=config)
            if display:
                print("\n{:>6d} Articles available from {:<s} on {:<10s}:".
                      format(paper.size(), agency.upper(), today))
            article_collection = []
            for article in paper.articles:
                url_lower = article.url.lower()
                # Exclude articles that are in a language other than English
                # or that consist mostly of video or pictures.
                # search_level 0 only downloads articles with at least
                # one of the key words in the URL.
                # search_level 1 downloads all articles that appear to be
                # in English and are not mainly photos or videos.
                # With either search level, if an article is downloaded
                # it is scanned to see if it contains the search words.
                # It is also compared to other articles to verify that
                # it is not a duplicate of another article.

                # Special Filters for some Agencies
                if agency == 'cbs-news':
                    if url_lower.find('.com') >= 0:
                        # secure-fly are duplicates of http
                        if article.url.find('secure-fly') >= 0:
                            continue
                if agency == 'usa-today':
                    if url_lower.find('tunein.com') >= 0:
                        continue
                if agency == 'huffington':
                    # Ignore huffington if it's not .com
                    if url_lower.find('.com') < 0:
                        continue

                # Filter Articles that are primarily video, film or not en
                if url_lower.find('.video/')   >=0 or \
                   url_lower.find('/video')    >=0 or \
                   url_lower.find('/picture')  >=0 or \
                   url_lower.find('.pictures/')>=0 or \
                   url_lower.find('/photo')    >=0 or \
                   url_lower.find('.photos/')  >=0 or \
                   url_lower.find('espanol')   >=0 or \
                   url_lower.find('.mx/' )     >=0 or \
                   url_lower.find('/mx.' )     >=0 or \
                   url_lower.find('.fr/' )     >=0 or \
                   url_lower.find('/fr.' )     >=0 or \
                   url_lower.find('.de/' )     >=0 or \
                   url_lower.find('/de.' )     >=0 or \
                   url_lower.find('.it/' )     >=0 or \
                   url_lower.find('/it.' )     >=0 or \
                   url_lower.find('.gr/' )     >=0 or \
                   url_lower.find('/gr.' )     >=0 or \
                   url_lower.find('.se/' )     >=0 or \
                   url_lower.find('/se.' )     >=0 or \
                   url_lower.find('.es/' )     >=0 or \
                   url_lower.find('/es.' )     >=0 or \
                   url_lower.find('?button')   >=0 or \
                   url_lower.find('calendar.') >=0 or \
                   url_lower.find('calendar/') >=0 or \
                   url_lower.find('/event/')   >=0 or \
                   url_lower.find('engr.utexas') >=0 or \
                   url_lower.find('sites.smu.')  >=0:
                    continue

                # Filter if search_level == 0, URL quick search
                if search_level == 0:
                    # Verify url contains at least one of the key words
                    found_it = False
                    for word in words:
                        j = url_lower.find(word)
                        if j >= 0:
                            found_it = True
                            break
                    if found_it:
                        # Article contains words and passes filters
                        # Save this article for full review
                        article_collection.append(article.url)
                else:
                    #  No URL screening, Save for full review
                    article_collection.append(article.url)
            n_to_review = len(article_collection)
            if display:
                print("{:>6d} Selected for download".format(n_to_review))

            for article_url in article_collection:
                article = Article(article_url, config=config)
                try:
                    article.download()
                except:
                    if display:
                        print("Cannot download:", article_url[0:79])
                    continue
                n = 0
                # Limit download failures
                stop_sec = 1  # Initial max wait time in seconds
                while n < 2:
                    try:
                        article.parse()
                        n = 99
                    except:
                        n += 1
                        # Initiate download again before new parse attempt
                        article.download()
                        # Busy-wait up to stop_sec seconds to let the download finish
                        t0 = time()
                        tlapse = 0
                        while tlapse < stop_sec:
                            tlapse = time() - t0
                        # Lengthen the wait by one second for the next attempt
                        stop_sec = stop_sec + 1
                if n != 99:
                    if display:
                        print("Cannot download:", article_url[0:79])
                    n_to_review -= 1
                    continue
                article.nlp()
                keywords = article.keywords
                title = article.title
                summary = article.summary
                text = article.text
                text_lower_case = text.lower()
                if search_type == 'or':
                    found_it = False
                    # Verify the url contains at least one of the key words
                    for word in words:
                        j = text_lower_case.find(word)
                        if j >= 0:
                            found_it = True
                            break
                else:
                    # search type 'and'
                    found_it = True
                    for word in words:
                        j = text_lower_case.find(word)
                        if j < 0:
                            found_it = False
                            break
                if found_it:
                    # Article contains words and passes filters
                    # Save this article for later full review
                    length = len(text)
                    df_story = pd.DataFrame([[
                        agency, article_url, length, keywords, title, summary,
                        text
                    ]],
                                            columns=[
                                                'agency', 'url', 'length',
                                                'keywords', 'title', 'summary',
                                                'text'
                                            ])
                    # Check whether this story is already in df_articles
                    if df_articles.shape[0] == 0:
                        df_articles = pd.concat([df_articles, df_story],
                                                ignore_index=True)
                    else:
                        # Verify this story is not already in df_articles
                        same_story = False
                        for i in range(df_articles.shape[0]):
                            if text == df_articles['text'].iloc[i]:
                                same_story = True
                                n_to_review -= 1
                                break
                        if not same_story:
                            df_articles = pd.concat([df_articles, df_story],
                                                    ignore_index=True)
                else:
                    n_to_review -= 1

                print("=", end='')
            n_articles[agency] = [n_to_review, len(article_collection)]
        if display:
            print("\n\nArticles Selected by Agency:")
            for agency in news_urls:
                ratio = str(n_articles[agency][0]) + "/" + \
                        str(n_articles[agency][1])
                print("{:>10s} Articles from {:<s}".format(
                    ratio, agency.upper()))
            print("\nArticles Collected on " + today + ":",
                  df_articles.shape[0], 'from',
                  df_articles['agency'].nunique(), "Agencies.")
            print("\nSize    Agency    Title")
            print("*{:->78s}*".format("-"))
            for i in range(df_articles.shape[0]):
                k = len(df_articles['title'].iloc[i])
                if k > 63:
                    for j in range(25):
                        k = 63 - j
                        if df_articles['title'].iloc[i][k] == " ":
                            break

                    print("{:>5d} {:<10s} {:<63s}".format(
                        df_articles['length'].iloc[i],
                        df_articles['agency'].iloc[i],
                        df_articles['title'].iloc[i][0:k]))
                    if len(df_articles['title'].iloc[i]) > 63:
                        print("                {:<60s}".format(
                            df_articles['title'].iloc[i][k:120]))
                else:
                    print("{:>5d} {:<10s} {:<s}".format(
                        df_articles['length'].iloc[i],
                        df_articles['agency'].iloc[i],
                        df_articles['title'].iloc[i]))
                print("")
            print("*{:->78s}*".format("-"))
        return df_articles
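A hypothetical invocation of newspaper_stories() as defined above, assuming newspaper, pandas, Article, date and time are already imported as the function expects; the search words 'economy' and 'inflation' are only illustrative:

df = newspaper_stories(['economy', 'inflation'],
                       search_type='or',
                       search_level=0,
                       urls='top_news',
                       display=True)
print(df[['agency', 'length', 'title']].head())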
Example #10
import boto.dynamodb
import newspaper

# Note you will need to enter your own private keys from AWS
conn = boto.dynamodb.connect_to_region('us-west-2',
                                       aws_access_key_id="enter key",
                                       aws_secret_access_key="enter secret")

# To create a new table, we have to set up the schema
# The schema requires a primary hash key and allows a secondary range key
my_schema = conn.create_schema(hash_key_name='id', hash_key_proto_value=int)

# Create the table by providing a name, schema and the computing power required by AWS
table = conn.create_table(name="newspaper",
                          schema=my_schema,
                          read_units=10,
                          write_units=5)

# To fill our table, we are going to parse through websites that publish news
# The newspaper package happens to contain a list of over 200 popular news websites
sources = newspaper.popular_urls()

hashkey = 1  # primary key integer to be used for DynamoDB table

for source in sources:
    try:
        collect = newspaper.build(
            source)  # Build method returns an object with each sub-site url
    except Exception:
        continue  # If the Build method fails, move on to the next source

    # Store articles as dict type in a list to iterate through for upload to DynamoDB
    collection = []
    for article in collect.articles:
Example #11
 def popular_urls():
     return newspaper.popular_urls()
Example #12
import newspaper as nw
text = []
papers = []
popular_urls = nw.popular_urls()
for url in popular_urls:
    print('building:', url)
    paper = nw.build(url=url, language='en', memoize_articles=False)
    papers.append(paper)
    print('done..')
#nw.news_pool.set(papers, threads_per_source=2) # (3*2) = 6 threads total
#nw.news_pool.join()
with open('dataset.txt', 'w+', encoding='cp1250') as f:
    for paper in papers:
        print(len(paper.articles))
        for article in paper.articles:
            article.download()
            article.parse()
            text.append(article.title)
            try:
                if article.title != 'Something\'s Gone Terribly Wrong':
                    f.write(article.title + '\n')
            except:
                pass


Example #13
 def PopularUrl(self):
     return newspaper.popular_urls()
Example #14
 def test_popular_urls(self):
     """Just make sure this method runs
     """
     newspaper.popular_urls()
Example #15
import boto.dynamodb
import newspaper

# Note you will need to enter your own private keys from AWS
conn = boto.dynamodb.connect_to_region(
       'us-west-2',
       aws_access_key_id="enter key",
       aws_secret_access_key="enter secret")

# To create a new table, we have to set up the schema
# The schema requires a primary hash key and allows a secondary range key
my_schema = conn.create_schema(hash_key_name = 'id', hash_key_proto_value = int)

# Create the table by providing a name, schema and the computing power required by AWS
table = conn.create_table(name="newspaper", schema=my_schema, read_units=10, write_units=5)

# To fill our table, we are going to parse through websites that publish news
# The newspaper package happens to contain a list of over 200 popular news websites
sources = newspaper.popular_urls()

hashkey = 1 # primary key integer to be used for DynamoDB table

for source in sources:
    try:
        collect = newspaper.build(source) # Build method returns an object with each sub-site url
    except Exception:
        continue # If the Build method fails, move on to the next source

    # Store articles as dict type in a list to iterate through for upload to DynamoDB
    collection = []
    for article in collect.articles:
        try:
Example #16
import newspaper
# hot() returns a list of the top trending terms on Google using a public API
print(newspaper.hot())
# popular_urls() returns a list of popular news source URLs
print(newspaper.popular_urls())
# languages() prints the table of languages newspaper supports
newspaper.languages()
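A small follow-on sketch, assuming the same import, that feeds the first popular URL into newspaper.build(); picking the first entry is arbitrary:

first_source = newspaper.popular_urls()[0]
paper = newspaper.build(first_source, memoize_articles=False)
for article in paper.articles[:5]:
    print(article.url)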
Example #17
import newspaper

volkskrant = newspaper.build('https://www.volkskrant.nl/', language='nl')

print(volkskrant.size())

for category in volkskrant.category_urls():
    print(category)

for article in volkskrant.articles:
    print(article.url)

print('\n')

print(newspaper.hot(), end='\n\n')
print(newspaper.popular_urls(), end='\n\n')

newspaper.languages()  # prints the table of supported languages

# url = 'https://www.volkskrant.nl/nieuws-achtergrond/eerste-dode-in-nederland-maar-wat-is-eigenlijk-de-kans-om-aan-corona-te-overlijden~bf716564/'
# article = newspaper.Article(url)
# article.download()

# article.parse()
# print(article.authors)
# article.publish_date
# article.text
# article.top_image

# article.nlp()
# article.keywords
Example #18
import newspaper
from newspaper import Article
import pandas as pd
data = pd.read_csv('../data/uci-news-aggregator.csv',sep=',')
#while(1):
#print(newspaper.popular_urls())
urls = newspaper.popular_urls()
#urls = ['https://www.wsj.com/news/world/middle-east','http://wsj.com','http://nytimes.com','http://www.bbc.co.uk','http://www.npr.org','http://www.reuters.com','http://www.economist.com','http://www.pbs.org','http://bigstory.ap.org','http://cnn.com','http://www.ted.com','http://www.washingtonpost.com','http://www.newyorker.com','http://www.cbs.com']
#print(data.head(5))
url_test = "https://www.nytimes.com/2020/08/08/business/economy/lost-unemployment-benefits.html?action=click&module=Top%20Stories&pgtype=Homepage"
article = Article(url_test)
article.download()
article.parse()
print(article.text)
exit()
for _, row in data.iterrows():
    try:
        first_article = Article(url=str(row['URL']), language='en')
        print(first_article)
        #article = newspaper.build(str(row['URL']), memoize_articles=False, language='en')
        #with open('sample'+str(row['ID']), 'w') as f:
    except:
        print("Error")

    '''print("$$$$ New url: ", url)
    cnn_paper = newspaper.build(str(url), memoize_articles=False, language='en')
    print("$$$$ Number of articles: ", len(cnn_paper.articles))
    j = 0
    for article in cnn_paper.articles:
        try:
            j+=1
Example #19
def popular_paper():
    popular_url = newspaper.popular_urls()
    return popular_url