Example #1
def latestNews(source):
    try:
        nc = Newscatcher(website=source, topic='finance')
        for k in nc.get_headlines():
            print(k + "\n")
    except Exception:
        print("unable to retrieve data from: " + source)
Example #2
    def get_articles(self):
        articles = []
        for i in self.urls:
            nc = Newscatcher(website=i, topic=self.topic)
            webart = nc.get_news(n=1)
            articles.append(webart)
        print(articles[0])
        return articles
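Note that get_news() returns the full results dictionary rather than a bare article, so each element appended above still wraps its article list under the 'articles' key. A sketch of unwrapping the first headline (feed is an assumed instance of the surrounding class):

results = feed.get_articles()
latest = results[0]['articles'][0]  # first article of the first source
print(latest['title'])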
Example #3
def get_news():
    nc = Newscatcher(website = 'marketwatch.com')
    results = nc.get_news()

    # results.keys()
    # 'url', 'topic', 'language', 'country', 'articles'

    # Get the articles
    articles = results['articles']

    first_article_summary = articles[0]['summary']
    first_article_title = articles[0]['title']

    return jsonify(articles)
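jsonify comes from Flask, so this function is presumably a view inside a small API. A sketch of the surrounding app (the route path and imports are assumptions):

from flask import Flask, jsonify
from newscatcher import Newscatcher

app = Flask(__name__)

@app.route('/news')  # hypothetical route; the original path is not shown
def news_endpoint():
    return get_news()  # the view defined above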
Example #4
def get_news(source, page, limit):
    nc = Newscatcher(website=source)
    results = nc.get_news()
    articles = results['articles']
    app.logger.info("len articles %d", len(articles))
    i = 0
    retval = []
    offset = limit * (page - 1)
    for a in articles[offset:limit + offset]:
        # skip articles without summary
        if 'summary' in a:
            a.source = source
            a.summary = cleanhtml(a.summary)
            retval.append(a)
    return retval
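The pagination arithmetic is offset = limit * (page - 1), so page 2 with a limit of 10 covers articles[10:20]. A hypothetical call (assumes the surrounding project's Flask app object and cleanhtml() helper are in scope):

second_page = get_news('nytimes.com', page=2, limit=10)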
Example #5
def get_site_article_rating(site):
    news_source = Newscatcher(site)

    sentiment_analyzer = pipeline(task="sentiment-analysis",
                                  framework='pt',
                                  device=-1)

    data = []

    for x in news_source.news:
        s = MLStripper()
        s.feed(x['summary_detail']['value'])
        news = s.get_data()
        title = x['title']
        link = x['link']
        logger.debug(news)
        res = sentiment_analyzer(news)
        logger.debug("%s: %s: %s", site, news, res)

        data.append({
            "link": link,
            "title": title,
            "summary": news,
            "sentiment": str(res[0]["label"]),
            "confidence": float(res[0]["score"])
        })

    return data
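MLStripper is not defined in the snippet; it is almost certainly the well-known HTMLParser-based tag-stripping recipe, sketched here:

from html.parser import HTMLParser
from io import StringIO

class MLStripper(HTMLParser):
    # Collects only text nodes, discarding markup.
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()

    def handle_data(self, d):
        self.text.write(d)

    def get_data(self):
        return self.text.getvalue()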
Example #6
def catch_trends(website: str, topic=None):
    nc = Newscatcher(website=website, topic=topic)
    results = nc.get_news()
    keywords = []
    articles = results['articles']
    articles_count = len(articles)
    # limit to the first two articles for speed
    for article_num in tqdm.tqdm(range(min(articles_count, 2))):
        article = articles[article_num]
        link = article["link"]
        try:
            parsed = parse_article(link)
        except Exception as e:
            print("failed to parse {0}: {1}".format(link, e))
            continue
        article_keywords = parsed["keywords"]
        keywords += article_keywords
    return keywords
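parse_article() is not shown; since it returns a dict with a 'keywords' entry, it plausibly wraps newspaper3k. A hypothetical implementation (the helper name and mapping are assumptions):

from newspaper import Article

def parse_article(link):
    # Download, parse and run NLP to extract keywords;
    # Article.nlp() needs the nltk 'punkt' tokenizer installed.
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    return {"keywords": article.keywords}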
Example #7
def headlines():
    website = Newscatcher('washingtonpost.com')
#   speak("you want me to read the headlines?")
#    msg = takeCommand().lower() 
#    if msg=="Yes" or "Yeah":
#        try:   
    results = website.headlines
    speak(results) 
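speak() and takeCommand() belong to the surrounding voice-assistant project. A hypothetical speak() built on pyttsx3 (the library choice is an assumption):

import pyttsx3

def speak(text):
    # Read the given text aloud via the local TTS engine.
    engine = pyttsx3.init()
    engine.say(str(text))
    engine.runAndWait()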
Example #8
def newscatchernews():
    newslist = []
    searchcriteria = None
    IndianURLs = urls(country='IN')

    searchcriteria = request.GET.get('search')
    #print("criteria:",searchcriteria)

    det = []
    counter = 0
    for IndianURL in IndianURLs:

        nc = Newscatcher(website=IndianURL)
        results = nc.get_news()

        if results is not None and results['articles'] is not None:
            articles = results['articles']

            for article in articles:
                datesfound = datefinder.find_dates(article.published)
                dateresult = "x"
                for match in datesfound:

                    dateresult = match.strftime("%Y-%m-%d %H:%M")

                    txt = list(article.summary_detail.values())[3]
                    detailtext = BeautifulSoup(txt, "html.parser").get_text()

                    counter += 1
                    newslist.append({
                        'Source': IndianURL,
                        'Title': article.title,
                        'Published': dateresult,
                        'Summary_Detail': detailtext,
                        'link': article.link,
                        'id': "head_" + str(counter)
                    })
    return newslist
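The view above leans on a few third-party helpers; the imports it assumes look like this:

import datefinder
from bs4 import BeautifulSoup
from newscatcher import Newscatcher, urls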
Example #9
    def get(self):
        # no resources for the topic, done.
        if not self.sources:
            return

        logger.info("Getting news for topic: {}".format(self.topic))
        KEYS = ["published_parsed", "title", "link", "summary"]

        for site in self.sources:
            news = Newscatcher(website=site).get_news()

            # no news from this source
            if not news:
                continue

            for article in news.get("articles", []):
                # skip unless the article has every key we need
                if set(KEYS) - set(article.keys()):
                    continue
                if not article.get("published_parsed", None):
                    continue

                pub = datetime(*article["published_parsed"][:6])
                mine = MyNews(
                    source=site,
                    topic=self.topic,
                    title=article["title"],
                    link=article["link"],
                    pub_time=make_aware(pub),
                    summary=article["summary"],
                )

                try:
                    mine.save()
                except Exception:
                    # must be violating unique constraint, do nothing
                    # logger.error("Duplicate news. Skip.")
                    pass
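make_aware comes from django.utils.timezone, and the silent except around save() implies a uniqueness constraint on the model. A sketch of what MyNews might look like (field types and the unique field are assumptions inferred from the call):

from django.db import models

class MyNews(models.Model):
    # Hypothetical model matching the keyword arguments used above.
    source = models.CharField(max_length=255)
    topic = models.CharField(max_length=64)
    title = models.CharField(max_length=512)
    link = models.URLField(unique=True)  # assumed cause of the save() failures
    pub_time = models.DateTimeField()
    summary = models.TextField()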
Example #10
from newscatcher import Newscatcher
from datetime import datetime
from time import mktime

from django.core.management.base import BaseCommand

from scraping.models import Headline

# define news websites
nytimes = Newscatcher('nytimes.com')
usa_today = Newscatcher('usatoday.com')
la_times = Newscatcher('latimes.com')
guardian = Newscatcher('theguardian.com')
wash_post = Newscatcher('washingtonpost.com')
daily_mail = Newscatcher('dailymail.co.uk')
NBC = Newscatcher('nbcnews.com')
fox = Newscatcher('foxnews.com')
huff_post = Newscatcher('huffpost.com')
google_news = Newscatcher('news.google.com')
wired = Newscatcher('wired.com')

news_list = [
    usa_today, la_times, guardian, wash_post, daily_mail, NBC, fox, huff_post,
    google_news, nytimes, wired
]


class Command(BaseCommand):
    help = "collect headlines"

    # define logic of command
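The command body is cut off above. Given the mktime/datetime imports and the Headline model, a plausible handle() might look like this (the Headline field names are assumptions):

    def handle(self, *args, **options):
        for source in news_list:
            results = source.get_news()
            if not results:
                continue
            for article in results['articles']:
                # published_parsed is a time.struct_time; convert via mktime
                published = datetime.fromtimestamp(mktime(article['published_parsed']))
                Headline.objects.get_or_create(
                    title=article['title'],
                    url=article['link'],
                    defaults={'published': published},
                )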
Example #11
def show_news_from(domain):
    print('## {0}\n\n'.format(domain))
    source = Newscatcher(website = domain)
    results = source.get_news()
    show_list(results['articles'])
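show_list() is not defined here; a minimal version consistent with the markdown-style header printed above:

def show_list(articles):
    # Print each article as a markdown bullet with its link underneath.
    for a in articles:
        print('- {0}\n  {1}\n'.format(a['title'], a['link']))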
Example #12
def describe_sources(source):
    nc = Newscatcher(website=source)
    nc.print_headlines(n=10)
    urls_pol = urls(topic='politics')
    describe_url(urls_pol[1])
    res = Newscatcher(website=urls_pol[1], topic='politics')
Example #13
from newscatcher import describe_url
from newscatcher import Newscatcher
from newscatcher import urls
url = 'news.ycombinator.com'
url2 = 'ycombinator.com'
eng_lnx = urls(language='en')
nc = Newscatcher(website=url)
try:
    print("looking for " + url + "...")
    nc.get_news()
except Exception as e:
    print(repr(e))
describe_url(url)
print(url + ' in urls: ' + str(url in eng_lnx))
print(url2 + ' in urls: ' + str(url2 in eng_lnx))
nc2 = Newscatcher(website='ycombinator.com')
try:
    print("looking for " + url2 + "...")
    nc2.get_news()
except Exception as e:
    print(repr(e))
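The try/except blocks anticipate that lookups can fail for domains missing from the package's database; checking membership in urls() first avoids that. A defensive sketch (assumes urls() with no filters returns the full source list):

def safe_newscatcher(domain):
    # Only construct a Newscatcher for domains the package knows about.
    if domain in urls():
        return Newscatcher(website=domain)
    print(domain + ' is not a supported source')
    return None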
Example #14
def list_urls(criteria):
    try:
        global state_object
        reset_state()
        url_object = get_urls(criteria)
        logging.info('Criteria eval resulted in url object: %s' % url_object)
        url_list = url_object['urls']

        if (len(url_list) == 0):
            logging.info('Unsupported topic or country: %s' % criteria)
            return

        if (url_object['type'] == 'website'):
            selected_url = url_object['urls'][0]
            logging.info('Selected url: %s' % selected_url)
        else:
            logging.info('Got urls: %s' % len(url_list))
            display_object = enumerate_array(url_list)
            logging.info('First item: %s' % display_object['0'])
            range_end = min(len(display_object), 10)
            select_option(display_object, [0, range_end], criteria)

            if state_object['value'] == 'exit':
                return

            logging.info('Received state object: %s' % state_object)
            selected_url = state_object['value']

        if (url_object['type'] == 'topic'):
            source = Newscatcher(website=selected_url, topic=criteria)
        else:
            source = Newscatcher(website=selected_url)
        results = source.get_news()
        articles = results['articles']
        reset_state()

        titles = list(map(extract_title, articles))
        enumerated_titles = enumerate_array(titles)
        logging.info('Got titles: %s' % enumerated_titles)
        display_size = get_page_size(enumerated_titles)
        select_option(enumerated_titles, display_size, criteria)
        logging.info('Received state object: %s' % state_object)

        if state_object['value'] == 'exit':
            return

        selected_article = state_object
        print('> ', end='')
        print(
            f'{Fore.LIGHTBLACK_EX + articles[selected_article["index"]]["published"] + Fore.RESET}\n'
        )
        html = articles[selected_article['index']]['summary']
        soup = BeautifulSoup(html, 'html.parser')
        print(soup.get_text())
        print(f'{beam}')

        # todo: refactor
        while state_object['value'] != 'exit':
            print('')
            select_option(enumerated_titles, state_object['last_displayed'],
                          criteria)
            logging.info('Got state object: %s' % state_object)
            selected_article = state_object
            if selected_article['value'] != 'exit':
                print('> ', end='')
                print(
                    f'{Fore.LIGHTBLACK_EX + articles[selected_article["index"]]["published"] + Fore.RESET}\n'
                )
                html = articles[selected_article['index']]['summary']
                soup = BeautifulSoup(html, 'html.parser')
                print(soup.get_text())
                print(f'{beam}')
            else:
                break

        return state_object

    except KeyboardInterrupt:
        state_object = {'value': 'exit', 'last_displayed': [], 'index': 0}
        return
    except Exception as e:
        logging.error('Error: %s' % e)
        return
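extract_title() is one of several helpers not shown in Example #14; given how it is mapped over the articles, a minimal definition would be:

def extract_title(article):
    # Pull the headline out of a feedparser article entry.
    return article['title']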
Example #15
from newscatcher import Newscatcher
from newscatcher import describe_url
from newscatcher import urls
import pandas as pd
# Used for getting available URLs related to a topic:
# science_urls = urls(topic='science', language='en')

# scraping news articles
nc = Newscatcher(website='science20.com', topic='science')
results = nc.get_news()
articles = results['articles']

# Storing information in a dictionary
data = {}
data['title'] = []
data['summary'] = []
data['date_published'] = []
data['link'] = []
for article in articles:
    data['title'].append(article['title'])
    data['summary'].append(article['summary'])
    data['date_published'].append(article['published'])
    data['link'].append(article['link'])

df = pd.DataFrame(data=data)
df.to_csv('news_scrapped.csv', index=False)
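Reading the export back for later analysis is then a one-liner:

df = pd.read_csv('news_scrapped.csv')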
Example #16
from newscatcher import Newscatcher, urls

nc = Newscatcher(website='wsj.com')
results = nc.get_news()
#
# # results.keys()
# # 'url', 'topic', 'language', 'country', 'articles'
#
# # Get the articles
# articles = results['articles']
#
# first_article_summary = articles[0]['summary']
# first_article_title = articles[0]['title']
#
finance = urls(topic='finance')
#
# # URLs by COUNTRY
# american_urls = urls(country='US')
#
# # URLs by LANGUAGE
# english_urls = urls(language='en')
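A sketch combining the two calls above: iterate over a few of the finance sources returned by urls() and print their headlines (the slice size is arbitrary):

for site in finance[:3]:
    nc = Newscatcher(website=site, topic='finance')
    nc.print_headlines(n=5)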