from newscatcher import Newscatcher

def latestNews(source):
    try:
        nc = Newscatcher(website=source, topic='finance')
        for k in nc.get_headlines():
            print(k + "\n")
    except Exception:
        print("unable to retrieve data from: " + source)
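# A minimal usage sketch for latestNews(); 'wsj.com' is just an example
# domain assumed to be in the newscatcher database with a finance feed.
latestNews('wsj.com')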
def get_articles(self):
    articles = []
    for i in self.urls:
        nc = Newscatcher(website=i, topic=self.topic)
        webart = nc.get_news(n=1)
        articles.append(webart)
    print(articles[0])
    return articles
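# get_articles() is written as a method; a minimal sketch of a host class
# follows. The class name and constructor are assumptions inferred from the
# self.urls / self.topic attributes used above, not the original code.
class ArticleFetcher:
    def __init__(self, urls, topic=None):
        self.urls = urls    # list of website domains
        self.topic = topic  # optional topic, e.g. 'finance'

    get_articles = get_articles  # bind the function above as a method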
from flask import jsonify
from newscatcher import Newscatcher

def get_news():
    nc = Newscatcher(website='marketwatch.com')
    results = nc.get_news()
    # results.keys() -> 'url', 'topic', 'language', 'country', 'articles'
    articles = results['articles']
    # example field access on a single article
    first_article_summary = articles[0]['summary']
    first_article_title = articles[0]['title']
    return jsonify(articles)
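# A minimal Flask wiring sketch for the view above; the route path '/news'
# is an assumption, not from the source.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/news', view_func=get_news)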
from newscatcher import Newscatcher

# `app` is the surrounding Flask application; cleanhtml() is a project
# helper (see the sketch below)
def get_news(source, page, limit):
    nc = Newscatcher(website=source)
    results = nc.get_news()
    articles = results['articles']
    app.logger.info("len articles %d", len(articles))
    retval = []
    offset = limit * (page - 1)
    for a in articles[offset:limit + offset]:
        # keep only articles that have a summary
        if 'summary' in a:
            a.source = source
            a.summary = cleanhtml(a.summary)
            retval.append(a)
    return retval
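# The snippet above calls a cleanhtml() helper that is not shown; a common
# implementation strips tags with a regex. This sketch is an assumption,
# not the project's actual helper.
import re

TAG_RE = re.compile(r'<[^>]+>')

def cleanhtml(raw_html):
    """Remove HTML tags from a feed summary, leaving plain text."""
    return TAG_RE.sub('', raw_html).strip()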
import logging
from newscatcher import Newscatcher
from transformers import pipeline

logger = logging.getLogger(__name__)

def get_site_article_rating(site):
    news_source = Newscatcher(site)
    sentiment_analyzer = pipeline(task="sentiment-analysis", framework='pt', device=-1)
    data = []
    for x in news_source.news:
        # strip HTML from the feed summary
        s = MLStripper()
        s.feed(x['summary_detail']['value'])
        news = s.get_data()
        title = x['title']
        link = x['link']
        logger.debug(news)
        res = sentiment_analyzer(news)
        logger.debug("%s: %s: %s", site, news, res)
        data.append({
            "link": link,
            "title": title,
            "summary": news,
            "sentiment": str(res[0]["label"]),
            "confidence": float(res[0]["score"]),
        })
    return data
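# MLStripper is referenced but not defined above; the usual implementation
# (adapted from the html.parser docs) looks like this sketch.
from html.parser import HTMLParser
from io import StringIO

class MLStripper(HTMLParser):
    """Collect text content while discarding HTML tags."""
    def __init__(self):
        super().__init__()
        self.reset()
        self.text = StringIO()

    def handle_data(self, d):
        self.text.write(d)

    def get_data(self):
        return self.text.getvalue()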
import tqdm
from newscatcher import Newscatcher

def catch_trends(website: str, topic=None):
    nc = Newscatcher(website=website, topic=topic)
    results = nc.get_news()
    keywords = []
    articles = results['articles']
    # only parse the first two articles to keep this quick
    for article in tqdm.tqdm(articles[:2]):
        link = article["link"]
        try:
            parsed = parse_article(link)
        except Exception:
            print(link)
            continue
        keywords += parsed["keywords"]
    return keywords
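# parse_article() is not defined above; one plausible implementation uses
# the newspaper3k package, whose Article.nlp() fills in keywords (it needs
# the nltk punkt data installed). This is an assumption about the helper,
# not the original code.
from newspaper import Article

def parse_article(link):
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()  # populates article.keywords
    return {"keywords": article.keywords}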
from newscatcher import Newscatcher

def headlines():
    website = Newscatcher('washingtonpost.com')
    # speak("you want me to read the headlines?")
    # msg = takeCommand().lower()
    # if msg == "Yes" or msg == "Yeah":
    results = website.headlines
    speak(results)
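# speak() and takeCommand() belong to the surrounding voice-assistant code
# and are not shown. A typical pyttsx3-based speak() might look like this
# sketch (an assumption, not the original helper):
import pyttsx3

def speak(text):
    engine = pyttsx3.init()
    engine.say(str(text))
    engine.runAndWait()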
import datefinder
from bs4 import BeautifulSoup
from newscatcher import Newscatcher, urls

def newscatchernews():
    newslist = []
    IndianURLs = urls(country='IN')
    # `request` is the framework request object supplied by the surrounding view code
    searchcriteria = request.GET.get('search')
    counter = 0
    for IndianURL in IndianURLs:
        nc = Newscatcher(website=IndianURL)
        results = nc.get_news()
        if results is not None and results['articles'] is not None:
            articles = results['articles']
            for article in articles:
                # take the last date found in the published string; "x" if none
                datesfound = datefinder.find_dates(article.published)
                dateresult = "x"
                for match in datesfound:
                    dateresult = match.strftime("%Y-%m-%d %H:%M")
                # the fourth value of summary_detail is the HTML body
                txt = list(article.summary_detail.values())[3]
                detailtext = BeautifulSoup(txt, "html.parser").get_text()
                counter += 1
                newslist.append({
                    'Source': IndianURL,
                    'Title': article.title,
                    'Published': dateresult,
                    'Summary_Detail': detailtext,
                    'link': article.link,
                    'id': "head_" + str(counter),
                })
    return newslist
from datetime import datetime
from django.utils.timezone import make_aware
from newscatcher import Newscatcher

def get(self):
    # no sources for this topic; nothing to do
    if not self.sources:
        return
    logger.info("Getting news for topic: {}".format(self.topic))
    KEYS = ["published_parsed", "title", "link", "summary"]
    for site in self.sources:
        news = Newscatcher(website=site).get_news()
        # no news from this source
        if not news:
            continue
        for article in news.get("articles", []):
            # skip articles that are missing any of the fields we need
            if set(KEYS) - set(article.keys()):
                continue
            if not article.get("published_parsed", None):
                continue
            pub = datetime(*article["published_parsed"][:6])
            mine = MyNews(
                source=site,
                topic=self.topic,
                title=article["title"],
                link=article["link"],
                pub_time=make_aware(pub),
                summary=article["summary"],
            )
            try:
                mine.save()
            except Exception:
                # most likely a unique-constraint violation; skip duplicates
                # logger.error("Duplicate news. Skip.")
                pass
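# MyNews is the project's Django model; its exact definition is not shown.
# A plausible sketch matching the fields used above (the unique constraint
# on `link` is an assumption inferred from the duplicate handling):
from django.db import models

class MyNews(models.Model):
    source = models.CharField(max_length=255)
    topic = models.CharField(max_length=64)
    title = models.TextField()
    link = models.URLField(unique=True)
    pub_time = models.DateTimeField()
    summary = models.TextField()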
from newscatcher import Newscatcher
from datetime import datetime
from time import mktime
from django.core.management.base import BaseCommand
from scraping.models import Headline

# define the news websites to poll
nytimes = Newscatcher('nytimes.com')
usa_today = Newscatcher('usatoday.com')
la_times = Newscatcher('latimes.com')
guardian = Newscatcher('theguardian.com')
wash_post = Newscatcher('washingtonpost.com')
daily_mail = Newscatcher('dailymail.co.uk')
NBC = Newscatcher('nbcnews.com')
fox = Newscatcher('foxnews.com')
huff_post = Newscatcher('huffpost.com')
google_news = Newscatcher('news.google.com')
wired = Newscatcher('wired.com')

news_list = [
    usa_today, la_times, guardian, wash_post, daily_mail,
    NBC, fox, huff_post, google_news, nytimes, wired,
]

class Command(BaseCommand):
    help = "collect headlines"
    # define logic of command
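    # The command body is not shown above; a minimal handle() consistent
    # with the imports (mktime, datetime, the Headline model) could look
    # like this sketch. The Headline field names are assumptions.
    def handle(self, *args, **options):
        for source in news_list:
            results = source.get_news()
            if not results:
                continue
            for article in results['articles']:
                if not article.get('published_parsed'):
                    continue
                published = datetime.fromtimestamp(mktime(article['published_parsed']))
                Headline.objects.get_or_create(
                    title=article['title'],
                    defaults={'url': article['link'], 'published': published},
                )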
from newscatcher import Newscatcher

def show_news_from(domain):
    print('## {0}\n\n'.format(domain))
    source = Newscatcher(website=domain)
    results = source.get_news()
    show_list(results['articles'])
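# show_list() is defined elsewhere in the project; a plausible version
# (an assumption) prints one markdown bullet per article, matching the
# '##' heading printed above:
def show_list(articles):
    for article in articles:
        print('- [{0}]({1})'.format(article['title'], article['link']))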
from newscatcher import Newscatcher, describe_url, urls

def describe_sources(source):
    nc = Newscatcher(website=source)
    nc.print_headlines(n=10)
    urls_pol = urls(topic='politics')
    describe_url(urls_pol[1])
    res = Newscatcher(website=urls_pol[1], topic='politics')
from newscatcher import Newscatcher, describe_url, urls

url = 'news.ycombinator.com'
url2 = 'ycombinator.com'
eng_lnx = urls(language='en')

nc = Newscatcher(website=url)
try:
    print("looking for " + url + "...")
    nc.get_news()
except Exception as e:
    print(repr(e))

describe_url(url)
print(url + ' in urls: ' + str(url in eng_lnx))
print(url2 + ' in urls: ' + str(url2 in eng_lnx))

nc2 = Newscatcher(website='ycombinator.com')
try:
    print("looking for " + url2 + "...")
    nc2.get_news()
except Exception as e:
    print(repr(e))
import logging
from bs4 import BeautifulSoup
from colorama import Fore
from newscatcher import Newscatcher

def list_urls(criteria):
    try:
        global state_object
        reset_state()
        url_object = get_urls(criteria)
        logging.info('Criteria eval resulted in url object: %s' % url_object)
        url_list = url_object['urls']
        if len(url_list) == 0:
            logging.info('Unsupported topic or country: %s' % criteria)
            return
        if url_object['type'] == 'website':
            selected_url = url_object['urls'][0]
            logging.info('Selected url: %s' % selected_url)
        else:
            logging.info('Got urls: %s' % len(url_list))
            display_object = enumerate_array(url_list)
            logging.info('First item: %s' % display_object['0'])
            range_end = 10 if len(display_object) >= 10 else len(display_object)
            select_option(display_object, [0, range_end], criteria)
            if state_object['value'] == 'exit':
                return
            logging.info('Received state object: %s' % state_object)
            selected_url = state_object['value']
        if url_object['type'] == 'topic':
            source = Newscatcher(website=selected_url, topic=criteria)
        else:
            source = Newscatcher(website=selected_url)
        results = source.get_news()
        articles = results['articles']
        reset_state()
        titles = list(map(extract_title, articles))
        enumerated_titles = enumerate_array(titles)
        logging.info('Got titles: %s' % enumerated_titles)
        display_size = get_page_size(enumerated_titles)
        select_option(enumerated_titles, display_size, criteria)
        logging.info('Received state object: %s' % state_object)
        if state_object['value'] == 'exit':
            return
        selected_article = state_object
        print('> ', end='')
        print(f'{Fore.LIGHTBLACK_EX + articles[selected_article["index"]]["published"] + Fore.RESET}\n')
        html = articles[selected_article['index']]['summary']
        soup = BeautifulSoup(html, 'html.parser')
        print(soup.get_text())
        print(f'{beam}')
        # todo: refactor
        while state_object['value'] != 'exit':
            print('')
            select_option(enumerated_titles, state_object['last_displayed'], criteria)
            logging.info('Got state object: %s' % state_object)
            selected_article = state_object
            if selected_article['value'] != 'exit':
                print('> ', end='')
                print(f'{Fore.LIGHTBLACK_EX + articles[selected_article["index"]]["published"] + Fore.RESET}\n')
                html = articles[selected_article['index']]['summary']
                soup = BeautifulSoup(html, 'html.parser')
                print(soup.get_text())
                print(f'{beam}')
            else:
                break
        return state_object
    except KeyboardInterrupt:
        state_object = {'value': 'exit', 'last_displayed': [], 'index': 0}
        return
    except Exception as e:
        logging.error('Error: %s' % e)
        return
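# get_urls, reset_state, enumerate_array, select_option, extract_title,
# get_page_size and `beam` are project helpers defined elsewhere. From the
# display_object['0'] lookup above, one plausible enumerate_array keys the
# list by stringified indices; this is an inference, not the original code.
def enumerate_array(items):
    return {str(i): item for i, item in enumerate(items)}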
from newscatcher import Newscatcher, describe_url, urls
import pandas as pd

# To list available urls for a topic:
# science_urls = urls(topic='science', language='en')

# Scrape news articles
nc = Newscatcher(website='science20.com', topic='science')
results = nc.get_news()
articles = results['articles']

# Collect the fields we want, one list per column
data = {'title': [], 'summary': [], 'date_published': [], 'link': []}
for article in articles:
    data['title'].append(article['title'])
    data['summary'].append(article['summary'])
    data['date_published'].append(article['published'])
    data['link'].append(article['link'])

df = pd.DataFrame(data=data)
df.to_csv('news_scrapped.csv', index=False)
from newscatcher import Newscatcher, urls

nc = Newscatcher(website='wsj.com')
results = nc.get_news()

# results.keys() -> 'url', 'topic', 'language', 'country', 'articles'

# Get the articles
# articles = results['articles']
# first_article_summary = articles[0]['summary']
# first_article_title = articles[0]['title']

# URLs by TOPIC
# finance = urls(topic='finance')

# URLs by COUNTRY
# american_urls = urls(country='US')

# URLs by LANGUAGE
# english_urls = urls(language='en')