def getsearchresult():
    searchup = defaultdict(list)
    newsapi = NewsApiClient(api_key="d11761b89fdb4599b1497bf951690000")
    keyword_ = request.args.get('keyword')
    print("keyword", keyword_)
    from_ = request.args.get('from')
    print("from", from_)
    to_ = request.args.get('todate')
    print("to", to_)
    source_ = request.args.get('source')
    print("source", source_)
    # "all" means no source restriction; the two original branches differed
    # only in the sources argument.
    sources = "" if source_ == "all" else source_
    try:
        final_result = newsapi.get_everything(q=keyword_, from_param=from_,
                                              to=to_, sources=sources,
                                              language="en", page_size=30,
                                              sort_by="publishedAt")
    except NewsAPIException as error:
        return str(error)
    # Keep only articles in which every required field is present and non-empty.
    required = ('title', 'author', 'description', 'source', 'url',
                'urlToImage', 'publishedAt')
    for article in final_result["articles"]:
        if all(article.get(field) not in (None, "", "null")
               for field in required):
            searchup['articles'].append(article)
    return jsonify(searchup)
def getCryptoNews(startDate, endDate):
    # Define news before the try block so the final return cannot raise
    # a NameError when the API call fails.
    news = []
    sources = 'crypto-coins-news,bloomberg,reuters,google-news'
    domains = 'cnn,bloomberg,reuters,google'
    try:
        newsapi = NewsApiClient(api_key='ef9f89cce9b24cfe9ed9b61f900cc1b1')
        # Collect articles for each crypto-related query and merge them.
        for query in ('bitcoin', 'btc', 'cryptocurrency', 'blockchain'):
            result = newsapi.get_everything(q=query, sources=sources,
                                            domains=domains,
                                            from_param=startDate, to=endDate,
                                            language='en')
            news.extend(result['articles'])
    except Exception:
        print('Error in Reading BTC News!')
    return news
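# A minimal usage sketch for getCryptoNews above. The date range is
# illustrative, not from the original, and assumes the hardcoded key is valid.
crypto_news = getCryptoNews('2021-03-01', '2021-03-07')
for item in crypto_news[:5]:
    print(item['publishedAt'], '-', item['title'])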
def customsearchresults(request):
    user_query = request.GET['search']
    newsapi = NewsApiClient(api_key='e714e075a7534f85b7e0bdfd2330c611')
    all_articles = newsapi.get_everything(q=user_query, language='en',
                                          sort_by='relevancy')['articles']
    json_content = []
    for news_object in all_articles:
        # Full-text scraping of each article is currently disabled:
        # r1 = requests.get(news_object['url'])
        # soup = BeautifulSoup(r1.content, 'html.parser')
        # whole_content = " ".join(p.get_text() for p in soup.find_all('p'))
        custom_object = {
            "heroes": "To be decided",
            "victim": "To be decided",
            "villain": "To be decided",
            "source": news_object["source"]["name"],
            "author": news_object["author"],
            "title": news_object['title'],
            "shortdescription": news_object['description'],
            "urlToImage": news_object["urlToImage"],
            "url": news_object["url"],
        }
        json_content.append(custom_object)
    return Response(json_content)
def get(self, fields=["image", "title", "link"], limit=20, **kwargs):
    newsapi = NewsApiClient(api_key=self.__key)
    all_news = newsapi.get_everything(**kwargs)
    return self.__transformDate(all_news.get("articles"), fields, limit)
def gen_sentiment_df(stock="DJI"):
    newsapi = NewsApiClient(api_key='f8970a68f49e43a18c9b5aff8e2bcfe1')
    start = date(2020, 2, 29)
    end = date(2020, 3, 27)
    sentiments = {}
    query = "stocks & " + stock
    for dt in rrule(DAILY, dtstart=start, until=end):
        str_date = dt.strftime("%Y-%m-%d")
        all_articles = newsapi.get_everything(q=query, from_param=str_date,
                                              to=str_date, language='en',
                                              sort_by='relevancy', page=1)
        # Concatenate the day's headlines into one text for sentiment analysis.
        headlines = ""
        for article in all_articles['articles']:
            if isinstance(article["title"], str):
                headlines += article["title"]
        result = analyze_sentiment(headlines)
        # Store as a tuple, not a set: the column order must stay deterministic.
        sentiments[dt] = (result.magnitude, result.score)
    sentiment_df = pd.DataFrame(list(sentiments.values()),
                                columns=["magnitude", "score"],
                                index=sentiments.keys())
    return sentiment_df.fillna(0)
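# Usage sketch for gen_sentiment_df above: build the daily sentiment frame
# and inspect it. Assumes its analyze_sentiment dependency is available;
# the ticker is illustrative.
sentiment_df = gen_sentiment_df(stock="AAPL")
print(sentiment_df.head())
print(sentiment_df["score"].describe())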
def get_news(self):
    """
    This function makes a request using the authenticated API and instance
    arguments to extract news articles.

    :return: dataframe with articles and other metadata relevant to each article
    """
    start = (datetime.today() - timedelta(days=7)).strftime('%Y-%m-%d')
    end = datetime.today().strftime('%Y-%m-%d')
    # Initialise NewsApiClient with an api key
    newsapi = NewsApiClient(api_key=nc.api_key)
    # OR-join the instance keywords into a single query string
    query = '(' + ' OR '.join(self.queries) + ')'
    # Query for articles using the keywords
    all_articles = newsapi.get_everything(q=query, from_param=start, to=end,
                                          language='en', sort_by='relevancy',
                                          page_size=100)
    # Extract articles from the returned json and convert into a dataframe
    articles_df = pd.DataFrame(all_articles['articles'])
    # Use only the name part of the source column
    articles_df['source'] = articles_df.source.map(lambda x: x['name'])
    # Select relevant columns for analysis
    articles_df = articles_df[['source', 'title', 'url', 'publishedAt',
                               'content']]
    articles_df.columns = ['Source', 'Title', 'Url', 'Published', 'Content']
    return articles_df
def newsapi(stock):
    newsapi = NewsApiClient(api_key='861ff0ffbaaa4eaa9571ce516cc5e088')
    all_articles = newsapi.get_everything(q=stock, language='en',
                                          sort_by='publishedAt',
                                          page_size=100)
    title = []
    desc = []
    pos, neg, neu = 0, 0, 0
    # Classify each article's content by TextBlob polarity.
    for article in all_articles['articles']:
        content = str(article['content'])
        title.append(str(article['title']) + ' : \n'
                     + str(article['description']))
        analysis = TextBlob(content)
        if analysis.sentiment.polarity > 0:
            pos += 1
        elif analysis.sentiment.polarity == 0:
            neu += 1
        else:
            neg += 1
    total = pos + neg + neu
    if total == 0:
        return 50.0, title, desc  # no articles found: return a neutral score
    pos_news, neg_news = pos / total, neg / total
    # Map the net polarity share from [-1, 1] onto a 0-100 scale.
    output = ((pos_news - neg_news + 1) * 100) / 2
    return output, title, desc
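# Usage sketch for the scorer above (illustrative ticker; assumes a valid
# key). The first return value is a 0-100 score where values above 50 mean
# the news flow skews positive.
score, titles, _ = newsapi("TSLA")
print("News sentiment score:", score)
for headline in titles[:3]:
    print(headline)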
def searchresults(request):
    user_query = request.GET['search']
    newsapi = NewsApiClient(api_key='e714e075a7534f85b7e0bdfd2330c611')
    all_articles = newsapi.get_everything(q=user_query, language='en',
                                          sort_by='relevancy')
    return Response(all_articles)
def news():
    if request.method == 'GET':
        return render_template('news.html')
    # request was a POST
    application.vars['keyword'] = request.form['keyword']
    application.vars['earliest'] = request.form['earliest']
    application.vars['latest'] = request.form['latest']
    newsapi = NewsApiClient(api_key='dc919a5aeb324f01b0db89373fd71749')
    keyword = application.vars['keyword']
    oldest = application.vars['earliest']
    latest = application.vars['latest']
    print(keyword)
    articles_page = newsapi.get_everything(q=keyword, from_param=oldest,
                                           to=latest, language='en',
                                           sort_by='popularity')
    total = articles_page['totalResults']
    print(total)
    maxpage = math.ceil(total / 20)
    print(maxpage)
    articles = []
    # Page through every page of results (inclusive of the last page).
    for i in range(1, maxpage + 1):
        articles_page = newsapi.get_everything(q=keyword, from_param=oldest,
                                               to=latest, language='en',
                                               sort_by='popularity', page=i)
        articles.extend(articles_page['articles'])
    articles_df = pd.DataFrame(articles)
    print(len(articles_df))
    print(articles_df)
    resp = make_response(articles_df.to_csv())
    resp.headers["Content-Disposition"] = "attachment; filename=export_news.csv"
    resp.headers["Content-Type"] = "text/csv"
    return resp
def get_articles(stock):
    newsapi = NewsApiClient(api_key='861ff0ffbaaa4eaa9571ce516cc5e088')
    all_articles = newsapi.get_everything(q=stock, language='en',
                                          sort_by='publishedAt',
                                          page_size=100)
    print(all_articles)
    return all_articles
def get_news():
    # with app.app_context():
    newsapi = NewsApiClient(api_key='29b0d1fda8b6452fb4df7d86a3dc5b9a')
    data = newsapi.get_everything(q='health and fitness', language='en',
                                  page_size=20)
    articles = data['articles']  # each article is a dict
    return json.dumps(articles)
def getInfo(query):
    # Key to access the News API
    query = query.lower()
    # query = urllib.parse.quote_plus(query)
    newsapi = NewsApiClient(api_key='edf0afe93d6644d198d8539e640134c9')
    newsTitles, newsContent, newsSources, newsURL = [], [], [], []

    def add_articles(articles, count):
        # Append up to `count` articles, skipping aggregator/wire sources.
        # Slicing also guards against fewer articles than requested.
        for article in articles[:count]:
            source = article['source']['name']
            if source in ("Google News", "Reuters", "Financial Times"):
                print(source)
                continue
            newsTitles.append(article['title'])
            newsContent.append(article['content'])
            newsSources.append(source)
            newsURL.append(article['url'])

    # Try the top headlines first.
    headlines = newsapi.get_top_headlines(q=query, language='en')
    add_articles(headlines['articles'], 5)

    # If fewer than five survived the filter, fall back to a one-day
    # full search.
    if len(newsTitles) < 5:
        today = datetime.datetime.today()
        start_day = today - datetime.timedelta(days=1)
        headlines_all = newsapi.get_everything(q=query,
                                               from_param=str(start_day),
                                               to=str(today), language='en',
                                               sort_by='relevancy')
        add_articles(headlines_all['articles'], 5 - len(newsTitles))

    return newsTitles, newsContent, newsSources, newsURL
def search_articles(query_string, domain_blacklist_string,
                    domain_whitelist_string):
    newsapi = NewsApiClient(api_key='391c4cadc42a4a42aaf1ea266df4adfc')
    headlines = newsapi.get_everything(
        q=query_string,
        language='en',
        sort_by='relevancy',
        page_size=100,
        domains=domain_whitelist_string,
        # exclude_domains=domain_blacklist_string
    )
    return headlines
def get_everything(keyword, fromdate, currentdate):
    newsapi = NewsApiClient(api_key='')
    articles = newsapi.get_everything(q=keyword, from_param=fromdate,
                                      to=currentdate, language='en',
                                      sort_by='relevancy', page_size=20)
    content = []
    # Only build the summary list when the query matched more than one page.
    if articles['totalResults'] > 20:
        for article in articles['articles']:
            detail = {'title': article['title'],
                      'url': article['url'],
                      'source': article['source']['name']}
            content.append(detail)
    return content
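# Usage sketch for the wrapper above: query the last seven days for a
# keyword. The dates are computed here and the keyword is illustrative;
# an API key must be filled in first.
from datetime import date, timedelta

today = date.today().isoformat()
week_ago = (date.today() - timedelta(days=7)).isoformat()
for item in get_everything('climate', week_ago, today):
    print(item['source'], '-', item['title'])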
def newsfeed(area):
    # Area-specific news feed
    today = datetime.utcnow().date()
    duration = datetime.utcnow().date() - timedelta(days=5)
    # API key
    news = NewsApiClient(api_key="0de6787ea7e44a8691ce4a5d556d18dc")
    # News extraction
    top_headlines = news.get_everything(
        q='+{} AND (quarantine OR corona OR covid OR lockdown) '
          'NOT positive NOT Deadlier NOT Dead'.format(area),
        from_param=duration, language='en')
    # Sentiment analysis using TextBlob
    return sentiment(top_headlines, 0.0)
def search():
    k = request.args.get('k', '')
    f = request.args.get('f', '')
    t = request.args.get('t', '')
    c = request.args.get('c', '')
    s = request.args.get('s', '')
    print(k, f, t, c, s)
    if s == "all":
        s = ""
    newsapi = NewsApiClient(api_key='3061013219ce4282b5d26bdcf8b9f966')
    pageSize = 30
    results = newsapi.get_everything(q=k, sources=s, from_param=f, to=t,
                                     language='en', page_size=pageSize,
                                     sort_by='publishedAt')
    searchCount = 0
    searchData = []
    # Flatten up to 15 complete articles into the response list.
    for i in results['articles']:
        if (searchCount != 15 and i["publishedAt"] and i["urlToImage"]
                and i["author"] and i["description"] and i["url"]):
            searchData.append(i["urlToImage"])
            searchData.append(i["title"])
            searchData.append(i["description"])
            searchData.append(i["author"])
            searchData.append(i["source"]["name"])
            searchData.append(i["publishedAt"])
            searchData.append(i["url"])
            searchCount += 1
    return jsonify({"results": searchData})
def news(request):
    newsapi = NewsApiClient(api_key='149dd9c6ff0c47cfae0d743f73171729')
    results = newsapi.get_everything(q='covid', language='en')
    titles, desc, img, url = [], [], [], []
    for article in results['articles']:
        titles.append(article['title'])
        desc.append(article['description'])
        img.append(article['urlToImage'])
        url.append(article['url'])
    newslist = zip(titles, desc, img, url)
    context = {"newslist": newslist}
    return render(request, template_name='news.html', context=context)
class NewsSources:
    def __init__(self):
        self.db = DBManager()
        self.news_api = NewsApiClient(api_key=newsApiKey)

    def getNewsFromSources(self):
        newsData = self.news_api.get_everything(
            sources=self.db.getNewsApiSourcesIDs(),
            language=appLanguage,
            page_size=pageSize)['articles']
        res = {'articlesData': {}, 'articlesURLs': []}
        for article in newsData:
            articleDict = {
                'title': article['title'],
                'url': article['url'],
                'image': article['urlToImage'],
                'time': article['publishedAt'],
                'newsApiID': article['source']['id']
            }
            res['articlesData'][article['url']] = articleDict
            res['articlesURLs'].append(article['url'])
        return res
def get_news_articles(queries, fromdate, todate):
    newsapi = NewsApiClient(api_key='2b7935c2680f46b487d833129210d4c3')
    articles_to_find = 5
    articles = []
    for query in queries:
        all_articles = newsapi.get_everything(q=query, from_param=fromdate,
                                              to=todate, sort_by='relevancy',
                                              page=1)
        # Collect distinct articles until the target count is reached.
        for found_article in all_articles['articles']:
            if (found_article not in articles
                    and len(articles) < articles_to_find):
                articles.append(found_article)
        if len(articles) == articles_to_find:
            return clean_articles(articles)
    # If all queries together do not return 5 different articles,
    # return what was found.
    return clean_articles(articles)
def get_news(topic):
    newsClient = NewsApiClient(api_key="bad068d6ce6c4ccfb30eb5785c360efe")
    # q is the search terms; language is English where possible
    # (foreign sources may not be English).
    keyWords = topic + " soccer"
    sportsSources = newsClient.get_sources(category="sports")
    # Build a comma-separated string of sports source ids.
    sourceIds = ",".join(s['id'] for s in sportsSources['sources'])
    threeDaysAgo = datetime.date(datetime.now()) - timedelta(3)  # date 3 days ago
    topHeadlines = newsClient.get_everything(q=keyWords, sources=sourceIds,
                                             language='en',
                                             sort_by='relevancy',
                                             from_param=threeDaysAgo)
    articles = topHeadlines['articles'][:3]
    return json.dumps(articles)
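# Usage sketch for the soccer-news helper above (the topic is illustrative;
# assumes a valid key). It returns a JSON string of up to three articles
# from sports sources published in the last three days.
import json

for a in json.loads(get_news("Arsenal")):
    print(a['title'])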
def html_table():
    supplierdf = pd.read_excel('suppp.xlsx', header=0)
    supplierdf['name'] = supplierdf['name'].str.strip()
    supplierlist = [name.lower() for name in supplierdf['name'].tolist()]
    newsapi = NewsApiClient(api_key='b320e2b793644396bdbeded93ff9d702')
    # Collect one row per article, tagged with the supplier it matched.
    rows = []
    for supplier in supplierlist:
        all_articles = newsapi.get_everything(q=supplier, language='en')
        for article in all_articles['articles']:
            row = dict(article)
            row['Supplier Name'] = supplier
            rows.append(row)
    newsdf = pd.DataFrame(rows).drop_duplicates(subset='title', keep="first")
    newsdf = newsdf.reset_index(drop=True)
    return render_template('simple.html',
                           tables=[newsdf.to_html(classes='data')],
                           titles=newsdf.columns.values)
def get_newsurls(limit=5, query=None):
    """Returns list of news urls from newsapi.org."""
    # initialise newsapi
    KEY = 'key'  # insert your newsapi key
    newsapi = NewsApiClient(api_key=KEY)
    results = []
    if query is None:
        # get top headlines for Singapore news
        top_headlines_results = newsapi.get_top_headlines(language='en',
                                                          country='sg')
        results.extend(top_headlines_results['articles'][:limit])
    else:
        # get search results from a query
        search_results = newsapi.get_everything(
            q=query,
            language='en',
            domains='channelnewsasia.com,todayonline.com,straitstimes.com')
        results.extend(search_results['articles'][:limit])
    return [article['url'] for article in results]
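# Usage sketch for get_newsurls above: top Singapore headlines when no query
# is given, otherwise a search restricted to the three local news domains.
# Assumes a real key has replaced the 'key' placeholder.
print(get_newsurls(limit=3))                # top headlines
print(get_newsurls(limit=3, query='haze'))  # keyword search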
def collect_n_save_news(self):
    company = "Tesla"
    symbol = "TSLA"
    api_key = env("NEWSAPI_KEY")
    newsapi = NewsApiClient(api_key=api_key)
    start_time = self.rounded_to_the_last_30th_minute_epoch()
    end_time = start_time + timedelta(minutes=30)
    news = newsapi.get_everything(
        q=company,
        language="en",
        from_param=start_time,
        to=end_time,
        sort_by="popularity",
        page_size=100,
        page=1,
    )
    self.save_news_metrics(symbol, company, end_time,
                           {"news_count": len(news["articles"])})
def get_news():
    API_KEY = "a621f645307c47129920cf7858d1dffe"
    newsapi = NewsApiClient(api_key=API_KEY)
    keywords = "covid19"
    all_articles = newsapi.get_everything(q=keywords, language="en")
    return all_articles["articles"]
class NewsApiHandle:
    """
    This class contains methods to handle everything related to the use of
    the News API.
    """

    def __init__(self, API_Key, keyword_list):
        """
        This method initializes the News API client, joins the given keywords
        with AND, queries the API, and stores the first results.

        :type keyword_list: list
        :param keyword_list: list of keywords to query the api
        """
        # initialize the news client with the api key
        self.news_api = NewsApiClient(api_key=API_Key)
        # join the keywords with AND in between
        query_string = " AND ".join(keyword_list)
        # initialize empty result lists
        self.title_list = []
        self.descriptions_list = []
        self.Urls_list = []
        self.sources_list = []
        # query the api
        response = self.news_api.get_everything(q=query_string,
                                                sort_by='relevancy')
        # parse at most 5 articles, or fewer if fewer were returned
        parse_length = min(5, len(response['articles']))
        # for each article, record its title, description, URL and source
        for item in range(parse_length):
            article = response["articles"][item]
            self.title_list.append(article["title"])
            self.descriptions_list.append(article["description"])
            self.Urls_list.append(article["url"])
            self.sources_list.append(article['source']['name'])

    def get_URLs(self):
        """This method returns a maximum of 5 news Urls to extract content from (list)"""
        return self.Urls_list

    def get_titles(self):
        """This method returns a maximum of 5 news titles (list)"""
        return self.title_list

    def get_descriptions(self):
        """This method returns a maximum of 5 descriptions (list)"""
        return self.descriptions_list

    def get_sources(self):
        """This method returns the sources of the articles (list)"""
        return self.sources_list
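# Usage sketch for NewsApiHandle above: the key and keyword list are
# illustrative, not from the original.
handle = NewsApiHandle(API_Key='YOUR_NEWSAPI_KEY',
                       keyword_list=['electric', 'vehicles'])
for title, url in zip(handle.get_titles(), handle.get_URLs()):
    print(title, '->', url)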
class NewsURL:
    def __init__(self, start_date, end_date):
        self.API_KEY1 = '9382dd6539f448e59de4ab7c8c214f6f'  # 김민수
        self.API_KEY2 = '08fe48df23494ab0bb4faa1162fee7fa'  # 이명훈
        self.API_KEY3 = '0bc1cc3aff43418ba35488984b6742a4'  # 최범석
        self.API_KEY4 = 'f996355abde44786b91bdef6bc92ee62'  # 이명훈2
        self.API_KEY5 = '2533fbe4f09e4d9dbc51905dcd13d4a3'  # 최범석2

        # Get the sources
        self.tech_newsapi = NewsApiClient(api_key=self.API_KEY1)
        self.sources = self.tech_newsapi.get_sources()
        self.general_newsapi_1 = NewsApiClient(api_key=self.API_KEY2)
        self.general_newsapi_2 = NewsApiClient(api_key=self.API_KEY3)
        self.general_newsapi_3 = NewsApiClient(api_key=self.API_KEY4)
        self.google_newsapi = NewsApiClient(api_key=self.API_KEY5)

        # Make the magazine lists
        self.general_magazine1 = ["ABC News", "Associated Press",
                                  "Business Insider", "CBS News", "CNN"]
        self.general_magazine2 = ["Mashable", "NBC News", "The New York Times",
                                  "Reuters", "The Economist"]
        self.general_magazine3 = ["The Washington Post",
                                  "The Washington Times", "Time", "USA Today"]
        self.tech_magazine = ["Ars Technica", "Engadget", "Hacker News",
                              "TechCrunch", "TechRadar", "The Next Web",
                              "The Verge", "Wired"]

        self.today = datetime.date.today()
        self.start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        self.end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d")
        self.timedelta = int((self.end_date - self.start_date).days) + 1

        # company list
        self.cor_list = pd.read_csv(
            './company_data/Company.csv')['Name'].tolist()

        # Prepare the dated output directories
        for base in ('./source', './backup'):
            day_dir = '{}/{}'.format(base, self.today.strftime("%Y-%m-%d"))
            if not os.path.exists(base):
                os.mkdir(base)
            if not os.path.exists(day_dir):
                os.mkdir(day_dir)

        print("news_crawler start! From: {}, to: {}, {} days".format(
            self.start_date.strftime("%Y-%m-%d"),
            self.end_date.strftime("%Y-%m-%d"), self.timedelta))

    # Get the magazine information
    def make_magazine(self, mode="tech"):
        if mode == "tech":
            # Return the ids of all tech magazines.
            id_list = [s['id'] for s in self.sources['sources']
                       if s['name'] in self.tech_magazine]
        elif mode == "general":
            # Return the ids of the three general-magazine groups,
            # keyed by group name.
            id_list = {'general_magazine1': [], 'general_magazine2': [],
                       'general_magazine3': []}
            for s in self.sources['sources']:
                if s['name'] in self.general_magazine1:
                    id_list['general_magazine1'].append(s['id'])
                elif s['name'] in self.general_magazine2:
                    id_list['general_magazine2'].append(s['id'])
                elif s['name'] in self.general_magazine3:
                    id_list['general_magazine3'].append(s['id'])
        return id_list

    def make_tech_url_list(self):
        # newsapi.get_everything() parameters:
        #   q: Keywords or phrases to search for
        #   sources: A comma-separated string of identifiers (maximum 20)
        #   from_param: A date and optional time for the oldest article
        #       allowed. default: the oldest according to your plan
        #   to: A date and optional time for the newest article allowed.
        #       default: the newest according to your plan
        #   sort_by: The order to sort the articles in.
        #       Possible options: relevancy, popularity, publishedAt
        #   page_size: The number of results to return per page.
        #       20 is the default, 100 is the maximum
        #   page: Use this to page through the results
        start_time = time.time()
        id_list = self.make_magazine(mode="tech")
        columns = ["Magazine", "Date", "Author", "Title", "Url"]
        total_rows = []
        for id in id_list:
            print(id)
            backup_rows = []
            # Query one day at a time over the requested window.
            for i in range(self.timedelta):
                date = (self.start_date
                        + datetime.timedelta(i)).strftime("%Y-%m-%d")
                print(date)
                articles = self.tech_newsapi.get_everything(
                    sources=id, from_param=date, to=date, language="en",
                    page_size=100, page=1)
                for a in articles['articles']:
                    row = {"Magazine": id, "Date": a['publishedAt'],
                           "Author": a['author'], "Title": a['title'],
                           "Url": a['url']}
                    total_rows.append(row)
                    backup_rows.append(row)
            pd.DataFrame(backup_rows, columns=columns).to_csv(
                "./backup/{0}/{0}_{1}.csv".format(
                    self.today.strftime("%Y-%m-%d"), id), index=False)
        pd.DataFrame(total_rows, columns=columns).to_csv(
            "./source/{}/{}_techurl.csv".format(
                self.today.strftime("%Y-%m-%d"),
                self.today.strftime("%Y%m%d")),
            index=False, encoding='utf-8')
        return "success time:{}".format(time.time() - start_time)

    def make_general_url_list(self):
        # See make_tech_url_list() for the get_everything() parameters.
        start_time = time.time()
        start_date = self.start_date.strftime("%Y-%m-%d")
        end_date = self.end_date.strftime("%Y-%m-%d")
        print("{}~{}".format(start_date, end_date))
        id_dict = self.make_magazine(mode="general")
        columns = ["Magazine", "Date", "Author", "Title", "Url", "Company"]
        total_rows = []
        # Each general-magazine group uses its own API client.
        newsapi_by_group = {'general_magazine1': self.general_newsapi_1,
                            'general_magazine2': self.general_newsapi_2,
                            'general_magazine3': self.general_newsapi_3}
        for gm, newsapi in newsapi_by_group.items():
            for id in id_dict[gm]:
                print("Magazine : ", id)
                backup_rows = []
                for query in self.cor_list:
                    print(query)
                    articles = newsapi.get_everything(
                        sources=id, q=query, from_param=start_date,
                        to=end_date, language="en", page_size=100, page=1)
                    for a in articles['articles']:
                        row = {"Magazine": id, "Date": a['publishedAt'],
                               "Author": a['author'], "Title": a['title'],
                               "Url": a['url'], "Company": query}
                        total_rows.append(row)
                        backup_rows.append(row)
                pd.DataFrame(backup_rows, columns=columns).to_csv(
                    "./backup/{0}/{0}_{1}.csv".format(
                        self.today.strftime("%Y-%m-%d"), id), index=False)
        pd.DataFrame(total_rows, columns=columns).to_csv(
            "./source/{}/{}_genurl.csv".format(
                self.today.strftime("%Y-%m-%d"),
                self.today.strftime("%Y%m%d")),
            index=False, encoding='utf-8')
        return "success time:{}".format(time.time() - start_time)

    # crawl google-news urls
    def make_google_url_list(self):
        # See make_tech_url_list() for the get_everything() parameters.
        start_time = time.time()
        start_date = self.start_date.strftime("%Y-%m-%d")
        end_date = self.end_date.strftime("%Y-%m-%d")
        print("{}~{}".format(start_date, end_date))
        columns = ["Magazine", "Date", "Author", "Title", "Url"]
        total_rows = []
        for query in self.cor_list:
            print(query)
            articles = self.google_newsapi.get_everything(
                sources='google-news', q=query, from_param=start_date,
                to=end_date, language="en", page_size=100, page=1)
            print(len(articles['articles']))
            for a in articles['articles']:
                total_rows.append({"Magazine": "google_news",
                                   "Date": a['publishedAt'],
                                   "Author": a['author'],
                                   "Title": a['title'],
                                   "Url": a['url']})
        # Write into the same date-named directory created in __init__.
        pd.DataFrame(total_rows, columns=columns).to_csv(
            "./source/{}/{}_googleurl.csv".format(
                self.today.strftime("%Y-%m-%d"),
                self.today.strftime("%Y%m%d")),
            index=False, encoding='utf-8')
        return "success time:{}".format(time.time() - start_time)
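# Usage sketch for the NewsURL crawler above: crawl a one-week window and
# write the CSVs under ./source/<today>/ and ./backup/<today>/. The dates
# are illustrative; ./company_data/Company.csv must exist.
crawler = NewsURL(start_date='2020-05-01', end_date='2020-05-07')
print(crawler.make_tech_url_list())
print(crawler.make_google_url_list())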
from newsapi.newsapi_client import NewsApiClient
import json
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, SentimentOptions

authenticator = IAMAuthenticator('cIcEmIFE4K73r7kGwzxbR_M-x1peReu3DM9o0WfgcdlO')
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2019-07-12',
    authenticator=authenticator
)
natural_language_understanding.set_service_url(
    'https://api.eu-de.natural-language-understanding.watson.cloud.ibm.com/instances/9d7674b4-e3b9-483d-8d45-be07ba05fc72')

newsapi = NewsApiClient(api_key="c650b6441dd24b2991ec29a1fd13e76c")
news = newsapi.get_everything(q='Politics', language='en')
# Run Watson document-level sentiment analysis on each article URL.
for i in news['articles']:
    response = natural_language_understanding.analyze(
        url=i['url'],
        features=Features(sentiment=SentimentOptions(document=True,
                                                     targets=None))
    ).get_result()
    print(json.dumps(response, indent=2))
from newsapi.newsapi_client import NewsApiClient
import pickle
import pandas as pd
import en_core_web_lg
from spacy.lang.en import punctuation
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

nlp_eng = en_core_web_lg.load()
newsapi = NewsApiClient(api_key='753eb8f70f404f9e8a1a0cb4cce69b2d')

# Fetch five pages of coronavirus articles for the given date window.
articles = []
for i in range(1, 6):
    temp = newsapi.get_everything(q='coronavirus', language='en',
                                  from_param='2021-02-25', to='2021-03-23',
                                  sort_by='relevancy', page=i)
    articles.append(temp)
# print(articles)

# Pickle the raw responses, then reload and re-save them to a second path.
filename = 'articlesCOVID.pck1'
pickle.dump(articles, open(filename, 'wb'))
loaded_model = pickle.load(open(filename, 'rb'))
filepath = 'D:/CPP/Junior Year/Spring 2021/CS 4650/Homework/HW5/articlesCOVID.pck1'
pickle.dump(loaded_model, open(filepath, 'wb'))

dados = []
for i, article in enumerate(articles):
    dados.extend(article['articles'])  # assumed continuation of the truncated loop
class Newsy:
    def __init__(self):
        self.root = tk.Tk()
        self.root.geometry("1920x1080")
        self.root.title("News App")
        self.newsapi = NewsApiClient(api_key=API_KEY)
        self.top_headlines = tk.StringVar()
        self.all_articles = tk.StringVar()
        self.query = None
        self.createAndDisplay()

    def exception_handler(func):
        # Wrap UI callbacks so API/network failures surface in the label
        # instead of crashing the app.
        def wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except Exception:
                self.articles_list.config(
                    text="There was an error in processing your request.")
        return wrapper

    @exception_handler
    def createAndDisplay(self):
        self.articles_list = tk.Label(self.root, text="",
                                      font=('Helvetica', 15))
        self.articles_list.place(relx=0, rely=0.25, relheight='0.7',
                                 relwidth='1')
        title = tk.Label(self.root, text="News App", font=('Modern', 40))
        title.place(relx=0.6, rely=0.01)
        top_headlines = self.newsapi.get_top_headlines(language='en',
                                                       country='in')
        all_articles = self.newsapi.get_everything(
            sources='bbc-news,the-verge',
            domains='bbc.co.uk,techcrunch.com',
            language='en', sort_by='relevancy', page=5)
        top_headlines = [news['url'] for news in top_headlines['articles']]
        all_articles = [news['url'] for news in all_articles['articles']]
        drop = tk.OptionMenu(self.root, self.top_headlines, *top_headlines)
        drop.place(relx=0, rely=0)
        drop2 = tk.OptionMenu(self.root, self.all_articles, *all_articles)
        drop2.place(relx=0, rely=0.1)
        url_button = tk.Button(self.root, text="Select", bd=1,
                               activebackground="#ffee96",
                               command=self.get_url)
        url_button.place(relx=0.8, rely=0)
        headline = tk.Button(self.root, text="India News", bd=1,
                             activebackground="#ffee96",
                             command=self.get_headlines)
        headline.place(relx=0.3, rely=0.2)
        all_news = tk.Button(self.root, text="World News", bd=1,
                             activebackground="#ffee96",
                             command=self.get_news)
        all_news.place(relx=0.4, rely=0.2)
        self.query = tk.Entry(self.root, font=('Courier', 10), bd=0)
        self.query.place(relx=0.5, rely=0.2)
        search = tk.Button(self.root, text="Search", bd=1,
                           activebackground="#ffee96",
                           command=self.search_query)
        search.place(relx=0.65, rely=0.2)

    @exception_handler
    def get_url(self):
        webbrowser.open(self.top_headlines.get())
        webbrowser.open(self.all_articles.get())

    @exception_handler
    def get_headlines(self):
        articles = self.newsapi.get_top_headlines(language='en', country='in')
        self.articles_list.config(text=" ".join(
            news['title'] + "\n" for news in articles['articles']))

    @exception_handler
    def get_news(self):
        articles = self.newsapi.get_everything(
            sources='bbc-news,the-verge',
            domains='bbc.co.uk,techcrunch.com',
            language='en', sort_by='relevancy', page=5)
        self.articles_list.config(text=" ".join(
            news['title'] + "\n" for news in articles['articles']))

    @exception_handler
    def search_query(self):
        articles = self.newsapi.get_everything(q=self.query.get(),
                                               language='en')
        self.articles_list.config(text=" ".join(
            news['title'] + "\n" for news in articles['articles']))
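# Usage sketch: launch the Tk client defined above. Assumes API_KEY is
# defined at module level, as Newsy.__init__ expects.
app = Newsy()
app.root.mainloop()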
# https://newsapi.org/docs/client-libraries/python
from newsapi.newsapi_client import NewsApiClient

apikey = 'eda663f8-e934-42a5-88e2-bd75014130d1'
newsapi = NewsApiClient(api_key=apikey)

target_lst = [
    'museums', 'united ways', 'development and relief services',
    'advocacy and education', 'children and family services'
]
lst = []
for topic in target_lst:  # iterate the list itself, not a call to it
    res = newsapi.get_everything(q=topic, language='en',
                                 sort_by='relevancy', page=2)
    lst.extend(res['articles'])  # collect the articles, not the response keys
print(lst)