from newsapi.articles import Articles
from newsapi.sources import Sources


class Scraper:
    # example code
    # -----------------------
    # x = Scraper(api_key='xyz')
    # print(x.scrape_all_articles(language='en'))
    articles = None
    sources = None
    api_key = None

    def __init__(self, api_key) -> None:
        super().__init__()
        self.api_key = api_key
        self.articles = Articles(API_KEY=self.api_key)
        self.sources = Sources(API_KEY=self.api_key)

    def scrape_articles_for_sources(self, sources):
        '''
        Accepts a list of source ids and returns all articles downloaded
        from the given sources
        :param sources: List of source ids
        :return: List of article json objects, containing: 'author', 'title',
                 'description', 'url', 'urlToImage', 'publishedAt'
        '''
        articles = []
        for source in sources:
            try:
                # list of json objects with keys:
                # author, title, description, url, urlToImage, publishedAt
                articles_for_source = self.articles.get(source=source).articles
            except BaseException:
                # if the server does not respond, skip this source
                continue
            for article in articles_for_source:
                articles.append(article)
        return articles

    def scrape_sources(self, categories=[], language=None):
        '''
        Gets the newsapi sources associated with the given categories (optional)
        and language (optional)
        :param categories: List of categories (optional)
        :param language: Language (optional)
        :return: Set of source ids
        '''
        sources_dict = []
        for category in categories:
            sources_dict += self.sources.get(category, language).sources
        sources = set([source['id'] for source in sources_dict])
        return sources

    def scrape_all_articles(self, categories=[], language=None):
        '''
        Scrapes and returns all articles for the given categories and language
        (both parameters are optional)
        :param categories: List of categories (optional)
        :param language: Language (optional)
        :return: List of article json objects, containing: 'author', 'title',
                 'description', 'url', 'urlToImage', 'publishedAt'
        '''
        return self.scrape_articles_for_sources(
            self.scrape_sources(categories, language))
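# A minimal usage sketch of the Scraper class above. 'YOUR_API_KEY' is a
# placeholder for a real newsapi.org key, and the category list is an
# illustrative assumption.
if __name__ == '__main__':
    scraper = Scraper(api_key='YOUR_API_KEY')
    tech_articles = scraper.scrape_all_articles(categories=['technology'],
                                                language='en')
    for article in tech_articles[:5]:
        print(article['title'], '-', article['url'])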
import pickle

import numpy as np
import pandas as pd
from flask import render_template
from newsapi.articles import Articles
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


def news():
    # The text-cleaning helpers used below (remove_new_lines, remove_stop_words,
    # strip, remove_weird) are assumed to be defined elsewhere in this module.
    with open('X.pkl', 'rb') as f:
        X = pickle.load(f)
    with open('y.pkl', 'rb') as f:
        y = pickle.load(f)

    # Generating the training and testing dataset
    count_vectorizer = CountVectorizer()
    X = count_vectorizer.fit_transform(X)  # Fit the data

    # A non-zero test split is assumed here; test_size=0.0 would leave no
    # test data and is rejected by recent scikit-learn releases.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Logistic regression classifier
    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    apikey = 'c9c0b7a1fc944a02bdadda8c09dace91'
    a = Articles(API_KEY=apikey)
    data = a.get(source="abc-news-au", sort_by='top')
    data = pd.DataFrame.from_dict(data)
    data = pd.concat(
        [data.drop(['articles'], axis=1), data['articles'].apply(pd.Series)],
        axis=1)
    description = data['description']

    def pre(x):
        data1 = str(x)
        data1 = remove_new_lines(data1)
        data1 = remove_stop_words(data1)
        data1 = strip(data1)
        data1 = remove_weird(data1)
        data1 = np.array(data1).reshape(-1)
        vect = count_vectorizer.transform(data1)
        my_prediction1 = clf.predict(vect)
        return my_prediction1

    pred0 = pre(description[0])
    pred1 = pre(description[1])
    pred2 = pre(description[2])
    pred3 = pre(description[3])
    pred4 = pre(description[4])

    return render_template('news.html',
                           des0=description[0], des1=description[1],
                           des2=description[2], des3=description[3],
                           des4=description[4],
                           pred0=pred0, pred1=pred1, pred2=pred2,
                           pred3=pred3, pred4=pred4)
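# Hypothetical wiring for the news() view above: the call to render_template
# implies a Flask app, so one is assumed here; the '/news' route path is an
# illustrative choice, not taken from the original code.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/news', view_func=news)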
def get_news(sources):
    # Builds Messenger generic-template elements for the latest articles from
    # the given source id. Template is assumed to come from the Messenger
    # wrapper used by this project (e.g. fbmq).
    NEWS_API_KEY = '1bae2e39f2b540f3a15dbbcb269eba9b'
    articles = Articles(API_KEY=NEWS_API_KEY)
    info = articles.get(source=sources)
    news_objects = []
    for article in info['articles']:
        headline = article['title']
        body = article['description']
        url_web = article['url']
        image = article['urlToImage']
        time = article['publishedAt']  # fetched but not used below
        news_objects.append(
            Template.GenericElement(
                title=headline,
                subtitle=body,
                item_url=url_web,
                image_url=image,
                buttons=[Template.ButtonWeb(title='Open in web', url=url_web)]))
    return news_objects
from newsapi.articles import Articles
from newsapi.sources import Sources

key = '96af62a035db45bda517a9ca62a25ac3'
a, s = Articles(API_KEY=key), Sources(API_KEY=key)

s.all()  # get all sources offered by newsapi
a.get(source='the-new-york-times')
s.get(category='technology', language='en', country='US')

# Alternative interface: the package-level NewsAPI client
from newsapi import NewsAPI

key = '96af62a035db45bda517a9ca62a25ac3'
params = {}
api = NewsAPI(key)
sources = api.sources(params)
articles = api.articles(sources[0]['id'], params)

################ NY Times API #############################################
import sys
import csv
import json

# reload()/sys.setdefaultencoding() exist only on Python 2; this block
# targets that interpreter.
reload(sys)
sys.setdefaultencoding('utf8')

"""
About: Python wrapper for the New York Times Archive API
https://developer.nytimes.com/article_search_v2.json
"""


class APIKeyException(Exception):
    def __init__(self, message):
        self.message = message
import newsapi
import requests
import json
import os

from newsapi.articles import Articles
from newsapi.sources import Sources

a = Articles(API_KEY="537b165a4f314fedae8cb39788d4d713")
s = Sources(API_KEY="537b165a4f314fedae8cb39788d4d713")

res = a.get(source="daily-mail")['articles']
bbc = a.get(source="bbc-news")['articles']
telegraph = a.get(source="the-telegraph")['articles']
guardian = a.get(source="the-guardian-uk")['articles']
independent = a.get(source="independent")['articles']
sports = a.get(source="the-sport-bible")['articles']

# results = s.get_by_country("gb").sources
#
# s.get_by_category("politics")

# resultsString = ''.join(str(e) for e in results)

# filename = 'news_stream.py'
# with open(filename, 'a') as file:
#     for result in independent:
#         print(result['title'])
#         # If you want other things from the tweet object you can specify it here
#         file.write(result['title'] + os.linesep)
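# A quick sketch of one way to use the article lists fetched above: print the
# headline and link of each BBC story (field names follow the newsapi article
# schema used throughout this file).
for article in bbc:
    print(article['title'], '->', article['url'])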
class ReporterModule(BaseModule):
    AFFIRMATIVE = ["YES", "YEAH", "SURE", "YAH", "YA"]
    NEGATIVE = ["NO", "NEGATIVE", "NAH", "NA", "NOPE"]

    def __init__(self, *args):
        super(ReporterModule, self).__init__(*args)
        self.API_KEY = self.get_configuration("newsapi.org_key")
        self.threshold = int(self.get_configuration("news_limit"))
        if self.API_KEY:
            self.articles = Articles(self.API_KEY)
            self.sources = Sources(self.API_KEY)
        else:
            print(
                "Kindly look back at the documentation to configure the news "
                "module properly, especially the API keys."
            )
            # __init__ must return None, so simply bail out here
            return
        self.sources_url = {}
        self.sources.information()

    def get_all_categories(self):
        return list(self.sources.all_categories())

    def get_by_category(self, category):
        srcs = self.sources.get_by_category(category).sources
        self.sources_url = {}
        for src in srcs:
            self.sources_url[src['name']] = src['url']
        return self.sources_url

    def get_sort_bys_of_source(self, source_name):
        return self.sources.search(source_name)[0]['sortBysAvailable']

    def all_sources(self):
        self.sources_url = self.sources.all_names()
        return self.sources_url

    def get_news(self):
        self.assistant.say(
            "Would you prefer any specific category? If yes, then what would it be?"
        )
        category_status = self.assistant.listen().decipher()
        if category_status.upper() in self.NEGATIVE:
            category = False
        else:
            categories = self.get_all_categories()
            category = self.search(categories, category_status)
        self.assistant.say(
            "Any preference you would like to have about the source of your news? "
            "Like CNN, Time magazine, or maybe The Hindu?")
        source_status = self.assistant.listen().decipher()
        if source_status.upper() in self.NEGATIVE:
            source = False
        else:
            if category:
                sources_available = self.get_by_category(category)
                response = "Out of all the sources as follows:"
                for source_name, source_url in sources_available.items():
                    response += " %s," % source_name
                response += " which one would you like to pick?"
                self.assistant.say(response)
                source_command = self.assistant.listen().decipher()
                source = self.search(list(sources_available), source_command)
            else:
                self.assistant.say(
                    "So would you want me to list all the sources, around 70, "
                    "which to be honest would be a hefty task? If not, then just "
                    "let me know your source name and I will let you know whether "
                    "it's available or not."
                )
                all_sources_status = self.assistant.listen().decipher()
                sources_available = self.all_sources()
                if all_sources_status.upper() in self.AFFIRMATIVE:
                    response = "Good job, lazy ass, so here are all the available sources as follows:"
                    sources_available_list = list(sources_available)
                    for source_name in sources_available_list:
                        response += " %s," % source_name
                    response += " which one would you like to pick?"
                    self.assistant.say(response)
                    source_command = self.assistant.listen().decipher()
                    all_sources_status = source_command
                source_found = self.search(list(sources_available),
                                           all_sources_status)
                source = source_found
        if source:
            sort_bys_available = self.get_sort_bys_of_source(source)
            if len(sort_bys_available) == 1:
                sort_by = sort_bys_available[0]
            else:
                if len(sort_bys_available) == 2:
                    response = "And what kind of news sort would you like? " \
                               "%s or %s?" % (sort_bys_available[0],
                                              sort_bys_available[1])
                else:
                    response = "And what kind of news sort would you like? " \
                               "%s or %s, or maybe %s?" % (sort_bys_available[0],
                                                           sort_bys_available[1],
                                                           sort_bys_available[2])
                self.assistant.say(response)
                sort_by_command = self.assistant.listen().decipher()
                sort_by = self.search(sort_bys_available, sort_by_command)
        else:
"latest or maybe top ones shown in front page?") sort_status_command = self.assistant.listen().decipher() sort_by = self.search(['top', 'popular' 'latest'], sort_status_command) if not source: if sort_by.lower() == "top": source = "google-news" elif sort_by.lower() == "latest": source = "the-telegraph" else: source = "time" response = self.get_response(source, sort_by) return response def handle(self): source = self.get_configuration("news_source") response = self.get_response(source) return response def get_response(self, source, sort_by=None, threshold=5): if self.threshold: threshold = self.threshold source = source.lower().replace(" ", "-") articles = self.articles.get(source, sort_by=sort_by).articles articles = articles[:threshold] response = "So the %s news from %s news source are as follows " % ( sort_by, source) for article in articles: if article['title']: response += "%s, " % article['title'] if article['description']: response += "%s, " % article['description'] if article['author']: response += "was reported by %s." % article['author'] response += "and in the other news. " return response @staticmethod def search(dataset, query): values = [0 for _ in range(0, len(dataset))] search = query.lower().split() upper_threshold = len(search) for index, data in enumerate(dataset): search_array = data.split() for index2, text in enumerate(search_array): if index2 >= upper_threshold: break threshold = len(search[index2]) for i in range(0, len(text)): if i >= threshold - 1: break if text[i] == search[index2][i]: values[index] += 1 max_value = max(values) max_index = values.index(max_value) return dataset[max_index]
class ReporterModule(BaseModule):
    AFFIRMATIVE = ["YES", "YEAH", "SURE", "YAH", "YA"]
    NEGATIVE = ["NO", "NEGATIVE", "NAH", "NA", "NOPE"]

    def __init__(self, *args):
        super(ReporterModule, self).__init__(*args)
        self.API_KEY = self.get_configuration("newsapi.org_key")
        self.threshold = int(self.get_configuration("news_limit"))
        if self.API_KEY:
            self.articles = Articles(self.API_KEY)
            self.sources = Sources(self.API_KEY)
        else:
            print(_("error.news.configuration"))
            # __init__ must return None, so simply bail out here
            return
        self.sources_url = {}
        self.sources.information()

    def get_all_categories(self):
        return list(self.sources.all_categories())

    def get_by_category(self, category):
        srcs = self.sources.get_by_category(category).sources
        self.sources_url = {}
        for src in srcs:
            self.sources_url[src['name']] = src['url']
        return self.sources_url

    def get_sort_bys_of_source(self, source_name):
        return self.sources.search(source_name)[0]['sortBysAvailable']

    def all_sources(self):
        self.sources_url = self.sources.all_names()
        return self.sources_url

    def get_news(self):
        self.assistant.say(_("news.category.ask"))
        category_status = self.assistant.listen().decipher()
        if category_status.upper() in self.NEGATIVE:
            category = False
        else:
            categories = self.get_all_categories()
            category = self.search(categories, category_status)
        self.assistant.say(_("news.sources.ask"))
        source_status = self.assistant.listen().decipher()
        if source_status.upper() in self.NEGATIVE:
            source = False
        else:
            if category:
                sources_available = self.get_by_category(category)
                response = _("news.sources.list")
                for source_name, source_url in sources_available.items():
                    response += " %s," % source_name
                response += _("news.sources.select")
                self.assistant.say(response)
                source_command = self.assistant.listen().decipher()
                source = self.search(list(sources_available), source_command)
            else:
                self.assistant.say(_("news.sources.all.ask"))
                all_sources_status = self.assistant.listen().decipher()
                sources_available = self.all_sources()
                if all_sources_status.upper() in self.AFFIRMATIVE:
                    response = _("news.sources.all")
                    sources_available_list = list(sources_available)
                    for source_name in sources_available_list:
                        response += " %s," % source_name
                    response += _("news.sources.select")
                    self.assistant.say(response)
                    source_command = self.assistant.listen().decipher()
                    all_sources_status = source_command
                source_found = self.search(list(sources_available),
                                           all_sources_status)
                source = source_found
        if source:
            sort_bys_available = self.get_sort_bys_of_source(source)
            if len(sort_bys_available) == 1:
                sort_by = sort_bys_available[0]
            else:
                if len(sort_bys_available) == 2:
                    response = _("news.sort.two_options").format(
                        sort_bys_available[0], sort_bys_available[1])
                else:
                    response = _("news.sort.three_options").format(
                        sort_bys_available[0],
                        sort_bys_available[1],
                        sort_bys_available[2],
                    )
                self.assistant.say(response)
                sort_by_command = self.assistant.listen().decipher()
                sort_by = self.search(sort_bys_available, sort_by_command)
        else:
            self.assistant.say(_("news.sort.described_options"))
            sort_status_command = self.assistant.listen().decipher()
            sort_by = self.search(['top', 'popular', 'latest'],
                                  sort_status_command)
        if not source:
            if sort_by.lower() == "top":
                source = "google-news"
            elif sort_by.lower() == "latest":
                source = "the-telegraph"
            else:
                source = "time"
        response = self.get_response(source, sort_by)
        return response

    def handle(self):
        source = self.get_configuration("news_source")
        response = self.get_response(source)
        return response

    def get_response(self, source, sort_by=None, threshold=5):
        if self.threshold:
            threshold = self.threshold
        source = source.lower().replace(" ", "-")
        articles = self.articles.get(source, sort_by=sort_by).articles
        articles = articles[:threshold]
        response = _("news.report").format(sort_by, source)
        for article in articles:
            if article['title']:
                response += "%s, " % article['title']
            if article['description']:
                response += "%s, " % article['description']
            if article['author']:
                response += _("news.report.by").format(article['author'])
            response += _("news.report.continue")
        return response

    @staticmethod
    def search(dataset, query):
        values = [0 for _ in range(0, len(dataset))]
        search = query.lower().split()
        upper_threshold = len(search)
        for index, data in enumerate(dataset):
            search_array = data.split()
            for index2, text in enumerate(search_array):
                if index2 >= upper_threshold:
                    break
                threshold = len(search[index2])
                for i in range(0, len(text)):
                    if i >= threshold - 1:
                        break
                    if text[i] == search[index2][i]:
                        values[index] += 1
        max_value = max(values)
        max_index = values.index(max_value)
        return dataset[max_index]
import newsapi
import numpy
import pandas as pd
from newsapi.articles import Articles

apikey = '455e01c84ca44ff387187f10f202bed3'
a = Articles(API_KEY=apikey)
data = a.get(source="the-new-york-times", sort_by='top')
# print(data)  # raw news data
# -----------------------------------------------------------
data = pd.DataFrame.from_dict(data)
data = pd.concat(
    [data.drop(['articles'], axis=1), data['articles'].apply(pd.Series)],
    axis=1)
# data.head()

# drop unused columns; display only title and description
news_df = data.drop(columns=[
    'status', 'source', 'sortBy', 'author', 'url', 'urlToImage', 'publishedAt'
])
# print(news_df)

print("---------------------------------------------------------------------")
print("---------------------------------------------------------------------")
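# A short sketch of one way to display the remaining columns row by row
# (after the drop above, news_df keeps the title and description fields).
for _, row in news_df.iterrows():
    print(row['title'])
    print(row['description'])
    print('-' * 40)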