def getNews(query):
    """Search Google News for *query* and return up to 6 article dicts.

    Each dict carries 'title', 'description' and 'link'; the first result
    additionally carries an 'image' key. The client's result cache is
    cleared before returning.
    """
    googleNews = GoogleNews()
    googleNews.search(query)
    news = []
    # Cap the output at 6 articles (fewer if the search returned fewer).
    limit = min(len(googleNews.result()), 6)
    for i, result in enumerate(googleNews.result()):
        # Original used `i > limit`, which let a 7th item through.
        if i >= limit:
            break
        item = {
            "title": result['title'],
            "description": result['desc'],
            "link": result['link'],
        }
        if i == 0:
            # Only the lead article gets an image.
            item["image"] = result['img']
        news.append(item)
    googleNews.clear()
    return news
def news(str):
    """Voice-driven news reader: ask for a topic and a page, then speak results.

    NOTE(review): the parameter name `str` shadows the builtin but is kept
    for interface compatibility with existing callers.
    Relies on module-level `spacek` (text-to-speech), `takecommend`
    (speech-to-text) and the global retry flag `i`.
    """
    global i
    if i == 0:
        spacek(f"ofcures {str} which news you want to listen")
    else:
        spacek(f"which news you want to listen{str}")
    try:
        query = takecommend().lower()
        query = query.replace('about', "")
        spacek("which page you want ot listen")
        page = int(takecommend())
        # Single instantiation; previously a throwaway GoogleNews() was
        # created and immediately replaced.
        googlenews = GoogleNews('en', "2")  # here you can use d which is denoted for how much linw you want to lesiten
        googlenews.search(query)
        googlenews.getpage(page)
        googlenews.result()
        spacek(f" {str} here is news about ")
        spacek(query)
        print(googlenews.gettext())
        spacek(googlenews.gettext())
    except Exception:
        # Was `except Exception as s`, which clobbered the query variable.
        spacek(f"could not understand {str} what did you say say it again")
        i = 1
        # Retry by recursing; bounded only by the user eventually succeeding.
        news(str)
def getPolarity(uniName):
    """Return the mean TextBlob sentiment polarity of Coronavirus news
    articles about *uniName* (fixed window 08/01/2020-09/26/2020).

    Articles that fail to download/parse are skipped. Returns 0.0 when no
    article could be analyzed.
    """
    from GoogleNews import GoogleNews
    from newspaper import Article
    from newspaper import Config
    import pandas as pd
    from textblob import TextBlob
    query = uniName + ' Coronavirus'
    # Some sites reject the default UA, so present a browser UA.
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    googlenews = GoogleNews(start='08/01/2020', end='09/26/2020')
    googlenews.search(query)
    # result() accumulates, so fetching pages 0-4 grows one combined list.
    for page in range(0, 5):
        googlenews.getpage(page)
    df = pd.DataFrame(googlenews.result())
    total = 0.0  # was `sum`, which shadowed the builtin
    count = 0    # was initialized to 1, skewing the average (divided by n+1)
    for ind in df.index:
        try:
            article = Article(df['link'][ind], config=config)
            article.download()
            article.parse()
            article.nlp()
            total += TextBlob(article.summary).sentiment.polarity
            count += 1
        except Exception:
            # Best-effort: skip articles that fail to download or parse.
            continue
    if count == 0:
        return 0.0
    return total / count
def get_news():
    """Collect Coronavirus articles from the last 5 days (results pages 1-2).

    Returns a list of dicts keyed title/source/date&time/desc/link, keeping
    only entries whose description is non-empty.
    """
    today = datetime.today()
    end_str = str(today.strftime('%m/%d/%Y'))
    start_str = str((today - timedelta(days=5)).strftime('%m/%d/%Y'))
    client = GoogleNews(start=start_str, end=end_str)
    client.search('Coronavirus')
    client.getpage(1)
    page_one = client.result()
    client.getpage(2)
    page_two = client.result()
    news_list = []
    for entry in page_one + page_two:
        if entry['desc'] == '':
            continue
        news_list.append({
            'title': entry['title'],
            'source': entry['media'],
            'date&time': entry['date'],
            'desc': entry['desc'],
            'link': entry['link'],
        })
    return news_list
def todaysNews(str):
    """Fetch page 1 of today's Google News results for *str* and return the
    headline texts from gettext().

    NOTE(review): the parameter name `str` shadows the builtin but is kept
    for interface compatibility with existing callers.
    """
    # Single instantiation; previously a plain GoogleNews() was created and
    # immediately discarded.
    googlenews = GoogleNews('en', 'd')
    googlenews.search(str)
    googlenews.getpage(1)
    googlenews.result()
    return googlenews.gettext()
def news():
    """Read a topic from the GUI entry widget, fetch matching headlines,
    show them in the output widget, and read them aloud via SAPI."""
    topic = entry.get()
    # Single instantiation; previously a plain GoogleNews() was created and
    # immediately discarded.
    googlenews = GoogleNews('en', 'd')
    googlenews.search(topic)
    googlenews.getpage()
    googlenews.result()
    a = googlenews.gettext()
    output.insert(END, a)
    # calling this dispatch method helps to interact with Microsoft Speech SDK to speak
    speak = Dispatch( "SAPI.SpVoice" )
    speak.Speak(a)
def news():
    """Ask the user what news to hear, fetch page 1 of matching headlines,
    and speak one at random (or report that none were found)."""
    speak("What kind of news would you like to hear ?")
    topic = takeCommand()  # was `type`, which shadowed the builtin
    # Single instantiation; previously a plain GoogleNews() was created and
    # immediately discarded.
    googleNews = GoogleNews(lang = 'en')
    googleNews.search(topic)  # will search the kind we want to hear
    googleNews.getpage(1)  # page number of news
    googleNews.result()
    headlines = googleNews.gettext()  # was `list`, which shadowed the builtin
    if len(headlines) > 0:
        speak(random.choice(headlines))
    else:
        speak("No news related to this topic.")
def fetch_articles(self):
    """Scrape Google News for self.ticker one day at a time from
    self.history_start through today, then download and parse each unique
    article link. Returns the list of parsed articles."""
    # how many pages to scrape
    pages_to_scrape = 1
    links = []
    # how many days from last update
    # TODO: look for the last update datetime in the DB
    days_from_last_update = (datetime.datetime.today() - self.history_start).days
    # for each day between start date and today:
    for offset in range(days_from_last_update + 1):
        day = self.history_start + datetime.timedelta(days=offset)
        stamp = day.strftime("%m/%d/%Y")
        client = GoogleNews(start=stamp, end=stamp)
        client.search(self.ticker)
        # iterate pages_to_scrape pages of Google News
        for page in range(pages_to_scrape):
            client.getpage(page)
            links += client.result()
    # drop duplicate links, keeping just the URL strings
    links = list({item['link'] for item in links})
    # for each link (without dups) get the article and its metadata
    articles = []
    for link in links:
        try:
            articles.append(self.download_and_parse_article(link))
        except Exception as e:
            print(e)
    return articles
def googleNewsCrawler(self):
    """Crawl Google News day-by-day for self.__keyWords and dump batches of
    results to JSON via self.toJson().

    Each iteration searches a window of [dateTime+i, dateTime+i+daysSpan]
    days, fetches extra result pages, and appends to result_list; every 10
    days the accumulated list is flushed to JSON to bound memory.
    """
    result_list = []
    googlenews = GoogleNews()
    for i in range(self.__numDays):
        startDateTime = self.__dateTime + timedelta(days=i)
        endDateTime = self.__dateTime + timedelta(days=i + self.__daysSpan)
        # GoogleNews expects m/d/yyyy strings for the time range.
        googlenews.setTimeRange(
            start=str(startDateTime.month) + '/' + str(startDateTime.day) + '/' + str(startDateTime.year),
            end=str(endDateTime.month) + '/' + str(endDateTime.day) + '/' + str(endDateTime.year))
        googlenews.search(self.__keyWords)
        # Fetch additional pages; getpage(j + 2) starts from page 2 because
        # search() already retrieved page 1.
        for j in range(self.__pagsEveryDay - 1):
            googlenews.getpage(j + 2)
        # NOTE(review): the "* 10" assumes 10 results per page — confirm.
        logging.info(
            str(self.__keyWords + '__' + str(startDateTime.date()) + " append " + str(int(self.__pagsEveryDay * 10)) + " items"))
        result_list = result_list + googlenews.result()
        googlenews.clear()
        # Flush every 10 days to avoid holding everything in memory.
        if (i + 1) % 10 == 0:
            self.toJson(result_list)
            result_list = []
            continue
    # Flush whatever remains after the final (partial) batch.
    self.toJson(result_list)
def get_news(ticker):
    """Return recent (2-day window) Google News articles about *ticker*.

    Returns:
        list of dicts with title/media/date/description/link/datetime keys.

    Raises:
        Exception("Stock Not Found") if the ticker cannot be resolved.
        Exception("News Error") if the news lookup or parsing fails.
    """
    try:
        # Validates the ticker; the returned data itself is not used here.
        stock_api.get_stock_data(ticker)
    except Exception as exc:
        # Was a bare `except:`; narrowed and chained so the cause survives.
        raise Exception("Stock Not Found") from exc
    try:
        googlenews = GoogleNews(period='2d')
        googlenews.search(ticker)
        return [
            {
                'title': item['title'],
                'media': item['media'],
                'date': item['date'],
                'description': item['desc'],
                'link': item['link'],
                'datetime': item['datetime'],
            }
            for item in googlenews.result()
        ]
    except Exception as exc:
        raise Exception("News Error") from exc
def googlenews_extract(date_range, num_pages, search_text):
    '''
    Use googlenews package to extract top 30 stories per day based on search string.

    Args:
        date_range: iterable of dates (mm/dd/yyyy strings) to query one by one.
        num_pages: how many result pages to fetch per date.
        search_text: the query string.

    Returns:
        DataFrame of all results with a 'date_calendar' column, the raw
        'date' column dropped.
    '''
    df_days = []
    # loop through date range to ensure equal sample size from each day
    # TODO: if we want to pull multiple years of data, perhaps add
    # multi-threading...not necessary for < ~20 calls
    for date in date_range:
        # (removed a dead `result = []` local that was never read)
        googlenews = GoogleNews(start=date, end=date)
        googlenews.search(search_text)
        print("Search Date = ", date)
        for i in range(0, num_pages):
            print('Executing GoogleNews call #', i + 1)
            googlenews.getpage(i)
            # result() is cumulative, so this includes all pages so far.
            result_next = googlenews.result()
            print("Total records returned: ", len(result_next))
        df = pd.DataFrame(result_next)
        df['date_calendar'] = date
        df_days.append(df)
    appended_data = pd.concat(df_days)
    df_news = appended_data.reset_index(drop=True).drop(['date'], axis=1)
    return df_news
def scrapeTitles(self, num=0):
    '''
    Inputs:
        num --> finds at least num titles
    Outputs:
        Sets self.titleList to (up to) num unique cleaned titles; also
        advances self.start as it widens the search window.
    How:
        Makes API calls to Google News, widening the date window one day at
        a time (at most 7 retries), cleans the title list down to just
        words, and strips out any titles that do not contain the ticker.
    '''
    found = 0
    titles = []
    end = self.end
    start = self.start
    # max start can be reduced by is 7 days
    tries = 0
    while found <= num:
        # Stop when the dates become invalid or we exhausted the 7-day widening.
        if not self.validDates() or tries > 7:
            break
        googlenews = GoogleNews(start=start, end=end)
        googlenews.search(self.ticker)
        result = googlenews.result()
        if len(result) == 0:
            break
        df = pd.DataFrame(result)
        if len(df) > 0:
            # Consider both titles and descriptions as candidate "titles".
            self.titleList = df['title'].tolist() + df['desc'].tolist()
            self.clean()
            self.stripTitleList()
            # Debug output showing whether stripping emptied the list.
            # NOTE(review): if stripTitleList() can leave titleList as None,
            # the loop below would raise — confirm its contract.
            if self.titleList is not None:
                print(self.start, self.end)
                print("after stripTitleList: Not None")
            else:
                print(self.start, self.end)
                print("after stripTitleList: None")
            # Deduplicate while counting how many unique titles we found.
            for t in self.titleList:
                if t not in titles:
                    titles.append((t))
                    found += 1
        # Widen the window backwards one day and try again.
        start = self.reduceDate(start, 1)
        tries += 1
    self.start = start
    self.titleList = titles[:num]
def __init__(self, politician_name):
    """Initialize an object representing an article.

    Fetches recent (3-day) Ukrainian-language Google News results for the
    politician, scrapes a Google Images search page for thumbnails, and
    builds self.articles as [title, link, image] triples for 5 articles.
    """
    news = GoogleNews()
    news.setlang("uk")       # Ukrainian-language results
    news.setencode("utf-8")
    news.setperiod("3d")     # only the last 3 days
    news.search(politician_name)
    info = news.result()
    self.articles = []
    name, surname = politician_name.split()[0], politician_name.split()[1]
    # Google Images search URL ("новини" is Ukrainian for "news").
    self.link = f"https://www.google.com/search?q=+{name}+{surname}+новини&source=lnms&tbm=isch"

    def get_data(self):
        # Fetch the raw HTML of the image-search page.
        r = requests.get(self.link)
        return r.text

    html_data = get_data(self)
    soup = BeautifulSoup(html_data, "html.parser")
    # Collect the first 6 <img> src values from the page.
    image_links, num = [], 0
    for item in soup.find_all("img"):
        image_links.append(item["src"])
        num += 1
        if num == 6:
            break
    # Pair the first 5 articles with images 1..5 (image 0 is skipped).
    # NOTE(review): assumes at least 5 news results and 6 images were found;
    # fewer would raise IndexError — confirm upstream guarantees.
    for i in range(5):
        text = info[i]
        info_list = [text["title"], text["link"], image_links[i + 1]]
        self.articles.append(info_list)
def google_new_scrape(keyword=0, earliest_date="2000-01-01", end_date=""):
    """Scrape Google News articles between earliest_date and end_date
    (ISO 'YYYY-MM-DD' strings), download each article, and write
    Date/Media/Title/Article/Summary rows to googlenews.csv.

    keyword: search term; the historical default (0/falsy) searches 'trump'.
    """
    # Convert ISO input to the mm/dd/yyyy form GoogleNews expects. The old
    # code computed this into a typo'd variable (`ealiest_date`) and then
    # passed the unconverted ISO string to GoogleNews.
    start_fmt = dt.strptime(earliest_date, "20%y-%m-%d").strftime("%m/%d/20%y")
    if end_date != "":
        end_fmt = dt.strptime(end_date, "20%y-%m-%d").strftime("%m/%d/20%y")
        googlenews = GoogleNews(start=start_fmt, end=end_fmt)
    else:
        googlenews = GoogleNews(start=start_fmt)
    # Previously the keyword parameter was ignored and 'trump' hard-coded;
    # keep 'trump' as the fallback for the default value.
    googlenews.search(keyword if keyword else 'trump')
    for page in range(1, 1000):
        googlenews.getpage(page)
        results = googlenews.result()  # cumulative across pages
        print(len(results), results)
    df = pd.DataFrame(results)
    rows = []  # was `list`, which shadowed the builtin
    for ind in df.index:
        article = Article(df['link'][ind])
        article.download()
        article.parse()
        # article.nlp()  # kept disabled as before; summary stays empty without it
        rows.append({  # each row was built in a `dict` variable shadowing the builtin
            'Date': df['date'][ind],
            'Media': df['media'][ind],
            'Title': article.title,
            'Article': article.text,
            'Summary': article.summary,
        })
    news_df = pd.DataFrame(rows)
    print(news_df)
    file_name = 'googlenews.csv'
    news_df.to_csv(file_name)
def get_company_news_link(company='NaN', news_num=5, time_range='today'):
    """Return up to *news_num* Google News links about *company*.

    time_range: 'today' for current news, or a 'YYYY/MM/DD YYYY/MM/DD'-style
    span that is split into start/end dates. Returns an error string when no
    company is given or (for a non-empty request) no results are found.
    """
    if company == 'NaN':
        return 'please input company name'
    client = GoogleNews()
    client.clear()
    if time_range != 'today':
        # Slice the combined range string into start/end mm/dd/yyyy dates.
        start_date = "{1}/{2}/{0}".format(time_range[0:4], time_range[5:7], time_range[8:10])
        end_date = "{1}/{2}/{0}".format(time_range[11:15], time_range[16:18], time_range[19:21])
        client.set_time_range(start_date, end_date)
    client.search(company)
    hits = client.result()
    # No hits at all (and at least one was requested): report the outage.
    if not hits and news_num > 0:
        return '此時段無' + company + '新聞 OR 網路不穩'
    return [hit['link'] for hit in hits[:news_num]]
def testResultHasDate(self):
    """Search results should carry a non-empty 'date' field."""
    googlenews = GoogleNews()
    googlenews.search(keyword)
    result = googlenews.result()[0]
    print(result.get('date').lower())
    # assertIsNot compared object identity, which is not a reliable
    # emptiness check; assertNotEqual compares the actual value.
    self.assertNotEqual('', result.get('date').lower())
    print('Result date is not empty')
def testResultNumberWithTwoPages(self):
    """After fetching a second page, the cumulative result list holds 20 items."""
    client = GoogleNews()
    client.search(keyword)
    client.getpage(2)
    self.assertEqual(len(client.result()), 20)
    print('Result length with two pages is correct')
def getTitles(self, ticker, start, end):
    """Return a pandas Series of article titles for *ticker* in [start, end]."""
    client = GoogleNews(start=start, end=end)
    client.search(ticker)
    frame = pd.DataFrame(client.result())
    return frame['title']
def testResultHasLink(self):
    """The first search result should contain an http(s) URL."""
    client = GoogleNews()
    client.search(keyword)
    first = client.result()[0]
    link = first.get('link').lower()
    print(link)
    self.assertIn('http', link)
    print('Result contains http link')
def testResultHasImage(self):
    """The first search result should carry a base64-encoded image."""
    client = GoogleNews()
    client.search(keyword)
    first = client.result()[0]
    img = first.get('img').lower()
    print(img)
    self.assertIn('base64', img)
    print('Result contains image')
def index(request):
    """Render index.html with Google News results for Shailene Woodley."""
    client = GoogleNews()
    client.search('Shailene Woodley')
    context = {'news': client.result()}
    return render(request, 'index.html', context)
def testResultContainsKeyword(self):
    """The first result's description should mention the search keyword."""
    client = GoogleNews()
    client.search(keyword)
    first = client.result()[0]
    desc = first.get('desc').lower()
    print(desc)
    self.assertIn(keyword.lower(), desc)
    print('Result contains keyword')
def google_scrape(entity, start_date, end_date, days_per_period=7):
    '''
    Scrap (using GoogleNews API) the top 10 headlines of google news on a
    particular entity, weekly, over a given time range.

    Output: Pandas Dataframe with datetime, title, excerpt, domain (news
    origin), and article url; a blank 'source_id' column is appended.
    '''
    # set timer to wait 60s before scraping (google scraper has limit to scraping)
    time.sleep(60)
    # calculate the number of weeks between start and end date (inclusive)
    n_periods = (end_date - start_date).days // days_per_period + 2
    # divide the dates into date_periods (query top 10 for each week)
    date_range = pd.date_range(start_date, end_date, periods=n_periods)
    # create result df with columns
    result_df = pd.DataFrame(columns=['date_time', 'title', 'excerpt', 'domain', 'article_url'])
    # go through the date ranges and retrieve top 10
    for i in range(len(date_range)-1):
        start_temp = date_range[i]
        end_temp = date_range[i+1]
        news = GoogleNews(start=start_temp.strftime("%m/%d/%Y"),end=end_temp.strftime("%m/%d/%Y"), lang='en', encode='utf-8')
        news.search(f"{entity}") # Main bulk of time, taking ~2 seconds to search
        if pd.DataFrame(news.result()).empty: # No relevant articles
            continue
        # retrieve relevant news results
        temp_df = pd.DataFrame(news.result())[['date', 'title', 'desc', 'media', 'link']]
        # rename columns
        temp_df.columns = ['date_time', 'title', 'excerpt', 'domain', 'article_url']
        # only get headlines which mention the entity of interest
        temp_df = temp_df[temp_df['title'].str.contains(entity,flags=re.IGNORECASE)].reset_index(drop=True)
        # normalize the raw date strings via the module-level date_convert helper
        temp_df['date_time'] = temp_df.date_time.apply(date_convert)
        # remove rows without datetime
        temp_df = temp_df.dropna(axis=0, subset=["date_time"])
        # combine result df
        result_df = pd.concat([result_df, temp_df])
    # placeholder column filled in downstream
    result_df["source_id"] = ""
    return result_df
def extract_google(query_terms, startDate, endDate):
    """Crawl Google News for 'India Technology <query>' articles in a date range.

    Args:
        query_terms: iterable of search terms.
        startDate / endDate: ISO 'YYYY-MM-DD' strings; an empty string means
            "last 7 days" (start defaults to a week ago, end to today).

    Returns:
        list of dicts with source, url, date, title, content and img keys.
    """
    # Default to the last week. The previous defaults were generated in
    # %d/%m/%Y (which the %Y-%m-%d parse below rejected with ValueError)
    # and were also reversed (start was today, end a week earlier).
    if len(startDate) == 0:
        startDate = (datetime.datetime.today().date() - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
    if len(endDate) == 0:
        endDate = datetime.datetime.today().strftime('%Y-%m-%d')
    # Convert to the dd/mm/yy form passed to setTimeRange below.
    startDate = datetime.datetime.strptime(startDate, '%Y-%m-%d').strftime('%d/%m/%y')
    endDate = datetime.datetime.strptime(endDate, '%Y-%m-%d').strftime('%d/%m/%y')
    final_articles = []
    print(startDate)
    print(endDate)
    print("Crawling Starting")
    # here extracting news from google news
    googlenews = GoogleNews()
    googlenews.setTimeRange(startDate, endDate)
    for query in query_terms:
        googlenews.clear()
        # forming the search term
        googlenews.search("India Technology " + query)
        result = googlenews.result()
        for n in range(len(result)):
            source = result[n]['media']
            url = result[n]['link']
            try:
                article = Article(url)
                article.download()
                article.parse()
            except Exception as e:
                print("Trouble downloading so skipping")
                continue
            content = article.text
            # summarize the content: strip a leading "(...)"/"[...]" dateline,
            # then keep the first two sentences
            temp_content = re.sub(r'^\s*[\(\[].*?[\)\]]\s*', '', content)
            sentences = sent_detector.tokenize(temp_content)
            summary = (" ".join(sentences[:2]).strip())
            date = result[n]['date']
            if (date.find('ago') != -1):
                # relative dates like "2 hours ago" are pinned to today
                date = current.date()
            title = result[n]['title']
            # content=result[n]['desc']
            img = result[n]['img']
            # adding the extracted info in final_articles list
            final_articles.append({
                'source': source,
                'url': url,
                'date': date,
                'title': title,
                'content': content,
                'img': img
            })
    return final_articles
def get_news(text):
    """Search Google News for *text*, clear the initial hits, then return
    the page-2 results."""
    client = GoogleNews()
    client.search(text)
    client.clear()
    client.getpage(2)
    return client.result()
def search(keyword=None, datestart=None, dateend=None, pgs=1):
    """Search Google News (Portuguese) for *keyword* between datestart and
    dateend, download/parse each article, and append result dicts to the
    module-global `noticias` list."""
    # Global state shared with the rest of the module
    global noticias
    global cont
    global acabou
    # Search parameters
    print('Keyword: ', keyword)
    # Configure the query for Portuguese-language results
    googlenews = GoogleNews(start=datestart, end=dateend)
    googlenews.setlang('pt')
    googlenews.search(keyword)
    result = googlenews.result()
    # Load the results into a DataFrame
    df = pd.DataFrame(result)
    # Show the first 5 stories
    print(df.head())
    # Fetch the requested range of result pages (result() is cumulative)
    for i in range(0, pgs):
        googlenews.getpage(i)
        result = googlenews.result()
        df = pd.DataFrame(result)
    # Convert the DataFrame above into a list of dicts
    for ind in df.index:
        print('Noticia numero: {}'.format(ind))
        # NOTE(review): `dict` shadows the builtin; `config` comes from module scope
        dict = {}
        article = Article(df['link'][ind], config=config)
        article.download()
        try:
            article.parse()
            article.nlp()
            dict['Date'] = df['date'][ind]
            dict['Media'] = df['media'][ind]
            dict['Title'] = article.title
            dict['Article'] = article.text
            dict['Summary'] = article.summary
            dict['Created'] = False
            noticias.append(dict)
        except:
            # best-effort: skip articles that fail to parse
            print('Error')
        time.sleep(0)
class Engine:
    """Stateful wrapper around GoogleNews with simple page navigation."""

    def __init__(self):
        self.news = GoogleNews()
        self.news.setlang('en')
        #self.news.setTimeRange('01/01/2000','01/01/2015')
        self.news.setencode('utf-8')
        self.pageNumber = 1
        self.searchTerm = ""

    def _require_search(self):
        # `self.news.result` is a bound method and is never None, so the old
        # `self.news.result == None` guard was dead code and the intended
        # RuntimeError could never fire. Track the search term instead.
        if not self.searchTerm:
            raise RuntimeError("Engine has not searched yet")

    def nextPage(self):
        """Advance one page; return False when the new page has no results."""
        self._require_search()
        self.pageNumber += 1
        self.news.clear()
        self.news.getpage(self.pageNumber)
        return len(self.news.result()) != 0

    def previousPage(self):
        """Step back one page; return False when the new page has no results."""
        self._require_search()
        self.pageNumber -= 1
        self.news.clear()
        self.news.getpage(self.pageNumber)
        return len(self.news.result()) != 0

    def search(self, term):
        """Run a query; return the result list, or False if nothing matched."""
        self.searchTerm = term  # remember the term so page navigation is allowed
        self.news.search(term)
        if len(self.news.result()) == 0:
            return False
        return self.news.result()

    def getPageNumber(self):
        return self.pageNumber

    def getResults(self):
        return self.news.result()

    def clear(self):
        self.news.clear()

    def resetPageNumber(self):
        self.pageNumber = 1
def scrape():
    """Fetch Google News results about ocean trash and insert each into the DB."""
    link_list = []
    # Instance of class GoogleNews
    client = GoogleNews()
    client.search("oceans"+"+trash")
    results = client.result()
    for news_item in results:
        sql_insert(news_item)
def run(start_date, end_date, keyword, file, mail, importance):
    """Find news articles for *keyword* within [start_date, end_date], scrape
    their pages, and keep the top *importance* rows on run.df.

    Returns the resulting DataFrame (also stored as the function attribute
    run.df, as before).
    """
    # Previously three GoogleNews instances were created in a row and the
    # first two — including the one carrying lang='en' — were discarded;
    # configure everything in a single constructor call instead.
    googlenews = GoogleNews(lang='en', start=start_date, end=end_date)
    googlenews.search(keyword)
    headlines = googlenews.gettext()
    links = googlenews.get__links(
    )  #note that documentation has this as googlenews.getlinks() so it might change
    #get page url
    results = articleReader(links, headlines, keyword)
    run.df = pd.DataFrame(results)
    if run.df.shape[0] > importance:
        run.df = run.df.iloc[0:importance]
    return run.df
def run(self):
    """Fetch headlines for self.term and accumulate the average TextBlob
    polarity and subjectivity onto self.sentiment / self.subjectivity."""
    client = GoogleNews('en', 'd')
    client.search(self.term)
    headlines = client.result()
    total = len(headlines)
    for entry in headlines:
        print(entry["desc"])
        sentiment = TextBlob(entry["desc"]).sentiment
        self.sentiment += sentiment.polarity / total
        self.subjectivity += sentiment.subjectivity / total