def google_new_scrape(keyword=0, earliest_date="2000-01-01", end_date=""):
    """Scrape Google News results between two dates and save them to CSV.

    Args:
        keyword: search term; falls back to 'trump' when left at the
            legacy default of 0 (backward compatible).
        earliest_date: inclusive start date, 'YYYY-MM-DD'.
        end_date: inclusive end date, 'YYYY-MM-DD'; empty = open-ended.

    Returns:
        None. Writes 'googlenews.csv' as a side effect.
    """
    # BUG FIX: the original parsed into a misspelled variable
    # ('ealiest_date') and then passed the raw, unconverted string to
    # GoogleNews; it also used the non-standard '20%y' format.
    start = dt.strptime(earliest_date, "%Y-%m-%d").strftime("%m/%d/%Y")
    if end_date != "":
        end = dt.strptime(end_date, "%Y-%m-%d").strftime("%m/%d/%Y")
        googlenews = GoogleNews(start=start, end=end)
    else:
        googlenews = GoogleNews(start=start)
    # BUG FIX: the keyword argument was ignored; 'trump' was hard-coded.
    googlenews.search(str(keyword) if keyword else 'trump')
    for page in range(1, 1000):
        googlenews.getpage(page)
    result = googlenews.result()
    print(len(result), result)
    df = pd.DataFrame(result)
    rows = []  # renamed from 'list'/'dict' to avoid shadowing builtins
    for ind in df.index:
        article = Article(df['link'][ind])
        article.download()
        article.parse()
        # article.nlp()
        rows.append({
            'Date': df['date'][ind],
            'Media': df['media'][ind],
            'Title': article.title,
            'Article': article.text,
            'Summary': article.summary,
        })
    news_df = pd.DataFrame(rows)
    print(news_df)
    file_name = 'googlenews.csv'
    news_df.to_csv(file_name)
def news(str):
    """Voice-driven news reader: asks the user for a topic and a page
    number, then speaks the matching Google News headlines.

    NOTE(review): the parameter shadows the builtin `str`; `i` is a
    module-level flag that varies the first prompt and is set on retry.
    """
    global i
    if i == 0:
        spacek(f"ofcures {str} which news you want to listen")
    else:
        spacek(f"which news you want to listen{str}")
    try:
        # Topic spoken by the user; strip any 'about' filler word.
        s = takecommend().lower()
        s = s.replace('about', "")
        spacek("which page you want ot listen")
        s2 = int(takecommend())
        googlenews = GoogleNews()
        # NOTE(review): the instance above is immediately discarded.
        googlenews = GoogleNews('en', "2")  # here you can use 'd' which denotes how many lines you want to listen
        googlenews.search(s)
        googlenews.getpage(s2)
        googlenews.result()
        spacek(f" {str} here is news about ")
        spacek(s)
        print(googlenews.gettext())
        spacek(googlenews.gettext())
    except Exception as s:
        # On any failure (speech not understood, bad page number, network),
        # re-prompt recursively. NOTE(review): unbounded recursion if the
        # failure persists.
        spacek(f"could not understand {str} what did you say say it again")
        i = 1
        news(str)
def get_admin_data(user_headline, user_img, user_keywords):
    """Look up the best Google News match for the user's headline
    (falling back to a keyword search) and return its link, title,
    summary and top image in a dict."""
    admin_data = {'link': None, 'headline': None, 'content': None, 'image': None}
    google_news = GoogleNews(lang='en')
    google_news.search(user_headline)
    links = google_news.get__links()
    print('No. of links found: ', len(links))
    if not links:
        # Headline found nothing: retry with the extracted keywords.
        google_news = GoogleNews(lang='en')
        google_news.search(' '.join(user_keywords))
        fallback_links = google_news.get__links()
        if not fallback_links:
            return admin_data
        links = fallback_links
    # Prefer the second hit when more than one result exists.
    link_used = links[0] if len(links) == 1 else links[1]
    admin_data['link'] = link_used
    # print(link_used)
    article = Article(link_used)
    article.download()
    article.parse()
    article.nlp()
    admin_data['headline'] = article.title
    admin_data['content'] = article.summary
    if article.top_image is not None:
        admin_data['image'] = article.top_image
    print('admin link: ', admin_data['link'])
    print('admin headline: ', admin_data['headline'])
    return admin_data
def todaysNews(str):
    """Return today's Google News headline text for the given topic.

    Note: the parameter name shadows the builtin `str` but is kept for
    caller compatibility.
    """
    # FIX: the original built a plain GoogleNews() and immediately threw
    # it away; only the 'd' (last-day period) instance is needed.
    googlenews = GoogleNews('en', 'd')
    googlenews.search(str)
    googlenews.getpage(1)
    googlenews.result()
    g = googlenews.gettext()
    return g
def getArticles(searchQuery, dateRange = False, startDate = '', endDate = ''):
    """Fetch the first page of Google News results for *searchQuery*.

    With dateRange=True, restrict results to [startDate, endDate];
    otherwise use the last-day ('d') period.

    Returns:
        list of dicts with keys: title, media, date, desc, link, img.
    """
    if dateRange:
        client = GoogleNews(lang='en', start=startDate, end=endDate)
    else:
        client = GoogleNews(lang='en', period='d')
    client.search(searchQuery)
    return client.result(sort=True)
def news():
    """Read the topic from the entry widget, fetch last-day headlines,
    insert them into the output widget and speak them via SAPI."""
    topic = entry.get()
    # FIX: the original created a throwaway GoogleNews() before the
    # real, configured one.
    googlenews = GoogleNews('en', 'd')
    googlenews.search(topic)
    googlenews.getpage()  # defaults to the first page
    googlenews.result()
    a = googlenews.gettext()
    output.insert(END, a)
    # Dispatch gives access to the Microsoft Speech SDK voice.
    speak = Dispatch("SAPI.SpVoice")
    speak.Speak(a)
def news():
    """Ask the user for a news topic and speak one random matching
    headline (or report that none were found)."""
    speak("What kind of news would you like to hear ?")
    topic = takeCommand()  # renamed from 'type' to avoid shadowing the builtin
    # FIX: the original instantiated GoogleNews twice; one suffices.
    googleNews = GoogleNews(lang='en')
    googleNews.search(topic)  # will search the kind we want to hear
    googleNews.getpage(1)  # page number of news
    googleNews.result()
    headlines = googleNews.gettext()  # renamed from 'list' (builtin)
    # print(headlines)
    if len(headlines) > 0:
        speak(random.choice(headlines))
    else:
        speak("No news related to this topic.")
def getnewsData(self):
    """Fetch today's top-10 'Market' and 'Business' Google News stories
    and export each table as a styled JPEG under ./template/."""
    today = date.today()
    # GoogleNews start/end here use dd/mm/yyyy.
    T_split = str(today).split('-')
    toDate = T_split[2] + '/' + T_split[1] + '/' + T_split[0]
    # FIX: the original duplicated this whole block per topic; loop
    # instead (output filenames are unchanged).
    for topic in ('Market', 'Business'):
        client = GoogleNews(start=toDate, end=toDate)
        client.get_news(topic)
        result = client.results()
        df = pd.DataFrame(result).head(10)
        dfi.export(df, './template/df_styled_%s.jpeg' % topic)
def news_sentiments(self):
    """Collect news about self.ticker from Finviz, Yahoo Finance and
    Google News.

    Returns:
        tuple (finviz_df, yahoo_news, press_releases, sector_news,
        stock_news).
    """
    # --- Finviz: scrape the headline table from the quote page ---
    BASE_URL = f'https://finviz.com/quote.ashx?t={self.ticker}'
    soup = self._get_soup(BASE_URL)
    table = soup.find('table', {'class': 'fullview-news-outer'})
    rows = table.find_all('tr')
    df_data = []
    for row in rows:
        date = row.find('td', {'align': 'right'})
        article = row.find('td', {'align': 'left'})
        link = article.find('a')['href']
        df_data.append((date.get_text(), article.get_text(), link))
    df = pd.DataFrame(df_data, columns=['Time', 'Headline', 'Link'])
    # --- Yahoo Finance news: anchor class string is brittle and will
    # break whenever Yahoo regenerates its CSS ---
    BASE_URL = f'https://finance.yahoo.com/quote/{self.ticker}/news?p={self.ticker}'
    soup = self._get_soup(BASE_URL)
    links = soup.find_all('a', {'class': 'js-content-viewer wafer-caas Fw(b) Fz(18px) Lh(23px) LineClamp(2,46px) Fz(17px)--sm1024 Lh(19px)--sm1024 LineClamp(2,38px)--sm1024 mega-item-header-link Td(n) C(#0078ff):h C(#000) LineClamp(2,46px) LineClamp(2,38px)--sm1024 not-isInStreamVideoEnabled'})
    news = [(link.get_text(), str('yahoo.com' + link['href'])) for link in links]
    # --- Yahoo Finance press releases (same brittle selector) ---
    BASE_URL = f'https://finance.yahoo.com/quote/{self.ticker}/press-releases?p={self.ticker}'
    soup = self._get_soup(BASE_URL)
    links = soup.find_all('a', {'class': 'js-content-viewer wafer-caas Fw(b) Fz(18px) Lh(23px) LineClamp(2,46px) Fz(17px)--sm1024 Lh(19px)--sm1024 LineClamp(2,38px)--sm1024 mega-item-header-link Td(n) C(#0078ff):h C(#000) LineClamp(2,46px) LineClamp(2,38px)--sm1024 not-isInStreamVideoEnabled'})
    press_releases = [(link.get_text(), str('yahoo.com' + link['href'])) for link in links]
    # Look for keywords in the news? Any showcases, Investor/analyst
    # days, Analyst revisions, Management transitions
    # Product launches, Significant stock buyback changes
    # Getting news from google news search
    googlenews = GoogleNews(lang='en', period='14d')  # Specify period for news
    googlenews.get_news(f'${self.ticker} stock')
    stock_news = googlenews.results()
    # print([(i, j) for i, j in zip(googlenews.get_texts(), googlenews.get_links())])
    # To get other pages, do googlenews.get_page(2), etc.
    # Have whitelist of websites to search articles from. Maybe have key
    # word to filter out stupid stuff.
    sectors = self.find_competition()
    sector_news = []
    if sectors:
        for sector in sectors:
            googlenews = GoogleNews(lang='en', period='14d')
            googlenews.get_news(f'{sector} sector stocks')
            # NOTE(review): uses .result() here but .results() above —
            # confirm both return the same structure in this GoogleNews
            # version.
            sector_news.append(googlenews.result())
    return df, news, press_releases, sector_news, stock_news
def search():
    """Populate state['url'] with Google News links about covid for each
    configured city.

    Raises:
        Exception: if initiateConfig has not been called first.
    """
    global state, config
    if config is None:
        raise Exception('Call initiateConfig first')
    if state is None:
        state = {}
    state['url'] = {}
    # FIX: the original created a throwaway GoogleNews() first.
    googlenews = GoogleNews('en', 'd')
    for city in config['cities']:
        googlenews.search('covid in ' + city)
        state['url'][city] = []
        # BUG FIX: pages are 1-indexed; the original requested page 0.
        for page in range(1, config['pagesPerCity'] + 1):
            googlenews.getpage(page)
            state['url'][city].extend(googlenews.get__links())
def getNews(query):
    """Return up to 6 Google News results for *query* as dicts with
    title/description/link; the first item also carries its image."""
    googleNews = GoogleNews()
    googleNews.search(query)
    news = []
    # BUG FIX: the original broke on `i > number` which let one extra
    # article through (7 items when >= 7 results existed); it also
    # called result() twice. Cap via slicing instead.
    results = googleNews.result()
    for i, result in enumerate(results[:6]):
        n = {}
        n["title"] = result['title']
        n["description"] = result['desc']
        n["link"] = result['link']
        if i == 0:
            n["image"] = result['img']
        news.append(n)
    googleNews.clear()
    return news
def get_training_data(self):
    """ load training data from google news """
    # Serve the cached csv when it has already been downloaded.
    if os.path.isfile('./data/sentiment_data/headlines.csv'):
        return pd.read_csv('./data/sentiment_data/headlines.csv')
    googlenews = GoogleNews(lang='en', start='01/01/2015')  # mm/dd/yyyy
    keywords = [
        'Blockchain', 'Cryptocurrency', 'Bitcoin', 'Etherium',
        'Stock Market', 'Finance'
    ]
    news = []
    # fetch news headlines for every keyword in keywords list
    for keyword in tqdm(keywords):
        googlenews.get_news(keyword)
        # keep (datetime, title) pairs for each result
        for result in googlenews.results():
            news.append([result['datetime'], result['title']])
    # create a pandas dataframe with news list and save it to csv
    df = pd.DataFrame(news, columns=['date', 'headline'])
    df.to_csv('./data/sentiment_data/headlines.csv', index=False)
    return df
def getTitles(self, ticker, start, end):
    """Return the titles of Google News results for *ticker* within
    [start, end] as a pandas Series."""
    client = GoogleNews(start=start, end=end)
    client.search(ticker)
    frame = pd.DataFrame(client.result())
    return frame['title']
def testResultHasImage(self):
    """First search result should carry a base64-encoded image."""
    googlenews = GoogleNews()
    googlenews.search(keyword)
    first = googlenews.result()[0]
    img = first.get('img').lower()
    print(img)
    self.assertIn('base64', img)
    print('Result contains image')
def testResultHasLink(self):
    """First search result should carry an http link."""
    googlenews = GoogleNews()
    googlenews.search(keyword)
    first = googlenews.result()[0]
    link = first.get('link').lower()
    print(link)
    self.assertIn('http', link)
    print('Result contains http link')
def googleNewsCrawler(self):
    """Crawl Google News for self.__keyWords over self.__numDays sliding
    date windows, persisting results in batches via self.toJson."""
    result_list = []
    googlenews = GoogleNews()
    for i in range(self.__numDays):
        # Window [start, start + daysSpan], shifted by one day per pass.
        startDateTime = self.__dateTime + timedelta(days=i)
        endDateTime = self.__dateTime + timedelta(days=i + self.__daysSpan)
        googlenews.setTimeRange(
            start=str(startDateTime.month) + '/' + str(startDateTime.day) +
            '/' + str(startDateTime.year),
            end=str(endDateTime.month) + '/' + str(endDateTime.day) + '/' +
            str(endDateTime.year))
        googlenews.search(self.__keyWords)
        # search() fetches page 1; request pages 2..pagsEveryDay here.
        for j in range(self.__pagsEveryDay - 1):
            googlenews.getpage(j + 2)
        # ~10 results per page, hence the pagsEveryDay * 10 estimate.
        logging.info(
            str(self.__keyWords + '__' + str(startDateTime.date()) +
                " append " + str(int(self.__pagsEveryDay * 10)) + " items"))
        result_list = result_list + googlenews.result()
        googlenews.clear()
        # Flush to disk every 10 days to bound memory usage.
        if (i + 1) % 10 == 0:
            self.toJson(result_list)
            result_list = []
            continue
    # Persist any remainder not flushed by the batching above.
    self.toJson(result_list)
def crawling_news(company_name_list, start_date, end_date, save_file_name):
    """Search Google News for each company name and save
    (title, company, fixed date) rows to '<save_file_name>.csv'.

    Returns:
        pandas.DataFrame of the collected rows.
    """
    # set logger Handler
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)
    # define googlenews
    googlenews = GoogleNews(lang='en', start=start_date, end=end_date,
                            encode='utf-8')
    # news.google.com search sample
    all_title = []
    logging.info('loop start')
    for i in range(len(company_name_list)):
        comp_name = company_name_list[i]
        googlenews.search(comp_name)
        logging.info('%s : %d%s' %
                     (comp_name, ((i + 1) / len(company_name_list)) * 100, '%'))
        # PERF FIX: the original called googlenews.results() three times
        # per row inside the inner loop; fetch the list once.
        results = googlenews.results()
        for item in results:
            temp = [item.get('title'), comp_name, fixing_date(item.get('date'))]
            all_title.append(temp)
        # clear result list
        googlenews.clear()
    all_title = pd.DataFrame(all_title)
    all_title.to_csv('%s.csv' % (save_file_name))
    logging.info('saved as %s.csv, done!!' % (save_file_name))
    return all_title
def googlenews_extract(date_range, num_pages, search_text):
    '''
    Use googlenews package to extract top stories per day based on search string.
    Returns one DataFrame covering the whole date range.
    '''
    df_days = []
    # loop through date range to ensure equal sample size from each day
    # TODO: if we want to pull multiple years of data, perhaps add
    # multi-threading...not necessary for < ~20 calls
    for date in date_range:
        googlenews = GoogleNews(start=date, end=date)
        googlenews.search(search_text)
        print("Search Date = ", date)
        # BUG FIX: pages are 1-indexed; the original requested page 0.
        # (result() is cumulative, so reading it after the last page
        # holds every page's records.)
        for page in range(1, num_pages + 1):
            print('Executing GoogleNews call #', page)
            googlenews.getpage(page)
            result_next = googlenews.result()
            print("Total records returned: ", len(result_next))
        df = pd.DataFrame(result_next)
        df['date_calendar'] = date
        df_days.append(df)
    appended_data = pd.concat(df_days)
    # Drop the raw Google date string; keep our calendar date column.
    df_news = appended_data.reset_index(drop=True).drop(['date'], axis=1)
    return df_news
def getPolarity(uniName):
    """Average TextBlob sentiment polarity of Google News article
    summaries about '<uniName> Coronavirus' (window 08/01–09/26 2020).

    Returns:
        float in [-1, 1]; 0.0 when no article could be analyzed.
    """
    from GoogleNews import GoogleNews
    from newspaper import Article
    from newspaper import Config
    import pandas as pd
    from textblob import TextBlob
    uniName = uniName + ' Coronavirus'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    googlenews = GoogleNews(start='08/01/2020', end='09/26/2020')
    googlenews.search(uniName)
    # BUG FIX: pages are 1-indexed; the original requested pages 0..4.
    for page in range(1, 6):
        googlenews.getpage(page)
    result = googlenews.result()
    df = pd.DataFrame(result)
    total = 0.0  # renamed from 'sum' to avoid shadowing the builtin
    analyzed = 0
    for ind in df.index:
        try:
            article = Article(df['link'][ind], config=config)
            article.download()
            article.parse()
            article.nlp()
            testimonial = TextBlob(article.summary)
            total += testimonial.sentiment.polarity
            analyzed += 1
        except Exception:
            # Skip articles that fail to download/parse (paywalls, 404s).
            pass
    # BUG FIX: the original started the counter at 1 and incremented it
    # before a possible failure, skewing the average; also guard /0.
    return total / analyzed if analyzed else 0.0
def get_admin_data(user_headline, user_img):
    """Fetch the best-matching Google News article for *user_headline*
    and return its link, title, summary and image, falling back to the
    user-supplied image when the article has none."""
    admin_data = {
        'link': None,
        'headline': None,
        'content': None,
        'image': None
    }
    google_news = GoogleNews(lang='en')
    google_news.search(user_headline)
    links = google_news.get__links()
    print('No. of links found: ', len(links))
    if not links:
        return admin_data
    # Prefer the second hit when more than one result exists.
    link_used = links[0] if len(links) == 1 else links[1]
    admin_data['link'] = link_used
    print(link_used)
    article = Article(link_used)
    article.download()
    article.parse()
    article.nlp()
    admin_data['headline'] = article.title
    admin_data['content'] = article.summary
    admin_data['image'] = user_img if article.top_image is None else article.top_image
    return admin_data
def get_news():
    """Return Coronavirus news items from the last 5 days as a list of
    dicts (title, source, date&time, desc, link); items with an empty
    description are skipped."""
    dt_today = str(datetime.today().strftime('%m/%d/%Y'))
    dt_previous = datetime.today() - timedelta(days=5)
    dt_previous = str(dt_previous.strftime('%m/%d/%Y'))
    googlenews = GoogleNews(start=dt_previous, end=dt_today)
    googlenews.search('Coronavirus')
    googlenews.getpage(1)
    googlenews.getpage(2)
    # BUG FIX: result() is cumulative, so the original's
    # `result1 + result2` duplicated every page-1 story. One call after
    # both pages already holds everything.
    result = googlenews.result()
    news_list = list()
    for i in result:
        if i['desc'] != '':
            dic = dict()
            dic['title'] = i['title']
            dic['source'] = i['media']
            dic['date&time'] = i['date']
            dic['desc'] = i['desc']
            dic['link'] = i['link']
            news_list.append(dic)
    return news_list
def __init__(self, politician_name):
    """Initialize an object representing an article.

    Gathers recent (3-day) Ukrainian-language Google News results for
    the politician and pairs the first five with image thumbnails
    scraped from Google Images.
    """
    news = GoogleNews()
    news.setlang("uk")
    news.setencode("utf-8")
    news.setperiod("3d")
    news.search(politician_name)
    info = news.result()
    self.articles = []
    name, surname = politician_name.split()[0], politician_name.split()[1]
    # Google Images search URL used to harvest thumbnail links.
    self.link = f"https://www.google.com/search?q=+{name}+{surname}+новини&source=lnms&tbm=isch"

    def get_data(self):
        # Fetch the raw HTML of the image-search page.
        r = requests.get(self.link)
        return r.text

    html_data = get_data(self)
    soup = BeautifulSoup(html_data, "html.parser")
    # Collect the first 6 <img> src attributes; index 0 is presumably
    # the Google logo, hence the i + 1 offset below — TODO confirm.
    image_links, num = [], 0
    for item in soup.find_all("img"):
        image_links.append(item["src"])
        num += 1
        if num == 6:
            break
    # Pair the first five news results with one image each as
    # [title, link, image] triples.
    for i in range(5):
        text = info[i]
        info_list = [text["title"], text["link"], image_links[i + 1]]
        self.articles.append(info_list)
def fetch_articles(self):
    """Download and parse every Google News article about self.ticker
    published between self.history_start and today.

    Returns:
        list of parsed articles (whatever download_and_parse_article
        returns); failed downloads are printed and skipped.
    """
    # how many pages to scrape per day
    N_pages = 1
    raw_results = []
    # how many days from last update
    # TODO: look for the last update datetime in the DB
    days_from_last_update = (datetime.datetime.today() - self.history_start).days
    # one single-day query per day between start date and today
    for day in range(0, days_from_last_update + 1):
        download_date = self.history_start + datetime.timedelta(days=day)
        googlenews = GoogleNews(start=download_date.strftime("%m/%d/%Y"),
                                end=download_date.strftime("%m/%d/%Y"))
        googlenews.search(self.ticker)
        # iterate N_pages of Google News
        for i in range(0, N_pages):
            googlenews.getpage(i)
            raw_results.extend(googlenews.result())
    # FIX: deduplicate links ONCE after all days are collected; the
    # original re-applied the dedup to `links` inside the day loop,
    # where it would index plain strings with x['link'].
    links = list(set(x['link'] for x in raw_results))
    # for each link (without dups) get the article and its metadata
    articles = []
    for link in links:
        try:
            downloaded = self.download_and_parse_article(link)
            articles.append(downloaded)
        except Exception as e:
            print(e)
    return articles
def testResultHasDate(self):
    """First search result should carry a non-empty date."""
    googlenews = GoogleNews()
    googlenews.search(keyword)
    first = googlenews.result()[0]
    result_date = first.get('date').lower()
    print(result_date)
    self.assertIsNot('', result_date)
    print('Result date is not empty')
def testResultNumberWithTwoPages(self):
    """Accumulated results after fetching page 2 should number 20."""
    googlenews = GoogleNews()
    googlenews.search(keyword)
    googlenews.getpage(2)
    self.assertEqual(len(googlenews.result()), 20)
    print('Result length with two pages is correct')
def testResultNumberAtTwoPages(self):
    """page_at(2) should return exactly one page of 10 results."""
    googlenews = GoogleNews()
    googlenews.search(keyword)
    page_results = googlenews.page_at(2)
    self.assertEqual(len(page_results), 10)
    print('Result length at two pages is correct')
def testResultContainsKeyword(self):
    """First result's description should mention the search keyword."""
    googlenews = GoogleNews()
    googlenews.search(keyword)
    first = googlenews.result()[0]
    desc = first.get('desc').lower()
    print(desc)
    self.assertIn(keyword.lower(), desc)
    print('Result contains keyword')
def get_news(ticker):
    """Return recent (2-day) Google News articles for *ticker* as a list
    of dicts (title, media, date, description, link, datetime).

    Raises:
        Exception("Stock Not Found"): when ticker validation fails.
        Exception("News Error"): when the news lookup fails.
    """
    try:
        # Validates the ticker; the returned data itself is unused
        # (the original bound it to an unused local).
        stock_api.get_stock_data(ticker)
    except Exception:
        # Narrowed from a bare except so Ctrl-C still propagates.
        raise Exception("Stock Not Found")
    try:
        googlenews = GoogleNews(period='2d')
        googlenews.search(ticker)
        result = googlenews.result()
        # Re-key each item, renaming 'desc' -> 'description'.
        news_articles = [{
            'title': item['title'],
            'media': item['media'],
            'date': item['date'],
            'description': item['desc'],
            'link': item['link'],
            'datetime': item['datetime'],
        } for item in result]
        return news_articles
    except Exception:
        raise Exception("News Error")
def get_search_results(keyword: str):
    """Return the first five Google News results (last 7 days) for
    *keyword*, combining the news-feed and web-search result sets."""
    client = GoogleNews(lang="en", period="7d", encode="utf-8")
    client.get_news(keyword)
    client.search(keyword)
    client.get_page(1)
    return client.results()[0:5]
def index(request):
    """Render index.html with Google News results for Shailene Woodley."""
    client = GoogleNews()
    client.search('Shailene Woodley')
    context = {'news': client.result()}
    return render(request, 'index.html', context)