def get_admin_data(user_headline, user_img, user_keywords):
    """Find a news article matching *user_headline* (falling back to a
    *user_keywords* search) and return its link, headline, summary and image.

    Returns a dict with keys 'link', 'headline', 'content', 'image'; all
    values are None when no search result is found at all.
    """
    admin_data = {'link': None, 'headline': None, 'content': None, 'image': None}

    google_news = GoogleNews(lang='en')
    google_news.search(user_headline)
    links = google_news.get__links()
    print('No. of links found: ', len(links))

    # Fall back to a keyword search when the headline search finds nothing.
    if len(links) == 0:
        google_news = GoogleNews(lang='en')
        google_news.search(' '.join(user_keywords))
        links2 = google_news.get__links()
        if len(links2) == 0:
            return admin_data
        else:
            links = links2

    # Prefer the second hit when there is more than one result.
    if len(links) == 1:
        link_used = links[0]
    else:
        link_used = links[1]
    admin_data['link'] = link_used
    # print(link_used)

    article = Article(link_used)
    article.download()
    article.parse()
    article.nlp()
    admin_data['headline'] = article.title
    admin_data['content'] = article.summary
    # BUG FIX: `user_img` was accepted but never used, and the old
    # `top_image is not None` test always passed (newspaper leaves
    # top_image as '' when absent). Mirror the 2-argument variant of this
    # helper elsewhere in the file: fall back to the caller's image.
    if article.top_image:
        admin_data['image'] = article.top_image
    else:
        admin_data['image'] = user_img
    print('admin link: ', admin_data['link'])
    print('admin headline: ', admin_data['headline'])
    return admin_data
def testResultHasLink(self):
    """The first search result must carry an http(s) link."""
    news_client = GoogleNews()
    news_client.search(keyword)
    first_hit = news_client.result()[0]
    link_text = first_hit.get('link').lower()
    print(link_text)
    self.assertIn('http', link_text)
    print('Result contains http link')
def scrapeTitles(self, num=0):
    '''
    Inputs: num --> finds at least num titles
    Outputs: A list of raw titles (stored in self.titleList, truncated to num)

    How:
        Makes API calls to Google News, widening the start date one day at
        a time (up to 7 retries) until at least `num` unique titles are
        collected. Each batch is passed through self.clean() and
        self.stripTitleList(), which drop titles not containing the ticker.
        Side effects: updates self.start and self.titleList.
    '''
    found = 0
    titles = []          # unique titles accumulated across retries
    end = self.end
    start = self.start
    # max start can be reduced by is 7 days
    tries = 0
    while found <= num:
        # Stop when the date range is invalid or we've widened 7 times.
        if not self.validDates() or tries > 7:
            break
        googlenews = GoogleNews(start=start, end=end)
        googlenews.search(self.ticker)
        result = googlenews.result()
        if len(result) == 0:
            break
        df = pd.DataFrame(result)
        if len(df) > 0:
            # Combine headlines and descriptions into one candidate list,
            # then let the helpers filter it down.
            self.titleList = df['title'].tolist() + df['desc'].tolist()
            self.clean()
            self.stripTitleList()
            # NOTE(review): if stripTitleList() can set titleList to None,
            # the loop below would raise TypeError — confirm its contract.
            if self.titleList is not None:
                print(self.start, self.end)
                print("after stripTitleList: Not None")
            else:
                print(self.start, self.end)
                print("after stripTitleList: None")
            # Keep only titles we haven't seen in a previous iteration.
            for t in self.titleList:
                if t not in titles:
                    titles.append((t))
                    found += 1
        # Widen the search window by one day and try again.
        start = self.reduceDate(start, 1)
        tries += 1
    self.start = start
    self.titleList = titles[:num]
def __init__(self):
    """Create an English, UTF-8 GoogleNews client and reset paging state."""
    client = GoogleNews()
    client.setlang('en')
    # self.news.setTimeRange('01/01/2000','01/01/2015')
    client.setencode('utf-8')
    self.news = client
    self.pageNumber = 1
    self.searchTerm = ""
def testResultHasDate(self):
    """The first search result must have a non-empty date field."""
    news_client = GoogleNews()
    news_client.search(keyword)
    first_hit = news_client.result()[0]
    date_text = first_hit.get('date').lower()
    print(date_text)
    self.assertIsNot('', date_text)
    print('Result date is not empty')
def get_news(ticker):
    """Return recent (2-day) news articles for *ticker* as a list of dicts.

    Each dict has keys: title, media, date, description, link, datetime.

    Raises:
        Exception("Stock Not Found"): the ticker cannot be resolved.
        Exception("News Error"): the news lookup or parsing fails.
    """
    try:
        # Validates the ticker exists; the returned data is not needed here.
        stock_api.get_stock_data(ticker)
    except Exception as err:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrow it and chain the cause.
        raise Exception("Stock Not Found") from err
    try:
        googlenews = GoogleNews(period='2d')
        googlenews.search(ticker)
        return [
            {
                'title': item['title'],
                'media': item['media'],
                'date': item['date'],
                'description': item['desc'],
                'link': item['link'],
                'datetime': item['datetime'],
            }
            for item in googlenews.result()
        ]
    except Exception as err:
        raise Exception("News Error") from err
def index(request):
    """Render index.html with Google News results for 'Shailene Woodley'."""
    client = GoogleNews()
    client.search('Shailene Woodley')
    return render(request, 'index.html', {'news': client.result()})
def testResultContainsKeyword(self):
    """The first result's description must mention the search keyword."""
    news_client = GoogleNews()
    news_client.search(keyword)
    first_hit = news_client.result()[0]
    desc_text = first_hit.get('desc').lower()
    print(desc_text)
    self.assertIn(keyword.lower(), desc_text)
    print('Result contains keyword')
def getTitles(self, ticker, start, end):
    """Return a pandas Series of news titles for *ticker* between *start* and *end*."""
    client = GoogleNews(start=start, end=end)
    client.search(ticker)
    frame = pd.DataFrame(client.result())
    return frame['title']
def testResultNumberAtTwoPages(self):
    """Page 2 of a search must contain exactly ten results."""
    news_client = GoogleNews()
    news_client.search(keyword)
    page_two = news_client.page_at(2)
    self.assertEqual(len(page_two), 10)
    print('Result length at two pages is correct')
def getSources(topic):
    # Gather article URLs for *topic* from Google News and Yahoo Finance,
    # then fetch/process each URL in a worker thread (Python 2 code).
    # Returns `names`, the list populated by the worker threads.
    # NOTE(review): `worker` is defined elsewhere — presumably it appends
    # per-URL results into `names`; confirm its contract. Appending from
    # multiple threads relies on list.append being atomic in CPython.
    #urls = ['http://www.postcrescent.com/article/20140517/APC03/305170255/Integrys-Energy-growing-shareholders-told',
    #        'http://www.postcrescent.com/article/20140517/APC03/305170255/Integrys-Energy-growing-shareholders-told']
    urls = GoogleNews(topic)
    urls.extend(YahooFinance(topic))
    print urls
    print len(urls)
    #NEWER CODE
    numUrls = len(urls)
    threads = []
    names = []
    # One thread per URL: worker(index, url, shared-results-list).
    for i in range(0, numUrls):
        t = threading.Thread(target=worker, args=(i, urls[i], names))
        threads.append(t)
        t.start()
    # Wait for every worker to finish before returning the results.
    for p in threads:
        p.join()
    return names
def get_training_data(self):
    """Load (or download and cache) finance/crypto news headlines.

    Returns a DataFrame with columns ['date', 'headline'], cached at
    ./data/sentiment_data/headlines.csv after the first download.
    """
    # Serve from the cache when it already exists.
    if os.path.isfile('./data/sentiment_data/headlines.csv'):
        return pd.read_csv('./data/sentiment_data/headlines.csv')

    googlenews = GoogleNews(lang='en', start='01/01/2015')  # mm/dd/yyyy
    news = []
    keywords = [
        'Blockchain', 'Cryptocurrency', 'Bitcoin',
        'Ethereum',  # FIX: was misspelled 'Etherium', skewing the search
        'Stock Market', 'Finance'
    ]
    # Fetch news headlines for every keyword in the keywords list.
    for keyword in tqdm(keywords):
        googlenews.get_news(keyword)
        for result in googlenews.results():
            news.append([result['datetime'], result['title']])
        # BUG FIX: results() accumulates across queries, so without a
        # clear() each keyword re-collected every previous keyword's
        # headlines as duplicates.
        googlenews.clear()

    # Persist the combined headlines so later calls hit the cache branch.
    df = pd.DataFrame(news, columns=['date', 'headline'])
    df.to_csv('./data/sentiment_data/headlines.csv', index=False)
    return df
def get_news():
    """Fetch two pages of 'Coronavirus' news from the last 5 days.

    Returns a list of dicts with keys 'title', 'source', 'date&time',
    'desc', 'link', skipping items without a description.
    """
    dt_today = str(datetime.today().strftime('%m/%d/%Y'))
    dt_previous = datetime.today() - timedelta(days=5)
    dt_previous = str(dt_previous.strftime('%m/%d/%Y'))
    googlenews = GoogleNews(start=dt_previous, end=dt_today)
    googlenews.search('Coronavirus')
    googlenews.getpage(1)
    result1 = googlenews.result()
    googlenews.getpage(2)
    result2 = googlenews.result()
    # BUG FIX: GoogleNews.result() returns the *accumulated* list, so
    # result1 + result2 repeated every page-1 item; de-duplicate by link.
    result = result1 + result2
    news_list = []
    seen_links = set()
    for i in result:
        if i['desc'] != '' and i['link'] not in seen_links:
            seen_links.add(i['link'])
            news_list.append({
                'title': i['title'],
                'source': i['media'],
                'date&time': i['date'],
                'desc': i['desc'],
                'link': i['link'],
            })
    return news_list
def testResultHasImage(self):
    """The first search result must contain a base64-encoded image."""
    news_client = GoogleNews()
    news_client.search(keyword)
    first_hit = news_client.result()[0]
    image_text = first_hit.get('img').lower()
    print(image_text)
    self.assertIn('base64', image_text)
    print('Result contains image')
def get_admin_data(user_headline, user_img):
    """Search Google News for *user_headline* and return the matched
    article's link, headline, summary and image.

    Falls back to *user_img* when the article has no top image; returns a
    dict of all-None values when the search yields nothing.
    """
    admin_data = dict.fromkeys(('link', 'headline', 'content', 'image'))

    searcher = GoogleNews(lang='en')
    searcher.search(user_headline)
    links = searcher.get__links()
    print('No. of links found: ', len(links))

    if not links:
        return admin_data

    # Prefer the second hit when more than one link came back.
    link_used = links[0] if len(links) == 1 else links[1]
    admin_data['link'] = link_used
    print(link_used)

    article = Article(link_used)
    article.download()
    article.parse()
    article.nlp()

    admin_data['headline'] = article.title
    admin_data['content'] = article.summary
    admin_data['image'] = user_img if article.top_image is None else article.top_image
    return admin_data
async def text_filter(message: types.Message):
    """Reply with up to five Russian-language news links for the message text."""
    searcher = GoogleNews(lang='ru')
    searcher.search(str(message.text))
    for position, link in enumerate(searcher.get_links()):
        await message.answer(link)
        if position == 4:  # five links sent — stop
            break
def run(self):
    """Accumulate the average TextBlob sentiment polarity/subjectivity of
    today's headlines for self.term into self.sentiment / self.subjectivity.
    """
    googlenews = GoogleNews('en', 'd')
    googlenews.search(self.term)
    headline_results = googlenews.result()
    # BUG FIX: an empty result list previously raised ZeroDivisionError
    # when dividing by len(headline_results).
    if not headline_results:
        return
    n = len(headline_results)
    for item in headline_results:
        print(item["desc"])
        blob = TextBlob(item["desc"])
        # Dividing each term by n yields the mean over all headlines.
        self.sentiment += blob.sentiment.polarity / n
        self.subjectivity += blob.sentiment.subjectivity / n
def scrape():
    """Search Google News for 'oceans+trash' and store each hit via sql_insert."""
    news_client = GoogleNews()
    news_client.search("oceans+trash")
    for news_item in news_client.result():
        sql_insert(news_item)
def run_search(self):
    """Search Google News for every stock name in the frame from get_df()
    and return the collected result links."""
    stocks = self.get_df()
    googlenews = GoogleNews()
    links = []
    # Expects a single-column frame: itertuples() yields (index, name).
    for _, stock_name in stocks.itertuples():
        # BUG FIX: GoogleNews accumulates results across searches, so
        # without clear() every earlier stock's links were appended again
        # for each subsequent stock.
        googlenews.clear()
        googlenews.search(stock_name)
        links.extend(googlenews.getlinks())
    return links
def get_search_results(keyword: str):
    """Return up to five Google News items for *keyword* from the last 7 days."""
    googlenews = GoogleNews(lang="en", period="7d", encode="utf-8")
    # BUG FIX: the original additionally called search(keyword) and
    # get_page(1), fetching the same keyword up to three times and mixing
    # two result formats in the accumulated results() list; a single
    # get_news() call is sufficient for the period-based lookup.
    googlenews.get_news(keyword)
    results = googlenews.results()
    return results[0:5]
def google_new_scrape(keyword=0, earliest_date="2000-01-01", end_date=""):
    """Scrape Google News articles between *earliest_date* and *end_date*
    (both 'YYYY-MM-DD'; empty end_date means open-ended) into googlenews.csv.

    keyword: search term; the falsy default 0 keeps the original hard-coded
    'trump' query, so existing callers are unaffected.
    """
    # BUG FIX: the MM/DD/YYYY-reformatted start date was stored in a
    # misspelled variable ('ealiest_date') and never used — GoogleNews
    # received the raw ISO string instead of the format it expects.
    start = dt.strptime(earliest_date, "%Y-%m-%d").strftime("%m/%d/%Y")
    if end_date != "":
        end = dt.strptime(end_date, "%Y-%m-%d").strftime("%m/%d/%Y")
        googlenews = GoogleNews(start=start, end=end)
    else:
        googlenews = GoogleNews(start=start)

    googlenews.search(keyword if keyword else 'trump')
    for page in range(1, 1000):
        before = len(googlenews.result())
        googlenews.getpage(page)
        result = googlenews.result()
        print(len(result), result)
        # FIX: stop once a page adds no new results instead of issuing
        # up to 999 requests unconditionally.
        if len(result) == before:
            break

    df = pd.DataFrame(googlenews.result())
    rows = []  # FIX: previously named `list`/`dict`, shadowing built-ins
    for ind in df.index:
        article = Article(df['link'][ind])
        article.download()
        article.parse()
        #article.nlp()
        rows.append({
            'Date': df['date'][ind],
            'Media': df['media'][ind],
            'Title': article.title,
            'Article': article.text,
            # NOTE(review): nlp() is commented out above, so Summary is
            # whatever newspaper leaves unpopulated — confirm intent.
            'Summary': article.summary,
        })
    news_df = pd.DataFrame(rows)
    print(news_df)
    file_name = 'googlenews.csv'
    news_df.to_csv(file_name)
def crawling_news(company_name_list, start_date, end_date, save_file_name):
    """Crawl Google News titles for every company name and save them to CSV.

    Returns the DataFrame of [title, company, fixed_date] rows written to
    '<save_file_name>.csv'.
    """
    # Set up a stream logger for progress output.
    # NOTE(review): a new StreamHandler is added on every call, so calling
    # this repeatedly duplicates log lines — acceptable for a one-shot crawl.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)

    googlenews = GoogleNews(lang='en', start=start_date, end=end_date, encode='utf-8')

    all_title = []
    # FIX: progress was logged through the root `logging` module while the
    # handler was configured on `logger`; use the configured logger.
    logger.info('loop start')
    for i, comp_name in enumerate(company_name_list):
        googlenews.search(comp_name)
        logger.info('%s : %d%s', comp_name, ((i + 1) / len(company_name_list)) * 100, '%')
        # FIX: cache results() once instead of re-calling it for every row.
        for item in googlenews.results():
            all_title.append([item.get('title'), comp_name, fixing_date(item.get('date'))])
        # Clear the accumulated result list before the next company.
        googlenews.clear()

    all_title = pd.DataFrame(all_title)
    all_title.to_csv('%s.csv' % (save_file_name))
    logger.info('saved as %s.csv, done!!', save_file_name)
    return all_title
def get_company_news_link(company='NaN', news_num=5, time_range='today'):
    """Return up to *news_num* Google News links for *company*.

    time_range: 'today', or a string whose two dates are read positionally
    (chars 0-9 and 11-20) and converted to MM/DD/YYYY for set_time_range.
    Returns an error string when no company is given or nothing is found.
    """
    if company == 'NaN':
        return 'please input company name'

    news_link = []
    googlenews = GoogleNews()
    googlenews.clear()

    if time_range != 'today':
        # Positional slices: YYYY -> {0}, MM -> {1}, DD -> {2}.
        start_date = "{1}/{2}/{0}".format(time_range[0:4], time_range[5:7], time_range[8:10])
        end_date = "{1}/{2}/{0}".format(time_range[11:15], time_range[16:18], time_range[19:21])
        googlenews.set_time_range(start_date, end_date)

    googlenews.search(company)
    result = googlenews.result()
    try:
        for num in range(news_num):
            news_link.append(result[num]['link'])
    except IndexError:
        # Fewer results than requested: report when nothing at all matched.
        if len(news_link) == 0:
            return '此時段無' + company + '新聞 OR 網路不穩'
    return news_link
class GoogleNewsMethods():
    """Thin wrapper around an English-language GoogleNews client."""

    # Creates a googlenews object
    def __init__(self):
        self.googlenews = GoogleNews(lang="en")

    # This will return a list of news for particular stock on a given date
    def newscollection(self, stock, date):
        # NOTE(review): `date` is accepted but never used — the search is
        # not restricted to the given date; confirm whether a time range
        # (e.g. set_time_range) was intended here.
        self.googlenews.search(stock)
        self.newsList = self.googlenews.result()
        return (self.newsList)
def extract_google(query_terms, startDate, endDate):
    """Crawl Google News for 'India Technology <query>' articles within the
    given date window and return a list of article dicts
    (source/url/date/title/content/img).

    startDate/endDate are 'YYYY-MM-DD' strings; empty strings default to
    today and a week ago, respectively.
    """
    # BUG FIX: the defaults were formatted as '%d/%m/%Y', which made the
    # '%Y-%m-%d' re-parse below raise ValueError whenever a date was
    # omitted; defaults now use the same ISO format as caller-supplied input.
    if len(startDate) == 0:
        startDate = datetime.datetime.today().strftime('%Y-%m-%d')
    if len(endDate) == 0:
        endDate = datetime.datetime.strftime(
            datetime.datetime.today().date() - datetime.timedelta(days=7), '%Y-%m-%d')
    # Convert to the DD/MM/YY form expected by setTimeRange below.
    startDate = datetime.datetime.strptime(startDate, '%Y-%m-%d').strftime('%d/%m/%y')
    endDate = datetime.datetime.strptime(endDate, '%Y-%m-%d').strftime('%d/%m/%y')

    final_articles = []
    print(startDate)
    print(endDate)
    print("Crawling Starting")

    # Extract news from Google News within the configured time range.
    googlenews = GoogleNews()
    googlenews.setTimeRange(startDate, endDate)
    for query in query_terms:
        googlenews.clear()
        # Form the search term.
        googlenews.search("India Technology " + query)
        result = googlenews.result()
        for n in range(len(result)):
            source = result[n]['media']
            url = result[n]['link']
            try:
                article = Article(url)
                article.download()
                article.parse()
            except Exception as e:
                print("Trouble downloading so skipping")
                continue
            content = article.text
            # Summarize: first two sentences, with leading bracketed
            # boilerplate (e.g. "(Reuters)") stripped off.
            temp_content = re.sub(r'^\s*[\(\[].*?[\)\]]\s*', '', content)
            sentences = sent_detector.tokenize(temp_content)
            summary = (" ".join(sentences[:2]).strip())
            date = result[n]['date']
            # Relative dates like '2 hours ago' are replaced with today.
            if (date.find('ago') != -1):
                date = current.date()
            title = result[n]['title']
            img = result[n]['img']
            # Add the extracted info to the final_articles list.
            final_articles.append({
                'source': source,
                'url': url,
                'date': date,
                'title': title,
                'content': content,
                'img': img
            })
    return final_articles
def get_top_news(term, limit=5):
    """Return a JSON string of up to *limit* Google News results for *term*."""
    googlenews = GoogleNews()
    googlenews.search(term)
    result = googlenews.result()
    # FIX: the previous `try: result = result / except: pass` was a no-op,
    # and `limit` was accepted but ignored — honor it here.
    return json.dumps(result[:limit])
def get_news(text):
    """Return Google News results for *text* (pages 1 and 2)."""
    googlenews = GoogleNews()
    # BUG FIX: clear() was previously called AFTER search(), throwing away
    # the page-1 results before page 2 was fetched; clear first instead so
    # the returned list covers both pages.
    googlenews.clear()
    googlenews.search(text)
    googlenews.getpage(2)
    return googlenews.result()
def initalize_google_news(start_date, end_date):
    """Build and return a GoogleNews client: English, UTF-8, daily period,
    restricted to the user-specified [start_date, end_date] range."""
    print("initalize_google_news...")
    client = GoogleNews(encode="utf-8")
    client.setlang("en")
    client.setperiod("d")
    client.setencode("utf-8")
    client.setTimeRange(start_date, end_date)
    return client
def getPolarity(uniName):
    """Average TextBlob sentiment polarity of Google News article summaries
    about '<uniName> Coronavirus' (window 08/01/2020 - 09/26/2020).

    Articles that fail to download/parse are skipped; returns 0.0 when no
    article could be processed.
    """
    from GoogleNews import GoogleNews
    from newspaper import Article
    from newspaper import Config
    import pandas as pd
    from textblob import TextBlob

    uniName = uniName + ' Coronavirus'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent

    googlenews = GoogleNews(start='08/01/2020', end='09/26/2020')
    googlenews.search(uniName)
    # Accumulate up to five result pages, then snapshot the full list.
    for page in range(0, 5):
        googlenews.getpage(page)
    df = pd.DataFrame(googlenews.result())

    total = 0.0     # FIX: previously named `sum`, shadowing the built-in
    processed = 0
    for ind in df.index:
        try:
            article = Article(df['link'][ind], config=config)
            article.download()
            article.parse()
            article.nlp()
            testimonial = TextBlob(article.summary)
            total += testimonial.sentiment.polarity
            processed += 1
        except Exception:
            # Best effort: skip articles that fail to download or parse.
            pass
    # BUG FIX: the counter previously started at 1, so the function
    # returned total/(n+1) instead of the true mean; guard n == 0 instead.
    return total / processed if processed else 0.0
def googleLinks(topic):
    """Fetch up to five fresh (1-day) Google News links for *topic* and
    resolve each one to its final URL via an HTTP request."""
    news_client = GoogleNews()
    news_client.set_lang('en')
    news_client.set_period('1d')
    news_client.set_encode('utf-8')
    news_client.get_news(topic)

    resolved = []
    # get_links() returns scheme-less hosts/paths; prefix before requesting.
    for raw_link in news_client.get_links()[:5]:
        full_url = "http://" + raw_link
        print(full_url)
        resolved.append(requests.get(full_url).url)
    return resolved
def getNews(query):
    """Return up to six news dicts (title/description/link; the first item
    also carries an image) for *query*."""
    googleNews = GoogleNews()
    googleNews.search(query)
    results = googleNews.result()  # FIX: fetch once instead of twice
    news = []
    # FIX: the old `if i > number: break` off-by-one let through
    # number + 1 items (seven despite min(len, 6)); slice to six.
    for i, result in enumerate(results[:6]):
        n = {
            "title": result['title'],
            "description": result['desc'],
            "link": result['link'],
        }
        if i == 0:
            n["image"] = result['img']
        news.append(n)
    googleNews.clear()
    return news
# Python 2 script: fetch Google News links for a command-line query, pull
# each page in parallel, strip markup, and build TextRank summaries.
#
# Usage: python <script> <number_of_links> <query words...>
# NOTE(review): this uses a project-local GoogleNews module whose search()
# is called as a classmethod with a result count — not the PyPI package API.
from GoogleNews import GoogleNews
from readability import Document
from TextRank import Summary
from fetch_url import fetch_url
import sys
import re

number_of_links = int(sys.argv[1])
query = '+'.join(sys.argv[2:])   # join query words with '+' for the URL
# Matches HTML tags and escaped spaces for removal from readability output.
regex = re.compile("<(.*?)>|\ ")
article_list = []
summary_list = []
links = GoogleNews.search(query, number_of_links)
if not links:
    print "No links found"
else:
    # fetch_parallel presumably returns a queue of fetched page bodies.
    result = fetch_url.fetch_parallel(links)
    while not result.empty():
        # Extract the readable article body, strip tags, force ASCII.
        article = Document(result.get()).summary()
        article = re.sub(regex, "", article)
        article = article.encode('ascii', 'ignore')
        summary = Summary.textrank(article)
        summary = summary.encode('ascii', 'ignore')
        article_list.append(article)
        summary_list.append(summary)