from datetime import datetime as dt

import pandas as pd
from GoogleNews import GoogleNews
from newspaper import Article


def google_new_scrape(keyword='trump', earliest_date="2000-01-01", end_date="", pages=10):
    # Convert ISO dates (YYYY-MM-DD) into the MM/DD/YYYY format GoogleNews expects.
    earliest_date = dt.strptime(earliest_date, "%Y-%m-%d").strftime("%m/%d/%Y")
    if end_date != "":
        end_date = dt.strptime(end_date, "%Y-%m-%d").strftime("%m/%d/%Y")
        googlenews = GoogleNews(start=earliest_date, end=end_date)
    else:
        googlenews = GoogleNews(start=earliest_date)
    googlenews.search(keyword)
    for i in range(1, pages + 1):
        googlenews.getpage(i)
    result = googlenews.result()
    print(len(result), result)
    df = pd.DataFrame(result)
    articles = []
    for ind in df.index:
        entry = {}
        article = Article(df['link'][ind])
        article.download()
        article.parse()
        article.nlp()  # required for article.summary below
        entry['Date'] = df['date'][ind]
        entry['Media'] = df['media'][ind]
        entry['Title'] = article.title
        entry['Article'] = article.text
        entry['Summary'] = article.summary
        articles.append(entry)
    news_df = pd.DataFrame(articles)
    print(news_df)
    file_name = 'googlenews.csv'
    news_df.to_csv(file_name)
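# Usage sketch (illustrative, not from the original source): the keyword and
# dates below are placeholders; the run writes its output to googlenews.csv
# exactly as the function above does.
if __name__ == '__main__':
    google_new_scrape(keyword='trump', earliest_date='2020-01-01',
                      end_date='2020-06-30')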
def news(name):
    # `spacek` (text-to-speech) and `takecommend` (speech-to-text) are
    # assistant helpers defined elsewhere in this project.
    global i
    if i == 0:
        spacek(f"of course {name}, which news do you want to listen to?")
    else:
        spacek(f"which news do you want to listen to, {name}?")
    try:
        s = takecommend().lower()
        s = s.replace('about', "")
        spacek("which page do you want to listen to?")
        s2 = int(takecommend())
        # The second argument is the period; e.g. 'd' restricts results to the last day.
        googlenews = GoogleNews('en', "2")
        googlenews.search(s)
        googlenews.getpage(s2)
        googlenews.result()
        spacek(f"{name}, here is the news about")
        spacek(s)
        print(googlenews.gettext())
        spacek(googlenews.gettext())
    except Exception:
        spacek(f"could not understand {name}, what did you say? Say it again")
        i = 1
        news(name)
def googlenews_extract(date_range, num_pages, search_text):
    '''
    Use the GoogleNews package to extract the top stories per day
    based on a search string.
    '''
    df_days = []
    # Loop through the date range to ensure an equal sample size from each day.
    # TODO: if we want to pull multiple years of data, perhaps add
    # multi-threading... not necessary for < ~20 calls
    for date in date_range:
        googlenews = GoogleNews(start=date, end=date)
        googlenews.search(search_text)
        print("Search Date = ", date)
        for i in range(0, num_pages):
            print('Executing GoogleNews call #', i + 1)
            googlenews.getpage(i)
            result_next = googlenews.result()
            print("Total records returned: ", len(result_next))
        df = pd.DataFrame(result_next)
        df['date_calendar'] = date
        df_days.append(df)
    appended_data = pd.concat(df_days)
    df_news = appended_data.reset_index(drop=True).drop(['date'], axis=1)
    return df_news
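# Usage sketch (illustrative inputs): GoogleNews expects MM/DD/YYYY dates,
# so the per-day range is built with pandas before calling the extractor above.
import pandas as pd
from GoogleNews import GoogleNews

dates = [d.strftime('%m/%d/%Y') for d in pd.date_range('2020-03-01', '2020-03-05')]
df = googlenews_extract(dates, num_pages=2, search_text='covid vaccine')
print(df.head())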
def getPolarity(uniName):
    from GoogleNews import GoogleNews
    from newspaper import Article, Config
    import pandas as pd
    from textblob import TextBlob

    uniName = uniName + ' Coronavirus'
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/50.0.2661.102 Safari/537.36')
    config = Config()
    config.browser_user_agent = user_agent

    googlenews = GoogleNews(start='08/01/2020', end='09/26/2020')
    googlenews.search(uniName)
    for i in range(0, 5):
        googlenews.getpage(i)
    result = googlenews.result()
    df = pd.DataFrame(result)

    total = 0.0
    counter = 1
    for ind in df.index:
        try:
            article = Article(df['link'][ind], config=config)
            article.download()
            article.parse()
            article.nlp()
            testimonial = TextBlob(article.summary)
            counter += 1
            total += testimonial.sentiment.polarity
        except Exception:
            pass
    return total / counter
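# Usage sketch: the university names are illustrative. getPolarity() returns
# an average TextBlob polarity in [-1, 1] over the sampled article summaries.
for uni in ['Purdue', 'Stanford']:
    print(uni, getPolarity(uni))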
def get_news():
    dt_today = datetime.today().strftime('%m/%d/%Y')
    dt_previous = (datetime.today() - timedelta(days=5)).strftime('%m/%d/%Y')
    googlenews = GoogleNews(start=dt_previous, end=dt_today)
    googlenews.search('Coronavirus')
    # result() accumulates across getpage() calls, so fetch both pages first
    # and read the combined results once (avoids duplicating page 1).
    googlenews.getpage(1)
    googlenews.getpage(2)
    result = googlenews.result()
    news_list = []
    for item in result:
        if item['desc'] != '':
            news_list.append({
                'title': item['title'],
                'source': item['media'],
                'date&time': item['date'],
                'desc': item['desc'],
                'link': item['link'],
            })
    return news_list
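# Usage sketch: print headline, source and date for each story returned by
# the five-day Coronavirus query above.
for item in get_news():
    print(item['date&time'], '|', item['source'], '|', item['title'])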
def testResultNumberWithTwoPages(self):
    googlenews = GoogleNews()
    googlenews.search(keyword)
    googlenews.getpage(2)
    length = len(googlenews.result())
    self.assertEqual(length, 20)
    print('Result length with two pages is correct')
def fetch_articles(self):
    # how many pages to scrape
    N_pages = 1
    links = []
    # how many days since the last update
    # TODO: look for the last update datetime in the DB
    days_from_last_update = (datetime.datetime.today() - self.history_start).days
    # for each day between the start date and today:
    for day in range(0, days_from_last_update + 1):
        download_date = self.history_start + datetime.timedelta(days=day)
        googlenews = GoogleNews(start=download_date.strftime("%m/%d/%Y"),
                                end=download_date.strftime("%m/%d/%Y"))
        googlenews.search(self.ticker)
        # iterate over N_pages of Google News results
        for i in range(0, N_pages):
            googlenews.getpage(i)
            result = googlenews.result()
            links = links + result
    # for each link (without duplicates) get the article and its metadata
    links = list(set([x['link'] for x in links]))
    articles = []
    for link in links:
        try:
            downloaded = self.download_and_parse_article(link)
            articles.append(downloaded)
        except Exception as e:
            print(e)
    return articles
def googleNewsCrawler(self):
    result_list = []
    googlenews = GoogleNews()
    for i in range(self.__numDays):
        startDateTime = self.__dateTime + timedelta(days=i)
        endDateTime = self.__dateTime + timedelta(days=i + self.__daysSpan)
        googlenews.setTimeRange(
            start=str(startDateTime.month) + '/' + str(startDateTime.day) + '/' + str(startDateTime.year),
            end=str(endDateTime.month) + '/' + str(endDateTime.day) + '/' + str(endDateTime.year))
        googlenews.search(self.__keyWords)
        for j in range(self.__pagsEveryDay - 1):
            googlenews.getpage(j + 2)
        logging.info(
            self.__keyWords + '__' + str(startDateTime.date()) + " appended "
            + str(int(self.__pagsEveryDay * 10)) + " items")
        result_list = result_list + googlenews.result()
        googlenews.clear()
        # flush results to JSON every 10 days to bound memory use
        if (i + 1) % 10 == 0:
            self.toJson(result_list)
            result_list = []
            continue
    self.toJson(result_list)
def get_news(text):
    googlenews = GoogleNews()
    googlenews.search(text)
    # clear() drops the page-1 results from search(), so only page 2 is returned
    googlenews.clear()
    googlenews.getpage(2)
    result = googlenews.result()
    return result
def todaysNews(topic):
    # 'd' restricts results to the past day
    googlenews = GoogleNews('en', 'd')
    googlenews.search(topic)
    googlenews.getpage(1)
    googlenews.result()
    return googlenews.gettext()
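# Usage sketch (illustrative topic): print today's headlines.
for headline in todaysNews('cricket'):
    print(headline)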
def news():
    topic = entry.get()
    # 'd' restricts results to the past day
    googlenews = GoogleNews('en', 'd')
    googlenews.search(topic)
    googlenews.getpage()
    googlenews.result()
    a = googlenews.gettext()
    output.insert(END, a)
    # Dispatch connects to the Microsoft Speech SDK so the results can be read aloud
    speak = Dispatch("SAPI.SpVoice")
    speak.Speak(a)
def get_stock(ticker, company, method='seeking alpha'):
    sentiments = {}
    ticker_data = yf.Ticker(ticker)
    data = ticker_data.history(start='2020-1-30', end='2020-11-17')
    data = data.drop(['Dividends', 'Stock Splits'], axis=1)
    data = data.assign(Sentiment=0)
    if method == 'seeking alpha':
        soup = BeautifulSoup(open('html/{}.txt'.format(ticker)), 'html.parser')
        articles = soup.find_all('article')
        for article in articles:
            article_title = article.find_all('a')[1].text
            spans = article.find_all('span')
            if len(spans) == 1:
                article_date = spans[0].text
            else:
                article_date = spans[1].text
            article_date = article_date.split(', ')[1].replace('.', '') + ' 2020'
            article_date = pd.to_datetime(article_date, format='%b %d %Y')
            headline_sentiment = analyzer.polarity_scores(article_title)['compound']
            if article_date not in sentiments:
                sentiments[article_date] = [headline_sentiment]
            else:
                sentiments[article_date].append(headline_sentiment)
    elif method == 'gn':
        googlenews = GoogleNews(start='01/30/2020', end='11/17/2020')
        googlenews.search(company)
        for i in range(2, 6):
            googlenews.getpage(i)
        results = googlenews.result()
        for result in results:
            headline_sentiment = analyzer.polarity_scores(result['title'])['compound']
            try:
                article_date = pd.to_datetime(result['date'], format='%b %d, %Y')
            except Exception:
                continue
            if article_date not in sentiments:
                sentiments[article_date] = [headline_sentiment]
            else:
                sentiments[article_date].append(headline_sentiment)
    data['Prediction'] = data[['Close']].shift(-1)
    data = data[:-1]
    for s in sentiments:
        average_sentiment = np.average(sentiments[s])
        if s in data.index:
            data.loc[s, 'Sentiment'] = average_sentiment
    data.to_csv('data/{}.csv'.format(ticker))
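# Usage sketch: get_stock() relies on a module-level VADER `analyzer`; a
# minimal setup, assuming the vaderSentiment package is installed, might
# look like this (ticker and company are illustrative).
import numpy as np
import pandas as pd
import yfinance as yf
from GoogleNews import GoogleNews
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
get_stock('AAPL', 'Apple', method='gn')  # writes data/AAPL.csv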
def search_google_news(query, google_date):
    # -- Retrieve news articles
    googlenews = GoogleNews()
    # googlenews.set_period('7d')  # cannot combine set_period with set_time_range; use one or the other
    # googlenews.set_time_range(str(google_date), '2020-10-12')
    googlenews.set_encode('utf-8')
    googlenews.search(query)
    googlenews.getpage(50)
    result = googlenews.result()
    # clear before searching again
    googlenews.clear()
    return result
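# Usage sketch: `google_date` is only consumed by the commented-out
# set_time_range() call above, so any placeholder date works here.
results = search_google_news('renewable energy', '2020-10-05')
print(len(results), 'articles found')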
def getNews(text):
    # Parameters: language and period (day, month, year)
    googlenews = GoogleNews("en", "m")
    googlenews.search(text)
    googlenews.getpage(1)  # first result page
    # gettext() returns only the headlines (links and other fields are also available)
    headlines = googlenews.gettext()
    # join into one string, because WordCloud expects a string-like object
    text = ' '.join(str(elem) for elem in headlines)
    generateWordCloud(text)
    googlenews.clear()
def news():
    speak("What kind of news would you like to hear?")
    news_type = takeCommand()
    googleNews = GoogleNews(lang='en')
    googleNews.search(news_type)  # search for the requested kind of news
    googleNews.getpage(1)         # result page number
    googleNews.result()
    headlines = googleNews.gettext()
    if len(headlines) > 0:
        speak(random.choice(headlines))
    else:
        speak("No news related to this topic.")
def search():
    global state, config
    if config is None:
        raise Exception('Call initiateConfig first')
    if state is None:
        state = {}
    state['url'] = {}
    # 'd' restricts results to the past day
    googlenews = GoogleNews('en', 'd')
    for city in config['cities']:
        googlenews.search('covid in ' + city)
        state['url'][city] = []
        for i in range(config['pagesPerCity']):
            googlenews.getpage(i)
            state['url'][city].extend(googlenews.get__links())
def news(topic: str, start_date: str = None, end_date: str = None, **kwargs):
    page_num = int(kwargs.get('Page', '0'))
    article_num = int(kwargs.get('Article', '0'))
    if page_num == 0 and article_num == 0:
        try:
            NewsHistory.objects.latest('search_time').delete()
        except Exception as e:
            print("No news history for this user", repr(e))
        googlenews = GoogleNews()
        googlenews.search(topic)
        googlenews.getpage(1)
        articles = googlenews.result()
        articles = [article for article in articles if len(article['title']) > 10]
        db_entry = NewsHistory(user_id=1, search_topic=topic,
                               last_fetched_count=0, news_articles=str(articles))
        articles = articles[0:3]
        db_entry.save()
    else:
        news_list = NewsHistory.objects.latest('search_time')
        news_items = ast.literal_eval(news_list.news_articles)
        if page_num != 0:
            article_start_num = page_num * 3
            articles = news_items[article_start_num:article_start_num + 3]
        elif article_num != 0:
            article = news_items[article_num - 1]
            article_link = '<a href="{}" target="_blank">Read full article</a>'.format(article['link'])
            article = "<br>" + "<br>".join([article['title'], article['desc'], article_link])
            return {'response': article}
    article_text = []
    for i, article in enumerate(articles):
        serial_number = str(i + 1 + page_num * 3)
        article_summary = (serial_number,
                           f"{article['date']}, {article['media']}",
                           article['title'])
        article_text.append(article_summary)
    all_articles = "<br>".join([", ".join(i) for i in article_text])
    return {'response': all_articles, 'followup': True}
def search(keyword=None, datestart=None, dateend=None, pgs=1):
    # Global variables
    global noticias
    global cont
    global acabou
    # Search parameters
    print('Keyword: ', keyword)
    # Configure the search
    googlenews = GoogleNews(start=datestart, end=dateend)
    googlenews.setlang('pt')
    googlenews.search(keyword)
    result = googlenews.result()
    # Load the results into a DataFrame
    df = pd.DataFrame(result)
    # Print the first 5 news items
    print(df.head())
    # Fetch the requested range of result pages
    for i in range(0, pgs):
        googlenews.getpage(i)
        result = googlenews.result()
        df = pd.DataFrame(result)
    # Convert the DataFrame above into a list of dictionaries
    for ind in df.index:
        print('News item number: {}'.format(ind))
        noticia = {}
        article = Article(df['link'][ind], config=config)
        article.download()
        try:
            article.parse()
            article.nlp()
            noticia['Date'] = df['date'][ind]
            noticia['Media'] = df['media'][ind]
            noticia['Title'] = article.title
            noticia['Article'] = article.text
            noticia['Summary'] = article.summary
            noticia['Created'] = False
            noticias.append(noticia)
        except Exception:
            print('Error')
        time.sleep(0)
def crawl(coin):
    page = news_pages[search_keyword.index(coin)]
    news = GoogleNews(lang='ko', encode='utf-8')
    news.search(coin)
    time.sleep(30)
    news.getpage(page)
    title = news.get_texts()
    url = news.get_links()
    desc = news.get_desc()
    for t, u, d in zip(title, url, desc):
        idx = coin_index[search_keyword.index(coin)]
        if t != "" and u != "" and d != "":
            dic = {
                u"title": u'{}'.format(t),
                u"desc": u'{}'.format(d),
                u"link": u'{}'.format(u)
            }
            if coin_list[search_keyword.index(coin)] in t or coin_eng[search_keyword.index(coin)] in t:
                if idx == 0:
                    # first article for this coin: insert without checking for duplicates
                    ref = db.collection(u'{}'.format(coin_eng[search_keyword.index(coin)]))
                    ref.add(dic)
                    time.sleep(random.uniform(2, 4))
                    coin_index[search_keyword.index(coin)] += 1
                else:
                    # skip articles whose title already exists in the collection
                    flag = True
                    ref = db.collection(u'{}'.format(coin_eng[search_keyword.index(coin)])).stream()
                    for doc in ref:
                        time.sleep(random.uniform(1, 3))
                        check_dic = doc.to_dict()
                        if dic['title'] == check_dic['title']:
                            flag = False
                            break
                    if flag:
                        print('[{}] ///// {} '.format(coin, dic))
                        ref = db.collection(u'{}'.format(coin_eng[search_keyword.index(coin)]))
                        ref.add(dic)
                        time.sleep(random.uniform(1, 5))
                        coin_index[search_keyword.index(coin)] += 1
    news_pages[search_keyword.index(coin)] += 1
def get_news(text):
    googlenews = GoogleNews()
    googlenews.search(text)
    # clear() drops the page-1 results from search(), so only page 2 is returned
    googlenews.clear()
    googlenews.getpage(2)
    result = googlenews.result()
    first = result[0]
    title = first['title']
    link = first['link']
    domain = get_domain(link)
    return title, link, domain
class Engine:
    def __init__(self):
        self.news = GoogleNews()
        self.news.setlang('en')
        # self.news.setTimeRange('01/01/2000', '01/01/2015')
        self.news.setencode('utf-8')
        self.pageNumber = 1
        self.searchTerm = ""

    def nextPage(self):
        if self.searchTerm == "":
            raise RuntimeError("Engine has not searched yet")
        self.pageNumber += 1
        self.news.clear()
        self.news.getpage(self.pageNumber)
        return len(self.news.result()) != 0

    def previousPage(self):
        if self.searchTerm == "":
            raise RuntimeError("Engine has not searched yet")
        self.pageNumber -= 1
        self.news.clear()
        self.news.getpage(self.pageNumber)
        return len(self.news.result()) != 0

    def search(self, term):
        self.searchTerm = term
        self.news.search(term)
        if len(self.news.result()) == 0:
            return False
        return self.news.result()

    def getPageNumber(self):
        return self.pageNumber

    def getResults(self):
        return self.news.result()

    def clear(self):
        self.news.clear()

    def resetPageNumber(self):
        self.pageNumber = 1
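# Usage sketch (illustrative search term): search once, then walk forward
# through result pages until an empty page comes back.
engine = Engine()
if engine.search('climate policy'):
    print('page', engine.getPageNumber())
    while engine.nextPage():
        for item in engine.getResults():
            print(item['title'])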
def get_links(self, pages=1):
    """Obtains all relevant links from the search, for each company.

    Args:
        pages :: int
            number of Google result pages to collect results from

    Stores:
        links :: dict(list[dict])
            dictionary of lists, keys being search terms and values being
            relevant information (e.g. URL)
    """
    gnews = GoogleNews(start=self.date_from, end=self.date_to)
    links = {}
    # obtain all the URLs
    for s in self.search_terms:
        gnews.search(s)
        for p in range(1, pages + 1):
            gnews.getpage(p)
        result = gnews.result()  # stores values until cleared
        links[s] = result
        gnews.clear()
    # remove irrelevant links
    for s in self.search_terms:
        tmp = []
        num = dd[s]  # number of relevant terms in the search term
        rel_str = ' '.join(s.lower().split()[:num])  # relevant string
        for d in links[s]:
            # selection criterion: e.g. if the search term is 'apple news',
            # subset based on 'apple' rather than 'apple news'
            # --> filter with the first word(s) of each search term
            if rel_str in d['desc'].lower():
                tmp.append(d)
        links[s] = tmp
    self.search_info = links
    return None
def news(topic: str, start_date: str = None, end_date: str = None):
    help_text = "news: use this to fetch news<br><br>"\
                "Usage: news topic<br>"\
                "options:<br>"\
                "--help: get help (this screen)<br><br>"\
                "Followup: after fetching a set of news articles, enter<br>"\
                "n: fetch the next set of articles<br>"\
                "number: fetch the details of that article"
    googlenews = GoogleNews()
    page_num = 1
    detail = None
    if start_date is not None and end_date is not None:
        googlenews.setTimeRange(start_date, end_date)
    if topic.split()[0] == '--help':
        return {'response': help_text}
    if topic.count('~') > 0:
        followup = topic.split('~')[1]
        if followup.split()[0] == 'n':
            page_num = int(followup.split()[1]) + 1
            print(f"Page number: {page_num}")
        elif followup.split()[0].isnumeric():
            detail = int(followup.split()[0])
        topic = topic.split('~')[0]
    googlenews.search(topic)
    googlenews.getpage(1)
    news_results = googlenews.result()
    if detail is not None:
        # serial numbers shown to the user are 1-based
        news_details = news_results[detail - 1]
        print(news_details)
        details = f'{news_details["title"]}<br>{news_details["desc"]}<br>'\
                  f'<a href="{news_details["link"]}" target="_blank">Read full article</a>'
        return {'response': details}
    articles = []
    start_num = (page_num - 1) * 3
    end_num = page_num * 3
    for i, article in enumerate(news_results[start_num:end_num]):
        serial_number = str(i + 1 + (page_num - 1) * 3)
        article_summary = (serial_number,
                           f"{article['date']}, {article['media']}",
                           article['title'])
        articles.append(article_summary)
    all_articles = "<br>".join([", ".join(i) for i in articles])
    return {'response': all_articles, 'followup': True}
def extract_links(dir_c, dir_k, lang):
    for t in topics:
        print('Current topic: ', t + '\n')
        kw = get_keywords(dir_k, t)
        print('Keywords: ', kw + '\n')
        with open(dir_c + t + '.txt', 'r') as f_clean:
            fp = f_clean.readlines()
        min_d, max_d, num_d = get_date_range(fp)
        print('Date range: ', min_d, max_d + '\n')
        f_out = open(lang + '/links/' + t + '_links.txt', 'w')
        key_enc = quote(kw.encode('utf8'))
        googlenews = GoogleNews()
        googlenews.setlang(lang)
        googlenews.setTimeRange(min_d, max_d)
        googlenews.search(key_enc)
        result = googlenews.result()
        page = 1
        num_art = len(result)
        curr_art = num_art
        # keep paging until we have ~10 articles per day or results stop growing
        while curr_art < 10 * num_d:
            page += 1
            googlenews.getpage(page)
            result = googlenews.result()
            num_art = len(result)
            if curr_art < num_art:
                curr_art = num_art
            else:
                break
        for i in range(curr_art):
            date = str(dateparser.parse(result[i]['date']).date())
            link = result[i]['link']
            f_out.write(date + '\n' + link)
            f_out.write('\n--------------------------------\n')
        print('--------------------------------\n')
        f_out.close()
def get_corpus_in_time_interval(start_time, end_time, args):
    query = args.query
    page_count = args.pages
    gn = GoogleNews(start=start_time, end=end_time)
    corpus = list()
    gn.search(query)
    for i in range(1, page_count + 1):
        gn.clear()
        gn.getpage(i)
        all_rel_news = gn.result()
        for raw_news in all_rel_news:
            news = News(raw_news)
            if i == 1:
                news.set_relv()
            if news.mainText != 'fail':
                corpus.append(news)
    return corpus
def getNews(topic, start_time, end_time):
    googlenews = GoogleNews(start=start_time, end=end_time)
    titles = []
    texts = []
    labels = []
    for i in range(1, 2):
        googlenews.clear()
        googlenews.search(topic)
        googlenews.getpage(i)
        tmp = googlenews.result()
        (tmp_title, tmp_text) = get_content(tmp)
        titles += tmp_title
        texts += tmp_text
        # first-page articles are labelled 1, later pages 0
        if i == 1:
            labels += [1 for _ in range(len(tmp_text))]
        else:
            labels += [0 for _ in range(len(tmp_text))]
    return (titles, texts, labels)
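# Usage sketch: topic and dates are illustrative; get_content() is this
# project's own helper for downloading the linked articles.
titles, texts, labels = getNews('electric vehicles', '01/01/2021', '01/31/2021')
print(len(titles), 'articles,', sum(labels), 'labelled relevant')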
def googlenews_extract(date_range, num_pages, search_text):
    '''
    Use the GoogleNews package to extract stories from the top {num_pages}
    pages per day based on {search_text}.
    '''
    df_days = []
    # TODO: add multi-threading
    for date in date_range:
        googlenews = GoogleNews(start=date, end=date)
        googlenews.search(search_text)
        print("Search Date = ", date)
        for i in range(0, num_pages):
            print('Executing GoogleNews call #', i + 1)
            googlenews.getpage(i)
            result_next = googlenews.result()
            print("Total records returned: ", len(result_next))
        df = pd.DataFrame(result_next)
        df['date_calendar'] = date
        df_days.append(df)
    appended_data = pd.concat(df_days)
    # Drop duplicate titles
    appended_data = appended_data.drop_duplicates(subset=['title'])
    # Build the master news df
    df_news = appended_data.reset_index(drop=True).drop(['date'], axis=1)
    return df_news
def search_news(self, max_page=10):
    news_list = list()
    # iterate over keywords
    for keyword in self.keywords_dict[self.ticker]:
        # GoogleNews accepts a different date format from yfinance
        googlenews_client = GoogleNews(start=self.gnews_date_fmt(self.start_time),
                                       end=self.gnews_date_fmt(self.end_time))
        googlenews_client.search(keyword)
        for i in range(1, max_page):
            googlenews_client.getpage(i)
        news_list = news_list + googlenews_client.result()
    # convert to a pandas DataFrame and remove duplicates
    temp_df = pd.DataFrame(news_list)
    temp_df.drop_duplicates(subset='link', inplace=True, keep='first')
    # fetch the text behind each link
    content_list = list()
    for ind in temp_df.index:
        article_link = temp_df['link'][ind]
        if any(link_filter in article_link for link_filter in self.link_filter_list):
            continue
        try:
            article = Article(article_link, config=self.config)
            article.download()
            article.parse()
            record_dict = {'Date': temp_df['date'][ind],
                           'Media': temp_df['media'][ind],
                           'Title': article.title,
                           'Article': article.text,
                           'Link': article_link}
            content_list.append(record_dict)
        except Exception:
            print('Can\'t fetch article: {:s}'.format(temp_df['link'][ind]))
    self.news_df = pd.DataFrame(content_list)
# SentimentClassifier, class_names, device, tokenizer, MAX_LEN and the
# `googlenews` client are assumed to be defined earlier in this script.
model = SentimentClassifier(len(class_names))
model.load_state_dict(
    torch.load('model/best_model_state.bin', map_location='cpu'))
model = model.to(device)

# review_text = input('Enter the review you want to check:\n')

## Google News start
news_content = []
searchInput = input('Enter the search keyword:\n')
googlenews.search(searchInput)
for page in range(1, 1 + 1):
    googlenews.getpage(page)
    for item in googlenews.result():
        news_content.append(item['desc'])
googlenews.clear()
## End

for desc in news_content:
    encoded_review = tokenizer.encode_plus(
        desc,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
def __extrai_noticias_gnews(q, dia_inicio, dia_fim, num_limite_paginas=1,
                            lang='pt-BR', sleep=1, tentativas=5):
    """
    Returns a DataFrame with the news items found in Google's News tab.

    Parameters
    ----------
    q : str
        Search string
    dia_inicio, dia_fim : datetime.date
        Start and end dates for the search
    num_limite_paginas : int
        Maximum number of pages to fetch
    lang : str
        Language code for the search (default pt-BR)
    sleep : int
        Number of seconds to wait after each failed page fetch
    tentativas : int
        Number of attempts at fetching a page before the extraction
        is considered finished

    Returns
    -------
    resultados : DataFrame
        DataFrame with the search results
    """
    # Search string properly formatted for a URL
    # q = urllib.parse.quote(q)

    # Format the dates the way the GoogleNews lib expects
    formato_data = '%m/%d/%Y'
    dia_inicio_formatado = dia_inicio.strftime(formato_data)
    dia_fim_formatado = dia_fim.strftime(formato_data)

    # Instantiate the Google News search interface with the right language and period
    gn = GoogleNews(lang=lang, start=dia_inicio_formatado, end=dia_fim_formatado)

    # List to accumulate the search results
    resultados = []

    # Fetch the first page
    logger = logging.getLogger('covidata')
    logger.info('Fetching page 1')
    gn.search(q)
    resultados = resultados + gn.result()
    gn.clear()

    # From page 2 onwards (page 2 corresponds to index 1)
    for i in range(2, num_limite_paginas + 1):
        logger.info(f'Fetching page {i}')
        gn.getpage(i)
        # Add the results to the list
        resultados = resultados + gn.result()
        # If the query for this page returned no results
        if gn.result() == []:
            logger.info(f'The query for page {i} returned no results')
            # Decrement the attempt counter
            tentativas = tentativas - 1
            logger.info(f'*** {tentativas} attempts remaining ***')
            # Once the number of attempts reaches zero, stop
            if tentativas < 1:
                break
            else:
                # Pause the script for `sleep` seconds before fetching the next page
                logger.info(f'Execution paused for {sleep} seconds')
                time.sleep(sleep)
        # Clear the cached results
        gn.clear()

    # Build and return the DataFrame
    return pd.DataFrame(resultados)
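# Usage sketch (illustrative query and dates): fetch up to three pages of
# pt-BR news for one week and inspect the resulting DataFrame. The leading
# double underscore only triggers name mangling inside classes, so the
# function is callable at module level.
from datetime import date

df = __extrai_noticias_gnews('auxílio emergencial', date(2020, 5, 1),
                             date(2020, 5, 7), num_limite_paginas=3)
print(df.shape)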