def __init__(self, politician_name):
    """Initialize an article collection for a politician.

    Fetches recent (3-day) Ukrainian-language Google News results for
    *politician_name* and pairs each of the first five articles with an
    image URL scraped from a Google Images search for the same name.

    Args:
        politician_name: Full name with at least two whitespace-separated
            tokens, e.g. "Name Surname".

    Attributes set:
        self.link: Google Images search URL used for thumbnails.
        self.articles: list of [title, article_link, image_link] triples.
    """
    news = GoogleNews()
    news.setlang("uk")
    news.setencode("utf-8")
    news.setperiod("3d")
    news.search(politician_name)
    info = news.result()

    # Split once instead of twice (original called .split() per token).
    name, surname = politician_name.split()[:2]
    self.link = f"https://www.google.com/search?q=+{name}+{surname}+новини&source=lnms&tbm=isch"

    # Fetch the image-search page directly; the original wrapped this
    # single request in a nested function it immediately called.
    soup = BeautifulSoup(requests.get(self.link).text, "html.parser")
    # First six <img> tags; index 0 is presumably Google's own logo,
    # since the original pairs article i with image i + 1.
    image_links = [item["src"] for item in soup.find_all("img")[:6]]

    # Robustness fix: the original assumed at least 5 results and
    # 6 images and raised IndexError otherwise.
    self.articles = []
    pairs = min(5, len(info), max(len(image_links) - 1, 0))
    for i in range(pairs):
        text = info[i]
        self.articles.append([text["title"], text["link"], image_links[i + 1]])
def get_news(assunto):
    """Fetch today's Portuguese-language Google News headlines for *assunto*.

    Returns:
        A slice (items 3..7) of the result headlines, or the string
        "Sem notícias recentes" when the search yields nothing.
    """
    buscador = GoogleNews(period='d')
    buscador.setlang('pt')
    buscador.set_encode('utf-8')
    buscador.set_time_range('12/02/2021', '13/02/2021')
    buscador.get_news(assunto)

    headlines = buscador.get_texts()
    if not headlines:
        return "Sem notícias recentes"
    return headlines[3:8]
def initalize_google_news(start_date, end_date):
    """Create and configure a GoogleNews client for an English search.

    NOTE: the misspelled name ("initalize") is kept so existing callers
    keep working.

    Args:
        start_date: Range start in the format the GoogleNews library
            expects (presumably mm/dd/yyyy — confirm against callers).
        end_date: Range end, same format.

    Returns:
        A configured GoogleNews instance.
    """
    print("initalize_google_news...")
    # Consistency fix: encoding is set once via the constructor; the
    # original also called setencode("utf-8") redundantly.
    googlenews = GoogleNews(encode="utf-8")
    googlenews.setlang("en")
    googlenews.setperiod("d")
    googlenews.setTimeRange(start_date, end_date)  # user-specified date range
    return googlenews
def search(keyword=None, datestart=None, dateend=None, pgs=1):
    """Search Google News for *keyword*, download/parse/summarize each
    article and append the records to the module-level ``noticias`` list.

    Args:
        keyword: Query string passed to Google News.
        datestart: Start date string for the GoogleNews constructor.
        dateend: End date string for the GoogleNews constructor.
        pgs: Number of result pages to fetch.
    """
    # Module-level state shared with the rest of the script.
    global noticias
    global cont
    global acabou

    print('Keyword: ', keyword)

    # Configure and run the search.
    googlenews = GoogleNews(start=datestart, end=dateend)
    googlenews.setlang('pt')
    googlenews.search(keyword)
    result = googlenews.result()

    # Preview the first five results.
    df = pd.DataFrame(result)
    print(df.head())

    # Fetch the requested range of pages; GoogleNews accumulates results
    # internally, so the final DataFrame covers every page fetched.
    for i in range(0, pgs):
        googlenews.getpage(i)
        result = googlenews.result()
        df = pd.DataFrame(result)

    # Download, parse and summarize each article.
    for ind in df.index:
        print('Noticia numero: {}'.format(ind))
        article = Article(df['link'][ind], config=config)
        article.download()
        try:
            article.parse()
            article.nlp()
            # Renamed from `dict`, which shadowed the builtin.
            registro = {}
            registro['Date'] = df['date'][ind]
            registro['Media'] = df['media'][ind]
            registro['Title'] = article.title
            registro['Article'] = article.text
            registro['Summary'] = article.summary
            registro['Created'] = False
            noticias.append(registro)
        except Exception:
            # Bug fix: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit. Best-effort skipping of
            # articles that fail to download/parse is preserved.
            # (The original's `time.sleep(0)` no-op was removed.)
            print('Error')
class Engine:
    """Thin pagination wrapper around a GoogleNews client."""

    def __init__(self):
        self.news = GoogleNews()
        self.news.setlang('en')
        self.news.setencode('utf-8')
        self.pageNumber = 1
        # Last search term; empty string means no search has run yet.
        self.searchTerm = ""

    def _requireSearch(self):
        """Raise if search() has not been called yet.

        Bug fix: the original compared the *bound method*
        ``self.news.result`` to None, which is never true, so the guard
        could never fire. We track the search term instead.
        """
        if not self.searchTerm:
            raise RuntimeError("Engine has not searched yet")

    def _loadPage(self):
        # Shared by nextPage/previousPage: reload the current page and
        # report whether it yielded any results.
        self.news.clear()
        self.news.getpage(self.pageNumber)
        return len(self.news.result()) != 0

    def nextPage(self):
        """Advance one page; return True if the new page has results."""
        self._requireSearch()
        self.pageNumber += 1
        return self._loadPage()

    def previousPage(self):
        """Go back one page; return True if the new page has results."""
        self._requireSearch()
        self.pageNumber -= 1
        return self._loadPage()

    def search(self, term):
        """Run a search; return the results, or False when empty."""
        self.searchTerm = term
        self.news.search(term)
        if len(self.news.result()) == 0:
            return False
        return self.news.result()

    def getPageNumber(self):
        return self.pageNumber

    def getResults(self):
        return self.news.result()

    def clear(self):
        self.news.clear()

    def resetPageNumber(self):
        self.pageNumber = 1
def extract_links(dir_c, dir_k, lang):
    """For each topic in the module-level ``topics`` list, query Google
    News over the topic's date range and write one "date\\nlink" record
    per article to ``<lang>/links/<topic>_links.txt``.

    Args:
        dir_c: Directory holding the cleaned per-topic text files.
        dir_k: Directory passed to get_keywords().
        lang: Language code; also used as the output directory prefix.
    """
    for t in topics:
        print('Current topic: ', t + '\n')
        kw = get_keywords(dir_k, t)
        print('Keywords: ', kw + '\n')

        # Resource fix: the original never closed this file handle.
        with open(dir_c + t + '.txt', 'r') as f_clean:
            fp = f_clean.readlines()
        min_d, max_d, num_d = get_date_range(fp)
        print('Date range: ', min_d, max_d + '\n')

        key_enc = quote(kw.encode('utf8'))
        googlenews = GoogleNews()
        googlenews.setlang(lang)
        googlenews.setTimeRange(min_d, max_d)
        googlenews.search(key_enc)
        result = googlenews.result()

        # Keep paging until we have ~10 articles per day in the range,
        # or the result count stops growing (no more pages).
        page = 1
        curr_art = len(result)
        while curr_art < 10 * num_d:
            page += 1
            googlenews.getpage(page)
            result = googlenews.result()
            if curr_art < len(result):
                curr_art = len(result)
            else:
                break

        # Resource fix: `with` guarantees the output file is closed
        # even if date parsing raises mid-loop.
        with open(lang + '/links/' + t + '_links.txt', 'w') as f_out:
            for i in range(curr_art):
                date = str(dateparser.parse(result[i]['date']).date())
                f_out.write(date + '\n' + result[i]['link'])
                f_out.write('\n--------------------------------\n')
        print('--------------------------------\n')
def crawl(search: str, lang: str = "en", directory: str = None) -> list:
    """Search Google News and download every linked article.

    Args:
        search: Query string.
        lang: Google News language code.
        directory: When given, each downloaded article is additionally
            serialized to ``<directory>/<search>_<idx>_<lang>.yml``.

    Returns:
        The list of downloaded Article objects (links whose download
        raised MaxRetryError are logged and skipped).
    """
    gn = GoogleNews()
    gn.setlang(lang)
    gn.search(search)

    articles = []
    for res in gn.result():
        try:
            articles.append(Article.download(res['link']))
        except MaxRetryError:
            logger.error(f'MaxRetryError for {res["link"]}')

    if directory is not None:
        for idx, article in enumerate(articles):
            if article is None:
                continue
            filename = f'{search}_{idx}_{lang}.yml'
            filename = filename.lower().replace(' ', '_')
            # Bug fix: the original interpolated a broken placeholder
            # instead of *filename*, so every article was written to the
            # same literal path and overwrote the previous one.
            article.to_yaml(f'{directory}/{filename}')

    return articles
"""Example of GoogleNews usage."""
from GoogleNews import GoogleNews
from pprint import pprint

# Configure an English client limited to the last three days.
client = GoogleNews()
client.setlang('en')
client.setencode('utf-8')
client.setperiod('3d')

# First page of results.
client.search('Programming')
first_page = client.result()
print(client.total_count())
print(len(first_page))

# Fetch page 2; result() accumulates, so the count grows.
client.get_page(2)
accumulated = client.result()
print(len(accumulated))
pprint(accumulated)
from GoogleNews import GoogleNews

googlenews = GoogleNews()
# NOTE: GoogleNews does not handle Norwegian special characters (æ ø å).
googlenews.setlang('no')


def search_with_inmput(pages, search_term=None):
    """Search Google News for a user-supplied term and return the links.

    Bug fix: the original passed the *builtin* ``input`` function object
    to ``googlenews.search`` instead of a query string. A term can now
    be passed directly (backward-compatible new parameter); when omitted,
    the user is prompted interactively.

    Args:
        pages: Total number of result pages to fetch (>= 1).
        search_term: Query string; prompted for when None.

    Returns:
        The list of result links.
    """
    if search_term is None:
        search_term = input("Search term: ")
    googlenews.search(search_term)
    # range(2, pages + 1) is empty for pages < 2, so no guard is needed.
    for page in range(2, pages + 1):
        googlenews.getpage(page)
    news_links = googlenews.get__links()
    googlenews.clear()
    return news_links


def norge_klima_search(pages):
    """Search Google News for 'norge klima' and return the links.

    Args:
        pages: Total number of result pages to fetch (>= 1).

    Returns:
        The list of result links.
    """
    googlenews.search('norge klima')
    for page in range(2, pages + 1):
        googlenews.getpage(page)
    news_links = googlenews.get__links()
    googlenews.clear()
    return news_links
# NOTE(review): this chunk is a fragment stitched from several scripts —
# the `continue`/`break` below belong to an input-validation loop that is
# not visible here, and names such as choice, go_to_stackoverflow,
# import_from_phone, import_from_camcorder, askUser, all_data and
# input_string are defined elsewhere. Not runnable standalone.
continue
# Accept only menu choices 1-4; otherwise re-prompt (loop not shown).
if 0 < choice < 5:
    break
else:
    print("That is not between 1 and 4! Try again:")
# NOTE(review): format placeholder is never filled — prints "{}" literally.
print ("You entered: {} ")  # Good to use format instead of string formatting with %
# Dispatch table mapping the menu choice to a handler.
# NOTE(review): keys 3 and 4 both map to import_from_camcorder — possibly
# a copy-paste slip; confirm against the menu text.
mydict = {1:go_to_stackoverflow, 2:import_from_phone, 3:import_from_camcorder, 4:import_from_camcorder}
mydict[choice]()
print(askUser())
# Collect the Google News query and date range from the user.
# NOTE(review): prompt says MM-DD-YYY (three Y's) — likely a typo, and the
# separator may not match what setTimeRange expects; verify.
s_req = input("Enter the term you would like to search")
st_date = input("Please enter your desired start date (MM-DD-YYY): ")
en_date = input("Please enter your desired end date (MM-DD-YYY): ")
googlenews = GoogleNews()
googlenews.setlang('en')
googlenews.setTimeRange(st_date,en_date)
googlenews.search(s_req)
# NOTE(review): return value discarded — the results are never used here.
googlenews.result()
#create a least squares regression model using the variablles
all_adj_close= all_data[['Adj Close']]
# Log returns from adjacent adjusted closes; first row is NaN (shift).
all_returns = np.log(all_adj_close / all_adj_close.shift(1))
#isolate the returns you want to value for the OLS
print("As a reminder, you have selected the following: " + input_string)
sample_stocks = input("Please choose 2 of the stocks you have chosen to calculate a OLS regression: ")
#!/usr/bin/env python
# coding: utf-8
"""Interactive script: search Chinese-language Google News for a keyword
and print each headline with its link."""

import pandas as pd
import jieba
from GoogleNews import GoogleNews
from bs4 import BeautifulSoup
import requests

# Configure a Chinese-language client limited to the last day.
client = GoogleNews()
client.setlang('cn')
client.setperiod('d')
client.setencode('utf-8')
client.clear()

keyword = input("請輸入要搜尋的關鍵字,將為你搜集相關字詞內容:")
client.search(keyword)

alldata = client.result()
headlines = client.gettext()
links = client.get_links()

print()
for idx, headline in enumerate(headlines):
    print(headline)
    print(links[idx])