def crawling_news(company_name_list, start_date, end_date, save_file_name):
    # set up the logger handler
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)

    # define GoogleNews (dates in mm/dd/yyyy)
    googlenews = GoogleNews(lang='en', start=start_date, end=end_date, encode='utf-8')

    # news.google.com search sample
    all_title = []
    logger.info('loop start')
    for i, comp_name in enumerate(company_name_list):
        googlenews.search(comp_name)
        logger.info('%s : %d%%' % (comp_name, ((i + 1) / len(company_name_list)) * 100))
        # grab the result list once instead of re-reading it per item
        for item in googlenews.results():
            all_title.append([item.get('title'), comp_name, fixing_date(item.get('date'))])
        # clear the result list before the next company
        googlenews.clear()

    all_title = pd.DataFrame(all_title)
    all_title.to_csv('%s.csv' % save_file_name)
    logger.info('saved as %s.csv, done!!' % save_file_name)
    return all_title
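# A minimal usage sketch for crawling_news above (not from the original
# source): the companies, mm/dd/yyyy dates, and output name are illustrative,
# and it assumes GoogleNews/pandas/logging are imported and the project's own
# fixing_date helper is in scope.
df = crawling_news(company_name_list=['Apple', 'Samsung'],
                   start_date='01/01/2021', end_date='01/31/2021',
                   save_file_name='tech_news')
print(df.head())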
def getNews(query):
    googleNews = GoogleNews()
    googleNews.search(query)
    news = []
    results = googleNews.result()
    number = min(len(results), 6)  # cap at six items
    for i, result in enumerate(results):
        if i >= number:
            break
        n = {
            "title": result['title'],
            "description": result['desc'],
            "link": result['link'],
        }
        if i == 0:
            # only the top story carries an image
            n["image"] = result['img']
        news.append(n)
    googleNews.clear()
    return news
def job(self):
    # download the current database
    self.getDB()
    self.print_header(self.rawFileName)
    self.lineCounter(self.rawFileName)
    x = 0
    for tag in self.newsTags:
        self.logger.info(f"Collecting news from tag: {tag}")
        googlenews = GoogleNews()
        googlenews.clear()
        googlenews.set_lang(self.newsLang)
        googlenews.set_period('1d')  # current API name; setperiod is the deprecated spelling
        googlenews.get_news(tag)
        output = pd.DataFrame(googlenews.results(sort=True))
        x = x + len(output)  # len(output['title']) raises KeyError on empty results
        self.saveToFile(output, self.rawFileName)
    self.logger.info(f"Collected amount of news: {x}")
    self.removeDuplicates(self.rawFileName, self.finalFileName)
    # os.remove(rawFileName)  # delete the buffer file
    # logger.info(f"Removed file with duplicates: {rawFileName}")
    os.rename(self.finalFileName, self.rawFileName)  # rename the final file back to the buffer name
    self.logger.info(f"Renamed: {self.finalFileName} to: {self.rawFileName}")
    self.backupDB()
def googleNewsCrawler(self):
    result_list = []
    googlenews = GoogleNews()
    for i in range(self.__numDays):
        startDateTime = self.__dateTime + timedelta(days=i)
        endDateTime = self.__dateTime + timedelta(days=i + self.__daysSpan)
        googlenews.setTimeRange(start=startDateTime.strftime('%m/%d/%Y'),
                                end=endDateTime.strftime('%m/%d/%Y'))
        googlenews.search(self.__keyWords)
        # search() fetches page 1; append the remaining pages
        for j in range(self.__pagsEveryDay - 1):
            googlenews.getpage(j + 2)
        logging.info('%s__%s append %d items' % (self.__keyWords, startDateTime.date(),
                                                 self.__pagsEveryDay * 10))
        result_list = result_list + googlenews.result()
        googlenews.clear()
        # flush to JSON every ten days to keep memory bounded
        if (i + 1) % 10 == 0:
            self.toJson(result_list)
            result_list = []
    self.toJson(result_list)
def get_company_news_link(company='NaN', news_num=5, time_range='today'):
    if company == 'NaN':
        return 'please input company name'
    news_link = []
    googlenews = GoogleNews()
    googlenews.clear()
    if time_range != 'today':
        # time_range format: 'YYYY/MM/DD-YYYY/MM/DD'; GoogleNews wants mm/dd/yyyy
        start_date = "{1}/{2}/{0}".format(time_range[0:4], time_range[5:7], time_range[8:10])
        end_date = "{1}/{2}/{0}".format(time_range[11:15], time_range[16:18], time_range[19:21])
        googlenews.set_time_range(start_date, end_date)
    googlenews.search(company)
    result = googlenews.result()
    try:
        for num in range(news_num):
            news_link.append(result[num]['link'])
    except IndexError:
        # fewer results than requested
        if len(news_link) == 0:
            return 'No ' + company + ' news in this period, or the network is unstable'
        return news_link
    else:
        return news_link
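# Usage sketch for get_company_news_link (not from the original source). The
# slicing above implies time_range strings of the form 'YYYY/MM/DD-YYYY/MM/DD';
# the company name below is illustrative.
links_today = get_company_news_link(company='TSMC')
links_range = get_company_news_link(company='TSMC', news_num=3,
                                    time_range='2020/12/01-2020/12/28')
print(links_range)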
def extract_google(query_terms, startDate, endDate):
    # dates arrive as 'YYYY-MM-DD'; default to the last seven days
    if len(startDate) == 0:
        startDate = (datetime.datetime.today().date()
                     - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
    if len(endDate) == 0:
        endDate = datetime.datetime.today().strftime('%Y-%m-%d')
    # convert to the mm/dd/yyyy format GoogleNews expects
    startDate = datetime.datetime.strptime(startDate, '%Y-%m-%d').strftime('%m/%d/%Y')
    endDate = datetime.datetime.strptime(endDate, '%Y-%m-%d').strftime('%m/%d/%Y')
    final_articles = []
    print(startDate)
    print(endDate)
    print("Crawling Starting")
    # extract news from Google News
    googlenews = GoogleNews()
    googlenews.setTimeRange(startDate, endDate)
    for query in query_terms:
        googlenews.clear()
        # form the search term
        googlenews.search("India Technology " + query)
        result = googlenews.result()
        for n in range(len(result)):
            source = result[n]['media']
            url = result[n]['link']
            try:
                article = Article(url)
                article.download()
                article.parse()
            except Exception:
                print("Trouble downloading so skipping")
                continue
            content = article.text
            # summarize the content: strip any leading bracketed dateline,
            # then keep the first two sentences
            temp_content = re.sub(r'^\s*[\(\[].*?[\)\]]\s*', '', content)
            sentences = sent_detector.tokenize(temp_content)
            summary = " ".join(sentences[:2]).strip()
            date = result[n]['date']
            if date.find('ago') != -1:
                # map relative dates like '2 hours ago' to today
                date = datetime.datetime.today().date()
            title = result[n]['title']
            img = result[n]['img']
            # add the extracted info to the final_articles list
            final_articles.append({
                'source': source,
                'url': url,
                'date': date,
                'title': title,
                'content': content,
                'summary': summary,
                'img': img
            })
    return final_articles
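# Usage sketch for extract_google (not from the original source): empty
# strings fall back to the last seven days, explicit dates use 'YYYY-MM-DD',
# and the newspaper Article / nltk sent_detector dependencies are assumed to
# be in scope; the query terms are illustrative.
articles = extract_google(['AI', '5G'], '2021-01-01', '2021-01-15')
print(len(articles))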
def get_news(text):
    googlenews = GoogleNews()
    googlenews.search(text)  # fetches page 1
    googlenews.getpage(2)    # appends page 2; clearing first would drop page 1
    result = googlenews.result()
    googlenews.clear()
    return result
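# Several snippets in this collection mix search(), getpage(), and clear().
# A small sketch of how results accumulate in the GoogleNews package, per its
# documented behavior (the query string is illustrative):
from GoogleNews import GoogleNews

gn = GoogleNews()
gn.search('python')        # runs the query and stores page 1
gn.getpage(2)              # appends page 2 to the same internal list
print(len(gn.result()))    # pages 1 and 2 together
gn.clear()                 # empties the stored results (the query is kept)
gn.getpage(3)              # now result() holds page 3 only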
def search_google_news(query, google_date):
    # -- Retrieve news articles
    # init GoogleNews
    googlenews = GoogleNews()
    # googlenews.set_period('7d')  # cannot use set_period with set_time_range; use either or
    # googlenews.set_time_range(str(google_date), '2020-10-12')
    googlenews.set_encode('utf-8')
    googlenews.search(query)
    googlenews.getpage(50)  # appends page 50 only, not pages 1-50
    result = googlenews.result()
    # clear before searching again
    googlenews.clear()
    return result
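# The comment above notes that set_period and set_time_range should not be
# combined; a sketch of choosing one or the other (query and dates
# illustrative):
from GoogleNews import GoogleNews

gn = GoogleNews()
gn.set_period('7d')                              # relative window...
# gn.set_time_range('10/01/2020', '10/12/2020') # ...or an absolute mm/dd/yyyy range
gn.search('renewable energy')
print(len(gn.result()))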
def getNews(text):
    # parameters: language and timeframe (qdr codes: 'd' day, 'm' month, 'y' year)
    googlenews = GoogleNews("en", "m")
    googlenews.search(text)  # search() itself fetches the first result page
    headlines = googlenews.gettext()  # headlines only (links and images are also available)
    # join into a single string, because WordCloud expects a string-like object
    text = ' '.join(str(elem) for elem in headlines)
    generateWordCloud(text)
    googlenews.clear()
def get_news(text):
    googlenews = GoogleNews()
    googlenews.search(text)  # fetches page 1
    googlenews.getpage(2)    # appends page 2; clearing first would drop page 1
    result = googlenews.result()
    googlenews.clear()
    # pull title, link, and domain out of the top result
    first = result[0]
    title = first['title']
    link = first['link']
    domain = get_domain(link)
    return title, link, domain
async def search(ctx, *, message):
    googlenews = GoogleNews(lang='en', period='d')
    googlenews.search(message)
    result = googlenews.gettext()
    embed = discord.Embed()
    embed.colour = discord.Colour.from_rgb(255, 225, 135)
    embed.set_author(name="Google NEWS", icon_url="https://i.imgur.com/tDLGRiT.jpg")
    embed.set_footer(text="Data from Google News | WallSt Bot made by Bruno Lazaro.")
    embed.add_field(
        name=f"{message} News",
        # headlines 1-6; slicing avoids an IndexError when fewer come back
        value='\n\n'.join(result[1:7]),
        inline=False)
    await ctx.send(embed=embed)
    googlenews.clear()
def get_links(self, pages=1):
    """Obtains all relevant links from the search, for each company.

    Args:
        pages :: int
            number of Google pages to collect results from
    Stores:
        links :: dict(list[dict])
            keys are search terms, values are lists of result
            dictionaries with the relevant information (e.g. URL)
    """
    gnews = GoogleNews(start=self.date_from, end=self.date_to)
    links = {}
    # obtain all the URLs
    for s in self.search_terms:
        gnews.search(s)  # fetches page 1
        for p in range(2, pages + 1):
            gnews.getpage(p)  # append further pages
        links[s] = gnews.result()  # results accumulate until cleared
        gnews.clear()
    # remove irrelevant links
    for s in self.search_terms:
        tmp = []
        num = dd[s]  # number of relevant words in the search term (external mapping)
        rel_str = ' '.join(s.lower().split()[:num])  # relevant substring
        for d in links[s]:
            # selection criterion: if the search term is 'apple news',
            # filter on 'apple' rather than the full 'apple news'
            if rel_str in d['desc'].lower():
                tmp.append(d)
        links[s] = tmp
    self.search_info = links
    return None
def detailedNews():
    name = request.form["companyName"]
    googlenews = GoogleNews()
    googlenews.clear()
    googlenews.search(name)
    newsresult = googlenews.result(sort=True)
    newshistory = ()
    if 'logged_in' in session:
        time1 = datetime.now()
        unm = session["username"]
        db = pymysql.connect(host="localhost", user="******", password="", database="stock")
        cursor = db.cursor()
        sqlnews = "select * from newshistory where username = %s"
        if cursor.execute(sqlnews, (unm,)) != 0:
            newshistory = cursor.fetchall()
        try:
            with db.cursor() as cursor:
                sql = "insert into newshistory(newsname, username, time) values (%s, %s, %s)"
                cursor.execute(sql, (name, unm, time1))
            db.commit()
        finally:
            db.close()
    # cap the number of headlines shown at 5
    count1 = min(len(newsresult), 5)
    return render_template('DetailedNews.html',
                           title='Display News',
                           l=newsresult,
                           year=datetime.now().year,
                           name=name,
                           newshistory=newshistory,
                           count1=count1)
def get_corpus_in_time_interval(start_time, end_time, args):
    query = args.query
    page_count = args.pages
    gn = GoogleNews(start=start_time, end=end_time)
    corpus = list()
    gn.search(query)
    for i in range(1, page_count + 1):
        gn.clear()      # keep only the current page's results
        gn.getpage(i)
        all_rel_news = gn.result()
        for raw_news in all_rel_news:
            news = News(raw_news)
            if i == 1:
                news.set_relv()  # page-1 hits are marked as relevant
            if news.mainText != 'fail':
                corpus.append(news)
    return corpus
def scrape_the_news():
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
    config = Config()
    config.browser_user_agent = user_agent
    topiclist = NLP_news()
    print(topiclist[0])
    googlenews = GoogleNews()
    googlenews.set_lang('en')
    googlenews.set_encode('utf-8')
    googlenews.set_period('7d')
    googlenews.get_news(topiclist[0])
    result = googlenews.results()
    googlenews.clear()
    df = pd.DataFrame(result)
    df = df.drop(['date', 'media'], axis=1)
    df.columns = ['Date', 'Summary', 'Image', 'Link', 'Site', 'Title']
    df = df[['Title', 'Summary', 'Image', 'Link', 'Date', 'Site']]
    conn = psycopg2.connect("dbname=EdTech user=postgres password=edtech123")
    curr = conn.cursor()
    for i, row in df.iterrows():
        try:
            row.Link = 'https://' + row.Link  # get_news links come back without a scheme
            columns = row.keys()
            values = [row[column] for column in columns]
            insert_statement = ("INSERT INTO scrapenews_newslist VALUES "
                                "(nextval('scrapenews_newslist_id_seq'::regclass), %s, %s, %s, %s, %s, %s)")
            curr.execute(insert_statement, tuple(values))
        except Exception:
            print('could not add row', i)
    conn.commit()
    curr.close()
    conn.close()
def getNews(topic, start_time, end_time):
    googlenews = GoogleNews(start=start_time, end=end_time)
    titles = []
    texts = []
    labels = []
    # only page 1 is crawled here; widen the range to add more pages
    for i in range(1, 2):
        googlenews.clear()
        if i == 1:
            googlenews.search(topic)  # search() itself fetches page 1
        else:
            googlenews.getpage(i)
        tmp = googlenews.result()
        (tmp_title, tmp_text) = get_content(tmp)
        titles += tmp_title
        texts += tmp_text
        # page-1 results are labelled 1 (relevant), later pages 0
        if i == 1:
            labels += [1 for _ in range(len(tmp_text))]
        else:
            labels += [0 for _ in range(len(tmp_text))]
    return (titles, texts, labels)
def googlenews_function(keyword='台積電', language='cn',
                        start_date='2020/12/01', end_date='2020/12/28'):
    '''
    - date range
    - keyword
    - language
    - (number of pages to crawl)
    '''
    googlenews = GoogleNews()
    googlenews.clear()
    googlenews.set_encode('utf-8')
    googlenews.set_lang(language)
    # convert 'YYYY/MM/DD' into the 'MM/DD/YYYY' format GoogleNews expects
    start_year, start_month, start_day = start_date.split('/')
    all_date_start = '{}/{}/{}'.format(start_month, start_day, start_year)
    end_year, end_month, end_day = end_date.split('/')
    all_date_end = '{}/{}/{}'.format(end_month, end_day, end_year)
    googlenews.set_time_range(start=all_date_start, end=all_date_end)
    googlenews.search(keyword)
    data = googlenews.result()
    print("Total records:", len(data))
    news = pd.DataFrame(data)
    # news.to_csv("GoogleNews_" + keyword + "_" + start_date.replace('/', '-')
    #             + '_to_' + end_date.replace('/', '-') + ".csv", index=False)
    return news
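# Usage sketch for googlenews_function (not from the original source); the
# keyword, language, and 'YYYY/MM/DD' dates are illustrative.
news_df = googlenews_function(keyword='Apple', language='en',
                              start_date='2021/01/01', end_date='2021/01/31')
print(news_df.head())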
def getNews(query):
    googleNews = GoogleNews()
    googleNews.search(query)
    news = []
    results = googleNews.result()
    number = min(len(results), 7)  # cap at seven items
    for i, result in enumerate(results):
        if i >= number:
            break
        news.append({
            "title": result['title'],
            "description": result['desc'],
            "link": result['link'],
            "date": result['date'],
            "image": result['img']
        })
    googleNews.clear()
    return news
class Engine:
    def __init__(self):
        self.news = GoogleNews()
        self.news.set_lang('en')
        # self.news.set_time_range('01/01/2000', '01/01/2015')
        self.news.set_encode('utf-8')
        self.pageNumber = 1
        self.searchTerm = ""

    def nextPage(self):
        # comparing the result method itself to None is always False,
        # so track whether search() has been called via searchTerm
        if not self.searchTerm:
            raise RuntimeError("Engine has not searched yet")
        self.pageNumber += 1
        self.news.clear()
        self.news.getpage(self.pageNumber)
        return len(self.news.result()) != 0

    def previousPage(self):
        if not self.searchTerm:
            raise RuntimeError("Engine has not searched yet")
        if self.pageNumber <= 1:
            return False
        self.pageNumber -= 1
        self.news.clear()
        self.news.getpage(self.pageNumber)
        return len(self.news.result()) != 0

    def search(self, term):
        self.searchTerm = term
        self.news.search(term)
        if len(self.news.result()) == 0:
            return False
        return self.news.result()

    def getPageNumber(self):
        return self.pageNumber

    def getResults(self):
        return self.news.result()

    def clear(self):
        self.news.clear()

    def resetPageNumber(self):
        self.pageNumber = 1
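# A short driver for the Engine class (not from the original source); the
# search term and the three-page cap are illustrative.
engine = Engine()
results = engine.search('electric vehicles')
if results:
    for r in results:
        print(r['title'])
    # keep paging until a page comes back empty or the cap is reached
    while engine.getPageNumber() < 3 and engine.nextPage():
        for r in engine.getResults():
            print(r['title'])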
model.load_state_dict(torch.load('model/best_model_state.bin', map_location='cpu'))
model = model.to(device)
# review_text = input('Enter the review you want to check:\n')

## Google News start (assumes googlenews = GoogleNews() earlier in the script)
news_content = []
searchInput = input('Enter the search keyword:\n')
googlenews.search(searchInput)  # fetches page 1
for page in range(2, 1 + 1):    # raise the upper bound to append further pages
    googlenews.getpage(page)
for item in googlenews.result():
    news_content.append(item['desc'])
googlenews.clear()
## End

for desc in news_content:
    encoded_review = tokenizer.encode_plus(
        desc,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)
def __extrai_noticias_gnews(q, dia_inicio, dia_fim, num_limite_paginas=1,
                            lang='pt-BR', sleep=1, tentativas=5):
    """
    Returns a data frame with the news obtained from the Google News tab.

    Parameters
    ----------
    q : str
        Search string
    dia_inicio, dia_fim : datetime.date
        Start and end dates for the search
    num_limite_paginas : int
        Maximum number of pages to fetch
    lang : str
        Language code for the search (default pt-BR)
    sleep : int
        Number of seconds to wait between pages and after each page-fetch error
    tentativas : int
        Number of attempts at fetching a page before the extraction is
        considered finished

    Returns
    -------
    resultados : DataFrame
        Dataframe with the search results
    """
    # Search string properly escaped for a URL
    # q = urllib.parse.quote(q)

    # Date strings in the format expected by the GoogleNews lib
    formato_data = '%m/%d/%Y'
    dia_inicio_formatado = dia_inicio.strftime(formato_data)
    dia_fim_formatado = dia_fim.strftime(formato_data)

    # Instantiate the Google News search interface with the right language and period
    gn = GoogleNews(lang=lang, start=dia_inicio_formatado, end=dia_fim_formatado)

    # List that accumulates the search results
    resultados = []

    # Fetch the first page
    logger = logging.getLogger('covidata')
    logger.info('Fetching page 1')
    gn.search(q)
    resultados = resultados + gn.result()
    gn.clear()

    # From page 2 onwards
    for i in range(2, num_limite_paginas + 1):
        logger.info(f'Fetching page {i}')
        # Fetch the page
        gn.getpage(i)
        # Append the page's results to the list
        resultados = resultados + gn.result()
        # If the page query returned no results
        if gn.result() == []:
            logger.info(f'The query for page {i} returned no results')
            # Decrement the attempt counter
            tentativas = tentativas - 1
            logger.info(f'*** {tentativas} attempts remaining ***')
            # Once the counter reaches zero, stop
            if tentativas < 1:
                break
        else:
            # Pause the script for `sleep` seconds before the next page
            logger.info(f'Pausing for {sleep} seconds')
            time.sleep(sleep)
        # Clear the cached results
        gn.clear()

    # Build and return the dataframe
    return pd.DataFrame(resultados)
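# Usage sketch (not from the original source): fetch up to three pages of
# Portuguese-language results for an illustrative query and date window,
# calling the function from within its own module.
import datetime

df = __extrai_noticias_gnews('auxílio emergencial',
                             datetime.date(2020, 5, 1),
                             datetime.date(2020, 5, 7),
                             num_limite_paginas=3)
print(df.shape)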
run = True  # loop flag; assumes gnews = GoogleNews() was created earlier
main_topic = input('Choose a topic: ')
while run:
    try:
        gnews.search(main_topic)
        for n, result in enumerate(gnews.result()):
            print(n, result['title'])
        # choose which article to pick
        article = input('\nChoose an article by its index or choose [all] to list every title and link: ')
        if article == 'all':
            [print(f'{n}: {new}') for n, new in enumerate(gnews.gettext())]
            print('--')
            [print(f'{n}: {link}') for n, link in enumerate(gnews.get__links())]
        else:
            article = int(article)
            list_artcl = gnews.gettext()
            print(f'Article - Title: {list_artcl[article]}')
            list_links = gnews.get__links()
            print(f'Article - Link: {list_links[article]}')
        print('==========================================================================\n')
        go_further = input('Do you want to read other articles? (y/n) ').lower()
        if go_further in ('y', 'yes'):
            gnews.clear()  # clear the article list before looping again
        else:
            run = False  # break the loop
    except Exception:
        print('Error :S')
        run = False  # break the loop
def run(self):
    calList = self.genCalList(self.start, self.end)
    posL, negL, neuL, comL = [], [], [], []
    pageCount = 10
    t = time.time()
    for date in calList:
        print(f'\n\n{date}')
        PREV_RES = []
        RESULT = {}
        continu = True
        for page in range(1, pageCount + 1):
            t1 = time.time()
            googlenews = GoogleNews(start=date, end=date, lang='en')
            googlenews.search(self.word)
            googlenews.getpage(page)
            results = googlenews.result()
            googlenews.clear()
            if results == []:
                continu = False
                break
            # de-duplicate, then drop anything already seen on the previous page
            # (search() always re-fetches page 1, so duplicates are expected)
            results = [dict(t) for t in {tuple(d.items()) for d in results}]
            results = [i for i in results if i not in PREV_RES]
            if len(results) < 1:
                break
            for res in results:
                RESULT[res['title']] = res['link']
            PREV_RES = results
            print(f"Page: {page}. Name: {self.word}. t={round(time.time() - t1, 2)}s")
        if not os.path.exists(f'./news/data/{self.word}'):
            os.mkdir(f'./news/data/{self.word}')
        old_data = {}
        if os.path.isfile(f'./news/data/{self.word}/{self.word}.json'):
            with open(f'./news/data/{self.word}/{self.word}.json', 'r') as JSON:
                old_data = json.load(JSON)
        if continu == False:
            print('No results.')
            with open(f'./news/data/{self.word}/{self.word}.json', 'w') as JSON:
                old_data[date] = {}
                json.dump(old_data, JSON)
            continue
        titles = [i for i in RESULT.keys()]
        links = [i for i in RESULT.values()]
        body = self.genBody(links)
        allWords = [self.word] + self.synonyms
        print('\nFetching <p> text...')
        oldText = self.genAllText(body, allWords)
        allText = [i for i in oldText if i != '']
        print(f"Texts acquired of total: {(len(allText) / len(oldText)) * 100}%")
        if allText == []:
            with open(f'./news/data/{self.word}/{self.word}.json', 'w') as JSON:
                old_data[date] = {}
                json.dump(old_data, JSON)
            continue
        print('Sentiment analysis...')
        pos, neg, neu, com, count = self.measureSentiment(allText)
        pos_, neg_, neu_, com_ = self.avSentiment(pos, neg, neu, com, count)
        posL += [pos_]
        negL += [neg_]
        neuL += [neu_]
        comL += [com_]
        formatted = {'synonyms': self.synonyms, 'pos': pos_, 'neg': neg_,
                     'neu': neu_, 'com': com_, 'raw text': allText,
                     'sample size': len(RESULT), 'page count': pageCount}
        old_data[date] = formatted
        with open(f'./news/data/{self.word}/{self.word}.json', 'w') as JSON:
            json.dump(old_data, JSON)
    print(time.time() - t)
def parse(self):
    # loop through each configured news source
    for i in range(len(self.news_sources)):
        if self.news_sources[i] == 'NewsYCombinator':
            # fetch the front page and collect all story links
            self.markup.append(requests.get('https://news.ycombinator.com/').text)
            soup = BeautifulSoup(self.markup[-1], 'html.parser')
            links = soup.findAll("a", {"class": "storylink"})
            self.saved_links = []
            # keep only front-page stories that mention a keyword
            for link in links:
                for keyword in self.keywords:
                    if keyword in link.text:
                        self.saved_links.append(link)
            # save the link targets, then declare that articles have been found
            for a in range(len(self.saved_links)):
                self.read_links.append(str(self.saved_links[a]['href']))
            if len(self.saved_links) > 0:
                self.hasArticles = True
        elif self.news_sources[i] == 'NewYorkTimes':
            # To get your API key, create an account and an app on the NYT
            # developers site, select the Search API, and copy the key from there.
            api = articleAPI('API KEY')
            # search every keyword for articles published since yesterday;
            # begin_date is an integer of the form yyyymmdd
            yesterday = datetime.datetime.now().date() - datetime.timedelta(days=1)
            begin_date = yesterday.year * 10000 + yesterday.month * 100 + yesterday.day
            for a in range(len(self.keywords)):
                articles = api.search(q=self.keywords[a], begin_date=begin_date, page=1)
                self.list_of_articles = []
                for docs in articles['response']['docs']:
                    article_blurbs = (docs.get('headline').get('main') + '\n'
                                      + docs.get('web_url') + '\n' + docs.get('snippet'))
                    self.list_of_articles.append(str(article_blurbs))
                # if an article was found, set the flag
                if len(self.list_of_articles) > 0:
                    self.hasArticles = True
        elif self.news_sources[i] == 'GoogleNews':
            # search yesterday through today; timedelta avoids the broken
            # day/month arithmetic at month boundaries
            today = datetime.datetime.now().date()
            yesterday = today - datetime.timedelta(days=1)
            googlenews = GoogleNews(lang='en',
                                    start=yesterday.strftime('%m/%d/%Y'),
                                    end=today.strftime('%m/%d/%Y'))
            self.googleArticles = [[] for z in range(len(self.keywords))]
            for a in range(len(self.keywords)):
                googlenews.search(self.keywords[a])
                gnews = googlenews.result()
                for docs2 in gnews:
                    self.googleArticles[a].append(
                        str(docs2.get('title')) + '\n' + str(docs2.get('date')) + '\n'
                        + str(docs2.get('link')) + '\n' + str(docs2.get('desc')))
                googlenews.clear()
            # the outer list is never empty, so check the per-keyword lists
            if any(len(lst) > 0 for lst in self.googleArticles):
                self.hasArticles = True