def get_company_news_link(company='NaN', news_num=5, time_range='today'):
    """Return up to ``news_num`` Google News article links for ``company``.

    ``time_range`` is either the default ``'today'`` (no explicit range) or a
    packed string ``'YYYY/MM/DD YYYY/MM/DD'`` giving start and end dates.
    Returns a list of links, or an error message string when the company name
    is missing or no articles were found.
    """
    if company == 'NaN':
        return 'please input company name'

    engine = GoogleNews()
    engine.clear()

    if time_range != 'today':
        # Repack the caller's YYYY/MM/DD dates into the MM/DD/YYYY form
        # that GoogleNews.set_time_range expects.
        range_start = "{1}/{2}/{0}".format(time_range[0:4], time_range[5:7], time_range[8:10])
        range_end = "{1}/{2}/{0}".format(time_range[11:15], time_range[16:18], time_range[19:21])
        engine.set_time_range(range_start, range_end)

    engine.search(company)
    hits = engine.result()

    links = []
    try:
        for idx in range(news_num):
            links.append(hits[idx]['link'])
    except IndexError:
        # Fewer results than requested: report failure only when we got none.
        if not links:
            return '此時段無' + company + '新聞 OR 網路不穩'
    return links
def get_news(assunto):
    """Fetch recent Portuguese-language Google News entries about *assunto*.

    Queries the last day with a fixed extra date window and returns the
    headline texts at positions 3..7, or a fallback message when the feed
    came back empty.
    """
    fonte = GoogleNews(period='d')
    fonte.setlang('pt')
    fonte.set_encode('utf-8')
    fonte.set_time_range('12/02/2021', '13/02/2021')
    fonte.get_news(assunto)

    textos = fonte.get_texts()
    if not textos:
        return "Sem notícias recentes"
    return textos[3:8]
def crawling_news(company_name_list, start_date, end_date):
    """Crawl Google News titles for every company in ``company_name_list``.

    Args:
        company_name_list: iterable of company name strings to query.
        start_date: range start, in the 'MM/DD/YYYY' form GoogleNews expects.
        end_date: range end, same format.

    Side effects: writes all collected titles to 'sp500news.csv'.
    Returns: a single-column pandas DataFrame of the titles.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)

    googlenews = GoogleNews()
    googlenews.set_lang('en')
    # BUG FIX: the original passed the literal strings 'start_date'/'end_date'
    # instead of the parameters, so the requested range was never applied.
    googlenews.set_time_range(start_date, end_date)
    googlenews.set_encode('utf-8')  # news.google.com search sample

    all_title = []
    # Use the handler-configured module logger (the original logged through
    # the unconfigured root logger here, hiding the progress messages).
    logger.info('loop start')
    for i, company in enumerate(company_name_list):
        googlenews.get_news(company)
        logger.info('%s : %0.2f%s', company,
                    ((i + 1) / len(company_name_list)) * 100, '%')
        # Fetch the result list once instead of re-querying per index.
        for item in googlenews.results():
            all_title.append(item.get('title'))

    all_title = pd.DataFrame(all_title)
    all_title.to_csv('sp500news.csv')
    logger.info('saved to csv, done!!')
    return all_title
def googlenews_function(keyword='台積電', language='cn', start_date='2020/12/01', end_date='2020/12/28'):
    """Run a Google News keyword search over a date window.

    Args:
        keyword: search phrase.
        language: result language code passed to GoogleNews.
        start_date / end_date: window bounds given as 'YYYY/MM/DD'.

    Prints the number of hits and returns them as a pandas DataFrame.
    """
    engine = GoogleNews()
    engine.clear()
    engine.set_encode('utf-8')
    engine.set_lang(language)

    # Callers supply YYYY/MM/DD; the library wants MM/DD/YYYY.
    s_year, s_month, s_day = start_date.split('/')
    e_year, e_month, e_day = end_date.split('/')
    engine.set_time_range(
        start='{}/{}/{}'.format(s_month, s_day, s_year),
        end='{}/{}/{}'.format(e_month, e_day, e_year),
    )

    engine.search(keyword)
    data = engine.result()
    print("資料總筆數:", len(data))
    news = pd.DataFrame(data)
    # news.to_csv("GoogleNews_" + keyword +"_日期" + start_date.replace('/', '-') + '到' +end_date.replace('/', '-')+ ".csv", index= False)
    return news
from datetime import date
from GoogleNews import GoogleNews

# Interactive script: ask for a topic and print world-news headlines + links.
news = GoogleNews()
news.set_lang('en')

date_today = date.today()
# BUG FIX: set_time_range expects 'MM/DD/YYYY' strings; the original passed
# the raw datetime.date object as the end of the range.
news.set_time_range('01/11/2020', date_today.strftime('%m/%d/%Y'))
news.set_encode('utf-8')

topic = input("Topic : ")
news.search(topic)
news.get_page(2)

# headlines with links WORLD NEWS
results = news.results()  # fetch once instead of re-querying every iteration
for entry in results[:6]:  # slice guards against fewer than 6 results
    print(entry["title"])
    print(entry["link"])
def main():
    """Crawl Google News for company/country vaccine phrases, score the
    headline sentiment with VADER, and write per-company result and summary
    CSVs plus a cross-company meta file under ./Output/.
    """
    all_df = []  # (country_comp_score, company) pairs, accumulated across companies
    sid_obj = SentimentIntensityAnalyzer()
    googlenews = GoogleNews()
    googlenews.set_lang('en')
    googlenews.set_encode('utf-16')
    """
    Primary Phrases refer to the keywords we are interested in studying
    Secondary Phrases refer to the target countries
    """
    company_name = ['Pfizer', 'AstraZeneca', 'Sputnik', 'Sinovac']
    # testing_countries = ['Egypt', 'Kenya', 'Nigeria']
    testing_countries = []  # empty list => no filter, keep every country in sample.csv
    """
    Months refer to the date range
    """
    # months = ['08/01/2020', '09/01/2020', '10/01/2020']
    # months = ['01/01/2020', '02/01/2020', '03/01/2020', '04/01/2020', '05/01/2020', '06/01/2020', '07/01/2020', '08/01/2020', '09/01/2020', '10/01/2020', '11/01/2020', '12/01/2020', '01/01/2021', '02/01/2021']
    months = ['09/01/2020', '10/01/2020', '11/01/2020', '12/01/2020', '01/01/2021', '02/01/2021']
    for first in company_name:
        fin = []   # all de-duplicated article dicts for this company
        seen = []  # titles already collected (dedup across months/countries)
        with open('sample.csv', mode='r') as csv_file:
            csv_reader = csv.DictReader(csv_file)
            summary_data = []
            for row in csv_reader:
                # print(row)
                # NOTE(review): '\ufeffCountry' — the CSV header carries a UTF-8 BOM.
                second = row['\ufeffCountry']
                if (second not in testing_countries and len(testing_countries)!=0):
                    continue
                full_phrase = first+" "+second
                print(full_phrase)
                counter = 0    # number of fresh articles for this country
                sum_sent = 0   # running sum of Magnitude scores
                pos_count = 0
                # neu_count = 0
                neg_count = 0
                neg_article = {'title': 'N/A', '% Negative': 0}  # most negative headline so far
                # Query each consecutive month window.
                for i in range(0, len(months)-1):
                    googlenews.set_time_range(months[i],months[i+1])
                    googlenews.get_news(full_phrase)
                    res = googlenews.results()
                    # It would be very easy to get more than the first page. Simply use:
                    # googlenews.get_page(2) or result = googlenews.page_at(2), in
                    # conjunction with googlenews.total_count()
                    # (to see how many results show up on that page, if there are zero,
                    # then probably that'the last page, but I'm not sure if that's
                    # exactly how it works)
                    for result in res:
                        if result['title'] not in seen:
                            # print(result)
                            # Annotate the raw result dict with query metadata.
                            result['start date'] = months[i]
                            result['end date'] = months[i+1]
                            result['company'] = first
                            result['country'] = second
                            result['latitude'] = row['Latitude']
                            result['longitude'] = row['Longitude']
                            sentiment_dict = sid_obj.polarity_scores(result['title'])
                            result['% Negative'] = sentiment_dict['neg']*100
                            result['% Neutral'] = sentiment_dict['neu']*100
                            result['% Positive'] = sentiment_dict['pos']*100
                            # VADER compound is in [-1, 1]; rescale to a 0-100 magnitude.
                            result['Magnitude'] = sentiment_dict['compound']*50 + 50
                            counter += 1
                            sum_sent += result['Magnitude']
                            # result.pop('date')
                            # result.pop('datetime')
                            # result.pop('img')
                            # result.pop('media')
                            # if result['% Negative'] > result['% Neutral'] and result['% Negative']>result['% Positive']: neg_count += 1
                            # elif result['% Neutral'] > result['% Positive']: neu_count += 1
                            # else: pos_count += 1
                            if result['% Positive'] > result['% Negative']:
                                pos_count += 1
                            else:
                                neg_count += 1
                            # Track the single most-negative headline.
                            if result['% Negative'] >= neg_article['% Negative']:
                                neg_article = result
                            fin.append(result)
                            seen.append(result['title'])
                # Defaults when no articles were found at all.
                posPercent = 50
                if pos_count+neg_count>0:
                    posPercent = pos_count/(pos_count + neg_count)
                magni = 0
                if counter>0:
                    magni = sum_sent/counter
                country_comp_score = {'country': second, 'latitude': row['Latitude'], 'longitude': row['Longitude'], 'magnitude': magni, 'positive': pos_count, 'negative': neg_count, 'pos/(pos+neg)': posPercent, 'Most negative title': neg_article['title']}
                summary_data.append(country_comp_score)
                all_df.append((country_comp_score, first))
        df = pd.DataFrame(fin)
        # NOTE(review): drop() is not in-place and its result is discarded, so
        # the exported CSV still contains these columns — confirm intent.
        df.drop(columns=['date', 'datetime', 'img', 'media'])
        df.to_csv("./Output/{}_output.csv".format(first),index=False)
        summary_df = pd.DataFrame(summary_data)
        summary_df.to_csv("./Output/{}_summary_output.csv".format(first),index=False)
    # all_df.append(summary_df)
    # meta_data = []
    #
    # with open('sample.csv', mode='r') as csv_file:
    # dic_len = sum(1 for line in open('sample.csv'))
    # with open('sample.csv', mode='r') as csv_file:
    #     csv_reader = csv.DictReader(csv_file)
    #     for j in range(0, dic_len):
    #         most_pos = 0
    #         for i in range(0, len(company_name)):
    #             if all_df[most_pos][j]['positive']<all_df[i][j]['positive']:
    #                 most_pos = i
    #         meta_data.append({all_df[0][j]['\ufeffCountry']: company_name[most_positive]})
    # For each country, record the company with the highest positive count.
    fields = ['Country', 'Company', 'Count']
    meta_data = []
    seen = []
    for result in all_df:
        if result[0]['country'] not in seen:
            seen.append(result[0]['country'])
            meta_data.append([result[0]['country'], result[1], result[0]['positive']])
        else:
            for candidate in meta_data:
                if candidate[0]==result[0]['country'] and candidate[2]<result[0]['positive']:
                    candidate[1] = result[1]
                    candidate[2] = result[0]['positive']
    with open('./Output/meta_data.csv', 'w') as f:
        write = csv.writer(f)
        write.writerow(fields)
        write.writerows(meta_data)
def wait(secs: int): for _ in tqdm(range(secs), desc='Waiting'): sleep(1) last_crawled_day = read_backup(BACKUP_FILE) next_day = last_crawled_day - timedelta( days=1) if last_crawled_day else date.today() try: while True: client = GoogleNews(lang='en', encode='utf-8') date_str = next_day.strftime(DATE_FORMAT) client.set_time_range(date_str, date_str) client.search('bitcoin btc') for i in tqdm(range(1, PAGES + 1), desc=f"{date_str}'s pages"): client.getpage(i) results = client.result() wait(randint(1, 30)) print(f'{len(results)} results from {PAGES} pages.') print('Saving results') parsed_results = [] for result in results: try: article = Article(result['link'], config=config) article.download() article.parse() except newspaper.article.ArticleException: continue
Months refer to the date range """ months = ['08/01/2020', '09/01/2020', '10/01/2020', '11/01/2020', '12/01/2020', '01/01/2021'] fin = [] seen = [] for first in primary_phrases: for second in secondary_phrases: full_phrase = first+" "+second print(full_phrase) for i in range(0, len(months)-1): googlenews.set_time_range(months[i],months[i+1]) googlenews.get_news(full_phrase) res = googlenews.results(sort=True) #It would be very easy to get more than the first page. Simply use: googlenews.get_page(2) or result = googlenews.page_at(2), in conjunction with googlenews.total_count() #(to see how many results show up on that page, if there are zero, then probably that'the last page, but I'm not sure if that's exactly how it works) for result in res: if result['title'] not in seen: result['start date'] = months[i] result['end date'] = months[i+1] result['primary phrase'] = first result['secondary phrase'] = second result['full phrase'] = full_phrase fin.append(result) seen.append(result['title'])
) welcome_response = [ "hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me" ] data = open('HR.txt', 'r', errors='ignore') raw = data.read() raw = raw.lower() raw[:1000] sent_tokens = nltk.sent_tokenize(raw) 'Connect Google News to project' googlenews = GoogleNews() googlenews = GoogleNews(lang='en') googlenews = GoogleNews(period='7d') googlenews.set_time_range('10/14/2020', '12/14/2020') googlenews.set_encode('utf-8') def Normalize(text): remove_punct_dict = dict( (ord(punct), None) for punct in string.punctuation) # word tokenization word_token = nltk.word_tokenize(text.lower().translate(remove_punct_dict)) # remove ascii new_words = [] for word in word_token: new_word = unicodedata.normalize('NFKD', word).encode( 'ascii', 'ignore').decode('utf-8', 'ignore') new_words.append(new_word)