Code Example #1
from GoogleNews import GoogleNews


def get_company_news_link(company='NaN', news_num=5, time_range='today'):
    # 'NaN' is a sentinel default, not a real company name
    if company == 'NaN':
        return 'please input company name'

    news_link = []
    googlenews = GoogleNews()
    googlenews.clear()

    # time_range arrives as 'YYYY/MM/DD-YYYY/MM/DD'; GoogleNews expects MM/DD/YYYY
    if time_range != 'today':
        start_date = "{1}/{2}/{0}".format(time_range[0:4], time_range[5:7],
                                          time_range[8:10])
        end_date = "{1}/{2}/{0}".format(time_range[11:15], time_range[16:18],
                                        time_range[19:21])
        googlenews.set_time_range(start_date, end_date)

    googlenews.search(company)
    result = googlenews.result()

    try:
        for num in range(news_num):
            news_link.append(result[num]['link'])
    except IndexError:
        # fewer results than requested: return what was collected, or a message
        # meaning "no news for this company in this period, or unstable network"
        if len(news_link) == 0:
            return '此時段無' + company + '新聞 OR 網路不穩'
        return news_link
    else:
        return news_link
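
A minimal usage sketch, assuming the GoogleNews package is installed and the time_range string follows the 'YYYY/MM/DD-YYYY/MM/DD' layout the slicing above implies (live results will vary):

links = get_company_news_link('TSMC', news_num=3,
                              time_range='2020/12/01-2020/12/28')
print(links)  # a list of up to 3 URLs, or an error-message string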
Code Example #2
File: news.py Project: eriksonlb/TARS
from GoogleNews import GoogleNews


def get_news(assunto):
    news = GoogleNews(period='d')
    news.set_lang('pt')  # set_lang replaces the deprecated setlang
    news.set_encode('utf-8')
    # GoogleNews expects MM/DD/YYYY; the original DD/MM/YYYY values
    # ('12/02/2021', '13/02/2021') are converted accordingly
    news.set_time_range('02/12/2021', '02/13/2021')
    news.get_news(assunto)
    results = news.get_texts()
    # items 3-7, or a fallback message ("no recent news") when nothing was found
    result = results[3:8] if len(results) > 0 else "Sem notícias recentes"
    return result
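
Unlike Code Example #1, this one pulls headlines from the news.google.com feed via get_news() and get_texts() rather than the search() endpoint. A hypothetical call (assunto is Portuguese for "topic"):

print(get_news('bitcoin'))  # up to five headline strings, or the fallback message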
Code Example #3
import logging

import pandas as pd
from GoogleNews import GoogleNews


def crawling_news(company_name_list, start_date, end_date):
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)

    googlenews = GoogleNews()
    googlenews.set_lang('en')
    # pass the variables; the original passed the literal strings 'start_date'/'end_date'
    googlenews.set_time_range(start_date, end_date)
    googlenews.set_encode('utf-8')
    # news.google.com search sample
    all_title = []
    logger.info('loop start')
    for i, company_name in enumerate(company_name_list):
        # clear accumulated results between searches (as the GoogleNews README
        # recommends), otherwise earlier titles are appended again
        googlenews.clear()
        googlenews.get_news(company_name)
        logger.info('%s : %0.2f%%' %
                    (company_name, ((i + 1) / len(company_name_list)) * 100))
        for item in googlenews.results():
            all_title.append(item.get('title'))
    all_title = pd.DataFrame(all_title)
    all_title.to_csv('sp500news.csv')
    logger.info('saved to csv, done!!')
    return all_title
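
A sketch of how crawling_news might be invoked; the company list and MM/DD/YYYY dates are made-up examples:

titles = crawling_news(['Apple', 'Microsoft'], '01/01/2021', '01/31/2021')
print(titles.head())  # also written to sp500news.csv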
Code Example #4
import pandas as pd
from GoogleNews import GoogleNews


def googlenews_function(keyword='台積電',
                        language='cn',
                        start_date='2020/12/01',
                        end_date='2020/12/28'):
    '''
    Parameters:
    - date range (start_date / end_date, as YYYY/MM/DD)
    - keyword
    - language
    - number of pages to crawl
    '''
    googlenews = GoogleNews()
    googlenews.clear()
    googlenews.set_encode('utf-8')
    googlenews.set_lang(language)

    # convert YYYY/MM/DD into the MM/DD/YYYY format GoogleNews expects
    start_year, start_month, start_day = start_date.split('/')
    all_date_start = '{}/{}/{}'.format(start_month, start_day, start_year)

    end_year, end_month, end_day = end_date.split('/')
    all_date_end = '{}/{}/{}'.format(end_month, end_day, end_year)

    googlenews.set_time_range(start=all_date_start, end=all_date_end)

    googlenews.search(keyword)
    data = googlenews.result()
    print("Total records:", len(data))
    news = pd.DataFrame(data)
    # news.to_csv("GoogleNews_" + keyword + "_" + start_date.replace('/', '-') + "_to_" + end_date.replace('/', '-') + ".csv", index=False)
    return news
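
A usage sketch with hypothetical arguments (the default keyword 台積電 is TSMC); the DataFrame columns mirror the dicts returned by googlenews.result():

news_df = googlenews_function(keyword='Tesla', language='en',
                              start_date='2021/01/01', end_date='2021/01/31')
print(news_df[['title', 'link']].head())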
Code Example #5
from datetime import date

from GoogleNews import GoogleNews

news = GoogleNews()
news.set_lang('en')
news.set_encode('utf-8')
date_today = date.today()
# set_time_range expects MM/DD/YYYY strings, so format the date object
news.set_time_range('01/11/2020', date_today.strftime('%m/%d/%Y'))
topic = input("Topic : ")
news.search(topic)
news.get_page(2)
# print up to six headlines with their links (world news)
for item in news.results()[:6]:
    print(item["title"])
    print(item["link"])
Code Example #6
import csv

import pandas as pd
from GoogleNews import GoogleNews
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


def main():

	all_df = []

	# VADER sentiment scorer (the import above assumes the vaderSentiment package)
	sid_obj = SentimentIntensityAnalyzer()

	googlenews = GoogleNews()
	googlenews.set_lang('en')
	googlenews.set_encode('utf-16')  # note: every other example here uses 'utf-8'

	"""
	Primary Phrases refer to the keywords we are interested in studying
	Secondary Phrases refer to the target countries
	"""
	company_name = ['Pfizer', 'AstraZeneca', 'Sputnik', 'Sinovac']

	# testing_countries = ['Egypt', 'Kenya', 'Nigeria']
	testing_countries = []

	"""
	Months refer to the date range 
	"""
	# months = ['08/01/2020', '09/01/2020', '10/01/2020']
	# months = ['01/01/2020', '02/01/2020', '03/01/2020', '04/01/2020', '05/01/2020', '06/01/2020', '07/01/2020', '08/01/2020', '09/01/2020', '10/01/2020', '11/01/2020', '12/01/2020', '01/01/2021', '02/01/2021']
	months = ['09/01/2020', '10/01/2020', '11/01/2020', '12/01/2020', '01/01/2021', '02/01/2021']

	for first in company_name:

		fin = []
		seen = []
		
		with open('sample.csv', mode='r') as csv_file:
			csv_reader = csv.DictReader(csv_file)
			
			summary_data = []

			for row in csv_reader:
				# print(row)
				# '\ufeff' is the UTF-8 BOM left in the first CSV header field
				second = row['\ufeffCountry']
				# an empty testing_countries list means "run every country"
				if second not in testing_countries and len(testing_countries) != 0:
					continue

				full_phrase = first+" "+second

				print(full_phrase)

				counter = 0
				sum_sent = 0
				
				pos_count = 0
				# neu_count = 0
				neg_count = 0

				neg_article = {'title': 'N/A', '% Negative': 0}

				for i in range(0, len(months)-1):
					googlenews.clear()  # clear results between searches, per the GoogleNews README
					googlenews.set_time_range(months[i], months[i+1])
					googlenews.get_news(full_phrase)
					res = googlenews.results()

					# More pages are available via googlenews.get_page(2) or
					# result = googlenews.page_at(2), together with googlenews.total_count();
					# if a page returns zero results it is probably the last page,
					# though that is not guaranteed.

					for result in res:
						if result['title'] not in seen:
							# print(result)
							result['start date'] = months[i]
							result['end date'] = months[i+1]
							result['company'] = first
							result['country'] = second
							result['latitude'] = row['Latitude']
							result['longitude'] = row['Longitude']

							sentiment_dict = sid_obj.polarity_scores(result['title'])
							result['% Negative'] = sentiment_dict['neg']*100
							result['% Neutral'] = sentiment_dict['neu']*100
							result['% Positive'] = sentiment_dict['pos']*100
							result['Magnitude'] = sentiment_dict['compound']*50 + 50

							counter += 1
							sum_sent += result['Magnitude']
							
							# result.pop('date')
							# result.pop('datetime')
							# result.pop('img')
							# result.pop('media')

							# if result['% Negative'] > result['% Neutral'] and result['% Negative']>result['% Positive']: neg_count += 1
							# elif result['% Neutral'] > result['% Positive']: neu_count += 1
							# else: pos_count += 1
							if result['% Positive'] > result['% Negative']: pos_count += 1
							else: neg_count += 1

							if result['% Negative'] >= neg_article['% Negative']: neg_article = result

							fin.append(result)
							seen.append(result['title'])

				posPercent = 0.5  # neutral default, on the same 0-1 scale as the ratio below
				if pos_count+neg_count > 0: posPercent = pos_count/(pos_count + neg_count)

				magni = 0
				if counter>0: magni = sum_sent/counter

				country_comp_score = {'country': second, 'latitude': row['Latitude'], 
				'longitude': row['Longitude'], 'magnitude': magni, 'positive': pos_count, 
				'negative': neg_count, 'pos/(pos+neg)': posPercent, 'Most negative title': neg_article['title']}

				summary_data.append(country_comp_score)
				all_df.append((country_comp_score, first))

			df = pd.DataFrame(fin)
			# drop() returns a copy, so assign the result (the original discarded it)
			df = df.drop(columns=['date', 'datetime', 'img', 'media'], errors='ignore')
			df.to_csv("./Output/{}_output.csv".format(first), index=False)

			summary_df = pd.DataFrame(summary_data)
			summary_df.to_csv("./Output/{}_summary_output.csv".format(first),index=False)
			# all_df.append(summary_df)
	
	# meta_data = []
	# # with open('sample.csv', mode='r') as csv_file:
	# dic_len = sum(1 for line in open('sample.csv'))

	# with open('sample.csv', mode='r') as csv_file:
	# 	csv_reader = csv.DictReader(csv_file)
	# 	for j in range(0, dic_len):
	# 		most_pos = 0
	# 		for i in range(0, len(company_name)):
	# 			if all_df[most_pos][j]['positive']<all_df[i][j]['positive']: 
	# 				most_pos = i
	# 		meta_data.append({all_df[0][j]['\ufeffCountry']: company_name[most_pos]})

	fields = ['Country', 'Company', 'Count']  

	meta_data = []
	seen = []
	for result in all_df:
		if result[0]['country'] not in seen:
			seen.append(result[0]['country'])
			meta_data.append([result[0]['country'], result[1], result[0]['positive']])
		else:
			for candidate in meta_data:
				if candidate[0]==result[0]['country'] and candidate[2]<result[0]['positive']:
					candidate[1] = result[1]
					candidate[2] = result[0]['positive']

	with open('./Output/meta_data.csv', 'w', newline='') as f:
		# newline='' prevents csv.writer from emitting blank rows on Windows
		writer = csv.writer(f)
		writer.writerow(fields)
		writer.writerows(meta_data)
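
The in-code comment above sketches how to go past the first page of results. A minimal standalone sketch of that idea, using the page_at() and total_count() calls the comment names (the "empty page means last page" stop condition is the author's heuristic, not a documented guarantee):

googlenews = GoogleNews(lang='en')
googlenews.search('Pfizer Egypt')
page = 2
while True:
    page_results = googlenews.page_at(page)  # results for one specific page
    if len(page_results) == 0:               # heuristic: empty page, probably done
        break
    print('page', page, ':', len(page_results), 'of', googlenews.total_count())
    page += 1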
Code Example #7
from datetime import date, timedelta
from random import randint
from time import sleep

import newspaper
from GoogleNews import GoogleNews
from newspaper import Article
from tqdm import tqdm

# BACKUP_FILE, DATE_FORMAT, PAGES, config and read_backup() are defined
# elsewhere in the original project.


def wait(secs: int):
    for _ in tqdm(range(secs), desc='Waiting'):
        sleep(1)


last_crawled_day = read_backup(BACKUP_FILE)

next_day = last_crawled_day - timedelta(
    days=1) if last_crawled_day else date.today()

try:
    while True:
        client = GoogleNews(lang='en', encode='utf-8')
        date_str = next_day.strftime(DATE_FORMAT)
        client.set_time_range(date_str, date_str)
        client.search('bitcoin btc')
        for i in tqdm(range(1, PAGES + 1), desc=f"{date_str}'s pages"):
            client.get_page(i)  # get_page replaces the deprecated getpage
            wait(randint(1, 30))
        # result() returns everything accumulated across the fetched pages
        results = client.result()
        print(f'{len(results)} results from {PAGES} pages.')
        print('Saving results')
        parsed_results = []
        for result in results:
            try:
                article = Article(result['link'], config=config)
                article.download()
                article.parse()
            except newspaper.article.ArticleException:
                continue
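
The snippet is truncated after the parse step. For reference, a minimal newspaper3k round-trip looks like this (the URL is a placeholder, and the project's own config object is omitted):

from newspaper import Article

article = Article('https://example.com/some-news-story')
article.download()
article.parse()
print(article.title)
print(article.text[:200])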
Code Example #8
# primary_phrases, secondary_phrases and the googlenews client are set up
# earlier in the original script (compare Code Example #6)
"""
Months refer to the date range
"""
months = ['08/01/2020', '09/01/2020', '10/01/2020', '11/01/2020', '12/01/2020', '01/01/2021']

fin = []

seen = []

for first in primary_phrases:
	for second in secondary_phrases:
		full_phrase = first+" "+second

		print(full_phrase)

		for i in range(0, len(months)-1):
			googlenews.clear()  # clear results between searches, per the GoogleNews README
			googlenews.set_time_range(months[i], months[i+1])
			googlenews.get_news(full_phrase)
			res = googlenews.results(sort=True)

			# More pages are available via googlenews.get_page(2) or
			# result = googlenews.page_at(2), together with googlenews.total_count();
			# if a page returns zero results it is probably the last page,
			# though that is not guaranteed.

			for result in res:
				if result['title'] not in seen:
					result['start date'] = months[i]
					result['end date'] = months[i+1]
					result['primary phrase'] = first
					result['secondary phrase'] = second
					result['full phrase'] = full_phrase
					fin.append(result)
					seen.append(result['title'])
Code Example #9
import string
import unicodedata

import nltk
from GoogleNews import GoogleNews
welcome_response = [
    "hi", "hey", "*nods*", "hi there", "hello",
    "I am glad! You are talking to me"
]
# read the corpus; the with-block closes the file handle automatically
with open('HR.txt', 'r', errors='ignore') as data:
    raw = data.read()
raw = raw.lower()
# raw[:1000]  # notebook-style preview; as a bare statement it has no effect
sent_tokens = nltk.sent_tokenize(raw)

# Connect Google News to the project. The original built three throwaway
# clients; only the last assignment survived, so the options are combined here.
googlenews = GoogleNews(lang='en', period='7d')
googlenews.set_time_range('10/14/2020', '12/14/2020')  # overrides the 7d period
googlenews.set_encode('utf-8')


def Normalize(text):
    remove_punct_dict = dict(
        (ord(punct), None) for punct in string.punctuation)
    # word tokenization
    word_token = nltk.word_tokenize(text.lower().translate(remove_punct_dict))

    # strip accents / non-ASCII characters
    new_words = []
    for word in word_token:
        new_word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    # the original snippet is truncated here; returning the cleaned tokens is
    # an assumption about what follows
    return new_words
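
A quick check of what Normalize produces, assuming nltk's punkt tokenizer data has been downloaded:

nltk.download('punkt')  # one-time download for nltk.word_tokenize
print(Normalize("Café-culture, naïve résumés!"))
# roughly: ['cafeculture', 'naive', 'resumes']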