logger.debug('System platform : Darwin') driver_path += 'chromedriverMac' elif platform == 'linux': logger.debug('System platform : Linux') driver_path += 'chromedriverLinux' elif platform == 'win32': logger.debug('System platform : Window') driver_path += 'chromedriverWindow' else: logger.error( f'[{sys.platform}] not supported. Check your system platform.') raise Exception() # 크롬 드라이버 인스턴스 생성 chrome = generate_chrome(driver_path=driver_path, headless=True, download_path=DOWNLOAD_DIR) # 페이지 요청 url = 'https://github.com/login' chrome.get(url) time.sleep(3) # 깃허브 로그인 login_page = chrome.page_source elm = chrome.find_element_by_id('login_field') elm.send_keys('깃허브 아이디') elm = chrome.find_element_by_id('password') elm.send_keys('깃허브 비밀번호') elm.send_keys(Keys.RETURN)
# Select the bundled chromedriver binary for the current OS, then log in
# to the KISTI education site with selenium.
#
# NOTE(review): darwin/linux append the binary name with `+=` (no path
# separator) while win32 uses os.path.join — presumably driver_path ends
# with a separator on mac/linux; confirm against how driver_path is built.
if platform == 'darwin':
    print('System platform : Darwin')
    driver_path += 'chromedriver_mac'
elif platform == 'linux':
    print('System platform : Linux')
    driver_path += 'chromedriver_linux'
elif platform == 'win32':
    print('System platform : Window')
    driver_path = os.path.join(driver_path, 'chromedriver_win.exe')
else:
    print(f'[{sys.platform}] not supported. Check your system platform.')
    # BUG FIX: was a bare `raise Exception()` with no message — keep the
    # same exception type (callers catching Exception still work) but
    # include the platform for diagnosis.
    raise Exception(f'[{sys.platform}] not supported. Check your system platform.')

# Create the chrome driver instance.
chrome = chromedriver.generate_chrome(
    driver_path=driver_path,
    headless=headless,
    download_path=DOWNLOAD_DIR)

# Request the login page.
url = 'http://edu.kisti.re.kr/index.asp?beurl='
chrome.get(url)
chrome.implicitly_wait(30)

# Fill in the login form and submit with RETURN.
# SECURITY: credentials are hardcoded in source — move them to an
# environment variable or config file.
elm = chrome.find_element_by_id('login_id')
elm.send_keys('A202001789')
elm = chrome.find_element_by_id('login_pw')
elm.send_keys('505065')
time.sleep(2)
elm.send_keys(Keys.RETURN)
chrome.implicitly_wait(15)
def naver():
    """Scrape Naver's daily popular-news ranking, save article texts as a
    CSV, and render a noun-frequency word cloud PNG.

    Side effects: creates image/text output directories under
    ``Main.img_path`` / ``Main.text_path``, writes one CSV and one PNG,
    and drives a headless Chrome instance over the network.
    """
    import re
    import time

    cr_name = 'naver'

    # Ensure the image and text output directories exist (makedirs also
    # creates missing parents, replacing the old three-way mkdir ladder).
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    else:
        os.makedirs(save_path, exist_ok=True)

    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    else:
        os.makedirs(text_save_path, exist_ok=True)

    # Timestamps: `date` keys the ranking URL / PNG name, `date2` the CSV.
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))
    result = []
    res = []

    # Set up the browser and open the daily popular-news ranking page.
    chrome = chromedriver.generate_chrome(driver_path=Main.driver_path,
                                          headless=Main.headless,
                                          download_path=Main.DOWNLOAD_DIR)
    print("Naver 접속중")
    url = 'https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&date={}'.format(
        date)
    chrome.get(url)
    time.sleep(2)

    # Collect the anchor elements of ranking sections 4..9.
    for sun in range(4, 10):
        pr = chrome.find_elements_by_xpath(
            '//*[@id="wrap"]/table/tbody/tr/td[2]/div/div[{}]'.format(sun))
        for p in pr:
            result.append(p.find_elements_by_tag_name('a'))

    # Flatten to hrefs and deduplicate.
    for anchors in result:
        for e in anchors:
            res.append(e.get_attribute('href'))
    http = list(set(res))
    # (removed a dead `len(http)` statement that had no effect)

    # Drop links back to the ranking page itself; keep article links.
    https = [link for link in http if link.find('popularDay') < 0]

    # Fetch each article and extract title + cleaned body text.
    # BUG FIX: was DataFrame.append in a loop (removed in pandas 2.0);
    # collect rows and concat once instead.
    rows = []
    for i in range(len(https)):
        resp = requests.get(https[i])
        soup = BeautifulSoup(resp.content, 'html.parser')
        body = soup.select('._article_body_contents')
        title = soup.find('div', attrs={'class': 'article_info'}).h3.text
        # Hoist the cleanText call (was computed twice on the same text);
        # the slice skips everything up to and including a literal '{}'.
        cleaned = cleanText(body[0].text)
        contents = re.sub(
            ' ', '',
            re.sub(' ', '',
                   re.sub('\t', '', cleaned[cleaned.find('{}') + 2:])))
        rows.append(pd.DataFrame({'Title': title,
                                  'Contents': contents,
                                  'link': https[i]},
                                 index=[i]))
    files = pd.concat(rows) if rows else pd.DataFrame()
    text2 = files.Contents

    # Persist the scraped articles as UTF-8 CSV.
    files.to_csv(text_save_path + '/네이버종합뉴스_{}.csv'.format(date2),
                 index=False, encoding='utf-8')

    # Build the custom noun dictionary and tokenize every article body.
    from ckonlpy.tag import Twitter
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')
    import nltk
    tokens_ko = [t.nouns(text) for text in text2]

    # Flatten all tokens; order is irrelevant since only frequencies are
    # used below (the original's insert(-1, ...) scrambled order anyway).
    final = [word for toks in tokens_ko for word in toks]
    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)

    # Keep only nouns of length >= 2 (the original's inner range(0, 1, 1)
    # loop ran exactly once and was a no-op).
    data_1 = [item for item in data if len(item[0]) >= 2]

    # Render and save the word cloud.
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    tmp_data = dict(data_1)
    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off'), plt.xticks([]), plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    # BUG FIX: keyword was misspelled `bbox_inces`, so 'tight' was never
    # applied by matplotlib.
    plt.savefig(save_path + "/naver_{}.png".format(date),
                bbox_inches='tight', dpi=400, pad_inches=0)
def twitter():
    """Scrape tweets for the configured search keyword, save their text to
    a file, and render a noun-frequency word cloud PNG.

    Side effects: creates image/text output directories under
    ``Main.img_path`` / ``Main.text_path``, writes one text file and one
    PNG, and drives a headless Chrome instance over the network.
    """
    import time
    import nltk

    cr_name = 'twitter'

    # Ensure the image and text output directories exist (makedirs also
    # creates missing parents, replacing the old three-way mkdir ladder).
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    else:
        os.makedirs(save_path, exist_ok=True)

    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    else:
        os.makedirs(text_save_path, exist_ok=True)

    keyword = Main.text()

    # Set up the browser and open the Twitter search results.
    chrome = chromedriver.generate_chrome(
        driver_path=Main.driver_path,
        headless=Main.headless,
        download_path=Main.DOWNLOAD_DIR)
    print("Twitter 접속중")
    url = 'https://twitter.com/search?q={}&src=typed_query'.format(keyword)
    chrome.get(url)
    time.sleep(3)

    # BUG FIX: `result` was appended to below but never initialized in
    # this function, causing a NameError at runtime.
    result = []

    # Scroll the timeline and harvest the visible tweet text.
    # NOTE(review): the original (whitespace-mangled) source is ambiguous
    # about whether the harvest ran once per scroll batch or once at the
    # end; this follows the author's commented-out draft (per batch) —
    # confirm against the intended behavior.
    body = chrome.find_element_by_css_selector('body')
    text2 = chrome.find_elements_by_css_selector('#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div > div > div:nth-child(2) > div > div > section > div')
    for _ in range(10):
        for _ in range(3):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)
        for ttt in tqdm(text2):
            result.append(re.sub('\n', '', ttt.text))

    # Tokenize the tweets into nouns with the custom dictionary.
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')
    tokens_ko = [t.nouns(tweet) for tweet in result]

    # Flatten all tokens; order is irrelevant since only frequencies are
    # used below (the original's insert(-1, ...) scrambled order anyway).
    final = [word for toks in tokens_ko for word in toks]
    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)

    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))

    # Save the raw tweet texts (context manager guarantees the file is
    # closed even on error — was a bare open/close pair).
    with open(text_save_path + '/twitter{}.txt'.format(date2), 'w',
              encoding='utf-8') as file:
        for review in result:
            file.write(review + '\n')

    # Render and save the word cloud.
    tmp_data = dict(data)
    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off'), plt.xticks([]), plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    # BUG FIX: keyword was misspelled `bbox_inces`, so 'tight' was never
    # applied by matplotlib.
    plt.savefig(save_path + "/twitter_{}.png".format(date),
                bbox_inches='tight', dpi=400, pad_inches=0)