Esempio n. 1
0
    logger.debug('System platform : Darwin')
    driver_path += 'chromedriverMac'
elif platform == 'linux':
    logger.debug('System platform : Linux')
    driver_path += 'chromedriverLinux'
elif platform == 'win32':
    logger.debug('System platform : Window')
    driver_path += 'chromedriverWindow'
else:
    logger.error(
        f'[{sys.platform}] not supported. Check your system platform.')
    raise Exception()

# Create the Chrome driver instance.
chrome = generate_chrome(driver_path=driver_path,
                         headless=True,
                         download_path=DOWNLOAD_DIR)

# Request the GitHub login page.
url = 'https://github.com/login'
chrome.get(url)
time.sleep(3)

# GitHub login.
# NOTE(review): the captured page source is never used in this snippet --
# presumably kept for later inspection; confirm before removing.
login_page = chrome.page_source

# Fill the login form and submit with RETURN.
# The send_keys strings are Korean placeholders ("GitHub id" / "GitHub
# password") meant to be replaced with real credentials by the user.
# NOTE(review): find_element_by_id is the Selenium 3 API; it was removed
# in Selenium 4 (use find_element(By.ID, ...) there).
elm = chrome.find_element_by_id('login_field')
elm.send_keys('깃허브 아이디')
elm = chrome.find_element_by_id('password')
elm.send_keys('깃허브 비밀번호')
elm.send_keys(Keys.RETURN)
Esempio n. 2
0
# Select the matching ChromeDriver binary for the current OS.
# NOTE(review): the darwin/linux branches append with '+=' (no path
# separator) while win32 uses os.path.join -- this presumably relies on
# driver_path ending with a separator on mac/linux; verify against the
# definition of driver_path before unifying.
if platform == 'darwin':
    print('System platform : Darwin')
    driver_path += 'chromedriver_mac'
elif platform == 'linux':
    print('System platform : Linux')
    driver_path += 'chromedriver_linux'
elif platform == 'win32':
    print('System platform : Window')
    driver_path = os.path.join(driver_path, 'chromedriver_win.exe')
else:
    print(f'[{sys.platform}] not supported. Check your system platform.')
    # Fix: carry the diagnostic in the exception instead of raising a
    # bare Exception() with no message (the message was only printed).
    raise Exception(
        f'[{sys.platform}] not supported. Check your system platform.')

# Create the Chrome driver instance.
chrome = chromedriver.generate_chrome(
    driver_path=driver_path,
    headless=headless,
    download_path=DOWNLOAD_DIR)

# Request the page.
url = 'http://edu.kisti.re.kr/index.asp?beurl='
chrome.get(url)
chrome.implicitly_wait(30)
# Fill the login form and submit with RETURN.
# NOTE(review): account id and password are hard-coded in source -- they
# should be moved to configuration or environment variables.
elm = chrome.find_element_by_id('login_id')
elm.send_keys('A202001789')
elm = chrome.find_element_by_id('login_pw')
elm.send_keys('505065')
time.sleep(2)
elm.send_keys(Keys.RETURN)

chrome.implicitly_wait(15)
Esempio n. 3
0
def naver():
    """Crawl today's Naver ranked-news articles, save title/body/link to a
    CSV file, and render a noun-frequency word cloud PNG.

    Side effects: creates the image/text output directories if missing,
    launches a headless Chrome instance, performs network requests, and
    writes a CSV and a PNG under the Main-configured paths.
    """
    from selenium import webdriver
    import re
    from selenium.webdriver.common.keys import Keys
    import time
    cr_name = 'naver'
    # Ensure the image save directory exists (create the parent first if
    # neither level exists yet).
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)

    # Ensure the text save directory exists, same pattern as above.
    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)

    # Timestamps used for the ranking-page URL and output file names.
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))

    result = []
    res = []

    # Browser setup.
    chrome = chromedriver.generate_chrome(driver_path=Main.driver_path,
                                          headless=Main.headless,
                                          download_path=Main.DOWNLOAD_DIR)

    # Open the Naver "most viewed today" ranking page.
    print("Naver 접속중")

    url = 'https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&date={}'.format(
        date)
    chrome.get(url)
    time.sleep(2)

    # Collect every anchor href from ranking sections 4..9.
    # NOTE(review): `result` is re-scanned from the start on every pass of
    # the outer loop, so earlier sections' hrefs are appended repeatedly;
    # the set() below removes the duplicates, so output is unaffected.
    for sun in range(4, 10):
        pr = chrome.find_elements_by_xpath(
            '//*[@id="wrap"]/table/tbody/tr/td[2]/div/div[{}]'.format(sun))
        for p in pr:
            result.append(p.find_elements_by_tag_name('a'))

        for i, q in enumerate(result):
            for e in q:
                res.append(e.get_attribute('href'))
    http = list(set(res))
    https = []

    # Keep only article links (drop ranking-page navigation links).
    for idx in range(len(http)):
        if http[idx].find('popularDay') >= 0:
            continue
        else:
            https.append(http[idx])

    files = pd.DataFrame()

    # Download each article and extract its title, cleaned body text and
    # link into one row per article.
    for i in range(len(https)):
        res = requests.get(https[i])
        soup = BeautifulSoup(res.content, 'html.parser')
        body = soup.select('._article_body_contents')
        # Fix: DataFrame.append was removed in pandas 2.0; pd.concat is
        # the drop-in equivalent with identical resulting frame.
        files = pd.concat([
            files,
            pd.DataFrame(
                {
                    'Title':
                    soup.find('div', attrs={
                        'class': 'article_info'
                    }).h3.text,
                    'Contents':
                    re.sub(
                        '   ', '',
                        re.sub(
                            '    ', '',
                            re.sub(
                                '\t', '',
                                cleanText(body[0].text)
                                [(cleanText(body[0].text)).find('{}') + 2:]))),
                    'link':
                    https[i]
                },
                index=[i])
        ])

    text2 = files.Contents
    # Save the extracted article text as CSV.
    files.to_csv(text_save_path + '/네이버종합뉴스_{}.csv'.format(date2),
                 index=False,
                 encoding='utf-8')

    # -------------------------------------

    # Tokenizer with the project's custom noun dictionary.
    from ckonlpy.tag import Twitter
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')

    import nltk
    tokens_ko = []

    # Extract nouns from every article body.
    for i in range(len(text2)):
        tokens_ko.append(t.nouns(text2[i]))

    # Flatten the per-article noun lists.
    # NOTE(review): insert(-1, ...) keeps the first element last -- order
    # is irrelevant here because only frequency counts are used below.
    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])

    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)

    # Keep only nouns of length >= 2 (the original single-pass
    # `for q in range(0, 1, 1)` wrapper was a no-op and is removed).
    data_1 = []
    for i in range(len(data)):
        if len(data[i][0]) >= 2:
            data_1.append(data[i])

    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    import time
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))

    tmp_data = dict(data_1)

    # Render and save the word cloud image.
    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off'), plt.xticks([]), plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    # Fix: 'bbox_inces' was a typo; matplotlib expects 'bbox_inches'.
    plt.savefig(save_path + "/naver_{}.png".format(date),
                bbox_inches='tight',
                dpi=400,
                pad_inches=0)
Esempio n. 4
0
def twitter():
    """Crawl recent tweets for the Main-configured keyword, save the raw
    tweet text to a .txt file, and render a noun-frequency word cloud PNG.

    Side effects: creates the image/text output directories if missing,
    launches a headless Chrome instance, performs network requests, and
    writes a .txt and a PNG under the Main-configured paths.
    """
    cr_name = 'twitter'
    # Ensure the image save directory exists (create the parent first if
    # neither level exists yet).
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)

    # Ensure the text save directory exists, same pattern as above.
    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)

    import time
    import nltk
    keyword = Main.text()

    # Browser setup.
    chrome = chromedriver.generate_chrome(
        driver_path=Main.driver_path,
        headless=Main.headless,
        download_path=Main.DOWNLOAD_DIR)

    # Open the Twitter search page for the keyword.
    print("Twitter 접속중")

    url = 'https://twitter.com/search?q={}&src=typed_query'.format(keyword)
    chrome.get(url)
    time.sleep(3)

    body = chrome.find_element_by_css_selector('body')
    text2 = chrome.find_elements_by_css_selector('#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div > div > div:nth-child(2) > div > div > section > div')

    # Fix: `result` was never initialized in this function (unlike its
    # sibling naver()), so the first append below raised NameError.
    result = []

    # Scroll the timeline and collect tweet text, stripping newlines.
    # NOTE(review): `re`, `Keys`, `tqdm`, `Twitter`, `WordCloud` and `plt`
    # are not imported locally -- assumes module-level imports; confirm.
    for i in range(10):
        for q in range(3):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)
        for ttt in tqdm(text2):
            result.append(re.sub('\n', '', ttt.text))

    # Tokenizer with the project's custom noun dictionary.
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')

    tokens_ko = []

    # Extract nouns from every tweet.
    for i in range(len(result)):
        tokens_ko.append(t.nouns(result[i]))

    # Flatten the per-tweet noun lists (order is irrelevant downstream;
    # only frequency counts are used).
    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])

    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))

    # Save the raw tweets, one per line (with-statement guarantees the
    # file is closed even if a write fails).
    with open(text_save_path + '/twitter{}.txt'.format(date2), 'w',
              encoding='utf-8') as file:
        for review in result:
            file.write(review + '\n')

    tmp_data = dict(data)

    # Render and save the word cloud image.
    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white', max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off'), plt.xticks([]), plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    # Fix: 'bbox_inces' was a typo; matplotlib expects 'bbox_inches'.
    plt.savefig(save_path + "/twitter_{}.png".format(date),
                bbox_inches='tight', dpi=400, pad_inches=0)