import time

from lxml import html
from selenium import webdriver
from selenium.webdriver import Chrome
from inkedNewsCrawler.utils.web_drivers import get_chrome_options

driver = None  # shared module-level Chrome instance, created lazily


def main(accepted_langs, do_schedule_restart=True):
    global driver
    if do_schedule_restart:
        schedule_restart()

    # Reuse a single headless browser across crawler runs.
    if driver is None:
        chrome_options = get_chrome_options(headless=True)
        driver = Chrome(chrome_options=chrome_options)
    LiveNewsLinkCrawler(driver=driver).start()
    LiveNewsContentCrawler().start()


def get_naver_press_list():
    url = "https://news.naver.com/"

    driver = webdriver.Chrome(chrome_options=get_chrome_options())
    driver.get(url)

    # Click the button that expands the full press (news provider) list.
    btn = driver.find_element_by_xpath('//*[@id="index.press.btn"]')
    btn.click()
    time.sleep(1)

    html_source = driver.page_source
    driver.quit()

    # Parse the provider names out of the expanded press area.
    tree = html.fromstring(html_source)
    providers = tree.xpath('//*[@id="index.press.area"]//li/a/text()')
    return providers
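
Both snippets above rely on get_chrome_options from inkedNewsCrawler.utils.web_drivers, which is never shown. As a rough sketch of what such a helper usually looks like (the flag set and the headless default here are assumptions, not the project's actual code):

from selenium.webdriver.chrome.options import Options

def get_chrome_options(headless=True):
    # Hypothetical stand-in for inkedNewsCrawler.utils.web_drivers.get_chrome_options;
    # the real helper may set additional flags (window size, user agent, etc.).
    options = Options()
    if headless:
        options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    return options
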
Example 3
from multiprocessing.pool import ThreadPool

from dateutil.rrule import rrule, DAILY
from selenium import webdriver

from inkedNewsCrawler.utils.web_drivers import get_chrome_options

# Pool of pre-built browser instances shared by the worker threads.
available_drivers = []


def crawl_all_links(THREAD_COUNT, start_date, end_date):
    dates = rrule(DAILY, dtstart=start_date, until=end_date)

    chrome_options = get_chrome_options(headless=False)
    # Instantiate one browser per worker thread.
    for i in range(THREAD_COUNT):
        print("SETUP Driver %i" % (i + 1))
        driver = webdriver.Chrome(chrome_options=chrome_options)
        # driver = webdriver.PhantomJS()
        available_drivers.append(driver)

    pool = ThreadPool(THREAD_COUNT)
    pool.map(start_crawl, dates)
    # Close the pool and wait for the work to finish.
    pool.close()
    pool.join()

    # Clean up the drivers.
    for driver in available_drivers:
        driver.quit()
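
start_crawl is not shown in the example above. Below is a minimal sketch of a worker compatible with that thread pool, assuming each call checks a browser out of available_drivers, crawls one date with NaverDateNewsLinkCrawler (constructed and driven via crawl_all as in the later examples), and returns the browser when done. The names on_items_complete and _driver_lock are illustrative, not taken from the project:

import threading

from inkedNewsCrawler.custom_crawler.naver_news_crawler.naver_news_link_crawler_threaded import \
    NaverDateNewsLinkCrawler

_driver_lock = threading.Lock()


def on_items_complete(date, links):
    # Placeholder callback; the real project presumably stores the links.
    print(date, len(links))


def start_crawl(date):
    # Check a browser out of the shared pool, crawl one date, then put it back.
    with _driver_lock:
        driver = available_drivers.pop()
    try:
        NaverDateNewsLinkCrawler(date, driver, on_items_complete,
                                 skip_crawled_date=False).crawl_all()
    finally:
        with _driver_lock:
            available_drivers.append(driver)
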
Example 4
from datetime import datetime

from inkedNewsCrawler.custom_crawler.naver_news_crawler.naver_news_link_crawler_threaded import NaverDateNewsLinkCrawler
from selenium import webdriver


def callback(date, result):
    for r in result:
        print(r)


date = datetime(2018, 8, 3)

# driver = webdriver.PhantomJS()
# NaverDateNewsLinkCrawler(date, driver, callback, skip_crawled_date=False).parse()

from inkedNewsCrawler.utils.web_drivers import get_chrome_options
chrome_options = get_chrome_options(headless=False)
chrome_driver = webdriver.Chrome(chrome_options=chrome_options)
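# Crawl every article link for 2018-08-03 with a visible (non-headless) Chrome,
# printing each result via the callback.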
NaverDateNewsLinkCrawler(date,
                         chrome_driver,
                         callback,
                         skip_crawled_date=False).crawl_all()
Example 5
from datetime import datetime

from selenium.webdriver import Chrome

from inkedNewsCrawler.custom_crawler.naver_news_crawler.naver_news_link_crawler_threaded import \
    NaverDateNewsLinkCrawler

from inkedNewsCrawler.utils.web_drivers import get_chrome_options
chrome_options = get_chrome_options()
driver = Chrome(chrome_options=chrome_options)



def callback(a):
    ...


maxPerPage = 50

def main():
    crawler = NaverDateNewsLinkCrawler(date=datetime(2018, 1, 1), driver=driver,
                                       on_items_complete=callback, skip_crawled_date=False)
    crawler.load_page()

    # Walk through 20 result pages, printing the links gathered from each one.
    for i in range(20):
        crawler.parse_article_in_page()
        crawler.move_to_next_page()
        max_index = maxPerPage * (i + 1)
        print(crawler.link_data_list[max_index - maxPerPage:max_index])
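
The example defines main() but never calls it, and the module-level driver is never closed. A minimal entry point that does both might look like the following; the __main__ guard is an addition for illustration, not part of the original snippet:

if __name__ == "__main__":
    try:
        main()
    finally:
        driver.quit()  # release the browser even if the crawl fails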