Example no. 1
from time import time

cache_dict = {}
cache_currentTime = 0.0


def get_html(d_url, p_url, c_t):
    # d_url: driver URL, p_url: page URL, c_t: cache lifetime in seconds

    global cache_currentTime

    t = time()

    # serve the cached copy while it is younger than c_t seconds
    if (p_url in cache_dict) and (t - cache_currentTime < c_t):
        print(f"Returned {p_url} from cache. Cached time: {t - cache_currentTime}")
        return cache_dict[p_url]

    browser = get_driver(d_url)

    if connect_to_site(browser, p_url):

        print(f"Get page {p_url}")
        html = browser.page_source

        cache_dict[p_url] = html
        cache_currentTime = t

        browser.quit()

        return html
    else:
        print(f'Error connecting to {p_url}')
        browser.quit()
        return None
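
Note that Example no. 1 keeps one global timestamp for the whole cache, so fetching any page resets the apparent age of every cached entry. A minimal per-URL variant; get_driver and connect_to_site are the helpers assumed by the example above, everything else here is illustrative:

from time import time

cache = {}  # p_url -> (html, fetched_at)

def get_html_per_url(d_url, p_url, c_t):
    entry = cache.get(p_url)
    if entry and time() - entry[1] < c_t:
        return entry[0]  # this URL's copy is still fresh
    browser = get_driver(d_url)
    try:
        if not connect_to_site(browser, p_url):
            print(f'Error connecting to {p_url}')
            return None
        html = browser.page_source
        cache[p_url] = (html, time())
        return html
    finally:
        browser.quit()  # always release the driver, even on early return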
Example no. 2
def run_process(page_number, filename):
    browser = get_driver()
    if connect_to_base(browser, page_number):
        sleep(2)  # crude wait for the page to render before grabbing the source
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
        browser.quit()
    else:
        print('Error connecting to hackernews')
        browser.quit()
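
Both branches of Example no. 2 must remember to call browser.quit(); a small context manager makes that cleanup unconditional. A sketch under the same assumed helpers (managed_driver is a hypothetical name):

from contextlib import contextmanager
from time import sleep

@contextmanager
def managed_driver():
    browser = get_driver()  # helper assumed by the examples above
    try:
        yield browser
    finally:
        browser.quit()  # runs on success, on error, and on early return

def run_process(page_number, filename):
    with managed_driver() as browser:
        if connect_to_base(browser, page_number):
            sleep(2)
            write_to_file(parse_html(browser.page_source), filename)
        else:
            print('Error connecting to hackernews')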
Example no. 3
def run_process(page_number, filename, headless):

    # init browser
    browser = get_driver(headless)

    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)

        # exit
        browser.quit()
    else:
        print("Error connecting to hackernews")
        browser.quit()

Example no. 4
def run_process(page_number, filename, browser):
    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)

        ########
        # here #
        ########
        write_to_file(output_list, filename)
    else:
        print('Error connecting to hackernews')


if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'output_{output_timestamp}.csv'
    browser = get_driver()
    # scrape and crawl
    while current_page <= 20:
        print(f'Scraping page #{current_page}...')
        run_process(current_page, output_filename, browser)

        ########
        # here #
        ########
        current_page = current_page + 1
    # exit
    browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
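
The __main__ block above walks the 20 pages sequentially through one shared browser. Since the pages are independent, the same run_process can be fanned out over several drivers with the standard library's concurrent.futures; a hedged sketch in which scrape_page and the pool size are illustrative, with one driver per task because a Selenium session must not be shared across threads:

from concurrent.futures import ThreadPoolExecutor, wait

def scrape_page(page_number, filename):
    browser = get_driver()  # a private driver for this task
    try:
        run_process(page_number, filename, browser)
    finally:
        browser.quit()

with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [
        executor.submit(scrape_page, page, output_filename)
        for page in range(1, 21)
    ]
    wait(futures)  # note: write_to_file must tolerate concurrent appends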
Example no. 5
import sys
from time import sleep

from scrapers.scraper import get_driver, connect_to_base, parse_html


def run_process(browser):
    if connect_to_base(browser):
        print('Scraping random Wikipedia page...')
        sleep(2)
        html = browser.page_source
        return parse_html(html)
    else:
        print("Error connecting to Wikipedia")
        return False


if __name__ == '__main__':
    browser = get_driver(sys.argv[1])
    data = run_process(browser)
    print(data)
    browser.quit()
    print('Finished!')
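
Example no. 5 passes sys.argv[1] straight to get_driver, so running the script with no arguments raises an IndexError. A guarded variant, assuming the argument carries the same headless flag that Example no. 6 parses:

import sys

# default to a visible browser when no flag is given
headless = len(sys.argv) > 1 and sys.argv[1] == 'headless'
browser = get_driver(headless)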
Example no. 6
    def test_browser(self):
        browser = get_driver(self.driver_url)
        self.assertIsNotNone(browser)
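
The test method above is an excerpt; a self-contained version needs a unittest.TestCase class. A minimal sketch in which TestDriver and driver_url are hypothetical stand-ins:

import unittest

class TestDriver(unittest.TestCase):
    driver_url = 'http://localhost:4444/wd/hub'  # hypothetical Selenium Grid endpoint

    def test_browser(self):
        browser = get_driver(self.driver_url)  # helper assumed by the examples above
        self.assertIsNotNone(browser)
        browser.quit()  # release the session the test opened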
if __name__ == "__main__":

    # headless mode?
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True

    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"

    # init browser
    browser = get_driver(headless=headless)

    # scrape and crawl
    while current_page <= 20:
        print(f"Scraping page #{current_page}...")
        run_process(current_page, output_filename, browser)
        current_page = current_page + 1

    # exit
    browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time} seconds")