예제 #1
0
def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, use_robust_parser=False):
    '''
    Create a mechanize browser for web scraping. The browser handles cookies,
    refresh requests and ignores robots.txt. Also uses proxy if available.

    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    '''
    from calibre.utils.browser import Browser
    if use_robust_parser:
        import mechanize
        opener = Browser(factory=mechanize.RobustFactory())
    else:
        opener = Browser()
    opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
    opener.set_handle_robots(False)
    if user_agent is None:
        user_agent = USER_AGENT_MOBILE if mobile_browser else USER_AGENT
    opener.addheaders = [('User-agent', user_agent)]
    proxies = get_proxies()
    to_add = {}
    http_proxy = proxies.get('http', None)
    if http_proxy:
        to_add['http'] = http_proxy
    https_proxy = proxies.get('https', None)
    if https_proxy:
        to_add['https'] = https_proxy
    if to_add:
        opener.set_proxies(to_add)

    return opener
예제 #2
0
파일: __init__.py 프로젝트: zyhong/calibre
def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, verify_ssl_certificates=True, handle_refresh=True):
    '''
    Create a mechanize browser for web scraping. The browser handles cookies,
    refresh requests and ignores robots.txt. Also uses proxy if available.

    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    :param verify_ssl_certificates: If false SSL certificates errors are ignored
    '''
    from calibre.utils.browser import Browser
    opener = Browser(verify_ssl=verify_ssl_certificates)
    opener.set_handle_refresh(handle_refresh, max_time=max_time, honor_time=honor_time)
    opener.set_handle_robots(False)
    if user_agent is None:
        user_agent = USER_AGENT_MOBILE if mobile_browser else USER_AGENT
    opener.addheaders = [('User-agent', user_agent)]
    proxies = get_proxies()
    to_add = {}
    http_proxy = proxies.get('http', None)
    if http_proxy:
        to_add['http'] = http_proxy
    https_proxy = proxies.get('https', None)
    if https_proxy:
        to_add['https'] = https_proxy
    if to_add:
        opener.set_proxies(to_add)

    return opener
예제 #3
0
def browser(honor_time=True, max_time=2, user_agent=None, verify_ssl_certificates=True, handle_refresh=True, **kw):
    '''
    Create a mechanize browser for web scraping. The browser handles cookies,
    refresh requests and ignores robots.txt. Also uses proxy if available.

    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    :param verify_ssl_certificates: If false SSL certificates errors are ignored
    '''
    from calibre.utils.browser import Browser
    opener = Browser(verify_ssl=verify_ssl_certificates)
    opener.set_handle_refresh(handle_refresh, max_time=max_time, honor_time=honor_time)
    opener.set_handle_robots(False)
    if user_agent is None:
        user_agent = random_user_agent(0, allow_ie=False)
    elif user_agent == 'common_words/based':
        from calibre.utils.random_ua import common_english_word_ua
        user_agent = common_english_word_ua()
    opener.addheaders = [('User-agent', user_agent)]
    proxies = get_proxies()
    to_add = {}
    http_proxy = proxies.get('http', None)
    if http_proxy:
        to_add['http'] = http_proxy
    https_proxy = proxies.get('https', None)
    if https_proxy:
        to_add['https'] = https_proxy
    if to_add:
        opener.set_proxies(to_add)

    return opener
예제 #4
0
def browser(
    honor_time=True,
    max_time=2,
    mobile_browser=False,
    user_agent=None,
    use_robust_parser=False,
    verify_ssl_certificates=True,
):
    """
    Create a mechanize browser for web scraping. The browser handles cookies,
    refresh requests and ignores robots.txt. Also uses proxy if available.

    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    :param verify_ssl_certificates: If false SSL certificates errors are ignored
    """
    from calibre.utils.browser import Browser

    if use_robust_parser:
        import mechanize

        opener = Browser(factory=mechanize.RobustFactory(), verify_ssl=verify_ssl_certificates)
    else:
        opener = Browser(verify_ssl=verify_ssl_certificates)
    opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
    opener.set_handle_robots(False)
    if user_agent is None:
        user_agent = USER_AGENT_MOBILE if mobile_browser else USER_AGENT
    opener.addheaders = [("User-agent", user_agent)]
    proxies = get_proxies()
    to_add = {}
    http_proxy = proxies.get("http", None)
    if http_proxy:
        to_add["http"] = http_proxy
    https_proxy = proxies.get("https", None)
    if https_proxy:
        to_add["https"] = https_proxy
    if to_add:
        opener.set_proxies(to_add)

    return opener
예제 #5
0
def jsbrowser(*args, **kwargs):
    from calibre.web.jsbrowser.browser import Browser
    return Browser(*args, **kwargs)