def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, use_robust_parser=False):
    '''
    Build and return a mechanize-based browser configured for web scraping.

    Refresh handling is switched on, robots.txt handling is switched off,
    a User-agent header is installed and any http/https proxies reported
    by get_proxies() are applied.

    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    :param mobile_browser: When no user_agent is given, pick USER_AGENT_MOBILE
                           instead of USER_AGENT
    :param user_agent: Explicit User-agent string; None selects a default
    :param use_robust_parser: If True the browser is created with
                              mechanize.RobustFactory() as its factory
    '''
    from calibre.utils.browser import Browser

    # Only pass a factory when the robust parser was requested.
    ctor_kwargs = {}
    if use_robust_parser:
        import mechanize
        ctor_kwargs['factory'] = mechanize.RobustFactory()
    br = Browser(**ctor_kwargs)

    br.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
    br.set_handle_robots(False)

    if user_agent is None:
        if mobile_browser:
            user_agent = USER_AGENT_MOBILE
        else:
            user_agent = USER_AGENT
    br.addheaders = [('User-agent', user_agent)]

    # Forward only the schemes that actually have a (truthy) proxy configured.
    available = get_proxies()
    selected = {}
    for scheme in ('http', 'https'):
        proxy = available.get(scheme, None)
        if proxy:
            selected[scheme] = proxy
    if selected:
        br.set_proxies(selected)

    return br
def browser(honor_time=True, max_time=2, user_agent=None, verify_ssl_certificates=True, handle_refresh=True, **kw):
    '''
    Build and return a mechanize-based browser configured for web scraping.

    Robots.txt handling is switched off, a User-agent header is installed
    and any http/https proxies reported by get_proxies() are applied.

    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    :param user_agent: Explicit User-agent string. None selects
                       random_user_agent(0, allow_ie=False); the special value
                       'common_words/based' selects common_english_word_ua()
    :param verify_ssl_certificates: If false SSL certificates errors are ignored
    :param handle_refresh: Passed as the first argument of set_handle_refresh()
    :param kw: Extra keyword arguments are accepted and not used
    '''
    from calibre.utils.browser import Browser

    br = Browser(verify_ssl=verify_ssl_certificates)
    br.set_handle_refresh(handle_refresh, max_time=max_time, honor_time=honor_time)
    br.set_handle_robots(False)

    # Resolve the two symbolic user agent choices to a concrete string.
    if user_agent == 'common_words/based':
        from calibre.utils.random_ua import common_english_word_ua
        user_agent = common_english_word_ua()
    elif user_agent is None:
        user_agent = random_user_agent(0, allow_ie=False)
    br.addheaders = [('User-agent', user_agent)]

    # Forward only the schemes that actually have a (truthy) proxy configured.
    available = get_proxies()
    selected = {}
    for scheme in ('http', 'https'):
        proxy = available.get(scheme, None)
        if proxy:
            selected[scheme] = proxy
    if selected:
        br.set_proxies(selected)

    return br
def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, verify_ssl_certificates=True, handle_refresh=True):
    '''
    Build and return a mechanize-based browser configured for web scraping.

    Robots.txt handling is switched off, a User-agent header is installed
    and any http/https proxies reported by get_proxies() are applied.

    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    :param mobile_browser: When no user_agent is given, pick USER_AGENT_MOBILE
                           instead of USER_AGENT
    :param user_agent: Explicit User-agent string; None selects a default
    :param verify_ssl_certificates: If false SSL certificates errors are ignored
    :param handle_refresh: Passed as the first argument of set_handle_refresh()
    '''
    from calibre.utils.browser import Browser

    br = Browser(verify_ssl=verify_ssl_certificates)
    br.set_handle_refresh(handle_refresh, max_time=max_time, honor_time=honor_time)
    br.set_handle_robots(False)

    if user_agent is None:
        if mobile_browser:
            user_agent = USER_AGENT_MOBILE
        else:
            user_agent = USER_AGENT
    br.addheaders = [('User-agent', user_agent)]

    # Forward only the schemes that actually have a (truthy) proxy configured.
    available = get_proxies()
    selected = {}
    for scheme in ('http', 'https'):
        proxy = available.get(scheme, None)
        if proxy:
            selected[scheme] = proxy
    if selected:
        br.set_proxies(selected)

    return br
def browser(
    honor_time=True,
    max_time=2,
    mobile_browser=False,
    user_agent=None,
    use_robust_parser=False,
    verify_ssl_certificates=True,
):
    """
    Build and return a mechanize-based browser configured for web scraping.

    Refresh handling is switched on, robots.txt handling is switched off,
    a User-agent header is installed and any http/https proxies reported
    by get_proxies() are applied.

    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    :param mobile_browser: When no user_agent is given, pick USER_AGENT_MOBILE
                           instead of USER_AGENT
    :param user_agent: Explicit User-agent string; None selects a default
    :param use_robust_parser: If True the browser is created with
                              mechanize.RobustFactory() as its factory
    :param verify_ssl_certificates: If false SSL certificates errors are ignored
    """
    from calibre.utils.browser import Browser

    # Only pass a factory when the robust parser was requested.
    ctor_kwargs = {}
    if use_robust_parser:
        import mechanize

        ctor_kwargs["factory"] = mechanize.RobustFactory()
    ctor_kwargs["verify_ssl"] = verify_ssl_certificates
    br = Browser(**ctor_kwargs)

    br.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
    br.set_handle_robots(False)

    if user_agent is None:
        if mobile_browser:
            user_agent = USER_AGENT_MOBILE
        else:
            user_agent = USER_AGENT
    br.addheaders = [("User-agent", user_agent)]

    # Forward only the schemes that actually have a (truthy) proxy configured.
    available = get_proxies()
    selected = {}
    for scheme in ("http", "https"):
        proxy = available.get(scheme, None)
        if proxy:
            selected[scheme] = proxy
    if selected:
        br.set_proxies(selected)

    return br