Example #1
0
def get_html(url, options=None):
    """Fetch ``url`` with a browser-like User-Agent and return the body text.

    :param str url: address to request.
    :param options: optional mapping of extra request headers; its entries
        override the default headers built here.
    :type options: dict or NoneType
    :return: the response text when the status code is 200, otherwise
        ``None`` (non-200 status or connection failure).
    :rtype: str or NoneType
    """
    # Static value used when fake_useragent cannot supply a random agent, so
    # the request is still sent instead of failing on an unbound name.
    fallback_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.36')
    try:
        user_agent = UserAgent().random
    except FakeUserAgentError:
        user_agent = fallback_agent

    headers = {
        'User-Agent': user_agent,
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    # Caller-supplied headers win over the defaults.
    headers.update(options or {})

    print('Getting', url)
    try:
        response = requests.get(url, headers=headers)
    except (ConnectionError, requests.exceptions.ConnectionError):
        # requests' ConnectionError is not a subclass of the builtin one, so
        # both are listed to work regardless of which name the file imported.
        print('连接失败')
        return None

    if response.status_code == 200:
        return response.text
    print('获取源码失败')
    return None
Example #2
0
class Navigator:
    """Fetch web pages through urllib or a Selenium driver and parse them.

    The instance remembers the last requested URL, the raw page source, the
    parsed document, and the timestamps taken around loading and parsing.
    """

    def __init__(self,
                 driver='chrome',
                 user_agent='random',
                 request_method='urllib',
                 timeout=10):
        """
        :type driver: str or ChromeDriver or FirefoxDriver or NoneType
        :param driver: ``'chrome'`` or ``'firefox'`` (case-insensitive) to
            create a new driver, or an existing driver object (or ``None``)
            that is stored as-is.
        :param str user_agent: the default user agent, one of random, ie, ff, chrome, etc.
        :param str request_method: default method used by :meth:`get`, one of
            'urllib' or 'selenium'.
        :param timeout: seconds used as the urllib timeout and as the
            Selenium wait/sleep duration.
        :raises ValueError: if ``driver`` is a string other than the two
            supported names.
        """
        if isinstance(driver, str):
            driver_name = driver.lower()
            if driver_name == 'chrome':
                self._driver = Chrome()
            elif driver_name == 'firefox':
                self._driver = Firefox()
            else:
                raise ValueError(f'Unknown driver: "{driver}"')
        else:
            self._driver = driver
        self._user_agent = UserAgent(cache=False, use_cache_server=False)

        self._default_user_agent = user_agent
        self._default_request_method = request_method
        self._timeout = timeout
        self._url = None
        self._page_source = None
        self._parsed_html = None
        self._load_start_time = None
        self._load_end_time = None
        self._parse_start_time = None
        self._parse_end_time = None

    def __del__(self):
        """Quit the underlying driver, if there is one."""
        # ``_driver`` may be missing (``__init__`` raised before assigning it)
        # or ``None`` (explicitly passed by the caller); neither can be quit.
        driver = getattr(self, '_driver', None)
        if driver is not None:
            # Drop the reference first so the driver is never quit twice.
            self._driver = None
            driver.quit()

    @property
    def driver(self):
        """
        :rtype: ChromeDriver or FirefoxDriver
        """
        return self._driver

    def _get_by_driver(self, url, element_id=None, timeout_exception='error'):
        """Load ``url`` in the driver and return the driver's page source.

        :type url: str
        :param element_id: id of an element to wait for; when ``None`` the
            method simply sleeps for the configured timeout instead.
        :type element_id: str or NoneType
        :param str timeout_exception: what to do when the wait times out;
            only the first letter is inspected (case-insensitive): 'e' raises,
            'w' emits a warning, anything else is ignored.
        :raises TimeoutException: on timeout when ``timeout_exception``
            starts with 'e'.
        """
        self.driver.get(url=url)
        if element_id is None:
            time.sleep(self._timeout)
            return self.driver.page_source

        try:
            element_present = expected_conditions.presence_of_all_elements_located(
                (By.ID, element_id))
            WebDriverWait(driver=self.driver,
                          timeout=self._timeout).until(element_present)
            self._load_end_time = datetime.now()
        except TimeoutException as err:
            self._load_end_time = None
            self._page_source = None
            # ``.lower`` must be *called*; comparing the bound method to a
            # string is always False and silently disabled both branches.
            # Slicing (instead of indexing) tolerates an empty string.
            mode = timeout_exception[:1].lower()
            if mode == 'e':
                raise TimeoutException(
                    f'Timed out waiting for page:"{url}" to load!') from err
            if mode == 'w':
                warnings.warn(
                    message=f'Timed out waiting for page:"{url}" to load!')
        return self.driver.page_source

    def _get_by_urllib(self,
                       url,
                       user_agent=None,
                       encoding='utf-8',
                       headers=None):
        """Download ``url`` with urllib and return the decoded body.

        :type url: str
        :param user_agent: user agent name looked up on the ``UserAgent``
            object; ``None`` selects the instance default. Ignored when
            ``headers`` is given.
        :type user_agent: str or NoneType
        :param str encoding: encoding used to decode the response bytes.
        :param headers: request headers; when falsy, a single 'user-agent'
            header is generated.
        :type headers: dict or NoneType
        :rtype: str
        """
        user_agent = user_agent or self._default_user_agent
        if not headers:
            # Item access is the public lookup for a named user agent.
            headers = {'user-agent': self._user_agent[user_agent]}
        request = urllib.request.Request(url)
        for key, value in headers.items():
            request.add_header(key=key, val=value)

        with urllib.request.urlopen(request,
                                    timeout=self._timeout) as response:
            html = response.read().decode(encoding)
        return html

    def get(self,
            url,
            method=None,
            user_agent=None,
            element_id=None,
            encoding='utf-8',
            timeout_exception='error',
            parser='lxml'):
        """Fetch ``url`` and return it parsed, or raw when ``parser`` is falsy.

        :type url: str
        :param str or NoneType method: one of 'urllib' or 'selenium' or None, None will choose the default
        :param str or NoneType user_agent: one of None (to choose default), random, ie, ff, etc.
        :type element_id: str or NoneType
        :type encoding: str
        :param str timeout_exception: one of 'error', 'warning', 'ignore'
        :param parser: parser name handed to BeautifulSoup; a falsy value
            skips parsing and returns the page source string.
        :raises ValueError: if ``method`` is neither 'urllib' nor 'selenium'.
        :return: the parsed document, or the page source when not parsing.
        """
        self._url = url
        method = method or self._default_request_method
        method_name = method.lower()

        self._load_start_time = datetime.now()
        if method_name == 'urllib':
            html = self._get_by_urllib(url=url,
                                       encoding=encoding,
                                       user_agent=user_agent)
        elif method_name == 'selenium':
            html = self._get_by_driver(url=url,
                                       element_id=element_id,
                                       timeout_exception=timeout_exception)
        else:
            raise ValueError(f'Unknown method: "{method}"!')
        self._load_end_time = datetime.now()
        self._page_source = html
        if parser:
            return self.parse_html(parser=parser, html=html)
        return self._page_source

    def parse_html(self, parser, html):
        """Parse ``html`` with BeautifulSoup, recording start/end timestamps.

        :param parser: parser name passed to BeautifulSoup.
        :param html: markup to parse.
        :return: the parsed document, which is also stored on the instance.
        """
        self._parse_start_time = datetime.now()
        self._parsed_html = BeautifulSoup(html, parser)
        self._parse_end_time = datetime.now()
        return self._parsed_html

    @property
    def loading_time(self):
        """Result of ``get_elapsed_seconds`` over the recorded load timestamps."""
        return get_elapsed_seconds(start=self._load_start_time,
                                   end=self._load_end_time)

    @property
    def parsing_time(self):
        """Result of ``get_elapsed_seconds`` over the recorded parse timestamps."""
        return get_elapsed_seconds(start=self._parse_start_time,
                                   end=self._parse_end_time)