Example #1
0
def get_session(**kwargs) -> HTMLSession:
    """Build an ``HTMLSession`` configured from keyword options.

    Keyword Args:
        use_proxy: When truthy, both ``http`` and ``https`` traffic is sent
            through ``rproxy:5566``. Treated as false when omitted.
        default_header: When falsy or omitted, the session headers are
            replaced with the result of ``get_headers()``. When truthy, the
            headers the session was created with are left untouched.

    Returns:
        The configured session.
    """
    session = HTMLSession()
    # .get() lets callers omit either option instead of hitting a KeyError.
    if kwargs.get('use_proxy'):
        session.proxies = {'http': 'rproxy:5566', 'https': 'rproxy:5566'}
    if not kwargs.get('default_header'):
        session.headers = get_headers()
    return session
Example #2
0
    def __init__(self, **kwargs):
        '''
        Base class for common scraping tasks

        Creates an ``HTMLSession``, configures it from the keyword options
        and stores it as ``self.s``.

        Args:
            delay: stored as ``self.delay``; 2 when missing or falsy.
            expire_hours: stored as ``self.expire_hours`` and used as the
                cache expiry; 168 when missing or falsy.
            cookies: cookie jar assigned to the session; an empty
                ``MozillaCookieJar`` is used when missing or falsy.
            headers: replaces the session headers; a fixed Chrome
                User-Agent header is used when missing or falsy.
            proxies: assigned to the session when truthy.
            cache_name: cache location, stored as ``self.cache_name``.
                A value without a '/' is placed under ``/tmp``; a value
                containing a '/' is used as given; when missing or falsy a
                random name under ``/tmp`` is generated.
        '''
        logging.getLogger(__name__).addHandler(logging.NullHandler())
        self.urls = []

        # use requests HTML to aid parsing
        # has all same methods as requests.Session
        _s = HTMLSession()

        # delay/expire (falsy values fall back to the defaults)
        self.delay = kwargs.get('delay') or 2
        self.expire_hours = kwargs.get('expire_hours') or 168

        # add cookies
        if kwargs.get('cookies'):
            _s.cookies = kwargs['cookies']
        else:
            try:
                # Python 2 module name
                import cookielib
                _s.cookies = cookielib.MozillaCookieJar()
            except (NameError, ImportError):
                import http.cookiejar
                _s.cookies = http.cookiejar.MozillaCookieJar()

        # add headers
        if kwargs.get('headers'):
            _s.headers = kwargs['headers']
        else:
            ua = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
            _s.headers = {'User-Agent': ua}

        # add proxies
        if kwargs.get('proxies'):
            _s.proxies = kwargs['proxies']

        # add cache
        # self.cache_name must be set on every path: it is read below by
        # both cache backends.
        cache_name = kwargs.get('cache_name')
        if not cache_name:
            import uuid
            cache_name = uuid.uuid4().hex
        if '/' not in cache_name:
            cache_name = os.path.join('/tmp', cache_name)
        self.cache_name = cache_name

        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache
            _s.mount('http://', CacheControlAdapter(
                cache=FileCache(self.cache_name),
                cache_etags=False,
                heuristic=ExpiresAfter(hours=self.expire_hours)))
        except ImportError:
            # cachecontrol is not installed: try requests_cache instead.
            # Caching is best-effort, so a failure here is only logged.
            try:
                import requests_cache
                requests_cache.install_cache(self.cache_name)
            except Exception:
                logging.exception('could not install cache')
        self.s = _s
Example #3
0
    def __init__(self, **kwargs):
        """Create and configure the ``HTMLSession`` stored as ``self.session``.

        Keyword Args:
            delay: stored as ``self.delay`` (default 2).
            expire_hours: stored as ``self.expire_hours`` (default 168) and
                used as the cache expiry.
            cookies: cookie jar assigned to the session; an empty
                ``MozillaCookieJar`` is used when missing or falsy.
            headers: mapping merged on top of the default headers, so
                caller-supplied keys win.
            proxies: assigned to the session when truthy.
            cache_name: cache location, stored as ``self.cache_name``.
                When missing or falsy a random 32-character name under
                ``/tmp`` is used; a value without a "/" is placed under
                ``/tmp``; any other value is used as given.
        """
        logging.getLogger(__name__).addHandler(logging.NullHandler())
        self.urls = []

        # use requests HTML to aid parsing
        # has all same methods as requests.Session
        _s = HTMLSession()
        self.delay = kwargs.get("delay", 2)
        self.expire_hours = kwargs.get("expire_hours", 168)

        # add cookies
        if kwargs.get("cookies"):
            _s.cookies = kwargs["cookies"]
        else:
            import http.cookiejar

            _s.cookies = http.cookiejar.MozillaCookieJar()

        # add headers: defaults first, then caller overrides
        default_headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "accept": "application/json, text/plain, */*",
        }
        _s.headers.update(default_headers)
        if kwargs.get("headers"):
            _s.headers.update(kwargs["headers"])

        # add proxies
        if kwargs.get("proxies"):
            _s.proxies = kwargs["proxies"]

        # add cache
        cache_name = kwargs.get("cache_name")
        if not cache_name:
            self.cache_name = os.path.join("/tmp", random_string(32))
        elif "/" not in cache_name:
            self.cache_name = os.path.join("/tmp", cache_name)
        else:
            self.cache_name = cache_name

        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache

            _s.mount(
                "http://",
                CacheControlAdapter(
                    cache=FileCache(self.cache_name),
                    cache_etags=False,
                    heuristic=ExpiresAfter(hours=self.expire_hours),
                ),
            )
        except ImportError:
            # cachecontrol is not installed: try requests_cache instead.
            try:
                import requests_cache

                requests_cache.install_cache(self.cache_name)
            except Exception:
                # Caching is best-effort, so ordinary failures are only
                # logged. Exception (not BaseException) lets
                # KeyboardInterrupt and SystemExit propagate.
                logging.exception("could not install cache")
        self.session = _s
Example #4
0
    def auth_html(self, order_id: str):
        """
        Request and render ``settings.LOGIN_PAGE``, retrying on HTTP 403.

        On a 403 response, and when ``self.is_update_proxy`` is set, an
        updated proxy is requested from ``self.api_worker`` before the next
        attempt. At most ``self.number_attempts`` attempts are made. Every
        failure is also reported through ``self._send_task_report``.

        :param order_id: str, included in every task report
        :return: dict with keys ``status``, ``error``, ``status_code``,
            ``type_res`` and ``proxy``, plus ``page_content`` (rendered
            HTML) on success or ``message`` (error text) on failure
        """

        def proxy_info():
            # Read at call time so a proxy replaced during a retry is the
            # one that gets reported.
            return (self.proxy_worker.get_proxy_id(),
                    self.proxy_worker.get_proxy_dict())

        def report(kind, error, code):
            self._send_task_report(kind,
                                   data={
                                       "message": repr(error),
                                       "code": code,
                                       "order": order_id
                                   })

        def failure(status_code, message):
            return {
                "status": False,
                "error": True,
                "status_code": status_code,
                "message": message,
                "type_res": "request_module",
                "proxy": proxy_info()
            }

        count: int = 0
        # The context manager closes the session on every exit path.
        with HTMLSession() as session:
            session.proxies = self.proxy_worker.get_proxy_dict()
            session.headers = self.headers_work.get_headers()
            cookies = self.cookies_work.get_cookies()
            while count < self.number_attempts:
                try:
                    response = session.get(settings.LOGIN_PAGE,
                                           cookies=cookies)
                    response.html.render()
                    data = response.html.html
                except requests.exceptions.ConnectionError as error:
                    report("target_connect_error", error, '')
                    return failure('0', repr(error))

                try:
                    response.raise_for_status()
                except requests.HTTPError as error:
                    code = str(response.status_code)
                    if response.status_code == 403:
                        if self.is_update_proxy:
                            # update proxy server settings
                            proxy = self.api_worker.update_proxy(
                                self.proxy_worker.get_proxy_id())
                            if proxy:
                                self.proxy_worker.set_proxy_data(
                                    proxy[1], proxy[0])
                                session.proxies = (
                                    self.proxy_worker.get_proxy_dict())
                        count += 1
                        time.sleep(config.DELAY_REQUESTS)
                        report("main_content_error", error, code)
                        continue
                    # Any other HTTP error is final: no retry.
                    report("main_content_error", error, code)
                    return failure(code, repr(error))
                except requests.exceptions.RequestException as error:
                    code = str(response.status_code)
                    report("main_content_error", error, code)
                    return failure(code, repr(error))

                return {
                    "status": True,
                    "error": False,
                    "status_code": str(response.status_code),
                    "page_content": data,
                    "type_res": "request_module",
                    "proxy": proxy_info()
                }

        # Every attempt ended in a 403.
        return failure(
            "403",
            "Perhaps the proxy server did not respond in time. 403 HTTPError")
Example #5
0
    def get_content(self, link: str, order_id: str):
        """
        Request page content for a given link.
        If the request status is 403 and ``self.is_update_proxy`` is set,
        it requests an updated proxy server from the system api before the
        next attempt. At most ``self.number_attempts`` attempts are made.
        Every failure is also reported through ``self._send_task_report``.

        :param link: str, URL to request
        :param order_id: str, included in every task report
        :return: dict with keys ``status``, ``error``, ``status_code``,
            ``message``, ``type_res`` and ``proxy``; ``message`` is the
            response text on success and the error text on failure
        """

        def proxy_info():
            # Read at call time so a proxy replaced during a retry is the
            # one that gets reported.
            return (self.proxy_worker.get_proxy_id(),
                    self.proxy_worker.get_proxy_dict())

        def report(kind, error, code):
            self._send_task_report(kind,
                                   data={
                                       "message": repr(error),
                                       "code": code,
                                       "order": order_id
                                   })

        def failure(status_code, message):
            return {
                "status": False,
                "error": True,
                "status_code": status_code,
                "message": message,
                "type_res": "request_module",
                "proxy": proxy_info()
            }

        count: int = 0
        # The context manager closes the session exactly once, on every
        # exit path, instead of closing it mid-loop and reusing it on retry.
        with HTMLSession() as session:
            session.proxies = self.proxy_worker.get_proxy_dict()
            session.headers = settings.LOGIN_HEADERS
            cookies = self.cookies_work.get_cookies()
            while count < self.number_attempts:
                try:
                    response = session.get(link,
                                           timeout=(config.REQUEST_TIMEOUT,
                                                    config.RESPONSE_TIMEOUT),
                                           cookies=cookies)
                except requests.exceptions.ConnectionError as error:
                    report("target_connect_error", error, '')
                    return failure('0', repr(error))

                try:
                    response.raise_for_status()
                except requests.HTTPError as error:
                    code = str(response.status_code)
                    if response.status_code == 403:
                        if self.is_update_proxy:
                            # update proxy server settings
                            proxy = self.api_worker.update_proxy(
                                self.proxy_worker.get_proxy_id())
                            if proxy:
                                self.proxy_worker.set_proxy_data(
                                    proxy[1], proxy[0])
                                session.proxies = (
                                    self.proxy_worker.get_proxy_dict())
                        count += 1
                        time.sleep(config.DELAY_REQUESTS)
                        report("main_content_error", error, code)
                        continue
                    # Any other HTTP error is final: no retry.
                    report("main_content_error", error, code)
                    return failure(code, repr(error))
                except requests.exceptions.RequestException as error:
                    code = str(response.status_code)
                    report("main_content_error", error, code)
                    return failure(code, repr(error))

                return {
                    "status": True,
                    "error": False,
                    "status_code": str(response.status_code),
                    "message": response.text,
                    "type_res": "request_module",
                    "proxy": proxy_info()
                }

        # Every attempt ended in a 403.
        return failure(
            "403",
            "Perhaps the proxy server did not respond in time. 403 HTTPError")