Code Example #1
File: throttled.py  Project: zhanglipku/mediacloud
    def request(self, request: Request) -> Response:
        """
        Execute domain throttled version of mediawords.util.web.user_agent.UserAgent.request.

        Before executing the request, the method will check whether a request has been made for this domain within the
        last self.domain_timeout seconds.  If so, the call will raise a McThrottledDomainException.
        Otherwise, the method will mark the time for this domain request in a postgres table and then execute
        UserAgent.request().

        The throttling routine will not be applied after the first successful request, to allow for redirects and
        other followup requests to succeed.  To ensure proper throttling, a new object should be created for each
        top level request.

        Accelerated domains and shortened links (e.g. http://bit.ly/EFGDfrTg) get their timeout divided by
        _ACCELERATED_DOMAIN_SPEEDUP_FACTOR.
        """
        if self._use_throttling:
            domain = mediawords.util.url.get_url_distinctive_domain(
                request.url())

            domain_timeout = self.domain_timeout
            if domain_timeout > 1 and (is_shortened_url(request.url())
                                       or domain in _ACCELERATED_DOMAINS):
                domain_timeout = max(
                    1,
                    int(self.domain_timeout /
                        _ACCELERATED_DOMAIN_SPEEDUP_FACTOR))

            # this postgres function returns true if we are allowed to make the request and false otherwise. this
            # function does not use a table lock, so some extra requests might sneak through, but that's better than
            # dealing with a lock.  we use a postgres function to make the race condition as rare as possible.
            got_domain_lock = self.db.query(
                "select get_domain_web_requests_lock(%s, %s)",
                (domain, domain_timeout)).flat()[0]

            log.debug("domain lock obtained for %s: %s" %
                      (str(request.url()), str(got_domain_lock)))

            if not got_domain_lock:
                raise McThrottledDomainException("domain " + str(domain) +
                                                 " is locked.")
        else:
            log.debug("domain lock obtained for %s: skipped" %
                      str(request.url()))

        self._use_throttling = False

        return super().request(request)
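
The key decision in this example is delegated to a PostgreSQL helper, get_domain_web_requests_lock(domain, timeout), which returns true only if no request for that domain has been recorded within the timeout window. Below is a minimal, single-process sketch of the same idea using an in-memory table instead of PostgreSQL; the class and exception names are illustrative and not part of mediacloud:

import time


class McThrottledDomainError(Exception):
    """Raised when a domain was requested less than domain_timeout seconds ago (illustrative)."""
    pass


class InMemoryDomainThrottle:
    """Single-process stand-in for the get_domain_web_requests_lock() PostgreSQL helper."""

    def __init__(self, domain_timeout: int) -> None:
        self.domain_timeout = domain_timeout
        self._last_request_at = {}  # domain -> UNIX timestamp of the last allowed request

    def try_acquire(self, domain: str) -> bool:
        """Return True (and record the attempt) if the domain may be requested right now."""
        now = time.time()
        last = self._last_request_at.get(domain)
        if last is not None and now - last < self.domain_timeout:
            return False
        self._last_request_at[domain] = now
        return True


throttle = InMemoryDomainThrottle(domain_timeout=10)
if not throttle.try_acquire("example.com"):
    raise McThrottledDomainError("domain example.com is locked.")
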
Code Example #2
    def __blacklist_request_if_needed(request: Request) -> Request:
        """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
        # FIXME there should be a better way to block those unwanted requests

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        blacklist_url_pattern = None
        if 'blacklist_url_pattern' in config['mediawords']:
            blacklist_url_pattern = config['mediawords'][
                'blacklist_url_pattern']

        if blacklist_url_pattern is not None and len(
                blacklist_url_pattern) > 0:
            if re.search(pattern=blacklist_url_pattern,
                         string=url,
                         flags=re.IGNORECASE | re.UNICODE):
                request.set_url("http://blacklistedsite.localhost/%s" % url)

        return request
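
The rewrite above points blocked requests at an unresolvable host while keeping the offending URL in the path, so the blocked target is still visible in logs. A self-contained sketch of the same check; the pattern and URL here are made-up placeholders, not mediacloud configuration:

import re

# Hypothetical pattern; in the example above it comes from config['mediawords']['blacklist_url_pattern'].
blacklist_url_pattern = r'\.internal\.example\.com'

url = "http://reports.internal.example.com/daily"

if blacklist_url_pattern and re.search(blacklist_url_pattern, url, flags=re.IGNORECASE | re.UNICODE):
    # Redirect the request to an unresolvable host while keeping the original URL visible in the path
    url = "http://blacklistedsite.localhost/%s" % url

print(url)  # http://blacklistedsite.localhost/http://reports.internal.example.com/daily
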
Code Example #3
File: __init__.py  Project: robpotter89/backend
    def __blacklist_request_if_needed(self, request: Request) -> Request:
        """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
        # FIXME there should be a better way to block those unwanted requests

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        blacklist_url_pattern = self._user_agent_config.blacklist_url_pattern()

        if blacklist_url_pattern:

            # MC_REWRITE_TO_PYTHON: a string might be coming from Perl
            if isinstance(blacklist_url_pattern, bytes):
                blacklist_url_pattern = decode_object_from_bytes_if_needed(
                    blacklist_url_pattern)
            if isinstance(blacklist_url_pattern, str):
                blacklist_url_pattern = re.compile(blacklist_url_pattern,
                                                   flags=re.IGNORECASE
                                                   | re.UNICODE)

            if re.search(pattern=blacklist_url_pattern,
                         string=url) is not None:
                request.set_url("http://0.0.0.1/%s" % url)

        return request
Code Example #4
File: throttled.py  Project: berkmancenter/mediacloud
    def request(self, request: Request) -> Response:
        """
        Execute domain throttled version of mediawords.util.web.user_agent.UserAgent.request.

        Before executing the request, the method will check whether a request has been made for this domain within the
        last self.domain_timeout seconds.  If so, the call will raise a McThrottledDomainException.
        Otherwise, the method will mark the time for this domain request in a postgres table and then execute
        UserAgent.request().

        The throttling routine will not be applied after the first successful request, to allow for redirects and
        other followup requests to succeed.  To ensure proper throttling, a new object should be created for each
        top level request.

        Accelerated domains and shortened links (e.g. http://bit.ly/EFGDfrTg) get their timeout divided by
        _ACCELERATED_DOMAIN_SPEEDUP_FACTOR.
        """
        if self._use_throttling:
            domain = mediawords.util.url.get_url_distinctive_domain(request.url())

            domain_timeout = self.domain_timeout
            if domain_timeout > 1 and (is_shortened_url(request.url()) or domain in _ACCELERATED_DOMAINS):
                domain_timeout = max(1, int(self.domain_timeout / _ACCELERATED_DOMAIN_SPEEDUP_FACTOR))

            # this postgres function returns true if we are allowed to make the request and false otherwise. this
            # function does not use a table lock, so some extra requests might sneak through, but that's better than
            # dealing with a lock.  we use a postgres function to make the race condition as rare as possible.
            got_domain_lock = self.db.query(
                "select get_domain_web_requests_lock(%s, %s)",
                (domain, domain_timeout)).flat()[0]

            log.debug("domain lock obtained for %s: %s" % (str(request.url()), str(got_domain_lock)))

            if not got_domain_lock:
                raise McThrottledDomainException("domain " + str(domain) + " is locked.")
        else:
            log.debug("domain lock obtained for %s: skipped" % str(request.url()))

        self._use_throttling = False

        return super().request(request)
Code Example #5
File: __init__.py  Project: robpotter89/backend
    def __prepare_request(self, request: Request) -> requests.PreparedRequest:
        """Create PreparedRequest from UserAgent's Request. Raises if one or more parameters are invalid."""
        method = request.method()
        if method is None:
            raise McRequestException("Request's method is None.")

        url = request.url()
        if url is None:
            raise McRequestException("Request's URL is None.")

        headers = request.headers()
        if headers is None:
            raise McRequestException("Request's headers is None.")

        auth_username = request.auth_username()
        auth_password = request.auth_password()
        if ((auth_username is None and auth_password is not None)
                or (auth_username is not None and auth_password is None)):
            raise McRequestException(
                "Either both or none of HTTP authentication credentials must be not None."
            )

        auth = None
        if auth_username is not None and auth_password is not None:
            if ((len(auth_username) == 0 and len(auth_password) > 0)
                    or (len(auth_username) > 0 and len(auth_password) == 0)):
                raise McRequestException(
                    "Either both or none of HTTP authentication credentials must be not Empty."
                )

            auth = HTTPBasicAuth(auth_username, auth_password)

        data = request.content()

        try:
            requests_request = requests.Request(
                method=method,
                url=url,
                data=data,
                headers=headers,
                auth=auth,
            )
            requests_prepared_request = self.__session.prepare_request(
                requests_request)

        except Exception as ex:
            raise McRequestException("Unable to prepare request %s: %s" % (
                str(request),
                str(ex),
            ))

        return requests_prepared_request
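
The method above translates mediacloud's own Request object into a requests.PreparedRequest through a shared Session, attaching HTTP Basic credentials only when both are present. A stripped-down sketch of that translation using plain requests; all values are placeholders:

import requests
from requests.auth import HTTPBasicAuth

session = requests.Session()

# Placeholder values; in the method above they come from the mediacloud Request object.
method = "GET"
url = "http://www.example.com/feed"
headers = {"User-Agent": "illustrative-agent/1.0"}
auth = HTTPBasicAuth("username", "password")  # only set when both credentials are non-empty

requests_request = requests.Request(
    method=method,
    url=url,
    data=None,
    headers=headers,
    auth=auth,
)
prepared = session.prepare_request(requests_request)

print(prepared.method, prepared.url)
print(prepared.headers.get("Authorization"))  # Basic auth header added while preparing
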
Code Example #6
    def __log_request(request: Request) -> None:
        """Log HTTP request."""
        # FIXME use Python's logging facilities

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        http_request_log_path = os.path.join(config['mediawords']['data_dir'],
                                             'logs', 'http_request.log')

        with open(http_request_log_path, 'a') as f:

            while True:
                try:
                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                    break
                except IOError as e:
                    # raise on unrelated IOErrors
                    if e.errno != errno.EAGAIN:
                        raise
                    else:
                        log.warning("Waiting for HTTP request log lock...")
                        time.sleep(0.1)

            f.write("%s %s\n" % (
                sql_now(),
                url,
            ))

            # Doesn't write "invalidating blacklist url <...>" because it's apparent from the URL itself

            fcntl.flock(f, fcntl.LOCK_UN)

        # Processes from various users (web service, workers, ...) will want to write to the same file
        try:
            os.chmod(http_request_log_path, 0o666)
        except PermissionError as ex:
            # Web server process might attempt at chmodding the file without the appropriate permissions
            log.debug("Failed to chmod %s: %s" % (
                http_request_log_path,
                str(ex),
            ))
            pass
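
The helper above serializes concurrent writers with an exclusive, non-blocking fcntl lock and polls until the lock is free so it can warn while waiting. A self-contained sketch of that locking pattern (Unix-only; the log path is a placeholder):

import errno
import fcntl
import time

log_path = "/tmp/http_request.log"  # placeholder path

with open(log_path, mode="a", encoding="utf-8") as f:

    # Poll for an exclusive lock instead of blocking outright so we can report that we are waiting.
    while True:
        try:
            fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
            break
        except IOError as e:
            if e.errno != errno.EAGAIN:
                # Re-raise unrelated I/O errors instead of spinning forever
                raise
            print("Waiting for log lock...")
            time.sleep(0.1)

    f.write("request logged at %f\n" % time.time())
    fcntl.flock(f, fcntl.LOCK_UN)
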
Code Example #7
    def __log_request(request: Request) -> None:
        """Log HTTP request."""
        # FIXME use Python's logging facilities

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        log.debug("HTTP request: %s %s\n" % (sql_now(), url,))
Code Example #8
File: __init__.py  Project: berkmancenter/mediacloud
    def __prepare_request(self, request: Request) -> requests.PreparedRequest:
        """Create PreparedRequest from UserAgent's Request. Raises if one or more parameters are invalid."""
        method = request.method()
        if method is None:
            raise McRequestException("Request's method is None.")

        url = request.url()
        if url is None:
            raise McRequestException("Request's URL is None.")

        headers = request.headers()
        if headers is None:
            raise McRequestException("Request's headers is None.")

        auth_username = request.auth_username()
        auth_password = request.auth_password()
        if ((auth_username is None and auth_password is not None) or (
                auth_username is not None and auth_password is None)):
            raise McRequestException("Either both or none of HTTP authentication credentials must be not None.")

        auth = None
        if auth_username is not None and auth_password is not None:
            if ((len(auth_username) == 0 and len(auth_password) > 0) or (
                    len(auth_username) > 0 and len(auth_password) == 0)):
                raise McRequestException("Either both or none of HTTP authentication credentials must be not Empty.")

            auth = HTTPBasicAuth(auth_username, auth_password)

        data = request.content()

        try:
            requests_request = requests.Request(
                method=method,
                url=url,
                data=data,
                headers=headers,
                auth=auth,
            )
            requests_prepared_request = self.__session.prepare_request(requests_request)

        except Exception as ex:
            raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

        return requests_prepared_request
Code Example #9
File: __init__.py  Project: berkmancenter/mediacloud
    def __log_request(request: Request) -> None:
        """Log HTTP request."""
        # FIXME use Python's logging facilities

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        http_request_log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log')

        with open(http_request_log_path, encoding='utf-8', mode='a') as f:

            while True:
                try:
                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                    break
                except IOError as e:
                    # raise on unrelated IOErrors
                    if e.errno != errno.EAGAIN:
                        raise
                    else:
                        log.warning("Waiting for HTTP request log lock...")
                        time.sleep(0.1)

            f.write("%s %s\n" % (sql_now(), url,))

            # Doesn't write "invalidating blacklist url <...>" because it's apparent from the URL itself

            fcntl.flock(f, fcntl.LOCK_UN)

        # Processes from various users (web service, workers, ...) will want to write to the same file
        try:
            os.chmod(http_request_log_path, 0o666)
        except PermissionError as ex:
            # Web server process might attempt at chmodding the file without the appropriate permissions
            log.debug("Failed to chmod %s: %s" % (http_request_log_path, str(ex),))
            pass
Code Example #10
File: throttled.py  Project: chautong/mediacloud
    def request(self, request: Request) -> Response:
        """
        Execute domain throttled version of mediawords.util.web.user_agent.UserAgent.request.

        Before executing the request, the method will check whether a request has been made for this domain within the
        last self.domain_timeout seconds.  If so, the call will raise a McThrottledUserAgentTimeoutException.
        Otherwise, the method will mark the time for this domain request in a postgres table and then execute
        UserAgent.request().
        """
        domain = mediawords.util.url.get_url_distinctive_domain(request.url())

        # this postgres function returns true if we are allowed to make the request and false otherwise.
        # this function does not use a table lock, so some extra requests might sneak through, but that's better than
        # dealing with a lock.  we use a postgres function to make the race condition as rare as possible.
        got_domain_lock = self.db.query(
            "select get_domain_web_requests_lock(%s, %s)",
            (domain, self.domain_timeout)).flat()[0]

        if not got_domain_lock:
            raise McThrottledUserAgentTimeoutException("domain " + str(domain) + " is locked.")

        return super(ThrottledUserAgent, self).request(request)
Code Example #11
File: __init__.py  Project: berkmancenter/mediacloud
    def __blacklist_request_if_needed(request: Request) -> Request:
        """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
        # FIXME there should be a better way to block those unwanted requests

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        blacklist_url_pattern = None
        if 'blacklist_url_pattern' in config['mediawords']:
            blacklist_url_pattern = config['mediawords']['blacklist_url_pattern']

        if blacklist_url_pattern is not None and len(blacklist_url_pattern) > 0:
            if re.search(pattern=blacklist_url_pattern, string=url, flags=re.IGNORECASE | re.UNICODE) is not None:
                request.set_url("http://0.0.0.1/%s" % url)

        return request
Code Example #12
    def request(self, request: Request) -> Response:
        """Execute a request, return a response.

        All other helpers are supposed to use request() internally as it implements max. size, callbacks, blacklisted
        URLs etc."""

        if request is None:
            raise McRequestException("Request is None.")

        request = self.__blacklist_request_if_needed(request=request)

        self.__log_request(request=request)

        method = request.method()
        if method is None:
            raise McRequestException("Request's method is None.")

        url = request.url()
        if url is None:
            raise McRequestException("Request's URL is None.")

        headers = request.headers()
        if headers is None:
            raise McRequestException("Request's headers is None.")

        auth_username = request.auth_username()
        auth_password = request.auth_password()
        if ((auth_username is None and auth_password is not None)
                or (auth_username is not None and auth_password is None)):
            raise McRequestException(
                "Either both or none of HTTP authentication credentials must be not None."
            )

        auth = None
        if auth_username is not None and auth_password is not None:
            if ((len(auth_username) == 0 and len(auth_password) > 0)
                    or (len(auth_username) > 0 and len(auth_password) == 0)):
                raise McRequestException(
                    "Either both or none of HTTP authentication credentials must be not Empty."
                )

            auth = HTTPBasicAuth(auth_username, auth_password)

        data = request.content()

        try:
            requests_request = requests.Request(
                method=method,
                url=url,
                data=data,
                headers=headers,
                auth=auth,
            )
            requests_prepared_request = self.__session.prepare_request(
                requests_request)

        except Exception as ex:
            raise McRequestException("Unable to prepare request %s: %s" % (
                str(request),
                str(ex),
            ))

        error_is_client_side = False

        try:
            requests_response = self.__session.send(
                request=requests_prepared_request,
                timeout=self.timeout(),

                # To be able to enforce max_size
                stream=True,
            )

        except requests.TooManyRedirects as ex:

            # On too many redirects, return the last fetched page (just like LWP::UserAgent does)
            log.warning("Exceeded max. redirects for URL %s" % request.url())
            requests_response = ex.response
            response_data = str(ex)

        except requests.Timeout as ex:

            log.warning("Timeout for URL %s" % request.url())

            # We treat timeouts as client-side errors too because we can retry on them
            error_is_client_side = True

            requests_response = requests.Response()
            requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
            requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
            requests_response.request = requests_prepared_request

            requests_response.history = []

            response_data = str(ex)

        except Exception as ex:

            # Client-side error
            log.warning("Client-side error while processing request %s: %s" % (
                str(request),
                str(ex),
            ))

            error_is_client_side = True

            requests_response = requests.Response()
            requests_response.status_code = HTTPStatus.BAD_REQUEST.value
            requests_response.reason = "Client-side error"
            requests_response.request = requests_prepared_request

            # Previous request / response chain is not built for client-side errored requests
            requests_response.history = []

            requests_response.headers = {
                # LWP::UserAgent compatibility
                'Client-Warning': 'Client-side error',
            }

            response_data = str(ex)

        else:

            try:

                max_size = self.max_size()

                response_data = ""
                read_response_data = True

                if max_size is not None:
                    content_length = requests_response.headers.get(
                        'Content-Length', None)

                    if content_length is not None:
                        content_length = int(content_length)
                        if content_length > max_size:
                            log.warning(
                                "Content-Length exceeds %d for URL %s" % (
                                    max_size,
                                    url,
                                ))

                            # Release the response to return connection back to the pool
                            # (http://docs.python-requests.org/en/master/user/advanced/#body-content-workflow)
                            requests_response.close()

                            read_response_data = False

                if read_response_data:

                    if requests_response.encoding is None:

                        if requests_response.apparent_encoding is None:
                            # If encoding is not in HTTP headers nor can be determined from content itself, assume that
                            # it's UTF-8
                            requests_response.encoding = 'UTF-8'

                        else:
                            # Test the encoding guesser's opinion, just like browsers do
                            requests_response.encoding = requests_response.apparent_encoding

                    else:

                        # If "Content-Type" HTTP header contains a string "text" and doesn't have "charset" property,
                        # "requests" falls back to setting the encoding to ISO-8859-1, which is probably not right
                        # (encoding might have been defined in the HTML content itself via <meta> tag), so we use the
                        # "apparent encoding" instead
                        if requests_response.encoding.lower() == 'iso-8859-1':
                            if requests_response.apparent_encoding is not None:
                                requests_response.encoding = requests_response.apparent_encoding

                    # Some pages report some funky encoding; in that case, fallback to UTF-8
                    try:
                        codecs.lookup(requests_response.encoding)
                    except LookupError:
                        log.warning("Invalid encoding %s for URL %s" %
                                    (requests_response.encoding,
                                     requests_response.url))
                        requests_response.encoding = 'UTF-8'

                    response_data_size = 0
                    for chunk in requests_response.iter_content(
                            chunk_size=None, decode_unicode=True):
                        response_data += chunk
                        response_data_size += len(chunk)

                        # Content-Length might be missing / lying, so we measure size while fetching the data too
                        if max_size is not None:
                            if response_data_size > max_size:
                                log.warning("Data size exceeds %d for URL %s" %
                                            (
                                                max_size,
                                                url,
                                            ))

                                # Release the response to return connection back to the pool
                                # (http://docs.python-requests.org/en/master/user/advanced/#body-content-workflow)
                                requests_response.close()

                                break

            except requests.RequestException as ex:

                log.warning("Error reading data for URL %s" % request.url())

                # We treat timeouts as client-side errors too because we can retry on them
                error_is_client_side = True

                requests_response = requests.Response()
                requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
                requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
                requests_response.request = requests_prepared_request

                requests_response.history = []

                response_data = str(ex)

        if requests_response is None:
            raise McRequestException("Response from 'requests' is None.")

        if response_data is None:
            # Probably a programming error
            raise McRequestException("Response data is None.")

        response = Response.from_requests_response(
            requests_response=requests_response,
            data=response_data,
        )

        if error_is_client_side:
            response.set_error_is_client_side(
                error_is_client_side=error_is_client_side)

        # Build the previous request / response chain from the redirects
        current_response = response
        for previous_rq_response in reversed(requests_response.history):
            previous_rq_request = previous_rq_response.request
            previous_response_request = Request.from_requests_prepared_request(
                requests_prepared_request=previous_rq_request)

            previous_response = Response.from_requests_response(
                requests_response=previous_rq_response)
            previous_response.set_request(request=previous_response_request)

            current_response.set_previous(previous=previous_response)
            current_response = previous_response

        # Redirects might have happened, so we have to recreate the request object from the latest page that was
        # redirected to
        response_request = Request.from_requests_prepared_request(
            requests_prepared_request=requests_response.request)
        response.set_request(response_request)

        return response
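
The body-reading branch above treats Content-Length only as a hint: the response is fetched with stream=True, an oversized declared length short-circuits the download, and the size is re-measured chunk by chunk because the header may be missing or wrong. A condensed sketch of that pattern with plain requests; the URL and size cap are placeholders:

import requests

max_size = 1024 * 1024           # 1 MB cap; placeholder value
url = "http://www.example.com/"  # placeholder URL

response = requests.get(url, stream=True, timeout=30)

data = ""
fetched_size = 0

# Content-Length is only a hint (it may be absent or wrong), so the size is
# checked again while streaming, mirroring the loop in the example above.
content_length = response.headers.get("Content-Length")
if content_length is not None and int(content_length) > max_size:
    response.close()  # return the connection to the pool without reading the body
else:
    if response.encoding is None:
        # Fall back to the guessed encoding, then UTF-8, as the example above does
        response.encoding = response.apparent_encoding or "UTF-8"
    for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
        data += chunk
        fetched_size += len(chunk)
        if fetched_size > max_size:
            response.close()
            break

print("fetched %d characters" % len(data))
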
Code Example #13
    def request(self, request: Request) -> Response:
        """Execute a request, return a response.

        All other helpers are supposed to use request() internally as it implements max. size, callbacks, blacklisted
        URLs etc."""

        if request is None:
            raise McRequestException("Request is None.")

        request = self.__blacklist_request_if_needed(request=request)

        self.__log_request(request=request)

        try:
            requests_prepared_request = self.__prepare_request(request)
        except Exception as ex:
            raise McRequestException("Unable to prepare request %s: %s" % (
                str(request),
                str(ex),
            ))

        try:
            user_agent_response = self.__execute_request(
                requests_prepared_request)
        except Exception as ex:
            raise McRequestException("Unable to execute request %s: %s" % (
                str(requests_prepared_request),
                str(ex),
            ))

        try:
            response_data = self.__read_response_data(
                user_agent_response.requests_response)
        except Exception as ex:
            log.warning("Error reading data for URL %s" % request.url())

            user_agent_response.requests_response = requests.Response()
            user_agent_response.requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
            user_agent_response.requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
            user_agent_response.requests_response.request = requests_prepared_request

            user_agent_response.requests_response.history = []

            # We treat timeouts as client-side errors too because we can retry on them
            user_agent_response.error_is_client_side = True

            response_data = str(ex)

        if user_agent_response.requests_response is None:
            raise McRequestException("Response from 'requests' is None.")

        if response_data is None:
            # Probably a programming error
            raise McRequestException("Response data is None.")

        response = Response.from_requests_response(
            requests_response=user_agent_response.requests_response,
            data=response_data,
        )

        if user_agent_response.error_is_client_side is True:
            response.set_error_is_client_side(
                error_is_client_side=user_agent_response.error_is_client_side)

        # Build the previous request / response chain from the redirects
        current_response = response
        for previous_rq_response in reversed(
                user_agent_response.requests_response.history):
            previous_rq_request = previous_rq_response.request
            previous_response_request = Request.from_requests_prepared_request(
                requests_prepared_request=previous_rq_request)

            # Sometimes reading the (chunked?) previous response's data fails with:
            #
            #      AttributeError: 'NoneType' object has no attribute 'readline'
            #
            # Previous response's data is not that important, so fail rather silently.
            try:
                previous_rq_response_data = previous_rq_response.text
            except Exception as ex:
                log.warning("Reading previous response's data failed: %s" %
                            str(ex))
                previous_rq_response_data = ''

            previous_response = Response.from_requests_response(
                requests_response=previous_rq_response,
                data=previous_rq_response_data)
            previous_response.set_request(request=previous_response_request)

            current_response.set_previous(previous=previous_response)
            current_response = previous_response

        # Redirects might have happened, so we have to recreate the request object from the latest page that was
        # redirected to
        response_request = Request.from_requests_prepared_request(
            requests_prepared_request=user_agent_response.requests_response.request)
        response.set_request(response_request)

        return response
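
Both request() variants finish by walking requests' redirect history in reverse to rebuild mediacloud's previous request/response chain and to recreate the request from the final redirected URL. A bare-bones sketch of the underlying requests API; the URL is a placeholder and the loop prints something only if the server actually redirects:

import requests

# http://github.com/ normally answers with a redirect to https://github.com/
response = requests.get("http://github.com/", timeout=30)

# response.history holds the intermediate redirect responses, oldest first;
# the examples above walk it in reverse to chain each Response to its predecessor.
for previous in reversed(response.history):
    print(previous.status_code, previous.request.method, previous.url,
          "->", previous.headers.get("Location"))

print("final:", response.status_code, response.url)
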