Exemple #1
0
    def __init__(self, lua, exceptions, reply, exposed_request, har_entry=None, read_body=False):
        """
        Set up the response wrapper for ``reply``.

        Response information comes from ``har_entry["response"]`` when a
        HAR entry is given. Otherwise it is computed with ``reply2har``;
        ``include_content=True, binary_content=False`` is passed only when
        ``read_body`` is truthy.
        """
        super(_ExposedResponse, self).__init__(lua, exceptions)
        self.headers = self.lua.python2lua(get_headers_dict(reply))

        if har_entry is not None:
            # Caller supplied a HAR entry: use its response section as-is.
            response_info = har_entry["response"]
        elif read_body:
            response_info = reply2har(reply, include_content=True, binary_content=False)
        else:
            response_info = reply2har(reply)

        self.request = exposed_request
        self._info = response_info
        # Both attributes start out as None in this constructor.
        self._info_lua = None
        self._body_binary = None
Exemple #2
0
    def store_reply_finished(self, req_id, reply):
        """
        Record final timings and response data for a finished reply.

        Does nothing when the log has no entry for ``req_id``.
        """
        if not self.log.has_entry(req_id):
            return

        entry = self.log.get_mutable_entry(req_id)
        entry["_splash_processing_state"] = self.REQUEST_FINISHED

        # Timings are measured against the timestamps kept under "_tmp".
        finished_at = datetime.utcnow()
        tmp = entry['_tmp']
        started_at = tmp['start_time']
        response_started_at = tmp['response_start_time']

        receive_time = get_duration(response_started_at, finished_at)
        total_time = get_duration(started_at, finished_at)

        timings = entry["timings"]
        timings["receive"] = receive_time
        entry["time"] = total_time

        if not timings["send"]:
            # No send time recorded: use what remains of the total after
            # receive and wait, storing 0 for anything below 1e-6.
            wait_time = timings["wait"]
            send_time = total_time - receive_time - wait_time
            timings["send"] = 0 if send_time < 1e-6 else send_time

        # Merge the reply's HAR data (content included) into the response.
        entry["response"].update(reply2har(reply, include_content=True))
Exemple #3
0
    def _handleMetaData(self):
        """Handle the signal emitted before reading response body, after
        getting headers.

        Processes reply cookies, runs any ``on_response_headers`` callbacks
        (logging, not raising, their errors) and updates the HAR entry
        unless that entry is already marked as finished.
        """
        reply = self.sender()
        self._handle_reply_cookies(reply)

        callbacks = self._getWebPageAttribute(reply.request(), "callbacks")
        has_header_callbacks = callbacks and "on_response_headers" in callbacks
        header_callbacks = callbacks["on_response_headers"] if has_header_callbacks else ()

        for header_callback in header_callbacks:
            try:
                header_callback(reply)
            except:
                # TODO unhandled exceptions in lua callbacks
                # should we raise errors here?
                # https://github.com/scrapinghub/splash/issues/161
                self.log("error in on_response_headers callback", min_level=1)
                self.log(traceback.format_exc(), min_level=1)

        har_entry = self._harEntry()
        if har_entry is not None:
            tmp = har_entry["_tmp"]
            if tmp["state"] == self.REQUEST_FINISHED:
                self.log("Headers received for {url}; ignoring", reply, min_level=3)
                return

            tmp["state"] = self.REQUEST_HEADERS_RECEIVED
            har_entry["response"].update(har_qt.reply2har(reply))

            # "wait" is the time from request_sent_time until now.
            headers_at = datetime.utcnow()
            request_sent = tmp["request_sent_time"]
            tmp["response_start_time"] = headers_at
            har_entry["timings"]["wait"] = har.get_duration(request_sent, headers_at)

        self.log("Headers received for {url}", reply, min_level=3)
Exemple #4
0
    def store_reply_finished(self, req_id, reply, content):
        """
        Finalize the log entry of a reply that has finished.

        Marks the entry as finished, fills in the ``receive``/``send``
        timings and the total ``time``, then merges
        ``reply2har(reply, content=content)`` into the entry's response.
        Returns without changes if ``req_id`` has no log entry.
        """
        log = self.log
        if not log.has_entry(req_id):
            return
        entry = log.get_mutable_entry(req_id)
        entry["_splash_processing_state"] = self.REQUEST_FINISHED

        # update timings
        finished = datetime.utcnow()
        timestamps = entry["_tmp"]
        request_started = timestamps["start_time"]
        response_started = timestamps["response_start_time"]

        receive_time = get_duration(response_started, finished)
        total_time = get_duration(request_started, finished)

        timings = entry["timings"]
        timings["receive"] = receive_time
        entry["time"] = total_time

        if not timings["send"]:
            # Derive "send" as the part of the total not covered by
            # receive and wait; values below 1e-6 are stored as 0.
            wait_time = timings["wait"]
            remainder = total_time - receive_time - wait_time
            if remainder < 1e-6:
                remainder = 0
            timings["send"] = remainder

        # update other reply information
        entry["response"].update(reply2har(reply, content=content))
Exemple #5
0
    def _handleFinished(self):
        """
        Handle a finished reply.

        Cancels the reply's timer and, when a HAR entry is available,
        marks it finished, stores the final timings and merges the
        reply's HAR data into the entry's response section.
        """
        reply = self.sender()
        self._cancelReplyTimer(reply)

        har_entry = self._harEntry()
        if har_entry is not None:
            tmp = har_entry["_tmp"]
            tmp["state"] = self.REQUEST_FINISHED

            finished_at = datetime.utcnow()
            started_at = tmp['start_time']
            response_started_at = tmp['response_start_time']

            receive_time = har.get_duration(response_started_at, finished_at)
            total_time = har.get_duration(started_at, finished_at)

            timings = har_entry["timings"]
            timings["receive"] = receive_time
            har_entry["time"] = total_time

            if not timings["send"]:
                # No send time yet: total minus receive minus wait,
                # with anything below 1e-6 stored as 0.
                wait_time = timings["wait"]
                send_time = total_time - receive_time - wait_time
                timings["send"] = 0 if send_time < 1e-6 else send_time

            har_entry["response"].update(har_qt.reply2har(reply))

        self.log("Finished downloading {url}", reply)
Exemple #6
0
    def _handleFinished(self):
        """
        Complete the HAR entry (if any) of the reply that just finished
        and log that the download is done.
        """
        reply = self.sender()
        entry = self._harEntry()
        if entry is not None:
            entry["_tmp"]["state"] = self.REQUEST_FINISHED

            # Both durations are measured up to the same "now".
            now = datetime.utcnow()
            timestamps = entry['_tmp']
            request_started = timestamps['start_time']
            response_started = timestamps['response_start_time']
            receive_time = har.get_duration(response_started, now)
            total_time = har.get_duration(request_started, now)

            timing_info = entry["timings"]
            timing_info["receive"] = receive_time
            entry["time"] = total_time

            if not timing_info["send"]:
                wait_time = timing_info["wait"]
                leftover = total_time - receive_time - wait_time
                # Leftovers below 1e-6 are stored as 0.
                if leftover < 1e-6:
                    leftover = 0
                timing_info["send"] = leftover

            entry["response"].update(har_qt.reply2har(reply))

        self.log("Finished downloading {url}", reply)
Exemple #7
0
 def store_new_reply(self, req_id, reply):
     """
     Merge the initial HAR data of ``reply`` into its log entry.

     Nothing happens when the log has no entry for ``req_id``.
     """
     if self.log.has_entry(req_id):
         response = self.log.get_mutable_entry(req_id)["response"]
         response.update(reply2har(reply))
Exemple #8
0
 def store_new_reply(self, req_id, reply):
     """
     Store the first HAR snapshot of ``reply`` in the entry for ``req_id``.

     Returns immediately if the log does not contain such an entry.
     """
     log = self.log
     if not log.has_entry(req_id):
         return
     response_section = log.get_mutable_entry(req_id)["response"]
     reply_info = reply2har(reply)
     response_section.update(reply_info)
    def createRequest(self, operation, request, outgoingData=None):
        """
        This method is called when a new request is sent;
        it must return a reply object to work with.

        The request is wrapped and given custom headers and cookies, any
        registered ``on_request`` callbacks are run, a HAR entry is
        initialised when ``_harEntry`` yields one, and the reply created
        by the parent class is connected to this manager's handlers.
        """
        # Taken before any processing; handed to _initialHarData below.
        start_time = datetime.utcnow()

        request = self._wrapRequest(request)
        self._handle_custom_headers(request)
        self._handle_request_cookies(request)

        with self._proxyApplied(request):
            callbacks = self._getWebPageAttribute(request, "callbacks")
            if callbacks and 'on_request' in callbacks:
                for cb in callbacks["on_request"]:
                    try:
                        cb(request, operation, outgoingData)
                    except:
                        # Unhandled exceptions in createRequest method cause
                        # segfaults, so we log all errors.
                        self.log("error in on_resource_requested callback", min_level=1)
                        self.log(traceback.format_exc(), min_level=1)

            # NOTE(review): custom_proxy is presumably attached to the
            # request by an on_request callback - confirm.
            if hasattr(request, 'custom_proxy'):
                self.setProxy(request.custom_proxy)

            # har_entry is None when no HAR entry is available for this
            # request; every HAR update below is skipped in that case.
            har_entry = self._harEntry(request, create=True)
            if har_entry is not None:
                har_entry.update(self._initialHarData(
                    start_time=start_time,
                    operation=operation,
                    request=request,
                    outgoingData=outgoingData
                ))

            reply = super(ProxiedQNetworkAccessManager, self).createRequest(
                operation, request, outgoingData
            )

            # NOTE(review): the factor 1000 presumably converts seconds to
            # milliseconds - confirm. A falsy result sets no reply timeout.
            if hasattr(request, 'timeout'):
                timeout = request.timeout * 1000
                if timeout:
                    self._setReplyTimeout(reply, timeout)

            if har_entry is not None:
                har_entry["response"].update(har_qt.reply2har(reply))

            # Route the reply's signals to this manager's handlers.
            reply.error.connect(self._handleError)
            reply.finished.connect(self._handleFinished)
            # http://doc.qt.io/qt-5/qnetworkreply.html#metaDataChanged
            reply.metaDataChanged.connect(self._handleMetaData)
            reply.downloadProgress.connect(self._handleDownloadProgress)

        return reply
Exemple #10
0
    def createRequest(self, operation, request, outgoingData=None):
        """
        This method is called when a new request is sent;
        it must return a reply object to work with.

        Steps, in order: wrap the request and apply custom headers and
        cookies; run ``on_request`` callbacks; apply a per-request proxy
        if the request carries one; initialise the HAR entry; create the
        reply via the parent class; set a reply timeout if requested;
        connect the reply's signals to this manager's handlers.
        """
        # Start timestamp for the HAR data, taken before any other work.
        start_time = datetime.utcnow()

        request = self._wrapRequest(request)
        self._handle_custom_headers(request)
        self._handle_request_cookies(request)

        with self._proxyApplied(request):
            callbacks = self._getWebPageAttribute(request, "callbacks")
            if callbacks and 'on_request' in callbacks:
                for cb in callbacks["on_request"]:
                    try:
                        cb(request, operation, outgoingData)
                    except:
                        # Unhandled exceptions in createRequest method cause
                        # segfaults, so we log all errors.
                        self.log("error in on_resource_requested callback",
                                 min_level=1)
                        self.log(traceback.format_exc(), min_level=1)

            # Only requests that carry a custom_proxy attribute change
            # the proxy here.
            if hasattr(request, 'custom_proxy'):
                self.setProxy(request.custom_proxy)

            # When _harEntry returns None, no HAR data is recorded.
            har_entry = self._harEntry(request, create=True)
            if har_entry is not None:
                har_entry.update(
                    self._initialHarData(start_time=start_time,
                                         operation=operation,
                                         request=request,
                                         outgoingData=outgoingData))

            reply = super(ProxiedQNetworkAccessManager,
                          self).createRequest(operation, request, outgoingData)

            # NOTE(review): request.timeout is scaled by 1000, presumably
            # seconds to milliseconds - confirm. Zero/None-like results
            # leave the reply without a timeout.
            if hasattr(request, 'timeout'):
                timeout = request.timeout * 1000
                if timeout:
                    self._setReplyTimeout(reply, timeout)

            if har_entry is not None:
                har_entry["response"].update(har_qt.reply2har(reply))

            # Hook the reply's signals up to the handlers of this manager.
            reply.error.connect(self._handleError)
            reply.finished.connect(self._handleFinished)
            # http://doc.qt.io/qt-5/qnetworkreply.html#metaDataChanged
            reply.metaDataChanged.connect(self._handleMetaData)
            reply.downloadProgress.connect(self._handleDownloadProgress)

        return reply
Exemple #11
0
 def __init__(self, lua, reply):
     """
     Wrap ``reply`` and expose its headers, HAR info, commands and
     request data as values converted with ``lua.python2lua``.
     """
     self.lua = lua
     self.response = reply
     # according to specs HTTP response headers should not contain unicode
     # https://github.com/kennethreitz/requests/issues/1926#issuecomment-35524028
     plain_headers = dict(
         (str(name), str(value)) for name, value in reply.rawHeaderPairs()
     )
     self.headers = self.lua.python2lua(plain_headers)
     self.info = self.lua.python2lua(reply2har(reply))

     # The whitelist is the command names plus the class-level
     # _attribute_whitelist.
     commands = get_commands(self)
     self.commands = self.lua.python2lua(commands)
     command_names = list(commands.keys())
     self.attr_whitelist = command_names + self._attribute_whitelist
     self._exceptions = []

     self.request = self.lua.python2lua(
         request2har(reply.request(), reply.operation())
     )
Exemple #12
0
 def __init__(self, lua, reply, har_entry=None):
     """
     Expose the headers, response info and request of ``reply`` to Lua.

     Response info is ``har_entry['response']`` when a HAR entry is
     given; otherwise it is computed with ``reply2har(reply)``.
     """
     super(_ExposedResponse, self).__init__(lua)
     # according to specs HTTP response headers should not contain unicode
     # https://github.com/kennethreitz/requests/issues/1926#issuecomment-35524028
     plain_headers = dict(
         (str(name), str(value)) for name, value in reply.rawHeaderPairs()
     )
     self.headers = self.lua.python2lua(plain_headers)

     response_info = reply2har(reply) if har_entry is None else har_entry['response']
     self.info = self.lua.python2lua(response_info)

     self.request = self.lua.python2lua(request2har(reply.request(), reply.operation()))
Exemple #13
0
 def __init__(self, lua, reply):
     """
     Store ``lua`` and ``reply`` and build the Lua-side views of the
     reply: headers, HAR info, available commands and request data.
     """
     self.lua = lua
     self.response = reply

     # according to specs HTTP response headers should not contain unicode
     # https://github.com/kennethreitz/requests/issues/1926#issuecomment-35524028
     header_pairs = reply.rawHeaderPairs()
     str_headers = {str(key): str(val) for key, val in header_pairs}
     self.headers = self.lua.python2lua(str_headers)
     self.info = self.lua.python2lua(reply2har(reply))

     available_commands = get_commands(self)
     self.commands = self.lua.python2lua(available_commands)
     # Command names come first, then the class-level whitelist.
     self.attr_whitelist = list(available_commands.keys()) + self._attribute_whitelist
     self._exceptions = []
     self.request = self.lua.python2lua(request2har(reply.request(), reply.operation()))
Exemple #14
0
    def _handleMetaData(self):
        """
        Update the HAR entry (if any) once reply headers are available.

        An entry already marked as finished is left untouched.
        """
        reply = self.sender()
        har_entry = self._harEntry()
        if har_entry is not None:
            tmp = har_entry["_tmp"]
            if tmp["state"] == self.REQUEST_FINISHED:
                self.log("Headers received for {url}; ignoring", reply, min_level=3)
                return

            tmp["state"] = self.REQUEST_HEADERS_RECEIVED
            har_entry["response"].update(har_qt.reply2har(reply))

            # "wait" spans from request_sent_time to the current time.
            headers_at = datetime.utcnow()
            request_sent = tmp["request_sent_time"]
            tmp["response_start_time"] = headers_at
            har_entry["timings"]["wait"] = har.get_duration(request_sent, headers_at)

        self.log("Headers received for {url}", reply, min_level=3)
Exemple #15
0
    def store_reply_headers_received(self, req_id, reply):
        """
        Record header-time reply data in the log entry for ``req_id``.

        Nothing is changed when the log has no such entry or when the
        entry is already marked as finished.
        """
        if not self.log.has_entry(req_id):
            return

        entry = self.log.get_mutable_entry(req_id)
        if entry["_splash_processing_state"] == self.REQUEST_FINISHED:
            # The entry is already marked finished; leave it alone.
            return

        entry["_splash_processing_state"] = self.REQUEST_HEADERS_RECEIVED
        entry["response"].update(reply2har(reply))

        # "wait" is measured from request_sent_time to the current time.
        headers_at = datetime.utcnow()
        tmp = entry["_tmp"]
        request_sent = tmp["request_sent_time"]
        tmp["response_start_time"] = headers_at
        entry["timings"]["wait"] = get_duration(request_sent, headers_at)
Exemple #16
0
    def store_reply_headers_received(self, req_id, reply):
        """
        Update the stored reply information once HTTP headers arrive.

        Unknown request ids and entries already in the finished state
        are ignored.
        """
        log = self.log
        if not log.has_entry(req_id):
            return

        entry = log.get_mutable_entry(req_id)
        already_finished = entry["_splash_processing_state"] == self.REQUEST_FINISHED
        if already_finished:
            return

        entry["_splash_processing_state"] = self.REQUEST_HEADERS_RECEIVED
        entry["response"].update(reply2har(reply))

        now = datetime.utcnow()
        timestamps = entry["_tmp"]
        sent_at = timestamps["request_sent_time"]
        # The response is considered started at this moment.
        timestamps["response_start_time"] = now
        entry["timings"]["wait"] = get_duration(sent_at, now)
Exemple #17
0
    def _handleMetaData(self):
        """
        Process reply cookies and record header-time data in the HAR
        entry, unless that entry is already marked as finished.
        """
        reply = self.sender()
        self._handle_reply_cookies(reply)

        entry = self._harEntry()
        if entry is not None:
            state_info = entry["_tmp"]
            if state_info["state"] == self.REQUEST_FINISHED:
                self.log("Headers received for {url}; ignoring", reply, min_level=3)
                return

            state_info["state"] = self.REQUEST_HEADERS_RECEIVED
            entry["response"].update(har_qt.reply2har(reply))

            now = datetime.utcnow()
            sent_at = state_info["request_sent_time"]
            # The response is considered started at this moment.
            state_info["response_start_time"] = now
            entry["timings"]["wait"] = har.get_duration(sent_at, now)

        self.log("Headers received for {url}", reply, min_level=3)
Exemple #18
0
    def _handleMetaData(self):
        """Handle the signal emitted before reading response body, after
        getting headers.

        Reply cookies are processed first, then every registered
        ``on_response_headers`` callback is called with the reply; errors
        raised by callbacks are logged instead of propagated. Finally the
        HAR entry, when present and not finished, gets the header-time
        response data and the ``wait`` timing.
        """
        reply = self.sender()
        self._handle_reply_cookies(reply)

        callbacks = self._getWebPageAttribute(reply.request(), "callbacks")
        if callbacks and "on_response_headers" in callbacks:
            for header_callback in callbacks["on_response_headers"]:
                try:
                    header_callback(reply)
                except:
                    # TODO unhandled exceptions in lua callbacks
                    # should we raise errors here?
                    # https://github.com/scrapinghub/splash/issues/161
                    self.log("error in on_response_headers callback", min_level=1)
                    self.log(traceback.format_exc(), min_level=1)

        har_entry = self._harEntry()
        if har_entry is None:
            # No HAR entry to update: only log.
            self.log("Headers received for {url}", reply, min_level=3)
            return

        state_info = har_entry["_tmp"]
        if state_info["state"] == self.REQUEST_FINISHED:
            self.log("Headers received for {url}; ignoring", reply, min_level=3)
            return

        state_info["state"] = self.REQUEST_HEADERS_RECEIVED
        har_entry["response"].update(har_qt.reply2har(reply))

        now = datetime.utcnow()
        sent_at = state_info["request_sent_time"]
        state_info["response_start_time"] = now
        har_entry["timings"]["wait"] = har.get_duration(sent_at, now)

        self.log("Headers received for {url}", reply, min_level=3)
Exemple #19
0
    def createRequest(self, operation, request, outgoingData=None):
        """
        Create the reply for a new request and start its HAR record.

        Called when a new request is sent. Returns the reply object
        produced by the parent implementation, with this manager's
        handlers connected to its signals.
        """
        start_time = datetime.utcnow()

        request = self._wrapRequest(request)
        self._handle_custom_headers(request)
        self._handle_request_cookies(request)

        har_entry = self._harEntry(request, create=True)
        if har_entry is not None:
            # -1 is stored as the body size when there is no outgoing data.
            bodySize = -1 if outgoingData is None else outgoingData.size()

            # All bookkeeping timestamps start out equal to the creation time.
            tmp_data = {
                'start_time': start_time,
                'request_start_sending_time': start_time,
                'request_sent_time': start_time,
                'response_start_time': start_time,
                'state': self.REQUEST_CREATED,
            }
            started = har.format_datetime(start_time)
            request_data = {
                "method": OPERATION_NAMES.get(operation, '?'),
                "url": unicode(request.url().toString()),
                "httpVersion": "HTTP/1.1",
                "cookies": har_qt.request_cookies2har(request),
                "queryString": har_qt.querystring2har(request.url()),
                "headers": har_qt.headers2har(request),
                "headersSize": har_qt.headers_size(request),
                "bodySize": bodySize,
            }
            timings = {
                "blocked": -1,
                "dns": -1,
                "connect": -1,
                "ssl": -1,
                "send": 0,
                "wait": 0,
                "receive": 0,
            }
            har_entry.update({
                '_tmp': tmp_data,
                "startedDateTime": started,
                "request": request_data,
                "response": {"bodySize": -1},
                "cache": {},
                "timings": timings,
                "time": 0,
            })

        with self._proxyApplied(request):
            parent = super(ProxiedQNetworkAccessManager, self)
            reply = parent.createRequest(operation, request, outgoingData)
            if har_entry is not None:
                har_entry["response"].update(har_qt.reply2har(reply))

            # Route the reply's signals to this manager's handlers.
            reply.error.connect(self._handleError)
            reply.finished.connect(self._handleFinished)
            reply.metaDataChanged.connect(self._handleMetaData)
            reply.downloadProgress.connect(self._handleDownloadProgress)

        return reply
Exemple #20
0
 def callback(reply):
     # Convert the reply (content included, binary) to HAR data and
     # return it to Lua for cmd_id.
     har_response = reply2har(reply, binary_content=True, include_content=True)
     lua_response = self.lua.python2lua(har_response)
     self._return(cmd_id, lua_response)
Exemple #21
0
 def callback(reply):
     # Build HAR data for the reply with its content included as binary,
     # then pass the Lua-converted result back for cmd_id.
     har_options = dict(include_content=True, binary_content=True)
     reply_info = reply2har(reply, **har_options)
     self._return(cmd_id, self.lua.python2lua(reply_info))
Exemple #22
0
 def callback(reply):
     # Hand the reply's HAR data (content included, binary) to the
     # command as its result, converted for Lua.
     har_response = reply2har(reply, binary_content=True, include_content=True)
     lua_response = self.lua.python2lua(har_response)
     cmd.return_result(lua_response)
Exemple #23
0
    def createRequest(self, operation, request, outgoingData=None):
        """
        Called when a new request is sent; returns the reply to work with.

        When a HAR entry is available it is filled with the initial
        request data and placeholder timings before the parent class
        creates the reply, and updated with the reply's HAR data after.
        """
        start_time = datetime.utcnow()

        request = self._wrapRequest(request)
        entry = self._harEntry(request, create=True)
        if entry is not None:
            if outgoingData is not None:
                bodySize = outgoingData.size()
            else:
                # No outgoing data: -1 is recorded as the body size.
                bodySize = -1

            # Every bookkeeping timestamp starts at the creation time.
            bookkeeping = {
                'start_time': start_time,
                'request_start_sending_time': start_time,
                'request_sent_time': start_time,
                'response_start_time': start_time,
                'state': self.REQUEST_CREATED,
            }
            started_text = har.format_datetime(start_time)
            request_section = {
                "method": OPERATION_NAMES.get(operation, '?'),
                "url": unicode(request.url().toString()),
                "httpVersion": "HTTP/1.1",
                "cookies": har_qt.request_cookies2har(request),
                "queryString": har_qt.querystring2har(request.url()),
                "headers": har_qt.headers2har(request),
                "headersSize": har_qt.headers_size(request),
                "bodySize": bodySize,
            }
            entry.update({
                '_tmp': bookkeeping,
                "startedDateTime": started_text,
                "request": request_section,
                "response": {
                    "bodySize": -1,
                },
                "cache": {},
                "timings": {
                    "blocked": -1, "dns": -1, "connect": -1, "ssl": -1,
                    "send": 0, "wait": 0, "receive": 0,
                },
                "time": 0,
            })

        with self._proxyApplied(request):
            reply = super(ProxiedQNetworkAccessManager,
                          self).createRequest(operation, request, outgoingData)
            if entry is not None:
                entry["response"].update(har_qt.reply2har(reply))

            # Hook the reply's signals up to the handlers of this manager.
            reply.error.connect(self._handleError)
            reply.finished.connect(self._handleFinished)
            reply.metaDataChanged.connect(self._handleMetaData)
            reply.downloadProgress.connect(self._handleDownloadProgress)

        return reply