Example #1
0
 def _response_nonfinal(self, status: bytes, phrase: bytes,
                        res_headers: RawHeaderListType) -> None:
     "Got a non-final response."
     nfres = HttpResponse(self.add_note)
     nfres.process_top_line(self.exchange.res_version, status, phrase)
     nfres.process_raw_headers(res_headers)
     StatusChecker(nfres, self.request)
     self.nonfinal_responses.append(nfres)
Example #2
0
 def _response_nonfinal(self, status: bytes, phrase: bytes, 
                        res_headers: RawHeaderListType) -> None:
     "Got a non-final response."
     nfres = HttpResponse(self.add_note)
     nfres.process_top_line(self.exchange.res_version, status, phrase)
     nfres.process_raw_headers(res_headers)
     StatusChecker(nfres, self.request)
     self.nonfinal_responses.append(nfres)
Example #3
0
class RedFetcher(thor.events.EventEmitter):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and:
      - emits 'status' and 'debug' as it progresses
      - emits 'fetch_done' when the fetch is finished.

    If provided, 'name' indicates the type of the request, and is used to
    help set notes and status events appropriately.
    """

    check_name = "undefined"
    response_phrase = "undefined"
    client = RedHttpClient()
    client.idle_timeout = 5
    robot_emitter = thor.events.EventEmitter()

    def __init__(self, config: SectionProxy) -> None:
        thor.events.EventEmitter.__init__(self)
        self.config = config
        self.notes = []  # type: List[Note]
        self.transfer_in = 0
        self.transfer_out = 0
        self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
        self.nonfinal_responses = []  # type: List[HttpResponse]
        self.response = HttpResponse(self.add_note)  # type: HttpResponse
        self.exchange = None  # type: thor.http.ClientExchange
        self.fetch_started = False
        self.fetch_done = False

    def __getstate__(self) -> Dict[str, Any]:
        state = thor.events.EventEmitter.__getstate__(self)
        del state["exchange"]
        return state

    def __repr__(self) -> str:
        out = [self.__class__.__name__]
        if self.request.uri:
            out.append("%s" % self.request.uri)
        if self.fetch_started:
            out.append("fetch_started")
        if self.fetch_done:
            out.append("fetch_done")
        return "<%s at %#x>" % (", ".join(out), id(self))

    def add_note(self, subject: str, note: Type[Note],
                 **kw: Union[str, int]) -> None:
        "Set a note."
        if "response" not in kw:
            kw["response"] = self.response_phrase
        self.notes.append(note(subject, kw))

    def ignore_note(self, subject: str, note: Type[Note], **kw: str) -> None:
        "Ignore a note (for requests)."
        return

    def preflight(self) -> bool:
        """
        Check to see if we should bother running. Return True
        if so; False if not. Can be overridden.
        """
        return True

    def set_request(
        self,
        iri: str,
        method: str = "GET",
        req_hdrs: StrHeaderListType = None,
        req_body: bytes = None,
    ) -> None:
        """
        Set the resource's request. All values are strings.
        """
        self.request.method = method
        self.response.is_head_response = method == "HEAD"  # type: ignore
        try:
            self.request.set_iri(iri)
        except httperr.UrlError as why:
            self.response.http_error = why
        self.response.base_uri = self.request.uri  # type: ignore
        if req_hdrs:
            self.request.set_headers(req_hdrs)
        self.request.payload = req_body  # type: ignore    # FIXME: encoding
        self.request.complete = True  # cheating a bit

    def check(self) -> None:
        """
        Make an asynchronous HTTP request to uri, emitting 'status' as it's
        updated and 'fetch_done' when it's done. Reason is used to explain what the
        request is in the status callback.
        """
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self._fetch_done()
            return
        self.run_continue(True)

    def run_continue(self, allowed: bool) -> None:
        """
        Continue after getting the robots file.
        """
        if not allowed:
            self.response.http_error = RobotsTxtError()
            self._fetch_done()
            return

        self.fetch_started = True

        if "user-agent" not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(("User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on("response_nonfinal", self._response_nonfinal)
        self.exchange.once("response_start", self._response_start)
        self.exchange.on("response_body", self._response_body)
        self.exchange.once("response_done", self._response_done)
        self.exchange.on("error", self._response_error)
        self.emit("status",
                  "fetching %s (%s)" % (self.request.uri, self.check_name))
        self.emit("debug",
                  "fetching %s (%s)" % (self.request.uri, self.check_name))
        req_hdrs = [
            (k.encode("ascii", "replace"), v.encode("ascii", "replace"))
            for (k, v) in self.request.headers
        ]  # FIXME: should complain
        self.exchange.request_start(
            self.request.method.encode("ascii"),
            self.request.uri.encode("ascii"),
            req_hdrs,
        )
        self.request.start_time = thor.time()
        if not self.fetch_done:  # the request could have immediately failed.
            if self.request.payload is not None:
                self.exchange.request_body(self.request.payload)
                self.transfer_out += len(self.request.payload)
        if not self.fetch_done:  # the request could have immediately failed.
            self.exchange.request_done([])

    def _response_nonfinal(self, status: bytes, phrase: bytes,
                           res_headers: RawHeaderListType) -> None:
        "Got a non-final response."
        nfres = HttpResponse(self.add_note)
        nfres.process_top_line(self.exchange.res_version, status, phrase)
        nfres.process_raw_headers(res_headers)
        StatusChecker(nfres, self.request)
        self.nonfinal_responses.append(nfres)

    def _response_start(self, status: bytes, phrase: bytes,
                        res_headers: RawHeaderListType) -> None:
        "Process the response start-line and headers."
        self.response.start_time = thor.time()
        self.response.process_top_line(self.exchange.res_version, status,
                                       phrase)
        self.response.process_raw_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk: bytes) -> None:
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers: List[Tuple[bytes, bytes]]) -> None:
        "Finish analysing the response, handling any parse errors."
        self.emit("debug",
                  "fetched %s (%s)" % (self.request.uri, self.check_name))
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        self._fetch_done()

    def _response_error(self, error: httperr.HttpError) -> None:
        "Handle an error encountered while fetching the response."
        self.emit(
            "debug",
            "fetch error %s (%s) - %s" %
            (self.request.uri, self.check_name, error.desc),
        )
        err_sample = error.detail[:40] or ""
        if isinstance(error, httperr.ExtraDataError):
            if self.response.status_code == "304":
                self.add_note("body", BODY_NOT_ALLOWED, sample=err_sample)
            else:
                self.add_note("body", EXTRA_DATA, sample=err_sample)
        elif isinstance(error, httperr.ChunkError):
            self.add_note("header-transfer-encoding",
                          BAD_CHUNK,
                          chunk_sample=err_sample)
        elif isinstance(error, httperr.HeaderSpaceError):
            subject = "header-%s" % (error.detail.lower().strip())
            self.add_note(subject, HEADER_NAME_SPACE, header_name=error.detail)
        else:
            self.response.http_error = error
        self._fetch_done()

    def _fetch_done(self) -> None:
        if not self.fetch_done:
            self.fetch_done = True
            self.exchange = None
            self.emit("fetch_done")
Example #4
0
class RedFetcher(thor.events.EventEmitter):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and:
      - emits 'status' as it progresses
      - emits 'fetch_done' when the fetch is finished.

    If provided, 'name' indicates the type of the request, and is used to
    help set notes and status events appropriately.
    """
    check_name = "undefined"
    response_phrase = "undefined"
    client = RedHttpClient()
    robot_fetcher = RobotFetcher()

    def __init__(self) -> None:
        thor.events.EventEmitter.__init__(self)
        self.notes = [] # type: List[Note]
        self.transfer_in = 0
        self.transfer_out = 0
        self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
        self.nonfinal_responses = []                  # type: List[HttpResponse]
        self.response = HttpResponse(self.add_note)   # type: HttpResponse
        self.exchange = None                          # type: thor.http.ClientExchange
        self.follow_robots_txt = True # Should we pay attention to robots file?
        self.fetch_started = False
        self.fetch_done = False

    def __getstate__(self) -> Dict[str, Any]:
        state = thor.events.EventEmitter.__getstate__(self)
        del state['exchange']
        return state

    def __repr__(self) -> str:
        out = [self.__class__.__name__]
        if self.request.uri:
            out.append("%s" % self.request.uri)
        if self.fetch_started:
            out.append("fetch_started")
        if self.fetch_done:
            out.append("fetch_done")
        return "<%s at %#x>" % (", ".join(out), id(self))

    def add_note(self, subject: str, note: Type[Note], **kw: Union[str, int]) -> None:
        "Set a note."
        if 'response' not in kw:
            kw['response'] = self.response_phrase
        self.notes.append(note(subject, kw))

    def ignore_note(self, subject: str, note: Type[Note], **kw: str) -> None:
        "Ignore a note (for requests)."
        return

    def preflight(self) -> bool:
        """
        Check to see if we should bother running. Return True
        if so; False if not. Can be overridden.
        """
        return True

    def set_request(self, iri: str, method: str="GET",
                    req_hdrs: StrHeaderListType=None, req_body: bytes=None) -> None:
        """
        Set the resource's request. All values are strings.
        """
        self.request.method = method
        self.response.is_head_response = (method == "HEAD")   # type: ignore
        self.request.set_iri(iri)
        self.response.base_uri = self.request.uri             # type: ignore
        if req_hdrs:
            self.request.set_headers(req_hdrs)
        self.request.payload = req_body # type: ignore    # FIXME: encoding
        self.request.complete = True  # cheating a bit

    def check(self) -> None:
        """
        Make an asynchronous HTTP request to uri, emitting 'status' as it's
        updated and 'fetch_done' when it's done. Reason is used to explain what the
        request is in the status callback.
        """
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self._fetch_done()
            return

        if self.follow_robots_txt:
            self.robot_fetcher.once("robot-%s" % self.request.uri, self.run_continue)
            self.robot_fetcher.check_robots(self.request.uri)
        else:
            self.run_continue(True)

    def run_continue(self, allowed: bool) -> None:
        """
        Continue after getting the robots file.
        """
        if not allowed:
            self.response.http_error = RobotsTxtError()
            self._fetch_done()
            return

        self.fetch_started = True

        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(("User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_nonfinal', self._response_nonfinal)
        self.exchange.once('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.once('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        self.emit("status", "fetching %s (%s)" % (self.request.uri, self.check_name))
        req_hdrs = [(k.encode('ascii'), v.encode('ascii')) for (k, v) in self.request.headers]
        self.exchange.request_start(
            self.request.method.encode('ascii'), self.request.uri.encode('ascii'), req_hdrs)
        self.request.start_time = thor.time()
        if self.request.payload != None:
            self.exchange.request_body(self.request.payload)
            self.transfer_out += len(self.request.payload)
        self.exchange.request_done([])

    def _response_nonfinal(self, status: bytes, phrase: bytes, 
                           res_headers: RawHeaderListType) -> None:
        "Got a non-final response."
        nfres = HttpResponse(self.add_note)
        nfres.process_top_line(self.exchange.res_version, status, phrase)
        nfres.process_raw_headers(res_headers)
        StatusChecker(nfres, self.request)
        self.nonfinal_responses.append(nfres)

    def _response_start(self, status: bytes, phrase: bytes,
                        res_headers: RawHeaderListType) -> None:
        "Process the response start-line and headers."
        self.response.start_time = thor.time()
        self.response.process_top_line(self.exchange.res_version, status, phrase)
        self.response.process_raw_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk: bytes) -> None:
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers: List[Tuple[bytes, bytes]]) -> None:
        "Finish analysing the response, handling any parse errors."
        self.emit("status", "fetched %s (%s)" % (self.request.uri, self.check_name))
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        self._fetch_done()

    def _response_error(self, error: httperr.HttpError) -> None:
        "Handle an error encountered while fetching the response."
        self.emit("status", "fetch error %s (%s) - %s" % (
            self.request.uri, self.check_name, error.desc))
        err_sample = error.detail[:40] or ""
        if error.client_recoverable:
            pass # we'll get to this later.
        elif isinstance(error, httperr.ExtraDataError):
            if self.response.status_code == "304":
                self.add_note('body', BODY_NOT_ALLOWED, sample=err_sample)
            else:
                self.add_note('body', EXTRA_DATA, sample=err_sample)
        elif isinstance(error, httperr.ChunkError):
            self.add_note('header-transfer-encoding', BAD_CHUNK, chunk_sample=err_sample)
        else:
            self.response.http_error = error
        self._fetch_done()

    def _fetch_done(self) -> None:
        if not self.fetch_done:
            self.fetch_done = True
            self.emit("fetch_done")