Example #1
0
 def _response_nonfinal(
     self, status: bytes, phrase: bytes, res_headers: RawHeaderListType
 ) -> None:
     """
     Handle a non-final response.

     Builds a separate HttpResponse from the status line and raw headers,
     runs StatusChecker on it, and stores it in self.nonfinal_responses.
     """
     interim = HttpResponse(self.add_note)
     http_version = self.exchange.res_version
     interim.process_top_line(http_version, status, phrase)
     interim.process_raw_headers(res_headers)
     StatusChecker(interim, self.request)
     self.nonfinal_responses.append(interim)
Example #2
0
 def __init__(self, config: SectionProxy) -> None:
     """Store ``config`` and set up empty fetch state; no exchange exists yet."""
     thor.events.EventEmitter.__init__(self)
     self.config = config
     self.notes = []  # type: List[Note]
     # Transfer counters, both starting at zero.
     self.transfer_in = self.transfer_out = 0
     # The request is given ignore_note as its note callback, the response
     # add_note.
     self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
     self.response = HttpResponse(self.add_note)  # type: HttpResponse
     self.nonfinal_responses = []  # type: List[HttpResponse]
     # No exchange yet.
     self.exchange = None  # type: thor.http.ClientExchange
     self.fetch_started = self.fetch_done = False
Example #3
0
 def __init__(self) -> None:
     """Set up empty fetch state; nothing has been requested or received yet."""
     thor.events.EventEmitter.__init__(self)
     self.notes = []  # type: List[Note]
     # Transfer counters, both starting at zero.
     self.transfer_in = self.transfer_out = 0
     # The request is given ignore_note as its note callback, the response
     # add_note.
     self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
     self.response = HttpResponse(self.add_note)  # type: HttpResponse
     self.nonfinal_responses = []  # type: List[HttpResponse]
     # No exchange yet.
     self.exchange = None  # type: thor.http.ClientExchange
     # Should we pay attention to robots file?
     self.follow_robots_txt = True
     self.fetch_started = self.fetch_done = False
Example #4
0
 def __init__(self) -> None:
     """Set up empty fetch state; nothing has been requested or received yet."""
     thor.events.EventEmitter.__init__(self)
     self.notes = []  # type: List[Note]
     # Transfer counters, both starting at zero.
     self.transfer_in = self.transfer_out = 0
     # The request is given ignore_note as its note callback, the response
     # add_note.
     self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
     self.response = HttpResponse(self.add_note)  # type: HttpResponse
     self.nonfinal_responses = []  # type: List[HttpResponse]
     # No exchange yet.
     self.exchange = None  # type: thor.http.ClientExchange
     # Should we pay attention to robots file?
     self.follow_robots_txt = True
     self.fetch_started = self.fetch_done = False
Example #5
0
 def _response_nonfinal(
     self, status: bytes, phrase: bytes, res_headers: RawHeaderListType
 ) -> None:
     """
     Handle a non-final response.

     Builds a separate HttpResponse from the status line and raw headers,
     runs StatusChecker on it, and stores it in self.nonfinal_responses.
     """
     interim = HttpResponse(self.add_note)
     http_version = self.exchange.res_version
     interim.process_top_line(http_version, status, phrase)
     interim.process_raw_headers(res_headers)
     StatusChecker(interim, self.request)
     self.nonfinal_responses.append(interim)
Example #6
0
 def __init__(self, iri, method="GET", req_hdrs=None, req_body=None,
              status_cb=None, body_procs=None, name=None):
     """
     Build the request for ``iri`` and an empty response to go with it.

     ``req_hdrs`` and ``body_procs`` fall back to empty lists when not
     supplied (or empty); ``status_cb`` is stored for later use.
     """
     RedState.__init__(self, name)

     # Request side.
     self.request = request = HttpRequest(self.notes, self.name)
     request.method = method
     request.set_iri(iri)
     request.headers = req_hdrs if req_hdrs else []
     request.payload = req_body

     # Response side; its base URI is whatever set_iri() left on the request.
     self.response = response = HttpResponse(self.notes, self.name)
     response.is_head_response = method == "HEAD"
     response.base_uri = request.uri
     response.set_decoded_procs(body_procs if body_procs else [])

     # Bookkeeping: no exchange or completion callback yet.
     self.exchange = None
     self.status_cb = status_cb
     self.done_cb = None
     self.outstanding_tasks = 0
     self._st = []  # FIXME: this is temporary, for debugging thor
Example #7
0
class RedFetcher(RedState):
    """
    Fetches the given URI (with the provided method, headers and body) and
    calls:
      - status_cb as it progresses, and
      - every function in the body_procs list with each chunk of the body, and
      - done_cb when all tasks are done.
    If provided, name indicates the type of the request, and is used to
    help set notes and status_cb appropriately.

    The done() method is called when the response is done, NOT when all
    tasks are done. It can add tasks by calling add_task().

    """
    client = RedHttpClient()

    def __init__(self, iri, method="GET", req_hdrs=None, req_body=None,
                 status_cb=None, body_procs=None, name=None):
        """
        Build the request for iri and an empty response to go with it.

        req_hdrs is a list of (name, value) pairs, req_body the request
        payload (None for no body), status_cb a callable taking one status
        string, and body_procs a list handed to the response's
        set_decoded_procs().
        """
        RedState.__init__(self, name)
        self.request = HttpRequest(self.notes, self.name)
        self.request.method = method
        self.request.set_iri(iri)
        self.request.headers = req_hdrs or []
        self.request.payload = req_body
        self.response = HttpResponse(self.notes, self.name)
        self.response.is_head_response = (method == "HEAD")
        self.response.base_uri = self.request.uri
        self.response.set_decoded_procs(body_procs or [])
        self.exchange = None
        self.status_cb = status_cb
        self.done_cb = None
        self.outstanding_tasks = 0
        self._st = [] # FIXME: this is temporary, for debugging thor

    def __getstate__(self):
        "Return a copy of the instance dict without the exchange and callbacks."
        state = self.__dict__.copy()
        del state['exchange']
        del state['status_cb']
        del state['done_cb']
        return state

    def add_task(self, task, *args):
        """
        Remember that we've started a task, then start it.

        task is called with *args plus done_cb=self.finish_task.
        """
        self.outstanding_tasks += 1
        self._st.append('add_task(%s)' % str(task))
        task(*args, done_cb=self.finish_task)

    def finish_task(self):
        "Note that we've finished a task, and see if we're done."
        self.outstanding_tasks -= 1
        self._st.append('finish_task()')
        assert self.outstanding_tasks >= 0, self._st
        if self.outstanding_tasks == 0:
            if self.done_cb:
                self.done_cb()
                self.done_cb = None
            # clean up potentially cyclic references
            self.status_cb = None

    def done(self):
        "Callback for when the response is complete and analysed."
        raise NotImplementedError

    def preflight(self):
        """
        Callback to check to see if we should bother running. Return True
        if so; False if not.
        """
        return True

    def run(self, done_cb=None):
        """
        Make an asynchronous HTTP request to the request's URI, calling
        status_cb as it's updated and done_cb when all tasks are done. The
        fetcher's name is used to explain what the request is in the status
        callback.
        """
        self.outstanding_tasks += 1
        self._st.append('run(%s)' % str(done_cb))
        self.done_cb = done_cb
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self.finish_task()
            return
        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(
                (u"User-Agent", u"RED/%s (http://redbot.org/)" % __version__))
        self.exchange = self.client.exchange()
        self.exchange.on('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.on('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        if self.status_cb and self.name:
            self.status_cb("fetching %s (%s)" % (
                self.request.uri, self.name
            ))
        # Header names are encoded as ASCII and values as Latin-1, with
        # unencodable characters replaced.
        req_hdrs = [
            (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace'))
            for (k, v) in self.request.headers
        ]
        self.exchange.request_start(
            self.request.method, self.request.uri, req_hdrs
        )
        self.request.start_time = thor.time()
        if self.request.payload is not None:
            self.exchange.request_body(self.request.payload)
        self.exchange.request_done([])

    def _response_start(self, status, phrase, res_headers):
        "Process the response start-line and headers."
        self._st.append('_response_start(%s, %s)' % (status, phrase))
        self.response.start_time = thor.time()
        self.response.version = self.exchange.res_version
        self.response.status_code = status.decode('iso-8859-1', 'replace')
        self.response.status_phrase = phrase.decode('iso-8859-1', 'replace')
        self.response.set_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk):
        "Process a chunk of the response body."
        self.response.feed_body(chunk)

    def _response_done(self, trailers):
        "Finish analysing the response, handling any parse errors."
        self._st.append('_response_done()')
        self.response.complete_time = thor.time()
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        if self.status_cb and self.name:
            self.status_cb("fetched %s (%s)" % (
                self.request.uri, self.name
            ))
        self.done()
        self.finish_task()

    def _response_error(self, error):
        """
        Handle an error encountered while fetching the response.

        The error is stored on the response; some error types also add a
        note. done() and finish_task() are then called as for a normal
        completion.
        """
        self._st.append('_response_error(%s)' % (str(error)))
        self.response.complete_time = thor.time()
        self.response.http_error = error
        if isinstance(error, httperr.BodyForbiddenError):
            self.add_note('header-none', rs.BODY_NOT_ALLOWED)
#        elif isinstance(error, httperr.ExtraDataErr):
#            res.payload_len += len(err.get('detail', ''))
        elif isinstance(error, httperr.ChunkError):
            # Apply the empty fallback before slicing, so a missing (None)
            # detail yields an empty sample instead of raising TypeError.
            err_msg = (error.detail or "")[:20]
            self.add_note('header-transfer-encoding', rs.BAD_CHUNK,
                chunk_sample=err_msg.encode('string_escape')
            )
        self.done()
        self.finish_task()
Example #8
0
class RedFetcher(thor.events.EventEmitter):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and:
      - emits 'status' and 'debug' as it progresses
      - emits 'fetch_done' when the fetch is finished.

    'check_name' labels the request in status and debug messages;
    'response_phrase' is the default 'response' variable passed to notes.
    """

    check_name = "undefined"
    response_phrase = "undefined"
    client = RedHttpClient()
    client.idle_timeout = 5
    robot_emitter = thor.events.EventEmitter()

    def __init__(self, config: SectionProxy) -> None:
        "Store config and set up empty fetch state; no exchange exists yet."
        thor.events.EventEmitter.__init__(self)
        self.config = config
        self.notes = []  # type: List[Note]
        self.transfer_in = 0
        self.transfer_out = 0
        self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
        self.nonfinal_responses = []  # type: List[HttpResponse]
        self.response = HttpResponse(self.add_note)  # type: HttpResponse
        self.exchange = None  # type: thor.http.ClientExchange
        self.fetch_started = False
        self.fetch_done = False

    def __getstate__(self) -> Dict[str, Any]:
        "Return the EventEmitter state with the 'exchange' entry removed."
        state = thor.events.EventEmitter.__getstate__(self)
        del state["exchange"]
        return state

    def __repr__(self) -> str:
        "Show the class name, the request URI (if set) and the fetch flags."
        out = [self.__class__.__name__]
        if self.request.uri:
            out.append("%s" % self.request.uri)
        if self.fetch_started:
            out.append("fetch_started")
        if self.fetch_done:
            out.append("fetch_done")
        return "<%s at %#x>" % (", ".join(out), id(self))

    def add_note(self, subject: str, note: Type[Note],
                 **kw: Union[str, int]) -> None:
        """
        Set a note.

        The 'response' variable defaults to self.response_phrase when the
        caller does not supply one.
        """
        if "response" not in kw:
            kw["response"] = self.response_phrase
        self.notes.append(note(subject, kw))

    def ignore_note(self, subject: str, note: Type[Note], **kw: str) -> None:
        "Ignore a note (for requests)."
        return

    def preflight(self) -> bool:
        """
        Check to see if we should bother running. Return True
        if so; False if not. Can be overridden.
        """
        return True

    def set_request(
        self,
        iri: str,
        method: str = "GET",
        req_hdrs: StrHeaderListType = None,
        req_body: bytes = None,
    ) -> None:
        """
        Set the resource's request.

        A httperr.UrlError raised while setting the IRI is not propagated;
        it is stored as the response's http_error instead.
        """
        self.request.method = method
        self.response.is_head_response = method == "HEAD"  # type: ignore
        try:
            self.request.set_iri(iri)
        except httperr.UrlError as why:
            self.response.http_error = why
        self.response.base_uri = self.request.uri  # type: ignore
        if req_hdrs:
            self.request.set_headers(req_hdrs)
        self.request.payload = req_body  # type: ignore    # FIXME: encoding
        self.request.complete = True  # cheating a bit

    def check(self) -> None:
        """
        Make an asynchronous HTTP request to the request's URI, emitting
        'status' as it's updated and 'fetch_done' when it's done.

        If preflight() declines, or no URI is set, 'fetch_done' is emitted
        without any request being made.
        """
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self._fetch_done()
            return
        self.run_continue(True)

    def run_continue(self, allowed: bool) -> None:
        """
        Continue after getting the robots file.

        When allowed is False, a RobotsTxtError is recorded on the response
        and the fetch finishes without a request being made.
        """
        if not allowed:
            self.response.http_error = RobotsTxtError()
            self._fetch_done()
            return

        self.fetch_started = True

        if "user-agent" not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(("User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on("response_nonfinal", self._response_nonfinal)
        self.exchange.once("response_start", self._response_start)
        self.exchange.on("response_body", self._response_body)
        self.exchange.once("response_done", self._response_done)
        self.exchange.on("error", self._response_error)
        self.emit("status",
                  "fetching %s (%s)" % (self.request.uri, self.check_name))
        self.emit("debug",
                  "fetching %s (%s)" % (self.request.uri, self.check_name))
        req_hdrs = [
            (k.encode("ascii", "replace"), v.encode("ascii", "replace"))
            for (k, v) in self.request.headers
        ]  # FIXME: should complain
        self.exchange.request_start(
            self.request.method.encode("ascii"),
            self.request.uri.encode("ascii"),
            req_hdrs,
        )
        self.request.start_time = thor.time()
        if not self.fetch_done:  # the request could have immediately failed.
            if self.request.payload is not None:
                self.exchange.request_body(self.request.payload)
                self.transfer_out += len(self.request.payload)
        # Checked again: sending the body may itself have ended the fetch.
        if not self.fetch_done:  # the request could have immediately failed.
            self.exchange.request_done([])

    def _response_nonfinal(self, status: bytes, phrase: bytes,
                           res_headers: RawHeaderListType) -> None:
        "Got a non-final response; keep it in self.nonfinal_responses."
        nfres = HttpResponse(self.add_note)
        nfres.process_top_line(self.exchange.res_version, status, phrase)
        nfres.process_raw_headers(res_headers)
        StatusChecker(nfres, self.request)
        self.nonfinal_responses.append(nfres)

    def _response_start(self, status: bytes, phrase: bytes,
                        res_headers: RawHeaderListType) -> None:
        "Process the response start-line and headers."
        self.response.start_time = thor.time()
        self.response.process_top_line(self.exchange.res_version, status,
                                       phrase)
        self.response.process_raw_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk: bytes) -> None:
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers: List[Tuple[bytes, bytes]]) -> None:
        "Finish analysing the response, handling any parse errors."
        self.emit("debug",
                  "fetched %s (%s)" % (self.request.uri, self.check_name))
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        self._fetch_done()

    def _response_error(self, error: httperr.HttpError) -> None:
        """
        Handle an error encountered while fetching the response.

        Extra-data, chunk and header-space errors become notes; any other
        error is stored as the response's http_error. The fetch is finished
        in every case.
        """
        self.emit(
            "debug",
            "fetch error %s (%s) - %s" %
            (self.request.uri, self.check_name, error.desc),
        )
        # Apply the empty fallback before slicing, so an error without a
        # detail (None) does not raise TypeError and leave the fetch
        # unfinished.
        detail = error.detail or ""
        err_sample = detail[:40]
        if isinstance(error, httperr.ExtraDataError):
            if self.response.status_code == "304":
                self.add_note("body", BODY_NOT_ALLOWED, sample=err_sample)
            else:
                self.add_note("body", EXTRA_DATA, sample=err_sample)
        elif isinstance(error, httperr.ChunkError):
            self.add_note("header-transfer-encoding",
                          BAD_CHUNK,
                          chunk_sample=err_sample)
        elif isinstance(error, httperr.HeaderSpaceError):
            subject = "header-%s" % (detail.lower().strip())
            self.add_note(subject, HEADER_NAME_SPACE, header_name=detail)
        else:
            self.response.http_error = error
        self._fetch_done()

    def _fetch_done(self) -> None:
        "Mark the fetch finished and emit 'fetch_done', at most once."
        if not self.fetch_done:
            self.fetch_done = True
            self.exchange = None
            self.emit("fetch_done")
Example #9
0
def checkCaching(response: HttpResponse, request: HttpRequest = None) -> None:
    """
    Examine HTTP caching characteristics.

    Reads the parsed headers of `response` (and, when given, the method and
    headers of `request`) and records findings with add_note(). Along the
    way it sets response.store_shared, response.store_private, response.age
    and response.freshness_lifetime (those reached before an early return).

    Returns early -- skipping the remaining checks -- when the request method
    is not in cacheable_methods, or when the response carries no-store,
    no-cache, or a Vary of "*".
    """

    # get header values
    lm_hdr = response.parsed_headers.get("last-modified", None)
    date_hdr = response.parsed_headers.get("date", None)
    expires_hdr = response.parsed_headers.get("expires", None)
    etag_hdr = response.parsed_headers.get("etag", None)
    age_hdr = response.parsed_headers.get("age", None)
    # cache-control is consumed as (directive, value) pairs: cc_list keeps
    # duplicate directive names, cc_dict / cc_keys collapse them.
    cc_set = response.parsed_headers.get("cache-control", [])
    cc_list = [k for (k, v) in cc_set]
    cc_dict = dict(cc_set)
    cc_keys = list(cc_dict.keys())

    # Last-Modified
    if lm_hdr:
        # Compare against the Date header when present, otherwise against
        # the time the response started.
        serv_date = date_hdr or response.start_time
        if lm_hdr > serv_date:
            response.add_note("header-last-modified", LM_FUTURE)
        else:
            response.add_note(
                "header-last-modified",
                LM_PRESENT,
                last_modified_string=relative_time(lm_hdr, serv_date),
            )

    # known Cache-Control directives that don't allow duplicates
    known_cc = [
        "max-age",
        "no-store",
        "s-maxage",
        "public",
        "private",
        "pre-check",
        "post-check",
        "stale-while-revalidate",
        "stale-if-error",
    ]

    # check for mis-capitalised directives /
    # assure there aren't any dup directives with different values
    for cc in cc_keys:
        if cc.lower() in known_cc and cc != cc.lower():
            response.add_note("header-cache-control",
                              CC_MISCAP,
                              cc_lower=cc.lower(),
                              cc=cc)
        if cc in known_cc and cc_list.count(cc) > 1:
            response.add_note("header-cache-control", CC_DUP, cc=cc)

    # Who can store this?
    if request and request.method not in cacheable_methods:
        response.store_shared = response.store_private = False
        # NOTE: this note is added to the request, not the response.
        request.add_note("method", METHOD_UNCACHEABLE, method=request.method)
        return  # bail; nothing else to see here
    if "no-store" in cc_keys:
        response.store_shared = response.store_private = False
        response.add_note("header-cache-control", NO_STORE)
        return  # bail; nothing else to see here
    if "private" in cc_keys:
        response.store_shared = False
        response.store_private = True
        response.add_note("header-cache-control", PRIVATE_CC)
    # A request carrying Authorization, with no "public" directive on the
    # response, is treated the same way as "private".
    elif (request
          and "authorization" in [k.lower() for k, v in request.headers]
          and "public" not in cc_keys):
        response.store_shared = False
        response.store_private = True
        response.add_note("header-cache-control", PRIVATE_AUTH)
    else:
        response.store_shared = response.store_private = True
        response.add_note("header-cache-control", STOREABLE)

    # no-cache?
    if "no-cache" in cc_keys:
        if lm_hdr is None and etag_hdr is None:
            response.add_note("header-cache-control", NO_CACHE_NO_VALIDATOR)
        else:
            response.add_note("header-cache-control", NO_CACHE)
        # bail; vary, age and freshness are not examined for no-cache
        return

    # pre-check / post-check
    if "pre-check" in cc_keys or "post-check" in cc_keys:
        # Only one of the pair is present.
        if "pre-check" not in cc_keys or "post-check" not in cc_keys:
            response.add_note("header-cache-control", CHECK_SINGLE)
        else:
            pre_check = post_check = None
            try:
                pre_check = int(cc_dict["pre-check"])
                post_check = int(cc_dict["post-check"])
            except ValueError:
                response.add_note("header-cache-control", CHECK_NOT_INTEGER)
            # Both stay/revert to incomplete if either conversion failed, in
            # which case the comparisons below are skipped.
            if pre_check is not None and post_check is not None:
                if pre_check == 0 and post_check == 0:
                    response.add_note("header-cache-control", CHECK_ALL_ZERO)
                elif post_check > pre_check:
                    response.add_note("header-cache-control",
                                      CHECK_POST_BIGGER)
                    # clamped value is not read again in this function
                    post_check = pre_check
                elif post_check == 0:
                    response.add_note("header-cache-control", CHECK_POST_ZERO)
                else:
                    response.add_note(
                        "header-cache-control",
                        CHECK_POST_PRE,
                        pre_check=pre_check,
                        post_check=post_check,
                    )

    # vary?
    vary = response.parsed_headers.get("vary", set())
    if "*" in vary:
        response.add_note("header-vary", VARY_ASTERISK)
        return  # bail; nothing else to see here
    # More than three Vary entries gets a single "complex" note; otherwise
    # user-agent and host are called out individually.
    if len(vary) > 3:
        response.add_note("header-vary",
                          VARY_COMPLEX,
                          vary_count=f_num(len(vary)))
    else:
        if "user-agent" in vary:
            response.add_note("header-vary", VARY_USER_AGENT)
        if "host" in vary:
            response.add_note("header-vary", VARY_HOST)

    # calculate age
    response.age = age_hdr or 0
    age_str = relative_time(response.age, 0, 0)
    # apparent age: time elapsed between the Date header and the start of
    # the response, never negative.
    if date_hdr and date_hdr > 0:
        apparent_age = max(0, int(response.start_time - date_hdr))
    else:
        apparent_age = 0
    current_age = max(apparent_age, response.age)
    current_age_str = relative_time(current_age, 0, 0)
    if response.age >= 1:
        response.add_note("header-age header-date", CURRENT_AGE, age=age_str)

    # Check for clock skew and dateless origin server.
    if not date_hdr:
        response.add_note("", DATE_CLOCKLESS)
        if expires_hdr or lm_hdr:
            response.add_note("header-expires header-last-modified",
                              DATE_CLOCKLESS_BAD_HDR)
    else:
        # skew: Date header relative to the response start, plus the Age.
        skew = date_hdr - response.start_time + (response.age)
        if response.age > max_clock_skew and (current_age -
                                              skew) < max_clock_skew:
            response.add_note("header-date header-age", AGE_PENALTY)
        elif abs(skew) > max_clock_skew:
            response.add_note(
                "header-date",
                DATE_INCORRECT,
                clock_skew_string=relative_time(skew, 0, 2),
            )
        else:
            response.add_note("header-date", DATE_CORRECT)

    # calculate freshness
    # Precedence: s-maxage, then max-age, then the Expires header.
    freshness_lifetime = 0
    has_explicit_freshness = False
    has_cc_freshness = False
    freshness_hdrs = ["header-date"]
    if "s-maxage" in cc_keys:
        freshness_lifetime = cc_dict["s-maxage"]
        freshness_hdrs.append("header-cache-control")
        has_explicit_freshness = True
        has_cc_freshness = True
    elif "max-age" in cc_keys:
        freshness_lifetime = cc_dict["max-age"]
        freshness_hdrs.append("header-cache-control")
        has_explicit_freshness = True
        has_cc_freshness = True
    elif "expires" in response.parsed_headers:
        # An invalid Expires header means it's automatically stale
        has_explicit_freshness = True
        freshness_hdrs.append("header-expires")
        freshness_lifetime = (expires_hdr or 0) - (date_hdr
                                                   or int(response.start_time))

    freshness_left = freshness_lifetime - current_age
    freshness_left_str = relative_time(abs(int(freshness_left)), 0, 0)
    freshness_lifetime_str = relative_time(int(freshness_lifetime), 0, 0)

    response.freshness_lifetime = freshness_lifetime
    fresh = freshness_left > 0
    if has_explicit_freshness:
        if fresh:
            response.add_note(
                " ".join(freshness_hdrs),
                FRESHNESS_FRESH,
                freshness_lifetime=freshness_lifetime_str,
                freshness_left=freshness_left_str,
                current_age=current_age_str,
            )
        # Stale, with a Cache-Control lifetime that the Age header alone
        # already exceeds.
        elif has_cc_freshness and response.age > freshness_lifetime:
            response.add_note(
                " ".join(freshness_hdrs),
                FRESHNESS_STALE_CACHE,
                freshness_lifetime=freshness_lifetime_str,
                freshness_left=freshness_left_str,
                current_age=current_age_str,
            )
        else:
            response.add_note(
                " ".join(freshness_hdrs),
                FRESHNESS_STALE_ALREADY,
                freshness_lifetime=freshness_lifetime_str,
                freshness_left=freshness_left_str,
                current_age=current_age_str,
            )

    # can heuristic freshness be used?
    elif response.status_code in heuristic_cacheable_status:
        response.add_note("header-last-modified", FRESHNESS_HEURISTIC)
    else:
        response.add_note("", FRESHNESS_NONE)

    # can stale responses be served?
    # With no explicit freshness and a non-fresh response, none of the
    # branches below adds a note.
    if "must-revalidate" in cc_keys:
        if fresh:
            response.add_note("header-cache-control", FRESH_MUST_REVALIDATE)
        elif has_explicit_freshness:
            response.add_note("header-cache-control", STALE_MUST_REVALIDATE)
    elif "proxy-revalidate" in cc_keys or "s-maxage" in cc_keys:
        if fresh:
            response.add_note("header-cache-control", FRESH_PROXY_REVALIDATE)
        elif has_explicit_freshness:
            response.add_note("header-cache-control", STALE_PROXY_REVALIDATE)
    else:
        if fresh:
            response.add_note("header-cache-control", FRESH_SERVABLE)
        elif has_explicit_freshness:
            response.add_note("header-cache-control", STALE_SERVABLE)

    # public?
    if "public" in cc_keys:  # TODO: check for authentication in request
        response.add_note("header-cache-control", PUBLIC)
Example #10
0
class RedFetcher(RedState):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and
    calls:
      - status_cb as it progresses, and
      - every function in the body_procs list with each chunk of the body, and
      - done_cb when all tasks are done.
    If provided, type indicates the type of the request, and is used to
    help set notes and status_cb appropriately.

    The done() method is called when the response is done, NOT when all
    tasks are done. It can add tasks by calling add_task().

    """
    client = RedHttpClient()
    robot_files = {} # cache of robots.txt
    robot_cache_dir = None
    robot_lookups = {}

    def __init__(self, iri, method="GET", req_hdrs=None, req_body=None,
                 status_cb=None, body_procs=None, name=None):
        """
        Build the request for iri and an empty response to go with it.

        req_hdrs and body_procs fall back to empty lists when not supplied
        (or empty). Robots.txt checking is switched on by default.
        """
        RedState.__init__(self, name)

        # Request side.
        self.request = HttpRequest(self.notes, self.name)
        self.request.method = method
        self.request.set_iri(iri)
        self.request.headers = req_hdrs if req_hdrs else []
        self.request.payload = req_body

        # Response side; its base URI is whatever set_iri() left on the
        # request.
        self.response = HttpResponse(self.notes, self.name)
        self.response.is_head_response = method == "HEAD"
        self.response.base_uri = self.request.uri
        self.response.set_decoded_procs(body_procs if body_procs else [])

        # Bookkeeping: no exchange or completion callback yet.
        self.exchange = None
        self.status_cb = status_cb
        self.done_cb = None  # really should be "all tasks done"
        self.outstanding_tasks = 0
        self.follow_robots_txt = True  # Should we pay attention to robots file?
        self._st = []  # FIXME: this is temporary, for debugging thor

    def __getstate__(self):
        state = self.__dict__.copy()
        del state['exchange']
        del state['status_cb']
        del state['done_cb']
        return state

    def add_task(self, task, *args):
        """
        Count a new outstanding task and start it.

        task is called with *args plus done_cb=self.finish_task.
        """
        self.outstanding_tasks += 1
        trace_entry = 'add_task(%s)' % str(task)
        self._st.append(trace_entry)
        task(*args, done_cb=self.finish_task)

    def finish_task(self):
        """
        Record that one task has finished.

        When no tasks remain outstanding, done_cb (if any) is called once and
        cleared, and status_cb is dropped.
        """
        self.outstanding_tasks -= 1
        self._st.append('finish_task()')
        assert self.outstanding_tasks >= 0, self._st
        if self.outstanding_tasks != 0:
            return
        callback = self.done_cb
        if callback:
            callback()
            self.done_cb = None
        # clean up potentially cyclic references
        self.status_cb = None

    def done(self):
        """
        Hook called when the response is complete and analysed.

        This base version always raises NotImplementedError.
        """
        raise NotImplementedError()

    def preflight(self):
        """
        Hook that decides whether the fetch should run at all.

        Return True to go ahead, False to skip; this version always returns
        True.
        """
        should_run = True
        return should_run

    def fetch_robots_txt(self, url, cb, network=True):
        """
        Fetch the robots.txt URL and then feed the response to cb.
        If the status code is not 2xx, or the fetch fails with an error,
        send a blank doc back.

        If network is False, we won't use the network, will return the result
        immediately if cached, and will assume it's OK if we don't have a
        cached file.

        Returns the robots.txt text when it is available without a network
        fetch ("" when there is nothing to use); returns None when a network
        fetch is started or already pending, in which case cb is called
        later.
        """
        # NOTE(review): the cache key is derived from self.request.uri while
        # the fetched URL is derived from the url argument; run() passes
        # self.request.uri for url -- confirm before passing anything else.
        origin = url_to_origin(self.request.uri)
        if origin is None:
            cb("")
            return ""
        origin_hash = hashlib.sha1(origin).hexdigest()

        # 1. in-memory cache
        if origin in self.robot_files:
            # FIXME: freshness lifetime
            cb(self.robot_files[origin])
            return self.robot_files[origin]

        # 2. on-disk cache, if one is configured
        if self.robot_cache_dir:
            robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
            cached_robots_txt = robot_fd.read()
            if cached_robots_txt is not None:
                cb(cached_robots_txt)
                return cached_robots_txt

        if not network:
            cb("")
            return ""

        # 3. network; lookups for the same origin share one request
        if origin in self.robot_lookups:
            self.robot_lookups[origin].append(cb)
        else:
            self.robot_lookups[origin] = [cb]
            exchange = self.client.exchange()

            def deliver(robots_txt):
                "Cache the result and release every waiting callback, once."
                if origin not in self.robot_lookups:
                    return  # already delivered (e.g. an error after done)
                self.robot_files[origin] = robots_txt
                if self.robot_cache_dir:
                    robot_fd = CacheFile(
                        path.join(self.robot_cache_dir, origin_hash))
                    robot_fd.write(robots_txt, 60*30)
                for _cb in self.robot_lookups.pop(origin):
                    _cb(robots_txt)

            @thor.on(exchange)
            def response_start(status, phrase, headers):
                exchange.status = status

            exchange.res_body = ""
            @thor.on(exchange)
            def response_body(chunk):
                exchange.res_body += chunk

            @thor.on(exchange)
            def response_done(trailers):
                if exchange.status.startswith("2"):
                    deliver(exchange.res_body)
                else:
                    deliver("")

            @thor.on(exchange)
            def error(err_msg):
                # Without this handler a failed fetch would never call the
                # waiting callbacks; treat it like a missing robots.txt.
                deliver("")

            p_url = urlsplit(url)
            robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
            exchange.request_start("GET", robots_url,
                [('User-Agent', UA_STRING)])
            exchange.request_done([])

    def run(self, done_cb=None):
        """
        Start the asynchronous fetch of self.request.uri.

        done_cb is stored on self.done_cb (presumably to be called once the
        task has finished -- that happens outside this method).

        The call is counted in self.outstanding_tasks and recorded in
        self._st. If preflight() declines, or there is no request URI, the
        task is finished immediately. Otherwise the fetch goes on in
        run_continue(), after the robots.txt lookup when
        self.follow_robots_txt is set.
        """
        self.outstanding_tasks += 1
        self._st.append('run(%s)' % str(done_cb))
        self.done_cb = done_cb
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self.finish_task()
            return

        if self.follow_robots_txt:
            self.fetch_robots_txt(self.request.uri, self.run_continue)
        else:
            # "" tells run_continue that there are no robots rules to apply
            self.run_continue("")

    def run_continue(self, robots_txt):
        """
        Continue after getting the robots file.

        robots_txt is the body of the origin's robots.txt; "" means it was
        empty or unavailable, so there is nothing to enforce. When the rules
        forbid UA_STRING from fetching the request URI, the response gets a
        RobotsTxtError and the task is finished without making the request.

        Otherwise a User-Agent header is added if the request has none, the
        exchange is created and wired to the _response_* handlers, and the
        request (with its payload, if any) is sent.

        TODO: refactor callback style into events.
        """
        if robots_txt != "": # "" means empty or non-200
            checker = RobotFileParser()
            # round-trip through ASCII so the parser only sees ASCII lines
            checker.parse(
                robots_txt.decode('ascii', 'replace')
                .encode('ascii', 'replace').splitlines())
            if not checker.can_fetch(UA_STRING, self.request.uri):
                self.response.http_error = RobotsTxtError()
                self.finish_task()
                return # TODO: show error?

        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(
                (u"User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.on('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        if self.status_cb and self.name:
            self.status_cb("fetching %s (%s)" % (
                self.request.uri, self.name
            ))
        req_hdrs = [
            (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace'))
            for (k, v) in self.request.headers
        ]
        self.exchange.request_start(
            self.request.method, self.request.uri, req_hdrs
        )
        self.request.start_time = thor.time()
        if self.request.payload is not None:
            self.exchange.request_body(self.request.payload)
            self.transfer_out += len(self.request.payload)
        self.exchange.request_done([])

    def _response_start(self, status, phrase, res_headers):
        "Record the response's start-line and headers, then run the checks."
        self._st.append('_response_start(%s, %s)' % (status, phrase))
        res = self.response
        res.start_time = thor.time()
        res.version = self.exchange.res_version
        # status and phrase are decoded leniently as ISO-8859-1
        res.status_code = status.decode('iso-8859-1', 'replace')
        res.status_phrase = phrase.decode('iso-8859-1', 'replace')
        res.set_headers(res_headers)
        # the checkers run last, once the status and headers are in place
        StatusChecker(res, self.request)
        checkCaching(res, self.request)

    def _response_body(self, chunk):
        "Count a body chunk as received, then pass it on to the response."
        received = len(chunk)
        self.transfer_in = self.transfer_in + received
        self.response.feed_body(chunk)

    def _response_done(self, trailers):
        "Record the final timing and sizes, close the body and finish up."
        self._st.append('_response_done()')
        res, exchange = self.response, self.exchange
        res.complete_time = thor.time()
        res.transfer_length = exchange.input_transfer_length
        res.header_length = exchange.input_header_length
        res.body_done(True, trailers)
        # status is only reported for named fetches with a callback set
        if self.status_cb and self.name:
            self.status_cb("fetched %s (%s)" % (self.request.uri, self.name))
        self.done()
        self.finish_task()

    def _response_error(self, error):
        "Handle an error encountered while fetching the response."
        self._st.append('_response_error(%s)' % (str(error)))
        self.response.complete_time = thor.time()
        self.response.http_error = error
        if isinstance(error, httperr.BodyForbiddenError):
            self.add_note('header-none', rs.BODY_NOT_ALLOWED)
#        elif isinstance(error, httperr.ExtraDataErr):
#            res.payload_len += len(err.get('detail', ''))
        elif isinstance(error, httperr.ChunkError):
            err_msg = error.detail[:20] or ""
            self.add_note('header-transfer-encoding', rs.BAD_CHUNK,
                chunk_sample=err_msg.encode('string_escape')
            )
        self.done()
        self.finish_task()
Example #11
0
class RedFetcher(thor.events.EventEmitter):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and:
      - emits 'status' as it progresses
      - emits 'fetch_done' when the fetch is finished.

    'check_name' indicates the type of the request and is included in the
    status messages; 'response_phrase' is the default 'response' value
    attached to the notes this fetcher adds.
    """
    check_name = "undefined"
    response_phrase = "undefined"
    # class attributes: one client and one robots.txt fetcher for all fetchers
    client = RedHttpClient()
    robot_fetcher = RobotFetcher()

    def __init__(self) -> None:
        thor.events.EventEmitter.__init__(self)
        self.notes = [] # type: List[Note]
        self.transfer_in = 0    # body bytes received
        self.transfer_out = 0   # payload bytes sent
        # notes about the request are discarded; only the response's are kept
        self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
        self.nonfinal_responses = []                  # type: List[HttpResponse]
        self.response = HttpResponse(self.add_note)   # type: HttpResponse
        self.exchange = None                          # type: thor.http.ClientExchange
        self.follow_robots_txt = True # Should we pay attention to robots file?
        self.fetch_started = False
        self.fetch_done = False

    def __getstate__(self) -> Dict[str, Any]:
        "Return the state to pickle, leaving out the live exchange."
        state = thor.events.EventEmitter.__getstate__(self)
        # pop() rather than del: a missing key (e.g. on an instance that was
        # itself restored without it) must not make pickling fail.
        state.pop('exchange', None)
        return state

    def __repr__(self) -> str:
        out = [self.__class__.__name__]
        if self.request.uri:
            out.append("%s" % self.request.uri)
        if self.fetch_started:
            out.append("fetch_started")
        if self.fetch_done:
            out.append("fetch_done")
        return "<%s at %#x>" % (", ".join(out), id(self))

    def add_note(self, subject: str, note: Type[Note], **kw: Union[str, int]) -> None:
        """
        Set a note.

        'note' is instantiated with the subject and the keyword variables;
        'response' defaults to this fetcher's response_phrase.
        """
        if 'response' not in kw:
            kw['response'] = self.response_phrase
        self.notes.append(note(subject, kw))

    def ignore_note(self, subject: str, note: Type[Note], **kw: str) -> None:
        "Ignore a note (for requests)."
        return

    def preflight(self) -> bool:
        """
        Check to see if we should bother running. Return True
        if so; False if not. Can be overridden.
        """
        return True

    def set_request(self, iri: str, method: str="GET",
                    req_hdrs: StrHeaderListType=None, req_body: bytes=None) -> None:
        """
        Set the resource's request.

        iri, method and the header names and values are strings; req_body,
        if given, is bytes. Also primes the response with whether it answers
        a HEAD request and with its base URI.
        """
        self.request.method = method
        self.response.is_head_response = (method == "HEAD")   # type: ignore
        self.request.set_iri(iri)
        self.response.base_uri = self.request.uri             # type: ignore
        if req_hdrs:
            self.request.set_headers(req_hdrs)
        self.request.payload = req_body # type: ignore    # FIXME: encoding
        self.request.complete = True  # cheating a bit

    def check(self) -> None:
        """
        Make an asynchronous HTTP request to the request's URI, emitting
        'status' as it's updated and 'fetch_done' when it's done.

        Nothing is fetched when preflight() declines or there is no URI.
        When follow_robots_txt is set, the request is only made once the
        robots.txt check for the URI has answered.
        """
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self._fetch_done()
            return

        if self.follow_robots_txt:
            # listen before asking, in case the answer is emitted right away
            self.robot_fetcher.once("robot-%s" % self.request.uri, self.run_continue)
            self.robot_fetcher.check_robots(self.request.uri)
        else:
            self.run_continue(True)

    def run_continue(self, allowed: bool) -> None:
        """
        Continue after getting the robots file.

        If the fetch is not allowed, the response gets a RobotsTxtError and
        the fetch is finished; otherwise the request is sent.
        """
        if not allowed:
            self.response.http_error = RobotsTxtError()
            self._fetch_done()
            return

        self.fetch_started = True

        if not any(hdr[0].lower() == 'user-agent' for hdr in self.request.headers):
            self.request.headers.append(("User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_nonfinal', self._response_nonfinal)
        self.exchange.once('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.once('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        self.emit("status", "fetching %s (%s)" % (self.request.uri, self.check_name))
        # the exchange is handed bytes, so encode the str request parts
        req_hdrs = [(k.encode('ascii'), v.encode('ascii')) for (k, v) in self.request.headers]
        self.exchange.request_start(
            self.request.method.encode('ascii'), self.request.uri.encode('ascii'), req_hdrs)
        self.request.start_time = thor.time()
        if self.request.payload is not None:
            self.exchange.request_body(self.request.payload)
            self.transfer_out += len(self.request.payload)
        self.exchange.request_done([])

    def _response_nonfinal(self, status: bytes, phrase: bytes,
                           res_headers: RawHeaderListType) -> None:
        "Got a non-final response; check its status and keep it."
        nfres = HttpResponse(self.add_note)
        nfres.process_top_line(self.exchange.res_version, status, phrase)
        nfres.process_raw_headers(res_headers)
        StatusChecker(nfres, self.request)
        self.nonfinal_responses.append(nfres)

    def _response_start(self, status: bytes, phrase: bytes,
                        res_headers: RawHeaderListType) -> None:
        "Process the response start-line and headers."
        self.response.start_time = thor.time()
        self.response.process_top_line(self.exchange.res_version, status, phrase)
        self.response.process_raw_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk: bytes) -> None:
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers: List[Tuple[bytes, bytes]]) -> None:
        "Finish analysing the response, handling any parse errors."
        self.emit("status", "fetched %s (%s)" % (self.request.uri, self.check_name))
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        self._fetch_done()

    def _response_error(self, error: httperr.HttpError) -> None:
        """
        Handle an error encountered while fetching the response.

        Extra data and bad chunks become notes on the response; any other
        error that is not client-recoverable is stored as the response's
        http_error. The fetch is finished in every case.
        """
        self.emit("status", "fetch error %s (%s) - %s" % (
            self.request.uri, self.check_name, error.desc))
        # Guard before slicing: an error without a detail (None) would
        # otherwise raise TypeError here and the fetch would never finish.
        err_sample = (error.detail or "")[:40]
        if error.client_recoverable:
            pass # we'll get to this later.
        elif isinstance(error, httperr.ExtraDataError):
            if self.response.status_code == "304":
                self.add_note('body', BODY_NOT_ALLOWED, sample=err_sample)
            else:
                self.add_note('body', EXTRA_DATA, sample=err_sample)
        elif isinstance(error, httperr.ChunkError):
            self.add_note('header-transfer-encoding', BAD_CHUNK, chunk_sample=err_sample)
        else:
            self.response.http_error = error
        self._fetch_done()

    def _fetch_done(self) -> None:
        "Mark the fetch as finished and emit 'fetch_done', at most once."
        if not self.fetch_done:
            self.fetch_done = True
            self.emit("fetch_done")
Example #12
0
def checkCaching(response: HttpResponse, request: HttpRequest=None) -> None:
    """
    Examine HTTP caching characteristics.

    Reads the response's parsed headers (Last-Modified, Date, Expires, ETag,
    Age, Cache-Control and Vary), its start_time and its status_code, and
    reports what it finds as notes on the response (plus one on the request,
    when its method is not cacheable).

    Along the way it sets response.store_shared and response.store_private
    and, when the analysis gets that far, response.age and
    response.freshness_lifetime.

    The analysis stops early for a request method that isn't cacheable and
    for responses with Cache-Control: no-store, Cache-Control: no-cache or
    Vary: *.
    """

    # get header values
    lm_hdr = response.parsed_headers.get('last-modified', None)
    date_hdr = response.parsed_headers.get('date', None)
    expires_hdr = response.parsed_headers.get('expires', None)
    etag_hdr = response.parsed_headers.get('etag', None)
    age_hdr = response.parsed_headers.get('age', None)
    # cache-control is a sequence of (directive, value) pairs
    cc_set = response.parsed_headers.get('cache-control', [])
    cc_list = [k for (k, v) in cc_set]  # keeps duplicates, to count them
    cc_dict = dict(cc_set)
    cc_keys = list(cc_dict.keys())

    # Last-Modified
    if lm_hdr:
        # without a Date header, compare against when the response started
        serv_date = date_hdr or response.start_time
        if lm_hdr > serv_date:
            response.add_note('header-last-modified', LM_FUTURE)
        else:
            response.add_note('header-last-modified', LM_PRESENT,
                              last_modified_string=relative_time(lm_hdr, serv_date))

    # known Cache-Control directives that don't allow duplicates
    known_cc = ["max-age", "no-store", "s-maxage", "public",
                "private", "pre-check", "post-check",
                "stale-while-revalidate", "stale-if-error"]

    # check for mis-capitalised directives /
    # assure there aren't any dup directives with different values
    for cc in cc_keys:
        if cc.lower() in known_cc and cc != cc.lower():
            response.add_note('header-cache-control', CC_MISCAP,
                              cc_lower=cc.lower(), cc=cc)
        if cc in known_cc and cc_list.count(cc) > 1:
            response.add_note('header-cache-control', CC_DUP, cc=cc)

    # Who can store this?
    if request and request.method not in cacheable_methods:
        response.store_shared = response.store_private = False
        request.add_note('method', METHOD_UNCACHEABLE, method=request.method)
        return # bail; nothing else to see here
    elif 'no-store' in cc_keys:
        response.store_shared = response.store_private = False
        response.add_note('header-cache-control', NO_STORE)
        return # bail; nothing else to see here
    elif 'private' in cc_keys:
        response.store_shared = False
        response.store_private = True
        response.add_note('header-cache-control', PRIVATE_CC)
    elif request and 'authorization' in [k.lower() for k, v in request.headers] \
      and 'public' not in cc_keys:
        response.store_shared = False
        response.store_private = True
        response.add_note('header-cache-control', PRIVATE_AUTH)
    else:
        response.store_shared = response.store_private = True
        response.add_note('header-cache-control', STOREABLE)

    # no-cache?
    if 'no-cache' in cc_keys:
        if lm_hdr is None and etag_hdr is None:
            response.add_note('header-cache-control', NO_CACHE_NO_VALIDATOR)
        else:
            response.add_note('header-cache-control', NO_CACHE)
        return

    # pre-check / post-check
    if 'pre-check' in cc_keys or 'post-check' in cc_keys:
        if 'pre-check' not in cc_keys or 'post-check' not in cc_keys:
            response.add_note('header-cache-control', CHECK_SINGLE)
        else:
            pre_check = post_check = None
            try:
                pre_check = int(cc_dict['pre-check'])
                post_check = int(cc_dict['post-check'])
            except (ValueError, TypeError):
                # TypeError: int() of a non-string, which is presumably what
                # a directive given without any value ends up as (None).
                response.add_note('header-cache-control', CHECK_NOT_INTEGER)
            if pre_check is not None and post_check is not None:
                if pre_check == 0 and post_check == 0:
                    response.add_note('header-cache-control', CHECK_ALL_ZERO)
                elif post_check > pre_check:
                    response.add_note('header-cache-control', CHECK_POST_BIGGER)
                elif post_check == 0:
                    response.add_note('header-cache-control', CHECK_POST_ZERO)
                else:
                    response.add_note('header-cache-control', CHECK_POST_PRE,
                                      pre_check=pre_check, post_check=post_check)

    # vary?
    vary = response.parsed_headers.get('vary', set())
    if "*" in vary:
        response.add_note('header-vary', VARY_ASTERISK)
        return # bail; nothing else to see here
    elif len(vary) > 3:
        response.add_note('header-vary', VARY_COMPLEX, vary_count=f_num(len(vary)))
    else:
        if "user-agent" in vary:
            response.add_note('header-vary', VARY_USER_AGENT)
        if "host" in vary:
            response.add_note('header-vary', VARY_HOST)
        # TODO: enumerate the axes in a message

    # calculate age
    response.age = age_hdr or 0
    age_str = relative_time(response.age, 0, 0)
    if date_hdr and date_hdr > 0:
        # how old the response looked on arrival, judging by its Date
        apparent_age = max(0, int(response.start_time - date_hdr))
    else:
        apparent_age = 0
    current_age = max(apparent_age, response.age)
    current_age_str = relative_time(current_age, 0, 0)
    if response.age >= 1:
        response.add_note('header-age header-date', CURRENT_AGE, age=age_str)

    # Check for clock skew and dateless origin server.
    if not date_hdr:
        response.add_note('', DATE_CLOCKLESS)
        if expires_hdr or lm_hdr:
            response.add_note('header-expires header-last-modified', DATE_CLOCKLESS_BAD_HDR)
    else:
        skew = date_hdr - response.start_time + (response.age)
        if response.age > max_clock_skew and (current_age - skew) < max_clock_skew:
            response.add_note('header-date header-age', AGE_PENALTY)
        elif abs(skew) > max_clock_skew:
            response.add_note('header-date', DATE_INCORRECT,
                              clock_skew_string=relative_time(skew, 0, 2))
        else:
            response.add_note('header-date', DATE_CORRECT)

    # calculate freshness
    # precedence: s-maxage, then max-age, then Expires
    freshness_lifetime = 0
    has_explicit_freshness = False
    has_cc_freshness = False
    freshness_hdrs = ['header-date']
    if 's-maxage' in cc_keys:
        freshness_lifetime = cc_dict['s-maxage']
        freshness_hdrs.append('header-cache-control')
        has_explicit_freshness = True
        has_cc_freshness = True
    elif 'max-age' in cc_keys:
        freshness_lifetime = cc_dict['max-age']
        freshness_hdrs.append('header-cache-control')
        has_explicit_freshness = True
        has_cc_freshness = True
    elif 'expires' in response.parsed_headers:
        # An invalid Expires header means it's automatically stale
        has_explicit_freshness = True
        freshness_hdrs.append('header-expires')
        freshness_lifetime = (expires_hdr or 0) - (date_hdr or response.start_time)

    freshness_left = freshness_lifetime - current_age
    freshness_left_str = relative_time(abs(int(freshness_left)), 0, 0)
    freshness_lifetime_str = relative_time(int(freshness_lifetime), 0, 0)

    response.freshness_lifetime = freshness_lifetime
    fresh = freshness_left > 0
    if has_explicit_freshness:
        # the three outcomes differ only in which note they report
        if fresh:
            freshness_note = FRESHNESS_FRESH
        # FIXME: response.age = None
        elif has_cc_freshness and response.age > freshness_lifetime:
            freshness_note = FRESHNESS_STALE_CACHE
        else:
            freshness_note = FRESHNESS_STALE_ALREADY
        response.add_note(" ".join(freshness_hdrs), freshness_note,
                          freshness_lifetime=freshness_lifetime_str,
                          freshness_left=freshness_left_str,
                          current_age=current_age_str)

    # can heuristic freshness be used?
    elif response.status_code in heuristic_cacheable_status:
        response.add_note('header-last-modified', FRESHNESS_HEURISTIC)
    else:
        response.add_note('', FRESHNESS_NONE)

    # can stale responses be served?
    if 'must-revalidate' in cc_keys:
        if fresh:
            response.add_note('header-cache-control', FRESH_MUST_REVALIDATE)
        elif has_explicit_freshness:
            response.add_note('header-cache-control', STALE_MUST_REVALIDATE)
    elif 'proxy-revalidate' in cc_keys or 's-maxage' in cc_keys:
        if fresh:
            response.add_note('header-cache-control', FRESH_PROXY_REVALIDATE)
        elif has_explicit_freshness:
            response.add_note('header-cache-control', STALE_PROXY_REVALIDATE)
    else:
        if fresh:
            response.add_note('header-cache-control', FRESH_SERVABLE)
        elif has_explicit_freshness:
            response.add_note('header-cache-control', STALE_SERVABLE)

    # public?
    if 'public' in cc_keys: # TODO: check for authentication in request
        response.add_note('header-cache-control', PUBLIC)