def postprocess(cls, result):
    """Normalize a download result dict.

    On a successful fetch (status 200 with a body): optionally refresh the
    DNS cache for the final (post-redirect) host, gunzip the body if the
    response was gzip-encoded, and undo chunked transfer encoding if
    requested.  Failures are reported via the "status"/"error_message"
    keys of the returned dict, never via exceptions.
    """
    if not isinstance(result, dict):
        # Bug fix: stdlib logging rejects arbitrary keyword arguments
        # (only exc_info/stack_info/extra are accepted), so the previous
        # ``logging.error(msg, type=..., result=...)`` call raised a
        # TypeError on this error path.  Use lazy %-style args instead.
        logging.error("internal exception raised: type=%s result=%s",
                      type(result), result)
        return {"status": 600,
                "error_message": "internal exception raised %s" % result}
    # Propagate failed results untouched.
    # Fix: dict.has_key() is Python-2-only; the `in` operator is
    # equivalent and portable.
    if "error_message" in result or result["status"] != 200 or result["doc"] is None:
        return result
    # --- DNS cache -------------------------------------------------------
    # After redirects the final URL can differ from the requested one;
    # record the resolved IP of the final host so later fetches can skip
    # a DNS lookup.
    actual_url = result["url"]
    if result["meta"].get("dns_cache_enabled", False):
        if actual_url != result["meta"]["url"]:
            parsed_result = misc.parse_url(actual_url)
            if parsed_result is not None and dns_cache.has_dns_cache(parsed_result.netloc):
                ip = socket.gethostbyname(parsed_result.netloc)
                dns_cache.set_dns_cache(parsed_result.netloc, ip)
    # --- decompression ---------------------------------------------------
    body = result["doc"]
    ce = result["headers"].get('Content-Encoding', None)
    if ce and 'gzip' in ce.lower():  # idiom: `in` instead of find() != -1
        body = gzip.GzipFile(fileobj=cStringIO.StringIO(body), mode='rb').read()
    # --- chunked transfer decoding ---------------------------------------
    if result["meta"].get("chunked_transfer_decoding", False) and \
            result["headers"].get('Transfer-Encoding') == 'chunked':
        body = Downloader.decode_chunked_transfer(body)
    result["doc"] = body
    return result
def allowed_url(url, user_agent, scheme=None, host=None):
    """Return True iff robots.txt permits *user_agent* to fetch *url*.

    *scheme*/*host* may be supplied by the caller to skip re-parsing the
    URL; an unparseable URL is treated as disallowed.  A missing robots
    parser means no restrictions apply.
    """
    if scheme is None or host is None:
        parsed = misc.parse_url(url)
        if parsed is None:
            return False
        scheme, host = parsed.scheme, parsed.netloc
    parser = _get_robot_parser(scheme, host)
    # No parser available -> allow; otherwise defer to can_fetch().
    return parser is None or parser.can_fetch(user_agent, url)
def preprocess(cls, url, robotstxt_enabled):
    """Validate *url* before downloading.

    Returns ``(ok, result)`` where *result* is the skeleton result dict
    (status 600, empty doc/headers).  On failure *ok* is False and
    result["error_message"] explains why.
    """
    result = {"url": url, "status": 600, "doc": None, "headers": None}

    # Guard 1: the URL must parse.
    parsed = misc.parse_url(url)
    if parsed is None:
        result["error_message"] = "parse url failed"
        return False, result

    # Guard 2: robots.txt must allow the fetch (when enforcement is on).
    # NOTE(review): ``user_agent`` is a free name here — presumably a
    # module-level setting; confirm it is defined in this module.
    if robotstxt_enabled and not robotstxt.allowed_url(
            url, user_agent or "", parsed.scheme, parsed.netloc):
        result["error_message"] = "robots.txt disallowed"
        return False, result

    return True, result