Code example #1
File: downloaders.py Project: qwang2505/ccrawler
    @classmethod
    def postprocess(cls, result):
        if not isinstance(result, dict):
            logging.error("internal exception raised", type=type(result), result=result)
            return {"status" : 600, "error_message" : "internal exception raised %s" % result}
        if "error_message" in result or result["status"] != 200 or result["doc"] is None:
            return result

        # dns cache: if the request was redirected, refresh the cached IP for the final host
        actual_url = result["url"]

        if result["meta"].get("dns_cache_enabled", False):
            if actual_url != result["meta"]["url"]:
                parsed_result = misc.parse_url(actual_url)
                if parsed_result is not None and dns_cache.has_dns_cache(parsed_result.netloc):
                    ip = socket.gethostbyname(parsed_result.netloc)
                    dns_cache.set_dns_cache(parsed_result.netloc, ip)

        # compression: transparently decompress gzip-encoded response bodies
        body = result["doc"]
        ce = result["headers"].get('Content-Encoding', None)
        if ce and 'gzip' in ce.lower():
            body = cStringIO.StringIO(body)
            body = gzip.GzipFile(fileobj=body, mode='rb').read()

        # chunked transfer encoding: reassemble a chunked body when decoding is enabled
        if result["meta"].get("chunked_transfer_decoding", False) and result["headers"].get('Transfer-Encoding') == 'chunked':
            body = Downloader.decode_chunked_transfer(body)

        # store the decoded body back on the result
        result["doc"] = body
        return result
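
The gzip branch above wraps the raw response body in a file-like object before handing it to gzip.GzipFile. A minimal, standalone sketch of that round trip (Python 2, matching the snippet's cStringIO usage; the sample payload is made up):

import cStringIO
import gzip

# build a gzip-compressed payload standing in for a downloaded response body
raw = "<html><body>hello</body></html>"
buf = cStringIO.StringIO()
gz = gzip.GzipFile(fileobj=buf, mode='wb')
gz.write(raw)
gz.close()  # closing flushes the gzip trailer
compressed = buf.getvalue()

# decompress it exactly the way postprocess does
body = cStringIO.StringIO(compressed)
body = gzip.GzipFile(fileobj=body, mode='rb').read()
assert body == raw
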
Code example #2
File: robotstxt.py Project: qwang2505/ccrawler
def allowed_url(url, user_agent, scheme=None, host=None):
    """Return True if robots.txt permits user_agent to fetch url; unparseable URLs are rejected."""
    if scheme is None or host is None:
        parsed_result = misc.parse_url(url)
        if parsed_result is None:
            return False
        scheme = parsed_result.scheme
        host = parsed_result.netloc

    robot_parser = _get_robot_parser(scheme, host)
    if robot_parser is not None and not robot_parser.can_fetch(user_agent, url):
        return False
    else:
        return True
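
The _get_robot_parser helper is not shown in this example. Judging from the can_fetch(user_agent, url) call, it plausibly returns something like the standard library's robotparser.RobotFileParser (Python 2 module name); the following is only a rough sketch under that assumption, not the project's actual implementation:

import robotparser

def _get_robot_parser(scheme, host):
    # hypothetical reconstruction; the real helper may cache parsers per host and handle more errors
    parser = robotparser.RobotFileParser()
    parser.set_url("%s://%s/robots.txt" % (scheme, host))
    try:
        parser.read()
    except IOError:
        return None
    return parser

With such a helper, allowed_url falls back to allowing the fetch whenever no parser can be built.
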
Code example #3
File: downloaders.py Project: qwang2505/ccrawler
    @classmethod
    def preprocess(cls, url, robotstxt_enabled):
        # initialize result
        result = {"url" : url, "status" : 600, "doc" : None, "headers" : None}
        parsed_result = misc.parse_url(url)
        if parsed_result is None:
            result["error_message"] = "parse url failed"
            return False, result

        # check robots.txt (user_agent is defined elsewhere in the project, not in this excerpt)
        if robotstxt_enabled and not robotstxt.allowed_url(url, user_agent or "", parsed_result.scheme, parsed_result.netloc):
            result["error_message"] = "robots.txt disallowed"
            return False, result

        return True, result
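
For reference, a sketch of how a caller might consume the (ok, result) pair that preprocess returns. It assumes preprocess is exposed as a classmethod on the Downloader class from example #1; the URL is a placeholder:

ok, result = Downloader.preprocess("http://example.com/", robotstxt_enabled=True)
if ok:
    # safe to go ahead with the actual HTTP fetch, then hand the response to postprocess
    pass
else:
    # result carries the failure: status 600 plus "parse url failed" or "robots.txt disallowed"
    print result["error_message"]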