Code Example #1
File: downloaders.py Project: qwang2505/ccrawler
    def postprocess(cls, result):
        if not isinstance(result, dict):
            logging.error("internal exception raised", type = type(result), result = result)
            return {"status" : 600, "error_message" : "internal exception raised %s" % result}
        if "error_message" in result or result["status"] != 200 or result["doc"] is None:
            return result

        #dns cache
        actual_url = result["url"]

        if result["meta"].get("dns_cache_enabled", False):
            if actual_url != result["meta"]["url"]:
                parsed_result = misc.parse_url(actual_url)
                if parsed_result is not None and dns_cache.has_dns_cache(parsed_result.netloc):
                    ip = socket.gethostbyname(parsed_result.netloc)
                    dns_cache.set_dns_cache(parsed_result.netloc, ip)

        #compression
        body = result["doc"]
        ce = result["headers"].get('Content-Encoding', None)
        if ce and ce.lower().find('gzip') != -1:
            body = cStringIO.StringIO(body)
            body = gzip.GzipFile(fileobj=body, mode='rb').read()

        #chunked transfer encoding
        if result["meta"].get("chunked_transfer_decoding", False) and result["headers"].get('Transfer-Encoding') == 'chunked':
            body = Downloader.decode_chunked_transfer(body)

        #create result dict
        result["doc"] = body
        return result
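
The chunked branch above delegates to Downloader.decode_chunked_transfer, which is not included in this snippet. A minimal sketch of such a decoder, assuming the standard HTTP/1.1 chunked format (a hex size line, the chunk data, a trailing CRLF, and a terminating zero-size chunk), might look like the following; the project's actual helper may differ.

def decode_chunked_transfer(body):
    # hypothetical stand-in for Downloader.decode_chunked_transfer
    decoded = ""
    while body:
        # each chunk starts with "<hex size>[;extensions]\r\n"
        size_line, _, rest = body.partition("\r\n")
        size = int(size_line.split(";")[0].strip(), 16)
        if size == 0:
            # a zero-size chunk terminates the body (optional trailers ignored)
            break
        decoded += rest[:size]
        body = rest[size + 2:]  # skip the chunk data and its trailing "\r\n"
    return decoded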
Code Example #2
File: redis_client.py Project: qwang2505/ccrawler
    def _load_hash_object(self, values, data_type, name, fields, strict):
        """
        if fields is None, we assume it's raw mode, all fields will be retrieved, and no field encoder/decoder will be used
        """

        data_config = self._data_types.get(data_type)
        input_dict = values if fields is None else dict(zip(fields, values))

        result_dict = {}
        for field, value in input_dict.items():
            if value is None:
                if strict:
                    logging.error("field %s does not exist in the result, %s" % (field, name))
                    #raise Exception("field %s does not exist in the result, %s" % (field, name))
                    return None
                else:
                    value = None
            elif value == "\\None":
                value = None
            else:
                if RedisClient._special_none_pattern.match(value) is not None:
                    value = value[1:]

                if fields is not None:
                    field_configs = [
                        field_config for field_config in data_config["fields"]
                        if (field_config == field if isinstance(field_config, str)
                            else field_config[0] == field)
                    ]
                    if len(field_configs) == 0:
                        raise Exception("unexpected field, %s, %s, %s" % (data_type, name, field))
                    field_config = field_configs[0]
                    if isinstance(field_config, tuple) and field_config[1] != str:
                        decoder = field_config[1]
                        value = decoder(value)

            result_dict[field] = value

        return result_dict
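
The field lookup above implies that each entry in data_config["fields"] is either a bare field name (stored and returned as a raw string) or a (field_name, decoder) tuple whose decoder converts the stored string back into a Python value. A hypothetical data-type registration with that shape, using illustrative names and decoders, could be:

_data_types = {
    "url_info": {                          # illustrative data type name
        "fields": [
            "url",                         # plain string field, no decoder
            ("crawl_depth", int),          # decoded with int()
            ("crawl_priority", int),
            ("page_last_modified", str),   # a str decoder means "leave as-is" per the check above
        ],
    },
}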
Code Example #3
File: link_extractor.py Project: qwang2505/ccrawler
    def _load_doc(self, url, html):
        try:
            dom = page_parser.parse_unicode(html, url, notify=logging.info)
        except:
            logging.error("raw doc can not be parsed", url)
            return None
        return dom
Code Example #4
File: downloaders.py Project: qwang2505/ccrawler
    def _crawl_sync(self, url, timeout, request_header, meta):
        result = {"url" : url, "status" : 600, "doc" : None, "headers" : None, "meta" : meta}

        req = urllib2.Request(url)

        #set headers
        for key, value in request_header.items():
            req.add_header(key, value)

        #set timeout
        try:
            response = urllib2.urlopen(req, timeout=timeout)
        except Exception as e:
            error_message = misc.exception_to_str(e)
            if error_message.find("HTTP Error 304: Not Modified") != -1:
                result["status"] = 304
                return result
            else:
                result["error_message"] = error_message
                logging.error("static_crawl failed when opening url", url = url, exception = e)
                return result

        try:
            body = response.read()
        except Exception as e:
            result["error_message"] = str(e)
            logging.error("static_crawl failed when reading response", url = url, exception = e)
            return result

        result["url"] = response.url
        result["status"] = response.code
        result["doc"] = body
        result["headers"] = response.headers
        return result
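
A hypothetical call against this method, assuming a downloader instance exposing _crawl_sync with the signature above; the header names and meta flags are illustrative:

result = downloader._crawl_sync(
    "http://www.example.com/",
    timeout=10,
    request_header={"User-Agent": "ccrawler", "Accept-Encoding": "gzip"},
    meta={"dns_cache_enabled": False, "chunked_transfer_decoding": True},
)
if result["status"] == 200 and result.get("error_message") is None:
    body = result["doc"]   # raw (possibly gzipped/chunked) body; see postprocess() above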
Code Example #5
File: heartbeat.py Project: qwang2505/ccrawler
    def _check_is_all_crash(self):
        """
        original method of checking whether all service is alive
        not used right now
        """
	heartbeats = heartbeatdb.get_heartbeats(self._heart_beat_config["check_duration"])
        heartbeats = misc.cursor_to_array(heartbeats)
        heartbeats = misc.select(heartbeats, fields=["ip", "handler_name", "pid"])
        heartbeats = misc.distinct(heartbeats)
        handler_counts_per_machine = misc.count(heartbeats, key = lambda heartbeat : "%s_%s" % (heartbeat["ip"], heartbeat["handler_name"]))
        heartbeatdb.save_handler_counts(simplejson.dumps(handler_counts_per_machine), type="handler_counts_per_machine")
        handler_counts = misc.count(heartbeats, key = lambda heartbeat : heartbeat["handler_name"])
        heartbeatdb.save_handler_counts(simplejson.dumps(handler_counts), type="handler_counts_total")
        logging.debug("current alive handler counts", handler_counts)
        #Note: currently we will send email if no handler is running
        if any(handler_counts.get(handler_name, 0) == 0 for handler_name in self._heart_beat_config["required_handlers"]):
            if self._last_notification_time is None or datetime.datetime.now() - self._last_notification_time >= \
                datetime.timedelta(seconds=self._heart_beat_config["notification_duration"]):

                email_body = "some handlers are not running:\n %s" % handler_counts_per_machine
                self._send_email(
                    self._heart_beat_config["email_server"],
                    self._heart_beat_config["email_from"],
                    self._heart_beat_config["email_tos"],
                    self._heart_beat_config["email_title"],
                    email_body)
                self._last_notification_time = datetime.datetime.now()
                logging.error("heartbeat server detects required handlers are not fully running, notification email sent", handler_counts_per_machine)
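
This check reads only a handful of keys from self._heart_beat_config. A hypothetical configuration covering those keys, with illustrative values and handler names:

_heart_beat_config = {
    "check_duration": 300,               # seconds of heartbeat history to inspect
    "required_handlers": ["CrawlHandler", "LinkExtractor"],   # illustrative names
    "notification_duration": 3600,       # minimum seconds between alert emails
    "email_server": "smtp.example.com",
    "email_from": "heartbeat@example.com",
    "email_tos": ["ops@example.com"],
    "email_title": "ccrawler heartbeat alert",
}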
Code Example #6
File: page_analysis.py Project: qwang2505/ccrawler
def _load():
    global _pa_initialized
    if not _pa_initialized:
        _pa_initialized = pa.init(common_settings.page_analysis_logger_prefix, common_settings.page_analysis_config_files)
        if not _pa_initialized:
            logging.error("page_analysis lib can't be loaded")
            return False
        else:
            logging.info("page_analysis lib has been loaded")

    return True
Code Example #7
    def predict(self, url, url_info, extras = None):
        output_msg = {"crawl_status" : "alive", "recrawl_time" : None, "recrawl_duration" : None, "recrawl_priority" : None, "retry_count_inc" : False, "redirect_count_inc" : False}
        if url_info["url_class"] is None:
            url_info["url_class"] = "undefined"

        if url_info["last_crawled"] is None:
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "unexpected"
            output_msg["error_message"] = "last_crawled is None"
        elif url_info["crawl_status"] == "alive":
            if url_info["modified_count"] <= 0 or url_info["url_class"] is None or url_info["last_modified"] is None or url_info["first_modified"] is None:
                output_msg["crawl_status"] = "failed"
                output_msg["error_type"] = "unexpected"
                output_msg["error_message"] = "any of url_class/last_modified/first_modified is none, or modified_count <= 0: %s" % misc.clone_dict(url_info, ["modified_count", "url_class", "last_modified", "first_modified"])
            else:
                need_recrawl = self._recrawling_url(url, url_info["url_class"])
                if need_recrawl:
                    alive, output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_recrawl_time_and_priority(url_info)
                    if not alive:
                        output_msg["crawl_status"] = "notAlive"
                else:
                    output_msg["crawl_status"] = "notAlive"
        elif url_info["crawl_status"] == "error":
            if url_info["retry_count"] >= self._settings["recrawl_policies"]["max_retry_count"]:
                output_msg["crawl_status"] = "failed"
                output_msg["error_type"] = "crawl_error"
                output_msg["error_message"] = "retry count exceeded %d" % self._settings["recrawl_policies"]["max_retry_count"]
            else:
                output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_retry_time_and_priority(url_info)
                output_msg["retry_count_inc"] = True
        elif url_info["crawl_status"] == "redirected":
            if url_info["redirect_count"] >= self._settings["recrawl_policies"]["max_redirect_count"]:
                output_msg["crawl_status"] = "notAlive"
            else:
                output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_redirect_time_and_priority(url_info)
                output_msg["redirect_count_inc"] = True
        else:
            logging.error("unexpected crawl status", url = url, crawl_status = url_info["crawl_status"])
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "unexpected"
            output_msg["error_message"] = "unexpected crawl status in recrawl:%s" % url_info["crawl_status"]

        if output_msg["recrawl_time"] is not None:
            output_msg["recrawl_time"] = datetime2timestamp(output_msg["recrawl_time"])

        if output_msg["recrawl_duration"] is not None:
            output_msg["recrawl_duration"] = misc.delta_seconds(output_msg["recrawl_duration"])
        return output_msg
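
predict() reads a fixed set of url_info fields. A hypothetical input for the "error" branch, with illustrative values (the field names come from the method, everything else is made up):

import datetime

url_info = {
    "url_class": "details",              # illustrative class label
    "crawl_status": "error",
    "last_crawled": datetime.datetime(2013, 5, 1, 12, 0),
    "last_modified": datetime.datetime(2013, 4, 30, 12, 0),
    "first_modified": datetime.datetime(2013, 4, 1, 12, 0),
    "modified_count": 3,
    "retry_count": 0,
    "redirect_count": 0,
}
# with retry_count below max_retry_count, the method schedules a retry:
# recrawl_time/recrawl_duration/recrawl_priority are filled in and
# retry_count_inc is set to True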
Code Example #8
File: redis_client.py Project: qwang2505/ccrawler
    def _reconnect(args, msg, e):
        if len(args) == 0 or not isinstance(args[0], RedisClient):
            raise Exception("unsupported decorator, it should be applied to RedisClient methods")
        self = args[0]

        while self._stop_condition is None or not self._stop_condition():
            logging.error("redis connection error: %s, %s reconnecting..." % (msg, e))
            time.sleep(5)
            try:
                self._wait()
                return True
            except redis.exceptions.ConnectionError as e:
                pass

        logging.warn("whole process is terminating")
        raise Exception("whole process is terminating")
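
_reconnect looks like the retry half of a decorator for RedisClient methods. A minimal sketch of how such a decorator could use it, assuming it re-invokes the wrapped method after a successful reconnect; the project's real decorator is not shown here and may differ.

def reconnecting(method):
    def wrapper(*args, **kwargs):
        while True:
            try:
                return method(*args, **kwargs)
            except redis.exceptions.ConnectionError as e:
                # blocks until the connection is back, or raises when the
                # stop condition says the whole process is terminating
                _reconnect(args, method.__name__, e)
    return wrapper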
Code Example #9
File: heartbeat.py Project: qwang2505/ccrawler
    def repair_service(self, info_dict, config):
        """
        Automatically repair dead services.
        """
        command_dict = self._heart_beat_config["repair_command"]
        for item_id in info_dict:
            item = info_dict[item_id]
            if "class_name" in item and item['class_name'] in command_dict:
                user = config["DEFAULT"].get("user")
                password = config["DEFAULT"].get("password")
                host = item['server_ip']
                class_name = item['class_name']
                command = command_dict[class_name]
                concurrency_expected = str(item["expected_concurrency"] - item["actual_concurrency"])
                try:
                    self.ssh_cmd(user, host, password, command % concurrency_expected)
                except Exception as e:
                    logging.error(e)
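
The repair command is formatted with a single %s placeholder that receives the number of missing workers (expected minus actual concurrency). A hypothetical repair_command mapping; the class names, paths, and scripts below are illustrative only:

repair_command = {
    # class name -> shell command; %s receives expected_concurrency - actual_concurrency
    "CrawlHandler": "cd /opt/ccrawler && ./start_handler.sh CrawlHandler %s",
    "LinkExtractor": "cd /opt/ccrawler && ./start_handler.sh LinkExtractor %s",
}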
Code Example #10
    def initialize(self):
        # enable dns cache
        if settings.dns_cache_enabled:
            dns_cache.enable_dns_cache()

        # enable user agent rotate
        if settings.user_agent_rotation_enabled:
            self._user_agent_list = misc.load_user_agent(settings.user_agent_file)
            if len(self._user_agent_list) == 0:
                self._user_agent_list.append(settings.default_user_agent)
                logging.error("user agent can't be downloaded, use default one")
        else:
            self._user_agent_list = [settings.default_user_agent]

        # initialize downloader, use twisted or urllib2
        if settings.downloader_type == "twisted":
            self._downloader = downloaders.TwistedDownloader()
        elif settings.downloader_type == "urllib2":
            self._downloader = downloaders.UrlLib2Downloader()
        else:
            raise Exception("unsupported downloader type")
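
initialize() consults a few module-level settings. A hypothetical settings block covering just the names read here, with illustrative values:

# settings.py (hypothetical values; only the names used by initialize())
dns_cache_enabled = True
user_agent_rotation_enabled = False
user_agent_file = "user_agents.txt"
default_user_agent = "Mozilla/5.0 (compatible; ccrawler)"
downloader_type = "urllib2"          # or "twisted"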
Code Example #11
File: decoder.py Project: qwang2505/ccrawler
def decode(url, headers, body, encoding=None):
    '''
    decode html to unicode
    '''

    try_count = 0

    while True:
        if encoding is not None:
            try_count = -1
        else:
            encoding, try_count = _try_get_encoding(headers, body, try_count)
            if encoding is None:
                logging.error("decoding failed for url", url)
                return None, None
        html = _try_decode(url, body, encoding)
        if html is not None:
            logging.debug('decode url succeeded', url = url, encoding = encoding, try_count = try_count)
            return html, encoding
        else:
            try_count += 1
            encoding = None
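
A hypothetical call to decode(); the headers dict and body below are purely illustrative (the exact headers format expected by _try_get_encoding is not shown in this snippet), and an encoding can also be forced by passing it explicitly:

raw_body = "<html><head><meta charset='utf-8'></head><body>\xe4\xb8\xad\xe6\x96\x87</body></html>"
html, encoding = decode(
    "http://www.example.com/",
    {"Content-Type": "text/html; charset=utf-8"},
    raw_body,
)
if html is None:
    pass  # every encoding guess failed; the caller has to drop the page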
Code Example #12
File: crawl_handler.py Project: qwang2505/ccrawler
    def _process(self, message):
        # normalize url
        url = url_analyser.normalize_url(message["url"])
        if url is None:
            logging.error("invalid url for crawl", url = message["url"])
            return {"status" : -1}
        message["url"] = url

        #fill optional fields
        url_info = misc.clone_dict(message, fields = ["url", "source", "root_url", "parent_url", "crawl_priority", "crawl_depth"])
        self._assign_url_info_defaults(url_info)

        if url_info["root_url"] is None:
            url_info["root_url"] = url

        #determine crawl priority/depth
        is_valid, url_info["crawl_priority"], url_info["crawl_depth"] = crawl_priority_and_depth_evaluator.evaluate(url, url_info["source"], url_info)
        if not is_valid:
            return {"status" : -1}

        # stores to urlRepository table
        url_info["page_last_modified"] = None
        url_info["crawl_status"] = "crawling"
        url_info["last_crawled"] = None
        url_info["original_url"] = None
        # all urls are static now
        url_info["crawl_type"] = "static"
        # TODO: add to crawler db. This should not be done here;
        # some projects do not need to store url info in the database,
        # so this kind of action should be handled by middleware
        #success, promoted = crawlerdb.add_url_info(url, url_info, True)

        if message["source"] != "redirected":
            # notify crawler
            message_type, crawler_message = CrawlerUtils.build_crawler_request_msg(url, url_info)
            handler.HandlerRepository.process(message_type, crawler_message)

        return {"status" : 1}
Code Example #13
    def normalize_url(self, url, base_url = None):
        if url is None or len(url) == 0:
            return None

        original_url = url
        #Note: here assume all non-unicode urls are encoded by utf-8
        if isinstance(url, str):
            url = url.decode("utf-8")

        if not isinstance(url, unicode):
            logging.error("invalid normalized url, url is not unicode", url = original_url, base_url = base_url)
            return None

        url = url.replace('%20', ' ').strip()

        #fix http scheme:
        url = self._fix_http_scheme(url)

        #handle relative url
        if base_url is not None:
            url = urlparse.urljoin(base_url, url)

        #common normalization
        try:
            url = urlnorm.norm(url)
        except Exception as e:
            logging.warn("invalid normalized url, urlnorm raised exception", url = original_url, base_url = base_url, exception = e)
            return None

        try:
            parse_result = urlparse.urlparse(url)
        except Exception as e:
            logging.warn("invalid normalized url, when parsing url", url = original_url, base_url = base_url)
            return None

        if not parse_result.scheme.lower() in self._settings["general_crawl_policies"]["supported_schemes"]:
            logging.warn("invalid normalized url, not supported schemes", url = original_url, base_url = base_url)
            return None

        netloc = parse_result.netloc
        host = parse_result.netloc.split(':')[0]
        if ip_regex.match(host) is None: #if it's not an ip host

            #check if domain and tld exists
            subdomain, domain, tld = tldextract.extract(host)
            if len(domain) == 0 or len(tld) == 0:
                logging.warn("invalid normalized url, no domain or tld", url = original_url, base_url = base_url)
                return None

            #fix chinese punctuation
            for i in range(len(chinese_punctuation_map[0])):
                src = chinese_punctuation_map[0][i]
                dst = chinese_punctuation_map[1][i]
                netloc = netloc.replace(src, dst)

            #add www if not exists
            if len(subdomain) == 0:
                netloc = "www." + netloc

        fragment = parse_result.fragment
        if not fragment.startswith("!"): #Google's recommendation for ajax request
            fragment = ""
        if len(parse_result.scheme) == 0 or len(netloc) == 0:
            logging.warn("invalid normalized url, scheme or netloc is none", url = original_url, base_url = base_url)
            return None

        url = urlparse.urlunparse((parse_result.scheme, netloc, parse_result.path, parse_result.params, parse_result.query, fragment))

        #canonicalize url
        #Note: it's too strong, and sometimes change the url semantics.
        #url = ccrawler.utils.url.canonicalize_url(url)

        url = url.strip()
        if len(url) > self._settings["general_crawl_policies"]["max_url_length"]:
            logging.warn("invalid normalized url, length exceeded", url = original_url, base_url = base_url)
            return None
        elif len(url) == 0:
            logging.warn("invalid normalized url, length too short", url = original_url, base_url = base_url)
            return None
        else:
            return url
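
Hypothetical calls, assuming an initialized instance of the class this method belongs to (the class name is not part of the snippet); the comments describe only behavior visible in the code above:

analyser.normalize_url(u"http://www.example.com/a/../b.html")                # cleaned up by urlnorm.norm
analyser.normalize_url("/news/1.html", base_url="http://www.example.com/")   # joined via urlparse.urljoin
analyser.normalize_url(u"http://www.example.com/page#section")               # fragment not starting with "!" is dropped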