Example 1
    def _update_url_info(cls, url, update_map, inc_map, cond=None, with_get=False, fields=[]):
        deleting = "crawl_status" in update_map and update_map["crawl_status"] in ["failed", "notAlive"]
        if not deleting:
            cache_update_map = misc.clone_dict(update_map, UrlCacheClient._fields, soft=True)
            cache_inc_map = misc.clone_dict(inc_map, UrlCacheClient._fields, soft=True)
        else:
            cache_update_map = {}
            cache_inc_map = {}

        ret_value = common_settings.cache_client().set("url", url, update_map = cache_update_map, inc_map = cache_inc_map, cond = cond, with_get = with_get, fields = fields)

        if deleting:
            common_settings.cache_client().delete("url", url)
            common_settings.cache_client().set("url_dedup", url, data = "0")

        return ret_value
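
Both this example and the ones that follow project dictionaries through misc.clone_dict. The misc module itself is not part of this listing; a minimal sketch of the semantics these calls appear to rely on (copy only the listed fields, with soft=True skipping fields that are absent from the source) could look like this:

def clone_dict(source, fields, soft=False):
    # Copy only the listed fields into a new dict.
    # With soft=True, fields missing from source are skipped entirely;
    # otherwise a missing field is copied as None.
    if soft:
        return {f: source[f] for f in fields if f in source}
    return {f: source.get(f) for f in fields}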
Example 2
    def _handle_redirect(self, url, message):
        original_url = message["original_url"]

        #Note: double check if the whole flow is consistent
        #add redirected url_info by crawl_handler
        crawl_request_msg = {"url" : url, "source" : "redirected", "parent_url" : original_url, "root_url" : url, "crawl_priority" : message["crawl_priority"], "crawl_depth" : message["crawl_depth"]}
        result = handler.HandlerRepository.process("crawl_request", crawl_request_msg, force_inproc = True)
        if result["status"] >= 0:
            logging.debug(self._log_formatter("redirected succeeded", url = url, original_url = original_url))
            #handle redirected url crawler_response
            crawler_response_msg = misc.clone_dict(message, ["url", "status", "doc", "headers", "page_last_modified", "last_crawled", "error_message"])
            crawler_response_msg["original_url"] = url
            # get url_info from message meta instead of db
            #url_info = crawlerdb.get_url_info(url, common_settings.crawler_msg_meta_fields)
            url_info = message['meta']
            crawler_response_msg["meta"] = url_info
            result = handler.HandlerRepository.process("crawler_response", crawler_response_msg)

            #handle original url crawler_response
            message["url"] = original_url
            message["redirect_url"] = url
            message["status"] = 801
        else:
            message["url"] = original_url
            message["status"] = 802
Example 3
    def build_crawler_request_msg(cls, url, url_info):
        message = misc.clone_dict(url_info, ["url", "page_last_modified"])
        message["__priority"] = url_info["crawl_priority"]
        message["meta"] = misc.clone_dict(url_info, common_settings.crawler_msg_meta_fields)
        if common_settings.strong_politeness:
            message["__group_hash"] = url_info["full_domain"]
        else:
            message["__group_hash"] = misc.md5(url)

        if url_info["crawl_type"] == "static":
            message_type = "__internal_crawler_request"
        elif url_info["crawl_type"] == "dynamic":
            message_type = "__internal_dynamic_crawler_request"
        else:
            raise Exception("unsupported crawl_type %s" % url_info["crawl_type"])

        return message_type, message
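
build_crawler_request_msg chooses __group_hash so the crawler can throttle requests per group: with common_settings.strong_politeness every URL from the same full domain falls into one group, otherwise URLs are spread out by their own hash. A self-contained sketch of that choice, assuming misc.md5 is a hex MD5 helper:

import hashlib

def group_hash(url, full_domain, strong_politeness):
    # Strong politeness: serialize all requests to a domain behind one key.
    if strong_politeness:
        return full_domain
    # Otherwise distribute URLs across crawler workers by hashing the URL.
    return hashlib.md5(url.encode("utf-8")).hexdigest()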
    def process_crawler_response(self, result):
        if "url" not in result:
            return None
        if result["status"] == 700:
            self.crawl_url(self._async_mode, result["url"], result["meta"], self.get_user_agent(), None, result["meta"]["page_last_modified"])
            return result["meta"]
        else:
            #send crawler_response message
            input_msg = result["meta"]
            fields = ["url", "status", "doc", "headers"]
            message = misc.clone_dict(result, fields)
            message["page_last_modified"] = input_msg["page_last_modified"]
            message["original_url"] = input_msg["url"]
            message["last_crawled"] = datetime2timestamp(datetime.datetime.utcnow())
            message["error_message"] = result.get("error_message", None)
            message["meta"] = input_msg["meta"]
            message["meta"]["crawl_type"] = "static"
            if result["headers"] is not None and result["headers"].has_key("Last-Modified"):
                message["page_last_modified"] = result["headers"].get('Last-Modified')

            handler.HandlerRepository.process("__internal_crawler_response", message)
            return result["meta"]
Example 5
    def _process(self, message):
        # normalize url
        url = url_analyser.normalize_url(message["url"])
        if url is None:
            logging.error("invalid url for crawl", url = message["url"])
            return {"status" : -1}
        message["url"] = url

        #fill optional fields
        url_info = misc.clone_dict(message, fields = ["url", "source", "root_url", "parent_url", "crawl_priority", "crawl_depth"])
        self._assign_url_info_defaults(url_info)

        if url_info["root_url"] is None:
            url_info["root_url"] = url

        # determine crawl priority/depth
        is_valid, url_info["crawl_priority"], url_info["crawl_depth"] = crawl_priority_and_depth_evaluator.evaluate(url, url_info["source"], url_info)
        if not is_valid:
            return {"status" : -1}

        # stores to urlRepository table
        url_info["page_last_modified"] = None
        url_info["crawl_status"] = "crawling"
        url_info["last_crawled"] = None
        url_info["original_url"] = None
        # all urls are static for now
        url_info["crawl_type"] = "static"
        # TODO: add to the crawler db; this should not be done here.
        # Some projects do not need to store url info in a database,
        # so actions like this should go through middleware instead.
        #success, promoted = crawlerdb.add_url_info(url, url_info, True)

        if message["source"] != "redirected":
            # notify crawler
            message_type, crawler_message = CrawlerUtils.build_crawler_request_msg(url, url_info)
            handler.HandlerRepository.process(message_type, crawler_message)

        return {"status" : 1}
    def predict(self, url, url_info, extras = None):
        output_msg = {"crawl_status" : "alive", "recrawl_time" : None, "recrawl_duration" : None, "recrawl_priority" : None, "retry_count_inc" : False, "redirect_count_inc" : False}
        if url_info["url_class"] is None:
            url_info["url_class"] = "undefined"

        if url_info["last_crawled"] is None:
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "unexpected"
            output_msg["error_message"] = "last_crawled is None"
        elif url_info["crawl_status"] == "alive":
            if url_info["modified_count"] <= 0 or url_info["url_class"] is None or url_info["last_modified"] is None or url_info["first_modified"] is None:
                output_msg["crawl_status"] = "failed"
                output_msg["error_type"] = "unexpected"
                output_msg["error_message"] = "any of url_class/last_modified/first_modified is none, or modified_count <= 0: %s" % misc.clone_dict(url_info, ["modified_count", "url_class", "last_modified", "first_modified"])
            else:
                need_recrawl = self._recrawling_url(url, url_info["url_class"])
                if need_recrawl:
                    alive, output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_recrawl_time_and_priority(url_info)
                    if not alive:
                        output_msg["crawl_status"] = "notAlive"
                else:
                    output_msg["crawl_status"] = "notAlive"
        elif url_info["crawl_status"] == "error":
            if url_info["retry_count"] >= self._settings["recrawl_policies"]["max_retry_count"]:
                output_msg["crawl_status"] = "failed"
                output_msg["error_type"] = "crawl_error"
                output_msg["error_message"] = "retry count exceeded %d" % self._settings["recrawl_policies"]["max_retry_count"]
            else:
                output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_retry_time_and_priority(url_info)
                output_msg["retry_count_inc"] = True
        elif url_info["crawl_status"] == "redirected":
            if url_info["redirect_count"] >= self._settings["recrawl_policies"]["max_redirect_count"]:
                output_msg["crawl_status"] = "notAlive"
            else:
                output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_redirect_time_and_priority(url_info)
                output_msg["redirect_count_inc"] = True
        else:
            logging.error("unexpected crawl status", url = url, crawl_status = url_info["crawl_status"])
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "unexpected"
            output_msg["error_message"] = "unexpected crawl status in recrawl:%s" % url_info["crawl_status"]

        if output_msg["recrawl_time"] is not None:
            output_msg["recrawl_time"] = datetime2timestamp(output_msg["recrawl_time"])

        if output_msg["recrawl_duration"] is not None:
            output_msg["recrawl_duration"] = misc.delta_seconds(output_msg["recrawl_duration"])
        return output_msg
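
predict returns recrawl_time as a timestamp and recrawl_duration as a number of seconds. The datetime2timestamp and misc.delta_seconds helpers are not part of the listing; a plausible sketch, assuming a Unix-epoch timestamp in seconds and a plain timedelta-to-seconds conversion:

import calendar

def datetime2timestamp(dt):
    # Convert a naive UTC datetime to a Unix timestamp in seconds (assumed unit).
    return calendar.timegm(dt.utctimetuple())

def delta_seconds(delta):
    # Convert a datetime.timedelta to a float number of seconds.
    return delta.total_seconds()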