def _update_url_info(cls, url, update_map, inc_map, cond=None, with_get=False, fields=[]):
    # urls that turn out dead ("failed"/"notAlive") are purged from the cache
    # instead of updated
    deleting = update_map.has_key("crawl_status") and \
        update_map["crawl_status"] in ["failed", "notAlive"]
    if not deleting:
        cache_update_map = misc.clone_dict(update_map, UrlCacheClient._fields, soft=True)
        cache_inc_map = misc.clone_dict(inc_map, UrlCacheClient._fields, soft=True)
    else:
        cache_update_map = {}
        cache_inc_map = {}
    ret_value = common_settings.cache_client().set("url", url,
        update_map=cache_update_map, inc_map=cache_inc_map,
        cond=cond, with_get=with_get, fields=fields)
    if deleting:
        # drop the cached record and reset the url_dedup entry to "0"
        common_settings.cache_client().delete("url", url)
        common_settings.cache_client().set("url_dedup", url, data="0")
    return ret_value
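# Much of the bookkeeping above relies on misc.clone_dict with soft=True to
# project a map onto the cache schema. A minimal sketch of the assumed
# semantics (hypothetical reimplementation, not the real misc helper): soft
# mode copies only the listed fields that are actually present, so keys
# unknown to UrlCacheClient._fields never reach the cache.
def _clone_dict_sketch(source, fields, soft=False):
    if soft:
        return dict((k, source[k]) for k in fields if k in source)
    return dict((k, source[k]) for k in fields)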
def _handle_redirect(self, url, message):
    original_url = message["original_url"]
    # Note: double check that the whole flow is consistent
    # add redirected url_info by crawl_handler
    crawl_request_msg = {"url": url,
                         "source": "redirected",
                         "parent_url": original_url,
                         "root_url": url,
                         "crawl_priority": message["crawl_priority"],
                         "crawl_depth": message["crawl_depth"]}
    result = handler.HandlerRepository.process("crawl_request", crawl_request_msg, force_inproc=True)
    if result["status"] >= 0:
        logging.debug(self._log_formatter("redirect succeeded", url=url, original_url=original_url))
        # handle crawler_response for the redirected url
        crawler_response_msg = misc.clone_dict(message, ["url", "status", "doc", "headers",
                                                         "page_last_modified", "last_crawled",
                                                         "error_message"])
        crawler_response_msg["original_url"] = url
        # get url_info from the message meta instead of the db
        #url_info = crawlerdb.get_url_info(url, common_settings.crawler_msg_meta_fields)
        url_info = message["meta"]
        crawler_response_msg["meta"] = url_info
        result = handler.HandlerRepository.process("crawler_response", crawler_response_msg)
        # handle crawler_response for the original url
        message["url"] = original_url
        message["redirect_url"] = url
        message["status"] = 801
    else:
        message["url"] = original_url
        message["status"] = 802
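# The 801/802 codes written above are internal crawler statuses, not HTTP
# statuses. A hypothetical legend, inferred from this handler alone: 801
# means the redirect target was registered and both responses were emitted;
# 802 means registering the redirect target failed.
REDIRECT_HANDLED, REDIRECT_REGISTER_FAILED = 801, 802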
def build_crawler_request_msg(cls, url, url_info):
    message = misc.clone_dict(url_info, ["url", "page_last_modified"])
    message["__priority"] = url_info["crawl_priority"]
    message["meta"] = misc.clone_dict(url_info, common_settings.crawler_msg_meta_fields)
    if common_settings.strong_politeness:
        message["__group_hash"] = url_info["full_domain"]
    else:
        message["__group_hash"] = misc.md5(url)
    if url_info["crawl_type"] == "static":
        message_type = "__internal_crawler_request"
    elif url_info["crawl_type"] == "dynamic":
        message_type = "__internal_dynamic_crawler_request"
    else:
        raise Exception("unsupported crawl_type %s" % url_info["crawl_type"])
    return message_type, message
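# Worked example of the grouping rule above: strong politeness serializes
# all urls of one full domain through a single queue, otherwise requests are
# spread by a hash of the url. hashlib.md5 stands in for the assumed
# misc.md5 (hex digest of the url); names and values are hypothetical.
import hashlib

def _group_hash_sketch(url, full_domain, strong_politeness):
    if strong_politeness:
        return full_domain  # one crawl queue per full domain
    return hashlib.md5(url.encode("utf-8")).hexdigest()  # spread across queues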
def process_crawler_response(self, result):
    if not result.has_key("url"):
        return None
    if result["status"] == 700:
        # status 700 is treated as an immediate re-crawl request for the url
        self.crawl_url(self._async_mode, result["url"], result["meta"],
            self.get_user_agent(), None, result["meta"]["page_last_modified"])
        return result["meta"]
    else:
        # send crawler_response message
        input_msg = result["meta"]
        fields = ["url", "status", "doc", "headers"]
        message = misc.clone_dict(result, fields)
        message["page_last_modified"] = input_msg["page_last_modified"]
        message["original_url"] = input_msg["url"]
        message["last_crawled"] = datetime2timestamp(datetime.datetime.utcnow())
        message["error_message"] = result.get("error_message", None)
        message["meta"] = input_msg["meta"]
        message["meta"]["crawl_type"] = "static"
        # prefer the Last-Modified reported by the server, when present
        if result["headers"] is not None and result["headers"].has_key("Last-Modified"):
            message["page_last_modified"] = result["headers"].get("Last-Modified")
        handler.HandlerRepository.process("__internal_crawler_response", message)
        return result["meta"]
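# A minimal sketch of the datetime2timestamp helper used above
# (hypothetical; assumes epoch seconds computed from a naive UTC datetime):
import calendar
import datetime

def _datetime2timestamp_sketch(dt):
    return calendar.timegm(dt.utctimetuple())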
def _process(self, message):
    # normalize url
    url = url_analyser.normalize_url(message["url"])
    if url is None:
        logging.error("invalid url for crawl", url=message["url"])
        return {"status": -1}
    message["url"] = url
    # fill optional fields
    url_info = misc.clone_dict(message, fields=["url", "source", "root_url",
        "parent_url", "crawl_priority", "crawl_depth"])
    self._assign_url_info_defaults(url_info)
    if url_info["root_url"] is None:
        url_info["root_url"] = url
    # determine crawl priority/depth
    is_valid, url_info["crawl_priority"], url_info["crawl_depth"] = \
        crawl_priority_and_depth_evaluator.evaluate(url, url_info["source"], url_info)
    if not is_valid:
        return {"status": -1}
    # stores to urlRepository table
    url_info["page_last_modified"] = None
    url_info["crawl_status"] = "crawling"
    url_info["last_crawled"] = None
    url_info["original_url"] = None
    # all urls are static now
    url_info["crawl_type"] = "static"
    # TODO: add to crawler db; this should not be done here. Some projects
    # do not need to store url info in a database, so a middleware should
    # handle these kinds of actions.
    #success, promoted = crawlerdb.add_url_info(url, url_info, True)
    if message["source"] != "redirected":
        # notify crawler
        message_type, crawler_message = CrawlerUtils.build_crawler_request_msg(url, url_info)
        handler.HandlerRepository.process(message_type, crawler_message)
    return {"status": 1}
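# Hypothetical minimal crawl_request message accepted by _process; the field
# names come from the clone_dict call above, the values are invented, and
# the None fields are filled by _assign_url_info_defaults:
example_crawl_request = {
    "url": "http://example.com/page",
    "source": "parsed",  # invented source tag; any value except "redirected" notifies the crawler
    "root_url": None,
    "parent_url": None,
    "crawl_priority": None,
    "crawl_depth": None,
}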
def predict(self, url, url_info, extras=None):
    output_msg = {"crawl_status": "alive",
                  "recrawl_time": None,
                  "recrawl_duration": None,
                  "recrawl_priority": None,
                  "retry_count_inc": False,
                  "redirect_count_inc": False}
    if url_info["url_class"] is None:
        url_info["url_class"] = "undefined"
    if url_info["last_crawled"] is None:
        output_msg["crawl_status"] = "failed"
        output_msg["error_type"] = "unexpected"
        output_msg["error_message"] = "last_crawled is None"
    elif url_info["crawl_status"] == "alive":
        if url_info["modified_count"] <= 0 or url_info["url_class"] is None or \
                url_info["last_modified"] is None or url_info["first_modified"] is None:
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "unexpected"
            output_msg["error_message"] = "any of url_class/last_modified/first_modified is None, or modified_count <= 0: %s" % \
                misc.clone_dict(url_info, ["modified_count", "url_class", "last_modified", "first_modified"])
        else:
            need_recrawl = self._recrawling_url(url, url_info["url_class"])
            if need_recrawl:
                alive, output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = \
                    self._get_recrawl_time_and_priority(url_info)
                if not alive:
                    output_msg["crawl_status"] = "notAlive"
            else:
                output_msg["crawl_status"] = "notAlive"
    elif url_info["crawl_status"] == "error":
        if url_info["retry_count"] >= self._settings["recrawl_policies"]["max_retry_count"]:
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "crawl_error"
            output_msg["error_message"] = "retry count exceeded %d" % \
                self._settings["recrawl_policies"]["max_retry_count"]
        else:
            output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = \
                self._get_retry_time_and_priority(url_info)
            output_msg["retry_count_inc"] = True
    elif url_info["crawl_status"] == "redirected":
        if url_info["redirect_count"] >= self._settings["recrawl_policies"]["max_redirect_count"]:
            output_msg["crawl_status"] = "notAlive"
        else:
            output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = \
                self._get_redirect_time_and_priority(url_info)
            output_msg["redirect_count_inc"] = True
    else:
        logging.error("unexpected crawl status", url=url, crawl_status=url_info["crawl_status"])
        output_msg["crawl_status"] = "failed"
        output_msg["error_type"] = "unexpected"
        output_msg["error_message"] = "unexpected crawl status in recrawl: %s" % url_info["crawl_status"]
    if output_msg["recrawl_time"] is not None:
        output_msg["recrawl_time"] = datetime2timestamp(output_msg["recrawl_time"])
    if output_msg["recrawl_duration"] is not None:
        output_msg["recrawl_duration"] = misc.delta_seconds(output_msg["recrawl_duration"])
    return output_msg
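# Hypothetical shape of the recrawl policy settings consulted by predict;
# the keys are taken from the lookups above, the values are invented for
# illustration only:
EXAMPLE_RECRAWL_SETTINGS = {
    "recrawl_policies": {
        "max_retry_count": 3,      # "error" urls become "failed" past this
        "max_redirect_count": 5,   # "redirected" urls become "notAlive" past this
    },
}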