def add_url_info(url, url_info, merge = False):
    """ Enabled cache """
    assign_url_info_defaults(url, url_info)
    existed, alive = UrlCacheClient.check_url_exists(url)
    if not existed:
        _insert_url_info(url, url_info)
        return True, False
    elif alive and merge:
        now = datetime2timestamp(datetime.datetime.utcnow())
        update_map = {"last_discovered" : now}
        # duplicate crawl request merge, will promote crawl_priority/crawl_depth if any
        fields = ["crawl_priority", "crawl_depth", "crawl_status", "url_class", "last_crawled"]
        existing_url_info = get_url_info(url, fields = fields)
        if existing_url_info is None:
            return False, False
        priority_promoted = _merge_url_info(url, existing_url_info, url_info, update_map)
        promoted = False
        misc.copy_dict(existing_url_info, url_info, fields = ["crawl_status", "url_class", "last_crawled"])
        if common_settings.core_settings["general_crawl_policies"]["preemptive_priority_promotion"] and url_info["last_crawled"] is None and priority_promoted:
            if url_info["crawl_status"] == "crawling":
                update_map["expires"] = now
                promoted = True
        update_url_info(url, update_map, {"discovered_count" : 1})
        return False, promoted
    else:
        return False, False
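# A minimal, self-contained sketch of the insert-or-merge pattern that
# add_url_info implements above. An in-memory dict stands in for
# UrlCacheClient and the url repository; `_store`, `discover`, and the
# "lower number = higher priority" convention are illustrative assumptions,
# not the project's actual storage or API.
import time

_store = {}  # url -> url_info dict (stand-in for the repository)

def discover(url, url_info):
    """Return (inserted, promoted), mirroring add_url_info's contract."""
    existing = _store.get(url)
    if existing is None:
        url_info.setdefault("discovered_count", 1)
        _store[url] = url_info
        return True, False                       # first discovery: plain insert
    # duplicate discovery: merge, keeping the strongest crawl_priority
    promoted = url_info.get("crawl_priority", 99) < existing.get("crawl_priority", 99)
    if promoted:
        existing["crawl_priority"] = url_info["crawl_priority"]
    existing["last_discovered"] = time.time()
    existing["discovered_count"] = existing.get("discovered_count", 0) + 1
    return False, promoted

# Example: a second discovery of the same URL only bumps counters/priority.
print(discover("http://example.com/", {"crawl_priority": 2}))  # (True, False)
print(discover("http://example.com/", {"crawl_priority": 1}))  # (False, True)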
def _process(self, message):
    update_map = {}
    misc.copy_dict(message["meta"], message, common_settings.crawler_msg_meta_fields)
    url = message["url"]
    if url != message["original_url"]:
        self._handle_redirect(url, message)

    # decode some message fields
    # message updated fields: headers, page_last_modified
    self._decode_fields(url, message)

    # init some message fields
    message["crawl_status"] = "alive"
    self._merge_error_message("crawl_error", message.get("error_message", None), update_map)
    #update_map["redirect_url"] = None

    # main process
    # required message fields: status, original_url, doc, headers, encoding/encoding_created_time, crawl_type, full_domain
    # update_map updated fields: encoding/encoding_created_time, doc, headers, first_modified, last_modified, error_type/message; modified_count
    # message updated fields: crawl_status, doc, first_modified, last_modified, modified_count
    # db updated fields: md5
    message["crawl_status"], decoded_doc, error_type, error_message = self._process_main(url, message, update_map)
    #logging.debug("crawler_response process_main", crawl_status = message["crawl_status"], md5_hash = md5_hash, error_message = error_message)

    misc.copy_dict(update_map, message, ["doc", "first_modified", "last_modified"], soft = True)
    self._merge_error_message(error_type, error_message, update_map)

    # process crawl_response message here
    if decoded_doc is not None:
        message['doc'] = decoded_doc
    handler.HandlerRepository.process("crawl_response", message)
    return message
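# The final dispatch above goes through handler.HandlerRepository, whose
# implementation is not shown in this section. The stand-in below is inferred
# from the single call site (named events fanned out to registered callables)
# and is an assumption, not the project's real class.
class HandlerRepository(object):
    _handlers = {}  # event name -> list of callables

    @classmethod
    def register(cls, event, fn):
        cls._handlers.setdefault(event, []).append(fn)

    @classmethod
    def process(cls, event, message):
        # every handler registered for the event sees the same mutable message
        for fn in cls._handlers.get(event, []):
            fn(message)

# Usage: a downstream consumer subscribed to crawl responses.
def log_status(message):
    print(message["crawl_status"])

HandlerRepository.register("crawl_response", log_status)
HandlerRepository.process("crawl_response", {"crawl_status": "alive"})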
def _insert_url_info(url, url_info):
    UrlCacheClient.update_url_info(url, url_info)
    # split url_info into main-table fields and meta-table fields
    first_update_map, second_update_map = misc.separate_dict(url_info, common_settings.database_table_fields["urlRepositoryMeta"])
    # duplicate the shared fields (plus the record keys) into the meta record
    misc.copy_dict(first_update_map, second_update_map, common_settings.common_url_info_fields + ["url", "_id"])
    db.urlRepository.insert(first_update_map)
    crawlerMetadb.insert_url_info_meta(second_update_map)
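# Both _insert_url_info and _make_update lean on misc.separate_dict and
# misc.copy_dict. Their implementations are not shown here; the versions
# below are inferred from the call sites (split by a field whitelist; copy a
# field list, with soft=True preserving existing keys) and may differ in detail.
def separate_dict(d, fields):
    """Split d into (rest, selected): keys listed in `fields` go to the second dict."""
    selected = dict((k, v) for k, v in d.items() if k in fields)
    rest = dict((k, v) for k, v in d.items() if k not in fields)
    return rest, selected

def copy_dict(src, dst, fields, soft = False):
    """Copy `fields` from src to dst; soft=True never overwrites existing dst keys."""
    for k in fields:
        if k in src and not (soft and k in dst):
            dst[k] = src[k]

main_part, meta_part = separate_dict({"url": "a", "encoding": "utf-8"}, ["encoding"])
# main_part == {"url": "a"}, meta_part == {"encoding": "utf-8"}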
def _make_update(update_map, inc_map = None):
    now = datetime2timestamp(datetime.datetime.utcnow())

    # add status_last_modified field
    if "crawl_status" in update_map:
        update_map["status_last_modified"] = now

    # separate url_info fields from meta_url_info fields
    first_update_map, second_update_map = misc.separate_dict(update_map, common_settings.database_table_fields["urlRepositoryMeta"])
    first_inc_map, second_inc_map = misc.separate_dict(inc_map if inc_map is not None else {}, common_settings.database_table_fields["urlRepositoryMeta"])
    misc.copy_dict(first_update_map, second_update_map, common_settings.common_url_info_fields, soft = True)
    misc.copy_dict(first_inc_map, second_inc_map, common_settings.common_url_info_fields, soft = True)

    first_update = _create_update(first_update_map, first_inc_map)
    second_update = _create_update(second_update_map, second_inc_map)
    return first_update, second_update
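# _create_update is not shown in this section. Since the maps feed what looks
# like a MongoDB collection (db.urlRepository above, plus increment-style
# counters such as {"discovered_count" : 1}), a plausible reading is that it
# assembles a standard update document from a $set map and an $inc map. This
# is an assumption, not the confirmed implementation.
def _create_update(update_map, inc_map):
    update = {}
    if update_map:
        update["$set"] = update_map
    if inc_map:
        update["$inc"] = inc_map
    return update

# e.g. _create_update({"crawl_status": "alive"}, {"discovered_count": 1})
# -> {"$set": {"crawl_status": "alive"}, "$inc": {"discovered_count": 1}}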