Example #1
import datetime

#misc, common_settings, UrlCacheClient, db, crawlerMetadb and datetime2timestamp
#are project-level helpers from the surrounding crawler codebase
def add_url_info(url, url_info, merge = False):
    """
    Enabled cache
    """

    assign_url_info_defaults(url, url_info)

    existed, alive = UrlCacheClient.check_url_exists(url)
    if not existed:
        _insert_url_info(url, url_info)
        return True, False
    elif alive and merge:
        now = datetime2timestamp(datetime.datetime.utcnow())
        update_map = {"last_discovered" : now}

        # duplicate crawl request merge, will promote crawl_priority/crawl_depth if any
        fields = ["crawl_priority", "crawl_depth", "crawl_status", "url_class", "last_crawled"]
        existing_url_info = get_url_info(url, fields = fields)
        if existing_url_info is None:
            return False, False
        priority_promoted = _merge_url_info(url, existing_url_info, url_info, update_map)

        promoted = False
        misc.copy_dict(existing_url_info, url_info, fields = ["crawl_status", "url_class", "last_crawled"])
        if (common_settings.core_settings["general_crawl_policies"]["preemptive_priority_promotion"]
                and url_info["last_crawled"] is None and priority_promoted):
            if url_info["crawl_status"] == "crawling":
                update_map["expires"] = now
                promoted = True

        update_url_info(url, update_map, {"discovered_count" : 1})
        return False, promoted
    else:
        return False, False
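A minimal usage sketch for add_url_info (hypothetical call; the URL and the
crawl_priority/crawl_depth values are illustrative, not from the source):

url_info = {
    "crawl_priority": 1,   #illustrative; real defaults are filled in by
    "crawl_depth": 2,      #assign_url_info_defaults(url, url_info)
}
inserted, promoted = add_url_info("http://example.com/page", url_info, merge = True)
if inserted:
    pass    #brand-new URL, written to the repository
elif promoted:
    pass    #duplicate discovery whose promoted priority expired the running crawl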
Example #2
    #method of a message-handling class (the class definition is not part of this example)
    def _process(self, message):
        update_map = {}

        misc.copy_dict(message["meta"], message, common_settings.crawler_msg_meta_fields)
        url = message["url"]
        if url != message["original_url"]:
            self._handle_redirect(url, message)

        #decode some message fields
        #message updated fields: headers, page_last_modified
        self._decode_fields(url, message)

        #init some message fields
        message["crawl_status"] = "alive"
        self._merge_error_message("crawl_error", message.get("error_message", None), update_map)
        #update_map["redirect_url"] = None

        #main process
        #required message fields: status, original_url, doc, headers, encoding/encoding_created_time, crawl_type, full_domain
        #update_map updated fields: encoding/encoding_created_time, doc, headers, first_modified, last_modified, error_type/message; modified_count
        #message updated fields: crawl_status, doc, first_modified, last_modified, modified_count
        #db updated fields: md5
        message["crawl_status"], decoded_doc, error_type, error_message  = self._process_main(url, message, update_map)
        #logging.debug("crawler_response process_main", crawl_status = message["crawl_status"], md5_hash = md5_hash, error_message = error_message)
        misc.copy_dict(update_map, message, ["doc", "first_modified", "last_modified"], soft = True)
        self._merge_error_message(error_type, error_message, update_map)
        # process crawl_response message here
        if decoded_doc is not None:
            message['doc'] = decoded_doc
        handler.HandlerRepository.process("crawl_response", message)
        return message
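misc.copy_dict is used throughout these examples but never shown. A sketch of
what its call sites imply (an assumption, not the project's actual code;
soft = True appears to mean "skip fields the source dict lacks"):

def copy_dict(source, target, fields, soft = False):
    #copy the listed fields from source into target; with soft = True,
    #silently skip any field that is missing from source
    for field in fields:
        if soft and field not in source:
            continue
        target[field] = source[field]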
Example #3
def _insert_url_info(url, url_info):
    UrlCacheClient.update_url_info(url, url_info)

    first_update_map, second_update_map = misc.separate_dict(url_info, common_settings.database_table_fields["urlRepositoryMeta"])
    misc.copy_dict(first_update_map, second_update_map, common_settings.common_url_info_fields + ["url", "_id"])

    db.urlRepository.insert(first_update_map)
    crawlerMetadb.insert_url_info_meta(second_update_map)
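misc.separate_dict splits one dict in two along a field list. A sketch of the
behavior _insert_url_info and _make_update imply (an assumption, since the
helper itself is not shown):

def separate_dict(source, fields):
    #split source into (rest, selected): selected holds the keys listed in
    #fields, rest holds all remaining keys
    selected = {k: v for k, v in source.items() if k in fields}
    rest = {k: v for k, v in source.items() if k not in fields}
    return rest, selected

Under this reading, the urlRepositoryMeta fields of url_info land in
second_update_map (written via crawlerMetadb) and everything else in
first_update_map (written to db.urlRepository).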
Example #4
def _make_update(update_map, inc_map = None):
    now = datetime2timestamp(datetime.datetime.utcnow())

    #add status_last_modified field
    if "crawl_status" in update_map:
        update_map["status_last_modified"] = now

    #separate url_info fields from meta_url_info fields
    first_update_map, second_update_map = misc.separate_dict(update_map, common_settings.database_table_fields["urlRepositoryMeta"])
    first_inc_map, second_inc_map = misc.separate_dict(inc_map if inc_map is not None else {}, common_settings.database_table_fields["urlRepositoryMeta"])
    misc.copy_dict(first_update_map, second_update_map, common_settings.common_url_info_fields, soft = True)
    misc.copy_dict(first_inc_map, second_inc_map, common_settings.common_url_info_fields, soft = True)

    first_update = _create_update(first_update_map, first_inc_map)
    second_update = _create_update(second_update_map, second_inc_map)
    return first_update, second_update
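_create_update is not shown either. Since db.urlRepository.insert looks like a
MongoDB collection call, a plausible sketch is a builder for a MongoDB-style
update document; the "$set"/"$inc" shape below is an assumption:

def _create_update(update_map, inc_map):
    #hypothetical: assemble a MongoDB update document, omitting empty parts
    update = {}
    if update_map:
        update["$set"] = update_map
    if inc_map:
        update["$inc"] = inc_map
    return update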