Example #1
def find_and_modify_expired_url_info(expired_time, fields):
    """
    Disabled cache
    """

    cond = {"recrawl_time" : {"$lte" : datetime2timestamp(expired_time)}, "crawl_status" : "alive"}
    now = datetime2timestamp(datetime.datetime.utcnow())
    update_map = {"crawl_status" : "crawling", "last_crawl_start_time" : now}
    return _cond_update_url_info(cond, update_map, None, fields)
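All of these examples rely on datetime2timestamp / timestamp2datetime helpers that are not part of this listing. A minimal sketch of what they might look like, assuming they convert naive UTC datetimes to an epoch-millisecond integer and back (the division by 1000 in Examples #8 and #14 hints at a millisecond epoch; the real helpers may differ):

import calendar
import datetime

def datetime2timestamp(dt):
    # naive UTC datetime -> epoch milliseconds (assumed convention)
    return int(calendar.timegm(dt.timetuple())) * 1000 + dt.microsecond // 1000

def timestamp2datetime(ts):
    # epoch milliseconds -> naive UTC datetime (assumed convention)
    return datetime.datetime.utcfromtimestamp(ts / 1000.0)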
Example #2
def add_inc(crawled_count = 0, modified_count = 0, processed_count = 0):
    inc_map = {"crawled_count" : crawled_count, "modified_count" : modified_count, "processed_count" : processed_count}

    now = datetime.datetime.utcnow()
    checkpoint = datetime.datetime(now.year, now.month, now.day, now.hour, now.minute)
    timestamp = datetime2timestamp(checkpoint)
    update_map = {"exactTime" : now, "checkpoint" : checkpoint}
    update = {"$set" : update_map, "$inc" : inc_map}
    db.minuteCounts.update({"_id" : timestamp}, update, upsert = True)
    db.minuteCounts.remove({"_id" : {"$lt" : datetime2timestamp(checkpoint - datetime.timedelta(seconds = TOTAL_MINUTE_ROW_COUNT))}})

    db.totalCounts.update({"_id" : "totalCounts"}, {"$set" : {"exactTime" : now, "timestamp" : timestamp}, "$inc" : inc_map}, upsert = True)
    db.totalCounts.update({"_id" : "totalCounts", "startTime" : None}, {"$set" : {"startTime" : now}})
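For illustration, a crawler worker might report progress once per page like this (a hypothetical call; the argument values are made up):

add_inc(crawled_count = 1, modified_count = 1)
# upserts the counter bucket for the current UTC minute in db.minuteCounts,
# drops buckets older than TOTAL_MINUTE_ROW_COUNT seconds, and increments
# the running totals in db.totalCounts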
Example #3
def assign_url_info_defaults(url, url_info):
    url_info["_id"] = misc.md5(url)
    now = datetime2timestamp(datetime.datetime.utcnow())
    url_info["created_time"] = now
    url_info["crawled_count"] = 0
    url_info["url_class"] = None
    url_info["error_messages"] = []
    #url_info["processed_count"] = 0
    #url_info["last_processed"] = None
    url_info["first_modified"] = None
    url_info["last_modified"] = None
    url_info["modified_count"] = 0
    url_info["valid_link_count"] = None
    url_info["retry_count"] = 0
    url_info["status_last_modified"] = now
    url_info["encoding"] = None
    url_info["encoding_created_time"] = None
    url_info["redirect_url"] = None
    #url_info["last_finished"] = None
    #url_info["expires"] = now
    url_info["doc"] = None
    url_info["headers"] = None
    url_info["md5"] = None
    #url_info["process_status"] = True
    url_info["last_discovered"] = now
    url_info["discovered_count"] = 1
    url_info["comments"] = ""
    url_info["redirect_count"] = 0
    url_info["recrawl_time"] = now
    url_info["recrawl_duration"] = 0
    url_info["recrawl_priority"] = url_info["crawl_priority"]

    _, full_domain, _ = misc.get_url_domain_info(url)
    url_info["full_domain"] = full_domain
Example #4
def add_url_info(url, url_info, merge = False):
    """
    Enabled cache
    """

    assign_url_info_defaults(url, url_info)

    existed, alive = UrlCacheClient.check_url_exists(url)
    if not existed:
        _insert_url_info(url, url_info)
        return True, False
    elif alive and merge:
        now = datetime2timestamp(datetime.datetime.utcnow())
        update_map = {"last_discovered" : now}

        # duplicate crawl request: merge it, promoting crawl_priority/crawl_depth if needed
        fields = ["crawl_priority", "crawl_depth", "crawl_status", "url_class", "last_crawled"]
        existing_url_info = get_url_info(url, fields = fields)
        if existing_url_info is None:
            return False, False
        priority_promoted = _merge_url_info(url, existing_url_info, url_info, update_map)

        promoted = False
        misc.copy_dict(existing_url_info, url_info, fields = ["crawl_status", "url_class", "last_crawled"])
        if common_settings.core_settings["general_crawl_policies"]["preemptive_priority_promotion"] and url_info["last_crawled"] is None and priority_promoted:
            if url_info["crawl_status"] == "crawling":
                update_map["expires"] = now
                promoted = True

        update_url_info(url, update_map, {"discovered_count" : 1})
        return False, promoted
    else:
        return False, False
Example #5
def _create_update(update_map, inc_map):
    # nothing to update
    if not update_map and not inc_map:
        return None

    now = datetime2timestamp(datetime.datetime.utcnow())
    update = {"$set" : update_map}
    # only add $inc when there is something to increment
    if inc_map:
        update["$inc"] = inc_map

    #adjust error_message field
    if update_map.has_key("error_message"):
        if not update_map.has_key("error_type"):
            raise Exception("error_type is required if error_message is set")
    if update_map.has_key("error_type"):
        if not update_map.has_key("error_message"):
            raise Exception("error_message is required if error_type is set")

    if update_map.has_key("error_message"):
        error_message = update_map["error_message"]
        error_type = update_map["error_type"]
        error_message = {"timestamp": now, "message" : error_message, "type" : error_type}
        update["$push"] = {"error_messages" : error_message}
        update_map.pop("error_message")
        update_map.pop("error_type")
    return update
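To illustrate the shape of the update document this helper produces (hypothetical argument values; note that error_message / error_type are popped from the caller's update_map as a side effect):

update = _create_update({"crawl_status" : "error", "error_message" : "connection timed out", "error_type" : "crawl_error"}, {"retry_count" : 1})
# update is roughly:
# {"$set" : {"crawl_status" : "error"},
#  "$inc" : {"retry_count" : 1},
#  "$push" : {"error_messages" : {"timestamp" : <now>, "message" : "connection timed out", "type" : "crawl_error"}}}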
Example #6
def save_handler_counts(handler_counts, type):
    now = datetime2timestamp(datetime.datetime.utcnow())
    insert = {}
    insert["_id"] = misc.md5(str(now))
    insert["datetime"] = now
    insert["handler_counts"] = handler_counts
    insert["type"] = type
    return db.handlerStatistics.save(insert)
Example #7
def set(cache_type, cache_key, data, expiry=None, warn_count=500, max_count=1000):
    if not global_caches.has_key(cache_type):
        global_caches[cache_type] = {}

    current_cache = global_caches[cache_type]
    now = datetime.datetime.utcnow()
    timestamp = datetime2timestamp(now)
    if expiry is not None:
        expiry = datetime2timestamp(now + datetime.timedelta(seconds=expiry))

    # remove expired entries once the cache grows past warn_count
    if len(current_cache) >= warn_count:
        for key, item in current_cache.items():
            if item["expiry"] is not None and item["expiry"] <= timestamp:
                current_cache.pop(key)

    # removes oldest item if exceeded
    if not current_cache.has_key(cache_key) and len(current_cache) >= max_count:
        oldest_pair = min(current_cache.items(), key=lambda pair: pair[1]["timestamp"])
        current_cache.pop(oldest_pair[0])

    current_cache[cache_key] = {"data": data, "timestamp": timestamp, "expiry": expiry}
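A hypothetical usage of this in-process cache (the cache_type and key values are made up):

set("url_info", "http://www.example.com", {"crawl_status" : "alive"}, expiry = 300)
# the entry expires 300 seconds after insertion; once the cache holds warn_count
# items, expired entries are purged on the next set(), and when max_count is
# reached the entry with the oldest timestamp is evicted to make room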
Example #8
def _get_recent_time_count(n, hour_or_day):
    # aggregate the per-minute counters into hourly buckets (hour_or_day=True)
    # or daily buckets (hour_or_day=False) covering the last n hours/days
    now = datetime2timestamp(datetime.datetime.utcnow()) / 1000
    if hour_or_day:
        duration_mins = n * 60
    else:
        duration_mins = n * 60 * 24

    minute_counts = db.minuteCounts.find({"_id" : {"$gte" : now - 60 * duration_mins}})
    time_counts = {}
    for minute_count in minute_counts:
        timestamp = minute_count["_id"]
        time = timestamp2datetime(timestamp)
        if hour_or_day:
            checkpoint = datetime.datetime(time.year, time.month, time.day, time.hour)
        else:
            checkpoint = datetime.datetime(time.year, time.month, time.day)
        timestamp = datetime2timestamp(checkpoint)
        if not time_counts.has_key(timestamp):
            time_counts[timestamp] = {"timestamp" : timestamp, "checkpoint" : checkpoint, "crawled_count" : 0, "modified_count" : 0, "processed_count" : 0}
        time_counts[timestamp]["crawled_count"] += minute_count["crawled_count"]
        time_counts[timestamp]["modified_count"] += minute_count["modified_count"]
        time_counts[timestamp]["processed_count"] += minute_count["processed_count"]
    return sorted(time_counts.values(), key = lambda time_count : time_count["timestamp"])
Example #9
    def _once(self):
        now = datetime2timestamp(datetime.datetime.utcnow())
        message = {"datetime" : now, "ip" : self._ip, "handler_name" : self._handler_name,
            "pid" : self._process_id, "handler_key" : self._handler_key}

        try:
            self._client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self._client_socket.connect((self._server_address, self._server_port))
            self._client_socket.send(simplejson.dumps(message))
            logging.debug("heartbeat client sent message", message)
        except socket.error as e:
            logging.warn("socket error for heartbeat client", exception = e)
        finally:
            self._client_socket.close()
Example #10
    def predict(self, url, url_info, extras = None):
        output_msg = {"crawl_status" : "alive", "recrawl_time" : None, "recrawl_duration" : None, "recrawl_priority" : None, "retry_count_inc" : False, "redirect_count_inc" : False}
        if url_info["url_class"] is None:
            url_info["url_class"] = "undefined"

        if url_info["last_crawled"] is None:
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "unexpected"
            output_msg["error_message"] = "last_crawled is None"
        elif url_info["crawl_status"] == "alive":
            if url_info["modified_count"] <= 0 or url_info["url_class"] is None or url_info["last_modified"] is None or url_info["first_modified"] is None:
                output_msg["crawl_status"] = "failed"
                output_msg["error_type"] = "unexpected"
                output_msg["error_message"] = "any of url_class/last_modified/first_modified is none, or modified_count <= 0: %s" % misc.clone_dict(url_info, ["modified_count", "url_class", "last_modified", "first_modified"])
            else:
                need_recrawl = self._recrawling_url(url, url_info["url_class"])
                if need_recrawl:
                    alive, output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_recrawl_time_and_priority(url_info)
                    if not alive:
                        output_msg["crawl_status"] = "notAlive"
                else:
                    output_msg["crawl_status"] = "notAlive"
        elif url_info["crawl_status"] == "error":
            if url_info["retry_count"] >= self._settings["recrawl_policies"]["max_retry_count"]:
                output_msg["crawl_status"] = "failed"
                output_msg["error_type"] = "crawl_error"
                output_msg["error_message"] = "retry count exceeded %d" % self._settings["recrawl_policies"]["max_retry_count"]
            else:
                output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_retry_time_and_priority(url_info)
                output_msg["retry_count_inc"] = True
        elif url_info["crawl_status"] == "redirected":
            if url_info["redirect_count"] >= self._settings["recrawl_policies"]["max_redirect_count"]:
                output_msg["crawl_status"] = "notAlive"
            else:
                output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_redirect_time_and_priority(url_info)
                output_msg["redirect_count_inc"] = True
        else:
            logging.error("unexpected crawl status", url = url, crawl_status = url_info["crawl_status"])
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "unexpected"
            output_msg["error_message"] = "unexpected crawl status in recrawl:%s" % url_info["crawl_status"]

        if output_msg["recrawl_time"] is not None:
            output_msg["recrawl_time"] = datetime2timestamp(output_msg["recrawl_time"])

        if output_msg["recrawl_duration"] is not None:
            output_msg["recrawl_duration"] = misc.delta_seconds(output_msg["recrawl_duration"])
        return output_msg
Example #11
def _make_update(update_map, inc_map = None):
    now = datetime2timestamp(datetime.datetime.utcnow())

    #add status_last_modified field
    if update_map.has_key("crawl_status"):
        update_map["status_last_modified"] = now

    #separate url_info fields from meta_url_info fields
    first_update_map, second_update_map = misc.separate_dict(update_map, common_settings.database_table_fields["urlRepositoryMeta"])
    first_inc_map, second_inc_map = misc.separate_dict(inc_map if inc_map is not None else {}, common_settings.database_table_fields["urlRepositoryMeta"])
    misc.copy_dict(first_update_map, second_update_map, common_settings.common_url_info_fields, soft = True)
    misc.copy_dict(first_inc_map, second_inc_map, common_settings.common_url_info_fields, soft = True)

    first_update = _create_update(first_update_map, first_inc_map)
    second_update = _create_update(second_update_map, second_inc_map)
    return first_update, second_update
Example #12
    def _assign_url_info_defaults(self, url_info):
        now = datetime2timestamp(datetime.datetime.utcnow())
        url_info["created_time"] = now
        url_info["crawled_count"] = 0
        url_info["error_messages"] = []
        url_info["retry_count"] = 0
        url_info["encoding"] = None
        url_info["encoding_created_time"] = None
        url_info["redirect_url"] = None
        #url_info["last_finished"] = None
        #url_info["expires"] = now
        url_info["doc"] = None
        url_info["headers"] = None
        # TODO not used?
        url_info["md5"] = None
        #url_info["process_status"] = True
        url_info["comments"] = ""
        url_info["redirect_count"] = 0

        _, full_domain, _ = misc.get_url_domain_info(url_info['url'])
        url_info["full_domain"] = full_domain
Example #13
    def process_crawler_response(self, result):
        if not result.has_key("url"):
            return None
        if result["status"] == 700:
            self.crawl_url(self._async_mode, result["url"], result["meta"], self.get_user_agent(), None, result["meta"]["page_last_modified"])
            return result["meta"]
        else:
            #send crawler_response message
            input_msg = result["meta"]
            fields = ["url", "status", "doc", "headers"]
            message = misc.clone_dict(result, fields)
            message["page_last_modified"] = input_msg["page_last_modified"]
            message["original_url"] = input_msg["url"]
            message["last_crawled"] = datetime2timestamp(datetime.datetime.utcnow())
            message["error_message"] = result.get("error_message", None)
            message["meta"] = input_msg["meta"]
            message["meta"]["crawl_type"] = "static"
            if result["headers"] is not None and result["headers"].has_key("Last-Modified"):
                message["page_last_modified"] = result["headers"].get('Last-Modified')

            handler.HandlerRepository.process("__internal_crawler_response", message)
            return result["meta"]
Example #14
def get_recent_minute_count(n):
    now = datetime2timestamp(datetime.datetime.utcnow()) / 1000
    return db.minuteCounts.find({"_id" : {"$gte" : now - n * 60}}).sort([("_id", -1)])
Example #15
def save_heartbeat(message):
    now = datetime2timestamp(datetime.datetime.utcnow())
    message["_id"] = misc.md5(str(now))
    message["datetime"] = now
    return db.heartbeats.save(message)
Example #16
def get_heartbeats(check_duration):
    checkpoint = datetime.datetime.utcnow() - datetime.timedelta(seconds=check_duration)
    return db.heartbeats.find({"datetime" : {"$gt" : datetime2timestamp(checkpoint)}})
Example #17
def save_redirect_url(url, redirect_url):
    now = datetime2timestamp(datetime.datetime.utcnow())
    _db.urlRedirects.save({"_id" : misc.md5(url), "url" : url, "redirect_url" : redirect_url, "created_time" : now})
Example #18
    def test_url_type(self):
        common_settings.redis_cache_config["validation_enabled"] = False
        common_settings.redis_cache_config["data_types"]["url_test"] = common_settings.redis_cache_config["data_types"]["url"]
        client = common_settings.cache_client()
        url_info = {"crawl_status" : "crawling", "url_class" : None, "crawl_priority" : 1, "crawl_depth" : 0, "last_crawled" : datetime2timestamp(datetime.datetime.utcnow())}
        url = "http://www.baidu.com"
        client.set("url_test", url, update_map = url_info)
        self.assertEqual(url_info, client.get("url_test", url, fields = ["crawl_priority", "crawl_status", "last_crawled", "crawl_depth", "url_class"]))

        url_info = {"crawl_status" : "alive", "url_class" : "details", "crawl_priority" : 3, "crawl_depth" : -1, "last_crawled" : None}
        client.set("url_test", url, update_map = url_info)
        self.assertEqual(url_info, client.get("url_test", url, fields = ["crawl_priority", "crawl_status", "last_crawled", "crawl_depth", "url_class"]))

        client.set("url_test", url, update_map = {"crawl_priority" : 5, "crawl_status" : "notAlive", "last_crawled" : 123})
        self.assertEqual({"crawl_priority" : 5, "crawl_status" : "notAlive", "last_crawled" : 123, "crawl_depth" : -1}, client.get("url_test", url, fields = ["crawl_priority", "crawl_status", "last_crawled", "crawl_depth"]))