def _get_recrawl_time_and_priority(self, url_info):
        """Decide whether, when and at which priority a URL is recrawled.

        The recrawl interval is the largest of: the time since the document
        was last modified, the average period between modifications, and the
        URL class's ``min_recrawl_interval``; the result is capped at
        ``max_recrawl_interval``. All intervals are treated as seconds (they
        are fed to ``datetime.timedelta(seconds=...)``).

        Args:
            url_info: dict providing ``last_crawled``, ``last_modified``,
                ``first_modified``, ``modified_count``, ``url_class`` and
                ``crawl_priority``.

        Returns:
            A 4-tuple ``(should_recrawl, recrawl_time, recrawl_duration,
            recrawl_priority)``. When the document has not been modified for
            ``max_alive_interval`` or longer, the result is
            ``(False, None, None, None)`` so that both outcomes can be
            unpacked the same way.
        """
        last_crawled = timestamp2datetime(url_info["last_crawled"])
        last_modified = timestamp2datetime(url_info["last_modified"])
        first_modified = timestamp2datetime(url_info["first_modified"])
        modified_count = url_info["modified_count"]

        # Predict the next modification from the modification history.
        # Both differences may be slightly negative.
        observed_span = misc.diff_seconds(last_crawled, first_modified)
        last_modified_since = misc.diff_seconds(last_crawled, last_modified)

        policy = self._settings["recrawl_policies"]["url_class_policies"][url_info["url_class"]]
        min_recrawl_interval = policy["min_recrawl_interval"]
        max_recrawl_interval = policy["max_recrawl_interval"]
        max_alive_interval = policy["max_alive_interval"]

        if last_modified_since >= max_alive_interval:
            # Same arity as the success path, so callers can always unpack
            # four values.
            return False, None, None, None

        candidates = [last_modified_since, min_recrawl_interval]
        if modified_count != 0:
            # With no recorded modification there is no average period to
            # take into account (and dividing would raise).
            candidates.append(observed_span / modified_count)
        recrawl_seconds = min(max(candidates), max_recrawl_interval)

        recrawl_duration = datetime.timedelta(seconds=recrawl_seconds)

        # The recrawl priority lies in [crawl_priority, total_priority_count - 1]:
        # the longer the interval, the closer to the lowest priority.
        crawl_priority = url_info["crawl_priority"]
        interval_span = max_recrawl_interval - min_recrawl_interval
        if interval_span != 0:
            delta = (recrawl_seconds - min_recrawl_interval) * 1.0 / interval_span
        else:
            # Degenerate policy (min == max): keep the original priority.
            delta = 0.0
        lowest_priority = self._settings["total_priority_count"] - 1
        recrawl_priority = int(crawl_priority + (lowest_priority - crawl_priority) * delta)

        return True, last_crawled + recrawl_duration, recrawl_duration, recrawl_priority
Example #2
0
    def _decode_doc(self, url, message):
        """Decode the fetched document carried by ``message``.

        The encoding hint passed to the decoder is chosen as follows:
        - ``"utf-8"`` when ``message["crawl_type"]`` is ``"dynamic"``;
        - the cached ``message["encoding"]`` when it is set and its
          ``encoding_created_time`` is younger than the
          ``encoding_expiry_duration`` setting (seconds);
        - otherwise whatever ``DomainDecodingCache.get_domain_decoding``
          returns for ``message["full_domain"]``.

        Side effect: when ``message["encoding"]`` is ``None``, the encoding
        reported by the decoder and the current UTC time are stored on
        ``message``.

        Returns:
            ``(decoded_doc, used_encoding)`` as returned by ``decoder.decode``.
        """
        if message["crawl_type"] == "dynamic":
            encoding = "utf-8"
        elif message["encoding"] is not None and message["encoding_created_time"] is not None and \
            datetime.datetime.utcnow() - timestamp2datetime(message["encoding_created_time"]) < \
            datetime.timedelta(seconds = self._settings["encoding_expiry_duration"]):
            encoding = message["encoding"]
        else:
            encoding = None

        if encoding is None:
            encoding = DomainDecodingCache.get_domain_decoding(message["full_domain"])

        content_type = message["headers"].get('Content-Type', None)
        decoded_doc, used_encoding = decoder.decode(url, {'Content-Type' : content_type}, \
            message["doc"], encoding)
        if message['encoding'] is None:
            message['encoding'] = used_encoding
            # Use the same key the expiry check above reads; the previous
            # 'encoding_create_time' spelling was never read back.
            # NOTE(review): the reader passes this value through
            # timestamp2datetime(), while a datetime object is stored here --
            # confirm whether it should be converted to a timestamp first.
            message['encoding_created_time'] = datetime.datetime.utcnow()

        return decoded_doc, used_encoding
Example #3
0
def _get_recent_time_count(n, hour_or_day):
    """Aggregate recent per-minute counters into hourly or daily buckets.

    Args:
        n: how many hours (``hour_or_day`` truthy) or days (``hour_or_day``
            falsy) to look back from now.
        hour_or_day: truthy to bucket by hour, falsy to bucket by day.

    Returns:
        A list of dicts sorted by ``"timestamp"``, each with the keys
        ``timestamp``, ``checkpoint`` (start of the bucket as a datetime),
        ``crawled_count``, ``modified_count`` and ``processed_count``.
    """
    # NOTE(review): the division by 1000 presumably converts milliseconds to
    # seconds to match the ``_id`` values of minuteCounts -- confirm.
    now = datetime2timestamp(datetime.datetime.utcnow()) / 1000
    if hour_or_day:
        duration_mins = n * 60
    else:
        duration_mins = n * 60 * 24

    minute_counts = db.minuteCounts.find({"_id" : {"$gte" : now - 60 * duration_mins}})
    counter_fields = ("crawled_count", "modified_count", "processed_count")
    time_counts = {}
    for minute_count in minute_counts:
        # Named ``moment`` rather than ``time`` to avoid shadowing the
        # standard-library module name.
        moment = timestamp2datetime(minute_count["_id"])
        if hour_or_day:
            checkpoint = datetime.datetime(moment.year, moment.month, moment.day, moment.hour)
        else:
            checkpoint = datetime.datetime(moment.year, moment.month, moment.day)
        timestamp = datetime2timestamp(checkpoint)
        # ``in`` works on every Python version; dict.has_key() is gone in 3.x.
        if timestamp not in time_counts:
            time_counts[timestamp] = {"timestamp" : timestamp, "checkpoint" : checkpoint, "crawled_count" : 0, "modified_count" : 0, "processed_count" : 0}
        bucket = time_counts[timestamp]
        for field in counter_fields:
            bucket[field] += minute_count[field]
    return sorted(time_counts.values(), key = lambda time_count : time_count["timestamp"])
 def _get_redirect_time_and_priority(self, url_info):
     """Schedule the follow-up crawl of a redirected URL.

     The wait is the ``redirect_wait_duration`` recrawl-policy setting
     (passed to ``timedelta`` as seconds), counted from the URL's
     ``last_crawled`` time; the crawl priority is carried over unchanged.

     Returns:
         ``(recrawl_time, recrawl_duration, recrawl_priority)``.
     """
     crawled_at = timestamp2datetime(url_info["last_crawled"])
     wait_seconds = self._settings["recrawl_policies"]["redirect_wait_duration"]
     wait = datetime.timedelta(seconds=wait_seconds)
     priority = url_info["crawl_priority"]
     return crawled_at + wait, wait, priority