def _get_recrawl_time_and_priority(self, url_info):
    last_crawled = timestamp2datetime(url_info["last_crawled"])
    last_modified = timestamp2datetime(url_info["last_modified"])
    first_modified = timestamp2datetime(url_info["first_modified"])
    modified_count = url_info["modified_count"]

    # Calculate the next document modification prediction from the
    # observed modification history; guard against ZeroDivisionError
    # when modified_count is 0.
    average_modified_period = misc.diff_seconds(last_crawled, first_modified) / max(modified_count, 1)  # may be slightly negative
    last_modified_since = misc.diff_seconds(last_crawled, last_modified)  # may be slightly negative

    policy = self._settings["recrawl_policies"]["url_class_policies"][url_info["url_class"]]
    min_recrawl_interval = policy["min_recrawl_interval"]
    max_recrawl_interval = policy["max_recrawl_interval"]
    max_alive_interval = policy["max_alive_interval"]

    # Stop recrawling documents that have not changed for too long.
    if last_modified_since >= max_alive_interval:
        return False, None, None, None

    # Clamp the interval into [min_recrawl_interval, max_recrawl_interval].
    recrawl_interval = min(max(last_modified_since, min_recrawl_interval, average_modified_period), max_recrawl_interval)
    recrawl_duration = datetime.timedelta(seconds=recrawl_interval)

    # Calculate recrawl priority; it lies in the [crawl_priority, lowest_priority]
    # range: the longer the interval, the lower the priority.
    crawl_priority = url_info["crawl_priority"]
    delta = float(recrawl_interval - min_recrawl_interval) / (max_recrawl_interval - min_recrawl_interval)
    recrawl_priority = int(crawl_priority + (self._settings["total_priority_count"] - crawl_priority - 1) * delta)
    return True, last_crawled + recrawl_duration, recrawl_duration, recrawl_priority
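
# A minimal standalone sketch of the scheduling math above, showing how a
# clamped recrawl interval maps linearly onto the [crawl_priority,
# lowest_priority] range. The interval settings and priority count here are
# hypothetical illustration values, not the real configuration.
def _example_recrawl_priority():
    import datetime
    min_recrawl_interval = 3600          # hypothetical: 1 hour
    max_recrawl_interval = 7 * 86400     # hypothetical: 1 week
    total_priority_count = 8             # hypothetical priority levels 0..7
    crawl_priority = 2

    last_modified_since = 2 * 86400      # document last changed 2 days before the crawl

    # Same clamping as _get_recrawl_time_and_priority: never sooner than the
    # minimum interval, never later than the maximum.
    interval = min(max(last_modified_since, min_recrawl_interval), max_recrawl_interval)
    delta = float(interval - min_recrawl_interval) / (max_recrawl_interval - min_recrawl_interval)
    recrawl_priority = int(crawl_priority + (total_priority_count - crawl_priority - 1) * delta)
    return datetime.timedelta(seconds=interval), recrawl_priority  # (2 days, priority 3)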
def _decode_doc(self, url, message):
    if message["crawl_type"] == "dynamic":
        # Dynamically rendered pages are always fetched as utf-8.
        encoding = "utf-8"
    elif message["encoding"] is not None and message["encoding_created_time"] is not None and \
            datetime.datetime.utcnow() - timestamp2datetime(message["encoding_created_time"]) < \
            datetime.timedelta(seconds=self._settings["encoding_expiry_duration"]):
        # Reuse the cached encoding while it is still fresh.
        encoding = message["encoding"]
    else:
        encoding = None

    if encoding is None:
        # Fall back to the encoding most recently detected for this domain;
        # None lets the decoder detect it from the document itself.
        encoding = DomainDecodingCache.get_domain_decoding(message["full_domain"])

    content_type = message["headers"].get('Content-Type', None)
    decoded_doc, used_encoding = decoder.decode(url, {'Content-Type': content_type},
                                                message["doc"], encoding)

    if message['encoding'] is None:
        message['encoding'] = used_encoding
        # Store as a timestamp so the freshness check above can decode it.
        message['encoding_created_time'] = datetime2timestamp(datetime.datetime.utcnow())
    return decoded_doc, used_encoding
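
# A minimal sketch of the encoding fallback chain used by _decode_doc,
# extracted into a standalone function for clarity. choose_encoding and its
# parameter names are hypothetical; the expiry logic mirrors the check above.
def choose_encoding(crawl_type, cached_encoding, cached_created_time, expiry_seconds, domain_encoding):
    import datetime
    # Dynamic crawls are treated as utf-8, as in _decode_doc.
    if crawl_type == "dynamic":
        return "utf-8"
    # A cached per-URL encoding is trusted only while it is fresh.
    if cached_encoding is not None and cached_created_time is not None and \
            datetime.datetime.utcnow() - cached_created_time < datetime.timedelta(seconds=expiry_seconds):
        return cached_encoding
    # Otherwise fall back to the per-domain cache (which may itself be None,
    # leaving detection to the decoder).
    return domain_encoding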
def _get_recent_time_count(n, hour_or_day):
    # hour_or_day: True aggregates per hour over the last n hours,
    # False aggregates per day over the last n days.
    now = datetime2timestamp(datetime.datetime.utcnow()) / 1000
    if hour_or_day:
        duration_mins = n * 60
    else:
        duration_mins = n * 60 * 24

    minute_counts = db.minuteCounts.find({"_id": {"$gte": now - 60 * duration_mins}})
    time_counts = {}
    for minute_count in minute_counts:
        timestamp = minute_count["_id"]
        dt = timestamp2datetime(timestamp)
        # Truncate the minute-level timestamp to its hour (or day) checkpoint.
        if hour_or_day:
            checkpoint = datetime.datetime(dt.year, dt.month, dt.day, dt.hour)
        else:
            checkpoint = datetime.datetime(dt.year, dt.month, dt.day)
        timestamp = datetime2timestamp(checkpoint)
        if timestamp not in time_counts:
            time_counts[timestamp] = {"timestamp": timestamp, "checkpoint": checkpoint,
                                      "crawled_count": 0, "modified_count": 0, "processed_count": 0}
        time_counts[timestamp]["crawled_count"] += minute_count["crawled_count"]
        time_counts[timestamp]["modified_count"] += minute_count["modified_count"]
        time_counts[timestamp]["processed_count"] += minute_count["processed_count"]

    return sorted(time_counts.values(), key=lambda time_count: time_count["timestamp"])
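
# A minimal, self-contained sketch of the bucketing performed by
# _get_recent_time_count: minute-level rows are collapsed onto hour
# checkpoints by truncating the datetime. The sample rows are hypothetical.
def _example_hour_bucketing():
    import datetime
    rows = [
        (datetime.datetime(2014, 1, 1, 10, 5), 7),
        (datetime.datetime(2014, 1, 1, 10, 42), 3),
        (datetime.datetime(2014, 1, 1, 11, 1), 5),
    ]
    buckets = {}
    for dt, crawled_count in rows:
        # Truncating to the hour merges 10:05 and 10:42 into the 10:00 bucket.
        checkpoint = datetime.datetime(dt.year, dt.month, dt.day, dt.hour)
        buckets[checkpoint] = buckets.get(checkpoint, 0) + crawled_count
    return sorted(buckets.items())  # [(10:00, 10), (11:00, 5)]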
def _get_redirect_time_and_priority(self, url_info):
    # Redirected URLs are retried after a fixed wait at their original priority.
    last_crawled = timestamp2datetime(url_info["last_crawled"])
    recrawl_duration = datetime.timedelta(seconds=self._settings["recrawl_policies"]["redirect_wait_duration"])
    recrawl_priority = url_info["crawl_priority"]
    return last_crawled + recrawl_duration, recrawl_duration, recrawl_priority
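
# timestamp2datetime / datetime2timestamp are helpers assumed by every
# function in this module. A minimal sketch under the assumption that
# timestamps are stored as milliseconds since the Unix epoch (suggested by
# the / 1000 in _get_recent_time_count); the real helpers may differ.
import calendar
import datetime

def timestamp2datetime(timestamp_ms):
    # Millisecond epoch timestamp -> naive UTC datetime.
    return datetime.datetime.utcfromtimestamp(timestamp_ms / 1000.0)

def datetime2timestamp(dt):
    # Naive UTC datetime -> millisecond epoch timestamp.
    return calendar.timegm(dt.timetuple()) * 1000 + dt.microsecond // 1000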