def find_and_modify_expired_url_info(expired_time, fields):
    """
    Disabled cache
    """
    cond = {"recrawl_time" : {"$lte" : datetime2timestamp(expired_time)}, "crawl_status" : "alive"}
    now = datetime2timestamp(datetime.datetime.utcnow())
    update_map = {"crawl_status" : "crawling", "last_crawl_start_time" : now}
    return _cond_update_url_info(cond, update_map, None, fields)

def add_inc(crawled_count = 0, modified_count = 0, processed_count = 0):
    inc_map = {"crawled_count" : crawled_count, "modified_count" : modified_count, "processed_count" : processed_count}
    now = datetime.datetime.utcnow()
    checkpoint = datetime.datetime(now.year, now.month, now.day, now.hour, now.minute)
    timestamp = datetime2timestamp(checkpoint)
    update_map = {"exactTime" : now, "checkpoint" : checkpoint}
    update = {"$set" : update_map, "$inc" : inc_map}
    db.minuteCounts.update({"_id" : timestamp}, update, upsert = True)
    db.minuteCounts.remove({"_id" : {"$lt" : datetime2timestamp(checkpoint - datetime.timedelta(seconds = TOTAL_MINUTE_ROW_COUNT))}})
    db.totalCounts.update({"_id" : "totalCounts"}, {"$set" : {"exactTime" : now, "timestamp" : timestamp}, "$inc" : inc_map}, upsert = True)
    db.totalCounts.update({"_id" : "totalCounts", "startTime" : None}, {"$set" : {"startTime" : now}})

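# Usage sketch (the call below is illustrative, not taken from the original code):
# counters recorded within the same minute accumulate in a single minuteCounts row,
# because the row _id is the minute checkpoint's timestamp and repeated calls $inc
# the same document.
#
#   add_inc(crawled_count = 1, processed_count = 1)
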
def assign_url_info_defaults(url, url_info):
    url_info["_id"] = misc.md5(url)
    now = datetime2timestamp(datetime.datetime.utcnow())
    url_info["created_time"] = now
    url_info["crawled_count"] = 0
    url_info["url_class"] = None
    url_info["error_messages"] = []
    #url_info["processed_count"] = 0
    #url_info["last_processed"] = None
    url_info["first_modified"] = None
    url_info["last_modified"] = None
    url_info["modified_count"] = 0
    url_info["valid_link_count"] = None
    url_info["retry_count"] = 0
    url_info["status_last_modified"] = now
    url_info["encoding"] = None
    url_info["encoding_created_time"] = None
    url_info["redirect_url"] = None
    #url_info["last_finished"] = None
    #url_info["expires"] = now
    url_info["doc"] = None
    url_info["headers"] = None
    url_info["md5"] = None
    #url_info["process_status"] = True
    url_info["last_discovered"] = now
    url_info["discovered_count"] = 1
    url_info["comments"] = ""
    url_info["redirect_count"] = 0
    url_info["recrawl_time"] = now
    url_info["recrawl_duration"] = 0
    url_info["recrawl_priority"] = url_info["crawl_priority"]
    _, full_domain, _ = misc.get_url_domain_info(url)
    url_info["full_domain"] = full_domain

def add_url_info(url, url_info, merge = False):
    """
    Enabled cache
    """
    assign_url_info_defaults(url, url_info)
    existed, alive = UrlCacheClient.check_url_exists(url)
    if not existed:
        _insert_url_info(url, url_info)
        return True, False
    elif alive and merge:
        now = datetime2timestamp(datetime.datetime.utcnow())
        update_map = {"last_discovered" : now}
        # duplicate crawl request merge, will promote crawl_priority/crawl_depth if any
        fields = ["crawl_priority", "crawl_depth", "crawl_status", "url_class", "last_crawled"]
        existing_url_info = get_url_info(url, fields = fields)
        if existing_url_info is None:
            return False, False
        priority_promoted = _merge_url_info(url, existing_url_info, url_info, update_map)
        promoted = False
        misc.copy_dict(existing_url_info, url_info, fields = ["crawl_status", "url_class", "last_crawled"])
        if common_settings.core_settings["general_crawl_policies"]["preemptive_priority_promotion"] and url_info["last_crawled"] is None and priority_promoted:
            if url_info["crawl_status"] == "crawling":
                update_map["expires"] = now
            promoted = True
        update_url_info(url, update_map, {"discovered_count" : 1})
        return False, promoted
    else:
        return False, False

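# Usage sketch (URL and field values are illustrative): the function returns a
# (newly_inserted, priority_promoted) pair, so a typical caller looks roughly like:
#
#   inserted, promoted = add_url_info("http://example.com/page",
#                                     {"crawl_priority" : 1, "crawl_depth" : 0},
#                                     merge = True)
#   # inserted is True only when the URL was not known before; promoted is True
#   # only when preemptive priority promotion applied during a merge.
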
def _create_update(update_map, inc_map):
    if len(update_map) == 0 and len(inc_map) == 0:
        return None
    now = datetime2timestamp(datetime.datetime.utcnow())
    update = {"$set" : update_map}
    # only attach $inc when there is actually something to increment
    # (also guards against inc_map being None or empty)
    if inc_map:
        update["$inc"] = inc_map
    #adjust error_message field: error_message and error_type must be set together
    if update_map.has_key("error_message") and not update_map.has_key("error_type"):
        raise Exception("error_type is required if error_message is set")
    if update_map.has_key("error_type") and not update_map.has_key("error_message"):
        raise Exception("error_message is required if error_type is set")
    if update_map.has_key("error_message"):
        error_message = update_map["error_message"]
        error_type = update_map["error_type"]
        error_message = {"timestamp": now, "message" : error_message, "type" : error_type}
        # errors are appended to the error_messages history rather than $set
        update["$push"] = {"error_messages" : error_message}
        update_map.pop("error_message")
        update_map.pop("error_type")
    return update

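# Sketch of the resulting update document (field values illustrative): an
# error_message/error_type pair in update_map becomes a $push onto the
# error_messages array, and the two keys are removed from $set:
#
#   _create_update({"crawl_status" : "failed",
#                   "error_type" : "crawl_error",
#                   "error_message" : "retry count exceeded"},
#                  {"retry_count" : 1})
#   # => {"$set"  : {"crawl_status" : "failed"},
#   #     "$inc"  : {"retry_count" : 1},
#   #     "$push" : {"error_messages" : {"timestamp" : <now>,
#   #                                    "message" : "retry count exceeded",
#   #                                    "type" : "crawl_error"}}}
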
def save_handler_counts(handler_counts, type):
    now = datetime2timestamp(datetime.datetime.utcnow())
    insert = {}
    insert["_id"] = misc.md5(str(now))
    insert["datetime"] = now
    insert["handler_counts"] = handler_counts
    insert["type"] = type
    return db.handlerStatistics.save(insert)

def set(cache_type, cache_key, data, expiry=None, warn_count=500, max_count=1000):
    if not global_caches.has_key(cache_type):
        global_caches[cache_type] = {}
    current_cache = global_caches[cache_type]
    now = datetime.datetime.utcnow()
    timestamp = datetime2timestamp(now)
    if expiry is not None:
        expiry = datetime2timestamp(now + datetime.timedelta(seconds=expiry))

    # removes expired caches; use a separate loop variable so the cache_key
    # argument is not clobbered before the final assignment below
    if len(current_cache) >= warn_count:
        for key, item in current_cache.items():
            if item["expiry"] is not None and item["expiry"] <= timestamp:
                current_cache.pop(key)

    # removes oldest item if exceeded
    if not current_cache.has_key(cache_key) and len(current_cache) >= max_count:
        oldest_pair = min(current_cache.items(), key=lambda pair: pair[1]["timestamp"])
        current_cache.pop(oldest_pair[0])

    current_cache[cache_key] = {"data": data, "timestamp": timestamp, "expiry": expiry}

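# Usage sketch ("url" below is an illustrative cache_type, not necessarily one
# configured for this cache): expiry is given in seconds and stored as an absolute
# timestamp; expired entries are swept lazily once the cache holds warn_count
# items, and the oldest entry is evicted when max_count would be exceeded.
#
#   set("url", "http://example.com/page", {"crawl_status" : "alive"}, expiry = 300)
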
def _get_recent_time_count(n, hour_or_day):
    # hour_or_day: True aggregates minute rows into hourly buckets over the last
    # n hours, False into daily buckets over the last n days
    now = datetime2timestamp(datetime.datetime.utcnow()) / 1000
    if hour_or_day:
        duration_mins = n * 60
    else:
        duration_mins = n * 60 * 24
    minute_counts = db.minuteCounts.find({"_id" : {"$gte" : now - 60 * duration_mins}})
    time_counts = {}
    for minute_count in minute_counts:
        timestamp = minute_count["_id"]
        time = timestamp2datetime(timestamp)
        if hour_or_day:
            checkpoint = datetime.datetime(time.year, time.month, time.day, time.hour)
        else:
            checkpoint = datetime.datetime(time.year, time.month, time.day)
        timestamp = datetime2timestamp(checkpoint)
        if not time_counts.has_key(timestamp):
            time_counts[timestamp] = {"timestamp" : timestamp, "checkpoint" : checkpoint, "crawled_count" : 0, "modified_count" : 0, "processed_count" : 0}
        time_counts[timestamp]["crawled_count"] += minute_count["crawled_count"]
        time_counts[timestamp]["modified_count"] += minute_count["modified_count"]
        time_counts[timestamp]["processed_count"] += minute_count["processed_count"]
    return sorted(time_counts.values(), key = lambda time_count : time_count["timestamp"])

def _once(self):
    now = datetime2timestamp(datetime.datetime.utcnow())
    message = {"datetime" : now, "ip" : self._ip, "handler_name" : self._handler_name, "pid" : self._process_id, "handler_key" : self._handler_key}
    try:
        self._client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self._client_socket.connect((self._server_address, self._server_port))
        self._client_socket.send(simplejson.dumps(message))
        logging.debug("heartbeat client sent message", message = message)
    except socket.error as e:
        logging.warn("socket error for heartbeat client", exception = e)
    finally:
        self._client_socket.close()

def predict(self, url, url_info, extras = None):
    output_msg = {"crawl_status" : "alive", "recrawl_time" : None, "recrawl_duration" : None, "recrawl_priority" : None, "retry_count_inc" : False, "redirect_count_inc" : False}
    if url_info["url_class"] is None:
        url_info["url_class"] = "undefined"

    if url_info["last_crawled"] is None:
        output_msg["crawl_status"] = "failed"
        output_msg["error_type"] = "unexpected"
        output_msg["error_message"] = "last_crawled is None"
    elif url_info["crawl_status"] == "alive":
        if url_info["modified_count"] <= 0 or url_info["url_class"] is None or url_info["last_modified"] is None or url_info["first_modified"] is None:
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "unexpected"
            output_msg["error_message"] = "any of url_class/last_modified/first_modified is none, or modified_count <= 0: %s" % misc.clone_dict(url_info, ["modified_count", "url_class", "last_modified", "first_modified"])
        else:
            need_recrawl = self._recrawling_url(url, url_info["url_class"])
            if need_recrawl:
                alive, output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_recrawl_time_and_priority(url_info)
                if not alive:
                    output_msg["crawl_status"] = "notAlive"
            else:
                output_msg["crawl_status"] = "notAlive"
    elif url_info["crawl_status"] == "error":
        if url_info["retry_count"] >= self._settings["recrawl_policies"]["max_retry_count"]:
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "crawl_error"
            output_msg["error_message"] = "retry count exceeded %d" % self._settings["recrawl_policies"]["max_retry_count"]
        else:
            output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_retry_time_and_priority(url_info)
            output_msg["retry_count_inc"] = True
    elif url_info["crawl_status"] == "redirected":
        if url_info["redirect_count"] >= self._settings["recrawl_policies"]["max_redirect_count"]:
            output_msg["crawl_status"] = "notAlive"
        else:
            output_msg["recrawl_time"], output_msg["recrawl_duration"], output_msg["recrawl_priority"] = self._get_redirect_time_and_priority(url_info)
            output_msg["redirect_count_inc"] = True
    else:
        logging.error("unexpected crawl status", url = url, crawl_status = url_info["crawl_status"])
        output_msg["crawl_status"] = "failed"
        output_msg["error_type"] = "unexpected"
        output_msg["error_message"] = "unexpected crawl status in recrawl:%s" % url_info["crawl_status"]

    if output_msg["recrawl_time"] is not None:
        output_msg["recrawl_time"] = datetime2timestamp(output_msg["recrawl_time"])
    if output_msg["recrawl_duration"] is not None:
        output_msg["recrawl_duration"] = misc.delta_seconds(output_msg["recrawl_duration"])
    return output_msg

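# Example shape of the prediction (values illustrative): for a url_info with
# crawl_status == "error" and retry_count still below max_retry_count, predict()
# schedules a retry, leaves crawl_status at its initial "alive" value, and returns
# roughly:
#
#   {"crawl_status" : "alive", "recrawl_time" : <timestamp>,
#    "recrawl_duration" : <seconds>, "recrawl_priority" : <priority>,
#    "retry_count_inc" : True, "redirect_count_inc" : False}
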
def _make_update(update_map, inc_map = None):
    now = datetime2timestamp(datetime.datetime.utcnow())
    #add status_last_modified field
    if update_map.has_key("crawl_status"):
        update_map["status_last_modified"] = now
    #separate url_info fields from meta_url_info fields
    first_update_map, second_update_map = misc.separate_dict(update_map, common_settings.database_table_fields["urlRepositoryMeta"])
    first_inc_map, second_inc_map = misc.separate_dict(inc_map if inc_map is not None else {}, common_settings.database_table_fields["urlRepositoryMeta"])
    misc.copy_dict(first_update_map, second_update_map, common_settings.common_url_info_fields, soft = True)
    misc.copy_dict(first_inc_map, second_inc_map, common_settings.common_url_info_fields, soft = True)
    first_update = _create_update(first_update_map, first_inc_map)
    second_update = _create_update(second_update_map, second_inc_map)
    return first_update, second_update

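# Reading sketch (assuming misc.separate_dict places keys matching its field list
# in its first return value): fields belonging to urlRepositoryMeta end up in the
# first update, the remaining fields in the second, and common_url_info_fields are
# soft-copied across so shared columns stay consistent without overwriting values
# already present.
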
def _assign_url_info_defaults(self, url_info):
    now = datetime2timestamp(datetime.datetime.utcnow())
    url_info["created_time"] = now
    url_info["crawled_count"] = 0
    url_info["error_messages"] = []
    url_info["retry_count"] = 0
    url_info["encoding"] = None
    url_info["encoding_created_time"] = None
    url_info["redirect_url"] = None
    #url_info["last_finished"] = None
    #url_info["expires"] = now
    url_info["doc"] = None
    url_info["headers"] = None # TODO not used?
    url_info["md5"] = None
    #url_info["process_status"] = True
    url_info["comments"] = ""
    url_info["redirect_count"] = 0
    _, full_domain, _ = misc.get_url_domain_info(url_info['url'])
    url_info["full_domain"] = full_domain

def process_crawler_response(self, result):
    if not result.has_key("url"):
        return None

    if result["status"] == 700:
        self.crawl_url(self._async_mode, result["url"], result["meta"], self.get_user_agent(), None, result["meta"]["page_last_modified"])
        return result["meta"]
    else:
        #send crawler_response message
        input_msg = result["meta"]
        fields = ["url", "status", "doc", "headers"]
        message = misc.clone_dict(result, fields)
        message["page_last_modified"] = input_msg["page_last_modified"]
        message["original_url"] = input_msg["url"]
        message["last_crawled"] = datetime2timestamp(datetime.datetime.utcnow())
        message["error_message"] = result.get("error_message", None)
        message["meta"] = input_msg["meta"]
        message["meta"]["crawl_type"] = "static"
        if result["headers"] is not None and result["headers"].has_key("Last-Modified"):
            message["page_last_modified"] = result["headers"].get('Last-Modified')
        handler.HandlerRepository.process("__internal_crawler_response", message)
        return result["meta"]

def get_recent_minute_count(n):
    now = datetime2timestamp(datetime.datetime.utcnow()) / 1000
    return db.minuteCounts.find({"_id" : {"$gte" : now - n * 60}}).sort([("_id", -1)])

def save_heartbeat(message):
    now = datetime2timestamp(datetime.datetime.utcnow())
    message["_id"] = misc.md5(str(now))
    message["datetime"] = now
    return db.heartbeats.save(message)

def get_heartbeats(check_duration):
    checkpoint = datetime.datetime.utcnow() - datetime.timedelta(seconds=check_duration)
    return db.heartbeats.find({"datetime" : {"$gt" : datetime2timestamp(checkpoint)}})

def save_redirect_url(url, redirect_url):
    now = datetime2timestamp(datetime.datetime.utcnow())
    _db.urlRedirects.save({"_id" : misc.md5(url), "url" : url, "redirect_url" : redirect_url, "created_time" : now})

def test_url_type(self):
    common_settings.redis_cache_config["validation_enabled"] = False
    common_settings.redis_cache_config["data_types"]["url_test"] = common_settings.redis_cache_config["data_types"]["url"]
    client = common_settings.cache_client()

    url_info = {"crawl_status" : "crawling", "url_class" : None, "crawl_priority" : 1, "crawl_depth" : 0, "last_crawled" : datetime2timestamp(datetime.datetime.utcnow())}
    url = "http://www.baidu.com"
    client.set("url_test", url, update_map = url_info)
    self.assertEqual(url_info, client.get("url_test", url, fields = ["crawl_priority", "crawl_status", "last_crawled", "crawl_depth", "url_class"]))

    url_info = {"crawl_status" : "alive", "url_class" : "details", "crawl_priority" : 3, "crawl_depth" : -1, "last_crawled" : None}
    client.set("url_test", url, update_map = url_info)
    self.assertEqual(url_info, client.get("url_test", url, fields = ["crawl_priority", "crawl_status", "last_crawled", "crawl_depth", "url_class"]))

    client.set("url_test", url, update_map = {"crawl_priority" : 5, "crawl_status" : "notAlive", "last_crawled" : 123})
    self.assertEqual({"crawl_priority" : 5, "crawl_status" : "notAlive", "last_crawled" : 123, "crawl_depth" : -1}, client.get("url_test", url, fields = ["crawl_priority", "crawl_status", "last_crawled", "crawl_depth"]))