def assign_url_info_defaults(url, url_info):
    """Populate a freshly discovered url_info record with its default fields."""
    url_info["_id"] = misc.md5(url)
    now = datetime2timestamp(datetime.datetime.utcnow())
    url_info["created_time"] = now
    url_info["crawled_count"] = 0
    url_info["url_class"] = None
    url_info["error_messages"] = []
    #url_info["processed_count"] = 0
    #url_info["last_processed"] = None
    url_info["first_modified"] = None
    url_info["last_modified"] = None
    url_info["modified_count"] = 0
    url_info["valid_link_count"] = None
    url_info["retry_count"] = 0
    url_info["status_last_modified"] = now
    url_info["encoding"] = None
    url_info["encoding_created_time"] = None
    url_info["redirect_url"] = None
    #url_info["last_finished"] = None
    #url_info["expires"] = now
    url_info["doc"] = None
    url_info["headers"] = None
    url_info["md5"] = None
    #url_info["process_status"] = True
    url_info["last_discovered"] = now
    url_info["discovered_count"] = 1
    url_info["comments"] = ""
    url_info["redirect_count"] = 0
    url_info["recrawl_time"] = now
    url_info["recrawl_duration"] = 0
    url_info["recrawl_priority"] = url_info["crawl_priority"]
    _, full_domain, _ = misc.get_url_domain_info(url)
    url_info["full_domain"] = full_domain
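# For reference, a minimal sketch of the two helpers this function leans on.
# These are assumptions about `misc.md5` and `datetime2timestamp` (a hex MD5
# digest used as the Mongo _id, and a UTC-datetime-to-epoch-seconds
# conversion); the project's real implementations may differ.
import calendar
import datetime
import hashlib

def md5(text):
    # Hex digest of the UTF-8 encoded input.
    return hashlib.md5(text.encode("utf-8")).hexdigest()

def datetime2timestamp(dt):
    # Epoch seconds for a naive UTC datetime such as datetime.datetime.utcnow().
    return calendar.timegm(dt.timetuple())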
def test_url_dedup_type(self):
    common_settings.redis_cache_config["validation_enabled"] = False
    common_settings.redis_cache_config["data_types"]["url_dedup_test"] = {"content_type": "redis/set"}
    client = common_settings.cache_client()
    client.delete("url_dedup_test", None)
    url_list = ["http://www.baidu.com", "http://www.google.com", "http://www.sina.com.cn"]
    # membership is keyed by the md5 of the url, not the url itself
    for url in url_list:
        client.set("url_dedup_test", misc.md5(url))
    for url in url_list:
        self.assertEqual(True, client.get("url_dedup_test", misc.md5(url)))
    # the trailing slash yields a different md5, so this url is not a member yet
    self.assertEqual(False, client.get("url_dedup_test", misc.md5("http://www.google.com/")))
    # with_get=True returns the previous membership state: False on first insert
    self.assertFalse(client.set("url_dedup_test", misc.md5("http://www.google.com/"), with_get=True))
    self.assertEqual(True, client.get("url_dedup_test", misc.md5("http://www.google.com/")))
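# The same dedup pattern expressed directly against redis-py, as a sketch
# (assumes a Redis server on localhost and the `md5` helper sketched above):
import redis

r = redis.Redis()
r.delete("url_dedup_test")
# SADD returns the number of members actually added: 1 on first insert,
# 0 on repeats, which is what makes a set a natural url-dedup structure.
assert r.sadd("url_dedup_test", md5("http://www.google.com/")) == 1
assert r.sadd("url_dedup_test", md5("http://www.google.com/")) == 0
assert r.sismember("url_dedup_test", md5("http://www.google.com/"))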
def get_result_by_url(url, start_index=1, page_type=1):
    cond = {'_id': misc.md5(url)}
    fields = NO_ROW_ID
    if page_type == 1:
        # only query the one content column for the requested page, plus the meta fields
        fields = copy.copy(_RESULT_META_FIELDS)
        fields[_CONTENT_COLUMN_NAME % start_index] = 1
    return _db.results.find_one(cond, fields=fields)
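# The same projection pattern in modern pymongo spelling, for comparison; the
# `fields=` keyword above is the legacy (pre-3.0) pymongo name for what is now
# called a projection. Database, collection, and field names are illustrative,
# and `md5` is the helper sketched earlier.
from pymongo import MongoClient

db = MongoClient().crawler  # hypothetical database name
projection = {"url": 1, "statusCode": 1, "content1": 1}  # meta fields plus one content column
doc = db.results.find_one({"_id": md5("http://www.example.com")}, projection)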
def save_handler_counts(handler_counts, type):
    now = datetime2timestamp(datetime.datetime.utcnow())
    insert = {}
    # _id is derived from the timestamp string; two saves in the same instant would collide
    insert["_id"] = misc.md5(str(now))
    insert["datetime"] = now
    insert["handler_counts"] = handler_counts
    insert["type"] = type
    return db.handlerStatistics.save(insert)
def _generate_name(cls, id_generator, content_type, data_type, data_key):
    if content_type == "redis/set":
        # redis/set keeps all keys inside one set named after the data_type,
        # so id_generator is ignored
        return data_type
    elif id_generator == "raw":
        return ":".join([data_type, data_key])
    elif id_generator == "md5":
        return ":".join([data_type, misc.md5(data_key)])
    elif id_generator == "none":
        return data_type
    else:
        raise Exception("unsupported id_generator %s" % id_generator)
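# Key names this scheme would produce, assuming data_type "urls" and a url as
# the data_key (digest shown as a placeholder, not computed here):
#   content_type "redis/set" -> "urls"
#   id_generator "raw"       -> "urls:http://www.google.com"
#   id_generator "md5"       -> "urls:<hex md5 of the url>"
#   id_generator "none"      -> "urls"
# The "md5" generator buys fixed-length key suffixes for arbitrarily long urls.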
def save_crawl_domain_info(url, domain_type="full_domain", crawl_priority=-1, crawl_depth=-1,
                           recrawl_details=False, recrawl_list=False, recrawl_undefined=False):
    # -1 for crawl_priority/crawl_depth means the value still needs auto configuration
    domain_info = misc.get_url_domain_info(url)
    domain_types = common_settings.domain_types
    domain = domain_info[domain_types.index(domain_type)]
    update_map = {
        "domain": domain,
        "domain_type": domain_type,
        "url": url,
        "crawl_priority": crawl_priority,
        "crawl_depth": crawl_depth,
        "recrawl_details": recrawl_details,
        "recrawl_list": recrawl_list,
        "recrawl_undefined": recrawl_undefined,
        "_id": misc.md5(''.join([domain, domain_type])),
    }
    # Note: the _id is derived from domain + domain_type, so save() overwrites
    # any existing whitelist entry for the same pair
    db.crawlDomainWhitelist.save(update_map)
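# Hypothetical call: whitelist a site at full-domain granularity with explicit
# priority and depth (argument values are illustrative):
save_crawl_domain_info("http://www.example.com/some/page",
                       domain_type="full_domain",
                       crawl_priority=1,
                       crawl_depth=2,
                       recrawl_details=True)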
def update_transcode_result(result, pages, page_type='details', process_type='batch'):
    # serialize each page's nodes into one content column per page
    for i, page in enumerate(pages, 1):
        content = ''
        for node in page:
            content += p.tostring(node)
        result[_CONTENT_COLUMN_NAME % i] = content
    result['statusCode'] = _TYPE_STATUSCODE_DICT.get(page_type, _DEFAULT_STATUSCODE)
    result['_id'] = misc.md5(result['url'])
    result['processType'] = process_type
    cond = {'_id': result['_id']}
    _db.results.update(cond, result, upsert=True)
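# The upsert idiom above in modern pymongo spelling, for comparison: the legacy
# update(cond, doc, upsert=True) with an operator-free doc replaces the whole
# document, which replace_one expresses explicitly (names are illustrative,
# `md5` is the helper sketched earlier):
from pymongo import MongoClient

results = MongoClient().crawler.results
doc = {"_id": md5("http://www.example.com"), "url": "http://www.example.com"}
results.replace_one({"_id": doc["_id"]}, doc, upsert=True)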
def build_crawler_request_msg(cls, url, url_info):
    message = misc.clone_dict(url_info, ["url", "page_last_modified"])
    message["__priority"] = url_info["crawl_priority"]
    message["meta"] = misc.clone_dict(url_info, common_settings.crawler_msg_meta_fields)
    if common_settings.strong_politeness:
        # group by domain so each domain's requests are serialized on one worker
        message["__group_hash"] = url_info["full_domain"]
    else:
        message["__group_hash"] = misc.md5(url)
    if url_info["crawl_type"] == "static":
        message_type = "__internal_crawler_request"
    elif url_info["crawl_type"] == "dynamic":
        message_type = "__internal_dynamic_crawler_request"
    else:
        raise Exception("unsupported crawl_type %s" % url_info["crawl_type"])
    return message_type, message
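# A guess at misc.clone_dict's contract, for readers of the function above:
# copy only the named fields from the source dict. The real helper may differ,
# e.g. in how it treats missing keys.
def clone_dict(source, fields):
    return {field: source[field] for field in fields if field in source}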
def default_cond(url):
    return {"_id": misc.md5(url)}
def save_offline_manipulation(manipulation, result, type):
    # local time here, unlike the UTC timestamps used by the other save helpers
    now = datetime.datetime.now()
    db.offlineManipulations.save({
        "_id": misc.md5(str(now)),
        "manipulation": manipulation,
        "result": result,
        "datetime": now,
        "type": type,
    })
def save_redirect_url(url, redirect_url):
    now = datetime2timestamp(datetime.datetime.utcnow())
    _db.urlRedirects.save({
        "_id": misc.md5(url),
        "url": url,
        "redirect_url": redirect_url,
        "created_time": now,
    })
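# Hypothetical lookup counterpart, assuming the same `_db` and `misc` as above:
# resolve one stored redirect hop, falling back to the url itself when no
# redirect was recorded.
def resolve_redirect(url):
    doc = _db.urlRedirects.find_one({"_id": misc.md5(url)})
    return doc["redirect_url"] if doc else url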
def save_heartbeat(message):
    now = datetime2timestamp(datetime.datetime.utcnow())
    message["_id"] = misc.md5(str(now))
    message["datetime"] = now
    return db.heartbeats.save(message)