def postprocess(cls, result):
    if not isinstance(result, dict):
        logging.error("internal exception raised", type = type(result), result = result)
        return {"status" : 600, "error_message" : "internal exception raised %s" % result}

    if "error_message" in result or result["status"] != 200 or result["doc"] is None:
        return result

    # dns cache: on a redirect, resolve and cache the ip of the final host
    actual_url = result["url"]
    if result["meta"].get("dns_cache_enabled", False):
        if actual_url != result["meta"]["url"]:
            parsed_result = misc.parse_url(actual_url)
            if parsed_result is not None and dns_cache.has_dns_cache(parsed_result.netloc):
                ip = socket.gethostbyname(parsed_result.netloc)
                dns_cache.set_dns_cache(parsed_result.netloc, ip)

    # compression: gunzip the body when Content-Encoding says gzip
    body = result["doc"]
    ce = result["headers"].get('Content-Encoding', None)
    if ce and ce.lower().find('gzip') != -1:
        body = gzip.GzipFile(fileobj = cStringIO.StringIO(body), mode = 'rb').read()

    # chunked transfer encoding
    if result["meta"].get("chunked_transfer_decoding", False) and \
            result["headers"].get('Transfer-Encoding') == 'chunked':
        body = Downloader.decode_chunked_transfer(body)

    result["doc"] = body
    return result
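# A minimal sketch of the chunked-transfer decoder referenced above, assuming
# standard HTTP/1.1 chunked framing (hex size line, CRLF-delimited chunks);
# the real Downloader.decode_chunked_transfer in this codebase may differ.
def decode_chunked_transfer(body):
    decoded = []
    while body:
        size_line, _, body = body.partition("\r\n")
        # the size line may carry chunk extensions after ";"
        chunk_size = int(size_line.split(";")[0].strip(), 16)
        if chunk_size == 0:
            # terminating zero-size chunk
            break
        decoded.append(body[:chunk_size])
        body = body[chunk_size + 2:]  # skip chunk data plus trailing CRLF
    return "".join(decoded)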
def _load_hash_object(self, values, data_type, name, fields, strict):
    """
    if fields is None, we assume it's raw mode: all fields will be retrieved,
    and no field encoder/decoder will be used
    """
    data_config = self._data_types.get(data_type)
    input_dict = values if fields is None else dict(zip(fields, values))
    result_dict = {}
    for field, value in input_dict.items():
        if value is None:
            if strict:
                logging.error("field %s does not exist in the result, %s" % (field, name))
                #raise Exception("field %s does not exist in the result, %s" % (field, name))
                return None
        elif value == "\\None":
            # escaped None marker stored in redis
            value = None
        elif RedisClient._special_none_pattern.match(value) is not None:
            # unescape values that would otherwise collide with the None marker
            value = value[1:]

        if fields is not None:
            field_configs = [field_config for field_config in data_config["fields"]
                             if (field_config == field if isinstance(field_config, str)
                                 else field_config[0] == field)]
            if len(field_configs) == 0:
                raise Exception("unexpected field, %s, %s, %s" % (data_type, name, field))
            field_config = field_configs[0]
            if isinstance(field_config, tuple) and field_config[1] != str:
                decoder = field_config[1]
                value = decoder(value)
        result_dict[field] = value
    return result_dict
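# Hypothetical _data_types entry illustrating the two field-config shapes
# _load_hash_object expects: a bare string (stored and returned as str, no
# decoding) or a (name, decoder) tuple whose decoder is applied on load.
# The type name and fields here are assumptions, not taken from this repo.
_data_types = {
    "url_info": {
        "fields": [
            "url",                                # plain str field, returned as-is
            ("retry_count", int),                 # decoded with int() on load
            ("headers", simplejson.loads),        # decoded from a JSON string
        ],
    },
}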
def _load_doc(self, url, html):
    try:
        dom = page_parser.parse_unicode(html, url, notify = logging.info)
    except Exception:
        logging.error("raw doc can not be parsed", url = url)
        return None
    return dom
def _crawl_sync(self, url, timeout, request_header, meta):
    result = {"url" : url, "status" : 600, "doc" : None, "headers" : None, "meta" : meta}

    req = urllib2.Request(url)
    # set headers
    for key, value in request_header.items():
        req.add_header(key, value)

    # open url with timeout
    try:
        response = urllib2.urlopen(req, timeout = timeout)
    except Exception as e:
        error_message = misc.exception_to_str(e)
        if error_message.find("HTTP Error 304: Not Modified") != -1:
            result["status"] = 304
            return result
        else:
            result["error_message"] = error_message
            logging.error("static_crawl failed when opening url", url = url, exception = e)
            return result

    try:
        body = response.read()
    except Exception as e:
        result["error_message"] = str(e)
        logging.error("static_crawl failed when reading response", url = url, exception = e)
        return result

    result["url"] = response.url  # may differ from the request url after redirects
    result["status"] = response.code
    result["doc"] = body
    result["headers"] = response.headers
    return result
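# Assumed usage of the synchronous crawl path; the header values, timeout, and
# meta keys below are illustrative, not values taken from this codebase.
result = downloader._crawl_sync(
    "http://example.com/",
    timeout = 30,
    request_header = {"User-Agent": settings.default_user_agent,
                      "Accept-Encoding": "gzip"},
    meta = {"url": "http://example.com/", "dns_cache_enabled": True},
)
if result["status"] == 200 and result["doc"] is not None:
    # postprocess handles gzip/chunked decoding and dns caching (see above)
    result = Downloader.postprocess(result)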
def _check_is_all_crash(self):
    """
    original method of checking whether all services are alive;
    not used right now
    """
    heartbeats = heartbeatdb.get_heartbeats(self._heart_beat_config["check_duration"])
    heartbeats = misc.cursor_to_array(heartbeats)
    heartbeats = misc.select(heartbeats, fields = ["ip", "handler_name", "pid"])
    heartbeats = misc.distinct(heartbeats)

    handler_counts_per_machine = misc.count(heartbeats,
        key = lambda heartbeat : "%s_%s" % (heartbeat["ip"], heartbeat["handler_name"]))
    heartbeatdb.save_handler_counts(simplejson.dumps(handler_counts_per_machine),
        type = "handler_counts_per_machine")

    handler_counts = misc.count(heartbeats, key = lambda heartbeat : heartbeat["handler_name"])
    heartbeatdb.save_handler_counts(simplejson.dumps(handler_counts), type = "handler_counts_total")
    logging.debug("current alive handler counts", handler_counts = handler_counts)

    # Note: currently we will send email if any required handler is not running
    if len(filter(lambda handler_name : handler_counts.get(handler_name, 0) == 0,
                  self._heart_beat_config["required_handlers"])) > 0:
        if self._last_notification_time is None or \
                datetime.datetime.now() - self._last_notification_time >= \
                datetime.timedelta(seconds = self._heart_beat_config["notification_duration"]):
            email_body = "some handlers are not running:\n %s" % handler_counts_per_machine
            self._send_email(
                self._heart_beat_config["email_server"],
                self._heart_beat_config["email_from"],
                self._heart_beat_config["email_tos"],
                self._heart_beat_config["email_title"],
                email_body)
            self._last_notification_time = datetime.datetime.now()
            logging.error("heartbeat server detects required handlers are not fully running, notification email sent",
                handler_counts_per_machine = handler_counts_per_machine)
def _load():
    global _pa_initialized
    if not _pa_initialized:
        _pa_initialized = pa.init(common_settings.page_analysis_logger_prefix,
                                  common_settings.page_analysis_config_files)
        if not _pa_initialized:
            logging.error("page_analysis lib can't be loaded")
            return False
        else:
            logging.info("page_analysis lib has been loaded")
    return True
def predict(self, url, url_info, extras = None):
    output_msg = {
        "crawl_status" : "alive",
        "recrawl_time" : None,
        "recrawl_duration" : None,
        "recrawl_priority" : None,
        "retry_count_inc" : False,
        "redirect_count_inc" : False,
    }

    if url_info["url_class"] is None:
        url_info["url_class"] = "undefined"

    if url_info["last_crawled"] is None:
        output_msg["crawl_status"] = "failed"
        output_msg["error_type"] = "unexpected"
        output_msg["error_message"] = "last_crawled is None"
    elif url_info["crawl_status"] == "alive":
        if url_info["modified_count"] <= 0 or url_info["url_class"] is None or \
                url_info["last_modified"] is None or url_info["first_modified"] is None:
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "unexpected"
            output_msg["error_message"] = "any of url_class/last_modified/first_modified is none, or modified_count <= 0: %s" % \
                misc.clone_dict(url_info, ["modified_count", "url_class", "last_modified", "first_modified"])
        else:
            need_recrawl = self._recrawling_url(url, url_info["url_class"])
            if need_recrawl:
                alive, output_msg["recrawl_time"], output_msg["recrawl_duration"], \
                    output_msg["recrawl_priority"] = self._get_recrawl_time_and_priority(url_info)
                if not alive:
                    output_msg["crawl_status"] = "notAlive"
            else:
                output_msg["crawl_status"] = "notAlive"
    elif url_info["crawl_status"] == "error":
        if url_info["retry_count"] >= self._settings["recrawl_policies"]["max_retry_count"]:
            output_msg["crawl_status"] = "failed"
            output_msg["error_type"] = "crawl_error"
            output_msg["error_message"] = "retry count exceeded %d" % self._settings["recrawl_policies"]["max_retry_count"]
        else:
            output_msg["recrawl_time"], output_msg["recrawl_duration"], \
                output_msg["recrawl_priority"] = self._get_retry_time_and_priority(url_info)
            output_msg["retry_count_inc"] = True
    elif url_info["crawl_status"] == "redirected":
        if url_info["redirect_count"] >= self._settings["recrawl_policies"]["max_redirect_count"]:
            output_msg["crawl_status"] = "notAlive"
        else:
            output_msg["recrawl_time"], output_msg["recrawl_duration"], \
                output_msg["recrawl_priority"] = self._get_redirect_time_and_priority(url_info)
            output_msg["redirect_count_inc"] = True
    else:
        logging.error("unexpected crawl status", url = url, crawl_status = url_info["crawl_status"])
        output_msg["crawl_status"] = "failed"
        output_msg["error_type"] = "unexpected"
        output_msg["error_message"] = "unexpected crawl status in recrawl:%s" % url_info["crawl_status"]

    # serialize datetime fields for the output message
    if output_msg["recrawl_time"] is not None:
        output_msg["recrawl_time"] = datetime2timestamp(output_msg["recrawl_time"])
    if output_msg["recrawl_duration"] is not None:
        output_msg["recrawl_duration"] = misc.delta_seconds(output_msg["recrawl_duration"])
    return output_msg
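# Hypothetical shape of the recrawl policy settings predict() reads; the keys
# max_retry_count / max_redirect_count come from the code above, but the
# values and any sibling keys are assumptions.
_settings = {
    "recrawl_policies": {
        "max_retry_count": 3,      # beyond this, crawl_status becomes "failed"
        "max_redirect_count": 5,   # beyond this, crawl_status becomes "notAlive"
    },
}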
def _reconnect(args, msg, e):
    if len(args) == 0 or not isinstance(args[0], RedisClient):
        raise Exception("unsupported decorator, it should be applied to RedisClient methods")
    self = args[0]
    while self._stop_condition is None or not self._stop_condition():
        logging.error("redis connection error: %s, %s reconnecting..." % (msg, e))
        time.sleep(5)
        try:
            self._wait()
            return True
        except redis.exceptions.ConnectionError as e:
            pass
    logging.warn("whole process is terminating")
    raise Exception("whole process is terminating")
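# A minimal sketch of how _reconnect is presumably wired up: a decorator that
# retries a RedisClient method after blocking until the connection is back
# (or _reconnect raises on shutdown). The decorator name is an assumption;
# the actual one in this codebase may differ.
def auto_reconnect(func):
    def wrapper(*args, **kwargs):
        while True:
            try:
                return func(*args, **kwargs)
            except redis.exceptions.ConnectionError as e:
                # blocks until reconnected, or raises if the process is stopping
                _reconnect(args, func.__name__, e)
    return wrapper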
def repair_service(self, info_dict, config):
    """
    auto repair the dead service
    """
    command_dict = self._heart_beat_config["repair_command"]
    for item_id in info_dict:
        item = info_dict[item_id]
        if "class_name" in item and item['class_name'] in command_dict:
            user = config["DEFAULT"].get("user")
            password = config["DEFAULT"].get("password")
            host = item['server_ip']
            class_name = item['class_name']
            command = command_dict[class_name]
            # number of missing workers to start on the target machine
            concurrency_expected = str(item["expected_concurrency"] - item["actual_concurrency"])
            try:
                self.ssh_cmd(user, host, password, command % concurrency_expected)
            except Exception as e:
                logging.error(e)
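# Hypothetical repair_command mapping consumed by repair_service: each command
# carries one %s slot for the missing-concurrency count. Handler names and
# paths are illustrative assumptions, not taken from this repo.
_heart_beat_config = {
    "repair_command": {
        "CrawlerHandler": "cd /opt/crawler && ./start_handler.sh crawler %s",
        "ParserHandler": "cd /opt/crawler && ./start_handler.sh parser %s",
    },
}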
def initialize(self):
    # enable dns cache
    if settings.dns_cache_enabled:
        dns_cache.enable_dns_cache()

    # enable user agent rotation
    if settings.user_agent_rotation_enabled:
        self._user_agent_list = misc.load_user_agent(settings.user_agent_file)
        if len(self._user_agent_list) == 0:
            self._user_agent_list.append(settings.default_user_agent)
            logging.error("user agent list can't be loaded, using the default one")
    else:
        self._user_agent_list = [settings.default_user_agent]

    # initialize downloader, use twisted or urllib2
    if settings.downloader_type == "twisted":
        self._downloader = downloaders.TwistedDownloader()
    elif settings.downloader_type == "urllib2":
        self._downloader = downloaders.UrlLib2Downloader()
    else:
        raise Exception("unsupported downloader type")
def decode(url, headers, body, encoding = None):
    '''
    decode html to unicode
    '''
    try_count = 0
    while True:
        if encoding is not None:
            # caller supplied an encoding; -1 marks it as an explicit hint
            try_count = -1
        else:
            encoding, try_count = _try_get_encoding(headers, body, try_count)
        if encoding is None:
            logging.error("decoding failed for url", url = url)
            return None, None
        html = _try_decode(url, body, encoding)
        if html is not None:
            logging.debug('decode url succeeded', url = url, encoding = encoding, try_count = try_count)
            return html, encoding
        else:
            # this candidate encoding failed; fall back to detection on the next pass
            try_count += 1
            encoding = None
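# Assumed usage of decode(): pass the response headers and raw body, optionally
# forcing an encoding up front; otherwise candidate encodings are detected from
# the headers/body. The result dict keys mirror _crawl_sync above.
html, encoding = decode(result["url"], result["headers"], result["doc"])
if html is None:
    logging.error("could not decode document", url = result["url"])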
def _process(self, message):
    # normalize url
    url = url_analyser.normalize_url(message["url"])
    if url is None:
        logging.error("invalid url for crawl", url = message["url"])
        return {"status" : -1}
    message["url"] = url

    # fill optional fields
    url_info = misc.clone_dict(message, fields = ["url", "source", "root_url", "parent_url", "crawl_priority", "crawl_depth"])
    self._assign_url_info_defaults(url_info)
    if url_info["root_url"] is None:
        url_info["root_url"] = url

    # determine crawl priority/depth
    is_valid, url_info["crawl_priority"], url_info["crawl_depth"] = \
        crawl_priority_and_depth_evaluator.evaluate(url, url_info["source"], url_info)
    if not is_valid:
        return {"status" : -1}

    # stores to urlRepository table
    url_info["page_last_modified"] = None
    url_info["crawl_status"] = "crawling"
    url_info["last_crawled"] = None
    url_info["original_url"] = None
    # all urls are static now
    url_info["crawl_type"] = "static"

    # TODO: add to crawler db; this should not be done here.
    # Some projects do not need to store url info into a database;
    # a middleware should handle these kinds of actions.
    #success, promoted = crawlerdb.add_url_info(url, url_info, True)

    if message["source"] != "redirected":
        # notify crawler
        message_type, crawler_message = CrawlerUtils.build_crawler_request_msg(url, url_info)
        handler.HandlerRepository.process(message_type, crawler_message)
    return {"status" : 1}
def normalize_url(self, url, base_url = None):
    if url is None or len(url) == 0:
        return None
    original_url = url

    # Note: here we assume all non-unicode urls are encoded by utf-8
    if isinstance(url, str):
        url = url.decode("utf-8")
    if not isinstance(url, unicode):
        logging.error("invalid normalized url, url is not unicode", url = original_url, base_url = base_url)
        return None
    url = url.replace('%20', ' ').strip()

    # fix http scheme
    url = self._fix_http_scheme(url)

    # handle relative url
    if base_url is not None:
        url = urlparse.urljoin(base_url, url)

    # common normalization
    try:
        url = urlnorm.norm(url)
    except Exception as e:
        logging.warn("invalid normalized url, urlnorm raised exception", url = original_url, base_url = base_url, exception = e)
        return None

    try:
        parse_result = urlparse.urlparse(url)
    except Exception as e:
        logging.warn("invalid normalized url, when parsing url", url = original_url, base_url = base_url)
        return None

    if not parse_result.scheme.lower() in self._settings["general_crawl_policies"]["supported_schemes"]:
        logging.warn("invalid normalized url, not supported schemes", url = original_url, base_url = base_url)
        return None

    netloc = parse_result.netloc
    host = parse_result.netloc.split(':')[0]
    if ip_regex.match(host) is None:
        # not an ip host: check that domain and tld exist
        subdomain, domain, tld = tldextract.extract(host)
        if len(domain) == 0 or len(tld) == 0:
            logging.warn("invalid normalized url, no domain or tld", url = original_url, base_url = base_url)
            return None

        # fix chinese punctuation
        for i in range(len(chinese_punctuation_map[0])):
            src = chinese_punctuation_map[0][i]
            dst = chinese_punctuation_map[1][i]
            netloc = netloc.replace(src, dst)

        # add www if subdomain does not exist
        if len(subdomain) == 0:
            netloc = "www." + netloc

    fragment = parse_result.fragment
    if not fragment.startswith("!"):
        # drop the fragment unless it is an ajax "#!" fragment
        # (Google's recommendation for ajax requests)
        fragment = ""

    if len(parse_result.scheme) == 0 or len(netloc) == 0:
        logging.warn("invalid normalized url, scheme or netloc is none", url = original_url, base_url = base_url)
        return None

    url = urlparse.urlunparse((parse_result.scheme, netloc, parse_result.path, parse_result.params, parse_result.query, fragment))

    # canonicalize url
    # Note: it's too strong, and sometimes changes the url semantics.
    #url = ccrawler.utils.url.canonicalize_url(url)

    url = url.strip()
    if len(url) > self._settings["general_crawl_policies"]["max_url_length"]:
        logging.warn("invalid normalized url, length exceeded", url = original_url, base_url = base_url)
        return None
    elif len(url) == 0:
        logging.warn("invalid normalized url, length too short", url = original_url, base_url = base_url)
        return None
    else:
        return url
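# Expected behavior of normalize_url given the steps above (illustrative
# examples, not test fixtures from this repo):
#
#   normalize_url("HTTP://Example.com/a/../b#frag")
#       -> "http://www.example.com/b"
#       (scheme/host lowercased, path resolved, bare fragment dropped, www added)
#
#   normalize_url("page.html", base_url = "http://www.example.com/dir/")
#       -> "http://www.example.com/dir/page.html"   (relative url resolved)
#
#   normalize_url("http://ajax.example.com/#!state")
#       keeps the "#!" fragment, per Google's ajax-crawling recommendation.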