def start(self, heart_beat_config):
    self._server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        self._server_socket.bind(('', heart_beat_config["server_port"]))
        self._server_socket.listen(heart_beat_config["backlog"])
        self._interval = heart_beat_config["server_interval"]
        self._heart_beat_config = heart_beat_config
        self._last_notification_time = None

        signal.signal(signal.SIGTERM, HeartBeatServer._stop)
        signal.signal(signal.SIGINT, HeartBeatServer._stop) # for ctrl-c

        thread.start_new_thread(self._run, ())
        logging.info("heartbeat server started")

        while not HeartBeatServer.global_stop_event.is_set():
            client_socket, _ = self._server_socket.accept()
            raw_data = client_socket.recv(heart_beat_config["max_data_size"])
            message = simplejson.loads(raw_data)
            logging.debug("heartbeat server received message", message)
            heartbeatdb.save_heartbeat(message)
            client_socket.close()
    except socket.error as e:
        logging.warn("socket error for heartbeat server!!!", exception = e)
    finally:
        self._server_socket.close()
        logging.info("heartbeat server terminated")
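# --- Usage sketch (not part of the original module) ---
# A minimal heart_beat_config illustrating the keys start() reads above.
# Only the key names come from the code; the values here are assumptions,
# not project defaults.
example_heart_beat_config = {
    "server_port": 7070,      # TCP port the server binds to (assumed value)
    "backlog": 5,             # listen() backlog (assumed value)
    "server_interval": 10,    # seconds between server-side checks (assumed value)
    "max_data_size": 4096,    # max bytes read per heartbeat message (assumed value)
}
# HeartBeatServer().start(example_heart_beat_config)  # blocks until stopped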
def _wait(self):
    if not self._validation_enabled:
        return

    #Note: all clients will wait until the __valid_redis field is set to 1
    while self._client.get(self._valid_key) != "1":
        if self._stop_condition is not None and self._stop_condition():
            logging.warn("whole process is terminating")
            raise Exception("whole process is terminating")
        else:
            logging.warn("redis server is loading data")
            time.sleep(5)
def evaluate(self, url, source, url_info, extras = None):
    crawl_priority = -1
    crawl_depth = -1

    # if explicit values exist in url_info, read them
    if url_info["crawl_priority"] is not None:
        crawl_priority = url_info["crawl_priority"]
    if url_info["crawl_depth"] is not None:
        crawl_depth = url_info["crawl_depth"]

    #url validation
    if not url_validator.validate(url, url_info["parent_url"]):
        logging.warn("invalid crawl url", url = url, parent_url = url_info["parent_url"])
        return False, crawl_priority, crawl_depth

    #for non-parsed urls, priority/depth are determined by domain or defaults
    if source == "offline" or source == "online" or source == "post_ondemand":
        if url_info["crawl_priority"] is None or url_info["crawl_depth"] is None:
            # determine priority and depth by source
            crawl_priority, crawl_depth = self._determine(url, source)
        #explicit values take precedence over the determined defaults
        if url_info["crawl_priority"] is not None:
            crawl_priority = url_info["crawl_priority"]
        if url_info["crawl_depth"] is not None:
            crawl_depth = url_info["crawl_depth"]
    #for parsed urls, priority += 1, depth -= 1 # TODO why priority + 1?
    elif source == "parsed" or source == "redirected":
        crawl_priority = url_info["crawl_priority"]
        if crawl_priority < self._settings["total_priority_count"] - 1:
            crawl_priority += 1
        crawl_depth = url_info["crawl_depth"] - 1

        #handle external url
        if url_analyser.is_external_url(url, url_info["parent_url"]):
            mode = self._settings["general_crawl_policies"]["external_crawl_mode"]
            # mode can be "continue" or "new"; if "new", determine
            # new priority and depth from the "external" source
            if mode == "new":
                crawl_priority, crawl_depth = self._determine(url, "external")
    else:
        raise Exception("unsupported source %s" % source)

    # raise if priority or depth is out of range
    if crawl_priority < 0 or crawl_priority >= self._settings["total_priority_count"]:
        raise Exception("priority exceeded %s" % crawl_priority)
    if crawl_depth < 0:
        raise Exception("crawl_depth can't be less than 0 %s" % crawl_depth)

    return True, crawl_priority, crawl_depth
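# --- Usage sketch (not part of the original module) ---
# Illustrates the url_info dict evaluate() expects above. Only the key names
# come from the code; the values and the "evaluator" instance name are
# hypothetical, for demonstration.
example_url_info = {
    "crawl_priority": None,   # None lets evaluate() derive it from the source
    "crawl_depth": None,      # None lets evaluate() derive it from the source
    "parent_url": "http://www.example.com/",  # hypothetical parent page
}
# accepted, priority, depth = evaluator.evaluate(
#     "http://www.example.com/news/1.html", "offline", example_url_info)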
def _once(self):
    now = datetime2timestamp(datetime.datetime.utcnow())
    message = {"datetime" : now, "ip" : self._ip, "handler_name" : self._handler_name,
        "pid" : self._process_id, "handler_key" : self._handler_key}
    try:
        self._client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self._client_socket.connect((self._server_address, self._server_port))
        self._client_socket.send(simplejson.dumps(message))
        logging.debug("heartbeat client sent message", message)
    except socket.error as e:
        logging.warn("socket error for heartbeat client", exception = e)
    finally:
        self._client_socket.close()
def _decode_fields(self, url, message):
    #decode some message fields
    if message["page_last_modified"] is not None:
        message["page_last_modified"] = decoder.decode_string(message["page_last_modified"])
        if message["page_last_modified"] is None:
            logging.warn("decode page_last_modified failed", url = url)

    if message["headers"] is not None:
        decoded_headers = {}
        for key in message["headers"].keys():
            value = message["headers"].get(key, "")

            decoded_key = decoder.decode_string(key)
            if decoded_key is None:
                logging.warn("decoded http response header key failed", url = url, field = unicode({"key" : key, "value" : value}))
                continue
            if not re.match("^[a-zA-Z0-9-]+$", decoded_key):
                logging.warn("filtered invalid http response header key", url = url, field = unicode({"key" : key, "value" : value}))
                continue

            decoded_value = decoder.decode_string(value)
            if decoded_value is None:
                logging.warn("decoded http response header value failed", url = url, field = unicode({"key" : key, "value" : value}))
                continue

            decoded_headers[decoded_key] = decoded_value

        message["headers"] = decoded_headers
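# --- Usage sketch (not part of the original module) ---
# Shows the message shape _decode_fields() mutates in place: page_last_modified
# and each header key/value pair are decoded, and header keys that fail to
# decode or fall outside [a-zA-Z0-9-] are dropped. The field values below are
# illustrative assumptions.
example_message = {
    "page_last_modified": "Tue, 01 Jan 2013 00:00:00 GMT",
    "headers": {"Content-Type": "text/html; charset=utf-8",
                "X Bad Key": "dropped: key contains a space"},
}
# self._decode_fields("http://www.example.com/", example_message)
# example_message["headers"] then only holds keys matching ^[a-zA-Z0-9-]+$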
def _reconnect(args, msg, e):
    if len(args) == 0 or not isinstance(args[0], RedisClient):
        raise Exception("unsupported decorator, it should be applied to RedisClient methods")

    self = args[0]
    while self._stop_condition is None or not self._stop_condition():
        logging.error("redis connection error: %s, %s reconnecting..." % (msg, e))
        time.sleep(5)
        try:
            self._wait()
            return True
        except redis.exceptions.ConnectionError as e:
            pass

    logging.warn("whole process is terminating")
    raise Exception("whole process is terminating")
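# --- Decorator sketch (not part of the original module) ---
# _reconnect() above expects to be driven by a decorator on RedisClient
# methods, with args being the wrapped call's positional arguments
# (args[0] is the RedisClient instance). This is one possible shape of that
# decorator; the name retry_on_connection_error and the retry-forever loop
# are assumptions, not the project's actual implementation.
def retry_on_connection_error(msg):
    def decorator(func):
        def wrapper(*args, **kwargs):
            while True:
                try:
                    return func(*args, **kwargs)
                except redis.exceptions.ConnectionError as e:
                    # blocks until redis is reachable again, or raises
                    # when the stop condition fires
                    _reconnect(args, msg, e)
        return wrapper
    return decorator

# @retry_on_connection_error("get")
# def get(self, key): ...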
def normalize_url(self, url, base_url = None):
    if url is None or len(url) == 0:
        return None

    original_url = url
    #Note: assume all non-unicode urls are encoded as utf-8
    if isinstance(url, str):
        url = url.decode("utf-8")
    if not isinstance(url, unicode):
        logging.error("invalid normalized url, url is not unicode", url = original_url, base_url = base_url)
        return None

    url = url.replace('%20', ' ').strip()

    #fix http scheme
    url = self._fix_http_scheme(url)

    #handle relative url
    if base_url is not None:
        url = urlparse.urljoin(base_url, url)

    #common normalization
    try:
        url = urlnorm.norm(url)
    except Exception as e:
        logging.warn("invalid normalized url, urlnorm raised exception", url = original_url, base_url = base_url, exception = e)
        return None

    try:
        parse_result = urlparse.urlparse(url)
    except Exception as e:
        logging.warn("invalid normalized url, when parsing url", url = original_url, base_url = base_url)
        return None

    if not parse_result.scheme.lower() in self._settings["general_crawl_policies"]["supported_schemes"]:
        logging.warn("invalid normalized url, not supported schemes", url = original_url, base_url = base_url)
        return None

    netloc = parse_result.netloc
    host = parse_result.netloc.split(':')[0]
    if ip_regex.match(host) is None: #not an ip host
        #check that domain and tld exist
        subdomain, domain, tld = tldextract.extract(host)
        if len(domain) == 0 or len(tld) == 0:
            logging.warn("invalid normalized url, no domain or tld", url = original_url, base_url = base_url)
            return None

        #fix chinese punctuation
        for i in range(len(chinese_punctuation_map[0])):
            src = chinese_punctuation_map[0][i]
            dst = chinese_punctuation_map[1][i]
            netloc = netloc.replace(src, dst)

        #add www if no subdomain exists
        if len(subdomain) == 0:
            netloc = "www." + netloc

    fragment = parse_result.fragment
    if not fragment.startswith("!"): #Google's recommendation for ajax requests
        fragment = ""

    if len(parse_result.scheme) == 0 or len(netloc) == 0:
        logging.warn("invalid normalized url, scheme or netloc is none", url = original_url, base_url = base_url)
        return None

    url = urlparse.urlunparse((parse_result.scheme, netloc, parse_result.path, parse_result.params, parse_result.query, fragment))

    #canonicalize url
    #Note: it's too strong, and sometimes changes the url semantics.
    #url = ccrawler.utils.url.canonicalize_url(url)

    url = url.strip()
    if len(url) > self._settings["general_crawl_policies"]["max_url_length"]:
        logging.warn("invalid normalized url, length exceeded", url = original_url, base_url = base_url)
        return None
    elif len(url) == 0:
        logging.warn("invalid normalized url, length too short", url = original_url, base_url = base_url)
        return None
    else:
        return url
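# --- Usage sketch (not part of the original module) ---
# Hypothetical calls showing what normalize_url() above handles; the
# "analyser" instance name is an assumption, and exact results depend on
# urlnorm and the configured settings, so they are described, not asserted.
# analyser.normalize_url("page.html", base_url = "http://www.example.com/dir/")
#     -> joined against the base url, then normalized
# analyser.normalize_url("http://example.com/a#section")
#     -> "www." is prepended (no subdomain) and the non-"#!" fragment is dropped
# analyser.normalize_url("ftp://example.com/a")
#     -> None when "ftp" is not in supported_schemes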