def start(self, heart_beat_config):
    """
    Run the heartbeat server accept loop in the calling thread.

    Binds a TCP socket on heart_beat_config["server_port"], starts
    self._run on a background thread, then accepts connections until
    HeartBeatServer.global_stop_event is set. Each connection is read once
    (at most heart_beat_config["max_data_size"] bytes), parsed as JSON and
    stored through heartbeatdb.save_heartbeat.

    Config keys read here: "server_port", "backlog", "server_interval",
    "max_data_size". The whole config is also kept on
    self._heart_beat_config.

    A socket.error is logged and ends the loop; any other exception
    propagates. The listening socket is closed in every case.
    """
    self._server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        self._server_socket.bind(('', heart_beat_config["server_port"]))
        self._server_socket.listen(heart_beat_config["backlog"])
        self._interval = heart_beat_config["server_interval"]
        self._heart_beat_config = heart_beat_config
        self._last_notification_time = None

        # signal.signal() may only be called from the main thread, so
        # start() has to be invoked from the main thread as well.
        signal.signal(signal.SIGTERM, HeartBeatServer._stop)
        signal.signal(signal.SIGINT, HeartBeatServer._stop) # for ctrl-c

        thread.start_new_thread(self._run, ())
        logging.info("heartbeat server started")

        while not HeartBeatServer.global_stop_event.is_set():
            client_socket, _ = self._server_socket.accept()
            try:
                raw_data = client_socket.recv(heart_beat_config["max_data_size"])
                message = simplejson.loads(raw_data)
                logging.debug("heartbeat server received message", message)
                heartbeatdb.save_heartbeat(message)
            finally:
                # Always release the per-connection socket, even when the
                # payload is malformed or saving the heartbeat fails.
                client_socket.close()
    except socket.error as e:
        logging.warn("socket error for heartbeat server!!!", exception = e)
    finally:
        self._server_socket.close()
        logging.info("heartbeat server terminated")
def _check_is_all_crash(self):
    """
    Original method of checking whether all services are alive;
    not used right now.

    Loads the heartbeats returned by heartbeatdb.get_heartbeats for the
    configured "check_duration", reduces them to distinct
    (ip, handler_name, pid) records, and saves two count summaries
    (per "ip_handler_name" and per handler_name) through
    heartbeatdb.save_handler_counts.

    If any handler listed in "required_handlers" has a count of zero, a
    notification email is sent, but at most once per
    "notification_duration" seconds.
    """
    config = self._heart_beat_config

    heartbeats = heartbeatdb.get_heartbeats(config["check_duration"])
    heartbeats = misc.cursor_to_array(heartbeats)
    heartbeats = misc.select(heartbeats, fields=["ip", "handler_name", "pid"])
    heartbeats = misc.distinct(heartbeats)

    handler_counts_per_machine = misc.count(
        heartbeats,
        key = lambda heartbeat : "%s_%s" % (heartbeat["ip"], heartbeat["handler_name"]))
    heartbeatdb.save_handler_counts(
        simplejson.dumps(handler_counts_per_machine),
        type="handler_counts_per_machine")

    handler_counts = misc.count(heartbeats, key = lambda heartbeat : heartbeat["handler_name"])
    heartbeatdb.save_handler_counts(simplejson.dumps(handler_counts), type="handler_counts_total")
    logging.debug("current alive handler counts", handler_counts)

    # Note: an email is sent as soon as at least one required handler has
    # no running instance. any() short-circuits and, unlike
    # len(filter(...)), also works where filter() returns an iterator.
    some_required_missing = any(
        handler_counts.get(handler_name, 0) == 0
        for handler_name in config["required_handlers"])
    if not some_required_missing:
        return

    # Throttle notifications: skip if the previous one is too recent.
    notification_gap = datetime.timedelta(seconds=config["notification_duration"])
    if self._last_notification_time is None or \
            datetime.datetime.now() - self._last_notification_time >= notification_gap:
        email_body = "some handlers are not running:\n %s" % handler_counts_per_machine
        self._send_email(
            config["email_server"],
            config["email_from"],
            config["email_tos"],
            config["email_title"],
            email_body)
        self._last_notification_time = datetime.datetime.now()
        logging.error("heartbeat server detects required handlers are not fully running, notification email sent", handler_counts_per_machine)
def _handle_redirect(self, url, message):
    """
    Handle a crawler response whose url was redirected to `url`.

    First submits a "crawl_request" for the redirected url (in-process).
    If that returns a negative status, `message` is pointed back at the
    original url with status 802 and nothing else happens.

    Otherwise a "crawler_response" is dispatched for the redirected url,
    built from selected fields of `message`, and `message` itself is
    rewritten to describe the original url with status 801 and a
    "redirect_url" entry. `message` is mutated in place; nothing is
    returned.
    """
    source_url = message["original_url"]

    #Note: double check if the whole flow is consistent
    # register the redirected url through crawl_handler
    redirect_request = {
        "url" : url,
        "source" : "redirected",
        "parent_url" : source_url,
        "root_url" : url,
        "crawl_priority" : message["crawl_priority"],
        "crawl_depth" : message["crawl_depth"],
    }
    outcome = handler.HandlerRepository.process("crawl_request", redirect_request, force_inproc = True)

    if not (outcome["status"] >= 0):
        message["url"] = source_url
        message["status"] = 802
        return

    logging.debug(self._log_formatter("redirected succeeded", url = url, original_url = source_url))

    # crawler_response for the redirected url
    copied_fields = ["url", "status", "doc", "headers", "page_last_modified", "last_crawled", "error_message"]
    redirect_response = misc.clone_dict(message, copied_fields)
    redirect_response["original_url"] = url
    # url_info is taken from the message meta instead of the db
    #url_info = crawlerdb.get_url_info(url, common_settings.crawler_msg_meta_fields)
    redirect_response["meta"] = message['meta']
    handler.HandlerRepository.process("crawler_response", redirect_response)

    # crawler_response for the original url
    message["url"] = source_url
    message["redirect_url"] = url
    message["status"] = 801
def _try_decode(url, body, encode): html = None try: decoder = codecs.lookup(encode) html = decoder.decode(body)[0] except Exception: logging.debug("try decode failed", encoding = encode,url = url) return html
def _process(self):
    """
    Drain expired url_infos and send each one to the crawler.

    Keeps calling crawlerdb.find_and_modify_expired_url_info with the
    current UTC time until it returns None. For every url_info returned,
    a crawler request message is built and dispatched through
    handler.HandlerRepository.process.
    """
    while True:
        expired = crawlerdb.find_and_modify_expired_url_info(
            datetime.datetime.utcnow(),
            common_settings.crawler_msg_meta_fields)
        if expired is None:
            return

        expired_url = expired["url"]
        message_type, request_msg = CrawlerUtils.build_crawler_request_msg(expired_url, expired)
        handler.HandlerRepository.process(message_type, request_msg)
        logging.debug(self._log_formatter("sent to crawler", url=expired_url))
def _init(self, config, data_config):
    """
    Set up the redis client and its settings from `config` / `data_config`.

    `config` keys read (with defaults): "host" ("localhost"), "port" (6379),
    "db" (0), "valid_key" ("__valid_redis"), "validation_enabled" (False),
    "enabled" (False). `data_config` supplies "data_types" (default {}).
    When "enabled" is true, self._wait() is called before returning.
    """
    connection_args = {
        "host": config.get("host", "localhost"),
        "port": config.get("port", 6379),
        "db": config.get("db", 0),
    }
    self._client = redis.StrictRedis(**connection_args)

    self._valid_key = config.get("valid_key", "__valid_redis")
    self._stop_condition = None
    self._data_types = data_config.get("data_types", {})
    self._validation_enabled = config.get("validation_enabled", False)

    enabled = config.get("enabled", False)
    self._enabled = enabled
    if enabled:
        self._wait()

    logging.debug("redis client initialized")
def _once(self):
    """
    Send one heartbeat message to the heartbeat server.

    The message is a JSON object with the current UTC timestamp
    ("datetime") plus this client's "ip", "handler_name", "pid" and
    "handler_key". A fresh TCP connection is opened for the message and
    closed afterwards. A socket.error is logged and swallowed, so a
    failed heartbeat does not raise to the caller.
    """
    now = datetime2timestamp(datetime.datetime.utcnow())
    message = {
        "datetime" : now,
        "ip" : self._ip,
        "handler_name" : self._handler_name,
        "pid" : self._process_id,
        "handler_key" : self._handler_key,
    }

    # Reset first: if socket creation itself fails, the finally clause must
    # not touch a missing attribute or a socket left over from an earlier call.
    self._client_socket = None
    try:
        self._client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self._client_socket.connect((self._server_address, self._server_port))
        # sendall() keeps writing until the whole payload has been sent;
        # plain send() is allowed to stop after a partial write.
        self._client_socket.sendall(simplejson.dumps(message))
        logging.debug("heartbeat client sent message", message)
    except socket.error as e:
        logging.warn("socket error for heartbeat client", exception = e)
    finally:
        if self._client_socket is not None:
            self._client_socket.close()
def decode(url, headers, body, encoding=None):
    """
    decode html to unicode

    Tries the caller-supplied `encoding` first (if any), then the
    candidates produced by successive calls to _try_get_encoding, until
    _try_decode succeeds.

    Returns (html, encoding) for the first encoding that decodes `body`,
    or (None, None) once _try_get_encoding has no more candidates.
    """
    attempt = 0
    candidate = encoding
    while True:
        if candidate is None:
            candidate, attempt = _try_get_encoding(headers, body, attempt)
            if candidate is None:
                logging.error("decoding failed for url", url)
                return None, None
        else:
            # only reachable on the first pass, when the caller supplied an encoding
            attempt = -1

        text = _try_decode(url, body, candidate)
        if text is not None:
            logging.debug('decode url succeeded', url = url, encoding = candidate, try_count = attempt)
            return text, candidate

        # this candidate failed: advance the counter and ask for the next one
        attempt += 1
        candidate = None
def _stop(cls, signum, frame):
    """
    Signal-handler style callback that requests server shutdown.

    Sets HeartBeatServer.global_stop_event; `signum` and `frame` are
    accepted to match the signal handler signature and are not used.
    """
    stop_event = HeartBeatServer.global_stop_event
    stop_event.set()
    logging.debug("heartbeat server is terminating gracefully")
def start_by_thread(self):
    """Start self._run on a new background thread and return immediately."""
    target, no_args = self._run, ()
    thread.start_new_thread(target, no_args)
    logging.debug("heartbeat client started")
def _run(self): while not (self._stop_condition is not None and self._stop_condition()): time.sleep(self._interval) self._once() else: logging.debug("heartbeat client terminated")