Exemple #1
0
    def start(self, heart_beat_config):
        """
        Run the heartbeat server accept loop in the calling thread.

        Binds a TCP socket on heart_beat_config["server_port"], registers
        HeartBeatServer._stop for SIGTERM and SIGINT, starts self._run on a
        separate thread and then accepts connections until
        HeartBeatServer.global_stop_event is set. Every connection is read once
        (at most heart_beat_config["max_data_size"] bytes), parsed as JSON and
        stored through heartbeatdb.save_heartbeat.

        heart_beat_config must provide "server_port", "backlog",
        "server_interval" and "max_data_size".

        A socket error is logged and ends the loop; the server socket is always
        closed before returning.
        """
        self._server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            self._server_socket.bind(('', heart_beat_config["server_port"]))
            self._server_socket.listen(heart_beat_config["backlog"])
            self._interval = heart_beat_config["server_interval"]
            self._heart_beat_config = heart_beat_config
            self._last_notification_time = None

            signal.signal(signal.SIGTERM, HeartBeatServer._stop)
            signal.signal(signal.SIGINT, HeartBeatServer._stop) # for ctrl-c

            thread.start_new_thread(self._run, ())

            logging.info("heartbeat server started")
            while not HeartBeatServer.global_stop_event.is_set():
                client_socket, _ = self._server_socket.accept()
                try:
                    raw_data = client_socket.recv(heart_beat_config["max_data_size"])
                    try:
                        message = simplejson.loads(raw_data)
                    except ValueError as e:
                        # an empty or malformed payload from one client must not
                        # take the whole server down
                        logging.warn("heartbeat server received invalid message", exception = e)
                    else:
                        logging.debug("heartbeat server received message", message)
                        heartbeatdb.save_heartbeat(message)
                finally:
                    # release the connection even when handling it failed
                    client_socket.close()
        except socket.error as e:
            logging.warn("socket error for heartbeat server!!!", exception = e)
        finally:
            self._server_socket.close()
            logging.info("heartbeat server terminated")
Exemple #2
0
    def _check_is_all_crash(self):
        """
        Original method of checking whether all services are alive
        (not used right now).

        Loads the heartbeats for self._heart_beat_config["check_duration"],
        reduces them to distinct ip / handler_name / pid records and saves two
        tallies through heartbeatdb.save_handler_counts: one keyed by
        "<ip>_<handler_name>" and one keyed by handler name.

        If any entry of self._heart_beat_config["required_handlers"] has a zero
        count, a notification email is sent, but only when no notification has
        been sent yet or the last one is at least
        self._heart_beat_config["notification_duration"] seconds old.
        """
        heartbeats = heartbeatdb.get_heartbeats(self._heart_beat_config["check_duration"])
        heartbeats = misc.cursor_to_array(heartbeats)
        heartbeats = misc.select(heartbeats, fields=["ip", "handler_name", "pid"])
        heartbeats = misc.distinct(heartbeats)
        handler_counts_per_machine = misc.count(heartbeats, key = lambda heartbeat : "%s_%s" % (heartbeat["ip"], heartbeat["handler_name"]))
        heartbeatdb.save_handler_counts(simplejson.dumps(handler_counts_per_machine), type="handler_counts_per_machine")
        handler_counts = misc.count(heartbeats, key = lambda heartbeat : heartbeat["handler_name"])
        heartbeatdb.save_handler_counts(simplejson.dumps(handler_counts), type="handler_counts_total")
        logging.debug("current alive handler counts", handler_counts)
        #Note: currently we will send email if no handler is running
        # any() stops at the first missing handler and does not depend on
        # filter() returning a list
        has_missing_handler = any(
            handler_counts.get(handler_name, 0) == 0
            for handler_name in self._heart_beat_config["required_handlers"])
        if has_missing_handler:
            if self._last_notification_time is None or datetime.datetime.now() - self._last_notification_time >= \
                datetime.timedelta(seconds=self._heart_beat_config["notification_duration"]):

                email_body = "some handlers are not running:\n %s" % handler_counts_per_machine
                self._send_email(
                    self._heart_beat_config["email_server"],
                    self._heart_beat_config["email_from"],
                    self._heart_beat_config["email_tos"],
                    self._heart_beat_config["email_title"],
                    email_body)
                self._last_notification_time = datetime.datetime.now()
                logging.error("heartbeat server detects required handlers are not fully running, notification email sent", handler_counts_per_machine)
    def _handle_redirect(self, url, message):
        """
        Feed a redirect target back into the crawl flow and update message.

        url is the redirect target; message is the crawler response, which must
        carry "original_url", "crawl_priority", "crawl_depth" and "meta".

        A "crawl_request" for url is processed in-process first. If its result
        status is >= 0, a "crawler_response" for url is processed as well and
        message is rewritten to describe the original url with status 801 and
        "redirect_url" set. Otherwise message only gets the original url and
        status 802. message is modified in place; nothing is returned.
        """
        source_url = message["original_url"]

        #Note: double check if the whole flow is consistent
        # register the redirect target through the crawl_request handler
        redirect_request = {
            "url" : url,
            "source" : "redirected",
            "parent_url" : source_url,
            "root_url" : url,
            "crawl_priority" : message["crawl_priority"],
            "crawl_depth" : message["crawl_depth"],
        }
        request_result = handler.HandlerRepository.process("crawl_request", redirect_request, force_inproc = True)
        accepted = request_result["status"] >= 0
        if not accepted:
            message["url"] = source_url
            message["status"] = 802
            return

        logging.debug(self._log_formatter("redirected succeeded", url = url, original_url = source_url))

        # build the crawler_response for the redirect target; this copy has to
        # be taken before message itself is rewritten below
        copied_fields = ["url", "status", "doc", "headers", "page_last_modified", "last_crawled", "error_message"]
        redirect_response = misc.clone_dict(message, copied_fields)
        redirect_response["original_url"] = url
        # url_info comes from the message meta instead of the db
        redirect_response["meta"] = message['meta']
        handler.HandlerRepository.process("crawler_response", redirect_response)

        # turn message into the crawler_response of the original url
        message["url"] = source_url
        message["redirect_url"] = url
        message["status"] = 801
Exemple #4
0
def _try_decode(url, body, encode):
    """
    Decode body with the codec named encode.

    Returns the decoded text, or None when the codec lookup or the decoding
    raises; the failure is only logged at debug level together with url.
    """
    try:
        codec_info = codecs.lookup(encode)
        text = codec_info.decode(body)[0]
    except Exception:
        logging.debug("try decode failed", encoding = encode,url = url)
        return None
    return text
Exemple #5
0
    def _process(self):
        """
        Send every expired url_info to the crawler.

        Repeatedly asks crawlerdb.find_and_modify_expired_url_info for an entry
        expired as of the current UTC time, builds a crawler request message for
        it and passes it to handler.HandlerRepository.process. Returns as soon
        as the lookup yields None.
        """
        while True:
            current_time = datetime.datetime.utcnow()
            expired_info = crawlerdb.find_and_modify_expired_url_info(
                current_time, common_settings.crawler_msg_meta_fields)
            if expired_info is None:
                # no expired url_info was returned
                return

            expired_url = expired_info["url"]
            request_type, request_msg = CrawlerUtils.build_crawler_request_msg(expired_url, expired_info)
            handler.HandlerRepository.process(request_type, request_msg)
            logging.debug(self._log_formatter("sent to crawler", url=expired_url))
Exemple #6
0
    def _init(self, config, data_config):
        """
        Create the redis client and read the client settings.

        config supplies "host" (default "localhost"), "port" (default 6379),
        "db" (default 0), "valid_key" (default "__valid_redis"),
        "validation_enabled" (default False) and "enabled" (default False).
        data_config supplies "data_types" (default {}).

        self._wait() is called only when "enabled" is truthy.
        """
        connection_args = {
            "host": config.get("host", "localhost"),
            "port": config.get("port", 6379),
            "db": config.get("db", 0),
        }
        self._client = redis.StrictRedis(**connection_args)

        self._valid_key = config.get("valid_key", "__valid_redis")
        self._stop_condition = None
        self._data_types = data_config.get("data_types", {})
        self._validation_enabled = config.get("validation_enabled", False)

        enabled = config.get("enabled", False)
        self._enabled = enabled
        if self._enabled:
            self._wait()

        logging.debug("redis client initialized")
Exemple #7
0
    def _once(self):
        """
        Send a single heartbeat message to the heartbeat server.

        The message is a JSON object with the current UTC timestamp, ip,
        handler_name, pid and handler_key of this client. It is sent over a new
        TCP connection to (self._server_address, self._server_port), which is
        closed afterwards. Socket errors are logged and not raised.
        """
        now = datetime2timestamp(datetime.datetime.utcnow())
        message = {"datetime" : now, "ip" : self._ip, "handler_name" : self._handler_name,
            "pid" : self._process_id, "handler_key" : self._handler_key}

        # track the socket locally so the cleanup never touches a missing
        # attribute or a socket left over from a previous call
        client_socket = None
        try:
            client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self._client_socket = client_socket
            client_socket.connect((self._server_address, self._server_port))
            # sendall keeps writing until the whole payload is out; send may
            # stop after a partial write
            client_socket.sendall(simplejson.dumps(message))
            logging.debug("heartbeat client sent message", message)
        except socket.error as e:
            logging.warn("socket error for heartbeat client", exception = e)
        finally:
            if client_socket is not None:
                client_socket.close()
Exemple #8
0
def decode(url, headers, body, encoding=None):
    '''
    decode html to unicode

    When encoding is given it is tried first. Otherwise, and after every
    failed attempt, the next candidate is requested from
    _try_get_encoding(headers, body, try_count).

    Returns (html, encoding) for the first encoding that decodes body, or
    (None, None) once _try_get_encoding has no further candidate.
    '''

    attempt = 0

    while True:
        if encoding is None:
            encoding, attempt = _try_get_encoding(headers, body, attempt)
            if encoding is None:
                logging.error("decoding failed for url", url)
                return None, None
        else:
            # caller-supplied encoding: counted as attempt -1
            attempt = -1

        decoded = _try_decode(url, body, encoding)
        if decoded is not None:
            logging.debug('decode url succeeded', url = url, encoding = encoding, try_count = attempt)
            return decoded, encoding

        # this candidate failed; ask for another one on the next pass
        attempt += 1
        encoding = None
Exemple #9
0
 def _stop(cls, signum, frame):
     """
     Request shutdown of the heartbeat server.

     Takes the (signum, frame) arguments of a signal handler; both are
     ignored. Sets HeartBeatServer.global_stop_event.
     """
     stop_event = HeartBeatServer.global_stop_event
     stop_event.set()
     logging.debug("heartbeat server is terminating gracefully")
Exemple #10
0
 def start_by_thread(self):
     """
     Start the heartbeat client by running self._run on a new thread.
     """
     worker = self._run
     thread.start_new_thread(worker, ())
     logging.debug("heartbeat client started")
Exemple #11
0
 def _run(self):
     """
     Heartbeat loop: sleep self._interval seconds, then call self._once().

     Keeps going while self._stop_condition is None or returns a falsy value;
     the condition is checked before every sleep. Logs once the loop ends.
     """
     while self._stop_condition is None or not self._stop_condition():
         time.sleep(self._interval)
         self._once()
     logging.debug("heartbeat client terminated")