def test_monitoring(self): class TestedClass(Monitor): def __init__(self, cqueue): self.cqueue = cqueue Monitor.__init__(self) @Monitor.synchronized def execute(self): self.cqueue.put(1) sleep(1) self.cqueue.get() class TesterThread(Thread): def __init__(self, tc): self.tc = tc Thread.__init__(self) def run(self): self.tc.execute() q = Queue() tc = TestedClass(q) a, b = TesterThread(tc), TesterThread(tc) a.start(), b.start() while a.is_alive() or b.is_alive(): sleep(0.1) self.assertNotEqual(q.qsize(), 2)
def test_acquire_contextmanager(self): class TestedClass(Monitor): def __init__(self, cqueue): self.cqueue = cqueue Monitor.__init__(self) @Monitor.synchronized def execute(self): self.cqueue.put(1) sleep(1) self.cqueue.get() class TesterThread(Thread): def __init__(self, tc): self.tc = tc Thread.__init__(self) def run(self): self.tc.execute() cq = Queue() cq.put(1) tc = TestedClass(cq) tt = TesterThread(tc) with Monitor.acquire(tc): tt.start() sleep(0.4) self.assertEqual(cq.qsize(), 1)
class CallbackHandler(object): """ handles callback for event for single url with pause support """ def __init__(self, url, subscription_id, callback, paused): self.url = url self.subscription_id = subscription_id self.callback = callback self.paused = paused self.event_q = Queue() def flush(self): # flush any pending events (no callback triggered on flush) while not self.event_q.empty(): self.event_q.get() def pause(self): # pause callbacks logger.debug("pausing url: %s", self.url) self.paused = True def resume(self): # trigger all callbacks before setting pause to false logger.debug("resume (queue-size %s) url %s", self.event_q.qsize(), self.url) while not self.event_q.empty(): event = self.event_q.get() try: self.callback(event) except Exception as e: logger.debug("Traceback:\n%s", traceback.format_exc()) logger.warn("failed to execute event callback: %s", e) self.paused = False def execute_callback(self, event): # execute callback or queue event if currently paused if self.paused: self.event_q.put(event) else: try: self.callback(event) except Exception as e: logger.debug("Traceback:\n%s", traceback.format_exc()) logger.warn("failed to execute event callback: %s", e)
class SelectiveRepeater(object): """ Selective repeater for SBP file I/O requests Fields ---------- _pending_map : list(PendingRequest) List (used as a map) of PendingRequest objects, used to track outstanding requests. _request_pool : Queue Queue of available requests (recorded by index number) _seqmap : dict(int,int) Dictionary mapping SBP request sequence IDs to their corresponding request index. _batch_msgs : list Collector for a batch of messages to be sent in one buffer via the link _last_check_time : Time The last time we checked if any packets had expired _expire_map : dict(Time, dict(PendingRequest, PendingRequest)) Dictionary which records the future time at which a request will expire. _msg_type : int The message type we're currently sending _link : Handler The link over which we're sending data _callback_thread : int ID of the thread that we expect callbacks from _link_thread : int ID of the thread that handles link writes """ def __init__(self, link, msg_type, cb=None): """ Args --- link : Handler Link over which messages will be sent. msg_type : The type of message being sent cb : Invoked when SBP message with type `msg_type` is received """ self._link = link self._msg_type = msg_type self._callback = cb self._seqmap = {} self._batch_msgs = [] self._last_check_time = Time.now() self._expire_map = defaultdict(dict) self._init_fileio_config(SBP_FILEIO_WINDOW_SIZE, SBP_FILEIO_BATCH_SIZE, PROGRESS_CB_REDUCTION_FACTOR) self._callback_thread = None self._link_thread = None self._total_sends = 1.0 self._total_retries = 0 self._config_retry_time = None def _init_fileio_config(self, window_size, batch_size, progress_cb_reduction_factor): self._pending_map = [PendingRequest(X) for X in range(window_size)] self._request_pool = Queue(window_size) for pending_req in self._pending_map: self._request_pool.put(pending_req) self._batch_size = batch_size self._progress_cb_reduction_factor = progress_cb_reduction_factor def __enter__(self): self._link.add_callback(self._request_cb, self._msg_type) self._link.add_callback(self._config_cb, SBP_MSG_FILEIO_CONFIG_RESP) return self def __exit__(self, type, value, traceback): self._link.remove_callback(self._request_cb, self._msg_type) self._link.remove_callback(self._config_cb, SBP_MSG_FILEIO_CONFIG_RESP) def _verify_cb_thread(self): """ Verify that only one thread is consuming requests. """ if self._callback_thread is None: self._callback_thread = threading.currentThread().ident assert self._callback_thread == threading.currentThread().ident def _verify_link_thread(self): """ Verify that only one thread is producing requests. """ if self._link_thread is None: self._link_thread = threading.currentThread().ident assert self._link_thread == threading.currentThread().ident def _return_pending_req(self, pending_req): """ Return a pending request to the write pool and clean any entries in the expiration map. """ self._verify_cb_thread() pending_req.completed = True try: msg = pending_req.message except AttributeError: # Got a completion for something that was never requested return self._try_remove_keys(self._seqmap, msg.sequence) if self._try_remove_keys(self._expire_map[pending_req.time_expire], pending_req): # Only put the request back if it was successfully removed self._request_pool.put(pending_req) def _record_pending_req(self, msg, time_now, expiration_time): """ Acquire a pending request object and record it's future expiration time in a map. """ self._verify_link_thread() # Queue.get will block if no requests are available pending_req = self._request_pool.get(True) assert self._pending_map[pending_req.index].index == pending_req.index self._seqmap[msg.sequence] = pending_req.index self._pending_map[pending_req.index].track(msg, time_now, expiration_time) self._expire_map[expiration_time][pending_req] = pending_req def _config_cb(self, msg, **metadata): self._config_msg = msg self._init_fileio_config(msg.window_size, msg.batch_size, PROGRESS_CB_REDUCTION_FACTOR * 2) def _request_cb(self, msg, **metadata): """ Process request completions. """ index = self._seqmap.get(msg.sequence) if index is None: return pending_req = self._pending_map[index] if self._callback: self._callback(pending_req.message, msg) self._return_pending_req(pending_req) def _has_pending(self): return self._request_pool.qsize() != len(self._pending_map) def _retry_send(self, check_time, pending_req, delete_keys): """ Retry a request by updating it's expire time on the object itself and in the expiration map. """ self._total_retries += 1 self._total_sends += 1 timeout_delta = Time(SBP_FILEIO_TIMEOUT) send_time = Time.now() new_expire = send_time + timeout_delta pending_req.record_retry(send_time, new_expire) self._expire_map[new_expire][pending_req] = pending_req self._link(pending_req.message) delete_keys.append(pending_req) def _try_remove_keys(self, d, *keys): success = True for key in keys: try: del d[key] except KeyError: success = False return success def _check_pending(self): """ Scans from the last check time to the current time looking for requests that are due to expire and retries them if necessary. """ time_now = Time.now() timeout_delta = Time(SBP_FILEIO_TIMEOUT) for check_time in Time.iter_since(self._last_check_time, time_now): pending_reqs = self._expire_map[check_time] retried_writes = [] for pending_req in list(pending_reqs): time_expire = pending_req.time + timeout_delta if time_now >= time_expire: if pending_req.tries >= MAXIMUM_RETRIES: raise Exception('Timed out') # If the completion map becomes inconsistent (because # things are completing at the same time they're # being re-tried) then the `completed` field should # prevent us from re-sending a write in this case. if not pending_req.completed: self._retry_send(check_time, pending_req, retried_writes) # Pending writes can be marked completed while this function # is running, so a key error means is was marked completed # after we sent a retry (therefore _try_remove_keys ignores # key errors). self._try_remove_keys(self._expire_map[check_time], *retried_writes) self._last_check_time = time_now def _window_available(self, batch_size): return self._request_pool.qsize() >= batch_size def _ensure_config_req_sent(self): if self._config_retry_time is not None: return now = Time.now() self._config_retry_time = now + Time(0, CONFIG_REQ_RETRY_MS) self._config_timeout = now + Time(0, CONFIG_REQ_TIMEOUT_MS) self._config_seq = random.randint(0, 0xffffffff) self._config_msg = None self._link(MsgFileioConfigReq(sequence=self._config_seq)) def _config_received(self): self._ensure_config_req_sent() if self._config_msg is not None: return True now = Time.now() if now >= self._config_retry_time: self._link(MsgFileioConfigReq(sequence=self._config_seq)) self._config_retry_time = now + Time(0, CONFIG_REQ_RETRY_MS) if now >= self._config_timeout: self._config_msg = MsgFileioConfigResp(sequence=0, window_size=100, batch_size=1, fileio_version=0) return self._config_msg is not None def _wait_config_received(self): while not self._config_received(): time.sleep(WAIT_SLEEP_S) def _wait_window_available(self, batch_size): self._wait_config_received() while not self._window_available(batch_size): self._check_pending() if not self._window_available(batch_size): time.sleep(WAIT_SLEEP_S) @property def total_retries(self): return self._total_retries @property def total_sends(self): return self._total_sends @property def progress_cb_reduction_factor(self): return self._progress_cb_reduction_factor def send(self, msg, batch_size=None): if batch_size is not None: self._send(msg, batch_size) else: self._send(msg, self._batch_size) def _send(self, msg, batch_size): """ Sends data via the current link, potentially batching it together. Parameters ---------- msg : MsgFileioReadReq, MsgFileioReadDirReq, MsgFileioWriteReq, MsgFileioRemove The message to be sent via the current link batch_size : int The number of message to batch together before actually sending """ if msg is not None: self._batch_msgs.append(msg) if len(self._batch_msgs) >= batch_size: self._wait_window_available(batch_size) time_now = Time.now() expiration_time = time_now + Time(SBP_FILEIO_TIMEOUT) for msg in self._batch_msgs: self._record_pending_req(msg, time_now, expiration_time) self._link(*self._batch_msgs) self._total_sends += len(self._batch_msgs) del self._batch_msgs[:] def flush(self): """ Flush any pending requests (batched or otherwise) and wait for all pending requests to complete. """ self.send(None, batch_size=0) while self._has_pending(): self._check_pending() time.sleep(WAIT_SLEEP_S)
class GraphiteReporter(threading.Thread): """A graphite reporter thread.""" def __init__(self, host, port, maxQueueSize=10000): """Connect to a Graphite server on host:port.""" threading.Thread.__init__(self) self.host, self.port = host, port self.sock = None self.queue = Queue() self.maxQueueSize = maxQueueSize self.daemon = True def run(self): """Run the thread.""" while True: try: try: name, value, valueType, stamp = self.queue.get() except TypeError: break self.log(name, value, valueType, stamp) finally: self.queue.task_done() def connect(self): """Connects to the Graphite server if not already connected.""" if self.sock is not None: return backoff = 0.01 while True: try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.settimeout(5) sock.connect((self.host, self.port)) self.sock = sock return except socket.error: time.sleep(random.uniform(0, 2.0 * backoff)) backoff = min(backoff * 2.0, 5.0) def disconnect(self): """Disconnect from the Graphite server if connected.""" if self.sock is not None: try: self.sock.close() except socket.error: pass finally: self.sock = None def _sendMsg(self, msg): """Send a line to graphite. Retry with exponential backoff.""" if not self.sock: self.connect() backoff = 0.001 while True: try: self.sock.sendall(msg) break except socket.error: logging.warning('Graphite connection error', exc_info=True) self.disconnect() time.sleep(random.uniform(0, 2.0 * backoff)) backoff = min(backoff * 2.0, 5.0) self.connect() def _sanitizeName(self, name): """Sanitize a metric name.""" return name.replace(' ', '-') def log(self, name, value, valueType=None, stamp=None): """Log a named numeric value. The value type may be 'value', 'count', or None.""" if type(value) == float: form = "%s%s %2.2f %d\n" else: form = "%s%s %s %d\n" if valueType is not None and len( valueType) > 0 and valueType[0] != '.': valueType = '.' + valueType if not stamp: stamp = time.time() self._sendMsg( form % (self._sanitizeName(name), valueType or '', value, stamp)) def enqueue(self, name, value, valueType=None, stamp=None): """Enqueue a call to log.""" # If queue is too large, refuse to log. if self.maxQueueSize and self.queue.qsize() > self.maxQueueSize: return # Stick arguments into the queue self.queue.put((name, value, valueType, stamp)) def flush(self): """Block until all stats have been sent to Graphite.""" self.queue.join() def shutdown(self): """Shut down the background thread.""" self.queue.put(None) self.flush()
class CachePipeline(object): def __init__(self, spider, cache): self.spider = spider self.cache = cache self.idle_event = Event() self.queue_size = 100 self.input_queue = Queue() self.result_queue = Queue() self.thread = Thread(target=self.thread_worker) self.thread.daemon = True self.thread.start() def has_free_resources(self): return (self.input_queue.qsize() < self.queue_size and self.result_queue.qsize() < self.queue_size) def is_idle(self): return self.idle_event.is_set() def thread_worker(self): self.idle_event.clear() while True: try: action, data = self.input_queue.get(block=False) except Empty: self.idle_event.set() time.sleep(0.1) self.idle_event.clear() if self.spider.shutdown_event.is_set(): #print('EXITING CACHE PIPELINE') return self.shutdown() #else: # print('no shutdown event') else: assert action in ('load', 'save') if action == 'load': task, grab = data result = None if self.is_cache_loading_allowed(task, grab): result = self.load_from_cache(task, grab) if result: #print('!! PUT RESULT INTO CACHE PIPE RESULT QUEUE (cache)') self.result_queue.put(('network_result', result)) else: self.result_queue.put(('task', task)) elif action == 'save': task, grab = data if self.is_cache_saving_allowed(task, grab): with self.spider.timer.log_time('cache'): with self.spider.timer.log_time('cache.write'): self.cache.save_response(task.url, grab) def is_cache_loading_allowed(self, task, grab): # 1) cache data should be refreshed # 2) cache is disabled for that task # 3) request type is not cacheable return (not task.get('refresh_cache', False) and not task.get('disable_cache', False) and grab.detect_request_method() == 'GET') def is_cache_saving_allowed(self, task, grab): """ Check if network transport result could be saved to cache layer. res: {ok, grab, grab_config_backup, task, emsg} """ if grab.request_method == 'GET': if not task.get('disable_cache'): if self.spider.is_valid_network_response_code( grab.response.code, task): return True return False def load_from_cache(self, task, grab): with self.spider.timer.log_time('cache'): with self.spider.timer.log_time('cache.read'): cache_item = self.cache.get_item(grab.config['url'], timeout=task.cache_timeout) if cache_item is None: return None else: with self.spider.timer.log_time( 'cache.read.prepare_request'): grab.prepare_request() with self.spider.timer.log_time( 'cache.read.load_response'): self.cache.load_response(grab, cache_item) grab.log_request('CACHED') self.spider.stat.inc('spider:request-cache') return { 'ok': True, 'task': task, 'grab': grab, 'grab_config_backup': grab.dump_config(), 'emsg': None } def shutdown(self): try: self.cache.close() except AttributeError: print('Cache %s does not support close method' % self.cache)
class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem, AbstractProvisioner): """ Mimics a job batcher, provisioner and scalable batch system """ def __init__(self, config, secondsPerJob): super(MockBatchSystemAndProvisioner, self).__init__('clusterName') # To mimic parallel preemptable and non-preemptable queues # for jobs we create two parallel instances of the following class self.config = config self.secondsPerJob = secondsPerJob self.provisioner = self self.batchSystem = self self.nodeTypes = config.nodeTypes self.nodeShapes = self.nodeTypes self.nodeShapes.sort() self.jobQueue = Queue() self.updatedJobsQueue = Queue() self.jobBatchSystemIDToIssuedJob = {} self.totalJobs = 0 # Count of total jobs processed self.totalWorkerTime = 0.0 # Total time spent in worker threads self.toilMetrics = None self.nodesToWorker = { } # Map from Node to instances of the Worker class self.workers = {nodeShape: [] for nodeShape in self.nodeShapes } # Instances of the Worker class self.maxWorkers = {nodeShape: 0 for nodeShape in self.nodeShapes } # Maximum number of workers self.running = False self.leaderThread = Thread(target=self._leaderFn) def start(self): self.running = True self.leaderThread.start() def shutDown(self): self.running = False self.leaderThread.join() # Stub out all AbstractBatchSystem methods since they are never called for name, value in iteritems(AbstractBatchSystem.__dict__): if getattr(value, '__isabstractmethod__', False): exec('def %s(): pass' % name) # Without this, the class would end up with .name and .value attributes del name, value # AbstractScalableBatchSystem methods def nodeInUse(self, nodeIP): return False def ignoreNode(self, nodeAddress): pass def unignoreNode(self, nodeAddress): pass @contextmanager def nodeFiltering(self, filter): nodes = self.getProvisionedWorkers( preemptable=True, nodeType=None) + self.getProvisionedWorkers( preemptable=False, nodeType=None) yield nodes # AbstractProvisioner methods def getProvisionedWorkers(self, nodeType=None, preemptable=None): """ Returns a list of Node objects, each representing a worker node in the cluster :param preemptable: If True only return preemptable nodes else return non-preemptable nodes :return: list of Node """ nodesToWorker = self.nodesToWorker if nodeType: return [ node for node in nodesToWorker if node.nodeType == nodeType ] else: return list(nodesToWorker.keys()) def terminateNodes(self, nodes): self._removeNodes(nodes) def remainingBillingInterval(self, node): pass def addJob(self, jobShape, preemptable=False): """ Add a job to the job queue """ self.totalJobs += 1 jobID = uuid.uuid4() self.jobBatchSystemIDToIssuedJob[jobID] = Job(memory=jobShape.memory, cores=jobShape.cores, disk=jobShape.disk, preemptable=preemptable) self.jobQueue.put(jobID) # JobBatcher functionality def getNumberOfJobsIssued(self, preemptable=None): if preemptable is not None: jobList = [ job for job in list(self.jobQueue.queue) if self.jobBatchSystemIDToIssuedJob[job].preemptable == preemptable ] return len(jobList) else: return self.jobQueue.qsize() def getJobs(self): return self.jobBatchSystemIDToIssuedJob.values() # AbstractScalableBatchSystem functionality def getNodes(self, preemptable=False, timeout=None): nodes = dict() for node in self.nodesToWorker: if node.preemptable == preemptable: worker = self.nodesToWorker[node] nodes[node.privateIP] = NodeInfo( coresTotal=0, coresUsed=0, requestedCores=1, memoryTotal=0, memoryUsed=0, requestedMemory=1, workers=1 if worker.busyEvent.is_set() else 0) return nodes # AbstractProvisioner functionality def addNodes(self, nodeType, numNodes, preemptable): self._addNodes(numNodes=numNodes, nodeType=nodeType, preemptable=preemptable) return self.getNumberOfNodes(nodeType=nodeType, preemptable=preemptable) def getNodeShape(self, nodeType, preemptable=False): # Assume node shapes and node types are the same thing for testing return nodeType def getWorkersInCluster(self, nodeShape): return self.workers[nodeShape] def launchCluster(self, leaderNodeType, keyName, userTags=None, vpcSubnet=None, leaderStorage=50, nodeStorage=50, botoPath=None, **kwargs): pass def destroyCluster(self): pass def getLeader(self): pass def _leaderFn(self): while self.running: updatedJobID = None try: updatedJobID = self.updatedJobsQueue.get(timeout=1.0) except Empty: continue if updatedJobID: del self.jobBatchSystemIDToIssuedJob[updatedJobID] time.sleep(0.1) def _addNodes(self, numNodes, nodeType, preemptable=False): nodeShape = self.getNodeShape(nodeType=nodeType, preemptable=preemptable) class Worker(object): def __init__(self, jobQueue, updatedJobsQueue, secondsPerJob): self.busyEvent = Event() self.stopEvent = Event() def workerFn(): while True: if self.stopEvent.is_set(): return try: jobID = jobQueue.get(timeout=1.0) except Empty: continue updatedJobsQueue.put(jobID) self.busyEvent.set() time.sleep(secondsPerJob) self.busyEvent.clear() self.startTime = time.time() self.worker = Thread(target=workerFn) self.worker.start() def stop(self): self.stopEvent.set() self.worker.join() return time.time() - self.startTime for _ in range(numNodes): node = Node('127.0.0.1', uuid.uuid4(), 'testNode', datetime.datetime.now().isoformat() + 'Z', nodeType=nodeType, preemptable=preemptable) self.nodesToWorker[node] = Worker(self.jobQueue, self.updatedJobsQueue, self.secondsPerJob) self.workers[nodeShape].append(self.nodesToWorker[node]) self.maxWorkers[nodeShape] = max(self.maxWorkers[nodeShape], len(self.workers[nodeShape])) def _removeNodes(self, nodes): logger.debug("Removing nodes. %s workers and %s to terminate.", len(self.nodesToWorker), len(nodes)) for node in nodes: logger.debug("removed node") try: nodeShape = self.getNodeShape(node.nodeType, node.preemptable) worker = self.nodesToWorker.pop(node) self.workers[nodeShape].pop() self.totalWorkerTime += worker.stop() except KeyError: # Node isn't our responsibility pass def getNumberOfNodes(self, nodeType=None, preemptable=None): if nodeType: nodeShape = self.getNodeShape(nodeType=nodeType, preemptable=preemptable) return len(self.workers[nodeShape]) else: return len(self.nodesToWorker)
class ThreadedTaskDispatcher(object): """A Task Dispatcher that creates a thread for each task.""" stop_count = 0 # Number of threads that will stop soon. def __init__(self): self.threads = {} # { thread number -> 1 } self.queue = Queue() self.thread_mgmt_lock = threading.Lock() def handlerThread(self, thread_no): threads = self.threads try: while threads.get(thread_no): task = self.queue.get() if task is None: # Special value: kill this thread. break try: task.service() except: log.exception('Exception during task') except: log.exception('Exception in thread main loop') finally: mlock = self.thread_mgmt_lock with mlock: self.stop_count -= 1 try: del threads[thread_no] except KeyError: pass def setThreadCount(self, count): """See zope.server.interfaces.ITaskDispatcher""" mlock = self.thread_mgmt_lock with mlock: threads = self.threads thread_no = 0 running = len(threads) - self.stop_count while running < count: # Start threads. while thread_no in threads: thread_no = thread_no + 1 threads[thread_no] = 1 running += 1 t = threading.Thread(target=self.handlerThread, args=(thread_no, ), name='zope.server-%d' % thread_no) t.setDaemon(True) t.start() thread_no = thread_no + 1 if running > count: # Stop threads. to_stop = running - count self.stop_count += to_stop for _n in range(to_stop): self.queue.put(None) running -= 1 def addTask(self, task): """See zope.server.interfaces.ITaskDispatcher""" if task is None: raise ValueError("No task passed to addTask().") # assert ITask.providedBy(task) try: task.defer() self.queue.put(task) except: task.cancel() raise def shutdown(self, cancel_pending=True, timeout=5): """See zope.server.interfaces.ITaskDispatcher""" self.setThreadCount(0) # Ensure the threads shut down. threads = self.threads expiration = time() + timeout while threads: if time() >= expiration: log.error("%d thread(s) still running", len(threads)) break sleep(0.1) if cancel_pending: # Cancel remaining tasks. try: queue = self.queue while not queue.empty(): task = queue.get() if task is not None: task.cancel() except Empty: pass def getPendingTasksEstimate(self): """See zope.server.interfaces.ITaskDispatcher""" return self.queue.qsize()
class HyperDash: """HyperDash monitors a job and manages capturing IO / server comms. This class is designed to be run in its own thread and contains an instance of runner (which is running the job) and server_manager (for talking to the server.) """ def __init__( self, job_name, current_sdk_run_uuid, server_manager, io_bufs, std_streams, parent_logger, runner, ): """Initialize the HyperDash class. args: 1) job_name: Name of the current running job 2) current_sdk_run_uuid: UUID of current run 3) runner: Instance of CodeRunner or ExperimentRunner 4) server_manager: ServerManager instance 5) io_bufs: Tuple in the form of (StringIO(), StringIO(),) 6) std_streams: Tuple in the form of (StdOut, StdErr) """ self.job_name = job_name self.current_sdk_run_uuid = current_sdk_run_uuid self.runner = runner self.server_manager = server_manager self.out_buf, self.err_buf = io_bufs self.std_out, self.std_err = std_streams self.programmatic_exit = False self.shutdown_network_channel = Queue() self.shutdown_main_channel = Queue() # Used to keep track of the current position in the IO buffers for data # that has been sent to STDOUT/STDERR and the logfile self.out_buf_offset = 0 self.err_buf_offset = 0 # Used to keep track of the current position in the IO buffers for data # that has been sent to the ServerManager. We separate the local/server # offsets because in the case of the user's code frequently flushing, we # want terminal/logs to update extremely quickly, but a small delay in # sending data to the server is acceptable so that more data can be batched # together. I.E if the user's code flushes 1000 times per second, we want # to capture that in realtime locally, but only want to send one message to # the server with the cumulative output of those 1000 flushes for the one # second period. self.server_out_buf_offset = 0 self.server_err_buf_offset = 0 self.time_since_last_server_capture = time.time() # Create run_start message before doing any other setup work to make sure that the # run_started message always precedes any other messages self.server_manager.put_buf( create_run_started_message(self.current_sdk_run_uuid, self.job_name), ) def on_stdout_flush(): self.capture_io() self.std_out.flush() self.flush_log_file() def on_stderr_flush(): self.capture_io() self.std_err.flush() self.flush_log_file() self.out_buf.set_on_flush(on_stdout_flush) self.err_buf.set_on_flush(on_stderr_flush) self.logger = parent_logger.getChild(__name__) self.log_file, self.log_file_path = self.open_log_file() if not self.log_file: self.logger.error( "Could not create logs file. Logs will not be stored locally.") def open_log_file(self): log_folder = get_hyperdash_logs_home_path() # Make sure logs directory exists (/logs) if not os.path.exists(log_folder): try: os.makedirs(log_folder) except OSError as exc: if exc.errno != errno.EEXIST: pass return None, None job_log_folder = get_hyperdash_logs_home_path_for_job(self.job_name) # Make sure directory for job exists in log directory (/logs/<JOB_NAME>) if not os.path.exists(job_log_folder): try: os.makedirs(job_log_folder) except OSError as exc: if exc.errno != errno.EEXIST: pass return None, None # Create new log file for run try: iso = slugify(datetime.datetime.now().isoformat()) logfile_name = "{}_{}.log".format(slugify(self.job_name), iso) logfile_path = os.path.join(job_log_folder, logfile_name) return open(logfile_path, "a"), logfile_path except IOError: return None, None # Capture all IO for terminal/log file since we last checked def capture_io(self, force_server_capture=False): current_time = time.time() should_send_to_server_manager = (( (current_time - self.time_since_last_server_capture) > 1) or force_server_capture) if should_send_to_server_manager: self.time_since_last_server_capture = current_time self.out_buf.acquire() out = self.out_buf.getvalue() # Local len_out = len(out) - self.out_buf_offset if len_out != 0: self.print_out(out[self.out_buf_offset:]) self.out_buf_offset += len_out # Server len_out_server = len(out) - self.server_out_buf_offset if len_out_server != 0 and should_send_to_server_manager: self.send_print_to_server_manager(out[self.server_out_buf_offset:], INFO_LEVEL) self.server_out_buf_offset += len_out_server self.out_buf.release() self.err_buf.acquire() err = self.err_buf.getvalue() # Local len_err = len(err) - self.err_buf_offset if len_err != 0: self.print_err(err[self.err_buf_offset:]) self.err_buf_offset += len_err # Server len_err_server = len(err) - self.server_err_buf_offset if len_err_server != 0 and should_send_to_server_manager: self.send_print_to_server_manager(err[self.server_err_buf_offset:], ERROR_LEVEL) self.server_err_buf_offset += len_err_server self.err_buf.release() def print_out(self, s): self.std_out.write(s) self.write_to_log_file(s) def print_err(self, s): self.std_err.write(s) self.write_to_log_file(s) # In the case that the amount of data is large, we chunk it into # several smaller messages def send_print_to_server_manager(self, s, level): offset = 0 while offset < len(s): chunk_size = min(len(s) - offset, MAX_LOG_SIZE_BYTES) message = create_log_message(self.current_sdk_run_uuid, level, s[offset:offset + chunk_size]) self.server_manager.put_buf(message) offset += chunk_size def write_to_log_file(self, s): if self.log_file: if PY2: self.log_file.write(s.encode("utf-8")) else: self.log_file.write(s) def flush_log_file(self): if self.log_file: self.log_file.flush() def cleanup(self, exit_status): self.print_completion_message() self.capture_io(force_server_capture=True) self.server_manager.put_buf( create_run_ended_message(self.current_sdk_run_uuid, exit_status), ) self.flush_log_file() self.shutdown_network_channel.put(True) def sudden_cleanup(self): self.print_completion_message() # Send what we can to local log self.capture_io() self.flush_log_file() # Make a best-effort attempt to notify server that the run was # canceled by the user, but don't wait for all messages to # be flushed to server so we don't hang the user's terminal. self.server_manager.send_message( create_run_ended_message(self.current_sdk_run_uuid, "user_canceled"), raise_exceptions=False, timeout_seconds=1, ) # Prevent the network thread from continuing to run in the background # even if SystemExit is caught self.shutdown_network_channel.put(True) def print_completion_message(self): start_time, end_time = self.runner.get_start_and_end_time() # end_time is not set in the case that the user canceled the run if not end_time: end_time = datetime.datetime.now() if self.log_file_path: self.logger.info( "This run of {} ran for {} and logs are available locally at: {}" .format( self.job_name, human_readable_duration(start_time, end_time), self.log_file_path, )) def run(self): """ run_http works using three separate threads: 1) runner thread which runs the user's code (if using CodeRunner) 2) network_thread which does blocking I/O with the server 3) event_loop thread which runs the SDK's main event loop (this is just the main thread) We require the event loop and network loop to be in separate threads because otherwise slow responses from the server could inhibit the SDK's event loop causing weird behavior like delayed logs in the user's terminal. Once all threads are running, the event_loop thread will periodically check the I/O buffers to see if any new logs have appeared, and if so, it will send them to the server manager's outgoing buffer. The network_loop thread will periodically check its outgoing buffer, and if it finds any messages in there, it will send them all to the server. Cleanup is the responsibility of the event_loop. With every tick of the event_loop, we check to see if the user's code has completed running. If it has, the event_loop will capture any remaining I/O and store that in the ServerManager's outgoing buf, as well as store a message indicating that the run is complete and its final exit status. Finally, the event_loop thread will push a message into the shutdown_network_channel which will indicate to the network_loop that it should finish sending any pending messages and then exit. The event_loop thread will then block until it receives a message on the shutdown_main_channel. At the next tick of the network_loop, the shutdown_network_channel will no longer be empty, and the network loop will try and fire off any remaining messages in the ServerManager's buffer to the server, and then put a message in the shutdown_main_channel. The main event_loop which has been blocked until now on the shutdown_main_channel will now return, and the program will exit cleanly. """ def network_loop(): while True: if self.shutdown_network_channel.qsize() != 0: self.server_manager.cleanup(self.current_sdk_run_uuid) self.shutdown_main_channel.put(True) return else: self.server_manager.tick(self.current_sdk_run_uuid) time.sleep(1) network_thread = Thread(target=network_loop) # Daemonize so they don't impede shutdown if the user # keyboard interrupts network_thread.daemon = True network_thread.start() # Create thread for running code if using CLI or decorator if self.runner.should_run_as_thread(): code_thread = Thread(target=self.runner.run) code_thread.daemon = True code_thread.start() # Event loop while True: try: self.capture_io() exited_cleanly, is_done = self.runner.is_done() if is_done: self.programmatic_exit = True if exited_cleanly: self.cleanup("success") # Block until network loop says its done self.shutdown_main_channel.get(block=True, timeout=None) return self.runner.get_return_val() else: self.cleanup("failure") # Block until network loop says its done self.shutdown_main_channel.get(block=True, timeout=None) raise self.runner.get_exception() time.sleep(1) # Handle Ctrl+C except (KeyboardInterrupt, SystemExit): self.sudden_cleanup() # code_thread and network_thread are daemons so they won't impede this sys.exit(130)
class TilestorWriter: __instance = None def __init__(self): if TilestorWriter.__instance is None: self.queue = Queue(maxsize=QUEUE_MAX_SIZE) self.cstart = None self._worker = Thread(target=self._job) self._worker.daemon = True self._worker.start() @classmethod def getInstance(cls): if cls.__instance is None: cls.__instance = TilestorWriter() return cls.__instance def put(self, payload, timeout): try: if timeout is None: self.queue.put_nowait(payload) else: self.queue.put(payload, timeout=timeout) except Full: raise TileWriterQueueFullException( "Tile writer queue is full at maxsize {}.".format( self.queue.maxsize)) def _job(self): self._shutdown = False atexit.register(self.wait_for_shutdown) data = None while True: self.cstart = None if data is None: try: # When the thread is shutting down use minimal timeout # otherwise use a big timeout not to block shutdown # that may happen. get_timeout = 0.001 if self._shutdown else min( SHUTDOWN_TIMEOUT / 5, 2) data = self.queue.get(True, get_timeout) except Empty: if self._shutdown: _logger.debug( "Tile cache writer queue is empty now. Exiting!") break else: continue db_path = data['db_path'] self.cstart = ptime = time() tiles_written = 0 time_taken = 0.0 answers = [] # Tile cache writer may fall sometimes in case of database connecti # problem for example. So we just skip a tile with error and log an # exception. try: with transaction.manager: conn = DBSession.connection() tilestor = get_tile_db(db_path) while data is not None and data['db_path'] == db_path: z, x, y = data['tile'] tstamp = int((datetime.utcnow() - TIMESTAMP_EPOCH).total_seconds()) img = data['img'] if img is not None and img.mode != 'RGBA': img = img.convert('RGBA') colortuple = imgcolor(img) color = pack_color( colortuple) if colortuple is not None else None self._write_tile_meta( conn, data['uuid'], dict(z=z, x=x, y=y, color=color, tstamp=tstamp)) if color is None: buf = BytesIO() img.save(buf, format='PNG', compress_level=3) value = buf.getvalue() self._write_tile_data(tilestor, z, x, y, tstamp, value) if 'answer_queue' in data: answers.append(data['answer_queue']) tiles_written += 1 ctime = time() time_taken += ctime - ptime if tiles_written >= BATCH_MAX_TILES: # Break the batch data = None else: # Try to get next tile for the batch. Or break # the batch if there is no tiles left. if time_taken < BATCH_DEADLINE: try: data = self.queue.get( timeout=(BATCH_DEADLINE - time_taken)) except Empty: data = None else: data = None # Do not account queue block time ptime = time() # Force zope session management to commit changes mark_changed(DBSession()) tilestor.commit() time_taken += time() - ptime _logger.debug( "%d tiles were written in %0.3f seconds (%0.1f per " "second, qsize = %d)", tiles_written, time_taken, tiles_written / time_taken, self.queue.qsize()) # Report about sucess only after transaction commit for a in answers: a.put_nowait(None) except Exception as exc: _logger.exception("Uncaught exception in tile writer: %s", exc.message) data = None self.cstart = None tilestor.rollback() def _write_tile_meta(self, conn, table_uuid, row): result = conn.execute( db.sql.text('SELECT true FROM tile_cache."{}" ' 'WHERE z = :z AND x = :x AND y = :y ' 'LIMIT 1 FOR UPDATE'.format(table_uuid)), **row) if result.returns_rows: conn.execute( db.sql.text('DELETE FROM tile_cache."{0}" ' 'WHERE z = :z AND x = :x AND y = :y ' ''.format(table_uuid)), **row) conn.execute( db.sql.text( 'INSERT INTO tile_cache."{0}" (z, x, y, color, tstamp) ' 'VALUES (:z, :x, :y, :color, :tstamp)'.format(table_uuid)), **row) def _write_tile_data(self, tilestor, z, x, y, tstamp, value): tilestor.execute("DELETE FROM tile WHERE z = ? AND x = ? AND y = ?", (z, x, y)) try: tilestor.execute("INSERT INTO tile VALUES (?, ?, ?, ?, ?)", (z, x, y, tstamp, value)) except sqlite3.IntegrityError: # NOTE: Race condition with other proccess may occurs here. # TODO: ON CONFLICT DO ... in SQLite >= 3.24.0 (python 3) pass def wait_for_shutdown(self, timeout=SHUTDOWN_TIMEOUT): if not self._worker.is_alive(): return True _logger.debug( "Waiting for shutdown of tile cache writer for %d seconds (" + "qsize = %d)...", timeout, self.queue.qsize()) self._shutdown = True self._worker.join(timeout) if self._worker.is_alive(): _logger.warn( "Tile cache writer is still running. It'll be killed!") return False else: _logger.debug("Tile cache writer has successfully shut down.") return True
class LogProcessingWorker(Thread): """""" # ---------------------------------------------------------------------- def __init__(self, *args, **kwargs): self._host = kwargs.pop('host') self._port = kwargs.pop('port') self._transport = kwargs.pop('transport') self._ssl_enable = kwargs.pop('ssl_enable') self._ssl_verify = kwargs.pop('ssl_verify') self._keyfile = kwargs.pop('keyfile') self._certfile = kwargs.pop('certfile') self._ca_certs = kwargs.pop('ca_certs') self._database_path = kwargs.pop('database_path') super(LogProcessingWorker, self).__init__(*args, **kwargs) self.daemon = True self.name = self.__class__.__name__ self._shutdown_event = Event() self._queue = Queue() self._event = None self._database = None self._last_event_flush_date = None self._non_flushed_event_count = None self._logger = None # ---------------------------------------------------------------------- def enqueue_event(self, event): # called from other threads self._queue.put(event) # ---------------------------------------------------------------------- def shutdown(self): # called from other threads self._shutdown_event.set() # ---------------------------------------------------------------------- def run(self): self._reset_flush_counters() self._setup_logger() self._setup_database() try: self._fetch_events() except Exception as e: # we really should not get anything here, and if, the worker thread is dying # too early resulting in undefined application behaviour self._log_general_error(e) # check for empty queue and report if not self._warn_about_non_empty_queue_on_shutdown() # ---------------------------------------------------------------------- def _reset_flush_counters(self): self._last_event_flush_date = datetime.now() self._non_flushed_event_count = 0 # ---------------------------------------------------------------------- def _setup_logger(self): self._logger = get_logger(self.name) # ---------------------------------------------------------------------- def _setup_database(self): self._database = DatabaseCache(self._database_path) # ---------------------------------------------------------------------- def _fetch_events(self): while True: try: self._fetch_event() self._process_event() except Empty: # Flush queued (in database) events after internally queued events has been # processed, i.e. the queue is empty. if self._shutdown_requested(): self._flush_queued_events(force=True) return self._flush_queued_events() self._delay_processing() except (DatabaseLockedError, ProcessingError): if self._shutdown_requested(): return else: self._requeue_event() self._delay_processing() # ---------------------------------------------------------------------- def _fetch_event(self): self._event = self._queue.get(block=False) # ---------------------------------------------------------------------- def _process_event(self): try: self._write_event_to_database() except DatabaseLockedError: self._safe_log( u'debug', u'Database is locked, will try again later (queue length %d)', self._queue.qsize()) raise except Exception as e: self._log_processing_error(e) raise ProcessingError() else: self._event = None # ---------------------------------------------------------------------- def _log_processing_error(self, exception): self._safe_log(u'exception', u'Log processing error (queue size: %3s): %s', self._queue.qsize(), exception) # ---------------------------------------------------------------------- def _delay_processing(self): self._shutdown_event.wait(QUEUE_CHECK_INTERVAL) # ---------------------------------------------------------------------- def _shutdown_requested(self): return self._shutdown_event.is_set() # ---------------------------------------------------------------------- def _requeue_event(self): self._queue.put(self._event) # ---------------------------------------------------------------------- def _write_event_to_database(self): self._database.add_event(self._event) self._non_flushed_event_count += 1 # ---------------------------------------------------------------------- def _flush_queued_events(self, force=False): # check if necessary and abort if not if not force and not self._queued_event_interval_reached( ) and not self._queued_event_count_reached(): return try: queued_events = self._database.get_queued_events() except DatabaseLockedError: self._safe_log( u'debug', u'Database is locked, will try again later (queue length %d)', self._queue.qsize()) return # try again later except Exception as e: # just log the exception and hope we can recover from the error self._safe_log(u'exception', u'Error retrieving queued events: %s', e) return if queued_events: try: events = [event['event_text'] for event in queued_events] self._send_events(events) except Exception as e: self._safe_log(u'exception', u'An error occurred while sending events: %s', e) self._database.requeue_queued_events(queued_events) else: self._delete_queued_events_from_database() self._reset_flush_counters() # ---------------------------------------------------------------------- def _delete_queued_events_from_database(self): try: self._database.delete_queued_events() except DatabaseLockedError: pass # nothing to handle, if it fails, we delete those events in a later run # ---------------------------------------------------------------------- def _queued_event_interval_reached(self): delta = datetime.now() - self._last_event_flush_date return delta.total_seconds() > QUEUED_EVENTS_FLUSH_INTERVAL # ---------------------------------------------------------------------- def _queued_event_count_reached(self): return self._non_flushed_event_count > QUEUED_EVENTS_FLUSH_COUNT # ---------------------------------------------------------------------- def _send_events(self, events): self._transport.send(events) # ---------------------------------------------------------------------- def _log_general_error(self, exc): self._safe_log(u'exception', u'An unexpected error occurred: %s', exc) # ---------------------------------------------------------------------- def _safe_log(self, log_level, message, *args, **kwargs): # we cannot log via the logging subsystem any longer once it has been set to shutdown if self._shutdown_requested(): safe_log_via_print(log_level, message, *args, **kwargs) else: log_func = getattr(self._logger, log_level) return log_func(message, *args, **kwargs) # ---------------------------------------------------------------------- def _warn_about_non_empty_queue_on_shutdown(self): queue_size = self._queue.qsize() if queue_size: self._safe_log( 'warn', u'Non-empty queue while shutting down ({} events pending). ' u'This indicates a previous error.'.format(queue_size), extra=dict(queue_size=queue_size))
class BatchTaskQueue(TaskQueue): """ A class for managing batch async operations. """ def __init__(self, work_func, max_batch_size=DEFAULT_BATCH_SIZE, batch_cushion=DEFAULT_BATCH_CUSHION, interval=DEFAULT_INTERVAL, **kwargs): """ :param work_func: Work function input params of list(item), items are added with add_item :type work_func: function :param max_batch_size: The max number of elements in a batch call :type max_batch_size: int :param batch_cushion: The batch cushion between items uploaded and the specified max :type batch_cushion: int :param interval: The interval between checking and uploading added items :type interval: int """ super(BatchTaskQueue, self).__init__(**kwargs) self._max_batch_size = max_batch_size self._batch_cushion = batch_cushion self._batch_size = self._max_batch_size - self._batch_cushion if self._batch_size <= 0: self._logger.warning("Batch size - batch cushion is less than 1, defaulting to 1.") self._batch_size = MIN_BATCH_SIZE self._items = Queue() self._work_func = work_func self._daemon = Daemon(self._do_work, interval, _parent_logger=self._logger, _ident="{}Daemon".format(self.identity)) self._daemon.start() def add_item(self, item): """ :param func: Function to be executed asynchronously :type func: builtin.function :param task_priority: Priority for the task, higher items have higher priority :type task_priority: int or None """ self._items.put(item) def _handle_batch(self): batch = [] for item in range(self._batch_size): try: item = self._items.get_nowait() batch.append(item) except Empty: break self._logger.debug("Batch size {}.".format(len(batch))) if len(batch) > 0: self.add(self._work_func, batch) def _do_work(self): if not self._items.empty(): queue_size = self._items.qsize() num_batches = 1 + int(queue_size / self._batch_size) with TaskQueue(_ident="BatchTaskQueueAdd_{}_Batches".format(num_batches)) as task_queue: for _ in range(num_batches): task_queue.add(self._handle_batch) def __exit__(self, *args): super(BatchTaskQueue, self).__exit__(*args) self._daemon.stop() def flush(self, *args, **kwargs): self._do_work() super(BatchTaskQueue, self).flush(*args, **kwargs)
class Publisher(PublisherServerBase): def __init__(self, use_nanoconfig_service, publisher_endpoint, nanoconfig_service_endpoint, nanoconfig_update_endpoint, nanoconfig_profile, metrics_store, max_queue_size, max_worker, min_worker=5): """ :param use_nanoconfig_service: Indicates whether or not it should use a nanoconfig service :type use_nanoconfig_service: bool :param publisher_endpoint: Publisher server URI :type publisher_endpoint: str :param nanoconfig_service_endpoint: Nanoconfig service URI :type nanoconfig_service_endpoint: str :param nanoconfig_update_endpoint: Nanoconfig update service URI :type nanoconfig_update_endpoint: str :param nanoconfig_profile: Nanoconfig profile URI :type nanoconfig_profile: str :param max_queue_size: Max size for the message queue :type max_queue_size: int :param max_worker: Max number of worker to be spawned at a given time :type max_worker: int :param min_worker: Min number of worker to be spawned at a given time :type min_worker: int """ super(Publisher, self).__init__(use_nanoconfig_service, publisher_endpoint, nanoconfig_service_endpoint, nanoconfig_update_endpoint, nanoconfig_profile) self.max_queue_size = max_queue_size self.metrics_store = metrics_store self.min_worker = min_worker self.max_worker = max_worker self.msg_queue = Queue(self.max_queue_size) self.workers = [] @property def num_workers(self): return len(self.workers) def on_receive(self, msg): LOG.debug('[Publisher] Queue msg size = %s | workers = %s', self.msg_queue.qsize(), self.num_workers) try: self.check_workers_alive() self.adjust_pool_size() except OSError as exc: LOG.exception(exc) LOG.error("[Publisher] Error upon receiving a message") self.msg_queue.put(msg) def check_workers_alive(self): # Because we can create new workers in this loop, we create a copy # --> We could otherwise loop onto a new workers... worker_items = self.workers[:] for worker_thread in worker_items: if not worker_thread.is_alive(): self.workers.pop(self.workers.index(worker_thread)) self.start_worker() def adjust_pool_size(self): needed_size = self.msg_queue.qsize() + self.min_worker if abs(needed_size - self.num_workers) > self.min_worker * 2: LOG.debug(("[Publisher] Auto adjust pool size needed size is `%s` " "and the current size is `%s`"), needed_size, self.num_workers) while self.num_workers > min(self.min_worker, needed_size): self.stop_worker() # Create enough, but not too many while self.num_workers < min(self.max_worker, needed_size): self.start_worker() def start_worker(self): LOG.debug("[Publisher] starting worker") worker = Worker(self.msg_queue, self.metrics_store) worker.start() self.workers.append(worker) def stop_worker(self): if self.num_workers: LOG.debug("[Publisher] stopping worker") worker = self.workers.pop(-1) # Pops the last worker worker.stop() def stop(self): super(Publisher, self).stop() join_threads = [] for key in self.workers: t = Thread(target=self.workers.get(key).stop) t.start() join_threads.append(t) for join_thread in join_threads: join_thread.join()
class Streamer(threading.Thread): """ This class extends threading.Thread and provides a simple concurrent queue and an async service that pulls data from the queue and sends it to the server. """ def __init__(self, ws_connection, beat_duration, connection): threading.Thread.__init__(self) self.daemon = True self.messages = Queue() self.counter = 0 self.name = "Streamer(%r)" % (ws_connection) self.ws_connection = ws_connection self.closed = False self.beat_duration = beat_duration / 1000. # We get milliseconds self.last_beat = time.time() self.connection = connection def put_messge_in_q(self, message): ''' Puts a message in the queue :param message: Some kind of payload, type agnostic ''' if message is not None: self.messages.put(message) def close(self): ''' Puts a None in the queue which leads to closing it. ''' if self.closed is True: return # Send a message to close self.put_messge_in_q(CloseMessage()) self.closed = True def run(self): ''' Continuously pulls messages from the queue and sends them to the server. ''' self.ws_connection.wait_for_connection() while True: out = self._loop() # Exit the infinite loop if isinstance(out, CloseMessage): break # Close the websocket connection self.ws_connection.close() return def _loop(self): """ A single loop of running """ try: if self.ws_connection is not None and self.ws_connection.is_connected( ): messages = self.getn(1) if messages and isinstance(messages[0], CloseMessage): return messages[0] if messages is not None: self.ws_connection.send(messages) self._check_heartbeat() except Exception as e: LOGGER.error("Streaming error", exc_info=True) def _check_heartbeat(self): """ Check if we should send an heartbeat """ next_beat = self.last_beat + self.beat_duration now = time.time() if next_beat < now: new_beat_duration = self.connection.heartbeat() self.beat_duration = new_beat_duration / 1000. # We get milliseconds self.last_beat = time.time() def getn(self, n): """ Pops n messages from the queue. Args: n: Number of messages to pull from queue Returns: n messages """ try: msg = self.messages.get( timeout=self.beat_duration) # block until at least 1 except Empty: return None if isinstance(msg, CloseMessage): return [msg] self.counter += 1 msg.set_offset(self.counter) result = [msg] try: while len(result) < n: another_msg = self.messages.get( block=False) # dont block if no more messages self.counter += 1 another_msg.set_offset(self.counter) result.append(another_msg) except Exception: pass return result def wait_for_finish(self): """ Blocks the experiment from exiting until all data was sent to server OR 30 seconds passed.""" msg = "Uploading stats to Comet before program termination (may take several seconds)" LOGGER.info(msg) # Wait maximum 2 minutes self._wait_for_empty(30) if not self.messages.empty(): msg = "Still uploading stats to Comet before program termination (may take several seconds)" LOGGER.info(msg) self._wait_for_empty(30) if not self.messages.empty(): self._wait_for_empty(60, verbose=True) if not self.messages.empty(): remaining = self.messages.qsize() LOGGER.error( "Comet failed to send all the data back (%s messages)", remaining) if config.experiment is not None: if config.experiment.alive: LOGGER.info( "Experiment is live on comet.ml: %s", config.experiment._get_experiment_url(), ) else: LOGGER.warning("Failed to log run in comet.ml") else: LOGGER.warning("Failed to log run in comet.ml") def _wait_for_empty(self, timeout, verbose=False): """ Wait up to TIMEOUT seconds for the messages queue to be empty """ end_time = time.time() + timeout while not self.messages.empty() and time.time() < end_time: if verbose is True: LOGGER.info("%d messages remaining to upload", self.messages.qsize()) time.sleep(5) def _report_upload_not_finished(self): """ Called if the queue is not empty during exit """ remaining = self.messages.qsize() if remaining == 0: return err_msg = "Comet failed to send all the data back %d" # Display the message LOGGER.error(err_msg, remaining) self.connection.report(event_name="on_exit_didnt_finish_upload_sdk", err_msg=(err_msg % remaining))
class Delegate(object): """ Implements the methods of the Provisioner and JobBatcher class needed for the ClusterScaler class. """ def __init__(self, preemptable): super(MockBatchSystemAndProvisioner.Delegate, self).__init__() self.jobQueue = Queue() self.totalJobs = 0 # Count of total jobs processed self.totalWorkerTime = 0.0 # Total time spent in worker threads self.nodesToWorker = {} # Map from Node to instances of the Worker class self.maxWorkers = 0 # Maximum number of workers self.preemptable = preemptable def addJob(self): """ Add a job to the job queue """ self.totalJobs += 1 self.jobQueue.put(None) # JobBatcher functionality def getNumberOfJobsIssued(self): return self.jobQueue.qsize() # AbstractScalableBatchSystem functionality def getNodes(self): nodes = dict() for i, worker in enumerate(self.nodesToWorker.values()): nodes[(i, self.preemptable)] = NodeInfo(coresTotal=0, coresUsed=0, requestedCores=1, memoryTotal=0, memoryUsed=0, requestedMemory=1, workers=1 if worker.busyEvent.is_set() else 0) return nodes def _addNodes(self, numNodes): class Worker(object): def __init__(self, jobQueue, secondsPerJob): self.busyEvent = Event() self.stopEvent = Event() def workerFn(): while True: if self.stopEvent.is_set(): return try: jobQueue.get(timeout=1.0) except Empty: continue self.busyEvent.set() time.sleep(secondsPerJob) self.busyEvent.clear() self.startTime = time.time() self.worker = Thread(target=workerFn) self.worker.start() def stop(self): self.stopEvent.set() self.worker.join() return time.time() - self.startTime for i in range(numNodes): node = Node('127.0.0.1', '127.0.0.1', 'testNode', time.time()) self.nodesToWorker[node] = Worker(self.jobQueue, self.outer.secondsPerJob) self.maxWorkers = max(self.maxWorkers, len(self.nodesToWorker)) def _removeNodes(self, nodes): logger.info("removing nodes. %s workers and %s to terminate", len(self.nodesToWorker), len(nodes)) for node in nodes: logger.info("removed node") try: worker = self.nodesToWorker.pop(node) self.totalWorkerTime += worker.stop() except KeyError: # Node isn't our responsibility pass def getNumberOfNodes(self): return len(self.nodesToWorker)
class TaskQueue(ChainedIdentity): """ A class for managing async tasks. """ def __init__(self, worker_pool=None, error_handler=None, flush_timeout_seconds=None, **kwargs): """ :param worker_pool: Thread pool for executing tasks :type worker_pool: concurrent.futures.ThreadPoolExecutor :param error_handler: Extension point for processing error queue items :type error_handler: function(error, logging.Logger) :param timeout_seconds: Task flush timeout in seconds :type timeout_seconds: timeout in seconds """ super(TaskQueue, self).__init__(**kwargs) self._tasks = Queue() self._results = [] # For right now, don't need queue for errors, but it's # probable that we'll want the error handler looping on queue thread self._errors = [] self._err_handler = error_handler self._worker_pool = worker_pool if worker_pool is not None else WorkerPool(_parent_logger=self._logger) self._task_number = 0 self._flush_timeout_seconds = DEFAULT_FLUSH_TIMEOUT_SECONDS if flush_timeout_seconds: self._flush_timeout_seconds = flush_timeout_seconds self._logger.debug('Overriding default timeout to {}'.format(flush_timeout_seconds)) def __enter__(self): self._logger.debug("[Start]") return self def __exit__(self, *args): self._logger.debug("[Stop] - waiting default timeout") self.flush(self.identity) # TODO: Adding functions with this method needs to be more configurable def add(self, func, *args, **kwargs): """ :param func: Function to be executed asynchronously :type func: builtin.function """ future = self.create_future(func, *args, **kwargs) ident = "{}_{}".format(self._tasks.qsize(), func.__name__) task = AsyncTask(future, _ident=ident, _parent_logger=self._logger) self.add_task(task) return task def add_task(self, async_task): """ :param async_task: asynchronous task to be added to the queue and possibly processed :type async_task: azureml._async.AsyncTask """ '''Blocking, no timeout add task to queue''' if not isinstance(async_task, AsyncTask): raise ValueError("Can only add AsyncTask, got {0}".format(type(async_task))) self._logger.debug("Adding task {0} to queue of approximate size: {1}".format(async_task.ident, self._tasks.qsize())) self._tasks.put(async_task) def create_future(self, func, *args, **kwargs): return self._worker_pool.submit(func, *args, **kwargs) def flush(self, source, timeout_seconds=None): with self._log_context("WaitFlushSource:{}".format(source)) as log_context: if timeout_seconds is None: log_context.debug("Overriding default flush timeout from None to {}". format(self._flush_timeout_seconds)) timeout_seconds = self._flush_timeout_seconds else: log_context.debug("flush timeout {} is different from task queue timeout {}, using flush timeout". format(timeout_seconds, self._flush_timeout_seconds)) start_time = time.time() # Take tasks off of the queue tasks_to_wait = [] while True: try: tasks_to_wait.append(self._tasks.get_nowait()) except Empty: break message = "" timeout_time = start_time + timeout_seconds log_context.debug("Waiting {} seconds on tasks: {}.".format(timeout_seconds, tasks_to_wait)) not_done = True while not_done and time.time() <= timeout_time: completed_tasks = [task for task in tasks_to_wait if task.done()] tasks_to_wait = [task for task in tasks_to_wait if not task.done()] not_done = len(tasks_to_wait) != 0 self._results.extend((task.wait(awaiter_name=self.identity) for task in completed_tasks)) if not_done: for task in tasks_to_wait: message += "Waiting on task: {}.\n".format(task.ident) message += "{} tasks left. Current duration of flush {} seconds.\n".format( len(tasks_to_wait), time.time() - start_time) time.sleep(.25) self._logger.debug(message) # Reach this case on timeout if not_done: azureml_error = AzureMLError.create( FlushTaskTimeout, timeout_seconds=timeout_seconds ) raise AzureMLException._with_error(azureml_error) @property def results(self): for result in self._results: yield result def errors(self): for error in self._errors: yield error
class SelectiveRepeater(object): """ Selective repeater for SBP file I/O requests Fields ---------- _pending_map : list(PendingRequest) List (used as a map) of PendingRequest objects, used to track outstanding requests. _request_pool : Queue Queue of available requests (recorded by index number) _seqmap : dict(int,int) Dictionary mapping SBP request sequence IDs to their corresponding request index. _batch_msgs : list Collector for a batch of messages to be sent in one buffer via the link _last_check_time : Time The last time we checked if any packets had expired _expire_map : dict(Time, dict(PendingRequest, PendingRequest)) Dictionary which records the future time at which a request will expire. _msg_type : int The message type we're currently sending _link : Handler The link over which we're sending data _callback_thread : int ID of the thread that we expect callbacks from _link_thread : int ID of the thread that handles link writes """ def __init__(self, link, msg_type, cb=None): """ Args --- link : Handler Link over which messages will be sent. msg_type : The type of message being sent cb : Invoked when SBP message with type `msg_type` is received """ self._link = link self._msg_type = msg_type self._callback = cb self._seqmap = {} self._batch_msgs = [] self._last_check_time = Time.now() self._expire_map = defaultdict(dict) self._init_fileio_config(SBP_FILEIO_WINDOW_SIZE, SBP_FILEIO_BATCH_SIZE, PROGRESS_CB_REDUCTION_FACTOR) self._callback_thread = None self._link_thread = None self._total_sends = 1.0 self._total_retries = 0 self._config_retry_time = None def _init_fileio_config(self, window_size, batch_size, progress_cb_reduction_factor): self._pending_map = [PendingRequest(X) for X in range(window_size)] self._request_pool = Queue(window_size) for pending_req in self._pending_map: self._request_pool.put(pending_req) self._batch_size = batch_size self._progress_cb_reduction_factor = progress_cb_reduction_factor def __enter__(self): self._link.add_callback(self._request_cb, self._msg_type) self._link.add_callback(self._config_cb, SBP_MSG_FILEIO_CONFIG_RESP) return self def __exit__(self, type, value, traceback): self._link.remove_callback(self._request_cb, self._msg_type) self._link.remove_callback(self._config_cb, SBP_MSG_FILEIO_CONFIG_RESP) def _verify_cb_thread(self): """ Verify that only one thread is consuming requests. """ if self._callback_thread is None: self._callback_thread = threading.currentThread().ident assert self._callback_thread == threading.currentThread().ident def _verify_link_thread(self): """ Verify that only one thread is producing requests. """ if self._link_thread is None: self._link_thread = threading.currentThread().ident assert self._link_thread == threading.currentThread().ident def _return_pending_req(self, pending_req): """ Return a pending request to the write pool and clean any entries in the expiration map. """ self._verify_cb_thread() pending_req.completed = True try: msg = pending_req.message except AttributeError: # Got a completion for something that was never requested return self._try_remove_keys(self._seqmap, msg.sequence) if self._try_remove_keys(self._expire_map[pending_req.time_expire], pending_req): # Only put the request back if it was successfully removed self._request_pool.put(pending_req) def _record_pending_req(self, msg, time_now, expiration_time): """ Acquire a pending request object and record it's future expiration time in a map. """ self._verify_link_thread() # Queue.get will block if no requests are available pending_req = self._request_pool.get(True) assert self._pending_map[pending_req.index].index == pending_req.index self._seqmap[msg.sequence] = pending_req.index self._pending_map[pending_req.index].track(msg, time_now, expiration_time) self._expire_map[expiration_time][pending_req] = pending_req def _config_cb(self, msg, **metadata): self._config_msg = msg self._init_fileio_config(msg.window_size, msg.batch_size, PROGRESS_CB_REDUCTION_FACTOR * 2) def _request_cb(self, msg, **metadata): """ Process request completions. """ index = self._seqmap.get(msg.sequence) if index is None: return pending_req = self._pending_map[index] if self._callback: self._callback(pending_req.message, msg) self._return_pending_req(pending_req) def _has_pending(self): return self._request_pool.qsize() != len(self._pending_map) def _retry_send(self, check_time, pending_req, delete_keys): """ Retry a request by updating it's expire time on the object itself and in the expiration map. """ self._total_retries += 1 self._total_sends += 1 timeout_delta = Time(SBP_FILEIO_TIMEOUT) send_time = Time.now() new_expire = send_time + timeout_delta pending_req.record_retry(send_time, new_expire) self._expire_map[new_expire][pending_req] = pending_req self._link(pending_req.message) delete_keys.append(pending_req) def _try_remove_keys(self, d, *keys): success = True for key in keys: try: del d[key] except KeyError: success = False return success def _check_pending(self): """ Scans from the last check time to the current time looking for requests that are due to expire and retries them if necessary. """ time_now = Time.now() timeout_delta = Time(SBP_FILEIO_TIMEOUT) for check_time in Time.iter_since(self._last_check_time, time_now): pending_reqs = self._expire_map[check_time] retried_writes = [] for pending_req in pending_reqs.keys(): time_expire = pending_req.time + timeout_delta if time_now >= time_expire: if pending_req.tries >= MAXIMUM_RETRIES: raise Exception('Timed out') # If the completion map becomes inconsistent (because # things are completing at the same time they're # being re-tried) then the `completed` field should # prevent us from re-sending a write in this case. if not pending_req.completed: self._retry_send(check_time, pending_req, retried_writes) # Pending writes can be marked completed while this function # is running, so a key error means is was marked completed # after we sent a retry (therefore _try_remove_keys ignores # key errors). self._try_remove_keys(self._expire_map[check_time], *retried_writes) self._last_check_time = time_now def _window_available(self, batch_size): return self._request_pool.qsize() >= batch_size def _ensure_config_req_sent(self): if self._config_retry_time is not None: return now = Time.now() self._config_retry_time = now + Time(0, CONFIG_REQ_RETRY_MS) self._config_timeout = now + Time(0, CONFIG_REQ_TIMEOUT_MS) self._config_seq = random.randint(0, 0xffffffff) self._config_msg = None self._link(MsgFileioConfigReq(sequence=self._config_seq)) def _config_received(self): self._ensure_config_req_sent() if self._config_msg is not None: return True now = Time.now() if now >= self._config_retry_time: self._link(MsgFileioConfigReq(sequence=self._config_seq)) self._config_retry_time = now + Time(0, CONFIG_REQ_RETRY_MS) if now >= self._config_timeout: self._config_msg = MsgFileioConfigResp(sequence=0, window_size=100, batch_size=1, fileio_version=0) return self._config_msg is not None def _wait_config_received(self): while not self._config_received(): time.sleep(WAIT_SLEEP_S) def _wait_window_available(self, batch_size): self._wait_config_received() while not self._window_available(batch_size): self._check_pending() if not self._window_available(batch_size): time.sleep(WAIT_SLEEP_S) @property def total_retries(self): return self._total_retries @property def total_sends(self): return self._total_sends @property def progress_cb_reduction_factor(self): return self._progress_cb_reduction_factor def send(self, msg, batch_size=None): if batch_size is not None: self._send(msg, batch_size) else: self._send(msg, self._batch_size) def _send(self, msg, batch_size): """ Sends data via the current link, potentially batching it together. Parameters ---------- msg : MsgFileioReadReq, MsgFileioReadDirReq, MsgFileioWriteReq, MsgFileioRemove The message to be sent via the current link batch_size : int The number of message to batch together before actually sending """ if msg is not None: self._batch_msgs.append(msg) if len(self._batch_msgs) >= batch_size: self._wait_window_available(batch_size) time_now = Time.now() expiration_time = time_now + Time(SBP_FILEIO_TIMEOUT) for msg in self._batch_msgs: self._record_pending_req(msg, time_now, expiration_time) self._link(*self._batch_msgs) self._total_sends += len(self._batch_msgs) del self._batch_msgs[:] def flush(self): """ Flush any pending requests (batched or otherwise) and wait for all pending requests to complete. """ self.send(None, batch_size=0) while self._has_pending(): self._check_pending() time.sleep(WAIT_SLEEP_S)
class Delegate(object): """ Implements the methods of the Provisioner and JobBatcher class needed for the ClusterScaler class. """ def __init__(self, preemptable): super(MockBatchSystemAndProvisioner.Delegate, self).__init__() self.jobQueue = Queue() self.totalJobs = 0 # Count of total jobs processed self.totalWorkerTime = 0.0 # Total time spent in worker threads self.nodesToWorker = { } # Map from Node to instances of the Worker class self.maxWorkers = 0 # Maximum number of workers self.preemptable = preemptable def addJob(self): """ Add a job to the job queue """ self.totalJobs += 1 self.jobQueue.put(None) # JobBatcher functionality def getNumberOfJobsIssued(self): return self.jobQueue.qsize() # AbstractScalableBatchSystem functionality def getNodes(self): nodes = dict() for i, worker in enumerate(self.nodesToWorker.values()): nodes[(i, self.preemptable)] = NodeInfo( coresTotal=0, coresUsed=0, requestedCores=1, memoryTotal=0, memoryUsed=0, requestedMemory=1, workers=1 if worker.busyEvent.is_set() else 0) return nodes def _addNodes(self, numNodes): class Worker(object): def __init__(self, jobQueue, secondsPerJob): self.busyEvent = Event() self.stopEvent = Event() def workerFn(): while True: if self.stopEvent.is_set(): return try: jobQueue.get(timeout=1.0) except Empty: continue self.busyEvent.set() time.sleep(secondsPerJob) self.busyEvent.clear() self.startTime = time.time() self.worker = Thread(target=workerFn) self.worker.start() def stop(self): self.stopEvent.set() self.worker.join() return time.time() - self.startTime for i in xrange(numNodes): node = Node('127.0.0.1', '127.0.0.1', 'testNode', time.time()) self.nodesToWorker[node] = Worker(self.jobQueue, self.outer.secondsPerJob) self.maxWorkers = max(self.maxWorkers, len(self.nodesToWorker)) def _removeNodes(self, nodes): logger.info("removing nodes. %s workers and %s to terminate", len(self.nodesToWorker), len(nodes)) for node in nodes: logger.info("removed node") try: worker = self.nodesToWorker.pop(node) self.totalWorkerTime += worker.stop() except KeyError: # Node isn't our responsibility pass def getNumberOfNodes(self): return len(self.nodesToWorker)
class GraphiteReporter(threading.Thread): """A graphite reporter thread.""" def __init__(self, host, port, maxQueueSize=10000): """Connect to a Graphite server on host:port.""" threading.Thread.__init__(self) self.host, self.port = host, port self.sock = None self.queue = Queue() self.maxQueueSize = maxQueueSize self.daemon = True def run(self): """Run the thread.""" while True: try: try: name, value, valueType, stamp = self.queue.get() except TypeError: break self.log(name, value, valueType, stamp) finally: self.queue.task_done() def connect(self): """Connects to the Graphite server if not already connected.""" if self.sock is not None: return backoff = 0.01 while True: try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.settimeout(5) sock.connect((self.host, self.port)) self.sock = sock return except socket.error: time.sleep(random.uniform(0, 2.0*backoff)) backoff = min(backoff*2.0, 5.0) def disconnect(self): """Disconnect from the Graphite server if connected.""" if self.sock is not None: try: self.sock.close() except socket.error: pass finally: self.sock = None def _sendMsg(self, msg): """Send a line to graphite. Retry with exponential backoff.""" if not self.sock: self.connect() if not isinstance(msg, binary_type): msg = msg.encode("UTF-8") backoff = 0.001 while True: try: self.sock.sendall(msg) break except socket.error: log.warning('Graphite connection error', exc_info = True) self.disconnect() time.sleep(random.uniform(0, 2.0*backoff)) backoff = min(backoff*2.0, 5.0) self.connect() def _sanitizeName(self, name): """Sanitize a metric name.""" return name.replace(' ', '-') def log(self, name, value, valueType=None, stamp=None): """Log a named numeric value. The value type may be 'value', 'count', or None.""" if type(value) == float: form = "%s%s %2.2f %d\n" else: form = "%s%s %s %d\n" if valueType is not None and len(valueType) > 0 and valueType[0] != '.': valueType = '.' + valueType if not stamp: stamp = time.time() self._sendMsg(form % (self._sanitizeName(name), valueType or '', value, stamp)) def enqueue(self, name, value, valueType=None, stamp=None): """Enqueue a call to log.""" # If queue is too large, refuse to log. if self.maxQueueSize and self.queue.qsize() > self.maxQueueSize: return # Stick arguments into the queue self.queue.put((name, value, valueType, stamp)) def flush(self): """Block until all stats have been sent to Graphite.""" self.queue.join() def shutdown(self): """Shut down the background thread.""" self.queue.put(None) self.flush()
class Delegate(object): """ Implements the methods of the Provisioner and JobBatcher class needed for the ClusterScaler class, but omits the preemptable argument. """ def __init__(self): super(MockBatchSystemAndProvisioner.Delegate, self).__init__() self.jobQueue = Queue() self.totalJobs = 0 # Count of total jobs processed self.totalWorkerTime = 0.0 # Total time spent in worker threads self.workers = [] # Instances of the Worker class self.maxWorkers = 0 # Maximum number of workers def addJob(self): """ Add a job to the job queue """ self.totalJobs += 1 self.jobQueue.put(None) # JobBatcher functionality def getNumberOfJobsIssued(self): return self.jobQueue.qsize() # AbstractScalableBatchSystem functionality def getNodes(self): return { address: NodeInfo(coresTotal=0, coresUsed=0, memoryTotal=0, memoryUsed=0, workers=1 if w.busyEvent.is_set() else 0) for address, w in enumerate(self.workers) } # AbstractProvisioner functionality def setNodeCount(self, numNodes): delta = numNodes - len(self.workers) if delta > 0: self._addNodes(numNodes=delta) elif delta < 0: self._removeNodes(numNodes=-delta) assert len(self.workers) == numNodes return numNodes def _addNodes(self, numNodes): class Worker(object): def __init__(self, jobQueue, secondsPerJob): self.busyEvent = Event() self.stopEvent = Event() def workerFn(): while True: if self.stopEvent.is_set(): return try: jobQueue.get(timeout=1.0) except Empty: continue self.busyEvent.set() time.sleep(secondsPerJob) self.busyEvent.clear() self.startTime = time.time() self.worker = Thread(target=workerFn) self.worker.start() def stop(self): self.stopEvent.set() self.worker.join() return time.time() - self.startTime for i in xrange(numNodes): self.workers.append( Worker(self.jobQueue, self.outer.secondsPerJob)) if len(self.workers) > self.maxWorkers: self.maxWorkers = len(self.workers) def _removeNodes(self, numNodes): while len(self.workers) > 0 and numNodes > 0: worker = self.workers.pop() self.totalWorkerTime += worker.stop() numNodes -= 1 def getNumberOfNodes(self): return len(self.workers)
class LogProcessingWorker(Thread): """""" # ---------------------------------------------------------------------- def __init__(self, *args, **kwargs): self._host = kwargs.pop('host') self._port = kwargs.pop('port') self._transport = kwargs.pop('transport') self._ssl_enable = kwargs.pop('ssl_enable') self._ssl_verify = kwargs.pop('ssl_verify') self._keyfile = kwargs.pop('keyfile') self._certfile = kwargs.pop('certfile') self._ca_certs = kwargs.pop('ca_certs') self._database_path = kwargs.pop('database_path') self._memory_cache = kwargs.pop('cache') self._event_ttl = kwargs.pop('event_ttl') super(LogProcessingWorker, self).__init__(*args, **kwargs) self.daemon = True self.name = self.__class__.__name__ self._shutdown_event = Event() self._flush_event = Event() self._queue = Queue() self._event = None self._database = None self._last_event_flush_date = None self._non_flushed_event_count = None self._logger = None self._rate_limit_storage = None self._rate_limit_strategy = None self._rate_limit_item = None # ---------------------------------------------------------------------- def enqueue_event(self, event): # called from other threads self._queue.put(event) # ---------------------------------------------------------------------- def shutdown(self): # called from other threads self._shutdown_event.set() # ---------------------------------------------------------------------- def run(self): self._reset_flush_counters() self._setup_logger() self._setup_database() try: self._fetch_events() except Exception as e: # we really should not get anything here, and if, the worker thread is dying # too early resulting in undefined application behaviour self._log_general_error(e) # check for empty queue and report if not self._warn_about_non_empty_queue_on_shutdown() # ---------------------------------------------------------------------- def force_flush_queued_events(self): self._flush_event.set() # ---------------------------------------------------------------------- def _reset_flush_counters(self): self._last_event_flush_date = datetime.now() self._non_flushed_event_count = 0 # ---------------------------------------------------------------------- def _clear_flush_event(self): self._flush_event.clear() # ---------------------------------------------------------------------- def _setup_logger(self): self._logger = get_logger(self.name) # rate limit our own messages to not spam around in case of temporary network errors, etc rate_limit_setting = constants.ERROR_LOG_RATE_LIMIT if rate_limit_setting: self._rate_limit_storage = MemoryStorage() self._rate_limit_strategy = FixedWindowRateLimiter(self._rate_limit_storage) self._rate_limit_item = parse_rate_limit(rate_limit_setting) # ---------------------------------------------------------------------- def _setup_database(self): if self._database_path: self._database = DatabaseCache(path=self._database_path, event_ttl=self._event_ttl) else: self._database = MemoryCache(cache=self._memory_cache, event_ttl=self._event_ttl) # ---------------------------------------------------------------------- def _fetch_events(self): while True: try: self._fetch_event() self._process_event() except Empty: # Flush queued (in database) events after internally queued events has been # processed, i.e. the queue is empty. if self._shutdown_requested(): self._flush_queued_events(force=True) return force_flush = self._flush_requested() self._flush_queued_events(force=force_flush) self._delay_processing() self._expire_events() except (DatabaseLockedError, ProcessingError): if self._shutdown_requested(): return else: self._requeue_event() self._delay_processing() # ---------------------------------------------------------------------- def _fetch_event(self): self._event = self._queue.get(block=False) # ---------------------------------------------------------------------- def _process_event(self): try: self._write_event_to_database() except DatabaseLockedError as e: self._safe_log( u'debug', u'Database is locked, will try again later (queue length %d)', self._queue.qsize(), exc=e) raise except Exception as e: self._log_processing_error(e) raise ProcessingError() else: self._event = None # ---------------------------------------------------------------------- def _expire_events(self): try: self._database.expire_events() except DatabaseLockedError: # Nothing to handle, if it fails, we will either successfully publish these messages next time # or we will delete them on the next pass. pass # ---------------------------------------------------------------------- def _log_processing_error(self, exception): self._safe_log( u'exception', u'Log processing error (queue size: %3s): %s', self._queue.qsize(), exception, exc=exception) # ---------------------------------------------------------------------- def _delay_processing(self): self._shutdown_event.wait(constants.QUEUE_CHECK_INTERVAL) # ---------------------------------------------------------------------- def _shutdown_requested(self): return self._shutdown_event.is_set() # ---------------------------------------------------------------------- def _flush_requested(self): return self._flush_event.is_set() # ---------------------------------------------------------------------- def _requeue_event(self): self._queue.put(self._event) # ---------------------------------------------------------------------- def _write_event_to_database(self): self._database.add_event(self._event) self._non_flushed_event_count += 1 # ---------------------------------------------------------------------- def _flush_queued_events(self, force=False): # check if necessary and abort if not if not force and not self._queued_event_interval_reached() and not self._queued_event_count_reached(): return self._clear_flush_event() try: queued_events = self._database.get_queued_events() except DatabaseLockedError as e: self._safe_log( u'debug', u'Database is locked, will try again later (queue length %d)', self._queue.qsize(), exc=e) return # try again later except Exception as e: # just log the exception and hope we can recover from the error self._safe_log(u'exception', u'Error retrieving queued events: %s', e, exc=e) return if queued_events: try: events = [event['event_text'] for event in queued_events] self._send_events(events) except Exception as e: self._safe_log( u'exception', u'An error occurred while sending events: %s', e, exc=e) self._database.requeue_queued_events(queued_events) else: self._delete_queued_events_from_database() self._reset_flush_counters() # ---------------------------------------------------------------------- def _delete_queued_events_from_database(self): try: self._database.delete_queued_events() except DatabaseLockedError: pass # nothing to handle, if it fails, we delete those events in a later run # ---------------------------------------------------------------------- def _queued_event_interval_reached(self): delta = datetime.now() - self._last_event_flush_date return delta.total_seconds() > constants.QUEUED_EVENTS_FLUSH_INTERVAL # ---------------------------------------------------------------------- def _queued_event_count_reached(self): return self._non_flushed_event_count > constants.QUEUED_EVENTS_FLUSH_COUNT # ---------------------------------------------------------------------- def _send_events(self, events): self._transport.send(events) # ---------------------------------------------------------------------- def _log_general_error(self, exc): self._safe_log(u'exception', u'An unexpected error occurred: %s', exc, exc=exc) # ---------------------------------------------------------------------- def _safe_log(self, log_level, message, *args, **kwargs): # we cannot log via the logging subsystem any longer once it has been set to shutdown if self._shutdown_requested(): safe_log_via_print(log_level, message, *args, **kwargs) else: rate_limit_allowed = self._rate_limit_check(kwargs) if rate_limit_allowed <= 0: return # skip further logging due to rate limiting elif rate_limit_allowed == 1: # extend the message to indicate future rate limiting message = \ u'{} (rate limiting effective, ' \ 'further equal messages will be limited)'.format(message) self._safe_log_impl(log_level, message, *args, **kwargs) # ---------------------------------------------------------------------- def _rate_limit_check(self, kwargs): exc = kwargs.pop('exc', None) if self._rate_limit_strategy is not None and exc is not None: key = self._factor_rate_limit_key(exc) # query curent counter for the caller _, remaining = self._rate_limit_strategy.get_window_stats(self._rate_limit_item, key) # increase the rate limit counter for the key self._rate_limit_strategy.hit(self._rate_limit_item, key) return remaining return 2 # any value greater than 1 means allowed # ---------------------------------------------------------------------- def _factor_rate_limit_key(self, exc): module_name = getattr(exc, '__module__', '__no_module__') class_name = exc.__class__.__name__ key_items = [module_name, class_name] if hasattr(exc, 'errno') and isinstance(exc.errno, integer_types): # in case of socket.error, include the errno as rate limiting key key_items.append(str(exc.errno)) return '.'.join(key_items) # ---------------------------------------------------------------------- def _safe_log_impl(self, log_level, message, *args, **kwargs): log_func = getattr(self._logger, log_level) log_func(message, *args, **kwargs) # ---------------------------------------------------------------------- def _warn_about_non_empty_queue_on_shutdown(self): queue_size = self._queue.qsize() if queue_size: self._safe_log( 'warn', u'Non-empty queue while shutting down ({} events pending). ' u'This indicates a previous error.'.format(queue_size), extra=dict(queue_size=queue_size))
class Publisher(PublisherServerBase): def __init__(self, use_nanoconfig_service, publisher_endpoint, nanoconfig_service_endpoint, nanoconfig_update_endpoint, nanoconfig_profile, metrics_store, max_queue_size, max_worker, min_worker=5): """ :param use_nanoconfig_service: Indicates whether or not it should use a nanoconfig service :type use_nanoconfig_service: bool :param publisher_endpoint: Publisher server URI :type publisher_endpoint: str :param nanoconfig_service_endpoint: Nanoconfig service URI :type nanoconfig_service_endpoint: str :param nanoconfig_update_endpoint: Nanoconfig update service URI :type nanoconfig_update_endpoint: str :param nanoconfig_profile: Nanoconfig profile URI :type nanoconfig_profile: str :param max_queue_size: Max size for the message queue :type max_queue_size: int :param max_worker: Max number of worker to be spawned at a given time :type max_worker: int :param min_worker: Min number of worker to be spawned at a given time :type min_worker: int """ super(Publisher, self).__init__( use_nanoconfig_service, publisher_endpoint, nanoconfig_service_endpoint, nanoconfig_update_endpoint, nanoconfig_profile ) self.max_queue_size = max_queue_size self.metrics_store = metrics_store self.min_worker = min_worker self.max_worker = max_worker self.msg_queue = Queue(self.max_queue_size) self.workers = [] @property def num_workers(self): return len(self.workers) def on_receive(self, msg): LOG.debug('[Publisher] Queue msg size = %s | workers = %s', self.msg_queue.qsize(), self.num_workers) try: self.check_workers_alive() self.adjust_pool_size() except OSError as exc: LOG.exception(exc) LOG.error("[Publisher] Error upon receiving a message") self.msg_queue.put(msg) def check_workers_alive(self): # Because we can create new workers in this loop, we create a copy # --> We could otherwise loop onto a new workers... worker_items = self.workers[:] for worker_thread in worker_items: if not worker_thread.is_alive(): self.workers.pop(self.workers.index(worker_thread)) self.start_worker() def adjust_pool_size(self): needed_size = self.msg_queue.qsize() + self.min_worker if abs(needed_size - self.num_workers) > self.min_worker * 2: LOG.debug(("[Publisher] Auto adjust pool size needed size is `%s` " "and the current size is `%s`"), needed_size, self.num_workers) while self.num_workers > min(self.min_worker, needed_size): self.stop_worker() # Create enough, but not too many while self.num_workers < min(self.max_worker, needed_size): self.start_worker() def start_worker(self): LOG.debug("[Publisher] starting worker") worker = Worker(self.msg_queue, self.metrics_store) worker.start() self.workers.append(worker) def stop_worker(self): if self.num_workers: LOG.debug("[Publisher] stopping worker") worker = self.workers.pop(-1) # Pops the last worker worker.stop() def stop(self): super(Publisher, self).stop() join_threads = [] for key in self.workers: t = Thread(target=self.workers.get(key).stop) t.start() join_threads.append(t) for join_thread in join_threads: join_thread.join()
class QueueWorker(BaseWorker): TIMEOUT_ATTEMPTS = 5 QUEUE_SIZE = -1 # inf END_EVENT = object() NAME = 'polyaxon.QueueWorker' def __init__(self, timeout=None, queue_size=None): super(QueueWorker, self).__init__() self._queue = Queue(queue_size or self.QUEUE_SIZE) self._timeout = timeout if timeout is not None else settings.TIMEOUT def atexit(self): with self._lock: if not self.is_alive(): return self._queue.put_nowait(self.END_EVENT) def timeout_join(timeout, queue): end = time() + timeout queue.all_tasks_done.acquire() try: while queue.unfinished_tasks: current_timeout = end - time() if current_timeout <= 0: # timed out return False queue.all_tasks_done.wait(timeout=current_timeout) return True finally: queue.all_tasks_done.release() # ensure wait timeout = min(settings.MIN_TIMEOUT, self._timeout / self.TIMEOUT_ATTEMPTS) if timeout_join(timeout=timeout, queue=self._queue): timeout = 0 else: # Queue still has message, try another time size = self._queue.qsize() if not settings.IN_CLUSTER: print( 'Polyaxon %s is attempting to send %i pending messages' % (self.NAME, size)) print('Waiting up to {} seconds'.format(self._timeout)) if os.name == 'nt': print('Press Ctrl-Break to quit') else: print('Press Ctrl-C to quit') sleep(settings.MIN_TIMEOUT) # Allow tasks to get executed while timeout > 0 and not timeout_join(timeout=timeout, queue=self._queue): timeout = min(timeout + self._timeout / self.TIMEOUT_ATTEMPTS, self._timeout - timeout) size = self._queue.qsize() if size > 0: print( 'Polyaxon %s timed out and did not manage to send %i messages' % (self.NAME, size)) self._thread = None def stop(self, timeout=None): with self._lock: if self._thread: self._queue.put_nowait(self.END_EVENT) self._thread.join(timeout=timeout) self._thread = None self._thread_for_pid = None def queue(self, callback, *args, **kwargs): self.is_running() self._queue.put_nowait((callback, args, kwargs)) def _target(self): while True: record = self._queue.get() try: if record is self.END_EVENT: break callback, args, kwargs = record try: callback(*args, **kwargs) except Exception: logger.error('Failed processing job', exc_info=True) finally: self._queue.task_done() sleep(0)
class CachePipeline(object): def __init__(self, spider, cache): self.spider = spider self.cache = cache self.idle_event = Event() self.queue_size = 100 self.input_queue = Queue() self.result_queue = Queue() self.thread = Thread(target=self.thread_worker) self.thread.daemon = True self.thread.start() def has_free_resources(self): return (self.input_queue.qsize() < self.queue_size and self.result_queue.qsize() < self.queue_size) def is_idle(self): return self.idle_event.is_set() def thread_worker(self): while True: self.idle_event.clear() try: action, data = self.input_queue.get(block=False) except Empty: self.idle_event.set() time.sleep(0.1) self.idle_event.clear() else: assert action in ('load', 'save') if action == 'load': task, grab = data result = None if self.is_cache_loading_allowed(task, grab): result = self.load_from_cache(task, grab) if result: self.result_queue.put(result) else: self.spider.submit_task_to_transport(task, grab) elif action == 'save': task, grab = data if self.is_cache_saving_allowed(task, grab): with self.spider.timer.log_time('cache'): with self.spider.timer.log_time('cache.write'): self.cache.save_response(task.url, grab) def is_cache_loading_allowed(self, task, grab): # 1) cache data should be refreshed # 2) cache is disabled for that task # 3) request type is not cacheable return (not task.get('refresh_cache', False) and not task.get('disable_cache', False) and grab.detect_request_method() == 'GET') def is_cache_saving_allowed(self, task, grab): """ Check if network transport result could be saved to cache layer. res: {ok, grab, grab_config_backup, task, emsg} """ if grab.request_method == 'GET': if not task.get('disable_cache'): if self.spider.is_valid_network_response_code( grab.response.code, task): return True return False def load_from_cache(self, task, grab): with self.spider.timer.log_time('cache'): with self.spider.timer.log_time('cache.read'): cache_item = self.cache.get_item( grab.config['url'], timeout=task.cache_timeout) if cache_item is None: return None else: with self.spider.timer.log_time('cache.read.prepare_request'): grab.prepare_request() with self.spider.timer.log_time('cache.read.load_response'): self.cache.load_response(grab, cache_item) grab.log_request('CACHED') self.spider.stat.inc('spider:request-cache') return {'ok': True, 'task': task, 'grab': grab, 'grab_config_backup': grab.dump_config(), 'emsg': None}
class PluginRunner(object): class State(object): RUNNING = 'RUNNING' STOPPED = 'STOPPED' def __init__(self, name, runtime_path, plugin_path, logger, command_timeout=5.0, state_callback=None): self.runtime_path = runtime_path self.plugin_path = plugin_path self.command_timeout = command_timeout self._logger = logger self._cid = 0 self._proc = None # type: Optional[subprocess.Popen[bytes]] self._running = False self._process_running = False self._command_lock = Lock() self._response_queue = Queue() # type: Queue[Dict[str,Any]] self._writer = None # type: Optional[PluginIPCWriter] self._reader = None # type: Optional[PluginIPCReader] self._state_callback = state_callback # type: Optional[Callable[[str, str], None]] self.name = name self.version = None self.interfaces = None self._decorators_in_use = {} self._exposes = [] self._metric_collectors = [] self._metric_receivers = [] self._async_command_thread = None self._async_command_queue = None # type: Optional[Queue[Optional[Dict[str, Any]]]] self._commands_executed = 0 self._commands_failed = 0 self.__collector_runs = {} # type: Dict[str,float] def start(self): # type: () -> None if self._running: raise Exception('PluginRunner is already running') self.logger('[Runner] Starting') python_executable = sys.executable if python_executable is None or len(python_executable) == 0: python_executable = '/usr/bin/python' self._proc = subprocess.Popen([ python_executable, 'runtime.py', 'start_plugin', self.plugin_path ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None, cwd=self.runtime_path, close_fds=True) assert self._proc.stdout, 'Plugin stdout not available' self._process_running = True self._commands_executed = 0 self._commands_failed = 0 assert self._proc.stdin, 'Plugin stdin not defined' self._writer = PluginIPCWriter(stream=self._proc.stdin) self._reader = PluginIPCReader(stream=self._proc.stdout, logger=lambda message, ex: self.logger( '{0}: {1}'.format(message, ex)), command_receiver=self._process_command, name=self.name) self._reader.start() start_out = self._do_command('start', timeout=180) self.name = start_out['name'] self.version = start_out['version'] self.interfaces = start_out['interfaces'] self._decorators_in_use = start_out['decorators'] self._exposes = start_out['exposes'] self._metric_collectors = start_out['metric_collectors'] self._metric_receivers = start_out['metric_receivers'] exception = start_out.get('exception') if exception is not None: raise RuntimeError(exception) self._async_command_queue = Queue(1000) self._async_command_thread = BaseThread( name='plugincmd{0}'.format(self.plugin_path), target=self._perform_async_commands) self._async_command_thread.daemon = True self._async_command_thread.start() self._running = True if self._state_callback is not None: self._state_callback(self.name, PluginRunner.State.RUNNING) self.logger('[Runner] Started') def logger(self, message): # type: (str) -> None self._logger(message) logger_.info('Plugin {0} - {1}'.format(self.name, message)) def get_webservice(self, webinterface): # type: (WebInterface) -> Service return Service(self, webinterface) def is_running(self): # type: () -> bool return self._running def stop(self): # type: () -> None if self._process_running: self._running = False self.logger('[Runner] Sending stop command') try: self._do_command('stop') except Exception as exception: self.logger( '[Runner] Exception during stopping plugin: {0}'.format( exception)) time.sleep(0.1) if self._reader: self._reader.stop() self._process_running = False if self._async_command_queue is not None: self._async_command_queue.put( None) # Triggers an abort on the read thread if self._proc and self._proc.poll() is None: self.logger('[Runner] Terminating process') try: self._proc.terminate() except Exception as exception: self.logger( '[Runner] Exception during terminating plugin: {0}'. format(exception)) time.sleep(0.5) if self._proc.poll() is None: self.logger('[Runner] Killing process') try: self._proc.kill() except Exception as exception: self.logger( '[Runner] Exception during killing plugin: {0}'. format(exception)) if self._state_callback is not None: self._state_callback(self.name, PluginRunner.State.STOPPED) self.logger('[Runner] Stopped') def process_input_status(self, data, action_version=1): if action_version in [1, 2]: if action_version == 1: payload = {'status': data} else: event_json = data.serialize() payload = {'event': event_json} self._do_async(action='input_status', payload=payload, should_filter=True, action_version=action_version) else: self.logger('Input status version {} not supported.'.format( action_version)) def process_output_status(self, data, action_version=1): if action_version in [1, 2]: if action_version == 1: payload = {'status': data} else: event_json = data.serialize() payload = {'event': event_json} self._do_async(action='output_status', payload=payload, should_filter=True, action_version=action_version) else: self.logger('Output status version {} not supported.'.format( action_version)) def process_shutter_status(self, data, action_version=1): if action_version in [1, 2, 3]: if action_version == 1: payload = {'status': data} elif action_version == 2: status, detail = data payload = {'status': {'status': status, 'detail': detail}} else: event_json = data.serialize() payload = {'event': event_json} self._do_async(action='shutter_status', payload=payload, should_filter=True, action_version=action_version) else: self.logger('Shutter status version {} not supported.'.format( action_version)) def process_ventilation_status(self, data, action_version=1): if action_version in [1]: event_json = data.serialize() payload = {'event': event_json} self._do_async(action='ventilation_status', payload=payload, should_filter=True, action_version=action_version) else: self.logger('Ventilation status version {} not supported.'.format( action_version)) def process_thermostat_status(self, data, action_version=1): if action_version in [1]: event_json = data.serialize() payload = {'event': event_json} self._do_async(action='thermostat_status', payload=payload, should_filter=True, action_version=action_version) else: self.logger('Thermostat status version {} not supported.'.format( action_version)) def process_thermostat_group_status(self, data, action_version=1): if action_version in [1]: event_json = data.serialize() payload = {'event': event_json} self._do_async(action='thermostat_group_status', payload=payload, should_filter=True, action_version=action_version) else: self.logger( 'Thermostat group status version {} not supported.'.format( action_version)) def process_event(self, code): self._do_async('receive_events', {'code': code}, should_filter=True) def collect_metrics(self): for mc in self._metric_collectors: try: now = time.time() (name, interval) = (mc['name'], mc['interval']) if self.__collector_runs.get(name, 0.0) < now - interval: self.__collector_runs[name] = now metrics = self._do_command('collect_metrics', {'name': name})['metrics'] for metric in metrics: if metric is None: continue metric['source'] = self.name yield metric except Exception as exception: self.logger( '[Runner] Exception while collecting metrics {0}: {1}'. format(exception, traceback.format_exc())) def get_metric_receivers(self): return self._metric_receivers def distribute_metrics(self, method, metrics): self._do_async('distribute_metrics', { 'name': method, 'metrics': metrics }) def get_metric_definitions(self): return self._do_command('get_metric_definitions')['metric_definitions'] def request(self, method, args=None, kwargs=None): if args is None: args = [] if kwargs is None: kwargs = {} ret = self._do_command('request', { 'method': method, 'args': args, 'kwargs': kwargs }) if ret['success']: return ret['response'] elif 'stacktrace' in ret: raise Exception('{0}: {1}'.format(ret['exception'], ret['stacktrace'])) raise Exception(ret['exception']) def remove_callback(self): # type: () -> None self._do_command('remove_callback') def _process_command(self, response): # type: (Dict[str,Any]) -> None if not self._process_running: return assert self._proc, 'Plugin process not defined' exit_code = self._proc.poll() if exit_code is not None: self.logger( '[Runner] Stopped with exit code {0}'.format(exit_code)) self._process_running = False return if response['cid'] == 0: self._handle_async_response(response) elif response['cid'] == self._cid: self._response_queue.put(response) else: self.logger( '[Runner] Received message with unknown cid: {0}'.format( response)) def _handle_async_response(self, response): # type: (Dict[str,Any]) -> None if response['action'] == 'logs': self.logger(response['logs']) else: self.logger('[Runner] Unkown async message: {0}'.format(response)) def _do_async(self, action, payload, should_filter=False, action_version=1): # type: (str, Dict[str,Any], bool, int) -> None has_receiver = False for decorator_name, decorator_versions in six.iteritems( self._decorators_in_use): # the action version is linked to a specific decorator version has_receiver |= (action == decorator_name and action_version in decorator_versions) if not self._process_running or (should_filter and not has_receiver): return try: assert self._async_command_queue, 'Command Queue not defined' self._async_command_queue.put( { 'action': action, 'payload': payload, 'action_version': action_version }, block=False) except Full: self.logger('Async action cannot be queued, queue is full') def _perform_async_commands(self): # type: () -> None while self._process_running: try: # Give it a timeout in order to check whether the plugin is not stopped. assert self._async_command_queue, 'Command Queue not defined' command = self._async_command_queue.get(block=True, timeout=10) if command is None: continue # Used to exit this thread self._do_command(command['action'], payload=command['payload'], action_version=command['action_version']) except Empty: self._do_async('ping', {}) except Exception as exception: self.logger( '[Runner] Failed to perform async command: {0}'.format( exception)) def _do_command(self, action, payload=None, timeout=None, action_version=1): # type: (str, Dict[str,Any], Optional[float], int) -> Dict[str,Any] if payload is None: payload = {} self._commands_executed += 1 if timeout is None: timeout = self.command_timeout if not self._process_running: raise Exception('Plugin was stopped') with self._command_lock: try: command = self._create_command(action, payload, action_version) assert self._writer, 'Plugin stdin not defined' self._writer.write(command) except Exception: self._commands_failed += 1 raise try: response = self._response_queue.get(block=True, timeout=timeout) while response['cid'] != self._cid: response = self._response_queue.get(block=False) exception = response.get('_exception') if exception is not None: raise RuntimeError(exception) return response except Empty: metadata = '' if action == 'request': metadata = ' {0}'.format(payload['method']) if self._running: self.logger( '[Runner] No response within {0}s ({1}{2})'.format( timeout, action, metadata)) self._commands_failed += 1 raise Exception('Plugin did not respond') def _create_command(self, action, payload=None, action_version=1): # type: (str, Dict[str,Any], int) -> Dict[str,Any] if payload is None: payload = {} self._cid += 1 command = { 'cid': self._cid, 'action': action, 'action_version': action_version } command.update(payload) return command def error_score(self): # type: () -> float if self._commands_executed == 0: return 0.0 else: score = float(self._commands_failed) / self._commands_executed self._commands_failed = 0 self._commands_executed = 0 return score def get_queue_length(self): # type: () -> int if self._async_command_queue is None: return 0 return self._async_command_queue.qsize()
class Hue(OMPluginBase): name = 'Hue' version = '1.1.2' interfaces = [('config', '1.0')] config_description = [{ 'name': 'api_url', 'type': 'str', 'description': 'The API URL of the Hue Bridge device. E.g. http://192.168.1.2/api' }, { 'name': 'username', 'type': 'str', 'description': 'Hue Bridge generated username.' }, { 'name': 'poll_frequency', 'type': 'int', 'description': 'The frequency used to pull the status of all outputs from the Hue bridge in seconds (0 means never)' }, { 'name': 'output_mapping', 'type': 'section', 'description': 'Mapping between OpenMotics Virtual Outputs/Dimmers and Hue Outputs', 'repeat': True, 'min': 0, 'content': [{ 'name': 'output_id', 'type': 'int' }, { 'name': 'hue_output_id', 'type': 'int' }] }] default_config = { 'api_url': 'http://hue/api', 'username': '', 'poll_frequency': 60 } def __init__(self, webinterface, gateway_logger): self.setup_logging(log_function=gateway_logger) super(Hue, self).__init__(webinterface, logger) logger.info('Starting Hue plugin %s ...', self.version) self.discover_hue_bridges() self._config = self.read_config(Hue.default_config) self._config_checker = PluginConfigChecker(Hue.config_description) self._read_config() self._io_lock = Lock() self._output_event_queue = Queue(maxsize=256) logger.info("Hue plugin started") @staticmethod def setup_logging(log_function): # type: (Callable) -> None logger.setLevel(logging.INFO) log_handler = PluginLogHandler(log_function=log_function) # some elements like time and name are added by the plugin runtime already # formatter = logging.Formatter('%(asctime)s - %(name)s - %(threadName)s - %(levelname)s - %(message)s') formatter = logging.Formatter( '%(threadName)s - %(levelname)s - %(message)s') log_handler.setFormatter(formatter) logger.addHandler(log_handler) def _read_config(self): self._api_url = self._config['api_url'] self._output_mapping = self._config.get('output_mapping', []) self._output = self._create_output_object() self._hue = self._create_hue_object() self._username = self._config['username'] self._poll_frequency = self._config['poll_frequency'] self._endpoint = '{0}/{1}/{{0}}'.format(self._api_url, self._username) self._enabled = self._api_url != '' and self._username != '' logger.info( 'Hue is {0}'.format('enabled' if self._enabled else 'disabled')) def _create_output_object(self): # create an object with the OM output IDs as the keys and hue light IDs as the values output_object = {} for entry in self._output_mapping: output_object[entry['output_id']] = entry['hue_output_id'] return output_object def _create_hue_object(self): # create an object with the hue light IDs as the keys and OM output IDs as the values hue_object = {} for entry in self._output_mapping: hue_object[entry['hue_output_id']] = entry['output_id'] return hue_object @output_status(version=2) def output_status(self, output_event): if self._enabled is True: try: output_id = output_event['id'] state = output_event['status'].get('on') dimmer_level = output_event['status'].get('value') hue_light_id = self._output.get(output_id) if hue_light_id is not None: logger.info( 'Switching output %s (hue id: %s) %s (dimmer: %s)', output_id, hue_light_id, 'ON' if state else 'OFF', dimmer_level) data = (hue_light_id, state, dimmer_level) self._output_event_queue.put(data) else: logger.debug( 'Ignoring output %s, because it is not Hue'.format( output_id)) except Exception as ex: logger.exception( 'Error processing output_status event: {0}'.format(ex)) def _send(self, hue_light_id, state, dimmer_level): try: state = {'on': state} if dimmer_level is not None: state.update( {'bri': self._dimmerLevelToBrightness(dimmer_level)}) self._setLightState(hue_light_id, state) except Exception as ex: logger.exception('Error sending command to output with hue id: %s', hue_light_id) def _getLightState(self, hue_light_id): try: start = time.time() response = requests.get( url=self._endpoint.format('lights/{0}').format(hue_light_id)) if response.status_code is 200: hue_light = response.json() logger.info('Getting output state for hue id: %s took %ss', hue_light_id, round(time.time() - start, 2)) return hue_light else: logger.warning('Failed to pull state for hue id: %s', hue_light_id) return False except Exception as ex: logger.exception('Error while getting output state for hue id: %s', hue_light_id) def _setLightState(self, hue_light_id, state): try: start = time.time() response = requests.put(url=self._endpoint.format( 'lights/{0}/state').format(hue_light_id), data=json.dumps(state)) if response.status_code is 200: result = response.json() if result[0].get('success') is None: logger.info( 'Setting output state for Hue output {0} returned unexpected result. Response: {1} ({2})' .format(hue_light_id, response.text, response.status_code)) return False logger.info('Setting output state for hue id: %s took %ss', hue_light_id, round(time.time() - start, 2)) return True else: logger.error( 'Setting output state for hue id: %s failed. Response: %s (%s)', hue_light_id, response.text, response.status_code) return False except Exception as ex: logger.exception( 'Error while setting output state for hue id: %s to %s', hue_light_id, json.dumps(state)) def import_remote_state(self): if not self._output_event_queue.empty(): logger.info( 'Ignoring syncing remote state because we still need to process %s output events', self._output_event_queue.qsize()) else: try: self._import_lights_state() except Exception as ex: logger.exception( 'Error while getting state for all Hue outputs') try: self._import_sensors_state() except Exception as ex: logger.exception( 'Error while getting state for all Hue sensors') def _import_lights_state(self): logger.debug( 'Syncing remote state for all outputs from the Hue bridge') hue_lights = self._getAllLightsState() for output in self._output_mapping: output_id = output['output_id'] hue_light_id = str(output['hue_output_id']) hue_light_state = hue_lights.get(hue_light_id) if hue_light_state is not None: if hue_light_state.get('on', False): result = json.loads( self.webinterface.set_output( None, str(output_id), 'true', str(hue_light_state['dimmer_level']))) else: result = json.loads( self.webinterface.set_output(None, str(output_id), 'false')) if result['success'] is False: logger.error( 'Error when updating output %s (hue id: %s): %s', output_id, hue_light_id, result['msg']) else: logger.warning( 'Output %s (hue id: %s) not found on Hue bridge', output_id, hue_light_id) def _import_sensors_state(self): logger.debug( 'Syncing remote state for all sensors from the Hue bridge') known_sensors = self._get_known_sensors() hue_sensors = self._getAllSensorsState() for hue_sensor_id, sensor in hue_sensors.items(): sensor_external_id = sensor['external_id'] if sensor_external_id not in known_sensors.keys(): name = 'Hue Sensor {}'.format(hue_sensor_id) om_sensor_id = self._register_sensor(name, sensor_external_id) else: om_sensor_id = known_sensors[sensor_external_id] value = float(sensor.get('value')) if om_sensor_id is not None: self._update_sensor(om_sensor_id, value) else: logger.error('Hue sensor %s (%s) not found', hue_sensor_id, sensor_external_id) def _get_known_sensors(self): response = self.webinterface.get_sensor_configurations() data = json.loads(response) return { x['external_id']: x['id'] for x in data['config'] if x.get('source', {}).get('name') == Hue.name and x['external_id'] not in [None, ''] } def _getAllSensorsState(self): hue_sensors = {} response = requests.get(url=self._endpoint.format('sensors')) if response.status_code is 200: for hue_sensor_id, data in response.json().items(): if data.get('type') == 'ZLLTemperature': hue_sensors[hue_sensor_id] = self._parseSensorObject( hue_sensor_id, data, sensor_type='temperature') else: logger.error('Failed to pull state for all sensors (HTTP %s)', response.status_code) return hue_sensors def _getAllLightsState(self): hue_lights = {} response = requests.get(url=self._endpoint.format('lights')) if response.status_code is 200: for hue_light_id, data in response.json().items(): hue_lights[hue_light_id] = self._parseLightObject( hue_light_id, data) else: logger.error('Failed to pull state for all outputs (HTTP %s)', response.status_code) return hue_lights def _parseLightObject(self, hue_light_id, hue_light_object): light = {'id': hue_light_id} try: light.update({ 'name': hue_light_object['name'], 'on': hue_light_object['state'].get('on', False), 'brightness': hue_light_object['state'].get('bri', 254) }) light['dimmer_level'] = self._brightnessToDimmerLevel( light['brightness']) except Exception as ex: logger.exception('Error while parsing Hue light %s', hue_light_object) return light def _parseSensorObject(self, hue_sensor_id, hue_sensor_object, sensor_type='temperature'): sensor = {'id': hue_sensor_id} try: value = hue_sensor_object['state'][sensor_type] if sensor_type == 'temperature': value /= 100.0 sensor.update({ 'external_id': hue_sensor_object['uniqueid'], 'name': hue_sensor_object['name'], 'type': hue_sensor_object['type'], 'value': value }) except Exception as ex: logger.exception('Error while parsing Hue sensor %s', hue_sensor_object) return sensor def _brightnessToDimmerLevel(self, brightness): return int(round(brightness / 2.54)) def _dimmerLevelToBrightness(self, dimmer_level): return int(round(dimmer_level * 2.54)) @background_task def run(self): if self._enabled: event_processor = Thread(target=self.output_event_processor, name='output_event_processor') event_processor.start() self.log_remote_asset_list() self.start_state_poller() def log_remote_asset_list(self): hue_lights = self._getAllLightsState() for hue_id, hue_light in hue_lights.items(): logger.info('Discovered hue output %s (hue id: %s)', hue_light.get('name'), hue_id) hue_sensors = self._getAllSensorsState() for hue_id, hue_sensor in hue_sensors.items(): logger.info('Discovered hue sensor %s (hue id: %s)', hue_sensor.get('name'), hue_id) def start_state_poller(self): while self._poll_frequency > 0: start = time.time() self.import_remote_state() # This loop will run approx. every 'poll_frequency' seconds sleep = self._poll_frequency - (time.time() - start) if sleep < 0: sleep = 1 self.sleep(sleep) @om_expose def get_config_description(self): return json.dumps(Hue.config_description) @om_expose def get_config(self): return json.dumps(self._config) @om_expose def set_config(self, config): config = json.loads(config) for key in config: if isinstance(config[key], six.string_types): config[key] = str(config[key]) self._config_checker.check_config(config) self._config = config self._read_config() self.write_config(config) return json.dumps({'success': True}) def output_event_processor(self): while self._enabled: try: _latest_value_buffer = {} while True: try: hue_light_id, status, dimmer = self._output_event_queue.get( block=True, timeout=1) _latest_value_buffer[hue_light_id] = ( status, dimmer ) # this will ensure only the latest value is taken except Empty: break for hue_light_id, (status, dimmer) in _latest_value_buffer.items(): self._send(hue_light_id, status, dimmer) self.sleep( 0.1 ) # "throttle" requests to the bridge to avoid overloading except Exception as ex: if 'maintenance_mode' in ex.message: logger.warning( 'System in maintenance mode. Processing paused for 1 minute.' ) self.sleep(60) else: logger.exception( 'Unexpected error processing output events') self.sleep(10) def sleep(self, timer): now = time.time() expired = time.time() - now > timer while self._enabled and not expired: time.sleep(min(timer, 0.5)) expired = time.time() - now > timer def _getLightState(self, hue_light_id): try: start = time.time() response = requests.get(url=self._endpoint.format( 'lights/{0}').format(hue_light_id)) if response.status_code is 200: hue_light = response.json() logger.info( 'Getting output state for hue id: %s took %ss', hue_light_id, round(time.time() - start, 2)) return hue_light else: logger.warning('Failed to pull state for hue id: %s', hue_light_id) return False except Exception as ex: logger.exception( 'Error while getting output state for hue id: %s', hue_light_id) def discover_hue_bridges(self): try: response = requests.get(url='https://discovery.meethue.com/') if response.status_code is 200: hue_bridge_data = response.json() for hue_bridge in hue_bridge_data: logger.info('Discovered hue bridge %s @ %s', hue_bridge.get('id'), hue_bridge.get('internalipaddress')) else: logger.warning('Failed to discover bridges on this network') return False except Exception as ex: logger.exception( 'Error while discovering hue bridges on this network') def _register_sensor(self, name, external_id): logger.debug('Registering sensor with name %s and external_id %s', name, external_id) try: config = { 'name': name, } response = self.webinterface.sensor.register( external_id=external_id, physical_quantity='temperature', unit='celcius', config=config) return response.id except Exception as e: logger.warning( 'Failed registering sensor with name %s and external_id %s with exception %s', name, external_id, str(e.message)) return None def _update_sensor(self, sensor_id, value): logger.debug('Updating sensor %s with status %s', sensor_id, value) response = self.webinterface.sensor.set_status(sensor_id=sensor_id, value=value) if response is None: logger.warning('Could not set the updated sensor value') return False return True
class eptWorkerFabric(object): """ tracks cache and settings for each fabric actively being monitored, also provides useful notification and push_event functions """ def __init__(self, fabric): self.fabric = fabric self.start_ts = time.time() self.settings = eptSettings.load(fabric=fabric, settings="default") self.cache = eptCache(fabric) self.dns_cache = DNSCache() self.db = get_db() self.watcher_paused = False self.session = None self.notify_queue = None self.notify_thread = None self.init() def init(self): """ initialize settings after fabric settings as been loaded """ # epm parser used with eptWorker for creating pseudo eevents self.ept_epm_parser = eptEpmEventParser(self.fabric, self.settings.overlay_vnid) # one time calculation for email address and syslog server (which requires valid port) self.email_address = self.settings.email_address self.syslog_server = self.settings.syslog_server self.syslog_port = self.settings.syslog_port if len(self.email_address) == 0: self.email_address = None if len(self.syslog_server) == 0: self.syslog_server = None self.syslog_port = None def close(self): """ stateful close when worker receives FABRIC_STOP for this fabric """ if self.db is not None: self.db.client.close() if self.session is not None: self.session.close() if self.notify_thread is not None: self.notify_thread.exit() # remove all objects from notify queue if self.notify_queue is not None: try: logger.debug("clearing notify queue (size: %d)", self.notify_queue.qsize()) while not self.notify_queue.empty(): self.notify_queue.get() except Exception as e: logger.debug("Traceback:\n%s", traceback.format_exc()) logger.error("failed to execute clear notify queue %s", e) def watcher_init(self): """ watcher process needs session object for mo sync and notify queue""" logger.debug("wf worker init for %s", self.fabric) self.notify_queue = Queue(maxsize=NOTIFY_QUEUE_MAX_SIZE) logger.debug("starting worker fabric apic session") self.session = get_apic_session(self.fabric) if self.session is None: logger.error("failed to get session object within worker fabric") # watcher will also send notifications within a background thread to ensure that # any delay in syslog or email does not backup other service events self.notify_thread = BackgroundThread(func=self.execute_notify, name="notify", count=0, interval=NOTIFY_INTERVAL) self.notify_thread.daemon = True self.notify_thread.start() def settings_reload(self): """ reload settings from db """ logger.debug("reloading settings for %s", self.fabric) self.settings.reload() self.init() def get_uptime_delta_offset(self, delta=None): """ return difference between provided delta and current uptime. If the uptime_delta is less than zero, return 0. If no delta is provided, then return the uptime. """ uptime = time.time() - self.start_ts if delta is None: return uptime uptime_delta = delta - uptime if uptime_delta > 0: return uptime_delta return 0 def push_event(self, table, key, event, per_node=True): # wrapper to push an event to eptHistory events list. set per_node to false to use # max_endpoint_event rotate length, else max_per_node_endpoint_events value is used if per_node: return push_event( self.db[table], key, event, rotate=self.settings.max_per_node_endpoint_events) else: return push_event(self.db[table], key, event, rotate=self.settings.max_endpoint_events) def get_learn_type(self, vnid, flags=[]): # based on provide vnid and flags return learn type for endpoint: # loopback - if loopback in flags # psvi - if psvi in flags # svi - represents an external SVI (should be set on all external svis) # overlay if vnid is overlay vnid # external if vnid is in eptVnid table with external set to true # else returns 'epg' (default learn type) if "loopback" in flags: return "loopback" elif "psvi" in flags: return "psvi" elif "svi" in flags: return "external" elif vnid == self.settings.overlay_vnid: return "overlay" ept_vnid = self.cache.get_vnid_name(vnid, return_object=True) if ept_vnid is not None and ept_vnid.external: return "external" return "epg" def notification_enabled(self, notify_type): # return dict with email address, syslog server, syslog port for notify type. If not enabled, # then return None for each field. Set notify_type to 'any_email' or 'any_syslog' to force # test of a particular notification type ret = { "enabled": False, "email_address": None, "syslog_server": None, "syslog_port": None } if notify_type == "move": attr = ("notify_move_email", "notify_move_syslog") elif notify_type == "stale": attr = ("notify_stale_email", "notify_stale_syslog") elif notify_type == "offsubnet": attr = ("notify_offsubnet_email", "notify_offsubnet_syslog") elif notify_type == "clear": attr = ("notify_clear_email", "notify_clear_syslog") elif notify_type == "rapid": attr = ("notify_rapid_email", "notify_rapid_syslog") elif notify_type == "any_email": # return all notification types enabled ret["enabled"] = True ret["email_address"] = self.email_address return ret elif notify_type == "any_syslog": # return all notification types enabled ret["enabled"] = True ret["syslog_server"] = self.syslog_server ret["syslog_port"] = self.syslog_port return ret else: logger.warn("invalid notification type '%s", notify_type) return ret if getattr(self.settings, attr[0]): ret["enabled"] = True ret["email_address"] = self.email_address if getattr(self.settings, attr[1]): ret["enabled"] = True ret["syslog_server"] = self.syslog_server ret["syslog_port"] = self.syslog_port return ret def send_notification(self, notify_type, subject=None, txt=None, bulk=None): # send proper notifications for this fabric. set notify_type to none to skip enable check # and force notification. user can set bulk to list of (subject/txt) tuples to send a list # of notifications at the same time. All notifications must be of the same notify_type. success = True errmsg = "" notify = self.notification_enabled(notify_type) if notify["enabled"]: if notify["email_address"] is not None: emails = [] if bulk is not None: for (bulk_subject, bulk_txt) in bulk: emails.append({ "sender": get_app_config().get("EMAIL_SENDER", None), "receiver": notify["email_address"], "subject": bulk_subject, "msg": bulk_txt, }) else: emails.append({ "sender": get_app_config().get("EMAIL_SENDER", None), "receiver": notify["email_address"], "subject": subject, "msg": txt, }) # send_email already supports a list of emails, so simply send all at once (success, errmsg) = send_emails(settings=self.settings, dns_cache=self.dns_cache, emails=emails) if not success: logger.warn("failed to send email: %s", errmsg) if notify["syslog_server"] is not None: if bulk is not None: for (bulk_subject, bulk_txt) in bulk: syslog( bulk_txt, dns_cache=self.dns_cache, server=notify["syslog_server"], server_port=notify["syslog_port"], ) else: syslog( txt, dns_cache=self.dns_cache, server=notify["syslog_server"], server_port=notify["syslog_port"], ) return (success, errmsg) else: logger.debug("skipping send notification as '%s' is not enabled", notify_type) return (False, "notification not enabled") def queue_notification(self, notify_type, subject, txt): # queue notification that will be sent at next iteration of NOTIFY_INTERVAL if self.notify_queue is None: logger.error("notify queue not initialized for worker fabric") return try: logger.debug("enqueuing %s notification (queue size %d)", notify_type, self.notify_queue.qsize()) self.notify_queue.put_nowait( (notify_type, subject, txt, time.time())) except Full as e: logger.error( "failed to enqueue notification, queue is full (size: %s)", self.notify_queue.qsize()) def execute_notify(self): # send any notifications sitting in queue, log number of notifications sent and max queue # time. We want to support bulk notifications (mainly for email to prevent multiple login # on smpt_relay_auth setups) so this function will sort based on notify type and execute # send notification with bulk flag. msgs = {} # indexed by notify type and contains tuple (subject,txt) count = 0 max_queue_time = 0 while not self.notify_queue.empty(): (notify_type, subject, txt, q_ts) = self.notify_queue.get() count += 1 q_time = time.time() - q_ts if q_time > max_queue_time: max_queue_time = q_time if notify_type not in msgs: msgs[notify_type] = [] msgs[notify_type].append((subject, txt)) for notify_type in msgs: self.send_notification(notify_type, bulk=msgs[notify_type]) if count > 0: logger.debug("sent %s notifications, max queue time %0.3f sec", count, max_queue_time)
if choice != '': job_queue.put({ 'q': entity_id, 'action': 'set_description', 'lang': 'nb', 'value': choice, 'summary': '#no_to_nb cleanup drive' }) if labels['no'].get('description') is not None: job_queue.put({ 'q': entity_id, 'action': 'remove_description', 'lang': 'no', 'summary': '#no_to_nb cleanup drive' }) completed += 1 t1 = time.time() - t0 if not bg_thread.isAlive(): print('Thread exited. Starting a new') bg_thread = start_thread() while job_queue.qsize() > 100: print('Job queue length: %d, sleeping a while' % job_queue.qsize()) time.sleep(10) print('Status: Fixed %s items. %s items left to go, time: %.2f sec/item, job queue length: %d' % (completed, len(rows), t1 / completed, job_queue.qsize())) print('*** Main thread waiting') job_queue.join() print('*** Done')
class HyperDash: """HyperDash monitors a job and manages capturing IO / server comms. This class is designed to be run in its own thread and contains an instance of code_runner (which is running the job) and server_manager (for talking to the server.) """ def __init__( self, job_name, code_runner, server_manager, io_bufs, std_streams, ): """Initialize the HyperDash class. args: 1) job_name: Name of the current running job 2) code_runner: Instance of CodeRunner 3) server_manager: ServerManager instance 4) io_bufs: Tuple in the form of (StringIO(), StringIO(),) 5) std_streams: Tuple in the form of (StdOut, StdErr) """ self.job_name = job_name self.code_runner = code_runner self.server_manager = server_manager self.out_buf, self.err_buf = io_bufs self.std_out, self.std_err = std_streams self.programmatic_exit = False self.shutdown_network_channel = Queue() self.shutdown_main_channel = Queue() # Used to keep track of the current position in the IO buffers self.out_buf_offset = 0 self.err_buf_offset = 0 # Create a UUID to uniquely identify this run from the SDK's point of view self.current_sdk_run_uuid = str(uuid.uuid4()) # Create run_start message before doing any other setup work to make sure that the # run_started message always precedes any other messages self.server_manager.put_buf( create_run_started_message(self.current_sdk_run_uuid, self.job_name), ) def on_stdout_flush(): self.capture_io() self.std_out.flush() self.flush_log_file() def on_stderr_flush(): self.capture_io() self.std_err.flush() self.flush_log_file() self.out_buf.set_on_flush(on_stdout_flush) self.err_buf.set_on_flush(on_stderr_flush) self.logger = logging.getLogger("hyperdash.{}".format(__name__)) self.log_file, self.log_file_path = self.open_log_file() if not self.log_file: self.logger.error( "Could not create logs file. Logs will not be stored locally.") def open_log_file(self): log_folder = get_hyperdash_logs_home_path() # Make sure logs directory exists (/logs) if not os.path.exists(log_folder): try: os.makedirs(log_folder) except OSError as exc: if exc.errno != errno.EEXIST: pass return None, None job_log_folder = get_hyperdash_logs_home_path_for_job(self.job_name) # Make sure directory for job exists in log directory (/logs/<JOB_NAME>) if not os.path.exists(job_log_folder): try: os.makedirs(job_log_folder) except OSError as exc: if exc.errno != errno.EEXIST: pass return None, None # Create new log file for run try: iso = slugify(datetime.datetime.now().isoformat()) logfile_name = "{}_{}.log".format(slugify(self.job_name), iso) logfile_path = os.path.join(job_log_folder, logfile_name) return open(logfile_path, "a"), logfile_path except IOError: return None, None def capture_io(self): self.out_buf.acquire() out = self.out_buf.getvalue() len_out = len(out) - self.out_buf_offset self.print_out(out[self.out_buf_offset:]) if len_out != 0 else None self.out_buf_offset += len_out self.out_buf.release() self.err_buf.acquire() err = self.err_buf.getvalue() len_err = len(err) - self.err_buf_offset self.print_err(err[self.err_buf_offset:]) if len_err != 0 else None self.err_buf_offset += len_err self.err_buf.release() def print_out(self, s): message = create_log_message(self.current_sdk_run_uuid, INFO_LEVEL, s) self.server_manager.put_buf(message) self.std_out.write(s) self.write_to_log_file(s) def print_err(self, s): message = create_log_message(self.current_sdk_run_uuid, ERROR_LEVEL, s) self.server_manager.put_buf(message) self.std_err.write(s) self.write_to_log_file(s) def write_to_log_file(self, s): if self.log_file: if PY2: self.log_file.write(s.encode("utf-8")) else: self.log_file.write(s) def flush_log_file(self): if self.log_file: self.log_file.flush() def cleanup(self, exit_status): self.print_log_file_location() self.capture_io() self.server_manager.put_buf( create_run_ended_message(self.current_sdk_run_uuid, exit_status), ) self.flush_log_file() self.shutdown_network_channel.put(True) def sudden_cleanup(self): self.print_log_file_location() # Send what we can to local log self.capture_io() self.flush_log_file() # Make a best-effort attempt to notify server that the run was # canceled by the user, but don't wait for all messages to # be flushed to server so we don't hang the user's terminal. self.server_manager.send_message( create_run_ended_message(self.current_sdk_run_uuid, "user_canceled"), raise_exceptions=False, timeout_seconds=1, ) def print_log_file_location(self): if self.log_file_path: self.logger.info( "Logs for this run of {} are available locally at: {}".format( self.job_name, self.log_file_path, )) def run(self): """ run_http works using three separate threads: 1) code_runner thread which runs the user's code 2) network_thread which does blocking I/O with the server 3) event_loop thread which runs the SDK's main event loop (this is just the main thread) We require the event loop and network loop to be in separate threads because otherwise slow responses from the server could inhibit the SDK's event loop causing weird behavior like delayed logs in the user's terminal. Once all threads are running, the event_loop thread will periodically check the I/O buffers to see if any new logs have appeared, and if so, it will send them to the server manager's outgoing buffer. The network_loop thread will periodically check its outgoing buffer, and if it finds any messages in there, it will send them all to the server. Cleanup is the responsibility of the event_loop. With every tick of the event_loop, we check to see if the user's code has completed running. If it has, the event_loop will capture any remaining I/O and store that in the ServerManager's outgoing buf, as well as store a message indicating that the run is complete and its final exit status. Finally, the event_loop thread will push a message into the shutdown_network_channel which will indicate to the network_loop that it should finish sending any pending messages and then exit. The event_loop thread will then block until it receives a message on the shutdown_main_channel. At the next tick of the network_loop, the shutdown_network_channel will no longer be empty, and the network loop will try and fire off any remaining messages in the ServerManager's buffer to the server, and then put a message in the shutdown_main_channel. The main event_loop which has been blocked until now on the shutdown_main_channel will now return, and the program will exit cleanly. """ def network_loop(): while True: if self.shutdown_network_channel.qsize() != 0: self.server_manager.cleanup(self.current_sdk_run_uuid) self.shutdown_main_channel.put(True) return else: self.server_manager.tick(self.current_sdk_run_uuid) time.sleep(1) code_thread = Thread(target=self.code_runner.run) network_thread = Thread(target=network_loop) # Daemonize them so they don't impede shutdown if the user # keyboard interrupts code_thread.daemon = True network_thread.daemon = True network_thread.start() code_thread.start() # Event loop while True: try: self.capture_io() exited_cleanly, is_done = self.code_runner.is_done() if is_done: self.programmatic_exit = True if exited_cleanly: self.cleanup("success") # Block until network loop says its done self.shutdown_main_channel.get(block=True, timeout=None) return self.code_runner.get_return_val() else: self.cleanup("failure") # Block until network loop says its done self.shutdown_main_channel.get(block=True, timeout=None) raise self.code_runner.get_exception() time.sleep(1) # Handle Ctrl+C except (KeyboardInterrupt, SystemExit): self.sudden_cleanup() # code_thread and network_thread are daemons so they won't impede this sys.exit(130)
class TaskHandler(BufferingHandler): __flush_max_history_seconds = 30. __wait_for_flush_timeout = 10. __max_event_size = 1024 * 1024 __once = False __offline_filename = 'log.jsonl' @property def task_id(self): return self._task_id @task_id.setter def task_id(self, value): self._task_id = value def __init__(self, task, capacity=buffer_capacity, connect_logger=True): super(TaskHandler, self).__init__(capacity) self.task_id = task.id self.session = task.session self.last_timestamp = 0 self.counter = 1 self._last_event = None self._exit_event = None self._queue = None self._thread = None self._pending = 0 self._offline_log_filename = None self._connect_logger = connect_logger if task.is_offline(): offline_folder = Path(task.get_offline_mode_folder()) offline_folder.mkdir(parents=True, exist_ok=True) self._offline_log_filename = offline_folder / self.__offline_filename def shouldFlush(self, record): """ Should the handler flush its buffer Returns true if the buffer is up to capacity. This method can be overridden to implement custom flushing strategies. """ if self._task_id is None: return False # if we need to add handlers to the base_logger, # it will not automatically create stream one when first used, so we must manually configure it. if self._connect_logger and not TaskHandler.__once: base_logger = getLogger() if len(base_logger.handlers) == 1 and isinstance( base_logger.handlers[0], TaskHandler): if record.name != 'console' and not record.name.startswith( 'clearml.'): base_logger.removeHandler(self) basicConfig() base_logger.addHandler(self) TaskHandler.__once = True else: TaskHandler.__once = True # if we passed the max buffer if len(self.buffer) >= self.capacity: return True # if the first entry in the log was too long ago. # noinspection PyBroadException try: if len(self.buffer) and (time.time() - self.buffer[0].created ) > self.__flush_max_history_seconds: return True except Exception: pass return False def _record_to_event(self, record): # type: (LogRecord) -> events.TaskLogEvent if self._task_id is None: return None timestamp = int(record.created * 1000) if timestamp == self.last_timestamp: timestamp += self.counter self.counter += 1 else: self.last_timestamp = timestamp self.counter = 1 # ignore backspaces (they are often used) full_msg = record.getMessage().replace('\x08', '') return_events = [] while full_msg: msg = full_msg[:self.__max_event_size] full_msg = full_msg[self.__max_event_size:] # unite all records in a single second if self._last_event and timestamp - self._last_event.timestamp < 1000 and \ len(self._last_event.msg) + len(msg) < self.__max_event_size and \ record.levelname.lower() == str(self._last_event.level): # ignore backspaces (they are often used) self._last_event.msg += '\n' + msg continue # if we have a previous event and it timed out, return it. new_event = events.TaskLogEvent(task=self.task_id, timestamp=timestamp, level=record.levelname.lower(), worker=self.session.worker, msg=msg) if self._last_event: return_events.append(self._last_event) self._last_event = new_event return return_events def flush(self): if self._task_id is None: return if not self.buffer: return buffer = None self.acquire() if self.buffer: buffer = self.buffer self.buffer = [] self.release() if not buffer: return # noinspection PyBroadException try: record_events = [ r for record in buffer for r in self._record_to_event(record) ] + [self._last_event] self._last_event = None batch_requests = events.AddBatchRequest( requests=[events.AddRequest(e) for e in record_events if e]) except Exception: self.__log_stderr( "WARNING: clearml.log - Failed logging task to backend ({:d} lines)" .format(len(buffer))) batch_requests = None if batch_requests and batch_requests.requests: self._pending += 1 self._add_to_queue(batch_requests) def _create_thread_queue(self): if self._queue: return self._queue = Queue() self._exit_event = Event() self._exit_event.clear() # multiple workers could be supported as well self._thread = Thread(target=self._daemon) self._thread.daemon = True self._thread.start() def _add_to_queue(self, request): self._create_thread_queue() self._queue.put(request) def close(self, wait=False): # self.__log_stderr('Closing {} wait={}'.format(os.getpid(), wait)) # flush pending logs if not self._task_id: return # avoid deadlocks just skip the lock, we are shutting down anyway self.lock = None self.flush() # shut down the TaskHandler, from this point onwards. No events will be logged _thread = self._thread self._thread = None if self._queue: self._exit_event.set() self._queue.put(None) self._task_id = None if wait and _thread: # noinspection PyBroadException try: timeout = 1. if self._queue.empty( ) else self.__wait_for_flush_timeout _thread.join(timeout=timeout) if not self._queue.empty(): self.__log_stderr( 'Flush timeout {}s exceeded, dropping last {} lines'. format(timeout, self._queue.qsize())) # self.__log_stderr('Closing {} wait done'.format(os.getpid())) except Exception: pass # call super and remove the handler super(TaskHandler, self).close() def _send_events(self, a_request): try: self._pending -= 1 if self._offline_log_filename: with open(self._offline_log_filename.as_posix(), 'at') as f: f.write( json.dumps([b.to_dict() for b in a_request.requests]) + '\n') return # if self._thread is None: # self.__log_stderr('Task.close() flushing remaining logs ({})'.format(self._pending)) res = self.session.send(a_request) if res and not res.ok(): self.__log_stderr( "failed logging task to backend ({:d} lines, {})".format( len(a_request.requests), str(res.meta)), level=WARNING) except MaxRequestSizeError: self.__log_stderr( "failed logging task to backend ({:d} lines) log size exceeded limit" .format(len(a_request.requests)), level=WARNING) except Exception as ex: self.__log_stderr( "Retrying, failed logging task to backend ({:d} lines): {}". format(len(a_request.requests), ex)) # we should push ourselves back into the thread pool if self._queue: self._pending += 1 self._queue.put(a_request) def _daemon(self): # multiple daemons are supported leave = self._exit_event.wait(0) request = True while not leave or request: # pull from queue request = None if self._queue: # noinspection PyBroadException try: request = self._queue.get(block=not leave) except Exception: pass if request: self._send_events(request) leave = self._exit_event.wait(0) # self.__log_stderr('leaving {}'.format(os.getpid())) @staticmethod def __log_stderr(msg, level=INFO): # output directly to stderr, make sure we do not catch it. write = sys.stderr._original_write if hasattr( sys.stderr, '_original_write') else sys.stderr.write write('{asctime} - {name} - {levelname} - {message}\n'.format( asctime=Formatter().formatTime(makeLogRecord({})), name='clearml.log', levelname=getLevelName(level), message=msg)) @classmethod def report_offline_session(cls, task, folder): filename = Path(folder) / cls.__offline_filename if not filename.is_file(): return False with open(filename.as_posix(), 'rt') as f: i = 0 while True: try: line = f.readline() if not line: break list_requests = json.loads(line) for r in list_requests: r.pop('task', None) i += 1 except StopIteration: break except Exception as ex: warning('Failed reporting log, line {} [{}]'.format(i, ex)) batch_requests = events.AddBatchRequest(requests=[ events.TaskLogEvent(task=task.id, **r) for r in list_requests ]) if batch_requests.requests: res = task.session.send(batch_requests) if res and not res.ok(): warning( "failed logging task to backend ({:d} lines, {})". format(len(batch_requests.requests), str(res.meta))) return True
class ThreadedTaskDispatcher(object): """A Task Dispatcher that creates a thread for each task.""" stop_count = 0 # Number of threads that will stop soon. def __init__(self): self.threads = {} # { thread number -> 1 } self.queue = Queue() self.thread_mgmt_lock = threading.Lock() def handlerThread(self, thread_no): threads = self.threads try: while threads.get(thread_no): task = self.queue.get() if task is None: # Special value: kill this thread. break try: task.service() except: log.exception('Exception during task') except: log.exception('Exception in thread main loop') finally: mlock = self.thread_mgmt_lock with mlock: self.stop_count -= 1 try: del threads[thread_no] except KeyError: pass def setThreadCount(self, count): """See zope.server.interfaces.ITaskDispatcher""" mlock = self.thread_mgmt_lock with mlock: threads = self.threads thread_no = 0 running = len(threads) - self.stop_count while running < count: # Start threads. while thread_no in threads: thread_no = thread_no + 1 threads[thread_no] = 1 running += 1 t = threading.Thread(target=self.handlerThread, args=(thread_no,), name='zope.server-%d' % thread_no) t.setDaemon(True) t.start() thread_no = thread_no + 1 if running > count: # Stop threads. to_stop = running - count self.stop_count += to_stop for _n in range(to_stop): self.queue.put(None) running -= 1 def addTask(self, task): """See zope.server.interfaces.ITaskDispatcher""" if task is None: raise ValueError("No task passed to addTask().") # assert ITask.providedBy(task) try: task.defer() self.queue.put(task) except: task.cancel() raise def shutdown(self, cancel_pending=True, timeout=5): """See zope.server.interfaces.ITaskDispatcher""" self.setThreadCount(0) # Ensure the threads shut down. threads = self.threads expiration = time() + timeout while threads: if time() >= expiration: log.error("%d thread(s) still running", len(threads)) break sleep(0.1) if cancel_pending: # Cancel remaining tasks. try: queue = self.queue while not queue.empty(): task = queue.get() if task is not None: task.cancel() except Empty: pass def getPendingTasksEstimate(self): """See zope.server.interfaces.ITaskDispatcher""" return self.queue.qsize()