Example #1
    def test_monitoring(self):
        class TestedClass(Monitor):
            def __init__(self, cqueue):
                self.cqueue = cqueue
                Monitor.__init__(self)

            @Monitor.synchronized
            def execute(self):
                self.cqueue.put(1)
                sleep(1)
                self.cqueue.get()

        class TesterThread(Thread):
            def __init__(self, tc):
                self.tc = tc
                Thread.__init__(self)

            def run(self):
                self.tc.execute()

        q = Queue()
        tc = TestedClass(q)
        a, b = TesterThread(tc), TesterThread(tc)
        a.start()
        b.start()

        while a.is_alive() or b.is_alive():
            sleep(0.1)
            self.assertNotEqual(q.qsize(), 2)
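The `Monitor` base class in these tests comes from an external helper library; the key idea is one re-entrant lock per instance that every `@Monitor.synchronized` method takes. A minimal sketch of that idea (`MiniMonitor` and `_monitor_lock` are hypothetical names, not the library's API):

import threading

class MiniMonitor:
    """Hypothetical stand-in for Monitor: one re-entrant lock per instance."""
    def __init__(self):
        self._monitor_lock = threading.RLock()

    @staticmethod
    def synchronized(fn):
        # Serialize all @synchronized methods of a given instance.
        def wrapper(self, *args, **kwargs):
            with self._monitor_lock:
                return fn(self, *args, **kwargs)
        return wrapper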
Example #2
    def test_acquire_contextmanager(self):
        class TestedClass(Monitor):
            def __init__(self, cqueue):
                self.cqueue = cqueue
                Monitor.__init__(self)

            @Monitor.synchronized
            def execute(self):
                self.cqueue.put(1)
                sleep(1)
                self.cqueue.get()

        class TesterThread(Thread):
            def __init__(self, tc):
                self.tc = tc
                Thread.__init__(self)

            def run(self):
                self.tc.execute()

        cq = Queue()
        cq.put(1)
        tc = TestedClass(cq)
        tt = TesterThread(tc)

        with Monitor.acquire(tc):
            tt.start()
            sleep(0.4)
            self.assertEqual(cq.qsize(), 1)
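Here `Monitor.acquire(tc)` holds tc's monitor lock from outside the class, so the synchronized `execute` in the worker thread cannot run until the block exits. A hedged sketch of that shape, reusing the hypothetical `_monitor_lock` attribute from the sketch above:

from contextlib import contextmanager

@contextmanager
def acquire(monitored):
    # `_monitor_lock` is an assumed attribute name, not the library's API.
    with monitored._monitor_lock:
        yield monitored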
Example #3
class CallbackHandler(object):
    """ handles callback for event for single url with pause support """
    def __init__(self, url, subscription_id, callback, paused):
        self.url = url
        self.subscription_id = subscription_id
        self.callback = callback
        self.paused = paused
        self.event_q = Queue()

    def flush(self):
        # flush any pending events (no callback triggered on flush)
        while not self.event_q.empty():
            self.event_q.get()

    def pause(self):
        # pause callbacks
        logger.debug("pausing url: %s", self.url)
        self.paused = True

    def resume(self):
        # trigger all callbacks before setting pause to false
        logger.debug("resume (queue-size %s) url %s", self.event_q.qsize(),
                     self.url)
        while not self.event_q.empty():
            event = self.event_q.get()
            try:
                self.callback(event)
            except Exception as e:
                logger.debug("Traceback:\n%s", traceback.format_exc())
                logger.warning("failed to execute event callback: %s", e)
        self.paused = False

    def execute_callback(self, event):
        # execute callback or queue event if currently paused
        if self.paused:
            self.event_q.put(event)
        else:
            try:
                self.callback(event)
            except Exception as e:
                logger.debug("Traceback:\n%s", traceback.format_exc())
                logger.warning("failed to execute event callback: %s", e)
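A short usage sketch of the pause/queue/resume flow, assuming the example's module-level `logger`, `traceback`, and `Queue` imports are in scope:

received = []
handler = CallbackHandler("http://example/events", "sub-1",
                          callback=received.append, paused=False)
handler.pause()
handler.execute_callback({"id": 1})   # queued while paused
handler.execute_callback({"id": 2})   # queued while paused
assert handler.event_q.qsize() == 2
handler.resume()                      # drains the queue through the callback
assert received == [{"id": 1}, {"id": 2}] and not handler.paused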
Example #4
class SelectiveRepeater(object):
    """
    Selective repeater for SBP file I/O requests

    Fields
    ----------
    _pending_map : list(PendingRequest)
      List (used as a map) of PendingRequest objects, used to track
      outstanding requests.
    _request_pool : Queue
      Queue of available requests (recorded by index number)
    _seqmap : dict(int,int)
      Dictionary mapping SBP request sequence IDs to their corresponding
      request index.
    _batch_msgs : list
      Collector for a batch of messages to be sent in one
      buffer via the link
    _last_check_time : Time
      The last time we checked if any packets had expired
    _expire_map : dict(Time, dict(PendingRequest, PendingRequest))
      Dictionary which records the future time at which a request
      will expire.

    _msg_type : int
      The message type we're currently sending
    _link : Handler
      The link over which we're sending data

    _callback_thread : int
      ID of the thread that we expect callbacks from
    _link_thread : int
      ID of the thread that handles link writes
    """
    def __init__(self, link, msg_type, cb=None):
        """
        Args
        ---
        link : Handler
          Link over which messages will be sent.
        msg_type :
          The type of message being sent
        cb :
          Invoked when SBP message with type `msg_type` is received
        """

        self._link = link
        self._msg_type = msg_type
        self._callback = cb

        self._seqmap = {}
        self._batch_msgs = []
        self._last_check_time = Time.now()
        self._expire_map = defaultdict(dict)

        self._init_fileio_config(SBP_FILEIO_WINDOW_SIZE, SBP_FILEIO_BATCH_SIZE,
                                 PROGRESS_CB_REDUCTION_FACTOR)

        self._callback_thread = None
        self._link_thread = None

        self._total_sends = 1.0
        self._total_retries = 0

        self._config_retry_time = None

    def _init_fileio_config(self, window_size, batch_size,
                            progress_cb_reduction_factor):
        self._pending_map = [PendingRequest(X) for X in range(window_size)]
        self._request_pool = Queue(window_size)
        for pending_req in self._pending_map:
            self._request_pool.put(pending_req)
        self._batch_size = batch_size
        self._progress_cb_reduction_factor = progress_cb_reduction_factor

    def __enter__(self):
        self._link.add_callback(self._request_cb, self._msg_type)
        self._link.add_callback(self._config_cb, SBP_MSG_FILEIO_CONFIG_RESP)
        return self

    def __exit__(self, type, value, traceback):
        self._link.remove_callback(self._request_cb, self._msg_type)
        self._link.remove_callback(self._config_cb, SBP_MSG_FILEIO_CONFIG_RESP)

    def _verify_cb_thread(self):
        """
        Verify that only one thread is consuming requests.
        """
        if self._callback_thread is None:
            self._callback_thread = threading.current_thread().ident
        assert self._callback_thread == threading.current_thread().ident

    def _verify_link_thread(self):
        """
        Verify that only one thread is producing requests.
        """
        if self._link_thread is None:
            self._link_thread = threading.current_thread().ident
        assert self._link_thread == threading.current_thread().ident

    def _return_pending_req(self, pending_req):
        """
        Return a pending request to the write pool and clean any
        entries in the expiration map.
        """
        self._verify_cb_thread()
        pending_req.completed = True
        try:
            msg = pending_req.message
        except AttributeError:
            # Got a completion for something that was never requested
            return
        self._try_remove_keys(self._seqmap, msg.sequence)
        if self._try_remove_keys(self._expire_map[pending_req.time_expire],
                                 pending_req):
            # Only put the request back if it was successfully removed
            self._request_pool.put(pending_req)

    def _record_pending_req(self, msg, time_now, expiration_time):
        """
        Acquire a pending request object and record its future
        expiration time in a map.
        """
        self._verify_link_thread()
        # Queue.get will block if no requests are available
        pending_req = self._request_pool.get(True)
        assert self._pending_map[pending_req.index].index == pending_req.index
        self._seqmap[msg.sequence] = pending_req.index
        self._pending_map[pending_req.index].track(msg, time_now,
                                                   expiration_time)
        self._expire_map[expiration_time][pending_req] = pending_req

    def _config_cb(self, msg, **metadata):
        self._config_msg = msg
        self._init_fileio_config(msg.window_size, msg.batch_size,
                                 PROGRESS_CB_REDUCTION_FACTOR * 2)

    def _request_cb(self, msg, **metadata):
        """
        Process request completions.
        """
        index = self._seqmap.get(msg.sequence)
        if index is None:
            return
        pending_req = self._pending_map[index]
        if self._callback:
            self._callback(pending_req.message, msg)
        self._return_pending_req(pending_req)

    def _has_pending(self):
        return self._request_pool.qsize() != len(self._pending_map)

    def _retry_send(self, check_time, pending_req, delete_keys):
        """
        Retry a request by updating its expiration time on the object
        itself and in the expiration map.
        """
        self._total_retries += 1
        self._total_sends += 1
        timeout_delta = Time(SBP_FILEIO_TIMEOUT)
        send_time = Time.now()
        new_expire = send_time + timeout_delta
        pending_req.record_retry(send_time, new_expire)
        self._expire_map[new_expire][pending_req] = pending_req
        self._link(pending_req.message)
        delete_keys.append(pending_req)

    def _try_remove_keys(self, d, *keys):
        success = True
        for key in keys:
            try:
                del d[key]
            except KeyError:
                success = False
        return success

    def _check_pending(self):
        """
        Scans from the last check time to the current time looking
        for requests that are due to expire and retries them if
        necessary.
        """
        time_now = Time.now()
        timeout_delta = Time(SBP_FILEIO_TIMEOUT)
        for check_time in Time.iter_since(self._last_check_time, time_now):
            pending_reqs = self._expire_map[check_time]
            retried_writes = []
            for pending_req in list(pending_reqs):
                time_expire = pending_req.time + timeout_delta
                if time_now >= time_expire:
                    if pending_req.tries >= MAXIMUM_RETRIES:
                        raise Exception('Timed out')
                    # If the completion map becomes inconsistent (because
                    #   things are completing at the same time they're
                    #   being retried), the `completed` field prevents us
                    #   from re-sending a write.
                    if not pending_req.completed:
                        self._retry_send(check_time, pending_req,
                                         retried_writes)
            # Pending writes can be marked completed while this function
            #   is running, so a KeyError means the write was marked
            #   completed after we sent a retry (which is why
            #   _try_remove_keys ignores KeyErrors).
            self._try_remove_keys(self._expire_map[check_time],
                                  *retried_writes)
        self._last_check_time = time_now

    def _window_available(self, batch_size):
        return self._request_pool.qsize() >= batch_size

    def _ensure_config_req_sent(self):
        if self._config_retry_time is not None:
            return
        now = Time.now()
        self._config_retry_time = now + Time(0, CONFIG_REQ_RETRY_MS)
        self._config_timeout = now + Time(0, CONFIG_REQ_TIMEOUT_MS)
        self._config_seq = random.randint(0, 0xffffffff)
        self._config_msg = None
        self._link(MsgFileioConfigReq(sequence=self._config_seq))

    def _config_received(self):
        self._ensure_config_req_sent()
        if self._config_msg is not None:
            return True
        now = Time.now()
        if now >= self._config_retry_time:
            self._link(MsgFileioConfigReq(sequence=self._config_seq))
            self._config_retry_time = now + Time(0, CONFIG_REQ_RETRY_MS)
        if now >= self._config_timeout:
            self._config_msg = MsgFileioConfigResp(sequence=0,
                                                   window_size=100,
                                                   batch_size=1,
                                                   fileio_version=0)
        return self._config_msg is not None

    def _wait_config_received(self):
        while not self._config_received():
            time.sleep(WAIT_SLEEP_S)

    def _wait_window_available(self, batch_size):
        self._wait_config_received()
        while not self._window_available(batch_size):
            self._check_pending()
            if not self._window_available(batch_size):
                time.sleep(WAIT_SLEEP_S)

    @property
    def total_retries(self):
        return self._total_retries

    @property
    def total_sends(self):
        return self._total_sends

    @property
    def progress_cb_reduction_factor(self):
        return self._progress_cb_reduction_factor

    def send(self, msg, batch_size=None):
        if batch_size is not None:
            self._send(msg, batch_size)
        else:
            self._send(msg, self._batch_size)

    def _send(self, msg, batch_size):
        """
        Sends data via the current link, potentially batching it together.

        Parameters
        ----------
        msg : MsgFileioReadReq, MsgFileioReadDirReq, MsgFileioWriteReq, MsgFileioRemove
          The message to be sent via the current link
        batch_size : int
          The number of messages to batch together before actually sending
        """
        if msg is not None:
            self._batch_msgs.append(msg)
        if len(self._batch_msgs) >= batch_size:
            self._wait_window_available(batch_size)
            time_now = Time.now()
            expiration_time = time_now + Time(SBP_FILEIO_TIMEOUT)
            for msg in self._batch_msgs:
                self._record_pending_req(msg, time_now, expiration_time)
            self._link(*self._batch_msgs)
            self._total_sends += len(self._batch_msgs)
            del self._batch_msgs[:]

    def flush(self):
        """
        Flush any pending requests (batched or otherwise) and wait for all
        pending requests to complete.
        """
        self.send(None, batch_size=0)
        while self._has_pending():
            self._check_pending()
            time.sleep(WAIT_SLEEP_S)
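The repeater only assumes that `link` is callable to send messages and exposes add_callback/remove_callback for receive handlers, which is what the made-up FakeLink below provides; the commented usage mirrors the __enter__/__exit__ registration (a sketch, not the library's documented API):

class FakeLink:
    """Made-up stand-in for the Handler this class expects: callable to
    send messages, with add_callback/remove_callback for receive handlers."""
    def __init__(self):
        self.sent = []
        self.callbacks = {}

    def __call__(self, *msgs):
        self.sent.extend(msgs)

    def add_callback(self, cb, msg_type):
        self.callbacks.setdefault(msg_type, []).append(cb)

    def remove_callback(self, cb, msg_type):
        self.callbacks[msg_type].remove(cb)

# with SelectiveRepeater(FakeLink(), msg_type=0xA3) as repeater:
#     repeater.send(write_req)   # batched; blocks when the send window is full
#     repeater.flush()           # wait for every pending request to complete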
Example #5
File: util.py Project: e42s/scales
class GraphiteReporter(threading.Thread):
    """A graphite reporter thread."""
    def __init__(self, host, port, maxQueueSize=10000):
        """Connect to a Graphite server on host:port."""
        threading.Thread.__init__(self)

        self.host, self.port = host, port
        self.sock = None
        self.queue = Queue()
        self.maxQueueSize = maxQueueSize
        self.daemon = True

    def run(self):
        """Run the thread."""
        while True:
            try:
                try:
                    name, value, valueType, stamp = self.queue.get()
                except TypeError:
                    break
                self.log(name, value, valueType, stamp)
            finally:
                self.queue.task_done()

    def connect(self):
        """Connects to the Graphite server if not already connected."""
        if self.sock is not None:
            return
        backoff = 0.01
        while True:
            try:
                sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                sock.settimeout(5)
                sock.connect((self.host, self.port))
                self.sock = sock
                return
            except socket.error:
                time.sleep(random.uniform(0, 2.0 * backoff))
                backoff = min(backoff * 2.0, 5.0)

    def disconnect(self):
        """Disconnect from the Graphite server if connected."""
        if self.sock is not None:
            try:
                self.sock.close()
            except socket.error:
                pass
            finally:
                self.sock = None

    def _sendMsg(self, msg):
        """Send a line to graphite. Retry with exponential backoff."""
        if not self.sock:
            self.connect()
        backoff = 0.001
        while True:
            try:
                self.sock.sendall(msg)
                break
            except socket.error:
                logging.warning('Graphite connection error', exc_info=True)
                self.disconnect()
                time.sleep(random.uniform(0, 2.0 * backoff))
                backoff = min(backoff * 2.0, 5.0)
                self.connect()

    def _sanitizeName(self, name):
        """Sanitize a metric name."""
        return name.replace(' ', '-')

    def log(self, name, value, valueType=None, stamp=None):
        """Log a named numeric value. The value type may be 'value',
    'count', or None."""
        if type(value) == float:
            form = "%s%s %2.2f %d\n"
        else:
            form = "%s%s %s %d\n"

        if valueType and valueType[0] != '.':
            valueType = '.' + valueType

        if not stamp:
            stamp = time.time()

        self._sendMsg(
            form % (self._sanitizeName(name), valueType or '', value, stamp))

    def enqueue(self, name, value, valueType=None, stamp=None):
        """Enqueue a call to log."""
        # If queue is too large, refuse to log.
        if self.maxQueueSize and self.queue.qsize() > self.maxQueueSize:
            return
        # Stick arguments into the queue
        self.queue.put((name, value, valueType, stamp))

    def flush(self):
        """Block until all stats have been sent to Graphite."""
        self.queue.join()

    def shutdown(self):
        """Shut down the background thread."""
        self.queue.put(None)
        self.flush()
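Each metric is written as a Graphite plaintext line, "name[.valueType] value timestamp\n"; note that the code passes str to sendall, so it targets Python 2 (Python 3 would need bytes). A hedged usage sketch against an assumed local line receiver:

reporter = GraphiteReporter('127.0.0.1', 2003)   # host/port are assumptions
reporter.start()                                 # worker thread drains the queue
reporter.enqueue('web.requests', 42, valueType='count')
reporter.enqueue('web.latency', 12.5)            # floats use the %2.2f form
reporter.flush()                                 # block until queue.join()
reporter.shutdown()                              # None sentinel stops run()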
Example #6
class CachePipeline(object):
    def __init__(self, spider, cache):
        self.spider = spider
        self.cache = cache
        self.idle_event = Event()
        self.queue_size = 100
        self.input_queue = Queue()
        self.result_queue = Queue()

        self.thread = Thread(target=self.thread_worker)
        self.thread.daemon = True
        self.thread.start()

    def has_free_resources(self):
        return (self.input_queue.qsize() < self.queue_size
                and self.result_queue.qsize() < self.queue_size)

    def is_idle(self):
        return self.idle_event.is_set()

    def thread_worker(self):
        self.idle_event.clear()
        while True:
            try:
                action, data = self.input_queue.get(block=False)
            except Empty:
                self.idle_event.set()
                time.sleep(0.1)
                self.idle_event.clear()
                if self.spider.shutdown_event.is_set():
                    #print('EXITING CACHE PIPELINE')
                    return self.shutdown()
                #else:
                #    print('no shutdown event')
            else:
                assert action in ('load', 'save')
                if action == 'load':
                    task, grab = data
                    result = None
                    if self.is_cache_loading_allowed(task, grab):
                        result = self.load_from_cache(task, grab)
                    if result:
                        #print('!! PUT RESULT INTO CACHE PIPE RESULT QUEUE (cache)')
                        self.result_queue.put(('network_result', result))
                    else:
                        self.result_queue.put(('task', task))
                elif action == 'save':
                    task, grab = data
                    if self.is_cache_saving_allowed(task, grab):
                        with self.spider.timer.log_time('cache'):
                            with self.spider.timer.log_time('cache.write'):
                                self.cache.save_response(task.url, grab)

    def is_cache_loading_allowed(self, task, grab):
        # Loading from the cache is allowed only when none of the following
        # hold:
        # 1) cache data should be refreshed
        # 2) cache is disabled for that task
        # 3) request type is not cacheable (only GET is)
        return (not task.get('refresh_cache', False)
                and not task.get('disable_cache', False)
                and grab.detect_request_method() == 'GET')

    def is_cache_saving_allowed(self, task, grab):
        """
        Check if network transport result could
        be saved to cache layer.

        res: {ok, grab, grab_config_backup, task, emsg}
        """

        if grab.request_method == 'GET':
            if not task.get('disable_cache'):
                if self.spider.is_valid_network_response_code(
                        grab.response.code, task):
                    return True
        return False

    def load_from_cache(self, task, grab):
        with self.spider.timer.log_time('cache'):
            with self.spider.timer.log_time('cache.read'):
                cache_item = self.cache.get_item(grab.config['url'],
                                                 timeout=task.cache_timeout)
                if cache_item is None:
                    return None
                else:
                    with self.spider.timer.log_time(
                            'cache.read.prepare_request'):
                        grab.prepare_request()
                    with self.spider.timer.log_time(
                            'cache.read.load_response'):
                        self.cache.load_response(grab, cache_item)

                    grab.log_request('CACHED')
                    self.spider.stat.inc('spider:request-cache')

                    return {
                        'ok': True,
                        'task': task,
                        'grab': grab,
                        'grab_config_backup': grab.dump_config(),
                        'emsg': None
                    }

    def shutdown(self):
        try:
            self.cache.close()
        except AttributeError:
            print('Cache %s does not support close method' % self.cache)
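The worker communicates through two queues: ('load'|'save', (task, grab)) tuples go in, and ('network_result', result) or ('task', task) tuples come out. A self-contained mirror of that protocol ('task-1' and 'grab-1' are plain placeholders, not real spider objects):

from queue import Queue

input_queue, result_queue = Queue(), Queue()
input_queue.put(('load', ('task-1', 'grab-1')))

action, data = input_queue.get(block=False)
assert action in ('load', 'save')
task, grab = data
result_queue.put(('task', task))   # cache miss: hand the task back for a network fetch
print(result_queue.get())          # -> ('task', 'task-1')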
Example #7
class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem,
                                    AbstractProvisioner):
    """
    Mimics a job batcher, provisioner and scalable batch system
    """
    def __init__(self, config, secondsPerJob):
        super(MockBatchSystemAndProvisioner, self).__init__('clusterName')
        # To mimic parallel preemptable and non-preemptable queues
        # for jobs we create two parallel instances of the following class
        self.config = config
        self.secondsPerJob = secondsPerJob
        self.provisioner = self
        self.batchSystem = self
        self.nodeTypes = config.nodeTypes
        self.nodeShapes = self.nodeTypes
        self.nodeShapes.sort()
        self.jobQueue = Queue()
        self.updatedJobsQueue = Queue()
        self.jobBatchSystemIDToIssuedJob = {}
        self.totalJobs = 0  # Count of total jobs processed
        self.totalWorkerTime = 0.0  # Total time spent in worker threads
        self.toilMetrics = None
        self.nodesToWorker = {
        }  # Map from Node to instances of the Worker class
        self.workers = {nodeShape: []
                        for nodeShape in self.nodeShapes
                        }  # Instances of the Worker class
        self.maxWorkers = {nodeShape: 0
                           for nodeShape in self.nodeShapes
                           }  # Maximum number of workers
        self.running = False
        self.leaderThread = Thread(target=self._leaderFn)

    def start(self):
        self.running = True
        self.leaderThread.start()

    def shutDown(self):
        self.running = False
        self.leaderThread.join()

    # Stub out all AbstractBatchSystem methods since they are never called
    for name, value in iteritems(AbstractBatchSystem.__dict__):
        if getattr(value, '__isabstractmethod__', False):
            exec('def %s(): pass' % name)
        # Without this, the class would end up with .name and .value attributes
        del name, value

    # AbstractScalableBatchSystem methods
    def nodeInUse(self, nodeIP):
        return False

    def ignoreNode(self, nodeAddress):
        pass

    def unignoreNode(self, nodeAddress):
        pass

    @contextmanager
    def nodeFiltering(self, filter):
        nodes = self.getProvisionedWorkers(
            preemptable=True, nodeType=None) + self.getProvisionedWorkers(
                preemptable=False, nodeType=None)
        yield nodes

    # AbstractProvisioner methods
    def getProvisionedWorkers(self, nodeType=None, preemptable=None):
        """
        Returns a list of Node objects, each representing a worker node in the cluster

        :param preemptable: If True only return preemptable nodes else return non-preemptable nodes
        :return: list of Node
        """
        nodesToWorker = self.nodesToWorker
        if nodeType:
            return [
                node for node in nodesToWorker if node.nodeType == nodeType
            ]
        else:
            return list(nodesToWorker.keys())

    def terminateNodes(self, nodes):
        self._removeNodes(nodes)

    def remainingBillingInterval(self, node):
        pass

    def addJob(self, jobShape, preemptable=False):
        """
        Add a job to the job queue
        """
        self.totalJobs += 1
        jobID = uuid.uuid4()
        self.jobBatchSystemIDToIssuedJob[jobID] = Job(memory=jobShape.memory,
                                                      cores=jobShape.cores,
                                                      disk=jobShape.disk,
                                                      preemptable=preemptable)
        self.jobQueue.put(jobID)

    # JobBatcher functionality
    def getNumberOfJobsIssued(self, preemptable=None):
        if preemptable is not None:
            jobList = [
                job for job in list(self.jobQueue.queue)
                if self.jobBatchSystemIDToIssuedJob[job].preemptable ==
                preemptable
            ]
            return len(jobList)
        else:
            return self.jobQueue.qsize()

    def getJobs(self):
        return self.jobBatchSystemIDToIssuedJob.values()

    # AbstractScalableBatchSystem functionality
    def getNodes(self, preemptable=False, timeout=None):
        nodes = dict()
        for node in self.nodesToWorker:
            if node.preemptable == preemptable:
                worker = self.nodesToWorker[node]
                nodes[node.privateIP] = NodeInfo(
                    coresTotal=0,
                    coresUsed=0,
                    requestedCores=1,
                    memoryTotal=0,
                    memoryUsed=0,
                    requestedMemory=1,
                    workers=1 if worker.busyEvent.is_set() else 0)
        return nodes

    # AbstractProvisioner functionality
    def addNodes(self, nodeType, numNodes, preemptable):
        self._addNodes(numNodes=numNodes,
                       nodeType=nodeType,
                       preemptable=preemptable)
        return self.getNumberOfNodes(nodeType=nodeType,
                                     preemptable=preemptable)

    def getNodeShape(self, nodeType, preemptable=False):
        # Assume node shapes and node types are the same thing for testing
        return nodeType

    def getWorkersInCluster(self, nodeShape):
        return self.workers[nodeShape]

    def launchCluster(self,
                      leaderNodeType,
                      keyName,
                      userTags=None,
                      vpcSubnet=None,
                      leaderStorage=50,
                      nodeStorage=50,
                      botoPath=None,
                      **kwargs):
        pass

    def destroyCluster(self):
        pass

    def getLeader(self):
        pass

    def _leaderFn(self):
        while self.running:
            updatedJobID = None
            try:
                updatedJobID = self.updatedJobsQueue.get(timeout=1.0)
            except Empty:
                continue
            if updatedJobID:
                del self.jobBatchSystemIDToIssuedJob[updatedJobID]
            time.sleep(0.1)

    def _addNodes(self, numNodes, nodeType, preemptable=False):
        nodeShape = self.getNodeShape(nodeType=nodeType,
                                      preemptable=preemptable)

        class Worker(object):
            def __init__(self, jobQueue, updatedJobsQueue, secondsPerJob):
                self.busyEvent = Event()
                self.stopEvent = Event()

                def workerFn():
                    while True:
                        if self.stopEvent.is_set():
                            return
                        try:
                            jobID = jobQueue.get(timeout=1.0)
                        except Empty:
                            continue
                        updatedJobsQueue.put(jobID)
                        self.busyEvent.set()
                        time.sleep(secondsPerJob)
                        self.busyEvent.clear()

                self.startTime = time.time()
                self.worker = Thread(target=workerFn)
                self.worker.start()

            def stop(self):
                self.stopEvent.set()
                self.worker.join()
                return time.time() - self.startTime

        for _ in range(numNodes):
            node = Node('127.0.0.1',
                        uuid.uuid4(),
                        'testNode',
                        datetime.datetime.now().isoformat() + 'Z',
                        nodeType=nodeType,
                        preemptable=preemptable)
            self.nodesToWorker[node] = Worker(self.jobQueue,
                                              self.updatedJobsQueue,
                                              self.secondsPerJob)
            self.workers[nodeShape].append(self.nodesToWorker[node])
        self.maxWorkers[nodeShape] = max(self.maxWorkers[nodeShape],
                                         len(self.workers[nodeShape]))

    def _removeNodes(self, nodes):
        logger.debug("Removing nodes. %s workers and %s to terminate.",
                     len(self.nodesToWorker), len(nodes))
        for node in nodes:
            logger.debug("removed node")
            try:
                nodeShape = self.getNodeShape(node.nodeType, node.preemptable)
                worker = self.nodesToWorker.pop(node)
                self.workers[nodeShape].pop()
                self.totalWorkerTime += worker.stop()
            except KeyError:
                # Node isn't our responsibility
                pass

    def getNumberOfNodes(self, nodeType=None, preemptable=None):
        if nodeType:
            nodeShape = self.getNodeShape(nodeType=nodeType,
                                          preemptable=preemptable)
            return len(self.workers[nodeShape])
        else:
            return len(self.nodesToWorker)
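Note how getNumberOfJobsIssued peeks at the Queue's internal deque via list(self.jobQueue.queue) instead of consuming items. A small illustration of that trick; q.queue is an undocumented implementation detail and not atomic with concurrent producers, so counts derived from it are estimates:

from queue import Queue

q = Queue()
for job_id in range(3):
    q.put(job_id)
print(list(q.queue), q.qsize())    # [0, 1, 2] 3 -- nothing was consumed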
Example #8
class ThreadedTaskDispatcher(object):
    """A Task Dispatcher that creates a thread for each task."""

    stop_count = 0  # Number of threads that will stop soon.

    def __init__(self):
        self.threads = {}  # { thread number -> 1 }
        self.queue = Queue()
        self.thread_mgmt_lock = threading.Lock()

    def handlerThread(self, thread_no):
        threads = self.threads
        try:
            while threads.get(thread_no):
                task = self.queue.get()
                if task is None:
                    # Special value: kill this thread.
                    break
                try:
                    task.service()
                except:
                    log.exception('Exception during task')
        except:
            log.exception('Exception in thread main loop')
        finally:
            mlock = self.thread_mgmt_lock
            with mlock:
                self.stop_count -= 1
                try:
                    del threads[thread_no]
                except KeyError:
                    pass

    def setThreadCount(self, count):
        """See zope.server.interfaces.ITaskDispatcher"""
        mlock = self.thread_mgmt_lock
        with mlock:
            threads = self.threads
            thread_no = 0
            running = len(threads) - self.stop_count
            while running < count:
                # Start threads.
                while thread_no in threads:
                    thread_no = thread_no + 1
                threads[thread_no] = 1
                running += 1
                t = threading.Thread(target=self.handlerThread,
                                     args=(thread_no, ),
                                     name='zope.server-%d' % thread_no)
                t.daemon = True
                t.start()
                thread_no = thread_no + 1
            if running > count:
                # Stop threads.
                to_stop = running - count
                self.stop_count += to_stop
                for _n in range(to_stop):
                    self.queue.put(None)
                    running -= 1

    def addTask(self, task):
        """See zope.server.interfaces.ITaskDispatcher"""
        if task is None:
            raise ValueError("No task passed to addTask().")
        # assert ITask.providedBy(task)
        try:
            task.defer()
            self.queue.put(task)
        except:
            task.cancel()
            raise

    def shutdown(self, cancel_pending=True, timeout=5):
        """See zope.server.interfaces.ITaskDispatcher"""
        self.setThreadCount(0)
        # Ensure the threads shut down.
        threads = self.threads
        expiration = time() + timeout
        while threads:
            if time() >= expiration:
                log.error("%d thread(s) still running", len(threads))
                break
            sleep(0.1)
        if cancel_pending:
            # Cancel remaining tasks.
            try:
                queue = self.queue
                while not queue.empty():
                    task = queue.get()
                    if task is not None:
                        task.cancel()
            except Empty:
                pass

    def getPendingTasksEstimate(self):
        """See zope.server.interfaces.ITaskDispatcher"""
        return self.queue.qsize()
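setThreadCount retires workers with the classic poison-pill pattern: one None per thread to stop, and each worker exits when it dequeues one. A minimal self-contained sketch of that pattern (task() stands in for task.service()):

import threading
from queue import Queue

q = Queue()

def worker():
    while True:
        task = q.get()
        if task is None:           # special value: kill this thread
            break
        task()                     # stand-in for task.service()

t = threading.Thread(target=worker)
t.start()
q.put(lambda: print("did work"))
q.put(None)                        # retire the worker
t.join()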
Example #9
class HyperDash:
    """HyperDash monitors a job and manages capturing IO / server comms.

    This class is designed to be run in its own thread and contains an instance
    of runner (which is running the job) and server_manager (for talking
    to the server.)
    """
    def __init__(
        self,
        job_name,
        current_sdk_run_uuid,
        server_manager,
        io_bufs,
        std_streams,
        parent_logger,
        runner,
    ):
        """Initialize the HyperDash class.

        args:
            1) job_name: Name of the current running job
            2) current_sdk_run_uuid: UUID of current run
            3) runner: Instance of CodeRunner or ExperimentRunner
            4) server_manager: ServerManager instance
            5) io_bufs: Tuple in the form of (StringIO(), StringIO(),)
            6) std_streams: Tuple in the form of (StdOut, StdErr)
        """
        self.job_name = job_name
        self.current_sdk_run_uuid = current_sdk_run_uuid
        self.runner = runner
        self.server_manager = server_manager
        self.out_buf, self.err_buf = io_bufs
        self.std_out, self.std_err = std_streams
        self.programmatic_exit = False
        self.shutdown_network_channel = Queue()
        self.shutdown_main_channel = Queue()

        # Used to keep track of the current position in the IO buffers for data
        # that has been sent to STDOUT/STDERR and the logfile
        self.out_buf_offset = 0
        self.err_buf_offset = 0

        # Used to keep track of the current position in the IO buffers for data
        # that has been sent to the ServerManager. We separate the local/server
        # offsets because in the case of the user's code frequently flushing, we
        # want terminal/logs to update extremely quickly, but a small delay in
        # sending data to the server is acceptable so that more data can be batched
        # together. I.E if the user's code flushes 1000 times per second, we want
        # to capture that in realtime locally, but only want to send one message to
        # the server with the cumulative output of those 1000 flushes for the one
        # second period.
        self.server_out_buf_offset = 0
        self.server_err_buf_offset = 0

        self.time_since_last_server_capture = time.time()

        # Create run_start message before doing any other setup work to make sure that the
        # run_started message always precedes any other messages
        self.server_manager.put_buf(
            create_run_started_message(self.current_sdk_run_uuid,
                                       self.job_name), )

        def on_stdout_flush():
            self.capture_io()
            self.std_out.flush()
            self.flush_log_file()

        def on_stderr_flush():
            self.capture_io()
            self.std_err.flush()
            self.flush_log_file()

        self.out_buf.set_on_flush(on_stdout_flush)
        self.err_buf.set_on_flush(on_stderr_flush)

        self.logger = parent_logger.getChild(__name__)
        self.log_file, self.log_file_path = self.open_log_file()
        if not self.log_file:
            self.logger.error(
                "Could not create logs file. Logs will not be stored locally.")

    def open_log_file(self):
        log_folder = get_hyperdash_logs_home_path()

        # Make sure logs directory exists (/logs)
        if not os.path.exists(log_folder):
            try:
                os.makedirs(log_folder)
            except OSError as exc:
                # A concurrent EEXIST is fine; give up on any other error
                if exc.errno != errno.EEXIST:
                    return None, None

        job_log_folder = get_hyperdash_logs_home_path_for_job(self.job_name)
        # Make sure directory for job exists in log directory (/logs/<JOB_NAME>)
        if not os.path.exists(job_log_folder):
            try:
                os.makedirs(job_log_folder)
            except OSError as exc:
                # A concurrent EEXIST is fine; give up on any other error
                if exc.errno != errno.EEXIST:
                    return None, None

        # Create new log file for run
        try:
            iso = slugify(datetime.datetime.now().isoformat())
            logfile_name = "{}_{}.log".format(slugify(self.job_name), iso)
            logfile_path = os.path.join(job_log_folder, logfile_name)
            return open(logfile_path, "a"), logfile_path
        except IOError:
            return None, None

    # Capture all IO for terminal/log file since we last checked
    def capture_io(self, force_server_capture=False):
        current_time = time.time()
        should_send_to_server_manager = (
            (current_time - self.time_since_last_server_capture) > 1
            or force_server_capture)
        if should_send_to_server_manager:
            self.time_since_last_server_capture = current_time

        self.out_buf.acquire()
        out = self.out_buf.getvalue()
        # Local
        len_out = len(out) - self.out_buf_offset
        if len_out != 0:
            self.print_out(out[self.out_buf_offset:])
        self.out_buf_offset += len_out
        # Server
        len_out_server = len(out) - self.server_out_buf_offset
        if len_out_server != 0 and should_send_to_server_manager:
            self.send_print_to_server_manager(out[self.server_out_buf_offset:],
                                              INFO_LEVEL)
            self.server_out_buf_offset += len_out_server
        self.out_buf.release()

        self.err_buf.acquire()
        err = self.err_buf.getvalue()
        # Local
        len_err = len(err) - self.err_buf_offset
        if len_err != 0:
            self.print_err(err[self.err_buf_offset:])
        self.err_buf_offset += len_err
        # Server
        len_err_server = len(err) - self.server_err_buf_offset
        if len_err_server != 0 and should_send_to_server_manager:
            self.send_print_to_server_manager(err[self.server_err_buf_offset:],
                                              ERROR_LEVEL)
            self.server_err_buf_offset += len_err_server
        self.err_buf.release()

    def print_out(self, s):
        self.std_out.write(s)
        self.write_to_log_file(s)

    def print_err(self, s):
        self.std_err.write(s)
        self.write_to_log_file(s)

    # In the case that the amount of data is large, we chunk it into
    # several smaller messages
    def send_print_to_server_manager(self, s, level):
        offset = 0
        while offset < len(s):
            chunk_size = min(len(s) - offset, MAX_LOG_SIZE_BYTES)
            message = create_log_message(self.current_sdk_run_uuid, level,
                                         s[offset:offset + chunk_size])
            self.server_manager.put_buf(message)
            offset += chunk_size

    def write_to_log_file(self, s):
        if self.log_file:
            if PY2:
                self.log_file.write(s.encode("utf-8"))
            else:
                self.log_file.write(s)

    def flush_log_file(self):
        if self.log_file:
            self.log_file.flush()

    def cleanup(self, exit_status):
        self.print_completion_message()
        self.capture_io(force_server_capture=True)
        self.server_manager.put_buf(
            create_run_ended_message(self.current_sdk_run_uuid, exit_status), )
        self.flush_log_file()
        self.shutdown_network_channel.put(True)

    def sudden_cleanup(self):
        self.print_completion_message()
        # Send what we can to local log
        self.capture_io()
        self.flush_log_file()

        # Make a best-effort attempt to notify server that the run was
        # canceled by the user, but don't wait for all messages to
        # be flushed to server so we don't hang the user's terminal.
        self.server_manager.send_message(
            create_run_ended_message(self.current_sdk_run_uuid,
                                     "user_canceled"),
            raise_exceptions=False,
            timeout_seconds=1,
        )
        # Prevent the network thread from continuing to run in the background
        # even if SystemExit is caught
        self.shutdown_network_channel.put(True)

    def print_completion_message(self):
        start_time, end_time = self.runner.get_start_and_end_time()
        # end_time is not set in the case that the user canceled the run
        if not end_time:
            end_time = datetime.datetime.now()

        if self.log_file_path:
            self.logger.info(
                "This run of {} ran for {} and logs are available locally at: {}"
                .format(
                    self.job_name,
                    human_readable_duration(start_time, end_time),
                    self.log_file_path,
                ))

    def run(self):
        """
        run_http works using three separate threads:
            1) runner thread which runs the user's code (if using CodeRunner)
            2) network_thread which does blocking I/O with the server
            3) event_loop thread which runs the SDK's main event loop (this is
               just the main thread)

        We require the event loop and network loop to be in separate threads
        because otherwise slow responses from the server could inhibit the
        SDK's event loop causing weird behavior like delayed logs in the user's
        terminal.

        Once all threads are running, the event_loop thread will periodically
        check the I/O buffers to see if any new logs have appeared, and if so,
        it will send them to the server manager's outgoing buffer.

        The network_loop thread will periodically check its outgoing buffer, and
        if it finds any messages in there, it will send them all to the server.

        Cleanup is the responsibility of the event_loop. With every tick of the
        event_loop, we check to see if the user's code has completed running. If
        it has, the event_loop will capture any remaining I/O and store that in
        the ServerManager's outgoing buf, as well as store a message indicating
        that the run is complete and its final exit status. Finally, the
        event_loop thread will push a message into the shutdown_network_channel which
        will indicate to the network_loop that it should finish sending any
        pending messages and then exit. The event_loop thread will then block
        until it receives a message on the shutdown_main_channel.

        At the next tick of the network_loop, the shutdown_network_channel will no longer
        be empty, and the network loop will try to fire off any remaining messages
        in the ServerManager's buffer to the server, and then put a message in the
        shutdown_main_channel.

        The main event_loop which has been blocked until now on the shutdown_main_channel
        will now return, and the program will exit cleanly.
        """
        def network_loop():
            while True:
                if self.shutdown_network_channel.qsize() != 0:
                    self.server_manager.cleanup(self.current_sdk_run_uuid)
                    self.shutdown_main_channel.put(True)
                    return
                else:
                    self.server_manager.tick(self.current_sdk_run_uuid)
                    time.sleep(1)

        network_thread = Thread(target=network_loop)
        # Daemonize so they don't impede shutdown if the user
        # keyboard interrupts
        network_thread.daemon = True
        network_thread.start()

        # Create thread for running code if using CLI or decorator
        if self.runner.should_run_as_thread():
            code_thread = Thread(target=self.runner.run)
            code_thread.daemon = True
            code_thread.start()

        # Event loop
        while True:
            try:
                self.capture_io()
                exited_cleanly, is_done = self.runner.is_done()
                if is_done:
                    self.programmatic_exit = True
                    if exited_cleanly:
                        self.cleanup("success")
                        # Block until network loop says it's done
                        self.shutdown_main_channel.get(block=True,
                                                       timeout=None)
                        return self.runner.get_return_val()
                    else:
                        self.cleanup("failure")
                        # Block until network loop says it's done
                        self.shutdown_main_channel.get(block=True,
                                                       timeout=None)
                        raise self.runner.get_exception()

                time.sleep(1)
            # Handle Ctrl+C
            except (KeyboardInterrupt, SystemExit):
                self.sudden_cleanup()
                # code_thread and network_thread are daemons so they won't impede this
                sys.exit(130)
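The shutdown described in run()'s docstring is a two-queue handshake: the event loop signals shutdown_network_channel, and the network loop drains its buffer and replies on shutdown_main_channel. A minimal self-contained sketch of that handshake (sleep stands in for server_manager.tick):

import threading
import time
from queue import Queue

shutdown_network, shutdown_main = Queue(), Queue()

def network_loop():
    while shutdown_network.qsize() == 0:
        time.sleep(0.01)           # stand-in for server_manager.tick()
    # ...flush any remaining outgoing messages here...
    shutdown_main.put(True)        # tell the event loop we are done

threading.Thread(target=network_loop, daemon=True).start()
shutdown_network.put(True)         # event loop initiates shutdown
shutdown_main.get(block=True)      # block until the network loop confirms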
Example #10
class TilestorWriter:
    __instance = None

    def __init__(self):
        if TilestorWriter.__instance is None:
            self.queue = Queue(maxsize=QUEUE_MAX_SIZE)
            self.cstart = None

            self._worker = Thread(target=self._job)
            self._worker.daemon = True
            self._worker.start()

    @classmethod
    def getInstance(cls):
        if cls.__instance is None:
            cls.__instance = TilestorWriter()
        return cls.__instance

    def put(self, payload, timeout):
        try:
            if timeout is None:
                self.queue.put_nowait(payload)
            else:
                self.queue.put(payload, timeout=timeout)
        except Full:
            raise TileWriterQueueFullException(
                "Tile writer queue is full at maxsize {}.".format(
                    self.queue.maxsize))

    def _job(self):
        self._shutdown = False
        atexit.register(self.wait_for_shutdown)

        data = None
        while True:
            self.cstart = None

            if data is None:
                try:
                    # When the thread is shutting down, use a minimal
                    # timeout; otherwise use a longer timeout, but cap it
                    # so that a shutdown arriving mid-wait is not blocked
                    # for long.
                    get_timeout = 0.001 if self._shutdown else min(
                        SHUTDOWN_TIMEOUT / 5, 2)
                    data = self.queue.get(True, get_timeout)
                except Empty:
                    if self._shutdown:
                        _logger.debug(
                            "Tile cache writer queue is empty now. Exiting!")
                        break
                    else:
                        continue

            db_path = data['db_path']

            self.cstart = ptime = time()

            tiles_written = 0
            time_taken = 0.0

            answers = []

            # The tile cache writer may fail occasionally, for example due
            # to a database connection problem, so we skip the offending
            # tile and log the exception.
            try:

                with transaction.manager:
                    conn = DBSession.connection()
                    tilestor = get_tile_db(db_path)

                    while data is not None and data['db_path'] == db_path:
                        z, x, y = data['tile']
                        tstamp = int((datetime.utcnow() -
                                      TIMESTAMP_EPOCH).total_seconds())

                        img = data['img']
                        if img is not None and img.mode != 'RGBA':
                            img = img.convert('RGBA')

                        colortuple = imgcolor(img)
                        color = pack_color(
                            colortuple) if colortuple is not None else None

                        self._write_tile_meta(
                            conn, data['uuid'],
                            dict(z=z, x=x, y=y, color=color, tstamp=tstamp))

                        if color is None:
                            buf = BytesIO()
                            img.save(buf, format='PNG', compress_level=3)
                            value = buf.getvalue()

                            self._write_tile_data(tilestor, z, x, y, tstamp,
                                                  value)

                        if 'answer_queue' in data:
                            answers.append(data['answer_queue'])

                        tiles_written += 1

                        ctime = time()
                        time_taken += ctime - ptime

                        if tiles_written >= BATCH_MAX_TILES:
                            # Break the batch
                            data = None
                        else:
                            # Try to get next tile for the batch. Or break
                            # the batch if there is no tiles left.
                            if time_taken < BATCH_DEADLINE:
                                try:
                                    data = self.queue.get(
                                        timeout=(BATCH_DEADLINE - time_taken))
                                except Empty:
                                    data = None
                            else:
                                data = None

                        # Do not account queue block time
                        ptime = time()

                    # Force zope session management to commit changes
                    mark_changed(DBSession())
                    tilestor.commit()

                    time_taken += time() - ptime
                    _logger.debug(
                        "%d tiles were written in %0.3f seconds (%0.1f per "
                        "second, qsize = %d)", tiles_written, time_taken,
                        tiles_written / time_taken, self.queue.qsize())

                # Report success only after the transaction commits
                for a in answers:
                    a.put_nowait(None)

            except Exception as exc:
                _logger.exception("Uncaught exception in tile writer: %s",
                                  exc)

                data = None
                self.cstart = None
                tilestor.rollback()

    def _write_tile_meta(self, conn, table_uuid, row):
        result = conn.execute(
            db.sql.text('SELECT true FROM tile_cache."{}" '
                        'WHERE z = :z AND x = :x AND y = :y '
                        'LIMIT 1 FOR UPDATE'.format(table_uuid)), **row)

        if result.returns_rows:
            conn.execute(
                db.sql.text('DELETE FROM tile_cache."{0}" '
                            'WHERE z = :z AND x = :x AND y = :y '
                            ''.format(table_uuid)), **row)

        conn.execute(
            db.sql.text(
                'INSERT INTO tile_cache."{0}" (z, x, y, color, tstamp) '
                'VALUES (:z, :x, :y, :color, :tstamp)'.format(table_uuid)),
            **row)

    def _write_tile_data(self, tilestor, z, x, y, tstamp, value):
        tilestor.execute("DELETE FROM tile WHERE z = ? AND x = ? AND y = ?",
                         (z, x, y))

        try:
            tilestor.execute("INSERT INTO tile VALUES (?, ?, ?, ?, ?)",
                             (z, x, y, tstamp, value))
        except sqlite3.IntegrityError:
            # NOTE: A race condition with another process may occur here.
            # TODO: use ON CONFLICT DO ... once on SQLite >= 3.24.0 (Python 3)
            pass

    def wait_for_shutdown(self, timeout=SHUTDOWN_TIMEOUT):
        if not self._worker.is_alive():
            return True

        _logger.debug(
            "Waiting for shutdown of tile cache writer for %d seconds (" +
            "qsize = %d)...", timeout, self.queue.qsize())

        self._shutdown = True
        self._worker.join(timeout)

        if self._worker.is_alive():
            _logger.warning(
                "Tile cache writer is still running. It'll be killed!")
            return False
        else:
            _logger.debug("Tile cache writer has successfully shut down.")
            return True
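put() above maps queue.Full onto a domain-specific exception; the underlying Queue behavior it relies on, in a self-contained sketch: put_nowait raises Full immediately on a full queue, while put(..., timeout=t) raises Full only after waiting t seconds.

from queue import Full, Queue

q = Queue(maxsize=1)
q.put_nowait('tile-1')
try:
    q.put('tile-2', timeout=0.1)   # waits 0.1 s, then raises Full
except Full:
    print('queue full at maxsize', q.maxsize)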
Example #11
class LogProcessingWorker(Thread):
    """"""

    # ----------------------------------------------------------------------
    def __init__(self, *args, **kwargs):
        self._host = kwargs.pop('host')
        self._port = kwargs.pop('port')
        self._transport = kwargs.pop('transport')
        self._ssl_enable = kwargs.pop('ssl_enable')
        self._ssl_verify = kwargs.pop('ssl_verify')
        self._keyfile = kwargs.pop('keyfile')
        self._certfile = kwargs.pop('certfile')
        self._ca_certs = kwargs.pop('ca_certs')
        self._database_path = kwargs.pop('database_path')

        super(LogProcessingWorker, self).__init__(*args, **kwargs)
        self.daemon = True
        self.name = self.__class__.__name__

        self._shutdown_event = Event()
        self._queue = Queue()

        self._event = None
        self._database = None
        self._last_event_flush_date = None
        self._non_flushed_event_count = None
        self._logger = None

    # ----------------------------------------------------------------------
    def enqueue_event(self, event):
        # called from other threads
        self._queue.put(event)

    # ----------------------------------------------------------------------
    def shutdown(self):
        # called from other threads
        self._shutdown_event.set()

    # ----------------------------------------------------------------------
    def run(self):
        self._reset_flush_counters()
        self._setup_logger()
        self._setup_database()
        try:
            self._fetch_events()
        except Exception as e:
            # We really should not get anything here; if we do, the worker
            # thread is dying too early, resulting in undefined application
            # behaviour.
            self._log_general_error(e)
        # warn if the queue is not empty on shutdown
        self._warn_about_non_empty_queue_on_shutdown()

    # ----------------------------------------------------------------------
    def _reset_flush_counters(self):
        self._last_event_flush_date = datetime.now()
        self._non_flushed_event_count = 0

    # ----------------------------------------------------------------------
    def _setup_logger(self):
        self._logger = get_logger(self.name)

    # ----------------------------------------------------------------------
    def _setup_database(self):
        self._database = DatabaseCache(self._database_path)

    # ----------------------------------------------------------------------
    def _fetch_events(self):
        while True:
            try:
                self._fetch_event()
                self._process_event()
            except Empty:
                # Flush queued (in database) events after the internally queued
                # events have been processed, i.e. the queue is empty.
                if self._shutdown_requested():
                    self._flush_queued_events(force=True)
                    return

                self._flush_queued_events()
                self._delay_processing()
            except (DatabaseLockedError, ProcessingError):
                if self._shutdown_requested():
                    return
                else:
                    self._requeue_event()
                    self._delay_processing()

    # ----------------------------------------------------------------------
    def _fetch_event(self):
        self._event = self._queue.get(block=False)

    # ----------------------------------------------------------------------
    def _process_event(self):
        try:
            self._write_event_to_database()
        except DatabaseLockedError:
            self._safe_log(
                u'debug',
                u'Database is locked, will try again later (queue length %d)',
                self._queue.qsize())
            raise
        except Exception as e:
            self._log_processing_error(e)
            raise ProcessingError()
        else:
            self._event = None

    # ----------------------------------------------------------------------
    def _log_processing_error(self, exception):
        self._safe_log(u'exception',
                       u'Log processing error (queue size: %3s): %s',
                       self._queue.qsize(), exception)

    # ----------------------------------------------------------------------
    def _delay_processing(self):
        self._shutdown_event.wait(QUEUE_CHECK_INTERVAL)

    # ----------------------------------------------------------------------
    def _shutdown_requested(self):
        return self._shutdown_event.is_set()

    # ----------------------------------------------------------------------
    def _requeue_event(self):
        self._queue.put(self._event)

    # ----------------------------------------------------------------------
    def _write_event_to_database(self):
        self._database.add_event(self._event)
        self._non_flushed_event_count += 1

    # ----------------------------------------------------------------------
    def _flush_queued_events(self, force=False):
        # check if necessary and abort if not
        if not force and not self._queued_event_interval_reached(
        ) and not self._queued_event_count_reached():
            return

        try:
            queued_events = self._database.get_queued_events()
        except DatabaseLockedError:
            self._safe_log(
                u'debug',
                u'Database is locked, will try again later (queue length %d)',
                self._queue.qsize())
            return  # try again later
        except Exception as e:
            # just log the exception and hope we can recover from the error
            self._safe_log(u'exception', u'Error retrieving queued events: %s',
                           e)
            return

        if queued_events:
            try:
                events = [event['event_text'] for event in queued_events]
                self._send_events(events)
            except Exception as e:
                self._safe_log(u'exception',
                               u'An error occurred while sending events: %s',
                               e)
                self._database.requeue_queued_events(queued_events)
            else:
                self._delete_queued_events_from_database()
                self._reset_flush_counters()

    # ----------------------------------------------------------------------
    def _delete_queued_events_from_database(self):
        try:
            self._database.delete_queued_events()
        except DatabaseLockedError:
            pass  # nothing to handle; if it fails, we delete those events in a later run

    # ----------------------------------------------------------------------
    def _queued_event_interval_reached(self):
        delta = datetime.now() - self._last_event_flush_date
        return delta.total_seconds() > QUEUED_EVENTS_FLUSH_INTERVAL

    # ----------------------------------------------------------------------
    def _queued_event_count_reached(self):
        return self._non_flushed_event_count > QUEUED_EVENTS_FLUSH_COUNT

    # ----------------------------------------------------------------------
    def _send_events(self, events):
        self._transport.send(events)

    # ----------------------------------------------------------------------
    def _log_general_error(self, exc):
        self._safe_log(u'exception', u'An unexpected error occurred: %s', exc)

    # ----------------------------------------------------------------------
    def _safe_log(self, log_level, message, *args, **kwargs):
        # we cannot log via the logging subsystem any longer once it has been set to shutdown
        if self._shutdown_requested():
            safe_log_via_print(log_level, message, *args, **kwargs)
        else:
            log_func = getattr(self._logger, log_level)
            return log_func(message, *args, **kwargs)

    # ----------------------------------------------------------------------
    def _warn_about_non_empty_queue_on_shutdown(self):
        queue_size = self._queue.qsize()
        if queue_size:
            self._safe_log(
                'warn',
                u'Non-empty queue while shutting down ({} events pending). '
                u'This indicates a previous error.'.format(queue_size),
                extra=dict(queue_size=queue_size))
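LogProcessingWorker runs as a daemon thread: producers enqueue serialized events from any thread, the worker persists them into the database cache, and batches are flushed to the transport once an interval or count threshold is reached. A hedged usage sketch; the transport object and the connection settings below are placeholders, not values from the original project:

worker = LogProcessingWorker(
    host='localhost', port=5959, transport=transport,  # `transport` is hypothetical
    ssl_enable=False, ssl_verify=False,
    keyfile=None, certfile=None, ca_certs=None,
    database_path='event_buffer.db')
worker.start()
worker.enqueue_event(u'{"message": "hello"}')  # safe to call from any thread
# ... later, on application exit:
worker.shutdown()  # run loop force-flushes queued events, then exits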
Ejemplo n.º 12
0
class BatchTaskQueue(TaskQueue):
    """
    A class for managing batch async operations.
    """

    def __init__(self, work_func,
                 max_batch_size=DEFAULT_BATCH_SIZE, batch_cushion=DEFAULT_BATCH_CUSHION,
                 interval=DEFAULT_INTERVAL,
                 **kwargs):
        """
        :param work_func: Work function input params of list(item), items are added with add_item
        :type work_func: function
        :param max_batch_size: The max number of elements in a batch call
        :type max_batch_size: int
        :param batch_cushion: The batch cushion between items uploaded and the specified max
        :type batch_cushion: int
        :param interval: The interval between checking and uploading added items
        :type interval: int
        """

        super(BatchTaskQueue, self).__init__(**kwargs)
        self._max_batch_size = max_batch_size
        self._batch_cushion = batch_cushion

        self._batch_size = self._max_batch_size - self._batch_cushion
        if self._batch_size <= 0:
            self._logger.warning("Batch size - batch cushion is less than 1, defaulting to 1.")
            self._batch_size = MIN_BATCH_SIZE

        self._items = Queue()
        self._work_func = work_func

        self._daemon = Daemon(self._do_work,
                              interval,
                              _parent_logger=self._logger,
                              _ident="{}Daemon".format(self.identity))
        self._daemon.start()

    def add_item(self, item):
        """
        :param func: Function to be executed asynchronously
        :type func: builtin.function
        :param task_priority: Priority for the task, higher items have higher priority
        :type task_priority: int or None
        """
        self._items.put(item)

    def _handle_batch(self):
        batch = []
        for _ in range(self._batch_size):
            try:
                item = self._items.get_nowait()
                batch.append(item)
            except Empty:
                break

        self._logger.debug("Batch size {}.".format(len(batch)))
        if len(batch) > 0:
            self.add(self._work_func, batch)

    def _do_work(self):
        if not self._items.empty():
            queue_size = self._items.qsize()
            num_batches = 1 + int(queue_size / self._batch_size)
            with TaskQueue(_ident="BatchTaskQueueAdd_{}_Batches".format(num_batches)) as task_queue:
                for _ in range(num_batches):
                    task_queue.add(self._handle_batch)

    def __exit__(self, *args):
        super(BatchTaskQueue, self).__exit__(*args)
        self._daemon.stop()

    def flush(self, *args, **kwargs):
        self._do_work()
        super(BatchTaskQueue, self).flush(*args, **kwargs)
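_handle_batch above drains at most _batch_size items without blocking, stopping early once the queue runs dry. A minimal self-contained sketch of that drain pattern:

from queue import Queue, Empty

def drain(q, batch_size):
    # Pull up to batch_size items; never block on an empty queue.
    batch = []
    for _ in range(batch_size):
        try:
            batch.append(q.get_nowait())
        except Empty:
            break
    return batch

q = Queue()
for i in range(5):
    q.put(i)
print(drain(q, 3))  # [0, 1, 2]
print(drain(q, 3))  # [3, 4]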
Ejemplo n.º 13
0
class Publisher(PublisherServerBase):
    def __init__(self,
                 use_nanoconfig_service,
                 publisher_endpoint,
                 nanoconfig_service_endpoint,
                 nanoconfig_update_endpoint,
                 nanoconfig_profile,
                 metrics_store,
                 max_queue_size,
                 max_worker,
                 min_worker=5):
        """
        :param use_nanoconfig_service: Indicates whether or not it should use a
            nanoconfig service
        :type use_nanoconfig_service: bool
        :param publisher_endpoint: Publisher server URI
        :type publisher_endpoint: str
        :param nanoconfig_service_endpoint: Nanoconfig service URI
        :type nanoconfig_service_endpoint: str
        :param nanoconfig_update_endpoint: Nanoconfig update service URI
        :type nanoconfig_update_endpoint: str
        :param nanoconfig_profile: Nanoconfig profile URI
        :type nanoconfig_profile: str
        :param max_queue_size: Max size for the message queue
        :type max_queue_size: int
        :param max_worker: Max number of worker to be spawned at a given time
        :type max_worker: int
        :param min_worker: Min number of worker to be spawned at a given time
        :type min_worker: int
        """
        super(Publisher,
              self).__init__(use_nanoconfig_service, publisher_endpoint,
                             nanoconfig_service_endpoint,
                             nanoconfig_update_endpoint, nanoconfig_profile)
        self.max_queue_size = max_queue_size
        self.metrics_store = metrics_store
        self.min_worker = min_worker
        self.max_worker = max_worker

        self.msg_queue = Queue(self.max_queue_size)
        self.workers = []

    @property
    def num_workers(self):
        return len(self.workers)

    def on_receive(self, msg):
        LOG.debug('[Publisher] Queue msg size = %s | workers = %s',
                  self.msg_queue.qsize(), self.num_workers)
        try:
            self.check_workers_alive()
            self.adjust_pool_size()
        except OSError as exc:
            LOG.exception(exc)
            LOG.error("[Publisher] Error upon receiving a message")

        self.msg_queue.put(msg)

    def check_workers_alive(self):
        # Because we can create new workers in this loop, we iterate over a copy;
        # otherwise we might loop over the newly created workers as well.
        worker_items = self.workers[:]
        for worker_thread in worker_items:
            if not worker_thread.is_alive():
                self.workers.pop(self.workers.index(worker_thread))
                self.start_worker()

    def adjust_pool_size(self):
        needed_size = self.msg_queue.qsize() + self.min_worker
        if abs(needed_size - self.num_workers) > self.min_worker * 2:
            LOG.debug(("[Publisher] Auto adjust pool size needed size is `%s` "
                       "and the current size is `%s`"), needed_size,
                      self.num_workers)
            while self.num_workers > min(self.min_worker, needed_size):
                self.stop_worker()
            # Create enough, but not too many
            while self.num_workers < min(self.max_worker, needed_size):
                self.start_worker()

    def start_worker(self):
        LOG.debug("[Publisher] starting worker")
        worker = Worker(self.msg_queue, self.metrics_store)
        worker.start()
        self.workers.append(worker)

    def stop_worker(self):
        if self.num_workers:
            LOG.debug("[Publisher] stopping worker")
            worker = self.workers.pop(-1)  # Pops the last worker
            worker.stop()

    def stop(self):
        super(Publisher, self).stop()
        join_threads = []
        for worker in self.workers:
            t = Thread(target=worker.stop)
            t.start()
            join_threads.append(t)
        for join_thread in join_threads:
            join_thread.join()
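adjust_pool_size targets qsize + min_worker workers but only resizes when the gap exceeds a dead band of 2 * min_worker, so the pool does not thrash on small fluctuations. Its two while loops reduce to a single target value; a pure-function sketch of that net effect (names are illustrative, not from the original project):

def target_pool_size(queue_size, current, min_worker=5, max_worker=50):
    # Aim for one worker per queued message plus a minimum floor.
    needed = queue_size + min_worker
    if abs(needed - current) <= min_worker * 2:
        return current  # inside the dead band: leave the pool alone
    return min(needed, max_worker)

assert target_pool_size(0, 30) == 5    # idle queue: shrink toward the floor
assert target_pool_size(40, 5) == 45   # backlog: grow, capped by max_worker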
Ejemplo n.º 14
0
class Streamer(threading.Thread):
    """
    This class extends threading.Thread and provides a simple concurrent queue
    and an async service that pulls data from the queue and sends it to the server.
    """
    def __init__(self, ws_connection, beat_duration, connection):
        threading.Thread.__init__(self)
        self.daemon = True
        self.messages = Queue()
        self.counter = 0
        self.name = "Streamer(%r)" % (ws_connection)
        self.ws_connection = ws_connection
        self.closed = False
        self.beat_duration = beat_duration / 1000.  # We get milliseconds
        self.last_beat = time.time()
        self.connection = connection

    def put_messge_in_q(self, message):
        '''
        Puts a message in the queue
        :param message: Some kind of payload, type agnostic
        '''
        if message is not None:
            self.messages.put(message)

    def close(self):
        '''
        Puts a CloseMessage in the queue, which causes the run loop to exit.
        '''
        if self.closed is True:
            return

        # Send a message to close
        self.put_messge_in_q(CloseMessage())

        self.closed = True

    def run(self):
        '''
        Continuously pulls messages from the queue and sends them to the server.
        '''
        self.ws_connection.wait_for_connection()

        while True:
            out = self._loop()

            # Exit the infinite loop
            if isinstance(out, CloseMessage):
                break

        # Close the websocket connection
        self.ws_connection.close()

        return

    def _loop(self):
        """
        A single iteration of the run loop.
        """
        try:
            if self.ws_connection is not None and self.ws_connection.is_connected(
            ):
                messages = self.getn(1)

                if messages and isinstance(messages[0], CloseMessage):
                    return messages[0]

                if messages is not None:
                    self.ws_connection.send(messages)

                self._check_heartbeat()

        except Exception:
            LOGGER.error("Streaming error", exc_info=True)

    def _check_heartbeat(self):
        """
        Check if we should send a heartbeat
        """
        next_beat = self.last_beat + self.beat_duration
        now = time.time()
        if next_beat < now:
            new_beat_duration = self.connection.heartbeat()
            self.beat_duration = new_beat_duration / 1000.  # We get milliseconds
            self.last_beat = time.time()

    def getn(self, n):
        """
        Pops n messages from the queue.
        Args:
            n: Number of messages to pull from queue

        Returns: n messages

        """
        try:
            msg = self.messages.get(
                timeout=self.beat_duration)  # block until at least 1
        except Empty:
            return None

        if isinstance(msg, CloseMessage):
            return [msg]

        self.counter += 1
        msg.set_offset(self.counter)
        result = [msg]
        try:
            while len(result) < n:
                another_msg = self.messages.get(
                    block=False)  # don't block if no more messages
                self.counter += 1
                another_msg.set_offset(self.counter)
                result.append(another_msg)
        except Exception:
            pass
        return result

    def wait_for_finish(self):
        """ Blocks the experiment from exiting until all data was sent to server OR 30 seconds passed."""

        msg = "Uploading stats to Comet before program termination (may take several seconds)"
        LOGGER.info(msg)

        # Wait up to 2 minutes in total: 30 + 30 + 60 seconds
        self._wait_for_empty(30)

        if not self.messages.empty():
            msg = "Still uploading stats to Comet before program termination (may take several seconds)"
            LOGGER.info(msg)
            self._wait_for_empty(30)

            if not self.messages.empty():
                self._wait_for_empty(60, verbose=True)

        if not self.messages.empty():
            remaining = self.messages.qsize()
            LOGGER.error(
                "Comet failed to send all the data back (%s messages)",
                remaining)

        if config.experiment is not None:
            if config.experiment.alive:
                LOGGER.info(
                    "Experiment is live on comet.ml: %s",
                    config.experiment._get_experiment_url(),
                )
            else:
                LOGGER.warning("Failed to log run in comet.ml")
        else:
            LOGGER.warning("Failed to log run in comet.ml")

    def _wait_for_empty(self, timeout, verbose=False):
        """ Wait up to TIMEOUT seconds for the messages queue to be empty
        """
        end_time = time.time() + timeout

        while not self.messages.empty() and time.time() < end_time:
            if verbose is True:
                LOGGER.info("%d messages remaining to upload",
                            self.messages.qsize())
            time.sleep(5)

    def _report_upload_not_finished(self):
        """ Called if the queue is not empty during exit
        """
        remaining = self.messages.qsize()

        if remaining == 0:
            return

        err_msg = "Comet failed to send all the data back %d"

        # Display the message
        LOGGER.error(err_msg, remaining)

        self.connection.report(event_name="on_exit_didnt_finish_upload_sdk",
                               err_msg=(err_msg % remaining))
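getn above blocks (with a timeout) for the first message, then opportunistically drains further messages without blocking, so several can go out in one websocket send. A self-contained sketch of the pattern:

from queue import Queue, Empty

def get_up_to(q, n, timeout=1.0):
    try:
        first = q.get(timeout=timeout)  # block until at least one item
    except Empty:
        return None
    result = [first]
    try:
        while len(result) < n:
            result.append(q.get(block=False))  # never block for the rest
    except Empty:
        pass
    return result

q = Queue()
for i in range(3):
    q.put(i)
print(get_up_to(q, 10))  # [0, 1, 2]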
Ejemplo n.º 15
0
    class Delegate(object):
        """
        Implements the methods of the Provisioner and JobBatcher class needed for the
        ClusterScaler class.
        """

        def __init__(self, preemptable):
            super(MockBatchSystemAndProvisioner.Delegate, self).__init__()
            self.jobQueue = Queue()
            self.totalJobs = 0  # Count of total jobs processed
            self.totalWorkerTime = 0.0  # Total time spent in worker threads
            self.nodesToWorker = {}  # Map from Node to instances of the Worker class
            self.maxWorkers = 0  # Maximum number of workers
            self.preemptable = preemptable

        def addJob(self):
            """
            Add a job to the job queue
            """
            self.totalJobs += 1
            self.jobQueue.put(None)

        # JobBatcher functionality

        def getNumberOfJobsIssued(self):
            return self.jobQueue.qsize()

        # AbstractScalableBatchSystem functionality

        def getNodes(self):
            nodes = dict()
            for i, worker in enumerate(self.nodesToWorker.values()):
                nodes[(i, self.preemptable)] = NodeInfo(coresTotal=0, coresUsed=0, requestedCores=1,
                                                        memoryTotal=0, memoryUsed=0, requestedMemory=1,
                                                        workers=1 if worker.busyEvent.is_set() else 0)
            return nodes

        def _addNodes(self, numNodes):
            class Worker(object):
                def __init__(self, jobQueue, secondsPerJob):
                    self.busyEvent = Event()
                    self.stopEvent = Event()

                    def workerFn():
                        while True:
                            if self.stopEvent.is_set():
                                return
                            try:
                                jobQueue.get(timeout=1.0)
                            except Empty:
                                continue
                            self.busyEvent.set()
                            time.sleep(secondsPerJob)
                            self.busyEvent.clear()

                    self.startTime = time.time()
                    self.worker = Thread(target=workerFn)
                    self.worker.start()

                def stop(self):
                    self.stopEvent.set()
                    self.worker.join()
                    return time.time() - self.startTime

            for i in range(numNodes):
                node = Node('127.0.0.1', '127.0.0.1', 'testNode', time.time())
                self.nodesToWorker[node] = Worker(self.jobQueue, self.outer.secondsPerJob)
            self.maxWorkers = max(self.maxWorkers, len(self.nodesToWorker))

        def _removeNodes(self, nodes):
            logger.info("removing nodes. %s workers and %s to terminate", len(self.nodesToWorker), len(nodes))
            for node in nodes:
                logger.info("removed node")
                try:
                    worker = self.nodesToWorker.pop(node)
                    self.totalWorkerTime += worker.stop()
                except KeyError:
                    # Node isn't our responsibility
                    pass

        def getNumberOfNodes(self):
            return len(self.nodesToWorker)
Ejemplo n.º 16
0
class TaskQueue(ChainedIdentity):
    """
    A class for managing async tasks.
    """

    def __init__(self, worker_pool=None, error_handler=None, flush_timeout_seconds=None, **kwargs):
        """
        :param worker_pool: Thread pool for executing tasks
        :type worker_pool: concurrent.futures.ThreadPoolExecutor
        :param error_handler: Extension point for processing error queue items
        :type error_handler: function(error, logging.Logger)
        :param flush_timeout_seconds: Task flush timeout in seconds
        :type flush_timeout_seconds: int
        """
        super(TaskQueue, self).__init__(**kwargs)
        self._tasks = Queue()
        self._results = []
        # For right now, we don't need a queue for errors, but it's
        # likely that we'll want the error handler looping on the queue thread
        self._errors = []
        self._err_handler = error_handler
        self._worker_pool = worker_pool if worker_pool is not None else WorkerPool(_parent_logger=self._logger)
        self._task_number = 0
        self._flush_timeout_seconds = DEFAULT_FLUSH_TIMEOUT_SECONDS
        if flush_timeout_seconds:
            self._flush_timeout_seconds = flush_timeout_seconds
            self._logger.debug('Overriding default timeout to {}'.format(flush_timeout_seconds))

    def __enter__(self):
        self._logger.debug("[Start]")
        return self

    def __exit__(self, *args):
        self._logger.debug("[Stop] - waiting default timeout")
        self.flush(self.identity)

    # TODO: Adding functions with this method needs to be more configurable
    def add(self, func, *args, **kwargs):
        """
        :param func: Function to be executed asynchronously
        :type func: builtin.function
        """
        future = self.create_future(func, *args, **kwargs)
        ident = "{}_{}".format(self._tasks.qsize(), func.__name__)
        task = AsyncTask(future, _ident=ident, _parent_logger=self._logger)
        self.add_task(task)
        return task

    def add_task(self, async_task):
        """
        :param async_task: asynchronous task to be added to the queue and possibly processed
        :type async_task: azureml._async.AsyncTask
        """
        '''Blocking, no timeout add task to queue'''
        if not isinstance(async_task, AsyncTask):
            raise ValueError("Can only add AsyncTask, got {0}".format(type(async_task)))

        self._logger.debug("Adding task {0} to queue of approximate size: {1}".format(async_task.ident,
                                                                                      self._tasks.qsize()))
        self._tasks.put(async_task)

    def create_future(self, func, *args, **kwargs):
        return self._worker_pool.submit(func, *args, **kwargs)

    def flush(self, source, timeout_seconds=None):
        with self._log_context("WaitFlushSource:{}".format(source)) as log_context:

            if timeout_seconds is None:
                log_context.debug("Overriding default flush timeout from None to {}".
                                  format(self._flush_timeout_seconds))
                timeout_seconds = self._flush_timeout_seconds
            else:
                log_context.debug("flush timeout {} is different from task queue timeout {}, using flush timeout".
                                  format(timeout_seconds, self._flush_timeout_seconds))

            start_time = time.time()

            #  Take tasks off of the queue
            tasks_to_wait = []
            while True:
                try:
                    tasks_to_wait.append(self._tasks.get_nowait())
                except Empty:
                    break

            message = ""
            timeout_time = start_time + timeout_seconds

            log_context.debug("Waiting {} seconds on tasks: {}.".format(timeout_seconds, tasks_to_wait))

            not_done = True

            while not_done and time.time() <= timeout_time:
                completed_tasks = [task for task in tasks_to_wait if task.done()]
                tasks_to_wait = [task for task in tasks_to_wait if not task.done()]
                not_done = len(tasks_to_wait) != 0

                self._results.extend((task.wait(awaiter_name=self.identity) for task in completed_tasks))

                if not_done:
                    for task in tasks_to_wait:
                        message += "Waiting on task: {}.\n".format(task.ident)
                    message += "{} tasks left. Current duration of flush {} seconds.\n".format(
                        len(tasks_to_wait), time.time() - start_time)

                    time.sleep(.25)

            self._logger.debug(message)

            # Reach this case on timeout
            if not_done:
                azureml_error = AzureMLError.create(
                    FlushTaskTimeout, timeout_seconds=timeout_seconds
                )
                raise AzureMLException._with_error(azureml_error)

    @property
    def results(self):
        for result in self._results:
            yield result

    def errors(self):
        for error in self._errors:
            yield error
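flush above polls task.done() in a 0.25-second loop until everything completes or the timeout expires. When the queued tasks wrap concurrent.futures.Future objects (as create_future's use of worker_pool.submit suggests), the same wait can be expressed with the standard library; a hedged alternative sketch:

import concurrent.futures

def wait_all(futures, timeout_seconds):
    # Wait for all futures at once; report stragglers instead of polling manually.
    done, not_done = concurrent.futures.wait(futures, timeout=timeout_seconds)
    if not_done:
        raise TimeoutError("%d tasks still pending after %ss"
                           % (len(not_done), timeout_seconds))
    return [f.result() for f in done]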
Ejemplo n.º 17
0
class SelectiveRepeater(object):
    """
    Selective repeater for SBP file I/O requests

    Fields
    ----------
    _pending_map : list(PendingRequest)
      List (used as a map) of PendingRequest objects, used to track
      outstanding requests.
    _request_pool : Queue
      Queue of available requests (recorded by index number)
    _seqmap : dict(int,int)
      Dictionary mapping SBP request sequence IDs to their corresponding
      request index.
    _batch_msgs : list
      Collector for a batch of messages to be sent in one
      buffer via the link
    _last_check_time : Time
      The last time we checked if any packets had expired
    _expire_map : dict(Time, dict(PendingRequest, PendingRequest))
      Dictionary which records the future time at which a request
      will expire.

    _msg_type : int
      The message type we're currently sending
    _link : Handler
      The link over which we're sending data

    _callback_thread : int
      ID of the thread that we expect callbacks from
    _link_thread : int
      ID of the thread that handles link writes
    """

    def __init__(self, link, msg_type, cb=None):
        """
        Args
        ---
        link : Handler
          Link over which messages will be sent.
        msg_type :
          The type of message being sent
        cb :
          Invoked when SBP message with type `msg_type` is received
        """

        self._link = link
        self._msg_type = msg_type
        self._callback = cb

        self._seqmap = {}
        self._batch_msgs = []
        self._last_check_time = Time.now()
        self._expire_map = defaultdict(dict)

        self._init_fileio_config(SBP_FILEIO_WINDOW_SIZE, SBP_FILEIO_BATCH_SIZE, PROGRESS_CB_REDUCTION_FACTOR)

        self._callback_thread = None
        self._link_thread = None

        self._total_sends = 1.0
        self._total_retries = 0

        self._config_retry_time = None

    def _init_fileio_config(self, window_size, batch_size, progress_cb_reduction_factor):
        self._pending_map = [PendingRequest(X) for X in range(window_size)]
        self._request_pool = Queue(window_size)
        for pending_req in self._pending_map:
            self._request_pool.put(pending_req)
        self._batch_size = batch_size
        self._progress_cb_reduction_factor = progress_cb_reduction_factor

    def __enter__(self):
        self._link.add_callback(self._request_cb, self._msg_type)
        self._link.add_callback(self._config_cb, SBP_MSG_FILEIO_CONFIG_RESP)
        return self

    def __exit__(self, type, value, traceback):
        self._link.remove_callback(self._request_cb, self._msg_type)
        self._link.remove_callback(self._config_cb, SBP_MSG_FILEIO_CONFIG_RESP)

    def _verify_cb_thread(self):
        """
        Verify that only one thread is consuming requests.
        """
        if self._callback_thread is None:
            self._callback_thread = threading.currentThread().ident
        assert self._callback_thread == threading.currentThread().ident

    def _verify_link_thread(self):
        """
        Verify that only one thread is producing requests.
        """
        if self._link_thread is None:
            self._link_thread = threading.currentThread().ident
        assert self._link_thread == threading.currentThread().ident

    def _return_pending_req(self, pending_req):
        """
        Return a pending request to the write pool and clean any
        entries in the expiration map.
        """
        self._verify_cb_thread()
        pending_req.completed = True
        try:
            msg = pending_req.message
        except AttributeError:
            # Got a completion for something that was never requested
            return
        self._try_remove_keys(self._seqmap, msg.sequence)
        if self._try_remove_keys(self._expire_map[pending_req.time_expire], pending_req):
            # Only put the request back if it was successfully removed
            self._request_pool.put(pending_req)

    def _record_pending_req(self, msg, time_now, expiration_time):
        """
        Acquire a pending request object and record its future
        expiration time in a map.
        """
        self._verify_link_thread()
        # Queue.get will block if no requests are available
        pending_req = self._request_pool.get(True)
        assert self._pending_map[pending_req.index].index == pending_req.index
        self._seqmap[msg.sequence] = pending_req.index
        self._pending_map[pending_req.index].track(msg, time_now, expiration_time)
        self._expire_map[expiration_time][pending_req] = pending_req

    def _config_cb(self, msg, **metadata):
        self._config_msg = msg
        self._init_fileio_config(msg.window_size, msg.batch_size, PROGRESS_CB_REDUCTION_FACTOR * 2)

    def _request_cb(self, msg, **metadata):
        """
        Process request completions.
        """
        index = self._seqmap.get(msg.sequence)
        if index is None:
            return
        pending_req = self._pending_map[index]
        if self._callback:
            self._callback(pending_req.message, msg)
        self._return_pending_req(pending_req)

    def _has_pending(self):
        return self._request_pool.qsize() != len(self._pending_map)

    def _retry_send(self, check_time, pending_req, delete_keys):
        """
        Retry a request by updating its expiration time on the object
        itself and in the expiration map.
        """
        self._total_retries += 1
        self._total_sends += 1
        timeout_delta = Time(SBP_FILEIO_TIMEOUT)
        send_time = Time.now()
        new_expire = send_time + timeout_delta
        pending_req.record_retry(send_time, new_expire)
        self._expire_map[new_expire][pending_req] = pending_req
        self._link(pending_req.message)
        delete_keys.append(pending_req)

    def _try_remove_keys(self, d, *keys):
        success = True
        for key in keys:
            try:
                del d[key]
            except KeyError:
                success = False
        return success

    def _check_pending(self):
        """
        Scans from the last check time to the current time looking
        for requests that are due to expire and retries them if
        necessary.
        """
        time_now = Time.now()
        timeout_delta = Time(SBP_FILEIO_TIMEOUT)
        for check_time in Time.iter_since(self._last_check_time, time_now):
            pending_reqs = self._expire_map[check_time]
            retried_writes = []
            for pending_req in pending_reqs.keys():
                time_expire = pending_req.time + timeout_delta
                if time_now >= time_expire:
                    if pending_req.tries >= MAXIMUM_RETRIES:
                        raise Exception('Timed out')
                    # If the completion map becomes inconsistent (because
                    #   things are completing at the same time they're
                    #   being re-tried) then the `completed` field should
                    #   prevent us from re-sending a write in this case.
                    if not pending_req.completed:
                        self._retry_send(check_time, pending_req, retried_writes)
            # Pending writes can be marked completed while this function
            #   is running, so a key error means it was marked completed
            #   after we sent a retry (therefore _try_remove_keys ignores
            #   key errors).
            self._try_remove_keys(self._expire_map[check_time], *retried_writes)
        self._last_check_time = time_now

    def _window_available(self, batch_size):
        return self._request_pool.qsize() >= batch_size

    def _ensure_config_req_sent(self):
        if self._config_retry_time is not None:
            return
        now = Time.now()
        self._config_retry_time = now + Time(0, CONFIG_REQ_RETRY_MS)
        self._config_timeout = now + Time(0, CONFIG_REQ_TIMEOUT_MS)
        self._config_seq = random.randint(0, 0xffffffff)
        self._config_msg = None
        self._link(MsgFileioConfigReq(sequence=self._config_seq))

    def _config_received(self):
        self._ensure_config_req_sent()
        if self._config_msg is not None:
            return True
        now = Time.now()
        if now >= self._config_retry_time:
            self._link(MsgFileioConfigReq(sequence=self._config_seq))
            self._config_retry_time = now + Time(0, CONFIG_REQ_RETRY_MS)
        if now >= self._config_timeout:
            self._config_msg = MsgFileioConfigResp(sequence=0,
                                                   window_size=100,
                                                   batch_size=1,
                                                   fileio_version=0)
        return self._config_msg is not None

    def _wait_config_received(self):
        while not self._config_received():
            time.sleep(WAIT_SLEEP_S)

    def _wait_window_available(self, batch_size):
        self._wait_config_received()
        while not self._window_available(batch_size):
            self._check_pending()
            if not self._window_available(batch_size):
                time.sleep(WAIT_SLEEP_S)

    @property
    def total_retries(self):
        return self._total_retries

    @property
    def total_sends(self):
        return self._total_sends

    @property
    def progress_cb_reduction_factor(self):
        return self._progress_cb_reduction_factor

    def send(self, msg, batch_size=None):
        if batch_size is not None:
            self._send(msg, batch_size)
        else:
            self._send(msg, self._batch_size)

    def _send(self, msg, batch_size):
        """
        Sends data via the current link, potentially batching it together.

        Parameters
        ----------
        msg : MsgFileioReadReq, MsgFileioReadDirReq, MsgFileioWriteReq, MsgFileioRemove
          The message to be sent via the current link
        batch_size : int
          The number of messages to batch together before actually sending
        """
        if msg is not None:
            self._batch_msgs.append(msg)
        if len(self._batch_msgs) >= batch_size:
            self._wait_window_available(batch_size)
            time_now = Time.now()
            expiration_time = time_now + Time(SBP_FILEIO_TIMEOUT)
            for msg in self._batch_msgs:
                self._record_pending_req(msg, time_now, expiration_time)
            self._link(*self._batch_msgs)
            self._total_sends += len(self._batch_msgs)
            del self._batch_msgs[:]

    def flush(self):
        """
        Flush any pending requests (batched or otherwise) and wait for all
        pending requests to complete.
        """
        self.send(None, batch_size=0)
        while self._has_pending():
            self._check_pending()
            time.sleep(WAIT_SLEEP_S)
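A hedged usage sketch of the context-manager API shown above; `link`, `msg_type` (the response message type this repeater listens for), and the request iterable are assumptions standing in for the surrounding SBP file I/O code:

def on_reply(request_msg, reply_msg):
    pass  # hypothetical completion callback

with SelectiveRepeater(link, msg_type, cb=on_reply) as repeater:
    for req in write_requests:   # hypothetical iterable of MsgFileioWriteReq
        repeater.send(req)       # batched; blocks while the send window is full
    repeater.flush()             # send any partial batch and wait for completions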
Ejemplo n.º 18
0
    class Delegate(object):
        """
        Implements the methods of the Provisioner and JobBatcher class needed for the
        ClusterScaler class.
        """
        def __init__(self, preemptable):
            super(MockBatchSystemAndProvisioner.Delegate, self).__init__()
            self.jobQueue = Queue()
            self.totalJobs = 0  # Count of total jobs processed
            self.totalWorkerTime = 0.0  # Total time spent in worker threads
            self.nodesToWorker = {}  # Map from Node to instances of the Worker class
            self.maxWorkers = 0  # Maximum number of workers
            self.preemptable = preemptable

        def addJob(self):
            """
            Add a job to the job queue
            """
            self.totalJobs += 1
            self.jobQueue.put(None)

        # JobBatcher functionality

        def getNumberOfJobsIssued(self):
            return self.jobQueue.qsize()

        # AbstractScalableBatchSystem functionality

        def getNodes(self):
            nodes = dict()
            for i, worker in enumerate(self.nodesToWorker.values()):
                nodes[(i, self.preemptable)] = NodeInfo(
                    coresTotal=0,
                    coresUsed=0,
                    requestedCores=1,
                    memoryTotal=0,
                    memoryUsed=0,
                    requestedMemory=1,
                    workers=1 if worker.busyEvent.is_set() else 0)
            return nodes

        def _addNodes(self, numNodes):
            class Worker(object):
                def __init__(self, jobQueue, secondsPerJob):
                    self.busyEvent = Event()
                    self.stopEvent = Event()

                    def workerFn():
                        while True:
                            if self.stopEvent.is_set():
                                return
                            try:
                                jobQueue.get(timeout=1.0)
                            except Empty:
                                continue
                            self.busyEvent.set()
                            time.sleep(secondsPerJob)
                            self.busyEvent.clear()

                    self.startTime = time.time()
                    self.worker = Thread(target=workerFn)
                    self.worker.start()

                def stop(self):
                    self.stopEvent.set()
                    self.worker.join()
                    return time.time() - self.startTime

            for i in range(numNodes):
                node = Node('127.0.0.1', '127.0.0.1', 'testNode', time.time())
                self.nodesToWorker[node] = Worker(self.jobQueue,
                                                  self.outer.secondsPerJob)
            self.maxWorkers = max(self.maxWorkers, len(self.nodesToWorker))

        def _removeNodes(self, nodes):
            logger.info("removing nodes. %s workers and %s to terminate",
                        len(self.nodesToWorker), len(nodes))
            for node in nodes:
                logger.info("removed node")
                try:
                    worker = self.nodesToWorker.pop(node)
                    self.totalWorkerTime += worker.stop()
                except KeyError:
                    # Node isn't our responsibility
                    pass

        def getNumberOfNodes(self):
            return len(self.nodesToWorker)
Ejemplo n.º 19
0
Archivo: util.py Proyecto: Cue/scales
class GraphiteReporter(threading.Thread):
  """A graphite reporter thread."""

  def __init__(self, host, port, maxQueueSize=10000):
    """Connect to a Graphite server on host:port."""
    threading.Thread.__init__(self)

    self.host, self.port = host, port
    self.sock = None
    self.queue = Queue()
    self.maxQueueSize = maxQueueSize
    self.daemon = True


  def run(self):
    """Run the thread."""
    while True:
      try:
        try:
          name, value, valueType, stamp = self.queue.get()
        except TypeError:
          break
        self.log(name, value, valueType, stamp)
      finally:
        self.queue.task_done()


  def connect(self):
    """Connects to the Graphite server if not already connected."""
    if self.sock is not None:
      return
    backoff = 0.01
    while True:
      try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        sock.connect((self.host, self.port))
        self.sock = sock
        return
      except socket.error:
        time.sleep(random.uniform(0, 2.0*backoff))
        backoff = min(backoff*2.0, 5.0)


  def disconnect(self):
    """Disconnect from the Graphite server if connected."""
    if self.sock is not None:
      try:
        self.sock.close()
      except socket.error:
        pass
      finally:
        self.sock = None


  def _sendMsg(self, msg):
    """Send a line to graphite. Retry with exponential backoff."""
    if not self.sock:
      self.connect()
    if not isinstance(msg, binary_type):
      msg = msg.encode("UTF-8")

    backoff = 0.001
    while True:
      try:
        self.sock.sendall(msg)
        break
      except socket.error:
        log.warning('Graphite connection error', exc_info=True)
        self.disconnect()
        time.sleep(random.uniform(0, 2.0*backoff))
        backoff = min(backoff*2.0, 5.0)
        self.connect()


  def _sanitizeName(self, name):
    """Sanitize a metric name."""
    return name.replace(' ', '-')


  def log(self, name, value, valueType=None, stamp=None):
    """Log a named numeric value. The value type may be 'value',
    'count', or None."""
    if isinstance(value, float):
      form = "%s%s %2.2f %d\n"
    else:
      form = "%s%s %s %d\n"

    if valueType is not None and len(valueType) > 0 and valueType[0] != '.':
      valueType = '.' + valueType

    if not stamp:
      stamp = time.time()

    self._sendMsg(form % (self._sanitizeName(name), valueType or '', value, stamp))


  def enqueue(self, name, value, valueType=None, stamp=None):
    """Enqueue a call to log."""
    # If queue is too large, refuse to log.
    if self.maxQueueSize and self.queue.qsize() > self.maxQueueSize:
      return
    # Stick arguments into the queue
    self.queue.put((name, value, valueType, stamp))


  def flush(self):
    """Block until all stats have been sent to Graphite."""
    self.queue.join()


  def shutdown(self):
    """Shut down the background thread."""
    self.queue.put(None)
    self.flush()
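A usage sketch for the reporter above; it assumes a Graphite/Carbon plaintext listener on localhost:2003 (the conventional port):

reporter = GraphiteReporter('localhost', 2003)
reporter.start()                                  # daemon sender thread
reporter.enqueue('app.requests', 42, valueType='count')
reporter.enqueue('app latency', 12.5)             # spaces become '-' via _sanitizeName
reporter.flush()                                  # block until the queue drains
reporter.shutdown()                               # sentinel None stops the thread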
Ejemplo n.º 20
0
    class Delegate(object):
        """
        Implements the methods of the Provisioner and JobBatcher class needed for the
        ClusterScaler class, but omits the preemptable argument.
        """
        def __init__(self):
            super(MockBatchSystemAndProvisioner.Delegate, self).__init__()
            self.jobQueue = Queue()
            self.totalJobs = 0  # Count of total jobs processed
            self.totalWorkerTime = 0.0  # Total time spent in worker threads
            self.workers = []  # Instances of the Worker class
            self.maxWorkers = 0  # Maximum number of workers

        def addJob(self):
            """
            Add a job to the job queue
            """
            self.totalJobs += 1
            self.jobQueue.put(None)

        # JobBatcher functionality

        def getNumberOfJobsIssued(self):
            return self.jobQueue.qsize()

        # AbstractScalableBatchSystem functionality

        def getNodes(self):
            return {
                address: NodeInfo(coresTotal=0,
                                  coresUsed=0,
                                  memoryTotal=0,
                                  memoryUsed=0,
                                  workers=1 if w.busyEvent.is_set() else 0)
                for address, w in enumerate(self.workers)
            }

        # AbstractProvisioner functionality

        def setNodeCount(self, numNodes):
            delta = numNodes - len(self.workers)
            if delta > 0:
                self._addNodes(numNodes=delta)
            elif delta < 0:
                self._removeNodes(numNodes=-delta)
            assert len(self.workers) == numNodes
            return numNodes

        def _addNodes(self, numNodes):
            class Worker(object):
                def __init__(self, jobQueue, secondsPerJob):
                    self.busyEvent = Event()
                    self.stopEvent = Event()

                    def workerFn():
                        while True:
                            if self.stopEvent.is_set():
                                return
                            try:
                                jobQueue.get(timeout=1.0)
                            except Empty:
                                continue
                            self.busyEvent.set()
                            time.sleep(secondsPerJob)
                            self.busyEvent.clear()

                    self.startTime = time.time()
                    self.worker = Thread(target=workerFn)
                    self.worker.start()

                def stop(self):
                    self.stopEvent.set()
                    self.worker.join()
                    return time.time() - self.startTime

            for i in range(numNodes):
                self.workers.append(
                    Worker(self.jobQueue, self.outer.secondsPerJob))
            if len(self.workers) > self.maxWorkers:
                self.maxWorkers = len(self.workers)

        def _removeNodes(self, numNodes):
            while len(self.workers) > 0 and numNodes > 0:
                worker = self.workers.pop()
                self.totalWorkerTime += worker.stop()
                numNodes -= 1

        def getNumberOfNodes(self):
            return len(self.workers)
Ejemplo n.º 21
0
class LogProcessingWorker(Thread):
    """"""

    # ----------------------------------------------------------------------
    def __init__(self, *args, **kwargs):
        self._host = kwargs.pop('host')
        self._port = kwargs.pop('port')
        self._transport = kwargs.pop('transport')
        self._ssl_enable = kwargs.pop('ssl_enable')
        self._ssl_verify = kwargs.pop('ssl_verify')
        self._keyfile = kwargs.pop('keyfile')
        self._certfile = kwargs.pop('certfile')
        self._ca_certs = kwargs.pop('ca_certs')
        self._database_path = kwargs.pop('database_path')
        self._memory_cache = kwargs.pop('cache')
        self._event_ttl = kwargs.pop('event_ttl')

        super(LogProcessingWorker, self).__init__(*args, **kwargs)
        self.daemon = True
        self.name = self.__class__.__name__

        self._shutdown_event = Event()
        self._flush_event = Event()
        self._queue = Queue()

        self._event = None
        self._database = None
        self._last_event_flush_date = None
        self._non_flushed_event_count = None
        self._logger = None
        self._rate_limit_storage = None
        self._rate_limit_strategy = None
        self._rate_limit_item = None

    # ----------------------------------------------------------------------
    def enqueue_event(self, event):
        # called from other threads
        self._queue.put(event)

    # ----------------------------------------------------------------------
    def shutdown(self):
        # called from other threads
        self._shutdown_event.set()

    # ----------------------------------------------------------------------
    def run(self):
        self._reset_flush_counters()
        self._setup_logger()
        self._setup_database()
        try:
            self._fetch_events()
        except Exception as e:
            # we should never get here; if we do, the worker thread is dying
            # too early, resulting in undefined application behaviour
            self._log_general_error(e)
        # report if the queue is still non-empty on shutdown
        self._warn_about_non_empty_queue_on_shutdown()

    # ----------------------------------------------------------------------
    def force_flush_queued_events(self):
        self._flush_event.set()

    # ----------------------------------------------------------------------
    def _reset_flush_counters(self):
        self._last_event_flush_date = datetime.now()
        self._non_flushed_event_count = 0

    # ----------------------------------------------------------------------
    def _clear_flush_event(self):
        self._flush_event.clear()

    # ----------------------------------------------------------------------
    def _setup_logger(self):
        self._logger = get_logger(self.name)
        # rate-limit our own messages so we don't spam the logs on temporary network errors, etc.
        rate_limit_setting = constants.ERROR_LOG_RATE_LIMIT
        if rate_limit_setting:
            self._rate_limit_storage = MemoryStorage()
            self._rate_limit_strategy = FixedWindowRateLimiter(self._rate_limit_storage)
            self._rate_limit_item = parse_rate_limit(rate_limit_setting)

    # ----------------------------------------------------------------------
    def _setup_database(self):
        if self._database_path:
            self._database = DatabaseCache(path=self._database_path, event_ttl=self._event_ttl)
        else:
            self._database = MemoryCache(cache=self._memory_cache, event_ttl=self._event_ttl)

    # ----------------------------------------------------------------------
    def _fetch_events(self):
        while True:
            try:
                self._fetch_event()
                self._process_event()
            except Empty:
                # Flush queued (in database) events after the internally queued
                # events have been processed, i.e. the queue is empty.
                if self._shutdown_requested():
                    self._flush_queued_events(force=True)
                    return

                force_flush = self._flush_requested()
                self._flush_queued_events(force=force_flush)
                self._delay_processing()
                self._expire_events()
            except (DatabaseLockedError, ProcessingError):
                if self._shutdown_requested():
                    return
                else:
                    self._requeue_event()
                    self._delay_processing()

    # ----------------------------------------------------------------------
    def _fetch_event(self):
        self._event = self._queue.get(block=False)

    # ----------------------------------------------------------------------
    def _process_event(self):
        try:
            self._write_event_to_database()
        except DatabaseLockedError as e:
            self._safe_log(
                u'debug',
                u'Database is locked, will try again later (queue length %d)',
                self._queue.qsize(),
                exc=e)
            raise
        except Exception as e:
            self._log_processing_error(e)
            raise ProcessingError()
        else:
            self._event = None

    # ----------------------------------------------------------------------
    def _expire_events(self):
        try:
            self._database.expire_events()
        except DatabaseLockedError:
            # Nothing to handle, if it fails, we will either successfully publish these messages next time
            # or we will delete them on the next pass.
            pass

    # ----------------------------------------------------------------------
    def _log_processing_error(self, exception):
        self._safe_log(
            u'exception',
            u'Log processing error (queue size: %3s): %s',
            self._queue.qsize(),
            exception,
            exc=exception)

    # ----------------------------------------------------------------------
    def _delay_processing(self):
        self._shutdown_event.wait(constants.QUEUE_CHECK_INTERVAL)

    # ----------------------------------------------------------------------
    def _shutdown_requested(self):
        return self._shutdown_event.is_set()

    # ----------------------------------------------------------------------
    def _flush_requested(self):
        return self._flush_event.is_set()

    # ----------------------------------------------------------------------
    def _requeue_event(self):
        self._queue.put(self._event)

    # ----------------------------------------------------------------------
    def _write_event_to_database(self):
        self._database.add_event(self._event)
        self._non_flushed_event_count += 1

    # ----------------------------------------------------------------------
    def _flush_queued_events(self, force=False):
        # check if necessary and abort if not
        if not force and not self._queued_event_interval_reached() and not self._queued_event_count_reached():
            return

        self._clear_flush_event()

        try:
            queued_events = self._database.get_queued_events()
        except DatabaseLockedError as e:
            self._safe_log(
                u'debug',
                u'Database is locked, will try again later (queue length %d)',
                self._queue.qsize(),
                exc=e)
            return  # try again later
        except Exception as e:
            # just log the exception and hope we can recover from the error
            self._safe_log(u'exception', u'Error retrieving queued events: %s', e, exc=e)
            return

        if queued_events:
            try:
                events = [event['event_text'] for event in queued_events]
                self._send_events(events)
            except Exception as e:
                self._safe_log(
                    u'exception',
                    u'An error occurred while sending events: %s',
                    e,
                    exc=e)
                self._database.requeue_queued_events(queued_events)
            else:
                self._delete_queued_events_from_database()
                self._reset_flush_counters()

    # ----------------------------------------------------------------------
    def _delete_queued_events_from_database(self):
        try:
            self._database.delete_queued_events()
        except DatabaseLockedError:
            pass  # nothing to handle; if it fails, we delete those events in a later run

    # ----------------------------------------------------------------------
    def _queued_event_interval_reached(self):
        delta = datetime.now() - self._last_event_flush_date
        return delta.total_seconds() > constants.QUEUED_EVENTS_FLUSH_INTERVAL

    # ----------------------------------------------------------------------
    def _queued_event_count_reached(self):
        return self._non_flushed_event_count > constants.QUEUED_EVENTS_FLUSH_COUNT

    # ----------------------------------------------------------------------
    def _send_events(self, events):
        self._transport.send(events)

    # ----------------------------------------------------------------------
    def _log_general_error(self, exc):
        self._safe_log(u'exception', u'An unexpected error occurred: %s', exc, exc=exc)

    # ----------------------------------------------------------------------
    def _safe_log(self, log_level, message, *args, **kwargs):
        # once shutdown has been requested we can no longer log via the logging subsystem
        if self._shutdown_requested():
            safe_log_via_print(log_level, message, *args, **kwargs)
        else:
            rate_limit_allowed = self._rate_limit_check(kwargs)
            if rate_limit_allowed <= 0:
                return  # skip further logging due to rate limiting
            elif rate_limit_allowed == 1:
                # extend the message to indicate future rate limiting
                message = \
                    u'{} (rate limiting effective, ' \
                    'further identical messages will be limited)'.format(message)

            self._safe_log_impl(log_level, message, *args, **kwargs)

    # ----------------------------------------------------------------------
    def _rate_limit_check(self, kwargs):
        exc = kwargs.pop('exc', None)
        if self._rate_limit_strategy is not None and exc is not None:
            key = self._factor_rate_limit_key(exc)
            # query the current counter for the caller
            _, remaining = self._rate_limit_strategy.get_window_stats(self._rate_limit_item, key)
            # increase the rate limit counter for the key
            self._rate_limit_strategy.hit(self._rate_limit_item, key)
            return remaining

        return 2  # any value greater than 1 means allowed

    # ----------------------------------------------------------------------
    def _factor_rate_limit_key(self, exc):
        module_name = getattr(exc, '__module__', '__no_module__')
        class_name = exc.__class__.__name__
        key_items = [module_name, class_name]
        if hasattr(exc, 'errno') and isinstance(exc.errno, integer_types):
            # in case of socket.error, include the errno as rate limiting key
            key_items.append(str(exc.errno))
        return '.'.join(key_items)

    # ----------------------------------------------------------------------
    def _safe_log_impl(self, log_level, message, *args, **kwargs):
        log_func = getattr(self._logger, log_level)
        log_func(message, *args, **kwargs)

    # ----------------------------------------------------------------------
    def _warn_about_non_empty_queue_on_shutdown(self):
        queue_size = self._queue.qsize()
        if queue_size:
            self._safe_log(
                'warn',
                u'Non-empty queue while shutting down ({} events pending). '
                u'This indicates a previous error.'.format(queue_size),
                extra=dict(queue_size=queue_size))
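
A minimal, self-contained sketch of the key idea in _factor_rate_limit_key above (standard library only; the free function is hypothetical): exceptions with the same module, class and, where available, errno collapse onto a single rate-limit key, so repeated identical failures can be throttled together.

import socket

def factor_rate_limit_key(exc):
    # build "module.Class[.errno]" so identical failures share one key
    module_name = getattr(exc, '__module__', '__no_module__')
    key_items = [module_name, exc.__class__.__name__]
    if isinstance(getattr(exc, 'errno', None), int):
        # OS/socket errors also contribute their errno to the key
        key_items.append(str(exc.errno))
    return '.'.join(key_items)

print(factor_rate_limit_key(ValueError('boom')))             # builtins.ValueError
print(factor_rate_limit_key(socket.error(111, 'refused')))   # builtins.OSError.111
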
Example #22
0
class Publisher(PublisherServerBase):

    def __init__(self, use_nanoconfig_service, publisher_endpoint,
                 nanoconfig_service_endpoint, nanoconfig_update_endpoint,
                 nanoconfig_profile, metrics_store, max_queue_size,
                 max_worker, min_worker=5):
        """
        :param use_nanoconfig_service: Indicates whether or not it should use a
            nanoconfig service
        :type use_nanoconfig_service: bool
        :param publisher_endpoint: Publisher server URI
        :type publisher_endpoint: str
        :param nanoconfig_service_endpoint: Nanoconfig service URI
        :type nanoconfig_service_endpoint: str
        :param nanoconfig_update_endpoint: Nanoconfig update service URI
        :type nanoconfig_update_endpoint: str
        :param nanoconfig_profile: Nanoconfig profile URI
        :type nanoconfig_profile: str
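        :param metrics_store: Store handed to each worker for processing
            received messages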
        :param max_queue_size: Max size for the message queue
        :type max_queue_size: int
        :param max_worker: Max number of workers to be spawned at a given time
        :type max_worker: int
        :param min_worker: Min number of workers to be spawned at a given time
        :type min_worker: int
        """
        super(Publisher, self).__init__(
            use_nanoconfig_service, publisher_endpoint,
            nanoconfig_service_endpoint, nanoconfig_update_endpoint,
            nanoconfig_profile
        )
        self.max_queue_size = max_queue_size
        self.metrics_store = metrics_store
        self.min_worker = min_worker
        self.max_worker = max_worker

        self.msg_queue = Queue(self.max_queue_size)
        self.workers = []

    @property
    def num_workers(self):
        return len(self.workers)

    def on_receive(self, msg):
        LOG.debug('[Publisher] Queue msg size = %s | workers = %s',
                  self.msg_queue.qsize(), self.num_workers)
        try:
            self.check_workers_alive()
            self.adjust_pool_size()
        except OSError as exc:
            LOG.exception(exc)
            LOG.error("[Publisher] Error upon receiving a message")

        self.msg_queue.put(msg)

    def check_workers_alive(self):
        # iterate over a copy: start_worker() appends to self.workers, and we
        # must not loop over workers created during this pass
        worker_items = self.workers[:]
        for worker_thread in worker_items:
            if not worker_thread.is_alive():
                self.workers.remove(worker_thread)
                self.start_worker()

    def adjust_pool_size(self):
        needed_size = self.msg_queue.qsize() + self.min_worker
        if abs(needed_size - self.num_workers) > self.min_worker * 2:
            LOG.debug(("[Publisher] Auto adjust pool size needed size is `%s` "
                       "and the current size is `%s`"),
                      needed_size, self.num_workers)
            while self.num_workers > min(self.min_worker, needed_size):
                self.stop_worker()
            # Create enough, but not too many
            while self.num_workers < min(self.max_worker, needed_size):
                self.start_worker()

    def start_worker(self):
        LOG.debug("[Publisher] starting worker")
        worker = Worker(self.msg_queue, self.metrics_store)
        worker.start()
        self.workers.append(worker)

    def stop_worker(self):
        if self.num_workers:
            LOG.debug("[Publisher] stopping worker")
            worker = self.workers.pop(-1)  # Pops the last worker
            worker.stop()

    def stop(self):
        super(Publisher, self).stop()
        join_threads = []
        # self.workers is a list, so iterate over the workers directly and
        # stop each one in its own thread so they shut down in parallel
        for worker in self.workers:
            t = Thread(target=worker.stop)
            t.start()
            join_threads.append(t)
        for join_thread in join_threads:
            join_thread.join()
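
The resizing rule in adjust_pool_size is easiest to check in isolation. A hedged, self-contained sketch of the same arithmetic as a pure function (stop_worker/start_worker are simulated by counter updates):

def target_pool_size(queue_size, current, min_worker, max_worker):
    # target is queue backlog plus the minimum pool, with a dead band of
    # min_worker * 2 around the current size to avoid churn
    needed = queue_size + min_worker
    if abs(needed - current) <= min_worker * 2:
        return current          # within tolerance: leave the pool alone
    while current > min(min_worker, needed):
        current -= 1            # stands in for stop_worker()
    while current < min(max_worker, needed):
        current += 1            # stands in for start_worker()
    return current

assert target_pool_size(queue_size=0, current=30, min_worker=5, max_worker=50) == 5
assert target_pool_size(queue_size=20, current=5, min_worker=5, max_worker=50) == 25

Note that, like the loops above, an oversized or undersized pool is first shrunk toward min_worker and then regrown toward the needed size, rather than topped up incrementally.
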
Example #23
0
class QueueWorker(BaseWorker):
    TIMEOUT_ATTEMPTS = 5
    QUEUE_SIZE = -1  # inf
    END_EVENT = object()
    NAME = 'polyaxon.QueueWorker'

    def __init__(self, timeout=None, queue_size=None):
        super(QueueWorker, self).__init__()
        self._queue = Queue(queue_size or self.QUEUE_SIZE)
        self._timeout = timeout if timeout is not None else settings.TIMEOUT

    def atexit(self):
        with self._lock:
            if not self.is_alive():
                return

            self._queue.put_nowait(self.END_EVENT)

            def timeout_join(timeout, queue):
                end = time() + timeout
                queue.all_tasks_done.acquire()
                try:
                    while queue.unfinished_tasks:
                        current_timeout = end - time()
                        if current_timeout <= 0:
                            # timed out
                            return False

                        queue.all_tasks_done.wait(timeout=current_timeout)

                    return True

                finally:
                    queue.all_tasks_done.release()

            # start with a short wait to see whether the queue drains quickly
            timeout = min(settings.MIN_TIMEOUT,
                          self._timeout / self.TIMEOUT_ATTEMPTS)
            if timeout_join(timeout=timeout, queue=self._queue):
                timeout = 0
            else:
                # the queue still has messages; retry below with a longer timeout
                size = self._queue.qsize()

                if not settings.IN_CLUSTER:
                    print(
                        'Polyaxon %s is attempting to send %i pending messages'
                        % (self.NAME, size))
                    print('Waiting up to {} seconds'.format(self._timeout))
                    if os.name == 'nt':
                        print('Press Ctrl-Break to quit')
                    else:
                        print('Press Ctrl-C to quit')

            sleep(settings.MIN_TIMEOUT)  # Allow tasks to get executed
            while timeout > 0 and not timeout_join(timeout=timeout,
                                                   queue=self._queue):
                timeout = min(timeout + self._timeout / self.TIMEOUT_ATTEMPTS,
                              self._timeout - timeout)

            size = self._queue.qsize()
            if size > 0:
                print(
                    'Polyaxon %s timed out and did not manage to send %i messages'
                    % (self.NAME, size))

            self._thread = None

    def stop(self, timeout=None):
        with self._lock:
            if self._thread:
                self._queue.put_nowait(self.END_EVENT)
                self._thread.join(timeout=timeout)
                self._thread = None
                self._thread_for_pid = None

    def queue(self, callback, *args, **kwargs):
        self.is_running()  # BaseWorker hook, presumably ensures the worker thread is started
        self._queue.put_nowait((callback, args, kwargs))

    def _target(self):
        while True:
            record = self._queue.get()
            try:
                if record is self.END_EVENT:
                    break
                callback, args, kwargs = record
                try:
                    callback(*args, **kwargs)
                except Exception:
                    logger.error('Failed processing job', exc_info=True)
            finally:
                self._queue.task_done()

            sleep(0)
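
The deadline-bounded join inside atexit above is a reusable pattern on its own. A standalone sketch, assuming only the standard library (all_tasks_done and unfinished_tasks are Queue internals that the code above already relies on):

from queue import Queue
from time import time

def timeout_join(queue, timeout):
    # wait until queue.join() would return, but never longer than `timeout`
    deadline = time() + timeout
    with queue.all_tasks_done:
        while queue.unfinished_tasks:
            remaining = deadline - time()
            if remaining <= 0:
                return False    # timed out with work still pending
            queue.all_tasks_done.wait(timeout=remaining)
    return True

q = Queue()
q.put('job')
print(timeout_join(q, 0.1))     # False: nothing has called task_done() yet
q.get()
q.task_done()
print(timeout_join(q, 0.1))     # True: all queued work is accounted for
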
Example #24
0
class CachePipeline(object):
    def __init__(self, spider, cache):
        self.spider = spider
        self.cache = cache
        self.idle_event = Event()
        self.queue_size = 100
        self.input_queue = Queue()
        self.result_queue = Queue()

        self.thread = Thread(target=self.thread_worker)
        self.thread.daemon = True
        self.thread.start()

    def has_free_resources(self):
        return (self.input_queue.qsize() < self.queue_size
                and self.result_queue.qsize() < self.queue_size)

    def is_idle(self):
        return self.idle_event.is_set()

    def thread_worker(self):
        while True:
            self.idle_event.clear()
            try:
                action, data = self.input_queue.get(block=False)
            except Empty:
                self.idle_event.set()
                time.sleep(0.1)
                self.idle_event.clear()
            else:
                assert action in ('load', 'save')
                if action == 'load':
                    task, grab = data
                    result = None
                    if self.is_cache_loading_allowed(task, grab):
                        result = self.load_from_cache(task, grab)
                    if result:
                        self.result_queue.put(result)
                    else:
                        self.spider.submit_task_to_transport(task, grab)
                elif action == 'save':
                    task, grab = data
                    if self.is_cache_saving_allowed(task, grab):
                        with self.spider.timer.log_time('cache'):
                            with self.spider.timer.log_time('cache.write'):
                                self.cache.save_response(task.url, grab)

    def is_cache_loading_allowed(self, task, grab):
        # loading from cache is not allowed when:
        # 1) the cached data should be refreshed
        # 2) the cache is disabled for that task
        # 3) the request method is not cacheable (only GET is)
        return (not task.get('refresh_cache', False)
                and not task.get('disable_cache', False)
                and grab.detect_request_method() == 'GET')

    def is_cache_saving_allowed(self, task, grab):
        """
        Check if network transport result could
        be saved to cache layer.

        res: {ok, grab, grab_config_backup, task, emsg}
        """

        if grab.request_method == 'GET':
            if not task.get('disable_cache'):
                if self.spider.is_valid_network_response_code(
                        grab.response.code, task):
                    return True
        return False


    def load_from_cache(self, task, grab):
        with self.spider.timer.log_time('cache'):
            with self.spider.timer.log_time('cache.read'):
                cache_item = self.cache.get_item(
                    grab.config['url'], timeout=task.cache_timeout)
                if cache_item is None:
                    return None
                else:
                    with self.spider.timer.log_time('cache.read.prepare_request'):
                        grab.prepare_request()
                    with self.spider.timer.log_time('cache.read.load_response'):
                        self.cache.load_response(grab, cache_item)

                    grab.log_request('CACHED')
                    self.spider.stat.inc('spider:request-cache')

                    return {'ok': True, 'task': task, 'grab': grab,
                            'grab_config_backup': grab.dump_config(),
                            'emsg': None}
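
The idle_event handshake in thread_worker reduces to a few lines. A hypothetical sketch (the stop sentinel is added for the demo; the original loop runs forever) of a consumer that advertises idleness only while its input queue stays empty:

from queue import Queue, Empty
from threading import Event, Thread
import time

input_queue = Queue()
idle_event = Event()

def worker():
    while True:
        try:
            item = input_queue.get(block=False)
        except Empty:
            idle_event.set()        # nothing to do: advertise idleness
            time.sleep(0.05)
            idle_event.clear()
        else:
            if item is None:        # demo-only stop sentinel
                return
            print('processing', item)

thread = Thread(target=worker, daemon=True)
thread.start()
input_queue.put('task-1')
print('went idle:', idle_event.wait(timeout=1.0))  # True once the queue drains
input_queue.put(None)
thread.join()
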
Example #25
0
class PluginRunner(object):
    class State(object):
        RUNNING = 'RUNNING'
        STOPPED = 'STOPPED'

    def __init__(self,
                 name,
                 runtime_path,
                 plugin_path,
                 logger,
                 command_timeout=5.0,
                 state_callback=None):
        self.runtime_path = runtime_path
        self.plugin_path = plugin_path
        self.command_timeout = command_timeout

        self._logger = logger
        self._cid = 0
        self._proc = None  # type: Optional[subprocess.Popen[bytes]]
        self._running = False
        self._process_running = False
        self._command_lock = Lock()
        self._response_queue = Queue()  # type: Queue[Dict[str,Any]]
        self._writer = None  # type: Optional[PluginIPCWriter]
        self._reader = None  # type: Optional[PluginIPCReader]
        self._state_callback = state_callback  # type: Optional[Callable[[str, str], None]]

        self.name = name
        self.version = None
        self.interfaces = None

        self._decorators_in_use = {}
        self._exposes = []
        self._metric_collectors = []
        self._metric_receivers = []

        self._async_command_thread = None
        self._async_command_queue = None  # type: Optional[Queue[Optional[Dict[str, Any]]]]

        self._commands_executed = 0
        self._commands_failed = 0

        self.__collector_runs = {}  # type: Dict[str,float]

    def start(self):
        # type: () -> None
        if self._running:
            raise Exception('PluginRunner is already running')

        self.logger('[Runner] Starting')

        python_executable = sys.executable
        if python_executable is None or len(python_executable) == 0:
            python_executable = '/usr/bin/python'

        self._proc = subprocess.Popen([
            python_executable, 'runtime.py', 'start_plugin', self.plugin_path
        ],
                                      stdin=subprocess.PIPE,
                                      stdout=subprocess.PIPE,
                                      stderr=None,
                                      cwd=self.runtime_path,
                                      close_fds=True)
        assert self._proc.stdout, 'Plugin stdout not available'
        self._process_running = True

        self._commands_executed = 0
        self._commands_failed = 0

        assert self._proc.stdin, 'Plugin stdin not defined'
        self._writer = PluginIPCWriter(stream=self._proc.stdin)
        self._reader = PluginIPCReader(stream=self._proc.stdout,
                                       logger=lambda message, ex: self.logger(
                                           '{0}: {1}'.format(message, ex)),
                                       command_receiver=self._process_command,
                                       name=self.name)
        self._reader.start()

        start_out = self._do_command('start', timeout=180)
        self.name = start_out['name']
        self.version = start_out['version']
        self.interfaces = start_out['interfaces']

        self._decorators_in_use = start_out['decorators']
        self._exposes = start_out['exposes']
        self._metric_collectors = start_out['metric_collectors']
        self._metric_receivers = start_out['metric_receivers']

        exception = start_out.get('exception')
        if exception is not None:
            raise RuntimeError(exception)

        self._async_command_queue = Queue(1000)
        self._async_command_thread = BaseThread(
            name='plugincmd{0}'.format(self.plugin_path),
            target=self._perform_async_commands)
        self._async_command_thread.daemon = True
        self._async_command_thread.start()

        self._running = True
        if self._state_callback is not None:
            self._state_callback(self.name, PluginRunner.State.RUNNING)
        self.logger('[Runner] Started')

    def logger(self, message):
        # type: (str) -> None
        self._logger(message)
        logger_.info('Plugin {0} - {1}'.format(self.name, message))

    def get_webservice(self, webinterface):
        # type: (WebInterface) -> Service
        return Service(self, webinterface)

    def is_running(self):
        # type: () -> bool
        return self._running

    def stop(self):
        # type: () -> None
        if self._process_running:
            self._running = False

            self.logger('[Runner] Sending stop command')
            try:
                self._do_command('stop')
            except Exception as exception:
                self.logger(
                    '[Runner] Exception during stopping plugin: {0}'.format(
                        exception))
            time.sleep(0.1)

            if self._reader:
                self._reader.stop()
            self._process_running = False
            if self._async_command_queue is not None:
                self._async_command_queue.put(
                    None)  # Triggers an abort on the read thread

            if self._proc and self._proc.poll() is None:
                self.logger('[Runner] Terminating process')
                try:
                    self._proc.terminate()
                except Exception as exception:
                    self.logger(
                        '[Runner] Exception during terminating plugin: {0}'.
                        format(exception))
                time.sleep(0.5)

                if self._proc.poll() is None:
                    self.logger('[Runner] Killing process')
                    try:
                        self._proc.kill()
                    except Exception as exception:
                        self.logger(
                            '[Runner] Exception during killing plugin: {0}'.
                            format(exception))

            if self._state_callback is not None:
                self._state_callback(self.name, PluginRunner.State.STOPPED)
            self.logger('[Runner] Stopped')

    def process_input_status(self, data, action_version=1):
        if action_version in [1, 2]:
            if action_version == 1:
                payload = {'status': data}
            else:
                event_json = data.serialize()
                payload = {'event': event_json}
            self._do_async(action='input_status',
                           payload=payload,
                           should_filter=True,
                           action_version=action_version)
        else:
            self.logger('Input status version {} not supported.'.format(
                action_version))

    def process_output_status(self, data, action_version=1):
        if action_version in [1, 2]:
            if action_version == 1:
                payload = {'status': data}
            else:
                event_json = data.serialize()
                payload = {'event': event_json}
            self._do_async(action='output_status',
                           payload=payload,
                           should_filter=True,
                           action_version=action_version)
        else:
            self.logger('Output status version {} not supported.'.format(
                action_version))

    def process_shutter_status(self, data, action_version=1):
        if action_version in [1, 2, 3]:
            if action_version == 1:
                payload = {'status': data}
            elif action_version == 2:
                status, detail = data
                payload = {'status': {'status': status, 'detail': detail}}
            else:
                event_json = data.serialize()
                payload = {'event': event_json}
            self._do_async(action='shutter_status',
                           payload=payload,
                           should_filter=True,
                           action_version=action_version)
        else:
            self.logger('Shutter status version {} not supported.'.format(
                action_version))

    def process_ventilation_status(self, data, action_version=1):
        if action_version in [1]:
            event_json = data.serialize()
            payload = {'event': event_json}
            self._do_async(action='ventilation_status',
                           payload=payload,
                           should_filter=True,
                           action_version=action_version)
        else:
            self.logger('Ventilation status version {} not supported.'.format(
                action_version))

    def process_thermostat_status(self, data, action_version=1):
        if action_version in [1]:
            event_json = data.serialize()
            payload = {'event': event_json}
            self._do_async(action='thermostat_status',
                           payload=payload,
                           should_filter=True,
                           action_version=action_version)
        else:
            self.logger('Thermostat status version {} not supported.'.format(
                action_version))

    def process_thermostat_group_status(self, data, action_version=1):
        if action_version in [1]:
            event_json = data.serialize()
            payload = {'event': event_json}
            self._do_async(action='thermostat_group_status',
                           payload=payload,
                           should_filter=True,
                           action_version=action_version)
        else:
            self.logger(
                'Thermostat group status version {} not supported.'.format(
                    action_version))

    def process_event(self, code):
        self._do_async('receive_events', {'code': code}, should_filter=True)

    def collect_metrics(self):
        for mc in self._metric_collectors:
            try:
                now = time.time()
                (name, interval) = (mc['name'], mc['interval'])

                if self.__collector_runs.get(name, 0.0) < now - interval:
                    self.__collector_runs[name] = now
                    metrics = self._do_command('collect_metrics',
                                               {'name': name})['metrics']
                    for metric in metrics:
                        if metric is None:
                            continue
                        metric['source'] = self.name
                        yield metric
            except Exception as exception:
                self.logger(
                    '[Runner] Exception while collecting metrics {0}: {1}'.
                    format(exception, traceback.format_exc()))

    def get_metric_receivers(self):
        return self._metric_receivers

    def distribute_metrics(self, method, metrics):
        self._do_async('distribute_metrics', {
            'name': method,
            'metrics': metrics
        })

    def get_metric_definitions(self):
        return self._do_command('get_metric_definitions')['metric_definitions']

    def request(self, method, args=None, kwargs=None):
        if args is None:
            args = []
        if kwargs is None:
            kwargs = {}
        ret = self._do_command('request', {
            'method': method,
            'args': args,
            'kwargs': kwargs
        })
        if ret['success']:
            return ret['response']
        elif 'stacktrace' in ret:
            raise Exception('{0}: {1}'.format(ret['exception'],
                                              ret['stacktrace']))
        raise Exception(ret['exception'])

    def remove_callback(self):
        # type: () -> None
        self._do_command('remove_callback')

    def _process_command(self, response):
        # type: (Dict[str,Any]) -> None
        if not self._process_running:
            return
        assert self._proc, 'Plugin process not defined'
        exit_code = self._proc.poll()
        if exit_code is not None:
            self.logger(
                '[Runner] Stopped with exit code {0}'.format(exit_code))
            self._process_running = False
            return

        if response['cid'] == 0:
            self._handle_async_response(response)
        elif response['cid'] == self._cid:
            self._response_queue.put(response)
        else:
            self.logger(
                '[Runner] Received message with unknown cid: {0}'.format(
                    response))

    def _handle_async_response(self, response):
        # type: (Dict[str,Any]) -> None
        if response['action'] == 'logs':
            self.logger(response['logs'])
        else:
            self.logger('[Runner] Unknown async message: {0}'.format(response))

    def _do_async(self,
                  action,
                  payload,
                  should_filter=False,
                  action_version=1):
        # type: (str, Dict[str,Any], bool, int) -> None
        has_receiver = False
        for decorator_name, decorator_versions in six.iteritems(
                self._decorators_in_use):
            # the action version is linked to a specific decorator version
            has_receiver |= (action == decorator_name
                             and action_version in decorator_versions)
        if not self._process_running or (should_filter and not has_receiver):
            return
        try:
            assert self._async_command_queue, 'Command Queue not defined'
            self._async_command_queue.put(
                {
                    'action': action,
                    'payload': payload,
                    'action_version': action_version
                },
                block=False)
        except Full:
            self.logger('Async action cannot be queued, queue is full')

    def _perform_async_commands(self):
        # type: () -> None
        while self._process_running:
            try:
                # use a timeout so the loop regularly re-checks whether the plugin has stopped
                assert self._async_command_queue, 'Command Queue not defined'
                command = self._async_command_queue.get(block=True, timeout=10)
                if command is None:
                    continue  # Used to exit this thread
                self._do_command(command['action'],
                                 payload=command['payload'],
                                 action_version=command['action_version'])
            except Empty:
                self._do_async('ping', {})
            except Exception as exception:
                self.logger(
                    '[Runner] Failed to perform async command: {0}'.format(
                        exception))

    def _do_command(self,
                    action,
                    payload=None,
                    timeout=None,
                    action_version=1):
        # type: (str, Dict[str,Any], Optional[float], int) -> Dict[str,Any]
        if payload is None:
            payload = {}
        self._commands_executed += 1
        if timeout is None:
            timeout = self.command_timeout

        if not self._process_running:
            raise Exception('Plugin was stopped')

        with self._command_lock:
            try:
                command = self._create_command(action, payload, action_version)
                assert self._writer, 'Plugin stdin not defined'
                self._writer.write(command)
            except Exception:
                self._commands_failed += 1
                raise

            try:
                response = self._response_queue.get(block=True,
                                                    timeout=timeout)
                while response['cid'] != self._cid:
                    response = self._response_queue.get(block=False)
                exception = response.get('_exception')
                if exception is not None:
                    raise RuntimeError(exception)
                return response
            except Empty:
                metadata = ''
                if action == 'request':
                    metadata = ' {0}'.format(payload['method'])
                if self._running:
                    self.logger(
                        '[Runner] No response within {0}s ({1}{2})'.format(
                            timeout, action, metadata))
                self._commands_failed += 1
                raise Exception('Plugin did not respond')

    def _create_command(self, action, payload=None, action_version=1):
        # type: (str, Dict[str,Any], int) -> Dict[str,Any]
        if payload is None:
            payload = {}
        self._cid += 1
        command = {
            'cid': self._cid,
            'action': action,
            'action_version': action_version
        }
        command.update(payload)
        return command

    def error_score(self):
        # type: () -> float
        if self._commands_executed == 0:
            return 0.0
        else:
            score = float(self._commands_failed) / self._commands_executed
            self._commands_failed = 0
            self._commands_executed = 0
            return score

    def get_queue_length(self):
        # type: () -> int
        if self._async_command_queue is None:
            return 0
        return self._async_command_queue.qsize()
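
The response matching in _do_command is a small correlation-id pattern: stale replies left over from commands that previously timed out are drained until the current cid appears. A self-contained sketch of just that step, using hypothetical response dicts:

from queue import Queue

def wait_for_response(response_queue, cid, timeout):
    # block for the first reply, then discard stale cids without blocking;
    # raises queue.Empty if the expected cid never shows up
    response = response_queue.get(block=True, timeout=timeout)
    while response['cid'] != cid:
        response = response_queue.get(block=False)
    return response

q = Queue()
q.put({'cid': 1, 'success': False})  # stale reply from a timed-out command
q.put({'cid': 2, 'success': True})   # reply for the current command
print(wait_for_response(q, cid=2, timeout=1.0))  # {'cid': 2, 'success': True}
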
Example #26
0
class Hue(OMPluginBase):

    name = 'Hue'
    version = '1.1.2'
    interfaces = [('config', '1.0')]

    config_description = [{
        'name': 'api_url',
        'type': 'str',
        'description': 'The API URL of the Hue Bridge device. E.g. http://192.168.1.2/api'
    }, {
        'name': 'username',
        'type': 'str',
        'description': 'Hue Bridge generated username.'
    }, {
        'name': 'poll_frequency',
        'type': 'int',
        'description': 'The frequency used to pull the status of all outputs from the Hue bridge in seconds (0 means never)'
    }, {
        'name': 'output_mapping',
        'type': 'section',
        'description': 'Mapping between OpenMotics Virtual Outputs/Dimmers and Hue Outputs',
        'repeat': True,
        'min': 0,
        'content': [{
            'name': 'output_id',
            'type': 'int'
        }, {
            'name': 'hue_output_id',
            'type': 'int'
        }]
    }]

    default_config = {
        'api_url': 'http://hue/api',
        'username': '',
        'poll_frequency': 60
    }

    def __init__(self, webinterface, gateway_logger):
        self.setup_logging(log_function=gateway_logger)
        super(Hue, self).__init__(webinterface, logger)
        logger.info('Starting Hue plugin %s ...', self.version)

        self.discover_hue_bridges()

        self._config = self.read_config(Hue.default_config)
        self._config_checker = PluginConfigChecker(Hue.config_description)

        self._read_config()

        self._io_lock = Lock()
        self._output_event_queue = Queue(maxsize=256)

        logger.info("Hue plugin started")

    @staticmethod
    def setup_logging(log_function):  # type: (Callable) -> None
        logger.setLevel(logging.INFO)
        log_handler = PluginLogHandler(log_function=log_function)
        # some elements like time and name are added by the plugin runtime already
        # formatter = logging.Formatter('%(asctime)s - %(name)s - %(threadName)s - %(levelname)s - %(message)s')
        formatter = logging.Formatter(
            '%(threadName)s - %(levelname)s - %(message)s')
        log_handler.setFormatter(formatter)
        logger.addHandler(log_handler)

    def _read_config(self):
        self._api_url = self._config['api_url']
        self._output_mapping = self._config.get('output_mapping', [])
        self._output = self._create_output_object()
        self._hue = self._create_hue_object()
        self._username = self._config['username']
        self._poll_frequency = self._config['poll_frequency']

        self._endpoint = '{0}/{1}/{{0}}'.format(self._api_url, self._username)

        self._enabled = self._api_url != '' and self._username != ''
        logger.info(
            'Hue is {0}'.format('enabled' if self._enabled else 'disabled'))

    def _create_output_object(self):
        # create an object with the OM output IDs as the keys and hue light IDs as the values
        output_object = {}
        for entry in self._output_mapping:
            output_object[entry['output_id']] = entry['hue_output_id']
        return output_object

    def _create_hue_object(self):
        # create an object with the hue light IDs as the keys and OM output IDs as the values
        hue_object = {}
        for entry in self._output_mapping:
            hue_object[entry['hue_output_id']] = entry['output_id']
        return hue_object

    @output_status(version=2)
    def output_status(self, output_event):
        if self._enabled is True:
            try:
                output_id = output_event['id']
                state = output_event['status'].get('on')
                dimmer_level = output_event['status'].get('value')
                hue_light_id = self._output.get(output_id)
                if hue_light_id is not None:
                    logger.info(
                        'Switching output %s (hue id: %s) %s (dimmer: %s)',
                        output_id, hue_light_id, 'ON' if state else 'OFF',
                        dimmer_level)
                    data = (hue_light_id, state, dimmer_level)
                    self._output_event_queue.put(data)
                else:
                    logger.debug('Ignoring output %s, because it is not Hue',
                                 output_id)
            except Exception as ex:
                logger.exception(
                    'Error processing output_status event: {0}'.format(ex))

    def _send(self, hue_light_id, state, dimmer_level):
        try:
            state = {'on': state}
            if dimmer_level is not None:
                state.update(
                    {'bri': self._dimmerLevelToBrightness(dimmer_level)})
            self._setLightState(hue_light_id, state)
        except Exception as ex:
            logger.exception('Error sending command to output with hue id: %s',
                             hue_light_id)

    def _getLightState(self, hue_light_id):
        try:
            start = time.time()
            # _endpoint is '<api_url>/<username>/{0}', so a second format()
            # call fills in the resource path for this light
            response = requests.get(
                url=self._endpoint.format('lights/{0}').format(hue_light_id))
            if response.status_code == 200:
                hue_light = response.json()
                logger.info('Getting output state for hue id: %s took %ss',
                            hue_light_id, round(time.time() - start, 2))
                return hue_light
            else:
                logger.warning('Failed to pull state for hue id: %s',
                               hue_light_id)
                return False
        except Exception as ex:
            logger.exception('Error while getting output state for hue id: %s',
                             hue_light_id)

    def _setLightState(self, hue_light_id, state):
        try:
            start = time.time()
            response = requests.put(url=self._endpoint.format(
                'lights/{0}/state').format(hue_light_id),
                                    data=json.dumps(state))
            if response.status_code == 200:
                result = response.json()
                if result[0].get('success') is None:
                    logger.info(
                        'Setting output state for Hue output {0} returned unexpected result. Response: {1} ({2})'
                        .format(hue_light_id, response.text,
                                response.status_code))
                    return False
                logger.info('Setting output state for hue id: %s took %ss',
                            hue_light_id, round(time.time() - start, 2))
                return True
            else:
                logger.error(
                    'Setting output state for hue id: %s failed. Response: %s (%s)',
                    hue_light_id, response.text, response.status_code)
                return False
        except Exception as ex:
            logger.exception(
                'Error while setting output state for hue id: %s to %s',
                hue_light_id, json.dumps(state))

    def import_remote_state(self):
        if not self._output_event_queue.empty():
            logger.info(
                'Ignoring syncing remote state because we still need to process %s output events',
                self._output_event_queue.qsize())
        else:
            try:
                self._import_lights_state()
            except Exception as ex:
                logger.exception(
                    'Error while getting state for all Hue outputs')
            try:
                self._import_sensors_state()
            except Exception as ex:
                logger.exception(
                    'Error while getting state for all Hue sensors')

    def _import_lights_state(self):
        logger.debug(
            'Syncing remote state for all outputs from the Hue bridge')
        hue_lights = self._getAllLightsState()
        for output in self._output_mapping:
            output_id = output['output_id']
            hue_light_id = str(output['hue_output_id'])
            hue_light_state = hue_lights.get(hue_light_id)
            if hue_light_state is not None:
                if hue_light_state.get('on', False):
                    result = json.loads(
                        self.webinterface.set_output(
                            None, str(output_id), 'true',
                            str(hue_light_state['dimmer_level'])))
                else:
                    result = json.loads(
                        self.webinterface.set_output(None, str(output_id),
                                                     'false'))
                if result['success'] is False:
                    logger.error(
                        'Error when updating output %s (hue id: %s): %s',
                        output_id, hue_light_id, result['msg'])
            else:
                logger.warning(
                    'Output %s (hue id:  %s) not found on Hue bridge',
                    output_id, hue_light_id)

    def _import_sensors_state(self):
        logger.debug(
            'Syncing remote state for all sensors from the Hue bridge')
        known_sensors = self._get_known_sensors()
        hue_sensors = self._getAllSensorsState()

        for hue_sensor_id, sensor in hue_sensors.items():
            sensor_external_id = sensor['external_id']
            if sensor_external_id not in known_sensors.keys():
                name = 'Hue Sensor {}'.format(hue_sensor_id)
                om_sensor_id = self._register_sensor(name, sensor_external_id)
            else:
                om_sensor_id = known_sensors[sensor_external_id]
            value = float(sensor.get('value'))
            if om_sensor_id is not None:
                self._update_sensor(om_sensor_id, value)
            else:
                logger.error('Hue sensor %s (%s) not found', hue_sensor_id,
                             sensor_external_id)

    def _get_known_sensors(self):
        response = self.webinterface.get_sensor_configurations()
        data = json.loads(response)
        return {
            x['external_id']: x['id']
            for x in data['config']
            if x.get('source', {}).get('name') == Hue.name
            and x['external_id'] not in [None, '']
        }

    def _getAllSensorsState(self):
        hue_sensors = {}
        response = requests.get(url=self._endpoint.format('sensors'))
        if response.status_code == 200:
            for hue_sensor_id, data in response.json().items():
                if data.get('type') == 'ZLLTemperature':
                    hue_sensors[hue_sensor_id] = self._parseSensorObject(
                        hue_sensor_id, data, sensor_type='temperature')
        else:
            logger.error('Failed to pull state for all sensors (HTTP %s)',
                         response.status_code)
        return hue_sensors

    def _getAllLightsState(self):
        hue_lights = {}
        response = requests.get(url=self._endpoint.format('lights'))
        if response.status_code == 200:
            for hue_light_id, data in response.json().items():
                hue_lights[hue_light_id] = self._parseLightObject(
                    hue_light_id, data)
        else:
            logger.error('Failed to pull state for all outputs (HTTP %s)',
                         response.status_code)
        return hue_lights

    def _parseLightObject(self, hue_light_id, hue_light_object):
        light = {'id': hue_light_id}
        try:
            light.update({
                'name': hue_light_object['name'],
                'on': hue_light_object['state'].get('on', False),
                'brightness': hue_light_object['state'].get('bri', 254)
            })
            light['dimmer_level'] = self._brightnessToDimmerLevel(
                light['brightness'])
        except Exception as ex:
            logger.exception('Error while parsing Hue light %s',
                             hue_light_object)
        return light

    def _parseSensorObject(self,
                           hue_sensor_id,
                           hue_sensor_object,
                           sensor_type='temperature'):
        sensor = {'id': hue_sensor_id}
        try:
            value = hue_sensor_object['state'][sensor_type]
            if sensor_type == 'temperature':
                value /= 100.0
            sensor.update({
                'external_id': hue_sensor_object['uniqueid'],
                'name': hue_sensor_object['name'],
                'type': hue_sensor_object['type'],
                'value': value
            })

        except Exception as ex:
            logger.exception('Error while parsing Hue sensor %s',
                             hue_sensor_object)
        return sensor

    def _brightnessToDimmerLevel(self, brightness):
        return int(round(brightness / 2.54))

    def _dimmerLevelToBrightness(self, dimmer_level):
        return int(round(dimmer_level * 2.54))

    @background_task
    def run(self):
        if self._enabled:
            event_processor = Thread(target=self.output_event_processor,
                                     name='output_event_processor')
            event_processor.start()
            self.log_remote_asset_list()
            self.start_state_poller()

    def log_remote_asset_list(self):
        hue_lights = self._getAllLightsState()
        for hue_id, hue_light in hue_lights.items():
            logger.info('Discovered hue output %s (hue id: %s)',
                        hue_light.get('name'), hue_id)
        hue_sensors = self._getAllSensorsState()
        for hue_id, hue_sensor in hue_sensors.items():
            logger.info('Discovered hue sensor %s (hue id: %s)',
                        hue_sensor.get('name'), hue_id)

    def start_state_poller(self):
        while self._poll_frequency > 0:
            start = time.time()
            self.import_remote_state()
            # This loop will run approx. every 'poll_frequency' seconds
            sleep = self._poll_frequency - (time.time() - start)
            if sleep < 0:
                sleep = 1
            self.sleep(sleep)

    @om_expose
    def get_config_description(self):
        return json.dumps(Hue.config_description)

    @om_expose
    def get_config(self):
        return json.dumps(self._config)

    @om_expose
    def set_config(self, config):
        config = json.loads(config)
        for key in config:
            if isinstance(config[key], six.string_types):
                config[key] = str(config[key])
        self._config_checker.check_config(config)
        self._config = config
        self._read_config()
        self.write_config(config)
        return json.dumps({'success': True})

    def output_event_processor(self):
        while self._enabled:
            try:
                _latest_value_buffer = {}
                while True:
                    try:
                        hue_light_id, status, dimmer = self._output_event_queue.get(
                            block=True, timeout=1)
                        _latest_value_buffer[hue_light_id] = (
                            status, dimmer
                        )  # this will ensure only the latest value is taken
                    except Empty:
                        break
                for hue_light_id, (status,
                                   dimmer) in _latest_value_buffer.items():
                    self._send(hue_light_id, status, dimmer)
                    self.sleep(
                        0.1
                    )  # "throttle" requests to the bridge to avoid overloading
            except Exception as ex:
                if 'maintenance_mode' in str(ex):
                    logger.warning(
                        'System in maintenance mode. Processing paused for 1 minute.'
                    )
                    self.sleep(60)
                else:
                    logger.exception(
                        'Unexpected error processing output events')
                    self.sleep(10)

    def sleep(self, timer):
        now = time.time()
        expired = time.time() - now > timer
        while self._enabled and not expired:
            time.sleep(min(timer, 0.5))
            expired = time.time() - now > timer

    def discover_hue_bridges(self):
        try:
            response = requests.get(url='https://discovery.meethue.com/')
            if response.status_code == 200:
                hue_bridge_data = response.json()
                for hue_bridge in hue_bridge_data:
                    logger.info('Discovered hue bridge %s @ %s',
                                hue_bridge.get('id'),
                                hue_bridge.get('internalipaddress'))
            else:
                logger.warning('Failed to discover bridges on this network')
                return False
        except Exception as ex:
            logger.exception(
                'Error while discovering hue bridges on this network')

    def _register_sensor(self, name, external_id):
        logger.debug('Registering sensor with name %s and external_id %s',
                     name, external_id)
        try:
            config = {
                'name': name,
            }
            response = self.webinterface.sensor.register(
                external_id=external_id,
                physical_quantity='temperature',
                unit='celcius',
                config=config)
            return response.id
        except Exception as e:
            logger.warning(
                'Failed registering sensor with name %s and external_id %s with exception %s',
                name, external_id, str(e))
            return None

    def _update_sensor(self, sensor_id, value):
        logger.debug('Updating sensor %s with status %s', sensor_id, value)
        response = self.webinterface.sensor.set_status(sensor_id=sensor_id,
                                                       value=value)
        if response is None:
            logger.warning('Could not set the updated sensor value')
            return False
        return True
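
The coalescing trick in output_event_processor above merits a standalone illustration: drain everything currently queued and keep only the newest state per light, so a burst of toggles yields one bridge call per light. A minimal sketch with hypothetical event tuples:

from queue import Queue, Empty

def drain_latest(queue):
    # keep only the last (state, dimmer) seen per hue_light_id
    latest = {}
    while True:
        try:
            hue_light_id, state, dimmer = queue.get(block=False)
        except Empty:
            return latest
        latest[hue_light_id] = (state, dimmer)

q = Queue()
for event in [(1, True, 50), (1, False, None), (2, True, 100)]:
    q.put(event)
print(drain_latest(q))  # {1: (False, None), 2: (True, 100)}
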
Example #27
0
class eptWorkerFabric(object):
    """ tracks cache and settings for each fabric actively being monitored, also provides useful
        notification and push_event functions
    """
    def __init__(self, fabric):
        self.fabric = fabric
        self.start_ts = time.time()
        self.settings = eptSettings.load(fabric=fabric, settings="default")
        self.cache = eptCache(fabric)
        self.dns_cache = DNSCache()
        self.db = get_db()
        self.watcher_paused = False
        self.session = None
        self.notify_queue = None
        self.notify_thread = None
        self.init()

    def init(self):
        """ initialize settings after fabric settings as been loaded """
        # epm parser used with eptWorker for creating pseudo eevents
        self.ept_epm_parser = eptEpmEventParser(self.fabric,
                                                self.settings.overlay_vnid)
        # one time calculation for email address and syslog server (which requires valid port)
        self.email_address = self.settings.email_address
        self.syslog_server = self.settings.syslog_server
        self.syslog_port = self.settings.syslog_port
        if len(self.email_address) == 0:
            self.email_address = None
        if len(self.syslog_server) == 0:
            self.syslog_server = None
            self.syslog_port = None

    def close(self):
        """ stateful close when worker receives FABRIC_STOP for this fabric """
        if self.db is not None:
            self.db.client.close()
        if self.session is not None:
            self.session.close()
        if self.notify_thread is not None:
            self.notify_thread.exit()
        # remove all objects from notify queue
        if self.notify_queue is not None:
            try:
                logger.debug("clearing notify queue (size: %d)",
                             self.notify_queue.qsize())
                while not self.notify_queue.empty():
                    self.notify_queue.get()
            except Exception as e:
                logger.debug("Traceback:\n%s", traceback.format_exc())
                logger.error("failed to execute clear notify queue %s", e)

    def watcher_init(self):
        """ watcher process needs session object for mo sync and notify queue"""
        logger.debug("wf worker init for %s", self.fabric)
        self.notify_queue = Queue(maxsize=NOTIFY_QUEUE_MAX_SIZE)
        logger.debug("starting worker fabric apic session")
        self.session = get_apic_session(self.fabric)
        if self.session is None:
            logger.error("failed to get session object within worker fabric")

        # watcher will also send notifications within a background thread to ensure that
        # any delay in syslog or email does not backup other service events
        self.notify_thread = BackgroundThread(func=self.execute_notify,
                                              name="notify",
                                              count=0,
                                              interval=NOTIFY_INTERVAL)
        self.notify_thread.daemon = True
        self.notify_thread.start()

    def settings_reload(self):
        """ reload settings from db """
        logger.debug("reloading settings for %s", self.fabric)
        self.settings.reload()
        self.init()

    def get_uptime_delta_offset(self, delta=None):
        """ return difference between provided delta and current uptime. If the uptime_delta is 
            less than zero, return 0.  If no delta is provided, then return the uptime.
        """
        uptime = time.time() - self.start_ts
        if delta is None: return uptime
        uptime_delta = delta - uptime
        if uptime_delta > 0: return uptime_delta
        return 0

    def push_event(self, table, key, event, per_node=True):
        # wrapper to push an event to eptHistory events list.  set per_node to false to use
        # max_endpoint_event rotate length, else max_per_node_endpoint_events value is used
        if per_node:
            return push_event(
                self.db[table],
                key,
                event,
                rotate=self.settings.max_per_node_endpoint_events)
        else:
            return push_event(self.db[table],
                              key,
                              event,
                              rotate=self.settings.max_endpoint_events)

    def get_learn_type(self, vnid, flags=[]):
        # based on the provided vnid and flags, return the learn type for the endpoint:
        #   loopback - if loopback in flags
        #   psvi - if psvi in flags
        #   svi - represents an external SVI (should be set on all external svis)
        #   overlay if vnid is overlay vnid
        #   external if vnid is in eptVnid table with external set to true
        #   else returns 'epg' (default learn type)
        if "loopback" in flags:
            return "loopback"
        elif "psvi" in flags:
            return "psvi"
        elif "svi" in flags:
            return "external"
        elif vnid == self.settings.overlay_vnid:
            return "overlay"
        ept_vnid = self.cache.get_vnid_name(vnid, return_object=True)
        if ept_vnid is not None and ept_vnid.external:
            return "external"
        return "epg"

    def notification_enabled(self, notify_type):
        # return dict with email address, syslog server, syslog port for notify type. If not enabled,
        # then return None for each field. Set notify_type to 'any_email' or 'any_syslog' to force
        # test of a particular notification type
        ret = {
            "enabled": False,
            "email_address": None,
            "syslog_server": None,
            "syslog_port": None
        }
        if notify_type == "move":
            attr = ("notify_move_email", "notify_move_syslog")
        elif notify_type == "stale":
            attr = ("notify_stale_email", "notify_stale_syslog")
        elif notify_type == "offsubnet":
            attr = ("notify_offsubnet_email", "notify_offsubnet_syslog")
        elif notify_type == "clear":
            attr = ("notify_clear_email", "notify_clear_syslog")
        elif notify_type == "rapid":
            attr = ("notify_rapid_email", "notify_rapid_syslog")
        elif notify_type == "any_email":
            # return all notification types enabled
            ret["enabled"] = True
            ret["email_address"] = self.email_address
            return ret
        elif notify_type == "any_syslog":
            # return all notification types enabled
            ret["enabled"] = True
            ret["syslog_server"] = self.syslog_server
            ret["syslog_port"] = self.syslog_port
            return ret
        else:
            logger.warn("invalid notification type '%s", notify_type)
            return ret
        if getattr(self.settings, attr[0]):
            ret["enabled"] = True
            ret["email_address"] = self.email_address
        if getattr(self.settings, attr[1]):
            ret["enabled"] = True
            ret["syslog_server"] = self.syslog_server
            ret["syslog_port"] = self.syslog_port
        return ret
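
    # Illustrative return value when stale email notifications are enabled (address hypothetical):
    #   {"enabled": True, "email_address": "noc@example.com",
    #    "syslog_server": None, "syslog_port": None}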

    def send_notification(self,
                          notify_type,
                          subject=None,
                          txt=None,
                          bulk=None):
        # send the proper notifications for this fabric. The caller can set bulk to a list of
        # (subject, txt) tuples to send several notifications at once; all notifications in a
        # bulk list must be of the same notify_type.
        success = True
        errmsg = ""
        notify = self.notification_enabled(notify_type)
        if notify["enabled"]:
            if notify["email_address"] is not None:
                emails = []
                if bulk is not None:
                    for (bulk_subject, bulk_txt) in bulk:
                        emails.append({
                            "sender": get_app_config().get("EMAIL_SENDER", None),
                            "receiver": notify["email_address"],
                            "subject": bulk_subject,
                            "msg": bulk_txt,
                        })
                else:
                    emails.append({
                        "sender": get_app_config().get("EMAIL_SENDER", None),
                        "receiver": notify["email_address"],
                        "subject": subject,
                        "msg": txt,
                    })
                # send_emails already supports a list of emails, so simply send all at once
                (success, errmsg) = send_emails(settings=self.settings,
                                                dns_cache=self.dns_cache,
                                                emails=emails)
                if not success:
                    logger.warn("failed to send email: %s", errmsg)
            if notify["syslog_server"] is not None:
                if bulk is not None:
                    for (bulk_subject, bulk_txt) in bulk:
                        syslog(
                            bulk_txt,
                            dns_cache=self.dns_cache,
                            server=notify["syslog_server"],
                            server_port=notify["syslog_port"],
                        )
                else:
                    syslog(
                        txt,
                        dns_cache=self.dns_cache,
                        server=notify["syslog_server"],
                        server_port=notify["syslog_port"],
                    )
            return (success, errmsg)
        else:
            logger.debug("skipping send notification as '%s' is not enabled",
                         notify_type)
            return (False, "notification not enabled")
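
    # A hedged usage sketch (subjects and bodies are illustrative): sending two stale
    # notifications in one call so an smtp relay login happens only once:
    #   self.send_notification("stale", bulk=[
    #       ("stale endpoint 10.0.0.1", "details for 10.0.0.1 ..."),
    #       ("stale endpoint 10.0.0.2", "details for 10.0.0.2 ..."),
    #   ])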

    def queue_notification(self, notify_type, subject, txt):
        # queue notification that will be sent at next iteration of NOTIFY_INTERVAL
        if self.notify_queue is None:
            logger.error("notify queue not initialized for worker fabric")
            return
        try:
            logger.debug("enqueuing %s notification (queue size %d)",
                         notify_type, self.notify_queue.qsize())
            self.notify_queue.put_nowait(
                (notify_type, subject, txt, time.time()))
        except Full:
            logger.error(
                "failed to enqueue notification, queue is full (size: %s)",
                self.notify_queue.qsize())

    def execute_notify(self):
        # send any notifications sitting in the queue, and log the number of notifications sent
        # along with the max queue time. We want to support bulk notifications (mainly for email,
        # to prevent multiple logins on smtp_relay_auth setups) so this function sorts by notify
        # type and executes send_notification with the bulk flag.
        msgs = {}  # indexed by notify type and contains tuple (subject,txt)
        count = 0
        max_queue_time = 0
        while not self.notify_queue.empty():
            (notify_type, subject, txt, q_ts) = self.notify_queue.get()
            count += 1
            q_time = time.time() - q_ts
            if q_time > max_queue_time:
                max_queue_time = q_time
            if notify_type not in msgs:
                msgs[notify_type] = []
            msgs[notify_type].append((subject, txt))
        for notify_type in msgs:
            self.send_notification(notify_type, bulk=msgs[notify_type])
        if count > 0:
            logger.debug("sent %s notifications, max queue time %0.3f sec",
                         count, max_queue_time)
        if choice != '':
            job_queue.put({
                'q': entity_id,
                'action': 'set_description',
                'lang': 'nb',
                'value': choice,
                'summary': '#no_to_nb cleanup drive'
            })

            if labels['no'].get('description') is not None:
                job_queue.put({
                    'q': entity_id,
                    'action': 'remove_description',
                    'lang': 'no',
                    'summary': '#no_to_nb cleanup drive'
                })

    completed += 1
    t1 = time.time() - t0
    if not bg_thread.is_alive():
        print('Thread exited. Starting a new')
        bg_thread = start_thread()

    while job_queue.qsize() > 100:
        print('Job queue length: %d, sleeping a while' % job_queue.qsize())
        time.sleep(10)
    print('Status: Fixed %s items. %s items left to go, time: %.2f sec/item, job queue length: %d' % (completed, len(rows), t1 / completed, job_queue.qsize()))

print('*** Main thread waiting')
job_queue.join()
print('*** Done')
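
The fragment above calls start_thread() and job_queue.join() without showing the worker; a
minimal sketch of what such a consumer could look like is below (handle_job is a placeholder
for the actual edit call, not the original bot's API):

import threading

def start_thread():
    # consume jobs queued by the main loop; daemonized so Ctrl+C still exits promptly
    def worker():
        while True:
            job = job_queue.get()
            try:
                handle_job(job)  # hypothetical: perform the queued edit
            finally:
                job_queue.task_done()  # lets job_queue.join() return once drained
    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()
    return t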
Example No. 29
class HyperDash:
    """HyperDash monitors a job and manages capturing IO / server comms.

    This class is designed to be run in its own thread and contains an instance
    of code_runner (which is running the job) and server_manager (for talking
    to the server.)
    """
    def __init__(
        self,
        job_name,
        code_runner,
        server_manager,
        io_bufs,
        std_streams,
    ):
        """Initialize the HyperDash class.

        args:
            1) job_name: Name of the current running job
            2) code_runner: Instance of CodeRunner
            3) server_manager: ServerManager instance
            4) io_bufs: Tuple in the form of (StringIO(), StringIO(),)
            5) std_streams: Tuple in the form of (StdOut, StdErr)
        """
        self.job_name = job_name
        self.code_runner = code_runner
        self.server_manager = server_manager
        self.out_buf, self.err_buf = io_bufs
        self.std_out, self.std_err = std_streams
        self.programmatic_exit = False
        self.shutdown_network_channel = Queue()
        self.shutdown_main_channel = Queue()

        # Used to keep track of the current position in the IO buffers
        self.out_buf_offset = 0
        self.err_buf_offset = 0

        # Create a UUID to uniquely identify this run from the SDK's point of view
        self.current_sdk_run_uuid = str(uuid.uuid4())

        # Create run_start message before doing any other setup work to make sure that the
        # run_started message always precedes any other messages
        self.server_manager.put_buf(
            create_run_started_message(self.current_sdk_run_uuid,
                                       self.job_name), )

        def on_stdout_flush():
            self.capture_io()
            self.std_out.flush()
            self.flush_log_file()

        def on_stderr_flush():
            self.capture_io()
            self.std_err.flush()
            self.flush_log_file()

        self.out_buf.set_on_flush(on_stdout_flush)
        self.err_buf.set_on_flush(on_stderr_flush)

        self.logger = logging.getLogger("hyperdash.{}".format(__name__))
        self.log_file, self.log_file_path = self.open_log_file()
        if not self.log_file:
            self.logger.error(
                "Could not create log file. Logs will not be stored locally.")

    def open_log_file(self):
        log_folder = get_hyperdash_logs_home_path()

        # Make sure logs directory exists (/logs)
        if not os.path.exists(log_folder):
            try:
                os.makedirs(log_folder)
            except OSError as exc:
                # only bail out on real errors; EEXIST means another process created it
                if exc.errno != errno.EEXIST:
                    return None, None

        job_log_folder = get_hyperdash_logs_home_path_for_job(self.job_name)
        # Make sure directory for job exists in log directory (/logs/<JOB_NAME>)
        if not os.path.exists(job_log_folder):
            try:
                os.makedirs(job_log_folder)
            except OSError as exc:
                # only bail out on real errors; EEXIST means another process created it
                if exc.errno != errno.EEXIST:
                    return None, None

        # Create new log file for run
        try:
            iso = slugify(datetime.datetime.now().isoformat())
            logfile_name = "{}_{}.log".format(slugify(self.job_name), iso)
            logfile_path = os.path.join(job_log_folder, logfile_name)
            return open(logfile_path, "a"), logfile_path
        except IOError:
            return None, None

    def capture_io(self):
        self.out_buf.acquire()
        out = self.out_buf.getvalue()
        len_out = len(out) - self.out_buf_offset
        if len_out != 0:
            self.print_out(out[self.out_buf_offset:])
        self.out_buf_offset += len_out
        self.out_buf.release()

        self.err_buf.acquire()
        err = self.err_buf.getvalue()
        len_err = len(err) - self.err_buf_offset
        if len_err != 0:
            self.print_err(err[self.err_buf_offset:])
        self.err_buf_offset += len_err
        self.err_buf.release()

    def print_out(self, s):
        message = create_log_message(self.current_sdk_run_uuid, INFO_LEVEL, s)
        self.server_manager.put_buf(message)
        self.std_out.write(s)
        self.write_to_log_file(s)

    def print_err(self, s):
        message = create_log_message(self.current_sdk_run_uuid, ERROR_LEVEL, s)
        self.server_manager.put_buf(message)
        self.std_err.write(s)
        self.write_to_log_file(s)

    def write_to_log_file(self, s):
        if self.log_file:
            if PY2:
                self.log_file.write(s.encode("utf-8"))
            else:
                self.log_file.write(s)

    def flush_log_file(self):
        if self.log_file:
            self.log_file.flush()

    def cleanup(self, exit_status):
        self.print_log_file_location()
        self.capture_io()
        self.server_manager.put_buf(
            create_run_ended_message(self.current_sdk_run_uuid, exit_status), )
        self.flush_log_file()
        self.shutdown_network_channel.put(True)

    def sudden_cleanup(self):
        self.print_log_file_location()
        # Send what we can to local log
        self.capture_io()
        self.flush_log_file()

        # Make a best-effort attempt to notify server that the run was
        # canceled by the user, but don't wait for all messages to
        # be flushed to server so we don't hang the user's terminal.
        self.server_manager.send_message(
            create_run_ended_message(self.current_sdk_run_uuid,
                                     "user_canceled"),
            raise_exceptions=False,
            timeout_seconds=1,
        )

    def print_log_file_location(self):
        if self.log_file_path:
            self.logger.info(
                "Logs for this run of {} are available locally at: {}".format(
                    self.job_name,
                    self.log_file_path,
                ))

    def run(self):
        """
        run_http works using three separate threads:
            1) code_runner thread which runs the user's code
            2) network_thread which does blocking I/O with the server
            3) event_loop thread which runs the SDK's main event loop (this is
               just the main thread)

        We require the event loop and network loop to be in separate threads
        because otherwise slow responses from the server could inhibit the
        SDK's event loop causing weird behavior like delayed logs in the user's
        terminal.

        Once all threads are running, the event_loop thread will periodically
        check the I/O buffers to see if any new logs have appeared, and if so,
        it will send them to the server manager's outgoing buffer.

        The network_loop thread will periodically check its outgoing buffer, and
        if it finds any messages in there, it will send them all to the server.

        Cleanup is the responsibility of the event_loop. With every tick of the
        event_loop, we check to see if the user's code has completed running. If
        it has, the event_loop will capture any remaining I/O and store that in
        the ServerManager's outgoing buf, as well as store a message indicating
        that the run is complete and its final exit status. Finally, the
        event_loop thread will push a message into the shutdown_network_channel which
        will indicate to the network_loop that it should finish sending any
        pending messages and then exit. The event_loop thread will then block
        until it receives a message on the shutdown_main_channel.

        At the next tick of the network_loop, the shutdown_network_channel will no longer
        be empty, and the network loop will try and fire off any remaining messages
        in the ServerManager's buffer to the server, and then put a message in the
        shutdown_main_channel.

        The main event_loop which has been blocked until now on the shutdown_main_channel
        will now return, and the program will exit cleanly.
        """
        def network_loop():
            while True:
                if self.shutdown_network_channel.qsize() != 0:
                    self.server_manager.cleanup(self.current_sdk_run_uuid)
                    self.shutdown_main_channel.put(True)
                    return
                else:
                    self.server_manager.tick(self.current_sdk_run_uuid)
                    time.sleep(1)

        code_thread = Thread(target=self.code_runner.run)
        network_thread = Thread(target=network_loop)

        # Daemonize them so they don't impede shutdown if the user
        # keyboard interrupts
        code_thread.daemon = True
        network_thread.daemon = True

        network_thread.start()
        code_thread.start()

        # Event loop
        while True:
            try:
                self.capture_io()
                exited_cleanly, is_done = self.code_runner.is_done()
                if is_done:
                    self.programmatic_exit = True
                    if exited_cleanly:
                        self.cleanup("success")
                        # Block until network loop says it's done
                        self.shutdown_main_channel.get(block=True,
                                                       timeout=None)
                        return self.code_runner.get_return_val()
                    else:
                        self.cleanup("failure")
                        # Block until network loop says it's done
                        self.shutdown_main_channel.get(block=True,
                                                       timeout=None)
                        raise self.code_runner.get_exception()

                time.sleep(1)
            # Handle Ctrl+C
            except (KeyboardInterrupt, SystemExit):
                self.sudden_cleanup()
                # code_thread and network_thread are daemons so they won't impede this
                sys.exit(130)
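
The three-thread shutdown handshake described in the run() docstring can be reduced to a
self-contained sketch (generic Python 3 names, not HyperDash's API): the event loop signals
over one queue and blocks on a second until the network loop confirms it has drained.

import time
from queue import Queue
from threading import Thread

shutdown_network_channel = Queue()
shutdown_main_channel = Queue()

def network_loop():
    while True:
        if shutdown_network_channel.qsize() != 0:
            # flush any remaining outgoing messages here, then confirm shutdown
            shutdown_main_channel.put(True)
            return
        time.sleep(0.1)  # normal tick: exchange messages with the server

t = Thread(target=network_loop)
t.daemon = True
t.start()

shutdown_network_channel.put(True)     # event loop: ask the network loop to finish
shutdown_main_channel.get(block=True)  # block until the network loop confirms
print("clean exit")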
Example No. 30
class TaskHandler(BufferingHandler):
    __flush_max_history_seconds = 30.
    __wait_for_flush_timeout = 10.
    __max_event_size = 1024 * 1024
    __once = False
    __offline_filename = 'log.jsonl'

    @property
    def task_id(self):
        return self._task_id

    @task_id.setter
    def task_id(self, value):
        self._task_id = value

    def __init__(self, task, capacity=buffer_capacity, connect_logger=True):
        super(TaskHandler, self).__init__(capacity)
        self.task_id = task.id
        self.session = task.session
        self.last_timestamp = 0
        self.counter = 1
        self._last_event = None
        self._exit_event = None
        self._queue = None
        self._thread = None
        self._pending = 0
        self._offline_log_filename = None
        self._connect_logger = connect_logger
        if task.is_offline():
            offline_folder = Path(task.get_offline_mode_folder())
            offline_folder.mkdir(parents=True, exist_ok=True)
            self._offline_log_filename = offline_folder / self.__offline_filename

    def shouldFlush(self, record):
        """
        Should the handler flush its buffer

        Returns true if the buffer is up to capacity. This method can be
        overridden to implement custom flushing strategies.
        """
        if self._task_id is None:
            return False

        # if we need to add handlers to the base_logger, it will not automatically create a
        # stream handler when first used, so we must configure it manually.
        if self._connect_logger and not TaskHandler.__once:
            base_logger = getLogger()
            if len(base_logger.handlers) == 1 and isinstance(
                    base_logger.handlers[0], TaskHandler):
                if record.name != 'console' and not record.name.startswith(
                        'clearml.'):
                    base_logger.removeHandler(self)
                    basicConfig()
                    base_logger.addHandler(self)
                    TaskHandler.__once = True
            else:
                TaskHandler.__once = True

        # if we passed the max buffer
        if len(self.buffer) >= self.capacity:
            return True

        # if the first entry in the log was too long ago.
        # noinspection PyBroadException
        try:
            if len(self.buffer) and \
                    (time.time() - self.buffer[0].created) > self.__flush_max_history_seconds:
                return True
        except Exception:
            pass

        return False

    def _record_to_event(self, record):
        # type: (LogRecord) -> events.TaskLogEvent
        if self._task_id is None:
            return None
        timestamp = int(record.created * 1000)
        if timestamp == self.last_timestamp:
            timestamp += self.counter
            self.counter += 1
        else:
            self.last_timestamp = timestamp
            self.counter = 1

        # ignore backspaces (they are often used)
        full_msg = record.getMessage().replace('\x08', '')

        return_events = []
        while full_msg:
            msg = full_msg[:self.__max_event_size]
            full_msg = full_msg[self.__max_event_size:]
            # unite all records in a single second
            if self._last_event and timestamp - self._last_event.timestamp < 1000 and \
                    len(self._last_event.msg) + len(msg) < self.__max_event_size and \
                    record.levelname.lower() == str(self._last_event.level):
                # merge into the previous event: same second, same level, combined size in limit
                self._last_event.msg += '\n' + msg
                continue

            # otherwise start a new event and emit the previous one, if any
            new_event = events.TaskLogEvent(task=self.task_id,
                                            timestamp=timestamp,
                                            level=record.levelname.lower(),
                                            worker=self.session.worker,
                                            msg=msg)
            if self._last_event:
                return_events.append(self._last_event)

            self._last_event = new_event

        return return_events

    def flush(self):
        if self._task_id is None:
            return

        if not self.buffer:
            return

        buffer = None
        self.acquire()
        if self.buffer:
            buffer = self.buffer
            self.buffer = []
        self.release()

        if not buffer:
            return

        # noinspection PyBroadException
        try:
            record_events = [
                r for record in buffer for r in self._record_to_event(record)
            ] + [self._last_event]
            self._last_event = None
            batch_requests = events.AddBatchRequest(
                requests=[events.AddRequest(e) for e in record_events if e])
        except Exception:
            self.__log_stderr(
                "WARNING: clearml.log - Failed logging task to backend ({:d} lines)"
                .format(len(buffer)))
            batch_requests = None

        if batch_requests and batch_requests.requests:
            self._pending += 1
            self._add_to_queue(batch_requests)

    def _create_thread_queue(self):
        if self._queue:
            return

        self._queue = Queue()
        self._exit_event = Event()
        self._exit_event.clear()
        # multiple workers could be supported as well
        self._thread = Thread(target=self._daemon)
        self._thread.daemon = True
        self._thread.start()

    def _add_to_queue(self, request):
        self._create_thread_queue()
        self._queue.put(request)

    def close(self, wait=False):
        # self.__log_stderr('Closing {} wait={}'.format(os.getpid(), wait))
        # flush pending logs
        if not self._task_id:
            return
        # avoid deadlocks: just skip the lock, we are shutting down anyway
        self.lock = None

        self.flush()
        # shut down the TaskHandler; from this point onwards no events will be logged
        _thread = self._thread
        self._thread = None
        if self._queue:
            self._exit_event.set()
            self._queue.put(None)
        self._task_id = None

        if wait and _thread:
            # noinspection PyBroadException
            try:
                timeout = 1. if self._queue.empty() else self.__wait_for_flush_timeout
                _thread.join(timeout=timeout)
                if not self._queue.empty():
                    self.__log_stderr(
                        'Flush timeout {}s exceeded, dropping last {} lines'.
                        format(timeout, self._queue.qsize()))
                # self.__log_stderr('Closing {} wait done'.format(os.getpid()))
            except Exception:
                pass
        # call super and remove the handler
        super(TaskHandler, self).close()

    def _send_events(self, a_request):
        try:
            self._pending -= 1

            if self._offline_log_filename:
                with open(self._offline_log_filename.as_posix(), 'at') as f:
                    f.write(
                        json.dumps([b.to_dict()
                                    for b in a_request.requests]) + '\n')
                return

            # if self._thread is None:
            #     self.__log_stderr('Task.close() flushing remaining logs ({})'.format(self._pending))
            res = self.session.send(a_request)
            if res and not res.ok():
                self.__log_stderr(
                    "failed logging task to backend ({:d} lines, {})".format(
                        len(a_request.requests), str(res.meta)),
                    level=WARNING)
        except MaxRequestSizeError:
            self.__log_stderr(
                "failed logging task to backend ({:d} lines) log size exceeded limit"
                .format(len(a_request.requests)),
                level=WARNING)
        except Exception as ex:
            self.__log_stderr(
                "Retrying, failed logging task to backend ({:d} lines): {}".
                format(len(a_request.requests), ex))
            # we should push ourselves back into the thread pool
            if self._queue:
                self._pending += 1
                self._queue.put(a_request)

    def _daemon(self):
        # multiple daemons are supported
        leave = self._exit_event.wait(0)
        request = True
        while not leave or request:
            # pull from queue
            request = None
            if self._queue:
                # noinspection PyBroadException
                try:
                    request = self._queue.get(block=not leave)
                except Exception:
                    pass
            if request:
                self._send_events(request)
            leave = self._exit_event.wait(0)
        # self.__log_stderr('leaving {}'.format(os.getpid()))

    @staticmethod
    def __log_stderr(msg, level=INFO):
        # output directly to stderr, make sure we do not catch it.
        write = sys.stderr._original_write if hasattr(
            sys.stderr, '_original_write') else sys.stderr.write
        write('{asctime} - {name} - {levelname} - {message}\n'.format(
            asctime=Formatter().formatTime(makeLogRecord({})),
            name='clearml.log',
            levelname=getLevelName(level),
            message=msg))

    @classmethod
    def report_offline_session(cls, task, folder):
        filename = Path(folder) / cls.__offline_filename
        if not filename.is_file():
            return False
        with open(filename.as_posix(), 'rt') as f:
            i = 0
            while True:
                try:
                    line = f.readline()
                    if not line:
                        break
                    list_requests = json.loads(line)
                    for r in list_requests:
                        r.pop('task', None)
                    i += 1
                except StopIteration:
                    break
                except Exception as ex:
                    warning('Failed reporting log, line {} [{}]'.format(i, ex))
                batch_requests = events.AddBatchRequest(requests=[
                    events.TaskLogEvent(task=task.id, **r)
                    for r in list_requests
                ])
                if batch_requests.requests:
                    res = task.session.send(batch_requests)
                    if res and not res.ok():
                        warning(
                            "failed logging task to backend ({:d} lines, {})".
                            format(len(batch_requests.requests),
                                   str(res.meta)))
        return True
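
A standalone sketch of the chunking step inside _record_to_event, with a shrunken size limit
so the splitting is visible (chunk_message is an illustrative name, not part of TaskHandler):

def chunk_message(full_msg, max_event_size):  # the real limit above is 1024 * 1024
    # split a long log message into pieces no larger than max_event_size
    chunks = []
    while full_msg:
        chunks.append(full_msg[:max_event_size])
        full_msg = full_msg[max_event_size:]
    return chunks

assert chunk_message("abcdefghij", 4) == ["abcd", "efgh", "ij"]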
Example No. 31
class ThreadedTaskDispatcher(object):
    """A Task Dispatcher that creates a thread for each task."""

    stop_count = 0  # Number of threads that will stop soon.

    def __init__(self):
        self.threads = {}  # { thread number -> 1 }
        self.queue = Queue()
        self.thread_mgmt_lock = threading.Lock()

    def handlerThread(self, thread_no):
        threads = self.threads
        try:
            while threads.get(thread_no):
                task = self.queue.get()
                if task is None:
                    # Special value: kill this thread.
                    break
                try:
                    task.service()
                except Exception:
                    log.exception('Exception during task')
        except Exception:
            log.exception('Exception in thread main loop')
        finally:
            mlock = self.thread_mgmt_lock
            with mlock:
                self.stop_count -= 1
                try:
                    del threads[thread_no]
                except KeyError:
                    pass

    def setThreadCount(self, count):
        """See zope.server.interfaces.ITaskDispatcher"""
        mlock = self.thread_mgmt_lock
        with mlock:
            threads = self.threads
            thread_no = 0
            running = len(threads) - self.stop_count
            while running < count:
                # Start threads.
                while thread_no in threads:
                    thread_no = thread_no + 1
                threads[thread_no] = 1
                running += 1
                t = threading.Thread(target=self.handlerThread,
                                     args=(thread_no,),
                                     name='zope.server-%d' % thread_no)
                t.daemon = True
                t.start()
                thread_no = thread_no + 1
            if running > count:
                # Stop threads.
                to_stop = running - count
                self.stop_count += to_stop
                for _n in range(to_stop):
                    self.queue.put(None)
                    running -= 1

    def addTask(self, task):
        """See zope.server.interfaces.ITaskDispatcher"""
        if task is None:
            raise ValueError("No task passed to addTask().")
        # assert ITask.providedBy(task)
        try:
            task.defer()
            self.queue.put(task)
        except:
            task.cancel()
            raise

    def shutdown(self, cancel_pending=True, timeout=5):
        """See zope.server.interfaces.ITaskDispatcher"""
        self.setThreadCount(0)
        # Ensure the threads shut down.
        threads = self.threads
        expiration = time() + timeout
        while threads:
            if time() >= expiration:
                log.error("%d thread(s) still running", len(threads))
                break
            sleep(0.1)
        if cancel_pending:
            # Cancel remaining tasks.
            try:
                queue = self.queue
                while not queue.empty():
                    task = queue.get()
                    if task is not None:
                        task.cancel()
            except Empty:
                pass

    def getPendingTasksEstimate(self):
        """See zope.server.interfaces.ITaskDispatcher"""
        return self.queue.qsize()
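
A minimal usage sketch run against the class above, relying on its implied imports (PrintTask
is a stand-in satisfying the service/defer/cancel interface the dispatcher expects; it is not
part of zope.server):

class PrintTask(object):
    def service(self):
        print('serviced')

    def defer(self):
        pass

    def cancel(self):
        print('canceled')

dispatcher = ThreadedTaskDispatcher()
dispatcher.setThreadCount(2)     # start two handler threads
for _ in range(5):
    dispatcher.addTask(PrintTask())
dispatcher.shutdown()            # stop the threads and cancel anything still queued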