def _run(self):
    """Main loop: read newline-delimited events from the FIFO and relay them.

    Opens the FIFO, starts the event relay, then polls the reader file
    descriptor until ``self.stopping`` is set. Incoming bytes accumulate in
    ``self.read_buffer``; every complete line is handed to the event relay,
    and a trailing partial line stays buffered for the next read. When the
    loop ends, the relay is stopped and joined.
    """
    self.open_fifo()
    self.event_relay.start()
    copytool_log.info("Copytool monitor starting for %s" % self.copytool)

    while not self.stopping.is_set():
        # The timeout lets the loop re-check the stop flag periodically.
        ready_fds = select.select([self.reader_fd], [], [],
                                  READER_SELECT_TIMEOUT)[0]
        if not ready_fds:
            continue

        chunk = os.read(self.reader_fd, BUFSIZ)
        if not chunk:
            # An empty read means EOF: drop this descriptor and reopen.
            copytool_log.warning("Got EOF on FIFO, restarting reader.")
            os.close(self.reader_fd)
            self.open_fifo()
            continue

        self.read_buffer += chunk
        if '\n' not in self.read_buffer:
            # No complete line yet; keep buffering.
            continue

        lines = self.read_buffer.split('\n')
        # The final element is whatever followed the last newline (possibly
        # empty); it stays buffered until its newline arrives.
        self.read_buffer = lines.pop()
        for line in lines:
            self.event_relay.put(line)
            copytool_log.debug("Put event in relay queue: %s" % line)

    self.event_relay.stop()
    self.event_relay.join()
def backoff(self):
    """Lengthen the polling interval after a failure.

    The first backoff from the normal RELAY_POLL_INTERVAL jumps to
    MIN_SESSION_BACKOFF; each later one doubles the interval, capped at
    MAX_SESSION_BACKOFF. The resulting interval is logged.
    """
    if self.poll_interval == RELAY_POLL_INTERVAL:
        next_interval = MIN_SESSION_BACKOFF.seconds
    else:
        # Double the current delay, never exceeding the configured ceiling.
        next_interval = min(self.poll_interval * 2,
                            MAX_SESSION_BACKOFF.seconds)

    self.poll_interval = next_interval
    copytool_log.info("Retry interval increased to %d seconds" % self.poll_interval)
def main():
    """Command-line entry point for the copytool monitor.

    Parses the ``copytool_id`` argument, sets up logging, builds the manager
    event URL from the agent configuration, and runs a CopytoolMonitor until
    it is asked to stop.

    Signals:
        SIGTERM / SIGINT: stop the monitor.
        SIGUSR1 / SIGUSR2: passed to decrease_loglevel / increase_loglevel.

    Exits with status 1 if the server configuration is missing or if an
    unhandled exception escapes the monitor.
    """
    parser = ArgumentParser(
        description="IntelĀ® Manager for Lustre* software Copytool Monitor")
    # NOTE(review): GetCopytoolAction presumably populates ``args.copytool``
    # from the supplied id -- confirm against its definition.
    parser.add_argument("copytool_id", action=GetCopytoolAction)
    args = parser.parse_args()

    copytool_log_setup()

    try:
        manager_url = config.get('settings', 'server')['url'] + "copytool_event/"
    except KeyError:
        copytool_log.error(
            "No configuration found (must be configured before starting a copytool monitor)"
        )
        sys.exit(1)

    client = CryptoClient(manager_url, Crypto(config.path))
    monitor = CopytoolMonitor(client, args.copytool)

    def teardown_callback(*_args, **_kwargs):
        # Signal handlers receive (signum, frame); neither is needed here.
        monitor.stop()

    signal.signal(signal.SIGTERM, teardown_callback)
    signal.signal(signal.SIGINT, teardown_callback)
    signal.signal(signal.SIGUSR1, decrease_loglevel)
    signal.signal(signal.SIGUSR2, increase_loglevel)

    try:
        monitor.start()
        # Wait in short timed slices rather than blocking indefinitely, so
        # the main thread regularly returns to the interpreter and signal
        # handlers get a chance to run.
        while not monitor.stopping.is_set():
            monitor.stopping.wait(timeout=10)

        monitor.join()
    except Exception as e:
        # Logger.exception() requires a message; calling it with no arguments
        # raises TypeError and would mask the original failure.
        copytool_log.exception("Unhandled exception in copytool monitor")
        sys.stderr.write("Unhandled exception: %s\n" % e)
        sys.exit(1)

    copytool_log.info("Terminating")
def open_fifo(self):
    """Create the copytool event FIFO if needed and open it for reading.

    The FIFO path is ``self.copytool.event_fifo``. Before opening, ``lsof``
    is consulted and the open is refused if any process already holds the
    FIFO open for reading. On success the non-blocking, read-only
    descriptor is stored in ``self.reader_fd``.

    Raises:
        OSError: if creating the FIFO fails for any reason other than it
            already existing.
        FifoReaderConflict: if one or more processes already have the FIFO
            open for reading; constructed with the set of their pids.
    """
    try:
        os.mkfifo(self.copytool.event_fifo)
    except OSError as e:
        # A pre-existing FIFO is fine; anything else is a real failure.
        # Bare ``raise`` keeps the original traceback intact.
        if e.errno != errno.EEXIST:
            raise

    # The lsof result is iterated as {pid: {file: {'mode': ...}}}.
    pids = lsof(file=self.copytool.event_fifo)
    readers = {
        pid
        for pid, files in pids.items()
        for info in files.values()
        if 'r' in info['mode']
    }

    if readers:
        raise FifoReaderConflict(readers)

    self.reader_fd = os.open(self.copytool.event_fifo,
                             os.O_RDONLY | os.O_NONBLOCK)
    copytool_log.info("Opened %s for reading" % self.copytool.event_fifo)
def send(self):
    """Batch queued copytool events into one envelope and POST it.

    Events are taken from ``self.retry_queue`` first (already-decoded
    events from an earlier attempt) and then from ``self.send_queue`` (raw
    JSON strings). Each event has its ``event_time`` rewritten, is annotated
    with its active operation when one is known, and is appended to the
    envelope until one of the following ends the batch:

    * both queues are empty;
    * a raw event is not valid JSON (logged, event discarded);
    * the event date cannot be parsed (logged, event discarded);
    * a single event exceeds MAX_BYTES_PER_POST (logged, event discarded);
    * the event does not fit in the remaining space of a non-empty
      envelope (event is put back on the retry queue).

    If any events were collected they are posted via ``self.client.post``.
    On success, ``active_operations`` from the response (if present) is
    merged into ``self.active_operations`` and the backoff is reset. On
    HttpError every event in the envelope is put on the retry queue and the
    backoff is increased.
    """
    events = []

    envelope = dict(fqdn=self.client.fqdn, copytool=self.copytool.id, events=events)

    # Running size estimate: the serialized empty envelope plus the
    # serialized length of each event added below.
    envelope_size = len(json.dumps(envelope))
    while True:
        try:
            # Previously failed or deferred events take priority.
            event = self.retry_queue.get_nowait()
            copytool_log.debug("Got event from retry queue: %s" % event)
        except Queue.Empty:
            try:
                raw_event = self.send_queue.get_nowait()
                event = json.loads(raw_event)
                copytool_log.debug("Got event from send queue: %s" % event)
            except Queue.Empty:
                # Nothing left to batch.
                break
            except ValueError:
                copytool_log.error("Invalid JSON: %s" % raw_event)
                break

        try:
            # Rewrite the timestamp at a fixed zero offset in a fixed format.
            date = IMLDateTime.parse(event['event_time'])
            event['event_time'] = date.astimezone(
                tz=FixedOffset(0)).strftime("%Y-%m-%d %H:%M:%S+00:00")
        except ValueError as e:
            copytool_log.error("Invalid event date in event '%s': %s" % (event, e))
            break

        # During restore operations, we don't know the data_fid until
        # after the operation has started (i.e. RUNNING). The tricky part
        # is that when the restore completes, the source_fid is set to
        # data_fid, so unless we do this swap we'll lose track of the
        # operation.
        if 'RUNNING' in event['event_type']:
            if event['source_fid'] in self.active_operations:
                self.active_operations[
                    event['data_fid']] = self.active_operations.pop(
                        event['source_fid'])

        # Attach the tracked operation (if any) for this data_fid.
        if self.active_operations.get(event.get('data_fid', None), None):
            event['active_operation'] = self.active_operations[
                event['data_fid']]

        # A finished operation no longer needs tracking; a missing entry
        # is ignored.
        if 'FINISH' in event['event_type']:
            try:
                del self.active_operations[event['data_fid']]
            except KeyError:
                pass

        copytool_log.debug("event: %s" % json.dumps(event))

        event_size = len(json.dumps(event))
        if event_size > MAX_BYTES_PER_POST:
            # This event alone exceeds the per-POST limit; it is not requeued.
            copytool_log.error("Oversized event dropped: %s" % event)
            break

        if events and event_size > MAX_BYTES_PER_POST - envelope_size:
            # The envelope already holds events and this one would push it
            # over the limit: defer it to the next send() call.
            copytool_log.info("Requeueing oversized message "
                              "(%d + %d > %d, %d messages)"
                              % (event_size, envelope_size,
                                 MAX_BYTES_PER_POST, len(events)))
            self.retry_queue.put(event)
            break

        events.append(event)
        envelope_size += event_size

    if events:
        copytool_log.debug("EventRelay sending %d events" % len(events))
        try:
            data = self.client.post(envelope)
            copytool_log.debug("Got data back from POST: %s" % data)
            try:
                self.active_operations.update(data['active_operations'])
            except (KeyError, TypeError):
                # Response has no 'active_operations' key or is not
                # subscriptable; nothing to merge.
                pass
            # Reset any backoff delay that might have been added
            self.reset_backoff()
        except HttpError:
            copytool_log.error("Failed to relay events, requeueing")
            for event in envelope['events']:
                self.retry_queue.put(event)
            self.backoff()