class JobHandler(object): """Handles a single polling job against a single netbox. An instance of this class performs a polling job, as described by a job specification in the config file, for a single netbox. It will handle the dispatch of polling plugins, and contain state information for the job. """ _logger = ContextLogger() _queue_logger = ContextLogger(suffix='queue') _timing_logger = ContextLogger(suffix='timings') _start_time = datetime.datetime.min def __init__(self, name, netbox, plugins=None): self.name = name self.netbox = netbox self.cancelled = threading.Event() self.plugins = plugins or [] self._logger.debug("Job %r initialized with plugins: %r", self.name, self.plugins) self.containers = storage.ContainerRepository() self.storage_queue = [] # Initialize netbox in container nb = self.container_factory(shadows.Netbox, key=None, id=netbox.id, sysname=netbox.sysname) self.agent = None def _create_agentproxy(self): if self.agent: self._destroy_agentproxy() if not self.netbox.read_only: self.agent = None return port = ports.next() self.agent = AgentProxy(self.netbox.ip, 161, community=self.netbox.read_only, snmpVersion='v%s' % self.netbox.snmp_version, protocol=port.protocol, snmp_parameters=snmp_parameter_factory( self.netbox)) try: self.agent.open() except SnmpError, error: self.agent.close() session_count = self.agent.count_open_sessions() job_count = self.get_instance_count() self._logger.error( "%s (%d currently open SNMP sessions, %d job handlers)", error, session_count, job_count) raise AbortedJobError("Cannot open SNMP session", cause=error) else:
class ProcessAMP(amp.AMP): """Modify AMP protocol to allow running over process pipes""" _logger = ContextLogger() def __init__(self, is_worker, **kwargs): super(ProcessAMP, self).__init__(**kwargs) self.is_worker = is_worker self.lost_handler = None def makeConnection(self, transport): if not hasattr(transport, 'getPeer'): setattr(transport, 'getPeer', lambda: "peer") if not hasattr(transport, 'getHost'): setattr(transport, 'getHost', lambda: "host") super(ProcessAMP, self).makeConnection(transport) def connectionLost(self, reason): super(ProcessAMP, self).connectionLost(reason) if self.is_worker: if reactor.running: reactor.stop() else: if self.lost_handler: self.lost_handler(self, reason)
class Worker(object): """This class holds information about one worker process as seen from the worker pool""" _logger = ContextLogger() def __init__(self, pool, threadpoolsize, max_jobs): self.active_jobs = 0 self.total_jobs = 0 self.max_concurrent_jobs = 0 self.pool = pool self.threadpoolsize = threadpoolsize self.max_jobs = max_jobs @inlineCallbacks def start(self): args = [control.get_process_command(), '--worker', '-f', '-s', '-P'] if self.threadpoolsize: args.append('--threadpoolsize=%d' % self.threadpoolsize) endpoint = ProcessEndpoint(reactor, control.get_process_command(), args, os.environ) factory = protocol.Factory() factory.protocol = lambda: ProcessAMP(is_worker=False, locator=JobHandler()) self.process = yield endpoint.connect(factory) self.process.lost_handler = self._worker_died returnValue(self) def done(self): return self.max_jobs and (self.total_jobs >= self.max_jobs) def _worker_died(self, worker, reason): if not self.done(): self._logger.warning("Lost worker {worker} with {jobs} " "active jobs".format( worker=worker, jobs=self.active_jobs)) elif self.active_jobs: self._logger.warning("Worker {worker} exited with {jobs} " "active jobs".format( worker=worker, jobs=self.active_jobs)) else: self._logger.debug("Worker {worker} exited normally" .format(worker=worker)) self.pool.worker_died(self) def execute(self, serial, command, **kwargs): self.active_jobs += 1 self.total_jobs += 1 self.max_concurrent_jobs = max(self.active_jobs, self.max_concurrent_jobs) deferred = self.process.callRemote(command, serial=serial, **kwargs) if self.done(): self.process.callRemote(Shutdown) return deferred def cancel(self, serial): return self.process.callRemote(Cancel, serial=serial)
class JobHandler(amp.CommandLocator): """Resolve actions for jobs received over AMP""" _logger = ContextLogger() def __init__(self): super(JobHandler, self).__init__() self.jobs = dict() self.done = False def job_done(self, result, serial): if serial in self.jobs: del self.jobs[serial] if self.done and not self.jobs: reactor.callLater(3, reactor.stop) return result @Job.responder def execute_job(self, netbox, job, plugins, interval, serial): self._logger.debug( "Process {pid} received job {job} for" " netbox {netbox}" " with plugins {plugins}".format(pid=os.getpid(), job=job, netbox=netbox, plugins=",".join(plugins)), ) job = jobs.JobHandler(job, netbox, plugins, interval) self.jobs[serial] = job deferred = job.run() deferred.addBoth(self.job_done, serial) deferred.addCallback(lambda x: {'result': x, 'reschedule': 0}) def handle_reschedule(failure): failure.trap(jobs.SuggestedReschedule) return {'reschedule': failure.value.delay, "result": False} deferred.addErrback(handle_reschedule) return deferred @Cancel.responder def cancel(self, serial): if serial in self.jobs: self.jobs[serial].cancel() return {} @Shutdown.responder def shutdown(self): self.done = True return {} def log_jobs(self): self._logger.info("Got {jobs} active jobs".format(jobs=len(self.jobs))) for job in self.jobs.values(): self._logger.info("{job} {netbox} {plugins}".format( job=job.name, netbox=job.netbox, plugins=", ".join(job.plugins)))
class JobHandler(object): """Handles a single polling job against a single netbox. An instance of this class performs a polling job, as described by a job specification in the config file, for a single netbox. It will handle the dispatch of polling plugins, and contain state information for the job. """ _logger = ContextLogger() _queue_logger = ContextLogger(suffix='queue') _timing_logger = ContextLogger(suffix='timings') _start_time = datetime.datetime.min def __init__(self, name, netbox, plugins=None, interval=None): self.name = name self.netbox_id = netbox self.netbox = None self.cancelled = threading.Event() self.interval = interval self.plugins = plugins or [] self._log_context = {} self.containers = storage.ContainerRepository() self.storage_queue = [] self.agent = None def _create_agentproxy(self): if self.agent: self._destroy_agentproxy() if not self.netbox.read_only: self.agent = None return port = next(ports) self.agent = AgentProxy(self.netbox.ip, 161, community=self.netbox.read_only, snmpVersion='v%s' % self.netbox.snmp_version, protocol=port.protocol, snmp_parameters=snmp_parameter_factory( self.netbox)) try: self.agent.open() except SnmpError as error: self.agent.close() session_count = self.agent.count_open_sessions() job_count = self.get_instance_count() self._logger.error( "%s (%d currently open SNMP sessions, %d job handlers)", error, session_count, job_count) raise AbortedJobError("Cannot open SNMP session", cause=error) else: self._logger.debug("AgentProxy created for %s: %s", self.netbox.sysname, self.agent) def _destroy_agentproxy(self): if self.agent: self.agent.close() self.agent = None @defer.inlineCallbacks def _find_plugins(self): """Populate the internal plugin list with plugin class instances.""" from nav.ipdevpoll.config import ipdevpoll_conf plugin_classes = [ plugin_registry[name] for name in self._get_valid_plugins() ] willing_plugins = yield self._get_willing_plugins(plugin_classes) plugins = [ cls(self.netbox, agent=self.agent, containers=self.containers, config=ipdevpoll_conf) for cls in willing_plugins ] if not plugins: defer.returnValue(None) defer.returnValue(plugins) def _get_valid_plugins(self): valid_plugins, invalid_plugins = splitby( lambda name: name in plugin_registry, self.plugins) if list(invalid_plugins): self._logger.error( "Non-existent plugins were configured for job " "%r (ignoring them): %r", self.name, list(invalid_plugins)) return valid_plugins @defer.inlineCallbacks def _get_willing_plugins(self, plugin_classes): willing_plugins = [] unwilling_plugins = [] for cls in plugin_classes: try: can_handle = yield defer.maybeDeferred(cls.can_handle, self.netbox) except db.ResetDBConnectionError: raise except Exception: self._logger.exception( "Unhandled exception from can_handle(): %r", cls) can_handle = False if can_handle: willing_plugins.append(cls) else: unwilling_plugins.append(cls) for (willingness, plugins) in [('unwilling', unwilling_plugins), ('willing', willing_plugins)]: if plugins: self._logger.debug("%s plugins: %r", willingness, [cls.__name__ for cls in plugins]) else: self._logger.debug("no %s plugins", willingness) defer.returnValue(willing_plugins) def _iterate_plugins(self, plugins): """Iterates plugins.""" plugins = iter(plugins) def log_plugin_failure(failure, plugin_instance): if failure.check(TimeoutError, defer.TimeoutError): self._logger.debug("Plugin %s reported a timeout", plugin_instance.alias, exc_info=True) raise AbortedJobError("Plugin %s reported a timeout" % plugin_instance.alias) elif 
failure.check(SuggestedReschedule): self._logger.debug( "Plugin %s suggested a reschedule in " "%d seconds", plugin_instance, failure.value.delay) elif failure.check(db.ResetDBConnectionError): pass else: log_unhandled_failure( self._logger, failure, "Plugin %s reported an unhandled failure", plugin_instance) return failure def next_plugin(result=None): self._raise_if_cancelled() try: plugin_instance = next(plugins) except StopIteration: return result self._logger.debug("Now calling plugin: %s", plugin_instance) self._start_plugin_timer(plugin_instance) df = defer.maybeDeferred(plugin_instance.handle) df.addErrback(self._stop_plugin_timer) df.addErrback(log_plugin_failure, plugin_instance) df.addCallback(self._stop_plugin_timer) df.addCallback(next_plugin) return df return next_plugin() @defer.inlineCallbacks def run(self): """Starts a polling job for netbox. :returns: A Deferred, whose result will be True when the job did something, or False when the job did nothing (i.e. no plugins ran). """ self.netbox = yield db.run_in_thread(dataloader.load_netbox, self.netbox_id) self._log_context.update( dict(job=self.name, sysname=self.netbox.sysname)) self._logger.debug("Job %r started with plugins: %r", self.name, self.plugins) # Initialize netbox in container nb = self._container_factory(shadows.Netbox, key=None, id=self.netbox.id, sysname=self.netbox.sysname) self._create_agentproxy() plugins = yield self._find_plugins() self._reset_timers() if not plugins: self._destroy_agentproxy() defer.returnValue(False) self._logger.debug("Starting job %r for %s", self.name, self.netbox.sysname) def wrap_up_job(_result): self._logger.debug("Job %s for %s done.", self.name, self.netbox.sysname) self._log_timings() return True def plugin_failure(failure): self._log_timings() if not failure.check(AbortedJobError): raise AbortedJobError("Job aborted due to plugin failure", cause=failure.value) return failure def save_failure(failure): if not failure.check(db.ResetDBConnectionError): log_unhandled_failure( self._logger, failure, "Save stage failed with unhandled error") self._log_timings() raise AbortedJobError("Job aborted due to save failure", cause=failure.value) def log_abort(failure): if failure.check(SuggestedReschedule): return failure if failure.check(AbortedJobError): self._logger.error("Job %r for %s aborted: %s", self.name, self.netbox.sysname, failure.value) return failure def save(result): if self.cancelled.isSet(): return wrap_up_job(result) df = self._save_container() df.addErrback(save_failure) df.addCallback(wrap_up_job) return df # pylint is unable to find reactor members: # pylint: disable=E1101 shutdown_trigger_id = reactor.addSystemEventTrigger( "before", "shutdown", self.cancel) def cleanup(result): self._destroy_agentproxy() reactor.removeSystemEventTrigger(shutdown_trigger_id) return result def log_externally_success(result): self._log_job_externally(True if result else None) return result def log_externally_failure(result): self._log_job_externally(False) return result # The action begins here df = self._iterate_plugins(plugins) df.addErrback(plugin_failure) df.addCallback(save) df.addErrback(log_abort) df.addBoth(cleanup) df.addCallbacks(log_externally_success, log_externally_failure) yield df defer.returnValue(True) def cancel(self): """Cancels a running job. Job stops at the earliest convenience. 
""" self.cancelled.set() self._logger.info("Cancelling running job") def _reset_timers(self): self._start_time = datetime.datetime.now() self._plugin_times = [] def _start_plugin_timer(self, plugin): now = datetime.datetime.now() timings = [plugin.__class__.__name__, now, now] self._plugin_times.append(timings) def _stop_plugin_timer(self, result=None): timings = self._plugin_times[-1] timings[-1] = datetime.datetime.now() return result def _log_timings(self): stop_time = datetime.datetime.now() job_total = stop_time - self._start_time times = [(plugin, stop - start) for (plugin, start, stop) in self._plugin_times] plugin_total = sum((i[1] for i in times), datetime.timedelta(0)) times.append(("Plugin total", plugin_total)) times.append(("Job total", job_total)) times.append(("Job overhead", job_total - plugin_total)) log_text = [] longest_label = max(len(i[0]) for i in times) format = "%%-%ds: %%s" % longest_label for plugin, delta in times: log_text.append(format % (plugin, delta)) dashes = "-" * max(len(i) for i in log_text) log_text.insert(-3, dashes) log_text.insert(-2, dashes) log_text.insert( 0, "Job %r timings for %s:" % (self.name, self.netbox.sysname)) self._timing_logger.debug("\n".join(log_text)) def get_current_runtime(self): """Returns time elapsed since the start of the job as a timedelta.""" return datetime.datetime.now() - self._start_time def _save_container(self): """ Parses the container and finds a sane storage order. We do this so we get ForeignKeys stored before the objects that are using them are stored. """ @db.cleanup_django_debug_after def complete_save_cycle(): # Traverse all the classes in the container repository and # generate the storage queue self._populate_storage_queue() # Prepare all shadow objects for storage. self._prepare_containers_for_save() # Actually save to the database result = self._perform_save() self._log_timed_result(result, "Storing to database complete") # Do cleanup for the known container classes. self._cleanup_containers_after_save() df = db.run_in_thread(complete_save_cycle) return df def _prepare_containers_for_save(self): """Runs every queued manager's prepare routine""" for manager in self.storage_queue: self._raise_if_cancelled() manager.prepare() def _cleanup_containers_after_save(self): """Runs every queued manager's cleanup routine""" self._logger.debug("Running cleanup routines for %d managers: %r", len(self.storage_queue), self.storage_queue) try: for manager in self.storage_queue: self._raise_if_cancelled() manager.cleanup() except AbortedJobError: raise except Exception: self._logger.exception( "Caught exception during cleanup. " "Last manager = %r", manager) import django.db if django.db.connection.queries: self._logger.error("The last query was: %s", django.db.connection.queries[-1]) raise def _log_timed_result(self, res, msg): self._logger.debug(msg + " (%0.3f ms)" % res) def _perform_save(self): start_time = time.time() manager = None try: self._log_containers("containers before save") for manager in self.storage_queue: self._raise_if_cancelled() manager.save() end_time = time.time() total_time = (end_time - start_time) * 1000.0 self._log_containers("containers after save") return total_time except AbortedJobError: raise except Exception: self._logger.exception( "Caught exception during save. " "Last manager = %s. 
Last model = %s", manager, getattr(manager, 'cls', None)) import django.db if django.db.connection.queries: self._logger.error("The last query was: %s", django.db.connection.queries[-1]) raise def _log_containers(self, prefix=None): log = self._queue_logger if not log.isEnabledFor(logging.DEBUG): return log.debug("%s%s", prefix and "%s: " % prefix, pprint.pformat(dict(self.containers))) def _populate_storage_queue(self): """Naive population of the storage queue. Assuming there are no inter-dependencies between instances of a single shadow class, the only relevant ordering is the one between the container types themselves. This method will only order instances according to the dependency (topological) order of their classes. """ for shadow_class in self.containers.sortedkeys(): manager = shadow_class.manager(shadow_class, self.containers) self.storage_queue.append(manager) def _container_factory(self, container_class, key, *args, **kwargs): """Container factory function""" return self.containers.factory(key, container_class, *args, **kwargs) def _raise_if_cancelled(self): """Raises an AbortedJobError if the current job is cancelled""" if self.cancelled.isSet(): raise AbortedJobError("Job was already cancelled") @classmethod def get_instance_count(cls): """Returns the number of JobHandler instances as seen by the garbage collector. """ return len([o for o in gc.get_objects() if isinstance(o, cls)]) # pylint: disable=W0703 @defer.inlineCallbacks def _log_job_externally(self, success=True): """Logs a job to the database""" duration = self.get_current_runtime() duration_in_seconds = (duration.days * 86400 + duration.seconds + duration.microseconds / 1e6) timestamp = time.time() def _create_record(timestamp): netbox = manage.Netbox.objects.get(id=self.netbox.id) if netbox.deleted_at: _logger.info( "Not logging job to db; delete of this IP device" " was requested at %s", netbox.deleted_at) return log = manage.IpdevpollJobLog( netbox_id=self.netbox.id, job_name=self.name, end_time=datetime.datetime.fromtimestamp(timestamp), duration=duration_in_seconds, success=success, interval=self.interval) log.save() def _log_to_graphite(): prefix = metric_prefix_for_ipdevpoll_job(self.netbox.sysname, self.name) runtime_path = prefix + ".runtime" runtime = (runtime_path, (timestamp, duration_in_seconds)) send_metrics([runtime]) _log_to_graphite() try: yield db.run_in_thread(_create_record, timestamp) except db.ResetDBConnectionError: pass # this is being logged all over the place at the moment except Exception as error: _logger.warning("failed to log job to database: %s", error)
class MibRetriever(object): """Base class for functioning MIB retriever classes.""" mib = None __metaclass__ = MibRetrieverMaker _logger = ContextLogger() def __init__(self, agent_proxy): """Create a new instance tied to an AgentProxy instance.""" super(MibRetriever, self).__init__() self.agent_proxy = agent_proxy # touch _logger to initialize logging context right away # pylint: disable=W0104 self._logger @defer.inlineCallbacks def get_next(self, object_name): oid = self.nodes[object_name].oid result = yield self.agent_proxy.walk(str(oid)) if hasattr(result, 'items'): result = result.items() for key, value in result: if oid.is_a_prefix_of(key): defer.returnValue(value) def retrieve_column(self, column_name): """Retrieve the contents of a single MIB table column. Returns a deferred whose result is a dictionary: { row_index: column_value } """ node = self.nodes[column_name] if node.raw_mib_data['nodetype'] != 'column': self._logger.debug("%s is not a table column", column_name) def resultFormatter(result): formatted_result = {} # result keys may be OID objects/tuples or strings, depending on # snmp library used if node.oid not in result and str(node.oid) not in result: self._logger.debug( "%s (%s) seems to be unsupported, result " "keys were: %r", column_name, node.oid, result.keys()) return {} varlist = result.get(node.oid, result.get(str(node.oid), None)) for oid, value in varlist.items(): # Extract index information from oid row_index = OID(oid).strip_prefix(node.oid) formatted_result[row_index] = value return formatted_result deferred = self.agent_proxy.getTable([str(node.oid)]) deferred.addCallback(resultFormatter) return deferred def retrieve_columns(self, column_names): """Retrieve a set of table columns. The table columns may come from different tables, as long as the table rows are indexed the same way. Returns a deferred whose result is a dictionary: { row_index: MibTableResultRow instance } """ def sortkey(col): return self.nodes[col].oid columns = iter(sorted(column_names, key=sortkey)) final_result = {} my_deferred = defer.Deferred() def result_aggregate(result, column): for row_index, value in result.items(): if row_index not in final_result: final_result[row_index] = \ MibTableResultRow(row_index, column_names) final_result[row_index][column] = value return True # schedule the next iteration (i.e. collect next column) def schedule_next(result=None): try: column = columns.next() except StopIteration: my_deferred.callback(final_result) return deferred = self.retrieve_column(column) deferred.addCallback(result_aggregate, column) deferred.addCallback(schedule_next) deferred.addErrback(my_deferred.errback) reactor.callLater(0, schedule_next) return my_deferred def retrieve_table(self, table_name): """Table retriever and formatter. Retrieves an entire MIB table. Returns a deferred whose result is a dictionary: { row_index: MibTableResultRow instance } Each dictionary key is a row index (an oid suffix tuple). Each dictionary value is a MibTableResultRow instance, which can be accessed as both a dictionary and a list. 
""" table = self.tables[table_name] def resultFormatter(result): formatted_result = {} for varlist in result.values(): # Build a table structure for oid in sorted(varlist.keys()): if not table.table.oid.is_a_prefix_of(oid): raise MibRetrieverError( "Received wrong response from client," "%s is not in %s" % (oid, table.table.oid)) # Extract table position of value oid_suffix = OID(oid).strip_prefix(table.row.oid) column_no = oid_suffix[0] row_index = oid_suffix[1:] if column_no not in table.reverse_column_index: self._logger.warning( "device response has bad table index %s in %s::%s, " "ignoring", oid_suffix, self.mib['moduleName'], table_name) continue column_name = table.reverse_column_index[column_no] if row_index not in formatted_result: formatted_result[row_index] = \ MibTableResultRow(row_index, table.columns.keys()) formatted_result[row_index][column_name] = varlist[oid] return formatted_result deferred = self.agent_proxy.getTable([str(table.table.oid)]) deferred.addCallback(resultFormatter) return deferred @classmethod def translate_result(cls, result): """Translate result values to pythonic values according to object syntax. Given a table result from one of this object's retrievers, every column object will have it's to_python translation rules applied. This is useful to insert into a callback chain for result formatting. """ for row in result.values(): for column in row.keys(): if column in cls.nodes: row[column] = cls.nodes[column].to_python(row[column]) return result
class WorkerPool(object): """This class represent a pool of worker processes to which jobs can be scheduled""" _logger = ContextLogger() def __init__(self, workers, max_jobs, threadpoolsize=None): twisted.internet.endpoints.log = HackLog self.workers = set() self.target_count = workers self.max_jobs = max_jobs self.threadpoolsize = threadpoolsize for i in range(self.target_count): self._spawn_worker() self.serial = 0 self.jobs = dict() def worker_died(self, worker): self.workers.remove(worker) if not worker.done(): self._spawn_worker() @inlineCallbacks def _spawn_worker(self): worker = yield Worker(self, self.threadpoolsize, self.max_jobs).start() self.workers.add(worker) def _cleanup(self, result, deferred): serial, worker = self.jobs[deferred] del self.jobs[deferred] worker.active_jobs -= 1 return result def _execute(self, command, **kwargs): ready_workers = [w for w in self.workers if not w.done()] if not ready_workers: raise RuntimeError("No ready workers") worker = min(ready_workers, key=lambda x: x.active_jobs) self.serial += 1 deferred = worker.execute(self.serial, command, **kwargs) if worker.done(): self._spawn_worker() self.jobs[deferred] = (self.serial, worker) deferred.addBoth(self._cleanup, deferred) return deferred def cancel(self, deferred): if deferred not in self.jobs: self._logger.debug("Cancelling job that isn't known") return serial, worker = self.jobs[deferred] return worker.cancel(serial) def execute_job(self, job, netbox, plugins=None, interval=None): deferred = self._execute(Job, job=job, netbox=netbox, plugins=plugins, interval=interval) def handle_reschedule(result): reschedule = result.get('reschedule', 0) if reschedule: raise jobs.SuggestedReschedule(delay=reschedule) return result deferred.addCallback(handle_reschedule) deferred.addCallback(lambda x: x['result']) return deferred def log_summary(self): self._logger.info("{active} out of {target} workers running".format( active=len(self.workers), target=self.target_count)) for worker in self.workers: self._logger.info(" - ready {ready} active {active}" " max {max} total {total}".format( ready=not worker.done(), active=worker.active_jobs, max=worker.max_concurrent_jobs, total=worker.total_jobs))