def main():
    time_start = time.time()

    ### ------------------------------------------------------------------------------
    ### Create and Parse Arguments
    ### ------------------------------------------------------------------------------

    # if getattr(sys, 'frozen', False):
    #     # frozen
    #     BASE_DIR = os.path.dirname(sys.executable)
    # else:
    #     # unfrozen
    #     BASE_DIR = os.path.dirname(os.path.realpath(__file__))

    BASE_DIR = os.getcwd()

    full_parser = argparse.ArgumentParser()
    full_parser.add_argument("--tag",
                             nargs='+',
                             help="Collect data from hosts that match the tag")
    full_parser.add_argument(
        "--cmd-tag",
        nargs='+',
        help="Collect data from commands that match the tag")
    full_parser.add_argument("-s", "--start",
                             action='store_true',
                             help="Start collecting (default 'no')")
    full_parser.add_argument("--loglvl",
                             default=20,
                             help="Logs verbosity, 10-debug, 50-critical")
    full_parser.add_argument("--logdir",
                             default="",
                             help="Directory where to store logs")
    full_parser.add_argument(
        "--sharding",
        help="Define if the script is part of a shard; include the position "
        "within the shard and the shard size [0/3]")
    full_parser.add_argument(
        "--sharding-offset",
        default=True,
        help="Define if an offset needs to be applied to the shard_id")
    full_parser.add_argument("--parserdir",
                             default="parsers",
                             help="Directory where to find parsers")
    full_parser.add_argument(
        "--collector-timeout",
        default=15,
        help="Timeout for collector device rpc/rest calls")
    full_parser.add_argument("--retry", default=5, help="Max retry")
    full_parser.add_argument("--host", default=None, help="Host DNS or IP")
    full_parser.add_argument("--hosts",
                             default="hosts.yaml",
                             help="Hosts file in YAML")
    full_parser.add_argument("--commands",
                             default="commands.yaml",
                             help="Commands file in YAML")
    full_parser.add_argument("--credentials",
                             default="credentials.yaml",
                             help="Credentials file in YAML")
    full_parser.add_argument(
        "--no-facts",
        action='store_false',
        help="Disable facts collection on device (remove version and product "
        "name in results)")
    full_parser.add_argument("--output-format",
                             default="influxdb",
                             help="Format of the output")
    full_parser.add_argument("--output-type",
                             default="stdout",
                             choices=['stdout', 'http'],
                             help="Type of output")
    full_parser.add_argument("--output-addr",
                             default="http://localhost:8186/write",
                             help="Addr information for output action")
    full_parser.add_argument(
        "--no-collector-threads",
        action='store_true',
        help="Don't spawn multiple threads to collect the information on the "
        "devices")
    full_parser.add_argument(
        "--nbr-collector-threads",
        type=int,
        default=10,
        help="Maximum number of collector threads to spawn (default 10)")
    full_parser.add_argument(
        "--max-worker-threads",
        type=int,
        default=1,
        help="Maximum number of worker threads per interval for scheduler")
    full_parser.add_argument("--use-scheduler",
                             action='store_true',
                             help="Use scheduler")
    full_parser.add_argument(
        "--hosts-refresh-interval",
        type=int,
        default=3 * 60 * 60,
        help="Interval to periodically refresh dynamic host inventory")
    full_parser.add_argument("--allow-zero-hosts",
                             action='store_true',
                             help="Allow scheduler to run even with 0 hosts")

    dynamic_args = vars(full_parser.parse_args())

    # Print help if no parameters are provided
    if len(sys.argv) == 1:
        full_parser.print_help()
        sys.exit(1)

    ### ------------------------------------------------------------------------------
    ### Loading YAML Default Variables
    ### ------------------------------------------------------------------------------

    max_connection_retries = dynamic_args['retry']
    logging_level = int(dynamic_args['loglvl'])

    ### ------------------------------------------------------------------------------
    ### Validate Arguments
    ### ------------------------------------------------------------------------------

    pp = pprint.PrettyPrinter(indent=4)

    tag_list = []
    ### Known and fixed arguments
    if dynamic_args['tag']:
        tag_list = dynamic_args['tag']
    else:
        tag_list = [".*"]

    if not dynamic_args['start']:
        print('Missing <start> option, so nothing to do')
        sys.exit(0)

    ### ------------------------------------------------------------------------------
    ### Logging
    ### ------------------------------------------------------------------------------

    formatter = logging.Formatter(
        '%(asctime)s %(name)s: %(levelname)s: %(message)s')
    sh = logging.StreamHandler()
    sh.setFormatter(formatter)
    handlers = [sh]
    if dynamic_args['logdir']:
        log_dir = BASE_DIR + "/" + dynamic_args['logdir']
        ## Check that the logs directory exists, create it if needed
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        # Note: the original had a trailing comma here, which made `filename`
        # a tuple, and passed the invalid keyword `maxSize` to
        # RotatingFileHandler (the parameter is `maxBytes`)
        filename = log_dir + "/" + 'metric_collector.log'
        fh = logging.handlers.RotatingFileHandler(filename,
                                                  maxBytes=10 * 1024 * 1024,
                                                  backupCount=5)
        fh.setFormatter(formatter)
        handlers.append(fh)

    logging.basicConfig(level=logging_level, handlers=handlers)

    ### ------------------------------------------------------------------------------
    ### LOAD all credentials in a dict
    ### ------------------------------------------------------------------------------
    credentials = {}
    credentials_yaml_file = ''
    if os.path.isfile(dynamic_args['credentials']):
        credentials_yaml_file = dynamic_args['credentials']
    else:
        credentials_yaml_file = BASE_DIR + "/" + dynamic_args['credentials']

    logger.info('Importing credentials file: %s ', credentials_yaml_file)
    try:
        with open(credentials_yaml_file) as f:
            credentials = yaml.full_load(f)
    except Exception as e:
        logger.error('Error importing credentials file: %s: %s',
                     credentials_yaml_file, str(e))
        sys.exit(0)

    ### ------------------------------------------------------------------------------
    ### LOAD all commands with their tags in a dict
    ### ------------------------------------------------------------------------------
    commands_yaml_file = ''
    commands = []
    if os.path.isfile(dynamic_args['commands']):
        commands_yaml_file = dynamic_args['commands']
    else:
        commands_yaml_file = BASE_DIR + "/" + dynamic_args['commands']

    logger.info('Importing commands file: %s ', commands_yaml_file)
    with open(commands_yaml_file) as f:
        try:
            for document in yaml.load_all(f, yaml.FullLoader):
                commands.append(document)
        except Exception as e:
            logger.error('Error importing commands file: %s, %s',
                         commands_yaml_file, str(e))
            sys.exit(0)

    general_commands = commands[0]

    use_threads = not dynamic_args['no_collector_threads']

    if dynamic_args['cmd_tag']:
        command_tags = dynamic_args['cmd_tag']
    else:
        command_tags = ['.*']

    sharding = dynamic_args.get('sharding')
    sharding_offset = dynamic_args.get('sharding_offset')
    max_worker_threads = dynamic_args.get('max_worker_threads', 1)
    max_collector_threads = dynamic_args.get('nbr_collector_threads')

    if dynamic_args.get('use_scheduler', False):
        device_scheduler = scheduler.Scheduler(
            credentials,
            general_commands,
            dynamic_args['parserdir'],
            dynamic_args['output_type'],
            dynamic_args['output_addr'],
            max_worker_threads=max_worker_threads,
            use_threads=use_threads,
            num_threads_per_worker=max_collector_threads,
            collector_timeout=dynamic_args['collector_timeout'])
        hri = dynamic_args.get('hosts_refresh_interval', 6 * 60 * 60)
        select_hosts(
            dynamic_args['hosts'],
            tag_list,
            sharding,
            sharding_offset,
            scheduler=device_scheduler,
            refresh_interval=float(hri),
            allow_zero_hosts=dynamic_args.get('allow_zero_hosts', False),
        )
        device_scheduler.start()  # blocking call
        return

    ### ------------------------------------------------------------------------------
    ### LOAD all parsers
    ### ------------------------------------------------------------------------------
    parsers_manager = parser_manager.ParserManager(
        parser_dirs=dynamic_args['parserdir'])
    hosts_conf = select_hosts(dynamic_args['hosts'], tag_list, sharding,
                              sharding_offset)
    hosts_manager = host_manager.HostManager(credentials=credentials,
                                             commands=general_commands)
    hosts_manager.update_hosts(hosts_conf)

    coll = collector.Collector(hosts_manager=hosts_manager,
                               parser_manager=parsers_manager,
                               output_type=dynamic_args['output_type'],
                               output_addr=dynamic_args['output_addr'],
                               collect_facts=dynamic_args.get('no_facts', True),
                               timeout=dynamic_args['collector_timeout'])

    target_hosts = hosts_manager.get_target_hosts(tags=tag_list)

    if use_threads:
        target_hosts_lists = [
            target_hosts[x:x +
                         int(len(target_hosts) / max_collector_threads + 1)]
            for x in range(
                0, len(target_hosts),
                int(len(target_hosts) / max_collector_threads + 1))
        ]

        jobs = []
        for i, target_hosts_list in enumerate(target_hosts_lists, 1):
            logger.info(
                'Collector Thread-%s scheduled with following hosts: %s', i,
                target_hosts_list)
            thread = threading.Thread(target=coll.collect,
                                      args=('global', ),
                                      kwargs={
                                          "hosts": target_hosts_list,
                                          "cmd_tags": command_tags
                                      })
            jobs.append(thread)

        # Start the threads
        for j in jobs:
            j.start()

        # Ensure all of the threads have finished
        for j in jobs:
            j.join()
    else:
        # Execute everything in the main thread
        coll.collect('global', hosts=target_hosts, cmd_tags=command_tags)

    ### ------------------------------------------------------------------------------
    ### Collect Global Statistics
    ### ------------------------------------------------------------------------------
    time_end = time.time()
    time_execution = time_end - time_start

    global_datapoint = [{
        'measurement': global_measurement_prefix + '_stats_agent',
        'tags': {},
        'fields': {
            'execution_time_sec': "%.4f" % time_execution,
            'nbr_devices': len(target_hosts)
        },
        'timestamp': time.time_ns(),
    }]

    if dynamic_args.get('sharding') is not None:
        global_datapoint[0]['tags']['sharding'] = dynamic_args['sharding']

    if use_threads:
        global_datapoint[0]['fields']['nbr_threads'] = dynamic_args[
            'nbr_collector_threads']

    ### Send results to the right output
    try:
        if dynamic_args['output_type'] == 'stdout':
            utils.print_format_influxdb(global_datapoint)
        elif dynamic_args['output_type'] == 'http':
            utils.post_format_influxdb(global_datapoint,
                                       dynamic_args['output_addr'])
        else:
            logger.warning('Output format unknown: %s',
                           dynamic_args['output_type'])
    except Exception as ex:
        logger.warning("Hit error trying to post to influx: %s", str(ex))
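
# Example invocation (illustrative only; the script name and file paths are
# assumptions, the flags come from the argparse definitions above):
#
#   python metric_collector.py --start \
#       --hosts hosts.yaml --commands commands.yaml \
#       --credentials credentials.yaml \
#       --tag ".*" --output-type stdout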
def run(self):
    ''' Main run loop '''
    while True:
        if not self._run:
            return
        # `with` guarantees the lock is released even if an iteration raises;
        # the original acquire()/release() pair leaked the lock on exceptions
        with self._lock:
            logger.info('{}: Starting collection for {} hosts'.format(
                self.name, len(self.hostcmds)))
            hosts = list(self.hostcmds.keys())
            time_start = time.time()
            if self.use_threads:
                target_hosts_lists = [
                    hosts[x:x +
                          int(len(hosts) / self.num_collector_threads + 1)]
                    for x in range(
                        0, len(hosts),
                        int(len(hosts) / self.num_collector_threads + 1))
                ]
                jobs = []
                for i, target_hosts_list in enumerate(target_hosts_lists, 1):
                    logger.info(
                        '{}: Collector Thread-{} scheduled with following hosts: {}'
                        .format(self.name, i, target_hosts_list))
                    hostcmds = {}
                    for host in target_hosts_list:
                        hostcmds[host] = self.hostcmds[host]
                    job = threading.Thread(target=self.collector.collect,
                                           args=(self.name, ),
                                           kwargs={"host_cmds": hostcmds})
                    job.start()
                    jobs.append(job)

                # Ensure all of the threads have finished
                for j in jobs:
                    j.join()
            else:
                # Execute everything in the main thread
                self.collector.collect(self.name, host_cmds=self.hostcmds)

            time_end = time.time()
            time_execution = time_end - time_start

            worker_datapoint = [{
                'measurement':
                collector.global_measurement_prefix + '_worker_stats',
                'tags': {
                    'worker_name': self.name
                },
                'fields': {
                    'execution_time_sec': "%.4f" % time_execution,
                    'nbr_devices': len(self.hostcmds),
                    'nbr_threads': self.num_collector_threads
                },
                'timestamp': time.time_ns(),
            }]

            if os.environ.get('NOMAD_JOB_NAME'):
                worker_datapoint[0]['tags']['nomad_job_name'] = os.environ[
                    'NOMAD_JOB_NAME']
            if os.environ.get('NOMAD_ALLOC_INDEX'):
                worker_datapoint[0]['tags']['nomad_alloc_index'] = os.environ[
                    'NOMAD_ALLOC_INDEX']

            ### Send results to the right output
            try:
                if self.output_type == 'stdout':
                    utils.print_format_influxdb(worker_datapoint)
                elif self.output_type == 'http':
                    utils.post_format_influxdb(worker_datapoint,
                                               self.output_addr)
                else:
                    logger.warning('{}: Output format unknown: {}'.format(
                        self.name, self.output_type))
            except Exception:
                logger.exception("Hit exception trying to post to influx")

            logger.info('Worker {} took {} seconds to run'.format(
                self.name, time_execution))

        # sleep until next interval
        time.sleep(self.interval)
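
# The slice expression used in run() above (and again in main()) splits the
# host list into at most `num_collector_threads` chunks. A minimal standalone
# sketch of that chunking scheme (hypothetical helper for illustration; not
# referenced by the code above):
def _chunk_hosts(items, num_chunks):
    # floor(len/n) + 1 elements per chunk, so at most num_chunks slices
    size = int(len(items) / num_chunks) + 1
    return [items[x:x + size] for x in range(0, len(items), size)]

# e.g. _chunk_hosts(['h1', 'h2', 'h3', 'h4', 'h5'], 2)
#      -> [['h1', 'h2', 'h3'], ['h4', 'h5']]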
def collect(self, worker_name, hosts=None, host_cmds=None, cmd_tags=None):
    if not hosts and not host_cmds:
        logger.error('Collector: Nothing to collect')
        return

    if hosts:
        host_cmds = {}
        tags = cmd_tags or ['.*']
        for host in hosts:
            cmds = self.hosts_manager.get_target_commands(host, tags=tags)
            target_cmds = []
            for c in cmds:
                target_cmds += c['commands']
            host_cmds[host] = target_cmds

    for host, target_commands in host_cmds.items():
        values = []
        credential = self.hosts_manager.get_credentials(host)
        host_reachable = False

        logger.info('Collector starting for: %s', host)
        host_address = self.hosts_manager.get_address(host)
        host_context = self.hosts_manager.get_context(host)

        device_type = self.hosts_manager.get_device_type(host)
        if device_type == 'juniper':
            dev = netconf_collector.NetconfCollector(
                host=host,
                address=host_address,
                credential=credential,
                parsers=self.parser_manager,
                context=host_context)
        elif device_type == 'f5':
            dev = f5_rest_collector.F5Collector(
                host=host,
                address=host_address,
                credential=credential,
                parsers=self.parser_manager,
                context=host_context)
        else:
            # Guard against unsupported device types; previously `dev` was
            # left unbound here and dev.connect() raised a NameError
            logger.error('Unsupported device type %s for %s, skipping',
                         device_type, host)
            continue

        dev.connect()

        if dev.is_connected():
            dev.collect_facts()
            host_reachable = True
        else:
            logger.error('Unable to connect to %s, skipping', host)
            host_reachable = False

        time_execution = 0
        cmd_successful = 0
        cmd_error = 0

        if host_reachable:
            time_start = time.time()

            ### Execute commands on the device
            for command in target_commands:
                try:
                    logger.info('[%s] Collecting > %s', host, command)
                    data = dev.collect(command)  # returns a generator
                    if data:
                        values.append(data)
                        cmd_successful += 1
                except Exception as err:
                    cmd_error += 1
                    logger.error(
                        'An issue happened while collecting %s on %s > %s',
                        command, host, err)
                    logger.error(traceback.format_exc())

            ### Save collector statistics
            time_end = time.time()
            time_execution = time_end - time_start

        host_time_datapoint = [{
            'measurement':
            global_measurement_prefix + '_host_collector_stats',
            'tags': {
                'device': dev.hostname,
                'worker_name': worker_name
            },
            'fields': {
                'execution_time_sec': "%.4f" % time_execution,
                'nbr_commands': cmd_successful + cmd_error,
                'nbr_successful_commands': cmd_successful,
                'nbr_error_commands': cmd_error,
                'reacheable': int(host_reachable),
                'unreacheable': int(not host_reachable)
            },
            'timestamp': time.time_ns(),
        }]

        host_time_datapoint[0]['tags'].update(dev.context)
        if os.environ.get('NOMAD_JOB_NAME'):
            host_time_datapoint[0]['tags']['nomad_job_name'] = os.environ[
                'NOMAD_JOB_NAME']
        if os.environ.get('NOMAD_ALLOC_INDEX'):
            host_time_datapoint[0]['tags']['nomad_alloc_index'] = os.environ[
                'NOMAD_ALLOC_INDEX']

        values.append((n for n in host_time_datapoint))
        values = itertools.chain(*values)

        ### Send results to the right output
        try:
            if self.output_type == 'stdout':
                utils.print_format_influxdb(values)
            elif self.output_type == 'http':
                utils.post_format_influxdb(values, self.output_addr)
            else:
                logger.warning('Collector: Output format unknown: {}'.format(
                    self.output_type))
        except Exception:
            logger.exception("Hit exception trying to post to influx")

        if host_reachable:
            dev.close()
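
# For reference, every datapoint handed to utils.print_format_influxdb or
# utils.post_format_influxdb above follows the same dict shape. A sketch based
# on the datapoints built in this module; the concrete tag/field values shown
# are illustrative:
#
#   {
#       'measurement': global_measurement_prefix + '_host_collector_stats',
#       'tags': {'device': 'router1', 'worker_name': 'global'},
#       'fields': {'execution_time_sec': '0.1234', 'nbr_commands': 12},
#       'timestamp': time.time_ns(),  # nanosecond epoch timestamp
#   }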