import logging
import time

# Assumes the surrounding project provides: Configuration, TaskManager,
# TaskStorage, PingProbe, TraceProbe.


def main():
    logging.basicConfig(filename='log.main', level=logging.DEBUG,
                        filemode='w',
                        format='%(asctime)s %(levelname)s %(message)s')
    logging.info('Loading local configuration...')
    config = Configuration()
    node_details = config.load_node_main()

    logging.info('Loading task_manager...')
    task_manager = TaskManager(node_details)

    logging.info('Setting up task storage db...')
    task_storage = TaskStorage()

    task_manager.start()
    try:
        task_manager.add(PingProbe('149.210.184.36', recurrence_time=1,
                                   run_on_nodes=['trucks']))
        task_manager.add(TraceProbe('8.8.8.8', recurrence_time=3))
        task_manager.add(PingProbe('10.0.0.1', run_on_nodes=['miles']))
        while True:
            # Here we can send probes to the task_manager,
            # e.g. task_manager.add(IcmpProbe('127.0.0.1'))
            db_tasks = task_storage.get_tasks()
            for task in db_tasks:
                if node_details['name'] not in task['run_on_nodes']:
                    continue
                if task['type'] == 'PingProbe':
                    task_manager.add(PingProbe(
                        task['dest_addr'],
                        recurrence_time=task['recurrence_time'],
                        recurrence_count=task['recurrence_count'],
                        run_on_nodes=task['run_on_nodes']))
                elif task['type'] == 'TraceProbe':
                    task_manager.add(TraceProbe(
                        task['dest_addr'],
                        recurrence_time=task['recurrence_time'],
                        recurrence_count=task['recurrence_count'],
                        run_on_nodes=task['run_on_nodes']))
            time.sleep(5)
    except KeyboardInterrupt:
        task_manager.stop()
        print("\nThanks for joining!\n")
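
# A minimal sketch (not part of the original module) of the task document
# shape the polling loop above expects from TaskStorage.get_tasks(). Only
# the keys are implied by the loop; the values here are illustrative.
example_task = {
    'type': 'PingProbe',          # selects the probe class to instantiate
    'dest_addr': '192.0.2.1',     # probe destination address
    'recurrence_time': 5,         # seconds between recurring runs
    'recurrence_count': 10,       # number of times the probe should run
    'run_on_nodes': ['trucks'],   # only nodes named here pick the task up
}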
import datetime
import logging
import threading

import ujson

# ConfigParser is the Python 2 module name; on Python 3 it is configparser.
import ConfigParser

# Assumes the surrounding project provides: TaskManager, SlaveManager,
# ConnectionManager, RepeatedTimer.


class Master:
    def msg_callback(self, ch, method, properties, body):
        callback_set = {'SUCCESS': self.success,
                        'FAIL': self.fail,
                        'AWAKE': self.update_slave_response_time,
                        'STOP': self.stop,
                        'ADD_SLAVE': self.add_slave,
                        'KILL_SLAVE': self.kill_slave,
                        'RESTART_SLAVE': self.restart_slave,
                        'STAT': self.stat,
                        'START': self.start,
                        'RECONFIGURE': self.configure,
                        'REFRESH': self.refresh}
        try:
            command = body[:body.find(' ')]
            info = body[body.find(' ') + 1:]
            if command in callback_set:
                callback_set[command](ujson.loads(info))
            else:
                logging.debug(" [x] Unknown command %r" % (str(body),))
        except KeyError as e:
            if str(e) == "'Queue.DeclareOk'":
                logging.debug("Queue.DeclareOk at %r" % (str(body),))
            else:
                logging.error("Unknown KeyError at %r" % (str(body),))
        except RuntimeError as e:
            if 'recursion' in str(e):
                logging.error('MAXIMUM RECURSION ERROR')

    def __init__(self, conf_file):
        self.config = ConfigParser.ConfigParser(allow_no_value=True)
        self.clean_time_gap = None
        self.wait_time_for_slave = None
        self.master_queue_name = None
        self.task_queue_name = None
        self.task_queue_size_limit = None
        self.task_file_name = None
        self.task_counter_file = None
        self.ssh_key = None
        self.s3_bucket = None
        self.s3_folder = None
        self.slave_num_every_packup = None
        self.slave_max_sec_each_task = None
        self.slave_python_version = None
        self.master_ip = None
        self.slaves_ip = None
        self.slave_awake_frequency = None
        self.configure(conf_file)

        self.last_wake_time = None
        self.repeated_timer = None
        self.is_started = False
        self.pop_forever_handler = None

        logging.info('Starting task manager...')
        self.task_manager = TaskManager(self.task_file_name,
                                        self.task_counter_file)
        logging.info('Starting slave manager...')
        self.slave_manager = SlaveManager(
            master_ip=self.master_ip,
            slaves_ip=self.slaves_ip,
            ssh_key=self.ssh_key,
            s3_bucket=self.s3_bucket,
            s3_folder=self.s3_folder,
            slave_num_every_packup=self.slave_num_every_packup,
            slave_max_sec_each_task=self.slave_max_sec_each_task,
            slave_python_version=self.slave_python_version,
            slave_awake_frequency=self.slave_awake_frequency,
            slave_buffer_size=1)
        logging.info('Starting connection manager...')
        self.message_connection = ConnectionManager(
            queue_name=self.master_queue_name, durable=False,
            callback=self.msg_callback, no_ack=True)

    def run(self):
        logging.info(' [*] Waiting for messages. To exit press CTRL+C')
        try:
            self.message_connection.start_accepting_message()
        except KeyboardInterrupt:
            logging.info('Stopping master...')
            self.stop(None)
        except EOFError:
            logging.info('Download finished. Shutting down master.')
            self.stop(None)
        # except Exception as e:
        #     logging.info(str(e))
        #     logging.info('Stopping master...')

    # TODO: write all configuration in one file
    def configure(self, conf_file):
        self.config.read(conf_file)
        self.clean_time_gap = self.config.getint('main', 'clean_time_gap')
        self.wait_time_for_slave = self.config.getint('main',
                                                      'wait_time_for_slave')
        self.slave_awake_frequency = self.config.get('main',
                                                     'slave_awake_frequency')
        self.master_ip = self.config.get('main', 'master_private_ip')
        self.slaves_ip = self.config.get('main', 'slaves_private_ip')
        self.master_queue_name = self.config.get('main', 'master_queue_name')
        self.task_queue_name = self.config.get('main', 'task_queue_name')
        self.task_file_name = self.config.get('main', 'task_file')
        self.task_queue_size_limit = self.config.getint(
            'main', 'task_queue_size_limit')
        self.task_counter_file = self.config.get('main', 'task_counter_file')
        self.ssh_key = self.config.get('main', 'ssh_key')
        self.s3_bucket = self.config.get('main', 's3_bucket')
        self.s3_folder = self.config.get('main', 's3_folder')
        self.slave_num_every_packup = self.config.get(
            'main', 'slave_num_every_packup')
        self.slave_max_sec_each_task = self.config.get(
            'main', 'slave_max_sec_each_task')
        self.slave_python_version = self.config.get(
            'main', 'slave_python_version')

    def add_slave(self, slave_info):
        if self.slave_manager.exist_slave(slave_info):
            logging.info('Slave ' + slave_info['host'] + ' already exists.')
            return
        logging.info('master: add slave ' + str(slave_info))
        new_slave_info = self.slave_manager.add_slave(slave_info)
        self.slave_manager.run_slave(new_slave_info)

    # TODO:
    def kill_slave(self, slave_info):
        if not self.slave_manager.exist_slave(slave_info):
            return
        logging.info('kill slave ' + str(slave_info))
        self.slave_manager.kill_slave(slave_info)

    def restart_slave(self, slave_info):
        logging.info(slave_info['host'])
        logging.info('restart_slave ' + str(slave_info))
        self.kill_slave(slave_info)
        self.add_slave(slave_info)

    def start(self, info):
        logging.info('Master Starts')
        self.last_wake_time = datetime.datetime.utcnow()
        self.is_started = True
        self.pop_forever_handler = threading.Thread(
            target=self.start_popping_tasks)
        self.pop_forever_handler.start()
        self.repeated_timer = RepeatedTimer(self.clean_time_gap,
                                            self.notice_refresh, None)

    def pop_forever(self):
        self.start_popping_tasks()

    def get_task_queue_size(self):
        pass

    # TODO: There is a bottleneck here
    def start_popping_tasks(self):
        task_connection = ConnectionManager(queue_name=self.task_queue_name,
                                            durable=True, no_ack=False)
        eof_reached = False
        while self.is_started and not eof_reached:
            current_task_queue_size = task_connection.get_task_queue_size()
            while (self.is_started and
                   current_task_queue_size < self.task_queue_size_limit):
                task = self.task_manager.pop_task()
                if task is None:
                    # TODO: Don't use an error; just break and handle the
                    # case later in this function.
                    logging.info('EOF Reached')
                    eof_reached = True
                    break
                message = 'WORK ' + ujson.dumps(task)
                task_connection.publish(message)
                current_task_queue_size += 1
        task_connection.stop()

    def fail(self, slave_task_info):
        self.task_manager.add_task(slave_task_info['task'])
        self.slave_manager.update_last_response(slave_task_info)

    def success(self, slave_task_info):
        self.slave_manager.update_last_response(slave_task_info)

    def update_slave_response_time(self, slave_task_info):
        self.slave_manager.update_last_response(slave_task_info)

    def stop(self, info):
        self.is_started = False
        self.notice_slaves_stop()
        if self.pop_forever_handler is not None:
            self.pop_forever_handler.join()
        if self.repeated_timer is not None:
            self.repeated_timer.stop()
        self.slave_manager.stop()
        self.task_manager.stop()
        self.message_connection.stop()

    def notice_slaves_stop(self):
        task_connection = ConnectionManager(queue_name=self.task_queue_name,
                                            durable=True, no_ack=False)
        # Publish one STOP message per registered slave.
        for _ in list(self.slave_manager.slave_dict):
            task_connection.publish('STOP {}')
            # task_connection.broadcast_task('STOP {}')
        task_connection.stop()

    def refresh(self, info):
        cur_progress, total_task = self.task_manager.get_progress()
        logging.info('downloading {}/{} files'.format(cur_progress,
                                                      total_task))
        if not self.is_started:
            return
        # If the clean-up interval has passed, check for failed slaves.
        if self.last_wake_time is None:
            self.last_wake_time = datetime.datetime.utcnow()
        if self.last_wake_time + datetime.timedelta(
                seconds=self.clean_time_gap) > datetime.datetime.utcnow():
            return
        failed_slaves = self.slave_manager.get_failed_slaves(
            self.wait_time_for_slave)
        if len(failed_slaves) != 0:
            logging.info('Finding failed slaves... ' + str(failed_slaves))
            for slave in failed_slaves:
                self.restart_slave(slave)
        self.last_wake_time = datetime.datetime.utcnow()

    def notice_refresh(self, info):
        try:
            self.message_connection.publish('REFRESH {}')
        except IndexError:
            logging.critical('INDEX_ERROR')

    def stat(self, info):
        logging.info('=====================================')
        logging.info('Num of slaves: %s',
                     self.slave_manager.get_num_slaves())
        logging.info('=====================================')
        if len(info) > 0:
            for slave in self.slave_manager.slave_list:
                if slave['last_response'] is None:
                    delta = 'new slave'
                else:
                    delta = (datetime.datetime.utcnow() -
                             slave['last_response'])
                logging.info('%s | %s | %s', slave['host'], slave['queue'],
                             delta)
        logging.info('=====================================')
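
# A minimal sketch (not in the original source) of the wire format that
# msg_callback() above parses: a command word, one space, then a ujson
# payload. The encode/decode helpers here are hypothetical illustrations.
import ujson


def encode_message(command, payload):
    # Mirrors 'WORK ' + ujson.dumps(task) in start_popping_tasks().
    return command + ' ' + ujson.dumps(payload)


def decode_message(body):
    # Mirrors the split in msg_callback(): text before the first space is
    # the command; the remainder is the JSON-encoded info.
    split_at = body.find(' ')
    return body[:split_at], ujson.loads(body[split_at + 1:])

# e.g. encode_message('ADD_SLAVE', {'host': '10.0.0.7'})
#  -> 'ADD_SLAVE {"host":"10.0.0.7"}'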
import json
import logging

import shade

# Assumes the surrounding module provides: TaskManager,
# ManagerStoppedException, NotFound, exceptions, iterate_timeout,
# shade_inner_exceptions, get_provider_manager.


class ProviderManager(object):
    log = logging.getLogger("nodepool.ProviderManager")

    @staticmethod
    def reconfigure(old_config, new_config, use_taskmanager=True):
        stop_managers = []
        for p in new_config.providers.values():
            oldmanager = None
            if old_config:
                oldmanager = old_config.provider_managers.get(p.name)
            if oldmanager and p != oldmanager.provider:
                stop_managers.append(oldmanager)
                oldmanager = None
            if oldmanager:
                new_config.provider_managers[p.name] = oldmanager
            else:
                ProviderManager.log.debug(
                    "Creating new ProviderManager object for %s" % p.name)
                new_config.provider_managers[p.name] = \
                    get_provider_manager(p, use_taskmanager)
                new_config.provider_managers[p.name].start()
        for stop_manager in stop_managers:
            stop_manager.stop()

    @staticmethod
    def stopProviders(config):
        for m in config.provider_managers.values():
            m.stop()
            m.join()

    def __init__(self, provider, use_taskmanager):
        self.provider = provider
        self._images = {}
        self._networks = {}
        self.__flavors = {}
        self._use_taskmanager = use_taskmanager
        self._taskmanager = None

    def start(self):
        if self._use_taskmanager:
            self._taskmanager = TaskManager(None, self.provider.name,
                                            self.provider.rate)
            self._taskmanager.start()
        self.resetClient()

    def stop(self):
        if self._taskmanager:
            self._taskmanager.stop()

    def join(self):
        if self._taskmanager:
            self._taskmanager.join()

    @property
    def _flavors(self):
        if not self.__flavors:
            self.__flavors = self._getFlavors()
        return self.__flavors

    def _getClient(self):
        if self._use_taskmanager:
            manager = self._taskmanager
        else:
            manager = None
        return shade.OpenStackCloud(
            cloud_config=self.provider.cloud_config,
            manager=manager,
            **self.provider.cloud_config.config)

    def resetClient(self):
        self._client = self._getClient()
        if self._use_taskmanager:
            self._taskmanager.setClient(self._client)

    def _getFlavors(self):
        flavors = self.listFlavors()
        # Sort by RAM, smallest first (key= replaces the Python 2 cmp form).
        flavors.sort(key=lambda f: f['ram'])
        return flavors

    def findFlavor(self, min_ram, name_filter=None):
        # Note: this will throw an error if the provider is offline
        # but all the callers are in threads (they call in via CreateServer)
        # so the mainloop won't be affected.
        for f in self._flavors:
            if (f['ram'] >= min_ram
                    and (not name_filter or name_filter in f['name'])):
                return f
        raise Exception("Unable to find flavor with min ram: %s" % min_ram)

    def findImage(self, name):
        if name in self._images:
            return self._images[name]
        with shade_inner_exceptions():
            image = self._client.get_image(name)
        self._images[name] = image
        return image

    def findNetwork(self, name):
        if name in self._networks:
            return self._networks[name]
        with shade_inner_exceptions():
            network = self._client.get_network(name)
        self._networks[name] = network
        return network

    def deleteImage(self, name):
        if name in self._images:
            del self._images[name]
        with shade_inner_exceptions():
            return self._client.delete_image(name)

    def createServer(self, name, min_ram, image_id=None, image_name=None,
                     az=None, key_name=None, name_filter=None,
                     config_drive=True, nodepool_node_id=None,
                     nodepool_image_name=None,
                     nodepool_snapshot_image_id=None):
        if image_name:
            image = self.findImage(image_name)
        else:
            image = {'id': image_id}
        flavor = self.findFlavor(min_ram, name_filter)

        create_args = dict(name=name,
                           image=image,
                           flavor=flavor,
                           config_drive=config_drive)
        if key_name:
            create_args['key_name'] = key_name
        if az:
            create_args['availability_zone'] = az
        nics = []
        for network in self.provider.networks:
            if network.id:
                nics.append({'net-id': network.id})
            elif network.name:
                net_id = self.findNetwork(network.name)['id']
                nics.append({'net-id': net_id})
            else:
                raise Exception("Invalid 'networks' configuration.")
        if nics:
            create_args['nics'] = nics
        # Put provider.name and image_name in as groups so that ansible
        # inventory can auto-create groups for us based on each of those
        # qualities.
        # Also list each of those values directly so that non-ansible
        # consumption programs don't need to play a game of knowing that
        # groups[0] is the image name or anything silly like that.
        nodepool_meta = dict(provider_name=self.provider.name)
        groups_meta = [self.provider.name]
        if self.provider.nodepool_id:
            nodepool_meta['nodepool_id'] = self.provider.nodepool_id
        if nodepool_node_id:
            nodepool_meta['node_id'] = nodepool_node_id
        if nodepool_snapshot_image_id:
            nodepool_meta['snapshot_image_id'] = nodepool_snapshot_image_id
        if nodepool_image_name:
            nodepool_meta['image_name'] = nodepool_image_name
            groups_meta.append(nodepool_image_name)
        create_args['meta'] = dict(groups=json.dumps(groups_meta),
                                   nodepool=json.dumps(nodepool_meta))
        with shade_inner_exceptions():
            return self._client.create_server(wait=False, **create_args)

    def getServer(self, server_id):
        with shade_inner_exceptions():
            return self._client.get_server(server_id)

    def waitForServer(self, server, timeout=3600):
        with shade_inner_exceptions():
            return self._client.wait_for_server(server=server, auto_ip=True,
                                                reuse=False, timeout=timeout)

    def waitForServerDeletion(self, server_id, timeout=600):
        for count in iterate_timeout(timeout,
                                     exceptions.ServerDeleteException,
                                     "server %s deletion" % server_id):
            if not self.getServer(server_id):
                return

    def waitForImage(self, image_id, timeout=3600):
        last_status = None
        for count in iterate_timeout(timeout,
                                     exceptions.ImageCreateException,
                                     "image creation"):
            try:
                image = self.getImage(image_id)
            except NotFound:
                continue
            except ManagerStoppedException:
                raise
            except Exception:
                self.log.exception('Unable to list images while waiting for '
                                   '%s will retry' % (image_id))
                continue

            # shade returns None when not found
            if not image:
                continue

            status = image['status']
            if last_status != status:
                self.log.debug(
                    'Status of image in {provider} {id}: {status}'.format(
                        provider=self.provider.name,
                        id=image_id,
                        status=status))
                if status == 'ERROR' and 'fault' in image:
                    self.log.debug(
                        'ERROR in {provider} on {id}: {reason}'.format(
                            provider=self.provider.name,
                            id=image_id,
                            reason=image['fault']['message']))
            last_status = status
            # Glance client returns lower case statuses - but let's be sure
            if status.lower() in ['active', 'error']:
                return image

    def createImage(self, server, image_name, meta):
        with shade_inner_exceptions():
            return self._client.create_image_snapshot(image_name, server,
                                                      **meta)

    def getImage(self, image_id):
        with shade_inner_exceptions():
            return self._client.get_image(image_id)

    def uploadImage(self, image_name, filename, image_type=None, meta=None,
                    md5=None, sha256=None):
        # configure glance and upload image. Note the meta flags
        # are provided as custom glance properties
        # NOTE: we have wait=True set here. This is not how we normally
        # do things in nodepool, preferring to poll ourselves
        # thankyouverymuch.
        # However - two things to note:
        #  - PUT has no async mechanism, so we have to handle it anyway
        #  - v2 w/task waiting is very strange and complex - but we have to
        #    block for our v1 clouds anyway, so we might as well
        #    have the interface be the same and treat faking-out
        #    a shade-level fake-async interface later
        if not meta:
            meta = {}
        if image_type:
            meta['disk_format'] = image_type
        with shade_inner_exceptions():
            image = self._client.create_image(
                name=image_name,
                filename=filename,
                is_public=False,
                wait=True,
                md5=md5,
                sha256=sha256,
                **meta)
        return image.id

    def listImages(self):
        with shade_inner_exceptions():
            return self._client.list_images()

    def listFlavors(self):
        with shade_inner_exceptions():
            return self._client.list_flavors(get_extra=False)

    def listServers(self):
        # shade list_servers carries the nodepool server list caching logic
        with shade_inner_exceptions():
            return self._client.list_servers()

    def deleteServer(self, server_id):
        with shade_inner_exceptions():
            return self._client.delete_server(server_id, delete_ips=True)

    def cleanupServer(self, server_id):
        server = self.getServer(server_id)
        if not server:
            raise NotFound()
        self.log.debug('Deleting server %s' % server_id)
        self.deleteServer(server_id)

    def cleanupLeakedFloaters(self):
        with shade_inner_exceptions():
            self._client.delete_unattached_floating_ips()
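
# waitForServerDeletion() and waitForImage() above poll inside
# "for count in iterate_timeout(...)". A compatible generator is sketched
# here under the assumption that it sleeps between attempts and raises the
# given exception class once the deadline passes; the real helper lives
# elsewhere in the project, so this is an illustration, not the upstream
# code.
import time


def iterate_timeout(max_seconds, exc, purpose, interval=2):
    start = time.time()
    count = 0
    while time.time() < start + max_seconds:
        count += 1
        yield count           # caller checks its condition, returns if done
        time.sleep(interval)  # otherwise wait before the next attempt
    raise exc("Timeout waiting for %s" % purpose)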
# Variant of the ProviderManager above that adds per-server keypair
# management. Same module-level imports and assumed helpers as the version
# above, plus paramiko.
import json
import logging

import paramiko
import shade


class ProviderManager(object):
    log = logging.getLogger("nodepool.ProviderManager")

    @staticmethod
    def reconfigure(old_config, new_config, use_taskmanager=True):
        stop_managers = []
        for p in new_config.providers.values():
            oldmanager = None
            if old_config:
                oldmanager = old_config.provider_managers.get(p.name)
            if oldmanager and p != oldmanager.provider:
                stop_managers.append(oldmanager)
                oldmanager = None
            if oldmanager:
                new_config.provider_managers[p.name] = oldmanager
            else:
                ProviderManager.log.debug(
                    "Creating new ProviderManager object for %s" % p.name)
                new_config.provider_managers[p.name] = \
                    get_provider_manager(p, use_taskmanager)
                new_config.provider_managers[p.name].start()
        for stop_manager in stop_managers:
            stop_manager.stop()

    @staticmethod
    def stopProviders(config):
        for m in config.provider_managers.values():
            m.stop()
            m.join()

    def __init__(self, provider, use_taskmanager):
        self.provider = provider
        self._images = {}
        self._networks = {}
        self.__flavors = {}
        self._use_taskmanager = use_taskmanager
        self._taskmanager = None

    def start(self):
        if self._use_taskmanager:
            self._taskmanager = TaskManager(None, self.provider.name,
                                            self.provider.rate)
            self._taskmanager.start()
        self.resetClient()

    def stop(self):
        if self._taskmanager:
            self._taskmanager.stop()

    def join(self):
        if self._taskmanager:
            self._taskmanager.join()

    @property
    def _flavors(self):
        if not self.__flavors:
            self.__flavors = self._getFlavors()
        return self.__flavors

    def _getClient(self):
        if self._use_taskmanager:
            manager = self._taskmanager
        else:
            manager = None
        return shade.OpenStackCloud(
            cloud_config=self.provider.cloud_config,
            manager=manager,
            **self.provider.cloud_config.config)

    def resetClient(self):
        self._client = self._getClient()
        if self._use_taskmanager:
            self._taskmanager.setClient(self._client)

    def _getFlavors(self):
        flavors = self.listFlavors()
        # Sort by RAM, smallest first (key= replaces the Python 2 cmp form).
        flavors.sort(key=lambda f: f['ram'])
        return flavors

    def findFlavor(self, min_ram, name_filter=None):
        # Note: this will throw an error if the provider is offline
        # but all the callers are in threads (they call in via CreateServer)
        # so the mainloop won't be affected.
        for f in self._flavors:
            if (f['ram'] >= min_ram
                    and (not name_filter or name_filter in f['name'])):
                return f
        raise Exception("Unable to find flavor with min ram: %s" % min_ram)

    def findImage(self, name):
        if name in self._images:
            return self._images[name]
        with shade_inner_exceptions():
            image = self._client.get_image(name)
        self._images[name] = image
        return image

    def findNetwork(self, name):
        if name in self._networks:
            return self._networks[name]
        with shade_inner_exceptions():
            network = self._client.get_network(name)
        self._networks[name] = network
        return network

    def deleteImage(self, name):
        if name in self._images:
            del self._images[name]
        with shade_inner_exceptions():
            return self._client.delete_image(name)

    def addKeypair(self, name):
        # Generate a fresh RSA keypair and register the public half with
        # the cloud; the private key is returned to the caller.
        key = paramiko.RSAKey.generate(2048)
        public_key = key.get_name() + ' ' + key.get_base64()
        with shade_inner_exceptions():
            self._client.create_keypair(name=name, public_key=public_key)
        return key

    def listKeypairs(self):
        with shade_inner_exceptions():
            keypairs = self._client.list_keypairs()
        return keypairs

    def deleteKeypair(self, name):
        with shade_inner_exceptions():
            return self._client.delete_keypair(name=name)

    def createServer(self, name, min_ram, image_id=None, image_name=None,
                     az=None, key_name=None, name_filter=None,
                     config_drive=None, nodepool_node_id=None,
                     nodepool_image_name=None,
                     nodepool_snapshot_image_id=None):
        if image_name:
            image = self.findImage(image_name)
        else:
            image = {'id': image_id}
        flavor = self.findFlavor(min_ram, name_filter)

        create_args = dict(name=name,
                           image=image,
                           flavor=flavor,
                           config_drive=config_drive)
        if key_name:
            create_args['key_name'] = key_name
        if az:
            create_args['availability_zone'] = az
        nics = []
        for network in self.provider.networks:
            if network.id:
                nics.append({'net-id': network.id})
            elif network.name:
                net_id = self.findNetwork(network.name)['id']
                nics.append({'net-id': net_id})
            else:
                raise Exception("Invalid 'networks' configuration.")
        if nics:
            create_args['nics'] = nics
        # Put provider.name and image_name in as groups so that ansible
        # inventory can auto-create groups for us based on each of those
        # qualities.
        # Also list each of those values directly so that non-ansible
        # consumption programs don't need to play a game of knowing that
        # groups[0] is the image name or anything silly like that.
        nodepool_meta = dict(provider_name=self.provider.name)
        groups_meta = [self.provider.name]
        if nodepool_node_id:
            nodepool_meta['node_id'] = nodepool_node_id
        if nodepool_snapshot_image_id:
            nodepool_meta['snapshot_image_id'] = nodepool_snapshot_image_id
        if nodepool_image_name:
            nodepool_meta['image_name'] = nodepool_image_name
            groups_meta.append(nodepool_image_name)
        create_args['meta'] = dict(
            groups=json.dumps(groups_meta),
            nodepool=json.dumps(nodepool_meta)
        )
        with shade_inner_exceptions():
            return self._client.create_server(wait=False, **create_args)

    def getServer(self, server_id):
        with shade_inner_exceptions():
            return self._client.get_server(server_id)

    def waitForServer(self, server, timeout=3600):
        with shade_inner_exceptions():
            return self._client.wait_for_server(
                server=server, auto_ip=False, reuse=False, timeout=timeout)

    def waitForServerDeletion(self, server_id, timeout=600):
        for count in iterate_timeout(timeout,
                                     exceptions.ServerDeleteException,
                                     "server %s deletion" % server_id):
            if not self.getServer(server_id):
                return

    def waitForImage(self, image_id, timeout=3600):
        last_status = None
        for count in iterate_timeout(timeout,
                                     exceptions.ImageCreateException,
                                     "image creation"):
            try:
                image = self.getImage(image_id)
            except NotFound:
                continue
            except ManagerStoppedException:
                raise
            except Exception:
                self.log.exception('Unable to list images while waiting for '
                                   '%s will retry' % (image_id))
                continue

            # shade returns None when not found
            if not image:
                continue

            status = image['status']
            if last_status != status:
                self.log.debug(
                    'Status of image in {provider} {id}: {status}'.format(
                        provider=self.provider.name,
                        id=image_id,
                        status=status))
                if status == 'ERROR' and 'fault' in image:
                    self.log.debug(
                        'ERROR in {provider} on {id}: {reason}'.format(
                            provider=self.provider.name,
                            id=image_id,
                            reason=image['fault']['message']))
            last_status = status
            # Glance client returns lower case statuses - but let's be sure
            if status.lower() in ['active', 'error']:
                return image

    def createImage(self, server, image_name, meta):
        with shade_inner_exceptions():
            return self._client.create_image_snapshot(
                image_name, server, **meta)

    def getImage(self, image_id):
        with shade_inner_exceptions():
            return self._client.get_image(image_id)

    def uploadImage(self, image_name, filename, image_type=None, meta=None,
                    md5=None, sha256=None):
        # configure glance and upload image. Note the meta flags
        # are provided as custom glance properties
        # NOTE: we have wait=True set here. This is not how we normally
        # do things in nodepool, preferring to poll ourselves
        # thankyouverymuch.
        # However - two things to note:
        #  - PUT has no async mechanism, so we have to handle it anyway
        #  - v2 w/task waiting is very strange and complex - but we have to
        #    block for our v1 clouds anyway, so we might as well
        #    have the interface be the same and treat faking-out
        #    a shade-level fake-async interface later
        if not meta:
            meta = {}
        if image_type:
            meta['disk_format'] = image_type
        with shade_inner_exceptions():
            image = self._client.create_image(
                name=image_name,
                filename=filename,
                is_public=False,
                wait=True,
                md5=md5,
                sha256=sha256,
                **meta)
        return image.id

    def listImages(self):
        with shade_inner_exceptions():
            return self._client.list_images()

    def listFlavors(self):
        with shade_inner_exceptions():
            return self._client.list_flavors(get_extra=False)

    def listServers(self):
        # shade list_servers carries the nodepool server list caching logic
        with shade_inner_exceptions():
            return self._client.list_servers()

    def deleteServer(self, server_id):
        with shade_inner_exceptions():
            return self._client.delete_server(server_id, delete_ips=True)

    def cleanupServer(self, server_id):
        server = self.getServer(server_id)
        if not server:
            raise NotFound()

        # Delete the keypair only if it was created for this server, i.e.
        # it is not the provider-wide keypair.
        key_name = server.get('key_name')
        if key_name and key_name != self.provider.keypair:
            with shade_inner_exceptions():
                self._client.delete_keypair(name=server['key_name'])

        self.log.debug('Deleting server %s' % server_id)
        self.deleteServer(server_id)

    def cleanupLeakedFloaters(self):
        with shade_inner_exceptions():
            self._client.delete_unattached_floating_ips()
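
# A minimal usage sketch (illustrative, not from the original source) of the
# keypair-aware variant above: generate a throwaway keypair, boot a server
# with it, and let cleanupServer() delete both later. The function, the
# naming scheme, and the image/flavor values are hypothetical.
def launch_with_keypair(manager, node_id):
    key_name = 'nodepool-%s' % node_id          # hypothetical naming scheme
    private_key = manager.addKeypair(key_name)  # paramiko.RSAKey
    server = manager.createServer('node-%s' % node_id, min_ram=8192,
                                  image_name='ubuntu-xenial',
                                  key_name=key_name,
                                  nodepool_node_id=node_id)
    # cleanupServer() later deletes the server and, because key_name differs
    # from provider.keypair, the generated keypair as well.
    return server, private_key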