def __init__(self, rootDir, db, ctx):
    """Create the RRD-backed storage handle and start with empty caches.

    :param rootDir: directory handed to ``RrdWrapper``.
    :param db: database API object, kept as ``self.database``.
    :param ctx: request context, kept as ``self.context``.
    """
    # Build the storage wrapper first, exactly as before, so a failure here
    # leaves no other attribute assigned.
    self.local_storage = RrdWrapper(rootDir)
    self.database = db
    self.context = ctx

    # Per-run state; nothing has been collected yet.
    self.hosts = {}
    self.virtualMachines = {}
    self.estimatedMem = {}
    self.now = None

    # The last update is initialised to 24 hours before construction time.
    one_day = datetime.timedelta(hours=24)
    self.lastUpdateTime = datetime.datetime.now() - one_day
def collect_data(self, hostname, vm_name, resource):
    """Query the last hour of utilization data for one node from RRD storage.

    The node is addressed as ``"<hostname>.<vm_name>"``. The storage handle is
    created lazily from ``self.RRD_ROOT_DIR`` when ``self.local_storage`` is
    missing or ``None``.

    NOTE(review): the query result is discarded and ``None`` is returned,
    exactly as in the original code - confirm whether callers are supposed
    to receive the data.

    :param hostname: name of the physical host.
    :param vm_name: name of the virtual machine on that host.
    :param resource: name of the metric passed to the storage query.
    :return: always ``None``.
    """
    # getattr() instead of plain attribute access: the owning object may never
    # have assigned ``local_storage``, which previously raised AttributeError
    # before the lazy initialisation could run.
    storage = getattr(self, "local_storage", None)
    if storage is None:
        storage = RrdWrapper(self.RRD_ROOT_DIR)
        self.local_storage = storage

    node = "%s.%s" % (hostname, vm_name)
    endTime = datetime.datetime.now()
    # TODO: Move to configuration file customizable timedelta
    startTime = endTime - datetime.timedelta(hours=1)
    storage.query(startTime, endTime, resource, node)
    return None
class HealthMonitorManager(manager.Manager):
    """Manager that reacts to health alerts by planning and running VM live migrations.

    An incoming alert (``raise_alert``) is pre-filtered by ``DataProvider``;
    when it qualifies, current host/VM metrics are collected, the migration
    algorithm is executed, one of the returned plans is chosen and the
    resulting migrations are requested through the scheduler RPC API.
    """

    BASE_RPC_API_VERSION = "1.0"
    RPC_API_VERSION = "1.0"

    # Root directory of the RRD files, handed to DataProvider / RrdWrapper.
    RRD_ROOT_DIR = "/home/stack/ganglia"

    # Time at which the last algorithm run finished; None until the first run.
    timestamp = None
    # NOTE(review): raise_alert uses MigrationParams.STABILIZATION_TIME_DELTA,
    # not this attribute - confirm which one is authoritative.
    stabilizationTimeDelta = datetime.timedelta(minutes=20)
    # ``lock`` guards the STARTED flag; ``lock2`` is not used in this class body.
    lock = threading.RLock()
    lock2 = threading.RLock()

    # RPC API Implementation --------------------------------------------------

    def raise_alert(self, ctx=None, alert=None):
        """Handle one alert and, if it qualifies, run the migration algorithm.

        The alert is dropped when a run is already in progress or when the
        stabilization period since the last run has not elapsed yet. Errors
        raised while running the algorithm are logged, not propagated.

        :param ctx: request context (not used by this method).
        :param alert: alert payload, forwarded to ``DataProvider.preProcessAlert``.
        """
        LOG.info(alert)
        with self.lock:
            if self.STARTED:
                # Drop alert, algorithm is running.
                # TODO: Maybe alerts should be added to cyclic buffer?
                return
            # Do not check alerts because it's too early
            if (
                self.timestamp is not None
                and (self.timestamp + MigrationParams.STABILIZATION_TIME_DELTA) > datetime.datetime.now()
            ):
                LOG.info("It's too early to run algorithm. Waiting for stabilization.")
                return
            # Evaluate the alert once and reuse the verdict; the original
            # queried the storage a second time for the very same answer.
            should_run = bool(self.dataProvider.preProcessAlert(alert))
            self.STARTED = should_run

        try:
            if should_run and not self._is_migrating():
                self.prepare_resource_allocation_algorithm_input(alert)
        except Exception as err:
            # Logged instead of printed: the Python 2 ``print`` statement
            # duplicated this message and is a syntax error on Python 3.
            LOG.error("exception %s", err)
            LOG.error(err)
        finally:
            # Always release the "running" flag, whatever happened above.
            with self.lock:
                self.STARTED = False

    # -------------------------------------------------------------------------

    def _get_scheduler_rpc_api(self):
        """Return the scheduler RPC API, creating it on first use."""
        if not self.scheduler_rpc_api:
            self._init_scheduler()
        return self.scheduler_rpc_api

    def _is_migrating(self):
        """Return True when any instance in the database has vm_state "migrating"."""
        ctx = context.get_admin_context()
        for instance in self.db.instance_get_all(ctx):
            if instance.vm_state == "migrating":
                LOG.error("Migration in process. Abort algorithm execution")
                return True
        return False

    # Manager inherited -------------------------------------------------------

    def init_host(self):
        """Initialise contexts, the data provider, the algorithm and the RPC consumer."""
        self.topic = HealthMonitorAPI.HEALTH_MONITOR_TOPIC
        self.ctx = context.get_admin_context()
        self.ctx.read_deleted = "no"
        self.dataProvider = DataProvider(self.RRD_ROOT_DIR, self.db, self.ctx)
        self.instances = self.db.instance_get_all_by_host(self.ctx, self.host)
        self.migration_algorithm = AntColonyAlgorithm()
        self._init_monitors_connections()
        self.STARTED = False
        self.scheduler_rpc_api = None

    def periodic_tasks(self, context, raise_on_error=False):
        """No periodic work is performed by this manager."""
        pass

    # -------------------------------------------------------------------------

    class MigrationSettings(object):
        """Flags forwarded to the scheduler's ``live_migration`` call."""

        # Class-level defaults. ``block_migration`` used to be the tuple
        # ``(False,)``, which is truthy; it is now a real boolean.
        block_migration = False
        disk_over_commit = False

        def __init__(self, **kwargs):
            """Start from the defaults and override them with ``kwargs``."""
            self.block_migration = False
            self.disk_over_commit = False
            for key, value in kwargs.items():
                setattr(self, key, value)

    migration_settings = MigrationSettings()

    def _init_scheduler(self):
        """Create the scheduler RPC API client.

        :raises Exception: if no client object could be created.
        """
        self.scheduler_rpc_api = SchedulerAPI()
        if self.scheduler_rpc_api is None:
            LOG.error("Scheduler == None")
            raise Exception("Error during execution scheduler")

    def _init_monitors_connections(self):
        """Create the RPC connection and start consuming ``self.topic`` in a thread."""
        self.conn = rpc.create_connection(new=True)
        LOG.debug(_("Creating Consumer connection for Service %s") % self.topic)
        # NOTE(review): the dispatcher is created but ``self`` is what gets
        # registered as the consumer proxy, as in the original - confirm.
        self.create_rpc_dispatcher()
        # According to documentation fanout=True => broadcast to all services.
        self.conn.create_consumer(self.topic, self, fanout=True)
        # Consume from all consumers in a thread
        self.conn.consume_in_thread()

    def prepare_resource_allocation_algorithm_input(self, alert):
        """Collect current metrics, run the migration algorithm and apply its plan.

        Returns early (after logging) when data collection fails, when the
        algorithm raises or returns nothing, when no usable plan can be chosen,
        or when a severity-2 alert would not reduce the number of used hosts.
        ``self.timestamp`` is only updated when the run reaches the end.

        :param alert: alert payload; ``alert["severity"]`` is read here and the
            whole payload is passed on to the algorithm.
        """
        if not self.dataProvider.getData():
            LOG.error("skipping this round")
            return

        hosts = list(self.dataProvider.hosts.values())
        now = datetime.datetime.now()
        # One timestamp shared by every "stat" log line of this run.
        stamp = int(time.mktime(now.timetuple()))

        virtualMachines = []
        for host in hosts:
            LOG.error("stat [%s] host %s\t %s", stamp, host.Hostname, host.getMetrics())
            virtualMachines.extend(host._vms)
            for vm in host._vms:
                LOG.error("stat [%s]vm %s\t %s", stamp, vm.InstanceName, vm.getMetrics(host))

        InputData = namedtuple("InputData", "Hosts VirtualMachines Alert")
        input_data_set = InputData(Hosts=hosts, VirtualMachines=virtualMachines, Alert=alert)

        # Count used hosts and how many boundaries are violated
        usedHostsBeforeMigration = sum(host.getIsOn() for host in hosts)
        # Dictionary <host, tuple(upperBoundsViolations, lowerBoundsViolations)>
        violationsDictionaryBeforeMigration = HealthMonitorManager.count_boundaries_violations(hosts)

        # todo if alert mem
        self.dataProvider.updateWeights()

        LOG.error("Start Algorithm")
        # Pre-bind the name: it used to be undefined (NameError) whenever the
        # algorithm raised.
        migrationPlans = None
        try:
            migrationPlans = self.migration_algorithm.execute_algorithm(input_data_set)
        except Exception as exc:
            LOG.error("OOOOOPS %s" % exc)
        LOG.error("Stop Algorithm")
        if migrationPlans is None:
            LOG.error("Migration plans is none")
            return

        # choose_migration_plan also moves VMs between the in-memory hosts, so
        # the "after" figures below reflect the chosen plan.
        plan, migrations_counter = self.choose_migration_plan(migrationPlans, virtualMachines)
        if plan is None:
            LOG.error("There is no usable migration plan - skip")
            return

        # Count used hosts and how many boundaries are violated
        usedHostsAfterMigration = sum(host.getIsOn() for host in hosts)
        # Dictionary <host, tuple(upperBoundsViolations, lowerBoundsViolations)>
        violationsDictionaryAfterMigration = HealthMonitorManager.count_boundaries_violations(hosts)

        # Profit on violated SLA boundaries (computed, not acted upon yet).
        profitUpper, profitLower = HealthMonitorManager.boundaries_profit_gained(
            violationsDictionaryBeforeMigration, violationsDictionaryAfterMigration
        )

        LOG.error("stat [%s] Migration count %s", stamp, migrations_counter)
        LOG.error(
            "stat [%s] Hosts used before %s, after %s",
            stamp,
            usedHostsBeforeMigration,
            usedHostsAfterMigration,
        )

        # todo make alert['severity'] more human readable
        if alert["severity"] == 2 and usedHostsAfterMigration >= usedHostsBeforeMigration:
            LOG.error("There is no profit from migration - skip")
            return

        self.dataProvider.saveWeights()

        for mi in plan:
            LOG.error("stat [%s] migration %s@%s", stamp, mi.instance_id, mi.hostname)

        if migrations_counter != 0:
            self.execute_plan(plan)

        # Timestamp
        self.timestamp = datetime.datetime.now()

    @staticmethod
    def count_boundaries_violations(hosts):
        """Count violated upper and lower bounds for every host.

        Each bounds dictionary must contain the keys ``"C"``, ``"N"`` and
        ``"M"``; the truthy entries among them are counted and cross-checked
        against the sum over all values of the dictionary.

        :param hosts: iterable of ``Host`` objects.
        :return: dict mapping host -> (upper violations, lower violations).
        :raises Exception: when a bounds dictionary lacks a required key.
        """
        required_keys = ("C", "N", "M")

        def count_true(dictionary):
            assert isinstance(dictionary, dict)
            for key in required_keys:
                # Check the key actually asked for; the original tested "C"
                # three times, so missing "N"/"M" surfaced as a bare KeyError.
                if key not in dictionary:
                    LOG.error("Missing %s key", key)
                    raise Exception("Missing %s key" % key)
            return sum(1 for key in required_keys if dictionary[key])

        violations = {}
        for host in hosts:
            assert isinstance(host, Host)

            upperBounds = host.getUpperBounds()
            upperBoundsWithRaise = count_true(upperBounds)
            upperBoundsViolations = sum(int(violation) for violation in upperBounds.values())
            assert upperBoundsWithRaise == upperBoundsViolations, "Upperbounds violations count error"

            lowerBounds = host.getLowerBounds()
            lowerBoundsWithRaise = count_true(lowerBounds)
            lowerBoundsViolations = sum(int(violation) for violation in lowerBounds.values())
            assert lowerBoundsWithRaise == lowerBoundsViolations, "Lowerbounds violations count error"

            violations[host] = (upperBoundsViolations, lowerBoundsViolations)
        return violations

    @staticmethod
    def boundaries_profit_gained(violationsBefore, violationsAfter):
        """Return how many boundary violations were removed by a migration.

        Both arguments map host -> (upper violations, lower violations), as
        produced by ``count_boundaries_violations``. The violations are summed
        over the whole environment, separately for upper and lower bounds.

        :return: tuple ``(profitUpper, profitLower)``; for each value
            ``0`` means no difference, ``< 0`` means more violations than
            before and ``> 0`` means fewer violations than before.
        """
        assert isinstance(violationsBefore, dict)
        assert isinstance(violationsAfter, dict)
        assert len(violationsBefore) == len(violationsAfter)

        def sum_list_of_tuples(tuples):
            sumX, sumY = 0, 0
            for x, y in tuples:
                sumX += x
                sumY += y
            return sumX, sumY

        upperViolatedBefore, lowerViolatedBefore = sum_list_of_tuples(violationsBefore.values())
        upperViolatedAfter, lowerViolatedAfter = sum_list_of_tuples(violationsAfter.values())

        profitUpper = upperViolatedBefore - upperViolatedAfter
        profitLower = lowerViolatedBefore - lowerViolatedAfter
        return profitUpper, profitLower

    @staticmethod
    def _plan_item_for(plan, vm):
        """Return the plan item whose ``instance_id`` equals the VM's name, or None."""
        return find(lambda migration_item: migration_item.instance_id == vm.InstanceName, plan)

    def choose_migration_plan(self, migrationPlans, virtualMachines):
        """Pick the plan with the fewest real migrations and apply it in memory.

        A candidate is accepted only if it moves fewer VMs than there are VMs
        in total. Items that keep a VM on its current host are removed from
        the chosen plan, and for every real migration the VM is moved between
        the data provider's in-memory hosts.

        :param migrationPlans: iterable of plans (lists of migration items).
        :param virtualMachines: list of VM objects covered by the plans.
        :return: ``(plan, migration_count)``, or ``(None, None)`` when there
            are no plans or none of them is usable.
        """
        if not migrationPlans:
            LOG.info("There are no migration plans")
            return (None, None)

        minValue = len(virtualMachines)
        plan = None
        for current in migrationPlans:
            if current is None:
                continue
            migrationCount = 0
            complete = True
            for vm in virtualMachines:
                migrationItem = self._plan_item_for(current, vm)
                if migrationItem is None:
                    # The original dereferenced None here; a plan that does
                    # not cover every VM cannot be applied below, so skip it.
                    complete = False
                    break
                if vm.Hostname != migrationItem.hostname:
                    migrationCount += 1
            if not complete:
                LOG.error("Migration plan has no item for vm %s - plan skipped", vm.InstanceName)
                continue
            LOG.error("mg count %s", migrationCount)
            if migrationCount < minValue:
                plan = current
                minValue = migrationCount

        if plan is None:
            # Previously this ended in an "Plan is none" assertion failure.
            LOG.info("There are no migration plans")
            return (None, None)

        selfMigrations = []
        migrationCount = 0
        for vm in virtualMachines:
            assert vm is not None, "VM is None"
            migrationItem = self._plan_item_for(plan, vm)
            assert migrationItem is not None, "Migration item is None"
            if vm.Hostname != migrationItem.hostname:
                migrationCount += 1
                self.updateHostVmConn(vm, migrationItem)
            else:
                selfMigrations.append(migrationItem)

        # Items that leave a VM where it already is are not migrations.
        for mi in selfMigrations:
            plan.remove(mi)

        return plan, migrationCount

    def updateHostVmConn(self, vm, migrationItem):
        """Move ``vm`` from its current host to the plan's target host in memory."""
        provider_hosts = self.dataProvider.hosts
        assert migrationItem.hostname in provider_hosts, "data provider has no host specified in migration item"
        assert vm.Hostname in provider_hosts, "data provider has no host specified in vm"

        hostFrom = provider_hosts[vm.Hostname]
        hostTo = provider_hosts[migrationItem.hostname]
        hostFrom._vms.remove(vm)
        hostTo._vms.append(vm)

    def execute_plan(self, plan):
        """Execute a migration plan: live-migrate each VM to its target host.

        Items whose instance already runs on the target host are skipped.
        Exceptions propagate to the caller unchanged.

        :param plan: iterable of ``MigrationItem`` objects.
        """
        if not self.scheduler_rpc_api:
            self._init_scheduler()

        ctx = context.get_admin_context()
        instances = self.db.instance_get_all(self.ctx)

        for migrationItem in plan:
            assert isinstance(migrationItem, MigrationItem)
            instance = self._get_instance(migrationItem.instance_id, instances)
            assert instance is not None

            if instance["host"] == migrationItem.hostname:
                continue

            self.scheduler_rpc_api.live_migration(
                ctxt=ctx,
                block_migration=self.migration_settings.block_migration,
                disk_over_commit=self.migration_settings.disk_over_commit,
                instance=instance,
                dest=migrationItem.hostname,
            )

    def _get_instance(self, name, instances):
        """Return the first instance whose ``name`` matches, or None."""
        for instance in instances:
            if instance.name == name:
                return instance
        return None

    def collect_data(self, hostname, vm_name, resource):
        """Query the last hour of utilization data for one node from RRD storage.

        The node is addressed as ``"<hostname>.<vm_name>"``.

        NOTE(review): the query result is discarded and ``None`` is returned,
        exactly as in the original code - confirm the intended return value.

        :return: always ``None``.
        """
        # ``init_host`` never assigns ``local_storage``; getattr() lets the
        # lazy initialisation run instead of raising AttributeError.
        storage = getattr(self, "local_storage", None)
        if storage is None:
            storage = RrdWrapper(self.RRD_ROOT_DIR)
            self.local_storage = storage

        node = "%s.%s" % (hostname, vm_name)
        endTime = datetime.datetime.now()
        # TODO: Move to configuration file customizable timedelta
        startTime = endTime - datetime.timedelta(hours=1)
        storage.query(startTime, endTime, resource, node)
        return None

    def collect_data_remote(self, hostname, vm_name, resource):
        """Collect data over the network (AMQP). Not implemented.

        The unreachable draft that followed the raise built a
        ``HealthMonitorNodeAPI(hostname)`` client and called
        ``collect_recent_stats(self.ctx, {"resource": ..., "vm_name": ...})``.

        :param hostname: name of the physical host.
        :param vm_name: name of the virtual machine.
        :param resource: name of the metric.
        :raises NotImplementedError: always.
        """
        # ``NotImplemented`` is a comparison sentinel, not an exception;
        # raising it produced a TypeError instead of the intended signal.
        raise NotImplementedError("collect_data_remote is not implemented")

    def _test_rpc_call(self):
        """Manual check: request recent RAM stats for a hard-coded VM and log them."""
        health_monitor_node_rpc_api = HealthMonitorNodeAPI(self.host)
        message = {"resource": "RAM", "vm_name": "SEMY"}
        result = health_monitor_node_rpc_api.collect_recent_stats(self.ctx, message)
        LOG.info("Received: %s" % result)

    def test_migration(self):
        """Manual check: look up a hard-coded instance for a test migration.

        The live-migration call itself was commented out in the original and
        stays disabled here, so the logged status is always ``None``. The
        original body referenced the undefined names ``instance`` and
        ``migration_status`` and could only fail with a NameError.
        """
        instance_uuid = "3974a5b5-39d4-4bcf-a12d-a1a17bdf2341"
        hostname = "lab-os-1"

        if not self.scheduler_rpc_api:
            self._init_scheduler()

        ctx = context.get_admin_context()
        instances = self.db.instance_get_all(ctx)

        # NOTE(review): assumes instance records expose ``uuid`` - confirm.
        selected = None
        for instance in instances:
            if instance.uuid == instance_uuid:
                selected = instance
                break
        if selected is None:
            LOG.error("Instance %s not found", instance_uuid)
            return

        # Disabled on purpose: live migration of ``selected`` to ``hostname``
        # through self.scheduler_rpc_api.live_migration(...).
        migration_status = None
        LOG.error("Migration status %s" % migration_status)
class DataProvider(object):
    """Builds host/VM objects from RRD metrics and keeps per-VM weights between runs."""

    # (window length in minutes, weight) pairs used by getWeightedAverageData.
    _AVERAGE_WINDOWS = ((5, 0.4), (10, 0.3), (15, 0.2), (30, 0.1))

    # preProcessAlert triggers the algorithm when utilization (in percent)
    # is above the upper or below the lower threshold.
    _UPPER_UTIL_THRESHOLD = 85
    _LOWER_UTIL_THRESHOLD = 40

    def __init__(self, rootDir, db, ctx):
        """Create the RRD storage wrapper and start with empty caches.

        :param rootDir: directory handed to ``RrdWrapper``.
        :param db: database API object.
        :param ctx: request context used for database calls.
        """
        self.local_storage = RrdWrapper(rootDir)
        self.database = db
        self.context = ctx
        # Instance name -> weights saved by updateWeights.
        self.virtualMachines = {}
        # Instance name -> memory value saved by saveWeights.
        self.estimatedMem = {}
        self.now = None
        self.lastUpdateTime = datetime.datetime.now() - datetime.timedelta(hours=24)
        # Host name -> Host object, rebuilt by getData.
        self.hosts = {}

    def getData(self):
        """Rebuild ``self.hosts`` from current RRD metrics and database instances.

        Hosts whose metrics cannot be read are logged and skipped. If any VM
        of a readable host cannot be built, the whole collection is abandoned.

        :return: True on success, False when a VM could not be built.
        """
        self.now = endTime = datetime.datetime.now()
        hostNames = self.local_storage.get_hosts_names()
        self.hosts = {}

        for hostName in hostNames:
            # From DB
            db_instances = self.database.instance_get_all_by_host(self.context, hostName)
            instance_names = [instance.name for instance in db_instances]

            try:
                cpu_idle = self.getWeightedAverageData(endTime, "cpu_idle", hostName)
                cpu_system = self.getWeightedAverageData(endTime, "cpu_system", hostName)
                cpu_num = self.getSingleValue(endTime, "cpu_num", hostName)
                cpu_speed = self.getSingleValue(endTime, "cpu_speed", hostName)
                mem = self.getSingleValue(endTime, "mem_total", hostName)
                mem_free = self.getWeightedAverageData(endTime, "mem_free", hostName)
            except Exception as err:
                LOG.error("error during retrieving host: %s data from rrd files: %s", hostName, err)
                continue

            host = Host(hostName, cpu_idle, cpu_system, cpu_num, cpu_speed, mem, mem_free)

            vms = []
            for instanceName in instance_names:
                vm = self.createVm(hostName, instanceName, cpu_speed, endTime)
                if vm is None:
                    return False
                vms.append(vm)

            host._vms = vms
            host.setVmMem()
            self.hosts[hostName] = host

        return True

    def createVm(self, hostName, instanceName, hostCpuSpeed, endTime):
        """Build a ``Vm`` from RRD metrics and attach previously saved weights.

        :return: the ``Vm`` object, or None when reading its metrics failed.
        """
        try:
            cpu_util = self.getWeightedAverageData(endTime, "vcpu_util", hostName, instanceName)
            cpu_num = self.getSingleValue(endTime, "vcpu_num", hostName, instanceName)
            pkts_in = self.getWeightedAverageData(endTime, "vpkts_in", hostName, instanceName)
            pkts_out = self.getWeightedAverageData(endTime, "vpkts_out", hostName, instanceName)
            mem_declared = self.getSingleValue(endTime, "vmem_total", hostName, instanceName)

            vm = Vm(hostName, instanceName, cpu_util, cpu_num, pkts_in, pkts_out, mem_declared, hostCpuSpeed)

            # ``in`` replaces dict.has_key(), which no longer exists on Python 3.
            if instanceName in self.virtualMachines:
                vm.setWeights(self.virtualMachines[instanceName])
            else:
                vm.setWeights(None)
            return vm
        except Exception as err:
            LOG.error("error during retrieving vm: %s data on host %s from rrd files: %s", instanceName, hostName, err)
            return None

    def saveWeights(self):
        """Record the current time and each VM's ``_mem`` value for the next run."""
        self.lastUpdateTime = datetime.datetime.now()
        stamp = int(time.mktime(self.lastUpdateTime.timetuple()))
        self.estimatedMem = {}
        for host in self.hosts.values():
            for vm in host._vms:
                self.estimatedMem[vm.InstanceName] = vm._mem
                LOG.error("stat [%s] instance: %s mem: %s", stamp, vm.InstanceName, vm._mem)

    def updateWeights(self):
        """Refresh the saved per-VM weights if the last save is at most 20 minutes old.

        For VMs with a saved memory estimate, the ratio of current to estimated
        memory is passed to ``vm.modifyM`` before the weights are stored.

        NOTE(review): the source was whitespace-mangled; storing the weights
        is assumed to happen for every VM, not only for those with an estimate.
        """
        if self.lastUpdateTime + datetime.timedelta(minutes=20) >= self.now:
            stamp = int(time.mktime(self.lastUpdateTime.timetuple()))
            self.virtualMachines = {}
            for host in self.hosts.values():
                for vm in host._vms:
                    if vm.InstanceName in self.estimatedMem:
                        estimatedMem = self.estimatedMem[vm.InstanceName]
                        assert estimatedMem != 0, "estimated mem is 0"
                        LOG.error("estimated mem %s %s", estimatedMem, vm._mem)
                        # float() keeps the ratio fractional even when both
                        # values are ints under Python 2 integer division.
                        # todo think what you're doing
                        dif = float(vm._mem) / estimatedMem
                        vm.modifyM(dif)
                    self.virtualMachines[vm.InstanceName] = vm.getWeights()
                    LOG.error(
                        "stat [%s] instance: %s weights: %s",
                        stamp,
                        vm.InstanceName,
                        self.virtualMachines[vm.InstanceName],
                    )
        else:
            LOG.error('Last update to long time ago - do not update weights')

    def preProcessAlert(self, alert):
        """Decide whether an alert should trigger the migration algorithm.

        The metric name is read from ``alert["value"][1]`` and the host name
        from ``alert["value"][9]["host"]``. Utilization over the last five
        minutes is computed for ``mem_util``, ``cpu_util`` or ``pkts``.

        :return: True when utilization is above the upper or below the lower
            threshold; False otherwise, including when the host runs no
            instances, the metric is unknown or an error occurred (the
            original returned None on the first and last of these paths).
        """
        try:
            counter = alert["value"]
            metricName = counter[1]
            hostName = counter[9]["host"]

            util = None
            now = datetime.datetime.now()
            startTime = now - datetime.timedelta(minutes=5)

            # From DB; check if there are any VMs running on this host
            db_instances = self.database.instance_get_all_by_host(self.context, hostName)
            if not db_instances:
                return False

            if metricName == 'mem_util':
                memFree = self.local_storage.query(startTime, now, "mem_free", hostname=hostName).Average
                memTotal = self.getSingleValue(now, "mem_total", hostName)
                # float() guards against Python 2 integer division.
                util = (1 - float(memFree) / memTotal) * 100
            elif metricName == 'cpu_util':
                cpu_idle = self.local_storage.query(startTime, now, "cpu_idle", hostname=hostName).Average
                util = 100 - float(cpu_idle)
            elif metricName == 'pkts':
                pkts_out = self.local_storage.query(startTime, now, "pkts_out", hostname=hostName).Average
                pkts_in = self.local_storage.query(startTime, now, "pkts_in", hostname=hostName).Average
                # NOTE(review): presumably 500 bytes per packet against a
                # 10 MiB capacity - confirm the meaning of these constants.
                util = (pkts_out + pkts_in) * 500.0 / 10485760 * 100

            LOG.error(
                "stat [%s] dataProvider host: %s %s util is %s",
                int(time.mktime(now.timetuple())),
                hostName,
                metricName,
                util,
            )

            if util is not None and (util > self._UPPER_UTIL_THRESHOLD or util < self._LOWER_UTIL_THRESHOLD):
                LOG.error("Trigger migration algorithm")
                return True
            return False
        except Exception as err:
            LOG.error("dataProvider preProcessAlert Exception %s", err)
            return False

    def getWeightedAverageData(self, endTime, metric, host, instance=None):
        """Return a weighted mean of a metric's averages over four look-back windows.

        The windows all end at ``endTime`` and span 5, 10, 15 and 30 minutes,
        weighted 0.4, 0.3, 0.2 and 0.1 respectively.

        :param endTime: end of every window.
        :param metric: metric name passed to the storage query.
        :param host: host name passed to the storage query.
        :param instance: optional instance name passed to the storage query.
        """
        weighted = 0.0
        for minutes, weight in self._AVERAGE_WINDOWS:
            startTime = endTime - datetime.timedelta(minutes=minutes)
            data = self.local_storage.query(startTime, endTime, metric, instance, host)
            weighted += weight * data.Average
        return weighted

    def getSingleValue(self, endTime, metric, host, instance=None):
        """Return the last single value of a metric from the minute before ``endTime``."""
        startTime = endTime - datetime.timedelta(minutes=1)
        result = self.local_storage.query(startTime, endTime, metric, instance, host)
        return result.getLastSingleValue()