Example #1
0
 def __init__(self, rootDir, db, ctx):
     """Set up the storage wrapper, the DB handle and the empty caches.

     :param rootDir: directory handed to ``RrdWrapper``
     :param db: database API object, kept as ``self.database``
     :param ctx: request context, kept as ``self.context``
     """
     self.local_storage = RrdWrapper(rootDir)
     self.database, self.context = db, ctx

     # Three independent, initially empty dictionaries.
     self.virtualMachines, self.estimatedMem, self.hosts = {}, {}, {}

     self.now = None

     # Start with a "last update" that lies 24 hours in the past.
     one_day = datetime.timedelta(hours=24)
     self.lastUpdateTime = datetime.datetime.now() - one_day
Example #2
0
    def collect_data(self, hostname, vm_name, resource):
        """
            Query the last hour of historical utilization data for a node.

            The node is addressed as ``"<hostname>.<vm_name>"``.
            Currently the data is read through ``RrdWrapper``.

        :param hostname: name of the host
        :param vm_name: name of the virtual machine
        :param resource: name of the resource to query
        :return: always ``None``
        """

        # node_topic = '%s.%s' % (HealthMonitorNodeAPI.HEALTH_MONITOR_NODE_TOPIC, hostname)

        # The wrapper is created lazily.  ``getattr`` is used because the
        # attribute may never have been assigned; a plain attribute read would
        # raise AttributeError instead of building the wrapper.
        if getattr(self, "local_storage", None) is None:
            self.local_storage = RrdWrapper(self.RRD_ROOT_DIR)

        node = "%s.%s" % (hostname, vm_name)

        endTime = datetime.datetime.now()
        # TODO: Move to configuration file customizable timedelta
        startTime = endTime - datetime.timedelta(hours=1)

        # NOTE(review): the query result is discarded and None is returned;
        # confirm whether callers are meant to receive the data.
        self.local_storage.query(startTime, endTime, resource, node)

        return None
Example #3
0
class HealthMonitorManager(manager.Manager):
    """Manager that reacts to alerts by planning and executing VM migrations.

    Alerts arrive through ``raise_alert``.  Data about hosts and virtual
    machines comes from a ``DataProvider``, plans come from the configured
    migration algorithm, and migrations are requested through the scheduler
    RPC API.
    """

    BASE_RPC_API_VERSION = "1.0"
    RPC_API_VERSION = "1.0"

    # Root directory passed to DataProvider / RrdWrapper.
    RRD_ROOT_DIR = "/home/stack/ganglia"

    #    def __init__(self, topic=None):
    #        print "HelloMgr"
    ##        self.topic = topic

    # Time of the last executed migration plan; None until a plan has run.
    timestamp = None
    # NOTE(review): not referenced in the visible code; raise_alert uses
    # MigrationParams.STABILIZATION_TIME_DELTA instead - confirm.
    stabilizationTimeDelta = datetime.timedelta(minutes=20)
    # Guards the STARTED flag in raise_alert.
    lock = threading.RLock()
    # NOTE(review): not referenced in the visible code - confirm it is needed.
    lock2 = threading.RLock()

    # RPC API Implementation -------------------------------------------------------------------------------------------
    def raise_alert(self, ctx=None, alert=None):
        """Handle an alert and, if warranted, run the migration algorithm.

        The alert is dropped when a run is already in progress or when the
        stabilization period after the previous migration has not elapsed.
        Otherwise the data provider decides whether the alert justifies a run.

        :param ctx: request context; not used by this method
        :param alert: alert payload, handed to the data provider
        :return: None
        """
        LOG.info(alert)

        with self.lock:
            if self.STARTED:
                # Drop alert, algorithm is running.
                # TODO: Maybe alerts should be added to cyclic buffer?
                return

            # Do not check alerts because it's too early
            if (
                self.timestamp is not None
                and (self.timestamp + MigrationParams.STABILIZATION_TIME_DELTA) > datetime.datetime.now()
            ):
                LOG.info("It's too early to run algorithm. Waiting for stabilization.")
                return

            # Evaluate the alert exactly once, while holding the lock, and
            # keep the verdict so the storage is not queried a second time.
            should_run = bool(self.dataProvider.preProcessAlert(alert))
            self.STARTED = should_run

        if not should_run:
            # STARTED is already False.  Writing it again here could clear
            # the flag of a run that another thread started in the meantime.
            return

        try:
            if not self._is_migrating():
                self.prepare_resource_allocation_algorithm_input(alert)
        except Exception as err:
            LOG.error(err)
        finally:
            # Always release the "running" flag, even on unexpected errors.
            with self.lock:
                self.STARTED = False

    # -------------------------------------------------------------------------------------------------------------------

    def _get_scheduler_rpc_api(self):
        """Return the scheduler RPC API, initialising it on first use."""
        if self.scheduler_rpc_api:
            return self.scheduler_rpc_api

        self._init_scheduler()
        return self.scheduler_rpc_api

    def _is_migrating(self):
        """Tell whether any instance in the database is currently migrating.

        :return: True as soon as an instance with ``vm_state == "migrating"``
                 is found (an error is logged), False otherwise
        """
        admin_ctx = context.get_admin_context()
        all_instances = self.db.instance_get_all(admin_ctx)

        # any() stops at the first migrating instance, like an early return.
        if any(inst.vm_state == "migrating" for inst in all_instances):
            LOG.error("Migration in process. Abort algorithm execution")
            return True

        return False

    # Manager inherited ------------------------------------------------------------------------------------------------
    def init_host(self):
        """Initialise the manager's state and start consuming alerts.

        Creates the admin context, the data provider and the migration
        algorithm, then opens the RPC consumer connection.
        """

        self.topic = HealthMonitorAPI.HEALTH_MONITOR_TOPIC
        self.ctx = context.get_admin_context()
        self.ctx.read_deleted = "no"
        self.dataProvider = DataProvider(self.RRD_ROOT_DIR, self.db, self.ctx)
        self.instances = self.db.instance_get_all_by_host(self.ctx, self.host)
        self.migration_algorithm = AntColonyAlgorithm()

        # raise_alert() and execute_plan() read these attributes.  They must
        # exist before the consumer thread is started below, otherwise an
        # early alert could hit an AttributeError.
        self.STARTED = False
        self.scheduler_rpc_api = None

        self._init_monitors_connections()

    #        self._test_rpc_call()

    def periodic_tasks(self, context, raise_on_error=False):
        """Periodic-task hook; this manager performs no periodic work.

        :param context: ignored
        :param raise_on_error: ignored
        :return: None
        """
        return None

    # -------------------------------------------------------------------------------------------------------------------

    class MigrationSettings(object):
        """Options forwarded to the scheduler's ``live_migration`` call.

        Both options default to ``False``.  Every keyword argument given to
        the constructor is stored as an attribute of the same name.
        """

        # Class-level defaults must be plain booleans: a one-element tuple
        # such as ``(False,)`` would be truthy.
        block_migration = False
        disk_over_commit = False

        def __init__(self, **kwargs):
            self.block_migration = False
            self.disk_over_commit = False
            for key, value in kwargs.items():
                setattr(self, key, value)

    migration_settings = MigrationSettings()

    #    migration_s = namedtuple("", "block_migration disk_over_commit")
    #   http://stackoverflow.com/questions/11708799/any-way-to-initialize-attributes-properties-during-class-creation-in-python

    def _init_scheduler(self):
        """Create the scheduler RPC client and store it on the manager.

        :raises Exception: if no client object was obtained
        """
        api = SchedulerAPI()
        self.scheduler_rpc_api = api

        if api is not None:
            return

        LOG.error("Scheduler == None")
        raise Exception("Error during execution scheduler")

    def _init_monitors_connections(self):
        """Open an RPC connection and start consuming the manager's topic."""
        connection = rpc.create_connection(new=True)
        self.conn = connection

        LOG.debug(_("Creating Consumer connection for Service %s") % self.topic)

        # NOTE(review): the dispatcher is created but the manager itself is
        # what gets registered as the consumer below - confirm this is meant.
        self.create_rpc_dispatcher()

        # According to documentation fanout=True => broadcast to all services.
        connection.create_consumer(self.topic, self, fanout=True)

        # Consume from all consumers in a thread
        connection.consume_in_thread()

    def prepare_resource_allocation_algorithm_input(self, alert):
        """
            Gather current host/VM data, run the migration algorithm and
            execute the chosen plan.

            Hostname is virtual machine's hostname (name)

        :param alert: alert that triggered the run; its ``"severity"`` entry
                      is read when deciding whether the plan is worth it
        :return: None
        """

        isValid = self.dataProvider.getData()

        if not isValid:
            LOG.error("skipping this round")
            return

        hosts = list(self.dataProvider.hosts.values())
        virtualMachines = []
        # One timestamp shared by every "stat" line of this run.
        stamp = int(time.mktime(datetime.datetime.now().timetuple()))

        for host in hosts:
            LOG.error("stat [%s] host %s\t %s", stamp, host.Hostname, host.getMetrics())
            virtualMachines.extend(host._vms)
            for vm in host._vms:
                LOG.error("stat [%s]vm %s\t %s", stamp, vm.InstanceName, vm.getMetrics(host))

        InputData = namedtuple("InputData", "Hosts VirtualMachines Alert")
        input_data_set = InputData(Hosts=hosts, VirtualMachines=virtualMachines, Alert=alert)

        # Count used hosts and how many boundaries are violated
        usedHostsBeforeMigration = sum(host.getIsOn() for host in hosts)
        # Dictionary <host, tuple(upperBoundsViolations, lowerBoundsViolations)>
        violationsDictionaryBeforeMigration = HealthMonitorManager.count_boundaries_violations(hosts)

        # todo if alert mem
        self.dataProvider.updateWeights()

        LOG.error("Start Algorithm")
        try:
            migrationPlans = self.migration_algorithm.execute_algorithm(input_data_set)
        except Exception as exc:
            # Without a result there is nothing to choose from; stop here
            # instead of failing later on an unbound variable.
            LOG.error("OOOOOPS %s" % exc)
            return
        finally:
            LOG.error("Stop Algorithm")

        if migrationPlans is None:
            LOG.error("Migration plans is none")
            return

        plan, migrations_counter = self.choose_migration_plan(migrationPlans, virtualMachines)

        if plan is None:
            # No plan was selected, so there is nothing to log or execute.
            LOG.error("No migration plan was chosen - skip")
            return

        # Count used hosts and how many boundaries are violated
        usedHostsAfterMigration = sum(host.getIsOn() for host in hosts)
        # Dictionary <host, tuple(upperBoundsViolations, lowerBoundsViolations)>
        violationsDictionaryAfterMigration = HealthMonitorManager.count_boundaries_violations(hosts)

        # Profit on the violated SLA boundaries.
        profitUpper, profitLower = HealthMonitorManager.boundaries_profit_gained(
            violationsDictionaryBeforeMigration, violationsDictionaryAfterMigration
        )
        LOG.debug("Boundary violations profit: upper %s, lower %s", profitUpper, profitLower)

        LOG.error("stat [%s] Migration count %s", stamp, migrations_counter)
        LOG.error(
            "stat [%s] Hosts used before %s, after %s",
            stamp,
            usedHostsBeforeMigration,
            usedHostsAfterMigration,
        )

        if alert["severity"] == 2 and usedHostsAfterMigration >= usedHostsBeforeMigration:
            # todo make alert['severity'] more human readable

            LOG.error("There is no profit from migration - skip")
            return

        self.dataProvider.saveWeights()

        for mi in plan:
            LOG.error("stat [%s] migration %s@%s", stamp, mi.instance_id, mi.hostname)

        if migrations_counter != 0:
            self.execute_plan(plan)

            # Timestamp
            self.timestamp = datetime.datetime.now()

    @staticmethod
    def count_boundaries_violations(hosts):
        """Count violated upper and lower bounds for every host.

        Each host's ``getUpperBounds()`` / ``getLowerBounds()`` dictionary
        must contain the keys "C", "N" and "M".

        :param hosts: iterable of ``Host`` objects
        :return: dict mapping each host to the tuple
                 ``(upperBoundsViolations, lowerBoundsViolations)``
        :raises Exception: if a bounds dictionary lacks a required key
        """
        required_keys = ("C", "N", "M")

        def count_true(dictionary):
            """Validate the required keys and count those with a true value."""
            assert isinstance(dictionary, dict)

            for key in required_keys:
                # Check the key that was asked for, not always "C".
                if key not in dictionary:
                    LOG.error("Missing %s key", key)
                    raise Exception("Missing %s key" % key)

            return sum(1 for key in required_keys if dictionary[key])

        def count_violations(bounds, error_message):
            """Count violations in one bounds dictionary, cross-checking both counts."""
            # Run the validating count outside of the assert so that it is
            # not skipped when assertions are disabled.
            counted_with_raise = count_true(bounds)
            counted = sum(int(violation) for violation in bounds.values())

            assert counted_with_raise == counted, error_message

            return counted

        violations = {}

        for host in hosts:
            assert isinstance(host, Host)

            upperBoundsViolations = count_violations(
                host.getUpperBounds(), "Upperbounds violations count error"
            )
            lowerBoundsViolations = count_violations(
                host.getLowerBounds(), "Lowerbounds violations count error"
            )

            violations[host] = (upperBoundsViolations, lowerBoundsViolations)

        return violations

    @staticmethod
    def boundaries_profit_gained(violationsBefore, violationsAfter):
        """Compare boundary-violation totals before and after migration.

        Both arguments map a host to ``(upper violations, lower violations)``.
        The tuples are summed over the whole environment and the "after"
        totals are subtracted from the "before" totals.

        :param violationsBefore: dict of violation tuples before migration
        :param violationsAfter: dict of violation tuples after migration
        :return: tuple ``(profitUpper, profitLower)``; for each value 0 means
                 no difference, a negative value means more violations and a
                 positive value means fewer violations
        """
        assert isinstance(violationsBefore, dict)
        assert isinstance(violationsAfter, dict)
        assert len(violationsBefore) == len(violationsAfter)

        def totals(violations):
            """Sum the first and second tuple members over all hosts."""
            upper_total, lower_total = 0, 0
            for upper, lower in violations.values():
                upper_total += upper
                lower_total += lower
            return upper_total, lower_total

        upper_before, lower_before = totals(violationsBefore)
        upper_after, lower_after = totals(violationsAfter)

        return upper_before - upper_after, lower_before - lower_after

    def choose_migration_plan(self, migrationPlans, virtualMachines):
        """Pick the plan that moves the fewest VMs and apply it to the host model.

        A plan is a list of migration items (``instance_id`` / ``hostname``).
        Plans that are ``None`` or that lack an item for one of the virtual
        machines are ignored.  For the chosen plan every real move is applied
        through ``updateHostVmConn`` and items that leave a VM on its current
        host are removed from the plan.

        :param migrationPlans: list of candidate plans
        :param virtualMachines: list of VMs (``InstanceName`` / ``Hostname``)
        :return: ``(plan, migrationCount)``, or ``(None, None)`` when there is
                 no usable plan
        """
        if not migrationPlans:
            LOG.info("There are no migration plans")
            return (None, None)

        def find_item(candidate, vm):
            """Return the migration item of ``candidate`` for ``vm``."""
            return find(lambda migration_item: migration_item.instance_id == vm.InstanceName, candidate)

        def count_migrations(candidate):
            """Count VMs that ``candidate`` moves; None if a VM has no item."""
            moves = 0
            for vm in virtualMachines:
                item = find_item(candidate, vm)
                if item is None:
                    return None
                if vm.Hostname != item.hostname:
                    moves += 1
            return moves

        plan = None
        minValue = None

        for current in migrationPlans:
            if current is None:
                continue

            migrationCount = count_migrations(current)
            if migrationCount is None:
                LOG.error("Migration plan does not cover every virtual machine - ignoring it")
                continue

            LOG.error("mg count %s", migrationCount)

            # Starting from None (not len(virtualMachines)) lets a plan that
            # moves every VM be selected when nothing cheaper exists.
            if minValue is None or migrationCount < minValue:
                plan = current
                minValue = migrationCount

        if plan is None:
            LOG.info("There are no migration plans")
            return (None, None)

        selfMigrations = []
        migrationCount = 0

        for vm in virtualMachines:
            migrationItem = find_item(plan, vm)

            if vm.Hostname != migrationItem.hostname:
                migrationCount += 1
                self.updateHostVmConn(vm, migrationItem)
            else:
                # The VM stays where it is; drop the item afterwards.
                selfMigrations.append(migrationItem)

        for mi in selfMigrations:
            plan.remove(mi)

        return plan, migrationCount

    def updateHostVmConn(self, vm, migrationItem):
        """Move ``vm`` between hosts in the data provider's in-memory model.

        The VM is removed from the ``_vms`` list of the host named by
        ``vm.Hostname`` and appended to the ``_vms`` list of the host named by
        ``migrationItem.hostname``.

        :param vm: virtual machine being moved
        :param migrationItem: migration item naming the destination host
        """
        hosts = self.dataProvider.hosts

        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        assert migrationItem.hostname in hosts, "data provider has no host specified in migration item"
        assert vm.Hostname in hosts, "data provider has no host specified in vm"

        hostFrom = hosts[vm.Hostname]
        hostTo = hosts[migrationItem.hostname]

        hostFrom._vms.remove(vm)
        hostTo._vms.append(vm)

    def execute_plan(self, plan):
        """
        Executes migration plan. Migrate VMs to given nodes.

        Items whose instance already runs on the requested host are skipped,
        as are items whose instance is not found in the database.

        :param plan: iterable of migration items (``instance_id`` / ``hostname``)
        :return: None
        """
        scheduler = self._get_scheduler_rpc_api()

        ctx = context.get_admin_context()
        instances = self.db.instance_get_all(self.ctx)

        for migrationItem in plan:
            instance = self._get_instance(migrationItem.instance_id, instances)

            if instance is None:
                # Skip this item but keep executing the rest of the plan.
                LOG.error("Instance %s not found - skipping its migration", migrationItem.instance_id)
                continue

            if instance["host"] == migrationItem.hostname:
                continue

            scheduler.live_migration(
                ctxt=ctx,
                block_migration=self.migration_settings.block_migration,
                disk_over_commit=self.migration_settings.disk_over_commit,
                instance=instance,
                dest=migrationItem.hostname,
            )

    def _get_instance(self, name, instances):
        """Return the first instance whose ``name`` equals ``name``, else None."""
        matches = (candidate for candidate in instances if candidate.name == name)
        return next(matches, None)

    def collect_data(self, hostname, vm_name, resource):
        """
            Query the last hour of historical utilization data for a node.

            The node is addressed as ``"<hostname>.<vm_name>"``.
            Currently the data is read through ``RrdWrapper``.

        :param hostname: name of the host
        :param vm_name: name of the virtual machine
        :param resource: name of the resource to query
        :return: always ``None``
        """

        # node_topic = '%s.%s' % (HealthMonitorNodeAPI.HEALTH_MONITOR_NODE_TOPIC, hostname)

        # The wrapper is created lazily.  ``getattr`` is used because
        # init_host() never assigns ``local_storage``; a plain attribute read
        # would raise AttributeError instead of building the wrapper.
        if getattr(self, "local_storage", None) is None:
            self.local_storage = RrdWrapper(self.RRD_ROOT_DIR)

        node = "%s.%s" % (hostname, vm_name)

        endTime = datetime.datetime.now()
        # TODO: Move to configuration file customizable timedelta
        startTime = endTime - datetime.timedelta(hours=1)

        # NOTE(review): the query result is discarded and None is returned;
        # confirm whether callers are meant to receive the data.
        self.local_storage.query(startTime, endTime, resource, node)

        return None

    def collect_data_remote(self, hostname, vm_name, resource):
        """
            Collect data from network (AMQP). Not Implemented

            The sketched implementation asks ``HealthMonitorNodeAPI(hostname)``
            for recent statistics through
            ``collect_recent_stats(self.ctx, {"resource": resource, "vm_name": vm_name})``.
        :param hostname:
        :param vm_name:
        :param resource:
        :return:
        :raises NotImplementedError: always
        """
        # ``NotImplemented`` is a comparison sentinel, not an exception class;
        # NotImplementedError is the exception meant for this purpose.
        raise NotImplementedError("collect_data_remote is not implemented")

    def _test_rpc_call(self):
        """Debug helper: request recent RAM stats of VM "SEMY" from this host's node API and log the reply."""
        node_api = HealthMonitorNodeAPI(self.host)
        reply = node_api.collect_recent_stats(self.ctx, {"resource": "RAM", "vm_name": "SEMY"})
        LOG.info("Received: %s" % reply)

    def test_migration(self):
        """
        Manual test helper: live-migrate one hard-coded instance to a
        hard-coded host and log the returned status.

        :return: None
        """

        instance_uuid = "3974a5b5-39d4-4bcf-a12d-a1a17bdf2341"
        hostname = "lab-os-1"

        if not self.scheduler_rpc_api:
            self._init_scheduler()

        ctx = context.get_admin_context()

        instances = self.db.instance_get_all(ctx)

        # NOTE(review): assumes instance records expose a "uuid" entry - confirm.
        selected = None
        for instance in instances:
            if instance["uuid"] == instance_uuid:
                selected = instance
                break

        if selected is None:
            LOG.error("Instance %s not found - nothing to migrate" % instance_uuid)
            return

        migration_status = self.scheduler_rpc_api.live_migration(
            ctxt=ctx,
            block_migration=self.migration_settings.block_migration,
            disk_over_commit=self.migration_settings.disk_over_commit,
            instance=selected,
            dest=hostname,
        )

        LOG.error("Migration status %s" % migration_status)
Example #4
0
class DataProvider(object):

    def __init__(self, rootDir, db, ctx):
        """Set up the storage wrapper, the DB handle and the empty caches.

        :param rootDir: directory handed to ``RrdWrapper``
        :param db: database API object, kept as ``self.database``
        :param ctx: request context, kept as ``self.context``
        """
        self.local_storage = RrdWrapper(rootDir)
        self.database, self.context = db, ctx

        # Three independent, initially empty dictionaries.
        self.virtualMachines, self.estimatedMem, self.hosts = {}, {}, {}

        self.now = None

        # Start with a "last update" that lies 24 hours in the past.
        one_day = datetime.timedelta(hours=24)
        self.lastUpdateTime = datetime.datetime.now() - one_day



    def getData(self):
        self.now = endTime = datetime.datetime.now()

        hostNames = self.local_storage.get_hosts_names()

        self.hosts = {}

        for hostName in hostNames:

            db_instances = self.database.instance_get_all_by_host(self.context, hostName) # From DB
            db_instnaces_names = [instance.name for instance in db_instances]

            try:
                cpu_idle = self.getWeightedAverageData(endTime, "cpu_idle", hostName)
                cpu_system = self.getWeightedAverageData(endTime, "cpu_system", hostName)
                cpu_num = self.getSingleValue(endTime, "cpu_num", hostName)
                cpu_speed = self.getSingleValue(endTime, "cpu_speed", hostName)
                mem = self.getSingleValue(endTime, "mem_total", hostName)
                mem_free = self.getWeightedAverageData(endTime, "mem_free", hostName)

            except Exception as err:
                LOG.error("error during retrieving host: %s data from rrd files: %s", hostName, err)
                continue

            host = Host(
                hostName,
                cpu_idle,
                cpu_system,
                cpu_num,
                cpu_speed,
                mem,
                mem_free)

            vms = []

            for instanceName in db_instnaces_names:

                vm = self.createVm(hostName, instanceName, cpu_speed, endTime)

                if vm is not None:
                    vms.append(vm)
                else:
                    return False

            host._vms = vms

            host.setVmMem()
            self.hosts[hostName] = host

        return True


    def createVm(self, hostName, instanceName, hostCpuSpeed, endTime):

        try:
            cpu_util = self.getWeightedAverageData(endTime, "vcpu_util", hostName, instanceName)
            cpu_num = self.getSingleValue(endTime, "vcpu_num", hostName, instanceName)
            pkts_in = self.getWeightedAverageData(endTime, "vpkts_in", hostName, instanceName)
            pkts_out = self.getWeightedAverageData(endTime, "vpkts_out", hostName, instanceName)
            mem_declared = self.getSingleValue(endTime, "vmem_total", hostName, instanceName)

            vm = Vm(
                hostName,
                instanceName,
                cpu_util,
                cpu_num,
                pkts_in,
                pkts_out,
                mem_declared,
                hostCpuSpeed)


            if self.virtualMachines.has_key(instanceName):
                vm.setWeights(self.virtualMachines[instanceName])
            else:
                vm.setWeights(None)

            return vm

        except Exception as err:
            LOG.error("error during retrieving vm: %s data on host %s from rrd files: %s",
                instanceName,
                hostName,
                err)

            return None


    def saveWeights(self):
	
        self.lastUpdateTime = datetime.datetime.now()

        self.estimatedMem = {}

        for host in self.hosts.values():

            for vm in host._vms:
                self.estimatedMem[vm.InstanceName] = vm._mem
                LOG.error("stat [%s] instance: %s mem: %s", int(time.mktime(self.lastUpdateTime.timetuple())), vm.InstanceName, vm._mem)

    def updateWeights(self):
	
        if self.lastUpdateTime + datetime.timedelta(minutes=20) >= self.now:
            self.virtualMachines = {}

            for host in self.hosts.values():
	        
                for vm in host._vms:

                    if self.estimatedMem.has_key(vm.InstanceName):
				
                        estimatedMem = self.estimatedMem[vm.InstanceName]
                        assert estimatedMem != 0, "estimated mem is 0"

                        LOG.error("estimated mem %s %s", estimatedMem, vm._mem)

                        dif = vm._mem / estimatedMem

                        #todo think what you're doing

                        vm.modifyM(dif)
                        self.virtualMachines[vm.InstanceName] = vm.getWeights()
                        LOG.error("stat [%s] instance: %s weights: %s", int(time.mktime(self.lastUpdateTime.timetuple())), vm.InstanceName, self.virtualMachines[vm.InstanceName])

        else:
            LOG.error('Last update to long time ago - do not update weights')


    def preProcessAlert(self, alert):

        try:
            counter = alert["value"]
            metricName = counter[1]
            hostName = counter[9]["host"]
            util = None

            now = datetime.datetime.now()

            startTime = now - datetime.timedelta(minutes=5)

            db_instances = self.database.instance_get_all_by_host(self.context, hostName) # From DB

            if len(db_instances) == 0:
                #check if there are any VMs running on this host
                return

            if metricName == 'mem_util':

                memFree = self.local_storage.query(startTime, now, "mem_free", hostname = hostName).Average
                memTotal = self.getSingleValue(now, "mem_total", hostName)

                util = (1 - memFree / memTotal) * 100

            elif metricName == 'cpu_util':

                cpu_idle = self.local_storage.query(startTime, now, "cpu_idle", hostname = hostName).Average

                util = 100 - float(cpu_idle)

            elif metricName == 'pkts':

                pkts_out = self.local_storage.query(startTime, now, "pkts_out", hostname = hostName).Average
                pkts_in = self.local_storage.query(startTime, now, "pkts_in", hostname = hostName).Average

                util = (pkts_out + pkts_in) * 500.0 / 10485760 * 100

            LOG.error("stat [%s] dataProvider host: %s %s util is %s", int(time.mktime(now.timetuple())),  hostName, metricName, util)

            if util is not None and (util > 85 or util < 40):
                LOG.error("Trigger migration algorithm")
                return True
            else:
                return False

        except Exception as err:
            LOG.error("dataProvider preProcessAlert Exception %s", err)

    def getWeightedAverageData(self, endTime, metric, host, instance=None):

        startTime = endTime - datetime.timedelta(minutes=5)

        _5minuteData = self.local_storage.query(startTime, endTime, metric, instance, host)

        startTime = endTime - datetime.timedelta(minutes=10)

        _10minuteData = self.local_storage.query(startTime, endTime, metric, instance, host)

        startTime = endTime - datetime.timedelta(minutes=15)

        _15minuteData = self.local_storage.query(startTime, endTime, metric, instance, host)

        startTime = endTime - datetime.timedelta(minutes=30)

        _30minuteData = self.local_storage.query(startTime, endTime, metric, instance, host)

        return 0.4 * _5minuteData.Average +\
               0.3 * _10minuteData.Average +\
               0.2 * _15minuteData.Average +\
               0.1 * _30minuteData.Average

    def getSingleValue(self, endTime, metric, host, instance=None):

        startTime = endTime - datetime.timedelta(minutes=1)

        result = self.local_storage.query(startTime, endTime, metric, instance, host)

        return result.getLastSingleValue()