# Presumed imports for this listing (the original module header was not
# included; Plugin, MonascaConnector, api, plugin_log and
# MONITORING_INTERVAL are project-specific names from the surrounding
# monitoring service):
import json
import time
from datetime import datetime

import kubernetes
import paramiko
import pytz
import redis
import requests
import tzlocal


class SparkProgressUPV(Plugin):
    def __init__(self, app_id, info_plugin, retries=60):
        Plugin.__init__(self,
                        app_id,
                        info_plugin,
                        collect_period=5,
                        retries=retries)

        self.monasca = MonascaConnector()

        self.submission_url = info_plugin['spark_submisson_url']
        self.expected_time = info_plugin['expected_time']

        self.remaining_time = int(self.expected_time)
        self.job_expected_time = int(self.expected_time)

        self.number_of_jobs = int(info_plugin['number_of_jobs'])
        self.current_job_id = 0

        self.dimensions = {
            'application_id': self.app_id,
            'service': 'spark-sahara'
        }

        self.conn = paramiko.SSHClient()
        self.conn.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.conn.connect(hostname=api.mesos_cluster_addr,
                          username=api.mesos_username,
                          password=api.mesos_password)

        self.spark_id = self._discover_id_from_spark()

    def _publish_measurement(self, jobs):

        application_progress_error = {}

        # Init
        jobs.reverse()

        if jobs:
            current_job = jobs[self.current_job_id]

            if current_job['status'] == 'FAILED':
                self.current_job_id = len(jobs) - 1

            elif current_job['status'] == 'SUCCEEDED':
                elapsed_time = float(
                    self._get_elapsed_time(current_job['submissionTime']))

                self.remaining_time = self.remaining_time - elapsed_time

                self.current_job_id = len(jobs) - 1

                # Job Time
                self.job_expected_time = (
                    self.remaining_time /
                    (float(self.number_of_jobs) - float(self.current_job_id)))

            elif current_job['status'] == 'RUNNING':
                # Job Progress
                job_progress = (current_job['numCompletedTasks'] /
                                float(current_job['numTasks']))

                # Elapsed Time
                elapsed_time = float(
                    self._get_elapsed_time(current_job['submissionTime']))

                # Reference Value
                ref_value = (elapsed_time / self.job_expected_time)

                # Error
                error = job_progress - ref_value

                application_progress_error['name'] = (
                    'application-progress.error')

                application_progress_error['value'] = error
                application_progress_error['timestamp'] = time.time() * 1000
                application_progress_error['dimensions'] = self.dimensions

                print(application_progress_error['value'])

                self.monasca.send_metrics([application_progress_error])

            time.sleep(MONITORING_INTERVAL)

    def _get_elapsed_time(self, gmt_timestamp):
        try:
            local_tz = tzlocal.get_localzone()

        except Exception:
            # Fall back to a fixed timezone if detection fails.
            local_tz = pytz.timezone("America/Recife")

        date_time = datetime.strptime(gmt_timestamp, '%Y-%m-%dT%H:%M:%S.%fGMT')
        date_time = date_time.replace(tzinfo=pytz.utc).astimezone(local_tz)
        date_time = date_time.replace(tzinfo=None)
        datetime_now = datetime.now()
        elapsed_time = datetime_now - date_time

        # total_seconds() is used so intervals longer than a day are not
        # truncated (timedelta.seconds wraps at 24 hours).
        return elapsed_time.total_seconds()
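
    # A standalone sketch of the same timezone handling, using a
    # hypothetical Spark timestamp ('%Y-%m-%dT%H:%M:%S.%fGMT' is the format
    # Spark's REST API uses for submissionTime):
    #
    #   sample = datetime.strptime('2017-05-10T14:00:00.000GMT',
    #                              '%Y-%m-%dT%H:%M:%S.%fGMT')
    #   local = sample.replace(tzinfo=pytz.utc) \
    #                 .astimezone(tzlocal.get_localzone()) \
    #                 .replace(tzinfo=None)
    #   elapsed = (datetime.now() - local).total_seconds()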

    def _discover_id_from_spark(self):
        for _attempt in range(30):
            stdin, stdout, stderr = self.conn.exec_command(
                'curl %s/api/v1/applications' % self.submission_url)
            applications_running = json.loads(stdout.read())

            for app in applications_running:
                if app['name'] == self.app_id:
                    return app['id']

            time.sleep(1)

        return None

    def _get_progress(self, spark_id):
        stdin, stdout, stderr = self.conn.exec_command(
            'curl %s/api/v1/applications/%s/jobs' %
            (self.submission_url, spark_id))

        return json.loads(stdout.read())

    def monitoring_application(self):
        try:
            job_request = self._get_progress(self.spark_id)

            self._publish_measurement(job_request)

        except Exception as ex:
            print("Error: No application found for %s. %s remaining attempts" %
                  (self.app_id, self.attempts))

            print(str(ex))
            raise
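
# A minimal usage sketch for SparkProgressUPV, with hypothetical values (the
# real info_plugin payload is assembled by the surrounding monitoring
# service, and the SSH credentials come from the project's api module; note
# that the 'spark_submisson_url' key is spelled exactly as the plugin
# reads it):
#
#   info_plugin = {'spark_submisson_url': 'http://spark-master:6066',
#                  'expected_time': '600',
#                  'number_of_jobs': '4'}
#   plugin = SparkProgressUPV('my-spark-app', info_plugin)
#   plugin.monitoring_application()
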
class OSGeneric(Plugin):
    def __init__(self, app_id, info_plugin, keypair, retries=60):
        Plugin.__init__(self, app_id, info_plugin, collect_period=5,
                        retries=retries)

        self.app_id = app_id
        self.host_ip = info_plugin['host_ip']
        self.expected_time = info_plugin['expected_time']
        self.log_path = info_plugin['log_path']
        self.keypair_path = keypair
        self.host_username = '******'
        self.dimensions = {"application_id": self.app_id,
                           "host": self.host_ip}

        self.last_checked = ''
        self.start_time = time.time()
        self.monasca = MonascaConnector()

    """ This method extracts the value information from a log line
        that contains the measurement for the interest metric """
    def _get_metric_value(self, log):
        value = None
        for i in range(len(log) - 1, 0, -1):
            if log[i] == '#':
                value = float(log[i + 1:-1])

        return value
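
    # For instance, given the hypothetical log line
    # '[Progress] step 3 done #0.75\n', the backwards scan above finds the
    # '#' marker and returns float('0.75') = 0.75 (the [i + 1:-1] slice
    # drops the trailing newline).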

    def _get_elapsed_time(self):
        delay = time.time() - self.start_time
        return delay

    """ This method returns a remote connection with the host where
        the log will be captured. It is possible to execute a command
        in the host using the function c.exec_command("write_command_here")
        with the object returned here """
    def _get_ssh_connection(self):
        keypair = paramiko.RSAKey.from_private_key_file(self.keypair_path)
        conn = paramiko.SSHClient()
        conn.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        conn.connect(hostname=self.host_ip, username=self.host_username,
                     pkey=keypair)

        return conn
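
    # For example (hypothetical command), the returned client can run:
    #
    #   conn = self._get_ssh_connection()
    #   stdin, stdout, stderr = conn.exec_command("uptime")
    #   print(stdout.read())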

    """ This is an auxiliary function to prepare and publish the metric.
        The point is to keep monitoring_application as simple as possible. """
    def _publish_metrics(self, last_log):
        metric = {}
        print(last_log)

        # Check if this log line contains a new metric measurement.
        if '[Progress]' in last_log and self.last_checked != last_log:
            self.last_checked = last_log

            """ Add to metric_info values for this measurement:
                value and timestamp """
            ref_value = self._get_elapsed_time() / self.expected_time
            measurement_value = self._get_metric_value(last_log)
            error = measurement_value - ref_value

            """ The Monasca metric must have the 3 following fields to be
                created: name, value and timestamp, but also is possible to
                increment the metrics identities - using dimensions -  and
                informations - using value_meta, a dictionary that contains
                aditional information about the measurement. """

            metric['name'] = 'application-progress.error'
            metric['value'] = error
            metric['timestamp'] = time.time() * 1000
            metric['dimensions'] = self.dimensions

            time_progress_metric = {} 
            time_progress_metric['name'] = 'application-progress.time_progress'
            time_progress_metric['value'] = ref_value
            time_progress_metric['timestamp'] = time.time() * 1000
            time_progress_metric['dimensions'] = self.dimensions

            app_progress_metric = {}
            app_progress_metric['name'] = 'application-progress.app_progress'
            app_progress_metric['value'] = measurement_value
            app_progress_metric['timestamp'] = time.time() * 1000
            app_progress_metric['dimensions'] = self.dimensions

            # Sending the metric to Monasca
            self.monasca.send_metrics([metric, app_progress_metric,
                                       time_progress_metric])

            print "Application progress error: %.4f" % error

        # Flag that checks if the log capture is ended
        elif '[END]' in last_log:
            self.running = False

    def monitoring_application(self):
        try:
            # First of all, a connection with the host is created.
            conn = self._get_ssh_connection()

            """ The second step consists in execute the command to capture
                the last log line from the log file using the connection
                create below and saving the outputs. """
            stdin , stdout, stderr = conn.exec_command(
                                         "sudo tail -1 %s" % self.log_path)

            # The last step is to actually publish using the captured log line
            self._publish_metrics(stdout.read())

        except Exception as ex:
            print("Monitoring %s is not possible. \n"
                  "Error: %s. %s remaining attempts" % (self.app_id,
                                                        str(ex),
                                                        self.attempts))

            raise
class SparkProgress(Plugin):

    def __init__(self, app_id, info_plugin, collect_period, retries=60):
        Plugin.__init__(self, app_id, info_plugin,
                        collect_period, retries=retries)

        self.monasca = MonascaConnector()

        self.submission_url = info_plugin['spark_submisson_url']
        self.expected_time = info_plugin['expected_time']


        self.number_of_jobs = int(info_plugin['number_of_jobs'])
        self.job_expected_time = (float(self.expected_time) 
                                  / float(self.number_of_jobs))

        self.remaining_time = float(self.expected_time)
        self.current_job_id = 0

        self.app_id = app_id
        self.dimensions = {'application_id': self.app_id,
                           'service': 'spark-sahara'}

        self.job_ratio = 1.0 / self.number_of_jobs
        self.first_submission_time = None


    def _publish_measurement(self, job_request):

        time_progress_metric = {}
        job_progress_metric = {}
        total_time_progress_metric = {}
        total_app_progress_metric = {}
        progress_error_metric = {}

        # Init
        jobs = job_request.json()
        jobs.reverse()

        if jobs:
            if self.current_job_id == 0:
                self.first_submission_time = (
                    jobs[self.current_job_id]['submissionTime'])

            current_job = jobs[self.current_job_id]

            if current_job['status'] == 'FAILED':
                self.current_job_id = len(jobs) - 1

            elif current_job['status'] == 'SUCCEEDED':
                elapsed_time = float(self._get_elapsed_time(
                               current_job['submissionTime']))

                self.remaining_time = self.remaining_time - elapsed_time

                self.current_job_id = len(jobs) - 1

                # Job Time
                if self.remaining_time <= 0.0:
                    self.job_expected_time = -1
                else: 
                    self.job_expected_time = (self.remaining_time
                                     / (float(self.number_of_jobs)
                                     - float(self.current_job_id)))

            elif current_job['status'] == 'RUNNING':
                # Job Progress
                job_progress = (current_job['numCompletedTasks']
                                / float(current_job['numTasks']))


                # Total Elapsed Time
                total_elapsed_time = float(self._get_elapsed_time(
                                         self.first_submission_time))

                # Total Time Progress
                total_time_progress = (
                    total_elapsed_time / float(self.expected_time))

                # Total Application Progress
                total_app_progress = self.job_ratio * (self.current_job_id
                                                 + job_progress)

                # New Progress Error
                new_progress_error = total_app_progress - total_time_progress

                # Elapsed Time
                elapsed_time = float(self._get_elapsed_time(
                               current_job['submissionTime']))

                plugin_log.log("%s | %s: Elapsed time: %.2f -\
                                         Expected time: %.2f" % 
                    (time.strftime("%H:%M:%S"),
                     self.app_id,
                     elapsed_time,
                     self.job_expected_time))

                # Error
                if self.job_expected_time == -1:
                    time_progress = 1
                    error = -1.0
                else:
                    time_progress = (elapsed_time / self.job_expected_time)

                    if time_progress > 1:
                        time_progress = 1

                    error = job_progress - time_progress
               
                progress_error_metric = self._format_metric(
                    'application-progress.error',
                    new_progress_error, 
                    time.time() * 1000,
                    self.dimensions
                )

                total_app_progress_metric = self._format_metric(
                    'application-progress.total_app_progress',
                    total_app_progress * 100,
                    time.time() * 1000,
                    self.dimensions
                )

                total_time_progress_metric = self._format_metric(
                    'application-progress.total_time_progress',
                    total_time_progress * 100,
                    time.time() * 1000,
                    self.dimensions
                )

                log_string = ("%s | %s: Ref value: %.2f - Job progress: %.2f" %
                    (time.strftime("%H:%M:%S"),
                     self.app_id,
                     time_progress,
                     job_progress))

                plugin_log.log(log_string)

                log_string = ("%s | %s: Job: %d - Progress error: %.2f" % 
                    (time.strftime("%H:%M:%S"), 
                     self.app_id,
                     self.current_job_id,
                     float(progress_error_metric['value'])))

                plugin_log.log(log_string)

                self.monasca.send_metrics([progress_error_metric,
                                           total_time_progress_metric,
                                           total_app_progress_metric])

            time.sleep(MONITORING_INTERVAL)
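
    # Worked example of the error computation above, with hypothetical
    # numbers: 4 jobs (job_ratio = 0.25), current_job_id 2 with 30 of 60
    # tasks completed, 150 s elapsed since the first submission and an
    # expected_time of 400 s:
    #
    #   job_progress        = 30 / 60.0          # 0.500
    #   total_app_progress  = 0.25 * (2 + 0.5)   # 0.625
    #   total_time_progress = 150 / 400.0        # 0.375
    #   new_progress_error  = 0.625 - 0.375      # +0.250 (ahead of schedule)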


    def _format_metric(self, name, value, timestamp, dimensions):
        return {'name': name,
                'value': value,
                'timestamp': timestamp,
                'dimensions': dimensions}


    def _get_elapsed_time(self, gmt_timestamp):
        local_tz = tzlocal.get_localzone()

        submission_date = datetime.strptime(gmt_timestamp,
                                            '%Y-%m-%dT%H:%M:%S.%fGMT')

        submission_date = submission_date.replace(tzinfo=pytz.utc).\
                              astimezone(local_tz)

        submission_date = submission_date.replace(tzinfo=None)

        submission_timestamp = time.mktime(submission_date.timetuple())
        this_timestamp = time.time()

        plugin_log.log("%s | %s: Submission timestamp: %.2f - \
                                 This timestamp: %.2f" % 
                      (time.strftime("%H:%M:%S"), 
                       self.app_id, 
                       submission_timestamp,
                       this_timestamp))
        
        elapsed_time = this_timestamp - submission_timestamp

        return elapsed_time


    def monitoring_application(self):
        try:
            job_request = requests.get(self.submission_url
                          + ':4040/api/v1/applications/'
                          + self.app_id + '/jobs')

            self._publish_measurement(job_request)

        except Exception as ex:
            print ("Error: No application found for %s. %s remaining attempts"
                   % (self.app_id, self.attempts))

            print(str(ex))
            raise
class KubeJobProgress(Plugin):
    def __init__(self, app_id, info_plugin, collect_period=2, retries=10):
        Plugin.__init__(self,
                        app_id,
                        info_plugin,
                        collect_period,
                        retries=retries)
        kubernetes.config.load_kube_config()
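        # load_kube_config() reads cluster credentials from the local
        # kubeconfig file (typically ~/.kube/config); a deployment running
        # inside the cluster would use load_incluster_config() instead.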

        self.enable_monasca = info_plugin['graphic_metrics']
        if self.enable_monasca:
            self.monasca = MonascaConnector()
        self.submission_url = info_plugin['count_jobs_url']
        self.expected_time = int(info_plugin['expected_time'])
        self.number_of_jobs = int(info_plugin['number_of_jobs'])
        self.submission_time = datetime.strptime(
            info_plugin['submission_time'], '%Y-%m-%dT%H:%M:%S.%fGMT')
        self.dimensions = {
            'application_id': self.app_id,
            'service': 'kubejobs'
        }
        self.rds = redis.StrictRedis(host=info_plugin['redis_ip'],
                                     port=info_plugin['redis_port'])
        self.metric_queue = "%s:metrics" % self.app_id
        self.current_job_id = 0
        self.b_v1 = kubernetes.client.BatchV1Api()

    def _publish_measurement(self, jobs_completed):

        application_progress_error = {}
        job_progress_error = {}
        time_progress_error = {}
        parallelism = {}

        # Init
        print "Jobs Completed: %i" % jobs_completed

        # Job Progress

        job_progress = min(1.0, (float(jobs_completed) / self.number_of_jobs))
        # Elapsed Time
        elapsed_time = float(self._get_elapsed_time())

        # Reference Value
        ref_value = (elapsed_time / self.expected_time)
        replicas = self._get_num_replicas()
        print("Job progress: %s\nTime Progress: %s\nReplicas: %s"
              "\n========================"
              % (job_progress, ref_value, replicas))

        # Error
        error = job_progress - ref_value

        application_progress_error['name'] = 'application-progress.error'

        job_progress_error['name'] = 'job-progress'

        time_progress_error['name'] = 'time-progress'

        application_progress_error['value'] = error
        application_progress_error['timestamp'] = time.time() * 1000
        application_progress_error['dimensions'] = self.dimensions

        job_progress_error['value'] = job_progress
        job_progress_error['timestamp'] = time.time() * 1000
        job_progress_error['dimensions'] = self.dimensions

        time_progress_error['value'] = ref_value
        time_progress_error['timestamp'] = time.time() * 1000
        time_progress_error['dimensions'] = self.dimensions

        parallelism['name'] = "job-parallelism"
        parallelism['value'] = replicas
        parallelism['timestamp'] = time.time() * 1000
        parallelism['dimensions'] = self.dimensions

        print "Error: %s " % application_progress_error['value']

        self.rds.rpush(self.metric_queue, str(application_progress_error))

        if self.enable_monasca:
            self.monasca.send_metrics([application_progress_error])
            self.monasca.send_metrics([job_progress_error])
            self.monasca.send_metrics([time_progress_error])
            self.monasca.send_metrics([parallelism])

        time.sleep(MONITORING_INTERVAL)

    def _get_num_replicas(self):

        job = self.b_v1.read_namespaced_job(name=self.app_id,
                                            namespace="default")
        return job.status.active
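
    # In the Kubernetes Batch API, job.status.active reports the number of
    # pods currently running for the Job, which this plugin publishes as
    # the job's parallelism.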

    def _get_elapsed_time(self):
        # submission_time was parsed from a GMT timestamp, so this
        # subtraction assumes the local clock is also set to UTC.
        datetime_now = datetime.now()
        elapsed_time = datetime_now - self.submission_time
        print("Elapsed Time: %.2f" % elapsed_time.total_seconds())

        return elapsed_time.total_seconds()

    def monitoring_application(self):
        try:
            job_request = requests.get('http://%s/redis-%s/job/count' %
                                       (self.submission_url, self.app_id))
            job_processing = requests.get(
                'http://%s/redis-%s/job:processing/count' %
                (self.submission_url, self.app_id))

            jobs_completed = self.number_of_jobs - (int(job_request.json()) +
                                                    int(job_processing.json()))
            self._publish_measurement(jobs_completed=jobs_completed)
            return jobs_completed

        except Exception as ex:
            print("Error: No application found for %s. %s remaining attempts" %
                  (self.app_id, self.attempts))

            print(str(ex))
            raise
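
# Worked example of the job-count arithmetic in
# KubeJobProgress.monitoring_application, with hypothetical queue sizes: if
# number_of_jobs is 100 and the Redis-backed endpoints report 30 tasks still
# enqueued and 5 in processing, then
# jobs_completed = 100 - (30 + 5) = 65, and _publish_measurement computes
# job_progress = min(1.0, 65 / 100.0) = 0.65.
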
class WebAppMonitor(Plugin):
    def __init__(self, app_id, info_plugin, keypair, retries=60):
        Plugin.__init__(self,
                        app_id,
                        info_plugin,
                        collect_period=5,
                        retries=retries)
        self.app_id = app_id
        self.host_ip = info_plugin['host_ip']
        self.keypair_path = keypair
        self.host_username = info_plugin['host_username']
        self.log_path = info_plugin['log_path']
        self.dimensions = {'app_id': self.app_id, 'host': self.host_ip}
        self.last_checked = ''
        self.monasca = MonascaConnector()

    def _get_metric_value(self, log):
        value = None
        for i in range(len(log) - 1, 0, -1):
            if log[i] == '#':
                value = float(log[i + 1:-1])
        return value

    def _get_ssh_connection(self):
        keypair = paramiko.RSAKey.from_private_key_file(self.keypair_path)
        conn = paramiko.SSHClient()
        conn.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        conn.connect(hostname=self.host_ip,
                     username=self.host_username,
                     pkey=keypair)
        return conn

    def _publish_metrics(self, last_log):
        metric = {}
        print(last_log)
        # Check if this log line contains a new metric measurement
        if '[Random]' in last_log and self.last_checked != last_log:
            self.last_checked = last_log
            # Add to metric_info values for this measurement:
            # value and timestamp
            value = self._get_metric_value(last_log)
            metric['name'] = 'web_app.random'
            metric['value'] = value
            metric['timestamp'] = time.time() * 1000
            metric['dimensions'] = {
                "app_id": self.app_id,
                "host": self.host_ip
            }
            # Sending the metric to Monasca
            print(value)
            self.monasca.send_metrics([metric])
            print("WebApp metric published: %i" % value)

        # Flag that checks if the log capture is ended
        elif '[END]' in last_log:
            self.running = False

    def monitoring_application(self):
        try:

            conn = self._get_ssh_connection()
            stdin, stdout, stderr = conn.exec_command("sudo tail -1 %s" %
                                                      self.log_path)
            self._publish_metrics(stdout.read())

        except Exception as ex:
            print "Monitoring %s is not possible. \nError: %s. %s remaining attempts" % (
                self.app_id, ex.message, self.attempts)
            raise ex
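
# A sketch of the log format WebAppMonitor expects (hypothetical lines; the
# real ones are written by the monitored web application):
#
#   [Random] request served #42.0
#   [END]
#
# Lines tagged '[Random]' carry the measurement after the '#' marker, while
# '[END]' stops the monitor by setting self.running to False.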