# Consolidated imports for the snippets below; the snippets were
# collected here without their original import headers.
import ConfigParser
import json
import time
from datetime import datetime

import kubernetes
import mock
import paramiko
import pytz
import redis
import requests
import tzlocal

# Project-internal names used below (Plugin, MonascaConnector,
# InfluxConnector, api, plugin_log, ex) come from the surrounding
# project; their exact import paths are not shown in these snippets.

# Interval, in seconds, between published samples. The value here is an
# assumption; the original constant is defined elsewhere in the project.
MONITORING_INTERVAL = 2

# Variant of the SparkProgress constructor that accepts an injectable
# Monasca connector (useful for testing with a stub).
def __init__(self, app_id, info_plugin, collect_period=2, retries=60,
             monasca_conn="monasca"):
    Plugin.__init__(self, app_id, info_plugin, collect_period,
                    retries=retries)
    if monasca_conn == "monasca":
        self.monasca = MonascaConnector()
    else:
        self.monasca = monasca_conn

    self.submission_url = info_plugin['spark_submisson_url']
    self.expected_time = info_plugin['expected_time']
    self.number_of_jobs = int(info_plugin['number_of_jobs'])
    self.job_expected_time = (float(self.expected_time)
                              / float(self.number_of_jobs))
    self.remaining_time = float(self.expected_time)
    self.current_job_id = 0
    self.app_id = app_id
    self.dimensions = {'application_id': self.app_id,
                       'service': 'spark-sahara'}
    self.job_ratio = 1.0 / self.number_of_jobs
    self.first_submission_time = None

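# Illustrative sketch (not part of the original source): because
# monasca_conn is injectable, a stub connector can capture metrics in
# tests instead of publishing them. FakeConnector and the call below
# are hypothetical.
class FakeConnector(object):
    def __init__(self):
        self.sent = []

    def send_metrics(self, metrics):
        # Record the metrics instead of sending them to a live Monasca.
        self.sent.extend(metrics)

# plugin = SparkProgress('app-id', info_plugin, monasca_conn=FakeConnector())
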
# Variant of the KubeJobProgress constructor with visualizer support.
def __init__(self, app_id, info_plugin, collect_period=2, retries=10):
    Plugin.__init__(self, app_id, info_plugin, collect_period,
                    retries=retries)
    self.enable_visualizer = info_plugin['enable_visualizer']
    self.submission_url = info_plugin['count_jobs_url']
    self.expected_time = int(info_plugin['expected_time'])
    self.number_of_jobs = int(info_plugin['number_of_jobs'])
    self.submission_time = datetime.strptime(info_plugin['submission_time'],
                                             '%Y-%m-%dT%H:%M:%S.%fGMT')
    self.dimensions = {'application_id': self.app_id,
                       'service': 'kubejobs'}
    self.rds = redis.StrictRedis(host=info_plugin['redis_ip'],
                                 port=info_plugin['redis_port'])
    self.metric_queue = "%s:metrics" % self.app_id
    self.current_job_id = 0
    kubernetes.config.load_kube_config()
    self.b_v1 = kubernetes.client.BatchV1Api()

    if self.enable_visualizer:
        datasource_type = info_plugin['datasource_type']
        if datasource_type == "monasca":
            self.datasource = MonascaConnector()
        elif datasource_type == "influxdb":
            influx_url = info_plugin['database_data']['url']
            influx_port = info_plugin['database_data']['port']
            database_name = info_plugin['database_data']['name']
            self.datasource = InfluxConnector(influx_url, influx_port,
                                              database_name)
        else:
            print("Unknown datasource type...!")

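# Illustrative sketch (not in the original source): an info_plugin dict
# carrying the keys this constructor reads, exercising the influxdb
# branch. All values are placeholders.
def _example_visualizer_info_plugin():
    return {
        'enable_visualizer': True,
        'count_jobs_url': 'broker.example.org:5000',
        'expected_time': '900',
        'number_of_jobs': '500',
        'submission_time': '2019-01-01T12:00:00.000000GMT',
        'redis_ip': '10.0.0.10',
        'redis_port': 6379,
        'datasource_type': 'influxdb',
        'database_data': {
            'url': '10.0.0.11',
            'port': 8086,
            'name': 'monitoring',
        },
    }
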
def setup_datasource(self, info_plugin):
    if self.enable_detailed_report:
        datasource_type = info_plugin['datasource_type']
        if datasource_type == "monasca":
            return MonascaConnector()
        elif datasource_type == "influxdb":
            influx_url = info_plugin['database_data']['url']
            influx_port = info_plugin['database_data']['port']
            database_name = info_plugin['database_data']['name']
            return InfluxConnector(influx_url, influx_port, database_name)
        else:
            raise ex.BadRequestException("Unknown datasource type...!")

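# Hedged usage sketch: how setup_datasource might be wired from a
# constructor. The attribute wiring here is illustrative, not the
# project's actual code:
#
#     self.enable_detailed_report = info_plugin['enable_detailed_report']
#     self.datasource = self.setup_datasource(info_plugin)
#
# Note that the method falls through and returns None when detailed
# reporting is disabled, so callers should guard uses of the result.
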
def test_get_measurements(self, monasca_mock):
    # Stub out config parsing so the connector does not read real
    # configuration files during the test.
    ConfigParser.RawConfigParser = mock.Mock()
    m = MonascaConnector()
    m.get_measurements(None, None)
    monasca_mock.assert_called_once_with()

def test_init_manager(self, config_mock, monasca_mock):
    MonascaConnector()
    config_mock.assert_called_once_with()
    monasca_mock.assert_called_once_with()

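# Note on the signatures above: the extra mock arguments imply stacked
# @mock.patch decorators. Mocks are injected bottom-up, so the decorator
# closest to the def supplies config_mock and the outer one supplies
# monasca_mock. A hedged sketch (patch targets are placeholders, not the
# project's real paths):
#
# @mock.patch('monitor.utils.monasca.MonascaConnector._get_client')
# @mock.patch('ConfigParser.RawConfigParser.read')
# def test_init_manager(self, config_mock, monasca_mock):
#     ...
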
class SparkProgressUPV(Plugin):

    def __init__(self, app_id, info_plugin, retries=60):
        Plugin.__init__(self, app_id, info_plugin,
                        collect_period=5, retries=retries)
        self.monasca = MonascaConnector()
        self.submission_url = info_plugin['spark_submisson_url']
        self.expected_time = info_plugin['expected_time']
        self.remaining_time = int(self.expected_time)
        self.job_expected_time = int(self.expected_time)
        self.number_of_jobs = int(info_plugin['number_of_jobs'])
        self.current_job_id = 0
        self.dimensions = {'application_id': self.app_id,
                           'service': 'spark-sahara'}
        self.conn = paramiko.SSHClient()
        self.conn.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.conn.connect(hostname=api.mesos_cluster_addr,
                          username=api.mesos_username,
                          password=api.mesos_password)
        self.spark_id = self._discover_id_from_spark()

    def _publish_measurement(self, jobs):
        application_progress_error = {}

        # The Spark REST API lists jobs newest-first; reverse so the
        # index matches submission order.
        jobs.reverse()
        if not len(jobs) == 0:
            current_job = jobs[self.current_job_id]

            if current_job['status'] == 'FAILED':
                self.current_job_id = len(jobs) - 1

            elif current_job['status'] == 'SUCCEEDED':
                elapsed_time = float(
                    self._get_elapsed_time(current_job['submissionTime']))
                self.remaining_time = self.remaining_time - elapsed_time
                self.current_job_id = len(jobs) - 1

                # Job Time
                self.job_expected_time = (
                    self.remaining_time
                    / (float(self.number_of_jobs)
                       - float(self.current_job_id)))

            elif current_job['status'] == 'RUNNING':
                # Job Progress
                job_progress = (current_job['numCompletedTasks']
                                / float(current_job['numTasks']))
                # Elapsed Time
                elapsed_time = float(
                    self._get_elapsed_time(current_job['submissionTime']))
                # Reference Value
                ref_value = (elapsed_time / self.job_expected_time)
                # Error
                error = job_progress - ref_value

                application_progress_error['name'] = ('application-progress'
                                                      '.error')
                application_progress_error['value'] = error
                application_progress_error['timestamp'] = time.time() * 1000
                application_progress_error['dimensions'] = self.dimensions

                print application_progress_error['value']

                self.monasca.send_metrics([application_progress_error])

        time.sleep(MONITORING_INTERVAL)

    def _get_elapsed_time(self, gmt_timestamp):
        try:
            local_tz = tzlocal.get_localzone()
        except Exception:
            # Fall back to a fixed zone when the local one is unknown.
            local_tz = "America/Recife"
            local_tz = pytz.timezone(local_tz)
        date_time = datetime.strptime(gmt_timestamp,
                                      '%Y-%m-%dT%H:%M:%S.%fGMT')
        date_time = date_time.replace(tzinfo=pytz.utc).astimezone(local_tz)
        date_time = date_time.replace(tzinfo=None)
        datetime_now = datetime.now()
        elapsed_time = datetime_now - date_time
        return elapsed_time.seconds

    def _discover_id_from_spark(self):
        # Poll the Spark master until the application shows up (at most
        # 30 attempts, one second apart).
        for i in range(30):
            stdin, stdout, stderr = self.conn.exec_command(
                'curl %s/api/v1/applications' % self.submission_url)
            applications_running = json.loads(stdout.read())
            for app in applications_running:
                if app['name'] == self.app_id:
                    return app['id']
            time.sleep(1)
        return None

    def _get_progress(self, spark_id):
        stdin, stdout, stderr = self.conn.exec_command(
            'curl %s/api/v1/applications/%s/jobs'
            % (self.submission_url, spark_id))
        return json.loads(stdout.read())

    def monitoring_application(self):
        try:
            job_request = self._get_progress(self.spark_id)
            self._publish_measurement(job_request)
        except Exception as ex:
            print("Error: No application found for %s. %s remaining attempts"
                  % (self.app_id, self.attempts))
            print ex.message
            raise

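# Hedged usage sketch (not part of the original module). All values are
# placeholders; the Mesos credentials come from the `api` module, as in
# __init__ above, and a live Spark master is required.
def _example_spark_upv_usage():
    info_plugin = {
        'spark_submisson_url': 'http://spark-master:4040',  # key spelling as used above
        'expected_time': 600,   # seconds for the whole application
        'number_of_jobs': 12,
    }
    plugin = SparkProgressUPV('my-spark-app', info_plugin)
    # Each call fetches the job list over SSH+curl and publishes one
    # progress-error sample.
    plugin.monitoring_application()
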
class OSGeneric(Plugin):

    def __init__(self, app_id, info_plugin, keypair, retries=60):
        Plugin.__init__(self, app_id, info_plugin,
                        collect_period=5, retries=retries)
        self.app_id = app_id
        self.host_ip = info_plugin['host_ip']
        self.expected_time = info_plugin['expected_time']
        self.log_path = info_plugin['log_path']
        self.keypair_path = keypair
        self.host_username = '******'
        self.dimensions = {"application_id": self.app_id,
                           "host": self.host_ip}
        self.last_checked = ''
        self.start_time = time.time()
        self.monasca = MonascaConnector()

    def _get_metric_value(self, log):
        """Extract the measurement value for the metric of interest from
        a log line: the float that follows the '#' marker."""
        value = None
        for i in range(len(log) - 1, 0, -1):
            if log[i] == '#':
                value = float(log[i + 1:-1])
        return value

    def _get_elapsed_time(self):
        delay = time.time() - self.start_time
        return delay

    def _get_ssh_connection(self):
        """Return a remote connection to the host where the log is
        captured. A command can be run on the host through the returned
        object with conn.exec_command("write_command_here")."""
        keypair = paramiko.RSAKey.from_private_key_file(self.keypair_path)
        conn = paramiko.SSHClient()
        conn.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        conn.connect(hostname=self.host_ip,
                     username=self.host_username,
                     pkey=keypair)
        return conn

    def _publish_metrics(self, last_log):
        """Prepare and publish the metrics, keeping
        monitoring_application as simple as possible."""
        metric = {}
        print last_log

        # Check if this log line contains a new metric measurement.
        if '[Progress]' in last_log and self.last_checked != last_log:
            self.last_checked = last_log

            # Compute value and timestamp for this measurement.
            # (cast: expected_time may be provided as a string)
            ref_value = self._get_elapsed_time() / float(self.expected_time)
            measurement_value = self._get_metric_value(last_log)
            error = measurement_value - ref_value

            # A Monasca metric needs three fields: name, value and
            # timestamp. Its identity can be extended with dimensions,
            # and additional information about the measurement can be
            # attached through value_meta, a dictionary.
            metric['name'] = 'application-progress.error'
            metric['value'] = error
            metric['timestamp'] = time.time() * 1000
            metric['dimensions'] = self.dimensions

            time_progress_metric = {}
            time_progress_metric['name'] = ('application-progress'
                                            '.time_progress')
            time_progress_metric['value'] = ref_value
            time_progress_metric['timestamp'] = time.time() * 1000
            time_progress_metric['dimensions'] = self.dimensions

            app_progress_metric = {}
            app_progress_metric['name'] = ('application-progress'
                                           '.app_progress')
            app_progress_metric['value'] = measurement_value
            app_progress_metric['timestamp'] = time.time() * 1000
            app_progress_metric['dimensions'] = self.dimensions

            # Sending the metrics to Monasca.
            self.monasca.send_metrics([metric, app_progress_metric,
                                       time_progress_metric])
            print "Application progress error: %.4f" % error

        # Flag that indicates the log capture has ended.
        elif '[END]' in last_log:
            self.running = False

    def monitoring_application(self):
        try:
            # First, create a connection with the host.
            conn = self._get_ssh_connection()

            # Then run the command that captures the last line of the
            # log file over that connection, saving the outputs.
            stdin, stdout, stderr = conn.exec_command(
                "sudo tail -1 %s" % self.log_path)

            # Finally, publish metrics from the captured log line.
            self._publish_metrics(stdout.read())
        except Exception as ex:
            print "Monitoring %s is not possible. \nError: %s. " \
                  "%s remaining attempts" % (self.app_id, ex.message,
                                             self.attempts)
            raise ex

class KubeJobProgress(Plugin):

    def __init__(self, app_id, info_plugin, collect_period=2, retries=10):
        Plugin.__init__(self, app_id, info_plugin, collect_period,
                        retries=retries)
        kubernetes.config.load_kube_config()
        self.enable_monasca = info_plugin['graphic_metrics']
        if self.enable_monasca:
            self.monasca = MonascaConnector()
        self.submission_url = info_plugin['count_jobs_url']
        self.expected_time = int(info_plugin['expected_time'])
        self.number_of_jobs = int(info_plugin['number_of_jobs'])
        self.submission_time = datetime.strptime(
            info_plugin['submission_time'], '%Y-%m-%dT%H:%M:%S.%fGMT')
        self.dimensions = {'application_id': self.app_id,
                           'service': 'kubejobs'}
        self.rds = redis.StrictRedis(host=info_plugin['redis_ip'],
                                     port=info_plugin['redis_port'])
        self.metric_queue = "%s:metrics" % self.app_id
        self.current_job_id = 0
        self.b_v1 = kubernetes.client.BatchV1Api()

    def _publish_measurement(self, jobs_completed):
        application_progress_error = {}
        job_progress_error = {}
        time_progress_error = {}
        parallelism = {}

        print "Jobs Completed: %i" % jobs_completed

        # Job Progress
        job_progress = min(1.0,
                           (float(jobs_completed) / self.number_of_jobs))
        # Elapsed Time
        elapsed_time = float(self._get_elapsed_time())
        # Reference Value
        ref_value = (elapsed_time / self.expected_time)
        replicas = self._get_num_replicas()

        print "Job progress: %s\nTime Progress: %s\nReplicas: %s" \
              "\n========================" \
              % (job_progress, ref_value, replicas)

        # Error
        error = job_progress - ref_value

        application_progress_error['name'] = ('application-progress'
                                              '.error')
        application_progress_error['value'] = error
        application_progress_error['timestamp'] = time.time() * 1000
        application_progress_error['dimensions'] = self.dimensions

        job_progress_error['name'] = 'job-progress'
        job_progress_error['value'] = job_progress
        job_progress_error['timestamp'] = time.time() * 1000
        job_progress_error['dimensions'] = self.dimensions

        time_progress_error['name'] = 'time-progress'
        time_progress_error['value'] = ref_value
        time_progress_error['timestamp'] = time.time() * 1000
        time_progress_error['dimensions'] = self.dimensions

        parallelism['name'] = "job-parallelism"
        parallelism['value'] = replicas
        parallelism['timestamp'] = time.time() * 1000
        parallelism['dimensions'] = self.dimensions

        print "Error: %s " % application_progress_error['value']

        self.rds.rpush(self.metric_queue, str(application_progress_error))

        if self.enable_monasca:
            self.monasca.send_metrics([application_progress_error])
            self.monasca.send_metrics([job_progress_error])
            self.monasca.send_metrics([time_progress_error])
            self.monasca.send_metrics([parallelism])

        time.sleep(MONITORING_INTERVAL)

    def _get_num_replicas(self):
        job = self.b_v1.read_namespaced_job(name=self.app_id,
                                            namespace="default")
        return job.status.active

    def _get_elapsed_time(self):
        datetime_now = datetime.now()
        elapsed_time = datetime_now - self.submission_time
        print "Elapsed Time: %.2f" % elapsed_time.seconds
        return elapsed_time.seconds

    def monitoring_application(self):
        try:
            job_request = requests.get('http://%s/redis-%s/job/count'
                                       % (self.submission_url, self.app_id))
            job_processing = requests.get(
                'http://%s/redis-%s/job:processing/count'
                % (self.submission_url, self.app_id))
            job_progress = self.number_of_jobs - (
                int(job_request.json()) + int(job_processing.json()))
            self._publish_measurement(jobs_completed=job_progress)
            return job_progress
        except Exception as ex:
            print("Error: No application found for %s. %s remaining attempts"
                  % (self.app_id, self.attempts))
            print ex.message
            raise

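# Hedged sketch of the flow in monitoring_application above: the broker
# exposes per-application Redis queue counters, and completed jobs are
# inferred by subtraction. The URL shapes mirror the requests the method
# makes; the host is whatever count_jobs_url points at.
#
#   GET http://<count_jobs_url>/redis-<app_id>/job/count             -> queued
#   GET http://<count_jobs_url>/redis-<app_id>/job:processing/count  -> in flight
#   completed = number_of_jobs - (queued + in_flight)
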
class SparkProgress(Plugin):

    def __init__(self, app_id, info_plugin, collect_period, retries=60):
        Plugin.__init__(self, app_id, info_plugin, collect_period,
                        retries=retries)
        self.monasca = MonascaConnector()
        self.submission_url = info_plugin['spark_submisson_url']
        self.expected_time = info_plugin['expected_time']
        self.number_of_jobs = int(info_plugin['number_of_jobs'])
        self.job_expected_time = (float(self.expected_time)
                                  / float(self.number_of_jobs))
        self.remaining_time = float(self.expected_time)
        self.current_job_id = 0
        self.app_id = app_id
        self.dimensions = {'application_id': self.app_id,
                           'service': 'spark-sahara'}
        self.job_ratio = 1.0 / self.number_of_jobs
        self.first_submission_time = None

    def _publish_measurement(self, job_request):
        progress_error_metric = {}
        total_time_progress_metric = {}
        total_app_progress_metric = {}

        jobs = job_request.json()
        # The Spark REST API lists jobs newest-first; reverse so the
        # index matches submission order.
        jobs.reverse()

        if not len(jobs) == 0:
            if self.current_job_id == 0:
                self.first_submission_time = \
                    jobs[self.current_job_id]['submissionTime']

            current_job = jobs[self.current_job_id]

            if current_job['status'] == 'FAILED':
                self.current_job_id = len(jobs) - 1

            elif current_job['status'] == 'SUCCEEDED':
                elapsed_time = float(self._get_elapsed_time(
                    current_job['submissionTime']))
                self.remaining_time = self.remaining_time - elapsed_time
                self.current_job_id = len(jobs) - 1

                # Job Time
                if self.remaining_time <= 0.0:
                    self.job_expected_time = -1
                else:
                    self.job_expected_time = (
                        self.remaining_time
                        / (float(self.number_of_jobs)
                           - float(self.current_job_id)))

            elif current_job['status'] == 'RUNNING':
                # Job Progress
                job_progress = (current_job['numCompletedTasks']
                                / float(current_job['numTasks']))
                # Total Elapsed Time
                total_elapsed_time = float(self._get_elapsed_time(
                    self.first_submission_time))
                # Total Time Progress
                # (cast: expected_time may be provided as a string)
                total_time_progress = (total_elapsed_time
                                       / float(self.expected_time))
                # Total Application Progress
                total_app_progress = self.job_ratio * (self.current_job_id
                                                       + job_progress)
                # New Progress Error
                new_progress_error = (total_app_progress
                                      - total_time_progress)
                # Elapsed Time
                elapsed_time = float(self._get_elapsed_time(
                    current_job['submissionTime']))

                plugin_log.log("%s | %s: Elapsed time: %.2f - "
                               "Expected time: %.2f"
                               % (time.strftime("%H:%M:%S"), self.app_id,
                                  elapsed_time, self.job_expected_time))

                # Error
                if self.job_expected_time == -1:
                    time_progress = 1
                    error = -1.0
                else:
                    time_progress = (elapsed_time / self.job_expected_time)
                    if time_progress > 1:
                        time_progress = 1
                    error = job_progress - time_progress

                progress_error_metric = self._format_metric(
                    'application-progress.error',
                    new_progress_error,
                    time.time() * 1000,
                    self.dimensions)
                total_app_progress_metric = self._format_metric(
                    'application-progress.total_app_progress',
                    total_app_progress * 100,
                    time.time() * 1000,
                    self.dimensions)
                total_time_progress_metric = self._format_metric(
                    'application-progress.total_time_progress',
                    total_time_progress * 100,
                    time.time() * 1000,
                    self.dimensions)

                log_string = ("%s | %s: Ref value: %.2f - Job progress: %.2f"
                              % (time.strftime("%H:%M:%S"), self.app_id,
                                 time_progress, job_progress))
                plugin_log.log(log_string)

                log_string = ("%s | %s: Job: %d - Progress error: %.2f"
                              % (time.strftime("%H:%M:%S"), self.app_id,
                                 self.current_job_id,
                                 float(progress_error_metric['value'])))
                plugin_log.log(log_string)

                self.monasca.send_metrics([progress_error_metric,
                                           total_time_progress_metric,
                                           total_app_progress_metric])

        time.sleep(MONITORING_INTERVAL)

    def _format_metric(self, name, value, timestamp, dimensions):
        metric = {}
        metric['name'] = name
        metric['value'] = value
        metric['timestamp'] = timestamp
        metric['dimensions'] = dimensions
        return metric

    def _get_elapsed_time(self, gmt_timestamp):
        local_tz = tzlocal.get_localzone()
        submission_date = datetime.strptime(gmt_timestamp,
                                            '%Y-%m-%dT%H:%M:%S.%fGMT')
        submission_date = submission_date.replace(
            tzinfo=pytz.utc).astimezone(local_tz)
        submission_date = submission_date.replace(tzinfo=None)
        submission_timestamp = time.mktime(submission_date.timetuple())
        this_timestamp = time.time()
        plugin_log.log("%s | %s: Submission timestamp: %.2f - "
                       "This timestamp: %.2f"
                       % (time.strftime("%H:%M:%S"), self.app_id,
                          submission_timestamp, this_timestamp))
        elapsed_time = this_timestamp - submission_timestamp
        return elapsed_time

    def monitoring_application(self):
        try:
            job_request = requests.get(self.submission_url
                                       + ':4040/api/v1/applications/'
                                       + self.app_id + '/jobs')
            self._publish_measurement(job_request)
        except Exception as ex:
            print("Error: No application found for %s. %s remaining attempts"
                  % (self.app_id, self.attempts))
            print ex.message
            raise

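# Worked example of the error computed in _publish_measurement (numbers
# invented): with number_of_jobs = 4 (job_ratio = 0.25), current_job_id = 2
# and a RUNNING job at 30 of 60 tasks:
#
#   job_progress        = 30 / 60.0              = 0.5
#   total_app_progress  = 0.25 * (2 + 0.5)       = 0.625
#   total_time_progress = total_elapsed / expected  (say 300/600 = 0.5)
#   new_progress_error  = 0.625 - 0.5            = 0.125
#
# A positive error means the application is ahead of schedule; a
# negative one means it is falling behind.
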
class WebAppMonitor(Plugin):

    def __init__(self, app_id, info_plugin, keypair, retries=60):
        Plugin.__init__(self, app_id, info_plugin,
                        collect_period=5, retries=retries)
        self.app_id = app_id
        self.host_ip = info_plugin['host_ip']
        self.keypair_path = keypair
        self.host_username = info_plugin['host_username']
        self.log_path = info_plugin['log_path']
        self.dimensions = {'app_id': self.app_id,
                           'host': self.host_ip}
        self.last_checked = ''
        self.monasca = MonascaConnector()

    def _get_metric_value(self, log):
        value = None
        for i in range(len(log) - 1, 0, -1):
            if log[i] == '#':
                value = float(log[i + 1:-1])
        return value

    def _get_ssh_connection(self):
        keypair = paramiko.RSAKey.from_private_key_file(self.keypair_path)
        conn = paramiko.SSHClient()
        conn.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        conn.connect(hostname=self.host_ip,
                     username=self.host_username,
                     pkey=keypair)
        return conn

    def _publish_metrics(self, last_log):
        metric = {}
        print last_log

        # Check if this log line contains a new metric measurement.
        if '[Random]' in last_log and self.last_checked != last_log:
            self.last_checked = last_log

            # Add value and timestamp for this measurement.
            value = self._get_metric_value(last_log)
            metric['name'] = 'web_app.random'
            metric['value'] = value
            metric['timestamp'] = time.time() * 1000
            metric['dimensions'] = {"app_id": self.app_id,
                                    "host": self.host_ip}

            # Sending the metric to Monasca.
            print value
            self.monasca.send_metrics([metric])
            print "WebApp metric published: %i" % value

        # Flag that indicates the log capture has ended.
        elif '[END]' in last_log:
            self.running = False

    def monitoring_application(self):
        try:
            conn = self._get_ssh_connection()
            stdin, stdout, stderr = conn.exec_command(
                "sudo tail -1 %s" % self.log_path)
            self._publish_metrics(stdout.read())
        except Exception as ex:
            print "Monitoring %s is not possible. \nError: %s. " \
                  "%s remaining attempts" % (self.app_id, ex.message,
                                             self.attempts)
            raise ex

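# Hedged usage sketch (values are placeholders): WebAppMonitor tails the
# remote log over SSH and publishes any '[Random] ... #<value>' line it
# has not seen before.
def _example_webapp_monitor_usage():
    info_plugin = {
        'host_ip': '10.0.0.20',
        'host_username': 'ubuntu',
        'log_path': '/var/log/webapp/metrics.log',
    }
    monitor = WebAppMonitor('web-app-1', info_plugin,
                            keypair='/home/user/.ssh/id_rsa')
    monitor.monitoring_application()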