# Module-level imports assumed by the snippets below.
import copy
import json
import time

import mesos_pb2

import json_helper
import metrics


def update(self, master_info=None, state_json=None):
    """
    Get new node list from master. If master_info is set (during registration
    and reregistration), a new master url will be set.
    """
    if master_info is not None:
        self.master_info = master_info

    # We have no way to update; abort.
    if state_json is None and self.master_info is None:
        return

    # For testing, allow caller to give state_json.
    if state_json is None and self.master_info is not None:
        state_endpoint = "http://" + self.master_info.hostname + ":" + \
                         str(self.master_info.port) + "/state.json"
        state_json = json_helper.from_url(state_endpoint)

    # Get node list.
    new_targets = []
    for slave in state_json['slaves']:
        new_targets.append(slave['pid'].split('@')[1])

    print "New targets: %s" % new_targets

    # Make copy of current targets, to identify deactivated slaves.
    # TODO(nnielsen): Find lighter weight way of doing this.
    inactive_slaves = copy.deepcopy(self.targets)
    for new_target in new_targets:
        if new_target not in self.targets:
            slave = Agent(new_target)

            # TODO(nnielsen): Persist map id -> host to zookeeper.
            self.monitor[slave.task_id] = slave
            self.targets[new_target] = slave

        if new_target in inactive_slaves:
            print "Don't remove %s" % new_target
            del inactive_slaves[new_target]

    if len(inactive_slaves) > 0:
        print "%d slaves to be unmonitored" % len(inactive_slaves)
        for inactive_slave, slave in inactive_slaves.iteritems():
            print "inactive_slave: %s" % inactive_slave

            # TODO(nnielsen): Remove from monitor queue as well.
            self.unmonitor[slave.task_id] = inactive_slave

            if slave.task_id in self.monitor:
                # Don't try to schedule for monitoring, if we decided slave is gone.
                del self.monitor[slave.task_id]

            # And no longer a target.
            if slave.hostname in self.targets:
                del self.targets[slave.hostname]

    self.stats()
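# update() above constructs Agent objects carrying `hostname` and `task_id`
# attributes, but the class itself is not shown in this section. A minimal,
# hypothetical sketch follows; the id scheme is an assumption.
import uuid


class Agent(object):
    def __init__(self, hostname):
        self.hostname = hostname
        # Id of the monitoring task assigned to this agent.
        self.task_id = 'stellar-%s' % uuid.uuid4()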
def update(self, master_info=None):
    """
    Get new node list from master. If master_info is set (during registration
    and reregistration), a new master url will be set.
    """
    if master_info is not None:
        self.master_info = master_info

    state_endpoint = "http://" + self.master_info.hostname + ":" + \
                     str(self.master_info.port) + "/state.json"
    state_json = json_helper.from_url(state_endpoint)

    # Get node list.
    new_targets = []
    for slave in state_json['slaves']:
        new_targets.append(slave['pid'].split('@')[1])

    # Make copy of current targets, to identify deactivated slaves. Note the
    # explicit copy: aliasing self.targets here would mutate it below.
    inactive_slaves = dict(self.targets)
    for new_target in new_targets:
        if new_target not in self.targets:
            slave = Slave(new_target)

            # TODO(nnielsen): Persist map id -> host to zookeeper.
            self.monitor[slave.id] = slave
            self.targets[slave.hostname] = slave

        # Still present in the cluster; don't treat as deactivated.
        if new_target in inactive_slaves:
            del inactive_slaves[new_target]

    if len(inactive_slaves) > 0:
        print "%d slaves to be unmonitored" % len(inactive_slaves)
        for hostname, inactive_slave in inactive_slaves.iteritems():
            # TODO(nnielsen): Remove from monitor queue as well.
            self.unmonitor[inactive_slave.id] = inactive_slave
def resolve_slave_id(slave_location):
    """
    Helper to look up slave id from slave endpoint.

    :param slave_location: Address of slave (for example, localhost:5051).
    :return: ID of slave.
    """
    state_endpoint = 'http://%s/state.json' % slave_location
    slave_state = json_helper.from_url(state_endpoint)
    slave_id = slave_state['id']
    print "Resolved slave id: %s" % slave_id
    return slave_id
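# Every snippet in this section fetches master and slave state through
# json_helper.from_url, which is not shown here. A minimal sketch of the
# assumed behavior (module and function names as used above; retries and
# error handling omitted):
import json
import urllib2


def from_url(url):
    """Fetch url and decode the response body as JSON."""
    return json.loads(urllib2.urlopen(url).read())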
# Runs as a closure; `task` and `driver` come from the enclosing executor scope.
def run_task():
    print "Running task %s" % task.task_id.value

    update = mesos_pb2.TaskStatus()
    update.task_id.value = task.task_id.value
    update.state = mesos_pb2.TASK_RUNNING
    driver.sendStatusUpdate(update)

    def fail(message):
        update = mesos_pb2.TaskStatus()
        update.task_id.value = task.task_id.value
        update.state = mesos_pb2.TASK_FAILED
        update.message = message
        driver.sendStatusUpdate(update)

    print "Validating task.data..."

    # Validate task.data.
    try:
        if not task.data:
            return fail('Data field not set for task; cannot monitor slave')

        task_json = json.loads(task.data)
        if 'slave_location' not in task_json:
            return fail('slave_location not found in task json')
    except ValueError:
        return fail('Data field could not be parsed for task; cannot monitor slave')

    print "Task.data validated"
    print task_json

    slave_location = task_json['slave_location']
    monitor_endpoint = 'http://%s/monitor/statistics.json' % slave_location
    state_endpoint = 'http://%s/state.json' % slave_location
    metrics_endpoint = 'http://%s/metrics/snapshot' % slave_location

    slave_state = json_helper.from_url(state_endpoint)
    slave_id = slave_state['id']
    print "Resolved slave id: %s" % slave_id

    # One second sample rate.
    sample_rate = 1

    samples = {}
    sample_count = 0

    print "Start sample loop..."

    # Sample loop.
    while True:
        # Poor man's GC: we lose one sample per framework every 10,000 iterations.
        sample_count += 1
        if sample_count > 10000:
            print "Cleaning samples..."
            sample_count = 0
            samples = {}

        stellar_samples = []

        print "Collecting sample for %s" % monitor_endpoint

        metrics_snapshot = json_helper.from_url(metrics_endpoint)

        cpus_total = metrics_snapshot['slave/cpus_total']
        cpus_used = metrics_snapshot['slave/cpus_used']
        cpus_allocation_slack = cpus_total - cpus_used

        mem_total = metrics_snapshot['slave/mem_total']
        mem_used = metrics_snapshot['slave/mem_used']
        mem_allocation_slack = mem_total - mem_used

        # TODO(nnielsen): If slave is unreachable after a certain number of
        # retries, send TASK_FAILED and abort.

        # Collect the latest resource usage statistics.
        # TODO(nnielsen): Make sample rate configurable.
        # TODO(nnielsen): Batch samples.
        for sample in json_helper.from_url(monitor_endpoint):
            print 'Collecting sample at \'%s\'' % monitor_endpoint

            if 'statistics' in sample and 'timestamp' not in sample['statistics']:
                sample['statistics']['timestamp'] = time.time()

            # Validate sample.
            if not metrics.validate_statistics_sample(sample):
                print "Warning: partial sample %s" % sample
                continue

            framework_id = sample['framework_id']
            executor_id = sample['executor_id']

            if framework_id not in samples:
                samples[framework_id] = {}

            if executor_id not in samples[framework_id]:
                samples[framework_id][executor_id] = None

            if samples[framework_id][executor_id] is not None:
                # We need two samples to compute the cpu usage.
                prev = samples[framework_id][executor_id]

                interval = sample['statistics']['timestamp'] - \
                           prev['statistics']['timestamp']
                user_time = sample['statistics']['cpus_user_time_secs'] - \
                            prev['statistics']['cpus_user_time_secs']
                system_time = sample['statistics']['cpus_system_time_secs'] - \
                              prev['statistics']['cpus_system_time_secs']
                cpu_usage = float(user_time + system_time) / float(interval)

                # Compute slack CPU.
                cpu_slack = sample['statistics']['cpus_limit'] - cpu_usage

                # Compute slack memory.
                mem_usage = sample['statistics']['mem_rss_bytes']
                mem_slack = sample['statistics']['mem_limit_bytes'] - mem_usage

                # TODO(nnielsen): Hang off task id's for this executor.
                stellar_samples.append({
                    'slave_id': slave_id,
                    'framework_id': framework_id,
                    'executor_id': executor_id,
                    'cpu_usage_slack': cpu_slack,
                    'cpu_usage': cpu_usage,
                    'cpu_allocation_slack': cpus_allocation_slack,
                    'mem_usage_slack': mem_slack,
                    'mem_usage': mem_usage,
                    'mem_allocation_slack': mem_allocation_slack,
                    'timestamp': sample['statistics']['timestamp']
                })

            samples[framework_id][executor_id] = sample

        # Send samples if collected.
        if len(stellar_samples) > 0:
            json_out = json.dumps(stellar_samples)

            update = mesos_pb2.TaskStatus()
            update.task_id.value = task.task_id.value
            update.state = mesos_pb2.TASK_RUNNING
            update.data = json_out
            driver.sendStatusUpdate(update)

        time.sleep(sample_rate)
def run_task(executor_driver, task):
    """
    Entry for collector thread.

    :return: False on failure
    """
    print "Running task %s" % task.task_id.value

    running(executor_driver, task)

    slave_location = validate_task_info(executor_driver, task)['slave_location']
    slave_id = resolve_slave_id(slave_location)

    monitor_endpoint = 'http://%s/monitor/statistics.json' % slave_location

    samples = {}
    sample_count = 0

    print "Start sample loop..."

    # Sample loop.
    while True:
        # Poor man's GC: we lose one sample per framework every 10,000 iterations.
        sample_count += 1
        if sample_count > 10000:
            print "Cleaning samples..."
            sample_count = 0
            samples = {}

        stellar_samples = []

        # Compute slave global allocation slacks.
        metrics_snapshot = json_helper.from_url(
            'http://%s/metrics/snapshot' % slave_location)
        cpus_allocation_slack = metrics_snapshot['slave/cpus_total'] - \
            metrics_snapshot['slave/cpus_used']
        mem_allocation_slack = metrics_snapshot['slave/mem_total'] - \
            metrics_snapshot['slave/mem_used']

        # TODO(nnielsen): If slave is unreachable after a certain number of
        # retries, send TASK_FAILED and abort.

        # Collect the latest resource usage statistics.
        # TODO(nnielsen): Make sample rate configurable.
        # TODO(nnielsen): Batch samples.
        # TODO(nnielsen): We can adjust sample rate based on time of previous request.
        for sample in json_helper.from_url(monitor_endpoint):
            print 'Collecting sample at \'%s\'' % monitor_endpoint

            if 'statistics' in sample and 'timestamp' not in sample['statistics']:
                sample['statistics']['timestamp'] = time.time()

            # Validate sample.
            if not validate_statistics_sample(sample):
                print "Warning: partial sample %s" % sample
                continue

            framework_id = sample['framework_id']
            executor_id = sample['executor_id']

            # Initialize 2-level deep map of framework -> executor -> sample.
            if framework_id not in samples:
                samples[framework_id] = {}

            if executor_id not in samples[framework_id]:
                samples[framework_id][executor_id] = None

            if samples[framework_id][executor_id] is not None:
                # We need two samples to compute the cpu usage.
                stellar_sample = process_sample(samples[framework_id][executor_id],
                                                sample)

                # Add global metrics.
                stellar_sample['slave_id'] = slave_id
                stellar_sample['cpu_allocation_slack'] = cpus_allocation_slack
                stellar_sample['mem_allocation_slack'] = mem_allocation_slack

                stellar_samples.append(stellar_sample)

            # Save current sample for next sample processing.
            samples[framework_id][executor_id] = sample

        # Send samples if collected.
        if len(stellar_samples) > 0:
            running(executor_driver, task, json.dumps(stellar_samples))

        # One second sample rate.
        time.sleep(1)
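# The refactored run_task above calls running() and process_sample(), which
# are not shown in this section. The sketches below reconstruct plausible
# implementations from the inline logic in the earlier run_task version;
# the exact signatures are assumptions.

def running(executor_driver, task, data=None):
    # Send a TASK_RUNNING status update, optionally carrying serialized
    # samples (mirrors the inline status updates in the earlier run_task).
    update = mesos_pb2.TaskStatus()
    update.task_id.value = task.task_id.value
    update.state = mesos_pb2.TASK_RUNNING
    if data is not None:
        update.data = data
    executor_driver.sendStatusUpdate(update)


def process_sample(prev, sample):
    # Derive usage and slack from two consecutive statistics samples, as the
    # earlier run_task did inline: cpu usage is the delta in user+system time
    # over the sampling interval; slack is limit minus usage.
    interval = sample['statistics']['timestamp'] - prev['statistics']['timestamp']
    user_time = sample['statistics']['cpus_user_time_secs'] - \
        prev['statistics']['cpus_user_time_secs']
    system_time = sample['statistics']['cpus_system_time_secs'] - \
        prev['statistics']['cpus_system_time_secs']
    cpu_usage = float(user_time + system_time) / float(interval)

    mem_usage = sample['statistics']['mem_rss_bytes']

    return {
        'framework_id': sample['framework_id'],
        'executor_id': sample['executor_id'],
        'cpu_usage': cpu_usage,
        'cpu_usage_slack': sample['statistics']['cpus_limit'] - cpu_usage,
        'mem_usage': mem_usage,
        'mem_usage_slack': sample['statistics']['mem_limit_bytes'] - mem_usage,
        'timestamp': sample['statistics']['timestamp']
    }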