def loop():
    """Run the monitoring daemon's main loop: call tick() once per
    configured interval, forever.

    Any exception raised by tick() is logged and swallowed so that a single
    failing iteration cannot kill the daemon.
    """
    while True:
        try:
            tick()
        except Exception:
            # traceback.format_exc() takes no exception argument; the original
            # passed the exception object as the `limit` parameter, which is
            # meaningless and breaks on Python 3.
            LOG.warning("Exception in main loop: %s" % traceback.format_exc())
        time.sleep(int(config.get_value(KEY_LOOP_INTERVAL_SECS)))
def get_ssh_keys():
    """Return the list of SSH private key file paths from config.

    The config value is a comma-separated list. An entry starting with '$'
    names an environment variable holding the key material; that key is
    materialized once into a file (KEY_FILE_NAME_PATTERN % var_name, mode
    600) and the entry is replaced with the file path.
    """
    keys = re.split(r'\s*,\s*', config.get_value(KEY_SSH_KEYS))
    for i, key in enumerate(keys):
        # startswith() is safe on an empty entry, unlike key[0] which
        # raised IndexError on e.g. a trailing comma in the config value
        if not key.startswith('$'):
            continue
        var_name = key[1:]
        key_file = KEY_FILE_NAME_PATTERN % var_name
        if not os.path.isfile(key_file):
            key_value = os.environ.get(var_name)
            if key_value:
                marker_begin = '-----BEGIN RSA PRIVATE KEY-----'
                marker_end = '-----END RSA PRIVATE KEY-----'
                # Env vars often arrive with newlines collapsed to spaces.
                # Strip the markers first (they contain spaces themselves),
                # rebuild the line breaks, then re-attach the markers.
                key_value = key_value.replace(marker_begin, '')
                key_value = key_value.replace(marker_end, '')
                key_value = key_value.replace(' ', '\n')
                if marker_begin not in key_value:
                    key_value = ('%s\n' % marker_begin) + key_value
                if marker_end not in key_value:
                    key_value += ('\n%s' % marker_end)
                key_value = key_value.replace('\n\n', '\n')
                save_file(key_file, key_value)
                run('chmod 600 %s' % key_file)
            else:
                LOG.warning(
                    'Unable to read SSH key from environment variable: %s' % var_name)
        # the entry is rewritten to the file path even when the env var was
        # missing (preserving the original best-effort behavior)
        keys[i] = key_file
    return keys
def get_cluster_nodes(cluster_id, role=None):
    """Return the non-terminated instances of the given EMR cluster.

    Each instance dict is enriched in place with normalized keys:
    cid/iid/gid/ip/host/type/state/market ('n/a' where the source field
    is absent). When a custom domain name is configured for the cluster,
    the host name is rewritten to that domain.
    """
    emr_client = connect_emr(role=role)
    raw = run_func(emr_client.list_instances, ClusterId=cluster_id,
                   InstanceStates=['AWAITING_FULFILLMENT', 'PROVISIONING', 'BOOTSTRAPPING', 'RUNNING'],
                   cache_duration_secs=QUERY_CACHE_TIMEOUT)
    instances = json.loads(raw)['Instances']
    # read domain name config
    custom_dn = config.get_value(constants.KEY_CUSTOM_DOMAIN_NAME, section=SECTION_EMR, resource=cluster_id)
    # drop terminated instances up front (the original deleted them while
    # index-walking the list); order of the survivors is preserved
    nodes = [inst for inst in instances if inst['Status']['State'] != INSTANCE_STATE_TERMINATED]
    for inst in nodes:
        inst['cid'] = inst.get('Id', 'n/a')
        inst['iid'] = inst.get('Ec2InstanceId', 'n/a')
        inst['gid'] = inst.get('InstanceGroupId', 'n/a')
        inst['ip'] = inst.get('PrivateIpAddress', 'n/a')
        inst['host'] = inst.get('PrivateDnsName', 'n/a')
        if custom_dn:
            inst['host'] = ip_to_hostname(hostname_to_ip(inst['host']), custom_dn)
        inst['type'] = get_instance_group_type(cluster_id, inst['InstanceGroupId'], role=role)
        inst['state'] = inst['Status']['State']
        inst['market'] = get_instance_group_details(cluster_id, inst['InstanceGroupId'], role=role)['Market']
    return nodes
def get_node_queries(cluster):
    """Return a map of node hostname -> number of RUNNING queries on that
    node, obtained by running presto-cli over SSH on the cluster master.

    Returns an empty dict for the 'localhost' test cluster.
    """
    cmd = ('presto-cli --execute \\"SELECT n.http_uri,count(q.node_id) from system.runtime.nodes n ' +
           'left join (select * from system.runtime.queries where state = \'RUNNING\' ) as q ' +
           'on q.node_id = n.node_id group by n.http_uri\\"')
    result = {}
    if cluster.ip == 'localhost':
        # for testing purposes
        return result
    # run ssh command
    out = run_ssh(cmd, cluster.ip, user='******', cache_duration_secs=QUERY_CACHE_TIMEOUT)
    # remove SSH log output line
    out = remove_lines_from_string(out, r'.*Permanently added.*')
    # read config for domain
    custom_dn = config.get_value(constants.KEY_CUSTOM_DOMAIN_NAME, section=SECTION_EMR, resource=cluster.id)
    # assume input is actually domain name (not ip)
    dn = custom_dn if custom_dn else re.match(r'ip-[^\.]+\.(.+)', cluster.ip).group(1)
    for line in out.splitlines():
        # NOTE(review): re.sub returns the line unchanged when the pattern
        # does not match, so `ip` is truthy for any non-empty line — the
        # guard below only filters empty lines; confirm intended.
        ip = re.sub(r'.*http://([0-9\.]+):.*', r'\1', line)
        if ip:
            queries = re.sub(r'.*"([0-9\.]+)"$', r'\1', line)
            host = aws_common.ip_to_hostname(ip, dn)
            try:
                result[host] = int(queries)
            except ValueError:
                # count column was not a parsable integer; treat as 0 queries
                result[host] = 0
    # BUG FIX: the original fell off the end here and returned None on the
    # non-localhost path; return the collected per-node counts instead.
    return result
def run_ssh(cmd, host, user=None, keys=None, via_hosts=None, cache_duration_secs=0):
    """Run `cmd` on `host` over SSH, trying each configured key in turn.

    :param keys: SSH key files to try; defaults to the comma-separated
        config value KEY_SSH_KEYS.
    :param via_hosts: optional list of jump hosts to tunnel through; agent
        forwarding is enabled and each key is added to the agent.
    :return: command output of the first key that is accepted; implicitly
        None when every key is rejected with "Permission denied".
    :raises subprocess.CalledProcessError: on any SSH failure other than
        a public-key rejection.
    """
    # `via_hosts=None` avoids the mutable-default-argument pitfall of the
    # original `via_hosts=[]` signature
    if via_hosts is None:
        via_hosts = []
    if not keys:
        keys = config.get_value(KEY_SSH_KEYS).split(',')
    user = '******' % user if user else ''
    agent_forward = ''
    forward_addendum = ''
    ssh_configs = ('-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ' +
                   '-o PasswordAuthentication=no -o BatchMode=yes -o ConnectTimeout=3')
    if len(via_hosts) > 0:
        agent_forward = '-o ForwardAgent=yes'
        # build the chained "ssh jump1 ssh jump2 ..." prefix, innermost last
        for via_host in list(reversed(via_hosts)):
            forward_addendum = ('ssh %s %s%s ' % (ssh_configs, user, via_host)) + forward_addendum
    ssh_cmd_tmpl = 'ssh ' + ssh_configs + ' ' + agent_forward + ' -i %s %s%s "' + forward_addendum + '%s"'
    for key in keys:
        key = key.strip()
        ssh_cmd = ssh_cmd_tmpl % (key, user, host, cmd)
        if len(via_hosts) > 0:
            run('ssh-add %s 2>&1 > /dev/null' % key)
        try:
            out = run(ssh_cmd, cache_duration_secs)
            return out
        except subprocess.CalledProcessError as e:
            # TODO find a more elegant solution for this.
            if 'Permission denied (publickey)' not in e.output:
                raise e
    # NOTE(review): if every key is rejected we fall through and return
    # None — callers should be prepared for that.
def update_resources(resource_config):
    """Enable enhanced monitoring on every Kinesis resource whose config
    section has 'enable_enhanced_monitoring' set to the string 'true'.

    :param resource_config: iterable of resource objects with an `id`
        attribute; mutated in place.
    :return: the same resource_config, for chaining.
    """
    for resource in resource_config:
        # renamed from `id`, which shadowed the builtin
        resource_id = resource.id
        enabled = config.get_value('enable_enhanced_monitoring',
                                   section=SECTION_KINESIS, resource=resource_id)
        if enabled == 'true':
            resource.enhanced_monitoring = [MONITORING_METRICS_ALL]
    return resource_config
def select_tasknode_group(tasknodes_groups, cluster_id):
    """Pick the task-node instance group to use for scaling this cluster.

    A single candidate is returned as-is. With several candidates, the
    first group whose 'market' matches the cluster's configured preferred
    upscale market wins. Raises Exception when the list is empty or no
    group matches the preferred market.
    """
    if not tasknodes_groups:
        raise Exception("Empty list of task node instance groups for scaling: %s" % tasknodes_groups)
    if len(tasknodes_groups) == 1:
        return tasknodes_groups[0]
    preferred = config.get_value(KEY_PREFERRED_UPSCALE_INSTANCE_MARKET, cluster_id)
    chosen = next((group for group in tasknodes_groups if group['market'] == preferred), None)
    if chosen is not None:
        return chosen
    raise Exception("Could not select task node instance group for preferred market '%s': %s"
                    % (preferred, tasknodes_groups))
def get_state(cluster_id):
    """ Get cluster state
    ---
    operationId: 'getState'
    parameters:
      - name: cluster_id
        in: path
    """
    # docstring above is swagger spec and is kept verbatim
    interval_secs = int(config.get_value(KEY_MONITORING_INTERVAL_SECS))
    cluster_info = monitoring.collect_info(CLUSTERS[cluster_id],
                                           monitoring_interval_secs=interval_secs)
    return jsonify(cluster_info)
def loop():
    """Autoscaling main loop: fetch all resources, scale the ones that
    report a needed scaling action, then sleep for the configured
    interval — forever.

    Exceptions are logged and swallowed so one bad iteration cannot kill
    the daemon.
    """
    while True:
        LOG.info("Running next loop iteration")
        try:
            resource_list = resources.get_resources()
            for resource in resource_list:
                resource.fetch_data()
                scaling_required = resource.needs_scaling()
                if scaling_required:
                    resource.perform_scaling(scaling_required)
        except Exception:
            # traceback.format_exc() takes no exception argument; the
            # original passed the exception as the `limit` parameter.
            LOG.warning("Exception in main loop: %s" % traceback.format_exc())
        time.sleep(int(config.get_value(KEY_LOOP_INTERVAL_SECS)))
def get_emr_costs():
    """ Get summary of cluster costs and cost savings
    ---
    operationId: 'getEmrCosts'
    parameters:
      - name: 'request'
        in: body
    """
    # docstring above is swagger spec and is kept verbatim
    data = json.loads(request.data)
    cluster_id = data['cluster_id']
    # dict.get() replaces the `x if k in d else y` dance of the original
    num_datapoints = data.get('num_datapoints', 300)
    baseline_nodes = data.get('baseline_nodes')
    if baseline_nodes is None:
        # fall back to the configured per-cluster baseline (default 20)
        baseline_nodes = config.get_value(KEY_BASELINE_COMPARISON_NODES, section=SECTION_EMR,
                                          resource=cluster_id, default=20)
    baseline_nodes = int(baseline_nodes)
    info = database.history_get(section=SECTION_EMR, resource=cluster_id, limit=num_datapoints)
    common.remove_NaN(info)
    result = aws_pricing.get_cluster_savings(info, baseline_nodes)
    common.remove_NaN(result, delete_values=False, replacement=0)
    return jsonify(results=result, baseline_nodes=baseline_nodes)
def tick():
    """Run one autoscaling iteration over all configured clusters.

    For each Presto cluster that is listed in the autoscaling config:
    collect monitoring info, terminate idle nodes or spawn new task nodes
    as needed, clean up instances whose nodes are already inactive, and
    record the action taken in the monitoring history.
    """
    LOG.info("Running next loop iteration")
    monitoring_interval_secs = int(config.get_value(KEY_MONITORING_INTERVAL_SECS))
    # .items() behaves the same as the original py2-only .iteritems() here
    for cluster_id, details in CLUSTERS.items():
        cluster_ip = details['ip_public']
        info = None
        try:
            info = monitoring.collect_info(details, monitoring_interval_secs=monitoring_interval_secs)
        except Exception as e:
            LOG.warning("Error getting monitoring info for cluster %s: %s" % (cluster_id, e))
        if info:
            action = 'N/A'
            # Make sure we are only resizing Presto clusters atm
            if details['type'] == 'Presto':
                # Make sure we don't change clusters that are not configured
                if cluster_id in get_autoscaling_clusters():
                    try:
                        nodes_to_terminate = get_nodes_to_terminate(info)
                        # leftover py2 debug `print` replaced with the logger
                        LOG.info("Nodes to terminate: %s" % nodes_to_terminate)
                        if len(nodes_to_terminate) > 0:
                            for node in nodes_to_terminate:
                                terminate_node(cluster_ip, node['ip'], node['gid'])
                            action = 'DOWNSCALE(-%s)' % len(nodes_to_terminate)
                        else:
                            nodes_to_add = get_nodes_to_add(info)
                            if len(nodes_to_add) > 0:
                                tasknodes_groups = aws_common.get_instance_groups_tasknodes(cluster_id)
                                tasknodes_group = select_tasknode_group(tasknodes_groups, cluster_id)['id']
                                current_num_nodes = len([n for key, n in info['nodes'].items()
                                                         if n['gid'] == tasknodes_group])
                                spawn_nodes(cluster_ip, tasknodes_group, current_num_nodes, len(nodes_to_add))
                                action = 'UPSCALE(+%s)' % len(nodes_to_add)
                            else:
                                action = 'NOTHING'
                    except Exception:
                        # format_exc() takes no exception argument; the original
                        # passed the exception object as the `limit` parameter
                        LOG.warning("WARNING: Error downscaling/upscaling cluster %s: %s"
                                    % (cluster_id, traceback.format_exc()))
            # clean up and terminate instances whose nodes are already in inactive state
            aws_common.terminate_inactive_nodes(cluster_ip, info['nodes'])
            # store the state for future reference
            monitoring.history_add(cluster_id, info, action)
def get_iam_role_for_cluster(cluster):
    """Return the configured IAM role to assume for the given EMR cluster.

    Accepts either a cluster object (its `.id` is used) or the cluster id
    as a string.
    """
    # `basestring` kept on purpose: the file is Python 2 and must match
    # both str and unicode ids
    cluster_id = cluster if isinstance(cluster, basestring) else cluster.id
    return config.get_value('role_to_assume', section=SECTION_EMR, resource=cluster_id)
def get_iam_role_for_stream(stream):
    """Return the configured IAM role to assume for the given Kinesis stream.

    Accepts either a stream object (its `.id` is used) or the stream id
    as a string.
    """
    # `basestring` kept on purpose: the file is Python 2 and must match
    # both str and unicode ids
    stream_id = stream if isinstance(stream, basestring) else stream.id
    return config.get_value('role_to_assume', section=SECTION_KINESIS, resource=stream_id)
def get_autoscaling_clusters():
    """Return the cluster ids that autoscaling is enabled for, parsed from
    the comma-separated config value KEY_AUTOSCALING_CLUSTERS.
    """
    configured = config.get_value(KEY_AUTOSCALING_CLUSTERS)
    # re.split trims whitespace around the commas only (leading/trailing
    # whitespace of the whole value is preserved, as before)
    return re.split(r'\s*,\s*', configured)