def init_cluster():
    # wait for all cloudera agent processes to come up
    BDVLIB_ServiceWait(
        [["services", "cloudera_scm_agent", NODE_GROUP_ID, "kts"]])
    # make sure cloudera manager has received registration
    # for all new agents
    all_cloudera_hosts = get_hosts_for_service(
        ["services", "cloudera_scm_agent"])
    api = ApiResource(CM_HOST, username="******", password="******")
    while True:
        current_all_hosts = map(lambda x: x.hostname, api.get_all_hosts())
        setup_logger.info("Currently registered hosts with CM " +
                          str(current_all_hosts))
        if all(x in current_all_hosts for x in all_cloudera_hosts):
            break
        setup_logger.info(
            "waiting for new nodes to register with cloudera manager")
        time.sleep(10)
    manager = api.get_cloudera_manager()
    manager.update_config(CM_CONFIG)
    cluster = api.create_cluster(CLUSTER_NAME, CDH_MAJOR_VERSION,
                                 CDH_FULL_VERSION)
    KTS_HOSTS = ConfigMeta.getWithTokens(
        ['nodegroups', NODE_GROUP_ID, 'roles', 'kts', 'fqdns'])
    cluster.add_hosts(KTS_HOSTS)
    return (cluster, manager)
def adjust_yarn_memory_limits(region, stack_name, restart=True):
    ec2_conn = create_ec2_connection(region)
    manager_instance = get_manager_instance(ec2_conn, stack_name)
    with cm_tunnel_ctx(manager_instance) as local_port:
        cm_api = ApiResource('localhost', username='******', password='******',
                             server_port=local_port, version=9)
        cluster = list(cm_api.get_all_clusters())[0]
        host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
        yarn = filter(lambda x: x.type == 'YARN',
                      list(cluster.get_all_services()))[0]
        rm_cg = filter(lambda x: x.roleType == 'RESOURCEMANAGER',
                       list(yarn.get_all_role_config_groups()))[0]
        nm_cg = filter(lambda x: x.roleType == 'NODEMANAGER',
                       list(yarn.get_all_role_config_groups()))[0]
        rm_cg.update_config({
            'yarn_scheduler_maximum_allocation_mb': (
                int(host.totalPhysMemBytes / 1024. / 1024.)),
            'yarn_scheduler_maximum_allocation_vcores': host.numCores})
        nm_cg.update_config({
            'yarn_nodemanager_resource_memory_mb': (
                int(host.totalPhysMemBytes / 1024. / 1024.)),
            'yarn_nodemanager_resource_cpu_vcores': host.numCores})
        cluster.deploy_client_config().wait()
        if restart:
            cluster.restart().wait()
class ImpalaCluster(object):
    def __init__(self, cm_host, cm_cluster_name, username, password):
        self.cm_api = ApiResource(cm_host, username=username, password=password)
        self.hosts = dict()
        self.services = list()
        self.cluster_name = cm_cluster_name
        self.cluster = self.cm_api.get_cluster(cm_cluster_name)
        if self.cluster is None:
            raise RuntimeError('Cluster name "%s" not found' % cm_cluster_name)
        self.__load_hosts()
        self.__impala_service = ImpalaService(self)

    def _get_all_services(self):
        return self.cluster.get_all_services()

    def get_impala_service(self):
        return self.__impala_service

    def __load_hosts(self):
        self.hosts = dict()
        # Search for all hosts that are in the target cluster.
        # There is no API that provides the list of hosts in a given cluster, so to find
        # them we must loop through all the hosts and check the cluster name matches.
        for host_info in self.cm_api.get_all_hosts():
            # host_info doesn't include a link to the roleRef so need to do another lookup
            # based on the hostId.
            host = self.cm_api.get_host(host_info.hostId)
            for roleRef in host.roleRefs:
                if roleRef.get('clusterName') == self.cluster_name:
                    self.hosts[host_info.hostId] = Host(host)
                    break
def init_cluster():
    # wait for all cloudera agent processes to come up
    setup_logger.info("Creating Cluster.")
    BDVLIB_ServiceWait([["services", "cloudera_scm_agent", NODE_GROUP_ID]])
    # make sure cloudera manager has received registration
    # for all new agents
    all_cloudera_hosts = get_hosts_for_service(
        ["services", "cloudera_scm_agent"])
    api = ApiResource(CM_HOST, username=ADMIN_USER, password=ADMIN_PASS)
    while True:
        current_all_hosts = map(lambda x: x.hostname, api.get_all_hosts())
        setup_logger.info("Currently registered hosts with CM " +
                          str(current_all_hosts))
        if all(x in current_all_hosts for x in all_cloudera_hosts):
            break
        setup_logger.info(
            "waiting for new nodes to register with cloudera manager")
        time.sleep(10)
    manager = api.get_cloudera_manager()
    manager.update_config(CM_CONFIG)
    cluster = api.create_cluster(CLUSTER_NAME, CDH_MAJOR_VERSION,
                                 CDH_FULL_VERSION)
    cluster.add_hosts(ALL_HOSTS)
    # turn off host swap alerting
    hosts_swap_alert_off(api)
    setup_logger.info("Setting Up SPARK2 Repo....")
    add_spark2_repo(api)
    # set java home
    setup_logger.info("Setting Up Java Path....")
    hosts_set_javahome(api)
    return (cluster, manager)
def get_hosts(self):
    hosts = {}
    from cm_api.api_client import ApiResource
    api = ApiResource(self.host, self.port, self.username, self.password)
    for h in api.get_all_hosts():
        hosts[h.hostId] = h.ipAddress
    return hosts
def list_hosts(host, username, password, cafile):
    context = ssl.create_default_context(cafile=cafile)
    api = ApiResource(host, username=username, password=password,
                      use_tls=True, ssl_context=context)
    for h in api.get_all_hosts():
        print h.hostname
def get_cluster_specs():
    cm_api = ApiResource(os.environ['MANAGER_HOST'], username='******',
                         password='******', server_port=7180, version=9)
    host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
    cluster = list(cm_api.get_all_clusters())[0]
    yarn = filter(lambda x: x.type == 'YARN',
                  list(cluster.get_all_services()))[0]
    return {'num_worker_nodes': len(yarn.get_roles_by_type('NODEMANAGER')),
            'num_cores': host.numCores,
            'node_memory': host.totalPhysMemBytes}
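# Usage sketch (not from the original sources): derive per-container YARN sizing
# from the dict returned by get_cluster_specs() above. The containers_per_node
# parameter is an assumed, illustrative value.
def suggest_container_size(specs, containers_per_node=8):
    # node_memory is reported in bytes; convert to MB before dividing
    node_memory_mb = specs['node_memory'] / 1024 / 1024
    return {'container_memory_mb': node_memory_mb / containers_per_node,
            'container_vcores': max(1, specs['num_cores'] / containers_per_node),
            'total_cluster_vcores': specs['num_cores'] * specs['num_worker_nodes']}

# Example (requires MANAGER_HOST to be set for get_cluster_specs):
# print suggest_container_size(get_cluster_specs())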
def main(cm_host, user, password):
    api = ApiResource(cm_host, username=user, password=password)
    cluster = api.get_all_clusters()[0]
    try:
        cluster.get_service(service_name)
        print "Service %s already configured. Skipping" % service_name
    except ApiException:
        print "creating new service %s" % service_name
        add_kudu_service(cluster, service_name)
        create_kudu_roles(cluster, api.get_all_hosts())
        update_kudu_role_group_configs(cluster)
        start_service(cluster, service_name)
        update_impala_service(cluster, service_name)
        print "Waiting for cluster to restart stale services"
        cluster.restart(restart_only_stale_services=True,
                        redeploy_client_configuration=True).wait()
def get_cluster_info(manager_host, server_port=7180, username='******',
                     password='******'):
    cm_api = ApiResource(manager_host, username=username, password=password,
                         server_port=server_port, version=9)
    host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
    cluster = list(cm_api.get_all_clusters())[0]
    yarn = filter(lambda x: x.type == 'YARN',
                  list(cluster.get_all_services()))[0]
    hive = filter(lambda x: x.type == 'HIVE',
                  list(cluster.get_all_services()))[0]
    impala = filter(lambda x: x.type == 'IMPALA',
                    list(cluster.get_all_services()))[0]
    hive_hs2 = hive.get_roles_by_type('HIVESERVER2')[0]
    hive_host = cm_api.get_host(hive_hs2.hostRef.hostId).hostname
    hive_port = int(
        hive_hs2.get_config('full')['hs2_thrift_address_port'].default)
    impala_hs2 = impala.get_roles_by_type('IMPALAD')[0]
    impala_host = cm_api.get_host(impala_hs2.hostRef.hostId).hostname
    impala_port = int(impala_hs2.get_config('full')['hs2_port'].default)
    return {'num_worker_nodes': len(yarn.get_roles_by_type('NODEMANAGER')),
            'node_cores': host.numCores,
            'node_memory': host.totalPhysMemBytes,
            'hive_host': hive_host,
            'hive_port': hive_port,
            'impala_host': impala_host,
            'impala_port': impala_port}
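# Usage sketch (assumption, not part of the original sources): turn the HiveServer2
# and Impala endpoints returned by get_cluster_info() above into JDBC connection
# URLs for client tooling. 'cm.example.com' is a placeholder CM hostname.
info = get_cluster_info('cm.example.com')
hive_url = 'jdbc:hive2://%s:%d/default' % (info['hive_host'], info['hive_port'])
impala_url = 'jdbc:impala://%s:%d/default' % (info['impala_host'], info['impala_port'])
print hive_url
print impala_url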
def adjust_yarn_memory_limits(region, stack_name):
    ec2_conn = create_ec2_connection(region)
    manager_instance = get_manager_instance(ec2_conn, stack_name)
    cm_api = ApiResource("localhost", username="******", password="******",
                         server_port=64999, version=9)
    with http_tunnel_ctx(manager_instance, 7180, 64999):
        cluster = list(cm_api.get_all_clusters())[0]
        host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
        yarn = filter(lambda x: x.type == "YARN",
                      list(cluster.get_all_services()))[0]
        rm_cg = filter(lambda x: x.roleType == "RESOURCEMANAGER",
                       list(yarn.get_all_role_config_groups()))[0]
        nm_cg = filter(lambda x: x.roleType == "NODEMANAGER",
                       list(yarn.get_all_role_config_groups()))[0]
        rm_cg.update_config({
            "yarn_scheduler_maximum_allocation_mb": (
                int(host.totalPhysMemBytes / 1024.0 / 1024.0)),
            "yarn_scheduler_maximum_allocation_vcores": host.numCores,
        })
        nm_cg.update_config({
            "yarn_nodemanager_resource_memory_mb": (
                int(host.totalPhysMemBytes / 1024.0 / 1024.0)),
            "yarn_nodemanager_resource_cpu_vcores": host.numCores,
        })
        cluster.deploy_client_config().wait()
        cluster.restart().wait()
class handler_cm_api: def __init__(self): self._user_executing = grp.getgrnam(getpass.getuser())[0] def __getitem__(self): return self def setup(self, p_cm_host, p_cm_user, p_cm_pass, p_cm_version, p_cluster, p_cm_port=None, p_use_tls=False): self.cm_api = ApiResource(p_cm_host, server_port=p_cm_port, version=p_cm_version, username=p_cm_user, password=p_cm_pass, use_tls=p_use_tls) handler_cm_api.cluster_hosts = self.cm_api.get_all_hosts() if p_cluster: self.cluster = filter(lambda x: x.displayName == p_cluster, self.cm_api.get_all_clusters())[0] if not self.cluster: print("Error: That cluster is not valid.") return else: self.services = self.cluster.get_all_services() self.name = self.cluster.displayName tmp_topology = self.cluster.list_hosts() self.topology = {} for i in range(len(tmp_topology)): tmp_host = filter(lambda x: x.hostId == tmp_topology[i].hostId, handler_cm_api.cluster_hosts)[0] self.topology[tmp_topology[i].hostId] = tmp_host.hostname def get_current_group(self): return self._user_executing ############################### # For internal validations def __validate_service(self, p_service): v_service = filter(lambda x: x.type == p_service, self.services) if not v_service: print("Error: Service not found") raise SystemExit return v_service.pop() def __validate_hostname(self, p_hostname): v_node = filter(lambda x: x.hostname == p_hostname, handler_cm_api.cluster_hosts) if not v_node: print("Error: Hostname not found") raise SystemExit return v_node.pop() def __validate_role(self, p_service, p_role, p_hostname): v_service = self.__validate_service(p_service) v_node = self.__validate_hostname(p_hostname) v_roles = filter(lambda x: x.type == p_role, v_service.get_all_roles()) v_role = filter(lambda x: x.hostRef.hostId == v_node.hostId, v_roles) if not v_role: print("Error: Role not found in that host") raise SystemExit return v_role.pop() ###################################################################### # START/STOP/RESTART ###################################################################### def stop_cluster(self): v_cmd = self.cluster.stop() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def start_cluster(self): v_cmd = self.cluster.start() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def restart_cluster(self): v_cmd = self.cluster.restart() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def rolling_restart_cluster(self): v_cmd = self.cluster.rolling_restart() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) ###################################################################### #SERVICES ###################################################################### ################ # Status ################ # ------ State def check_state_services(self): for v_srv in self.services: print(coloring(v_srv.serviceState, v_srv.type)) def check_state_service(self, p_service): v_service = self.__validate_service(p_service) print(coloring(v_service.serviceState, v_service.type)) def check_health_services(self): for v_srv in self.services: print(coloring(v_srv.healthSummary, v_srv.type)) # ----- Health def check_health_service(self, p_service): v_service = self.__service_validate(p_service) print(coloring(v_service.healthSummary, v_service.type)) ##################################### # stop/start/restart/Rolling Restart ##################################### def stop_service(self, p_service): v_service = self.__validate_service(p_service) print("* Stopping " + v_service.type) v_cmd = v_service.stop() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def 
start_service(self, p_service): v_service = self.__validate_service(p_service) print("* Starting " + v_service.type) v_cmd = v_service.start() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def restart_service(self, p_service): v_service = self.__validate_service(p_service) print("* Restarting " + v_service.type) v_cmd = v_service.restart() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def rolling_restart_service(self, p_service): v_service = self.__validate_service(p_service) try: print(" * Rolling Restarting " + v_service.type) v_cmd = v_service.rolling_restart() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) except: if re.match("Command not valid for", str(sys.exc_info()[1])): print "It's not possible to use Rolling Restart in this service." else: raise ################################################################### # ROLES ################################################################### ################# # Status ################# # ---- State def check_state_roles(self, p_service): v_service = self.__validate_service(p_service) print("*" + v_service.type + ":") for v_role in v_services.get_all_roles(): print( coloring( v_role.roleState, filter(lambda x: x.hostId == v_role.hostRef.hostId, handler_cm_api.cluster_hosts)[0].hostname) + ":\t" + v_role.type) def check_state_role(self, p_service, p_role): v_service = self.__validate_service(p_service) print("*" + v_service.type + ":") v_roles = filter(lambda x: x.type == p_role, v_service.get_all_roles()) for v_role in v_roles: print( coloring( v_role.roleState, filter(lambda x: x.hostId == v_role.hostRef.hostId, handler_cm_api.cluster_hosts)[0].hostname) + ":\t" + v_role.type) def check_state_all_roles(self): for v_service in self.services: self.check_state_roles(v_service.type) print('---------------------') # ---- Health def check_health_roles(self, p_service): v_service = self.__validate_service(p_service) print("*" + v_service.type + ":") for v_role in v_service.get_all_roles(): print( coloring( v_role.healthSummary, filter(lambda x: x.hostId == v_role.hostRef.hostId, handler_cm_api.cluster_hosts)[0].hostname) + ":\t" + v_role.type) def check_health_role(self, p_service, p_role): v_service = self.__validate_service(p_service) print("*" + v_service.type + ":") v_roles = filter(lambda x: x.type == p_role, v_service.get_all_roles()) for v_role in v_roles: print( coloring( v_role.healthSummary, filter(lambda x: x.hostId == v_role.hostRef.hostId, handler_cm_api.cluster_hosts)[0].hostname) + ":\t" + v_role.type) def check_health_all_roles(self): for v_service in self.services: self.check_health_roles(v_service.type) print('---------------------') ##################### # Stop/Start/Restart def stop_role(self, p_service, p_role, p_hostname): v_service = self.__validate_service(p_service) v_node = self.__validate_hostname(p_hostname) v_role = self.__validate_role(p_service, p_role, p_hostname) print("* Stopping " + v_role.type) v_cmd = v_service.stop_roles(v_role.name) v_msg = f_waiting_task(v_cmd[0]) print(coloring(*v_msg)) def start_role(self, p_service, p_role, p_hostname): v_service = self.__validate_service(p_service) v_node = self.__validate_hostname(p_hostname) v_role = self.__validate_role(p_service, p_role, p_hostname) print("* Starting " + v_role.type) v_cmd = v_service.start_roles(v_role.name) v_msg = f_waiting_task(v_cmd[0]) print(coloring(*v_msg)) def restart_role(self, p_service, p_role, p_hostname): v_service = self.__validate_service(p_service) v_node = self.__validate_hostname(p_hostname) v_role = 
self.__validate_role(p_service, p_role, p_hostname) print("* restarting " + v_role.type) v_cmd = v_service.restart_roles(v_role.name) v_msg = f_waiting_task(v_cmd[0]) print(coloring(*v_msg)) ########################################################### #IMPALA QUERIES ########################################################### # FILTERS ############################ def setup_filters_impala_queries(self): v_start_time = raw_input( 'Introduce the start time with following format: DD/MM/YYYY_hh:mm:ss. Example: 01/01/2018_00:00:00: ' ) if not re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$", v_start_time): print("Error: Invalid Format for start time") return v_end_time = raw_input( 'Introduce the end time with the following format: DD/MM/YYYY_hh:mm:ss. Example 31/01/2018_00:00:00: ' ) if not re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$", v_end_time): print("Error: Invalid format for end time") return v_filter_type = raw_input( 'Choose the kind of filter: user|duration|state: ') if not v_filter_type in ('user', 'duration', 'state'): print("Error: Invalid kind of filter") return if v_filter_type == 'user': v_filter_value = raw_input( 'Introduce the user name you want to filter by: ') if not v_filter_value: print("Error: Invalid user name") return elif v_filter_type == 'duration': v_filter_value = raw_input( 'Introduce the query duration you want to filter by: +Xs|-Xs|=Xs. Example: +0s: ' ) if not re.match("^[+-=]\d+.\d*[hms]$", v_filter_value): print("Error: Invalid duration filter.") return elif v_filter_type == 'state': v_filter_value = raw_input( 'Introduce the query state you want to filter by: CREATED|INITIALIZED|COMPILED|RUNNING|FINISHED|EXCEPTION|UNKNOWN: ' ) if not v_filter_value in ('CREATED', 'INITIALIZED', 'COMPILED', 'RUNNING', 'FINISHED', 'EXCEPTION', 'UNKNOWN'): print("Error: Invalid state filter.") return v_limit = raw_input( "Introduce the max num of queries you want to check: ") if not re.match("^\d+$", v_limit): print("Error: Invalid limit. It has to be an integer") return return v_start_time, v_end_time, v_filter_type, v_filter_value, int( v_limit) ###################################### # Getting queries ###################################### def get_impala_queries(self, p_start_time=None, p_end_time=None, p_filter_type=None, p_filter_value=None, p_limit=None): if not (p_start_time and p_end_time and p_filter_type and p_filter_value and p_limit): p_start_time, p_end_time, p_filter_type, p_filter_value, p_limit = self.setup_filters_impala_queries( ) v_impala = filter(lambda x: x.type == 'IMPALA', self.services)[0] if not v_impala: print("Error: Impala service doesnt exist in this cluster.") return if re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$", p_start_time): v_start_time = datetime.strptime(p_start_time, '%d/%m/%Y_%H:%M:%S') else: print("Error. startTime format is not valid.") return if re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$", p_start_time): v_end_time = datetime.strptime(p_end_time, '%d/%m/%Y_%H:%M:%S') else: print("Error. 
startTime format is not valid.") return if p_filter_type == "user" and type(p_filter_value) == str: v_filter_str = 'user = '******'+': v_filter_value = p_filter_value.replace('+', '>') if p_filter_value[0] == '-': v_filter_value = p_filter_value.replace('-', '<') v_filter_str = 'queryDuration ' + v_filter_value elif p_filter_type == "state" and p_filter_value in ( 'CREATED', 'INITIALIZED', 'COMPILED', 'RUNNING', 'FINISHED', 'EXCEPTION', 'UNKNOWN'): v_filter_str = 'queryState = ' + v_filter_value else: print("Error: Filter is not valid.") return if type(p_limit) == int and p_limit < 201: v_limit = p_limit else: print("Error: Limit is not valid. It must be > 0 and <= 200") return v_queries = v_impala.get_impala_queries(v_start_time, v_end_time, v_filter_str, v_limit).queries v_output = '' for vq in v_queries: v_coordinator = filter(lambda x: x.hostId == vq.coordinator.hostId, self.cluster_hosts)[0].hostname v_output += COLORS.BLUE + "##################################################################################" + COLORS.RESET + "\n" v_output += vq.queryId + " -- " + vq.queryState + ":\n" v_output += COLORS.RED + vq.statement + COLORS.RESET + "\n" v_output += COLORS.GREEN + "--- Attributes ---" + COLORS.RESET + "\n" v_output += "Query Type: " + vq.queryType + "\n" if 'query_status' in vq.attributes.keys(): v_output += "Query Status: " + vq.attributes[ 'query_status'] + "\n" v_output += "User: "******"\n" v_output += "Database: " + vq.database + "\n" if 'pool' in vq.attributes.keys(): v_output += "Pool: " + vq.attributes['pool'] + "\n" v_output += "Starts at: " + vq.startTime.strftime( "%d/%m/%Y_%H:%M:%S") + "\n" v_output += "Ends at: " + vq.endTime.strftime( "%d/%m/%Y_%H:%M:%S") + "\n" v_output += "Coordinator: " + v_coordinator + "\n" v_output += "Rows Produced: " + str(vq.rowsProduced) + "\n" if vq.attributes['file_formats']: v_output += "File Format: " + vq.attributes[ 'file_formats'] + "\n" if 'hdfs_bytes_read' in vq.attributes.keys(): v_output += "HDFS bytes read: " + vq.attributes[ 'hdfs_bytes_read'] + "\n" if 'memory_aggregate_peak' in vq.attributes.keys(): v_output += "Memory Aggregate Peak: " + vq.attributes[ 'memory_aggregate_peak'] + "\n" if 'thread_cpu_time' in vq.attributes.keys(): v_output += "Threads Cpu Time: " + vq.attributes[ 'thread_cpu_time'] + "\n" print(v_output) print("Do you want to save the output? (Y/N)") v_save = raw_input("Your choice: ").upper() if v_save == 'Y': v_output_nc = re.sub("\\x1b\[\d+m", "", v_output) v_file = "/tmp/impala_queries_" + datetime.now().strftime( "%Y%m%d_%H%M%S") + ".log" with open(v_file, 'a') as file_output: file_output.write(v_output_nc) print("The output was written in: " + v_file) ###################### # Getting details ###################### def get_details_impala_query(self, p_query_id=None): if not p_query_id: v_query_id = raw_input( 'Introduce the query id you want to check the details: ') else: v_query_id = p_query_id v_impala = filter(lambda x: x.type == 'IMPALA', self.services)[0] v_queries = v_impala.get_impala_queries( datetime.now() - timedelta(days=30), datetime.now(), 'queryDuration > 0s', 1000).queries v_query = filter(lambda x: x.queryId == v_query_id, v_queries) if not v_query: print( "Error: The query_id is not valid, was executed more than 30 days ago or is not between the last 1000 queries. 1000 is the limit." 
) return elif not v_query[0].detailsAvailable: print("Error: This Query does not have details available.") return else: v_output = "/tmp/impala_query_details_" + v_query[ 0].queryId + "_" + datetime.now().strftime( "%Y%m%d_%H%M%S") + ".log" with open(v_output, 'a') as file_output: file_output.write( str(v_impala.get_query_details(v_query[0].queryId))) print("The output was written in: " + v_output) ####################### def get_same_configuration(self): v_configs = [] v_command = 'hadoop org.apache.hadoop.conf.Configuration' for v_node in self.topology.values(): v_ssh = subprocess.Popen( ["ssh", v_node, "-o", "StrictHostKeyChecking=no", v_command], stdout=subprocess.PIPE, stderr=subprocess.PIPE) v_configs += [v_ssh.stdout.readlines()] if len(self.topology) != len(v_configs): print( "Error: The num configs is different to the num of nodes in this cluster" ) return if v_configs[1:] == v_configs[:-1]: print(coloring('GOOD', "The configs are the same in all nodes.")) print("The nodes which were checked are: " + ', '.join(self.topology.values())) else: print(coloring('BAD', "The configs are not the same."))
class CMInventory(object): def _empty_inventory(self): return {"_meta" : {"hostvars" : {}}} def __init__(self): ''' Main execution path ''' self.config = ConfigParser.SafeConfigParser() if os.environ.get('CM_INI', ''): config_files = [os.environ['CM_INI']] else: config_files = CM_CONFIG_FILES for config_file in config_files: if os.path.exists(config_file): self.config.read(config_file) break # Load up connections info based on config and then environment variables username = (self.config.get('auth', 'username') or os.environ.get('CM_USERNAME', None)) password = (self.config.get('auth', 'password') or os.environ.get('CM_PASSWORD', None)) host = (self.config.get('auth', 'host') or os.environ.get('CM_HOST', None)) if self.config.has_option('auth', 'port'): port = self.config.get('auth', 'port') else: port = os.environ.get('CM_PORT', None) if self.config.has_option('auth', 'use_tls'): use_tls = self.config.get('auth', 'use_tls') else: use_tls = os.environ.get('CM_USETLS', False) if self.config.has_option('auth', 'version'): version = self.config.get('auth', 'version') else: version = os.environ.get('CM_VERSION', None) # Limit the clusters being scanned self.filter_clusters = os.environ.get('CM_CLUSTERS') if not self.filter_clusters and self.config.has_option('defaults', 'clusters'): self.filter_clusters = self.config.get('defaults', 'clusters') if self.filter_clusters: self.filter_clusters = [x.strip() for x in self.filter_clusters.split(',') if x.strip()] self.inv_lock = Lock() self.cm = ApiResource(host, port, username, password, use_tls) def _put_cache(self, name, value): ''' Saves the value to cache with the name given. ''' if self.config.has_option('defaults', 'cache_dir'): cache_dir = os.path.expanduser(self.config.get('defaults', 'cache_dir')) if not os.path.exists(cache_dir): os.makedirs(cache_dir) cache_file = os.path.join(cache_dir, name) with open(cache_file, 'w') as cache: json.dump(value, cache) def _get_cache(self, name, default=None): ''' Retrieves the value from cache for the given name. ''' if self.config.has_option('defaults', 'cache_dir'): cache_dir = self.config.get('defaults', 'cache_dir') cache_file = os.path.expanduser( os.path.join(cache_dir, name) ) print cache_file if os.path.exists(cache_file): print "here" if self.config.has_option('defaults', 'cache_max_age'): cache_max_age = self.config.getint('defaults', 'cache_max_age') else: cache_max_age = 0 cache_stat = os.stat(cache_file) if (cache_stat.st_mtime + cache_max_age) >= time.time(): with open(cache_file) as cache: return json.load(cache) return default def get_host(self, hostname): inv = self._get_cache(hostname, None) if inv is not None: return inv if inv is None: try: inv = hosts.get_host(self.cm, hostname) except ObjectNotFoundError: pass if inv is not None: self._put_cache(hostname, inv) return inv or {} def _add_host(self, inv, parent_group, host_name): ''' Add the host to the parent group in the given inventory. ''' with self.inv_lock: p_group = inv.setdefault(parent_group, []) if isinstance(p_group, dict): group_hosts = p_group.setdefault('hosts', []) else: group_hosts = p_group if host_name not in group_hosts: group_hosts.append(host_name) def _add_child(self, inv, parent_group, child_group): ''' Add a child group to a parent group in the given inventory. 
''' if parent_group != 'all': with self.inv_lock: p_group = inv.setdefault(parent_group, {}) if not isinstance(p_group, dict): with self.inv_lock: inv[parent_group] = {'hosts': p_group} p_group = inv[parent_group] group_children = p_group.setdefault('children', []) if child_group not in group_children: group_children.append(child_group) with self.inv_lock: inv.setdefault(child_group, []) def get_inventory(self, meta_hostvars=True, n_threads=5): ''' Reads the inventory from cache or VMware API via pSphere. ''' # Use different cache names for guests only vs. all hosts. cache_name = '__inventory_all__' inv = self._get_cache(cache_name, None) if inv is not None: print "Here" return inv def _build_host_inventory(hostRef,inv,meta_hostvars): host = hosts.get_host(self.cm, hostRef.hostId) #print host.hostname self._add_host(inv, 'all', host.hostname) if meta_hostvars: inv['_meta']['hostvars'][host.hostname] = host.to_json_dict(preserve_ro=True) self._put_cache(host.hostname, host.to_json_dict(preserve_ro=True)) # Group by cluster if host.clusterRef: cluster = clusters.get_cluster(self.cm, host.clusterRef.clusterName) self._add_child(inv, 'all', cluster.displayName) self._add_host(inv, cluster.displayName, host.hostname) if host.roleRefs: for roleRef in host.roleRefs: role = roles.get_role(self.cm, roleRef.serviceName, roleRef.roleName, roleRef.clusterName) # Group by service service = services.get_service(self.cm, roleRef.serviceName, roleRef.clusterName) # There is no way to ensure that service display name is unique across clusters # The only simple and unique representation of the service that can be used # is the concatination of the service name and the cluster's name service_group = cluster.displayName + '-' + service.displayName self._add_child(inv, 'all', service.type) self._add_child(inv, service.type, service_group) self._add_child(inv, cluster.displayName, service_group) self._add_host(inv, service_group, host.hostname) # Group by role, roles depend on services and clusters, so the only unique and # simple representation of a Group is the concatination of the role type, service # name and the cluster name role_group = cluster.displayName + '-' + service.displayName + '-' + role.type self._add_child(inv, 'all', role.type) #self._add_child(inv, role.type, service_group) #self._add_child(inv, service_group, role_group) self._add_child(inv, role.type, role_group) self._add_host(inv, role_group, host.hostname) # Group by role Group role_group = role.roleConfigGroupRef.roleConfigGroupName self._add_child(inv, role.type, role_group) self._add_host(inv, role_group, host.hostname) # Group by role template for template in host_templates.get_all_host_templates(self.cm, host.clusterRef.clusterName): self._add_child(inv, 'all', template.name) for group in template.roleConfigGroupRefs: if role_group == group.roleConfigGroupName: self._add_child(inv, template.name, role_group) else: self._add_child(inv, 'all', 'no_role') self._add_host(inv, 'no_role', host.hostname) # Group by Rack self._add_child(inv, 'all', host.rackId) self._add_host(inv, host.rackId, host.clusterRef.clusterName) else: cluster_group = "no_cluster" self._add_child(inv, 'all', cluster_group) self._add_host(inv, cluster_group, host.hostname) inv = {'all': {'hosts': []}} if meta_hostvars: inv['_meta'] = {'hostvars': {}} if self.filter_clusters: # Loop through clusters and find hosts: hosts_list = [] for host in self.cm.get_all_hosts(): if host.clusterRef: if clusters.get_cluster(self.cm, host.clusterRef.clusterName).displayName in 
self.filter_clusters: hosts_list.append(host) else: # Get list of all hosts hosts_list = self.cm.get_all_hosts() if n_threads == 1: for hostRef in hosts_list: _build_host_inventory(inv,hostRef,meta_hostvars) else: _partial_build_host_inventory = partial(_build_host_inventory, inv=inv,meta_hostvars=meta_hostvars) pool = ThreadPool(n_threads) if sys.version_info <= (2, 6): pool.map(_partial_build_host_inventory, hosts_list) else: pool.map_async(_partial_build_host_inventory, hosts_list).get(1 << 31) self._put_cache(cache_name, inv) return inv
def create_cluster(config_dict):
    config.read(['./conf/hadrian.ini', './conf/cluster_specs.ini',
                 './conf/cloudera-manager/cm.ini'])
    cm_cluster_name = config_grabber("Globals")['cm.cluster.name']
    cm_username = config_grabber("Globals")['cm.username']
    cm_password = config_grabber("Globals")['cm.password']
    cm_port = config_grabber("Globals")['cm.port']
    version = config_grabber('Globals')['cdh.cluster.version']
    cm_server = config_grabber(cm_cluster_name + '-en')['cm.server']

    # Grab all configuration files in the directory with the CM Cluster Name.
    for i in os.listdir('./conf/' + cm_cluster_name):
        config.read('./conf/' + cm_cluster_name + '/' + i)

    all_nodes = list()
    while (get_cm_status(cm_server + ':' + cm_port) != 200):
        print 'Waiting for CM Server to start... '
        time.sleep(15)

    api = ApiResource(cm_server, cm_port, cm_username, cm_password)

    # create cluster
    cluster = api.create_cluster(cm_cluster_name, version.upper())

    # Config CM
    print 'Applying any configuration changes to Cloudera Manager'
    cmanager = api.get_cloudera_manager()
    cmanager.update_config(config_grabber('cloudera-manager-updates'))

    planned_nodes = config_grabber(cm_cluster_name + '-en')['full.list'].split(',')
    for k, v in config_grabber(cm_cluster_name + '-dn').iteritems():
        for j in v.split(','):
            planned_nodes.append(j)

    # TODO make this smarter. show which agents haven't checked in.
    # Add the option to continue without them.
    if len(api.get_all_hosts()) != len(planned_nodes):
        print 'Waiting for all agents to check into the CM Server before continuing.'
        while len(planned_nodes) > len(api.get_all_hosts()):
            print 'Waiting for the final set of CM Agent nodes to check in.'
            time.sleep(5)

    print 'Updating Rack configuration for data nodes.'
    all_hosts = list()
    for host in api.get_all_hosts():
        all_hosts.append(host.hostId)
        for k, v in config_grabber(cm_cluster_name + '-dn').iteritems():
            if host.hostname in v:
                print 'Setting host: ' + host.hostname + ' to rack /default/' + k
                host.set_rack_id('/default/' + k)

    print 'Adding all hosts to cluster.'
    cluster.add_hosts(all_hosts)

    # download CDH Parcels
    # TODO add some logic here to make the parcel list something that's read from the hadrian.ini
    # This will allow support for other CDH packages, Search, etc.
    if config_grabber('Globals')['cdh.distribution.method'] == 'parcels':
        distribute_parcel(cluster, 'CDH',
                          config_grabber("Globals")['cdh.parcel.version'])

    if config_dict.get('hdfs_ha') == True:
        create_zookeeper_service(config_dict, cluster)

    create_hdfs_service(config_dict, cluster)

    cmd = cluster.deploy_client_config()
    if not cmd.wait(CMD_TIMEOUT).success:
        print 'Failed to deploy client configurations'
    else:
        print 'Client configuration deployment complete.'

    create_mapred_service(config_dict, cluster, cm_server)

    if config_dict.get('hbase') == True:
        if config_dict.get('hdfs_ha') == False:
            create_zookeeper_service(config_dict, cluster)
        create_hbase_service(config_dict, cluster)

    if config_dict.get('hive') == True:
        create_hive_service(config_dict, cluster)

    print 'Starting final client configuration deployment for all services.'
    cmd = cluster.deploy_client_config()
    if not cmd.wait(CMD_TIMEOUT).success:
        print 'Failed to deploy client configuration.'
    else:
        print 'Client configuration deployment complete. The cluster is all yours. Happy Hadooping.'
# Get a handle to the API client
from cm_api.api_client import ApiResource
import time
import sys

cm_host = raw_input("Enter IP address of CM: ")
cm_username = raw_input("Enter username: ")
cm_password = raw_input("Enter password: ")
api = ApiResource(cm_host, username=cm_username, password=cm_password)
hosts = api.get_all_hosts()

# Print all clusters
print "Clusters:"
cdh5 = None
for c in api.get_all_clusters():
    print c.name
    if c.version == "CDH5":
        cdh5 = c

# Print all hosts
print "Hosts:"
for i in hosts:
    print i

# cdh5.rolling_restart(stale_configs_only=1) works only in Enterprise version

# Get list of all services
print "Services:"
for s in cdh5.get_all_services():
    print s
    if s.type == "HDFS":
from cm_api.api_client import ApiResource

cm_host = "insilicodb.ulb.ac.be"
api = ApiResource(cm_host, username="******", password="******")
#print(api.get_all_clusters())

all_hosts = api.get_all_hosts(view='full')
#print(all_hosts)
all_hostnames = set([h.hostname for h in all_hosts])
#print(all_hostnames)

h = all_hosts[0]
#print(h)
#print(h.roleRefs)

role = api.get_cluster('cluster')
#t = role.get_service('hbase').get_role('hbase-MASTER-4e61083dbd483f97174ec27ec055c1d3').get_config(view='full')
t = role.get_service('hdfs')
#.get_role('hbase-MASTER-4e61083dbd483f97174ec27ec055c1d3').get_config(view='full')
t = t.get_role('hdfs-NAMENODE-4e61083dbd483f97174ec27ec055c1d3').get_config(view='full')

for key, value in t.iteritems():
    print(key)
    print line
    print "++Adding HOST to the Cluster"
    addHost = cluster.add_hosts(newHostList)
    # Waiting for 5 minutes so that the parcels get downloaded & distributed & activated
    print "++Wait Time++ 300 seconds"
    time.sleep(300)


if __name__ == '__main__':
    api = ApiResource(clouderaManagerHost, clouderaManagerPort,
                      clouderaManagerUserName, clouderaManagerPassword,
                      use_tls=clouderaManagerHTTPS)
    cluster = api.get_cluster(clusterDisplayName)
    hostlist = []
    for hostName in api.get_all_hosts():
        if hostName.hostname in newHosts:
            host = api.get_host(hostName.hostId)
            hostlist.append(host.hostId)
    addHost = addHostToCluster(api, cluster, hostlist)
    start_time = time.time()
    parcel = cluster.get_parcel('CDH', parcelVersion)
    # Check for parcel deployment errors.
    print "++ Checking Parcel Deployment"
    while True:
        if parcel.stage == 'ACTIVATED':
            print "CDH Parcels Activated"
            break
        if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
def main(): global ec2con global cwcon ec2con = boto.ec2.connect_to_region('us-east-1') cwcon = boto.ec2.cloudwatch.CloudWatchConnection() api = ApiResource(CM_HOST, username="******", password="******") displayName = None for c in api.get_all_clusters(): displayName = c.displayName print "Cluster: %s (%s)" % (displayName, c.name) inst_cache = {} insts = api.get_all_hosts('full') print "Found %s in the cluster" % [inst.hostId for inst in insts.objects] for inst in insts.objects: clusterName = inst.roleRefs[0].clusterName if clusterName <> c.name: print 'Clusters do not correspond: %s vs %s' % (clusterName, c.name) continue cores = inst.numCores inst_id = inst.hostId inst_cache[inst_id] = my_cache = {} # For later - we'll send in one data point for every TS query # that has AWS data my_cache['aws_info_recorded'] = False # my_cache['healthSummary'] = inst.healthSummary ress = ec2con.get_all_reservations(filters={'instance-id' : inst_id}) if len(ress) > 0: print "Found %s reservations for %s: %s" % (len(ress), inst_id, ress) res = ress[0] instances = res.instances if len(instances) > 1: print "Found %s instances for %s %s" % (len(instances), inst_id, instances) inst = instances[0] if inst.id <> inst_id: raise Exception("%s != %s" % (inst.id, inst_id)) platform = inst.platform vpc_id = inst.vpc_id if platform == 'windows': product = 'Windows' elif not platform: product = 'Linux_UNIX' else: product = 'UNKNOWN' if vpc_id: product += "_Amazon_VPC" ami = inst.image_id my_cache['product'] = product my_cache['region'] = inst.region.name my_cache['zone'] = inst.placement inst_type = inst.instance_type.replace('.','_') my_cache['inst_type'] = inst_type time_f = arrow.utcnow().replace(minutes=common.DEFAULT_LOOKBACK_MINUTES) time_t = arrow.utcnow() # TODO # http://arr.gr/blog/2013/08/monitoring-ec2-instance-memory-usage-with-cloudwatch/ # http://blog.sciencelogic.com/netflix-steals-time-in-the-cloud-and-from-users/03/2011 # https://www.stackdriver.com/cpu-steal-why-aws-cloudwatch-metrics-are-different-than-agent-metrics/ stat = cwcon.get_metric_statistics(300, time_f, time_t, 'CPUUtilization', 'AWS/EC2', ['Average','Minimum','Maximum'], { 'InstanceId' : inst_id }) # [{u'Timestamp': datetime.datetime(2014, 4, 13, 6, 5), u'Average': 0.35250000000000004, u'Minimum': 0.33, u'Maximum': 0.42, u'Unit': u'Percent'}] print 'Fetching stats for %s: %s' % (inst_id, stat) if stat: for s in stat: ts = common.ts_from_aws(s) my_cache['avg_cpu'] = float(s['Average']) else: print "No stats found for %s" % inst_id print "Querying CDH." 
series = api.query_timeseries('SELECT * WHERE clusterName = %s' % c.name) for entry in series.objects[0].timeSeries: # print entry.metadata.__dict__ metric = entry.metadata.metricName # internal host hostname = "" if 'hostname' in entry.metadata.attributes: host = entry.metadata.attributes['hostname'] inst_id = "" my_cache = {} if 'hostId' in entry.metadata.attributes: inst_id = entry.metadata.attributes['hostId'] if inst_id not in my_cache: print "Cannot find %s in %s" % (inst_id, inst_cache) my_cache = inst_cache[inst_id] service_name = "" if 'serviceName' in entry.metadata.attributes: service_name = entry.metadata.attributes['serviceName'] service_type = "" if 'serviceType' in entry.metadata.attributes: service_type= entry.metadata.attributes['serviceType'] role_type = "" if 'roleType' in entry.metadata.attributes: role_type = entry.metadata.attributes['roleType'] num = entry.metadata.unitNumerators denom = entry.metadata.unitDenominators if len(num) > 1: print "Num:" + num if len(denom)>1: print "Denom:" + denom unit = num[0] if len(denom) > 0: unit += denom[0] tags = { 'cdh_service_name_service_type_role_type' : "%s.%s.%s" % ( service_name, service_type, role_type), 'unit' : unit } combined_tags = deepcopy(tags) if my_cache: # combined_tags['healthSummary']= my_cache['healthSummary'] combined_tags['inst_type'] = my_cache['inst_type'] combined_tags['cloud'] = 'aws' combined_tags['region'] = my_cache['region'] combined_tags['zone'] = my_cache['zone'] combined_tags['product'] = my_cache['product'] if not entry.data: continue for sample in entry.data: ts = arrow.Arrow.fromdatetime(sample.timestamp).timestamp val = sample.value if len(combined_tags) > 8: print "ERROR: Too many tags: %s" % combined_tags sys.exit(0) common.otsdb_send(metric, val, combined_tags, ts, False) # Do the AWS once only if my_cache and not my_cache['aws_info_recorded']: # print my_cache combined_tags['unit'] = 'percent' if 'avg_cpu' in my_cache: common.otsdb_send('aws_average_cpu_utilization', my_cache['avg_cpu'], combined_tags, my_cache['ts'], False)
# Get Cloudera Manager, config, and ODP Cluster
logging.info('Retrieving Cloudera Manager service and cluster instance')
api = ApiResource(cloudera_manager_server_api, 7180, management_console_username,
                  management_console_password, version=api_version)
cloudera_manager = ClouderaManager(api)
cloudera_manager_config = api.get_cloudera_manager().get_config(view='full')
cluster_name = 'Open Data Platform'
cluster = api.get_cluster(cluster_name)

# Retrieve all ApiHost objects, locate the management server and add others to clients
logging.info('Retrieving all hosts from cluster')
hosts = api.get_all_hosts()
clients = []
for host in hosts:
    # Suppress Clock Offset warning that incorrectly states chrony is not working
    host.update_config({'host_health_suppression_host_clock_offset': 'true'})
    # Separate Cloudera Manager Server from agents
    if host.hostname == cloudera_management_server_fqdn:
        cloudera_management_server = host
    else:
        clients.append(host)

num_data_nodes = len(clients) + 1  # Every node is a datanode, so sum # clients with mgmt server

# Create Zookeeper Service
def main():
    config.read([
        "./conf/hadrian.ini", "./conf/cluster_specs.ini",
        "./conf/cloudera-manager/cm.ini"
    ])
    cm_cluster_name = config_grabber("Globals")["cm.cluster.name"]
    cm_username = config_grabber("Globals")["cm.username"]
    cm_password = config_grabber("Globals")["cm.password"]
    cm_port = config_grabber("Globals")["cm.port"]
    version = config_grabber("Globals")["cdh.cluster.version"]
    cm_server = config_grabber(cm_cluster_name + "-hn")["cm.server"]

    # Grab all configuration files in the directory with the CM Cluster Name.
    for i in os.listdir("./conf/" + cm_cluster_name):
        config.read("./conf/" + cm_cluster_name + "/" + i)

    while (get_cm_status(cm_server + ":" + cm_port) != 200):
        logging.info("Waiting for CM Server to start... ")
        time.sleep(15)

    api = ApiResource(cm_server, cm_port, cm_username, cm_password, version=12)

    # create cluster or get existing cluster
    cluster_exists = False
    for i in api.get_all_clusters():
        if i.name == cm_cluster_name:
            cluster_exists = True

    if cluster_exists == False:
        cluster = api.create_cluster(cm_cluster_name, version.upper())

        planned_nodes = config_grabber(cm_cluster_name + "-hn")["full.list"].split(",")
        for k, v in config_grabber(cm_cluster_name + "-dn").iteritems():
            for j in v.split(","):
                planned_nodes.append(j)

        # TODO make this smarter. show which agents haven't checked in.
        # Add the option to continue without them.
        if len(api.get_all_hosts()) != len(planned_nodes):
            logging.info(
                "Waiting for all agents to check into the CM Server before continuing."
            )
            while len(planned_nodes) > len(api.get_all_hosts()):
                logging.info(
                    "Waiting for the final set of CM Agent nodes to check in.")
                time.sleep(5)

        logging.info("Updating Rack configuration for data nodes.")
        all_hosts = list()
        for host in api.get_all_hosts():
            all_hosts.append(host.hostId)
            for k, v in config_grabber(cm_cluster_name + "-dn").iteritems():
                if host.hostname in v:
                    logging.info("Setting host: " + host.hostname +
                                 " to rack /" + k)
                    host.set_rack_id("/" + k)

        logging.info("Adding all hosts to cluster.")
        cluster.add_hosts(all_hosts)
    else:
        cluster = api.get_cluster(cm_cluster_name)

    # Config CM
    logging.info("Applying any configuration changes to Cloudera Manager")
    cmanager = api.get_cloudera_manager()
    cmanager.update_config(config_grabber("cloudera-manager-updates"))

    if os.path.exists("/root/hadrian/cm_license.txt"):
        with open("/root/hadrian/cm_license.txt", "r") as license:
            logging.info("Applying Enterprise License to Cloudera Manager")
            cmanager.update_license(license.read())

    if config_grabber('Globals')['cdh.distribution.method'] == 'parcels':
        # increase the parcel refresh frequency to one minute to find parcel repos in a more timely manner
        cmanager.update_config({"PARCEL_UPDATE_FREQ": 1})
        distribute_parcel(cluster, 'CDH',
                          config_grabber('Globals')['cdh.parcel.version'])
        distribute_parcel(cluster, 'KAFKA',
                          config_grabber('Globals')['kafka.parcel.version'])
        # restore parcel refresh time period to original 60 minutes
        cmanager.update_config({"PARCEL_UPDATE_FREQ": 60})

    # grab current services, so that we can skip services already defined to make this script reentrant
    current_services = []
    for i in cluster.get_all_services():
        current_services.append(i.type)

    if "ZOOKEEPER" not in current_services:
        create_zookeeper_service(cluster)
    if "HDFS" not in current_services:
        create_hdfs_service(cluster, api)
    if "YARN" not in current_services:
        create_yarn_service(cluster)
    if "HIVE" not in current_services:
        create_hive_service(cluster)
    if "IMPALA" not in current_services:
        create_impala_service(cluster)
    if "KAFKA" not in current_services:
        create_kafka_service(cluster)

    if config_grabber("Globals")["kerberos.enabled"].lower() == "true":
        enable_kerberos(cluster, cmanager)
    else:
        logging.info("Starting remaining services.")
        cmd = cluster.start()
        if not cmd.wait(CMD_TIMEOUT).success:
            logging.info(
                "Error in cluster services start. Please review Cloudera Manager for details."
            )
        else:
            logging.info("Remaining cluster services started.")

    logging.info(
        "Starting final client configuration deployment for all services.")
    cmd = cluster.deploy_client_config()
    if not cmd.wait(CMD_TIMEOUT).success:
        logging.info("Failed to deploy client configuration.")
    else:
        logging.info(
            "Client configuration deployment complete. The cluster is all yours. Happy Hadooping."
        )
#!/usr/bin/env python
# author Steven
# auto fill the Rack ID if it's /null
import simplejson as json
import urllib2, base64
import re
from cm_api.api_client import ApiResource


def get_rackID(host):
    url = "https://cartographer.siri.apple.com/api/v2/hosts?host.hostname=" + str(host)
    request = urllib2.Request(url)
    result = urllib2.urlopen(request)
    jsoncont = result.read()
    for i in json.loads(jsoncont):
        #print i
        f = i['asset']['location_in_building'].split(".")
        cm_rack_id = "/" + f[0] + "." + f[1] + "." + f[2] + "." + f[3] + "." + f[4] + "." + f[5] + "/" + f[6]
        return cm_rack_id

#print get_rackID("flume001.sp07.siri.apple.com")

api = ApiResource('cm001.sp07.siri.apple.com', version=6, username='******', password='******')
for h in api.get_all_hosts():
    if h.rackId == "/null":
        # if h.hostname=="batch001.sp07.siri.apple.com":
        #     h.set_rack_id("/US.RMR.02.01.0903.06/010")
        print get_rackID(h.hostname), h.hostname
        h.set_rack_id(get_rackID(h.hostname))
        #print h.hostname
def main(): module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'}) for argument in MODULE_ARGUMENTS)) api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS, version=9) cluster_name = CLUSTER_NAME manager = api.get_cloudera_manager() action_a = module.params.get('action', None) if action_a == 'create_cluster': license_a = module.params.get('license', None) version_a = module.params.get('version', None) cluster_list = [x.name for x in api.get_all_clusters()] if cluster_name in cluster_list: module.exit_json(changed=False, msg='Cluster exists') else: cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a) if license_a == None: manager.begin_trial() else: manager.update_license(license_a.decode('base64')) module.exit_json(changed=True, msg='Cluster created') elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel', 'deploy_hdfs_base', 'deploy_hdfs_httpfs', 'deploy_hdfs_dn', 'deploy_hdfs_ha', 'deploy_rm_ha', 'set_config', 'service', 'deploy_service', 'deploy_service_worker_nodes', 'deploy_base_roles', 'run_command', 'cluster', 'create_snapshot_policy', 'deploy_configuration']: # more complicated actions that need a created cluster go here cluster = api.get_cluster(cluster_name) host_map = dict((api.get_host(x.hostId).hostname, x.hostId) for x in cluster.list_hosts()) # adds a host to the cluster # host_name should be in the internal DNS format, ip-xx-xx-xx.copute.internal if action_a == 'add_host': host_a = module.params.get('host', None) host_list = host_map.keys() if host_a in host_list: module.exit_json(changed=False, msg='Host already in cluster') else: try: cluster.add_hosts([host_a]) except ApiException: # if a host isn't there, it could be because the agent didn't manage to connect yet # so let's wait a moment for it sleep(120) cluster.add_hosts([host_a]) module.exit_json(changed=True, msg='Host added') # create management service and set it's basic configuration # this needs a separate function since management is handled # differently than the rest of services elif action_a == 'create_mgmt': host_a = module.params.get('host', None) # getting the management service is the only way to check if mgmt exists # an exception means there isn't one try: mgmt = manager.get_service() module.exit_json(changed=False, msg='Mgmt service already exists') except ApiException: pass mgmt = manager.create_mgmt_service(ApiServiceSetupInfo()) # this is ugly... and I see no good way to unuglify it firehose_passwd = Popen("sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") reports_passwd = Popen("sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") # since there is no easy way of configuring the manager... 
let's do it here :( role_conf = defaultdict(dict) role_conf['ACTIVITYMONITOR'] = { 'firehose_database_host': '{0}:7432'.format(host_a), 'firehose_database_user': '******', 'firehose_database_password': firehose_passwd, 'firehose_database_type': 'postgresql', 'firehose_database_name': 'amon', 'firehose_heapsize': '268435456', } role_conf['EVENTSERVER'] = { 'event_server_heapsize': '215964392' } role_conf['REPORTSMANAGER'] = { 'headlamp_database_host': '{0}:7432'.format(host_a), 'headlamp_database_user': '******', 'headlamp_database_password': reports_passwd, 'headlamp_database_type': 'postgresql', 'headlamp_database_name': 'rman', 'headlamp_heapsize': '268435456', } roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER', 'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER'] # create mangement roles for role in roles: mgmt.create_role('{0}-1'.format(role), role, host_map[host_a]) # update configuration of each for group in mgmt.get_all_role_config_groups(): group.update_config(role_conf[group.roleType]) mgmt.start().wait() # after starting this service needs time to spin up sleep(30) module.exit_json(changed=True, msg='Mgmt created and started') # deploy a given parcel on all hosts in the cluster # you can specify a substring of the version ending with latest, for example 5.3-latest instead of 5.3.5-1.cdh5.3.5.p0.4 elif action_a == 'deploy_parcel': name_a = module.params.get('name', None) version_a = module.params.get('version', None) if "latest" in version_a: available_versions = [x.version for x in cluster.get_all_parcels() if x.product == name_a] if "-latest" in version_a: version_substr = match('(.+?)-latest', version_a).group(1) # if version is just "latest", try to check everything else: version_substr = ".*" try: [version_parcel] = [x for x in available_versions if re.match(version_substr, x) != None] except ValueError: module.fail_json(msg='Specified version {0} doesnt appear in {1} or appears twice'.format(version_substr, available_versions)) else: version_parcel = version_a # we now go through various stages of getting the parcel # as there is no built-in way of waiting for an operation to complete # we use loops with sleep to get it done parcel = cluster.get_parcel(name_a, version_parcel) if parcel.stage == 'AVAILABLE_REMOTELY': parcel.start_download() while parcel.stage != 'DOWNLOADED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) sleep(10) if parcel.stage == 'DOWNLOADED': parcel.start_distribution() while parcel.stage != 'DISTRIBUTED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) # sleep while hosts report problems after the download for i in range(12): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break # since parcels are distributed automatically when a new host is added to a cluster # we can encounter the ,,ACTIVATING'' stage then if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING': if parcel.stage == 'DISTRIBUTED': parcel.activate() while parcel.stage != 'ACTIVATED': parcel = cluster.get_parcel(name_a, version_parcel) # this sleep has to be large because although the operation is very fast # it makes the management and cloudera hosts go bonkers, failing all of the health checks sleep(10) # sleep while hosts report problems after the distribution for i in range(60): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break 
module.exit_json(changed=True, msg='Parcel activated') if parcel.stage == 'ACTIVATED': module.exit_json(changed=False, msg='Parcel already activated') # if we get down here, something is not right module.fail_json(msg='Invalid parcel state') # deploy nodes for workers, according to SERVICE_WORKER_MAP # also give them sane names and init zookeeper and kafka ones # which need id's specified elif action_a == 'deploy_service_worker_nodes': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] role_name = SERVICE_WORKER_MAP[service_a]['name'] full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring'] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) nodes = [x for x in service.get_all_roles() if role_name in x.name] # if host already has the given group, we should skip it if host_map[host_a] in [x.hostRef.hostId for x in nodes]: module.exit_json(changed=False, msg='Host already is a {0}'.format(role_name)) # find out the highest id that currently exists else: node_names = [x.name for x in nodes] if len(node_names) == 0: # if no nodes, start numbering from 1 node_i = 1 else: # take the max number and add 1 to it node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1 if service_name == 'ZOOKEEPER': role = service.create_role(full_role_name.format(node_i), 'SERVER', host_a) # zookeeper needs a per-node ID in the configuration, so we set it now role.update_config({'serverId': node_i}) elif service_name == 'KAFKA': role = service.create_role(full_role_name.format(node_i), role_name, host_a) # kafka needs a per-node ID in the configuration, so we set it now role.update_config({'broker.id': node_i}) else: service.create_role(full_role_name.format(node_i), role_name, host_a) module.exit_json(changed=True, msg='Added host to {0} role'.format(role_name)) # deploy a service. just create it, don't do anything more # this is needed maily when we have to set service properties before role deployment elif action_a == 'deploy_service': name_a = module.params.get('name', None) if not name_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(name_a)) service_name = SERVICE_MAP[name_a] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) module.exit_json(changed=True, msg='{0} service created'.format(service_name)) else: module.exit_json(changed=False, msg='{0} service already exists'.format(service_name)) # deploy the base hdfs roles (the namenode and secondary) # this doesn't create the service, as at least one datanode should already be added! 
# the format also requires certain properties to be set before we run it elif action_a == 'deploy_hdfs_base': nn_host_a = module.params.get('nn_host', None) sn_host_a = module.params.get('sn_host', None) changed = False hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't create a secondary namenode when: #- there is one that already exists #- there is a second namenode, which means we have HA and don't need a secondary if not 'HDFS-SECONDARYNAMENODE' in hdfs_roles and not 'HDFS-NAMENODE-2' in hdfs_roles: hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE', sn_host_a) changed = True # create a namenode and format it's FS # formating the namenode requires at least one datanode and secondary namenode already in the cluster! if not 'HDFS-NAMENODE' in hdfs_roles: hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a) for command in hdfs.format_hdfs('HDFS-NAMENODE'): if command.wait().success == False: module.fail_json(msg='Failed formating HDFS namenode with error: {0}'.format(command.resultMessage)) changed = True module.exit_json(changed=changed, msg='Created HDFS service & NN roles') # enable HttpFS for HDFS # HUE require this for support HA in HDFS elif action_a == 'deploy_hdfs_httpfs': host_a = module.params.get('host', None) hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't install second instance of HttpFS if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0: module.exit_json(changed=False, msg='HDFS HttpFS service already exists') hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a]) module.exit_json(changed=True, msg='HDFS HttpFS service created') # enable HA for HDFS # this deletes the secondary namenode and creates a second namenode in it's place # also, this spawns 3 journal node and 2 failover controller roles elif action_a == 'deploy_hdfs_ha': sn_host_a = module.params.get('sn_host', None) jn_dir_a = module.params.get('jn_dir', None) jn_names_a = [module.params.get('jn1_host', None), module.params.get('jn2_host', None), module.params.get('jn3_host', None)] hdfs = cluster.get_service('HDFS') # if there's a second namenode, this means we already have HA enabled if not 'HDFS-NAMENODE-2' in [x.name for x in hdfs.get_all_roles()]: # this is bad and I should feel bad # jns is a list of dictionaries, each dict passes the required journalnode parameters jns = [{'jnHostId': host_map[jn_name], 'jnEditsDir': jn_dir_a, 'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)} for i, jn_name in enumerate(jn_names_a)] # this call is so long because we set some predictable names for the sevices command = hdfs.enable_nn_ha('HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns, zk_service_name='ZOOKEEPER', active_fc_name='HDFS-FAILOVERCONTROLLER-1', standby_fc_name='HDFS-FAILOVERCONTROLLER-2', standby_name='HDFS-NAMENODE-2') children = command.wait().children for command_children in children: # The format command is expected to fail, since we already formated the namenode if command_children.name != 'Format' and command.success == False: module.fail_json(msg='Command {0} failed when enabling HDFS HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for HDFS service') else: module.exit_json(changed=False, msg='HDFS HA already enabled') # enable HA for YARN elif action_a == 'deploy_rm_ha': sn_host_a = module.params.get('sn_host', None) yarn = cluster.get_service('YARN') # if there are two roles matching to 
this name, this means HA for YARN is enabled if len([0 for x in yarn.get_all_roles() if match('^YARN-RESOURCEMANAGER.*$', x.name) != None]) == 1: command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER') children = command.wait().children for command_children in children: if command.success == False: module.fail_json(msg='Command {0} failed when enabling YARN HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for YARN service') else: module.exit_json(changed=False, msg='YARN HA already enabled') # deploy the base roles for a service, according to BASE_SERVICE_ROLE_MAP # after the deployments run commands specified in BASE_SERVICE_ROLE_MAP elif action_a == 'deploy_base_roles': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] changed = False if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) service_roles = [x.name for x in service.get_all_roles()] # create each service from the map for (role_name, cloudera_name) in BASE_SERVICE_ROLE_MAP[service_a].items(): # check if role already exists, script cant compare it directly # after enabling HA on YARN roles will have random strings in names if len([0 for x in service_roles if match(role_name, x) != None]) == 0: service.create_role(role_name, cloudera_name, host_a) changed = True # init commmands if role_name in SERVICE_INIT_COMMANDS.keys(): for command_to_run in SERVICE_INIT_COMMANDS[role_name]: # different handling of commands specified by name and # ones specified by an instance method if ismethod(command_to_run): command = command_to_run(service) else: command = service.service_command_by_name(command_to_run) if command.wait().success == False: module.fail_json(msg='Running {0} failed with {1}'.format(command_to_run, command.resultMessage)) if changed == True: module.exit_json(changed=True, msg='Created base roles for {0}'.format(service_name)) else: module.exit_json(changed=False, msg='{0} base roles already exist'.format(service_name)) # deploy configuration - it always return changed elif action_a == 'deploy_configuration': service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] service = cluster.get_service(service_name) # deploying client configuration command = service.deploy_client_config() if command.wait().success == False: module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage)) module.exit_json(changed=True, msg='Configuration deployed') # set config values for a given service/role elif action_a == 'set_config': entity_a = module.params.get('entity', None) service_a = module.params.get('service', None) role_a = module.params.get('role', None) name_a = module.params.get('name', None) value_a = module.params.get('value', None) if not service_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(service_a)) # since management is handled differently, it needs a different service if service_a == 'management': service = manager.get_service() elif service_a == 'cm': service = manager else: service = cluster.get_service(SERVICE_MAP[service_a]) # role and service configs are handled differently if entity_a == 'service': prev_config = service.get_config() curr_config = service.update_config({name_a: value_a}) if service_a == 'cm': prev_config = [prev_config] curr_config = 
[curr_config] module.exit_json(changed=(str(prev_config[0]) != str(curr_config[0])), msg='Config value for {0}: {1}'.format(name_a, curr_config[0][name_a])) elif entity_a == 'role': if not role_a in ROLE_MAP: module.fail_json(msg='Unknown role: {0}'.format(role_a)) role = service.get_role_config_group(ROLE_MAP[role_a]) prev_config = role.get_config() curr_config = role.update_config({name_a: value_a}) module.exit_json(changed=(str(prev_config) != str(curr_config)), msg='Config value for {0}: {1}'.format(name_a, curr_config[name_a])) else: module.fail_json(msg='Invalid entity, must be one of: service, role') # handle service state # currently this can only start/restart a service elif action_a == 'service': state_a = module.params.get('state', None) service_a = module.params.get('service', None) try: if service_a == 'cm': service = manager.get_service() else: service = cluster.get_service(SERVICE_MAP[service_a]) except ApiException: module.fail_json(msg='Service does not exist') # when starting a service, we also deploy the client config for it if state_a == 'started': if service.serviceState == 'STARTED': module.exit_json(changed=False, msg='Service already running') method = service.start verb = "start" elif state_a == 'restarted': method = service.restart verb = "restart" try: command = service.deploy_client_config() if command.wait().success == False: module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage)) # since there is no way to check if a service handles client config deployments # we try our best and pass the exception if it doesn't except (ApiException, AttributeError): pass method().wait() # we need to wait for cloudera checks to complete... # otherwise it will report as failing sleep(10) for i in range(24): sleep(10) service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': break service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': module.exit_json(changed=True, msg='Service {0} successful'.format(verb)) else: module.fail_json(msg='Service {0} failed'.format(verb)) # handle cluster # currently this can only restart elif action_a == 'cluster': state_a = module.params.get('state', None) if state_a == 'restarted': command = cluster.restart(redeploy_client_configuration=True) if command.wait().success == False: module.fail_json(msg='Cluster restart failed with {0}'.format(command.resultMessage)) else: module.exit_json(changed=True, msg='Cluster restart successful') # Snapshot policy # only create is supported elif action_a == 'create_snapshot_policy': name_a = module.params.get('name', None) value_a = module.params.get('value', None) service_a = module.params.get('service', None) service = cluster.get_service(SERVICE_MAP[service_a]) payload = loads(value_a) # check whether the policy already exists; an exception is expected when configuring it for the first time try: test = service.get_snapshot_policy(name_a) module.exit_json(changed=False, msg='Defined policy already exists') except ApiException: pass try: command = service.create_snapshot_policy(payload) module.exit_json(changed=True, msg='Snapshot policy was created.') except (ApiException, AttributeError): module.fail_json(msg='ERROR in creating snapshot policy.')
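Nearly every branch above repeats the pattern "run a CM command, wait() on it, and fail_json on failure". A small helper along these lines (hypothetical name, sketched against the cm_api ApiCommand interface and the AnsibleModule instance used above) would keep each action to a single call:

# Minimal sketch, not part of the original module: wrap the repeated
# "command.wait().success == False -> fail_json" check.
# `module` is the AnsibleModule instance and `cmd` is a cm_api ApiCommand.
def wait_or_fail(module, cmd, context):
    """Wait for a Cloudera Manager command; abort the play if it failed."""
    result = cmd.wait()
    if not result.success:
        module.fail_json(msg='{0} failed with {1}'.format(context, result.resultMessage))
    return result

# e.g. wait_or_fail(module, service.deploy_client_config(), 'Deploying client config')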
def main(): module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'}) for argument in MODULE_ARGUMENTS)) api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS, version=10) cluster_name = CLUSTER_NAME manager = api.get_cloudera_manager() action_a = module.params.get('action', None) if action_a == 'create_cluster': license_a = module.params.get('license', None) version_a = module.params.get('version', None) cluster_list = [x.name for x in api.get_all_clusters()] if cluster_name in cluster_list: module.exit_json(changed=False, msg='Cluster exists') else: cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a) if license_a == None: manager.begin_trial() else: manager.update_license(license_a.decode('base64')) module.exit_json(changed=True, msg='Cluster created') elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel', 'deploy_hdfs_base', 'deploy_hdfs_httpfs', 'deploy_hdfs_dn', 'deploy_hdfs_ha', 'deploy_rm_ha', 'set_config', 'service', 'deploy_service', 'deploy_service_worker_nodes', 'deploy_base_roles', 'run_command', 'cluster','create_snapshot_policy']: # more complicated actions that need a created cluster go here cluster = api.get_cluster(cluster_name) host_map = dict((api.get_host(x.hostId).hostname, x.hostId) for x in cluster.list_hosts()) # adds a host to the cluster # host_name should be in the internal DNS format, ip-xx-xx-xx.copute.internal if action_a == 'add_host': host_a = module.params.get('host', None) host_list = host_map.keys() if host_a in host_list: module.exit_json(changed=False, msg='Host already in cluster') else: try: cluster.add_hosts([host_a]) except ApiException: # if a host isn't there, it could be because the agent didn't manage to connect yet # so let's wait a moment for it sleep(120) cluster.add_hosts([host_a]) module.exit_json(changed=True, msg='Host added') # create management service and set it's basic configuration # this needs a separate function since management is handled # differently than the rest of services elif action_a == 'create_mgmt': host_a = module.params.get('host', None) # getting the management service is the only way to check if mgmt exists # an exception means there isn't one try: mgmt = manager.get_service() module.exit_json(changed=False, msg='Mgmt service already exists') except ApiException: pass mgmt = manager.create_mgmt_service(ApiServiceSetupInfo()) # this is ugly... and I see no good way to unuglify it firehose_passwd = Popen("sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") reports_passwd = Popen("sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") # since there is no easy way of configuring the manager... 
let's do it here :( role_conf = defaultdict(dict) role_conf['ACTIVITYMONITOR'] = { 'firehose_database_host': '{0}:7432'.format(host_a), 'firehose_database_user': '******', 'firehose_database_password': firehose_passwd, 'firehose_database_type': 'postgresql', 'firehose_database_name': 'amon', 'firehose_heapsize': '268435456', } role_conf['EVENTSERVER'] = { 'event_server_heapsize': '215964392' } role_conf['REPORTSMANAGER'] = { 'headlamp_database_host': '{0}:7432'.format(host_a), 'headlamp_database_user': '******', 'headlamp_database_password': reports_passwd, 'headlamp_database_type': 'postgresql', 'headlamp_database_name': 'rman', 'headlamp_heapsize': '215964392', } roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER', 'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER'] # create mangement roles for role in roles: mgmt.create_role('{0}-1'.format(role), role, host_map[host_a]) # update configuration of each for group in mgmt.get_all_role_config_groups(): group.update_config(role_conf[group.roleType]) mgmt.start().wait() # after starting this service needs time to spin up sleep(30) module.exit_json(changed=True, msg='Mgmt created and started') # deploy a given parcel on all hosts in the cluster # you can specify a substring of the version ending with latest, for example 5.3-latest instead of 5.3.5-1.cdh5.3.5.p0.4 elif action_a == 'deploy_parcel': name_a = module.params.get('name', None) version_a = module.params.get('version', None) if "latest" in version_a: available_versions = [x.version for x in cluster.get_all_parcels() if x.product == name_a] if "-latest" in version_a: version_substr = match('(.+?)-latest', version_a).group(1) # if version is just "latest", try to check everything else: version_substr = ".*" try: [version_parcel] = [x for x in available_versions if re.match(version_substr, x) != None] except ValueError: module.fail_json(msg='Specified version {0} doesnt appear in {1} or appears twice'.format(version_substr, available_versions)) else: version_parcel = version_a # we now go through various stages of getting the parcel # as there is no built-in way of waiting for an operation to complete # we use loops with sleep to get it done parcel = cluster.get_parcel(name_a, version_parcel) if parcel.stage == 'AVAILABLE_REMOTELY': parcel.start_download() while parcel.stage != 'DOWNLOADED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) sleep(10) if parcel.stage == 'DOWNLOADED': parcel.start_distribution() while parcel.stage != 'DISTRIBUTED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) # sleep while hosts report problems after the download for i in range(12): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break # since parcels are distributed automatically when a new host is added to a cluster # we can encounter the ,,ACTIVATING'' stage then if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING': if parcel.stage == 'DISTRIBUTED': parcel.activate() while parcel.stage != 'ACTIVATED': parcel = cluster.get_parcel(name_a, version_parcel) # this sleep has to be large because although the operation is very fast # it makes the management and cloudera hosts go bonkers, failing all of the health checks sleep(10) # sleep while hosts report problems after the distribution for i in range(60): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break 
module.exit_json(changed=True, msg='Parcel activated') if parcel.stage == 'ACTIVATED': module.exit_json(changed=False, msg='Parcel already activated') # if we get down here, something is not right module.fail_json(msg='Invalid parcel state') # deploy nodes for workers, according to SERVICE_WORKER_MAP # also give them sane names and init zookeeper and kafka ones # which need id's specified elif action_a == 'deploy_service_worker_nodes': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] role_name = SERVICE_WORKER_MAP[service_a]['name'] full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring'] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) nodes = [x for x in service.get_all_roles() if role_name in x.name] # if host already has the given group, we should skip it if host_map[host_a] in [x.hostRef.hostId for x in nodes]: module.exit_json(changed=False, msg='Host already is a {0}'.format(role_name)) # find out the highest id that currently exists else: node_names = [x.name for x in nodes] if len(node_names) == 0: # if no nodes, start numbering from 1 node_i = 1 else: # take the max number and add 1 to it node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1 if service_name == 'ZOOKEEPER': role = service.create_role(full_role_name.format(node_i), 'SERVER', host_a) # zookeeper needs a per-node ID in the configuration, so we set it now role.update_config({'serverId': node_i}) elif service_name == 'KAFKA': role = service.create_role(full_role_name.format(node_i), role_name, host_a) # kafka needs a per-node ID in the configuration, so we set it now role.update_config({'broker.id': node_i}) else: service.create_role(full_role_name.format(node_i), role_name, host_a) module.exit_json(changed=True, msg='Added host to {0} role'.format(role_name)) # deploy a service. just create it, don't do anything more # this is needed maily when we have to set service properties before role deployment elif action_a == 'deploy_service': name_a = module.params.get('name', None) if not name_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(name_a)) service_name = SERVICE_MAP[name_a] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) module.exit_json(changed=True, msg='{0} service created'.format(service_name)) else: module.exit_json(changed=False, msg='{0} service already exists'.format(service_name)) # deploy the base hdfs roles (the namenode and secondary) # this doesn't create the service, as at least one datanode should already be added! 
# the format also requires certain properties to be set before we run it elif action_a == 'deploy_hdfs_base': nn_host_a = module.params.get('nn_host', None) sn_host_a = module.params.get('sn_host', None) changed = False hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't create a secondary namenode when: #- there is one that already exists #- there is a second namenode, which means we have HA and don't need a secondary if not 'HDFS-SECONDARYNAMENODE' in hdfs_roles and not 'HDFS-NAMENODE-2' in hdfs_roles: hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE', sn_host_a) changed = True # create a namenode and format it's FS # formating the namenode requires at least one datanode and secondary namenode already in the cluster! if not 'HDFS-NAMENODE' in hdfs_roles: hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a) for command in hdfs.format_hdfs('HDFS-NAMENODE'): if command.wait().success == False: module.fail_json(msg='Failed formating HDFS namenode with error: {0}'.format(command.resultMessage)) changed = True module.exit_json(changed=changed, msg='Created HDFS service & NN roles') # enable HttpFS for HDFS # HUE require this for support HA in HDFS elif action_a == 'deploy_hdfs_httpfs': host_a = module.params.get('host', None) hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't install second instance of HttpFS if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0: module.exit_json(changed=False, msg='HDFS HttpFS service already exists') hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a]) module.exit_json(changed=True, msg='HDFS HttpFS service created') # enable HA for HDFS # this deletes the secondary namenode and creates a second namenode in it's place # also, this spawns 3 journal node and 2 failover controller roles elif action_a == 'deploy_hdfs_ha': sn_host_a = module.params.get('sn_host', None) jn_names_a = [module.params.get('jn1_host', None), module.params.get('jn2_host', None), module.params.get('jn3_host', None)] hdfs = cluster.get_service('HDFS') # if there's a second namenode, this means we already have HA enabled if not 'HDFS-NAMENODE-2' in [x.name for x in hdfs.get_all_roles()]: # this is bad and I should feel bad # jns is a list of dictionaries, each dict passes the required journalnode parameters jns = [{'jnHostId': host_map[jn_name], 'jnEditsDir': '/data0/hadoop/journal', 'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)} for i, jn_name in enumerate(jn_names_a)] # this call is so long because we set some predictable names for the sevices command = hdfs.enable_nn_ha('HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns, zk_service_name='ZOOKEEPER', active_fc_name='HDFS-FAILOVERCONTROLLER-1', standby_fc_name='HDFS-FAILOVERCONTROLLER-2', standby_name='HDFS-NAMENODE-2') children = command.wait().children for command_children in children: # The format command is expected to fail, since we already formated the namenode if command_children.name != 'Format' and command.success == False: module.fail_json(msg='Command {0} failed when enabling HDFS HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for HDFS service') else: module.exit_json(changed=False, msg='HDFS HA already enabled') # enable HA for YARN elif action_a == 'deploy_rm_ha': sn_host_a = module.params.get('sn_host', None) yarn = cluster.get_service('YARN') # if there are two roles matching to this name, this means HA for 
YARN is enabled if len([0 for x in yarn.get_all_roles() if match('^YARN-RESOURCEMANAGER.*$', x.name) != None]) == 1: command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER') children = command.wait().children for command_children in children: if command.success == False: module.fail_json(msg='Command {0} failed when enabling YARN HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for YARN service') else: module.exit_json(changed=False, msg='YARN HA already enabled') # deploy the base roles for a service, according to BASE_SERVICE_ROLE_MAP # after the deployments, run the commands specified in SERVICE_INIT_COMMANDS elif action_a == 'deploy_base_roles': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] changed = False if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) service_roles = [x.name for x in service.get_all_roles()] # create each role from the map for (role_name, cloudera_name) in BASE_SERVICE_ROLE_MAP[service_a].items(): # check if the role already exists; the script can't compare names directly because # after enabling HA on YARN, roles will have random strings in their names if len([0 for x in service_roles if match(role_name, x) != None]) == 0: service.create_role(role_name, cloudera_name, host_a) changed = True # init commands if role_name in SERVICE_INIT_COMMANDS.keys(): for command_to_run in SERVICE_INIT_COMMANDS[role_name]: # different handling of commands specified by name and # ones specified by an instance method if ismethod(command_to_run): command = command_to_run(service) else: command = service.service_command_by_name(command_to_run) if command.wait().success == False: module.fail_json(msg='Running {0} failed with {1}'.format(command_to_run, command.resultMessage)) if changed == True: module.exit_json(changed=True, msg='Created base roles for {0}'.format(service_name)) else: module.exit_json(changed=False, msg='{0} base roles already exist'.format(service_name)) # set config values for a given service/role elif action_a == 'set_config': entity_a = module.params.get('entity', None) service_a = module.params.get('service', None) role_a = module.params.get('role', None) name_a = module.params.get('name', None) value_a = module.params.get('value', None) if not service_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(service_a)) # since management is handled differently, it needs a different service if service_a == 'management': service = manager.get_service() elif service_a == 'cm': service = manager else: service = cluster.get_service(SERVICE_MAP[service_a]) # role and service configs are handled differently if entity_a == 'service': prev_config = service.get_config() curr_config = service.update_config({name_a: value_a}) if service_a == 'cm': prev_config = [prev_config] curr_config = [curr_config] module.exit_json(changed=(str(prev_config[0]) != str(curr_config[0])), msg='Config value for {0}: {1}'.format(name_a, curr_config[0][name_a])) elif entity_a == 'role': if not role_a in ROLE_MAP: module.fail_json(msg='Unknown role: {0}'.format(role_a)) role = service.get_role_config_group(ROLE_MAP[role_a]) prev_config = role.get_config() curr_config = role.update_config({name_a: value_a}) module.exit_json(changed=(str(prev_config) != str(curr_config)), msg='Config value for {0}: 
{1}'.format(name_a, curr_config[name_a])) else: module.fail_json(msg='Invalid entity, must be one of: service, role') # handle service state # currently this can only start/restart a service elif action_a == 'service': state_a = module.params.get('state', None) service_a = module.params.get('service', None) try: if service_a == 'cm': service = manager.get_service() else: service = cluster.get_service(SERVICE_MAP[service_a]) except ApiException: module.fail_json(msg='Service does not exist') # when starting a service, we also deploy the client config for it if state_a == 'started': if service.serviceState == 'STARTED': module.exit_json(changed=False, msg='Service already running') method = service.start verb = "start" elif state_a == 'restarted': method = service.restart verb = "restart" try: command = service.deploy_client_config() if command.wait().success == False: module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage)) # since there is no way to check if a service handles client config deployments # we try our best and pass the exception if it doesn't except (ApiException, AttributeError): pass method().wait() # we need to wait for cloudera checks to complete... # otherwise it will report as failing sleep(10) for i in range(24): sleep(10) service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': break service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': module.exit_json(changed=True, msg='Service {0} successful'.format(verb)) else: module.fail_json(msg='Service {0} failed'.format(verb)) # handle cluster # currently this can only restart elif action_a == 'cluster': state_a = module.params.get('state', None) if state_a == 'restarted': command = cluster.restart(redeploy_client_configuration=True) if command.wait().success == False: module.fail_json(msg='Cluster restart failed with {0}'.format(command.resultMessage)) else: module.exit_json(changed=True, msg='Cluster restart successful') # Snapshot policy # only create is supported elif action_a == 'create_snapshot_policy': name_a = module.params.get('name', None) value_a = module.params.get('value', None) service_a = module.params.get('service', None) service = cluster.get_service(SERVICE_MAP[service_a]) payload = loads(value_a) # check whether the policy already exists; an exception is expected when configuring it for the first time try: test = service.get_snapshot_policy(name_a) module.exit_json(changed=False, msg='Defined policy already exists') except ApiException: pass try: command = service.create_snapshot_policy(payload) module.exit_json(changed=True, msg='Snapshot policy was created.') except (ApiException, AttributeError): module.fail_json(msg='ERROR in creating snapshot policy.')
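The deploy_parcel action above polls cluster.get_parcel() in three nearly identical sleep loops (download, distribute, activate). A generic poller along the following lines (hypothetical helper, sketched against the cm_api parcel attributes the action already uses) could replace them:

from time import sleep

# Sketch only: poll a parcel until it reaches the requested stage, surfacing
# any errors reported in parcel.state.errors, exactly as the loops above do.
def wait_for_parcel_stage(cluster, product, version, target_stage, poll_secs=10):
    while True:
        parcel = cluster.get_parcel(product, version)
        if parcel.state.errors:
            raise RuntimeError(str(parcel.state.errors))
        if parcel.stage == target_stage:
            return parcel
        sleep(poll_secs)

# e.g. parcel.start_download(); wait_for_parcel_stage(cluster, name_a, version_parcel, 'DOWNLOADED')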
logger('info', "License type: " + str(LICENSE_TYPE)) logger('info', "CDH_VERSION: " + str(CDH_VERSION)) logger('info', "CDH_PARCEL_VERSION: " + str(CDH_PARCEL_VERSION)) logger('info', "CLUSTER_NM: " + str(CLUSTER_NM)) # Creating CM Instance try: resource = ApiResource(CM_HOST, CM_PORT, username=ADMIN_USER, password=ADMIN_PASS, version=CM_API_VERSION) logger('info', "Creating CM Instance: " + CM_HOST) except Exception as err: logger('warn', "Unable to connect to Cloudera Manager: " + CM_HOST) logger('error', err) logger('info', "CM Hosts: " + str(resource.get_all_hosts())) # Fetching hosts from the Ansible host file HOST_DICT = mutil.ansible_host_dict(logger, HOST_FILE) HOST_DATA_IP = (mutil.opts_hosts_parse(options.data_nodes) or HOST_DICT['Data-Nodes']) print HOST_DATA_IP HOST_DATA = map(lambda a: mutil.return_host(a, cluster_type), HOST_DATA_IP) mgmt_nodes = [] if options.mgmt_nodes: mgmt_nodes = list(mutil.opts_hosts_parse(options.mgmt_nodes)) # fall back to the Ansible host file when --mgmt_nodes was not given; indexing an empty list would raise IndexError before the `or` fallback could apply MGMT_1 = mutil.return_host(mgmt_nodes[0] if len(mgmt_nodes) > 0 else HOST_DICT['Mgmt-01'][0], cluster_type) MGMT_2 = mutil.return_host(mgmt_nodes[1] if len(mgmt_nodes) > 1 else HOST_DICT['Mgmt-02'][0], cluster_type) MGMT_3 = mutil.return_host(mgmt_nodes[2] if len(mgmt_nodes) > 2 else HOST_DICT['Mgmt-03'][0], cluster_type)
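Note that the snippet above only logs a warning when the ApiResource constructor raises, yet the very next statement dereferences `resource`. A retry loop along these lines (hypothetical helper, assuming the same logger signature and CM_* constants defined elsewhere in the script) would be more robust:

import time
from cm_api.api_client import ApiResource

# Sketch only: keep trying the CM connection instead of warning and carrying on.
def connect_cm(host, port, user, password, version, attempts=10, delay=15):
    for i in range(attempts):
        try:
            resource = ApiResource(host, port, username=user, password=password, version=version)
            resource.get_all_hosts()  # cheap call that proves the server is answering
            return resource
        except Exception as err:
            logger('warn', 'CM not reachable yet ({0}/{1}): {2}'.format(i + 1, attempts, err))
            time.sleep(delay)
    raise RuntimeError('Cloudera Manager never became reachable: ' + host)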
exit(1) api = ApiResource(args.cm_host, args.port, args.user, args.password) # Get CM object to decommission the host cm = api.get_cloudera_manager() if args.cluster is None: clusters = [] for c in api.get_all_clusters(): clusters.append(c) if len(clusters) > 1: cluster = pick_cluster(clusters) else: cluster = clusters[0] else: cluster = api.get_cluster(args.cluster) host_list = api.get_all_hosts() # Find host object given the hostname as input arg for h in host_list: if h.hostname == args.impala_host: host = h impala = get_impala_service(cluster) for r in impala.get_all_roles(): if r.type == "IMPALAD": if r.hostRef.hostId == host.hostId: impala_host = r # Function to check for running impala queries, then shutdown role, then proceed w/ host decom check_for_inflight_queries(host.hostname, impala_host.name, impala, args.wait_time)
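One caveat in the decommission snippet above: if the hostname or its IMPALAD role is never matched, `host` and `impala_host` stay undefined and the later references raise NameError. A guarded lookup, sketched here with the same variable names, avoids that:

# Sketch only, reusing the variables and helpers from the script above.
host = next((h for h in api.get_all_hosts() if h.hostname == args.impala_host), None)
if host is None:
    print('Host {0} is not registered with Cloudera Manager'.format(args.impala_host))
    exit(1)

impala = get_impala_service(cluster)
impala_host = next((r for r in impala.get_all_roles()
                    if r.type == 'IMPALAD' and r.hostRef.hostId == host.hostId), None)
if impala_host is None:
    print('No IMPALAD role found on {0}'.format(args.impala_host))
    exit(1)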
hosts = options.hosts.split(',') print 'NodeManagers to start/stop:', hosts #print(options.url.replace('http://','').replace('https://','')) ssl_cert_path = options.ssl_cert_path context = ssl.create_default_context(cafile=ssl_cert_path) api = ApiResource(options.url.replace('http://', '').replace('https://', ''), options.port, options.user, options.password, use_tls=True, ssl_context=context) host_ids_action = [ h.hostId for h in api.get_all_hosts() if h.hostname in hosts ] cm_client.configuration.username = options.user cm_client.configuration.password = options.password cm_client.configuration.verify_ssl = True cm_client.configuration.ssl_ca_cert = ssl_cert_path # Create an instance of the API class api_host = options.url port = options.port api_version = 'v19' api_url = api_host + ':' + port + '/api/' + api_version print(api_url) api_client = cm_client.ApiClient(api_url) cluster_api_instance = cm_client.ClustersResourceApi(api_client)
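The script above wires up both the legacy cm_api client and the Swagger-based cm_client over TLS. Before issuing any role commands it can be useful to confirm the cm_client connection, for example by listing clusters; attribute names follow the cm_client models, so treat this as a hedged sketch:

# Sketch only: verify the Swagger client configured above can reach CM.
clusters = cluster_api_instance.read_clusters(view='SUMMARY')
for c in clusters.items:
    print('Cluster {0} runs CDH {1}'.format(c.name, c.full_version))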
def create_cluster(config_dict): config.read([ './conf/hadrian.ini', './conf/cluster_specs.ini', './conf/cloudera-manager/cm.ini' ]) cm_cluster_name = config_grabber("Globals")['cm.cluster.name'] cm_username = config_grabber("Globals")['cm.username'] cm_password = config_grabber("Globals")['cm.password'] cm_port = config_grabber("Globals")['cm.port'] version = config_grabber('Globals')['cdh.cluster.version'] cm_server = config_grabber(cm_cluster_name + '-en')['cm.server'] # Grab all configuration files in the directory named after the CM cluster. for i in os.listdir('./conf/' + cm_cluster_name): config.read('./conf/' + cm_cluster_name + '/' + i) all_nodes = list() while (get_cm_status(cm_server + ':' + cm_port) != 200): print 'Waiting for CM Server to start... ' time.sleep(15) api = ApiResource(cm_server, cm_port, cm_username, cm_password) # create the cluster cluster = api.create_cluster(cm_cluster_name, version.upper()) # configure CM print 'Applying any configuration changes to Cloudera Manager' cmanager = api.get_cloudera_manager() cmanager.update_config(config_grabber('cloudera-manager-updates')) planned_nodes = config_grabber(cm_cluster_name + '-en')['full.list'].split(',') for k, v in config_grabber(cm_cluster_name + '-dn').iteritems(): for j in v.split(','): planned_nodes.append(j) # TODO make this smarter: show which agents haven't checked in and add the option to continue without them. if len(api.get_all_hosts()) != len(planned_nodes): print 'Waiting for all agents to check into the CM Server before continuing.' while len(planned_nodes) > len(api.get_all_hosts()): print 'Waiting for the final set of CM Agent nodes to check in.' time.sleep(5) print 'Updating Rack configuration for data nodes.' all_hosts = list() for host in api.get_all_hosts(): all_hosts.append(host.hostId) for k, v in config_grabber(cm_cluster_name + '-dn').iteritems(): if host.hostname in v: print 'Setting host: ' + host.hostname + ' to rack /default/' + k host.set_rack_id('/default/' + k) print 'Adding all hosts to cluster.' cluster.add_hosts(all_hosts) # download CDH parcels # TODO add some logic here to make the parcel list something that's read from hadrian.ini. # This will allow support for other CDH packages, Search, etc. if config_grabber('Globals')['cdh.distribution.method'] == 'parcels': distribute_parcel(cluster, 'CDH', config_grabber("Globals")['cdh.parcel.version']) if config_dict.get('hdfs_ha') == True: create_zookeeper_service(config_dict, cluster) create_hdfs_service(config_dict, cluster) cmd = cluster.deploy_client_config() if not cmd.wait(CMD_TIMEOUT).success: print 'Failed to deploy client configurations' else: print 'Client configuration deployment complete.' create_mapred_service(config_dict, cluster, cm_server) if config_dict.get('hbase') == True: if config_dict.get('hdfs_ha') == False: create_zookeeper_service(config_dict, cluster) create_hbase_service(config_dict, cluster) if config_dict.get('hive') == True: create_hive_service(config_dict, cluster) print 'Starting final client configuration deployment for all services.' cmd = cluster.deploy_client_config() if not cmd.wait(CMD_TIMEOUT).success: print 'Failed to deploy client configuration.' else: print 'Client configuration deployment complete. The cluster is all yours. Happy Hadooping.'
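create_cluster() above leans on helpers defined elsewhere in the project, notably get_cm_status() and distribute_parcel(). As an illustration of what the status probe could look like (hypothetical reconstruction, assuming the `requests` library is available):

import requests

# Sketch only: return the HTTP status of the CM web server, or 0 if unreachable,
# to satisfy the `get_cm_status(cm_server + ':' + cm_port) != 200` check above.
def get_cm_status(endpoint):
    try:
        return requests.get('http://' + endpoint, timeout=10).status_code
    except requests.exceptions.RequestException:
        return 0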