def main(cm_host, user, password):
    api = ApiResource(cm_host, username=user, password=password)
    cm = api.get_cloudera_manager()
    cm.update_all_hosts_config(
        {"java_home": "/usr/java/jdk1.8.0_121-cloudera"})
    print("restarting CM service - this will take a minute or so")
    cm.get_service().restart().wait()
    print("restarting cluster - this will take 2-5 minutes")
    api.get_all_clusters()[0].restart(restart_only_stale_services=True,
                                      redeploy_client_configuration=True).wait()
def main(cm_host, user, password):
    api = ApiResource(cm_host, username=user, password=password)
    cm = api.get_cloudera_manager()
    config = cm.get_all_hosts_config(view='full')
    if config['java_home'].value == "/usr/java/jdk1.8.0_121-cloudera":
        print "Java home already set - skipping"
    else:
        print "Updating jdk location"
        cm.update_all_hosts_config(
            {"java_home": "/usr/java/jdk1.8.0_121-cloudera"})
        print("restarting CM service - this will take a minute or so")
        cm.get_service().restart().wait()
        print("restarting cluster - this will take 2-5 minutes")
        api.get_all_clusters()[0].restart(restart_only_stale_services=True,
                                          redeploy_client_configuration=True).wait()
def do_call(host, port, user, password, cluster_name, service_role_name, random_index):
    api = ApiResource(host, port, user, password, False, MAN_API_VERSION)
    for cluster in api.get_all_clusters():
        if cluster_name is None:
            break
        elif cluster_name == cluster.name:
            break
    if cluster_name is not None and cluster_name != cluster.name:
        print >> sys.stderr, "Could not find cluster: " + cluster_name
        return -2
    do_print_header()
    for service in cluster.get_all_services():
        do_print_line_item(api, service, service_role_name, random_index,
                           'HDFS', 'NAMENODE', 'namenode_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'KUDU', 'KUDU_MASTER', 'webserver_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'HUE', 'HUE_SERVER', 'hue_http_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'HIVE', 'HIVESERVER2', 'hs2_thrift_address_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'IMPALA', 'IMPALAD', 'beeswax_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'FLUME', 'AGENT', 'agent_http_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'KAFKA', 'KAFKA_BROKER', 'port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'ZOOKEEPER', 'SERVER', 'clientPort', [], [])
    do_print_footer()
def adjust_yarn_memory_limits(region, stack_name, restart=True):
    ec2_conn = create_ec2_connection(region)
    manager_instance = get_manager_instance(ec2_conn, stack_name)
    with cm_tunnel_ctx(manager_instance) as local_port:
        cm_api = ApiResource('localhost', username='******', password='******',
                             server_port=local_port, version=9)
        cluster = list(cm_api.get_all_clusters())[0]
        host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
        yarn = filter(lambda x: x.type == 'YARN',
                      list(cluster.get_all_services()))[0]
        rm_cg = filter(lambda x: x.roleType == 'RESOURCEMANAGER',
                       list(yarn.get_all_role_config_groups()))[0]
        nm_cg = filter(lambda x: x.roleType == 'NODEMANAGER',
                       list(yarn.get_all_role_config_groups()))[0]
        rm_cg.update_config({
            'yarn_scheduler_maximum_allocation_mb': (
                int(host.totalPhysMemBytes / 1024. / 1024.)),
            'yarn_scheduler_maximum_allocation_vcores': host.numCores})
        nm_cg.update_config({
            'yarn_nodemanager_resource_memory_mb': (
                int(host.totalPhysMemBytes / 1024. / 1024.)),
            'yarn_nodemanager_resource_cpu_vcores': host.numCores})
        cluster.deploy_client_config().wait()
        if restart:
            cluster.restart().wait()
def getActiveCMConfig(totalconfig):
    cmConfig = {}
    for cm in totalconfig['cmfqdn']:
        api = ApiResource(cm, totalconfig[cm]['port'], totalconfig[cm]['user'],
                          totalconfig[cm]['passwd'], totalconfig[cm]['tls'],
                          totalconfig[cm]['apiv'])
        clusters = api.get_all_clusters()
        cmConfig[cm] = {}
        for cluster in clusters:
            cmConfig[cm][cluster.displayName] = {}
            services = cluster.get_all_services()
            for service in services:
                cmConfig[cm][cluster.displayName][service.name] = {}
                cmConfig[cm][cluster.displayName][service.name]['Service'] = {}
                for name, config in service.get_config(view='full')[0].items():
                    cmConfig[cm][cluster.displayName][service.name]['Service'][name] = {
                        'value': config.value,
                        'default': config.default
                    }
                for roleGroup in service.get_all_role_config_groups():
                    cmConfig[cm][cluster.displayName][service.name][roleGroup.roleType] = {}
                    for name, config in roleGroup.get_config(view='full').items():
                        cmConfig[cm][cluster.displayName][service.name][roleGroup.roleType][name] = {
                            'value': config.value,
                            'default': config.default
                        }
                    print(roleGroup.roleType)
    #print(json.dumps(cmConfig, indent=4))
    return cmConfig
def get_cluster_specs():
    cm_api = ApiResource(os.environ['MANAGER_HOST'], username='******',
                         password='******', server_port=7180, version=9)
    host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
    cluster = list(cm_api.get_all_clusters())[0]
    yarn = filter(lambda x: x.type == 'YARN',
                  list(cluster.get_all_services()))[0]
    return {'num_worker_nodes': len(yarn.get_roles_by_type('NODEMANAGER')),
            'num_cores': host.numCores,
            'node_memory': host.totalPhysMemBytes}
def main():
    cmhost = os.environ['DEPLOYMENT_HOST_PORT'].split(":")[0]
    api = ApiResource(cmhost, username='******', password='******')
    all_clusters = api.get_all_clusters()
    for cluster in all_clusters:
        if (cluster.name == os.environ['CLUSTER_NAME']):
            break
    template = cluster.create_host_template("cdsw-gateway")
def do_call(host, port, version, user, password, cluster_name, parcel_name,
            parcel_version, parcel_repo, init_pre_dir, init_post_dir):
    api = ApiResource(host, port, user, password, False, version)
    if not parcel_repo.endswith('/'):
        parcel_repo += '/'
    if re.match(REGEX_VERSION, parcel_version) is None \
            or re.match(REGEX_VERSION, parcel_version).group() != parcel_version:
        raise Exception('Parcel [' + parcel_name + '] is qualified by invalid version ['
                        + parcel_version + '] expected to match regular expression ['
                        + REGEX_VERSION + ']')
    if not parcel_repo.endswith(parcel_version + '/'):
        raise Exception('Parcel [' + parcel_name + '] is qualified by invalid version ['
                        + parcel_version + '] when compared with repository ['
                        + parcel_repo + ']')
    cm_config = api.get_cloudera_manager().get_config(view='full')
    repo_config = cm_config['REMOTE_PARCEL_REPO_URLS']
    repo_list = repo_config.value or repo_config.default
    if parcel_repo not in repo_list:
        repo_list += ',' + parcel_repo
        api.get_cloudera_manager().update_config({'REMOTE_PARCEL_REPO_URLS': repo_list})
        time.sleep(POLL_SEC)  # The parcel synchronize end-point is not exposed via the API, so sleep instead
    cluster_names = []
    if cluster_name is None:
        for cluster in api.get_all_clusters():
            cluster_names.append(cluster.name)
    else:
        cluster_names.append(cluster_name)
    for cluster_name_itr in cluster_names:
        print 'Cluster [DEPLOYMENT] starting ... '
        cluster = api.get_cluster(cluster_name_itr)
        parcel = cluster.get_parcel(parcel_name, parcel_version)
        print 'Parcel [DEPLOYMENT] starting ... '
        do_parcel_op(cluster, parcel_name, parcel_version, 'DOWNLOAD',
                     'AVAILABLE_REMOTELY', 'DOWNLOADED', 'start_download')
        do_parcel_op(cluster, parcel_name, parcel_version, 'DISTRIBUTE',
                     'DOWNLOADED', 'DISTRIBUTED', 'start_distribution')
        do_parcel_op(cluster, parcel_name, parcel_version, 'ACTIVATE',
                     'DISTRIBUTED', 'ACTIVATED', 'activate')
        parcel = cluster.get_parcel(parcel_name, parcel_version)
        if parcel.stage != 'ACTIVATED':
            raise Exception('Parcel is currently mid-stage [' + parcel.stage
                            + '], please wait for this to complete')
        print 'Parcel [DEPLOYMENT] finished'
        if init_pre_dir is not None and os.path.isdir(init_pre_dir):
            print 'Cluster [PRE_INIT] starting ... '
            for script in glob.glob(init_pre_dir + '/*.sh'):
                subprocess.call([script])
            print 'Cluster [PRE_INIT] finished'
        print 'Cluster [CONFIG_DEPLOYMENT] starting ... '
        cmd = cluster.deploy_client_config()
        if not cmd.wait(TIMEOUT_SEC).success:
            raise Exception('Failed to deploy client configs')
        print 'Cluster [CONFIG_DEPLOYMENT] finished'
        print 'Cluster [STOP] starting ... '
        cluster.stop().wait()
        print 'Cluster [STOP] finished'
        print 'Cluster [START] starting ... '
        cluster.start().wait()
        print 'Cluster [START] finished'
        if init_post_dir is not None and os.path.isdir(init_post_dir):
            print 'Cluster [POST_INIT] starting ... '
            for script in glob.glob(init_post_dir + '/*.sh'):
                subprocess.call([script])
            print 'Cluster [POST_INIT] finished'
        print 'Cluster [DEPLOYMENT] finished'
def getClusterInformation(self):
    api = ApiResource(self.cm_host, username=self.user, password=self.passwd)
    logger.info('Received; user -> %s, password -> %s, host -> %s',
                self.user, self.passwd, self.cm_host)
    for c in api.get_all_clusters():
        clusterInf = "Cluster name %s and version %s" % (c.name, c.version)
        #print "Cluster name %s and version %s" %(c.name, c.version)
        logger.info("Cluster name %s and version %s", c.name, c.version)
        if c.version == "CDH5":
            cdh5 = c
    return cdh5, clusterInf
def services(self):
    api = ApiResource(self.host, username=self.username, password=self.password)
    version = None
    service_list = []
    for cluster in api.get_all_clusters():
        if cluster.version == "CDH5":
            version = cluster
    for service in version.get_all_services():
        service_list.append(service.name)
    return service_list
def get_cluster(cm_host, user, pwd, cluster_name):
    global api
    api = ApiResource(cm_host, username=user, password=pwd, version=12)
    for c in api.get_all_clusters():
        if cluster_name in c.name:
            # print("Cluster, Version : " + c.name + ", " + c.version)
            return True, c
    print("[ERR] : Cluster \"" + cluster_name + "\" not found at \"" + cm_host + "\"")
    return False, ""
def find_impala_in_cm(cm_host, cm_user, cm_password, cm_cluster_name):
    """Finds the Impala service in CM and returns an Impala instance."""
    cm = ApiResource(cm_host, username=cm_user, password=cm_password)
    cm_impalas = [service for cluster in cm.get_all_clusters()
                  if cm_cluster_name is None or cm_cluster_name == cluster.name
                  for service in cluster.get_all_services()
                  if service.type == "IMPALA"]
    if len(cm_impalas) > 1:
        raise Exception("Found %s Impala services in CM;" % len(cm_impalas) +
                        " use --cm-cluster-name option to specify which one to use.")
    if len(cm_impalas) == 0:
        raise Exception("No Impala services found in CM")
    return Impala(cm_impalas[0])
def main():
    s, a = arg_handle()
    for i in range(0, 15):
        while True:
            try:
                cm_host = "127.0.0.1"
                api = ApiResource(cm_host, username="******", password="******")
                cdh = api.get_all_clusters()[0]
            except:
                print "Failed to connect to Cloudera Manager."
                print "Attempting to connect to Cloudera Manager..."
                time.sleep(15)
                continue
            break
    srv = cdh.get_service(s)
    actions[a](srv, s)
def update_cm(cm_host, cm_port, username, password):
    """Update config using the CM API (note: will restart service)"""
    elts = generate_xml_elements()
    cm_api = ApiResource(cm_host, username=username, password=password,
                         server_port=cm_port, version=9)
    cluster = list(cm_api.get_all_clusters())[0]
    hdfs = filter(lambda x: x.type == 'HDFS',
                  list(cluster.get_all_services()))[0]
    print("Updating HDFS core-site.xml safety valve...")
    _ = hdfs.update_config({
        'core_site_safety_valve': '\n'.join(tostring(e) for e in elts)})
    print("Deploying client config across the cluster...")
    cluster.deploy_client_config().wait()
    print("Restarting necessary services...")
    cluster.restart().wait()
    print("Done!")
def main(cm_host, user, password):
    api = ApiResource(cm_host, username=user, password=password)
    cluster = api.get_all_clusters()[0]
    try:
        cluster.get_service(service_name)
        print "Service %s already configured. Skipping" % service_name
    except ApiException:
        print "creating new service %s" % service_name
        add_kudu_service(cluster, service_name)
        create_kudu_roles(cluster, api.get_all_hosts())
        update_kudu_role_group_configs(cluster)
        start_service(cluster, service_name)
        update_impala_service(cluster, service_name)
    print "Waiting for cluster to restart stale services"
    cluster.restart(restart_only_stale_services=True,
                    redeploy_client_configuration=True).wait()
class cm_utils(object): def __init__(self,service,role,host,list): self.service = service.lower() self.role = role.lower() self.host = host.lower() self.list = list.lower() cm_host = '10.7.177.234' self.api = ApiResource(cm_host, username="******", password="******") # "ALL" if service == "None" else service # "ALL" if role == "None" else role # "ALL" if host == "None" else host def main(self): # s_filter = None for c in self.api.get_all_clusters(): print c for s in c.get_all_services(): print "SERVICE : " + s.displayName + "===============" # if (s.displayName.lower() == self.service) or (self.service == "all"): if ( self.service in s.displayName.lower() ) or (self.service == "all"): s_filter = s for r in s_filter.get_all_roles(): # print "ROLE : " + r.type + "================" if (self.role in r.type.lower()) or (self.role == "all"): h = r.hostRef.hostId hostname,ipAddress,healthSummary = self._get_host_info(h) if (self.host in hostname) or (self.host in ipAddress) or (self.host in h) or (self.host == "all"): if self.list == "yes": print ipAddress else: print "[" + r.type + "]" + hostname + " " + ipAddress + " " + healthSummary def _get_host_info(self,hostid): host = self.api.get_host(hostid) # self.hostname = host.hostname # self.host_ip = host.ipAddress # self.host_status = host.healthSummary return host.hostname,host.ipAddress,host.healthSummary
def main():
    """
    This is a script to export a current Cloudera Manager cluster configuration into
    an Hadrian supported format. You can then use these configuration files as the
    basis for your new cluster configs.
    """
    parser = argparse.ArgumentParser(description='Export Cloudera Manager configs in an Hadrian friendly format.')
    parser.add_argument('-H', '--host', '--hostname', action='store', dest='hostname', required=True,
                        help='CM Server Name')
    parser.add_argument('-p', '--port', action='store', dest='port', type=int, default=7180,
                        help='CM Port')
    parser.add_argument('-u', '--user', '--username', action='store', dest='username', required=True,
                        help='CM username')
    args = parser.parse_args()
    password = getpass.getpass('Please enter your Cloudera Manager password: ')
    api = ApiResource(args.hostname, args.port, args.username, password, version=4)
    for cluster in api.get_all_clusters():
        conf_dir = './confs/' + cluster.name
        if not os.path.exists(conf_dir):
            os.makedirs(conf_dir)
            for service in cluster.get_all_services():
                with open(conf_dir + '/' + service.name + '.ini', 'w') as f:
                    print 'Dumping Service config for ' + service.name
                    rcg = list()
                    for i in service.get_all_role_config_groups():
                        rcg.append(i.name)
                    f.write('[' + service.type + ']\n')
                    f.write('config_groups=' + ','.join(rcg))
                    f.write('\n\n')
                    f.write('[' + service.name + '-svc-config]\n')
                    for item in service.get_config():
                        for k, v in item.iteritems():
                            f.write(k + '=' + str(v) + '\n')
                    for i in service.get_all_role_config_groups():
                        f.write('\n')
                        f.write('[' + i.name + ']\n')
                        for k, v in i.get_config('full').iteritems():
                            if v.value is not None:
                                f.write(k + '=' + str(v.value) + '\n')
        else:
            print 'Cluster config dir already exists. Please rename or remove existing config dir: ' + conf_dir
def test():
    cm_host = 'xxx'
    api = ApiResource(cm_host, username="******", password="******",
                      use_tls=False, version="12")
    # Get a list of all clusters
    cdh4 = None
    for c in api.get_all_clusters():
        print c.name
        if c.version == "CDH5":
            cdh4 = c
    ## -- Output --
    # Cluster 1 - CDH4
    # Cluster 2 - CDH3
def migrate_services(cm_host, cm_username, cm_password, old_node, new_node):
    uid = str(uuid.uuid4().hex)
    api = ApiResource(cm_host, username=cm_username, password=cm_password)
    cluster = api.get_all_clusters()[0]
    migrate_hdfs(cluster, new_node, old_node, uid, api)
    migrate_hue(cluster, new_node, old_node, uid)
    migrate_impala(cluster, new_node, old_node, uid)
    migrate_spark(cluster, new_node, old_node, uid)
    migrate_spark2(cluster, new_node, old_node, uid)
    migrate_hive(cluster, new_node, old_node, uid)
    migrate_oozie(cluster, new_node, old_node, uid)
    migrate_zookeeper(cluster, new_node, old_node, uid)
    migrate_sentry(cluster, new_node, old_node, uid)
    migrate_solr(cluster, new_node, old_node, uid)
    migrate_yarn(cluster, new_node, old_node, uid)
    migrate_arcadia(cluster, new_node, old_node, uid)
    print('Restarting cluster, please wait.....')
    time.sleep(30)
    cluster.restart().wait()
    print('Migration of Roles are completed')
def run_cdh(action, cdh_st, logger):
    component = "cdh"
    cfg = ConfigParser.ConfigParser()
    cfg.read("/home/ec2-user/aws/cdh.cfg")
    cmhost = cfg.get("CM", "host")
    user = cfg.get("CM", "username")
    passw = cfg.get("CM", "password")
    cafile = cfg.get("CM", "cafile")
    context = ssl.create_default_context(cafile=cafile)
    api = ApiResource(cmhost, username=user, password=passw,
                      ssl_context=context, use_tls=True)
    allc = api.get_all_clusters()
    c = allc[0]
    run_process(c, action, cdh_st, component, logger)
def reset_cm(cm_host, cm_port, username, password):
    """Elim S3 config from CM API safety valve (service restart necessary)"""
    s3_props = set(get_s3_properties())
    cm_api = ApiResource(cm_host, username=username, password=password,
                         server_port=cm_port, version=9)
    cluster = list(cm_api.get_all_clusters())[0]
    hdfs = filter(lambda x: x.type == 'HDFS',
                  list(cluster.get_all_services()))[0]
    print("Getting current safety valve config")
    current_config = hdfs.get_config('full')[0]['core_site_safety_valve'].value
    # need the "<foo>...</foo>" to make it valid XML (bc it requires root elt)
    elts = list(fromstring('<foo>' + current_config + '</foo>'))
    new_elts = filter(lambda x: x.find('name').text not in s3_props, elts)
    print("Updating safety valve and deleting S3 config")
    _ = hdfs.update_config({
        'core_site_safety_valve': '\n'.join(tostring(e) for e in new_elts)})
    print("Deploying client config across the cluster...")
    cluster.deploy_client_config().wait()
    print("Restarting necessary services...")
    cluster.restart().wait()
    print("Done!")
def main():
    API = ApiResource(CM_HOST, version=16, username=ADMIN_USER, password=ADMIN_PASS)
    for c in API.get_all_clusters():
        if c.version == "CDH5":
            cdh5 = c
    for s in cdh5.get_all_services():
        restart_role(s, API)
    if (unhealthy_roles == []):
        print("ALL ROLES: OK")
    else:
        print("Following is the list of all unhealthy Roles:\n ")
        for role in unhealthy_roles:
            print("\n\t\t" + role)
    for s in cdh5.get_all_services():
        restart_service(s, API)
def get_cluster_info(manager_host, server_port=7180, username='******',
                     password='******'):
    cm_api = ApiResource(manager_host, username=username, password=password,
                         server_port=server_port, version=9)
    host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
    cluster = list(cm_api.get_all_clusters())[0]
    yarn = filter(lambda x: x.type == 'YARN',
                  list(cluster.get_all_services()))[0]
    hive = filter(lambda x: x.type == 'HIVE',
                  list(cluster.get_all_services()))[0]
    impala = filter(lambda x: x.type == 'IMPALA',
                    list(cluster.get_all_services()))[0]
    hive_hs2 = hive.get_roles_by_type('HIVESERVER2')[0]
    hive_host = cm_api.get_host(hive_hs2.hostRef.hostId).hostname
    hive_port = int(
        hive_hs2.get_config('full')['hs2_thrift_address_port'].default)
    impala_hs2 = impala.get_roles_by_type('IMPALAD')[0]
    impala_host = cm_api.get_host(impala_hs2.hostRef.hostId).hostname
    impala_port = int(impala_hs2.get_config('full')['hs2_port'].default)
    return {'num_worker_nodes': len(yarn.get_roles_by_type('NODEMANAGER')),
            'node_cores': host.numCores,
            'node_memory': host.totalPhysMemBytes,
            'hive_host': hive_host,
            'hive_port': hive_port,
            'impala_host': impala_host,
            'impala_port': impala_port}
def adjust_yarn_memory_limits(region, stack_name):
    ec2_conn = create_ec2_connection(region)
    manager_instance = get_manager_instance(ec2_conn, stack_name)
    cm_api = ApiResource("localhost", username="******", password="******",
                         server_port=64999, version=9)
    with http_tunnel_ctx(manager_instance, 7180, 64999):
        cluster = list(cm_api.get_all_clusters())[0]
        host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
        yarn = filter(lambda x: x.type == "YARN",
                      list(cluster.get_all_services()))[0]
        rm_cg = filter(lambda x: x.roleType == "RESOURCEMANAGER",
                       list(yarn.get_all_role_config_groups()))[0]
        nm_cg = filter(lambda x: x.roleType == "NODEMANAGER",
                       list(yarn.get_all_role_config_groups()))[0]
        rm_cg.update_config(
            {
                "yarn_scheduler_maximum_allocation_mb": (
                    int(host.totalPhysMemBytes / 1024.0 / 1024.0)),
                "yarn_scheduler_maximum_allocation_vcores": host.numCores,
            }
        )
        nm_cg.update_config(
            {
                "yarn_nodemanager_resource_memory_mb": (
                    int(host.totalPhysMemBytes / 1024.0 / 1024.0)),
                "yarn_nodemanager_resource_cpu_vcores": host.numCores,
            }
        )
        cluster.deploy_client_config().wait()
        cluster.restart().wait()
def main(): configfile='' if len(sys.argv) > 3 or len(sys.argv) < 3: print("Usage: %s -i configfile " % sys.argv[0]) sys.exit(2) try: myopts, args = getopt.getopt(sys.argv[1:],"i:h") except getopt.GetoptError as e: print (str(e)) print("Usage: %s -i configfile " % sys.argv[0]) sys.exit(2) for o, a in myopts: if o == '-i': configfile=a elif o == '-h': print("Usage: %s -i configfile " % sys.argv[0]) if os.path.isfile(configfile): print "processing configuration file...." pass else: print "file does not exist..." sys.exit(2) config = ConfigObj(configfile) cluster_name = config['cluster']['name'] cdh_manager = config['cluster']['cdh_manager'] cm_hostname = config['cluster']['cm_hostname'] hostnames = config['cluster']['server_hostnames'] services = config['cluster']['services'] server_rack = config['cluster']['server_rack'] server_login = config['cluster']['server_login'] server_passwd = config['cluster']['server_passwd'] server_key = config['cluster']['server_key'] server_passphrase = config['cluster']['server_passphrase'] cloudera_manager_repo = config['cluster']['cloudera_manager_repo'] cm_host = cdh_manager api = ApiResource(cm_host, username="******", password="******") #print config['hive']['config']['hive_metastore_database_name'] for c in api.get_all_clusters(): if c.name == cluster_name: #cluster = c print "Cluster %s already exists " % (cluster_name) print "Please manually delete the cluster %s , all hosts and associated services." % (cluster_name) sys.exit(0) else: print "Starting the automation process..." pass cdhproc(cluster_name,api,hostnames,server_rack,server_login,server_passwd,server_key,server_passphrase,cloudera_manager_repo) createMGMT(api,cm_hostname,server_login,server_passwd,server_passphrase,server_key) deployHDFSMAP(cluster_name,api,configfile) if "yarn" in services: createYarn(cluster_name,api,configfile) if "zookeeper" in services: createZookeeper(cluster_name,api,configfile) if "hive" in services: createHive(cluster_name,api,configfile) if "hbase" in services: createHbase(cluster_name,api,configfile) if "spark" in services: createSpark(cluster_name,api,configfile) if "impala" in services: createImpala(cluster_name,api,configfile) cluster = api.get_cluster(cluster_name) print "Stopping cluster..." cmd = cluster.stop().wait() print "Active: %s. Success: %s" % (cmd.active, cmd.success) print "Starting cluster..." cmd =cluster.start().wait() print "Active: %s. Success: %s" % (cmd.active, cmd.success) if "solr" in services: createSolr(cluster_name,api,configfile) if "flume" in services: createFlume(cluster_name,api,configfile) if "oozie" in services: createOozie(cluster_name,api,configfile) if "sqoop" in services: createSqoop(cluster_name,api,configfile) if "hue" in services: createHue(cluster_name,api,configfile) #print "Stopping cluster..." #cmd = cluster.stop().wait() #print "Active: %s. Success: %s" % (cmd.active, cmd.success) #print "Starting cluster..." #cmd =cluster.start().wait #print "Active: %s. Success: %s" % (cmd.active, cmd.success) print "Cluster deployed successfully...." print "Login to: http://"+cdh_manager+":7180"
def runner(self, args, display=True): values = [] health_values = [] plugin_args = args.split() \ if args is not None and (len(args.strip()) > 0) \ else "" options = self.read_args(plugin_args) if options.hadoopdistro == 'CDH': api = ApiResource(server_host=options.cmhost, \ server_port=options.cmport, \ username=options.cmuser, \ password=options.cmpassword, \ version=11) cluster = api.get_cluster(api.get_all_clusters()[0].name) cdh = CDHData(api, cluster) else: cdh = HDPData(options.cmhost, options.cmuser, options.cmpassword) hbase = None def run_test_sequence(): # pylint: disable=too-many-return-statements hbase = happybase.Connection(host=cdh.get_hbase_endpoint()) if abort_test_sequence is True: return reason = [] try: start = TIMESTAMP_MILLIS() try: hbase.create_table('blackbox_test_table', {'cf': dict()}) logging.debug("test table created") except AlreadyExists: logging.debug("test table exists") table = hbase.table('blackbox_test_table') end = TIMESTAMP_MILLIS() create_table_ok = True create_table_ms = end - start values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'), "hadoop.HBASE.create_table_time_ms", [], create_table_ms)) except: LOGGER.error(traceback.format_exc()) create_table_ok = False reason = ['Create HBase table operation failed'] health_values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'), "hadoop.HBASE.create_table_succeeded", reason, create_table_ok)) #write some data to it if abort_test_sequence is True: return reason = [] try: start = TIMESTAMP_MILLIS() table.put('row_key', {'cf:column': 'value'}) end = TIMESTAMP_MILLIS() write_hbase_ok = True write_hbase_ms = end - start values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'), "hadoop.HBASE.write_time_ms", [], write_hbase_ms)) except: LOGGER.error(traceback.format_exc()) write_hbase_ok = False reason = ['Failed to insert row in HBase table'] health_values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'), "hadoop.HBASE.write_succeeded", reason, write_hbase_ok)) #read some data from it if abort_test_sequence is True: return reason = [] try: start = TIMESTAMP_MILLIS() row = table.row('row_key', columns=['cf:column']) end = TIMESTAMP_MILLIS() read_hbase_ms = end - start read_hbase_ok = row['cf:column'] == 'value' values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'), "hadoop.HBASE.read_time_ms", [], read_hbase_ms)) except: LOGGER.error(traceback.format_exc()) hbase_fix_output = subprocess.check_output([ 'sudo', '-u', 'hbase', 'hbase', 'hbck', '-repair', 'blackbox_test_table' ]) for line in hbase_fix_output.splitlines(): if 'Status:' in line or 'inconsistencies detected' in line: LOGGER.debug(line) subprocess.check_output([ 'sudo', '-u', 'hbase', 'hbase', 'zkcli', 'rmr', '/hbase/table/blackbox_test_table' ]) subprocess.check_output([ 'sudo', '-u', 'hdfs', 'hadoop', 'fs', '-rm', '-r', '-f', '-skipTrash', '/hbase/data/default/blackbox_test_table' ]) read_hbase_ok = False reason = ['Failed to fetch row by row key from HBase'] health_values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'), "hadoop.HBASE.read_succeeded", reason, read_hbase_ok)) #create some hive metadata reason = [] if abort_test_sequence is True: return try: start = TIMESTAMP_MILLIS() hive = hive_api.connect(cdh.get_hive_endpoint()) end = TIMESTAMP_MILLIS() hive.cursor().execute("DROP TABLE blackbox_test_table") connect_to_hive_ms = end - start connect_to_hive_ok = True values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'), "hadoop.HIVE.connection_time_ms", [], connect_to_hive_ms)) except: 
LOGGER.error(traceback.format_exc()) connect_to_hive_ok = False reason = ['Failed to connect to Hive Metastore'] health_values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'), "hadoop.HIVE.connection_succeeded", reason, connect_to_hive_ok)) if abort_test_sequence is True: return reason = [] try: start = TIMESTAMP_MILLIS() hive.cursor().execute(( "CREATE EXTERNAL TABLE " "blackbox_test_table (key STRING, value STRING)" "STORED BY \"org.apache.hadoop.hive.hbase.HBaseStorageHandler\" " "WITH SERDEPROPERTIES " "(\"hbase.columns.mapping\" = \":key,cf:column\") " "TBLPROPERTIES(\"hbase.table.name\" = \"blackbox_test_table\")" )) end = TIMESTAMP_MILLIS() create_metadata_ms = end - start create_metadata_ok = True values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'), "hadoop.HIVE.create_metadata_time_ms", [], create_metadata_ms)) except: LOGGER.error(traceback.format_exc()) create_metadata_ok = False reason = [ 'CREATE EXTERNAL TABLE statement failed on Hive Metastore' ] health_values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'), "hadoop.HIVE.create_metadata_succeeded", reason, create_metadata_ok)) #read some data via impala using it if abort_test_sequence is True: return if cdh.get_impala_endpoint() is not None: reason = [] try: start = TIMESTAMP_MILLIS() impala = connect(host=cdh.get_impala_endpoint(), port=options.impalaport) end = TIMESTAMP_MILLIS() impala.cursor().execute("invalidate metadata") connect_to_impala_ms = end - start connect_to_impala_ok = True values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'), "hadoop.IMPALA.connection_time_ms", [], connect_to_impala_ms)) except: LOGGER.error(traceback.format_exc()) connect_to_impala_ok = False reason = ['Failed to connect to Impala'] health_values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'), "hadoop.IMPALA.connection_succeeded", reason, connect_to_impala_ok)) if abort_test_sequence is True: return reason = [] try: start = TIMESTAMP_MILLIS() impala_cursor = impala.cursor() impala_cursor.execute("SELECT * FROM blackbox_test_table") table_contents = impala_cursor.fetchall() end = TIMESTAMP_MILLIS() read_impala_ms = end - start read_impala_ok = table_contents[0][1] == 'value' values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'), "hadoop.IMPALA.read_time_ms", [], read_impala_ms)) except: LOGGER.error(traceback.format_exc()) read_impala_ok = False reason = ['Failed to SELECT from Impala'] health_values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'), "hadoop.IMPALA.read_succeeded", reason, read_impala_ok)) else: reason = [] try: start = TIMESTAMP_MILLIS() hive_cursor = hive.cursor() hive_cursor.execute("SELECT * FROM blackbox_test_table") table_contents = hive_cursor.fetchall() end = TIMESTAMP_MILLIS() read_hive_ms = end - start read_hive_ok = table_contents[0][1] == 'value' values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HQUERY'), "hadoop.HQUERY.read_time_ms", [], read_hive_ms)) except: LOGGER.error(traceback.format_exc()) read_hive_ok = False reason = ['Failed to SELECT from Hive'] health_values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HQUERY'), "hadoop.HQUERY.read_succeeded", reason, read_hive_ok)) #delete metadata if abort_test_sequence is True: return reason = [] try: start = TIMESTAMP_MILLIS() hive.cursor().execute("DROP TABLE blackbox_test_table") end = TIMESTAMP_MILLIS() drop_metadata_ms = end - start drop_metadata_ok = True values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'), "hadoop.HIVE.drop_table_time_ms", [], drop_metadata_ms)) except: 
LOGGER.error(traceback.format_exc()) drop_metadata_ok = False reason = ['Failed to DROP table in Hive Metastore'] health_values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'), "hadoop.HIVE.drop_table_succeeded", reason, drop_metadata_ok)) #delete hbase table if abort_test_sequence is True: return reason = [] try: start = TIMESTAMP_MILLIS() # Disabled deleting table to work around apparent hbase bug (see VPP-17) but leaving # test step in so it can be easily re-enabled for testing. #hbase.disable_table('blackbox_test_table') #hbase.delete_table('blackbox_test_table') end = TIMESTAMP_MILLIS() drop_table_ms = end - start drop_table_ok = True values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'), "hadoop.HBASE.drop_table_time_ms", [], drop_table_ms)) except: LOGGER.error(traceback.format_exc()) drop_table_ok = False reason = ['Failed to drop table in HBase'] health_values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'), "hadoop.HBASE.drop_table_succeeded", reason, drop_table_ok)) def to_status(flag): ''' Convert True to OK and False to ERROR ''' if flag in [True, False]: status = 'OK' if flag is True else 'ERROR' else: status = flag return status def default_health_value(name, service, operation, failed_step): result = False if len([event for event in health_values if event.metric == name]) == 0: if failed_step is not None: message = 'Did not attempt to %s due to timeout waiting for: %s' % ( operation, failed_step) else: message = 'Timed out waiting for %s to complete' % operation health_values.append( Event(TIMESTAMP_MILLIS(), cdh.get_name(service), name, [message], False)) result = True return result test_thread = threading.Thread(target=run_test_sequence) test_thread.daemon = True abort_test_sequence = False test_thread.start() test_thread.join(60.0) abort_test_sequence = True if hbase is not None: hbase.close() failed_step = None if default_health_value("hadoop.HBASE.create_table_succeeded", "HBASE", "create HBase table", failed_step) and failed_step is None: failed_step = "create HBase table" if default_health_value("hadoop.HBASE.write_succeeded", "HBASE", "write to HBase", failed_step) and failed_step is None: failed_step = "write to HBase" if default_health_value("hadoop.HBASE.read_succeeded", "HBASE", "read from HBase", failed_step) and failed_step is None: failed_step = "read from HBase" if default_health_value("hadoop.HIVE.connection_succeeded", "HIVE", "connect to Hive Metastore", failed_step) and failed_step is None: failed_step = "connect to Hive Metastore" if default_health_value("hadoop.HIVE.create_metadata_succeeded", "HIVE", "create Hive Metastore table", failed_step) and failed_step is None: failed_step = "create Hive Metastore table" if cdh.get_impala_endpoint() is not None: if default_health_value("hadoop.IMPALA.connection_succeeded", "IMPALA", "connect to Impala", failed_step) and failed_step is None: failed_step = "connect to Impala" if default_health_value("hadoop.IMPALA.read_succeeded", "IMPALA", "SELECT from Impala", failed_step) and failed_step is None: failed_step = "SELECT from Impala" else: if default_health_value("hadoop.HQUERY.read_succeeded", "HQUERY", "SELECT from Hive", failed_step) and failed_step is None: failed_step = "SELECT from Hive" if default_health_value("hadoop.HIVE.drop_table_succeeded", "HIVE", "DROP table in Hive Metastore", failed_step) and failed_step is None: failed_step = "DROP table in Hive Metastore" if default_health_value("hadoop.HBASE.drop_table_succeeded", "HBASE", "drop table in HBase", failed_step) and 
failed_step is None: failed_step = "drop table in HBase" cdh_status_indicators = cdh.get_status_indicators() health_values.extend(cdh_status_indicators) overall = {} for health_val in health_values: try: current = overall[health_val.source] current_val = to_status(current.value) current_causes = current.causes except KeyError: current_val = 'OK' current_causes = [] update = to_status(health_val.value) # If current is ERROR, output is ERROR, regardless # If current is WARN, output is WARN if update is OK but ERROR if further WARN or ERROR # If update is OK, output is OK if OK, WARN if WARN and ERROR if ERROR out = 'ERROR' if current_val != "ERROR": if current_val == 'WARN': if update == 'OK': out = 'WARN' if current_val == 'OK': out = update current_val = out current_causes.extend(health_val.causes) overall[health_val.source] = Event( health_val.timestamp, health_val.source, 'hadoop.%s.health' % cdh.get_type(health_val.source), current_causes, current_val) values.extend(health_values) values.extend(overall.values()) if display: self._do_display(values) return values
def get_clusters():
    api = ApiResource(get_cm_host(), username=CM_USERNAME,
                      password=CM_USER_PASSWORD, version=2)
    return api.get_all_clusters()
CONFIG = ConfigParser.ConfigParser()
CONFIG.read('clouderaconfig.ini')

cm_host = CONFIG.get("CM", 'cm.host')
username = CONFIG.get("CM", 'admin.name')
password = CONFIG.get("CM", 'admin.password')
cluster_name = CONFIG.get("CM", 'cluster.name')
master_nodes = CONFIG.get("CDH", 'cluster.masternodes').split(',')
slave_nodes = CONFIG.get("CDH", 'cluster.slavenodes').split(',')
edge_nodes = CONFIG.get("CDH", 'cluster.edgenodes').split(',')

api = ApiResource(cm_host, username=username, password=password)

# Connect with the Cluster
CLUSTER = None
for cluster in api.get_all_clusters():
    #print c.name
    CLUSTER = cluster

#Download and activate Kafka parcel
PARCEL = None
PARCEL_PRODUCT = None
PARCEL_VERSION = None
for p in CLUSTER.get_all_parcels():
    # print p
    # print p.product
    # print p.version
    if p.product == "KAFKA":
        PARCEL = p
        PARCEL_PRODUCT = p.product
        PARCEL_VERSION = p.version
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cm_api.api_client import ApiResource

cloudera_user = '******'
cloudera_pass = '******'
cm_host = "localhost"
api = ApiResource(cm_host,
                  username=cloudera_user,
                  password=cloudera_pass,  # nosec
                  version=17)
c = api.get_all_clusters()[0]
services = c.get_all_services()


def process_service(service):
    service_name = service.name
    if service_name == "spark_on_yarn":
        service_name = "spark"
    for role_cfgs in service.get_all_role_config_groups():
        role_cm_cfg = role_cfgs.get_config(view='full')
        role_cfg = parse_config(role_cm_cfg)
        role_name = role_cfgs.roleType.lower()
        write_cfg(role_cfg, '%s-%s.json' % (service_name, role_name))
    service_cm_cfg = service.get_config(view='full')[0]
    service_cfg = parse_config(service_cm_cfg)
def main(): global ec2con global cwcon ec2con = boto.ec2.connect_to_region('us-east-1') cwcon = boto.ec2.cloudwatch.CloudWatchConnection() api = ApiResource(CM_HOST, username="******", password="******") displayName = None for c in api.get_all_clusters(): displayName = c.displayName print "Cluster: %s (%s)" % (displayName, c.name) inst_cache = {} insts = api.get_all_hosts('full') print "Found %s in the cluster" % [inst.hostId for inst in insts.objects] for inst in insts.objects: clusterName = inst.roleRefs[0].clusterName if clusterName <> c.name: print 'Clusters do not correspond: %s vs %s' % (clusterName, c.name) continue cores = inst.numCores inst_id = inst.hostId inst_cache[inst_id] = my_cache = {} # For later - we'll send in one data point for every TS query # that has AWS data my_cache['aws_info_recorded'] = False # my_cache['healthSummary'] = inst.healthSummary ress = ec2con.get_all_reservations(filters={'instance-id' : inst_id}) if len(ress) > 0: print "Found %s reservations for %s: %s" % (len(ress), inst_id, ress) res = ress[0] instances = res.instances if len(instances) > 1: print "Found %s instances for %s %s" % (len(instances), inst_id, instances) inst = instances[0] if inst.id <> inst_id: raise Exception("%s != %s" % (inst.id, inst_id)) platform = inst.platform vpc_id = inst.vpc_id if platform == 'windows': product = 'Windows' elif not platform: product = 'Linux_UNIX' else: product = 'UNKNOWN' if vpc_id: product += "_Amazon_VPC" ami = inst.image_id my_cache['product'] = product my_cache['region'] = inst.region.name my_cache['zone'] = inst.placement inst_type = inst.instance_type.replace('.','_') my_cache['inst_type'] = inst_type time_f = arrow.utcnow().replace(minutes=common.DEFAULT_LOOKBACK_MINUTES) time_t = arrow.utcnow() # TODO # http://arr.gr/blog/2013/08/monitoring-ec2-instance-memory-usage-with-cloudwatch/ # http://blog.sciencelogic.com/netflix-steals-time-in-the-cloud-and-from-users/03/2011 # https://www.stackdriver.com/cpu-steal-why-aws-cloudwatch-metrics-are-different-than-agent-metrics/ stat = cwcon.get_metric_statistics(300, time_f, time_t, 'CPUUtilization', 'AWS/EC2', ['Average','Minimum','Maximum'], { 'InstanceId' : inst_id }) # [{u'Timestamp': datetime.datetime(2014, 4, 13, 6, 5), u'Average': 0.35250000000000004, u'Minimum': 0.33, u'Maximum': 0.42, u'Unit': u'Percent'}] print 'Fetching stats for %s: %s' % (inst_id, stat) if stat: for s in stat: ts = common.ts_from_aws(s) my_cache['avg_cpu'] = float(s['Average']) else: print "No stats found for %s" % inst_id print "Querying CDH." 
series = api.query_timeseries('SELECT * WHERE clusterName = %s' % c.name) for entry in series.objects[0].timeSeries: # print entry.metadata.__dict__ metric = entry.metadata.metricName # internal host hostname = "" if 'hostname' in entry.metadata.attributes: host = entry.metadata.attributes['hostname'] inst_id = "" my_cache = {} if 'hostId' in entry.metadata.attributes: inst_id = entry.metadata.attributes['hostId'] if inst_id not in my_cache: print "Cannot find %s in %s" % (inst_id, inst_cache) my_cache = inst_cache[inst_id] service_name = "" if 'serviceName' in entry.metadata.attributes: service_name = entry.metadata.attributes['serviceName'] service_type = "" if 'serviceType' in entry.metadata.attributes: service_type= entry.metadata.attributes['serviceType'] role_type = "" if 'roleType' in entry.metadata.attributes: role_type = entry.metadata.attributes['roleType'] num = entry.metadata.unitNumerators denom = entry.metadata.unitDenominators if len(num) > 1: print "Num:" + num if len(denom)>1: print "Denom:" + denom unit = num[0] if len(denom) > 0: unit += denom[0] tags = { 'cdh_service_name_service_type_role_type' : "%s.%s.%s" % ( service_name, service_type, role_type), 'unit' : unit } combined_tags = deepcopy(tags) if my_cache: # combined_tags['healthSummary']= my_cache['healthSummary'] combined_tags['inst_type'] = my_cache['inst_type'] combined_tags['cloud'] = 'aws' combined_tags['region'] = my_cache['region'] combined_tags['zone'] = my_cache['zone'] combined_tags['product'] = my_cache['product'] if not entry.data: continue for sample in entry.data: ts = arrow.Arrow.fromdatetime(sample.timestamp).timestamp val = sample.value if len(combined_tags) > 8: print "ERROR: Too many tags: %s" % combined_tags sys.exit(0) common.otsdb_send(metric, val, combined_tags, ts, False) # Do the AWS once only if my_cache and not my_cache['aws_info_recorded']: # print my_cache combined_tags['unit'] = 'percent' if 'avg_cpu' in my_cache: common.otsdb_send('aws_average_cpu_utilization', my_cache['avg_cpu'], combined_tags, my_cache['ts'], False)
# In your virtual environment do:
# pip install cm-api
import sys
from cm_api.api_client import ApiResource

cm_host = "localhost"
api = ApiResource(cm_host, username="******", password="******")

print "*** CLUSTERS ***"
clusters = None
# List clusters
for c in api.get_all_clusters():
    print "Cluster \"%s\" is version %s" % (c.name, c.version)
    clusters = c

print "*** HOSTS ***"
for host_ref in c.list_hosts():
    host = api.get_host(host_ref.hostId)
    print host.hostname

print "*** SERVICES ***"
hdfs = None
# List services & health info
for s in clusters.get_all_services():
    print "Service \"%s\" -- state \"%s\" -- health \"%s\"" % (s.name, s.serviceState, s.healthSummary)
    # Get HDFS service
    if 'hdfs' in s.type.lower():
        hdfs = s
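A possible follow-on to the walkthrough above: a minimal sketch, assuming the script has already run and that an HDFS service was found (it reuses only calls that appear elsewhere in this collection: restart(), wait(), cmd.success, get_service() and healthSummary).

# Hedged sketch: restart the HDFS service located by the walkthrough above and
# report the outcome. Assumes `api`, `clusters` and `hdfs` from that script.
if hdfs is not None:
    cmd = hdfs.restart().wait()
    print "HDFS restart finished. Success: %s" % cmd.success
    # Re-fetch the service to pick up a refreshed health summary.
    hdfs = clusters.get_service(hdfs.name)
    print "HDFS health is now \"%s\"" % hdfs.healthSummary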
class handler_cm_api: def __init__(self): self._user_executing = grp.getgrnam(getpass.getuser())[0] def __getitem__(self): return self def setup(self, p_cm_host, p_cm_user, p_cm_pass, p_cm_version, p_cluster, p_cm_port=None, p_use_tls=False): self.cm_api = ApiResource(p_cm_host, server_port=p_cm_port, version=p_cm_version, username=p_cm_user, password=p_cm_pass, use_tls=p_use_tls) handler_cm_api.cluster_hosts = self.cm_api.get_all_hosts() if p_cluster: self.cluster = filter(lambda x: x.displayName == p_cluster, self.cm_api.get_all_clusters())[0] if not self.cluster: print("Error: That cluster is not valid.") return else: self.services = self.cluster.get_all_services() self.name = self.cluster.displayName tmp_topology = self.cluster.list_hosts() self.topology = {} for i in range(len(tmp_topology)): tmp_host = filter(lambda x: x.hostId == tmp_topology[i].hostId, handler_cm_api.cluster_hosts)[0] self.topology[tmp_topology[i].hostId] = tmp_host.hostname def get_current_group(self): return self._user_executing ############################### # For internal validations def __validate_service(self, p_service): v_service = filter(lambda x: x.type == p_service, self.services) if not v_service: print("Error: Service not found") raise SystemExit return v_service.pop() def __validate_hostname(self, p_hostname): v_node = filter(lambda x: x.hostname == p_hostname, handler_cm_api.cluster_hosts) if not v_node: print("Error: Hostname not found") raise SystemExit return v_node.pop() def __validate_role(self, p_service, p_role, p_hostname): v_service = self.__validate_service(p_service) v_node = self.__validate_hostname(p_hostname) v_roles = filter(lambda x: x.type == p_role, v_service.get_all_roles()) v_role = filter(lambda x: x.hostRef.hostId == v_node.hostId, v_roles) if not v_role: print("Error: Role not found in that host") raise SystemExit return v_role.pop() ###################################################################### # START/STOP/RESTART ###################################################################### def stop_cluster(self): v_cmd = self.cluster.stop() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def start_cluster(self): v_cmd = self.cluster.start() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def restart_cluster(self): v_cmd = self.cluster.restart() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def rolling_restart_cluster(self): v_cmd = self.cluster.rolling_restart() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) ###################################################################### #SERVICES ###################################################################### ################ # Status ################ # ------ State def check_state_services(self): for v_srv in self.services: print(coloring(v_srv.serviceState, v_srv.type)) def check_state_service(self, p_service): v_service = self.__validate_service(p_service) print(coloring(v_service.serviceState, v_service.type)) def check_health_services(self): for v_srv in self.services: print(coloring(v_srv.healthSummary, v_srv.type)) # ----- Health def check_health_service(self, p_service): v_service = self.__service_validate(p_service) print(coloring(v_service.healthSummary, v_service.type)) ##################################### # stop/start/restart/Rolling Restart ##################################### def stop_service(self, p_service): v_service = self.__validate_service(p_service) print("* Stopping " + v_service.type) v_cmd = v_service.stop() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def 
start_service(self, p_service): v_service = self.__validate_service(p_service) print("* Starting " + v_service.type) v_cmd = v_service.start() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def restart_service(self, p_service): v_service = self.__validate_service(p_service) print("* Restarting " + v_service.type) v_cmd = v_service.restart() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) def rolling_restart_service(self, p_service): v_service = self.__validate_service(p_service) try: print(" * Rolling Restarting " + v_service.type) v_cmd = v_service.rolling_restart() v_msg = f_waiting_task(v_cmd) print(coloring(*v_msg)) except: if re.match("Command not valid for", str(sys.exc_info()[1])): print "It's not possible to use Rolling Restart in this service." else: raise ################################################################### # ROLES ################################################################### ################# # Status ################# # ---- State def check_state_roles(self, p_service): v_service = self.__validate_service(p_service) print("*" + v_service.type + ":") for v_role in v_services.get_all_roles(): print( coloring( v_role.roleState, filter(lambda x: x.hostId == v_role.hostRef.hostId, handler_cm_api.cluster_hosts)[0].hostname) + ":\t" + v_role.type) def check_state_role(self, p_service, p_role): v_service = self.__validate_service(p_service) print("*" + v_service.type + ":") v_roles = filter(lambda x: x.type == p_role, v_service.get_all_roles()) for v_role in v_roles: print( coloring( v_role.roleState, filter(lambda x: x.hostId == v_role.hostRef.hostId, handler_cm_api.cluster_hosts)[0].hostname) + ":\t" + v_role.type) def check_state_all_roles(self): for v_service in self.services: self.check_state_roles(v_service.type) print('---------------------') # ---- Health def check_health_roles(self, p_service): v_service = self.__validate_service(p_service) print("*" + v_service.type + ":") for v_role in v_service.get_all_roles(): print( coloring( v_role.healthSummary, filter(lambda x: x.hostId == v_role.hostRef.hostId, handler_cm_api.cluster_hosts)[0].hostname) + ":\t" + v_role.type) def check_health_role(self, p_service, p_role): v_service = self.__validate_service(p_service) print("*" + v_service.type + ":") v_roles = filter(lambda x: x.type == p_role, v_service.get_all_roles()) for v_role in v_roles: print( coloring( v_role.healthSummary, filter(lambda x: x.hostId == v_role.hostRef.hostId, handler_cm_api.cluster_hosts)[0].hostname) + ":\t" + v_role.type) def check_health_all_roles(self): for v_service in self.services: self.check_health_roles(v_service.type) print('---------------------') ##################### # Stop/Start/Restart def stop_role(self, p_service, p_role, p_hostname): v_service = self.__validate_service(p_service) v_node = self.__validate_hostname(p_hostname) v_role = self.__validate_role(p_service, p_role, p_hostname) print("* Stopping " + v_role.type) v_cmd = v_service.stop_roles(v_role.name) v_msg = f_waiting_task(v_cmd[0]) print(coloring(*v_msg)) def start_role(self, p_service, p_role, p_hostname): v_service = self.__validate_service(p_service) v_node = self.__validate_hostname(p_hostname) v_role = self.__validate_role(p_service, p_role, p_hostname) print("* Starting " + v_role.type) v_cmd = v_service.start_roles(v_role.name) v_msg = f_waiting_task(v_cmd[0]) print(coloring(*v_msg)) def restart_role(self, p_service, p_role, p_hostname): v_service = self.__validate_service(p_service) v_node = self.__validate_hostname(p_hostname) v_role = 
self.__validate_role(p_service, p_role, p_hostname) print("* restarting " + v_role.type) v_cmd = v_service.restart_roles(v_role.name) v_msg = f_waiting_task(v_cmd[0]) print(coloring(*v_msg)) ########################################################### #IMPALA QUERIES ########################################################### # FILTERS ############################ def setup_filters_impala_queries(self): v_start_time = raw_input( 'Introduce the start time with following format: DD/MM/YYYY_hh:mm:ss. Example: 01/01/2018_00:00:00: ' ) if not re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$", v_start_time): print("Error: Invalid Format for start time") return v_end_time = raw_input( 'Introduce the end time with the following format: DD/MM/YYYY_hh:mm:ss. Example 31/01/2018_00:00:00: ' ) if not re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$", v_end_time): print("Error: Invalid format for end time") return v_filter_type = raw_input( 'Choose the kind of filter: user|duration|state: ') if not v_filter_type in ('user', 'duration', 'state'): print("Error: Invalid kind of filter") return if v_filter_type == 'user': v_filter_value = raw_input( 'Introduce the user name you want to filter by: ') if not v_filter_value: print("Error: Invalid user name") return elif v_filter_type == 'duration': v_filter_value = raw_input( 'Introduce the query duration you want to filter by: +Xs|-Xs|=Xs. Example: +0s: ' ) if not re.match("^[+-=]\d+.\d*[hms]$", v_filter_value): print("Error: Invalid duration filter.") return elif v_filter_type == 'state': v_filter_value = raw_input( 'Introduce the query state you want to filter by: CREATED|INITIALIZED|COMPILED|RUNNING|FINISHED|EXCEPTION|UNKNOWN: ' ) if not v_filter_value in ('CREATED', 'INITIALIZED', 'COMPILED', 'RUNNING', 'FINISHED', 'EXCEPTION', 'UNKNOWN'): print("Error: Invalid state filter.") return v_limit = raw_input( "Introduce the max num of queries you want to check: ") if not re.match("^\d+$", v_limit): print("Error: Invalid limit. It has to be an integer") return return v_start_time, v_end_time, v_filter_type, v_filter_value, int( v_limit) ###################################### # Getting queries ###################################### def get_impala_queries(self, p_start_time=None, p_end_time=None, p_filter_type=None, p_filter_value=None, p_limit=None): if not (p_start_time and p_end_time and p_filter_type and p_filter_value and p_limit): p_start_time, p_end_time, p_filter_type, p_filter_value, p_limit = self.setup_filters_impala_queries( ) v_impala = filter(lambda x: x.type == 'IMPALA', self.services)[0] if not v_impala: print("Error: Impala service doesnt exist in this cluster.") return if re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$", p_start_time): v_start_time = datetime.strptime(p_start_time, '%d/%m/%Y_%H:%M:%S') else: print("Error. startTime format is not valid.") return if re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$", p_start_time): v_end_time = datetime.strptime(p_end_time, '%d/%m/%Y_%H:%M:%S') else: print("Error. 
startTime format is not valid.") return if p_filter_type == "user" and type(p_filter_value) == str: v_filter_str = 'user = '******'+': v_filter_value = p_filter_value.replace('+', '>') if p_filter_value[0] == '-': v_filter_value = p_filter_value.replace('-', '<') v_filter_str = 'queryDuration ' + v_filter_value elif p_filter_type == "state" and p_filter_value in ( 'CREATED', 'INITIALIZED', 'COMPILED', 'RUNNING', 'FINISHED', 'EXCEPTION', 'UNKNOWN'): v_filter_str = 'queryState = ' + v_filter_value else: print("Error: Filter is not valid.") return if type(p_limit) == int and p_limit < 201: v_limit = p_limit else: print("Error: Limit is not valid. It must be > 0 and <= 200") return v_queries = v_impala.get_impala_queries(v_start_time, v_end_time, v_filter_str, v_limit).queries v_output = '' for vq in v_queries: v_coordinator = filter(lambda x: x.hostId == vq.coordinator.hostId, self.cluster_hosts)[0].hostname v_output += COLORS.BLUE + "##################################################################################" + COLORS.RESET + "\n" v_output += vq.queryId + " -- " + vq.queryState + ":\n" v_output += COLORS.RED + vq.statement + COLORS.RESET + "\n" v_output += COLORS.GREEN + "--- Attributes ---" + COLORS.RESET + "\n" v_output += "Query Type: " + vq.queryType + "\n" if 'query_status' in vq.attributes.keys(): v_output += "Query Status: " + vq.attributes[ 'query_status'] + "\n" v_output += "User: "******"\n" v_output += "Database: " + vq.database + "\n" if 'pool' in vq.attributes.keys(): v_output += "Pool: " + vq.attributes['pool'] + "\n" v_output += "Starts at: " + vq.startTime.strftime( "%d/%m/%Y_%H:%M:%S") + "\n" v_output += "Ends at: " + vq.endTime.strftime( "%d/%m/%Y_%H:%M:%S") + "\n" v_output += "Coordinator: " + v_coordinator + "\n" v_output += "Rows Produced: " + str(vq.rowsProduced) + "\n" if vq.attributes['file_formats']: v_output += "File Format: " + vq.attributes[ 'file_formats'] + "\n" if 'hdfs_bytes_read' in vq.attributes.keys(): v_output += "HDFS bytes read: " + vq.attributes[ 'hdfs_bytes_read'] + "\n" if 'memory_aggregate_peak' in vq.attributes.keys(): v_output += "Memory Aggregate Peak: " + vq.attributes[ 'memory_aggregate_peak'] + "\n" if 'thread_cpu_time' in vq.attributes.keys(): v_output += "Threads Cpu Time: " + vq.attributes[ 'thread_cpu_time'] + "\n" print(v_output) print("Do you want to save the output? (Y/N)") v_save = raw_input("Your choice: ").upper() if v_save == 'Y': v_output_nc = re.sub("\\x1b\[\d+m", "", v_output) v_file = "/tmp/impala_queries_" + datetime.now().strftime( "%Y%m%d_%H%M%S") + ".log" with open(v_file, 'a') as file_output: file_output.write(v_output_nc) print("The output was written in: " + v_file) ###################### # Getting details ###################### def get_details_impala_query(self, p_query_id=None): if not p_query_id: v_query_id = raw_input( 'Introduce the query id you want to check the details: ') else: v_query_id = p_query_id v_impala = filter(lambda x: x.type == 'IMPALA', self.services)[0] v_queries = v_impala.get_impala_queries( datetime.now() - timedelta(days=30), datetime.now(), 'queryDuration > 0s', 1000).queries v_query = filter(lambda x: x.queryId == v_query_id, v_queries) if not v_query: print( "Error: The query_id is not valid, was executed more than 30 days ago or is not between the last 1000 queries. 1000 is the limit." 
) return elif not v_query[0].detailsAvailable: print("Error: This Query does not have details available.") return else: v_output = "/tmp/impala_query_details_" + v_query[ 0].queryId + "_" + datetime.now().strftime( "%Y%m%d_%H%M%S") + ".log" with open(v_output, 'a') as file_output: file_output.write( str(v_impala.get_query_details(v_query[0].queryId))) print("The output was written in: " + v_output) ####################### def get_same_configuration(self): v_configs = [] v_command = 'hadoop org.apache.hadoop.conf.Configuration' for v_node in self.topology.values(): v_ssh = subprocess.Popen( ["ssh", v_node, "-o", "StrictHostKeyChecking=no", v_command], stdout=subprocess.PIPE, stderr=subprocess.PIPE) v_configs += [v_ssh.stdout.readlines()] if len(self.topology) != len(v_configs): print( "Error: The num configs is different to the num of nodes in this cluster" ) return if v_configs[1:] == v_configs[:-1]: print(coloring('GOOD', "The configs are the same in all nodes.")) print("The nodes which were checked are: " + ', '.join(self.topology.values())) else: print(coloring('BAD', "The configs are not the same."))
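# A minimal standalone sketch of how the same Impala query listing could be done
# directly with cm_api outside the interactive class above; the host, credentials,
# one-hour window and filter string here are illustrative assumptions, not values
# taken from the original script.
from datetime import datetime, timedelta
from cm_api.api_client import ApiResource

def list_recent_impala_queries(cm_host, user, password, limit=50):
    api = ApiResource(cm_host, username=user, password=password)
    cluster = api.get_all_clusters()[0]
    impala = [s for s in cluster.get_all_services() if s.type == 'IMPALA'][0]
    end_time = datetime.now()
    start_time = end_time - timedelta(hours=1)
    # same filter syntax the class above builds ('user = ...', 'queryDuration > 0s', ...)
    response = impala.get_impala_queries(start_time, end_time, 'queryState = FINISHED', limit)
    for q in response.queries:
        print q.queryId, q.queryState, q.startTime.strftime("%d/%m/%Y_%H:%M:%S")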
class RemoteDataLoad(object): """This is an implementation of the process to load a test-warehouse snapshot on a remote CM managed cluster. This script assumes that the warehouse snapshot was already downloaded and was either passed in as a parameter, or can be found by either inspecting the SNAPSHOT_DIR environment variable, or based on the WORKSPACE environment variable on a Jenkins build slave. The reason for the additional setup code is that in the local development environment it is assumed that $USER is HDFS superuser, which is not the case for remote deloyments. """ def __init__(self, cm_host, options): logger.info("Starting remote data load...") self.options = options self.cm_host = cm_host # Gateway host can be used if the CM host is not configured as a Hadoop gateway self.gateway = options.gateway if options.gateway else cm_host self.impala_home = os.environ["IMPALA_HOME"] self.api = ApiResource(self.cm_host, username=options.cm_user, password=options.cm_pass) # The API returns a list of clusters managed by the CM host. We're assuming # that this CM host was set up for the purpose of Impala testing on one # cluster, so the list should only have one value. self.cluster = self.api.get_all_clusters()[0] self.services = self.get_services() self.config = self.get_service_client_configurations() logger.info("Retrieved service configuration") logger.info(str(self.config)) self.prepare() logger.info("IMPALA_HOME: {0}".format(self.impala_home)) def get_hostname_for_ref(self, host_ref): """Translate the HostRef instance into the hostname.""" return self.api.get_host(host_ref.hostId).hostname @staticmethod def get_or_default(config): return config.value if config.value else config.default def get_services(self): """Confirm that all services are running, and return service dict.""" services = dict((s.type, s) for s in self.cluster.get_all_services()) if set(REQUIRED_SERVICES) != set(services.keys()): missing_services = set(REQUIRED_SERVICES) - set(services.keys()) logger.error("Services not installed: {0}".format(list(missing_services))) raise RuntimeError("Cluster not ready.") if not all(services[s].serviceState == 'STARTED' for s in services): stopped = [s for s in services if services[s].serviceState != "STARTED"] logger.error("Not all services started: {0}".format(stopped)) raise RuntimeError("Cluster not ready.") return services @timing def download_client_config(self, cluster, service): """Download the client configuration zip for a particular cluster and service. Since cm_api does not provide a way to download the archive we build the URL manually and download the file. Once it downloaded the file the archive is extracted and its content is copied to the Hadoop configuration directories defined by Impala. 
""" logger.info("Downloading client configuration for {0}".format(service.name)) url = "http://{0}:7180/api/{1}/clusters/{2}/services/{3}/clientConfig".format( self.cm_host, CM_API_VERSION, urlquote(cluster.name), urlquote(service.name)) path = mkdtemp() sh.curl(url, o=os.path.join(path, "clientConfig.zip"), _out=tee, _err=tee) current = os.getcwd() os.chdir(path) sh.unzip("clientConfig.zip") for root, _, file_names in os.walk("."): for filename in fnmatch.filter(file_names, "*.xml"): src = os.path.join(root, filename) dst = os.path.join(self.impala_home, "fe", "src", "test", "resources") logger.debug("Copying {0} to {1}".format(src, dst)) shutil.copy(src, dst) os.chdir(current) # TODO: this may be available in tests/comparison/cluster.py def set_hive_warehouse_dir(self, cluster, service): logger.info("Setting the Hive Warehouse Dir") for service in self.api.get_all_clusters()[0].get_all_services(): logger.info(service) if service.type == "HIVE": hive_config = { "hive_warehouse_directory" : HIVE_WAREHOUSE_DIR } service.update_config(hive_config) # TODO: This functionality should be more generally available to other infrastructure # code, rather than being quarantined in this script. See IMPALA-4367. @timing def get_service_client_configurations(self): """Download the client configurations necessary to upload data to the remote cluster. Unfortunately, the CM API does not allow downloading it so we have to iterate over the services and download the config for all of them. In addition, returns an options dictionary with settings required for data loading like the HS2 server, Impala hosts, Name node etc. Returns: A client-configuration dictionary, e.g.: { 'hive_warehouse_directory': '/test-warehouse', 'hs2': 'impala-test-cluster-1.gce.cloudera.com:10000', 'impalad': ['impala-test-cluster-4.gce.cloudera.com:21000', 'impala-test-cluster-2.gce.cloudera.com:21000', 'impala-test-cluster-3.gce.cloudera.com:21000'], 'metastore': 'impala-test-cluster-1.gce.cloudera.com:9083', 'namenode': 'impala-test-cluster-1.gce.cloudera.com', 'namenode_http': 'impala-test-cluster-1.gce.cloudera.com:20101', 'kudu_master': 'impala-test-cluster-1.gce.cloudera.com' } """ # Iterate overs services and find the information we need result = {} for service_type, service in self.services.iteritems(): if service_type == "IMPALA": roles = service.get_roles_by_type("IMPALAD") impalads = [] for r in roles: rc_config = r.get_config("full") hostname = self.get_hostname_for_ref(r.hostRef) hs2_port = self.get_or_default(rc_config["beeswax_port"]) impalads.append("{0}:{1}".format(hostname, hs2_port)) result["impalad"] = impalads elif service_type == "HBASE": self.download_client_config(self.cluster, service) elif service_type == "HDFS": self.download_client_config(self.cluster, service) role = service.get_roles_by_type("NAMENODE") config = role[0].get_config("full") namenode = self.get_hostname_for_ref(role[0].hostRef) result["namenode"] = namenode result["namenode_http"] = "{0}:{1}".format( namenode, self.get_or_default(config["dfs_http_port"]) ) elif service_type == "HIVE": self.set_hive_warehouse_dir(self.cluster, service) self.download_client_config(self.cluster, service) hs2 = service.get_roles_by_type("HIVESERVER2")[0] rc_config = hs2.get_config("full") result["hive_warehouse_directory"] = self.get_or_default( service.get_config("full")[0]["hive_warehouse_directory"]) hostname = self.get_hostname_for_ref(hs2.hostRef) result["hs2"] = "{0}:{1}".format(hostname, self.get_or_default( rc_config["hs2_thrift_address_port"])) # Get 
Metastore information ms = service.get_roles_by_type("HIVEMETASTORE")[0] rc_config = ms.get_config("full") result["metastore"] = "{0}:{1}".format( self.get_hostname_for_ref(ms.hostRef), self.get_or_default(rc_config["hive_metastore_port"]) ) elif service_type == "KUDU": # Service KUDU does not require a client configuration result["kudu_master"] = self.cm_host return result # TODO: This functionality should be more generally available to other infrastructure # code, rather than being quarantined in this script. See IMPALA-4367. @staticmethod def find_snapshot_file(snapshot_dir): """Given snapshot_directory, walks the directory tree until it finds a file matching the test-warehouse archive pattern.""" for root, _, file_names in os.walk(snapshot_dir): for filename in fnmatch.filter(file_names, "test-warehouse-*-SNAPSHOT.tar.gz"): logger.info("Found Snapshot file {0}".format(filename)) return os.path.join(root, filename) @timing def prepare(self): """Populate the environment of the process with the necessary values. In addition, it creates helper objects to run shell and SSH processes. """ # Populate environment with required variables os.environ["HS2_HOST_PORT"] = self.config["hs2"] os.environ["HDFS_NN"] = self.config["namenode"] os.environ["IMPALAD"] = self.config["impalad"][0] os.environ["REMOTE_LOAD"] = "1" os.environ["HADOOP_USER_NAME"] = "hdfs" os.environ["TEST_WAREHOUSE_DIR"] = self.config["hive_warehouse_directory"] os.environ["KUDU_MASTER"] = self.config["kudu_master"] if self.options.snapshot_file is None: if "SNAPSHOT_DIR" in os.environ: snapshot_dir = os.environ["SNAPSHOT_DIR"] else: snapshot_dir = "{0}/testdata/test-warehouse-SNAPSHOT".format( os.getenv("WORKSPACE")) if not os.path.isdir(snapshot_dir): err_msg = 'Snapshot directory "{0}" is not a valid directory' logger.error(err_msg.format(snapshot_dir)) raise OSError("Could not find test-warehouse snapshot file.") logger.info("Snapshot directory: {0}".format(snapshot_dir)) self.snapshot_file = self.find_snapshot_file(snapshot_dir) else: self.snapshot_file = self.options.snapshot_file # Prepare shortcuts for connecting to remote services self.gtw_ssh = ssh.bake("{0}@{1}".format(self.options.ssh_user, self.gateway), "-oStrictHostKeyChecking=no", "-oUserKnownHostsFile=/dev/null", t=True, _out=tee, _err=tee) self.beeline = sh.beeline.bake(silent=False, outputformat="csv2", n="impala", u="jdbc:hive2://{0}/default".format( self.config["hs2"])) self.load_test_warehouse = sh.Command( "{0}/testdata/bin/load-test-warehouse-snapshot.sh".format( self.impala_home)).bake( _out=tee, _err=tee) self.create_load_data = sh.Command( "{0}/testdata/bin/create-load-data.sh".format(self.impala_home)) self.main_impalad = self.config["impalad"][0] self.impala_shell = sh.Command("impala-shell.sh").bake(i=self.main_impalad, _out=tee, _err=tee) self.python = sh.Command("impala-python").bake(u=True) self.compute_stats = sh.Command( "{0}/testdata/bin/compute-table-stats.sh".format(self.impala_home)).bake( _out=tee, _err=tee) @timing def load(self): """This method performs the actual data load. First it removes any known artifacts from the remote location. Next it drops potentially existing database from the Hive Metastore. Now, it invokes the load-test-warehouse-snapshot.sh and create-load-data.sh scripts with the appropriate parameters. The most important paramters are implicitly passed to the scripts as environment variables pointing to the remote HDFS, Hive and Impala. 
""" exploration_strategy = self.options.exploration_strategy logger.info("Removing other databases") dblist = self.beeline(e="show databases;", _err=tee).stdout database_list = dblist.split()[1:] # The first element is the header string for db in database_list: if db.strip() != "default": logger.debug("Dropping database %s", db) self.impala_shell(q="drop database if exists {0} cascade;".format(db)) logger.info("Invalidating metadata in Impala") self.impala_shell(q="invalidate metadata;") logger.info("Removing previous remote {0}".format( self.config["hive_warehouse_directory"])) r = sh.hdfs.dfs("-rm", "-r", "-f", "{0}".format( self.config["hive_warehouse_directory"])) logger.info("Expunging HDFS trash") r = sh.hdfs.dfs("-expunge") logger.info("Uploading test warehouse snapshot") self.load_test_warehouse(self.snapshot_file) # TODO: We need to confirm that if we change any permissions, that we don't # affect any running tests. See IMPALA-4375. logger.info("Changing warehouse ownership") r = sh.hdfs.dfs("-chown", "-R", "impala:hdfs", "{0}".format( self.config["hive_warehouse_directory"])) sh.hdfs.dfs("-chmod", "-R", "g+rwx", "{0}".format( self.config["hive_warehouse_directory"])) sh.hdfs.dfs("-chmod", "1777", "{0}".format( self.config["hive_warehouse_directory"])) logger.info("Calling create_load_data.sh") # The $USER variable is used in the create-load-data.sh script for beeline # impersonation. new_env = os.environ.copy() new_env["LOGNAME"] = "impala" new_env["USER"] = "******" new_env["USERNAME"] = "******" # Regardless of whether we are in fact skipping the snapshot load or not, # we nonetheless always pass -skip_snapshot_load to create-load-data.sh. # This is because we have already loaded the snapshot earlier in this # script, so we don't want create-load-data.sh to invoke # load-test-warehouse-snapshot.sh again. # # It would actually be nice to be able to skip the snapshot load, but # because of the existing messiness of create-load-data.sh, we can't. # This invocation... # # $ create-load-data.sh -skip_snapshot_load -exploration_strategy core # # ...results in this error: # # Creating /test-warehouse HDFS directory \ # (logging to create-test-warehouse-dir.log)... FAILED # 'hadoop fs -mkdir /test-warehouse' failed. Tail of log: # Log for command 'hadoop fs -mkdir /test-warehouse' # mkdir: `/test-warehouse': File exists # # Similarly, even though we might pass in "core" as the exploration strategy, # because we aren't loading a metadata snapshot (i.e., -skip_metadata_load is # false), an exhaustive dataload will always be done. This again is the result # of logic in create-load-data.sh, which itself ignores the value passed in # for -exploration_strategy. # # See IMPALA-4399: "create-load-data.sh has bitrotted to some extent, and needs # to be cleaned up" create_load_data_args = ["-skip_snapshot_load", "-cm_host", self.cm_host, "-snapshot_file", self.snapshot_file, "-exploration_strategy", exploration_strategy] self.create_load_data(*create_load_data_args, _env=new_env, _out=tee, _err=tee) sh.hdfs.dfs("-chown", "-R", "impala:hdfs", "{0}".format( self.config["hive_warehouse_directory"])) logger.info("Re-load HBase data") # Manually load the HBase data last. 
self.python("{0}/bin/load-data.py".format(self.impala_home), "--hive_warehouse_dir={0}".format( self.config["hive_warehouse_directory"]), "--table_formats=hbase/none", "--hive_hs2_hostport={0}".format(self.config["hs2"]), "--hdfs_namenode={0}".format(self.config["namenode"]), "--exploration_strategy={0}".format(exploration_strategy), workloads="functional-query", force=True, impalad=self.main_impalad, _env=new_env, _out=tee, _err=tee) self.compute_stats() logger.info("Load data finished") # TODO: Should this be refactored out of this script? It has nothing to do with # data loading per se. If tests rely on the environment on the client being set # a certain way -- as in the prepare() method -- we may need to find another way # to deal with that. See IMPALA-4376. @timing def test(self): """Execute Impala's end-to-end tests against a remote cluster. All configuration paramters are picked from the cluster configuration that was fetched via the CM API.""" # TODO: Running tests via runtest.py is currently not working against a remote # cluster (although running directly via py.test seems to work.) This method # may be refactored out of this file under IMPALA-4376, so for the time being, # raise a NotImplementedError. raise NotImplementedError # Overwrite the username to match the service user on the remote system and deal # with the assumption that in the local development environment the current user # is HDFS superuser as well. new_env = os.environ.copy() new_env["LOGNAME"] = "impala" new_env["USER"] = "******" new_env["USERNAME"] = "******" strategy = self.options.exploration_strategy logger.info("Running tests with exploration strategy {0}".format(strategy)) run_tests = sh.Command("{0}/tests/run-tests.py".format(self.impala_home)) run_tests("--skip_local_tests", "--exploration_strategy={0}".format(strategy), "--workload_exploration_strategy=functional-query:{0}".format(strategy), "--namenode_http_address={0}".format(self.config["namenode_http"]), "--hive_server2={0}".format(self.config["hs2"]), "--metastore_server={0}".format(self.config["metastore"]), "query_test", maxfail=10, impalad=",".join(self.config["impalad"]), _env=new_env, _out=tee, _err=tee)
# config dir for Cloudera agent: /etc/cloudera-scm-agent
try:
    cloudera_agent_config = codecs.open(r"/etc/cloudera-scm-agent/config.ini", encoding="utf-8", mode="r")
    cloudera_manager_host = re.search('(?<=server_host=).*', cloudera_agent_config.read()).group(0)
    cloudera_agent_config.close()
except IOError:
    print "not running on a Cloudera manager host"
    exit(1)

api = ApiResource(cloudera_manager_host, server_port=args.port, username=args.username, password=args.password)

# the user-picked cluster, or the only cluster managed by Cloudera Manager
cluster = None

# Get a list of all clusters
clusters = api.get_all_clusters()
for c in clusters:
    for h in c.list_hosts():
        host = hosts.get_host(api, h.hostId)
        if host.hostname == node or host.ipAddress == node:
            cluster = c

if cluster:
    services = cluster.get_all_services()
else:
    print "Couldn't find node in any cluster"
    exit(1)

groups = None
if args.service:
    service = find_service(services, args.service)
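# find_service() is called above but not defined in this excerpt; a minimal sketch
# of what it might do, assuming services are matched by their type string.
def find_service(services, service_type):
    for service in services:
        if service.type == service_type.upper():
            return service
    return None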
def main(): module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'}) for argument in MODULE_ARGUMENTS)) api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS, version=10) cluster_name = CLUSTER_NAME manager = api.get_cloudera_manager() action_a = module.params.get('action', None) if action_a == 'create_cluster': license_a = module.params.get('license', None) version_a = module.params.get('version', None) cluster_list = [x.name for x in api.get_all_clusters()] if cluster_name in cluster_list: module.exit_json(changed=False, msg='Cluster exists') else: cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a) if license_a == None: manager.begin_trial() else: manager.update_license(license_a.decode('base64')) module.exit_json(changed=True, msg='Cluster created') elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel', 'deploy_hdfs_base', 'deploy_hdfs_httpfs', 'deploy_hdfs_dn', 'deploy_hdfs_ha', 'deploy_rm_ha', 'set_config', 'service', 'deploy_service', 'deploy_service_worker_nodes', 'deploy_base_roles', 'run_command', 'cluster','create_snapshot_policy']: # more complicated actions that need a created cluster go here cluster = api.get_cluster(cluster_name) host_map = dict((api.get_host(x.hostId).hostname, x.hostId) for x in cluster.list_hosts()) # adds a host to the cluster # host_name should be in the internal DNS format, ip-xx-xx-xx.copute.internal if action_a == 'add_host': host_a = module.params.get('host', None) host_list = host_map.keys() if host_a in host_list: module.exit_json(changed=False, msg='Host already in cluster') else: try: cluster.add_hosts([host_a]) except ApiException: # if a host isn't there, it could be because the agent didn't manage to connect yet # so let's wait a moment for it sleep(120) cluster.add_hosts([host_a]) module.exit_json(changed=True, msg='Host added') # create management service and set it's basic configuration # this needs a separate function since management is handled # differently than the rest of services elif action_a == 'create_mgmt': host_a = module.params.get('host', None) # getting the management service is the only way to check if mgmt exists # an exception means there isn't one try: mgmt = manager.get_service() module.exit_json(changed=False, msg='Mgmt service already exists') except ApiException: pass mgmt = manager.create_mgmt_service(ApiServiceSetupInfo()) # this is ugly... and I see no good way to unuglify it firehose_passwd = Popen("sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") reports_passwd = Popen("sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") # since there is no easy way of configuring the manager... 
let's do it here :( role_conf = defaultdict(dict) role_conf['ACTIVITYMONITOR'] = { 'firehose_database_host': '{0}:7432'.format(host_a), 'firehose_database_user': '******', 'firehose_database_password': firehose_passwd, 'firehose_database_type': 'postgresql', 'firehose_database_name': 'amon', 'firehose_heapsize': '268435456', } role_conf['EVENTSERVER'] = { 'event_server_heapsize': '215964392' } role_conf['REPORTSMANAGER'] = { 'headlamp_database_host': '{0}:7432'.format(host_a), 'headlamp_database_user': '******', 'headlamp_database_password': reports_passwd, 'headlamp_database_type': 'postgresql', 'headlamp_database_name': 'rman', 'headlamp_heapsize': '215964392', } roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER', 'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER'] # create mangement roles for role in roles: mgmt.create_role('{0}-1'.format(role), role, host_map[host_a]) # update configuration of each for group in mgmt.get_all_role_config_groups(): group.update_config(role_conf[group.roleType]) mgmt.start().wait() # after starting this service needs time to spin up sleep(30) module.exit_json(changed=True, msg='Mgmt created and started') # deploy a given parcel on all hosts in the cluster # you can specify a substring of the version ending with latest, for example 5.3-latest instead of 5.3.5-1.cdh5.3.5.p0.4 elif action_a == 'deploy_parcel': name_a = module.params.get('name', None) version_a = module.params.get('version', None) if "latest" in version_a: available_versions = [x.version for x in cluster.get_all_parcels() if x.product == name_a] if "-latest" in version_a: version_substr = match('(.+?)-latest', version_a).group(1) # if version is just "latest", try to check everything else: version_substr = ".*" try: [version_parcel] = [x for x in available_versions if re.match(version_substr, x) != None] except ValueError: module.fail_json(msg='Specified version {0} doesnt appear in {1} or appears twice'.format(version_substr, available_versions)) else: version_parcel = version_a # we now go through various stages of getting the parcel # as there is no built-in way of waiting for an operation to complete # we use loops with sleep to get it done parcel = cluster.get_parcel(name_a, version_parcel) if parcel.stage == 'AVAILABLE_REMOTELY': parcel.start_download() while parcel.stage != 'DOWNLOADED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) sleep(10) if parcel.stage == 'DOWNLOADED': parcel.start_distribution() while parcel.stage != 'DISTRIBUTED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) # sleep while hosts report problems after the download for i in range(12): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break # since parcels are distributed automatically when a new host is added to a cluster # we can encounter the ,,ACTIVATING'' stage then if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING': if parcel.stage == 'DISTRIBUTED': parcel.activate() while parcel.stage != 'ACTIVATED': parcel = cluster.get_parcel(name_a, version_parcel) # this sleep has to be large because although the operation is very fast # it makes the management and cloudera hosts go bonkers, failing all of the health checks sleep(10) # sleep while hosts report problems after the distribution for i in range(60): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break 
module.exit_json(changed=True, msg='Parcel activated') if parcel.stage == 'ACTIVATED': module.exit_json(changed=False, msg='Parcel already activated') # if we get down here, something is not right module.fail_json(msg='Invalid parcel state') # deploy nodes for workers, according to SERVICE_WORKER_MAP # also give them sane names and init zookeeper and kafka ones # which need id's specified elif action_a == 'deploy_service_worker_nodes': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] role_name = SERVICE_WORKER_MAP[service_a]['name'] full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring'] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) nodes = [x for x in service.get_all_roles() if role_name in x.name] # if host already has the given group, we should skip it if host_map[host_a] in [x.hostRef.hostId for x in nodes]: module.exit_json(changed=False, msg='Host already is a {0}'.format(role_name)) # find out the highest id that currently exists else: node_names = [x.name for x in nodes] if len(node_names) == 0: # if no nodes, start numbering from 1 node_i = 1 else: # take the max number and add 1 to it node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1 if service_name == 'ZOOKEEPER': role = service.create_role(full_role_name.format(node_i), 'SERVER', host_a) # zookeeper needs a per-node ID in the configuration, so we set it now role.update_config({'serverId': node_i}) elif service_name == 'KAFKA': role = service.create_role(full_role_name.format(node_i), role_name, host_a) # kafka needs a per-node ID in the configuration, so we set it now role.update_config({'broker.id': node_i}) else: service.create_role(full_role_name.format(node_i), role_name, host_a) module.exit_json(changed=True, msg='Added host to {0} role'.format(role_name)) # deploy a service. just create it, don't do anything more # this is needed maily when we have to set service properties before role deployment elif action_a == 'deploy_service': name_a = module.params.get('name', None) if not name_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(name_a)) service_name = SERVICE_MAP[name_a] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) module.exit_json(changed=True, msg='{0} service created'.format(service_name)) else: module.exit_json(changed=False, msg='{0} service already exists'.format(service_name)) # deploy the base hdfs roles (the namenode and secondary) # this doesn't create the service, as at least one datanode should already be added! 
# the format also requires certain properties to be set before we run it elif action_a == 'deploy_hdfs_base': nn_host_a = module.params.get('nn_host', None) sn_host_a = module.params.get('sn_host', None) changed = False hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't create a secondary namenode when: #- there is one that already exists #- there is a second namenode, which means we have HA and don't need a secondary if not 'HDFS-SECONDARYNAMENODE' in hdfs_roles and not 'HDFS-NAMENODE-2' in hdfs_roles: hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE', sn_host_a) changed = True # create a namenode and format it's FS # formating the namenode requires at least one datanode and secondary namenode already in the cluster! if not 'HDFS-NAMENODE' in hdfs_roles: hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a) for command in hdfs.format_hdfs('HDFS-NAMENODE'): if command.wait().success == False: module.fail_json(msg='Failed formating HDFS namenode with error: {0}'.format(command.resultMessage)) changed = True module.exit_json(changed=changed, msg='Created HDFS service & NN roles') # enable HttpFS for HDFS # HUE require this for support HA in HDFS elif action_a == 'deploy_hdfs_httpfs': host_a = module.params.get('host', None) hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't install second instance of HttpFS if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0: module.exit_json(changed=False, msg='HDFS HttpFS service already exists') hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a]) module.exit_json(changed=True, msg='HDFS HttpFS service created') # enable HA for HDFS # this deletes the secondary namenode and creates a second namenode in it's place # also, this spawns 3 journal node and 2 failover controller roles elif action_a == 'deploy_hdfs_ha': sn_host_a = module.params.get('sn_host', None) jn_names_a = [module.params.get('jn1_host', None), module.params.get('jn2_host', None), module.params.get('jn3_host', None)] hdfs = cluster.get_service('HDFS') # if there's a second namenode, this means we already have HA enabled if not 'HDFS-NAMENODE-2' in [x.name for x in hdfs.get_all_roles()]: # this is bad and I should feel bad # jns is a list of dictionaries, each dict passes the required journalnode parameters jns = [{'jnHostId': host_map[jn_name], 'jnEditsDir': '/data0/hadoop/journal', 'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)} for i, jn_name in enumerate(jn_names_a)] # this call is so long because we set some predictable names for the sevices command = hdfs.enable_nn_ha('HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns, zk_service_name='ZOOKEEPER', active_fc_name='HDFS-FAILOVERCONTROLLER-1', standby_fc_name='HDFS-FAILOVERCONTROLLER-2', standby_name='HDFS-NAMENODE-2') children = command.wait().children for command_children in children: # The format command is expected to fail, since we already formated the namenode if command_children.name != 'Format' and command.success == False: module.fail_json(msg='Command {0} failed when enabling HDFS HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for HDFS service') else: module.exit_json(changed=False, msg='HDFS HA already enabled') # enable HA for YARN elif action_a == 'deploy_rm_ha': sn_host_a = module.params.get('sn_host', None) yarn = cluster.get_service('YARN') # if there are two roles matching to this name, this means HA for 
YARN is enabled if len([0 for x in yarn.get_all_roles() if match('^YARN-RESOURCEMANAGER.*$', x.name) != None]) == 1: command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER') children = command.wait().children for command_children in children: if command.success == False: module.fail_json(msg='Command {0} failed when enabling YARN HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for YARN service') else: module.exit_json(changed=False, msg='YARN HA already enabled') # deploy the base roles for a service, according to BASE_SERVICE_ROLE_MAP # after the deployments run commands specified in BASE_SERVICE_ROLE_MAP elif action_a == 'deploy_base_roles': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] changed = False if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) service_roles = [x.name for x in service.get_all_roles()] # create each service from the map for (role_name, cloudera_name) in BASE_SERVICE_ROLE_MAP[service_a].items(): # check if role already exists, script cant compare it directly # after enabling HA on YARN roles will have random strings in names if len([0 for x in service_roles if match(role_name, x) != None]) == 0: service.create_role(role_name, cloudera_name, host_a) changed = True # init commmands if role_name in SERVICE_INIT_COMMANDS.keys(): for command_to_run in SERVICE_INIT_COMMANDS[role_name]: # different handling of commands specified by name and # ones specified by an instance method if ismethod(command_to_run): command = command_to_run(service) else: command = service.service_command_by_name(command_to_run) if command.wait().success == False: module.fail_json(msg='Running {0} failed with {1}'.format(command_to_run, command.resultMessage)) if changed == True: module.exit_json(changed=True, msg='Created base roles for {0}'.format(service_name)) else: module.exit_json(changed=False, msg='{0} base roles already exist'.format(service_name)) # set config values for a given service/role elif action_a == 'set_config': entity_a = module.params.get('entity', None) service_a = module.params.get('service', None) role_a = module.params.get('role', None) name_a = module.params.get('name', None) value_a = module.params.get('value', None) if not service_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(service_a)) # since management is handled differently, it needs a different service if service_a == 'management': service = manager.get_service() elif service_a == 'cm': service = manager else: service = cluster.get_service(SERVICE_MAP[service_a]) # role and service configs are handled differently if entity_a == 'service': prev_config = service.get_config() curr_config = service.update_config({name_a: value_a}) if service_a == 'cm': prev_config = [prev_config] curr_config = [curr_config] module.exit_json(changed=(str(prev_config[0]) != str(curr_config[0])), msg='Config value for {0}: {1}'.format(name_a, curr_config[0][name_a])) elif entity_a == 'role': if not role_a in ROLE_MAP: module.fail_json(msg='Unknown role: {0}'.format(service)) role = service.get_role_config_group(ROLE_MAP[role_a]) prev_config = role.get_config() curr_config = role.update_config({name_a: value_a}) module.exit_json(changed=(str(prev_config) != str(curr_config)), msg='Config value for {0}: 
{1}'.format(name_a, curr_config[name_a])) else: module.fail_json(msg='Invalid entity, must be one of service, role') # handle service state # currently this only can start/restart a service elif action_a == 'service': state_a = module.params.get('state', None) service_a = module.params.get('service', None) try: if service_a == 'cm': service = manager.get_service() else: service = cluster.get_service(SERVICE_MAP[service_a]) except ApiException: module.fail_json(msg='Service does not exist') # when starting a service, we also deploy the client config for it if state_a == 'started': if service.serviceState == 'STARTED': module.exit_json(changed=False, msg='Service already running') method = service.start verb = "start" elif state_a == 'restarted': method = service.restart verb = "restart" try: command = service.deploy_client_config() if command.wait().success == False: module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage)) # since there is no way to check if a service handles client config deployments # we try our best and pass the exception if it doesn't except ApiException, AttributeError: pass method().wait() # we need to wait for cloudera checks to complete... # otherwise it will report as failing sleep(10) for i in range(24): sleep(10) service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': break service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': module.exit_json(changed=True, msg='Service {0} successful'.format(verb)) else: module.fail_json(msg='Service {0} failed'.format(verb)) # handle cluster # currently this only can restart elif action_a == 'cluster': state_a = module.params.get('state', None) if state_a == 'restarted': command = cluster.restart(redeploy_client_configuration=True) if command.wait().success == False: module.fail_json(msg='Cluster resart failed with {0}'.format(command.resultMessage)) else: module.exit_json(changed=True, msg='Cluster restart successful') # Snapshot policy # only create is supported elif action_a == 'create_snapshot_policy': name_a = module.params.get('name', None) value_a = module.params.get('value', None) service_a = module.params.get('service', None) service = cluster.get_service(SERVICE_MAP[service_a]) payload=loads(value_a) # checking if policy already exists. Exception is expected when configure for the first time. try: test = service.get_snapshot_policy(name_a) module.exit_json(changed=False, msg='Defined policy already exists') except ApiException: pass try: command = service.create_snapshot_policy(payload) module.exit_json(changed=True, msg='Snapshot policy was created.') except ApiException, AttributeError: module.fail_json(msg='ERROR in creating snapshot policy.')
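# The deploy_parcel branch above polls cluster.get_parcel() in hand-rolled sleep
# loops; a sketch of that pattern factored into a helper (an illustrative
# refactoring, not part of the module itself).
from time import sleep

def wait_for_parcel_stage(cluster, product, version, target_stage, poll_secs=10):
    parcel = cluster.get_parcel(product, version)
    while parcel.stage != target_stage:
        if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
        sleep(poll_secs)
        parcel = cluster.get_parcel(product, version)
    return parcel

# e.g. wait_for_parcel_stage(cluster, name_a, version_parcel, 'DOWNLOADED')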
def get_cluster():
    # connect to cloudera manager
    api = ApiResource(CM_HOST, username="******", password="******")
    # Take care of the case where the cluster name has changed.
    # Hopefully users wouldn't use this CM to deploy another cluster manually.
    return (api, api.get_cluster(api.get_all_clusters()[0].name))
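# Illustrative use of get_cluster(); CM_HOST and the redacted credentials are
# assumed to be defined elsewhere in the original script.
api, cluster = get_cluster()
for service in cluster.get_all_services():
    print service.name, service.serviceState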
def main(): module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'}) for argument in MODULE_ARGUMENTS)) api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS, version=9) cluster_name = CLUSTER_NAME manager = api.get_cloudera_manager() action_a = module.params.get('action', None) if action_a == 'create_cluster': license_a = module.params.get('license', None) version_a = module.params.get('version', None) cluster_list = [x.name for x in api.get_all_clusters()] if cluster_name in cluster_list: module.exit_json(changed=False, msg='Cluster exists') else: cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a) if license_a == None: manager.begin_trial() else: manager.update_license(license_a.decode('base64')) module.exit_json(changed=True, msg='Cluster created') elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel', 'deploy_hdfs_base', 'deploy_hdfs_httpfs', 'deploy_hdfs_dn', 'deploy_hdfs_ha', 'deploy_rm_ha', 'set_config', 'service', 'deploy_service', 'deploy_service_worker_nodes', 'deploy_base_roles', 'run_command', 'cluster', 'create_snapshot_policy', 'deploy_configuration']: # more complicated actions that need a created cluster go here cluster = api.get_cluster(cluster_name) host_map = dict((api.get_host(x.hostId).hostname, x.hostId) for x in cluster.list_hosts()) # adds a host to the cluster # host_name should be in the internal DNS format, ip-xx-xx-xx.copute.internal if action_a == 'add_host': host_a = module.params.get('host', None) host_list = host_map.keys() if host_a in host_list: module.exit_json(changed=False, msg='Host already in cluster') else: try: cluster.add_hosts([host_a]) except ApiException: # if a host isn't there, it could be because the agent didn't manage to connect yet # so let's wait a moment for it sleep(120) cluster.add_hosts([host_a]) module.exit_json(changed=True, msg='Host added') # create management service and set it's basic configuration # this needs a separate function since management is handled # differently than the rest of services elif action_a == 'create_mgmt': host_a = module.params.get('host', None) # getting the management service is the only way to check if mgmt exists # an exception means there isn't one try: mgmt = manager.get_service() module.exit_json(changed=False, msg='Mgmt service already exists') except ApiException: pass mgmt = manager.create_mgmt_service(ApiServiceSetupInfo()) # this is ugly... and I see no good way to unuglify it firehose_passwd = Popen("sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") reports_passwd = Popen("sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") # since there is no easy way of configuring the manager... 
let's do it here :( role_conf = defaultdict(dict) role_conf['ACTIVITYMONITOR'] = { 'firehose_database_host': '{0}:7432'.format(host_a), 'firehose_database_user': '******', 'firehose_database_password': firehose_passwd, 'firehose_database_type': 'postgresql', 'firehose_database_name': 'amon', 'firehose_heapsize': '268435456', } role_conf['EVENTSERVER'] = { 'event_server_heapsize': '215964392' } role_conf['REPORTSMANAGER'] = { 'headlamp_database_host': '{0}:7432'.format(host_a), 'headlamp_database_user': '******', 'headlamp_database_password': reports_passwd, 'headlamp_database_type': 'postgresql', 'headlamp_database_name': 'rman', 'headlamp_heapsize': '268435456', } roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER', 'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER'] # create mangement roles for role in roles: mgmt.create_role('{0}-1'.format(role), role, host_map[host_a]) # update configuration of each for group in mgmt.get_all_role_config_groups(): group.update_config(role_conf[group.roleType]) mgmt.start().wait() # after starting this service needs time to spin up sleep(30) module.exit_json(changed=True, msg='Mgmt created and started') # deploy a given parcel on all hosts in the cluster # you can specify a substring of the version ending with latest, for example 5.3-latest instead of 5.3.5-1.cdh5.3.5.p0.4 elif action_a == 'deploy_parcel': name_a = module.params.get('name', None) version_a = module.params.get('version', None) if "latest" in version_a: available_versions = [x.version for x in cluster.get_all_parcels() if x.product == name_a] if "-latest" in version_a: version_substr = match('(.+?)-latest', version_a).group(1) # if version is just "latest", try to check everything else: version_substr = ".*" try: [version_parcel] = [x for x in available_versions if re.match(version_substr, x) != None] except ValueError: module.fail_json(msg='Specified version {0} doesnt appear in {1} or appears twice'.format(version_substr, available_versions)) else: version_parcel = version_a # we now go through various stages of getting the parcel # as there is no built-in way of waiting for an operation to complete # we use loops with sleep to get it done parcel = cluster.get_parcel(name_a, version_parcel) if parcel.stage == 'AVAILABLE_REMOTELY': parcel.start_download() while parcel.stage != 'DOWNLOADED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) sleep(10) if parcel.stage == 'DOWNLOADED': parcel.start_distribution() while parcel.stage != 'DISTRIBUTED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) # sleep while hosts report problems after the download for i in range(12): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break # since parcels are distributed automatically when a new host is added to a cluster # we can encounter the ,,ACTIVATING'' stage then if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING': if parcel.stage == 'DISTRIBUTED': parcel.activate() while parcel.stage != 'ACTIVATED': parcel = cluster.get_parcel(name_a, version_parcel) # this sleep has to be large because although the operation is very fast # it makes the management and cloudera hosts go bonkers, failing all of the health checks sleep(10) # sleep while hosts report problems after the distribution for i in range(60): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break 
module.exit_json(changed=True, msg='Parcel activated') if parcel.stage == 'ACTIVATED': module.exit_json(changed=False, msg='Parcel already activated') # if we get down here, something is not right module.fail_json(msg='Invalid parcel state') # deploy nodes for workers, according to SERVICE_WORKER_MAP # also give them sane names and init zookeeper and kafka ones # which need id's specified elif action_a == 'deploy_service_worker_nodes': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] role_name = SERVICE_WORKER_MAP[service_a]['name'] full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring'] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) nodes = [x for x in service.get_all_roles() if role_name in x.name] # if host already has the given group, we should skip it if host_map[host_a] in [x.hostRef.hostId for x in nodes]: module.exit_json(changed=False, msg='Host already is a {0}'.format(role_name)) # find out the highest id that currently exists else: node_names = [x.name for x in nodes] if len(node_names) == 0: # if no nodes, start numbering from 1 node_i = 1 else: # take the max number and add 1 to it node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1 if service_name == 'ZOOKEEPER': role = service.create_role(full_role_name.format(node_i), 'SERVER', host_a) # zookeeper needs a per-node ID in the configuration, so we set it now role.update_config({'serverId': node_i}) elif service_name == 'KAFKA': role = service.create_role(full_role_name.format(node_i), role_name, host_a) # kafka needs a per-node ID in the configuration, so we set it now role.update_config({'broker.id': node_i}) else: service.create_role(full_role_name.format(node_i), role_name, host_a) module.exit_json(changed=True, msg='Added host to {0} role'.format(role_name)) # deploy a service. just create it, don't do anything more # this is needed maily when we have to set service properties before role deployment elif action_a == 'deploy_service': name_a = module.params.get('name', None) if not name_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(name_a)) service_name = SERVICE_MAP[name_a] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) module.exit_json(changed=True, msg='{0} service created'.format(service_name)) else: module.exit_json(changed=False, msg='{0} service already exists'.format(service_name)) # deploy the base hdfs roles (the namenode and secondary) # this doesn't create the service, as at least one datanode should already be added! 
# the format also requires certain properties to be set before we run it elif action_a == 'deploy_hdfs_base': nn_host_a = module.params.get('nn_host', None) sn_host_a = module.params.get('sn_host', None) changed = False hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't create a secondary namenode when: #- there is one that already exists #- there is a second namenode, which means we have HA and don't need a secondary if not 'HDFS-SECONDARYNAMENODE' in hdfs_roles and not 'HDFS-NAMENODE-2' in hdfs_roles: hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE', sn_host_a) changed = True # create a namenode and format it's FS # formating the namenode requires at least one datanode and secondary namenode already in the cluster! if not 'HDFS-NAMENODE' in hdfs_roles: hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a) for command in hdfs.format_hdfs('HDFS-NAMENODE'): if command.wait().success == False: module.fail_json(msg='Failed formating HDFS namenode with error: {0}'.format(command.resultMessage)) changed = True module.exit_json(changed=changed, msg='Created HDFS service & NN roles') # enable HttpFS for HDFS # HUE require this for support HA in HDFS elif action_a == 'deploy_hdfs_httpfs': host_a = module.params.get('host', None) hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't install second instance of HttpFS if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0: module.exit_json(changed=False, msg='HDFS HttpFS service already exists') hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a]) module.exit_json(changed=True, msg='HDFS HttpFS service created') # enable HA for HDFS # this deletes the secondary namenode and creates a second namenode in it's place # also, this spawns 3 journal node and 2 failover controller roles elif action_a == 'deploy_hdfs_ha': sn_host_a = module.params.get('sn_host', None) jn_dir_a = module.params.get('jn_dir', None) jn_names_a = [module.params.get('jn1_host', None), module.params.get('jn2_host', None), module.params.get('jn3_host', None)] hdfs = cluster.get_service('HDFS') # if there's a second namenode, this means we already have HA enabled if not 'HDFS-NAMENODE-2' in [x.name for x in hdfs.get_all_roles()]: # this is bad and I should feel bad # jns is a list of dictionaries, each dict passes the required journalnode parameters jns = [{'jnHostId': host_map[jn_name], 'jnEditsDir': jn_dir_a, 'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)} for i, jn_name in enumerate(jn_names_a)] # this call is so long because we set some predictable names for the sevices command = hdfs.enable_nn_ha('HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns, zk_service_name='ZOOKEEPER', active_fc_name='HDFS-FAILOVERCONTROLLER-1', standby_fc_name='HDFS-FAILOVERCONTROLLER-2', standby_name='HDFS-NAMENODE-2') children = command.wait().children for command_children in children: # The format command is expected to fail, since we already formated the namenode if command_children.name != 'Format' and command.success == False: module.fail_json(msg='Command {0} failed when enabling HDFS HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for HDFS service') else: module.exit_json(changed=False, msg='HDFS HA already enabled') # enable HA for YARN elif action_a == 'deploy_rm_ha': sn_host_a = module.params.get('sn_host', None) yarn = cluster.get_service('YARN') # if there are two roles matching to 
this name, this means HA for YARN is enabled if len([0 for x in yarn.get_all_roles() if match('^YARN-RESOURCEMANAGER.*$', x.name) != None]) == 1: command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER') children = command.wait().children for command_children in children: if command.success == False: module.fail_json(msg='Command {0} failed when enabling YARN HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for YARN service') else: module.exit_json(changed=False, msg='YARN HA already enabled') # deploy the base roles for a service, according to BASE_SERVICE_ROLE_MAP # after the deployments run commands specified in BASE_SERVICE_ROLE_MAP elif action_a == 'deploy_base_roles': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] changed = False if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) service_roles = [x.name for x in service.get_all_roles()] # create each service from the map for (role_name, cloudera_name) in BASE_SERVICE_ROLE_MAP[service_a].items(): # check if role already exists, script cant compare it directly # after enabling HA on YARN roles will have random strings in names if len([0 for x in service_roles if match(role_name, x) != None]) == 0: service.create_role(role_name, cloudera_name, host_a) changed = True # init commmands if role_name in SERVICE_INIT_COMMANDS.keys(): for command_to_run in SERVICE_INIT_COMMANDS[role_name]: # different handling of commands specified by name and # ones specified by an instance method if ismethod(command_to_run): command = command_to_run(service) else: command = service.service_command_by_name(command_to_run) if command.wait().success == False: module.fail_json(msg='Running {0} failed with {1}'.format(command_to_run, command.resultMessage)) if changed == True: module.exit_json(changed=True, msg='Created base roles for {0}'.format(service_name)) else: module.exit_json(changed=False, msg='{0} base roles already exist'.format(service_name)) # deploy configuration - it always return changed elif action_a == 'deploy_configuration': service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] service = cluster.get_service(service_name) # deploying client configuration command = service.deploy_client_config() if command.wait().success == False: module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage)) module.exit_json(changed=True, msg='Configuration deployed') # set config values for a given service/role elif action_a == 'set_config': entity_a = module.params.get('entity', None) service_a = module.params.get('service', None) role_a = module.params.get('role', None) name_a = module.params.get('name', None) value_a = module.params.get('value', None) if not service_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(service_a)) # since management is handled differently, it needs a different service if service_a == 'management': service = manager.get_service() elif service_a == 'cm': service = manager else: service = cluster.get_service(SERVICE_MAP[service_a]) # role and service configs are handled differently if entity_a == 'service': prev_config = service.get_config() curr_config = service.update_config({name_a: value_a}) if service_a == 'cm': prev_config = [prev_config] curr_config = 
[curr_config] module.exit_json(changed=(str(prev_config[0]) != str(curr_config[0])), msg='Config value for {0}: {1}'.format(name_a, curr_config[0][name_a])) elif entity_a == 'role': if not role_a in ROLE_MAP: module.fail_json(msg='Unknown role: {0}'.format(service)) role = service.get_role_config_group(ROLE_MAP[role_a]) prev_config = role.get_config() curr_config = role.update_config({name_a: value_a}) module.exit_json(changed=(str(prev_config) != str(curr_config)), msg='Config value for {0}: {1}'.format(name_a, curr_config[name_a])) else: module.fail_json(msg='Invalid entity, must be one of service, role') # handle service state # currently this only can start/restart a service elif action_a == 'service': state_a = module.params.get('state', None) service_a = module.params.get('service', None) try: if service_a == 'cm': service = manager.get_service() else: service = cluster.get_service(SERVICE_MAP[service_a]) except ApiException: module.fail_json(msg='Service does not exist') # when starting a service, we also deploy the client config for it if state_a == 'started': if service.serviceState == 'STARTED': module.exit_json(changed=False, msg='Service already running') method = service.start verb = "start" elif state_a == 'restarted': method = service.restart verb = "restart" try: command = service.deploy_client_config() if command.wait().success == False: module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage)) # since there is no way to check if a service handles client config deployments # we try our best and pass the exception if it doesn't except ApiException, AttributeError: pass method().wait() # we need to wait for cloudera checks to complete... # otherwise it will report as failing sleep(10) for i in range(24): sleep(10) service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': break service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': module.exit_json(changed=True, msg='Service {0} successful'.format(verb)) else: module.fail_json(msg='Service {0} failed'.format(verb)) # handle cluster # currently this only can restart elif action_a == 'cluster': state_a = module.params.get('state', None) if state_a == 'restarted': command = cluster.restart(redeploy_client_configuration=True) if command.wait().success == False: module.fail_json(msg='Cluster resart failed with {0}'.format(command.resultMessage)) else: module.exit_json(changed=True, msg='Cluster restart successful') # Snapshot policy # only create is supported elif action_a == 'create_snapshot_policy': name_a = module.params.get('name', None) value_a = module.params.get('value', None) service_a = module.params.get('service', None) service = cluster.get_service(SERVICE_MAP[service_a]) payload=loads(value_a) # checking if policy already exists. Exception is expected when configure for the first time. try: test = service.get_snapshot_policy(name_a) module.exit_json(changed=False, msg='Defined policy already exists') except ApiException: pass try: command = service.create_snapshot_policy(payload) module.exit_json(changed=True, msg='Snapshot policy was created.') except ApiException, AttributeError: module.fail_json(msg='ERROR in creating snapshot policy.')
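# Both 'service' branches above wait in a fixed 24 x 10s loop for serviceState ==
# 'STARTED' and healthSummary == 'GOOD'; a sketch of that wait as a reusable helper
# (illustrative only, not part of the module).
from time import sleep

def wait_until_healthy(get_service, attempts=24, poll_secs=10):
    for _ in range(attempts):
        sleep(poll_secs)
        service = get_service()
        if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
            return True
    return False

# e.g. wait_until_healthy(lambda: cluster.get_service(SERVICE_MAP[service_a]))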
def main(cm_fqhn, cm_user_name, cm_user_password, cm_cluster_name, cm_tls_enabled, cm_tls_cafile): #print cm_fqhn, cm_user_name, cm_user_password, cm_cluster_name, cm_tls_enabled, cm_tls_cafile if cm_tls_enabled == 'false': api = ApiResource(server_host=cm_fqhn, username=cm_user_name, password=cm_user_password) else: #context = ssl.create_default_context(cafile='/opt/cloudera/security/certs/ChainedCA.cert.pem') context = ssl.create_default_context(cafile=cm_tls_cafile) api = ApiResource(server_host=cm_fqhn, username=cm_user_name, password=cm_user_password, use_tls=True, ssl_context=context) # Get a list of all clusters cdh_cluster = None for c in api.get_all_clusters(): if c.name == cm_cluster_name: print '\nCluster:', c cdh_cluster = c for x in cdh_cluster.list_hosts(): HOST_NAME2ID_MAP[api.get_host(x.hostId).hostname] = x.hostId HOST_ID2NAME_MAP[x.hostId] = api.get_host(x.hostId).hostname print '\nHostName to HostId Mapping:' for x in HOST_NAME2ID_MAP: print x, HOST_NAME2ID_MAP[x] print '\nHostId to HostName Mapping:' for x in HOST_ID2NAME_MAP: print x, HOST_ID2NAME_MAP[x] print '\nServices:' for x in cdh_cluster.get_all_services(): print x.type #ZooKeeper #zk_client_port = getKeyValueByServiceTypeAndRoleType(cdh_cluster, # SERVICE_TYPE_MAP['zookeeper'], # SERVICE_ROLE_TYPE_MAP['zookeeper'], # 'clientPort'); zk_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['zookeeper']) zk_server_rcg = getRCGByServiceAndRoleType( zk_service, SERVICE_ROLE_TYPE_MAP['zookeeper_server']) zk_client_port = geValueByKeyInRCG( zk_server_rcg, CONFIG_PROPERTY_MAP['zk_client_port']) if zk_client_port != None: CONFIG_KEY_VALUE_MAP['ZOOKEEPER_PORT'] = zk_client_port zk_hosts = getHostsByServiceAndRoleType( zk_service, SERVICE_ROLE_TYPE_MAP['zookeeper_server']) #print 'ZOOKEEPER HOSTS:', zk_hosts if len(zk_hosts) > 0: CONFIG_KEY_VALUE_MAP['ZOOKEEPER_QUORUM'] = ' '.join(zk_hosts) #HDFS hdfs_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['hdfs']) hdfs_nn_rcg = getRCGByServiceAndRoleType( hdfs_service, SERVICE_ROLE_TYPE_MAP['namenode']) #inspectKVsInRCG(hdfs_nn_rcg) hdfs_nn_ns = geValueByKeyInRCG(hdfs_nn_rcg, CONFIG_PROPERTY_MAP['hdf_nn_ns']) #print 'HDFS NAMENODE NAMESERVICE:', hdfs_nn_ns hdfs_nn_port = geValueByKeyInRCG( hdfs_nn_rcg, CONFIG_PROPERTY_MAP['hdf_nn_port']) #print 'HDFS NAMENODE PORT:', hdfs_nn_port if hdfs_nn_port == None: hdfs_nn_port = CONFIG_KEY_VALUE_MAP['NAME_NODE_PORT'] else: CONFIG_KEY_VALUE_MAP['NAME_NODE_PORT'] = hdfs_nn_port nn_hosts = None if hdfs_nn_ns == None: nn_hosts = getHostsByServiceAndRoleType( hdfs_service, SERVICE_ROLE_TYPE_MAP['namenode']) #print 'HDFS NAMENODE HOSTS:', nn_hosts CONFIG_KEY_VALUE_MAP[ 'NAME_NODE'] = 'hdfs://' + nn_hosts[0] + ':' + hdfs_nn_port else: CONFIG_KEY_VALUE_MAP['NAME_NODE'] = hdfs_nn_ns #YARN yarn_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['yarn']) #inspectRolesByService(yarn_service) #inspectRCGs(yarn_service) yarn_jt_rcg = getRCGByServiceAndRoleType( yarn_service, SERVICE_ROLE_TYPE_MAP['resourcemanager']) #inspectKVsInRCG(yarn_jt_rcg) yarn_rm_address = geValueByKeyInRCG( yarn_jt_rcg, CONFIG_PROPERTY_MAP['yarn_rm_address']) if yarn_rm_address == None: yarn_rm_address = CONFIG_KEY_VALUE_MAP[ 'RESOURCEMANAGER_ADDRESS'] else: CONFIG_KEY_VALUE_MAP[ 'RESOURCEMANAGER_ADDRESS'] = yarn_rm_address rm_hosts = getHostsByServiceAndRoleType( yarn_service, SERVICE_ROLE_TYPE_MAP['resourcemanager']) #print 'YARN RESOURCEMANGER HOSTS:', rm_hosts CONFIG_KEY_VALUE_MAP[ 'JOB_TRACKER'] = rm_hosts[0] + ':' + yarn_rm_address 
#OOZIE oozie_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['oozie']) #inspectConfigByService(oozie_service) oozie_use_ssl = getValueByKeyServiceConfig( oozie_service, CONFIG_PROPERTY_MAP['oozie_use_ssl']) #print 'OOZIE TLS/SSL:', oozie_use_ssl if oozie_use_ssl == 'true': CONFIG_KEY_VALUE_MAP['OOZIE_USE_SSL'] = 'true' oozie_LB = getValueByKeyServiceConfig( oozie_service, CONFIG_PROPERTY_MAP['oozie_load_balancer']) #inspectRolesByService(oozie_service) #inspectRCGs(oozie_service) oozie_server_rcg = getRCGByServiceAndRoleType( oozie_service, SERVICE_ROLE_TYPE_MAP['oozie_server']) #inspectKVsInRCG(oozie_server_rcg) oozie_http_port = geValueByKeyInRCG( oozie_server_rcg, CONFIG_PROPERTY_MAP['oozie_http_port']) oozie_https_port = geValueByKeyInRCG( oozie_server_rcg, CONFIG_PROPERTY_MAP['oozie_https_port']) if oozie_http_port == None: oozie_http_port = CONFIG_KEY_VALUE_MAP['OOZIE_HTTP_PORT'] if oozie_https_port == None: oozie_https_port = CONFIG_KEY_VALUE_MAP['OOZIE_HTTPS_PORT'] #print 'OOZIE http(s) ports:', oozie_http_port, oozie_https_port oozie_hosts = getHostsByServiceAndRoleType( oozie_service, SERVICE_ROLE_TYPE_MAP['oozie_server']) #print oozie_hosts if CONFIG_KEY_VALUE_MAP['OOZIE_USE_SSL'] == 'true': if oozie_LB != None: CONFIG_KEY_VALUE_MAP['OOZIE_URL'] = 'https://' + oozie_LB else: CONFIG_KEY_VALUE_MAP['OOZIE_URL'] = 'https://' + oozie_hosts[0] + ':' + CONFIG_KEY_VALUE_MAP['OOZIE_HTTPS_PORT'] + '/oozie' else: if oozie_LB != None: CONFIG_KEY_VALUE_MAP['OOZIE_URL'] = 'http://' + oozie_LB else: CONFIG_KEY_VALUE_MAP['OOZIE_URL'] = 'http://' + oozie_hosts[0] + ':' + CONFIG_KEY_VALUE_MAP['OOZIE_HTTP_PORT'] + '/oozie' #HBASE hbase_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['hbase']) #inspectConfigByService(hbase_service) #inspectRolesByService(hbase_service) hbase_rs_rcg = getRCGByServiceAndRoleType( hbase_service, SERVICE_ROLE_TYPE_MAP['hbase_restserver']) #inspectKVsInRCG(hbase_rs_rcg) hbase_rs_port = geValueByKeyInRCG( hbase_rs_rcg, CONFIG_PROPERTY_MAP['hbase_rs_port']) if hbase_rs_port != None: CONFIG_KEY_VALUE_MAP['HBASE_REST_PORT'] = hbase_rs_port hbase_rs_hosts = getHostsByServiceAndRoleType( hbase_service, SERVICE_ROLE_TYPE_MAP['hbase_restserver']) CONFIG_KEY_VALUE_MAP['HBASE_REST_IP'] = hbase_rs_hosts[0] #KAFKA kafka_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['kafka']) #inspectConfigByService(kafka_service) #inspectRolesByService(kafka_service) kafka_broker_rcg = getRCGByServiceAndRoleType( kafka_service, SERVICE_ROLE_TYPE_MAP['kafka_broker']) #inspectKVsInRCG(kafka_broker_rcg) kafka_client_security_protocol = geValueByKeyInRCG( kafka_broker_rcg, CONFIG_PROPERTY_MAP['kafka_client_security_protocol']) if kafka_client_security_protocol != None: CONFIG_KEY_VALUE_MAP['KAFKA_SECURITY_PROTOCOL'] = kafka_client_security_protocol kafka_broker_hosts = getHostsByServiceAndRoleType( kafka_service, SERVICE_ROLE_TYPE_MAP['kafka_broker']) if len(kafka_broker_hosts) > 0: CONFIG_KEY_VALUE_MAP['KAFKA_BROKER'] = ' '.join(kafka_broker_hosts) # Print all print '\nOUTPUT:\n', CONFIG_KEY_VALUE_MAP
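# The helpers used above (getServiceByServiceType, getRCGByServiceAndRoleType,
# geValueByKeyInRCG, getHostsByServiceAndRoleType) are defined elsewhere in the
# script; a plausible minimal sketch of them, assuming they wrap the obvious
# cm_api calls:
def getServiceByServiceType(cluster, service_type):
    # First service of the requested type, or None.
    for s in cluster.get_all_services():
        if s.type == service_type:
            return s
    return None

def getRCGByServiceAndRoleType(service, role_type):
    # First role config group of the requested role type, or None.
    for rcg in service.get_all_role_config_groups():
        if rcg.roleType == role_type:
            return rcg
    return None

def geValueByKeyInRCG(rcg, key):
    # Explicitly set value if present, otherwise None (callers fall back to defaults).
    config = rcg.get_config(view='full')
    return config[key].value if key in config else None

def getHostsByServiceAndRoleType(service, role_type):
    # Hostnames of all role instances of the given role type.
    return [HOST_ID2NAME_MAP[r.hostRef.hostId]
            for r in service.get_roles_by_type(role_type)]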
def main(argv): #choosing a date format for the report fmt = '%Y-%m-%d %H:%M:%S %Z' current_datetime = datetime.datetime.now() current_date = current_datetime.date() str_current_datetime = str(current_datetime) str_current_date = str(current_date) ### Initialize script mail_content_file = "/root/scripts/mail_content_{0}".format(str_current_date) print mail_content_file ### Settings to connect to BDR cluster #This is a one time setup cm_host = "cm_host" cm_port = "7180" cm_login = "******" cm_password = "******" bdr_cluster_name = "your backup cluster name" #This program takes one parameter called limit, which limits the most recent N instances of a job to be reported #to get only the most recent run set Limit to 1 limit = 1 if len(argv) == 1: usage = 'Usage: %s <limit>' % (argv[0]) print usage quit(1) elif len(argv) == 2: if argv[1].isdigit(): limit = argv[1] else: limit = 7 else: limit = 1 print 'Limit: %s' % (str(limit)) #These variables are used later in a loop bdr_cluster = None hdfs_service = None hive_service = None ### Connect to CM print "\nConnecting to Cloudera Manager at " + cm_host + ":" + cm_port api = ApiResource(server_host=cm_host, server_port=cm_port, username=cm_login, password=cm_password) ### Get BDR Cluster clusters = api.get_all_clusters() for cluster in clusters: if cluster.displayName == bdr_cluster_name: bdr_cluster = cluster break if bdr_cluster is None: print "Error: Cluster '" + bdr_cluster_name + "' not found" quit(1) ### Get Hive Service service_list = bdr_cluster.get_all_services() for service in service_list: if service.type == "HIVE": hive_service = service break if hive_service is None: print "Error: Could not locate Hive Service" quit(1) ### Get HDFS Service service_list = bdr_cluster.get_all_services() for service in service_list: if service.type == "HDFS": hdfs_service = service break if hdfs_service is None: print "Error: Could not locate HDFS Service" quit(1) #open the mail content file for writing fp = open(mail_content_file, 'w') ### Begin: Hive Replication formatted_str = "\n### Begin: Hive replications ###".format() print formatted_str fp.write(formatted_str) #header format for hive replication #Status StartTime EndTime Database Message formatted_str = "\nStatus\tStart\tEnd\tDB\tMessage".format() print formatted_str fp.write(formatted_str) schedules = hive_service.get_replication_schedules() ## Iterate through all replication schedules for schedule in schedules: ## Get the Hive Replication Arguments hive_args = schedule.hiveArguments replicate_data = hive_args.replicateData ## Get the HDFS Replication Arguments for the Hive job if replicate_data: hdfs_args = hive_args.hdfsArguments ## get the replication schedule ID id = str(schedule.id) ## Get the history of commands for the scheduled Hive replication command_history = hive_service.get_replication_command_history(schedule_id=schedule.id, limit=limit, view='full') ## for each replication command for this schedule for command in command_history: if command.hiveResult is None: continue hive_result = command.hiveResult if hive_result.tables is None: continue tables = hive_result.tables database_name = '' for table in tables: database_name = table.database break start_time = command.startTime.strftime(fmt) result_message = '' if command.resultMessage: result_message = command.resultMessage if command.active: formatted_str = "\nRunning\t{0}\t{1}\t\t{2}".format(start_time, database_name, result_message) print formatted_str fp.write(formatted_str) else: end_time = command.endTime.strftime(fmt) if not 
command.success: formatted_str = "\n****Failed\t{0}\t{1}\t{2}\t\t{3}".format(start_time, end_time, database_name, result_message) print formatted_str fp.write(formatted_str) else: formatted_str = "\nSucceeded\t{0}\t{1}\t{2}\t\t{3}".format(start_time, end_time, database_name, result_message) print formatted_str fp.write(formatted_str) ############################## ### End: Hive replications ### ############################## ### Begin: HDFS Replication formatted_str = "\n\n### Begin: HDFS replications ###".format() print formatted_str fp.write(formatted_str) #header format for hdfs replication #Status StartTime EndTime HDFS_Path Message Files_Expected Files_Copied Files_Skipped Files_Failed formatted_str = "\nStatus\tStart\tEnd\tPath\tMessage\tFiles Expected\tFiles Copied\tFiles Skipped\tFiles Failed".format() print formatted_str fp.write(formatted_str) schedules = hdfs_service.get_replication_schedules() ### Iterate through all replication schedules for schedule in schedules: ### Get the HDFS Arguments hdfs_args = schedule.hdfsArguments ### get the replication schedule ID id = str(schedule.id) ## Get the history of commands for the scheduled HDFS replication command_history = hdfs_service.get_replication_command_history(schedule_id=schedule.id, limit=limit, view='full') for command in command_history: if command.hdfsResult is None: continue hdfs_result = command.hdfsResult start_time = command.startTime.strftime(fmt) source_path = hdfs_args.sourcePath numFilesExpected = hdfs_result.numFilesExpected numFilesCopied = hdfs_result.numFilesCopied numFilesSkipped = hdfs_result.numFilesSkipped numFilesCopyFailed = hdfs_result.numFilesCopyFailed result_message = '' if command.resultMessage: result_message = command.resultMessage if command.active: formatted_str = "\nRunning\t{0}\t{1}\t\t{2}\t{3}\t{4}\t{5}\t{6}".format(start_time, source_path, result_message, str(numFilesExpected), str(numFilesCopied), str(numFilesSkipped), str(numFilesCopyFailed)) print formatted_str fp.write(formatted_str) else: end_time = command.endTime.strftime(fmt) if not command.success: formatted_str = "\n****Failed\t{0}\t{1}\t{2}\t\t{3}\t{4}\t{5}\t{6}\t{7}".format(start_time, end_time, source_path, result_message, str(numFilesExpected), str(numFilesCopied), str(numFilesSkipped), str(numFilesCopyFailed)) print formatted_str fp.write(formatted_str) else: formatted_str = "\nSucceeded\t{0}\t{1}\t{2}\t\t{3}\t{4}\t{5}\t{6}\t{7}".format(start_time, end_time, source_path, result_message, str(numFilesExpected), str(numFilesCopied), str(numFilesSkipped), str(numFilesCopyFailed)) print formatted_str fp.write(formatted_str) ############################## ### End: HDFS replications ### ############################## #print the hostname and the current time at the end of report and close the mail content file hostname = socket.gethostname() formatted_str = "\n\nCurrent Time on {0} is {1}".format(hostname, str_current_datetime) print formatted_str fp.write(formatted_str) fp.close() #send email from_addr = 'from address' to_addr = 'to address' mail_subject = 'Report from %s - Daily BDR Status Report %s' % (hostname, str_current_date) send_email(from_addr, to_addr, mail_subject, mail_content_file) quit(0)
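# send_email is not shown in this script; a minimal sketch of what it might look
# like, assuming a local MTA on port 25 (stdlib only, details are assumptions):
import smtplib
from email.mime.text import MIMEText

def send_email(from_addr, to_addr, subject, content_file):
    # Read the report body written earlier and send it as plain text.
    with open(content_file) as f:
        msg = MIMEText(f.read())
    msg['Subject'] = subject
    msg['From'] = from_addr
    msg['To'] = to_addr
    smtp = smtplib.SMTP('localhost')
    smtp.sendmail(from_addr, [to_addr], msg.as_string())
    smtp.quit()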
class CmCluster(Cluster): def __init__(self, host_name, port=None, user="******", password="******", cluster_name=None, ssh_user=None, ssh_port=None, ssh_key_file=None, use_tls=False): # Initialize strptime() to workaround https://bugs.python.org/issue7980. Apparently # something in the CM API uses strptime(). strptime("2015", "%Y") Cluster.__init__(self) # IMPALA-5455: If the caller doesn't specify port, default it based on use_tls if port is None: if use_tls: port = CM_TLS_PORT else: port = CM_CLEAR_PORT self.cm = CmApiResource(host_name, server_port=port, username=user, password=password, use_tls=use_tls) clusters = self.cm.get_all_clusters() if not clusters: raise Exception("No clusters found in CM at %s" % host_name) if cluster_name: clusters_by_name = dict((c.name, c) for c in clusters) if cluster_name not in clusters_by_name: raise Exception(("No clusters named %s found in CM at %s." "Available clusters are %s.") % (cluster_name, host_name, ", ".join(sorted(clusters_by_name.keys())))) self.cm_cluster = clusters_by_name[cluster_name] else: if len(clusters) > 1: raise Exception(("Too many clusters found in CM at %s;" " a cluster name must be provided") % host_name) self.cm_cluster = clusters[-1] self.ssh_user = ssh_user self.ssh_port = ssh_port self.ssh_key_file = ssh_key_file self._ssh_client_lock = Lock() self._ssh_clients_by_host_name = defaultdict(list) def shell(self, cmd, host_name, timeout_secs=DEFAULT_TIMEOUT): with self._ssh_client(host_name) as client: return client.shell(cmd, timeout_secs=timeout_secs) @contextmanager def _ssh_client(self, host_name): """Returns an SSH client for use in a 'with' block. When the 'with' context exits, the client will be kept for reuse. """ with self._ssh_client_lock: clients = self._ssh_clients_by_host_name[host_name] if clients: client = clients.pop() else: # IMPALA-7460: Insulate this import away from the global context so as to avoid # requiring Paramiko unless it's absolutely needed. from tests.util.ssh_util import SshClient LOG.debug("Creating new SSH client for %s", host_name) client = SshClient() client.connect(host_name, username=self.ssh_user, key_filename=self.ssh_key_file) error_occurred = False try: yield client except Exception: error_occurred = True raise finally: if not error_occurred: with self._ssh_client_lock: self._ssh_clients_by_host_name[host_name].append(client) def _init_local_hadoop_conf_dir(self): self._local_hadoop_conf_dir = mkdtemp() data = StringIO(self.cm.get("/clusters/%s/services/%s/clientConfig" % (self.cm_cluster.name, self._find_service("HIVE").name))) zip_file = ZipFile(data) for name in zip_file.namelist(): if name.endswith("/"): continue extract_path = os.path.join(self._local_hadoop_conf_dir, os.path.basename(name)) with open(extract_path, "w") as conf_file: conf_file.write(zip_file.open(name).read()) def _find_service(self, service_type): """Find a service by its CM API service type. An exception will be raised if no service is found or multiple services are found. See the CM API documentation for more details about the service type. """ services = [s for s in self.cm_cluster.get_all_services() if s.type == service_type] if not services: raise Exception("No service of type %s found in cluster %s" % (service_type, self.cm_cluster.name)) if len(services) > 1: raise Exception("Found %s services in cluster %s; only one is expected." % len(services, self.cm_cluster.name)) return services[0] def _find_role(self, role_type, service_type): """Find a role by its CM API role and service type. 
An exception will be raised if no roles are found. See the CM API documentation for more details about the service and role types. """ service = self._find_service(service_type) roles = service.get_roles_by_type(role_type) if not roles: raise Exception("No roles of type %s found in service %s" % (role_type, service.name)) return roles[0] def _init_hdfs(self): self._hdfs = Hdfs(self, "hdfs") def _init_hive(self): hs2 = self._find_role("HIVESERVER2", "HIVE") host = self.cm.get_host(hs2.hostRef.hostId) config = hs2.get_config(view="full")["hs2_thrift_address_port"] self._hive = Hive(self, str(host.hostname), int(config.value or config.default)) def _init_impala(self): self._impala = CmImpala(self, self._find_service("IMPALA"))
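# For context, the HiveServer2 lookup in _init_hive() maps to plain cm_api calls.
# A sketch with the stock client; host name and credentials are placeholders.
from cm_api.api_client import ApiResource

api = ApiResource('cm-host.example.com', username='admin', password='admin')
cluster = api.get_all_clusters()[0]
hive = [s for s in cluster.get_all_services() if s.type == 'HIVE'][0]
hs2 = hive.get_roles_by_type('HIVESERVER2')[0]
host = api.get_host(hs2.hostRef.hostId)
port_cfg = hs2.get_config(view='full')['hs2_thrift_address_port']
# Fall back to the CM-reported default when the port was never overridden.
print host.hostname, int(port_cfg.value or port_cfg.default)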
def install_java_8(region, stack_name): # following general protocol for upgrading to JDK 1.8 here: # http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cdh_cm_upgrading_to_jdk8.html ec2_conn = create_ec2_connection(region) manager_instance = get_manager_instance(ec2_conn, stack_name) cluster_instances = ( get_worker_instances(ec2_conn, stack_name) + [manager_instance, get_master_instance(ec2_conn, stack_name)]) cluster_hosts = [i.ip_address for i in cluster_instances] with cm_tunnel_ctx(manager_instance) as local_port: # Connect to CM API cm_api = ApiResource('localhost', username='******', password='******', server_port=local_port, version=9) cloudera_manager = cm_api.get_cloudera_manager() # Stop Cloudera Management Service print "Stopping Cloudera Management Service" mgmt_service = cloudera_manager.get_service() mgmt_service.stop().wait() # Stop cluster print "Stopping the cluster" clusters = cm_api.get_all_clusters() cluster = clusters.objects[0] cluster.stop().wait() # Stop all Cloudera Manager Agents @parallel def stop_cm_agents(): sudo('service cloudera-scm-agent stop') execute(stop_cm_agents, hosts=cluster_hosts) # Stop the Cloudera Manager Server def stop_cm_server(): sudo('service cloudera-scm-server stop') execute(stop_cm_server, hosts=[manager_instance.ip_address]) # Cleanup other Java versions and install JDK 1.8 @parallel def swap_jdks(): sudo('rpm -qa | grep jdk | xargs rpm -e') sudo('rm -rf /usr/java/jdk1.6*') sudo('rm -rf /usr/java/jdk1.7*') run('wget -O jdk-8-linux-x64.rpm --no-cookies --no-check-certificate ' '--header "Cookie: oraclelicense=accept-securebackup-cookie" ' 'http://download.oracle.com/otn-pub/java/jdk/8u51-b16/' 'jdk-8u51-linux-x64.rpm') sudo('yum install -y jdk-8-linux-x64.rpm') append('/home/ec2-user/.bash_profile', 'export JAVA_HOME=`find /usr/java -name "jdk1.8*"`') execute(swap_jdks, hosts=cluster_hosts) # Start the Cloudera Manager Server def start_cm_server(): sudo('service cloudera-scm-server start') execute(start_cm_server, hosts=[manager_instance.ip_address]) # Start all Cloudera Manager Agents @parallel def start_cm_agents(): sudo('service cloudera-scm-agent start') execute(start_cm_agents, hosts=cluster_hosts) with cm_tunnel_ctx(manager_instance) as local_port: # Connect to CM API cm_api = ApiResource('localhost', username='******', password='******', server_port=local_port, version=9) cloudera_manager = cm_api.get_cloudera_manager() # Start the cluster and the mgmt service print "Starting the cluster" cluster.start().wait() print "Starting the Cloudera Management Service" cloudera_manager = cm_api.get_cloudera_manager() mgmt_service = cloudera_manager.get_service() mgmt_service.start().wait()
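# cm_tunnel_ctx is assumed to forward a free local port to CM on the manager host
# over SSH; the helper is not shown above, so this is only a rough sketch of one
# possible implementation (ssh user, wait time and options are assumptions).
import socket
import subprocess
import time
from contextlib import contextmanager

@contextmanager
def cm_tunnel_ctx(manager_instance, remote_port=7180, ssh_user='ec2-user'):
    # Grab an unused local port, then open ssh -L local:localhost:remote.
    sock = socket.socket()
    sock.bind(('', 0))
    local_port = sock.getsockname()[1]
    sock.close()
    tunnel = subprocess.Popen(
        ['ssh', '-N', '-o', 'StrictHostKeyChecking=no',
         '-L', '%d:localhost:%d' % (local_port, remote_port),
         '%s@%s' % (ssh_user, manager_instance.ip_address)])
    try:
        time.sleep(2)  # crude wait for the tunnel to come up
        yield local_port
    finally:
        tunnel.terminate()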
def main(): api = ApiResource('r2341-d5-us01', username='******', password='******') # is get_all_clusters() empty? what = api.get_all_clusters() print "what:", what
for s in api.get_all_clusters()[0].get_all_services(view='full'): if s.type == stype: print "name: %s displayName: %s" % (s.name, s.displayName) config = s.get_config()[0] print config def print_impala_resource_configs(api, stype): for s in api.get_all_clusters()[0].get_all_services(view='full'): if s.type == stype: print "name: %s displayName: %s" % (s.name, s.displayName) config = s.get_config()[0] print config for service in api.get_all_clusters()[0].get_all_services(): print "** Service name:%s type: %s displayName: %s" % (service.name, service.type, service.displayName) config = service.get_config() print print "Service Config: %s" % str(config) print for rcg in service.get_all_role_config_groups(): print "rcg name: %s rcg display name: %s" % (rcg.name, rcg.displayName) config = rcg.get_config() print print "rcg config: %s" % str(config) for role in service.get_all_roles(): print print "role name: %s role type: %s" % (role.name, role.type) print print "role config: %s" % (role.get_config())
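# When pulling single values out of these dumps, each ApiConfig entry exposes both
# the explicitly set value and the CM default. A small helper for that pattern
# (works for roles and role config groups, which return plain dicts):
def get_effective_value(role_or_rcg, key):
    full = role_or_rcg.get_config(view='full')
    if key not in full:
        return None
    entry = full[key]
    return entry.value if entry.value is not None else entry.default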
def main(): api = ApiResource('r2341-d5-us01',username='******',password='******') clusters = api.get_all_clusters() print "clusters:", clusters if len(clusters) == 0: print "none"
class APIClient: def __init__(self, cm_host, cm_user, cm_pass, cluster_name=None): self.SERVICE_HIVE = 'HIVE' self.SERVICE_HUE = 'HUE' self.SERVICE_IMPALA = 'IMPALA' self.SERVICE_SOLR = 'SOLR' self.SERVICE_YARN = 'YARN' self.SERVICE_HDFS = 'HDFS' self.SERVICE_HBASE = 'HBASE' self.SERVICE_ZK = 'ZOOKEEPER' self.SERVICE_SENTRY = 'SENTRY' self.api = ApiResource( cm_host, username=cm_user, password=cm_pass, ) self.cluster = None self.services = {} for c in self.api.get_all_clusters(): if cluster_name is None or cluster_name == c.name: self.cluster = c break for service in self.cluster.get_all_services(): self.services[service.type] = service @staticmethod def get_api_client(cluster_name, cm_host, cm_user, cm_pass): return APIClient(cm_host, cm_user, cm_pass, cluster_name=cluster_name) def has_sentry(self): """ This function checks if sentry service is available in the cluster :return: boolean """ return self.SERVICE_SENTRY in self.services def has_hbase(self): """ This function checks if hbase service is available in the cluster :return: boolean """ return self.SERVICE_HBASE in self.services def get_impala_service(self): """ This function checks if sentry service is available in the cluster :return: boolean """ if self.SERVICE_IMPALA in self.services: return self.services[self.SERVICE_IMPALA] return None def get_hiveserver2_service(self): """ This function returns the hiveserver2 service instance :return: boolean """ if self.SERVICE_HIVE in self.services: return self.services[self.SERVICE_HIVE] return None def get_hbase_service(self): """ This function returns the hbase service instance :return: boolean """ if self.SERVICE_HBASE in self.services: return self.services[self.SERVICE_HBASE] return None def get_hdfs_service(self): """ This function returns the hdfs service instance :return: boolean """ if self.SERVICE_HDFS in self.services: return self.services[self.SERVICE_HDFS] return None def enable_sentry(self): service_list = [ self.SERVICE_HIVE, self.SERVICE_IMPALA, self.SERVICE_YARN, self.SERVICE_HUE ] for s_name in service_list: if s_name in self.services: className = s_name.capitalize() + "APIClient" module = importlib.import_module("api." + className) class_ = getattr(module, className) client = class_(self.services[s_name]) client.enable_sentry() def enable_kerberos(self): service_list = [ self.SERVICE_HDFS, self.SERVICE_ZK, self.SERVICE_HBASE, self.SERVICE_SOLR ] for s_name in service_list: if s_name in self.services: className = s_name.capitalize() + "APIClient" module = importlib.import_module("api." + className) class_ = getattr(module, className) client = class_(self.services[s_name]) client.enable_kerberos() def enable_impala_vip(self, host): impala_service = self.get_impala_service() ImpalaAPIClient(impala_service).enable_load_balancer(host) def disable_impala_vip(self): impala_service = self.get_impala_service() ImpalaAPIClient(impala_service).disable_load_balancer() def hiveserver2_create_role(self, host, i): hive_service = self.get_hiveserver2_service() HiveAPIClient(hive_service).add_hs2_role(host, i) def enable_hive_vip(self, host): service = self.get_hiveserver2_service() HiveAPIClient(service).enable_load_balancer(host) def disable_hive_vip(self): service = self.get_hiveserver2_service() HiveAPIClient(service).disable_load_balancer() def enable_hbase_authorization(self): service = self.get_hbase_service() HbaseAPIClient(service).enable_authorization() def enable_sentry_hdfs_sync(self, prefixes): service = self.get_hdfs_service() HdfsAPIClient(service).sentry_sync(prefixes)
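# A usage sketch for the wrapper above; the cluster name, CM host and credentials
# are placeholders.
client = APIClient.get_api_client('cluster', 'cm-host.example.com', 'admin', 'admin')
if client.has_sentry():
    print "Sentry is deployed - wiring dependent services"
    client.enable_sentry()
if client.has_hbase():
    client.enable_hbase_authorization()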
def do_call(user, password, man_host, man_port, cluster_name, parcel_name, parcel_version, parcel_repo, init_pre_dir, init_post_dir): api = ApiResource(man_host, man_port, user, password, False, MAN_API_VERSION) if not parcel_repo.endswith('/'): parcel_repo += '/' if re.match(REGEX_VERSION, parcel_version) is None or re.match( REGEX_VERSION, parcel_version).group() != parcel_version: raise Exception('Parcel [' + parcel_name + '] is qualified by invalid version [' + parcel_version + '] expected to match regular expression [' + REGEX_VERSION + ']') if not parcel_repo.endswith(parcel_version + '/'): raise Exception('Parcel [' + parcel_name + '] is qualified by invalid version [' + parcel_version + '] when compared with repository [' + parcel_repo + ']') cm_config = api.get_cloudera_manager().get_config(view='full') repo_config = cm_config['REMOTE_PARCEL_REPO_URLS'] repo_list = repo_config.value or repo_config.default if parcel_repo not in repo_list: repo_list += ',' + parcel_repo api.get_cloudera_manager().update_config( {'REMOTE_PARCEL_REPO_URLS': repo_list}) time.sleep( POLL_SEC ) # The parcel synchronize end-point is not exposed via the API, so sleep instead cluster_names = [] if cluster_name is None: for cluster in api.get_all_clusters(): cluster_names.append(cluster.name) else: cluster_names.append(cluster_name) for cluster_name_itr in cluster_names: print 'Cluster [DEPLOYMENT] starting ... ' cluster = api.get_cluster(cluster_name_itr) parcel = cluster.get_parcel(parcel_name, parcel_version) parcel_already_activated = False print 'Parcel [DEPLOYMENT] starting ... ' if parcel.stage == 'ACTIVATED': parcel_already_activated = True print 'Parcel [DEPLOYMENT] already deployed' else: do_parcel_op(cluster, parcel_name, parcel_version, 'DOWNLOAD', 'AVAILABLE_REMOTELY', 'DOWNLOADED', 'start_download') do_parcel_op(cluster, parcel_name, parcel_version, 'DISTRIBUTE', 'DOWNLOADED', 'DISTRIBUTED', 'start_distribution') do_parcel_op(cluster, parcel_name, parcel_version, 'ACTIVATE', 'DISTRIBUTED', 'ACTIVATED', 'activate') parcel = cluster.get_parcel(parcel_name, parcel_version) if parcel.stage != 'ACTIVATED': raise Exception('Parcel is currently mid-stage [' + parcel.stage + '], please wait for this to complete') print 'Parcel [DEPLOYMENT] finished' if init_pre_dir is not None and os.path.isdir(init_pre_dir): print 'Cluster [PRE_INIT] starting ... ' for script in glob.glob(init_pre_dir + '/*.sh'): subprocess.call([script]) print 'Cluster [PRE_INIT] finished' if not parcel_already_activated: print 'Cluster [CONFIG_DEPLOYMENT] starting ... ' cluster.deploy_client_config() cmd = cluster.deploy_client_config() if not cmd.wait(TIMEOUT_SEC).success: raise Exception('Failed to deploy client configs') print 'Cluster [CONFIG_DEPLOYMENT] finished' print 'Cluster [RESTART] starting ... ' for service in cluster.get_all_services(): if service.type == 'FLUME': service.restart().wait() if service.type == 'HIVE': service.restart().wait() if service.type == 'YARN': service.restart().wait() print 'Cluster [RESTART] finished' if init_post_dir is not None and os.path.isdir(init_post_dir): print 'Cluster [POST_INIT] starting ... ' for script in glob.glob(init_post_dir + '/*.sh'): subprocess.call([script]) print 'Cluster [POST_INIT] finished' print 'Cluster [DEPLOYMENT] finished'
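# do_parcel_op is not shown above; a plausible sketch, assuming it starts the named
# parcel command and polls the parcel stage until the expected transition finishes
# (the polling and error handling here are assumptions, not the original helper).
import time

def do_parcel_op(cluster, parcel_name, parcel_version, label,
                 stage_before, stage_after, command_name):
    parcel = cluster.get_parcel(parcel_name, parcel_version)
    if parcel.stage == stage_after:
        print 'Parcel [' + label + '] already done'
        return
    if parcel.stage != stage_before:
        raise Exception('Parcel is in unexpected stage [' + parcel.stage + ']')
    getattr(parcel, command_name)()  # e.g. start_download / start_distribution / activate
    while True:
        parcel = cluster.get_parcel(parcel_name, parcel_version)
        if parcel.stage == stage_after:
            break
        if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
        time.sleep(POLL_SEC)
    print 'Parcel [' + label + '] finished'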
'AD_KDC_DOMAIN': kerberos_ad_ou, 'KDC_HOST': kerberos_ad_server, 'KDC_TYPE': 'Active Directory', 'KRB_MANAGE_KRB5_CONF': 'true', 'KRB_ENC_TYPES': 'aes256-cts', 'SECURITY_REALM': kerberos_ad_realm }) print 'Import KDC credentials' cmd = cm.import_admin_credentials(kerberos_cm_principal, krb_pwd).wait() if not cmd.success: raise Exception('Command %s failed (%s)' % (cmd.name, cmd.resultMessage)) print 'Configure Kerberos for cluster services' if api_version >= 11: for cluster in api.get_all_clusters(): cmd = cluster.configure_for_kerberos(1004, 1006).wait() if not cmd.success: raise Exception('Command %s failed (%s)' % (cmd.name, cmd.resultMessage)) else: CFG = yaml.load(''' ZOOKEEPER: config: enableSecurity: true HDFS: config: hadoop_security_authentication: kerberos hadoop_security_authorization: true roleConfigGroups: DATANODE: dfs_datanode_data_dir_perm: 700
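# For API versions below 11 the snippet falls back to the CFG mapping (only partly
# shown above). A hedged sketch of how such a mapping could be applied, assuming the
# keys are CM service types carrying service-level config plus role-config-group
# overrides keyed by role type:
def apply_kerberos_config(cluster, cfg):
    for service in cluster.get_all_services():
        if service.type not in cfg:
            continue
        entry = cfg[service.type]
        if entry.get('config'):
            service.update_config(entry['config'])
        for rcg in service.get_all_role_config_groups():
            overrides = (entry.get('roleConfigGroups') or {}).get(rcg.roleType)
            if overrides:
                rcg.update_config(overrides)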