def init_cluster():
    # wait for all cloudera agent processes to come up
    BDVLIB_ServiceWait(
        [["services", "cloudera_scm_agent", NODE_GROUP_ID, "kts"]])
    # make sure cloudera manager has received registration
    # for all new agents
    all_cloudera_hosts = get_hosts_for_service(
        ["services", "cloudera_scm_agent"])
    api = ApiResource(CM_HOST, username="******", password="******")
    while True:
        current_all_hosts = map(lambda x: x.hostname, api.get_all_hosts())
        setup_logger.info("Currently registered hosts with CM " +
                          str(current_all_hosts))
        if all(x in current_all_hosts for x in all_cloudera_hosts):
            break
        setup_logger.info(
            "waiting for new nodes to register with cloudera manager")
        time.sleep(10)
    manager = api.get_cloudera_manager()
    manager.update_config(CM_CONFIG)
    cluster = api.create_cluster(CLUSTER_NAME, CDH_MAJOR_VERSION,
                                 CDH_FULL_VERSION)
    KTS_HOSTS = ConfigMeta.getWithTokens(
        ['nodegroups', NODE_GROUP_ID, 'roles', 'kts', 'fqdns'])
    cluster.add_hosts(KTS_HOSTS)
    return (cluster, manager)
def init_cluster():
    # wait for all cloudera agent processes to come up
    setup_logger.info("Creating Cluster.")
    BDVLIB_ServiceWait([["services", "cloudera_scm_agent", NODE_GROUP_ID]])
    # make sure cloudera manager has received registration
    # for all new agents
    all_cloudera_hosts = get_hosts_for_service(
        ["services", "cloudera_scm_agent"])
    api = ApiResource(CM_HOST, username=ADMIN_USER, password=ADMIN_PASS)
    while True:
        current_all_hosts = map(lambda x: x.hostname, api.get_all_hosts())
        setup_logger.info("Currently registered hosts with CM " +
                          str(current_all_hosts))
        if all(x in current_all_hosts for x in all_cloudera_hosts):
            break
        setup_logger.info(
            "waiting for new nodes to register with cloudera manager")
        time.sleep(10)
    manager = api.get_cloudera_manager()
    manager.update_config(CM_CONFIG)
    cluster = api.create_cluster(CLUSTER_NAME, CDH_MAJOR_VERSION,
                                 CDH_FULL_VERSION)
    cluster.add_hosts(ALL_HOSTS)
    # turn off host swap alerting
    hosts_swap_alert_off(api)
    setup_logger.info("Setting Up SPARK2 Repo....")
    add_spark2_repo(api)
    # set java home
    setup_logger.info("Setting Up Java Path....")
    hosts_set_javahome(api)
    return (cluster, manager)
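# The two init_cluster() variants above rely on BlueData bdvcli-style helpers
# (BDVLIB_ServiceWait, get_hosts_for_service, ConfigMeta) that are not shown here.
# Below is a minimal sketch of get_hosts_for_service, modeled on the
# ConfigMeta.getWithTokens() usage above; the key layout and return shape are
# assumptions for illustration, not the actual bdvcli API.
def get_hosts_for_service(service_key):
    # Hypothetical: assumes the metadata tree lists nodegroups under the service
    # key and exposes an 'fqdns' list per nodegroup; adjust to the real layout.
    hosts = []
    for nodegroup in ConfigMeta.getWithTokens(service_key):
        hosts.extend(
            ConfigMeta.getWithTokens(service_key + [nodegroup, "fqdns"]))
    return hosts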
logging.info('Starting the Cloudera Manager service')
mgmt.start().wait()

# Update the Parcels repo
logging.info('Updating the remote parcels repo')
cm_config = api.get_cloudera_manager().get_config(view='full')
repo_urls = cdh_parcel_repo + ',' + kafka_parcel_repo
api.get_cloudera_manager().update_config(
    {'REMOTE_PARCEL_REPO_URLS': repo_urls})
time.sleep(10)

# Download the CDH Parcel
logging.info('Downloading the CDH parcel')
cluster_name = 'Open Data Platform'
cluster = api.create_cluster(cluster_name, version='CDH5')
cluster.add_hosts(hosts)
cdh_parcel = cluster.get_parcel('CDH', cdh_parcel_version)
cdh_parcel.start_download()
while True:
    cdh_parcel = cluster.get_parcel('CDH', cdh_parcel_version)
    if cdh_parcel.stage == 'DOWNLOADED':
        break
    if cdh_parcel.state.errors:
        raise Exception(str(cdh_parcel.state.errors))
    logging.info('Parcel download progress: %s / %s',
                 cdh_parcel.state.progress, cdh_parcel.state.totalProgress)
    time.sleep(15)  # check again in 15 seconds
logging.info('Downloaded CDH parcel version %s on cluster %s',
             cdh_parcel_version, cluster_name)
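# The snippet above stops after the download, but a downloaded parcel is not yet
# usable on the hosts. A sketch of the follow-on distribute and activate steps,
# using the same cm_api parcel calls (start_distribution, activate) and polling
# pattern that appear in the Deploy class further below.
cdh_parcel.start_distribution()
while True:
    cdh_parcel = cluster.get_parcel('CDH', cdh_parcel_version)
    if cdh_parcel.stage == 'DISTRIBUTED':
        break
    if cdh_parcel.state.errors:
        raise Exception(str(cdh_parcel.state.errors))
    time.sleep(15)

cdh_parcel.activate()
while cluster.get_parcel('CDH', cdh_parcel_version).stage != 'ACTIVATED':
    time.sleep(15)
logging.info('Activated CDH parcel version %s', cdh_parcel_version)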
class Deploy:
    def __init__(self,
                 cm_port='7180',
                 cm_user='******',
                 cm_passwd='admin',
                 cluster_name='cluster1'):
        self.cluster_name = cluster_name
        self.cdh_version = "CDH5"
        self.cfg = ParseConfig()
        self.host_list = self.cfg.get_hosts()
        self._get_host_allocate()
        self.cm_host = self.host_list[0]
        self.api = ApiResource(self.cm_host, cm_port, cm_user, cm_passwd,
                               version=7)
        self.cm = self.api.get_cloudera_manager()

        try:
            self.cluster = self.api.get_cluster(self.cluster_name)
        except:
            try:
                self.cluster = self.api.create_cluster(self.cluster_name,
                                                       self.cdh_version)
            except:
                err('Cannot connect to cloudera manager on %s' % self.cm_host)

        # add all our hosts to the cluster
        try:
            self.cluster.add_hosts(self.host_list)
            info('Add hosts successfully')
        except Exception as e:
            if e.code == 400:
                info('Already added hosts')
            elif e.code == 404:
                err(e.message)

    def _auto_allocate(self, hosts):
        # enable mgmt node if node count is larger than mgmt_th
        mgmt_th = 6
        if type(hosts) != list:
            err('hosts parameter should be a list')
        host_num = len(hosts)
        # node<=3: ZK=1, node>3: ZK=3
        zk_num = 1 if host_num <= 3 else 3

        # with mgmt node
        if host_num >= mgmt_th:
            self.ap_host = self.es_host = self.ho_host = self.sm_host = \
                self.nn_host = self.hm_host = self.jt_host = hosts[0]
            self.dn_hosts = self.rs_hosts = self.tt_hosts = hosts[1:]
            self.snn_host = hosts[1]
            self.hms_host = hosts[2]
            self.hs2_host = hosts[3]
        # without mgmt node
        else:
            if host_num == 1:
                self.ap_host = self.es_host = self.ho_host = self.sm_host = \
                    self.jt_host = self.nn_host = self.hm_host = \
                    self.snn_host = self.hms_host = self.hs2_host = hosts[0]
            elif host_num > 1:
                # nn, snn not on same node
                tmp_hosts = hosts[:]
                self.nn_host = choice(tmp_hosts)
                tmp_hosts.remove(self.nn_host)
                self.snn_host = choice(tmp_hosts)
                self.hm_host = choice(tmp_hosts)
                self.jt_host = choice(hosts)
                self.hms_host = choice(hosts)
                self.hs2_host = choice(hosts)
                # cm
                self.ap_host = choice(hosts)
                self.es_host = choice(hosts)
                self.ho_host = choice(hosts)
                self.sm_host = choice(hosts)
            self.dn_hosts = self.rs_hosts = self.tt_hosts = hosts
        self.zk_hosts = hosts[-zk_num:]

    def _get_host_allocate(self):
        roles = self.cfg.get_roles()
        # auto set if no role config found
        if not roles:
            self._auto_allocate(self.host_list)
            return

        valid_roles = [
            'DN', 'RS', 'ZK', 'HM', 'NN', 'SNN', 'AP', 'ES', 'SM', 'HO',
            'TT', 'JT', 'HMS', 'HS2'
        ]
        role_host = defaultdict(list)
        for item in roles:
            for role in item[1]:
                role = role.strip()
                if role not in valid_roles:
                    err('Incorrect role config')
                role_host[role].append(item[0])

        # cdh
        self.nn_host = role_host['NN'][0]
        self.snn_host = role_host['SNN'][0]
        self.hm_host = role_host['HM'][0]
        self.jt_host = role_host['JT'][0]
        self.hms_host = role_host['HMS'][0]
        self.hs2_host = role_host['HS2'][0]
        self.tt_hosts = role_host['TT']
        self.zk_hosts = role_host['ZK']
        self.dn_hosts = role_host['DN']
        self.rs_hosts = role_host['RS']
        # cm
        self.ap_host = role_host['AP'][0]
        self.es_host = role_host['ES'][0]
        self.ho_host = role_host['HO'][0]
        self.sm_host = role_host['SM'][0]

    def setup_cms(self):
        try:
            self.cm.delete_mgmt_service()
        except:
            pass

        # create the management service
        try:
            mgmt = self.cm.create_mgmt_service(ApiServiceSetupInfo())
            mgmt.create_role('AlertPublisher', "ALERTPUBLISHER", self.ap_host)
            mgmt.create_role('EventServer', "EVENTSERVER", self.es_host)
            mgmt.create_role('HostMonitor', "HOSTMONITOR", self.ho_host)
            mgmt.create_role('ServiceMonitor', "SERVICEMONITOR", self.sm_host)
            ok('Cloudera management service created successfully.')
        except ApiException:
            info('Cloudera management service had already been created.')

    def setup_parcel(self):
        parcels_list = []
        i = 1
        for p in self.cluster.get_all_parcels():
            if p.stage == 'AVAILABLE_REMOTELY':
                continue
            elif p.stage == 'ACTIVATED':
                info('Parcel [%s] has already been activated' % p.version)
                return
            else:
                print '\t' + str(i) + ': ' + p.product + ' ' + p.version
                i += 1
                parcels_list.append(p)

        if len(parcels_list) == 0:
            err('No downloaded ' + self.cdh_version + ' parcel found!')
        elif len(parcels_list) > 1:
            index = raw_input('Input parcel number:')
            if not index.isdigit():
                err('Error index, must be a number')
            cdh_parcel = parcels_list[int(index) - 1]
        else:
            cdh_parcel = parcels_list[0]

        # # download the parcel
        # print "Starting parcel download. This might take a while."
        # cmd = cdh_parcel.start_download()
        # if cmd.success != True:
        #     print "Parcel download failed!"
        #     exit(0)
        # # make sure the download finishes
        # while cdh_parcel.stage != 'DOWNLOADED':
        #     sleep(5)
        #     cdh_parcel = self.cluster.get_parcel(cdh_parcel.product, cdh_parcel.version)
        # print cdh_parcel.product + ' ' + cdh_parcel.version + " downloaded"

        # distribute the parcel
        info('Starting parcel distribution. This might take a while.')
        cmd = cdh_parcel.start_distribution()
        i = 0
        while cmd.success == None:
            i += 1
            sleep(5)
            cmd = cmd.fetch()
            s = '.' * i
            print '\r%s' % s,
            sys.stdout.flush()
        if cmd.success != True:
            err('Parcel distribution failed!')

        # make sure the distribution finishes
        while cdh_parcel.stage != "DISTRIBUTED":
            sleep(5)
            cdh_parcel = self.cluster.get_parcel(cdh_parcel.product,
                                                 cdh_parcel.version)
        ok(cdh_parcel.product + ' ' + cdh_parcel.version + ' distributed')

        # activate the parcel
        cmd = cdh_parcel.activate()
        if cmd.success != True:
            err('Parcel activation failed!')

        # make sure the activation finishes
        while cdh_parcel.stage != "ACTIVATED":
            sleep(5)
            cdh_parcel = self.cluster.get_parcel(cdh_parcel.product,
                                                 cdh_parcel.version)
        ok(cdh_parcel.product + ' ' + cdh_parcel.version + ' activated')

    def _create_service(self, sdata):
        try:
            self.cluster.get_service(sdata['sname'])
            info('Service %s had already been configured' % sdata['sname'])
        except ApiException:
            service = self.cluster.create_service(sdata['sname'],
                                                  sdata['stype'])
            ok('Service %s had been created successfully' % sdata['sname'])
            for role in sdata['roles']:
                if role.has_key('rhost'):
                    service.create_role(role['rname'], role['rtype'],
                                        role['rhost'])
                elif role.has_key('rhosts'):
                    rid = 0
                    for host in role['rhosts']:
                        rid += 1
                        service.create_role(role['rname'] + '-' + str(rid),
                                            role['rtype'], host)

    def setup_cdh(self):
        service_data = [{
            'sname': 'hdfs',
            'stype': 'HDFS',
            'roles': [{'rname': 'hdfs-namenode', 'rtype': 'NAMENODE',
                       'rhost': self.nn_host},
                      {'rname': 'hdfs-secondarynamenode',
                       'rtype': 'SECONDARYNAMENODE', 'rhost': self.snn_host},
                      {'rname': 'hdfs-datanode', 'rtype': 'DATANODE',
                       'rhosts': self.dn_hosts}]
        }, {
            'sname': 'zookeeper',
            'stype': 'ZOOKEEPER',
            'roles': [{'rname': 'zookeeper', 'rtype': 'SERVER',
                       'rhosts': self.zk_hosts}]
        }, {
            'sname': 'hbase',
            'stype': 'HBASE',
            'roles': [{'rname': 'hbase-master', 'rtype': 'MASTER',
                       'rhost': self.hm_host},
                      {'rname': 'hdfs-regionserver', 'rtype': 'REGIONSERVER',
                       'rhosts': self.rs_hosts}]
        }, {
            'sname': 'hive',
            'stype': 'HIVE',
            'roles': [{'rname': 'hive-metastore', 'rtype': 'HIVEMETASTORE',
                       'rhost': self.hms_host},
                      {'rname': 'hive-server2', 'rtype': 'HIVESERVER2',
                       'rhost': self.hs2_host},
                      {'rname': 'hive-gateway', 'rtype': 'GATEWAY',
                       'rhosts': self.dn_hosts}]
        }, {
            'sname': 'mapreduce',
            'stype': 'MAPREDUCE',
            'roles': [{'rname': 'mapreduce-jobtracker', 'rtype': 'JOBTRACKER',
                       'rhost': self.jt_host},
                      {'rname': 'mapreduce-tasktracker', 'rtype': 'TASKTRACKER',
                       'rhosts': self.tt_hosts}]
        }]

        for sdata in service_data:
            self._create_service(sdata)

        # additional config for hive
        try:
            hive_service = self.cluster.get_service('hive')
            hive_metastore_host = self.cm_host  # should be same as cm's host, FQDN
            hive_metastore_name = 'hive'
            hive_metastore_password = '******'
            hive_metastore_database_port = '7432'
            hive_metastore_database_type = 'postgresql'
            hive_config = {
                'hive_metastore_database_host': hive_metastore_host,
                'hive_metastore_database_name': hive_metastore_name,
                'hive_metastore_database_password': hive_metastore_password,
                'hive_metastore_database_port': hive_metastore_database_port,
                'hive_metastore_database_type': hive_metastore_database_type
            }
            hive_service.update_config(hive_config)
            ok('Additional hive configs had been updated')
        except ApiException as e:
            err(e.message)

        # use auto configure for *-site.xml configs
        try:
            self.cluster.auto_configure()
        except ApiException as e:
            err(e.message)

    def start_cms(self):
        # start the management service
        info('Starting cloudera management service...')
        cms = self.cm.get_service()
        cms.start().wait()
        ok('Cloudera management service started successfully')

    def start_cdh(self):
        info('Executing first run command. This might take a while.')
        cmd = self.cluster.first_run()
        while cmd.success == None:
            cmd = cmd.fetch()
            sleep(1)
        if cmd.success != True:
            err('The first run command failed: ' + cmd.resultMessage)
        ok('First run successfully executed. Your cluster has been set up!')
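# A minimal driver sketch for the Deploy class above, assuming ParseConfig()
# can read the host/role configuration; the credentials and cluster name are
# placeholders. The call order mirrors the class's own methods.
if __name__ == '__main__':
    deploy = Deploy(cm_user='admin', cm_passwd='admin',
                    cluster_name='cluster1')
    deploy.setup_cms()      # Cloudera Management Service roles
    deploy.setup_parcel()   # distribute + activate a downloaded CDH parcel
    deploy.setup_cdh()      # HDFS, ZooKeeper, HBase, Hive, MapReduce
    deploy.start_cms()
    deploy.start_cdh()      # cluster first_run()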
class ClouderaManager(object):
    """
    The complete orchestration of a cluster from start to finish, assuming all
    the hosts are configured and Cloudera Manager is installed with all the
    required databases set up.

    Handles all the steps required in creating a cluster. All the functions are
    built to be idempotent, so you should be able to resume from any failed
    step by re-running __class__.setup().
    """

    def __init__(self, module, config, trial=False, license_txt=None):
        self.api = ApiResource(config['cm']['host'],
                               username=config['cm']['username'],
                               password=config['cm']['password'])
        self.manager = self.api.get_cloudera_manager()
        self.config = config
        self.module = module
        self.trial = trial
        self.license_txt = license_txt
        self.cluster = None

    def enable_license(self):
        """
        Enable the requested license: either trial mode is started or a full
        license is entered and registered.
        """
        try:
            _license = self.manager.get_license()
        except ApiException:
            print_json(type="LICENSE", msg="Enabling license")
            if self.trial:
                self.manager.begin_trial()
            else:
                if self.license_txt is not None:
                    self.manager.update_license(self.license_txt)
                else:
                    fail(self.module,
                         'License should be provided or trial should be specified')

            try:
                _license = self.manager.get_license()
            except ApiException:
                fail(self.module, 'Failed enabling license')

        print_json(type="LICENSE",
                   msg="Owner: {}, UUID: {}".format(_license.owner,
                                                    _license.uuid))

    def create_cluster(self):
        """
        Create a cluster and add hosts to it. A new cluster is only created if
        another one doesn't already exist with the same name.
        """
        print_json(type="CLUSTER", msg="Creating cluster")
        cluster_config = self.config['cluster']
        try:
            self.cluster = self.api.get_cluster(cluster_config['name'])
        except ApiException:
            print_json(type="CLUSTER",
                       msg="Creating Cluster entity: {}".format(
                           cluster_config['name']))
            self.cluster = self.api.create_cluster(
                cluster_config['name'], cluster_config['version'],
                cluster_config['fullVersion'])

        cluster_hosts = [
            self.api.get_host(host.hostId).hostname
            for host in self.cluster.list_hosts()
        ]
        hosts = []
        for host in cluster_config['hosts']:
            if host not in cluster_hosts:
                hosts.append(host)
        self.cluster.add_hosts(hosts)

    def activate_parcels(self):
        print_json(type="PARCELS", msg="Setting up parcels")
        for parcel_cfg in self.config['parcels']:
            parcel = Parcels(self.module, self.manager, self.cluster,
                             parcel_cfg.get('version'), parcel_cfg.get('repo'),
                             parcel_cfg.get('product', 'CDH'))
            parcel.download()
            parcel.distribute()
            parcel.activate()

    @retry(attempts=20, delay=5)
    def wait_inspect_hosts(self, cmd):
        """
        Inspect all the hosts. Basically wait till the check completes on all
        hosts.
        :param cmd: A command instance used for tracking the status of the command
        """
        print_json(type="HOSTS", msg="Inspecting hosts")
        cmd = cmd.fetch()
        if cmd.success is None:
            raise ApiException("Waiting on command {} to finish".format(cmd))
        elif not cmd.success:
            if (cmd.resultMessage is not None and
                    'is not currently available for execution' in cmd.resultMessage):
                raise ApiException('Retry Command')
            fail(self.module, 'Host inspection failed')
        print_json(type="HOSTS",
                   msg="Host inspection completed: {}".format(cmd.resultMessage))

    def deploy_mgmt_services(self):
        """
        Configure, deploy and start all the Cloudera Management Services.
        """
        print_json(type="MGMT", msg="Deploying Management Services")
        try:
            mgmt = self.manager.get_service()
            if mgmt.serviceState == 'STARTED':
                return
        except ApiException:
            print_json(type="MGMT",
                       msg="Management Services don't exist. Creating.")
            mgmt = self.manager.create_mgmt_service(ApiServiceSetupInfo())

        for role in self.config['services']['MGMT']['roles']:
            if not len(mgmt.get_roles_by_type(role['group'])) > 0:
                print_json(type="MGMT",
                           msg="Creating role for {}".format(role['group']))
                mgmt.create_role('{}-1'.format(role['group']), role['group'],
                                 role['hosts'][0])

        for role in self.config['services']['MGMT']['roles']:
            role_group = mgmt.get_role_config_group(
                'mgmt-{}-BASE'.format(role['group']))
            role_group.update_config(role.get('config', {}))

        mgmt.start().wait()
        if self.manager.get_service().serviceState == 'STARTED':
            print_json(type="MGMT", msg="Management Services started")
        else:
            fail(self.module,
                 "[MGMT] Cloudera Management services didn't start up properly")

    def service_orchestrate(self, services):
        """
        Create and pre-configure the provided list of services, stop/start
        those services, and perform post-startup actions.
        :param services: List of services to perform service-specific actions on
        """
        service_classes = []

        # Create and pre-configure provided services
        for service in services:
            service_config = self.config['services'].get(service.upper())
            if service_config:
                svc = getattr(sys.modules[__name__], service)(self.cluster,
                                                              service_config)
                if not svc.started:
                    svc.deploy()
                    svc.pre_start()
                service_classes.append(svc)

        print_json(type="CLUSTER",
                   msg="Starting services: {} on Cluster".format(services))

        # Deploy all the client configs, since some of the services depend on
        # other services and it is essential that the client configs are in place
        self.cluster.deploy_client_config()

        # Start each service and run the post_start actions for each service
        for svc in service_classes:
            # Only go thru the steps if the service is not yet started. This
            # helps with re-running the script after fixing errors
            if not svc.started:
                svc.start()
                svc.post_start()

    def setup(self):
        # TODO(rnirmal): Cloudera Manager SSL?

        # Enable a full license or start a trial
        self.enable_license()

        # Create the cluster entity and associate hosts
        self.create_cluster()

        # Download and activate the parcels
        self.activate_parcels()

        # Inspect all the hosts
        self.wait_inspect_hosts(self.manager.inspect_hosts())

        # Create Management services
        self.deploy_mgmt_services()

        # Configure and start base services
        self.service_orchestrate(BASE_SERVICES)

        # Configure and start remaining services
        self.service_orchestrate(ADDITIONAL_SERVICES)
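# A sketch of the config dictionary shape this class expects and how setup()
# would be driven, inferred only from the attribute accesses above
# (config['cm'], config['cluster'], config['parcels'], config['services']);
# all concrete hostnames, versions and URLs are placeholders.
config = {
    'cm': {'host': 'cm.example.com', 'username': 'admin', 'password': 'admin'},
    'cluster': {
        'name': 'cluster1',
        'version': 'CDH5',
        'fullVersion': '5.8.2',
        'hosts': ['node1.example.com', 'node2.example.com'],
    },
    'parcels': [{'product': 'CDH', 'version': '5.8.2-1.cdh5.8.2.p0.3',
                 'repo': 'http://archive.cloudera.com/cdh5/parcels/5.8.2/'}],
    'services': {'MGMT': {'roles': [{'group': 'HOSTMONITOR',
                                     'hosts': ['cm.example.com']}]}},
}

# 'module' here stands for the surrounding AnsibleModule (or any object that
# the fail() helper accepts).
ClouderaManager(module, config, trial=True).setup()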
def main(): module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'}) for argument in MODULE_ARGUMENTS)) api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS, version=9) cluster_name = CLUSTER_NAME manager = api.get_cloudera_manager() action_a = module.params.get('action', None) if action_a == 'create_cluster': license_a = module.params.get('license', None) version_a = module.params.get('version', None) cluster_list = [x.name for x in api.get_all_clusters()] if cluster_name in cluster_list: module.exit_json(changed=False, msg='Cluster exists') else: cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a) if license_a == None: manager.begin_trial() else: manager.update_license(license_a.decode('base64')) module.exit_json(changed=True, msg='Cluster created') elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel', 'deploy_hdfs_base', 'deploy_hdfs_httpfs', 'deploy_hdfs_dn', 'deploy_hdfs_ha', 'deploy_rm_ha', 'set_config', 'service', 'deploy_service', 'deploy_service_worker_nodes', 'deploy_base_roles', 'run_command', 'cluster', 'create_snapshot_policy', 'deploy_configuration']: # more complicated actions that need a created cluster go here cluster = api.get_cluster(cluster_name) host_map = dict((api.get_host(x.hostId).hostname, x.hostId) for x in cluster.list_hosts()) # adds a host to the cluster # host_name should be in the internal DNS format, ip-xx-xx-xx.copute.internal if action_a == 'add_host': host_a = module.params.get('host', None) host_list = host_map.keys() if host_a in host_list: module.exit_json(changed=False, msg='Host already in cluster') else: try: cluster.add_hosts([host_a]) except ApiException: # if a host isn't there, it could be because the agent didn't manage to connect yet # so let's wait a moment for it sleep(120) cluster.add_hosts([host_a]) module.exit_json(changed=True, msg='Host added') # create management service and set it's basic configuration # this needs a separate function since management is handled # differently than the rest of services elif action_a == 'create_mgmt': host_a = module.params.get('host', None) # getting the management service is the only way to check if mgmt exists # an exception means there isn't one try: mgmt = manager.get_service() module.exit_json(changed=False, msg='Mgmt service already exists') except ApiException: pass mgmt = manager.create_mgmt_service(ApiServiceSetupInfo()) # this is ugly... and I see no good way to unuglify it firehose_passwd = Popen("sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") reports_passwd = Popen("sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") # since there is no easy way of configuring the manager... 
let's do it here :( role_conf = defaultdict(dict) role_conf['ACTIVITYMONITOR'] = { 'firehose_database_host': '{0}:7432'.format(host_a), 'firehose_database_user': '******', 'firehose_database_password': firehose_passwd, 'firehose_database_type': 'postgresql', 'firehose_database_name': 'amon', 'firehose_heapsize': '268435456', } role_conf['EVENTSERVER'] = { 'event_server_heapsize': '215964392' } role_conf['REPORTSMANAGER'] = { 'headlamp_database_host': '{0}:7432'.format(host_a), 'headlamp_database_user': '******', 'headlamp_database_password': reports_passwd, 'headlamp_database_type': 'postgresql', 'headlamp_database_name': 'rman', 'headlamp_heapsize': '268435456', } roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER', 'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER'] # create mangement roles for role in roles: mgmt.create_role('{0}-1'.format(role), role, host_map[host_a]) # update configuration of each for group in mgmt.get_all_role_config_groups(): group.update_config(role_conf[group.roleType]) mgmt.start().wait() # after starting this service needs time to spin up sleep(30) module.exit_json(changed=True, msg='Mgmt created and started') # deploy a given parcel on all hosts in the cluster # you can specify a substring of the version ending with latest, for example 5.3-latest instead of 5.3.5-1.cdh5.3.5.p0.4 elif action_a == 'deploy_parcel': name_a = module.params.get('name', None) version_a = module.params.get('version', None) if "latest" in version_a: available_versions = [x.version for x in cluster.get_all_parcels() if x.product == name_a] if "-latest" in version_a: version_substr = match('(.+?)-latest', version_a).group(1) # if version is just "latest", try to check everything else: version_substr = ".*" try: [version_parcel] = [x for x in available_versions if re.match(version_substr, x) != None] except ValueError: module.fail_json(msg='Specified version {0} doesnt appear in {1} or appears twice'.format(version_substr, available_versions)) else: version_parcel = version_a # we now go through various stages of getting the parcel # as there is no built-in way of waiting for an operation to complete # we use loops with sleep to get it done parcel = cluster.get_parcel(name_a, version_parcel) if parcel.stage == 'AVAILABLE_REMOTELY': parcel.start_download() while parcel.stage != 'DOWNLOADED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) sleep(10) if parcel.stage == 'DOWNLOADED': parcel.start_distribution() while parcel.stage != 'DISTRIBUTED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) # sleep while hosts report problems after the download for i in range(12): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break # since parcels are distributed automatically when a new host is added to a cluster # we can encounter the ,,ACTIVATING'' stage then if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING': if parcel.stage == 'DISTRIBUTED': parcel.activate() while parcel.stage != 'ACTIVATED': parcel = cluster.get_parcel(name_a, version_parcel) # this sleep has to be large because although the operation is very fast # it makes the management and cloudera hosts go bonkers, failing all of the health checks sleep(10) # sleep while hosts report problems after the distribution for i in range(60): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break 
module.exit_json(changed=True, msg='Parcel activated') if parcel.stage == 'ACTIVATED': module.exit_json(changed=False, msg='Parcel already activated') # if we get down here, something is not right module.fail_json(msg='Invalid parcel state') # deploy nodes for workers, according to SERVICE_WORKER_MAP # also give them sane names and init zookeeper and kafka ones # which need id's specified elif action_a == 'deploy_service_worker_nodes': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] role_name = SERVICE_WORKER_MAP[service_a]['name'] full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring'] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) nodes = [x for x in service.get_all_roles() if role_name in x.name] # if host already has the given group, we should skip it if host_map[host_a] in [x.hostRef.hostId for x in nodes]: module.exit_json(changed=False, msg='Host already is a {0}'.format(role_name)) # find out the highest id that currently exists else: node_names = [x.name for x in nodes] if len(node_names) == 0: # if no nodes, start numbering from 1 node_i = 1 else: # take the max number and add 1 to it node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1 if service_name == 'ZOOKEEPER': role = service.create_role(full_role_name.format(node_i), 'SERVER', host_a) # zookeeper needs a per-node ID in the configuration, so we set it now role.update_config({'serverId': node_i}) elif service_name == 'KAFKA': role = service.create_role(full_role_name.format(node_i), role_name, host_a) # kafka needs a per-node ID in the configuration, so we set it now role.update_config({'broker.id': node_i}) else: service.create_role(full_role_name.format(node_i), role_name, host_a) module.exit_json(changed=True, msg='Added host to {0} role'.format(role_name)) # deploy a service. just create it, don't do anything more # this is needed maily when we have to set service properties before role deployment elif action_a == 'deploy_service': name_a = module.params.get('name', None) if not name_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(name_a)) service_name = SERVICE_MAP[name_a] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) module.exit_json(changed=True, msg='{0} service created'.format(service_name)) else: module.exit_json(changed=False, msg='{0} service already exists'.format(service_name)) # deploy the base hdfs roles (the namenode and secondary) # this doesn't create the service, as at least one datanode should already be added! 
# the format also requires certain properties to be set before we run it elif action_a == 'deploy_hdfs_base': nn_host_a = module.params.get('nn_host', None) sn_host_a = module.params.get('sn_host', None) changed = False hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't create a secondary namenode when: #- there is one that already exists #- there is a second namenode, which means we have HA and don't need a secondary if not 'HDFS-SECONDARYNAMENODE' in hdfs_roles and not 'HDFS-NAMENODE-2' in hdfs_roles: hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE', sn_host_a) changed = True # create a namenode and format it's FS # formating the namenode requires at least one datanode and secondary namenode already in the cluster! if not 'HDFS-NAMENODE' in hdfs_roles: hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a) for command in hdfs.format_hdfs('HDFS-NAMENODE'): if command.wait().success == False: module.fail_json(msg='Failed formating HDFS namenode with error: {0}'.format(command.resultMessage)) changed = True module.exit_json(changed=changed, msg='Created HDFS service & NN roles') # enable HttpFS for HDFS # HUE require this for support HA in HDFS elif action_a == 'deploy_hdfs_httpfs': host_a = module.params.get('host', None) hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't install second instance of HttpFS if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0: module.exit_json(changed=False, msg='HDFS HttpFS service already exists') hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a]) module.exit_json(changed=True, msg='HDFS HttpFS service created') # enable HA for HDFS # this deletes the secondary namenode and creates a second namenode in it's place # also, this spawns 3 journal node and 2 failover controller roles elif action_a == 'deploy_hdfs_ha': sn_host_a = module.params.get('sn_host', None) jn_dir_a = module.params.get('jn_dir', None) jn_names_a = [module.params.get('jn1_host', None), module.params.get('jn2_host', None), module.params.get('jn3_host', None)] hdfs = cluster.get_service('HDFS') # if there's a second namenode, this means we already have HA enabled if not 'HDFS-NAMENODE-2' in [x.name for x in hdfs.get_all_roles()]: # this is bad and I should feel bad # jns is a list of dictionaries, each dict passes the required journalnode parameters jns = [{'jnHostId': host_map[jn_name], 'jnEditsDir': jn_dir_a, 'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)} for i, jn_name in enumerate(jn_names_a)] # this call is so long because we set some predictable names for the sevices command = hdfs.enable_nn_ha('HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns, zk_service_name='ZOOKEEPER', active_fc_name='HDFS-FAILOVERCONTROLLER-1', standby_fc_name='HDFS-FAILOVERCONTROLLER-2', standby_name='HDFS-NAMENODE-2') children = command.wait().children for command_children in children: # The format command is expected to fail, since we already formated the namenode if command_children.name != 'Format' and command.success == False: module.fail_json(msg='Command {0} failed when enabling HDFS HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for HDFS service') else: module.exit_json(changed=False, msg='HDFS HA already enabled') # enable HA for YARN elif action_a == 'deploy_rm_ha': sn_host_a = module.params.get('sn_host', None) yarn = cluster.get_service('YARN') # if there are two roles matching to 
this name, this means HA for YARN is enabled if len([0 for x in yarn.get_all_roles() if match('^YARN-RESOURCEMANAGER.*$', x.name) != None]) == 1: command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER') children = command.wait().children for command_children in children: if command.success == False: module.fail_json(msg='Command {0} failed when enabling YARN HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for YARN service') else: module.exit_json(changed=False, msg='YARN HA already enabled') # deploy the base roles for a service, according to BASE_SERVICE_ROLE_MAP # after the deployments run commands specified in BASE_SERVICE_ROLE_MAP elif action_a == 'deploy_base_roles': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] changed = False if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) service_roles = [x.name for x in service.get_all_roles()] # create each service from the map for (role_name, cloudera_name) in BASE_SERVICE_ROLE_MAP[service_a].items(): # check if role already exists, script cant compare it directly # after enabling HA on YARN roles will have random strings in names if len([0 for x in service_roles if match(role_name, x) != None]) == 0: service.create_role(role_name, cloudera_name, host_a) changed = True # init commmands if role_name in SERVICE_INIT_COMMANDS.keys(): for command_to_run in SERVICE_INIT_COMMANDS[role_name]: # different handling of commands specified by name and # ones specified by an instance method if ismethod(command_to_run): command = command_to_run(service) else: command = service.service_command_by_name(command_to_run) if command.wait().success == False: module.fail_json(msg='Running {0} failed with {1}'.format(command_to_run, command.resultMessage)) if changed == True: module.exit_json(changed=True, msg='Created base roles for {0}'.format(service_name)) else: module.exit_json(changed=False, msg='{0} base roles already exist'.format(service_name)) # deploy configuration - it always return changed elif action_a == 'deploy_configuration': service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] service = cluster.get_service(service_name) # deploying client configuration command = service.deploy_client_config() if command.wait().success == False: module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage)) module.exit_json(changed=True, msg='Configuration deployed') # set config values for a given service/role elif action_a == 'set_config': entity_a = module.params.get('entity', None) service_a = module.params.get('service', None) role_a = module.params.get('role', None) name_a = module.params.get('name', None) value_a = module.params.get('value', None) if not service_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(service_a)) # since management is handled differently, it needs a different service if service_a == 'management': service = manager.get_service() elif service_a == 'cm': service = manager else: service = cluster.get_service(SERVICE_MAP[service_a]) # role and service configs are handled differently if entity_a == 'service': prev_config = service.get_config() curr_config = service.update_config({name_a: value_a}) if service_a == 'cm': prev_config = [prev_config] curr_config = 
[curr_config] module.exit_json(changed=(str(prev_config[0]) != str(curr_config[0])), msg='Config value for {0}: {1}'.format(name_a, curr_config[0][name_a])) elif entity_a == 'role': if not role_a in ROLE_MAP: module.fail_json(msg='Unknown role: {0}'.format(service)) role = service.get_role_config_group(ROLE_MAP[role_a]) prev_config = role.get_config() curr_config = role.update_config({name_a: value_a}) module.exit_json(changed=(str(prev_config) != str(curr_config)), msg='Config value for {0}: {1}'.format(name_a, curr_config[name_a])) else: module.fail_json(msg='Invalid entity, must be one of service, role') # handle service state # currently this only can start/restart a service elif action_a == 'service': state_a = module.params.get('state', None) service_a = module.params.get('service', None) try: if service_a == 'cm': service = manager.get_service() else: service = cluster.get_service(SERVICE_MAP[service_a]) except ApiException: module.fail_json(msg='Service does not exist') # when starting a service, we also deploy the client config for it if state_a == 'started': if service.serviceState == 'STARTED': module.exit_json(changed=False, msg='Service already running') method = service.start verb = "start" elif state_a == 'restarted': method = service.restart verb = "restart" try: command = service.deploy_client_config() if command.wait().success == False: module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage)) # since there is no way to check if a service handles client config deployments # we try our best and pass the exception if it doesn't except ApiException, AttributeError: pass method().wait() # we need to wait for cloudera checks to complete... # otherwise it will report as failing sleep(10) for i in range(24): sleep(10) service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': break service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': module.exit_json(changed=True, msg='Service {0} successful'.format(verb)) else: module.fail_json(msg='Service {0} failed'.format(verb)) # handle cluster # currently this only can restart elif action_a == 'cluster': state_a = module.params.get('state', None) if state_a == 'restarted': command = cluster.restart(redeploy_client_configuration=True) if command.wait().success == False: module.fail_json(msg='Cluster resart failed with {0}'.format(command.resultMessage)) else: module.exit_json(changed=True, msg='Cluster restart successful') # Snapshot policy # only create is supported elif action_a == 'create_snapshot_policy': name_a = module.params.get('name', None) value_a = module.params.get('value', None) service_a = module.params.get('service', None) service = cluster.get_service(SERVICE_MAP[service_a]) payload=loads(value_a) # checking if policy already exists. Exception is expected when configure for the first time. try: test = service.get_snapshot_policy(name_a) module.exit_json(changed=False, msg='Defined policy already exists') except ApiException: pass try: command = service.create_snapshot_policy(payload) module.exit_json(changed=True, msg='Snapshot policy was created.') except ApiException, AttributeError: module.fail_json(msg='ERROR in creating snapshot policy.')
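# The Ansible module above references several lookup tables and constants
# (MODULE_ARGUMENTS, ADMIN_USER/ADMIN_PASS, CLUSTER_NAME, SERVICE_MAP,
# SERVICE_WORKER_MAP, BASE_SERVICE_ROLE_MAP, SERVICE_INIT_COMMANDS, ROLE_MAP)
# that are defined elsewhere. A hypothetical fragment showing the shapes these
# structures would need, based only on how main() indexes them; every entry is
# a placeholder, not the project's actual mapping.
MODULE_ARGUMENTS = ['action', 'name', 'version', 'license', 'host', 'nn_host',
                    'sn_host', 'jn1_host', 'jn2_host', 'jn3_host', 'service',
                    'entity', 'role', 'state', 'value']
ADMIN_USER = 'admin'
ADMIN_PASS = 'admin'
CLUSTER_NAME = 'cluster'

# ansible argument value -> CM service name
SERVICE_MAP = {'zookeeper': 'ZOOKEEPER', 'hdfs': 'HDFS', 'yarn': 'YARN'}

# per-service worker role: role-name substring plus a format string for new roles
SERVICE_WORKER_MAP = {
    'zookeeper': {'name': 'SERVER', 'formatstring': 'ZOOKEEPER-SERVER-{0}'},
    'hdfs': {'name': 'DATANODE', 'formatstring': 'HDFS-DATANODE-{0}'},
}

# base (master) roles per service, and any init commands to run after creation
BASE_SERVICE_ROLE_MAP = {
    'yarn': {'YARN-RESOURCEMANAGER': 'RESOURCEMANAGER',
             'YARN-JOBHISTORY': 'JOBHISTORY'},
}
SERVICE_INIT_COMMANDS = {'YARN-JOBHISTORY': ['CreateHistoryDirCommand']}

# role config group names addressable via the set_config action
ROLE_MAP = {'datanode': 'HDFS-DATANODE-BASE'}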
def create_cluster(config_dict):
    config.read(['./conf/hadrian.ini', './conf/cluster_specs.ini',
                 './conf/cloudera-manager/cm.ini'])
    cm_cluster_name = config_grabber("Globals")['cm.cluster.name']
    cm_username = config_grabber("Globals")['cm.username']
    cm_password = config_grabber("Globals")['cm.password']
    cm_port = config_grabber("Globals")['cm.port']
    version = config_grabber('Globals')['cdh.cluster.version']
    cm_server = config_grabber(cm_cluster_name + '-en')['cm.server']

    # Grab all configuration files in the directory with the CM Cluster Name.
    for i in os.listdir('./conf/' + cm_cluster_name):
        config.read('./conf/' + cm_cluster_name + '/' + i)

    all_nodes = list()
    while (get_cm_status(cm_server + ':' + cm_port) != 200):
        print 'Waiting for CM Server to start... '
        time.sleep(15)

    api = ApiResource(cm_server, cm_port, cm_username, cm_password)

    # create cluster
    cluster = api.create_cluster(cm_cluster_name, version.upper())

    # Config CM
    print 'Applying any configuration changes to Cloudera Manager'
    cmanager = api.get_cloudera_manager()
    cmanager.update_config(config_grabber('cloudera-manager-updates'))

    planned_nodes = config_grabber(cm_cluster_name + '-en')['full.list'].split(',')
    for k, v in config_grabber(cm_cluster_name + '-dn').iteritems():
        for j in v.split(','):
            planned_nodes.append(j)

    # TODO make this smarter. Show which agents haven't checked in, and add the
    # option to continue without them.
    if len(api.get_all_hosts()) != len(planned_nodes):
        print 'Waiting for all agents to check into the CM Server before continuing.'
        while len(planned_nodes) > len(api.get_all_hosts()):
            print 'Waiting for the final set of CM Agent nodes to check in.'
            time.sleep(5)

    print 'Updating Rack configuration for data nodes.'
    all_hosts = list()
    for host in api.get_all_hosts():
        all_hosts.append(host.hostId)
        for k, v in config_grabber(cm_cluster_name + '-dn').iteritems():
            if host.hostname in v:
                print 'Setting host: ' + host.hostname + ' to rack /default/' + k
                host.set_rack_id('/default/' + k)

    print 'Adding all hosts to cluster.'
    cluster.add_hosts(all_hosts)

    # download CDH Parcels
    # TODO add some logic here to make the parcel list something that's read from
    # the hadrian.ini. This will allow support for other CDH packages, Search, etc.
    if config_grabber('Globals')['cdh.distribution.method'] == 'parcels':
        distribute_parcel(cluster, 'CDH',
                          config_grabber("Globals")['cdh.parcel.version'])

    if config_dict.get('hdfs_ha') == True:
        create_zookeeper_service(config_dict, cluster)

    create_hdfs_service(config_dict, cluster)

    cmd = cluster.deploy_client_config()
    if not cmd.wait(CMD_TIMEOUT).success:
        print 'Failed to deploy client configurations'
    else:
        print 'Client configuration deployment complete.'

    create_mapred_service(config_dict, cluster, cm_server)

    if config_dict.get('hbase') == True:
        if config_dict.get('hdfs_ha') == False:
            create_zookeeper_service(config_dict, cluster)
        create_hbase_service(config_dict, cluster)

    if config_dict.get('hive') == True:
        create_hive_service(config_dict, cluster)

    print 'Starting final client configuration deployment for all services.'
    cmd = cluster.deploy_client_config()
    if not cmd.wait(CMD_TIMEOUT).success:
        print 'Failed to deploy client configuration.'
    else:
        print 'Client configuration deployment complete. The cluster is all yours. Happy Hadooping.'
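# get_cm_status() is polled above but not shown. A minimal sketch, assuming it
# only needs the HTTP status code of the CM web UI root; the URL scheme,
# timeout and error handling are assumptions.
import urllib2

def get_cm_status(server_and_port):
    """Return the HTTP status of the Cloudera Manager web UI, or 0 if unreachable."""
    try:
        return urllib2.urlopen('http://' + server_and_port, timeout=10).getcode()
    except Exception:
        return 0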
def main(): module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'}) for argument in MODULE_ARGUMENTS)) api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS, version=10) cluster_name = CLUSTER_NAME manager = api.get_cloudera_manager() action_a = module.params.get('action', None) if action_a == 'create_cluster': license_a = module.params.get('license', None) version_a = module.params.get('version', None) cluster_list = [x.name for x in api.get_all_clusters()] if cluster_name in cluster_list: module.exit_json(changed=False, msg='Cluster exists') else: cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a) if license_a == None: manager.begin_trial() else: manager.update_license(license_a.decode('base64')) module.exit_json(changed=True, msg='Cluster created') elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel', 'deploy_hdfs_base', 'deploy_hdfs_httpfs', 'deploy_hdfs_dn', 'deploy_hdfs_ha', 'deploy_rm_ha', 'set_config', 'service', 'deploy_service', 'deploy_service_worker_nodes', 'deploy_base_roles', 'run_command', 'cluster','create_snapshot_policy']: # more complicated actions that need a created cluster go here cluster = api.get_cluster(cluster_name) host_map = dict((api.get_host(x.hostId).hostname, x.hostId) for x in cluster.list_hosts()) # adds a host to the cluster # host_name should be in the internal DNS format, ip-xx-xx-xx.copute.internal if action_a == 'add_host': host_a = module.params.get('host', None) host_list = host_map.keys() if host_a in host_list: module.exit_json(changed=False, msg='Host already in cluster') else: try: cluster.add_hosts([host_a]) except ApiException: # if a host isn't there, it could be because the agent didn't manage to connect yet # so let's wait a moment for it sleep(120) cluster.add_hosts([host_a]) module.exit_json(changed=True, msg='Host added') # create management service and set it's basic configuration # this needs a separate function since management is handled # differently than the rest of services elif action_a == 'create_mgmt': host_a = module.params.get('host', None) # getting the management service is the only way to check if mgmt exists # an exception means there isn't one try: mgmt = manager.get_service() module.exit_json(changed=False, msg='Mgmt service already exists') except ApiException: pass mgmt = manager.create_mgmt_service(ApiServiceSetupInfo()) # this is ugly... and I see no good way to unuglify it firehose_passwd = Popen("sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") reports_passwd = Popen("sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n") # since there is no easy way of configuring the manager... 
let's do it here :( role_conf = defaultdict(dict) role_conf['ACTIVITYMONITOR'] = { 'firehose_database_host': '{0}:7432'.format(host_a), 'firehose_database_user': '******', 'firehose_database_password': firehose_passwd, 'firehose_database_type': 'postgresql', 'firehose_database_name': 'amon', 'firehose_heapsize': '268435456', } role_conf['EVENTSERVER'] = { 'event_server_heapsize': '215964392' } role_conf['REPORTSMANAGER'] = { 'headlamp_database_host': '{0}:7432'.format(host_a), 'headlamp_database_user': '******', 'headlamp_database_password': reports_passwd, 'headlamp_database_type': 'postgresql', 'headlamp_database_name': 'rman', 'headlamp_heapsize': '215964392', } roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER', 'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER'] # create mangement roles for role in roles: mgmt.create_role('{0}-1'.format(role), role, host_map[host_a]) # update configuration of each for group in mgmt.get_all_role_config_groups(): group.update_config(role_conf[group.roleType]) mgmt.start().wait() # after starting this service needs time to spin up sleep(30) module.exit_json(changed=True, msg='Mgmt created and started') # deploy a given parcel on all hosts in the cluster # you can specify a substring of the version ending with latest, for example 5.3-latest instead of 5.3.5-1.cdh5.3.5.p0.4 elif action_a == 'deploy_parcel': name_a = module.params.get('name', None) version_a = module.params.get('version', None) if "latest" in version_a: available_versions = [x.version for x in cluster.get_all_parcels() if x.product == name_a] if "-latest" in version_a: version_substr = match('(.+?)-latest', version_a).group(1) # if version is just "latest", try to check everything else: version_substr = ".*" try: [version_parcel] = [x for x in available_versions if re.match(version_substr, x) != None] except ValueError: module.fail_json(msg='Specified version {0} doesnt appear in {1} or appears twice'.format(version_substr, available_versions)) else: version_parcel = version_a # we now go through various stages of getting the parcel # as there is no built-in way of waiting for an operation to complete # we use loops with sleep to get it done parcel = cluster.get_parcel(name_a, version_parcel) if parcel.stage == 'AVAILABLE_REMOTELY': parcel.start_download() while parcel.stage != 'DOWNLOADED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) sleep(10) if parcel.stage == 'DOWNLOADED': parcel.start_distribution() while parcel.stage != 'DISTRIBUTED': parcel = cluster.get_parcel(name_a, version_parcel) if parcel.state.errors: raise Exception(str(parcel.state.errors)) # sleep while hosts report problems after the download for i in range(12): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break # since parcels are distributed automatically when a new host is added to a cluster # we can encounter the ,,ACTIVATING'' stage then if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING': if parcel.stage == 'DISTRIBUTED': parcel.activate() while parcel.stage != 'ACTIVATED': parcel = cluster.get_parcel(name_a, version_parcel) # this sleep has to be large because although the operation is very fast # it makes the management and cloudera hosts go bonkers, failing all of the health checks sleep(10) # sleep while hosts report problems after the distribution for i in range(60): sleep(10) if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0: break 
module.exit_json(changed=True, msg='Parcel activated') if parcel.stage == 'ACTIVATED': module.exit_json(changed=False, msg='Parcel already activated') # if we get down here, something is not right module.fail_json(msg='Invalid parcel state') # deploy nodes for workers, according to SERVICE_WORKER_MAP # also give them sane names and init zookeeper and kafka ones # which need id's specified elif action_a == 'deploy_service_worker_nodes': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] role_name = SERVICE_WORKER_MAP[service_a]['name'] full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring'] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) nodes = [x for x in service.get_all_roles() if role_name in x.name] # if host already has the given group, we should skip it if host_map[host_a] in [x.hostRef.hostId for x in nodes]: module.exit_json(changed=False, msg='Host already is a {0}'.format(role_name)) # find out the highest id that currently exists else: node_names = [x.name for x in nodes] if len(node_names) == 0: # if no nodes, start numbering from 1 node_i = 1 else: # take the max number and add 1 to it node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1 if service_name == 'ZOOKEEPER': role = service.create_role(full_role_name.format(node_i), 'SERVER', host_a) # zookeeper needs a per-node ID in the configuration, so we set it now role.update_config({'serverId': node_i}) elif service_name == 'KAFKA': role = service.create_role(full_role_name.format(node_i), role_name, host_a) # kafka needs a per-node ID in the configuration, so we set it now role.update_config({'broker.id': node_i}) else: service.create_role(full_role_name.format(node_i), role_name, host_a) module.exit_json(changed=True, msg='Added host to {0} role'.format(role_name)) # deploy a service. just create it, don't do anything more # this is needed maily when we have to set service properties before role deployment elif action_a == 'deploy_service': name_a = module.params.get('name', None) if not name_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(name_a)) service_name = SERVICE_MAP[name_a] if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) module.exit_json(changed=True, msg='{0} service created'.format(service_name)) else: module.exit_json(changed=False, msg='{0} service already exists'.format(service_name)) # deploy the base hdfs roles (the namenode and secondary) # this doesn't create the service, as at least one datanode should already be added! 
# the format also requires certain properties to be set before we run it elif action_a == 'deploy_hdfs_base': nn_host_a = module.params.get('nn_host', None) sn_host_a = module.params.get('sn_host', None) changed = False hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't create a secondary namenode when: #- there is one that already exists #- there is a second namenode, which means we have HA and don't need a secondary if not 'HDFS-SECONDARYNAMENODE' in hdfs_roles and not 'HDFS-NAMENODE-2' in hdfs_roles: hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE', sn_host_a) changed = True # create a namenode and format it's FS # formating the namenode requires at least one datanode and secondary namenode already in the cluster! if not 'HDFS-NAMENODE' in hdfs_roles: hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a) for command in hdfs.format_hdfs('HDFS-NAMENODE'): if command.wait().success == False: module.fail_json(msg='Failed formating HDFS namenode with error: {0}'.format(command.resultMessage)) changed = True module.exit_json(changed=changed, msg='Created HDFS service & NN roles') # enable HttpFS for HDFS # HUE require this for support HA in HDFS elif action_a == 'deploy_hdfs_httpfs': host_a = module.params.get('host', None) hdfs = cluster.get_service('HDFS') hdfs_roles = [x.name for x in hdfs.get_all_roles()] # don't install second instance of HttpFS if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0: module.exit_json(changed=False, msg='HDFS HttpFS service already exists') hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a]) module.exit_json(changed=True, msg='HDFS HttpFS service created') # enable HA for HDFS # this deletes the secondary namenode and creates a second namenode in it's place # also, this spawns 3 journal node and 2 failover controller roles elif action_a == 'deploy_hdfs_ha': sn_host_a = module.params.get('sn_host', None) jn_names_a = [module.params.get('jn1_host', None), module.params.get('jn2_host', None), module.params.get('jn3_host', None)] hdfs = cluster.get_service('HDFS') # if there's a second namenode, this means we already have HA enabled if not 'HDFS-NAMENODE-2' in [x.name for x in hdfs.get_all_roles()]: # this is bad and I should feel bad # jns is a list of dictionaries, each dict passes the required journalnode parameters jns = [{'jnHostId': host_map[jn_name], 'jnEditsDir': '/data0/hadoop/journal', 'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)} for i, jn_name in enumerate(jn_names_a)] # this call is so long because we set some predictable names for the sevices command = hdfs.enable_nn_ha('HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns, zk_service_name='ZOOKEEPER', active_fc_name='HDFS-FAILOVERCONTROLLER-1', standby_fc_name='HDFS-FAILOVERCONTROLLER-2', standby_name='HDFS-NAMENODE-2') children = command.wait().children for command_children in children: # The format command is expected to fail, since we already formated the namenode if command_children.name != 'Format' and command.success == False: module.fail_json(msg='Command {0} failed when enabling HDFS HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for HDFS service') else: module.exit_json(changed=False, msg='HDFS HA already enabled') # enable HA for YARN elif action_a == 'deploy_rm_ha': sn_host_a = module.params.get('sn_host', None) yarn = cluster.get_service('YARN') # if there are two roles matching to this name, this means HA for 
YARN is enabled if len([0 for x in yarn.get_all_roles() if match('^YARN-RESOURCEMANAGER.*$', x.name) != None]) == 1: command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER') children = command.wait().children for command_children in children: if command.success == False: module.fail_json(msg='Command {0} failed when enabling YARN HA with error {1}'.format(command_children.name, command_children.resultMessage)) module.exit_json(changed=True, msg='Enabled HA for YARN service') else: module.exit_json(changed=False, msg='YARN HA already enabled') # deploy the base roles for a service, according to BASE_SERVICE_ROLE_MAP # after the deployments run commands specified in BASE_SERVICE_ROLE_MAP elif action_a == 'deploy_base_roles': host_a = module.params.get('host', None) service_a = module.params.get('service', None) service_name = SERVICE_MAP[service_a] changed = False if not service_name in [x.name for x in cluster.get_all_services()]: service = cluster.create_service(service_name, service_name) else: service = cluster.get_service(service_name) service_roles = [x.name for x in service.get_all_roles()] # create each service from the map for (role_name, cloudera_name) in BASE_SERVICE_ROLE_MAP[service_a].items(): # check if role already exists, script cant compare it directly # after enabling HA on YARN roles will have random strings in names if len([0 for x in service_roles if match(role_name, x) != None]) == 0: service.create_role(role_name, cloudera_name, host_a) changed = True # init commmands if role_name in SERVICE_INIT_COMMANDS.keys(): for command_to_run in SERVICE_INIT_COMMANDS[role_name]: # different handling of commands specified by name and # ones specified by an instance method if ismethod(command_to_run): command = command_to_run(service) else: command = service.service_command_by_name(command_to_run) if command.wait().success == False: module.fail_json(msg='Running {0} failed with {1}'.format(command_to_run, command.resultMessage)) if changed == True: module.exit_json(changed=True, msg='Created base roles for {0}'.format(service_name)) else: module.exit_json(changed=False, msg='{0} base roles already exist'.format(service_name)) # set config values for a given service/role elif action_a == 'set_config': entity_a = module.params.get('entity', None) service_a = module.params.get('service', None) role_a = module.params.get('role', None) name_a = module.params.get('name', None) value_a = module.params.get('value', None) if not service_a in SERVICE_MAP: module.fail_json(msg='Unknown service: {0}'.format(service_a)) # since management is handled differently, it needs a different service if service_a == 'management': service = manager.get_service() elif service_a == 'cm': service = manager else: service = cluster.get_service(SERVICE_MAP[service_a]) # role and service configs are handled differently if entity_a == 'service': prev_config = service.get_config() curr_config = service.update_config({name_a: value_a}) if service_a == 'cm': prev_config = [prev_config] curr_config = [curr_config] module.exit_json(changed=(str(prev_config[0]) != str(curr_config[0])), msg='Config value for {0}: {1}'.format(name_a, curr_config[0][name_a])) elif entity_a == 'role': if not role_a in ROLE_MAP: module.fail_json(msg='Unknown role: {0}'.format(service)) role = service.get_role_config_group(ROLE_MAP[role_a]) prev_config = role.get_config() curr_config = role.update_config({name_a: value_a}) module.exit_json(changed=(str(prev_config) != str(curr_config)), msg='Config value for {0}: 
{1}'.format(name_a, curr_config[name_a])) else: module.fail_json(msg='Invalid entity, must be one of service, role') # handle service state # currently this only can start/restart a service elif action_a == 'service': state_a = module.params.get('state', None) service_a = module.params.get('service', None) try: if service_a == 'cm': service = manager.get_service() else: service = cluster.get_service(SERVICE_MAP[service_a]) except ApiException: module.fail_json(msg='Service does not exist') # when starting a service, we also deploy the client config for it if state_a == 'started': if service.serviceState == 'STARTED': module.exit_json(changed=False, msg='Service already running') method = service.start verb = "start" elif state_a == 'restarted': method = service.restart verb = "restart" try: command = service.deploy_client_config() if command.wait().success == False: module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage)) # since there is no way to check if a service handles client config deployments # we try our best and pass the exception if it doesn't except ApiException, AttributeError: pass method().wait() # we need to wait for cloudera checks to complete... # otherwise it will report as failing sleep(10) for i in range(24): sleep(10) service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': break service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a]) if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD': module.exit_json(changed=True, msg='Service {0} successful'.format(verb)) else: module.fail_json(msg='Service {0} failed'.format(verb)) # handle cluster # currently this only can restart elif action_a == 'cluster': state_a = module.params.get('state', None) if state_a == 'restarted': command = cluster.restart(redeploy_client_configuration=True) if command.wait().success == False: module.fail_json(msg='Cluster resart failed with {0}'.format(command.resultMessage)) else: module.exit_json(changed=True, msg='Cluster restart successful') # Snapshot policy # only create is supported elif action_a == 'create_snapshot_policy': name_a = module.params.get('name', None) value_a = module.params.get('value', None) service_a = module.params.get('service', None) service = cluster.get_service(SERVICE_MAP[service_a]) payload=loads(value_a) # checking if policy already exists. Exception is expected when configure for the first time. try: test = service.get_snapshot_policy(name_a) module.exit_json(changed=False, msg='Defined policy already exists') except ApiException: pass try: command = service.create_snapshot_policy(payload) module.exit_json(changed=True, msg='Snapshot policy was created.') except ApiException, AttributeError: module.fail_json(msg='ERROR in creating snapshot policy.')
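The action handlers above look everything up through module-level tables (SERVICE_MAP, SERVICE_WORKER_MAP, BASE_SERVICE_ROLE_MAP, SERVICE_INIT_COMMANDS, ROLE_MAP) that are defined elsewhere in the module and are not part of this excerpt. A minimal sketch of the shape those tables are assumed to have, with purely illustrative entries:

# Illustrative only: the real module defines these maps elsewhere with its own entries.
# Maps the short service name used by the playbook to the Cloudera service name.
SERVICE_MAP = {
    'hdfs': 'HDFS',
    'yarn': 'YARN',
    'zookeeper': 'ZOOKEEPER',
    'kafka': 'KAFKA',
}

# Worker role per service: the Cloudera role type plus a format string used to
# build predictable role names such as HDFS-DATANODE-3.
SERVICE_WORKER_MAP = {
    'hdfs': {'name': 'DATANODE', 'formatstring': 'HDFS-DATANODE-{0}'},
    'yarn': {'name': 'NODEMANAGER', 'formatstring': 'YARN-NODEMANAGER-{0}'},
    'zookeeper': {'name': 'SERVER', 'formatstring': 'ZOOKEEPER-SERVER-{0}'},
    'kafka': {'name': 'KAFKA_BROKER', 'formatstring': 'KAFKA-BROKER-{0}'},
}

# Maps the short role name used with set_config to a role config group name.
ROLE_MAP = {
    'datanode': 'HDFS-DATANODE-BASE',
    'nodemanager': 'YARN-NODEMANAGER-BASE',
}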
# run host_install on all cluster hosts and poll the command until it finishes
cmd = manager.host_install(host_username, cluster_hosts, private_key=private_key_contents)
print "checking if host_install finished"
while cmd.active:
    sleep(5)
    print " ..."
    cmd = cmd.fetch()
if not cmd.success:
    print "host_install failed: " + cmd.resultMessage
    exit(1)  # exit non-zero so callers can tell the install failed
print "host_install successful: " + cmd.resultMessage

cluster = api.create_cluster(cluster_name, cdh_version)
all_hosts = api.get_all_hosts()
hostrefs = []
yarn_nodemanager_hostrefs = []
hdfs_datanode_hostrefs = []
zookeeper_server_hostrefs = []
hive_gateway_hostrefs = []
for host in all_hosts:
    if host.hostname == cm_management_host:
        cm_management_host_hostref = host.hostId
    if host.hostname == yarn_resourcemanager:
        yarn_resourcemanager_hostref = host.hostId
    if host.hostname == yarn_jobhistory:
from cm_api.api_client import ApiResource
import sys, time

api = ApiResource(sys.argv[1], 7180, "acm", "SCALE42secretly", version=15)
expected_hosts = sys.argv[2:]
expected_hosts_count = len(expected_hosts)
cluster = None
try:
    cluster = api.get_cluster(name="ACM Cluster")
except Exception, e:
    if e.message[-10:-1].lower() == "not found":
        # cluster not created yet -> create it
        cluster = api.create_cluster(name="ACM Cluster", version="CDH5")

while True:
    # make sure every commissioned host is part of the cluster
    print "Waiting for the <{0}> worker nodes of the cluster to be ready with their SCM-configured agent...".format(
        expected_hosts_count)
    hosts = api.get_all_hosts()
    # get the IP address of every commissioned host
    actual_host_ips = list(set([str(h.ipAddress) for h in hosts]))
    verify_hosts = [eh in actual_host_ips for eh in expected_hosts]
    if all(verify_hosts):
        current_host_ids = [h.hostId for h in hosts]
        # add the hosts to the cluster
        cluster.add_hosts(current_host_ids)
        break
    time.sleep(10)
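Run as, for example, python wait_for_agents.py cm-host.example.com 10.0.0.11 10.0.0.12 10.0.0.13 (the script name and addresses are illustrative): the first argument is the Cloudera Manager host, and every following argument is a worker IP address that must appear in get_all_hosts() before the hosts are added to the "ACM Cluster".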
class ClouderaManager(object): """ The complete orchestration of a cluster from start to finish assuming all the hosts are configured and Cloudera Manager is installed with all the required databases setup. Handle all the steps required in creating a cluster. All the functions are built to function idempotently. So you should be able to resume from any failed step but running thru the __class__.setup() """ def __init__(self, module, config, trial=False, license_txt=None): self.api = ApiResource(config['cm']['host'], username=config['cm']['username'], password=config['cm']['password']) self.manager = self.api.get_cloudera_manager() self.config = config self.module = module self.trial = trial self.license_txt = license_txt self.cluster = None def enable_license(self): """ Enable the requested license, either it's trial mode or a full license is entered and registered. """ try: _license = self.manager.get_license() except ApiException: print_json(type="LICENSE", msg="Enabling license") if self.trial: self.manager.begin_trial() else: if license_txt is not None: self.manager.update_license(license_txt) else: fail(self.module, 'License should be provided or trial should be specified') try: _license = self.manager.get_license() except ApiException: fail(self.module, 'Failed enabling license') print_json(type="LICENSE", msg="Owner: {}, UUID: {}".format(_license.owner, _license.uuid)) def create_cluster(self): """ Create a cluster and add hosts to the cluster. A new cluster is only created if another one doesn't exist with the same name. """ print_json(type="CLUSTER", msg="Creating cluster") cluster_config = self.config['cluster'] try: self.cluster = self.api.get_cluster(cluster_config['name']) except ApiException: print_json(type="CLUSTER", msg="Creating Cluster entity: {}".format(cluster_config['name'])) self.cluster = self.api.create_cluster(cluster_config['name'], cluster_config['version'], cluster_config['fullVersion']) cluster_hosts = [self.api.get_host(host.hostId).hostname for host in self.cluster.list_hosts()] hosts = [] for host in cluster_config['hosts']: if host not in cluster_hosts: hosts.append(host) self.cluster.add_hosts(hosts) def activate_parcels(self): print_json(type="PARCELS", msg="Setting up parcels") for parcel_cfg in self.config['parcels']: parcel = Parcels(self.module, self.manager, self.cluster, parcel_cfg.get('version'), parcel_cfg.get('repo'), parcel_cfg.get('product', 'CDH')) parcel.download() parcel.distribute() parcel.activate() @retry(attempts=20, delay=5) def wait_inspect_hosts(self, cmd): """ Inspect all the hosts. Basically wait till the check completes on all hosts. :param cmd: A command instance used for tracking the status of the command """ print_json(type="HOSTS", msg="Inspecting hosts") cmd = cmd.fetch() if cmd.success is None: raise ApiException("Waiting on command {} to finish".format(cmd)) elif not cmd.success: if (cmd.resultMessage is not None and 'is not currently available for execution' in cmd.resultMessage): raise ApiException('Retry Command') fail(self.module, 'Host inspection failed') print_json(type="HOSTS", msg="Host inspection completed: {}".format(cmd.resultMessage)) def deploy_mgmt_services(self): """ Configure, deploy and start all the Cloudera Management Services. """ print_json(type="MGMT", msg="Deploying Management Services") try: mgmt = self.manager.get_service() if mgmt.serviceState == 'STARTED': return except ApiException: print_json(type="MGMT", msg="Management Services don't exist. 
Creating.") mgmt = self.manager.create_mgmt_service(ApiServiceSetupInfo()) for role in config['services']['MGMT']['roles']: if not len(mgmt.get_roles_by_type(role['group'])) > 0: print_json(type="MGMT", msg="Creating role for {}".format(role['group'])) mgmt.create_role('{}-1'.format(role['group']), role['group'], role['hosts'][0]) for role in config['services']['MGMT']['roles']: role_group = mgmt.get_role_config_group('mgmt-{}-BASE'.format(role['group'])) role_group.update_config(role.get('config', {})) mgmt.start().wait() if self.manager.get_service().serviceState == 'STARTED': print_json(type="MGMT", msg="Management Services started") else: fail(self.module, "[MGMT] Cloudera Management services didn't start up properly") def service_orchestrate(self, services): """ Create, pre-configure provided list of services Stop/Start those services Perform and post service startup actions :param services: List of Services to perform service specific actions """ service_classes = [] # Create and pre-configure provided services for service in services: service_config = self.config['services'].get(service.upper()) if service_config: svc = getattr(sys.modules[__name__], service)(self.cluster, service_config) if not svc.started: svc.deploy() svc.pre_start() service_classes.append(svc) print_json(type="CLUSTER", msg="Starting services: {} on Cluster".format(services)) # Deploy all the client configs, since some of the services depend on other services # and is essential that the client configs are in place self.cluster.deploy_client_config() # Start each service and run the post_start actions for each service for svc in service_classes: # Only go thru the steps if the service is not yet started. This helps with # re-running the script after fixing errors if not svc.started: svc.start() svc.post_start() def setup(self): # TODO(rnirmal): Cloudera Manager SSL? # Enable a full license or start a trial self.enable_license() # Create the cluster entity and associate hosts self.create_cluster() # Download and activate the parcels self.activate_parcels() # Inspect all the hosts self.wait_inspect_hosts(self.manager.inspect_hosts()) # Create Management services self.deploy_mgmt_services() # Configure and Start base services self.service_orchestrate(BASE_SERVICES) # Configure and Start remaining services self.service_orchestrate(ADDITIONAL_SERVICES)
import time
from cm_api.api_client import ApiResource

cm_host = "cloudera-pe-cm01"
api = ApiResource(cm_host, username="******", password="******")

# Distribute the CDH parcel
parcel_repo = 'http://archive.cloudera.com/cdh5/parcels/5.2.0'
#parcel_repo = 'http://archive.cloudera.com/cdh5/parcels/5.1.3/'
cm_config = api.get_cloudera_manager().get_config(view='full')
repo_config = cm_config['REMOTE_PARCEL_REPO_URLS']
value = repo_config.value or repo_config.default
value += ',' + parcel_repo
api.get_cloudera_manager().update_config({'REMOTE_PARCEL_REPO_URLS': value})
time.sleep(10)

# create cluster, add the hosts
cluster = api.create_cluster("cloudera-pe-test", "CDH5")
#api.create_host("master", "ip-10-238-154-140", "10.238.154.140")
#api.create_host("w01", "ip-10-143-183-98", "10.143.183.98")
#api.create_host("w02", "ip-10-140-38-88", "10.140.38.88")
#api.create_host("w03", "ip-10-140-28-243", "10.140.28.243")
hosts = []  # host names to add to the new cluster
#hosts.append("master")
#hosts.append("w01")
#hosts.append("w02")
#hosts.append("w03")
hosts.append("ip-10-11-167-80")
hosts.append("ip-10-153-224-197")
hosts.append("ip-10-37-166-245")
hosts.append("ip-10-169-69-118")
cluster.add_hosts(hosts)

# Downloads and distributes parcels
def main(argv): CM_HOST = "localhost" CM_PORT = 7180 CM_USER = "******" CM_PASSWD = "admin" CMD_TIMEOUT = 180 #Configurations HDFS_CONF = { } NAMENODE_CONF = { 'dfs_name_dir_list': '/dfs/nn', 'dfs_namenode_servicerpc_address': 8022, 'namenode_java_heapsize': 154140672, } SECONDARY_CONF = { 'fs_checkpoint_dir_list': '/dfs/snn', 'secondary_namenode_java_heapsize': 154140672, } DATANODE_CONF = { 'dfs_data_dir_list': '/data/1/dfs/dn,/data/2/dfs/dn,/data/3/dfs/dn', 'dfs_datanode_handler_count': 10, 'dfs_datanode_du_reserved': 2180395417, 'dfs_datanode_max_locked_memory': 983564288, 'datanode_java_heapsize': 286261248, } YARN_CONF = { 'hdfs_service':'hdfs', } RSRCMAN_CONF = { 'resource_manager_java_heapsize': 154140672, 'yarn_scheduler_maximum_allocation_mb': 1513, 'yarn_scheduler_maximum_allocation_vcores': 2, } JOBHIST_CONF = { 'mr2_jobhistory_java_heapsize': 154140672, } NODEMAN_CONF = { 'yarn_nodemanager_local_dirs': '/yarn/nm', 'yarn_nodemanager_resource_cpu_vcores': 2, 'yarn_nodemanager_resource_memory_mb': 1513, } #Parser Options parser = OptionParser() parser.set_defaults(action='') parser.add_option("-a", "--add", action="store_const", const="add", dest="action", help="add the list of hosts to the named cluster") parser.add_option("-r", "--remove", action="store_const", const="remove", dest="action", help="remove the list of hosts from the named cluster") parser.add_option("-d", "--deploy", action="store_const", const="deploy", dest="action", help="deploy the list of hosts as a new cluster with the given name") parser.add_option("--delete", action="store_const", const="delete", dest="action", help="delete the named cluster") parser.add_option("--name", dest="name", help="declare the cluster name to be created or to interact with") parser.add_option("--hosts", dest="hosts", help="comma delimited list of hosts to be added/removed") (opts,args) = parser.parse_args() CLUSTER_NAME = opts.name if opts.hosts and len(opts.hosts) > 1: HOSTNAMES = opts.hosts.split(",") elif opts.hosts: HOSTNAMES = opts.hosts else: HOSTNAMES = '' ROLEHASH = [] if HOSTNAMES: for host in HOSTNAMES: ROLEHASH.append(hashlib.md5(host).hexdigest()) api = ApiResource(CM_HOST, CM_PORT, CM_USER, CM_PASSWD) #Deploy a new cluster if(opts.action == "deploy"): #Create Cluster print "Creating cluster..." cluster = api.create_cluster(CLUSTER_NAME, "CDH5") cluster.add_hosts(HOSTNAMES) #Create HDFS Service and Roles print "Creating HDFS Service and Roles..." hdfs = cluster.create_service("hdfs", "HDFS") namenode = hdfs.create_role("hdfs-NAMENODE-" + ROLEHASH[0], "NAMENODE", HOSTNAMES[0]) secnamenode = hdfs.create_role("hdfs-SECONDARYNAMENODE-" + ROLEHASH[0], "SECONDARYNAMENODE", HOSTNAMES[0]) for i in range(len(HOSTNAMES)-1): datanode = hdfs.create_role("hdfs-DATANODE-" + ROLEHASH[i+1], "DATANODE", HOSTNAMES[i+1]) #Configure HDFS print "Configuring HDFS..." hdfs.update_config(svc_config = HDFS_CONF) for roleGroup in hdfs.get_all_role_config_groups(): if roleGroup.roleType == "NAMENODE": roleGroup.update_config(NAMENODE_CONF) elif roleGroup.roleType == "SECONDARYNAMENODE": roleGroup.update_config(SECONDARY_CONF) elif roleGroup.roleType == "DATANODE": roleGroup.update_config(DATANODE_CONF) #Start HDFS #format_hdfs takes a list of NameNodes print "Formatting HDFS..." cmd = hdfs.format_hdfs('hdfs-NAMENODE-' + ROLEHASH[0])[0] if not cmd.wait(CMD_TIMEOUT).success: print "Failed to format HDFS" print "Starting HDFS..." 
cmd = hdfs.start() if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to start HDFS") cmd = hdfs.create_hdfs_tmp() if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to create HDFS /tmp") for role in hdfs.get_all_roles(): cmd = hdfs.deploy_client_config(role.name) if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to deploy client config. Role: " + role.name) #Create YARN Service and Roles print "Creating YARN Service and Roles..." yarn = cluster.create_service("yarn", "YARN") resourceman = yarn.create_role("yarn-RESOURCEMANAGER-" + ROLEHASH[0], "RESOURCEMANAGER", HOSTNAMES[0]) jobhist = yarn.create_role("yarn-JOBHISTORY-" + ROLEHASH[0], "JOBHISTORY", HOSTNAMES[0]) for i in range(len(HOSTNAMES)-1): nodeman = yarn.create_role("yarn-NODEMANAGER-" + ROLEHASH[i+1], "NODEMANAGER", HOSTNAMES[i+1]) #Configure YARN print "Configuring YARN..." yarn.update_config(svc_config = YARN_CONF) for roleGroup in yarn.get_all_role_config_groups(): if roleGroup.roleType == "RESOURCEMANAGER": roleGroup.update_config(RSRCMAN_CONF) elif roleGroup.roleType == "JOBHISTORY": roleGroup.update_config(JOBHIST_CONF) elif roleGroup.roleType == "NODEMANAGER": roleGroup.update_config(NODEMAN_CONF) #Start YARN print "Starting YARN..." cmd = yarn.create_yarn_job_history_dir() if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to create Job History Directory") cmd = yarn.create_yarn_node_manager_remote_app_log_dir() if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to create NodeManager remote application log directory") cmd = yarn.start() if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to start YARN") for role in yarn.get_all_roles(): cmd = yarn.deploy_client_config(role.name) if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to deploy client config. Role: " + role.name) #SUCCESS! print "Cluster succesfully deployed." #Add new nodes elif(opts.action == "add"): print "Adding hosts..." cluster = api.get_cluster(CLUSTER_NAME); cluster.add_hosts(HOSTNAMES); print "Configurng HDFS Roles..." hdfs = cluster.get_service("hdfs") for i in range(len(HOSTNAMES)): datanode = hdfs.create_role("hdfs-DATANODE-" + ROLEHASH[i], "DATANODE", HOSTNAMES[i]) datanode.update_config(DATANODE_CONF); cmds = hdfs.start_roles("hdfs-DATANODE-" + ROLEHASH[i]) for cmd in cmds: if not cmd.wait(CMD_TIMEOUT).success: raise Exception(cmd.name) cmd = hdfs.deploy_client_config("hdfs-DATANODE-" + ROLEHASH[i]) if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to deploy client config hdfs-DATANODE-" + ROLEHASH[i]) print "Configuring YARN roles..." yarn = cluster.get_service("yarn") for i in range(len(HOSTNAMES)): nodeman = yarn.create_role("yarn-NODEMANAGER-" + ROLEHASH[i], "NODEMANAGER", HOSTNAMES[i]) nodeman.update_config(NODEMAN_CONF) cmds = yarn.start_roles("yarn-NODEMANAGER-" + ROLEHASH[i]) for cmd in cmds: if not cmd.wait(CMD_TIMEOUT).success: raise Exception(cmd.name) cmd = yarn.deploy_client_config("yarn-NODEMANAGER-" + ROLEHASH[i]) if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to deploy client config yarn-NODEMANAGER-" + ROLEHASH[i]) #print "Restarting HDFS service..." #cmd = hdfs.restart() #if not cmd.wait(CMD_TIMEOUT).success: # raise Exception("Failed to restart HDFS") #print "Restarting YARN service..." #cmd = yarn.restart() #if not cmd.wait(CMD_TIMEOUT).success: # raise Exception("Failed to restart YARN") #SUCCESS! 
print "Nodes successfully added" #Remove nodes elif(opts.action == "remove"): cluster = api.get_cluster(CLUSTER_NAME); hdfs = cluster.get_service("hdfs") yarn = cluster.get_service("yarn") print "Decommissioning Roles..." for role in ROLEHASH: cmd = yarn.decommission("yarn-NODEMANAGER-" + role) if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to decommission role yarn-NODEMANAGER" + role) cmd = hdfs.decommission("hdfs-DATANODE-" + role) if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to decommission role hdfs-DATANODE-" + role) print "Deleting Nodes..." for role in ROLEHASH: hdfs.delete_role("hdfs-DATANODE-" + role) yarn.delete_role("yarn-NODEMANAGER-" + role) for hostname in HOSTNAMES: cluster.remove_host(hostname); #SUCCESS print "Nodes successfull removed." #Delete Cluster elif(opts.action == "delete"): cluster = api.get_cluster(CLUSTER_NAME); hdfs = cluster.get_service("hdfs") yarn = cluster.get_service("yarn") print "Stopping YARN..." cmd = yarn.stop() if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to stop YARN") print "Stopping HDFS..." cmd = hdfs.stop() if not cmd.wait(CMD_TIMEOUT).success: raise Exception("Failed to stop HDFS") print "Deleting Cluster..." api.delete_cluster(CLUSTER_NAME) #SUCCESS print "Cluster successfully deleted." else: print "PLEASE SELECT A CORRECT OPTION" parser.print_help()
def main():
    config.read([
        "./conf/hadrian.ini", "./conf/cluster_specs.ini",
        "./conf/cloudera-manager/cm.ini"
    ])
    cm_cluster_name = config_grabber("Globals")["cm.cluster.name"]
    cm_username = config_grabber("Globals")["cm.username"]
    cm_password = config_grabber("Globals")["cm.password"]
    cm_port = config_grabber("Globals")["cm.port"]
    version = config_grabber("Globals")["cdh.cluster.version"]
    cm_server = config_grabber(cm_cluster_name + "-hn")["cm.server"]

    # Grab all configuration files in the directory with the CM cluster name.
    for i in os.listdir("./conf/" + cm_cluster_name):
        config.read("./conf/" + cm_cluster_name + "/" + i)

    while get_cm_status(cm_server + ":" + cm_port) != 200:
        logging.info("Waiting for CM Server to start... ")
        time.sleep(15)

    api = ApiResource(cm_server, cm_port, cm_username, cm_password, version=12)

    # create cluster or get existing cluster
    cluster_exists = False
    for i in api.get_all_clusters():
        if i.name == cm_cluster_name:
            cluster_exists = True
    if not cluster_exists:
        cluster = api.create_cluster(cm_cluster_name, version.upper())
        planned_nodes = config_grabber(cm_cluster_name + "-hn")["full.list"].split(",")
        for k, v in config_grabber(cm_cluster_name + "-dn").iteritems():
            for j in v.split(","):
                planned_nodes.append(j)

        # TODO make this smarter: show which agents haven't checked in and add the option to continue without them.
        if len(api.get_all_hosts()) != len(planned_nodes):
            logging.info("Waiting for all agents to check into the CM Server before continuing.")
            while len(planned_nodes) > len(api.get_all_hosts()):
                logging.info("Waiting for the final set of CM Agent nodes to check in.")
                time.sleep(5)

        logging.info("Updating rack configuration for data nodes.")
        all_hosts = list()
        for host in api.get_all_hosts():
            all_hosts.append(host.hostId)
            for k, v in config_grabber(cm_cluster_name + "-dn").iteritems():
                if host.hostname in v:
                    logging.info("Setting host: " + host.hostname + " to rack /" + k)
                    host.set_rack_id("/" + k)

        logging.info("Adding all hosts to cluster.")
        cluster.add_hosts(all_hosts)
    else:
        cluster = api.get_cluster(cm_cluster_name)

    # Configure Cloudera Manager
    logging.info("Applying any configuration changes to Cloudera Manager")
    cmanager = api.get_cloudera_manager()
    cmanager.update_config(config_grabber("cloudera-manager-updates"))

    if os.path.exists("/root/hadrian/cm_license.txt"):
        with open("/root/hadrian/cm_license.txt", "r") as license:
            logging.info("Applying Enterprise License to Cloudera Manager")
            cmanager.update_license(license.read())

    if config_grabber('Globals')['cdh.distribution.method'] == 'parcels':
        # increase the parcel refresh frequency to one minute so new parcel repos are found quickly
        cmanager.update_config({"PARCEL_UPDATE_FREQ": 1})
        distribute_parcel(cluster, 'CDH', config_grabber('Globals')['cdh.parcel.version'])
        distribute_parcel(cluster, 'KAFKA', config_grabber('Globals')['kafka.parcel.version'])
        # restore the parcel refresh period to the original 60 minutes
        cmanager.update_config({"PARCEL_UPDATE_FREQ": 60})

    # grab the current services so already-defined services are skipped, keeping this script re-entrant
    current_services = []
    for i in cluster.get_all_services():
        current_services.append(i.type)

    if "ZOOKEEPER" not in current_services:
        create_zookeeper_service(cluster)
    if "HDFS" not in current_services:
        create_hdfs_service(cluster, api)
    if "YARN" not in current_services:
        create_yarn_service(cluster)
    if "HIVE" not in current_services:
        create_hive_service(cluster)
    if "IMPALA" not in current_services:
        create_impala_service(cluster)
    if "KAFKA" not in current_services:
        create_kafka_service(cluster)

    if config_grabber("Globals")["kerberos.enabled"].lower() == "true":
        enable_kerberos(cluster, cmanager)
    else:
        logging.info("Starting remaining services.")
        cmd = cluster.start()
        if not cmd.wait(CMD_TIMEOUT).success:
            logging.info("Error in cluster services start. Please review Cloudera Manager for details.")
        else:
            logging.info("Remaining cluster services started.")

    logging.info("Starting final client configuration deployment for all services.")
    cmd = cluster.deploy_client_config()
    if not cmd.wait(CMD_TIMEOUT).success:
        logging.info("Failed to deploy client configuration.")
    else:
        logging.info("Client configuration deployment complete. The cluster is all yours. Happy Hadooping.")
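main() above calls a distribute_parcel helper (and a get_cm_status helper) that are defined elsewhere in the script. A minimal sketch of distribute_parcel, assuming it walks the named parcel through the usual lifecycle of download, distribute and activate, polling the parcel stage between steps with the same logging used in main():

def distribute_parcel(cluster, product, parcel_version):
    # Walk the named parcel through DOWNLOADED -> DISTRIBUTED -> ACTIVATED,
    # failing fast if Cloudera Manager reports errors for the parcel.
    def wait_for_stage(target_stage):
        while True:
            parcel = cluster.get_parcel(product, parcel_version)
            if parcel.state.errors:
                raise Exception(str(parcel.state.errors))
            if parcel.stage == target_stage:
                return
            logging.info("%s parcel %s: %s / %s", product, parcel.stage,
                         parcel.state.progress, parcel.state.totalProgress)
            time.sleep(15)

    parcel = cluster.get_parcel(product, parcel_version)
    parcel.start_download()
    wait_for_stage('DOWNLOADED')
    parcel.start_distribution()
    wait_for_stage('DISTRIBUTED')
    parcel.activate()
    wait_for_stage('ACTIVATED')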