Example #1
def init_cluster():
    # wait for all cloudera agent processes to come up
    BDVLIB_ServiceWait(
        [["services", "cloudera_scm_agent", NODE_GROUP_ID, "kts"]])
    # make sure cloudera manager has received registration
    # for all new agents
    all_cloudera_hosts = get_hosts_for_service(
        ["services", "cloudera_scm_agent"])
    api = ApiResource(CM_HOST, username="******", password="******")
    while True:
        current_all_hosts = map(lambda x: x.hostname, api.get_all_hosts())
        setup_logger.info("Currently registered hosts with CM " +
                          str(current_all_hosts))
        if all(x in current_all_hosts for x in all_cloudera_hosts):
            break
        setup_logger.info(
            "waiting for new nodes to register with cloudera manager")
        time.sleep(10)
    manager = api.get_cloudera_manager()
    manager.update_config(CM_CONFIG)
    cluster = api.create_cluster(CLUSTER_NAME, CDH_MAJOR_VERSION,
                                 CDH_FULL_VERSION)
    KTS_HOSTS = ConfigMeta.getWithTokens(
        ['nodegroups', NODE_GROUP_ID, 'roles', 'kts', 'fqdns'])
    cluster.add_hosts(KTS_HOSTS)

    return (cluster, manager)
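
The registration loop above polls Cloudera Manager forever. A minimal sketch of a bounded variant is shown below; the helper name, the timeout/interval parameters, and the RuntimeError are illustrative choices rather than part of the original script, and it assumes the same cm_api ApiResource client and hostname attributes used above.

import time
from cm_api.api_client import ApiResource

def wait_for_host_registration(api, expected_hosts, timeout=600, interval=10):
    """Block until every hostname in expected_hosts is registered with CM.

    Raises RuntimeError if the hosts do not all appear within `timeout` seconds.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        registered = [h.hostname for h in api.get_all_hosts()]
        missing = [h for h in expected_hosts if h not in registered]
        if not missing:
            return registered
        time.sleep(interval)
    raise RuntimeError("Hosts never registered with CM: %s" % missing)

# usage (hypothetical host list and credentials):
# api = ApiResource("cm-host.example.com", username="admin", password="admin")
# wait_for_host_registration(api, ["worker1.example.com", "worker2.example.com"])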
Example #2
def init_cluster():
    # wait for all cloudera agent processes to come up
    setup_logger.info("Creating Cluster.")
    BDVLIB_ServiceWait([["services", "cloudera_scm_agent", NODE_GROUP_ID]])
    # make sure cloudera manager has received registration
    # for all new agents
    all_cloudera_hosts = get_hosts_for_service(
        ["services", "cloudera_scm_agent"])
    api = ApiResource(CM_HOST, username=ADMIN_USER, password=ADMIN_PASS)
    while True:
        current_all_hosts = map(lambda x: x.hostname, api.get_all_hosts())
        setup_logger.info("Currently registered hosts with CM " +
                          str(current_all_hosts))
        if all(x in current_all_hosts for x in all_cloudera_hosts):
            break
        setup_logger.info(
            "waiting for new nodes to register with cloudera manager")
        time.sleep(10)
    manager = api.get_cloudera_manager()
    manager.update_config(CM_CONFIG)
    cluster = api.create_cluster(CLUSTER_NAME, CDH_MAJOR_VERSION,
                                 CDH_FULL_VERSION)
    cluster.add_hosts(ALL_HOSTS)

    # turn off host swap alerting
    hosts_swap_alert_off(api)

    setup_logger.info("Setting Up SPARK2 Repo....")
    add_spark2_repo(api)
    # Set java home
    setup_logger.info("Setting Up Java Path....")
    hosts_set_javahome(api)

    return (cluster, manager)
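
hosts_set_javahome is called above but not shown. One plausible way to do it with cm_api is to update the CM all-hosts configuration; the sketch below is an assumption, not the original helper, and the 'java_home' property name and JDK path are placeholders that should be checked against the Cloudera Manager host configuration reference for your CM version.

from cm_api.api_client import ApiResource

def hosts_set_javahome_sketch(api, java_home="/usr/java/jdk1.8.0"):
    """Point every managed host at a specific JDK via the CM all-hosts config.

    Assumption: the all-hosts config property is named 'java_home'; the path is
    a placeholder.
    """
    cm = api.get_cloudera_manager()
    cm.update_all_hosts_config({"java_home": java_home})

# api = ApiResource(CM_HOST, username=ADMIN_USER, password=ADMIN_PASS)
# hosts_set_javahome_sketch(api, "/usr/lib/jvm/java-8-openjdk")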
Example #3
logging.info('Starting the Cloudera Manager service')
mgmt.start().wait()

# Update the Parcels repo
logging.info('Updating the remote parcels repo')
cm_config = api.get_cloudera_manager().get_config(view='full')
repo_urls = cdh_parcel_repo + ',' + kafka_parcel_repo
api.get_cloudera_manager().update_config(
    {'REMOTE_PARCEL_REPO_URLS': repo_urls})
time.sleep(10)

# Download the CDH Parcel
logging.info('Downloading the CDH parcel')
cluster_name = 'Open Data Platform'
cluster = api.create_cluster(cluster_name, version='CDH5')
cluster.add_hosts(hosts)
cdh_parcel = cluster.get_parcel('CDH', cdh_parcel_version)
cdh_parcel.start_download()
while True:
    cdh_parcel = cluster.get_parcel('CDH', cdh_parcel_version)
    if cdh_parcel.stage == 'DOWNLOADED':
        break
    if cdh_parcel.state.errors:
        raise Exception(str(cdh_parcel.state.errors))
    logging.info('Parcel download progress: %s / %s',
                 cdh_parcel.state.progress, cdh_parcel.state.totalProgress)
    time.sleep(15)  # check again in 15 seconds

logging.info('Downloaded CDH parcel version %s on cluster %s',
             cdh_parcel_version, cluster_name)
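
The download loop above generalizes into a small polling helper that waits for any target parcel stage, using only the cluster.get_parcel call and the stage/state attributes already shown in this example. The helper name, poll interval, and logging are illustrative.

import logging
import time

def wait_for_parcel_stage(cluster, product, version, target_stage, poll_secs=15):
    """Poll a parcel until it reaches target_stage ('DOWNLOADED', 'DISTRIBUTED', ...).

    Raises an Exception if CM reports errors for the parcel while waiting.
    """
    while True:
        parcel = cluster.get_parcel(product, version)
        if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
        if parcel.stage == target_stage:
            return parcel
        logging.info('%s %s at stage %s (%s / %s)', product, version,
                     parcel.stage, parcel.state.progress,
                     parcel.state.totalProgress)
        time.sleep(poll_secs)

# cdh_parcel = cluster.get_parcel('CDH', cdh_parcel_version)
# cdh_parcel.start_download()
# wait_for_parcel_stage(cluster, 'CDH', cdh_parcel_version, 'DOWNLOADED')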
Example #4
class Deploy:
    def __init__(self,
                 cm_port='7180',
                 cm_user='******',
                 cm_passwd='admin',
                 cluster_name='cluster1'):

        self.cluster_name = cluster_name
        self.cdh_version = "CDH5"

        self.cfg = ParseConfig()
        self.host_list = self.cfg.get_hosts()

        self._get_host_allocate()
        self.cm_host = self.host_list[0]

        self.api = ApiResource(self.cm_host,
                               cm_port,
                               cm_user,
                               cm_passwd,
                               version=7)
        self.cm = self.api.get_cloudera_manager()

        try:
            self.cluster = self.api.get_cluster(self.cluster_name)
        except:
            try:
                self.cluster = self.api.create_cluster(self.cluster_name,
                                                       self.cdh_version)
            except:
                err('Cannot connect to cloudera manager on %s' % self.cm_host)

        # add all our hosts to the cluster
        try:
            self.cluster.add_hosts(self.host_list)
            info('Add hosts successfully')
        except Exception as e:
            if e.code == 400:
                info('Already Added hosts')
            elif e.code == 404:
                err(e.message)

    def _auto_allocate(self, hosts):
        # dedicate a mgmt node if node count is at least mgmt_th
        mgmt_th = 6

        if type(hosts) != list: err('hosts parameter should be a list')
        host_num = len(hosts)
        # node<=3, ZK=1 ,node>3, ZK=3
        zk_num = 1 if host_num <= 3 else 3

        # with mgmt node
        if host_num >= mgmt_th:
            self.ap_host = self.es_host = self.ho_host = self.sm_host = \
                self.nn_host = self.hm_host = self.jt_host = hosts[0]
            self.dn_hosts = self.rs_hosts = self.tt_hosts = hosts[1:]
            self.snn_host = hosts[1]
            self.hms_host = hosts[2]
            self.hs2_host = hosts[3]
        # without mgmt node
        else:
            if host_num == 1:
                self.ap_host = self.es_host = self.ho_host = self.sm_host = self.jt_host = \
                self.nn_host = self.hm_host = self.snn_host = self.hms_host = self.hs2_host = hosts[0]
            elif host_num > 1:
                # nn, snn not on same node
                tmp_hosts = hosts[:]
                self.nn_host = choice(tmp_hosts)
                tmp_hosts.remove(self.nn_host)
                self.snn_host = choice(tmp_hosts)
                self.hm_host = choice(tmp_hosts)
                self.jt_host = choice(hosts)
                self.hms_host = choice(hosts)
                self.hs2_host = choice(hosts)
                # cm
                self.ap_host = choice(hosts)
                self.es_host = choice(hosts)
                self.ho_host = choice(hosts)
                self.sm_host = choice(hosts)

            self.dn_hosts = self.rs_hosts = self.tt_hosts = hosts

        self.zk_hosts = hosts[-zk_num:]

    def _get_host_allocate(self):
        roles = self.cfg.get_roles()
        # auto set if no role config found
        if not roles:
            self._auto_allocate(self.host_list)
            return

        valid_roles = [
            'DN', 'RS', 'ZK', 'HM', 'NN', 'SNN', 'AP', 'ES', 'SM', 'HO', 'TT',
            'JT', 'HMS', 'HS2'
        ]
        role_host = defaultdict(list)

        for item in roles:
            for role in item[1]:
                role = role.strip()
                if role not in valid_roles: err('Incorrect role config')
                role_host[role].append(item[0])

        # cdh
        self.nn_host = role_host['NN'][0]
        self.snn_host = role_host['SNN'][0]
        self.hm_host = role_host['HM'][0]
        self.jt_host = role_host['JT'][0]
        self.hms_host = role_host['HMS'][0]
        self.hs2_host = role_host['HS2'][0]
        self.tt_hosts = role_host['TT']
        self.zk_hosts = role_host['ZK']
        self.dn_hosts = role_host['DN']
        self.rs_hosts = role_host['RS']
        # cm
        self.ap_host = role_host['AP'][0]
        self.es_host = role_host['ES'][0]
        self.ho_host = role_host['HO'][0]
        self.sm_host = role_host['SM'][0]

    def setup_cms(self):
        try:
            self.cm.delete_mgmt_service()
        except:
            pass

        # create the management service
        try:
            mgmt = self.cm.create_mgmt_service(ApiServiceSetupInfo())
            mgmt.create_role('AlertPublisher', "ALERTPUBLISHER", self.ap_host)
            mgmt.create_role('EventServer', "EVENTSERVER", self.es_host)
            mgmt.create_role('HostMonitor', "HOSTMONITOR", self.hm_host)
            mgmt.create_role('ServiceMonitor', "SERVICEMONITOR", self.sm_host)
            ok('Cloudera management service created successfully.')
        except ApiException:
            info('Cloudera management service had already been created.')

    def setup_parcel(self):
        parcels_list = []
        i = 1
        for p in self.cluster.get_all_parcels():
            if p.stage == 'AVAILABLE_REMOTELY': continue
            elif p.stage == 'ACTIVATED':
                info('Parcel [%s] has already been activated' % p.version)
                return
            else:
                print '\t' + str(i) + ': ' + p.product + ' ' + p.version
                i += 1
                parcels_list.append(p)

        if len(parcels_list) == 0:
            err('No downloaded ' + self.cdh_version + ' parcel found!')
        elif len(parcels_list) > 1:
            index = raw_input('Input parcel number:')
            if not index.isdigit():
                err('Error index, must be a number')
            cdh_parcel = parcels_list[int(index) - 1]
        else:
            cdh_parcel = parcels_list[0]

    #  # download the parcel
    #  print "Starting parcel download. This might take a while."
    #  cmd = cdh_parcel.start_download()
    #  if cmd.success != True:
    #      print "Parcel download failed!"
    #      exit(0)

    #  # make sure the download finishes
    #  while cdh_parcel.stage != 'DOWNLOADED':
    #      sleep(5)
    #      cdh_parcel = self.cluster.get_parcel(cdh_parcel.product, cdh_parcel.version)

    #  print cdh_parcel.product + ' ' + cdh_parcel.version + " downloaded"

        # distribute the parcel
        info('Starting parcel distribution. This might take a while.')
        cmd = cdh_parcel.start_distribution()
        i = 0
        while cmd.success == None:
            i += 1
            sleep(5)
            cmd = cmd.fetch()
            s = '.' * i
            print '\r%s' % s,
            sys.stdout.flush()
        if cmd.success != True:
            err('Parcel distribution failed!')

        # make sure the distribution finishes
        while cdh_parcel.stage != "DISTRIBUTED":
            sleep(5)
            cdh_parcel = self.cluster.get_parcel(cdh_parcel.product,
                                                 cdh_parcel.version)

        ok(cdh_parcel.product + ' ' + cdh_parcel.version + ' distributed')

        # activate the parcel
        cmd = cdh_parcel.activate()
        if cmd.success != True:
            err('Parcel activation failed!')

        # make sure the activation finishes
        while cdh_parcel.stage != "ACTIVATED":
            sleep(5)
            cdh_parcel = self.cluster.get_parcel(cdh_parcel.product,
                                                 cdh_parcel.version)

        ok(cdh_parcel.product + ' ' + cdh_parcel.version + ' activated')

    def _create_service(self, sdata):
        try:
            self.cluster.get_service(sdata['sname'])
            info('Service %s had already been configured' % sdata['sname'])
        except ApiException:
            service = self.cluster.create_service(sdata['sname'],
                                                  sdata['stype'])
            ok('Service %s had been created successfully' % sdata['sname'])
            for role in sdata['roles']:
                if role.has_key('rhost'):
                    service.create_role(role['rname'], role['rtype'],
                                        role['rhost'])
                elif role.has_key('rhosts'):
                    rid = 0
                    for host in role['rhosts']:
                        rid += 1
                        service.create_role(role['rname'] + '-' + str(rid),
                                            role['rtype'], host)

    def setup_cdh(self):
        service_data = [{
            'sname': 'hdfs',
            'stype': 'HDFS',
            'roles': [{
                'rname': 'hdfs-namenode',
                'rtype': 'NAMENODE',
                'rhost': self.nn_host
            }, {
                'rname': 'hdfs-secondarynamenode',
                'rtype': 'SECONDARYNAMENODE',
                'rhost': self.snn_host
            }, {
                'rname': 'hdfs-datanode',
                'rtype': 'DATANODE',
                'rhosts': self.dn_hosts
            }]
        }, {
            'sname': 'zookeeper',
            'stype': 'ZOOKEEPER',
            'roles': [{
                'rname': 'zookeeper',
                'rtype': 'SERVER',
                'rhosts': self.zk_hosts
            }]
        }, {
            'sname': 'hbase',
            'stype': 'HBASE',
            'roles': [{
                'rname': 'hbase-master',
                'rtype': 'MASTER',
                'rhost': self.hm_host
            }, {
                'rname': 'hdfs-regionserver',
                'rtype': 'REGIONSERVER',
                'rhosts': self.rs_hosts
            }]
        }, {
            'sname': 'hive',
            'stype': 'HIVE',
            'roles': [{
                'rname': 'hive-metastore',
                'rtype': 'HIVEMETASTORE',
                'rhost': self.hms_host
            }, {
                'rname': 'hive-server2',
                'rtype': 'HIVESERVER2',
                'rhost': self.hs2_host
            }, {
                'rname': 'hive-gateway',
                'rtype': 'GATEWAY',
                'rhosts': self.dn_hosts
            }]
        }, {
            'sname': 'mapreduce',
            'stype': 'MAPREDUCE',
            'roles': [{
                'rname': 'mapreduce-jobtracker',
                'rtype': 'JOBTRACKER',
                'rhost': self.jt_host
            }, {
                'rname': 'mapreduce-tasktracker',
                'rtype': 'TASKTRACKER',
                'rhosts': self.tt_hosts
            }]
        }]

        for sdata in service_data:
            self._create_service(sdata)

        # additional config for hive
        try:
            hive_service = self.cluster.get_service('hive')
            hive_metastore_host = self.cm_host  # should be same as cm's host, FQDN
            hive_metastore_name = 'hive'
            hive_metastore_password = '******'
            hive_metastore_database_port = '7432'
            hive_metastore_database_type = 'postgresql'
            hive_config = { 'hive_metastore_database_host' : hive_metastore_host, \
                            'hive_metastore_database_name' : hive_metastore_name, \
                            'hive_metastore_database_password' : hive_metastore_password, \
                            'hive_metastore_database_port' : hive_metastore_database_port, \
                            'hive_metastore_database_type' : hive_metastore_database_type }
            hive_service.update_config(hive_config)
            ok('Additional hive configs had been updated')
        except ApiException as e:
            err(e.message)

        # use auto configure for *-site.xml configs
        try:
            self.cluster.auto_configure()
        except ApiException as e:
            err(e.message)

    def start_cms(self):
        # start the management service
        info('Starting cloudera management service...')
        cms = self.cm.get_service()
        cms.start().wait()
        ok('Cloudera management service started successfully')

    def start_cdh(self):
        info('Executing first run command. This might take a while.')
        cmd = self.cluster.first_run()

        while cmd.success == None:
            cmd = cmd.fetch()
            sleep(1)

        if cmd.success != True:
            err('The first run command failed: ' + cmd.resultMessage)

        ok('First run successfully executed. Your cluster has been set up!')
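
Tying the Deploy class together, a driver would simply run its steps in order. The sketch below only calls the methods defined above and assumes ParseConfig and the host/role configuration it reads are already in place; the __main__ wiring is illustrative, not part of the original example.

if __name__ == '__main__':
    deploy = Deploy()          # defaults: port 7180, cluster name 'cluster1'
    deploy.setup_cms()         # create the Cloudera Management Service roles
    deploy.setup_parcel()      # distribute and activate a downloaded CDH parcel
    deploy.setup_cdh()         # create HDFS/ZooKeeper/HBase/Hive/MapReduce services
    deploy.start_cms()         # start the management service
    deploy.start_cdh()         # first_run() the cluster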
Example #5
class ClouderaManager(object):
    """
    The complete orchestration of a cluster from start to finish, assuming all the hosts are
    configured and Cloudera Manager is installed with all the required databases set up.

    Handles all the steps required to create a cluster. All the functions are built to be
    idempotent, so you should be able to resume from any failed step by re-running
    __class__.setup().
    """
    def __init__(self, module, config, trial=False, license_txt=None):
        self.api = ApiResource(config['cm']['host'],
                               username=config['cm']['username'],
                               password=config['cm']['password'])
        self.manager = self.api.get_cloudera_manager()
        self.config = config
        self.module = module
        self.trial = trial
        self.license_txt = license_txt
        self.cluster = None

    def enable_license(self):
        """
        Enable the requested license: either trial mode, or a full license that is entered
        and registered.
        """
        try:
            _license = self.manager.get_license()
        except ApiException:
            print_json(type="LICENSE", msg="Enabling license")
            if self.trial:
                self.manager.begin_trial()
            else:
                if self.license_txt is not None:
                    self.manager.update_license(self.license_txt)
                else:
                    fail(
                        self.module,
                        'License should be provided or trial should be specified'
                    )

            try:
                _license = self.manager.get_license()
            except ApiException:
                fail(self.module, 'Failed enabling license')
        print_json(type="LICENSE",
                   msg="Owner: {}, UUID: {}".format(_license.owner,
                                                    _license.uuid))

    def create_cluster(self):
        """
        Create a cluster and add hosts to the cluster. A new cluster is only created
        if another one doesn't exist with the same name.
        """
        print_json(type="CLUSTER", msg="Creating cluster")
        cluster_config = self.config['cluster']
        try:
            self.cluster = self.api.get_cluster(cluster_config['name'])
        except ApiException:
            print_json(type="CLUSTER",
                       msg="Creating Cluster entity: {}".format(
                           cluster_config['name']))
            self.cluster = self.api.create_cluster(
                cluster_config['name'], cluster_config['version'],
                cluster_config['fullVersion'])

        cluster_hosts = [
            self.api.get_host(host.hostId).hostname
            for host in self.cluster.list_hosts()
        ]
        hosts = []
        for host in cluster_config['hosts']:
            if host not in cluster_hosts:
                hosts.append(host)
        self.cluster.add_hosts(hosts)

    def activate_parcels(self):
        print_json(type="PARCELS", msg="Setting up parcels")
        for parcel_cfg in self.config['parcels']:
            parcel = Parcels(self.module, self.manager, self.cluster,
                             parcel_cfg.get('version'), parcel_cfg.get('repo'),
                             parcel_cfg.get('product', 'CDH'))
            parcel.download()
            parcel.distribute()
            parcel.activate()

    @retry(attempts=20, delay=5)
    def wait_inspect_hosts(self, cmd):
        """
        Inspect all the hosts. Basically wait till the check completes on all hosts.

        :param cmd: A command instance used for tracking the status of the command
        """
        print_json(type="HOSTS", msg="Inspecting hosts")
        cmd = cmd.fetch()
        if cmd.success is None:
            raise ApiException("Waiting on command {} to finish".format(cmd))
        elif not cmd.success:
            if (cmd.resultMessage is not None
                    and 'is not currently available for execution'
                    in cmd.resultMessage):
                raise ApiException('Retry Command')
            fail(self.module, 'Host inspection failed')
        print_json(type="HOSTS",
                   msg="Host inspection completed: {}".format(
                       cmd.resultMessage))

    def deploy_mgmt_services(self):
        """
        Configure, deploy and start all the Cloudera Management Services.
        """
        print_json(type="MGMT", msg="Deploying Management Services")
        try:
            mgmt = self.manager.get_service()
            if mgmt.serviceState == 'STARTED':
                return
        except ApiException:
            print_json(type="MGMT",
                       msg="Management Services don't exist. Creating.")
            mgmt = self.manager.create_mgmt_service(ApiServiceSetupInfo())

        for role in self.config['services']['MGMT']['roles']:
            if not len(mgmt.get_roles_by_type(role['group'])) > 0:
                print_json(type="MGMT",
                           msg="Creating role for {}".format(role['group']))
                mgmt.create_role('{}-1'.format(role['group']), role['group'],
                                 role['hosts'][0])

        for role in self.config['services']['MGMT']['roles']:
            role_group = mgmt.get_role_config_group('mgmt-{}-BASE'.format(
                role['group']))
            role_group.update_config(role.get('config', {}))

        mgmt.start().wait()
        if self.manager.get_service().serviceState == 'STARTED':
            print_json(type="MGMT", msg="Management Services started")
        else:
            fail(
                self.module,
                "[MGMT] Cloudera Management services didn't start up properly")

    def service_orchestrate(self, services):
        """
        Create and pre-configure the provided list of services,
        start those services, and perform any post-startup actions.

        :param services: List of Services to perform service specific actions
        """
        service_classes = []

        # Create and pre-configure provided services
        for service in services:
            service_config = self.config['services'].get(service.upper())
            if service_config:
                svc = getattr(sys.modules[__name__], service)(self.cluster,
                                                              service_config)
                if not svc.started:
                    svc.deploy()
                    svc.pre_start()
                service_classes.append(svc)

        print_json(type="CLUSTER",
                   msg="Starting services: {} on Cluster".format(services))

        # Deploy all the client configs, since some of the services depend on other services
        # and it is essential that the client configs are in place
        self.cluster.deploy_client_config()

        # Start each service and run the post_start actions for each service
        for svc in service_classes:
            # Only go thru the steps if the service is not yet started. This helps with
            # re-running the script after fixing errors
            if not svc.started:
                svc.start()
                svc.post_start()

    def setup(self):
        # TODO(rnirmal): Cloudera Manager SSL?

        # Enable a full license or start a trial
        self.enable_license()

        # Create the cluster entity and associate hosts
        self.create_cluster()

        # Download and activate the parcels
        self.activate_parcels()

        # Inspect all the hosts
        self.wait_inspect_hosts(self.manager.inspect_hosts())

        # Create Management services
        self.deploy_mgmt_services()

        # Configure and Start base services
        self.service_orchestrate(BASE_SERVICES)

        # Configure and Start remaining services
        self.service_orchestrate(ADDITIONAL_SERVICES)
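
For reference, the config dictionary that __init__, create_cluster, and activate_parcels read from would look roughly like the sketch below. Every hostname, version string, and repo URL here is a placeholder, the 'services' blocks consumed by service_orchestrate are left empty, and the AnsibleModule wiring is omitted.

# hypothetical shape of the config consumed by the ClouderaManager class above
config = {
    'cm': {
        'host': 'cm-host.example.com',
        'username': 'admin',
        'password': 'admin',
    },
    'cluster': {
        'name': 'cluster1',
        'version': 'CDH5',
        'fullVersion': '5.8.0',
        'hosts': ['worker1.example.com', 'worker2.example.com'],
    },
    'parcels': [
        {'product': 'CDH', 'version': '5.8.0-1.cdh5.8.0.p0.42',
         'repo': 'http://archive.example.com/cdh5/parcels/5.8.0/'},
    ],
    'services': {},  # per-service role/config blocks used by service_orchestrate()
}

# cm = ClouderaManager(module, config, trial=True)
# cm.setup()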
Example #6
def main():
  module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'}) for argument in MODULE_ARGUMENTS))

  api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS, version=9)
  cluster_name = CLUSTER_NAME

  manager = api.get_cloudera_manager()

  action_a = module.params.get('action', None)

  if action_a == 'create_cluster':
    license_a = module.params.get('license', None)
    version_a = module.params.get('version', None)

    cluster_list = [x.name for x in api.get_all_clusters()]
    if cluster_name in cluster_list:
      module.exit_json(changed=False, msg='Cluster exists')
    else:
      cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a)
      if license_a == None:
        manager.begin_trial()
      else:
        manager.update_license(license_a.decode('base64'))
      module.exit_json(changed=True, msg='Cluster created')
  elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel', 'deploy_hdfs_base', 'deploy_hdfs_httpfs', 'deploy_hdfs_dn', 'deploy_hdfs_ha', 'deploy_rm_ha', 'set_config', 'service', 'deploy_service', 'deploy_service_worker_nodes', 'deploy_base_roles', 'run_command', 'cluster', 'create_snapshot_policy', 'deploy_configuration']:
    # more complicated actions that need a created cluster go here
    cluster = api.get_cluster(cluster_name)
    host_map = dict((api.get_host(x.hostId).hostname, x.hostId) for x in cluster.list_hosts())

    # adds a host to the cluster
    # host_name should be in the internal DNS format, ip-xx-xx-xx.compute.internal
    if action_a == 'add_host':
      host_a = module.params.get('host', None)

      host_list = host_map.keys()
      if host_a in host_list:
        module.exit_json(changed=False, msg='Host already in cluster')
      else:
        try:
          cluster.add_hosts([host_a])
        except ApiException:
          # if a host isn't there, it could be because the agent didn't manage to connect yet
          # so let's wait a moment for it
          sleep(120)
          cluster.add_hosts([host_a])

        module.exit_json(changed=True, msg='Host added')

    # create management service and set its basic configuration
    # this needs a separate function since management is handled
    # differently than the rest of services
    elif action_a == 'create_mgmt':
      host_a = module.params.get('host', None)

      # getting the management service is the only way to check if mgmt exists
      # an exception means there isn't one
      try:
        mgmt = manager.get_service()
        module.exit_json(changed=False, msg='Mgmt service already exists')
      except ApiException:
        pass

      mgmt = manager.create_mgmt_service(ApiServiceSetupInfo())

      # this is ugly... and I see no good way to unuglify it
      firehose_passwd = Popen("sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n")
      reports_passwd = Popen("sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n")

      # since there is no easy way of configuring the manager... let's do it here :(
      role_conf = defaultdict(dict)
      role_conf['ACTIVITYMONITOR'] = {
          'firehose_database_host': '{0}:7432'.format(host_a),
          'firehose_database_user': '******',
          'firehose_database_password': firehose_passwd,
          'firehose_database_type': 'postgresql',
          'firehose_database_name': 'amon',
          'firehose_heapsize': '268435456',
      }
      role_conf['EVENTSERVER'] = {
          'event_server_heapsize': '215964392'
      }
      role_conf['REPORTSMANAGER'] = {
          'headlamp_database_host': '{0}:7432'.format(host_a),
          'headlamp_database_user': '******',
          'headlamp_database_password': reports_passwd,
          'headlamp_database_type': 'postgresql',
          'headlamp_database_name': 'rman',
          'headlamp_heapsize': '268435456',
      }

      roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER', 'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER']
      # create management roles
      for role in roles:
        mgmt.create_role('{0}-1'.format(role), role, host_map[host_a])

      # update configuration of each
      for group in mgmt.get_all_role_config_groups():
        group.update_config(role_conf[group.roleType])

      mgmt.start().wait()
      # after starting this service needs time to spin up
      sleep(30)
      module.exit_json(changed=True, msg='Mgmt created and started')

    # deploy a given parcel on all hosts in the cluster
    # you can specify a substring of the version ending with latest, for example 5.3-latest instead of 5.3.5-1.cdh5.3.5.p0.4
    elif action_a == 'deploy_parcel':
      name_a = module.params.get('name', None)
      version_a = module.params.get('version', None)

      if "latest" in version_a:
        available_versions = [x.version for x in cluster.get_all_parcels() if x.product == name_a]
        if "-latest" in version_a:
          version_substr = match('(.+?)-latest', version_a).group(1)
        # if version is just "latest", try to check everything
        else:
          version_substr = ".*"
        try:
          [version_parcel] = [x for x in available_versions if re.match(version_substr, x) != None]
        except ValueError:
          module.fail_json(msg='Specified version {0} doesnt appear in {1} or appears twice'.format(version_substr, available_versions))
      else:
        version_parcel = version_a

      # we now go through various stages of getting the parcel
      # as there is no built-in way of waiting for an operation to complete
      # we use loops with sleep to get it done
      parcel = cluster.get_parcel(name_a, version_parcel)
      if parcel.stage == 'AVAILABLE_REMOTELY':
        parcel.start_download()

        while parcel.stage != 'DOWNLOADED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
          sleep(10)

      if parcel.stage == 'DOWNLOADED':
        parcel.start_distribution()

        while parcel.stage != 'DISTRIBUTED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
          # sleep while hosts report problems after the download
          for i in range(12):
            sleep(10)
            if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0:
              break

      # since parcels are distributed automatically when a new host is added to a cluster
      # we can encounter the ACTIVATING stage at this point
      if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING':
        if parcel.stage == 'DISTRIBUTED':
          parcel.activate()

        while parcel.stage != 'ACTIVATED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          # this sleep has to be large because although the operation is very fast
          # it makes the management and cloudera hosts go bonkers, failing all of the health checks
          sleep(10)

        # sleep while hosts report problems after the distribution
        for i in range(60):
          sleep(10)
          if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0:
            break

        module.exit_json(changed=True, msg='Parcel activated')

      if parcel.stage == 'ACTIVATED':
        module.exit_json(changed=False, msg='Parcel already activated')

      # if we get down here, something is not right
      module.fail_json(msg='Invalid parcel state')

    # deploy nodes for workers, according to SERVICE_WORKER_MAP
    # also give them sane names and init the zookeeper and kafka ones,
    # which need IDs specified
    elif action_a == 'deploy_service_worker_nodes':
      host_a = module.params.get('host', None)
      service_a = module.params.get('service', None)

      service_name = SERVICE_MAP[service_a]
      role_name = SERVICE_WORKER_MAP[service_a]['name']
      full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring']

      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
      else:
        service = cluster.get_service(service_name)

      nodes = [x for x in service.get_all_roles() if role_name in x.name]

      # if host already has the given group, we should skip it
      if host_map[host_a] in [x.hostRef.hostId for x in nodes]:
        module.exit_json(changed=False, msg='Host already is a {0}'.format(role_name))
      # find out the highest id that currently exists
      else:
        node_names = [x.name for x in nodes]
        if len(node_names) == 0:
          # if no nodes, start numbering from 1
          node_i = 1
        else:
          # take the max number and add 1 to it
          node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1

        if service_name == 'ZOOKEEPER':
          role = service.create_role(full_role_name.format(node_i), 'SERVER', host_a)
          # zookeeper needs a per-node ID in the configuration, so we set it now
          role.update_config({'serverId': node_i})
        elif service_name == 'KAFKA':
          role = service.create_role(full_role_name.format(node_i), role_name, host_a)
          # kafka needs a per-node ID in the configuration, so we set it now
          role.update_config({'broker.id': node_i})
        else:
          service.create_role(full_role_name.format(node_i), role_name, host_a)

        module.exit_json(changed=True, msg='Added host to {0} role'.format(role_name))

    # deploy a service. just create it, don't do anything more
    # this is needed mainly when we have to set service properties before role deployment
    elif action_a == 'deploy_service':
      name_a = module.params.get('name', None)

      if not name_a in SERVICE_MAP:
        module.fail_json(msg='Unknown service: {0}'.format(name_a))
      service_name = SERVICE_MAP[name_a]
      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
        module.exit_json(changed=True, msg='{0} service created'.format(service_name))
      else:
        module.exit_json(changed=False, msg='{0} service already exists'.format(service_name))

    # deploy the base hdfs roles (the namenode and secondary)
    # this doesn't create the service, as at least one datanode should already be added!
    # the format also requires certain properties to be set before we run it
    elif action_a == 'deploy_hdfs_base':
      nn_host_a = module.params.get('nn_host', None)
      sn_host_a = module.params.get('sn_host', None)

      changed = False

      hdfs = cluster.get_service('HDFS')
      hdfs_roles = [x.name for x in hdfs.get_all_roles()]

      # don't create a secondary namenode when:
      # - there is one that already exists
      # - there is a second namenode, which means we have HA and don't need a secondary
      if not 'HDFS-SECONDARYNAMENODE' in hdfs_roles and not 'HDFS-NAMENODE-2' in hdfs_roles:
        hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE', sn_host_a)
        changed = True

      # create a namenode and format its FS
      # formatting the namenode requires at least one datanode and secondary namenode already in the cluster!
      if not 'HDFS-NAMENODE' in hdfs_roles:
        hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a)
        for command in hdfs.format_hdfs('HDFS-NAMENODE'):
          if command.wait().success == False:
            module.fail_json(msg='Failed formatting HDFS namenode with error: {0}'.format(command.resultMessage))
        changed = True

      module.exit_json(changed=changed, msg='Created HDFS service & NN roles')

    # enable HttpFS for HDFS
    # HUE requires this to support HA in HDFS
    elif action_a == 'deploy_hdfs_httpfs':
      host_a = module.params.get('host', None)
      
      hdfs = cluster.get_service('HDFS')
      hdfs_roles = [x.name for x in hdfs.get_all_roles()]
      
      # don't install second instance of HttpFS
      if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0:
        module.exit_json(changed=False, msg='HDFS HttpFS service already exists')
       
      hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a]) 
        
      module.exit_json(changed=True, msg='HDFS HttpFS service created')
      
    # enable HA for HDFS
    # this deletes the secondary namenode and creates a second namenode in its place
    # also, this spawns 3 journal node and 2 failover controller roles
    elif action_a == 'deploy_hdfs_ha':
      sn_host_a = module.params.get('sn_host', None)
      jn_dir_a = module.params.get('jn_dir', None)
      jn_names_a = [module.params.get('jn1_host', None), module.params.get('jn2_host', None), module.params.get('jn3_host', None)]

      hdfs = cluster.get_service('HDFS')

      # if there's a second namenode, this means we already have HA enabled
      if not 'HDFS-NAMENODE-2' in [x.name for x in hdfs.get_all_roles()]:
        # this is bad and I should feel bad
        # jns is a list of dictionaries, each dict passes the required journalnode parameters
        jns = [{'jnHostId': host_map[jn_name], 'jnEditsDir': jn_dir_a, 'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)} for i, jn_name in enumerate(jn_names_a)]

        # this call is so long because we set some predictable names for the services
        command = hdfs.enable_nn_ha('HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns, zk_service_name='ZOOKEEPER',
                                    active_fc_name='HDFS-FAILOVERCONTROLLER-1', standby_fc_name='HDFS-FAILOVERCONTROLLER-2', standby_name='HDFS-NAMENODE-2')

        children = command.wait().children
        for command_children in children:
          # The format command is expected to fail, since we already formatted the namenode
          if command_children.name != 'Format' and command.success == False:
            module.fail_json(msg='Command {0} failed when enabling HDFS HA with error {1}'.format(command_children.name, command_children.resultMessage))
        module.exit_json(changed=True, msg='Enabled HA for HDFS service')
      else:
        module.exit_json(changed=False, msg='HDFS HA already enabled')
    # enable HA for YARN
    elif action_a == 'deploy_rm_ha':
      sn_host_a = module.params.get('sn_host', None)

      yarn = cluster.get_service('YARN')

      # if only one ResourceManager role matches this name, HA for YARN is not enabled yet
      if len([0 for x in yarn.get_all_roles() if match('^YARN-RESOURCEMANAGER.*$', x.name) != None]) == 1:
        command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER')
        children = command.wait().children
        for command_children in children:
          if command.success == False:
            module.fail_json(msg='Command {0} failed when enabling YARN HA with error {1}'.format(command_children.name, command_children.resultMessage))
        module.exit_json(changed=True, msg='Enabled HA for YARN service')
      else:
        module.exit_json(changed=False, msg='YARN HA already enabled')

    # deploy the base roles for a service, according to BASE_SERVICE_ROLE_MAP
    # after the deployments run commands specified in BASE_SERVICE_ROLE_MAP
    elif action_a == 'deploy_base_roles':
      host_a = module.params.get('host', None)
      service_a = module.params.get('service', None)

      service_name = SERVICE_MAP[service_a]
      changed = False

      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
      else:
        service = cluster.get_service(service_name)

      service_roles = [x.name for x in service.get_all_roles()]

      # create each service from the map
      for (role_name, cloudera_name) in BASE_SERVICE_ROLE_MAP[service_a].items():
        # check if the role already exists; the script can't compare names directly because
        # after enabling HA on YARN, roles will have random strings in their names
        if len([0 for x in service_roles if match(role_name, x) != None]) == 0:
          service.create_role(role_name, cloudera_name, host_a)
          changed = True

          # init commands
          if role_name in SERVICE_INIT_COMMANDS.keys():
            for command_to_run in SERVICE_INIT_COMMANDS[role_name]:
              # different handling of commands specified by name and
              # ones specified by an instance method
              if ismethod(command_to_run):
                command = command_to_run(service)
              else:
                command = service.service_command_by_name(command_to_run)

              if command.wait().success == False:
                module.fail_json(msg='Running {0} failed with {1}'.format(command_to_run, command.resultMessage))

      if changed == True:
        module.exit_json(changed=True, msg='Created base roles for {0}'.format(service_name))
      else:
        module.exit_json(changed=False, msg='{0} base roles already exist'.format(service_name))

    # deploy configuration - it always returns changed
    elif action_a == 'deploy_configuration':
      service_a = module.params.get('service', None)
      service_name = SERVICE_MAP[service_a]
      service = cluster.get_service(service_name)

      # deploying client configuration
      command = service.deploy_client_config()
      if command.wait().success == False:
        module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage))
      module.exit_json(changed=True, msg='Configuration deployed')
        
    # set config values for a given service/role
    elif action_a == 'set_config':
      entity_a = module.params.get('entity', None)
      service_a = module.params.get('service', None)
      role_a = module.params.get('role', None)
      name_a = module.params.get('name', None)
      value_a = module.params.get('value', None)

      if not service_a in SERVICE_MAP:
        module.fail_json(msg='Unknown service: {0}'.format(service_a))

      # since management is handled differently, it needs a different service
      if service_a == 'management':
        service = manager.get_service()
      elif service_a == 'cm':
        service = manager
      else:
        service = cluster.get_service(SERVICE_MAP[service_a])

      # role and service configs are handled differently
      if entity_a == 'service':
        prev_config = service.get_config()
        curr_config = service.update_config({name_a: value_a})
        if service_a == 'cm':
          prev_config = [prev_config]
          curr_config = [curr_config]
        module.exit_json(changed=(str(prev_config[0]) != str(curr_config[0])), msg='Config value for {0}: {1}'.format(name_a, curr_config[0][name_a]))

      elif entity_a == 'role':
        if not role_a in ROLE_MAP:
          module.fail_json(msg='Unknown role: {0}'.format(role_a))

        role = service.get_role_config_group(ROLE_MAP[role_a])
        prev_config = role.get_config()
        curr_config = role.update_config({name_a: value_a})
        module.exit_json(changed=(str(prev_config) != str(curr_config)), msg='Config value for {0}: {1}'.format(name_a, curr_config[name_a]))

      else:
        module.fail_json(msg='Invalid entity, must be one of service, role')

    # handle service state
    # currently this only can start/restart a service
    elif action_a == 'service':
      state_a = module.params.get('state', None)
      service_a = module.params.get('service', None)

      try:
        if service_a == 'cm':
          service = manager.get_service()
        else:
          service = cluster.get_service(SERVICE_MAP[service_a])
      except ApiException:
        module.fail_json(msg='Service does not exist')

      # when starting a service, we also deploy the client config for it
      if state_a == 'started':
        if service.serviceState == 'STARTED':
          module.exit_json(changed=False, msg='Service already running')
        method = service.start
        verb = "start"
      elif state_a == 'restarted':
        method = service.restart
        verb = "restart"

      try:
        command = service.deploy_client_config()
        if command.wait().success == False:
          module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage))
      # since there is no way to check if a service handles client config deployments
      # we try our best and pass the exception if it doesn't
      except (ApiException, AttributeError):
        pass

      method().wait()
      # we need to wait for cloudera checks to complete...
      # otherwise it will report as failing
      sleep(10)
      for i in range(24):
        sleep(10)
        service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a])
        if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
          break
      service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a])
      if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
        module.exit_json(changed=True, msg='Service {0} successful'.format(verb))
      else:
        module.fail_json(msg='Service {0} failed'.format(verb))

    # handle cluster
    # currently this only can restart
    elif action_a == 'cluster':
      state_a = module.params.get('state', None)

      if state_a == 'restarted':
        command = cluster.restart(redeploy_client_configuration=True)
        if command.wait().success == False:
          module.fail_json(msg='Cluster restart failed with {0}'.format(command.resultMessage))
        else:
          module.exit_json(changed=True, msg='Cluster restart successful')

    # Snapshot policy
    # only create is supported
    elif action_a == 'create_snapshot_policy':
      name_a = module.params.get('name', None)
      value_a = module.params.get('value', None)
      service_a = module.params.get('service', None)
      service = cluster.get_service(SERVICE_MAP[service_a])
      payload=loads(value_a)
      # checking if policy already exists. Exception is expected when configuring for the first time.
      try: 
        test = service.get_snapshot_policy(name_a)
        module.exit_json(changed=False, msg='Defined policy already exists')
      except ApiException:
        pass
      try:
        command = service.create_snapshot_policy(payload)
        module.exit_json(changed=True, msg='Snapshot policy was created.')
      except (ApiException, AttributeError):
        module.fail_json(msg='ERROR in creating snapshot policy.')
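
The 'service' action above polls serviceState and healthSummary in two nearly identical blocks; factored out, a bounded polling helper might look like the sketch below. The function name and parameters are illustrative, and it relies only on the same cluster/manager objects and attributes used above.

from time import sleep

def wait_until_started_and_healthy(get_service, attempts=24, interval=10):
    """Re-fetch a service until it is STARTED and healthy, or give up.

    get_service is a zero-argument callable, e.g.
    lambda: cluster.get_service('HDFS') or lambda: manager.get_service().
    """
    for _ in range(attempts):
        sleep(interval)
        service = get_service()
        if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
            return True
    return False

# if not wait_until_started_and_healthy(lambda: cluster.get_service(SERVICE_MAP[service_a])):
#     module.fail_json(msg='Service {0} failed'.format(verb))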
Example #7
def create_cluster(config_dict):
    config.read(['./conf/hadrian.ini','./conf/cluster_specs.ini', './conf/cloudera-manager/cm.ini'])
    
    
    cm_cluster_name = config_grabber("Globals")['cm.cluster.name']
    cm_username = config_grabber("Globals")['cm.username']
    cm_password = config_grabber("Globals")['cm.password']
    cm_port = config_grabber("Globals")['cm.port']
    version = config_grabber('Globals')['cdh.cluster.version']
    cm_server = config_grabber(cm_cluster_name + '-en')['cm.server']
    
    #Grab all configuration files in the directory with the CM Cluster Name.
    
    for i in os.listdir('./conf/' + cm_cluster_name):
        config.read('./conf/' + cm_cluster_name + '/' + i)
    
    all_nodes = list()

    while (get_cm_status(cm_server + ':' + cm_port) != 200):
        print 'Waiting for CM Server to start... '
        time.sleep(15)
    
    api = ApiResource(cm_server, cm_port, cm_username, cm_password)
    # create cluster
    cluster = api.create_cluster(cm_cluster_name, version.upper())
    
    #Config CM
    print 'Applying any configuration changes to Cloudera Manager'
    cmanager = api.get_cloudera_manager()
    cmanager.update_config(config_grabber('cloudera-manager-updates'))
        
    planned_nodes = config_grabber(cm_cluster_name + '-en')['full.list'].split(',')
    for k, v in config_grabber(cm_cluster_name + '-dn').iteritems():
        for j in v.split(','):
            planned_nodes.append(j)
    
    # TODO make this smarter.  show which agents haven't checked in.  Add the option to continue without them.
    if len(api.get_all_hosts()) != len(planned_nodes):
        print 'Waiting for all agents to check into the CM Server before continuing.'
        
        while len(planned_nodes) > len(api.get_all_hosts()):
            print 'Waiting for the final set of CM Agent nodes to check in.' 
            time.sleep(5)
        
    print 'Updating Rack configuration for data nodes.'
    all_hosts = list()
    for host in api.get_all_hosts():
        all_hosts.append(host.hostId)
        for k,v in config_grabber(cm_cluster_name + '-dn').iteritems():
            if host.hostname in v:
                print 'Setting host: ' + host.hostname + ' to rack /default/' + k
                host.set_rack_id('/default/' + k)
    
    print 'Adding all hosts to cluster.'
    cluster.add_hosts(all_hosts)

    # download CDH Parcels
    # TODO add some logic here to make the parcel list something that's read from the hadrian.ini
    # This will allow support for other CDH packages, Search, etc.
    if config_grabber('Globals')['cdh.distribution.method'] == 'parcels':
        distribute_parcel(cluster, 'CDH', config_grabber("Globals")['cdh.parcel.version'])
    
    if config_dict.get('hdfs_ha') == True:
        create_zookeeper_service(config_dict, cluster)
    create_hdfs_service(config_dict, cluster)    

    cmd = cluster.deploy_client_config()
    if not cmd.wait(CMD_TIMEOUT).success:
        print 'Failed to deploy client configurations'
    else:
        print 'Client configuration deployment complete.'

    create_mapred_service(config_dict, cluster, cm_server)
    if config_dict.get('hbase') == True:
        if config_dict.get('hdfs_ha') == False:
            create_zookeeper_service(config_dict, cluster)
        create_hbase_service(config_dict, cluster)
    if config_dict.get('hive') == True:
        create_hive_service(config_dict, cluster)
    print 'Starting final client configuration deployment for all services.'
    cmd = cluster.deploy_client_config()
    if not cmd.wait(CMD_TIMEOUT).success:
        print 'Failed to deploy client configuration.'
    else:
        print 'Client configuration deployment complete.  The cluster is all yours.  Happy Hadooping.'
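
get_cm_status() is referenced above but not shown. A minimal sketch of what such a helper might do is below, assuming it takes a 'host:port' string and returns the HTTP status code of the CM web UI (0 when CM is not reachable yet); the urllib2 usage matches the Python 2 style of these examples.

import urllib2

def get_cm_status(server_and_port):
    """Return the HTTP status of the Cloudera Manager UI, or 0 if unreachable."""
    try:
        return urllib2.urlopen('http://' + server_and_port, timeout=10).getcode()
    except (urllib2.HTTPError, urllib2.URLError):
        return 0

# while get_cm_status(cm_server + ':' + cm_port) != 200:
#     time.sleep(15)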
Example #8
def create_cluster(config_dict):
    config.read([
        './conf/hadrian.ini', './conf/cluster_specs.ini',
        './conf/cloudera-manager/cm.ini'
    ])

    cm_cluster_name = config_grabber("Globals")['cm.cluster.name']
    cm_username = config_grabber("Globals")['cm.username']
    cm_password = config_grabber("Globals")['cm.password']
    cm_port = config_grabber("Globals")['cm.port']
    version = config_grabber('Globals')['cdh.cluster.version']
    cm_server = config_grabber(cm_cluster_name + '-en')['cm.server']

    #Grab all configuration files in the directory with the CM Cluster Name.

    for i in os.listdir('./conf/' + cm_cluster_name):
        config.read('./conf/' + cm_cluster_name + '/' + i)

    all_nodes = list()

    while (get_cm_status(cm_server + ':' + cm_port) != 200):
        print 'Waiting for CM Server to start... '
        time.sleep(15)

    api = ApiResource(cm_server, cm_port, cm_username, cm_password)
    # create cluster
    cluster = api.create_cluster(cm_cluster_name, version.upper())

    #Config CM
    print 'Applying any configuration changes to Cloudera Manager'
    cmanager = api.get_cloudera_manager()
    cmanager.update_config(config_grabber('cloudera-manager-updates'))

    planned_nodes = config_grabber(cm_cluster_name +
                                   '-en')['full.list'].split(',')
    for k, v in config_grabber(cm_cluster_name + '-dn').iteritems():
        for j in v.split(','):
            planned_nodes.append(j)

    # TODO make this smarter.  show which agents haven't checked in.  Add the option to continue without them.
    if len(api.get_all_hosts()) != len(planned_nodes):
        print 'Waiting for all agents to check into the CM Server before continuing.'

        while len(planned_nodes) > len(api.get_all_hosts()):
            print 'Waiting for the final set of CM Agent nodes to check in.'
            time.sleep(5)

    print 'Updating Rack configuration for data nodes.'
    all_hosts = list()
    for host in api.get_all_hosts():
        all_hosts.append(host.hostId)
        for k, v in config_grabber(cm_cluster_name + '-dn').iteritems():
            if host.hostname in v:
                print 'Setting host: ' + host.hostname + ' to rack /default/' + k
                host.set_rack_id('/default/' + k)

    print 'Adding all hosts to cluster.'
    cluster.add_hosts(all_hosts)

    # download CDH Parcels
    # TODO add some logic here to make the parcel list something that's read from the hadrian.ini
    # This will allow support for other CDH packages, Search, etc.
    if config_grabber('Globals')['cdh.distribution.method'] == 'parcels':
        distribute_parcel(cluster, 'CDH',
                          config_grabber("Globals")['cdh.parcel.version'])

    if config_dict.get('hdfs_ha') == True:
        create_zookeeper_service(config_dict, cluster)
    create_hdfs_service(config_dict, cluster)

    cmd = cluster.deploy_client_config()
    if not cmd.wait(CMD_TIMEOUT).success:
        print 'Failed to deploy client configurations'
    else:
        print 'Client configuration deployment complete.'

    create_mapred_service(config_dict, cluster, cm_server)
    if config_dict.get('hbase') == True:
        if config_dict.get('hdfs_ha') == False:
            create_zookeeper_service(config_dict, cluster)
        create_hbase_service(config_dict, cluster)
    if config_dict.get('hive') == True:
        create_hive_service(config_dict, cluster)
    print 'Starting final client configuration deployment for all services.'
    cmd = cluster.deploy_client_config()
    if not cmd.wait(CMD_TIMEOUT).success:
        print 'Failed to deploy client configuration.'
    else:
        print 'Client configuration deployment complete.  The cluster is all yours.  Happy Hadooping.'
Example #9
def main():
  module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'}) for argument in MODULE_ARGUMENTS))

  api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS, version=10)
  cluster_name = CLUSTER_NAME

  manager = api.get_cloudera_manager()

  action_a = module.params.get('action', None)

  if action_a == 'create_cluster':
    license_a = module.params.get('license', None)
    version_a = module.params.get('version', None)

    cluster_list = [x.name for x in api.get_all_clusters()]
    if cluster_name in cluster_list:
      module.exit_json(changed=False, msg='Cluster exists')
    else:
      cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a)
      if license_a == None:
        manager.begin_trial()
      else:
        manager.update_license(license_a.decode('base64'))
      module.exit_json(changed=True, msg='Cluster created')
  elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel', 'deploy_hdfs_base', 'deploy_hdfs_httpfs', 'deploy_hdfs_dn', 'deploy_hdfs_ha', 'deploy_rm_ha', 'set_config', 'service', 'deploy_service', 'deploy_service_worker_nodes', 'deploy_base_roles', 'run_command', 'cluster','create_snapshot_policy']:
    # more complicated actions that need a created cluster go here
    cluster = api.get_cluster(cluster_name)
    host_map = dict((api.get_host(x.hostId).hostname, x.hostId) for x in cluster.list_hosts())

    # adds a host to the cluster
    # host_name should be in the internal DNS format, ip-xx-xx-xx.compute.internal
    if action_a == 'add_host':
      host_a = module.params.get('host', None)

      host_list = host_map.keys()
      if host_a in host_list:
        module.exit_json(changed=False, msg='Host already in cluster')
      else:
        try:
          cluster.add_hosts([host_a])
        except ApiException:
          # if a host isn't there, it could be because the agent didn't manage to connect yet
          # so let's wait a moment for it
          sleep(120)
          cluster.add_hosts([host_a])

        module.exit_json(changed=True, msg='Host added')

    # create management service and set its basic configuration
    # this needs a separate function since management is handled
    # differently than the rest of services
    elif action_a == 'create_mgmt':
      host_a = module.params.get('host', None)

      # getting the management service is the only way to check if mgmt exists
      # an exception means there isn't one
      try:
        mgmt = manager.get_service()
        module.exit_json(changed=False, msg='Mgmt service already exists')
      except ApiException:
        pass

      mgmt = manager.create_mgmt_service(ApiServiceSetupInfo())

      # this is ugly... and I see no good way to unuglify it
      firehose_passwd = Popen("sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n")
      reports_passwd = Popen("sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n")

      # since there is no easy way of configuring the manager... let's do it here :(
      role_conf = defaultdict(dict)
      role_conf['ACTIVITYMONITOR'] = {
          'firehose_database_host': '{0}:7432'.format(host_a),
          'firehose_database_user': '******',
          'firehose_database_password': firehose_passwd,
          'firehose_database_type': 'postgresql',
          'firehose_database_name': 'amon',
          'firehose_heapsize': '268435456',
      }
      role_conf['EVENTSERVER'] = {
          'event_server_heapsize': '215964392'
      }
      role_conf['REPORTSMANAGER'] = {
          'headlamp_database_host': '{0}:7432'.format(host_a),
          'headlamp_database_user': '******',
          'headlamp_database_password': reports_passwd,
          'headlamp_database_type': 'postgresql',
          'headlamp_database_name': 'rman',
          'headlamp_heapsize': '215964392',
      }

      roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER', 'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER']
      # create management roles
      for role in roles:
        mgmt.create_role('{0}-1'.format(role), role, host_map[host_a])

      # update configuration of each
      for group in mgmt.get_all_role_config_groups():
        group.update_config(role_conf[group.roleType])

      mgmt.start().wait()
      # after starting this service needs time to spin up
      sleep(30)
      module.exit_json(changed=True, msg='Mgmt created and started')

    # deploy a given parcel on all hosts in the cluster
    # you can specify a substring of the version ending with latest, for example 5.3-latest instead of 5.3.5-1.cdh5.3.5.p0.4
    elif action_a == 'deploy_parcel':
      name_a = module.params.get('name', None)
      version_a = module.params.get('version', None)

      if "latest" in version_a:
        available_versions = [x.version for x in cluster.get_all_parcels() if x.product == name_a]
        if "-latest" in version_a:
          version_substr = match('(.+?)-latest', version_a).group(1)
        # if version is just "latest", try to check everything
        else:
          version_substr = ".*"
        try:
          [version_parcel] = [x for x in available_versions if re.match(version_substr, x) != None]
        except ValueError:
          module.fail_json(msg='Specified version {0} does not appear exactly once in {1}'.format(version_substr, available_versions))
      else:
        version_parcel = version_a

      # we now go through various stages of getting the parcel
      # as there is no built-in way of waiting for an operation to complete
      # we use loops with sleep to get it done
      parcel = cluster.get_parcel(name_a, version_parcel)
      if parcel.stage == 'AVAILABLE_REMOTELY':
        parcel.start_download()

        while parcel.stage != 'DOWNLOADED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
          sleep(10)

      if parcel.stage == 'DOWNLOADED':
        parcel.start_distribution()

        while parcel.stage != 'DISTRIBUTED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
          # sleep while hosts report problems after the download
          for i in range(12):
            sleep(10)
            if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0:
              break

      # since parcels are distributed automatically when a new host is added to a cluster
      # we can encounter the 'ACTIVATING' stage then
      if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING':
        if parcel.stage == 'DISTRIBUTED':
          parcel.activate()

        while parcel.stage != 'ACTIVATED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          # this sleep has to be large because although the operation is very fast
          # it makes the management and cloudera hosts go bonkers, failing all of the health checks
          sleep(10)

        # sleep while hosts report problems after the distribution
        for i in range(60):
          sleep(10)
          if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0:
            break

        module.exit_json(changed=True, msg='Parcel activated')

      if parcel.stage == 'ACTIVATED':
        module.exit_json(changed=False, msg='Parcel already activated')

      # if we get down here, something is not right
      module.fail_json(msg='Invalid parcel state')

    # deploy nodes for workers, according to SERVICE_WORKER_MAP
    # also give them sane names and init zookeeper and kafka ones
    # which need IDs specified
    elif action_a == 'deploy_service_worker_nodes':
      host_a = module.params.get('host', None)
      service_a = module.params.get('service', None)

      service_name = SERVICE_MAP[service_a]
      role_name = SERVICE_WORKER_MAP[service_a]['name']
      full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring']

      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
      else:
        service = cluster.get_service(service_name)

      nodes = [x for x in service.get_all_roles() if role_name in x.name]

      # if host already has the given group, we should skip it
      if host_map[host_a] in [x.hostRef.hostId for x in nodes]:
        module.exit_json(changed=False, msg='Host already is a {0}'.format(role_name))
      # find out the highest id that currently exists
      else:
        node_names = [x.name for x in nodes]
        if len(node_names) == 0:
          # if no nodes, start numbering from 1
          node_i = 1
        else:
          # take the max number and add 1 to it
          node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1

        if service_name == 'ZOOKEEPER':
          role = service.create_role(full_role_name.format(node_i), 'SERVER', host_a)
          # zookeeper needs a per-node ID in the configuration, so we set it now
          role.update_config({'serverId': node_i})
        elif service_name == 'KAFKA':
          role = service.create_role(full_role_name.format(node_i), role_name, host_a)
          # kafka needs a per-node ID in the configuration, so we set it now
          role.update_config({'broker.id': node_i})
        else:
          service.create_role(full_role_name.format(node_i), role_name, host_a)

        module.exit_json(changed=True, msg='Added host to {0} role'.format(role_name))

    # deploy a service. just create it, don't do anything more
    # this is needed mainly when we have to set service properties before role deployment
    elif action_a == 'deploy_service':
      name_a = module.params.get('name', None)

      if not name_a in SERVICE_MAP:
        module.fail_json(msg='Unknown service: {0}'.format(name_a))
      service_name = SERVICE_MAP[name_a]
      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
        module.exit_json(changed=True, msg='{0} service created'.format(service_name))
      else:
        module.exit_json(changed=False, msg='{0} service already exists'.format(service_name))

    # deploy the base hdfs roles (the namenode and secondary)
    # this doesn't create the service, as at least one datanode should already be added!
    # the format also requires certain properties to be set before we run it
    elif action_a == 'deploy_hdfs_base':
      nn_host_a = module.params.get('nn_host', None)
      sn_host_a = module.params.get('sn_host', None)

      changed = False

      hdfs = cluster.get_service('HDFS')
      hdfs_roles = [x.name for x in hdfs.get_all_roles()]

      # don't create a secondary namenode when:
      #- there is one that already exists
      #- there is a second namenode, which means we have HA and don't need a secondary
      if not 'HDFS-SECONDARYNAMENODE' in hdfs_roles and not 'HDFS-NAMENODE-2' in hdfs_roles:
        hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE', sn_host_a)
        changed = True

      # create a namenode and format its FS
      # formatting the namenode requires at least one datanode and secondary namenode already in the cluster!
      if not 'HDFS-NAMENODE' in hdfs_roles:
        hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a)
        for command in hdfs.format_hdfs('HDFS-NAMENODE'):
          if command.wait().success == False:
            module.fail_json(msg='Failed formatting HDFS namenode with error: {0}'.format(command.resultMessage))
        changed = True

      module.exit_json(changed=changed, msg='Created HDFS service & NN roles')

    # enable HttpFS for HDFS
    # Hue requires this to support HA in HDFS
    elif action_a == 'deploy_hdfs_httpfs':
      host_a = module.params.get('host', None)
      
      hdfs = cluster.get_service('HDFS')
      hdfs_roles = [x.name for x in hdfs.get_all_roles()]
      
      # don't install second instance of HttpFS
      if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0:
        module.exit_json(changed=False, msg='HDFS HttpFS service already exists')
       
      hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a]) 
        
      module.exit_json(changed=True, msg='HDFS HttpFS service created')
      
    # enable HA for HDFS
    # this deletes the secondary namenode and creates a second namenode in its place
    # also, this spawns 3 journal node and 2 failover controller roles
    elif action_a == 'deploy_hdfs_ha':
      sn_host_a = module.params.get('sn_host', None)
      jn_names_a = [module.params.get('jn1_host', None), module.params.get('jn2_host', None), module.params.get('jn3_host', None)]

      hdfs = cluster.get_service('HDFS')

      # if there's a second namenode, this means we already have HA enabled
      if not 'HDFS-NAMENODE-2' in [x.name for x in hdfs.get_all_roles()]:
        # this is bad and I should feel bad
        # jns is a list of dictionaries, each dict passes the required journalnode parameters
        jns = [{'jnHostId': host_map[jn_name], 'jnEditsDir': '/data0/hadoop/journal', 'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)} for i, jn_name in enumerate(jn_names_a)]

        # this call is so long because we set some predictable names for the services
        command = hdfs.enable_nn_ha('HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns, zk_service_name='ZOOKEEPER',
                                    active_fc_name='HDFS-FAILOVERCONTROLLER-1', standby_fc_name='HDFS-FAILOVERCONTROLLER-2', standby_name='HDFS-NAMENODE-2')

        children = command.wait().children
        for command_children in children:
          # The format command is expected to fail, since we already formatted the namenode
          if command_children.name != 'Format' and command_children.success == False:
            module.fail_json(msg='Command {0} failed when enabling HDFS HA with error {1}'.format(command_children.name, command_children.resultMessage))
        module.exit_json(changed=True, msg='Enabled HA for HDFS service')
      else:
        module.exit_json(changed=False, msg='HDFS HA already enabled')
    # enable HA for YARN
    elif action_a == 'deploy_rm_ha':
      sn_host_a = module.params.get('sn_host', None)

      yarn = cluster.get_service('YARN')

      # if there is exactly one ResourceManager role, HA for YARN hasn't been enabled yet
      if len([0 for x in yarn.get_all_roles() if match('^YARN-RESOURCEMANAGER.*$', x.name) != None]) == 1:
        command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER')
        children = command.wait().children
        for command_children in children:
          if command_children.success == False:
            module.fail_json(msg='Command {0} failed when enabling YARN HA with error {1}'.format(command_children.name, command_children.resultMessage))
        module.exit_json(changed=True, msg='Enabled HA for YARN service')
      else:
        module.exit_json(changed=False, msg='YARN HA already enabled')

    # deploy the base roles for a service, according to BASE_SERVICE_ROLE_MAP
    # after the deployments run commands specified in BASE_SERVICE_ROLE_MAP
    elif action_a == 'deploy_base_roles':
      host_a = module.params.get('host', None)
      service_a = module.params.get('service', None)

      service_name = SERVICE_MAP[service_a]
      changed = False

      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
      else:
        service = cluster.get_service(service_name)

      service_roles = [x.name for x in service.get_all_roles()]

      # create each service from the map
      for (role_name, cloudera_name) in BASE_SERVICE_ROLE_MAP[service_a].items():
        # check if role already exists; the script can't compare it directly
        # after enabling HA on YARN roles will have random strings in names
        if len([0 for x in service_roles if match(role_name, x) != None]) == 0:
          service.create_role(role_name, cloudera_name, host_a)
          changed = True

          # init commands
          if role_name in SERVICE_INIT_COMMANDS.keys():
            for command_to_run in SERVICE_INIT_COMMANDS[role_name]:
              # different handling of commands specified by name and
              # ones specified by an instance method
              if ismethod(command_to_run):
                command = command_to_run(service)
              else:
                command = service.service_command_by_name(command_to_run)

              if command.wait().success == False:
                module.fail_json(msg='Running {0} failed with {1}'.format(command_to_run, command.resultMessage))

      if changed == True:
        module.exit_json(changed=True, msg='Created base roles for {0}'.format(service_name))
      else:
        module.exit_json(changed=False, msg='{0} base roles already exist'.format(service_name))

    # set config values for a given service/role
    elif action_a == 'set_config':
      entity_a = module.params.get('entity', None)
      service_a = module.params.get('service', None)
      role_a = module.params.get('role', None)
      name_a = module.params.get('name', None)
      value_a = module.params.get('value', None)

      if not service_a in SERVICE_MAP:
        module.fail_json(msg='Unknown service: {0}'.format(service_a))

      # since management is handled differently, it needs a different service
      if service_a == 'management':
        service = manager.get_service()
      elif service_a == 'cm':
        service = manager
      else:
        service = cluster.get_service(SERVICE_MAP[service_a])

      # role and service configs are handled differently
      if entity_a == 'service':
        prev_config = service.get_config()
        curr_config = service.update_config({name_a: value_a})
        if service_a == 'cm':
          prev_config = [prev_config]
          curr_config = [curr_config]
        module.exit_json(changed=(str(prev_config[0]) != str(curr_config[0])), msg='Config value for {0}: {1}'.format(name_a, curr_config[0][name_a]))

      elif entity_a == 'role':
        if not role_a in ROLE_MAP:
          module.fail_json(msg='Unknown role: {0}'.format(role_a))

        role = service.get_role_config_group(ROLE_MAP[role_a])
        prev_config = role.get_config()
        curr_config = role.update_config({name_a: value_a})
        module.exit_json(changed=(str(prev_config) != str(curr_config)), msg='Config value for {0}: {1}'.format(name_a, curr_config[name_a]))

      else:
        module.fail_json(msg='Invalid entity, must be one of service, role')

    # handle service state
    # currently this can only start/restart a service
    elif action_a == 'service':
      state_a = module.params.get('state', None)
      service_a = module.params.get('service', None)

      try:
        if service_a == 'cm':
          service = manager.get_service()
        else:
          service = cluster.get_service(SERVICE_MAP[service_a])
      except ApiException:
        module.fail_json(msg='Service does not exist')

      # when starting a service, we also deploy the client config for it
      if state_a == 'started':
        if service.serviceState == 'STARTED':
          module.exit_json(changed=False, msg='Service already running')
        method = service.start
        verb = "start"
      elif state_a == 'restarted':
        method = service.restart
        verb = "restart"

      try:
        command = service.deploy_client_config()
        if command.wait().success == False:
          module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage))
      # since there is no way to check if a service handles client config deployments
      # we try our best and pass the exception if it doesn't
      except (ApiException, AttributeError):
        pass

      method().wait()
      # we need to wait for cloudera checks to complete...
      # otherwise it will report as failing
      sleep(10)
      for i in range(24):
        sleep(10)
        service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a])
        if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
          break
      service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a])
      if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
        module.exit_json(changed=True, msg='Service {0} successful'.format(verb))
      else:
        module.fail_json(msg='Service {0} failed'.format(verb))

    # handle cluster
    # currently this can only restart
    elif action_a == 'cluster':
      state_a = module.params.get('state', None)

      if state_a == 'restarted':
        command = cluster.restart(redeploy_client_configuration=True)
        if command.wait().success == False:
          module.fail_json(msg='Cluster restart failed with {0}'.format(command.resultMessage))
        else:
          module.exit_json(changed=True, msg='Cluster restart successful')

    # Snapshot policy
    # only create is supported
    elif action_a == 'create_snapshot_policy':
      name_a = module.params.get('name', None)
      value_a = module.params.get('value', None)
      service_a = module.params.get('service', None)
      service = cluster.get_service(SERVICE_MAP[service_a])
      payload=loads(value_a)
      # check if the policy already exists; an exception is expected when configuring for the first time
      try: 
        test = service.get_snapshot_policy(name_a)
        module.exit_json(changed=False, msg='Defined policy already exists')
      except ApiException:
        pass
      try:
        command = service.create_snapshot_policy(payload)
        module.exit_json(changed=True, msg='Snapshot policy was created.')
      except (ApiException, AttributeError):
        module.fail_json(msg='ERROR in creating snapshot policy.')
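
The deploy_parcel action above resolves version strings such as 5.3-latest against the parcel versions Cloudera Manager reports. Pulled out on its own, the matching logic looks roughly like this (a sketch; resolve_parcel_version is a hypothetical name and available_versions stands in for the cluster.get_all_parcels() lookup):

import re

def resolve_parcel_version(requested, available_versions):
    # mirror of the '-latest' handling in the module above
    if 'latest' not in requested:
        return requested
    if '-latest' in requested:
        prefix = re.match('(.+?)-latest', requested).group(1)
    else:
        # plain 'latest' matches anything
        prefix = '.*'
    matches = [v for v in available_versions if re.match(prefix, v) is not None]
    if len(matches) != 1:
        raise ValueError('expected exactly one match, got {0}'.format(matches))
    return matches[0]

# resolve_parcel_version('5.3-latest', ['5.3.5-1.cdh5.3.5.p0.4', '5.4.0-1.cdh5.4.0.p0.27'])
# returns '5.3.5-1.cdh5.3.5.p0.4'
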
Ejemplo n.º 10
0
cmd = manager.host_install(host_username,
                           cluster_hosts,
                           private_key=private_key_contents)
print "checking if host_install finished"
while cmd.active == True:
    sleep(5)
    print " ..."
    cmd = cmd.fetch()

if cmd.success != True:
    print "host_install failed: " + cmd.resultMessage
    exit(1)
print "host_install successful: " + cmd.resultMessage

cluster = api.create_cluster(cluster_name, cdh_version)

all_hosts = api.get_all_hosts()

hostrefs = []
yarn_nodemanager_hostrefs = []
hdfs_datanode_hostrefs = []
zookeeper_server_hostrefs = []
hive_gateway_hostrefs = []

for host in all_hosts:
    if host.hostname == cm_management_host:
        cm_management_host_hostref = host.hostId
    if host.hostname == yarn_resourcemanager:
        yarn_resourcemanager_hostref = host.hostId
    if host.hostname == yarn_jobhistory:
Ejemplo n.º 11
0
from cm_api.api_client import ApiResource
import sys, time

api = ApiResource(sys.argv[1], 7180, "acm", "SCALE42secretly", version=15)
expected_hosts = sys.argv[2:]
expected_hosts_count = len(expected_hosts)

cluster = None

try:
    cluster = api.get_cluster(name="ACM Cluster")
except Exception, e:
    if e.message[-10:-1].lower() == "not found":
        #CLUSTER YET NOT CREATED -> create it
        cluster = api.create_cluster(name="ACM Cluster", version="CDH5")

        while True:
            #make sure every commissioned host is part of the cluster !!!
            print "Waiting for the <{0}> worker nodes of the cluster to register their SCM agents...".format(
                expected_hosts_count)
            hosts = api.get_all_hosts()
            #get the IP address of every commissioned host
            actual_host_ips = list(set([str(h.ipAddress) for h in hosts]))
            verify_hosts = [eh in actual_host_ips for eh in expected_hosts]
            if all(verify_hosts):
                current_host_ids = [h.hostId for h in hosts]
                #ADDING HOSTS to the cluster
                cluster.add_hosts(current_host_ids)
                break
            time.sleep(10)
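
The cluster-exists check above relies on parsing the exception message (e.message[-10:-1]), which is brittle across API versions. A more direct sketch against the same client is to look the cluster up by name:

def acm_cluster_exists(api, name="ACM Cluster"):
    # sketch: list clusters instead of parsing the 'not found' message
    return any(c.name == name for c in api.get_all_clusters())

if not acm_cluster_exists(api):
    cluster = api.create_cluster(name="ACM Cluster", version="CDH5")
else:
    cluster = api.get_cluster(name="ACM Cluster")
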
Ejemplo n.º 12
0
class ClouderaManager(object):
    """
    The complete orchestration of a cluster from start to finish, assuming all the hosts are
    configured and Cloudera Manager is installed with all the required databases set up.

    Handle all the steps required in creating a cluster. All the functions are built to behave
    idempotently, so you should be able to resume from any failed step by re-running
    __class__.setup()
    """

    def __init__(self, module, config, trial=False, license_txt=None):
        self.api = ApiResource(config['cm']['host'], username=config['cm']['username'],
                               password=config['cm']['password'])
        self.manager = self.api.get_cloudera_manager()
        self.config = config
        self.module = module
        self.trial = trial
        self.license_txt = license_txt
        self.cluster = None

    def enable_license(self):
        """
        Enable the requested license: either trial mode is started or a full license is
        registered.
        """
        try:
            _license = self.manager.get_license()
        except ApiException:
            print_json(type="LICENSE", msg="Enabling license")
            if self.trial:
                self.manager.begin_trial()
            else:
                if self.license_txt is not None:
                    self.manager.update_license(self.license_txt)
                else:
                    fail(self.module, 'License should be provided or trial should be specified')

            try:
                _license = self.manager.get_license()
            except ApiException:
                fail(self.module, 'Failed enabling license')
        print_json(type="LICENSE",
                   msg="Owner: {}, UUID: {}".format(_license.owner, _license.uuid))

    def create_cluster(self):
        """
        Create a cluster and add hosts to the cluster. A new cluster is only created
        if another one doesn't exist with the same name.
        """
        print_json(type="CLUSTER", msg="Creating cluster")
        cluster_config = self.config['cluster']
        try:
            self.cluster = self.api.get_cluster(cluster_config['name'])
        except ApiException:
            print_json(type="CLUSTER",
                       msg="Creating Cluster entity: {}".format(cluster_config['name']))
            self.cluster = self.api.create_cluster(cluster_config['name'],
                                                   cluster_config['version'],
                                                   cluster_config['fullVersion'])

        cluster_hosts = [self.api.get_host(host.hostId).hostname
                         for host in self.cluster.list_hosts()]
        hosts = []
        for host in cluster_config['hosts']:
            if host not in cluster_hosts:
                hosts.append(host)
        self.cluster.add_hosts(hosts)

    def activate_parcels(self):
        print_json(type="PARCELS", msg="Setting up parcels")
        for parcel_cfg in self.config['parcels']:
            parcel = Parcels(self.module, self.manager, self.cluster,
                             parcel_cfg.get('version'), parcel_cfg.get('repo'),
                             parcel_cfg.get('product', 'CDH'))
            parcel.download()
            parcel.distribute()
            parcel.activate()

    @retry(attempts=20, delay=5)
    def wait_inspect_hosts(self, cmd):
        """
        Inspect all the hosts. Basically wait till the check completes on all hosts.

        :param cmd: A command instance used for tracking the status of the command
        """
        print_json(type="HOSTS", msg="Inspecting hosts")
        cmd = cmd.fetch()
        if cmd.success is None:
            raise ApiException("Waiting on command {} to finish".format(cmd))
        elif not cmd.success:
            if (cmd.resultMessage is not None and
                    'is not currently available for execution' in cmd.resultMessage):
                raise ApiException('Retry Command')
            fail(self.module, 'Host inspection failed')
        print_json(type="HOSTS", msg="Host inspection completed: {}".format(cmd.resultMessage))

    def deploy_mgmt_services(self):
        """
        Configure, deploy and start all the Cloudera Management Services.
        """
        print_json(type="MGMT", msg="Deploying Management Services")
        try:
            mgmt = self.manager.get_service()
            if mgmt.serviceState == 'STARTED':
                return
        except ApiException:
            print_json(type="MGMT", msg="Management Services don't exist. Creating.")
            mgmt = self.manager.create_mgmt_service(ApiServiceSetupInfo())

        for role in self.config['services']['MGMT']['roles']:
            if not len(mgmt.get_roles_by_type(role['group'])) > 0:
                print_json(type="MGMT", msg="Creating role for {}".format(role['group']))
                mgmt.create_role('{}-1'.format(role['group']), role['group'], role['hosts'][0])

        for role in self.config['services']['MGMT']['roles']:
            role_group = mgmt.get_role_config_group('mgmt-{}-BASE'.format(role['group']))
            role_group.update_config(role.get('config', {}))

        mgmt.start().wait()
        if self.manager.get_service().serviceState == 'STARTED':
            print_json(type="MGMT", msg="Management Services started")
        else:
            fail(self.module, "[MGMT] Cloudera Management services didn't start up properly")

    def service_orchestrate(self, services):
        """
        Create and pre-configure the provided list of services,
        start those services,
        and perform the post-startup actions for each service

        :param services: List of Services to perform service specific actions
        """
        service_classes = []

        # Create and pre-configure provided services
        for service in services:
            service_config = self.config['services'].get(service.upper())
            if service_config:
                svc = getattr(sys.modules[__name__], service)(self.cluster, service_config)
                if not svc.started:
                    svc.deploy()
                    svc.pre_start()
                service_classes.append(svc)

        print_json(type="CLUSTER", msg="Starting services: {} on Cluster".format(services))

        # Deploy all the client configs, since some of the services depend on other services
        # and it is essential that the client configs are in place
        self.cluster.deploy_client_config()

        # Start each service and run the post_start actions for each service
        for svc in service_classes:
            # Only go through the steps if the service is not yet started. This helps with
            # re-running the script after fixing errors
            if not svc.started:
                svc.start()
                svc.post_start()

    def setup(self):
        # TODO(rnirmal): Cloudera Manager SSL?

        # Enable a full license or start a trial
        self.enable_license()

        # Create the cluster entity and associate hosts
        self.create_cluster()

        # Download and activate the parcels
        self.activate_parcels()

        # Inspect all the hosts
        self.wait_inspect_hosts(self.manager.inspect_hosts())

        # Create Management services
        self.deploy_mgmt_services()

        # Configure and Start base services
        self.service_orchestrate(BASE_SERVICES)

        # Configure and Start remaining services
        self.service_orchestrate(ADDITIONAL_SERVICES)
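
For reference, a hypothetical driver for the class above. The shape of config is inferred from the attribute accesses in the code (config['cm'], config['cluster'], config['parcels'], config['services']) rather than from any documented schema, and module is the surrounding AnsibleModule instance that fail() and the service classes expect:

config = {
    'cm': {'host': 'cm.example.com', 'username': 'admin', 'password': 'admin'},
    'cluster': {'name': 'cluster1', 'version': 'CDH5', 'fullVersion': '5.8.0',
                'hosts': ['worker-1.example.com', 'worker-2.example.com']},
    'parcels': [{'product': 'CDH', 'version': '5.8.0-1.cdh5.8.0.p0.42', 'repo': None}],
    'services': {'MGMT': {'roles': []}},  # plus one entry per service to deploy
}

cm = ClouderaManager(module, config, trial=True)
cm.setup()
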
cm_host = "cloudera-pe-cm01"
api = ApiResource(cm_host, username="******", password="******")

# Distribute the CDH parcel

parcel_repo = 'http://archive.cloudera.com/cdh5/parcels/5.2.0'
#parcel_repo = 'http://archive.cloudera.com/cdh5/parcels/5.1.3/'
cm_config = api.get_cloudera_manager().get_config(view='full')
repo_config = cm_config['REMOTE_PARCEL_REPO_URLS']
value = repo_config.value or repo_config.default
value += ',' + parcel_repo
api.get_cloudera_manager().update_config({'REMOTE_PARCEL_REPO_URLS': value})
time.sleep(10)

# create cluster, add the hosts
cluster = api.create_cluster("cloudera-pe-test", "CDH5")
#api.create_host("master", "ip-10-238-154-140", "10.238.154.140")
#api.create_host("w01", "ip-10-143-183-98", "10.143.183.98")
#api.create_host("w02", "ip-10-140-38-88", "10.140.38.88")
#api.create_host("w03", "ip-10-140-28-243", "10.140.28.243")
#hosts.append("master")
#hosts.append("w01")
#hosts.append("w02")
#hosts.append("w03")
hosts.append("ip-10-11-167-80")
hosts.append("ip-10-153-224-197")
hosts.append("ip-10-37-166-245")
hosts.append("ip-10-169-69-118")
cluster.add_hosts(hosts)

# Downloads and distributes parcels
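
Re-running the repo update above appends parcel_repo to REMOTE_PARCEL_REPO_URLS on every run. A small guard (a sketch reusing the same api and parcel_repo variables) keeps it idempotent:

cm = api.get_cloudera_manager()
repo_config = cm.get_config(view='full')['REMOTE_PARCEL_REPO_URLS']
urls = (repo_config.value or repo_config.default).split(',')
if parcel_repo not in urls:
    # only write the config back when the repo is actually missing
    cm.update_config({'REMOTE_PARCEL_REPO_URLS': ','.join(urls + [parcel_repo])})
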
Ejemplo n.º 14
0
def main(argv):
	CM_HOST         = "localhost"
	CM_PORT         = 7180
	CM_USER         = "******"
	CM_PASSWD       = "admin"
	CMD_TIMEOUT = 180
	
	#Configurations
	HDFS_CONF = {
	}
	
	NAMENODE_CONF = {
	        'dfs_name_dir_list': '/dfs/nn',
	        'dfs_namenode_servicerpc_address': 8022,
		'namenode_java_heapsize': 154140672,
	}
	
	SECONDARY_CONF = {
	        'fs_checkpoint_dir_list': '/dfs/snn',
		'secondary_namenode_java_heapsize': 154140672,
	}
	
	DATANODE_CONF = {
	        'dfs_data_dir_list': '/data/1/dfs/dn,/data/2/dfs/dn,/data/3/dfs/dn',
	        'dfs_datanode_handler_count': 10,
	        'dfs_datanode_du_reserved': 2180395417,
	        'dfs_datanode_max_locked_memory': 983564288,
	        'datanode_java_heapsize': 286261248,
	}
	
	YARN_CONF = {
		'hdfs_service':'hdfs',
	}

	RSRCMAN_CONF = {
		'resource_manager_java_heapsize': 154140672,
		'yarn_scheduler_maximum_allocation_mb': 1513,
		'yarn_scheduler_maximum_allocation_vcores': 2,
	}

	JOBHIST_CONF = {
		'mr2_jobhistory_java_heapsize': 154140672,
	}

	NODEMAN_CONF = {
		'yarn_nodemanager_local_dirs': '/yarn/nm',
		'yarn_nodemanager_resource_cpu_vcores': 2,
		'yarn_nodemanager_resource_memory_mb': 1513,
	}

	
	
	#Parser Options
	parser = OptionParser()
	parser.set_defaults(action='')
	parser.add_option("-a", "--add", action="store_const", const="add", dest="action", help="add the list of hosts to the named cluster")
	parser.add_option("-r", "--remove", action="store_const", const="remove", dest="action", help="remove the list of hosts from the named cluster")
	parser.add_option("-d", "--deploy", action="store_const", const="deploy", dest="action", help="deploy the list of hosts as a new cluster with the given name")
	parser.add_option("--delete", action="store_const", const="delete", dest="action", help="delete the named cluster")
	parser.add_option("--name", dest="name", help="declare the cluster name to be created or to interact with")
	parser.add_option("--hosts", dest="hosts", help="comma delimited list of hosts to be added/removed")
	
	
	(opts,args) = parser.parse_args()
	CLUSTER_NAME = opts.name
	if opts.hosts and len(opts.hosts) > 1:
		HOSTNAMES = opts.hosts.split(",")
	elif opts.hosts:
		HOSTNAMES = opts.hosts
	else:
		HOSTNAMES = ''
	
	ROLEHASH = []
	if HOSTNAMES:
		for host in HOSTNAMES:
			ROLEHASH.append(hashlib.md5(host).hexdigest())

	api = ApiResource(CM_HOST, CM_PORT, CM_USER, CM_PASSWD)
	
	
	#Deploy a new cluster
	if(opts.action == "deploy"):	
		#Create Cluster
		print "Creating cluster..."
		cluster = api.create_cluster(CLUSTER_NAME, "CDH5")
		cluster.add_hosts(HOSTNAMES)

		#Create HDFS Service and Roles
		print "Creating HDFS Service and Roles..."
		hdfs = cluster.create_service("hdfs", "HDFS")


		namenode        = hdfs.create_role("hdfs-NAMENODE-" + ROLEHASH[0], "NAMENODE", HOSTNAMES[0])
		secnamenode     = hdfs.create_role("hdfs-SECONDARYNAMENODE-" + ROLEHASH[0], "SECONDARYNAMENODE", HOSTNAMES[0])
		for i in range(len(HOSTNAMES)-1):
			datanode = hdfs.create_role("hdfs-DATANODE-" + ROLEHASH[i+1], "DATANODE", HOSTNAMES[i+1])
		
		#Configure HDFS
		print "Configuring HDFS..."
		hdfs.update_config(svc_config = HDFS_CONF)

		for roleGroup in hdfs.get_all_role_config_groups():
			if roleGroup.roleType == "NAMENODE":
				roleGroup.update_config(NAMENODE_CONF)
			elif roleGroup.roleType == "SECONDARYNAMENODE":
				roleGroup.update_config(SECONDARY_CONF)
			elif roleGroup.roleType == "DATANODE":
				roleGroup.update_config(DATANODE_CONF)       

		#Start HDFS
		#format_hdfs takes a list of NameNodes
		print "Formatting HDFS..."
		cmd = hdfs.format_hdfs('hdfs-NAMENODE-' + ROLEHASH[0])[0]
		if not cmd.wait(CMD_TIMEOUT).success:
			print "Failed to format HDFS"
		
		print "Starting HDFS..."
		cmd = hdfs.start()
		if not cmd.wait(CMD_TIMEOUT).success:
			raise Exception("Failed to start HDFS")

		cmd = hdfs.create_hdfs_tmp()
		if not cmd.wait(CMD_TIMEOUT).success:
			raise Exception("Failed to create HDFS /tmp")


		for role in hdfs.get_all_roles():
			cmd = hdfs.deploy_client_config(role.name)
			if not cmd.wait(CMD_TIMEOUT).success:
				raise Exception("Failed to deploy client config. Role: " + role.name)

		#Create YARN Service and Roles
		print "Creating YARN Service and Roles..."
		yarn = cluster.create_service("yarn", "YARN")

		resourceman = yarn.create_role("yarn-RESOURCEMANAGER-" + ROLEHASH[0], "RESOURCEMANAGER", HOSTNAMES[0])
		jobhist = yarn.create_role("yarn-JOBHISTORY-" + ROLEHASH[0], "JOBHISTORY", HOSTNAMES[0])
		for i in range(len(HOSTNAMES)-1):
			nodeman = yarn.create_role("yarn-NODEMANAGER-" + ROLEHASH[i+1], "NODEMANAGER", HOSTNAMES[i+1])
	
		#Configure YARN
		print "Configuring YARN..."
		yarn.update_config(svc_config = YARN_CONF)

		for roleGroup in yarn.get_all_role_config_groups():
			if roleGroup.roleType == "RESOURCEMANAGER":
				roleGroup.update_config(RSRCMAN_CONF)
			elif roleGroup.roleType == "JOBHISTORY":
				roleGroup.update_config(JOBHIST_CONF)
			elif roleGroup.roleType == "NODEMANAGER":
				roleGroup.update_config(NODEMAN_CONF)

		#Start YARN
		print "Starting YARN..."
		cmd = yarn.create_yarn_job_history_dir()
		if not cmd.wait(CMD_TIMEOUT).success:
			raise Exception("Failed to create Job History Directory")
		
		cmd = yarn.create_yarn_node_manager_remote_app_log_dir()
		if not cmd.wait(CMD_TIMEOUT).success:
			raise Exception("Failed to create NodeManager remote application log directory")
	
		cmd = yarn.start()
		if not cmd.wait(CMD_TIMEOUT).success:
			raise Exception("Failed to start YARN")
		
		for role in yarn.get_all_roles():
			cmd = yarn.deploy_client_config(role.name)
			if not cmd.wait(CMD_TIMEOUT).success:
				raise Exception("Failed to deploy client config. Role: " + role.name)

		#SUCCESS!
		print "Cluster succesfully deployed."
			
	#Add new nodes		
	elif(opts.action == "add"):
		print "Adding hosts..."
		cluster = api.get_cluster(CLUSTER_NAME);
		cluster.add_hosts(HOSTNAMES);
		
		print "Configurng HDFS Roles..."
		hdfs = cluster.get_service("hdfs")
		for i in range(len(HOSTNAMES)):
			datanode = hdfs.create_role("hdfs-DATANODE-" + ROLEHASH[i], "DATANODE", HOSTNAMES[i])
			datanode.update_config(DATANODE_CONF);
			cmds = hdfs.start_roles("hdfs-DATANODE-" + ROLEHASH[i])
			for cmd in cmds:
				if not cmd.wait(CMD_TIMEOUT).success:
					raise Exception(cmd.name)	
			cmd = hdfs.deploy_client_config("hdfs-DATANODE-" + ROLEHASH[i])	
			if not cmd.wait(CMD_TIMEOUT).success:
				raise Exception("Failed to deploy client config hdfs-DATANODE-" + ROLEHASH[i])
			
		print "Configuring YARN roles..."
		yarn = cluster.get_service("yarn")
		for i in range(len(HOSTNAMES)):
			nodeman = yarn.create_role("yarn-NODEMANAGER-" + ROLEHASH[i], "NODEMANAGER", HOSTNAMES[i])
			nodeman.update_config(NODEMAN_CONF)
			cmds = yarn.start_roles("yarn-NODEMANAGER-" + ROLEHASH[i])
			for cmd in cmds:
				if not cmd.wait(CMD_TIMEOUT).success:
					raise Exception(cmd.name)
			cmd = yarn.deploy_client_config("yarn-NODEMANAGER-" + ROLEHASH[i])
			if not cmd.wait(CMD_TIMEOUT).success:
                                raise Exception("Failed to deploy client config yarn-NODEMANAGER-" + ROLEHASH[i])

		#print "Restarting HDFS service..."
		#cmd = hdfs.restart()
		#if not cmd.wait(CMD_TIMEOUT).success:
		#	raise Exception("Failed to restart HDFS")

		#print "Restarting YARN service..."
		#cmd = yarn.restart()
		#if not cmd.wait(CMD_TIMEOUT).success:
		#	raise Exception("Failed to restart YARN")

		#SUCCESS!
		print "Nodes successfully added"
			
	#Remove nodes		
	elif(opts.action == "remove"):
		cluster = api.get_cluster(CLUSTER_NAME);
		hdfs = cluster.get_service("hdfs")
		yarn = cluster.get_service("yarn")
		
		print "Decommissioning Roles..."
		for role in ROLEHASH:
			cmd = yarn.decommission("yarn-NODEMANAGER-" + role)
			if not cmd.wait(CMD_TIMEOUT).success:
				raise Exception("Failed to decommission role yarn-NODEMANAGER" + role)
			cmd = hdfs.decommission("hdfs-DATANODE-" + role)
			if not cmd.wait(CMD_TIMEOUT).success:	
				raise Exception("Failed to decommission role hdfs-DATANODE-" + role)

		print "Deleting Nodes..."
		for role in ROLEHASH:
			hdfs.delete_role("hdfs-DATANODE-" + role)
			yarn.delete_role("yarn-NODEMANAGER-" + role)
		for hostname in HOSTNAMES:		
			cluster.remove_host(hostname);		
		
		#SUCCESS
		print "Nodes successfull removed."

	#Delete Cluster
	elif(opts.action == "delete"):
		cluster = api.get_cluster(CLUSTER_NAME);
		hdfs = cluster.get_service("hdfs")
		yarn = cluster.get_service("yarn")
		print "Stopping YARN..."
		cmd = yarn.stop()
		if not cmd.wait(CMD_TIMEOUT).success:
			raise Exception("Failed to stop YARN")
		print "Stopping HDFS..."
		cmd = hdfs.stop()
		if not cmd.wait(CMD_TIMEOUT).success:
			raise Exception("Failed to stop HDFS")
		print "Deleting Cluster..."
		api.delete_cluster(CLUSTER_NAME)

		#SUCCESS
		print "Cluster successfully deleted."
				
	else:
		print "PLEASE SELECT A CORRECT OPTION"
		parser.print_help()
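
The ROLEHASH scheme above derives role names deterministically from hostnames, which is what lets the add and remove actions find a host's DataNode and NodeManager roles again later. In isolation (worker-01.example.com is a hypothetical hostname):

import hashlib

host = "worker-01.example.com"
suffix = hashlib.md5(host).hexdigest()   # same suffix on every run for this host
print "hdfs-DATANODE-" + suffix
print "yarn-NODEMANAGER-" + suffix
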
Ejemplo n.º 15
0
def main():
    config.read([
        "./conf/hadrian.ini", "./conf/cluster_specs.ini",
        "./conf/cloudera-manager/cm.ini"
    ])

    cm_cluster_name = config_grabber("Globals")["cm.cluster.name"]
    cm_username = config_grabber("Globals")["cm.username"]
    cm_password = config_grabber("Globals")["cm.password"]
    cm_port = config_grabber("Globals")["cm.port"]
    version = config_grabber("Globals")["cdh.cluster.version"]
    cm_server = config_grabber(cm_cluster_name + "-hn")["cm.server"]

    #Grab all configuration files in the directory with the CM Cluster Name.

    for i in os.listdir("./conf/" + cm_cluster_name):
        config.read("./conf/" + cm_cluster_name + "/" + i)

    while (get_cm_status(cm_server + ":" + cm_port) != 200):
        logging.info("Waiting for CM Server to start... ")
        time.sleep(15)

    api = ApiResource(cm_server, cm_port, cm_username, cm_password, version=12)
    # create cluster or get existing cluster
    cluster_exists = False
    for i in api.get_all_clusters():
        if i.name == cm_cluster_name:
            cluster_exists = True

    if cluster_exists == False:
        cluster = api.create_cluster(cm_cluster_name, version.upper())
        planned_nodes = config_grabber(cm_cluster_name +
                                       "-hn")["full.list"].split(",")
        for k, v in config_grabber(cm_cluster_name + "-dn").iteritems():
            for j in v.split(","):
                planned_nodes.append(j)

        # TODO make this smarter.  show which agents haven't checked in.  Add the option to continue without them.
        if len(api.get_all_hosts()) != len(planned_nodes):
            logging.info(
                "Waiting for all agents to check into the CM Server before continuing."
            )

            while len(planned_nodes) > len(api.get_all_hosts()):
                logging.info(
                    "Waiting for the final set of CM Agent nodes to check in.")
                time.sleep(5)

        logging.info("Updating Rack configuration for data nodes.")
        all_hosts = list()
        for host in api.get_all_hosts():
            all_hosts.append(host.hostId)
            for k, v in config_grabber(cm_cluster_name + "-dn").iteritems():
                if host.hostname in v:
                    logging.info("Setting host: " + host.hostname +
                                 " to rack /" + k)
                    host.set_rack_id("/" + k)

        logging.info("Adding all hosts to cluster.")
        cluster.add_hosts(all_hosts)

    else:
        cluster = api.get_cluster(cm_cluster_name)

    #Config CM
    logging.info("Applying any configuration changes to Cloudera Manager")
    cmanager = api.get_cloudera_manager()
    cmanager.update_config(config_grabber("cloudera-manager-updates"))
    if os.path.exists("/root/hadrian/cm_license.txt"):
        with open("/root/hadrian/cm_license.txt", "r") as license:
            logging.info("Applying Enterprise License to Cloudera Manager")
            cmanager.update_license(license.read())

    if config_grabber('Globals')['cdh.distribution.method'] == 'parcels':
        # increase the parcel refresh frequency to one minute to find parcel repos in a more timely manner
        cmanager.update_config({"PARCEL_UPDATE_FREQ": 1})
        distribute_parcel(cluster, 'CDH',
                          config_grabber('Globals')['cdh.parcel.version'])
        distribute_parcel(cluster, 'KAFKA',
                          config_grabber('Globals')['kafka.parcel.version'])
        # restore parcel refresh time period to original 60 minutes
        cmanager.update_config({"PARCEL_UPDATE_FREQ": 60})

    # grab current services, so that we can skip services already defined to make this script reentrant
    current_services = []
    for i in cluster.get_all_services():
        current_services.append(i.type)

    if "ZOOKEEPER" not in current_services:
        create_zookeeper_service(cluster)

    if "HDFS" not in current_services:
        create_hdfs_service(cluster, api)

    if "YARN" not in current_services:
        create_yarn_service(cluster)

    if "HIVE" not in current_services:
        create_hive_service(cluster)

    if "IMPALA" not in current_services:
        create_impala_service(cluster)

    if "KAFKA" not in current_services:
        create_kafka_service(cluster)

    if config_grabber("Globals")["kerberos.enabled"].lower() == "true":
        enable_kerberos(cluster, cmanager)
    else:
        logging.info("Starting remaining services.")
        cmd = cluster.start()

        if not cmd.wait(CMD_TIMEOUT).success:
            logging.info(
                "Error in cluster services start. Please review Cloudera Manager for details."
            )
        else:
            logging.info("Remaining cluster services started.")

    logging.info(
        "Starting final client configuration deployment for all services.")
    cmd = cluster.deploy_client_config()

    if not cmd.wait(CMD_TIMEOUT).success:
        logging.info("Failed to deploy client configuration.")
    else:
        logging.info(
            "Client configuration deployment complete.  The cluster is all yours.  Happy Hadooping."
        )
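
The create_*_service helpers called above live elsewhere in the hadrian scripts and are not shown in this fragment. As an illustration of the shape such a helper typically takes with cm_api, here is a minimal, hypothetical create_zookeeper_service; the zookeeper.hosts config key and the per-role serverId handling are assumptions, and the real helper likely configures much more:

def create_zookeeper_service(cluster):
    logging.info("Creating ZooKeeper service.")
    zk = cluster.create_service("zookeeper", "ZOOKEEPER")
    # assumed config key listing the hosts that should run a ZooKeeper SERVER role
    zk_hosts = config_grabber("Globals")["zookeeper.hosts"].split(",")
    for i, host_id in enumerate(zk_hosts):
        role = zk.create_role("zookeeper-SERVER-" + str(i + 1), "SERVER", host_id)
        # each ZooKeeper server needs a unique serverId
        role.update_config({"serverId": i + 1})
    return zk
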