Exemple #1
0
def main(args):
    """Start a Kerberos-enabled CDH cluster managed by Cloudera Manager.

    Builds one primary node plus the requested secondary and edge nodes from
    the ``cdh-cm-<role>-<version>`` images, starts them, configures Kerberos
    on the primary node, registers any new hosts with Cloudera Manager,
    applies host templates and, unless disabled, starts the cluster services.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``registry``, ``clusterdock_namespace``, ``image_name``,
            ``version_string``, ``primary_node``, ``secondary_nodes``,
            ``edge_nodes``, ``network``, ``change_hostfile``,
            ``dont_start_cluster`` and the ``skip_<service>`` flags
            (accumulo, yarn, hbase, flume, spark, sqoop, hive, oozie, hue).

    Raises:
        Exception: If Cloudera Manager reports a host whose hostname does not
            match the FQDN of any node in the cluster.
    """
    # The three images differ only in the role embedded in the tag.
    images = {
        role: "{0}/{1}/{2}:cdh-cm-{3}-{4}".format(
            args.registry, args.clusterdock_namespace, args.image_name,
            role, args.version_string)
        for role in ('primary', 'secondary', 'edge')
    }

    # Docker's API for healthcheck uses units of nanoseconds. Define a constant
    # to make this more readable.
    SECONDS = 1000000000
    cm_server_healthcheck = {
        'test':
        'curl --silent --output /dev/null 127.0.0.1:{}'.format(CM_PORT),
        'interval': 1 * SECONDS,
        'timeout': 1 * SECONDS,
        'retries': 1,
        'start_period': 30 * SECONDS
    }
    primary_node = Node(hostname=args.primary_node[0],
                        group='primary',
                        image=images['primary'],
                        ports=[{
                            CM_PORT: CM_PORT
                        }],
                        healthcheck=cm_server_healthcheck)
    secondary_nodes = [
        Node(hostname=hostname, group='secondary', image=images['secondary'])
        for hostname in args.secondary_nodes
    ]
    edge_nodes = [
        Node(hostname=hostname, group='edge', image=images['edge'])
        for hostname in args.edge_nodes
    ]

    all_nodes = [primary_node] + secondary_nodes + edge_nodes

    cluster = Cluster(*all_nodes)
    cluster.primary_node = primary_node

    secondary_node_group = NodeGroup(secondary_nodes)
    edge_node_group = NodeGroup(edge_nodes)

    cluster.start(args.network)

    # Replace the bind-mounted system files with regular copies: copy each
    # file aside, unmount the original and move the copy back into place.
    filesystem_fix_commands = [
        'cp {0} {0}.1; umount {0}; mv -f {0}.1 {0}'.format(file_) for file_ in
        ['/etc/hosts', '/etc/resolv.conf', '/etc/hostname', '/etc/localtime']
    ]
    cluster.execute("bash -c '{}'".format('; '.join(filesystem_fix_commands)))

    # Use BSD tar instead of tar because it works better with Docker.
    cluster.execute("ln -fs /usr/bin/bsdtar /bin/tar")

    _configure_cm_agents(cluster)

    if args.change_hostfile:
        update_hosts_file(cluster)

    # The CDH topology uses two pre-built images ('primary' and 'secondary'). If a cluster
    # larger than 2 nodes is started, some modifications need to be done to the nodes to
    # prevent duplicate heartbeats and things like that.
    if len(secondary_nodes) > 1:
        _remove_files(
            nodes=secondary_nodes[1:],
            files=['/var/lib/cloudera-scm-agent/uuid', '/dfs*/dn/current/*'])

    logger.info('Configuring Kerberos...')

    cluster.primary_node.execute('/root/configure-kerberos.sh', quiet=True)
    cluster.primary_node.execute('service krb5kdc start', quiet=True)
    cluster.primary_node.execute('service kadmin start', quiet=True)

    # NOTE(review): the agent restart itself is commented out, so this log
    # line currently announces a step that is not performed.
    logger.info('Restarting Cloudera Manager agents ...')
    # _restart_cm_agents(cluster)

    logger.info('Waiting for Cloudera Manager server to come online ...')
    _wait_for_cm_server(primary_node)

    # Docker for Mac exposes ports that can be accessed only with ``localhost:<port>`` so
    # use that instead of the hostname if the host name is ``moby``.
    hostname = 'localhost' if client.info().get(
        'Name') == 'moby' else socket.gethostname()
    port = primary_node.host_ports.get(CM_PORT)
    server_url = 'http://{}:{}'.format(hostname, port)
    logger.info('Cloudera Manager server is now reachable at %s', server_url)

    # The work we need to do through CM itself begins here...
    deployment = ClouderaManagerDeployment(server_url)

    deployment.stop_cm_service()
    time.sleep(10)

    logger.info('Starting krb5kdc and kadmin ...')
    cluster.primary_node.execute('service krb5kdc start', quiet=True)
    cluster.primary_node.execute('service kadmin start', quiet=True)

    logger.info("Regenerating keytabs...")
    regenerate_keytabs(cluster, primary_node, deployment)

    logger.info("Adding hosts to cluster ...")
    # Add all CM hosts to the cluster (i.e. only new hosts that weren't part of the original
    # images).
    all_host_ids = {}
    for host in deployment.get_all_hosts():
        all_host_ids[host['hostId']] = host['hostname']
        for node in cluster:
            if node.fqdn == host['hostname']:
                node.host_id = host['hostId']
                break
        else:
            # Report the CM host that had no matching node. The loop variable
            # ``node`` would only be the last node iterated (or unbound for an
            # empty cluster), which is not the host that failed to match.
            raise Exception(
                'Could not find a cluster node matching CM host with '
                'hostname {}.'.format(host['hostname']))
    cluster_host_ids = {
        host['hostId']
        for host in deployment.get_cluster_hosts(
            cluster_name=DEFAULT_CLUSTER_NAME)
    }
    host_ids_to_add = set(all_host_ids) - cluster_host_ids

    if host_ids_to_add:
        logger.debug(
            'Adding %s to cluster %s ...', 'host{} ({})'.format(
                's' if len(host_ids_to_add) > 1 else '',
                ', '.join(all_host_ids[host_id]
                          for host_id in host_ids_to_add)),
            DEFAULT_CLUSTER_NAME)
        deployment.add_cluster_hosts(cluster_name=DEFAULT_CLUSTER_NAME,
                                     host_ids=host_ids_to_add)

    _wait_for_activated_cdh_parcel(deployment=deployment,
                                   cluster_name=DEFAULT_CLUSTER_NAME)

    # Create and apply host templates.
    # NOTE(review): the templates are created with the literal cluster name
    # 'cluster' while every other call uses DEFAULT_CLUSTER_NAME -- confirm
    # the two are the same value.
    deployment.create_host_template(cluster_name='cluster',
                                    host_template_name='secondary',
                                    role_config_group_names=[
                                        'hdfs-DATANODE-BASE',
                                        'hbase-REGIONSERVER-BASE',
                                        'yarn-NODEMANAGER-BASE'
                                    ])
    deployment.create_host_template(cluster_name='cluster',
                                    host_template_name='edgenode',
                                    role_config_group_names=[
                                        'hive-GATEWAY-BASE',
                                        'hbase-GATEWAY-BASE',
                                        'hdfs-GATEWAY-BASE',
                                        'spark_on_yarn-GATEWAY-BASE'
                                    ])

    # NOTE(review): both templates are applied to the same set of new hosts,
    # i.e. secondary and edge hosts are not distinguished here -- confirm
    # that this is intended.
    for host_template_name in ('secondary', 'edgenode'):
        deployment.apply_host_template(cluster_name=DEFAULT_CLUSTER_NAME,
                                       host_template_name=host_template_name,
                                       start_roles=False,
                                       host_ids=host_ids_to_add)

    logger.info('Updating database configurations ...')
    _update_database_configs(deployment=deployment,
                             cluster_name=DEFAULT_CLUSTER_NAME,
                             primary_node=primary_node)

    logger.info("Update KDC Config  ")
    deployment.update_cm_config({
        'SECURITY_REALM': 'CLOUDERA',
        'KDC_HOST': 'node-1.cluster',
        'KRB_MANAGE_KRB5_CONF': 'true'
    })

    deployment.update_service_config(
        service_name='hbase',
        cluster_name=DEFAULT_CLUSTER_NAME,
        configs={'hbase_superuser': '******'})

    deployment.update_service_role_config_group_config(
        service_name='hive',
        cluster_name=DEFAULT_CLUSTER_NAME,
        role_config_group_name='hive-HIVESERVER2-BASE',
        configs={'hiveserver2_webui_port': '10009'})

    logger.info("Importing Credentials..")

    # The URL is single-quoted so that the ``&`` between the query parameters
    # is never treated as a shell control operator if the command ends up
    # being run through a shell; the password parameter would be dropped.
    cluster.primary_node.execute(
        "curl -XPOST -u admin:admin 'http://{0}:{1}/api/v14/cm/commands/importAdminCredentials?username=cloudera-scm/admin@CLOUDERA&password=cloudera'"
        .format(primary_node.fqdn, CM_PORT),
        quiet=True)
    logger.info("deploy cluster client config ...")
    deployment.deploy_cluster_client_config(cluster_name=DEFAULT_CLUSTER_NAME)

    logger.info("Configure for kerberos ...")
    cluster.primary_node.execute(
        "curl -XPOST -u admin:admin http://{0}:{1}/api/v14/cm/commands/configureForKerberos --data 'clustername={2}'"
        .format(primary_node.fqdn, CM_PORT, DEFAULT_CLUSTER_NAME),
        quiet=True)

    logger.info("Creating keytab files ...")
    cluster.execute('/root/create-keytab.sh', quiet=True)

    logger.info('Deploying client config ...')
    _deploy_client_config(deployment=deployment,
                          cluster_name=DEFAULT_CLUSTER_NAME)

    if not args.dont_start_cluster:
        logger.info('Starting cluster services ...')
        # (skip?, service name, command) in the order the commands must run.
        # ZooKeeper and HDFS are always started; the rest honour their
        # ``--skip-*`` flag.
        service_commands = [
            (False, "zookeeper", "start"),
            (False, "hdfs", "start"),
            (args.skip_accumulo, "accumulo16", "CreateHdfsDirCommand"),
            (args.skip_accumulo, "accumulo16", "CreateAccumuloUserDirCommand"),
            (args.skip_accumulo, "accumulo16", "AccumuloInitServiceCommand"),
            (args.skip_accumulo, "accumulo16", "start"),
            (args.skip_yarn, "yarn", "start"),
            (args.skip_hbase, "hbase", "start"),
            (args.skip_flume, "flume", "start"),
            (args.skip_spark, "spark_on_yarn", "start"),
            (args.skip_sqoop, "sqoop", "start"),
            (args.skip_hive, "hive", "start"),
            (args.skip_oozie, "oozie", "start"),
            (args.skip_hue, "hue", "start"),
        ]
        for skip, service_name, command in service_commands:
            if skip:
                continue
            _start_service_command(deployment=deployment,
                                   cluster_name=DEFAULT_CLUSTER_NAME,
                                   service_name=service_name,
                                   command=command)

        logger.info('Starting CM services ...')
        _start_cm_service(deployment=deployment)

    logger.info("Setting up HDFS Homedir ...")

    cluster.primary_node.execute(
        "kinit -kt /var/run/cloudera-scm-agent/process/*-hdfs-NAMENODE/hdfs.keytab hdfs/node-1.cluster@CLOUDERA",
        quiet=True)
    cluster.primary_node.execute("hadoop fs -mkdir /user/cloudera-scm",
                                 quiet=True)
    cluster.primary_node.execute(
        "hadoop fs -chown cloudera-scm:cloudera-scm /user/cloudera-scm",
        quiet=True)

    logger.info("Kinit cloudera-scm/admin ...")
    cluster.execute('kinit -kt /root/cloudera-scm.keytab cloudera-scm/admin',
                    quiet=True)

    logger.info("Executing post run script ...")
    secondary_node_group.execute("/root/post_run.sh")
    edge_node_group.execute("/root/post_run.sh")
Exemple #2
0
def main(args):
    """Start a MapR cluster and wait for its Control System (MCS) server.

    Builds one primary node and the requested secondary nodes, starts them,
    runs ``configure.sh`` on every node (optionally with native security),
    waits for the MCS port on the primary node to accept connections, then
    creates the Spark directory and a sample stream, optionally applies a
    license and registers the gateway.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``license_url``, ``license_credentials``, ``registry``,
            ``namespace``, ``mapr_version``, ``mep_version``, ``node_disks``
            (a YAML mapping of hostname to a list of disk devices),
            ``primary_node``, ``secondary_nodes``, ``predictable``,
            ``secure``, ``secure_config_directory``, ``network``,
            ``always_pull`` and ``dont_register_gateway``.

    Raises:
        Exception: If ``--license-url`` is given without
            ``--license-credentials``, or if the node-disks mapping does not
            cover exactly the requested nodes.
        TimeoutError: Raised by the ``failure`` callback handed to
            ``wait_for_condition`` when the MCS server does not come online.
    """
    if args.license_url and not args.license_credentials:
        raise Exception(
            '--license-credentials is a required argument if --license-url is provided.'
        )

    image_prefix = '{}/{}/clusterdock:mapr{}'.format(
        args.registry, args.namespace or DEFAULT_NAMESPACE, args.mapr_version)
    if args.mep_version:
        image_prefix = '{}_mep{}'.format(image_prefix, args.mep_version)
    primary_node_image = '{}_{}'.format(image_prefix, 'primary-node')
    secondary_node_image = '{}_{}'.format(image_prefix, 'secondary-node')

    # ``safe_load`` only builds plain Python objects; a bare ``yaml.load``
    # without a Loader can construct arbitrary objects from command-line
    # input and is rejected outright by recent PyYAML releases. An empty
    # value parses to ``None``, so fall back to an empty mapping and let the
    # check below report the missing nodes.
    node_disks = yaml.safe_load(args.node_disks) or {}

    # MapR-FS needs each fileserver node to have a disk allocated for it, so fail fast if the
    # node disks map is missing any nodes.
    if set(args.primary_node + args.secondary_nodes) != set(node_disks):
        raise Exception(
            'Not all nodes are accounted for in the --node-disks dictionary')

    primary_node = Node(
        hostname=args.primary_node[0],
        group='primary',
        image=primary_node_image,
        # Map the MCS port to the same host port only when predictable ports
        # were requested; otherwise pass the bare container port.
        ports=[{
            MCS_SERVER_PORT: MCS_SERVER_PORT
        } if args.predictable else MCS_SERVER_PORT],
        devices=node_disks.get(args.primary_node[0]),
        # Secure cluster needs the ticket to execute rest of commands
        # after cluster start.
        environment=['MAPR_TICKETFILE_LOCATION=/opt/mapr/conf/mapruserticket']
        if args.secure else [])

    secondary_nodes = [
        Node(hostname=hostname,
             group='secondary',
             image=secondary_node_image,
             devices=node_disks.get(hostname))
        for hostname in args.secondary_nodes
    ]

    cluster = Cluster(primary_node, *secondary_nodes)

    if args.secure:
        # Share one host directory with every node so that the security files
        # generated on the primary node can be copied to the secondary nodes.
        secure_config_host_dir = os.path.expanduser(
            args.secure_config_directory)
        volumes = [{secure_config_host_dir: SECURE_CONFIG_CONTAINER_DIR}]
        for node in cluster.nodes:
            node.volumes.extend(volumes)

    # MapR versions 6.0.0 onwards use CentOS 7 which needs following settings.
    mapr_version_tuple = tuple(int(i) for i in args.mapr_version.split('.'))
    if mapr_version_tuple >= EARLIEST_MAPR_VERSION_WITH_LICENSE_AND_CENTOS_7:
        for node in cluster.nodes:
            node.volumes.append({'/sys/fs/cgroup': '/sys/fs/cgroup'})
            # Each node gets its own host temporary directory mounted at /run.
            # The directory is not removed by this function.
            temp_dir_name = tempfile.mkdtemp()
            logger.debug('Created temporary directory %s', temp_dir_name)
            node.volumes.append({temp_dir_name: '/run'})
    cluster.primary_node = primary_node
    cluster.start(args.network, pull_images=args.always_pull)

    logger.info('Generating new UUIDs ...')
    cluster.execute('/opt/mapr/server/mruuidgen > /opt/mapr/hostid')

    if not args.secure:
        logger.info('Configuring the cluster ...')
        for node in cluster:
            configure_command = (
                '/opt/mapr/server/configure.sh -C {0} -Z {0} -RM {0} -HS {0} '
                '-u mapr -g mapr -D {1}'.format(
                    primary_node.fqdn,
                    ','.join(node_disks.get(node.hostname))))
            node.execute("bash -c '{}'".format(configure_command))
    else:
        logger.info('Configuring native security for the cluster ...')
        # Primary node: generate the keys, then publish the secure files to
        # the shared configuration directory.
        configure_command = (
            '/opt/mapr/server/configure.sh -secure -genkeys -C {0} -Z {0} -RM {0} -HS {0} '
            '-u mapr -g mapr -D {1}'.format(
                primary_node.fqdn,
                ','.join(node_disks.get(primary_node.hostname))))
        generated_files = [
            '{}/{}'.format(MAPR_CONFIG_DIR, file_name)
            for file_name in SECURE_FILES
        ]
        commands = [
            configure_command,
            'chmod 600 {}/{}'.format(MAPR_CONFIG_DIR, SSL_KEYSTORE_FILE),
            'cp -f {src} {dest_dir}'.format(
                src=' '.join(generated_files),
                dest_dir=SECURE_CONFIG_CONTAINER_DIR)
        ]
        primary_node.execute(' && '.join(commands))

        # Secondary nodes: pull the secure files from the shared directory
        # before configuring. The file list is the same for every node.
        shared_files = [
            '{}/{}'.format(SECURE_CONFIG_CONTAINER_DIR, file_name)
            for file_name in SECURE_FILES
        ]
        copy_command = 'cp -f {src} {dest_dir}'.format(
            src=' '.join(shared_files), dest_dir=MAPR_CONFIG_DIR)
        for node in secondary_nodes:
            configure_command = (
                '/opt/mapr/server/configure.sh -secure -C {0} -Z {0} -RM {0} -HS {0} '
                '-u mapr -g mapr -D {1}'.format(
                    primary_node.fqdn,
                    ','.join(node_disks.get(node.hostname))))
            node.execute(' && '.join([copy_command, configure_command]))

    logger.info('Waiting for MapR Control System server to come online ...')

    def condition(address, port):
        # A fresh socket is needed for every probe; close it each time so
        # repeated polling does not leak file descriptors.
        with socket() as probe:
            return probe.connect_ex((address, port)) == 0

    def success(time):
        logger.info('MapR Control System server is online after %s seconds.',
                    time)

    def failure(timeout):
        raise TimeoutError(
            'Timed out after {} seconds waiting '
            'for MapR Control System server to come online.'.format(timeout))

    wait_for_condition(
        condition=condition,
        condition_args=[primary_node.ip_address, MCS_SERVER_PORT],
        time_between_checks=3,
        timeout=180,
        success=success,
        failure=failure)
    mcs_server_host_port = primary_node.host_ports.get(MCS_SERVER_PORT)

    logger.info('Creating /apps/spark directory on %s ...',
                primary_node.hostname)
    spark_directory_command = [
        'hadoop fs -mkdir -p /apps/spark', 'hadoop fs -chmod 777 /apps/spark'
    ]
    primary_node.execute("bash -c '{}'".format(
        '; '.join(spark_directory_command)))

    logger.info('Creating MapR sample Stream named /sample-stream on %s ...',
                primary_node.hostname)
    primary_node.execute('maprcli stream create -path /sample-stream '
                         '-produceperm p -consumeperm p -topicperm p')

    if mapr_version_tuple >= EARLIEST_MAPR_VERSION_WITH_LICENSE_AND_CENTOS_7 and args.license_url:
        # Download the license, register it, then remove the temporary file.
        license_commands = [
            'curl --user {} {} > /tmp/lic'.format(args.license_credentials,
                                                  args.license_url),
            '/opt/mapr/bin/maprcli license add -license /tmp/lic -is_file true',
            'rm -rf /tmp/lic'
        ]
        logger.info('Applying license ...')
        primary_node.execute(' && '.join(license_commands))

    if not args.dont_register_gateway:
        logger.info('Registering gateway with the cluster ...')
        # The cluster name is the first space-terminated token of
        # mapr-clusters.conf; stash it in a temporary file for maprcli.
        register_gateway_commands = [
            "cat /opt/mapr/conf/mapr-clusters.conf | egrep -o '^[^ ]* '"
            ' > /tmp/cluster-name',
            'maprcli cluster gateway set -dstcluster $(cat '
            '/tmp/cluster-name) -gateways {}'.format(primary_node.fqdn),
            'rm /tmp/cluster-name'
        ]
        primary_node.execute(' && '.join(register_gateway_commands))

    logger.info(
        'MapR Control System server is now accessible at https://%s:%s',
        getfqdn(), mcs_server_host_port)