Example #1
def _update_hive_metastore_namenodes(deployment, cluster_name):
    for service in deployment.get_cluster_services(cluster_name=cluster_name):
        if service['type'] == 'HIVE':
            command_id = deployment.update_hive_metastore_namenodes(
                cluster_name, service['name'])['id']
            break

    def condition(deployment, command_id):
        command_information = deployment.api_client.get_command_information(
            command_id)
        active = command_information.get('active')
        success = command_information.get('success')
        logger.debug(
            'Hive Metastore namenodes command: (active: %s, success: %s)',
            active, success)
        if not active and not success:
            raise Exception('Failed to update Hive Metastore Namenodes.')
        return not active and success

    def success(time):
        logger.debug('Updated Hive Metastore Namenodes in %s seconds.', time)

    def failure(timeout):
        raise TimeoutError(
            'Timed out after {} seconds waiting '
            'for Hive Metastore Namenodes to update.'.format(timeout))

    wait_for_condition(condition=condition,
                       condition_args=[deployment, command_id],
                       time_between_checks=3,
                       timeout=180,
                       success=success,
                       failure=failure)
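All of the snippets in this listing hand the actual polling off to a wait_for_condition helper (these examples appear to come from clusterdock topologies, where the helper is imported from clusterdock.utils). A minimal sketch of the interface the calls above assume, including the time_to_success grace period used in some later examples, might look like the following; it is an illustrative reimplementation under those assumptions, not the library's actual code.

import time


def wait_for_condition(condition, condition_args=None, condition_kwargs=None,
                       time_between_checks=1, timeout=60, time_to_success=0,
                       success=None, failure=None):
    # Poll `condition` until it stays truthy for `time_to_success` seconds,
    # then call `success(elapsed_seconds)`; call `failure(timeout)` if the
    # condition never holds within `timeout` seconds.
    start_time = time.time()
    stable_since = None
    while time.time() - start_time < timeout:
        if condition(*(condition_args or []), **(condition_kwargs or {})):
            stable_since = stable_since or time.time()
            if time.time() - stable_since >= time_to_success:
                if success:
                    success(round(time.time() - start_time, 2))
                return True
        else:
            stable_since = None
        time.sleep(time_between_checks)
    if failure:
        failure(timeout)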
Example #2
def _start_cm_service(deployment):
    command_id = deployment.start_cm_service()['id']

    def condition(deployment, command_id):
        command_information = deployment.api_client.get_command_information(
            command_id)
        active = command_information.get('active')
        success = command_information.get('success')
        logger.debug('Start CM service command: (active: %s, success: %s)',
                     active, success)
        if not active and not success:
            raise Exception('Failed to start CM service.')
        return not active and success

    def success(time):
        logger.debug('Started CM service in %s seconds.', time)

    def failure(timeout):
        raise TimeoutError('Timed out after {} seconds waiting '
                           'for CM service to start.'.format(timeout))

    wait_for_condition(condition=condition,
                       condition_args=[deployment, command_id],
                       time_between_checks=3,
                       timeout=180,
                       success=success,
                       failure=failure)
Example #3
    def deploy_client_config(self):
        command_id = self.api_client.deploy_cluster_client_config(
            cluster_name=self.name)['id']

        def condition(command_id):
            command_information = self.api_client.get_command_information(
                command_id)
            active = command_information.get('active')
            success = command_information.get('success')
            result_message = command_information.get('resultMessage')
            logger.debug(
                'Deploy cluster client config command: (active: %s, success: %s)',
                active, success)
            if not active and not success:
                if 'not currently available for execution' in result_message:
                    logger.debug('Deploy cluster client config execution not '
                                 'currently available. Continuing ...')
                    return True
                raise Exception('Failed to deploy cluster config.')
            return not active and success

        def success(time):
            logger.debug('Deployed cluster client config in %s seconds.', time)

        def failure(timeout):
            raise TimeoutError(
                'Timed out after {} seconds waiting '
                'for cluster client config to deploy.'.format(timeout))

        wait_for_condition(condition=condition,
                           condition_args=[command_id],
                           time_between_checks=3,
                           timeout=180,
                           success=success,
                           failure=failure)
Example #4
    def start(self):
        command_id = self.api_client.start_all_cluster_services(
            cluster_name=self.name)['id']

        def condition(command_id):
            command_information = self.api_client.get_command_information(
                command_id)
            active = command_information.get('active')
            success = command_information.get('success')
            logger.debug('Start cluster command: (active: %s, success: %s)',
                         active, success)
            if not active and not success:
                raise Exception('Failed to start cluster.')
            return not active and success

        def success(time):
            logger.debug('Started cluster in %s seconds.', time)

        def failure(timeout):
            raise TimeoutError('Timed out after {} seconds waiting '
                               'for cluster to start.'.format(timeout))

        wait_for_condition(condition=condition,
                           condition_args=[command_id],
                           time_between_checks=3,
                           timeout=600,
                           success=success,
                           failure=failure)
Example #5
def _wait_for_command(object, command_id, timeout=180):
    def condition(object, command_id):
        command_information = object.api_client.get_command_information(
            command_id)
        active = command_information.get('active')
        success = command_information.get('success')
        name = command_information.get('name')
        logger.debug('Run %s command: (active: %s, success: %s)', name, active,
                     success)
        if not active and not success:
            raise Exception('Failed to run command {}.'.format(name))
        return not active and success

    def success(time):
        logger.debug('Command ran in %s seconds.', time)

    def failure(timeout):
        raise TimeoutError('Timed out after {} seconds waiting '
                           'for command to run.'.format(timeout))

    wait_for_condition(condition=condition,
                       condition_args=[object, command_id],
                       time_between_checks=3,
                       timeout=timeout,
                       success=success,
                       failure=failure)
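A hypothetical call site for _wait_for_command, assuming some API call that returns a command dictionary with an 'id' key (the restart_service call below is illustrative only, not a documented method of the API client used above):

# Illustrative usage only: 'restart_service' is a placeholder for any API call
# that returns {'id': <command id>, ...}.
command_id = deployment.api_client.restart_service('HIVE')['id']
_wait_for_command(deployment, command_id, timeout=300)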
Example #6
def _validate_service_health(deployment, cluster_name):
    def condition(deployment, cluster_name):
        services = (
            deployment.get_cluster_services(cluster_name=cluster_name) +
            [deployment.get_cm_service()])
        if all(
                service.get('serviceState') == 'NA'
                or service.get('serviceState') == 'STARTED'
                and service.get('healthSummary') == 'GOOD'
                for service in services):
            return True
        else:
            logger.debug(
                'Services with poor health: %s', ', '.join(
                    service['name'] for service in services
                    if (service.get('healthSummary') != 'GOOD'
                        and service.get('serviceState') != 'NA')
                    or service.get('serviceState') not in ('STARTED', 'NA')))

    def success(time):
        logger.debug('Validated service health in %s seconds.', time)

    def failure(timeout):
        raise TimeoutError('Timed out after {} seconds waiting '
                           'to validate service health.'.format(timeout))

    wait_for_condition(condition=condition,
                       condition_args=[deployment, cluster_name],
                       time_between_checks=3,
                       timeout=600,
                       time_to_success=30,
                       success=success,
                       failure=failure)
Example #7
def _validate_service_health(node, services, quiet=True):
    def condition(node, services):
        services_with_poor_health = [
            service for service in services
            if node.execute(command='service {} status'.format(service),
                            quiet=quiet).exit_code != 0
        ]
        if services_with_poor_health:
            logger.debug('Services with poor health: %s',
                         ', '.join(services_with_poor_health))
        # Return True if the list of services with poor health is empty.
        return not bool(services_with_poor_health)

    def success(time):
        logger.debug('Validated service health in %s seconds.', time)

    def failure(timeout):
        raise TimeoutError('Timed out after {} seconds waiting '
                           'to validate service health.'.format(timeout))

    wait_for_condition(condition=condition,
                       condition_args=[node, services],
                       time_between_checks=3,
                       timeout=30,
                       success=success,
                       failure=failure)
Example #8
    def wait_for_parcel_stage(self, product, version=None, stage=None):
        def condition(product, version, stage):
            parcel = self.parcel(product=product, version=version, stage=stage)
            return parcel is not None

        def success(time):
            logger.debug(
                '%s parcel with %s version found in %s stage after %s seconds.',
                product, version, stage, time)

        def failure(timeout):
            raise TimeoutError(
                'Timed out after {} seconds waiting for {} parcel with {} version'
                ' in the {} stage.'.format(timeout, product, version, stage))

        wait_for_condition(condition=condition,
                           condition_args=[product, version, stage],
                           time_between_checks=3,
                           timeout=540,
                           success=success,
                           failure=failure)
Example #9
    def refresh_parcel_repos(self):
        """Refresh parcel information.

        For CM API versions without support for the REST endpoint, this will simply sleep.
        """
        if self.api_client.api_version < 'v16':
            logger.warning(
                'Detected API version without support '
                'for refreshParcelRepos (%s). Sleeping instead ...',
                self.api_client.api_version)
            sleep(30)
        else:
            command_id = self.api_client.refresh_parcel_repos()['id']

            def condition(command_id):
                command_information = self.api_client.get_command_information(
                    command_id)
                active = command_information.get('active')
                success = command_information.get('success')
                logger.debug(
                    'Refresh parcel repos command: (active: %s, success: %s)',
                    active, success)
                if not active and not success:
                    raise Exception('Failed to refresh parcel repos.')
                return not active and success

            def success(time):
                logger.debug('Refreshed parcel repos in %s seconds.', time)

            def failure(timeout):
                raise TimeoutError(
                    'Timed out after {} seconds waiting '
                    'for parcel repos to refresh.'.format(timeout))

            wait_for_condition(condition=condition,
                               condition_args=[command_id],
                               time_between_checks=3,
                               timeout=180,
                               success=success,
                               failure=failure)
Example #10
def _wait_for_activated_cdh_parcel(deployment, cluster_name):
    parcels = deployment.get_cluster_parcels(cluster_name=cluster_name)
    parcel_version = next(
        parcel['version'] for parcel in parcels
        if parcel['product'] == 'CDH' and parcel['stage'] in ('ACTIVATING',
                                                              'ACTIVATED'))

    def condition(deployment, cluster_name):
        parcels = deployment.get_cluster_parcels(cluster_name=cluster_name)
        for parcel in parcels:
            if parcel['product'] == 'CDH' and parcel[
                    'version'] == parcel_version:
                logger.debug('Found CDH parcel with version %s in state %s.',
                             parcel_version, parcel['stage'])
                break
        else:
            raise Exception(
                'Could not find activating or activated CDH parcel.')

        logger.debug('CDH parcel is in stage %s ...', parcel['stage'])

        if parcel['stage'] == 'ACTIVATED':
            return True

    def success(time):
        logger.debug('CDH parcel became activated after %s seconds.', time)

    def failure(timeout):
        raise TimeoutError('Timed out after {} seconds waiting for '
                           'CDH parcel to become activated.'.format(timeout))

    wait_for_condition(condition=condition,
                       condition_args=[deployment, cluster_name],
                       time_between_checks=1,
                       timeout=500,
                       time_to_success=10,
                       success=success,
                       failure=failure)
Example #11
def _wait_for_cm_server(primary_node):
    def condition(container):
        container.reload()
        health_status = nested_get(container.attrs,
                                   ['State', 'Health', 'Status'])
        logger.debug('Cloudera Manager health status evaluated to %s.',
                     health_status)
        return health_status == 'healthy'

    def success(time):
        logger.debug(
            'Cloudera Manager reached healthy state after %s seconds.', time)

    def failure(timeout):
        raise TimeoutError('Timed out after {} seconds waiting '
                           'for Cloudera Manager to start.'.format(timeout))

    wait_for_condition(condition=condition,
                       condition_args=[primary_node.container],
                       time_between_checks=3,
                       timeout=180,
                       success=success,
                       failure=failure)
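The condition above digs into the container attributes with a nested_get helper. A minimal sketch of such a helper, shown only to make the example self-contained (a hypothetical implementation, not necessarily the one the library ships), could be:

def nested_get(dct, keys, default=None):
    # Walk a nested dictionary along `keys`, returning `default` as soon as a
    # key along the path is missing.
    for key in keys:
        if not isinstance(dct, dict) or key not in dct:
            return default
        dct = dct[key]
    return dct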
Example #12
    def wait_for_stage(self, stage, timeout=300, time_to_success=0):
        def condition():
            for parcel in self.cluster.parcels:
                if parcel.product == self.product and parcel.version == self.version:
                    break
            logger.debug('%s parcel is in stage %s ...', self.product,
                         parcel.stage)
            return parcel.stage == stage

        def success(time):
            logger.debug('%s parcel reached stage %s after %s seconds.',
                         self.product, stage, time)

        def failure(timeout):
            raise TimeoutError(
                'Timed out after {} seconds waiting for {} parcel '
                'to reach stage {}.'.format(timeout, self.product, stage))

        return wait_for_condition(condition=condition,
                                  time_between_checks=3,
                                  timeout=timeout,
                                  time_to_success=time_to_success,
                                  success=success,
                                  failure=failure)
Example #13
def main(args):
    quiet = not args.verbose

    # Image name
    image = '{}/{}/topology_apache_kafka:kafka-{}-{}'.format(
        args.registry, args.namespace or DEFAULT_NAMESPACE, args.kafka_version,
        args.scala_version)

    # Nodes in the Kafka cluster
    nodes = [
        Node(hostname=hostname,
             group='brokers',
             ports=[ZOOKEEPER_PORT, BROKER_PORT],
             image=image) for hostname in args.brokers
    ]

    cluster = Cluster(*nodes)
    cluster.start(args.network, pull_images=args.always_pull)

    # Create distributed zookeeper configuration
    zookeeper_config = ('tickTime=2000\n'
                        'dataDir=/zookeeper\n'
                        'clientPort=2181\n'
                        'initLimit=5\n'
                        'syncLimit=2\n')
    for idx, node in enumerate(cluster):
        zookeeper_config += 'server.{}={}:2888:3888\n'.format(
            idx, node.hostname)

    # Start all zookeepers
    for idx, node in enumerate(cluster):
        logger.info('Starting Zookeeper on node {}'.format(node.hostname))
        node.execute('mkdir -p /zookeeper')
        node.put_file('/zookeeper/myid', str(idx))
        node.put_file('/zookeeper.properties', zookeeper_config)
        node.execute('/start_zookeeper &', detach=True)

    # Validate that ZooKeeper is alive on each node
    for node in cluster:
        logger.info('Validating Zookeeper on node %s', node.hostname)
        wait_for_condition(condition=validate_zookeeper,
                           condition_args=[node, quiet],
                           time_between_checks=3,
                           timeout=60,
                           success=success,
                           failure=failure)

    # Start all brokers
    for idx, node in enumerate(cluster):
        logger.info('Starting Kafka on node {}'.format(node.hostname))

        kafka_config = node.get_file('/kafka/config/server.properties')
        kafka_config = kafka_config.replace('broker.id=0',
                                            'broker.id={}'.format(idx))
        node.put_file('/kafka.properties', kafka_config)

        node.execute('/start_kafka &', detach=True)

    # Verify that all Kafka brokers are up
    logger.info('Waiting on all brokers to register in zookeeper')
    wait_for_condition(condition=validate_kafka,
                       condition_args=[nodes[0], len(nodes), quiet],
                       time_between_checks=3,
                       timeout=60,
                       success=success,
                       failure=failure)

    # Automatically create topics
    for topic in args.topics.split(','):
        logger.info('Creating topic %s', topic)
        nodes[0].execute('/create_topic {}'.format(topic), quiet=quiet)
Example #14
def main(args):
    quiet = not args.verbose
    print_topology_meta(args.topology)

    if args.include_services and args.exclude_services:
        raise ValueError(
            'Cannot pass both --include-services and --exclude-services.')

    image_prefix = '{}/{}/topology_hdp:hdp{}_ambari{}'.format(
        args.registry, args.namespace or DEFAULT_NAMESPACE, args.hdp_version,
        args.ambari_version)
    primary_node_image = '{}_{}'.format(image_prefix, 'primary-node')
    secondary_node_image = '{}_{}'.format(image_prefix, 'secondary-node')

    clusterdock_config_host_dir = os.path.realpath(
        os.path.expanduser(args.clusterdock_config_directory))
    volumes = [{clusterdock_config_host_dir: CLUSTERDOCK_CLIENT_CONTAINER_DIR}]

    primary_node = Node(hostname=args.primary_node[0],
                        group='primary',
                        volumes=volumes,
                        image=primary_node_image,
                        ports=[{
                            AMBARI_PORT: AMBARI_PORT
                        } if args.predictable else AMBARI_PORT])

    secondary_nodes = [
        Node(hostname=hostname,
             group='secondary',
             volumes=volumes,
             image=secondary_node_image) for hostname in args.secondary_nodes
    ]

    cluster = Cluster(primary_node, *secondary_nodes)
    cluster.primary_node = primary_node
    cluster.secondary_nodes = secondary_nodes

    for node in cluster.nodes:
        node.volumes.append({'/sys/fs/cgroup': '/sys/fs/cgroup'})
        # Do not use tempfile.mkdtemp: systemd won't be able to bring services up
        # when the temp directory ends up being created under /var/tmp/.
        node.volumes.append(['/run', '/run/lock'])

    cluster.start(args.network)

    hdp_version_tuple = version_tuple(args.hdp_version)

    logger.debug('Starting PostgreSQL for Ambari server ...')

    # Needed because the init system in Docker initially misreports the postgres start.
    # See https://github.com/docker-library/postgres/issues/146 for more details.
    def condition():
        primary_node.execute('service postgresql restart', quiet=quiet)
        if '1 row' in primary_node.execute(
                'PGPASSWORD=bigdata psql ambari '
                '-U ambari -h localhost -c "select 1"',
                quiet=quiet).output:
            return True

    wait_for_condition(condition=condition, time_between_checks=2)

    def condition():
        if 'running' in primary_node.execute('service postgresql status',
                                             quiet=quiet).output:
            return True

    wait_for_condition(condition=condition)

    # If images are set to start Ambari server/agents, give them some time to
    # recover the right status.
    time.sleep(10)
    _update_node_names(cluster, quiet=quiet)

    # The HDP topology uses two pre-built images ('primary' and 'secondary'). If a cluster
    # larger than 2 nodes is started, some modifications need to be made.
    if len(secondary_nodes) > 1:
        _remove_files(nodes=secondary_nodes[1:],
                      files=['/hadoop/hdfs/data/current/*'],
                      quiet=quiet)

    logger.info('Starting Ambari server ...')
    primary_node.execute('ambari-server start', quiet=quiet)

    # Docker for Mac exposes ports that can be accessed only with ``localhost:<port>`` so
    # use that instead of the hostname if the host name is ``moby``.
    hostname = ('localhost'
                if client.info().get('Name') == 'moby' else socket.getaddrinfo(
                    socket.gethostname(), 0, flags=socket.AI_CANONNAME)[0][3])
    port = cluster.primary_node.host_ports.get(AMBARI_PORT)
    server_url = 'http://{}:{}'.format(hostname, port)
    logger.info('Ambari server is now reachable at %s', server_url)

    logger.info('Starting Ambari agents ...')
    for node in cluster:
        logger.debug('Starting Ambari agent on %s ...', node.fqdn)
        node.execute('ambari-agent start', quiet=quiet)

    ambari = Ambari(server_url, username='******', password='******')

    def condition(ambari, cluster):
        cluster_hosts = {node.fqdn for node in cluster}
        ambari_hosts = {host.host_name for host in ambari.hosts}
        logger.debug('Cluster hosts: %s; Ambari hosts: %s', cluster_hosts,
                     ambari_hosts)
        return cluster_hosts == ambari_hosts

    wait_for_condition(condition=condition, condition_args=[ambari, cluster])

    service_types_to_leave = (args.include_services.upper().split(',')
                              if args.include_services else [])
    service_types_to_remove = (args.exclude_services.upper().split(',')
                               if args.exclude_services else [])
    if service_types_to_leave or service_types_to_remove:
        for service in list(ambari.clusters(DEFAULT_CLUSTER_NAME).services):
            service_name = service.service_name.upper()
            if (service_name in service_types_to_remove
                    or (service_types_to_leave
                        and service_name not in service_types_to_leave)):
                logger.info('Removing cluster service (name = %s) ...',
                            service_name)
                service.delete()

    for node in secondary_nodes[1:]:
        logger.info('Adding %s to cluster ...', node.fqdn)
        ambari.clusters(DEFAULT_CLUSTER_NAME).hosts.create(node.fqdn)
        secondary_node = ambari.clusters(DEFAULT_CLUSTER_NAME).hosts(
            secondary_nodes[0].fqdn)
        for component in secondary_node.components:
            logger.debug('Adding component (%s) to cluster on host (%s) ...',
                         component.component_name, node.fqdn)
            host_components = ambari.clusters(DEFAULT_CLUSTER_NAME).hosts(
                node.fqdn).components
            host_components.create(component.component_name).wait()

        logger.debug('Installing all registered components on host (%s) ...',
                     node.fqdn)
        ambari.clusters(DEFAULT_CLUSTER_NAME).hosts(
            node.fqdn).components.install().wait()

    logger.info('Waiting for all hosts to reach healthy state ...')

    def condition(ambari):
        health_report = ambari.clusters(DEFAULT_CLUSTER_NAME).health_report
        logger.debug('Ambari cluster health report: %s ...', health_report)
        return health_report.get('Host/host_state/HEALTHY') == len(
            list(ambari.hosts))

    wait_for_condition(condition=condition, condition_args=[ambari])

    service_names = [
        service['service_name'] for service in ambari.clusters(
            DEFAULT_CLUSTER_NAME).services.to_dict()
    ]

    if 'ATLAS' in service_names:
        logger.info('Configuring Atlas required properties ...')
        _configure_atlas(ambari,
                         args.hdp_version,
                         atlas_server_host=cluster.primary_node.fqdn)

    if 'HIVE' in service_names:
        primary_node.execute('touch /etc/hive/sys.db.created', quiet=quiet)

    logger.info('Waiting for components to be ready ...')

    def condition(ambari):
        comps = ambari.clusters(
            DEFAULT_CLUSTER_NAME).cluster.host_components.refresh()
        for comp in comps:
            if comp.state.upper() == 'UNKNOWN':
                logger.debug('Not ready with component `%s` ...',
                             comp.component_name)
                return False
        else:
            return True

    wait_for_condition(condition=condition, condition_args=[ambari])

    if not args.dont_start_cluster:
        logger.info('Starting cluster services ...')
        ambari.clusters(DEFAULT_CLUSTER_NAME).services.start().wait(
            timeout=3600)

        if 'HBASE' in service_names:
            logger.info('Starting Thrift server ...')
            if hdp_version_tuple <= (2, 0, 13, 0):
                hbase_daemon_path = '/usr/lib/hbase/bin/hbase-daemon.sh'
            else:
                hbase_daemon_path = '/usr/hdp/current/hbase-master/bin/hbase-daemon.sh'
            primary_node.execute('{} start thrift -p {} '
                                 '--infoport {}'.format(
                                     hbase_daemon_path,
                                     HBASE_THRIFT_SERVER_PORT,
                                     HBASE_THRIFT_SERVER_INFO_PORT),
                                 quiet=quiet)
Example #15
def main(args):
    if args.license_url and not args.license_credentials:
        raise Exception(
            '--license-credentials is a required argument if --license-url is provided.'
        )

    image_prefix = '{}/{}/clusterdock:mapr{}'.format(
        args.registry, args.namespace or DEFAULT_NAMESPACE, args.mapr_version)
    if args.mep_version:
        image_prefix = '{}_mep{}'.format(image_prefix, args.mep_version)
    primary_node_image = '{}_{}'.format(image_prefix, 'primary-node')
    secondary_node_image = '{}_{}'.format(image_prefix, 'secondary-node')

    node_disks = yaml.load(args.node_disks)

    # MapR-FS needs each fileserver node to have a disk allocated for it, so fail fast if the
    # node disks map is missing any nodes.
    if set(args.primary_node + args.secondary_nodes) != set(node_disks):
        raise Exception(
            'Not all nodes are accounted for in the --node-disks dictionary')

    primary_node = Node(
        hostname=args.primary_node[0],
        group='primary',
        image=primary_node_image,
        ports=[{
            MCS_SERVER_PORT: MCS_SERVER_PORT
        } if args.predictable else MCS_SERVER_PORT],
        devices=node_disks.get(args.primary_node[0]),
        # A secure cluster needs the ticket to execute the rest of the commands
        # after cluster start.
        environment=['MAPR_TICKETFILE_LOCATION=/opt/mapr/conf/mapruserticket']
        if args.secure else [])

    secondary_nodes = [
        Node(hostname=hostname,
             group='secondary',
             image=secondary_node_image,
             devices=node_disks.get(hostname))
        for hostname in args.secondary_nodes
    ]

    cluster = Cluster(primary_node, *secondary_nodes)

    if args.secure:
        secure_config_host_dir = os.path.expanduser(
            args.secure_config_directory)
        volumes = [{secure_config_host_dir: SECURE_CONFIG_CONTAINER_DIR}]
        for node in cluster.nodes:
            node.volumes.extend(volumes)

    # MapR versions 6.0.0 onwards use CentOS 7, which needs the following settings.
    mapr_version_tuple = tuple(int(i) for i in args.mapr_version.split('.'))
    if mapr_version_tuple >= EARLIEST_MAPR_VERSION_WITH_LICENSE_AND_CENTOS_7:
        for node in cluster.nodes:
            node.volumes.append({'/sys/fs/cgroup': '/sys/fs/cgroup'})
            temp_dir_name = tempfile.mkdtemp()
            logger.debug('Created temporary directory %s', temp_dir_name)
            node.volumes.append({temp_dir_name: '/run'})
    cluster.primary_node = primary_node
    cluster.start(args.network, pull_images=args.always_pull)

    logger.info('Generating new UUIDs ...')
    cluster.execute('/opt/mapr/server/mruuidgen > /opt/mapr/hostid')

    if not args.secure:
        logger.info('Configuring the cluster ...')
        for node in cluster:
            configure_command = (
                '/opt/mapr/server/configure.sh -C {0} -Z {0} -RM {0} -HS {0} '
                '-u mapr -g mapr -D {1}'.format(
                    primary_node.fqdn,
                    ','.join(node_disks.get(node.hostname))))
            node.execute("bash -c '{}'".format(configure_command))
    else:
        logger.info('Configuring native security for the cluster ...')
        configure_command = (
            '/opt/mapr/server/configure.sh -secure -genkeys -C {0} -Z {0} -RM {0} -HS {0} '
            '-u mapr -g mapr -D {1}'.format(
                primary_node.fqdn,
                ','.join(node_disks.get(primary_node.hostname))))
        source_files = [
            '{}/{}'.format(MAPR_CONFIG_DIR, file) for file in SECURE_FILES
        ]
        commands = [
            configure_command,
            'chmod 600 {}/{}'.format(MAPR_CONFIG_DIR, SSL_KEYSTORE_FILE),
            'cp -f {src} {dest_dir}'.format(
                src=' '.join(source_files),
                dest_dir=SECURE_CONFIG_CONTAINER_DIR)
        ]
        primary_node.execute(' && '.join(commands))
        for node in secondary_nodes:
            source_files = [
                '{}/{}'.format(SECURE_CONFIG_CONTAINER_DIR, file)
                for file in SECURE_FILES
            ]
            configure_command = (
                '/opt/mapr/server/configure.sh -secure -C {0} -Z {0} -RM {0} -HS {0} '
                '-u mapr -g mapr -D {1}'.format(
                    primary_node.fqdn,
                    ','.join(node_disks.get(node.hostname))))
            commands = [
                'cp -f {src} {dest_dir}'.format(src=' '.join(source_files),
                                                dest_dir=MAPR_CONFIG_DIR),
                configure_command
            ]
            node.execute(' && '.join(commands))

    logger.info('Waiting for MapR Control System server to come online ...')

    def condition(address, port):
        return socket().connect_ex((address, port)) == 0

    def success(time):
        logger.info('MapR Control System server is online after %s seconds.',
                    time)

    def failure(timeout):
        raise TimeoutError(
            'Timed out after {} seconds waiting '
            'for MapR Control System server to come online.'.format(timeout))

    wait_for_condition(
        condition=condition,
        condition_args=[primary_node.ip_address, MCS_SERVER_PORT],
        time_between_checks=3,
        timeout=180,
        success=success,
        failure=failure)
    mcs_server_host_port = primary_node.host_ports.get(MCS_SERVER_PORT)

    logger.info('Creating /apps/spark directory on %s ...',
                primary_node.hostname)
    spark_directory_command = [
        'hadoop fs -mkdir -p /apps/spark', 'hadoop fs -chmod 777 /apps/spark'
    ]
    primary_node.execute("bash -c '{}'".format(
        '; '.join(spark_directory_command)))

    logger.info('Creating MapR sample Stream named /sample-stream on %s ...',
                primary_node.hostname)
    primary_node.execute('maprcli stream create -path /sample-stream '
                         '-produceperm p -consumeperm p -topicperm p')

    if mapr_version_tuple >= EARLIEST_MAPR_VERSION_WITH_LICENSE_AND_CENTOS_7 and args.license_url:
        license_commands = [
            'curl --user {} {} > /tmp/lic'.format(args.license_credentials,
                                                  args.license_url),
            '/opt/mapr/bin/maprcli license add -license /tmp/lic -is_file true',
            'rm -rf /tmp/lic'
        ]
        logger.info('Applying license ...')
        primary_node.execute(' && '.join(license_commands))

    if not args.dont_register_gateway:
        logger.info('Registering gateway with the cluster ...')
        register_gateway_commands = [
            "cat /opt/mapr/conf/mapr-clusters.conf | egrep -o '^[^ ]* '"
            ' > /tmp/cluster-name',
            'maprcli cluster gateway set -dstcluster $(cat '
            '/tmp/cluster-name) -gateways {}'.format(primary_node.fqdn),
            'rm /tmp/cluster-name'
        ]
        primary_node.execute(' && '.join(register_gateway_commands))

    logger.info(
        'MapR Control System server is now accessible at https://%s:%s',
        getfqdn(), mcs_server_host_port)
Example #16
def main(args):
    image_prefix = '{}/{}/topology_hdp:hdp{}_ambari{}'.format(
        args.registry, args.namespace or DEFAULT_NAMESPACE, args.hdp_version,
        args.ambari_version)
    primary_node_image = '{}_{}'.format(image_prefix, 'primary-node')
    secondary_node_image = '{}_{}'.format(image_prefix, 'secondary-node')

    primary_node = Node(hostname=args.primary_node[0],
                        group='primary',
                        image=primary_node_image,
                        ports=[{
                            AMBARI_PORT: AMBARI_PORT
                        } if args.predictable else AMBARI_PORT])

    secondary_nodes = [
        Node(hostname=hostname, group='secondary', image=secondary_node_image)
        for hostname in args.secondary_nodes
    ]

    cluster = Cluster(primary_node, *secondary_nodes)
    cluster.primary_node = primary_node
    cluster.secondary_nodes = secondary_nodes
    cluster.start(args.network)

    logger.debug('Starting PostgreSQL for Ambari server ...')
    primary_node.execute('service postgresql start', quiet=not args.verbose)
    _update_node_names(cluster, quiet=not args.verbose)

    # The HDP topology uses two pre-built images ('primary' and 'secondary'). If a cluster
    # larger than 2 nodes is started, some modifications need to be made.
    if len(secondary_nodes) > 1:
        _remove_files(nodes=secondary_nodes[1:],
                      files=['/hadoop/hdfs/data/current/*'])

    logger.info('Starting Ambari server ...')
    primary_node.execute('ambari-server start', quiet=not args.verbose)

    # Docker for Mac exposes ports that can be accessed only with ``localhost:<port>`` so
    # use that instead of the hostname if the host name is ``moby``.
    hostname = 'localhost' if client.info().get(
        'Name') == 'moby' else socket.gethostname()
    port = cluster.primary_node.host_ports.get(AMBARI_PORT)
    server_url = 'http://{}:{}'.format(hostname, port)
    logger.info('Ambari server is now reachable at %s', server_url)

    logger.info('Starting Ambari agents ...')
    for node in cluster:
        logger.debug('Starting Ambari agent on %s ...', node.fqdn)
        node.execute('ambari-agent start', quiet=not args.verbose)

    ambari = Ambari(server_url, username='******', password='******')

    def condition(ambari, cluster):
        cluster_hosts = {node.fqdn for node in cluster}
        ambari_hosts = {host.host_name for host in ambari.hosts}
        logger.debug('Cluster hosts: %s; Ambari hosts: %s', cluster_hosts,
                     ambari_hosts)
        return cluster_hosts == ambari_hosts

    wait_for_condition(condition=condition, condition_args=[ambari, cluster])

    for node in secondary_nodes[1:]:
        logger.info('Adding %s to cluster ...', node.fqdn)
        ambari.clusters('cluster').hosts.create(node.fqdn)
        for component in ambari.clusters('cluster').hosts(
                secondary_nodes[0].fqdn).components:
            logger.debug('Adding component (%s) to cluster on host (%s) ...',
                         component.component_name, node.fqdn)
            host_components = ambari.clusters('cluster').hosts(
                node.fqdn).components
            host_components.create(component.component_name).wait()

        logger.debug('Installing all registered components on host (%s) ...',
                     node.fqdn)
        ambari.clusters('cluster').hosts(node.fqdn).components.install().wait()

    if not args.dont_start_cluster:
        logger.debug(
            'Waiting for all hosts to reach healthy state before starting cluster ...'
        )

        def condition(ambari):
            health_report = ambari.clusters('cluster').health_report
            logger.debug('Ambari cluster health report: %s ...', health_report)
            return health_report.get('Host/host_state/HEALTHY') == len(
                list(ambari.hosts))

        wait_for_condition(condition=condition, condition_args=[ambari])

        logger.info('Starting cluster services ...')
        ambari.clusters('cluster').services.start().wait()
Example #17
def main(args):
    quiet = not args.verbose

    node_image = '{}/{}/topology_apache_pulsar:pulsar-{}'.format(args.registry,
                                                                 args.namespace or DEFAULT_NAMESPACE,
                                                                 args.pulsar_version)
    ports = [{WEB_SERVICE_PORT: WEB_SERVICE_PORT} if args.predictable else WEB_SERVICE_PORT,
             {WEB_SERVICE_TLS_PORT: WEB_SERVICE_TLS_PORT} if args.predictable else WEB_SERVICE_TLS_PORT,
             {BROKER_SERVICE_PORT: BROKER_SERVICE_PORT} if args.predictable else BROKER_SERVICE_PORT,
             {BROKER_SERVICE_TLS_PORT: BROKER_SERVICE_TLS_PORT} if args.predictable else BROKER_SERVICE_TLS_PORT]

    clusterdock_config_host_dir = os.path.realpath(os.path.expanduser(args.clusterdock_config_directory))
    volumes = [{clusterdock_config_host_dir: CLUSTERDOCK_CLIENT_CONTAINER_DIR}]

    proxy_node = Node(hostname=args.proxy_node_name,
                      group='proxy',
                      image=node_image,
                      ports=ports,
                      volumes=volumes)
    broker_nodes = [Node(hostname=hostname, group='broker', image=node_image, volumes=volumes)
                    for hostname in args.broker_nodes]
    zk_nodes = [Node(hostname=hostname, group='zookeeper', image=node_image, volumes=volumes)
                for hostname in args.zookeeper_nodes]
    nodes = [proxy_node] + broker_nodes + zk_nodes
    cluster = Cluster(*nodes)
    cluster.start(args.network)

    logger.info('Starting pulsar cluster (%s) version %s ...', args.pulsar_cluster_name, args.pulsar_version)

    # zookeeper
    for idx, node in enumerate(zk_nodes, start=1):
        zookeeper_conf = node.get_file(ZOOKEEPER_CONF)
        zookeeper_properties = PropertiesFile.loads(zookeeper_conf)
        for srvidx, srvnode in enumerate(zk_nodes, start=1):
            zookeeper_properties['server.{}'.format(srvidx)] = '{}.{}:2888:3888'.format(srvnode.hostname,
                                                                                        cluster.network)
        node.put_file(ZOOKEEPER_CONF, PropertiesFile.dumps(zookeeper_properties))
        zookeeper_commands = [
            'mkdir -p {}/data/zookeeper'.format(PULSAR_HOME),
            'echo {} > {}/data/zookeeper/myid'.format(idx, PULSAR_HOME),
            '{}/bin/pulsar-daemon start zookeeper'.format(PULSAR_HOME)
        ]
        execute_node_command(node, ' && '.join(zookeeper_commands), quiet, 'Zookeeper start failed')

    web_service_url = 'http://{}.{}:{}'.format(proxy_node.hostname, cluster.network, WEB_SERVICE_PORT)
    web_service_url_tls = 'https://{}.{}:{}'.format(proxy_node.hostname, cluster.network, WEB_SERVICE_TLS_PORT)
    broker_service_url = 'pulsar://{}.{}:{}'.format(proxy_node.hostname, cluster.network, BROKER_SERVICE_PORT)
    broker_service_url_tls = 'pulsar+ssl://{}.{}:{}'.format(proxy_node.hostname, cluster.network,
                                                            BROKER_SERVICE_TLS_PORT)

    init_cluster_cmd = ('{home}/bin/pulsar initialize-cluster-metadata'
                        ' --cluster {cluster_name}'
                        ' --zookeeper {zkhostname}.{network}:2181'
                        ' --configuration-store {zkhostname}.{network}:2181'
                        ' --web-service-url {web_service_url}'
                        ' --web-service-url-tls {web_service_url_tls}'
                        ' --broker-service-url {broker_service_url}'
                        ' --broker-service-url-tls {broker_service_url_tls}'
                        .format(home=PULSAR_HOME,
                                cluster_name=args.pulsar_cluster_name,
                                zkhostname=zk_nodes[0].hostname,
                                hostname=proxy_node.hostname,
                                network=cluster.network,
                                web_service_url=web_service_url,
                                web_service_url_tls=web_service_url_tls,
                                broker_service_url=broker_service_url,
                                broker_service_url_tls=broker_service_url_tls))
    execute_node_command(zk_nodes[0], init_cluster_cmd, quiet, 'Cluster initialization failed')

    zk_servers_conf = ','.join(['{}.{}:2181'.format(node.hostname, cluster.network) for node in zk_nodes])

    # bookkeepers
    for node in broker_nodes:
        bookkeeper_conf = node.get_file(BOOKKEEPER_CONF)
        bookkeeper_properties = PropertiesFile.loads(bookkeeper_conf)
        bookkeeper_properties['zkServers'] = zk_servers_conf
        node.put_file(BOOKKEEPER_CONF, PropertiesFile.dumps(bookkeeper_properties))

        execute_node_command(node, '{}/bin/pulsar-daemon start bookie'.format(PULSAR_HOME), quiet,
                             'Bookkeeper start failed')
        execute_node_command(node, '{}/bin/bookkeeper shell bookiesanity'.format(PULSAR_HOME), quiet,
                             'Book keeper sanity check failed')

    # brokers
    for node in broker_nodes:
        broker_conf = node.get_file(BROKER_CONF)
        broker_properties = PropertiesFile.loads(broker_conf)
        broker_properties.update({'zookeeperServers': zk_servers_conf,
                                  'configurationStoreServers': zk_servers_conf,
                                  'clusterName': args.pulsar_cluster_name})
        node.put_file(BROKER_CONF, PropertiesFile.dumps(broker_properties))

    # proxy
    proxy_conf = proxy_node.get_file(PROXY_CONF)
    proxy_properties = PropertiesFile.loads(proxy_conf)
    proxy_properties.update({'zookeeperServers': zk_servers_conf,
                             'configurationStoreServers': zk_servers_conf,
                             'httpNumThreads': '8'})
    proxy_node.put_file(PROXY_CONF, PropertiesFile.dumps(proxy_properties))

    # TLS
    execute_node_command(proxy_node, 'rm -rf {}'.format(TLS_DIR), quiet=quiet)
    if args.tls:
        setup_commands = [
            'mkdir -p {}'.format(TLS_CLIENT_DIR),
            'wget -P {} {}'.format(TLS_DIR, TLS_CONF_URL),
            'mkdir -p {dir}/certs {dir}/crl {dir}/newcerts {dir}/private'.format(dir=TLS_DIR),
            'chmod 700 {}/private'.format(TLS_DIR),
            'touch {}/index.txt'.format(TLS_DIR),
            'echo "unique_subject = no" > {}/index.txt.attr'.format(TLS_DIR),
            'echo 1000 > {}/serial'.format(TLS_DIR),
        ]
        execute_node_command(proxy_node, ' && '.join(setup_commands), quiet, 'TLS system setup failed')

        ca_auth_commands = [
            'export CA_HOME={}'.format(TLS_DIR),
            'openssl genrsa -out {dir}/private/ca.key.pem 4096'.format(dir=TLS_DIR),
            'chmod 400 {}/private/ca.key.pem'.format(TLS_DIR),
            ('openssl req -config {dir}/openssl.cnf -key {dir}/private/ca.key.pem'
             ' -new -x509 -days 7300 -sha256 -extensions v3_ca -out {dir}/certs/ca.cert.pem'
             ' -subj "/C=US/ST=California/L=Palo Alto/O=My company/CN=*"').format(dir=TLS_DIR),
            'chmod 444 {}/certs/ca.cert.pem'.format(TLS_DIR),
            'cp {}/certs/ca.cert.pem {}'.format(TLS_DIR, TLS_CLIENT_DIR)
        ]
        execute_node_command(proxy_node, ' && '.join(ca_auth_commands), quiet,
                             'Certificate authority creation failed')

        server_cert_commands = [
            'export CA_HOME={}'.format(TLS_DIR),
            'openssl genrsa -out {}/broker.key.pem 2048'.format(TLS_DIR),
            ('openssl pkcs8 -topk8 -inform PEM -outform PEM -in {dir}/broker.key.pem'
             ' -out {dir}/broker.key-pk8.pem -nocrypt').format(dir=TLS_DIR),
            # The common name (CN) needs to be *.<nw> so that hosts on <nw> can access the Pulsar cluster
            ('openssl req -config {dir}/openssl.cnf -key {dir}/broker.key.pem -new -sha256 -out {dir}/broker.csr.pem'
             ' -subj "/C=US/ST=California/L=Palo Alto/O=My company/CN=*.{nw}"').format(dir=TLS_DIR, nw=cluster.network),
            ('openssl ca -batch -config {dir}/openssl.cnf -extensions server_cert -days 1000 -notext -md sha256'
             ' -in {dir}/broker.csr.pem -out {dir}/broker.cert.pem').format(dir=TLS_DIR)
        ]
        execute_node_command(proxy_node, ' && '.join(server_cert_commands), quiet,
                             'Broker certificate creation failed')

        for node in broker_nodes:
            broker_conf = node.get_file(BROKER_CONF)
            broker_properties = PropertiesFile.loads(broker_conf)
            broker_properties.update({'brokerServicePortTls': '6651',
                                      'tlsEnabled': 'true',
                                      'tlsCertificateFilePath': '{}/broker.cert.pem'.format(TLS_DIR),
                                      'tlsKeyFilePath': '{}/broker.key-pk8.pem'.format(TLS_DIR),
                                      'tlsTrustCertsFilePath': '{}/certs/ca.cert.pem'.format(TLS_DIR),
                                      'webServicePortTls': '8443'})
            node.put_file(BROKER_CONF, PropertiesFile.dumps(broker_properties))

        proxy_conf = proxy_node.get_file(PROXY_CONF)
        proxy_properties = PropertiesFile.loads(proxy_conf)
        proxy_properties.update({'servicePortTls': '6651',
                                 'tlsEnabledInProxy': 'true',
                                 'tlsCertificateFilePath': '{}/broker.cert.pem'.format(TLS_DIR),
                                 'tlsKeyFilePath': '{}/broker.key-pk8.pem'.format(TLS_DIR),
                                 'tlsTrustCertsFilePath': '{}/certs/ca.cert.pem'.format(TLS_DIR),
                                 'tlsEnabledWithBroker': 'true',
                                 'brokerClientTrustCertsFilePath': '{}/certs/ca.cert.pem'.format(TLS_DIR),
                                 'webServicePortTls': '8443'})
        proxy_node.put_file(PROXY_CONF, PropertiesFile.dumps(proxy_properties))

        for node in nodes:
            client_conf = node.get_file(CLIENT_CONF)
            client_properties = PropertiesFile.loads(client_conf)
            client_properties.update({'webServiceUrl': web_service_url_tls,
                                      'brokerServiceUrl': broker_service_url_tls,
                                      'useTls': 'true',
                                      'tlsAllowInsecureConnection': 'false',
                                      'tlsTrustCertsFilePath': '{}/certs/ca.cert.pem'.format(TLS_DIR)})
            node.put_file(CLIENT_CONF, PropertiesFile.dumps(client_properties))

        # TLS auth
        if args.tls == 'authentication':
            client_cert_commands = [
                'export CA_HOME={}'.format(TLS_DIR),
                'openssl genrsa -out {}/admin.key.pem 2048'.format(TLS_DIR),
                ('openssl pkcs8 -topk8 -inform PEM -outform PEM -in {dir}/admin.key.pem'
                 ' -out {dir}/admin.key-pk8.pem -nocrypt').format(dir=TLS_DIR),
                # The common name (CN) needs to be admin, the same as the user principal in Pulsar
                ('openssl req -config {dir}/openssl.cnf -key {dir}/admin.key.pem -new -sha256 -out {dir}/admin.csr.pem'
                 ' -subj "/C=US/ST=California/L=Palo Alto/O=My company/CN=admin"').format(dir=TLS_DIR),
                ('openssl ca -batch -config {dir}/openssl.cnf -extensions usr_cert -days 1000 -notext -md sha256'
                 ' -in {dir}/admin.csr.pem -out {dir}/admin.cert.pem').format(dir=TLS_DIR),
                'mv {}/admin.* {}'.format(TLS_DIR, TLS_CLIENT_DIR)
            ]
            execute_node_command(proxy_node, ' && '.join(client_cert_commands), quiet,
                                 'Client certificate creation failed')

            proxy_cert_commands = [
                'export CA_HOME={}'.format(TLS_DIR),
                'openssl genrsa -out {}/proxy.key.pem 2048'.format(TLS_DIR),
                ('openssl pkcs8 -topk8 -inform PEM -outform PEM -in {dir}/proxy.key.pem'
                 ' -out {dir}/proxy.key-pk8.pem -nocrypt').format(dir=TLS_DIR),
                # The common name (CN) needs to be proxyadmin, the same as the proxy principal in Pulsar
                ('openssl req -config {dir}/openssl.cnf -key {dir}/proxy.key.pem -new -sha256 -out {dir}/proxy.csr.pem'
                 ' -subj "/C=US/ST=California/L=Palo Alto/O=My company/CN=proxyadmin"').format(dir=TLS_DIR),
                ('openssl ca -batch -config {dir}/openssl.cnf -extensions usr_cert -days 1000 -notext -md sha256'
                 ' -in {dir}/proxy.csr.pem -out {dir}/proxy.cert.pem').format(dir=TLS_DIR)
            ]
            execute_node_command(proxy_node, ' && '.join(proxy_cert_commands), quiet,
                                 'Proxy certificate creation failed')

            for node in broker_nodes:
                broker_conf = node.get_file(BROKER_CONF)
                broker_properties = PropertiesFile.loads(broker_conf)
                broker_properties.update({
                    'authenticationEnabled': 'true',
                    'authenticationProviders': 'org.apache.pulsar.broker.authentication.AuthenticationProviderTls',
                    'proxyRoles': 'proxyadmin',
                    'superUserRoles': 'proxyadmin,admin'})
                node.put_file(BROKER_CONF, PropertiesFile.dumps(broker_properties))

            proxy_conf = proxy_node.get_file(PROXY_CONF)
            proxy_properties = PropertiesFile.loads(proxy_conf)
            proxy_properties.update({
                'authenticationEnabled': 'true',
                'authenticationProviders': 'org.apache.pulsar.broker.authentication.AuthenticationProviderTls',
                'brokerClientAuthenticationPlugin': 'org.apache.pulsar.client.impl.auth.AuthenticationTls',
                'brokerClientAuthenticationParameters': ('tlsCertFile:{dir}/proxy.cert.pem,'
                                                         'tlsKeyFile:{dir}/proxy.key-pk8.pem').format(dir=TLS_DIR),
                'superUserRoles': 'admin'})
            proxy_node.put_file(PROXY_CONF, PropertiesFile.dumps(proxy_properties))

            for node in nodes:
                client_conf = node.get_file(CLIENT_CONF)
                client_properties = PropertiesFile.loads(client_conf)
                client_properties.update({'authPlugin': 'org.apache.pulsar.client.impl.auth.AuthenticationTls',
                                          'authParams': ('tlsCertFile:{dir}/admin.cert.pem,tlsKeyFile:'
                                                         '{dir}/admin.key-pk8.pem').format(dir=TLS_CLIENT_DIR)})
                node.put_file(CLIENT_CONF, PropertiesFile.dumps(client_properties))

    # start broker nodes and proxy node
    for node in broker_nodes:
        execute_node_command(node, '{}/bin/pulsar-daemon start broker'.format(PULSAR_HOME), quiet,
                             'Broker start failed')

    out_file = '{}/logs/pulsar-proxy-{}.{}.out'.format(PULSAR_HOME, proxy_node.hostname, cluster.network)
    execute_node_command(proxy_node, 'mkdir -p {}/logs'.format(PULSAR_HOME), quiet)
    execute_node_command(proxy_node,
                         'nohup {}/bin/pulsar proxy > "{}" 2>&1 < /dev/null &'.format(PULSAR_HOME, out_file),
                         quiet, 'Proxy start failed')

    logger.info('Performing health check on Pulsar cluster (%s) ...', args.pulsar_cluster_name)

    def condition(node, cluster_name, command):
        command_status = node.execute(command, quiet=True)
        return (command_status.exit_code == 0
                and command_status.output.splitlines()[-1].strip().strip('"') == cluster_name)

    wait_for_condition(condition=condition,
                       condition_args=[proxy_node, args.pulsar_cluster_name,
                                       '{}/bin/pulsar-admin clusters list'.format(PULSAR_HOME)])

    logger.info('Pulsar cluster (%s) can be reached on docker network (%s):\n%s \n%s',
                args.pulsar_cluster_name, cluster.network,
                textwrap.indent('Web service URL: {}'.format(web_service_url), prefix='    '),
                textwrap.indent('Broker service URL: {}'.format(broker_service_url), prefix='    '))
    logger.log(logging.INFO if args.tls else -1,
               'Pulsar cluster (%s) can be reached securely on docker network (%s):\n%s \n%s',
               args.pulsar_cluster_name, cluster.network,
               textwrap.indent('Secure web service URL: {}'.format(web_service_url_tls), prefix='    '),
               textwrap.indent('Secure broker service URL: {}'.format(broker_service_url_tls), prefix='    '))
Example #18
def main(args):
    quiet = not args.verbose
    print_topology_meta(args.topology)

    models.LOCALTIME_MOUNT = False
    models.PRIVILEGED_CONTAINER = True  # 'privileged' containers are needed to have systemd work with no issues

    os_major_version = (args.operating_system or
                        DEFAULT_OPERATING_SYSTEM)[6]  # always assume 'centosX'
    image = '{}/topology_nodebase:{}'.format(
        defaults['DEFAULT_REPOSITORY'], args.operating_system
        or DEFAULT_OPERATING_SYSTEM)
    primary_node = models.Node(hostname='node-1',
                               group='nodes',
                               image=image,
                               ports=[{
                                   AMBARI_PORT: AMBARI_PORT
                               }])
    secondary_node = models.Node(hostname='node-2', group='nodes', image=image)
    cluster = models.Cluster(primary_node, secondary_node)
    cluster.start(args.network)

    hdp_version_tuple = version_tuple(args.hdp_version)
    stack_version = '{}.{}'.format(hdp_version_tuple[0], hdp_version_tuple[1])
    stack_version_tuple = (hdp_version_tuple[0], hdp_version_tuple[1])
    DEFAULT_CLUSTER_HOST_MAPPING[0]['hosts'][0]['fqdn'] = primary_node.fqdn
    DEFAULT_CLUSTER_HOST_MAPPING[1]['hosts'][0]['fqdn'] = secondary_node.fqdn

    host_groups = DEFAULT_BASE_HOST_GROUPS
    if not args.bare:
        if hdp_version_tuple <= (2, 0, 13, 0):
            host_groups[0]['components'].extend(
                EXTRA_HOST_GROUPS_2_0_13_0[0]['components'])
            host_groups[1]['components'].extend(
                EXTRA_HOST_GROUPS_2_0_13_0[1]['components'])
        elif hdp_version_tuple <= (2, 4, 0, 0):
            host_groups[0]['components'].extend(
                EXTRA_HOST_GROUPS_2_4_0_0[0]['components'])
            host_groups[1]['components'].extend(
                EXTRA_HOST_GROUPS_2_4_0_0[1]['components'])
        elif hdp_version_tuple <= (2, 6, 4, 0):
            host_groups[0]['components'].extend(
                EXTRA_HOST_GROUPS_2_6_4_0[0]['components'])
            host_groups[1]['components'].extend(
                EXTRA_HOST_GROUPS_2_6_4_0[1]['components'])
        elif hdp_version_tuple <= (3, 1, 0, 0):
            host_groups[0]['components'].extend(
                EXTRA_HOST_GROUPS_3_1_0_0[0]['components'])
            host_groups[1]['components'].extend(
                EXTRA_HOST_GROUPS_3_1_0_0[1]['components'])
        else:
            host_groups[0]['components'].extend(
                DEFAULT_EXTRA_HOST_GROUPS[0]['components'])
            host_groups[1]['components'].extend(
                DEFAULT_EXTRA_HOST_GROUPS[1]['components'])

    # APP_TIMELINE_SERVER is not applicable for versions <= 2.0.13.0.
    if hdp_version_tuple <= (2, 0, 13, 0):
        host_groups[0]['components'] = list(
            filter(lambda x: x.get('name') != 'APP_TIMELINE_SERVER',
                   host_groups[0]['components']))

    repo_url_host = 'http://public-repo-1.hortonworks.com'
    ambari_repo_url = ('{}/ambari/centos{}/{}.x/updates/{}/'
                       'ambari.repo'.format(repo_url_host, os_major_version,
                                            args.ambari_version[0],
                                            args.ambari_version))
    hdp_repo_url = ('{}/HDP/centos{}/{}.x/updates/{}'.format(
        repo_url_host, os_major_version, args.hdp_version[0],
        args.hdp_version))
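    # For illustration (assumed values): with os_major_version '7', ambari_version
    # '2.6.1.5' and hdp_version '2.6.4.0', these expand to
    # http://public-repo-1.hortonworks.com/ambari/centos7/2.x/updates/2.6.1.5/ambari.repo and
    # http://public-repo-1.hortonworks.com/HDP/centos7/2.x/updates/2.6.4.0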

    for node in cluster:
        node.execute('wget -nv {} -O /etc/yum.repos.d/ambari.repo'.format(
            ambari_repo_url),
                     quiet=quiet)

    logger.info('Installing Ambari server and agents ...')
    primary_node.execute('yum -y install ambari-server', quiet=quiet)
    primary_node.execute('ambari-server setup -v -s', quiet=quiet)
    primary_node.execute('ambari-server start', quiet=quiet)

    for node in cluster:
        node.execute('yum -y install ambari-agent', quiet=quiet)
        ambari_agent_config = node.get_file(AMBARI_AGENT_CONFIG_FILE_PATH)
        node.put_file(
            AMBARI_AGENT_CONFIG_FILE_PATH,
            re.sub(r'(hostname)=.*', r'\1={}'.format(primary_node.fqdn),
                   ambari_agent_config))
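        # For illustration: the substitution above rewrites a line such as
        # 'hostname=localhost' in the agent config (default value assumed) to
        # 'hostname=node-1', so every agent registers with the Ambari server on
        # the primary node.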
        node.execute('ambari-agent start', quiet=quiet)

    mysql_config_commands = [
        ('wget -nv -O /tmp/mysql-connector-java.tar.gz '
         'https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-5.1.45.tar.gz'
         ), 'gzip -d /tmp/mysql-connector-java.tar.gz',
        'tar -xf /tmp/mysql-connector-java.tar -C /tmp',
        ('cp /tmp/mysql-connector-java-5.1.45/mysql-connector-java-5.1.45-bin.jar '
         '/tmp/mysql-connector-java.jar'),
        'ambari-server setup --jdbc-db=mysql --jdbc-driver=/tmp/mysql-connector-java.jar',
        'rm -rf /tmp/mysql-connector-java*'
    ]
    primary_node.execute(' && '.join(mysql_config_commands), quiet=quiet)

    # Docker for Mac exposes ports that can be reached only via ``localhost:<port>``,
    # so use that instead of the host name when the Docker host reports itself as ``moby``.
    hostname = ('localhost' if models.client.info().get('Name') == 'moby' else
                socket.getaddrinfo(
                    socket.gethostname(), 0, flags=socket.AI_CANONNAME)[0][3])
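    # getaddrinfo(..., AI_CANONNAME)[0][3] is the canonical host name of the machine
    # running Docker, so the printed URL is reachable from outside the containers.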
    port = primary_node.host_ports.get(AMBARI_PORT)
    server_url = 'http://{}:{}'.format(hostname, port)
    logger.info('Ambari server is now reachable at %s', server_url)

    ambari = Ambari(server_url,
                    username=DEFAULT_AMBARI_USERNAME,
                    password=DEFAULT_AMBARI_PASSWORD)

    logger.info('Waiting for all hosts to be visible in Ambari ...')

    def condition(ambari, cluster):
        cluster_hosts = {node.fqdn for node in cluster}
        ambari_hosts = {host.host_name for host in ambari.hosts}
        logger.debug('Cluster hosts: %s; Ambari hosts: %s', cluster_hosts,
                     ambari_hosts)
        return cluster_hosts == ambari_hosts

    wait_for_condition(condition=condition, condition_args=[ambari, cluster])

    logger.info('Updating install repo to use %s HDP version ...',
                args.hdp_version)
    # based on the release notes at https://bit.ly/2R06NKp
    if stack_version_tuple >= (2, 6):
        url = join_url_parts(hdp_repo_url, 'build.id')
        response = requests.get(url)
        response.raise_for_status()
        build_number = next(
            (int(item.split(':')[1].strip())
             for item in response.text.split('\n') if 'BUILD_NUMBER' in item),
            None)
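        # For illustration: build.id is assumed to contain a line such as
        # 'BUILD_NUMBER: 50', from which 50 is extracted as an int above.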
        if build_number is None:
            raise Exception('Could not determine build number as required for repo '
                            'setting. Build data found: {}'.format(response.text))

        # version_definitions not yet supported by Ambari client library - a TODO
        hdp_repo_version = '{}-{}'.format(args.hdp_version, build_number)
        version_definition = {
            'VersionDefinition': {
                'version_url':
                '{}/HDP-{}.xml'.format(hdp_repo_url, hdp_repo_version)
            }
        }
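        # For illustration (assumed values): with hdp_version '2.6.4.0' and build
        # number 50, version_url would be
        # http://public-repo-1.hortonworks.com/HDP/centos7/2.x/updates/2.6.4.0/HDP-2.6.4.0-50.xml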
        url = join_url_parts(server_url, 'api', 'v1', 'version_definitions')
        data = json.dumps(version_definition)
        response = requests.post(
            url,
            data=data,
            auth=(DEFAULT_AMBARI_USERNAME, DEFAULT_AMBARI_PASSWORD),
            headers={'X-Requested-By': 'topology_hdp build'})
        response.raise_for_status()
    else:
        hdp_os = ambari.stacks('HDP').versions(
            stack_version).operating_systems('redhat6')
        hdp_os.repositories('HDP-{}'.format(stack_version)).update(
            base_url=hdp_repo_url, verify_base_url=False)
        hdp_repo_version = None
        build_number = None

    logger.info('Creating `cluster` with pre-defined components ...')
    ambari.blueprints('cluster').create(blueprint_name='cluster',
                                        stack_version=stack_version,
                                        stack_name='HDP',
                                        host_groups=host_groups)

    logger.info('Installing cluster components ...')
    hdp_cluster = ambari.clusters('cluster')
    # The INSTALL_ONLY provision action is not applicable for versions <= 2.0.13.0;
    # for those versions, cluster creation installs and starts the services.
    if hdp_version_tuple <= (2, 0, 13, 0):
        hdp_cluster = hdp_cluster.create(
            blueprint='cluster',
            default_password='******',
            host_groups=DEFAULT_CLUSTER_HOST_MAPPING)
    elif hdp_repo_version:
        hdp_cluster = hdp_cluster.create(
            blueprint='cluster',
            default_password='******',
            repository_version=hdp_repo_version,
            host_groups=DEFAULT_CLUSTER_HOST_MAPPING,
            provision_action='INSTALL_ONLY')
    else:
        hdp_cluster = hdp_cluster.create(
            blueprint='cluster',
            default_password='******',
            host_groups=DEFAULT_CLUSTER_HOST_MAPPING,
            provision_action='INSTALL_ONLY')

    # Some versions of Ambari report an incorrect status immediately after the
    # cluster is created, so allow some time before polling.
    time.sleep(30)
    hdp_cluster.wait(timeout=5400, interval=30)

    logger.info('Waiting for all hosts to reach healthy state ...')

    def condition(ambari):
        health_report = hdp_cluster.health_report
        logger.debug('Ambari cluster health report: %s ...', health_report)
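        # health_report is assumed to be a dict like
        # {'Host/host_state/HEALTHY': 2, 'Host/host_state/UNHEALTHY': 0, ...};
        # the cluster is ready once every known host is counted HEALTHY.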
        return health_report.get('Host/host_state/HEALTHY') == len(
            list(ambari.hosts))

    wait_for_condition(condition=condition, condition_args=[ambari])

    logger.info('Waiting for components to be verified ...')

    def condition(ambari):
        comps = hdp_cluster.cluster.host_components.refresh()
        for comp in comps:
            if comp.state.upper() == 'UNKNOWN':
                logger.debug('Not ready with component `%s` ...',
                             comp.component_name)
                return False
        return True

    wait_for_condition(condition=condition, condition_args=[ambari])

    hdp_services_state = set(service['state']
                             for service in hdp_cluster.services.to_dict())
    if 'STARTED' in hdp_services_state or 'STARTING' in hdp_services_state:
        logger.info('Ambari task queued to stop services ...')
        hdp_cluster.cluster.services.stop().wait()

    logger.info('Stopping Ambari before committing the Docker images ...')
    for node in cluster:
        node.execute('ambari-agent stop', quiet=quiet)

    primary_node.execute('ambari-server stop', quiet=quiet)
    primary_node.execute('service postgresql stop', quiet=quiet)

    for node in cluster:
        node.execute('; '.join(
            ['yum clean all',
             'cat /dev/null > ~/.bash_history && history -c']),
                     quiet=quiet)

    repository = '{}/topology_hdp'.format(args.repository
                                          or defaults['DEFAULT_REPOSITORY'])
    tag_prefix = 'hdp{}_ambari{}'.format(args.hdp_version, args.ambari_version)
    primary_node_tag = '{}_{}'.format(tag_prefix, 'primary-node')
    secondary_node_tag = '{}_{}'.format(tag_prefix, 'secondary-node')

    logger.info('Committing the primary node container as %s %s',
                primary_node_tag,
                ('and pushing its image to {} ...'.format(repository)
                 if args.push else '...'))
    primary_node.commit(repository=repository,
                        tag=primary_node_tag,
                        push=args.push)
    logger.info('Committing the secondary node container as %s %s',
                secondary_node_tag,
                ('and pushing its image to {} ...'.format(repository)
                 if args.push else '...'))
    secondary_node.commit(repository=repository,
                          tag=secondary_node_tag,
                          push=args.push)

    if args.retain:
        logger.info('Starting Ambari ...')
        primary_node.execute('service postgresql start', quiet=quiet)
        primary_node.execute('ambari-server start', quiet=quiet)
        for node in cluster:
            node.execute('ambari-agent start', quiet=quiet)
    else:
        logger.info('Removing the containers ...')
        primary_node.stop()
        secondary_node.stop()
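
# The version_tuple helper used above is not shown in this example. A minimal
# sketch under that assumption: it splits a dotted version string into a tuple
# of ints so versions can be compared numerically.
def version_tuple(version):
    """Convert a dotted version string such as '2.6.4.0' into a tuple of ints."""
    return tuple(int(part) for part in str(version).split('.'))

# e.g. version_tuple('2.6.4.0') == (2, 6, 4, 0), so comparisons such as
# version_tuple(args.hdp_version) <= (2, 6, 4, 0) behave as intended.
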
def main(args):
    quiet = not args.verbose

    # Image name
    image = '{}/{}/topology_confluent_schema_registry:schema_registry-{}'.format(args.registry,
                                                                                 args.namespace or DEFAULT_NAMESPACE,
                                                                                 args.confluent_version)

    # Nodes in the Kafka cluster
    nodes = [Node(hostname=hostname,
                  group='brokers',
                  ports=[{REST_PORT: REST_PORT}],
                  image=image)
             for hostname in args.nodes]

    cluster = Cluster(*nodes)
    cluster.start(args.network, pull_images=args.always_pull)

    # Create distributed zookeeper configuration
    zookeeper_config = ['tickTime=2000',
                        'dataDir=/zookeeper',
                        'clientPort=2181',
                        'initLimit=5',
                        'syncLimit=2']

    for idx, node in enumerate(cluster):
        zookeeper_config.append('server.{}={}:2888:3888'.format(idx, node.hostname))
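
    # For illustration: with three nodes (hostnames assumed) this appends lines like
    # 'server.0=node-1:2888:3888', 'server.1=node-2:2888:3888', 'server.2=node-3:2888:3888',
    # which, together with each node's /zookeeper/myid, define the quorum.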

    # Start all zookeepers
    for idx, node in enumerate(cluster):
        logger.info('Starting Zookeeper on node %s', node.hostname)
        node.execute('mkdir -p /zookeeper')
        node.put_file('/zookeeper/myid', str(idx))
        node.put_file('/zookeeper.properties', '\n'.join(zookeeper_config))
        node.execute('/start_zookeeper &', detach=True)

    # Validate that ZooKeeper is alive from each node
    for node in cluster:
        logger.info('Validating Zookeeper on node %s', node.hostname)
        wait_for_condition(condition=validate_zookeeper,
                           condition_args=[node, quiet],
                           time_between_checks=3,
                           timeout=60,
                           success=success,
                           failure=failure)

    # Start all brokers
    for idx, node in enumerate(cluster):
        logger.info('Starting Kafka on node %s', node.hostname)

        kafka_config = node.get_file('/confluent/etc/kafka/server.properties')
        kafka_config = kafka_config.replace('broker.id=0', 'broker.id={}'.format(idx))
        node.put_file('/kafka.properties', kafka_config)

        node.execute('/start_kafka &', detach=True)

    # Verify that all Kafka brokers are up
    logger.info('Waiting for all brokers to register in ZooKeeper')
    wait_for_condition(condition=validate_kafka,
                       condition_args=[nodes[0], len(nodes), quiet],
                       time_between_checks=3,
                       timeout=60,
                       success=success,
                       failure=failure)

    # Start schema registry on all nodes
    for node in cluster:
        logger.info('Starting Schema Registry on node %s', node.hostname)
        node.execute('/start_schema_registry &', detach=True)
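
# The validate_zookeeper and validate_kafka callables (and the success/failure
# callbacks) passed to wait_for_condition above are not shown in this example.
# Minimal sketches under assumed paths and ports (ZooKeeper on 2181, nc available
# in the image, the Confluent tools under /confluent/bin); the real helpers may differ.
def validate_zookeeper(node, quiet):
    """Return True if the node's ZooKeeper answers the 'ruok' four-letter word."""
    command_status = node.execute('echo ruok | nc localhost 2181', quiet=quiet)
    return command_status.exit_code == 0 and 'imok' in command_status.output


def validate_kafka(node, expected_broker_count, quiet):
    """Return True once all expected broker ids are registered in ZooKeeper."""
    command_status = node.execute(
        '/confluent/bin/zookeeper-shell localhost:2181 ls /brokers/ids', quiet=quiet)
    if command_status.exit_code != 0:
        return False
    last_line = command_status.output.splitlines()[-1].strip()  # e.g. '[0, 1, 2]'
    if not (last_line.startswith('[') and last_line.endswith(']')):
        return False
    broker_ids = [part for part in last_line[1:-1].split(',') if part.strip()]
    return len(broker_ids) >= expected_broker_count


def success(time):
    logger.debug('Validated in %s seconds.', time)


def failure(timeout):
    raise TimeoutError('Timed out after {} seconds.'.format(timeout))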