Example #1
# Imports assumed by this example (the snippet omits them). The other
# Azure examples below need essentially the same set, plus `requests` and
# `contextlib.closing` where they poll the DC/OS UI; the azure import
# paths here match the pre-1.0 SDK layout this code targets.
import os
import random
import sys
import traceback
import uuid

import azure.common.credentials
from azure.mgmt.network import NetworkManagementClient
from azure.mgmt.resource.resources import ResourceManagementClient
from azure.mgmt.resource.resources.models import (
    DeploymentMode, DeploymentProperties, ResourceGroup, TemplateLink)
from retrying import retry

import pkgpanda.util
# validate_env, get_env_params, get_test_config, get_value, Tunnel and
# integration_test come from the surrounding test harness (not shown).


def main():
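    """Deploy DC/OS to Azure from an ARM template, run the integration
    tests over SSH, and tear down the resource group afterwards unless
    AZURE_CLEANUP is set to 'false'.
    """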
    validate_env()
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # tenant_id = os.environ.get('AZURE_TENANT_ID')
    # client_id = os.environ.get('AZURE_CLIENT_ID')
    # client_secret = os.environ.get('AZURE_CLIENT_SECRET')
    group_name = 'testing' + ''.join(random.choice('0123456789abcdef') for _ in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)

    rmc = ResourceManagementClient(credentials, subscription_id)

    template_parameters = get_env_params()

    # Output resource group
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))

    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name}
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)

    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        print("ERROR: Group name already exists / taken: {}".format(group_name))
        sys.exit(1)
    rmc.resource_groups.create_or_update(
        group_name,
        ResourceGroup(location=location))

    test_successful = False

    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)

        # Ask the Azure Resource Manager API to check that the ARM template is well-formed
        result = rmc.deployments.validate(group_name, deployment_name, properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error), file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except Exception:
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")

        assert deploy_poller.done(), "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['masterFQDN']['value']

        print("Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444")
        print("For troubleshooting, master0 can be reached using: ssh -p 2200 {}@{}".format(
            get_value('linuxAdminUsername'), master_lb))

        # Gather the node IPs needed to run the test
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {
            'master': [],
            'private': [],
            'public': []}

        for resource in rmc.resource_groups.list_resources(
                group_name, filter=("resourceType eq 'Microsoft.Network/networkInterfaces' or "
                                    "resourceType eq 'Microsoft.Compute/virtualMachineScaleSets'")):
            if resource.type == 'Microsoft.Network/networkInterfaces':
                nics = [nmc.network_interfaces.get(group_name, resource.name)]
            elif resource.type == 'Microsoft.Compute/virtualMachineScaleSets':
                nics = list(nmc.network_interfaces.list_virtual_machine_scale_set_network_interfaces(
                            virtual_machine_scale_set_name=resource.name, resource_group_name=group_name))
            else:
                raise Exception('Unexpected resourceType: {}'.format(resource.type))

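            # Bucket each NIC's private IPs by node role, inferred from the resource name.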
            for bucket_name in ip_buckets.keys():
                if bucket_name in resource.name:
                    for n in nics:
                        for config in n.ip_configurations:
                            ip_buckets[bucket_name].append(config.private_ip_address)

        print('Detected IP configuration: {}'.format(ip_buckets))

        with Tunnel(get_value('linuxAdminUsername'), 'ssh_key', master_lb, port=2200) as t:
            integration_test(
                tunnel=t,
                test_dir='/home/{}'.format(get_value('linuxAdminUsername')),
                dcos_dns=ip_buckets['master'][0],
                master_list=ip_buckets['master'],
                agent_list=ip_buckets['private'],
                public_agent_list=ip_buckets['public'],
                provider='azure',
                test_dns_search=False,
                add_env=get_test_config(),
                pytest_cmd=os.getenv('DCOS_PYTEST_CMD', "py.test -vv -s -rs -m 'not ccm' ") + os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        traceback.print_exc()
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        if os.getenv('AZURE_CLEANUP') == 'false':
            print("Cluster must be cleaned up manually")
            print("Cluster details: {}".format(azure_cluster))
        else:
            # Send a delete request
            # TODO(cmaloney): The old code had a retry around this:
            # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
            poller = rmc.resource_groups.delete(group_name)

            # poll for the delete to complete
            print("Deleting resource group: {} ...".format(group_name))

            @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
            def wait_for_delete():
                assert poller.done(), "Timed out waiting for delete"

            print("Waiting for delete ...")
            wait_for_delete()

            print("Clean up successful")

    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
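
The helpers validate_env() and get_env_params() are not shown in these snippets. A minimal sketch of what they might look like, assuming required settings arrive as environment variables and ARM template parameters are passed with an AZURE_PARAM_ prefix (that prefix, and the JSON parsing, are assumptions of this sketch):

import json
import os

REQUIRED_VARS = ['AZURE_CLIENT_ID', 'AZURE_CLIENT_SECRET', 'AZURE_TENANT_ID',
                 'AZURE_SUBSCRIPTION_ID', 'AZURE_TEMPLATE_URL']


def validate_env():
    # Fail fast with a readable message instead of a KeyError later on.
    missing = [name for name in REQUIRED_VARS if name not in os.environ]
    assert not missing, 'Missing required environment variables: {}'.format(missing)


def get_env_params():
    # ARM deployments expect each parameter wrapped as {'value': ...}.
    params = {}
    for name, value in os.environ.items():
        if name.startswith('AZURE_PARAM_'):
            try:
                parsed = json.loads(value)  # numbers/booleans come through typed
            except ValueError:
                parsed = value  # plain strings pass through unchanged
            params[name[len('AZURE_PARAM_'):]] = {'value': parsed}
    return params
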
Example #2
def run():
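    """Deploy DC/OS to Azure from an ARM template, wait for the DC/OS UI
    to come up, run the integration tests, and always delete the
    resource group afterwards.
    """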
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # tenant_id = os.environ.get('AZURE_TENANT_ID')
    # client_id = os.environ.get('AZURE_CLIENT_ID')
    # client_secret = os.environ.get('AZURE_CLIENT_SECRET')
    group_name = 'testing' + ''.join(random.choice('0123456789abcdef') for _ in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)

    rmc = ResourceManagementClient(credentials, subscription_id)

    template_parameters = get_env_params()
    if template_parameters.get('numberOfPrivateSlaves'):
        assert template_parameters['numberOfPrivateSlaves']['value'] >= 2, 'Test requires at least 2 private slaves!'
    else:
        template_parameters['numberOfPrivateSlaves'] = {'value': 2}
    if template_parameters.get('numberOfPublicSlaves'):
        assert template_parameters['numberOfPublicSlaves']['value'] >= 1, 'Test requires at least 1 public slave!'
    else:
        template_parameters['numberOfPublicSlaves'] = {'value': 1}

    # Output resource group
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))

    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name}
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)

    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        print("ERROR: Group name already exists / taken: {}".format(group_name))
        sys.exit(1)
    rmc.resource_groups.create_or_update(
        group_name,
        ResourceGroup(location=location))

    test_successful = False

    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)

        # Ask the Azure Resource Manager API to check that the ARM template is well-formed
        result = rmc.deployments.validate(group_name, deployment_name, properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error), file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except Exception:
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")

        assert deploy_poller.done(), "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['dnsAddress']['value']
        master_url = "http://{}".format(master_lb)

        print("Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444")
        print("For troubleshooting, master0 can be reached using: ssh -p 2200 core@{}".format(master_lb))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(15 * 60 * 1000))
        def poll_on_dcos_ui_up():
            r = get_dcos_ui(master_url)
            assert r is not None and r.status_code == requests.codes.ok, \
                "Unable to reach DC/OS UI: {}".format(master_url)

        print("Waiting for DC/OS UI at: {} ...".format(master_url))
        poll_on_dcos_ui_up()

        # Gather the node IPs needed to run the test
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {
            'masterNodeNic': [],
            'slavePrivateNic': [],
            'slavePublicNic': []}

        for resource in rmc.resource_groups.list_resources(group_name):
            for bucket_name, bucket in ip_buckets.items():
                if resource.name.startswith(bucket_name):
                    nic = nmc.network_interfaces.get(group_name, resource.name)
                    all_ips = []
                    for config in nic.ip_configurations:
                        all_ips.append(config.private_ip_address)
                    bucket.extend(all_ips)

        with closing(SSHTunnel('core', 'ssh_key', master_lb, port=2200)) as t:
            integration_test(
                    tunnel=t,
                    test_dir='/home/core',
                    dcos_dns=master_lb,
                    master_list=ip_buckets['masterNodeNic'],
                    agent_list=ip_buckets['slavePrivateNic'],
                    public_agent_list=ip_buckets['slavePublicNic'],
                    provider='azure',
                    test_dns_search=False,
                    pytest_dir=os.getenv('DCOS_PYTEST_DIR', '/opt/mesosphere/active/dcos-integration-test'),
                    pytest_cmd=os.getenv('DCOS_PYTEST_CMD', "py.test -vv -m 'not ccm' ")+os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        # Send a delete request
        # TODO(cmaloney): The old code had a retry around this:
        # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
        poller = rmc.resource_groups.delete(group_name)

        # poll for the delete to complete
        print("Deleting resource group: {} ...".format(group_name))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
        def wait_for_delete():
            assert poller.done(), "Timed out waiting for delete"

        print("Waiting for delete ...")
        wait_for_delete()

        print("Clean up successful")

    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
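
The get_dcos_ui() helper is not shown either. A minimal sketch, assuming it returns the HTTP response once the master load balancer answers and None while the cluster is still coming up (which matches what poll_on_dcos_ui_up() checks above):

import requests


def get_dcos_ui(master_url):
    # Connection errors are expected while the cluster is still booting,
    # so swallow them and return None; the @retry poller will try again.
    try:
        return requests.get(master_url, timeout=10)
    except requests.exceptions.RequestException:
        return None
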
Example #3
def main():
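    """Deploy DC/OS to Azure from an ARM template, run the integration
    tests over SSH, and tear down the resource group afterwards unless
    AZURE_CLEANUP is set to 'false'.
    """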
    validate_env()
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # tenant_id = os.environ.get('AZURE_TENANT_ID')
    # client_id = os.environ.get('AZURE_CLIENT_ID')
    # client_secret = os.environ.get('AZURE_CLIENT_SECRET')
    group_name = 'testing' + ''.join(random.choice('0123456789abcdef') for _ in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)

    rmc = ResourceManagementClient(credentials, subscription_id)

    template_parameters = get_env_params()

    # Output resource group
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))

    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name}
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)

    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        print("ERROR: Group name already exists / taken: {}".format(group_name))
        sys.exit(1)
    rmc.resource_groups.create_or_update(
        group_name,
        ResourceGroup(location=location))

    test_successful = False

    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)

        # Ask the Azure Resource Manager API to check that the ARM template is well-formed
        result = rmc.deployments.validate(group_name, deployment_name, properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error), file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except Exception:
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")

        assert deploy_poller.done(), "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['masterFQDN']['value']

        print("Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444")
        print("For troubleshooting, master0 can be reached using: ssh -p 2200 {}@{}".format(
            get_value('linuxAdminUsername'), master_lb))

        # Gather the node IPs needed to run the test
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {
            'master': [],
            'private': [],
            'public': []}

        for resource in rmc.resource_groups.list_resources(
                group_name, filter=("resourceType eq 'Microsoft.Network/networkInterfaces' or "
                                    "resourceType eq 'Microsoft.Compute/virtualMachineScaleSets'")):
            if resource.type == 'Microsoft.Network/networkInterfaces':
                nics = [nmc.network_interfaces.get(group_name, resource.name)]
            elif resource.type == 'Microsoft.Compute/virtualMachineScaleSets':
                nics = list(nmc.network_interfaces.list_virtual_machine_scale_set_network_interfaces(
                            virtual_machine_scale_set_name=resource.name, resource_group_name=group_name))
            else:
                raise Exception('Unexpected resourceType: {}'.format(resource.type))

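            # Bucket each NIC's private IPs by node role, inferred from the resource name.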
            for bucket_name in ip_buckets.keys():
                if bucket_name in resource.name:
                    for n in nics:
                        for config in n.ip_configurations:
                            ip_buckets[bucket_name].append(config.private_ip_address)

        print('Detected IP configuration: {}'.format(ip_buckets))

        with SSHTunnel(get_value('linuxAdminUsername'), 'ssh_key', master_lb, port=2200) as t:
            integration_test(
                tunnel=t,
                test_dir='/home/{}'.format(get_value('linuxAdminUsername')),
                dcos_dns=ip_buckets['master'][0],
                master_list=ip_buckets['master'],
                agent_list=ip_buckets['private'],
                public_agent_list=ip_buckets['public'],
                provider='azure',
                test_dns_search=False,
                add_env=get_test_config(),
                pytest_cmd=os.getenv('DCOS_PYTEST_CMD', "py.test -rs -vv -m 'not ccm' ") + os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        traceback.print_exc()
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        if os.getenv('AZURE_CLEANUP') == 'false':
            print("Cluster must be cleaned up manually")
            print("Cluster details: {}".format(azure_cluster))
        else:
            # Send a delete request
            # TODO(cmaloney): The old code had a retry around this:
            # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
            poller = rmc.resource_groups.delete(group_name)

            # poll for the delete to complete
            print("Deleting resource group: {} ...".format(group_name))

            @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
            def wait_for_delete():
                assert poller.done(), "Timed out waiting for delete"

            print("Waiting for delete ...")
            wait_for_delete()

            print("Clean up successful")

    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
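
Every example here polls with the retry decorator; the keyword arguments (stop_max_attempt_number, wait_fixed, stop_max_delay) match the retrying package, which these snippets are assumed to use. The decorator simply re-invokes the function until it stops raising, so the asserts double as the retry condition. A self-contained illustration (check_ready() is a hypothetical stand-in):

import time

from retrying import retry

_START = time.time()


def check_ready():
    # Hypothetical stand-in for whatever is being polled: true after 12 s.
    return time.time() - _START > 12


@retry(wait_fixed=5 * 1000, stop_max_delay=60 * 1000)
def wait_until_ready():
    # retrying re-runs this every 5 seconds until the assert passes,
    # re-raising the last AssertionError after 60 seconds in total.
    assert check_ready(), 'not ready yet'


wait_until_ready()
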
Example #4
# Imports assumed by this example (the snippet omits them); Example #6
# below is the same function and needs the same set.
import os
import stat
import sys
from contextlib import closing

import passlib.hash
import pkg_resources
from retrying import retry

import test_util.installer_api_test
# log, SSHTunnel, TunnelCollection, check_environment, make_vpc,
# get_local_address, setup_integration_test, integration_test and
# DEFAULT_AWS_REGION come from the surrounding test harness (not shown).


def main():
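    """Provision hosts via CCM (or use CCM_VPC_HOSTS), drive the DC/OS
    installer over SSH, run the integration tests, and delete the VPC
    on success.
    """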
    options = check_environment()

    host_list = None
    vpc = None  # Set if the test owns the VPC

    if options.host_list is None:
        log.info('CCM_VPC_HOSTS not provided, requesting new VPC from CCM...')
        vpc = make_vpc(use_bare_os=options.test_install_prereqs)
        host_list = vpc.hosts()
    else:
        host_list = options.host_list

    assert os.path.exists('ssh_key'), 'Valid SSH key for hosts must be in working dir!'
    # key must be chmod 600 for test_runner to use
    os.chmod('ssh_key', stat.S_IREAD | stat.S_IWRITE)

    # Create a custom SSH runner to help orchestrate the test
    ssh_user = '******'
    ssh_key_path = 'ssh_key'
    remote_dir = '/home/centos'

    if options.use_api:
        installer = test_util.installer_api_test.DcosApiInstaller()
        if not options.test_install_prereqs:
            # If we don't want to test the prereq install, use offline mode to avoid it
            installer.offline_mode = True
    else:
        installer = test_util.installer_api_test.DcosCliInstaller()

    host_list_w_port = [i + ':22' for i in host_list]

    @retry(stop_max_delay=120000)
    def establish_host_connectivity():
        """Continually try to recreate the SSH Tunnels to all hosts for 2 minutes
        """
        return closing(
            TunnelCollection(ssh_user, ssh_key_path, host_list_w_port))

    log.info('Checking that hosts are accessible')
    with establish_host_connectivity() as tunnels:
        local_ip = {}
        for tunnel in tunnels.tunnels:
            local_ip[tunnel.host] = get_local_address(tunnel, remote_dir)
            if options.do_setup:
                # Make the default user privileged to use docker
                tunnel.remote_cmd(
                    ['sudo', 'usermod', '-aG', 'docker', ssh_user])

    # Use the first node as the bootstrap/registry host, the second as the
    # master, the third as a private agent, and the fourth as a public agent.
    test_host = host_list[0]
    registry_host = local_ip[host_list[0]]
    master_list = [local_ip[host_list[1]]]
    agent_list = [local_ip[host_list[2]]]
    public_agent_list = [local_ip[host_list[3]]]
    log.info('Test/registry host public/private IP: ' + test_host + '/' +
             registry_host)

    with closing(SSHTunnel(ssh_user, ssh_key_path,
                           test_host)) as test_host_tunnel:
        log.info('Setting up installer on test host')

        installer.setup_remote(tunnel=test_host_tunnel,
                               installer_path=remote_dir +
                               '/dcos_generate_config.sh',
                               download_url=options.installer_url)
        if options.do_setup:
            # Only do this during setup so the test can be rerun against a live installer
            log.info('Verifying installer password hashing')
            test_pass = '******'
            hash_passwd = installer.get_hashed_password(test_pass)
            assert passlib.hash.sha512_crypt.verify(
                test_pass, hash_passwd), 'Hash does not match password'
            if options.use_api:
                installer.start_web_server()

        with open(pkg_resources.resource_filename(
                "gen", "ip-detect/aws.sh")) as ip_detect_fh:
            ip_detect_script = ip_detect_fh.read()
        with open('ssh_key', 'r') as key_fh:
            ssh_key = key_fh.read()
        # Using static exhibitor is the only option in the GUI installer
        if options.use_api:
            log.info(
                'Installer API is selected, so configure for static backend')
            zk_host = None  # causes genconf to use static exhibitor backend
        else:
            log.info('Installer CLI is selected, so configure for ZK backend')
            zk_host = registry_host + ':2181'
            zk_cmd = [
                'sudo', 'docker', 'run', '-d', '-p', '2181:2181', '-p',
                '2888:2888', '-p', '3888:3888', 'jplock/zookeeper'
            ]
            test_host_tunnel.remote_cmd(zk_cmd)

        log.info("Configuring install...")
        installer.genconf(zk_host=zk_host,
                          master_list=master_list,
                          agent_list=agent_list,
                          public_agent_list=public_agent_list,
                          ip_detect_script=ip_detect_script,
                          ssh_user=ssh_user,
                          ssh_key=ssh_key)

        log.info("Running Preflight...")
        if options.test_install_prereqs:
            # Runs preflight in --web or --install-prereqs for CLI
            # This may take up to 15 minutes...
            installer.install_prereqs()
            if options.test_install_prereqs_only:
                if vpc:
                    vpc.delete()
                sys.exit(0)
        else:
            # Will not fix errors detected in preflight
            installer.preflight()

        log.info("Running Deploy...")
        installer.deploy()

        log.info("Running Postflight")
        installer.postflight()

        # Runs dcos-image/integration_test.py inside the cluster
        setup_integration_test(tunnel=test_host_tunnel, test_dir=remote_dir)
        integration_test(
            tunnel=test_host_tunnel,
            test_dir=remote_dir,
            region=vpc.get_region() if vpc else DEFAULT_AWS_REGION,
            dcos_dns=master_list[0],
            master_list=master_list,
            agent_list=agent_list,
            public_agent_list=public_agent_list,
            registry_host=registry_host,
            variant=options.variant,
            # Setting dns_search: mesos not currently supported in API
            test_dns_search=not options.use_api,
            ci_flags=options.ci_flags,
            aws_access_key_id=options.aws_access_key_id,
            aws_secret_access_key=options.aws_secret_access_key)

    # TODO(cmaloney): add a `--healthcheck` option which runs dcos-diagnostics
    # on every host to see if they are working.

    log.info("Test successful!")
    # Delete the cluster if everything succeeded, to minimize costs.
    # For failed clusters, the hosts are left running.
    if vpc is not None:
        vpc.delete()
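
check_environment(), make_vpc() and get_local_address() belong to the surrounding harness and are not shown. A sketch of get_local_address() under the assumption that it asks each host for its private address through the tunnel (the exact command and the remote_cmd() return type are assumptions of this sketch):

def get_local_address(tunnel, remote_dir):
    # remote_dir is accepted to match the call sites above but unused here.
    out = tunnel.remote_cmd(['hostname', '--all-ip-addresses'])
    if isinstance(out, bytes):
        out = out.decode()
    # `hostname -I` prints all configured addresses; take the first one.
    return out.split()[0]
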
Example #5
def run():
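    """Deploy DC/OS to Azure from an ARM template, wait for the DC/OS UI
    to come up, run the integration tests, and always delete the
    resource group afterwards.
    """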
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # tenant_id = os.environ.get('AZURE_TENANT_ID')
    # client_id = os.environ.get('AZURE_CLIENT_ID')
    # client_secret = os.environ.get('AZURE_CLIENT_SECRET')
    group_name = 'testing' + ''.join(random.choice('0123456789abcdef') for _ in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)

    rmc = ResourceManagementClient(credentials, subscription_id)

    template_parameters = get_env_params()
    if template_parameters.get('numberOfPrivateSlaves'):
        assert template_parameters['numberOfPrivateSlaves'][
            'value'] >= 2, 'Test requires at least 2 private slaves!'
    else:
        template_parameters['numberOfPrivateSlaves'] = {'value': 2}
    if template_parameters.get('numberOfPublicSlaves'):
        assert template_parameters['numberOfPublicSlaves'][
            'value'] >= 1, 'Test requires at least 1 public slave!'
    else:
        template_parameters['numberOfPublicSlaves'] = {'value': 1}

    # Output resource group
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))

    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name
    }
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)

    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        print("ERROR: Group name already exists / taken: {}".format(group_name))
        sys.exit(1)
    rmc.resource_groups.create_or_update(group_name,
                                         ResourceGroup(location=location))

    test_successful = False

    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)

        # Ask the Azure Resource Manager API to check that the ARM template is well-formed
        result = rmc.deployments.validate(group_name,
                                          deployment_name,
                                          properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error),
                  file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(
            group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(
                res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except Exception:
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")

        assert deploy_poller.done(), "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['dnsAddress']['value']
        master_url = "http://{}".format(master_lb)

        print("Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444")
        print("For troubleshooting, master0 can be reached using: ssh -p 2200 core@{}".format(master_lb))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(15 * 60 * 1000))
        def poll_on_dcos_ui_up():
            r = get_dcos_ui(master_url)
            assert r is not None and r.status_code == requests.codes.ok, \
                "Unable to reach DC/OS UI: {}".format(master_url)

        print("Waiting for DC/OS UI at: {} ...".format(master_url))
        poll_on_dcos_ui_up()

        # Gather the node IPs needed to run the test
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {
            'masterNodeNic': [],
            'slavePrivateNic': [],
            'slavePublicNic': []
        }

        for resource in rmc.resource_groups.list_resources(group_name):
            for bucket_name, bucket in ip_buckets.items():
                if resource.name.startswith(bucket_name):
                    nic = nmc.network_interfaces.get(group_name, resource.name)
                    all_ips = []
                    for config in nic.ip_configurations:
                        all_ips.append(config.private_ip_address)
                    bucket.extend(all_ips)

        with closing(SSHTunnel('core', 'ssh_key', master_lb, port=2200)) as t:
            integration_test(
                tunnel=t,
                test_dir='/home/core',
                dcos_dns=master_lb,
                master_list=ip_buckets['masterNodeNic'],
                agent_list=ip_buckets['slavePrivateNic'],
                public_agent_list=ip_buckets['slavePublicNic'],
                provider='azure',
                test_dns_search=False,
                pytest_dir=os.getenv(
                    'DCOS_PYTEST_DIR',
                    '/opt/mesosphere/active/dcos-integration-test'),
                pytest_cmd=os.getenv('DCOS_PYTEST_CMD',
                                     "py.test -vv -m 'not ccm' ") +
                os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        # Send a delete request
        # TODO(cmaloney): The old code had a retry around this:
        # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
        poller = rmc.resource_groups.delete(group_name)

        # poll for the delete to complete
        print("Deleting resource group: {} ...".format(group_name))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
        def wait_for_delete():
            assert poller.done(), "Timed out waiting for delete"

        print("Waiting for delete ...")
        wait_for_delete()

        print("Clean up successful")

    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
Example #6
def main():
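    """Provision hosts via CCM (or use CCM_VPC_HOSTS), drive the DC/OS
    installer over SSH, run the integration tests, and delete the VPC
    on success.
    """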
    options = check_environment()

    host_list = None
    vpc = None  # Set if the test owns the VPC

    if options.host_list is None:
        log.info('CCM_VPC_HOSTS not provided, requesting new VPC from CCM...')
        vpc = make_vpc(use_bare_os=options.test_install_prereqs)
        host_list = vpc.hosts()
    else:
        host_list = options.host_list

    assert os.path.exists('ssh_key'), 'Valid SSH key for hosts must be in working dir!'
    # key must be chmod 600 for test_runner to use
    os.chmod('ssh_key', stat.S_IREAD | stat.S_IWRITE)

    # Create a custom SSH runner to help orchestrate the test
    ssh_user = '******'
    ssh_key_path = 'ssh_key'
    remote_dir = '/home/centos'

    if options.use_api:
        installer = test_util.installer_api_test.DcosApiInstaller()
        if not options.test_install_prereqs:
            # If we don't want to test the prereq install, use offline mode to avoid it
            installer.offline_mode = True
    else:
        installer = test_util.installer_api_test.DcosCliInstaller()

    host_list_w_port = [i+':22' for i in host_list]

    @retry(stop_max_delay=120000)
    def establish_host_connectivity():
        """Continually try to recreate the SSH Tunnels to all hosts for 2 minutes
        """
        return closing(TunnelCollection(ssh_user, ssh_key_path, host_list_w_port))

    log.info('Checking that hosts are accessible')
    with establish_host_connectivity() as tunnels:
        local_ip = {}
        for tunnel in tunnels.tunnels:
            local_ip[tunnel.host] = get_local_address(tunnel, remote_dir)
            if options.do_setup:
                # Make the default user privileged to use docker
                tunnel.remote_cmd(['sudo', 'usermod', '-aG', 'docker', ssh_user])

    # Use the first node as the bootstrap/registry host, the second as the
    # master, the third as a private agent, and the fourth as a public agent.
    test_host = host_list[0]
    registry_host = local_ip[host_list[0]]
    master_list = [local_ip[host_list[1]]]
    agent_list = [local_ip[host_list[2]]]
    public_agent_list = [local_ip[host_list[3]]]
    log.info('Test/registry host public/private IP: ' + test_host + '/' + registry_host)

    with closing(SSHTunnel(ssh_user, ssh_key_path, test_host)) as test_host_tunnel:
        log.info('Setting up installer on test host')

        installer.setup_remote(
                tunnel=test_host_tunnel,
                installer_path=remote_dir+'/dcos_generate_config.sh',
                download_url=options.installer_url)
        if options.do_setup:
            # Only do this during setup so the test can be rerun against a live installer
            log.info('Verifying installer password hashing')
            test_pass = '******'
            hash_passwd = installer.get_hashed_password(test_pass)
            assert passlib.hash.sha512_crypt.verify(test_pass, hash_passwd), 'Hash does not match password'
            if options.use_api:
                installer.start_web_server()

        with open(pkg_resources.resource_filename("gen", "ip-detect/aws.sh")) as ip_detect_fh:
            ip_detect_script = ip_detect_fh.read()
        with open('ssh_key', 'r') as key_fh:
            ssh_key = key_fh.read()
        # Using static exhibitor is the only option in the GUI installer
        if options.use_api:
            log.info('Installer API is selected, so configure for static backend')
            zk_host = None  # causes genconf to use static exhibitor backend
        else:
            log.info('Installer CLI is selected, so configure for ZK backend')
            zk_host = registry_host + ':2181'
            zk_cmd = [
                    'sudo', 'docker', 'run', '-d', '-p', '2181:2181', '-p',
                    '2888:2888', '-p', '3888:3888', 'jplock/zookeeper']
            test_host_tunnel.remote_cmd(zk_cmd)

        log.info("Configuring install...")
        installer.genconf(
                zk_host=zk_host,
                master_list=master_list,
                agent_list=agent_list,
                public_agent_list=public_agent_list,
                ip_detect_script=ip_detect_script,
                ssh_user=ssh_user,
                ssh_key=ssh_key)

        log.info("Running Preflight...")
        if options.test_install_prereqs:
            # Runs preflight in --web or --install-prereqs for CLI
            # This may take up to 15 minutes...
            installer.install_prereqs()
            if options.test_install_prereqs_only:
                if vpc:
                    vpc.delete()
                sys.exit(0)
        else:
            # Will not fix errors detected in preflight
            installer.preflight()

        log.info("Running Deploy...")
        installer.deploy()

        log.info("Running Postflight")
        installer.postflight()

        # Runs dcos-image/integration_test.py inside the cluster
        setup_integration_test(
                tunnel=test_host_tunnel,
                test_dir=remote_dir)
        integration_test(
                tunnel=test_host_tunnel,
                test_dir=remote_dir,
                region=vpc.get_region() if vpc else DEFAULT_AWS_REGION,
                dcos_dns=master_list[0],
                master_list=master_list,
                agent_list=agent_list,
                public_agent_list=public_agent_list,
                registry_host=registry_host,
                variant=options.variant,
                # Setting dns_search: mesos not currently supported in API
                test_dns_search=not options.use_api,
                ci_flags=options.ci_flags,
                aws_access_key_id=options.aws_access_key_id,
                aws_secret_access_key=options.aws_secret_access_key)

    # TODO(cmaloney): add a `--healthcheck` option which runs dcos-diagnostics
    # on every host to see if they are working.

    log.info("Test successful!")
    # Delete the cluster if everything succeeded, to minimize costs.
    # For failed clusters, the hosts are left running.
    if vpc is not None:
        vpc.delete()