def main():
    """Deploy a DC/OS cluster to Azure from an ARM template, run the
    integration test against it over SSH, then tear the resource group down.

    Required environment (checked by validate_env()): AZURE_CLIENT_ID,
    AZURE_CLIENT_SECRET, AZURE_TENANT_ID, AZURE_SUBSCRIPTION_ID,
    AZURE_TEMPLATE_URL. Optional: AZURE_LOCATION, AZURE_CLEANUP,
    DCOS_PYTEST_CMD, CI_FLAGS.

    Exits 1 on template validation failure, 2 on test failure.
    """
    validate_env()
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # Random suffix keeps concurrent CI runs from colliding on group names.
    # (Fix: charset had a duplicated '0', biasing the suffix toward zeros.)
    group_name = 'testing' + ''.join(random.choice('0123456789abcdef') for n in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)
    rmc = ResourceManagementClient(credentials, subscription_id)
    template_parameters = get_env_params()
    # Record the cluster identifiers so an outside job can find / clean it up.
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))
    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name}
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)
    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        # NOTE(review): a collision is only reported, not fatal; the
        # create_or_update below would update the existing group. Confirm
        # this is intended.
        print("ERROR: Group name already exists / taken: {}".format(group_name))
    rmc.resource_groups.create_or_update(
        group_name,
        ResourceGroup(location=location))
    test_successful = False
    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)

        # Use RPC against azure to validate the ARM template is well-formed
        result = rmc.deployments.validate(group_name, deployment_name, properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error), file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except BaseException:  # was a bare `except:`; re-raises, so nothing is swallowed
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")
        assert deploy_poller.done(), "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['masterFQDN']['value']
        print("Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444")
        print("For troubleshooting, master0 can be reached using: ssh -p 2200 {}@{}".format(
            get_value('linuxAdminUsername'), master_lb))

        # Run test now, so grab IPs
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {
            'master': [],
            'private': [],
            'public': []}
        for resource in rmc.resource_groups.list_resources(
                group_name,
                filter=("resourceType eq 'Microsoft.Network/networkInterfaces' or "
                        "resourceType eq 'Microsoft.Compute/virtualMachineScaleSets'")):
            if resource.type == 'Microsoft.Network/networkInterfaces':
                nics = [nmc.network_interfaces.get(group_name, resource.name)]
            elif resource.type == 'Microsoft.Compute/virtualMachineScaleSets':
                nics = list(nmc.network_interfaces.list_virtual_machine_scale_set_network_interfaces(
                    virtual_machine_scale_set_name=resource.name, resource_group_name=group_name))
            else:
                # Fix: original was `raise('...')`, which raises TypeError
                # ("exceptions must derive from BaseException") and masks
                # the real problem.
                raise Exception('Unexpected resourceType: {}'.format(resource.type))
            # Bucket the private IPs by substring match on the resource name.
            for bucket_name in ip_buckets.keys():
                if bucket_name in resource.name:
                    for n in nics:
                        for config in n.ip_configurations:
                            ip_buckets[bucket_name].append(config.private_ip_address)
        print('Detected IP configuration: {}'.format(ip_buckets))
        # NOTE(review): sibling code in this file uses SSHTunnel here;
        # confirm Tunnel is the intended class.
        with Tunnel(get_value('linuxAdminUsername'), 'ssh_key', master_lb, port=2200) as t:
            integration_test(
                tunnel=t,
                test_dir='/home/{}'.format(get_value('linuxAdminUsername')),
                dcos_dns=ip_buckets['master'][0],
                master_list=ip_buckets['master'],
                agent_list=ip_buckets['private'],
                public_agent_list=ip_buckets['public'],
                provider='azure',
                test_dns_search=False,
                add_env=get_test_config(),
                pytest_cmd=os.getenv('DCOS_PYTEST_CMD', "py.test -vv -s -rs -m 'not ccm' ") + os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        traceback.print_exc()
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        if os.getenv('AZURE_CLEANUP') == 'false':
            print("Cluster must be cleaned up manually")
            print("Cluster details: {}".format(azure_cluster))
        else:
            # Send a delete request
            # TODO(cmaloney): The old code had a retry around this:
            # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
            poller = rmc.resource_groups.delete(group_name)

            # poll for the delete to complete
            print("Deleting resource group: {} ...".format(group_name))

            @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
            def wait_for_delete():
                assert poller.done(), "Timed out waiting for delete"

            print("Waiting for delete ...")
            wait_for_delete()
            print("Clean up successful")
    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
def run():
    """Deploy a DC/OS cluster to Azure, wait for the DC/OS UI to come up,
    run the integration test over SSH, and delete the resource group.

    Reads AZURE_* credentials/template settings from the environment.
    Exits 1 on template validation failure, 2 on test failure.
    """
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # Fix: prefix was misspelled 'tesing', which would defeat any tooling
    # matching test resource groups by the 'testing' prefix. Also dropped
    # the duplicated '0' from the hex charset.
    group_name = 'testing' + ''.join(random.choice('0123456789abcdef') for n in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)
    rmc = ResourceManagementClient(credentials, subscription_id)
    template_parameters = get_env_params()
    # The integration test needs at least 2 private and 1 public slave;
    # default the counts when the template parameters don't set them.
    if template_parameters.get('numberOfPrivateSlaves'):
        assert template_parameters['numberOfPrivateSlaves']['value'] >= 2, 'Test requires at least 2 private slaves!'
    else:
        template_parameters['numberOfPrivateSlaves'] = {'value': 2}
    if template_parameters.get('numberOfPublicSlaves'):
        assert template_parameters['numberOfPublicSlaves']['value'] >= 1, 'Test requires at least 1 public slave!'
    else:
        template_parameters['numberOfPublicSlaves'] = {'value': 1}
    # Record the cluster identifiers so an outside job can find / clean it up.
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))
    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name}
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)
    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        # NOTE(review): a collision is only reported, not fatal; confirm
        # this is intended.
        print("ERROR: Group name already exists / taken: {}".format(group_name))
    rmc.resource_groups.create_or_update(
        group_name,
        ResourceGroup(location=location))
    test_successful = False
    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)

        # Use RPC against azure to validate the ARM template is well-formed
        result = rmc.deployments.validate(group_name, deployment_name, properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error), file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except BaseException:  # was a bare `except:`; re-raises, so nothing is swallowed
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")
        assert deploy_poller.done(), "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['dnsAddress']['value']
        master_url = "http://{}".format(master_lb)
        print("Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444")
        print("For troubleshooting, master0 can be reached using: ssh -p 2200 core@{}".format(master_lb))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(15 * 60 * 1000))
        def poll_on_dcos_ui_up():
            r = get_dcos_ui(master_url)
            assert r is not None and r.status_code == requests.codes.ok, \
                "Unable to reach DC/OS UI: {}".format(master_url)

        print("Waiting for DC/OS UI at: {} ...".format(master_url))
        poll_on_dcos_ui_up()

        # Run test now, so grab IPs
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {
            'masterNodeNic': [],
            'slavePrivateNic': [],
            'slavePublicNic': []}
        # Bucket private IPs by the NIC-name prefix each node role uses.
        for resource in rmc.resource_groups.list_resources(group_name):
            for bucket_name, bucket in ip_buckets.items():
                if resource.name.startswith(bucket_name):
                    nic = nmc.network_interfaces.get(group_name, resource.name)
                    bucket.extend(config.private_ip_address for config in nic.ip_configurations)
        with closing(SSHTunnel('core', 'ssh_key', master_lb, port=2200)) as t:
            integration_test(
                tunnel=t,
                test_dir='/home/core',
                dcos_dns=master_lb,
                master_list=ip_buckets['masterNodeNic'],
                agent_list=ip_buckets['slavePrivateNic'],
                public_agent_list=ip_buckets['slavePublicNic'],
                provider='azure',
                test_dns_search=False,
                pytest_dir=os.getenv('DCOS_PYTEST_DIR', '/opt/mesosphere/active/dcos-integration-test'),
                pytest_cmd=os.getenv('DCOS_PYTEST_CMD', "py.test -vv -m 'not ccm' ") + os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        # Send a delete request
        # TODO(cmaloney): The old code had a retry around this:
        # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
        poller = rmc.resource_groups.delete(group_name)

        # poll for the delete to complete
        print("Deleting resource group: {} ...".format(group_name))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
        def wait_for_delete():
            assert poller.done(), "Timed out waiting for delete"

        print("Waiting for delete ...")
        wait_for_delete()
        print("Clean up successful")
    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
def main():
    """Deploy a DC/OS cluster to Azure from an ARM template, run the
    integration test against it over an SSH tunnel, then tear the
    resource group down (unless AZURE_CLEANUP == 'false').

    Required environment (checked by validate_env()): AZURE_CLIENT_ID,
    AZURE_CLIENT_SECRET, AZURE_TENANT_ID, AZURE_SUBSCRIPTION_ID,
    AZURE_TEMPLATE_URL. Optional: AZURE_LOCATION, AZURE_CLEANUP,
    DCOS_PYTEST_CMD, CI_FLAGS.

    Exits 1 on template validation failure, 2 on test failure.
    """
    validate_env()
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # Random suffix keeps concurrent CI runs from colliding on group names.
    # (Fix: charset had a duplicated '0', biasing the suffix toward zeros.)
    group_name = 'testing' + ''.join(random.choice('0123456789abcdef') for n in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)
    rmc = ResourceManagementClient(credentials, subscription_id)
    template_parameters = get_env_params()
    # Record the cluster identifiers so an outside job can find / clean it up.
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))
    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name}
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)
    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        # NOTE(review): a collision is only reported, not fatal; the
        # create_or_update below would update the existing group. Confirm
        # this is intended.
        print("ERROR: Group name already exists / taken: {}".format(group_name))
    rmc.resource_groups.create_or_update(
        group_name,
        ResourceGroup(location=location))
    test_successful = False
    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)

        # Use RPC against azure to validate the ARM template is well-formed
        result = rmc.deployments.validate(group_name, deployment_name, properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error), file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except BaseException:  # was a bare `except:`; re-raises, so nothing is swallowed
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")
        assert deploy_poller.done(), "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['masterFQDN']['value']
        print("Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444")
        print("For troubleshooting, master0 can be reached using: ssh -p 2200 {}@{}".format(
            get_value('linuxAdminUsername'), master_lb))

        # Run test now, so grab IPs
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {
            'master': [],
            'private': [],
            'public': []}
        for resource in rmc.resource_groups.list_resources(
                group_name,
                filter=("resourceType eq 'Microsoft.Network/networkInterfaces' or "
                        "resourceType eq 'Microsoft.Compute/virtualMachineScaleSets'")):
            if resource.type == 'Microsoft.Network/networkInterfaces':
                nics = [nmc.network_interfaces.get(group_name, resource.name)]
            elif resource.type == 'Microsoft.Compute/virtualMachineScaleSets':
                nics = list(nmc.network_interfaces.list_virtual_machine_scale_set_network_interfaces(
                    virtual_machine_scale_set_name=resource.name, resource_group_name=group_name))
            else:
                # Fix: original was `raise('...')`, which raises TypeError
                # ("exceptions must derive from BaseException") and masks
                # the real problem.
                raise Exception('Unexpected resourceType: {}'.format(resource.type))
            # Bucket the private IPs by substring match on the resource name.
            for bucket_name in ip_buckets.keys():
                if bucket_name in resource.name:
                    for n in nics:
                        for config in n.ip_configurations:
                            ip_buckets[bucket_name].append(config.private_ip_address)
        print('Detected IP configuration: {}'.format(ip_buckets))
        with SSHTunnel(get_value('linuxAdminUsername'), 'ssh_key', master_lb, port=2200) as t:
            integration_test(
                tunnel=t,
                test_dir='/home/{}'.format(get_value('linuxAdminUsername')),
                dcos_dns=ip_buckets['master'][0],
                master_list=ip_buckets['master'],
                agent_list=ip_buckets['private'],
                public_agent_list=ip_buckets['public'],
                provider='azure',
                test_dns_search=False,
                add_env=get_test_config(),
                pytest_cmd=os.getenv('DCOS_PYTEST_CMD', "py.test -rs -vv -m 'not ccm' ") + os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        traceback.print_exc()
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        if os.getenv('AZURE_CLEANUP') == 'false':
            print("Cluster must be cleaned up manually")
            print("Cluster details: {}".format(azure_cluster))
        else:
            # Send a delete request
            # TODO(cmaloney): The old code had a retry around this:
            # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
            poller = rmc.resource_groups.delete(group_name)

            # poll for the delete to complete
            print("Deleting resource group: {} ...".format(group_name))

            @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
            def wait_for_delete():
                assert poller.done(), "Timed out waiting for delete"

            print("Waiting for delete ...")
            wait_for_delete()
            print("Clean up successful")
    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
def main():
    """Provision hosts (via CCM VPC unless CCM_VPC_HOSTS is given), install
    DC/OS with the CLI or API installer, and run the integration test.

    Host layout: host 0 is the bootstrap/test/registry host, host 1 the
    master, hosts 2 and 3 the private and public agents. Requires an
    'ssh_key' file in the working directory.

    Deletes the VPC only on success (or after a prereqs-only run), so a
    failed cluster is left up for debugging.
    """
    options = check_environment()
    host_list = None
    vpc = None  # Set if the test owns the VPC
    if options.host_list is None:
        log.info('CCM_VPC_HOSTS not provided, requesting new VPC from CCM...')
        vpc = make_vpc(use_bare_os=options.test_install_prereqs)
        host_list = vpc.hosts()
    else:
        host_list = options.host_list
    assert os.path.exists('ssh_key'), 'Valid SSH key for hosts must be in working dir!'
    # key must be chmod 600 for test_runner to use
    os.chmod('ssh_key', stat.S_IREAD | stat.S_IWRITE)
    # Create custom SSH Runner to help orchestrate the test
    # NOTE(review): this value appears redacted in the source; restore the
    # real default SSH user.
    ssh_user = '******'
    ssh_key_path = 'ssh_key'
    remote_dir = '/home/centos'
    if options.use_api:
        installer = test_util.installer_api_test.DcosApiInstaller()
        if not options.test_install_prereqs:
            # If we dont want to test the prereq install, use offline mode to avoid it
            installer.offline_mode = True
    else:
        installer = test_util.installer_api_test.DcosCliInstaller()
    host_list_w_port = [i + ':22' for i in host_list]

    @retry(stop_max_delay=120000)
    def establish_host_connectivity():
        """Continually try to recreate the SSH Tunnels to all hosts for 2 minutes
        """
        return closing(TunnelCollection(ssh_user, ssh_key_path, host_list_w_port))

    log.info('Checking that hosts are accessible')
    with establish_host_connectivity() as tunnels:
        local_ip = {}
        for tunnel in tunnels.tunnels:
            local_ip[tunnel.host] = get_local_address(tunnel, remote_dir)
            if options.do_setup:
                # Make the default user privileged to use docker
                tunnel.remote_cmd(['sudo', 'usermod', '-aG', 'docker', ssh_user])

    # use first node as bootstrap node, second node as master, all others as agents
    test_host = host_list[0]
    registry_host = local_ip[host_list[0]]
    master_list = [local_ip[host_list[1]]]
    agent_list = [local_ip[host_list[2]]]
    public_agent_list = [local_ip[host_list[3]]]
    log.info('Test/registry host public/private IP: ' + test_host + '/' + registry_host)
    with closing(SSHTunnel(ssh_user, ssh_key_path, test_host)) as test_host_tunnel:
        log.info('Setting up installer on test host')
        installer.setup_remote(
            tunnel=test_host_tunnel,
            installer_path=remote_dir + '/dcos_generate_config.sh',
            download_url=options.installer_url)
        if options.do_setup:
            # only do on setup so you can rerun this test against a living installer
            log.info('Verifying installer password hashing')
            test_pass = '******'
            hash_passwd = installer.get_hashed_password(test_pass)
            assert passlib.hash.sha512_crypt.verify(test_pass, hash_passwd), 'Hash does not match password'
            if options.use_api:
                installer.start_web_server()
        with open(pkg_resources.resource_filename("gen", "ip-detect/aws.sh")) as ip_detect_fh:
            ip_detect_script = ip_detect_fh.read()
        with open('ssh_key', 'r') as key_fh:
            ssh_key = key_fh.read()
        # Using static exhibitor is the only option in the GUI installer
        if options.use_api:
            log.info('Installer API is selected, so configure for static backend')
            zk_host = None  # causes genconf to use static exhibitor backend
        else:
            log.info('Installer CLI is selected, so configure for ZK backend')
            zk_host = registry_host + ':2181'
            zk_cmd = [
                'sudo', 'docker', 'run', '-d', '-p', '2181:2181', '-p',
                '2888:2888', '-p', '3888:3888', 'jplock/zookeeper']
            test_host_tunnel.remote_cmd(zk_cmd)
        log.info("Configuring install...")
        installer.genconf(
            zk_host=zk_host,
            master_list=master_list,
            agent_list=agent_list,
            public_agent_list=public_agent_list,
            ip_detect_script=ip_detect_script,
            ssh_user=ssh_user,
            ssh_key=ssh_key)
        log.info("Running Preflight...")
        if options.test_install_prereqs:
            # Runs preflight in --web or --install-prereqs for CLI
            # This may take up to 15 minutes...
            installer.install_prereqs()
            if options.test_install_prereqs_only:
                if vpc:
                    vpc.delete()
                sys.exit(0)
        else:
            # Will not fix errors detected in preflight
            installer.preflight()
        log.info("Running Deploy...")
        installer.deploy()
        log.info("Running Postflight")
        installer.postflight()
        # Runs dcos-image/integration_test.py inside the cluster
        setup_integration_test(tunnel=test_host_tunnel, test_dir=remote_dir)
        integration_test(
            tunnel=test_host_tunnel,
            test_dir=remote_dir,
            region=vpc.get_region() if vpc else DEFAULT_AWS_REGION,
            dcos_dns=master_list[0],
            master_list=master_list,
            agent_list=agent_list,
            public_agent_list=public_agent_list,
            registry_host=registry_host,
            variant=options.variant,
            # Setting dns_search: mesos not currently supported in API
            test_dns_search=not options.use_api,
            ci_flags=options.ci_flags,
            aws_access_key_id=options.aws_access_key_id,
            aws_secret_access_key=options.aws_secret_access_key)
    # TODO(cmaloney): add a `--healthcheck` option which runs dcos-diagnostics
    # on every host to see if they are working.
    # Fix: log message was misspelled "successsful".
    log.info("Test successful!")
    # Delete the cluster if all was successful to minimize potential costs.
    # Failed clusters the hosts will continue running
    if vpc is not None:
        vpc.delete()
def run():
    """Deploy a DC/OS cluster to Azure, wait for the DC/OS UI to come up,
    run the integration test over SSH, and delete the resource group.

    Reads AZURE_* credentials/template settings from the environment.
    Exits 1 on template validation failure, 2 on test failure.
    """
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # Fix: prefix was misspelled 'tesing', which would defeat any tooling
    # matching test resource groups by the 'testing' prefix. Also dropped
    # the duplicated '0' from the hex charset.
    group_name = 'testing' + ''.join(random.choice('0123456789abcdef') for n in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)
    rmc = ResourceManagementClient(credentials, subscription_id)
    template_parameters = get_env_params()
    # The integration test needs at least 2 private and 1 public slave;
    # default the counts when the template parameters don't set them.
    if template_parameters.get('numberOfPrivateSlaves'):
        assert template_parameters['numberOfPrivateSlaves']['value'] >= 2, 'Test requires at least 2 private slaves!'
    else:
        template_parameters['numberOfPrivateSlaves'] = {'value': 2}
    if template_parameters.get('numberOfPublicSlaves'):
        assert template_parameters['numberOfPublicSlaves']['value'] >= 1, 'Test requires at least 1 public slave!'
    else:
        template_parameters['numberOfPublicSlaves'] = {'value': 1}
    # Record the cluster identifiers so an outside job can find / clean it up.
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))
    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name}
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)
    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        # NOTE(review): a collision is only reported, not fatal; confirm
        # this is intended.
        print("ERROR: Group name already exists / taken: {}".format(group_name))
    rmc.resource_groups.create_or_update(
        group_name,
        ResourceGroup(location=location))
    test_successful = False
    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)

        # Use RPC against azure to validate the ARM template is well-formed
        result = rmc.deployments.validate(group_name, deployment_name, properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error), file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except BaseException:  # was a bare `except:`; re-raises, so nothing is swallowed
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")
        assert deploy_poller.done(), "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['dnsAddress']['value']
        master_url = "http://{}".format(master_lb)
        print("Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444")
        print("For troubleshooting, master0 can be reached using: ssh -p 2200 core@{}".format(master_lb))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(15 * 60 * 1000))
        def poll_on_dcos_ui_up():
            r = get_dcos_ui(master_url)
            assert r is not None and r.status_code == requests.codes.ok, \
                "Unable to reach DC/OS UI: {}".format(master_url)

        print("Waiting for DC/OS UI at: {} ...".format(master_url))
        poll_on_dcos_ui_up()

        # Run test now, so grab IPs
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {
            'masterNodeNic': [],
            'slavePrivateNic': [],
            'slavePublicNic': []}
        # Bucket private IPs by the NIC-name prefix each node role uses.
        for resource in rmc.resource_groups.list_resources(group_name):
            for bucket_name, bucket in ip_buckets.items():
                if resource.name.startswith(bucket_name):
                    nic = nmc.network_interfaces.get(group_name, resource.name)
                    bucket.extend(config.private_ip_address for config in nic.ip_configurations)
        with closing(SSHTunnel('core', 'ssh_key', master_lb, port=2200)) as t:
            integration_test(
                tunnel=t,
                test_dir='/home/core',
                dcos_dns=master_lb,
                master_list=ip_buckets['masterNodeNic'],
                agent_list=ip_buckets['slavePrivateNic'],
                public_agent_list=ip_buckets['slavePublicNic'],
                provider='azure',
                test_dns_search=False,
                pytest_dir=os.getenv('DCOS_PYTEST_DIR', '/opt/mesosphere/active/dcos-integration-test'),
                pytest_cmd=os.getenv('DCOS_PYTEST_CMD', "py.test -vv -m 'not ccm' ") + os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        # Send a delete request
        # TODO(cmaloney): The old code had a retry around this:
        # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
        poller = rmc.resource_groups.delete(group_name)

        # poll for the delete to complete
        print("Deleting resource group: {} ...".format(group_name))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
        def wait_for_delete():
            assert poller.done(), "Timed out waiting for delete"

        print("Waiting for delete ...")
        wait_for_delete()
        print("Clean up successful")
    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
def main():
    """Provision hosts (via CCM VPC unless CCM_VPC_HOSTS is given), install
    DC/OS with the CLI or API installer, and run the integration test.

    Host layout: host 0 is the bootstrap/test/registry host, host 1 the
    master, hosts 2 and 3 the private and public agents. Requires an
    'ssh_key' file in the working directory.

    Deletes the VPC only on success (or after a prereqs-only run), so a
    failed cluster is left up for debugging.
    """
    options = check_environment()
    host_list = None
    vpc = None  # Set if the test owns the VPC
    if options.host_list is None:
        log.info('CCM_VPC_HOSTS not provided, requesting new VPC from CCM...')
        vpc = make_vpc(use_bare_os=options.test_install_prereqs)
        host_list = vpc.hosts()
    else:
        host_list = options.host_list
    assert os.path.exists('ssh_key'), 'Valid SSH key for hosts must be in working dir!'
    # key must be chmod 600 for test_runner to use
    os.chmod('ssh_key', stat.S_IREAD | stat.S_IWRITE)
    # Create custom SSH Runner to help orchestrate the test
    # NOTE(review): this value appears redacted in the source; restore the
    # real default SSH user.
    ssh_user = '******'
    ssh_key_path = 'ssh_key'
    remote_dir = '/home/centos'
    if options.use_api:
        installer = test_util.installer_api_test.DcosApiInstaller()
        if not options.test_install_prereqs:
            # If we dont want to test the prereq install, use offline mode to avoid it
            installer.offline_mode = True
    else:
        installer = test_util.installer_api_test.DcosCliInstaller()
    host_list_w_port = [i + ':22' for i in host_list]

    @retry(stop_max_delay=120000)
    def establish_host_connectivity():
        """Continually try to recreate the SSH Tunnels to all hosts for 2 minutes
        """
        return closing(TunnelCollection(ssh_user, ssh_key_path, host_list_w_port))

    log.info('Checking that hosts are accessible')
    with establish_host_connectivity() as tunnels:
        local_ip = {}
        for tunnel in tunnels.tunnels:
            local_ip[tunnel.host] = get_local_address(tunnel, remote_dir)
            if options.do_setup:
                # Make the default user privileged to use docker
                tunnel.remote_cmd(['sudo', 'usermod', '-aG', 'docker', ssh_user])

    # use first node as bootstrap node, second node as master, all others as agents
    test_host = host_list[0]
    registry_host = local_ip[host_list[0]]
    master_list = [local_ip[host_list[1]]]
    agent_list = [local_ip[host_list[2]]]
    public_agent_list = [local_ip[host_list[3]]]
    log.info('Test/registry host public/private IP: ' + test_host + '/' + registry_host)
    with closing(SSHTunnel(ssh_user, ssh_key_path, test_host)) as test_host_tunnel:
        log.info('Setting up installer on test host')
        installer.setup_remote(
            tunnel=test_host_tunnel,
            installer_path=remote_dir + '/dcos_generate_config.sh',
            download_url=options.installer_url)
        if options.do_setup:
            # only do on setup so you can rerun this test against a living installer
            log.info('Verifying installer password hashing')
            test_pass = '******'
            hash_passwd = installer.get_hashed_password(test_pass)
            assert passlib.hash.sha512_crypt.verify(test_pass, hash_passwd), 'Hash does not match password'
            if options.use_api:
                installer.start_web_server()
        with open(pkg_resources.resource_filename("gen", "ip-detect/aws.sh")) as ip_detect_fh:
            ip_detect_script = ip_detect_fh.read()
        with open('ssh_key', 'r') as key_fh:
            ssh_key = key_fh.read()
        # Using static exhibitor is the only option in the GUI installer
        if options.use_api:
            log.info('Installer API is selected, so configure for static backend')
            zk_host = None  # causes genconf to use static exhibitor backend
        else:
            log.info('Installer CLI is selected, so configure for ZK backend')
            zk_host = registry_host + ':2181'
            zk_cmd = [
                'sudo', 'docker', 'run', '-d', '-p', '2181:2181', '-p',
                '2888:2888', '-p', '3888:3888', 'jplock/zookeeper']
            test_host_tunnel.remote_cmd(zk_cmd)
        log.info("Configuring install...")
        installer.genconf(
            zk_host=zk_host,
            master_list=master_list,
            agent_list=agent_list,
            public_agent_list=public_agent_list,
            ip_detect_script=ip_detect_script,
            ssh_user=ssh_user,
            ssh_key=ssh_key)
        log.info("Running Preflight...")
        if options.test_install_prereqs:
            # Runs preflight in --web or --install-prereqs for CLI
            # This may take up to 15 minutes...
            installer.install_prereqs()
            if options.test_install_prereqs_only:
                if vpc:
                    vpc.delete()
                sys.exit(0)
        else:
            # Will not fix errors detected in preflight
            installer.preflight()
        log.info("Running Deploy...")
        installer.deploy()
        log.info("Running Postflight")
        installer.postflight()
        # Runs dcos-image/integration_test.py inside the cluster
        setup_integration_test(tunnel=test_host_tunnel, test_dir=remote_dir)
        integration_test(
            tunnel=test_host_tunnel,
            test_dir=remote_dir,
            region=vpc.get_region() if vpc else DEFAULT_AWS_REGION,
            dcos_dns=master_list[0],
            master_list=master_list,
            agent_list=agent_list,
            public_agent_list=public_agent_list,
            registry_host=registry_host,
            variant=options.variant,
            # Setting dns_search: mesos not currently supported in API
            test_dns_search=not options.use_api,
            ci_flags=options.ci_flags,
            aws_access_key_id=options.aws_access_key_id,
            aws_secret_access_key=options.aws_secret_access_key)
    # TODO(cmaloney): add a `--healthcheck` option which runs dcos-diagnostics
    # on every host to see if they are working.
    # Fix: log message was misspelled "successsful".
    log.info("Test successful!")
    # Delete the cluster if all was successful to minimize potential costs.
    # Failed clusters the hosts will continue running
    if vpc is not None:
        vpc.delete()