Code example #1
0
File: monitor.py — Project: tomassa/batch-shipyard
def create_monitoring_resource(auth_client, resource_client, compute_client,
                               network_client, blob_client, table_client,
                               config, resources_path, bootstrap_file,
                               monitoring_files):
    # type: (azure.mgmt.authorization.AuthorizationManagementClient,
    #        azure.mgmt.resource.resources.ResourceManagementClient,
    #        azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient,
    #        azure.storage.blob.BlockBlobService,
    #        azure.cosmosdb.table.TableService,
    #        dict, pathlib.Path, Tuple[str, pathlib.Path],
    #        Dict[str, Tuple[str, pathlib.Path]]) -> None
    """Create a monitoring resource

    Provisions a single monitoring VM end-to-end: checks for a pre-existing
    VM, templates the compose/prometheus/nginx config files, uploads them to
    blob storage, creates the network resources (nsg, vnet/subnet, optional
    public ip, nic), creates the VM with a managed service identity, assigns
    Reader roles to that identity, and installs the bootstrap VM extensions.

    :param azure.mgmt.authorization.AuthorizationManagementClient auth_client:
        auth client
    :param azure.mgmt.resource.resources.ResourceManagementClient
        resource_client: resource client
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param azure.cosmosdb.table.TableService table_client: table client
    :param dict config: configuration dict
    :param pathlib.Path resources_path: local path used to stage generated
        config files prior to upload
    :param Tuple[str, pathlib.Path] bootstrap_file: customscript bootstrap file
    :param Dict[str, Tuple[str, pathlib.Path]] monitoring_files:
        configurable monitoring files keyed by logical name ('compose',
        'compose-nonginx', 'prometheus', 'nginx', 'dashboard') mapping to
        (blob name, local path) tuples
    :raises ValueError: if the management subscription id is missing or the
        lets encrypt/public ip settings conflict
    :raises RuntimeError: if the VM already exists or a role id is not found
    """
    ms = settings.monitoring_settings(config)
    # get subscription id for msi
    sub_id = settings.credentials_management(config).subscription_id
    if util.is_none_or_empty(sub_id):
        raise ValueError('Management subscription id not specified')
    # check if cluster already exists
    logger.debug('checking if monitoring resource exists')
    try:
        vm = compute_client.virtual_machines.get(
            resource_group_name=ms.resource_group,
            vm_name=settings.generate_virtual_machine_name(ms, 0))
        raise RuntimeError(
            'Existing virtual machine {} found for monitoring'.format(vm.id))
    except msrestazure.azure_exceptions.CloudError as e:
        # 404 means the VM does not exist (the desired state); anything else
        # is an unexpected cloud error and is re-raised
        if e.status_code == 404:
            pass
        else:
            raise
    # confirm before proceeding
    if not util.confirm_action(config, 'create monitoring resource'):
        return
    # create resource group if it doesn't exist
    resource.create_resource_group(resource_client, ms.resource_group,
                                   ms.location)
    # check for conflicting options: lets encrypt needs a public endpoint
    # for the ACME challenge
    servconf = settings.monitoring_services_settings(config)
    if servconf.lets_encrypt_enabled and not ms.public_ip.enabled:
        raise ValueError(
            'cannot create a monitoring resource without a public ip and '
            'lets encrypt enabled')
    # create storage container
    storage.create_storage_containers_nonbatch(blob_client, table_client, None,
                                               'monitoring')
    # configure yaml files and write to resources; the nginx-fronted compose
    # file is only used when lets encrypt (and thus a public ip) is enabled
    if servconf.lets_encrypt_enabled and ms.public_ip.enabled:
        with monitoring_files['compose'][1].open('r') as f:
            compdata = f.read()
    else:
        with monitoring_files['compose-nonginx'][1].open('r') as f:
            compdata = f.read()
    with monitoring_files['prometheus'][1].open('r') as f:
        promdata = f.read()
    with monitoring_files['nginx'][1].open('r') as f:
        nginxdata = f.read()
    compdata = compdata.replace('{GRAFANA_ADMIN_USER}',
                                servconf.grafana.admin_user).replace(
                                    '{GRAFANA_ADMIN_PASSWORD}',
                                    servconf.grafana.admin_password)
    # substitute {PROMETHEUS_PORT}: with nginx in front, the compose file
    # gets a port mapping and nginx gets the upstream port; without nginx
    # the port is substituted directly (default 9090 when unspecified)
    # NOTE(review): assumes servconf.prometheus.port is a str — str.replace
    # would raise TypeError on an int; confirm against settings parsing
    if servconf.prometheus.port is not None:
        if servconf.lets_encrypt_enabled and ms.public_ip.enabled:
            compdata = compdata.replace(
                '{PROMETHEUS_PORT}',
                '- "{p}:{p}"'.format(p=servconf.prometheus.port))
            nginxdata = nginxdata.replace('{PROMETHEUS_PORT}',
                                          servconf.prometheus.port)
        else:
            compdata = compdata.replace('{PROMETHEUS_PORT}',
                                        servconf.prometheus.port)
    else:
        if servconf.lets_encrypt_enabled and ms.public_ip.enabled:
            compdata = compdata.replace('{PROMETHEUS_PORT}', '')
            nginxdata = nginxdata.replace('{PROMETHEUS_PORT}', '9090')
        else:
            compdata = compdata.replace('{PROMETHEUS_PORT}', '9090')
    promdata = promdata.replace('{PROMETHEUS_SCRAPE_INTERVAL}',
                                servconf.prometheus.scrape_interval)
    # stage templated files locally for upload
    compyml = resources_path / monitoring_files['compose'][0]
    promyml = resources_path / monitoring_files['prometheus'][0]
    nginxconf = resources_path / monitoring_files['nginx'][0]
    with compyml.open('wt') as f:
        f.write(compdata)
    with promyml.open('wt') as f:
        f.write(promdata)
    with nginxconf.open('wt') as f:
        f.write(nginxdata)
    # release the (potentially large) templated contents
    del compdata
    del promdata
    del nginxdata
    # rebind monitoring_files to the final upload list of
    # (blob name, local path) tuples
    monitoring_files = [
        bootstrap_file,
        monitoring_files['dashboard'],
        (monitoring_files['compose'][0], compyml),
        (monitoring_files['prometheus'][0], promyml),
        (monitoring_files['nginx'][0], nginxconf),
    ]
    add_dash = None
    if util.is_not_empty(servconf.grafana.additional_dashboards):
        # write additional dashboards as "name,url" lines for bootstrap
        add_dash = resources_path / 'additional_dashboards.txt'
        with add_dash.open('wt') as f:
            for key in servconf.grafana.additional_dashboards:
                f.write('{},{}\n'.format(
                    key, servconf.grafana.additional_dashboards[key]))
        monitoring_files.append((add_dash.name, add_dash))
    # upload scripts to blob storage for customscript vm extension
    blob_urls = storage.upload_for_nonbatch(blob_client, monitoring_files,
                                            'monitoring')
    # best-effort removal of the locally staged files after upload
    try:
        compyml.unlink()
    except OSError:
        pass
    try:
        promyml.unlink()
    except OSError:
        pass
    try:
        nginxconf.unlink()
    except OSError:
        pass
    if add_dash is not None:
        try:
            add_dash.unlink()
        except OSError:
            pass
    # async operation dictionary
    async_ops = {}
    # create nsg
    async_ops['nsg'] = resource.AsyncOperation(
        functools.partial(resource.create_network_security_group,
                          network_client, ms))
    # use dynamic ips for private
    private_ips = None
    logger.debug('using dynamic private ip address allocation')
    # create virtual network and subnet if specified
    vnet, subnet = resource.create_virtual_network_and_subnet(
        resource_client, network_client, ms.virtual_network.resource_group,
        ms.location, ms.virtual_network)
    # create public ips
    pips = None
    if ms.public_ip.enabled:
        async_ops['pips'] = {}
        async_ops['pips'][0] = resource.AsyncOperation(
            functools.partial(resource.create_public_ip, network_client, ms,
                              0))
        logger.debug('waiting for public ips to provision')
        pips = {}
        for offset in async_ops['pips']:
            pip = async_ops['pips'][offset].result()
            logger.info(('public ip: {} [provisioning_state={} ip_address={} '
                         'public_ip_allocation={}]').format(
                             pip.id, pip.provisioning_state, pip.ip_address,
                             pip.public_ip_allocation_method))
            pips[offset] = pip
    else:
        logger.info('public ip is disabled for monitoring resource')
    # get nsg
    logger.debug('waiting for network security group to provision')
    nsg = async_ops['nsg'].result()
    # create nics
    async_ops['nics'] = {}
    async_ops['nics'][0] = resource.AsyncOperation(
        functools.partial(resource.create_network_interface, network_client,
                          ms, subnet, nsg, private_ips, pips, 0))
    # wait for nics to be created
    logger.debug('waiting for network interfaces to provision')
    nics = {}
    for offset in async_ops['nics']:
        nic = async_ops['nics'][offset].result()
        logger.info(
            ('network interface: {} [provisioning_state={} private_ip={} '
             'private_ip_allocation_method={} network_security_group={} '
             'accelerated_networking={}]').format(
                 nic.id, nic.provisioning_state,
                 nic.ip_configurations[0].private_ip_address,
                 nic.ip_configurations[0].private_ip_allocation_method,
                 nsg.name if nsg is not None else None,
                 nic.enable_accelerated_networking))
        nics[offset] = nic
    # read or generate ssh keys
    if util.is_not_empty(ms.ssh.ssh_public_key_data):
        key_data = ms.ssh.ssh_public_key_data
    else:
        # create universal ssh key for all vms if not specified
        ssh_pub_key = ms.ssh.ssh_public_key
        if ssh_pub_key is None:
            _, ssh_pub_key = crypto.generate_ssh_keypair(
                ms.ssh.generated_file_export_path,
                crypto.get_monitoring_ssh_key_prefix())
        # read public key data
        with ssh_pub_key.open('rb') as fd:
            key_data = fd.read().decode('utf8')
    # rebind ssh_pub_key from a path to the SDK model used by the VM create
    ssh_pub_key = compute_client.virtual_machines.models.SshPublicKey(
        path='/home/{}/.ssh/authorized_keys'.format(ms.ssh.username),
        key_data=key_data,
    )
    # create vms with a managed service identity (for role assignments below)
    async_ops['vms'] = {}
    async_ops['vms'][0] = resource.AsyncOperation(
        functools.partial(resource.create_virtual_machine,
                          compute_client,
                          ms,
                          None,
                          nics,
                          None,
                          ssh_pub_key,
                          0,
                          enable_msi=True))
    # wait for vms to be created
    logger.info('waiting for {} virtual machines to provision'.format(
        len(async_ops['vms'])))
    vms = {}
    for offset in async_ops['vms']:
        vms[offset] = async_ops['vms'][offset].result()
    logger.debug('{} virtual machines created'.format(len(vms)))
    # create role assignments for msi identity
    logger.debug('assigning roles to msi identity')
    sub_scope = '/subscriptions/{}/'.format(sub_id)
    cont_role = None
    for role in auth_client.role_definitions.list(
            sub_scope, filter='roleName eq \'Reader\''):
        cont_role = role.id
        break
    if cont_role is None:
        raise RuntimeError('Role Id not found for Reader')
    # sometimes the sp created is not added to the directory in time for
    # the following call, allow a minute worth of retries before giving up
    attempts = 0
    while attempts < 30:
        try:
            role_assign = auth_client.role_assignments.create(
                scope=sub_scope,
                role_assignment_name=uuid.uuid4(),
                parameters=authmodels.RoleAssignmentCreateParameters(
                    role_definition_id=cont_role,
                    principal_id=vms[0].identity.principal_id),
            )
            break
        except msrestazure.azure_exceptions.CloudError:
            time.sleep(2)
            attempts += 1
            if attempts == 30:
                raise
    del attempts
    if settings.verbose(config):
        logger.debug('reader role assignment: {}'.format(role_assign))
    cont_role = None
    for role in auth_client.role_definitions.list(
            sub_scope, filter='roleName eq \'Reader and Data Access\''):
        cont_role = role.id
        break
    if cont_role is None:
        raise RuntimeError('Role Id not found for Reader and Data Access')
    role_assign = auth_client.role_assignments.create(
        scope=sub_scope,
        role_assignment_name=uuid.uuid4(),
        parameters=authmodels.RoleAssignmentCreateParameters(
            role_definition_id=cont_role,
            principal_id=vms[0].identity.principal_id),
    )
    if settings.verbose(config):
        logger.debug(
            'reader and data access role assignment: {}'.format(role_assign))
    # get ip info for vm
    # note: offset here is the last index from the nic loop above (0, since
    # only a single vm is provisioned)
    if util.is_none_or_empty(pips):
        fqdn = None
        ipinfo = 'private_ip_address={}'.format(
            nics[offset].ip_configurations[0].private_ip_address)
    else:
        # refresh public ip for vm
        pip = network_client.public_ip_addresses.get(
            resource_group_name=ms.resource_group,
            public_ip_address_name=pips[offset].name,
        )
        fqdn = pip.dns_settings.fqdn
        ipinfo = 'fqdn={} public_ip_address={}'.format(fqdn, pip.ip_address)
        # temporary enable port 80 for ACME challenge if fqdn is present
        if servconf.lets_encrypt_enabled:
            isr = settings.InboundNetworkSecurityRule(
                destination_port_range='80',
                source_address_prefix='*',
                protocol='tcp',
            )
            logger.debug('creating temporary port 80 rule for ACME challenge')
            async_ops['port80'] = resource.AsyncOperation(
                functools.partial(resource.add_inbound_network_security_rule,
                                  network_client, ms, 'acme80', isr))
    # install msi vm extension
    async_ops['vmext'] = {}
    async_ops['vmext'][0] = resource.AsyncOperation(
        functools.partial(resource.create_msi_virtual_machine_extension,
                          compute_client, ms, vms[0].name, 0,
                          settings.verbose(config)),
        max_retries=0,
    )
    logger.debug('waiting for virtual machine msi extensions to provision')
    for offset in async_ops['vmext']:
        async_ops['vmext'][offset].result()
    # ensure port 80 rule is ready
    if servconf.lets_encrypt_enabled and ms.public_ip.enabled:
        async_ops['port80'].result()
    # install vm extension (reuses the 'vmext' slot after the msi extension
    # has completed above)
    async_ops['vmext'][0] = resource.AsyncOperation(
        functools.partial(_create_virtual_machine_extension, compute_client,
                          config, ms, bootstrap_file, blob_urls, vms[0].name,
                          private_ips, fqdn, 0, settings.verbose(config)),
        max_retries=0,
    )
    logger.debug('waiting for virtual machine extensions to provision')
    for offset in async_ops['vmext']:
        # get vm extension result
        vm_ext = async_ops['vmext'][offset].result()
        vm = vms[offset]
        logger.info(
            ('virtual machine: {} [provisioning_state={}/{} '
             'vm_size={} {}]').format(vm.id, vm.provisioning_state,
                                      vm_ext.provisioning_state,
                                      vm.hardware_profile.vm_size, ipinfo))
    # disable port 80 for ACME challenge
    if servconf.lets_encrypt_enabled and ms.public_ip.enabled:
        logger.debug('removing temporary port 80 rule for ACME challenge')
        async_ops['port80'] = resource.AsyncOperation(
            functools.partial(resource.remove_inbound_network_security_rule,
                              network_client, ms, 'acme80'))
        async_ops['port80'].result()
    # output connection info
    if ms.public_ip.enabled:
        logger.info(('To connect to Grafana, open a web browser and go '
                     'to https://{}').format(fqdn))
        if servconf.prometheus.port is not None:
            logger.info(('To connect to Prometheus, open a web browser and go '
                         'to https://{}:{}').format(fqdn,
                                                    servconf.prometheus.port))
    else:
        logger.info(('To connect to Grafana, open a web browser and go '
                     'to http://{} within the virtual network').format(
                         nics[offset].ip_configurations[0].private_ip_address))
        if servconf.prometheus.port is not None:
            logger.info(
                ('To connect to Prometheus, open a web browser and go '
                 'to http://{}:{} within the virtual network').format(
                     nics[offset].ip_configurations[0].private_ip_address,
                     servconf.prometheus.port))
Code example #2
0
def create_federation_proxy(auth_client, resource_client, compute_client,
                            network_client, blob_client, table_client,
                            queue_client, config, resources_path,
                            bootstrap_file, federation_files):
    # type: (azure.mgmt.authorization.AuthorizationManagementClient,
    #        azure.mgmt.resource.resources.ResourceManagementClient,
    #        azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient,
    #        azure.storage.blob.BlockBlobService,
    #        azure.cosmosdb.table.TableService,
    #        azure.storage.queue.QueueService,
    #        dict, pathlib.Path, Tuple[str, pathlib.Path],
    #        List[Tuple[str, pathlib.Path]]) -> None
    """Create a federation proxy

    Provisions a single federation proxy VM end-to-end: checks for a
    pre-existing VM, creates the storage containers/file share/global lock,
    uploads the federation files to blob storage, creates the network
    resources (nsg, vnet/subnet, optional public ip, nic), creates the VM
    with a managed service identity, assigns the Contributor and
    Reader and Data Access roles to that identity, and installs the
    bootstrap VM extension.

    :param azure.mgmt.authorization.AuthorizationManagementClient auth_client:
        auth client
    :param azure.mgmt.resource.resources.ResourceManagementClient
        resource_client: resource client
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param azure.cosmosdb.table.TableService table_client: table client
    :param azure.storage.queue.QueueService queue_client: queue client
    :param dict config: configuration dict
    :param pathlib.Path resources_path: resources path
    :param Tuple[str, pathlib.Path] bootstrap_file: customscript bootstrap file
    :param List[Tuple[str, pathlib.Path]] federation_files: federation files
    :raises ValueError: if the management subscription id is missing
    :raises RuntimeError: if the VM already exists or a role id is not found
    """
    fs = settings.federation_settings(config)
    # get subscription id for msi
    sub_id = settings.credentials_management(config).subscription_id
    if util.is_none_or_empty(sub_id):
        raise ValueError('Management subscription id not specified')
    # check if cluster already exists
    logger.debug('checking if federation proxy exists')
    try:
        vm = compute_client.virtual_machines.get(
            resource_group_name=fs.resource_group,
            vm_name=settings.generate_virtual_machine_name(fs, 0))
        raise RuntimeError(
            'Existing virtual machine {} found for federation proxy'.format(
                vm.id))
    except msrestazure.azure_exceptions.CloudError as e:
        # 404 means the VM does not exist (the desired state); anything else
        # is an unexpected cloud error and is re-raised
        if e.status_code == 404:
            pass
        else:
            raise
    # confirm before proceeding
    if not util.confirm_action(config, 'create federation proxy'):
        return
    # create resource group if it doesn't exist
    resource.create_resource_group(resource_client, fs.resource_group,
                                   fs.location)
    # create storage containers
    storage.create_storage_containers_nonbatch(blob_client, table_client,
                                               queue_client, 'federation')
    # create file share for log persistence
    bs = settings.batch_shipyard_settings(config)
    storage.create_file_share_saskey(
        settings.credentials_storage(
            config,
            bs.storage_account_settings,
        ),
        '{}fedlogs'.format(bs.storage_entity_prefix),
        'ingress',
        create_share=True,
    )
    # create global lock
    storage.create_global_lock_blob(blob_client, 'federation')
    # upload scripts to blob storage for customscript vm extension
    blob_urls = storage.upload_for_nonbatch(blob_client, federation_files,
                                            'federation')
    # async operation dictionary
    async_ops = {}
    # create nsg
    async_ops['nsg'] = resource.AsyncOperation(
        functools.partial(resource.create_network_security_group,
                          network_client, fs))
    # use dynamic ips for private
    private_ips = None
    logger.debug('using dynamic private ip address allocation')
    # create virtual network and subnet if specified
    vnet, subnet = resource.create_virtual_network_and_subnet(
        resource_client, network_client, fs.virtual_network.resource_group,
        fs.location, fs.virtual_network)
    # create public ips
    pips = None
    if fs.public_ip.enabled:
        async_ops['pips'] = {}
        async_ops['pips'][0] = resource.AsyncOperation(
            functools.partial(resource.create_public_ip, network_client, fs,
                              0))
        logger.debug('waiting for public ips to provision')
        pips = {}
        for offset in async_ops['pips']:
            pip = async_ops['pips'][offset].result()
            logger.info(('public ip: {} [provisioning_state={} ip_address={} '
                         'public_ip_allocation={}]').format(
                             pip.id, pip.provisioning_state, pip.ip_address,
                             pip.public_ip_allocation_method))
            pips[offset] = pip
    else:
        # fixed: this previously logged "monitoring resource" (copy-paste)
        logger.info('public ip is disabled for federation proxy')
    # get nsg
    logger.debug('waiting for network security group to provision')
    nsg = async_ops['nsg'].result()
    # create nics
    async_ops['nics'] = {}
    async_ops['nics'][0] = resource.AsyncOperation(
        functools.partial(resource.create_network_interface, network_client,
                          fs, subnet, nsg, private_ips, pips, 0))
    # wait for nics to be created
    logger.debug('waiting for network interfaces to provision')
    nics = {}
    for offset in async_ops['nics']:
        nic = async_ops['nics'][offset].result()
        logger.info(
            ('network interface: {} [provisioning_state={} private_ip={} '
             'private_ip_allocation_method={} network_security_group={} '
             'accelerated_networking={}]').format(
                 nic.id, nic.provisioning_state,
                 nic.ip_configurations[0].private_ip_address,
                 nic.ip_configurations[0].private_ip_allocation_method,
                 nsg.name if nsg is not None else None,
                 nic.enable_accelerated_networking))
        nics[offset] = nic
    # read or generate ssh keys
    if util.is_not_empty(fs.ssh.ssh_public_key_data):
        key_data = fs.ssh.ssh_public_key_data
    else:
        # create universal ssh key for all vms if not specified
        ssh_pub_key = fs.ssh.ssh_public_key
        if ssh_pub_key is None:
            _, ssh_pub_key = crypto.generate_ssh_keypair(
                fs.ssh.generated_file_export_path,
                crypto.get_federation_ssh_key_prefix())
        # read public key data
        with ssh_pub_key.open('rb') as fd:
            key_data = fd.read().decode('utf8')
    # rebind ssh_pub_key from a path to the SDK model used by the VM create
    ssh_pub_key = compute_client.virtual_machines.models.SshPublicKey(
        path='/home/{}/.ssh/authorized_keys'.format(fs.ssh.username),
        key_data=key_data,
    )
    # create vms with a managed service identity (for role assignments below)
    async_ops['vms'] = {}
    async_ops['vms'][0] = resource.AsyncOperation(
        functools.partial(resource.create_virtual_machine,
                          compute_client,
                          fs,
                          None,
                          nics,
                          None,
                          ssh_pub_key,
                          0,
                          enable_msi=True))
    # wait for vms to be created
    logger.info('waiting for {} virtual machines to provision'.format(
        len(async_ops['vms'])))
    vms = {}
    for offset in async_ops['vms']:
        vms[offset] = async_ops['vms'][offset].result()
    logger.debug('{} virtual machines created'.format(len(vms)))
    # create role assignments for msi identity
    logger.debug('assigning roles to msi identity')
    sub_scope = '/subscriptions/{}/'.format(sub_id)
    cont_role = None
    for role in auth_client.role_definitions.list(
            sub_scope, filter='roleName eq \'Contributor\''):
        cont_role = role.id
        break
    if cont_role is None:
        # fixed: error message previously referenced 'Reader' while the
        # filter above searches for the 'Contributor' role
        raise RuntimeError('Role Id not found for Contributor')
    # sometimes the sp created is not added to the directory in time for
    # the following call, allow some retries before giving up
    attempts = 0
    while attempts < 90:
        try:
            role_assign = auth_client.role_assignments.create(
                scope=sub_scope,
                role_assignment_name=uuid.uuid4(),
                parameters=authmodels.RoleAssignmentCreateParameters(
                    role_definition_id=cont_role,
                    principal_id=vms[0].identity.principal_id),
            )
            break
        except msrestazure.azure_exceptions.CloudError:
            time.sleep(2)
            attempts += 1
            if attempts == 90:
                raise
    del attempts
    if settings.verbose(config):
        # fixed: previously logged 'reader role assignment' for the
        # Contributor assignment
        logger.debug('contributor role assignment: {}'.format(role_assign))
    cont_role = None
    for role in auth_client.role_definitions.list(
            sub_scope, filter='roleName eq \'Reader and Data Access\''):
        cont_role = role.id
        break
    if cont_role is None:
        raise RuntimeError('Role Id not found for Reader and Data Access')
    role_assign = auth_client.role_assignments.create(
        scope=sub_scope,
        role_assignment_name=uuid.uuid4(),
        parameters=authmodels.RoleAssignmentCreateParameters(
            role_definition_id=cont_role,
            principal_id=vms[0].identity.principal_id),
    )
    if settings.verbose(config):
        logger.debug(
            'reader and data access role assignment: {}'.format(role_assign))
    # get ip info for vm
    # note: offset here is the last index from the nic loop above (0, since
    # only a single vm is provisioned)
    if util.is_none_or_empty(pips):
        fqdn = None
        ipinfo = 'private_ip_address={}'.format(
            nics[offset].ip_configurations[0].private_ip_address)
    else:
        # refresh public ip for vm
        pip = network_client.public_ip_addresses.get(
            resource_group_name=fs.resource_group,
            public_ip_address_name=pips[offset].name,
        )
        fqdn = pip.dns_settings.fqdn
        ipinfo = 'fqdn={} public_ip_address={}'.format(fqdn, pip.ip_address)
    # install vm extension
    # note: a stray wait loop over async_ops['vmext'] that ran BEFORE this
    # dict was created (a copy-paste remnant of the monitoring resource's
    # msi extension wait) was removed here; it raised KeyError at runtime
    async_ops['vmext'] = {}
    async_ops['vmext'][0] = resource.AsyncOperation(
        functools.partial(_create_virtual_machine_extension, compute_client,
                          config, fs, bootstrap_file, blob_urls, vms[0].name,
                          private_ips, fqdn, 0, settings.verbose(config)),
        max_retries=0,
    )
    logger.debug('waiting for virtual machine extensions to provision')
    for offset in async_ops['vmext']:
        # get vm extension result
        vm_ext = async_ops['vmext'][offset].result()
        vm = vms[offset]
        logger.info(
            ('virtual machine: {} [provisioning_state={}/{} '
             'vm_size={} {}]').format(vm.id, vm.provisioning_state,
                                      vm_ext.provisioning_state,
                                      vm.hardware_profile.vm_size, ipinfo))