def start(self):
    credentials, subscription_id = self._get_credentials()

    # Cluster name
    cluster_name = self.config.get("cluster", None)
    if _is_none_or_blank(cluster_name):
        cluster_name = self.cluster_name
        logging.info("Using same cluster name as DSS: {}".format(cluster_name))

    # Resource group
    resource_group = self.config.get('resourceGroup', None)
    if _is_none_or_blank(resource_group):
        metadata = get_instance_metadata()
        resource_group = metadata["compute"]["resourceGroupName"]
        logging.info("Using same resource group as DSS: {}".format(resource_group))

    clusters_client = ContainerServiceClient(credentials, subscription_id)

    # Get kubeconfig
    logging.info("Fetching kubeconfig for cluster %s in %s", cluster_name, resource_group)
    def do_fetch():
        return clusters_client.managed_clusters.list_cluster_admin_credentials(resource_group, cluster_name)
    get_credentials_result = run_and_process_cloud_error(do_fetch)
    kube_config_content = get_credentials_result.kubeconfigs[0].value.decode('utf8')
    kube_config_path = os.path.join(os.getcwd(), 'kube_config')
    with open(kube_config_path, 'w') as f:
        f.write(kube_config_content)

    overrides = make_overrides(self.config, yaml.safe_load(kube_config_content), kube_config_path)

    # Get other cluster infos
    def do_inspect():
        return clusters_client.managed_clusters.get(resource_group, cluster_name)
    get_cluster_result = run_and_process_cloud_error(do_inspect)

    return [overrides, {'kube_config_path': kube_config_path, 'cluster': get_cluster_result.as_dict()}]
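# The get_instance_metadata() helper used above is not defined in this section. A minimal
# sketch of what it plausibly does, assuming it queries the Azure Instance Metadata Service
# (IMDS); the exact api-version is an assumption:
import requests

def get_instance_metadata():
    # IMDS is only reachable from within an Azure VM at this fixed link-local address;
    # the Metadata header is mandatory and rejects proxied requests
    return requests.get(
        "http://169.254.169.254/metadata/instance",
        params={"api-version": "2021-02-01"},
        headers={"Metadata": "true"},
        timeout=2,
    ).json()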
def start(self):
    connection_info = self.config.get("connectionInfo", {})
    connection_info_secret = self.plugin_config.get("connectionInfo", {})
    subscription_id = connection_info.get('subscriptionId', None)
    if _is_none_or_blank(subscription_id):
        raise Exception('Subscription must be defined')

    credentials = get_credentials_from_connection_info(connection_info, connection_info_secret)
    clusters_client = ContainerServiceClient(credentials, subscription_id)

    resource_group_name = self.config.get('resourceGroup', None)
    if _is_none_or_blank(resource_group_name):
        raise Exception("A resource group to put the cluster in is required")
    cluster_name = self.config.get('cluster', self.cluster_name)

    logging.info("Fetching kubeconfig for cluster %s in %s" % (cluster_name, resource_group_name))
    def do_fetch():
        return clusters_client.managed_clusters.list_cluster_admin_credentials(resource_group_name, cluster_name)
    get_credentials_result = run_and_process_cloud_error(do_fetch)
    kube_config_content = get_credentials_result.kubeconfigs[0].value.decode('utf8')
    kube_config_path = os.path.join(os.getcwd(), 'kube_config')
    with open(kube_config_path, 'w') as f:
        f.write(kube_config_content)

    overrides = make_overrides(self.config, yaml.safe_load(kube_config_content), kube_config_path)

    def do_inspect():
        return clusters_client.managed_clusters.get(resource_group_name, cluster_name)
    get_cluster_result = run_and_process_cloud_error(do_inspect)

    return [
        overrides,
        {
            'kube_config_path': kube_config_path,
            'cluster': get_cluster_result.as_dict()
        }
    ]
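# run_and_process_cloud_error() wraps every Azure call in these plugins. A plausible
# sketch, assuming its job is simply to surface the Azure error payload in the exception
# message (the real helper may do more):
from azure.core.exceptions import HttpResponseError
from msrestazure.azure_exceptions import CloudError

def run_and_process_cloud_error(fn):
    try:
        return fn()
    except CloudError as e:
        # legacy track-1 SDK error: unwrap the service-provided message
        raise Exception("Azure call failed: %s" % e.message)
    except HttpResponseError as e:
        # track-2 SDK error
        raise Exception("Azure call failed: %s" % e.message)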
def start(self):
    # retrieve the cluster info from GKE
    # this will fail if the cluster doesn't exist, but the API message is enough
    clusters = get_cluster_from_connection_info(self.config['connectionInfo'], self.plugin_config['connectionInfo'])
    cluster = clusters.get_cluster(self.config.get('clusterId', self.cluster_name))
    cluster_info = cluster.get_info()

    # build the config file for kubectl
    # we don't add the context to the main config file, to not end up with an oversized config,
    # and because 2 different clusters could be concurrently editing the config file
    kube_config_path = os.path.join(os.getcwd(), 'kube_config')
    kube_config = cluster.get_kube_config(self.cluster_id)
    with open(kube_config_path, 'w') as f:
        yaml.safe_dump(kube_config, f, default_flow_style=False)

    # add the admin role so that we can do the managed kubernetes stuff for spark
    create_admin_binding(self.config.get("userName", None), kube_config_path)

    # collect and prepare the overrides so that DSS can know where and how to use the cluster
    overrides = make_overrides(self.config, kube_config, kube_config_path)
    return [overrides, {'kube_config_path': kube_config_path, 'cluster': cluster_info}]
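# create_admin_binding() is defined elsewhere; a sketch of the likely equivalent,
# assuming it shells out to kubectl to grant cluster-admin to the configured user
# (the binding name and error handling are assumptions):
import subprocess

def create_admin_binding(user_name, kube_config_path):
    if not user_name:
        return
    subprocess.check_call([
        "kubectl", "--kubeconfig", kube_config_path,
        "create", "clusterrolebinding", "dss-admin-binding",
        "--clusterrole=cluster-admin",
        "--user=%s" % user_name,
    ])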
def start(self):
    connection_info = self.config.get('connectionInfo', {})
    networking_settings = self.config["networkingSettings"]

    args = ['create', 'cluster']
    args = args + ['-v', '4']

    if not self.config.get('advanced'):
        args = args + ['--name', self.cluster_id]

        if _has_not_blank_property(connection_info, 'region'):
            args = args + ['--region', connection_info['region']]
        elif 'AWS_DEFAULT_REGION' in os.environ:
            args = args + ['--region', os.environ['AWS_DEFAULT_REGION']]

        args = args + ['--full-ecr-access']

        subnets = networking_settings.get('subnets', [])
        if networking_settings.get('privateNetworking', False):
            args = args + ['--node-private-networking']
            private_subnets = networking_settings.get('privateSubnets', [])
            if len(private_subnets) > 0:
                args = args + ['--vpc-private-subnets', ','.join(private_subnets)]
        if len(subnets) > 0:
            args = args + ['--vpc-public-subnets', ','.join(subnets)]

        security_groups = networking_settings.get('securityGroups', [])
        if len(security_groups) > 0:
            args = args + ['--node-security-groups', ','.join(security_groups)]

        node_pool = self.config.get('nodePool', {})
        if 'machineType' in node_pool:
            args = args + ['--node-type', node_pool['machineType']]
        if 'diskType' in node_pool:
            args = args + ['--node-volume-type', node_pool['diskType']]
        if 'diskSizeGb' in node_pool and node_pool['diskSizeGb'] > 0:
            args = args + ['--node-volume-size', str(node_pool['diskSizeGb'])]

        args = args + ['--nodes', str(node_pool.get('numNodes', 3))]
        if node_pool.get('numNodesAutoscaling', False):
            args = args + ['--asg-access']
            args = args + ['--nodes-min', str(node_pool.get('minNumNodes', 2))]
            args = args + ['--nodes-max', str(node_pool.get('maxNumNodes', 5))]

        k8s_version = self.config.get("k8sVersion", None)
        if not _is_none_or_blank(k8s_version):
            args = args + ['--version', k8s_version.strip()]
    else:
        yaml_dict = yaml.safe_load(self.config.get("advancedYaml"))
        yaml_loc = os.path.join(os.getcwd(), self.cluster_id + '_advanced.yaml')
        with open(yaml_loc, 'w') as outfile:
            yaml.dump(yaml_dict, outfile, default_flow_style=False)
        args = args + ['-f', yaml_loc]

    # we don't add the context to the main config file, to not end up with an oversized config,
    # and because 2 different clusters could be concurrently editing the config file
    kube_config_path = os.path.join(os.getcwd(), 'kube_config')
    args = args + ['--kubeconfig', kube_config_path]

    c = EksctlCommand(args, connection_info)
    if c.run_and_log() != 0:
        raise Exception("Failed to start cluster")

    args = ['get', 'cluster']
    args = args + ['--name', self.cluster_id]
    if _has_not_blank_property(connection_info, 'region'):
        args = args + ['--region', connection_info['region']]
    elif 'AWS_DEFAULT_REGION' in os.environ:
        args = args + ['--region', os.environ['AWS_DEFAULT_REGION']]
    args = args + ['-o', 'json']

    if _has_not_blank_property(connection_info, 'accessKey') and _has_not_blank_property(connection_info, 'secretKey'):
        creds_in_env = {
            'AWS_ACCESS_KEY_ID': connection_info['accessKey'],
            'AWS_SECRET_ACCESS_KEY': connection_info['secretKey']
        }
        add_authenticator_env(kube_config_path, creds_in_env)

    if not self.config.get('advanced'):
        if node_pool.get('numNodesAutoscaling', False):
            logging.info("Nodegroup is autoscaling, ensuring autoscaler")
            add_autoscaler_if_needed(self.cluster_id, kube_config_path)
    elif self.config.get('clusterAutoScaling'):
        logging.info("Nodegroup is autoscaling, ensuring autoscaler")
        add_autoscaler_if_needed(self.cluster_id, kube_config_path)

    c = EksctlCommand(args, connection_info)
    cluster_info = json.loads(c.run_and_get_output())[0]
    with open(kube_config_path, "r") as f:
        kube_config = yaml.safe_load(f)

    # collect and prepare the overrides so that DSS can know where and how to use the cluster
    overrides = make_overrides(self.config, kube_config, kube_config_path)
    return [
        overrides,
        {
            'kube_config_path': kube_config_path,
            'cluster': cluster_info
        }
    ]
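# EksctlCommand is a thin wrapper around the eksctl binary; a sketch under the
# assumption that it injects the connection's AWS credentials as environment
# variables and exposes both a streaming and a capturing runner:
import os
import subprocess

class EksctlCommand(object):
    def __init__(self, args, connection_info):
        self.args = ["eksctl"] + args
        self.env = os.environ.copy()
        if connection_info.get("accessKey"):
            self.env["AWS_ACCESS_KEY_ID"] = connection_info["accessKey"]
        if connection_info.get("secretKey"):
            self.env["AWS_SECRET_ACCESS_KEY"] = connection_info["secretKey"]

    def run_and_log(self):
        # stream eksctl output to the caller's log, return the exit code
        return subprocess.call(self.args, env=self.env)

    def run_and_get_output(self):
        # capture stdout (e.g. `eksctl get cluster -o json`)
        return subprocess.check_output(self.args, env=self.env).decode("utf8")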
def start(self): """ Build the create cluster request. """ credentials, subscription_id, managed_identity_id = self._get_credentials( ) # Fetch metadata about the instance metadata = get_instance_metadata() # Resource group resource_group = self.config.get('resourceGroup', None) dss_host_resource_group = metadata["compute"]["resourceGroupName"] if _is_none_or_blank(resource_group): resource_group = dss_host_resource_group logging.info( "Using same resource group as DSS: {}".format(resource_group)) # Location location = self.config.get('location', None) if _is_none_or_blank(location): location = metadata["compute"]["location"] logging.info("Using same location as DSS: {}".format(location)) # Consistency checks if _is_none_or_blank(resource_group): raise Exception( "A resource group to put the cluster in is required") if _is_none_or_blank(location): raise Exception("A location to put the cluster in is required") # AKS Client clusters_client = None # Credit the cluster to DATAIKU if os.environ.get("DISABLE_AZURE_USAGE_ATTRIBUTION", "0") == "1": logging.info("Azure usage attribution is disabled") clusters_client = ContainerServiceClient(credentials, subscription_id) else: policy = UserAgentPolicy() policy.add_user_agent('pid-fd3813c7-273c-5eec-9221-77323f62a148') clusters_client = ContainerServiceClient(credentials, subscription_id, user_agent_policy=policy) # check that the cluster doesn't exist yet, otherwise azure will try to update it # and will almost always fail try: existing = clusters_client.managed_clusters.get( resource_group, self.cluster_name) if existing is not None: raise Exception( "A cluster with name %s in resource group %s already exists" % (self.cluster_name, resource_group)) except CloudError as e: logging.info("Cluster doesn't seem to exist yet") except ResourceNotFoundError as e: logging.info("Cluster doesn't seem to exist yet") cluster_builder = ClusterBuilder(clusters_client) cluster_builder.with_name(self.cluster_name) cluster_builder.with_dns_prefix("{}-dns".format(self.cluster_name)) cluster_builder.with_resource_group(resource_group) cluster_builder.with_location(location) cluster_builder.add_tags(self.config.get("tags", None)) cluster_builder.with_linux_profile() # default is None cluster_builder.with_network_profile( service_cidr=self.config.get("serviceCIDR", None), dns_service_ip=self.config.get("dnsServiceIP", None), load_balancer_sku=self.config.get("loadBalancerSku", None), outbound_type=self.config.get("outboundType", None), network_plugin=self.config.get("networkPlugin"), docker_bridge_cidr=self.config.get("dockerBridgeCidr")) if self.config.get("useCustomNodeResourceGroup", False): cluster_builder.with_node_resource_group( self.config.get("nodeResourceGroup")) # Cluster identity connection_info = self.config.get("connectionInfo", None) cluster_idendity_legacy_use_distinct_sp = self.config.get( "useDistinctSPForCluster", False) cluster_idendity_legacy_sp = self.config.get("clusterServicePrincipal", None) cluster_identity_type = None cluster_identity = None if not _is_none_or_blank( connection_info) or cluster_idendity_legacy_use_distinct_sp: logging.warn( "Using legacy options to configure cluster identity. Clear them to use the new ones." 
) if not cluster_idendity_legacy_use_distinct_sp and not _is_none_or_blank( connection_info): cluster_sp = connection_info elif cluster_idendity_legacy_use_distinct_sp and not _is_none_or_blank( cluster_idendity_legacy_sp): cluster_sp = self.config.get("clusterServicePrincipal") else: raise Exception( "Legacy options are not complete enough to determine cluster identity settings" ) cluster_builder.with_cluster_sp_legacy( cluster_service_principal_connection_info=cluster_sp) else: cluster_identity = self.config.get( "clusterIdentity", {"identityType": "managed-identity"}) cluster_identity_type = cluster_identity.get( "identityType", "managed-identity") if cluster_identity_type == "managed-identity": if cluster_identity.get("inheritDSSIdentity", True): logging.info( "Need to inspect Managed Identity infos from Azure") if metadata is None: metadata = get_instance_metadata() vm_resource_group = metadata["compute"][ "resourceGroupName"] vm_name = metadata["compute"]["name"] compute_client = ComputeManagementClient( credentials, subscription_id) vm = compute_client.virtual_machines.get( vm_resource_group, vm_name) # No choice here but to use the first one if managed_identity_id is None: managed_identity_id = next( iter(vm.identity.user_assigned_identities.keys())) for managed_identity_resource_id, managed_identity_properties in vm.identity.user_assigned_identities.items( ): if managed_identity_id == managed_identity_resource_id or managed_identity_id == managed_identity_properties.client_id: break logging.info("Found managed identity id {}".format( managed_identity_resource_id)) cluster_builder.with_managed_identity( managed_identity_resource_id) cluster_builder.with_kubelet_identity( managed_identity_resource_id, managed_identity_properties.client_id, managed_identity_properties.principal_id) else: control_plane_mi = None if cluster_identity.get( "useAKSManagedIdentity", True ) else cluster_identity["controlPlaneUserAssignedIdentity"] cluster_builder.with_managed_identity(control_plane_mi) if control_plane_mi is None: logging.info( "Configure cluster with system managed identity.") else: logging.info( "Configure cluster with user assigned identity: {}" .format(control_plane_mi)) if not cluster_identity.get("useAKSManagedKubeletIdentity", True): kubelet_mi = cluster_identity[ "kubeletUserAssignedIdentity"] _, _, mi_subscription_id, _, mi_resource_group, _, _, _, mi_name = kubelet_mi.split( "/") msiclient = ManagedServiceIdentityClient( AzureIdentityCredentialAdapter(credentials), mi_subscription_id) mi = msiclient.user_assigned_identities.get( mi_resource_group, mi_name) cluster_builder.with_kubelet_identity( kubelet_mi, mi.client_id, mi.principal_id) logging.info( "Configure kubelet identity with user assigned identity resourceId=\"{}\", clientId=\"{}\", objectId=\"{}\"" .format(kubelet_mi, mi.client_id, mi.principal_id)) elif cluster_identity_type == "service-principal": cluster_builder.with_cluster_sp(cluster_identity["clientId"], cluster_identity["password"]) logging.info("Configure cluster with service principal") else: raise Exception( "Cluster identity type \"{}\" is unknown".format( cluster_identity_type)) # Fail fast for non existing ACRs to avoid drama in case of failure AFTER cluster is created acr_role_id = None authorization_client = None if cluster_identity_type is not None and cluster_identity is not None: if cluster_identity_type == "managed-identity" and cluster_identity.get( "useAKSManagedKubeletIdentity", True) and not cluster_identity.get("inheritDSSIdentity", True): acr_name 
= cluster_identity.get("attachToACRName", None) if not _is_none_or_blank(acr_name): # build acr scope acr_identifier_splitted = acr_name.split('/') acr_subscription_id = subscription_id acr_resource_group = resource_group if 9 == len(acr_identifier_splitted): _, _, acr_subscription_id, _, acr_resource_group, _, _, _, acr_name = acr_identifier_splitted elif 2 == len(acr_identifier_splitted): acr_resource_group, acr_name = acr_identifier_splitted authorization_client = AuthorizationManagementClient( credentials, acr_subscription_id) acr_scope = "/subscriptions/{acr_subscription_id}/resourceGroups/{acr_resource_group}/providers/Microsoft.ContainerRegistry/registries/{acr_name}".format( **locals()) try: acr_roles = list( authorization_client.role_definitions.list( acr_scope, "roleName eq 'AcrPull'")) except ResourceNotFoundError as e: raise Exception( "ACR {} not found. Check it exists and you are Owner of it." .format(acr_scope)) if 0 == len(acr_roles): raise Exception( "Could not find the AcrPull role on the ACR {}. Check you are Owner of it." .format(acr_scope)) else: acr_role_id = acr_roles[0].id logging.info("ACR pull role id: %s", acr_role_id) # Try to run a fake role assignment. Depending on the failure type we know if we are Owner or not try: fake_role_assignment = authorization_client.role_assignments.create( scope=acr_scope, role_assignment_name=str(uuid.uuid4()), parameters={ "properties": { "role_definition_id": acr_role_id, "principal_id": "00000000-0000-0000-0000-000000000000", }, }, ) except HttpResponseError as e: if e.reason == "Forbidden" and "AuthorizationFailed" in str( e.error): raise Exception( "Cannot create role assignments on ACR {}. Check that your are Owner of it or provide an existing Kubelet identity." .format(acr_scope)) elif e.reason == "Bad Request" and "PrincipalNotFound" in str( e.error): logging.info( "Fake role assignment on ACR looks ok. Identity should be allowed to assign roles in further steps." ) else: raise (e) except Exception as e: raise (e) # Sanity check for node pools node_pool_vnets = set() for idx, node_pool_conf in enumerate(self.config.get("nodePools", [])): node_pool_builder = cluster_builder.get_node_pool_builder() nodepool_vnet = node_pool_conf.get("vnet", None) nodepool_subnet = node_pool_conf.get("subnet", None) vnet, _ = node_pool_builder.resolve_network( inherit_from_host=node_pool_conf.get( "useSameNetworkAsDSSHost"), cluster_vnet=nodepool_vnet, cluster_subnet=nodepool_subnet, connection_info=connection_info, credentials=credentials, resource_group=resource_group, dss_host_resource_group=dss_host_resource_group) node_pool_vnets.add(vnet) if 1 < len(node_pool_vnets): raise Exception( "Node pools must all share the same vnet. Current node pools configuration yields vnets {}." 
.format(",".join(node_pool_vnets))) elif 0 == len(node_pool_vnets): raise Exception( "You cannot deploy a cluster without any node pool.") # Check role assignments for vnet like on ACR for fail fast if not doable vnet_id = node_pool_vnets.pop() if not vnet_id.startswith("/"): vnet_name = vnet_id vnet_id = "/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.Network/virtualNetworks/{vnet_name}".format( **locals()) vnet_role_id = None if cluster_identity_type is not None and cluster_identity is not None: if cluster_identity_type == "managed-identity" and cluster_identity.get( "useAKSManagedIdentity", True) and not cluster_identity.get("inheritDSSIdentity", True): authorization_client = AuthorizationManagementClient( credentials, subscription_id) try: vnet_roles = list( authorization_client.role_definitions.list( vnet_id, "roleName eq 'Contributor'")) except ResourceNotFoundError as e: raise Exception( "Vnet {} not found. Check it exists and you are Owner of it." .format(vnet_id)) if 0 == len(acr_roles): raise Exception( "Could not find the Contributor role on the vnet {}. Check you are Owner of it." .format(vnet_id)) else: vnet_role_id = vnet_roles[0].id logging.info("Vnet contributor role id: %s", acr_role_id) # Try to run a fake role assignment. Depending on the failure type we know if we are Owner or not try: fake_role_assignment = authorization_client.role_assignments.create( scope=vnet_id, role_assignment_name=str(uuid.uuid4()), parameters={ "properties": { "role_definition_id": vnet_role_id, "principal_id": "00000000-0000-0000-0000-000000000000", }, }, ) except HttpResponseError as e: if e.reason == "Forbidden" and "AuthorizationFailed" in str( e.error): raise Exception( "Cannot create role assignments on Vnet {}. Check that your are Owner of it or provide an existing Controle Plane identity." .format(vnet_id)) elif e.reason == "Bad Request" and "PrincipalNotFound" in str( e.error): logging.info( "Fake role assignment on Vnet looks ok. Identity should be allowed to assign roles in further steps." 
) else: raise (e) except Exception as e: raise (e) # Access level if self.config.get("privateAccess"): cluster_builder.with_private_access( self.config.get("privateAccess")) cluster_builder.with_cluster_version( self.config.get("clusterVersion", None)) # Node pools for idx, node_pool_conf in enumerate(self.config.get("nodePools", [])): node_pool_builder = cluster_builder.get_node_pool_builder() node_pool_builder.with_idx(idx) node_pool_builder.with_vm_size(node_pool_conf.get("vmSize", None)) vnet = node_pool_conf.get("vnet", None) subnet = node_pool_conf.get("subnet", None) node_pool_builder.with_network( inherit_from_host=node_pool_conf.get( "useSameNetworkAsDSSHost"), cluster_vnet=vnet, cluster_subnet=subnet, connection_info=connection_info, credentials=credentials, resource_group=resource_group, dss_host_resource_group=dss_host_resource_group) node_pool_builder.with_availability_zones( use_availability_zones=node_pool_conf.get( "useAvailabilityZones", True)) node_pool_builder.with_node_count( enable_autoscaling=node_pool_conf.get("autoScaling", False), num_nodes=node_pool_conf.get("numNodes", None), min_num_nodes=node_pool_conf.get("minNumNodes", None), max_num_nodes=node_pool_conf.get("maxNumNodes", None)) node_pool_builder.with_mode( mode=node_pool_conf.get("mode", "Automatic"), system_pods_only=node_pool_conf.get("systemPodsOnly", True)) node_pool_builder.with_disk_size_gb( disk_size_gb=node_pool_conf.get("osDiskSizeGb", 0)) node_pool_builder.with_node_labels( node_pool_conf.get("labels", None)) node_pool_builder.with_node_taints( node_pool_conf.get("taints", None)) node_pool_builder.add_tags(self.config.get("tags", None)) node_pool_builder.add_tags(node_pool_conf.get("tags", None)) node_pool_builder.build() cluster_builder.with_node_pool( node_pool=node_pool_builder.agent_pool_profile) # Run creation logging.info("Start creation of cluster") def do_creation(): cluster_create_op = cluster_builder.build() return cluster_create_op.result() create_result = run_and_process_cloud_error(do_creation) logging.info("Cluster creation finished") # Attach to ACR acr_attachment = {} if cluster_identity_type is not None and cluster_identity is not None: if cluster_identity_type == "managed-identity" and cluster_identity.get( "useAKSManagedKubeletIdentity", True) and not cluster_identity.get("inheritDSSIdentity", True): kubelet_mi_object_id = create_result.identity_profile.get( "kubeletidentity").object_id logging.info("Kubelet Managed Identity object id: %s", kubelet_mi_object_id) if not _is_none_or_blank(acr_role_id): logging.info("Assign ACR pull role id %s to %s", acr_role_id, kubelet_mi_object_id) role_assignment = authorization_client.role_assignments.create( scope=acr_scope, role_assignment_name=str(uuid.uuid4()), parameters={ "properties": { "role_definition_id": acr_role_id, "principal_id": kubelet_mi_object_id, }, }, ) acr_attachment.update({ "name": acr_name, "resource_group": acr_resource_group, "subscription_id": acr_subscription_id, "resource_id": acr_scope, "role_assignment": role_assignment.as_dict(), }) # Attach to VNET to allow LoadBalancers creation vnet_attachment = {} if cluster_identity_type is not None and cluster_identity is not None: if cluster_identity_type == "managed-identity" and cluster_identity.get( "useAKSManagedIdentity", True) and not cluster_identity.get("inheritDSSIdentity", True): # And here we are blocked because we cant get the principal id of a System Assigned Managed Id easily control_plane_object_id = create_result.identity.principal_id 
logging.info("Controle Plane Managed Identity object id: %s", control_plane_object_id) if not _is_none_or_blank(vnet_role_id): logging.info("Assign Vnet contributolr role id %s to %s", vnet_role_id, control_plane_object_id) vnet_role_assignment = authorization_client.role_assignments.create( scope=vnet_id, role_assignment_name=str(uuid.uuid4()), parameters={ "properties": { "role_definition_id": vnet_role_id, "principal_id": control_plane_object_id, }, }, ) vnet_attachment.update({ "subscription_id": subscription_id, "resource_id": vnet_id, "role_assignment": vnet_role_assignment.as_dict(), }) logging.info("Fetching kubeconfig for cluster {} in {}...".format( self.cluster_name, resource_group)) def do_fetch(): return clusters_client.managed_clusters.list_cluster_admin_credentials( resource_group, self.cluster_name) get_credentials_result = run_and_process_cloud_error(do_fetch) kube_config_content = get_credentials_result.kubeconfigs[ 0].value.decode("utf8") logging.info("Writing kubeconfig file...") kube_config_path = os.path.join(os.getcwd(), "kube_config") with open(kube_config_path, 'w') as f: f.write(kube_config_content) overrides = make_overrides( self.config, yaml.safe_load(kube_config_content), kube_config_path, acr_name=None if _is_none_or_blank(acr_attachment) else acr_attachment["name"], ) return [ overrides, { "kube_config_path": kube_config_path, "cluster": create_result.as_dict(), "acr_attachment": acr_attachment, "vnet_attachment": vnet_attachment } ]
def start(self): """ Build the create cluster request. """ connection_info = self.config.get("connectionInfo", {}) connection_info_secret = self.plugin_config.get("connectionInfo", {}) credentials = get_credentials_from_connection_info( connection_info, connection_info_secret) subscription_id = connection_info.get('subscriptionId', None) resource_group = self.config.get('resourceGroup', None) clusters_client = ContainerServiceClient(credentials, subscription_id) # Credit the cluster to DATAIKU if os.environ.get("DISABLE_AZURE_USAGE_ATTRIBUTION", "0") == "1": logging.info("Azure usage attribution is disabled") else: clusters_client.config.add_user_agent( 'pid-fd3813c7-273c-5eec-9221-77323f62a148') resource_group_name = self.config.get('resourceGroup', None) # TODO: Auto detection #if _is_none_or_blank(resource_group_name): # resource_group_name = vm_infos.get('resource_group_name', None) if _is_none_or_blank(resource_group_name): raise Exception( "A resource group to put the cluster in is required") location = self.config.get('location', None) # TODO: Auto detection #if _is_none_or_blank(location): # location = vm_infos.get('location', None) if _is_none_or_blank(location): raise Exception("A location to put the cluster in is required") # check that the cluster doesn't exist yet, otherwise azure will try to update it # and will almost always fail try: existing = clusters_client.managed_clusters.get( resource_group_name, self.cluster_name) if existing is not None: raise Exception( "A cluster with name %s in resource group %s already exists" % (self.cluster_name, resource_group_name)) except CloudError as e: logging.info("Cluster doesn't seem to exist yet") cluster_builder = ClusterBuilder(clusters_client) cluster_builder.with_name(self.cluster_name) cluster_builder.with_dns_prefix("{}-dns".format(self.cluster_name)) cluster_builder.with_resource_group(resource_group) cluster_builder.with_location(self.config.get("location", None)) cluster_builder.with_linux_profile() # default is None cluster_builder.with_network_profile( service_cidr=self.config.get("serviceCIDR", None), dns_service_ip=self.config.get("dnsServiceIP", None), load_balancer_sku=self.config.get("loadBalancerSku", None)) if self.config.get("useDistinctSPForCluster", False): cluster_sp = self.config.get("clusterServicePrincipal") else: cluster_sp = connection_info cluster_builder.with_cluster_sp( cluster_service_principal_connection_info=cluster_sp) cluster_builder.with_cluster_version( self.config.get("clusterVersion", None)) for idx, node_pool_conf in enumerate(self.config.get("nodePools", [])): node_pool_builder = cluster_builder.get_node_pool_builder() node_pool_builder.with_idx(idx) node_pool_builder.with_vm_size(node_pool_conf.get("vmSize", None)) vnet = node_pool_conf.get("vnet", None) subnet = node_pool_conf.get("subnet", None) node_pool_builder.with_network( inherit_from_host=node_pool_conf.get( "useSameNetworkAsDSSHost"), cluster_vnet=vnet, cluster_subnet=subnet, connection_info=connection_info, credentials=credentials, resource_group=resource_group) node_pool_builder.with_node_count( enable_autoscaling=node_pool_conf.get("autoScaling", False), num_nodes=node_pool_conf.get("numNodes", None), min_num_nodes=node_pool_conf.get("minNumNodes", None), max_num_nodes=node_pool_conf.get("maxNumNodes", None)) node_pool_builder.with_disk_size_gb( disk_size_gb=node_pool_conf.get("osDiskSizeGb", 0)) node_pool_builder.build() cluster_builder.with_node_pool( node_pool=node_pool_builder.agent_pool_profile) def do_creation(): 
cluster_create_op = cluster_builder.build() return cluster_create_op.result() create_result = run_and_process_cloud_error(do_creation) logging.info("Fetching kubeconfig for cluster {} in {}...".format( self.cluster_name, resource_group)) def do_fetch(): return clusters_client.managed_clusters.list_cluster_admin_credentials( resource_group, self.cluster_name) get_credentials_result = run_and_process_cloud_error(do_fetch) kube_config_content = get_credentials_result.kubeconfigs[ 0].value.decode("utf8") logging.info("Writing kubeconfig file...") kube_config_path = os.path.join(os.getcwd(), "kube_config") with open(kube_config_path, 'w') as f: f.write(kube_config_content) overrides = make_overrides(self.config, yaml.safe_load(kube_config_content), kube_config_path) return [ overrides, { "kube_config_path": kube_config_path, "cluster": create_result.as_dict() } ]
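# _is_none_or_blank() guards most optional settings in these plugins; a sketch of the
# assumed semantics (None, empty or whitespace-only strings, and empty dicts all count
# as "blank" -- the dict case is inferred from its use on connection_info above):
def _is_none_or_blank(x):
    if x is None:
        return True
    if isinstance(x, str):
        return len(x.strip()) == 0
    if isinstance(x, dict):
        return len(x) == 0
    return False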
def start(self):
    # build the create cluster request
    clusters = get_cluster_from_connection_info(self.config['connectionInfo'], self.plugin_config['connectionInfo'])
    cluster_builder = clusters.new_cluster_builder()
    cluster_builder.with_name(self.cluster_name)
    cluster_builder.with_version(self.config.get("clusterVersion", "latest"))
    cluster_builder.with_initial_node_count(self.config.get("numNodes", 3))
    cluster_builder.with_network(
        self.config.get("inheritFromDSSHost", True),
        self.config.get("network", "").strip(),
        self.config.get("subNetwork", "").strip())
    cluster_builder.with_vpc_native_settings(
        self.config.get("isVpcNative", None),
        self.config.get("podIpRange", ""),
        self.config.get("svcIpRange", ""))
    cluster_builder.with_labels(self.config.get("clusterLabels", {}))
    cluster_builder.with_legacy_auth(self.config.get("legacyAuth", False))
    cluster_builder.with_http_load_balancing(self.config.get("httpLoadBalancing", False))

    for node_pool in self.config.get('nodePools', []):
        node_pool_builder = cluster_builder.get_node_pool_builder()
        node_pool_builder.with_node_count(node_pool.get('numNodes', 3))
        node_pool_builder.use_gcr_io(node_pool.get('useGcrIo', False))
        node_pool_builder.with_oauth_scopes(node_pool.get('oauthScopes', None))
        node_pool_builder.with_machine_type(node_pool.get('machineType', None))
        node_pool_builder.with_disk_type(node_pool.get('diskType', None))
        node_pool_builder.with_disk_size_gb(node_pool.get('diskSizeGb', None))
        node_pool_builder.with_service_account(
            node_pool.get('serviceAccountType', None),
            node_pool.get('serviceAccount', None))
        node_pool_builder.with_auto_scaling(
            node_pool.get('numNodesAutoscaling', False),
            node_pool.get('minNumNodes', 2),
            node_pool.get('maxNumNodes', 5))
        node_pool_builder.with_gpu(
            node_pool.get('withGpu', False),
            node_pool.get('gpuType', None),
            node_pool.get('gpuCount', 1))
        node_pool_builder.with_nodepool_labels(node_pool.get('nodepoolLabels', {}))
        node_pool_builder.build()

    cluster_builder.with_settings_valve(self.config.get("creationSettingsValve", None))

    start_op = cluster_builder.build()

    # can take a few mins...
    logging.info("Waiting for cluster start")
    start_op.wait_done()
    logging.info("Cluster started")

    # cluster is ready, fetch its info from GKE
    cluster = clusters.get_cluster(self.cluster_name)
    cluster_info = cluster.get_info()

    # build the config file for kubectl
    # we don't add the context to the main config file, to not end up with an oversized config,
    # and because 2 different clusters could be concurrently editing the config file
    kube_config_path = os.path.join(os.getcwd(), 'kube_config')
    kube_config = cluster.get_kube_config()
    with open(kube_config_path, 'w') as f:
        yaml.safe_dump(kube_config, f, default_flow_style=False)

    # add the admin role so that we can do the managed kubernetes stuff for spark
    create_admin_binding(self.config.get("userName", None), kube_config_path)

    # Launch NVIDIA driver installer daemonset (will only apply on tainted gpu nodes)
    create_installer_daemonset(kube_config_path=kube_config_path)

    # collect and prepare the overrides so that DSS can know where and how to use the cluster
    overrides = make_overrides(self.config, kube_config, kube_config_path)
    return [
        overrides,
        {
            'kube_config_path': kube_config_path,
            'cluster': cluster_info
        }
    ]
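# create_installer_daemonset() is defined elsewhere; a sketch of the likely equivalent,
# assuming it applies Google's stock NVIDIA driver installer daemonset (the manifest URL
# is the one publicly documented for GKE GPU nodes):
import subprocess

NVIDIA_INSTALLER_MANIFEST = ("https://raw.githubusercontent.com/GoogleCloudPlatform/"
                             "container-engine-accelerators/master/"
                             "nvidia-driver-installer/cos/daemonset-preloaded.yaml")

def create_installer_daemonset(kube_config_path):
    # the daemonset's node selector makes it a no-op on nodes without GPUs
    subprocess.check_call([
        "kubectl", "--kubeconfig", kube_config_path,
        "apply", "-f", NVIDIA_INSTALLER_MANIFEST,
    ])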
def start(self):
    cluster_id = self.config['clusterId']

    # retrieve the cluster info from EKS
    # this will fail if the cluster doesn't exist, but the API message is enough
    connection_info = self.config.get('connectionInfo', {})
    args = ['get', 'cluster']
    args = args + ['--name', cluster_id]
    if _has_not_blank_property(connection_info, 'region'):
        args = args + ['--region', connection_info['region']]
    elif 'AWS_DEFAULT_REGION' in os.environ:
        args = args + ['--region', os.environ['AWS_DEFAULT_REGION']]
    args = args + ['-o', 'json']

    c = EksctlCommand(args, connection_info)
    cluster_info = json.loads(c.run_and_get_output())[0]

    kube_config_str = """
apiVersion: v1
clusters:
- cluster:
    certificate-authority-data: %s
    server: %s
  name: cluster-__CLUSTER_ID__
contexts:
- context:
    cluster: cluster-__CLUSTER_ID__
    user: user-__CLUSTER_ID__
  name: context-__CLUSTER_ID__
current-context: context-__CLUSTER_ID__
kind: Config
preferences: {}
users:
- name: user-__CLUSTER_ID__
  user:
    exec:
      apiVersion: client.authentication.k8s.io/v1alpha1
      args:
      - token
      - -i
      - %s
      command: aws-iam-authenticator
      env: null
""" % (cluster_info['CertificateAuthority']['Data'], cluster_info['Endpoint'], cluster_id)
    # cluster_id is as good as anything, since this kubeconfig won't be merged into another one
    kube_config_str = kube_config_str.replace("__CLUSTER_ID__", cluster_id)

    # build the config file for kubectl
    # we don't add the context to the main config file, to not end up with an oversized config,
    # and because 2 different clusters could be concurrently editing the config file
    kube_config_path = os.path.join(os.getcwd(), 'kube_config')
    with open(kube_config_path, 'w') as f:
        f.write(kube_config_str)

    if _has_not_blank_property(connection_info, 'accessKey') and _has_not_blank_property(connection_info, 'secretKey'):
        creds_in_env = {
            'AWS_ACCESS_KEY_ID': connection_info['accessKey'],
            'AWS_SECRET_ACCESS_KEY': connection_info['secretKey']
        }
        add_authenticator_env(kube_config_path, creds_in_env)

    kube_config = yaml.safe_load(kube_config_str)

    # collect and prepare the overrides so that DSS can know where and how to use the cluster
    overrides = make_overrides(self.config, kube_config, kube_config_path)
    return [
        overrides,
        {
            'kube_config_path': kube_config_path,
            'cluster': cluster_info
        }
    ]
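# add_authenticator_env() patches the kubeconfig written above; a sketch assuming it
# injects the AWS credentials into the exec section of each user entry, using the
# standard kubeconfig exec `env` format (a list of {name, value} pairs):
import yaml

def add_authenticator_env(kube_config_path, creds_in_env):
    with open(kube_config_path) as f:
        kube_config = yaml.safe_load(f)
    for user in kube_config.get("users", []):
        exec_section = user.get("user", {}).get("exec")
        if exec_section is not None:
            # aws-iam-authenticator picks these up when kubectl invokes it
            exec_section["env"] = [{"name": k, "value": v} for k, v in creds_in_env.items()]
    with open(kube_config_path, "w") as f:
        yaml.safe_dump(kube_config, f, default_flow_style=False)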