def create_container(name, image, pr_number, user_id, resource_group, port=None, location='westus'):
    """Create an Azure Container Instance group running a single container.

    The container gets a per-instance secret token, the PR number and port as
    environment variables, and a read-only Azure Files certificate mount.

    Args:
        name: Container (and container group) name.
        image: Container image reference to run.
        pr_number: Pull-request number; stored as a tag and an env var.
        user_id: Owner id; stored as a tag on the container group.
        resource_group: Azure resource group to deploy into.
        port: TCP port to expose; a random port in [1000, 65000] when None.
        location: Azure region for the container group.

    Returns:
        The result of ``container_groups.create_or_update``.
    """
    from azure.mgmt.containerinstance.models import (
        ContainerGroup, Container, ContainerPort, Port, IpAddress,
        ResourceRequirements, ResourceRequests,
        ContainerGroupNetworkProtocol, OperatingSystemTypes,
        EnvironmentVariable, Volume, AzureFileVolume, VolumeMount)
    from random import randint
    import secrets

    pr_number = str(pr_number)
    if port is None:
        port = randint(1000, 65000)
    # `secrets` (not `random`) because the token is security-sensitive.
    instance_token = secrets.token_urlsafe(256)
    environment_variables = [
        EnvironmentVariable('PR_NUM', pr_number),
        # BUG FIX: environment variable values must be strings; `port` may be
        # an int (either the caller's value or randint's result).
        EnvironmentVariable('PORT', str(port)),
        EnvironmentVariable('INSTANCE_TOKEN', instance_token),
    ]
    tags = {'userId': user_id, 'prNumber': pr_number}

    container_resource_requests = ResourceRequests(memory_in_gb='1.5', cpu='1')
    container_resource_requirements = ResourceRequirements(
        requests=container_resource_requests)
    container = Container(
        name=name,
        image=image,
        resources=container_resource_requirements,
        ports=[ContainerPort(port=port)],
        environment_variables=environment_variables,
        volume_mounts=[VolumeMount('volume1', '/cert', read_only=True)])

    cgroup_ip_address = IpAddress(
        ports=[Port(protocol=ContainerGroupNetworkProtocol.tcp, port=port)])
    cgroup_os_type = OperatingSystemTypes.linux
    # Certificates come from an Azure Files share; SHARE_* are module globals.
    afv = AzureFileVolume(SHARE_NAME, SHARE_STORAGE_ACCOUNT_NAME,
                          read_only=True,
                          storage_account_key=SHARE_STORAGE_ACCOUNT_KEY)
    cgroup = ContainerGroup(location=location,
                            containers=[container],
                            os_type=cgroup_os_type,
                            ip_address=cgroup_ip_address,
                            tags=tags,
                            volumes=[Volume('volume1', afv)])
    return get_aci_client().container_groups.create_or_update(
        resource_group, name, cgroup)
def create_env_vars(msg, database_uri, container_name):
    """Build the standard environment-variable list for a worker container."""
    pairs = (
        ("MESSAGE", msg),
        ("DATABASE_URI", database_uri),
        ("CONTAINER_NAME", container_name),
    )
    return [EnvironmentVariable(name=key, value=val) for key, val in pairs]
def main():
    """Poll the service-bus queue forever, launching one container per message.

    BUG FIX: the original ``except KeyboardInterrupt: pass`` swallowed Ctrl-C
    inside the ``while True`` loop, making the worker impossible to stop
    gracefully; it now breaks out of the loop instead.
    """
    sys.stdout.write("Starting Work Cycle...\n")  # same as print
    sys.stdout.flush()
    while True:
        try:
            msg = bus_service.receive_queue_message(
                queueConfig['queue_name'], peek_lock=False)
            # receive_queue_message can return a message with an empty body
            # (e.g. on timeout); only act on real payloads.
            if msg.body is not None:
                work = msg.body.decode("utf-8")
                container_name = generate_container_name()
                env_vars = [
                    EnvironmentVariable(name="MESSAGE", value=work),
                    EnvironmentVariable(name="CONTAINER_NAME", value=container_name)
                ]
                sys.stdout.write("Creating container: " + container_name +
                                 " with work: " + work + '\n')  # same as print
                sys.stdout.flush()
                create_container_group(ACI_CONFIG['resourceGroup'],
                                       container_name,
                                       ACI_CONFIG['location'],
                                       IMAGE, env_vars)
        except KeyboardInterrupt:
            # Exit the work cycle on Ctrl-C instead of silently continuing.
            break
def create(self):
    """Build the Container spec (1 GB RAM / 1 CPU) for this instance.

    Logs the creation event and returns an azure Container model; nothing
    is deployed here.
    """
    requirements = ResourceRequirements(
        requests=ResourceRequests(memory_in_gb=1, cpu=1.0))
    endpoint = EnvironmentVariable(name="ENDPOINTURL",
                                   value=self.config.endpoint_url)
    self.logger.container_created()
    return Container(name=self.config.container_group_name,
                     image=self.config.container_image,
                     resources=requirements,
                     environment_variables=[endpoint])
def create_container(azure_auth, registry_credentials, container_info, env_vars):
    """Create a new container group."""
    _LOGGER.info("Creating container group '%s'...", container_info["groupName"])

    # Private-registry credentials used to pull the image.
    image_registry_credentials = ImageRegistryCredential(
        server=registry_credentials["server"],
        username=registry_credentials["username"],
        password=registry_credentials["password"],
    )

    # Every variable is passed as a secure value so Azure keeps it hidden.
    environment_variables = [
        EnvironmentVariable(name=item["name"], secure_value=item["value"])
        for item in env_vars
    ]

    # 1 GB RAM / 1 CPU, run-once (restart policy 'Never').
    requirements = ResourceRequirements(
        requests=ResourceRequests(memory_in_gb=1, cpu=1.0))
    container = Container(
        name=container_info["groupName"],
        image=container_info["image"],
        resources=requirements,
        environment_variables=environment_variables,
    )
    group = ContainerGroup(
        location=container_info["region"],
        containers=[container],
        os_type=OperatingSystemTypes.linux,
        image_registry_credentials=[image_registry_credentials],
        restart_policy=ContainerGroupRestartPolicy.never,
    )

    aci_client = get_container_client(azure_auth)
    aci_client.container_groups.create_or_update(
        container_info["resourceGroup"], container_info["groupName"], group)
    _LOGGER.info("Created Container group '%s'", container_info["groupName"])
def create_spark_worker_container(*args, **kwargs):
    """Create a Spark worker container for a cluster node."""
    # TODO: change resources
    requirements = ResourceRequirements(
        requests=ResourceRequests(memory_in_gb=14, cpu=4.0))
    worker_name = "{0}-worker-{1}".format(kwargs.get('cluster_id'),
                                          kwargs.get("worker_number"))
    master_env = EnvironmentVariable(name="MASTER_IP",
                                     value=kwargs.get('master_ip'))
    # 7077: Spark master RPC; 4040: Spark application UI.
    worker_ports = [ContainerPort(port=p, protocol='TCP')
                    for p in (7077, 4040)]
    return create_container(container_name=worker_name,
                            image=kwargs.get('image') + "-worker",
                            resources=requirements,
                            environment_variables=[master_env],
                            ports=worker_ports,
                            volume_mounts=kwargs.get('volume_mounts'))
def execute(self, context: dict) -> int:
    """Start an Azure Container Instance group and block until it finishes.

    Args:
        context: Airflow task context (unused directly; name may have been
            templated before this runs).

    Returns:
        The container's exit code (0 on success).

    Raises:
        AirflowException: if the group already exists (when
            ``fail_if_exists``), cannot be started, or exits non-zero.
    """
    # Check name again in case it was templated.
    self._check_name(self.name)

    self._ci_hook = AzureContainerInstanceHook(self.ci_conn_id)

    if self.fail_if_exists:
        self.log.info("Testing if container group already exists")
        if self._ci_hook.exists(self.resource_group, self.name):
            raise AirflowException("Container group exists")

    if self.registry_conn_id:
        registry_hook = AzureContainerRegistryHook(self.registry_conn_id)
        image_registry_credentials: Optional[list] = [
            registry_hook.connection,
        ]
    else:
        image_registry_credentials = None

    environment_variables = []
    for key, value in self.environment_variables.items():
        # Secured variables are hidden from the ACI API and the portal.
        if key in self.secured_variables:
            e = EnvironmentVariable(name=key, secure_value=value)
        else:
            e = EnvironmentVariable(name=key, value=value)
        environment_variables.append(e)

    # FIX: List[Union[Volume, Volume]] / List[Union[VolumeMount, VolumeMount]]
    # were degenerate unions of one type each.
    volumes: List[Volume] = []
    volume_mounts: List[VolumeMount] = []
    for conn_id, account_name, share_name, mount_path, read_only in self.volumes:
        hook = AzureContainerVolumeHook(conn_id)
        mount_name = "mount-%d" % len(volumes)
        volumes.append(
            hook.get_file_volume(mount_name, share_name, account_name,
                                 read_only))
        volume_mounts.append(
            VolumeMount(name=mount_name, mount_path=mount_path,
                        read_only=read_only))

    exit_code = 1
    try:
        self.log.info("Starting container group with %.1f cpu %.1f mem",
                      self.cpu, self.memory_in_gb)
        if self.gpu:
            self.log.info("GPU count: %.1f, GPU SKU: %s", self.gpu.count,
                          self.gpu.sku)

        resources = ResourceRequirements(requests=ResourceRequests(
            memory_in_gb=self.memory_in_gb, cpu=self.cpu, gpu=self.gpu))

        # A public IP with no ports would be rejected; default to port 80.
        if self.ip_address and not self.ports:
            self.ports = [ContainerPort(port=80)]
            self.log.info("Default port set. Container will listen on port 80")

        container = Container(
            name=self.name,
            image=self.image,
            resources=resources,
            command=self.command,
            environment_variables=environment_variables,
            volume_mounts=volume_mounts,
            ports=self.ports,
        )

        container_group = ContainerGroup(
            location=self.region,
            containers=[
                container,
            ],
            image_registry_credentials=image_registry_credentials,
            volumes=volumes,
            restart_policy=self.restart_policy,
            os_type=self.os_type,
            tags=self.tags,
            ip_address=self.ip_address,
        )

        self._ci_hook.create_or_update(self.resource_group, self.name,
                                       container_group)

        self.log.info("Container group started %s/%s", self.resource_group,
                      self.name)

        exit_code = self._monitor_logging(self.resource_group, self.name)

        self.log.info("Container had exit code: %s", exit_code)
        if exit_code != 0:
            raise AirflowException(
                f"Container had a non-zero exit code, {exit_code}")
        return exit_code
    except CloudError as err:
        self.log.exception("Could not start container group")
        # FIX: chain the original CloudError for debuggability.
        raise AirflowException("Could not start container group") from err
    finally:
        # Clean up on success, or on failure when the operator is configured
        # to remove the group on error.
        if exit_code == 0 or self.remove_on_error:
            self.on_kill()
def execute(self, context):
    """Start an Azure Container Instance group and wait for completion.

    Builds the container spec from operator attributes, deploys it with a
    'Never' restart policy, monitors its logs until it terminates, and —
    on success or when ``remove_on_error`` is set — deletes the group in
    the ``finally`` block.

    :param context: Airflow task context (unused directly).
    :raises AirflowException: if the group already exists (when
        ``fail_if_exists``), cannot be started, or exits non-zero.
    """
    ci_hook = AzureContainerInstanceHook(self.ci_conn_id)
    if self.fail_if_exists:
        self.log.info("Testing if container group already exists")
        if ci_hook.exists(self.resource_group, self.name):
            raise AirflowException("Container group exists")
    if self.registry_conn_id:
        # Credentials for a private registry come from a separate hook.
        registry_hook = AzureContainerRegistryHook(self.registry_conn_id)
        image_registry_credentials = [
            registry_hook.connection,
        ]
    else:
        image_registry_credentials = None
    environment_variables = []
    for key, value in self.environment_variables.items():
        environment_variables.append(EnvironmentVariable(key, value))
    volumes = []
    volume_mounts = []
    # self.volumes entries are 5-tuples: (conn_id, account, share, path, ro).
    for conn_id, account_name, share_name, mount_path, read_only in self.volumes:
        hook = AzureContainerVolumeHook(conn_id)
        # Mount names are generated sequentially: mount-0, mount-1, ...
        mount_name = "mount-%d" % len(volumes)
        volumes.append(
            hook.get_file_volume(mount_name, share_name, account_name,
                                 read_only))
        volume_mounts.append(VolumeMount(mount_name, mount_path, read_only))
    # Default to failure so the finally-block cleanup only runs on success
    # (or when remove_on_error is set).
    exit_code = 1
    try:
        self.log.info("Starting container group with %.1f cpu %.1f mem",
                      self.cpu, self.memory_in_gb)
        resources = ResourceRequirements(requests=ResourceRequests(
            memory_in_gb=self.memory_in_gb, cpu=self.cpu))
        container = Container(name=self.name,
                              image=self.image,
                              resources=resources,
                              command=self.command,
                              environment_variables=environment_variables,
                              volume_mounts=volume_mounts)
        container_group = ContainerGroup(
            location=self.region,
            containers=[
                container,
            ],
            image_registry_credentials=image_registry_credentials,
            volumes=volumes,
            restart_policy='Never',
            os_type='Linux')
        ci_hook.create_or_update(self.resource_group, self.name,
                                 container_group)
        self.log.info("Container group started %s/%s", self.resource_group,
                      self.name)
        # Blocks until the container terminates, streaming its logs.
        exit_code = self._monitor_logging(ci_hook, self.resource_group,
                                          self.name)
        self.log.info("Container had exit code: %s", exit_code)
        if exit_code != 0:
            raise AirflowException(
                "Container had a non-zero exit code, %s" % exit_code)
    except CloudError:
        self.log.exception("Could not start container group")
        raise AirflowException("Could not start container group")
    finally:
        if exit_code == 0 or self.remove_on_error:
            self.log.info("Deleting container group")
            # Best-effort cleanup: never mask the original outcome.
            try:
                ci_hook.delete(self.resource_group, self.name)
            except Exception:
                self.log.exception("Could not delete container group")
def _start_container(self, resource_handler):
    """Provision and start an Azure Container Instance for this node.

    Ensures the resource group exists, builds resource requests (optionally
    with GPU), ports and environment, then — depending on
    ``self.res['network_type']`` — configures either a public IP with a DNS
    label or a private vnet/subnet/network-profile, and finally creates the
    container group.

    :param resource_handler: handler whose ``name`` is used in error messages.
    :returns: the generated container group name.
    :raises NodeCreationError: if the network type is neither Public nor
        Private.
    """
    log.debug('Starting Azure ACI')
    location = self.res['location'].lower()
    # Idempotent: creates the resource group or updates it in place.
    self.resource_client.resource_groups.create_or_update(
        self.res['resource_group'], {'location': self.res['location']})
    container_group_name = unique_vmname(self.node_def)
    network_type = self.res['network_type']
    network_profile = None
    if 'gpu_type' in self.res:
        # gpu_count defaults to 1 when only gpu_type is given.
        count = self.res['gpu_count'] if 'gpu_count' in self.res else 1
        gpu = GpuResource(count=count, sku=self.res['gpu_type'])
        container_resource_requests = ResourceRequests(
            memory_in_gb=self.res['memory'],
            cpu=self.res['cpu_cores'],
            gpu=gpu)
    else:
        container_resource_requests = ResourceRequests(
            memory_in_gb=self.res['memory'], cpu=self.res['cpu_cores'])
    container_resource_requirements = ResourceRequirements(
        requests=container_resource_requests)
    ports = []
    ipports = []
    # Port entries may be ints or "port/protocol" strings; protocol
    # defaults to TCP.
    for porte in self.res.get('ports', []):
        port = porte
        protocol = 'TCP'
        if isinstance(porte, str) and '/' in porte:
            (port, protocol) = port.split('/')
        port = int(port)
        ports.append(ContainerPort(port=port, protocol=protocol))
        ipports.append(Port(protocol=protocol, port=port))
    environment = []
    if network_type.lower() == 'public':
        # Expose the allocated FQDN to the container via an env var.
        pubip_var = EnvironmentVariable(
            name='_OCCOPUS_ALLOCATED_FQDN',
            value='%s.%s.azurecontainer.io' % (container_group_name,
                                               location))
        environment.append(pubip_var)
    # self.env holds "KEY=VALUE" strings; malformed entries are skipped.
    for env in self.env:
        edata = env.split('=', 1)
        if len(edata) != 2:
            continue
        env_var = EnvironmentVariable(name=edata[0], value=edata[1])
        environment.append(env_var)
    container = Container(
        name=container_group_name,
        image=self.res['image'],
        resources=container_resource_requirements,
        ports=ports,
        command=self.command if self.command is not None else None,
        environment_variables=environment)
    if network_type.lower() == 'public':
        group_ip_address = IpAddress(ports=ipports,
                                     dns_name_label=container_group_name,
                                     type='Public')
        self.vnet_name = None
    elif network_type.lower() == 'private':
        # Use caller-supplied vnet/subnet names when present; otherwise
        # derive unique names and create the resources below.
        vnet_name = unique_vmname(self.node_def) + '-vnet' if self.res.get(
            'vnet_name', None) == None else self.res['vnet_name']
        self.vnet_name = vnet_name
        subnet_name = unique_vmname(
            self.node_def) + '-subnet' if self.res.get(
                'subnet_name', None) == None else self.res['subnet_name']
        network_profile_name = unique_vmname(self.node_def) + '-netprofile'
        if self.res.get('vnet_name', None) == None:
            log.debug('Creating vnet')
            async_vnet_creation = self.network_client.virtual_networks.create_or_update(
                self.res['resource_group'], vnet_name, {
                    'location': location,
                    'address_space': {
                        'address_prefixes': ['10.0.0.0/16']
                    }
                })
            async_vnet_creation.wait()
            # Track what we created so teardown can remove it later.
            self.created_resources['virtual_network'] = vnet_name
            log.debug('Created vnet')
        if self.res.get('subnet_name', None) == None:
            # Create Subnet
            log.debug('Creating Subnet')
            # The subnet must be delegated to ACI before container groups
            # can be placed in it.
            aci_delegation_service_name = "Microsoft.ContainerInstance/containerGroups"
            aci_delegation = Delegation(
                name=aci_delegation_service_name,
                service_name=aci_delegation_service_name)
            subnet = Subnet(name=subnet_name,
                            location=location,
                            address_prefix='10.0.0.0/24',
                            delegations=[aci_delegation])
            subnet_info = self.network_client.subnets.create_or_update(
                self.res['resource_group'], vnet_name, subnet_name,
                subnet).result()
            self.created_resources['subnet'] = subnet_name
            log.debug('Creatied Subnet')
        else:
            subnet_info = self.network_client.subnets.get(
                self.res['resource_group'], vnet_name, subnet_name)
        default_network_profile_name = "aci-network-profile-{}-{}".format(
            vnet_name, subnet_name)
        network_profile_ops = self.network_client.network_profiles
        network_profile = NetworkProfile(
            name=default_network_profile_name,
            location=location,
            container_network_interface_configurations=[
                ContainerNetworkInterfaceConfiguration(
                    name="eth0",
                    ip_configurations=[
                        IPConfigurationProfile(name="ipconfigprofile",
                                               subnet=subnet_info)
                    ])
            ])
        network_profile = network_profile_ops.create_or_update(
            self.res['resource_group'], network_profile_name,
            network_profile).result()
        group_ip_address = IpAddress(ports=ipports, type='Private')
    else:
        errormsg = '[{0}] Network type "{1}" is not supported. Please use either "Public" or "Private"'.format(
            resource_handler.name, network_type)
        log.debug(errormsg)
        raise NodeCreationError(None, errormsg)
    cg_network_profile = None
    if network_profile:
        cg_network_profile = ContainerGroupNetworkProfile(
            id=network_profile.id)
        self.created_resources['network_profile'] = network_profile_name
    group = ContainerGroup(location=location,
                           containers=[container],
                           os_type=self.res['os_type'],
                           ip_address=group_ip_address,
                           network_profile=cg_network_profile)
    # Create the container group
    self.aci_client.container_groups.create_or_update(
        self.res['resource_group'], container_group_name, group)
    return container_group_name
def run_task_based_container(aci_client, resource_group, container_group_name,
                             container_image_name, start_command_line=None):
    """Creates a container group with a single task-based container who's
    restart policy is 'Never'. If specified, the container runs a custom
    command line at startup.

    Arguments:
        aci_client {azure.mgmt.containerinstance.ContainerInstanceManagementClient}
                    -- An authenticated container instance management client.
        resource_group {azure.mgmt.resource.resources.models.ResourceGroup}
                    -- The resource group in which to create the container group.
        container_group_name {str}
                    -- The name of the container group to create.
        container_image_name {str}
                    -- The container image name and tag, for example:
                       microsoft/aci-helloworld:latest
        start_command_line {str}
                    -- The command line that should be executed when the
                       container starts. This value can be None.
    """
    # If a start command wasn't specified, use a default
    if start_command_line is None:
        start_command_line = "python wordcount.py http://shakespeare.mit.edu/romeo_juliet/full.html"

    # Configure some environment variables in the container which the
    # wordcount.py or other script can read to modify its behavior.
    env_var_1 = EnvironmentVariable(name='NumWords', value='5')
    env_var_2 = EnvironmentVariable(name='MinLength', value='8')

    print("Creating container group '{0}' with start command '{1}'".format(
        container_group_name, start_command_line))

    # Configure the container
    container_resource_requests = ResourceRequests(memory_in_gb=1, cpu=1.0)
    container_resource_requirements = ResourceRequirements(
        requests=container_resource_requests)
    container = Container(name=container_group_name,
                          image=container_image_name,
                          resources=container_resource_requirements,
                          command=start_command_line.split(),
                          environment_variables=[env_var_1, env_var_2])

    # Configure the container group
    group = ContainerGroup(location=resource_group.location,
                           containers=[container],
                           os_type=OperatingSystemTypes.linux,
                           restart_policy=ContainerGroupRestartPolicy.never)

    # Create the container group
    result = aci_client.container_groups.create_or_update(
        resource_group.name, container_group_name, group)

    # Wait for the container create operation to complete. The operation is
    # "done" when the container group provisioning state is one of:
    # Succeeded, Canceled, Failed
    while not result.done():
        sys.stdout.write('.')
        time.sleep(1)

    # Get the provisioning state of the container group.
    container_group = aci_client.container_groups.get(resource_group.name,
                                                      container_group_name)
    if str(container_group.provisioning_state).lower() == 'succeeded':
        print("\nCreation of container group '{}' succeeded.".format(
            container_group_name))
    else:
        # BUG FIX: the adjacent string literals previously joined as
        # "Provisioning stateis:" — a space was missing.
        print("\nCreation of container group '{}' failed. Provisioning state "
              "is: {}".format(container_group_name,
                              container_group.provisioning_state))

    # Get the logs for the container
    logs = aci_client.container.list_logs(resource_group.name,
                                          container_group_name,
                                          container.name)
    print("Logs for container '{0}':".format(container_group_name))
    print("{0}".format(logs.content))
def run_task_based_container(self, container_image_name: str,
                             command: list = None,
                             memory_in_gb: int = 1,
                             cpu: float = 1.0,
                             gpu_count: int = 0,
                             gpu_type: str = 'K80',
                             envs: dict = None,
                             timeout: int = 600,
                             afs_volumes: list = None,
                             volume_mount_path: str = "/input",
                             afs_name: str = None,
                             afs_key: str = None,
                             afs_share: str = None,
                             afs_mount_subpath: str = '',
                             image_registry_server: str = None,
                             image_registry_username: str = None,
                             image_registry_pwd: str = None,
                             tag: str = ""):
    """Creates a container group with a single task-based container who's
    restart policy is 'Never'. If specified, the container runs a custom
    command line at startup.

    BUG FIXES versus the original:
    - ``envs={}`` and ``afs_volumes=[]`` were mutable default arguments, and
      ``envs['DATA'] = ...`` mutated the shared default dict across calls;
      both now default to None and are copied before use.
    - ``logs`` raised NameError in the return statement when log retrieval
      failed; it is now initialized to None.
    - The failure message previously rendered "Provisioning stateis:" due to
      a missing space between adjacent string literals.

    Arguments:
        container_image_name {str} -- The container image name and tag, for
            example: microsoft/aci-helloworld:latest
        command {list} -- The command line that should be executed when the
            container starts. This value can be None.

    Returns:
        (container_group_name, logs) on completion; None when provisioning
        failed and the group was deleted.
    """
    container_group_name = str(id_generator()) + tag
    # Copy so neither the caller's objects nor any defaults are mutated.
    envs = dict(envs) if envs else {}
    afs_volumes = list(afs_volumes) if afs_volumes else []
    envs['DATA'] = str(Path(volume_mount_path) / afs_mount_subpath)
    if command is not None:
        cloudhunky_logger.info(
            "Creating container group '{0}' with start command '{1}'".format(
                container_group_name, command))

    gpu = None
    if gpu_count > 0:
        gpu = GpuResource(count=gpu_count, sku=gpu_type)
    container_resource_requests = ResourceRequests(
        memory_in_gb=memory_in_gb, cpu=cpu, gpu=gpu)
    container_resource_requirements = ResourceRequirements(
        requests=container_resource_requests)

    environment_variables = [
        EnvironmentVariable(name=env, value=val)
        for env, val in envs.items()
    ]

    volume_mounts = None
    volumes = None
    if afs_mount_subpath is not None:
        if len(afs_volumes) == 0:
            # Looks like the client is using the deprecated single-share
            # AFS mount method; synthesize the volume list from it.
            afs_volumes = [
                {
                    'name': 'azure-volume',
                    'mount_path': volume_mount_path,
                    'afs_name': afs_name,
                    'afs_key': afs_key,
                    'afs_share': afs_share,
                },
            ]
        volumes, volume_mounts = self.prepare_azure_volumes(afs_volumes)

    image_registry_credentials = None
    if image_registry_username is not None or image_registry_pwd is not None:
        # Either both credential halves plus the server, or none at all.
        if image_registry_username is None:
            raise ValueError("insert image_registry_username")
        if image_registry_pwd is None:
            raise ValueError("insert image_registry_pwd")
        if image_registry_server is None:
            raise ValueError("insert image_registry_server")
        image_registry_credentials = [
            ImageRegistryCredential(server=image_registry_server,
                                    username=image_registry_username,
                                    password=image_registry_pwd)
        ]

    container = Container(name=container_group_name,
                          image=container_image_name,
                          resources=container_resource_requirements,
                          command=command,
                          environment_variables=environment_variables,
                          volume_mounts=volume_mounts,
                          ports=None)
    group = ContainerGroup(
        location=self.resource_group.location,
        containers=[container],
        os_type=OperatingSystemTypes.linux,
        restart_policy=ContainerGroupRestartPolicy.never,
        volumes=volumes,
        image_registry_credentials=image_registry_credentials)

    # Retry the deployment once on CloudError (e.g. transient quota issues).
    for i in range(2):
        try:
            result = self.aci_client.container_groups.create_or_update(
                self.resource_group.name, container_group_name, group)
        except CloudError as exp:
            cloudhunky_logger.exception(exp)
            cloudhunky_logger.info("Try to reduce the required resources")
            if i == 1:
                raise
        else:
            cloudhunky_logger.info("Azure is provisioning container group")
            break

    # Wait for the container create operation to complete. The operation is
    # "done" when the container group provisioning state is one of:
    # Succeeded, Canceled, Failed
    cloudhunky_logger.info("Container Group is pending")
    while not result.done():
        time.sleep(30)

    try:
        container_group = self.aci_client.container_groups.get(
            self.resource_group.name, container_group_name)
        if str(container_group.provisioning_state).lower() == 'succeeded':
            cloudhunky_logger.info(
                "Creation of container group '{}' succeeded.".format(
                    container_group_name))
        else:
            cloudhunky_logger.warning(
                "\nCreation of container group '{}' failed. Provisioning "
                "state is: {}. Deleting the container group.".format(
                    container_group_name,
                    container_group.provisioning_state))
            self.aci_client.container_groups.delete(
                self.resource_group.name, container_group_name)
            return
    except Exception as exp:
        cloudhunky_logger.exception(exp)

    # Poll until the container terminates or the timeout elapses.
    try:
        start = time.time()
        while timeout > (time.time() - start):
            container_group = self.aci_client.container_groups.get(
                self.resource_group.name, container_group_name)
            container_state = container_group.containers[
                0].instance_view.current_state.state
            if container_state.lower() == "terminated":
                cloudhunky_logger.info("Container terminated")
                break
            time.sleep(1)
        if timeout < (time.time() - start):
            cloudhunky_logger.warning(f"Timeout {timeout} was exceeded!")
    except Exception as exp:
        cloudhunky_logger.exception(exp)

    # Best-effort: fetch logs, then delete the finished group.
    logs = None
    try:
        logs = self.aci_client.container.list_logs(
            self.resource_group.name, container_group_name, container.name)
        self.aci_client.container_groups.delete(self.resource_group.name,
                                                container_group_name)
    except Exception as exp:
        cloudhunky_logger.exception(exp)
    return container_group_name, logs
try: aci_config = yaml.safe_load(f) aci_config['apiVersion'] = aci_config['apiVersion'].strftime( '%Y-%m-%d') except yaml.YAMLError as e: raise e logger.info('Defining container group') containers = [] for c in aci_config['properties']['containers']: config = [ EnvironmentVariable(name=p['name'], value=p.get('value'), secure_value=p.get('secureValue')) for p in c['properties']['environmentVariables'] ] resources = ResourceRequests( cpu=c['properties']['resources']['requests']['cpu'], memory_in_gb=c['properties']['resources']['requests'] ['memoryInGB']) unit = Container( name=c['name'], image=c['properties']['image'], resources=ResourceRequirements( requests=resources, limits=None, # ResourceLimits()