def _start_instance(self, image_uuid, flavor_uuid, block_devices, nova_args, meta):
    """Boot a Nova server for this worker and block until it leaves BUILD.

    Any existing instance is stopped first via ``self._stop_instance``.  The
    new server is then polled every ``self._poll_resolution`` seconds.

    Returns ``[instance_id, image_uuid, 'HH:MM:SS']`` once the server status
    is ACTIVE.  Raises ``LatentWorkerFailedToSubstantiate`` when the server
    cannot be fetched any more; any other final status is handed to
    ``self.failed_to_start``.
    """
    # ensure existing, potentially duplicated, workers are stopped
    self._stop_instance(None, True)

    # then try to start new one
    boot_args = [self.workername, image_uuid, flavor_uuid]
    boot_kwargs = dict(meta=meta, block_device_mapping_v2=block_devices, **nova_args)
    instance = self.novaclient.servers.create(*boot_args, **boot_kwargs)
    # There is an issue when using sessions that the status is not
    # available on the first try. Trying again will work fine. Fetch the
    # instance to avoid that.
    try:
        instance = self.novaclient.servers.get(instance.id)
    except NotFound as e:
        # The message is built with an f-string: log.msg() does not
        # interpolate '{...}' placeholders from keyword arguments, so the
        # previous call logged the raw template text.
        log.msg(
            f'{self.__class__.__name__} {self.workername} instance {instance.id} '
            f'({instance.name}) never found')
        raise LatentWorkerFailedToSubstantiate(instance.id, BUILD) from e

    self.instance = instance
    log.msg(
        f'{self.__class__.__name__} {self.workername} starting instance {instance.id} '
        f'(image {image_uuid})')

    duration = 0
    interval = self._poll_resolution
    while instance.status.startswith(BUILD):
        time.sleep(interval)
        duration += interval
        # Progress is only reported when the waited time is a whole minute.
        if duration % 60 == 0:
            log.msg(
                f'{self.__class__.__name__} {self.workername} has waited {duration // 60} '
                f'minutes for instance {instance.id}')
        try:
            instance = self.novaclient.servers.get(instance.id)
        except NotFound as e:
            # `instance` still refers to the last successfully fetched server.
            log.msg(
                f'{self.__class__.__name__} {self.workername} instance {instance.id} '
                f'({instance.name}) went missing')
            raise LatentWorkerFailedToSubstantiate(instance.id, instance.status) from e

    if instance.status == ACTIVE:
        minutes = duration // 60
        seconds = duration % 60
        log.msg(
            f'{self.__class__.__name__} {self.workername} instance {instance.id} '
            f'({instance.name}) started in about {minutes} minutes {seconds} seconds'
        )
        return [
            instance.id,
            image_uuid,
            f'{minutes // 60:02d}:{minutes % 60:02d}:{seconds:02d}',
        ]
    else:
        self.failed_to_start(instance.id, instance.status)
def _thd_wait_for_request(self, reservation):
    """Poll a spot instance request until its status is no longer pending.

    Sleeps ``self._poll_resolution`` seconds between polls and tolerates the
    request not being known to EC2 yet.

    Returns ``(request, True)`` when the request is fulfilled.  For every
    other final status the request is cancelled and
    ``LatentWorkerFailedToSubstantiate`` is raised.
    """
    waited = 0
    step = self._poll_resolution
    while True:
        # Sometimes it can take a second or so for the spot request to be
        # ready.  If it isn't ready, you will get a "Spot instance request
        # ID 'sir-abcd1234' does not exist" exception.
        try:
            response = self.ec2.meta.client.describe_spot_instance_requests(
                SpotInstanceRequestIds=[reservation['SpotInstanceRequestId']])
        except ClientError as err:
            if 'InvalidSpotInstanceRequestID.NotFound' not in str(err):
                raise
            response = None

        if response is not None:
            request = response['SpotInstanceRequests'][0]
            request_status = request['Status']['Code']
            if request_status not in SPOT_REQUEST_PENDING_STATES:
                break

        time.sleep(step)
        waited += step
        if waited % 10 == 0:
            log.msg(
                '{} {} has waited {} seconds for spot request {}'.format(
                    self.__class__.__name__, self.workername, waited,
                    reservation['SpotInstanceRequestId']))

    if request_status == FULFILLED:
        minutes = waited // 60
        seconds = waited % 60
        log.msg(
            '{} {} spot request {} fulfilled in about {} minutes {} seconds'
            .format(self.__class__.__name__, self.workername,
                    request['SpotInstanceRequestId'], minutes, seconds))
        return request, True

    if request_status == PRICE_TOO_LOW:
        self.ec2.meta.client.cancel_spot_instance_requests(
            SpotInstanceRequestIds=[request['SpotInstanceRequestId']])
        log.msg('{} {} spot request rejected, spot price too low'.format(
            self.__class__.__name__, self.workername))
    else:
        log.msg('{} {} failed to fulfill spot request {} with status {}'.
                format(self.__class__.__name__, self.workername,
                       request['SpotInstanceRequestId'], request_status))
        # try to cancel, just for good measure
        self.ec2.meta.client.cancel_spot_instance_requests(
            SpotInstanceRequestIds=[request['SpotInstanceRequestId']])

    raise LatentWorkerFailedToSubstantiate(
        request['SpotInstanceRequestId'], request_status)
def _thd_start_instance(self, image, volumes):
    """Create and start the docker container for this worker.

    When ``image`` is None a per-worker image name is generated.  If the
    image is not known to exist and ``self.dockerfile`` is set, the image is
    built from that dockerfile.

    Returns ``[container_id, image]``.  Raises
    ``LatentWorkerFailedToSubstantiate`` when the image is missing after the
    optional build, or when the created container has no ``Id``.
    """
    docker_client = client.Client(**self.client_args)

    if image is None:
        image_known = False
        image = '%s_%s_image' % (self.workername, id(self))
    else:
        image_known = self._image_exists(docker_client, image)

    if self.dockerfile is not None and not image_known:
        log.msg("Image '%s' not found, building it from scratch" % image)
        dockerfile_obj = BytesIO(self.dockerfile.encode('utf-8'))
        for line in docker_client.build(fileobj=dockerfile_obj, tag=image):
            for streamline in _handle_stream_line(line):
                log.msg(streamline)

    if not self._image_exists(docker_client, image):
        log.msg("Image '%s' not found" % image)
        raise LatentWorkerFailedToSubstantiate(
            'Image "%s" not found on docker host.' % image)

    self.parse_volumes(volumes)
    self.hostconfig['binds'] = self.binds
    host_conf = docker_client.create_host_config(**self.hostconfig)

    container_name = '%s_%s' % (self.workername, id(self))
    instance = docker_client.create_container(
        image,
        self.command,
        name=container_name,
        volumes=self.volumes,
        environment=self.createEnvironment(),
        host_config=host_conf)

    if instance.get('Id') is None:
        log.msg('Failed to create the container')
        raise LatentWorkerFailedToSubstantiate('Failed to start container')

    shortid = instance['Id'][:6]
    log.msg('Container created, Id: %s...' % (shortid,))
    instance['image'] = image
    self.instance = instance
    docker_client.start(instance)
    log.msg('Container started')

    if self.followStartupLogs:
        logs = docker_client.attach(
            container=instance, stdout=True, stderr=True, stream=True)
        # Relay container output until a connection is set on this worker.
        for line in logs:
            log.msg("docker VM %s: %s" % (shortid, line.strip()))
            if self.conn:
                break
        del logs

    return [instance['Id'], image]
def _start_instance(self, image_uuid, block_devices):
    """Boot a Nova server for this worker and block until it leaves BUILD.

    The server is created with ``self.flavor``, ``self.meta`` and
    ``self.nova_args`` and polled every ``self._poll_resolution`` seconds.

    Returns ``[instance_id, image_uuid, 'HH:MM:SS']`` once the server status
    is ACTIVE.  Raises ``LatentWorkerFailedToSubstantiate`` when the server
    cannot be fetched any more; any other final status is handed to
    ``self.failed_to_start``.
    """
    boot_args = [self.workername, image_uuid, self.flavor]
    boot_kwargs = dict(
        meta=self.meta,
        block_device_mapping_v2=block_devices,
        **self.nova_args)
    instance = self.novaclient.servers.create(*boot_args, **boot_kwargs)
    # There is an issue when using sessions that the status is not
    # available on the first try. Trying again will work fine. Fetch the
    # instance to avoid that.
    try:
        instance = self.novaclient.servers.get(instance.id)
    except NotFound as e:
        # Formatted eagerly: log.msg() does not interpolate '{...}'
        # placeholders from keyword arguments, so the previous call logged
        # the raw template text.
        log.msg('%s %s instance %s (%s) never found' %
                (self.__class__.__name__, self.workername, instance.id,
                 instance.name))
        raise LatentWorkerFailedToSubstantiate(instance.id, BUILD) from e

    self.instance = instance
    log.msg('%s %s starting instance %s (image %s)' %
            (self.__class__.__name__, self.workername, instance.id,
             image_uuid))

    duration = 0
    interval = self._poll_resolution
    while instance.status.startswith(BUILD):
        time.sleep(interval)
        duration += interval
        # Progress is only reported when the waited time is a whole minute.
        if duration % 60 == 0:
            log.msg('%s %s has waited %d minutes for instance %s' %
                    (self.__class__.__name__, self.workername, duration // 60,
                     instance.id))
        try:
            instance = self.novaclient.servers.get(instance.id)
        except NotFound as e:
            # `instance` still refers to the last successfully fetched server.
            log.msg('%s %s instance %s (%s) went missing' %
                    (self.__class__.__name__, self.workername,
                     instance.id, instance.name))
            raise LatentWorkerFailedToSubstantiate(
                instance.id, instance.status) from e

    if instance.status == ACTIVE:
        minutes = duration // 60
        seconds = duration % 60
        log.msg('%s %s instance %s (%s) started '
                'in about %d minutes %d seconds' %
                (self.__class__.__name__, self.workername,
                 instance.id, instance.name, minutes, seconds))
        return [
            instance.id,
            image_uuid,
            '%02d:%02d:%02d' % (minutes // 60, minutes % 60, seconds),
        ]
    else:
        self.failed_to_start(instance.id, instance.status)
def start_instance(self, build):
    """Create the Marathon application that runs this worker.

    Generator-style coroutine (it yields the results of the calls it waits
    on; the driving decorator is not visible here).  Any previous instance is
    stopped first, then the application definition is POSTed to ``/v2/apps``.

    Finishes with ``True`` via ``defer.returnValue``.  Raises
    ``LatentWorkerFailedToSubstantiate`` when the response code is not 201.
    """
    yield self.stop_instance(reportFailure=False)
    image, marathon_extra_config = \
        yield self.renderWorkerPropsOnStart(build)

    marathon_config = {
        "container": {
            "docker": {
                "image": image,
                "network": "BRIDGE",
            },
            "type": "DOCKER"
        },
        "id": self.getApplicationId(),
        "instances": 1,
        "env": self.createEnvironment()
    }
    util.dictionary_merge(marathon_config, marathon_extra_config)

    res = yield self._http.post("/v2/apps", json=marathon_config)
    res_json = yield res.json()
    if res.code != 201:
        # An error body without a 'message' key must not turn the failure
        # into a KeyError; report whatever is available instead.
        message = res_json.get('message') if isinstance(res_json, dict) else None
        raise LatentWorkerFailedToSubstantiate(
            "Unable to create Marathon app: {} {}: {} {}".format(
                self.getApplicationId(), res.code, message, res_json))
    self.instance = res_json
    defer.returnValue(True)
def _submit_request(self):
    """Compute a spot bid from the last day's prices and request an instance.

    The bid is the mean price of matching history entries multiplied by
    ``self.price_multiplier`` (0.02 when there is no matching entry) and is
    stored in ``self.current_spot_price``.

    Returns ``(instance_id, image_id, start_time, True)`` on success, or
    ``(request, None, None, False)`` when waiting for the request reports no
    success.  Raises ``LatentWorkerFailedToSubstantiate`` when the bid
    exceeds ``self.max_spot_price``.
    """
    yesterday = time.gmtime(int(time.time() - 86400))
    history_start = time.strftime('%Y-%m-%dT%H:%M:%SZ', yesterday)
    spot_prices = self.ec2.meta.client.describe_spot_price_history(
        StartTime=history_start,
        ProductDescriptions=[self.product_description],
        AvailabilityZone=self.placement)

    total = 0.0
    samples = 0
    for entry in spot_prices['SpotPriceHistory']:
        if entry['InstanceType'] != self.instance_type:
            continue
        total += float(entry['SpotPrice'])
        samples += 1

    if samples == 0:
        self.current_spot_price = 0.02
    else:
        self.current_spot_price = (total / samples) * self.price_multiplier

    if self.current_spot_price > self.max_spot_price:
        log.msg('%s %s calculated spot price %0.3f exceeds '
                'configured maximum of %0.3f' %
                (self.__class__.__name__, self.workername,
                 self.current_spot_price, self.max_spot_price))
        raise LatentWorkerFailedToSubstantiate()

    if self.retry > 1:
        log.msg(
            '%s %s requesting spot instance with price %0.4f, attempt %d of %d'
            % (self.__class__.__name__, self.workername,
               self.current_spot_price, self.attempt, self.retry))
    else:
        log.msg('%s %s requesting spot instance with price %0.4f' %
                (self.__class__.__name__, self.workername,
                 self.current_spot_price))

    reservations = self.ec2.meta.client.request_spot_instances(
        SpotPrice=str(self.current_spot_price),
        LaunchSpecification=self._remove_none_opts(
            ImageId=self.ami,
            KeyName=self.keypair_name,
            SecurityGroups=self.classic_security_groups,
            UserData=self.user_data,
            InstanceType=self.instance_type,
            Placement=self._remove_none_opts(
                AvailabilityZone=self.placement,
            ),
            SubnetId=self.subnet_id,
            SecurityGroupIds=self.security_group_ids,
            BlockDeviceMappings=self.block_device_map,
            IamInstanceProfile=self._remove_none_opts(
                Name=self.instance_profile_name,
            )))

    request, success = self._wait_for_request(
        reservations['SpotInstanceRequests'][0])
    if not success:
        return request, None, None, False

    instance_id = request['InstanceId']
    self.instance = self.ec2.Instance(instance_id)
    image = self.get_image()
    instance_id, start_time = self._wait_for_instance()
    return instance_id, image.id, start_time, True
def start_instance_result(result):
    """Pass a truthy start result through; otherwise report the failure.

    ``self`` is taken from the enclosing scope.  A falsy ``result`` is
    reported to ``self._substantiation_notifier`` as a
    ``LatentWorkerFailedToSubstantiate`` and ``None`` is returned.
    """
    if result:
        return result

    # If we don't report success, then preparation failed.
    msg = "Worker does not want to substantiate at this time"
    self._substantiation_notifier.notify(
        LatentWorkerFailedToSubstantiate(self.name, msg))
    return None
def _prepare_base_image(self):
    """
    I am a private method for creating (possibly cheap) copies of a
    base_image for start_instance to boot.

    Generator-style coroutine: it yields the result of the clone process.
    Nothing is done when ``self.base_image`` is not set.  With
    ``self.cheap_copy`` a qcow2 image backed by the base image is created,
    otherwise the base image is copied with ``cp``.

    Raises ``LatentWorkerFailedToSubstantiate`` when the clone command exits
    with a non-zero return code; any failure is logged and re-raised.
    """
    if not self.base_image:
        return

    if self.cheap_copy:
        clone_cmd = [
            'qemu-img', 'create', '-b', self.base_image, '-f', 'qcow2',
            self.image
        ]
    else:
        clone_cmd = ['cp', self.base_image, self.image]

    # The message previously ended with a stray, unmatched quote character.
    log.msg(f"Cloning base image: {clone_cmd}")

    try:
        rc = yield runprocess.run_process(self.master.reactor,
                                          clone_cmd,
                                          collect_stdout=False,
                                          collect_stderr=False)
        if rc != 0:
            raise LatentWorkerFailedToSubstantiate(
                f'Failed to clone image (rc={rc})')
    except Exception as e:
        log.err(f"Cloning failed: {e}")
        raise
def _substantiate(self, build):
    """Start the instance and, when already attached, mark it substantiated.

    Generator-style coroutine: it yields the result of
    ``self.start_instance(build)``.  Any exception, including a falsy start
    result, stops the missing timer and is passed to
    ``self._substantiation_failed`` wrapped in a ``failure.Failure``.
    """
    # register event trigger
    try:
        # if build_wait_timeout is negative we don't ever disconnect the
        # worker ourselves, so we don't need to wait for it to attach
        # to declare it as substantiated.
        dont_wait_to_attach = (
            self.build_wait_timeout < 0 and self.conn is not None)

        start_success = yield self.start_instance(build)
        if not start_success:
            # this behaviour is kept as compatibility, but it is better
            # to just errback with a workable reason
            msg = "Worker does not want to substantiate at this time"
            raise LatentWorkerFailedToSubstantiate(self.name, msg)

        already_attached = (
            dont_wait_to_attach
            and self.state == States.SUBSTANTIATING
            and self.conn is not None)
        if already_attached:
            log.msg(r"Worker %s substantiated (already attached)" %
                    (self.name, ))
            self.state = States.SUBSTANTIATED
            self._fireSubstantiationNotifier(True)
    except Exception as e:
        self.stopMissingTimer()
        self._substantiation_failed(failure.Failure(e))
def _thd_start_instance(self, image, size):
    """Clean up, create and start the container for this worker.

    Returns ``[container_id, image]``.  Raises
    ``LatentWorkerFailedToSubstantiate`` when the created container has no
    ``Id``.  The time spent in each phase is attached to a debug log event.
    """
    started_at = time.time()
    self._thd_cleanup_instance()
    cleaned_at = time.time()

    instance = self.client.create_container(
        image,
        environment=self.createEnvironment(),
        labels={'sh_hyper_instancetype': size},
        name=self.getContainerName())
    created_at = time.time()

    if instance.get('Id') is None:
        raise LatentWorkerFailedToSubstantiate('Failed to start container')

    instance['image'] = image
    self.instance = instance
    self.client.start(instance)
    running_at = time.time()

    log.debug(
        '{name}:{containerid}: Container started in {total_time:.2f}',
        name=self.name,
        containerid=self.shortid,
        clean_time=cleaned_at - started_at,
        create_time=created_at - cleaned_at,
        start_time=running_at - created_at,
        total_time=running_at - started_at)
    return [instance['Id'], image]
def _request_spot_instance(self):
    """Request a spot instance and wait until it is running.

    The bid is ``self.max_spot_price`` when no price multiplier is set;
    otherwise it comes from the spot price history and is capped at
    ``self.max_spot_price`` when that limit is set.

    Returns ``(instance_id, image_id, start_time)``.  Raises
    ``LatentWorkerFailedToSubstantiate`` when waiting for the request
    reports no success.
    """
    if self.price_multiplier is not None:
        bid_price = self._bid_price_from_spot_price_history()
        if self.max_spot_price is not None \
                and bid_price > self.max_spot_price:
            bid_price = self.max_spot_price
    else:
        bid_price = self.max_spot_price

    log.msg('%s %s requesting spot instance with price %0.4f' %
            (self.__class__.__name__, self.workername, bid_price))

    reservations = self.ec2.meta.client.request_spot_instances(
        SpotPrice=str(bid_price),
        LaunchSpecification=self._remove_none_opts(
            ImageId=self.ami,
            KeyName=self.keypair_name,
            SecurityGroups=self.classic_security_groups,
            UserData=self.user_data,
            InstanceType=self.instance_type,
            Placement=self._remove_none_opts(
                AvailabilityZone=self.placement,
            ),
            SubnetId=self.subnet_id,
            SecurityGroupIds=self.security_group_ids,
            BlockDeviceMappings=self.block_device_map,
            IamInstanceProfile=self._remove_none_opts(
                Name=self.instance_profile_name,
            )))

    first_request = reservations['SpotInstanceRequests'][0]
    request, success = self._wait_for_request(first_request)
    if not success:
        raise LatentWorkerFailedToSubstantiate()

    instance_id = request['InstanceId']
    self.instance = self.ec2.Instance(instance_id)
    image = self.get_image()
    instance_id, start_time = self._wait_for_instance()
    return instance_id, image.id, start_time
def _request_spot_instance(self):
    """Submit a spot request, retrying up to ``self.retry`` times.

    With ``self.retry`` greater than 1 the current attempt number is kept in
    ``self.attempt`` (reset to 0 once all attempts have failed).

    Returns ``(instance_id, image_id, start_time)``.  Raises
    ``LatentWorkerFailedToSubstantiate`` when no request succeeds.
    """
    if self.retry <= 1:
        # Single-shot mode: one request, no attempt bookkeeping.
        instance_id, image_id, start_time, success = self._submit_request()
        if not success:
            raise LatentWorkerFailedToSubstantiate()
        return instance_id, image_id, start_time

    for attempt in range(1, self.retry + 1):
        self.attempt = attempt
        instance_id, image_id, start_time, success = self._submit_request()
        if success:
            break
        if attempt >= self.retry:
            self.attempt = 0
            log.msg('%s %s failed to substantiate after %d requests' %
                    (self.__class__.__name__, self.workername, self.retry))
            raise LatentWorkerFailedToSubstantiate()
    return instance_id, image_id, start_time
def start_instance(self, build):
    """Create the pod for this worker from its rendered properties.

    Generator-style coroutine: any previous instance is stopped, the pod
    spec is rendered for ``build`` and the pod is created in
    ``self.namespace``.  Finishes with ``True`` via ``defer.returnValue``.
    A ``KubeError`` is re-raised as ``LatentWorkerFailedToSubstantiate``.
    """
    yield self.stop_instance(reportFailure=False)
    rendered_spec = yield self.renderWorkerPropsOnStart(build)
    try:
        yield self._kube.createPod(self.namespace, rendered_spec)
    except kubeclientservice.KubeError as err:
        raise LatentWorkerFailedToSubstantiate(str(err))
    defer.returnValue(True)
def _submit_request(self):
    """Compute a spot bid from the last day's prices and request an instance.

    The bid is the mean price of matching history entries multiplied by
    ``self.price_multiplier`` (0.02 when there is no matching entry) and is
    stored in ``self.current_spot_price``.

    Returns ``(instance_id, image_id, start_time, True)`` on success, or
    ``(request, None, None, False)`` when waiting for the request reports no
    success.  Raises ``LatentWorkerFailedToSubstantiate`` when the bid
    exceeds ``self.max_spot_price``.
    """
    yesterday = time.gmtime(int(time.time() - 86400))
    history_start = time.strftime('%Y-%m-%dT%H:%M:%SZ', yesterday)
    spot_prices = self.ec2_conn.get_spot_price_history(
        start_time=history_start,
        product_description=self.product_description,
        availability_zone=self.placement)

    total = 0.0
    samples = 0
    for entry in spot_prices:
        if entry.instance_type != self.instance_type:
            continue
        total += entry.price
        samples += 1

    if samples == 0:
        self.current_spot_price = 0.02
    else:
        self.current_spot_price = (total / samples) * self.price_multiplier

    if self.current_spot_price > self.max_spot_price:
        log.msg('%s %s calculated spot price %0.3f exceeds '
                'configured maximum of %0.3f' %
                (self.__class__.__name__, self.workername,
                 self.current_spot_price, self.max_spot_price))
        raise LatentWorkerFailedToSubstantiate()

    if self.retry > 1:
        log.msg(
            '%s %s requesting spot instance with price %0.4f, attempt %d of %d'
            % (self.__class__.__name__, self.workername,
               self.current_spot_price, self.attempt, self.retry))
    else:
        log.msg('%s %s requesting spot instance with price %0.4f' %
                (self.__class__.__name__, self.workername,
                 self.current_spot_price))

    reservations = self.ec2_conn.request_spot_instances(
        self.current_spot_price,
        self.ami,
        key_name=self.keypair_name,
        security_groups=[self.classic_security_groups],
        instance_type=self.instance_type,
        user_data=self.user_data,
        placement=self.placement,
        subnet_id=self.subnet_id,
        security_group_ids=self.security_group_ids,
        instance_profile_name=self.instance_profile_name,
        block_device_map=self.block_device_map)

    request, success = self._wait_for_request(reservations[0])
    if not success:
        return request, None, None, False

    instance_id = request.instance_id
    reservations = self.ec2_conn.get_all_instances(
        instance_ids=[instance_id])
    self.instance = reservations[0].instances[0]
    instance_id, image_id, start_time = self._wait_for_instance(
        self.get_image())
    return instance_id, image_id, start_time, True
def start_instance(self, build):
    """Create the pod for this worker from the default and extra specs.

    Generator-style coroutine: any previous instance is stopped, then the
    default pod spec merged with ``self.kube_extra_spec`` is created in
    ``self.namespace``.  Finishes with ``True`` via ``defer.returnValue``.
    A ``KubeError`` is re-raised as ``LatentWorkerFailedToSubstantiate``.
    """
    yield self.stop_instance(reportFailure=False)
    base_spec = self.default_pod_spec()
    merged_spec = self.merge_spec(base_spec, self.kube_extra_spec)
    try:
        yield self._kube.createPod(self.namespace, merged_spec)
    except kubeclientservice.KubeError as err:
        raise LatentWorkerFailedToSubstantiate(str(err))
    defer.returnValue(True)
def start_instance_result(result):
    """Pass a truthy start result through; turn a falsy one into a Failure.

    ``self`` is taken from the enclosing scope.  A falsy ``result`` is
    returned as a ``failure.Failure`` wrapping
    ``LatentWorkerFailedToSubstantiate`` so the errback handles the issue.
    """
    if result:
        return result

    # If we don't report success, then preparation failed.
    # this behaviour is kept as compatibility, but it is better
    # to just errback with a workable reason
    msg = "Worker does not want to substantiate at this time"
    error = LatentWorkerFailedToSubstantiate(self.name, msg)
    return failure.Failure(error)
def start_instance(self, build):
    """Create the pod for this worker from ``self.get_pod_spec(build)``.

    Generator-style coroutine: any previous instance is stopped, then the
    pod is created in ``self.namespace``.  Finishes with ``True`` via
    ``defer.returnValue``.  A ``KubeError`` is re-raised as
    ``LatentWorkerFailedToSubstantiate``.
    """
    # The leftover debugging pprint() calls (a marker string and a dump of
    # the pod spec to stdout) have been removed.
    yield self.stop_instance(reportFailure=False)
    pod_spec = self.get_pod_spec(build)
    try:
        yield self._kube.createPod(self.namespace, pod_spec)
    except kubeclientservice.KubeError as e:
        raise LatentWorkerFailedToSubstantiate(str(e))
    defer.returnValue(True)
def _wait_for_request(self, reservation):
    """Poll a spot instance request until its status is no longer pending.

    Sleeps ``self._poll_resolution`` seconds between polls.

    Returns ``(request, True)`` when the request is fulfilled.  For every
    other final status the request is cancelled and
    ``LatentWorkerFailedToSubstantiate`` is raised.
    """
    def fetch():
        # Query the current state of the request made for `reservation`.
        response = self.ec2.meta.client.describe_spot_instance_requests(
            SpotInstanceRequestIds=[reservation['SpotInstanceRequestId']])
        spot_request = response['SpotInstanceRequests'][0]
        return spot_request, spot_request['Status']['Code']

    waited = 0
    step = self._poll_resolution
    request, request_status = fetch()
    while request_status in SPOT_REQUEST_PENDING_STATES:
        time.sleep(step)
        waited += step
        if waited % 60 == 0:
            log.msg(
                '{} {} has waited {} minutes for spot request {}'.format(
                    self.__class__.__name__, self.workername, waited // 60,
                    request['SpotInstanceRequestId']))
        request, request_status = fetch()

    if request_status == FULFILLED:
        minutes = waited // 60
        seconds = waited % 60
        log.msg(
            '{} {} spot request {} fulfilled in about {} minutes {} seconds'
            .format(self.__class__.__name__, self.workername,
                    request['SpotInstanceRequestId'], minutes, seconds))
        return request, True

    if request_status == PRICE_TOO_LOW:
        self.ec2.meta.client.cancel_spot_instance_requests(
            SpotInstanceRequestIds=[request['SpotInstanceRequestId']])
        log.msg('{} {} spot request rejected, spot price too low'.format(
            self.__class__.__name__, self.workername))
    else:
        log.msg('{} {} failed to fulfill spot request {} with status {}'.
                format(self.__class__.__name__, self.workername,
                       request['SpotInstanceRequestId'], request_status))
        # try to cancel, just for good measure
        self.ec2.meta.client.cancel_spot_instance_requests(
            SpotInstanceRequestIds=[request['SpotInstanceRequestId']])

    raise LatentWorkerFailedToSubstantiate(
        request['SpotInstanceRequestId'], request_status)
def start_instance(self, build):
    """
    I start a new instance of a VM.

    If a base_image is specified, I will make a clone of that otherwise I
    will use image directly.

    If I'm not given libvirt domain definition XML, I will look for my name
    in the list of defined virtual machines and start that.

    Generator-style coroutine that finishes with ``True``.  Raises
    ``LatentWorkerFailedToSubstantiate`` when the domain ID cannot be
    retrieved, when the domain is already active, or when starting the VM
    fails.
    """
    try:
        domain_id = yield self._get_domain_id()
    except Exception as e:
        raise LatentWorkerFailedToSubstantiate(
            '{}: Got error while retrieving domain ID: {}'.format(self, e)) from e

    # Checked outside the try block above: raised inside it, this error was
    # caught by its own `except Exception` and re-reported as a failure to
    # retrieve the domain ID.
    if domain_id != -1:
        raise LatentWorkerFailedToSubstantiate(
            "{}: Cannot start_instance as it's already active".format(self))

    yield self._prepare_base_image()

    try:
        if self.xml:
            yield self._pool_do(lambda conn: conn.createXML(self.xml, 0))
        else:
            domain = yield self._get_domain()
            yield self._pool_do(lambda conn: domain.setMetadata(
                libvirt.VIR_DOMAIN_METADATA_ELEMENT,
                self.metadata.format(self.workername, self.password,
                                     self.masterFQDN),
                self.metakey,
                self.ns,
                libvirt.VIR_DOMAIN_AFFECT_CONFIG))

            yield self._pool_do(lambda conn: domain.create())
    except Exception as e:
        raise LatentWorkerFailedToSubstantiate(
            '{}: Got error while starting VM: {}'.format(self, e)) from e

    return True
def _substantiate(self, build):
    """Start the instance and route any failure to the failure handler.

    Generator-style coroutine: it yields the result of
    ``self.start_instance(build)``.  Any exception, including a falsy start
    result, stops the missing timer and is passed to
    ``self._substantiation_failed`` wrapped in a ``failure.Failure``.
    """
    # register event trigger
    try:
        started = yield self.start_instance(build)
        if started:
            return
        # this behaviour is kept as compatibility, but it is better
        # to just errback with a workable reason
        msg = "Worker does not want to substantiate at this time"
        raise LatentWorkerFailedToSubstantiate(self.name, msg)
    except Exception as e:
        self.stopMissingTimer()
        self._substantiation_failed(failure.Failure(e))
def _thd_start_pod(self, pod, wait_for_completion=False):
    """Start the pod resource provided as a dictionary.

    This method blocks, polling every ``self._poll_resolution`` seconds,
    while the pod phase is pending (and also while it is 'Running' when
    ``wait_for_completion`` is set).

    Returns the pod name.  Raises ``LatentWorkerCannotSubstantiate`` when
    the pod cannot be created, ``LatentWorkerFailedToSubstantiate`` when the
    pod status cannot be read (unless waiting for completion), and
    ``KubePodWorkerCannotSubstantiate`` when the final phase is not a
    running/successful one; in that last case the pod is deleted first.
    """
    pod_name = pod.get('metadata', {}).get('name', 'no_name')
    self.logger.debug(
        'Starting pod %r with config:\n%s' %
        (pod_name, yaml.safe_dump(pod, default_flow_style=False)))

    try:
        instance = client.CoreV1Api().create_namespaced_pod(
            self.namespace, pod)
    except ApiException as ex:
        raise LatentWorkerCannotSubstantiate(
            'Failed to create pod %s: %s' % (pod_name, ex.reason))

    pending = [None, 'Pending', 'Unknown']
    if wait_for_completion:
        pending.append('Running')

    while instance.status.phase in pending:
        sleep(self._poll_resolution)
        try:
            instance = client.CoreV1Api().read_namespaced_pod_status(
                instance.metadata.name, self.namespace)
        except ApiException as ex:
            if not wait_for_completion:
                raise LatentWorkerFailedToSubstantiate(
                    'Pod %s went missing: %s' %
                    (instance.metadata.name, ex.reason))
            # pod may have completed
            break

    # Ensure the pod is running or has run successfully
    if instance.status.phase in [None, 'Pending', 'Failed', 'Unknown']:
        try:
            raise KubePodWorkerCannotSubstantiate(
                'Creating Pod %(pod)s failed (%(phase)s)', instance)
        finally:
            self.delete_pod(instance.metadata.name)

    if wait_for_completion:
        self.delete_pod(instance.metadata.name)
    return instance.metadata.name
def _thd_start_instance(self, image):
    """Create and start the container for this worker.

    The container name is built from the worker name and ``id(self)`` with
    underscores replaced by dashes.

    Returns ``[container_id, image]``.  Raises
    ``LatentWorkerFailedToSubstantiate`` when the created container has no
    ``Id``.
    """
    environment = self.createEnvironment()
    labels = {'sh_hyper_instancetype': self.size}
    container_name = ('%s%s' % (self.workername, id(self))).replace("_", "-")
    instance = self.client.create_container(
        image,
        environment=environment,
        labels=labels,
        name=container_name)

    if instance.get('Id') is None:
        raise LatentWorkerFailedToSubstantiate('Failed to start container')

    shortid = instance['Id'][:6]
    log.msg('Container created, Id: %s...' % (shortid, ))
    instance['image'] = image
    self.instance = instance
    self.client.start(instance)
    return [instance['Id'], image]
def _start_instance(self):
    """Boot a Nova server for this worker and block until it leaves BUILD.

    A fresh OpenStack client is created from the stored credentials, the
    image is resolved with ``self._getImage`` and the server is polled every
    ``self._poll_resolution`` seconds.

    Returns ``[instance_id, image_uuid, 'HH:MM:SS']`` once the server status
    is ACTIVE.  Raises ``LatentWorkerFailedToSubstantiate`` when the server
    cannot be fetched any more; any other final status is handed to
    ``self.failed_to_start``.
    """
    # Authenticate to OpenStack.
    os_client = client.Client(self.client_version, self.os_username,
                              self.os_password, self.os_tenant_name,
                              self.os_auth_url)
    image_uuid = self._getImage(os_client, self.image)

    boot_args = [self.workername, image_uuid, self.flavor]
    boot_kwargs = dict(
        meta=self.meta,
        block_device_mapping_v2=self.block_devices,
        **self.nova_args)
    instance = os_client.servers.create(*boot_args, **boot_kwargs)
    self.instance = instance
    log.msg('%s %s starting instance %s (image %s)' %
            (self.__class__.__name__, self.workername, instance.id,
             image_uuid))

    waited = 0
    step = self._poll_resolution
    # `current` is the latest fetched view of the server; `instance` stays
    # the object returned by create().
    current = instance
    while current.status.startswith(BUILD):
        time.sleep(step)
        waited += step
        if waited % 60 == 0:
            log.msg('%s %s has waited %d minutes for instance %s' %
                    (self.__class__.__name__, self.workername, waited // 60,
                     instance.id))
        try:
            current = os_client.servers.get(instance.id)
        except nce.NotFound:
            log.msg('%s %s instance %s (%s) went missing' %
                    (self.__class__.__name__, self.workername,
                     instance.id, instance.name))
            raise LatentWorkerFailedToSubstantiate(
                instance.id, instance.status)

    if current.status == ACTIVE:
        minutes = waited // 60
        seconds = waited % 60
        log.msg('%s %s instance %s (%s) started '
                'in about %d minutes %d seconds' %
                (self.__class__.__name__, self.workername,
                 instance.id, instance.name, minutes, seconds))
        return [
            instance.id,
            image_uuid,
            '%02d:%02d:%02d' % (minutes // 60, minutes % 60, seconds),
        ]

    self.failed_to_start(instance.id, current.status)
def _thd_start_instance(self, namespace, job):
    """Create the job for this worker in ``namespace``.

    Returns ``[job_name, image]`` where ``image`` is the image of the first
    container in the created job's pod template.  Raises
    ``LatentWorkerFailedToSubstantiate`` when job creation returns ``None``.
    """
    self.load_config(self.kubeConfig)
    batch_client = client.BatchV1Api()
    # TODO: cleanup or not cleanup ?
    # cleanup the old instances
    instance = batch_client.create_namespaced_job(namespace, job)

    if instance is None:
        log.msg('Failed to create the container')
        raise LatentWorkerFailedToSubstantiate('Failed to start container')

    job_name = instance.metadata.name  # pylint: disable=no-member
    log.msg('Job created, Id: %s...' % job_name)
    self.instance = instance
    # pylint: disable=no-member
    image = instance.spec.template.spec.containers[0].image
    return [job_name, image]
def _substantiate(self, build):
    """Substantiate the machine (if any) and start the worker instance.

    Generator-style coroutine.  The instance is started while holding
    ``self._start_stop_lock``.  Any exception, including a falsy start
    result, stops the missing timer and is passed to
    ``self._substantiation_failed`` wrapped in a ``failure.Failure``.
    """
    assert self.state == States.SUBSTANTIATING
    try:
        # if build_wait_timeout is negative we don't ever disconnect the
        # worker ourselves, so we don't need to wait for it to attach
        # to declare it as substantiated.
        dont_wait_to_attach = \
            self.build_wait_timeout < 0 and self.conn is not None

        start_success = True
        if ILatentMachine.providedBy(self.machine):
            start_success = yield self.machine.substantiate(self)

        # The lock is acquired before entering the try/finally so that it is
        # only released when it was actually obtained; a failure while
        # logging or acquiring no longer releases a lock that is not held.
        self._log_start_stop_locked('substantiating')
        yield self._start_stop_lock.acquire()
        try:
            if start_success:
                self.state = States.SUBSTANTIATING_STARTING
                start_success = yield self.start_instance(build)
        finally:
            self._start_stop_lock.release()

        if not start_success:
            # this behaviour is kept as compatibility, but it is better
            # to just errback with a workable reason
            msg = "Worker does not want to substantiate at this time"
            raise LatentWorkerFailedToSubstantiate(self.name, msg)

        if dont_wait_to_attach and \
                self.state == States.SUBSTANTIATING_STARTING and \
                self.conn is not None:
            log.msg(r"Worker {} substantiated (already attached)".format(
                self.name))
            self.state = States.SUBSTANTIATED
            self._fireSubstantiationNotifier(True)

    except Exception as e:
        self.stopMissingTimer()
        self._substantiation_failed(failure.Failure(e))
def _wait_for_request(self, reservation):
    """Poll a spot instance request until its status is no longer pending.

    Sleeps ``self._poll_resolution`` seconds between polls.

    Returns ``(request, True)`` when the request is fulfilled.  When the
    price was too low the request is cancelled, ``self.current_spot_price``
    is multiplied by ``self.retry_price_adjustment`` and ``(request, False)``
    is returned.  Any other final status raises
    ``LatentWorkerFailedToSubstantiate``.
    """
    duration = 0
    interval = self._poll_resolution
    requests = self.ec2.meta.client.describe_spot_instance_requests(
        SpotInstanceRequestIds=[reservation['SpotInstanceRequestId']])
    request = requests['SpotInstanceRequests'][0]
    request_status = request['Status']['Code']
    while request_status in SPOT_REQUEST_PENDING_STATES:
        time.sleep(interval)
        duration += interval
        # Progress is only reported when the waited time is a whole minute.
        if duration % 60 == 0:
            log.msg('%s %s has waited %d minutes for spot request %s' %
                    (self.__class__.__name__, self.workername, duration // 60,
                     request['SpotInstanceRequestId']))
        requests = self.ec2.meta.client.describe_spot_instance_requests(
            SpotInstanceRequestIds=[reservation['SpotInstanceRequestId']])
        request = requests['SpotInstanceRequests'][0]
        request_status = request['Status']['Code']

    if request_status == FULFILLED:
        minutes = duration // 60
        seconds = duration % 60
        log.msg('%s %s spot request %s fulfilled '
                'in about %d minutes %d seconds' %
                (self.__class__.__name__, self.workername,
                 request['SpotInstanceRequestId'], minutes, seconds))
        return request, True
    elif request_status == PRICE_TOO_LOW:
        self.ec2.meta.client.cancel_spot_instance_requests(
            SpotInstanceRequestIds=[request['SpotInstanceRequestId']])
        log.msg('%s %s spot request rejected, spot price too low' %
                (self.__class__.__name__, self.workername))
        # Adjust the bid so a later retry uses a different price.
        self.current_spot_price *= self.retry_price_adjustment
        return request, False
    else:
        log.msg('%s %s failed to fulfill spot request %s with status %s' %
                (self.__class__.__name__, self.workername,
                 request['SpotInstanceRequestId'], request_status))
        # `request` is a plain dict here, so the status code already read
        # from it is passed on; `request.status` raised AttributeError and
        # hid the real failure.
        raise LatentWorkerFailedToSubstantiate(
            request['SpotInstanceRequestId'], request_status)
def _thd_start_instance(self, image, dockerfile, volumes):
    """Remove stale containers, then create and start this worker's container.

    When ``image`` is None a per-worker image name is generated.  If the
    image is not known to exist and ``dockerfile`` is given, the image is
    built from it.

    Returns ``[container_id, image]``.  Raises
    ``LatentWorkerFailedToSubstantiate`` when the image is missing after the
    optional build, or when the created container has no ``Id``.
    """
    docker_client = self._getDockerClient()

    # cleanup the old instances
    stale_containers = docker_client.containers(
        all=1, filters=dict(name=self.getContainerName()))
    for stale in stale_containers:
        try:
            docker_client.remove_container(stale['Id'], v=True, force=True)
        except NotFound:
            pass  # that's a race condition

    if image is None:
        image_known = False
        image = '%s_%s_image' % (self.workername, id(self))
    else:
        image_known = self._image_exists(docker_client, image)

    if dockerfile is not None and not image_known:
        log.msg("Image '%s' not found, building it from scratch" % image)
        dockerfile_obj = BytesIO(dockerfile.encode('utf-8'))
        for line in docker_client.build(fileobj=dockerfile_obj, tag=image):
            for streamline in _handle_stream_line(line):
                log.msg(streamline)

    if not self._image_exists(docker_client, image):
        log.msg("Image '%s' not found" % image)
        raise LatentWorkerFailedToSubstantiate(
            'Image "%s" not found on docker host.' % image)

    volumes, binds = self._thd_parse_volumes(volumes)
    # Work on a copy so the stored host configuration is left untouched.
    host_options = self.hostconfig.copy()
    host_options['binds'] = binds
    host_conf = docker_client.create_host_config(**host_options)

    instance = docker_client.create_container(
        image,
        self.command,
        name=self.getContainerName(),
        volumes=volumes,
        environment=self.createEnvironment(),
        host_config=host_conf)

    if instance.get('Id') is None:
        log.msg('Failed to create the container')
        raise LatentWorkerFailedToSubstantiate('Failed to start container')

    shortid = instance['Id'][:6]
    log.msg('Container created, Id: %s...' % (shortid, ))
    instance['image'] = image
    self.instance = instance
    docker_client.start(instance)
    log.msg('Container started')

    if self.followStartupLogs:
        logs = docker_client.attach(
            container=instance, stdout=True, stderr=True, stream=True)
        # Relay container output until a connection is set on this worker.
        for line in logs:
            log.msg("docker VM %s: %s" % (shortid, line.strip()))
            if self.conn:
                break
        del logs

    return [instance['Id'], image]
def _thd_start_instance(self, image, dockerfile, volumes, custom_context,
                        encoding, buildargs):
    """Remove stale containers, then create and start this worker's container.

    When ``image`` is None a per-worker image name is generated.  If the
    image is not known to exist and ``dockerfile`` is given, the image is
    built: with ``custom_context`` the ``dockerfile`` argument is opened as
    a file path, otherwise its text is used directly.  The image may then be
    pulled depending on ``self.autopull`` and ``self.alwaysPull``.

    Returns ``[container_id, image]``.  Raises
    ``LatentWorkerCannotSubstantiate`` when the image is still missing and
    ``LatentWorkerFailedToSubstantiate`` when the created container has no
    ``Id``.
    """
    docker_client = self._getDockerClient()
    container_name = self.getContainerName()

    # cleanup the old instances
    stale_containers = docker_client.containers(
        all=1, filters=dict(name=container_name))
    container_name = "/{0}".format(container_name)
    for stale in stale_containers:
        # Only containers carrying exactly this name are removed.
        if container_name in stale['Names']:
            try:
                docker_client.remove_container(stale['Id'], v=True,
                                               force=True)
            except NotFound:
                pass  # that's a race condition

    if image is None:
        image_known = False
        image = '{}_{}_image'.format(self.workername, id(self))
    else:
        image_known = self._image_exists(docker_client, image)

    if dockerfile is not None and not image_known:
        log.msg(
            "Image '{}' not found, building it from scratch".format(image))
        if custom_context:
            with open(dockerfile, 'rb') as fin:
                lines = docker_client.build(fileobj=fin,
                                            custom_context=custom_context,
                                            encoding=encoding,
                                            tag=image,
                                            buildargs=buildargs)
        else:
            lines = docker_client.build(
                fileobj=BytesIO(dockerfile.encode('utf-8')),
                tag=image,
            )

        for line in lines:
            for streamline in _handle_stream_line(line):
                log.msg(streamline)

    image_exists = self._image_exists(docker_client, image)
    pull_wanted = (not image_exists) or self.alwaysPull
    if pull_wanted and self.autopull:
        if not image_exists:
            log.msg("Image '{}' not found, pulling from registry".format(
                image))
        docker_client.pull(image)

    if not self._image_exists(docker_client, image):
        msg = 'Image "{}" not found on docker host.'.format(image)
        log.msg(msg)
        raise LatentWorkerCannotSubstantiate(msg)

    volumes, binds = self._thd_parse_volumes(volumes)
    # Work on a copy so the stored host configuration is left untouched.
    host_options = self.hostconfig.copy()
    host_options['binds'] = binds
    if docker_py_version >= 2.2:
        host_options['init'] = True
    host_conf = docker_client.create_host_config(**host_options)

    instance = docker_client.create_container(
        image,
        self.command,
        name=self.getContainerName(),
        volumes=volumes,
        environment=self.createEnvironment(),
        host_config=host_conf)

    if instance.get('Id') is None:
        log.msg('Failed to create the container')
        raise LatentWorkerFailedToSubstantiate('Failed to start container')

    shortid = instance['Id'][:6]
    log.msg('Container created, Id: {}...'.format(shortid))
    instance['image'] = image
    self.instance = instance
    docker_client.start(instance)
    log.msg('Container started')

    if self.followStartupLogs:
        logs = docker_client.attach(
            container=instance, stdout=True, stderr=True, stream=True)
        # Relay container output until a connection is set on this worker.
        for line in logs:
            log.msg("docker VM {}: {}".format(shortid, line.strip()))
            if self.conn:
                break
        del logs

    return [instance['Id'], image]
def failed_to_start(self, instance_id, instance_state):
    """Log that the instance failed to start and raise.

    Always raises ``LatentWorkerFailedToSubstantiate`` carrying the instance
    id and its state.
    """
    details = (self.__class__.__name__, self.workername, instance_id,
               instance_state)
    log.msg('%s %s failed to start instance %s (%s)' % details)
    raise LatentWorkerFailedToSubstantiate(instance_id, instance_state)
def _thd_start_instance(self, image, dockerfile, hostconfig, volumes):
    """Remove stale containers, then create and start this worker's container.

    When ``image`` is None a per-worker image name is generated.  If the
    image is not known to exist and ``dockerfile`` is given, the image is
    built from it.  The image may then be pulled depending on
    ``self.autopull`` and ``self.alwaysPull``.

    ``hostconfig`` provides the keyword arguments for the docker host
    configuration; it is copied, not modified.

    Returns ``[container_id, image]``.  Raises
    ``LatentWorkerCannotSubstantiate`` when the image is still missing and
    ``LatentWorkerFailedToSubstantiate`` when the created container has no
    ``Id``.
    """
    # License note:
    #    copied from the original implementation with minor modification
    #    to pass runtime configuration to the containers
    with self.docker_client() as docker_client:
        container_name = self.getContainerName()
        # cleanup the old instances
        instances = docker_client.containers(
            all=1,
            filters=dict(name=container_name))
        container_name = '/{0}'.format(container_name)
        for instance in instances:
            # Only containers carrying exactly this name are removed.
            if container_name not in instance['Names']:
                continue
            try:
                docker_client.remove_container(instance['Id'], v=True,
                                               force=True)
            except docker.errors.NotFound:
                pass  # that's a race condition

        found = False
        if image is not None:
            found = self._image_exists(docker_client, image)
        else:
            worker_id = id(self)
            worker_name = self.workername
            image = f'{worker_name}_{worker_id}_image'
        if (not found) and (dockerfile is not None):
            log.info(f'Image {image} not found, building it from scratch')
            for line in docker_client.build(
                fileobj=BytesIO(dockerfile.encode('utf-8')),
                tag=image
            ):
                for streamline in _handle_stream_line(line):
                    log.info(streamline)

        imageExists = self._image_exists(docker_client, image)
        if ((not imageExists) or self.alwaysPull) and self.autopull:
            if (not imageExists):
                log.info(f'Image {image} not found, pulling from registry')
            docker_client.pull(image)

        if (not self._image_exists(docker_client, image)):
            log.info(f'Image {image} not found')
            raise LatentWorkerCannotSubstantiate(
                f'Image {image} not found on docker host.'
            )

        volumes, binds = self._thd_parse_volumes(volumes)

        # Copy before adding keys so the caller's dict is not mutated
        # (the entries set below previously leaked back into it).
        host_options = dict(hostconfig)
        host_options['binds'] = binds
        if docker_py_version >= 2.2:
            host_options['init'] = True

        instance = docker_client.create_container(
            image,
            self.command,
            name=self.getContainerName(),
            volumes=volumes,
            environment=self.createEnvironment(),
            host_config=docker_client.create_host_config(
                **host_options
            )
        )

        if instance.get('Id') is None:
            log.info('Failed to create the container')
            raise LatentWorkerFailedToSubstantiate(
                'Failed to start container'
            )
        shortid = instance['Id'][:6]
        log.info(f'Container created, Id: {shortid}...')

        instance['image'] = image
        self.instance = instance
        docker_client.start(instance)
        log.info('Container started')
        if self.followStartupLogs:
            logs = docker_client.attach(
                container=instance, stdout=True, stderr=True, stream=True)
            # Relay container output until a connection is set on this worker.
            for line in logs:
                line = line.strip()
                log.info(f'docker VM {shortid}: {line}')
                if self.conn:
                    break
            del logs

    return [instance['Id'], image]