Example #1
0
    def _start_instance(self, image_uuid, flavor_uuid, block_devices,
                        nova_args, meta):
        """Boot a new Nova server for this worker and wait for it to leave BUILD.

        Any existing instance is stopped first through ``_stop_instance``.
        The server is created with the worker name, image and flavor, then
        re-fetched every ``self._poll_resolution`` seconds for as long as its
        status starts with ``BUILD``.

        Returns ``[instance id, image_uuid, 'HH:MM:SS']`` when the final
        status equals ``ACTIVE``; otherwise the instance id and final status
        are handed to ``self.failed_to_start``.

        Raises LatentWorkerFailedToSubstantiate when the server cannot be
        fetched right after creation or goes missing while being polled.
        """
        # ensure existing, potentially duplicated, workers are stopped
        self._stop_instance(None, True)

        # then try to start new one
        boot_args = [self.workername, image_uuid, flavor_uuid]
        boot_kwargs = dict(meta=meta,
                           block_device_mapping_v2=block_devices,
                           **nova_args)
        instance = self.novaclient.servers.create(*boot_args, **boot_kwargs)
        # There is an issue when using sessions that the status is not
        # available on the first try. Trying again will work fine. Fetch the
        # instance to avoid that.
        try:
            instance = self.novaclient.servers.get(instance.id)
        except NotFound as e:
            # log.msg() does not interpolate {}-style fields from keyword
            # arguments, so the message is formatted eagerly like every other
            # message in this method.
            log.msg(
                f'{self.__class__.__name__} {self.workername} instance {instance.id} '
                f'({instance.name}) never found')
            raise LatentWorkerFailedToSubstantiate(instance.id, BUILD) from e
        self.instance = instance
        log.msg(
            f'{self.__class__.__name__} {self.workername} starting instance {instance.id} '
            f'(image {image_uuid})')
        duration = 0
        interval = self._poll_resolution
        while instance.status.startswith(BUILD):
            time.sleep(interval)
            duration += interval
            # Progress message once per elapsed whole minute.
            if duration % 60 == 0:
                log.msg(
                    f'{self.__class__.__name__} {self.workername} has waited {duration // 60} '
                    f'minutes for instance {instance.id}')
            try:
                instance = self.novaclient.servers.get(instance.id)
            except NotFound as e:
                # `instance` still refers to the last successfully fetched
                # server, so its id/name/status are the last known values.
                log.msg(
                    f'{self.__class__.__name__} {self.workername} instance {instance.id} '
                    f'({instance.name}) went missing')
                raise LatentWorkerFailedToSubstantiate(instance.id,
                                                       instance.status) from e
        if instance.status == ACTIVE:
            minutes = duration // 60
            seconds = duration % 60
            log.msg(
                f'{self.__class__.__name__} {self.workername} instance {instance.id} '
                f'({instance.name}) started in about {minutes} minutes {seconds} seconds'
            )
            return [
                instance.id, image_uuid,
                f'{minutes // 60:02d}:{minutes % 60:02d}:{seconds:02d}'
            ]
        else:
            self.failed_to_start(instance.id, instance.status)
Example #2
0
    def _thd_wait_for_request(self, reservation):
        """Poll a spot instance request until it leaves the pending states.

        The request is looked up every ``self._poll_resolution`` seconds.
        Returns ``(request, True)`` once the status code equals ``FULFILLED``.
        For any other final status the spot request is cancelled and
        LatentWorkerFailedToSubstantiate is raised with the request id and
        the status code. ``ClientError``s other than the "not found" case are
        re-raised unchanged.
        """
        request_id = reservation['SpotInstanceRequestId']
        waited = 0
        step = self._poll_resolution

        while True:
            # Sometimes it can take a second or so for the spot request to be
            # ready.  If it isn't ready, you will get a "Spot instance request
            # ID 'sir-abcd1234' does not exist" exception.
            try:
                response = self.ec2.meta.client.describe_spot_instance_requests(
                    SpotInstanceRequestIds=[request_id])
            except ClientError as e:
                if 'InvalidSpotInstanceRequestID.NotFound' not in str(e):
                    raise
                response = None

            if response is not None:
                request = response['SpotInstanceRequests'][0]
                request_status = request['Status']['Code']
                if request_status not in SPOT_REQUEST_PENDING_STATES:
                    break

            time.sleep(step)
            waited += step
            if waited % 10 == 0:
                log.msg(
                    '{} {} has waited {} seconds for spot request {}'.format(
                        self.__class__.__name__, self.workername, waited,
                        request_id))

        spot_id = request['SpotInstanceRequestId']

        if request_status == FULFILLED:
            minutes, seconds = divmod(waited, 60)
            log.msg(
                '{} {} spot request {} fulfilled in about {} minutes {} seconds'
                .format(self.__class__.__name__, self.workername,
                        spot_id, minutes, seconds))
            return request, True

        # Every remaining final state is a failure: cancel and raise.
        if request_status == PRICE_TOO_LOW:
            self.ec2.meta.client.cancel_spot_instance_requests(
                SpotInstanceRequestIds=[spot_id])
            log.msg('{} {} spot request rejected, spot price too low'.format(
                self.__class__.__name__, self.workername))
        else:
            log.msg('{} {} failed to fulfill spot request {} with status {}'.
                    format(self.__class__.__name__, self.workername,
                           spot_id, request_status))
            # try to cancel, just for good measure
            self.ec2.meta.client.cancel_spot_instance_requests(
                SpotInstanceRequestIds=[spot_id])
        raise LatentWorkerFailedToSubstantiate(spot_id, request_status)
Example #3
0
    def _thd_start_instance(self, image, volumes):
        """Create and start a docker container for this worker.

        When ``image`` is None a name is derived from the worker name and
        ``id(self)``. If the image is not already known to exist and a
        dockerfile is configured, the image is built first. Returns
        ``[container id, image]``.

        Raises LatentWorkerFailedToSubstantiate when the image cannot be
        found after the optional build, or when the created container has no
        ``Id``.
        """
        docker_client = client.Client(**self.client_args)

        if image is None:
            image = '%s_%s_image' % (self.workername, id(self))
            found = False
        else:
            found = self._image_exists(docker_client, image)

        if not found and self.dockerfile is not None:
            log.msg("Image '%s' not found, building it from scratch" %
                    image)
            dockerfile_obj = BytesIO(self.dockerfile.encode('utf-8'))
            for line in docker_client.build(fileobj=dockerfile_obj, tag=image):
                for streamline in _handle_stream_line(line):
                    log.msg(streamline)

        # Checked again after the optional build above.
        if not self._image_exists(docker_client, image):
            log.msg("Image '%s' not found" % image)
            raise LatentWorkerFailedToSubstantiate(
                'Image "%s" not found on docker host.' % image
            )

        self.parse_volumes(volumes)
        self.hostconfig['binds'] = self.binds
        host_conf = docker_client.create_host_config(**self.hostconfig)

        container = docker_client.create_container(
            image,
            self.command,
            name='%s_%s' % (self.workername, id(self)),
            volumes=self.volumes,
            environment=self.createEnvironment(),
            host_config=host_conf
        )

        container_id = container.get('Id')
        if container_id is None:
            log.msg('Failed to create the container')
            raise LatentWorkerFailedToSubstantiate(
                'Failed to start container'
            )

        shortid = container_id[:6]
        log.msg('Container created, Id: %s...' % (shortid,))
        container['image'] = image
        self.instance = container
        docker_client.start(container)
        log.msg('Container started')

        if self.followStartupLogs:
            logs = docker_client.attach(
                container=container, stdout=True, stderr=True, stream=True)
            # Relay container output until the worker connection shows up.
            for line in logs:
                log.msg("docker VM %s: %s" % (shortid, line.strip()))
                if self.conn:
                    break
            del logs
        return [container_id, image]
Example #4
0
 def _start_instance(self, image_uuid, block_devices):
     """Boot a Nova server for this worker and wait for it to leave BUILD.

     The server is created from the worker name, ``image_uuid`` and
     ``self.flavor`` and re-fetched every ``self._poll_resolution`` seconds
     while its status starts with ``BUILD``.

     Returns ``[instance id, image_uuid, 'HH:MM:SS']`` when the final status
     equals ``ACTIVE``; otherwise the instance id and final status are handed
     to ``self.failed_to_start``.

     Raises LatentWorkerFailedToSubstantiate when the server cannot be
     fetched right after creation or goes missing while being polled.
     """
     boot_args = [self.workername, image_uuid, self.flavor]
     boot_kwargs = dict(meta=self.meta,
                        block_device_mapping_v2=block_devices,
                        **self.nova_args)
     instance = self.novaclient.servers.create(*boot_args, **boot_kwargs)
     # There is an issue when using sessions that the status is not
     # available on the first try. Trying again will work fine. Fetch the
     # instance to avoid that.
     try:
         instance = self.novaclient.servers.get(instance.id)
     except NotFound:
         # log.msg() does not interpolate {}-style fields from keyword
         # arguments, so the message is formatted eagerly like every other
         # message in this method.
         log.msg('%s %s instance %s (%s) never found' %
                 (self.__class__.__name__, self.workername, instance.id,
                  instance.name))
         raise LatentWorkerFailedToSubstantiate(instance.id, BUILD)
     self.instance = instance
     log.msg('%s %s starting instance %s (image %s)' %
             (self.__class__.__name__, self.workername, instance.id,
              image_uuid))
     duration = 0
     interval = self._poll_resolution
     while instance.status.startswith(BUILD):
         time.sleep(interval)
         duration += interval
         # Progress message once per elapsed whole minute.
         if duration % 60 == 0:
             log.msg('%s %s has waited %d minutes for instance %s' %
                     (self.__class__.__name__, self.workername,
                      duration // 60, instance.id))
         try:
             instance = self.novaclient.servers.get(instance.id)
         except NotFound:
             # `instance` still refers to the last successfully fetched
             # server, so its id/name/status are the last known values.
             log.msg('%s %s instance %s (%s) went missing' %
                     (self.__class__.__name__, self.workername, instance.id,
                      instance.name))
             raise LatentWorkerFailedToSubstantiate(instance.id,
                                                    instance.status)
     if instance.status == ACTIVE:
         minutes = duration // 60
         seconds = duration % 60
         log.msg('%s %s instance %s (%s) started '
                 'in about %d minutes %d seconds' %
                 (self.__class__.__name__, self.workername, instance.id,
                  instance.name, minutes, seconds))
         return [
             instance.id, image_uuid,
             '%02d:%02d:%02d' % (minutes // 60, minutes % 60, seconds)
         ]
     else:
         self.failed_to_start(instance.id, instance.status)
Example #5
0
    def start_instance(self, build):
        """Create the Marathon application that runs this worker.

        Generator-style coroutine (uses ``yield`` / ``defer.returnValue``).
        Any previous instance is stopped first, the image and extra
        configuration are rendered for ``build``, and the merged application
        definition is POSTed to ``/v2/apps``. The decoded response is stored
        in ``self.instance`` and ``True`` is returned.

        Raises LatentWorkerFailedToSubstantiate when the response code is
        not 201.
        """
        yield self.stop_instance(reportFailure=False)

        rendered = yield self.renderWorkerPropsOnStart(build)
        image, marathon_extra_config = rendered

        docker_section = {
            "image": image,
            "network": "BRIDGE",
        }
        marathon_config = {
            "container": {
                "docker": docker_section,
                "type": "DOCKER"
            },
            "id": self.getApplicationId(),
            "instances": 1,
            "env": self.createEnvironment()
        }
        # Extra configuration is merged into the base definition in place.
        util.dictionary_merge(marathon_config, marathon_extra_config)

        res = yield self._http.post("/v2/apps", json=marathon_config)
        res_json = yield res.json()
        if res.code != 201:
            reason = "Unable to create Marathon app: {} {}: {} {}".format(
                self.getApplicationId(), res.code, res_json['message'],
                res_json)
            raise LatentWorkerFailedToSubstantiate(reason)

        self.instance = res_json
        defer.returnValue(True)
Example #6
0
 def _submit_request(self):
     """Compute a spot bid from recent price history and request an instance.

     The bid is the average of the spot prices returned for the last 24
     hours for ``self.instance_type``, times ``self.price_multiplier``;
     0.02 is used when no matching history entry exists.

     Returns ``(instance_id, image_id, start_time, True)`` on success, or
     ``(request, None, None, False)`` when ``_wait_for_request`` reports
     failure.

     Raises LatentWorkerFailedToSubstantiate when the computed bid exceeds
     ``self.max_spot_price``.
     """
     one_day_ago = time.gmtime(int(time.time() - 86400))
     history_start = time.strftime('%Y-%m-%dT%H:%M:%SZ', one_day_ago)
     history = self.ec2.meta.client.describe_spot_price_history(
         StartTime=history_start,
         ProductDescriptions=[self.product_description],
         AvailabilityZone=self.placement)

     # Accumulated manually so the float result matches plain left-to-right
     # addition.
     total = 0.0
     samples = 0
     for entry in history['SpotPriceHistory']:
         if entry['InstanceType'] != self.instance_type:
             continue
         total += float(entry['SpotPrice'])
         samples += 1

     if samples:
         self.current_spot_price = (total / samples) * self.price_multiplier
     else:
         self.current_spot_price = 0.02

     who = (self.__class__.__name__, self.workername)
     if self.current_spot_price > self.max_spot_price:
         log.msg('%s %s calculated spot price %0.3f exceeds '
                 'configured maximum of %0.3f' %
                 (who + (self.current_spot_price, self.max_spot_price)))
         raise LatentWorkerFailedToSubstantiate()

     if self.retry > 1:
         log.msg(
             '%s %s requesting spot instance with price %0.4f, attempt %d of %d'
             % (who + (self.current_spot_price, self.attempt, self.retry)))
     else:
         log.msg('%s %s requesting spot instance with price %0.4f' %
                 (who + (self.current_spot_price,)))

     placement = self._remove_none_opts(AvailabilityZone=self.placement)
     instance_profile = self._remove_none_opts(
         Name=self.instance_profile_name)
     launch_spec = self._remove_none_opts(
         ImageId=self.ami,
         KeyName=self.keypair_name,
         SecurityGroups=self.classic_security_groups,
         UserData=self.user_data,
         InstanceType=self.instance_type,
         Placement=placement,
         SubnetId=self.subnet_id,
         SecurityGroupIds=self.security_group_ids,
         BlockDeviceMappings=self.block_device_map,
         IamInstanceProfile=instance_profile)
     reservations = self.ec2.meta.client.request_spot_instances(
         SpotPrice=str(self.current_spot_price),
         LaunchSpecification=launch_spec)

     request, success = self._wait_for_request(
         reservations['SpotInstanceRequests'][0])
     if not success:
         return request, None, None, False

     instance_id = request['InstanceId']
     self.instance = self.ec2.Instance(instance_id)
     image = self.get_image()
     instance_id, start_time = self._wait_for_instance()
     return instance_id, image.id, start_time, True
Example #7
0
 def start_instance_result(result):
     """Pass a truthy start result through; report a failure otherwise.

     A falsy ``result`` means preparation failed: the substantiation
     notifier is told so and ``None`` is returned.
     """
     if result:
         return result
     # If we don't report success, then preparation failed.
     reason = "Worker does not want to substantiate at this time"
     error = LatentWorkerFailedToSubstantiate(self.name, reason)
     self._substantiation_notifier.notify(error)
     return None
Example #8
0
    def _prepare_base_image(self):
        """
        I am a private method for creating (possibly cheap) copies of a
        base_image for start_instance to boot.

        Nothing is done when ``self.base_image`` is not set. With
        ``self.cheap_copy`` the copy is created with ``qemu-img create -b``
        (qcow2 format); otherwise the base image is copied with ``cp``.

        Raises LatentWorkerFailedToSubstantiate when the clone command exits
        with a non-zero return code. Any failure is logged and re-raised.
        """
        if not self.base_image:
            return

        if self.cheap_copy:
            clone_cmd = [
                'qemu-img', 'create', '-b', self.base_image, '-f', 'qcow2',
                self.image
            ]
        else:
            clone_cmd = ['cp', self.base_image, self.image]

        # The original message ended with a stray, unbalanced apostrophe.
        log.msg(f"Cloning base image: {clone_cmd}")

        try:
            rc = yield runprocess.run_process(self.master.reactor,
                                              clone_cmd,
                                              collect_stdout=False,
                                              collect_stderr=False)
            if rc != 0:
                raise LatentWorkerFailedToSubstantiate(
                    f'Failed to clone image (rc={rc})')
        except Exception as e:
            # Log, then let the caller deal with the failure.
            log.err(f"Cloning failed: {e}")
            raise
Example #9
0
    def _substantiate(self, build):
        """Start the instance for ``build`` and handle the outcome.

        Generator-style coroutine. Any exception, including the one raised
        for a falsy ``start_instance`` result, stops the missing timer and
        is forwarded to ``_substantiation_failed`` as a ``Failure``.
        """
        try:
            # if build_wait_timeout is negative we don't ever disconnect the
            # worker ourselves, so we don't need to wait for it to attach
            # to declare it as substantiated.
            skip_attach_wait = (self.build_wait_timeout < 0
                                and self.conn is not None)

            started = yield self.start_instance(build)

            if not started:
                # this behaviour is kept as compatibility, but it is better
                # to just errback with a workable reason
                reason = "Worker does not want to substantiate at this time"
                raise LatentWorkerFailedToSubstantiate(self.name, reason)

            # State and connection are re-read here, after start_instance
            # has finished.
            still_attached = (skip_attach_wait
                              and self.state == States.SUBSTANTIATING
                              and self.conn is not None)
            if still_attached:
                log.msg(r"Worker %s substantiated (already attached)" %
                        (self.name, ))
                self.state = States.SUBSTANTIATED
                self._fireSubstantiationNotifier(True)

        except Exception as err:
            self.stopMissingTimer()
            self._substantiation_failed(failure.Failure(err))
Example #10
0
    def _thd_start_instance(self, image, size):
        """Clean up, create and start a container, timing each phase.

        Returns ``[container id, image]``. Raises
        LatentWorkerFailedToSubstantiate when the created container has no
        ``Id``.
        """
        began = time.time()
        self._thd_cleanup_instance()
        cleaned = time.time()

        container = self.client.create_container(
            image,
            environment=self.createEnvironment(),
            labels={'sh_hyper_instancetype': size},
            name=self.getContainerName())
        created = time.time()

        if container.get('Id') is None:
            raise LatentWorkerFailedToSubstantiate('Failed to start container')

        container['image'] = image
        self.instance = container
        self.client.start(container)
        running = time.time()

        # Per-phase durations are attached to the log event; only the total
        # appears in the message text.
        timings = {
            'clean_time': cleaned - began,
            'create_time': created - cleaned,
            'start_time': running - created,
            'total_time': running - began,
        }
        log.debug(
            '{name}:{containerid}: Container started in {total_time:.2f}',
            name=self.name,
            containerid=self.shortid,
            **timings)
        return [container['Id'], image]
Example #11
0
 def _request_spot_instance(self):
     """Request a spot instance and wait until it is running.

     Without a price multiplier the bid is ``self.max_spot_price``.
     Otherwise the bid comes from ``_bid_price_from_spot_price_history``
     and is capped at ``self.max_spot_price`` when that is set.

     Returns ``(instance_id, image_id, start_time)``. Raises
     LatentWorkerFailedToSubstantiate when ``_wait_for_request`` reports
     failure.
     """
     if self.price_multiplier is None:
         bid_price = self.max_spot_price
     else:
         bid_price = self._bid_price_from_spot_price_history()
         ceiling = self.max_spot_price
         if ceiling is not None and bid_price > ceiling:
             bid_price = ceiling

     log.msg('%s %s requesting spot instance with price %0.4f' %
             (self.__class__.__name__, self.workername, bid_price))

     placement = self._remove_none_opts(AvailabilityZone=self.placement)
     instance_profile = self._remove_none_opts(
         Name=self.instance_profile_name)
     launch_spec = self._remove_none_opts(
         ImageId=self.ami,
         KeyName=self.keypair_name,
         SecurityGroups=self.classic_security_groups,
         UserData=self.user_data,
         InstanceType=self.instance_type,
         Placement=placement,
         SubnetId=self.subnet_id,
         SecurityGroupIds=self.security_group_ids,
         BlockDeviceMappings=self.block_device_map,
         IamInstanceProfile=instance_profile)
     reservations = self.ec2.meta.client.request_spot_instances(
         SpotPrice=str(bid_price),
         LaunchSpecification=launch_spec)

     request, success = self._wait_for_request(
         reservations['SpotInstanceRequests'][0])
     if not success:
         raise LatentWorkerFailedToSubstantiate()

     self.instance = self.ec2.Instance(request['InstanceId'])
     image = self.get_image()
     instance_id, start_time = self._wait_for_instance()
     return instance_id, image.id, start_time
Example #12
0
 def _request_spot_instance(self):
     if self.retry > 1:
         for attempt in range(1, self.retry + 1):
             self.attempt = attempt
             instance_id, image_id, start_time, success = self._submit_request()
             if success:
                 break
             if attempt >= self.retry:
                 self.attempt = 0
                 log.msg('%s %s failed to substantiate after %d requests' %
                         (self.__class__.__name__, self.workername, self.retry))
                 raise LatentWorkerFailedToSubstantiate()
     else:
         instance_id, image_id, start_time, success = self._submit_request()
         if not success:
             raise LatentWorkerFailedToSubstantiate()
     return instance_id, image_id, start_time
Example #13
0
 def start_instance(self, build):
     """Stop any previous instance, then create the pod for ``build``.

     Generator-style coroutine (uses ``yield`` / ``defer.returnValue``).
     The pod spec is rendered with ``renderWorkerPropsOnStart``. Returns
     ``True`` once the pod has been created. A ``KubeError`` is converted
     into LatentWorkerFailedToSubstantiate carrying its string form.
     """
     yield self.stop_instance(reportFailure=False)
     pod_spec = yield self.renderWorkerPropsOnStart(build)
     try:
         yield self._kube.createPod(self.namespace, pod_spec)
     except kubeclientservice.KubeError as kube_error:
         reason = str(kube_error)
         raise LatentWorkerFailedToSubstantiate(reason)
     else:
         defer.returnValue(True)
Example #14
0
 def _submit_request(self):
     """Compute a spot bid from recent price history and request an instance.

     The bid is the average of the spot prices returned for the last 24
     hours for ``self.instance_type``, times ``self.price_multiplier``;
     0.02 is used when no matching history entry exists.

     Returns ``(instance_id, image_id, start_time, True)`` on success, or
     ``(request, None, None, False)`` when ``_wait_for_request`` reports
     failure.

     Raises LatentWorkerFailedToSubstantiate when the computed bid exceeds
     ``self.max_spot_price``.
     """
     one_day_ago = time.gmtime(int(time.time() - 86400))
     history_start = time.strftime('%Y-%m-%dT%H:%M:%SZ', one_day_ago)
     history = self.ec2_conn.get_spot_price_history(
         start_time=history_start,
         product_description=self.product_description,
         availability_zone=self.placement)

     # Accumulated manually so the float result matches plain left-to-right
     # addition.
     total = 0.0
     samples = 0
     for entry in history:
         if entry.instance_type != self.instance_type:
             continue
         total += entry.price
         samples += 1

     if samples:
         self.current_spot_price = (total / samples) * self.price_multiplier
     else:
         self.current_spot_price = 0.02

     who = (self.__class__.__name__, self.workername)
     if self.current_spot_price > self.max_spot_price:
         log.msg('%s %s calculated spot price %0.3f exceeds '
                 'configured maximum of %0.3f' %
                 (who + (self.current_spot_price, self.max_spot_price)))
         raise LatentWorkerFailedToSubstantiate()

     if self.retry > 1:
         log.msg(
             '%s %s requesting spot instance with price %0.4f, attempt %d of %d'
             % (who + (self.current_spot_price, self.attempt, self.retry)))
     else:
         log.msg('%s %s requesting spot instance with price %0.4f' %
                 (who + (self.current_spot_price,)))

     reservations = self.ec2_conn.request_spot_instances(
         self.current_spot_price,
         self.ami,
         key_name=self.keypair_name,
         security_groups=[self.classic_security_groups],
         instance_type=self.instance_type,
         user_data=self.user_data,
         placement=self.placement,
         subnet_id=self.subnet_id,
         security_group_ids=self.security_group_ids,
         instance_profile_name=self.instance_profile_name,
         block_device_map=self.block_device_map)

     request, success = self._wait_for_request(reservations[0])
     if not success:
         return request, None, None, False

     # Look the running instance up from the fulfilled request.
     found = self.ec2_conn.get_all_instances(
         instance_ids=[request.instance_id])
     self.instance = found[0].instances[0]
     instance_id, image_id, start_time = self._wait_for_instance(
         self.get_image())
     return instance_id, image_id, start_time, True
Example #15
0
 def start_instance(self, build):
     """Stop any previous instance, then create the worker pod.

     Generator-style coroutine (uses ``yield`` / ``defer.returnValue``).
     The pod spec is the default spec merged with ``self.kube_extra_spec``.
     Returns ``True`` once the pod has been created. A ``KubeError`` is
     converted into LatentWorkerFailedToSubstantiate carrying its string
     form.
     """
     yield self.stop_instance(reportFailure=False)
     base_spec = self.default_pod_spec()
     pod_spec = self.merge_spec(base_spec, self.kube_extra_spec)
     try:
         yield self._kube.createPod(self.namespace, pod_spec)
     except kubeclientservice.KubeError as kube_error:
         reason = str(kube_error)
         raise LatentWorkerFailedToSubstantiate(reason)
     else:
         defer.returnValue(True)
Example #16
0
 def start_instance_result(result):
     """Pass a truthy start result through; turn a falsy one into a Failure.

     A falsy ``result`` means preparation failed; the returned ``Failure``
     lets the errback chain handle it.
     """
     if result:
         return result
     # this behaviour is kept as compatibility, but it is better
     # to just errback with a workable reason
     reason = "Worker does not want to substantiate at this time"
     error = LatentWorkerFailedToSubstantiate(self.name, reason)
     return failure.Failure(error)
 def start_instance(self, build):
     """Stop any previous instance, then create the pod for ``build``.

     Generator-style coroutine (uses ``yield`` / ``defer.returnValue``).
     The pod spec comes from ``self.get_pod_spec(build)``. Returns ``True``
     once the pod has been created. A ``KubeError`` is converted into
     LatentWorkerFailedToSubstantiate carrying its string form.
     """
     # Leftover debugging output (a marker line and a dump of the pod spec
     # via pprint) used to be printed here; it has been removed.
     yield self.stop_instance(reportFailure=False)
     pod_spec = self.get_pod_spec(build)
     try:
         yield self._kube.createPod(self.namespace, pod_spec)
     except kubeclientservice.KubeError as e:
         raise LatentWorkerFailedToSubstantiate(str(e))
     defer.returnValue(True)
Example #18
0
 def _wait_for_request(self, reservation):
     """Poll a spot instance request until it leaves the pending states.

     The request is looked up immediately and then again every
     ``self._poll_resolution`` seconds. Returns ``(request, True)`` once
     the status code equals ``FULFILLED``. For any other final status the
     spot request is cancelled and LatentWorkerFailedToSubstantiate is
     raised with the request id and the status code.
     """
     waited = 0
     step = self._poll_resolution

     while True:
         response = self.ec2.meta.client.describe_spot_instance_requests(
             SpotInstanceRequestIds=[reservation['SpotInstanceRequestId']])
         request = response['SpotInstanceRequests'][0]
         request_status = request['Status']['Code']
         if request_status not in SPOT_REQUEST_PENDING_STATES:
             break

         time.sleep(step)
         waited += step
         # Progress message once per elapsed whole minute.
         if waited % 60 == 0:
             log.msg(
                 '{} {} has waited {} minutes for spot request {}'.format(
                     self.__class__.__name__, self.workername,
                     waited // 60, request['SpotInstanceRequestId']))

     spot_id = request['SpotInstanceRequestId']

     if request_status == FULFILLED:
         minutes, seconds = divmod(waited, 60)
         log.msg(
             '{} {} spot request {} fulfilled in about {} minutes {} seconds'
             .format(self.__class__.__name__, self.workername,
                     spot_id, minutes, seconds))
         return request, True

     # Every remaining final state is a failure: cancel and raise.
     if request_status == PRICE_TOO_LOW:
         self.ec2.meta.client.cancel_spot_instance_requests(
             SpotInstanceRequestIds=[spot_id])
         log.msg('{} {} spot request rejected, spot price too low'.format(
             self.__class__.__name__, self.workername))
     else:
         log.msg('{} {} failed to fulfill spot request {} with status {}'.
                 format(self.__class__.__name__, self.workername,
                        spot_id, request_status))
         # try to cancel, just for good measure
         self.ec2.meta.client.cancel_spot_instance_requests(
             SpotInstanceRequestIds=[spot_id])
     raise LatentWorkerFailedToSubstantiate(spot_id, request_status)
Example #19
0
    def start_instance(self, build):
        """
        I start a new instance of a VM.

        If a base_image is specified, I will make a clone of that, otherwise
        I will use image directly.

        If I'm not given libvirt domain definition XML, I will look for my
        name in the list of defined virtual machines and start that.

        Returns True once the domain has been created. Raises
        LatentWorkerFailedToSubstantiate when the domain ID cannot be
        retrieved, when the domain is already active (domain ID other than
        -1), or when starting the VM fails.
        """

        try:
            domain_id = yield self._get_domain_id()
        except Exception as e:
            raise LatentWorkerFailedToSubstantiate(
                '{}: Got error while retrieving domain ID: {}'.format(
                    self, e)) from e

        # Checked outside the try block above: raised inside it, this error
        # was caught by the generic handler and re-wrapped as a misleading
        # "error while retrieving domain ID" failure.
        if domain_id != -1:
            raise LatentWorkerFailedToSubstantiate(
                "{}: Cannot start_instance as it's already active".format(
                    self))

        yield self._prepare_base_image()

        try:
            if self.xml:
                yield self._pool_do(lambda conn: conn.createXML(self.xml, 0))
            else:
                domain = yield self._get_domain()
                # Metadata is set on the domain before it is started.
                yield self._pool_do(lambda conn: domain.setMetadata(
                    libvirt.VIR_DOMAIN_METADATA_ELEMENT,
                    self.metadata.format(self.workername, self.password, self.
                                         masterFQDN), self.metakey, self.ns,
                    libvirt.VIR_DOMAIN_AFFECT_CONFIG))

                yield self._pool_do(lambda conn: domain.create())

        except Exception as e:
            raise LatentWorkerFailedToSubstantiate(
                '{}: Got error while starting VM: {}'.format(self, e)) from e

        return True
Example #20
0
    def _substantiate(self, build):
        """Start the instance for ``build`` and report failures.

        Generator-style coroutine. Any exception, including the one raised
        for a falsy ``start_instance`` result, stops the missing timer and
        is forwarded to ``_substantiation_failed`` as a ``Failure``.
        """
        try:
            if not (yield self.start_instance(build)):
                # this behaviour is kept as compatibility, but it is better
                # to just errback with a workable reason
                reason = "Worker does not want to substantiate at this time"
                raise LatentWorkerFailedToSubstantiate(self.name, reason)
        except Exception as err:
            self.stopMissingTimer()
            self._substantiation_failed(failure.Failure(err))
Example #21
0
    def _thd_start_pod(self, pod, wait_for_completion=False):
        """Start the pod resource provided as a dictionary.

        This method blocks while the pod phase is unset, 'Pending' or
        'Unknown' (and also while it is 'Running' when
        ``wait_for_completion`` is true), re-reading the pod status every
        ``self._poll_resolution`` seconds.

        Returns the pod name. With ``wait_for_completion`` the pod is
        deleted before returning.
        """
        pod_name = pod.get('metadata', {}).get('name', 'no_name')
        pod_yaml = yaml.safe_dump(pod, default_flow_style=False)
        self.logger.debug(
            'Starting pod %r with config:\n%s' % (pod_name, pod_yaml))
        try:
            instance = client.CoreV1Api().create_namespaced_pod(
                self.namespace, pod)
        except ApiException as ex:
            raise LatentWorkerCannotSubstantiate(
                'Failed to create pod %s: %s' % (pod_name, ex.reason))

        unsettled = [None, 'Pending', 'Unknown']
        if wait_for_completion:
            unsettled += ['Running']

        duration = 0
        while instance.status.phase in unsettled:
            sleep(self._poll_resolution)
            duration += self._poll_resolution
            try:
                instance = client.CoreV1Api().read_namespaced_pod_status(
                    instance.metadata.name, self.namespace)
            except ApiException as ex:
                if not wait_for_completion:
                    raise LatentWorkerFailedToSubstantiate(
                        'Pod %s went missing: %s' %
                        (instance.metadata.name, ex.reason))
                # pod may have completed
                break

        # Ensure the pod is running or has run successfully
        if instance.status.phase in [None, 'Pending', 'Failed', 'Unknown']:
            try:
                # NOTE(review): the %(pod)s/%(phase)s template is passed
                # uninterpolated together with `instance`; presumably the
                # exception class formats it — confirm.
                raise KubePodWorkerCannotSubstantiate(
                    'Creating Pod %(pod)s failed (%(phase)s)', instance)
            finally:
                # The failed pod is removed even though the error propagates.
                self.delete_pod(instance.metadata.name)

        if wait_for_completion:
            self.delete_pod(instance.metadata.name)

        return instance.metadata.name
Example #22
0
    def _thd_start_instance(self, image):
        """Create and start a container for this worker from ``image``.

        The container name is the worker name followed by ``id(self)``, with
        underscores replaced by dashes. Returns ``[container id, image]``.
        Raises LatentWorkerFailedToSubstantiate when the created container
        has no ``Id``.
        """
        container_name = '%s%s' % (self.workername, id(self))
        container_name = container_name.replace("_", "-")
        container = self.client.create_container(
            image,
            environment=self.createEnvironment(),
            labels={'sh_hyper_instancetype': self.size},
            name=container_name)

        container_id = container.get('Id')
        if container_id is None:
            raise LatentWorkerFailedToSubstantiate('Failed to start container')

        log.msg('Container created, Id: %s...' % (container_id[:6], ))
        container['image'] = image
        self.instance = container
        self.client.start(container)
        return [container_id, image]
Example #23
0
 def _start_instance(self):
     """Boot a Nova server for this worker and wait for it to leave BUILD.

     A client is created from the configured credentials, the image is
     resolved with ``_getImage``, and the server is polled every
     ``self._poll_resolution`` seconds while its status starts with
     ``BUILD``.

     Returns ``[instance id, image_uuid, 'HH:MM:SS']`` when the final status
     equals ``ACTIVE``; otherwise the instance id and final status are
     handed to ``self.failed_to_start``.

     Raises LatentWorkerFailedToSubstantiate when the server goes missing
     while being polled.
     """
     # Authenticate to OpenStack.
     os_client = client.Client(self.client_version, self.os_username,
                               self.os_password, self.os_tenant_name,
                               self.os_auth_url)
     image_uuid = self._getImage(os_client, self.image)
     boot_args = [self.workername, image_uuid, self.flavor]
     boot_kwargs = dict(meta=self.meta,
                        block_device_mapping_v2=self.block_devices,
                        **self.nova_args)
     created = os_client.servers.create(*boot_args, **boot_kwargs)
     self.instance = created

     who = (self.__class__.__name__, self.workername)
     log.msg('%s %s starting instance %s (image %s)' %
             (who + (created.id, image_uuid)))

     waited = 0
     step = self._poll_resolution
     # `created` is the object returned by create(); `current` is the most
     # recently fetched copy carrying the up-to-date status.
     current = created
     while current.status.startswith(BUILD):
         time.sleep(step)
         waited += step
         if waited % 60 == 0:
             log.msg('%s %s has waited %d minutes for instance %s' %
                     (who + (waited // 60, created.id)))
         try:
             current = os_client.servers.get(created.id)
         except nce.NotFound:
             log.msg('%s %s instance %s (%s) went missing' %
                     (who + (created.id, created.name)))
             raise LatentWorkerFailedToSubstantiate(created.id,
                                                    created.status)

     if current.status == ACTIVE:
         minutes, seconds = divmod(waited, 60)
         log.msg('%s %s instance %s (%s) started '
                 'in about %d minutes %d seconds' %
                 (who + (created.id, created.name, minutes, seconds)))
         hours, minutes_left = divmod(minutes, 60)
         return [
             created.id, image_uuid,
             '%02d:%02d:%02d' % (hours, minutes_left, seconds)
         ]
     else:
         self.failed_to_start(created.id, current.status)
Example #24
0
    def _thd_start_instance(self, namespace, job):
        """Create ``job`` in ``namespace`` through the batch API.

        Returns ``[job name, image of the job's first container]``.  Raises
        LatentWorkerFailedToSubstantiate when the create call returns None.
        """
        self.load_config(self.kubeConfig)
        # TODO: decide whether old instances should be cleaned up here.
        created = client.BatchV1Api().create_namespaced_job(namespace, job)

        if created is None:
            log.msg('Failed to create the container')
            raise LatentWorkerFailedToSubstantiate(
                'Failed to start container'
            )

        # pylint: disable=no-member
        name = created.metadata.name
        log.msg('Job created, Id: %s...' % name)
        self.instance = created

        first_container = created.spec.template.spec.containers[0]
        return [name, first_container.image]
Example #25
0
    def _substantiate(self, build):
        """Drive the substantiation of this worker for ``build``.

        This is a generator: it yields the results of
        ``self.machine.substantiate``, the start/stop lock acquisition and
        ``self.start_instance``.  NOTE(review): presumably it is wrapped by
        a coroutine decorator that is outside this view -- confirm.

        Must be entered with ``self.state == States.SUBSTANTIATING``.  Any
        exception raised along the way is not propagated; it is wrapped in
        a ``failure.Failure`` and passed to ``_substantiation_failed``
        after the missing timer is stopped.
        """
        assert self.state == States.SUBSTANTIATING
        try:
            # if build_wait_timeout is negative we don't ever disconnect the
            # worker ourselves, so we don't need to wait for it to attach
            # to declare it as substantiated.
            dont_wait_to_attach = \
                self.build_wait_timeout < 0 and self.conn is not None

            # When the machine provides ILatentMachine it is substantiated
            # first, and its result decides whether the instance is started.
            start_success = True
            if ILatentMachine.providedBy(self.machine):
                start_success = yield self.machine.substantiate(self)

            # NOTE(review): the finally clause releases the lock even when
            # the log call or the acquire itself raised -- confirm that the
            # lock tolerates a release without a successful acquire.
            try:
                self._log_start_stop_locked('substantiating')
                yield self._start_stop_lock.acquire()

                if start_success:
                    self.state = States.SUBSTANTIATING_STARTING
                    start_success = yield self.start_instance(build)
            finally:
                self._start_stop_lock.release()

            if not start_success:
                # this behaviour is kept as compatibility, but it is better
                # to just errback with a workable reason
                msg = "Worker does not want to substantiate at this time"
                raise LatentWorkerFailedToSubstantiate(self.name, msg)

            # The worker is declared substantiated right away only if a
            # connection existed before the start, still exists now, and the
            # state has not moved on from SUBSTANTIATING_STARTING meanwhile.
            if dont_wait_to_attach and \
                    self.state == States.SUBSTANTIATING_STARTING and \
                    self.conn is not None:
                log.msg(r"Worker {} substantiated (already attached)".format(
                    self.name))
                self.state = States.SUBSTANTIATED
                self._fireSubstantiationNotifier(True)

        except Exception as e:
            # Every failure above (including the explicit raise) ends up
            # here and is reported through _substantiation_failed.
            self.stopMissingTimer()
            self._substantiation_failed(failure.Failure(e))
Example #26
0
 def _wait_for_request(self, reservation):
     """Poll a spot instance request until it leaves the pending states.

     ``reservation`` is a mapping holding the 'SpotInstanceRequestId' of
     the request to watch.

     Returns ``(request, True)`` when the request is fulfilled, and
     ``(request, False)`` when it was rejected because the price was too
     low; in that case the request is cancelled and
     ``self.current_spot_price`` is multiplied by
     ``self.retry_price_adjustment``.

     Raises LatentWorkerFailedToSubstantiate (request id, status code) for
     every other final status.
     """
     duration = 0
     interval = self._poll_resolution
     requests = self.ec2.meta.client.describe_spot_instance_requests(
         SpotInstanceRequestIds=[reservation['SpotInstanceRequestId']])
     request = requests['SpotInstanceRequests'][0]
     request_status = request['Status']['Code']
     while request_status in SPOT_REQUEST_PENDING_STATES:
         time.sleep(interval)
         duration += interval
         # Report progress whenever the waited time hits a whole minute.
         if duration % 60 == 0:
             log.msg('%s %s has waited %d minutes for spot request %s' %
                     (self.__class__.__name__, self.workername,
                      duration // 60, request['SpotInstanceRequestId']))
         requests = self.ec2.meta.client.describe_spot_instance_requests(
             SpotInstanceRequestIds=[reservation['SpotInstanceRequestId']])
         request = requests['SpotInstanceRequests'][0]
         request_status = request['Status']['Code']
     if request_status == FULFILLED:
         minutes = duration // 60
         seconds = duration % 60
         log.msg('%s %s spot request %s fulfilled '
                 'in about %d minutes %d seconds' %
                 (self.__class__.__name__, self.workername,
                  request['SpotInstanceRequestId'], minutes, seconds))
         return request, True
     elif request_status == PRICE_TOO_LOW:
         self.ec2.meta.client.cancel_spot_instance_requests(
             SpotInstanceRequestIds=[request['SpotInstanceRequestId']])
         log.msg('%s %s spot request rejected, spot price too low' %
                 (self.__class__.__name__, self.workername))
         # Adjust the price used for the next attempt.
         self.current_spot_price *= self.retry_price_adjustment
         return request, False
     else:
         log.msg('%s %s failed to fulfill spot request %s with status %s' %
                 (self.__class__.__name__, self.workername,
                  request['SpotInstanceRequestId'], request_status))
         # ``request`` is a plain mapping, so it has no ``status``
         # attribute; report the status code extracted above instead.
         raise LatentWorkerFailedToSubstantiate(
             request['SpotInstanceRequestId'], request_status)
Example #27
0
    def _thd_start_instance(self, image, dockerfile, volumes):
        """Remove stale containers, ensure the image exists, start a container.

        When ``image`` is None a name is generated from the worker name and
        ``id(self)``.  If the image is not known to exist and a
        ``dockerfile`` is given, the image is built from it first.

        Returns ``[container id, image]``.  Raises
        LatentWorkerFailedToSubstantiate when the image is still missing
        after the optional build, or when the created container has no 'Id'.
        """
        docker_client = self._getDockerClient()

        # Drop whatever containers the name filter reports for this worker.
        leftovers = docker_client.containers(
            all=1, filters={'name': self.getContainerName()})
        for leftover in leftovers:
            try:
                docker_client.remove_container(
                    leftover['Id'], v=True, force=True)
            except NotFound:
                # It disappeared between listing and removal (race).
                continue

        if image is None:
            image_known = False
            image = '%s_%s_image' % (self.workername, id(self))
        else:
            image_known = self._image_exists(docker_client, image)

        if dockerfile is not None and not image_known:
            log.msg("Image '%s' not found, building it from scratch" % image)
            build_output = docker_client.build(
                fileobj=BytesIO(dockerfile.encode('utf-8')), tag=image)
            for chunk in build_output:
                for message in _handle_stream_line(chunk):
                    log.msg(message)

        if not self._image_exists(docker_client, image):
            log.msg("Image '%s' not found" % image)
            raise LatentWorkerFailedToSubstantiate(
                'Image "%s" not found on docker host.' % image)

        volumes, binds = self._thd_parse_volumes(volumes)
        # Work on a copy so the worker-level host configuration is untouched.
        host_settings = self.hostconfig.copy()
        host_settings['binds'] = binds
        host_config = docker_client.create_host_config(**host_settings)

        container = docker_client.create_container(
            image,
            self.command,
            name=self.getContainerName(),
            volumes=volumes,
            environment=self.createEnvironment(),
            host_config=host_config)

        if container.get('Id') is None:
            log.msg('Failed to create the container')
            raise LatentWorkerFailedToSubstantiate('Failed to start container')

        short_id = container['Id'][:6]
        log.msg('Container created, Id: %s...' % (short_id, ))
        container['image'] = image
        self.instance = container
        docker_client.start(container)
        log.msg('Container started')

        if self.followStartupLogs:
            # Relay container output until a connection shows up on self.conn.
            stream = docker_client.attach(container=container,
                                          stdout=True,
                                          stderr=True,
                                          stream=True)
            for entry in stream:
                log.msg("docker VM %s: %s" % (short_id, entry.strip()))
                if self.conn:
                    break
            del stream

        return [container['Id'], image]
Example #28
0
    def _thd_start_instance(self, image, dockerfile, volumes, custom_context,
                            encoding, buildargs):
        """Remove stale containers, make the image available, start a container.

        When ``image`` is None a name is generated from the worker name and
        ``id(self)``.  If the image is not known to exist and a
        ``dockerfile`` is given, the image is built: with ``custom_context``
        set, ``dockerfile`` is opened as a file path and passed as the build
        context; otherwise its text is used as the Dockerfile content.
        Afterwards the image is pulled when ``self.autopull`` is set and the
        image is missing or ``self.alwaysPull`` is set.

        Returns ``[container id, image]``.  Raises
        LatentWorkerCannotSubstantiate when the image is still missing, and
        LatentWorkerFailedToSubstantiate when the created container has no
        'Id'.
        """
        docker_client = self._getDockerClient()
        container_name = self.getContainerName()

        # The name filter may report more than exact matches, so each
        # candidate is checked against the slash-prefixed name before it is
        # removed.
        leftovers = docker_client.containers(
            all=1, filters={'name': container_name})
        qualified_name = "/{0}".format(container_name)
        for leftover in leftovers:
            if qualified_name in leftover['Names']:
                try:
                    docker_client.remove_container(
                        leftover['Id'], v=True, force=True)
                except NotFound:
                    # It disappeared between listing and removal (race).
                    pass

        if image is None:
            image_known = False
            image = '{}_{}_image'.format(self.workername, id(self))
        else:
            image_known = self._image_exists(docker_client, image)

        if dockerfile is not None and not image_known:
            log.msg(
                "Image '{}' not found, building it from scratch".format(image))
            if custom_context:
                with open(dockerfile, 'rb') as fin:
                    build_output = docker_client.build(
                        fileobj=fin,
                        custom_context=custom_context,
                        encoding=encoding,
                        tag=image,
                        buildargs=buildargs)
            else:
                build_output = docker_client.build(
                    fileobj=BytesIO(dockerfile.encode('utf-8')),
                    tag=image,
                )

            for chunk in build_output:
                for message in _handle_stream_line(chunk):
                    log.msg(message)

        image_missing = not self._image_exists(docker_client, image)
        if (image_missing or self.alwaysPull) and self.autopull:
            if image_missing:
                log.msg("Image '{}' not found, pulling from registry".format(
                    image))
            docker_client.pull(image)

        if not self._image_exists(docker_client, image):
            msg = 'Image "{}" not found on docker host.'.format(image)
            log.msg(msg)
            raise LatentWorkerCannotSubstantiate(msg)

        volumes, binds = self._thd_parse_volumes(volumes)
        # Work on a copy so the worker-level host configuration is untouched.
        host_settings = self.hostconfig.copy()
        host_settings['binds'] = binds
        if docker_py_version >= 2.2:
            host_settings['init'] = True
        host_config = docker_client.create_host_config(**host_settings)

        container = docker_client.create_container(
            image,
            self.command,
            name=self.getContainerName(),
            volumes=volumes,
            environment=self.createEnvironment(),
            host_config=host_config)

        if container.get('Id') is None:
            log.msg('Failed to create the container')
            raise LatentWorkerFailedToSubstantiate('Failed to start container')

        short_id = container['Id'][:6]
        log.msg('Container created, Id: {}...'.format(short_id))
        container['image'] = image
        self.instance = container
        docker_client.start(container)
        log.msg('Container started')

        if self.followStartupLogs:
            # Relay container output until a connection shows up on self.conn.
            stream = docker_client.attach(container=container,
                                          stdout=True,
                                          stderr=True,
                                          stream=True)
            for entry in stream:
                log.msg("docker VM {}: {}".format(short_id, entry.strip()))
                if self.conn:
                    break
            del stream

        return [container['Id'], image]
Example #29
0
 def failed_to_start(self, instance_id, instance_state):
     """Log that the instance failed to start, then raise.

     Always raises LatentWorkerFailedToSubstantiate carrying
     ``instance_id`` and ``instance_state``.
     """
     details = (self.__class__.__name__, self.workername,
                instance_id, instance_state)
     log.msg('%s %s failed to start instance %s (%s)' % details)
     raise LatentWorkerFailedToSubstantiate(instance_id, instance_state)
Example #30
0
    def _thd_start_instance(self, image, dockerfile, hostconfig, volumes):
        """Remove stale containers, make the image available, start a container.

        When ``image`` is None a name is generated from the worker name and
        ``id(self)``.  If the image is not known to exist and a
        ``dockerfile`` is given, the image is built from its text.
        Afterwards the image is pulled when ``self.autopull`` is set and the
        image is missing or ``self.alwaysPull`` is set.

        ``hostconfig`` is a mapping of host configuration options.  It is
        copied before ``binds`` (and ``init``) are added, so the caller's
        object is left unmodified.

        Returns ``[container id, image]``.  Raises
        LatentWorkerCannotSubstantiate when the image is still missing, and
        LatentWorkerFailedToSubstantiate when the created container has no
        'Id'.
        """
        # License note:
        #    copied from the original implementation with minor modification
        #    to pass runtime configuration to the containers
        with self.docker_client() as docker_client:
            container_name = self.getContainerName()
            # cleanup the old instances
            instances = docker_client.containers(
                all=1,
                filters=dict(name=container_name))
            # The name filter may report more than exact matches, so each
            # candidate is checked against the slash-prefixed name.
            container_name = '/{0}'.format(container_name)
            for instance in instances:
                if container_name not in instance['Names']:
                    continue
                try:
                    docker_client.remove_container(instance['Id'], v=True,
                                                   force=True)
                except docker.errors.NotFound:
                    pass  # that's a race condition

            found = False
            if image is not None:
                found = self._image_exists(docker_client, image)
            else:
                worker_id = id(self)
                worker_name = self.workername
                image = f'{worker_name}_{worker_id}_image'
            if (not found) and (dockerfile is not None):
                log.info(f'Image {image} not found, building it from scratch')
                for line in docker_client.build(
                    fileobj=BytesIO(dockerfile.encode('utf-8')),
                    tag=image
                ):
                    for streamline in _handle_stream_line(line):
                        log.info(streamline)

            imageExists = self._image_exists(docker_client, image)
            if ((not imageExists) or self.alwaysPull) and self.autopull:
                if (not imageExists):
                    log.info(f'Image {image} not found, pulling from registry')
                docker_client.pull(image)

            if (not self._image_exists(docker_client, image)):
                log.info(f'Image {image} not found')
                raise LatentWorkerCannotSubstantiate(
                    f'Image {image} not found on docker host.'
                )

            volumes, binds = self._thd_parse_volumes(volumes)

            # Mutate a shallow copy: writing into the caller's mapping would
            # leak this container's binds/init into whatever object the
            # caller keeps around.
            host_settings = dict(hostconfig)
            host_settings['binds'] = binds
            if docker_py_version >= 2.2:
                host_settings['init'] = True

            instance = docker_client.create_container(
                image,
                self.command,
                name=self.getContainerName(),
                volumes=volumes,
                environment=self.createEnvironment(),
                host_config=docker_client.create_host_config(
                    **host_settings
                )
            )

            if instance.get('Id') is None:
                log.info('Failed to create the container')
                raise LatentWorkerFailedToSubstantiate(
                    'Failed to start container'
                )
            shortid = instance['Id'][:6]
            log.info(f'Container created, Id: {shortid}...')

            instance['image'] = image
            self.instance = instance
            docker_client.start(instance)
            log.info('Container started')
            if self.followStartupLogs:
                # Relay container output until a connection shows up on
                # self.conn.
                logs = docker_client.attach(
                    container=instance, stdout=True, stderr=True, stream=True)
                for line in logs:
                    line = line.strip()
                    log.info(f'docker VM {shortid}: {line}')
                    if self.conn:
                        break
                del logs

        return [instance['Id'], image]