def set_spot_fleet_request_capacity(sfr_id, capacity, dry_run, region=None):
    """ AWS won't modify a request that is already modifying. This
    function ensures we wait a few seconds in case we've just modified
    a SFR"""
    ec2_client = boto3.client('ec2', region_name=region)
    with Timeout(seconds=AWS_SPOT_MODIFY_TIMEOUT):
        try:
            state = None
            while True:
                state = get_sfr(sfr_id, region=region)['SpotFleetRequestState']
                if state == 'active':
                    break
                if state == 'cancelled_running':
                    log.info("Not updating target capacity because this is a cancelled SFR, "
                             "we are just draining and killing the instances")
                    return
                log.debug("SFR {0} in state {1}, waiting for state: active".format(sfr_id, state))
                log.debug("Sleep 5 seconds")
                time.sleep(5)
        except TimeoutError:
            log.error("Spot fleet {0} not in active state so we can't modify it.".format(sfr_id))
            raise FailSetSpotCapacity
    if dry_run:
        return True
    try:
        ret = ec2_client.modify_spot_fleet_request(SpotFleetRequestId=sfr_id, TargetCapacity=capacity,
                                                   ExcessCapacityTerminationPolicy='noTermination')
    except ClientError as e:
        log.error("Error modifying spot fleet request: {0}".format(e))
        raise FailSetSpotCapacity
    return ret
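Every example on this page wraps its polling loop in a `Timeout` context manager that raises `TimeoutError` once the block has run for the given number of seconds. The helper comes from the surrounding codebase; below is a minimal SIGALRM-based sketch of what such a context manager could look like (the names and details here are an assumption, not the project's actual implementation):

import signal


class TimeoutError(Exception):
    # Sketch only: the real codebase defines its own timeout exception.
    pass


class Timeout:
    """Raise TimeoutError if the wrapped block runs longer than `seconds`."""

    def __init__(self, seconds=1, error_message='Timeout'):
        self.seconds = seconds
        self.error_message = error_message

    def _handle_timeout(self, signum, frame):
        raise TimeoutError(self.error_message)

    def __enter__(self):
        # Arrange for SIGALRM to fire after `seconds` of wall-clock time.
        signal.signal(signal.SIGALRM, self._handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, exc_type, exc_value, traceback):
        # Cancel any pending alarm, whether or not the block finished in time.
        signal.alarm(0)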
Example #2
def perform_http_healthcheck(url, timeout):
    """Returns true if healthcheck on url succeeds, false otherwise

    :param url: the healthcheck url
    :param timeout: timeout in seconds
    :returns: True if healthcheck succeeds within number of seconds specified by timeout, false otherwise
    """
    try:
        with Timeout(seconds=timeout):
            try:
                res = requests.get(url)
            except requests.ConnectionError:
                return (False, "http request failed: connection failed")
    except TimeoutError:
        return (False, "http request timed out after %d seconds" % timeout)

    if 'content-type' in res.headers and ',' in res.headers['content-type']:
        paasta_print(PaastaColors.yellow(
            "Multiple content-type headers detected in response."
            " The Mesos healthcheck system will treat this as a failure!"))
        return (False, "http request succeeded, code %d" % res.status_code)
    # check if response code is valid per https://mesosphere.github.io/marathon/docs/health-checks.html
    elif res.status_code >= 200 and res.status_code < 400:
        return (True, "http request succeeded, code %d" % res.status_code)
    else:
        return (False, "http request failed, code %s" % str(res.status_code))
Example #3
def perform_http_healthcheck(url, timeout):
    """Returns true if healthcheck on url succeeds, false otherwise

    :param url: the healthcheck url
    :param timeout: timeout in seconds
    :returns: True if healthcheck succeeds within number of seconds specified by timeout, false otherwise
    """
    try:
        with Timeout(seconds=timeout):
            try:
                res = requests.head(url)
            except requests.ConnectionError:
                return False
    except TimeoutError:
        return False

    if 'content-type' in res.headers and ',' in res.headers['content-type']:
        sys.stdout.write(
            PaastaColors.yellow(
                "Multiple content-type headers detected in response."
                " The Mesos healthcheck system will treat this as a failure!"))
        return False
    # check if response code is valid per https://mesosphere.github.io/marathon/docs/health-checks.html
    elif res.status_code >= 200 and res.status_code < 400:
        return True
    else:
        return False
Example #4
def wait_for_deployment(service, deploy_group, git_sha, soa_dir, timeout):
    cluster_map = get_cluster_instance_map_for_service(soa_dir, service,
                                                       deploy_group)
    if not cluster_map:
        line = "Couldn't find any instances for service {0} in deploy group {1}".format(
            service, deploy_group)
        _log(service=service, component='deploy', line=line, level='event')
        raise NoInstancesFound
    for cluster in cluster_map.values():
        cluster['deployed'] = 0
    try:
        with Timeout(seconds=timeout):
            total_instances = sum(
                [len(v["instances"]) for v in cluster_map.values()])
            with progressbar.ProgressBar(maxval=total_instances) as bar:
                while True:
                    for cluster, instances in cluster_map.items():
                        if cluster_map[cluster]['deployed'] != len(
                                cluster_map[cluster]['instances']):
                            cluster_map[cluster][
                                'deployed'] = instances_deployed(
                                    cluster=cluster,
                                    service=service,
                                    instances=instances['instances'],
                                    git_sha=git_sha)
                            if cluster_map[cluster]['deployed'] == len(
                                    cluster_map[cluster]['instances']):
                                instance_csv = ", ".join(
                                    cluster_map[cluster]['instances'])
                                print "Deploy to %s complete! (instances: %s)" % (
                                    cluster, instance_csv)
                        bar.update(
                            sum([v["deployed"] for v in cluster_map.values()]))
                    if all([
                            cluster['deployed'] == len(cluster["instances"])
                            for cluster in cluster_map.values()
                    ]):
                        break
                    else:
                        time.sleep(10)
    except TimeoutError:
        human_status = [
            "{0}: {1}".format(cluster, data['deployed'])
            for cluster, data in cluster_map.items()
        ]
        line = "\nCurrent deployment status of {0} per cluster:\n".format(
            deploy_group) + "\n".join(human_status)
        _log(service=service, component='deploy', line=line, level='event')
        line = "\n\nTimed out after {0} seconds, waiting for {1} in {2} to be deployed by PaaSTA. \n\n"\
               "This probably means the deploy hasn't suceeded. The new service might not be healthy or one "\
               "or more clusters could be having issues.\n\n"\
               "To debug: try running 'paasta status -s {2} -vv' or 'paasta logs -s {2}' to determine the cause.\n\n"\
               "{3} is still *marked* for deployment. To rollback, you can run: 'paasta rollback --service "\
               "{2} --deploy-group {1}'\n\n"\
               "If the service is known to be slow to start you may wish to increase "\
               "the timeout on this step.".format(timeout, deploy_group, service, git_sha)
        _log(service=service, component='deploy', line=line, level='event')
        raise
    return True
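A hypothetical caller of `wait_for_deployment` turns the exceptions it raises into an exit status; the service, deploy group, SHA, and soa_dir below are illustrative and not taken from the example above:

import sys

# Hypothetical usage; all argument values are illustrative.
try:
    wait_for_deployment(
        service='example_service',
        deploy_group='prod.everything',
        git_sha='abcdef123456',
        soa_dir='/path/to/soa-configs',
        timeout=600,
    )
except (NoInstancesFound, TimeoutError):
    # wait_for_deployment has already logged the details of the failure.
    sys.exit(1)
sys.exit(0)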
Example #5
    def wait_and_terminate(self, slave, drain_timeout, dry_run, region=None):
        """Waits for slave to be drained and then terminate

        :param slave: dict of slave to kill
        :param drain_timeout: how long to wait before terminating
            even if not drained
        :param region to connect to ec2
        :param dry_run: Don't drain or make changes to spot fleet if True"""
        ec2_client = boto3.client('ec2', region_name=region)
        try:
            # This loop should always finish because the maintenance window should make
            # is_safe_to_kill return True. Just in case, though, we set a timeout and terminate anyway
            with Timeout(seconds=drain_timeout + 300):
                while True:
                    instance_id = slave.instance_id
                    if not instance_id:
                        self.log.warning(
                            "Didn't find instance ID for slave: {}. Skipping terminating"
                            .format(slave.pid), )
                        continue
                    # Check if no tasks are running or we have reached the maintenance window
                    if is_safe_to_kill(slave.hostname) or dry_run:
                        self.log.info(
                            "TERMINATING: {} (Hostname = {}, IP = {})".format(
                                instance_id,
                                slave.hostname,
                                slave.ip,
                            ))
                        try:
                            ec2_client.terminate_instances(
                                InstanceIds=[instance_id], DryRun=dry_run)
                        except ClientError as e:
                            if e.response['Error'].get(
                                    'Code') == 'DryRunOperation':
                                pass
                            else:
                                raise
                        break
                    else:
                        self.log.info("Instance {}: NOT ready to kill".format(
                            instance_id))
                    self.log.debug("Waiting 5 seconds and then checking again")
                    time.sleep(5)
        except TimeoutError:
            self.log.error(
                "Timed out after {} waiting to drain {}, now terminating anyway"
                .format(
                    drain_timeout,
                    slave.pid,
                ))
            try:
                ec2_client.terminate_instances(InstanceIds=[instance_id],
                                               DryRun=dry_run)
            except ClientError as e:
                if e.response['Error'].get('Code') == 'DryRunOperation':
                    pass
                else:
                    raise
Example #6
def wait_and_terminate(slave, drain_timeout, dry_run, region=None):
    """Currently kills a slave, will wait for draining to complete soon

    :param slave: dict of slave to kill
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    ec2_client = boto3.client('ec2', region_name=region)
    try:
        # This loop should always finish because the maintenance window should make
        # is_safe_to_kill return True. Just in case, though, we set a timeout and terminate anyway
        with Timeout(seconds=drain_timeout + 300):
            while True:
                instance_id = slave['instance_id']
                if not instance_id:
                    log.warning("Didn't find instance ID for slave: {0}. Skipping terminating".format(slave['pid']))
                    continue
                # Check if no tasks are running or we have reached the maintenance window
                if is_safe_to_kill(slave['hostname']) or dry_run:
                    log.info("TERMINATING: {0} (Hostname = {1}, IP = {2})".format(
                        instance_id,
                        slave['hostname'],
                        slave['ip'],
                    ))
                    try:
                        ec2_client.terminate_instances(InstanceIds=[instance_id], DryRun=dry_run)
                    except ClientError as e:
                        if e.response['Error'].get('Code') == 'DryRunOperation':
                            pass
                        else:
                            raise
                    break
                else:
                    log.info("Instance {0}: NOT ready to kill".format(instance_id))
                log.debug("Waiting 5 seconds and then checking again")
                time.sleep(5)
    except TimeoutError:
        log.error("Timed out after {0} waiting to drain {1}, now terminating anyway".format(drain_timeout,
                                                                                            slave['pid']))
        try:
            ec2_client.terminate_instances(InstanceIds=instance_id, DryRun=dry_run)
        except ClientError as e:
            if e.response['Error'].get('Code') == 'DryRunOperation':
                pass
            else:
                raise
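Both terminate paths in the last two examples rely on boto3's dry-run convention: when DryRun=True and the caller would have been permitted to terminate, `terminate_instances` raises a `ClientError` whose error code is 'DryRunOperation', so that specific error is swallowed while anything else is re-raised. The pattern in isolation (the region and instance ID below are illustrative):

import boto3
from botocore.exceptions import ClientError

ec2_client = boto3.client('ec2', region_name='us-west-2')  # illustrative region
try:
    # With DryRun=True a permitted call still fails, but with code 'DryRunOperation'.
    ec2_client.terminate_instances(InstanceIds=['i-0123456789abcdef0'], DryRun=True)
except ClientError as e:
    if e.response['Error'].get('Code') == 'DryRunOperation':
        pass  # the real call would have succeeded; nothing was actually terminated
    else:
        raise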
Example #7
    def set_capacity(self, capacity):
        """AWS won't modify a request that is already being modified. This
        function polls until the request is back in the 'active' state in
        case we've just modified the SFR."""
        rounded_capacity = int(floor(capacity))
        ec2_client = boto3.client('ec2', region_name=self.resource['region'])
        with Timeout(seconds=AWS_SPOT_MODIFY_TIMEOUT):
            try:
                state = None
                while True:
                    state = self.get_sfr(
                        self.resource['id'],
                        region=self.resource['region'],
                    )['SpotFleetRequestState']
                    if state == 'active':
                        break
                    if state == 'cancelled_running':
                        self.log.info(
                            "Not updating target capacity because this is a cancelled SFR, "
                            "we are just draining and killing the instances")
                        return
                    self.log.debug(
                        "SFR {} in state {}, waiting for state: active".format(
                            self.resource['id'], state))
                    self.log.debug("Sleep 5 seconds")
                    time.sleep(5)
            except TimeoutError:
                self.log.error(
                    "Spot fleet {} not in active state so we can't modify it.".format(
                        self.resource['id']))
                raise FailSetResourceCapacity
        if self.dry_run:
            return True
        try:
            ret = ec2_client.modify_spot_fleet_request(
                SpotFleetRequestId=self.resource['id'],
                TargetCapacity=rounded_capacity,
                ExcessCapacityTerminationPolicy='noTermination',
            )
        except ClientError as e:
            self.log.error("Error modifying spot fleet request: {}".format(e))
            raise FailSetResourceCapacity
        self.capacity = capacity
        return ret
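A hypothetical caller treats `FailSetResourceCapacity` as the signal that the fleet could not be resized; `scaler` below is assumed to be an already-configured instance of the class above, with `resource`, `dry_run`, and `log` set up elsewhere:

# Hypothetical usage; `scaler` is an instance of the class above.
try:
    scaler.set_capacity(10.7)  # floor()ed to a TargetCapacity of 10 before the API call
except FailSetResourceCapacity:
    scaler.log.error("Giving up on resizing the spot fleet request")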