Example #1
def new_asg(cluster, ami_id):
    """
    Create a new ASG in the given Asgard cluster using the given AMI.

    Ensures that the new ASG has min and desired instance counts greater than 0.

    Arguments:
        cluster(str): Name of the cluster.
        ami_id(str): AWS AMI ID

    Returns:
        str: The name of the new ASG.

    Raises:
        TimeoutException: When the task to bring up the new ASG times out.
        BackendError: When the task to bring up the new ASG fails.
        ASGCountZeroException: When the new ASG brought online has 0 for its min and desired counts.
        RateLimitedException: When we are being rate limited by AWS.
    """

    payload = {
        "name": cluster,
        "imageId": ami_id,
    }

    response = requests.post(NEW_ASG_URL,
                             data=payload,
                             params=ASGARD_API_TOKEN,
                             timeout=REQUESTS_TIMEOUT)
    LOG.debug("Sent request to create new ASG in Cluster({}).".format(cluster))

    if response.status_code == 404:
        msg = "Can't create more ASGs for cluster {}. Please either wait " \
              "until older ASGs have been removed automatically or remove " \
              "old ASGs manually via Asgard."
        raise BackendError(msg.format(cluster))
    if response.status_code != 200:
        # The requests library follows redirects. The 200 comes from the job status page
        msg = "Error occured attempting to create new ASG for cluster {}.\nResponse: {}"
        raise BackendError(msg.format(cluster, response.text))

    response = wait_for_task_completion(response.url,
                                        ASGARD_NEW_ASG_CREATION_TIMEOUT)
    if response['status'] == 'failed':
        msg = "Failure during new ASG creation. Task Log: \n{}".format(
            response['log'])
        raise BackendError(msg)

    # Potential race condition if multiple people are making ASGs for the same cluster.
    # Return the name of the newest ASG.
    newest_asg = asgs_for_cluster(cluster)[-1]
    LOG.debug("New ASG({}) created in cluster({}).".format(
        newest_asg['autoScalingGroupName'], cluster))

    if newest_asg['desiredCapacity'] <= 0 or newest_asg['minSize'] <= 0:
        raise ASGCountZeroException(
            "New ASG {asg_name} created with 0 instances, aborting. Please check Asgard for more information"
            .format(asg_name=newest_asg['autoScalingGroupName']))

    return newest_asg['autoScalingGroupName']
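
A minimal usage sketch (not part of the original source): the cluster name and AMI id below are hypothetical placeholders, and NEW_ASG_URL, ASGARD_API_TOKEN, and REQUESTS_TIMEOUT are assumed to be configured as in the snippet above.

# Hypothetical usage; the cluster name and AMI id are placeholders.
try:
    asg_name = new_asg("loadtest-edx-edxapp", "ami-0123456789abcdef0")
    LOG.info("Created ASG: {}".format(asg_name))
except ASGCountZeroException as err:
    LOG.error("New ASG came up with zero capacity: {}".format(err))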
Example #2
def _get_asgard_resource_info(url):
    """
    A generic function for querying Asgard for information about a specific
    resource, such as an Autoscaling Group or a cluster.

    Raises:
        ResourceDoesNotExistException: When no resource exists at the given URL.
        BackendError: When Asgard returns a server error or an unexpected status code.
        RateLimitedException: When we are being rate limited by AWS.
    """

    LOG.debug("URL: {}".format(url))
    response = requests.get(url,
                            params=ASGARD_API_TOKEN,
                            timeout=REQUESTS_TIMEOUT)

    if response.status_code == 404:
        raise ResourceDoesNotExistException(
            'Resource for url {} does not exist'.format(url))
    if response.status_code >= 500:
        raise BackendError('Asgard experienced an error: {}'.format(
            response.text))
    if response.status_code != 200:
        raise BackendError(
            'Call to asgard failed with status code: {0}: {1}'.format(
                response.status_code, response.text))
    LOG.debug("ASG info: {}".format(response.text))
    resource_info_json = _parse_asgard_json_response(url, response)
    return resource_info_json
Example #3
def check_state(task_id, username, password):
    """
    Checks the state of the response to verify it is "done"

    Args:
        task_id (int): The task id to check the state of.
        username (str): The Acquia username necessary to run the command.
        password (str): The Acquia password necessary to run the command.

    Returns:
        True if the state of the response is "done".

    Raises:
        BackendError: Raised so the method will retry since immediately after receiving the
            response, the state will still be "waiting". Can't rely on parse_response since
            the response should return a 200, just not the state wanted.
    """
    api_client = get_api_client(username, password)
    response = api_client.get(CHECK_TASKS_URL.format(id=task_id))
    response_json = parse_response(response,
                                   "Failed to check state of response.")
    if response_json["state"] == "done":
        return True
    raise BackendError(
        "Check state failed. The state of the response was {state}, not done as expected.\n"
        "JSON Data: {response}".format(state=response_json["state"],
                                       response=response_json))
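
Because check_state deliberately raises BackendError while the task is still "waiting", callers are expected to wrap it in a retry loop. A hedged sketch of that wiring with the backoff library (the interval and try count are illustrative assumptions, not values from the source):

import backoff

@backoff.on_exception(backoff.constant, BackendError, interval=10, max_tries=30)
def wait_until_done(task_id, username, password):
    # Retries on BackendError until check_state reports state == "done".
    return check_state(task_id, username, password)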
Example #4
def clear_varnish_cache(env, username, password):
    """
    Clears the Varnish cache from all domains in a Drupal environment.

    Args:
        env (str): The environment to clear varnish caches in (e.g. test or prod)
        username (str): The Acquia username necessary to run the command.
        password (str): The Acquia password necessary to run the command.

    Returns:
        True if all of the Varnish caches are successfully cleared.

    Raises:
        KeyError: Raised if env value is invalid.
        BackendError: Raised if the varnish cache fails to clear in any of the domains.
    """
    api_client = get_api_client(username, password)
    domains = VALID_ENVIRONMENTS[env]
    failure = ""
    for domain in domains:
        response = api_client.delete(
            CLEAR_CACHE_URL.format(env=env, domain=domain))
        error_message = "Failed to clear cache in {domain}.".format(
            domain=domain)
        try:
            response_json = parse_response(response, error_message)
        except BackendError:
            failure = failure + error_message + "\n"
            continue
        check_state(response_json["id"], username, password)
    if failure:
        raise BackendError(failure)
    return True
Example #5
def enable_asg(asg):
    """
    Enable an ASG in Asgard. Any associated ELBs will begin routing traffic
    to it, and autoscaling will be turned on.

    Arguments:
        asg(str): The name of the asg to enable.

    Returns:
        None: When the asg has been enabled.

    Raises:
        BackendError: If the task to enable the ASG fails.
        TimeoutException: If the request to enable the ASG times out
        RateLimitedException: When we are being rate limited by AWS.
    """
    payload = {"name": asg}
    response = requests.post(ASG_ACTIVATE_URL,
                             data=payload,
                             params=ASGARD_API_TOKEN,
                             timeout=REQUESTS_TIMEOUT)
    task_url = response.url
    task_status = wait_for_task_completion(task_url, 301)
    if task_status['status'] == 'failed':
        msg = "Failure while enabling ASG. Task Log: \n{}".format(
            task_status['log'])
        raise BackendError(msg)
Example #6
def check_state(notification_url, token):
    """
    Checks the status of the response to verify it is "completed"

    Args:
        notification_url (str): The notification url to use to check the state of.
        token (str): token to authenticate client

    Returns:
        True if the status of the response is "completed".

    Raises:
        BackendError: Raised so the method will retry since immediately after receiving the
            response, the status will still be "in-progress". Can't rely on parse_response since
            the response should return a 200, just not the status wanted.
    """
    response = get_acquia_v2(notification_url, token)
    response_json = parse_response(response,
                                   "Failed to check state of response.")
    if response_json["status"] == "completed":
        return True
    raise BackendError(
        "Check status failed. The status of the response was {status}, not done as expected.\n"
        "JSON Data: {response}".format(status=response_json["status"],
                                       response=response_json))
Example #7
def delete_asg(asg,
               fail_if_active=True,
               fail_if_last=True,
               wait_for_deletion=True):
    """
    Delete an ASG using Asgard.
    curl -d "name=helloworld-example-v004" http://asgardprod/us-east-1/cluster/delete

    Arguments:
        asg(str): The name of the asg to delete.

    Returns:
        None: When the asg has been deleted.

    Raises:
        TimeoutException: If the task to delete the ASG times out.
        BackendError: If Asgard was unable to delete the ASG.
        ASGDoesNotExistException: When an ASG does not exist
        RateLimitedException: When we are being rate limited by AWS.
    """
    if is_asg_pending_delete(asg):
        LOG.info(
            "Not deleting ASG {} due to its already pending deletion.".format(
                asg))
        return
    if fail_if_active and is_asg_enabled(asg):
        msg = "Not deleting ASG {} as it is currently active.".format(asg)
        LOG.warning(msg)
        try:
            ec2.remove_asg_deletion_tag(asg)
        except EC2ResponseError as tagging_error:
            LOG.warning(
                "Failed to remove deletion tag from asg {}. Ignoring: {}".
                format(asg, tagging_error))
        raise CannotDeleteActiveASG(msg)

    if fail_if_last and is_last_asg(asg):
        msg = "Not deleting ASG {} since it is the last ASG in this cluster."
        LOG.warning(msg)
        try:
            ec2.remove_asg_deletion_tag(asg)
        except EC2ResponseError as tagging_error:
            LOG.warning(
                "Failed to remove deletion tag from asg {}. Ignoring: {}".
                format(asg, tagging_error))
        raise CannotDeleteLastASG(msg)

    payload = {"name": asg}
    response = requests.post(ASG_DELETE_URL,
                             data=payload,
                             params=ASGARD_API_TOKEN,
                             timeout=REQUESTS_TIMEOUT)
    task_url = response.url
    if wait_for_deletion:
        task_status = wait_for_task_completion(task_url, 300)
        if task_status['status'] == 'failed':
            msg = "Failure while deleting ASG. Task Log: \n{}".format(
                task_status['log'])
            raise BackendError(msg)
Example #8
def _poll_giveup(data):
    u""" Raise an error when the polling tries are exceeded."""
    orig_args = data.get(u'args')
    # The Build object was the only parameter to the original method call,
    # and so it's the first and only item in the args.
    build = orig_args[0]
    msg = u'Timed out waiting for build {} to finish.'.format(build.name)
    raise BackendError(msg)
Example #9
def _poll_giveup(results):
    """
    Raise an error when the polling tries are exceeded.
    """
    orig_args = results['args']
    msg = 'Timed out after {tries} attempts to send email with subject "{subject}".'.format(
        tries=results['tries'], subject=orig_args[3])
    raise BackendError(msg)
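
For context, a sketch of how a giveup handler like this might be attached (an assumption, not code from the source): backoff calls the handler with a details dict whose 'args' and 'tries' keys are read above. The sender and its signature are hypothetical; the subject must be the fourth positional argument to match orig_args[3].

import backoff
from smtplib import SMTPException

@backoff.on_exception(backoff.expo, SMTPException,
                      max_tries=5, on_giveup=_poll_giveup)
def send_email(smtp_host, from_address, to_address, subject, body):
    # Hypothetical sender; on the final failure backoff invokes
    # _poll_giveup with a dict containing 'args', 'tries', etc.
    ...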
Example #10
def _parse_json(url, response):
    """
    Protect against non-JSON responses that are sometimes returned from Asgard.
    """
    try:
        response_json = response.json()
    except ValueError:
        msg = "Expected json response from url: '{}' - but got the following:\n{}"
        raise BackendError(msg.format(url, response.text))
    return response_json
Example #11
def _get_asgard_resource_info(url):
    """
    A generic function for querying Asgard for information about a specific
    resource, such as an Autoscaling Group or a cluster.
    """

    LOG.debug("URL: {}".format(url))
    response = requests.get(url, params=ASGARD_API_TOKEN, timeout=REQUESTS_TIMEOUT)

    if response.status_code == 404:
        raise ResourceDoesNotExistException('Resource for url {} does not exist'.format(url))
    elif response.status_code >= 500:
        raise BackendError('Asgard experienced an error: {}'.format(response.text))
    elif response.status_code != 200:
        raise BackendError('Call to asgard failed with status code: {0}: {1}'
                           .format(response.status_code, response.text))

    LOG.debug("ASG info: {}".format(response.text))
    return _parse_json(url, response)
Example #12
def deploy(ami_id):
    """
    Deploys an AMI as an auto-scaling group (ASG) to AWS.

    Arguments:
        ami_id(str): AWS AMI ID

    Returns:
        dict(str, str, dict): Returns a dictionary with the keys:
            'ami_id' - AMI id used to deploy the AMI
            'current_asgs' - Lists of current active ASGs, keyed by cluster.
            'disabled_asgs' - Lists of current inactive ASGs, keyed by cluster.

    Raises:
        TimeoutException: When the task to bring up the new instance times out.
        BackendError: When the task to bring up the new instance fails.
        ASGDoesNotExistException: If the ASG being queried does not exist.
    """
    LOG.info("Processing request to deploy {}.".format(ami_id))

    # Pull the EDP from the AMI ID
    edp = ec2.edp_for_ami(ami_id)

    # These are all autoscaling groups that match the tags we care about.
    existing_edp_asgs = ec2.asgs_for_edp(edp, filter_asgs_pending_delete=False)

    # Find the clusters for all the existing ASGs.
    existing_clustered_asgs = clusters_for_asgs(existing_edp_asgs)
    LOG.info("Deploying to cluster(s) {}".format(existing_clustered_asgs.keys()))

    # Create a new ASG in each cluster.
    new_clustered_asgs = defaultdict(list)
    for cluster in existing_clustered_asgs:
        try:
            newest_asg = new_asg(cluster, ami_id)
            new_clustered_asgs[cluster].append(newest_asg)
        except Exception:
            msg = "ASG creation failed for cluster '{}' but succeeded for cluster(s) {}."
            msg = msg.format(cluster, new_clustered_asgs.keys())
            LOG.exception(msg)
            raise

    new_asgs = [asgs[0] for asgs in new_clustered_asgs.values()]
    LOG.info("New ASGs created: {}".format(new_asgs))
    ec2.wait_for_in_service(new_asgs, 300)
    LOG.info("New ASGs healthy: {}".format(new_asgs))

    LOG.info("Enabling traffic to new ASGs for the {} cluster(s).".format(existing_clustered_asgs.keys()))
    success, enabled_asgs, disabled_asgs = _red_black_deploy(dict(new_clustered_asgs), existing_clustered_asgs)
    if not success:
        raise BackendError("Error performing red/black deploy - deploy was unsuccessful. "
                           "enabled_asgs: {} - disabled_asgs: {}".format(enabled_asgs, disabled_asgs))

    LOG.info("Woot! Deploy Done!")
    return {'ami_id': ami_id, 'current_asgs': enabled_asgs, 'disabled_asgs': disabled_asgs}
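
A hedged invocation sketch; the AMI id is a placeholder, and the return keys are the ones documented in the docstring above.

result = deploy("ami-0123456789abcdef0")
LOG.info("Now serving traffic: {}".format(result['current_asgs']))
LOG.info("Disabled, kept for rollback: {}".format(result['disabled_asgs']))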
Example #13
def disable_asg(asg):
    """
    Disable an ASG using Asgard.
    curl -d "name=helloworld-example-v004" http://asgardprod/us-east-1/cluster/deactivate

    Arguments:
        asg(str): The name of the asg to disable.

    Returns:
        None: When the asg has been disabled.

    Raises:
        TimeoutException: If the task to disable the ASG times out.
        BackendError: If Asgard was unable to disable the ASG.
        ASGDoesNotExistException: If the ASG does not exist.
        CannotDisableActiveASG: If the ASG is the last ASG in its cluster.
        RateLimitedException: When we are being rate limited by AWS.
    """
    try:
        if is_asg_pending_delete(asg):
            LOG.info(
                "Not disabling old ASG {} due to its pending deletion.".format(
                    asg))
            return
    except ASGDoesNotExistException:
        LOG.info("Not disabling ASG {}, it no longer exists.".format(asg))
        return

    if is_last_asg(asg):
        msg = "Not disabling ASG {}, it is the last ASG in this cluster."
        raise CannotDisableActiveASG(msg)

    payload = {"name": asg}
    response = requests.post(ASG_DEACTIVATE_URL,
                             data=payload,
                             params=ASGARD_API_TOKEN,
                             timeout=REQUESTS_TIMEOUT)
    task_url = response.url
    task_status = wait_for_task_completion(task_url, 300)
    if task_status['status'] == 'failed':
        msg = "Failure while disabling ASG. Task Log: \n{}".format(
            task_status['log'])
        raise BackendError(msg)
Example #14
def parse_response(response, error_message):
    """
    Parses the response.

    Args:
        response (requests.Response): The response to parse.
        error_message (str): Context to log and raise if the response is not a 200.

    Returns:
        The JSON representation of the response if no errors.

    Raises:
        BackendError: Raised if the response's status code is not 200.
    """
    if response.status_code != 200:
        msg = "{specific}\nStatus Code: {status}\nBody: {body}".format(specific=error_message,
                                                                       status=response.status_code, body=response.text)
        LOG.error(msg)
        raise BackendError(msg)
    return response.json()
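
A short usage sketch (an assumption, mirroring the call site in Example #3): parse_response pairs a caller-supplied message with the failing status code and body, so callers only handle the happy path.

# Hypothetical call site; api_client and CHECK_TASKS_URL are assumed
# to be configured as in the earlier examples.
response = api_client.get(CHECK_TASKS_URL.format(id=task_id))
task_json = parse_response(response, "Failed to fetch task status.")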
Example #15
def clear_varnish_cache(app_id, env, client_id, secret):
    """
    Clears the Varnish cache from all domains in a Drupal environment.

    Args:
        app_id (str): Application id assigned to Drupal instance.
        env (str): The environment to clear varnish caches in (e.g. test or prod)
        client_id (str): The Acquia api client id necessary to run the command.
        secret (str): The Acquia api secret key to run the command.

    Returns:
        True if all of the Varnish caches are successfully cleared.

    Raises:
        KeyError: Raised if env value is invalid.
        BackendError: Raised if the varnish cache fails to clear in any of the domains.
    """
    domains = VALID_ENVIRONMENTS[env]
    failure = ""

    token = get_api_token(client_id, secret)
    environmentId = fetch_environment_uid(app_id, env, token)
    if environmentId:
        for domain in domains:
            response = post_acquia_v2(
                CLEAR_CACHE_URL.format(environmentId=environmentId,
                                       domain=domain), token)
            error_message = "Failed to clear cache in {domain}.".format(
                domain=domain)
            try:
                response_json = parse_response(response, error_message)
            except BackendError:
                failure = failure + error_message + "\n"
                continue
            check_state(response_json['_links']['notification']['href'], token)
        if failure:
            raise BackendError(failure)
        return True
Example #16
def trigger_build(base_url,
                  user_name,
                  user_token,
                  job_name,
                  job_token,
                  job_cause=None,
                  job_params=None,
                  timeout=60 * 30):
    u"""
    Trigger a jenkins job/project (note that jenkins uses these terms interchangeably)

    Args:
        base_url (str): The base URL for the jenkins server, e.g. https://test-jenkins.testeng.edx.org
        user_name (str): The jenkins username
        user_token (str): API token for the user. Available at {base_url}/user/{user_name}/configure
        job_name (str): The Jenkins job name, e.g. test-project
        job_token (str): Jobs must be configured with the option "Trigger builds remotely" selected.
            Under this option, you must provide an authorization token (configured in the job)
            in the form of a string so that only those who know it would be able to remotely
            trigger this project's builds.
        job_cause (str): Text that will be included in the recorded build cause
        job_params (set of tuples): Parameter names and their values to pass to the job
        timeout (int): The maximum number of seconds to wait for the jenkins build to complete (measured
            from when the job is triggered.)

    Returns:
        The status of the build that was triggered.

    Raises:
        BackendError: if the Jenkins job could not be triggered successfully
    """
    @backoff.on_predicate(
        backoff.constant,
        interval=60,
        max_tries=timeout // 60 + 1,
        on_giveup=_poll_giveup,
        # We aren't worried about concurrent access, so turn off jitter
        jitter=None,
    )
    def poll_build_for_result(build):
        u"""
        Poll for the build running, with exponential backoff, capped to ``timeout`` seconds.
        The on_predicate decorator is used to retry when the return value
        of the target function is True.
        """
        return not build.is_running()

    # Create a dict with key/value pairs from the job_params
    # that were passed in like this:  --param FOO bar --param BAZ biz
    # These will get passed to the job as string parameters like this:
    # {u'FOO': u'bar', u'BAZ': u'biz'}
    request_params = {}
    for param in job_params or []:
        request_params[param[0]] = param[1]

    # Contact jenkins, log in, and get the base data on the system.
    try:
        crumb_requester = CrumbRequester(baseurl=base_url,
                                         username=user_name,
                                         password=user_token,
                                         ssl_verify=True)
        jenkins = Jenkins(base_url,
                          username=user_name,
                          password=user_token,
                          requester=crumb_requester)
    except (JenkinsAPIException, HTTPError) as err:
        raise BackendError(str(err))

    if not jenkins.has_job(job_name):
        msg = u'Job not found: {}.'.format(job_name)
        msg += u' Verify that you have permissions for the job and double check the spelling of its name.'
        raise BackendError(msg)

    # This will start the job and will return a QueueItem object which can be used to get build results
    job = jenkins[job_name]
    queue_item = job.invoke(securitytoken=job_token,
                            build_params=request_params,
                            cause=job_cause)
    LOG.info(u'Added item to jenkins. Server: {} Job: {} '.format(
        jenkins.base_server_url(), queue_item))

    # Block this script until we are through the queue and the job has begun to build.
    queue_item.block_until_building()
    build = queue_item.get_build()
    LOG.info(u'Created build {}'.format(build))
    LOG.info(u'See {}'.format(build.baseurl))

    # Now block until you get a result back from the build.
    poll_build_for_result(build)

    # Update the build's internal state, so that the final status is available
    build.poll()

    status = build.get_status()
    LOG.info(u'Build status: {status}'.format(status=status))
    return status
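
A hypothetical invocation, reusing the illustrative values from the docstring and comments above; the credentials are placeholders.

status = trigger_build(
    base_url="https://test-jenkins.testeng.edx.org",
    user_name="jenkins-user",        # hypothetical account
    user_token="user-api-token",     # hypothetical API token
    job_name="test-project",
    job_token="job-trigger-token",   # hypothetical job token
    job_cause="Triggered by the deployment pipeline",
    job_params={("FOO", "bar"), ("BAZ", "biz")},
)
if status != "SUCCESS":
    raise BackendError("Build finished with status: {}".format(status))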