Example #1
def test_base_get_connection_error():
    n = nomad.Nomad(host="162.16.10.102",
                    port=common.NOMAD_PORT,
                    timeout=0.001,
                    verify=False)
    with pytest.raises(nomad.api.exceptions.BaseNomadException):
        j = n.evaluations["nope"]
Example #2
def test_base_delete_connection_error():
    n = nomad.Nomad(host="162.16.10.102",
                    port=common.NOMAD_PORT,
                    timeout=0.001,
                    verify=False)
    with pytest.raises(nomad.api.exceptions.BaseNomadException):
        j = n.job.deregister_job("example")
Example #3
def test_base_put_connection_error():
    n = nomad.Nomad(host="162.16.10.102",
                    port=common.NOMAD_PORT,
                    timeout=0.001,
                    verify=False)
    with pytest.raises(nomad.api.exceptions.BaseNomadException):
        j = n.system.initiate_garbage_collection()
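The connection-error tests above (and most test examples below) import a shared common module for fixtures such as common.IP, common.NOMAD_PORT and common.NOMAD_TOKEN. That module is not part of this listing; a minimal sketch of what it might contain, with every value a placeholder assumption, is:

# common.py -- hypothetical test fixtures; every value is a placeholder assumption
IP = "192.168.33.10"                              # address of a local Nomad dev agent
NOMAD_PORT = 4646                                 # default Nomad HTTP API port
NOMAD_TOKEN = "replace-with-a-management-token"   # ACL token used by the authorized tests
NOMAD_NAMESPACE = "default"                       # namespace used in Example #8
VAULT_POLICY_INVALID_TOKEN = "replace-with-an-invalid-vault-token"  # used in Example #7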
Example #4
def get_active_volumes_detailed() -> Dict:
    """Returns the instance type and number of allocations (jobs) for each active volume

    These can be used to determine which jobs would actually be able
    to be placed if they were queued up.
    """
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = nomad.Nomad(nomad_host, port=int(nomad_port), timeout=30)

    volumes = dict()
    try:
        for node in nomad_client.nodes.get_nodes():
            node_detail = nomad_client.node.get_node(node["ID"])
            allocations = len(nomad_client.node.get_allocations(node["ID"]))
            if ("Status" in node_detail and node_detail["Status"] == "ready"
                    and "Meta" in node_detail
                    and "volume_index" in node_detail["Meta"]):
                volume_info = dict()
                volume_info["type"] = node_detail["Attributes"].get(
                    "platform.aws.instance-type", None)
                volume_info["allocations"] = allocations

                volumes[node_detail["Meta"]["volume_index"]] = volume_info
    except nomad.api.exceptions.BaseNomadException:
        # Nomad is down, return the empty dict.
        pass

    return volumes
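get_active_volumes_detailed (like Examples #5, #19 and #30) depends on a get_env_variable helper that is not shown in this listing. A minimal sketch, assuming it simply wraps os.environ with an optional default and fails loudly when a required variable is missing:

import os


def get_env_variable(var_name, default=None):
    """Hypothetical helper: read an environment variable, falling back to a default if given."""
    value = os.environ.get(var_name, default)
    if value is None:
        raise KeyError("Required environment variable %s is not set." % var_name)
    return value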
Example #5
def send_job(job_type: Enum, job_id: int) -> None:
    """Queues a worker job by sending a Nomad Job dispatch message.

    job_type must be a valid Enum for ProcessorPipelines or
    Downloaders as defined in data_refinery_common.job_lookup.
    job_id must correspond to an existing ProcessorJob or
    DownloaderJob record.
    """
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_client = nomad.Nomad(nomad_host, timeout=5)

    # Once I have every job specced out with its own Nomad job, this
    # code can change and the meta won't need "JOB_NAME" in it because
    # just specifying the nomad_job to dispatch will be enough.
    if job_type in list(ProcessorPipeline):
        nomad_job = NOMAD_PROCESSOR_JOB
    elif job_type in list(Downloaders):
        nomad_job = NOMAD_DOWNLOADER_JOB
    else:
        raise ValueError("Invalid job_type.")

    logger.info("Queuing %s nomad job to run DR job %s with id %d.",
                nomad_job,
                job_type.value,
                job_id)
    nomad_client.job.dispatch_job(nomad_job, meta={"JOB_NAME": job_type.value,
                                                   "JOB_ID": str(job_id)})
Example #6
def run_module():
    # seed the result dict in the object
    # we primarily care about changed and state
    # changed is True if this module effectively modified the target
    # state will include any data that you want your module to pass back
    # for consumption, for example, in a subsequent task
    result = dict(changed=False, original_message='', message='')

    module = AnsibleModule(argument_spec=dict(inline_data=dict(required=False,
                                                               type='str'),
                                              nomad_host=dict(required=False,
                                                              type='str'),
                                              nomad_port=dict(required=False,
                                                              type='int'),
                                              nomad_token=dict(required=False,
                                                               type='str'),
                                              cert_path=dict(required=False,
                                                             type='path'),
                                              secure=dict(required=False,
                                                          type='bool')),
                           supports_check_mode=True)

    inline_data = yaml.safe_load(module.params['inline_data'])
    nomad_client = nomad.Nomad(host='127.0.0.1')
    nomad_client.job.register_job("example", inline_data)

    module.exit_json(**result)
Example #7
def nomad_setup_vault_invalid_token():
    n = nomad.Nomad(host=common.IP,
                    port=common.NOMAD_PORT,
                    verify=False,
                    token=common.NOMAD_TOKEN,
                    vaulttoken=common.VAULT_POLICY_INVALID_TOKEN)
    return n
Example #8
def nomad_setup_with_namespace():
    n = nomad.Nomad(host=common.IP,
                    port=common.NOMAD_PORT,
                    verify=False,
                    token=common.NOMAD_TOKEN,
                    namespace=common.NOMAD_NAMESPACE)
    return n
Example #9
def test_base_get_connnection_not_authorized():
    n = nomad.Nomad(host=common.IP,
                    port=common.NOMAD_PORT,
                    token='aed2fc63-c155-40d5-b58a-18deed4b73e5',
                    verify=False)
    with pytest.raises(nomad.api.exceptions.URLNotAuthorizedNomadException):
        j = n.job.get_job("example")
Example #10
def run():
    module = AnsibleModule(argument_spec=dict(host=dict(required=True,
                                                        type='str'),
                                              use_ssl=dict(type='bool',
                                                           default=True),
                                              timeout=dict(type='int',
                                                           default=5),
                                              validate_certs=dict(
                                                  type='bool', default=True),
                                              client_cert=dict(type='path',
                                                               default=None),
                                              client_key=dict(type='path',
                                                              default=None),
                                              namespace=dict(type='str',
                                                             default=None),
                                              name=dict(type='str',
                                                        default=None),
                                              token=dict(type='str',
                                                         default=None,
                                                         no_log=True)),
                           supports_check_mode=True)

    if not import_nomad:
        module.fail_json(msg=missing_required_lib("python-nomad"))

    certificate_ssl = (module.params.get('client_cert'),
                       module.params.get('client_key'))

    nomad_client = nomad.Nomad(host=module.params.get('host'),
                               secure=module.params.get('use_ssl'),
                               timeout=module.params.get('timeout'),
                               verify=module.params.get('validate_certs'),
                               cert=certificate_ssl,
                               namespace=module.params.get('namespace'),
                               token=module.params.get('token'))

    changed = False
    nomad_jobs = list()
    try:
        job_list = nomad_client.jobs.get_jobs()
        for job in job_list:
            nomad_jobs.append(nomad_client.job.get_job(job.get('ID')))
        result = nomad_jobs
    except Exception as e:
        module.fail_json(msg=to_native(e))

    if module.params.get('name'):
        filtered = list()
        try:
            for job in result:
                if job.get('ID') == module.params.get('name'):
                    filtered.append(job)
            if not filtered:
                module.fail_json(msg="Couldn't find Job with id " +
                                 str(module.params.get('name')))
            result = filtered
        except Exception as e:
            module.fail_json(msg=to_native(e))

    module.exit_json(changed=changed, result=result)
Example #11
def check_all_boxes():
    nomad_client = nomad.Nomad(discover_service("nomad").ip)
    deployments = nomad_client.job.get_deployments("ssh-client")

    for deployment in deployments:
        tunnel_exist = Tunnel.query.filter_by(job_id=deployment).first()
        if not tunnel_exist:
            cleanup_old_nomad_box(deployment)
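check_all_boxes, like several later examples, obtains the Nomad address from a discover_service("nomad") helper and uses its .ip attribute. That helper is not part of this listing; a minimal sketch, assuming Consul-style DNS service discovery and a simple named tuple, purely for illustration:

import socket
from collections import namedtuple

Service = namedtuple("Service", ["ip"])


def discover_service(name):
    """Hypothetical helper: resolve '<name>.service.consul' to an address."""
    return Service(ip=socket.gethostbyname("%s.service.consul" % name))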
Example #12
def cleanup_old_nomad_box(job_id):
    nomad_client = nomad.Nomad(discover_service("nomad").ip)

    try:
        del_box_nomad(nomad_client, job_id)
    except nomad.api.exceptions.BaseNomadException:
        cleanup_old_nomad_box.schedule(timedelta(hours=2), job_id, timeout=60000)
        raise
Example #13
def test_base_region_and_namespace_qs():
    n = nomad.Nomad(host=common.IP, port=common.NOMAD_PORT, verify=False, token=common.NOMAD_TOKEN, region="random", namespace="test")
    qs = n.jobs._query_string_builder("v1/jobs")

    assert "region" in qs
    assert qs["region"] == "random"

    assert "namespace" in qs
    assert qs["namespace"] == "test"
Example #14
def dump_jobs(host=None, cert_path=None, verbose=False):
    """
    Dump all jobs and allocation logs on this host using the path passed for certificates.

    :param host:
    :param cert_path: Path to SSL certificates
    :param verbose: If 'True' enable extra output
    :return:
    """
    cert_files = {
        'CA': os.path.join(cert_path, 'ca.pem'),
        'CRT': os.path.join(cert_path, 'client.crt'),
        'KEY': os.path.join(cert_path, 'client.key')
    }

    my_nomad = nomad.Nomad(host=host,
                           secure=True,
                           verify=cert_files['CA'],
                           cert=(cert_files['CRT'], cert_files['KEY']))

    try:
        for job in my_nomad.jobs:
            submit_time = get_datetime(
                job['SubmitTime']
            )  # time.strftime('%Y-%m-%d %I:%M:%S%p', time.localtime(job['SubmitTime']/1000000000))

            print('=' * 80)
            print("JOB ID : {:50s} STATUS : {:12s} SUBMITTED : {}".format(
                job['ID'], job['Status'], submit_time))

            allocations = my_nomad.job.get_allocations(job["ID"])
            if allocations:
                print("ALLOCATIONS:")

                # Iterate through all allocations

                for allocation in allocations:
                    print(">> ALLOCATION ID : {} ALLOCATION NAME : {}".format(
                        allocation['ID'], allocation['Name']))

                    alloc_id = allocation['ID']
                    task_id = list(allocation['TaskStates'].keys())[0]

                    try:
                        stderr_log = my_nomad.client.stream_logs.stream(
                            id=alloc_id,
                            task=task_id,
                            type='stderr',
                            plain=True)
                        print('-- LOG', '-' * 74)
                        print(stderr_log)
                        print('-' * 80)
                    except Exception as e:
                        print("EXCEPTION: {}".format(e))
    except OSError as e:
        print(e)
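dump_jobs formats job['SubmitTime'] through a get_datetime helper that is not shown; the commented-out expression next to the call suggests it converts Nomad's nanosecond timestamps to local time. A sketch based on that assumption:

import time


def get_datetime(submit_time_ns):
    """Hypothetical helper: render Nomad's nanosecond SubmitTime as a readable local timestamp."""
    return time.strftime('%Y-%m-%d %I:%M:%S%p', time.localtime(submit_time_ns / 1000000000))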
Example #15
    def __init__(self, current_user: User, tunnel: Optional[Tunnel], job_id=None):
        self.current_user = current_user
        self.tunnel = tunnel
        if job_id:
            self.job_id = job_id
        if tunnel:
            self.subdomain = tunnel.subdomain
            self.job_id = tunnel.job_id

        self.nomad_client = nomad.Nomad(discover_service("nomad").ip)
Example #16
    def __init__(self, current_user: User, box: Optional[Box], job_id=None):
        self.current_user = current_user
        self.box = box
        if job_id:
            self.job_id = job_id
        if box:
            self.config = box.config
            self.job_id = box.job_id

        self.nomad_client = nomad.Nomad(discover_service("nomad").ip)
Example #17
def test_base_use_address_instead_on_host_port():
    responses.add(
        responses.GET,
        'https://nomad.service.consul:4646/v1/jobs',
        status=200,
        json=[]
    )

    nomad_address = "https://nomad.service.consul:4646"
    n = nomad.Nomad(address=nomad_address, host=common.IP, port=common.NOMAD_PORT, verify=False, token=common.NOMAD_TOKEN)
    n.jobs.get_jobs()
Example #18
    def _get_client(self):
        url = self.config['url']
        token = self.config['token']
        verify = self.config['verify']
        region = self.config['region']

        client = nomad.Nomad(host=url,
                             token=token,
                             verify=verify,
                             region=region)
        return client
Example #19
def get_nomad_jobs() -> list:
    """Calls nomad service and return all jobs"""
    try:
        nomad_host = get_env_variable("NOMAD_HOST")
        nomad_port = get_env_variable("NOMAD_PORT", "4646")
        nomad_client = nomad.Nomad(nomad_host,
                                   port=int(nomad_port),
                                   timeout=30)
        return nomad_client.jobs.get_jobs()
    except nomad.api.exceptions.BaseNomadException:
        # Nomad is not available right now
        return []
Example #20
    def test_third_invocation_of_named_tunnel_works(self, current_user,
                                                    session):
        asub = ReservedSubdomainFactory(user=current_user, name="bobjoeboe")
        session.add(asub)
        session.flush()

        first_time = TunnelCreationService(
            current_user=current_user,
            subdomain_id=asub.id,
            port_types=["http"],
            ssh_key="",
        ).create()

        nomad_client = nomad.Nomad(discover_service("nomad").ip)
        del_tunnel_nomad(nomad_client, first_time.job_id)
        asub.in_use = False
        session.add(asub)
        session.flush()

        second_time = TunnelCreationService(
            current_user=current_user,
            subdomain_id=asub.id,
            port_types=["http"],
            ssh_key="",
        ).create()

        nomad_client = nomad.Nomad(discover_service("nomad").ip)
        del_tunnel_nomad(nomad_client, second_time.job_id)
        asub.in_use = False
        session.add(asub)
        session.flush()

        third_time = TunnelCreationService(
            current_user=current_user,
            subdomain_id=asub.id,
            port_types=["http"],
            ssh_key="",
        ).create()

        assert first_time.ssh_port != second_time.ssh_port != third_time.ssh_port
Example #21
def check_all_boxes():
    nomad_client = nomad.Nomad(discover_service("nomad").ip)
    deployments = nomad_client.job.get_deployments("ssh-client")

    for deployment in deployments:
        box_exist = Box.query.filter_by(job_id=deployment).first()

        if not box_exist:
            cleanup_old_nomad_box(deployment)
            continue

        if datetime.utcnow() >= box_exist.session_end_time:
            cleanup_old_nomad_box(deployment)
Example #22
def run_job(app, config, nomad_job_name, params):
    nomad_host = config.JOB_QUEUE_NOMAD_HOST

    params.update({"FS_BACKEND": config.FS_BACKEND})
    if config.FS_BACKEND == "s3":
        params.update({
            "S3_ENDPOINT": config.FS_S3_ENDPOINT,
            "AWS_DEFAULT_REGION": config.FS_S3_REGION,
            "AWS_ACCESS_KEY_ID": config.FS_S3_ACCESS_KEY,
            "AWS_SECRET_ACCESS_KEY": config.FS_S3_SECRET_KEY,
        })
    elif config.FS_BACKEND == "swift":
        params.update({
            "OS_USERNAME": config.FS_SWIFT_USER,
            "OS_PASSWORD": config.FS_SWIFT_KEY,
            "OS_AUTH_URL": config.FS_SWIFT_AUTHURL,
            "OS_TENANT_NAME": config.FS_SWIFT_TENANT_NAME,
            "OS_REGION_NAME": config.FS_SWIFT_REGION_NAME,
        })

    data = json.dumps(params).encode("utf-8")
    payload = base64.b64encode(data).decode("utf-8")
    ncli = nomad.Nomad(host=nomad_host, timeout=5)

    response = ncli.job.dispatch_job(nomad_job_name, payload=payload)

    nomad_jobid = response["DispatchedJobID"]

    while True:
        summary = ncli.job.get_summary(nomad_jobid)
        task_group = list(summary["Summary"])[0]
        status = summary["Summary"][task_group]
        if status["Failed"] != 0 or status["Lost"] != 0:
            app.logger.error("Nomad job %r failed: %r", nomad_jobid, status)
            out, err = get_nomad_job_logs(ncli, nomad_jobid, nomad_job_name)
            out = textwrap.indent(out, "\t")
            err = textwrap.indent(err, "\t")
            raise Exception("Job %s is 'Failed' or 'Lost':\nStatus: "
                            "%s\nerr:\n%s\nout:\n%s" %
                            (nomad_jobid, status, err, out))
        if status["Complete"] == 1:
            app.logger.info("Nomad job %r: complete", nomad_jobid)
            break
        # there isn't a timeout here but python rq jobs have a timeout. Nomad
        # jobs have a timeout too.
        time.sleep(1)
    return True
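run_job collects logs through a get_nomad_job_logs helper that is not included here. A plausible sketch, assuming it walks the dispatched job's allocations and reads their logs via the same stream_logs endpoint used in Example #14:

def get_nomad_job_logs(ncli, nomad_jobid, nomad_job_name):
    """Hypothetical helper: gather stdout/stderr from every allocation of a dispatched job."""
    # nomad_job_name is accepted to match the caller's signature; unused in this sketch.
    out, err = [], []
    for allocation in ncli.job.get_allocations(nomad_jobid):
        task = list(allocation["TaskStates"].keys())[0]
        out.append(ncli.client.stream_logs.stream(
            id=allocation["ID"], task=task, type="stdout", plain=True))
        err.append(ncli.client.stream_logs.stream(
            id=allocation["ID"], task=task, type="stderr", plain=True))
    return "\n".join(out), "\n".join(err)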
Example #23
def main(service, buildNumber):
    """main"""
    n = nomad.Nomad('nomad.service.consul', timeout=5)
    if check(n, service, buildNumber):
        #success!
        sys.exit(0)
    else:
        # rollback to a previous working build...
        # up to 5 previous builds.
        prevBuildNumber = int(buildNumber) - 1
        for i in range(prevBuildNumber, prevBuildNumber - 5, -1):
            print("INFO:rolling back to buildnumber:%s" % i)
            rollback(service, i)
            ret = check(n, service, i)
            if ret:
                sys.exit(0)
Example #24
def test_base_raise_exception_not_requests_response_object(mock_requests):
    mock_requests().delete.side_effect = [requests.RequestException()]

    with pytest.raises(nomad.api.exceptions.BaseNomadException) as excinfo:
        n = nomad.Nomad(host="162.16.10.102",
                        port=common.NOMAD_PORT,
                        timeout=0.001,
                        verify=False)

        _ = n.job.deregister_job("example")

    # excinfo is a ExceptionInfo instance, which is a wrapper around the actual exception raised.
    # The main attributes of interest are .type, .value and .traceback.
    # https://docs.pytest.org/en/3.0.1/assert.html#assertions-about-expected-exceptions
    assert hasattr(excinfo.value.nomad_resp, "text") is False
    assert isinstance(excinfo.value.nomad_resp, requests.RequestException)
    assert "raised due" in str(excinfo)
Example #25
def main():
    module = AnsibleModule(
        argument_spec=dict(
            token=dict(type='str'),
            token_file=dict(type='str'),
            Name=dict(required=True, type='str'),
            Description=dict(type='str'),
            Rules=dict(required=True, type='dict', options=dict(
                namespace=dict(type='dict'),
                agent=dict(type='dict', options=dict(
                    policy=dict(type='str')
                )),
                node=dict(type='dict', options=dict(
                    policy=dict(type='str')
                )),
                quota=dict(type='dict', options=dict(
                    policy=dict(type='str')
                ))
            ))
        ),
        supports_check_mode=True
    )

    try:
        token = module.params.get('token')
        token_file = module.params.get('token_file')
        if token_file:
            with open(token_file) as f:
                token = f.read()
        if token is None:
            token = os.getenv('NOMAD_TOKEN')
        n = nomad.Nomad(token=token)

        if module.check_mode:
            module.exit_json(changed=False)
        else:
            n.acl.create_policy(module.params.get('Name'), dict(
                Name=module.params.get('Name'),
                Description=module.params.get('Description', ''),
                Rules=json.dumps(strip(module.params.get('Rules')))
            ))
            module.exit_json(changed=True)
    except BaseNomadException as e:
        module.fail_json(status_code=e.nomad_resp.status_code, msg=e.nomad_resp.text, type=type(e))
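The policy body above is passed through a strip helper before being serialized; that helper is not shown in this listing. A minimal sketch, assuming its only job is to drop unset (None) options so they do not end up in the policy JSON:

def strip(data):
    """Hypothetical helper: recursively remove dict keys whose value is None."""
    if isinstance(data, dict):
        return {k: strip(v) for k, v in data.items() if v is not None}
    return data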
Example #26
    def execute(self, context):
        self.log.info('Starting nomad batch job')
        if self.nomad_conn_id:
            self.nomad = self.get_hook().get_conn()
        else:
            self.nomad = nomad.Nomad(
                host = self.nomad_url,
                secure = self.secure
                )

        job_json = self.normalize_job_spec(self.job_spec)
        job_id = job_json["ID"]
        self.log.info("Starting the nomad job %s" % job_id)
        self.nomad.jobs.register_job({"Job" : job_json})
            
        start = time.time()

        self.log.info("waiting for nomad job to complete %s" % job_id)

        while True:
            summary = self.nomad.job.get_summary(job_id)
            last_check = time.time()

            inprogress = failed = done = 0
            for group in summary["Summary"].values():
                inprogress += sum(group[x] for x in ["Queued", "Running", "Starting"])
                failed += sum(group[x] for x in ["Failed", "Lost"])
                done += group["Complete"]
            total_jobs = done + failed + inprogress

            if inprogress == 0:
                # every task group has finished; evaluate the outcome below
                break
            if last_check - start > self.ttl:
                # self.nomad.job.deregister_job(job_id)
                raise AirflowException('nomad job out of time')
            # avoid hammering the Nomad API between polls
            time.sleep(1)

        if total_jobs and float(failed) / total_jobs > self.failed_fraction:
            raise AirflowException(f'{failed} out of {total_jobs} nomad batch jobs failed')
        else:
            # good enough!
            return
Example #27
    def test_find_unused_boxes(self, current_user, session):
        """ Kills unused boxes """
        asub = ReservedSubdomainFactory(user=current_user, name="bobjoebob")
        session.add(asub)
        session.flush()

        TunnelCreationService(
            current_user=current_user,
            subdomain_id=asub.id,
            port_types=["http"],
            ssh_key="",
        ).create()

        find_unused_boxes()
        find_unused_boxes()

        nomad_client = nomad.Nomad(discover_service("nomad").ip)
        deploys = nomad_client.job.get_deployments("ssh-client-bobjoebob")

        assert len(deploys) == 0
Example #28
    def do_GET(self):
        if self.path == '/metrics':
            global allocated_memory_gauge, allocated_cpu_gauge
            core.REGISTRY.unregister(allocated_cpu_gauge)
            core.REGISTRY.unregister(allocated_memory_gauge)
            allocated_cpu_gauge = Gauge('nomad_allocated_cpu', 'Nomad allocated cpu', ['job', 'taskgroup', 'task', 'alloc_id'])
            allocated_memory_gauge = Gauge('nomad_allocated_memory', 'Nomad allocated memory', ['job', 'taskgroup', 'task', 'alloc_id'])

            nomad_server = os.environ.get('NOMAD_SERVER', 'nomad.service.consul')
            nomad_port = os.environ.get('NOMAD_PORT', 4646)
            n = nomad.Nomad(host=nomad_server, port=nomad_port)
            get_allocs(n)
            get_deployments(n)
            get_jobs(n)
            get_resources(n)
            stats = generate_latest(core.REGISTRY)
            self.send_response(200)
            self.send_header('Content-type', 'text/html')
            self.end_headers()
            self.wfile.write(stats)
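do_GET delegates the actual collection to get_allocs, get_deployments, get_jobs and get_resources, none of which are shown here. As an illustration of one of them, a sketch of get_resources that fills the two module-level gauges defined above, assuming the older TaskResources shape of the Nomad allocation API:

def get_resources(n):
    """Hypothetical collector: export per-task CPU/memory reservations of running allocations."""
    for alloc in n.allocations.get_allocations():
        if alloc.get("ClientStatus") != "running":
            continue
        detail = n.allocation.get_allocation(alloc["ID"])
        for task, res in (detail.get("TaskResources") or {}).items():
            labels = (detail["JobID"], detail["TaskGroup"], task, detail["ID"])
            allocated_cpu_gauge.labels(*labels).set(res.get("CPU", 0))
            allocated_memory_gauge.labels(*labels).set(res.get("MemoryMB", 0))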
Example #29
    def __init__(
        self,
        current_user: User,
        config_id: Optional[int],
        ssh_key: str,
        image: str,
    ):

        self.ssh_key = ssh_key
        self.image = image
        self.current_user = current_user
        self.session_end_time = datetime.utcnow() + timedelta(
            seconds=current_user.limits().duration)

        if config_id:
            self.config = Config.query.get(config_id)
        else:
            self.config = ConfigCreationService(self.current_user).create()

        # We need to do this each time so that if a Nomad service goes down
        # it doesn't affect the web API.
        self.nomad_client = nomad.Nomad(discover_service("nomad").ip)
Example #30
def get_active_volumes() -> Set[str]:
    """Returns a Set of indices for volumes that are currently mounted.

    These can be used to determine which jobs would actually be able
    to be placed if they were queued up.
    """
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = nomad.Nomad(nomad_host, port=int(nomad_port), timeout=30)

    volumes = set()
    try:
        for node in nomad_client.nodes.get_nodes():
            node_detail = nomad_client.node.get_node(node["ID"])
            if 'Status' in node_detail and node_detail['Status'] == 'ready' \
               and 'Meta' in node_detail and 'volume_index' in node_detail['Meta']:
                volumes.add(node_detail['Meta']['volume_index'])
    except nomad.api.exceptions.BaseNomadException:
        # Nomad is down, return the empty set.
        pass

    return volumes