Exemple #1
0
def get_job_log(jobname):
    """
    get the log of the first pod that belongs to a job
    ---
    responses:
      200:
        description: get logs
        schema:
          type: object
        examples:
          application/json:
            data: "haha"
    """
    # Unknown job names are rejected before kubernetes is contacted.
    if not Job.get_by_name(name=jobname):
        abort(404, "job {} not found".format(jobname))

    try:
        pod_list = KubeApi.instance().get_job_pods(jobname,
                                                   namespace=DEFAULT_JOB_NS)
        # No pod listed for the job: answer with a retry hint instead of a log.
        if not pod_list.items:
            return {'data': "no log, please retry"}

        # Only the first pod returned for the job is read.
        first_pod = pod_list.items[0]
        log_text = KubeApi.instance().get_pod_log(
            podname=first_pod.metadata.name,
            namespace=DEFAULT_JOB_NS)
        return {'data': log_text}
    except ApiException as err:
        # Kubernetes API errors keep their own HTTP status.
        abort(err.status, "Error when get job log: {}".format(str(err)))
    except Exception as err:
        abort(500, "Error when get job log: {}".format(str(err)))
Exemple #2
0
def delete_job(jobname):
    """
    Delete a single job from kubernetes and from the database
    ---
    parameters:
      - name: jobname
        in: path
        type: string
        required: true
    responses:
      200:
        description: error message
        schema:
          $ref: '#/definitions/Error'
        examples:
          application/json:
            error: null
    """
    record = Job.get_by_name(jobname)
    if not record:
        abort(404, "job {} not found".format(jobname))

    # The kubernetes job is removed first (ignore_404=True is passed to the
    # wrapper); the database row is deleted only after that call returns.
    with handle_k8s_err("Error when delete job"):
        kube = KubeApi.instance()
        kube.delete_job(jobname,
                        namespace=DEFAULT_JOB_NS,
                        ignore_404=True)
    record.delete()
    return DEFAULT_RETURN_VALUE
Exemple #3
0
    def watch_app_job_pods(self, cluster):
        """Watch app and job pods of one cluster forever and dispatch events.

        Pods are selected with the label selector ``kae-type in (app, job)``.
        For every event:

        * a pod labelled ``kae-app-name`` has its event serialized to JSON and
          published through ``rds.publish`` on the channel returned by
          ``make_app_watcher_channel_name(cluster, appname)``;
        * a pod labelled ``kae-job-name`` is handed to
          ``handle_job_pod_event.delay`` together with the raw object, except
          for ``DELETED`` events, which are skipped.

        The loop never returns: when the watch ends or raises, a new watch is
        opened, resuming from the last resource version that was seen.

        :param cluster: name of the cluster to watch.
        """
        label_selector = "kae-type in (app, job)"
        last_seen_version = None
        while True:
            # Build the arguments once; ``resource_version`` is only passed
            # after at least one event has been observed.
            watch_kwargs = {
                'cluster_name': cluster,
                'label_selector': label_selector,
            }
            if last_seen_version is not None:
                watch_kwargs['resource_version'] = last_seen_version

            # NOTE(review): failures are retried immediately and with the same
            # resource version; if the API server keeps rejecting it this loop
            # spins without delay -- confirm whether a backoff/reset is wanted.
            try:
                watcher = KubeApi.instance().watch_pods(**watch_kwargs)

                for event in watcher:
                    obj = event['object']
                    labels = obj.metadata.labels or {}
                    last_seen_version = obj.metadata.resource_version

                    if 'kae-app-name' in labels:
                        appname = labels['kae-app-name']
                        channel = make_app_watcher_channel_name(cluster, appname)
                        data = {
                            'object': obj.to_dict(),
                            'action': event['type'],
                        }
                        rds.publish(message=json.dumps(data, cls=VersatileEncoder), channel=channel)
                    elif 'kae-job-name' in labels:
                        if event['type'] == 'DELETED':
                            continue
                        jobname = labels['kae-job-name']
                        handle_job_pod_event.delay(jobname, event['raw_object'])
            except ProtocolError:
                logger.warning('skip this error... because kubernetes disconnect client after default 10m...')
            except Exception:
                logger.exception("watch pods workers error")
Exemple #4
0
def save_pod_log(jobname, podname, version=0):
    """Fetch the log of a pod and store it with ``save_job_log``.

    :param jobname: job name forwarded to ``save_job_log`` as ``job_name``.
    :param podname: name of the pod whose log is fetched.
    :param version: version forwarded to ``save_job_log`` (default 0).
    :raises ApiException: for any kubernetes API error other than 404.

    A 404 from kubernetes makes the function return without saving anything.
    Errors raised while saving are logged and not propagated.
    """
    try:
        resp = KubeApi.instance().get_pod_log(podname=podname)
    except ApiException as e:
        if e.status == 404:
            return
        # Bare raise keeps the original traceback intact.
        raise
    try:
        save_job_log(job_name=jobname, resp=resp, version=version)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that SystemExit,
        # KeyboardInterrupt and greenlet kill signals are not swallowed.
        logger.exception("Error when get pod log")
Exemple #5
0
def get_job_log_events(socket, jobname):
    """
    SSE endpoint for job log
    ---
    responses:
      200:
        description: event stream
        schema:
          type: object
    """
    namespace = DEFAULT_JOB_NS

    def send_error(text):
        # Every failure is reported to the client as {"error": <text>}.
        socket.send(json.dumps({"error": text}))

    if not Job.get_by_name(jobname):
        send_error("job {} not found".format(jobname))
        return

    with session_removed():
        try:
            pod_list = KubeApi.instance().get_job_pods(jobname,
                                                       namespace=namespace)
        except ApiException as err:
            send_error("Error when get job pods: {}".format(str(err)))
            return

        if not pod_list.items:
            send_error("no log, please retry")
            return

        # Only the first pod listed for the job is followed.
        first_pod_name = pod_list.items[0].metadata.name
        try:
            log_lines = KubeApi.instance().follow_pod_log(
                podname=first_pod_name, namespace=namespace)
            # Each log line is forwarded as its own {"data": <line>} message.
            for log_line in log_lines:
                socket.send(json.dumps({'data': log_line}))
        except ApiException as err:
            send_error(
                "Error when follow job log, please retry: {}".format(
                    str(err)))
Exemple #6
0
def list_cluster():
    """
    List the names of all the available clusters
    ---
    responses:
      200:
        description: available cluster list
        schema:
          type: array
          items:
            type: string
        examples:
          application/json: [
            "cluster1",
            "cluster2",
            ]
    """
    # The names are read straight from the KubeApi instance.
    kube = KubeApi.instance()
    return kube.cluster_names
Exemple #7
0
def restart_job(jobname):
    """
    Restart a single job
    ---
    parameters:
      - name: jobname
        in: path
        type: string
        required: true
    responses:
      200:
        description: error message
        schema:
          $ref: '#/definitions/Error'
        examples:
          application/json:
            error: null
    """
    job = Job.get_by_name(jobname)
    if not job:
        abort(404, "job {} not found".format(jobname))

    job.inc_version()

    with handle_k8s_err("Error when delete job"):
        KubeApi.instance().delete_job(jobname,
                                      ignore_404=True,
                                      namespace=DEFAULT_JOB_NS)
    specs = job.specs
    # FIXME: need to wait the old job to be deleted
    # Poll every 2 seconds until kubernetes answers 404 for the old job.
    # NOTE(review): there is no upper bound on this wait; the request blocks
    # for as long as the old job exists -- confirm whether a timeout is wanted.
    while True:
        try:
            KubeApi.instance().get_job(jobname, namespace=DEFAULT_JOB_NS)
        except ApiException as e:
            if e.status == 404:
                break
            logger.exception("kubernetes error")
            abort(500, "kubernetes error")
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that SystemExit,
            # KeyboardInterrupt and greenlet kill signals are not turned into
            # an HTTP 500.
            logger.exception("kubernetes error")
            abort(500, "kubernetes error")
        time.sleep(2)
    with handle_k8s_err("Error when create job"):
        KubeApi.instance().create_job(specs, namespace=DEFAULT_JOB_NS)
    return DEFAULT_RETURN_VALUE
Exemple #8
0
def create_job(args):
    """
    create a new job
    ---
    responses:
      200:
        description: Error message
        schema:
          type: object
          $ref: '#/definitions/Error'
        examples:
          application/json:
            error: null
    """
    # Flow: build a specs dict (from raw yaml text or from individual args),
    # validate it, insert the Job row, optionally clone the code, create the
    # kubernetes job, then grant the job to the current user.
    specs_text = args.get('specs_text', None)
    cluster = args.get('cluster', KubeApi.DEFAULT_CLUSTER)

    if specs_text:
        try:
            # specs_text comes from the request, so it is parsed with
            # safe_load: plain yaml.load can construct arbitrary Python
            # objects and requires an explicit Loader on PyYAML >= 6.
            yaml_dict = yaml.safe_load(specs_text)
        except yaml.YAMLError as e:
            return abort(403, 'specs text is invalid yaml {}'.format(str(e)))
    else:
        # construct specs dict from args
        command = shlex.split(args['command'])
        if args['shell'] or args.get('gpu', 0) > 0:
            # NOTE(review): commands of one or two tokens are never wrapped
            # because of the `len(command) > 2` guard -- confirm that this is
            # the intended threshold.
            if len(command) > 2 and (command[0] != 'sh' or command[1] != '-c'):
                command = ['sh', '-c'] + command

        yaml_dict = {
            'containers': [{
                'name': args['jobname'],
                'image': args['image'],
                'command': command,
            }]
        }
        copy_list = ('jobname', 'git', 'branch', 'commit', 'autoRestart',
                     'comment')
        for field in copy_list:
            if field in args:
                yaml_dict[field] = args[field]
        if 'gpu' in args:
            yaml_dict['containers'][0]['gpu'] = args['gpu']
    try:
        specs = load_job_specs(yaml_dict)
    except ValidationError as e:
        return abort(400, 'specs text is invalid {}'.format(str(e)))
    try:
        job = Job.create(name=specs.jobname,
                         git=specs.get('git'),
                         branch=specs.get('branch'),
                         commit=specs.get('commit'),
                         comment=specs.get('comment'),
                         status="Pending",
                         specs_text=yaml.dump(specs.to_dict()))
    except IntegrityError:
        return abort(400, 'job is duplicate')
    except ValueError as e:
        return abort(400, str(e))

    def clean_func():
        """
        clean database when got an error.
        :return:
        """
        job.delete()

    job_dir = os.path.join(JOBS_ROOT_DIR, specs.jobname)
    code_dir = os.path.join(job_dir, "code")
    if specs.git:
        try:
            cloner = Cloner(repo=specs.git,
                            dst_directory=code_dir,
                            branch=specs.branch,
                            commit_id=specs.commit)
            cloner.clone_and_copy()
        except Exception as e:
            # Same rollback as the kubernetes failure path below.
            clean_func()

            logger.exception("clone error")
            abort(500, "clone and copy code error: {}".format(str(e)))

    with handle_k8s_err("Error when create job", clean_func=clean_func):
        KubeApi.instance().create_job(specs,
                                      namespace=DEFAULT_JOB_NS,
                                      cluster_name=cluster)

    try:
        job.grant_user(g.user)
    except IntegrityError:
        # Best effort: presumably the grant already exists -- TODO confirm.
        pass

    return DEFAULT_RETURN_VALUE
Exemple #9
0
def validate_cluster_name(cluster):
    """Raise ValidationError when KubeApi reports that the cluster is absent."""
    exists = KubeApi.instance().cluster_exist(cluster)
    # Only a literal False rejects the name; any other return value passes.
    if exists is not False:
        return
    raise ValidationError("cluster {} not exists".format(cluster))
Exemple #10
0
def enter_pod(socket, appname):
    """Bridge a client websocket to an interactive shell inside a pod.

    The client must first send a message accepted by ``pod_entry_schema``
    (fields read here: ``podname``, ``cluster``, ``namespace`` and optional
    ``container``); invalid messages are answered with an error and the
    handshake is retried. After the app lookup and permission check a shell
    is opened with ``KubeApi.exec_shell`` and three loops run concurrently:

    * ``resp_sender`` forwards shell stdout/stderr to the client,
    * ``heartbeat_sender`` pings the client periodically,
    * the main loop forwards client messages to the shell's stdin.

    All loops stop once any of them sets ``need_exit``.

    :param socket: client socket offering ``receive()`` and ``send()``.
    :param appname: name of the app the user must be granted to.
    """
    payload = None
    while True:
        message = socket.receive()
        if message is None:
            # Client went away before sending a valid request.
            return
        try:
            payload = pod_entry_schema.loads(message)
            break
        except ValidationError as e:
            socket.send(json.dumps(e.messages))
        except JSONDecodeError as e:
            socket.send(json.dumps({'error': str(e)}))

    app = App.get_by_name(appname)
    if not app:
        socket.send(
            make_errmsg('app {} not found'.format(appname), jsonize=True))
        return

    if not g.user.granted_to_app(app):
        socket.send(
            make_errmsg(
                'You\'re not granted to this app, ask administrators for permission',
                jsonize=True))
        return

    # NOTE(review): podname/namespace/cluster come from the client payload and
    # are not checked against `app` in this function -- confirm that the
    # schema or exec_shell restricts them to pods of this app.
    args = payload.data
    podname = args['podname']
    cluster = args['cluster']
    namespace = args['namespace']
    container = args.get('container', None)
    sh = KubeApi.instance().exec_shell(podname,
                                       namespace=namespace,
                                       cluster_name=cluster,
                                       container=container)
    need_exit = False

    def heartbeat_sender():
        nonlocal need_exit
        # Ping slightly before the heartbeat timeout expires when possible.
        interval = WS_HEARTBEAT_TIMEOUT - 3
        if interval <= 0:
            interval = WS_HEARTBEAT_TIMEOUT

        try:
            while need_exit is False:
                time.sleep(interval)
                try:
                    # send a null character to client
                    logger.debug("send PING")
                    send_ping(socket)
                except WebSocketError:
                    need_exit = True
                    return
        finally:
            logger.debug("pod entry heartbeat greenlet exit")

    def resp_sender():
        nonlocal need_exit
        try:
            while sh.is_open() and need_exit is False:
                sh.update(timeout=1)
                if sh.peek_stdout():
                    msg = sh.read_stdout()
                    logger.debug("STDOUT: %s", msg)
                    socket.send(msg)
                if sh.peek_stderr():
                    msg = sh.read_stderr()
                    logger.debug("STDERR: %s", msg)
                    socket.send(msg)
        except ProtocolError:
            logger.warning('kubernetes disconnect client after default 10m...')
        except WebSocketError:
            logger.warning('client socket is closed')
        except Exception as e:
            logger.warning("unknown exception: {}".format(str(e)))
        finally:
            need_exit = True
            # This greenlet is the only reader of the exec stream, so it
            # releases it once reading stops; otherwise the stream stays open
            # after the client disconnects.
            try:
                sh.close()
            except Exception:
                logger.exception("error when close exec shell")
            logger.debug("exec output sender greenlet exit")

    gevent.spawn(resp_sender)
    gevent.spawn(heartbeat_sender)

    # to avoid lost mysql connection exception
    db.session.remove()
    try:
        while need_exit is False:
            # get command from client
            message = socket.receive()
            if message is None:
                logger.info("client socket closed")
                break
            sh.write_stdin(message)
    finally:
        # Signals both helper greenlets to stop.
        need_exit = True
        logger.debug("pod entry greenlet exit")
Exemple #11
0
def get_app_pods_events(socket, appname):
    """Stream pod events of an app (or its canary) to a client socket.

    The client first sends a message accepted by ``cluster_canary_schema``
    (fields read here: ``cluster`` and ``canary``). The current pods are then
    sent as ``ADDED`` events, after which every message published on the
    app's watcher channel is forwarded until the client disconnects.
    """
    last_active = time.time()

    # Handshake: keep reading until a valid request arrives or the client
    # closes the socket.
    while True:
        message = socket.receive()
        if message is None:
            return
        try:
            payload = cluster_canary_schema.loads(message)
        except ValidationError as err:
            socket.send(json.dumps(err.messages))
        except JSONDecodeError as err:
            socket.send(json.dumps({'error': str(err)}))
        else:
            break

    request = payload.data
    cluster = request['cluster']
    if request['canary']:
        target_name = "{}-canary".format(appname)
    else:
        target_name = appname
    channel = make_app_watcher_channel_name(cluster, target_name)
    namespace = DEFAULT_APP_NS

    app = App.get_by_name(appname)
    if not app:
        socket.send(
            make_errmsg('app {} not found'.format(appname), jsonize=True))
        return

    if not g.user.granted_to_app(app):
        socket.send(
            make_errmsg(
                'You\'re not granted to this app, ask administrators for permission',
                jsonize=True))
        return

    # since this request may pend long time, so we remove the db session
    # otherwise we may get error like `sqlalchemy.exc.TimeoutError: QueuePool limit of size 50 overflow 10 reached, connection timed out`
    with session_removed():
        current_pods = KubeApi.instance().get_app_pods(target_name,
                                                       cluster_name=cluster,
                                                       namespace=namespace)
        # Replay the pods that already exist as ADDED events.
        for pod in current_pods.to_dict()['items']:
            snapshot = {
                'object': pod,
                'action': "ADDED",
            }
            socket.send(json.dumps(snapshot, cls=VersatileEncoder))

        pubsub = rds.pubsub()
        pubsub.subscribe(channel)
        stop_requested = False

        def check_client_socket():
            # A None from receive() means the client closed the connection.
            nonlocal stop_requested
            while not stop_requested:
                if socket.receive() is None:
                    stop_requested = True
                    break

        def heartbeat_sender():
            # Ping only when nothing has been sent to the client recently.
            nonlocal stop_requested, last_active
            interval = WS_HEARTBEAT_TIMEOUT - 3
            if interval <= 0:
                interval = WS_HEARTBEAT_TIMEOUT

            while not stop_requested:
                idle = time.time() - last_active
                if idle > (interval - 1):
                    try:
                        send_ping(socket)
                        last_active = time.time()
                    except WebSocketError:
                        stop_requested = True
                        return
                else:
                    time.sleep(interval - idle)

        gevent.spawn(check_client_socket)
        gevent.spawn(heartbeat_sender)

        try:
            while not stop_requested:
                incoming = pubsub.get_message(timeout=30)
                if incoming is None or incoming['type'] != 'message':
                    continue

                body = incoming['data']
                if isinstance(body, bytes):
                    body = body.decode('utf-8')
                elif not isinstance(body, str):
                    # omit the initial message where the data is 1L
                    continue
                socket.send(body)
                last_active = time.time()
        finally:
            # need close the connection created by PUB/SUB,
            # otherwise it will cause too many redis connections
            pubsub.unsubscribe()
            pubsub.close()
            stop_requested = True
    logger.info("ws connection closed")
Exemple #12
0
 def start(self):
     """Spawn one ``watch_app_job_pods`` watcher per cluster known to KubeApi.

     Each spawned handle is stored in ``self.thread_map`` under the cluster
     name.
     """
     cluster_names = KubeApi.instance().cluster_names
     for cluster in cluster_names:
         logger.info("create watcher thread for cluster {}".format(cluster))
         watcher = spawn(self.watch_app_job_pods, cluster)
         self.thread_map[cluster] = watcher