Exemple #1
0
def remove_containers(task_id, cids, rmi=False):
    """Remove a batch of containers from the task's host.

    In order: set an ``eru:agent:<container_id>:container:flag`` key in
    redis for every container, remove each container's backends, publish
    the affected app names to service discovery, remove the containers
    from the host and, when `rmi` is true, remove the image of the task's
    version from that host.

    If any step raises, the task is finished with ``consts.TASK_FAILED``,
    the notifier publishes a failure and the exception is logged; nothing
    is re-raised.

    :param task_id: id handed to ``Task.get``; an error is logged and the
        function returns when no task is found.
    :param cids: ids handed to ``Container.get_multi``.
    :param rmi: when true, also remove the version's image.
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return

    notifier = TaskNotifier(task)
    containers = Container.get_multi(cids)
    container_ids = [c.container_id for c in containers]
    host = task.host
    try:
        # One flag key per container, all set in a single MSET.
        flags = {'eru:agent:%s:container:flag' % cid: 1 for cid in container_ids}
        rds.mset(**flags)
        for c in containers:
            remove_container_backends(c)
            current_flask.logger.info('Task<id=%s>: Container (cid=%s) backends removed',
                    task_id, c.container_id[:7])
        appnames = {c.appname for c in containers}
        publish_to_service_discovery(*appnames)

        dockerjob.remove_host_containers(containers, host)
        current_flask.logger.info('Task<id=%s>: Containers (cids=%s) removed', task_id, cids)
        if rmi:
            dockerjob.remove_image(task.version, host)
    except Exception as e:
        # `as e` is valid on Python 2.6+ and Python 3, unlike `except X, e`.
        task.finish_with_result(consts.TASK_FAILED)
        notifier.pub_fail()
        current_flask.logger.error('Task<id=%s>: Exception (e=%s)', task_id, e)
Exemple #2
0
def task_log(task_id):
    """Stream a task's log lines to the client over the request websocket.

    The stored log lines returned by the notifier are sent first.  If the
    task is already finished the function returns ``''``; otherwise every
    pubsub message received on ``task.publish_key`` is relayed until
    ``consts.PUB_END_MESSAGE`` arrives.

    :param task_id: id handed to ``Task.get``.
    :returns: ``'websocket closed'`` when the task does not exist, ``''``
        when the task is already finished, otherwise ``None``.
    """
    ws = request.environ['wsgi.websocket']

    task = Task.get(task_id)
    if not task:
        ws.close()
        _log.info('Task %s not found, close websocket', task_id)
        return 'websocket closed'

    notifier = TaskNotifier(task)
    pub = rds.pubsub()
    try:
        pub.subscribe(task.publish_key)

        for line in notifier.get_store_logs():
            ws.send(line)

        if task.finished:
            return ''

        for line in pub.listen():
            if line['data'] == consts.PUB_END_MESSAGE:
                break
            # Only relay real messages, not (un)subscribe notifications.
            if line['type'] != 'message':
                continue
            ws.send(line['data'])
    except geventwebsocket.WebSocketError as e:
        _log.exception(e)
    finally:
        # Release the pubsub on every exit path (early return, end
        # message, websocket error) instead of leaving it subscribed.
        pub.close()
Exemple #3
0
def task_log(task_id):
    """Stream a task's log lines to the client over the request websocket.

    The stored log lines returned by the notifier are sent first.  If the
    task is already finished the function returns ``''``; otherwise every
    pubsub message received on ``task.publish_key`` is relayed until
    ``code.PUB_END_MESSAGE`` arrives.

    :param task_id: id handed to ``Task.get``.
    :returns: ``'websocket closed'`` when the task does not exist, ``''``
        when the task is already finished, otherwise ``None``.
    """
    ws = request.environ['wsgi.websocket']

    task = Task.get(task_id)
    if not task:
        ws.close()
        logger.info('Task %s not found, close websocket', task_id)
        return 'websocket closed'

    notifier = TaskNotifier(task)
    pub = rds.pubsub()
    try:
        pub.subscribe(task.publish_key)

        for line in notifier.get_store_logs():
            ws.send(line)

        if task.finished:
            return ''

        for line in pub.listen():
            if line['data'] == code.PUB_END_MESSAGE:
                break
            # Only relay real messages, not (un)subscribe notifications.
            if line['type'] != 'message':
                continue
            ws.send(line['data'])
    except geventwebsocket.WebSocketError as e:
        logger.exception(e)
    finally:
        # Release the pubsub on every exit path (early return, end
        # message, websocket error) instead of leaving it subscribed.
        pub.close()
Exemple #4
0
def build_docker_image(task_id, base, file_path):
    """Pull the base image, then build and push the image of a task's version.

    Output of the pull, build and push steps is handed to the notifier's
    ``store_and_broadcast``.  After the push the image is removed from the
    host again.

    If any step raises, the task is finished with ``consts.TASK_FAILED``,
    ``task.reason`` is set from the exception, the notifier publishes a
    failure and the exception is logged; nothing is re-raised.

    :param task_id: id handed to ``Task.get``; an error is logged and the
        function returns when no task is found.
    :param base: base image reference in ``repo:tag`` form.
    :param file_path: forwarded to ``dockerjob.build_image``.
    """
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return

    _log.info('Task<id=%s>: Start on host %s', task_id, task.host.ip)
    notifier = TaskNotifier(task)

    host = task.host
    version = task.version

    try:
        # A `base` without ':' raises ValueError here and is handled by
        # the failure path below.
        repo, tag = base.split(':', 1)
        # Repositories not already under 'eru/' are prefixed with it.
        repo = repo if repo.startswith('eru/') else 'eru/' + repo.strip('/')
        _log.info('Task<id=%s>: Pull base image (base=%s)', task_id, base)
        notifier.store_and_broadcast(dockerjob.pull_image(host, repo, tag))

        _log.info('Task<id=%s>: Build image (base=%s)', task_id, base)
        notifier.store_and_broadcast(
            dockerjob.build_image(host, version, base, file_path))

        _log.info('Task<id=%s>: Push image (base=%s)', task_id, base)
        notifier.store_and_broadcast(dockerjob.push_image(host, version))
        dockerjob.remove_image(version, host)
    except Exception as e:
        task.finish(consts.TASK_FAILED)
        # `e.message` is empty for multi-argument exceptions and missing
        # on Python 3; fall back to str(e) in those cases.
        task.reason = str(getattr(e, 'message', '') or e)
        notifier.pub_fail()
        _log.error('Task<id=%s>, exception', task_id)
        _log.exception(e)
Exemple #5
0
def remove_containers(task_id, cids, rmi=False):
    """Remove a batch of containers from their host and finish the task.

    In order: mark every container as ``in_removal``, set the agent flag
    for the container ids, remove each container's backends, publish the
    affected app names to service discovery, sleep 3 seconds, remove the
    containers from the host and, when `rmi` is true, try to remove the
    image of the task's version (a failure there is only logged).

    On success the container records are deleted, the task is finished
    with ``consts.TASK_SUCCESS`` and the agent records and flags are
    removed.  On any other exception the task is finished with
    ``consts.TASK_FAILED`` and the exception is logged, not re-raised.

    :param task_id: id handed to ``Task.get``; an error is logged and the
        function returns when no task is found.
    :param cids: ids handed to ``Container.get_multi``; an error is
        logged and the function returns when nothing is found.
    :param rmi: when true, also try to remove the version's image.
    """
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return

    _log.info('Task<id=%s>: Start on host %s', task_id, task.host.ip)
    notifier = TaskNotifier(task)

    containers = Container.get_multi(cids)
    if not containers:
        # The original call had a %s placeholder but no argument.
        _log.error('Task (id=%s) no container found, quit', task_id)
        return

    # The host is taken from the first container, not from the task.
    host = containers[0].host

    for c in containers:
        c.in_removal = 1

    container_ids = [c.container_id for c in containers if c]
    try:
        set_flag_for_agent(container_ids)
        for c in containers:
            remove_container_backends(c)
            _log.info('Task<id=%s>: Container (cid=%s) backends removed', task_id, c.short_id)

        appnames = {c.appname for c in containers}
        publish_to_service_discovery(*appnames)

        time.sleep(3)

        dockerjob.remove_host_containers(containers, host)
        _log.info('Task<id=%s>: Containers (cids=%s) removed', task_id, cids)

        if rmi:
            # Image removal is best-effort: a failure is logged and the
            # task still counts as successful.
            try:
                dockerjob.remove_image(task.version, host)
            except Exception as e:
                # Two arguments need two placeholders, otherwise the
                # logging module reports a formatting error.
                _log.error('Task<id=%s>, fail to remove image (e=%s)', task_id, e)
    except Exception as e:
        task.finish(consts.TASK_FAILED)
        # `e.message` is empty for multi-argument exceptions and missing
        # on Python 3; fall back to str(e) in those cases.
        task.reason = str(getattr(e, 'message', '') or e)
        notifier.pub_fail()
        _log.error('Task<id=%s> exception', task_id)
        _log.exception(e)
    else:
        for c in containers:
            c.delete()
        task.finish(consts.TASK_SUCCESS)
        task.reason = 'ok'
        notifier.pub_success()
        remove_container_for_agent(host, container_ids)
        remove_flag_for_agent(container_ids)
        _log.info('Task<id=%s>: Done', task_id)
Exemple #6
0
def build_docker_image(task_id, base, file_path):
    """Pull the base image, then build and push the image of a task's version.

    Output of the pull, build and push steps is handed to the notifier's
    ``store_and_broadcast``.  After the push the image is removed from the
    host again.

    If any step raises, the task is finished with ``consts.TASK_FAILED``,
    ``task.reason`` is set from the exception, the notifier publishes a
    failure and the exception is logged; nothing is re-raised.

    :param task_id: id handed to ``Task.get``; an error is logged and the
        function returns when no task is found.
    :param base: base image reference in ``repo:tag`` form.
    :param file_path: forwarded to ``dockerjob.build_image``.
    """
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return

    _log.info('Task<id=%s>: Start on host %s', task_id, task.host.ip)
    notifier = TaskNotifier(task)

    host = task.host
    version = task.version

    try:
        # A `base` without ':' raises ValueError here and is handled by
        # the failure path below.
        repo, tag = base.split(':', 1)
        # Repositories not already under 'eru/' are prefixed with it.
        repo = repo if repo.startswith('eru/') else 'eru/' + repo.strip('/')
        _log.info('Task<id=%s>: Pull base image (base=%s)', task_id, base)
        notifier.store_and_broadcast(dockerjob.pull_image(host, repo, tag))

        _log.info('Task<id=%s>: Build image (base=%s)', task_id, base)
        notifier.store_and_broadcast(dockerjob.build_image(host, version, base, file_path))

        _log.info('Task<id=%s>: Push image (base=%s)', task_id, base)
        notifier.store_and_broadcast(dockerjob.push_image(host, version))
        dockerjob.remove_image(version, host)
    except Exception as e:
        task.finish(consts.TASK_FAILED)
        # `e.message` is empty for multi-argument exceptions and missing
        # on Python 3; fall back to str(e) in those cases.
        task.reason = str(getattr(e, 'message', '') or e)
        notifier.pub_fail()
        _log.error('Task<id=%s>, exception', task_id)
        _log.exception(e)
Exemple #7
0
def remove_containers(task_id, cids, rmi=False):
    """Remove a batch of containers from the task's host and finish the task.

    The containers are flagged in redis, their backends are removed, the
    affected apps are republished to service discovery and, after a
    3 second pause, the containers are removed from the host.  With `rmi`
    set, removal of the version's image is attempted as well; a failure
    there is only logged.

    On success the container records are deleted, the task is finished
    with ``consts.TASK_SUCCESS`` and the redis meta entries and flags are
    dropped; on error the task is finished with ``consts.TASK_FAILED``.
    In both cases the falcon alarms of the version are removed once the
    version has no containers left.

    :param task_id: id handed to ``Task.get``.
    :param cids: ids handed to ``Container.get_multi``.
    :param rmi: when true, also try to remove the version's image.
    """
    task = Task.get(task_id)
    log = current_flask.logger
    if not task:
        log.error('Task (id=%s) not found, quit', task_id)
        return

    log.info('Task<id=%s>: Start on host %s' % (task_id, task.host.ip))
    notifier = TaskNotifier(task)
    targets = Container.get_multi(cids)
    docker_ids = [item.container_id for item in targets if item]
    host = task.host
    version = task.version
    try:
        # flag, don't report these
        flags = dict(
            ('eru:agent:%s:container:flag' % docker_id, 1)
            for docker_id in docker_ids
        )
        rds.mset(**flags)

        for item in targets:
            remove_container_backends(item)
            log.info('Task<id=%s>: Container (cid=%s) backends removed',
                     task_id, item.container_id[:7])

        publish_to_service_discovery(*set(item.appname for item in targets))

        time.sleep(3)

        dockerjob.remove_host_containers(targets, host)
        log.info('Task<id=%s>: Containers (cids=%s) removed', task_id, cids)

        if rmi:
            # Image removal is best-effort; a failure does not fail the task.
            try:
                dockerjob.remove_image(task.version, host)
            except Exception as rm_err:
                log.error('Task<id=%s>: Exception (e=%s), fail to remove image', task_id, rm_err)
    except Exception as err:
        task.finish_with_result(consts.TASK_FAILED)
        notifier.pub_fail()
        log.error('Task<id=%s>: Exception (e=%s)', task_id, err)
    else:
        for item in targets:
            item.delete()
        task.finish_with_result(consts.TASK_SUCCESS)
        notifier.pub_success()
        if docker_ids:
            meta_key = 'eru:agent:%s:containers:meta' % host.name
            rds.hdel(meta_key, *docker_ids)
        rds.delete(*flags)
        log.info('Task<id=%s>: Done', task_id)

    remaining = version.containers.count()
    if not remaining:
        falcon_remove_alarms(version)
Exemple #8
0
def build_docker_image(task_id, base):
    """Pull the base image, then build and push the image of a task's version.

    Output of the pull, build and push steps is handed to the notifier's
    ``store_and_broadcast``.  Afterwards removal of the image from the
    host is attempted; a failure there is logged and otherwise ignored.

    If any other step raises, the exception is logged, the task is
    finished with ``code.TASK_FAILED`` and the notifier publishes a
    failure; nothing is re-raised.

    :param task_id: id handed to ``Task.get``; an error is logged and the
        function returns when no task is found.
    :param base: base image reference in ``repo:tag`` form.
    """
    task = Task.get(task_id)
    if not task:
        # Without this guard a missing task fails on attribute access.
        logger.error('Task (id=%s) not found, quit', task_id)
        return

    notifier = TaskNotifier(task)
    try:
        repo, tag = base.split(':', 1)
        notifier.store_and_broadcast(dockerjob.pull_image(task.host, repo, tag))
        notifier.store_and_broadcast(dockerjob.build_image(task.host, task.version, base))
        notifier.store_and_broadcast(dockerjob.push_image(task.host, task.version))
        try:
            dockerjob.remove_image(task.version, task.host)
        except Exception as rm_err:
            # Still best-effort, but no longer silent, and interpreter
            # exits / KeyboardInterrupt are no longer swallowed.
            logger.warning('Task<id=%s>: Exception (e=%s), fail to remove image',
                           task_id, rm_err)
    except Exception as e:
        logger.exception(e)
        task.finish_with_result(code.TASK_FAILED)
        notifier.pub_fail()
Exemple #9
0
def remove_containers(task_id, cids, rmi):
    """Remove a batch of containers from the task's host.

    In order: set an ``eru:agent:<container_id>:container:flag`` key in
    redis for every container, remove each container's backends, publish
    the affected app names to service discovery, remove the containers
    from the host and, when `rmi` is true, remove the image of the task's
    version from that host.

    If any step raises, the exception is logged, the task is finished
    with ``code.TASK_FAILED`` and the notifier publishes a failure;
    nothing is re-raised.

    :param task_id: id handed to ``Task.get``; an error is logged and the
        function returns when no task is found.
    :param cids: ids handed to ``Container.get_multi``.
    :param rmi: when true, also remove the version's image.
    """
    task = Task.get(task_id)
    if not task:
        # Without this guard a missing task fails on attribute access.
        logger.error('Task (id=%s) not found, quit', task_id)
        return

    notifier = TaskNotifier(task)
    containers = Container.get_multi(cids)
    container_ids = [c.container_id for c in containers]
    host = task.host
    try:
        # One flag key per container, all set in a single MSET.
        flags = {'eru:agent:%s:container:flag' % cid: 1 for cid in container_ids}
        rds.mset(**flags)
        for c in containers:
            remove_container_backends(c)
        appnames = {c.appname for c in containers}
        publish_to_service_discovery(*appnames)

        dockerjob.remove_host_containers(containers, host)
        if rmi:
            dockerjob.remove_image(task.version, host)
    except Exception as e:
        logger.exception(e)
        task.finish_with_result(code.TASK_FAILED)
        notifier.pub_fail()
Exemple #10
0
def build_docker_image(task_id, base):
    """Pull the base image, then build and push the image of a task's version.

    Output of the pull, build and push steps is handed to the notifier's
    ``store_and_broadcast``.  After the push the image is removed from the
    host again.

    If any step raises, the task is finished with ``consts.TASK_FAILED``,
    the notifier publishes a failure and the exception is logged; nothing
    is re-raised.

    :param task_id: id handed to ``Task.get``; an error is logged and the
        function returns when no task is found.
    :param base: base image reference in ``repo:tag`` form.
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return

    notifier = TaskNotifier(task)
    try:
        # A `base` without ':' raises ValueError here and is handled by
        # the failure path below.
        repo, tag = base.split(':', 1)
        current_flask.logger.info('Task<id=%s>: Pull base image (base=%s)', task_id, base)
        notifier.store_and_broadcast(dockerjob.pull_image(task.host, repo, tag))
        current_flask.logger.info('Task<id=%s>: Build image (base=%s)', task_id, base)
        notifier.store_and_broadcast(dockerjob.build_image(task.host, task.version, base))
        current_flask.logger.info('Task<id=%s>: Push image (base=%s)', task_id, base)
        notifier.store_and_broadcast(dockerjob.push_image(task.host, task.version))
        dockerjob.remove_image(task.version, task.host)
    except Exception as e:
        # `as e` is valid on Python 2.6+ and Python 3, unlike `except X, e`.
        task.finish_with_result(consts.TASK_FAILED)
        notifier.pub_fail()
        current_flask.logger.error('Task<id=%s>: Exception (e=%s)', task_id, e)
Exemple #11
0
def remove_containers(task_id, cids, rmi=False):
    """Remove a batch of containers from their host and finish the task.

    In order: mark every container as ``in_removal``, set the agent flag
    for the container ids, remove each container's backends, publish the
    affected app names to service discovery, sleep 3 seconds, remove the
    containers from the host and, when `rmi` is true, try to remove the
    image of the task's version (a failure there is only logged).

    On success the container records are deleted, the task is finished
    with ``consts.TASK_SUCCESS`` and the agent records and flags are
    removed.  On any other exception the task is finished with
    ``consts.TASK_FAILED`` and the exception is logged, not re-raised.

    :param task_id: id handed to ``Task.get``; an error is logged and the
        function returns when no task is found.
    :param cids: ids handed to ``Container.get_multi``; an error is
        logged and the function returns when nothing is found.
    :param rmi: when true, also try to remove the version's image.
    """
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return

    _log.info('Task<id=%s>: Start on host %s', task_id, task.host.ip)
    notifier = TaskNotifier(task)

    containers = Container.get_multi(cids)
    if not containers:
        # The original call had a %s placeholder but no argument.
        _log.error('Task (id=%s) no container found, quit', task_id)
        return

    # The host is taken from the first container, not from the task.
    host = containers[0].host

    for c in containers:
        c.in_removal = 1

    container_ids = [c.container_id for c in containers if c]
    try:
        set_flag_for_agent(container_ids)
        for c in containers:
            remove_container_backends(c)
            _log.info('Task<id=%s>: Container (cid=%s) backends removed',
                      task_id, c.short_id)

        appnames = {c.appname for c in containers}
        publish_to_service_discovery(*appnames)

        time.sleep(3)

        dockerjob.remove_host_containers(containers, host)
        _log.info('Task<id=%s>: Containers (cids=%s) removed', task_id, cids)

        if rmi:
            # Image removal is best-effort: a failure is logged and the
            # task still counts as successful.
            try:
                dockerjob.remove_image(task.version, host)
            except Exception as e:
                # Two arguments need two placeholders, otherwise the
                # logging module reports a formatting error.
                _log.error('Task<id=%s>, fail to remove image (e=%s)', task_id, e)
    except Exception as e:
        task.finish(consts.TASK_FAILED)
        # `e.message` is empty for multi-argument exceptions and missing
        # on Python 3; fall back to str(e) in those cases.
        task.reason = str(getattr(e, 'message', '') or e)
        notifier.pub_fail()
        _log.error('Task<id=%s> exception', task_id)
        _log.exception(e)
    else:
        for c in containers:
            c.delete()
        task.finish(consts.TASK_SUCCESS)
        task.reason = 'ok'
        notifier.pub_success()
        remove_container_for_agent(host, container_ids)
        remove_flag_for_agent(container_ids)
        _log.info('Task<id=%s>: Done', task_id)
Exemple #12
0
def create_containers_with_macvlan(task_id, ncontainer, core_ids, network_ids):
    """
    Run the task `task_id`: deploy `ncontainer` containers that occupy the
    cores in `core_ids` and are bound to the subnets in `network_ids`.

    The cores are split into chunks of ``len(core_ids) // ncontainer`` and
    one container is created per chunk.  For each container one ip per
    network is acquired and published to the host's agent channel; the
    function then waits (15s per ip) for the agent's feedback.  Only when
    every feedback reports success is the container recorded and
    registered; otherwise the container is removed and its cores and ips
    are released.

    The task is finished with ``code.TASK_SUCCESS`` and the list of
    created container ids, whatever that list contains.

    :param task_id: id handed to ``Task.get``; returns silently when no
        task is found.
    :param ncontainer: number of containers to deploy.
    :param core_ids: ids handed to ``Core.get_multi``.
    :param network_ids: ids handed to ``Network.get_multi``.
    """
    task = Task.get(task_id)
    if not task:
        return

    networks = Network.get_multi(network_ids)

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    used_cores = Core.get_multi(core_ids)

    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id

    cids = []

    # Floor division: same result as `/` on Python 2 ints, and still an
    # integer chunk size on Python 3.
    for cores in more_itertools.chunked(used_cores, len(core_ids) // ncontainer):
        try:
            cid, cname = dockerjob.create_one_container(host, version,
                    entrypoint, env, cores)
        except Exception as e:
            # Log instead of swallowing silently, then give the cores back.
            logger.exception(e)
            host.release_cores(cores)
            continue

        ips = [n.acquire_ip() for n in networks]
        # Drop falsy results so a failed acquisition cannot break the
        # attribute lookups below.
        ips = [ip for ip in ips if ip]
        ip_dict = {ip.vlan_address: ip for ip in ips}

        if ips:
            ident_id = cname.split('_')[-1]
            values = [str(task_id), cid, ident_id] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
            rds.publish(pub_agent_vlan_key, '|'.join(values))

        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break
            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)

        else:
            # Reached only when the loop above was not broken, i.e. every
            # expected feedback arrived and reported success (or there
            # were no ips to wait for).
            logger.info('Creating container with cid %s and ips %s' % (cid, ips))
            c = Container.create(cid, host, version, cname, entrypoint, cores, env)
            for ip in ips:
                ip.assigned_to_container(c)
            notifier.notify_agent(cid)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # skip the cleanup below
            continue

        # clean up the failed container: release its cores and ips
        logger.info('Cleaning failed container with cid %s' % cid)
        dockerjob.remove_container_by_cid([cid], host)
        host.release_cores(cores)
        for ip in ips:
            ip.release()
        # Clear leftover feedback so it is not read as the answer for the
        # next container of this task.
        rds.delete(feedback_key)

    publish_to_service_discovery(version.name)
    task.finish_with_result(code.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
Exemple #13
0
def create_containers_with_macvlan_public(task_id, ncontainer, nshare, network_ids, spec_ips=None):
    """
    Run the task `task_id`: deploy `ncontainer` containers bound to the
    subnets in `network_ids`.

    Containers are created without dedicated cores and with a fixed cpu
    share of 1024.  For each container one ip per network is acquired
    (the specific ones in `spec_ips` when given) and published to the
    host's agent channel; the function then waits (15s per ip) for the
    agent's feedback.  Only when every feedback reports success is the
    container recorded and registered; otherwise the container is removed,
    its ips are released and the feedback key is deleted.

    The task is finished with ``consts.TASK_SUCCESS`` and the list of
    created container ids, whatever that list contains.

    :param task_id: id handed to ``Task.get``; an error is logged and the
        function returns when no task is found.
    :param ncontainer: number of creation attempts.
    :param nshare: forwarded to ``Container.create``.
    :param network_ids: ids handed to ``Network.get_multi``.
    :param spec_ips: optional ips, paired positionally with the networks.
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return

    if spec_ips is None:
        spec_ips = []

    networks = Network.get_multi(network_ids)

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    # use raw
    image = task.props['image']
    cpu_shares = 1024

    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id

    cids = []

    for _ in range(ncontainer):
        try:
            cid, cname = dockerjob.create_one_container(host, version,
                entrypoint, env, cores=None, cpu_shares=cpu_shares, image=image)
        except Exception as e:
            # Use the logger rather than a Python 2 `print` statement so
            # the failure reaches the log with its traceback.
            current_flask.logger.exception(e)
            continue

        if spec_ips:
            ips = [n.acquire_specific_ip(ip) for n, ip in zip(networks, spec_ips)]
        else:
            ips = [n.acquire_ip() for n in networks]
        # Keep only the acquisitions that returned something.
        ips = [i for i in ips if i]
        ip_dict = {ip.vlan_address: ip for ip in ips}

        if ips:
            ident_id = cname.split('_')[-1]
            values = [str(task_id), cid, ident_id] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
            rds.publish(pub_agent_vlan_key, '|'.join(values))

        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break
            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)

        else:
            # Reached only when the loop above was not broken, i.e. every
            # expected feedback arrived and reported success (or there
            # were no ips to wait for).
            current_flask.logger.info('Creating container (cid=%s, ips=%s)', cid, ips)
            c = Container.create(cid, host, version, cname, entrypoint, {}, env, nshare)
            for ip in ips:
                ip.assigned_to_container(c)
            notifier.notify_agent(cid)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # skip the cleanup below
            continue

        # clean up the failed container and release its ips
        current_flask.logger.info('Cleaning failed container (cid=%s)', cid)
        dockerjob.remove_container_by_cid([cid], host)
        for ip in ips:
            ip.release()
        # on failure this key has to be cleared
        rds.delete(feedback_key)

    publish_to_service_discovery(version.name)
    task.finish_with_result(consts.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
    current_flask.logger.info('Task<id=%s>: Done', task_id)
Exemple #14
0
def create_containers(task_id, ncontainer, nshare, cores, network_ids, spec_ips=None):
    """
    Run the task `task_id`: deploy `ncontainer` containers, occupying the
    cores in *_core_ids, bound to the subnets in networks.

    For every core allocation yielded by ``_iter_cores`` a container is
    created on the host, recorded, and given ips through ``ipam``.  A
    container whose creation fails has its cores released; one whose ip
    allocation fails is cleaned up.  If the entrypoint defines a
    ``health_check`` and it does not pass, the function returns without
    finishing the task; otherwise the task is finished with
    ``consts.TASK_SUCCESS``.
    """
    _log.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return

    spec_ips = [] if spec_ips is None else spec_ips

    need_network = bool(network_ids)
    pools = [ipam.get_pool(network_id) for network_id in network_ids]

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    props = task.props
    entrypoint = props['entrypoint']
    env = props['env']
    ports = props['ports']
    args = props['args']
    # use raw
    image = props['image']
    callback_url = props['callback_url']

    if nshare:
        cpu_shares = int(float(nshare) / host.pod.core_share * 1024)
    else:
        cpu_shares = 1024

    created_cids = []
    all_backends = []
    entry = version.appconfig.entrypoints[entrypoint]

    for full_cores, part_cores in _iter_cores(cores, ncontainer):
        container_cores = {'full': full_cores, 'part': part_cores}

        # create the container on the host
        try:
            cid, cname = dockerjob.create_one_container(
                host, version, entrypoint, env, full_cores + part_cores,
                ports=ports, args=args, cpu_shares=cpu_shares,
                image=image, need_network=need_network)
        except Exception as err:
            # written for the celery log
            _log.exception(err)
            host.release_cores(container_cores, nshare)
            continue

        # record the container
        container = Container.create(cid, host, version, cname, entrypoint,
                                     container_cores, env, nshare, callback_url)

        # Create the network stack for the container and record all the
        # related information.  On failure, clear every record and the
        # container on the host, then move on to the next attempt.
        cidrs = [pool.netspace for pool in pools]
        allocated = ipam.allocate_ips(cidrs, cid, spec_ips)
        if not allocated:
            _clean_failed_containers(cid)
            continue

        notifier.notify_agent(container)
        add_container_for_agent(host, container)
        add_container_backends(container)
        created_cids.append(cid)
        all_backends.extend(container.get_backends())

        container.callback_report(status='start')

    health_check = entry.get('health_check', '')
    if health_check and all_backends:
        urls = [backend + health_check for backend in all_backends]
        healthy = wait_health_check(urls)
        if not healthy:
            # TODO: either roll back or raise an alert here
            _log.info('Task<id=%s>: Done, but something went error', task_id)
            return

    publish_to_service_discovery(version.name)
    task.finish(consts.TASK_SUCCESS)
    task.reason = 'ok'
    task.container_ids = created_cids
    notifier.pub_success()

    _log.info('Task<id=%s>: Done', task_id)
Exemple #15
0
def create_containers(task_id,
                      ncontainer,
                      nshare,
                      cores,
                      network_ids,
                      spec_ips=None):
    """
    Run the task `task_id`: deploy `ncontainer` containers, occupying the
    cores in *_core_ids, bound to the subnets in networks.

    One container is attempted per core allocation yielded by
    ``_iter_cores``.  Creation failures release the cores, ip allocation
    failures clean the container up, and in both cases the loop moves on.
    When the entrypoint has a ``health_check`` that does not pass, the
    function returns early without finishing the task; otherwise the task
    is finished with ``consts.TASK_SUCCESS``.
    """
    _log.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return

    if spec_ips is None:
        spec_ips = []

    need_network = bool(network_ids)
    ip_pools = [ipam.get_pool(net_id) for net_id in network_ids]

    notifier = TaskNotifier(task)
    host, version = task.host, task.version
    task_props = task.props
    entrypoint = task_props['entrypoint']
    env = task_props['env']
    ports = task_props['ports']
    args = task_props['args']
    # use raw
    image = task_props['image']
    callback_url = task_props['callback_url']

    cpu_shares = 1024
    if nshare:
        cpu_shares = int(float(nshare) / host.pod.core_share * 1024)

    done_cids = []
    collected_backends = []
    entry = version.appconfig.entrypoints[entrypoint]

    for full, part in _iter_cores(cores, ncontainer):
        allotted = {'full': full, 'part': part}
        create_kwargs = dict(
            ports=ports,
            args=args,
            cpu_shares=cpu_shares,
            image=image,
            need_network=need_network,
        )
        # create the container on the host
        try:
            cid, cname = dockerjob.create_one_container(
                host, version, entrypoint, env, full + part, **create_kwargs)
        except Exception as exc:
            # written for the celery log
            _log.exception(exc)
            host.release_cores(allotted, nshare)
            continue

        # record the container
        record = Container.create(cid, host, version, cname, entrypoint,
                                  allotted, env, nshare, callback_url)

        # Create the network stack for the container and record all the
        # related information.  On failure, clear every record and the
        # container on the host, then move on to the next attempt.
        cidrs = [pool.netspace for pool in ip_pools]
        if not ipam.allocate_ips(cidrs, cid, spec_ips):
            _clean_failed_containers(cid)
            continue

        notifier.notify_agent(record)
        add_container_for_agent(host, record)
        add_container_backends(record)
        done_cids.append(cid)
        collected_backends.extend(record.get_backends())

        record.callback_report(status='start')

    health_check = entry.get('health_check', '')
    if health_check and collected_backends:
        check_urls = [b + health_check for b in collected_backends]
        if not wait_health_check(check_urls):
            # TODO: either roll back or raise an alert here
            _log.info('Task<id=%s>: Done, but something went error', task_id)
            return

    publish_to_service_discovery(version.name)
    task.finish(consts.TASK_SUCCESS)
    task.reason = 'ok'
    task.container_ids = done_cids
    notifier.pub_success()

    _log.info('Task<id=%s>: Done', task_id)
Exemple #16
0
def create_containers_with_macvlan(task_id, ncontainer, nshare, cores, network_ids, spec_ips=None):
    """
    Run the task `task_id`: deploy `ncontainer` containers, occupying the
    cores in *_core_ids, bound to the subnets in networks.

    One container is attempted per core allocation yielded by
    ``_iter_cores``.  For each container one ip per network is acquired
    (the specific ones in `spec_ips` when given) and handed to the host's
    agent, via redis pubsub or the agent's http api depending on
    ``ERU_AGENT_API``.  The function then waits (15s per ip) for feedback
    on the task's feedback key.  Only when every feedback reports success
    is the container recorded and registered; otherwise the container is
    removed, its cores and ips are released and the feedback key is
    deleted.

    The task is finished with ``consts.TASK_SUCCESS`` and the list of
    created container ids, whatever that list contains; the falcon graphs
    and alarms of the version are updated afterwards.
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return

    if spec_ips is None:
        spec_ips = []

    need_network = bool(network_ids)
    networks = Network.get_multi(network_ids)

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    ports = task.props['ports']
    args = task.props['args']
    # use raw
    route = task.props['route']
    image = task.props['image']
    callback_url = task.props['callback_url']
    cpu_shares = int(float(nshare) / host.pod.core_share * 1024) if nshare else 1024

    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    pub_agent_route_key = 'eru:agent:%s:route' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id

    cids = []

    for fcores, pcores in _iter_cores(cores, ncontainer):
        cores_for_one_container = {'full': fcores, 'part': pcores}
        try:
            cid, cname = dockerjob.create_one_container(host, version,
                entrypoint, env, fcores+pcores, ports=ports, args=args,
                cpu_shares=cpu_shares, image=image, need_network=need_network)
        except Exception as e:
            # Use the logger rather than a Python 2 `print` statement so
            # the failure reaches the log with its traceback.
            current_flask.logger.exception(e)
            host.release_cores(cores_for_one_container, nshare)
            continue

        if spec_ips:
            ips = [n.acquire_specific_ip(ip) for n, ip in zip(networks, spec_ips)]
        else:
            ips = [n.acquire_ip() for n in networks]
        # Keep only the acquisitions that returned something.
        ips = [i for i in ips if i]
        ip_dict = {ip.vlan_address: ip for ip in ips}

        if ips:
            if ERU_AGENT_API == 'pubsub':
                values = [str(task_id), cid] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
                rds.publish(pub_agent_vlan_key, '|'.join(values))
            elif ERU_AGENT_API == 'http':
                agent = get_agent(host)
                ip_list = [(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
                agent.add_container_vlan(cid, str(task_id), ip_list)

        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break
            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)

            # NOTE(review): the route is published once per successful ip
            # feedback, not once per container - confirm this is intended.
            if route:
                rds.publish(pub_agent_route_key, '%s|%s' % (cid, route))

        else:
            # Reached only when the loop above was not broken, i.e. every
            # expected feedback arrived and reported success (or there
            # were no ips to wait for).
            current_flask.logger.info('Creating container (cid=%s, ips=%s)', cid, ips)
            c = Container.create(cid, host, version, cname, entrypoint,
                    cores_for_one_container, env, nshare, callback_url)
            for ip in ips:
                ip.assigned_to_container(c)
            notifier.notify_agent(c)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # skip the cleanup below
            continue

        # clean up the failed container: release its cores and ips
        current_flask.logger.info('Cleaning failed container (cid=%s)', cid)
        dockerjob.remove_container_by_cid([cid], host)
        host.release_cores(cores_for_one_container, nshare)
        for ip in ips:
            ip.release()
        # on failure this key has to be cleared
        rds.delete(feedback_key)

    publish_to_service_discovery(version.name)
    task.finish_with_result(consts.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()

    # these do IO, so they are simply left for the very end
    falcon_all_graphs(version)
    falcon_all_alarms(version)

    current_flask.logger.info('Task<id=%s>: Done', task_id)
Exemple #17
0
def create_containers_with_macvlan(task_id, ncontainer, nshare, cores, network_ids, spec_ips=None):
    """
    Run the task `task_id`: deploy `ncontainer` containers, occupying the
    cores in *_core_ids, bound to the subnets in networks.

    The ``full`` and ``part`` core lists in `cores` are each split into
    chunks of ``len(list) // ncontainer`` and paired up; one container is
    attempted per pair.  For each container one ip per network is acquired
    (the specific ones in `spec_ips` when given) and published to the
    host's agent channel; the function then waits (15s per ip) for the
    agent's feedback.  Only when every feedback reports success is the
    container recorded and registered; otherwise the container is removed,
    its cores and ips are released and the feedback key is deleted.

    The task is finished with ``consts.TASK_SUCCESS`` and the list of
    created container ids, whatever that list contains.
    """
    # TODO support part core
    current_flask.logger.info("Task<id=%s>: Started", task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error("Task (id=%s) not found, quit", task_id)
        return

    if spec_ips is None:
        spec_ips = []

    networks = Network.get_multi(network_ids)

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props["entrypoint"]
    env = task.props["env"]
    # use raw
    image = task.props["image"]
    cpu_shares = int(float(nshare) / host.pod.core_share * 1024) if nshare else 1024

    pub_agent_vlan_key = "eru:agent:%s:vlan" % host.name
    feedback_key = "eru:agent:%s:feedback" % task_id

    cids = []

    full_cores, part_cores = cores.get("full", []), cores.get("part", [])
    # Floor division: same result as `/` on Python 2 ints, and still an
    # integer chunk size on Python 3.
    for fcores, pcores in izip_longest(
        chunked(full_cores, len(full_cores) // ncontainer),
        chunked(part_cores, len(part_cores) // ncontainer),
        fillvalue=[],
    ):
        cores_for_one_container = {"full": fcores, "part": pcores}
        try:
            cid, cname = dockerjob.create_one_container(
                host, version, entrypoint, env, fcores + pcores, cpu_shares, image=image
            )
        except Exception as e:
            # Log instead of swallowing silently, then give the cores back.
            current_flask.logger.exception(e)
            host.release_cores(cores_for_one_container, nshare)
            continue

        if spec_ips:
            ips = [n.acquire_specific_ip(ip) for n, ip in zip(networks, spec_ips)]
        else:
            ips = [n.acquire_ip() for n in networks]
        # Keep only the acquisitions that returned something.
        ips = [i for i in ips if i]
        ip_dict = {ip.vlan_address: ip for ip in ips}

        if ips:
            ident_id = cname.split("_")[-1]
            values = [str(task_id), cid, ident_id] + ["{0}:{1}".format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
            rds.publish(pub_agent_vlan_key, "|".join(values))

        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split("|")
            if succ == "0":
                break
            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)

        else:
            # Reached only when the loop above was not broken, i.e. every
            # expected feedback arrived and reported success (or there
            # were no ips to wait for).
            current_flask.logger.info("Creating container (cid=%s, ips=%s)", cid, ips)
            c = Container.create(cid, host, version, cname, entrypoint, cores_for_one_container, env, nshare)
            for ip in ips:
                ip.assigned_to_container(c)
            notifier.notify_agent(cid)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # skip the cleanup below
            continue

        # clean up the failed container: release its cores and ips
        current_flask.logger.info("Cleaning failed container (cid=%s)", cid)
        dockerjob.remove_container_by_cid([cid], host)
        host.release_cores(cores_for_one_container, nshare)
        for ip in ips:
            ip.release()
        # on failure this key has to be cleared
        rds.delete(feedback_key)

    publish_to_service_discovery(version.name)
    task.finish_with_result(consts.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
    current_flask.logger.info("Task<id=%s>: Done", task_id)