def test_create_container(client, test_db):
    """Deploy one container via the public API and verify the created Task.

    Posts a deploy request for one 'web' container and asserts the
    resulting Task records the right host/app/version, type and props.
    """
    # Skipped: cannot pass locally anyway (needs a live eru deployment).
    return
    app, version, group, pod, host = create_local_test_data()
    rv = client.post(
        '/api/deploy/public/group/pod/blueberry',
        data=json.dumps({
            'ncontainer': 1,
            'version': version.sha,
            'entrypoint': 'web',
            'env': 'prod'
        }),
        content_type='application/json')
    # Fixed: Werkzeug test responses expose `status_code`, not
    # `status_consts` (leftover of a `code` -> `consts` module rename).
    assert rv.status_code == 200
    r = json.loads(rv.data)
    assert len(r['tasks']) == 1
    task_id = r['tasks'][0]
    assert task_id
    task = Task.get(task_id)
    assert task.host_id == host.id
    assert task.app_id == app.id
    assert task.version_id == version.id
    assert task.type == consts.TASK_CREATE
    props = task.props
    assert props['ncontainer'] == 1
    assert props['entrypoint'] == 'web'
    assert props['cores'] == []
def task_log(task_id):
    """Stream a task's log to the caller over the request's websocket.

    Replays the already-stored log lines first, then — unless the task
    has already finished — relays live messages published on the task's
    redis pub/sub channel until the end marker arrives.
    """
    ws = request.environ['wsgi.websocket']
    task = Task.get(task_id)
    if not task:
        ws.close()
        logger.info('Task %s not found, close websocket' % task_id)
        return 'websocket closed'
    notifier = TaskNotifier(task)
    try:
        # Subscribe before replaying stored logs so messages published
        # in between are not lost.
        pub = rds.pubsub()
        pub.subscribe(task.publish_key)
        for line in notifier.get_store_logs():
            ws.send(line)
        # Finished tasks have no live stream to follow.
        if task.finished:
            return ''
        for line in pub.listen():
            # The publisher sends PUB_END_MESSAGE when the log is complete.
            if line['data'] == code.PUB_END_MESSAGE:
                break
            # pubsub also yields subscribe/confirm events; forward only
            # real messages.
            if line['type'] != 'message':
                continue
            ws.send(line['data'])
    except geventwebsocket.WebSocketError, e:
        # Client disconnected mid-stream; just log it.
        logger.exception(e)
def test_build_image(client, test_db):
    """Trigger an image build via the API and verify the created Task."""
    # Skipped: cannot pass locally anyway (needs a live eru deployment).
    return
    app, version, group, pod, host = create_local_test_data()
    rv = client.post(
        '/api/deploy/build/group/pod/blueberry',
        data=json.dumps({
            'base': 'containerops.cn/tonicbupt/ubuntu:python-2014.11.28',
            'version': version.sha
        }),
        content_type='application/json')
    # Fixed: Werkzeug test responses expose `status_code`, not
    # `status_consts` (leftover of a `code` -> `consts` module rename).
    assert rv.status_code == 200
    r = json.loads(rv.data)
    assert r[u'r'] == 0
    task_id = r[u'task']
    assert task_id
    task = Task.get(task_id)
    assert task.host_id == host.id
    assert task.app_id == app.id
    assert task.version_id == version.id
    assert task.type == consts.TASK_BUILD
    assert task.props == {
        'base': 'containerops.cn/tonicbupt/ubuntu:python-2014.11.28'
    }
def task_log(task_id):
    """Stream a task's log to the caller over the request's websocket.

    Replays the already-stored log lines first, then — unless the task
    has already finished — relays live messages published on the task's
    redis pub/sub channel until the end marker arrives.
    """
    ws = request.environ['wsgi.websocket']
    task = Task.get(task_id)
    if not task:
        ws.close()
        _log.info('Task %s not found, close websocket' % task_id)
        return 'websocket closed'
    notifier = TaskNotifier(task)
    try:
        # Subscribe before replaying stored logs so messages published
        # in between are not lost.
        pub = rds.pubsub()
        pub.subscribe(task.publish_key)
        for line in notifier.get_store_logs():
            ws.send(line)
        # Finished tasks have no live stream to follow.
        if task.finished:
            return ''
        for line in pub.listen():
            # The publisher sends PUB_END_MESSAGE when the log is complete.
            if line['data'] == consts.PUB_END_MESSAGE:
                break
            # pubsub also yields subscribe/confirm events; forward only
            # real messages.
            if line['type'] != 'message':
                continue
            ws.send(line['data'])
    except geventwebsocket.WebSocketError, e:
        # Client disconnected mid-stream; just log it.
        _log.exception(e)
def build_docker_image(task_id, base, file_path):
    """Pull the base image, build, push and clean up the task's image.

    Progress from every docker step is stored and broadcast via the
    task's TaskNotifier.  On any error the task is finished as
    TASK_FAILED with the exception message as reason and a failure event
    is published.

    NOTE(review): there is no success-path finalization here — the task
    is presumably marked successful elsewhere; confirm against callers.
    """
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return
    _log.info('Task<id=%s>: Start on host %s', task_id, task.host.ip)
    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    try:
        repo, tag = base.split(':', 1)
        # All base images are namespaced under eru/ in the registry.
        repo = repo if repo.startswith('eru/') else 'eru/' + repo.strip('/')
        _log.info('Task<id=%s>: Pull base image (base=%s)', task_id, base)
        notifier.store_and_broadcast(dockerjob.pull_image(host, repo, tag))
        _log.info('Task<id=%s>: Build image (base=%s)', task_id, base)
        notifier.store_and_broadcast(dockerjob.build_image(host, version, base, file_path))
        _log.info('Task<id=%s>: Push image (base=%s)', task_id, base)
        # Fixed: drop the unused `last_line` binding (and the unused
        # `app = task.app` local above).
        notifier.store_and_broadcast(dockerjob.push_image(host, version))
        # Free disk on the build host once the image is safely pushed.
        dockerjob.remove_image(version, host)
    except Exception as e:
        task.finish(consts.TASK_FAILED)
        task.reason = str(e.message)
        notifier.pub_fail()
        _log.error('Task<id=%s>, exception', task_id)
        _log.exception(e)
def build_docker_image(task_id, base, file_path):
    """Pull the base image, build, push and clean up the task's image.

    Progress from every docker step is stored and broadcast via the
    task's TaskNotifier.  On any error the task is finished as
    TASK_FAILED with the exception message as reason and a failure event
    is published.

    NOTE(review): there is no success-path finalization here — the task
    is presumably marked successful elsewhere; confirm against callers.
    """
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return
    _log.info('Task<id=%s>: Start on host %s', task_id, task.host.ip)
    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    try:
        repo, tag = base.split(':', 1)
        # All base images are namespaced under eru/ in the registry.
        repo = repo if repo.startswith('eru/') else 'eru/' + repo.strip('/')
        _log.info('Task<id=%s>: Pull base image (base=%s)', task_id, base)
        notifier.store_and_broadcast(dockerjob.pull_image(host, repo, tag))
        _log.info('Task<id=%s>: Build image (base=%s)', task_id, base)
        notifier.store_and_broadcast(
            dockerjob.build_image(host, version, base, file_path))
        _log.info('Task<id=%s>: Push image (base=%s)', task_id, base)
        # Fixed: drop the unused `last_line` binding (and the unused
        # `app = task.app` local above).
        notifier.store_and_broadcast(
            dockerjob.push_image(host, version))
        # Free disk on the build host once the image is safely pushed.
        dockerjob.remove_image(version, host)
    except Exception as e:
        task.finish(consts.TASK_FAILED)
        task.reason = str(e.message)
        notifier.pub_fail()
        _log.error('Task<id=%s>, exception', task_id)
        _log.exception(e)
def remove_containers(task_id, cids, rmi=False):
    """Remove the given containers from the task's host.

    Flags each container so the agent stops reporting it, tears down its
    backends, refreshes service discovery and removes the docker
    containers; with ``rmi`` the version's image is removed as well.  On
    error the task is finished as TASK_FAILED and a failure event is
    published.
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return
    notifier = TaskNotifier(task)
    containers = Container.get_multi(cids)
    container_ids = [c.container_id for c in containers]
    host = task.host
    try:
        # Tell the agent to ignore these containers while removal runs.
        flags = {'eru:agent:%s:container:flag' % cid: 1 for cid in container_ids}
        rds.mset(**flags)
        for c in containers:
            remove_container_backends(c)
            current_flask.logger.info('Task<id=%s>: Container (cid=%s) backends removed', task_id, c.container_id[:7])
        # Republish discovery for every affected app once, not per container.
        appnames = {c.appname for c in containers}
        publish_to_service_discovery(*appnames)
        dockerjob.remove_host_containers(containers, host)
        current_flask.logger.info('Task<id=%s>: Containers (cids=%s) removed', task_id, cids)
        if rmi:
            dockerjob.remove_image(task.version, host)
    except Exception, e:
        task.finish_with_result(consts.TASK_FAILED)
        notifier.pub_fail()
        current_flask.logger.error('Task<id=%s>: Exception (e=%s)', task_id, e)
def test_create_container(client, test_db):
    """Deploy one container via the public API and verify the created Task."""
    # Skipped: cannot pass locally anyway (needs a live eru deployment).
    return
    app, version, pod, host = create_local_test_data()
    rv = client.post('/api/deploy/public/group/pod/blueberry',
                     data=json.dumps({
                         'ncontainer': 1,
                         'version': version.sha,
                         'entrypoint': 'web',
                         'env': 'prod'
                     }),
                     content_type='application/json')
    # Fixed: Werkzeug test responses expose `status_code`, not
    # `status_consts` (leftover of a `code` -> `consts` module rename).
    assert rv.status_code == 200
    r = json.loads(rv.data)
    assert len(r['tasks']) == 1
    task_id = r['tasks'][0]
    assert task_id
    task = Task.get(task_id)
    assert task.host_id == host.id
    assert task.app_id == app.id
    assert task.version_id == version.id
    assert task.type == consts.TASK_CREATE
    props = task.props
    assert props['ncontainer'] == 1
    assert props['entrypoint'] == 'web'
    assert props['cores'] == []
def test_build_image(client, test_db):
    """Trigger an image build via the API and verify the created Task."""
    # Skipped: cannot pass locally anyway (needs a live eru deployment).
    return
    app, version, pod, host = create_local_test_data()
    rv = client.post('/api/deploy/build/group/pod/blueberry',
                     data=json.dumps({
                         'base': 'containerops.cn/tonicbupt/ubuntu:python-2014.11.28',
                         'version': version.sha
                     }),
                     content_type='application/json')
    # Fixed: Werkzeug test responses expose `status_code`, not
    # `status_consts` (leftover of a `code` -> `consts` module rename).
    assert rv.status_code == 200
    r = json.loads(rv.data)
    assert r[u'r'] == 0
    task_id = r[u'task']
    assert task_id
    task = Task.get(task_id)
    assert task.host_id == host.id
    assert task.app_id == app.id
    assert task.version_id == version.id
    assert task.type == consts.TASK_BUILD
    assert task.props == {
        'base': 'containerops.cn/tonicbupt/ubuntu:python-2014.11.28'
    }
def remove_containers(task_id, cids, rmi=False):
    """Remove the given containers and clean up all related state.

    Marks each container as in-removal, flags it for the agent, tears
    down backends, refreshes service discovery, removes the docker
    containers (and optionally the image), then on success deletes the
    records and publishes success; on error the task is failed and a
    failure event published.  Agent metadata/flags are cleared either way.
    """
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return
    _log.info('Task<id=%s>: Start on host %s', task_id, task.host.ip)
    notifier = TaskNotifier(task)
    containers = Container.get_multi(cids)
    if not containers:
        # Fixed: the format string had a %s placeholder but no argument,
        # which makes the logging module raise internally and drop the record.
        _log.error('Task (id=%s) no container found, quit', task_id)
        return
    host = containers[0].host
    for c in containers:
        c.in_removal = 1
    container_ids = [c.container_id for c in containers if c]
    try:
        # Tell the agent to ignore these containers while removal runs.
        set_flag_for_agent(container_ids)
        for c in containers:
            remove_container_backends(c)
            _log.info('Task<id=%s>: Container (cid=%s) backends removed', task_id, c.short_id)
        appnames = {c.appname for c in containers}
        publish_to_service_discovery(*appnames)
        # Give discovery consumers a moment to drain traffic before the
        # containers actually go away.
        time.sleep(3)
        dockerjob.remove_host_containers(containers, host)
        _log.info('Task<id=%s>: Containers (cids=%s) removed', task_id, cids)
        if rmi:
            # Image removal is best-effort; a failure here must not fail
            # the whole task.
            try:
                dockerjob.remove_image(task.version, host)
            except Exception as e:
                # Fixed: two arguments were passed for a single %s
                # placeholder, which breaks log formatting.
                _log.error('Task<id=%s>, fail to remove image (e=%s)', task_id, e)
    except Exception as e:
        task.finish(consts.TASK_FAILED)
        task.reason = str(e.message)
        notifier.pub_fail()
        _log.error('Task<id=%s> exception', task_id)
        _log.exception(e)
    else:
        for c in containers:
            c.delete()
        task.finish(consts.TASK_SUCCESS)
        task.reason = 'ok'
        notifier.pub_success()
    # Clear agent metadata and flags regardless of outcome so the agent
    # resumes normal reporting.
    remove_container_for_agent(host, container_ids)
    remove_flag_for_agent(container_ids)
    _log.info('Task<id=%s>: Done', task_id)
def remove_containers(task_id, cids, rmi=False):
    """Remove the given containers and clean up all related state.

    Flags the containers for the agent, tears down backends, refreshes
    service discovery, removes the docker containers (and optionally the
    image); on success deletes the records, clears agent metadata/flags
    and publishes success, on error fails the task.  Finally drops the
    falcon alarms when the version has no containers left.
    """
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return
    current_flask.logger.info('Task<id=%s>: Start on host %s' % (task_id, task.host.ip))
    notifier = TaskNotifier(task)
    containers = Container.get_multi(cids)
    container_ids = [c.container_id for c in containers if c]
    host = task.host
    version = task.version
    try:
        # flag, don't report these
        flags = {'eru:agent:%s:container:flag' % cid: 1 for cid in container_ids}
        rds.mset(**flags)
        for c in containers:
            remove_container_backends(c)
            current_flask.logger.info('Task<id=%s>: Container (cid=%s) backends removed', task_id, c.container_id[:7])
        appnames = {c.appname for c in containers}
        publish_to_service_discovery(*appnames)
        # Give discovery consumers a moment to drain traffic first.
        time.sleep(3)
        dockerjob.remove_host_containers(containers, host)
        current_flask.logger.info('Task<id=%s>: Containers (cids=%s) removed', task_id, cids)
        if rmi:
            # Image removal is best-effort; failure must not fail the task.
            try:
                dockerjob.remove_image(task.version, host)
            except Exception as e:
                current_flask.logger.error('Task<id=%s>: Exception (e=%s), fail to remove image', task_id, e)
    except Exception as e:
        task.finish_with_result(consts.TASK_FAILED)
        notifier.pub_fail()
        current_flask.logger.error('Task<id=%s>: Exception (e=%s)', task_id, e)
    else:
        for c in containers:
            c.delete()
        task.finish_with_result(consts.TASK_SUCCESS)
        notifier.pub_success()
        # Clear agent metadata and the ignore-flags set above.
        if container_ids:
            rds.hdel('eru:agent:%s:containers:meta' % host.name, *container_ids)
            rds.delete(*flags.keys())
        current_flask.logger.info('Task<id=%s>: Done', task_id)
    # No containers left for this version -> its alarms are obsolete.
    if not version.containers.count():
        falcon_remove_alarms(version)
def build_docker_image(task_id, base):
    """Pull the base image, then build, push and remove the task's image.

    Progress from each docker step is stored and broadcast via the
    task's TaskNotifier.  On failure the task is finished as TASK_FAILED
    and a failure event is published.
    """
    task = Task.get(task_id)
    notifier = TaskNotifier(task)
    try:
        repo, tag = base.split(':', 1)
        notifier.store_and_broadcast(dockerjob.pull_image(task.host, repo, tag))
        notifier.store_and_broadcast(dockerjob.build_image(task.host, task.version, base))
        notifier.store_and_broadcast(dockerjob.push_image(task.host, task.version))
        # Best-effort local cleanup once the image is pushed.
        # Fixed: was a bare `except:`, which also swallows
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        try:
            dockerjob.remove_image(task.version, task.host)
        except Exception:
            pass
    except Exception as e:
        logger.exception(e)
        task.finish_with_result(code.TASK_FAILED)
        notifier.pub_fail()
def remove_containers(task_id, cids, rmi):
    """Remove the given containers from the task's host.

    Flags each container so the agent stops reporting it, tears down its
    backends, refreshes service discovery and deletes the docker
    containers; when *rmi* is truthy the task version's image is removed
    as well.  On error the task is failed and a failure event published.
    """
    task = Task.get(task_id)
    notifier = TaskNotifier(task)
    host = task.host
    containers = Container.get_multi(cids)
    docker_ids = [container.container_id for container in containers]
    try:
        # Tell the agent to ignore these containers while removal runs.
        rds.mset(**{'eru:agent:%s:container:flag' % docker_id: 1
                    for docker_id in docker_ids})
        names = set()
        for container in containers:
            remove_container_backends(container)
            names.add(container.appname)
        publish_to_service_discovery(*names)
        dockerjob.remove_host_containers(containers, host)
        if rmi:
            dockerjob.remove_image(task.version, host)
    except Exception as e:
        logger.exception(e)
        task.finish_with_result(code.TASK_FAILED)
        notifier.pub_fail()
def build_docker_image(task_id, base):
    """Build and publish the docker image for a task's app version.

    Pulls the base image, builds on top of it, pushes the result and
    removes the local copy from the build host.  Progress from each
    docker step is stored and broadcast to listeners.  On failure the
    task is marked failed and a failure event is published.
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return
    notifier = TaskNotifier(task)
    broadcast = notifier.store_and_broadcast
    try:
        repo, tag = base.split(':', 1)
        current_flask.logger.info('Task<id=%s>: Pull base image (base=%s)', task_id, base)
        broadcast(dockerjob.pull_image(task.host, repo, tag))
        current_flask.logger.info('Task<id=%s>: Build image (base=%s)', task_id, base)
        broadcast(dockerjob.build_image(task.host, task.version, base))
        current_flask.logger.info('Task<id=%s>: Push image (base=%s)', task_id, base)
        broadcast(dockerjob.push_image(task.host, task.version))
        # The local copy is no longer needed once pushed.
        dockerjob.remove_image(task.version, task.host)
    except Exception as e:
        task.finish_with_result(consts.TASK_FAILED)
        notifier.pub_fail()
        current_flask.logger.error('Task<id=%s>: Exception (e=%s)', task_id, e)
def get_task(task_id):
    """Return the Task with *task_id*; abort the request with 404 if missing."""
    found = Task.get(task_id)
    if not found:
        abort(404, 'Task %s not found' % task_id)
    return found
def create_containers(task_id, ncontainer, nshare, cores, network_ids, spec_ips=None):
    """Run the task identified by *task_id*: deploy *ncontainer*
    containers, pin them to the given cores and bind them to the subnets
    named by *network_ids*.  (Docstring translated from Chinese.)
    """
    _log.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return
    if spec_ips is None:
        spec_ips = []
    need_network = bool(network_ids)
    networks = [ipam.get_pool(n) for n in network_ids]
    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    ports = task.props['ports']
    args = task.props['args']
    # use raw
    image = task.props['image']
    callback_url = task.props['callback_url']
    # Scale docker cpu shares from the pod's core-share unit (1024 = one
    # full core's weight).
    cpu_shares = int(float(nshare) / host.pod.core_share * 1024) if nshare else 1024
    cids = []
    backends = []
    entry = version.appconfig.entrypoints[entrypoint]
    for fcores, pcores in _iter_cores(cores, ncontainer):
        cores_for_one_container = {'full': fcores, 'part': pcores}
        # Create the container on the host.
        try:
            cid, cname = dockerjob.create_one_container(
                host, version, entrypoint, env, fcores + pcores,
                ports=ports, args=args, cpu_shares=cpu_shares,
                image=image, need_network=need_network)
        except Exception as e:
            # Written for the celery log.
            _log.exception(e)
            host.release_cores(cores_for_one_container, nshare)
            continue
        # Record the container.
        c = Container.create(cid, host, version, cname, entrypoint,
                             cores_for_one_container, env, nshare, callback_url)
        # Build the container's network stack and record everything;
        # on failure wipe all records plus the container on the host and
        # try the next iteration.
        cidrs = [n.netspace for n in networks]
        if not ipam.allocate_ips(cidrs, cid, spec_ips):
            _clean_failed_containers(cid)
            continue
        notifier.notify_agent(c)
        add_container_for_agent(host, c)
        add_container_backends(c)
        cids.append(cid)
        backends.extend(c.get_backends())
        c.callback_report(status='start')
    health_check = entry.get('health_check', '')
    if health_check and backends:
        urls = [b + health_check for b in backends]
        if not wait_health_check(urls):
            # TODO: either roll back or raise an alarm here.
            _log.info('Task<id=%s>: Done, but something went error', task_id)
            return
    publish_to_service_discovery(version.name)
    task.finish(consts.TASK_SUCCESS)
    task.reason = 'ok'
    task.container_ids = cids
    notifier.pub_success()
    _log.info('Task<id=%s>: Done', task_id)
def create_containers_with_macvlan(task_id, ncontainer, core_ids, network_ids):
    """Run the task identified by *task_id*: deploy *ncontainer*
    containers, occupy the cores in *core_ids* and bind them to the
    subnets in *network_ids*.  (Docstring translated from Chinese.)
    """
    task = Task.get(task_id)
    if not task:
        return
    networks = Network.get_multi(network_ids)
    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    used_cores = Core.get_multi(core_ids)
    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id
    cids = []
    # Hand each container an equal slice of the reserved cores.
    for cores in more_itertools.chunked(used_cores, len(core_ids)/ncontainer):
        try:
            cid, cname = dockerjob.create_one_container(host, version, entrypoint, env, cores)
        except:
            host.release_cores(cores)
            continue
        ips = [n.acquire_ip() for n in networks]
        ip_dict = {ip.vlan_address: ip for ip in ips}
        if ips:
            # Ask the agent (via pub/sub) to wire the vlan IPs into the
            # container's network namespace.
            ident_id = cname.split('_')[-1]
            values = [str(task_id), cid, ident_id] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
            rds.publish(pub_agent_vlan_key, '|'.join(values))
        # Wait for one feedback entry per requested IP; any timeout or
        # failure breaks out and triggers the cleanup path below
        # (for/else: the else runs only when no break occurred).
        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break
            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)
        else:
            logger.info('Creating container with cid %s and ips %s' % (cid, ips))
            c = Container.create(cid, host, version, cname, entrypoint, cores, env)
            for ip in ips:
                ip.assigned_to_container(c)
            notifier.notify_agent(cid)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # Skip the cleanup below.
            continue
        # Clean up the failed container, release cores and IPs.
        logger.info('Cleaning failed container with cid %s' % cid)
        dockerjob.remove_container_by_cid([cid], host)
        host.release_cores(cores)
        [ip.release() for ip in ips]
    publish_to_service_discovery(version.name)
    task.finish_with_result(code.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
def create_containers(task_id, ncontainer, nshare, cores, network_ids, spec_ips=None):
    """Run the task identified by *task_id*: deploy *ncontainer*
    containers, pin them to the given cores and bind them to the subnets
    named by *network_ids*.  (Docstring translated from Chinese.)
    """
    _log.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return
    if spec_ips is None:
        spec_ips = []
    need_network = bool(network_ids)
    networks = [ipam.get_pool(n) for n in network_ids]
    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    ports = task.props['ports']
    args = task.props['args']
    # use raw
    image = task.props['image']
    callback_url = task.props['callback_url']
    # Scale docker cpu shares from the pod's core-share unit (1024 = one
    # full core's weight).
    cpu_shares = int(float(nshare) / host.pod.core_share * 1024) if nshare else 1024
    cids = []
    backends = []
    entry = version.appconfig.entrypoints[entrypoint]
    for fcores, pcores in _iter_cores(cores, ncontainer):
        cores_for_one_container = {'full': fcores, 'part': pcores}
        # Create the container on the host.
        try:
            cid, cname = dockerjob.create_one_container(host, version,
                entrypoint, env, fcores + pcores, ports=ports, args=args,
                cpu_shares=cpu_shares, image=image, need_network=need_network)
        except Exception as e:
            # Written for the celery log.
            _log.exception(e)
            host.release_cores(cores_for_one_container, nshare)
            continue
        # Record the container.
        c = Container.create(cid, host, version, cname, entrypoint,
                             cores_for_one_container, env, nshare, callback_url)
        # Build the container's network stack and record everything;
        # on failure wipe all records plus the container on the host and
        # try the next iteration.
        cidrs = [n.netspace for n in networks]
        if not ipam.allocate_ips(cidrs, cid, spec_ips):
            _clean_failed_containers(cid)
            continue
        notifier.notify_agent(c)
        add_container_for_agent(host, c)
        add_container_backends(c)
        cids.append(cid)
        backends.extend(c.get_backends())
        c.callback_report(status='start')
    health_check = entry.get('health_check', '')
    if health_check and backends:
        urls = [b + health_check for b in backends]
        if not wait_health_check(urls):
            # TODO: either roll back or raise an alarm here.
            _log.info('Task<id=%s>: Done, but something went error', task_id)
            return
    publish_to_service_discovery(version.name)
    task.finish(consts.TASK_SUCCESS)
    task.reason = 'ok'
    task.container_ids = cids
    notifier.pub_success()
    _log.info('Task<id=%s>: Done', task_id)
def create_containers_with_macvlan_public(task_id, ncontainer, nshare, network_ids, spec_ips=None):
    """Run the task identified by *task_id*: deploy *ncontainer*
    containers (no core pinning) and bind them to the subnets in
    *network_ids*.  (Docstring translated from Chinese.)
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return
    if spec_ips is None:
        spec_ips = []
    networks = Network.get_multi(network_ids)
    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    # use raw
    image = task.props['image']
    # Public deploys get the default weight; no core pinning here.
    cpu_shares = 1024
    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id
    cids = []
    for _ in range(ncontainer):
        try:
            cid, cname = dockerjob.create_one_container(host, version,
                entrypoint, env, cores=None, cpu_shares=cpu_shares, image=image)
        except Exception as e:
            print e  # same as above
            continue
        if spec_ips:
            ips = [n.acquire_specific_ip(ip) for n, ip in zip(networks, spec_ips)]
        else:
            ips = [n.acquire_ip() for n in networks]
        ips = [i for i in ips if i]
        ip_dict = {ip.vlan_address: ip for ip in ips}
        if ips:
            # Ask the agent (via pub/sub) to wire the vlan IPs into the
            # container's network namespace.
            ident_id = cname.split('_')[-1]
            values = [str(task_id), cid, ident_id] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
            rds.publish(pub_agent_vlan_key, '|'.join(values))
        # Wait for one feedback entry per requested IP; any timeout or
        # failure breaks out and triggers the cleanup path below
        # (for/else: the else runs only when no break occurred).
        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break
            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)
        else:
            current_flask.logger.info('Creating container (cid=%s, ips=%s)', cid, ips)
            c = Container.create(cid, host, version, cname, entrypoint, {}, env, nshare)
            for ip in ips:
                ip.assigned_to_container(c)
            notifier.notify_agent(cid)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # Skip the cleanup below.
            continue
        # Clean up the failed container, release cores and IPs.
        current_flask.logger.info('Cleaning failed container (cid=%s)',
                cid)
        dockerjob.remove_container_by_cid([cid], host)
        [ip.release() for ip in ips]
        # On failure this feedback key must be cleaned up.
        rds.delete(feedback_key)
    publish_to_service_discovery(version.name)
    task.finish_with_result(consts.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
    current_flask.logger.info('Task<id=%s>: Done', task_id)
def create_containers_with_macvlan(task_id, ncontainer, nshare, cores, network_ids, spec_ips=None):
    """Run the task identified by *task_id*: deploy *ncontainer*
    containers, occupy the given cores and bind them to the subnets in
    *network_ids*.  (Docstring translated from Chinese.)
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return
    if spec_ips is None:
        spec_ips = []
    need_network = bool(network_ids)
    networks = Network.get_multi(network_ids)
    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    ports = task.props['ports']
    args = task.props['args']
    # use raw
    route = task.props['route']
    image = task.props['image']
    callback_url = task.props['callback_url']
    # Scale docker cpu shares from the pod's core-share unit.
    cpu_shares = int(float(nshare) / host.pod.core_share * 1024) if nshare else 1024
    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    pub_agent_route_key = 'eru:agent:%s:route' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id
    cids = []
    for fcores, pcores in _iter_cores(cores, ncontainer):
        cores_for_one_container = {'full': fcores, 'part': pcores}
        try:
            cid, cname = dockerjob.create_one_container(host, version,
                entrypoint, env, fcores+pcores, ports=ports, args=args,
                cpu_shares=cpu_shares, image=image, need_network=need_network)
        except Exception as e:
            # Written for the celery log.
            print e
            host.release_cores(cores_for_one_container, nshare)
            continue
        if spec_ips:
            ips = [n.acquire_specific_ip(ip) for n, ip in zip(networks, spec_ips)]
        else:
            ips = [n.acquire_ip() for n in networks]
        ips = [i for i in ips if i]
        ip_dict = {ip.vlan_address: ip for ip in ips}
        if ips:
            # Hand the vlan IPs to the agent, over pub/sub or its HTTP API
            # depending on deployment configuration.
            if ERU_AGENT_API == 'pubsub':
                values = [str(task_id), cid] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
                rds.publish(pub_agent_vlan_key, '|'.join(values))
            elif ERU_AGENT_API == 'http':
                agent = get_agent(host)
                ip_list = [(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
                agent.add_container_vlan(cid, str(task_id), ip_list)
        # Wait for one feedback entry per requested IP; any timeout or
        # failure breaks out and triggers the cleanup path below
        # (for/else: the else runs only when no break occurred).
        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break
            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)
            if route:
                rds.publish(pub_agent_route_key, '%s|%s' % (cid, route))
        else:
            current_flask.logger.info('Creating container (cid=%s, ips=%s)', cid, ips)
            c = Container.create(cid, host, version, cname, entrypoint, cores_for_one_container, env, nshare, callback_url)
            for ip in ips:
                ip.assigned_to_container(c)
            notifier.notify_agent(c)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # Skip the cleanup below.
            continue
        # Clean up the failed container, release cores and IPs.
        current_flask.logger.info('Cleaning failed container (cid=%s)', cid)
        dockerjob.remove_container_by_cid([cid], host)
        host.release_cores(cores_for_one_container, nshare)
        [ip.release() for ip in ips]
        # On failure this feedback key must be cleaned up.
        rds.delete(feedback_key)
    publish_to_service_discovery(version.name)
    task.finish_with_result(consts.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
    # Involves IO; just leave it to the very end.
    falcon_all_graphs(version)
    falcon_all_alarms(version)
    current_flask.logger.info('Task<id=%s>: Done', task_id)
def task_log(task_id):
    """Return the task's stored log entries, oldest first, parsed from JSON.

    Aborts the request with 404 when the task does not exist.
    """
    task = Task.get(task_id)
    if not task:
        abort(404, 'Task %s not found' % task_id)
    raw_entries = rds.lrange(task.log_key, 0, -1)
    return [json.loads(entry) for entry in raw_entries]
def create_containers_with_macvlan(task_id, ncontainer, nshare, cores, network_ids, spec_ips=None):
    """Run the task identified by *task_id*: deploy *ncontainer*
    containers, occupy the given cores and bind them to the subnets in
    *network_ids*.  (Docstring translated from Chinese.)
    """
    # TODO support part core
    current_flask.logger.info("Task<id=%s>: Started", task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error("Task (id=%s) not found, quit", task_id)
        return
    if spec_ips is None:
        spec_ips = []
    networks = Network.get_multi(network_ids)
    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props["entrypoint"]
    env = task.props["env"]
    # use raw
    image = task.props["image"]
    # Scale docker cpu shares from the pod's core-share unit.
    cpu_shares = int(float(nshare) / host.pod.core_share * 1024) if nshare else 1024
    pub_agent_vlan_key = "eru:agent:%s:vlan" % host.name
    feedback_key = "eru:agent:%s:feedback" % task_id
    cids = []
    full_cores, part_cores = cores.get("full", []), cores.get("part", [])
    # NOTE(review): integer division — if len(cores) < ncontainer the
    # chunk size becomes 0 and chunked() fails; confirm callers always
    # reserve at least one core per container.
    for fcores, pcores in izip_longest(
            chunked(full_cores, len(full_cores) / ncontainer),
            chunked(part_cores, len(part_cores) / ncontainer),
            fillvalue=[]):
        cores_for_one_container = {"full": fcores, "part": pcores}
        try:
            cid, cname = dockerjob.create_one_container(
                host, version, entrypoint, env, fcores + pcores, cpu_shares, image=image)
        # Fixed: was a bare `except:`, which also swallows
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        except Exception:
            host.release_cores(cores_for_one_container, nshare)
            continue
        if spec_ips:
            ips = [n.acquire_specific_ip(ip) for n, ip in zip(networks, spec_ips)]
        else:
            ips = [n.acquire_ip() for n in networks]
        ips = [i for i in ips if i]
        ip_dict = {ip.vlan_address: ip for ip in ips}
        if ips:
            # Ask the agent (via pub/sub) to wire the vlan IPs into the
            # container's network namespace.
            ident_id = cname.split("_")[-1]
            values = [str(task_id), cid, ident_id] + ["{0}:{1}".format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
            rds.publish(pub_agent_vlan_key, "|".join(values))
        # Wait for one feedback entry per requested IP; any timeout or
        # failure breaks out and triggers the cleanup path below
        # (for/else: the else runs only when no break occurred).
        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split("|")
            if succ == "0":
                break
            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)
        else:
            current_flask.logger.info("Creating container (cid=%s, ips=%s)", cid, ips)
            c = Container.create(cid, host, version, cname, entrypoint, cores_for_one_container, env, nshare)
            for ip in ips:
                ip.assigned_to_container(c)
            notifier.notify_agent(cid)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # Skip the cleanup below.
            continue
        # Clean up the failed container, release cores and IPs.
        current_flask.logger.info("Cleaning failed container (cid=%s)", cid)
        dockerjob.remove_container_by_cid([cid], host)
        host.release_cores(cores_for_one_container, nshare)
        [ip.release() for ip in ips]
        # On failure this feedback key must be cleaned up.
        rds.delete(feedback_key)
    publish_to_service_discovery(version.name)
    task.finish_with_result(consts.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
    current_flask.logger.info("Task<id=%s>: Done", task_id)
def get_task(task_id):
    """Return the Task with *task_id*; raise a 404 EruAbortException if missing."""
    found = Task.get(task_id)
    if not found:
        raise EruAbortException(code.HTTP_NOT_FOUND, 'Task %s not found' % task_id)
    return found