def rm_containers():
    """Queue asynchronous removal jobs for the containers named in the request.

    Reads ``cids`` from the request's JSON body and calls ``abort(400, ...)``
    when any id is shorter than 7 characters.  Ids that
    ``Container.get_by_container_id`` cannot resolve are skipped.  The
    remaining containers are grouped by ``(version, host)``; for every group
    one ``Task`` is created and one ``remove_containers`` job is dispatched.

    Returns a dict with ``r``, ``msg``, the created task ids (``tasks``) and
    the tasks' result keys (``watch_keys``).
    """
    requested = request.get_json()['cids']
    if any(len(cid) < 7 for cid in requested):
        abort(400, 'must given at least 7 chars for container_id')

    grouped = {}
    for cid in requested:
        found = Container.get_by_container_id(cid)
        if found:
            grouped.setdefault((found.version, found.host), []).append(found)

    task_ids = []
    result_keys = []
    for (version, host), members in grouped.iteritems():
        member_ids = [member.id for member in members]
        task = Task.create(consts.TASK_REMOVE, version, host, {'container_ids': member_ids})
        # Ids of every container of this version currently on the host; the
        # flag below is true only when the request covers all of them.
        on_host = set(
            other.id
            for other in Container.get_multi_by_host(host)
            if other and other.version_id == version.id
        )
        need_to_delete_image = set(member_ids) == on_host
        remove_containers.apply_async(
            args=(task.id, member_ids, need_to_delete_image),
            task_id='task:%d' % task.id,
        )
        task_ids.append(task.id)
        result_keys.append(task.result_key)

    return {'r': 0, 'msg': 'ok', 'tasks': task_ids, 'watch_keys': result_keys}
def rm_containers():
    """Create one removal task per (version, host) for the requested containers.

    The container ids come from the ``cids`` entry of the request's JSON
    body; ``abort(400, ...)`` is called if any of them has fewer than 7
    characters.  Unknown ids are ignored.

    Returns a dict holding the created task ids under ``tasks`` and their
    result keys under ``watch_keys``.
    """
    wanted = request.get_json()['cids']
    if any(len(cid) < 7 for cid in wanted):
        abort(400, 'must given at least 7 chars for container_id')

    by_version_host = {}
    for cid in wanted:
        container = Container.get_by_container_id(cid)
        if container:
            bucket = by_version_host.setdefault((container.version, container.host), [])
            bucket.append(container)

    created_ids = []
    keys_to_watch = []
    for (version, host), bucket in by_version_host.iteritems():
        bucket_ids = [item.id for item in bucket]
        task = Task.create(TASK_REMOVE, version, host, {'container_ids': bucket_ids})
        # Every container of this version that currently lives on the host.
        version_ids_on_host = [
            item.id
            for item in Container.get_multi_by_host(host)
            if item and item.version_id == version.id
        ]
        need_to_delete_image = set(bucket_ids) == set(version_ids_on_host)
        remove_containers.apply_async(
            args=(task.id, bucket_ids, need_to_delete_image),
            task_id='task:%d' % task.id,
        )
        created_ids.append(task.id)
        keys_to_watch.append(task.result_key)

    return {'tasks': created_ids, 'watch_keys': keys_to_watch}
def cure_container(cid):
    """Cure the container identified by ``cid`` if it exists and is not alive.

    The container's IP is rebound before ``cure()`` is called.  The response
    dict is the same whether or not anything was done.
    """
    container = Container.get_by_container_id(cid)
    should_cure = bool(container) and not container.is_alive
    if should_cure:
        rebind_container_ip(container)
        container.cure()
        current_app.logger.info('Cure container (container_id=%s)', cid[:7])
    return {'r': 0, 'msg': consts.OK}
def stop_container(cid):
    """Mark the container ``cid`` as killed and stop it on its host.

    Nothing happens when the container cannot be found; the response dict is
    the same in both cases.
    """
    container = Container.get_by_container_id(cid)
    if not container:
        return {'r': 0, 'msg': consts.OK}

    container.kill()
    dockerjob.stop_containers([container], container.host)
    current_app.logger.info('Stop container (container_id=%s)', cid[:7])
    return {'r': 0, 'msg': consts.OK}
def remove_containers(task_id, cids, rmi=False):
    """Remove the containers with ids ``cids`` on behalf of task ``task_id``.

    Returns early (after logging an error) when the task cannot be loaded.
    Otherwise a redis flag is set for every container, the containers'
    backends are removed, service discovery is republished for the affected
    app names, and the containers are removed from the task's host.  When
    ``rmi`` is true the task version's image is removed from the host too.

    Any exception raised during those steps marks the task as failed,
    publishes a failure notification and is logged; it is not re-raised.
    """
    log = current_flask.logger
    log.info('Task<id=%s>: Started', task_id)

    task = Task.get(task_id)
    if not task:
        log.error('Task (id=%s) not found, quit', task_id)
        return

    notifier = TaskNotifier(task)
    targets = Container.get_multi(cids)
    docker_ids = [target.container_id for target in targets]
    host = task.host

    try:
        flag_keys = {}
        for docker_id in docker_ids:
            flag_keys['eru:agent:%s:container:flag' % docker_id] = 1
        rds.mset(**flag_keys)

        for target in targets:
            remove_container_backends(target)
            log.info('Task<id=%s>: Container (cid=%s) backends removed', task_id, target.container_id[:7])

        publish_to_service_discovery(*{target.appname for target in targets})

        dockerjob.remove_host_containers(targets, host)
        log.info('Task<id=%s>: Containers (cids=%s) removed', task_id, cids)

        if rmi:
            dockerjob.remove_image(task.version, host)
    except Exception as err:
        task.finish_with_result(consts.TASK_FAILED)
        notifier.pub_fail()
        log.error('Task<id=%s>: Exception (e=%s)', task_id, err)
def fix_core(host): containers = Container.get_multi_by_host(host) # 没有的话, 直接销毁重建 if not containers: rds.delete(host._cores_key) _create_cores_on_host(host, host.ncore) return data = {str(i): host.core_share for i in xrange(host.ncore)} for c in containers: cores = c.cores nshare = int(cores.get("nshare", "0")) for e in cores.get("full", []): e.remain = host.core_share data.pop(e.label) for s in cores.get("part", []): s.remain = host.core_share - nshare data[s.label] -= nshare c.cores = cores rds.delete(host._cores_key) if data: rds.zadd(host._cores_key, **data) print "done", host
def test_container_release_cores(test_db):
    """Deleting containers gives their cores' ``remain`` back to the host.

    A 200-core host is created; the first 100 cores (sorted by label) are
    used as full cores and the last 100 as partial cores with ``nshare=5``,
    ten of each per container.
    """
    app = App.get_or_create('app', 'http://git.hunantv.com/group/app.git')
    version = app.add_version(random_sha1())
    pod = Pod.create('pod', 'pod', 10, -1)
    host = Host.create(pod, random_ipv4(), random_string(), random_uuid(), 200, 0)

    def core_groups():
        # Pairs of (10 full cores, 10 partial cores) taken from the
        # label-sorted core list; re-reads host.cores on every call.
        ordered = sorted(host.cores, key=operator.attrgetter('label'))
        return zip(chunked(ordered[:100], 10), chunked(ordered[100:], 10))

    def check_remain(expected_full, expected_part):
        for full_chunk, part_chunk in core_groups():
            for core in full_chunk:
                assert core.remain == expected_full
            for core in part_chunk:
                assert core.remain == expected_part

    for core in host.cores:
        assert core.host_id == host.id
        assert core.remain == 10

    created = []
    for full_chunk, part_chunk in core_groups():
        used = {'full': full_chunk, 'part': part_chunk}
        host.occupy_cores(used, 5)
        container = Container.create(random_sha1(), host, version, random_string(), 'entrypoint', used, 'env', nshare=5)
        created.append(container)

    # While occupied: full cores have nothing left, partial cores have 5.
    check_remain(0, 5)

    for container in created:
        container.delete()

    # After deletion every core is back to 10.
    check_remain(10, 10)
def container_log(cid):
    """Stream the docker logs of container ``cid`` over the request's websocket.

    Query arguments ``stderr`` and ``stdout`` (ints, default 0) select the
    streams; ``tail`` (int, default 10) is passed to the docker client, with
    0 translated to ``'all'``.  When the container is unknown the websocket
    is closed and ``'websocket closed'`` is returned.  A
    ``geventwebsocket.WebSocketError`` raised while streaming is logged.
    """
    args = request.args
    want_stderr = args.get('stderr', type=int, default=0)
    want_stdout = args.get('stdout', type=int, default=0)
    tail = args.get('tail', type=int, default=10)
    # docker client's argument
    tail = 'all' if tail == 0 else tail

    ws = request.environ['wsgi.websocket']
    target = Container.get_by_container_id(cid)
    if not target:
        ws.close()
        _log.info('Container %s not found, close websocket' % cid)
        return 'websocket closed'

    try:
        docker = get_docker_client(target.host.addr)
        stream = docker.logs(cid, stream=True, stderr=bool(want_stderr), stdout=bool(want_stdout), tail=tail)
        for chunk in stream:
            ws.send(chunk)
    except geventwebsocket.WebSocketError as err:
        _log.exception(err)
def start_container(cid):
    """Cure, start and re-bind the IP of container ``cid`` when it exists.

    The response dict is the same whether or not the container was found.
    """
    container = Container.get_by_container_id(cid)
    if not container:
        return {'r': 0, 'msg': code.OK}

    container.cure()
    dockerjob.start_containers([container], container.host)
    rebind_container_ip(container)
    return {'r': 0, 'msg': code.OK}
def start_container(cid):
    """Start container ``cid`` if it exists and is currently not alive.

    The container is cured, started on its host and has its IP rebound.
    The response dict is the same whether or not anything was done.
    """
    container = Container.get_by_container_id(cid)
    startable = bool(container) and not container.is_alive
    if startable:
        container.cure()
        dockerjob.start_containers([container], container.host)
        rebind_container_ip(container)
        current_app.logger.info('Start container (container_id=%s)', cid[:7])
    return {'r': 0, 'msg': consts.OK}
def _clean_failed_containers(cid):
    """Remove a failed container from its host and delete its record.

    Does nothing beyond logging when ``cid`` is unknown.
    """
    # Clean up the failed container, releasing its cores and IPs.
    _log.info('Cleaning failed container (cid=%s)', cid)
    failed = Container.get_by_container_id(cid)
    if failed:
        dockerjob.remove_container_by_cid([cid], failed.host)
        failed.delete()
def remove_containers(task_id, cids, rmi=False):
    """Remove the containers with ids ``cids`` on behalf of task ``task_id``.

    Returns early (after logging an error) when the task or all of its
    containers cannot be loaded.  Otherwise the containers are flagged as in
    removal, their backends are removed, service discovery is republished
    for the affected apps and, after a 3 second pause, the containers are
    removed from their host.  When ``rmi`` is true the task version's image
    is removed as well; a failure there is only logged.

    On any other exception the task is finished as failed with the
    exception message as reason.  On success the container records are
    deleted, the task is finished as successful and the agent
    bookkeeping/flags are cleared.
    """
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return

    _log.info('Task<id=%s>: Start on host %s', task_id, task.host.ip)
    notifier = TaskNotifier(task)

    # Drop missing entries up front: the code below dereferences every
    # container, and a None entry would crash before the task could be
    # marked as failed.
    containers = [c for c in Container.get_multi(cids) if c]
    if not containers:
        # The original call omitted task_id, so the literal '%s' was logged.
        _log.error('Task (id=%s) no container found, quit', task_id)
        return

    host = containers[0].host

    for c in containers:
        c.in_removal = 1

    container_ids = [c.container_id for c in containers]
    try:
        set_flag_for_agent(container_ids)
        for c in containers:
            remove_container_backends(c)
            _log.info('Task<id=%s>: Container (cid=%s) backends removed', task_id, c.short_id)

        appnames = {c.appname for c in containers}
        publish_to_service_discovery(*appnames)

        time.sleep(3)

        dockerjob.remove_host_containers(containers, host)
        _log.info('Task<id=%s>: Containers (cids=%s) removed', task_id, cids)

        if rmi:
            try:
                dockerjob.remove_image(task.version, host)
            except Exception as e:
                # Image removal is best effort.  The original format string
                # had one placeholder for two arguments, which makes the
                # logging module report a formatting error instead of the
                # message.
                _log.error('Task<id=%s>, fail to remove image (e=%s)', task_id, e)
    except Exception as e:
        task.finish(consts.TASK_FAILED)
        task.reason = str(e.message)
        notifier.pub_fail()
        _log.error('Task<id=%s> exception', task_id)
        _log.exception(e)
    else:
        for c in containers:
            c.delete()
        task.finish(consts.TASK_SUCCESS)
        task.reason = 'ok'
        notifier.pub_success()

        remove_container_for_agent(host, container_ids)
        remove_flag_for_agent(container_ids)
        _log.info('Task<id=%s>: Done', task_id)
def kill_container(cid):
    """Mark container ``cid`` as killed and record an OOM prop if a reason exists.

    The redis key built from ``consts.ERU_AGENT_DIE_REASON`` is read and
    deleted; when it held a value the container gets the prop ``oom = 1``.
    The response dict is the same whether or not the container was found.
    """
    container = Container.get_by_container_id(cid)
    if not container:
        return {'r': 0, 'msg': consts.OK}

    container.kill()
    reason_key = consts.ERU_AGENT_DIE_REASON % container.container_id
    reason = rds.get(reason_key)
    rds.delete(reason_key)
    if reason is not None:
        container.set_props({'oom': 1})
    current_app.logger.info('Kill container (container_id=%s)', cid[:7])
    return {'r': 0, 'msg': consts.OK}
def test_container_transform(test_db):
    """``Container.transform`` gives every container a new container id.

    Three containers of version ``v`` are created on the three hosts that
    were assigned to the group, each using three full cores; every container
    is then transformed to version ``v2`` and must report a different
    ``container_id`` afterwards.
    """
    a = App.get_or_create('app', 'http://git.hunantv.com/group/app.git', '')
    assert a is not None

    v = a.add_version(random_sha1())
    v2 = a.add_version(random_sha1())
    assert v is not None
    assert v.app.id == a.id
    assert v.name == a.name
    assert len(v.containers.all()) == 0
    assert len(v.tasks.all()) == 0

    g = Group.create('group', 'group')
    p = Pod.create('pod', 'pod')
    assert p.assigned_to_group(g)
    hosts = [Host.create(p, random_ipv4(), random_string(prefix='host'), random_uuid(), 4, 4096) for i in range(6)]

    for host in hosts[:3]:
        host.assigned_to_group(g)

    assert g.get_max_containers(p, 3, 0) == 3
    host_cores = g.get_free_cores(p, 3, 3, 0)
    assert len(host_cores) == 3

    containers = []
    for (host, count), cores in host_cores.iteritems():
        # ``cores`` is a dict keyed by 'full' (see the slice below), so the
        # per-container share must be computed from the list of full cores;
        # ``len(cores)`` would count the dict's keys instead.
        full_cores = cores['full']
        cores_per_container = len(full_cores) // count
        for i in range(count):
            cid = random_sha1()
            used_cores = {'full': full_cores[i * cores_per_container:(i + 1) * cores_per_container]}
            c = Container.create(cid, host, v, random_string(), 'entrypoint', used_cores, 'env')
            assert c is not None
            containers.append(c)
        host.occupy_cores(cores, 0)

    for host in g.private_hosts.all():
        assert len(host.get_free_cores()[0]) == 1
        assert len(host.containers.all()) == 1
        assert host.count == 1

    assert len(containers) == 3
    assert len(v.containers.all()) == 3

    cids = [c.container_id for c in containers]
    for c in containers:
        cid = c.container_id
        c.transform(v2, random_sha1(), random_string())
        assert c.container_id != cid

    new_cids = [c.container_id for c in containers]
    assert new_cids != cids
def remove_containers(task_id, cids, rmi=False):
    """Remove the containers with ids ``cids`` on behalf of task ``task_id``.

    Returns early (after logging an error) when the task cannot be loaded.
    Otherwise a redis flag is set per container, backends are removed,
    service discovery is republished and, after a 3 second pause, the
    containers are removed from the task's host.  With ``rmi`` true the
    version's image is removed as well; a failure there is only logged.

    On failure the task is finished as failed and a failure notification is
    published.  On success the container records are deleted, the task is
    finished as successful and the redis bookkeeping is cleared.  Finally,
    if the version has no containers left, its falcon alarms are removed.
    """
    log = current_flask.logger

    task = Task.get(task_id)
    if not task:
        log.error('Task (id=%s) not found, quit', task_id)
        return

    log.info('Task<id=%s>: Start on host %s' % (task_id, task.host.ip))
    notifier = TaskNotifier(task)
    targets = Container.get_multi(cids)
    docker_ids = [target.container_id for target in targets if target]
    host = task.host
    version = task.version

    try:
        # flag, don't report these
        flags = {}
        for docker_id in docker_ids:
            flags['eru:agent:%s:container:flag' % docker_id] = 1
        rds.mset(**flags)

        for target in targets:
            remove_container_backends(target)
            log.info('Task<id=%s>: Container (cid=%s) backends removed', task_id, target.container_id[:7])

        publish_to_service_discovery(*{target.appname for target in targets})

        time.sleep(3)

        dockerjob.remove_host_containers(targets, host)
        log.info('Task<id=%s>: Containers (cids=%s) removed', task_id, cids)

        if rmi:
            try:
                dockerjob.remove_image(task.version, host)
            except Exception as err:
                log.error('Task<id=%s>: Exception (e=%s), fail to remove image', task_id, err)
    except Exception as err:
        task.finish_with_result(consts.TASK_FAILED)
        notifier.pub_fail()
        log.error('Task<id=%s>: Exception (e=%s)', task_id, err)
    else:
        for target in targets:
            target.delete()
        task.finish_with_result(consts.TASK_SUCCESS)
        notifier.pub_success()
        if docker_ids:
            rds.hdel('eru:agent:%s:containers:meta' % host.name, *docker_ids)
        rds.delete(*flags.keys())
        log.info('Task<id=%s>: Done', task_id)

    if not version.containers.count():
        falcon_remove_alarms(version)
def rm_containers():
    """Queue asynchronous removal jobs for the containers named in the request.

    The ids are read from ``cids`` in the request's JSON body; unknown ids
    are skipped.  Containers are grouped by ``(version, host)`` and each
    group gets its own ``Task`` and ``remove_containers`` job (dispatched
    with the image-removal flag set to ``False``).

    Returns a dict with ``r``, ``msg``, the task ids (``tasks``) and the
    tasks' result keys (``watch_keys``).
    """
    requested = request.get_json()["cids"]

    grouped = {}
    for cid in requested:
        found = Container.get_by_container_id(cid)
        if found:
            grouped.setdefault((found.version, found.host), []).append(found)

    task_ids = []
    result_keys = []
    for (version, host), members in grouped.iteritems():
        member_ids = [member.id for member in members]
        task = Task.create(consts.TASK_REMOVE, version, host, {"container_ids": member_ids})
        remove_containers.apply_async(
            args=(task.id, member_ids, False),
            task_id="task:%d" % task.id,
        )
        task_ids.append(task.id)
        result_keys.append(task.result_key)

    return {"r": 0, "msg": "ok", "tasks": task_ids, "watch_keys": result_keys}
def create_test_suite():
    """Build an app, a version, a pod, four hosts and their containers for tests.

    The version's app config is filled from a small in-line app yaml with
    three entrypoints.  Containers are created from the result of
    ``centralized_schedule(pod, 4, 4, 0)``, splitting each host's full cores
    evenly between the containers scheduled on it.

    Returns the tuple ``(app, version, pod, hosts, containers)``.
    """
    appyaml = {
        'appname': 'app',
        'entrypoints': {
            'web': {
                'cmd': 'python app.py',
                'ports': ['5000/tcp'],
            },
            'daemon': {
                'cmd': 'python daemon.py',
            },
            'service': {
                'cmd': 'python service.py'
            },
        },
        'build': 'pip install -r ./requirements.txt',
    }
    app = App.get_or_create('app', 'http://git.hunantv.com/group/app.git')
    version = app.add_version(random_sha1())
    appconfig = version.appconfig
    appconfig.update(**appyaml)
    appconfig.save()

    pod = Pod.create('pod', 'pod')

    hosts = [
        Host.create(pod, random_ipv4(), random_string(prefix='host'), random_uuid(), 4, 4096)
        for i in range(4)
    ]

    containers = []
    for (host, count), cores in centralized_schedule(pod, 4, 4, 0).iteritems():
        # ``cores`` is a dict keyed by 'full' (see the slice below), so the
        # per-container share must come from the list of full cores;
        # ``len(cores)`` would count the dict's keys instead.
        full_cores = cores['full']
        cores_per_container = len(full_cores) // count
        for i in range(count):
            cid = random_sha1()
            used_cores = {
                'full': full_cores[i * cores_per_container:(i + 1) * cores_per_container]
            }
            c = Container.create(cid, host, version, random_string(), 'web', used_cores, 'env')
            containers.append(c)
        host.occupy_cores(cores, 0)

    return app, version, pod, hosts, containers
def migrate_container(container_id, need_to_remove=True):
    """Re-create the container ``container_id`` on a host picked by the scheduler.

    Logs an error and returns when the container is unknown, when
    ``average_schedule`` finds no cores in the container's pod, or when the
    migrate task cannot be created.  Otherwise a ``TASK_MIGRATE`` task is
    created and run synchronously: the old container is removed first
    (unless ``need_to_remove`` is false), then one new container is created
    with the same IPs.
    """
    source = Container.get_by_container_id(container_id)
    if not source:
        _log.error('container %s is not found, ignore migration', container_id)
        return

    pod = source.host.pod
    ncore, nshare = pod.get_core_allocation(source.ncore)
    schedule = average_schedule(pod, 1, ncore, nshare, None)
    if not schedule:
        _log.error('not enough cores to migrate')
        return

    cids = [source.id]
    cidrs = source.get_ips()
    # The new container is asked for exactly the IPs the old one had.
    spec_ips = cidrs
    (host, _count), cores = next(schedule.iteritems())

    props = {
        'ncontainer': 1,
        'entrypoint': source.entrypoint,
        'env': source.env,
        'full_cores': [core.label for core in cores.get('full', [])],
        'part_cores': [core.label for core in cores.get('part', [])],
        'ports': None,
        'args': None,
        'nshare': nshare,
        'networks': cidrs,
        'image': None,
        'route': '',
        'callback_url': source.callback_url,
        'container_ids': cids,
    }
    task = Task.create(consts.TASK_MIGRATE, source.version, host, props)
    if not task:
        _log.error('create migrate task error')
        return

    _log.info('start migration...')
    celery_task_id = 'task:%s' % task.id
    if need_to_remove:
        remove_containers.apply(args=(task.id, cids, False), task_id=celery_task_id)
    create_containers.apply(args=(task.id, 1, nshare, cores, cidrs, spec_ips), task_id=celery_task_id)
    _log.info('migration done')
def create_test_suite():
    """Build an app, version, group, pod, four hosts and containers for tests.

    The version's app config is filled from a small in-line app yaml with
    three entrypoints.  The pod and all hosts are assigned to the group, and
    containers are created from ``group.get_free_cores(pod, 4, 4, 0)``,
    splitting each host's full cores evenly between its containers.

    Returns the tuple ``(app, version, group, pod, hosts, containers)``.
    """
    appyaml = {
        'appname': 'app',
        'entrypoints': {
            'web': {
                'cmd': 'python app.py',
                'ports': ['5000/tcp'],
            },
            'daemon': {
                'cmd': 'python daemon.py',
            },
            'service': {
                'cmd': 'python service.py'
            },
        },
        'build': 'pip install -r ./requirements.txt',
    }
    app = App.get_or_create('app', 'http://git.hunantv.com/group/app.git', 'token')
    version = app.add_version(random_sha1())
    appconfig = version.appconfig
    appconfig.update(**appyaml)
    appconfig.save()

    group = Group.create('group', 'group')
    pod = Pod.create('pod', 'pod')
    pod.assigned_to_group(group)

    hosts = [Host.create(pod, random_ipv4(), random_string(prefix='host'), random_uuid(), 4, 4096) for i in range(4)]
    for host in hosts:
        host.assigned_to_group(group)

    containers = []
    for (host, count), cores in group.get_free_cores(pod, 4, 4, 0).iteritems():
        # ``cores`` is a dict keyed by 'full' (see the slice below), so the
        # per-container share must come from the list of full cores;
        # ``len(cores)`` would count the dict's keys instead.
        full_cores = cores['full']
        cores_per_container = len(full_cores) // count
        for i in range(count):
            cid = random_sha1()
            # NOTE(review): a plain list is passed here, while other callers
            # in this code base pass a {'full': [...]} dict -- confirm which
            # form Container.create expects.
            used_cores = full_cores[i * cores_per_container:(i + 1) * cores_per_container]
            c = Container.create(cid, host, version, random_string(), 'entrypoint', used_cores, 'env')
            containers.append(c)
        host.occupy_cores(cores, 0)

    return app, version, group, pod, hosts, containers
def rm_containers():
    """Create one removal task per (version, host) for the requested containers.

    The container ids come from ``cids`` in the request's JSON body; ids
    that cannot be resolved are ignored.  Every group is handed to
    ``remove_containers`` asynchronously with the image-removal flag set to
    ``False``.

    Returns a dict with ``r``, ``msg``, ``tasks`` (task ids) and
    ``watch_keys`` (the tasks' result keys).
    """
    wanted = request.get_json()['cids']

    by_version_host = {}
    for cid in wanted:
        container = Container.get_by_container_id(cid)
        if container:
            bucket = by_version_host.setdefault((container.version, container.host), [])
            bucket.append(container)

    created_ids = []
    keys_to_watch = []
    for (version, host), bucket in by_version_host.iteritems():
        bucket_ids = [item.id for item in bucket]
        props = {'container_ids': bucket_ids}
        task = Task.create(consts.TASK_REMOVE, version, host, props)
        remove_containers.apply_async(
            args=(task.id, bucket_ids, False),
            task_id='task:%d' % task.id,
        )
        created_ids.append(task.id)
        keys_to_watch.append(task.result_key)

    return {'r': 0, 'msg': 'ok', 'tasks': created_ids, 'watch_keys': keys_to_watch}
def migrate_container(container_id, need_to_remove=True):
    """Move container ``container_id`` to a host chosen by ``average_schedule``.

    Each failure (unknown container, no schedulable cores, task creation
    failing) is logged and ends the function without raising.  On the happy
    path a ``TASK_MIGRATE`` task is created, the old container is removed
    synchronously when ``need_to_remove`` is true, and a replacement is
    created synchronously with the old container's IPs.
    """
    old = Container.get_by_container_id(container_id)
    if not old:
        _log.error('container %s is not found, ignore migration', container_id)
        return

    ncore, nshare = old.host.pod.get_core_allocation(old.ncore)
    candidates = average_schedule(old.host.pod, 1, ncore, nshare, None)
    if not candidates:
        _log.error('not enough cores to migrate')
        return

    cids = [old.id]
    # The same IP list is used both as the networks and as the specific IPs.
    spec_ips = cidrs = old.get_ips()
    first_key, cores = next(candidates.iteritems())
    host, _container_count = first_key

    props = dict(
        ncontainer=1,
        entrypoint=old.entrypoint,
        env=old.env,
        full_cores=[core.label for core in cores.get('full', [])],
        part_cores=[core.label for core in cores.get('part', [])],
        ports=None,
        args=None,
        nshare=nshare,
        networks=cidrs,
        image=None,
        route='',
        callback_url=old.callback_url,
        container_ids=cids,
    )
    task = Task.create(consts.TASK_MIGRATE, old.version, host, props)
    if not task:
        _log.error('create migrate task error')
        return

    _log.info('start migration...')
    if need_to_remove:
        remove_containers.apply(
            args=(task.id, cids, False),
            task_id='task:%s' % task.id,
        )
    create_containers.apply(
        args=(task.id, 1, nshare, cores, cidrs, spec_ips),
        task_id='task:%s' % task.id,
    )
    _log.info('migration done')
def remove_containers(task_id, cids, rmi):
    """Remove the containers with ids ``cids`` on behalf of task ``task_id``.

    Returns early (after logging an error) when the task cannot be loaded.
    Otherwise a redis flag is set per container, the containers' backends
    are removed, service discovery is republished for the affected app
    names, and the containers are removed from the task's host.  When
    ``rmi`` is true the task version's image is removed from the host too.

    Any exception raised during those steps is logged with its traceback,
    the task is finished as failed and a failure notification is published;
    the exception is not re-raised.
    """
    task = Task.get(task_id)
    if not task:
        # Without this guard a missing task crashed with AttributeError on
        # ``task.host`` below.
        logger.error('Task (id=%s) not found, quit', task_id)
        return

    notifier = TaskNotifier(task)
    containers = Container.get_multi(cids)
    container_ids = [c.container_id for c in containers]
    host = task.host

    try:
        flags = {'eru:agent:%s:container:flag' % cid: 1 for cid in container_ids}
        rds.mset(**flags)
        for c in containers:
            remove_container_backends(c)
        appnames = {c.appname for c in containers}
        publish_to_service_discovery(*appnames)

        dockerjob.remove_host_containers(containers, host)
        if rmi:
            dockerjob.remove_image(task.version, host)
    except Exception as e:
        logger.exception(e)
        task.finish_with_result(code.TASK_FAILED)
        notifier.pub_fail()
def container_log(cid):
    """Forward the docker log stream of container ``cid`` to the websocket.

    ``stderr``/``stdout`` query arguments (ints, default 0) choose the
    streams and ``tail`` (int, default 10; 0 means ``'all'``) is handed to
    the docker client.  An unknown container closes the websocket and yields
    ``'websocket closed'``.  ``geventwebsocket.WebSocketError`` raised while
    streaming is logged.
    """
    query = request.args
    stderr_flag = query.get('stderr', type=int, default=0)
    stdout_flag = query.get('stdout', type=int, default=0)
    tail = query.get('tail', type=int, default=10)
    if tail == 0:
        # docker client's argument
        tail = 'all'

    socket = request.environ['wsgi.websocket']
    container = Container.get_by_container_id(cid)
    if not container:
        socket.close()
        logger.info('Container %s not found, close websocket' % cid)
        return 'websocket closed'

    try:
        client = get_docker_client(container.host.addr)
        log_lines = client.logs(
            cid,
            stream=True,
            stderr=bool(stderr_flag),
            stdout=bool(stdout_flag),
            tail=tail,
        )
        for log_line in log_lines:
            socket.send(log_line)
    except geventwebsocket.WebSocketError as err:
        logger.exception(err)
def bind_network(cid):
    """Bind IPs from the requested networks to the living container ``cid``.

    The request's JSON body supplies ``appname`` and ``networks`` (a list of
    network names).  Raises ``EruAbortException`` with
    ``consts.HTTP_NOT_FOUND`` when the container is missing, not alive, or
    belongs to another app, and with ``consts.HTTP_BAD_REQUEST`` when none
    of the networks exists or none of them yields an IP.

    Returns a dict with ``r`` and the acquired IPs under ``msg``.
    """
    payload = request.get_json()
    appname = payload.get('appname')

    container = Container.get_by_container_id(cid)
    if not container or not container.is_alive:
        raise EruAbortException(consts.HTTP_NOT_FOUND, 'Container %s not found' % cid)
    if container.appname != appname:
        raise EruAbortException(consts.HTTP_NOT_FOUND, 'Container does not belong to app')

    looked_up = [Network.get_by_name(name) for name in payload.get('networks', [])]
    networks = [network for network in looked_up if network]
    if not networks:
        raise EruAbortException(consts.HTTP_BAD_REQUEST, 'network empty')

    acquired = [network.acquire_ip() for network in networks]
    ips = [ip for ip in acquired if ip]
    if not ips:
        raise EruAbortException(consts.HTTP_BAD_REQUEST, 'no ip available')

    # One more than the largest network_id already bound to the container;
    # the -1 sentinel makes this 0 when nothing is bound yet.
    bound_ids = [ip.network_id for ip in container.ips.all()]
    bound_ids.append(-1)
    nid = max(bound_ids) + 1

    bind_container_ip(container, ips, nid=nid)
    for ip in ips:
        ip.assigned_to_container(container)
    return {'r': 0, 'msg': ips}
def fix_core(host): containers = Container.get_multi_by_host(host) # 没有的话, 直接销毁重建 if not containers: _create_cores_on_host(host, host.ncore) return data = {str(i): host.core_share for i in xrange(host.ncore)} for c in containers: cores = c.cores nshare = int(cores.get('nshare', '0')) for e in cores.get('full', []): e.remain = host.core_share data.pop(e.label) for s in cores.get('part', []): s.remain = host.core_share - nshare data[s.label] -= nshare c.cores = cores rds.delete(host._cores_key) if data: rds.zadd(host._cores_key, **data) print 'done', host
def rm_containers():
    """Queue asynchronous removal jobs for the containers named in the request.

    Reads ``cids`` from the request's JSON body and raises
    ``EruAbortException`` with ``consts.HTTP_BAD_REQUEST`` when any id is
    shorter than 7 characters.  Unknown ids are skipped.  Containers are
    grouped by ``(version, host)``; each group gets its own ``Task`` and
    ``remove_containers`` job (image-removal flag ``False``).

    Returns a dict with ``r``, ``msg``, ``tasks`` and ``watch_keys``.
    """
    requested = request.get_json()['cids']
    if any(len(cid) < 7 for cid in requested):
        raise EruAbortException(consts.HTTP_BAD_REQUEST, 'must given at least 7 chars for container_id')

    grouped = {}
    for cid in requested:
        found = Container.get_by_container_id(cid)
        if not found:
            continue
        group_key = (found.version, found.host)
        grouped.setdefault(group_key, []).append(found)

    task_ids, result_keys = [], []
    for (version, host), members in grouped.iteritems():
        member_ids = [member.id for member in members]
        task = Task.create(consts.TASK_REMOVE, version, host, {'container_ids': member_ids})
        remove_containers.apply_async(
            args=(task.id, member_ids, False),
            task_id='task:%d' % task.id,
        )
        task_ids.append(task.id)
        result_keys.append(task.result_key)

    return {'r': 0, 'msg': 'ok', 'tasks': task_ids, 'watch_keys': result_keys}
def poll_container(cid):
    """Report whether container ``cid`` is alive.

    Raises ``EruAbortException`` with ``consts.HTTP_NOT_FOUND`` when the
    container does not exist.  Otherwise returns a dict with ``r``, the
    container's id (``container``) and its ``is_alive`` value (``status``).
    """
    container = Container.get_by_container_id(cid)
    if container:
        return {'r': 0, 'container': container.container_id, 'status': container.is_alive}
    raise EruAbortException(consts.HTTP_NOT_FOUND, 'Container %s not found' % cid)
def cure_container(cid):
    """Call ``cure()`` on container ``cid`` when it exists.

    The response dict is the same whether or not the container was found.
    """
    container = Container.get_by_container_id(cid)
    if container:
        container.cure()
    response = {'r': 0, 'msg': code.OK}
    return response
def create_containers_with_macvlan(task_id, ncontainer, nshare, cores, network_ids, spec_ips=None):
    """
    Run the task identified by task_id.
    Deploy ncontainer containers, occupying the given cores and binding the
    containers to the given networks.

    Returns early (after logging an error) when the task cannot be loaded.
    For every per-container core split yielded by ``_iter_cores`` one docker
    container is created; a creation failure releases that split's cores and
    moves on.  IPs are acquired from each network (the specific ``spec_ips``
    when given) and announced to the agent via redis pub/sub or HTTP,
    depending on ``ERU_AGENT_API``.  One feedback message per IP is then
    awaited on a redis list (15 s timeout each).  Only when all of them
    report success is the ``Container`` record created; otherwise the docker
    container is removed and its cores and IPs are released.

    The task is always finished with ``consts.TASK_SUCCESS`` and the ids of
    the containers that were actually created.
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return

    if spec_ips is None:
        spec_ips = []

    need_network = bool(network_ids)
    networks = Network.get_multi(network_ids)

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    ports = task.props['ports']
    args = task.props['args']
    # use raw
    route = task.props['route']
    image = task.props['image']
    callback_url = task.props['callback_url']
    # Scale nshare relative to the pod's core_share onto a base of 1024;
    # without a share the value is 1024.
    cpu_shares = int(float(nshare) / host.pod.core_share * 1024) if nshare else 1024

    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    pub_agent_route_key = 'eru:agent:%s:route' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id

    cids = []

    for fcores, pcores in _iter_cores(cores, ncontainer):
        cores_for_one_container = {'full': fcores, 'part': pcores}
        try:
            cid, cname = dockerjob.create_one_container(host, version, entrypoint, env, fcores+pcores, ports=ports, args=args, cpu_shares=cpu_shares, image=image, need_network=need_network)
        except Exception as e:
            # printed so that it shows up in the celery log
            print e
            host.release_cores(cores_for_one_container, nshare)
            continue

        if spec_ips:
            ips = [n.acquire_specific_ip(ip) for n, ip in zip(networks, spec_ips)]
        else:
            ips = [n.acquire_ip() for n in networks]

        # drop networks that could not provide an IP
        ips = [i for i in ips if i]
        ip_dict = {ip.vlan_address: ip for ip in ips}

        if ips:
            if ERU_AGENT_API == 'pubsub':
                values = [str(task_id), cid] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
                rds.publish(pub_agent_vlan_key, '|'.join(values))
            elif ERU_AGENT_API == 'http':
                agent = get_agent(host)
                ip_list = [(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
                agent.add_container_vlan(cid, str(task_id), ip_list)

        # Wait for one feedback message per IP.  A ``break`` (timeout or
        # reported failure) skips the ``else`` branch and falls through to
        # the cleanup below; with no IPs the ``else`` branch runs at once.
        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break

            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)

            if route:
                rds.publish(pub_agent_route_key, '%s|%s' % (cid, route))

        else:
            current_flask.logger.info('Creating container (cid=%s, ips=%s)', cid, ips)
            c = Container.create(cid, host, version, cname, entrypoint, cores_for_one_container, env, nshare, callback_url)
            for ip in ips:
                ip.assigned_to_container(c)
            notifier.notify_agent(c)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # skip the cleanup below
            continue

        # clean up the failed container, release its cores and its IPs
        current_flask.logger.info('Cleaning failed container (cid=%s)', cid)
        dockerjob.remove_container_by_cid([cid], host)
        host.release_cores(cores_for_one_container, nshare)
        [ip.release() for ip in ips]
        # after a failure this key has to be cleaned up
        rds.delete(feedback_key)

    publish_to_service_discovery(version.name)
    task.finish_with_result(consts.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()

    # these do I/O, so they are simply done last
    falcon_all_graphs(version)
    falcon_all_alarms(version)

    current_flask.logger.info('Task<id=%s>: Done', task_id)
def get_container_by_id(id):
    """Return the container with primary key ``id``.

    Raises ``EruAbortException`` with ``consts.HTTP_NOT_FOUND`` when
    ``Container.get`` finds nothing.
    """
    container = Container.get(id)
    if container:
        return container
    raise EruAbortException(consts.HTTP_NOT_FOUND, 'Container %s not found' % id)
def remove_container(cid):
    """Remove the docker container ``cid`` from its host.

    Returns ``{'r': 1, 'msg': ...}`` when the container is unknown and
    ``{'r': 0, 'msg': consts.OK}`` after the removal call otherwise.
    """
    container = Container.get_by_container_id(cid)
    if container:
        dockerjob.remove_container_by_cid([cid], container.host)
        return {'r': 0, 'msg': consts.OK}
    return {'r': 1, 'msg': 'container %s not found' % cid}
def create_containers_with_macvlan(task_id, ncontainer, core_ids, network_ids):
    """
    Run the task identified by task_id.
    Deploy ncontainer containers, occupying the cores in core_ids and binding
    the containers to the given networks.

    Returns silently when the task cannot be loaded.  The cores are split
    evenly between the containers.  For each split one docker container is
    created; a creation failure is logged, releases the cores and moves on.
    One IP per network is acquired and announced to the agent through redis
    pub/sub; one feedback message per IP is then awaited on a redis list
    (15 s timeout each).  Only when all of them report success is the
    ``Container`` record created; otherwise the docker container is removed
    and its cores and IPs are released.

    The task is always finished with ``code.TASK_SUCCESS`` and the ids of the
    containers that were actually created.
    """
    task = Task.get(task_id)
    if not task:
        return

    networks = Network.get_multi(network_ids)

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    used_cores = Core.get_multi(core_ids)

    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id

    cids = []

    for cores in more_itertools.chunked(used_cores, len(core_ids) // ncontainer):
        try:
            cid, cname = dockerjob.create_one_container(host, version, entrypoint, env, cores)
        except Exception:
            # A bare ``except:`` here also swallowed SystemExit and
            # KeyboardInterrupt and hid the reason for the failure.
            logger.exception('Failed to create container for task %s', task_id)
            host.release_cores(cores)
            continue

        # Networks that cannot provide an IP are dropped so that the
        # attribute accesses on each IP below cannot hit a missing entry.
        ips = [ip for ip in (n.acquire_ip() for n in networks) if ip]
        ip_dict = {ip.vlan_address: ip for ip in ips}

        if ips:
            ident_id = cname.split('_')[-1]
            values = [str(task_id), cid, ident_id] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
            rds.publish(pub_agent_vlan_key, '|'.join(values))

        # Wait for one feedback message per IP.  A ``break`` (timeout or
        # reported failure) skips the ``else`` branch and falls through to
        # the cleanup below; with no IPs the ``else`` branch runs at once.
        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break

            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)

        else:
            logger.info('Creating container with cid %s and ips %s' % (cid, ips))
            c = Container.create(cid, host, version, cname, entrypoint, cores, env)
            for ip in ips:
                ip.assigned_to_container(c)
            notifier.notify_agent(cid)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # skip the cleanup below
            continue

        # clean up the failed container, release its cores and its IPs
        logger.info('Cleaning failed container with cid %s' % cid)
        dockerjob.remove_container_by_cid([cid], host)
        host.release_cores(cores)
        for ip in ips:
            ip.release()
        # Feedback still queued for the failed container must not be read
        # as the answer for the next one.
        rds.delete(feedback_key)

    publish_to_service_discovery(version.name)
    task.finish_with_result(code.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
def create_containers_with_macvlan_public(task_id, ncontainer, nshare, network_ids, spec_ips=None):
    """
    Run the task identified by task_id.
    Deploy ncontainer containers and bind them to the given networks.

    Returns early (after logging an error) when the task cannot be loaded.
    Containers are created without dedicated cores (``cores=None``) and with
    ``cpu_shares`` fixed at 1024.  IPs are acquired from each network (the
    specific ``spec_ips`` when given) and announced to the agent through
    redis pub/sub; one feedback message per IP is then awaited on a redis
    list (15 s timeout each).  Only when all of them report success is the
    ``Container`` record created; otherwise the docker container is removed
    and its IPs are released.

    The task is always finished with ``consts.TASK_SUCCESS`` and the ids of
    the containers that were actually created.
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return

    if spec_ips is None:
        spec_ips = []

    networks = Network.get_multi(network_ids)

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    # use raw
    image = task.props['image']
    cpu_shares = 1024

    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id

    cids = []

    for _ in range(ncontainer):
        try:
            cid, cname = dockerjob.create_one_container(host, version, entrypoint, env, cores=None, cpu_shares=cpu_shares, image=image)
        except Exception as e:
            print e  # same as above
            continue

        if spec_ips:
            ips = [n.acquire_specific_ip(ip) for n, ip in zip(networks, spec_ips)]
        else:
            ips = [n.acquire_ip() for n in networks]

        # drop networks that could not provide an IP
        ips = [i for i in ips if i]
        ip_dict = {ip.vlan_address: ip for ip in ips}

        if ips:
            ident_id = cname.split('_')[-1]
            values = [str(task_id), cid, ident_id] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
            rds.publish(pub_agent_vlan_key, '|'.join(values))

        # Wait for one feedback message per IP.  A ``break`` (timeout or
        # reported failure) skips the ``else`` branch and falls through to
        # the cleanup below; with no IPs the ``else`` branch runs at once.
        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break

            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)

        else:
            current_flask.logger.info('Creating container (cid=%s, ips=%s)', cid, ips)
            c = Container.create(cid, host, version, cname, entrypoint, {}, env, nshare)
            for ip in ips:
                ip.assigned_to_container(c)
            notifier.notify_agent(cid)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # skip the cleanup below
            continue

        # clean up the failed container and release its IPs
        current_flask.logger.info('Cleaning failed container (cid=%s)', cid)
        dockerjob.remove_container_by_cid([cid], host)
        [ip.release() for ip in ips]
        # after a failure this key has to be cleaned up
        rds.delete(feedback_key)

    publish_to_service_discovery(version.name)
    task.finish_with_result(consts.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
    current_flask.logger.info('Task<id=%s>: Done', task_id)
def test_container(test_db):
    """Containers occupy and release host cores as expected.

    Six 4-core hosts are created in one pod and the first three are assigned
    to the group.  Three scenarios run in sequence, each followed by
    deleting all containers and checking that every group host is back to 4
    free full cores:

    1. 3 containers with 3 full cores each and no partial core;
    2. 3 containers with 3 full cores plus one partial core (share 4);
    3. 6 containers with 1 full core plus one partial core (share 5).
    """
    a = App.get_or_create('app', 'http://git.hunantv.com/group/app.git', '')
    assert a is not None
    assert a.id == a.user_id

    v = a.add_version(random_sha1())
    assert v is not None
    assert v.app.id == a.id
    assert v.name == a.name
    assert len(v.containers.all()) == 0
    assert len(v.tasks.all()) == 0

    g = Group.create('group', 'group')
    p = Pod.create('pod', 'pod', 10, -1)
    assert p.assigned_to_group(g)
    hosts = [Host.create(p, random_ipv4(), random_string(prefix='host'), random_uuid(), 4, 4096) for i in range(6)]

    # only the first three hosts belong to the group
    for host in hosts[:3]:
        host.assigned_to_group(g)
    host_ids1 = {h.id for h in hosts[:3]}
    host_ids2 = {h.id for h in hosts[3:]}

    host_cores = g.get_free_cores(p, 3, 3, 0)

    # case: no fragment (partial) cores
    # acquire cores
    containers = []
    for (host, count), cores in host_cores.iteritems():
        cores_per_container = len(cores['full']) / count
        for i in range(count):
            cid = random_sha1()
            used_cores = {'full': cores['full'][i*cores_per_container:(i+1)*cores_per_container]}
            # not using a port
            c = Container.create(cid, host, v, random_string(), 'entrypoint', used_cores, 'env')
            assert c is not None
            containers.append(c)
        host.occupy_cores(cores, 0)

    for host in g.private_hosts.all():
        full_cores, part_cores = host.get_free_cores()
        assert len(full_cores) == 1
        assert len(part_cores) == 0
        assert len(host.containers.all()) == 1
        assert host.count == 1

    assert len(containers) == 3
    assert len(v.containers.all()) == 3

    for c in containers:
        assert c.host_id in host_ids1
        assert c.host_id not in host_ids2
        assert c.app.id == a.id
        assert c.version.id == v.id
        assert c.is_alive
        assert len(c.full_cores.all()) == 3
        assert len(c.part_cores.all()) == 0
        # used and free cores together must cover all four labels of the host
        all_core_labels = sorted(['0', '1', '2', '3', ])
        used_full_core_labels = [core.label for core in c.full_cores.all()]
        used_part_core_labels = [core.label for core in c.part_cores.all()]
        free_core_labels = [core.label for core in c.host.get_free_cores()[0]]
        assert all_core_labels == sorted(used_full_core_labels + used_part_core_labels + free_core_labels)

    # release cores
    for c in containers:
        c.delete()

    assert len(v.containers.all()) == 0
    assert g.get_max_containers(p, 3, 0) == 3
    host_cores = g.get_free_cores(p, 3, 3, 0)
    assert len(host_cores) == 3

    for host in g.private_hosts.all():
        full_cores, part_cores = host.get_free_cores()
        assert len(full_cores) == 4
        assert len(part_cores) == 0
        assert len(host.containers.all()) == 0
        assert host.count == 0

    # case: with fragment (partial) cores
    # acquire cores
    host_cores = g.get_free_cores(p, 3, 3, 4)
    containers = []
    for (host, count), cores in host_cores.iteritems():
        cores_per_container = len(cores['full']) / count
        for i in range(count):
            cid = random_sha1()
            used_cores = {'full': cores['full'][i*cores_per_container:(i+1)*cores_per_container], 'part': cores['part']}
            # not using a port
            c = Container.create(cid, host, v, random_string(), 'entrypoint', used_cores, 'env')
            assert c is not None
            containers.append(c)
        host.occupy_cores(cores, 4)

    for host in g.private_hosts.all():
        full_cores, part_cores = host.get_free_cores()
        assert len(full_cores) == 0
        assert len(part_cores) == 1
        assert part_cores[0].used == 4
        assert len(host.containers.all()) == 1
        assert host.count == 1

    assert len(containers) == 3
    assert len(v.containers.all()) == 3

    for c in containers:
        assert c.host_id in host_ids1
        assert c.host_id not in host_ids2
        assert c.app.id == a.id
        assert c.version.id == v.id
        assert c.is_alive
        assert len(c.full_cores.all()) == 3
        assert len(c.part_cores.all()) == 1
        # used and free cores together must cover all four labels of the host
        all_core_labels = sorted(['0', '1', '2', '3', ])
        used_full_core_labels = [core.label for core in c.full_cores.all()]
        used_part_core_labels = [core.label for core in c.part_cores.all()]
        free_core_labels = [core.label for core in c.host.get_free_cores()[0]]
        assert all_core_labels == sorted(used_full_core_labels + used_part_core_labels + free_core_labels)

    # release cores
    for c in containers:
        c.delete(4)

    assert len(v.containers.all()) == 0
    assert g.get_max_containers(p, 3, 0) == 3
    host_cores = g.get_free_cores(p, 3, 3, 0)
    assert len(host_cores) == 3

    for host in g.private_hosts.all():
        full_cores, part_cores = host.get_free_cores()
        assert len(full_cores) == 4
        assert len(host.containers.all()) == 0
        assert host.count == 0

    # acquire
    host_cores = g.get_free_cores(p, 6, 1, 5)
    containers = []
    for (host, count), cores in host_cores.iteritems():
        cores_per_container = len(cores['full']) / count
        for i in range(count):
            cid = random_sha1()
            used_cores = {'full': cores['full'][i*cores_per_container:(i+1)*cores_per_container], 'part': cores['part'][i:i+1]}
            # not using a port
            c = Container.create(cid, host, v, random_string(), 'entrypoint', used_cores, 'env')
            assert c is not None
            containers.append(c)
            # here the cores are occupied per container, not per host
            host.occupy_cores(used_cores, 5)

    for host in g.private_hosts.all():
        full_cores, part_cores = host.get_free_cores()
        assert len(full_cores) == 1
        assert len(part_cores) == 0
        assert len(host.containers.all()) == 2
        assert host.count == 2

    assert len(containers) == 6
    assert len(v.containers.all()) == 6

    for c in containers:
        assert c.host_id in host_ids1
        assert c.host_id not in host_ids2
        assert c.app.id == a.id
        assert c.version.id == v.id
        assert c.is_alive
        assert len(c.full_cores.all()) == 1
        assert len(c.part_cores.all()) == 1

    # release cores
    for c in containers:
        c.delete(5)

    assert len(v.containers.all()) == 0
    assert g.get_max_containers(p, 3, 0) == 3
    host_cores = g.get_free_cores(p, 3, 3, 0)
    assert len(host_cores) == 3

    for host in g.private_hosts.all():
        full_cores, part_cores = host.get_free_cores()
        assert len(full_cores) == 4
        assert len(part_cores) == 0
        assert len(host.containers.all()) == 0
        assert host.count == 0
def migrate_container_set(host):
    """Pass every container returned for ``host`` to ``add_container_for_agent``.

    :param host: host object forwarded to ``Container.get_multi_by_host``.
    """
    for container in Container.get_multi_by_host(host):
        add_container_for_agent(container)
def test_container(test_db):
    """Exercise core scheduling, container creation and release on one pod.

    Six hosts are created and the last three are marked public.  Three rounds
    are then run against the pod's private hosts:

    1. three containers with three full cores each and no part core;
    2. three containers with three full cores plus one part core each;
    3. six containers with one full core plus one part core each.

    After every round all containers are deleted and each private host is
    checked to report four free full cores again.
    """
    app = App.get_or_create('app', 'http://git.hunantv.com/group/app.git')
    assert app is not None
    assert app.id == app.user_id

    version = app.add_version(random_sha1())
    assert version is not None
    assert version.app.id == app.id
    assert version.name == app.name
    assert len(version.containers.all()) == 0
    assert len(version.tasks.all()) == 0

    pod = Pod.create('pod', 'pod', 10, -1)
    hosts = []
    for _ in range(6):
        hosts.append(
            Host.create(pod, random_ipv4(), random_string(prefix='host'),
                        random_uuid(), 4, 4096)
        )

    # The second half of the hosts is made public; containers must never
    # land on them in the rounds below.
    for public_host in hosts[3:]:
        public_host.set_public()

    private_host_ids = {h.id for h in hosts[:3]}
    public_host_ids = {h.id for h in hosts[3:]}

    schedule = centralized_schedule(pod, 3, 3, 0)

    # ---- Round 1: no fragment (part) cores ----
    # Acquire cores.
    created = []
    for (host, count), cores in schedule.iteritems():
        host.occupy_cores(cores, 0)
        # Plain `/` on two ints; the slice bounds below need an integer.
        per_container = len(cores['full']) / count
        for index in range(count):
            cid = random_sha1()
            start = index * per_container
            stop = start + per_container
            used_cores = {'full': cores['full'][start:stop]}
            container = Container.create(
                cid, host, version, random_string(), 'entrypoint',
                used_cores, 'env', nshare=0)
            assert container is not None
            created.append(container)

    for host in pod.get_private_hosts():
        free_full, free_part = host.get_free_cores()
        assert len(free_full) == 1
        assert len(free_part) == 0
        assert len(host.containers.all()) == 1
        assert host.count == 1

    assert len(created) == 3
    assert len(version.containers.all()) == 3

    for container in created:
        assert container.host_id in private_host_ids
        assert container.host_id not in public_host_ids
        assert container.app.id == app.id
        assert container.version.id == version.id
        assert container.is_alive
        assert len(container.full_cores) == 3
        assert len(container.part_cores) == 0

        # Used full + used part + free full cores must cover labels 0..3.
        expected_labels = sorted(['0', '1', '2', '3'])
        seen_labels = [core.label for core in container.full_cores]
        seen_labels += [core.label for core in container.part_cores]
        seen_labels += [core.label for core in container.host.get_free_cores()[0]]
        assert expected_labels == sorted(seen_labels)

    # Release cores.
    for container in created:
        container.delete()

    assert len(version.containers.all()) == 0
    assert get_max_container_count(pod, 3, 0) == 3
    schedule = centralized_schedule(pod, 3, 3, 0)
    assert len(schedule) == 3

    for host in pod.get_private_hosts():
        free_full, free_part = host.get_free_cores()
        assert len(free_full) == 4
        assert len(free_part) == 0
        assert len(host.containers.all()) == 0
        assert host.count == 4

    # ---- Round 2: with fragment (part) cores ----
    # Acquire cores.
    schedule = centralized_schedule(pod, 3, 3, 4)
    created = []
    for (host, count), cores in schedule.iteritems():
        per_container = len(cores['full']) / count
        host.occupy_cores(cores, 4)
        for index in range(count):
            cid = random_sha1()
            start = index * per_container
            stop = start + per_container
            used_cores = {
                'full': cores['full'][start:stop],
                'part': cores['part'],
            }
            # not using a port
            container = Container.create(
                cid, host, version, random_string(), 'entrypoint',
                used_cores, 'env', nshare=4)
            assert container is not None
            created.append(container)

    for host in pod.get_private_hosts():
        free_full, free_part = host.get_free_cores()
        assert len(free_full) == 0
        assert len(free_part) == 1
        assert free_part[0].remain == 6
        assert len(host.containers.all()) == 1
        assert host.count == D('0.6')

    assert len(created) == 3
    assert len(version.containers.all()) == 3

    for container in created:
        assert container.host_id in private_host_ids
        assert container.host_id not in public_host_ids
        assert container.app.id == app.id
        assert container.version.id == version.id
        assert container.is_alive
        assert len(container.full_cores) == 3
        assert len(container.part_cores) == 1

        expected_labels = sorted(['0', '1', '2', '3'])
        seen_labels = [core.label for core in container.full_cores]
        seen_labels += [core.label for core in container.part_cores]
        seen_labels += [core.label for core in container.host.get_free_cores()[0]]
        assert expected_labels == sorted(seen_labels)

    # Release cores.
    for container in created:
        container.delete()

    assert len(version.containers.all()) == 0
    assert get_max_container_count(pod, 3, 0) == 3
    schedule = centralized_schedule(pod, 3, 3, 0)
    assert len(schedule) == 3

    for host in pod.get_private_hosts():
        free_full, free_part = host.get_free_cores()
        assert len(free_full) == 4
        assert len(host.containers.all()) == 0
        assert host.count == 4

    # ---- Round 3: six containers, one full and one part core each ----
    # Acquire.
    schedule = centralized_schedule(pod, 6, 1, 5)
    created = []
    for (host, count), cores in schedule.iteritems():
        per_container = len(cores['full']) / count
        for index in range(count):
            cid = random_sha1()
            start = index * per_container
            stop = start + per_container
            used_cores = {
                'full': cores['full'][start:stop],
                'part': cores['part'][index:index + 1],
            }
            # In this round cores are occupied per container, not per host.
            host.occupy_cores(used_cores, 5)
            # not using a port
            container = Container.create(
                cid, host, version, random_string(), 'entrypoint',
                used_cores, 'env', nshare=5)
            assert container is not None
            created.append(container)

    for host in pod.get_private_hosts():
        free_full, free_part = host.get_free_cores()
        assert len(free_full) == 1
        assert len(free_part) == 0
        assert len(host.containers.all()) == 2
        assert host.count == D('1')

    assert len(created) == 6
    assert len(version.containers.all()) == 6

    for container in created:
        assert container.host_id in private_host_ids
        assert container.host_id not in public_host_ids
        assert container.app.id == app.id
        assert container.version.id == version.id
        assert container.is_alive
        assert len(container.full_cores) == 1
        assert len(container.part_cores) == 1

    # Release cores.
    for container in created:
        container.delete()

    assert len(version.containers.all()) == 0
    assert get_max_container_count(pod, 3, 0) == 3
    schedule = centralized_schedule(pod, 3, 3, 0)
    assert len(schedule) == 3

    for host in pod.get_private_hosts():
        free_full, free_part = host.get_free_cores()
        assert len(free_full) == 4
        assert len(free_part) == 0
        assert len(host.containers.all()) == 0
        assert host.count == 4
def stop_container(cid):
    """Kill the container identified by ``cid`` and stop it on its host.

    When no container is found for ``cid`` nothing is stopped; the same
    success payload is returned either way.

    :param cid: container id passed to ``Container.get_by_container_id``.
    :return: dict with ``'r'`` set to 0 and ``'msg'`` set to ``code.OK``.
    """
    target = Container.get_by_container_id(cid)
    if target:
        # Mark the record as killed first, then stop it through dockerjob.
        target.kill()
        dockerjob.stop_containers([target], target.host)

    response = {'r': 0, 'msg': code.OK}
    return response
def create_containers(task_id, ncontainer, nshare, cores, network_ids, spec_ips=None):
    """Run the deployment task identified by ``task_id``.

    Tries to create one container on the task's host for every full/part
    core pair yielded by ``_iter_cores(cores, ncontainer)`` and to allocate
    IPs for it from the pools looked up via ``network_ids``.

    A container whose creation raises has its cores released and is skipped.
    A container whose IP allocation fails is passed to
    ``_clean_failed_containers`` and skipped.  If the entrypoint config has a
    ``health_check`` and ``wait_health_check`` returns a falsy value, the
    function logs and returns without finishing the task.

    :param task_id: id passed to ``Task.get``; if no task is found an error
        is logged and the function returns.
    :param ncontainer: container count forwarded to ``_iter_cores``.
    :param nshare: share value stored with each container; also used to
        derive ``cpu_shares`` (1024 when falsy).
    :param cores: core allocation forwarded to ``_iter_cores``.
    :param network_ids: ids passed to ``ipam.get_pool``; containers are
        created with ``need_network=True`` when this is non-empty.
    :param spec_ips: optional value forwarded to ``ipam.allocate_ips``;
        ``None`` is replaced by an empty list.
    """
    _log.info('Task<id=%s>: Started', task_id)
    task = Task.get(task_id)
    if not task:
        _log.error('Task (id=%s) not found, quit', task_id)
        return

    if spec_ips is None:
        spec_ips = []

    need_network = bool(network_ids)
    networks = [ipam.get_pool(network_id) for network_id in network_ids]

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    ports = task.props['ports']
    args = task.props['args']
    # use raw
    image = task.props['image']
    callback_url = task.props['callback_url']

    if nshare:
        cpu_shares = int(float(nshare) / host.pod.core_share * 1024)
    else:
        cpu_shares = 1024

    created_ids = []
    backends = []
    entry = version.appconfig.entrypoints[entrypoint]

    for full_part, share_part in _iter_cores(cores, ncontainer):
        container_cores = {'full': full_part, 'part': share_part}

        # Create the container on the host machine.
        try:
            cid, cname = dockerjob.create_one_container(
                host, version, entrypoint, env, full_part + share_part,
                ports=ports, args=args, cpu_shares=cpu_shares,
                image=image, need_network=need_network)
        except Exception as exc:
            # Logged so that it shows up in the celery log.
            _log.exception(exc)
            host.release_cores(container_cores, nshare)
            continue

        # Record the container.
        container = Container.create(
            cid, host, version, cname, entrypoint,
            container_cores, env, nshare, callback_url)

        # Create the network stack for the container and record everything.
        # On failure, clean up all records and the container on the host,
        # then move on to the next attempt.
        cidrs = [network.netspace for network in networks]
        allocated = ipam.allocate_ips(cidrs, cid, spec_ips)
        if not allocated:
            _clean_failed_containers(cid)
            continue

        notifier.notify_agent(container)
        add_container_for_agent(host, container)
        add_container_backends(container)
        created_ids.append(cid)
        backends.extend(container.get_backends())

        container.callback_report(status='start')

    health_check = entry.get('health_check', '')
    if health_check and backends:
        urls = [backend + health_check for backend in backends]
        healthy = wait_health_check(urls)
        if not healthy:
            # TODO: either roll back or raise an alert here
            _log.info('Task<id=%s>: Done, but something went error', task_id)
            return

    publish_to_service_discovery(version.name)
    task.finish(consts.TASK_SUCCESS)
    task.reason = 'ok'
    task.container_ids = created_ids
    notifier.pub_success()
    _log.info('Task<id=%s>: Done', task_id)