def get_network(id_or_name):
    if id_or_name.isdigit():
        n = Network.get(id_or_name)
    else:
        n = Network.get_by_name(id_or_name)
    if not n:
        abort(404, 'Network %s not found' % id_or_name)
    return n
def create_public(group_name, pod_name, appname):
    """
    ncontainer: int, number of containers
    version: string, deploy version
    expose: bool, true or false, default true
    """
    data = request.get_json()
    group, pod, application, version = validate_instance(group_name, pod_name,
            appname, data['version'])

    networks = Network.get_multi(data.get('networks', []))
    ncontainer = int(data['ncontainer'])
    appconfig = version.appconfig
    if data['entrypoint'] not in appconfig.entrypoints:
        raise EruAbortException(code.HTTP_BAD_REQUEST,
                'Entrypoint %s not in app.yaml' % data['entrypoint'])

    tasks_info = []
    with rds.lock('%s:%s' % (group_name, pod_name)):
        try:
            # round-robin so containers are spread as evenly as possible
            hosts = pod.get_free_public_hosts(ncontainer)
            for host in itertools.islice(itertools.cycle(hosts), ncontainer):
                tasks_info.append(
                    (version, host, 1, [], networks, data['entrypoint'], data['env'])
                )
        except Exception as e:
            logger.exception(e)
            raise EruAbortException(code.HTTP_BAD_REQUEST, str(e))
def create_public(group_name, pod_name, appname):
    """Same parameters as create_private, except the number of cores cannot be specified."""
    data = request.get_json()
    if data.get('raw', ''):
        vstr = consts.RAW_VERSION_PLACEHOLDER
    else:
        vstr = data['version']

    group, pod, application, version = validate_instance(group_name, pod_name,
            appname, vstr)

    networks = Network.get_multi(data.get('networks', []))
    spec_ips = data.get('spec_ips', [])
    ncontainer = int(data['ncontainer'])
    appconfig = version.appconfig
    if data['entrypoint'] not in appconfig.entrypoints:
        current_app.logger.error('Entrypoint not in app.yaml (entry=%s, name=%s, version=%s)',
                data['entrypoint'], appname, version.short_sha)
        raise EruAbortException(consts.HTTP_BAD_REQUEST,
                'Entrypoint %s not in app.yaml' % data['entrypoint'])

    ts, keys = [], []
    with rds.lock('%s:%s' % (group_name, pod_name)):
        hosts = pod.get_free_public_hosts(ncontainer)
        for host in itertools.islice(itertools.cycle(hosts), ncontainer):
            t = _create_task(consts.TASK_CREATE, version, host, 1, {}, 0,
                    networks, spec_ips, data['entrypoint'], data['env'],
                    image=data.get('image', ''))
            if not t:
                continue
            ts.append(t.id)
            keys.append(t.result_key)

    return {'r': 0, 'msg': 'ok', 'tasks': ts, 'watch_keys': keys}
def create_public(group_name, pod_name, appname):
    """Same parameters as create_private, except the number of cores cannot be specified."""
    data = request.get_json()
    group, pod, application, version = validate_instance(group_name, pod_name,
            appname, data['version'])

    networks = Network.get_multi(data.get('networks', []))
    ncontainer = int(data['ncontainer'])
    appconfig = version.appconfig
    if data['entrypoint'] not in appconfig.entrypoints:
        raise EruAbortException(code.HTTP_BAD_REQUEST,
                'Entrypoint %s not in app.yaml' % data['entrypoint'])

    ts, keys = [], []
    with rds.lock('%s:%s' % (group_name, pod_name)):
        hosts = pod.get_free_public_hosts(ncontainer)
        for host in itertools.islice(itertools.cycle(hosts), ncontainer):
            t = _create_task(code.TASK_CREATE, version, host, 1, {}, 0,
                    networks, data['entrypoint'], data['env'])
            if not t:
                continue
            ts.append(t.id)
            keys.append(t.result_key)

    return {'r': 0, 'msg': 'ok', 'tasks': ts, 'watch_keys': keys}
def test_network(test_db):
    n = Network.create('net', '10.1.0.0/16')
    assert n is not None
    assert len(n.ips.all()) == 0
    assert n.hostmask_string == '16'
    assert n.pool_size == 65436
    assert n.used_count == 0

    ip = n.acquire_ip()
    assert ip is not None
    assert ip.network_id == n.id
    assert ip.vethname == ''
    assert not ip.container_id
    assert ip.hostmask == n.hostmask_string
    assert ip.vlan_seq_id == n.id
    assert ip.address.startswith('10.1')

    assert len(n.ips.all()) == 1
    assert n.pool_size == 65435
    assert n.used_count == 1

    ip.release()
    assert len(n.ips.all()) == 0
    assert n.pool_size == 65436
    assert n.used_count == 0
def take_gateway(netspace, ip, host):
    net = Network.get_by_netspace(netspace)
    if not net:
        print 'net %s not found' % netspace
        return

    ipnum = IPAddress(ip).value
    rds.srem(net.gatekey, ipnum)
    VLanGateway.create(ipnum, net.id, host.id)
    print '%s on %s --> %s done' % (ip, host.ip, netspace)
def create_private(group_name, pod_name, appname):
    """ncore: number of cores required per container; may be fractional, e.g. 1.5"""
    data = request.get_json()
    if data.get('raw', ''):
        vstr = consts.RAW_VERSION_PLACEHOLDER
    else:
        vstr = data['version']

    group, pod, application, version = validate_instance(group_name, pod_name,
            appname, vstr)

    # TODO check if group has this pod
    core_require = int(float(data['ncore']) * pod.core_share)  # how many core slices one container needs
    ncore = core_require / pod.core_share
    nshare = core_require % pod.core_share

    ncontainer = int(data['ncontainer'])
    networks = Network.get_multi(data.get('networks', []))
    spec_ips = data.get('spec_ips', [])
    appconfig = version.appconfig

    # a specific host; if not given, let the scheduler pick hosts
    hostname = data.get('hostname', '')
    host = hostname and Host.get_by_name(hostname) or None
    if host and not (host.group_id == group.id and host.pod_id == pod.id):
        current_app.logger.error('Host must belong to pod/group (hostname=%s, pod=%s, group=%s)',
                host, pod_name, group_name)
        raise EruAbortException(consts.HTTP_BAD_REQUEST,
                'Host must belong to this pod and group')

    if data['entrypoint'] not in appconfig.entrypoints:
        current_app.logger.error('Entrypoint not in app.yaml (entry=%s, name=%s, version=%s)',
                data['entrypoint'], appname, version.short_sha)
        raise EruAbortException(consts.HTTP_BAD_REQUEST,
                'Entrypoint %s not in app.yaml' % data['entrypoint'])

    ts, keys = [], []
    with rds.lock('%s:%s' % (group_name, pod_name)):
        host_cores = group.get_free_cores(pod, ncontainer, ncore, nshare, spec_host=host)
        if not host_cores:
            current_app.logger.error('Not enough cores (name=%s, version=%s, ncore=%s)',
                    appname, version.short_sha, data['ncore'])
            raise EruAbortException(consts.HTTP_BAD_REQUEST, 'Not enough core resources')

        for (host, container_count), cores in host_cores.iteritems():
            t = _create_task(consts.TASK_CREATE, version, host, container_count,
                    cores, nshare, networks, spec_ips, data['entrypoint'],
                    data['env'], image=data.get('image', ''))
            if not t:
                continue

            host.occupy_cores(cores, nshare)
            ts.append(t.id)
            keys.append(t.result_key)

    return {'r': 0, 'msg': 'ok', 'tasks': ts, 'watch_keys': keys}
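# A minimal sketch of the core_require arithmetic used in create_private above,
# assuming a pod whose core_share is 10 (an illustrative value, not taken from the
# source): requesting '1.5' cores yields one whole core plus 5 of 10 fractional slices.
def _split_core_request(ncore_str, core_share=10):  # hypothetical helper, for illustration only
    core_require = int(float(ncore_str) * core_share)
    return core_require // core_share, core_require % core_share

assert _split_core_request('1.5') == (1, 5)
assert _split_core_request('2') == (2, 0)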
def check_addr(addr):
    """addr is like 10.20.0.1/16 or 10.100.3.12/24"""
    try:
        interface = IPv4Interface(addr)
    except AddressValueError:
        abort(400, 'Not valid interface')

    net = Network.get_by_netspace(interface.network.compressed)
    if not net:
        abort(400, 'Interface not found')

    return {'r': 0, 'msg': consts.OK, 'result': interface.ip in net}
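# Sketch of the standard-library behaviour check_addr relies on (the membership
# test `interface.ip in net` is project-specific Network model code, not shown here):
from ipaddress import IPv4Interface

iface = IPv4Interface(u'10.20.0.1/16')
assert iface.network.compressed == '10.20.0.0/16'  # key used for Network.get_by_netspace
assert str(iface.ip) == '10.20.0.1'                # bare address tested against the pool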
def create_network():
    data = request.get_json()
    n = Network.create(data['name'], data['netspace'])
    if not n:
        current_app.logger.info('Network create failed (name=%s, net=%s)',
                data['name'], data['netspace'])
        abort(400, 'Network create failed')

    current_app.logger.info('Network create succeeded (name=%s, net=%s)',
            data['name'], data['netspace'])
    return {'r': 0, 'msg': consts.OK}
def create_network():
    data = request.get_json()
    n = Network.create(data['name'], data['netspace'])
    if not n:
        current_app.logger.info('Network create failed (name=%s, net=%s)',
                data['name'], data['netspace'])
        raise EruAbortException(consts.HTTP_BAD_REQUEST, 'Network create failed')

    current_app.logger.info('Network create succeeded (name=%s, net=%s)',
            data['name'], data['netspace'])
    return {'r': 0, 'msg': consts.OK}
def check_addr(addr):
    """addr is like 10.20.0.1/16 or 10.100.3.12/24"""
    try:
        interface = IPv4Interface(addr)
    except AddressValueError:
        raise EruAbortException(consts.HTTP_BAD_REQUEST, 'Not valid interface')

    net = Network.get_by_netspace(interface.network.compressed)
    if not net:
        raise EruAbortException(consts.HTTP_NOT_FOUND, 'Interface not found')

    return {'r': 0, 'msg': consts.OK, 'result': interface.ip in net}
def test_network(test_db):
    n = Network.create('net', '10.1.0.0/16')
    assert n is not None
    assert len(n.ips.all()) == 0
    assert n.hostmask_string == '16'
    assert n.pool_size == 65436
    assert n.used_count == 0
    assert n.used_gate_count == 0
    assert n.gate_pool_size == 100

    ip = n.acquire_ip()
    assert ip is not None
    assert ip.network_id == n.id
    assert ip.vethname == ''
    assert not ip.container_id
    assert ip.hostmask == n.hostmask_string
    assert ip.vlan_seq_id == n.id
    assert ip.address.startswith('10.1')

    assert len(n.ips.all()) == 1
    assert n.pool_size == 65435
    assert n.used_count == 1

    ip.release()
    assert len(n.ips.all()) == 0
    assert n.pool_size == 65436
    assert n.used_count == 0

    p = Pod.create('pod', 'pod', 10, -1)
    host = Host.create(p, random_ipv4(), random_string(prefix='host'), random_uuid(), 4, 4096)

    gate = n.acquire_gateway_ip(host)
    assert gate is not None
    assert gate.network_id == n.id
    assert gate.vlan_address.startswith('10.1.0.')
    assert gate.vlan_seq_id == n.id
    assert gate.name == 'vlan.%02d.br' % n.id

    g = VLanGateway.get_by_host_and_network(host.id, n.id)
    assert g is not None
    assert g.id == gate.id
    assert len(host.list_vlans()) == 1
    assert n.used_gate_count == 1
    assert n.gate_pool_size == 99

    gate.release()
    assert n.used_gate_count == 0
    assert n.gate_pool_size == 100
    assert VLanGateway.get_by_host_and_network(host.id, n.id) is None
    assert len(host.list_vlans()) == 0
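# How the /16 pool numbers in the test above fit together (a reading of the asserts,
# not an independent spec): a /16 has 2**16 addresses and the network reserves a
# gateway pool of 100, leaving 65436 assignable IPs.
assert 2 ** 16 - 100 == 65436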
def create_public(group_name, pod_name, appname):
    data = request.get_json()
    vstr = data['version']
    group, pod, _, version = validate_instance(group_name, pod_name, appname, vstr)

    ports = data.get('ports', [])
    args = data.get('args', [])
    callback_url = data.get('callback_url', '')
    if callback_url and not is_strict_url(callback_url):
        abort(400, 'callback_url must start with http:// or https://')

    networks = Network.get_multi(data.get('networks', []))
    spec_ips = data.get('spec_ips', [])
    appconfig = version.appconfig

    ncontainer = int(data['ncontainer'])
    if not ncontainer:
        abort(400, 'ncontainer must be > 0')

    entrypoint = data['entrypoint']
    if entrypoint not in appconfig.entrypoints:
        abort(400, 'Entrypoint %s not in app.yaml' % entrypoint)

    ts, keys = [], []
    with rds.lock('%s:%s' % (group_name, pod_name)):
        hosts = pod.get_free_public_hosts(ncontainer)
        for host in itertools.islice(itertools.cycle(hosts), ncontainer):
            t = _create_task(
                version,
                host,
                1,
                {},
                0,
                networks,
                ports,
                args,
                spec_ips,
                data['entrypoint'],
                data['env'],
                image=data.get('image', ''),
                callback_url=callback_url,
            )
            if not t:
                continue
            ts.append(t.id)
            keys.append(t.result_key)

    return {'r': 0, 'msg': 'ok', 'tasks': ts, 'watch_keys': keys}
def macvlan(id_or_name):
    host = _get_host(id_or_name)

    if request.method == 'GET':
        return host.list_vlans(g.start, g.limit)

    data = request.get_json()
    netname = data.get('network', '')
    network = Network.get_by_name(netname)
    if not network:
        abort(404, 'Network not found')

    vg = VLanGateway.get_by_host_and_network(host.id, network.id)

    if request.method == 'POST':
        return vg or network.acquire_gateway_ip(host)
    elif request.method == 'DELETE':
        if vg:
            vg.release()
        return {'r': 0, 'msg': consts.OK}
def create_public(group_name, pod_name, appname):
    """Same parameters as create_private, except the number of cores cannot be specified."""
    data = request.get_json()
    if data.get("raw", ""):
        vstr = consts.RAW_VERSION_PLACEHOLDER
    else:
        vstr = data["version"]

    group, pod, application, version = validate_instance(group_name, pod_name, appname, vstr)

    networks = Network.get_multi(data.get("networks", []))
    spec_ips = data.get("spec_ips", [])
    ncontainer = int(data["ncontainer"])
    appconfig = version.appconfig
    if data["entrypoint"] not in appconfig.entrypoints:
        current_app.logger.error(
            "Entrypoint not in app.yaml (entry=%s, name=%s, version=%s)",
            data["entrypoint"], appname, version.short_sha
        )
        raise EruAbortException(consts.HTTP_BAD_REQUEST, "Entrypoint %s not in app.yaml" % data["entrypoint"])

    ts, keys = [], []
    with rds.lock("%s:%s" % (group_name, pod_name)):
        hosts = pod.get_free_public_hosts(ncontainer)
        for host in itertools.islice(itertools.cycle(hosts), ncontainer):
            t = _create_task(
                consts.TASK_CREATE,
                version,
                host,
                1,
                {},
                0,
                networks,
                spec_ips,
                data["entrypoint"],
                data["env"],
                image=data.get("image", ""),
            )
            if not t:
                continue
            ts.append(t.id)
            keys.append(t.result_key)

    return {"r": 0, "msg": "ok", "tasks": ts, "watch_keys": keys}
def create_private(group_name, pod_name, appname):
    """
    ncore: int, cpu cores per container, -1 means shared
    ncontainer: int, number of containers
    version: string, deploy version
    expose: bool, true or false, default true
    """
    data = request.get_json()
    group, pod, application, version = validate_instance(group_name, pod_name,
            appname, data['version'])

    # TODO check if group has this pod
    ncore = int(data['ncore'])  # how many cores one container needs
    ncontainer = int(data['ncontainer'])
    networks = Network.get_multi(data.get('networks', []))
    appconfig = version.appconfig
    if data['entrypoint'] not in appconfig.entrypoints:
        raise EruAbortException(code.HTTP_BAD_REQUEST,
                'Entrypoint %s not in app.yaml' % data['entrypoint'])

    tasks_info = []
    with rds.lock('%s:%s' % (group_name, pod_name)):
        # not enough capacity
        if ncore > 0 and group.get_max_containers(pod, ncore) < ncontainer:
            raise EruAbortException(code.HTTP_BAD_REQUEST, 'Not enough cores')

        try:
            host_cores = group.get_free_cores(pod, ncontainer, ncore)
            # this pod does not even have enough hosts
            if not host_cores:
                raise EruAbortException(code.HTTP_BAD_REQUEST, 'Not enough cores')

            for (host, container_count), cores in host_cores.iteritems():
                tasks_info.append(
                    (version, host, container_count, cores, networks, data['entrypoint'], data['env'])
                )
                host.occupy_cores(cores)
        except Exception as e:
            logger.exception(e)
            raise EruAbortException(code.HTTP_BAD_REQUEST, str(e))
def bind_network(cid):
    data = request.get_json()
    appname = data.get('appname')

    c = Container.get_by_container_id(cid)
    if not (c and c.is_alive):
        raise EruAbortException(consts.HTTP_NOT_FOUND, 'Container %s not found' % cid)

    if c.appname != appname:
        raise EruAbortException(consts.HTTP_NOT_FOUND, 'Container does not belong to app')

    network_names = data.get('networks', [])
    networks = filter(None, [Network.get_by_name(n) for n in network_names])
    if not networks:
        raise EruAbortException(consts.HTTP_BAD_REQUEST, 'network empty')

    ips = filter(None, [n.acquire_ip() for n in networks])
    if not ips:
        raise EruAbortException(consts.HTTP_BAD_REQUEST, 'no ip available')

    nid = max([ip.network_id for ip in c.ips.all()] + [-1]) + 1
    bind_container_ip(c, ips, nid=nid)
    for ip in ips:
        ip.assigned_to_container(c)

    return {'r': 0, 'msg': ips}
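# Sketch of the nid computation in bind_network above: nid is one past the highest
# network_id already bound to the container, or 0 when the container has no IPs yet.
assert max([] + [-1]) + 1 == 0       # no existing IPs -> first binding gets nid 0
assert max([2, 5] + [-1]) + 1 == 6   # existing network_ids 2 and 5 -> next nid is 6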
def create_private(group_name, pod_name, appname):
    """ncore: number of cores required per container; may be fractional, e.g. 1.5"""
    data = request.get_json()
    vstr = data['version']
    group, pod, application, version = validate_instance(group_name, pod_name,
            appname, vstr)

    # TODO check if group has this pod
    core_require = int(float(data['ncore']) * pod.core_share)  # how many core slices one container needs
    ncore = core_require / pod.core_share
    nshare = core_require % pod.core_share

    ports = data.get('ports', [])
    args = data.get('args', [])
    ncontainer = int(data['ncontainer'])
    networks = Network.get_multi(data.get('networks', []))
    spec_ips = data.get('spec_ips', [])
    entrypoint = data['entrypoint']
    appconfig = version.appconfig
    strategy = data.get('strategy', 'average')

    # a specific host; if not given, let the scheduler pick hosts
    hostname = data.get('hostname', '')
    host = hostname and Host.get_by_name(hostname) or None
    if host and not (host.group_id == group.id and host.pod_id == pod.id):
        current_app.logger.error('Host must belong to pod/group (hostname=%s, pod=%s, group=%s)',
                host, pod_name, group_name)
        raise EruAbortException(consts.HTTP_BAD_REQUEST,
                'Host must belong to this pod and group')

    if entrypoint not in appconfig.entrypoints:
        current_app.logger.error('Entrypoint not in app.yaml (entry=%s, name=%s, version=%s)',
                entrypoint, appname, version.short_sha)
        raise EruAbortException(consts.HTTP_BAD_REQUEST,
                'Entrypoint %s not in app.yaml' % entrypoint)

    route = appconfig.entrypoints[entrypoint].get('network_route', '')

    ts, keys = [], []
    with rds.lock('%s:%s' % (group_name, pod_name)):
        if strategy == 'average':
            host_cores = average_schedule(group, pod, ncontainer, ncore, nshare, spec_host=host)
        elif strategy == 'centralized':
            host_cores = centralized_schedule(group, pod, ncontainer, ncore, nshare, spec_host=host)
        else:
            raise EruAbortException(consts.HTTP_BAD_REQUEST, 'strategy %s not supported' % strategy)

        if not host_cores:
            current_app.logger.error('Not enough cores (name=%s, version=%s, ncore=%s)',
                    appname, version.short_sha, data['ncore'])
            raise EruAbortException(consts.HTTP_BAD_REQUEST, 'Not enough core resources')

        for (host, container_count), cores in host_cores.iteritems():
            t = _create_task(
                version,
                host,
                container_count,
                cores,
                nshare,
                networks,
                ports,
                args,
                spec_ips,
                route,
                data['entrypoint'],
                data['env'],
                image=data.get('image', ''),
            )
            if not t:
                continue

            host.occupy_cores(cores, nshare)
            ts.append(t.id)
            keys.append(t.result_key)

    return {'r': 0, 'msg': 'ok', 'tasks': ts, 'watch_keys': keys}
def list_networks():
    return Network.list_networks()
def get_network_by_name(network_name):
    n = Network.get_by_name(network_name)
    if not n:
        raise EruAbortException(consts.HTTP_NOT_FOUND, 'Network %s not found' % network_name)
    return n
def get_network(network_id):
    n = Network.get(network_id)
    if not n:
        raise EruAbortException(consts.HTTP_NOT_FOUND, 'Network %s not found' % network_id)
    return n
def create_private(group_name, pod_name, appname):
    """ncore: number of cores required per container; may be fractional, e.g. 1.5"""
    data = request.get_json()
    if data.get("raw", ""):
        vstr = consts.RAW_VERSION_PLACEHOLDER
    else:
        vstr = data["version"]

    group, pod, application, version = validate_instance(group_name, pod_name, appname, vstr)

    # TODO check if group has this pod
    core_require = int(float(data["ncore"]) * pod.core_share)  # how many core slices one container needs
    ncore = core_require / pod.core_share
    nshare = core_require % pod.core_share

    ncontainer = int(data["ncontainer"])
    networks = Network.get_multi(data.get("networks", []))
    spec_ips = data.get("spec_ips", [])
    appconfig = version.appconfig

    # a specific host; if not given, let the scheduler pick hosts
    hostname = data.get("hostname", "")
    host = hostname and Host.get_by_name(hostname) or None
    if host and not (host.group_id == group.id and host.pod_id == pod.id):
        current_app.logger.error(
            "Host must belong to pod/group (hostname=%s, pod=%s, group=%s)",
            host, pod_name, group_name
        )
        raise EruAbortException(consts.HTTP_BAD_REQUEST, "Host must belong to this pod and group")

    if data["entrypoint"] not in appconfig.entrypoints:
        current_app.logger.error(
            "Entrypoint not in app.yaml (entry=%s, name=%s, version=%s)",
            data["entrypoint"], appname, version.short_sha
        )
        raise EruAbortException(consts.HTTP_BAD_REQUEST, "Entrypoint %s not in app.yaml" % data["entrypoint"])

    ts, keys = [], []
    with rds.lock("%s:%s" % (group_name, pod_name)):
        host_cores = group.get_free_cores(pod, ncontainer, ncore, nshare, spec_host=host)
        if not host_cores:
            current_app.logger.error(
                "Not enough cores (name=%s, version=%s, ncore=%s)",
                appname, version.short_sha, data["ncore"]
            )
            raise EruAbortException(consts.HTTP_BAD_REQUEST, "Not enough core resources")

        for (host, container_count), cores in host_cores.iteritems():
            t = _create_task(
                consts.TASK_CREATE,
                version,
                host,
                container_count,
                cores,
                nshare,
                networks,
                spec_ips,
                data["entrypoint"],
                data["env"],
                image=data.get("image", ""),
            )
            if not t:
                continue

            host.occupy_cores(cores, nshare)
            ts.append(t.id)
            keys.append(t.result_key)

    return {"r": 0, "msg": "ok", "tasks": ts, "watch_keys": keys}
def create_containers_with_macvlan(task_id, ncontainer, nshare, cores, network_ids, spec_ips=None):
    """
    Run the task identified by task_id: deploy ncontainer containers, occupy the
    given cores, and bind them to the given networks.
    """
    # TODO support part core
    current_flask.logger.info("Task<id=%s>: Started", task_id)

    task = Task.get(task_id)
    if not task:
        current_flask.logger.error("Task (id=%s) not found, quit", task_id)
        return

    if spec_ips is None:
        spec_ips = []

    networks = Network.get_multi(network_ids)

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props["entrypoint"]
    env = task.props["env"]
    # use raw
    image = task.props["image"]
    cpu_shares = int(float(nshare) / host.pod.core_share * 1024) if nshare else 1024
    pub_agent_vlan_key = "eru:agent:%s:vlan" % host.name
    feedback_key = "eru:agent:%s:feedback" % task_id
    cids = []

    full_cores, part_cores = cores.get("full", []), cores.get("part", [])
    for fcores, pcores in izip_longest(
        chunked(full_cores, len(full_cores) / ncontainer),
        chunked(part_cores, len(part_cores) / ncontainer),
        fillvalue=[],
    ):
        cores_for_one_container = {"full": fcores, "part": pcores}
        try:
            cid, cname = dockerjob.create_one_container(
                host, version, entrypoint, env, fcores + pcores, cpu_shares, image=image
            )
        except:
            host.release_cores(cores_for_one_container, nshare)
            continue

        if spec_ips:
            ips = [n.acquire_specific_ip(ip) for n, ip in zip(networks, spec_ips)]
        else:
            ips = [n.acquire_ip() for n in networks]
        ips = [i for i in ips if i]
        ip_dict = {ip.vlan_address: ip for ip in ips}

        if ips:
            ident_id = cname.split("_")[-1]
            values = [str(task_id), cid, ident_id] + ["{0}:{1}".format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
            rds.publish(pub_agent_vlan_key, "|".join(values))

        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split("|")
            if succ == "0":
                break

            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)
        else:
            current_flask.logger.info("Creating container (cid=%s, ips=%s)", cid, ips)
            c = Container.create(cid, host, version, cname, entrypoint, cores_for_one_container, env, nshare)
            for ip in ips:
                ip.assigned_to_container(c)

            notifier.notify_agent(cid)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # skip the cleanup below
            continue

        # clean up the failed container, release cores, release IPs
        current_flask.logger.info("Cleaning failed container (cid=%s)", cid)
        dockerjob.remove_container_by_cid([cid], host)
        host.release_cores(cores_for_one_container, nshare)
        [ip.release() for ip in ips]
        # on failure, this key must be cleaned up as well
        rds.delete(feedback_key)

    publish_to_service_discovery(version.name)
    task.finish_with_result(consts.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
    current_flask.logger.info("Task<id=%s>: Done", task_id)
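# Sketch of the cpu_shares weighting used above, assuming core_share = 10 (an
# illustrative value): 5 of 10 fractional slices maps to half the default docker
# weight of 1024, while a container with no fractional share keeps the full 1024.
nshare, core_share = 5, 10
assert (int(float(nshare) / core_share * 1024) if nshare else 1024) == 512
nshare = 0
assert (int(float(nshare) / core_share * 1024) if nshare else 1024) == 1024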
def create_containers_with_macvlan(task_id, ncontainer, core_ids, network_ids):
    """
    Run the task identified by task_id: deploy ncontainer containers, occupy the
    cores in core_ids, and bind them to the given networks.
    """
    task = Task.get(task_id)
    if not task:
        return

    networks = Network.get_multi(network_ids)

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    used_cores = Core.get_multi(core_ids)
    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id
    cids = []

    for cores in more_itertools.chunked(used_cores, len(core_ids)/ncontainer):
        try:
            cid, cname = dockerjob.create_one_container(host, version, entrypoint, env, cores)
        except:
            host.release_cores(cores)
            continue

        ips = [n.acquire_ip() for n in networks]
        ip_dict = {ip.vlan_address: ip for ip in ips}

        if ips:
            ident_id = cname.split('_')[-1]
            values = [str(task_id), cid, ident_id] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
            rds.publish(pub_agent_vlan_key, '|'.join(values))

        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break

            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)
        else:
            logger.info('Creating container with cid %s and ips %s' % (cid, ips))
            c = Container.create(cid, host, version, cname, entrypoint, cores, env)
            for ip in ips:
                ip.assigned_to_container(c)

            notifier.notify_agent(cid)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # skip the cleanup below
            continue

        # clean up the failed container, release cores, release IPs
        logger.info('Cleaning failed container with cid %s' % cid)
        dockerjob.remove_container_by_cid([cid], host)
        host.release_cores(cores)
        [ip.release() for ip in ips]

    publish_to_service_discovery(version.name)
    task.finish_with_result(code.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
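# Sketch of how chunked() splits the allocated cores above: with 8 cores and 4
# containers, each container receives len(core_ids)/ncontainer = 2 cores
# (illustrative numbers only).
from more_itertools import chunked

core_ids, ncontainer = range(8), 4
assert list(chunked(core_ids, len(core_ids) // ncontainer)) == [[0, 1], [2, 3], [4, 5], [6, 7]]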
def create_containers_with_macvlan_public(task_id, ncontainer, nshare, network_ids, spec_ips=None):
    """
    Run the task identified by task_id: deploy ncontainer containers and bind
    them to the given networks.
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)

    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return

    if spec_ips is None:
        spec_ips = []

    networks = Network.get_multi(network_ids)

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    # use raw
    image = task.props['image']
    cpu_shares = 1024
    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id
    cids = []

    for _ in range(ncontainer):
        try:
            cid, cname = dockerjob.create_one_container(host, version, entrypoint,
                    env, cores=None, cpu_shares=cpu_shares, image=image)
        except Exception as e:
            print e
            # same as above
            continue

        if spec_ips:
            ips = [n.acquire_specific_ip(ip) for n, ip in zip(networks, spec_ips)]
        else:
            ips = [n.acquire_ip() for n in networks]
        ips = [i for i in ips if i]
        ip_dict = {ip.vlan_address: ip for ip in ips}

        if ips:
            ident_id = cname.split('_')[-1]
            values = [str(task_id), cid, ident_id] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
            rds.publish(pub_agent_vlan_key, '|'.join(values))

        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break

            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)
        else:
            current_flask.logger.info('Creating container (cid=%s, ips=%s)', cid, ips)
            c = Container.create(cid, host, version, cname, entrypoint, {}, env, nshare)
            for ip in ips:
                ip.assigned_to_container(c)

            notifier.notify_agent(cid)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # skip the cleanup below
            continue

        # clean up the failed container, release cores, release IPs
        current_flask.logger.info('Cleaning failed container (cid=%s)', cid)
        dockerjob.remove_container_by_cid([cid], host)
        [ip.release() for ip in ips]
        # on failure, this key must be cleaned up as well
        rds.delete(feedback_key)

    publish_to_service_discovery(version.name)
    task.finish_with_result(consts.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()
    current_flask.logger.info('Task<id=%s>: Done', task_id)
def create_containers_with_macvlan(task_id, ncontainer, nshare, cores, network_ids, spec_ips=None):
    """
    Run the task identified by task_id: deploy ncontainer containers, occupy the
    given cores, and bind them to the given networks.
    """
    current_flask.logger.info('Task<id=%s>: Started', task_id)

    task = Task.get(task_id)
    if not task:
        current_flask.logger.error('Task (id=%s) not found, quit', task_id)
        return

    if spec_ips is None:
        spec_ips = []

    need_network = bool(network_ids)
    networks = Network.get_multi(network_ids)

    notifier = TaskNotifier(task)
    host = task.host
    version = task.version
    entrypoint = task.props['entrypoint']
    env = task.props['env']
    ports = task.props['ports']
    args = task.props['args']
    # use raw
    route = task.props['route']
    image = task.props['image']
    callback_url = task.props['callback_url']
    cpu_shares = int(float(nshare) / host.pod.core_share * 1024) if nshare else 1024
    pub_agent_vlan_key = 'eru:agent:%s:vlan' % host.name
    pub_agent_route_key = 'eru:agent:%s:route' % host.name
    feedback_key = 'eru:agent:%s:feedback' % task_id
    cids = []

    for fcores, pcores in _iter_cores(cores, ncontainer):
        cores_for_one_container = {'full': fcores, 'part': pcores}
        try:
            cid, cname = dockerjob.create_one_container(host, version, entrypoint,
                    env, fcores + pcores, ports=ports, args=args,
                    cpu_shares=cpu_shares, image=image, need_network=need_network)
        except Exception as e:
            # print so it shows up in the celery log
            print e
            host.release_cores(cores_for_one_container, nshare)
            continue

        if spec_ips:
            ips = [n.acquire_specific_ip(ip) for n, ip in zip(networks, spec_ips)]
        else:
            ips = [n.acquire_ip() for n in networks]
        ips = [i for i in ips if i]
        ip_dict = {ip.vlan_address: ip for ip in ips}

        if ips:
            if ERU_AGENT_API == 'pubsub':
                values = [str(task_id), cid] + ['{0}:{1}'.format(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
                rds.publish(pub_agent_vlan_key, '|'.join(values))
            elif ERU_AGENT_API == 'http':
                agent = get_agent(host)
                ip_list = [(ip.vlan_seq_id, ip.vlan_address) for ip in ips]
                agent.add_container_vlan(cid, str(task_id), ip_list)

        for _ in ips:
            # timeout 15s
            rv = rds.blpop(feedback_key, 15)
            if rv is None:
                break
            # rv is like (feedback_key, 'succ|container_id|vethname|ip')
            succ, _, vethname, vlan_address = rv[1].split('|')
            if succ == '0':
                break

            ip = ip_dict.get(vlan_address, None)
            if ip:
                ip.set_vethname(vethname)

            if route:
                rds.publish(pub_agent_route_key, '%s|%s' % (cid, route))
        else:
            current_flask.logger.info('Creating container (cid=%s, ips=%s)', cid, ips)
            c = Container.create(cid, host, version, cname, entrypoint,
                    cores_for_one_container, env, nshare, callback_url)
            for ip in ips:
                ip.assigned_to_container(c)

            notifier.notify_agent(c)
            add_container_for_agent(c)
            add_container_backends(c)
            cids.append(cid)
            # skip the cleanup below
            continue

        # clean up the failed container, release cores, release IPs
        current_flask.logger.info('Cleaning failed container (cid=%s)', cid)
        dockerjob.remove_container_by_cid([cid], host)
        host.release_cores(cores_for_one_container, nshare)
        [ip.release() for ip in ips]
        # on failure, this key must be cleaned up as well
        rds.delete(feedback_key)

    publish_to_service_discovery(version.name)
    task.finish_with_result(consts.TASK_SUCCESS, container_ids=cids)
    notifier.pub_success()

    # these involve I/O, so leave them to the very end
    falcon_all_graphs(version)
    falcon_all_alarms(version)
    current_flask.logger.info('Task<id=%s>: Done', task_id)
def create_private(group_name, pod_name, appname):
    data = request.get_json()
    vstr = data['version']
    group, pod, _, version = validate_instance(group_name, pod_name, appname, vstr)

    # TODO check if group has this pod
    ncore, nshare = pod.get_core_allocation(float(data['ncore']))
    ports = data.get('ports', [])
    args = data.get('args', [])
    strategy = data.get('strategy', 'average')

    callback_url = data.get('callback_url', '')
    if callback_url and not is_strict_url(callback_url):
        abort(400, 'callback_url must start with http:// or https://')

    ncontainer = int(data['ncontainer'])
    if not ncontainer:
        abort(400, 'ncontainer must be > 0')

    networks = Network.get_multi(data.get('networks', []))
    spec_ips = data.get('spec_ips', [])
    appconfig = version.appconfig

    entrypoint = data['entrypoint']
    if entrypoint not in appconfig.entrypoints:
        abort(400, 'Entrypoint %s not in app.yaml' % entrypoint)

    hostname = data.get('hostname', '')
    host = hostname and Host.get_by_name(hostname) or None
    if host and not (host.group_id == group.id and host.pod_id == pod.id):
        abort(400, 'Host must belong to this pod and group')

    ts, keys = [], []
    with rds.lock('%s:%s' % (group_name, pod_name)):
        host_cores = _get_strategy(strategy)(group, pod, ncontainer, ncore, nshare, host)
        if not host_cores:
            abort(400, 'Not enough core resources')

        for (host, container_count), cores in host_cores.iteritems():
            t = _create_task(
                version,
                host,
                container_count,
                cores,
                nshare,
                networks,
                ports,
                args,
                spec_ips,
                entrypoint,
                data['env'],
                image=data.get('image', ''),
                callback_url=callback_url,
            )
            if not t:
                continue

            host.occupy_cores(cores, nshare)
            ts.append(t.id)
            keys.append(t.result_key)

    return {'r': 0, 'msg': 'ok', 'tasks': ts, 'watch_keys': keys}