def create_from_template_container(job, parent):
    """Build and start a host-networked 0-disk container on *parent*'s node.

    The container name is derived from the calling service and the parent
    node. Returns the started Container SAL object.
    """
    from zeroos.orchestrator.configuration import get_configuration
    from zeroos.orchestrator.sal.Container import Container
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    job.context['token'] = get_jwt_token(job.service.aysrepo)

    target_node = Node.from_ays(parent, job.context['token'])
    repo_config = get_configuration(job.service.aysrepo)
    flist = repo_config.get(
        '0-disk-flist',
        'https://hub.gig.tech/gig-official-apps/0-disk-master.flist')
    print("Creating container for flist: %s" % flist)

    vdisk_container = Container(
        name='vdisk_{}_{}'.format(job.service.name, parent.name),
        flist=flist,
        host_network=True,
        node=target_node)
    vdisk_container.start()
    return vdisk_container
def create_zerodisk_container_service(job, parent, service_kind):
    """Create and start a fresh 0-disk container service for a vm.

    The instance name embeds a microsecond timestamp so repeated calls never
    collide. The created service is re-parented onto *parent* (the node the
    vm runs on). Returns the container service.
    """
    from zeroos.orchestrator.configuration import get_configuration
    from zeroos.orchestrator.configuration import get_jwt_token
    import time

    job.context['token'] = get_jwt_token(job.service.aysrepo)
    service = job.service
    config = get_configuration(service.aysrepo)
    actor = service.aysrepo.actorGet("container")

    create_args = {
        'node': parent.name,
        'flist': config.get(
            '0-disk-flist',
            'https://hub.gig.tech/gig-official-apps/0-disk-master.flist'),
        'hostNetworking': True,
    }
    job.logger.info("create zerodisk container from %s", create_args['flist'])

    instance = 'vdisks_{}_{}_{}_{}'.format(
        service.name, parent.name, service_kind, int(time.time() * 1000000))
    container_service = actor.serviceCreate(instance=instance, args=create_args)
    # make sure the container has the right parent, the node where this vm runs.
    container_service.model.changeParent(parent)
    container_service.executeAction('start', context=job.context)
    return container_service
def init(job):
    """Create the stats-collector container service for this service's node
    and register it as a producer (consume).
    """
    from zeroos.orchestrator.configuration import get_configuration

    service = job.service
    config = get_configuration(service.aysrepo)

    container_args = {
        'node': service.model.data.node,
        'flist': config.get(
            '0-statscollector-flist',
            'https://hub.gig.tech/gig-official-apps/0-statscollector-master.flist'
        ),
        'hostname': service.model.data.node,
        'hostNetworking': True,
        # init processes are built from the service model by a sibling helper
        'initProcesses': get_init_processes(service),
    }
    container_actor = service.aysrepo.actorGet('container')
    stats_container = container_actor.serviceCreate(
        instance='{}_stats_collector'.format(service.name), args=container_args)
    service.consume(stats_container)
def create_zerodisk_container(job, parent):
    """
    first check if the vdisks container for this vm exists.
    if not it creates it.
    return the container service
    """
    from zeroos.orchestrator.configuration import get_configuration

    service = job.service
    config = get_configuration(service.aysrepo)
    actor = service.aysrepo.actorGet("container")
    args = {
        'node': parent.name,
        'flist': config.get(
            '0-disk-flist',
            'https://hub.gig.tech/gig-official-apps/0-disk-master.flist'),
        'hostNetworking': True,
    }
    container_name = 'vdisks_{}_{}'.format(service.name, parent.name)
    containerservice = actor.serviceCreate(instance=container_name, args=args)
    # make sure the container has the right parent, the node where this vm runs.
    containerservice.model.changeParent(parent)
    # BUG FIX: the original called `j.tools. async .wrappers.sync(...)`, which is
    # a SyntaxError on Python 3.7+ ('async' is a reserved keyword). The action is
    # now invoked directly, matching create_zerodisk_container_service.
    containerservice.executeAction('start', context=job.context)
    return containerservice
def create_server(node, disk, baseport, tcp, variant='data'):
    """Provision one storage-engine server on *node* backed by *disk*.

    Creates, in order: a single-profile storagepool, a filesystem on it, a
    container mounting that filesystem at /mnt/data, and the storage-engine
    service bound to node.storageAddr:baseport (consuming *tcp*).

    NOTE(review): relies on the enclosing scope for `service`, `spactor`,
    `fsactor`, `containeractor`, `storageEngineActor`, `job` and the
    `filesystems` / `storageEngines` accumulator lists.
    """
    storagepoolname = 'cluster_{}_{}_{}'.format(node.name, service.name, disk.name)
    pool_args = {
        'node': node.name,
        'metadataProfile': 'single',
        'dataProfile': 'single',
        'devices': [{'device': disk.devicename}],
    }
    spservice = spactor.serviceCreate(instance=storagepoolname, args=pool_args)
    service.consume(spservice)

    containername = '{}_{}_{}'.format(storagepoolname, variant, baseport)

    # adding filesystem
    fs_args = {
        'storagePool': storagepoolname,
        'name': containername,
    }
    filesystems.append(fsactor.serviceCreate(instance=containername, args=fs_args))

    config = get_configuration(job.service.aysrepo)

    # create containers
    cont_args = {
        'node': node.name,
        'hostname': containername,
        'flist': config.get(
            'storage-engine-flist',
            'https://hub.gig.tech/gig-official-apps/ardb-rocksdb.flist'),
        'mounts': [{'filesystem': containername, 'target': '/mnt/data'}],
        'hostNetworking': True,
    }
    containeractor.serviceCreate(instance=containername, args=cont_args)

    # create storageEngines
    engine_args = {
        'homeDir': '/mnt/data',
        'bind': '{}:{}'.format(node.storageAddr, baseport),
        'container': containername,
    }
    storageEngine = storageEngineActor.serviceCreate(
        instance=containername, args=engine_args)
    storageEngine.consume(tcp)
    storageEngines.append(storageEngine)
def monitor(job):
    """Monitor a node: ping it, update its status, (re)install if needed,
    kick statsdb/stats-collector, and run the open-file-descriptors
    healthcheck.
    """
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token, get_configuration
    import redis

    service = job.service
    config = get_configuration(service.aysrepo)
    token = get_jwt_token(job.service.aysrepo)

    if service.model.actionsState['install'] != 'ok':
        return

    # BUG FIX: `node` was unbound when Node.from_ays itself raised, which made
    # the healthcheck block below fail with NameError; initialize it first.
    node = None
    try:
        node = Node.from_ays(service, token, timeout=15)
        node.client.testConnectionAttempts = 0
        state = node.client.ping()
    except RuntimeError:
        state = False
    except redis.ConnectionError:
        state = False

    if state:
        service.model.data.status = 'running'
        configured = isConfigured(node, service.name)
        if not configured:
            # NOTE: rebinds `job` to the install job, as the original did.
            job = service.getJob('install', args={})
            # BUG FIX: was `j.tools.async.wrappers.sync(job.execute())` — a
            # SyntaxError on Python 3.7+ ('async' is reserved); execute directly.
            job.execute()
        job.context['token'] = token
        stats_collector_service = get_stats_collector(service)
        statsdb_service = get_statsdb(service)
        # Check if statsdb is installed on this node and start it if needed
        if (statsdb_service and str(statsdb_service.parent) == str(job.service)
                and statsdb_service.model.data.status != 'running'):
            statsdb_service.executeAction('start', context=job.context)
        # Check if there is a running statsdb and if so make sure
        # stats_collector for this node is started.
        # BUG FIX: guard statsdb_service against None before reading its status.
        if (stats_collector_service
                and stats_collector_service.model.data.status != 'running'
                and statsdb_service
                and statsdb_service.model.data.status == 'running'):
            stats_collector_service.executeAction('start', context=job.context)
    else:
        service.model.data.status = 'halted'

    # best-effort healthcheck; only possible when we reached the node
    if node is not None:
        flist = config.get('healthcheck-flist', 'https://hub.gig.tech/deboeckj/js9container.flist')
        with node.healthcheck.with_container(flist) as cont:
            update_healthcheck(service, node.healthcheck.run(cont, 'openfiledescriptors'))
    service.saveAll()
def init(job):
    """Create an influxdb container service on this service's node and
    consume it.
    """
    from zeroos.orchestrator.configuration import get_configuration

    service = job.service
    config = get_configuration(service.aysrepo)

    influx_args = {
        'node': service.model.data.node,
        'flist': config.get(
            'influxdb-flist',
            'https://hub.gig.tech/gig-official-apps/influxdb.flist'),
        'hostNetworking': True,
    }
    actor = service.aysrepo.actorGet('container')
    influx_container = actor.serviceCreate(
        instance='{}_influxdb'.format(service.name), args=influx_args)
    service.consume(influx_container)
def init(job):
    """Create the gateway container plus its firewall/http/dhcp/cloudinit
    companion services, all named after this service.
    """
    from zeroos.orchestrator.configuration import get_configuration

    service = job.service
    containeractor = service.aysrepo.actorGet("container")

    nics = service.model.data.to_dict()['nics']  # get dict version of nics
    zerotier_nics = []
    for nic in nics:
        nic.pop('dhcpserver', None)
        bridge = nic.pop('zerotierbridge', None)
        if bridge:
            # expose each zerotier bridge as an extra zerotier nic named
            # after the nic it bridges
            zerotier_nics.append({
                'id': bridge['id'],
                'type': 'zerotier',
                'name': 'z-{}'.format(nic['name']),
                'token': bridge.get('token', ''),
            })
    nics.extend(zerotier_nics)

    config = get_configuration(service.aysrepo)
    cont_args = {
        'node': service.model.data.node,
        'flist': config.get('gw-flist', 'https://hub.gig.tech/gig-official-apps/zero-os-gw-master.flist'),
        'nics': nics,
        'hostname': service.model.data.hostname,
        'hostNetworking': False,
        "privileged": True,
    }
    cont_service = containeractor.serviceCreate(instance=service.name, args=cont_args)
    service.consume(cont_service)

    # one helper service of each kind, all pointing at the gateway container
    helper_args = {'container': service.name}
    for role in ('firewall', 'http', 'dhcp', 'cloudinit'):
        helper_actor = service.aysrepo.actorGet(role)
        helper_actor.serviceCreate(instance=service.name, args=helper_args)
def input(job):
    """Validate that the node at `redisAddr` runs the expected 0-core build.

    Raises RuntimeError when the configured 0-core-version / 0-core-revision
    (if set) do not match what the node reports.
    """
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_configuration, get_jwt_token

    args = job.model.args
    ip = args.get('redisAddr')
    node = Node(ip, args.get('redisPort'), get_jwt_token(job.service.aysrepo))
    config = get_configuration(job.service.aysrepo)

    version = node.client.info.version()
    expected_version = config.get('0-core-version')
    expected_revision = config.get('0-core-revision')

    # an unset expectation always passes; a set one must match exactly
    version_ok = not expected_version or expected_version == version['branch']
    revision_ok = not expected_revision or expected_revision == version['revision']
    if not (version_ok and revision_ok):
        raise RuntimeError("Node with IP {} has a wrong version. Found version {}@{} and expected version {}@{} ".format(
            ip, version['branch'], version['revision'], expected_version, expected_revision))
def configure(job):
    """Deploy an odd-sized etcd cluster across this service's producer nodes.

    For each chosen node: a filesystem on its storagepool, a container running
    the etcd flist, and an etcd service wired to two reserved tcp ports.
    Stores the created etcd service names in service.model.data.etcds.
    """
    import random
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token
    from zeroos.orchestrator.configuration import get_configuration

    service = job.service
    job.context['token'] = get_jwt_token(job.service.aysrepo)
    config = get_configuration(service.aysrepo)

    nodes = set()
    for node_service in service.producers['node']:
        nodes.add(Node.from_ays(node_service, job.context['token']))
    nodes = list(nodes)
    # etcd wants an odd member count for quorum: drop one random node if even
    if len(nodes) % 2 == 0:
        nodes = random.sample(nodes, len(nodes) - 1)

    etcd_actor = service.aysrepo.actorGet("etcd")
    container_actor = service.aysrepo.actorGet("container")
    fsactor = service.aysrepo.actorGet("filesystem")
    etcd_args = {}
    peers = []
    etcds = []
    flist = config.get(
        'etcd-flist',
        'https://hub.gig.tech/gig-official-apps/etcd-release-3.2.flist')
    for node in nodes:
        # two ports per member: [0] client, [1] peer/server
        baseports, tcpservices = get_baseports(job, node, baseport=2379, nrports=2)
        containername = '{}_{}_{}_{}'.format(service.name, 'etcd', node.name, baseports[1])
        args = {
            'storagePool': ensureStoragepool(job, node),
            'name': containername,
        }
        # wipe stale member data if a filesystem with this name existed before
        old_filesystem_service = service.aysrepo.servicesFind(
            name=containername, role='filesystem')
        if old_filesystem_service:
            node.client.filesystem.remove(
                '/mnt/storagepools/%s/filesystems/%s/member' % (args['storagePool'], containername))
        fsactor.serviceCreate(instance=containername, args=args)

        # create container
        data_dir = '/mnt/data'
        args = {
            'node': node.name,
            'flist': flist,
            'mounts': [{
                'filesystem': containername,
                'target': data_dir
            }],
            'hostNetworking': True,
        }
        container_actor.serviceCreate(instance=containername, args=args)

        server_bind = '{}:{}'.format(node.storageAddr, baseports[1])
        client_bind = '{}:{}'.format(node.storageAddr, baseports[0])
        mgmt_client_bind = '{}:{}'.format(node.addr, baseports[0])
        etcd_args[node.name] = {
            "serverBind": server_bind,
            "clientBind": client_bind,
            "container": containername,
            "mgmtClientBind": mgmt_client_bind,
            "tcps": tcpservices,
            "homeDir": data_dir,
        }
        # name clash with an existing etcd service means this is a recovery
        etcdID = "{}_{}_{}".format(service.name, node.name, baseports[1])
        if service.aysrepo.servicesFind(name=etcdID, role='etcd'):
            etcdID = "%s_recovered" % etcdID
        peers.append("{}=http://{}".format(etcdID, server_bind))

    for k, v in etcd_args.items():
        tcps = v.pop("tcps")
        etcdname = "{}_{}_{}".format(service.name, k, tcps[1].model.data.port)
        if service.aysrepo.servicesFind(name=etcdname, role='etcd'):
            etcdname = "%s_recovered" % etcdname
        v["peers"] = peers
        etcd_service = etcd_actor.serviceCreate(instance=etcdname, args=v)
        etcd_service.consume(tcps[0])
        etcd_service.consume(tcps[1])
        etcds.append(etcd_service.name)
        service.consume(etcd_service)
    service.model.data.etcds = etcds
def configure(job):
    """
    this method will be called from the node.zero-os install action.

    Installs the ovs (Open vSwitch) container on the node, then creates the
    'backplane' bridge on the first free nic and the 'vxbackend' vlan
    interface, assigning the storage and vxlan addresses.
    """
    import netaddr
    from zeroos.orchestrator.configuration import get_configuration, get_jwt_token
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.sal.Container import Container

    nodeservice = job.service.aysrepo.serviceGet(
        role='node', instance=job.model.args['node_name'])
    job.logger.info("execute network configure on {}".format(nodeservice))
    node = Node.from_ays(nodeservice, get_jwt_token(job.service.aysrepo))
    service = job.service
    network = netaddr.IPNetwork(service.model.data.cidr)
    addresses = node.network.get_addresses(network)

    actor = service.aysrepo.actorGet("container")
    config = get_configuration(service.aysrepo)
    args = {
        'node': node.name,
        'hostname': 'ovs',
        'flist': config.get('ovs-flist', 'https://hub.gig.tech/gig-official-apps/ovs.flist'),
        'hostNetworking': True,
        'privileged': True,
    }
    job.context['token'] = get_jwt_token(job.service.aysrepo)
    cont_service = actor.serviceCreate(instance='{}_ovs'.format(node.name), args=args)
    # BUG FIX: was `j.tools. async .wrappers.sync(...)` — a SyntaxError on
    # Python 3.7+ ('async' is a reserved keyword); call the action directly,
    # as the bonded variant of configure already does.
    cont_service.executeAction('install', context=job.context)
    container_client = Container.from_ays(cont_service, get_jwt_token(
        job.service.aysrepo)).client

    nics = node.client.info.nic()
    nicmap = {nic['name']: nic for nic in nics}
    freenics = node.network.get_free_nics()
    if not freenics:
        raise j.exceptions.RuntimeError("Could not find available nic")
    # freenics = ([1000, ['eth0']], [100, ['eth1']])
    interface = freenics[0][1][0]

    if 'backplane' not in nicmap:
        container_client.json('ovs.bridge-add', {"bridge": "backplane"})
        container_client.json('ovs.port-add', {
            "bridge": "backplane",
            "port": interface,
            "vlan": 0
        })
        node.client.system('ip address add {storageaddr} dev backplane'.format(
            **addresses)).get()
        node.client.system(
            'ip link set dev {} mtu 2000'.format(interface)).get()
        node.client.system('ip link set dev backplane up').get()
    if 'vxbackend' not in nicmap:
        container_client.json(
            'ovs.vlan-ensure', {
                'master': 'backplane',
                'vlan': service.model.data.vlanTag,
                'name': 'vxbackend'
            })
        node.client.system(
            'ip address add {vxaddr} dev vxbackend'.format(**addresses)).get()
        node.client.system('ip link set dev vxbackend mtu 2000').get()
        node.client.system('ip link set dev vxbackend up').get()
def configure(job):
    """
    this method will be called from the node.zero-os install action.
    """
    import netaddr
    import time
    from zeroos.orchestrator.configuration import get_configuration, get_jwt_token
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.sal.Container import Container

    nodeservice = job.service.aysrepo.serviceGet(role='node', instance=job.model.args['node_name'])
    job.logger.info("execute network configure on {}".format(nodeservice))
    node = Node.from_ays(nodeservice, get_jwt_token(job.service.aysrepo))
    service = job.service
    network = netaddr.IPNetwork(service.model.data.cidr)

    if service.model.data.driver:
        # we reload the driver because on some buggy hardware this is required
        node.client.system('modprobe -r {}'.format(service.model.data.driver)).get()
        devs = {link['name'] for link in node.client.ip.link.list()}
        node.client.system('modprobe {}'.format(service.model.data.driver)).get()
        alldevs = {link['name'] for link in node.client.ip.link.list()}
        # links that appeared after reloading belong to this driver
        driverdevs = alldevs - devs
        for link in driverdevs:
            node.client.ip.link.up(link)
        # wait max 10 seconds for these nics to become up (speed available)
        now = time.time()
        while time.time() - 10 < now:
            for nic in node.client.info.nic():
                if nic['speed'] and nic['name'] in driverdevs:
                    driverdevs.remove(nic['name'])
            if not driverdevs:
                break
            time.sleep(1)

    addresses = node.network.get_addresses(network)
    actor = service.aysrepo.actorGet("container")
    config = get_configuration(service.aysrepo)
    args = {
        'node': node.name,
        'hostname': 'ovs',
        'flist': config.get('ovs-flist', 'https://hub.gig.tech/gig-official-apps/ovs.flist'),
        'hostNetworking': True,
        'privileged': True,
    }
    job.context['token'] = get_jwt_token(job.service.aysrepo)
    cont_service = actor.serviceCreate(instance='{}_ovs'.format(node.name), args=args)
    cont_service.executeAction('install', context=job.context)
    container_client = Container.from_ays(cont_service, get_jwt_token(job.service.aysrepo)).client

    nics = node.client.info.nic()
    nicmap = {nic['name']: nic for nic in nics}
    freenics = node.network.get_free_nics()
    if not freenics:
        raise j.exceptions.RuntimeError("Could not find available nic")
    # freenics = ([1000, ['eth0']], [100, ['eth1']])
    # NOTE(review): this loop rebinds `nics` (previously the nic info list) to
    # the first same-speed group containing at least two free nics.
    for speed, nics in freenics:
        if len(nics) >= 2:
            break
    else:
        raise j.exceptions.RuntimeError("Could not find two equal available nics")

    if 'backplane' not in nicmap:
        container_client.json('ovs.bridge-add', {"bridge": "backplane"})
        # bond the two nics with LACP into the backplane bridge
        container_client.json('ovs.bond-add', {"bridge": "backplane",
                                               "port": "bond0",
                                               "links": [nics[0], nics[1]],
                                               "lacp": True,
                                               "mode": "balance-tcp"})
        node.client.system('ip address add {storageaddr} dev backplane'.format(**addresses)).get()
        node.client.system('ip link set dev {} mtu 2000'.format(nics[0])).get()
        node.client.system('ip link set dev {} mtu 2000'.format(nics[1])).get()
        node.client.system('ip link set dev backplane up').get()
    if 'vxbackend' not in nicmap:
        container_client.json('ovs.vlan-ensure', {'master': 'backplane', 'vlan': service.model.data.vlanTag, 'name': 'vxbackend'})
        node.client.system('ip address add {vxaddr} dev vxbackend'.format(**addresses)).get()
        node.client.system('ip link set dev vxbackend mtu 2000').get()
        node.client.system('ip link set dev vxbackend up').get()
def create_server(node, datadisk, metadisk, baseport, tcp):
    """Provision a 0-stor server on *node* with separate data and metadata pools.

    Creates a storagepool + filesystem on *datadisk* (data) and on *metadisk*
    (metadata), a container mounting both, and the zerostor service bound to
    node.storageAddr:baseport (consuming *tcp*).

    NOTE(review): relies on the enclosing scope for `service`, `spactor`,
    `fsactor`, `containeractor`, `zerostorActor`, `job` and the
    `filesystems` / `zerostors` accumulator lists.
    """
    # BUG FIX: the body referenced an undefined name `disk`; the data disk
    # parameter is `datadisk`.
    diskmap = [{'device': datadisk.devicename}]
    args = {
        'node': node.name,
        'metadataProfile': 'single',
        'dataProfile': 'single',
        'devices': diskmap
    }
    storagepoolname = 'cluster_{}_{}_{}'.format(node.name, service.name, datadisk.name)
    spservice = spactor.serviceCreate(instance=storagepoolname, args=args)
    service.consume(spservice)

    containername = '{}_{}'.format(storagepoolname, baseport)

    # adding filesystem
    args = {
        'storagePool': storagepoolname,
        'name': containername,
    }
    filesystems.append(
        fsactor.serviceCreate(instance=containername, args=args))
    config = get_configuration(job.service.aysrepo)

    # only create the metadata storagepool if it does not exist yet
    metastoragepoolname = 'cluster_{}_{}_{}'.format(
        node.name, service.name, metadisk.name)
    if not service.aysrepo.serviceGet(
            role='storagepool', instance=metastoragepoolname, die=False):
        diskmap = [{'device': metadisk.devicename}]
        args = {
            'node': node.name,
            'metadataProfile': 'single',
            'dataProfile': 'single',
            'devices': diskmap
        }
        metaspservice = spactor.serviceCreate(instance=metastoragepoolname, args=args)
        service.consume(metaspservice)

    # the metadata filesystem is always needed: the container mounts it below
    metacontainername = '{}_{}_meta'.format(metastoragepoolname, baseport)
    args = {
        'storagePool': metastoragepoolname,
        'name': metacontainername,
    }
    fs = fsactor.serviceCreate(instance=metacontainername, args=args)
    filesystems.append(fs)
    service.consume(fs)

    # create containers
    args = {
        'node': node.name,
        'hostname': metacontainername,
        'flist': config.get(
            '0-stor-flist',
            'https://hub.gig.tech/gig-official-apps/0-stor-master.flist'),
        'mounts': [{
            'filesystem': containername,
            'target': '/mnt/data'
        }, {
            'filesystem': metacontainername,
            'target': '/mnt/metadata'
        }],
        'hostNetworking': True
    }
    containeractor.serviceCreate(instance=containername, args=args)

    # create zerostor
    args = {
        'dataDir': '/mnt/data',
        'metaDir': '/mnt/metadata',
        'bind': '{}:{}'.format(node.storageAddr, baseport),
        'container': containername
    }
    zerostorService = zerostorActor.serviceCreate(instance=containername, args=args)
    zerostorService.consume(tcp)
    service.consume(zerostorService)
    zerostors.append(zerostorService)
def monitor(job):
    """Periodic node health monitor.

    Pings the node, updates its status, self-heals an unconfigured node,
    starts statsdb / stats-collector when appropriate, and runs the full
    healthcheck suite, recording results on the node's healthcheck service.
    """
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.sal.healthcheck import HealthCheckObject
    from zeroos.orchestrator.configuration import get_jwt_token, get_configuration

    service = job.service
    config = get_configuration(service.aysrepo)
    token = get_jwt_token(job.service.aysrepo)
    job.context['token'] = token

    install_action = service.model.actionsState['install']
    if install_action != 'ok' and install_action != 'error':
        return

    # lazily create the per-node healthcheck service
    healthcheck_service = job.service.aysrepo.serviceGet(
        role='healthcheck', instance='node_%s' % service.name, die=False)
    if healthcheck_service is None:
        healthcheck_actor = service.aysrepo.actorGet('healthcheck')
        healthcheck_service = healthcheck_actor.serviceCreate(
            instance='node_%s' % service.name)
        service.consume(healthcheck_service)

    nodestatus = HealthCheckObject('nodestatus', 'Node Status', 'Node Status', '/nodes/{}'.format(service.name))
    node = Node.from_ays(service, token, timeout=5)
    state = node.is_running()
    if state:
        service.model.data.status = 'running'
        configured = node.is_configured(service.name)
        if not configured:
            service.executeAction('install', context=job.context)
            for consumer in service.getConsumersRecursive():
                consumer.self_heal_action('monitor')
        stats_collector_service = get_stats_collector(service)
        statsdb_service = get_statsdb(service)
        # Check if statsdb is installed on this node and start it if needed
        if (statsdb_service and str(statsdb_service.parent) == str(job.service)
                and statsdb_service.model.data.status != 'running'):
            statsdb_service.executeAction('start', context=job.context)
        # Check if there is a running statsdb and if so make sure
        # stats_collector for this node is started.
        # BUG FIX: guard statsdb_service against None before dereferencing it;
        # previously this raised AttributeError when no statsdb service exists.
        if (stats_collector_service
                and stats_collector_service.model.data.status != 'running'
                and statsdb_service
                and statsdb_service.model.data.status == 'running'):
            stats_collector_service.executeAction('start', context=job.context)

        # healthchecks
        nodestatus.add_message('node', 'OK', 'Node is running')
        update_healthcheck(job, healthcheck_service, node.healthcheck.openfiledescriptors())
        update_healthcheck(job, healthcheck_service, node.healthcheck.cpu_mem())
        update_healthcheck(job, healthcheck_service, node.healthcheck.rotate_logs())
        update_healthcheck(job, healthcheck_service, node.healthcheck.network_bond())
        update_healthcheck(job, healthcheck_service, node.healthcheck.interrupts())
        update_healthcheck(job, healthcheck_service, node.healthcheck.context_switch())
        update_healthcheck(job, healthcheck_service, node.healthcheck.threads())
        update_healthcheck(job, healthcheck_service, node.healthcheck.qemu_vm_logs())
        update_healthcheck(job, healthcheck_service, node.healthcheck.network_load())
        update_healthcheck(job, healthcheck_service, node.healthcheck.disk_usage())
        update_healthcheck(job, healthcheck_service, node.healthcheck.ssh_cleanup(job=job))

        # hardware checks require tools from the healthcheck flist container
        flist = config.get(
            'healthcheck-flist',
            'https://hub.gig.tech/gig-official-apps/healthcheck.flist')
        with node.healthcheck.with_container(flist) as cont:
            update_healthcheck(job, healthcheck_service, node.healthcheck.node_temperature(cont))
            update_healthcheck(job, healthcheck_service, node.healthcheck.powersupply(cont))
            update_healthcheck(job, healthcheck_service, node.healthcheck.fan(cont))

        # check network stability of node with the rest of the nodes ! TODO
    else:
        # don't clobber a deliberate 'rebooting' state
        if service.model.data.status != 'rebooting':
            service.model.data.status = 'halted'
            nodestatus.add_message('node', 'ERROR', 'Node is halted')
    update_healthcheck(job, healthcheck_service, nodestatus.to_dict())
    get_version(job)
    service.saveAll()