def invoke_agent_expect_result(self, host, command, args=None):
    """
    Invoke an agent command and return the payload stored under 'result'.

    The response from self.invoke_agent is expected to be a dictionary that
    contains either a 'result' or an 'error' key.  A response that is None, or
    a dictionary with neither key, is treated as a legacy (pre-wrapper) reply
    and is wrapped with agent_result() before validation.

    :param host: host to invoke the command on; host.fqdn is reported in errors
    :param command: name of the agent command
    :param args: dictionary of command arguments; an empty dict when omitted
    :return: the value stored under the 'result' key of the response
    :raises AgentException: when the response is not a dictionary, carries
        neither 'result' nor 'error', or carries an 'error'
    """
    from chroma_core.services.job_scheduler.agent_rpc import AgentException

    # A literal {} default would be shared (and mutable) across every call.
    if args is None:
        args = {}

    result = self.invoke_agent(host, command, args)

    # This case is to deal with upgrades, once every installation is using the new protocol then we should not allow this.
    # Once everything is 3.0 or later we will also have version information in the wrapper header.
    unwrapped_dict = (isinstance(result, dict)
                      and 'error' not in result
                      and 'result' not in result)

    if result is None or unwrapped_dict:
        job_log.info("Invalid result %s fixed up on called to %s with args %s" % (result, command, args))

        # Legacy fixup for install_packages replies of the form {'scan_packages': data}.  All of the legacy
        # fixups are kept in one place so they can easily be removed.  The membership test is only made on
        # a dictionary: a None reply must not be probed with `in`.
        if unwrapped_dict and command == 'install_packages' and 'scan_packages' in result:
            result = agent_result(result['scan_packages'])
        else:
            result = agent_result(result)

    if not isinstance(result, dict):
        raise AgentException(host.fqdn,
                             command,
                             args,
                             "Expected a dictionary but got a %s when calling %s" % (type(result), command))

    if ('error' not in result) and ('result' not in result):
        raise AgentException(host.fqdn,
                             command,
                             args,
                             "Expected a dictionary with 'error' or 'result' in keys but got %s when calling %s" % (result, command))

    if 'error' in result:
        self.log(result['error'])
        raise AgentException(host.fqdn, command, args, result['error'])

    return result['result']
def start_target(ha_label):
    """
    Start the high availability target.

    A target that is already running is moved to its primary node when it is
    running elsewhere; otherwise the resource is enabled and its start-up is
    awaited.

    Return: Value using simple return protocol
    """
    if not _resource_exists(ha_label):
        return agent_error("Target {} does not exist".format(ha_label))

    running_on = get_resource_location(ha_label)
    primary_node = _find_resource_constraint(ha_label, True)

    if running_on:
        # already started: only act when it is not on the primary node
        if running_on != primary_node:
            console_log.info(
                "Resource %s already started, moving to primary node %s",
                ha_label,
                primary_node,
            )
            move_error = _move_target(ha_label, primary_node)
            if move_error:
                return agent_error(move_error)
            return agent_result(primary_node)
        return agent_result(running_on)

    try:
        _res_set_started(ha_label, True)
        # NOTE(review): the group enable is assumed to be conditional on the
        # ZFS resource existing - confirm against the original indentation.
        if _resource_exists(_zfs_name(ha_label)):
            _res_set_started(_zfs_name(ha_label), True)
            # enable group also, in case group was disabled
            _res_set_started(_group_name(ha_label), True)

        # now wait for it to start
        if _wait_target(ha_label, True):
            found_on = get_resource_location(ha_label)
            if found_on:
                return agent_result(found_on)
            return agent_error(
                "Started {} but now can't locate it!".format(ha_label))

        # try to leave things in a sane state for a failed mount
        _res_set_started(ha_label, False)
        return agent_error("Failed to start target {}".format(ha_label))
    except AgentShell.CommandExecutionError as err:
        details = (err.result.rc, err.command, err.result.stdout,
                   err.result.stderr)
        return agent_error("Error (%s) running '%s': '%s' '%s'" % details)
def get_corosync_autoconfig():
    """
    Automatically detect the configuration for corosync.

    ring0 comes from get_shared_ring(); ring1 is detected on the network
    produced by generate_ring1_network(ring0).

    :return: dictionary containing 'result' or 'error'.
    """

    def describe(ring, dedicated):
        # per-interface entry of the reported configuration
        return {
            "dedicated": dedicated,
            "ipaddr": ring.ipv4_address,
            "prefix": ring.ipv4_prefixlen,
        }

    ring0 = get_shared_ring()
    if not ring0:
        return agent_error("Failed to detect ring0 interface")

    ring1_ipaddr, ring1_prefix = generate_ring1_network(ring0)

    try:
        ring1 = detect_ring1(ring0, ring1_ipaddr, ring1_prefix)
    except RingDetectionError as e:
        return agent_error(e.message)

    interfaces = {
        ring0.name: describe(ring0, False),
        ring1.name: describe(ring1, True),
    }
    return agent_result({
        "interfaces": interfaces,
        "mcast_port": ring1.mcastport,
    })
def get_corosync_autoconfig(self):
    """
    Report a corosync configuration built from self.network_interfaces.

    The first two entries of self.network_interfaces (in iteration order) are
    reported: the first as the non-dedicated interface, the second as the
    dedicated one, both with a prefix of 24.  Interface names are built from
    the entry's 'type' ('tcp' -> 'eth', 'o2ib' -> 'ib') followed by its
    'interface_no'.  The work is done while holding self._lock.

    :return: agent_result() wrapping the 'interfaces' and 'mcast_port' data
    """
    with self._lock:
        prefix_by_type = {'tcp': 'eth', 'o2ib': 'ib'}

        entries = list(self.network_interfaces.items())
        inet4_addresses = [address for address, _ in entries]
        names = [
            '%s%s' % (prefix_by_type[nic['type']], nic['interface_no'])
            for _, nic in entries
        ]

        interfaces = {
            names[0]: {
                'dedicated': False,
                'ipaddr': inet4_addresses[0],
                'prefix': 24
            },
            names[1]: {
                'dedicated': True,
                'ipaddr': inet4_addresses[1],
                'prefix': 24
            }
        }

        return agent_result({
            'interfaces': interfaces,
            'mcast_port': self.state['corosync'].mcast_port
        })
def get_corosync_autoconfig():
    """
    Automatically detect the configuration for corosync.

    ring0 comes from get_ring0(); ring1 is detected on the network produced
    by generate_ring1_network(ring0).

    :return: dictionary containing 'result' or 'error'.
    """
    primary_ring = get_ring0()
    if not primary_ring:
        return agent_error('Failed to detect ring0 interface')

    ring1_ipaddr, ring1_prefix = generate_ring1_network(primary_ring)

    try:
        secondary_ring = detect_ring1(primary_ring, ring1_ipaddr, ring1_prefix)
    except RingDetectionError as e:
        return agent_error(e.message)

    # ring0 is shared, ring1 is dedicated to corosync traffic
    interfaces = {}
    interfaces[primary_ring.name] = {
        'dedicated': False,
        'ipaddr': primary_ring.ipv4_address,
        'prefix': primary_ring.ipv4_prefixlen
    }
    interfaces[secondary_ring.name] = {
        'dedicated': True,
        'ipaddr': secondary_ring.ipv4_address,
        'prefix': secondary_ring.ipv4_prefixlen
    }

    config = {
        'interfaces': interfaces,
        'mcast_port': secondary_ring.mcastport
    }
    return agent_result(config)
def start_target(ha_label):
    '''
    Start the high availability target, retrying when the start fails.

    Each attempt sets the resource's target-role to Started with crm_resource,
    waits via _wait_target, then reads `crm_mon -1` output to decide whether
    the resource came up.  On failure the target-role is set back to Stopped
    before the next attempt.

    Return: Value using simple return protocol
    '''
    # HYD-1989: brute force, keep retrying the start; with the bound below the
    # loop gives up once the fourth attempt has failed
    attempt = 0
    while True:
        attempt += 1

        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Started'
        ])
        if error:
            return agent_error(error)

        # now wait for it to start
        _wait_target(ha_label, True)

        # and make sure it didn't start but (the RA) fail(ed): the start is
        # good when some crm_mon line for this label does not mention FAILED
        _, crm_output, _ = AgentShell.run_old(['crm_mon', '-1'])
        came_up = any(
            row.lstrip().startswith(ha_label) and "FAILED" not in row
            for row in crm_output.split("\n"))

        if came_up:
            location = get_resource_location(ha_label)
            if location:
                return agent_result(location)
            return agent_error("Started %s but now can't locate it!" %
                               ha_label)

        # try to leave things in a sane state for a failed mount
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Stopped'
        ])
        if error:
            return agent_error(error)

        if attempt >= 4:
            return agent_error("Failed to start target %s" % ha_label)

        console_log.info("failed to start target %s" % ha_label)
def install_packages(self, repos, packages):
    """
    Record each requested package in self.state['packages'].

    The value stored for a package is looked up in
    self._simulator.available_packages(self.node_type).  The state is saved
    once all packages have been recorded.  `repos` is not used here.

    :param repos: unused by this implementation
    :param packages: iterable of package names
    :return: agent_result() wrapping self.scan_packages()
    :raises RuntimeError: when a KeyError occurs while recording a package
    """
    for name in packages:
        try:
            entry = self._simulator.available_packages(self.node_type)[name]
            self.state['packages'][name] = entry
        except KeyError:
            available = self._simulator.available_packages(self.node_type)
            raise RuntimeError("Package '%s' not found (available: %s)!" %
                               (name, available))

    self.save()

    report = self.scan_packages()
    return agent_result(report)
def _fake_invoke_agent(self, host, invoke, args=None):
    """
    Stand-in for an agent invocation that replays a prepared outcome.

    The call is recorded in self._invokes_history as an InvokeAgentInvoke,
    the matching entry is fetched with self._get_executable_invoke and its
    executions_remaining counter is decremented.

    :param host: object providing the `fqdn` of the target host
    :param invoke: name of the agent command being invoked
    :param args: dict of arguments, an empty dict when None
    :return: agent_error(...) when the entry has a truthy error,
        agent_result(...) when it has a truthy result, else agent_result_ok
    """
    if args is None:
        args = {}
    assert type(args) is dict, "args list must be dict :%s" % type(args)

    invocation = InvokeAgentInvoke(host.fqdn, invoke, args, None, None)
    self._invokes_history.append(invocation)

    prepared = self._get_executable_invoke(invocation)
    prepared.executions_remaining -= 1

    if prepared.error:
        return agent_error(prepared.error)

    return agent_result(prepared.result) if prepared.result else agent_result_ok
def test_install_packages(self):
    """
    Check agent_updates.install_packages against a scripted dnf session.

    The scripted commands are: a clean, a repoquery for the requirements of
    'foo' and 'bar', an install that additionally names
    'kernel-2.6.32-279.14.1.el6_lustre' (matching the 'kernel = ...' entry in
    the repoquery output), a repoquery for upgrades, an update of the package
    that query reports, and a grubby default-kernel query.
    """
    # NOTE(review): the stdout literals below contain no line breaks in this
    # view of the file; presumably the real fixtures are multi-line - confirm
    # their exact whitespace before relying on them.
    self.add_commands(
        CommandCaptureCommand(('dnf', 'clean', 'all', '--enablerepo=*')),
        CommandCaptureCommand(
            ('dnf', 'repoquery', '--latest-limit', '1', '--requires', '--enablerepo=myrepo', 'foo', 'bar'),
            stdout="""/usr/bin/python python >= 2.4 python(abi) = 2.6 yum >= 3.2.29 /bin/sh kernel = 2.6.32-279.14.1.el6_lustre lustre-backend-fs """),
        CommandCaptureCommand(
            ('dnf', 'install', '--allowerasing', '-y', '--exclude', 'kernel-debug', '--enablerepo=myrepo', 'foo', 'bar', 'kernel-2.6.32-279.14.1.el6_lustre')),
        CommandCaptureCommand((
            'dnf', 'repoquery', '--queryformat=%{name} %{version}-%{release}.%{arch} %{repoid}', '--upgrades', '--disablerepo=*', '--enablerepo=myrepo'),
            stdout=""" jasper-libs.x86_64 1.900.1-16.el6_6.3 myrepo """),
        CommandCaptureCommand(
            ('dnf', 'update', '--allowerasing', '-y', '--exclude', 'kernel-debug', '--enablerepo=myrepo', 'jasper-libs.x86_64')),
        CommandCaptureCommand(
            ('grubby', '--default-kernel'),
            stdout='/boot/vmlinuz-2.6.32-504.3.3.el6.x86_64'))

    # Every os.path.isfile() call made during the test reports True.
    def isfile(arg):
        return True

    with patch('os.path.isfile', side_effect=isfile):
        self.assertEqual(
            agent_updates.install_packages(['myrepo'], ['foo', 'bar']),
            agent_result({}))

    # presumably verifies the scripted commands were all run, in the order
    # registered above - confirm against the test base class
    self.assertRanAllCommandsInOrder()
def install_packages(repos, packages):
    """
    Explicitly evaluate and install or update any specific-version dependencies and satisfy even
    if that involves installing an older package than is already installed.

    Primary use case is installing lustre-modules, which depends on a specific kernel package.

    The caller's `packages` list is not modified: the version-pinned requirements discovered here
    are appended to a private copy.

    :param repos: List of strings, yum repo names
    :param packages: List of strings, yum package names
    :return: package report of the format given by the lustre device plugin, or an
        agent_error() when _check_HYD4050() reports a problem
    """
    if packages:
        # Work on a copy so the requirements appended below do not leak into the caller's list.
        to_install = list(packages)

        yum_util('clean')

        out = yum_util('requires', enablerepo=repos, packages=to_install)
        for requirement in (line.strip() for line in out.strip().split("\n")):
            # Only "<name> = <version>" requirements pin an exact version; names containing
            # ')' or '/' (e.g. "python(abi)" or file paths) are not matched.
            match = re.match(r"([^\)/]*) = (.*)", requirement)
            if match:
                require_package, require_version = match.groups()
                to_install.append("%s-%s" % (require_package, require_version))

        yum_util('install', enablerepo=repos, packages=to_install)

        # So now we have installed the packages requested, we will also make sure that any installed packages we
        # have that are already installed are updated to our presumably better versions.
        update_packages = yum_check_update(repos)

        if update_packages:
            daemon_log.debug(
                "The following packages need update after we installed IML packages %s" % update_packages)
            yum_util('update', packages=update_packages, enablerepo=repos)

        error = _check_HYD4050()

        if error:
            return agent_error(error)

    return agent_result(lustre.scan_packages())
def _call(cls, host, cmd, args):
    """
    Mock implementation of an agent invocation.

    The call is recorded in cls.calls and cls.host_calls, cls._fail() is
    invoked when the mock is configured not to succeed or when (cmd, args) is
    listed in cls.fail_commands, and a canned response selected by `cmd` is
    returned.  Exact command names are matched with `==` / list membership;
    shell-style commands are matched by substring, so the order of the
    branches below matters.

    :param host: host being invoked on; its fqdn, address and id are read
    :param cmd: command name, or a shell command string
    :param args: command arguments (indexed by key in several branches)
    :return: the canned response for `cmd`; None for branches without an
        explicit return value
    """
    cls.calls.append((cmd, args))
    cls.host_calls[host.fqdn].append((cmd, args))

    # presumably cls._fail raises - TODO confirm
    if not cls.succeed:
        cls._fail(host.fqdn)

    if (cmd, args) in cls.fail_commands:
        cls._fail(host.fqdn)

    mock_server = cls.mock_servers[host.address]

    log.info("invoke_agent %s %s %s" % (host, cmd, args))

    # This isn't really accurate because lnet is scanned asynchronously, but it is as close as we can get today
    # Fixme: Also I now think this is writing to the wrong thing and should be changing the mock_server entries.
    # to lnet_up, I guess the mock_server needs an lnet state really, rather than relying on nids present.
    if cmd == "load_lnet":
        synthetic_lnet_configuration(host, mock_server["nids"])
        return
    elif cmd == "device_plugin":
        # Only returns nid info today.
        return create_synthetic_device_info(host, mock_server, args["plugin"])
    elif cmd == "format_target":
        # Report the inode size requested via "-I <n>" in mkfsoptions, if any.
        inode_size = None
        if "mkfsoptions" in args:
            inode_arg = re.search("-I (\d+)", args["mkfsoptions"])
            if inode_arg:
                inode_size = int(inode_arg.group(1).__str__())

        if inode_size is None:
            # A 'foo' value
            inode_size = 777

        return {
            "uuid": uuid.uuid1().__str__(),
            "inode_count": 666,
            "inode_size": inode_size,
            "filesystem_type": "ext4",
        }
    elif cmd == "stop_target":
        ha_label = args["ha_label"]
        # `target` is not used afterwards; only the lookup itself happens here.
        target = ManagedTarget.objects.get(ha_label=ha_label)
        return agent_result_ok
    elif cmd == "start_target":
        ha_label = args["ha_label"]
        target = ManagedTarget.objects.get(ha_label=ha_label)
        return agent_result(target.primary_host.nodename)
    elif cmd == "register_target":
        # Assume mount paths are "/mnt/testfs-OST0001" style
        mount_point = args["mount_point"]
        label = re.search("/mnt/([^\s]+)", mount_point).group(1)
        return {"label": label}
    elif cmd == "detect_scan":
        return mock_server["detect-scan"]
    elif cmd == "install_packages":
        return agent_result([])
    elif cmd == "register_server":
        api_client = TestApiClient()
        # Authentication is replaced for the duration of the registration
        # request and restored in the finally clause.
        old_is_authenticated = CsrfAuthentication.is_authenticated
        try:
            CsrfAuthentication.is_authenticated = mock.Mock(
                return_value=True)
            api_client.client.login(username="******", password="******")
            # NOTE(review): this branch indexes cls.mock_servers by `host`
            # itself, whereas the lookup above uses host.address - confirm.
            fqdn = cls.mock_servers[host]["fqdn"]

            response = api_client.post(
                args["url"] + "register/%s/" % args["secret"],
                data={
                    "address": host,
                    "fqdn": fqdn,
                    "nodename": cls.mock_servers[host]["nodename"],
                    "capabilities": ["manage_targets"],
                    "version": cls.version,
                    "csr": helper.generate_csr(fqdn),
                },
            )
            assert response.status_code == 201
            registration_data = Serializer().deserialize(
                response.content, format=response["Content-Type"])
            print("MockAgent.invoke returning %s" % registration_data)
            return registration_data
        finally:
            CsrfAuthentication.is_authenticated = old_is_authenticated
    elif cmd == "kernel_status":
        return {
            "running": "fake_kernel-0.1",
            "required": "fake_kernel-0.1",
            "available": ["fake_kernel-0.1"]
        }
    elif cmd == "selinux_status":
        return {"status": "Disabled"}
    elif cmd == "reboot_server":
        # No explicit return value: the notification is the only effect.
        now = IMLDateTime.utcnow()
        log.info("rebooting %s; updating boot_time to %s" % (host, now))
        job_scheduler_notify.notify(host, now, {"boot_time": now})
    elif cmd == "which zfs":
        return 1
    # From here on `cmd` is matched by substring.
    elif "import platform;" in cmd:
        return "0"
    elif "socket.gethostbyname(socket.gethostname())" in cmd:
        if not mock_server["tests"]["hostname_valid"]:
            return "127.0.0.1"
        else:
            return mock_server["address"]
    elif "print os.uname()[1]" in cmd:
        return "%s\n%s" % (mock_server["nodename"], mock_server["fqdn"])
    elif "socket.getfqdn()" in cmd:
        return mock_server["fqdn"]
    elif "ping" in cmd:
        # 2 is added when reverse_resolve is false, 1 when reverse_ping is false.
        result = (0 if mock_server["tests"]["reverse_resolve"] else 2) + (0 if mock_server["tests"]["reverse_ping"] else 1)
        return result
    elif "ElectricFence" in cmd:
        return 0 if mock_server["tests"]["yum_can_update"] else 1
    elif "openssl version -a" in cmd:
        return 0 if mock_server["tests"]["openssl"] else 1
    elif "curl -k https" in cmd:
        return json.dumps({"host_id": host.id, "command_id": 0})
    # Commands answered with a bare None.
    elif cmd in [
            "configure_pacemaker",
            "unconfigure_pacemaker",
            "configure_target_store",
            "unconfigure_target_store",
            "deregister_server",
            "restart_agent",
            "shutdown_server",
            "host_corosync_config",
            "check_block_device",
            "set_conf_param",
            "purge_configuration",
    ]:
        return None
    # Commands answered with agent_result_ok.
    elif cmd in [
            "configure_target_ha",
            "unconfigure_target_ha",
            "start_lnet",
            "stop_lnet",
            "unload_lnet",
            "unconfigure_lnet",
            "configure_corosync",
            "unconfigure_corosync",
            "start_corosync",
            "stop_corosync",
            "start_pacemaker",
            "stop_pacemaker",
            "configure_ntp",
            "unconfigure_ntp",
            "import_target",
            "export_target",
            "set_profile",
            "update_profile",
            "failover_target",
            "failback_target",
            "configure_network",
            "open_firewall",
            "close_firewall",
    ]:
        return agent_result_ok
    elif cmd == "get_corosync_autoconfig":
        return agent_result({
            "interfaces": {
                "eth0": {
                    "dedicated": False,
                    "ipaddr": "192.168.0.1",
                    "prefix": 24
                },
                "eth1": {
                    "dedicated": True,
                    "ipaddr": "10.10.0.01",
                    "prefix": 24
                },
            },
            "mcast_port": "666",
        })
    else:
        # Any command without a branch above is rejected.
        assert False, (
            "The %s command is not in the known list for MockAgentRpc. Please add it then when people modify it a simple text search will let them know to change it here as well."
            % cmd)
def _call(cls, host, cmd, args):
    """
    Mock implementation of an agent invocation.

    The call is recorded in cls.calls and cls.host_calls, cls._fail() is
    invoked when the mock is configured not to succeed or when (cmd, args) is
    listed in cls.fail_commands, and a canned response selected by `cmd` is
    returned.  Exact command names are matched with `==` / list membership;
    shell-style commands are matched by substring, so the order of the
    branches below matters.

    :param host: host being invoked on; its fqdn, address and id are read
    :param cmd: command name, or a shell command string
    :param args: command arguments (indexed by key in several branches)
    :return: the canned response for `cmd`; None for branches without an
        explicit return value
    :raises AssertionError: when `cmd` is not a known command
    """
    cls.calls.append((cmd, args))
    cls.host_calls[host].append((cmd, args))

    # presumably cls._fail raises - TODO confirm
    if not cls.succeed:
        cls._fail(host.fqdn)

    if (cmd, args) in cls.fail_commands:
        cls._fail(host.fqdn)

    mock_server = cls.mock_servers[host.address]

    log.info("invoke_agent %s %s %s" % (host, cmd, args))

    # This isn't really accurate because lnet is scanned asynchronously, but it is as close as we can get today
    # Fixme: Also I now think this is writing to the wrong thing and should be changing the mock_server entries.
    # to lnet_up, I guess the mock_server needs an lnet state really, rather than relying on nids present.
    if cmd == "load_lnet":
        synthetic_lnet_configuration(host, mock_server['nids'])
        return
    elif cmd == "device_plugin":
        # Only returns nid info today.
        return create_synthetic_device_info(host, mock_server, args['plugin'])
    elif cmd == 'format_target':
        # Report the inode size requested via "-I <n>" in mkfsoptions, if any.
        inode_size = None
        if 'mkfsoptions' in args:
            inode_arg = re.search(r"-I (\d+)", args['mkfsoptions'])
            if inode_arg:
                inode_size = int(inode_arg.group(1).__str__())

        if inode_size is None:
            # A 'foo' value
            inode_size = 777

        return {
            'uuid': uuid.uuid1().__str__(),
            'inode_count': 666,
            'inode_size': inode_size,
            'filesystem_type': 'ext4'
        }
    elif cmd == 'stop_target':
        ha_label = args['ha_label']
        # The looked-up target is not needed; the lookup itself is kept.
        ManagedTarget.objects.get(ha_label=ha_label)
        return agent_result_ok
    elif cmd == 'start_target':
        ha_label = args['ha_label']
        target = ManagedTarget.objects.get(ha_label=ha_label)
        return agent_result(target.primary_host.nodename)
    elif cmd == 'register_target':
        # Assume mount paths are "/mnt/testfs-OST0001" style
        mount_point = args['mount_point']
        label = re.search(r"/mnt/([^\s]+)", mount_point).group(1)
        return {'label': label}
    elif cmd == 'detect_scan':
        return mock_server['detect-scan']
    elif cmd == 'install_packages':
        return agent_result([])
    elif cmd == 'register_server':
        api_client = TestApiClient()
        # Authentication is replaced for the duration of the registration
        # request and restored in the finally clause.
        old_is_authenticated = CsrfAuthentication.is_authenticated
        try:
            CsrfAuthentication.is_authenticated = mock.Mock(
                return_value=True)
            api_client.client.login(username='******', password='******')
            # NOTE(review): this branch indexes cls.mock_servers by `host`
            # itself, whereas the lookup above uses host.address - confirm.
            fqdn = cls.mock_servers[host]['fqdn']

            response = api_client.post(
                args['url'] + "register/%s/" % args['secret'],
                data={
                    'address': host,
                    'fqdn': fqdn,
                    'nodename': cls.mock_servers[host]['nodename'],
                    'capabilities': ['manage_targets'],
                    'version': cls.version,
                    'csr': helper.generate_csr(fqdn)
                })
            assert response.status_code == 201
            registration_data = Serializer().deserialize(
                response.content, format=response['Content-Type'])
            # Parenthesised single argument: prints the same text whether
            # print is a statement or a function.
            print("MockAgent.invoke returning %s" % registration_data)
            return registration_data
        finally:
            CsrfAuthentication.is_authenticated = old_is_authenticated
    elif cmd == 'kernel_status':
        return {
            'running': 'fake_kernel-0.1',
            'required': 'fake_kernel-0.1',
            'available': ['fake_kernel-0.1']
        }
    elif cmd == 'reboot_server':
        # No explicit return value: the notification is the only effect.
        now = IMLDateTime.utcnow()
        log.info("rebooting %s; updating boot_time to %s" % (host, now))
        job_scheduler_notify.notify(host, now, {'boot_time': now})
    # From here on `cmd` is matched by substring.
    elif 'socket.gethostbyname(socket.gethostname())' in cmd:
        if not mock_server['tests']['hostname_valid']:
            return '127.0.0.1'
        else:
            return mock_server['address']
    elif 'print os.uname()[1]' in cmd:
        return '%s\n%s' % (mock_server['nodename'], mock_server['fqdn'])
    elif 'socket.getfqdn()' in cmd:
        return mock_server['fqdn']
    elif 'ping' in cmd:
        # 2 is added when reverse_resolve is false, 1 when reverse_ping is false.
        result = ((0 if mock_server['tests']['reverse_resolve'] else 2) +
                  (0 if mock_server['tests']['reverse_ping'] else 1))
        return result
    elif 'python-fedora-django' in cmd:
        return 0 if mock_server['tests']['yum_valid_repos'] else 1
    elif 'ElectricFence' in cmd:
        return 0 if mock_server['tests']['yum_can_update'] else 1
    elif 'curl -k https' in cmd:
        return json.dumps({'host_id': host.id, 'command_id': 0})
    # Commands answered with a bare None.
    elif cmd in [
            'configure_pacemaker',
            'unconfigure_pacemaker',
            'configure_target_store',
            'unconfigure_target_store',
            'deregister_server',
            'restart_agent',
            'shutdown_server',
            'host_corosync_config',
            'check_block_device',
            'set_conf_param',
            'purge_configuration'
    ]:
        return None
    # Commands answered with agent_result_ok.  Every entry needs its own
    # trailing comma: adjacent string literals are silently concatenated,
    # which previously hid 'set_profile' inside 'export_targetset_profile'.
    elif cmd in [
            'configure_target_ha',
            'unconfigure_target_ha',
            'start_lnet',
            'stop_lnet',
            'unload_lnet',
            'unconfigure_lnet',
            'configure_corosync',
            'unconfigure_corosync',
            'start_corosync',
            'stop_corosync',
            'start_pacemaker',
            'stop_pacemaker',
            'configure_ntp',
            'unconfigure_ntp',
            'import_target',
            'export_target',
            'set_profile',
            'update_profile',
            'failover_target',
            'failback_target',
            'configure_network',
            'open_firewall',
            'close_firewall',
    ]:
        return agent_result_ok
    elif cmd == 'get_corosync_autoconfig':
        return agent_result({
            'interfaces': {
                'eth0': {
                    'dedicated': False,
                    'ipaddr': '192.168.0.1',
                    'prefix': 24
                },
                'eth1': {
                    'dedicated': True,
                    'ipaddr': '10.10.0.01',
                    'prefix': 24
                }
            },
            'mcast_port': '666'
        })
    else:
        # Raised explicitly so unknown commands are still rejected when
        # assertions are disabled (python -O).
        raise AssertionError(
            "The %s command is not in the known list for MockAgentRpc. Please add it then when people modify it a simple text search will let them know to change it here as well."
            % cmd)
def start_target(ha_label):
    '''
    Start the high availability target, retrying when the start fails.

    A target that is already running is moved to its primary node when it is
    running elsewhere.  Otherwise each attempt enables the resource (plus its
    ZFS and group resources when they exist) with pcs, then waits for it via
    _wait_target; a failed attempt disables the resource again before the
    next try.

    Return: Value using simple return protocol
    '''
    if not _resource_exists(ha_label):
        return agent_error("Target {} does not exist".format(ha_label))

    running_on = get_resource_location(ha_label)
    primary_node = _find_resource_constraint(ha_label, True)

    if running_on:
        # already started: only act when it is not on the primary node
        if running_on != primary_node:
            console_log.info(
                "Resource %s already started, moving to primary node %s",
                ha_label, primary_node)
            error = _move_target(ha_label, primary_node)
            if error:
                return agent_error(error)
            return agent_result(primary_node)
        return agent_result(running_on)

    # HYD-1989: brute force, keep retrying the start; with the bound below the
    # loop gives up once the fourth attempt has failed
    attempt = 0
    while True:
        attempt += 1

        error = AgentShell.run_canned_error_message(
            ['pcs', 'resource', 'enable', ha_label])
        if error:
            return agent_error(error)

        # the ZFS resource, then the group (in case group was disabled), are
        # enabled too when they exist
        for related_name in (_zfs_name, _group_name):
            if _resource_exists(related_name(ha_label)):
                error = AgentShell.run_canned_error_message(
                    ['pcs', 'resource', 'enable', related_name(ha_label)])
                if error:
                    return agent_error(error)

        # now wait for it to start
        if _wait_target(ha_label, True):
            found_on = get_resource_location(ha_label)
            if found_on:
                return agent_result(found_on)
            return agent_error(
                "Started {} but now can't locate it!".format(ha_label))

        # try to leave things in a sane state for a failed mount
        error = AgentShell.run_canned_error_message(
            ['pcs', 'resource', 'disable', ha_label])
        if error:
            return agent_error(error)

        if attempt >= 4:
            return agent_error("Failed to start target {}".format(ha_label))

        console_log.info("failed to start target %s", ha_label)
def start_target(self, ha_label):
    """
    Start `ha_label` through self._cluster.

    :param ha_label: label of the target to start
    :return: agent_result() wrapping the 'started_on' entry of the value
        returned by self._cluster.start()
    """
    started_on = self._cluster.start(ha_label)['started_on']
    return agent_result(started_on)
def unconfigure_target_ha(self, primary, ha_label, uuid):
    """
    Unconfigure `ha_label` for this node through self._cluster.

    :param primary: forwarded to self._cluster.unconfigure()
    :param ha_label: label of the target
    :param uuid: not used by this implementation
    :return: agent_result() wrapping the value from self._cluster.unconfigure()
    """
    outcome = self._cluster.unconfigure(self.nodename, ha_label, primary)
    return agent_result(outcome)
def configure_target_ha(self, primary, device, ha_label, uuid, mount_point):
    """
    Configure `ha_label` for this node through self._cluster.

    All arguments are forwarded, together with self.nodename, to
    self._cluster.configure().

    :return: agent_result() wrapping the value from self._cluster.configure()
    """
    outcome = self._cluster.configure(
        self.nodename, device, ha_label, uuid, primary, mount_point)
    return agent_result(outcome)